diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 815687fa174..d55df6deb4c 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1 +1 @@ -* @kolchfa-aws @Naarcha-AWS @vagimeli @AMoo-Miki @natebower @dlvenable @stephen-crawford @epugh +* @kolchfa-aws @Naarcha-AWS @AMoo-Miki @natebower @dlvenable @epugh @sumobrian diff --git a/.github/ISSUE_TEMPLATE/issue_template.md b/.github/ISSUE_TEMPLATE/issue_template.md index 09855299403..b3e03c0ffe3 100644 --- a/.github/ISSUE_TEMPLATE/issue_template.md +++ b/.github/ISSUE_TEMPLATE/issue_template.md @@ -15,7 +15,6 @@ assignees: '' **Tell us about your request.** Provide a summary of the request. -***Version:** List the OpenSearch version to which this issue applies, e.g. 2.14, 2.12--2.14, or all. +**Version:** List the OpenSearch version to which this issue applies, e.g. 2.14, 2.12--2.14, or all. **What other resources are available?** Provide links to related issues, POCs, steps for testing, etc. - diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index fd4213b7e55..2cbea823848 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -2,7 +2,7 @@ _Describe what this change achieves._ ### Issues Resolved -Closes #[_insert issue number_] +Closes #[_delete this text, including the brackets, and replace with the issue number_] ### Version _List the OpenSearch version to which this PR applies, e.g. 2.14, 2.12--2.14, or all._ diff --git a/.github/vale/styles/OpenSearch/LinksExplicit.yml b/.github/vale/styles/OpenSearch/LinksExplicit.yml new file mode 100644 index 00000000000..a734259a3a7 --- /dev/null +++ b/.github/vale/styles/OpenSearch/LinksExplicit.yml @@ -0,0 +1,8 @@ +extends: existence +message: "In links, use '{{site.url}}{{site.baseurl}}' instead of 'https://www.opensearch.org/docs/latest'." +level: error +nonword: true +scope: raw +tokens: + - '\]\(https:\/\/www.opensearch.org\/docs\/latest' + \ No newline at end of file diff --git a/.github/vale/styles/OpenSearch/SubstitutionErroCaseSensitive.yml b/.github/vale/styles/OpenSearch/SubstitutionErroCaseSensitive.yml new file mode 100644 index 00000000000..6934470f1fc --- /dev/null +++ b/.github/vale/styles/OpenSearch/SubstitutionErroCaseSensitive.yml @@ -0,0 +1,8 @@ +extends: substitution +message: "Use '%s' instead of '%s'. Note the correct capitalization." 
+ignorecase: false +level: error +action: + name: replace +swap: + 'Retrieval-Augmented Generation': retrieval-augmented generation \ No newline at end of file diff --git a/.github/vale/styles/OpenSearch/SubstitutionsError.yml b/.github/vale/styles/OpenSearch/SubstitutionsError.yml index fdedce44d8e..d4d14f4162f 100644 --- a/.github/vale/styles/OpenSearch/SubstitutionsError.yml +++ b/.github/vale/styles/OpenSearch/SubstitutionsError.yml @@ -37,6 +37,7 @@ swap: 'pre-trained': pretrained 'premigration': pre-migration 're-enable': reenable + 'retrieval augmented generation': retrieval-augmented generation 'screen shot': screenshot 'sample request': example request 'sample response': example response @@ -53,3 +54,4 @@ swap: 'web site': website 'whitespace': white space 'user interface \(UI\)': UI + 'judgement': judgment diff --git a/.github/vale/styles/Vocab/OpenSearch/Products/accept.txt b/.github/vale/styles/Vocab/OpenSearch/Products/accept.txt index e33ac09744b..a308212b734 100644 --- a/.github/vale/styles/Vocab/OpenSearch/Products/accept.txt +++ b/.github/vale/styles/Vocab/OpenSearch/Products/accept.txt @@ -5,12 +5,16 @@ Amazon Amazon OpenSearch Serverless Amazon OpenSearch Service Amazon Bedrock +Amazon Kinesis Amazon SageMaker +AWS Secrets Manager Ansible Anthropic Claude +Apache Kafka Auditbeat AWS Cloud Cohere Command +Cohere Embed Cohere Embed English Cohere Embed Multilingual Cognito @@ -48,8 +52,10 @@ JSON Web Token Keycloak Kerberos Kibana +Kinesis Kubernetes Lambda +Langflow Linux Log4j Logstash @@ -68,6 +74,7 @@ OpenSearch Assistant OpenSearch Assistant Toolkit OpenSearch Benchmark OpenSearch Dashboards +OpenSearch Flow OpenSearch Playground OpenSearch Project OpenSearch Service @@ -85,16 +92,21 @@ Python PyTorch Querqy Query Workbench +RankLib RCF Summarize RPM Package Manager Ruby +Search Relevance Workbench Simple Schema for Observability Tableau Textract +Titan Titan Multimodal Embeddings Titan Text Embeddings TorchScript Tribuo VisBuilder Winlogbeat -Zstandard \ No newline at end of file +XGBoost +Zipf +Zstandard diff --git a/.github/vale/styles/Vocab/OpenSearch/Words/accept.txt b/.github/vale/styles/Vocab/OpenSearch/Words/accept.txt index c6d129c2c5f..06fd00cfe73 100644 --- a/.github/vale/styles/Vocab/OpenSearch/Words/accept.txt +++ b/.github/vale/styles/Vocab/OpenSearch/Words/accept.txt @@ -54,6 +54,7 @@ gibibyte [Ii]nstrumentations? [Ii]ntracluster [Jj]avadoc +[Jj]accard k-NN [Kk]eystore kibibyte @@ -85,6 +86,9 @@ p\d{2} [Pp]erformant [Pp]laintext [Pp]luggable +[Pp]ointwise +[Pp]reaggregate(s|d)? +[Pp]recompute(s|d)? [Pp]reconfigure [Pp]refetch [Pp]refilter @@ -94,6 +98,7 @@ p\d{2} [Pp]repper [Pp]reprocess [Pp]retrain +[Pp]rotobufs? [Pp]seudocode [Qq]uantiles? [Qq]uantiz(e|ation|ing|er) @@ -152,4 +157,4 @@ tebibyte [Uu]pvote(s|d)? [Ww]alkthrough [Ww]ebpage -xy \ No newline at end of file +xy diff --git a/.github/vale/styles/Vocab/OpenSearch/Words/reject.txt b/.github/vale/styles/Vocab/OpenSearch/Words/reject.txt index 8b137891791..5469b4d99e6 100644 --- a/.github/vale/styles/Vocab/OpenSearch/Words/reject.txt +++ b/.github/vale/styles/Vocab/OpenSearch/Words/reject.txt @@ -1 +1 @@ - +[Aa]ss diff --git a/.github/vale/tests/test-style-pos.md b/.github/vale/tests/test-style-pos.md index 1cf640f463f..ae7d74f1a9b 100644 --- a/.github/vale/tests/test-style-pos.md +++ b/.github/vale/tests/test-style-pos.md @@ -38,6 +38,8 @@ This sentence tests [links end slash]({{site.url}}{{site.baseurl}}/opensearch). This sentence tests [links mid slash]({{site.url}}{{site.baseurl}}opensearch). 
+This sentence tests [links explicit](https://www.opensearch.org/docs/latest/double-slash/). + This sentence tests log-in as a noun. To login, we test this as a verb. To test merge conflicts, remove tick marks in `<<<<<<< HEAD`. @@ -74,6 +76,8 @@ This sentence tests splling. This sentence tests substitution error by using the word indices. +This sentence tests substitution case-sensitive error by using the word Retrieval-Augmented Generation. + This sentence tests substitution suggestion due to its nature. This Table | tests capitalization diff --git a/.github/workflows/.delete_backport_branch.yml.swp b/.github/workflows/.delete_backport_branch.yml.swp deleted file mode 100644 index 248b66532ae..00000000000 Binary files a/.github/workflows/.delete_backport_branch.yml.swp and /dev/null differ diff --git a/.github/workflows/automerge-backport.yml b/.github/workflows/automerge-backport.yml index 0d33634862d..efa0bbd1462 100644 --- a/.github/workflows/automerge-backport.yml +++ b/.github/workflows/automerge-backport.yml @@ -25,7 +25,7 @@ jobs: MERGE_LABELS: "backport-automerge,!On hold" MERGE_FILTER_AUTHOR: "opensearch-trigger-bot[bot]" MERGE_REQUIRED_APPROVALS: "1" - MERGE_RETRIES: "20" + MERGE_RETRIES: "30" MERGE_RETRY_SLEEP: "10000" MERGE_ERROR_FAIL: "true" MERGE_FORKS: "false" diff --git a/.github/workflows/jekyll-spec-insert.yml b/.github/workflows/jekyll-spec-insert.yml new file mode 100644 index 00000000000..cefd477be23 --- /dev/null +++ b/.github/workflows/jekyll-spec-insert.yml @@ -0,0 +1,20 @@ +name: Lint and Test Jekyll Spec Insert +on: + push: + paths: + - 'spec-insert/**' + pull_request: + paths: + - 'spec-insert/**' +jobs: + lint-and-test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: ruby/setup-ruby@v1 + with: { ruby-version: 3.3.0 } + - run: bundle install + - working-directory: spec-insert + run: | + bundle exec rubocop + bundle exec rspec diff --git a/.github/workflows/pr_checklist.yml b/.github/workflows/pr_checklist.yml index b56174793e4..e34d0cecb2f 100644 --- a/.github/workflows/pr_checklist.yml +++ b/.github/workflows/pr_checklist.yml @@ -29,7 +29,7 @@ jobs: with: script: | let assignee = context.payload.pull_request.user.login; - const prOwners = ['Naarcha-AWS', 'kolchfa-aws', 'vagimeli', 'natebower']; + const prOwners = ['Naarcha-AWS', 'kolchfa-aws', 'natebower']; if (!prOwners.includes(assignee)) { assignee = 'kolchfa-aws' @@ -40,4 +40,4 @@ jobs: owner: context.repo.owner, repo: context.repo.repo, assignees: [assignee] - }); \ No newline at end of file + }); diff --git a/.github/workflows/update-api-components.yml b/.github/workflows/update-api-components.yml new file mode 100644 index 00000000000..c4377267a49 --- /dev/null +++ b/.github/workflows/update-api-components.yml @@ -0,0 +1,52 @@ +name: Update API Components +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * 0" # Every Sunday at midnight GMT +jobs: + update-api-components: + if: ${{ github.repository == 'opensearch-project/documentation-website' }} + runs-on: ubuntu-latest + permissions: + contents: write + pull-requests: write + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + fetch-depth: 0 + + - run: git config --global pull.rebase true + + - uses: ruby/setup-ruby@v1 + with: { ruby-version: 3.3.0 } + + - run: bundle install + + - name: Download spec and insert into documentation + run: bundle exec jekyll spec-insert -F -R + + - name: Get current date + id: date + run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_ENV + + - name: GitHub App token + id: 
github_app_token + uses: tibdex/github-app-token@v2.1.0 + with: + app_id: ${{ secrets.APP_ID }} + private_key: ${{ secrets.APP_PRIVATE_KEY }} + + - name: Create pull request + uses: peter-evans/create-pull-request@v6 + with: + token: ${{ steps.github_app_token.outputs.token }} + commit-message: "Updated API components to reflect the latest OpenSearch API spec (${{ env.date }})" + title: "[AUTOCUT] Update API components to reflect the latest OpenSearch API spec (${{ env.date }})" + body: | + Update API components to reflect the latest [OpenSearch API spec](https://github.com/opensearch-project/opensearch-api-specification/releases/download/main-latest/opensearch-openapi.yaml). + Date: ${{ env.date }} + branch: update-api-components-${{ env.date }} + base: main + signoff: true + labels: autocut \ No newline at end of file diff --git a/.ruby-version b/.ruby-version deleted file mode 100644 index 47725433179..00000000000 --- a/.ruby-version +++ /dev/null @@ -1 +0,0 @@ -3.3.2 diff --git a/.vale.ini b/.vale.ini index 2fb470b9dc8..3a4171b36a6 100644 --- a/.vale.ini +++ b/.vale.ini @@ -8,12 +8,13 @@ BasedOnStyles = Vale, OpenSearch BlockIgnores = {%-?\s*comment[.|\s|\S]*?endcomment\s*-?%}, \ {%\s*raw[.|\s|\S]*?endraw\s*%}, \ + {%\s*capture[.|\s|\S]*?endcapture\s*%}, \ {:+\s*[\.\w-\s]*\s*}, \ - {%\s+[^%]*%} + {%\s*[^%]*%} # ignore variables -TokenIgnores = [a-zA-Z_]+((?:_|\.)[a-zA-Z]+)+ - +TokenIgnores = [_a-zA-Z][_a-zA-Z0-9]*(?:[_\.][_a-zA-Z0-9]+)+ + # override Vale spelling Vale.Spelling = NO Vale.Repetition = NO @@ -38,6 +39,7 @@ OpenSearch.LatinismsSubstitution = YES OpenSearch.LinksDoubleParentheses = YES OpenSearch.LinksDoubleSlash = YES OpenSearch.LinksEndSlash = YES +OpenSearch.LinksExplicit = YES OpenSearch.LinksMidSlash = YES OpenSearch.LoginNoun = YES OpenSearch.LoginVerb = YES @@ -60,6 +62,7 @@ OpenSearch.SpacingSlash = YES OpenSearch.SpacingWords = YES OpenSearch.Spelling = YES OpenSearch.StackedHeadings = YES +OpenSearch.SubstitutionsErrorCaseSensitive = YES OpenSearch.SubstitutionsError = YES OpenSearch.SubstitutionsSuggestion = YES OpenSearch.TableHeadings = YES diff --git a/API_STYLE_GUIDE.md b/API_STYLE_GUIDE.md index a058bbe7c22..fbe1febff95 100644 --- a/API_STYLE_GUIDE.md +++ b/API_STYLE_GUIDE.md @@ -31,13 +31,13 @@ The following sections describe the basic API documentation structure. Each sect Depending on where the documentation appears within a section or subsection, heading levels may be adjusted to fit with other content. 1. Name of API (heading level 2) -1. (Optional) Path and HTTP methods (heading level 3) -1. Path parameters (heading level 3) -1. Query parameters (heading level 3) -1. Request fields (heading level 3) +1. Endpoints (heading level 3) +1. (Optional) Path parameters (heading level 3) +1. (Optional) Query parameters (heading level 3) +1. (Optional) Request fields (heading level 3) 1. Example request (heading level 4) -1. Example response (heading level 4) -1. Response fields (heading level 3) +1. (Optional) Example response (heading level 4) +1. (Optional) Response fields (heading level 3) ## API name @@ -55,7 +55,7 @@ If applicable, provide any caveats to its usage with a note or tip, as in the fo "If you use the Security plugin, make sure you have the appropriate permissions." 
(To set this point in note-style format, follow the text on the next line with {: .note}) -### Path and HTTP methods +### Endpoints For relatively complex API calls that include path parameters, it's sometimes a good idea to provide an example so that users can visualize how the request is properly formed. This section is optional and includes examples that illustrate how the endpoint and path parameters fit together in the request. The following is an example of this section for the nodes stats API: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8ab3c2bd4f5..fed076d89bd 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -78,13 +78,13 @@ Follow these steps to set up your local copy of the repository: 1. Navigate to your cloned repository. -##### Building using locally installed packages +##### Building by using locally installed packages 1. Install [Ruby](https://www.ruby-lang.org/en/) if you don't already have it. We recommend [RVM](https://rvm.io/), but you can use any method you prefer: ``` curl -sSL https://get.rvm.io | bash -s stable - rvm install 3.2.4 + rvm install 3.3.2 ruby -v ``` @@ -100,9 +100,9 @@ Follow these steps to set up your local copy of the repository: bundle install ``` -##### Building using containerization +##### Building by using containerization -Assuming you have `docker-compose` installed, run the following command: +Assuming you have Docker installed, run the following command: ``` docker compose -f docker-compose.dev.yml up @@ -141,13 +141,18 @@ Here's how to build the website, make changes, and view them locally: ## Review process -We greatly appreciate all contributions to the documentation and will review them as quickly as possible. +We greatly appreciate all contributions to the documentation and will review them as quickly as possible. Documentation can be updated at any time and does not require waiting for a release. Once you have created a PR, the documenation review process is as follows: + +1. Ensure that the submitted documentation is technically accurate and all examples are working. If you are a developer implementing the feature, you can optionally ask one of your peers to conduct a technical review. If you need help finding a tech reviewer, tag a [maintainer](https://github.com/opensearch-project/documentation-website/blob/main/MAINTAINERS.md). +2. When you submit a PR, it's assigned to one of the doc reviewers. Once you have verified technical accuracy and all technical reviews are completed, tag the assignee of the PR for a doc review. +3. A doc reviewer (technical writer) performs a doc review. The doc reviewer may push edits to the PR directly or leave comments and suggestions for you to address (let us know in a comment if you have a preference). The doc reviewer will arrange for an editorial review. +4. The editor performs an editorial review. The editor may push edits to the PR directly or leave comments and editorial suggestions for you to address (let us know in a comment if you have a preference). +5. When you have addressed all comments, the PR is merged. It is important that you specify to which versions the PR is applicable when you create the PR so it can be backported to the correct branches. We support updates only for the latest documentation version; the previous versions are not updated. Once the PR is merged, the documentation is published on the documentation site. During the PR process, expect that there will be some back-and-forth. 
If you want your contribution to be merged quickly, try to respond to comments in a timely fashion, and let us know if you don't want to continue with the PR. We use the [Vale](https://github.com/errata-ai/vale) linter to ensure that our documentation adheres to the [OpenSearch Project Style Guidelines](STYLE_GUIDE.md). Addressing Vale comments on the PR expedites the review process. You can also install Vale locally so you can address the comments before creating a PR. For more information, see [Style linting](#style-linting). -If we accept the PR, we will merge it and will backport it to the appropriate branches. ### Style linting @@ -158,6 +163,23 @@ To ensure that our documentation adheres to the [OpenSearch Project Style Guidel Optionally, you can install the [Vale VSCode](https://github.com/chrischinchilla/vale-vscode) extension, which integrates Vale with Visual Studio Code. By default, only _errors_ and _warnings_ are underlined. To change the minimum alert level to include _suggestions_, go to **Vale VSCode** > **Extension Settings** and select **suggestion** in the **Vale > Vale CLI: Min Alert Level** dropdown list. +## Troubleshooting + +This section provides information about potential solutions for known issues. + +### Installing Ruby on an Apple silicon machine + +If you're having trouble installing Ruby with `rvm` on an Apple silicon machine, it could be because of an OpenSSL version misalignment. To fix this issue, use the following command, replacing `` with your [desired version](https://github.com/ruby/openssl/blob/master/README.md): + +``` +# Assumes Brew is installed +curl -sSL https://get.rvm.io | bash -s stable +rvm install 3.2.4 --with-openssl-dir=$(brew --prefix openssl@) +ruby -v +``` + ## Getting help For help with the contribution process, reach out to one of the [points of contact](README.md#points-of-contact). + + diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md new file mode 100644 index 00000000000..6ad15f90da0 --- /dev/null +++ b/DEVELOPER_GUIDE.md @@ -0,0 +1,189 @@ +# Developer guide +- [Introduction](#introduction) +- [Starting the Jekyll server locally](#starting-the-jekyll-server-locally) +- [Using the spec-insert Jekyll plugin](#using-the-spec-insert-jekyll-plugin) + - [Ignoring files and folders](#ignoring-files-and-folders) +- [CI/CD](#cicd) +- [Spec insert components](#spec-insert-components) + - [Query parameters](#query-parameters) + - [Path parameters](#path-parameters) + - [Endpoints](#endpoints) + +## Introduction + +The `.md` documents in this repository are rendered into HTML pages using [Jekyll](https://jekyllrb.com/). These HTML pages are hosted on [opensearch.org](https://opensearch.org/docs/latest/). + +## Starting the Jekyll server locally + +You can run the Jekyll server locally to view the rendered HTML pages using the following steps: + +1. Install [Ruby](https://www.ruby-lang.org/en/documentation/installation/) 3.1.0 or later for your operating system. +2. Install the required gems by running `bundle install`. +3. Run `bundle exec jekyll serve` to start the Jekyll server locally (this can take several minutes to complete). +4. Open your browser and navigate to `http://localhost:4000` to view the rendered HTML pages. + +## Using the `spec-insert` Jekyll plugin + +The `spec-insert` Jekyll plugin is used to insert API components into Markdown files. The plugin downloads the [latest OpenSearch specification](https://github.com/opensearch-project/opensearch-api-specification) and renders the API components from the spec. 
This aims to reduce the manual effort required to keep the documentation up to date. + +To use this plugin, make sure that you have installed Ruby 3.1.0 or later and the required gems by running `bundle install`. + +Edit your Markdown file and insert the following snippet where you want render an API component: + +```markdown + + +This is where the API component will be inserted. +Everything between the `spec_insert_start` and `spec_insert_end` tags will be overwritten. + + +``` + +Then run the following Jekyll command to render the API components: +```shell +bundle exec jekyll spec-insert +``` + +If you are working on multiple Markdown files and do not want to keep running the `jekyll spec-insert` command, you can add the `--watch` (or `-W`) flag to the command to watch for changes in the Markdown files and automatically render the API components: + +```shell +bundle exec jekyll spec-insert -W +``` + +By default, when the plugin encounters an error when processing a file, the plugin prints out the error then moves on to the next file. If you want it to short-circuit when an error occurs, add the `--fail-on-error` (or `-F`) flag to the command: + +```shell +bundle exec jekyll spec-insert -F +``` + +Depending on the text editor you are using, you may need to manually reload the file from disk to see the changes applied by the plugin if the editor does not automatically reload the file periodically. + +The plugin will pull the newest OpenSearch API spec from its [repository](https://github.com/opensearch-project/opensearch-api-specification) if the spec file does not exist locally or if it is older than 24 hours. To tell the plugin to always pull the newest spec, you can add the `--refresh-spec` (or `-R`) flag to the command: + +```shell +bundle exec jekyll spec-insert --refresh-spec +``` + +### Ignoring files and folders + +The `spec-insert` plugin ignores all files and folders listed in the [./_config.yml#exclude](./_config.yml) list, which is also the list of files and folders that Jekyll ignores. + +### Configuration + +You can update the configuration settings for this plugin through the [config.yml](./spec-insert/config.yml) file. + +**Note:** The tests for this plugin use a mock configuration [file](./spec-insert/spec/mock_config.yml) to assure that the tests still pass when the config file is altered. The expected output for the tests is based on the mock configuration file and will look different from the actual output when the plugin is run. + +## CI/CD +The `spec-insert` plugin is run as part of the CI/CD pipeline to ensure that the API components are up to date in the documentation. This is performed through the [update-api-components.yml](.github/workflows/update-api-components.yml) GitHub Actions workflow, which creates a pull request containing the updated API components every Sunday. + +## Spec insert components +All spec insert components accept the following arguments: + +- `api` (String; required): The name of the API to render the component from. This is equivalent to the `x-operation-group` field in the OpenSearch OpenAPI Spec. +- `component` (String; required): The name of the component to render, such as `query_parameters`, `path_parameters`, or `endpoints`. +- `omit_header` (Boolean; Default is `false`): If set to `true`, the markdown header of the component will not be rendered. 
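For example, these arguments can be combined in a single insert block. The following is a minimal sketch (the HTML-comment form of the `spec_insert_start`/`spec_insert_end` markers and the exact argument layout are illustrative, not copied from the plugin source):

```markdown
<!-- spec_insert_start
api: cat.indices
component: query_parameters
omit_header: true
-->
<!-- spec_insert_end -->
```

Running `bundle exec jekyll spec-insert` would then overwrite everything between the two markers with the rendered query parameters table, omitting its Markdown header because `omit_header` is set to `true`.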
+ +### Endpoints +To insert endpoints for the `search` API, use the following snippet: + +```markdown + + +``` + +### Path parameters + +To insert a path parameters table of the `indices.create` API, use the following snippet. Use the `x-operation-group` field from OpenSearch OpenAPI Spec for the `api` value: + +```markdown + + +``` + +This table accepts the same arguments as the query parameters table except the `include_global` argument. + +### Query parameters + +To insert the API query parameters table of the `cat.indices` API, use the following snippet: + +```markdown + + +``` + +This will insert the query parameters of the `cat.indices` API into the `.md` file with three default columns: `Parameter`, `Data type`, and `Description`. You can customize the query parameters table by adding the `columns` argument which accepts a comma-separated list of column names. The available column names are: + +- `Parameter` +- `Data type` +- `Description` +- `Required` +- `Default` + +_When `Required`/`Default` is not chosen, the information will be written in the `Description` column._ + +You can also customize this component with the following settings: + +- `include_global` (Boolean; default is `false`): Includes global query parameters in the table. +- `include_deprecated` (Boolean; default is `true`): Includes deprecated parameters in the table. +- `pretty` (Boolean; default is `false`): Renders the table in the pretty format instead of the compact format. + +The following snippet inserts the specified columns into the query parameters table: + +```markdown + + +``` + +### Request and response bodies (Beta) + +To insert the request and response body tables of the `indices.create` API, use the following snippet: + +```markdown + + +``` + +**Note:**: These components are still a work in progress and may not render correctly for all APIs. + +## Spec insert coverage report +To generate a coverage report of the API components that are being used in the documentation, run the following command: + +```shell +cd spec-insert +bundle exec rake generate_utilization_coverage +``` + +The coverage report will be generated in the `spec-insert/utilization_coverage.md` by default. + +## Spec insert generate dry-run report + +To generate a dry-run report of all APIs with all available spec insert components, run the following command: + +```shell +cd spec-insert +bundle exec rake generate_dry_run_report +``` +This will also generate a markdown (.md) file for each API with their rendered components in the `spec-insert/dry_run` folder. This allows you to preview the rendered components for all APIs without modifying the original documentation files. A report summarizing the errors found during the dry-run will be generated in the `spec-insert/dry_run_report.md` file. 
diff --git a/FORMATTING_GUIDE.md b/FORMATTING_GUIDE.md index ea5f711798f..d69fe2231f6 100644 --- a/FORMATTING_GUIDE.md +++ b/FORMATTING_GUIDE.md @@ -9,6 +9,8 @@ This guide provides an overview of the formatted elements commonly used in the O * [Adding pages or sections](#adding-pages-or-sections) * [Buttons](#buttons) * [Callouts](#callouts) +* [Cards](#cards) +* [Code blocks](#code-blocks) * [Collapsible blocks](#collapsible-blocks) * [Dashes](#dashes) * [Horizontal rule](#horizontal-rule) @@ -22,6 +24,7 @@ This guide provides an overview of the formatted elements commonly used in the O * [Nested lists](#nested-lists) * [Lists with code snippets or images](#lists-with-code-snippets-or-images) * [Math](#math) +* [Steps](#steps) * [Tables](#tables) * [Text style](#text-style) * [Variables in curly braces](#variables-in-curly-braces) @@ -54,7 +57,7 @@ Each collection must have an `index.md` file that corresponds to the collection' ## Buttons -You can use either `copy` or `copy-curl` includes for code snippets. The `copy` include places a **Copy** button on the code snippet, while the `copy-curl` include places both **Copy** and **Copy as cURL** buttons. Use the `copy-curl` include for API requests. If an API request is already in the cURL format, use the `copy` include. +You can use either `copy` or `copy-curl` includes for code snippets formatted using triple backticks. The `copy` include places a **Copy** button on the code snippet, while the `copy-curl` include places both **Copy** and **Copy as cURL** buttons. Use the `copy-curl` include for API requests. If an API request is already in the cURL format, use the `copy` include. **Example of a `copy` include** @@ -109,6 +112,100 @@ For a callout with multiple paragraphs or lists, use `>`: ``` +## Cards + +To add a card to a page, specify it in the front matter as follows. The `description`, `link`, and `list` are optional. Use relative links. You can optionally style the text using HTML tags: + +```yaml +tutorial_cards: + - heading: "Getting started with semantic and hybrid search" + description: "Learn how to implement semantic and hybrid search" + link: "/vector-search/tutorials/neural-search-tutorial/" + list: + - "Platform: OpenSearch" + - "Model: Anthropic Claude" + - "Deployment: Amazon Bedrock" +``` + +Insert an include in the page body where you want the cards to appear: + +``` +{% include cards.html cards=page.tutorial_cards %} +``` + +## Code blocks + +There are two ways to format code blocks: + +1. **Single code block**: Use triple backticks and provide the highlighting language for the code block. For example, format a REST request in the following way: + ````json + ```json + PUT /hotels-index + { + "settings": { + "index.knn": true + }, + "mappings": { + "properties": { + "location": { + "type": "knn_vector", + "dimension": 2, + "space_type": "l2" + } + } + } + } + ``` + {% include copy-curl.html %} + ```` + For information about the copy and copy as cURL button include, see [Buttons](#buttons). +1. **Tabbed panel**: Use a tabbed panel to provide the same example in multiple programming languages. If using this method, the [buttons](#buttons) are inserted programmatically. Use the following syntax to provide the example in multiple languages. 
This example creates a tabbed panel with a **REST** and **Python** tabs: + ```` + {% capture step1_rest %} + PUT /hotels-index + { + "settings": { + "index.knn": true + }, + "mappings": { + "properties": { + "location": { + "type": "knn_vector", + "dimension": 2, + "space_type": "l2" + } + } + } + } + {% endcapture %} + + {% capture step1_python %} + from opensearchpy import OpenSearch + + client.indices.create( + index="hotels-index", + body={ + "settings": {"index.knn": True}, + "mappings": { + "properties": { + "location": { + "type": "knn_vector", + "dimension": 2, + "space_type": "l2" + } + } + } + } + ) + {% endcapture %} + + {% include code-block.html + rest=step1_rest + python=step1_python %} + ``` + ```` + The supported languages are listed in [this yaml file](/_data/code_languages.yml). + ## Collapsible blocks To insert an open collapsible block, use the `
` element as follows: @@ -399,6 +496,32 @@ Some Markdown paragraph. Here's a formula: And back to Markdown. ``` +Alternatively, you can use double dollar signs (`$$`) for both display and inline math directly in Markdown: + +``` +The probability of selecting pair $$i$$ is proportional to $$1 \over i^\alpha$$. +``` + +## Steps + +To insert steps, specify them in the front matter as follows. Steps are automatically numbered. Use relative links. The `description` and `link` are optional: + +```yaml +steps: + - heading: "Create an OpenSearch index" + description: "Create an OpenSearch index to store your embeddings." + link: "/vector-search/creating-vector-index/#storing-raw-vectors-or-embeddings-generated-outside-of-opensearch" + - heading: "Ingest embeddings" + description: "Ingest your embeddings into the index." + link: "/vector-search/ingesting-data/#raw-vector-ingestion" +``` + +Insert an include in the page body where you want the steps to appear: + +``` +{% include list.html list_items=page.steps%} +``` + ## Tables Markdown table columns are automatically sized, and there is no need to specify a different number of dashes in the formatting. diff --git a/Gemfile b/Gemfile index 7825dcd02bf..fee04f3c48d 100644 --- a/Gemfile +++ b/Gemfile @@ -1,4 +1,9 @@ -source "http://rubygems.org" +# frozen_string_literal: true + +source 'https://rubygems.org' + +# Manually add csv gem since Ruby 3.4.0 no longer includes it +gem 'csv', '~> 3.0' # Hello! This is where you manage which Jekyll version is used to run. # When you want to use a different version, change it below, save the @@ -8,12 +13,12 @@ source "http://rubygems.org" # # This will help ensure the proper Jekyll version is running. # Happy Jekylling! -gem "jekyll", "~> 4.3.2" +gem 'jekyll', '~> 4.3.2' # This is the default theme for new Jekyll sites. You may change this to anything you like. -gem "just-the-docs", "~> 0.3.3" -gem "jekyll-remote-theme", "~> 0.4" -gem "jekyll-redirect-from", "~> 0.16" +gem 'jekyll-redirect-from', '~> 0.16' +gem 'jekyll-remote-theme', '~> 0.4' +gem 'just-the-docs', '~> 0.3.3' # If you want to use GitHub Pages, remove the "gem "jekyll"" above and # uncomment the line below. To upgrade, run `bundle update github-pages`. @@ -22,21 +27,31 @@ gem "jekyll-redirect-from", "~> 0.16" # If you have any plugins, put them here! group :jekyll_plugins do - gem "jekyll-last-modified-at" - gem "jekyll-sitemap" + gem 'jekyll-last-modified-at' + gem 'jekyll-sitemap' + gem 'jekyll-spec-insert', :path => './spec-insert' end # Windows does not include zoneinfo files, so bundle the tzinfo-data gem -gem "tzinfo-data", platforms: [:mingw, :mswin, :x64_mingw, :jruby] +gem 'tzinfo-data', platforms: %i[mingw mswin x64_mingw jruby] # Performance-booster for watching directories on Windows -gem "wdm", "~> 0.1.0" if Gem.win_platform? +gem 'wdm', '~> 0.1.0' if Gem.win_platform? # Installs webrick dependency for building locally -gem "webrick", "~> 1.7" - +gem 'webrick', '~> 1.7' # Link checker -gem "typhoeus" -gem "ruby-link-checker" -gem "ruby-enum" +gem 'ruby-enum' +gem 'ruby-link-checker' +gem 'typhoeus' + +# Spec Insert +gem 'activesupport', '~> 7' +gem 'mustache', '~> 1' + +group :development, :test do + gem 'rspec' + gem 'rubocop', '~> 1.44', require: false + gem 'rubocop-rake', require: false +end diff --git a/MAINTAINERS.md b/MAINTAINERS.md index 55b908e0271..5eba67a0e4c 100644 --- a/MAINTAINERS.md +++ b/MAINTAINERS.md @@ -9,14 +9,15 @@ This document lists the maintainers in this repo. 
See [opensearch-project/.githu | Fanit Kolchina | [kolchfa-aws](https://github.com/kolchfa-aws) | Amazon | | Nate Archer | [Naarcha-AWS](https://github.com/Naarcha-AWS) | Amazon | | Nathan Bower | [natebower](https://github.com/natebower) | Amazon | -| Melissa Vagi | [vagimeli](https://github.com/vagimeli) | Amazon | | Miki Barahmand | [AMoo-Miki](https://github.com/AMoo-Miki) | Amazon | | David Venable | [dlvenable](https://github.com/dlvenable) | Amazon | -| Stephen Crawford | [stephen-crawford](https://github.com/stephen-crawford) | Amazon | +| Brian Presley | [sumobrian](https://github.com/sumobrian/) | Amazon | | Eric Pugh | [epugh](https://github.com/epugh) | OpenSource Connections | ## Emeritus -| Maintainer | GitHub ID | Affiliation | -| ---------------- | ----------------------------------------------- | ----------- | -| Heather Halter | [hdhalter](https://github.com/hdhalter) | Amazon | +| Maintainer | GitHub ID | Affiliation | +| ---------------- | ------------------------------------------------------- | ----------- | +| Heather Halter | [hdhalter](https://github.com/hdhalter) | Amazon | +| Melissa Vagi | [vagimeli](https://github.com/vagimeli) | Amazon | +| Stephen Crawford | [stephen-crawford](https://github.com/stephen-crawford) | Amazon | \ No newline at end of file diff --git a/README.md b/README.md index 66beb1948c9..807e1063094 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ # About the OpenSearch documentation repo The `documentation-website` repository contains the user documentation for OpenSearch. You can find the rendered documentation at [opensearch.org/docs](https://opensearch.org/docs). +The markdown files in this repository are rendered into HTML pages using [Jekyll](https://jekyllrb.com/). Check the [DEVELOPER_GUIDE](DEVELOPER_GUIDE.md) for more information about how to use Jekyll for this repository. ## Contributing @@ -23,7 +24,6 @@ If you encounter problems or have questions when contributing to the documentati - [kolchfa-aws](https://github.com/kolchfa-aws) - [Naarcha-AWS](https://github.com/Naarcha-AWS) -- [vagimeli](https://github.com/vagimeli) ## Code of conduct diff --git a/STYLE_GUIDE.md b/STYLE_GUIDE.md index e33f49697db..ab766992eac 100644 --- a/STYLE_GUIDE.md +++ b/STYLE_GUIDE.md @@ -23,11 +23,11 @@ The following naming conventions should be observed in OpenSearch Project conten #### Product names -Capitalize product names. The OpenSearch Project has three products: OpenSearch, OpenSearch Dashboards, and Data Prepper. For example: +Capitalize product names. The OpenSearch Project has three products: OpenSearch, OpenSearch Dashboards, and OpenSearch Data Prepper. For example: * "To install *OpenSearch*, download the Docker image." * "To access *OpenSearch Dashboards*, open your browser and navigate to http://localhost:5601/app/home." -* "*Data Prepper* contains the following components:" +* "*OpenSearch Data Prepper* contains the following components:" Capitalize the names of clients and tools. For example: @@ -128,6 +128,7 @@ The following table lists acronyms that you don't need to spell out. | CSV | comma-separated values | | DNS | Domain Name System | | DOS | disk operating system | +| Faiss | Facebook AI Similarity Search | | FAQ | frequently asked questions | | FTP | File Transfer Protocol | | GIF | Graphics Interchange Format | @@ -141,8 +142,10 @@ The following table lists acronyms that you don't need to spell out. 
| JPEG | Joint Photographic Experts Group | | JSON | JavaScript Object Notation | | k-NN | k-nearest neighbors | +| MS MARCO | Microsoft Machine Reading Comprehension | | NAT | network address translation | | NGINX | engine x | +| NMSLIB | Non-Metric Space Library | | PDF | Portable Document Format | | RAM | random access memory | | REST | Representational State Transfer | diff --git a/TERMS.md b/TERMS.md index f5c6fef6dbc..e46501926d3 100644 --- a/TERMS.md +++ b/TERMS.md @@ -46,7 +46,7 @@ Use to describe a list of items that are allowed (not blocked). Do not use as a **Amazon OpenSearch Service** -Amazon OpenSearch Service is a managed service that makes it easy to deploy, operate, and scale OpenSearch clusters in the AWS Cloud. Amazon OpenSearch Service is the successor to Amazon Elasticsearch Service (Amazon ES) and supports OpenSearch and legacy Elasticsearch OSS (up to 7.10, the final open-source version of the software). +Use "Amazon OpenSearch Service" on first appearance; "OpenSearch Service" is acceptable for subsequent appearances. Amazon OpenSearch Service is a managed service that makes it easy to deploy, operate, and scale OpenSearch clusters in the AWS Cloud. Amazon OpenSearch Service is the successor to Amazon Elasticsearch Service (Amazon ES) and supports OpenSearch and legacy Elasticsearch OSS (up to 7.10, the final open-source version of the software). **Anomaly Detection** @@ -196,6 +196,10 @@ Use data is, not data are. Don't use datas. Use pieces of data or equivalent to **data center** +**OpenSearch Data Prepper** + +Use "OpenSearch Data Prepper" on first appearance; "Data Prepper" is acceptable for subsequent appearances. OpenSearch Data Prepper is a server-side data collector capable of filtering, enriching, transforming, normalizing, and aggregating data for downstream analytics and visualization. Data Prepper also lets users build custom pipelines to improve the operational view of applications. + **dataset** **data source** @@ -299,6 +303,8 @@ Exception: *Execution* is unavoidable for third-party terms for which no alterna **Faiss** +Facebook AI Similarity Search. Do not define on first appearance. Faiss is a library that allows developers to quickly search for embeddings of multimedia documents that are similar to each other. + **file name** **frontend (n., adj.)** @@ -501,6 +507,10 @@ Do not use. Use *management account* instead. Avoid. Use _can_ or _might_ instead. +**MS MARCO** + +Microsoft Machine Reading Comprehension. Do not define on first appearance. MS MARCO is a collection of datasets focused on deep learning in search. + **multilayer, multilayered** **must, shall, should** @@ -521,6 +531,10 @@ Use _near real time_ as a noun; use near real-time as an adjective. Don't add a Spell out _near real time_ on first mention; _NRT_ can be used on subsequent mentions. +**NMSLIB** + +Non-Metric Space Library. Do not define on first appearance. NMSLIB is an efficient similarity search library and a toolkit for evaluation of k-NN methods for generic non-metric spaces. + **node** A server that stores your data and processes search requests with OpenSearch, usually as part of a cluster. Do not use _master node_ and avoid using _worker node_. @@ -604,6 +618,10 @@ Tools inside of OpenSearch that can be customized to enhance OpenSearch's functi **pop-up** +**preaggregate** + +**precompute** + **premise, premises** With reference to property and buildings, always form as plural. @@ -646,6 +664,8 @@ Copy of a primary shard. 
Helps improve performance when using indexes across mul Use as a synonym for repository, on second and subsequent use. +**retrieval-augmented generation (RAG)** + **RPM Package Manager (RPM)** Formerly known as RedHat Package Manager. An open-source package management system for use with Linux distributions. @@ -684,17 +704,13 @@ A piece of an index that consumes CPU and memory. Operates as a full Lucene inde Don't use. Both *simple* and *simply* are not neutral in tone and might sound condescending to some users. If you mean *only*, use *only* instead. -**since** - -Use only to describe time events. Don't use in place of *because*. - **slave** Do not use. Use *replica*, *secondary*, or *standby* instead. **Snapshot Management (SM)** -**solid state drive (SSD)** +**solid-state drive (SSD)** **standalone** diff --git a/_about/breaking-changes.md b/_about/breaking-changes.md index 6fb5660f78f..1070a5f04be 100644 --- a/_about/breaking-changes.md +++ b/_about/breaking-changes.md @@ -41,4 +41,151 @@ A Lucene upgrade forced OpenSearch to drop support for JDK 8. As a consequence, ### Wildcard query behavior for text fields -OpenSearch 2.5 contains a bug fix to correct the behavior of the `case_insensitive` parameter for the `wildcard` query on text fields. As a result, a wildcard query on text fields that ignored case sensitivity and erroneously returned results prior to the bug fix will not return the same results. For more information, see issue [#8711](https://github.com/opensearch-project/OpenSearch/issues/8711). \ No newline at end of file +OpenSearch 2.5 contains a bug fix that corrects the behavior of the `case_insensitive` parameter for the `wildcard` query on text fields. As a result, a wildcard query on text fields that ignored case sensitivity and erroneously returned results prior to the bug fix will not return the same results. For more information, see issue [#8711](https://github.com/opensearch-project/OpenSearch/issues/8711). + +## 2.19.0 + +### Nested value support in the text embedding processor +The `text_embedding` processor no longer replaces nested values like `_ingest._value` when evaluating fields like `title_tmp:_ingest._value.title_embedding`. Instead, you must directly specify the nested key as `books.title:title_embedding` to achieve the desired output. For more information, see issue [#1243](https://github.com/opensearch-project/neural-search/issues/1243). + +## 3.0.0 + +### JDK requirement + +The minimum supported JDK version is JDK 21. + +### System index access + +Access to system indexes through the REST API is no longer provided. This functionality has been deprecated since OpenSearch 1.x. For more information, see issue [#7936](https://github.com/opensearch-project/OpenSearch/issues/7936). + +### Document ID length limits + +The document ID length limit of 512 bytes is now consistently enforced across all APIs, including the Bulk API. Previously, the Bulk API allowed document IDs longer than 512 bytes. For more information, see issue [#6595](https://github.com/opensearch-project/OpenSearch/issues/6595). + +### Node role configuration + +The configuration of empty node roles using environment variables has been fixed. Setting `node.roles=` using environment variables now properly configures a coordinating-only node, consistent with the `opensearch.yml` configuration. For more information, see issue [#3412](https://github.com/opensearch-project/OpenSearch/issues/3412). 
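To make the corrected behavior concrete, both of the following now yield a coordinating-only node. This is a minimal sketch; the Docker Compose service name and image tag are illustrative:

```yaml
# opensearch.yml — declare a coordinating-only node
node.roles: []
```

```yaml
# docker-compose.yml (sketch) — the environment variable form is now handled consistently
services:
  coordinator:
    image: opensearchproject/opensearch:3.0.0
    environment:
      - node.roles=
```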
+ +### JSON processing limits + +New default limits have been introduced for JSON processing (using the Jackson library) throughout OpenSearch: + +- The maximum nesting depth of JSON objects and arrays is limited to 1,000 levels. +- The maximum length of JSON property names is limited to 50,000 units (bytes or chars, depending on the input source). + +These limits help prevent potential memory issues and denial-of-service attacks. For more information, see issue [#11278](https://github.com/opensearch-project/OpenSearch/issues/11278). + +### Nested query depth + +A new `index.query.max_nested_depth` setting has been introduced with a default value of `20` and a minimum value of `1`, limiting the maximum number of nesting levels for `nested` queries. For more information, see issue [#3268](https://github.com/opensearch-project/OpenSearch/issues/3268). + +### Thread pool settings + +The following deprecated thread pool settings have been removed: +- `thread_pool.test.max_queue_size` +- `thread_pool.test.min_queue_size` +For more information, see issue [#2595](https://github.com/opensearch-project/OpenSearch/issues/2595). + +### Index store settings + +The `index.store.hybrid.mmap.extensions` setting has been removed as part of improvements to `hybridfs` file handling. For more information, see pull request [#9392](https://github.com/opensearch-project/OpenSearch/pull/9392). + +### Transport Nio plugin + +The `transport-nio` plugin has been removed. Netty remains the standard network framework for both node-to-node and client-to-server communication. For more information, see issue [#16887](https://github.com/opensearch-project/OpenSearch/issues/16887). + +### Nodes API response format + +The format of indexing buffer values in the Nodes API response has changed: + +- `total_indexing_buffer_in_bytes` now displays raw bytes (for example, `53687091`). +- `total_indexing_buffer` now displays human-readable format (for example, `51.1mb`). + +For more information, see pull request [#17070](https://github.com/opensearch-project/OpenSearch/pull/17070). + +### PathHierarchy tokenizer + +The camel case `PathHierarchy` tokenizer name has been deprecated in favor of the snake case `path_hierarchy`. For more information, see pull request [#10894](https://github.com/opensearch-project/OpenSearch/pull/10894). + +### Security plugin + +The Blake2b hash implementation now uses the salt parameter correctly, which will result in different (though correct) hash values compared to previous versions. For more information, see pull request [#5089](https://github.com/opensearch-project/security/pull/5089). + +### k-NN plugin + +The following deprecated settings have been removed from the k-NN plugin: + +- `knn.plugin.enabled` setting +- `index.knn.algo_param.ef_construction` index setting +- `index.knn.algo_param.m` index setting +- `index.knn.space_type` index setting + +The NMSLIB engine is now deprecated. We recommend using the Faiss or Lucene engines instead. + +For more information, see pull request [#2564](https://github.com/opensearch-project/k-NN/pull/2564). + +### Performance Analyzer plugin + +The `performance-analyzer-rca` agent has been removed. We recommend transitioning to the [Telemetry plugin](https://github.com/opensearch-project/performance-analyzer/issues/585) for performance monitoring and analysis. The Telemetry plugin, using the OpenTelemetry framework, allows for seamless integration with lightweight open-source agents in order to publish performance metrics to observability stores. 
For more information, see issue [#591](https://github.com/opensearch-project/performance-analyzer-rca/issues/591). + +### SQL plugin + +- The OpenSearch query domain-specific language (DSL) response format has been removed. +- `DELETE` statement support has been removed. +- The `plugins.sql.delete.enabled` setting has been removed. +- The legacy Spark Connector module has been deprecated. For information about connecting to Spark, see [`async-query-core`](https://github.com/opensearch-project/sql/blob/main/async-query-core/README.md). +- Deprecated OpenDistro endpoints and legacy settings with the `opendistro` prefix have been removed. +- The `plugins.sql.pagination.api` has been removed and the Scroll API has been deprecated. Pagination now defaults to Point in Time. + +For more information, see issue [#3248](https://github.com/opensearch-project/sql/issues/3248). + +### OpenSearch Dashboards + +- Discover experience: + + - The `discover:newExperience` setting has been removed. + - The DataGrid table feature has been removed. + + For more information, see pull request [#9511](https://github.com/opensearch-project/OpenSearch-Dashboards/pull/9511). + +- Visualizations: The `dashboards-visualizations` plugin (including Gantt chart visualization) has been removed. We recommend transitioning to: + + - Vega visualization for flexible visualization needs. + - Trace analytics for trace-related use cases. + + For more information, see issue [#430](https://github.com/opensearch-project/dashboards-visualizations/issues/430). + +### Dashboards Observability plugin + +The legacy notebooks feature has been removed from `dashboards-observability`. Key changes include the following: + +- Legacy notebooks (previously stored in the `.opensearch-observability` index) are no longer supported. +- Only notebooks stored in the `.kibana` index (introduced in version 2.17) are supported. +- You must migrate your notebooks to the new storage system before upgrading to version 3.0. + +For more information, see issue [#2350](https://github.com/opensearch-project/dashboards-observability/issues/2350). + +### Searchable snapshots node role + +Nodes that use searchable snapshots must have the `warm` node role. Key changes include the following: + +- The `search` role no longer supports searchable snapshots. +- Nodes that handle searchable snapshot shards must be assigned the warm role. +- You must update node role configurations before upgrading to version 3.0 if your cluster uses searchable snapshots. + +For more information, see pull request [#17573](https://github.com/opensearch-project/OpenSearch/pull/17573). + +### Query groups + +Query groups have been renamed to **workload groups**. Key changes include the following: + +- The `wlm/query_group` endpoint is now the `wlm/workload_group` endpoint. +- The API responds with a `workloadGroupID` instead of a `queryGroupID`. +- All workload management cluster settings are now prepended with `wlm.workload_group`. + +For more information, see pull request [#9813](https://github.com/opensearch-project/OpenSearch/pull/17901). + +### ML Commons plugin + +- The `CatIndexTool` is removed in favor of the `ListIndexTool`. 
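For example, an agent definition that previously referenced `CatIndexTool` would now use `ListIndexTool`. The following is a minimal sketch (the agent name, type, and description are illustrative):

```json
POST /_plugins/_ml/agents/_register
{
  "name": "index_overview_agent",
  "type": "flow",
  "description": "Lists the indexes in the cluster",
  "tools": [
    {
      "type": "ListIndexTool"
    }
  ]
}
```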
+ diff --git a/_about/index.md b/_about/index.md index 041197eeba9..404a5a4d6f3 100644 --- a/_about/index.md +++ b/_about/index.md @@ -10,6 +10,51 @@ redirect_from: - /docs/opensearch/ - /opensearch/ - /opensearch/index/ +why_use: + - heading: "Vector database" + description: "Use OpenSearch as a vector database to combine the power of traditional search, analytics, and vector search" + link: "/vector-search/" + - heading: "Fast, scalable full-text search" + description: "Help users find the right information in your application, website, or data lake catalog" + link: "/search-plugins/" + - heading: "Application and infrastructure monitoring" + description: "Use observability logs, metrics, and traces to monitor your applications in real time" + link: "/observing-your-data/" + - heading: "Security and event information management" + description: "Centralize logs to enable real-time security monitoring and forensic analysis" + link: "/security/" +features: + - heading: "Vector search" + description: "Build AI/ML-powered vector search applications" + link: "/vector-search/" + - heading: "Machine learning" + description: "Integrate machine learning models into your workloads" + link: "/ml-commons-plugin/" + - heading: "Customizing your search" + description: "From optimizing performance to improving relevance, customize your search experience" + link: "/search-plugins/" + - heading: "Workflow automation" + description: "Automate complex OpenSearch setup and preprocessing tasks" + link: "/automating-configurations/" + - heading: "Anomaly detection" + description: "Identify atypical data and receive automatic notifications" + link: "/monitoring-plugins/ad/" + - heading: "Building visualizations" + description: "Visualize your data in OpenSearch Dashboards" + link: "/dashboards/" +getting_started: + - heading: "Get started with OpenSearch" + description: "Learn about OpenSearch and start ingesting and searching data" + link: "/getting-started/" + - heading: "Get started with OpenSearch Dashboards" + description: "Learn about OpenSearch Dashboards applications and tools used to visualize data" + link: "/dashboards/quickstart/" + - heading: "Get started with vector search" + description: "Learn about vector search options and build your first vector search application" + link: "/search-plugins/" + - heading: "Get started with OpenSearch security" + description: "Learn about security in OpenSearch" + link: "/getting-started/security/" --- {%- comment -%}The `/docs/opensearch/` redirect is specifically to support the UI links in OpenSearch Dashboards 1.0.0.{%- endcomment -%} @@ -22,70 +67,20 @@ This section contains documentation for OpenSearch and OpenSearch Dashboards. 
## Getting started -To get started, explore the following documentation: - -- [Getting started guide]({{site.url}}{{site.baseurl}}/getting-started/): - - [Intro to OpenSearch]({{site.url}}{{site.baseurl}}/getting-started/intro/) - - [Installation quickstart]({{site.url}}{{site.baseurl}}/getting-started/quickstart/) - - [Communicate with OpenSearch]({{site.url}}{{site.baseurl}}/getting-started/communicate/) - - [Ingest data]({{site.url}}{{site.baseurl}}/getting-started/ingest-data/) - - [Search data]({{site.url}}{{site.baseurl}}/getting-started/search-data/) - - [Getting started with OpenSearch security]({{site.url}}{{site.baseurl}}/getting-started/security/) -- [Install OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/install-opensearch/index/) -- [Install OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/install-and-configure/install-dashboards/index/) -- [FAQ](https://opensearch.org/faq) +{% include cards.html cards=page.getting_started %} ## Why use OpenSearch? - - - - - - - - - - - - - - - - - - - - - -
-Fast, scalable full-text search | Application and infrastructure monitoring | Security and event information management | Operational health tracking
-Help users find the right information within your application, website, or data lake catalog. | Easily store and analyze log data, and set automated alerts for performance issues. | Centralize logs to enable real-time security monitoring and forensic analysis. | Use observability logs, metrics, and traces to monitor your applications in real time.
+{% include cards.html cards=page.why_use documentation_link=true %} ## Key features -OpenSearch provides several features to help index, secure, monitor, and analyze your data: - -- [Anomaly detection]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/) -- Identify atypical data and receive automatic notifications. -- [SQL]({{site.url}}{{site.baseurl}}/search-plugins/sql/index/) -- Use SQL or a Piped Processing Language (PPL) to query your data. -- [Index State Management]({{site.url}}{{site.baseurl}}/im-plugin/) -- Automate index operations. -- [Search methods]({{site.url}}{{site.baseurl}}/search-plugins/knn/) -- From traditional lexical search to advanced vector and hybrid search, discover the optimal search method for your use case. -- [Machine learning]({{site.url}}{{site.baseurl}}/ml-commons-plugin/index/) -- Integrate machine learning models into your workloads. -- [Workflow automation]({{site.url}}{{site.baseurl}}/automating-configurations/index/) -- Automate complex OpenSearch setup and preprocessing tasks. -- [Performance evaluation]({{site.url}}{{site.baseurl}}/monitoring-plugins/pa/) -- Monitor and optimize your cluster. -- [Asynchronous search]({{site.url}}{{site.baseurl}}/search-plugins/async/) -- Run search requests in the background. -- [Cross-cluster replication]({{site.url}}{{site.baseurl}}/replication-plugin/index/) -- Replicate your data across multiple OpenSearch clusters. - - -## The secure path forward - -OpenSearch includes a demo configuration so that you can get up and running quickly, but before using OpenSearch in a production environment, you must [configure the Security plugin manually]({{site.url}}{{site.baseurl}}/security/configuration/index/) with your own certificates, authentication method, users, and passwords. To get started, see [Getting started with OpenSearch security]({{site.url}}{{site.baseurl}}/getting-started/security/). - -## Looking for the Javadoc? +{% include cards.html cards=page.features%} -See [opensearch.org/javadocs/](https://opensearch.org/javadocs/). ## Get involved -[OpenSearch](https://opensearch.org) is supported by Amazon Web Services. All components are available under the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0.html) on [GitHub](https://github.com/opensearch-project/). +[OpenSearch](https://opensearch.org) is supported by the OpenSearch Software Foundation. All components are available under the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0.html) on [GitHub](https://github.com/opensearch-project/). The project welcomes GitHub issues, bug fixes, features, plugins, documentation---anything at all. To get involved, see [Contributing](https://opensearch.org/source.html) on the OpenSearch website. --- diff --git a/_about/version-history.md b/_about/version-history.md index b8b9e993097..c526f710f07 100644 --- a/_about/version-history.md +++ b/_about/version-history.md @@ -9,6 +9,12 @@ permalink: /version-history/ OpenSearch version | Release highlights | Release date :--- | :--- | :--- +[3.1.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-3.1.0.md) | Makes GPU acceleration for vector index builds generally available. Introduces memory-optimized search for Faiss indexes using Lucene HNSW, semantic field type for streamlined semantic search, and Search Relevance Workbench for search quality optimization. Makes star-tree indexes generally available with support for comprehensive query types. 
Enhances observability with ML Commons metrics integration, custom index support for OpenTelemetry data, and new PPL commands for JSON manipulation. Improves agent management with Update Agent API and persistent MCP tools. Includes security enhancements with immutable user objects and new resource sharing framework. For a full list of release highlights, see the Release Notes. | 24 June 2025 +[3.0.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-3.0.0.md) | Upgrades to Lucene 10 for improved indexing and vector search. Adds experimental gRPC support and pull-based ingestion from Kafka and Kinesis. Introduces GPU acceleration for vector operations and semantic sentence highlighting. Improves range query performance and hybrid search with z-score normalization. Adds plan-execute-reflect agents and native MCP protocol support for agentic workflows. Enhances security with a new Java agent replacing the Security Manager. Includes PPL query improvements with lookup, join, and subsearch commands. For a full list of release highlights, see the Release Notes. | 06 May 2025 +[2.19.2](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.19.2.md) | Improves query insights with better index handling, a new verbose API parameter, and a default index template. Fixes bugs across Query Insights, Observability, Flow Framework, and Dashboards. Includes multiple CVE fixes, test enhancements, and a new PGP key for artifact verification. For a full list of release highlights, see the Release Notes. | 29 April 2025 +[2.19.1](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.19.1.md) | Adds execution hint for cardinality aggregator. Includes bug fixes for ML Commons, Query Insights Dashboards, and Remote Metadata SDK. Contains maintenance updates for several components. For a full list of release highlights, see the Release Notes. | 27 February 2025 +[2.19.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.19.0.md) | Adds workload management, additional query insights, and template queries. Introduces a query insights page to OpenSearch Dashboards. Includes improvements and bug fixes to snapshots, search statistics, star-tree search, and index management. For a full list of release highlights, see the Release Notes. | 11 February 2025 +[2.18.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.18.0.md) | Adds a redesigned home page, updated Discover interface, and collaborative workspaces to OpenSearch Dashboards. Includes improvements to ML inference processor and query grouping. Introduces reranking by field and paginated CAT APIs. Includes experimental OpenSearch Dashboards Assistant capabilities. For a full list of release highlights, see the Release Notes. | 05 November 2024 [2.17.1](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.17.1.md) | Includes bug fixes for ML Commons, anomaly detection, k-NN, and security analytics. Adds various infrastructure and maintenance updates. For a full list of release highlights, see the Release Notes. | 1 October 2024 [2.17.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.17.0.md) | Includes disk-optimized vector search, binary quantization, and byte vector encoding in k-NN. 
Adds asynchronous batch ingestion for ML tasks. Provides search and query performance enhancements and a new custom trace source in trace analytics. Includes application-based configuration templates. For a full list of release highlights, see the Release Notes. | 17 September 2024 [2.16.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.16.0.md) | Includes built-in byte vector quantization and binary vector support in k-NN. Adds new sort, split, and ML inference search processors for search pipelines. Provides application-based configuration templates and additional plugins to integrate multiple data sources in OpenSearch Dashboards. Includes an experimental Batch Predict ML Commons API. For a full list of release highlights, see the Release Notes. | 06 August 2024 @@ -33,6 +39,7 @@ OpenSearch version | Release highlights | Release date [2.0.1](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.0.1.md) | Includes bug fixes and maintenance updates for Alerting and Anomaly Detection. | 16 June 2022 [2.0.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.0.0.md) | Includes document-level monitors for alerting, OpenSearch Notifications plugins, and Geo Map Tiles in OpenSearch Dashboards. Also adds support for Lucene 9 and bug fixes for all OpenSearch plugins. For a full list of release highlights, see the Release Notes. | 26 May 2022 [2.0.0-rc1](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.0.0-rc1.md) | The Release Candidate for 2.0.0. This version allows you to preview the upcoming 2.0.0 release before the GA release. The preview release adds document-level alerting, support for Lucene 9, and the ability to use term lookup queries in document level security. | 03 May 2022 +[1.3.20](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-1.3.20.md) | Includes enhancements to Anomaly Detection Dashboards, bug fixes for Alerting and Dashboards Reports, and maintenance updates for several OpenSearch components. | 11 December 2024 [1.3.19](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-1.3.19.md) | Includes bug fixes and maintenance updates for OpenSearch security, OpenSearch security Dashboards, and anomaly detection. | 27 August 2024 [1.3.18](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-1.3.18.md) | Includes maintenance updates for OpenSearch security. | 16 July 2024 [1.3.17](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-1.3.17.md) | Includes maintenance updates for OpenSearch security and OpenSearch Dashboards security. | 06 June 2024 diff --git a/_aggregations/bucket/auto-interval-date-histogram.md b/_aggregations/bucket/auto-interval-date-histogram.md index b7a95a3b89b..aa53562d260 100644 --- a/_aggregations/bucket/auto-interval-date-histogram.md +++ b/_aggregations/bucket/auto-interval-date-histogram.md @@ -105,6 +105,8 @@ GET /blogs/_search ``` {% include copy-curl.html %} +## Example response + The response shows that the blog posts were aggregated into two buckets. 
The interval was automatically set to 1 year, with all three 2022 blog posts collected in one bucket and the 2023 blog post in another: ```json diff --git a/_aggregations/bucket/children.md b/_aggregations/bucket/children.md index 1f493c4620f..c9b6c543766 100644 --- a/_aggregations/bucket/children.md +++ b/_aggregations/bucket/children.md @@ -2,47 +2,54 @@ layout: default title: Children parent: Bucket aggregations -grand_parent: Aggregations nav_order: 15 --- # Children -The `children` aggregation connects parent documents with their related child documents. This allows you to analyze relationships between different types of data in a single query, rather than needing to run multiple queries and combine the results manually. +The `children` aggregation is a bucket aggregation that creates a single bucket containing child documents, based on parent-child relationships defined in your index. ---- +The `children` aggregation works with the [join field type]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) to aggregate child documents that are associated with parent documents. + +The `children` aggregation identifies child documents that match specific child relation name, whereas the [`parent` aggregation]({{site.url}}{{site.baseurl}}/aggregations/bucket/parent/) identifies parent documents that have matching child documents. Both aggregations take the child relation name as input. -## Example index, sample data, and children aggregation query +## Parameters -For example, if you have a parent-child relationship between authors, posts, and comments, you can analyze the relationships between the different data types (`authors`, `posts`, and `comments`) in a single query. +The `children` aggregation takes the following parameters. -The `authors` aggregation groups the documents by the `author.keyword` field. This allows you to see the number of documents associates with each author. +| Parameter | Required/Optional | Data type | Description | +| :-- | :-- | :-- | :-- | +| `type` | Required | String | The name of the child type from the join field. This identifies the parent-child relationship to use. | -In each author group, the `children` aggregation retrieves the associated posts. This gives you a breakdown of the posts written by each author. -In the `posts` aggregation, another `children` aggregation fetches the comments associated with each post. This provides you a way to see the comments for each individual post. +## Example -In the `comments` aggregation, the `value_count` aggregation counts the number of comments on each post. This allows you to gauge the engagement level for each post by seeing the number of comments it has received. +The following example builds a small company database with three employees. The employee records each have a child `join` relationship with a parent department record. 
-#### Example index +First, create a `company` index with a `join` field that maps departments (parents) to employees (children): ```json -PUT /blog-sample +PUT /company { "mappings": { "properties": { - "type": { "type": "keyword" }, - "name": { "type": "keyword" }, - "title": { "type": "text" }, - "content": { "type": "text" }, - "author": { "type": "keyword" }, - "post_id": { "type": "keyword" }, "join_field": { "type": "join", "relations": { - "author": "post", - "post": "comment" + "department": "employee" } + }, + "department_name": { + "type": "keyword" + }, + "employee_name": { + "type": "keyword" + }, + "salary": { + "type": "double" + }, + "hire_date": { + "type": "date" } } } @@ -50,86 +57,64 @@ PUT /blog-sample ``` {% include copy-curl.html %} -#### Sample documents +Next, populate the data with three departments and three employees. The parent-child assignments are presented in the following table. -```json -POST /blog-sample/_doc/1?routing=1 -{ - "type": "author", - "name": "John Doe", - "join_field": "author" -} +| Department (parent) | Employees (children) | +| :-- | :-- | +| `Accounting` | `Abel Anderson`, `Betty Billings` | +| `Engineering` | `Carl Carter` | +| `HR` | none | -POST /blog-sample/_doc/2?routing=1 -{ - "type": "post", - "title": "Introduction to OpenSearch", - "content": "OpenSearch is a powerful search and analytics engine...", - "author": "John Doe", - "join_field": { - "name": "post", - "parent": "1" - } -} +The `routing` parameter ensures that both parent and child documents are stored on the same shard, which is required in order for parent-child relationships to function correctly in OpenSearch: -POST /blog-sample/_doc/3?routing=1 -{ - "type": "comment", - "content": "Great article! Very informative.", - "join_field": { - "name": "comment", - "parent": "2" - } -} - -POST /blog-sample/_doc/4?routing=1 -{ - "type": "comment", - "content": "Thanks for the clear explanation.", - "join_field": { - "name": "comment", - "parent": "2" - } -} +```json +POST _bulk?routing=1 +{ "create": { "_index": "company", "_id": "1" } } +{ "type": "department", "department_name": "Accounting", "join_field": "department" } +{ "create": { "_index": "company", "_id": "2" } } +{ "type": "department", "department_name": "Engineering", "join_field": "department" } +{ "create": { "_index": "company", "_id": "3" } } +{ "type": "department", "department_name": "HR", "join_field": "department" } +{ "create": { "_index": "company", "_id": "4" } } +{ "type": "employee", "employee_name": "Abel Anderson", "salary": 120000, "hire_date": "2024-04-04", "join_field": { "name": "employee", "parent": "1" } } +{ "create": { "_index": "company", "_id": "5" } } +{ "type": "employee", "employee_name": "Betty Billings", "salary": 140000, "hire_date": "2023-05-05", "join_field": { "name": "employee", "parent": "1" } } +{ "create": { "_index": "company", "_id": "6" } } +{ "type": "employee", "employee_name": "Carl Carter", "salary": 140000, "hire_date": "2020-06-06", "join_field": { "name": "employee", "parent": "2" } } ``` {% include copy-curl.html %} -#### Example children aggregation query +The following request queries all the departments and then filters for the one named `Accounting`. It then uses the `children` aggregation to select the two documents that have a child relationship with the `Accounting` department. 
Finally, the `avg` subaggregation returns the average of the `Accounting` employees' salaries: ```json -GET /blog-sample/_search +GET /company/_search { "size": 0, + "query": { + "bool": { + "filter": [ + { + "term": { + "join_field": "department" + } + }, + { + "term": { + "department_name": "Accounting" + } + } + ] + } + }, "aggs": { - "authors": { - "terms": { - "field": "name.keyword" + "acc_employees": { + "children": { + "type": "employee" }, "aggs": { - "posts": { - "children": { - "type": "post" - }, - "aggs": { - "post_titles": { - "terms": { - "field": "title.keyword" - }, - "aggs": { - "comments": { - "children": { - "type": "comment" - }, - "aggs": { - "comment_count": { - "value_count": { - "field": "_id" - } - } - } - } - } - } + "avg_salary": { + "avg": { + "field": "salary" } } } @@ -139,13 +124,13 @@ GET /blog-sample/_search ``` {% include copy-curl.html %} -#### Example response +## Example response -The response should appear similar to the following example: +The response returns the selected department bucket, finds the `employee` type children of the department, and computes the `avg` of their salaries: ```json { - "took": 30, + "took": 379, "timed_out": false, "_shards": { "total": 1, @@ -155,18 +140,19 @@ The response should appear similar to the following example: }, "hits": { "total": { - "value": 4, + "value": 1, "relation": "eq" }, "max_score": null, "hits": [] }, "aggregations": { - "authors": { - "doc_count_error_upper_bound": 0, - "sum_other_doc_count": 0, - "buckets": [] + "acc_employees": { + "doc_count": 2, + "avg_salary": { + "value": 110000 + } } } } -``` +``` \ No newline at end of file diff --git a/_aggregations/bucket/index.md b/_aggregations/bucket/index.md index e1a02b890db..430bc9c048d 100644 --- a/_aggregations/bucket/index.md +++ b/_aggregations/bucket/index.md @@ -22,6 +22,7 @@ You can use bucket aggregations to implement faceted navigation (usually placed OpenSearch supports the following bucket aggregations: - [Adjacency matrix]({{site.url}}{{site.baseurl}}/aggregations/bucket/adjacency-matrix/) +- [Auto-interval date histogram]({{site.url}}{{site.baseurl}}/aggregations/bucket/auto-interval-date-histogram/) - [Children]({{site.url}}{{site.baseurl}}/aggregations/bucket/children) - [Date histogram]({{site.url}}{{site.baseurl}}/aggregations/bucket/date-histogram/) - [Date range]({{site.url}}{{site.baseurl}}/aggregations/bucket/date-range/) @@ -38,7 +39,9 @@ OpenSearch supports the following bucket aggregations: - [Missing]({{site.url}}{{site.baseurl}}/aggregations/bucket/missing/) - [Multi-terms]({{site.url}}{{site.baseurl}}/aggregations/bucket/multi-terms/) - [Nested]({{site.url}}{{site.baseurl}}/aggregations/bucket/nested/) +- [Parent]({{site.url}}{{site.baseurl}}/aggregations/bucket/parent/) - [Range]({{site.url}}{{site.baseurl}}/aggregations/bucket/range/) +- [Rare terms]({{site.url}}{{site.baseurl}}/aggregations/bucket/rare-terms/) - [Reverse nested]({{site.url}}{{site.baseurl}}/aggregations/bucket/reverse-nested/) - [Sampler]({{site.url}}{{site.baseurl}}/aggregations/bucket/sampler/) - [Significant terms]({{site.url}}{{site.baseurl}}/aggregations/bucket/significant-terms/) diff --git a/_aggregations/bucket/nested.md b/_aggregations/bucket/nested.md index 89c44c6457d..affda8e4375 100644 --- a/_aggregations/bucket/nested.md +++ b/_aggregations/bucket/nested.md @@ -96,8 +96,8 @@ GET logs/_search "aggregations" : { "pages" : { "doc_count" : 2, - "min_price" : { - "value" : 200.0 + "min_load_time" : { + "value" : 200 } } } diff --git 
a/_aggregations/bucket/parent.md b/_aggregations/bucket/parent.md new file mode 100644 index 00000000000..012f89db7ce --- /dev/null +++ b/_aggregations/bucket/parent.md @@ -0,0 +1,153 @@ +--- +layout: default +title: Parent +parent: Bucket aggregations +nav_order: 145 +--- + +# Parent aggregations + +The `parent` aggregation is a bucket aggregation that creates a single bucket containing parent documents, based on parent-child relationships defined in your index. This aggregation enables you to perform analytics on parent documents that have the same matching child documents, allowing for powerful hierarchical data analysis. + +The `parent` aggregation works with the [`join` field type]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/), which establishes parent-child relationships within documents in the same index. + +The `parent` aggregation identifies parent documents that have matching child documents, whereas the [`children` aggregation]({{site.url}}{{site.baseurl}}/aggregations/bucket/children/) identifies child documents that match a certain child relation. Both aggregations take the child relation name as input. + + +## Parameters + +The `parent` aggregation takes the following parameters: + +| Parameter | Required/Optional | Data type | Description | +| :-- | :-- | :-- | :-- | +| `type` | Required | String | The name of the child type from the `join` field. | + +## Example + +The following example builds a small company database with three employees. The employee records each have a child `join` relationship with a parent department record. + +First, create a `company` index with a `join` field that maps departments (parents) to employees (children): + +```json +PUT /company +{ + "mappings": { + "properties": { + "join_field": { + "type": "join", + "relations": { + "department": "employee" + } + }, + "department_name": { + "type": "keyword" + }, + "employee_name": { + "type": "keyword" + }, + "salary": { + "type": "double" + }, + "hire_date": { + "type": "date" + } + } + } +} +``` +{% include copy-curl.html %} + +Next, populate the data with three departments and three employees. The parent-child assignments are presented in the following table. 
+ +| Department (parent) | Employees (children) | +| :-- | :-- | +| `Accounting` | `Abel Anderson`, `Betty Billings` | +| `Engineering` | `Carl Carter` | +| `HR` | none | + +The `routing` parameter ensures that both parent and child documents are stored on the same shard, which is required in order for parent-child relationships to function correctly in OpenSearch: + +```json +POST _bulk?routing=1 +{ "create": { "_index": "company", "_id": "1" } } +{ "type": "department", "department_name": "Accounting", "join_field": "department" } +{ "create": { "_index": "company", "_id": "2" } } +{ "type": "department", "department_name": "Engineering", "join_field": "department" } +{ "create": { "_index": "company", "_id": "3" } } +{ "type": "department", "department_name": "HR", "join_field": "department" } +{ "create": { "_index": "company", "_id": "4" } } +{ "type": "employee", "employee_name": "Abel Anderson", "salary": 120000, "hire_date": "2024-04-04", "join_field": { "name": "employee", "parent": "1" } } +{ "create": { "_index": "company", "_id": "5" } } +{ "type": "employee", "employee_name": "Betty Billings", "salary": 140000, "hire_date": "2023-05-05", "join_field": { "name": "employee", "parent": "1" } } +{ "create": { "_index": "company", "_id": "6" } } +{ "type": "employee", "employee_name": "Carl Carter", "salary": 140000, "hire_date": "2020-06-06", "join_field": { "name": "employee", "parent": "2" } } +``` +{% include copy-curl.html %} + +Lastly, run an aggregation of all the departments that have a parent relationship with one or more employees: + +```json +GET /company/_search +{ + "size": 0, + "aggs": { + "all_departments": { + "parent": { + "type": "employee" + }, + "aggs": { + "departments": { + "terms": { + "field": "department_name" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Example response + +The `all_departments` parent aggregation returns all the departments with employee child documents. Note that the HR department is not represented: + +```json +{ + "took": 3, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 6, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "all_departments": { + "doc_count": 2, + "departments": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets": [ + { + "key": "Accounting", + "doc_count": 1 + }, + { + "key": "Engineering", + "doc_count": 1 + } + ] + } + } + } +} +``` \ No newline at end of file diff --git a/_aggregations/bucket/rare-terms.md b/_aggregations/bucket/rare-terms.md new file mode 100644 index 00000000000..be03e6fccbf --- /dev/null +++ b/_aggregations/bucket/rare-terms.md @@ -0,0 +1,348 @@ +--- +layout: default +title: Rare terms +parent: Bucket aggregations +nav_order: 155 +--- + +# Rare terms aggregations + +The `rare_terms` aggregation is a bucket aggregation that identifies infrequent terms in a dataset. In contrast to the `terms` aggregation, which finds the most common terms, the `rare_terms` aggregation finds terms that appear with the lowest frequency. The `rare_terms` aggregation is suitable for applications like anomaly detection, long-tail analysis, and exception reporting. + +It is possible to use `terms` to search for infrequent values by ordering the returned values by ascending count (`"order": {"count": "asc"}`). However, we strongly discourage this practice because it can lead to inaccurate results when multiple shards are involved. 
A term that is globally infrequent might not appear as infrequent on every individual shard or might be entirely absent from the least frequent results returned by some shards. Conversely, a term that appears infrequently on one shard might be common on another. In both scenarios, rare terms can be missed during shard-level aggregation, resulting in incorrect overall results. Instead of the `terms` aggregation, we recommend using the `rare_terms` aggregation, which is specifically designed to handle these cases more accurately. +{: .warning} + +## Approximated results + +Computing exact results for the `rare_terms` aggregation necessitates compiling a complete map of the values on all shards, which requires excessive runtime memory. For this reason, the `rare_terms` aggregation results are approximated. + +Most errors in `rare_terms` computations are _false negatives_ or "missed" values, which define the _sensitivity_ of the aggregation's detection test. The `rare_terms` aggregation uses a CuckooFilter algorithm to achieve a balance of appropriate sensitivity and acceptable memory use. For a description of the CuckooFilter algorithm, see [this paper](https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf). + +## Controlling sensitivity + +Sensitivity error in the `rare_terms` aggregation algorithm is measured as the fraction of rare values that are missed, or `false negatives/target values`. For example, if the aggregation misses 100 rare values in a dataset with 5,000 rare values, sensitivity error is `100/5000 = 0.02`, or 2%. + +You can adjust the `precision` parameter in `rare_terms` aggregations to control the trade-off between sensitivity and memory use. + +These factors also affect the sensitivity-memory trade-off: + +- The total number of unique values +- The fraction of rare items in the dataset + +The following guidelines can help you decide which `precision` value to use. + +### Calculating memory use + +Runtime memory use is described in absolute terms, typically in MB of RAM. + +Memory use increases linearly with the number of unique items. The linear scaling factor varies from roughly 1.0 to 2.5 MB per 1 million unique values, depending on the `precision` parameter. For the default `precision` of `0.001`, the memory cost is about 1.75 MB per 1 million unique values. + +### Managing sensitivity error + +Sensitivity error increases linearly with the total number of unique values. For information about estimating the number of unique values, see [Cardinality aggregation]({{site.url}}{{site.baseurl}}/aggregations/metric/cardinality/). + +Sensitivity error rarely exceeds 2.5% at the default `precision`, even for datasets with 10--20 million unique values. For a `precision` of `0.00001`, sensitivity error is rarely above 0.6%. However, a very low absolute number of rare values can cause large variances in the error rate (if there are only two rare values, missing one of them results in a 50% error rate). + + +## Compatibility with other aggregations + +The `rare_terms` aggregation uses breadth-first collection mode and is incompatible with aggregations that require depth-first collection mode in some subaggregations and nesting configurations. + +For more information about breadth-first search in OpenSearch, see [Collect mode]({{site.url}}{{site.baseurl}}/aggregations/bucket/terms#collect-mode). + + +## Parameters + +The `rare_terms` aggregation takes the following parameters. 
+ +| Parameter | Required/Optional | Data type | Description | +| :-- | :-- | :-- | :-- | +| `field` | Required | String | The field to analyze for rare terms. Must be of a numeric type or a text type with a `keyword` mapping. | +| `max_doc_count` | Optional | Integer | The maximum number of documents in which a term can appear and still be considered rare. Default is `1`. Maximum is `100`. | +| `precision` | Optional | Double | Controls the precision of the algorithm used to identify rare terms. Lower values provide more precise results but consume more memory. Default is `0.001`. Minimum (most precise allowable) is `0.00001`. | +| `include` | Optional | Array/regex | Terms to include in the result. Can be a regular expression or an array of values. | +| `exclude` | Optional | Array/regex | Terms to exclude from the result. Can be a regular expression or an array of values. | +| `missing` | Optional | String | The value to use for documents that do not have a value for the field being aggregated. | + + +## Example + +The following request returns all destination airport codes that appear only once in the OpenSearch Dashboards sample flight data: + +```json +GET /opensearch_dashboards_sample_data_flights/_search +{ + "size": 0, + "aggs": { + "rare_destination": { + "rare_terms": { + "field": "DestAirportID", + "max_doc_count": 1 + } + } + } +} +``` +{% include copy-curl.html %} + +## Example response + +The response shows that there are two airports that meet the criterion of appearing only once in the data: + +```json +{ + "took": 12, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 10000, + "relation": "gte" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "rare_destination": { + "buckets": [ + { + "key": "ADL", + "doc_count": 1 + }, + { + "key": "BUF", + "doc_count": 1 + } + ] + } + } +} +``` + + +## Document count limit + +Use the `max_doc_count` parameter to specify the largest document count that a term can have and still be returned by the `rare_terms` aggregation. There is no limit on the number of terms returned by `rare_terms`, so a large `max_doc_count` value can potentially return very large result sets. For this reason, `100` is the largest allowable `max_doc_count`.
+ +The following request returns all destination airport codes that appear two times at most in the OpenSearch Dashboards sample flight data: + +```json +GET /opensearch_dashboards_sample_data_flights/_search +{ + "size": 0, + "aggs": { + "rare_destination": { + "rare_terms": { + "field": "DestAirportID", + "max_doc_count": 2 + } + } + } +} +``` +{% include copy-curl.html %} + +The response shows that seven destination airport codes meet the criterion of appearing in two or fewer documents, including the two from the previous example: + +```json +{ + "took": 6, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 10000, + "relation": "gte" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "rare_destination": { + "buckets": [ + { + "key": "ADL", + "doc_count": 1 + }, + { + "key": "BUF", + "doc_count": 1 + }, + { + "key": "ABQ", + "doc_count": 2 + }, + { + "key": "AUH", + "doc_count": 2 + }, + { + "key": "BIL", + "doc_count": 2 + }, + { + "key": "BWI", + "doc_count": 2 + }, + { + "key": "MAD", + "doc_count": 2 + } + ] + } + } +} +``` + + +## Filtering (include and exclude) + +Use the `include` and `exclude` parameters to filter values returned by the `rare_terms` aggregation. Both parameters can be included in the same aggregation. The `exclude` filter takes precedence; any excluded values are removed from the result, regardless of whether they were explicitly included. + +The arguments to `include` and `exclude` can be regular expressions (regex), including string literals, or arrays. Mixing regex and array arguments results in an error. For example, the following combination is not allowed: + +```json +"rare_terms": { + "field": "DestAirportID", + "max_doc_count": 2, + "exclude": ["ABQ", "AUH"], + "include": "A.*" +} +``` + + +### Example: Filtering + +The following example modifies the previous example to include all airport codes beginning with "A" but exclude the "ABQ" airport code: + +```json +GET /opensearch_dashboards_sample_data_flights/_search +{ + "size": 0, + "aggs": { + "rare_destination": { + "rare_terms": { + "field": "DestAirportID", + "max_doc_count": 2, + "include": "A.*", + "exclude": "ABQ" + } + } + } +} +``` +{% include copy-curl.html %} + +The response shows the two airport codes that meet the filtering requirements: + +```json +{ + "took": 4, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 10000, + "relation": "gte" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "rare_destination": { + "buckets": [ + { + "key": "ADL", + "doc_count": 1 + }, + { + "key": "AUH", + "doc_count": 2 + } + ] + } + } +} +``` + + +### Example: Filtering with array input + +The following example returns all destination airport codes that appear two times at most in the OpenSearch Dashboards sample flight data but specifies an array of airport codes to exclude: + +```json +GET /opensearch_dashboards_sample_data_flights/_search +{ + "size": 0, + "aggs": { + "rare_destination": { + "rare_terms": { + "field": "DestAirportID", + "max_doc_count": 2, + "exclude": ["ABQ", "BIL", "MAD"] + } + } + } +} +``` +{% include copy-curl.html %} + +The response omits the excluded airport codes: + +```json +{ + "took": 6, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 10000, + "relation": "gte" + }, + 
"max_score": null, + "hits": [] + }, + "aggregations": { + "rare_destination": { + "buckets": [ + { + "key": "ADL", + "doc_count": 1 + }, + { + "key": "BUF", + "doc_count": 1 + }, + { + "key": "AUH", + "doc_count": 2 + }, + { + "key": "BWI", + "doc_count": 2 + } + ] + } + } +} +``` \ No newline at end of file diff --git a/_aggregations/bucket/terms.md b/_aggregations/bucket/terms.md index b36214e3f6a..d82b8a092b8 100644 --- a/_aggregations/bucket/terms.md +++ b/_aggregations/bucket/terms.md @@ -59,6 +59,9 @@ GET opensearch_dashboards_sample_data_logs/_search The values are returned with the key `key`. `doc_count` specifies the number of documents in each bucket. By default, the buckets are sorted in descending order of `doc-count`. +It is possible to use `terms` to search for infrequent values by ordering the returned values by ascending count (`"order": {"count": "asc"}`). However, we strongly discourage this practice because it can lead to inaccurate results when multiple shards are involved. A term that is globally infrequent might not appear as infrequent on every individual shard or might be entirely absent from the least frequent results returned by some shards. Conversely, a term that appears infrequently on one shard might be common on another. In both scenarios, rare terms can be missed during shard-level aggregation, resulting in incorrect overall results. Instead of the `terms` aggregation, we recommend using the `rare_terms` aggregation, which is specifically designed to handle these cases more accurately. +{: .warning} + ## Size and shard size parameters @@ -112,7 +115,7 @@ While the `doc_count` field provides a representation of the number of individua * The field does not support nested arrays; only positive integers can be used. * If a document does not contain the `_doc_count` field, aggregation uses the document to increase the count by 1. -OpenSearch features that rely on an accurate document count illustrate the importance of using the `_doc_count` field. To see how this field can be used to support other search tools, refer to [Index rollups](https://opensearch.org/docs/latest/im-plugin/index-rollups/index/), an OpenSearch feature for the Index Management (IM) plugin that stores documents with pre-aggregated data in rollup indexes. +OpenSearch features that rely on an accurate document count illustrate the importance of using the `_doc_count` field. To see how this field can be used to support other search tools, refer to [Index rollups]({{site.url}}{{site.baseurl}}/im-plugin/index-rollups/index/), an OpenSearch feature for the Index Management (IM) plugin that stores documents with pre-aggregated data in rollup indexes. {: .tip} #### Example request diff --git a/_aggregations/metric/average.md b/_aggregations/metric/average.md index 9ad0c582fe2..7e87b694f50 100644 --- a/_aggregations/metric/average.md +++ b/_aggregations/metric/average.md @@ -9,9 +9,20 @@ redirect_from: # Average aggregations -The `avg` metric is a single-value metric aggregations that returns the average value of a field. +The `avg` metric is a single-value metric that returns the average value of a field. -The following example calculates the average of the `taxful_total_price` field: +## Parameters + +The `avg` aggregation takes the following parameters. + +| Parameter | Required/Optional | Data type | Description | +| :-- | :-- | :-- | :-- | +| `field` | Required | String | The field for which the average is computed. 
| +| `missing` | Optional | Float | The value to assign to missing instances of the field. By default, `avg` omits missing values from the calculation. | + +## Example + + The following example request calculates the average of the `taxful_total_price` field in the OpenSearch Dashboards e-commerce sample data: ```json GET opensearch_dashboards_sample_data_ecommerce/_search @@ -28,7 +39,9 @@ GET opensearch_dashboards_sample_data_ecommerce/_search ``` {% include copy-curl.html %} -#### Example response +## Example response + +The response contains the average of the `taxful_total_price`: ```json { @@ -55,3 +68,117 @@ GET opensearch_dashboards_sample_data_ecommerce/_search } } ``` + +You can use the aggregation name (`avg_taxful_total_price`) as a key to retrieve the aggregation from the response. + +## Missing values + +You can assign a value to missing instances of the aggregated field. See [Missing aggregations]({{site.url}}{{site.baseurl}}/aggregations/bucket/missing/) for more information. + +Prepare an example index by ingesting the following documents. Note that the second document is missing a `gpa` value: + +```json +POST _bulk +{ "create": { "_index": "students", "_id": "1" } } +{ "name": "John Doe", "gpa": 3.89, "grad_year": 2022} +{ "create": { "_index": "students", "_id": "2" } } +{ "name": "Jonathan Powers", "grad_year": 2025 } +{ "create": { "_index": "students", "_id": "3" } } +{ "name": "Jane Doe", "gpa": 3.52, "grad_year": 2024 } +``` +{% include copy-curl.html %} + +### Example: Replacing a missing value + +Take the average, replacing the missing GPA field with `0`: + +```json +GET students/_search +{ + "size": 0, + "aggs": { + "avg_gpa": { + "avg": { + "field": "gpa", + "missing": 0 + } + } + } +} +``` +{% include copy-curl.html %} + +The response is as follows. Compare to the next example, where missing values are ignored: + +```json +{ + "took": 12, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "avg_gpa": { + "value": 2.4700000286102295 + } + } +} +``` + +### Example: Ignoring a missing value + +Take the average but without assigning the `missing` parameter: + +```json +GET students/_search +{ + "size": 0, + "aggs": { + "avg_gpa": { + "avg": { + "field": "gpa" + } + } + } +} +``` +{% include copy-curl.html %} + +The aggregator calculates the average, omitting documents containing missing field values (the default behavior): + +```json +{ + "took": 255, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "avg_gpa": { + "value": 3.7050000429153442 + } + } +} +``` \ No newline at end of file diff --git a/_aggregations/metric/cardinality.md b/_aggregations/metric/cardinality.md index e03a561adb2..a2c678362c4 100644 --- a/_aggregations/metric/cardinality.md +++ b/_aggregations/metric/cardinality.md @@ -9,9 +9,25 @@ redirect_from: # Cardinality aggregations -The `cardinality` metric is a single-value metric aggregation that counts the number of unique or distinct values of a field. +The `cardinality` aggregation is a single-value metric aggregation that counts the number of unique or distinct values of a field. 
-The following example finds the number of unique products in an eCommerce store: + +Cardinality count is approximate. See [Controlling precision](#controlling-precision) for more information. + +## Parameters + +The `cardinality` aggregation takes the following parameters. + +| Parameter | Required/Optional | Data type | Description | +| :-- | :-- | :-- | :-- | +| `field` | Required | String | The field for which the cardinality is estimated. | +| `precision_threshold` | Optional | Numeric | The threshold below which counts are expected to be close to accurate. See [Controlling precision](#controlling-precision) for more information. | +| `execution_hint` | Optional | String | How to run the aggregation. Valid values are `ordinals` and `direct`. | +| `missing` | Optional | Same as `field`'s type | The bucket used to store missing instances of the field. If not provided, missing values are ignored. | + +## Example + +The following example request finds the number of unique product IDs in the OpenSearch Dashboards sample e-commerce data: ```json GET opensearch_dashboards_sample_data_ecommerce/_search @@ -28,22 +44,53 @@ GET opensearch_dashboards_sample_data_ecommerce/_search ``` {% include copy-curl.html %} -#### Example response +## Example response + +As shown in the following example response, the aggregation returns the cardinality count in the `unique_products` variable: ```json -... - "aggregations" : { - "unique_products" : { - "value" : 7033 +{ + "took": 176, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4675, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "unique_products": { + "value": 7033 } } } ``` -Cardinality count is approximate. -If you have tens of thousands of products in your hypothetical store, an accurate cardinality calculation requires loading all the values into a hash set and returning its size. This approach doesn't scale well; it requires huge amounts of memory and can cause high latencies. +## Controlling precision + +An accurate cardinality calculation requires loading all the values into a hash set and returning its size. This approach doesn't scale well; it can require huge amounts of memory and cause high latencies. + +You can control the trade-off between memory and accuracy by using the `precision_threshold` setting. This parameter sets the threshold below which counts are expected to be close to accurate. Counts higher than this value may be less accurate. -You can control the trade-off between memory and accuracy with the `precision_threshold` setting. This setting defines the threshold below which counts are expected to be close to accurate. Above this value, counts might become a bit less accurate. The default value of `precision_threshold` is 3,000. The maximum supported value is 40,000. +The default value of `precision_threshold` is 3,000. The maximum supported value is 40,000. + +The cardinality aggregation uses the [HyperLogLog++ algorithm](https://static.googleusercontent.com/media/research.google.com/fr//pubs/archive/40671.pdf). Cardinality counts are typically very accurate up to the precision threshold and are within 6% of the true count in most other cases, even with a threshold of as low as 100. + +### Precomputing hashes + +For high-cardinality string fields, storing hash values for the index field and computing the cardinality of the hash can save compute and memory resources. 
Use this approach with caution; it is more efficient only for sets with long strings and/or high cardinality. Numeric fields and less memory-consuming string sets are better processed directly. + +### Example: Controlling precision + +Set the precision threshold to `10000` unique values: ```json GET opensearch_dashboards_sample_data_ecommerce/_search @@ -58,4 +105,48 @@ GET opensearch_dashboards_sample_data_ecommerce/_search } } } -``` \ No newline at end of file +``` +{% include copy-curl.html %} + +The response is similar to the result with the default threshold, but the returned value is slightly different. Vary the `precision_threshold` parameter to see how it affects the cardinality estimate. + +## Configuring aggregation execution + +You can control how an aggregation runs using the `execution_hint` setting. This setting supports two options: + +- `direct` – Uses field values directly. +- `ordinals` – Uses ordinals of the field. + +If you don't specify `execution_hint`, OpenSearch automatically chooses the best option for the field. + +Setting `ordinals` on a non-ordinal field has no effect. Similarly, `direct` has no effect on ordinal fields. +{: .note} + +This is an expert-level setting. Ordinals use byte arrays, where the array size depends on the field's cardinality. High-cardinality fields can consume significant heap memory, increasing the risk of out-of-memory errors. +{: .warning} + +### Example: Controlling execution + +The following request runs a cardinality aggregation using ordinals: + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "unique_products": { + "cardinality": { + "field": "products.product_id", + "execution_hint": "ordinals" + } + } + } +} +``` +{% include copy-curl.html %} + +## Missing values + +You can assign a value to missing instances of the aggregated field. See [Missing aggregations]({{site.url}}{{site.baseurl}}/aggregations/bucket/missing/) for more information. + +Replacing missing values in a cardinality aggregation adds the replacement value to the list of unique values, increasing the actual cardinality by one. diff --git a/_aggregations/metric/extended-stats.md b/_aggregations/metric/extended-stats.md index 467fa348b78..79d3b6baa55 100644 --- a/_aggregations/metric/extended-stats.md +++ b/_aggregations/metric/extended-stats.md @@ -9,8 +9,41 @@ redirect_from: # Extended stats aggregations -The `extended_stats` aggregation is an extended version of the [`stats`]({{site.url}}{{site.baseurl}}/query-dsl/aggregations/metric/stats/) aggregation. Apart from including basic stats, `extended_stats` also returns stats such as `sum_of_squares`, `variance`, and `std_deviation`. -The following example returns extended stats for `taxful_total_price`: +The `extended_stats` aggregation is a more comprehensive version of the [`stats`]({{site.url}}{{site.baseurl}}/query-dsl/aggregations/metric/stats/) aggregation. As well as the basic statistical measures provided by `stats`, `extended_stats` calculates the following: + +- Sum of squares +- Variance +- Population variance +- Sampling variance +- Standard deviation +- Population standard deviation +- Sampling standard deviation +- Standard deviation bounds: + - Upper + - Lower + - Population upper + - Population lower + - Sampling upper + - Sampling lower + +The standard deviation and variance are population statistics; they are always equal to the population standard deviation and variance, respectively. 
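The population and sampling statistics follow the standard definitions. The formulas below are a reference sketch consistent with the example output shown later in this section, where $n$ is the number of field values, $\bar{x}$ is their mean, and `sigma` is the bounds multiplier (default `2`):

$$
\sigma^2_{\text{population}} = \frac{1}{n}\sum_{i=1}^{n}(x_i - \bar{x})^2,
\qquad
\sigma^2_{\text{sampling}} = \frac{1}{n-1}\sum_{i=1}^{n}(x_i - \bar{x})^2
$$

$$
\text{std\_deviation\_bounds.upper} = \bar{x} + \texttt{sigma} \cdot \sigma_{\text{population}},
\qquad
\text{std\_deviation\_bounds.lower} = \bar{x} - \texttt{sigma} \cdot \sigma_{\text{population}}
$$

The `_sampling` bounds are computed the same way using the sampling standard deviation. For example, for the two GPA values `3.52` and `3.89` used later in this section, the mean is `3.705`, the population variance is approximately `0.0342`, and the sampling variance is approximately `0.0685`, matching the values in the example response.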
+ +The `std_deviation_bounds` object defines a range that spans the specified number of standard deviations above and below the mean (default is two standard deviations). This object is always included in the output but is meaningful only for normally distributed data. Before interpreting these values, verify that your dataset follows a normal distribution. + +## Parameters + +The `extended_stats` aggregation takes the following parameters. + +| Parameter | Required/Optional | Data type | Description | +| :-- | :-- | :-- | :-- | +| `field` | Required | String | The name of the field for which the extended stats are returned. | +| `sigma` | Optional | Double (non-negative) | The number of standard deviations above and below the mean used to calculate the `std_deviation_bounds` interval. Default is `2`. | +| `missing` | Optional | Numeric | The value assigned to missing instances of the field. If not provided, documents containing missing values are omitted from the extended stats. | + +## Example + +The following example request returns extended stats for `taxful_total_price` in the OpenSearch Dashboards sample e-commerce data: + ```json GET opensearch_dashboards_sample_data_ecommerce/_search { @@ -26,7 +59,9 @@ GET opensearch_dashboards_sample_data_ecommerce/_search ``` {% include copy-curl.html %} -#### Example response +## Example response + +The response contains extended stats for `taxful_total_price`: ```json ... @@ -57,8 +92,13 @@ GET opensearch_dashboards_sample_data_ecommerce/_search } ``` -The `std_deviation_bounds` object provides a visual variance of the data with an interval of plus/minus two standard deviations from the mean. -To set the standard deviation to a different value, say 3, set `sigma` to 3: +## Defining bounds + +You can define the number of standard deviations used to calculate the `std_deviation_bounds` interval by setting the `sigma` parameter to any non-negative value. + +### Example: Defining bounds + +Set the number of `std_deviation_bounds` standard deviations to `3`: ```json GET opensearch_dashboards_sample_data_ecommerce/_search @@ -73,4 +113,145 @@ GET opensearch_dashboards_sample_data_ecommerce/_search } } } -``` \ No newline at end of file +``` +{% include copy-curl.html %} + +This changes the standard deviation bounds: + +```json +{ +... + "aggregations": { +... + "std_deviation_bounds": { + "upper": 233.44837084770438, + "lower": -83.33751356160813, + "upper_population": 233.44837084770438, + "lower_population": -83.33751356160813, + "upper_sampling": 233.46531398752978, + "lower_sampling": -83.35445670143353 + } + } + } +} +``` + +## Missing values + +You can assign a value to missing instances of the aggregated field. See [Missing aggregations]({{site.url}}{{site.baseurl}}/aggregations/bucket/missing/) for more information. 
+ +Prepare an example index by ingesting the following documents: + +```json +POST _bulk +{ "create": { "_index": "students", "_id": "1" } } +{ "name": "John Doe", "gpa": 3.89, "grad_year": 2022} +{ "create": { "_index": "students", "_id": "2" } } +{ "name": "Jonathan Powers", "grad_year": 2025 } +{ "create": { "_index": "students", "_id": "3" } } +{ "name": "Jane Doe", "gpa": 3.52, "grad_year": 2024 } +``` +{% include copy-curl.html %} + +### Example: Replacing a missing value + +Compute `extended_stats`, replacing the missing GPA field with `0`: + +```json +GET students/_search +{ + "size": 0, + "aggs": { + "extended_stats_gpa": { + "extended_stats": { + "field": "gpa", + "missing": 0 + } + } + } +} +``` +{% include copy-curl.html %} + +In the response, all missing values of `gpa` are replaced with `0`: + +```json +... + "aggregations": { + "extended_stats_gpa": { + "count": 3, + "min": 0, + "max": 3.890000104904175, + "avg": 2.4700000286102295, + "sum": 7.4100000858306885, + "sum_of_squares": 27.522500681877148, + "variance": 3.0732667526245145, + "variance_population": 3.0732667526245145, + "variance_sampling": 4.609900128936772, + "std_deviation": 1.7530735160353415, + "std_deviation_population": 1.7530735160353415, + "std_deviation_sampling": 2.147067797936705, + "std_deviation_bounds": { + "upper": 5.976147060680912, + "lower": -1.0361470034604534, + "upper_population": 5.976147060680912, + "lower_population": -1.0361470034604534, + "upper_sampling": 6.7641356244836395, + "lower_sampling": -1.8241355672631805 + } + } + } +} +``` + +### Example: Ignoring a missing value + +Compute `extended_stats` but without assigning the `missing` parameter: + +```json +GET students/_search +{ + "size": 0, + "aggs": { + "extended_stats_gpa": { + "extended_stats": { + "field": "gpa" + } + } + } +} +``` +{% include copy-curl.html %} + +OpenSearch calculates the extended statistics, omitting documents containing missing field values (the default behavior): + +```json +... + "aggregations": { + "extended_stats_gpa": { + "count": 2, + "min": 3.5199999809265137, + "max": 3.890000104904175, + "avg": 3.7050000429153442, + "sum": 7.4100000858306885, + "sum_of_squares": 27.522500681877148, + "variance": 0.03422502293587115, + "variance_population": 0.03422502293587115, + "variance_sampling": 0.0684500458717423, + "std_deviation": 0.18500006198883057, + "std_deviation_population": 0.18500006198883057, + "std_deviation_sampling": 0.2616295967044675, + "std_deviation_bounds": { + "upper": 4.075000166893005, + "lower": 3.334999918937683, + "upper_population": 4.075000166893005, + "lower_population": 3.334999918937683, + "upper_sampling": 4.228259236324279, + "lower_sampling": 3.1817408495064092 + } + } + } +} +``` + +The document containing the missing GPA value is omitted from this calculation. Note the difference in `count`. diff --git a/_aggregations/metric/geobounds.md b/_aggregations/metric/geobounds.md index 9489c6b18e8..7dfe5f937b8 100644 --- a/_aggregations/metric/geobounds.md +++ b/_aggregations/metric/geobounds.md @@ -7,11 +7,22 @@ redirect_from: - /query-dsl/aggregations/metric/geobounds/ --- -## Geobounds aggregations +# Geobounds aggregation -The `geo_bounds` metric is a multi-value metric aggregation that calculates the [geographic bounding box](https://docs.ogc.org/is/12-063r5/12-063r5.html#30) containing all values of a given `geo_point` or `geo_shape` field. The bounding box is returned as the upper-left and lower-right vertices of the rectangle in terms of latitude and longitude. 
+The `geo_bounds` aggregation is a multi-value aggregation that calculates the [geographic bounding box](https://docs.ogc.org/is/12-063r5/12-063r5.html#30) encompassing a set of [`geo_point`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/geo-point/) or [`geo_shape`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/geo-shape/) objects. The bounding box is returned as the upper-left and lower-right vertices of the rectangle given as a decimal-encoded latitude-longitude (lat-lon) pair. -The following example returns the `geo_bounds` metrics for the `geoip.location` field: +## Parameters + +The `geo_bounds` aggregation takes the following parameters. + +| Parameter | Required/Optional | Data type | Description | +| :-- | :-- | :-- | :-- | +| `field` | Required | String | The name of the field containing the geopoints or geoshapes for which the geobounds are computed. | +| `wrap_longitude` | Optional | Boolean | Whether to allow the bounding box to overlap the international date line. Default is `true`. | + +## Example + +The following example returns the `geo_bounds` for the `geoip.location` of every order in the e-commerce sample data (each `geoip.location` is a geopoint): ```json GET opensearch_dashboards_sample_data_ecommerce/_search @@ -26,30 +37,52 @@ GET opensearch_dashboards_sample_data_ecommerce/_search } } ``` +{% include copy-curl.html %} + +## Example response -#### Example response +As shown in the following example response, the aggregation returns the `geobounds` containing all geopoints in the `geoip.location` field: ```json -"aggregations" : { - "geo" : { - "bounds" : { - "top_left" : { - "lat" : 52.49999997206032, - "lon" : -118.20000001229346 - }, - "bottom_right" : { - "lat" : 4.599999985657632, - "lon" : 55.299999956041574 +{ + "took": 16, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4675, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "geo": { + "bounds": { + "top_left": { + "lat": 52.49999997206032, + "lon": -118.20000001229346 + }, + "bottom_right": { + "lat": 4.599999985657632, + "lon": 55.299999956041574 + } } } } - } } ``` ## Aggregating geoshapes -To run an aggregation on a geoshape field, first create an index and map the `location` field as a `geo_shape`: +You can run a `geo_bounds` aggregation on geoshapes. + +Prepare an example by inserting an index containing a geoshape field: ```json PUT national_parks @@ -65,48 +98,91 @@ PUT national_parks ``` {% include copy-curl.html %} -Next, index some documents into the `national_parks` index: +Ingest documents into the index. 
GeoJSON input specifies longitude first: ```json -PUT national_parks/_doc/1 -{ - "name": "Yellowstone National Park", - "location": - {"type": "envelope","coordinates": [ [-111.15, 45.12], [-109.83, 44.12] ]} -} +POST _bulk +{ "create": { "_index": "national_parks", "_id": "1" } } +{"name": "Yellowstone National Park", "location": {"type": "envelope","coordinates": [ [-111.15, 45.12], [-109.83, 44.12] ]}} +{ "create": { "_index": "national_parks", "_id": "2" } } +{ "name": "Yosemite National Park", "location": {"type": "envelope","coordinates": [ [-120.23, 38.16], [-119.05, 37.45] ]} } +{ "create": { "_index": "national_parks", "_id": "3" } } +{ "name": "Death Valley National Park", "location": {"type": "envelope","coordinates": [ [-117.34, 37.01], [-116.38, 36.25] ]} } +{ "create": { "_index": "national_parks", "_id": "4" } } +{ "name": "War In The Pacific National Historic Park Guam", "location": {"type": "point","coordinates": [144.72, 13.47]} } ``` {% include copy-curl.html %} +Run a `geo_bounds` aggregation on the `location` field: + ```json -PUT national_parks/_doc/2 +GET national_parks/_search { - "name": "Yosemite National Park", - "location": - {"type": "envelope","coordinates": [ [-120.23, 38.16], [-119.05, 37.45] ]} + "size": 0, + "aggregations": { + "grouped": { + "geo_bounds": { + "field": "location", + "wrap_longitude": true + } + } + } } ``` {% include copy-curl.html %} +The response contains the smallest geo-bounding box that encloses all shapes in the `location` field: + ```json -PUT national_parks/_doc/3 { - "name": "Death Valley National Park", - "location": - {"type": "envelope","coordinates": [ [-117.34, 37.01], [-116.38, 36.25] ]} + "took": 8, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "grouped": { + "bounds": { + "top_left": { + "lat": 45.11999997776002, + "lon": 144.71999991685152 + }, + "bottom_right": { + "lat": 13.469999986700714, + "lon": -109.83000006526709 + } + } + } + } } ``` -{% include copy-curl.html %} -You can run a `geo_bounds` aggregation on the `location` field as follows: +## Wrapping longitude + +If the optional `wrap_longitude` parameter is set to `true`, the bounding box can overlap the international date line (180° meridian) and return a `bounds` object in which the upper-left longitude is greater than the lower-right longitude. The default value for `wrap_longitude` is `true`. + +Rerun the `geo_bounds` aggregation on the national parks geoshape with `wrap_longitude` set to `false`: ```json GET national_parks/_search { + "size": 0, "aggregations": { "grouped": { "geo_bounds": { "field": "location", - "wrap_longitude": true + "wrap_longitude": false } } } @@ -114,115 +190,27 @@ GET national_parks/_search ``` {% include copy-curl.html %} -The optional `wrap_longitude` parameter specifies whether the bounding box returned by the aggregation can overlap the international date line (180° meridian). If `wrap_longitude` is set to `true`, the bounding box can overlap the international date line and return a `bounds` object in which the lower-left longitude is greater than the upper-right longitude. The default value for `wrap_longitude` is `true`. - -The response contains the geo-bounding box that encloses all shapes in the `location` field: - -
- - Response - - {: .text-delta} +Note that the new resulting geobound encompasses a larger area to avoid overlapping the dateline: ```json { - "took" : 3, - "timed_out" : false, - "_shards" : { - "total" : 1, - "successful" : 1, - "skipped" : 0, - "failed" : 0 - }, - "hits" : { - "total" : { - "value" : 3, - "relation" : "eq" - }, - "max_score" : 1.0, - "hits" : [ - { - "_index" : "national_parks", - "_id" : "1", - "_score" : 1.0, - "_source" : { - "name" : "Yellowstone National Park", - "location" : { - "type" : "envelope", - "coordinates" : [ - [ - -111.15, - 45.12 - ], - [ - -109.83, - 44.12 - ] - ] - } - } - }, - { - "_index" : "national_parks", - "_id" : "2", - "_score" : 1.0, - "_source" : { - "name" : "Yosemite National Park", - "location" : { - "type" : "envelope", - "coordinates" : [ - [ - -120.23, - 38.16 - ], - [ - -119.05, - 37.45 - ] - ] - } - } - }, - { - "_index" : "national_parks", - "_id" : "3", - "_score" : 1.0, - "_source" : { - "name" : "Death Valley National Park", - "location" : { - "type" : "envelope", - "coordinates" : [ - [ - -117.34, - 37.01 - ], - [ - -116.38, - 36.25 - ] - ] - } - } - } - ] - }, - "aggregations" : { - "Grouped" : { - "bounds" : { - "top_left" : { - "lat" : 45.11999997776002, - "lon" : -120.23000006563962 +... + "aggregations": { + "grouped": { + "bounds": { + "top_left": { + "lat": 45.11999997776002, + "lon": -120.23000006563962 }, - "bottom_right" : { - "lat" : 36.249999976716936, - "lon" : -109.83000006526709 + "bottom_right": { + "lat": 13.469999986700714, + "lon": 144.71999991685152 } } } } } ``` -
-Currently, OpenSearch supports geoshape aggregation through the API but not in OpenSearch Dashboards visualizations. If you'd like to see geoshape aggregation implemented for visualizations, upvote the related [GitHub issue](https://github.com/opensearch-project/dashboards-maps/issues/250). +OpenSearch supports geoshape aggregation through the API but not in OpenSearch Dashboards visualizations. {: .note} diff --git a/_aggregations/metric/geocentroid.md b/_aggregations/metric/geocentroid.md index 14a2d179bb1..509e7892e3d 100644 --- a/_aggregations/metric/geocentroid.md +++ b/_aggregations/metric/geocentroid.md @@ -7,62 +7,29 @@ nav_order: 45 # Geocentroid -The OpenSearch `geo_centroid` aggregation is a powerful tool that allows you to calculate the weighted geographic center or focal point of a set of spatial data points. This metric aggregation operates on `geo_point` fields and returns the centroid location as a latitude-longitude pair. +The `geo_centroid` aggregation calculates the geographic center or focal point of a set of `geo_point` values. It returns the centroid location as a latitude-longitude pair. -## Using the aggregation +## Parameters -Follow these steps to use the `geo_centroid` aggregation: +The `geo_centroid` aggregation takes the following parameter. -**1. Create an index with a `geopoint` field** +| Parameter | Required/Optional | Data type | Description | +| :-- | :-- | :-- | :-- | +| `field` | Required | String | The name of the field containing the geopoints for which the geocentroid is computed. | -First, you need to create an index with a `geo_point` field type. This field stores the geographic coordinates you want to analyze. For example, to create an index called `restaurants` with a `location` field of type `geo_point`, use the following request: +## Example -```json -PUT /restaurants -{ - "mappings": { - "properties": { - "name": { - "type": "text" - }, - "location": { - "type": "geo_point" - } - } - } -} -``` -{% include copy-curl.html %} +The following example returns the `geo_centroid` for the `geoip.location` of every order in the e-commerce sample data. Each `geoip.location` is a geopoint: -**2. Index documents with spatial data** - -Next, index your documents containing the spatial data points you want to analyze. Make sure to include the `geo_point` field with the appropriate latitude-longitude coordinates. For example, index your documents using the following request: ```json -POST /restaurants/_bulk?refresh -{"index": {"_id": 1}} -{"name": "Cafe Delish", "location": "40.7128, -74.0059"} -{"index": {"_id": 2}} -{"name": "Tasty Bites", "location": "51.5074, -0.1278"} -{"index": {"_id": 3}} -{"name": "Sushi Palace", "location": "48.8566, 2.3522"} -{"index": {"_id": 4}} -{"name": "Burger Joint", "location": "34.0522, -118.2437"} -``` -{% include copy-curl.html %} - -**3. Run the `geo_centroid` aggregation** - -To caluculate the centroid location across all documents, run a search with the `geo_centroid` aggregation on the `geo_point` field. 
For example, use the following request: - -```json -GET /restaurants/_search +GET /opensearch_dashboards_sample_data_ecommerce/_search { "size": 0, "aggs": { "centroid": { "geo_centroid": { - "field": "location" + "field": "geoip.location" } } } @@ -70,101 +37,65 @@ GET /restaurants/_search ``` {% include copy-curl.html %} -The response includes a `centroid` object with `lat` and `lon` properties representing the weighted centroid location of all indexed data point, as shown in the following example: - - ```json - "aggregations": { - "centroid": { - "location": { - "lat": 43.78224998130463, - "lon": -47.506300045643 - }, - "count": 4 -``` -{% include copy-curl.html %} - -**4. Nest under other aggregations (optional)** +## Example response -You can also nest the `geo_centroid` aggregation under other bucket aggregations, such as `terms`, to calculate the centroid for subsets of your data. For example, to find the centroid location for each city, use the following request: +The response includes a `centroid` object with `lat` and `lon` properties representing the centroid location of all indexed data points: ```json -GET /restaurants/_search { - "size": 0, - "aggs": { - "cities": { - "terms": { - "field": "city.keyword" + "took": 35, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4675, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "centroid": { + "location": { + "lat": 35.54990372113027, + "lon": -9.079764742533712 }, - "aggs": { - "centroid": { - "geo_centroid": { - "field": "location" - } - } - } + "count": 4675 } } } ``` -{% include copy-curl.html %} - -This returns a centroid location for each city bucket, allowing you to analyze the geographic center of data points in different cities. - -## Using `geo_centroid` with the `geohash_grid` aggregation -The `geohash_grid` aggregation partitions geospatial data into buckets based on geohash prefixes. +The centroid location is in the Atlantic Ocean north of Morocco. This is not very meaningful, given the wide geographical dispersion of orders in the database. -When a document contains multiple geopoint values in a field, the `geohash_grid` aggregation assigns the document to multiple buckets, even if one or more of its geopoints are outside the bucket boundaries. This behavior is different from how individual geopoints are treated, where only those within the bucket boundaries are considered. +## Nesting under other aggregations -When you nest the `geo_centroid` aggregation under the `geohash_grid` aggregation, each centroid is calculated using all geopoints in a bucket, including those that may be outside the bucket boundaries. This can result in centroid locations that fall outside the geographic area represented by the bucket. +You can nest the `geo_centroid` aggregation inside bucket aggregations to calculate the centroid for subsets of your data. -#### Example +### Example: Nesting under a terms aggregation -In this example, the `geohash_grid` aggregation with a `precision` of `3` creates buckets based on geohash prefixes of length `3`. Because each document has multiple geopoints, they may be assigned to multiple buckets, even if some of the geopoints fall outside the bucket boundaries. +You can nest `geo_centroid` aggregations under `terms` buckets of a string field. 
-The `geo_centroid` subaggregation calculates the centroid for each bucket using all geopoints assigned to that bucket, including those outside the bucket boundaries. This means that the resulting centroid locations may not necessarily lie within the geographic area represented by the corresponding geohash bucket. - -First, create an index and index documents containing multiple geopoints: +To find the centroid location of `geoip` for the orders on each continent, sub-aggregate the centroid within the `geoip.continent_name` field: ```json -PUT /locations -{ - "mappings": { - "properties": { - "name": { - "type": "text" - }, - "coordinates": { - "type": "geo_point" - } - } - } -} - -POST /locations/_bulk?refresh -{"index": {"_id": 1}} -{"name": "Point A", "coordinates": ["40.7128, -74.0059", "51.5074, -0.1278"]} -{"index": {"_id": 2}} -{"name": "Point B", "coordinates": ["48.8566, 2.3522", "34.0522, -118.2437"]} -``` - -Then, run `geohash_grid` with the `geo_centroid` subaggregation: - -```json -GET /locations/_search +GET /opensearch_dashboards_sample_data_ecommerce/_search { "size": 0, "aggs": { - "grid": { - "geohash_grid": { - "field": "coordinates", - "precision": 3 + "continents": { + "terms": { + "field": "geoip.continent_name" }, "aggs": { "centroid": { "geo_centroid": { - "field": "coordinates" + "field": "geoip.location" } } } @@ -174,15 +105,17 @@ GET /locations/_search ``` {% include copy-curl.html %} -
-   -    Response -   -  {: .text-delta} +This returns a centroid location for each continent's bucket: + +
+ + Response + + {: .text-delta} ```json { - "took": 26, + "took": 34, "timed_out": false, "_shards": { "total": 1, @@ -192,57 +125,70 @@ GET /locations/_search }, "hits": { "total": { - "value": 2, + "value": 4675, "relation": "eq" }, "max_score": null, "hits": [] }, "aggregations": { - "grid": { + "continents": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, "buckets": [ { - "key": "u09", - "doc_count": 1, + "key": "Asia", + "doc_count": 1220, "centroid": { "location": { - "lat": 41.45439997315407, - "lon": -57.945750039070845 + "lat": 28.023606536509163, + "lon": 47.83377046025068 }, - "count": 2 + "count": 1220 } }, { - "key": "gcp", - "doc_count": 1, + "key": "North America", + "doc_count": 1206, "centroid": { "location": { - "lat": 46.11009998945519, - "lon": -37.06685005221516 + "lat": 39.06542286878007, + "lon": -85.36152573149485 }, - "count": 2 + "count": 1206 } }, { - "key": "dr5", - "doc_count": 1, + "key": "Europe", + "doc_count": 1172, "centroid": { "location": { - "lat": 46.11009998945519, - "lon": -37.06685005221516 + "lat": 48.125767892293325, + "lon": 2.7529009746915243 }, - "count": 2 + "count": 1172 } }, { - "key": "9q5", - "doc_count": 1, + "key": "Africa", + "doc_count": 899, "centroid": { "location": { - "lat": 41.45439997315407, - "lon": -57.945750039070845 + "lat": 30.780756367941297, + "lon": 13.464182392125318 }, - "count": 2 + "count": 899 + } + }, + { + "key": "South America", + "doc_count": 178, + "centroid": { + "location": { + "lat": 4.599999985657632, + "lon": -74.10000007599592 + }, + "count": 178 } } ] @@ -250,6 +196,4 @@ GET /locations/_search } } ``` -{% include copy-curl.html %} -
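
+You can also restrict the centroid calculation to a single subset of documents by nesting `geo_centroid` under a `filter` aggregation. The following request is a minimal sketch of this approach; it assumes the same e-commerce sample index and the `geoip.continent_name` keyword field used in the preceding `terms` example, and the aggregation names `europe_orders` and `europe_centroid` are illustrative. It computes the centroid of only the European orders:
+
+```json
+GET opensearch_dashboards_sample_data_ecommerce/_search
+{
+  "size": 0,
+  "aggs": {
+    "europe_orders": {
+      "filter": {
+        "term": {
+          "geoip.continent_name": "Europe"
+        }
+      },
+      "aggs": {
+        "europe_centroid": {
+          "geo_centroid": {
+            "field": "geoip.location"
+          }
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+Because the filter matches the same documents as the `Europe` bucket in the preceding `terms` example, the returned centroid should match that bucket's centroid.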
diff --git a/_aggregations/metric/matrix-stats.md b/_aggregations/metric/matrix-stats.md index 188f8745fb2..b81a948820d 100644 --- a/_aggregations/metric/matrix-stats.md +++ b/_aggregations/metric/matrix-stats.md @@ -9,8 +9,24 @@ redirect_from: # Matrix stats aggregations -The `matrix_stats` aggregation generates advanced stats for multiple fields in a matrix form. -The following example returns advanced stats in a matrix form for the `taxful_total_price` and `products.base_price` fields: +The `matrix_stats` aggregation is a multi-value metric aggregation that generates covariance statistics for two or more fields in matrix form. + +The `matrix_stats` aggregation does not support scripting. +{: .note} + +## Parameters + +The `matrix_stats` aggregation takes the following parameters. + +| Parameter | Required/Optional | Data type | Description | +| :-- | :-- | :-- | :-- | +| `fields` | Required | String | An array of fields for which the matrix stats are computed. | +| `missing` | Optional | Object | The value to use in place of missing values. By default, missing values are ignored. See [Missing values](#missing-values). | +| `mode` | Optional | String | The value to use as a sample from a multi-valued or array field. Allowed values are `avg`, `min`, `max`, `sum`, and `median`. Default is `avg`. | + +## Example + +The following example returns statistics for the `taxful_total_price` and `products.base_price` fields in the OpenSearch Dashboards e-commerce sample data: ```json GET opensearch_dashboards_sample_data_ecommerce/_search @@ -27,60 +43,262 @@ GET opensearch_dashboards_sample_data_ecommerce/_search ``` {% include copy-curl.html %} -#### Example response +The response containes the aggregated results: ```json -... -"aggregations" : { - "matrix_stats_taxful_total_price" : { - "doc_count" : 4675, - "fields" : [ - { - "name" : "products.base_price", - "count" : 4675, - "mean" : 34.994239430147196, - "variance" : 360.5035285833703, - "skewness" : 5.530161335032702, - "kurtosis" : 131.16306324042148, - "covariance" : { - "products.base_price" : 360.5035285833703, - "taxful_total_price" : 846.6489362233166 +{ + "took": 250, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4675, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "matrix_stats_taxful_total_price": { + "doc_count": 4675, + "fields": [ + { + "name": "products.base_price", + "count": 4675, + "mean": 34.99423943014724, + "variance": 360.5035285833702, + "skewness": 5.530161335032689, + "kurtosis": 131.1630632404217, + "covariance": { + "products.base_price": 360.5035285833702, + "taxful_total_price": 846.6489362233169 + }, + "correlation": { + "products.base_price": 1, + "taxful_total_price": 0.8444765264325269 + } }, - "correlation" : { - "products.base_price" : 1.0, - "taxful_total_price" : 0.8444765264325268 + { + "name": "taxful_total_price", + "count": 4675, + "mean": 75.05542864304839, + "variance": 2788.1879749835425, + "skewness": 15.812149139923994, + "kurtosis": 619.1235507385886, + "covariance": { + "products.base_price": 846.6489362233169, + "taxful_total_price": 2788.1879749835425 + }, + "correlation": { + "products.base_price": 0.8444765264325269, + "taxful_total_price": 1 + } } - }, - { - "name" : "taxful_total_price", - "count" : 4675, - "mean" : 75.05542864304839, - "variance" : 2788.1879749835402, - "skewness" : 15.812149139924037, - "kurtosis" : 619.1235507385902, - "covariance" : { - 
"products.base_price" : 846.6489362233166, - "taxful_total_price" : 2788.1879749835402 + ] + } + } +} +``` + +The following table describes the response fields. + +| Statistic | Description | +| :--- | :--- | +| `count` | The number of documents sampled for the aggregation. | +| `mean` | The average value of the field computed from the sample. | +| `variance` | The square of deviation from the mean, a measure of data spread. | +| `skewness` | A measure of the distribution's asymmetry relative to the mean. See [Skewness](https://en.wikipedia.org/wiki/Skewness). | +| `kurtosis` | A measure of the tail-heaviness of a distribution. As the tails become lighter, kurtosis decreases. Kurtosis and skewness are evaluated to determine whether a population is likely to be [normally distributed](https://en.wikipedia.org/wiki/Normal_distribution). See [Kurtosis](https://en.wikipedia.org/wiki/Kurtosis).| +| `covariance` | A measure of the joint variability between two fields. A positive value means their values move in the same direction. | +| `correlation` | The normalized covariance, a measure of the strength of the relationship between two fields. Possible values are from -1 to 1, inclusive, indicating perfect negative to perfect positive linear correlation. A value of 0 indicates no discernible relationship between the variables. | + +## Missing values + +To define how missing values are treated, use the `missing` parameter. By default, missing values are ignored. + +For example, create an index in which document 1 is missing the `gpa` and `class_grades` fields: + +```json +POST _bulk +{ "create": { "_index": "students", "_id": "1" } } +{ "name": "John Doe" } +{ "create": { "_index": "students", "_id": "2" } } +{ "name": "Jonathan Powers", "gpa": 3.85, "class_grades": [3.0, 3.9, 4.0] } +{ "create": { "_index": "students", "_id": "3" } } +{ "name": "Jane Doe", "gpa": 3.52, "class_grades": [3.2, 2.1, 3.8] } +``` +{% include copy-curl.html %} + +First, run a `matrix_stats` aggregation without providing a `missing` parameter: + +```json +GET students/_search +{ + "size": 0, + "aggs": { + "matrix_stats_taxful_total_price": { + "matrix_stats": { + "fields": [ + "gpa", + "class_grades" + ], + "mode": "avg" + } + } + } +} +``` +{% include copy-curl.html %} + +OpenSearch ignores missing values when calculating the matrix statistics: + +```json +{ + "took": 5, + "timed_out": false, + "terminated_early": true, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "matrix_stats_taxful_total_price": { + "doc_count": 2, + "fields": [ + { + "name": "gpa", + "count": 2, + "mean": 3.684999942779541, + "variance": 0.05444997482300096, + "skewness": 0, + "kurtosis": 1, + "covariance": { + "gpa": 0.05444997482300096, + "class_grades": 0.09899998760223136 + }, + "correlation": { + "gpa": 1, + "class_grades": 0.9999999999999991 + } }, - "correlation" : { - "products.base_price" : 0.8444765264325268, - "taxful_total_price" : 1.0 + { + "name": "class_grades", + "count": 2, + "mean": 3.333333333333333, + "variance": 0.1800000381469746, + "skewness": 0, + "kurtosis": 1, + "covariance": { + "gpa": 0.09899998760223136, + "class_grades": 0.1800000381469746 + }, + "correlation": { + "gpa": 0.9999999999999991, + "class_grades": 1 + } + } + ] + } + } +} +``` + +To set the missing fields to `0`, provide the `missing` parameter as a key-value map. 
Even though `class_grades` is an array field, the `matrix_stats` aggregation flattens multi-valued numeric fields into a per-document average, so you must supply a single number as the missing value: + +```json +GET students/_search +{ + "size": 0, + "aggs": { + "matrix_stats_taxful_total_price": { + "matrix_stats": { + "fields": ["gpa", "class_grades"], + "mode": "avg", + "missing": { + "gpa": 0, + "class_grades": 0 } } - ] + } } - } } ``` +{% include copy-curl.html %} -The following table lists all response fields. - -Statistic | Description -:--- | :--- -`count` | The number of samples measured. -`mean` | The average value of the field measured from the sample. -`variance` | How far the values of the field measured are spread out from its mean value. The larger the variance, the more it's spread from its mean value. -`skewness` | An asymmetric measure of the distribution of the field's values around the mean. -`kurtosis` | A measure of the tail heaviness of a distribution. As the tail becomes lighter, kurtosis decreases. As the tail becomes heavier, kurtosis increases. To learn about kurtosis, see [Wikipedia](https://en.wikipedia.org/wiki/Kurtosis). -`covariance` | A measure of the joint variability between two fields. A positive value means their values move in the same direction and the other way around. -`correlation` | A measure of the strength of the relationship between two fields. The valid values are between [-1, 1]. A value of -1 means that the value is negatively correlated and a value of 1 means that it's positively correlated. A value of 0 means that there's no identifiable relationship between them. \ No newline at end of file +OpenSearch substitutes `0` for any missing `gpa` or `class_grades` values when calculating the matrix statistics: + +```json +{ + "took": 23, + "timed_out": false, + "terminated_early": true, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "matrix_stats_taxful_total_price": { + "doc_count": 3, + "fields": [ + { + "name": "gpa", + "count": 3, + "mean": 2.456666628519694, + "variance": 4.55363318017324, + "skewness": -0.688130006360758, + "kurtosis": 1.5, + "covariance": { + "gpa": 4.55363318017324, + "class_grades": 4.143944374667273 + }, + "correlation": { + "gpa": 1, + "class_grades": 0.9970184390038257 + } + }, + { + "name": "class_grades", + "count": 3, + "mean": 2.2222222222222223, + "variance": 3.793703722777191, + "skewness": -0.6323693521730989, + "kurtosis": 1.5000000000000002, + "covariance": { + "gpa": 4.143944374667273, + "class_grades": 3.793703722777191 + }, + "correlation": { + "gpa": 0.9970184390038257, + "class_grades": 1 + } + } + ] + } + } +} +``` diff --git a/_aggregations/metric/maximum.md b/_aggregations/metric/maximum.md index 1a1aaff6072..51536bff4a2 100644 --- a/_aggregations/metric/maximum.md +++ b/_aggregations/metric/maximum.md @@ -9,18 +9,32 @@ redirect_from: # Maximum aggregations -The `max` metric is a single-value metric aggregations that returns the maximum value of a field. +The `max` metric is a single-value metric that returns the maximum value of a field. -The following example calculates the maximum of the `taxful_total_price` field: +The `max` aggregation compares numeric fields using a `double` (double-precision) representation. 
Results should be considered approximate for fields containing `long` or `unsigned_long` integer values greater than 2^53 because the number of significant bits in a `double` mantissa is 53.
+{: .note}
+
+## Parameters
+
+The `max` aggregation takes the following parameters.
+
+| Parameter | Required/Optional | Data type | Description |
+| :-- | :-- | :-- | :-- |
+| `field` | Required | String | The name of the field for which the maximum is computed. |
+| `missing` | Optional | Numeric | The value to assign to missing instances of the field. If not provided, documents containing missing values are omitted from the aggregation. |
+
+## Example
+
+The following example request finds the most expensive item---the item with the maximum value of `products.base_unit_price`---in the OpenSearch Dashboards e-commerce sample data:
 
 ```json
 GET opensearch_dashboards_sample_data_ecommerce/_search
 {
   "size": 0,
   "aggs": {
-    "max_taxful_total_price": {
+    "max_base_unit_price": {
       "max": {
-        "field": "taxful_total_price"
+        "field": "products.base_unit_price"
       }
     }
   }
@@ -28,11 +42,13 @@ GET opensearch_dashboards_sample_data_ecommerce/_search
 ```
 {% include copy-curl.html %}
 
-#### Example response
+## Example response
+
+As shown in the following example response, the aggregation returns the maximum value of `products.base_unit_price`:
 
 ```json
 {
-  "took": 17,
+  "took": 24,
   "timed_out": false,
   "_shards": {
     "total": 1,
@@ -49,9 +65,17 @@ GET opensearch_dashboards_sample_data_ecommerce/_search
     "hits": []
   },
   "aggregations": {
-    "max_taxful_total_price": {
-      "value": 2250
+    "max_base_unit_price": {
+      "value": 540
     }
   }
 }
-```
\ No newline at end of file
+```
+
+You can use the aggregation name (`max_base_unit_price`) as a key to retrieve the aggregation from the response.
+
+## Missing values
+
+You can assign a value to missing instances of the aggregated field. See [Missing aggregations]({{site.url}}{{site.baseurl}}/aggregations/bucket/missing/) for more information.
+
+Missing values are normally ignored by `max`. If you use `missing` to assign a value greater than any existing value, `max` returns this replacement value as the maximum value.
diff --git a/_aggregations/metric/median-absolute-deviation.md b/_aggregations/metric/median-absolute-deviation.md
index a8824751589..f1cf94ec910 100644
--- a/_aggregations/metric/median-absolute-deviation.md
+++ b/_aggregations/metric/median-absolute-deviation.md
@@ -9,70 +9,41 @@ redirect_from:
 
 # Median absolute deviation aggregations
 
-The `median_absolute_deviation` metric is a single-value metric aggregation that returns a median absolute deviation field. Median absolute deviation is a statistical measure of data variability. Because the median absolute deviation measures dispersion from the median, it provides a more robust measure of variability that is less affected by outliers in a dataset.
+The `median_absolute_deviation` aggregation is a single-value metric aggregation. Median absolute deviation is a variability metric that measures dispersion from the median.
 
-Median absolute deviation is calculated as follows:
-median_absolute_deviation = median(|Xi - Median(Xi)|)
+Median absolute deviation is less affected by outliers than standard deviation, which relies on squared error terms. This makes median absolute deviation useful for describing data that is not normally distributed.
 
-The following example calculates the median absolute deviation of the `DistanceMiles` field in the sample dataset `opensearch_dashboards_sample_data_flights`:
+Median absolute deviation is computed as follows:
 
-
-```json
-GET opensearch_dashboards_sample_data_flights/_search
-{
-  "size": 0,
-  "aggs": {
-    "median_absolute_deviation_DistanceMiles": {
-      "median_absolute_deviation": {
-        "field": "DistanceMiles"
-      }
-    }
-  }
-}
 ```
-{% include copy-curl.html %}
+median_absolute_deviation = median( | xi - median(xi) | )
+```
 
-#### Example response
-```json
-{
-  "took": 35,
-  "timed_out": false,
-  "_shards": {
-    "total": 1,
-    "successful": 1,
-    "skipped": 0,
-    "failed": 0
-  },
-  "hits": {
-    "total": {
-      "value": 10000,
-      "relation": "gte"
-    },
-    "max_score": null,
-    "hits": []
-  },
-  "aggregations": {
-    "median_absolute_deviation_distanceMiles": {
-      "value": 1829.8993624441966
-    }
-  }
-}
-```
+OpenSearch estimates `median_absolute_deviation`, rather than calculating it directly, because of memory limitations. This estimation is computationally expensive. You can adjust the trade-off between estimation accuracy and performance. For more information, see [Adjusting estimation accuracy](#adjusting-estimation-accuracy).
+
+## Parameters
+
+The `median_absolute_deviation` aggregation takes the following parameters.
 
-### Missing
 
+| Parameter | Required/Optional | Data type | Description |
+| :-- | :-- | :-- | :-- |
+| `field` | Required | String | The name of the numeric field for which the median absolute deviation is computed. |
+| `missing` | Optional | Numeric | The value to assign to missing instances of the field. If not provided, documents with missing values are omitted from the estimation. |
+| `compression` | Optional | Numeric | A parameter that [adjusts the balance between estimate accuracy and performance](#adjusting-estimation-accuracy). The value of `compression` must be greater than `0`. The default value is `1000`. |
 
-By default, if a field is missing or has a null value in a document, it is ignored during computation. 
However, you can specify a value to be used for those missing or null fields by using the `missing` parameter, as shown in the following request: +## Example + +The following example calculates the median absolute deviation of the `DistanceMiles` field in the `opensearch_dashboards_sample_data_flights` dataset: ```json GET opensearch_dashboards_sample_data_flights/_search { "size": 0, "aggs": { - "median_absolute_deviation_distanceMiles": { + "median_absolute_deviation_DistanceMiles": { "median_absolute_deviation": { - "field": "DistanceMiles", - "missing": 1000 + "field": "DistanceMiles" } } } @@ -82,9 +53,11 @@ GET opensearch_dashboards_sample_data_flights/_search #### Example response +As shown in the following example response, the aggregation returns an estimate of the median absolute deviation in the `median_absolute_deviation_DistanceMiles` variable: + ```json { - "took": 7, + "took": 490, "timed_out": false, "_shards": { "total": 1, @@ -101,16 +74,22 @@ GET opensearch_dashboards_sample_data_flights/_search "hits": [] }, "aggregations": { - "median_absolute_deviation_distanceMiles": { - "value": 1829.6443646143355 + "median_absolute_deviation_DistanceMiles": { + "value": 1830.917892238693 } } } ``` -### Compression +## Missing values + +OpenSearch ignores missing and null values when computing `median_absolute_deviation`. -The median absolute deviation is calculated using the [t-digest](https://github.com/tdunning/t-digest/tree/main) data structure, which balances between performance and estimation accuracy through the `compression` parameter (default value: `1000`). Adjusting the `compression` value affects the trade-off between computational efficiency and precision. Lower `compression` values improve performance but may reduce estimation accuracy, while higher values enhance accuracy at the cost of increased computational overhead, as shown in the following request: +You can assign a value to missing instances of the aggregated field. See [Missing aggregations]({{site.url}}{{site.baseurl}}/aggregations/bucket/missing/) for more information. + +## Adjusting estimation accuracy + +The median absolute deviation is calculated using the [t-digest](https://github.com/tdunning/t-digest/tree/main) data structure, which takes a `compression` parameter to balance performance and estimation accuracy. Lower values of `compression` improve performance but may reduce estimation accuracy, as shown in the following request: ```json GET opensearch_dashboards_sample_data_flights/_search @@ -128,7 +107,12 @@ GET opensearch_dashboards_sample_data_flights/_search ``` {% include copy-curl.html %} -#### Example response +The estimation error depends on the dataset but is usually below 5%, even for `compression` values as low as `100`. (The low example value of `10` is used here to illustrate the trade-off effect and is not recommended.) + +Note the decreased computation time (`took` time) and the slightly less accurate value of the estimated parameter in the following response. + +For reference, OpenSearch's best estimate (with `compression` set arbitrarily high) for the median absolute deviation of `DistanceMiles` is `1831.076904296875`: + ```json { diff --git a/_aggregations/metric/minimum.md b/_aggregations/metric/minimum.md index 9455c71fea0..144cbca4d8b 100644 --- a/_aggregations/metric/minimum.md +++ b/_aggregations/metric/minimum.md @@ -9,18 +9,32 @@ redirect_from: # Minimum aggregations -The `min` metric is a single-value metric aggregations that returns the minimum value of a field. 
+The `min` metric is a single-value metric that returns the minimum value of a field.
 
-The following example calculates the minimum of the `taxful_total_price` field:
+The `min` aggregation compares numeric fields using a `double` (double-precision) representation. Results should be considered approximate for fields containing `long` or `unsigned_long` integers with absolute values greater than 2^53 because the number of significant bits in a `double` mantissa is 53.
+{: .note}
+
+## Parameters
+
+The `min` aggregation takes the following parameters.
+
+| Parameter | Required/Optional | Data type | Description |
+| :-- | :-- | :-- | :-- |
+| `field` | Required | String | The name of the field for which the minimum is computed. |
+| `missing` | Optional | Numeric | The value to assign to missing instances of the field. If not provided, documents containing missing values are omitted from the aggregation. |
+
+## Example
+
+The following example request finds the least expensive item---the item with the minimum value of `products.base_unit_price`---in the OpenSearch Dashboards e-commerce sample data:
 
 ```json
 GET opensearch_dashboards_sample_data_ecommerce/_search
 {
   "size": 0,
   "aggs": {
-    "min_taxful_total_price": {
+    "min_base_unit_price": {
       "min": {
-        "field": "taxful_total_price"
+        "field": "products.base_unit_price"
       }
     }
   }
@@ -28,11 +42,13 @@ GET opensearch_dashboards_sample_data_ecommerce/_search
 ```
 {% include copy-curl.html %}
 
-#### Example response
+## Example response
+
+As shown in the following example response, the aggregation returns the minimum value of `products.base_unit_price`:
 
 ```json
 {
-  "took": 13,
+  "took": 15,
   "timed_out": false,
   "_shards": {
     "total": 1,
@@ -49,9 +65,17 @@ GET opensearch_dashboards_sample_data_ecommerce/_search
     "hits": []
   },
   "aggregations": {
-    "min_taxful_total_price": {
-      "value": 6.98828125
+    "min_base_unit_price": {
+      "value": 5.98828125
     }
   }
 }
-```
\ No newline at end of file
+```
+
+You can use the aggregation name (`min_base_unit_price`) as a key to retrieve the aggregation from the response.
+
+## Missing values
+
+You can assign a value to missing instances of the aggregated field. See [Missing aggregations]({{site.url}}{{site.baseurl}}/aggregations/bucket/missing/) for more information.
+
+Missing values are normally ignored by `min`. If you use `missing` to assign a value lower than any existing value, `min` returns this replacement value as the minimum value.
diff --git a/_aggregations/metric/percentile-ranks.md b/_aggregations/metric/percentile-ranks.md
index 660cb01bd10..44562187b3f 100644
--- a/_aggregations/metric/percentile-ranks.md
+++ b/_aggregations/metric/percentile-ranks.md
@@ -43,4 +43,30 @@ GET opensearch_dashboards_sample_data_ecommerce/_search
     }
   }
 }
-```
\ No newline at end of file
+```
+
+This response indicates that the value `10` is at the `5.5`th percentile and the value `15` is at the `8.3`rd percentile.
+
+As with the `percentiles` aggregation, you can control the level of approximation by setting the optional `tdigest.compression` field. A larger value increases the precision of the approximation but uses more heap space. The default value is 100.
+
+For example, use the following request to set `compression` to `200`:
+
+```json
+GET opensearch_dashboards_sample_data_ecommerce/_search
+{
+  "size": 0,
+  "aggs": {
+    "percentile_rank_taxful_total_price": {
+      "percentile_ranks": {
+        "field": "taxful_total_price",
+        "values": [
+          10,
+          15
+        ],
+        "tdigest": {
+          "compression": 200
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
\ No newline at end of file
diff --git a/_aggregations/metric/percentile.md b/_aggregations/metric/percentile.md
index 0f241306d14..d9168e4539b 100644
--- a/_aggregations/metric/percentile.md
+++ b/_aggregations/metric/percentile.md
@@ -51,3 +51,44 @@ GET opensearch_dashboards_sample_data_ecommerce/_search
     }
   }
 }
 ```
+
+You can control the level of approximation using the optional `tdigest.compression` field. A larger value makes the data structure that approximates percentiles more accurate but uses more heap space. The default value is 100.
+
+For example, use the following request to set `compression` to `200`:
+
+```json
+GET opensearch_dashboards_sample_data_ecommerce/_search
+{
+  "size": 0,
+  "aggs": {
+    "percentile_taxful_total_price": {
+      "percentiles": {
+        "field": "taxful_total_price",
+        "tdigest": {
+          "compression": 200
+        }
+      }
+    }
+  }
+}
+```
+
+The default percentiles returned are `1, 5, 25, 50, 75, 95, 99`. You can specify other percentiles in the optional `percents` field. For example, to get the 99.9th and 99.99th percentiles, run the following request:
+
+```json
+GET opensearch_dashboards_sample_data_ecommerce/_search
+{
+  "size": 0,
+  "aggs": {
+    "percentile_taxful_total_price": {
+      "percentiles": {
+        "field": "taxful_total_price",
+        "percents": [99.9, 99.99]
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+The specified value overrides the default percentiles, so only the percentiles you specify are returned.
diff --git a/_aggregations/metric/weighted-avg.md b/_aggregations/metric/weighted-avg.md
index 6f67939d6e3..2b68a0e45ee 100644
--- a/_aggregations/metric/weighted-avg.md
+++ b/_aggregations/metric/weighted-avg.md
@@ -3,67 +3,50 @@ layout: default
 title: Weighted average
 parent: Metric aggregations
 nav_order: 150
+has_math: true
 ---
 
-# Weighted average
+# Weighted average aggregations
 
 The `weighted_avg` aggregation calculates the weighted average of numeric values across documents. This is useful when you want to calculate an average but weight some data points more heavily than others.
 
-## Weighted average calculation
-
-The weighted average is calculated as `(sum of value * weight) / (sum of weights)`.
+The weighted average is calculated using the formula $$ \frac{\sum_{i=1}^n \text{value}_i \cdot \text{weight}_i}{\sum_{i=1}^n \text{weight}_i} $$.
 
 ## Parameters
 
-When using the `weighted_avg` aggregation, you must define the following parameters:
-
-- `value`: The field or script used to obtain the average numeric values
-- `weight`: The field or script used to obtain the weight for each value
-
-Optionally, you can specify the following parameters:
+The `weighted_avg` aggregation takes the following parameters.
 
-- `format`: A numeric format to apply to the output value
-- `value_type`: A type hint for the values when using scripts or unmapped fields
+| Parameter | Required/Optional | Description |
+|---------------|----------|-------------|
+| `value` | Required | Defines how to obtain the numeric values to average. Requires a `field` or `script`. |
+| `weight` | Required | Defines how to obtain the weight for each value. Requires a `field` or `script`. |
+| `format` | Optional | A [DecimalFormat](https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/text/DecimalFormat.html) formatting string. Returns the formatted output in the aggregation's `value_as_string` property. |
+| `value_type` | Optional | A type hint for the values when using scripts or unmapped fields. |
 
-For the value or weight, you can specify the following parameters:
+You can specify the following parameters within `value` or `weight`.
 
-- `field`: The document field to use
-- `missing`: A value or weight to use if the field is missing
+| Parameter | Required/Optional | Description |
+|------------|----------|-------------|
+| `field` | Optional | The document field to use for the value or weight. |
+| `missing` | Optional | A default value or weight to use when the field is missing. See [Missing values](#missing-values). |
 
-## Using the aggregation
+## Example
 
-Follow these steps to use the `weighted_avg` aggregation:
-
-**1. Create an index and index some documents**
+First, create an index and index some data. Notice that Product C is missing the `rating` and `num_reviews` fields:
 
 ```json
-PUT /products
-
-POST /products/_doc/1
-{
-  "name": "Product A",
-  "rating": 4,
-  "num_reviews": 100
-}
-
-POST /products/_doc/2
-{
-  "name": "Product B",
-  "rating": 5,
-  "num_reviews": 20
-}
-
-POST /products/_doc/3
-{
-  "name": "Product C",
-  "rating": 3,
-  "num_reviews": 50
-}
+POST _bulk
+{ "index": { "_index": "products" } }
+{ "name": "Product A", "rating": 4.5, "num_reviews": 100 }
+{ "index": { "_index": "products" } }
+{ "name": "Product B", "rating": 3.8, "num_reviews": 50 }
+{ "index": { "_index": "products" } }
+{ "name": "Product C" }
 ```
 {% include copy-curl.html %}
 
-**2. Run the `weighted_avg` aggregation**
+The following request uses the `weighted_avg` aggregation to calculate a weighted average product rating. In this context, each product's rating is weighted by its `num_reviews`. This means that products with more reviews have a greater influence on the final average than those with fewer reviews:
 
 ```json
 GET /products/_search
@@ -77,7 +60,8 @@ GET /products/_search
         },
         "weight": {
           "field": "num_reviews"
-        }
+        },
+        "format": "#.##"
       }
     }
   }
@@ -85,45 +69,43 @@ GET /products/_search
 ```
 {% include copy-curl.html %}
 
-## Handling missing values
-
-The `missing` parameter allows you to specify default values for documents missing the `value` field or the `weight` field instead of excluding them from the calculation.
+## Example response
 
-The following is an example of this behavior. First, create an index and add sample documents. This example includes five documents with different combinations of missing values for the `rating` and `num_reviews` fields:
+The response contains the `weighted_rating`, calculated as `weighted_avg = (4.5 * 100 + 3.8 * 50) / (100 + 50) = 4.27`. 
Only documents 1 and 2, which contain values for both `rating` and `num_reviews`, are considered: ```json -PUT /products { - "mappings": { - "properties": { - "name": { - "type": "text" - }, - "rating": { - "type": "double" - }, - "num_reviews": { - "type": "integer" - } + "took": 18, + "timed_out": false, + "terminated_early": true, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "weighted_rating": { + "value": 4.266666650772095, + "value_as_string": "4.27" } } } - -POST /_bulk -{ "index": { "_index": "products" } } -{ "name": "Product A", "rating": 4.5, "num_reviews": 100 } -{ "index": { "_index": "products" } } -{ "name": "Product B", "rating": 3.8, "num_reviews": 50 } -{ "index": { "_index": "products" } } -{ "name": "Product C", "rating": null, "num_reviews": 20 } -{ "index": { "_index": "products" } } -{ "name": "Product D", "rating": 4.2, "num_reviews": null } -{ "index": { "_index": "products" } } -{ "name": "Product E", "rating": null, "num_reviews": null } ``` -{% include copy-curl.html %} -Next, run the following `weighted_avg` aggregation: +## Missing values + +The `missing` parameter allows you to specify default values for documents missing the `value` field or the `weight` field instead of excluding them from the calculation. + +For example, you can assign products without ratings an "average" rating of 3.0 and set the `num_reviews` to 1 to give them a small non-zero weight: ```json GET /products/_search @@ -133,11 +115,14 @@ GET /products/_search "weighted_rating": { "weighted_avg": { "value": { - "field": "rating" + "field": "rating", + "missing": 3.0 }, "weight": { - "field": "num_reviews" - } + "field": "num_reviews", + "missing": 1 + }, + "format": "#.##" } } } @@ -145,4 +130,32 @@ GET /products/_search ``` {% include copy-curl.html %} -In the response, you can see that the missing values for `Product E` were completely ignored in the calculation. +The new weighted average is calculated as `weighted_avg = (4.5 * 100 + 3.8 * 50 + 3.0 * 1) / (100 + 50 + 1) = 4.26`: + +```json +{ + "took": 27, + "timed_out": false, + "terminated_early": true, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "weighted_rating": { + "value": 4.258278129906055, + "value_as_string": "4.26" + } + } +} +``` diff --git a/_aggregations/pipeline-agg.md b/_aggregations/pipeline-agg.md deleted file mode 100644 index 30fb0ecf01e..00000000000 --- a/_aggregations/pipeline-agg.md +++ /dev/null @@ -1,1260 +0,0 @@ ---- -layout: default -title: Pipeline aggregations -nav_order: 5 -has_children: false -redirect_from: - - /opensearch/pipeline-agg/ - - /query-dsl/aggregations/pipeline-agg/ ---- - -# Pipeline aggregations - -With pipeline aggregations, you can chain aggregations by piping the results of one aggregation as an input to another for a more nuanced output. - -You can use pipeline aggregations to compute complex statistical and mathematical measures like derivatives, moving averages, cumulative sums, and so on. - -## Pipeline aggregation syntax - -A pipeline aggregation uses the `buckets_path` property to access the results of other aggregations. 
-The `buckets_path` property has a specific syntax: - -``` -buckets_path = [,]*[, ]; -``` - -where: - -- `AGG_NAME` is the name of the aggregation. -- `AGG_SEPARATOR` separates aggregations. It's represented as `>`. -- `METRIC_SEPARATOR` separates aggregations from its metrics. It's represented as `.`. -- `METRIC` is the name of the metric, in case of multi-value metric aggregations. - -For example, `my_sum.sum` selects the `sum` metric of an aggregation called `my_sum`. `popular_tags>my_sum.sum` nests `my_sum.sum` into the `popular_tags` aggregation. - -You can also specify the following additional parameters: - -- `gap_policy`: Real-world data can contain gaps or null values. You can specify the policy to deal with such missing data with the `gap_policy` property. You can either set the `gap_policy` property to `skip` to skip the missing data and continue from the next available value, or `insert_zeros` to replace the missing values with zero and continue running. -- `format`: The type of format for the output value. For example, `yyyy-MM-dd` for a date value. - -## Quick example - -To sum all the buckets returned by the `sum_total_memory` aggregation: - -```json -GET opensearch_dashboards_sample_data_logs/_search -{ - "size": 0, - "aggs": { - "number_of_bytes": { - "histogram": { - "field": "bytes", - "interval": 10000 - }, - "aggs": { - "sum_total_memory": { - "sum": { - "field": "phpmemory" - } - } - } - }, - "sum_copies": { - "sum_bucket": { - "buckets_path": "number_of_bytes>sum_total_memory" - } - } - } -} -``` - -#### Example response - -```json -... -"aggregations" : { - "number_of_bytes" : { - "buckets" : [ - { - "key" : 0.0, - "doc_count" : 13372, - "sum_total_memory" : { - "value" : 9.12664E7 - } - }, - { - "key" : 10000.0, - "doc_count" : 702, - "sum_total_memory" : { - "value" : 0.0 - } - } - ] - }, - "sum_copies" : { - "value" : 9.12664E7 - } - } -} -``` - -## Types of pipeline aggregations - -Pipeline aggregations are of two types: - -### Sibling aggregations - -Sibling aggregations take the output of a nested aggregation and produce new buckets or new aggregations at the same level as the nested buckets. - -Sibling aggregations must be a multi-bucket aggregation (have multiple grouped values for a certain field) and the metric must be a numeric value. - -`min_bucket`, `max_bucket`, `sum_bucket`, and `avg_bucket` are common sibling aggregations. - -### Parent aggregations - -Parent aggregations take the output of an outer aggregation and produce new buckets or new aggregations at the same level as the existing buckets. - -Parent aggregations must have `min_doc_count` set to 0 (default for `histogram` aggregations) and the specified metric must be a numeric value. If `min_doc_count` is greater than `0`, some buckets are omitted, which might lead to incorrect results. - -`derivatives` and `cumulative_sum` are common parent aggregations. - -## avg_bucket, sum_bucket, min_bucket, max_bucket - -The `avg_bucket`, `sum_bucket`, `min_bucket`, and `max_bucket` aggregations are sibling aggregations that calculate the average, sum, minimum, and maximum values of a metric in each bucket of a previous aggregation. - -The following example creates a date histogram with a one-month interval. The `sum` sub-aggregation calculates the sum of all bytes for each month. 
Finally, the `avg_bucket` aggregation uses this sum to calculate the average number of bytes per month: - -```json -POST opensearch_dashboards_sample_data_logs/_search -{ - "size": 0, - "aggs": { - "visits_per_month": { - "date_histogram": { - "field": "@timestamp", - "interval": "month" - }, - "aggs": { - "sum_of_bytes": { - "sum": { - "field": "bytes" - } - } - } - }, - "avg_monthly_bytes": { - "avg_bucket": { - "buckets_path": "visits_per_month>sum_of_bytes" - } - } - } -} -``` - -#### Example response - -```json -... -"aggregations" : { - "visits_per_month" : { - "buckets" : [ - { - "key_as_string" : "2020-10-01T00:00:00.000Z", - "key" : 1601510400000, - "doc_count" : 1635, - "sum_of_bytes" : { - "value" : 9400200.0 - } - }, - { - "key_as_string" : "2020-11-01T00:00:00.000Z", - "key" : 1604188800000, - "doc_count" : 6844, - "sum_of_bytes" : { - "value" : 3.8880434E7 - } - }, - { - "key_as_string" : "2020-12-01T00:00:00.000Z", - "key" : 1606780800000, - "doc_count" : 5595, - "sum_of_bytes" : { - "value" : 3.1445055E7 - } - } - ] - }, - "avg_monthly_bytes" : { - "value" : 2.6575229666666668E7 - } - } -} -``` - -In a similar fashion, you can calculate the `sum_bucket`, `min_bucket`, and `max_bucket` values for the bytes per month. - -## stats_bucket, extended_stats_bucket - -The `stats_bucket` aggregation is a sibling aggregation that returns a variety of stats (`count`, `min`, `max`, `avg`, and `sum`) for the buckets of a previous aggregation. - -The following example returns the basic stats for the buckets returned by the `sum_of_bytes` aggregation nested into the `visits_per_month` aggregation: - -```json -GET opensearch_dashboards_sample_data_logs/_search -{ - "size": 0, - "aggs": { - "visits_per_month": { - "date_histogram": { - "field": "@timestamp", - "interval": "month" - }, - "aggs": { - "sum_of_bytes": { - "sum": { - "field": "bytes" - } - } - } - }, - "stats_monthly_bytes": { - "stats_bucket": { - "buckets_path": "visits_per_month>sum_of_bytes" - } - } - } -} -``` - -#### Example response - -```json -... -"stats_monthly_bytes" : { - "count" : 3, - "min" : 9400200.0, - "max" : 3.8880434E7, - "avg" : 2.6575229666666668E7, - "sum" : 7.9725689E7 - } - } -} -``` - -The `extended_stats` aggregation is an extended version of the `stats` aggregation. Apart from including basic stats, `extended_stats` also provides stats such as `sum_of_squares`, `variance`, and `std_deviation`. - -#### Example response - -```json -"stats_monthly_visits" : { - "count" : 3, - "min" : 9400200.0, - "max" : 3.8880434E7, - "avg" : 2.6575229666666668E7, - "sum" : 7.9725689E7, - "sum_of_squares" : 2.588843392021381E15, - "variance" : 1.5670496550438025E14, - "variance_population" : 1.5670496550438025E14, - "variance_sampling" : 2.3505744825657038E14, - "std_deviation" : 1.251818539183616E7, - "std_deviation_population" : 1.251818539183616E7, - "std_deviation_sampling" : 1.5331583357780447E7, - "std_deviation_bounds" : { - "upper" : 5.161160045033899E7, - "lower" : 1538858.8829943463, - "upper_population" : 5.161160045033899E7, - "lower_population" : 1538858.8829943463, - "upper_sampling" : 5.723839638222756E7, - "lower_sampling" : -4087937.0488942266 - } - } - } -} -``` - -## bucket_script, bucket_selector - -The `bucket_script` aggregation is a parent aggregation that executes a script to perform per-bucket calculations of a previous aggregation. Make sure the metrics are of numeric type and the returned values are also numeric. - -Use the `script` parameter to add your script. 
The script can be inline, in a file, or in an index. To enable inline scripting, add the following line to your `opensearch.yml` file in the `config` folder: - -```yaml -script.inline: on -``` - -The `buckets_path` property consists of multiple entries. Each entry is a key and a value. The key is the name of the value that you can use in the script. - -The basic syntax is: - -```json -{ - "bucket_script": { - "buckets_path": { - "my_var1": "the_sum", - "my_var2": "the_value_count" - }, - "script": "params.my_var1 / params.my_var2" - } -} -``` - -The following example uses the `sum` aggregation on the buckets generated by a date histogram. From the resultant buckets values, the percentage of RAM is calculated in an interval of 10,000 bytes in the context of a zip extension: - -```json -GET opensearch_dashboards_sample_data_logs/_search -{ - "size": 0, - "aggs": { - "sales_per_month": { - "histogram": { - "field": "bytes", - "interval": "10000" - }, - "aggs": { - "total_ram": { - "sum": { - "field": "machine.ram" - } - }, - "ext-type": { - "filter": { - "term": { - "extension.keyword": "zip" - } - }, - "aggs": { - "total_ram": { - "sum": { - "field": "machine.ram" - } - } - } - }, - "ram-percentage": { - "bucket_script": { - "buckets_path": { - "machineRam": "ext-type>total_ram", - "totalRam": "total_ram" - }, - "script": "params.machineRam / params.totalRam" - } - } - } - } - } -} -``` - -#### Example response - -```json -"aggregations" : { - "sales_per_month" : { - "buckets" : [ - { - "key" : 0.0, - "doc_count" : 13372, - "os-type" : { - "doc_count" : 1558, - "total_ram" : { - "value" : 2.0090783268864E13 - } - }, - "total_ram" : { - "value" : 1.7214228922368E14 - }, - "ram-percentage" : { - "value" : 0.11671032934131736 - } - }, - { - "key" : 10000.0, - "doc_count" : 702, - "os-type" : { - "doc_count" : 116, - "total_ram" : { - "value" : 1.622423896064E12 - } - }, - "total_ram" : { - "value" : 9.015136354304E12 - }, - "ram-percentage" : { - "value" : 0.17996665078608862 - } - } - ] - } - } -} -``` - -The RAM percentage is calculated and appended at the end of each bucket. - -The `bucket_selector` aggregation is a script-based aggregation that selects buckets returned by a `histogram` (or `date_histogram`) aggregation. Use it in scenarios where you don’t want certain buckets in the output based on conditions supplied by you. - -The `bucket_selector` aggregation executes a script to decide if a bucket stays in the parent multi-bucket aggregation. - -The basic syntax is: - -```json -{ - "bucket_selector": { - "buckets_path": { - "my_var1": "the_sum", - "my_var2": "the_value_count" - }, - "script": "params.my_var1 / params.my_var2" - } -} -``` - -The following example calculates the sum of bytes and then evaluates if this sum is greater than 20,000. If true, then the bucket is retained in the bucket list. Otherwise, it’s deleted from the final output. 
- -```json -GET opensearch_dashboards_sample_data_logs/_search -{ - "size": 0, - "aggs": { - "bytes_per_month": { - "date_histogram": { - "field": "@timestamp", - "calendar_interval": "month" - }, - "aggs": { - "total_bytes": { - "sum": { - "field": "bytes" - } - }, - "bytes_bucket_filter": { - "bucket_selector": { - "buckets_path": { - "totalBytes": "total_bytes" - }, - "script": "params.totalBytes > 20000" - } - } - } - } - } -} -``` - -#### Example response - -```json -"aggregations" : { - "bytes_per_month" : { - "buckets" : [ - { - "key_as_string" : "2020-10-01T00:00:00.000Z", - "key" : 1601510400000, - "doc_count" : 1635, - "total_bytes" : { - "value" : 9400200.0 - } - }, - { - "key_as_string" : "2020-11-01T00:00:00.000Z", - "key" : 1604188800000, - "doc_count" : 6844, - "total_bytes" : { - "value" : 3.8880434E7 - } - }, - { - "key_as_string" : "2020-12-01T00:00:00.000Z", - "key" : 1606780800000, - "doc_count" : 5595, - "total_bytes" : { - "value" : 3.1445055E7 - } - } - ] - } - } -} -``` - -## bucket_sort - -The `bucket_sort` aggregation is a parent aggregation that sorts buckets of a previous aggregation. - -You can specify several sort fields together with the corresponding sort order. Additionally, you can sort each bucket based on its key, count, or its sub-aggregations. You can also truncate the buckets by setting `from` and `size` parameters. - -Syntax - -```json -{ - "bucket_sort": { - "sort": [ - {"sort_field_1": {"order": "asc"}}, - {"sort_field_2": {"order": "desc"}}, - "sort_field_3" - ], - "from":1, - "size":3 - } -} -``` - -The following example sorts the buckets of a `date_histogram` aggregation based on the computed `total_sum` values. We sort the buckets in descending order so that the buckets with the highest number of bytes are returned first. - -```json -GET opensearch_dashboards_sample_data_logs/_search -{ - "size": 0, - "aggs": { - "sales_per_month": { - "date_histogram": { - "field": "@timestamp", - "calendar_interval": "month" - }, - "aggs": { - "total_bytes": { - "sum": { - "field": "bytes" - } - }, - "bytes_bucket_sort": { - "bucket_sort": { - "sort": [ - { "total_bytes": { "order": "desc" } } - ], - "size": 3 - } - } - } - } - } -} -``` - -#### Example response - -```json -"aggregations" : { - "sales_per_month" : { - "buckets" : [ - { - "key_as_string" : "2020-11-01T00:00:00.000Z", - "key" : 1604188800000, - "doc_count" : 6844, - "total_bytes" : { - "value" : 3.8880434E7 - } - }, - { - "key_as_string" : "2020-12-01T00:00:00.000Z", - "key" : 1606780800000, - "doc_count" : 5595, - "total_bytes" : { - "value" : 3.1445055E7 - } - }, - { - "key_as_string" : "2020-10-01T00:00:00.000Z", - "key" : 1601510400000, - "doc_count" : 1635, - "total_bytes" : { - "value" : 9400200.0 - } - } - ] - } - } -} -``` - -You can also use this aggregation to truncate the resulting buckets without sorting. For this, just use the `from` and/or `size` parameters without `sort`. - -## cumulative_sum - -The `cumulative_sum` aggregation is a parent aggregation that calculates the cumulative sum of each bucket of a previous aggregation. - -A cumulative sum is a sequence of partial sums of a given sequence. For example, the cumulative sums of the sequence `{a,b,c,…}` are `a`, `a+b`, `a+b+c`, and so on. You can use the cumulative sum to visualize the rate of change of a field over time. 
- -The following example calculates the cumulative number of bytes over a monthly basis: - -```json -GET opensearch_dashboards_sample_data_logs/_search -{ - "size": 0, - "aggs": { - "sales_per_month": { - "date_histogram": { - "field": "@timestamp", - "calendar_interval": "month" - }, - "aggs": { - "no-of-bytes": { - "sum": { - "field": "bytes" - } - }, - "cumulative_bytes": { - "cumulative_sum": { - "buckets_path": "no-of-bytes" - } - } - } - } - } -} -``` - -#### Example response - -```json -... -"aggregations" : { - "sales_per_month" : { - "buckets" : [ - { - "key_as_string" : "2020-10-01T00:00:00.000Z", - "key" : 1601510400000, - "doc_count" : 1635, - "no-of-bytes" : { - "value" : 9400200.0 - }, - "cumulative_bytes" : { - "value" : 9400200.0 - } - }, - { - "key_as_string" : "2020-11-01T00:00:00.000Z", - "key" : 1604188800000, - "doc_count" : 6844, - "no-of-bytes" : { - "value" : 3.8880434E7 - }, - "cumulative_bytes" : { - "value" : 4.8280634E7 - } - }, - { - "key_as_string" : "2020-12-01T00:00:00.000Z", - "key" : 1606780800000, - "doc_count" : 5595, - "no-of-bytes" : { - "value" : 3.1445055E7 - }, - "cumulative_bytes" : { - "value" : 7.9725689E7 - } - } - ] - } - } -} -``` - -## derivative - -The `derivative` aggregation is a parent aggregation that calculates 1st order and 2nd order derivatives of each bucket of a previous aggregation. - -In mathematics, the derivative of a function measures its sensitivity to change. In other words, a derivative evaluates the rate of change in some function with respect to some variable. To learn more about derivatives, see [Wikipedia](https://en.wikipedia.org/wiki/Derivative). - -You can use derivatives to calculate the rate of change of numeric values compared to its previous time periods. - -The 1st order derivative indicates whether a metric is increasing or decreasing, and by how much it's increasing or decreasing. - -The following example calculates the 1st order derivative for the sum of bytes per month. The 1st order derivative is the difference between the number of bytes in the current month and the previous month: - -```json -GET opensearch_dashboards_sample_data_logs/_search -{ - "size": 0, - "aggs": { - "sales_per_month": { - "date_histogram": { - "field": "@timestamp", - "calendar_interval": "month" - }, - "aggs": { - "number_of_bytes": { - "sum": { - "field": "bytes" - } - }, - "bytes_deriv": { - "derivative": { - "buckets_path": "number_of_bytes" - } - } - } - } - } -} -``` - -#### Example response - -```json -... -"aggregations" : { - "sales_per_month" : { - "buckets" : [ - { - "key_as_string" : "2020-10-01T00:00:00.000Z", - "key" : 1601510400000, - "doc_count" : 1635, - "number_of_bytes" : { - "value" : 9400200.0 - } - }, - { - "key_as_string" : "2020-11-01T00:00:00.000Z", - "key" : 1604188800000, - "doc_count" : 6844, - "number_of_bytes" : { - "value" : 3.8880434E7 - }, - "bytes_deriv" : { - "value" : 2.9480234E7 - } - }, - { - "key_as_string" : "2020-12-01T00:00:00.000Z", - "key" : 1606780800000, - "doc_count" : 5595, - "number_of_bytes" : { - "value" : 3.1445055E7 - }, - "bytes_deriv" : { - "value" : -7435379.0 - } - } - ] - } - } -} -``` - -The 2nd order derivative is a double derivative or a derivative of the derivative. -It indicates how the rate of change of a quantity is itself changing. It’s the difference between the 1st order derivatives of adjacent buckets. 
- -To calculate a 2nd order derivative, chain one derivative aggregation to another: - -```json -GET opensearch_dashboards_sample_data_logs/_search -{ - "size": 0, - "aggs": { - "sales_per_month": { - "date_histogram": { - "field": "@timestamp", - "calendar_interval": "month" - }, - "aggs": { - "number_of_bytes": { - "sum": { - "field": "bytes" - } - }, - "bytes_deriv": { - "derivative": { - "buckets_path": "number_of_bytes" - } - }, - "bytes_2nd_deriv": { - "derivative": { - "buckets_path": "bytes_deriv" - } - } - } - } - } -} -``` - -#### Example response - -```json -... -"aggregations" : { - "sales_per_month" : { - "buckets" : [ - { - "key_as_string" : "2020-10-01T00:00:00.000Z", - "key" : 1601510400000, - "doc_count" : 1635, - "number_of_bytes" : { - "value" : 9400200.0 - } - }, - { - "key_as_string" : "2020-11-01T00:00:00.000Z", - "key" : 1604188800000, - "doc_count" : 6844, - "number_of_bytes" : { - "value" : 3.8880434E7 - }, - "bytes_deriv" : { - "value" : 2.9480234E7 - } - }, - { - "key_as_string" : "2020-12-01T00:00:00.000Z", - "key" : 1606780800000, - "doc_count" : 5595, - "number_of_bytes" : { - "value" : 3.1445055E7 - }, - "bytes_deriv" : { - "value" : -7435379.0 - }, - "bytes_2nd_deriv" : { - "value" : -3.6915613E7 - } - } - ] - } - } -} -``` - -The first bucket doesn't have a 1st order derivate as a derivate needs at least two points for comparison. The first and second buckets don't have a 2nd order derivate because a 2nd order derivate needs at least two data points from the 1st order derivative. - -The 1st order derivative for the "2020-11-01" bucket is 2.9480234E7 and the "2020-12-01" bucket is -7435379. So, the 2nd order derivative of the “2020-12-01” bucket is -3.6915613E7 (-7435379-2.9480234E7). - -Theoretically, you could continue chaining derivate aggregations to calculate the third, the fourth, and even higher-order derivatives. That would, however, provide little to no value for most datasets. - -## moving_avg - -A `moving_avg` aggregation is a parent aggregation that calculates the moving average metric. - -The `moving_avg` aggregation finds the series of averages of different windows (subsets) of a dataset. A window’s size represents the number of data points covered by the window on each iteration (specified by the `window` property and set to 5 by default). On each iteration, the algorithm calculates the average for all data points that fit into the window and then slides forward by excluding the first member of the previous window and including the first member from the next window. - -For example, given the data `[1, 5, 8, 23, 34, 28, 7, 23, 20, 19]`, you can calculate a simple moving average with a window’s size of 5 as follows: - -``` -(1 + 5 + 8 + 23 + 34) / 5 = 14.2 -(5 + 8 + 23 + 34+ 28) / 5 = 19.6 -(8 + 23 + 34 + 28 + 7) / 5 = 20 -so on... -``` - -For more information, see [Wikipedia](https://en.wikipedia.org/wiki/Moving_average). - -You can use the `moving_avg` aggregation to either smoothen out short-term fluctuations or to highlight longer-term trends or cycles in your time-series data. - -Specify a small window size (for example, `window`: 10) that closely follows the data to smoothen out small-scale fluctuations. -Alternatively, specify a larger window size (for example, `window`: 100) that lags behind the actual data by a substantial amount to smoothen out all higher-frequency fluctuations or random noise, making lower frequency trends more visible. 
- -The following example nests a `moving_avg` aggregation into a `date_histogram` aggregation: - -```json -GET opensearch_dashboards_sample_data_logs/_search -{ - "size": 0, - "aggs": { - "my_date_histogram": { - "date_histogram": { - "field": "@timestamp", - "calendar_interval": "month" - }, - "aggs": { - "sum_of_bytes": { - "sum": { "field": "bytes" } - }, - "moving_avg_of_sum_of_bytes": { - "moving_avg": { "buckets_path": "sum_of_bytes" } - } - } - } - } -} -``` - -#### Example response - -```json -... -"aggregations" : { - "my_date_histogram" : { - "buckets" : [ - { - "key_as_string" : "2020-10-01T00:00:00.000Z", - "key" : 1601510400000, - "doc_count" : 1635, - "sum_of_bytes" : { - "value" : 9400200.0 - } - }, - { - "key_as_string" : "2020-11-01T00:00:00.000Z", - "key" : 1604188800000, - "doc_count" : 6844, - "sum_of_bytes" : { - "value" : 3.8880434E7 - }, - "moving_avg_of_sum_of_bytes" : { - "value" : 9400200.0 - } - }, - { - "key_as_string" : "2020-12-01T00:00:00.000Z", - "key" : 1606780800000, - "doc_count" : 5595, - "sum_of_bytes" : { - "value" : 3.1445055E7 - }, - "moving_avg_of_sum_of_bytes" : { - "value" : 2.4140317E7 - } - } - ] - } - } -} -``` - -You can also use the `moving_avg` aggregation to predict future buckets. -To predict buckets, add the `predict` property and set it to the number of predictions that you want to see. - -The following example adds five predictions to the preceding query: - -```json -GET opensearch_dashboards_sample_data_logs/_search -{ - "size": 0, - "aggs": { - "my_date_histogram": { - "date_histogram": { - "field": "@timestamp", - "calendar_interval": "month" - }, - "aggs": { - "sum_of_bytes": { - "sum": { - "field": "bytes" - } - }, - "moving_avg_of_sum_of_bytes": { - "moving_avg": { - "buckets_path": "sum_of_bytes", - "predict": 5 - } - } - } - } - } -} -``` - -#### Example response - -```json -"aggregations" : { - "my_date_histogram" : { - "buckets" : [ - { - "key_as_string" : "2020-10-01T00:00:00.000Z", - "key" : 1601510400000, - "doc_count" : 1635, - "sum_of_bytes" : { - "value" : 9400200.0 - } - }, - { - "key_as_string" : "2020-11-01T00:00:00.000Z", - "key" : 1604188800000, - "doc_count" : 6844, - "sum_of_bytes" : { - "value" : 3.8880434E7 - }, - "moving_avg_of_sum_of_bytes" : { - "value" : 9400200.0 - } - }, - { - "key_as_string" : "2020-12-01T00:00:00.000Z", - "key" : 1606780800000, - "doc_count" : 5595, - "sum_of_bytes" : { - "value" : 3.1445055E7 - }, - "moving_avg_of_sum_of_bytes" : { - "value" : 2.4140317E7 - } - }, - { - "key_as_string" : "2021-01-01T00:00:00.000Z", - "key" : 1609459200000, - "doc_count" : 0, - "moving_avg_of_sum_of_bytes" : { - "value" : 2.6575229666666668E7 - } - }, - { - "key_as_string" : "2021-02-01T00:00:00.000Z", - "key" : 1612137600000, - "doc_count" : 0, - "moving_avg_of_sum_of_bytes" : { - "value" : 2.6575229666666668E7 - } - }, - { - "key_as_string" : "2021-03-01T00:00:00.000Z", - "key" : 1614556800000, - "doc_count" : 0, - "moving_avg_of_sum_of_bytes" : { - "value" : 2.6575229666666668E7 - } - }, - { - "key_as_string" : "2021-04-01T00:00:00.000Z", - "key" : 1617235200000, - "doc_count" : 0, - "moving_avg_of_sum_of_bytes" : { - "value" : 2.6575229666666668E7 - } - }, - { - "key_as_string" : "2021-05-01T00:00:00.000Z", - "key" : 1619827200000, - "doc_count" : 0, - "moving_avg_of_sum_of_bytes" : { - "value" : 2.6575229666666668E7 - } - } - ] - } - } -} -``` - -The `moving_avg` aggregation supports five models — `simple`, `linear`, `exponentially weighted`, `holt-linear`, and `holt-winters`. 
These models differ in how the values of the window are weighted. As data points become "older" (i.e., the window slides away from them), they might be weighted differently. You can specify a model of your choice by setting the `model` property. The `model` property holds the name of the model and the `settings` object, which you can use to provide model properties. For more information on these models, see [Wikipedia](https://en.wikipedia.org/wiki/Moving_average). - -A `simple` model first calculates the sum of all data points in the window, and then divides that sum by the size of the window. In other words, a `simple` model calculates a simple arithmetic mean for each window in your dataset. - -The following example uses a simple model with a window size of 30: - -```json -GET opensearch_dashboards_sample_data_logs/_search -{ - "size": 0, - "aggs": { - "my_date_histogram": { - "date_histogram": { - "field": "@timestamp", - "calendar_interval": "month" - }, - "aggs": { - "sum_of_bytes": { - "sum": { - "field": "bytes" - } - }, - "moving_avg_of_sum_of_bytes": { - "moving_avg": { - "buckets_path": "sum_of_bytes", - "window": 30, - "model": "simple" - } - } - } - } - } -} -``` - - -#### Example response - -```json -... -"aggregations" : { - "my_date_histogram" : { - "buckets" : [ - { - "key_as_string" : "2020-10-01T00:00:00.000Z", - "key" : 1601510400000, - "doc_count" : 1635, - "sum_of_bytes" : { - "value" : 9400200.0 - } - }, - { - "key_as_string" : "2020-11-01T00:00:00.000Z", - "key" : 1604188800000, - "doc_count" : 6844, - "sum_of_bytes" : { - "value" : 3.8880434E7 - }, - "moving_avg_of_sum_of_bytes" : { - "value" : 9400200.0 - } - }, - { - "key_as_string" : "2020-12-01T00:00:00.000Z", - "key" : 1606780800000, - "doc_count" : 5595, - "sum_of_bytes" : { - "value" : 3.1445055E7 - }, - "moving_avg_of_sum_of_bytes" : { - "value" : 2.4140317E7 - } - } - ] - } - } -} -``` - -The following example uses a `holt` model. You can set the speed at which the importance decays occurs with the `alpha` and `beta` setting. The default value of `alpha` is 0.3 and `beta` is 0.1. You can specify any float value between 0-1 inclusive. - -```json -GET opensearch_dashboards_sample_data_logs/_search -{ - "size": 0, - "aggs": { - "my_date_histogram": { - "date_histogram": { - "field": "@timestamp", - "calendar_interval": "month" - }, - "aggs": { - "sum_of_bytes": { - "sum": { - "field": "bytes" - } - }, - "moving_avg_of_sum_of_bytes": { - "moving_avg": { - "buckets_path": "sum_of_bytes", - "model": "holt", - "settings": { - "alpha": 0.6, - "beta": 0.4 - } - } - } - } - } - } -} -``` - -#### Example response - -```json -... -"aggregations" : { - "my_date_histogram" : { - "buckets" : [ - { - "key_as_string" : "2020-10-01T00:00:00.000Z", - "key" : 1601510400000, - "doc_count" : 1635, - "sum_of_bytes" : { - "value" : 9400200.0 - } - }, - { - "key_as_string" : "2020-11-01T00:00:00.000Z", - "key" : 1604188800000, - "doc_count" : 6844, - "sum_of_bytes" : { - "value" : 3.8880434E7 - }, - "moving_avg_of_sum_of_bytes" : { - "value" : 9400200.0 - } - }, - { - "key_as_string" : "2020-12-01T00:00:00.000Z", - "key" : 1606780800000, - "doc_count" : 5595, - "sum_of_bytes" : { - "value" : 3.1445055E7 - }, - "moving_avg_of_sum_of_bytes" : { - "value" : 2.70883404E7 - } - } - ] - } - } -} -``` - - - -## serial_diff - -The `serial_diff` aggregation is a parent pipeline aggregation that computes a series of value differences between a time lag of the buckets from previous aggregations. 
- -You can use the `serial_diff` aggregation to find the data changes between time periods instead of finding the whole value. - -With the `lag` parameter (a positive, non-zero integer value), you can tell which previous bucket to subtract from the current one. If you don't specify the `lag` parameter, OpenSearch sets it to 1. - -Lets say that the population of a city grows with time. If you use the serial differencing aggregation with the period of one day, you can see the daily growth. For example, you can compute a series of differences of the weekly average changes of a total price. - -```json -GET opensearch_dashboards_sample_data_logs/_search -{ - "size": 0, - "aggs": { - "my_date_histogram": { - "date_histogram": { - "field": "@timestamp", - "calendar_interval": "month" - }, - "aggs": { - "the_sum": { - "sum": { - "field": "bytes" - } - }, - "thirtieth_difference": { - "serial_diff": { - "buckets_path": "the_sum", - "lag" : 30 - } - } - } - } - } -} -``` - -#### Example response - -```json -... -"aggregations" : { - "my_date_histogram" : { - "buckets" : [ - { - "key_as_string" : "2020-10-01T00:00:00.000Z", - "key" : 1601510400000, - "doc_count" : 1635, - "the_sum" : { - "value" : 9400200.0 - } - }, - { - "key_as_string" : "2020-11-01T00:00:00.000Z", - "key" : 1604188800000, - "doc_count" : 6844, - "the_sum" : { - "value" : 3.8880434E7 - } - }, - { - "key_as_string" : "2020-12-01T00:00:00.000Z", - "key" : 1606780800000, - "doc_count" : 5595, - "the_sum" : { - "value" : 3.1445055E7 - } - } - ] - } - } -} -``` diff --git a/_aggregations/pipeline/avg-bucket.md b/_aggregations/pipeline/avg-bucket.md new file mode 100644 index 00000000000..02ee4087cc2 --- /dev/null +++ b/_aggregations/pipeline/avg-bucket.md @@ -0,0 +1,112 @@ +--- +layout: default +title: Average bucket +parent: Pipeline aggregations +nav_order: 10 +--- + +# Average bucket aggregations + +The `avg_bucket` aggregation is a sibling aggregation that calculates the average of a metric in each bucket of a previous aggregation. + +The specified metric must be numeric, and the sibling aggregation must be a multi-bucket aggregation. + +## Parameters + +The `avg_bucket` aggregation takes the following parameters. + +| Parameter | Required/Optional | Data type | Description | +| :-- | :-- | :-- | :-- | +| `buckets_path` | Required | String | The path of the aggregation buckets to be aggregated. See [Buckets path]({{site.url}}{{site.baseurl}}/aggregations/pipeline/index#buckets-path). | +| `gap_policy` | Optional | String | The policy to apply to missing data. Valid values are `skip` and `insert_zeros`. Default is `skip`. See [Data gaps]({{site.url}}{{site.baseurl}}/aggregations/pipeline/index#data-gaps).| +| `format` | Optional | String | A [DecimalFormat](https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/text/DecimalFormat.html) formatting string. Returns the formatted output in the aggregation's `value_as_string` property. | + +## Example + +The following example creates a date histogram with a one-month interval from the OpenSearch Dashboards e-commerce sample data. The `sum` subaggregation calculates the sum of bytes for each month. 
Finally, the `avg_bucket` aggregation calculates the average number of bytes per month from these sums: + +```json +POST opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "visits_per_month": { + "date_histogram": { + "field": "@timestamp", + "interval": "month" + }, + "aggs": { + "sum_of_bytes": { + "sum": { + "field": "bytes" + } + } + } + }, + "avg_monthly_bytes": { + "avg_bucket": { + "buckets_path": "visits_per_month>sum_of_bytes" + } + } + } +} +``` +{% include copy-curl.html %} + +## Example response + +The aggregation returns the average bytes from the monthly buckets: + +```json +{ + "took": 43, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 10000, + "relation": "gte" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "visits_per_month": { + "buckets": [ + { + "key_as_string": "2025-03-01T00:00:00.000Z", + "key": 1740787200000, + "doc_count": 480, + "sum_of_bytes": { + "value": 2804103 + } + }, + { + "key_as_string": "2025-04-01T00:00:00.000Z", + "key": 1743465600000, + "doc_count": 6849, + "sum_of_bytes": { + "value": 39103067 + } + }, + { + "key_as_string": "2025-05-01T00:00:00.000Z", + "key": 1746057600000, + "doc_count": 6745, + "sum_of_bytes": { + "value": 37818519 + } + } + ] + }, + "avg_monthly_bytes": { + "value": 26575229.666666668 + } + } +} +``` diff --git a/_aggregations/pipeline/bucket-script.md b/_aggregations/pipeline/bucket-script.md new file mode 100644 index 00000000000..93dae48bf76 --- /dev/null +++ b/_aggregations/pipeline/bucket-script.md @@ -0,0 +1,161 @@ +--- +layout: default +title: Bucket script +parent: Pipeline aggregations +nav_order: 20 +--- + +# Bucket script aggregations + +The `bucket_script` aggregation is a parent pipeline aggregation that executes a script to perform per-bucket numeric computations across a set of buckets. Use the `bucket_script` aggregation to perform custom numeric computations on multiple metrics in a bucketed aggregation. For example, you can: + +- Calculate derived and composite metrics. +- Apply conditional logic using if/else statements. +- Compute business-specific KPIs, such as custom scoring metrics. + +## Parameters + +The `bucket_script` aggregation takes the following parameters. + +| Parameter | Required/Optional | Data type | Description | +| :-- | :-- | :-- | :-- | +| `buckets_path` | Required | Object | A map of variable names to bucketed metrics that identify the metrics to be used in the script. The metrics must be numeric. See [Script variables](#script-variables). | +| `script` | Required | String or Object | The script to execute. Can be an inline script, stored script, or script file. The script has access to the variable names defined in the `buckets_path` parameter. Must return a numeric value. | +| `gap_policy` | Optional | String | The policy to apply to missing data. Valid values are `skip` and `insert_zeros`. Default is `skip`. See [Data gaps]({{site.url}}{{site.baseurl}}/aggregations/pipeline/#data-gaps). | +| `format` | Optional | String | A [DecimalFormat](https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/text/DecimalFormat.html) formatting string. Returns the formatted output in the aggregation's `value_as_string` parameter. | + +## Script variables + +The `buckets_path` parameter maps script variable names to metrics from parent aggregations. These variables can then be used in the script. 
+ +For the `bucket_script` and `bucket_selector` aggregations, the `buckets_path` parameter is an object rather than a string because it must refer to multiple bucket metrics. See the [Pipeline aggregations]({{site.url}}{{site.baseurl}}/aggregations/pipeline/index#buckets-path) page for a description of the string version of `buckets_path`. +{: .note} + +The following `buckets_path` maps the `sales_sum` metric to the `total_sales` script variable and the `item_count` metric to the `item_count` script variable: + +```json +"buckets_path": { + "total_sales": "sales_sum", + "item_count": "item_count" +} +``` + +The mapped variables can be accessed from the `params` context. For example: + +- `params.total_sales` +- `params.item_count` + +## Enabling inline scripting + +Use the `script` parameter to add your script. The script can be inline, in a file, or in an index. To enable inline scripting, the `opensearch.yml` file in the `config` folder must contain the following: + +```yml +script.inline: on +``` + +## Example + +The following example creates a date histogram with a one-month interval from the OpenSearch Dashboards e-commerce sample data. The `total_sales` subaggregation sums the taxed price of all items sold for each month. The `vendor_count` aggregation counts the total number of unique vendors for each month. Finally, the `avg_vendor_spend` aggregation uses an inline script to calculate the average amount spent per vendor each month: + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "sales_per_month": { + "date_histogram": { + "field": "order_date", + "calendar_interval": "month" + }, + "aggs": { + "total_sales": { + "sum": { + "field": "taxful_total_price" + } + }, + "vendor_count": { + "cardinality": { + "field": "products.manufacturer.keyword" + } + }, + "avg_vendor_spend": { + "bucket_script": { + "buckets_path": { + "sales": "total_sales", + "vendors": "vendor_count" + }, + "script": "params.sales / params.vendors", + "format": "$#,###.00" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Example response + +The aggregation returns the formatted monthly average vendor spend: + +```json +{ + "took": 6, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4675, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "sales_per_month": { + "buckets": [ + { + "key_as_string": "2025-03-01T00:00:00.000Z", + "key": 1740787200000, + "doc_count": 721, + "vendor_count": { + "value": 21 + }, + "total_sales": { + "value": 53468.1484375 + }, + "avg_vendor_spend": { + "value": 2546.1023065476193, + "value_as_string": "$2,546.10" + } + }, + { + "key_as_string": "2025-04-01T00:00:00.000Z", + "key": 1743465600000, + "doc_count": 3954, + "vendor_count": { + "value": 21 + }, + "total_sales": { + "value": 297415.98046875 + }, + "avg_vendor_spend": { + "value": 14162.665736607143, + "value_as_string": "$14,162.67" + } + } + ] + } + } +} +``` + + + + diff --git a/_aggregations/pipeline/bucket-selector.md b/_aggregations/pipeline/bucket-selector.md new file mode 100644 index 00000000000..e8e00a6a1ff --- /dev/null +++ b/_aggregations/pipeline/bucket-selector.md @@ -0,0 +1,120 @@ +--- +layout: default +title: Bucket selector +parent: Pipeline aggregations +nav_order: 30 +--- + +# Bucket selector aggregations + +The `bucket_selector` aggregation is a parent pipeline aggregation that evaluates a script to determine whether 
buckets returned by a `histogram` (or `date_histogram`) aggregation should be included in the final result. + +Unlike pipeline aggregations that create new values, the `bucket_selector` aggregation acts as a filter, keeping or removing entire buckets based on the specified criteria. Use this aggregation to filter buckets based on the computed metrics of a bucket. + +## Parameters + +The `bucket_selector` aggregation takes the following parameters. + +| Parameter | Required/Optional | Data type | Description | +| :-- | :-- | :-- | :-- | +| `buckets_path` | Required | Object | A map of variable names to bucketed metrics that identify the metrics to be used in the script. The metrics must be numeric. See [Script variables]({{site.url}}{{site.baseurl}}/aggregations/pipeline/bucket-script#script-variables). | +| `script` | Required | String or Object | The script to execute. Can be an inline script, stored script, or script file. The script has access to the variable names defined in the `buckets_path` parameter. Must return a Boolean value. Buckets returning `false` are removed from the final output. | +| `gap_policy` | Optional | String | The policy to apply to missing data. Valid values are `skip` and `insert_zeros`. Default is `skip`. See [Data gaps]({{site.url}}{{site.baseurl}}/aggregations/pipeline/#data-gaps). | + + +## Example + +The following example creates a date histogram with a one-week interval from the OpenSearch Dashboards e-commerce sample data. The `sum` subaggregation calculates the sum of all sales for each week. Finally, the `bucket_selector` aggregation filters the resulting weekly buckets, removing all the buckets that do not have a sum of more than $75,000: + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "sales_per_week": { + "date_histogram": { + "field": "order_date", + "calendar_interval": "week" + }, + "aggs": { + "weekly_sales": { + "sum": { + "field": "taxful_total_price", + "format": "$#,###.00" + } + }, + "avg_vendor_spend": { + "bucket_selector": { + "buckets_path": { + "weekly_sales": "weekly_sales" + }, + "script": "params.weekly_sales > 75000" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Example response + +The aggregation returns the `sales_per_week` buckets that meet the scripted criterion: + +```json +{ + "took": 3, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4675, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "sales_per_week": { + "buckets": [ + { + "key_as_string": "2025-03-31T00:00:00.000Z", + "key": 1743379200000, + "doc_count": 1048, + "weekly_sales": { + "value": 79448.60546875, + "value_as_string": "$79,448.61" + } + }, + { + "key_as_string": "2025-04-07T00:00:00.000Z", + "key": 1743984000000, + "doc_count": 1048, + "weekly_sales": { + "value": 78208.4296875, + "value_as_string": "$78,208.43" + } + }, + { + "key_as_string": "2025-04-14T00:00:00.000Z", + "key": 1744588800000, + "doc_count": 1073, + "weekly_sales": { + "value": 81277.296875, + "value_as_string": "$81,277.30" + } + } + ] + } + } +} +``` + +Because it returns a Boolean rather than a numeric value, the `buckets_selector` aggregation does not take a `format` parameter. In this example, the formatted metrics are returned in the `value_as_string` result by the `sum` subaggregation. 
Contrast this with the [example in the `bucket_script` aggregation]({{site.url}}{{site.baseurl}}/aggregations/pipeline/bucket-script/#example). +{: .note} \ No newline at end of file diff --git a/_aggregations/pipeline/bucket-sort.md b/_aggregations/pipeline/bucket-sort.md new file mode 100644 index 00000000000..2c1f6c5f6eb --- /dev/null +++ b/_aggregations/pipeline/bucket-sort.md @@ -0,0 +1,229 @@ +--- +layout: default +title: Bucket sort +parent: Pipeline aggregations +nav_order: 40 +--- + +# Bucket sort aggregations + +The `bucket_sort` aggregation is a parent aggregation that sorts or truncates the buckets produced by its parent multi-bucket aggregation. + +In `bucket_sort` aggregations, you can sort buckets by multiple fields, each with its own sort order. Buckets can be sorted by their key, document count, or values from subaggregations. You can also use the `from` and `size` parameters to truncate the results, with or without sorting. + +For information about specifying sort order, see [Sort results]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/sort/). + +## Parameters + +The `bucket_sort` aggregation takes the following parameters. + +| Parameter | Required/Optional | Data type | Description | +| :-- | :-- | :-- | :-- | +| `gap_policy` | Optional | String | The policy to apply to missing data. Valid values are `skip` and `insert_zeros`. Default is `skip`. See [Data gaps]({{site.url}}{{site.baseurl}}/aggregations/pipeline/#data-gaps). | +| `sort` | Optional | String | A list of fields by which to sort. See [Sort results]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/sort/). | +| `from` | Optional | String | The index of the first result to return. Must be a non-negative integer. Default is `0`. See [The `from` and `size` parameters]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/paginate/#the-from-and-size-parameters). | +| `size` | Optional | String | The maximum number of results to return. Must be a positive integer. See [The `from` and `size` parameters]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/paginate/#the-from-and-size-parameters).| + +You must supply at least one of `sort`, `from`, and `size`. +{: .note} + +## Example + +The following example creates a date histogram with a one-month interval from the OpenSearch Dashboards e-commerce sample data. The `sum` subaggregation calculates the sum of all bytes for each month. 
Finally, the aggregation sorts the buckets in descending order by number of bytes: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "sales_per_month": { + "date_histogram": { + "field": "@timestamp", + "calendar_interval": "month" + }, + "aggs": { + "total_bytes": { + "sum": { + "field": "bytes" + } + }, + "bytes_bucket_sort": { + "bucket_sort": { + "sort": [ + { "total_bytes": { "order": "desc" } } + ] + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Example response + +The aggregation reorders the buckets in descending order by total number of bytes: + +```json +{ + "took": 3, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 10000, + "relation": "gte" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "sales_per_month": { + "buckets": [ + { + "key_as_string": "2025-05-01T00:00:00.000Z", + "key": 1746057600000, + "doc_count": 7072, + "total_bytes": { + "value": 40124337 + } + }, + { + "key_as_string": "2025-06-01T00:00:00.000Z", + "key": 1748736000000, + "doc_count": 6056, + "total_bytes": { + "value": 34123131 + } + }, + { + "key_as_string": "2025-04-01T00:00:00.000Z", + "key": 1743465600000, + "doc_count": 946, + "total_bytes": { + "value": 5478221 + } + } + ] + } + } +} +``` + +## Example: Truncating the results + +To truncate the results, provide the `from` and/or `size` parameters. The following example performs the same sort but returns two buckets, starting with the second bucket: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "sales_per_month": { + "date_histogram": { + "field": "@timestamp", + "calendar_interval": "month" + }, + "aggs": { + "total_bytes": { + "sum": { + "field": "bytes" + } + }, + "bytes_bucket_sort": { + "bucket_sort": { + "sort": [ + { "total_bytes": { "order": "desc" } } + ], + "from": 1, + "size": 2 + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +The aggregation returns the two sorted buckets: + +```json +{ + "took": 2, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 10000, + "relation": "gte" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "sales_per_month": { + "buckets": [ + { + "key_as_string": "2025-06-01T00:00:00.000Z", + "key": 1748736000000, + "doc_count": 6056, + "total_bytes": { + "value": 34123131 + } + }, + { + "key_as_string": "2025-04-01T00:00:00.000Z", + "key": 1743465600000, + "doc_count": 946, + "total_bytes": { + "value": 5478221 + } + } + ] + } + } +} +``` + +To truncate results without sorting, omit the `sort` parameter: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "sales_per_month": { + "date_histogram": { + "field": "@timestamp", + "calendar_interval": "month" + }, + "aggs": { + "total_bytes": { + "sum": { + "field": "bytes" + } + }, + "bytes_bucket_sort": { + "bucket_sort": { + "from": 1, + "size": 2 + } + } + } + } + } +} +``` +{% include copy-curl.html %} \ No newline at end of file diff --git a/_aggregations/pipeline/cumulative-sum.md b/_aggregations/pipeline/cumulative-sum.md new file mode 100644 index 00000000000..f93fc8589e0 --- /dev/null +++ b/_aggregations/pipeline/cumulative-sum.md @@ -0,0 +1,117 @@ +--- +layout: default +title: Cumulative sum +parent: Pipeline aggregations +has_children: false +nav_order: 60 +--- + +# Cumulative sum aggregations + 
+The `cumulative_sum` aggregation is a parent aggregation that calculates the cumulative sum across the buckets of a previous aggregation. + +A cumulative sum is a sequence of partial sums of a given sequence. For example, the cumulative sums of the sequence `{a,b,c,…}` are `a`, `a+b`, `a+b+c`, and so on. You can use the cumulative sum to visualize the rate of change of a field over time. + +## Parameters + +The `cumulative_sum` aggregation takes the following parameters. + +| Parameter | Required/Optional | Data type | Description | +| :-- | :-- | :-- | :-- | +| `buckets_path` | Required | String | The path of the aggregation buckets to be aggregated. See [Buckets path]({{site.url}}{{site.baseurl}}/aggregations/pipeline/index#buckets-path). | +| `format` | Optional | String | A [DecimalFormat](https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/text/DecimalFormat.html) formatting string. Returns the formatted output in the aggregation's `value_as_string` property. | + + +## Example + +The following example creates a date histogram with a one-month interval from the OpenSearch Dashboards e-commerce sample data. The `sum` subaggregation calculates the sum of all bytes for each month. Finally, the `cumulative_sum` aggregation calculates the cumulative number of bytes for each month's bucket: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "sales_per_month": { + "date_histogram": { + "field": "@timestamp", + "calendar_interval": "month" + }, + "aggs": { + "no-of-bytes": { + "sum": { + "field": "bytes" + } + }, + "cumulative_bytes": { + "cumulative_sum": { + "buckets_path": "no-of-bytes" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "took": 8, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 10000, + "relation": "gte" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "sales_per_month": { + "buckets": [ + { + "key_as_string": "2025-03-01T00:00:00.000Z", + "key": 1740787200000, + "doc_count": 480, + "no-of-bytes": { + "value": 2804103 + }, + "cumulative_bytes": { + "value": 2804103 + } + }, + { + "key_as_string": "2025-04-01T00:00:00.000Z", + "key": 1743465600000, + "doc_count": 6849, + "no-of-bytes": { + "value": 39103067 + }, + "cumulative_bytes": { + "value": 41907170 + } + }, + { + "key_as_string": "2025-05-01T00:00:00.000Z", + "key": 1746057600000, + "doc_count": 6745, + "no-of-bytes": { + "value": 37818519 + }, + "cumulative_bytes": { + "value": 79725689 + } + } + ] + } + } +} +``` diff --git a/_aggregations/pipeline/derivative.md b/_aggregations/pipeline/derivative.md new file mode 100644 index 00000000000..280055c6cdb --- /dev/null +++ b/_aggregations/pipeline/derivative.md @@ -0,0 +1,218 @@ +--- +layout: default +title: Derivative +parent: Pipeline aggregations +nav_order: 70 +--- + +# Derivative aggregations + +The `derivative` aggregation is a parent aggregation used to calculate first-order and second-order derivatives of each bucket of an aggregation. "First-order derivative" and "second-order derivative" are often shortened to "first derivative" and "second derivative," respectively. This page uses the shortened terms. + +For an ordered series of buckets, `derivative` approximates a first derivative as the difference between metric values in the current and previous buckets. + +## Parameters + +The `derivative` aggregation takes the following parameters. 
+ +| Parameter | Required/Optional | Data type | Description | +| :-- | :-- | :-- | :-- | +| `buckets_path` | Required | String | The path of the aggregation buckets to be aggregated. See [Buckets path]({{site.url}}{{site.baseurl}}/aggregations/pipeline/index#buckets-path). | +| `gap_policy` | Optional | String | The policy to apply to missing data. Valid values are `skip` and `insert_zeros`. Default is `skip`. See [Data gaps]({{site.url}}{{site.baseurl}}/aggregations/pipeline/index/#data-gaps). | +| `format` | Optional | String | A [DecimalFormat](https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/text/DecimalFormat.html) formatting string. Returns the formatted output in the aggregation's `value_as_string` property. | + +## Example: First derivative + +The following example creates a date histogram with a one-month interval from the OpenSearch Dashboards e-commerce sample data. The `sum` sub-aggregation calculates the sum of all bytes for each month. Finally, the `derivative` aggregation calculates the first derivative of the `sum` sub-aggregation. The first derivative is estimated as the difference between the number of bytes in the current month and the previous month: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "sales_per_month": { + "date_histogram": { + "field": "@timestamp", + "calendar_interval": "month" + }, + "aggs": { + "number_of_bytes": { + "sum": { + "field": "bytes" + } + }, + "bytes_deriv": { + "derivative": { + "buckets_path": "number_of_bytes" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Example response: First derivative + +The response shows derivatives computed for the second and third buckets: + +```json +{ + "took": 6, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 10000, + "relation": "gte" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "sales_per_month": { + "buckets": [ + { + "key_as_string": "2025-03-01T00:00:00.000Z", + "key": 1740787200000, + "doc_count": 480, + "number_of_bytes": { + "value": 2804103 + } + }, + { + "key_as_string": "2025-04-01T00:00:00.000Z", + "key": 1743465600000, + "doc_count": 6849, + "number_of_bytes": { + "value": 39103067 + }, + "bytes_deriv": { + "value": 36298964 + } + }, + { + "key_as_string": "2025-05-01T00:00:00.000Z", + "key": 1746057600000, + "doc_count": 6745, + "number_of_bytes": { + "value": 37818519 + }, + "bytes_deriv": { + "value": -1284548 + } + } + ] + } + } +} +``` + +No derivative is calculated for the first bucket because no previous bucket is available for that bucket. 
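The examples on this page use only `buckets_path`. As a sketch of how the optional parameters from the preceding table might be combined, the following request formats the derivative value and replaces missing data with zeros; the format pattern and aggregation names are illustrative:

```json
GET opensearch_dashboards_sample_data_logs/_search
{
  "size": 0,
  "aggs": {
    "sales_per_month": {
      "date_histogram": {
        "field": "@timestamp",
        "calendar_interval": "month"
      },
      "aggs": {
        "number_of_bytes": {
          "sum": { "field": "bytes" }
        },
        "bytes_deriv": {
          "derivative": {
            "buckets_path": "number_of_bytes",
            "gap_policy": "insert_zeros",
            "format": "#,###.0"
          }
        }
      }
    }
  }
}
```
{% include copy-curl.html %}

With `format` set, each computed derivative is also returned as a formatted string in the bucket's `value_as_string` property.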
+ +## Example: Second derivative + +To calculate a second derivative, chain one derivative aggregation to another: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "sales_per_month": { + "date_histogram": { + "field": "@timestamp", + "calendar_interval": "month" + }, + "aggs": { + "number_of_bytes": { + "sum": { + "field": "bytes" + } + }, + "bytes_1st_deriv": { + "derivative": { + "buckets_path": "number_of_bytes" + } + }, + "bytes_2nd_deriv": { + "derivative": { + "buckets_path": "bytes_1st_deriv" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Example response: Second derivative + +```json +{ + "took": 6, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 10000, + "relation": "gte" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "sales_per_month": { + "buckets": [ + { + "key_as_string": "2025-03-01T00:00:00.000Z", + "key": 1740787200000, + "doc_count": 480, + "number_of_bytes": { + "value": 2804103 + } + }, + { + "key_as_string": "2025-04-01T00:00:00.000Z", + "key": 1743465600000, + "doc_count": 6849, + "number_of_bytes": { + "value": 39103067 + }, + "bytes_1st_deriv": { + "value": 36298964 + } + }, + { + "key_as_string": "2025-05-01T00:00:00.000Z", + "key": 1746057600000, + "doc_count": 6745, + "number_of_bytes": { + "value": 37818519 + }, + "bytes_1st_deriv": { + "value": -1284548 + }, + "bytes_2nd_deriv": { + "value": -37583512 + } + } + ] + } + } +} +``` + +No first derivative is calculated for the first bucket because no previous bucket is available for that bucket. Similarly, no second derivative is calculated for the first or second buckets. diff --git a/_aggregations/pipeline/extended-stats.md b/_aggregations/pipeline/extended-stats.md new file mode 100644 index 00000000000..2d18659d968 --- /dev/null +++ b/_aggregations/pipeline/extended-stats.md @@ -0,0 +1,179 @@ +--- +layout: default +title: Extended stats bucket +parent: Pipeline aggregations +nav_order: 80 +--- + +# Extended stats bucket aggregation + +The `extended_stats_bucket` aggregation is a more comprehensive version of the [`stats_bucket`]({{site.url}}{{site.baseurl}}/aggregations/pipeline/stats-bucket/) sibling aggregation. As well as the basic statistical measures provided by `stats_bucket`, `extended_stats_bucket` calculates the following metrics: + +- Sum of squares +- Variance +- Population variance +- Sampling variance +- Standard deviation +- Population standard deviation +- Sampling standard deviation +- Standard deviation bounds: + - Upper + - Lower + - Population upper + - Population lower + - Sampling upper + - Sampling lower + +The standard deviation and variance are population statistics; they are always equal to the population standard deviation and variance, respectively. + +The `std_deviation_bounds` object defines a range that spans the specified number of standard deviations above and below the mean (default is two standard deviations). This object is always included in the output but is meaningful only for normally distributed data. Before interpreting these values, verify that your dataset follows a normal distribution. + +The specified metric must be numeric, and the sibling aggregation must be a multi-bucket aggregation. + +## Parameters + +The `extended_stats_bucket` aggregation takes the following parameters. 
+ +| Parameter | Required/Optional | Data type | Description | +| :-- | :-- | :-- | :-- | +| `buckets_path` | Required | String | The path of the aggregation buckets to be aggregated. See [Buckets path]({{site.url}}{{site.baseurl}}/aggregations/pipeline/index#buckets-path). | +| `gap_policy` | Optional | String | The policy to apply to missing data. Valid values are `skip` and `insert_zeros`. Default is `skip`. See [Data gaps]({{site.url}}{{site.baseurl}}/aggregations/pipeline/#data-gaps).| +| `format` | Optional | String | A [DecimalFormat](https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/text/DecimalFormat.html) formatting string. Returns the formatted output in the aggregation's `_as_string` property. | +| `sigma` | Optional | Double (non-negative) | The number of standard deviations above and below the mean used to calculate the `std_deviation_bounds` interval. Default is `2`. See [Defining bounds]({{site.url}}{{site.baseurl}}/aggregations/metric/extended-stats#defining-bounds) in `extended_stats`. | + +## Example + +The following example creates a date histogram with a one-month interval using the OpenSearch Dashboards e-commerce sample data. The `sum` sub-aggregation calculates the sum of all bytes for each month. Finally, the `extended_stats_bucket` aggregation returns the extended stats for these sums: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "visits_per_month": { + "date_histogram": { + "field": "@timestamp", + "interval": "month" + }, + "aggs": { + "sum_of_bytes": { + "sum": { + "field": "bytes" + } + } + } + }, + "stats_monthly_bytes": { + "extended_stats_bucket": { + "buckets_path": "visits_per_month>sum_of_bytes", + "sigma": 3, + "format": "0.##E0" + } + } + } +} +``` +{% include copy-curl.html %} + +## Example response + +The response contains extended stats for the selected buckets. Note that the standard deviation bounds are for a three-sigma range; changing `sigma` (or letting it default to `2`) returns different results: + +
+ + Response + + +```json +{ + "took": 6, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 10000, + "relation": "gte" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "visits_per_month": { + "buckets": [ + { + "key_as_string": "2025-03-01T00:00:00.000Z", + "key": 1740787200000, + "doc_count": 480, + "sum_of_bytes": { + "value": 2804103 + } + }, + { + "key_as_string": "2025-04-01T00:00:00.000Z", + "key": 1743465600000, + "doc_count": 6849, + "sum_of_bytes": { + "value": 39103067 + } + }, + { + "key_as_string": "2025-05-01T00:00:00.000Z", + "key": 1746057600000, + "doc_count": 6745, + "sum_of_bytes": { + "value": 37818519 + } + } + ] + }, + "stats_monthly_bytes": { + "count": 3, + "min": 2804103, + "max": 39103067, + "avg": 26575229.666666668, + "sum": 79725689, + "min_as_string": "2.8E6", + "max_as_string": "3.91E7", + "avg_as_string": "2.66E7", + "sum_as_string": "7.97E7", + "sum_of_squares": 2967153221794459, + "variance": 282808242095406.25, + "variance_population": 282808242095406.25, + "variance_sampling": 424212363143109.4, + "std_deviation": 16816903.46334325, + "std_deviation_population": 16816903.46334325, + "std_deviation_sampling": 20596416.2694171, + "std_deviation_bounds": { + "upper": 77025940.05669643, + "lower": -23875480.72336309, + "upper_population": 77025940.05669643, + "lower_population": -23875480.72336309, + "upper_sampling": 88364478.47491796, + "lower_sampling": -35214019.141584635 + }, + "sum_of_squares_as_string": "2.97E15", + "variance_as_string": "2.83E14", + "variance_population_as_string": "2.83E14", + "variance_sampling_as_string": "4.24E14", + "std_deviation_as_string": "1.68E7", + "std_deviation_population_as_string": "1.68E7", + "std_deviation_sampling_as_string": "2.06E7", + "std_deviation_bounds_as_string": { + "upper": "7.7E7", + "lower": "-2.39E7", + "upper_population": "7.7E7", + "lower_population": "-2.39E7", + "upper_sampling": "8.84E7", + "lower_sampling": "-3.52E7" + } + } + } +} +``` + +
diff --git a/_aggregations/pipeline/index.md b/_aggregations/pipeline/index.md new file mode 100644 index 00000000000..9f79786357b --- /dev/null +++ b/_aggregations/pipeline/index.md @@ -0,0 +1,250 @@ +--- +layout: default +title: Pipeline aggregations +nav_order: 5 +has_children: true +has_toc: false +redirect_from: + - /opensearch/pipeline-agg/ + - /query-dsl/aggregations/pipeline-agg/ + - /aggregations/pipeline/ + - /aggregations/pipeline-agg/ +--- + +# Pipeline aggregations + +Pipeline aggregations chain together multiple aggregations by using the output of one aggregation as the input for another. They compute complex statistical and mathematical measures like derivatives, moving averages, and cumulative sums. Some pipeline aggregations duplicate the functionality of metric and bucket aggregations but, in many cases, are more intuitive to use. + +Pipeline aggregations are executed after all other sibling aggregations. This has performance implications. For example, using the `bucket_selector` pipeline aggregation to narrow a list of buckets does not reduce the number of computations performed on omitted buckets. +{: .note} + +Pipeline aggregations cannot be sub-aggregated but can be chained to other pipeline aggregations. For example, you can calculate a second derivative by chaining two consecutive `derivative` aggregations. Keep in mind that pipeline aggregations append to existing output. For example, computing a second derivative by chaining `derivative` aggregations outputs both the first and second derivatives. + +## Pipeline aggregation types + +Pipeline aggregations are of two types: [sibling](#sibling-aggregations) and [parent](#parent-aggregations). + +### Sibling aggregations + +A _sibling_ pipeline aggregation takes the output of a nested aggregation and produces new buckets or new aggregations at the same level as the nested buckets. + +A sibling aggregation must be a multi-bucket aggregation (have multiple grouped values for a certain field), and the metric must be a numeric value. + +### Parent aggregations + +A _parent_ aggregation takes the output of an outer aggregation and produces new buckets or new aggregations at the same level as the existing buckets. Unlike sibling pipeline aggregations, which operate across all buckets and produce a single output, parent pipeline aggregations process each bucket individually and write the result back into each bucket. + +The specified metric for a parent aggregation must be a numeric value. + +We strongly recommend setting `min_doc_count` to `0` (the default for `histogram` aggregations) for parent aggregations. If `min_doc_count` is greater than `0`, then the aggregation omits buckets, which might lead to incorrect results. +{: .important} + +## Supported pipeline aggregations + +OpenSearch supports the following pipeline aggregations. + +| Name | Type | Description | +|------|------|-------------| +| [`avg_bucket`]({{site.url}}{{site.baseurl}}/aggregations/pipeline/avg-bucket/) | Sibling | Calculates the average of a metric in each bucket of a previous aggregation. | +| [`bucket_script`]({{site.url}}{{site.baseurl}}/aggregations/pipeline/bucket-script/) | Parent | Executes a script to perform per-bucket numeric computations across a set of buckets. | +| [`bucket_selector`]({{site.url}}{{site.baseurl}}/aggregations/pipeline/bucket-selector/) | Parent | Evaluates a script to determine whether buckets returned by a `histogram` (or `date_histogram`) aggregation should be included in the final result. 
| +| [`bucket_sort`]({{site.url}}{{site.baseurl}}/aggregations/pipeline/bucket-selector/) | Parent | Sorts or truncates the buckets produced by its parent multi-bucket aggregation. | +| [`cumulative_sum`]({{site.url}}{{site.baseurl}}/aggregations/pipeline/cumulative-sum/) | Parent | Calculates the cumulative sum across the buckets of a previous aggregation. | +| [`derivative`]({{site.url}}{{site.baseurl}}/aggregations/pipeline/derivative/) | Parent | Calculates first-order and second-order derivatives of each bucket of an aggregation. | +| [`extended_stats`]({{site.url}}{{site.baseurl}}/aggregations/pipeline/extended-stats/) | Sibling | A more comprehensive version of the `stats_bucket` aggregation that provides additional metrics. | +| [`max_bucket`]({{site.url}}{{site.baseurl}}/aggregations/pipeline/max-bucket/) | Sibling | Calculates the maximum of a metric in each bucket of a previous aggregation. | +| [`min_bucket`]({{site.url}}{{site.baseurl}}/aggregations/pipeline/min-bucket/) | Sibling | Calculates the minimum of a metric in each bucket of a previous aggregation. | +| [`moving_avg`]({{site.url}}{{site.baseurl}}/aggregations/pipeline/moving-avg/) *(Deprecated)* | Parent | Calculates a sequence of averages of a metric contained in windows (adjacent subsets) of an ordered dataset. | +| [`moving_fn`]({{site.url}}{{site.baseurl}}/aggregations/pipeline/moving-function/) | Parent | Executes a script over a sliding window. | +| [`percentiles_bucket`]({{site.url}}{{site.baseurl}}/aggregations/pipeline/percentiles-bucket/) | Sibling | Calculates the percentile placement of bucketed metrics. | +| [`serial_diff`]({{site.url}}{{site.baseurl}}/aggregations/pipeline/serial-diff/) | Parent | Calculates the difference between metric values in the current bucket and a previous bucket. It stores the result in the current bucket. | +| [`stats_bucket`]({{site.url}}{{site.baseurl}}/aggregations/pipeline/stats-bucket/) | Sibling | Returns a variety of stats (`count`, `min`, `max`, `avg`, and `sum`) for the buckets of a previous aggregation. | +| [`sum_bucket`]({{site.url}}{{site.baseurl}}/aggregations/pipeline/sum-bucket/) | Sibling | Calculates the sum of a metric in each bucket of a previous aggregation. | + + +## Buckets path + +A pipeline aggregation uses the `buckets_path` parameter to reference the output of other aggregations. +The `buckets_path` parameter has the following syntax: + +```r +buckets_path = [ > ... ][ . ] +``` + +This syntax uses the following elements. + +| Element | Description | +| :-- | :-- | +| `` | The name of the aggregation. | +| `>` | A child selector used to navigate from one aggregation (parent) to another nested aggregation (child). | +| `.` | Specifies a metric to retrieve from a multi-value aggregation. Required only if the target aggregation produces multiple metrics. | + +To visualize the buckets path, suppose you have the following aggregation structure: + +```json +"aggs": { + "parent_agg": { + "terms": { + "field": "category" + }, + "aggs": { + "child_agg": { + "stats": { + "field": "price" + } + } + } + } +} +``` + +To reference the average price from the `child_agg`, which is nested in the `parent_agg`, use `parent_agg>child_agg.avg`. + +Examples: + +- `my_sum.sum`: Refers to the sum metric from the `my_sum` aggregation. + +- `popular_tags>my_sum.sum`: Refers to the `sum` metric from the `my_sum` aggregation, which is nested under the `popular_tags` aggregation. 
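To see the child selector and the metric selector used together in a full request, consider the following sketch. It assumes the OpenSearch Dashboards e-commerce sample data and uses illustrative aggregation names. Because a `stats` subaggregation is multi-value, the path must end with the specific metric (`.max` in this case) that the sibling `avg_bucket` aggregation should average:

```json
GET opensearch_dashboards_sample_data_ecommerce/_search
{
  "size": 0,
  "aggs": {
    "orders_per_month": {
      "date_histogram": {
        "field": "order_date",
        "calendar_interval": "month"
      },
      "aggs": {
        "price_stats": {
          "stats": { "field": "taxful_total_price" }
        }
      }
    },
    "avg_monthly_max_price": {
      "avg_bucket": {
        "buckets_path": "orders_per_month>price_stats.max"
      }
    }
  }
}
```
{% include copy-curl.html %}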
+ +For multi-value metric aggregations like `stats` or `percentiles`, you must include the metric name (for example, `.min`) in the path. For single-value metrics like `sum` or `avg`, the metric name is optional if unambiguous. +{: .tip} + + +### Buckets path example + +The following example operates on the OpenSearch Dashboards logs sample data. It creates a histogram of values in the `bytes` field, sums the `phpmemory` fields in each histogram bucket, and finally sums the buckets using the `sum_bucket` pipeline aggregation. The `buckets_path` follows the `number_of_bytes>sum_total_memory ` path from the `number_of_bytes` parent aggregation to the `sum_total_memory` subaggregation: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "number_of_bytes": { + "histogram": { + "field": "bytes", + "interval": 10000 + }, + "aggs": { + "sum_total_memory": { + "sum": { + "field": "phpmemory" + } + } + } + }, + "sum_copies": { + "sum_bucket": { + "buckets_path": "number_of_bytes>sum_total_memory" + } + } + } +} +``` +{% include copy-curl.html %} + +Note that the `buckets_path` contains the names of the component aggregations. Paths are directed, meaning that they cascade one way, downward from parents to children. + +The pipeline aggregation returns the total memory summed from all the buckets: + +```json +{ + ... + "aggregations": { + "number_of_bytes": { + "buckets": [ + { + "key": 0, + "doc_count": 13372, + "sum_total_memory": { + "value": 91266400 + } + }, + { + "key": 10000, + "doc_count": 702, + "sum_total_memory": { + "value": 0 + } + } + ] + }, + "sum_copies": { + "value": 91266400 + } + } +} +``` + +### Count paths + +You can direct the `buckets_path` to use a count rather than a value as its input. To do so, use the `_count` buckets path variable. + +The following example computes basic stats on a histogram of the number of bytes from the OpenSearch Dashboards logs sample data. It creates a histogram of values in the `bytes` field and then computes the stats on the counts in the histogram buckets. + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "number_of_bytes": { + "histogram": { + "field": "bytes", + "interval": 10000 + } + }, + "count_stats": { + "stats_bucket": { + "buckets_path": "number_of_bytes>_count" + } + } + } +} +``` +{% include copy-curl.html %} + +The results show stats about the *document counts* of the buckets: + +```json +{ +... + "aggregations": { + "number_of_bytes": { + "buckets": [ + { + "key": 0, + "doc_count": 13372 + }, + { + "key": 10000, + "doc_count": 702 + } + ] + }, + "count_stats": { + "count": 2, + "min": 702, + "max": 13372, + "avg": 7037, + "sum": 14074 + } + } +} +``` + +## Data gaps + +Real-world data can be missing from nested aggregations for a number of reasons, including: + +- Missing values in documents. +- Empty buckets anywhere in the chain of aggregations. +- Missing data needed to calculate a bucket value (for example, rolling functions such as `derivative` require one or more previous values to start). + +You can specify a policy to handle missing data using the `gap_policy` property: either skip the missing data or replace the missing data with zeros. + +The `gap_policy` parameter is valid for all pipeline aggregations. + +| Parameter | Required/Optional | Data type | Description | +| :-- | :-- | :-- | :-- | +| `gap_policy` | Optional | String | The policy to apply to missing data. Valid values are `skip` and `insert_zeros`. Default is `skip`. 
| +| `format` | Optional | String | A [DecimalFormat](https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/text/DecimalFormat.html) formatting string. Returns the formatted output in the aggregation's `value_as_string` property. | + diff --git a/_aggregations/pipeline/max-bucket.md b/_aggregations/pipeline/max-bucket.md new file mode 100644 index 00000000000..71d8155e985 --- /dev/null +++ b/_aggregations/pipeline/max-bucket.md @@ -0,0 +1,116 @@ +--- +layout: default +title: Maximum bucket +parent: Pipeline aggregations +nav_order: 100 +--- + +# Maximum bucket aggregations + +The `max_bucket` aggregation is a sibling aggregation that calculates the maximum of a metric in each bucket of a previous aggregation. + +The specified metric must be numeric, and the sibling aggregation must be a multi-bucket aggregation. + +## Parameters + +The `max_bucket` aggregation takes the following parameters. + +| Parameter | Required/Optional | Data type | Description | +| :-- | :-- | :-- | :-- | +| `buckets_path` | Required | String | The path of the aggregation buckets to be aggregated. See [Buckets path]({{site.url}}{{site.baseurl}}/aggregations/pipeline/index#buckets-path). | +| `gap_policy` | Optional | String | The policy to apply to missing data. Valid values are `skip` and `insert_zeros`. Default is `skip`. See [Data gaps]({{site.url}}{{site.baseurl}}/aggregations/pipeline/index#data-gaps).| +| `format` | Optional | String | A [DecimalFormat](https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/text/DecimalFormat.html) formatting string. Returns the formatted output in the aggregation's `value_as_string` property. | + +## Example + +The following example creates a date histogram with a one-month interval from the OpenSearch Dashboards e-commerce sample data. The `sum` subaggregation calculates the sum of bytes for each month. Finally, the `max_bucket` aggregation finds the maximum---the largest of these buckets: + +```json +POST opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "visits_per_month": { + "date_histogram": { + "field": "@timestamp", + "interval": "month" + }, + "aggs": { + "sum_of_bytes": { + "sum": { + "field": "bytes" + } + } + } + }, + "max_monthly_bytes": { + "max_bucket": { + "buckets_path": "visits_per_month>sum_of_bytes" + } + } + } +} +``` +{% include copy-curl.html %} + +## Example response + +The `max_bucket` aggregation returns the maximum value from a specified metric across multiple buckets. In this example, it calculates the maximum number of bytes per month from the `sum_of_bytes` metric inside `visits_per_month`. The `value` field shows the maximum value found across all buckets. The `keys` array contains the keys of the buckets in which this maximum value was observed. It's an array because more than one bucket can have the same maximum value. In such cases, all matching bucket keys are included. 
This ensures that the result is accurate even if multiple time periods (or terms) have the same maximum value: + +```json +{ + "took": 6, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 10000, + "relation": "gte" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "visits_per_month": { + "buckets": [ + { + "key_as_string": "2025-03-01T00:00:00.000Z", + "key": 1740787200000, + "doc_count": 480, + "sum_of_bytes": { + "value": 2804103 + } + }, + { + "key_as_string": "2025-04-01T00:00:00.000Z", + "key": 1743465600000, + "doc_count": 6849, + "sum_of_bytes": { + "value": 39103067 + } + }, + { + "key_as_string": "2025-05-01T00:00:00.000Z", + "key": 1746057600000, + "doc_count": 6745, + "sum_of_bytes": { + "value": 37818519 + } + } + ] + }, + "max_monthly_bytes": { + "value": 39103067, + "keys": [ + "2025-04-01T00:00:00.000Z" + ] + } + } +} +``` + diff --git a/_aggregations/pipeline/min-bucket.md b/_aggregations/pipeline/min-bucket.md new file mode 100644 index 00000000000..ede64038c5f --- /dev/null +++ b/_aggregations/pipeline/min-bucket.md @@ -0,0 +1,117 @@ +--- +layout: default +title: Minimum bucket +parent: Pipeline aggregations +nav_order: 110 +--- + +# Minimum bucket aggregations + +The `min_bucket` aggregation is a sibling aggregation that calculates the minimum of a metric in each bucket of a previous aggregation. + +The specified metric must be numeric, and the sibling aggregation must be a multi-bucket aggregation. + +## Parameters + +The `min_bucket` aggregation takes the following parameters. + +| Parameter | Required/Optional | Data type | Description | +| :-- | :-- | :-- | :-- | +| `buckets_path` | Required | String | The path of the aggregation buckets to be aggregated. See [Buckets path]({{site.url}}{{site.baseurl}}/aggregations/pipeline/index#buckets-path). | +| `gap_policy` | Optional | String | The policy to apply to missing data. Valid values are `skip` and `insert_zeros`. Default is `skip`. See [Data gaps]({{site.url}}{{site.baseurl}}/aggregations/pipeline/index#data-gaps).| +| `format` | Optional | String | A [DecimalFormat](https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/text/DecimalFormat.html) formatting string. Returns the formatted output in the aggregation's `value_as_string` property. | + +## Example + +The following example creates a date histogram with a one-month interval from the OpenSearch Dashboards e-commerce sample data. The `sum` subaggregation calculates the sum of bytes for each month. Finally, the `min_bucket` aggregation finds the minimum---the smallest of these buckets: + +```json +POST opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "visits_per_month": { + "date_histogram": { + "field": "@timestamp", + "interval": "month" + }, + "aggs": { + "sum_of_bytes": { + "sum": { + "field": "bytes" + } + } + } + }, + "min_monthly_bytes": { + "min_bucket": { + "buckets_path": "visits_per_month>sum_of_bytes" + } + } + } +} +``` +{% include copy-curl.html %} + +## Example response + +The `max_bucket` aggregation returns the minimum value from a specified metric across multiple buckets. In this example, it calculates the minimum number of bytes per month from the `sum_of_bytes` metric inside `visits_per_month`. The `value` field shows the minimum value found across all buckets. The `keys` array contains the keys of the buckets in which this minimum value was observed. 
It's an array because more than one bucket can have the same minimum value. In such cases, all matching bucket keys are included. This ensures that the result is accurate even if multiple time periods (or terms) have the same minimum value: + +```json +{ + "took": 7, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 10000, + "relation": "gte" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "visits_per_month": { + "buckets": [ + { + "key_as_string": "2025-03-01T00:00:00.000Z", + "key": 1740787200000, + "doc_count": 480, + "sum_of_bytes": { + "value": 2804103 + } + }, + { + "key_as_string": "2025-04-01T00:00:00.000Z", + "key": 1743465600000, + "doc_count": 6849, + "sum_of_bytes": { + "value": 39103067 + } + }, + { + "key_as_string": "2025-05-01T00:00:00.000Z", + "key": 1746057600000, + "doc_count": 6745, + "sum_of_bytes": { + "value": 37818519 + } + } + ] + }, + "min_monthly_bytes": { + "value": 2804103, + "keys": [ + "2025-03-01T00:00:00.000Z" + ] + } + } +} +``` + + diff --git a/_aggregations/pipeline/moving-avg.md b/_aggregations/pipeline/moving-avg.md new file mode 100644 index 00000000000..106dd068022 --- /dev/null +++ b/_aggregations/pipeline/moving-avg.md @@ -0,0 +1,570 @@ +--- +layout: default +title: Moving average +parent: Pipeline aggregations +nav_order: 120 +--- + +## Moving average aggregations +**Deprecated** +{: .label .label-red } + +The `moving_avg` aggregation has been deprecated in favor of the `moving_fn` aggregation. +{: .important} + +A `moving_avg` aggregation is a parent pipeline aggregation that calculates a sequence of averages of a metric contained in windows (adjacent subsets) of an ordered dataset. + +To create a `moving_avg` aggregation, you first create a `histogram` or `date_histogram` aggregation. Optionally, you then embed a metric aggregation in the histogram aggregation. Finally, you embed the `moving_avg` aggregation in the histogram and set the `buckets_path` parameter to the embedded metric that you want to track. + +A window's size is the number of sequential data values in the window. During each iteration, the algorithm calculates the average for all data points in the window and then slides forward one data value, excluding the first value of the previous window and including the first value of the next window. + +For example, given the data `[1, 5, 8, 23, 34, 28, 7, 23, 20, 19]`, a moving average with a window size of 5 is as follows: + +``` +(1 + 5 + 8 + 23 + 34) / 5 = 14.2 +(5 + 8 + 23 + 34 + 28) / 5 = 19.6 +(8 + 23 + 34 + 28 + 7) / 5 = 20 +and so on ... +``` + +The `moving_avg` aggregation is typically applied to time-series data to smooth out noise or short-term fluctuations and to identify trends. +Specify a small window size to smooth out small-scale fluctuations. Specify a larger window size to smooth out high-frequency fluctuations or random noise, making lower-frequency trends more visible. + +For more information about moving averages, see [Wikipedia](https://en.wikipedia.org/wiki/Moving_average). + +## Parameters + +The `moving_avg` aggregation takes the following parameters. + +| Parameter | Required/Optional | Data type | Description | +| :-- | :-- | :-- | :-- | +| `buckets_path` | Required | String | The path of the buckets to be aggregated. See [Buckets path]({{site.url}}{{site.baseurl}}/aggregations/pipeline/index#buckets-path). | +| `gap_policy` | Optional | String | The policy to apply to missing data. 
Valid values are `skip` and `insert_zeros`. Default is `skip`. See [Data gaps]({{site.url}}{{site.baseurl}}/aggregations/pipeline/#data-gaps). | +| `format` | Optional | String | A [DecimalFormat](https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/text/DecimalFormat.html) formatting string. Returns the formatted output in the aggregation's `value_as_string` property. | +| `window` | Optional | Numerical | The number of data points contained in the window. Default is `5`. | +| `model` | Optional | String | The weighted moving average model to use. Options are `ewma`, `holt`, `holt_winters`, `linear`, and `simple`. Default is `simple`. See [Models](#models). | +| `settings` | Optional | Object | The parameters for adjusting the window. See [Models](#models). | +| `predict` | Optional | Numerical | The number of predicted values to append to the end of the result. Default is `0`. | + + +## Example + +The following example creates a date histogram with a one-month interval from the OpenSearch Dashboards logs sample data. The `sum` subaggregation calculates the sum of all bytes for each month. Finally, the `moving_avg` aggregation calculates the moving average of bytes per month from these sums: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "my_date_histogram": { + "date_histogram": { + "field": "@timestamp", + "calendar_interval": "month" + }, + "aggs": { + "sum_of_bytes": { + "sum": { "field": "bytes" } + }, + "moving_avg_of_sum_of_bytes": { + "moving_avg": { + "buckets_path": "sum_of_bytes" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Example response + +The aggregation returns the `moving_avg` value starting from the second bucket. The first bucket does not have a moving average value because there aren't enough previous data points to calculate it: + +```json +{ + "took": 5, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 10000, + "relation": "gte" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "my_date_histogram": { + "buckets": [ + { + "key_as_string": "2025-03-01T00:00:00.000Z", + "key": 1740787200000, + "doc_count": 480, + "sum_of_bytes": { + "value": 2804103 + } + }, + { + "key_as_string": "2025-04-01T00:00:00.000Z", + "key": 1743465600000, + "doc_count": 6849, + "sum_of_bytes": { + "value": 39103067 + }, + "moving_avg_of_sum_of_bytes": { + "value": 2804103 + } + }, + { + "key_as_string": "2025-05-01T00:00:00.000Z", + "key": 1746057600000, + "doc_count": 6745, + "sum_of_bytes": { + "value": 37818519 + }, + "moving_avg_of_sum_of_bytes": { + "value": 20953585 + } + } + ] + } + } +} +``` + + +## Example: Prediction + +You can use the `moving_avg` aggregation to predict future buckets. + +The following example reduces the interval of the previous example to one week and appends five predicted one-week buckets to the end of the response: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "my_date_histogram": { + "date_histogram": { + "field": "@timestamp", + "calendar_interval": "week" + }, + "aggs": { + "sum_of_bytes": { + "sum": { + "field": "bytes" + } + }, + "moving_avg_of_sum_of_bytes": { + "moving_avg": { + "buckets_path": "sum_of_bytes", + "predict": 5 + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +The response includes the five predictions. Note that the `doc_count` for the predicted buckets is `0`: + +
+ + Response + + +```json +{ + "took": 5, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 10000, + "relation": "gte" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "my_date_histogram": { + "buckets": [ + { + "key_as_string": "2025-03-24T00:00:00.000Z", + "key": 1742774400000, + "doc_count": 249, + "sum_of_bytes": { + "value": 1531493 + } + }, + { + "key_as_string": "2025-03-31T00:00:00.000Z", + "key": 1743379200000, + "doc_count": 1617, + "sum_of_bytes": { + "value": 9213161 + }, + "moving_avg_of_sum_of_bytes": { + "value": 1531493 + } + }, + { + "key_as_string": "2025-04-07T00:00:00.000Z", + "key": 1743984000000, + "doc_count": 1610, + "sum_of_bytes": { + "value": 9188671 + }, + "moving_avg_of_sum_of_bytes": { + "value": 5372327 + } + }, + { + "key_as_string": "2025-04-14T00:00:00.000Z", + "key": 1744588800000, + "doc_count": 1610, + "sum_of_bytes": { + "value": 9244851 + }, + "moving_avg_of_sum_of_bytes": { + "value": 6644441.666666667 + } + }, + { + "key_as_string": "2025-04-21T00:00:00.000Z", + "key": 1745193600000, + "doc_count": 1609, + "sum_of_bytes": { + "value": 9061045 + }, + "moving_avg_of_sum_of_bytes": { + "value": 7294544 + } + }, + { + "key_as_string": "2025-04-28T00:00:00.000Z", + "key": 1745798400000, + "doc_count": 1554, + "sum_of_bytes": { + "value": 8713507 + }, + "moving_avg_of_sum_of_bytes": { + "value": 7647844.2 + } + }, + { + "key_as_string": "2025-05-05T00:00:00.000Z", + "key": 1746403200000, + "doc_count": 1710, + "sum_of_bytes": { + "value": 9544718 + }, + "moving_avg_of_sum_of_bytes": { + "value": 9084247 + } + }, + { + "key_as_string": "2025-05-12T00:00:00.000Z", + "key": 1747008000000, + "doc_count": 1610, + "sum_of_bytes": { + "value": 9155820 + }, + "moving_avg_of_sum_of_bytes": { + "value": 9150558.4 + } + }, + { + "key_as_string": "2025-05-19T00:00:00.000Z", + "key": 1747612800000, + "doc_count": 1610, + "sum_of_bytes": { + "value": 9025078 + }, + "moving_avg_of_sum_of_bytes": { + "value": 9143988.2 + } + }, + { + "key_as_string": "2025-05-26T00:00:00.000Z", + "key": 1748217600000, + "doc_count": 895, + "sum_of_bytes": { + "value": 5047345 + }, + "moving_avg_of_sum_of_bytes": { + "value": 9100033.6 + } + }, + { + "key_as_string": "2025-06-02T00:00:00.000Z", + "key": 1748822400000, + "doc_count": 0, + "moving_avg_of_sum_of_bytes": { + "value": 8297293.6 + } + }, + { + "key_as_string": "2025-06-09T00:00:00.000Z", + "key": 1749427200000, + "doc_count": 0, + "moving_avg_of_sum_of_bytes": { + "value": 8297293.6 + } + }, + { + "key_as_string": "2025-06-16T00:00:00.000Z", + "key": 1750032000000, + "doc_count": 0, + "moving_avg_of_sum_of_bytes": { + "value": 8297293.6 + } + }, + { + "key_as_string": "2025-06-23T00:00:00.000Z", + "key": 1750636800000, + "doc_count": 0, + "moving_avg_of_sum_of_bytes": { + "value": 8297293.6 + } + }, + { + "key_as_string": "2025-06-30T00:00:00.000Z", + "key": 1751241600000, + "doc_count": 0, + "moving_avg_of_sum_of_bytes": { + "value": 8297293.6 + } + } + ] + } + } +} +``` +
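+
+All five predicted values in the preceding response are identical because the default `simple` model has no trend component: it simply repeats the last computed average. To produce trend-aware predictions, you can combine `predict` with the `holt` model described in the next section. The following is a minimal sketch that assumes the same sample index and reuses the histogram from the previous example:
+
+```json
+GET opensearch_dashboards_sample_data_logs/_search
+{
+  "size": 0,
+  "aggs": {
+    "my_date_histogram": {
+      "date_histogram": {
+        "field": "@timestamp",
+        "calendar_interval": "week"
+      },
+      "aggs": {
+        "sum_of_bytes": {
+          "sum": { "field": "bytes" }
+        },
+        "moving_avg_of_sum_of_bytes": {
+          "moving_avg": {
+            "buckets_path": "sum_of_bytes",
+            "model": "holt",
+            "settings": { "alpha": 0.4, "beta": 0.2 },
+            "predict": 5
+          }
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}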
+ +## Models + +The `moving_avg` aggregation supports five models that differ in how they weight values in the moving window. + +Use the `model` parameter to specify which model to use. + +| Model | Model keyword | Weighting | +|-------|---------------|-------------| +| Simple | `simple` | An unweighted mean of all values in the window. | +| Linear | `linear` | Uses a linear decay of weights, giving more importance to recent values. | +| Exponentially Weighted Moving Average | `ewma` | Uses exponentially decreasing weights, giving more importance to recent values. | +| Holt | `holt` | Uses a second exponential term to smooth long-term trends. | +| Holt-Winters | `holt_winters` | Uses a third exponential term to smooth periodic (seasonal) effects. | + +Use the `settings` object to set the model's properties. The following table shows the available settings for each model. + +| Model | Parameter | Allowed values | Default | Description | +| :-- | :-- | :-- | :-- | :-- | +| `simple` | None | Numeric array | None | The arithmetic mean of all values in the window. | +| `linear` | None | Numeric array | None | The weighted average of all values in the window, with more recent values weighted more heavily.| +| `ewma` | `alpha` | [0, 1] | 0.3 | The decay parameter. Higher values give more weight to recent data points. | +| `holt` | `alpha` | [0, 1] | 0.3 | The decay parameter for the level component. | +| | `beta` | [0, 1] | 0.1 | The decay parameter for the trend component.| +| `holt_winters` | `alpha` | [0, 1] | 0.3 | The decay parameter for the level component. | +| | `beta` | [0, 1] | 0.3 | The decay parameter for the trend component. | +| | `gamma` | [0, 1] | 0.3 | The decay parameter for the seasonal component. | +| | `type` | `add`, `mult` | `add` | Defines how seasonality is modeled: additive or multiplicative. | +| | `period` | Integer | 1 | The number of buckets comprising the period. | +| | `pad` | Boolean | `true` | Whether to add a small offset to `0` values for `mult` type models to avoid a divide-by-zero error. | + + +For a discussion of these models and their parameters, see [Wikipedia](https://en.wikipedia.org/wiki/Moving_average). + + +### Example: Holt model + +The `holt` model computes weights with exponential decay controlled by the `alpha` and `beta` parameters. + +The following request calculates a moving average of total weekly byte data using a Holt model with a `window` size of `6`, an `alpha` value of `0.4`, and a `beta` value of `0.2`: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "my_date_histogram": { + "date_histogram": { + "field": "@timestamp", + "calendar_interval": "week" + }, + "aggs": { + "sum_of_bytes": { + "sum": { + "field": "bytes" + } + }, + "moving_avg_of_sum_of_bytes": { + "moving_avg": { + "buckets_path": "sum_of_bytes", + "window": 6, + "model": "holt", + "settings": { "alpha": 0.4, "beta": 0.2 } + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +The moving average begins with the second bucket: + +
+ + Response + + +```json +{ + "took": 7, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 10000, + "relation": "gte" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "my_date_histogram": { + "buckets": [ + { + "key_as_string": "2025-03-24T00:00:00.000Z", + "key": 1742774400000, + "doc_count": 249, + "sum_of_bytes": { + "value": 1531493 + } + }, + { + "key_as_string": "2025-03-31T00:00:00.000Z", + "key": 1743379200000, + "doc_count": 1617, + "sum_of_bytes": { + "value": 9213161 + }, + "moving_avg_of_sum_of_bytes": { + "value": 1531493 + } + }, + { + "key_as_string": "2025-04-07T00:00:00.000Z", + "key": 1743984000000, + "doc_count": 1610, + "sum_of_bytes": { + "value": 9188671 + }, + "moving_avg_of_sum_of_bytes": { + "value": 4604160.2 + } + }, + { + "key_as_string": "2025-04-14T00:00:00.000Z", + "key": 1744588800000, + "doc_count": 1610, + "sum_of_bytes": { + "value": 9244851 + }, + "moving_avg_of_sum_of_bytes": { + "value": 6806684.584000001 + } + }, + { + "key_as_string": "2025-04-21T00:00:00.000Z", + "key": 1745193600000, + "doc_count": 1609, + "sum_of_bytes": { + "value": 9061045 + }, + "moving_avg_of_sum_of_bytes": { + "value": 8341230.127680001 + } + }, + { + "key_as_string": "2025-04-28T00:00:00.000Z", + "key": 1745798400000, + "doc_count": 1554, + "sum_of_bytes": { + "value": 8713507 + }, + "moving_avg_of_sum_of_bytes": { + "value": 9260724.7236736 + } + }, + { + "key_as_string": "2025-05-05T00:00:00.000Z", + "key": 1746403200000, + "doc_count": 1710, + "sum_of_bytes": { + "value": 9544718 + }, + "moving_avg_of_sum_of_bytes": { + "value": 9657431.903375873 + } + }, + { + "key_as_string": "2025-05-12T00:00:00.000Z", + "key": 1747008000000, + "doc_count": 1610, + "sum_of_bytes": { + "value": 9155820 + }, + "moving_avg_of_sum_of_bytes": { + "value": 9173999.55240704 + } + }, + { + "key_as_string": "2025-05-19T00:00:00.000Z", + "key": 1747612800000, + "doc_count": 1610, + "sum_of_bytes": { + "value": 9025078 + }, + "moving_avg_of_sum_of_bytes": { + "value": 9172040.511275519 + } + }, + { + "key_as_string": "2025-05-26T00:00:00.000Z", + "key": 1748217600000, + "doc_count": 895, + "sum_of_bytes": { + "value": 5047345 + }, + "moving_avg_of_sum_of_bytes": { + "value": 9108804.964619776 + } + } + ] + } + } +} +``` +
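+
+Because `moving_avg` is deprecated, you can compute the same Holt moving average with the `moving_fn` aggregation and its predefined `MovingFunctions.holt` function. The following is an equivalent sketch that assumes the same index and uses the same `window`, `alpha`, and `beta` values as the preceding example:
+
+```json
+GET opensearch_dashboards_sample_data_logs/_search
+{
+  "size": 0,
+  "aggs": {
+    "my_date_histogram": {
+      "date_histogram": {
+        "field": "@timestamp",
+        "calendar_interval": "week"
+      },
+      "aggs": {
+        "sum_of_bytes": {
+          "sum": { "field": "bytes" }
+        },
+        "moving_avg_of_sum_of_bytes": {
+          "moving_fn": {
+            "buckets_path": "sum_of_bytes",
+            "window": 6,
+            "script": "MovingFunctions.holt(values, 0.4, 0.2)"
+          }
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}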
diff --git a/_aggregations/pipeline/moving-function.md b/_aggregations/pipeline/moving-function.md new file mode 100644 index 00000000000..43da60840f6 --- /dev/null +++ b/_aggregations/pipeline/moving-function.md @@ -0,0 +1,613 @@ +--- +layout: default +title: Moving function +parent: Pipeline aggregations +nav_order: 130 +--- + +# Moving function aggregations + +The `moving_fn` aggregation is a parent pipeline aggregation that executes a script over a sliding window. The sliding window moves over a sequence of values extracted from a parent `histogram` or `date histogram` aggregation. The window shifts left to right one bucket at a time; `moving_fn` runs the script each time the window shifts. + +Use the `moving_fn` aggregation to script any numeric calculation on data within the sliding window. You can use `moving_fn` for the following purposes: + +- Trend analysis +- Outlier detection +- Custom time-series analysis +- Custom smoothing algorithms +- Digital signal processing (DSP) + + +## Parameters + +The `moving_fn` aggregation takes the following parameters. + +| Parameter | Required/Optional | Data type | Description | +| :-- | :-- | :-- | :-- | +| `buckets_path` | Required | String | The path of the aggregation buckets containing the metric values to process. See [Buckets path]({{site.url}}{{site.baseurl}}/aggregations/pipeline/index#buckets-path). | +| `script` | Required | String or Object | The script that calculates a value for each window of data. Can be an inline script, stored script, or script file. The script has access to the variable names defined in the `buckets_path` parameter. | +| `window` | Required | Integer | The number of buckets in the sliding window. Must be a positive integer. | +| `gap_policy` | Optional | String | The policy to apply to missing data. Valid values are `skip` and `insert_zeros`. Default is `skip`. See [Data gaps]({{site.url}}{{site.baseurl}}/aggregations/pipeline/#data-gaps). | +| `format` | Optional | String | A [DecimalFormat](https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/text/DecimalFormat.html) formatting string. Returns the formatted output in the aggregation's `value_as_string` property. | +| `shift` | Optional | Integer | The number of buckets by which to shift the window. Can be positive (shift right toward future buckets) or negative (toward past buckets). Default is `0`, which places the window immediately to the left of the current bucket. See [Shifting the window](#shifting-the-window). | + + +## How moving function works + +The `moving_fn` aggregation operates on a sliding window over an ordered sequence of buckets. Starting at the first bucket in the parent aggregation, `moving_fn` does the following: + +1. Collects the subsequence (window) of values from the buckets specified by the `window` and `shift` parameters. +2. Passes these values as an array to the function specified by `script`. +3. Uses `script` to compute a single value from the array. +4. Returns this value as the result for the current bucket. +5. Moves forward one bucket and repeats this process. + +"Past" and "future" values imply time-series data, the most common use case for moving window functions. More generally, they refer to previous and upcoming values, respectively, in any ordered data sequence. +{: .note} + +The script applied by `moving_fn` can be a [predefined function](#predefined-functions) or a [custom script](#custom-scripts). Bucket values are provided to the script in the `values` array. 
The script returns a double value as the result. The result values `NaN` and `+/- Inf` are allowed, but `null` is not. + + +### Window size + +The `window` parameter specifies the number of buckets that define the size of the window. + +The array passed to the `script` function is zero-indexed. Its values are accessed within the script as `values[0]` to `values[n]`, where `n = values.length - 1`. + + +### Shifting the window + +The `shift` parameter controls where the moving window is located relative to the current bucket. Set `shift` based on whether your analysis requires historical context, current data, or future prediction. The default is `0`, which shows only past values (excluding the current bucket). + +Some commonly used values of `shift` are as follows: + +| `shift` | Window description | | +| :-- | :-- | :-- | +| `0` | Only past values. Excludes the current value. | `--[-----]x----` | +| `1` | Past values, including the current value. | `--[----x]-----` | +| `window/2` | Centers the window around the current value. | `--[--x--]-----` | +| `window` | Future values, including the current value. | `--[x----]-----` | + +When a window extends beyond available data at the beginning or end of a sequence, `window` shrinks automatically to use only the available points: + +``` +[x----]-- +-[x----]- +--[x----] +---[x---] +----[x--] +-----[x-] +------[x] +``` + + +## Predefined functions + +The `moving_fn` aggregation supports a number of predefined functions that can be used instead of a custom script. The functions are accessible from the `MovingFunctions` context. For example, you can access the `max` function as `MovingFunctions.max(values)`. + +The following table describes the predefined functions. + +| Function | Model keyword | Description | +|:-- | :-- |:-- | +| Max | `max` | The maximum value in the window. | +| Min | `min` | The minimum value in the window. | +| Sum | `sum` | The sum of values in the window. | +| Unweighted average | `unweightedAvg` | An unweighted mean of all values in the window, equal to `sum` / `window`. | +| Linear weighted average | `linearWeightedAvg` | A weighted average using a linear decay of weights, giving more importance to recent values. | +| Exponentially Weighted Moving Average | `ewma` | A weighted average using exponentially decaying weights, giving more importance to recent values. | +| Holt | `holt` | A weighted average using a second exponential term to smooth long-term trends. | +| Holt-Winters | `holt_wimnters` | A weighted average using a third exponential term to smooth periodic (seasonal) effects. | +| Standard deviation | `stdDev` | The sum of values in the window. | + +All of the predefined functions take the `values` array as their first parameter. For functions that take extra parameters, pass these parameters in order after `values`. For example, call the `stdDev` function by setting the `script` value to `MovingFunctions.stdDev(values, MovingFunctions.unweightedAvg(values))`. + +The following table shows the settings required for each model. + +| Function | Extra parameters | Allowed values | Default | Description | +| :-- | :-- | :-- | :-- | :-- | +| `max` | None | Numeric array | None | The maximum value of the window. | +| `min` | None | Numeric array | None | The minimum value of the window. | +| `sum` | None | Numeric array | None | The sum of all values in the window. | +| `unweightedAvg` | None | Numeric array | None | The arithmetic mean of all values in the window. 
| +| `linearWeightedAvg` | None | Numeric array | None | The weighted average of all values in the window, with more recent values weighted more heavily.| +| `ewma` | `alpha` | [0, 1] | 0.3 | The decay parameter. Higher values give more weight to recent data points. | +| `holt` | `alpha` | [0, 1] | 0.3 | The decay parameter for the level component. | +| | `beta` | [0, 1] | 0.1 | The decay parameter for the trend component.| +| `holt_winters` | `alpha` | [0, 1] | 0.3 | The decay parameter for the level component. | +| | `beta` | [0, 1] | 0.3 | The decay parameter for the trend component. | +| | `gamma` | [0, 1] | 0.3 | The decay parameter for the seasonal component. | +| | `type` | `add`, `mult` | `add` | Defines how seasonality is modeled: additive or multiplicative. | +| | `period` | Integer | 1 | The number of buckets comprising the period. | +| | `pad` | Boolean | true | Whether to add a small offset to `0` values for `mult` type models to avoid a divide-by-zero error. | +| `stdDev` | `avg` | Any double | None | The standard deviation of the window. To compute a meaningful standard deviation, use the mean of the sliding window array, typically, `MovingFunctions.unweightedAvg(values)`. | + +The predefined functions do not support function signatures with missing parameters. You therefore must supply the extra parameters, even if using the default values. +{: .important} + + +### Example: Predefined functions + +The following example creates a date histogram with a one-week interval from the OpenSearch Dashboards logs sample data. The `sum` sub-aggregation calculates the sum of all bytes logged for each week. Finally, the `moving_fn` aggregation calculates the standard deviation of the byte sum using a `window` size of `5`, the default `shift` of `0`, and unweighted means: + +```json +POST /opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "my_date_histo": { + "date_histogram": { + "field": "timestamp", + "calendar_interval": "week" + }, + "aggs": { + "the_sum": { + "sum": { "field": "bytes" } + }, + "the_movavg": { + "moving_fn": { + "buckets_path": "the_sum", + "window": 5, + "script": "MovingFunctions.stdDev(values, MovingFunctions.unweightedAvg(values))" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Example response + +The response shows the standard deviation of the moving window starting with a zero value in the second bucket. The `stdDev` function returns `0` for windows that are empty or contain only invalid values (`null` or `NaN`): + +
+ + Response + + +```json +{ + "took": 15, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 10000, + "relation": "gte" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "my_date_histo": { + "buckets": [ + { + "key_as_string": "2025-03-24T00:00:00.000Z", + "key": 1742774400000, + "doc_count": 249, + "the_sum": { + "value": 1531493 + }, + "the_movavg": { + "value": null + } + }, + { + "key_as_string": "2025-03-31T00:00:00.000Z", + "key": 1743379200000, + "doc_count": 1617, + "the_sum": { + "value": 9213161 + }, + "the_movavg": { + "value": 0 + } + }, + { + "key_as_string": "2025-04-07T00:00:00.000Z", + "key": 1743984000000, + "doc_count": 1610, + "the_sum": { + "value": 9188671 + }, + "the_movavg": { + "value": 3840834 + } + }, + { + "key_as_string": "2025-04-14T00:00:00.000Z", + "key": 1744588800000, + "doc_count": 1610, + "the_sum": { + "value": 9244851 + }, + "the_movavg": { + "value": 3615414.498228507 + } + }, + { + "key_as_string": "2025-04-21T00:00:00.000Z", + "key": 1745193600000, + "doc_count": 1609, + "the_sum": { + "value": 9061045 + }, + "the_movavg": { + "value": 3327358.65618917 + } + }, + { + "key_as_string": "2025-04-28T00:00:00.000Z", + "key": 1745798400000, + "doc_count": 1554, + "the_sum": { + "value": 8713507 + }, + "the_movavg": { + "value": 3058812.9440705855 + } + }, + { + "key_as_string": "2025-05-05T00:00:00.000Z", + "key": 1746403200000, + "doc_count": 1710, + "the_sum": { + "value": 9544718 + }, + "the_movavg": { + "value": 195603.33146038183 + } + }, + { + "key_as_string": "2025-05-12T00:00:00.000Z", + "key": 1747008000000, + "doc_count": 1610, + "the_sum": { + "value": 9155820 + }, + "the_movavg": { + "value": 270085.92336040025 + } + }, + { + "key_as_string": "2025-05-19T00:00:00.000Z", + "key": 1747612800000, + "doc_count": 1610, + "the_sum": { + "value": 9025078 + }, + "the_movavg": { + "value": 269477.75659701484 + } + }, + { + "key_as_string": "2025-05-26T00:00:00.000Z", + "key": 1748217600000, + "doc_count": 895, + "the_sum": { + "value": 5047345 + }, + "the_movavg": { + "value": 267356.5422566652 + } + } + ] + } + } +} +``` +
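+
+The preceding example uses the default `shift` of `0`, so each window contains only buckets before the current one. The following minimal sketch centers a five-bucket window on the current bucket by setting `shift` to `3`; each window then holds the current bucket and the two buckets on either side of it:
+
+```json
+POST /opensearch_dashboards_sample_data_logs/_search
+{
+  "size": 0,
+  "aggs": {
+    "my_date_histo": {
+      "date_histogram": {
+        "field": "timestamp",
+        "calendar_interval": "week"
+      },
+      "aggs": {
+        "the_sum": {
+          "sum": { "field": "bytes" }
+        },
+        "centered_avg": {
+          "moving_fn": {
+            "buckets_path": "the_sum",
+            "window": 5,
+            "shift": 3,
+            "script": "MovingFunctions.unweightedAvg(values)"
+          }
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}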
+ + +## Custom scripts + +You can supply an arbitrary custom script to calculate `moving_fn` results. Custom scripts use the Painless scripting language. + +### Example: Custom scripts + +The following example creates a date histogram with a one-week interval from the OpenSearch Dashboards e-commerce sample data. The `sum` sub-aggregation calculates the sum of all taxed revenue for each week. The `moving_fn` script then returns the greater of the two values previous to the current value or `NaN` if two values are not available: + +```json +POST /opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "my_date_histo": { + "date_histogram": { + "field": "order_date", + "calendar_interval": "week" + }, + "aggs": { + "the_sum": { + "sum": { "field": "taxful_total_price" } + }, + "the_movavg": { + "moving_fn": { + "buckets_path": "the_sum", + "window": 2, + "script": "return (values.length < 2 ? Double.NaN : (values[0]>values[1] ? values[0] : values[1]))" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +The example returns the results of the calculation starting in bucket three, where enough previous data exists to perform the calculation: + +
+ + Response + + +```json +{ + "took": 7, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4675, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "my_date_histo": { + "buckets": [ + { + "key_as_string": "2025-03-24T00:00:00.000Z", + "key": 1742774400000, + "doc_count": 582, + "the_sum": { + "value": 41455.5390625 + }, + "the_movavg": { + "value": null + } + }, + { + "key_as_string": "2025-03-31T00:00:00.000Z", + "key": 1743379200000, + "doc_count": 1048, + "the_sum": { + "value": 79448.60546875 + }, + "the_movavg": { + "value": null + } + }, + { + "key_as_string": "2025-04-07T00:00:00.000Z", + "key": 1743984000000, + "doc_count": 1048, + "the_sum": { + "value": 78208.4296875 + }, + "the_movavg": { + "value": 79448.60546875 + } + }, + { + "key_as_string": "2025-04-14T00:00:00.000Z", + "key": 1744588800000, + "doc_count": 1073, + "the_sum": { + "value": 81277.296875 + }, + "the_movavg": { + "value": 79448.60546875 + } + }, + { + "key_as_string": "2025-04-21T00:00:00.000Z", + "key": 1745193600000, + "doc_count": 924, + "the_sum": { + "value": 70494.2578125 + }, + "the_movavg": { + "value": 81277.296875 + } + } + ] + } + } +} +``` +
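+
+Custom scripts can implement any windowed calculation, not just comparisons. The following minimal sketch, which assumes the same e-commerce sample data, returns the net change across the window (the last value minus the first) and returns `NaN` until the window contains at least two values; the `window_change` name is illustrative:
+
+```json
+POST /opensearch_dashboards_sample_data_ecommerce/_search
+{
+  "size": 0,
+  "aggs": {
+    "my_date_histo": {
+      "date_histogram": {
+        "field": "order_date",
+        "calendar_interval": "week"
+      },
+      "aggs": {
+        "the_sum": {
+          "sum": { "field": "taxful_total_price" }
+        },
+        "window_change": {
+          "moving_fn": {
+            "buckets_path": "the_sum",
+            "window": 4,
+            "script": "return values.length < 2 ? Double.NaN : values[values.length - 1] - values[0]"
+          }
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}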
+ + +## Example: Moving average + +The `moving_fn` aggregation replaces the deprecated `moving_avg` aggregation. The `moving_fn` aggregation is similar to the `moving_avg` aggregation but is more versatile since it computes arbitrary functions instead of only averages. All of the predefined `moving_avg` functions are implemented in `moving_fn` as well. + +The `holt` model is a moving average that uses exponentially decaying weights controlled by the `alpha` and `beta` parameters. The following example creates a date histogram with a one-week interval from the OpenSearch Dashboards logs sample data. The `sum` sub-aggregation calculates the sum of all bytes for each week. Finally, the `moving_fn` aggregation calculates a weighted average of the byte sum using a Holt model with a `window` size of `6`, the default `shift` of `0`, an `alpha` value of `0.3`, and a `beta` value of `0.1`: + +```json +POST /opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "my_date_histogram": { + "date_histogram": { + "field": "timestamp", + "calendar_interval": "week" + }, + "aggs": { + "the_sum": { + "sum": { "field": "bytes" } + }, + "the_movavg": { + "moving_fn": { + "buckets_path": "the_sum", + "window": 6, + "script": "MovingFunctions.holt(values, 0.3, 0.1)" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +The aggregation returns the moving `holt` average starting with the second bucket: + +
+ + Response + + +```json +{ + "took": 16, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 10000, + "relation": "gte" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "my_date_histogram": { + "buckets": [ + { + "key_as_string": "2025-03-24T00:00:00.000Z", + "key": 1742774400000, + "doc_count": 249, + "the_sum": { + "value": 1531493 + }, + "the_movavg": { + "value": null + } + }, + { + "key_as_string": "2025-03-31T00:00:00.000Z", + "key": 1743379200000, + "doc_count": 1617, + "the_sum": { + "value": 9213161 + }, + "the_movavg": { + "value": 1531493 + } + }, + { + "key_as_string": "2025-04-07T00:00:00.000Z", + "key": 1743984000000, + "doc_count": 1610, + "the_sum": { + "value": 9188671 + }, + "the_movavg": { + "value": 3835993.3999999994 + } + }, + { + "key_as_string": "2025-04-14T00:00:00.000Z", + "key": 1744588800000, + "doc_count": 1610, + "the_sum": { + "value": 9244851 + }, + "the_movavg": { + "value": 5603111.707999999 + } + }, + { + "key_as_string": "2025-04-21T00:00:00.000Z", + "key": 1745193600000, + "doc_count": 1609, + "the_sum": { + "value": 9061045 + }, + "the_movavg": { + "value": 6964515.302359998 + } + }, + { + "key_as_string": "2025-04-28T00:00:00.000Z", + "key": 1745798400000, + "doc_count": 1554, + "the_sum": { + "value": 8713507 + }, + "the_movavg": { + "value": 7930766.089341199 + } + }, + { + "key_as_string": "2025-05-05T00:00:00.000Z", + "key": 1746403200000, + "doc_count": 1710, + "the_sum": { + "value": 9544718 + }, + "the_movavg": { + "value": 8536788.607547803 + } + }, + { + "key_as_string": "2025-05-12T00:00:00.000Z", + "key": 1747008000000, + "doc_count": 1610, + "the_sum": { + "value": 9155820 + }, + "the_movavg": { + "value": 9172269.837272028 + } + }, + { + "key_as_string": "2025-05-19T00:00:00.000Z", + "key": 1747612800000, + "doc_count": 1610, + "the_sum": { + "value": 9025078 + }, + "the_movavg": { + "value": 9166173.88436614 + } + }, + { + "key_as_string": "2025-05-26T00:00:00.000Z", + "key": 1748217600000, + "doc_count": 895, + "the_sum": { + "value": 5047345 + }, + "the_movavg": { + "value": 9123157.830417283 + } + } + ] + } + } +} +``` +
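+
+As with the other pipeline aggregations, you can add a `format` string to return a formatted copy of each result in the aggregation's `value_as_string` property. The following minimal sketch formats the unweighted average using an example [DecimalFormat](https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/text/DecimalFormat.html) pattern:
+
+```json
+POST /opensearch_dashboards_sample_data_logs/_search
+{
+  "size": 0,
+  "aggs": {
+    "my_date_histo": {
+      "date_histogram": {
+        "field": "timestamp",
+        "calendar_interval": "week"
+      },
+      "aggs": {
+        "the_sum": {
+          "sum": { "field": "bytes" }
+        },
+        "the_movavg": {
+          "moving_fn": {
+            "buckets_path": "the_sum",
+            "window": 5,
+            "script": "MovingFunctions.unweightedAvg(values)",
+            "format": "#,##0.00"
+          }
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}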
+ diff --git a/_aggregations/pipeline/percentiles-bucket.md b/_aggregations/pipeline/percentiles-bucket.md new file mode 100644 index 00000000000..b3b416c2001 --- /dev/null +++ b/_aggregations/pipeline/percentiles-bucket.md @@ -0,0 +1,288 @@ +--- +layout: default +title: Percentiles bucket +parent: Pipeline aggregations +nav_order: 160 +--- + +# Percentiles bucket aggregations + +The `percentiles_bucket` aggregation is a sibling aggregation that calculates the percentile placement of bucketed metrics. + +The `percentiles_bucket` aggregation computes percentiles exactly, without approximation or interpolation. Each percentile is returned as the closest value less than or equal to the target percentile. + +The `percentiles_bucket` aggregation requires that the entire list of values be kept temporarily in memory, even for large datasets. In contrast, [the `percentiles` metric aggregation]({{site.url}}{{site.baseurl}}/aggregations/metric/percentile/) uses less memory but approximates the percentages. + +The specified metric must be numeric, and the sibling aggregation must be a multi-bucket aggregation. + +## Parameters + +The `avg_bucket` aggregation takes the following parameters. + +| Parameter | Required/Optional | Data type | Description | +| :-- | :-- | :-- | :-- | +| `buckets_path` | Required | String | The path of the aggregation buckets to aggregate. See [Buckets path]({{site.url}}{{site.baseurl}}/aggregations/pipeline/index#buckets-path). | +| `gap_policy` | Optional | String | The policy to apply to missing data. Valid values are `skip` and `insert_zeros`. Default is `skip`. See [Data gaps]({{site.url}}{{site.baseurl}}/aggregations/pipeline/#data-gaps). | +| `format` | Optional | String | A [DecimalFormat](https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/text/DecimalFormat.html) formatting string. Returns the formatted output in the aggregation's `value_as_string` property. | +| `percents` | Optional | List | A list containing any number of numeric percentage values to be included in the output. Valid values are between 0.0 and 100.0, inclusive. Default is `[1.0, 5.0, 25.0, 50.0, 75.0, 95.0, 99.0]`. | +| `keyed` | Optional | Boolean | Whether to format the output as a dictionary rather than as an array of key-value pair objects. Default is `true` (format the output as key-value pairs). | + + +## Example + +The following example creates a date histogram with a one-week interval from the OpenSearch Dashboards e-commerce sample data. The `sum` sub-aggregation adds up the `taxful_total_price` for each week. Finally, the `percentiles_bucket` aggregation calculates the percentile values for each week from these sums: + +```json +POST /opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "weekly_sales": { + "date_histogram": { + "field": "order_date", + "calendar_interval": "week" + }, + "aggs": { + "total_price": { + "sum": { + "field": "taxful_total_price" + } + } + } + }, + "percentiles_monthly_sales": { + "percentiles_bucket": { + "buckets_path": "weekly_sales>total_price" + } + } + } +} +``` +{% include copy-curl.html %} + +## Example response + +The aggregation returns the default percentile values for the weekly price totals: + +
+ + Response + + +```json +{ + "took": 4, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4675, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "weekly_sales": { + "buckets": [ + { + "key_as_string": "2025-03-24T00:00:00.000Z", + "key": 1742774400000, + "doc_count": 582, + "total_price": { + "value": 41455.5390625 + } + }, + { + "key_as_string": "2025-03-31T00:00:00.000Z", + "key": 1743379200000, + "doc_count": 1048, + "total_price": { + "value": 79448.60546875 + } + }, + { + "key_as_string": "2025-04-07T00:00:00.000Z", + "key": 1743984000000, + "doc_count": 1048, + "total_price": { + "value": 78208.4296875 + } + }, + { + "key_as_string": "2025-04-14T00:00:00.000Z", + "key": 1744588800000, + "doc_count": 1073, + "total_price": { + "value": 81277.296875 + } + }, + { + "key_as_string": "2025-04-21T00:00:00.000Z", + "key": 1745193600000, + "doc_count": 924, + "total_price": { + "value": 70494.2578125 + } + } + ] + }, + "percentiles_monthly_sales": { + "values": { + "1.0": 41455.5390625, + "5.0": 41455.5390625, + "25.0": 70494.2578125, + "50.0": 78208.4296875, + "75.0": 79448.60546875, + "95.0": 81277.296875, + "99.0": 81277.296875 + } + } + } +} +``` +
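+
+You can also compute percentiles of the bucket document counts rather than of a metric by using the `_count` buckets path variable (see [Count paths]({{site.url}}{{site.baseurl}}/aggregations/pipeline/index#count-paths)). The following minimal sketch computes the quartiles of the weekly order counts:
+
+```json
+POST /opensearch_dashboards_sample_data_ecommerce/_search
+{
+  "size": 0,
+  "aggs": {
+    "weekly_sales": {
+      "date_histogram": {
+        "field": "order_date",
+        "calendar_interval": "week"
+      }
+    },
+    "percentiles_weekly_counts": {
+      "percentiles_bucket": {
+        "buckets_path": "weekly_sales>_count",
+        "percents": [25.0, 50.0, 75.0]
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}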
+ +## Example: Options + +The next example computes percentiles using the same data as in the previous example but with the following differences: + +- The `percents` parameter specifies that only the 25th, 50th, and 75th percentiles be calculated. +- String-formatted outputs are appended using the `format` parameter. +- Results are displayed as key-value pair objects (with string values appended) by setting the `keyed` parameter to `false`. + +The example is as follows: + +```json +POST /opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "weekly_sales": { + "date_histogram": { + "field": "order_date", + "calendar_interval": "week" + }, + "aggs": { + "total_price": { + "sum": { + "field": "taxful_total_price" + } + } + } + }, + "percentiles_monthly_sales": { + "percentiles_bucket": { + "buckets_path": "weekly_sales>total_price", + "percents": [25.0, 50.0, 75.0], + "format": "$#,###.00", + "keyed": false + } + } + } +} +``` +{% include copy-curl.html %} + +## Example response: Options + +The options modify the output of the aggregation: + + +
+ + Response + + +```json +{ + "took": 5, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4675, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "weekly_sales": { + "buckets": [ + { + "key_as_string": "2025-03-24T00:00:00.000Z", + "key": 1742774400000, + "doc_count": 582, + "total_price": { + "value": 41455.5390625 + } + }, + { + "key_as_string": "2025-03-31T00:00:00.000Z", + "key": 1743379200000, + "doc_count": 1048, + "total_price": { + "value": 79448.60546875 + } + }, + { + "key_as_string": "2025-04-07T00:00:00.000Z", + "key": 1743984000000, + "doc_count": 1048, + "total_price": { + "value": 78208.4296875 + } + }, + { + "key_as_string": "2025-04-14T00:00:00.000Z", + "key": 1744588800000, + "doc_count": 1073, + "total_price": { + "value": 81277.296875 + } + }, + { + "key_as_string": "2025-04-21T00:00:00.000Z", + "key": 1745193600000, + "doc_count": 924, + "total_price": { + "value": 70494.2578125 + } + } + ] + }, + "percentiles_monthly_sales": { + "values": [ + { + "key": 25, + "value": 70494.2578125, + "25.0_as_string": "$70,494.26" + }, + { + "key": 50, + "value": 78208.4296875, + "50.0_as_string": "$78,208.43" + }, + { + "key": 75, + "value": 79448.60546875, + "75.0_as_string": "$79,448.61" + } + ] + } + } +} +``` +
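+
+The sibling aggregation does not have to be a histogram; any multi-bucket aggregation works. The following minimal sketch computes percentiles of per-category sales totals using a `terms` aggregation. It assumes a keyword field named `category.keyword`; substitute a keyword field from your own data if necessary:
+
+```json
+POST /opensearch_dashboards_sample_data_ecommerce/_search
+{
+  "size": 0,
+  "aggs": {
+    "sales_per_category": {
+      "terms": {
+        "field": "category.keyword"
+      },
+      "aggs": {
+        "total_price": {
+          "sum": {
+            "field": "taxful_total_price"
+          }
+        }
+      }
+    },
+    "percentiles_category_sales": {
+      "percentiles_bucket": {
+        "buckets_path": "sales_per_category>total_price"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}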
+ diff --git a/_aggregations/pipeline/serial-diff.md b/_aggregations/pipeline/serial-diff.md new file mode 100644 index 00000000000..f4e2f650aaf --- /dev/null +++ b/_aggregations/pipeline/serial-diff.md @@ -0,0 +1,289 @@ +--- +layout: default +title: Serial differencing +parent: Pipeline aggregations +nav_order: 180 +--- + +# Serial differencing aggregations + +The `serial_diff` aggregation is a parent pipeline aggregation that calculates the difference between metric values in the current bucket and a previous bucket. It stores the result in the current bucket. + +Use the `serial_diff` aggregation to compute changes between time periods with a specified lag. The `lag` parameter (a positive integer value) specifies which previous bucket value to subtract from the current one. The default `lag` value is `1`, meaning `serial_diff` subtracts the value in the immediately previous bucket from the value in the current bucket. + +## Parameters + +The `serial_diff` aggregation takes the following parameters. + +| Parameter | Required/Optional | Data type | Description | +| :-- | :-- | :-- | :-- | +| `buckets_path` | Required | String | The path of the aggregation buckets to be aggregated. See [Buckets path]({{site.url}}{{site.baseurl}}/aggregations/pipeline/index#buckets-path). | +| `gap_policy` | Optional | String | The policy to apply to missing data. Valid values are `skip` and `insert_zeros`. Default is `skip`. See [Data gaps]({{site.url}}{{site.baseurl}}/aggregations/pipeline/index#data-gaps). | +| `format` | Optional | String | A [DecimalFormat](https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/text/DecimalFormat.html) formatting string. Returns the formatted output in the aggregation's `value_as_string` property. | +| `lag` | Optional | Integer | The historical bucket to subtract from the current bucket. Must be a positive integer. Default is `1`. | + +## Example + +The following example creates a date histogram with a one-month interval from the OpenSearch Dashboards logs sample data. The `sum` subaggregation calculates the sum of all bytes for each month. Finally, the `serial_diff` aggregation calculates month-to-month difference in total bytes from these sums: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "monthly_bytes": { + "date_histogram": { + "field": "@timestamp", + "calendar_interval": "month" + }, + "aggs": { + "total_bytes": { + "sum": { + "field": "bytes" + } + }, + "monthly_bytes_change": { + "serial_diff": { + "buckets_path": "total_bytes", + "lag": 1 + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains the month-to-month difference for the second and third months. 
(The first month `serial_diff` cannot be calculated because there's no previous month against which to compare it): + +```json +{ + "took": 3, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 10000, + "relation": "gte" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "monthly_bytes": { + "buckets": [ + { + "key_as_string": "2025-03-01T00:00:00.000Z", + "key": 1740787200000, + "doc_count": 480, + "total_bytes": { + "value": 2804103 + } + }, + { + "key_as_string": "2025-04-01T00:00:00.000Z", + "key": 1743465600000, + "doc_count": 6849, + "total_bytes": { + "value": 39103067 + }, + "monthly_bytes_change": { + "value": 36298964 + } + }, + { + "key_as_string": "2025-05-01T00:00:00.000Z", + "key": 1746057600000, + "doc_count": 6745, + "total_bytes": { + "value": 37818519 + }, + "monthly_bytes_change": { + "value": -1284548 + } + } + ] + } + } +} +``` + +The following line chart shows the results of the `serial_diff` aggregation. The x-axis represents time, and the y-axis shows the month-over-month change in total bytes transferred. Each data point on the line reflects the difference between the total bytes in that month and the previous month. For example, a value of 5,000,000 means that the system transferred 5 million more bytes than the prior month; a negative value indicates a decrease. The first month is excluded from the line because there's no previous bucket against which to compare it (the difference is undefined). The line starts with the second month and continues across all available data. + +![Example serial difference aggregation visualization]({{site.url}}{{site.baseurl}}/images/serial-diff-agg-result.png) + +This visualization helps you quickly spot spikes, drops, or trends in data volume over time. + +## Example: Multi-period differences + +Use a larger `lag` value to compare each bucket with one that occurred further in the past. The following example computes differences in weekly byte data with a lag of 4 (meaning each bucket is compared to the one from 4 weeks earlier). This has the effect of removing any variation with a period of 4 weeks: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "monthly_bytes": { + "date_histogram": { + "field": "@timestamp", + "calendar_interval": "week" + }, + "aggs": { + "total_bytes": { + "sum": { + "field": "bytes" + } + }, + "monthly_bytes_change": { + "serial_diff": { + "buckets_path": "total_bytes", + "lag": 4 + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Example response + +The response contains a list of weekly buckets. Note that the `serial_diff` aggregation does not begin until the fifth bucket, when a bucket with a `lag` of `4` becomes available: + +
+ + Response + + +```json +{ + "took": 6, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 10000, + "relation": "gte" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "monthly_bytes": { + "buckets": [ + { + "key_as_string": "2025-03-24T00:00:00.000Z", + "key": 1742774400000, + "doc_count": 249, + "total_bytes": { + "value": 1531493 + } + }, + { + "key_as_string": "2025-03-31T00:00:00.000Z", + "key": 1743379200000, + "doc_count": 1617, + "total_bytes": { + "value": 9213161 + } + }, + { + "key_as_string": "2025-04-07T00:00:00.000Z", + "key": 1743984000000, + "doc_count": 1610, + "total_bytes": { + "value": 9188671 + } + }, + { + "key_as_string": "2025-04-14T00:00:00.000Z", + "key": 1744588800000, + "doc_count": 1610, + "total_bytes": { + "value": 9244851 + } + }, + { + "key_as_string": "2025-04-21T00:00:00.000Z", + "key": 1745193600000, + "doc_count": 1609, + "total_bytes": { + "value": 9061045 + }, + "monthly_bytes_change": { + "value": 7529552 + } + }, + { + "key_as_string": "2025-04-28T00:00:00.000Z", + "key": 1745798400000, + "doc_count": 1554, + "total_bytes": { + "value": 8713507 + }, + "monthly_bytes_change": { + "value": -499654 + } + }, + { + "key_as_string": "2025-05-05T00:00:00.000Z", + "key": 1746403200000, + "doc_count": 1710, + "total_bytes": { + "value": 9544718 + }, + "monthly_bytes_change": { + "value": 356047 + } + }, + { + "key_as_string": "2025-05-12T00:00:00.000Z", + "key": 1747008000000, + "doc_count": 1610, + "total_bytes": { + "value": 9155820 + }, + "monthly_bytes_change": { + "value": -89031 + } + }, + { + "key_as_string": "2025-05-19T00:00:00.000Z", + "key": 1747612800000, + "doc_count": 1610, + "total_bytes": { + "value": 9025078 + }, + "monthly_bytes_change": { + "value": -35967 + } + }, + { + "key_as_string": "2025-05-26T00:00:00.000Z", + "key": 1748217600000, + "doc_count": 895, + "total_bytes": { + "value": 5047345 + }, + "monthly_bytes_change": { + "value": -3666162 + } + } + ] + } + } +} +``` +
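+
+Pipeline aggregations can be chained, so you can apply second-order differencing (the difference of the differences) by pointing a second `serial_diff` at the output of the first. Second-order differencing removes a linear trend from the data, leaving only the shorter-term variation. The following is a minimal sketch; the aggregation names are illustrative:
+
+```json
+GET opensearch_dashboards_sample_data_logs/_search
+{
+  "size": 0,
+  "aggs": {
+    "weekly_bytes": {
+      "date_histogram": {
+        "field": "@timestamp",
+        "calendar_interval": "week"
+      },
+      "aggs": {
+        "total_bytes": {
+          "sum": {
+            "field": "bytes"
+          }
+        },
+        "first_difference": {
+          "serial_diff": {
+            "buckets_path": "total_bytes",
+            "lag": 1
+          }
+        },
+        "second_difference": {
+          "serial_diff": {
+            "buckets_path": "first_difference",
+            "lag": 1
+          }
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}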
diff --git a/_aggregations/pipeline/stats-bucket.md b/_aggregations/pipeline/stats-bucket.md new file mode 100644 index 00000000000..ad29bf8aca7 --- /dev/null +++ b/_aggregations/pipeline/stats-bucket.md @@ -0,0 +1,116 @@ +--- +layout: default +title: Stats bucket +parent: Pipeline aggregations +nav_order: 190 +--- + +# Stats bucket aggregation + +The `stats_bucket` aggregation is a sibling aggregation that returns a variety of stats (`count`, `min`, `max`, `avg`, and `sum`) for the buckets of a previous aggregation. + +The specified metric must be numeric, and the sibling aggregation must be a multi-bucket aggregation. + +## Parameters + +The `stats_bucket` aggregation takes the following parameters. + +| Parameter | Required/Optional | Data type | Description | +| :-- | :-- | :-- | :-- | +| `buckets_path` | Required | String | The path of the aggregation buckets to be aggregated. See [Buckets path]({{site.url}}{{site.baseurl}}/aggregations/pipeline/index#buckets-path). | +| `gap_policy` | Optional | String | The policy to apply to missing data. Valid values are `skip` and `insert_zeros`. Default is `skip`. See [Data gaps]({{site.url}}{{site.baseurl}}/aggregations/pipeline/#data-gaps). | +| `format` | Optional | String | A [DecimalFormat](https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/text/DecimalFormat.html) formatting string. Returns the formatted output in the aggregation's `_as_string` property. | + +## Example + +The following example creates a date histogram with a one-month interval using the OpenSearch Dashboards e-commerce sample data. The `sum` sub-aggregation calculates the sum of all bytes for each month. Finally, the `stats_bucket` aggregation returns the `count`, `avg`, `sum`, `min`, and `max` stats from these sums: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "visits_per_month": { + "date_histogram": { + "field": "@timestamp", + "interval": "month" + }, + "aggs": { + "sum_of_bytes": { + "sum": { + "field": "bytes" + } + } + } + }, + "stats_monthly_bytes": { + "stats_bucket": { + "buckets_path": "visits_per_month>sum_of_bytes" + } + } + } +} +``` +{% include copy-curl.html %} + +## Example response + +The aggregation returns all five basic statistics for the buckets: + +```json +{ + "took": 4, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 10000, + "relation": "gte" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "visits_per_month": { + "buckets": [ + { + "key_as_string": "2025-03-01T00:00:00.000Z", + "key": 1740787200000, + "doc_count": 480, + "sum_of_bytes": { + "value": 2804103 + } + }, + { + "key_as_string": "2025-04-01T00:00:00.000Z", + "key": 1743465600000, + "doc_count": 6849, + "sum_of_bytes": { + "value": 39103067 + } + }, + { + "key_as_string": "2025-05-01T00:00:00.000Z", + "key": 1746057600000, + "doc_count": 6745, + "sum_of_bytes": { + "value": 37818519 + } + } + ] + }, + "stats_monthly_bytes": { + "count": 3, + "min": 2804103, + "max": 39103067, + "avg": 26575229.666666668, + "sum": 79725689 + } + } +} +``` diff --git a/_aggregations/pipeline/sum-bucket.md b/_aggregations/pipeline/sum-bucket.md new file mode 100644 index 00000000000..6c324918dfd --- /dev/null +++ b/_aggregations/pipeline/sum-bucket.md @@ -0,0 +1,114 @@ +--- +layout: default +title: Sum bucket +parent: Pipeline aggregations +nav_order: 190 +--- + +# Sum bucket aggregations + +The `sum_bucket` aggregation is a sibling 
aggregation that calculates the sum of a metric in each bucket of a previous aggregation. + +The specified metric must be numeric, and the sibling aggregation must be a multi-bucket aggregation. + +## Parameters + +The `sum_bucket` aggregation takes the following parameters. + +| Parameter | Required/Optional | Data type | Description | +| :-- | :-- | :-- | :-- | +| `buckets_path` | Required | String | The path of the aggregation buckets to be aggregated. See [Buckets path]({{site.url}}{{site.baseurl}}/aggregations/pipeline/index#buckets-path). | +| `gap_policy` | Optional | String | The policy to apply to missing data. Valid values are `skip` and `insert_zeros`. Default is `skip`. See [Data gaps]({{site.url}}{{site.baseurl}}/aggregations/pipeline/index#data-gaps).| +| `format` | Optional | String | A [DecimalFormat](https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/text/DecimalFormat.html) formatting string. Returns the formatted output in the aggregation's `value_as_string` property. | + +## Example + +The following example creates a date histogram with a one-month interval from the OpenSearch Dashboards e-commerce sample data. The `sum` subaggregation calculates the sum of bytes for each month. Finally, the `sum_bucket` aggregation calculates the total number of bytes per month by totaling these sums: + +```json +POST opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "visits_per_month": { + "date_histogram": { + "field": "@timestamp", + "interval": "month" + }, + "aggs": { + "sum_of_bytes": { + "sum": { + "field": "bytes" + } + } + } + }, + "sum_monthly_bytes": { + "sum_bucket": { + "buckets_path": "visits_per_month>sum_of_bytes" + } + } + } +} +``` +{% include copy-curl.html %} + +## Example response + +The aggregation returns the sum of bytes from all the monthly buckets: + +```json +{ + "took": 10, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 10000, + "relation": "gte" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "visits_per_month": { + "buckets": [ + { + "key_as_string": "2025-03-01T00:00:00.000Z", + "key": 1740787200000, + "doc_count": 480, + "sum_of_bytes": { + "value": 2804103 + } + }, + { + "key_as_string": "2025-04-01T00:00:00.000Z", + "key": 1743465600000, + "doc_count": 6849, + "sum_of_bytes": { + "value": 39103067 + } + }, + { + "key_as_string": "2025-05-01T00:00:00.000Z", + "key": 1746057600000, + "doc_count": 6745, + "sum_of_bytes": { + "value": 37818519 + } + } + ] + }, + "sum_monthly_bytes": { + "value": 79725689 + } + } +} +``` + + diff --git a/_analyzers/character-filters/html-character-filter.md b/_analyzers/character-filters/html-character-filter.md index 9fb98d97446..bd9f88583e7 100644 --- a/_analyzers/character-filters/html-character-filter.md +++ b/_analyzers/character-filters/html-character-filter.md @@ -1,15 +1,17 @@ --- layout: default -title: html_strip character filter +title: HTML strip parent: Character filters nav_order: 100 --- -# `html_strip` character filter +# HTML strip character filter The `html_strip` character filter removes HTML tags, such as `
<div>`, `<p>
`, and ``, from the input text and renders plain text. The filter can be configured to preserve certain tags or decode specific HTML entities, such as ` `, into spaces. -## Example: HTML analyzer +## Example + +The following request applies an `html_strip` character filter to the provided text: ```json GET /_analyze @@ -23,15 +25,35 @@ GET /_analyze ``` {% include copy-curl.html %} -Using the HTML analyzer, you can convert the HTML character entity references into their corresponding symbols. The processed text would read as follows: +The response contains the token in which HTML characters have been converted to their decoded values: -``` +```json +{ + "tokens": [ + { + "token": """ Commonly used calculus symbols include α, β and θ +""", + "start_offset": 0, + "end_offset": 74, + "type": "word", + "position": 0 + } + ] +} ``` +## Parameters + +The `html_strip` character filter can be configured with the following parameter. + +| Parameter | Required/Optional | Data type | Description | +|:---|:---|:---|:---| +| `escaped_tags` | Optional | Array of strings | An array of HTML element names, specified without the enclosing angle brackets (`< >`). The filter does not remove elements in this list when stripping HTML from the text. For example, setting the array to `["b", "i"]` will prevent the `` and `` elements from being stripped.| + ## Example: Custom analyzer with lowercase filter -The following example query creates a custom analyzer that strips HTML tags and converts the plain text to lowercase by using the `html_strip` analyzer and `lowercase` filter: +The following example request creates a custom analyzer that strips HTML tags and converts the plain text to lowercase by using the `html_strip` analyzer and `lowercase` filter: ```json PUT /html_strip_and_lowercase_analyzer @@ -57,9 +79,7 @@ PUT /html_strip_and_lowercase_analyzer ``` {% include copy-curl.html %} -### Testing `html_strip_and_lowercase_analyzer` - -You can run the following request to test the analyzer: +Use the following request to examine the tokens generated using the analyzer: ```json GET /html_strip_and_lowercase_analyzer/_analyze @@ -72,8 +92,32 @@ GET /html_strip_and_lowercase_analyzer/_analyze In the response, the HTML tags have been removed and the plain text has been converted to lowercase: -``` -welcome to opensearch! +```json +{ + "tokens": [ + { + "token": "welcome", + "start_offset": 4, + "end_offset": 11, + "type": "", + "position": 0 + }, + { + "token": "to", + "start_offset": 12, + "end_offset": 14, + "type": "", + "position": 1 + }, + { + "token": "opensearch", + "start_offset": 23, + "end_offset": 42, + "type": "", + "position": 2 + } + ] +} ``` ## Example: Custom analyzer that preserves HTML tags @@ -104,9 +148,7 @@ PUT /html_strip_preserve_analyzer ``` {% include copy-curl.html %} -### Testing `html_strip_preserve_analyzer` - -You can run the following request to test the analyzer: +Use the following request to examine the tokens generated using the analyzer: ```json GET /html_strip_preserve_analyzer/_analyze @@ -119,6 +161,18 @@ GET /html_strip_preserve_analyzer/_analyze In the response, the `italic` and `bold` tags have been retained, as specified in the custom analyzer request: -``` +```json +{ + "tokens": [ + { + "token": """ This is a bold and italic text. 
+""", + "start_offset": 0, + "end_offset": 52, + "type": "word", + "position": 0 + } + ] +} ``` diff --git a/_analyzers/character-filters/index.md b/_analyzers/character-filters/index.md index 0e2ce01b8cc..9d4980ac805 100644 --- a/_analyzers/character-filters/index.md +++ b/_analyzers/character-filters/index.md @@ -14,6 +14,6 @@ Unlike token filters, which operate on tokens (words or terms), character filter Use cases for character filters include: -- **HTML stripping:** Removes HTML tags from content so that only the plain text is indexed. -- **Pattern replacement:** Replaces or removes unwanted characters or patterns in text, for example, converting hyphens to spaces. -- **Custom mappings:** Substitutes specific characters or sequences with other values, for example, to convert currency symbols into their textual equivalents. +- **HTML stripping**: The [`html_strip`]({{site.url}}{{site.baseurl}}/analyzers/character-filters/html-character-filter/) character filter removes HTML tags from content so that only the plain text is indexed. +- **Pattern replacement**: The [`pattern_replace`]({{site.url}}{{site.baseurl}}/analyzers/character-filters/pattern-replace-character-filter/) character filter replaces or removes unwanted characters or patterns in text, for example, converting hyphens to spaces. +- **Custom mappings**: The [`mapping`]({{site.url}}{{site.baseurl}}/analyzers/character-filters/mapping-character-filter/) character filter substitutes specific characters or sequences with other values, for example, to convert currency symbols into their textual equivalents. diff --git a/_analyzers/character-filters/mapping-character-filter.md b/_analyzers/character-filters/mapping-character-filter.md new file mode 100644 index 00000000000..59e516e4ec9 --- /dev/null +++ b/_analyzers/character-filters/mapping-character-filter.md @@ -0,0 +1,125 @@ +--- +layout: default +title: Mapping +parent: Character filters +nav_order: 120 +--- + +# Mapping character filter + +The `mapping` character filter accepts a map of key-value pairs for character replacement. Whenever the filter encounters a string of characters matching a key, it replaces them with the corresponding value. Replacement values can be empty strings. + +The filter applies greedy matching, meaning that the longest matching pattern is matched. + +The `mapping` character filter helps in scenarios where specific text replacements are required before tokenization. + +## Example + +The following request configures a `mapping` character filter that converts Roman numerals (such as I, II, or III) into their corresponding Arabic numerals (1, 2, and 3): + +```json +GET /_analyze +{ + "tokenizer": "keyword", + "char_filter": [ + { + "type": "mapping", + "mappings": [ + "I => 1", + "II => 2", + "III => 3", + "IV => 4", + "V => 5" + ] + } + ], + "text": "I have III apples and IV oranges" +} +``` +{% include copy-curl.html %} + +The response contains a token where Roman numerals have been replaced with Arabic numerals: + +```json +{ + "tokens": [ + { + "token": "1 have 3 apples and 4 oranges", + "start_offset": 0, + "end_offset": 32, + "type": "word", + "position": 0 + } + ] +} +``` + +## Parameters + +You can use either of the following parameters to configure the key-value map. + +| Parameter | Required/Optional | Data type | Description | +|:---|:---|:---|:---| +| `mappings` | Optional | Array | An array of key-value pairs in the format `key => value`. Each key found in the input text will be replaced with its corresponding value. 
| +| `mappings_path` | Optional | String | The path to a UTF-8 encoded file containing key-value mappings. Each mapping should appear on a new line in the format `key => value`. The path can be absolute or relative to the OpenSearch configuration directory. | + +### Using a custom mapping character filter + +You can create a custom mapping character filter by defining your own set of mappings. The following request creates a custom character filter that replaces common abbreviations in a text: + +```json +PUT /test-index +{ + "settings": { + "analysis": { + "analyzer": { + "custom_abbr_analyzer": { + "tokenizer": "standard", + "char_filter": [ + "custom_abbr_filter" + ] + } + }, + "char_filter": { + "custom_abbr_filter": { + "type": "mapping", + "mappings": [ + "BTW => By the way", + "IDK => I don't know", + "FYI => For your information" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /text-index/_analyze +{ + "tokenizer": "keyword", + "char_filter": [ "custom_abbr_filter" ], + "text": "FYI, updates to the workout schedule are posted. IDK when it takes effect, but we have some details. BTW, the finalized schedule will be released Monday." +} +``` +{% include copy-curl.html %} + +The response shows that the abbreviations were replaced: + +```json +{ + "tokens": [ + { + "token": "For your information, updates to the workout schedule are posted. I don't know when it takes effect, but we have some details. By the way, the finalized schedule will be released Monday.", + "start_offset": 0, + "end_offset": 153, + "type": "word", + "position": 0 + } + ] +} +``` diff --git a/_analyzers/character-filters/pattern-replace-character-filter.md b/_analyzers/character-filters/pattern-replace-character-filter.md new file mode 100644 index 00000000000..87cc93e9044 --- /dev/null +++ b/_analyzers/character-filters/pattern-replace-character-filter.md @@ -0,0 +1,238 @@ +--- +layout: default +title: Pattern replace +parent: Character filters +nav_order: 130 +--- + +# Pattern replace character filter + +The `pattern_replace` character filter allows you to use regular expressions to define patterns for matching and replacing characters in the input text. It is a flexible tool for advanced text transformations, especially when dealing with complex string patterns. + +This filter replaces all instances of a pattern with a specified replacement string, allowing for easy substitutions, deletions, or complex modifications of the input text. You can use it to normalize the input before tokenization. + +## Example + +To standardize phone numbers, you'll use the regular expression `[\\s()-]+`: + +- `[ ]`: Defines a **character class**, meaning it will match **any one** of the characters inside the brackets. +- `\\s`: Matches any **white space** character, such as a space, tab, or newline. +- `()`: Matches literal **parentheses** (`(` or `)`). +- `-`: Matches a literal **hyphen** (`-`). +- `+`: Specifies that the pattern should match **one or more** occurrences of the preceding characters. + +The pattern `[\\s()-]+` will match any sequence of one or more white space characters, parentheses, or hyphens and remove it from the input text. This ensures that the phone numbers are normalized and contain only digits. 
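+The character class removes only the characters listed between the brackets; anything else, such as a leading `+` in an international prefix, is left in place. To inspect the raw output of the character filter without any tokenizer effects, you can pair it with the `keyword` tokenizer, as in the following illustrative request:
+
+```json
+GET /_analyze
+{
+  "tokenizer": "keyword",
+  "char_filter": [
+    {
+      "type": "pattern_replace",
+      "pattern": "[\\s()-]+",
+      "replacement": ""
+    }
+  ],
+  "text": "+1 (555) 123-4567"
+}
+```
+{% include copy-curl.html %}
+
+This request should return the single token `+15551234567`. Note that the `standard` tokenizer used in the next example would discard a leading `+` during tokenization.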
+ +The following request standardizes phone numbers by removing spaces, dashes, and parentheses: + +```json +GET /_analyze +{ + "tokenizer": "standard", + "char_filter": [ + { + "type": "pattern_replace", + "pattern": "[\\s()-]+", + "replacement": "" + } + ], + "text": "(555) 123-4567" +} +``` +{% include copy-curl.html %} + +The response contains the generated token: + +```json +{ + "tokens": [ + { + "token": "5551234567", + "start_offset": 1, + "end_offset": 14, + "type": "", + "position": 0 + } + ] +} +``` + +## Parameters + +The `pattern_replace` character filter must be configured with the following parameters. + +| Parameter | Required/Optional | Data type | Description | +|:---|:---| +| `pattern` | Required | String | A regular expression used to match parts of the input text. The filter identifies and matches this pattern to perform replacement. | +| `replacement` | Optional | String | The string that replaces pattern matches. Use an empty string (`""`) to remove the matched text. Default is an empty string (`""`). | + +## Creating a custom analyzer + +The following request creates an index with a custom analyzer configured with a `pattern_replace` character filter. The filter removes currency signs and thousands separators (both European `.` and American `,`) from numbers: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "standard", + "char_filter": [ + "pattern_char_filter" + ] + } + }, + "char_filter": { + "pattern_char_filter": { + "type": "pattern_replace", + "pattern": "[$€,.]", + "replacement": "" + } + } + } + } +} +``` + +{% include copy-curl.html %} + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "Total: $ 1,200.50 and € 1.100,75" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "Total", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "120050", + "start_offset": 9, + "end_offset": 17, + "type": "", + "position": 1 + }, + { + "token": "and", + "start_offset": 18, + "end_offset": 21, + "type": "", + "position": 2 + }, + { + "token": "110075", + "start_offset": 24, + "end_offset": 32, + "type": "", + "position": 3 + } + ] +} +``` + +## Using capturing groups + +You can use capturing groups in the `replacement` parameter. For example, the following request creates a custom analyzer that uses a `pattern_replace` character filter to replace hyphens with dots in phone numbers: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "standard", + "char_filter": [ + "pattern_char_filter" + ] + } + }, + "char_filter": { + "pattern_char_filter": { + "type": "pattern_replace", + "pattern": "(\\d+)-(?=\\d)", + "replacement": "$1." 
+ } + } + } + } +} +``` +{% include copy-curl.html %} + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "Call me at 555-123-4567 or 555-987-6543" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "Call", + "start_offset": 0, + "end_offset": 4, + "type": "", + "position": 0 + }, + { + "token": "me", + "start_offset": 5, + "end_offset": 7, + "type": "", + "position": 1 + }, + { + "token": "at", + "start_offset": 8, + "end_offset": 10, + "type": "", + "position": 2 + }, + { + "token": "555.123.4567", + "start_offset": 11, + "end_offset": 23, + "type": "", + "position": 3 + }, + { + "token": "or", + "start_offset": 24, + "end_offset": 26, + "type": "", + "position": 4 + }, + { + "token": "555.987.6543", + "start_offset": 27, + "end_offset": 39, + "type": "", + "position": 5 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/custom-analyzer.md b/_analyzers/custom-analyzer.md new file mode 100644 index 00000000000..bae41bb751b --- /dev/null +++ b/_analyzers/custom-analyzer.md @@ -0,0 +1,377 @@ +--- +layout: default +title: Creating a custom analyzer +nav_order: 40 +parent: Analyzers +--- + +# Creating a custom analyzer + +To create a custom analyzer, specify a combination of the following components: + +- Character filters (zero or more) + +- Tokenizer (one) + +- Token filters (zero or more) + +## Configuration + +The following parameters can be used to configure a custom analyzer. + +| Parameter | Required/Optional | Description | +|:--- | :--- | :--- | +| `type` | Optional | The analyzer type. Default is `custom`. You can also specify a prebuilt analyzer using this parameter. | +| `tokenizer` | Required | A tokenizer to be included in the analyzer. | +| `char_filter` | Optional | A list of character filters to be included in the analyzer. | +| `filter` | Optional | A list of token filters to be included in the analyzer. | +| `position_increment_gap` | Optional | The extra spacing applied between values when indexing text fields that have multiple values. For more information, see [Position increment gap](#position-increment-gap). Default is `100`. | + +## Examples + +The following examples demonstrate various custom analyzer configurations. + +### Custom analyzer with a character filter for HTML stripping + +The following example analyzer removes HTML tags from text before tokenization: + +```json +PUT simple_html_strip_analyzer_index +{ + "settings": { + "analysis": { + "analyzer": { + "html_strip_analyzer": { + "type": "custom", + "char_filter": ["html_strip"], + "tokenizer": "whitespace", + "filter": ["lowercase"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET simple_html_strip_analyzer_index/_analyze +{ + "analyzer": "html_strip_analyzer", + "text": "
<p>OpenSearch is <strong>awesome</strong>!</p>
" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "opensearch", + "start_offset": 3, + "end_offset": 13, + "type": "word", + "position": 0 + }, + { + "token": "is", + "start_offset": 14, + "end_offset": 16, + "type": "word", + "position": 1 + }, + { + "token": "awesome!", + "start_offset": 25, + "end_offset": 42, + "type": "word", + "position": 2 + } + ] +} +``` + +### Custom analyzer with a mapping character filter for synonym replacement + +The following example analyzer replaces specific characters and patterns before applying the synonym filter: + +```json +PUT mapping_analyzer_index +{ + "settings": { + "analysis": { + "analyzer": { + "synonym_mapping_analyzer": { + "type": "custom", + "char_filter": ["underscore_to_space"], + "tokenizer": "standard", + "filter": ["lowercase", "stop", "synonym_filter"] + } + }, + "char_filter": { + "underscore_to_space": { + "type": "mapping", + "mappings": ["_ => ' '"] + } + }, + "filter": { + "synonym_filter": { + "type": "synonym", + "synonyms": [ + "quick, fast, speedy", + "big, large, huge" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET mapping_analyzer_index/_analyze +{ + "analyzer": "synonym_mapping_analyzer", + "text": "The slow_green_turtle is very large" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "slow","start_offset": 4,"end_offset": 8,"type": "","position": 1}, + {"token": "green","start_offset": 9,"end_offset": 14,"type": "","position": 2}, + {"token": "turtle","start_offset": 15,"end_offset": 21,"type": "","position": 3}, + {"token": "very","start_offset": 25,"end_offset": 29,"type": "","position": 5}, + {"token": "large","start_offset": 30,"end_offset": 35,"type": "","position": 6}, + {"token": "big","start_offset": 30,"end_offset": 35,"type": "SYNONYM","position": 6}, + {"token": "huge","start_offset": 30,"end_offset": 35,"type": "SYNONYM","position": 6} + ] +} +``` + +### Custom analyzer with a custom pattern-based character filter for number normalization + +The following example analyzer normalizes phone numbers by removing dashes and spaces and applies edge n-grams to the normalized text to support partial matches: + +```json +PUT advanced_pattern_replace_analyzer_index +{ + "settings": { + "analysis": { + "analyzer": { + "phone_number_analyzer": { + "type": "custom", + "char_filter": ["phone_normalization"], + "tokenizer": "standard", + "filter": ["lowercase", "edge_ngram"] + } + }, + "char_filter": { + "phone_normalization": { + "type": "pattern_replace", + "pattern": "[-\\s]", + "replacement": "" + } + }, + "filter": { + "edge_ngram": { + "type": "edge_ngram", + "min_gram": 3, + "max_gram": 10 + } + } + } + } +} +``` +{% include copy-curl.html %} + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET advanced_pattern_replace_analyzer_index/_analyze +{ + "analyzer": "phone_number_analyzer", + "text": "123-456 7890" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "123","start_offset": 0,"end_offset": 12,"type": "","position": 0}, + {"token": "1234","start_offset": 0,"end_offset": 12,"type": "","position": 0}, + {"token": "12345","start_offset": 0,"end_offset": 12,"type": "","position": 0}, + {"token": "123456","start_offset": 0,"end_offset": 
12,"type": "","position": 0}, + {"token": "1234567","start_offset": 0,"end_offset": 12,"type": "","position": 0}, + {"token": "12345678","start_offset": 0,"end_offset": 12,"type": "","position": 0}, + {"token": "123456789","start_offset": 0,"end_offset": 12,"type": "","position": 0}, + {"token": "1234567890","start_offset": 0,"end_offset": 12,"type": "","position": 0} + ] +} +``` + +## Handling special characters in regex patterns + +When using custom regex patterns in your analyzer, ensure that special or non-English characters are handled correctly. By default, Java's regex considers only `[A-Za-z0-9_]` to be word characters (`\w`). This can cause unexpected behavior when using `\w` or `\b`, which match the boundary between a word and a non-word character. + +For example, the following analyzer attempts to use the pattern `(\b\p{L}+\b)` to match one or more letter characters from any language (`\p{L}`) surrounded by word boundaries: + +```json +PUT /buggy_custom_analyzer +{ + "settings": { + "analysis": { + "filter": { + "capture_words": { + "type": "pattern_capture", + "patterns": [ + "(\\b\\p{L}+\\b)" + ] + } + }, + "analyzer": { + "filter_only_analyzer": { + "type": "custom", + "tokenizer": "keyword", + "filter": [ + "capture_words" + ] + } + } + } + } +} +``` + +However, this analyzer incorrectly tokenizes `él-empezó-a-reír` as `l`, `empez`, `a`, and `reír` because `\b` does not match the boundary between accented characters and the start or end of a string. + +To handle special characters correctly, add the Unicode case flag `(?U)` to your pattern: + +```json +PUT /fixed_custom_analyzer +{ + "settings": { + "analysis": { + "filter": { + "capture_words": { + "type": "pattern_capture", + "patterns": [ + "(?U)(\\b\\p{L}+\\b)" + ] + } + }, + "analyzer": { + "filter_only_analyzer": { + "type": "custom", + "tokenizer": "keyword", + "filter": [ + "capture_words" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Position increment gap + +The `position_increment_gap` parameter sets a positional gap between terms when indexing multi-valued fields, such as arrays. This gap ensures that phrase queries don't match terms across separate values unless explicitly allowed. For example, a default gap of 100 specifies that terms in different array entries are 100 positions apart, preventing unintended matches in phrase searches. You can adjust this value or set it to `0` in order to allow phrases to span across array values. + +The following example demonstrates the effect of `position_increment_gap` using a `match_phrase` query. + +1. Index a document in a `test-index`: + + ```json + PUT test-index/_doc/1 + { + "names": [ "Slow green", "turtle swims"] + } + ``` + {% include copy-curl.html %} + +1. Query the document using a `match_phrase` query: + + ```json + GET test-index/_search + { + "query": { + "match_phrase": { + "names": { + "query": "green turtle" + } + } + } + } + ``` + {% include copy-curl.html %} + + The response returns no hits because the distance between the terms `green` and `turtle` is `100` (the default `position_increment_gap`). + +1. 
Now query the document using a `match_phrase` query with a `slop` parameter that is higher than the `position_increment_gap`: + + ```json + GET test-index/_search + { + "query": { + "match_phrase": { + "names": { + "query": "green turtle", + "slop": 101 + } + } + } + } + ``` + {% include copy-curl.html %} + + The response contains the matching document: + + ```json + { + "took": 4, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.010358453, + "hits": [ + { + "_index": "test-index", + "_id": "1", + "_score": 0.010358453, + "_source": { + "names": [ + "Slow green", + "turtle swims" + ] + } + } + ] + } + } + ``` diff --git a/_analyzers/index.md b/_analyzers/index.md index fec61792b28..1dc38b2cd42 100644 --- a/_analyzers/index.md +++ b/_analyzers/index.md @@ -51,7 +51,7 @@ For a list of supported analyzers, see [Analyzers]({{site.url}}{{site.baseurl}}/ ## Custom analyzers -If needed, you can combine tokenizers, token filters, and character filters to create a custom analyzer. +If needed, you can combine tokenizers, token filters, and character filters to create a custom analyzer. For more information, see [Creating a custom analyzer]({{site.url}}{{site.baseurl}}/analyzers/custom-analyzer/). ## Text analysis at indexing time and query time @@ -160,14 +160,16 @@ The response provides information about the analyzers for each field: ``` ## Normalizers + Tokenization divides text into individual terms, but it does not address variations in token forms. Normalization resolves these issues by converting tokens into a standard format. This ensures that similar terms are matched appropriately, even if they are not identical. ### Normalization techniques The following normalization techniques can help address variations in token forms: + 1. **Case normalization**: Converts all tokens to lowercase to ensure case-insensitive matching. For example, "Hello" is normalized to "hello". -2. **Stemming**: Reduces words to their root form. For instance, "cars" is stemmed to "car", and "running" is normalized to "run". +2. **Stemming**: Reduces words to their root form. For instance, "cars" is stemmed to "car" and "running" is normalized to "run". 3. **Synonym handling:** Treats synonyms as equivalent. For example, "jogging" and "running" can be indexed under a common term, such as "run". diff --git a/_analyzers/language-analyzers.md b/_analyzers/language-analyzers.md deleted file mode 100644 index ca4ba320dd0..00000000000 --- a/_analyzers/language-analyzers.md +++ /dev/null @@ -1,44 +0,0 @@ ---- -layout: default -title: Language analyzers -nav_order: 100 -parent: Analyzers -redirect_from: - - /query-dsl/analyzers/language-analyzers/ ---- - -# Language analyzers - -OpenSearch supports the following language analyzers: -`arabic`, `armenian`, `basque`, `bengali`, `brazilian`, `bulgarian`, `catalan`, `czech`, `danish`, `dutch`, `english`, `estonian`, `finnish`, `french`, `galician`, `german`, `greek`, `hindi`, `hungarian`, `indonesian`, `irish`, `italian`, `latvian`, `lithuanian`, `norwegian`, `persian`, `portuguese`, `romanian`, `russian`, `sorani`, `spanish`, `swedish`, `turkish`, and `thai`. - -To use the analyzer when you map an index, specify the value within your query. 
For example, to map your index with the French language analyzer, specify the `french` value for the analyzer field: - -```json - "analyzer": "french" -``` - -#### Example request - -The following query specifies the `french` language analyzer for the index `my-index`: - -```json -PUT my-index -{ - "mappings": { - "properties": { - "text": { - "type": "text", - "fields": { - "french": { - "type": "text", - "analyzer": "french" - } - } - } - } - } -} -``` - - diff --git a/_analyzers/language-analyzers/arabic.md b/_analyzers/language-analyzers/arabic.md new file mode 100644 index 00000000000..e61c684cbbd --- /dev/null +++ b/_analyzers/language-analyzers/arabic.md @@ -0,0 +1,182 @@ +--- +layout: default +title: Arabic +parent: Language analyzers +grand_parent: Analyzers +nav_order: 10 +--- + +# Arabic analyzer + +The built-in `arabic` analyzer can be applied to a text field using the following command: + +```json +PUT /arabic-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "arabic" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_arabic +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_arabic_analyzer":{ + "type":"arabic", + "stem_exclusion":["تكنولوجيا","سلطة "] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Arabic analyzer internals + +The `arabic` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - decimal_digit + - stop (Arabic) + - normalization (Arabic) + - keyword + - stemmer (Arabic) + +## Custom Arabic analyzer + +You can create a custom Arabic analyzer using the following command: + +```json +PUT /arabic-index +{ + "settings": { + "analysis": { + "filter": { + "arabic_stop": { + "type": "stop", + "stopwords": "_arabic_" + }, + "arabic_stemmer": { + "type": "stemmer", + "language": "arabic" + }, + "arabic_normalization": { + "type": "arabic_normalization" + }, + "decimal_digit": { + "type": "decimal_digit" + }, + "arabic_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "arabic_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "arabic_normalization", + "decimal_digit", + "arabic_stop", + "arabic_keywords", + "arabic_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "arabic_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /arabic-index/_analyze +{ + "field": "content", + "text": "الطلاب يدرسون في الجامعات العربية. أرقامهم ١٢٣٤٥٦." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "طلاب", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "يدرس", + "start_offset": 7, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "جامع", + "start_offset": 17, + "end_offset": 25, + "type": "", + "position": 3 + }, + { + "token": "عرب", + "start_offset": 26, + "end_offset": 33, + "type": "", + "position": 4 + }, + { + "token": "ارقامهم", + "start_offset": 35, + "end_offset": 42, + "type": "", + "position": 5 + }, + { + "token": "123456", + "start_offset": 43, + "end_offset": 49, + "type": "", + "position": 6 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/armenian.md b/_analyzers/language-analyzers/armenian.md new file mode 100644 index 00000000000..9bd0549c80a --- /dev/null +++ b/_analyzers/language-analyzers/armenian.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Armenian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 20 +--- + +# Armenian analyzer + +The built-in `armenian` analyzer can be applied to a text field using the following command: + +```json +PUT /arabic-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "armenian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_armenian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_armenian_analyzer": { + "type": "armenian", + "stem_exclusion": ["բարև", "խաղաղություն"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Armenian analyzer internals + +The `armenian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Armenian) + - keyword + - stemmer (Armenian) + +## Custom Armenian analyzer + +You can create a custom Armenian analyzer using the following command: + +```json +PUT /armenian-index +{ + "settings": { + "analysis": { + "filter": { + "armenian_stop": { + "type": "stop", + "stopwords": "_armenian_" + }, + "armenian_stemmer": { + "type": "stemmer", + "language": "armenian" + }, + "armenian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "armenian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "armenian_stop", + "armenian_keywords", + "armenian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "armenian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET armenian-index/_analyze +{ + "analyzer": "stem_exclusion_armenian_analyzer", + "text": "բարև բոլորին, մենք խաղաղություն ենք ուզում և նոր օր ենք սկսել" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "բարև","start_offset": 0,"end_offset": 4,"type": "","position": 0}, + {"token": "բոլոր","start_offset": 5,"end_offset": 12,"type": "","position": 1}, + {"token": "խաղաղություն","start_offset": 19,"end_offset": 31,"type": "","position": 3}, + {"token": "ուզ","start_offset": 36,"end_offset": 42,"type": "","position": 5}, + {"token": "նոր","start_offset": 45,"end_offset": 48,"type": "","position": 7}, + 
{"token": "օր","start_offset": 49,"end_offset": 51,"type": "","position": 8}, + {"token": "սկսել","start_offset": 56,"end_offset": 61,"type": "","position": 10} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/basque.md b/_analyzers/language-analyzers/basque.md new file mode 100644 index 00000000000..e73510cc661 --- /dev/null +++ b/_analyzers/language-analyzers/basque.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Basque +parent: Language analyzers +grand_parent: Analyzers +nav_order: 30 +--- + +# Basque analyzer + +The built-in `basque` analyzer can be applied to a text field using the following command: + +```json +PUT /basque-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "basque" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_basque_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_basque_analyzer": { + "type": "basque", + "stem_exclusion": ["autoritate", "baldintza"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Basque analyzer internals + +The `basque` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Basque) + - keyword + - stemmer (Basque) + +## Custom Basque analyzer + +You can create a custom Basque analyzer using the following command: + +```json +PUT /basque-index +{ + "settings": { + "analysis": { + "filter": { + "basque_stop": { + "type": "stop", + "stopwords": "_basque_" + }, + "basque_stemmer": { + "type": "stemmer", + "language": "basque" + }, + "basque_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "basque_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "basque_stop", + "basque_keywords", + "basque_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "basque_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /basque-index/_analyze +{ + "field": "content", + "text": "Ikasleek euskal unibertsitateetan ikasten dute. Haien zenbakiak 123456 dira." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "ikasle","start_offset": 0,"end_offset": 8,"type": "","position": 0}, + {"token": "euskal","start_offset": 9,"end_offset": 15,"type": "","position": 1}, + {"token": "unibertsi","start_offset": 16,"end_offset": 33,"type": "","position": 2}, + {"token": "ikas","start_offset": 34,"end_offset": 41,"type": "","position": 3}, + {"token": "haien","start_offset": 48,"end_offset": 53,"type": "","position": 5}, + {"token": "zenba","start_offset": 54,"end_offset": 63,"type": "","position": 6}, + {"token": "123456","start_offset": 64,"end_offset": 70,"type": "","position": 7} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/bengali.md b/_analyzers/language-analyzers/bengali.md new file mode 100644 index 00000000000..af913a01efb --- /dev/null +++ b/_analyzers/language-analyzers/bengali.md @@ -0,0 +1,142 @@ +--- +layout: default +title: Bengali +parent: Language analyzers +grand_parent: Analyzers +nav_order: 40 +--- + +# Bengali analyzer + +The built-in `bengali` analyzer can be applied to a text field using the following command: + +```json +PUT /bengali-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "bengali" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_bengali_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_bengali_analyzer": { + "type": "bengali", + "stem_exclusion": ["কর্তৃপক্ষ", "অনুমোদন"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Bengali analyzer internals + +The `bengali` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - decimal_digit + - indic_normalization + - normalization (Bengali) + - stop (Bengali) + - keyword + - stemmer (Bengali) + +## Custom Bengali analyzer + +You can create a custom Bengali analyzer using the following command: + +```json +PUT /bengali-index +{ + "settings": { + "analysis": { + "filter": { + "bengali_stop": { + "type": "stop", + "stopwords": "_bengali_" + }, + "bengali_stemmer": { + "type": "stemmer", + "language": "bengali" + }, + "bengali_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "bengali_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "decimal_digit", + "indic_normalization", + "bengali_normalization", + "bengali_stop", + "bengali_keywords", + "bengali_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "bengali_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /bengali-index/_analyze +{ + "field": "content", + "text": "ছাত্ররা বিশ্ববিদ্যালয়ে পড়াশোনা করে। তাদের নম্বরগুলি ১২৩৪৫৬।" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "ছাত্র","start_offset": 0,"end_offset": 7,"type": "","position": 0}, + {"token": "বিসসবিদালয়","start_offset": 8,"end_offset": 23,"type": "","position": 1}, + {"token": "পরাসোন","start_offset": 24,"end_offset": 32,"type": "","position": 2}, + {"token": "তা","start_offset": 38,"end_offset": 43,"type": "","position": 4}, + {"token": 
"নমমর","start_offset": 44,"end_offset": 53,"type": "","position": 5}, + {"token": "123456","start_offset": 54,"end_offset": 60,"type": "","position": 6} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/brazilian.md b/_analyzers/language-analyzers/brazilian.md new file mode 100644 index 00000000000..67db2b92bc1 --- /dev/null +++ b/_analyzers/language-analyzers/brazilian.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Brazilian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 50 +--- + +# Brazilian analyzer + +The built-in `brazilian` analyzer can be applied to a text field using the following command: + +```json +PUT /brazilian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "brazilian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_brazilian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_brazilian_analyzer": { + "type": "brazilian", + "stem_exclusion": ["autoridade", "aprovação"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Brazilian analyzer internals + +The `brazilian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Brazilian) + - keyword + - stemmer (Brazilian) + +## Custom Brazilian analyzer + +You can create a custom Brazilian analyzer using the following command: + +```json +PUT /brazilian-index +{ + "settings": { + "analysis": { + "filter": { + "brazilian_stop": { + "type": "stop", + "stopwords": "_brazilian_" + }, + "brazilian_stemmer": { + "type": "stemmer", + "language": "brazilian" + }, + "brazilian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "brazilian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "brazilian_stop", + "brazilian_keywords", + "brazilian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "brazilian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /brazilian-index/_analyze +{ + "field": "content", + "text": "Estudantes estudam em universidades brasileiras. Seus números são 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "estudant","start_offset": 0,"end_offset": 10,"type": "","position": 0}, + {"token": "estud","start_offset": 11,"end_offset": 18,"type": "","position": 1}, + {"token": "univers","start_offset": 22,"end_offset": 35,"type": "","position": 3}, + {"token": "brasileir","start_offset": 36,"end_offset": 47,"type": "","position": 4}, + {"token": "numer","start_offset": 54,"end_offset": 61,"type": "","position": 6}, + {"token": "sao","start_offset": 62,"end_offset": 65,"type": "","position": 7}, + {"token": "123456","start_offset": 66,"end_offset": 72,"type": "","position": 8} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/bulgarian.md b/_analyzers/language-analyzers/bulgarian.md new file mode 100644 index 00000000000..42d5794e189 --- /dev/null +++ b/_analyzers/language-analyzers/bulgarian.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Bulgarian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 60 +--- + +# Bulgarian analyzer + +The built-in `bulgarian` analyzer can be applied to a text field using the following command: + +```json +PUT /bulgarian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "bulgarian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_bulgarian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_bulgarian_analyzer": { + "type": "bulgarian", + "stem_exclusion": ["авторитет", "одобрение"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Bulgarian analyzer internals + +The `bulgarian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Bulgarian) + - keyword + - stemmer (Bulgarian) + +## Custom Bulgarian analyzer + +You can create a custom Bulgarian analyzer using the following command: + +```json +PUT /bulgarian-index +{ + "settings": { + "analysis": { + "filter": { + "bulgarian_stop": { + "type": "stop", + "stopwords": "_bulgarian_" + }, + "bulgarian_stemmer": { + "type": "stemmer", + "language": "bulgarian" + }, + "bulgarian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "bulgarian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "bulgarian_stop", + "bulgarian_keywords", + "bulgarian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "bulgarian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /bulgarian-index/_analyze +{ + "field": "content", + "text": "Студентите учат в българските университети. Техните номера са 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "студент","start_offset": 0,"end_offset": 10,"type": "","position": 0}, + {"token": "учат","start_offset": 11,"end_offset": 15,"type": "","position": 1}, + {"token": "българск","start_offset": 18,"end_offset": 29,"type": "","position": 3}, + {"token": "университят","start_offset": 30,"end_offset": 42,"type": "","position": 4}, + {"token": "техн","start_offset": 44,"end_offset": 51,"type": "","position": 5}, + {"token": "номер","start_offset": 52,"end_offset": 58,"type": "","position": 6}, + {"token": "123456","start_offset": 62,"end_offset": 68,"type": "","position": 8} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/catalan.md b/_analyzers/language-analyzers/catalan.md new file mode 100644 index 00000000000..89762da094c --- /dev/null +++ b/_analyzers/language-analyzers/catalan.md @@ -0,0 +1,143 @@ +--- +layout: default +title: Catalan +parent: Language analyzers +grand_parent: Analyzers +nav_order: 70 +--- + +# Catalan analyzer + +The built-in `catalan` analyzer can be applied to a text field using the following command: + +```json +PUT /catalan-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "catalan" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_catalan_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_catalan_analyzer": { + "type": "catalan", + "stem_exclusion": ["autoritat", "aprovació"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Catalan analyzer internals + +The `catalan` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - elision (Catalan) + - lowercase + - stop (Catalan) + - keyword + - stemmer (Catalan) + +## Custom Catalan analyzer + +You can create a custom Catalan analyzer using the following command: + +```json +PUT /catalan-index +{ + "settings": { + "analysis": { + "filter": { + "catalan_stop": { + "type": "stop", + "stopwords": "_catalan_" + }, + "catalan_elision": { + "type": "elision", + "articles": [ "d", "l", "m", "n", "s", "t"], + "articles_case": true + }, + "catalan_stemmer": { + "type": "stemmer", + "language": "catalan" + }, + "catalan_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "catalan_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "catalan_elision", + "lowercase", + "catalan_stop", + "catalan_keywords", + "catalan_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "catalan_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /catalan-index/_analyze +{ + "field": "content", + "text": "Els estudiants estudien a les universitats catalanes. Els seus números són 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "estud","start_offset": 4,"end_offset": 14,"type": "","position": 1}, + {"token": "estud","start_offset": 15,"end_offset": 23,"type": "","position": 2}, + {"token": "univer","start_offset": 30,"end_offset": 42,"type": "","position": 5}, + {"token": "catalan","start_offset": 43,"end_offset": 52,"type": "","position": 6}, + {"token": "numer","start_offset": 63,"end_offset": 70,"type": "","position": 9}, + {"token": "123456","start_offset": 75,"end_offset": 81,"type": "","position": 11} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/cjk.md b/_analyzers/language-analyzers/cjk.md new file mode 100644 index 00000000000..aed7e6da224 --- /dev/null +++ b/_analyzers/language-analyzers/cjk.md @@ -0,0 +1,142 @@ +--- +layout: default +title: CJK +parent: Language analyzers +grand_parent: Analyzers +nav_order: 80 +--- + +# CJK analyzer + +The built-in `cjk` analyzer can be applied to a text field using the following command: + +```json +PUT /cjk-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "cjk" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_cjk_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_cjk_analyzer": { + "type": "cjk", + "stem_exclusion": ["example", "words"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## CJK analyzer internals + +The `cjk` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - cjk_width + - lowercase + - cjk_bigram + - stop (similar to English) + +## Custom CJK analyzer + +You can create a custom CJK analyzer using the following command: + +```json +PUT /cjk-index +{ + "settings": { + "analysis": { + "filter": { + "english_stop": { + "type": "stop", + "stopwords": [ + "a", "and", "are", "as", "at", "be", "but", "by", "for", + "if", "in", "into", "is", "it", "no", "not", "of", "on", + "or", "s", "such", "t", "that", "the", "their", "then", + "there", "these", "they", "this", "to", "was", "will", + "with", "www" + ] + } + }, + "analyzer": { + "cjk_custom_analyzer": { + "tokenizer": "standard", + "filter": [ + "cjk_width", + "lowercase", + "cjk_bigram", + "english_stop" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "cjk_custom_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /cjk-index/_analyze +{ + "field": "content", + "text": "学生们在中国、日本和韩国的大学学习。123456" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "学生","start_offset": 0,"end_offset": 2,"type": "","position": 0}, + {"token": "生们","start_offset": 1,"end_offset": 3,"type": "","position": 1}, + {"token": "们在","start_offset": 2,"end_offset": 4,"type": "","position": 2}, + {"token": "在中","start_offset": 3,"end_offset": 5,"type": "","position": 3}, + {"token": "中国","start_offset": 4,"end_offset": 6,"type": "","position": 4}, + {"token": "日本","start_offset": 7,"end_offset": 9,"type": "","position": 5}, + {"token": "本和","start_offset": 8,"end_offset": 10,"type": "","position": 6}, + {"token": "和韩","start_offset": 9,"end_offset": 
11,"type": "","position": 7}, + {"token": "韩国","start_offset": 10,"end_offset": 12,"type": "","position": 8}, + {"token": "国的","start_offset": 11,"end_offset": 13,"type": "","position": 9}, + {"token": "的大","start_offset": 12,"end_offset": 14,"type": "","position": 10}, + {"token": "大学","start_offset": 13,"end_offset": 15,"type": "","position": 11}, + {"token": "学学","start_offset": 14,"end_offset": 16,"type": "","position": 12}, + {"token": "学习","start_offset": 15,"end_offset": 17,"type": "","position": 13}, + {"token": "123456","start_offset": 18,"end_offset": 24,"type": "","position": 14} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/czech.md b/_analyzers/language-analyzers/czech.md new file mode 100644 index 00000000000..c1778cd0f4a --- /dev/null +++ b/_analyzers/language-analyzers/czech.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Czech +parent: Language analyzers +grand_parent: Analyzers +nav_order: 90 +--- + +# Czech analyzer + +The built-in `czech` analyzer can be applied to a text field using the following command: + +```json +PUT /czech-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "czech" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_czech_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_czech_analyzer": { + "type": "czech", + "stem_exclusion": ["autorita", "schválení"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Czech analyzer internals + +The `czech` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Czech) + - keyword + - stemmer (Czech) + +## Custom Czech analyzer + +You can create a custom Czech analyzer using the following command: + +```json +PUT /czech-index +{ + "settings": { + "analysis": { + "filter": { + "czech_stop": { + "type": "stop", + "stopwords": "_czech_" + }, + "czech_stemmer": { + "type": "stemmer", + "language": "czech" + }, + "czech_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "czech_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "czech_stop", + "czech_keywords", + "czech_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "czech_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /czech-index/_analyze +{ + "field": "content", + "text": "Studenti studují na českých univerzitách. Jejich čísla jsou 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "student", + "start_offset": 0, + "end_offset": 8, + "type": "", + "position": 0 + }, + { + "token": "studuj", + "start_offset": 9, + "end_offset": 16, + "type": "", + "position": 1 + }, + { + "token": "česk", + "start_offset": 20, + "end_offset": 27, + "type": "", + "position": 3 + }, + { + "token": "univerzit", + "start_offset": 28, + "end_offset": 40, + "type": "", + "position": 4 + }, + { + "token": "čísl", + "start_offset": 49, + "end_offset": 54, + "type": "", + "position": 6 + }, + { + "token": "123456", + "start_offset": 60, + "end_offset": 66, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/danish.md b/_analyzers/language-analyzers/danish.md new file mode 100644 index 00000000000..b5ee1b0e975 --- /dev/null +++ b/_analyzers/language-analyzers/danish.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Danish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 100 +--- + +# Danish analyzer + +The built-in `danish` analyzer can be applied to a text field using the following command: + +```json +PUT /danish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "danish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_danish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_danish_analyzer": { + "type": "danish", + "stem_exclusion": ["autoritet", "godkendelse"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Danish analyzer internals + +The `danish` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Danish) + - keyword + - stemmer (Danish) + +## Custom Danish analyzer + +You can create a custom Danish analyzer using the following command: + +```json +PUT /danish-index +{ + "settings": { + "analysis": { + "filter": { + "danish_stop": { + "type": "stop", + "stopwords": "_danish_" + }, + "danish_stemmer": { + "type": "stemmer", + "language": "danish" + }, + "danish_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "danish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "danish_stop", + "danish_keywords", + "danish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "danish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /danish-index/_analyze +{ + "field": "content", + "text": "Studerende studerer på de danske universiteter. Deres numre er 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "stud", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0 + }, + { + "token": "stud", + "start_offset": 11, + "end_offset": 19, + "type": "", + "position": 1 + }, + { + "token": "dansk", + "start_offset": 26, + "end_offset": 32, + "type": "", + "position": 4 + }, + { + "token": "universitet", + "start_offset": 33, + "end_offset": 46, + "type": "", + "position": 5 + }, + { + "token": "numr", + "start_offset": 54, + "end_offset": 59, + "type": "", + "position": 7 + }, + { + "token": "123456", + "start_offset": 63, + "end_offset": 69, + "type": "", + "position": 9 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/dutch.md b/_analyzers/language-analyzers/dutch.md new file mode 100644 index 00000000000..0259707d78d --- /dev/null +++ b/_analyzers/language-analyzers/dutch.md @@ -0,0 +1,148 @@ +--- +layout: default +title: Dutch +parent: Language analyzers +grand_parent: Analyzers +nav_order: 110 +--- + +# Dutch analyzer + +The built-in `dutch` analyzer can be applied to a text field using the following command: + +```json +PUT /dutch-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "dutch" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_dutch_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_dutch_analyzer": { + "type": "dutch", + "stem_exclusion": ["autoriteit", "goedkeuring"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Dutch analyzer internals + +The `dutch` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Dutch) + - keyword + - stemmer_override + - stemmer (Dutch) + +## Custom Dutch analyzer + +You can create a custom Dutch analyzer using the following command: + +```json +PUT /dutch-index +{ + "settings": { + "analysis": { + "filter": { + "dutch_stop": { + "type": "stop", + "stopwords": "_dutch_" + }, + "dutch_stemmer": { + "type": "stemmer", + "language": "dutch" + }, + "dutch_keywords": { + "type": "keyword_marker", + "keywords": [] + }, + "dutch_override": { + "type": "stemmer_override", + "rules": [ + "fiets=>fiets", + "bromfiets=>bromfiets", + "ei=>eier", + "kind=>kinder" + ] + } + }, + "analyzer": { + "dutch_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "dutch_stop", + "dutch_keywords", + "dutch_override", + "dutch_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "dutch_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /dutch-index/_analyze +{ + "field": "content", + "text": "De studenten studeren in Nederland en bezoeken Amsterdam. Hun nummers zijn 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 3,"end_offset": 12,"type": "","position": 1}, + {"token": "studer","start_offset": 13,"end_offset": 21,"type": "","position": 2}, + {"token": "nederland","start_offset": 25,"end_offset": 34,"type": "","position": 4}, + {"token": "bezoek","start_offset": 38,"end_offset": 46,"type": "","position": 6}, + {"token": "amsterdam","start_offset": 47,"end_offset": 56,"type": "","position": 7}, + {"token": "nummer","start_offset": 62,"end_offset": 69,"type": "","position": 9}, + {"token": "123456","start_offset": 75,"end_offset": 81,"type": "","position": 11} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/english.md b/_analyzers/language-analyzers/english.md new file mode 100644 index 00000000000..2d0b6003125 --- /dev/null +++ b/_analyzers/language-analyzers/english.md @@ -0,0 +1,143 @@ +--- +layout: default +title: English +parent: Language analyzers +grand_parent: Analyzers +nav_order: 120 +--- + +# English analyzer + +The built-in `english` analyzer can be applied to a text field using the following command: + +```json +PUT /english-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "english" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_english_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_english_analyzer": { + "type": "english", + "stem_exclusion": ["authority", "authorization"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## English analyzer internals + +The `english` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - stemmer (possessive_english) + - lowercase + - stop (English) + - keyword + - stemmer (English) + +## Custom English analyzer + +You can create a custom English analyzer using the following command: + +```json +PUT /english-index +{ + "settings": { + "analysis": { + "filter": { + "english_stop": { + "type": "stop", + "stopwords": "_english_" + }, + "english_stemmer": { + "type": "stemmer", + "language": "english" + }, + "english_keywords": { + "type": "keyword_marker", + "keywords": [] + }, + "english_possessive_stemmer": { + "type": "stemmer", + "language": "possessive_english" + } + }, + "analyzer": { + "english_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "english_possessive_stemmer", + "lowercase", + "english_stop", + "english_keywords", + "english_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "english_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /english-index/_analyze +{ + "field": "content", + "text": "The students study in the USA and work at NASA. Their numbers are 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 4,"end_offset": 12,"type": "","position": 1}, + {"token": "studi","start_offset": 13,"end_offset": 18,"type": "","position": 2}, + {"token": "usa","start_offset": 26,"end_offset": 29,"type": "","position": 5}, + {"token": "work","start_offset": 34,"end_offset": 38,"type": "","position": 7}, + {"token": "nasa","start_offset": 42,"end_offset": 46,"type": "","position": 9}, + {"token": "number","start_offset": 54,"end_offset": 61,"type": "","position": 11}, + {"token": "123456","start_offset": 66,"end_offset": 72,"type": "","position": 13} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/estonian.md b/_analyzers/language-analyzers/estonian.md new file mode 100644 index 00000000000..a4cb664f185 --- /dev/null +++ b/_analyzers/language-analyzers/estonian.md @@ -0,0 +1,139 @@ +--- +layout: default +title: Estonian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 130 +--- + +# Estonian analyzer + +The built-in `estonian` analyzer can be applied to a text field using the following command: + +```json +PUT /estonian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "estonian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_estonian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_estonian_analyzer": { + "type": "estonian", + "stem_exclusion": ["autoriteet", "kinnitus"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Estonian analyzer internals + +The `estonian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Estonian) + - keyword + - stemmer (Estonian) + +## Custom Estonian analyzer + +You can create a custom Estonian analyzer using the following command: + +```json +PUT /estonian-index +{ + "settings": { + "analysis": { + "filter": { + "estonian_stop": { + "type": "stop", + "stopwords": "_estonian_" + }, + "estonian_stemmer": { + "type": "stemmer", + "language": "estonian" + }, + "estonian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "estonian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "estonian_stop", + "estonian_keywords", + "estonian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "estonian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /estonian-index/_analyze +{ + "field": "content", + "text": "Õpilased õpivad Tallinnas ja Eesti ülikoolides. Nende numbrid on 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "õpilase","start_offset": 0,"end_offset": 8,"type": "","position": 0}, + {"token": "õpi","start_offset": 9,"end_offset": 15,"type": "","position": 1}, + {"token": "tallinna","start_offset": 16,"end_offset": 25,"type": "","position": 2}, + {"token": "eesti","start_offset": 29,"end_offset": 34,"type": "","position": 4}, + {"token": "ülikooli","start_offset": 35,"end_offset": 46,"type": "","position": 5}, + {"token": "nende","start_offset": 48,"end_offset": 53,"type": "","position": 6}, + {"token": "numbri","start_offset": 54,"end_offset": 61,"type": "","position": 7}, + {"token": "on","start_offset": 62,"end_offset": 64,"type": "","position": 8}, + {"token": "123456","start_offset": 65,"end_offset": 71,"type": "","position": 9} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/finnish.md b/_analyzers/language-analyzers/finnish.md new file mode 100644 index 00000000000..6f559650d2d --- /dev/null +++ b/_analyzers/language-analyzers/finnish.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Finnish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 140 +--- + +# Finnish analyzer + +The built-in `finnish` analyzer can be applied to a text field using the following command: + +```json +PUT /finnish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "finnish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_finnish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_finnish_analyzer": { + "type": "finnish", + "stem_exclusion": ["valta", "hyväksyntä"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Finnish analyzer internals + +The `finnish` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Finnish) + - keyword + - stemmer (Finnish) + +## Custom Finnish analyzer + +You can create a custom Finnish analyzer using the following command: + +```json +PUT /finnish-index +{ + "settings": { + "analysis": { + "filter": { + "finnish_stop": { + "type": "stop", + "stopwords": "_finnish_" + }, + "finnish_stemmer": { + "type": "stemmer", + "language": "finnish" + }, + "finnish_keywords": { + "type": "keyword_marker", + "keywords": ["Helsinki", "Suomi"] + } + }, + "analyzer": { + "finnish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "finnish_stop", + "finnish_keywords", + "finnish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "finnish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /finnish-index/_analyze +{ + "field": "content", + "text": "Opiskelijat opiskelevat Helsingissä ja Suomen yliopistoissa. Heidän numeronsa ovat 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "opiskelij","start_offset": 0,"end_offset": 11,"type": "","position": 0}, + {"token": "opiskelev","start_offset": 12,"end_offset": 23,"type": "","position": 1}, + {"token": "helsing","start_offset": 24,"end_offset": 35,"type": "","position": 2}, + {"token": "suome","start_offset": 39,"end_offset": 45,"type": "","position": 4}, + {"token": "yliopisto","start_offset": 46,"end_offset": 59,"type": "","position": 5}, + {"token": "numero","start_offset": 68,"end_offset": 77,"type": "","position": 7}, + {"token": "123456","start_offset": 83,"end_offset": 89,"type": "","position": 9} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/french.md b/_analyzers/language-analyzers/french.md new file mode 100644 index 00000000000..64e7ab54154 --- /dev/null +++ b/_analyzers/language-analyzers/french.md @@ -0,0 +1,148 @@ +--- +layout: default +title: French +parent: Language analyzers +grand_parent: Analyzers +nav_order: 150 +--- + +# French analyzer + +The built-in `french` analyzer can be applied to a text field using the following command: + +```json +PUT /french-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "french" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_french_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_french_analyzer": { + "type": "french", + "stem_exclusion": ["autorité", "acceptation"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## French analyzer internals + +The `french` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - elision (French) + - lowercase + - stop (French) + - keyword + - stemmer (French) + +## Custom French analyzer + +You can create a custom French analyzer using the following command: + +```json +PUT /french-index +{ + "settings": { + "analysis": { + "filter": { + "french_stop": { + "type": "stop", + "stopwords": "_french_" + }, + "french_elision": { + "type": "elision", + "articles_case": true, + "articles": [ + "l", "m", "t", "qu", "n", "s", + "j", "d", "c", "jusqu", "quoiqu", + "lorsqu", "puisqu" + ] + }, + "french_stemmer": { + "type": "stemmer", + "language": "light_french" + }, + "french_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "french_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "french_elision", + "lowercase", + "french_stop", + "french_keywords", + "french_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "french_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /french-index/_analyze +{ + "field": "content", + "text": "Les étudiants étudient à Paris et dans les universités françaises. Leurs numéros sont 123456." 
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {"token": "etudiant","start_offset": 4,"end_offset": 13,"type": "","position": 1},
+    {"token": "etudient","start_offset": 14,"end_offset": 22,"type": "","position": 2},
+    {"token": "pari","start_offset": 25,"end_offset": 30,"type": "","position": 4},
+    {"token": "universit","start_offset": 43,"end_offset": 54,"type": "","position": 8},
+    {"token": "francais","start_offset": 55,"end_offset": 65,"type": "","position": 9},
+    {"token": "numero","start_offset": 73,"end_offset": 80,"type": "","position": 11},
+    {"token": "123456","start_offset": 86,"end_offset": 92,"type": "","position": 13}
+  ]
+}
+```
\ No newline at end of file
diff --git a/_analyzers/language-analyzers/galician.md b/_analyzers/language-analyzers/galician.md
new file mode 100644
index 00000000000..00338b23a77
--- /dev/null
+++ b/_analyzers/language-analyzers/galician.md
@@ -0,0 +1,138 @@
+---
+layout: default
+title: Galician
+parent: Language analyzers
+grand_parent: Analyzers
+nav_order: 160
+---
+
+# Galician analyzer
+
+The built-in `galician` analyzer can be applied to a text field using the following command:
+
+```json
+PUT /galician-index
+{
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "galician"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Stem exclusion
+
+You can use `stem_exclusion` with this language analyzer using the following command:
+
+```json
+PUT index_with_stem_exclusion_galician_analyzer
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "stem_exclusion_galician_analyzer": {
+          "type": "galician",
+          "stem_exclusion": ["autoridade", "aceptación"]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Galician analyzer internals
+
+The `galician` analyzer is built using the following components:
+
+- Tokenizer: `standard`
+
+- Token filters:
+  - lowercase
+  - stop (Galician)
+  - keyword
+  - stemmer (Galician)
+
+## Custom Galician analyzer
+
+You can create a custom Galician analyzer using the following command:
+
+```json
+PUT /galician-index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "galician_stop": {
+          "type": "stop",
+          "stopwords": "_galician_"
+        },
+        "galician_stemmer": {
+          "type": "stemmer",
+          "language": "galician"
+        },
+        "galician_keywords": {
+          "type": "keyword_marker",
+          "keywords": []
+        }
+      },
+      "analyzer": {
+        "galician_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "galician_stop",
+            "galician_keywords",
+            "galician_stemmer"
+          ]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "galician_analyzer"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+POST /galician-index/_analyze
+{
+  "field": "content",
+  "text": "Os estudantes estudan en Santiago e nas universidades galegas. Os seus números son 123456."
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "estud","start_offset": 3,"end_offset": 13,"type": "","position": 1}, + {"token": "estud","start_offset": 14,"end_offset": 21,"type": "","position": 2}, + {"token": "santiag","start_offset": 25,"end_offset": 33,"type": "","position": 4}, + {"token": "univers","start_offset": 40,"end_offset": 53,"type": "","position": 7}, + {"token": "galeg","start_offset": 54,"end_offset": 61,"type": "","position": 8}, + {"token": "numer","start_offset": 71,"end_offset": 78,"type": "","position": 11}, + {"token": "son","start_offset": 79,"end_offset": 82,"type": "","position": 12}, + {"token": "123456","start_offset": 83,"end_offset": 89,"type": "","position": 13} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/german.md b/_analyzers/language-analyzers/german.md new file mode 100644 index 00000000000..4071ef53780 --- /dev/null +++ b/_analyzers/language-analyzers/german.md @@ -0,0 +1,174 @@ +--- +layout: default +title: German +parent: Language analyzers +grand_parent: Analyzers +nav_order: 170 +--- + +# German analyzer + +The built-in `german` analyzer can be applied to a text field using the following command: + +```json +PUT /german-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "german" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_german_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_german_analyzer": { + "type": "german", + "stem_exclusion": ["Autorität", "Genehmigung"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## German analyzer internals + +The `german` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (German) + - keyword + - normalization (German) + - stemmer (German) + +## Custom German analyzer + +You can create a custom German analyzer using the following command: + +```json +PUT /german-index +{ + "settings": { + "analysis": { + "filter": { + "german_stop": { + "type": "stop", + "stopwords": "_german_" + }, + "german_stemmer": { + "type": "stemmer", + "language": "light_german" + }, + "german_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "german_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "german_stop", + "german_keywords", + "german_normalization", + "german_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "german_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /german-index/_analyze +{ + "field": "content", + "text": "Die Studenten studieren an den deutschen Universitäten. Ihre Nummern sind 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "student", + "start_offset": 4, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "studi", + "start_offset": 14, + "end_offset": 23, + "type": "", + "position": 2 + }, + { + "token": "deutsch", + "start_offset": 31, + "end_offset": 40, + "type": "", + "position": 5 + }, + { + "token": "universitat", + "start_offset": 41, + "end_offset": 54, + "type": "", + "position": 6 + }, + { + "token": "numm", + "start_offset": 61, + "end_offset": 68, + "type": "", + "position": 8 + }, + { + "token": "123456", + "start_offset": 74, + "end_offset": 80, + "type": "", + "position": 10 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/greek.md b/_analyzers/language-analyzers/greek.md new file mode 100644 index 00000000000..2446b1e2d6d --- /dev/null +++ b/_analyzers/language-analyzers/greek.md @@ -0,0 +1,139 @@ +--- +layout: default +title: Greek +parent: Language analyzers +grand_parent: Analyzers +nav_order: 180 +--- + +# Greek analyzer + +The built-in `greek` analyzer can be applied to a text field using the following command: + +```json +PUT /greek-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "greek" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_greek_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_greek_analyzer": { + "type": "greek", + "stem_exclusion": ["αρχή", "έγκριση"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Greek analyzer internals + +The `greek` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Greek) + - keyword + - stemmer (Greek) + +## Custom Greek analyzer + +You can create a custom Greek analyzer using the following command: + +```json +PUT /greek-index +{ + "settings": { + "analysis": { + "filter": { + "greek_stop": { + "type": "stop", + "stopwords": "_greek_" + }, + "greek_stemmer": { + "type": "stemmer", + "language": "greek" + }, + "greek_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "greek_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "greek_stop", + "greek_keywords", + "greek_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "greek_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /greek-index/_analyze +{ + "field": "content", + "text": "Οι φοιτητές σπουδάζουν στα ελληνικά πανεπιστήμια. Οι αριθμοί τους είναι 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "φοιτητές","start_offset": 3,"end_offset": 11,"type": "","position": 1}, + {"token": "σπουδάζ","start_offset": 12,"end_offset": 22,"type": "","position": 2}, + {"token": "στα","start_offset": 23,"end_offset": 26,"type": "","position": 3}, + {"token": "ελληνικά","start_offset": 27,"end_offset": 35,"type": "","position": 4}, + {"token": "πανεπιστήμ","start_offset": 36,"end_offset": 48,"type": "","position": 5}, + {"token": "αριθμοί","start_offset": 53,"end_offset": 60,"type": "","position": 7}, + {"token": "τους","start_offset": 61,"end_offset": 65,"type": "","position": 8}, + {"token": "είνα","start_offset": 66,"end_offset": 71,"type": "","position": 9}, + {"token": "123456","start_offset": 72,"end_offset": 78,"type": "","position": 10} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/hindi.md b/_analyzers/language-analyzers/hindi.md new file mode 100644 index 00000000000..93f2eea319a --- /dev/null +++ b/_analyzers/language-analyzers/hindi.md @@ -0,0 +1,178 @@ +--- +layout: default +title: Hindi +parent: Language analyzers +grand_parent: Analyzers +nav_order: 190 +--- + +# Hindi analyzer + +The built-in `hindi` analyzer can be applied to a text field using the following command: + +```json +PUT /hindi-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "hindi" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_hindi_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_hindi_analyzer": { + "type": "hindi", + "stem_exclusion": ["अधिकार", "अनुमोदन"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Hindi analyzer internals + +The `hindi` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - decimal_digit + - keyword + - normalization (indic) + - normalization (Hindi) + - stop (Hindi) + - stemmer (Hindi) + +## Custom Hindi analyzer + +You can create a custom Hindi analyzer using the following command: + +```json +PUT /hindi-index +{ + "settings": { + "analysis": { + "filter": { + "hindi_stop": { + "type": "stop", + "stopwords": "_hindi_" + }, + "hindi_stemmer": { + "type": "stemmer", + "language": "hindi" + }, + "hindi_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "hindi_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "decimal_digit", + "hindi_keywords", + "indic_normalization", + "hindi_normalization", + "hindi_stop", + "hindi_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "hindi_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /hindi-index/_analyze +{ + "field": "content", + "text": "छात्र भारतीय विश्वविद्यालयों में पढ़ते हैं। उनके नंबर १२३४५६ हैं।" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "छातर", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "भारतिय", + "start_offset": 6, + "end_offset": 12, + "type": "", + "position": 1 + }, + { + "token": "विशवविदयालय", 
+ "start_offset": 13, + "end_offset": 28, + "type": "", + "position": 2 + }, + { + "token": "पढ", + "start_offset": 33, + "end_offset": 38, + "type": "", + "position": 4 + }, + { + "token": "नंबर", + "start_offset": 49, + "end_offset": 53, + "type": "", + "position": 7 + }, + { + "token": "123456", + "start_offset": 54, + "end_offset": 60, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/hungarian.md b/_analyzers/language-analyzers/hungarian.md new file mode 100644 index 00000000000..d115c5d29ca --- /dev/null +++ b/_analyzers/language-analyzers/hungarian.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Hungarian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 200 +--- + +# Hungarian analyzer + +The built-in `hungarian` analyzer can be applied to a text field using the following command: + +```json +PUT /hungarian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "hungarian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_hungarian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_hungarian_analyzer": { + "type": "hungarian", + "stem_exclusion": ["hatalom", "jóváhagyás"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Hungarian analyzer internals + +The `hungarian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Hungarian) + - keyword + - stemmer (Hungarian) + +## Custom Hungarian analyzer + +You can create a custom Hungarian analyzer using the following command: + +```json +PUT /hungarian-index +{ + "settings": { + "analysis": { + "filter": { + "hungarian_stop": { + "type": "stop", + "stopwords": "_hungarian_" + }, + "hungarian_stemmer": { + "type": "stemmer", + "language": "hungarian" + }, + "hungarian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "hungarian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "hungarian_stop", + "hungarian_keywords", + "hungarian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "hungarian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /hungarian-index/_analyze +{ + "field": "content", + "text": "A diákok a magyar egyetemeken tanulnak. A számaik 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "diák", + "start_offset": 2, + "end_offset": 8, + "type": "", + "position": 1 + }, + { + "token": "magyar", + "start_offset": 11, + "end_offset": 17, + "type": "", + "position": 3 + }, + { + "token": "egyetem", + "start_offset": 18, + "end_offset": 29, + "type": "", + "position": 4 + }, + { + "token": "tanul", + "start_offset": 30, + "end_offset": 38, + "type": "", + "position": 5 + }, + { + "token": "szám", + "start_offset": 42, + "end_offset": 49, + "type": "", + "position": 7 + }, + { + "token": "123456", + "start_offset": 50, + "end_offset": 56, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/index.md b/_analyzers/language-analyzers/index.md new file mode 100644 index 00000000000..cc53c1cdac8 --- /dev/null +++ b/_analyzers/language-analyzers/index.md @@ -0,0 +1,135 @@ +--- +layout: default +title: Language analyzers +nav_order: 140 +parent: Analyzers +has_children: true +has_toc: true +redirect_from: + - /query-dsl/analyzers/language-analyzers/ + - /analyzers/language-analyzers/ +--- + +# Language analyzers + +OpenSearch supports the following language analyzers: +`arabic`, `armenian`, `basque`, `bengali`, `brazilian`, `bulgarian`, `catalan`, `czech`, `danish`, `dutch`, `english`, `estonian`, `finnish`, `french`, `galician`, `german`, `greek`, `hindi`, `hungarian`, `indonesian`, `irish`, `italian`, `latvian`, `lithuanian`, `norwegian`, `persian`, `portuguese`, `romanian`, `russian`, `sorani`, `spanish`, `swedish`, `thai`, and `turkish`. + +To use an analyzer when you map an index, specify the value in your query. For example, to map your index with the French language analyzer, specify the `french` value in the analyzer field: + +```json + "analyzer": "french" +``` + +#### Example request + +The following query specifies an index `my-index` with the `content` field configured as multi-field, and a sub-field named `french` is configured with the `french` language analyzer: + +```json +PUT my-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "fields": { + "french": { + "type": "text", + "analyzer": "french" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +The default `french` analyzer can also be configured for the entire index using the following query: + +```json +PUT my-index +{ + "settings": { + "analysis": { + "analyzer": { + "default": { + "type": "french" + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text" + }, + "title": { + "type": "text" + }, + "description": { + "type": "text" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can apply stem exclusion to any language analyzer by providing a list of lowercase words that should be excluded from stemming. Internally, OpenSearch uses the `keyword_marker` token filter to mark these words as keywords, ensuring that they are not stemmed. + +## Stem exclusion example + +Use the following request to configure `stem_exclusion`: + +```json +PUT index_with_stem_exclusion_english_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_english_analyzer":{ + "type":"english", + "stem_exclusion": ["manager", "management"] + } + } + } + } +} +``` +{% include copy-curl.html %} + + +## Stem exclusion with custom analyzers + +All language analyzers consist of tokenizers and token filters specific to a particular language. 
If you want to implement a custom version of the language analyzer with stem exclusion, you need to configure the `keyword_marker` token filter and list the words excluded from stemming in the `keywords` parameter:
+
+```json
+PUT index_with_keyword_marker_analyzer
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "protected_keywords_filter": {
+          "type": "keyword_marker",
+          "keywords": ["Apple", "OpenSearch"]
+        },
+        "english_stemmer": {
+          "type": "stemmer",
+          "language": "english"
+        }
+      },
+      "analyzer": {
+        "custom_english_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "protected_keywords_filter",
+            "english_stemmer"
+          ]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
diff --git a/_analyzers/language-analyzers/indonesian.md b/_analyzers/language-analyzers/indonesian.md
new file mode 100644
index 00000000000..5c3d430b3a8
--- /dev/null
+++ b/_analyzers/language-analyzers/indonesian.md
@@ -0,0 +1,172 @@
+---
+layout: default
+title: Indonesian
+parent: Language analyzers
+grand_parent: Analyzers
+nav_order: 210
+---
+
+# Indonesian analyzer
+
+The built-in `indonesian` analyzer can be applied to a text field using the following command:
+
+```json
+PUT /indonesian-index
+{
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "indonesian"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Stem exclusion
+
+You can use `stem_exclusion` with this language analyzer using the following command:
+
+```json
+PUT index_with_stem_exclusion_indonesian_analyzer
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "stem_exclusion_indonesian_analyzer": {
+          "type": "indonesian",
+          "stem_exclusion": ["otoritas", "persetujuan"]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Indonesian analyzer internals
+
+The `indonesian` analyzer is built using the following components:
+
+- Tokenizer: `standard`
+
+- Token filters:
+  - lowercase
+  - stop (Indonesian)
+  - keyword
+  - stemmer (Indonesian)
+
+## Custom Indonesian analyzer
+
+You can create a custom Indonesian analyzer using the following command:
+
+```json
+PUT /indonesian-index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "indonesian_stop": {
+          "type": "stop",
+          "stopwords": "_indonesian_"
+        },
+        "indonesian_stemmer": {
+          "type": "stemmer",
+          "language": "indonesian"
+        },
+        "indonesian_keywords": {
+          "type": "keyword_marker",
+          "keywords": []
+        }
+      },
+      "analyzer": {
+        "indonesian_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "indonesian_stop",
+            "indonesian_keywords",
+            "indonesian_stemmer"
+          ]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "indonesian_analyzer"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+POST /indonesian-index/_analyze
+{
+  "field": "content",
+  "text": "Mahasiswa belajar di universitas Indonesia. Nomor mereka adalah 123456."
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "mahasiswa", + "start_offset": 0, + "end_offset": 9, + "type": "", + "position": 0 + }, + { + "token": "ajar", + "start_offset": 10, + "end_offset": 17, + "type": "", + "position": 1 + }, + { + "token": "universitas", + "start_offset": 21, + "end_offset": 32, + "type": "", + "position": 3 + }, + { + "token": "indonesia", + "start_offset": 33, + "end_offset": 42, + "type": "", + "position": 4 + }, + { + "token": "nomor", + "start_offset": 44, + "end_offset": 49, + "type": "", + "position": 5 + }, + { + "token": "123456", + "start_offset": 64, + "end_offset": 70, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/irish.md b/_analyzers/language-analyzers/irish.md new file mode 100644 index 00000000000..3e1535d1345 --- /dev/null +++ b/_analyzers/language-analyzers/irish.md @@ -0,0 +1,157 @@ +--- +layout: default +title: Irish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 210 +--- + +# Irish analyzer + +The built-in `irish` analyzer can be applied to a text field using the following command: + +```json +PUT /irish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "irish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_irish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_irish_analyzer": { + "type": "irish", + "stem_exclusion": ["údarás", "faomhadh"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Irish analyzer internals + +The `irish` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - hyphenation (Irish) + - elision (Irish) + - lowercase (Irish) + - stop (Irish) + - keyword + - stemmer (Irish) + +## Custom Irish analyzer + +You can create a custom Irish analyzer using the following command: + +```json +PUT /irish-index +{ + "settings": { + "analysis": { + "filter": { + "irish_stop": { + "type": "stop", + "stopwords": "_irish_" + }, + "irish_elision": { + "type": "elision", + "articles": [ "d", "m", "b" ], + "articles_case": true + }, + "irish_hyphenation": { + "type": "stop", + "stopwords": [ "h", "n", "t" ], + "ignore_case": true + }, + "irish_lowercase": { + "type": "lowercase", + "language": "irish" + }, + "irish_stemmer": { + "type": "stemmer", + "language": "irish" + }, + "irish_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "irish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "irish_hyphenation", + "irish_elision", + "irish_lowercase", + "irish_stop", + "irish_keywords", + "irish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "irish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /irish-index/_analyze +{ + "field": "content", + "text": "Tá mic léinn ag staidéar in ollscoileanna na hÉireann. Is iad a gcuid uimhreacha ná 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "tá","start_offset": 0,"end_offset": 2,"type": "","position": 0}, + {"token": "mic","start_offset": 3,"end_offset": 6,"type": "","position": 1}, + {"token": "léinn","start_offset": 7,"end_offset": 12,"type": "","position": 2}, + {"token": "staidéar","start_offset": 16,"end_offset": 24,"type": "","position": 4}, + {"token": "ollscoileanna","start_offset": 28,"end_offset": 41,"type": "","position": 6}, + {"token": "héireann","start_offset": 45,"end_offset": 53,"type": "","position": 8}, + {"token": "cuid","start_offset": 64,"end_offset": 69,"type": "","position": 12}, + {"token": "uimhreacha","start_offset": 70,"end_offset": 80,"type": "","position": 13}, + {"token": "123456","start_offset": 84,"end_offset": 90,"type": "","position": 15} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/italian.md b/_analyzers/language-analyzers/italian.md new file mode 100644 index 00000000000..190056d63c9 --- /dev/null +++ b/_analyzers/language-analyzers/italian.md @@ -0,0 +1,148 @@ +--- +layout: default +title: Italian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 220 +--- + +# Italian analyzer + +The built-in `italian` analyzer can be applied to a text field using the following command: + +```json +PUT /italian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "italian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_italian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_italian_analyzer": { + "type": "italian", + "stem_exclusion": ["autorità", "approvazione"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Italian analyzer internals + +The `italian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - elision (Italian) + - lowercase + - stop (Italian) + - keyword + - stemmer (Italian) + +## Custom Italian analyzer + +You can create a custom Italian analyzer using the following command: + +```json +PUT /italian-index +{ + "settings": { + "analysis": { + "filter": { + "italian_stop": { + "type": "stop", + "stopwords": "_italian_" + }, + "italian_elision": { + "type": "elision", + "articles": [ + "c", "l", "all", "dall", "dell", + "nell", "sull", "coll", "pell", + "gl", "agl", "dagl", "degl", "negl", + "sugl", "un", "m", "t", "s", "v", "d" + ], + "articles_case": true + }, + "italian_stemmer": { + "type": "stemmer", + "language": "light_italian" + }, + "italian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "italian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "italian_elision", + "lowercase", + "italian_stop", + "italian_keywords", + "italian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "italian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /italian-index/_analyze +{ + "field": "content", + "text": "Gli studenti studiano nelle università italiane. I loro numeri sono 123456." 
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {"token": "student","start_offset": 4,"end_offset": 12,"type": "","position": 1},
+    {"token": "studian","start_offset": 13,"end_offset": 21,"type": "","position": 2},
+    {"token": "universit","start_offset": 28,"end_offset": 38,"type": "","position": 4},
+    {"token": "italian","start_offset": 39,"end_offset": 47,"type": "","position": 5},
+    {"token": "numer","start_offset": 56,"end_offset": 62,"type": "","position": 8},
+    {"token": "123456","start_offset": 68,"end_offset": 74,"type": "","position": 10}
+  ]
+}
+```
\ No newline at end of file
diff --git a/_analyzers/language-analyzers/latvian.md b/_analyzers/language-analyzers/latvian.md
new file mode 100644
index 00000000000..2301759763c
--- /dev/null
+++ b/_analyzers/language-analyzers/latvian.md
@@ -0,0 +1,148 @@
+---
+layout: default
+title: Latvian
+parent: Language analyzers
+grand_parent: Analyzers
+nav_order: 230
+---
+
+# Latvian analyzer
+
+The built-in `latvian` analyzer can be applied to a text field using the following command:
+
+```json
+PUT /latvian-index
+{
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "latvian"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Stem exclusion
+
+You can use `stem_exclusion` with this language analyzer using the following command:
+
+```json
+PUT index_with_stem_exclusion_latvian_analyzer
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "stem_exclusion_latvian_analyzer": {
+          "type": "latvian",
+          "stem_exclusion": ["autoritāte", "apstiprinājums"]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Latvian analyzer internals
+
+The `latvian` analyzer is built using the following components:
+
+- Tokenizer: `standard`
+
+- Token filters:
+  - lowercase
+  - stop (Latvian)
+  - keyword
+  - stemmer (Latvian)
+
+## Custom Latvian analyzer
+
+You can create a custom Latvian analyzer using the following command:
+
+```json
+PUT /latvian-index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "latvian_stop": {
+          "type": "stop",
+          "stopwords": "_latvian_"
+        },
+        "latvian_stemmer": {
+          "type": "stemmer",
+          "language": "latvian"
+        },
+        "latvian_keywords": {
+          "type": "keyword_marker",
+          "keywords": []
+        }
+      },
+      "analyzer": {
+        "latvian_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "latvian_stop",
+            "latvian_keywords",
+            "latvian_stemmer"
+          ]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "latvian_analyzer"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+POST /latvian-index/_analyze
+{
+  "field": "content",
+  "text": "Studenti mācās Latvijas universitātēs. Viņu numuri ir 123456."
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 0,"end_offset": 8,"type": "","position": 0}, + {"token": "māc","start_offset": 9,"end_offset": 14,"type": "","position": 1}, + {"token": "latvij","start_offset": 15,"end_offset": 23,"type": "","position": 2}, + {"token": "universitāt","start_offset": 24,"end_offset": 37,"type": "","position": 3}, + {"token": "vin","start_offset": 39,"end_offset": 43,"type": "","position": 4}, + {"token": "numur","start_offset": 44,"end_offset": 50,"type": "","position": 5}, + {"token": "123456","start_offset": 54,"end_offset": 60,"type": "","position": 7} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/lithuanian.md b/_analyzers/language-analyzers/lithuanian.md new file mode 100644 index 00000000000..ca5966c54ec --- /dev/null +++ b/_analyzers/language-analyzers/lithuanian.md @@ -0,0 +1,136 @@ +--- +layout: default +title: Lithuanian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 230 +--- + +# Lithuanian analyzer + +The built-in `lithuanian` analyzer can be applied to a text field using the following command: + +```json +PUT /lithuanian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "lithuanian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_lithuanian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_lithuanian_analyzer": { + "type": "lithuanian", + "stem_exclusion": ["autoritetas", "patvirtinimas"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Lithuanian analyzer internals + +The `lithuanian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Lithuanian) + - keyword + - stemmer (Lithuanian) + +## Custom Lithuanian analyzer + +You can create a custom Lithuanian analyzer using the following command: + +```json +PUT /lithuanian-index +{ + "settings": { + "analysis": { + "filter": { + "lithuanian_stop": { + "type": "stop", + "stopwords": "_lithuanian_" + }, + "lithuanian_stemmer": { + "type": "stemmer", + "language": "lithuanian" + }, + "lithuanian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "lithuanian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "lithuanian_stop", + "lithuanian_keywords", + "lithuanian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "lithuanian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /lithuanian-index/_analyze +{ + "field": "content", + "text": "Studentai mokosi Lietuvos universitetuose. Jų numeriai yra 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 0,"end_offset": 9,"type": "","position": 0}, + {"token": "mok","start_offset": 10,"end_offset": 16,"type": "","position": 1}, + {"token": "lietuv","start_offset": 17,"end_offset": 25,"type": "","position": 2}, + {"token": "universitet","start_offset": 26,"end_offset": 41,"type": "","position": 3}, + {"token": "num","start_offset": 46,"end_offset": 54,"type": "","position": 5}, + {"token": "123456","start_offset": 59,"end_offset": 65,"type": "","position": 7} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/norwegian.md b/_analyzers/language-analyzers/norwegian.md new file mode 100644 index 00000000000..cfb04eebf3e --- /dev/null +++ b/_analyzers/language-analyzers/norwegian.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Norwegian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 240 +--- + +# Norwegian analyzer + +The built-in `norwegian` analyzer can be applied to a text field using the following command: + +```json +PUT /norwegian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "norwegian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_norwegian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_norwegian_analyzer": { + "type": "norwegian", + "stem_exclusion": ["autoritet", "godkjenning"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Norwegian analyzer internals + +The `norwegian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Norwegian) + - keyword + - stemmer (Norwegian) + +## Custom Norwegian analyzer + +You can create a custom Norwegian analyzer using the following command: + +```json +PUT /norwegian-index +{ + "settings": { + "analysis": { + "filter": { + "norwegian_stop": { + "type": "stop", + "stopwords": "_norwegian_" + }, + "norwegian_stemmer": { + "type": "stemmer", + "language": "norwegian" + }, + "norwegian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "norwegian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "norwegian_stop", + "norwegian_keywords", + "norwegian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "norwegian_analyzer" + } + } + } +} + +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /norwegian-index/_analyze +{ + "field": "content", + "text": "Studentene studerer ved norske universiteter. Deres nummer er 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 0,"end_offset": 10,"type": "","position": 0}, + {"token": "studer","start_offset": 11,"end_offset": 19,"type": "","position": 1}, + {"token": "norsk","start_offset": 24,"end_offset": 30,"type": "","position": 3}, + {"token": "universitet","start_offset": 31,"end_offset": 44,"type": "","position": 4}, + {"token": "numm","start_offset": 52,"end_offset": 58,"type": "","position": 6}, + {"token": "123456","start_offset": 62,"end_offset": 68,"type": "","position": 8} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/persian.md b/_analyzers/language-analyzers/persian.md new file mode 100644 index 00000000000..2e06f8d714a --- /dev/null +++ b/_analyzers/language-analyzers/persian.md @@ -0,0 +1,145 @@ +--- +layout: default +title: Persian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 250 +--- + +# Persian analyzer + +The built-in `persian` analyzer can be applied to a text field using the following command: + +```json +PUT /persian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "persian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_persian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_persian_analyzer": { + "type": "persian", + "stem_exclusion": ["حکومت", "تأیید"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Persian analyzer internals + +The `persian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Char filter: `mapping` + +- Token filters: + - lowercase + - decimal_digit + - normalization (Arabic) + - normalization (Persian) + - stop (Persian) + - keyword + +## Custom Persian analyzer + +You can create a custom Persian analyzer using the following command: + +```json +PUT /persian-index +{ + "settings": { + "analysis": { + "filter": { + "persian_stop": { + "type": "stop", + "stopwords": "_persian_" + }, + "persian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "char_filter": { + "null_width_replace_with_space": { + "type": "mapping", + "mappings": [ "\\u200C=>\\u0020"] + } + }, + "analyzer": { + "persian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "char_filter": [ "null_width_replace_with_space" ], + "filter": [ + "lowercase", + "decimal_digit", + "arabic_normalization", + "persian_normalization", + "persian_stop", + "persian_keywords" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "persian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /persian-index/_analyze +{ + "field": "content", + "text": "دانشجویان در دانشگاه‌های ایرانی تحصیل می‌کنند. شماره‌های آن‌ها ۱۲۳۴۵۶ است." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "دانشجويان","start_offset": 0,"end_offset": 9,"type": "","position": 0}, + {"token": "دانشگاه","start_offset": 13,"end_offset": 20,"type": "","position": 2}, + {"token": "ايراني","start_offset": 25,"end_offset": 31,"type": "","position": 4}, + {"token": "تحصيل","start_offset": 32,"end_offset": 37,"type": "","position": 5}, + {"token": "شماره","start_offset": 47,"end_offset": 52,"type": "","position": 8}, + {"token": "123456","start_offset": 63,"end_offset": 69,"type": "","position": 12} + ] +} +``` diff --git a/_analyzers/language-analyzers/portuguese.md b/_analyzers/language-analyzers/portuguese.md new file mode 100644 index 00000000000..166ffa0010d --- /dev/null +++ b/_analyzers/language-analyzers/portuguese.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Portuguese +parent: Language analyzers +grand_parent: Analyzers +nav_order: 260 +--- + +# Portuguese analyzer + +The built-in `portuguese` analyzer can be applied to a text field using the following command: + +```json +PUT /portuguese-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "portuguese" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_portuguese_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_portuguese_analyzer": { + "type": "portuguese", + "stem_exclusion": ["autoridade", "aprovação"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Portuguese analyzer internals + +The `portuguese` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Portuguese) + - keyword + - stemmer (Portuguese) + +## Custom Portuguese analyzer + +You can create a custom Portuguese analyzer using the following command: + +```json +PUT /portuguese-index +{ + "settings": { + "analysis": { + "filter": { + "portuguese_stop": { + "type": "stop", + "stopwords": "_portuguese_" + }, + "portuguese_stemmer": { + "type": "stemmer", + "language": "light_portuguese" + }, + "portuguese_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "portuguese_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "portuguese_stop", + "portuguese_keywords", + "portuguese_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "portuguese_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /portuguese-index/_analyze +{ + "field": "content", + "text": "Os estudantes estudam nas universidades brasileiras. Seus números são 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "estudant", + "start_offset": 3, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "estudam", + "start_offset": 14, + "end_offset": 21, + "type": "", + "position": 2 + }, + { + "token": "universidad", + "start_offset": 26, + "end_offset": 39, + "type": "", + "position": 4 + }, + { + "token": "brasileir", + "start_offset": 40, + "end_offset": 51, + "type": "", + "position": 5 + }, + { + "token": "numer", + "start_offset": 58, + "end_offset": 65, + "type": "", + "position": 7 + }, + { + "token": "123456", + "start_offset": 70, + "end_offset": 76, + "type": "", + "position": 9 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/romanian.md b/_analyzers/language-analyzers/romanian.md new file mode 100644 index 00000000000..cad0953385b --- /dev/null +++ b/_analyzers/language-analyzers/romanian.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Romanian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 270 +--- + +# Romanian analyzer + +The built-in `romanian` analyzer can be applied to a text field using the following command: + +```json +PUT /romanian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "romanian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_romanian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_romanian_analyzer": { + "type": "romanian", + "stem_exclusion": ["autoritate", "aprobat"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Romanian analyzer internals + +The `romanian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Romanian) + - keyword + - stemmer (Romanian) + +## Custom Romanian analyzer + +You can create a custom Romanian analyzer using the following command: + +```json +PUT /romanian-index +{ + "settings": { + "analysis": { + "filter": { + "romanian_stop": { + "type": "stop", + "stopwords": "_romanian_" + }, + "romanian_stemmer": { + "type": "stemmer", + "language": "romanian" + }, + "romanian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "romanian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "romanian_stop", + "romanian_keywords", + "romanian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "romanian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /romanian-index/_analyze +{ + "field": "content", + "text": "Studenții învață la universitățile din România. Numerele lor sunt 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "studenț", + "start_offset": 0, + "end_offset": 9, + "type": "", + "position": 0 + }, + { + "token": "învaț", + "start_offset": 10, + "end_offset": 16, + "type": "", + "position": 1 + }, + { + "token": "universităț", + "start_offset": 20, + "end_offset": 34, + "type": "", + "position": 3 + }, + { + "token": "român", + "start_offset": 39, + "end_offset": 46, + "type": "", + "position": 5 + }, + { + "token": "numer", + "start_offset": 48, + "end_offset": 56, + "type": "", + "position": 6 + }, + { + "token": "123456", + "start_offset": 66, + "end_offset": 72, + "type": "", + "position": 9 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/russian.md b/_analyzers/language-analyzers/russian.md new file mode 100644 index 00000000000..bd57ba0b275 --- /dev/null +++ b/_analyzers/language-analyzers/russian.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Russian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 280 +--- + +# Russian analyzer + +The built-in `russian` analyzer can be applied to a text field using the following command: + +```json +PUT /russian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "russian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_russian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_russian_analyzer": { + "type": "russian", + "stem_exclusion": ["авторитет", "одобрение"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Russian analyzer internals + +The `russian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Russian) + - keyword + - stemmer (Russian) + +## Custom Russian analyzer + +You can create a custom Russian analyzer using the following command: + +```json +PUT /russian-index +{ + "settings": { + "analysis": { + "filter": { + "russian_stop": { + "type": "stop", + "stopwords": "_russian_" + }, + "russian_stemmer": { + "type": "stemmer", + "language": "russian" + }, + "russian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "russian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "russian_stop", + "russian_keywords", + "russian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "russian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /russian-index/_analyze +{ + "field": "content", + "text": "Студенты учатся в университетах России. Их номера 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "студент", + "start_offset": 0, + "end_offset": 8, + "type": "", + "position": 0 + }, + { + "token": "учат", + "start_offset": 9, + "end_offset": 15, + "type": "", + "position": 1 + }, + { + "token": "университет", + "start_offset": 18, + "end_offset": 31, + "type": "", + "position": 3 + }, + { + "token": "росс", + "start_offset": 32, + "end_offset": 38, + "type": "", + "position": 4 + }, + { + "token": "номер", + "start_offset": 43, + "end_offset": 49, + "type": "", + "position": 6 + }, + { + "token": "123456", + "start_offset": 50, + "end_offset": 56, + "type": "", + "position": 7 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/sorani.md b/_analyzers/language-analyzers/sorani.md new file mode 100644 index 00000000000..f71d43c4810 --- /dev/null +++ b/_analyzers/language-analyzers/sorani.md @@ -0,0 +1,168 @@ +--- +layout: default +title: Sorani +parent: Language analyzers +grand_parent: Analyzers +nav_order: 290 +--- + +# Sorani analyzer + +The built-in `sorani` analyzer can be applied to a text field using the following command: + +```json +PUT /sorani-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "sorani" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_sorani_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_sorani_analyzer": { + "type": "sorani", + "stem_exclusion": ["مؤسسه", "اجازه"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Sorani analyzer internals + +The `sorani` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - normalization (Sorani) + - lowercase + - decimal_digit + - stop (Sorani) + - keyword + - stemmer (Sorani) + +## Custom Sorani analyzer + +You can create a custom Sorani analyzer using the following command: + +```json +PUT /sorani-index +{ + "settings": { + "analysis": { + "filter": { + "sorani_stop": { + "type": "stop", + "stopwords": "_sorani_" + }, + "sorani_stemmer": { + "type": "stemmer", + "language": "sorani" + }, + "sorani_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "sorani_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "decimal_digit", + "sorani_stop", + "sorani_keywords", + "sorani_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "sorani_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /sorani-index/_analyze +{ + "field": "content", + "text": "خوێندنی فەرمی لە هەولێرەوە. ژمارەکان ١٢٣٤٥٦." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "خوێندن", + "start_offset": 0, + "end_offset": 7, + "type": "", + "position": 0 + }, + { + "token": "فەرم", + "start_offset": 8, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "هەولێر", + "start_offset": 17, + "end_offset": 26, + "type": "", + "position": 3 + }, + { + "token": "ژمار", + "start_offset": 28, + "end_offset": 36, + "type": "", + "position": 4 + }, + { + "token": "123456", + "start_offset": 37, + "end_offset": 43, + "type": "", + "position": 5 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/spanish.md b/_analyzers/language-analyzers/spanish.md new file mode 100644 index 00000000000..8a0d8fad3cb --- /dev/null +++ b/_analyzers/language-analyzers/spanish.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Spanish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 300 +--- + +# Spanish analyzer + +The built-in `spanish` analyzer can be applied to a text field using the following command: + +```json +PUT /spanish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "spanish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_spanish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_spanish_analyzer": { + "type": "spanish", + "stem_exclusion": ["autoridad", "aprobación"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Spanish analyzer internals + +The `spanish` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Spanish) + - keyword + - stemmer (Spanish) + +## Custom Spanish analyzer + +You can create a custom Spanish analyzer using the following command: + +```json +PUT /spanish-index +{ + "settings": { + "analysis": { + "filter": { + "spanish_stop": { + "type": "stop", + "stopwords": "_spanish_" + }, + "spanish_stemmer": { + "type": "stemmer", + "language": "light_spanish" + }, + "spanish_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "spanish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "spanish_stop", + "spanish_keywords", + "spanish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "spanish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /spanish-index/_analyze +{ + "field": "content", + "text": "Los estudiantes estudian en universidades españolas. Sus números son 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "estudiant", + "start_offset": 4, + "end_offset": 15, + "type": "", + "position": 1 + }, + { + "token": "estudian", + "start_offset": 16, + "end_offset": 24, + "type": "", + "position": 2 + }, + { + "token": "universidad", + "start_offset": 28, + "end_offset": 41, + "type": "", + "position": 4 + }, + { + "token": "español", + "start_offset": 42, + "end_offset": 51, + "type": "", + "position": 5 + }, + { + "token": "numer", + "start_offset": 57, + "end_offset": 64, + "type": "", + "position": 7 + }, + { + "token": "123456", + "start_offset": 69, + "end_offset": 75, + "type": "", + "position": 9 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/swedish.md b/_analyzers/language-analyzers/swedish.md new file mode 100644 index 00000000000..9da595f12e0 --- /dev/null +++ b/_analyzers/language-analyzers/swedish.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Swedish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 310 +--- + +# Swedish analyzer + +The built-in `swedish` analyzer can be applied to a text field using the following command: + +```json +PUT /swedish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "swedish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_swedish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_swedish_analyzer": { + "type": "swedish", + "stem_exclusion": ["myndighet", "godkännande"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Swedish analyzer internals + +The `swedish` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Swedish) + - keyword + - stemmer (Swedish) + +## Custom Swedish analyzer + +You can create a custom Swedish analyzer using the following command: + +```json +PUT /swedish-index +{ + "settings": { + "analysis": { + "filter": { + "swedish_stop": { + "type": "stop", + "stopwords": "_swedish_" + }, + "swedish_stemmer": { + "type": "stemmer", + "language": "swedish" + }, + "swedish_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "swedish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "swedish_stop", + "swedish_keywords", + "swedish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "swedish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /swedish-index/_analyze +{ + "field": "content", + "text": "Studenter studerar vid svenska universitet. Deras nummer är 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "student", + "start_offset": 0, + "end_offset": 9, + "type": "", + "position": 0 + }, + { + "token": "studer", + "start_offset": 10, + "end_offset": 18, + "type": "", + "position": 1 + }, + { + "token": "svensk", + "start_offset": 23, + "end_offset": 30, + "type": "", + "position": 3 + }, + { + "token": "universitet", + "start_offset": 31, + "end_offset": 42, + "type": "", + "position": 4 + }, + { + "token": "numm", + "start_offset": 50, + "end_offset": 56, + "type": "", + "position": 6 + }, + { + "token": "123456", + "start_offset": 60, + "end_offset": 66, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/thai.md b/_analyzers/language-analyzers/thai.md new file mode 100644 index 00000000000..e4daa1f0be2 --- /dev/null +++ b/_analyzers/language-analyzers/thai.md @@ -0,0 +1,132 @@ +--- +layout: default +title: Thai +parent: Language analyzers +grand_parent: Analyzers +nav_order: 320 +--- + +# Thai analyzer + +The built-in `thai` analyzer can be applied to a text field using the following command: + +```json +PUT /thai-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "thai" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_thai_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_thai_analyzer": { + "type": "thai", + "stem_exclusion": ["อำนาจ", "การอนุมัติ"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Thai analyzer internals + +The `thai` analyzer is built using the following components: + +- Tokenizer: `thai` + +- Token filters: + - lowercase + - decimal_digit + - stop (Thai) + - keyword + +## Custom Thai analyzer + +You can create a custom Thai analyzer using the following command: + +```json +PUT /thai-index +{ + "settings": { + "analysis": { + "filter": { + "thai_stop": { + "type": "stop", + "stopwords": "_thai_" + }, + "thai_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "thai_analyzer": { + "tokenizer": "thai", + "filter": [ + "lowercase", + "decimal_digit", + "thai_stop", + "thai_keywords" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "thai_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /thai-index/_analyze +{ + "field": "content", + "text": "นักเรียนกำลังศึกษาอยู่ที่มหาวิทยาลัยไทย หมายเลข 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "นักเรียน","start_offset": 0,"end_offset": 8,"type": "word","position": 0}, + {"token": "กำลัง","start_offset": 8,"end_offset": 13,"type": "word","position": 1}, + {"token": "ศึกษา","start_offset": 13,"end_offset": 18,"type": "word","position": 2}, + {"token": "มหาวิทยาลัย","start_offset": 25,"end_offset": 36,"type": "word","position": 5}, + {"token": "ไทย","start_offset": 36,"end_offset": 39,"type": "word","position": 6}, + {"token": "หมายเลข","start_offset": 40,"end_offset": 47,"type": "word","position": 7}, + {"token": "123456","start_offset": 48,"end_offset": 54,"type": "word","position": 8} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/turkish.md b/_analyzers/language-analyzers/turkish.md new file mode 100644 index 00000000000..fb36c5413ca --- /dev/null +++ b/_analyzers/language-analyzers/turkish.md @@ -0,0 +1,143 @@ +--- +layout: default +title: Turkish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 330 +--- + +# Turkish analyzer + +The built-in `turkish` analyzer can be applied to a text field using the following command: + +```json +PUT /turkish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "turkish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_turkish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_turkish_analyzer": { + "type": "turkish", + "stem_exclusion": ["otorite", "onay"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Turkish analyzer internals + +The `turkish` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - apostrophe + - lowercase (Turkish) + - stop (Turkish) + - keyword + - stemmer (Turkish) + +## Custom Turkish analyzer + +You can create a custom Turkish analyzer using the following command: + +```json +PUT /turkish-index +{ + "settings": { + "analysis": { + "filter": { + "turkish_stop": { + "type": "stop", + "stopwords": "_turkish_" + }, + "turkish_stemmer": { + "type": "stemmer", + "language": "turkish" + }, + "turkish_lowercase": { + "type": "lowercase", + "language": "turkish" + }, + "turkish_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "turkish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "apostrophe", + "turkish_lowercase", + "turkish_stop", + "turkish_keywords", + "turkish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "turkish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /turkish-index/_analyze +{ + "field": "content", + "text": "Öğrenciler Türk üniversitelerinde öğrenim görüyor. Numara 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "öğrenci","start_offset": 0,"end_offset": 10,"type": "","position": 0}, + {"token": "türk","start_offset": 11,"end_offset": 15,"type": "","position": 1}, + {"token": "üniversite","start_offset": 16,"end_offset": 33,"type": "","position": 2}, + {"token": "öğre","start_offset": 34,"end_offset": 41,"type": "","position": 3}, + {"token": "görüyor","start_offset": 42,"end_offset": 49,"type": "","position": 4}, + {"token": "numar","start_offset": 51,"end_offset": 57,"type": "","position": 5}, + {"token": "123456","start_offset": 58,"end_offset": 64,"type": "","position": 6} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/normalizers.md b/_analyzers/normalizers.md index b89659f814b..52841d25716 100644 --- a/_analyzers/normalizers.md +++ b/_analyzers/normalizers.md @@ -1,7 +1,7 @@ --- layout: default title: Normalizers -nav_order: 100 +nav_order: 110 --- # Normalizers diff --git a/_analyzers/search-analyzers.md b/_analyzers/search-analyzers.md index 52159edb70f..80a0852a62a 100644 --- a/_analyzers/search-analyzers.md +++ b/_analyzers/search-analyzers.md @@ -22,14 +22,12 @@ To determine which analyzer to use for a query string at query time, OpenSearch In most cases, specifying a search analyzer that is different from the index analyzer is not necessary and could negatively impact search result relevance or lead to unexpected search results. {: .warning} -For information about verifying which analyzer is associated with which field, see [Verifying analyzer settings]({{site.url}}{{site.baseurl}}/analyzers/index/#verifying-analyzer-settings). +## Specifying a search analyzer at query time -## Specifying a search analyzer for a query string - -Specify the name of the analyzer you want to use at query time in the `analyzer` field: +You can override the default analyzer behavior by explicitly setting the analyzer in the query. The following query uses the `english` analyzer to stem the input terms: ```json -GET shakespeare/_search +GET /shakespeare/_search { "query": { "match": { @@ -43,16 +41,16 @@ GET shakespeare/_search ``` {% include copy-curl.html %} -For more information about supported analyzers, see [Analyzers]({{site.url}}{{site.baseurl}}/analyzers/supported-analyzers/index/). +## Specifying a search analyzer in the mappings -## Specifying a search analyzer for a field +When defining mappings, you can provide both the `analyzer` (used at index time) and `search_analyzer` (used at query time) for any [`text`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/text/) field. -When creating index mappings, you can provide the `search_analyzer` parameter for each [text]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/text/) field. When providing the `search_analyzer`, you must also provide the `analyzer` parameter, which specifies the [index analyzer]({{site.url}}{{site.baseurl}}/analyzers/index-analyzers/) to be used at indexing time. 
+### Example: Different analyzers for indexing and search -For example, the following request specifies the `simple` analyzer as the index analyzer and the `whitespace` analyzer as the search analyzer for the `text_entry` field: +The following configuration allows different tokenization strategies for indexing and querying: ```json -PUT testindex +PUT /testindex { "mappings": { "properties": { @@ -67,14 +65,100 @@ PUT testindex ``` {% include copy-curl.html %} -## Specifying the default search analyzer for an index +### Example: Using the edge n-gram analyzer for indexing and the standard analyzer for search -If you want to analyze all query strings at search time with the same analyzer, you can specify the search analyzer in the `analysis.analyzer.default_search` setting. When providing the `analysis.analyzer.default_search`, you must also provide the `analysis.analyzer.default` parameter, which specifies the [index analyzer]({{site.url}}{{site.baseurl}}/analyzers/index-analyzers/) to be used at indexing time. +The following configuration enables [autocomplete]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/autocomplete/)-like behavior, where you can type the beginning of a word and still receive relevant matches: -For example, the following request specifies the `simple` analyzer as the index analyzer and the `whitespace` analyzer as the search analyzer for the `testindex` index: +```json +PUT /articles +{ + "settings": { + "analysis": { + "analyzer": { + "edge_ngram_analyzer": { + "tokenizer": "edge_ngram_tokenizer", + "filter": ["lowercase"] + } + }, + "tokenizer": { + "edge_ngram_tokenizer": { + "type": "edge_ngram", + "min_gram": 2, + "max_gram": 10, + "token_chars": ["letter", "digit"] + } + } + } + }, + "mappings": { + "properties": { + "title": { + "type": "text", + "analyzer": "edge_ngram_analyzer", + "search_analyzer": "standard" + } + } + } +} +``` +{% include copy-curl.html %} + +The `edge_ngram_analyzer` is applied at index time, breaking input strings into partial prefixes (n-grams), which allows the index to store fragments like "se", "sea", "sear", and so on. +Use the following request to index a document: ```json -PUT testindex +PUT /articles/_doc/1 +{ + "title": "Search Analyzer in Action" +} +``` +{% include copy-curl.html %} + +Use the following request to search for the partial word `sear` in the `title` field: + +```json +POST /articles/_search +{ + "query": { + "match": { + "title": "sear" + } + } +} +``` +{% include copy-curl.html %} + +The response demonstrates that the query containing "sear" matches the document "Search Analyzer in Action" because the n-gram tokens generated at index time include that prefix. This mirrors the [autocomplete functionality]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/autocomplete/), in which typing a prefix can retrieve full matches: + +```json +{ + ... + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.2876821, + "hits": [ + { + "_index": "articles", + "_id": "1", + "_score": 0.2876821, + "_source": { + "title": "Search Analyzer in Action" + } + } + ] + } +} +``` + +## Setting a default search analyzer for an index + +Specify `analysis.analyzer.default_search` to define a search analyzer for all fields unless overridden: + +```json +PUT /testindex { "settings": { "analysis": { @@ -89,6 +173,9 @@ PUT testindex } } } - ``` {% include copy-curl.html %} + +This configuration ensures consistent behavior across multiple fields, especially when using custom analyzers. 
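+
+To confirm which analyzers an index is configured to use, you can retrieve the index settings for the `testindex` index created in the preceding example:
+
+```json
+GET /testindex/_settings
+```
+{% include copy-curl.html %}
+
+The response includes the `analysis.analyzer.default` and `analysis.analyzer.default_search` settings specified when the index was created.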
+ +For more information about supported analyzers, see [Analyzers]({{site.url}}{{site.baseurl}}/analyzers/supported-analyzers/index/). diff --git a/_analyzers/stemming.md b/_analyzers/stemming.md new file mode 100644 index 00000000000..b3e39e58e34 --- /dev/null +++ b/_analyzers/stemming.md @@ -0,0 +1,162 @@ +--- +layout: default +title: Stemming +nav_order: 140 +--- + +# Stemming + +Stemming is the process of reducing words to their root or base form, known as the _stem_. This technique ensures that different variations of a word are matched during search operations. For example, the words "running", "runner", and "ran" can all be reduced to the stem "run", allowing searches for any of these terms to return relevant results. + +In natural language, words often appear in various forms because of conjugation, pluralization, or derivation. Stemming improves search operations in the following ways: + +- **Improves search recall**: By matching different word forms to a common stem, stemming increases the number of relevant documents retrieved. +- **Reduces index size**: Storing only the stemmed versions of words can decrease the overall size of the search index. + +Stemming is configured using token filters within [analyzers]({{site.url}}{{site.baseurl}}/analyzers/#analyzers). An analyzer comprises the following components: + +1. **Character filters**: Modify the stream of characters before tokenization. +2. **Tokenizer**: Splits text into tokens (typically, words). +3. **Token filters**: Modify tokens after tokenization, for example, by applying stemming. + +## Stemming example using built-in token filters + +To implement stemming, you can configure a built-in token filter such as a [`porter_stem`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/porter-stem/) or [`kstem`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/kstem/) filter. + +The [Porter stemming algorithm](https://snowballstem.org/algorithms/porter/stemmer.html) is a common algorithmic stemmer used for the English language. + +### Creating an index with a custom analyzer + +The following example request creates a new index named `my_stemming_index` and configures an analyzer with the [`porter_stem`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/porter-stem/) token filter: + +```json +PUT /my_stemming_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_stemmer_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "porter_stem" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +This configuration comprises the following: + +- The [`standard`]({{site.url}}{{site.baseurl}}/analyzers/tokenizers/standard/) tokenizer splits text into terms based on word boundaries. +- The [`lowercase`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/lowercase/) filter converts all tokens to lowercase. +- The [`porter_stem`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/porter-stem/) filter reduces words to their root form. + +### Testing the analyzer + +To examine the stemming action, analyze a sample text using the previously configured custom analyzer: + +```json +POST /my_stemming_index/_analyze +{ + "analyzer": "my_stemmer_analyzer", + "text": "The runners are running swiftly." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "the", + "start_offset": 0, + "end_offset": 3, + "type": "", + "position": 0 + }, + { + "token": "runner", + "start_offset": 4, + "end_offset": 11, + "type": "", + "position": 1 + }, + { + "token": "ar", + "start_offset": 12, + "end_offset": 15, + "type": "", + "position": 2 + }, + { + "token": "run", + "start_offset": 16, + "end_offset": 23, + "type": "", + "position": 3 + }, + { + "token": "swiftli", + "start_offset": 24, + "end_offset": 31, + "type": "", + "position": 4 + } + ] +} +``` + +## Stemmer categories + +You can configure stemmers belonging to the following two categories: + +- [Algorithmic stemmers]({{site.url}}{{site.baseurl}}/analyzers/stemming/#algorithmic-stemmers) +- [Dictionary stemmers]({{site.url}}{{site.baseurl}}/analyzers/stemming/#dictionary-stemmers) + +### Algorithmic stemmers + +Algorithmic stemmers apply predefined rules to systematically strip affixes (prefixes and suffixes) from words, reducing them to their stems. The following token filters use algorithmic stemmers: + +- [`porter_stem`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/porter-stem/): Applies the Porter stemming algorithm to remove common suffixes and reduce words to their stems. For example, "running" becomes "run". + +- [`kstem`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/kstem/): A lightweight stemmer designed for the English langugage that combines algorithmic stemming with a built-in dictionary. It reduces plurals to singulars, converts verb tenses to their base forms, and removes common derivational endings. + + +- [`stemmer`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/stemmer/): Provides algorithmic stemming for various languages, including English, with options for different stemming algorithms like `light_english`, `minimal_english`, and `porter2`. + + +- [`snowball`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/snowball/): Applies the Snowball algorithm to provide efficient and accurate stemming for multiple languages, including English, French, German, and others. + +### Dictionary stemmers + +Dictionary stemmers rely on extensive dictionaries to map words to their root forms, effectively stemming irregular words. They look up each word in a precompiled list to find its corresponding stem. This operation is more resource intensive but often yields better results for irregular words and words that might appear to have a similar stem but are very different in their meaning. + +The most prominent example of a dictionary stemmer is the [`hunspell`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/hunspell/) token filter, which uses Hunspell---a spell checker engine used in many open-source applications. + +### Considerations +When selecting a stemmer, take note of the following considerations: + +- Algorithmic stemmers are suitable when processing speed and memory efficiency are priorities and the language has relatively regular morphological patterns. +- Dictionary stemmers are ideal when accuracy in handling irregular word forms is crucial and resources are available to support the increased memory usage and processing time. + + +### Additional stemming configuration + +Although "organize" and "organic" share a common linguistic root, leading a stemmer to produce "organ" for both, their conceptual differences are significant. 
In practical search scenarios, this shared root can lead to irrelevant matches being returned in search results. + +You can address these challenges by using the following methods: + +- **Explicit stemming overrides**: Rather than relying solely on algorithmic stemming, you can define specific stemming rules. Using [`stemmer_override`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/stemmer-override/) allows you to ensure that "organize" remains unchanged while "organic" is reduced to "organ." This provides granular control over the final form of terms. + +- **Keyword preservation**: To maintain the integrity of important terms, you can use the [`keyword_marker`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/keyword-marker/) token filter. This filter designates specific words as keywords, preventing subsequent stemmer filters from altering them. In this example, you can mark "organize" as a keyword, ensuring that it is indexed exactly as it appears. + +- **Conditional stemming control**: The [condition]({{site.url}}{{site.baseurl}}/analyzers/token-filters/condition/) token filter enables you to establish rules that determine whether a term should be stemmed. These can be based on various criteria, such as the term's presence in a predefined list. + +- **Language-specific term exclusion**: For built-in language analyzers, the [`stem_exclusion`]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/english/#stem-exclusion) parameter provides a way to specify words that should be exempt from stemming. For example, you can add "organize" to the `stem_exclusion` list, preventing the analyzer from stemming it. This can be useful for preserving the distinct meaning of specific terms within a given language. diff --git a/_analyzers/supported-analyzers/dl-model-analyzers.md b/_analyzers/supported-analyzers/dl-model-analyzers.md new file mode 100644 index 00000000000..5109234061d --- /dev/null +++ b/_analyzers/supported-analyzers/dl-model-analyzers.md @@ -0,0 +1,56 @@ +--- +layout: default +title: DL model analyzers +parent: Analyzers +nav_order: 130 +--- + +# DL model analyzers + +Deep learning (DL) model analyzers are designed to work with [neural sparse search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/neural-sparse-search/). They implement the same tokenization rules used by machine learning (ML) models, ensuring compatibility with neural sparse search. While traditional OpenSearch analyzers use standard rule-based tokenization (like white space or word boundaries), DL model analyzers use tokenization rules that match specific ML models (like BERT's WordPiece tokenization scheme). This consistent tokenization between indexed documents and search queries is essential for neural sparse search to work correctly. + +OpenSearch supports the following DL model analyzers: + +* [`bert-uncased`](#the-bert-uncased-analyzer): An analyzer based on the [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) model tokenizer. +* [`mbert-uncased`](#the-mbert-uncased-analyzer): A multilingual analyzer based on the [google-bert/bert-base-multilingual-uncased](https://huggingface.co/google-bert/bert-base-multilingual-uncased) model tokenizer. + +## Usage considerations + +When using the DL model analyzers, keep the following considerations in mind: + +* These analyzers use lazy loading. The first call to these analyzers may take longer because dependencies and related resources are loaded. 
+* The tokenizers follow the same rules as their corresponding model tokenizers. + +## The bert-uncased analyzer + +The `bert-uncased` analyzer is based on the [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) model and tokenizes text according to BERT's WordPiece tokenization scheme. This analyzer is particularly useful for English language text. + +To analyze text with the `bert-uncased` analyzer, specify it in the `analyzer` field: + +```json +POST /_analyze +{ + "analyzer": "bert-uncased", + "text": "It's fun to contribute to OpenSearch!" +} +``` +{% include copy-curl.html %} + +## The mbert-uncased analyzer + +The `mbert-uncased` analyzer is based on the [google-bert/bert-base-multilingual-uncased](https://huggingface.co/google-bert/bert-base-multilingual-uncased) model, which supports tokenization across multiple languages. This makes it suitable for applications dealing with multilingual content. + +To analyze multilingual text, specify the `mbert-uncased` analyzer in the request: + +```json +POST /_analyze +{ + "analyzer": "mbert-uncased", + "text": "It's fun to contribute to OpenSearch!" +} +``` +{% include copy-curl.html %} + +## Example + +For a complete example of using DL model analyzers in neural sparse search queries, see [Generating sparse vector embeddings automatically]({{site.url}}{{site.baseurl}}/vector-search/ai-search/neural-sparse-with-pipelines/). \ No newline at end of file diff --git a/_analyzers/supported-analyzers/fingerprint.md b/_analyzers/supported-analyzers/fingerprint.md new file mode 100644 index 00000000000..267e16c0392 --- /dev/null +++ b/_analyzers/supported-analyzers/fingerprint.md @@ -0,0 +1,115 @@ +--- +layout: default +title: Fingerprint analyzer +parent: Analyzers +nav_order: 60 +--- + +# Fingerprint analyzer + +The `fingerprint` analyzer creates a text fingerprint. The analyzer sorts and deduplicates the terms (tokens) generated from the input and then concatenates them using a separator. It is commonly used for data deduplication because it produces the same output for similar inputs containing the same words, regardless of word order. + +The `fingerprint` analyzer comprises the following components: + +- Standard tokenizer +- Lowercase token filter +- ASCII folding token filter +- Stop token filter +- Fingerprint token filter + +## Parameters + +The `fingerprint` analyzer can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`separator` | Optional | String | Specifies the character used to concatenate the terms after they have been tokenized, sorted, and deduplicated. Default is an empty space (` `). +`max_output_size` | Optional | Integer | Defines the maximum size of the output token. If the concatenated fingerprint exceeds this size, it will be truncated. Default is `255`. +`stopwords` | Optional | String or list of strings | A custom or predefined list of stopwords. Default is `_none_`. +`stopwords_path` | Optional | String | The path (absolute or relative to the config directory) to the file containing a list of stopwords. 
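+
+All of these parameters are optional. With the default settings, you can observe the analyzer's behavior by calling the Analyze API directly; the example text below is illustrative:
+
+```json
+POST /_analyze
+{
+  "analyzer": "fingerprint",
+  "text": "OpenSearch is fast and OpenSearch is scalable"
+}
+```
+{% include copy-curl.html %}
+
+Because the terms are lowercased, sorted, and deduplicated before being concatenated, this request should return a single token similar to `and fast is opensearch scalable`.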
+ + +## Example + +Use the following command to create an index named `my_custom_fingerprint_index` with a `fingerprint` analyzer: + +```json +PUT /my_custom_fingerprint_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_custom_fingerprint_analyzer": { + "type": "fingerprint", + "separator": "-", + "max_output_size": 50, + "stopwords": ["to", "the", "over", "and"] + } + } + } + }, + "mappings": { + "properties": { + "my_field": { + "type": "text", + "analyzer": "my_custom_fingerprint_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_custom_fingerprint_index/_analyze +{ + "analyzer": "my_custom_fingerprint_analyzer", + "text": "The slow turtle swims over to the dog" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "dog-slow-swims-turtle", + "start_offset": 0, + "end_offset": 37, + "type": "fingerprint", + "position": 0 + } + ] +} +``` + +## Further customization + +If further customization is needed, you can define an analyzer with additional `fingerprint` analyzer components: + +```json +PUT /custom_fingerprint_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "custom_fingerprint": { + "tokenizer": "standard", + "filter": [ + "lowercase", + "asciifolding", + "fingerprint" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} diff --git a/_analyzers/supported-analyzers/index.md b/_analyzers/supported-analyzers/index.md index 56169361792..394acc1a260 100644 --- a/_analyzers/supported-analyzers/index.md +++ b/_analyzers/supported-analyzers/index.md @@ -5,7 +5,7 @@ nav_order: 40 has_children: true has_toc: false redirect_from: - - /analyzers/supported-analyzers/index/ + - /analyzers/supported-analyzers/ --- # Analyzers @@ -18,18 +18,19 @@ The following table lists the built-in analyzers that OpenSearch provides. The l Analyzer | Analysis performed | Analyzer output :--- | :--- | :--- -**Standard** (default) | - Parses strings into tokens at word boundaries
- Removes most punctuation
- Converts tokens to lowercase | [`it’s`, `fun`, `to`, `contribute`, `a`,`brand`, `new`, `pr`, `or`, `2`, `to`, `opensearch`] -**Simple** | - Parses strings into tokens on any non-letter character
- Removes non-letter characters
- Converts tokens to lowercase | [`it`, `s`, `fun`, `to`, `contribute`, `a`,`brand`, `new`, `pr`, `or`, `to`, `opensearch`] -**Whitespace** | - Parses strings into tokens on white space | [`It’s`, `fun`, `to`, `contribute`, `a`,`brand-new`, `PR`, `or`, `2`, `to`, `OpenSearch!`] -**Stop** | - Parses strings into tokens on any non-letter character
- Removes non-letter characters
- Removes stop words
- Converts tokens to lowercase | [`s`, `fun`, `contribute`, `brand`, `new`, `pr`, `opensearch`] -**Keyword** (no-op) | - Outputs the entire string unchanged | [`It’s fun to contribute a brand-new PR or 2 to OpenSearch!`] -**Pattern** | - Parses strings into tokens using regular expressions
- Supports converting strings to lowercase
- Supports removing stop words | [`it`, `s`, `fun`, `to`, `contribute`, `a`,`brand`, `new`, `pr`, `or`, `2`, `to`, `opensearch`] -[**Language**]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/) | Performs analysis specific to a certain language (for example, `english`). | [`fun`, `contribut`, `brand`, `new`, `pr`, `2`, `opensearch`] -**Fingerprint** | - Parses strings on any non-letter character
- Normalizes characters by converting them to ASCII
- Converts tokens to lowercase
- Sorts, deduplicates, and concatenates tokens into a single token
- Supports removing stop words | [`2 a brand contribute fun it's new opensearch or pr to`]
Note that the apostrophe was converted to its ASCII counterpart. +[**Standard**]({{site.url}}{{site.baseurl}}/analyzers/supported-analyzers/standard/) (default) | - Parses strings into tokens at word boundaries
- Removes most punctuation
- Converts tokens to lowercase | [`it’s`, `fun`, `to`, `contribute`, `a`,`brand`, `new`, `pr`, `or`, `2`, `to`, `opensearch`] +[**Simple**]({{site.url}}{{site.baseurl}}/analyzers/supported-analyzers/simple/) | - Parses strings into tokens on any non-letter character
- Removes non-letter characters
- Converts tokens to lowercase | [`it`, `s`, `fun`, `to`, `contribute`, `a`,`brand`, `new`, `pr`, `or`, `to`, `opensearch`] +[**Whitespace**]({{site.url}}{{site.baseurl}}/analyzers/supported-analyzers/whitespace/) | - Parses strings into tokens on white space | [`It’s`, `fun`, `to`, `contribute`, `a`,`brand-new`, `PR`, `or`, `2`, `to`, `OpenSearch!`] +[**Stop**]({{site.url}}{{site.baseurl}}/analyzers/supported-analyzers/stop/) | - Parses strings into tokens on any non-letter character
- Removes non-letter characters
- Removes stop words
- Converts tokens to lowercase | [`s`, `fun`, `contribute`, `brand`, `new`, `pr`, `opensearch`] +[**Keyword**]({{site.url}}{{site.baseurl}}/analyzers/supported-analyzers/keyword/) (no-op) | - Outputs the entire string unchanged | [`It’s fun to contribute a brand-new PR or 2 to OpenSearch!`] +[**Pattern**]({{site.url}}{{site.baseurl}}/analyzers/supported-analyzers/pattern/)| - Parses strings into tokens using regular expressions
- Supports converting strings to lowercase
- Supports removing stop words | [`it`, `s`, `fun`, `to`, `contribute`, `a`,`brand`, `new`, `pr`, `or`, `2`, `to`, `opensearch`] +[**Language**]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/index/) | Performs analysis specific to a certain language (for example, `english`). | [`fun`, `contribut`, `brand`, `new`, `pr`, `2`, `opensearch`] +[**Fingerprint**]({{site.url}}{{site.baseurl}}/analyzers/supported-analyzers/fingerprint/) | - Parses strings on any non-letter character
- Normalizes characters by converting them to ASCII
- Converts tokens to lowercase
- Sorts, deduplicates, and concatenates tokens into a single token
- Supports removing stop words | [`2 a brand contribute fun it's new opensearch or pr to`]
Note that the apostrophe was converted to its ASCII counterpart. +[**DL model**]({{site.url}}{{site.baseurl}}/analyzers/supported-analyzers/dl-model-analyzers/) | Use ML model tokenization rules for [neural sparse search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/neural-sparse-search/). | Model-based tokens ## Language analyzers -OpenSearch supports multiple language analyzers. For more information, see [Language analyzers]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/). +OpenSearch supports multiple language analyzers. For more information, see [Language analyzers]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/index/). ## Additional analyzers @@ -37,5 +38,6 @@ The following table lists the additional analyzers that OpenSearch supports. | Analyzer | Analysis performed | |:---------------|:---------------------------------------------------------------------------------------------------------| -| `phone` | An [index analyzer]({{site.url}}{{site.baseurl}}/analyzers/index-analyzers/) for parsing phone numbers. | -| `phone-search` | A [search analyzer]({{site.url}}{{site.baseurl}}/analyzers/search-analyzers/) for parsing phone numbers. | +| [`phone`]({{site.url}}{{site.baseurl}}/analyzers/supported-analyzers/phone-analyzers/#the-phone-analyzer) | An [index analyzer]({{site.url}}{{site.baseurl}}/analyzers/index-analyzers/) for parsing phone numbers. | +| [`phone-search`]({{site.url}}{{site.baseurl}}/analyzers/supported-analyzers/phone-analyzers/#the-phone-search-analyzer) | A [search analyzer]({{site.url}}{{site.baseurl}}/analyzers/search-analyzers/) for parsing phone numbers. | + diff --git a/_analyzers/supported-analyzers/keyword.md b/_analyzers/supported-analyzers/keyword.md new file mode 100644 index 00000000000..00c314d0c49 --- /dev/null +++ b/_analyzers/supported-analyzers/keyword.md @@ -0,0 +1,78 @@ +--- +layout: default +title: Keyword analyzer +parent: Analyzers +nav_order: 80 +--- + +# Keyword analyzer + +The `keyword` analyzer doesn't tokenize text at all. Instead, it treats the entire input as a single token and does not break it into individual tokens. The `keyword` analyzer is often used for fields containing email addresses, URLs, or product IDs and in other cases where tokenization is not desirable. 
+ +## Example + +Use the following command to create an index named `my_keyword_index` with a `keyword` analyzer: + +```json +PUT /my_keyword_index +{ + "mappings": { + "properties": { + "my_field": { + "type": "text", + "analyzer": "keyword" + } + } + } +} +``` +{% include copy-curl.html %} + +## Configuring a custom analyzer + +Use the following command to configure an index with a custom analyzer that is equivalent to the `keyword` analyzer: + +```json +PUT /my_custom_keyword_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_keyword_analyzer": { + "tokenizer": "keyword" + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_custom_keyword_index/_analyze +{ + "analyzer": "my_keyword_analyzer", + "text": "Just one token" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "Just one token", + "start_offset": 0, + "end_offset": 14, + "type": "word", + "position": 0 + } + ] +} +``` diff --git a/_analyzers/supported-analyzers/pattern.md b/_analyzers/supported-analyzers/pattern.md new file mode 100644 index 00000000000..bc3cb9a3060 --- /dev/null +++ b/_analyzers/supported-analyzers/pattern.md @@ -0,0 +1,97 @@ +--- +layout: default +title: Pattern analyzer +parent: Analyzers +nav_order: 90 +--- + +# Pattern analyzer + +The `pattern` analyzer allows you to define a custom analyzer that uses a regular expression (regex) to split input text into tokens. It also provides options for applying regex flags, converting tokens to lowercase, and filtering out stopwords. + +## Parameters + +The `pattern` analyzer can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`pattern` | Optional | String | A [Java regular expression](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html) used to tokenize the input. Default is `\W+`. +`flags` | Optional | String | A string containing pipe-separated [Java regex flags](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#field.summary) that modify the behavior of the regular expression. +`lowercase` | Optional | Boolean | Whether to convert tokens to lowercase. Default is `true`. +`stopwords` | Optional | String or list of strings | A string specifying a predefined list of stopwords (such as `_english_`) or an array specifying a custom list of stopwords. Default is `_none_`. +`stopwords_path` | Optional | String | The path (absolute or relative to the config directory) to the file containing a list of stopwords. 
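+
+Because the `pattern` parameter defines the separator on which the input is split, you can adapt this analyzer to custom delimiters. The following sketch (the index name and pattern are illustrative) tokenizes comma-separated values:
+
+```json
+PUT /my_csv_pattern_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "csv_analyzer": {
+          "type": "pattern",
+          "pattern": ",\\s*",
+          "lowercase": true
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+Analyzing a string such as `Alpha, Beta, Gamma` with this analyzer should produce the tokens `alpha`, `beta`, and `gamma`.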
+ + +## Example + +Use the following command to create an index named `my_pattern_index` with a `pattern` analyzer: + +```json +PUT /my_pattern_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_pattern_analyzer": { + "type": "pattern", + "pattern": "\\W+", + "lowercase": true, + "stopwords": ["and", "is"] + } + } + } + }, + "mappings": { + "properties": { + "my_field": { + "type": "text", + "analyzer": "my_pattern_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_pattern_index/_analyze +{ + "analyzer": "my_pattern_analyzer", + "text": "OpenSearch is fast and scalable" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "opensearch", + "start_offset": 0, + "end_offset": 10, + "type": "word", + "position": 0 + }, + { + "token": "fast", + "start_offset": 14, + "end_offset": 18, + "type": "word", + "position": 2 + }, + { + "token": "scalable", + "start_offset": 23, + "end_offset": 31, + "type": "word", + "position": 4 + } + ] +} +``` diff --git a/_analyzers/supported-analyzers/phone-analyzers.md b/_analyzers/supported-analyzers/phone-analyzers.md index f24b7cf3285..d94bfe192f3 100644 --- a/_analyzers/supported-analyzers/phone-analyzers.md +++ b/_analyzers/supported-analyzers/phone-analyzers.md @@ -1,6 +1,6 @@ --- layout: default -title: Phone number +title: Phone number analyzers parent: Analyzers nav_order: 140 --- diff --git a/_analyzers/supported-analyzers/simple.md b/_analyzers/supported-analyzers/simple.md new file mode 100644 index 00000000000..29f8f9a5337 --- /dev/null +++ b/_analyzers/supported-analyzers/simple.md @@ -0,0 +1,99 @@ +--- +layout: default +title: Simple analyzer +parent: Analyzers +nav_order: 100 +--- + +# Simple analyzer + +The `simple` analyzer is a very basic analyzer that breaks text into terms at non-letter characters and lowercases the terms. Unlike the `standard` analyzer, the `simple` analyzer treats everything except for alphabetic characters as delimiters, meaning that it does not recognize numbers, punctuation, or special characters as part of the tokens. + +## Example + +Use the following command to create an index named `my_simple_index` with a `simple` analyzer: + +```json +PUT /my_simple_index +{ + "mappings": { + "properties": { + "my_field": { + "type": "text", + "analyzer": "simple" + } + } + } +} +``` +{% include copy-curl.html %} + +## Configuring a custom analyzer + +Use the following command to configure an index with a custom analyzer that is equivalent to a `simple` analyzer with an added `html_strip` character filter: + +```json +PUT /my_custom_simple_index +{ + "settings": { + "analysis": { + "char_filter": { + "html_strip": { + "type": "html_strip" + } + }, + "tokenizer": { + "my_lowercase_tokenizer": { + "type": "lowercase" + } + }, + "analyzer": { + "my_custom_simple_analyzer": { + "type": "custom", + "char_filter": ["html_strip"], + "tokenizer": "my_lowercase_tokenizer", + "filter": ["lowercase"] + } + } + } + }, + "mappings": { + "properties": { + "my_field": { + "type": "text", + "analyzer": "my_custom_simple_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_custom_simple_index/_analyze +{ + "analyzer": "my_custom_simple_analyzer", + "text": "

The slow turtle swims over to dogs © 2024!

" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "the","start_offset": 3,"end_offset": 6,"type": "word","position": 0}, + {"token": "slow","start_offset": 7,"end_offset": 11,"type": "word","position": 1}, + {"token": "turtle","start_offset": 12,"end_offset": 18,"type": "word","position": 2}, + {"token": "swims","start_offset": 19,"end_offset": 24,"type": "word","position": 3}, + {"token": "over","start_offset": 25,"end_offset": 29,"type": "word","position": 4}, + {"token": "to","start_offset": 30,"end_offset": 32,"type": "word","position": 5}, + {"token": "dogs","start_offset": 33,"end_offset": 37,"type": "word","position": 6} + ] +} +``` diff --git a/_analyzers/supported-analyzers/standard.md b/_analyzers/supported-analyzers/standard.md new file mode 100644 index 00000000000..d5c3650d5da --- /dev/null +++ b/_analyzers/supported-analyzers/standard.md @@ -0,0 +1,97 @@ +--- +layout: default +title: Standard analyzer +parent: Analyzers +nav_order: 50 +--- + +# Standard analyzer + +The `standard` analyzer is the default analyzer used when no other analyzer is specified. It is designed to provide a basic and efficient approach to generic text processing. + +This analyzer consists of the following tokenizers and token filters: + +- `standard` tokenizer: Removes most punctuation and splits text on spaces and other common delimiters. +- `lowercase` token filter: Converts all tokens to lowercase, ensuring case-insensitive matching. +- `stop` token filter: Removes common stopwords, such as "the", "is", and "and", from the tokenized output. + +## Example + +Use the following command to create an index named `my_standard_index` with a `standard` analyzer: + +```json +PUT /my_standard_index +{ + "mappings": { + "properties": { + "my_field": { + "type": "text", + "analyzer": "standard" + } + } + } +} +``` +{% include copy-curl.html %} + +## Parameters + +You can configure a `standard` analyzer with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`max_token_length` | Optional | Integer | Sets the maximum length of the produced token. If this length is exceeded, the token is split into multiple tokens at the length configured in `max_token_length`. Default is `255`. +`stopwords` | Optional | String or list of strings | A string specifying a predefined list of stopwords (such as `_english_`) or an array specifying a custom list of stopwords. Default is `_none_`. +`stopwords_path` | Optional | String | The path (absolute or relative to the config directory) to the file containing a list of stop words. 
+ + +## Configuring a custom analyzer + +Use the following command to configure an index with a custom analyzer that is equivalent to the `standard` analyzer: + +```json +PUT /my_custom_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_custom_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "stop" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_custom_index/_analyze +{ + "analyzer": "my_custom_analyzer", + "text": "The slow turtle swims away" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "slow","start_offset": 4,"end_offset": 8,"type": "","position": 1}, + {"token": "turtle","start_offset": 9,"end_offset": 15,"type": "","position": 2}, + {"token": "swims","start_offset": 16,"end_offset": 21,"type": "","position": 3}, + {"token": "away","start_offset": 22,"end_offset": 26,"type": "","position": 4} + ] +} +``` diff --git a/_analyzers/supported-analyzers/stop.md b/_analyzers/supported-analyzers/stop.md new file mode 100644 index 00000000000..df62c7fe582 --- /dev/null +++ b/_analyzers/supported-analyzers/stop.md @@ -0,0 +1,177 @@ +--- +layout: default +title: Stop analyzer +parent: Analyzers +nav_order: 110 +--- + +# Stop analyzer + +The `stop` analyzer removes a predefined list of stopwords. This analyzer consists of a `lowercase` tokenizer and a `stop` token filter. + +## Parameters + +You can configure a `stop` analyzer with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`stopwords` | Optional | String or list of strings | A string specifying a predefined list of stopwords (such as `_english_`) or an array specifying a custom list of stopwords. Default is `_english_`. +`stopwords_path` | Optional | String | The path (absolute or relative to the config directory) to the file containing a list of stopwords. 
+ +## Example + +Use the following command to create an index named `my_stop_index` with a `stop` analyzer: + +```json +PUT /my_stop_index +{ + "mappings": { + "properties": { + "my_field": { + "type": "text", + "analyzer": "stop" + } + } + } +} +``` +{% include copy-curl.html %} + +## Configuring a custom analyzer + +Use the following command to configure an index with a custom analyzer that is equivalent to a `stop` analyzer: + +```json +PUT /my_custom_stop_analyzer_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_custom_stop_analyzer": { + "tokenizer": "lowercase", + "filter": [ + "stop" + ] + } + } + } + }, + "mappings": { + "properties": { + "my_field": { + "type": "text", + "analyzer": "my_custom_stop_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_custom_stop_analyzer_index/_analyze +{ + "analyzer": "my_custom_stop_analyzer", + "text": "The large turtle is green and brown" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "large", + "start_offset": 4, + "end_offset": 9, + "type": "word", + "position": 1 + }, + { + "token": "turtle", + "start_offset": 10, + "end_offset": 16, + "type": "word", + "position": 2 + }, + { + "token": "green", + "start_offset": 20, + "end_offset": 25, + "type": "word", + "position": 4 + }, + { + "token": "brown", + "start_offset": 30, + "end_offset": 35, + "type": "word", + "position": 6 + } + ] +} +``` + +# Specifying stopwords + +The following example request specifies a custom list of stopwords: + +```json +PUT /my_new_custom_stop_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_custom_stop_analyzer": { + "type": "stop", + "stopwords": ["is", "and", "was"] + } + } + } + }, + "mappings": { + "properties": { + "description": { + "type": "text", + "analyzer": "my_custom_stop_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +The following example request specifies a path to the file containing stopwords: + +```json +PUT /my_new_custom_stop_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_custom_stop_analyzer": { + "type": "stop", + "stopwords_path": "stopwords.txt" + } + } + } + }, + "mappings": { + "properties": { + "description": { + "type": "text", + "analyzer": "my_custom_stop_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +In this example, the file is located in the config directory. You can also specify a full path to the file. \ No newline at end of file diff --git a/_analyzers/supported-analyzers/whitespace.md b/_analyzers/supported-analyzers/whitespace.md new file mode 100644 index 00000000000..4691b4f733e --- /dev/null +++ b/_analyzers/supported-analyzers/whitespace.md @@ -0,0 +1,87 @@ +--- +layout: default +title: Whitespace analyzer +parent: Analyzers +nav_order: 120 +--- + +# Whitespace analyzer + +The `whitespace` analyzer breaks text into tokens based only on white space characters (for example, spaces and tabs). It does not apply any transformations, such as lowercasing or removing stopwords, so the original case of the text is retained and punctuation is included as part of the tokens. 
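+For a quick look at this behavior, pass text directly to the `_analyze` API using the built-in `whitespace` analyzer (no index is required; the example text is arbitrary):
+
+```json
+POST /_analyze
+{
+  "analyzer": "whitespace",
+  "text": "The SLOW turtle swims away! 123"
+}
+```
+{% include copy-curl.html %}
+
+The returned tokens keep their original case (`The`, `SLOW`), and `away!` retains its punctuation.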
+ +## Example + +Use the following command to create an index named `my_whitespace_index` with a `whitespace` analyzer: + +```json +PUT /my_whitespace_index +{ + "mappings": { + "properties": { + "my_field": { + "type": "text", + "analyzer": "whitespace" + } + } + } +} +``` +{% include copy-curl.html %} + +## Configuring a custom analyzer + +Use the following command to configure an index with a custom analyzer that is equivalent to a `whitespace` analyzer with an added `lowercase` character filter: + +```json +PUT /my_custom_whitespace_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_custom_whitespace_analyzer": { + "type": "custom", + "tokenizer": "whitespace", + "filter": ["lowercase"] + } + } + } + }, + "mappings": { + "properties": { + "my_field": { + "type": "text", + "analyzer": "my_custom_whitespace_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_custom_whitespace_index/_analyze +{ + "analyzer": "my_custom_whitespace_analyzer", + "text": "The SLOW turtle swims away! 123" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "the","start_offset": 0,"end_offset": 3,"type": "word","position": 0}, + {"token": "slow","start_offset": 4,"end_offset": 8,"type": "word","position": 1}, + {"token": "turtle","start_offset": 9,"end_offset": 15,"type": "word","position": 2}, + {"token": "swims","start_offset": 16,"end_offset": 21,"type": "word","position": 3}, + {"token": "away!","start_offset": 22,"end_offset": 27,"type": "word","position": 4}, + {"token": "123","start_offset": 28,"end_offset": 31,"type": "word","position": 5} + ] +} +``` diff --git a/_analyzers/token-filters/condition.md b/_analyzers/token-filters/condition.md new file mode 100644 index 00000000000..5e87c2cbbf6 --- /dev/null +++ b/_analyzers/token-filters/condition.md @@ -0,0 +1,135 @@ +--- +layout: default +title: Condition +parent: Token filters +nav_order: 70 +--- + +# Condition token filter + +The `condition` token filter is a special type of filter that allows you to apply other token filters conditionally based on certain criteria. This provides more control over when certain token filters should be applied during text analysis. +Multiple filters can be configured and only applied when they meet the conditions you define. +This token filter can be very useful for language-specific processing and handling of special characters. + + +## Parameters + +There are two parameters that must be configured in order to use the `condition` token filter. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`filter` | Required | Array | Specifies which token filters should be applied to the tokens when the specified condition (defined by the `script` parameter) is met. +`script` | Required | Object | Configures an [inline script]({{site.url}}{{site.baseurl}}/api-reference/script-apis/exec-script/) that defines the condition that needs to be met in order for the filters specified in the `filter` parameter to be applied (only inline scripts are accepted). + + +## Example + +The following example request creates a new index named `my_conditional_index` and configures an analyzer with a `condition` filter. 
This filter applies a `lowercase` filter to any tokens that contain the character sequence "um": + +```json +PUT /my_conditional_index +{ + "settings": { + "analysis": { + "filter": { + "my_conditional_filter": { + "type": "condition", + "filter": ["lowercase"], + "script": { + "source": "token.getTerm().toString().contains('um')" + } + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "my_conditional_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my_conditional_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "THE BLACK CAT JUMPS OVER A LAZY DOG" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "THE", + "start_offset": 0, + "end_offset": 3, + "type": "", + "position": 0 + }, + { + "token": "BLACK", + "start_offset": 4, + "end_offset": 9, + "type": "", + "position": 1 + }, + { + "token": "CAT", + "start_offset": 10, + "end_offset": 13, + "type": "", + "position": 2 + }, + { + "token": "jumps", + "start_offset": 14, + "end_offset": 19, + "type": "", + "position": 3 + }, + { + "token": "OVER", + "start_offset": 20, + "end_offset": 24, + "type": "", + "position": 4 + }, + { + "token": "A", + "start_offset": 25, + "end_offset": 26, + "type": "", + "position": 5 + }, + { + "token": "LAZY", + "start_offset": 27, + "end_offset": 31, + "type": "", + "position": 6 + }, + { + "token": "DOG", + "start_offset": 32, + "end_offset": 35, + "type": "", + "position": 7 + } + ] +} +``` + diff --git a/_analyzers/token-filters/decimal-digit.md b/_analyzers/token-filters/decimal-digit.md new file mode 100644 index 00000000000..002375f7e50 --- /dev/null +++ b/_analyzers/token-filters/decimal-digit.md @@ -0,0 +1,88 @@ +--- +layout: default +title: Decimal digit +parent: Token filters +nav_order: 80 +--- + +# Decimal digit token filter + +The `decimal_digit` token filter is used to normalize decimal digit characters (0--9) into their ASCII equivalents in various scripts. This is useful when you want to ensure that all digits are treated uniformly in text analysis, regardless of the script in which they are written. 
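+Because the filter takes no parameters, you can also try it inline with the `_analyze` API before adding it to an index configuration (the example text is arbitrary):
+
+```json
+POST /_analyze
+{
+  "tokenizer": "standard",
+  "filter": ["decimal_digit"],
+  "text": "Room ٣ opens at ٩"
+}
+```
+{% include copy-curl.html %}
+
+The Arabic-Indic digits `٣` and `٩` are returned as the ASCII digits `3` and `9`.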
+ + +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `decimal_digit` filter: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "filter": { + "my_decimal_digit_filter": { + "type": "decimal_digit" + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": ["my_decimal_digit_filter"] + } + } + } + } +} + +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "123 ١٢٣ १२३" +} +``` +{% include copy-curl.html %} + +`text` breakdown: + + - "123" (ASCII digits) + - "١٢٣" (Arabic-Indic digits) + - "१२३" (Devanagari digits) + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "123", + "start_offset": 0, + "end_offset": 3, + "type": "", + "position": 0 + }, + { + "token": "123", + "start_offset": 4, + "end_offset": 7, + "type": "", + "position": 1 + }, + { + "token": "123", + "start_offset": 8, + "end_offset": 11, + "type": "", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/delimited-payload.md b/_analyzers/token-filters/delimited-payload.md new file mode 100644 index 00000000000..f17bb1b1ce1 --- /dev/null +++ b/_analyzers/token-filters/delimited-payload.md @@ -0,0 +1,211 @@ +--- +layout: default +title: Delimited payload +parent: Token filters +nav_order: 90 +--- + +# Delimited payload token filter + +The `delimited_payload` token filter is used to parse tokens containing payloads during the analysis process. For example, the string `red|1.5 fast|2.0 car|1.0` is parsed into the tokens `red` (with a payload of `1.5`), `fast` (with a payload of `2.0`), and `car` (with a payload of `1.0`). This is particularly useful when your tokens include additional associated data (like weights, scores, or other numeric values) that you can use for scoring or custom query logic. The filter can handle different types of payloads, including integers, floats, and strings, and attach payloads (extra metadata) to tokens. + +When analyzing text, the `delimited_payload` token filter parses each token, extracts the payload, and attaches it to the token. This payload can later be used in queries to influence scoring, boosting, or other custom behaviors. + +Payloads are stored as Base64-encoded strings. By default, payloads are not returned in the query response along with the tokens. To return the payloads, you must configure additional parameters. For more information, see [Example with a stored payload]({{site.url}}{{site.baseurl}}/analyzers/token-filters/delimited-payload/#example-without-a-stored-payload). + +## Parameters + +The `delimited_payload` token filter has two parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`encoding` | Optional | String | Specifies the data type of the payload attached to the tokens. This determines how the payload data is interpreted during analysis and querying.
Valid values are:

- `float`: The payload is interpreted as a 32-bit floating-point number using IEEE 754 format (for example, `2.5` in `car|2.5`).
- `identity`: The payload is interpreted as a sequence of characters (for example, in `user|admin`, `admin` is interpreted as a string).
- `int`: The payload is interpreted as a 32-bit integer (for example, `1` in `priority|1`).
Default is `float`. +`delimiter` | Optional | String | Specifies the character that separates the token from its payload in the input text. Default is the pipe character (`|`). + +## Example without a stored payload + +The following example request creates a new index named `my_index` and configures an analyzer with a `delimited_payload` filter: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "filter": { + "my_payload_filter": { + "type": "delimited_payload", + "delimiter": "|", + "encoding": "float" + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "whitespace", + "filter": ["my_payload_filter"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "red|1.5 fast|2.0 car|1.0" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "red", + "start_offset": 0, + "end_offset": 7, + "type": "word", + "position": 0 + }, + { + "token": "fast", + "start_offset": 8, + "end_offset": 16, + "type": "word", + "position": 1 + }, + { + "token": "car", + "start_offset": 17, + "end_offset": 24, + "type": "word", + "position": 2 + } + ] +} +``` + +## Example with a stored payload + +To configure the payload to be returned in the response, create an index that stores term vectors and set `term_vector` to `with_positions_payloads` or `with_positions_offsets_payloads` in the index mappings. For example, the following index is configured to store term vectors: + +```json +PUT /visible_payloads +{ + "mappings": { + "properties": { + "text": { + "type": "text", + "term_vector": "with_positions_payloads", + "analyzer": "custom_analyzer" + } + } + }, + "settings": { + "analysis": { + "filter": { + "my_payload_filter": { + "type": "delimited_payload", + "delimiter": "|", + "encoding": "float" + } + }, + "analyzer": { + "custom_analyzer": { + "tokenizer": "whitespace", + "filter": [ "my_payload_filter" ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +You can index a document into this index using the following request: + +```json +PUT /visible_payloads/_doc/1 +{ + "text": "red|1.5 fast|2.0 car|1.0" +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /visible_payloads/_termvectors/1 +{ + "fields": ["text"] +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens, which include payloads: + +```json +{ + "_index": "visible_payloads", + "_id": "1", + "_version": 1, + "found": true, + "took": 3, + "term_vectors": { + "text": { + "field_statistics": { + "sum_doc_freq": 3, + "doc_count": 1, + "sum_ttf": 3 + }, + "terms": { + "brown": { + "term_freq": 1, + "tokens": [ + { + "position": 1, + "start_offset": 10, + "end_offset": 19, + "payload": "QEAAAA==" + } + ] + }, + "fox": { + "term_freq": 1, + "tokens": [ + { + "position": 2, + "start_offset": 20, + "end_offset": 27, + "payload": "P8AAAA==" + } + ] + }, + "quick": { + "term_freq": 1, + "tokens": [ + { + "position": 0, + "start_offset": 0, + "end_offset": 9, + "payload": "QCAAAA==" + } + ] + } + } + } + } +} +``` +{% include copy-curl.html %} diff --git a/_analyzers/token-filters/dictionary-decompounder.md b/_analyzers/token-filters/dictionary-decompounder.md new file mode 100644 index 00000000000..ced6fd6fbcc --- /dev/null +++ 
b/_analyzers/token-filters/dictionary-decompounder.md @@ -0,0 +1,101 @@ +--- +layout: default +title: Dictionary decompounder +parent: Token filters +nav_order: 110 +--- + +# Dictionary decompounder token filter + +The `dictionary_decompounder` token filter is used to split compound words into their constituent parts based on a predefined dictionary. This filter is particularly useful for languages like German, Dutch, or Finnish, in which compound words are common, so breaking them down can improve search relevance. The `dictionary_decompounder` token filter determines whether each token (word) can be split into smaller tokens based on a list of known words. If the token can be split into known words, the filter generates the subtokens for the token. + +## Parameters + +The `dictionary_decompounder` token filter has the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`word_list` | Required unless `word_list_path` is configured | Array of strings | The dictionary of words that the filter uses to split compound words. +`word_list_path` | Required unless `word_list` is configured | String | A file path to a text file containing the dictionary words. Accepts either an absolute path or a path relative to the `config` directory. The dictionary file must be UTF-8 encoded, and each word must be listed on a separate line. +`min_word_size` | Optional | Integer | The minimum length of the entire compound word that will be considered for splitting. If a compound word is shorter than this value, it is not split. Default is `5`. +`min_subword_size` | Optional | Integer | The minimum length for any subword. If a subword is shorter than this value, it is not included in the output. Default is `2`. +`max_subword_size` | Optional | Integer | The maximum length for any subword. If a subword is longer than this value, it is not included in the output. Default is `15`. +`only_longest_match` | Optional | Boolean | If set to `true`, only the longest matching subword will be returned. Default is `false`. 
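+If the dictionary is large, it is often more practical to keep it in a file and reference it using `word_list_path`. The following request is a minimal sketch that assumes a UTF-8 file named `analysis/compound_words.txt` (one word per line) exists in the `config` directory; the index, filter, and analyzer names are placeholders:
+
+```json
+PUT /decompound_from_file
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "my_file_decompounder": {
+          "type": "dictionary_decompounder",
+          "word_list_path": "analysis/compound_words.txt",
+          "min_subword_size": 3,
+          "only_longest_match": true
+        }
+      },
+      "analyzer": {
+        "my_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": ["lowercase", "my_file_decompounder"]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+Setting `only_longest_match` to `true` keeps only the longest dictionary word found in each compound token.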
+ +## Example + +The following example request creates a new index named `decompound_example` and configures an analyzer with the `dictionary_decompounder` filter: + +```json +PUT /decompound_example +{ + "settings": { + "analysis": { + "filter": { + "my_dictionary_decompounder": { + "type": "dictionary_decompounder", + "word_list": ["slow", "green", "turtle"] + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase", "my_dictionary_decompounder"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /decompound_example/_analyze +{ + "analyzer": "my_analyzer", + "text": "slowgreenturtleswim" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "slowgreenturtleswim", + "start_offset": 0, + "end_offset": 19, + "type": "", + "position": 0 + }, + { + "token": "slow", + "start_offset": 0, + "end_offset": 19, + "type": "", + "position": 0 + }, + { + "token": "green", + "start_offset": 0, + "end_offset": 19, + "type": "", + "position": 0 + }, + { + "token": "turtle", + "start_offset": 0, + "end_offset": 19, + "type": "", + "position": 0 + } + ] +} +``` diff --git a/_analyzers/token-filters/edge-ngram.md b/_analyzers/token-filters/edge-ngram.md new file mode 100644 index 00000000000..be3eaf6faba --- /dev/null +++ b/_analyzers/token-filters/edge-ngram.md @@ -0,0 +1,111 @@ +--- +layout: default +title: Edge n-gram +parent: Token filters +nav_order: 120 +--- +# Edge n-gram token filter +The `edge_ngram` token filter is very similar to the `ngram` token filter, where a particular string is split into substrings of different lengths. The `edge_ngram` token filter, however, generates n-grams (substrings) only from the beginning (edge) of a token. It's particularly useful in scenarios like autocomplete or prefix matching, where you want to match the beginning of words or phrases as the user types them. + +## Parameters + +The `edge_ngram` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`min_gram` | Optional | Integer | The minimum length of the n-grams that will be generated. Default is `1`. +`max_gram` | Optional | Integer | The maximum length of the n-grams that will be generated. Default is `1` for the `edge_ngram` filter and `2` for custom token filters. Avoid setting this parameter to a low value. If the value is set too low, only very short n-grams will be generated and the search term will not be found. For example, if `max_gram` is set to `3` and you index the word "banana", the longest generated token will be "ban". If the user searches for "banana", no matches will be returned. You can use the `truncate` token filter as a search analyzer to mitigate this risk. +`preserve_original` | Optional | Boolean | Includes the original token in the output. Default is `false` . 
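+To illustrate the `truncate` mitigation mentioned in the `max_gram` description, the following request is a minimal sketch (all index, filter, analyzer, and field names are placeholders) that indexes edge n-grams of lengths 3 to 4 while truncating search terms to 4 characters so that longer query terms can still match the indexed n-grams:
+
+```json
+PUT /edge_ngram_search_example
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "my_edge_ngram": {
+          "type": "edge_ngram",
+          "min_gram": 3,
+          "max_gram": 4
+        },
+        "my_truncate": {
+          "type": "truncate",
+          "length": 4
+        }
+      },
+      "analyzer": {
+        "my_index_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": ["lowercase", "my_edge_ngram"]
+        },
+        "my_search_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": ["lowercase", "my_truncate"]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "my_field": {
+        "type": "text",
+        "analyzer": "my_index_analyzer",
+        "search_analyzer": "my_search_analyzer"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+With this configuration, a search for `banana` is analyzed as `bana`, which matches the longest edge n-gram produced at index time.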
+ +## Example + +The following example request creates a new index named `edge_ngram_example` and configures an analyzer with the `edge_ngram` filter: + +```json +PUT /edge_ngram_example +{ + "settings": { + "analysis": { + "filter": { + "my_edge_ngram": { + "type": "edge_ngram", + "min_gram": 3, + "max_gram": 4 + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase", "my_edge_ngram"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /edge_ngram_example/_analyze +{ + "analyzer": "my_analyzer", + "text": "slow green turtle" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "slo", + "start_offset": 0, + "end_offset": 4, + "type": "", + "position": 0 + }, + { + "token": "slow", + "start_offset": 0, + "end_offset": 4, + "type": "", + "position": 0 + }, + { + "token": "gre", + "start_offset": 5, + "end_offset": 10, + "type": "", + "position": 1 + }, + { + "token": "gree", + "start_offset": 5, + "end_offset": 10, + "type": "", + "position": 1 + }, + { + "token": "tur", + "start_offset": 11, + "end_offset": 17, + "type": "", + "position": 2 + }, + { + "token": "turt", + "start_offset": 11, + "end_offset": 17, + "type": "", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/elision.md b/_analyzers/token-filters/elision.md new file mode 100644 index 00000000000..abc6dba658c --- /dev/null +++ b/_analyzers/token-filters/elision.md @@ -0,0 +1,124 @@ +--- +layout: default +title: Elision +parent: Token filters +nav_order: 130 +--- + +# Elision token filter + +The `elision` token filter is used to remove elided characters from words in certain languages. Elision typically occurs in languages such as French, in which words are often contracted and combined with the following word, typically by omitting a vowel and replacing it with an apostrophe. + +The `elision` token filter is already preconfigured in the following [language analyzers]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/): `catalan`, `french`, `irish`, and `italian`. +{: .note} + +## Parameters + +The custom `elision` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`articles` | Required if `articles_path` is not configured | Array of strings | Defines which articles or short words should be removed when they appear as part of an elision. +`articles_path` | Required if `articles` is not configured | String | Specifies the path to a custom list of articles that should be removed during the analysis process. +`articles_case` | Optional | Boolean | Specifies whether the filter is case sensitive when matching elisions. Default is `false`. + +## Example + +The default set of French elisions is `l'`, `m'`, `t'`, `qu'`, `n'`, `s'`, `j'`, `d'`, `c'`, `jusqu'`, `quoiqu'`, `lorsqu'`, and `puisqu'`. You can update this by configuring the `french_elision` token filter. 
The following example request creates a new index named `french_texts` and configures an analyzer with a `french_elision` filter: + +```json +PUT /french_texts +{ + "settings": { + "analysis": { + "filter": { + "french_elision": { + "type": "elision", + "articles": [ "l", "t", "m", "d", "n", "s", "j" ] + } + }, + "analyzer": { + "french_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase", "french_elision"] + } + } + } + }, + "mappings": { + "properties": { + "text": { + "type": "text", + "analyzer": "french_analyzer" + } + } + } +} + +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /french_texts/_analyze +{ + "analyzer": "french_analyzer", + "text": "L'étudiant aime l'école et le travail." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "étudiant", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0 + }, + { + "token": "aime", + "start_offset": 11, + "end_offset": 15, + "type": "", + "position": 1 + }, + { + "token": "école", + "start_offset": 16, + "end_offset": 23, + "type": "", + "position": 2 + }, + { + "token": "et", + "start_offset": 24, + "end_offset": 26, + "type": "", + "position": 3 + }, + { + "token": "le", + "start_offset": 27, + "end_offset": 29, + "type": "", + "position": 4 + }, + { + "token": "travail", + "start_offset": 30, + "end_offset": 37, + "type": "", + "position": 5 + } + ] +} +``` diff --git a/_analyzers/token-filters/fingerprint.md b/_analyzers/token-filters/fingerprint.md new file mode 100644 index 00000000000..75c66154598 --- /dev/null +++ b/_analyzers/token-filters/fingerprint.md @@ -0,0 +1,86 @@ +--- +layout: default +title: Fingerprint +parent: Token filters +nav_order: 140 +--- + +# Fingerprint token filter + +The `fingerprint` token filter is used to standardize and deduplicate text. This is particularly useful when consistency in text processing is crucial. The `fingerprint` token filter achieves this by processing text using the following steps: + +1. **Lowercasing**: Converts all text to lowercase. +2. **Splitting**: Breaks the text into tokens. +3. **Sorting**: Arranges the tokens in alphabetical order. +4. **Removing duplicates**: Eliminates repeated tokens. +5. **Joining tokens**: Combines the tokens into a single string, typically joined by a space or another specified separator. + +## Parameters + +The `fingerprint` token filter can be configured with the following two parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`max_output_size` | Optional | Integer | Limits the length of the generated fingerprint string. If the concatenated string exceeds the `max_output_size`, the filter will not produce any output, resulting in an empty token. Default is `255`. +`separator` | Optional | String | Defines the character(s) used to join the tokens into a single string after they have been sorted and deduplicated. Default is space (`" "`). 
+ +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `fingerprint` token filter: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "filter": { + "my_fingerprint": { + "type": "fingerprint", + "max_output_size": 200, + "separator": "-" + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_fingerprint" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "OpenSearch is a powerful search engine that scales easily" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "a-easily-engine-is-opensearch-powerful-scales-search-that", + "start_offset": 0, + "end_offset": 57, + "type": "fingerprint", + "position": 0 + } + ] +} +``` diff --git a/_analyzers/token-filters/flatten-graph.md b/_analyzers/token-filters/flatten-graph.md new file mode 100644 index 00000000000..8d51c57400a --- /dev/null +++ b/_analyzers/token-filters/flatten-graph.md @@ -0,0 +1,109 @@ +--- +layout: default +title: Flatten graph +parent: Token filters +nav_order: 150 +--- + +# Flatten graph token filter + +The `flatten_graph` token filter is used to handle complex token relationships that occur when multiple tokens are generated at the same position in a graph structure. Some token filters, like `synonym_graph` and `word_delimiter_graph`, generate multi-position tokens---tokens that overlap or span multiple positions. These token graphs are useful for search queries but are not directly supported during indexing. The `flatten_graph` token filter resolves multi-position tokens into a linear sequence of tokens. Flattening the graph ensures compatibility with the indexing process. + +Token graph flattening is a lossy process. Whenever possible, avoid using the `flatten_graph` filter. Instead, apply graph token filters exclusively in search analyzers, removing the need for the `flatten_graph` filter. 
+{: .important} + +## Example + +The following example request creates a new index named `test_index` and configures an analyzer with a `flatten_graph` filter: + +```json +PUT /test_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_index_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "my_custom_filter", + "flatten_graph" + ] + } + }, + "filter": { + "my_custom_filter": { + "type": "word_delimiter_graph", + "catenate_all": true + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /test_index/_analyze +{ + "analyzer": "my_index_analyzer", + "text": "OpenSearch helped many employers" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "OpenSearch", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0, + "positionLength": 2 + }, + { + "token": "Open", + "start_offset": 0, + "end_offset": 4, + "type": "", + "position": 0 + }, + { + "token": "Search", + "start_offset": 4, + "end_offset": 10, + "type": "", + "position": 1 + }, + { + "token": "helped", + "start_offset": 11, + "end_offset": 17, + "type": "", + "position": 2 + }, + { + "token": "many", + "start_offset": 18, + "end_offset": 22, + "type": "", + "position": 3 + }, + { + "token": "employers", + "start_offset": 23, + "end_offset": 32, + "type": "", + "position": 4 + } + ] +} +``` diff --git a/_analyzers/token-filters/hunspell.md b/_analyzers/token-filters/hunspell.md new file mode 100644 index 00000000000..6720ba74de3 --- /dev/null +++ b/_analyzers/token-filters/hunspell.md @@ -0,0 +1,108 @@ +--- +layout: default +title: Hunspell +parent: Token filters +nav_order: 160 +--- + +# Hunspell token filter + +The `hunspell` token filter is used for stemming and morphological analysis of words in a specific language. This filter applies Hunspell dictionaries, which are widely used in spell checkers. It works by breaking down words into their root forms (stemming). + +The Hunspell dictionary files are automatically loaded at startup from the `/hunspell/` directory. For example, the `en_GB` locale must have at least one `.aff` file and one or more `.dic` files in the `/hunspell/en_GB/` directory. + +You can download these files from [LibreOffice dictionaries](https://github.com/LibreOffice/dictionaries). + +## Parameters + +The `hunspell` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`language/lang/locale` | At least one of the three is required | String | Specifies the language for the Hunspell dictionary. +`dedup` | Optional | Boolean | Determines whether to remove multiple duplicate stemming terms for the same token. Default is `true`. +`dictionary` | Optional | Array of strings | Configures the dictionary files to be used for the Hunspell dictionary. Default is all files in the `/hunspell/` directory. +`longest_only` | Optional | Boolean | Specifies whether only the longest stemmed version of the token should be returned. Default is `false`. 
+ +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `hunspell` filter: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "filter": { + "my_hunspell_filter": { + "type": "hunspell", + "lang": "en_GB", + "dedup": true, + "longest_only": true + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_hunspell_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "the turtle moves slowly" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "the", + "start_offset": 0, + "end_offset": 3, + "type": "", + "position": 0 + }, + { + "token": "turtle", + "start_offset": 4, + "end_offset": 10, + "type": "", + "position": 1 + }, + { + "token": "move", + "start_offset": 11, + "end_offset": 16, + "type": "", + "position": 2 + }, + { + "token": "slow", + "start_offset": 17, + "end_offset": 23, + "type": "", + "position": 3 + } + ] +} +``` diff --git a/_analyzers/token-filters/hyphenation-decompounder.md b/_analyzers/token-filters/hyphenation-decompounder.md new file mode 100644 index 00000000000..6e53d4dfd5f --- /dev/null +++ b/_analyzers/token-filters/hyphenation-decompounder.md @@ -0,0 +1,102 @@ +--- +layout: default +title: Hyphenation decompounder +parent: Token filters +nav_order: 170 +--- + +# Hyphenation decompounder token filter + +The `hyphenation_decompounder` token filter is used to break down compound words into their constituent parts. This filter is particularly useful for languages like German, Dutch, and Swedish, in which compound words are common. The filter uses hyphenation patterns (typically defined in .xml files) to identify the possible locations within a compound word where it can be split into components. These components are then checked against a provided dictionary. If there is a match, those components are treated as valid tokens. For more information about hyphenation pattern files, see [FOP XML Hyphenation Patterns](https://offo.sourceforge.net/#FOP+XML+Hyphenation+Patterns). + +## Parameters + +The `hyphenation_decompounder` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`hyphenation_patterns_path` | Required | String | The path (relative to the `config` directory or absolute) to the hyphenation patterns file, which contains the language-specific rules for word splitting. The file is typically in XML format. Sample files can be downloaded from the [OFFO SourceForge project](https://sourceforge.net/projects/offo/). +`word_list` | Required if `word_list_path` is not set | Array of strings | A list of words used to validate the components generated by the hyphenation patterns. +`word_list_path` | Required if `word_list` is not set | String | The path (relative to the `config` directory or absolute) to a list of subwords. +`max_subword_size` | Optional | Integer | The maximum subword length. If the generated subword exceeds this length, it will not be added to the generated tokens. Default is `15`. +`min_subword_size` | Optional | Integer | The minimum subword length. 
If the generated subword is shorter than the specified length, it will not be added to the generated tokens. Default is `2`. +`min_word_size` | Optional | Integer | The minimum word character length. Word tokens shorter than this length are excluded from decomposition into subwords. Default is `5`. +`only_longest_match` | Optional | Boolean | Only includes the longest subword in the generated tokens. Default is `false`. + +## Example + +The following example request creates a new index named `test_index` and configures an analyzer with a `hyphenation_decompounder` filter: + +```json +PUT /test_index +{ + "settings": { + "analysis": { + "filter": { + "my_hyphenation_decompounder": { + "type": "hyphenation_decompounder", + "hyphenation_patterns_path": "analysis/hyphenation_patterns.xml", + "word_list": ["notebook", "note", "book"], + "min_subword_size": 3, + "min_word_size": 5, + "only_longest_match": false + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_hyphenation_decompounder" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /test_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "notebook" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "notebook", + "start_offset": 0, + "end_offset": 8, + "type": "", + "position": 0 + }, + { + "token": "note", + "start_offset": 0, + "end_offset": 8, + "type": "", + "position": 0 + }, + { + "token": "book", + "start_offset": 0, + "end_offset": 8, + "type": "", + "position": 0 + } + ] +} +``` diff --git a/_analyzers/token-filters/index.md b/_analyzers/token-filters/index.md index 4200b988ec4..875e94db5a1 100644 --- a/_analyzers/token-filters/index.md +++ b/_analyzers/token-filters/index.md @@ -17,51 +17,51 @@ The following table lists all token filters that OpenSearch supports. Token filter | Underlying Lucene token filter| Description [`apostrophe`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/apostrophe/) | [ApostropheFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/tr/ApostropheFilter.html) | In each token containing an apostrophe, the `apostrophe` token filter removes the apostrophe itself and all characters following it. [`asciifolding`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/asciifolding/) | [ASCIIFoldingFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.html) | Converts alphabetic, numeric, and symbolic characters. -`cjk_bigram` | [CJKBigramFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/cjk/CJKBigramFilter.html) | Forms bigrams of Chinese, Japanese, and Korean (CJK) tokens. +[`cjk_bigram`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/cjk-bigram/) | [CJKBigramFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/cjk/CJKBigramFilter.html) | Forms bigrams of Chinese, Japanese, and Korean (CJK) tokens. [`cjk_width`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/cjk-width/) | [CJKWidthFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/cjk/CJKWidthFilter.html) | Normalizes Chinese, Japanese, and Korean (CJK) tokens according to the following rules:
- Folds full-width ASCII character variants into their equivalent basic Latin characters.
- Folds half-width katakana character variants into their equivalent kana characters. [`classic`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/classic) | [ClassicFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/classic/ClassicFilter.html) | Performs optional post-processing on the tokens generated by the classic tokenizer. Removes possessives (`'s`) and removes `.` from acronyms. [`common_grams`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/common_gram/) | [CommonGramsFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/commongrams/CommonGramsFilter.html) | Generates bigrams for a list of frequently occurring terms. The output contains both single terms and bigrams. -`conditional` | [ConditionalTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.html) | Applies an ordered list of token filters to tokens that match the conditions provided in a script. -`decimal_digit` | [DecimalDigitFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/DecimalDigitFilter.html) | Converts all digits in the Unicode decimal number general category to basic Latin digits (0--9). -`delimited_payload` | [DelimitedPayloadTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.html) | Separates a token stream into tokens with corresponding payloads, based on a provided delimiter. A token consists of all characters before the delimiter, and a payload consists of all characters after the delimiter. For example, if the delimiter is `|`, then for the string `foo|bar`, `foo` is the token and `bar` is the payload. +[`conditional`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/condition/) | [ConditionalTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.html) | Applies an ordered list of token filters to tokens that match the conditions provided in a script. +[`decimal_digit`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/decimal-digit/) | [DecimalDigitFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/DecimalDigitFilter.html) | Converts all digits in the Unicode decimal number general category to basic Latin digits (0--9). +[`delimited_payload`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/delimited-payload/) | [DelimitedPayloadTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.html) | Separates a token stream into tokens with corresponding payloads, based on a provided delimiter. A token consists of all characters preceding the delimiter, and a payload consists of all characters following the delimiter. For example, if the delimiter is `|`, then for the string `foo|bar`, `foo` is the token and `bar` is the payload. [`delimited_term_freq`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/delimited-term-frequency/) | [DelimitedTermFrequencyTokenFilter](https://lucene.apache.org/core/9_7_0/analysis/common/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilter.html) | Separates a token stream into tokens with corresponding term frequencies, based on a provided delimiter. A token consists of all characters before the delimiter, and a term frequency is the integer after the delimiter. 
For example, if the delimiter is `|`, then for the string `foo|5`, `foo` is the token and `5` is the term frequency. -`dictionary_decompounder` | [DictionaryCompoundWordTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.html) | Decomposes compound words found in many Germanic languages. -`edge_ngram` | [EdgeNGramTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.html) | Tokenizes the given token into edge n-grams (n-grams that start at the beginning of the token) of lengths between `min_gram` and `max_gram`. Optionally, keeps the original token. -`elision` | [ElisionFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/util/ElisionFilter.html) | Removes the specified [elisions](https://en.wikipedia.org/wiki/Elision) from the beginning of tokens. For example, changes `l'avion` (the plane) to `avion` (plane). -`fingerprint` | [FingerprintFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.html) | Sorts and deduplicates the token list and concatenates tokens into a single token. -`flatten_graph` | [FlattenGraphFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/FlattenGraphFilter.html) | Flattens a token graph produced by a graph token filter, such as `synonym_graph` or `word_delimiter_graph`, making the graph suitable for indexing. -`hunspell` | [HunspellStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/hunspell/HunspellStemFilter.html) | Uses [Hunspell](https://en.wikipedia.org/wiki/Hunspell) rules to stem tokens. Because Hunspell supports a word having multiple stems, this filter can emit multiple tokens for each consumed token. Requires you to configure one or more language-specific Hunspell dictionaries. -`hyphenation_decompounder` | [HyphenationCompoundWordTokenFilter](https://lucene.apache.org/core/9_8_0/analysis/common/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.html) | Uses XML-based hyphenation patterns to find potential subwords in compound words and checks the subwords against the specified word list. The token output contains only the subwords found in the word list. -`keep_types` | [TypeTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/TypeTokenFilter.html) | Keeps or removes tokens of a specific type. -`keep_word` | [KeepWordFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.html) | Checks the tokens against the specified word list and keeps only those that are in the list. -`keyword_marker` | [KeywordMarkerFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilter.html) | Marks specified tokens as keywords, preventing them from being stemmed. -`keyword_repeat` | [KeywordRepeatFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.html) | Emits each incoming token twice: once as a keyword and once as a non-keyword. -`kstem` | [KStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/en/KStemFilter.html) | Provides kstem-based stemming for the English language. Combines algorithmic stemming with a built-in dictionary. 
-`kuromoji_completion` | [JapaneseCompletionFilter](https://lucene.apache.org/core/9_10_0/analysis/kuromoji/org/apache/lucene/analysis/ja/JapaneseCompletionFilter.html) | Adds Japanese romanized terms to the token stream (in addition to the original tokens). Usually used to support autocomplete on Japanese search terms. Note that the filter has a `mode` parameter, which should be set to `index` when used in an index analyzer and `query` when used in a search analyzer. Requires the `analysis-kuromoji` plugin. For information about installing the plugin, see [Additional plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/#additional-plugins). -`length` | [LengthFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/LengthFilter.html) | Removes tokens whose lengths are shorter or longer than the length range specified by `min` and `max`. -`limit` | [LimitTokenCountFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilter.html) | Limits the number of output tokens. A common use case is to limit the size of document field values based on token count. -`lowercase` | [LowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/LowerCaseFilter.html) | Converts tokens to lowercase. The default [LowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/LowerCaseFilter.html) is for the English language. You can set the `language` parameter to `greek` (uses [GreekLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/el/GreekLowerCaseFilter.html)), `irish` (uses [IrishLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ga/IrishLowerCaseFilter.html)), or `turkish` (uses [TurkishLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.html)). -`min_hash` | [MinHashFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/minhash/MinHashFilter.html) | Uses the [MinHash technique](https://en.wikipedia.org/wiki/MinHash) to estimate document similarity. Performs the following operations on a token stream sequentially:
1. Hashes each token in the stream.
2. Assigns the hashes to buckets, keeping only the smallest hashes of each bucket.
3. Outputs the smallest hash from each bucket as a token stream. -`multiplexer` | N/A | Emits multiple tokens at the same position. Runs each token through each of the specified filter lists separately and outputs the results as separate tokens. -`ngram` | [NGramTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ngram/NGramTokenFilter.html) | Tokenizes the given token into n-grams of lengths between `min_gram` and `max_gram`. -Normalization | `arabic_normalization`: [ArabicNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ar/ArabicNormalizer.html)
`german_normalization`: [GermanNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/de/GermanNormalizationFilter.html)
`hindi_normalization`: [HindiNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/hi/HindiNormalizer.html)
`indic_normalization`: [IndicNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/in/IndicNormalizer.html)
`sorani_normalization`: [SoraniNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ckb/SoraniNormalizer.html)
`persian_normalization`: [PersianNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/fa/PersianNormalizer.html)
`scandinavian_normalization` : [ScandinavianNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.html)
`scandinavian_folding`: [ScandinavianFoldingFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ScandinavianFoldingFilter.html)
`serbian_normalization`: [SerbianNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/sr/SerbianNormalizationFilter.html) | Normalizes the characters of one of the listed languages. -`pattern_capture` | N/A | Generates a token for every capture group in the provided regular expression. Uses [Java regular expression syntax](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). -`pattern_replace` | N/A | Matches a pattern in the provided regular expression and replaces matching substrings. Uses [Java regular expression syntax](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). -`phonetic` | N/A | Uses a phonetic encoder to emit a metaphone token for each token in the token stream. Requires installing the `analysis-phonetic` plugin. -`porter_stem` | [PorterStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/en/PorterStemFilter.html) | Uses the [Porter stemming algorithm](https://tartarus.org/martin/PorterStemmer/) to perform algorithmic stemming for the English language. -`predicate_token_filter` | N/A | Removes tokens that don’t match the specified predicate script. Supports inline Painless scripts only. -`remove_duplicates` | [RemoveDuplicatesTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilter.html) | Removes duplicate tokens that are in the same position. -`reverse` | [ReverseStringFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/reverse/ReverseStringFilter.html) | Reverses the string corresponding to each token in the token stream. For example, the token `dog` becomes `god`. -`shingle` | [ShingleFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/shingle/ShingleFilter.html) | Generates shingles of lengths between `min_shingle_size` and `max_shingle_size` for tokens in the token stream. Shingles are similar to n-grams but apply to words instead of letters. For example, two-word shingles added to the list of unigrams [`contribute`, `to`, `opensearch`] are [`contribute to`, `to opensearch`]. -`snowball` | N/A | Stems words using a [Snowball-generated stemmer](https://snowballstem.org/). You can use the `snowball` token filter with the following languages in the `language` field: `Arabic`, `Armenian`, `Basque`, `Catalan`, `Danish`, `Dutch`, `English`, `Estonian`, `Finnish`, `French`, `German`, `German2`, `Hungarian`, `Irish`, `Italian`, `Kp`, `Lithuanian`, `Lovins`, `Norwegian`, `Porter`, `Portuguese`, `Romanian`, `Russian`, `Spanish`, `Swedish`, `Turkish`. 
-`stemmer` | N/A | Provides algorithmic stemming for the following languages in the `language` field: `arabic`, `armenian`, `basque`, `bengali`, `brazilian`, `bulgarian`, `catalan`, `czech`, `danish`, `dutch`, `dutch_kp`, `english`, `light_english`, `lovins`, `minimal_english`, `porter2`, `possessive_english`, `estonian`, `finnish`, `light_finnish`, `french`, `light_french`, `minimal_french`, `galician`, `minimal_galician`, `german`, `german2`, `light_german`, `minimal_german`, `greek`, `hindi`, `hungarian`, `light_hungarian`, `indonesian`, `irish`, `italian`, `light_italian`, `latvian`, `Lithuanian`, `norwegian`, `light_norwegian`, `minimal_norwegian`, `light_nynorsk`, `minimal_nynorsk`, `portuguese`, `light_portuguese`, `minimal_portuguese`, `portuguese_rslp`, `romanian`, `russian`, `light_russian`, `sorani`, `spanish`, `light_spanish`, `swedish`, `light_swedish`, `turkish`. -`stemmer_override` | N/A | Overrides stemming algorithms by applying a custom mapping so that the provided terms are not stemmed. -`stop` | [StopFilter](https://lucene.apache.org/core/8_7_0/core/org/apache/lucene/analysis/StopFilter.html) | Removes stop words from a token stream. -`synonym` | N/A | Supplies a synonym list for the analysis process. The synonym list is provided using a configuration file. -`synonym_graph` | N/A | Supplies a synonym list, including multiword synonyms, for the analysis process. -`trim` | [TrimFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/TrimFilter.html) | Trims leading and trailing white space from each token in a stream. -`truncate` | [TruncateTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.html) | Truncates tokens whose length exceeds the specified character limit. -`unique` | N/A | Ensures each token is unique by removing duplicate tokens from a stream. -`uppercase` | [UpperCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/LowerCaseFilter.html) | Converts tokens to uppercase. -`word_delimiter` | [WordDelimiterFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.html) | Splits tokens at non-alphanumeric characters and performs normalization based on the specified rules. -`word_delimiter_graph` | [WordDelimiterGraphFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.html) | Splits tokens at non-alphanumeric characters and performs normalization based on the specified rules. Assigns multi-position tokens a `positionLength` attribute. +[`dictionary_decompounder`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/dictionary-decompounder/) | [DictionaryCompoundWordTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.html) | Decomposes compound words found in many Germanic languages. +[`edge_ngram`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/edge-ngram/) | [EdgeNGramTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.html) | Tokenizes the given token into edge n-grams (n-grams that start at the beginning of the token) of lengths between `min_gram` and `max_gram`. Optionally, keeps the original token. 
+[`elision`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/elision/) | [ElisionFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/util/ElisionFilter.html) | Removes the specified [elisions](https://en.wikipedia.org/wiki/Elision) from the beginning of tokens. For example, changes `l'avion` (the plane) to `avion` (plane). +[`fingerprint`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/fingerprint/) | [FingerprintFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.html) | Sorts and deduplicates the token list and concatenates tokens into a single token. +[`flatten_graph`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/flatten-graph/) | [FlattenGraphFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/FlattenGraphFilter.html) | Flattens a token graph produced by a graph token filter, such as `synonym_graph` or `word_delimiter_graph`, making the graph suitable for indexing. +[`hunspell`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/hunspell/) | [HunspellStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/hunspell/HunspellStemFilter.html) | Uses [Hunspell](https://en.wikipedia.org/wiki/Hunspell) rules to stem tokens. Because Hunspell allows a word to have multiple stems, this filter can emit multiple tokens for each consumed token. Requires the configuration of one or more language-specific Hunspell dictionaries. +[`hyphenation_decompounder`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/hyphenation-decompounder/) | [HyphenationCompoundWordTokenFilter](https://lucene.apache.org/core/9_8_0/analysis/common/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.html) | Uses XML-based hyphenation patterns to find potential subwords in compound words and checks the subwords against the specified word list. The token output contains only the subwords found in the word list. +[`keep_types`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/keep-types/) | [TypeTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/TypeTokenFilter.html) | Keeps or removes tokens of a specific type. +[`keep_words`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/keep-words/) | [KeepWordFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.html) | Checks the tokens against the specified word list and keeps only those that are in the list. +[`keyword_marker`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/keyword-marker/) | [KeywordMarkerFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilter.html) | Marks specified tokens as keywords, preventing them from being stemmed. +[`keyword_repeat`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/keyword-repeat/) | [KeywordRepeatFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.html) | Emits each incoming token twice: once as a keyword and once as a non-keyword. +[`kstem`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/kstem/) | [KStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/en/KStemFilter.html) | Provides KStem-based stemming for the English language. Combines algorithmic stemming with a built-in dictionary. 
+[`kuromoji_completion`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/kuromoji-completion/) | [JapaneseCompletionFilter](https://lucene.apache.org/core/9_10_0/analysis/kuromoji/org/apache/lucene/analysis/ja/JapaneseCompletionFilter.html) | Adds Japanese romanized terms to a token stream (in addition to the original tokens). Usually used to support autocomplete of Japanese search terms. Note that the filter has a `mode` parameter that should be set to `index` when used in an index analyzer and `query` when used in a search analyzer. Requires the `analysis-kuromoji` plugin. For information about installing the plugin, see [Additional plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/#additional-plugins). +[`length`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/length/) | [LengthFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/LengthFilter.html) | Removes tokens that are shorter or longer than the length range specified by `min` and `max`. +[`limit`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/limit/) | [LimitTokenCountFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilter.html) | Limits the number of output tokens. For example, document field value sizes can be limited based on the token count. +[`lowercase`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/lowercase/) | [LowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/LowerCaseFilter.html) | Converts tokens to lowercase. The default [LowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/LowerCaseFilter.html) processes the English language. To process other languages, set the `language` parameter to `greek` (uses [GreekLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/el/GreekLowerCaseFilter.html)), `irish` (uses [IrishLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ga/IrishLowerCaseFilter.html)), or `turkish` (uses [TurkishLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.html)). +[`min_hash`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/min-hash/) | [MinHashFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/minhash/MinHashFilter.html) | Uses the [MinHash technique](https://en.wikipedia.org/wiki/MinHash) to estimate document similarity. Performs the following operations on a token stream sequentially:
1. Hashes each token in the stream.
2. Assigns the hashes to buckets, keeping only the smallest hashes of each bucket.
3. Outputs the smallest hash from each bucket as a token stream. +[`multiplexer`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/multiplexer/) | N/A | Emits multiple tokens at the same position. Runs each token through each of the specified filter lists separately and outputs the results as separate tokens. +[`ngram`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/ngram/) | [NGramTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ngram/NGramTokenFilter.html) | Tokenizes the given token into n-grams of lengths between `min_gram` and `max_gram`. +[Normalization]({{site.url}}{{site.baseurl}}/analyzers/token-filters/normalization/) | `arabic_normalization`: [ArabicNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ar/ArabicNormalizer.html)
`german_normalization`: [GermanNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/de/GermanNormalizationFilter.html)
`hindi_normalization`: [HindiNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/hi/HindiNormalizer.html)
`indic_normalization`: [IndicNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/in/IndicNormalizer.html)
`sorani_normalization`: [SoraniNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ckb/SoraniNormalizer.html)
`persian_normalization`: [PersianNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/fa/PersianNormalizer.html)
`scandinavian_normalization`: [ScandinavianNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.html)
`scandinavian_folding`: [ScandinavianFoldingFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ScandinavianFoldingFilter.html)
`serbian_normalization`: [SerbianNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/sr/SerbianNormalizationFilter.html) | Normalizes the characters of one of the listed languages. +[`pattern_capture`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/pattern-capture/) | N/A | Generates a token for every capture group in the provided regular expression. Uses [Java regular expression syntax](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). +[`pattern_replace`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/pattern-replace/) | N/A | Matches a pattern in the provided regular expression and replaces matching substrings. Uses [Java regular expression syntax](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). +[`phonetic`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/phonetic/) | N/A | Uses a phonetic encoder to emit a metaphone token for each token in the token stream. Requires installing the `analysis-phonetic` plugin. +[`porter_stem`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/porter-stem/) | [PorterStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/en/PorterStemFilter.html) | Uses the [Porter stemming algorithm](https://tartarus.org/martin/PorterStemmer/) to perform algorithmic stemming for the English language. +[`predicate_token_filter`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/predicate-token-filter/) | N/A | Removes tokens that do not match the specified predicate script. Supports only inline Painless scripts. +[`remove_duplicates`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/remove-duplicates/) | [RemoveDuplicatesTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilter.html) | Removes duplicate tokens that are in the same position. +[`reverse`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/reverse/) | [ReverseStringFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/reverse/ReverseStringFilter.html) | Reverses the string corresponding to each token in the token stream. For example, the token `dog` becomes `god`. +[`shingle`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/shingle/) | [ShingleFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/shingle/ShingleFilter.html) | Generates shingles of lengths between `min_shingle_size` and `max_shingle_size` for tokens in the token stream. Shingles are similar to n-grams but are generated using words instead of letters. For example, two-word shingles added to the list of unigrams [`contribute`, `to`, `opensearch`] are [`contribute to`, `to opensearch`]. +[`snowball`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/snowball/) | N/A | Stems words using a [Snowball-generated stemmer](https://snowballstem.org/). The `snowball` token filter supports using the following languages in the `language` field: `Arabic`, `Armenian`, `Basque`, `Catalan`, `Danish`, `Dutch`, `English`, `Estonian`, `Finnish`, `French`, `German`, `German2`, `Hungarian`, `Irish`, `Italian`, `Kp`, `Lithuanian`, `Lovins`, `Norwegian`, `Porter`, `Portuguese`, `Romanian`, `Russian`, `Spanish`, `Swedish`, `Turkish`. 
+[`stemmer`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/stemmer/) | N/A | Provides algorithmic stemming for the following languages used in the `language` field: `arabic`, `armenian`, `basque`, `bengali`, `brazilian`, `bulgarian`, `catalan`, `czech`, `danish`, `dutch`, `dutch_kp`, `english`, `light_english`, `lovins`, `minimal_english`, `porter2`, `possessive_english`, `estonian`, `finnish`, `light_finnish`, `french`, `light_french`, `minimal_french`, `galician`, `minimal_galician`, `german`, `german2`, `light_german`, `minimal_german`, `greek`, `hindi`, `hungarian`, `light_hungarian`, `indonesian`, `irish`, `italian`, `light_italian`, `latvian`, `Lithuanian`, `norwegian`, `light_norwegian`, `minimal_norwegian`, `light_nynorsk`, `minimal_nynorsk`, `portuguese`, `light_portuguese`, `minimal_portuguese`, `portuguese_rslp`, `romanian`, `russian`, `light_russian`, `sorani`, `spanish`, `light_spanish`, `swedish`, `light_swedish`, `turkish`. +[`stemmer_override`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/stemmer-override/) | N/A | Overrides stemming algorithms by applying a custom mapping so that the provided terms are not stemmed. +[`stop`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/stop/) | [StopFilter](https://lucene.apache.org/core/8_7_0/core/org/apache/lucene/analysis/StopFilter.html) | Removes stop words from a token stream. +[`synonym`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/synonym/) | N/A | Supplies a synonym list for the analysis process. The synonym list is provided using a configuration file. +[`synonym_graph`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/synonym-graph/) | N/A | Supplies a synonym list, including multiword synonyms, for the analysis process. +[`trim`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/trim/) | [TrimFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/TrimFilter.html) | Trims leading and trailing white space characters from each token in a stream. +[`truncate`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/truncate/) | [TruncateTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.html) | Truncates tokens with lengths exceeding the specified character limit. +[`unique`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/unique/) | N/A | Ensures that each token is unique by removing duplicate tokens from a stream. +[`uppercase`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/uppercase/) | [UpperCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/LowerCaseFilter.html) | Converts tokens to uppercase. +[`word_delimiter`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/word-delimiter/) | [WordDelimiterFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.html) | Splits tokens on non-alphanumeric characters and performs normalization based on the specified rules. +[`word_delimiter_graph`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/word-delimiter-graph/) | [WordDelimiterGraphFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.html) | Splits tokens on non-alphanumeric characters and performs normalization based on the specified rules. Assigns a `positionLength` attribute to multi-position tokens. 
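Many of the filters in this table can be chained in a single custom analyzer. As a quick, illustrative sketch (the index, analyzer, and custom filter names are placeholders, not part of the reference above), the following request combines the `lowercase`, `stop`, and `snowball` filters:

```json
PUT /filter_chain_example
{
  "settings": {
    "analysis": {
      "filter": {
        "english_snowball": {
          "type": "snowball",
          "language": "English"
        }
      },
      "analyzer": {
        "chained_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["lowercase", "stop", "english_snowball"]
        }
      }
    }
  }
}
```
{% include copy-curl.html %}

Token filters run in the order listed, so tokens are lowercased before stop word removal and stemming.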
diff --git a/_analyzers/token-filters/keep-types.md b/_analyzers/token-filters/keep-types.md new file mode 100644 index 00000000000..59e617f5670 --- /dev/null +++ b/_analyzers/token-filters/keep-types.md @@ -0,0 +1,115 @@ +--- +layout: default +title: Keep types +parent: Token filters +nav_order: 180 +--- + +# Keep types token filter + +The `keep_types` token filter is a type of token filter used in text analysis to control which token types are kept or discarded. Different tokenizers produce different token types, for example, ``, ``, or ``. + +The `keyword`, `simple_pattern`, and `simple_pattern_split` tokenizers do not support the `keep_types` token filter because these tokenizers do not support token type attributes. +{: .note} + +## Parameters + +The `keep_types` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`types` | Required | List of strings | List of token types to be kept or discarded (determined by the `mode`). +`mode`| Optional | String | Whether to `include` or `exclude` the token types specified in `types`. Default is `include`. + + +## Example + +The following example request creates a new index named `test_index` and configures an analyzer with a `keep_types` filter: + +```json +PUT /test_index +{ + "settings": { + "analysis": { + "analyzer": { + "custom_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase", "keep_types_filter"] + } + }, + "filter": { + "keep_types_filter": { + "type": "keep_types", + "types": [""] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /test_index/_analyze +{ + "analyzer": "custom_analyzer", + "text": "Hello 2 world! This is an example." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "hello", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "world", + "start_offset": 8, + "end_offset": 13, + "type": "", + "position": 2 + }, + { + "token": "this", + "start_offset": 15, + "end_offset": 19, + "type": "", + "position": 3 + }, + { + "token": "is", + "start_offset": 20, + "end_offset": 22, + "type": "", + "position": 4 + }, + { + "token": "an", + "start_offset": 23, + "end_offset": 25, + "type": "", + "position": 5 + }, + { + "token": "example", + "start_offset": 26, + "end_offset": 33, + "type": "", + "position": 6 + } + ] +} +``` diff --git a/_analyzers/token-filters/keep-words.md b/_analyzers/token-filters/keep-words.md new file mode 100644 index 00000000000..4a6b199e5c8 --- /dev/null +++ b/_analyzers/token-filters/keep-words.md @@ -0,0 +1,92 @@ +--- +layout: default +title: Keep words +parent: Token filters +nav_order: 190 +--- + +# Keep words token filter + +The `keep_words` token filter is designed to keep only certain words during the analysis process. This filter is useful if you have a large body of text but are only interested in certain keywords or terms. + +## Parameters + +The `keep_words` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`keep_words` | Required if `keep_words_path` is not configured | List of strings | The list of words to keep. 
+`keep_words_path` | Required if `keep_words` is not configured | String | The path to the file containing the list of words to keep. +`keep_words_case` | Optional | Boolean | Whether to lowercase all words during comparison. Default is `false`. + + +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `keep_words` filter: + +```json +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "custom_keep_word": { + "tokenizer": "standard", + "filter": [ "keep_words_filter" ] + } + }, + "filter": { + "keep_words_filter": { + "type": "keep", + "keep_words": ["example", "world", "opensearch"], + "keep_words_case": true + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my_index/_analyze +{ + "analyzer": "custom_keep_word", + "text": "Hello, world! This is an OpenSearch example." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "world", + "start_offset": 7, + "end_offset": 12, + "type": "", + "position": 1 + }, + { + "token": "OpenSearch", + "start_offset": 25, + "end_offset": 35, + "type": "", + "position": 5 + }, + { + "token": "example", + "start_offset": 36, + "end_offset": 43, + "type": "", + "position": 6 + } + ] +} +``` diff --git a/_analyzers/token-filters/keyword-marker.md b/_analyzers/token-filters/keyword-marker.md new file mode 100644 index 00000000000..0ec2cb96f59 --- /dev/null +++ b/_analyzers/token-filters/keyword-marker.md @@ -0,0 +1,127 @@ +--- +layout: default +title: Keyword marker +parent: Token filters +nav_order: 200 +--- + +# Keyword marker token filter + +The `keyword_marker` token filter is used to prevent certain tokens from being altered by stemmers or other filters. The `keyword_marker` token filter does this by marking the specified tokens as `keywords`, which prevents any stemming or other processing. This ensures that specific words remain in their original form. + +## Parameters + +The `keyword_marker` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`ignore_case` | Optional | Boolean | Whether to ignore the letter case when matching keywords. Default is `false`. +`keywords` | Required if either `keywords_path` or `keywords_pattern` is not set | List of strings | The list of tokens to mark as keywords. +`keywords_path` | Required if either `keywords` or `keywords_pattern` is not set | String | The path (relative to the `config` directory or absolute) to the list of keywords. +`keywords_pattern` | Required if either `keywords` or `keywords_path` is not set | String | A [regular expression](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html) used for matching tokens to be marked as keywords. + + +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `keyword_marker` filter. 
The filter marks the word `example` as a keyword: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "analyzer": { + "custom_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase", "keyword_marker_filter", "stemmer"] + } + }, + "filter": { + "keyword_marker_filter": { + "type": "keyword_marker", + "keywords": ["example"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my_index/_analyze +{ + "analyzer": "custom_analyzer", + "text": "Favorite example" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens. Note that while the word `favorite` was stemmed, the word `example` was not stemmed because it was marked as a keyword: + +```json +{ + "tokens": [ + { + "token": "favorit", + "start_offset": 0, + "end_offset": 8, + "type": "", + "position": 0 + }, + { + "token": "example", + "start_offset": 9, + "end_offset": 16, + "type": "", + "position": 1 + } + ] +} +``` + +You can further examine the impact of the `keyword_marker` token filter by adding the following parameters to the `_analyze` query: + +```json +GET /my_index/_analyze +{ + "analyzer": "custom_analyzer", + "text": "This is an OpenSearch example demonstrating keyword marker.", + "explain": true, + "attributes": "keyword" +} +``` +{% include copy-curl.html %} + +This will produce additional details in the response similar to the following: + +```json +{ + "name": "porter_stem", + "tokens": [ + ... + { + "token": "example", + "start_offset": 22, + "end_offset": 29, + "type": "", + "position": 4, + "keyword": true + }, + { + "token": "demonstr", + "start_offset": 30, + "end_offset": 43, + "type": "", + "position": 5, + "keyword": false + }, + ... + ] +} +``` diff --git a/_analyzers/token-filters/keyword-repeat.md b/_analyzers/token-filters/keyword-repeat.md new file mode 100644 index 00000000000..5ba15a037c1 --- /dev/null +++ b/_analyzers/token-filters/keyword-repeat.md @@ -0,0 +1,160 @@ +--- +layout: default +title: Keyword repeat +parent: Token filters +nav_order: 210 +--- + +# Keyword repeat token filter + +The `keyword_repeat` token filter emits the keyword version of a token into a token stream. This filter is typically used when you want to retain both the original token and its modified version after further token transformations, such as stemming or synonym expansion. The duplicated tokens allow the original, unchanged version of the token to remain in the final analysis alongside the modified versions. + +The `keyword_repeat` token filter should be placed before stemming filters. Stemming is not applied to every token, thus you may have duplicate tokens in the same position after stemming. To remove duplicate tokens, use the `remove_duplicates` token filter after the stemmer. 
+{: .note} + + +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `keyword_repeat` filter: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "filter": { + "my_kstem": { + "type": "kstem" + }, + "my_lowercase": { + "type": "lowercase" + } + }, + "analyzer": { + "my_custom_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "my_lowercase", + "keyword_repeat", + "my_kstem" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_custom_analyzer", + "text": "Stopped quickly" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "stopped", + "start_offset": 0, + "end_offset": 7, + "type": "", + "position": 0 + }, + { + "token": "stop", + "start_offset": 0, + "end_offset": 7, + "type": "", + "position": 0 + }, + { + "token": "quickly", + "start_offset": 8, + "end_offset": 15, + "type": "", + "position": 1 + }, + { + "token": "quick", + "start_offset": 8, + "end_offset": 15, + "type": "", + "position": 1 + } + ] +} +``` + +You can further examine the impact of the `keyword_repeat` token filter by adding the following parameters to the `_analyze` query: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_custom_analyzer", + "text": "Stopped quickly", + "explain": true, + "attributes": "keyword" +} +``` +{% include copy-curl.html %} + +The response includes detailed information, such as tokenization, filtering, and the application of specific token filters: + +```json +{ + "detail": { + "custom_analyzer": true, + "charfilters": [], + "tokenizer": { + "name": "standard", + "tokens": [ + {"token": "OpenSearch","start_offset": 0,"end_offset": 10,"type": "","position": 0}, + {"token": "helped","start_offset": 11,"end_offset": 17,"type": "","position": 1}, + {"token": "many","start_offset": 18,"end_offset": 22,"type": "","position": 2}, + {"token": "employers","start_offset": 23,"end_offset": 32,"type": "","position": 3} + ] + }, + "tokenfilters": [ + { + "name": "lowercase", + "tokens": [ + {"token": "opensearch","start_offset": 0,"end_offset": 10,"type": "","position": 0}, + {"token": "helped","start_offset": 11,"end_offset": 17,"type": "","position": 1}, + {"token": "many","start_offset": 18,"end_offset": 22,"type": "","position": 2}, + {"token": "employers","start_offset": 23,"end_offset": 32,"type": "","position": 3} + ] + }, + { + "name": "keyword_marker_filter", + "tokens": [ + {"token": "opensearch","start_offset": 0,"end_offset": 10,"type": "","position": 0,"keyword": true}, + {"token": "helped","start_offset": 11,"end_offset": 17,"type": "","position": 1,"keyword": false}, + {"token": "many","start_offset": 18,"end_offset": 22,"type": "","position": 2,"keyword": false}, + {"token": "employers","start_offset": 23,"end_offset": 32,"type": "","position": 3,"keyword": false} + ] + }, + { + "name": "kstem_filter", + "tokens": [ + {"token": "opensearch","start_offset": 0,"end_offset": 10,"type": "","position": 0,"keyword": true}, + {"token": "help","start_offset": 11,"end_offset": 17,"type": "","position": 1,"keyword": false}, + {"token": "many","start_offset": 18,"end_offset": 22,"type": "","position": 2,"keyword": false}, + {"token": "employer","start_offset": 23,"end_offset": 32,"type": "","position": 3,"keyword": false} + ] + } + ] + } +} +``` \ No newline at 
end of file diff --git a/_analyzers/token-filters/kstem.md b/_analyzers/token-filters/kstem.md new file mode 100644 index 00000000000..d13fd2c6755 --- /dev/null +++ b/_analyzers/token-filters/kstem.md @@ -0,0 +1,92 @@ +--- +layout: default +title: KStem +parent: Token filters +nav_order: 220 +--- + +# KStem token filter + +The `kstem` token filter is a stemming filter used to reduce words to their root forms. The filter is a lightweight algorithmic stemmer designed for the English language that performs the following stemming operations: + +- Reduces plurals to their singular form. +- Converts different verb tenses to their base form. +- Removes common derivational endings, such as "-ing" or "-ed". + +The `kstem` token filter is equivalent to the a `stemmer` filter configured with a `light_english` language. It provides a more conservative stemming compared to other stemming filters like `porter_stem`. + +The `kstem` token filter is based on the Lucene KStemFilter. For more information, see the [Lucene documentation](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/en/KStemFilter.html). + +## Example + +The following example request creates a new index named `my_kstem_index` and configures an analyzer with a `kstem` filter: + +```json +PUT /my_kstem_index +{ + "settings": { + "analysis": { + "filter": { + "kstem_filter": { + "type": "kstem" + } + }, + "analyzer": { + "my_kstem_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "kstem_filter" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "my_kstem_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_kstem_index/_analyze +{ + "analyzer": "my_kstem_analyzer", + "text": "stops stopped" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "stop", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "stop", + "start_offset": 6, + "end_offset": 13, + "type": "", + "position": 1 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/kuromoji-completion.md b/_analyzers/token-filters/kuromoji-completion.md new file mode 100644 index 00000000000..24833e92e1e --- /dev/null +++ b/_analyzers/token-filters/kuromoji-completion.md @@ -0,0 +1,127 @@ +--- +layout: default +title: Kuromoji completion +parent: Token filters +nav_order: 230 +--- + +# Kuromoji completion token filter + +The `kuromoji_completion` token filter is used to stem Katakana words in Japanese, which are often used to represent foreign words or loanwords. This filter is especially useful for autocompletion or suggest queries, in which partial matches on Katakana words can be expanded to include their full forms. + +To use this token filter, you must first install the `analysis-kuromoji` plugin on all nodes by running `bin/opensearch-plugin install analysis-kuromoji` and then restart the cluster. For more information about installing additional plugins, see [Additional plugins]({{site.url}}{{site.baseurl}}/install-and-configure/additional-plugins/index/). 
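The table entry for this filter also mentions a `mode` parameter, which should be set to `index` in an index analyzer and `query` in a search analyzer. A minimal sketch of configuring both variants (the index, analyzer, and filter names are illustrative) might look like the following:

```json
PUT /kuromoji_completion_modes
{
  "settings": {
    "index": {
      "analysis": {
        "analyzer": {
          "completion_index_analyzer": {
            "tokenizer": "kuromoji_tokenizer",
            "filter": ["completion_index_filter"]
          },
          "completion_query_analyzer": {
            "tokenizer": "kuromoji_tokenizer",
            "filter": ["completion_query_filter"]
          }
        },
        "filter": {
          "completion_index_filter": {
            "type": "kuromoji_completion",
            "mode": "index"
          },
          "completion_query_filter": {
            "type": "kuromoji_completion",
            "mode": "query"
          }
        }
      }
    }
  }
}
```
{% include copy-curl.html %}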
+ +## Example + +The following example request creates a new index named `kuromoji_sample` and configures an analyzer with a `kuromoji_completion` filter: + +```json +PUT kuromoji_sample +{ + "settings": { + "index": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "kuromoji_tokenizer", + "filter": [ + "my_katakana_stemmer" + ] + } + }, + "filter": { + "my_katakana_stemmer": { + "type": "kuromoji_completion" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer with text that translates to "use a computer": + +```json +POST /kuromoji_sample/_analyze +{ + "analyzer": "my_analyzer", + "text": "コンピューターを使う" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "コンピューター", // The original Katakana word "computer". + "start_offset": 0, + "end_offset": 7, + "type": "word", + "position": 0 + }, + { + "token": "konpyuーtaー", // Romanized version (Romaji) of "コンピューター". + "start_offset": 0, + "end_offset": 7, + "type": "word", + "position": 0 + }, + { + "token": "konnpyuーtaー", // Another possible romanized version of "コンピューター" (with a slight variation in the spelling). + "start_offset": 0, + "end_offset": 7, + "type": "word", + "position": 0 + }, + { + "token": "を", // A Japanese particle, "wo" or "o" + "start_offset": 7, + "end_offset": 8, + "type": "word", + "position": 1 + }, + { + "token": "wo", // Romanized form of the particle "を" (often pronounced as "o"). + "start_offset": 7, + "end_offset": 8, + "type": "word", + "position": 1 + }, + { + "token": "o", // Another version of the romanization. + "start_offset": 7, + "end_offset": 8, + "type": "word", + "position": 1 + }, + { + "token": "使う", // The verb "use" in Kanji. + "start_offset": 8, + "end_offset": 10, + "type": "word", + "position": 2 + }, + { + "token": "tukau", // Romanized version of "使う" + "start_offset": 8, + "end_offset": 10, + "type": "word", + "position": 2 + }, + { + "token": "tsukau", // Another romanized version of "使う", where "tsu" is more phonetically correct + "start_offset": 8, + "end_offset": 10, + "type": "word", + "position": 2 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/length.md b/_analyzers/token-filters/length.md new file mode 100644 index 00000000000..f6c5dcc7062 --- /dev/null +++ b/_analyzers/token-filters/length.md @@ -0,0 +1,91 @@ +--- +layout: default +title: Length +parent: Token filters +nav_order: 240 +--- + +# Length token filter + +The `length` token filter is used to remove tokens that don't meet specified length criteria (minimum and maximum values) from the token stream. + +## Parameters + +The `length` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`min` | Optional | Integer | The minimum token length. Default is `0`. +`max` | Optional | Integer | The maximum token length. Default is `Integer.MAX_VALUE` (`2147483647`). 
+ + +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `length` filter: + +```json +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "only_keep_4_to_10_characters": { + "tokenizer": "whitespace", + "filter": [ "length_4_to_10" ] + } + }, + "filter": { + "length_4_to_10": { + "type": "length", + "min": 4, + "max": 10 + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my_index/_analyze +{ + "analyzer": "only_keep_4_to_10_characters", + "text": "OpenSearch is a great tool!" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "OpenSearch", + "start_offset": 0, + "end_offset": 10, + "type": "word", + "position": 0 + }, + { + "token": "great", + "start_offset": 16, + "end_offset": 21, + "type": "word", + "position": 3 + }, + { + "token": "tool!", + "start_offset": 22, + "end_offset": 27, + "type": "word", + "position": 4 + } + ] +} +``` diff --git a/_analyzers/token-filters/limit.md b/_analyzers/token-filters/limit.md new file mode 100644 index 00000000000..a849f5f06b0 --- /dev/null +++ b/_analyzers/token-filters/limit.md @@ -0,0 +1,89 @@ +--- +layout: default +title: Limit +parent: Token filters +nav_order: 250 +--- + +# Limit token filter + +The `limit` token filter is used to limit the number of tokens passed through the analysis chain. + +## Parameters + +The `limit` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`max_token_count` | Optional | Integer | The maximum number of tokens to be generated. Default is `1`. +`consume_all_tokens` | Optional | Boolean | (Expert-level setting) Uses all tokens from the tokenizer, even if the result exceeds `max_token_count`. When this parameter is set, the output still only contains the number of tokens specified by `max_token_count`. However, all tokens generated by the tokenizer are processed. Default is `false`. + +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `limit` filter: + +```json +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "three_token_limit": { + "tokenizer": "standard", + "filter": [ "custom_token_limit" ] + } + }, + "filter": { + "custom_token_limit": { + "type": "limit", + "max_token_count": 3 + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my_index/_analyze +{ + "analyzer": "three_token_limit", + "text": "OpenSearch is a powerful and flexible search engine." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "OpenSearch", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0 + }, + { + "token": "is", + "start_offset": 11, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "a", + "start_offset": 14, + "end_offset": 15, + "type": "", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/lowercase.md b/_analyzers/token-filters/lowercase.md new file mode 100644 index 00000000000..89f0f219fa3 --- /dev/null +++ b/_analyzers/token-filters/lowercase.md @@ -0,0 +1,82 @@ +--- +layout: default +title: Lowercase +parent: Token filters +nav_order: 260 +--- + +# Lowercase token filter + +The `lowercase` token filter is used to convert all characters in the token stream to lowercase, making searches case insensitive. + +## Parameters + +The `lowercase` token filter can be configured with the following parameter. + +Parameter | Required/Optional | Description +:--- | :--- | :--- + `language` | Optional | Specifies a language-specific token filter. Valid values are:
- [`greek`](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/el/GreekLowerCaseFilter.html)
- [`irish`](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/ga/IrishLowerCaseFilter.html)
- [`turkish`](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.html).
Default is the [Lucene LowerCaseFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/core/LowerCaseFilter.html). + +## Example + +The following example request creates a new index named `custom_lowercase_example`. It configures an analyzer with a `lowercase` filter and specifies `greek` as the `language`: + +```json +PUT /custom_lowercase_example +{ + "settings": { + "analysis": { + "analyzer": { + "greek_lowercase_example": { + "type": "custom", + "tokenizer": "standard", + "filter": ["greek_lowercase"] + } + }, + "filter": { + "greek_lowercase": { + "type": "lowercase", + "language": "greek" + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /custom_lowercase_example/_analyze +{ + "analyzer": "greek_lowercase_example", + "text": "Αθήνα ΕΛΛΑΔΑ" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "αθηνα", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "ελλαδα", + "start_offset": 6, + "end_offset": 12, + "type": "", + "position": 1 + } + ] +} +``` diff --git a/_analyzers/token-filters/min-hash.md b/_analyzers/token-filters/min-hash.md new file mode 100644 index 00000000000..e4f1a8da912 --- /dev/null +++ b/_analyzers/token-filters/min-hash.md @@ -0,0 +1,138 @@ +--- +layout: default +title: Min hash +parent: Token filters +nav_order: 270 +--- + +# Min hash token filter + +The `min_hash` token filter is used to generate hashes for tokens based on a [MinHash](https://en.wikipedia.org/wiki/MinHash) approximation algorithm, which is useful for detecting similarity between documents. The `min_hash` token filter generates hashes for a set of tokens (typically from an analyzed field). + +## Parameters + +The `min_hash` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`hash_count` | Optional | Integer | The number of hash values to generate for each token. Increasing this value generally improves the accuracy of similarity estimation but increases the computational cost. Default is `1`. +`bucket_count` | Optional | Integer | The number of hash buckets to use. This affects the granularity of the hashing. A larger number of buckets provides finer granularity and reduces hash collisions but requires more memory. Default is `512`. +`hash_set_size` | Optional | Integer | The number of hashes to retain in each bucket. This can influence the hashing quality. Larger set sizes may lead to better similarity detection but consume more memory. Default is `1`. +`with_rotation` | Optional | Boolean | When set to `true`, the filter populates empty buckets with the value from the first non-empty bucket found to its circular right, provided that the `hash_set_size` is `1`. If the `bucket_count` argument exceeds `1`, this setting automatically defaults to `true`; otherwise, it defaults to `false`. 
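To make the roles of `hash_count`, `bucket_count`, and `hash_set_size` more concrete, the following standalone Python sketch loosely imitates the steps this filter performs: hashing each token, assigning hashes to buckets, and keeping only the smallest hashes in each bucket. It is a toy illustration, not the Lucene implementation, and the parameter values are arbitrary:

```python
import hashlib

def min_hash_signature(tokens, hash_count=3, bucket_count=8, hash_set_size=1):
    """Toy MinHash: keep the smallest `hash_set_size` hashes per bucket for each hash function."""
    buckets = [[[] for _ in range(bucket_count)] for _ in range(hash_count)]
    for token in tokens:
        for i in range(hash_count):
            # Simulate hash_count different hash functions by salting the token
            value = int(hashlib.md5(f"{i}:{token}".encode()).hexdigest(), 16)
            bucket = buckets[i][value % bucket_count]
            bucket.append(value)
            bucket.sort()
            del bucket[hash_set_size:]  # retain only the smallest hashes
    # The retained minimum hashes form the signature
    return [h for per_hash in buckets for bucket in per_hash for h in bucket]

sig_a = set(min_hash_signature("opensearch is a powerful search engine".split()))
sig_b = set(min_hash_signature("opensearch is a very powerful search engine".split()))
print(len(sig_a & sig_b) / len(sig_a | sig_b))  # rough Jaccard similarity estimate
```

Similar inputs share many of their retained minimum hashes, which is what makes the emitted hash tokens useful for similarity estimation.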
+ +## Example + +The following example request creates a new index named `minhash_index` and configures an analyzer with a `min_hash` filter: + +```json +PUT /minhash_index +{ + "settings": { + "analysis": { + "filter": { + "minhash_filter": { + "type": "min_hash", + "hash_count": 3, + "bucket_count": 512, + "hash_set_size": 1, + "with_rotation": false + } + }, + "analyzer": { + "minhash_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "minhash_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /minhash_index/_analyze +{ + "analyzer": "minhash_analyzer", + "text": "OpenSearch is very powerful." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens (the tokens are not human readable because they represent hashes): + +```json +{ + "tokens" : [ + { + "token" : "\u0000\u0000㳠锯ੲ걌䐩䉵", + "start_offset" : 0, + "end_offset" : 27, + "type" : "MIN_HASH", + "position" : 0 + }, + { + "token" : "\u0000\u0000㳠锯ੲ걌䐩䉵", + "start_offset" : 0, + "end_offset" : 27, + "type" : "MIN_HASH", + "position" : 0 + }, + ... +``` + +In order to demonstrate the usefulness of the `min_hash` token filter, you can use the following Python script to compare the two strings using the previously created analyzer: + +```python +from opensearchpy import OpenSearch +from requests.auth import HTTPBasicAuth + +# Initialize the OpenSearch client with authentication +host = 'https://localhost:9200' # Update if using a different host/port +auth = ('admin', 'admin') # Username and password + +# Create the OpenSearch client with SSL verification turned off +client = OpenSearch( + hosts=[host], + http_auth=auth, + use_ssl=True, + verify_certs=False, # Disable SSL certificate validation + ssl_show_warn=False # Suppress SSL warnings in the output +) + +# Analyzes text and returns the minhash tokens +def analyze_text(index, text): + response = client.indices.analyze( + index=index, + body={ + "analyzer": "minhash_analyzer", + "text": text + } + ) + return [token['token'] for token in response['tokens']] + +# Analyze two similar texts +tokens_1 = analyze_text('minhash_index', 'OpenSearch is a powerful search engine.') +tokens_2 = analyze_text('minhash_index', 'OpenSearch is a very powerful search engine.') + +# Calculate Jaccard similarity +set_1 = set(tokens_1) +set_2 = set(tokens_2) +shared_tokens = set_1.intersection(set_2) +jaccard_similarity = len(shared_tokens) / len(set_1.union(set_2)) + +print(f"Jaccard Similarity: {jaccard_similarity}") +``` + +The response should contain the Jaccard similarity score: + +```yaml +Jaccard Similarity: 0.8571428571428571 +``` \ No newline at end of file diff --git a/_analyzers/token-filters/multiplexer.md b/_analyzers/token-filters/multiplexer.md new file mode 100644 index 00000000000..21597b7fc17 --- /dev/null +++ b/_analyzers/token-filters/multiplexer.md @@ -0,0 +1,165 @@ +--- +layout: default +title: Multiplexer +parent: Token filters +nav_order: 280 +--- + +# Multiplexer token filter + +The `multiplexer` token filter allows you to create multiple versions of the same token by applying different filters. This is useful when you want to analyze the same token in multiple ways. For example, you may want to analyze a token using different stemming, synonyms, or n-gram filters and use all of the generated tokens together. 
This token filter works by duplicating the token stream and applying different filters to each copy. + +The `multiplexer` token filter removes duplicate tokens from the token stream. +{: .important} + +The `multiplexer` token filter does not support multiword `synonym` or `synonym_graph` token filters or `shingle` token filters because they need to analyze not only the current token but also upcoming tokens in order to determine how to transform the input correctly. +{: .important} + +## Parameters + +The `multiplexer` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`filters` | Optional | List of strings | A comma-separated list of token filters to apply to each copy of the token stream. Default is an empty list. +`preserve_original` | Optional | Boolean | Whether to keep the original token as one of the outputs. Default is `true`. + +## Example + +The following example request creates a new index named `multiplexer_index` and configures an analyzer with a `multiplexer` filter: + +```json +PUT /multiplexer_index +{ + "settings": { + "analysis": { + "filter": { + "english_stemmer": { + "type": "stemmer", + "name": "english" + }, + "synonym_filter": { + "type": "synonym", + "synonyms": [ + "quick,fast" + ] + }, + "multiplexer_filter": { + "type": "multiplexer", + "filters": ["english_stemmer", "synonym_filter"], + "preserve_original": true + } + }, + "analyzer": { + "multiplexer_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "multiplexer_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /multiplexer_index/_analyze +{ + "analyzer": "multiplexer_analyzer", + "text": "The slow turtle hides from the quick dog" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "The", + "start_offset": 0, + "end_offset": 3, + "type": "", + "position": 0 + }, + { + "token": "slow", + "start_offset": 4, + "end_offset": 8, + "type": "", + "position": 1 + }, + { + "token": "turtle", + "start_offset": 9, + "end_offset": 15, + "type": "", + "position": 2 + }, + { + "token": "turtl", + "start_offset": 9, + "end_offset": 15, + "type": "", + "position": 2 + }, + { + "token": "hides", + "start_offset": 16, + "end_offset": 21, + "type": "", + "position": 3 + }, + { + "token": "hide", + "start_offset": 16, + "end_offset": 21, + "type": "", + "position": 3 + }, + { + "token": "from", + "start_offset": 22, + "end_offset": 26, + "type": "", + "position": 4 + }, + { + "token": "the", + "start_offset": 27, + "end_offset": 30, + "type": "", + "position": 5 + }, + { + "token": "quick", + "start_offset": 31, + "end_offset": 36, + "type": "", + "position": 6 + }, + { + "token": "fast", + "start_offset": 31, + "end_offset": 36, + "type": "SYNONYM", + "position": 6 + }, + { + "token": "dog", + "start_offset": 37, + "end_offset": 40, + "type": "", + "position": 7 + } + ] +} +``` diff --git a/_analyzers/token-filters/ngram.md b/_analyzers/token-filters/ngram.md new file mode 100644 index 00000000000..c029eac26e6 --- /dev/null +++ b/_analyzers/token-filters/ngram.md @@ -0,0 +1,137 @@ +--- +layout: default +title: N-gram +parent: Token filters +nav_order: 290 +--- + +# N-gram token filter + +The `ngram` token filter is a powerful tool used to break down text into smaller components, known as _n-grams_, 
which can improve partial matching and fuzzy search capabilities. It works by splitting a token into smaller substrings of defined lengths. These filters are commonly used in search applications to support autocomplete, partial matches, and typo-tolerant search. For more information, see [Autocomplete functionality]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/autocomplete/) and [Did-you-mean]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/did-you-mean/). + +## Parameters + +The `ngram` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`min_gram` | Optional | Integer | The minimum length of the n-grams. Default is `1`. +`max_gram` | Optional | Integer | The maximum length of the n-grams. Default is `2`. +`preserve_original` | Optional | Boolean | Whether to keep the original token as one of the outputs. Default is `false`. + +## Example + +The following example request creates a new index named `ngram_example_index` and configures an analyzer with an `ngram` filter: + +```json +PUT /ngram_example_index +{ + "settings": { + "analysis": { + "filter": { + "ngram_filter": { + "type": "ngram", + "min_gram": 2, + "max_gram": 3 + } + }, + "analyzer": { + "ngram_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "ngram_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /ngram_example_index/_analyze +{ + "analyzer": "ngram_analyzer", + "text": "Search" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "se", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "sea", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "ea", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "ear", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "ar", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "arc", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "rc", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "rch", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "ch", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + } + ] +} +``` diff --git a/_analyzers/token-filters/normalization.md b/_analyzers/token-filters/normalization.md new file mode 100644 index 00000000000..1be08e65c2e --- /dev/null +++ b/_analyzers/token-filters/normalization.md @@ -0,0 +1,88 @@ +--- +layout: default +title: Normalization +parent: Token filters +nav_order: 300 +--- + +# Normalization token filter + +The `normalization` token filter is designed to adjust and simplify text in a way that reduces variations, particularly variations in special characters. It is primarily used to handle variations in writing by standardizing characters in specific languages. 
+ +The following `normalization` token filters are available: + +- [arabic_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/ar/ArabicNormalizer.html) +- [german_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/de/GermanNormalizationFilter.html) +- [hindi_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/hi/HindiNormalizer.html) +- [indic_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/in/IndicNormalizer.html) +- [sorani_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/ckb/SoraniNormalizer.html) +- [persian_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/fa/PersianNormalizer.html) +- [scandinavian_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.html) +- [scandinavian_folding](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/ScandinavianFoldingFilter.html) +- [serbian_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/sr/SerbianNormalizationFilter.html) + + +## Example + +The following example request creates a new index named `german_normalizer_example` and configures an analyzer with a `german_normalization` filter: + +```json +PUT /german_normalizer_example +{ + "settings": { + "analysis": { + "filter": { + "german_normalizer": { + "type": "german_normalization" + } + }, + "analyzer": { + "german_normalizer_analyzer": { + "tokenizer": "standard", + "filter": [ + "lowercase", + "german_normalizer" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /german_normalizer_example/_analyze +{ + "text": "Straße München", + "analyzer": "german_normalizer_analyzer" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "strasse", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "munchen", + "start_offset": 7, + "end_offset": 14, + "type": "", + "position": 1 + } + ] +} +``` diff --git a/_analyzers/token-filters/pattern-capture.md b/_analyzers/token-filters/pattern-capture.md new file mode 100644 index 00000000000..cff36b583d6 --- /dev/null +++ b/_analyzers/token-filters/pattern-capture.md @@ -0,0 +1,97 @@ +--- +layout: default +title: Pattern capture +parent: Token filters +nav_order: 310 +--- + +# Pattern capture token filter + +The `pattern_capture` token filter is a powerful filter that uses regular expressions to capture and extract parts of text according to specific patterns. This filter can be useful when you want to extract particular parts of tokens, such as email domains, hashtags, or numbers, and reuse them for further analysis or indexing. + +## Parameters + +The `pattern_capture` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`patterns` | Required | Array of strings | An array of regular expressions used to capture parts of text. +`preserve_original` | Required | Boolean| Whether to keep the original token in the output. Default is `true`. 
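As a quick illustration of how capture groups turn into tokens, the following hypothetical `_analyze` request (the pattern and sample text are illustrative) defines the filter inline and captures the word following a `#` character, so that `#opensearch` also produces the token `opensearch`:

```json
GET /_analyze
{
  "tokenizer": "whitespace",
  "filter": [
    {
      "type": "pattern_capture",
      "preserve_original": true,
      "patterns": ["#(\\w+)"]
    }
  ],
  "text": "try #opensearch today"
}
```
{% include copy-curl.html %}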
+ + +## Example + +The following example request creates a new index named `email_index` and configures an analyzer with a `pattern_capture` filter to extract the local part and domain name from an email address: + +```json +PUT /email_index +{ + "settings": { + "analysis": { + "filter": { + "email_pattern_capture": { + "type": "pattern_capture", + "preserve_original": true, + "patterns": [ + "^([^@]+)", + "@(.+)$" + ] + } + }, + "analyzer": { + "email_analyzer": { + "tokenizer": "uax_url_email", + "filter": [ + "email_pattern_capture", + "lowercase" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /email_index/_analyze +{ + "text": "john.doe@example.com", + "analyzer": "email_analyzer" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "john.doe@example.com", + "start_offset": 0, + "end_offset": 20, + "type": "", + "position": 0 + }, + { + "token": "john.doe", + "start_offset": 0, + "end_offset": 20, + "type": "", + "position": 0 + }, + { + "token": "example.com", + "start_offset": 0, + "end_offset": 20, + "type": "", + "position": 0 + } + ] +} +``` diff --git a/_analyzers/token-filters/pattern-replace.md b/_analyzers/token-filters/pattern-replace.md new file mode 100644 index 00000000000..73ef7fa7d84 --- /dev/null +++ b/_analyzers/token-filters/pattern-replace.md @@ -0,0 +1,116 @@ +--- +layout: default +title: Pattern replace +parent: Token filters +nav_order: 320 +--- + +# Pattern replace token filter + +The `pattern_replace` token filter allows you to modify tokens using regular expressions. This filter replaces patterns in tokens with the specified values, giving you flexibility in transforming or normalizing tokens before indexing them. It's particularly useful when you need to clean or standardize text during analysis. + +## Parameters + +The `pattern_replace` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`pattern` | Required | String | A regular expression pattern that matches the text that needs to be replaced. +`all` | Optional | Boolean | Whether to replace all pattern matches. If `false`, only the first match is replaced. Default is `true`. +`replacement` | Optional | String | A string with which to replace the matched pattern. Default is an empty string. 
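To see the effect of the `all` parameter, you can define the filter inline in a hypothetical `_analyze` request (the pattern and sample text are illustrative). With `all` set to `false`, only the first hyphen in `one-two-three` is replaced, producing `one two-three`:

```json
GET /_analyze
{
  "tokenizer": "keyword",
  "filter": [
    {
      "type": "pattern_replace",
      "pattern": "-",
      "replacement": " ",
      "all": false
    }
  ],
  "text": "one-two-three"
}
```
{% include copy-curl.html %}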
+ + +## Example + +The following example request creates a new index named `text_index` and configures an analyzer with a `pattern_replace` filter to replace tokens containing digits with the string `[NUM]`: + +```json +PUT /text_index +{ + "settings": { + "analysis": { + "filter": { + "number_replace_filter": { + "type": "pattern_replace", + "pattern": "\\d+", + "replacement": "[NUM]" + } + }, + "analyzer": { + "number_analyzer": { + "tokenizer": "standard", + "filter": [ + "lowercase", + "number_replace_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /text_index/_analyze +{ + "text": "Visit us at 98765 Example St.", + "analyzer": "number_analyzer" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "visit", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "us", + "start_offset": 6, + "end_offset": 8, + "type": "", + "position": 1 + }, + { + "token": "at", + "start_offset": 9, + "end_offset": 11, + "type": "", + "position": 2 + }, + { + "token": "[NUM]", + "start_offset": 12, + "end_offset": 17, + "type": "", + "position": 3 + }, + { + "token": "example", + "start_offset": 18, + "end_offset": 25, + "type": "", + "position": 4 + }, + { + "token": "st", + "start_offset": 26, + "end_offset": 28, + "type": "", + "position": 5 + } + ] +} +``` diff --git a/_analyzers/token-filters/phonetic.md b/_analyzers/token-filters/phonetic.md new file mode 100644 index 00000000000..7fe380851f8 --- /dev/null +++ b/_analyzers/token-filters/phonetic.md @@ -0,0 +1,98 @@ +--- +layout: default +title: Phonetic +parent: Token filters +nav_order: 330 +--- + +# Phonetic token filter + +The `phonetic` token filter transforms tokens into their phonetic representations, enabling more flexible matching of words that sound similar but are spelled differently. This is particularly useful for searching names, brands, or other entities that users might spell differently but pronounce similarly. + +The `phonetic` token filter is not included in OpenSearch distributions by default. To use this token filter, you must first install the `analysis-phonetic` plugin as follows and then restart OpenSearch: + +```bash +./bin/opensearch-plugin install analysis-phonetic +``` +{% include copy.html %} + +For more information about installing plugins, see [Installing plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/). +{: .note} + +## Parameters + +The `phonetic` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`encoder` | Optional | String | Specifies the phonetic algorithm to use.

Valid values are:
- `metaphone` (default)
- `double_metaphone`
- `soundex`
- `refined_soundex`
- `caverphone1`
- `caverphone2`
- `cologne`
- `nysiis`
- `koelnerphonetik`
- `haasephonetik`
- `beider_morse`
- `daitch_mokotoff ` +`replace` | Optional | Boolean | Whether to replace the original token. If `false`, the original token is included in the output along with the phonetic encoding. Default is `true`. + + +## Example + +The following example request creates a new index named `names_index` and configures an analyzer with a `phonetic` filter: + +```json +PUT /names_index +{ + "settings": { + "analysis": { + "filter": { + "my_phonetic_filter": { + "type": "phonetic", + "encoder": "double_metaphone", + "replace": true + } + }, + "analyzer": { + "phonetic_analyzer": { + "tokenizer": "standard", + "filter": [ + "my_phonetic_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated for the names `Stephen` and `Steven` using the analyzer: + +```json +POST /names_index/_analyze +{ + "text": "Stephen", + "analyzer": "phonetic_analyzer" +} +``` +{% include copy-curl.html %} + +```json +POST /names_index/_analyze +{ + "text": "Steven", + "analyzer": "phonetic_analyzer" +} +``` +{% include copy-curl.html %} + +In both cases, the response contains the same generated token: + +```json +{ + "tokens": [ + { + "token": "STFN", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + } + ] +} +``` diff --git a/_analyzers/token-filters/porter-stem.md b/_analyzers/token-filters/porter-stem.md new file mode 100644 index 00000000000..fa2f4208a74 --- /dev/null +++ b/_analyzers/token-filters/porter-stem.md @@ -0,0 +1,83 @@ +--- +layout: default +title: Porter stem +parent: Token filters +nav_order: 340 +--- + +# Porter stem token filter + +The `porter_stem` token filter reduces words to their base (or _stem_) form and removes common suffixes from words, which helps in matching similar words by their root. For example, the word `running` is stemmed to `run`. This token filter is primarily used for the English language and provides stemming based on the [Porter stemming algorithm](https://snowballstem.org/algorithms/porter/stemmer.html). 
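Because the `porter_stem` filter takes no parameters, you can also reference it by name directly in an analyzer definition instead of creating a custom filter first, as in the following minimal sketch (the index and analyzer names are illustrative):

```json
PUT /porter_direct_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "porter_direct_analyzer": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "porter_stem"
          ]
        }
      }
    }
  }
}
```
{% include copy-curl.html %}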
+ + +## Example + +The following example request creates a new index named `my_stem_index` and configures an analyzer with a `porter_stem` filter: + +```json +PUT /my_stem_index +{ + "settings": { + "analysis": { + "filter": { + "my_porter_stem": { + "type": "porter_stem" + } + }, + "analyzer": { + "porter_analyzer": { + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_porter_stem" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_stem_index/_analyze +{ + "text": "running runners ran", + "analyzer": "porter_analyzer" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "run", + "start_offset": 0, + "end_offset": 7, + "type": "", + "position": 0 + }, + { + "token": "runner", + "start_offset": 8, + "end_offset": 15, + "type": "", + "position": 1 + }, + { + "token": "ran", + "start_offset": 16, + "end_offset": 19, + "type": "", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/predicate-token-filter.md b/_analyzers/token-filters/predicate-token-filter.md new file mode 100644 index 00000000000..24729f02242 --- /dev/null +++ b/_analyzers/token-filters/predicate-token-filter.md @@ -0,0 +1,82 @@ +--- +layout: default +title: Predicate token filter +parent: Token filters +nav_order: 340 +--- + +# Predicate token filter + +The `predicate_token_filter` evaluates whether tokens should be kept or discarded, depending on the conditions defined in a custom script. The tokens are evaluated in the analysis predicate context. This filter supports only inline Painless scripts. + +## Parameters + +The `predicate_token_filter` has one required parameter: `script`. This parameter provides a condition that is used to evaluate whether the token should be kept. + +## Example + +The following example request creates a new index named `predicate_index` and configures an analyzer with a `predicate_token_filter`. 
The filter specifies to only output tokens if they are longer than 7 characters: + +```json +PUT /predicate_index +{ + "settings": { + "analysis": { + "filter": { + "my_predicate_filter": { + "type": "predicate_token_filter", + "script": { + "source": "token.term.length() > 7" + } + } + }, + "analyzer": { + "predicate_analyzer": { + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_predicate_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /predicate_index/_analyze +{ + "text": "The OpenSearch community is growing rapidly", + "analyzer": "predicate_analyzer" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "opensearch", + "start_offset": 4, + "end_offset": 14, + "type": "", + "position": 1 + }, + { + "token": "community", + "start_offset": 15, + "end_offset": 24, + "type": "", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/remove-duplicates.md b/_analyzers/token-filters/remove-duplicates.md new file mode 100644 index 00000000000..b0a589884ae --- /dev/null +++ b/_analyzers/token-filters/remove-duplicates.md @@ -0,0 +1,152 @@ +--- +layout: default +title: Remove duplicates +parent: Token filters +nav_order: 350 +--- + +# Remove duplicates token filter + +The `remove_duplicates` token filter is used to remove duplicate tokens that are generated in the same position during analysis. + +## Example + +The following example request creates an index with a `keyword_repeat` token filter. The filter adds a `keyword` version of each token in the same position as the token itself and then uses a `kstem` to create a stemmed version of the token: + +```json +PUT /example-index +{ + "settings": { + "analysis": { + "analyzer": { + "custom_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "keyword_repeat", + "kstem" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +Use the following request to analyze the string `Slower turtle`: + +```json +GET /example-index/_analyze +{ + "analyzer": "custom_analyzer", + "text": "Slower turtle" +} +``` +{% include copy-curl.html %} + +The response contains the token `turtle` twice in the same position: + +```json +{ + "tokens": [ + { + "token": "slower", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "slow", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "turtle", + "start_offset": 7, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "turtle", + "start_offset": 7, + "end_offset": 13, + "type": "", + "position": 1 + } + ] +} +``` + +The duplicate token can be removed by adding a `remove_duplicates` token filter to the index settings: + +```json +PUT /index-remove-duplicate +{ + "settings": { + "analysis": { + "analyzer": { + "custom_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "keyword_repeat", + "kstem", + "remove_duplicates" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /index-remove-duplicate/_analyze +{ + "analyzer": "custom_analyzer", + "text": "Slower turtle" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": 
"slower", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "slow", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "turtle", + "start_offset": 7, + "end_offset": 13, + "type": "", + "position": 1 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/reverse.md b/_analyzers/token-filters/reverse.md new file mode 100644 index 00000000000..dc48f07e770 --- /dev/null +++ b/_analyzers/token-filters/reverse.md @@ -0,0 +1,86 @@ +--- +layout: default +title: Reverse +parent: Token filters +nav_order: 360 +--- + +# Reverse token filter + +The `reverse` token filter reverses the order of the characters in each token, making suffix information accessible at the beginning of the reversed tokens during analysis. + +This is useful for suffix-based searches: + +The `reverse` token filter is useful when you need to perform suffix-based searches, such as in the following scenarios: + +- **Suffix matching**: Searching for words based on their suffixes, such as identifying words with a specific ending (for example, `-tion` or `-ing`). +- **File extension searches**: Searching for files by their extensions, such as `.txt` or `.jpg`. +- **Custom sorting or ranking**: By reversing tokens, you can implement unique sorting or ranking logic based on suffixes. +- **Autocomplete for suffixes**: Implementing autocomplete suggestions that use suffixes rather than prefixes. + + +## Example + +The following example request creates a new index named `my-reverse-index` and configures an analyzer with a `reverse` filter: + +```json +PUT /my-reverse-index +{ + "settings": { + "analysis": { + "filter": { + "reverse_filter": { + "type": "reverse" + } + }, + "analyzer": { + "my_reverse_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "reverse_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-reverse-index/_analyze +{ + "analyzer": "my_reverse_analyzer", + "text": "hello world" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "olleh", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "dlrow", + "start_offset": 6, + "end_offset": 11, + "type": "", + "position": 1 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/shingle.md b/_analyzers/token-filters/shingle.md new file mode 100644 index 00000000000..ea961bf3e0c --- /dev/null +++ b/_analyzers/token-filters/shingle.md @@ -0,0 +1,120 @@ +--- +layout: default +title: Shingle +parent: Token filters +nav_order: 370 +--- + +# Shingle token filter + +The `shingle` token filter is used to generate word n-grams, or _shingles_, from input text. For example, for the string `slow green turtle`, the `shingle` filter creates the following one- and two-word shingles: `slow`, `slow green`, `green`, `green turtle`, and `turtle`. + +This token filter is often used in conjunction with other filters to enhance search accuracy by indexing phrases rather than individual tokens. For more information, see [Phrase suggester]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/did-you-mean/#phrase-suggester). + +## Parameters + +The `shingle` token filter can be configured with the following parameters. 
+ +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`min_shingle_size` | Optional | Integer | The minimum number of tokens to concatenate. Default is `2`. +`max_shingle_size` | Optional | Integer | The maximum number of tokens to concatenate. Default is `2`. +`output_unigrams` | Optional | Boolean | Whether to include unigrams (individual tokens) as output. Default is `true`. +`output_unigrams_if_no_shingles` | Optional | Boolean | Whether to output unigrams if no shingles are generated. Default is `false`. +`token_separator` | Optional | String | A separator used to concatenate tokens into a shingle. Default is a space (`" "`). +`filler_token` | Optional | String | A token inserted into empty positions or gaps between tokens. Default is an underscore (`_`). + +If `output_unigrams` and `output_unigrams_if_no_shingles` are both set to `true`, `output_unigrams_if_no_shingles` is ignored. +{: .note} + +## Example + +The following example request creates a new index named `my-shingle-index` and configures an analyzer with a `shingle` filter: + +```json +PUT /my-shingle-index +{ + "settings": { + "analysis": { + "filter": { + "my_shingle_filter": { + "type": "shingle", + "min_shingle_size": 2, + "max_shingle_size": 2, + "output_unigrams": true + } + }, + "analyzer": { + "my_shingle_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_shingle_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-shingle-index/_analyze +{ + "analyzer": "my_shingle_analyzer", + "text": "slow green turtle" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "slow", + "start_offset": 0, + "end_offset": 4, + "type": "", + "position": 0 + }, + { + "token": "slow green", + "start_offset": 0, + "end_offset": 10, + "type": "shingle", + "position": 0, + "positionLength": 2 + }, + { + "token": "green", + "start_offset": 5, + "end_offset": 10, + "type": "", + "position": 1 + }, + { + "token": "green turtle", + "start_offset": 5, + "end_offset": 17, + "type": "shingle", + "position": 1, + "positionLength": 2 + }, + { + "token": "turtle", + "start_offset": 11, + "end_offset": 17, + "type": "", + "position": 2 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/snowball.md b/_analyzers/token-filters/snowball.md new file mode 100644 index 00000000000..149486e727e --- /dev/null +++ b/_analyzers/token-filters/snowball.md @@ -0,0 +1,108 @@ +--- +layout: default +title: Snowball +parent: Token filters +nav_order: 380 +--- + +# Snowball token filter + +The `snowball` token filter is a stemming filter based on the [Snowball](https://snowballstem.org/) algorithm. It supports many languages and is more efficient and accurate than the Porter stemming algorithm. 
+ +## Parameters + +The `snowball` token filter can be configured with a `language` parameter that accepts the following values: + +- `Arabic` +- `Armenian` +- `Basque` +- `Catalan` +- `Danish` +- `Dutch` +- `English` (default) +- `Estonian` +- `Finnish` +- `French` +- `German` +- `German2` +- `Hungarian` +- `Italian` +- `Irish` +- `Kp` +- `Lithuanian` +- `Lovins` +- `Norwegian` +- `Porter` +- `Portuguese` +- `Romanian` +- `Russian` +- `Spanish` +- `Swedish` +- `Turkish` + +## Example + +The following example request creates a new index named `my-snowball-index` and configures an analyzer with a `snowball` filter: + +```json +PUT /my-snowball-index +{ + "settings": { + "analysis": { + "filter": { + "my_snowball_filter": { + "type": "snowball", + "language": "English" + } + }, + "analyzer": { + "my_snowball_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_snowball_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-snowball-index/_analyze +{ + "analyzer": "my_snowball_analyzer", + "text": "running runners" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "run", + "start_offset": 0, + "end_offset": 7, + "type": "", + "position": 0 + }, + { + "token": "runner", + "start_offset": 8, + "end_offset": 15, + "type": "", + "position": 1 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/stemmer-override.md b/_analyzers/token-filters/stemmer-override.md new file mode 100644 index 00000000000..c06f673714c --- /dev/null +++ b/_analyzers/token-filters/stemmer-override.md @@ -0,0 +1,139 @@ +--- +layout: default +title: Stemmer override +parent: Token filters +nav_order: 400 +--- + +# Stemmer override token filter + +The `stemmer_override` token filter allows you to define custom stemming rules that override the behavior of default stemmers like Porter or Snowball. This can be useful when you want to apply specific stemming behavior to certain words that might not be modified correctly by the standard stemming algorithms. + +## Parameters + +The `stemmer_override` token filter must be configured with exactly one of the following parameters. + +Parameter | Data type | Description +:--- | :--- | :--- +`rules` | String | Defines the override rules directly in the settings. +`rules_path` | String | Specifies the path to the file containing custom rules (mappings). The path can be either an absolute path or a path relative to the config directory. 
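If you use `rules_path`, the referenced file contains one rule per line in the same `source => target` syntax as inline rules (for example, a line such as `running, runner => run`). The following sketch (the file path, index, analyzer, and filter names are illustrative) references such a file stored under the config directory:

```json
PUT /file_override_example
{
  "settings": {
    "analysis": {
      "filter": {
        "file_stemmer_override": {
          "type": "stemmer_override",
          "rules_path": "analysis/stemmer_override_rules.txt"
        }
      },
      "analyzer": {
        "file_override_analyzer": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "file_stemmer_override"
          ]
        }
      }
    }
  }
}
```
{% include copy-curl.html %}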
+ +## Example + +The following example request creates a new index named `my-index` and configures an analyzer with a `stemmer_override` filter: + +```json +PUT /my-index +{ + "settings": { + "analysis": { + "filter": { + "my_stemmer_override_filter": { + "type": "stemmer_override", + "rules": [ + "running, runner => run", + "bought => buy", + "best => good" + ] + } + }, + "analyzer": { + "my_custom_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_stemmer_override_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-index/_analyze +{ + "analyzer": "my_custom_analyzer", + "text": "I am a runner and bought the best shoes" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "i", + "start_offset": 0, + "end_offset": 1, + "type": "", + "position": 0 + }, + { + "token": "am", + "start_offset": 2, + "end_offset": 4, + "type": "", + "position": 1 + }, + { + "token": "a", + "start_offset": 5, + "end_offset": 6, + "type": "", + "position": 2 + }, + { + "token": "run", + "start_offset": 7, + "end_offset": 13, + "type": "", + "position": 3 + }, + { + "token": "and", + "start_offset": 14, + "end_offset": 17, + "type": "", + "position": 4 + }, + { + "token": "buy", + "start_offset": 18, + "end_offset": 24, + "type": "", + "position": 5 + }, + { + "token": "the", + "start_offset": 25, + "end_offset": 28, + "type": "", + "position": 6 + }, + { + "token": "good", + "start_offset": 29, + "end_offset": 33, + "type": "", + "position": 7 + }, + { + "token": "shoes", + "start_offset": 34, + "end_offset": 39, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/stemmer.md b/_analyzers/token-filters/stemmer.md new file mode 100644 index 00000000000..dd1344fcbc6 --- /dev/null +++ b/_analyzers/token-filters/stemmer.md @@ -0,0 +1,118 @@ +--- +layout: default +title: Stemmer +parent: Token filters +nav_order: 390 +--- + +# Stemmer token filter + +The `stemmer` token filter reduces words to their root or base form (also known as their _stem_). 
+ +## Parameters + +The `stemmer` token filter can be configured with a `language` parameter that accepts the following values: + +- Arabic: `arabic` +- Armenian: `armenian` +- Basque: `basque` +- Bengali: `bengali` +- Brazilian Portuguese: `brazilian` +- Bulgarian: `bulgarian` +- Catalan: `catalan` +- Czech: `czech` +- Danish: `danish` +- Dutch: `dutch, dutch_kp` +- English: `english` (default), `light_english`, `lovins`, `minimal_english`, `porter2`, `possessive_english` +- Estonian: `estonian` +- Finnish: `finnish`, `light_finnish` +- French: `light_french`, `french`, `minimal_french` +- Galician: `galician`, `minimal_galician` (plural step only) +- German: `light_german`, `german`, `german2`, `minimal_german` +- Greek: `greek` +- Hindi: `hindi` +- Hungarian: `hungarian, light_hungarian` +- Indonesian: `indonesian` +- Irish: `irish` +- Italian: `light_italian, italian` +- Kurdish (Sorani): `sorani` +- Latvian: `latvian` +- Lithuanian: `lithuanian` +- Norwegian (Bokmål): `norwegian`, `light_norwegian`, `minimal_norwegian` +- Norwegian (Nynorsk): `light_nynorsk`, `minimal_nynorsk` +- Portuguese: `light_portuguese`, `minimal_portuguese`, `portuguese`, `portuguese_rslp` +- Romanian: `romanian` +- Russian: `russian`, `light_russian` +- Spanish: `light_spanish`, `spanish` +- Swedish: `swedish`, `light_swedish` +- Turkish: `turkish` + +You can also use the `name` parameter as an alias for the `language` parameter. If both are set, the `name` parameter is ignored. +{: .note} + +## Example + +The following example request creates a new index named `my-stemmer-index` and configures an analyzer with a `stemmer` filter: + +```json +PUT /my-stemmer-index +{ + "settings": { + "analysis": { + "filter": { + "my_english_stemmer": { + "type": "stemmer", + "language": "english" + } + }, + "analyzer": { + "my_stemmer_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_english_stemmer" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-stemmer-index/_analyze +{ + "analyzer": "my_stemmer_analyzer", + "text": "running runs" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "run", + "start_offset": 0, + "end_offset": 7, + "type": "", + "position": 0 + }, + { + "token": "run", + "start_offset": 8, + "end_offset": 12, + "type": "", + "position": 1 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/stop.md b/_analyzers/token-filters/stop.md new file mode 100644 index 00000000000..8f3e01b72da --- /dev/null +++ b/_analyzers/token-filters/stop.md @@ -0,0 +1,111 @@ +--- +layout: default +title: Stop +parent: Token filters +nav_order: 410 +--- + +# Stop token filter + +The `stop` token filter is used to remove common words (also known as _stopwords_) from a token stream during analysis. Stopwords are typically articles and prepositions, such as `a` or `for`. These words are not significantly meaningful in search queries and are often excluded to improve search efficiency and relevance. + +The default list of English stopwords includes the following words: `a`, `an`, `and`, `are`, `as`, `at`, `be`, `but`, `by`, `for`, `if`, `in`, `into`, `is`, `it`, `no`, `not`, `of`, `on`, `or`, `such`, `that`, `the`, `their`, `then`, `there`, `these`, `they`, `this`, `to`, `was`, `will`, and `with`. 
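You can also provide your own list of stopwords instead of a predefined one. The following minimal sketch (the index, analyzer, and filter names are illustrative) removes a small custom set of words regardless of case; the available parameters are described in the next section:

```json
PUT /custom_stopword_example
{
  "settings": {
    "analysis": {
      "filter": {
        "my_custom_stop_filter": {
          "type": "stop",
          "ignore_case": true,
          "stopwords": [ "the", "and", "or" ]
        }
      },
      "analyzer": {
        "my_custom_stop_analyzer": {
          "tokenizer": "standard",
          "filter": [ "my_custom_stop_filter" ]
        }
      }
    }
  }
}
```
{% include copy-curl.html %}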
+ +## Parameters + +The `stop` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`stopwords` | Optional | String | Specifies either a custom array of stopwords or a language for which to fetch the predefined Lucene stopword list:

- [`_arabic_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt)
- [`_armenian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/hy/stopwords.txt)
- [`_basque_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/eu/stopwords.txt)
- [`_bengali_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt)
- [`_brazilian_` (Brazilian Portuguese)](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/br/stopwords.txt)
- [`_bulgarian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt)
- [`_catalan_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/ca/stopwords.txt)
- [`_cjk_` (Chinese, Japanese, and Korean)](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/cjk/stopwords.txt)
- [`_czech_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/cz/stopwords.txt)
- [`_danish_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/danish_stop.txt)
- [`_dutch_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/dutch_stop.txt)
- [`_english_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java#L48) (default)
- [`_estonian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/et/stopwords.txt)
- [`_finnish_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/finnish_stop.txt)
- [`_french_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/french_stop.txt)
- [`_galician_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/gl/stopwords.txt)
- [`_german_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/german_stop.txt)
- [`_greek_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/el/stopwords.txt)
- [`_hindi_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt)
- [`_hungarian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/hungarian_stop.txt)
- [`_indonesian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/id/stopwords.txt)
- [`_irish_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/ga/stopwords.txt)
- [`_italian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/italian_stop.txt)
- [`_latvian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/lv/stopwords.txt)
- [`_lithuanian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/lt/stopwords.txt)
- [`_norwegian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/norwegian_stop.txt)
- [`_persian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt)
- [`_portuguese_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/portuguese_stop.txt)
- [`_romanian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt)
- [`_russian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/russian_stop.txt)
- [`_sorani_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/ckb/stopwords.txt)
- [`_spanish_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/spanish_stop.txt)
- [`_swedish_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/swedish_stop.txt)
- [`_thai_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/th/stopwords.txt)
- [`_turkish_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/tr/stopwords.txt) +`stopwords_path` | Optional | String | Specifies the file path (absolute or relative to the config directory) of the file containing custom stopwords. +`ignore_case` | Optional | Boolean | If `true`, stopwords will be matched regardless of their case. Default is `false`. +`remove_trailing` | Optional | Boolean | If `true`, trailing stopwords will be removed during analysis. Default is `true`. + +## Example + +The following example request creates a new index named `my-stopword-index` and configures an analyzer with a `stop` filter that uses the predefined stopword list for the English language: + +```json +PUT /my-stopword-index +{ + "settings": { + "analysis": { + "filter": { + "my_stop_filter": { + "type": "stop", + "stopwords": "_english_" + } + }, + "analyzer": { + "my_stop_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_stop_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-stopword-index/_analyze +{ + "analyzer": "my_stop_analyzer", + "text": "A quick dog jumps over the turtle" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "quick", + "start_offset": 2, + "end_offset": 7, + "type": "", + "position": 1 + }, + { + "token": "dog", + "start_offset": 8, + "end_offset": 11, + "type": "", + "position": 2 + }, + { + "token": "jumps", + "start_offset": 12, + "end_offset": 17, + "type": "", + "position": 3 + }, + { + "token": "over", + "start_offset": 18, + "end_offset": 22, + "type": "", + "position": 4 + }, + { + "token": "turtle", + "start_offset": 27, + "end_offset": 33, + "type": "", + "position": 6 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/synonym-graph.md b/_analyzers/token-filters/synonym-graph.md new file mode 100644 index 00000000000..d8e763d1fc0 --- /dev/null +++ b/_analyzers/token-filters/synonym-graph.md @@ -0,0 +1,180 @@ +--- +layout: default +title: Synonym graph +parent: Token filters +nav_order: 420 +--- + +# Synonym graph token filter + +The `synonym_graph` token filter is a more advanced version of the `synonym` token filter. It supports multiword synonyms and processes synonyms across multiple tokens, making it ideal for phrases or scenarios in which relationships between tokens are important. + +## Parameters + +The `synonym_graph` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`synonyms` | Either `synonyms` or `synonyms_path` must be specified | String | A list of synonym rules defined directly in the configuration. +`synonyms_path` | Either `synonyms` or `synonyms_path` must be specified | String | The file path to a file containing synonym rules (either an absolute path or a path relative to the config directory). +`lenient` | Optional | Boolean | Whether to ignore exceptions when loading the rule configurations. Default is `false`. +`format` | Optional | String | Specifies the format used to determine how OpenSearch defines and interprets synonyms. Valid values are:
- `solr`
- [`wordnet`](https://wordnet.princeton.edu/)
Default is `solr`. +`expand` | Optional | Boolean | Whether to expand equivalent synonym rules. Default is `true`.

For example:
If `synonyms` are defined as `"quick, fast"` and `expand` is set to `true`, then the synonym rules are configured as follows:
- `quick => quick`
- `quick => fast`
- `fast => quick`
- `fast => fast`

If `expand` is set to `false`, the synonym rules are configured as follows:
- `quick => quick`
- `fast => quick` + +## Example: Solr format + +The following example request creates a new index named `my-index` and configures an analyzer with a `synonym_graph` filter. The filter is configured with the default `solr` rule format: + +```json +PUT /my-index +{ + "settings": { + "analysis": { + "filter": { + "my_synonym_graph_filter": { + "type": "synonym_graph", + "synonyms": [ + "sports car, race car", + "fast car, speedy vehicle", + "luxury car, premium vehicle", + "electric car, EV" + ] + } + }, + "analyzer": { + "my_synonym_graph_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_synonym_graph_filter" + ] + } + } + } + } +} + +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-car-index/_analyze +{ + "analyzer": "my_synonym_graph_analyzer", + "text": "I just bought a sports car and it is a fast car." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "i","start_offset": 0,"end_offset": 1,"type": "","position": 0}, + {"token": "just","start_offset": 2,"end_offset": 6,"type": "","position": 1}, + {"token": "bought","start_offset": 7,"end_offset": 13,"type": "","position": 2}, + {"token": "a","start_offset": 14,"end_offset": 15,"type": "","position": 3}, + {"token": "race","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 4}, + {"token": "sports","start_offset": 16,"end_offset": 22,"type": "","position": 4,"positionLength": 2}, + {"token": "car","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 5,"positionLength": 2}, + {"token": "car","start_offset": 23,"end_offset": 26,"type": "","position": 6}, + {"token": "and","start_offset": 27,"end_offset": 30,"type": "","position": 7}, + {"token": "it","start_offset": 31,"end_offset": 33,"type": "","position": 8}, + {"token": "is","start_offset": 34,"end_offset": 36,"type": "","position": 9}, + {"token": "a","start_offset": 37,"end_offset": 38,"type": "","position": 10}, + {"token": "speedy","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 11}, + {"token": "fast","start_offset": 39,"end_offset": 43,"type": "","position": 11,"positionLength": 2}, + {"token": "vehicle","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 12,"positionLength": 2}, + {"token": "car","start_offset": 44,"end_offset": 47,"type": "","position": 13} + ] +} +``` + +## Example: WordNet format + +The following example request creates a new index named `my-wordnet-index` and configures an analyzer with a `synonym_graph` filter. The filter is configured with the [`wordnet`](https://wordnet.princeton.edu/) rule format: + +```json +PUT /my-wordnet-index +{ + "settings": { + "analysis": { + "filter": { + "my_synonym_graph_filter": { + "type": "synonym_graph", + "format": "wordnet", + "synonyms": [ + "s(100000001, 1, 'sports car', n, 1, 0).", + "s(100000001, 2, 'race car', n, 1, 0).", + "s(100000001, 3, 'fast car', n, 1, 0).", + "s(100000001, 4, 'speedy vehicle', n, 1, 0)." 
+ ] + } + }, + "analyzer": { + "my_synonym_graph_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_synonym_graph_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-wordnet-index/_analyze +{ + "analyzer": "my_synonym_graph_analyzer", + "text": "I just bought a sports car and it is a fast car." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "i","start_offset": 0,"end_offset": 1,"type": "","position": 0}, + {"token": "just","start_offset": 2,"end_offset": 6,"type": "","position": 1}, + {"token": "bought","start_offset": 7,"end_offset": 13,"type": "","position": 2}, + {"token": "a","start_offset": 14,"end_offset": 15,"type": "","position": 3}, + {"token": "race","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 4}, + {"token": "fast","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 4,"positionLength": 2}, + {"token": "speedy","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 4,"positionLength": 3}, + {"token": "sports","start_offset": 16,"end_offset": 22,"type": "","position": 4,"positionLength": 4}, + {"token": "car","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 5,"positionLength": 4}, + {"token": "car","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 6,"positionLength": 3}, + {"token": "vehicle","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 7,"positionLength": 2}, + {"token": "car","start_offset": 23,"end_offset": 26,"type": "","position": 8}, + {"token": "and","start_offset": 27,"end_offset": 30,"type": "","position": 9}, + {"token": "it","start_offset": 31,"end_offset": 33,"type": "","position": 10}, + {"token": "is","start_offset": 34,"end_offset": 36,"type": "","position": 11}, + {"token": "a","start_offset": 37,"end_offset": 38,"type": "","position": 12}, + {"token": "sports","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 13}, + {"token": "race","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 13,"positionLength": 2}, + {"token": "speedy","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 13,"positionLength": 3}, + {"token": "fast","start_offset": 39,"end_offset": 43,"type": "","position": 13,"positionLength": 4}, + {"token": "car","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 14,"positionLength": 4}, + {"token": "car","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 15,"positionLength": 3}, + {"token": "vehicle","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 16,"positionLength": 2}, + {"token": "car","start_offset": 44,"end_offset": 47,"type": "","position": 17} + ] +} +``` diff --git a/_analyzers/token-filters/synonym.md b/_analyzers/token-filters/synonym.md new file mode 100644 index 00000000000..a1dfff845df --- /dev/null +++ b/_analyzers/token-filters/synonym.md @@ -0,0 +1,277 @@ +--- +layout: default +title: Synonym +parent: Token filters +nav_order: 415 +--- + +# Synonym token filter + +The `synonym` token filter allows you to map multiple terms to a single term or create equivalence groups between words, improving search flexibility. + +## Parameters + +The `synonym` token filter can be configured with the following parameters. 
+ +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`synonyms` | Either `synonyms` or `synonyms_path` must be specified | String | A list of synonym rules defined directly in the configuration. +`synonyms_path` | Either `synonyms` or `synonyms_path` must be specified | String | The file path to a file containing synonym rules (either an absolute path or a path relative to the config directory). +`lenient` | Optional | Boolean | Whether to ignore exceptions when loading the rule configurations. Default is `false`. +`format` | Optional | String | Specifies the format used to determine how OpenSearch defines and interprets synonyms. Valid values are:
- `solr`
- [`wordnet`](https://wordnet.princeton.edu/)
Default is `solr`. +`expand` | Optional | Boolean | Whether to expand equivalent synonym rules. Default is `true`.

For example:
If `synonyms` are defined as `"quick, fast"` and `expand` is set to `true`, then the synonym rules are configured as follows:
- `quick => quick`
- `quick => fast`
- `fast => quick`
- `fast => fast`

If `expand` is set to `false`, the synonym rules are configured as follows:
- `quick => quick`
- `fast => quick` + +## Example: Solr format + +The following example request creates a new index named `my-synonym-index` and configures an analyzer with a `synonym` filter. The filter is configured with the default `solr` rule format: + +```json +PUT /my-synonym-index +{ + "settings": { + "analysis": { + "filter": { + "my_synonym_filter": { + "type": "synonym", + "synonyms": [ + "car, automobile", + "quick, fast, speedy", + "laptop => computer" + ] + } + }, + "analyzer": { + "my_synonym_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_synonym_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-synonym-index/_analyze +{ + "analyzer": "my_synonym_analyzer", + "text": "The quick dog jumps into the car with a laptop" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "the", + "start_offset": 0, + "end_offset": 3, + "type": "", + "position": 0 + }, + { + "token": "quick", + "start_offset": 4, + "end_offset": 9, + "type": "", + "position": 1 + }, + { + "token": "fast", + "start_offset": 4, + "end_offset": 9, + "type": "SYNONYM", + "position": 1 + }, + { + "token": "speedy", + "start_offset": 4, + "end_offset": 9, + "type": "SYNONYM", + "position": 1 + }, + { + "token": "dog", + "start_offset": 10, + "end_offset": 13, + "type": "", + "position": 2 + }, + { + "token": "jumps", + "start_offset": 14, + "end_offset": 19, + "type": "", + "position": 3 + }, + { + "token": "into", + "start_offset": 20, + "end_offset": 24, + "type": "", + "position": 4 + }, + { + "token": "the", + "start_offset": 25, + "end_offset": 28, + "type": "", + "position": 5 + }, + { + "token": "car", + "start_offset": 29, + "end_offset": 32, + "type": "", + "position": 6 + }, + { + "token": "automobile", + "start_offset": 29, + "end_offset": 32, + "type": "SYNONYM", + "position": 6 + }, + { + "token": "with", + "start_offset": 33, + "end_offset": 37, + "type": "", + "position": 7 + }, + { + "token": "a", + "start_offset": 38, + "end_offset": 39, + "type": "", + "position": 8 + }, + { + "token": "computer", + "start_offset": 40, + "end_offset": 46, + "type": "SYNONYM", + "position": 9 + } + ] +} +``` + +## Example: WordNet format + +The following example request creates a new index named `my-wordnet-index` and configures an analyzer with a `synonym` filter. The filter is configured with the [`wordnet`](https://wordnet.princeton.edu/) rule format: + +```json +PUT /my-wordnet-index +{ + "settings": { + "analysis": { + "filter": { + "my_wordnet_synonym_filter": { + "type": "synonym", + "format": "wordnet", + "synonyms": [ + "s(100000001,1,'fast',v,1,0).", + "s(100000001,2,'quick',v,1,0).", + "s(100000001,3,'swift',v,1,0)." 
+ ] + } + }, + "analyzer": { + "my_wordnet_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_wordnet_synonym_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-wordnet-index/_analyze +{ + "analyzer": "my_wordnet_analyzer", + "text": "I have a fast car" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "i", + "start_offset": 0, + "end_offset": 1, + "type": "", + "position": 0 + }, + { + "token": "have", + "start_offset": 2, + "end_offset": 6, + "type": "", + "position": 1 + }, + { + "token": "a", + "start_offset": 7, + "end_offset": 8, + "type": "", + "position": 2 + }, + { + "token": "fast", + "start_offset": 9, + "end_offset": 13, + "type": "", + "position": 3 + }, + { + "token": "quick", + "start_offset": 9, + "end_offset": 13, + "type": "SYNONYM", + "position": 3 + }, + { + "token": "swift", + "start_offset": 9, + "end_offset": 13, + "type": "SYNONYM", + "position": 3 + }, + { + "token": "car", + "start_offset": 14, + "end_offset": 17, + "type": "", + "position": 4 + } + ] +} +``` diff --git a/_analyzers/token-filters/trim.md b/_analyzers/token-filters/trim.md new file mode 100644 index 00000000000..cdfebed52ff --- /dev/null +++ b/_analyzers/token-filters/trim.md @@ -0,0 +1,93 @@ +--- +layout: default +title: Trim +parent: Token filters +nav_order: 430 +--- + +# Trim token filter + +The `trim` token filter removes leading and trailing white space characters from tokens. + +Many popular tokenizers, such as `standard`, `keyword`, and `whitespace` tokenizers, automatically strip leading and trailing white space characters during tokenization. When using these tokenizers, there is no need to configure an additional `trim` token filter. 
+{: .note} + + +## Example + +The following example request creates a new index named `my_pattern_trim_index` and configures an analyzer with a `trim` filter and a `pattern` tokenizer, which does not remove leading and trailing white space characters: + +```json +PUT /my_pattern_trim_index +{ + "settings": { + "analysis": { + "filter": { + "my_trim_filter": { + "type": "trim" + } + }, + "tokenizer": { + "my_pattern_tokenizer": { + "type": "pattern", + "pattern": "," + } + }, + "analyzer": { + "my_pattern_trim_analyzer": { + "type": "custom", + "tokenizer": "my_pattern_tokenizer", + "filter": [ + "lowercase", + "my_trim_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my_pattern_trim_index/_analyze +{ + "analyzer": "my_pattern_trim_analyzer", + "text": " OpenSearch , is , powerful " +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "opensearch", + "start_offset": 0, + "end_offset": 12, + "type": "word", + "position": 0 + }, + { + "token": "is", + "start_offset": 13, + "end_offset": 18, + "type": "word", + "position": 1 + }, + { + "token": "powerful", + "start_offset": 19, + "end_offset": 32, + "type": "word", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/truncate.md b/_analyzers/token-filters/truncate.md new file mode 100644 index 00000000000..16d1452901f --- /dev/null +++ b/_analyzers/token-filters/truncate.md @@ -0,0 +1,107 @@ +--- +layout: default +title: Truncate +parent: Token filters +nav_order: 440 +--- + +# Truncate token filter + +The `truncate` token filter is used to shorten tokens exceeding a specified length. It trims tokens to a maximum number of characters, ensuring that tokens exceeding this limit are truncated. + +## Parameters + +The `truncate` token filter can be configured with the following parameter. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`length` | Optional | Integer | Specifies the maximum length of the generated token. Default is `10`. 
+ +## Example + +The following example request creates a new index named `truncate_example` and configures an analyzer with a `truncate` filter: + +```json +PUT /truncate_example +{ + "settings": { + "analysis": { + "filter": { + "truncate_filter": { + "type": "truncate", + "length": 5 + } + }, + "analyzer": { + "truncate_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "truncate_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /truncate_example/_analyze +{ + "analyzer": "truncate_analyzer", + "text": "OpenSearch is powerful and scalable" +} + +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "opens", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0 + }, + { + "token": "is", + "start_offset": 11, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "power", + "start_offset": 14, + "end_offset": 22, + "type": "", + "position": 2 + }, + { + "token": "and", + "start_offset": 23, + "end_offset": 26, + "type": "", + "position": 3 + }, + { + "token": "scala", + "start_offset": 27, + "end_offset": 35, + "type": "", + "position": 4 + } + ] +} +``` diff --git a/_analyzers/token-filters/unique.md b/_analyzers/token-filters/unique.md new file mode 100644 index 00000000000..c4dfcbab162 --- /dev/null +++ b/_analyzers/token-filters/unique.md @@ -0,0 +1,106 @@ +--- +layout: default +title: Unique +parent: Token filters +nav_order: 450 +--- + +# Unique token filter + +The `unique` token filter ensures that only unique tokens are kept during the analysis process, removing duplicate tokens that appear within a single field or text block. + +## Parameters + +The `unique` token filter can be configured with the following parameter. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`only_on_same_position` | Optional | Boolean | If `true`, the token filter acts as a `remove_duplicates` token filter and only removes tokens that are in the same position. Default is `false`. 
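When `only_on_same_position` is set to `true`, the filter removes only tokens stacked at the same position, which is useful with filters such as `keyword_repeat` that intentionally emit duplicates. The following minimal sketch (the index, analyzer, and filter names are illustrative) shows that configuration:

```json
PUT /unique_same_position_example
{
  "settings": {
    "analysis": {
      "filter": {
        "dedupe_same_position": {
          "type": "unique",
          "only_on_same_position": true
        }
      },
      "analyzer": {
        "dedupe_analyzer": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "keyword_repeat",
            "kstem",
            "dedupe_same_position"
          ]
        }
      }
    }
  }
}
```
{% include copy-curl.html %}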
+ +## Example + +The following example request creates a new index named `unique_example` and configures an analyzer with a `unique` filter: + +```json +PUT /unique_example +{ + "settings": { + "analysis": { + "filter": { + "unique_filter": { + "type": "unique", + "only_on_same_position": false + } + }, + "analyzer": { + "unique_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "unique_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /unique_example/_analyze +{ + "analyzer": "unique_analyzer", + "text": "OpenSearch OpenSearch is powerful powerful and scalable" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "opensearch", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0 + }, + { + "token": "is", + "start_offset": 22, + "end_offset": 24, + "type": "", + "position": 1 + }, + { + "token": "powerful", + "start_offset": 25, + "end_offset": 33, + "type": "", + "position": 2 + }, + { + "token": "and", + "start_offset": 43, + "end_offset": 46, + "type": "", + "position": 3 + }, + { + "token": "scalable", + "start_offset": 47, + "end_offset": 55, + "type": "", + "position": 4 + } + ] +} +``` diff --git a/_analyzers/token-filters/uppercase.md b/_analyzers/token-filters/uppercase.md new file mode 100644 index 00000000000..50268924003 --- /dev/null +++ b/_analyzers/token-filters/uppercase.md @@ -0,0 +1,83 @@ +--- +layout: default +title: Uppercase +parent: Token filters +nav_order: 460 +--- + +# Uppercase token filter + +The `uppercase` token filter is used to convert all tokens (words) to uppercase during analysis. + +## Example + +The following example request creates a new index named `uppercase_example` and configures an analyzer with an `uppercase` filter: + +```json +PUT /uppercase_example +{ + "settings": { + "analysis": { + "filter": { + "uppercase_filter": { + "type": "uppercase" + } + }, + "analyzer": { + "uppercase_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "uppercase_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /uppercase_example/_analyze +{ + "analyzer": "uppercase_analyzer", + "text": "OpenSearch is powerful" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "OPENSEARCH", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0 + }, + { + "token": "IS", + "start_offset": 11, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "POWERFUL", + "start_offset": 14, + "end_offset": 22, + "type": "", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/word-delimiter-graph.md b/_analyzers/token-filters/word-delimiter-graph.md new file mode 100644 index 00000000000..b901f5a0e53 --- /dev/null +++ b/_analyzers/token-filters/word-delimiter-graph.md @@ -0,0 +1,164 @@ +--- +layout: default +title: Word delimiter graph +parent: Token filters +nav_order: 480 +--- + +# Word delimiter graph token filter + +The `word_delimiter_graph` token filter is used to splits token on predefined characters and also offers optional token normalization based on customizable rules. 
+ +The `word_delimiter_graph` filter is used to remove punctuation from complex identifiers like part numbers or product IDs. In such cases, it is best used with the `keyword` tokenizer. For hyphenated words, use the `synonym_graph` token filter instead of the `word_delimiter_graph` filter because users frequently search for these terms both with and without hyphens. +{: .note} + +By default, the filter applies the following rules. + +| Description | Input | Output | +|:---|:---|:---| +| Treats non-alphanumeric characters as delimiters. | `ultra-fast` | `ultra`, `fast` | +| Removes delimiters at the beginning or end of tokens. | `Z99++'Decoder'`| `Z99`, `Decoder` | +| Splits tokens when there is a transition between uppercase and lowercase letters. | `OpenSearch` | `Open`, `Search` | +| Splits tokens when there is a transition between letters and numbers. | `T1000` | `T`, `1000` | +| Removes the possessive ('s) from the end of tokens. | `John's` | `John` | + +It's important **not** to use tokenizers that strip punctuation, like the `standard` tokenizer, with this filter. Doing so may prevent proper token splitting and interfere with options like `catenate_all` or `preserve_original`. We recommend using this filter with a `keyword` or `whitespace` tokenizer. +{: .important} + +## Parameters + +You can configure the `word_delimiter_graph` token filter using the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`adjust_offsets` | Optional | Boolean | Determines whether the token offsets should be recalculated for split or concatenated tokens. When `true`, the filter adjusts the token offsets to accurately represent the token's position within the token stream. This adjustment ensures that the token's location in the text aligns with its modified form after processing, which is particularly useful for applications like highlighting or phrase queries. When `false`, the offsets remain unchanged, which may result in misalignment when the processed tokens are mapped back to their positions in the original text. If your analyzer uses filters like `trim` that change the token lengths without changing their offsets, we recommend setting this parameter to `false`. Default is `true`. +`catenate_all` | Optional | Boolean | Produces concatenated tokens from a sequence of alphanumeric parts. For example, `"quick-fast-200"` becomes `[ quickfast200, quick, fast, 200 ]`. Default is `false`. +`catenate_numbers` | Optional | Boolean | Concatenates numerical sequences. For example, `"10-20-30"` becomes `[ 102030, 10, 20, 30 ]`. Default is `false`. +`catenate_words` | Optional | Boolean | Concatenates alphabetic words. For example, `"high-speed-level"` becomes `[ highspeedlevel, high, speed, level ]`. Default is `false`. +`generate_number_parts` | Optional | Boolean | If `true`, numeric tokens (tokens consisting of numbers only) are included in the output. Default is `true`. +`generate_word_parts` | Optional | Boolean | If `true`, alphabetical tokens (tokens consisting of alphabetic characters only) are included in the output. Default is `true`. +`ignore_keywords` | Optional | Boolean | Whether to process tokens marked as keywords. Default is `false`. +`preserve_original` | Optional | Boolean | Keeps the original token (which may include non-alphanumeric delimiters) alongside the generated tokens in the output. For example, `"auto-drive-300"` becomes `[ auto-drive-300, auto, drive, 300 ]`. 
If `true`, the filter generates multi-position tokens, which are not supported for indexing, so either avoid using this filter in an index analyzer or apply the `flatten_graph` filter after it. Default is `false`. +`protected_words` | Optional | Array of strings | Specifies tokens that should not be split. +`protected_words_path` | Optional | String | Specifies a path (absolute or relative to the config directory) to a file containing a list of tokens, one per line, that should not be split. +`split_on_case_change` | Optional | Boolean | Splits tokens where consecutive letters have different cases (one is lowercase and the other is uppercase). For example, `"OpenSearch"` becomes `[ Open, Search ]`. Default is `true`. +`split_on_numerics` | Optional | Boolean | Splits tokens where letters and numbers appear consecutively. For example, `"v8engine"` becomes `[ v, 8, engine ]`. Default is `true`. +`stem_english_possessive` | Optional | Boolean | Removes English possessive endings, such as `'s`. Default is `true`. +`type_table` | Optional | Array of strings | A custom map that specifies how to treat characters and whether to treat them as delimiters, which avoids unwanted splitting. For example, to treat a hyphen (`-`) as an alphanumeric character, specify `["- => ALPHA"]` so that words are not split on hyphens. Valid types are:
- `ALPHA`: alphabetical
- `ALPHANUM`: alphanumeric
- `DIGIT`: numeric
- `LOWER`: lowercase alphabetical
- `SUBWORD_DELIM`: non-alphanumeric delimiter
- `UPPER`: uppercase alphabetical +`type_table_path` | Optional | String | Specifies a path (absolute or relative to the config directory) to a file containing a custom character map. The map specifies how to treat characters and whether to treat them as delimiters, which avoids unwanted splitting. For valid types, see `type_table`. + +## Example + +The following example request creates a new index named `my-custom-index` and configures an analyzer with a `word_delimiter_graph` filter: + +```json +PUT /my-custom-index +{ + "settings": { + "analysis": { + "analyzer": { + "custom_analyzer": { + "tokenizer": "keyword", + "filter": [ "custom_word_delimiter_filter" ] + } + }, + "filter": { + "custom_word_delimiter_filter": { + "type": "word_delimiter_graph", + "split_on_case_change": true, + "split_on_numerics": true, + "stem_english_possessive": true + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-custom-index/_analyze +{ + "analyzer": "custom_analyzer", + "text": "FastCar's Model2023" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "Fast", + "start_offset": 0, + "end_offset": 4, + "type": "word", + "position": 0 + }, + { + "token": "Car", + "start_offset": 4, + "end_offset": 7, + "type": "word", + "position": 1 + }, + { + "token": "Model", + "start_offset": 10, + "end_offset": 15, + "type": "word", + "position": 2 + }, + { + "token": "2023", + "start_offset": 15, + "end_offset": 19, + "type": "word", + "position": 3 + } + ] +} +``` + + +## Differences between the word_delimiter_graph and word_delimiter filters + + +Both the `word_delimiter_graph` and `word_delimiter` token filters generate tokens spanning multiple positions when any of the following parameters are set to `true`: + +- `catenate_all` +- `catenate_numbers` +- `catenate_words` +- `preserve_original` + +To illustrate the differences between these filters, consider the input text `Pro-XT500`. + + +### word_delimiter_graph + + +The `word_delimiter_graph` filter assigns a `positionLength` attribute to multi-position tokens, indicating how many positions a token spans. This ensures that the filter always generates valid token graphs, making it suitable for use in advanced token graph scenarios. Although token graphs with multi-position tokens are not supported for indexing, they can still be useful in search scenarios. For example, queries like `match_phrase` can use these graphs to generate multiple subqueries from a single input string. For the example input text, the `word_delimiter_graph` filter generates the following tokens: + +- `Pro` (position 1) +- `XT500` (position 2) +- `ProXT500` (position 1, `positionLength`: 2) + +The `positionLength` attribute the production of a valid graph to be used in advanced queries. + + +### word_delimiter + + +In contrast, the `word_delimiter` filter does not assign a `positionLength` attribute to multi-position tokens, leading to invalid graphs when these tokens are present. For the example input text, the `word_delimiter` filter generates the following tokens: + +- `Pro` (position 1) +- `XT500` (position 2) +- `ProXT500` (position 1, no `positionLength`) + +The lack of a `positionLength` attribute results in a token graph that is invalid for token streams containing multi-position tokens. 
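+
+To reproduce this comparison without creating an index, you can run the filter directly through the `_analyze` API. The following is a minimal sketch, not part of the original example: it uses an inline `word_delimiter_graph` filter with `catenate_all` enabled and `split_on_numerics` disabled so that `XT500` stays intact, matching the token lists above:
+
+```json
+GET /_analyze
+{
+  "tokenizer": "whitespace",
+  "filter": [
+    {
+      "type": "word_delimiter_graph",
+      "catenate_all": true,
+      "split_on_numerics": false
+    }
+  ],
+  "text": "Pro-XT500",
+  "explain": true
+}
+```
+{% include copy-curl.html %}
+
+With `explain` enabled, the detailed output should show the concatenated token `ProXT500` carrying a `positionLength` of `2`, whereas swapping in the `word_delimiter` filter produces the same tokens without that attribute.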
\ No newline at end of file diff --git a/_analyzers/token-filters/word-delimiter.md b/_analyzers/token-filters/word-delimiter.md new file mode 100644 index 00000000000..77a71f28fb0 --- /dev/null +++ b/_analyzers/token-filters/word-delimiter.md @@ -0,0 +1,128 @@ +--- +layout: default +title: Word delimiter +parent: Token filters +nav_order: 470 +--- + +# Word delimiter token filter + +The `word_delimiter` token filter is used to splits token on predefined characters and also offers optional token normalization based on customizable rules. + +We recommend using the `word_delimiter_graph` filter instead of the `word_delimiter` filter whenever possible because the `word_delimiter` filter sometimes produces invalid token graphs. For more information about the differences between the two filters, see [Differences between the `word_delimiter_graph` and `word_delimiter` filters]({{site.url}}{{site.baseurl}}/analyzers/token-filters/word-delimiter-graph/#differences-between-the-word_delimiter_graph-and-word_delimiter-filters). +{: .important} + +The `word_delimiter` filter is used to remove punctuation from complex identifiers like part numbers or product IDs. In such cases, it is best used with the `keyword` tokenizer. For hyphenated words, use the `synonym_graph` token filter instead of the `word_delimiter` filter because users frequently search for these terms both with and without hyphens. +{: .note} + +By default, the filter applies the following rules. + +| Description | Input | Output | +|:---|:---|:---| +| Treats non-alphanumeric characters as delimiters. | `ultra-fast` | `ultra`, `fast` | +| Removes delimiters at the beginning or end of tokens. | `Z99++'Decoder'`| `Z99`, `Decoder` | +| Splits tokens when there is a transition between uppercase and lowercase letters. | `OpenSearch` | `Open`, `Search` | +| Splits tokens when there is a transition between letters and numbers. | `T1000` | `T`, `1000` | +| Removes the possessive ('s) from the end of tokens. | `John's` | `John` | + +It's important **not** to use tokenizers that strip punctuation, like the `standard` tokenizer, with this filter. Doing so may prevent proper token splitting and interfere with options like `catenate_all` or `preserve_original`. We recommend using this filter with a `keyword` or `whitespace` tokenizer. +{: .important} + +## Parameters + +You can configure the `word_delimiter` token filter using the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`catenate_all` | Optional | Boolean | Produces concatenated tokens from a sequence of alphanumeric parts. For example, `"quick-fast-200"` becomes `[ quickfast200, quick, fast, 200 ]`. Default is `false`. +`catenate_numbers` | Optional | Boolean | Concatenates numerical sequences. For example, `"10-20-30"` becomes `[ 102030, 10, 20, 30 ]`. Default is `false`. +`catenate_words` | Optional | Boolean | Concatenates alphabetic words. For example, `"high-speed-level"` becomes `[ highspeedlevel, high, speed, level ]`. Default is `false`. +`generate_number_parts` | Optional | Boolean | If `true`, numeric tokens (tokens consisting of numbers only) are included in the output. Default is `true`. +`generate_word_parts` | Optional | Boolean | If `true`, alphabetical tokens (tokens consisting of alphabetic characters only) are included in the output. Default is `true`. +`preserve_original` | Optional | Boolean | Keeps the original token (which may include non-alphanumeric delimiters) alongside the generated tokens in the output. 
For example, `"auto-drive-300"` becomes `[ auto-drive-300, auto, drive, 300 ]`. If `true`, the filter generates multi-position tokens not supported by indexing, so do not use this filter in an index analyzer or use the `flatten_graph` filter after this filter. Default is `false`. +`protected_words` | Optional | Array of strings | Specifies tokens that should not be split. +`protected_words_path` | Optional | String | Specifies a path (absolute or relative to the config directory) to a file containing tokens that should not be separated by new lines. +`split_on_case_change` | Optional | Boolean | Splits tokens where consecutive letters have different cases (one is lowercase and the other is uppercase). For example, `"OpenSearch"` becomes `[ Open, Search ]`. Default is `true`. +`split_on_numerics` | Optional | Boolean | Splits tokens where there are consecutive letters and numbers. For example `"v8engine"` will become `[ v, 8, engine ]`. Default is `true`. +`stem_english_possessive` | Optional | Boolean | Removes English possessive endings, such as `'s`. Default is `true`. +`type_table` | Optional | Array of strings | A custom map that specifies how to treat characters and whether to treat them as delimiters, which avoids unwanted splitting. For example, to treat a hyphen (`-`) as an alphanumeric character, specify `["- => ALPHA"]` so that words are not split on hyphens. Valid types are:
- `ALPHA`: alphabetical
- `ALPHANUM`: alphanumeric
- `DIGIT`: numeric
- `LOWER`: lowercase alphabetical
- `SUBWORD_DELIM`: non-alphanumeric delimiter
- `UPPER`: uppercase alphabetical +`type_table_path` | Optional | String | Specifies a path (absolute or relative to the config directory) to a file containing a custom character map. The map specifies how to treat characters and whether to treat them as delimiters, which avoids unwanted splitting. For valid types, see `type_table`. + +## Example + +The following example request creates a new index named `my-custom-index` and configures an analyzer with a `word_delimiter` filter: + +```json +PUT /my-custom-index +{ + "settings": { + "analysis": { + "analyzer": { + "custom_analyzer": { + "tokenizer": "keyword", + "filter": [ "custom_word_delimiter_filter" ] + } + }, + "filter": { + "custom_word_delimiter_filter": { + "type": "word_delimiter", + "split_on_case_change": true, + "split_on_numerics": true, + "stem_english_possessive": true + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-custom-index/_analyze +{ + "analyzer": "custom_analyzer", + "text": "FastCar's Model2023" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "Fast", + "start_offset": 0, + "end_offset": 4, + "type": "word", + "position": 0 + }, + { + "token": "Car", + "start_offset": 4, + "end_offset": 7, + "type": "word", + "position": 1 + }, + { + "token": "Model", + "start_offset": 10, + "end_offset": 15, + "type": "word", + "position": 2 + }, + { + "token": "2023", + "start_offset": 15, + "end_offset": 19, + "type": "word", + "position": 3 + } + ] +} +``` diff --git a/_analyzers/token-graph.md b/_analyzers/token-graph.md new file mode 100644 index 00000000000..0dc8eba3008 --- /dev/null +++ b/_analyzers/token-graph.md @@ -0,0 +1,201 @@ +--- +layout: default +title: Token graphs +nav_order: 150 +--- + +# Token graphs + +Token graphs show how tokens relate to each other during text analysis, particularly when handling multi-word synonyms or compound words. They help ensure accurate query matching and phrase expansion. + +Each token is assigned the following metadata: + +- `position` – The location of the token in the text + +- `positionLength` – How many positions the token spans (used in multi-word expressions) + +Token graphs use this information to build a graph structure of token relationships, which is later used during query parsing. Graph-aware token filters, such as [`synonym_graph`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/synonym-graph/) and [`word_delimiter_graph`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/word-delimiter-graph/), enable you to match phrases more accurately. + +The following diagram depicts the relationship between `position` and `positionLength` when using [`synonym_graph`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/synonym-graph/). The "NYC" token is assigned a `position` of `0` and a `positionLength` of `3`. + +token graph + +## Using token graphs during indexing and querying + +At index time, `positionLength` is ignored and token graphs are not used. 
+ +During query execution, various query types can leverage token graphs, with the following being the most frequently used: + +- [`match`]({{site.url}}{{site.baseurl}}/query-dsl/full-text/match/) +- [`match_phrase`]({{site.url}}{{site.baseurl}}/query-dsl/full-text/match-phrase/) + +## Example: Synonym compared to synonym graph + +To better understand the difference between graph-aware token filters and standard token filters, you can use the following steps to compare the [`synonym`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/synonym/) token filter with the [`synonym_graph`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/synonym-graph/) token filter: + +1. Create an index with a [`synonym`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/synonym/) token filter (not graph aware): + + ```json + PUT /synonym_index + { + "settings": { + "analysis": { + "filter": { + "my_synonyms": { + "type": "synonym", + "synonyms": ["ssd => solid state drive"] + } + }, + "analyzer": { + "my_analyzer": { + "tokenizer": "standard", + "filter": ["lowercase", "my_synonyms"] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "my_analyzer" + } + } + } + } + ``` + {% include copy-curl.html %} + +2. Create an index with a [`synonym_graph`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/synonym-graph/) token filter (graph aware): + + ```json + PUT /synonym_graph_index + { + "settings": { + "analysis": { + "filter": { + "my_synonyms": { + "type": "synonym_graph", + "synonyms": ["ssd => solid state drive"] + } + }, + "analyzer": { + "my_analyzer": { + "tokenizer": "standard", + "filter": ["lowercase", "my_synonyms"] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "my_analyzer" + } + } + } + } + ``` + {% include copy-curl.html %} + +3. Create the same document in each index: + + ```json + PUT /synonym_index/_doc/1 + { "content": "ssd is critical" } + ``` + {% include copy-curl.html %} + + ```json + PUT /synonym_graph_index/_doc/1 + { "content": "ssd is critical" } + ``` + {% include copy-curl.html %} + +4. Search the non-graph-aware index: + + ```json + POST /synonym_index/_search + { + "query": { + "match_phrase": { + "content": "solid state drive is critical" + } + } + } + ``` + {% include copy-curl.html %} + + The response contains no hits: + + ```json + { + "took": 13, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 0, + "relation": "eq" + }, + "max_score": null, + "hits": [] + } + } + ``` + +5. Search the graph-aware index: + + ```json + POST /synonym_graph_index/_search + { + "query": { + "match_phrase": { + "content": "solid state drive is critical" + } + } + } + ``` + {% include copy-curl.html %} + + The response contains one hit: + + ```json + { + "took": 9, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1.4384103, + "hits": [ + { + "_index": "synonym_graph_index", + "_id": "1", + "_score": 1.4384103, + "_source": { + "content": "ssd is critical" + } + } + ] + } + } + ``` + +A hit occurs when using the graph-aware token filter because during the [`match_phrase`]({{site.url}}{{site.baseurl}}/query-dsl/full-text/match-phrase/) query, an additional subquery is generated using the token graph. 
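+
+As an optional check (not part of the original walkthrough), you can compare how each analyzer processes the stored text by calling the `_analyze` API on both indexes with the same input:
+
+```json
+GET /synonym_graph_index/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "ssd is critical"
+}
+```
+{% include copy-curl.html %}
+
+Repeat the request against `synonym_index`. With the graph-aware filter, the expanded tokens `solid`, `state`, and `drive` should occupy consecutive positions followed by `is` and `critical`, while the non-graph filter leaves `is` and `critical` overlapping the expanded tokens, which is what breaks the phrase match.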
The following diagram illustrates the token graph created by the graph-aware token filter. + +token graph \ No newline at end of file diff --git a/_analyzers/tokenizers/character-group.md b/_analyzers/tokenizers/character-group.md new file mode 100644 index 00000000000..56e52780fc8 --- /dev/null +++ b/_analyzers/tokenizers/character-group.md @@ -0,0 +1,124 @@ +--- +layout: default +title: Character group +parent: Tokenizers +nav_order: 20 +has_children: false +has_toc: false +--- + +# Character group tokenizer + +The `char_group` tokenizer splits text into tokens using specific characters as delimiters. It is suitable for situations requiring straightforward tokenization, offering a simpler alternative to pattern-based tokenizers without the added complexity. + +## Example usage + +The following example request creates a new index named `my_index` and configures an analyzer with a `char_group` tokenizer. The tokenizer splits text on white space, `-`, and `:` characters: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "tokenizer": { + "my_char_group_tokenizer": { + "type": "char_group", + "tokenize_on_chars": [ + "whitespace", + "-", + ":" + ] + } + }, + "analyzer": { + "my_char_group_analyzer": { + "type": "custom", + "tokenizer": "my_char_group_tokenizer" + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "my_char_group_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_char_group_analyzer", + "text": "Fast-driving cars: they drive fast!" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "Fast", + "start_offset": 0, + "end_offset": 4, + "type": "word", + "position": 0 + }, + { + "token": "driving", + "start_offset": 5, + "end_offset": 12, + "type": "word", + "position": 1 + }, + { + "token": "cars", + "start_offset": 13, + "end_offset": 17, + "type": "word", + "position": 2 + }, + { + "token": "they", + "start_offset": 19, + "end_offset": 23, + "type": "word", + "position": 3 + }, + { + "token": "drive", + "start_offset": 24, + "end_offset": 29, + "type": "word", + "position": 4 + }, + { + "token": "fast!", + "start_offset": 30, + "end_offset": 35, + "type": "word", + "position": 5 + } + ] +} +``` + +## Parameters + +The `char_group` tokenizer can be configured with the following parameters. + +| **Parameter** | **Required/Optional** | **Data type** | **Description** | +| :--- | :--- | :--- | :--- | +| `tokenize_on_chars` | Required | Array | Specifies a set of characters on which the text should be tokenized. You can specify single characters (for example, `-` or `@`), including escape characters (for example, `\n`), or character classes such as `whitespace`, `letter`, `digit`, `punctuation`, or `symbol`. | +| `max_token_length` | Optional | Integer | Sets the maximum length of the produced token. If this length is exceeded, the token is split into multiple tokens at the length configured in `max_token_length`. Default is `255`. 
| \ No newline at end of file diff --git a/_analyzers/tokenizers/classic.md b/_analyzers/tokenizers/classic.md new file mode 100644 index 00000000000..c2ae4c23d02 --- /dev/null +++ b/_analyzers/tokenizers/classic.md @@ -0,0 +1,225 @@ +--- +layout: default +title: Classic +parent: Tokenizers +nav_order: 35 + +--- + +# Classic tokenizer + +The `classic` tokenizer parses text, applying English language grammatical rules to break the text into tokens. It includes specific logic to handle patterns such as the following: + +- Acronyms +- Email addresses +- Domain names +- Certain types of punctuation + +This tokenizer works best with the English language. It may not produce optimal results for other languages, especially those with different grammatical structures. +{: .note} + +The `classic` tokenizer parses text as follows: + +- **Punctuation**: Splits text on most punctuation marks and removes punctuation characters. Dots that aren't followed by spaces are treated as part of the token. +- **Hyphens**: Splits words at hyphens, except when a number is present. When a number is present in a token, the token is not split and is treated like a product number. +- **Email**: Recognizes email addresses and hostnames and keeps them as single tokens. + +## Example usage + +The following example request creates a new index named `my_index` and configures an analyzer with a `classic` tokenizer: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_classic_analyzer": { + "type": "custom", + "tokenizer": "classic" + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "my_classic_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_classic_analyzer", + "text": "For product AB3423, visit X&Y at example.com, email info@example.com, or call the operator's phone number 1-800-555-1234. P.S. 你好." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "For", + "start_offset": 0, + "end_offset": 3, + "type": "", + "position": 0 + }, + { + "token": "product", + "start_offset": 4, + "end_offset": 11, + "type": "", + "position": 1 + }, + { + "token": "AB3423", + "start_offset": 12, + "end_offset": 18, + "type": "", + "position": 2 + }, + { + "token": "visit", + "start_offset": 20, + "end_offset": 25, + "type": "", + "position": 3 + }, + { + "token": "X&Y", + "start_offset": 26, + "end_offset": 29, + "type": "", + "position": 4 + }, + { + "token": "at", + "start_offset": 30, + "end_offset": 32, + "type": "", + "position": 5 + }, + { + "token": "example.com", + "start_offset": 33, + "end_offset": 44, + "type": "", + "position": 6 + }, + { + "token": "email", + "start_offset": 46, + "end_offset": 51, + "type": "", + "position": 7 + }, + { + "token": "info@example.com", + "start_offset": 52, + "end_offset": 68, + "type": "", + "position": 8 + }, + { + "token": "or", + "start_offset": 70, + "end_offset": 72, + "type": "", + "position": 9 + }, + { + "token": "call", + "start_offset": 73, + "end_offset": 77, + "type": "", + "position": 10 + }, + { + "token": "the", + "start_offset": 78, + "end_offset": 81, + "type": "", + "position": 11 + }, + { + "token": "operator's", + "start_offset": 82, + "end_offset": 92, + "type": "", + "position": 12 + }, + { + "token": "phone", + "start_offset": 93, + "end_offset": 98, + "type": "", + "position": 13 + }, + { + "token": "number", + "start_offset": 99, + "end_offset": 105, + "type": "", + "position": 14 + }, + { + "token": "1-800-555-1234", + "start_offset": 106, + "end_offset": 120, + "type": "", + "position": 15 + }, + { + "token": "P.S.", + "start_offset": 122, + "end_offset": 126, + "type": "", + "position": 16 + }, + { + "token": "你", + "start_offset": 127, + "end_offset": 128, + "type": "", + "position": 17 + }, + { + "token": "好", + "start_offset": 128, + "end_offset": 129, + "type": "", + "position": 18 + } + ] +} +``` + +## Token types + +The `classic` tokenizer produces the following token types. + +| Token type | Description | +| :--- | :--- | +| `` | Alphanumeric tokens consisting of letters, numbers, or a combination of both. | +| ``| Tokens containing an apostrophe, commonly used in possessives or contractions (for example, `John's`). | +| `` | Acronyms or abbreviations, often identified by a trailing period (for example, `P.S.` or `U.S.A.`). | +| `` | Tokens representing company names (for example, `X&Y`). If these tokens aren't produced automatically, you may need custom configurations or filters. | +| `` | Tokens matching email addresses, containing an `@` symbol and a domain (for example,`support@widgets.co` or `info@example.com`). | +| `` | Tokens matching website or host names, often containing `www.` or a domain suffix like `.com` (for example, `www.example.com` or `example.org`). | +| `` | Tokens containing only numbers or numeric-like sequences (for example, `1-800`, `12345`, or `3.14`). | +| `` | Tokens representing Chinese or Japanese characters. | +| `` | Deprecated acronym handling (for example, acronyms with different parsing rules in older versions). Rarely used---exists primarily for backward compatibility with legacy tokenizer rules. 
| diff --git a/_analyzers/tokenizers/edge-n-gram.md b/_analyzers/tokenizers/edge-n-gram.md new file mode 100644 index 00000000000..a01afaf19b7 --- /dev/null +++ b/_analyzers/tokenizers/edge-n-gram.md @@ -0,0 +1,203 @@ +--- +layout: default +title: Edge n-gram +parent: Tokenizers +nav_order: 40 +--- + +# Edge n-gram tokenizer + +The `edge_ngram` tokenizer generates partial word tokens, or _n-grams_, starting from the beginning of each word. It splits the text based on specified characters and produces tokens within a defined minimum and maximum length range. This tokenizer is particularly useful for implementing search-as-you-type functionality. + +Edge n-grams are ideal for autocomplete searches where the order of the words may vary, such as when searching for product names or addresses. For more information, see [Autocomplete]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/autocomplete/). However, for text with a fixed order, like movie or song titles, the completion suggester may be more accurate. + +By default, the `edge n-gram` tokenizer produces tokens with a minimum length of `1` and a maximum length of `2`. For example, when analyzing the text `OpenSearch`, the default configuration will produce the `O` and `Op` n-grams. These short n-grams often match too many irrelevant terms, so configuring the tokenizer is necessary in order to adjust the n-gram lengths. + +## Example usage + +The following example request creates a new index named `my_index` and configures an analyzer with an `edge_ngram` tokenizer. The tokenizer produces tokens 3--6 characters in length, considering both letters and symbols to be valid token characters: + +```json +PUT /edge_n_gram_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_custom_analyzer": { + "tokenizer": "my_custom_tokenizer" + } + }, + "tokenizer": { + "my_custom_tokenizer": { + "type": "edge_ngram", + "min_gram": 3, + "max_gram": 6, + "token_chars": [ + "letter" ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /edge_n_gram_index/_analyze +{ + "analyzer": "my_custom_analyzer", + "text": "Code 42 rocks!" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "Cod", + "start_offset": 0, + "end_offset": 3, + "type": "word", + "position": 0 + }, + { + "token": "Code", + "start_offset": 0, + "end_offset": 4, + "type": "word", + "position": 1 + }, + { + "token": "roc", + "start_offset": 8, + "end_offset": 11, + "type": "word", + "position": 2 + }, + { + "token": "rock", + "start_offset": 8, + "end_offset": 12, + "type": "word", + "position": 3 + }, + { + "token": "rocks", + "start_offset": 8, + "end_offset": 13, + "type": "word", + "position": 4 + } + ] +} +``` + +## Parameters + +| Parameter | Required/Optional | Data type | Description | +|:-------|:--------|:------|:---| +| `min_gram` | Optional | Integer | The minimum token length. Default is `1`. | +| `max_gram` | Optional | Integer | The maximum token length. Default is `2`. | +| `custom_token_chars`| Optional | String | Defines custom characters to be treated as part of a token (for example, `+-_`). | +| `token_chars` | Optional | Array of strings | Defines character classes to include in tokens. Tokens are split on characters not included in these classes. Default includes all characters. Available classes include:
- `letter`: Alphabetic characters (for example, `a`, `ç`, or `京`)
- `digit`: Numeric characters (for example, `3` or `7`)
- `punctuation`: Punctuation symbols (for example, `!` or `?`)
- `symbol`: Other symbols (for example, `$` or `√`)
- `whitespace`: Space or newline characters
- `custom`: Allows you to specify custom characters in the `custom_token_chars` setting. | + + +## max_gram parameter limitations + +The `max_gram` parameter sets the maximum length of tokens generated by the tokenizer. When a search query exceeds this length, it may fail to match any terms in the index. + +For example, if `max_gram` is set to `4`, the query `explore` would be tokenized as `expl` during indexing. As a result, a search for the full term `explore` will not match the indexed token `expl`. + +To address this limitation, you can apply a `truncate` token filter to shorten search terms to the maximum token length. However, this approach presents trade-offs. Truncating `explore` to `expl` might lead to matches with unrelated terms like `explosion` or `explicit`, reducing search precision. + +We recommend carefully balancing the `max_gram` value to ensure efficient tokenization while minimizing irrelevant matches. If precision is critical, consider alternative strategies, such as adjusting query analyzers or fine-tuning filters. + +## Best practices + +We recommend using the `edge_ngram` tokenizer only at indexing time in order to ensure that partial word tokens are stored. At search time, a basic analyzer should be used to match all query terms. + +## Configuring search-as-you-type functionality + +To implement search-as-you-type functionality, use the `edge_ngram` tokenizer during indexing and an analyzer that performs minimal processing at search time. The following example demonstrates this approach. + +Create an index with an `edge_ngram` tokenizer: + + +```json +PUT /my-autocomplete-index +{ + "settings": { + "analysis": { + "analyzer": { + "autocomplete": { + "tokenizer": "autocomplete", + "filter": [ + "lowercase" + ] + }, + "autocomplete_search": { + "tokenizer": "lowercase" + } + }, + "tokenizer": { + "autocomplete": { + "type": "edge_ngram", + "min_gram": 2, + "max_gram": 10, + "token_chars": [ + "letter" + ] + } + } + } + }, + "mappings": { + "properties": { + "title": { + "type": "text", + "analyzer": "autocomplete", + "search_analyzer": "autocomplete_search" + } + } + } +} +``` +{% include copy-curl.html %} + +Index a document containing a `product` field and refresh the index: + +```json +PUT my-autocomplete-index/_doc/1?refresh +{ + "title": "Laptop Pro" +} +``` +{% include copy-curl.html %} + +This configuration ensures that the `edge_ngram` tokenizer breaks terms like "Laptop" into tokens such as `La`, `Lap`, and `Lapt`, allowing partial matches during search. At search time, the `standard` tokenizer simplifies queries while ensuring that matches are case-insensitive because of the lowercase filter. + +Searches for `laptop Pr` or `lap pr` now retrieve the relevant document based on partial matches: + +```json +GET my-autocomplete-index/_search +{ + "query": { + "match": { + "title": { + "query": "lap pr", + "operator": "and" + } + } + } +} +``` +{% include copy-curl.html %} + +For more information, see [Search as you type]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/autocomplete/#search-as-you-type). \ No newline at end of file diff --git a/_analyzers/tokenizers/index.md b/_analyzers/tokenizers/index.md index e5ac796c12d..cef14297789 100644 --- a/_analyzers/tokenizers/index.md +++ b/_analyzers/tokenizers/index.md @@ -2,7 +2,7 @@ layout: default title: Tokenizers nav_order: 60 -has_children: false +has_children: true has_toc: false redirect_from: - /analyzers/tokenizers/index/ @@ -30,13 +30,13 @@ Word tokenizers parse full text into words. 
Tokenizer | Description | Example :--- | :--- | :--- -`standard` | - Parses strings into tokens at word boundaries
- Removes most punctuation | `It’s fun to contribute a brand-new PR or 2 to OpenSearch!`
becomes
[`It’s`, `fun`, `to`, `contribute`, `a`,`brand`, `new`, `PR`, `or`, `2`, `to`, `OpenSearch`] -`letter` | - Parses strings into tokens on any non-letter character
- Removes non-letter characters | `It’s fun to contribute a brand-new PR or 2 to OpenSearch!`
becomes
[`It`, `s`, `fun`, `to`, `contribute`, `a`,`brand`, `new`, `PR`, `or`, `to`, `OpenSearch`] -`lowercase` | - Parses strings into tokens on any non-letter character
- Removes non-letter characters
- Converts terms to lowercase | `It’s fun to contribute a brand-new PR or 2 to OpenSearch!`
becomes
[`it`, `s`, `fun`, `to`, `contribute`, `a`,`brand`, `new`, `pr`, `or`, `to`, `opensearch`] -`whitespace` | - Parses strings into tokens at white space characters | `It’s fun to contribute a brand-new PR or 2 to OpenSearch!`
becomes
[`It’s`, `fun`, `to`, `contribute`, `a`,`brand-new`, `PR`, `or`, `2`, `to`, `OpenSearch!`] -`uax_url_email` | - Similar to the standard tokenizer
- Unlike the standard tokenizer, leaves URLs and email addresses as single terms | `It’s fun to contribute a brand-new PR or 2 to OpenSearch opensearch-project@github.com!`
becomes
[`It’s`, `fun`, `to`, `contribute`, `a`,`brand`, `new`, `PR`, `or`, `2`, `to`, `OpenSearch`, `opensearch-project@github.com`] -`classic` | - Parses strings into tokens on:
  - Punctuation characters that are followed by a white space character
  - Hyphens if the term does not contain numbers
- Removes punctuation
- Leaves URLs and email addresses as single terms | `Part number PA-35234, single-use product (128.32)`
becomes
[`Part`, `number`, `PA-35234`, `single`, `use`, `product`, `128.32`] -`thai` | - Parses Thai text into terms | `สวัสดีและยินดีต`
becomes
[`สวัสด`, `และ`, `ยินดี`, `ต`] +[`standard`]({{site.url}}{{site.baseurl}}/analyzers/tokenizers/standard/) | - Parses strings into tokens at word boundaries
- Removes most punctuation | `It’s fun to contribute a brand-new PR or 2 to OpenSearch!`
becomes
[`It’s`, `fun`, `to`, `contribute`, `a`, `brand`, `new`, `PR`, `or`, `2`, `to`, `OpenSearch`] +[`letter`]({{site.url}}{{site.baseurl}}/analyzers/tokenizers/letter/) | - Parses strings into tokens on any non-letter character
- Removes non-letter characters | `It’s fun to contribute a brand-new PR or 2 to OpenSearch!`
becomes
[`It`, `s`, `fun`, `to`, `contribute`, `a`, `brand`, `new`, `PR`, `or`, `to`, `OpenSearch`] +[`lowercase`]({{site.url}}{{site.baseurl}}/analyzers/tokenizers/lowercase/) | - Parses strings into tokens on any non-letter character
- Removes non-letter characters
- Converts terms to lowercase | `It’s fun to contribute a brand-new PR or 2 to OpenSearch!`
becomes
[`it`, `s`, `fun`, `to`, `contribute`, `a`, `brand`, `new`, `pr`, `or`, `to`, `opensearch`] +[`whitespace`]({{site.url}}{{site.baseurl}}/analyzers/tokenizers/whitespace/) | - Parses strings into tokens at white space characters | `It’s fun to contribute a brand-new PR or 2 to OpenSearch!`
becomes
[`It’s`, `fun`, `to`, `contribute`, `a`, `brand-new`, `PR`, `or`, `2`, `to`, `OpenSearch!`] +[`uax_url_email`]({{site.url}}{{site.baseurl}}/analyzers/tokenizers/uax-url-email/) | - Similar to the standard tokenizer
- Unlike the standard tokenizer, leaves URLs and email addresses as single terms | `It’s fun to contribute a brand-new PR or 2 to OpenSearch opensearch-project@github.com!`
becomes
[`It’s`, `fun`, `to`, `contribute`, `a`, `brand`, `new`, `PR`, `or`, `2`, `to`, `OpenSearch`, `opensearch-project@github.com`] +[`classic`]({{site.url}}{{site.baseurl}}/analyzers/tokenizers/classic/) | - Parses strings into tokens on:
  - Punctuation characters that are followed by a white space character
  - Hyphens if the term does not contain numbers
- Removes punctuation
- Leaves URLs and email addresses as single terms | `Part number PA-35234, single-use product (128.32)`
becomes
[`Part`, `number`, `PA-35234`, `single`, `use`, `product`, `128.32`] +[`thai`]({{site.url}}{{site.baseurl}}/analyzers/tokenizers/thai/) | - Parses Thai text into terms | `สวัสดีและยินดีต`
becomes
[`สวัสด`, `และ`, `ยินดี`, `ต`] ### Partial word tokenizers @@ -44,8 +44,8 @@ Partial word tokenizers parse text into words and generate fragments of those wo Tokenizer | Description | Example :--- | :--- | :--- -`ngram`| - Parses strings into words on specified characters (for example, punctuation or white space characters) and generates n-grams of each word | `My repo`
becomes
[`M`, `My`, `y`, `y `,  ,  r, `r`, `re`, `e`, `ep`, `p`, `po`, `o`]
because the default n-gram length is 1--2 characters -`edge_ngram` | - Parses strings into words on specified characters (for example, punctuation or white space characters) and generates edge n-grams of each word (n-grams that start at the beginning of the word) | `My repo`
becomes
[`M`, `My`]
because the default n-gram length is 1--2 characters +[`ngram`]({{site.url}}{{site.baseurl}}/analyzers/tokenizers/ngram/)| - Parses strings into words on specified characters (for example, punctuation or white space characters) and generates n-grams of each word | `My repo`
becomes
[`M`, `My`, `y`, `y `, ` `, ` r`, `r`, `re`, `e`, `ep`, `p`, `po`, `o`]
because the default n-gram length is 1--2 characters +[`edge_ngram`]({{site.url}}{{site.baseurl}}/analyzers/tokenizers/edge-n-gram/) | - Parses strings into words on specified characters (for example, punctuation or white space characters) and generates edge n-grams of each word (n-grams that start at the beginning of the word) | `My repo`
becomes
[`M`, `My`]
because the default n-gram length is 1--2 characters ### Structured text tokenizers @@ -53,11 +53,11 @@ Structured text tokenizers parse structured text, such as identifiers, email add Tokenizer | Description | Example :--- | :--- | :--- -`keyword` | - No-op tokenizer
- Outputs the entire string unchanged
- Can be combined with token filters, like lowercase, to normalize terms | `My repo`
becomes
`My repo` -`pattern` | - Uses a regular expression pattern to parse text into terms on a word separator or to capture matching text as terms
- Uses [Java regular expressions](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html) | `https://opensearch.org/forum`
becomes
[`https`, `opensearch`, `org`, `forum`] because by default the tokenizer splits terms at word boundaries (`\W+`)
Can be configured with a regex pattern -`simple_pattern` | - Uses a regular expression pattern to return matching text as terms
- Uses [Lucene regular expressions](https://lucene.apache.org/core/8_7_0/core/org/apache/lucene/util/automaton/RegExp.html)
- Faster than the `pattern` tokenizer because it uses a subset of the `pattern` tokenizer regular expressions | Returns an empty array by default
Must be configured with a pattern because the pattern defaults to an empty string -`simple_pattern_split` | - Uses a regular expression pattern to split the text at matches rather than returning the matches as terms
- Uses [Lucene regular expressions](https://lucene.apache.org/core/8_7_0/core/org/apache/lucene/util/automaton/RegExp.html)
- Faster than the `pattern` tokenizer because it uses a subset of the `pattern` tokenizer regular expressions | No-op by default
Must be configured with a pattern -`char_group` | - Parses on a set of configurable characters
- Faster than tokenizers that run regular expressions | No-op by default
Must be configured with a list of characters -`path_hierarchy` | - Parses text on the path separator (by default, `/`) and returns a full path to each component in the tree hierarchy | `one/two/three`
becomes
[`one`, `one/two`, `one/two/three`] +[`keyword`]({{site.url}}{{site.baseurl}}/analyzers/tokenizers/keyword/) | - No-op tokenizer
- Outputs the entire string unchanged
- Can be combined with token filters, like lowercase, to normalize terms | `My repo`
becomes
`My repo` +[`pattern`]({{site.url}}{{site.baseurl}}/analyzers/tokenizers/pattern/) | - Uses a regular expression pattern to parse text into terms on a word separator or to capture matching text as terms
- Uses [Java regular expressions](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html) | `https://opensearch.org/forum`
becomes
[`https`, `opensearch`, `org`, `forum`] because by default the tokenizer splits terms at word boundaries (`\W+`)
Can be configured with a regex pattern +[`simple_pattern`]({{site.url}}{{site.baseurl}}/analyzers/tokenizers/simple-pattern/) | - Uses a regular expression pattern to return matching text as terms
- Uses [Lucene regular expressions](https://lucene.apache.org/core/8_7_0/core/org/apache/lucene/util/automaton/RegExp.html)
- Faster than the `pattern` tokenizer because it uses a subset of the `pattern` tokenizer regular expressions | Returns an empty array by default
Must be configured with a pattern because the pattern defaults to an empty string +[`simple_pattern_split`]({{site.url}}{{site.baseurl}}/analyzers/tokenizers/simple-pattern-split/) | - Uses a regular expression pattern to split the text on matches rather than returning the matches as terms
- Uses [Lucene regular expressions](https://lucene.apache.org/core/8_7_0/core/org/apache/lucene/util/automaton/RegExp.html)
- Faster than the `pattern` tokenizer because it uses a subset of the `pattern` tokenizer regular expressions | No-op by default
Must be configured with a pattern +[`char_group`]({{site.url}}{{site.baseurl}}/analyzers/tokenizers/character-group/) | - Parses on a set of configurable characters
- Faster than tokenizers that run regular expressions | No-op by default
Must be configured with a list of characters +[`path_hierarchy`]({{site.url}}{{site.baseurl}}/analyzers/tokenizers/path-hierarchy/) | - Parses text on the path separator (by default, `/`) and returns a full path to each component in the tree hierarchy | `one/two/three`
becomes
[`one`, `one/two`, `one/two/three`] diff --git a/_analyzers/tokenizers/keyword.md b/_analyzers/tokenizers/keyword.md new file mode 100644 index 00000000000..8b77d38ca5a --- /dev/null +++ b/_analyzers/tokenizers/keyword.md @@ -0,0 +1,119 @@ +--- +layout: default +title: Keyword +parent: Tokenizers +nav_order: 50 +--- + +# Keyword tokenizer + +The `keyword` tokenizer ingests text and outputs it exactly as a single, unaltered token. This makes it particularly useful when you want the input to remain intact, such as when managing structured data like names, product codes, or email addresses. + +The `keyword` tokenizer can be paired with token filters to process the text, for example, to normalize it or to remove extraneous characters. + +## Example usage + +The following example request creates a new index named `my_index` and configures an analyzer with a `keyword` tokenizer: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_keyword_analyzer": { + "type": "custom", + "tokenizer": "keyword" + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "my_keyword_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_keyword_analyzer", + "text": "OpenSearch Example" +} +``` +{% include copy-curl.html %} + +The response contains the single token representing the original text: + +```json +{ + "tokens": [ + { + "token": "OpenSearch Example", + "start_offset": 0, + "end_offset": 18, + "type": "word", + "position": 0 + } + ] +} +``` + +## Parameters + +The `keyword` token filter can be configured with the following parameter. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`buffer_size`| Optional | Integer | Determines the character buffer size. Default is `256`. There is usually no need to change this setting. + +## Combining the keyword tokenizer with token filters + +To enhance the functionality of the `keyword` tokenizer, you can combine it with token filters. Token filters can transform the text, such as converting it to lowercase or removing unwanted characters. + +### Example: Using the pattern_replace filter and keyword tokenizer + +In this example, the `pattern_replace` filter uses a regular expression to replace all non-alphanumeric characters with an empty string: + +```json +POST _analyze +{ + "tokenizer": "keyword", + "filter": [ + { + "type": "pattern_replace", + "pattern": "[^a-zA-Z0-9]", + "replacement": "" + } + ], + "text": "Product#1234-XYZ" +} +``` +{% include copy-curl.html %} + +The `pattern_replace` filter removes non-alphanumeric characters and returns the following token: + +```json +{ + "tokens": [ + { + "token": "Product1234XYZ", + "start_offset": 0, + "end_offset": 16, + "type": "word", + "position": 0 + } + ] +} +``` + diff --git a/_analyzers/tokenizers/letter.md b/_analyzers/tokenizers/letter.md new file mode 100644 index 00000000000..ba67a7841d9 --- /dev/null +++ b/_analyzers/tokenizers/letter.md @@ -0,0 +1,97 @@ +--- +layout: default +title: Letter +parent: Tokenizers +nav_order: 60 +--- + +# Letter tokenizer + +The `letter` tokenizer splits text into words on any non-letter characters. It works well with many European languages but is ineffective with some Asian languages in which words aren't separated by spaces. 
+ +## Example usage + +The following example request creates a new index named `my_index` and configures an analyzer with a `letter` tokenizer: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_letter_analyzer": { + "type": "custom", + "tokenizer": "letter" + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "my_letter_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST _analyze +{ + "tokenizer": "letter", + "text": "Cats 4EVER love chasing butterflies!" +} + +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "Cats", + "start_offset": 0, + "end_offset": 4, + "type": "word", + "position": 0 + }, + { + "token": "EVER", + "start_offset": 6, + "end_offset": 10, + "type": "word", + "position": 1 + }, + { + "token": "love", + "start_offset": 11, + "end_offset": 15, + "type": "word", + "position": 2 + }, + { + "token": "chasing", + "start_offset": 16, + "end_offset": 23, + "type": "word", + "position": 3 + }, + { + "token": "butterflies", + "start_offset": 24, + "end_offset": 35, + "type": "word", + "position": 4 + } + ] +} +``` diff --git a/_analyzers/tokenizers/lowercase.md b/_analyzers/tokenizers/lowercase.md new file mode 100644 index 00000000000..5542ecbf506 --- /dev/null +++ b/_analyzers/tokenizers/lowercase.md @@ -0,0 +1,93 @@ +--- +layout: default +title: Lowercase +parent: Tokenizers +nav_order: 70 +--- + +# Lowercase tokenizer + +The `lowercase` tokenizer breaks text into terms at white space and then lowercases all the terms. Functionally, this is identical to configuring a `letter` tokenizer with a `lowercase` token filter. However, using a `lowercase` tokenizer is more efficient because the tokenizer actions are performed in a single step. + +## Example usage + +The following example request creates a new index named `my-lowercase-index` and configures an analyzer with a `lowercase` tokenizer: + +```json +PUT /my-lowercase-index +{ + "settings": { + "analysis": { + "tokenizer": { + "my_lowercase_tokenizer": { + "type": "lowercase" + } + }, + "analyzer": { + "my_lowercase_analyzer": { + "type": "custom", + "tokenizer": "my_lowercase_tokenizer" + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my-lowercase-index/_analyze +{ + "analyzer": "my_lowercase_analyzer", + "text": "This is a Test. OpenSearch 123!" 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "this", + "start_offset": 0, + "end_offset": 4, + "type": "word", + "position": 0 + }, + { + "token": "is", + "start_offset": 5, + "end_offset": 7, + "type": "word", + "position": 1 + }, + { + "token": "a", + "start_offset": 8, + "end_offset": 9, + "type": "word", + "position": 2 + }, + { + "token": "test", + "start_offset": 10, + "end_offset": 14, + "type": "word", + "position": 3 + }, + { + "token": "opensearch", + "start_offset": 16, + "end_offset": 26, + "type": "word", + "position": 4 + } + ] +} +``` diff --git a/_analyzers/tokenizers/ngram.md b/_analyzers/tokenizers/ngram.md new file mode 100644 index 00000000000..08ac4562674 --- /dev/null +++ b/_analyzers/tokenizers/ngram.md @@ -0,0 +1,111 @@ +--- +layout: default +title: N-gram +parent: Tokenizers +nav_order: 80 +--- + +# N-gram tokenizer + +The `ngram` tokenizer splits text into overlapping n-grams (sequences of characters) of a specified length. This tokenizer is particularly useful when you want to perform partial word matching or autocomplete search functionality because it generates substrings (character n-grams) of the original input text. + +## Example usage + +The following example request creates a new index named `my_index` and configures an analyzer with an `ngram` tokenizer: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "tokenizer": { + "my_ngram_tokenizer": { + "type": "ngram", + "min_gram": 3, + "max_gram": 4, + "token_chars": ["letter", "digit"] + } + }, + "analyzer": { + "my_ngram_analyzer": { + "type": "custom", + "tokenizer": "my_ngram_tokenizer" + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_ngram_analyzer", + "text": "OpenSearch" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "Sea","start_offset": 0,"end_offset": 3,"type": "word","position": 0}, + {"token": "Sear","start_offset": 0,"end_offset": 4,"type": "word","position": 1}, + {"token": "ear","start_offset": 1,"end_offset": 4,"type": "word","position": 2}, + {"token": "earc","start_offset": 1,"end_offset": 5,"type": "word","position": 3}, + {"token": "arc","start_offset": 2,"end_offset": 5,"type": "word","position": 4}, + {"token": "arch","start_offset": 2,"end_offset": 6,"type": "word","position": 5}, + {"token": "rch","start_offset": 3,"end_offset": 6,"type": "word","position": 6} + ] +} +``` + +## Parameters + +The `ngram` tokenizer can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`min_gram` | Optional | Integer | The minimum length of the n-grams. Default is `1`. +`max_gram` | Optional | Integer | The maximum length of the n-grams. Default is `2`. +`token_chars` | Optional | List of strings | The character classes to be included in tokenization. Valid values are:
- `letter`
- `digit`
- `whitespace`
- `punctuation`
- `symbol`
- `custom` (You must also specify the `custom_token_chars` parameter)
Default is an empty list (`[]`), which retains all the characters. +`custom_token_chars` | Optional | String | Custom characters to be included in the tokens. + +### Maximum difference between `min_gram` and `max_gram` + +The maximum difference between `min_gram` and `max_gram` is configured using the index-level `index.max_ngram_diff` setting and defaults to `1`. + +The following example request creates an index with a custom `index.max_ngram_diff` setting: + +```json +PUT /my-index +{ + "settings": { + "index.max_ngram_diff": 2, + "analysis": { + "tokenizer": { + "my_ngram_tokenizer": { + "type": "ngram", + "min_gram": 3, + "max_gram": 5, + "token_chars": ["letter", "digit"] + } + }, + "analyzer": { + "my_ngram_analyzer": { + "type": "custom", + "tokenizer": "my_ngram_tokenizer" + } + } + } + } +} +``` +{% include copy-curl.html %} diff --git a/_analyzers/tokenizers/path-hierarchy.md b/_analyzers/tokenizers/path-hierarchy.md new file mode 100644 index 00000000000..a6609f30cd8 --- /dev/null +++ b/_analyzers/tokenizers/path-hierarchy.md @@ -0,0 +1,182 @@ +--- +layout: default +title: Path hierarchy +parent: Tokenizers +nav_order: 90 +--- + +# Path hierarchy tokenizer + +The `path_hierarchy` tokenizer tokenizes file-system-like paths (or similar hierarchical structures) by breaking them down into tokens at each hierarchy level. This tokenizer is particularly useful when working with hierarchical data such as file paths, URLs, or any other delimited paths. + +## Example usage + +The following example request creates a new index named `my_index` and configures an analyzer with a `path_hierarchy` tokenizer: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "tokenizer": { + "my_path_tokenizer": { + "type": "path_hierarchy" + } + }, + "analyzer": { + "my_path_analyzer": { + "type": "custom", + "tokenizer": "my_path_tokenizer" + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_path_analyzer", + "text": "/users/john/documents/report.txt" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "/users", + "start_offset": 0, + "end_offset": 6, + "type": "word", + "position": 0 + }, + { + "token": "/users/john", + "start_offset": 0, + "end_offset": 11, + "type": "word", + "position": 0 + }, + { + "token": "/users/john/documents", + "start_offset": 0, + "end_offset": 21, + "type": "word", + "position": 0 + }, + { + "token": "/users/john/documents/report.txt", + "start_offset": 0, + "end_offset": 32, + "type": "word", + "position": 0 + } + ] +} +``` + +## Parameters + +The `path_hierarchy` tokenizer can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`delimiter` | Optional | String | Specifies the character used to separate path components. Default is `/`. +`replacement` | Optional | String | Configures the character used to replace the delimiter in the tokens. Default is `/`. +`buffer_size` | Optional | Integer | Specifies the buffer size. Default is `1024`. +`reverse` | Optional | Boolean | If `true`, generates tokens in reverse order. Default is `false`. +`skip` | Optional | Integer | Specifies the number of initial tokens (levels) to skip when tokenizing. Default is `0`. 
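+
+The `reverse` parameter is not covered by the following example, so here is a minimal sketch (the index, tokenizer, and analyzer names are illustrative) that enables it. With `reverse` set to `true`, the tokenizer emits the full path and progressively shorter suffixes, which is useful for matching a file name together with its parent directories:
+
+```json
+PUT /my_reverse_path_index
+{
+  "settings": {
+    "analysis": {
+      "tokenizer": {
+        "my_reverse_path_tokenizer": {
+          "type": "path_hierarchy",
+          "reverse": true
+        }
+      },
+      "analyzer": {
+        "my_reverse_path_analyzer": {
+          "type": "custom",
+          "tokenizer": "my_reverse_path_tokenizer"
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}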
+ +## Example using delimiter and replacement parameters + +The following example request configures custom `delimiter` and `replacement` parameters: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "tokenizer": { + "my_path_tokenizer": { + "type": "path_hierarchy", + "delimiter": "\\", + "replacement": "\\" + } + }, + "analyzer": { + "my_path_analyzer": { + "type": "custom", + "tokenizer": "my_path_tokenizer" + } + } + } + } +} +``` +{% include copy-curl.html %} + + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_path_analyzer", + "text": "C:\\users\\john\\documents\\report.txt" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "C:", + "start_offset": 0, + "end_offset": 2, + "type": "word", + "position": 0 + }, + { + "token": """C:\users""", + "start_offset": 0, + "end_offset": 8, + "type": "word", + "position": 0 + }, + { + "token": """C:\users\john""", + "start_offset": 0, + "end_offset": 13, + "type": "word", + "position": 0 + }, + { + "token": """C:\users\john\documents""", + "start_offset": 0, + "end_offset": 23, + "type": "word", + "position": 0 + }, + { + "token": """C:\users\john\documents\report.txt""", + "start_offset": 0, + "end_offset": 34, + "type": "word", + "position": 0 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/tokenizers/pattern.md b/_analyzers/tokenizers/pattern.md new file mode 100644 index 00000000000..036dd9050fa --- /dev/null +++ b/_analyzers/tokenizers/pattern.md @@ -0,0 +1,167 @@ +--- +layout: default +title: Pattern +parent: Tokenizers +nav_order: 100 +--- + +# Pattern tokenizer + +The `pattern` tokenizer is a highly flexible tokenizer that allows you to split text into tokens based on a custom Java regular expression. Unlike the `simple_pattern` and `simple_pattern_split` tokenizers, which use Lucene regular expressions, the `pattern` tokenizer can handle more complex and detailed regex patterns, offering greater control over how the text is tokenized. + +## Example usage + +The following example request creates a new index named `my_index` and configures an analyzer with a `pattern` tokenizer. 
The tokenizer splits text on `-`, `_`, or `.` characters: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "tokenizer": { + "my_pattern_tokenizer": { + "type": "pattern", + "pattern": "[-_.]" + } + }, + "analyzer": { + "my_pattern_analyzer": { + "type": "custom", + "tokenizer": "my_pattern_tokenizer" + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "my_pattern_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_pattern_analyzer", + "text": "OpenSearch-2024_v1.2" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "OpenSearch", + "start_offset": 0, + "end_offset": 10, + "type": "word", + "position": 0 + }, + { + "token": "2024", + "start_offset": 11, + "end_offset": 15, + "type": "word", + "position": 1 + }, + { + "token": "v1", + "start_offset": 16, + "end_offset": 18, + "type": "word", + "position": 2 + }, + { + "token": "2", + "start_offset": 19, + "end_offset": 20, + "type": "word", + "position": 3 + } + ] +} +``` + +## Parameters + +The `pattern` tokenizer can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`pattern` | Optional | String | The pattern used to split text into tokens, specified using a [Java regular expression](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). Default is `\W+`. +`flags` | Optional | String | Configures pipe-separated [flags](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#field.summary) to apply to the regular expression, for example, `"CASE_INSENSITIVE|MULTILINE|DOTALL"`. +`group` | Optional | Integer | Specifies the capture group to be used as a token. Default is `-1` (split on a match). + +## Example using a group parameter + +The following example request configures a `group` parameter that captures only the second group: + +```json +PUT /my_index_group2 +{ + "settings": { + "analysis": { + "tokenizer": { + "my_pattern_tokenizer": { + "type": "pattern", + "pattern": "([a-zA-Z]+)(\\d+)", + "group": 2 + } + }, + "analyzer": { + "my_pattern_analyzer": { + "type": "custom", + "tokenizer": "my_pattern_tokenizer" + } + } + } + } +} +``` +{% include copy-curl.html %} + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index_group2/_analyze +{ + "analyzer": "my_pattern_analyzer", + "text": "abc123def456ghi" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "123", + "start_offset": 3, + "end_offset": 6, + "type": "word", + "position": 0 + }, + { + "token": "456", + "start_offset": 9, + "end_offset": 12, + "type": "word", + "position": 1 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/tokenizers/simple-pattern-split.md b/_analyzers/tokenizers/simple-pattern-split.md new file mode 100644 index 00000000000..25367f25b54 --- /dev/null +++ b/_analyzers/tokenizers/simple-pattern-split.md @@ -0,0 +1,105 @@ +--- +layout: default +title: Simple pattern split +parent: Tokenizers +nav_order: 120 +--- + +# Simple pattern split tokenizer + +The `simple_pattern_split` tokenizer uses a regular expression to split text into tokens. 
The regular expression defines the pattern used to determine where to split the text. Any matching pattern in the text is used as a delimiter, and the text between delimiters becomes a token. Use this tokenizer when you want to define delimiters and tokenize the rest of the text based on a pattern. + +The tokenizer uses the matched parts of the input text (based on the regular expression) only as delimiters or boundaries to split the text into terms. The matched portions are not included in the resulting terms. For example, if the tokenizer is configured to split text at dot characters (`.`) and the input text is `one.two.three`, then the generated terms are `one`, `two`, and `three`. The dot characters themselves are not included in the resulting terms. + +## Example usage + +The following example request creates a new index named `my_index` and configures an analyzer with a `simple_pattern_split` tokenizer. The tokenizer is configured to split text on hyphens: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "tokenizer": { + "my_pattern_split_tokenizer": { + "type": "simple_pattern_split", + "pattern": "-" + } + }, + "analyzer": { + "my_pattern_split_analyzer": { + "type": "custom", + "tokenizer": "my_pattern_split_tokenizer" + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "my_pattern_split_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_pattern_split_analyzer", + "text": "OpenSearch-2024-10-09" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "OpenSearch", + "start_offset": 0, + "end_offset": 10, + "type": "word", + "position": 0 + }, + { + "token": "2024", + "start_offset": 11, + "end_offset": 15, + "type": "word", + "position": 1 + }, + { + "token": "10", + "start_offset": 16, + "end_offset": 18, + "type": "word", + "position": 2 + }, + { + "token": "09", + "start_offset": 19, + "end_offset": 21, + "type": "word", + "position": 3 + } + ] +} +``` + +## Parameters + +The `simple_pattern_split` tokenizer can be configured with the following parameter. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`pattern` | Optional | String | The pattern used to split text into tokens, specified using a [Lucene regular expression](https://lucene.apache.org/core/9_10_0/core/org/apache/lucene/util/automaton/RegExp.html). Default is an empty string, which returns the input text as one token. \ No newline at end of file diff --git a/_analyzers/tokenizers/simple-pattern.md b/_analyzers/tokenizers/simple-pattern.md new file mode 100644 index 00000000000..eacddd69926 --- /dev/null +++ b/_analyzers/tokenizers/simple-pattern.md @@ -0,0 +1,89 @@ +--- +layout: default +title: Simple pattern +parent: Tokenizers +nav_order: 110 +--- + +# Simple pattern tokenizer + +The `simple_pattern` tokenizer identifies matching sequences in text based on a regular expression and uses those sequences as tokens. It extracts terms that match the regular expression. Use this tokenizer when you want to directly extract specific patterns as terms. + +## Example usage + +The following example request creates a new index named `my_index` and configures an analyzer with a `simple_pattern` tokenizer. 
The tokenizer extracts numeric terms from text: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "tokenizer": { + "my_pattern_tokenizer": { + "type": "simple_pattern", + "pattern": "\\d+" + } + }, + "analyzer": { + "my_pattern_analyzer": { + "type": "custom", + "tokenizer": "my_pattern_tokenizer" + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_pattern_analyzer", + "text": "OpenSearch-2024-10-09" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "2024", + "start_offset": 11, + "end_offset": 15, + "type": "word", + "position": 0 + }, + { + "token": "10", + "start_offset": 16, + "end_offset": 18, + "type": "word", + "position": 1 + }, + { + "token": "09", + "start_offset": 19, + "end_offset": 21, + "type": "word", + "position": 2 + } + ] +} +``` + +## Parameters + +The `simple_pattern` tokenizer can be configured with the following parameter. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`pattern` | Optional | String | The pattern used to split text into tokens, specified using a [Lucene regular expression](https://lucene.apache.org/core/9_10_0/core/org/apache/lucene/util/automaton/RegExp.html). Default is an empty string, which returns the input text as one token. + diff --git a/_analyzers/tokenizers/standard.md b/_analyzers/tokenizers/standard.md new file mode 100644 index 00000000000..c10f25802bb --- /dev/null +++ b/_analyzers/tokenizers/standard.md @@ -0,0 +1,111 @@ +--- +layout: default +title: Standard +parent: Tokenizers +nav_order: 130 +--- + +# Standard tokenizer + +The `standard` tokenizer is the default tokenizer in OpenSearch. It tokenizes text based on word boundaries using a grammar-based approach that recognizes letters, digits, and other characters like punctuation. It is highly versatile and suitable for many languages because it uses Unicode text segmentation rules ([UAX#29](https://unicode.org/reports/tr29/)) to break text into tokens. + +## Example usage + +The following example request creates a new index named `my_index` and configures an analyzer with a `standard` tokenizer: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_standard_analyzer": { + "type": "standard" + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "my_standard_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_standard_analyzer", + "text": "OpenSearch is powerful, fast, and scalable." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "opensearch", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0 + }, + { + "token": "is", + "start_offset": 11, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "powerful", + "start_offset": 14, + "end_offset": 22, + "type": "", + "position": 2 + }, + { + "token": "fast", + "start_offset": 24, + "end_offset": 28, + "type": "", + "position": 3 + }, + { + "token": "and", + "start_offset": 30, + "end_offset": 33, + "type": "", + "position": 4 + }, + { + "token": "scalable", + "start_offset": 34, + "end_offset": 42, + "type": "", + "position": 5 + } + ] +} +``` + +## Parameters + +The `standard` tokenizer can be configured with the following parameter. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`max_token_length` | Optional | Integer | Sets the maximum length of the produced token. If this length is exceeded, the token is split into multiple tokens at the length configured in `max_token_length`. Default is `255`. + diff --git a/_analyzers/tokenizers/thai.md b/_analyzers/tokenizers/thai.md new file mode 100644 index 00000000000..4afb14a9ebb --- /dev/null +++ b/_analyzers/tokenizers/thai.md @@ -0,0 +1,108 @@ +--- +layout: default +title: Thai +parent: Tokenizers +nav_order: 140 +--- + +# Thai tokenizer + +The `thai` tokenizer tokenizes Thai language text. Because words in Thai language are not separated by spaces, the tokenizer must identify word boundaries based on language-specific rules. + +## Example usage + +The following example request creates a new index named `thai_index` and configures an analyzer with a `thai` tokenizer: + +```json +PUT /thai_index +{ + "settings": { + "analysis": { + "tokenizer": { + "thai_tokenizer": { + "type": "thai" + } + }, + "analyzer": { + "thai_analyzer": { + "type": "custom", + "tokenizer": "thai_tokenizer" + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "thai_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /thai_index/_analyze +{ + "analyzer": "thai_analyzer", + "text": "ฉันชอบไปเที่ยวที่เชียงใหม่" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "ฉัน", + "start_offset": 0, + "end_offset": 3, + "type": "word", + "position": 0 + }, + { + "token": "ชอบ", + "start_offset": 3, + "end_offset": 6, + "type": "word", + "position": 1 + }, + { + "token": "ไป", + "start_offset": 6, + "end_offset": 8, + "type": "word", + "position": 2 + }, + { + "token": "เที่ยว", + "start_offset": 8, + "end_offset": 14, + "type": "word", + "position": 3 + }, + { + "token": "ที่", + "start_offset": 14, + "end_offset": 17, + "type": "word", + "position": 4 + }, + { + "token": "เชียงใหม่", + "start_offset": 17, + "end_offset": 26, + "type": "word", + "position": 5 + } + ] +} +``` diff --git a/_analyzers/tokenizers/uax-url-email.md b/_analyzers/tokenizers/uax-url-email.md new file mode 100644 index 00000000000..34336a4f554 --- /dev/null +++ b/_analyzers/tokenizers/uax-url-email.md @@ -0,0 +1,84 @@ +--- +layout: default +title: UAX URL email +parent: Tokenizers +nav_order: 150 +--- + +# UAX URL email tokenizer + +In addition to regular text, the `uax_url_email` tokenizer is designed to handle URLs, email addresses, and 
domain names. It is based on the Unicode Text Segmentation algorithm ([UAX #29](https://www.unicode.org/reports/tr29/)), which allows it to correctly tokenize complex text, including URLs and email addresses. + +## Example usage + +The following example request creates a new index named `my_index` and configures an analyzer with a `uax_url_email` tokenizer: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "tokenizer": { + "uax_url_email_tokenizer": { + "type": "uax_url_email" + } + }, + "analyzer": { + "my_uax_analyzer": { + "type": "custom", + "tokenizer": "uax_url_email_tokenizer" + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "my_uax_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_uax_analyzer", + "text": "Contact us at support@example.com or visit https://example.com for details." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "Contact","start_offset": 0,"end_offset": 7,"type": "","position": 0}, + {"token": "us","start_offset": 8,"end_offset": 10,"type": "","position": 1}, + {"token": "at","start_offset": 11,"end_offset": 13,"type": "","position": 2}, + {"token": "support@example.com","start_offset": 14,"end_offset": 33,"type": "","position": 3}, + {"token": "or","start_offset": 34,"end_offset": 36,"type": "","position": 4}, + {"token": "visit","start_offset": 37,"end_offset": 42,"type": "","position": 5}, + {"token": "https://example.com","start_offset": 43,"end_offset": 62,"type": "","position": 6}, + {"token": "for","start_offset": 63,"end_offset": 66,"type": "","position": 7}, + {"token": "details","start_offset": 67,"end_offset": 74,"type": "","position": 8} + ] +} +``` + +## Parameters + +The `uax_url_email` tokenizer can be configured with the following parameter. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`max_token_length` | Optional | Integer | Sets the maximum length of the produced token. If this length is exceeded, the token is split into multiple tokens at the length configured in `max_token_length`. Default is `255`. + diff --git a/_analyzers/tokenizers/whitespace.md b/_analyzers/tokenizers/whitespace.md new file mode 100644 index 00000000000..fb168304a74 --- /dev/null +++ b/_analyzers/tokenizers/whitespace.md @@ -0,0 +1,110 @@ +--- +layout: default +title: Whitespace +parent: Tokenizers +nav_order: 160 +--- + +# Whitespace tokenizer + +The `whitespace` tokenizer splits text on white space characters, such as spaces, tabs, and new lines. It treats each word separated by white space as a token and does not perform any additional analysis or normalization like lowercasing or punctuation removal. 
+ +## Example usage + +The following example request creates a new index named `my_index` and configures an analyzer with a `whitespace` tokenizer: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "tokenizer": { + "whitespace_tokenizer": { + "type": "whitespace" + } + }, + "analyzer": { + "my_whitespace_analyzer": { + "type": "custom", + "tokenizer": "whitespace_tokenizer" + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "my_whitespace_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_whitespace_analyzer", + "text": "OpenSearch is fast! Really fast." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "OpenSearch", + "start_offset": 0, + "end_offset": 10, + "type": "word", + "position": 0 + }, + { + "token": "is", + "start_offset": 11, + "end_offset": 13, + "type": "word", + "position": 1 + }, + { + "token": "fast!", + "start_offset": 14, + "end_offset": 19, + "type": "word", + "position": 2 + }, + { + "token": "Really", + "start_offset": 20, + "end_offset": 26, + "type": "word", + "position": 3 + }, + { + "token": "fast.", + "start_offset": 27, + "end_offset": 32, + "type": "word", + "position": 4 + } + ] +} +``` + +## Parameters + +The `whitespace` tokenizer can be configured with the following parameter. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`max_token_length` | Optional | Integer | Sets the maximum length of the produced token. If this length is exceeded, the token is split into multiple tokens at the length configured in `max_token_length`. Default is `255`. + diff --git a/_api-reference/analyze-apis.md b/_api-reference/analyze-apis.md index 5a63f665d9c..24552c0fbe4 100644 --- a/_api-reference/analyze-apis.md +++ b/_api-reference/analyze-apis.md @@ -1,26 +1,25 @@ --- layout: default title: Analyze API -has_children: true nav_order: 7 redirect_from: - /api-reference/analyze-apis/perform-text-analysis/ - /opensearch/rest-api/analyze-apis/ - - /api-reference/analyze-apis/ + - /api-reference/analyze-apis/terminology/ --- # Analyze API **Introduced 1.0** {: .label .label-purple } -The Analyze API allows you to perform [text analysis]({{site.url}}{{site.baseurl}}/api-reference/analyze-apis/), which is the process of converting unstructured text into individual tokens (usually words) that are optimized for search. +The Analyze API allows you to perform [text analysis]({{site.url}}{{site.baseurl}}/analyzers/), which is the process of converting unstructured text into individual tokens (usually words) that are optimized for search. For more information about common analysis components such as character filters, tokenizers, token filters, and normalizers, see [Analyzers]({{site.url}}{{site.baseurl}}/analyzers/#analyzers). The Analyze API analyzes a text string and returns the resulting tokens. If you use the Security plugin, you must have the `manage index` privilege. If you only want to analyze text, you must have the `manage cluster` privilege. 
{: .note} -## Path and HTTP methods +## Endpoints ```json GET /_analyze diff --git a/_api-reference/analyze-apis/terminology.md b/_api-reference/analyze-apis/terminology.md deleted file mode 100644 index 17d26308ae1..00000000000 --- a/_api-reference/analyze-apis/terminology.md +++ /dev/null @@ -1,37 +0,0 @@ ---- -layout: default -title: Analysis API Terminology -parent: Analyze API - -nav_order: 1 ---- - -# Terminology - -The following sections provide descriptions of important text analysis terms. - -## Analyzers - -Analyzers tell OpenSearch how to index and search text. An analyzer is composed of three components: a tokenizer, zero or more token filters, and zero or more character filters. - -OpenSearch provides *built-in* analyzers. For example, the `standard` built-in analyzer converts text to lowercase and breaks text into tokens based on word boundaries such as carriage returns and white space. The `standard` analyzer is also called the *default* analyzer and is used when no analyzer is specified in the text analysis request. - -If needed, you can combine tokenizers, token filters, and character filters to create a *custom* analyzer. - -#### Tokenizers - -Tokenizers break unstructured text into tokens and maintain metadata about tokens, such as their starting and ending positions in the text. - -#### Character filters - -Character filters examine text and perform translations, such as changing, removing, and adding characters. - -#### Token filters - -Token filters modify tokens, performing operations such as converting a token's characters to uppercase and adding or removing tokens. - -## Normalizers - -Similar to analyzers, normalizers tokenize text but return a single token only. Normalizers do not employ tokenizers; they make limited use of character and token filters, such as those that operate on one character at a time. - -By default, OpenSearch does not apply normalizers. To apply normalizers, you must add them to your data before creating an index. \ No newline at end of file diff --git a/_api-reference/cat/cat-aliases.md b/_api-reference/cat/cat-aliases.md index 950d4973513..0c5fb7b15af 100644 --- a/_api-reference/cat/cat-aliases.md +++ b/_api-reference/cat/cat-aliases.md @@ -16,20 +16,41 @@ has_children: false The CAT aliases operation lists the mapping of aliases to indexes, plus routing and filtering information. -## Path and HTTP methods + +## Endpoints ```json -GET _cat/aliases/ -GET _cat/aliases +GET /_cat/aliases +GET /_cat/aliases/{name} ``` -{% include copy-curl.html %} + + + ## Query parameters -Parameter | Type | Description -:--- | :--- | :--- -local | Boolean | Whether to return information from the local node only instead of from the cluster manager node. Default is `false`. -expand_wildcards | Enum | Expands wildcard expressions to concrete indexes. Combine multiple values with commas. Supported values are `all`, `open`, `closed`, `hidden`, and `none`. Default is `open`. +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | Default | +| :--- | :--- | :--- | :--- | +| `expand_wildcards` | List or String | Specifies the type of index that wildcard expressions can match. Supports comma-separated values.
Valid values are:
- `all`: Match any index, including hidden ones.
- `closed`: Match closed, non-hidden indexes.
- `hidden`: Match hidden indexes. Must be combined with `open`, `closed`, or both.<br>
- `none`: Wildcard expressions are not accepted.
- `open`: Match open, non-hidden indexes. | N/A | +| `format` | String | A short version of the `Accept` header, such as `json` or `yaml`. | N/A | +| `h` | List | A comma-separated list of column names to display. | N/A | +| `help` | Boolean | Returns help information. | `false` | +| `local` | Boolean | Whether to return information from the local node only instead of from the cluster manager node. | `false` | +| `s` | List | A comma-separated list of column names or column aliases to sort by. | N/A | +| `v` | Boolean | Enables verbose mode, which displays column headers. | `false` | + + + ## Example requests diff --git a/_api-reference/cat/cat-allocation.md b/_api-reference/cat/cat-allocation.md index a57c861a4bf..f0e2859680d 100644 --- a/_api-reference/cat/cat-allocation.md +++ b/_api-reference/cat/cat-allocation.md @@ -15,20 +15,41 @@ has_children: false The CAT allocation operation lists the allocation of disk space for indexes and the number of shards on each node. -## Path and HTTP methods + +## Endpoints ```json -GET _cat/allocation?v -GET _cat/allocation/ +GET /_cat/allocation +GET /_cat/allocation/{node_id} ``` + + + ## Query parameters -Parameter | Type | Description -:--- | :--- | :--- -bytes | Byte size | Specify the units for byte size. For example, `7kb` or `6gb`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). -local | Boolean | Whether to return information from the local node only instead of from the cluster manager node. Default is `false`. -cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | Default | +| :--- | :--- | :--- | :--- | +| `bytes` | String | The units used to display byte values.
Valid values are: `b`, `kb`, `k`, `mb`, `m`, `gb`, `g`, `tb`, `t`, `pb`, and `p`. | N/A | +| `cluster_manager_timeout` | String | A timeout for connection to the cluster manager node. | N/A | +| `format` | String | A short version of the HTTP `Accept` header, such as `json` or `yaml`. | N/A | +| `h` | List | A comma-separated list of column names to display. | N/A | +| `help` | Boolean | Returns help information. | `false` | +| `local` | Boolean | Returns local information but does not retrieve the state from cluster-manager node. | `false` | +| `s` | List | A comma-separated list of column names or column aliases to sort by. | N/A | +| `v` | Boolean | Enables verbose mode, which displays column headers. | `false` | + + ## Example requests diff --git a/_api-reference/cat/cat-cluster_manager.md b/_api-reference/cat/cat-cluster_manager.md index 1b75074e12c..cded9431ac2 100644 --- a/_api-reference/cat/cat-cluster_manager.md +++ b/_api-reference/cat/cat-cluster_manager.md @@ -15,19 +15,41 @@ has_children: false The CAT cluster manager operation lists information that helps identify the elected cluster manager node. -## Path and HTTP methods - + +## Endpoints ```json -GET _cat/cluster_manager +GET /_cat/cluster_manager ``` + + + ## Query parameters -Parameter | Type | Description -:--- | :--- | :--- -cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | Default | +| :--- | :--- | :--- | :--- | +| `cluster_manager_timeout` | String | A timeout for connection to the cluster manager node. | N/A | +| `format` | String | A short version of the HTTP `Accept` header, such as `json` or `yaml`. | N/A | +| `h` | List | A comma-separated list of column names to display. | N/A | +| `help` | Boolean | Returns help information. | `false` | +| `local` | Boolean | Returns local information but does not retrieve the state from the cluster manager node. | `false` | +| `s` | List | A comma-separated list of column names or column aliases to sort by. | N/A | +| `v` | Boolean | Enables verbose mode, which displays column headers. | `false` | -## Example requests + + + +## Example request ``` GET _cat/cluster_manager?v @@ -39,4 +61,4 @@ GET _cat/cluster_manager?v ```json id | host | ip | node ZaIkkUd4TEiAihqJGkp5CA | 172.18.0.3 | 172.18.0.3 | opensearch-node2 -``` +``` \ No newline at end of file diff --git a/_api-reference/cat/cat-count.md b/_api-reference/cat/cat-count.md index 94a422d0612..3c2c4913159 100644 --- a/_api-reference/cat/cat-count.md +++ b/_api-reference/cat/cat-count.md @@ -16,12 +16,37 @@ redirect_from: The CAT count operation lists the number of documents in your cluster. -## Path and HTTP methods - + +## Endpoints ```json -GET _cat/count?v -GET _cat/count/?v +GET /_cat/count +GET /_cat/count/{index} ``` + + + + +## Query parameters + +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | Default | +| :--- | :--- | :--- | :--- | +| `format` | String | A short version of the `Accept` header, such as `json` or `yaml`. | N/A | +| `h` | List | A comma-separated list of column names to display. | N/A | +| `help` | Boolean | Returns help information. | `false` | +| `s` | List | A comma-separated list of column names or column aliases to sort by. 
| N/A | +| `v` | Boolean | Enables verbose mode, which displays column headers. | `false` | + + ## Example requests diff --git a/_api-reference/cat/cat-field-data.md b/_api-reference/cat/cat-field-data.md index 3012bbbfe90..1d16495d3e2 100644 --- a/_api-reference/cat/cat-field-data.md +++ b/_api-reference/cat/cat-field-data.md @@ -14,19 +14,39 @@ redirect_from: The CAT Field Data operation lists the memory size used by each field per node. - -## Path and HTTP methods - + +## Endpoints ```json -GET _cat/fielddata?v -GET _cat/fielddata/?v +GET /_cat/fielddata +GET /_cat/fielddata/{fields} ``` + + + ## Query parameters -Parameter | Type | Description -:--- | :--- | :--- -bytes | Byte size | Specify the units for byte size. For example, `7kb` or `6gb`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | Default | +| :--- | :--- | :--- | :--- | +| `bytes` | String | The units used to display byte values.
Valid values are: `b`, `kb`, `k`, `mb`, `m`, `gb`, `g`, `tb`, `t`, `pb`, and `p`. | N/A | +| `fields` | List or String | A comma-separated list of fields used to limit the amount of returned information. | N/A | +| `format` | String | A short version of the `Accept` header, such as `json` or `yaml`. | N/A | +| `h` | List | A comma-separated list of column names to display. | N/A | +| `help` | Boolean | Returns help information. | `false` | +| `s` | List | A comma-separated list of column names or column aliases to sort by. | N/A | +| `v` | Boolean | Enables verbose mode, which displays column headers. | `false` | + + ## Example requests diff --git a/_api-reference/cat/cat-health.md b/_api-reference/cat/cat-health.md index 0e4b7846933..f81701ddba3 100644 --- a/_api-reference/cat/cat-health.md +++ b/_api-reference/cat/cat-health.md @@ -2,7 +2,6 @@ layout: default title: CAT health parent: CAT API - nav_order: 20 has_children: false redirect_from: @@ -16,18 +15,38 @@ redirect_from: The CAT health operation lists the status of the cluster, how long the cluster has been up, the number of nodes, and other useful information that helps you analyze the health of your cluster. -## Path and HTTP methods - + +## Endpoints ```json -GET _cat/health?v +GET /_cat/health ``` + + + ## Query parameters -Parameter | Type | Description -:--- | :--- | :--- -time | Time | Specify the units for time. For example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). -ts | Boolean | If true, returns HH:MM:SS and Unix epoch timestamps. Default is `true`. +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | Default | +| :--- | :--- | :--- | :--- | +| `format` | String | A short version of the `Accept` header, such as `json` or `yaml`. | N/A | +| `h` | List | A comma-separated list of column names to display. | N/A | +| `help` | Boolean | Returns help information. | `false` | +| `s` | List | A comma-separated list of column names or column aliases to sort by. | N/A | +| `time` | String | The unit used to display time values.
Valid values are: `nanos`, `micros`, `ms`, `s`, `m`, `h`, and `d`. | N/A | +| `ts` | Boolean | When `true`, returns `HH:MM:SS` and Unix epoch timestamps. | `true` | +| `v` | Boolean | Enables verbose mode, which displays column headers. | `false` | + + ## Example request diff --git a/_api-reference/cat/cat-indices.md b/_api-reference/cat/cat-indices.md index 4bbdde573cb..dfb416ae0ee 100644 --- a/_api-reference/cat/cat-indices.md +++ b/_api-reference/cat/cat-indices.md @@ -15,24 +15,45 @@ redirect_from: The CAT indices operation lists information related to indexes, that is, how much disk space they are using, how many shards they have, their health status, and so on. -## Path and HTTP methods - + +## Endpoints ```json -GET _cat/indices/ -GET _cat/indices +GET /_cat/indices +GET /_cat/indices/{index} ``` + + + ## Query parameters -Parameter | Type | Description -:--- | :--- | :--- -bytes | Byte size | Specify the units for byte size. For example, `7kb` or `6gb`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). -health | String | Limit indexes based on their health status. Supported values are `green`, `yellow`, and `red`. -include_unloaded_segments | Boolean | Whether to include information from segments not loaded into memory. Default is `false`. -cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. -pri | Boolean | Whether to return information only from the primary shards. Default is `false`. -time | Time | Specify the units for time. For example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). -expand_wildcards | Enum | Expands wildcard expressions to concrete indexes. Combine multiple values with commas. Supported values are `all`, `open`, `closed`, `hidden`, and `none`. Default is `open`. +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | Default | +| :--- | :--- | :--- | :--- | +| `bytes` | String | The units used to display byte values.
Valid values are: `b`, `kb`, `k`, `mb`, `m`, `gb`, `g`, `tb`, `t`, `pb`, and `p`. | N/A | +| `cluster_manager_timeout` | String | The amount of time allowed to establish a connection to the cluster manager node. | N/A | +| `expand_wildcards` | List or String | Specifies the type of index that wildcard expressions can match. Supports comma-separated values.
Valid values are:
- `all`: Match any index, including hidden ones.
- `closed`: Match closed, non-hidden indexes.
- `hidden`: Match hidden indexes. Must be combined with `open`, `closed`, or both.<br>
- `none`: Wildcard expressions are not accepted.
- `open`: Match open, non-hidden indexes. | N/A | +| `format` | String | A short version of the `Accept` header, such as `json` or `yaml`. | N/A | +| `h` | List | A comma-separated list of column names to display. | N/A | +| `health` | String | Limits indexes based on their health status. Supported values are `green`, `yellow`, and `red`.
Valid values are: `green`, `GREEN`, `yellow`, `YELLOW`, `red`, and `RED`. | N/A | +| `help` | Boolean | Returns help information. | `false` | +| `include_unloaded_segments` | Boolean | Whether to include information from segments not loaded into memory. | `false` | +| `local` | Boolean | Returns local information but does not retrieve the state from the cluster manager node. | `false` | +| `pri` | Boolean | When `true`, returns information only from the primary shards. | `false` | +| `s` | List | A comma-separated list of column names or column aliases to sort by. | N/A | +| `time` | String | Specifies the time units.
Valid values are: `nanos`, `micros`, `ms`, `s`, `m`, `h`, and `d`. | N/A | +| `v` | Boolean | Enables verbose mode, which displays column headers. | `false` | + + ## Example requests diff --git a/_api-reference/cat/cat-nodeattrs.md b/_api-reference/cat/cat-nodeattrs.md index 62471f3960c..e7f86ddc52f 100644 --- a/_api-reference/cat/cat-nodeattrs.md +++ b/_api-reference/cat/cat-nodeattrs.md @@ -15,18 +15,38 @@ redirect_from: The CAT nodeattrs operation lists the attributes of custom nodes. -## Path and HTTP methods - + +## Endpoints ```json -GET _cat/nodeattrs +GET /_cat/nodeattrs ``` + + + ## Query parameters -Parameter | Type | Description -:--- | :--- | :--- -local | Boolean | Whether to return information from the local node only instead of from the cluster manager node. Default is `false`. -cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | Default | +| :--- | :--- | :--- | :--- | +| `cluster_manager_timeout` | String | The amount of time allowed to establish a connection to the cluster manager node. | N/A | +| `format` | String | A short version of the `Accept` header, such as `json` or `yaml`. | N/A | +| `h` | List | A comma-separated list of column names to display. | N/A | +| `help` | Boolean | Returns help information. | `false` | +| `local` | Boolean | Returns local information but does not retrieve the state from the cluster manager node. | `false` | +| `s` | List | A comma-separated list of column names or column aliases to sort by. | N/A | +| `v` | Boolean | Enables verbose mode, which displays column headers. | `false` | + + ## Example request diff --git a/_api-reference/cat/cat-nodes.md b/_api-reference/cat/cat-nodes.md index d20393a2518..ca0cf3e7d5e 100644 --- a/_api-reference/cat/cat-nodes.md +++ b/_api-reference/cat/cat-nodes.md @@ -17,25 +17,40 @@ The CAT nodes operation lists node-level information, including node roles and l A few important node metrics are `pid`, `name`, `cluster_manager`, `ip`, `port`, `version`, `build`, `jdk`, along with `disk`, `heap`, `ram`, and `file_desc`. -## Path and HTTP methods - + +## Endpoints ```json -GET _cat/nodes +GET /_cat/nodes ``` + -## Query parameters - -All CAT nodes URL parameters are optional. -In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: + +## Query parameters -Parameter | Type | Description -:--- | :--- | :--- -bytes | Byte size | Specify the units for byte size. For example, `7kb` or `6gb`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). -full_id | Boolean | If true, return the full node ID. If false, return the shortened node ID. Defaults to false. -cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. -time | Time | Specify the units for time. For example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). -include_unloaded_segments | Boolean | Whether to include information from segments not loaded into memory. Default is `false`. +The following table lists the available query parameters. All query parameters are optional. 
+ +| Parameter | Data type | Description | Default | +| :--- | :--- | :--- | :--- | +| `bytes` | String | The units used to display byte values.
Valid values are: `b`, `kb`, `k`, `mb`, `m`, `gb`, `g`, `tb`, `t`, `pb`, and `p`. | N/A | +| `cluster_manager_timeout` | String | The amount of time allowed to establish a connection to the cluster manager node. | N/A | +| `format` | String | A short version of the `Accept` header, such as `json` or `yaml`. | N/A | +| `full_id` | Boolean or String | When `true`, returns the full node ID. When `false`, returns the shortened node ID. | `false` | +| `h` | List | A comma-separated list of column names to display. | N/A | +| `help` | Boolean | Returns help information. | `false` | +| `s` | List | A comma-separated list of column names or column aliases to sort by. | N/A | +| `time` | String | Specifies the time units, for example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/api-reference/units/).
Valid values are: `nanos`, `micros`, `ms`, `s`, `m`, `h`, and `d`. | N/A | +| `v` | Boolean | Enables verbose mode, which displays column headers. | `false` | + + ## Example request diff --git a/_api-reference/cat/cat-pending-tasks.md b/_api-reference/cat/cat-pending-tasks.md index b047dd5d62b..6b2674b4daf 100644 --- a/_api-reference/cat/cat-pending-tasks.md +++ b/_api-reference/cat/cat-pending-tasks.md @@ -2,7 +2,6 @@ layout: default title: CAT pending tasks parent: CAT API - nav_order: 45 has_children: false redirect_from: @@ -16,19 +15,39 @@ redirect_from: The CAT pending tasks operation lists the progress of all pending tasks, including task priority and time in queue. -## Path and HTTP methods - + +## Endpoints ```json -GET _cat/pending_tasks +GET /_cat/pending_tasks ``` + + + ## Query parameters -Parameter | Type | Description -:--- | :--- | :--- -local | Boolean | Whether to return information from the local node only instead of from the cluster manager node. Default is `false`. -cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. -time | Time | Specify the units for time. For example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | Default | +| :--- | :--- | :--- | :--- | +| `cluster_manager_timeout` | String | The amount of time allowed to establish a connection to the cluster manager node. | N/A | +| `format` | String | A short version of the `Accept` header, such as `json` or `yaml`. | N/A | +| `h` | List | A comma-separated list of column names to display. | N/A | +| `help` | Boolean | Returns help information. | `false` | +| `local` | Boolean | Returns local information but does not retrieve the state from the cluster manager node. | `false` | +| `s` | List | A comma-separated list of column names or column aliases to sort by. | N/A | +| `time` | String | Specifies the time units, for example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/api-reference/units/).
Valid values are: `nanos`, `micros`, `ms`, `s`, `m`, `h`, and `d`. | N/A | +| `v` | Boolean | Enables verbose mode, which displays column headers. | `false` | + + ## Example request diff --git a/_api-reference/cat/cat-pit-segments.md b/_api-reference/cat/cat-pit-segments.md new file mode 100644 index 00000000000..e4730de55fd --- /dev/null +++ b/_api-reference/cat/cat-pit-segments.md @@ -0,0 +1,82 @@ +--- +layout: default +title: CAT PIT segments +parent: CAT API +nav_order: 46 +--- + +# CAT PIT segments + +The CAT point-in-time (PIT) segments operation returns information about one or more PIT segments. + +## Endpoints + + +```json +GET /_cat/pit_segments +``` + + + +```json +GET /_cat/pit_segments/_all +``` + + + +## Query parameters + +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | Default | +| :--- | :--- | :--- | :--- | +| `bytes` | String | The units used to display byte values.
Valid values are: `b`, `kb`, `k`, `mb`, `m`, `gb`, `g`, `tb`, `t`, `pb`, and `p`. | N/A | +| `format` | String | A short version of the `Accept` header, such as `json` or `yaml`. | N/A | +| `h` | List | A comma-separated list of column names to display. | N/A | +| `help` | Boolean | Returns help information. | `false` | +| `s` | List | A comma-separated list of column names or column aliases to sort by. | N/A | +| `v` | Boolean | Enables verbose mode, which displays column headers. | `false` | + + + +## Request body fields + +Field | Data type | Description +:--- | :--- | :--- +pit_id | [Base64 encoded binary]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/binary/) or an array of binaries | The PIT IDs of the PITs whose segments are to be listed. Required. + +## Example request + +```json +GET /_cat/pit_segments +{ + "pit_id": [ + "o463QQEPbXktaW5kZXgtMDAwMDAxFkhGN09fMVlPUkVPLXh6MUExZ1hpaEEAFjBGbmVEZHdGU1EtaFhhUFc4ZkR5cWcAAAAAAAAAAAEWaXBPNVJtZEhTZDZXTWFFR05waXdWZwEWSEY3T18xWU9SRU8teHoxQTFnWGloQQAA", + "o463QQEPbXktaW5kZXgtMDAwMDAxFkhGN09fMVlPUkVPLXh6MUExZ1hpaEEAFjBGbmVEZHdGU1EtaFhhUFc4ZkR5cWcAAAAAAAAAAAIWaXBPNVJtZEhTZDZXTWFFR05waXdWZwEWSEY3T18xWU9SRU8teHoxQTFnWGloQQAA" + ] +} +``` +{% include copy.html %} + +## Example response + +```json +index shard prirep ip segment generation docs.count docs.deleted size size.memory committed searchable version compound +index1 0 r 10.212.36.190 _0 0 4 0 3.8kb 1364 false true 8.8.2 true +index1 1 p 10.212.36.190 _0 0 3 0 3.7kb 1364 false true 8.8.2 true +index1 2 r 10.212.74.139 _0 0 2 0 3.6kb 1364 false true 8.8.2 true +``` + diff --git a/_api-reference/cat/cat-plugins.md b/_api-reference/cat/cat-plugins.md index 45866e8ebda..38c9b82e7cb 100644 --- a/_api-reference/cat/cat-plugins.md +++ b/_api-reference/cat/cat-plugins.md @@ -2,7 +2,6 @@ layout: default title: CAT plugins parent: CAT API - nav_order: 50 has_children: false redirect_from: @@ -15,25 +14,40 @@ redirect_from: The CAT plugins operation lists the names, components, and versions of the installed plugins. - -## Path and HTTP methods - + +## Endpoints ```json -GET _cat/plugins +GET /_cat/plugins ``` + + + ## Query parameters -All parameters are optional. +The following table lists the available query parameters. All query parameters are optional. -In addition to the [common parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: +| Parameter | Data type | Description | Default | +| :--- | :--- | :--- | :--- | +| `cluster_manager_timeout` | String | The amount of time allowed to establish a connection to the cluster manager node. | N/A | +| `format` | String | A short version of the `Accept` header, such as `json` or `yaml`. | N/A | +| `h` | List | A comma-separated list of column names to display. | N/A | +| `help` | Boolean | Returns help information. | `false` | +| `local` | Boolean | Returns local information but does not retrieve the state from the cluster manager node. | `false` | +| `s` | List | A comma-separated list of column names or column aliases to sort by. | N/A | +| `v` | Boolean | Enables verbose mode, which displays column headers. | `false` | -Parameter | Type | Description -:--- | :--- | :--- -local | Boolean | Whether to return information from the local node only instead of from the cluster manager node. Default is `false`. -cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. 
+ -## Example requests +## Example request The following example request lists all installed plugins: diff --git a/_api-reference/cat/cat-recovery.md b/_api-reference/cat/cat-recovery.md index fc29e14ac65..b6ab40a0883 100644 --- a/_api-reference/cat/cat-recovery.md +++ b/_api-reference/cat/cat-recovery.md @@ -2,7 +2,6 @@ layout: default title: CAT recovery parent: CAT API - nav_order: 50 has_children: false redirect_from: @@ -16,20 +15,42 @@ redirect_from: The CAT recovery operation lists all completed and ongoing index and shard recoveries. -## Path and HTTP methods - + +## Endpoints ```json -GET _cat/recovery +GET /_cat/recovery +GET /_cat/recovery/{index} ``` + + + ## Query parameters -Parameter | Type | Description -:--- | :--- | :--- -active_only | Boolean | Whether to only include ongoing shard recoveries. Default is `false`. -bytes | Byte size | Specify the units for byte size. For example, `7kb` or `6gb`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). -detailed | Boolean | Whether to include detailed information about shard recoveries. Default is `false`. -time | Time | Specify the units for time. For example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | Default | +| :--- | :--- | :--- | :--- | +| `active_only` | Boolean | If `true`, the response only includes ongoing shard recoveries. | `false` | +| `bytes` | String | The units used to display byte values.
Valid values are: `b`, `kb`, `k`, `mb`, `m`, `gb`, `g`, `tb`, `t`, `pb`, and `p`. | N/A | +| `detailed` | Boolean | When `true`, includes detailed information about shard recoveries. | `false` | +| `format` | String | A short version of the `Accept` header, such as `json` or `yaml`. | N/A | +| `h` | List | A comma-separated list of column names to display. | N/A | +| `help` | Boolean | Returns help information. | `false` | +| `index` | List | A comma-separated list of data streams, indexes, and aliases used to limit the request. Supports wildcards (`*`). To target all data streams and indexes, omit this parameter or use `*` or `_all`. | N/A | +| `s` | List | A comma-separated list of column names or column aliases to sort by. | N/A | +| `time` | String | Specifies the time units, for example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/api-reference/units/).
Valid values are: `nanos`, `micros`, `ms`, `s`, `m`, `h`, and `d`. | N/A | +| `v` | Boolean | Enables verbose mode, which displays column headers. | `false` | + + ## Example requests diff --git a/_api-reference/cat/cat-repositories.md b/_api-reference/cat/cat-repositories.md index c197ee5c6c4..1a0f1f8e98b 100644 --- a/_api-reference/cat/cat-repositories.md +++ b/_api-reference/cat/cat-repositories.md @@ -2,7 +2,6 @@ layout: default title: CAT repositories parent: CAT API - nav_order: 52 has_children: false redirect_from: @@ -15,19 +14,38 @@ redirect_from: The CAT repositories operation lists all snapshot repositories for a cluster. -## Path and HTTP methods - + +## Endpoints ```json -GET _cat/repositories +GET /_cat/repositories ``` + + + ## Query parameters +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | Default | +| :--- | :--- | :--- | :--- | +| `cluster_manager_timeout` | String | The amount of time allowed to establish a connection to the cluster manager node. | N/A | +| `format` | String | A short version of the `Accept` header, such as `json` or `yaml`. | N/A | +| `h` | List | A comma-separated list of column names to display. | N/A | +| `help` | Boolean | Returns help information. | `false` | +| `local` | Boolean | Returns local information but does not retrieve the state from the cluster manager node. | `false` | +| `s` | List | A comma-separated list of column names or column aliases to sort by. | N/A | +| `v` | Boolean | Enables verbose mode, which displays column headers. | `false` | -Parameter | Type | Description -:--- | :--- | :--- -local | Boolean | Whether to return information from the local node only instead of from the cluster manager node. Default is `false`. -cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. + ## Example request diff --git a/_api-reference/cat/cat-segment-replication.md b/_api-reference/cat/cat-segment-replication.md index e943d0a4511..2d4a9929229 100644 --- a/_api-reference/cat/cat-segment-replication.md +++ b/_api-reference/cat/cat-segment-replication.md @@ -15,12 +15,65 @@ The CAT segment replication operation returns information about active and last Call the CAT Segment Replication API only on indexes with segment replication enabled. {: .note} -## Path and HTTP methods - + +## Endpoints ```json GET /_cat/segment_replication -GET /_cat/segment_replication/ +GET /_cat/segment_replication/{index} ``` + + + +## Path parameters + +The following table lists the available path parameters. All path parameters are optional. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `index` | List | A comma-separated list of data streams, indexes, and aliases used to limit the request. Supports wildcards (`*`). To target all data streams and indexes, omit this parameter or use `*` or `_all`. | + + + + + +## Query parameters + +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | Default | +| :--- | :--- | :--- | :--- | +| `active_only` | Boolean | When `true`, the response only includes ongoing segment replication events. | `false` | +| `allow_no_indices` | Boolean | Whether to ignore the index if a wildcard index expression resolves to no concrete indexes. This includes the `_all` string or when no indexes have been specified. | N/A | +| `bytes` | String | The units used to display byte values.
Valid values are: `b`, `kb`, `k`, `mb`, `m`, `gb`, `g`, `tb`, `t`, `pb`, and `p`. | N/A | +| `completed_only` | Boolean | When `true`, the response only includes the last-completed segment replication events. | `false` | +| `detailed` | Boolean | When `true`, the response includes additional metrics for each stage of a segment replication event. | `false` | +| `expand_wildcards` | List or String | Specifies the type of index that wildcard expressions can match. Supports comma-separated values.
Valid values are:
- `all`: Match any index, including hidden ones.
- `closed`: Match closed, non-hidden indexes.
- `hidden`: Match hidden indexes. Must be combined with `open`, `closed`, or both.<br>
- `none`: Wildcard expressions are not accepted.
- `open`: Match open, non-hidden indexes. | N/A | +| `format` | String | A short version of the `Accept` header, such as `json` or `yaml`. | N/A | +| `h` | List | A comma-separated list of column names to display. | N/A | +| `help` | Boolean | Returns help information. | `false` | +| `ignore_throttled` | Boolean | Whether specified concrete, expanded, or aliased indexes should be ignored when throttled. | N/A | +| `ignore_unavailable` | Boolean | Whether the specified concrete indexes should be ignored when missing or closed. | N/A | +| `index` | List | A comma-separated list of data streams, indexes, and aliases used to limit the request. Supports wildcards (`*`). To target all data streams and indexes, omit this parameter or use `*` or `_all`. | N/A | +| `s` | List | A comma-separated list of column names or column aliases to sort by. | N/A | +| `shards` | List | A comma-separated list of shards to display. | N/A | +| `time` | String | Specifies the time units, for example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/api-reference/units/).
Valid values are: `nanos`, `micros`, `ms`, `s`, `m`, `h`, and `d`. | N/A | +| `timeout` | String | The operation timeout. | N/A | +| `v` | Boolean | Enables verbose mode, which displays column headers. | `false` | + + ## Path parameters diff --git a/_api-reference/cat/cat-segments.md b/_api-reference/cat/cat-segments.md index 76696d38869..6f65cc82dea 100644 --- a/_api-reference/cat/cat-segments.md +++ b/_api-reference/cat/cat-segments.md @@ -2,7 +2,6 @@ layout: default title: CAT segments parent: CAT API - nav_order: 55 has_children: false redirect_from: @@ -16,18 +15,39 @@ redirect_from: The cat segments operation lists Lucene segment-level information for each index. -## Path and HTTP methods - + +## Endpoints ```json -GET _cat/segments +GET /_cat/segments +GET /_cat/segments/{index} ``` + + + ## Query parameters -Parameter | Type | Description -:--- | :--- | :--- -bytes | Byte size | Specify the units for byte size. For example, `7kb` or `6gb`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/).. -cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | Default | +| :--- | :--- | :--- | :--- | +| `bytes` | String | The units used to display byte values.
Valid values are: `b`, `kb`, `k`, `mb`, `m`, `gb`, `g`, `tb`, `t`, `pb`, and `p`. | N/A | +| `cluster_manager_timeout` | String | The amount of time allowed to establish a connection to the cluster manager node. | N/A | +| `format` | String | A short version of the `Accept` header, such as `json` or `yaml`. | N/A | +| `h` | List | A comma-separated list of column names to display. | N/A | +| `help` | Boolean | Returns help information. | `false` | +| `s` | List | A comma-separated list of column names or column aliases to sort by. | N/A | +| `v` | Boolean | Enables verbose mode, which displays column headers. | `false` | + + ## Example requests diff --git a/_api-reference/cat/cat-shards.md b/_api-reference/cat/cat-shards.md index c9677cb0ed8..ebcbcb77ac5 100644 --- a/_api-reference/cat/cat-shards.md +++ b/_api-reference/cat/cat-shards.md @@ -2,7 +2,6 @@ layout: default title: CAT shards parent: CAT API - nav_order: 60 has_children: false redirect_from: @@ -16,25 +15,41 @@ redirect_from: The CAT shards operation lists the state of all primary and replica shards and how they are distributed. -## Path and HTTP methods - + +## Endpoints ```json -GET _cat/shards +GET /_cat/shards +GET /_cat/shards/{index} ``` + -## Query parameters -All parameters are optional. - -In addition to the [common parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: + +## Query parameters -Parameter | Type | Description -:--- | :--- | :--- -bytes | Byte size | Specify the units for byte size. For example, `7kb` or `6gb`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). -local | Boolean | Whether to return information from the local node only instead of from the cluster manager node. Default is `false`. -cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. -cancel_after_time_interval | Time | The amount of time after which the shard request will be canceled. Default is `-1`. -time | Time | Specify the units for time. For example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | Default | +| :--- | :--- | :--- | :--- | +| `bytes` | String | The units used to display byte values.
Valid values are: `b`, `kb`, `k`, `mb`, `m`, `gb`, `g`, `tb`, `t`, `pb`, and `p`. | N/A | +| `cluster_manager_timeout` | String | The amount of time allowed to establish a connection to the cluster manager node. | N/A | +| `format` | String | A short version of the `Accept` header, such as `json` or `yaml`. | N/A | +| `h` | List | A comma-separated list of column names to display. | N/A | +| `help` | Boolean | Returns help information. | `false` | +| `local` | Boolean | Returns local information but does not retrieve the state from the cluster manager node. | `false` | +| `s` | List | A comma-separated list of column names or column aliases to sort by. | N/A | +| `time` | String | Specifies the time units, for example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/api-reference/units/).
Valid values are: `nanos`, `micros`, `ms`, `s`, `m`, `h`, and `d`. | N/A | +| `v` | Boolean | Enables verbose mode, which displays column headers. | `false` | + + ## Example requests diff --git a/_api-reference/cat/cat-snapshots.md b/_api-reference/cat/cat-snapshots.md index 71c3b3f75db..d152b39523b 100644 --- a/_api-reference/cat/cat-snapshots.md +++ b/_api-reference/cat/cat-snapshots.md @@ -2,7 +2,6 @@ layout: default title: CAT snapshots parent: CAT API - nav_order: 65 has_children: false redirect_from: @@ -16,18 +15,41 @@ redirect_from: The CAT snapshots operation lists all snapshots for a repository. -## Path and HTTP methods - + +## Endpoints ```json -GET _cat/snapshots +GET /_cat/snapshots +GET /_cat/snapshots/{repository} ``` + + + ## Query parameters -Parameter | Type | Description -:--- | :--- | :--- -cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. -time | Time | Specify the units for time. For example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). +The following table lists the available query parameters. + +| Parameter | Data type | Description | Default | +| :--- | :--- | :--- | :--- | +| `repository` | List or String | **(Required)** A comma-separated list of snapshot repositories used to limit the request. Accepts wildcard expressions. `_all` returns all repositories. If any repository fails during the request, OpenSearch returns an error. | N/A | +| `cluster_manager_timeout` | String | The amount of time allowed to establish a connection to the cluster manager node. | N/A | +| `format` | String | A short version of the `Accept` header, such as `json` or `yaml`. | N/A | +| `h` | List | A comma-separated list of column names to display. | N/A | +| `help` | Boolean | Returns help information. | `false` | +| `ignore_unavailable` | Boolean | When `true`, the response does not include information from unavailable snapshots. | `false` | +| `s` | List | A comma-separated list of column names or column aliases to sort by. | N/A | +| `time` | String | Specifies the time units, for example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/api-reference/units/).
Valid values are: `nanos`, `micros`, `ms`, `s`, `m`, `h`, and `d`. | N/A | +| `v` | Boolean | Enables verbose mode, which displays column headers. | `false` | + + ## Example request diff --git a/_api-reference/cat/cat-tasks.md b/_api-reference/cat/cat-tasks.md index 5419d5c6475..e3c39478ce9 100644 --- a/_api-reference/cat/cat-tasks.md +++ b/_api-reference/cat/cat-tasks.md @@ -2,7 +2,6 @@ layout: default title: CAT tasks parent: CAT API - nav_order: 70 has_children: false redirect_from: @@ -15,20 +14,41 @@ redirect_from: The CAT tasks operation lists the progress of all tasks currently running on your cluster. -## Path and HTTP methods - + +## Endpoints ```json -GET _cat/tasks +GET /_cat/tasks ``` + + + ## Query parameters -Parameter | Type | Description -:--- | :--- | :--- -nodes | List | A comma-separated list of node IDs or names to limit the returned information. Use `_local` to return information from the node you're connecting to, specify the node name to get information from specific nodes, or keep the parameter empty to get information from all nodes. -detailed | Boolean | Returns detailed task information. (Default: false) -parent_task_id | String | Returns tasks with a specified parent task ID (node_id:task_number). Keep empty or set to -1 to return all. -time | Time | Specify the units for time. For example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | Default | +| :--- | :--- | :--- | :--- | +| `actions` | List | The task action names used to limit the response. | N/A | +| `detailed` | Boolean | If `true`, the response includes detailed information about shard recoveries. | `false` | +| `format` | String | A short version of the `Accept` header, such as `json` or `yaml`. | N/A | +| `h` | List | A comma-separated list of column names to display. | N/A | +| `help` | Boolean | Returns help information. | `false` | +| `nodes` | List | A comma-separated list of node IDs or names used to limit the returned information. Use `_local` to return information from the node to which you're connecting, specify a specific node from which to get information, or keep the parameter empty to get information from all nodes. | N/A | +| `parent_task_id` | String | The parent task identifier, which is used to limit the response. | N/A | +| `s` | List | A comma-separated list of column names or column aliases to sort by. | N/A | +| `time` | String | Specifies the time units, for example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/api-reference/units/).
Valid values are: `nanos`, `micros`, `ms`, `s`, `m`, `h`, and `d`. | N/A | +| `v` | Boolean | Enables verbose mode, which displays column headers. | `false` | + + ## Example request diff --git a/_api-reference/cat/cat-templates.md b/_api-reference/cat/cat-templates.md index 90b7d43fc7f..de3e0875810 100644 --- a/_api-reference/cat/cat-templates.md +++ b/_api-reference/cat/cat-templates.md @@ -2,7 +2,6 @@ layout: default title: CAT templates parent: CAT API - nav_order: 70 has_children: false redirect_from: @@ -16,18 +15,39 @@ redirect_from: The CAT templates operation lists the names, patterns, order numbers, and version numbers of index templates. -## Path and HTTP methods - + +## Endpoints ```json -GET _cat/templates +GET /_cat/templates +GET /_cat/templates/{name} ``` + + + ## Query parameters -Parameter | Type | Description -:--- | :--- | :--- -local | Boolean | Whether to return information from the local node only instead of from the cluster manager node. Default is `false`. -cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | Default | +| :--- | :--- | :--- | :--- | +| `cluster_manager_timeout` | String | The amount of time allowed to establish a connection to the cluster manager node. | N/A | +| `format` | String | A short version of the `Accept` header, such as `json` or `yaml`. | N/A | +| `h` | List | A comma-separated list of column names to display. | N/A | +| `help` | Boolean | Returns help information. | `false` | +| `local` | Boolean | Returns local information but does not retrieve the state from the cluster manager node. | `false` | +| `s` | List | A comma-separated list of column names or column aliases to sort by. | N/A | +| `v` | Boolean | Enables verbose mode, which displays column headers. | `false` | + + ## Example requests diff --git a/_api-reference/cat/cat-thread-pool.md b/_api-reference/cat/cat-thread-pool.md index 3171ae830e0..67791572836 100644 --- a/_api-reference/cat/cat-thread-pool.md +++ b/_api-reference/cat/cat-thread-pool.md @@ -15,18 +15,40 @@ redirect_from: The CAT thread pool operation lists the active, queued, and rejected threads of different thread pools on each node. -## Path and HTTP methods - + +## Endpoints ```json -GET _cat/thread_pool +GET /_cat/thread_pool +GET /_cat/thread_pool/{thread_pool_patterns} ``` + + + ## Query parameters -Parameter | Type | Description -:--- | :--- | :--- -local | Boolean | Whether to return information from the local node only instead of from the cluster manager node. Default is `false`. -cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | Default | +| :--- | :--- | :--- | :--- | +| `cluster_manager_timeout` | String | A timeout for connection to the cluster manager node. | N/A | +| `format` | String | A short version of the `Accept` header, such as `json` or `yaml`. | N/A | +| `h` | List | A comma-separated list of column names to display. | N/A | +| `help` | Boolean | Returns help information. | `false` | +| `local` | Boolean | Returns local information but does not retrieve the state from the cluster manager node. 
| `false` | +| `s` | List | A comma-separated list of column names or column aliases to sort by. | N/A | +| `size` | Integer | The multiplier in which to display values. | N/A | +| `v` | Boolean | Enables verbose mode, which displays column headers. | `false` | + + ## Example requests diff --git a/_api-reference/cluster-api/cluster-allocation.md b/_api-reference/cluster-api/cluster-allocation.md index 2f8bd9799ca..5e7984cd750 100644 --- a/_api-reference/cluster-api/cluster-allocation.md +++ b/_api-reference/cluster-api/cluster-allocation.md @@ -17,7 +17,7 @@ The most basic cluster allocation explain request finds an unassigned shard and If you add some options, you can instead get information on a specific shard, including why OpenSearch assigned it to its current node. -## Path and HTTP methods +## Endpoints ```json GET _cluster/allocation/explain diff --git a/_api-reference/cluster-api/cluster-awareness.md b/_api-reference/cluster-api/cluster-awareness.md index 8c162f214cd..c76193df39a 100644 --- a/_api-reference/cluster-api/cluster-awareness.md +++ b/_api-reference/cluster-api/cluster-awareness.md @@ -1,7 +1,7 @@ --- layout: default title: Cluster routing and awareness -nav_order: 20 +nav_order: 50 parent: Cluster APIs has_children: false redirect_from: @@ -13,111 +13,144 @@ redirect_from: **Introduced 1.0** {: .label .label-purple } -To control the distribution of search or HTTP traffic, you can use the weights per awareness attribute to control the distribution of search or HTTP traffic across zones. This is commonly used for zonal deployments, heterogeneous instances, and routing traffic away from zones during zonal failure. +To control how search traffic is routed across zones, you can assign weights to awareness attribute values. This is useful for zonal deployments, heterogeneous clusters, or routing traffic away from unhealthy zones. -## Path and HTTP methods +## Prerequisites + +Before using this API, you must configure cluster awareness attributes and node attributes. This can be done either in the `opensearch.yml` file or through the Cluster Settings API. + +For example, to configure `zone` and `rack` awareness attributes using `opensearch.yml`, specify them as a comma-separated list: + +```yaml +cluster.routing.allocation.awareness.attributes: zone,rack +``` +{% include copy.html %} + +Alternatively, you can use the Cluster Settings API to configure the awareness attributes: + +```json +PUT /_cluster/settings +{ + "persistent" : { + "cluster.routing.allocation.awareness.attributes": ["zone", "rack"] + } +} +``` +{% include copy-curl.html %} + +For more information about OpenSearch settings, see [Configuring OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/). + +## Endpoints ```json PUT /_cluster/routing/awareness//weights GET /_cluster/routing/awareness//weights?local GET /_cluster/routing/awareness//weights +DELETE /_cluster/routing/awareness//weights ``` ## Path parameters -Parameter | Type | Description +The following table lists the available path parameters. All path parameters are optional. + +Parameter | Data type | Description :--- | :--- | :--- -attribute | String | The name of the awareness attribute, usually `zone`. The attribute name must match the values listed in the request body when assigning weights to zones. +`` | String | The name of the configured awareness attribute (for example, `zone`). The attribute specified in the path determines which awareness attribute the weights apply to. 
-## Request body fields +## Query parameters -Parameter | Type | Description -:--- | :--- | :--- -weights | JSON object | Assigns weights to attributes within the request body of the PUT request. Weights can be set in any ratio, for example, 2:3:5. In a 2:3:5 ratio with 3 zones, for every 100 requests sent to the cluster, each zone would receive either 20, 30, or 50 search requests in a random order. When assigned a weight of `0`, the zone does not receive any search traffic. -_version | String | Implements optimistic concurrency control (OCC) through versioning. The parameter uses simple versioning, such as `1`, and increments upward based on each subsequent modification. This allows any servers from which a request originates to validate whether or not a zone has been modified. +The following table lists the available query parameters. All query parameters are optional. +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `local` | Boolean | Can be provided in a `GET` request only. If `true`, the request retrieves information from the node that receives the request instead of from the cluster manager node. Default is `false`.| -In the following example request body, `zone_1` and `zone_2` receive 50 requests each, whereas `zone_3` is prevented from receiving requests: +## Request body fields -``` -{ - "weights": - { - "zone_1": "5", - "zone_2": "5", - "zone_3": "0" - } - "_version" : 1 -} -``` +The following table lists the available request body fields for the `PUT` and `DELETE` methods. -## Example requests +| Parameter | Data type | Applicable method | Description | +| :--- | :--- | :--- | :--- | +| `weights` | Object | `PUT` | Specifies custom weights for the awareness attribute values. The weights influence how search requests are distributed across zones or other awareness attribute values. Weights are relative and can use any ratio. For example, in a `2:3:5` ratio across three zones, 20%, 30%, and 50% of requests are routed to the respective zones. A weight of `0` excludes a zone from receiving search traffic. Required for the `PUT` method. | +| `_version` | Integer | `PUT`, `DELETE` | Used for optimistic concurrency control (OCC). Ensures that changes are applied only if the current version matches, preventing conflicting updates. The version is incremented after each succesful `PUT` or `DELETE` operation. To initiate concurrency control, you must set `_version` to `-1` in the initial request. Required for the `PUT` and `DELETE` methods. | -### Weighted round robin search -The following example request creates a round robin shard allocation for search traffic by using an undefined ratio: +## Example request: Weighted round-robin search +The following example request creates a round-robin shard allocation for search traffic between two zones while excluding a third zone from receiving any traffic: ```json PUT /_cluster/routing/awareness/zone/weights { - "weights": - { - "zone_1": "1", - "zone_2": "1", - "zone_3": "0" - } - "_version" : 1 + "weights": + { + "zone_1": "1", + "zone_2": "1", + "zone_3": "0" + }, + "_version" : -1 } ``` {% include copy-curl.html %} +After this request, the `_version` increments to `0`. + +To create a shard allocation for multiple awareness attributes, send a separate request for each attribute. -### Getting weights for all zones +## Example request: Updating the configuration -The following example request gets weights for all zones. +The `PUT` request fully replaces the existing weight configuration for the specified awareness attribute. 
Any values omitted in the request are removed from the configuration. For example, the following request updates the weights for zones 1 and 3 and removes zone 2: ```json -GET /_cluster/routing/awareness/zone/weights +PUT /_cluster/routing/awareness/zone/weights +{ + "weights": + { + "zone_1": "2", + "zone_3": "1" + }, + "_version" : 0 +} ``` {% include copy-curl.html %} +After this request, the `_version` increments to `1`. -### Deleting weights +## Example request: Viewing the configuration -You can remove your weight ratio for each zone using the `DELETE` method: +To view the current weight configuration and its version, send the following request. Use the returned version number in subsequent update or delete requests: ```json -DELETE /_cluster/routing/awareness/zone/weights +GET /_cluster/routing/awareness/zone/weights ``` {% include copy-curl.html %} -## Example responses - -OpenSearch typically responds with the following when successfully allocating shards: +## Example response ```json { - "acknowledged": true + "weights": { + "zone_1": "2.0", + "zone_3": "1.0" + }, + "_version": 1, + "discovered_cluster_manager": true } ``` -### Getting weights for all zone +## Example request: Deleting the configuration -OpenSearch responds with the weight of each zone: +To remove a weight configuration, provide the current version in a `DELETE` request: ```json +DELETE /_cluster/routing/awareness/zone/weights { - "weights": - { - - "zone_1": "1.0", - "zone_2": "1.0", - "zone_3": "0.0" - }, - "_version":1 + "_version": 1 } +``` +{% include copy-curl.html %} +After this request, the `_version` increments to `2`. ## Next steps diff --git a/_api-reference/cluster-api/cluster-decommission.md b/_api-reference/cluster-api/cluster-decommission.md index 1cf17c5b6bc..04d73d329da 100644 --- a/_api-reference/cluster-api/cluster-decommission.md +++ b/_api-reference/cluster-api/cluster-decommission.md @@ -18,7 +18,7 @@ The cluster decommission operation adds support decommissioning based on awarene For more information about allocation awareness, see [Shard allocation awareness]({{site.url}}{{site.baseurl}}//opensearch/cluster/#shard-allocation-awareness). -## Path and HTTP methods +## Endpoints ```json PUT /_cluster/decommission/awareness/{awareness_attribute_name}/{awareness_attribute_value} diff --git a/_api-reference/cluster-api/cluster-health.md b/_api-reference/cluster-api/cluster-health.md index df8f3f24e33..07ba8d43640 100644 --- a/_api-reference/cluster-api/cluster-health.md +++ b/_api-reference/cluster-api/cluster-health.md @@ -18,7 +18,7 @@ The most basic cluster health request returns a simple status of the health of y To get the status of a specific index, provide the index name. -## Path and HTTP methods +## Endpoints ```json GET _cluster/health diff --git a/_api-reference/cluster-api/cluster-settings.md b/_api-reference/cluster-api/cluster-settings.md index 1e0977c56aa..9008234ff65 100644 --- a/_api-reference/cluster-api/cluster-settings.md +++ b/_api-reference/cluster-api/cluster-settings.md @@ -14,7 +14,7 @@ redirect_from: The cluster settings operation lets you check the current settings for your cluster, review default settings, and change settings. When you update a setting using the API, OpenSearch applies it to all nodes in the cluster. 
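For example, to review the current settings together with their default values in flattened form, a request of the following shape can be used. This is a minimal sketch that assumes the `include_defaults` and `flat_settings` query parameters are available for this API; they are not shown in this excerpt:

```json
GET _cluster/settings?include_defaults=true&flat_settings=true
```
{% include copy-curl.html %}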
-## Path and HTTP methods +## Endpoints ```json GET _cluster/settings diff --git a/_api-reference/cluster-api/cluster-stats.md b/_api-reference/cluster-api/cluster-stats.md index 09b3e5087eb..273a4f379f9 100644 --- a/_api-reference/cluster-api/cluster-stats.md +++ b/_api-reference/cluster-api/cluster-stats.md @@ -16,11 +16,13 @@ redirect_from: The cluster stats API operation returns statistics about your cluster. -## Path and HTTP methods +## Endpoints ```json GET _cluster/stats GET _cluster/stats/nodes/ +GET _cluster/stats//nodes/ +GET _cluster/stats///nodes/ ``` ## Path parameters @@ -30,10 +32,57 @@ All parameters are optional. Parameter | Type | Description :--- | :--- | :--- <node-filters> | List | A comma-separated list of [node filters]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/index/#node-filters) that OpenSearch uses to filter results. +metric | String | A comma-separated list of [metric groups](#metric-groups), for example, `jvm,fs`. Default is all metric groups. +index_metric | String | A comma-separated list of [index metric groups](#index-metric-groups), for example, `docs,store`. Default is all index metrics. - Although the `master` node is now called `cluster_manager` for version 2.0, we retained the `master` field for backwards compatibility. If you have a node that has either a `master` role or a `cluster_manager` role, the `count` increases for both fields by 1. To see an example node count increase, see the Response sample. - {: .note } +Although the term `master` was deprecated in favor of `cluster_manager` subsequent to OpenSearch 2.0, the `master` field was retained for backward compatibility. If you have a node that has either a `master` role or a `cluster_manager` role, the `count` increases for both fields by 1. For an example node count increase, see the [example response](#example-response). +{: .note } + +### Metric groups + +The following table lists all available metric groups. + +Metric | Description +:--- |:---- +`indices` | Statistics about indexes in the cluster. +`os` | Statistics about the host OS, including load and memory. +`process` | Statistics about processes, including open file descriptors and CPU usage. +`jvm` | Statistics about the JVM, including heap usage and threads. +`fs` | Statistics about file system usage. +`plugins` | Statistics about OpenSearch plugins integrated with the nodes. +`network_types` | A list of the transport and HTTP networks connected to the nodes. +`discovery_type` | The method used by the nodes to find other nodes in the cluster. +`packaging_types` | Information about each node's OpenSearch distribution. +`ingest` | Statistics about ingest pipelines. + +### Index metric groups + +To filter the information returned for the `indices` metric, you can use specific `index_metric` values. These values are only supported when using the following query types: + +```json +GET _cluster/stats/_all//nodes/ +GET _cluster/stats/indices//nodes/ +``` + +The following index metrics are supported: + +- `shards` +- `docs` +- `store` +- `fielddata` +- `query_cache` +- `completion` +- `segments` +- `mappings` +- `analysis` + +For example, the following query requests statistics for `docs` and `search`: + +```json +GET _cluster/stats/indices/docs,segments/nodes/_all +``` +{% include copy-curl.html %} ## Example request @@ -491,32 +540,32 @@ GET _cluster/stats/nodes/_cluster_manager Field | Description :--- | :--- -nodes | How many nodes returned in the response. -cluster_name | The cluster's name. 
-cluster_uuid | The cluster's uuid. -timestamp | The Unix epoch time of when the cluster was last refreshed. -status | The cluster's health status. -indices | Statistics about the indexes in the cluster. -indices.count | How many indexes are in the cluster. -indices.shards | Information about the cluster's shards. -indices.docs | How many documents are still in the cluster and how many documents are deleted. -indices.store | Information about the cluster's storage. -indices.fielddata | Information about the cluster's field data -indices.query_cache | Data about the cluster's query cache. -indices.completion | How many bytes in memory are used to complete operations. -indices.segments | Information about the cluster's segments, which are small Lucene indexes. -indices.mappings | Mappings within the cluster. -indices.analysis | Information about analyzers used in the cluster. -nodes | Statistics about the nodes in the cluster. -nodes.count | How many nodes were returned from the request. -nodes.versions | OpenSearch's version number. -nodes.os | Information about the operating systems used in the nodes. -nodes.process | The processes the returned nodes use. -nodes.jvm | Statistics about the Java Virtual Machines in use. -nodes.fs | The nodes' file storage. -nodes.plugins | The OpenSearch plugins integrated within the nodes. -nodes.network_types | The transport and HTTP networks within the nodes. -nodes.discovery_type | The method the nodes use to find other nodes within the cluster. -nodes.packaging_types | Information about the nodes' OpenSearch distribution. -nodes.ingest | Information about the nodes' ingest pipelines/nodes, if there are any. -total_time_spent | The total amount of download and upload time spent across all shards in the cluster when downloading or uploading from the remote store. +`nodes` | The number of nodes returned in the response. +`cluster_name` | The cluster's name. +`cluster_uuid` | The cluster's UUID. +`timestamp` | The Unix epoch time indicating when the cluster was last refreshed. +`status` | The cluster's health status. +`indices` | Statistics about the indexes in the cluster. +`indices.count` | The number of indexes in the cluster. +`indices.shards` | Information about the cluster's shards. +`indices.docs` | The number of documents remaining in the cluster and the number of documents that were deleted. +`indices.store` | Information about the cluster's storage. +`indices.fielddata` | Information about the cluster's field data. +`indices.query_cache` | Data about the cluster's query cache. +`indices.completion` | The number of bytes in memory that were used to complete operations. +`indices.segments` | Information about the cluster's segments, which are small Lucene indexes. +`indices.mappings` | Information about mappings in the cluster. +`indices.analysis` | Information about analyzers used in the cluster. +`nodes` | Statistics about the nodes in the cluster. +`nodes.count` | The number of nodes returned by the request. +`nodes.versions` | The OpenSearch version number for each node. +`nodes.os` | Information about the operating systems used by the nodes. +`nodes.process` | A list of processes used by each node. +`nodes.jvm` | Statistics about the JVMs in use. +`nodes.fs` | Information about the node's file storage. +`nodes.plugins` | A list of the OpenSearch plugins integrated with the nodes. +`nodes.network_types` | A list of the transport and HTTP networks connected to the nodes. 
+`nodes.discovery_type` | A list of methods used by the nodes to find other nodes in the cluster. +`nodes.packaging_types` | Information about each node's OpenSearch distribution. +`nodes.ingest` | Information about the node's ingest pipelines/nodes, if there are any. +`total_time_spent` | The total amount of download and upload time spent across all shards in the cluster when downloading or uploading from the remote store. diff --git a/_api-reference/remote-info.md b/_api-reference/cluster-api/remote-info.md similarity index 91% rename from _api-reference/remote-info.md rename to _api-reference/cluster-api/remote-info.md index 25e032a9d5d..6f407ceb595 100644 --- a/_api-reference/remote-info.md +++ b/_api-reference/cluster-api/remote-info.md @@ -1,9 +1,11 @@ --- layout: default title: Remote cluster information +parent: Cluster APIs nav_order: 67 redirect_from: - /opensearch/rest-api/remote-info/ + - /api-reference/remote-info/ --- # Remote cluster information @@ -15,13 +17,13 @@ This operation provides connection information for any remote OpenSearch cluster The response is more comprehensive and useful than a call to `_cluster/settings`, which only includes the cluster alias and seed nodes. -## Path and HTTP methods +## Endpoints ```json GET _remote/info ``` -## Example Response +## Example response ```json { diff --git a/_api-reference/common-parameters.md b/_api-reference/common-parameters.md index 5b536ad9925..ac3efbf4bfb 100644 --- a/_api-reference/common-parameters.md +++ b/_api-reference/common-parameters.md @@ -123,4 +123,17 @@ Kilometers | `km` or `kilometers` Meters | `m` or `meters` Centimeters | `cm` or `centimeters` Millimeters | `mm` or `millimeters` -Nautical miles | `NM`, `nmi`, or `nauticalmiles` \ No newline at end of file +Nautical miles | `NM`, `nmi`, or `nauticalmiles` + +## `X-Opaque-Id` header + +You can specify an opaque identifier for any request using the `X-Opaque-Id` header. This identifier is used to track tasks and deduplicate deprecation warnings in server-side logs. This identifier is used to differentiate between callers sending requests to your OpenSearch cluster. Do not specify a unique value per request. + +#### Example request + +The following request adds an opaque ID to the request: + +```json +curl -H "X-Opaque-Id: my-curl-client-1" -XGET localhost:9200/_tasks +``` +{% include copy.html %} diff --git a/_api-reference/document-apis/bulk-streaming.md b/_api-reference/document-apis/bulk-streaming.md index c127eab527b..f9379b700e8 100644 --- a/_api-reference/document-apis/bulk-streaming.md +++ b/_api-reference/document-apis/bulk-streaming.md @@ -19,7 +19,7 @@ The streaming bulk operation lets you add, update, or delete multiple documents The default HTTP transport method does not support streaming. You must install the [`transport-reactor-netty4`]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/network-settings/#selecting-the-transport) HTTP transport plugin and use it as the default HTTP transport layer. Both the `transport-reactor-netty4` plugin and the Streaming Bulk API are experimental. 
{: .note} -## Path and HTTP methods +## Endpoints ```json POST _bulk/stream diff --git a/_api-reference/document-apis/bulk.md b/_api-reference/document-apis/bulk.md index 2a2a5ef8a2b..cd76f4fc461 100644 --- a/_api-reference/document-apis/bulk.md +++ b/_api-reference/document-apis/bulk.md @@ -19,7 +19,7 @@ Beginning in OpenSearch 2.9, when indexing documents using the bulk operation, t -## Path and HTTP methods +## Endpoints ```json POST _bulk @@ -43,12 +43,7 @@ refresh | Enum | Whether to refresh the affected shards after performing the ind require_alias | Boolean | Set to `true` to require that all actions target an index alias rather than an index. Default is `false`. routing | String | Routes the request to the specified shard. timeout | Time | How long to wait for the request to return. Default is `1m`. -type | String | (Deprecated) The default document type for documents that don't specify a type. Default is `_doc`. We highly recommend ignoring this parameter and using the `_doc` type for all indexes. wait_for_active_shards | String | Specifies the number of active shards that must be available before OpenSearch processes the bulk request. Default is `1` (only the primary shard). Set to `all` or a positive integer. Values greater than 1 require replicas. For example, if you specify a value of 3, the index must have 2 replicas distributed across 2 additional nodes in order for the request to succeed. -batch_size | Integer | **(Deprecated)** Specifies the number of documents to be batched and sent to an ingest pipeline to be processed together. Default is `2147483647` (documents are ingested by an ingest pipeline all at once). If the bulk request doesn't explicitly specify an ingest pipeline or the index doesn't have a default ingest pipeline, then this parameter is ignored. Only documents with `create`, `index`, or `update` actions can be grouped into batches. -{% comment %}_source | List | asdf -_source_excludes | List | asdf -_source_includes | List | asdf{% endcomment %} ## Request body diff --git a/_api-reference/document-apis/delete-by-query.md b/_api-reference/document-apis/delete-by-query.md index a55617e145f..ef1f0dcc97e 100644 --- a/_api-reference/document-apis/delete-by-query.md +++ b/_api-reference/document-apis/delete-by-query.md @@ -13,7 +13,7 @@ redirect_from: You can include a query as part of your delete request so OpenSearch deletes all documents that match that query. -## Path and HTTP methods +## Endpoints ```json POST /_delete_by_query @@ -58,7 +58,7 @@ _source | String | Specifies whether to include the `_source` field in the respo _source_excludes | String | A comma-separated list of source fields to exclude from the response. _source_includes | String | A comma-separated list of source fields to include in the response. stats | String | Value to associate with the request for additional logging. -terminate_after | Integer | The maximum number of documents OpenSearch should process before terminating the request. +terminate_after | Integer | The maximum number of matching documents (hits) OpenSearch should process before terminating the request. timeout | Time | How long the operation should wait from a response from active shards. Default is `1m`. version | Boolean | Whether to include the document version as a match. wait_for_active_shards | String | The number of shards that must be active before OpenSearch executes the operation. Valid values are `all` or any integer up to the total number of shards in the index. Default is 1, which is the primary shard. 
diff --git a/_api-reference/document-apis/delete-document.md b/_api-reference/document-apis/delete-document.md index 85ce4bd79bf..fc41e1e3e04 100644 --- a/_api-reference/document-apis/delete-document.md +++ b/_api-reference/document-apis/delete-document.md @@ -13,7 +13,7 @@ redirect_from: If you no longer need a document in your index, you can use the delete document API operation to delete it. -## Path and HTTP methods +## Endpoints ```json DELETE //_doc/<_id> diff --git a/_api-reference/document-apis/get-documents.md b/_api-reference/document-apis/get-documents.md index 1a6bc73a12c..60e4a935776 100644 --- a/_api-reference/document-apis/get-documents.md +++ b/_api-reference/document-apis/get-documents.md @@ -3,7 +3,7 @@ layout: default title: Get document parent: Document APIs nav_order: 5 -redirect_from: +redirect_from: - /opensearch/rest-api/document-apis/get-documents/ --- @@ -14,7 +14,7 @@ redirect_from: After adding a JSON document to your index, you can use the Get Document API operation to retrieve the document's information and data. -## Path and HTTP methods +## Endpoints Use the GET method to retrieve a document and its source or stored fields from a particular index. Use the HEAD method to verify that a document exists: @@ -47,7 +47,7 @@ preference | String | Specifies a preference of which shard to retrieve results realtime | Boolean | Specifies whether the operation should run in realtime. If false, the operation waits for the index to refresh to analyze the source to retrieve data, which makes the operation near-realtime. Default is `true`. refresh | Boolean | If true, OpenSearch refreshes shards to make the get operation available to search results. Valid options are `true`, `false`, and `wait_for`, which tells OpenSearch to wait for a refresh before executing the operation. Default is `false`. routing | String | A value used to route the operation to a specific shard. -stored_fields | Boolean | Whether the get operation should retrieve fields stored in the index. Default is `false`. +stored_fields | List | A comma-separated list of fields stored in the index that should be retrieved. Default is no stored fields will be returned. _source | String | Whether to include the `_source` field in the response body. Default is `true`. _source_excludes | String | A comma-separated list of source fields to exclude in the query response. _source_includes | String | A comma-separated list of source fields to include in the query response. @@ -56,7 +56,7 @@ version_type | Enum | Retrieves a specifically typed document. Available options ### Real time -The OpenSearch Get Document API operates in real time by default, which means that it retrieves the latest version of the document regardless of the index's refresh rate or the rate at which new data becomes searchable. However, if you request stored fields (using the `stored_fields` parameter) for a document that has been updated but not yet refreshed, then the Get Document API parses and analyzes the document's source to extract those stored fields. +The OpenSearch Get Document API operates in real time by default, which means that it retrieves the latest version of the document regardless of the index's refresh rate or the rate at which new data becomes searchable. However, if you request stored fields (using the `stored_fields` parameter) for a document that has been updated but not yet refreshed, then the Get Document API parses and analyzes the document's source to extract those stored fields. 
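For illustration, stored fields can be requested directly using the `stored_fields` query parameter described in the preceding table. The following is a minimal sketch in which the `title` and `author` field names are hypothetical and would need to be mapped with `"store": true` in the index mapping:

```json
GET test-index/_doc/0?stored_fields=title,author
```
{% include copy-curl.html %}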
To disable the real-time behavior and retrieve the document based on the last refreshed state of the index, set the `realtime` parameter to `false`. @@ -70,7 +70,7 @@ GET test-index/_doc/0?_source=false #### `source` includes and excludes -If you only want to retrieve specific fields from the source, use the `_source_includes` or `_source_excludes` parameters to include or exclude particular fields, respectively. This can be beneficial for large documents because retrieving only the required fields can reduce network overhead. +If you only want to retrieve specific fields from the source, use the `_source_includes` or `_source_excludes` parameters to include or exclude particular fields, respectively. This can be beneficial for large documents because retrieving only the required fields can reduce network overhead. Both parameters accept a comma-separated list of fields and wildcard expressions, as shown in the following example, where any `_source` that contains `*.play` is included in the response but sources with the field `entities` are excluded: diff --git a/_api-reference/document-apis/index-document.md b/_api-reference/document-apis/index-document.md index d195a0662ea..4666581f319 100644 --- a/_api-reference/document-apis/index-document.md +++ b/_api-reference/document-apis/index-document.md @@ -14,7 +14,7 @@ redirect_from: You can use the `Index document` operation to add a single document to your index. -## Path and HTTP methods +## Endpoints ```json PUT /_doc/<_id> diff --git a/_api-reference/document-apis/mtermvectors.md b/_api-reference/document-apis/mtermvectors.md new file mode 100644 index 00000000000..64669458cda --- /dev/null +++ b/_api-reference/document-apis/mtermvectors.md @@ -0,0 +1,333 @@ +--- +layout: default +title: Multi term vectors +parent: Document APIs +nav_order: 33 +--- + +# Multi term vectors + +The `_mtermvectors` API retrieves term vector information for multiple documents in one request. Term vectors provide detailed information about the terms (words) in a document, including term frequency, positions, offsets, and payloads. This can be useful for applications such as relevance scoring, highlighting, or similarity calculations. For more information, see [Term vector parameter]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/text/#term-vector-parameter). + + +## Endpoints +```json +GET /_mtermvectors +POST /_mtermvectors +GET /{index}/_mtermvectors +POST /{index}/_mtermvectors +``` + + + +## Path parameters + +The following table lists the available path parameters. All path parameters are optional. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `index` | String | The name of the index that contains the document. | + + + + +## Query parameters + +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `field_statistics` | Boolean | If `true`, the response includes the document count, sum of document frequencies, and sum of total term frequencies. _(Default: `true`)_ | +| `fields` | List or String | A comma-separated list or a wildcard expression specifying the fields to include in the statistics. Used as the default list unless a specific field list is provided in the `completion_fields` or `fielddata_fields` parameters. | +| `ids` | List | A comma-separated list of documents IDs. You must provide either the `docs` field in the request body or specify `ids` as a query parameter or in the request body. 
| +| `offsets` | Boolean | If `true`, the response includes term offsets. _(Default: `true`)_ | +| `payloads` | Boolean | If `true`, the response includes term payloads. _(Default: `true`)_ | +| `positions` | Boolean | If `true`, the response includes term positions. _(Default: `true`)_ | +| `preference` | String | Specifies the node or shard on which the operation should be performed. See [preference query parameter]({{site.url}}{{site.baseurl}}/api-reference/search-apis/search/#the-preference-query-parameter) for a list of available options. By default the requests are routed randomly to available shard copies (primary or replica), with no guarantee of consistency across repeated queries. | +| `realtime` | Boolean | If `true`, the request is real time as opposed to near real time. _(Default: `true`)_ | +| `routing` | List or String | A custom value used to route operations to a specific shard. | +| `term_statistics` | Boolean | If `true`, the response includes term frequency and document frequency. _(Default: `false`)_ | +| `version` | Integer | If `true`, returns the document version as part of a hit. | +| `version_type` | String | The specific version type.
Valid values are:
- `external`: The version number must be greater than the current version.
- `external_gte`: The version number must be greater than or equal to the current version.
- `force`: The version number is forced to be the given value.
- `internal`: The version number is managed internally by OpenSearch. | + + + +## Request body fields + +The following table lists the fields that can be specified in the request body. + +| Field | Data type | Description | +| `docs` | Array | An array of document specifications. | +| `ids` | Array of strings | A list of document IDs to retrieve. Use only when all documents share the same index specified in the request path or query. | +| `fields` | Array of strings | A list of field names for which to return term vectors. | +| `offsets` | Boolean | If `true`, the response includes character offsets for each term. *(Default: `true`)* | +| `payloads` | Boolean | If `true`, the response includes payloads for each term. *(Default: `true`)* | +| `positions` | Boolean | If `true`, the response includes token positions. *(Default: `true`)* | +| `field_statistics` | Boolean | If `true`, the response includes statistics such as document count, sum of document frequencies, and sum of total term frequencies. *(Default: `true`)* | +| `term_statistics` | Boolean | If `true`, the response includes term frequency and document frequency. *(Default: `false`)* | +| `routing` | String | A custom routing value used to identify the shard. Required if custom routing was used during indexing. | +| `version` | Integer | The specific version of the document to retrieve. | +| `version_type` | String | The type of versioning to use. Valid values: `internal`, `external`, `external_gte`. | +| `filter` | Object | Filters tokens returned in the response (for example, by frequency or position). For supported fields, see [Filtering terms]({{site.url}}{{site.baseurl}}/api-reference/document-apis/mtermvectors/#filtering-terms). | +| `per_field_analyzer` | Object | Specifies a custom analyzer to use per field. Format: `{ "field_name": "analyzer_name" }`. | + +## Filtering terms + +The `filter` object in the request body allows you to filter the tokens to include in the term vector response. The `filter` object supports the following fields. + +| Field | Data type | Description | +| `max_num_terms` | Integer | The maximum number of terms to return. | +| `min_term_freq` | Integer | The minimum term frequency in the document required for a term to be included. | +| `max_term_freq` | Integer | The maximum term frequency in the document required for a term to be included. | +| `min_doc_freq` | Integer | The minimum document frequency across the index required for a term to be included. | +| `max_doc_freq` | Integer | The maximum document frequency across the index required for a term to be included. | +| `min_word_length` | Integer | The minimum length of the term to be included. | +| `max_word_length` | Integer | The maximum length of the term to be included. | + +## Example + +Create an index with term vectors enabled: + +```json +PUT /my-index +{ + "mappings": { + "properties": { + "text": { + "type": "text", + "term_vector": "with_positions_offsets_payloads" + } + } + } +} +``` +{% include copy-curl.html %} + +Index the first document: + +```json +POST /my-index/_doc/1 +{ + "text": "OpenSearch is a search engine." +} +``` +{% include copy-curl.html %} + +Index the second document: + +```json +POST /my-index/_doc/2 +{ + "text": "OpenSearch provides powerful features." 
+} +``` +{% include copy-curl.html %} + +### Example request + +Get term vectors for multiple documents: + +```json +POST /_mtermvectors +{ + "docs": [ + { + "_index": "my-index", + "_id": "1", + "fields": ["text"] + }, + { + "_index": "my-index", + "_id": "2", + "fields": ["text"] + } + ] +} +``` +{% include copy-curl.html %} + +Alternatively, you can specify both `ids` and `fields` as query parameters: + +```json +GET /my-index/_mtermvectors?ids=1,2&fields=text +``` +{% include copy-curl.html %} + +You can also provide document IDs in the `ids` array instead of specifying `docs`: + +```json +GET /my-index/_mtermvectors?fields=text +{ + "ids": [ + "1", "2" + ] +} +``` +{% include copy-curl.html %} + +## Example response + +The response contains term vector information for the two documents: + +```json +{ + "docs": [ + { + "_index": "my-index", + "_id": "1", + "_version": 1, + "found": true, + "took": 10, + "term_vectors": { + "text": { + "field_statistics": { + "sum_doc_freq": 9, + "doc_count": 2, + "sum_ttf": 9 + }, + "terms": { + "a": { + "term_freq": 1, + "tokens": [ + { + "position": 2, + "start_offset": 14, + "end_offset": 15 + } + ] + }, + "engine": { + "term_freq": 1, + "tokens": [ + { + "position": 4, + "start_offset": 23, + "end_offset": 29 + } + ] + }, + "is": { + "term_freq": 1, + "tokens": [ + { + "position": 1, + "start_offset": 11, + "end_offset": 13 + } + ] + }, + "opensearch": { + "term_freq": 1, + "tokens": [ + { + "position": 0, + "start_offset": 0, + "end_offset": 10 + } + ] + }, + "search": { + "term_freq": 1, + "tokens": [ + { + "position": 3, + "start_offset": 16, + "end_offset": 22 + } + ] + } + } + } + } + }, + { + "_index": "my-index", + "_id": "2", + "_version": 1, + "found": true, + "took": 0, + "term_vectors": { + "text": { + "field_statistics": { + "sum_doc_freq": 9, + "doc_count": 2, + "sum_ttf": 9 + }, + "terms": { + "features": { + "term_freq": 1, + "tokens": [ + { + "position": 3, + "start_offset": 29, + "end_offset": 37 + } + ] + }, + "opensearch": { + "term_freq": 1, + "tokens": [ + { + "position": 0, + "start_offset": 0, + "end_offset": 10 + } + ] + }, + "powerful": { + "term_freq": 1, + "tokens": [ + { + "position": 2, + "start_offset": 20, + "end_offset": 28 + } + ] + }, + "provides": { + "term_freq": 1, + "tokens": [ + { + "position": 1, + "start_offset": 11, + "end_offset": 19 + } + ] + } + } + } + } + } + ] +} +``` + +## Response body fields + +The following table lists all response body fields. + +| Field | Data type | Description | +| -------- | --------- | ----------- | +| `docs` | Array | A list of requested documents containing term vectors. | + +Each element of the `docs` array contains the following fields. + +| Field | Data type | Description | +| -------- | --------- | ----------- | +| `term_vectors` | Object | Contains term vector data for each field. | +| `term_vectors..field_statistics` | Object | Contains statistics about the field. | +| `term_vectors..field_statistics.doc_count` | Integer | The number of documents that contain at least one term in the specified field. | +| `term_vectors..field_statistics.sum_doc_freq` | Integer | The sum of document frequencies for all terms in the field. | +| `term_vectors..field_statistics.sum_ttf` | Integer | The sum of total term frequencies for all terms in the field. | +| `term_vectors..terms` | Object | A map of terms in the field, in which each term includes its frequency (`term_freq`) and associated token information. 
| +| `term_vectors..terms..tokens` | Array | An array of token objects for each term, including the token's `position` in the text and its character offsets (`start_offset` and `end_offset`). | diff --git a/_api-reference/document-apis/multi-get.md b/_api-reference/document-apis/multi-get.md index acd69a7b7eb..fa706b6fb5f 100644 --- a/_api-reference/document-apis/multi-get.md +++ b/_api-reference/document-apis/multi-get.md @@ -13,7 +13,7 @@ redirect_from: The multi-get operation allows you to run multiple GET operations in one request, so you can get back all documents that match your criteria. -## Path and HTTP methods +## Endpoints ```json GET _mget diff --git a/_api-reference/document-apis/pull-based-ingestion-management.md b/_api-reference/document-apis/pull-based-ingestion-management.md new file mode 100644 index 00000000000..90c97721063 --- /dev/null +++ b/_api-reference/document-apis/pull-based-ingestion-management.md @@ -0,0 +1,199 @@ +--- +layout: default +title: Pull-based ingestion management +parent: Pull-based ingestion +grand_parent: Document APIs +has_children: true +nav_order: 10 +--- + +# Pull-based ingestion management +**Introduced 3.0** +{: .label .label-purple } + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning} + +OpenSearch provides the following APIs to manage pull-based ingestion. + +## Pause ingestion + +Pauses ingestion for one or more indexes. When paused, OpenSearch stops consuming data from the streaming source for all shards in the specified indexes. + +### Endpoint + +```json +POST //ingestion/_pause +``` + +### Path parameters + +The following table lists the available path parameters. + +| Parameter | Data type | Required/Optional | Description | +| :--- | :--- | :--- | :--- | +| `index` | String | Required | The index to pause. Can be a comma-separated list of multiple index names. | + +### Query parameters + +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `cluster_manager_timeout` | Time units | The amount of time to wait for a connection to the cluster manager node. Default is `30s`. | +| `timeout` | Time units | The amount of time to wait for a response from the cluster. Default is `30s`. | + +### Example request + +```json +POST /my-index/ingestion/_pause +``` +{% include copy-curl.html %} + +## Resume ingestion + +Resumes ingestion for one or more indexes. When resumed, OpenSearch continues consuming data from the streaming source for all shards in the specified indexes. + +As part of the resume operation, you can optionally reset the stream consumer to start reading from a specific offset or timestamp. If reset settings are specified, all consumers for the selected shards are reset before the resume operation is applied to the index. Resetting a consumer also triggers an internal flush to persist the changes. + +### Endpoint + +```json +POST //ingestion/_resume +``` + +### Path parameters + +The following table lists the available path parameters. + +| Parameter | Data type | Required/Optional | Description | +| :--- | :--- | :--- | :--- | +| `index` | String | Required | The index to resume ingestion for. Can be a comma-separated list of multiple index names. 
| + +### Query parameters + +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | :--- | +| `cluster_manager_timeout` | Time units | The amount of time to wait for a connection to the cluster manager node. Default is `30s`. | +| `timeout` | Time units | The amount of time to wait for a response from the cluster. Default is `30s`. | + +### Request body fields + +The following table lists the available request body fields. + +| Field | Data type | Required/Optional | Description | +| :--- | :--- | :--- | :--- | +| `reset_settings` | Array | Optional | A list of reset settings for each shard. If not provided, OpenSearch resumes ingestion from the current position for each shard in the specified index. | +| `reset_settings.shard` | Integer | Required | The shard to reset. | +| `reset_settings.mode` | String | Required | The reset mode. Valid values are `offset` (a positive integer offset) and `timestamp` (a Unix timestamp in milliseconds). | +| `reset_settings.value` | String | Required |  • `offset`: The Apache Kafka offset or Amazon Kinesis sequence number
 • `timestamp`: A Unix timestamp in milliseconds. | + +### Example request + +To resume ingestion without specifying reset settings, send the following request: + +```json +POST /my-index/ingestion/_resume +``` +{% include copy-curl.html %} + +To provide reset settings when resuming ingestion, send the following request: + +```json +POST /my-index/ingestion/_resume +{ + "reset_settings": [ + { + "shard": 0, + "mode": "offset", + "value": "1" + } + ] +} +``` +{% include copy-curl.html %} + +## Get ingestion state + +Returns the current ingestion state for one or more indexes. This API supports pagination. + +### Endpoint + +```json +GET //ingestion/_state +``` + +### Path parameters + +The following table lists the available path parameters. + +| Parameter | Data type | Required/Optional | Description | +| :--- | :--- | :--- | :--- | +| `index` | String | Required | The index for which to return the ingestion state. Can be a comma-separated list of multiple index names. | + +### Query parameters + +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `timeout` | Time units | The amount of time to wait for a response from the cluster. Default is `30s`. | + +### Example request + +The following is a request with the default settings: + +```json +GET /my-index/ingestion/_state +``` +{% include copy-curl.html %} + +The following example shows a request with a page size of 20: + +```json +GET /my-index/ingestion/_state?size=20 +``` +{% include copy-curl.html %} + +The following example shows a request with a next page token: + +```json +GET /my-index/ingestion/_state?size=20&next_token= +``` +{% include copy-curl.html %} + +### Example response + +```json +{ + "_shards": { + "total": 1, + "successful": 1, + "failed": 0, + "failures": [ + { + "shard": 0, + "index": "my-index", + "status": "INTERNAL_SERVER_ERROR", + "reason": { + "type": "timeout_exception", + "reason": "error message" + } + } + ] + }, + "next_page_token" : "page token if not on last page", + "ingestion_state": { + "indexName": [ + { + "shard": 0, + "poller_state": "POLLING", + "error_policy": "DROP", + "poller_paused": false + } + ] + } +} +``` \ No newline at end of file diff --git a/_api-reference/document-apis/pull-based-ingestion.md b/_api-reference/document-apis/pull-based-ingestion.md new file mode 100644 index 00000000000..583b633c9ee --- /dev/null +++ b/_api-reference/document-apis/pull-based-ingestion.md @@ -0,0 +1,174 @@ +--- +layout: default +title: Pull-based ingestion +parent: Document APIs +has_children: true +nav_order: 60 +--- + +# Pull-based ingestion +**Introduced 3.0** +{: .label .label-purple } + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning} + +Pull-based ingestion enables OpenSearch to ingest data from streaming sources such as Apache Kafka or Amazon Kinesis. Unlike traditional ingestion methods where clients actively push data to OpenSearch through REST APIs, pull-based ingestion allows OpenSearch to control the data flow by retrieving data directly from streaming sources. This approach provides exactly-once ingestion semantics and native backpressure handling, helping prevent server overload during traffic spikes. 
+ +## Prerequisites + +Before using pull-based ingestion, ensure that the following prerequisites are met: + +* Install an ingestion plugin for your streaming source using the command `bin/opensearch-plugin install `. For more information, see [Additional plugins]({{site.url}}{{site.baseurl}}/install-and-configure/additional-plugins/index/). OpenSearch supports the following ingestion plugins: + - `ingestion-kafka` + - `ingestion-kinesis` +* Enable [segment replication]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/segment-replication/index/) with [remote-backed storage]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/remote-store/index/). Pull-based ingestion is not compatible with document replication. +* Configure pull-based ingestion during [index creation](#creating-an-index-for-pull-based-ingestion). You cannot convert an existing push-based index to a pull-based one. + +## Creating an index for pull-based ingestion + +To ingest data from a streaming source, first create an index with pull-based ingestion settings. The following request creates an index that pulls data from a Kafka topic: + +```json +PUT /my-index +{ + "settings": { + "ingestion_source": { + "type": "kafka", + "pointer.init.reset": "earliest", + "param": { + "topic": "test", + "bootstrap_servers": "localhost:49353" + } + }, + "index.number_of_shards": 1, + "index.number_of_replicas": 1, + "index": { + "replication.type": "SEGMENT" + } + }, + "mappings": { + "properties": { + "name": { + "type": "text" + }, + "age": { + "type": "integer" + } + } + } +} +``` +{% include copy-curl.html %} + +### Ingestion source parameters + +The `ingestion_source` parameters control how OpenSearch pulls data from the streaming source. A _poll_ is an operation in which OpenSearch actively requests a batch of data from the streaming source. The following table lists all parameters that `ingestion_source` supports. + +| Parameter | Description | +| :--- | :--- | +| `type` | The streaming source type. Required. Valid values are `kafka` or `kinesis`. | +| `pointer.init.reset` | Determines the stream location from which to start reading. Optional. Valid values are `earliest`, `latest`, `reset_by_offset`, `reset_by_timestamp`, or `none`. See [Stream position](#stream-position). | +| `pointer.init.reset.value` | Required only for `reset_by_offset` or `reset_by_timestamp`. Specifies the offset value or timestamp in milliseconds. See [Stream position](#stream-position). | +| `error_strategy` | How to handle failed messages. Optional. Valid values are `DROP` (failed messages are skipped and ingestion continues) and `BLOCK` (when a message fails, ingestion stops). Default is `DROP`. We recommend using `DROP` for the current experimental release. | +| `poll.max_batch_size` | The maximum number of records to retrieve in each poll operation. Optional. | +| `poll.timeout` | The maximum time to wait for data in each poll operation. Optional. | +| `num_processor_threads` | The number of threads for processing ingested data. Optional. Default is 1. | +| `internal_queue_size` | The size of the internal blocking queue for advanced tuning. Valid values are from 1 to 100,000, inclusive. Optional. Default is 100. | +| `param` | Source-specific configuration parameters. Required.
 • The `ingestion-kafka` plugin requires:
  - `topic`: The Kafka topic from which to consume.
  - `bootstrap_servers`: The Kafka server addresses.
  Optionally, you can provide additional standard Kafka consumer parameters (such as `fetch.min.bytes`). These parameters are passed directly to the Kafka consumer.
 • The `ingestion-kinesis` plugin requires:
  - `stream`: The Kinesis stream name.
  - `region`: The AWS Region.
  - `access_key`: The AWS access key.
  - `secret_key`: The AWS secret key.
  Optionally, you can provide an `endpoint_override`. | + +### Stream position + +When creating an index, you can specify where OpenSearch should start reading from the stream by configuring the `pointer.init.reset` and `pointer.init.reset.value` settings in the `ingestion_source` parameter. OpenSearch will resume reading from the last commited position for existing indexes. + +The following table provides the valid `pointer.init.reset` values and their corresponding `pointer.init.reset.value` values. + +| `pointer.init.reset` | Starting ingestion point | `pointer.init.reset.value` | +| :--- | :--- | :--- | +| `earliest` | The beginning of the stream | None | +| `latest` | The current end of the stream | None | +| `reset_by_offset` | A specific offset in the stream | A positive integer offset. Required. | +| `reset_by_timestamp` | A specific point in time | A Unix timestamp in milliseconds. Required.
For Kafka streams, defaults to Kafka's `auto.offset.reset` policy if no messages are found for the given timestamp. | +| `none` | The last committed position for existing indexes | None | + +### Stream partitioning + +When using partitioned streams (such as Kafka topics or Kinesis shards), note the following relationships between stream partitions and OpenSearch shards: + +- OpenSearch shards map one-to-one to stream partitions. +- The number of index shards must be greater than or equal to the number of stream partitions. +- Extra shards beyond the number of partitions remain empty. +- Documents must be sent to the same partition for successful updates. + +When using pull-based ingestion, traditional REST API--based ingestion is disabled for the index. +{: .note} + +### Updating the error policy + +You can use the [Update Settings API]({{site.url}}{{site.baseurl}}/api-reference/index-apis/update-settings/) to dynamically update the error policy by setting `index.ingestion_source.error_strategy` to either `DROP` or `BLOCK`. + +The following example demonstrates how to update the error policy: + +```json +PUT /my-index/_settings +{ + "index.ingestion_source.error_strategy": "DROP" +} +``` +{% include copy-curl.html %} + +## Message format + +To be correctly processed by OpenSearch, messages in the streaming source must have the following format: + +```json +{"_id":"1", "_version":"1", "_source":{"name": "alice", "age": 30}, "_op_type": "index"} +{"_id":"2", "_version":"2", "_source":{"name": "alice", "age": 30}, "_op_type": "delete"} +``` + +Each data unit in the streaming source (Kafka message or Kinesis record) must include the following fields that specify how to create or modify an OpenSearch document. + +| Field | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `_id` | String | No | A unique identifier for a document. If not provided, OpenSearch auto-generates an ID. Required for document updates or deletions. | +| `_version` | Long | No | A document version number, which must be maintained externally. If provided, OpenSearch drops messages with versions earlier than the current document version. If not provided, no version checking occurs. | +| `_op_type` | String | No | The operation to perform. Valid values are:
- `index`: Creates a new document or updates an existing one.
 - `create`: Creates a new document in append mode. Does not update existing documents.
- `delete`: Soft deletes a document. | +| `_source` | Object | Yes | The message payload containing the document data. | + +## Pull-based ingestion metrics + +Pull-based ingestion provides metrics that can be used to monitor the ingestion process. The `polling_ingest_stats` metric is currently supported and is available at the shard level. + +The following table lists the available `polling_ingest_stats` metrics. + +| Metric | Description | +| :--- | :--- | +| `message_processor_stats.total_processed_count` | The total number of messages processed by the message processor. | +| `message_processor_stats.total_invalid_message_count` | The number of invalid messages encountered. | +| `message_processor_stats.total_version_conflicts_count` | The number of version conflicts due to which older version messages will be dropped. | +| `message_processor_stats.total_failed_count` | The total number of failed messages, which error out during processing. | +| `message_processor_stats.total_failures_dropped_count` | The total number of failed messages, which are dropped after exhausting retries. Note that messages are only dropped when the DROP error policy is used. | +| `message_processor_stats.total_processor_thread_interrupt_count` | Indicates the number of thread interruptions on the processor thread. | +| `consumer_stats.total_polled_count` | The total number of messages polled from the stream consumer. | +| `consumer_stats.total_consumer_error_count` | The total number of fatal consumer read errors. | +| `consumer_stats.total_poller_message_failure_count` | The total number of failed messages on the poller. | +| `consumer_stats.total_poller_message_dropped_count` | The total number of failed messages on the poller that were dropped. | +| `consumer_stats.total_duplicate_message_skipped_count` | The total number of skipped messages that were previously processed. | +| `consumer_stats.lag_in_millis` | Lag in milliseconds, computed as the time elapsed since the last processed message timestamp. | + +To retrieve shard-level pull-based ingestion metrics, use the [Nodes Stats API]({{site.url}}{{site.baseurl}}/api-reference/index-apis/update-settings/): + +```json +GET /_nodes/stats/indices?level=shards&pretty +``` +{% include copy-curl.html %} + + +## Limitations + +The following limitations apply when using pull-based ingestion: + +* [Ingest pipelines]({{site.url}}{{site.baseurl}}/ingest-pipelines/) are not compatible with pull-based ingestion. +* [Dynamic mapping]({{site.url}}{{site.baseurl}}/field-types/) is not supported. +* [Index rollover]({{site.url}}{{site.baseurl}}/api-reference/index-apis/rollover/) is not supported. +* Operation listeners are not supported. \ No newline at end of file diff --git a/_api-reference/document-apis/reindex.md b/_api-reference/document-apis/reindex.md index 65df81777e2..34d2ba0b8d6 100644 --- a/_api-reference/document-apis/reindex.md +++ b/_api-reference/document-apis/reindex.md @@ -2,7 +2,7 @@ layout: default title: Reindex document parent: Document APIs -nav_order: 60 +nav_order: 17 redirect_from: - /opensearch/reindex-data/ - /opensearch/rest-api/document-apis/reindex/ @@ -15,7 +15,7 @@ redirect_from: The reindex document API operation lets you copy all or a subset of your data from a source index into a destination index. 
-## Path and HTTP methods +## Endpoints ```json POST /_reindex diff --git a/_api-reference/document-apis/termvector.md b/_api-reference/document-apis/termvector.md new file mode 100644 index 00000000000..81b826c4dab --- /dev/null +++ b/_api-reference/document-apis/termvector.md @@ -0,0 +1,243 @@ +--- +layout: default +title: Term vectors +parent: Document APIs +nav_order: 32 +--- + +# Term vectors + +The `_termvectors` API retrieves term vector information for a single document. Term vectors provide detailed information about the terms (words) in a document, including term frequency, positions, offsets, and payloads. This can be useful for applications such as relevance scoring, highlighting, or similarity calculations. For more information, see [Term vector parameter]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/text/#term-vector-parameter). + + +## Endpoints +```json +GET /{index}/_termvectors +POST /{index}/_termvectors +GET /{index}/_termvectors/{id} +POST /{index}/_termvectors/{id} +``` + + + +## Path parameters + +The following table lists the available path parameters. + +| Parameter | Required | Data type | Description | +| :--- | :--- | :--- | :--- | +| `index` | **Required** | String | The name of the index containing the document. | +| `id` | _Optional_ | String | The unique identifier of the document. | + + + +## Query parameters + +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `field_statistics` | Boolean | If `true`, the response includes the document count, sum of document frequencies, and sum of total term frequencies. *(Default: `true`)* | +| `fields` | List or String | A comma-separated list or a wildcard expression specifying the fields to include in the statistics. Used as the default list unless a specific field list is provided in the `completion_fields` or `fielddata_fields` parameters. | +| `offsets` | Boolean | If `true`, the response includes term offsets. *(Default: `true`)* | +| `payloads` | Boolean | If `true`, the response includes term payloads. *(Default: `true`)* | +| `positions` | Boolean | If `true`, the response includes term positions. *(Default: `true`)* | +| `preference` | String | Specifies the node or shard on which the operation should be performed. See [preference query parameter]({{site.url}}{{site.baseurl}}/api-reference/search-apis/search/#the-preference-query-parameter) for a list of available options. By default the requests are routed randomly to available shard copies (primary or replica), with no guarantee of consistency across repeated queries. | +| `realtime` | Boolean | If `true`, the request is real time as opposed to near real time. *(Default: `true`)* | +| `routing` | List or String | A custom value used to route operations to a specific shard. | +| `term_statistics` | Boolean | If `true`, the response includes term frequency and document frequency. *(Default: `false`)* | +| `version` | Integer | If `true`, returns the document version as part of a hit. | +| `version_type` | String | The specific version type.
Valid values are:
- `external`: The version number must be greater than the current version.
- `external_gte`: The version number must be greater than or equal to the current version.
- `force`: The version number is forced to be the given value.
- `internal`: The version number is managed internally by OpenSearch. | + +## Request body fields + +The following table lists the fields that can be specified in the request body. + +| Field | Data type | Description | +| `doc` | Object | A document to analyze. If provided, the API does not retrieve an existing document from the index but uses the provided content. | +| `fields` | Array of strings | A list of field names for which to return term vectors. | +| `offsets` | Boolean | If `true`, the response includes character offsets for each term. *(Default: `true`)* | +| `payloads` | Boolean | If `true`, the response includes payloads for each term. *(Default: `true`)* | +| `positions` | Boolean | If `true`, the response includes token positions. *(Default: `true`)* | +| `field_statistics` | Boolean | If `true`, the response includes statistics such as document count, sum of document frequencies, and sum of total term frequencies. *(Default: `true`)* | +| `term_statistics` | Boolean | If `true`, the response includes term frequency and document frequency. *(Default: `false`)* | +| `routing` | String | A custom routing value used to identify the shard. Required if custom routing was used during indexing. | +| `version` | Integer | The specific version of the document to retrieve. | +| `version_type` | String | The type of versioning to use. Valid values: `internal`, `external`, `external_gte`, `force`. | +| `filter`| Object | Allows filtering of tokens returned in the response (for example, by frequency or position). See [Filtering terms]({{site.url}}{{site.baseurl}}/api-reference/document-apis/termvector/#filtering-terms) for available options. | +| `per_field_analyzer` | Object | Specifies a custom analyzer to use per field. Format: `{ "field_name": "analyzer_name" }`. | +| `preference` | String | Specifies shard or node routing preferences. See [preference query parameter]({{site.url}}{{site.baseurl}}/api-reference/search-apis/search/#the-preference-query-parameter).| + +## Filtering terms + +The `filter` object in the request body allows you to filter the tokens to include in the term vector response. The `filter` object supports the following fields. + +| Field | Data type | Description | +| `max_num_terms` | Integer | The maximum number of terms to return. | +| `min_term_freq` | Integer | The minimum term frequency in the document required for a term to be included. | +| `max_term_freq` | Integer | The maximum term frequency in the document required for a term to be included. | +| `min_doc_freq` | Integer | The minimum document frequency across the index required for a term to be included. | +| `max_doc_freq` | Integer | The maximum document frequency across the index required for a term to be included. | +| `min_word_length` | Integer | The minimum length of the term to be included. | +| `max_word_length` | Integer | The maximum length of the term to be included. | + +## Example + +Create an index: + +```json +PUT /my-index +{ + "mappings": { + "properties": { + "text": { + "type": "text", + "term_vector": "with_positions_offsets_payloads" + } + } + } +} +``` +{% include copy-curl.html %} + +Index the document: + +```json +POST /my-index/_doc/1 +{ + "text": "OpenSearch is a search engine." 
+} +``` +{% include copy-curl.html %} + +### Example request + +Retrieve the term vectors: + +```json +GET /my-index/_termvectors/1 +{ + "fields": ["text"], + "term_statistics": true +} +``` +{% include copy-curl.html %} + +Alternatively, you can provide `fields` and `term_statistics` as query parameters: + +```json +GET /my-index/_termvectors/1?fields=text&term_statistics=true +``` +{% include copy-curl.html %} + +### Example response + +The response displays term vector information: + +```json +{ + "_index": "my-index", + "_id": "1", + "_version": 1, + "found": true, + "took": 1, + "term_vectors": { + "text": { + "field_statistics": { + "sum_doc_freq": 5, + "doc_count": 1, + "sum_ttf": 5 + }, + "terms": { + "a": { + "doc_freq": 1, + "ttf": 1, + "term_freq": 1, + "tokens": [ + { + "position": 2, + "start_offset": 14, + "end_offset": 15 + } + ] + }, + "engine": { + "doc_freq": 1, + "ttf": 1, + "term_freq": 1, + "tokens": [ + { + "position": 4, + "start_offset": 23, + "end_offset": 29 + } + ] + }, + "is": { + "doc_freq": 1, + "ttf": 1, + "term_freq": 1, + "tokens": [ + { + "position": 1, + "start_offset": 11, + "end_offset": 13 + } + ] + }, + "opensearch": { + "doc_freq": 1, + "ttf": 1, + "term_freq": 1, + "tokens": [ + { + "position": 0, + "start_offset": 0, + "end_offset": 10 + } + ] + }, + "search": { + "doc_freq": 1, + "ttf": 1, + "term_freq": 1, + "tokens": [ + { + "position": 3, + "start_offset": 16, + "end_offset": 22 + } + ] + } + } + } + } +} +``` + +## Response body fields + +The following table lists all response body fields. + +| Field | Data type | Description | +| `term_vectors` | Object | Contains term vector data for each specified field. | +| `term_vectors.text` | Object | Contains term vector details for the `text` field. | +| `term_vectors.text.field_statistics` | Object | Contains statistics for the entire field. Present only if `field_statistics` is `true`. | +| `term_vectors.text.field_statistics.doc_count` | Integer | The number of documents that contain at least one term in the specified field. | +| `term_vectors.text.field_statistics.sum_doc_freq` | Integer | The sum of document frequencies for all terms in the field. | +| `term_vectors.text.field_statistics.sum_ttf` | Integer | The sum of total term frequencies (including repetitions) for all terms in the field. | +| `term_vectors.text.terms` | Object | A map, in which each key is a term and each value contains details about that term. | +| `term_vectors.text.terms..term_freq` | Integer | The number of times the term appears in the document. | +| `term_vectors.text.terms..doc_freq` | Integer | The number of documents containing the term. Present only if `term_statistics` is `true`. | +| `term_vectors.text.terms..ttf` | Integer | The total term frequency across all documents. Present only if `term_statistics` is `true`. | +| `term_vectors.text.terms..tokens` | Array | A list of token objects providing information about individual term instances. | +| `term_vectors.text.terms..tokens[].position` | Integer | The position of the token within the text. Present only if `positions` is `true`. | +| `term_vectors.text.terms..tokens[].start_offset` | Integer | The start character offset of the token. Present only if `offsets` is `true`. | +| `term_vectors.text.terms..tokens[].end_offset` | Integer | The end character offset of the token. Present only if `offsets` is `true`. | +| `term_vectors.text.terms..tokens[].payload` | String (Base64) | Optional payload data associated with the token. 
Present only if `payloads` is `true` and available. | diff --git a/_api-reference/document-apis/update-by-query.md b/_api-reference/document-apis/update-by-query.md index 64df8c901b4..722b0501473 100644 --- a/_api-reference/document-apis/update-by-query.md +++ b/_api-reference/document-apis/update-by-query.md @@ -14,7 +14,7 @@ redirect_from: You can include a query and a script as part of your update request so OpenSearch can run the script to update all of the documents that match the query. -## Path and HTTP methods +## Endpoints ```json POST , /_update_by_query @@ -60,7 +60,7 @@ _source | String | Whether to include the `_source` field in the response. _source_excludes | String | A comma-separated list of source fields to exclude from the response. _source_includes | String | A comma-separated list of source fields to include in the response. stats | String | Value to associate with the request for additional logging. -terminate_after | Integer | The maximum number of documents OpenSearch should process before terminating the request. +terminate_after | Integer | The maximum number of matching documents (hits) OpenSearch should process before terminating the request. timeout | Time | How long the operation should wait from a response from active shards. Default is `1m`. version | Boolean | Whether to include the document version as a match. wait_for_active_shards | String | The number of shards that must be active before OpenSearch executes the operation. Valid values are `all` or any integer up to the total number of shards in the index. Default is 1, which is the primary shard. diff --git a/_api-reference/document-apis/update-document.md b/_api-reference/document-apis/update-document.md index ff17940cdbc..33c20936a0e 100644 --- a/_api-reference/document-apis/update-document.md +++ b/_api-reference/document-apis/update-document.md @@ -14,7 +14,15 @@ redirect_from: If you need to update a document's fields in your index, you can use the update document API operation. You can do so by specifying the new data you want to be in your index or by including a script in your request body, which OpenSearch runs to update the document. By default, the update operation only updates a document that exists in the index. If a document does not exist, the API returns an error. To _upsert_ a document (update the document that exists or index a new one), use the [upsert](#using-the-upsert-operation) operation. -## Path and HTTP methods +You cannot explicitly specify an ingest pipeline when calling the Update Document API. If a `default_pipeline` or `final_pipeline` is defined in your index, the following behavior applies: + +- **Upsert operations**: When indexing a new document, the `default_pipeline` and `final_pipeline` defined in the index are executed as specified. +- **Update operations**: When updating an existing document, ingest pipeline execution is not recommended because it may produce erroneous results. Support for running ingest pipelines during update operations is deprecated and will be removed in version 3.0.0. 
If your index has a defined ingest pipeline, the update document operation will return the following deprecation warning: +``` +the index [sample-index1] has a default ingest pipeline or a final ingest pipeline, the support of the ingest pipelines for update operation causes unexpected result and will be removed in 3.0.0 +``` + +## Endpoints ```json POST //_update/<_id> diff --git a/_api-reference/grpc-apis/bulk.md b/_api-reference/grpc-apis/bulk.md new file mode 100644 index 00000000000..66616cecf71 --- /dev/null +++ b/_api-reference/grpc-apis/bulk.md @@ -0,0 +1,490 @@ +--- +layout: default +title: Bulk (gRPC) +parent: gRPC APIs +nav_order: 20 +--- + +# Bulk (gRPC) +**Introduced 3.0** +{: .label .label-purple } + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/16787). +{: .warning} + +The gRPC Bulk API provides an efficient, binary-encoded alternative to the [HTTP Bulk API]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/) for performing multiple document operations—such as indexing, updating, and deleting—in a single call. This service uses protocol buffers and mirrors the REST API in terms of parameters and structure. + +## Prerequisite + +To submit gRPC requests, you must have a set of protobufs on the client side. For ways to obtain the protobufs, see [Using gRPC APIs]({{site.url}}{{site.baseurl}}/api-reference/grpc-apis/index/#using-grpc-apis). + +## gRPC service and method + +gRPC Document APIs reside in the [DocumentService](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/services/document_service.proto#L23C12-L23C23). + +You can submit bulk requests by invoking the [`Bulk`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/services/document_service.proto#L23) gRPC method within the `DocumentService`. The method takes in a [`BulkRequest`](#bulkrequest-fields) and returns a [`BulkResponse`](#bulkresponsebody-fields). + +## Document format + +In gRPC, documents must be provided and returned as bytes. Use Base64 encoding to provide documents in a gRPC request. +{: .note } + +For example, consider the following document in a regular Bulk API request: + +```json +"doc": "{\"title\": \"Inception\", \"year\": 2010}" +``` + +For a gRPC Bulk API request, provide the same document in Base64 encoding: + +```json +"doc": "eyJ0aXRsZSI6ICJJbmNlcHRpb24iLCAieWVhciI6IDIwMTB9" +``` + +## BulkRequest fields + +The [`BulkRequest`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/document.proto#L16) message is the top-level container for a gRPC bulk operation. It accepts the following fields. + +| Field | Protobuf type | Description | +| :---- | :---- | :---- | +| `request_body` | `repeated `[`BulkRequestBody`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/document.proto#L53) | The list of bulk operations (`index`/`create`/`update`/`delete`). Required. | +| `index` | `string` | The default index for all operations unless overridden in `request_body`. Specifying the `index` in the `BulkRequest` means that you don't need to include it in the [BulkRequestBody](#bulkrequestbody-fields). Optional. 
| +| `source` | [`SourceConfigParam`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L154) | Controls whether to return the full `_source`, no `_source`, or only specific fields from `_source` in the response. Optional. | +| `source_excludes` | `repeated string` | Fields to exclude from `source`. Optional. | +| `source_includes` | `repeated string` | Fields to include from `source`. Optional. | +| `pipeline` | `string` | The preprocessing ingest pipeline ID. Optional. | +| `refresh` | [`Refresh`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/document.proto#L307) | Whether to refresh shards after indexing. Optional. | +| `require_alias` | `bool` | If `true`, actions must target an alias. Optional. | +| `routing` | `string` | The routing value for shard assignment. Optional. | +| `timeout` | `string` | The timeout duration (for example, `1m`). Optional. | +| `type` (Deprecated) | `string` | The document type (always `_doc`). Optional. | +| `wait_for_active_shards` | [`WaitForActiveShards`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L13) | The minimum number of active shards to wait for. Optional. | + + +## BulkRequestBody fields + +The [`BulkRequestBody`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/document.proto#L53) message represents a single document-level operation within a `BulkRequest`. It accepts the following fields. All fields are optional, but exactly one of `index`, `create`, `update`, or `delete` must be set in the `BulkRequestBody`. + +| Field | Protobuf type | Description | +| :---- | :---- | :---- | +| `index` | [`IndexOperation`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/document.proto#L107) | Index a document. Replaces the document if it already exists. Optional. | +| `create` | [`CreateOperation`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/document.proto#L141) | Create a new document. Fails if the document already exists. Optional. | +| `update` | [`UpdateOperation`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/document.proto#L171) | Partially update a document or use upsert/script options. Optional. | +| `delete` | [`DeleteOperation`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/document.proto#L190) | Delete a document by ID. Optional. | +| `detect_noop` | `bool` | If `true`, skips the update if the document content hasn't changed. Optional. Default is `true`. | +| `doc` | `bytes` | Partial or full document data for `update` or `index` operations. Optional. | +| `doc_as_upsert` | `bool` | If `true`, treats the document as the full upsert document if the target document doesn't exist. Only valid for the `update` operation. Optional. | +| `script` | [`Script`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L27) | A script to apply to the document (used with `update`). Optional. | +| `scripted_upsert` | `bool` | If `true`, executes the script whether or not the document exists. Optional. | +| `source` | [`SourceConfig`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L176) | Controls how the document source is fetched or filtered. Optional. | +| `upsert` | `bytes` | The full document to use if the target does not exist. 
Used with `script`. Optional. | +| `object` | `bytes` | The full document content used with `create`. Optional. | + + +### Create + +`CreateOperation` adds a new document only if it doesn't already exist. + +The document itself must be provided in the `object` field, outside of the `CreateOperation` message. + +The following optional fields can also be provided. + +| Field | Protobuf type | Description | +| ----- | ----- | ----- | +| `id` | `string` | The document ID. If omitted, one is auto-generated. Optional. | +| `index` | `string` | The target index. Required if not set globally in the `BulkRequest`. Optional. | +| `routing` | `string` | A custom routing value used to control shard placement. Optional. | +| `if_primary_term` | `int64` | Used for concurrency control. The operation only runs if the document's primary term matches this value. Optional. | +| `if_seq_no` | `int64` | Used for concurrency control. The operation only runs if the document's primary term matches this value. Optional. | +| `version` | `int64` | The explicit document version for concurrency control. Optional. | +| `version_type` | [`VersionType`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/document.proto#L99) | Controls version matching behavior. Optional. | +| `pipeline` | `string` | The preprocessing ingest pipeline ID. Optional. | +| `require_alias` | `bool` | Enforces the use of index aliases only. Optional. | + +#### Example request + +The following example shows a bulk request with a `create` operation. It creates a document with the ID `tt1375666` in the `movies` index. The document content, provided in Base64 encoding, represents `{"title": "Inception", "year": 2010}`: + +```json +{ + "index": "movies", + "request_body": [ + { + "create": { + "index": "movies", + "id": "tt1375666" + }, + "object": "eyJ0aXRsZSI6ICJJbmNlcHRpb24iLCAieWVhciI6IDIwMTB9" + } + ] +} +``` + +### Delete + +The `DeleteOperation` removes a document by ID. It accepts the following fields. + +| Field | Protobuf type | Description | +| ----- | ----- | ----- | +| `id` | `string` | The ID of the document to delete. Required. | +| `index` | `string` | The target index. Required if not set globally in the `BulkRequest`. Optional. | +| `routing` | `string` | A custom routing value used to control shard placement. Optional. | +| `if_primary_term` | `int64` | Used for concurrency control. The operation only runs if the document's primary term matches this value. Optional. | +| `if_seq_no` | `int64` | Used for concurrency control. The operation only runs if the document's primary term matches this value. Optional. | +| `version` | `int64` | The explicit document version for concurrency control. Optional. | +| `version_type` | [`VersionType`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/document.proto#L99) | Controls version matching behavior. Optional. | + + +#### Example request + +The following example shows a bulk request with a `delete` operation. It deletes a document with the ID `tt1392214` from the `movies` index: + +```json +{ + "index": "movies", + "request_body": [ + { + "delete": { + "index": "movies", + "id": "tt1392214" + } + } + ] +} +``` +{% include copy.html %} + +### Index + +The `IndexOperation` creates or overwrites a document. If an ID is not provided, one is generated. + +The document itself is provided in the `doc` field, outside of the `IndexOperation` message. + +The following optional fields can also be provided. 
+ + +| Field | Protobuf type | Description | +| ----- | ----- | ----- | +| `id` | `string` | The document ID. If omitted, one is auto-generated. Optional. | +| `index` | `string` | The target index. Required only if not set globally in the `BulkRequest`. | +| `routing` | `string` | A custom routing value used to control shard placement. Optional. | +| `if_primary_term` | `int64` | Used for concurrency control. The operation only runs if the document's primary term matches this value. Optional. | +| `if_seq_no` | `int64` | Used for concurrency control. The operation only runs if the document's primary term matches this value. Optional. | +| `op_type` | [`OpType`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/document.proto#L91) | The operation type. Controls the overwriting behavior. Valid values are `index` (default) and `create`. Optional. | +| `version` | `int64` | The explicit document version for concurrency control. Optional. | +| `version_type` | [`VersionType`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/document.proto#L99) | Controls version matching behavior. Optional. | +| `pipeline` | `string` | The preprocessing ingest pipeline ID. Optional. | +| `require_alias` | `bool` | Enforces the use of index aliases only. Optional. | + + +#### Example request + +The following example shows a bulk request with an `index` operation. It indexes a Base64-encoded document with the ID `tt0468569` into the `movies` index: + +```json +{ + "index": "movies", + "request_body": [ + { + "index": { + "index": "movies", + "id": "tt0468569" + }, + "doc": "eyJ0aXRsZSI6ICJUaGUgRGFyayBLbmlnaHQiLCAieWVhciI6IDIwMDh9" + } + ] +} +``` +{% include copy.html %} + +### Update + +The `UpdateOperation` performs partial document updates. + +The document itself is provided in the `doc` field, outside of the `UpdateOperation` message. + +All `UpdateOperation` fields, listed in the following table, are optional except for `id`. + +| Field | Protobuf type | Description | +| ----- | ----- | ----- | +| `id` | `string` | The ID of the document to update. Required. | +| `index` | `string` | The target index. Required if not set globally in the `BulkRequest`. Optional. | +| `routing` | `string` | A custom routing value used to control shard placement. Optional. | +| `if_primary_term` | `int64` | Used for concurrency control. The operation only runs if the document's primary term matches this value. Optional. | +| `if_seq_no` | `int64` | Used for concurrency control. The operation only runs if the document's primary term matches this value. Optional. | +| `require_alias` | `bool` | Enforces the use of index aliases only. Optional. | +| `retry_on_conflict` | `int32` | The number of times to retry the operation if a version conflict occurs. Optional. | + + +#### Example request + +The following example shows a bulk request with an `update` operation. It will update a document with the ID `tt1375666` in the `movies` index to `{"year": 2011}`: + +```json +{ + "index": "movies", + "request_body": [ + { + "update": { + "index": "movies", + "id": "tt1375666" + }, + "doc": "eyJ5ZWFyIjogMjAxMX0=", + "detect_noop": true + } + ] +} +``` +{% include copy.html %} + +### Upsert + +The `upsert` operation updates the document if it already exists. Otherwise, it creates a new document using the provided document content. + +To upsert a document, provide an `UpdateOperation` but specify `doc_as_upsert` as `true`. 
The document to be upserted should be provided in the `doc_as_upsert` field outside of the `UpdateOperation`. + + +#### Example request + +The following example shows a bulk request with an `upsert` operation. It updates the `year` field of the document with ID `tt1375666` in the `movies` index to `{"year": 2012}`: + +```json +{ + "index": "movies", + "request_body": [ + { + "update": { + "index": "movies", + "id": "tt1375666" + }, + "doc": "eyJ5ZWFyIjogMjAxMn0=", + "doc_as_upsert": true + } + ] +} +``` +{% include copy.html %} + +### Script + +Run a stored or inline script to modify a document. + + +To specify a script, provide an `UpdateOperation` and a `script` field outside of the `UpdateOperation`. + +#### Example request + +The following example shows a bulk request with a `script` operation. It increments the `year` field of the document with the ID `tt1375666` in the `movies` index by 1: + +```json +{ + "index": "movies", + "request_body": [ + { + "update": { + "index": "movies", + "id": "tt1375666" + }, + "script": { + "source": "ctx._source.year += 1", + "lang": "painless" + } + } + ] +} +``` +{% include copy.html %} + + +## Response fields + +The gRPC Bulk API provides the following response fields. + +### BulkResponseBody fields + +The [`BulkResponse`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/document.proto#L211) message wraps either a `BulkResponseBody` for successful requests, or a `BulkErrorResponse` for failed requests. The `BulkResponseBody` provides a summary and per-item result of a bulk operation and contains the following fields. + +| Field | Protobuf type | Description | +| ----- | ----- | ----- | +| `errors` | `bool` | Indicates whether any of the operations in the bulk request failed. If any operation fails, the response's `errors` field will be `true`. You can iterate over the individual `Item` actions for more detailed information.| +| `items` | `repeated Item` | The result of all operations in the bulk request, in the order they were submitted. | +| `took` | `int64` | The amount of time taken to process the bulk request, in milliseconds. | +| `ingest_took` | `int64` | The amount of time taken to process documents through an ingest pipeline, in milliseconds. | + + +### Item fields + +Each `Item` in the response corresponds to a single operation in the request. For each operation, only one of the following fields is provided. + +| Field | Protobuf type | Description | +| ----- | ----- | ----- | +| `create` | `ResponseItem` | The result of the `CreateOperation`. | +| `delete` | `ResponseItem` | The result of the `DeleteOperation`. | +| `index` | `ResponseItem` | The result of the `IndexOperation`. | +| `update` | `ResponseItem` | The result of the `UpdateOperation`. | + + +### ResponseItem fields + +Each `ResponseItem` corresponds to a single operation in the request. It contains the following fields. + +| Field | Protobuf type | Description | +|------------------|---------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------| +| `type` | `string` | The document type. | +| `id` | [`ResponseItem.Id`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/document.proto#L254) | The document ID associated with the operation. Can be `null`. | +| `index` | `string` | The name of the index associated with the operation. If a data stream was targeted, this is the backing index. 
| +| `status` | `int32` | The HTTP status code returned for the operation. *(Note: This field may be replaced with a gRPC code in the future.)* | +| `error` | [`ErrorCause`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L239) | Contains additional information about a failed operation. | +| `primary_term` | `int64` | The primary term assigned to the document. | +| `result` | `string` | The operation result. Valid values are `created`, `deleted`, and `updated`. | +| `seq_no` | `int64` | A sequence number assigned to the document to maintain version order. | +| `shards` | [`ShardInfo`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L286) | Shard information for the operation (only returned for successful actions). | +| `version` | `int64` | The document version (only returned for successful actions). | +| `forced_refresh` | `bool` | If `true`, forces the document to become visible immediately after the operation. | +| `get` | [`InlineGetDictUserDefined`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/document.proto#L290) | Contains the document `source` returned from an inline get, if requested. | + +## Example response + +```json +{ + "bulkResponseBody": { + "errors": false, + "items": [ + { + "index": { + "id": { + "string": "2" + }, + "index": "my_index", + "status": 201, + "primaryTerm": "1", + "result": "created", + "seqNo": "0", + "shards": { + "successful": 1, + "total": 2 + }, + "version": "1", + "forcedRefresh": true + } + }, + { + "create": { + "id": { + "string": "1" + }, + "index": "my_index", + "status": 201, + "primaryTerm": "1", + "result": "created", + "seqNo": "0", + "shards": { + "successful": 1, + "total": 2 + }, + "version": "1", + "forcedRefresh": true + } + }, + { + "update": { + "id": { + "string": "2" + }, + "index": "my_index", + "status": 200, + "primaryTerm": "1", + "result": "updated", + "seqNo": "1", + "shards": { + "successful": 1, + "total": 2 + }, + "version": "2", + "forcedRefresh": true, + "get": { + "found": true, + "seqNo": "1", + "primaryTerm": "1", + "source": "e30=" + } + } + }, + { + "delete": { + "id": { + "string": "2" + }, + "index": "my_index", + "status": 200, + "primaryTerm": "1", + "result": "deleted", + "seqNo": "2", + "shards": { + "successful": 1, + "total": 2 + }, + "version": "3", + "forcedRefresh": true + } + } + ], + "took": "87", + "ingestTook": "0" + } +} +``` +{% include copy.html %} + + +## Java gRPC client example + +The following example shows a Java client-side program that submits a sample bulk gRPC request and then checks whether there were any errors in the bulk response: + +```java +import org.opensearch.protobufs.*; +import io.grpc.ManagedChannel; +import io.grpc.ManagedChannelBuilder; +import com.google.protobuf.ByteString; + +public class BulkClient { + public static void main(String[] args) { + ManagedChannel channel = ManagedChannelBuilder.forAddress("localhost", 9400) + .usePlaintext() + .build(); + + DocumentServiceGrpc.DocumentServiceBlockingStub stub = DocumentServiceGrpc.newBlockingStub(channel); + + IndexOperation indexOp = IndexOperation.newBuilder() + .setIndex("my-index") + .setId("1") + .build(); + + BulkRequestBody indexBody = BulkRequestBody.newBuilder() + .setIndex(indexOp) + .setDoc(ByteString.copyFromUtf8("{\"field\": \"value\"}")) + .build(); + + DeleteOperation deleteOp = DeleteOperation.newBuilder() + .setIndex("my-index") + .setId("2") + .build(); + + BulkRequestBody 
deleteBody = BulkRequestBody.newBuilder() + .setDelete(deleteOp) + .build(); + + BulkRequest request = BulkRequest.newBuilder() + .setIndex("my-index") + .addRequestBody(indexBody) + .addRequestBody(deleteBody) + .build(); + + BulkResponse response = stub.bulk(request); + System.out.println("Bulk errors: " + response.getErrors()); + + channel.shutdown(); + } +} +``` +{% include copy.html %} diff --git a/_api-reference/grpc-apis/index.md b/_api-reference/grpc-apis/index.md new file mode 100644 index 00000000000..8f6dd697459 --- /dev/null +++ b/_api-reference/grpc-apis/index.md @@ -0,0 +1,78 @@ +--- +layout: default +title: gRPC APIs +has_children: true +has_toc: false +nav_order: 25 +redirect_from: + - /api-reference/grpc-apis/ +--- + +# gRPC APIs +**Introduced 3.0** +{: .label .label-purple } + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/16787). +{: .warning} + +The OpenSearch gRPC plugin provides an alternative, high-performance transport layer using [gRPC](https://grpc.io/) for communication with OpenSearch. It uses protocol buffers over gRPC for lower overhead and faster serialization. This reduces overhead, speeds up serialization, and improves request-side latency, based on initial benchmarking results. + +The primary goal of the gRPC plugin is to: + +* Offer a **binary-encoded** alternative to HTTP/REST-based communication. +* **Improve performance** for bulk workloads and large-scale ingestion scenarios. +* **Enable more efficient client integrations** across languages, like Java, Go, and Python, using native gRPC stubs. + +## Enabling the plugin + +To enable the gRPC plugin (`transport-grpc`) in OpenSearch: +1. Install the `transport-grpc` plugin. For more information, see [Installing plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/). + +1. Add the following settings to `opensearch.yml`: + ```yaml + aux.transport.types: [experimental-transport-grpc] + aux.transport.experimental-transport-grpc.port: '9400-9500' // optional + ``` + {% include copy.html %} + + Alternatively, configure a secure transport protocol using the following settings: + ```yaml + aux.transport.types: [experimental-secure-transport-grpc] + aux.transport.experimental-transport-grpc.port: '9400-9500' // optional + ``` + {% include copy.html %} + +1. Configure additional settings if needed (see [Advanced gRPC settings](#advanced-grpc-settings)): + ```yaml + grpc.host: localhost + grpc.publish_host: 10.74.124.163 + grpc.bind_host: 0.0.0.0 + ``` + {% include copy.html %} + + +## Advanced gRPC settings + +OpenSearch supports the following advanced network settings for gRPC communication: + +- `grpc.host` (Static, list): Sets the address of an OpenSearch node for gRPC communication. The `grpc.host` setting is a combination of `grpc.bind_host` and `grpc.publish_host` if they are the same value. An alternative to `grpc.host` is to configure `grpc.bind_host` and `grpc.publish_host` separately, as needed. + +- `grpc.bind_host` (Static, list): Specifies an address or addresses to which an OpenSearch node binds to listen for incoming gRPC connections. + +- `grpc.publish_host` (Static, list): Specifies an address or addresses that an OpenSearch node publishes to other nodes for gRPC communication. 
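
For example, the following minimal `opensearch.yml` sketch sets the bind and publish addresses separately instead of using `grpc.host`; the addresses shown are illustrative placeholders reused from the earlier example:

```yaml
# Listen on all local interfaces for incoming gRPC connections.
grpc.bind_host: 0.0.0.0
# Advertise a routable address to the other nodes in the cluster.
grpc.publish_host: 10.74.124.163
```
{% include copy.html %}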
+ +These settings are similar to the [HTTP Network settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/network-settings/#advanced-http-settings). + +## Using gRPC APIs + +To submit gRPC requests, you must have a set of protobufs on the client side. You can obtain the protobufs in the following ways: + +- **Raw protobufs**: Download the raw protobuf schema from the [OpenSearch Protobufs GitHub repository (v0.3.0)](https://github.com/opensearch-project/opensearch-protobufs). You can then generate client-side code using the protocol buffer compilers for the [supported languages](https://grpc.io/docs/languages/). +- **Java client-side programs only**: Download the `opensearch-protobufs` jar from the [Maven Central repository](https://repo1.maven.org/maven2/org/opensearch/protobufs/0.3.0). + +## Supported APIs + +This feature is currently under development and supports the following APIs: + +- [Bulk]({{site.url}}{{site.baseurl}}/api-reference/grpc-apis/bulk/) +- [Search]({{site.url}}{{site.baseurl}}/api-reference/grpc-apis/search/) (for select query types) diff --git a/_api-reference/grpc-apis/search.md b/_api-reference/grpc-apis/search.md new file mode 100644 index 00000000000..7290afc0f49 --- /dev/null +++ b/_api-reference/grpc-apis/search.md @@ -0,0 +1,426 @@ +--- +layout: default +title: Search (gRPC) +parent: gRPC APIs +nav_order: 20 +--- + +# Search (gRPC) +**Introduced 3.0** +{: .label .label-purple } + + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/16787). +{: .warning} + +The gRPC Search API provides a performant, binary interface for running [queries]({{site.url}}{{site.baseurl}}/api-reference/search/) using protocol buffers over gRPC. It mirrors the capabilities of the HTTP Search API while benefiting from protobuf-typed contracts and gRPC transport. The gRPC APIs are ideal for low-latency, high-throughput applications. + +## Prerequisite + +To submit gRPC requests, you must have a set of protobufs on the client side. For ways to obtain the protobufs, see [Using gRPC APIs]({{site.url}}{{site.baseurl}}/api-reference/grpc-apis/index/#using-grpc-apis). + +## gRPC service and method + +gRPC Document APIs reside in the [`SearchService`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/services/search_service.proto#L22). + +You can submit search requests by invoking the [`Search`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/services/document_service.proto#L23) gRPC method within the `SearchService`. The method takes in a [`SearchRequest`](#searchrequest-fields) and returns a [`SearchResponse`](#searchresponse-fields). + +Currently, only the following basic queries are supported: [`match_all`](#match-all-query), [`term`](#term-query), +[`terms`](#terms-query), and [`match_none`](#match-none-query). Additional query types will be supported in future versions. +{: .note} + +## Request fields + +The gRPC Search API supports the following request fields. + +### SearchRequest fields + +The [`SearchRequest`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/search.proto#L18) message accepts the following fields. All fields are optional. 
+ +| Field | Protobuf type | Description | +| :---- | :---- | :---- | +| `index` | `repeated string` | A list of indexes to search. If not provided, defaults to all indexes. | +| `source` | [`SourceConfigParam`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L154) | Controls whether to return the full `_source`, no `_source`, or only specific fields from `_source` in the response. | +| `source_excludes` | `repeated string` | Fields to exclude from `_source`. Ignored if `source` is `false`. | +| `source_includes` | `repeated string` | Fields to include in `_source`. Ignored if `source` is `false`. | +| `allow_no_indices` | `bool` | Whether to ignore wildcards that match no indexes. Default is `true`. | +| `allow_partial_search_results` | `bool` | Whether to return partial results upon an error or timeout. Default is `true`. | +| `analyze_wildcard` | `bool` | Whether to analyze wildcard/prefix queries. Default is `false`. | +| `analyzer` | `string` | The analyzer to use with the `q` query string. | +| `batched_reduce_size` | `int32` | The number of shards to reduce on a node. Default is `512`. | +| `cancel_after_time_interval` | `string` | The time after which the request will be canceled. Default is `-1`. | +| `ccs_minimize_roundtrips` | `bool` | Whether to minimize round trips between the node and remote clusters. Default is `true`. | +| `default_operator` | [`Operator`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/search.proto#L43) | The default operator for query strings. Valid values are `AND` or `OR`. Default is `OR`. | +| `df` | `string` | The default field for query strings without field prefixes. | +| `docvalue_fields` | `repeated string` | The fields to return as doc values. | +| `expand_wildcards` | `repeated` [`ExpandWildcard`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/search.proto#L56) | Specifies the type of index that wildcard expressions can match. Valid values are `all` (match any index), `open` (match open, non-hidden indexes), `closed` (match closed, non-hidden indexes), `hidden` (match hidden indexes), and `none` (deny wildcard expressions). Default is `open`.| +| `explain` | `bool` | Whether to return document score computation details. Default is `false`. | +| `from` | `int32` | The starting index for paginated results. Default is `0`. | +| `ignore_throttled` | `bool` | Whether to ignore frozen indexes when resolving aliases. Default is `true`. | +| `ignore_unavailable` | `bool` | Whether to ignore unavailable indexes or shards. Default is `false`. | +| `include_named_queries_score` | `bool` | Whether to include scores for named queries. Default is `false`. | +| `lenient` | `bool` | Whether to accept format errors in queries. Default is `false`. | +| `max_concurrent_shard_requests` | `int32` | The number of concurrent shard requests per node. Default is `5`. | +| `phase_took` | `bool` | Whether to return phase-level `took` values. Default is `false`. | +| `pre_filter_shard_size` | `int32` | The threshold at which to trigger prefiltering by shard size. Default is `128`. | +| `preference` | `string` | The shard or node preferences for query execution. | +| `q` | `string` | The query string in [Lucene syntax]({{site.url}}{{site.baseurl}}/query-dsl/full-text/query-string/#query-string-syntax). | +| `request_cache` | `bool` | Whether to use the request cache. Defaults to the index's settings. 
| +| `rest_total_hits_as_int` | `bool` | Whether to return the number of total hits as an integer. Default is `false`. | +| `routing` | `repeated string` | The routing values used to direct requests to specific shards. | +| `scroll` | `string` | The amount of time to keep the search context alive for scrolling. | +| `search_pipeline` | `string` | The name of the search pipeline to use. | +| `search_type` | [`SearchType`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/search.proto#L109) | The method for calculating relevance scores. Valid values are `QUERY_THEN_FETCH` and `DFS_QUERY_THEN_FETCH`. Default is `QUERY_THEN_FETCH`. | +| `seq_no_primary_term` | `bool` | Whether to return the sequence number and primary term. | +| `size` | `int32` | The number of results to return. | +| `sort` | `repeated` [`SortOrder`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/search.proto#L122) | The fields and directions by which to sort the results. | +| `stats` | `repeated string` | The tags to associate with the request for logging. | +| `stored_fields` | `repeated string` | A list of stored fields to include in the response. | +| `suggest_field` | `string` | The field on which to base suggestions. | +| `suggest_mode` | [`SuggestMode`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/search.proto#L145) | The suggestion mode (for example, `always`, `missing`, `popular`). | +| `suggest_size` | `int32` | The number of suggestions to return. | +| `suggest_text` | `string` | The input text for generating suggestions. | +| `terminate_after` | `int32` | The maximum number of matching documents (hits) to process before early termination. Default is `0`. | +| `timeout` | `string` | The maximum amount of time to wait for query execution. Default is `1m`. | +| `track_scores` | `bool` | Whether to return document scores. Default is `false`. | +| `track_total_hits` | [`TrackHits`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/search.proto#L309) | Whether to include total hits metadata. | +| `typed_keys` | `bool` | Whether to include type information in aggregation and suggestion keys. Default is `true`. | +| `verbose_pipeline` | `bool` | Whether to enable verbose mode for the search pipeline. | +| `version` | `bool` | Whether to return the document version in the response. | +| `request_body` | [`SearchRequestBody`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/search.proto#L176) | The main search request payload, including the query and filters. | + +### SearchRequestBody fields + +The `SearchRequestBody` message accepts the following fields. All fields are optional. + +| Field | Protobuf type | Description | +| :---- | :---- | :---- | +| `collapse` | [`FieldCollapse`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L975) | Groups the results by a field. Returns only the top document per group. | +| `explain` | `bool` | Returns scoring explanations for matched documents. | +| `ext` | [`ObjectMap`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L76) | Plugin-specific metadata, for example, for extensions like RAG. | +| `from` | `int32` | The starting index for paginated results. Default is `0`. 
| +| `highlight` | [`Highlight`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L585) | Highlights matched terms in the result snippets. | +| `track_total_hits` | [`TrackHits`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/search.proto#L309) | Whether to return the total hit count. | +| `indices_boost` | `repeated` [`NumberMap`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/search.proto#L747) | Per-index boost multipliers in the `: ` format. | +| `docvalue_fields` | `repeated` [`FieldAndFormat`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L962) | The fields returned using doc values, optionally formatted. | +| `min_score` | float | The minimum score required in order for a document to be included in the results. | +| `post_filter` | [`QueryContainer`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L342) | Filters hits after aggregations are applied. | +| `profile` | `bool` | Enables profiling to analyze query performance. | +| `search_pipeline` | `string` | The name of the search pipeline to apply. | +| `verbose_pipeline` | `bool` | Enables verbose logging in the search pipeline. | +| `query` | [`QueryContainer`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L342) | The query domain-specific language (DSL) for the search. | +| `rescore` | `repeated` [`Rescore`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/search.proto#L631) | Reranks the top N hits to improve precision. | +| `script_fields` | `map` | Custom fields whose values are computed by scripts. | +| `search_after` | `repeated` [`FieldValue`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L2002) | Cursor-based pagination using values from the previous page. | +| `size` | `int32` | The number of results to return. Default is `10`. | +| `slice` | [`SlicedScroll`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/search.proto#L641) | Split scroll context into slices for parallel processing. | +| `sort` | `repeated` [`SortCombinations`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L814) | The sorting rules (for example, by field, score, or custom order). | +| `source` | [`SourceConfig`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L176) | Controls whether to return the full `_source`, no `_source`, or only specific fields from `_source` in the response. | +| `fields` | `repeated` [`FieldAndFormat`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L962) | Additional fields to return, with formatting options. | +| `suggest` | [`Suggester`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/search.proto#L654) | Suggestion queries for autocomplete or corrections. | +| `terminate_after` | `int32` | The maximum number of matching documents (hits) to process before early termination. Default is `0`. | +| `timeout` | `string` | The maximum amount of time to wait for query execution. | +| `track_scores` | `bool` | Whether to return document scores in the results. | +| `include_named_queries_score` | `bool` | Whether to include scores for named queries. 
| +| `version` | `bool` | Whether to include the document version in the response. | +| `seq_no_primary_term` | `bool` | Whether to include the sequence number and primary term for each hit. | +| `stored_fields` | `repeated string` | The stored fields to return (excludes `_source` unless re-enabled). | +| `pit` | [`PointInTimeReference`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/search.proto#L752) | The Point in Time reference used to search a fixed snapshot. | +| `stats` | `repeated string` | The tagging or logging fields to associate with the request. | +| `derived` | `map` | Dynamically computed fields returned in the response. | + + +### QueryContainer fields + +`QueryContainer` is the entry point for all supported query types. + +**Exactly one** of the following fields must be provided in each `QueryContainer` message. + +Note that some query types are currently unsupported. Currently, only [`match_all`](#match-all-query), [`term`](#term-query), [`terms`](#terms-query), and [`match_none`](#match-none-query) are supported. +{: .note} + +| Field | Protobuf type | Description | +| :---- | :------------- | :---------- | +| `bool` | [`BoolQuery`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L1290) | A Boolean query that combines multiple clauses using `AND`/`OR`/`NOT` logic. Must be the only field set. | +| `boosting` | [`BoostingQuery`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L1322) | Boosts the results matching a positive query and demotes the results matching a negative query. Must be the only field set. | +| `constant_score` | [`ConstantScoreQuery`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L1338) | Wraps a filter and assigns a constant relevance score to all matching documents. Must be the only field set. | +| `dis_max` | [`DisMaxQuery`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L1349) | Returns documents matching any clause. Uses the highest score if multiple clauses match. Must be the only field set. | +| `function_score` | [`FunctionScoreQuery`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L1364) | Adjusts the scores of results using custom functions. Must be the only field set. | +| `exists` | [`ExistsQuery`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L1007) | Matches documents that contain a specific field. Must be the only field set. | +| `fuzzy` | `map` | Matches terms similar to the search term (fuzzy matching). Only one entry is allowed. Must be the only field set. | +| `ids` | [`IdsQuery`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L2011) | Matches documents by `_id` values. Must be the only field set. | +| `prefix` | `map` | Matches terms with a specific prefix. Only one entry is allowed. Must be the only field set. | +| `range` | `map` | Matches terms within a specified range. Only one entry is allowed. Must be the only field set. | +| `regexp` | `map` | Matches terms using regular expressions. Only one entry is allowed. Must be the only field set. | +| `term` | `map` | Matches exact terms (no analysis). Only one entry is allowed. Must be the only field set. 
| +| `terms` | [`TermsQueryField`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L1607) | Matches any document containing one or more specified terms in a field. Must be the only field set. | +| `terms_set` | `map` | Matches documents containing a minimum number of exact terms in a field. Only one entry is allowed. Must be the only field set. | +| `wildcard` | `map` | Matches terms using a wildcard pattern. Only one entry is allowed. Must be the only field set. | +| `match` | `map` | Full-text match on text or exact-value fields. Only one entry is allowed. Must be the only field set. | +| `match_bool_prefix` | `map` | Matches full words and prefixes in a Boolean-style query. Only one entry is allowed. Must be the only field set. | +| `match_phrase` | `map` | Matches an exact phrase in order. Only one entry is allowed. Must be the only field set. | +| `match_phrase_prefix` | `map` | Matches a phrase in which the last term is treated as a prefix. Only one entry is allowed. Must be the only field set. | +| `multi_match` | [`MultiMatchQuery`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L2236) | Searches multiple fields using a single query string. Must be the only field set. | +| `query_string` | [`QueryStringQuery`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L1690) | Parses advanced queries written as a single string. Must be the only field set. | +| `simple_query_string` | [`SimpleQueryStringQuery`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L1690) | A less strict syntax alternative to `query_string`. Ignores invalid syntax. Must be the only field set. | +| `intervals` | `map` | Matches terms based on position/proximity. Only one entry is allowed. Must be the only field set. | +| `knn` | `map` | A k-NN query across vector fields. Only one entry is allowed. Must be the only field set. | +| `match_all` | [`MatchAllQuery`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L2068) | Matches all documents in the index. Must be the only field set. | +| `match_none` | [`MatchNoneQuery`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L2156) | Matches no documents. Must be the only field set. | +| `script_score` | [`ScriptScoreQuery`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L991) | Applies custom scoring using scripts. Must be the only field set. | +| `nested` | [`NestedQuery`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L499) | Wraps a query targeting nested fields. Must be the only field set. | + +## Supported queries + +The gRPC Search API supports the following queries. + +All of the following examples show valid request payloads that can be sent to the `SearchService/Search` gRPC method. + +### Match all query + +A `match_all` query returns all documents in the index. For example, the following request returns a maximum of 50 documents from the index: + +```json +{ + "request_body": { + "query": { + "match_all": {} + }, + "size": 50 + } +} +``` +{% include copy.html %} + +### Term query + +A `term` query matches a single field with a specific term. 
For example, the following query searches for titles containing the word `Rush`: + +```json +{ + "index": "my_index", + "request_body": { + "query": { + "term": { + "title": { + "value": { + "string_value": "Rush" + }, + "case_insensitive": true + } + } + } + } +} +``` +{% include copy.html %} + +### Terms query + +A `terms` query matches documents in which a specific field contains any value from a list. For example, the following query searches for lines with the IDs `61809` and `61810`: + +```json +{ + "request_body": { + "query": { + "terms": { + "terms_lookup_field_string_array_map": { + "line_id": { + "string_array": { + "string_array": [ + "61809", + "61810" + ] + } + } + } + } + } + } +} +``` +{% include copy.html %} + +### Terms query with a terms lookup + +A `terms` query with a `terms` lookup is a specialized form of the `terms` query that allows you to fetch the terms for filtering from another document in your cluster rather than specifying them directly in the query. For example, the following request matches documents in the `students` index with every student whose `id` matches one of the values in the `enrolled` array: + +```json +{ + "request_body": { + "query": { + "terms": { + "boost": 1.0, + "terms_lookup_field_string_array_map": { + "student_id": { + "terms_lookup_field": { + "index": "classes", + "id": "101", + "path": "enrolled" + } + } + } + } + } + } +} +``` +{% include copy.html %} + + +### Match none query + +A `match_none` query matches none of the documents: + +```json +{ + "request_body": { + "query": { + "match_none": {} + } + } +} +``` +{% include copy.html %} + +## Response fields + +The gRPC Search API provides the following response fields. + +### SearchResponse fields + +The following table lists the supported fields for the [`SearchResponse`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/search.proto#L317) message. + +| Field | Protobuf type | Description | +| :---- | :---- | :---- | +| `response_body` | [`ResponseBody`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/search.proto#L351) | The actual payload of the search response. | + +### ResponseBody fields + +The `ResponseBody` contains the following fields. + +The source documents are returned as bytes. Use Base64 decoding to read the `_source` field in the gRPC response. +{: .note} + +| Field | Protobuf type | Description | +| :---- | :---- | :---- | +| `took` | `int64` | The amount of time taken to process the search request, in milliseconds. | +| `timed_out` | `bool` | Whether the search timed out. | +| `shards` | [`ShardStatistics`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L268) | The shard-level success/failure/total metadata. | +| `phase_took` | [`PhaseTook`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/search.proto#L394) | The phase-level `took` time values in the response. | +| `hits` | [`HitsMetadata`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/search.proto#L411) | The main document results and metadata. | +| `profile` | [`Profile`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/search.proto#L594) | Profiling data for query execution (debugging/performance insights). 
| +| `fields` | [`ObjectMap`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L76) | The top-level key-value field structure from the response (if any). | + +### HitsMetadata fields + +The `HitsMetadata` object contains information about the search results, including the total number of matching documents and an array of individual document matches. It includes the following fields. + +| Field | Protobuf type | Description | +| :---- | :---- | :---- | +| `total` | [`TotalHits`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/search.proto#L437) | Metadata about the total number of matching documents (value \+ relation). | +| `max_score` | [`MaxScore`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/search.proto#L424) | The highest relevance score of the returned hits (may be `null`). | +| `hits` | `repeated` [`Hit`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/search.proto#L460) | The actual list of matched documents. Each hit includes core fields like `index`, `id`, `score`, and `source`, along with additional optional fields. | + +### Hit fields + +Each `Hit` represents a single document matched by the query and contains the following fields. + +| Field | Protobuf type | Description | +| :---- | :---- | :---- | +| `index` | `string` | The name of the index containing the returned document. | +| `id` | `string` | The unique ID for the document within the index. | +| `score` | [`Score`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/search.proto#L469) | The relevance score of the hit. | +| `explanation` | [`Explanation`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/search.proto#L951) | A text explanation of how the `_score` was calculated. | +| `fields` | [`ObjectMap`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L76) | The document field values. | +| `highlight` | `map` | The highlighted fields and fragments per hit. | +| `inner_hits` | `map` | The matching nested documents from a different scope that contributed to the overall query result. | +| `matched_queries` | `repeated string` | A list of query names matching the document. | +| `nested` | [`NestedIdentity`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/search.proto#L962) | The path to the inner nested object from which the hit originated. | +| `ignored` | `repeated string` | A list of ignored fields. | +| `ignored_field_values` | `map` | Raw, unprocessed values from the document's original JSON. | +| `shard` | `string` | The shard ID from which the hit was retrieved. | +| `node` | `string` | The node ID from which the hit was retrieved. | +| `routing` | `string` | The routing value used for custom shard routing. | +| `source` | `bytes` | The Base64-encoded `_source` document. | +| `seq_no` | `int64` | The sequence number (used for indexing history and versioning). | +| `primary_term` | `int64` | The primary term number (used for optimistic concurrency control). | +| `version` | `int64` | The document version number. | +| `sort` | `repeated` [`FieldValue`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L2002) | The sort values used for result sorting. 
| +| `meta_fields` | [`ObjectMap`](https://github.com/opensearch-project/opensearch-protobufs/blob/0.3.0/protos/schemas/common.proto#L76) | The metadata values for the document. | + +`source` is Base64 encoded and must be decoded to obtain the JSON document. +{: .note} + +## Example response + +```json +{ + "responseBody": { + "took": "64", + "timedOut": false, + "shards": { + "successful": 1, + "total": 1 + }, + "hits": { + "total": { + "totalHits": { + "relation": "TOTAL_HITS_RELATION_EQ", + "value": "1" + } + }, + "hits": [ + { + "index": "my_index", + "id": "3", + "score": { + "floatValue": 1 + }, + "source": "eyAidGl0bGUiOiAiUnVzaCIsICJ5ZWFyIjogMjAxM30=", + "metaFields": {} + } + ], + "maxScore": { + "floatValue": 1 + } + } + } +} +``` +{% include copy.html %} + +## Java gRPC client example + +The following example shows a Java client-side program that submits a sample search term query gRPC request and then prints the number of hits returned in the search response: + +```java +import org.opensearch.protobufs.*; +import io.grpc.ManagedChannel; +import io.grpc.ManagedChannelBuilder; + +public class SearchClient { + public static void main(String[] args) { + ManagedChannel channel = ManagedChannelBuilder.forAddress("localhost", 9400) + .usePlaintext() + .build(); + + SearchServiceGrpc.SearchServiceBlockingStub stub = SearchServiceGrpc.newBlockingStub(channel); + + Query query = Query.newBuilder() + .setTerm(TermQuery.newBuilder().setField("director").setValue("Nolan")) + .build(); + + SearchRequest request = SearchRequest.newBuilder() + .addIndex("movies") + .setQuery(query) + .setSize(5) + .build(); + + SearchResponse response = stub.search(request); + System.out.println("Found hits: " + response.getHits().getTotal()); + channel.shutdown(); + } +} +``` +{% include copy.html %} diff --git a/_api-reference/index-apis/alias.md b/_api-reference/index-apis/alias.md index c3ddf76911a..d90fdf606dc 100644 --- a/_api-reference/index-apis/alias.md +++ b/_api-reference/index-apis/alias.md @@ -15,7 +15,7 @@ redirect_from: An alias is a virtual pointer that you can use to reference one or more indexes. Creating and updating aliases are atomic operations, so you can reindex your data and point an alias at it without any downtime. -## Path and HTTP methods +## Endpoints ```json POST _aliases diff --git a/_api-reference/index-apis/blocks.md b/_api-reference/index-apis/blocks.md index 61b0e1ddd60..a7d874d1e03 100644 --- a/_api-reference/index-apis/blocks.md +++ b/_api-reference/index-apis/blocks.md @@ -2,7 +2,7 @@ layout: default title: Blocks parent: Index APIs -nav_order: 6 +nav_order: 7 --- # Blocks @@ -12,7 +12,7 @@ nav_order: 6 Use the Blocks API to limit certain operations on a specified index. Different types of blocks allow you to restrict index write, read, or metadata operations. For example, adding a `write` block through the API ensures that all index shards have properly accounted for the block before returning a successful response. Any in-flight write operations to the index must be complete before the `write` block takes effect. -## Path and HTTP methods +## Endpoints ```json PUT //_block/ @@ -23,7 +23,7 @@ PUT //_block/ | Parameter | Data type | Description | :--- | :--- | :--- | `index` | String | A comma-delimited list of index names. Wildcard expressions (`*`) are supported. To target all data streams and indexes in a cluster, use `_all` or `*`. Optional. | -| `` | String | Specifies the type of block to apply to the index. Valid values are:
`metadata`: Disables all metadata changes, such as closing the index.
`read`: Disables any read operations.
`read_only`: Disables any write operations and metadata changes.
`write`: Disables write operations. However, metadata changes are still allowed. | +| `` | String | Specifies the type of block to apply to the index. Valid values are:
- `metadata`: Blocks metadata changes, such as closing the index.
- `read`: Blocks read operations.
- `read_only`: Blocks write operations and metadata changes.
- `write`: Blocks write operations but allows metadata changes.
- `search_only`: Blocks indexing and write operations while allowing read-only access through search replicas.
OpenSearch automatically manages this block through the Scale API as part of the reader-writer separation mechanism. Therefore, do not set this parameter manually. | ## Query parameters @@ -44,6 +44,7 @@ The following example request disables any `write` operations made to the test i ```json PUT /test-index/_block/write ``` +{% include copy.html %} ## Example response @@ -56,4 +57,4 @@ PUT /test-index/_block/write "blocked" : true } ] } -``` \ No newline at end of file +``` diff --git a/_api-reference/index-apis/clear-index-cache.md b/_api-reference/index-apis/clear-index-cache.md index 6227a29960b..7b713fa15d3 100644 --- a/_api-reference/index-apis/clear-index-cache.md +++ b/_api-reference/index-apis/clear-index-cache.md @@ -15,7 +15,7 @@ The clear cache API operation clears the caches of one or more indexes. For data If you use the Security plugin, you must have the `manage index` privileges. {: .note} -## Path and HTTP methods +## Endpoints ```json POST //_cache/clear diff --git a/_api-reference/index-apis/clone.md b/_api-reference/index-apis/clone.md index 36592a28b55..9076e06ed89 100644 --- a/_api-reference/index-apis/clone.md +++ b/_api-reference/index-apis/clone.md @@ -13,7 +13,7 @@ redirect_from: The clone index API operation clones all data in an existing read-only index into a new index. The new index cannot already exist. -## Path and HTTP methods +## Endpoints ```json POST //_clone/ diff --git a/_api-reference/index-apis/close-index.md b/_api-reference/index-apis/close-index.md index ecad7d18cc2..ca543cdc5c1 100644 --- a/_api-reference/index-apis/close-index.md +++ b/_api-reference/index-apis/close-index.md @@ -14,7 +14,7 @@ redirect_from: The close index API operation closes an index. Once an index is closed, you cannot add data to it or search for any data within the index. -## Path and HTTP methods +## Endpoints ```json POST //_close diff --git a/_api-reference/index-apis/component-template.md b/_api-reference/index-apis/component-template.md index fa73e64c940..34d6cf12701 100644 --- a/_api-reference/index-apis/component-template.md +++ b/_api-reference/index-apis/component-template.md @@ -2,7 +2,7 @@ layout: default title: Create or update component template parent: Index APIs -nav_order: 29 +nav_order: 31 --- # Create or update component template @@ -15,7 +15,7 @@ If any settings or mappings are directly defined in the index template or the in Component templates are used solely during the process of index creation. For data streams, this includes the creation of the data stream itself and the creation of the backing indexes that support the stream. Modifications made to component templates will not affect existing indexes, including the backing indexes of a data stream. -## Path and HTTP methods +## Endpoints The PUT method adds a component template and accepts both query parameters and a request body. The GET method retrieves information about an existing component template and accepts only query parameters: @@ -75,11 +75,11 @@ Parameter | Data type | Description #### `mappings` -The field mappings that exist in the index. For more information, see [Mappings and field types](https://opensearch.org/docs/latest/field-types/). Optional. +The field mappings that exist in the index. For more information, see [Mappings and field types]({{site.url}}{{site.baseurl}}/field-types/). Optional. #### `settings` -Any configuration options for the index. 
For more information, see [Index settings](https://opensearch.org/docs/latest/install-and-configure/configuring-opensearch/index-settings/). +Any configuration options for the index. For more information, see [Index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/). ## Example requests diff --git a/_api-reference/index-apis/create-index-template.md b/_api-reference/index-apis/create-index-template.md index c2f4228c8e1..203d65f989f 100644 --- a/_api-reference/index-apis/create-index-template.md +++ b/_api-reference/index-apis/create-index-template.md @@ -9,7 +9,7 @@ nav_order: 26 You can use the Create or Update Index Template API to create indexes with predefined mappings and settings as well as update existing index templates. -## Path and HTTP methods +## Endpoints ```json PUT _index_template/ @@ -68,11 +68,11 @@ Parameter | Data type | Description #### `mappings` -The field mappings that exist in the index. For more information, see [Mappings and field types](https://opensearch.org/docs/latest/field-types/). Optional. +The field mappings that exist in the index. For more information, see [Mappings and field types]({{site.url}}{{site.baseurl}}/field-types/). Optional. #### `settings` -Any configuration options for the index. For more information, see [Index settings](https://opensearch.org/docs/latest/install-and-configure/configuring-opensearch/index-settings/). +Any configuration options for the index. For more information, see [Index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/). ## Example requests diff --git a/_api-reference/index-apis/create-index.md b/_api-reference/index-apis/create-index.md index f10450bb283..a956c9dd408 100644 --- a/_api-reference/index-apis/create-index.md +++ b/_api-reference/index-apis/create-index.md @@ -2,7 +2,7 @@ layout: default title: Create index parent: Index APIs -nav_order: 25 +nav_order: 21 redirect_from: - /opensearch/rest-api/index-apis/create-index/ - /opensearch/rest-api/create-index/ @@ -16,7 +16,7 @@ While you can create an index by using a document as a base, you can also create When creating an index, you can specify its mappings, settings, and aliases. -## Path and HTTP methods +## Endpoints ```json PUT diff --git a/_api-reference/index-apis/dangling-index.md b/_api-reference/index-apis/dangling-index.md index f44a9dc4d47..5ca0fae9c48 100644 --- a/_api-reference/index-apis/dangling-index.md +++ b/_api-reference/index-apis/dangling-index.md @@ -2,7 +2,7 @@ layout: default title: Dangling indexes parent: Index APIs -nav_order: 30 +nav_order: 32 --- # Dangling indexes API @@ -11,7 +11,7 @@ nav_order: 30 After a node joins a cluster, dangling indexes occur if any shards exist in the node's local directory that do not already exist in the cluster. Dangling indexes can be listed, deleted, or imported. -## Path and HTTP methods +## Endpoints List dangling indexes: diff --git a/_api-reference/index-apis/delete-index-template.md b/_api-reference/index-apis/delete-index-template.md index f6e2f38773d..bbc277404fb 100644 --- a/_api-reference/index-apis/delete-index-template.md +++ b/_api-reference/index-apis/delete-index-template.md @@ -9,7 +9,7 @@ nav_order: 28 The Delete Index Template API deletes one or more index templates. 
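For example, the following request deletes a single index template; the template name `daily-logs` is illustrative:

```json
DELETE /_index_template/daily-logs
```
{% include copy-curl.html %}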
-## Path and HTTP methods +## Endpoints ```json DELETE /_index_template/ diff --git a/_api-reference/index-apis/delete-index.md b/_api-reference/index-apis/delete-index.md index af0bd292fc2..a787203fbc4 100644 --- a/_api-reference/index-apis/delete-index.md +++ b/_api-reference/index-apis/delete-index.md @@ -2,7 +2,7 @@ layout: default title: Delete index parent: Index APIs -nav_order: 35 +nav_order: 22 redirect_from: - /opensearch/rest-api/index-apis/delete-index/ --- @@ -13,7 +13,7 @@ redirect_from: If you no longer need an index, you can use the delete index API operation to delete it. -## Path and HTTP methods +## Endpoints ```json DELETE / diff --git a/_api-reference/index-apis/exists.md b/_api-reference/index-apis/exists.md index fb1a4d79c69..c3b2f72ae22 100644 --- a/_api-reference/index-apis/exists.md +++ b/_api-reference/index-apis/exists.md @@ -2,7 +2,7 @@ layout: default title: Index exists parent: Index APIs -nav_order: 50 +nav_order: 19 redirect_from: - /opensearch/rest-api/index-apis/exists/ --- @@ -14,7 +14,7 @@ redirect_from: The index exists API operation returns whether or not an index already exists. -## Path and HTTP methods +## Endpoints ```json HEAD / diff --git a/_api-reference/index-apis/flush.md b/_api-reference/index-apis/flush.md index e464a42cad9..b699b4511c3 100644 --- a/_api-reference/index-apis/flush.md +++ b/_api-reference/index-apis/flush.md @@ -14,7 +14,7 @@ The Flush API stores all in-memory operations to segments on disk. Operations fl OpenSearch automatically performs flushes in the background based on conditions like transaction log size, which is controlled by the `index.translog.flush_threshold_size` setting. Use the Flush API sparingly, for example, for manual restarts or to free up memory. -## Path and HTTP methods +## Endpoints The Flush API supports the following paths: diff --git a/_api-reference/index-apis/force-merge.md b/_api-reference/index-apis/force-merge.md index 8316c729373..0579e0824e5 100644 --- a/_api-reference/index-apis/force-merge.md +++ b/_api-reference/index-apis/force-merge.md @@ -11,7 +11,7 @@ nav_order: 37 The force merge API operation forces a merge on the shards of one or more indexes. For a data stream, the API forces a merge on the shards of the stream's backing index. -## Path and HTTP methods +## Endpoints ```json POST /_forcemerge diff --git a/_api-reference/index-apis/get-index-template.md b/_api-reference/index-apis/get-index-template.md index 7e2d3836409..3073861cdbf 100644 --- a/_api-reference/index-apis/get-index-template.md +++ b/_api-reference/index-apis/get-index-template.md @@ -9,7 +9,7 @@ nav_order: 27 The Get Index Template API returns information about one or more index templates. -## Path and HTTP methods +## Endpoints ```json GET /_index_template/ diff --git a/_api-reference/index-apis/get-index.md b/_api-reference/index-apis/get-index.md index 78fe7bcd943..907ff7b86dd 100644 --- a/_api-reference/index-apis/get-index.md +++ b/_api-reference/index-apis/get-index.md @@ -2,7 +2,7 @@ layout: default title: Get index parent: Index APIs -nav_order: 40 +nav_order: 24 redirect_from: - /opensearch/rest-api/index-apis/get-index/ --- @@ -14,7 +14,7 @@ redirect_from: You can use the get index API operation to return information about an index. 
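For example, the following request returns the aliases, mappings, and settings of a hypothetical index named `sample-index1`:

```json
GET /sample-index1
```
{% include copy-curl.html %}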
-## Path and HTTP methods +## Endpoints ```json GET / diff --git a/_api-reference/index-apis/get-settings.md b/_api-reference/index-apis/get-settings.md index 4eebb432728..08c500f05a7 100644 --- a/_api-reference/index-apis/get-settings.md +++ b/_api-reference/index-apis/get-settings.md @@ -15,7 +15,7 @@ redirect_from: The get settings API operation returns all the settings in your index. -## Path and HTTP methods +## Endpoints ```json GET /_settings diff --git a/_api-reference/index-apis/index.md b/_api-reference/index-apis/index.md index 6e1fdbcfa67..3b1bf026af8 100644 --- a/_api-reference/index-apis/index.md +++ b/_api-reference/index-apis/index.md @@ -2,7 +2,7 @@ layout: default title: Index APIs has_children: true -nav_order: 35 +nav_order: 30 redirect_from: - /opensearch/rest-api/index-apis/index/ - /opensearch/rest-api/index-apis/ diff --git a/_api-reference/index-apis/open-index.md b/_api-reference/index-apis/open-index.md index 3011507697e..2d161541b46 100644 --- a/_api-reference/index-apis/open-index.md +++ b/_api-reference/index-apis/open-index.md @@ -2,7 +2,7 @@ layout: default title: Open index parent: Index APIs -nav_order: 55 +nav_order: 17 redirect_from: - /opensearch/rest-api/index-apis/open-index/ --- @@ -14,7 +14,7 @@ redirect_from: The open index API operation opens a closed index, letting you add or search for data within the index. -## Path and HTTP methods +## Endpoints ```json POST //_open diff --git a/_api-reference/index-apis/put-mapping.md b/_api-reference/index-apis/put-mapping.md index 26bfbae0d98..d90050d4da2 100644 --- a/_api-reference/index-apis/put-mapping.md +++ b/_api-reference/index-apis/put-mapping.md @@ -2,7 +2,7 @@ layout: default title: Create or update mappings parent: Index APIs -nav_order: 27 +nav_order: 30 redirect_from: - /opensearch/rest-api/index-apis/put-mapping/ - /opensearch/rest-api/index-apis/update-mapping/ @@ -17,7 +17,7 @@ If you want to create or add mappings and fields to an index, you can use the pu You can't use this operation to update mappings that already map to existing data in the index. You must first create a new index with your desired mappings, and then use the [reindex API operation]({{site.url}}{{site.baseurl}}/opensearch/reindex-data) to map all the documents from your old index to the new index. If you don't want any downtime while you re-index your indexes, you can use [aliases]({{site.url}}{{site.baseurl}}/opensearch/index-alias). -## Path and HTTP methods +## Endpoints ```json PUT //_mapping diff --git a/_api-reference/index-apis/recover.md b/_api-reference/index-apis/recover.md index 41f071cf6c7..1b6866cf72b 100644 --- a/_api-reference/index-apis/recover.md +++ b/_api-reference/index-apis/recover.md @@ -24,7 +24,7 @@ Shard recovery occurs automatically in the following scenarios: The Recovery API reports solely on completed recoveries for shard copies presently stored in the cluster. It reports only the most recent recovery for each shard copy and does not include historical information about previous recoveries or information about recoveries of shard copies that no longer exist. Consequently, if a shard copy completes a recovery and is subsequently relocated to a different node, then the information about the original recovery is not displayed in the Recovery API. 
-## Path and HTTP methods +## Endpoints ```json GET /_recovery @@ -55,7 +55,7 @@ The following examples demonstrate how to recover information using the Recovery ### Recover information from several or all indexes -The following example request returns recovery information about several indexes in a [human-readable format](https://opensearch.org/docs/latest/api-reference/common-parameters/#human-readable-output): +The following example request returns recovery information about several indexes in a [human-readable format]({{site.url}}{{site.baseurl}}/api-reference/common-parameters/#human-readable-output): ```json GET index1,index2/_recovery?human diff --git a/_api-reference/index-apis/refresh.md b/_api-reference/index-apis/refresh.md index 917ca5d9a91..1bc3fc03d1e 100644 --- a/_api-reference/index-apis/refresh.md +++ b/_api-reference/index-apis/refresh.md @@ -20,7 +20,7 @@ After a shard becomes idle, the indexes will not refresh until either the next s To use the Refresh Index API, you must have write access to the indexes you want to refresh. -## Path and HTTP methods +## Endpoints ```json POST /_refresh diff --git a/_api-reference/index-apis/resolve-index.md b/_api-reference/index-apis/resolve-index.md new file mode 100644 index 00000000000..de1f896aef7 --- /dev/null +++ b/_api-reference/index-apis/resolve-index.md @@ -0,0 +1,100 @@ +--- +layout: default +title: Resolve index +parent: Index APIs +nav_order: 62 +--- + +# Resolve index + +The Resolve Index API helps you understand how OpenSearch resolves aliases, data streams, and concrete indexes that match a specified name or wildcard expression. + +## Endpoints + +```json +GET /_resolve/index/ +``` + +## Path parameters + +The following table lists the available path parameters. All path parameters are required. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `name` | String | The name, alias, data stream, or wildcard expression to resolve. | + +## Query parameters + +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `expand_wildcards` | String | Controls how wildcard expressions expand to matching indexes. Multiple values can be combined using commas. Valid values are:
- `all`: Expands to open and closed indexes, including hidden ones.
- `open`: Expands only to open indexes.
- `closed`: Expands only to closed indexes.
- `hidden`: Includes hidden indexes. Must be combined with `open`, `closed`, or both.
- `none`: Wildcard expressions are not accepted.
**Default**: `open`. | + +## Example requests + +The following sections provide example Resolve API requests. + + +### Resolve a concrete index + + +```json +GET _resolve/index/my-index-001 +``` +{% include copy-curl.html %} + +### Resolve indexes using a wildcard + + +```json +GET _resolve/index/my-index-* +``` +{% include copy-curl.html %} + +### Resolve a data stream or alias + +If an alias or data stream named `logs-app` exists, use the following request to resolve it: + +```json +GET _resolve/index/logs-app +``` +{% include copy-curl.html %} + +### Resolve hidden indexes using a wildcard in a remote cluster + +The following example shows an API request using a wildcard, a remote cluster, and `expand_wildcards` configured to `hidden`: + +```json +GET _resolve/index/my-index-*,remote-cluster:my-index-*?expand_wildcards=hidden +``` +{% include copy-curl.html %} + +## Example response + +```json +{ + "indices": [ + { + "name": "my-index-001", + "attributes": [ + "open" + ] + } + ], + "aliases": [], + "data_streams": [] +} +``` + +## Response body fields + +| Field | Data type | Description | +| :--- | :--- | :--- | +| `indices` | Array | A list of resolved concrete indexes. | +| `aliases` | Array | A list of resolved index aliases. | +| `data_streams` | Array | A list of matched data streams. | + +## Required permissions + +If you are using the Security plugin, the user running these queries needs to have at least `read` permissions for the resolved index. diff --git a/_api-reference/index-apis/rollover.md b/_api-reference/index-apis/rollover.md index db30a5d7bff..7916506fe1c 100644 --- a/_api-reference/index-apis/rollover.md +++ b/_api-reference/index-apis/rollover.md @@ -1,17 +1,17 @@ --- layout: default -title: Rollover Index +title: Roll over index parent: Index APIs nav_order: 63 --- -# Rollover Index +# Roll over index Introduced 1.0 {: .label .label-purple } -The Rollover Index API creates a new index for a data stream or index alias based on the `wait_for_active_shards` setting. +The roll over index API operation creates a new index for a data stream or index alias based on the `wait_for_active_shards` setting. -## Path and HTTP methods +## Endpoints ```json POST //_rollover/ @@ -40,22 +40,22 @@ During the index alias rollover process, if you don't specify a custom name and ## Using date math with index rollovers -When using an index alias for time-series data, you can leverage [date math](https://opensearch.org/docs/latest/field-types/supported-field-types/date/) in the index name to track the rollover date. For example, you can create an alias pointing to `my-index-{now/d}-000001`. If you create an alias on June 11, 2029, then the index name would be `my-index-2029.06.11-000001`. For a rollover on June 12, 2029, the new index would be named `my-index-2029.06.12-000002`. See [Roll over an index alias with a write index](#rolling-over-an-index-alias-with-a-write-index) for a practical example. +When using an index alias for time-series data, you can use [date math]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/date/) in the index name to track the rollover date. For example, you can create an alias pointing to `my-index-{now/d}-000001`. If you create an alias on June 11, 2029, then the index name would be `my-index-2029.06.11-000001`. For a rollover on June 12, 2029, the new index would be named `my-index-2029.06.12-000002`. See [Roll over an index alias with a write index](#rolling-over-an-index-alias-with-a-write-index) for a practical example. 
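As a sketch of this setup, the following request creates the initial index using a date math name and points an alias at it as the write index. The alias name `my-alias` is illustrative, and the date math expression `<my-index-{now/d}-000001>` must be URI encoded in the request path:

```json
PUT /%3Cmy-index-%7Bnow%2Fd%7D-000001%3E
{
  "aliases": {
    "my-alias": {
      "is_write_index": true
    }
  }
}
```
{% include copy-curl.html %}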
## Path parameters -The Rollover Index API supports the parameters listed in the following table. +The following table lists the available path parameters. -Parameter | Type | Description +Parameter | Data type | Description :--- | :--- | :--- `` | String | The name of the data stream or index alias to roll over. Required. | `` | String | The name of the index to create. Supports date math. Data streams do not support this parameter. If the name of the alias's current write index does not end with `-` and a number, such as `my-index-000001` or `my-index-2`, then the parameter is required. ## Query parameters -The following table lists the supported query parameters. +The following table lists the available query parameters. -Parameter | Type | Description +Parameter | Data type | Description :--- | :--- | :--- `cluster_manager_timeout` | Time | The amount of time to wait for a connection to the cluster manager node. Default is `30s`. `timeout` | Time | The amount of time to wait for a response. Default is `30s`. @@ -81,7 +81,7 @@ Parameter | Type | Description ### `mappings` -The `mappings` parameter specifies the index field mappings. It is optional. See [Mappings and field types](https://opensearch.org/docs/latest/field-types/) for more information. +The `mappings` parameter specifies the index field mappings. It is optional. See [Mappings and field types]({{site.url}}{{site.baseurl}}/field-types/) for more information. ### `conditions` @@ -89,16 +89,15 @@ The `conditions` parameter is an optional object defining criteria for triggerin The object body supports the following parameters. -Parameter | Type | Description +Parameter | Data type | Description :--- | :--- | :--- -| `max_age` | Time units | Triggers a rollover after the maximum elapsed time since index creation is reached. The elapsed time is always calculated since the index creation time, even if the index origination date is configured to a custom date, such as when using the `index.lifecycle.parse_origination_date` or `index.lifecycle.origination_date` settings. Optional. | +`max_age` | Time units | Triggers a rollover after the maximum elapsed time since index creation is reached. The elapsed time is always calculated since the index creation time, even if the index origination date is configured to a custom date, such as when using the `index.lifecycle.parse_origination_date` or `index.lifecycle.origination_date` settings. Optional. | `max_docs` | Integer | Triggers a rollover after the specified maximum number of documents, excluding documents added since the last refresh and documents in replica shards. Optional. `max_size` | Byte units | Triggers a rollover when the index reaches a specified size, calculated as the total size of all primary shards. Replicas are not counted. Use the `_cat indices` API and check the `pri.store.size` value to see the current index size. Optional. -`max_primary_shard_size` | Byte units | Triggers a rollover when the largest primary shard in the index reaches a certain size. This is the maximum size of the primary shards in the index. As with `max_size`, replicas are ignored. To see the current shard size, use the `_cat shards` API. The `store` value shows the size of each shard, and `prirep` indicates whether a shard is a primary (`p`) or a replica (`r`). Optional. ### `settings` -The `settings` parameter specifies the index configuration options. See [Index settings](https://opensearch.org/docs/latest/install-and-configure/configuring-opensearch/index-settings/) for more information. 
+The `settings` parameter specifies the index configuration options. See [Index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/) for more information. ## Example requests @@ -106,7 +105,7 @@ The following examples illustrate using the Rollover Index API. A rollover occur - The index was created 5 or more days ago. - The index contains 500 or more documents. -- The index's largest primary shard is 100 GB or larger. +- The index is 100 GB or larger. ### Rolling over a data stream @@ -118,7 +117,7 @@ POST my-data-stream/_rollover "conditions": { "max_age": "5d", "max_docs": 500, - "max_primary_shard_size": "100gb" + "max_size": "100gb" } } ``` @@ -149,7 +148,7 @@ POST my-alias/_rollover "conditions": { "max_age": "5d", "max_docs": 500, - "max_primary_shard_size": "100gb" + "max_size": "100gb" } } ``` @@ -172,7 +171,7 @@ POST my-alias/_rollover ## Example response -OpenSearch returns the following response confirming that all conditions except `max_primary_shard_size` were met: +OpenSearch returns the following response confirming that all conditions except `max_size` were met: ```json { @@ -185,7 +184,7 @@ OpenSearch returns the following response confirming that all conditions except "conditions": { "[max_age: 5d]": true, "[max_docs: 500]": true, - "[max_primary_shard_size: 100gb]": false + "[max_size: 100gb]": false } } ``` diff --git a/_api-reference/index-apis/scale.md b/_api-reference/index-apis/scale.md new file mode 100644 index 00000000000..5f86ae21f7c --- /dev/null +++ b/_api-reference/index-apis/scale.md @@ -0,0 +1,75 @@ +--- +layout: default +title: Scale +parent: Index APIs +nav_order: 50 +--- + +# Scale +**Introduced 3.0** +{: .label .label-purple } + +The Scale API allows you to enable or disable the `search_only` mode on an index. When an index is in `search_only` mode, it retains only its search replicas and scales down primary and regular replica shards. This optimization helps reduce resource consumption during periods of low write activity while maintaining search capabilities. + +This feature supports scenarios such as scale-to-zero deployments and reader/writer separation patterns, which can significantly improve resource utilization and reduce costs in production environments. + +If you are using the Security plugin, you must have the `manage index` privileges. +{: .note} + +## Endpoints + +```json +POST //_scale +``` + +## Path parameters + +The following table lists the available path parameters. + +| Parameter | Required | Data type | Description | +| :--- | :--- | :--- | :--- | +| `index` | **Required** | String | The name of the index to scale. Wildcards are not supported. | + +## Request body fields + +The following table lists the available request body fields. + +| Field | Required | Data type | Description | +| :--- | :--- | :--- | :--- | +| `search_only` | **Required** | Boolean | When `true`, enables search-only mode on the index. When `false`, disables search-only mode and restores the index to normal operations. 
| + +## Example requests + +### Enable search-only mode + +The following request enables search-only mode for an index named `my-index`: + +```json +POST /my-index/_scale +{ + "search_only": true +} +``` +{% include copy-curl.html %} + +### Disable search-only mode + +The following request disables search-only mode and returns the index to normal operations: + +```json +POST /my-index/_scale +{ + "search_only": false +} +``` +{% include copy-curl.html %} + +## Example response + +The API returns the following response: + +```json +{ + "acknowledged": true +} +``` diff --git a/_api-reference/index-apis/segment.md b/_api-reference/index-apis/segment.md index b9625d3b346..8426d5068d6 100644 --- a/_api-reference/index-apis/segment.md +++ b/_api-reference/index-apis/segment.md @@ -12,7 +12,7 @@ Introduced 1.0 The Segment API provides details about the Lucene segments within index shards as well as information about the backing indexes of data streams. -## Path and HTTP methods +## Endpoints ```json GET //_segments diff --git a/_api-reference/index-apis/shrink-index.md b/_api-reference/index-apis/shrink-index.md index e3e1c671554..694a50801b7 100644 --- a/_api-reference/index-apis/shrink-index.md +++ b/_api-reference/index-apis/shrink-index.md @@ -2,7 +2,7 @@ layout: default title: Shrink index parent: Index APIs -nav_order: 65 +nav_order: 25 redirect_from: - /opensearch/rest-api/index-apis/shrink-index/ --- @@ -14,7 +14,7 @@ redirect_from: The shrink index API operation moves all of your data in an existing index into a new index with fewer primary shards. -## Path and HTTP methods +## Endpoints ```json POST //_shrink/ diff --git a/_api-reference/index-apis/simulate-index-template.md b/_api-reference/index-apis/simulate-index-template.md new file mode 100644 index 00000000000..5f556a6bfd2 --- /dev/null +++ b/_api-reference/index-apis/simulate-index-template.md @@ -0,0 +1,167 @@ +--- +layout: default +title: Simulate index templates +parent: Index APIs +nav_order: 29 +--- + +# Simulate index templates + +You can use the Simulate Index Template API to preview how index templates will be applied to an index or simulate an index template before creating it. + +## Endpoints + +```json +POST /_index_template/_simulate +POST /_index_template/_simulate/ +POST /_index_template/_simulate_index/ +``` + +## Path parameters + +The following table lists the available path parameters. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `template_name` | String | The name of the index template to simulate. | +| `index_name` | String | The name of the index to use for simulating template resolution. | + +## Request body fields + +The following table lists the available request body fields. + +| Field | Data type | Description | +| :--- | :--- | :--- | +| `index_patterns` | Array | The index patterns to which the template applies. | +| `template` | Object | The template definition. | +| `template.settings` | Object | The index settings to apply. | +| `template.mappings` | Object | The field mappings to apply. | +| `template.aliases` | Object | The aliases to apply. | +| `priority` | Integer | The template's priority value, used to determine which template is applied when multiple templates match an index. Higher values take precedence. | +| `version` | Integer | The template version. | +| `_meta` | Object | Metadata for the template. 
| + +### Example request: Simulate a template + +Use the following request to simulate a template without creating it: + +```json +POST /_index_template/_simulate +{ + "index_patterns": ["log-*"], + "template": { + "settings": { + "number_of_shards": 1 + }, + "mappings": { + "properties": { + "message": { + "type": "text" + } + } + } + }, + "priority": 5 +} +``` +{% include copy-curl.html %} + +### Example request: Simulate a named template + +You can simulate a specific template by specifying the name of the template. + +First, create a template named `template_for_simulation` using the following request: + +```json +PUT /_index_template/template_for_simulation +{ + "index_patterns": ["logs-sim-*"], + "template": { + "settings": { + "number_of_shards": 1, + "number_of_replicas": 1 + }, + "mappings": { + "properties": { + "timestamp": { + "type": "date" + }, + "message": { + "type": "text" + }, + "level": { + "type": "keyword" + } + } + } + }, + "priority": 10, + "version": 1, + "_meta": { + "description": "Template used for simulation example", + "owner": "Docs Team" + } +} +``` +{% include copy-curl.html %} + +You can now simulate the template named `template_for_simulation`: + +```json +POST /_index_template/_simulate/template_for_simulation +``` +{% include copy-curl.html %} + +### Example request: Simulate a template on a specific index + +Simulating a template on a specific index name is particularly useful for resolving conflicts or debugging priority issues among templates. +The following request demonstrates how all applicable templates, with overlapping index patterns, will be applied to an index named `logs-sim-1`: + +```json +POST /_index_template/_simulate_index/logs-sim-1 +``` +{% include copy-curl.html %} + +## Example response + +```json +{ + "template": { + "settings": { + "index": { + "number_of_shards": "1", + "number_of_replicas": "1" + } + }, + "mappings": { + "properties": { + "level": { + "type": "keyword" + }, + "message": { + "type": "text" + }, + "timestamp": { + "type": "date" + } + } + }, + "aliases": {} + }, + "overlapping": [] +} +``` + +## Response body fields + +| Field | Data type | Description | +| :--- | :--- | :--- | +| `template` | Object | The template applied. | +| `template.settings` | Object | The resolved index settings. | +| `template.mappings` | Object | The resolved field mappings. | +| `template.aliases` | Object | The resolved aliases. | +| `overlapping` | Array | A list of other index templates that match the same index pattern but were not applied. | + +## Required permissions + +If you are using the Security plugin, make sure you have the appropriate permissions: `indices:admin/index_template/simulate`. diff --git a/_api-reference/index-apis/split.md b/_api-reference/index-apis/split.md index b3db4c33409..514939af7f7 100644 --- a/_api-reference/index-apis/split.md +++ b/_api-reference/index-apis/split.md @@ -31,7 +31,7 @@ PUT /sample-index1/_split/split-index1 ``` {% include copy-curl.html %} -## Path and HTTP methods +## Endpoints ```json POST //_split/ diff --git a/_api-reference/index-apis/stats.md b/_api-reference/index-apis/stats.md index 728fe7751f9..cc7b302ddab 100644 --- a/_api-reference/index-apis/stats.md +++ b/_api-reference/index-apis/stats.md @@ -14,7 +14,7 @@ The Index Stats API provides index statistics. For data streams, the API provide When a shard moves to a different node, the shard-level statistics for the shard are cleared. 
Although the shard is no longer part of the node, the node preserves any node-level statistics to which the shard contributed. {: .note} -## Path and HTTP methods +## Endpoints ```json GET /_stats diff --git a/_api-reference/index-apis/update-alias.md b/_api-reference/index-apis/update-alias.md index c069703bf38..c741acd5b18 100644 --- a/_api-reference/index-apis/update-alias.md +++ b/_api-reference/index-apis/update-alias.md @@ -2,7 +2,7 @@ layout: default title: Create or update alias parent: Index APIs -nav_order: 5 +nav_order: 6 --- # Create or Update Alias @@ -15,7 +15,7 @@ The Create or Update Alias API adds one or more indexes to an alias or updates t The Create or Update Alias API is distinct from the [Alias API]({{site.url}}{{site.baseurl}}/opensearch/rest-api/alias/), which supports the addition and removal of aliases and the removal of alias indexes. In contrast, the following API only supports adding or updating an alias without updating the index itself. Each API also uses different request body parameters. {: .note} -## Path and HTTP methods +## Endpoints ```json POST //_alias/ diff --git a/_api-reference/index-apis/update-settings.md b/_api-reference/index-apis/update-settings.md index c0991bf8524..9920473848f 100644 --- a/_api-reference/index-apis/update-settings.md +++ b/_api-reference/index-apis/update-settings.md @@ -2,7 +2,7 @@ layout: default title: Update settings parent: Index APIs -nav_order: 75 +nav_order: 47 redirect_from: - /opensearch/rest-api/index-apis/update-settings/ --- @@ -11,12 +11,12 @@ redirect_from: **Introduced 1.0** {: .label .label-purple } -You can use the update settings API operation to update index-level settings. You can change dynamic index settings at any time, but static settings cannot be changed after index creation. For more information about static and dynamic index settings, see [Create index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/create-index/). +You can use the update settings API operation to update index-level settings. You can change dynamic index settings at any time, but static settings cannot be changed after index creation. For more information about static and dynamic index settings, see [Configuring OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/). Aside from the static and dynamic index settings, you can also update individual plugins' settings. To get the full list of updatable settings, run `GET /_settings?include_defaults=true`. -## Path and HTTP methods +## Endpoints ```json PUT //_settings @@ -26,7 +26,7 @@ PUT //_settings Parameter | Type | Description :--- | :--- | :--- -<index> | String | The index to update. Can be a comma-separated list of multiple index names. Use `_all` or * to close all indexes. +<index> | String | The index to update. Can be a comma-separated list of multiple index names. Use `_all` or `*` to specify all indexes. ## Query parameters diff --git a/_api-reference/index.md b/_api-reference/index.md index f87d40214ea..c6c682035b1 100644 --- a/_api-reference/index.md +++ b/_api-reference/index.md @@ -1,6 +1,6 @@ --- layout: default -title: REST API reference +title: API reference nav_order: 1 has_toc: false has_children: true @@ -11,16 +11,15 @@ redirect_from: - /api-reference/index/ --- -# REST API reference -**Introduced 1.0** -{: .label .label-purple } +# API reference -You can use REST APIs for most operations in OpenSearch. 
In this reference, we provide a description of the API, and details that include the paths and HTTP methods, supported parameters, and example requests and responses. +You can use [REST APIs](#rest-apis) for most operations in OpenSearch. Starting with OpenSearch 3.0, you can use alternative experimental [gRPC APIs](#grpc-apis). -This reference includes the REST APIs supported by OpenSearch. If a REST API is missing, please provide feedback or submit a pull request in GitHub. -{: .tip } +## REST APIs +**Introduced 1.0** +{: .label .label-purple } -## Related articles +OpenSearch supports the following REST APIs: - [Analyze API]({{site.url}}{{site.baseurl}}/api-reference/analyze-apis/) - [Access control API]({{site.url}}{{site.baseurl}}/security/access-control/api/) @@ -38,7 +37,7 @@ This reference includes the REST APIs supported by OpenSearch. If a REST API is - [Index state management API]({{site.url}}{{site.baseurl}}/im-plugin/ism/api/) - [ISM error prevention API]({{site.url}}{{site.baseurl}}/im-plugin/ism/error-prevention/api/) - [Ingest APIs]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/index/) -- [k-NN plugin API]({{site.url}}{{site.baseurl}}/search-plugins/knn/api/) +- [Vector search API]({{site.url}}{{site.baseurl}}/vector-search/api/) - [ML Commons API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/) - [Multi-search]({{site.url}}{{site.baseurl}}/api-reference/multi-search/) - [Nodes APIs]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/index/) @@ -63,5 +62,8 @@ This reference includes the REST APIs supported by OpenSearch. If a REST API is - [Transforms API]({{site.url}}{{site.baseurl}}/im-plugin/index-transforms/transforms-apis/) - [Hot reload TLS certificates]({{site.url}}{{site.baseurl}}/security/configuration/tls/#hot-reloading-tls-certificates) +## gRPC APIs +**Introduced 3.0** +{: .label .label-purple } - +Starting with OpenSearch 3.0, you can use gRPC APIs---a high-performance alternative to traditional REST interfaces. These APIs use the gRPC protocol to provide more efficient communication with OpenSearch clusters. For more information and supported APIs, see [gRPC APIs]({{site.url}}{{site.baseurl}}/api-reference/grpc-apis/). diff --git a/_api-reference/ingest-apis/index.md b/_api-reference/ingest-apis/index.md index 6cea0a9fdb7..91628e20050 100644 --- a/_api-reference/ingest-apis/index.md +++ b/_api-reference/ingest-apis/index.md @@ -20,4 +20,5 @@ Simplify, secure, and scale your OpenSearch data ingestion with the following AP - [Create pipeline]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/create-ingest/): Use this API to create or update a pipeline configuration. - [Get pipeline]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/get-ingest/): Use this API to retrieve a pipeline configuration. - [Simulate pipeline]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/simulate-ingest/): Use this pipeline to test a pipeline configuration. +- [Access data in a pipeline]({{site.url}}{{site.baseurl}}/ingest-pipelines/accessing-data/): Use this API to access data in a pipeline. - [Delete pipeline]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/delete-ingest/): Use this API to delete a pipeline configuration. 
diff --git a/_api-reference/list/index.md b/_api-reference/list/index.md new file mode 100644 index 00000000000..91a1cb5f834 --- /dev/null +++ b/_api-reference/list/index.md @@ -0,0 +1,203 @@ +--- +layout: default +title: List API +nav_order: 45 +has_children: true +--- + +# List APIs +**Introduced 2.18** +{: .label .label-purple } + +The List API retrieves statistics about indexes and shards in a paginated format. This streamlines the task of processing responses that include many indexes. + +The List API supports two operations: + +- [List indices]({{site.url}}{{site.baseurl}}/api-reference/list/list-indices/) +- [List shards]({{site.url}}{{site.baseurl}}/api-reference/list/list-shards/) + +## Shared query parameters + +All List API operations support the following optional query parameters. + +Parameter | Description +:--- | :--- | +`v` | Provides verbose output by adding headers to the columns. It also adds some formatting to help align each of the columns. All examples in this section include the `v` parameter. +`help` | Lists the default and other available headers for a given operation. +`h` | Limits the output to specific headers. +`format` | The format in which to return the result. Valid values are `json`, `yaml`, `cbor`, and `smile`. +`s` | Sorts the output by the specified columns. + +## Examples + +The following examples show how to use the optional query parameters to customize all List API responses. + + +### Get verbose output + +To query indexes and their statistics with a verbose output that includes all column headings in the response, use the `v` query parameter, as shown in the following example. + +#### Request + +```json +GET _list/indices?v +``` +{% include copy-curl.html %} + +#### Response + +```json +health status index uuid pri rep docs.count docs.deleted +green open .kibana_1 - - - - +yellow open sample-index-1 - - - - +next_token null +``` + + +### Get all available headers + +To see all the available headers, use the `help` parameter with the following syntax: + +```json +GET _list/?help +``` +{% include copy-curl.html %} + +#### Request + +The following example list indices operation returns all the available headers: + +```json +GET _list/indices?help +``` +{% include copy-curl.html %} + +#### Response + +The following example displays the indexes and their health status in a table: + +```json +health | h | current health status +status | s | open/close status +index | i,idx | index name +uuid | id,uuid | index uuid +pri | p,shards.primary,shardsPrimary | number of primary shards +rep | r,shards.replica,shardsReplica | number of replica shards +docs.count | dc,docsCount | available docs +``` + +### Get a subset of headers + +To limit the output to a subset of headers, use the `h` parameter with the following syntax: + +```json +GET _list/?h=,&v +``` +{% include copy-curl.html %} + +For any operation, you can determine which headers are available by using the `help` parameter and then using the `h` parameter to limit the output to only a subset of headers. 
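These shared parameters can also be combined in a single request. The following sketch is an assumed but typical combination, using headers taken from the `help` output described above; it limits the columns, sorts the result, and returns it as JSON:

```json
GET _list/indices?h=health,index&s=index&format=json
```
{% include copy-curl.html %}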
+ +#### Request + +The following example limits the indexes in the response to only the index name and health status headers: + +```json +GET _list/indices?h=health,index +``` +{% include copy-curl.html %} + +### Response + +```json +green .kibana_1 +yellow sample-index-1 +next_token null +``` + + +### Sort by a header + +To sort the output on a single page by a header, use the `s` parameter with the following syntax: + +```json +GET _list/?s=, +``` +{% include copy-curl.html %} + +#### Request + +The following example request sorts indexes by index name: + +```json +GET _list/indices?s=h,i +``` +{% include copy-curl.html %} + +#### Response + +```json +green sample-index-2 +yellow sample-index-1 +next_token null +``` + +### Retrieve data in JSON format + +By default, List APIs return data in a `text/plain` format. Other supported formats are [YAML](https://yaml.org/), [CBOR](https://cbor.io/), and [Smile](https://github.com/FasterXML/smile-format-specification). + + +To retrieve data in the JSON format, use the `format=json` parameter with the following syntax. + +If you use the Security plugin, ensure you have the appropriate permissions. +{: .note } + +#### Request + +```json +GET _list/?format=json +``` +{% include copy-curl.html %} + +#### Request + +```json +GET _list/indices?format=json +``` +{% include copy-curl.html %} + +### Response + +The response contains data in JSON format: + +```json +{ + "next_token": null, + "indices": [ + { + "health": "green", + "status": "-", + "index": ".kibana_1", + "uuid": "-", + "pri": "-", + "rep": "-", + "docs.count": "-", + "docs.deleted": "-", + "store.size": "-", + "pri.store.size": "-" + }, + { + "health": "yellow", + "status": "-", + "index": "sample-index-1", + "uuid": "-", + "pri": "-", + "rep": "-", + "docs.count": "-", + "docs.deleted": "-", + "store.size": "-", + "pri.store.size": "-" + } + ] +} +``` + diff --git a/_api-reference/list/list-indices.md b/_api-reference/list/list-indices.md new file mode 100644 index 00000000000..db5a114ca9a --- /dev/null +++ b/_api-reference/list/list-indices.md @@ -0,0 +1,99 @@ +--- +layout: default +title: List indices +parent: List API +nav_order: 25 +has_children: false +--- + +# List indices +**Introduced 2.18** +{: .label .label-purple } + +The list indices operation provides the following index information in a paginated format: + +- The amount of disk space used by the index. +- The number of shards contained in the index. +- The index's health status. + +## Endpoints + +```json +GET _list/indices +GET _list/indices/ +``` + +## Query parameters + +Parameter | Type | Description +:--- | :--- | :--- +`bytes` | Byte size | Specifies the units for the byte size, for example, `7kb` or `6gb`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). +`health` | String | Limits indexes based on their health status. Supported values are `green`, `yellow`, and `red`. +`include_unloaded_segments` | Boolean | Whether to include information from segments not loaded into memory. Default is `false`. +`cluster_manager_timeout` | Time | The amount of time to wait for a connection to the cluster manager node. Default is `30s`. +`pri` | Boolean | Whether to return information only from the primary shards. Default is `false`. +`time` | Time | Specifies the time units, for example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). +`expand_wildcards` | Enum | Expands wildcard expressions to concrete indexes. 
Combine multiple values with commas. Supported values are `all`, `open`, `closed`, `hidden`, and `none`. Default is `open`. +`next_token` | String | Fetches the next page of indexes. When `null`, only provides the first page of indexes. Default is `null`. +`size` | Integer | The maximum number of indexes to be displayed on a single page. The number of indexes on a single page of the response is not always equal to the specified `size`. Default is `500`. Minimum is `1` and maximum value is `5000`. +`sort` | String | The order in which the indexes are displayed. If `desc`, then the most recently created indexes are displayed first. If `asc`, then the oldest indexes are displayed first. Default is `asc`. + +When using the `next_token` path parameter, use the token produced by the response to see the next page of indexes. After the API returns `null`, all indexes contained in the API have been returned. +{: .tip } + + +## Example requests + +To get information for all the indexes, use the following query and keep specifying the `next_token` as received from response until its `null`: + +```json +GET _list/indices/?v&next_token=token +``` + + +To limit the information to a specific index, add the index name after your query, as shown in the following example: + +```json +GET _list/indices/?v +``` +{% include copy-curl.html %} + +To get information about more than one index, separate the indexes with commas, as shown in the following example: + +```json +GET _list/indices/index1,index2,index3?v&next_token=token +``` +{% include copy-curl.html %} + + +## Example response + +**Plain text format** + +```json +health | status | index | uuid | pri | rep | docs.count | docs.deleted | store.size | pri.store.size +green | open | movies | UZbpfERBQ1-3GSH2bnM3sg | 1 | 1 | 1 | 0 | 7.7kb | 3.8kb +next_token MTcyOTE5NTQ5NjM5N3wub3BlbnNlYXJjaC1zYXAtbG9nLXR5cGVzLWNvbmZpZw== +``` + +**JSON format** + +```json +{ + "next_token": "MTcyOTE5NTQ5NjM5N3wub3BlbnNlYXJjaC1zYXAtbG9nLXR5cGVzLWNvbmZpZw==", + "indices": [ + { + "health": "green", + "status": "open", + "index": "movies", + "uuid": "UZbpfERBQ1-3GSH2bnM3sg", + "pri": "1", + "rep": "1", + "docs.count": "1", + "docs.deleted": "0", + "store.size": "7.7kb", + "pri.store.size": "3.8kb" + } + ] +} +``` diff --git a/_api-reference/list/list-shards.md b/_api-reference/list/list-shards.md new file mode 100644 index 00000000000..c98fa8c3287 --- /dev/null +++ b/_api-reference/list/list-shards.md @@ -0,0 +1,102 @@ +--- +layout: default +title: List shards +parent: List API +nav_order: 20 +--- + +# List shards +**Introduced 2.18** +{: .label .label-purple } + +The list shards operation outputs, in a paginated format, the state of all primary and replica shards and how they are distributed. + +## Endpoints + +```json +GET _list/shards +GET _list/shards/ +``` + +## Query parameters + +All parameters are optional. + +Parameter | Type | Description +:--- | :--- | :--- +`bytes` | Byte size | Specifies the byte size units, for example, `7kb` or `6gb`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). +`local` | Boolean | Whether to return information from the local node only instead of from the cluster manager node. Default is `false`. +`cluster_manager_timeout` | Time | The amount of time to wait for a connection to the cluster manager node. Default is `30s`. +`cancel_after_time_interval` | Time | The amount of time after which the shard request is canceled. Default is `-1` (no timeout). 
+`time` | Time | Specifies the time units, for example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). +`next_token` | String | Fetches the next page of indexes. When `null`, only provides the first page of indexes. Default is `null`. +`size` | Integer | The maximum number of indexes to be displayed on a single page. The number of indexes on a single page of the response is not always equal to the specified `size`. Default and minimum value is `2000`. Maximum value is `20000`. +`sort` | String | The order in which the indexes are displayed. If `desc`, then the most recently created indexes are displayed first. If `asc`, then the oldest indexes are displayed first. Default is `asc`. + +When using the `next_token` path parameter, use the token produced by the response to see the next page of indexes. After the API returns `null`, all indexes contained in the API have been returned. +{: .tip } + +## Example requests + +To get information for all the indexes and shards, use the following query and keep specifying the `next_token` as received from response until its `null`: + +```json +GET _list/shards/?v&next_token=token +``` + +To limit the information to a specific index, add the index name after your query, as shown in the following example and keep specifying the `next_token` as received from response until its `null`: + +```json +GET _list/shards/?v&next_token=token +``` +{% include copy-curl.html %} + +If you want to get information for more than one index, separate the indexes with commas, as shown in the following example: + +```json +GET _list/shards/index1,index2,index3?v&next_token=token +``` +{% include copy-curl.html %} + +## Example response + +**Plain text format** + +```json +index | shard | prirep | state | docs | store | ip | | node +plugins | 0 | p | STARTED | 0 | 208b | 172.18.0.4 | odfe-node1 +plugins | 0 | r | STARTED | 0 | 208b | 172.18.0.3 | odfe-node2 +.... +.... +next_token MTcyOTE5NTQ5NjM5N3wub3BlbnNlYXJjaC1zYXAtbG9nLXR5cGVzLWNvbmZpZw== +``` + +**JSON format** + +```json +{ + "next_token": "MTcyOTE5NTQ5NjM5N3wub3BlbnNlYXJjaC1zYXAtbG9nLXR5cGVzLWNvbmZpZw==", + "shards": [ + { + "index": "plugins", + "shard": "0", + "prirep": "p", + "state": "STARTED", + "docs": "0", + "store": "208B", + "ip": "172.18.0.4", + "node": "odfe-node1" + }, + { + "index": "plugins", + "shard": "0", + "prirep": "r", + "state": "STARTED", + "docs": "0", + "store": "208B", + "ip": "172.18.0.3", + "node": "odfe-node2" + } + ] +} +``` diff --git a/_api-reference/nodes-apis/nodes-hot-threads.md b/_api-reference/nodes-apis/nodes-hot-threads.md index 5339903d1e2..f3dd0a2cd66 100644 --- a/_api-reference/nodes-apis/nodes-hot-threads.md +++ b/_api-reference/nodes-apis/nodes-hot-threads.md @@ -12,7 +12,7 @@ nav_order: 30 The nodes hot threads endpoint provides information about busy JVM threads for selected cluster nodes. It provides a unique view of the of activity each node. 
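As a quick illustration of how this endpoint is commonly invoked (the parameter values shown here are illustrative, not requirements), the following request samples the three busiest CPU threads on each node over a 500 ms interval:

```json
GET /_nodes/hot_threads?threads=3&interval=500ms&type=cpu
```
{% include copy-curl.html %}

The response is returned as plain text rather than JSON, with one stack trace section per sampled thread.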
-## Path and HTTP methods +## Endpoints ```json GET /_nodes/hot_threads diff --git a/_api-reference/nodes-apis/nodes-info.md b/_api-reference/nodes-apis/nodes-info.md index a8767cafcec..e97f9ed5511 100644 --- a/_api-reference/nodes-apis/nodes-info.md +++ b/_api-reference/nodes-apis/nodes-info.md @@ -19,7 +19,7 @@ The nodes info API represents mostly static information about your cluster's nod - Installed plugins -## Path and HTTP methods +## Endpoints ```json GET /_nodes diff --git a/_api-reference/nodes-apis/nodes-reload-secure.md b/_api-reference/nodes-apis/nodes-reload-secure.md index 1ed0eabde4a..3609a50a852 100644 --- a/_api-reference/nodes-apis/nodes-reload-secure.md +++ b/_api-reference/nodes-apis/nodes-reload-secure.md @@ -11,7 +11,7 @@ nav_order: 50 The nodes reload secure settings endpoint allows you to change secure settings on a node and reload the secure settings without restarting the node. -## Path and HTTP methods +## Endpoints ```json POST _nodes/reload_secure_settings diff --git a/_api-reference/nodes-apis/nodes-stats.md b/_api-reference/nodes-apis/nodes-stats.md index 604b89969b6..f790165f377 100644 --- a/_api-reference/nodes-apis/nodes-stats.md +++ b/_api-reference/nodes-apis/nodes-stats.md @@ -11,7 +11,7 @@ nav_order: 20 The nodes stats API returns statistics about your cluster. -## Path and HTTP methods +## Endpoints ```json GET /_nodes/stats @@ -44,7 +44,7 @@ thread_pool | Statistics about each thread pool for the node. fs | File system statistics, such as read/write statistics, data path, and free disk space. transport | Transport layer statistics about send/receive in cluster communication. http | Statistics about the HTTP layer. -breakers | Statistics about the field data circuit breakers. +breaker | Statistics about the field data circuit breakers. script | Statistics about scripts, such as compilations and cache evictions. discovery | Statistics about cluster states. ingest | Statistics about ingest pipelines. @@ -206,6 +206,7 @@ Select the arrow to view the example response. "suggest_total": 0, "suggest_time_in_millis": 0, "suggest_current": 0, + "search_idle_reactivate_count_total": 0, "request" : { "dfs_pre_query" : { "time_in_millis" : 0, @@ -892,6 +893,7 @@ search.point_in_time_current | Integer | The number of shard PIT contexts curren search.suggest_total | Integer | The total number of shard suggest operations. search.suggest_time_in_millis | Integer | The total amount of time for all shard suggest operations, in milliseconds. search.suggest_current | Integer | The number of shard suggest operations that are currently running. +search.search_idle_reactivate_count_total | Integer | The total number of times that all shards have been activated from an idle state. search.request | Object | Statistics about coordinator search operations for the node. search.request.took.time_in_millis | Integer | The total amount of time taken for all search requests, in milliseconds. search.request.took.current | Integer | The number of search requests that are currently running. @@ -941,12 +943,12 @@ warmer.total | Integer | The total number of index warming operations. warmer.total_time_in_millis | Integer | The total time for all index warming operations, in milliseconds. query_cache | Statistics about query cache operations for the node. query_cache.memory_size_in_bytes | Integer | The amount of memory used for the query cache for all shards in the node. -query_cache.total_count | Integer | The total number of hits, misses, and cached queries in the query cache. 
+query_cache.total_count | Integer | The total number of hits and misses in the query cache. query_cache.hit_count | Integer | The total number of hits in the query cache. query_cache.miss_count | Integer | The total number of misses in the query cache. -query_cache.cache_size | Integer | The size of the query cache, in bytes. -query_cache.cache_count | Integer | The number of queries in the query cache. -query_cache.evictions | Integer | The number of evictions in the query cache. +query_cache.cache_size | Integer | The number of queries currently in the query cache. +query_cache.cache_count | Integer | The total number of queries that have been added to the query cache, including those that have since been evicted. +query_cache.evictions | Integer | The number of evictions from the query cache. fielddata | Object | Statistics about the field data cache for all shards in the node. fielddata.memory_size_in_bytes | Integer | The total amount of memory used for the field data cache for all shards in the node. fielddata.evictions | Integer | The number of evictions in the field data cache. @@ -1270,7 +1272,7 @@ Field | Field type | Description memory | Object | Statistics related to memory consumption for the indexing load. memory.current | Object | Statistics related to memory consumption for the current indexing load. memory.current.combined_coordinating_and_primary_in_bytes | Integer | The total memory used by indexing requests in the coordinating or primary stages, in bytes. A node can reuse the coordinating memory if the primary stage is run locally, so the total memory does not necessarily equal the sum of the coordinating and primary stage memory usage. -memory.current.coordinating_in_bytes | The total memory consumed by indexing requests in the coordinating stage, in bytes. +memory.current.coordinating_in_bytes | Integer | The total memory consumed by indexing requests in the coordinating stage, in bytes. memory.current.primary_in_bytes | Integer | The total memory consumed by indexing requests in the primary stage, in bytes. memory.current.replica_in_bytes | Integer | The total memory consumed by indexing requests in the replica stage, in bytes. memory.current.all_in_bytes | Integer | The total memory consumed by indexing requests in the coordinating, primary, or replica stages. diff --git a/_api-reference/nodes-apis/nodes-usage.md b/_api-reference/nodes-apis/nodes-usage.md index 1101b2989a8..f61544764cb 100644 --- a/_api-reference/nodes-apis/nodes-usage.md +++ b/_api-reference/nodes-apis/nodes-usage.md @@ -11,7 +11,7 @@ nav_order: 40 The nodes usage endpoint returns low-level information about REST action usage on nodes. -## Path and HTTP methods +## Endpoints ```json GET _nodes/usage diff --git a/_api-reference/script-apis/delete-script.md b/_api-reference/script-apis/delete-script.md index 22c2a3f3946..acf0f10fcb7 100644 --- a/_api-reference/script-apis/delete-script.md +++ b/_api-reference/script-apis/delete-script.md @@ -11,7 +11,7 @@ nav_order: 4 Deletes a stored script. -## Path and HTTP methods +## Endpoints ```json DELETE _scripts/my-script diff --git a/_api-reference/script-apis/exec-script.md b/_api-reference/script-apis/exec-script.md index cd31ad92f47..f91441be334 100644 --- a/_api-reference/script-apis/exec-script.md +++ b/_api-reference/script-apis/exec-script.md @@ -11,7 +11,7 @@ nav_order: 7 The Execute Painless script API allows you to run a script that is not stored. 
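To make this concrete, the following is a minimal sketch of running an unstored script; the arithmetic expression and the `count`/`total` parameter names are illustrative only:

```json
GET /_scripts/painless/_execute
{
  "script": {
    "source": "params.count / params.total",
    "params": {
      "count": 100.0,
      "total": 1000.0
    }
  }
}
```
{% include copy-curl.html %}

In the default execution context, a request like this typically returns the computed value in a `result` field, for example `"result": "0.1"`.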
-## Path and HTTP methods +## Endpoints ```json GET /_scripts/painless/_execute diff --git a/_api-reference/script-apis/exec-stored-script.md b/_api-reference/script-apis/exec-stored-script.md index 31102c23dd2..52db732ede6 100644 --- a/_api-reference/script-apis/exec-stored-script.md +++ b/_api-reference/script-apis/exec-stored-script.md @@ -13,7 +13,7 @@ Runs a stored script written in the Painless language. OpenSearch provides several ways to run a script; the following sections show how to run a script by passing script information in the request body of a `GET /_search` request. -## Path and HTTP methods +## Endpoints ```json GET books/_search diff --git a/_api-reference/script-apis/get-stored-script.md b/_api-reference/script-apis/get-stored-script.md index 341bfc046ee..5b1c3c75fd9 100644 --- a/_api-reference/script-apis/get-stored-script.md +++ b/_api-reference/script-apis/get-stored-script.md @@ -11,7 +11,7 @@ nav_order: 3 Retrieves a stored script. -## Path and HTTP methods +## Endpoints ```json GET _scripts/my-first-script diff --git a/_api-reference/count.md b/_api-reference/search-apis/count.md similarity index 94% rename from _api-reference/count.md rename to _api-reference/search-apis/count.md index 048dad96094..75d0900b58a 100644 --- a/_api-reference/count.md +++ b/_api-reference/search-apis/count.md @@ -1,9 +1,11 @@ --- layout: default title: Count -nav_order: 21 +parent: Search APIs +nav_order: 35 redirect_from: - /opensearch/rest-api/count/ + - /api-reference/count/ --- # Count @@ -14,7 +16,7 @@ The count API gives you quick access to the number of documents that match a que You can also use it to check the document count of an index, data stream, or cluster. -## Path and HTTP methods +## Endpoints ```json GET /_count/ @@ -39,7 +41,7 @@ Parameter | Type | Description `min_score` | Float | Include only documents with a minimum `_score` value in the result. `routing` | String | Value used to route the operation to a specific shard. `preference` | String | Specifies which shard or node OpenSearch should perform the count operation on. -`terminate_after` | Integer | The maximum number of documents OpenSearch should process before terminating the request. +`terminate_after` | Integer | The maximum number of matching documents (hits) OpenSearch should process before terminating the request. ## Example requests diff --git a/_api-reference/explain.md b/_api-reference/search-apis/explain.md similarity index 98% rename from _api-reference/explain.md rename to _api-reference/search-apis/explain.md index 0591c5bb52b..f5486a619db 100644 --- a/_api-reference/explain.md +++ b/_api-reference/search-apis/explain.md @@ -1,9 +1,11 @@ --- layout: default title: Explain -nav_order: 30 +parent: Search APIs +nav_order: 40 redirect_from: - /opensearch/rest-api/explain/ + - /api-reference/explain/ --- # Explain @@ -18,7 +20,7 @@ The explain API is an expensive operation in terms of both resources and time. 
O {: .warning } -## Path and HTTP methods +## Endpoints ```json GET /_explain/ diff --git a/_api-reference/search-apis/index.md b/_api-reference/search-apis/index.md new file mode 100644 index 00000000000..e29bfe46445 --- /dev/null +++ b/_api-reference/search-apis/index.md @@ -0,0 +1,42 @@ +--- +layout: default +title: Search APIs +nav_order: 75 +has_children: true +has_toc: false +--- + +# Search APIs +**Introduced 1.0** +{: .label .label-purple } + +OpenSearch provides a comprehensive suite of search-related APIs that allow you to perform various search operations, test and validate your searches, and work with search templates. OpenSearch supports the following Search APIs. + +## Core search APIs + +These APIs form the foundation of OpenSearch's search capabilities: + +- **[Search]({{site.url}}{{site.baseurl}}/api-reference/search-apis/search/)**: Run search queries across one or more indexes. +- **[Multi-search]({{site.url}}{{site.baseurl}}/api-reference/search-apis/multi-search/)**: Run multiple search requests in a single API call. +- **[Point in time]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/point-in-time-api/)**: Create a consistent view of the index for search operations. +- **[Scroll]({{site.url}}{{site.baseurl}}/api-reference/search-apis/scroll/)**: Retrieve large numbers of results from a search query. +- **[Count]({{site.url}}{{site.baseurl}}/api-reference/search-apis/count/)**: Get the number of documents that match a query. + +## Search testing APIs + +These APIs help you test, debug, and optimize your search operations: + +- **[Explain]({{site.url}}{{site.baseurl}}/api-reference/search-apis/explain/)**: Explain how a specific document matches (or doesn't match) a query. +- **Field capabilities**: Get the capabilities of fields across multiple indexes. +- **[Profile]({{site.url}}{{site.baseurl}}/api-reference/search-apis/profile/)**: Profile the execution of search requests. +- **[Ranking evaluation]({{site.url}}{{site.baseurl}}/api-reference/search-apis/rank-eval/)**: Evaluate the quality of search results. +- **[Search shards]({{site.url}}{{site.baseurl}}/api-reference/search-apis/search-shards/)**: Get information about the shards on which a search request would be executed. +- **[Validate]({{site.url}}{{site.baseurl}}/api-reference/search-apis/validate/)**: Validate a potentially expensive query before executing it. + +## Search template APIs + +These APIs allow you to work with search templates: + +- **[Search template]({{site.url}}{{site.baseurl}}/api-reference/search-apis/search-template/)**: Use search templates to run parameterized search queries. +- **[Multi-search template]({{site.url}}{{site.baseurl}}/api-reference/search-apis/search-template/msearch-template/)**: Execute multiple search template requests in a single API call. +- **[Render template]({{site.url}}{{site.baseurl}}/api-reference/search-apis/search-template/render-template/)**: Previews the final query generated from a search template by substituting parameters without executing the search. 
\ No newline at end of file diff --git a/_api-reference/multi-search.md b/_api-reference/search-apis/multi-search.md similarity index 98% rename from _api-reference/multi-search.md rename to _api-reference/search-apis/multi-search.md index d8ac41ecece..cc03ddecf9e 100644 --- a/_api-reference/multi-search.md +++ b/_api-reference/search-apis/multi-search.md @@ -1,9 +1,11 @@ --- layout: default title: Multi-search -nav_order: 45 +parent: Search APIs +nav_order: 20 redirect_from: - /opensearch/rest-api/multi-search/ + - /api-reference/multi-search/ --- # Multi-search @@ -13,7 +15,7 @@ redirect_from: As the name suggests, the multi-search operation lets you bundle multiple search requests into a single request. OpenSearch then executes the searches in parallel, so you get back the response more quickly compared to sending one request per search. OpenSearch executes each search independently, so the failure of one doesn't affect the others. -## Path and HTTP methods +## Endpoints The Multi-search API uses the following paths: @@ -32,7 +34,7 @@ All parameters are optional. Some can also be applied per-search as part of each Parameter | Type | Description | Supported in metadata line :--- | :--- | :--- allow_no_indices | Boolean | Whether to ignore wildcards that don't match any indexes. Default is `true`. | Yes -cancel_after_time_interval | Time | The time after which the search request will be canceled. Supported at both parent and child request levels. The order of precedence is:
1. Child-level parameter <br> 2. Parent-level parameter <br> 3. [Cluster setting]({{site.url}}{{site.baseurl}}/api-reference/cluster-settings). <br> Default is -1. | Yes +cancel_after_time_interval | Time | The time after which the search request will be canceled. Supported at both parent and child request levels. The order of precedence is: <br> 1. Child-level parameter <br> 2. Parent-level parameter <br> 3. [Cluster settings]({{site.url}}{{site.baseurl}}/api-reference/cluster-settings/). <br>
Default is -1. | Yes css_minimize_roundtrips | Boolean | Whether OpenSearch should try to minimize the number of network round trips between the coordinating node and remote clusters (only applicable to cross-cluster search requests). Default is `true`. | No expand_wildcards | Enum | Expands wildcard expressions to concrete indexes. Combine multiple values with commas. Supported values are `all`, `open`, `closed`, `hidden`, and `none`. Default is `open`. | Yes ignore_unavailable | Boolean | If an index or shard from the indexes list doesn’t exist, whether to ignore it rather than fail the query. Default is `false`. | Yes diff --git a/_api-reference/profile.md b/_api-reference/search-apis/profile.md similarity index 99% rename from _api-reference/profile.md rename to _api-reference/search-apis/profile.md index e54ffabe15e..69acd67cbd3 100644 --- a/_api-reference/profile.md +++ b/_api-reference/search-apis/profile.md @@ -1,7 +1,10 @@ --- layout: default title: Profile +parent: Search APIs nav_order: 55 +redirect_from: + - /api-reference/profile/ --- # Profile @@ -18,7 +21,7 @@ The Profile API provides timing information about the execution of individual co The Profile API is a resource-consuming operation that adds overhead to search operations. {: .warning} -## Path and HTTP methods +## Endpoints ```json GET /testindex/_search diff --git a/_api-reference/rank-eval.md b/_api-reference/search-apis/rank-eval.md similarity index 97% rename from _api-reference/rank-eval.md rename to _api-reference/search-apis/rank-eval.md index 61c80be592f..e49ca4d5ae5 100644 --- a/_api-reference/rank-eval.md +++ b/_api-reference/search-apis/rank-eval.md @@ -1,7 +1,10 @@ --- layout: default title: Ranking evaluation +parent: Search APIs nav_order: 60 +redirect_from: + - /api-reference/rank-eval/ --- # Ranking evaluation @@ -10,7 +13,7 @@ nav_order: 60 The [rank]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/rank/) eval endpoint allows you to evaluate the quality of ranked search results. -## Path and HTTP methods +## Endpoints ```json GET /_rank_eval @@ -32,7 +35,7 @@ search_type | String | Set search type to either `query_then_fetch` or `dfs_quer The request body must contain at least one parameter. -Field Type | Description +Field type | Description :--- | :--- id | Document or template ID. requests | Set multiple search requests within the request field section. diff --git a/_api-reference/scroll.md b/_api-reference/search-apis/scroll.md similarity index 97% rename from _api-reference/scroll.md rename to _api-reference/search-apis/scroll.md index 770697d4e41..f157ff661ef 100644 --- a/_api-reference/scroll.md +++ b/_api-reference/search-apis/scroll.md @@ -1,9 +1,11 @@ --- layout: default title: Scroll -nav_order: 71 +parent: Search APIs +nav_order: 30 redirect_from: - /opensearch/rest-api/scroll/ + - /api-reference/scroll/ --- # Scroll @@ -17,7 +19,7 @@ To use the `scroll` operation, add a `scroll` parameter to the request header wi Because search contexts consume a lot of memory, we suggest you don't use the `scroll` operation for frequent user queries. Instead, use the `sort` parameter with the `search_after` parameter to scroll responses for user queries. 
{: .note } -## Path and HTTP methods +## Endpoints ```json GET _search/scroll diff --git a/_api-reference/search-apis/search-shards.md b/_api-reference/search-apis/search-shards.md new file mode 100644 index 00000000000..af1dde298d1 --- /dev/null +++ b/_api-reference/search-apis/search-shards.md @@ -0,0 +1,169 @@ +--- +layout: default +title: Search shards +parent: Search APIs +nav_order: 85 +--- + +# Search shards API + +The `_search_shards` API provides information about which shards OpenSearch would route a search request to if the request were executed. This helps you understand how OpenSearch plans to distribute the query across shards without actually running the search. This API does not execute the search but allows you to inspect routing decisions, shard distribution, and the nodes that would handle the request. + +## Endpoints + +```json +GET /_search_shards +GET //_search_shards +POST /_search_shards +POST //_search_shards +``` + +## Path parameters + +The following table lists the available path parameters. All path parameters are optional. + +| Parameter | Date type | Description | +| --------- | ------ | ------------------------------------------------------ | +| `` | String | A comma-separated list of target index names. | + +## Query parameters + +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | +| `allow_no_indices` | Boolean | If `true`, the request does not fail if a wildcard expression or index alias resolves to no concrete indexes. Default is `true`. | +| `expand_wildcards` | String | Controls how wildcard expressions are expanded. Options are: `open` (default), `closed`, `hidden`, `none`, or `all`. | +| `ignore_unavailable` | Boolean | If `true`, missing or closed indexes are ignored. Default is `false`. | +| `local` | Boolean | If `true`, the operation is performed only on the local node, without retrieving the state from the cluster manager node. Default is `false`. | +| `preference` | String | Specifies a preference in selecting which shards or nodes to target. See [The `preference` query parameter]({{site.url}}{{site.baseurl}}/api-reference/search-apis/search/#the-preference-query-parameter) for more information. | +| `routing` | String | A comma-separated list of specific routing values used for shard selection. 
| + + +## Request body fields + +The request body can include a full search query to simulate how the request would be routed: + +```json +{ + "query": { + "term": { + "user": "alice" + } + } +} +``` + +## Example + +Create an index: + +```json +PUT /logs-demo +{ + "settings": { + "number_of_shards": 3, + "number_of_replicas": 0 + }, + "mappings": { + "properties": { + "user": { "type": "keyword" }, + "message": { "type": "text" }, + "@timestamp": { "type": "date" } + } + } +} +``` +{% include copy-curl.html %} + +Index the first document with `routing=user1`: + +```json +POST /logs-demo/_doc?routing=user1 +{ + "@timestamp": "2025-05-23T10:00:00Z", + "user": "user1", + "message": "User login successful" +} +``` +{% include copy-curl.html %} + +Index the second document with `routing=user2`: + +```json +POST /logs-demo/_doc?routing=user2 +{ + "@timestamp": "2025-05-23T10:01:00Z", + "user": "user2", + "message": "User login failed" +} +``` +{% include copy-curl.html %} + +### Example request + +Simulate routing with `_search_shards`: + +```json +POST /logs-demo/_search_shards?routing=user1 +{ + "query": { + "term": { + "user": "user1" + } + } +} +``` +{% include copy-curl.html %} + + +### Example response + +The response displays the node and shard that would be searched if the search were executed: + +```json +{ + "nodes": { + "12ljrWLsQyiWHLzhFZgL9Q": { + "name": "opensearch-node3", + "ephemeral_id": "-JPvYKPMSGubd0VmSEzlbw", + "transport_address": "172.18.0.4:9300", + "attributes": { + "shard_indexing_pressure_enabled": "true" + } + } + }, + "indices": { + "logs-demo": {} + }, + "shards": [ + [ + { + "state": "STARTED", + "primary": true, + "node": "12ljrWLsQyiWHLzhFZgL9Q", + "relocating_node": null, + "shard": 1, + "index": "logs-demo", + "allocation_id": { + "id": "HwEjTdYQQJuULdQn10FRBw" + } + } + ] + ] +} +``` + +## Response body fields + +The following table lists all response body fields. + +| Field | Date type | Description | +| `nodes` | Object | Contains a map of node IDs to node metadata, such as name and transport address. | +| `indices` | Object | Contains a map of index names included in the request. | +| `shards` | Array of arrays | Nested arrays representing shard copies (primary/replica) for the request. | +| `shards.index` | String | The index name. | +| `shards.shard` | Integer | The shard number. | +| `shards.node` | String | The node ID of the node containing this shard. | +| `shards.primary` | Boolean | Whether this is a primary shard. | +| `shards.state` | String | The current shard state. | +| `shards.allocation_id.id` | String | A unique ID for this shard allocation. | diff --git a/_api-reference/search-template.md b/_api-reference/search-apis/search-template/index.md similarity index 86% rename from _api-reference/search-template.md rename to _api-reference/search-apis/search-template/index.md index 3dcaf3f5987..7c14248bc0a 100644 --- a/_api-reference/search-template.md +++ b/_api-reference/search-apis/search-template/index.md @@ -1,10 +1,14 @@ --- layout: default title: Search templates -nav_order: 80 +parent: Search APIs +has_children: true +nav_order: 90 redirect_from: - /opensearch/search-template/ - /search-plugins/search-template/ + - /api-reference/search-template/ + - /api-reference/search-apis/search-template/ --- # Search templates @@ -41,13 +45,14 @@ GET _search/template } } ``` +{% include copy-curl.html %} -This template runs the search on your entire cluster. 
-To run this search on a specific index, add the index name to the request: +This template runs the search on your entire cluster. To run this search on a specific index, add the index name to the request: ```json GET shakespeare/_search/template ``` +{% include copy-curl.html %} Specify the `from` and `size` parameters: @@ -70,14 +75,16 @@ GET _search/template } } ``` +{% include copy-curl.html %} To improve the search experience, you can define defaults so the user doesn’t have to specify every possible parameter. If the parameter is not defined in the `params` section, OpenSearch uses the default value. The syntax for defining the default value for a variable `var` is as follows: -```json +``` {% raw %}{{var}}{{^var}}default value{{/var}}{% endraw %} ``` +{% include copy.html %} This command sets the defaults for `from` as 10 and `size` as 10: @@ -98,6 +105,7 @@ GET _search/template } } ``` +{% include copy-curl.html %} ## Save and execute search templates @@ -116,7 +124,7 @@ POST _scripts/play_search_template "size": "{% raw %}{{size}}{{^size}}10{{/size}}{% endraw %}", "query": { "match": { - "play_name": "{{play_name}}" + "play_name": "{% raw %}{{play_name}}{% endraw %}" } } }, @@ -127,8 +135,7 @@ POST _scripts/play_search_template } ``` -Now you can reuse the template by referring to its `id` parameter. -You can reuse this source template for different input values. +Now you can reuse the template by referring to its `id` parameter. You can reuse this source template for different input values: ```json GET _search/template @@ -141,7 +148,9 @@ GET _search/template } } ``` -#### Sample output +{% include copy-curl.html %} + +## Example response ```json { @@ -191,31 +200,9 @@ POST _render/template } } ``` +{% include copy-curl.html %} -#### Sample output - -```json -{ - "template_output": { - "from": "0", - "size": "10", - "query": { - "match": { - "play_name": "Henry IV" - } - } - } -} -``` - -The following render operations are supported: - -```json -GET /_render/template -POST /_render/template -GET /_render/template/ -POST /_render/template/ -``` +For more information, see [Render Template API]({{site.url}}{{site.baseurl}}/api-reference/search-apis/search-template/render-template/). ## Advanced parameter conversion with search templates @@ -229,8 +216,9 @@ Use the section tag in Mustache to represent conditions: ```json {% raw %}{{#var}}var{{/var}}{% endraw %} ``` +{% include copy.html %} -When `var` is a boolean value, this syntax acts as an `if` condition. The `{% raw %}{{#var}}{% endraw %}` and `{% raw %}{{/var}}{% endraw %}` tags insert the values placed between them only if `var` evaluates to `true`. +When `var` is a Boolean value, this syntax acts as an `if` condition. The `{% raw %}{{#var}}{% endraw %}` and `{% raw %}{{/var}}{% endraw %}` tags insert the values placed between them only if `var` evaluates to `true`. Using section tags would make your JSON invalid, so you must write your query in a string format instead. @@ -248,9 +236,9 @@ GET _search/template } } ``` +{% include copy-curl.html %} -You can also design an `if-else` condition. -This command sets `size` to `2` if `limit` is `true`. Otherwise, it sets `size` to `10`. +You can also design an `if-else` condition. This command sets `size` to `2` if `limit` is `true`. 
Otherwise, it sets `size` to `10`: ```json GET _search/template @@ -262,14 +250,16 @@ GET _search/template } } ``` +{% include copy-curl.html %} ### Loops -You can also use the section tag to implement a for each loop: +You can also use the section tag to implement a for-each loop: ``` -{% raw %}{{#var}}{{.}}}{{/var}}{% endraw %} +{% raw %}{{#var}}{{.}}{{/var}}{% endraw %} ``` +{% include copy.html %} When `var` is an array, the search template iterates through it and creates a `terms` query. @@ -285,6 +275,7 @@ GET _search/template } } ``` +{% include copy-curl.html %} This template is rendered as: @@ -326,6 +317,7 @@ GET _search/template } } ``` +{% include copy-curl.html %} Renders as: @@ -358,6 +350,7 @@ GET _search/template } } ``` +{% include copy-curl.html %} Renders as: @@ -404,6 +397,8 @@ GET _msearch/template {"id":"play_search_template","params":{"play_name":"Henry IV"}} ``` +For more information, see [Multi-search Template API]({{site.url}}{{site.baseurl}}/api-reference/search-apis/msearch-template/). + ## Manage search templates To list all scripts, run the following command: @@ -411,17 +406,18 @@ To list all scripts, run the following command: ```json GET _cluster/state/metadata?pretty&filter_path=**.stored_scripts ``` +{% include copy-curl.html %} To retrieve a specific search template, run the following command: ```json GET _scripts/ ``` +{% include copy-curl.html %} To delete a search template, run the following command: ```json DELETE _scripts/ ``` - ---- +{% include copy-curl.html %} diff --git a/_api-reference/msearch-template.md b/_api-reference/search-apis/search-template/msearch-template.md similarity index 96% rename from _api-reference/msearch-template.md rename to _api-reference/search-apis/search-template/msearch-template.md index fdebf5bed1f..228f611a19b 100644 --- a/_api-reference/msearch-template.md +++ b/_api-reference/search-apis/search-template/msearch-template.md @@ -1,17 +1,22 @@ --- layout: default -title: Multi-search Template -nav_order: 47 +title: Multi-search template +parent: Search templates +grand_parent: Search APIs +nav_order: 20 +redirect_from: + - /api-reference/msearch-template/ + - /api-reference/search-apis/msearch-template/ --- -# Multi-search Template +# Multi-search template **Introduced 1.0** {: .label .label-purple } The Multi-search Template API runs multiple search template requests in a single API request. -## Path and HTTP methods +## Endpoints The Multi-search Template API uses the following paths: diff --git a/_api-reference/render-template.md b/_api-reference/search-apis/search-template/render-template.md similarity index 77% rename from _api-reference/render-template.md rename to _api-reference/search-apis/search-template/render-template.md index db2caa9cef6..b775c5116ed 100644 --- a/_api-reference/render-template.md +++ b/_api-reference/search-apis/search-template/render-template.md @@ -1,14 +1,19 @@ --- layout: default -title: Render Template -nav_order: 82 +title: Render template +parent: Search templates +grand_parent: Search APIs +nav_order: 10 +redirect_from: + - /api-reference/render-template/ + - /api-reference/search-apis/render-template/ --- -# Render Template +# Render template -The Render Template API renders a [search template]({{site.url}}{{site.baseurl}}/search-plugins/search-template/) as a search query. +The Render Template API previews the final query generated from a [search template]({{site.url}}{{site.baseurl}}/search-plugins/search-template/) by substituting parameters without executing the search. 
-## Paths and HTTP methods +## Endpoints ```json GET /_render/template @@ -19,17 +24,17 @@ POST /_render/template/ ## Path parameters -The Render Template API supports the following optional path parameter. +The following table lists the available path parameters. All path parameters are optional. -| Parameter | Type | Description | +| Parameter | Data type | Description | | :--- | :--- | :--- | | `id` | String | The ID of the search template to render. | ## Request body fields -The following options are supported in the request body of the Render Template API. +The following table lists the available request body fields. -| Parameter | Required | Type | Description | +| Parameter | Required | Data type | Description | | :--- | :--- | :--- | :--- | | `id` | Conditional | String | The ID of the search template to render. Is not required if the ID is provided in the path or if an inline template is specified by the `source`. | | `params` | No | Object | A list of key-value pairs that replace Mustache variables found in the search template. The key-value pairs must exist in the documents being searched. | @@ -53,6 +58,7 @@ Both of the following request examples use the search template with the template } } ``` +{% include copy.html %} ### Render template using template ID @@ -67,13 +73,13 @@ POST _render/template } } ``` -{% include copy.html %} +{% include copy-curl.html %} ### Render template using `_source` If you don't want to use a saved template, or want to test a template before saving, you can test a template with the `_source` parameter using [Mustache](https://mustache.github.io/mustache.5.html) variables, as shown in the following example: -``` +```json { "source": { "from": "{% raw %}{{from}}{{^from}}0{{/from}}{% endraw %}", @@ -107,8 +113,4 @@ OpenSearch responds with information about the template's output: } } } -``` - - - - +``` \ No newline at end of file diff --git a/_api-reference/search.md b/_api-reference/search-apis/search.md similarity index 92% rename from _api-reference/search.md rename to _api-reference/search-apis/search.md index df6992912e1..7223714d618 100644 --- a/_api-reference/search.md +++ b/_api-reference/search-apis/search.md @@ -1,9 +1,11 @@ --- layout: default title: Search -nav_order: 75 +parent: Search APIs +nav_order: 10 redirect_from: - /opensearch/rest-api/search/ + - /api-reference/search/ --- # Search @@ -12,7 +14,7 @@ redirect_from: The Search API operation lets you execute a search request to search your cluster for data. -## Path and HTTP Methods +## Endpoints ```json GET //_search @@ -33,7 +35,7 @@ allow_partial_search_results | Boolean | Whether to return partial results if th analyzer | String | Analyzer to use in the query string. analyze_wildcard | Boolean | Whether the update operation should include wildcard and prefix queries in the analysis. Default is `false`. batched_reduce_size | Integer | How many shard results to reduce on a node. Default is 512. -cancel_after_time_interval | Time | The time after which the search request will be canceled. Request-level parameter takes precedence over cancel_after_time_interval [cluster setting]({{site.url}}{{site.baseurl}}/api-reference/cluster-settings). Default is -1. +cancel_after_time_interval | Time | The time after which the search request will be canceled. Request-level parameter takes precedence over cancel_after_time_interval [cluster setting]({{site.url}}{{site.baseurl}}/api-reference/cluster-settings/). Default is -1. 
ccs_minimize_roundtrips | Boolean | Whether to minimize roundtrips between a node and remote clusters. Default is `true`. default_operator | String | Indicates whether the default operator for a string query should be AND or OR. Default is OR. df | String | The default field in case a field prefix is not provided in the query string. @@ -66,7 +68,7 @@ suggest_field | String | Fields OpenSearch can use to look for similar terms. suggest_mode | String | The mode to use when searching. Available options are `always` (use suggestions based on the provided terms), `popular` (use suggestions that have more occurrences), and `missing` (use suggestions for terms not in the index). suggest_size | Integer | How many suggestions to return. suggest_text | String | The source that suggestions should be based off of. -terminate_after | Integer | The maximum number of documents OpenSearch should process before terminating the request. Default is 0. +terminate_after | Integer | The maximum number of matching documents (hits) OpenSearch should process before terminating the request. Default is 0. timeout | Time | How long the operation should wait for a response from active shards. Default is `1m`. track_scores | Boolean | Whether to return document scores. Default is `false`. track_total_hits | Boolean or Integer | Whether to return how many documents matched the query. @@ -101,12 +103,12 @@ explain | String | Whether to return details about how OpenSearch computed the d from | Integer | The starting index to search from. Default is 0. indices_boost | Array of objects | Values used to boost the score of specified indexes. Specify in the format of <index> : <boost-multiplier> min_score | Integer | Specify a score threshold to return only documents above the threshold. -query | Object | The [DSL query]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index) to use in the request. +query | Object | The [DSL query]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index/) to use in the request. seq_no_primary_term | Boolean | Whether to return sequence number and primary term of the last operation of each document hit. size | Integer | How many results to return. Default is 10. _source | | Whether to include the `_source` field in the response. stats | String | Value to associate with the request for additional logging. -terminate_after | Integer | The maximum number of documents OpenSearch should process before terminating the request. Default is 0. +terminate_after | Integer | The maximum number of matching documents (hits) OpenSearch should process before terminating the request. Default is 0. timeout | Time | How long to wait for a response. Default is no timeout. version | Boolean | Whether to include the document version in the response. @@ -180,7 +182,7 @@ GET /movies/_search ## The `ext` object -Starting with OpenSearch 2.10, plugin authors can add an `ext` object to the search response. The purpose of the `ext` object is to contain plugin-specific response fields. For example, in conversational search, the result of Retrieval Augmented Generation (RAG) is a single "hit" (answer). Plugin authors can include this answer in the search response as part of the `ext` object so that it is separate from the search hits. In the following example response, the RAG result is in the `ext.retrieval_augmented_generation.answer` field: +Starting with OpenSearch 2.10, plugin authors can add an `ext` object to the search response. The `ext` object contains plugin-specific response fields. 
For example, in conversational search, the result of retrieval-augmented generation (RAG) is a single "hit" (answer). Plugin authors can include this answer in the search response as part of the `ext` object so that it is separate from the search hits. In the following example response, the RAG result is in the `ext.retrieval_augmented_generation.answer` field: ```json { diff --git a/_api-reference/validate.md b/_api-reference/search-apis/validate.md similarity index 98% rename from _api-reference/validate.md rename to _api-reference/search-apis/validate.md index 6e1470a505c..eb4de7d80fe 100644 --- a/_api-reference/validate.md +++ b/_api-reference/search-apis/validate.md @@ -1,14 +1,17 @@ --- layout: default -title: Validate Query +title: Validate query nav_order: 87 +parent: Search APIs +redirect_from: + - /api-reference/validate/ --- -# Validate Query +# Validate query You can use the Validate Query API to validate a query without running it. The query can be sent as a path parameter or included in the request body. -## Path and HTTP methods +## Endpoints The Validate Query API contains the following path: diff --git a/_api-reference/security-apis.md b/_api-reference/security-apis.md deleted file mode 100644 index db3334fb0e5..00000000000 --- a/_api-reference/security-apis.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -layout: default -title: Security APIs -nav_order: 84 ---- - -# Security APIs - -Security APIs provide information that can be very useful in troubleshooting connection and configuration issues. - -API | Method | Description -:--- | :--- | :--- -`/_plugins/_security/whoami` | GET/POST | Returns basic details about the logged-in user. -`/_opendistro/_security/sslinfo` | GET | Returns details about the SSL connection when using certificate authentication. -`/_plugins/_security/api/permissionsinfo` | GET | Returns permission details for the logged-in user. -`/_plugins/_security/authinfo` | GET/POST | Returns the backend roles and OpenSearch roles mapped to the logged-in user. -`/_plugins/_security/api/ssl/certs` | GET | Displays the details and expiration dates of the certificates used on the OpenSearch HTTP and transport communication layers. Can only be called by users with the `superadmin` certificate. -`/_plugins/_security/api/ssl/transport/reloadcerts` | PUT | Reloads the certificates on the `transport` layer. For more information, see [Reload TLS certificates on the transport layer]({{site.url}}{{site.baseurl}}/security/configuration/tls/#reload-tls-certificates-on-the-transport-layer). -`/_plugins/_security/api/ssl/http/reloadcerts` | PUT | Reloads the certificates on the `http` layer. For more information, see [Reload TLS certificates on the http layer]({{site.url}}{{site.baseurl}}/security/configuration/tls/#reload-tls-certificates-on-the-http-layer). - diff --git a/_api-reference/security/authentication/auth-info.md b/_api-reference/security/authentication/auth-info.md new file mode 100644 index 00000000000..bd1b05fac4c --- /dev/null +++ b/_api-reference/security/authentication/auth-info.md @@ -0,0 +1,134 @@ +--- +layout: default +title: Authentication Information API +grand_parent: Security APIs +parent: Authentication APIs +nav_order: 10 +--- + +# Authentication Information API +**Introduced 1.0** +{: .label .label-purple } + +The Authentication Information API returns information about the currently authenticated user. This includes the user's name, roles, backend roles, custom attributes, and tenant memberships. 
This API is useful for debugging authentication issues, verifying user permissions, and building applications that need to understand the current user's access levels. + + +## Endpoints +```json +GET /_plugins/_security/authinfo +POST /_plugins/_security/authinfo +``` + + + +## Query parameters + +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `auth_type` | String | The type of the current authentication request. | +| `verbose` | Boolean | Whether to return a verbose response. | + + + +## Example request + +The following example request retrieves authentication information for the currently authenticated user: + +```bash +GET /_plugins/_security/authinfo +``` +{% include copy-curl.html %} + +To get verbose information: + +```bash +GET /_plugins/_security/authinfo?verbose=true +``` +{% include copy-curl.html %} + +## Example response + +```json +{ + "user": "User [name=admin, backend_roles=[admin], requestedTenant=null]", + "user_name": "admin", + "backend_roles": [ + "admin" + ], + "roles": [ + "all_access", + "security_rest_api_access" + ], + "tenants": { + "admin": true, + "global_tenant": true + }, + "principal": null, + "peer_certificates": "0", + "sso_logout_url": null, + "remote_address": "127.0.0.1:54013" +} +``` + +For a verbose response, additional fields are included: + +```json +{ + "user": "User [name=admin, backend_roles=[admin], requestedTenant=null]", + "user_name": "admin", + "backend_roles": [ + "admin" + ], + "custom_attribute_names": [], + "roles": [ + "all_access", + "security_rest_api_access" + ], + "tenants": { + "admin": true, + "global_tenant": true + }, + "principal": null, + "peer_certificates": "0", + "sso_logout_url": null, + "remote_address": "127.0.0.1:54013", + "size_of_user": "115", + "size_of_backendroles": "28", + "size_of_custom_attributes": "2", + "user_requested_tenant": null +} +``` + +## Response body fields + +The response body is a JSON object with the following fields. + +| Property | Data type | Description | +| :--- | :--- | :--- | +| `user` | String | A string representation of the user object, including the username and backend roles. | +| `user_name` | String | The username of the authenticated user. | +| `backend_roles` | Array of strings | The backend roles associated with the user, typically obtained from an external authentication system. | +| `roles` | Array of strings | The OpenSearch Security roles assigned to the user, determining their permissions. | +| `tenants` | Object | The tenants the user has access to, with `true` indicating read-write access and `false` indicating read-only access. | +| `principal` | String | The user's authentication principal, if available. | +| `peer_certificates` | String | The number of peer certificates related to the user's authentication. | +| `sso_logout_url` | String | The logout URL for single sign-on (SSO) authentication, if applicable. | +| `remote_address` | String | The IP address and port of the client making the request. | + +When requesting a verbose response, the following additional fields are included. + +| Property | Data type | Description | +| :--- | :--- | :--- | +| `custom_attribute_names` | Array of strings | The names of any custom attributes associated with the user. | +| `size_of_user` | String | The size of the user object in memory, in bytes. | +| `size_of_backendroles` | String | The size of the user's backend roles, in bytes. 
| +| `size_of_custom_attributes` | String | The size of the user's custom attributes, in bytes. | +| `user_requested_tenant` | String | The name of the tenant the user has requested to switch to, if any. | \ No newline at end of file diff --git a/_api-reference/security/authentication/change-password.md b/_api-reference/security/authentication/change-password.md new file mode 100644 index 00000000000..eabb594a8fe --- /dev/null +++ b/_api-reference/security/authentication/change-password.md @@ -0,0 +1,90 @@ +--- +layout: default +title: Change Password API +grand_parent: Security APIs +parent: Authentication APIs +nav_order: 30 +--- + +# Change Password API +**Introduced 1.0** +{: .label .label-purple } + +The Change Password API allows users to update their own passwords. Users must provide their current password for verification before the password change is allowed. + + +## Endpoints +```json +PUT /_plugins/_security/api/account +``` + + + + +## Request body fields + +The request body is __required__. It is a JSON object with the following fields. + +| Property | Required | Data type | Description | +| :--- | :--- | :--- | :--- | +| `current_password` | **Required** | String | The current password. | +| `password` | **Required** | String | The new password to set. | + + + +## Example request + +```json +PUT /_plugins/_security/api/account +{ + "current_password": "old-secure-password", + "password": "new-secure-password" +} +``` +{% include copy-curl.html %} + +## Example response + +A successful response indicates that the password has been changed: + +```json +{ + "status": "OK", + "message": "Password changed" +} +``` + +If the current password is incorrect, the API returns an error: + +```json +{ + "status": "UNAUTHORIZED", + "message": "Invalid credentials" +} +``` + +## Response body fields + +The response body is a JSON object with the following fields. + +| Property | Data type | Description | +| :--- | :--- | :--- | +| `status` | String | The status of the request. A successful request returns "OK". | +| `message` | String | A message describing the result of the operation. | + +## Password best practices + +Proper password management is essential for securing your OpenSearch cluster. When using this API to change a password, keep the following guidelines in mind: + +- You can only use this API to change the password of the currently authenticated user. +- Make sure the new password meets any configured password policies. +- Existing authentication tokens remain valid until they expire, even after the password changes. +- Use strong passwords that include a mix of uppercase and lowercase letters, numbers, and special characters. + +To enhance security, use a password manager to generate and store complex passwords. Incorporate regular password rotation into your organization's security policy to help protect against unauthorized access. diff --git a/_api-reference/security/authentication/index.md b/_api-reference/security/authentication/index.md new file mode 100644 index 00000000000..132165a0b06 --- /dev/null +++ b/_api-reference/security/authentication/index.md @@ -0,0 +1,38 @@ +--- +layout: default +title: Authentication APIs +parent: Security APIs +has_children: true +nav_order: 10 +--- + +# Authentication APIs + +Authentication is a fundamental aspect of security in OpenSearch, verifying the identity of users and services before granting access to protected resources. 
The Security plugin provides several APIs you can use to manage authentication, obtain user information, and handle credentials. + +## Available APIs + +The Authentication APIs include the following operations: + +- [Authentication Information API]({{site.url}}{{site.baseurl}}/api-reference/security/authentication/auth-info/): Returns information about the currently authenticated user, including roles, backend roles, and tenant memberships. Useful for debugging authentication issues, verifying permissions, and retrieving user context for applications. + +- [Change Password API]({{site.url}}{{site.baseurl}}/api-reference/security/authentication/change-password/): Lets users update their own passwords securely. Requires verification of the current password and does not require administrator involvement. + +## Authentication workflows + +These APIs support the following common authentication workflows: + +- **User verification**: Confirm a user's identity and permissions before executing sensitive operations. +- **Self-service password management**: Allow users to change their passwords independently. +- **Multi-tenant access**: Determine a user's accessible tenants and associated permissions. + +## Authentication methods + +The Security plugin supports [multiple authentication methods]({{site.url}}{{site.baseurl}}/security/authentication-backends/authc-index/), including: + +- Basic authentication (username and password). +- Token-based authentication. +- Certificate-based authentication. +- Single sign-on (SSO) methods, such as SAML and OpenID Connect. + +The APIs in this section are compatible with all supported authentication methods and offer a consistent interface for managing authentication in OpenSearch. diff --git a/_api-reference/security/configuration/get-configuration.md b/_api-reference/security/configuration/get-configuration.md new file mode 100644 index 00000000000..6fd3236a012 --- /dev/null +++ b/_api-reference/security/configuration/get-configuration.md @@ -0,0 +1,115 @@ +--- +layout: default +title: Get Configuration API +parent: Configuration APIs +grand_parent: Security APIs +nav_order: 40 +--- + +# Get Security Configuration API +**Introduced 1.0** +{: .label .label-purple } + +The Get Security Configuration API retrieves the current security configuration. This configuration includes authentication domains and other security-related configurations. + + +## Endpoints +```json +GET /_plugins/_security/api/securityconfig +``` + + +## Example request + +```bash +GET /_plugins/_security/api/securityconfig +``` +{% include copy-curl.html %} + +## Example response + +```json +{ + "config": { + "dynamic": { + "authc": { + "basic_internal_auth_domain": { + "http_enabled": true, + "transport_enabled": true, + "order": 0, + "http_authenticator": { + "challenge": true, + "type": "basic", + "config": {} + }, + "authentication_backend": { + "type": "internal", + "config": {} + } + } + }, + "authz": { + "roles_from_myldap": { + "http_enabled": true, + "transport_enabled": true, + "authorization_backend": { + "type": "ldap", + "config": { + "roles_search_filter": "(uniqueMember={0})", + "host": "ldap.example.com", + "port": 389 + } + } + } + }, + "multi_rolespan_enabled": true, + "hosts_resolver_mode": "ip-only", + "do_not_fail_on_forbidden": false + } + } +} +``` + +## Response body fields + +The response body is a JSON object with the following fields. 
+ +| Property | Data type | Description | +| :--- | :--- | :--- | +| `config` | Object | The root object containing the security configuration. | + +
+ + Response body fields: config + + {: .text-delta} + +`config` is a JSON object that contains the following fields. + +| Property | Data type | Description | +| :--- | :--- | :--- | +| `dynamic` | Object | The main configuration object containing all security configuration settings. Includes authentication domains (`authc`), authorization settings (`authz`), and various security behaviors. | + +
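For example, to list only the names of the configured authentication domains from this response, you can pipe the output through `jq`. The following is a minimal sketch that assumes `curl` and `jq` are available, that the cluster is reachable at `https://localhost:9200`, and that you replace the placeholder admin credentials; it is not part of the API itself:

```bash
# Retrieve the security configuration and print only the authentication domain names.
curl -sk -u admin:<custom-admin-password> \
  "https://localhost:9200/_plugins/_security/api/securityconfig" \
  | jq -r '.config.dynamic.authc | keys[]'
```
{% include copy.html %}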
+ +## Usage notes + +The Get Configuration API provides a way to inspect the current security configuration. When using the API, remember the following usage notes: + +- **Read-only operation**: This API only retrieves the configuration and does not modify it. + +- **Access control**: Access to this API should be restricted to administrators because the configuration contains sensitive information about your security setup. + +## Security considerations + +The security configuration contains sensitive information about your authentication mechanisms, LDAP settings, and security policies. Consider the following security best practices: + +- Be cautious about storing or logging the output from this API, as it may contain sensitive configuration details. +- Use HTTPS/TLS when interacting with this API to prevent information disclosure. + +## Permissions + +Any users with roles defined in `plugins.security.restapi.roles_enabled: ["all_access", "security_rest_api_access"]` have access to this API. \ No newline at end of file diff --git a/_api-reference/security/configuration/index.md b/_api-reference/security/configuration/index.md new file mode 100644 index 00000000000..6e662b9a238 --- /dev/null +++ b/_api-reference/security/configuration/index.md @@ -0,0 +1,166 @@ +--- +layout: default +title: Configuration APIs +parent: Security APIs +has_children: true +nav_order: 20 +--- + +# Configuration APIs +**Introduced 1.0** +{: .label .label-purple } + +The Configuration APIs provide programmatic access for managing, validating, and upgrading Security plugin configuration components. These APIs help ensure that your security settings remain compatible and effective as your OpenSearch cluster evolves. + +Security configurations may require updates in the following scenarios: + +- Upgrading to a new version of OpenSearch or the Security plugin +- Enabling new features that require updated settings +- Migrating configurations between environments +- Troubleshooting security issues + +## When to use + +Use the Configuration APIs to perform the following actions: + +- Identify outdated or incompatible configuration components. +- Perform automatic upgrades to maintain compatibility. +- Validate the structure and integrity of your security configuration. +- Manage versioning of security settings. + +## Available APIs + +- [Upgrade Check API]({{site.url}}{{site.baseurl}}/api-reference/security/configuration/upgrade-check/): Checks your current configuration for compatibility with your OpenSearch version and identifies components that need to be upgraded. + +- [Upgrade Perform API]({{site.url}}{{site.baseurl}}/api-reference/security/configuration/upgrade-perform/): Applies updates to the security configuration, based on the results of the Upgrade Check API. + +- [Update Security Configuration API]({{site.url}}{{site.baseurl}}/api-reference/security/configuration/update-configuration/): Creates or updates the security configuration. + +- [Patch Security Configuration API]({{site.url}}{{site.baseurl}}/api-reference/security/configuration/patch-configuration/): Updates specific fields of the security configuration without replacing the entire configuration document. + +- [Get Security Configuration API]({{site.url}}{{site.baseurl}}/api-reference/security/configuration/get-configuration/): Retrieves the current security configuration. + +## `authc` + +When configuring authentication domains (`authc`), you define how OpenSearch extracts user information and backend roles from the authentication response. 
This is especially important when integrating with external systems such as SAML, OIDC, or custom authentication backends. + +To support role mapping, use the following configuration keys: + +- `subject_key`: Specifies where to find the user identifier in the authentication response +- `roles_key`: Indicates where to find the backend roles in the authentication response + +OpenSearch uses the extracted backend roles in role mappings to assign roles to users. + +The following example shows how to configure an authentication domain to extract the username from `"preferred_username"` and backend roles from `"groups"` in a JWT token: + +```json +{ + "authc": { + "oidc_auth_domain": { + "http_enabled": true, + "transport_enabled": false, + "order": 1, + "http_authenticator": { + "type": "openid", + "challenge": false, + "config": { + "subject_key": "preferred_username", + "roles_key": "groups", + "openid_connect_url": "https://identity.example.com/.well-known/openid-configuration" + } + }, + "authentication_backend": { + "type": "noop", + "config": {} + } + } + } +} +``` +{% include copy.html %} + +You can then use the extracted backend roles in role mappings. The following configuration assigns the `analyst_role` to users whose authentication response includes either `analyst_group` or `data_scientist_group`: + +```json +{ + "role_mappings": { + "analyst_role": { + "backend_roles": ["analyst_group", "data_scientist_group"] + } + } +} +``` +{% include copy.html %} + + +## `authz` + +The `authz` section handles authorization by retrieving backend roles from external sources such as LDAP. This allows OpenSearch to authenticate users through one method (for example, basic authentication or SAML) and authorize them based on role information stored in a separate directory. + +A typical `authz` configuration includes the following elements: + +- `roles_search_filter`: The LDAP search filter used to find roles for a user +- `rolebase`: The Distinguished Name (DN) to search for roles +- `rolesearch`: The search pattern to use when looking for roles +- `rolename`: The attribute that contains the role name + +This setup is useful in enterprise environments where identities are managed in one system and roles in another. + +The following example connects to an LDAP directory, uses the `rolesearch` filter to find user groups, and extracts each group as a backend role using the `rolename` attribute: + +```json +{ + "authz": { + "ldap_role_authz": { + "http_enabled": true, + "transport_enabled": true, + "authorization_backend": { + "type": "ldap", + "config": { + "rolebase": "ou=groups,dc=example,dc=com", + "rolesearch": "(uniqueMember={0})", + "rolename": "cn", + "userbase": "ou=people,dc=example,dc=com", + "usersearch": "(uid={0})", + "username_attribute": "uid" + } + } + } + } +} +``` +{% include copy.html %} + + +The following example shows how to map an LDAP group to an OpenSearch role. 
If a user belongs to the LDAP group `cn=analysts,ou=groups,dc=example,dc=com`, the backend role `analysts` is extracted and mapped to the `data_access_role`: + +```json +{ + "role_mappings": { + "data_access_role": { + "backend_roles": ["analysts", "researchers"] + } + } +} +``` + +## Configuration components + +These APIs manage the following configuration components: + +- **Roles**: Permissions for actions users can perform +- **Role mappings**: Mappings of users or backend roles to specific roles +- **Action groups**: Collections of permissions used to simplify role definitions +- **Internal users**: User credentials stored directly in OpenSearch +- **Tenants**: Isolated workspaces that support multi-tenancy +- **Security configuration**: Global security settings + +## Best practices + +When using the Configuration APIs, remember the following best practices: + +- Always back up your security configuration before making changes. +- Run the Upgrade Check API before using the Upgrade Perform API. +- Test changes in a non-production environment before deploying to production. +- Integrate these APIs into your regular upgrade and maintenance workflows. +- Validate functionality after applying configuration changes. \ No newline at end of file diff --git a/_api-reference/security/configuration/patch-configuration.md b/_api-reference/security/configuration/patch-configuration.md new file mode 100644 index 00000000000..0f0e6b48205 --- /dev/null +++ b/_api-reference/security/configuration/patch-configuration.md @@ -0,0 +1,144 @@ +--- +layout: default +title: Patch Configuration API +parent: Configuration APIs +grand_parent: Security APIs +nav_order: 50 +--- + +# Patch Configuration API +**Introduced 1.0** +{: .label .label-purple } + +The Patch Configuration API allows you to update specific parts of the Security plugin configuration without replacing the entire configuration document. + +This operation can easily break your existing security configuration. We strongly recommend using the `securityadmin.sh` script instead, which includes validations and safeguards to prevent misconfiguration. +{: .warning} + + +## Endpoints +```json +PATCH /_plugins/_security/api/securityconfig +``` + + +## Request body fields + +The request body is **required**. It is a **JSON array** of patch operation objects. Each object has the following fields. + +| Property | Required | Data type | Description | +| :--- | :--- | :--- | :--- | +| `op` | **Required** | String | The operation to perform. Valid values are `add`, `remove`, `replace`, `move`, `copy`, and `test`. | +| `path` | **Required** | String | The JSON pointer path to the location in the configuration to modify. | +| `value` | Optional | Object | The value to use for the operation. Required for `add`, `replace`, and `test` operations. 
| + +## Example request + +The following example adds a new authentication domain and modifies an existing setting: + +```json +PATCH /_plugins/_security/api/securityconfig +[ + { + "op": "add", + "path": "/config/dynamic/authc/saml_auth_domain", + "value": { + "http_enabled": true, + "transport_enabled": false, + "order": 1, + "http_authenticator": { + "type": "saml", + "challenge": false, + "config": { + "idp": { + "metadata_url": "https://idp.example.com/saml/metadata" + }, + "sp": { + "entity_id": "opensearch" + } + } + }, + "authentication_backend": { + "type": "noop", + "config": {} + } + } + }, + { + "op": "replace", + "path": "/config/dynamic/multi_rolespan_enabled", + "value": true + }, + { + "op": "remove", + "path": "/config/dynamic/authc/legacy_auth_domain" + } +] +``` +{% include copy-curl.html %} + +## Example response + +```json +{ + "status": "OK", + "message": "Configuration updated." +} +``` + +## Response body fields + +The response body is a JSON object with the following fields. + +| Property | Data type | Description | +| :--- | :--- | :--- | +| `status` | String | The status of the request. A successful request returns "OK". | +| `message` | String | A message describing the result of the operation. | + +## JSON patch operations + +The API supports the following JSON patch operations: + +- **add**: Adds a value to an object or inserts it into an array. For existing properties, the value is replaced. +- **remove**: Removes a value from an object or array. +- **replace**: Replaces a value. +- **move**: Moves a value from one location to another. +- **copy**: Copies a value from one location to another. +- **test**: Tests that a value at the target location is equal to the specified value. + +## Usage notes + +The Patch Configuration API provides more granular control over configuration updates than the Update Configuration API but still comes with potential risks: + +- **Path format**: Paths start with `/config` followed by the JSON pointer path to the specific configuration element you want to modify. + +- **Validation**: Limited validation is performed on the patched configuration, which may lead to security vulnerabilities if misconfigured. + +- **Backup configuration**: Always back up your current security configuration before making changes. + +- **Testing**: Test configuration changes in a development environment before deploying them to production. + +## Enabling this API + +By default, this API is disabled for security reasons. To enable it, perform the following steps: + +1. Update the `opensearch.yml` file with the following: + + ``` + plugins.security.unsupported.restapi.allow_securityconfig_modification: true + ``` + {% include copy.html %} + +2. Update the Security plugin's `config.yml` file with the following: + + ``` + plugins.security.restapi.endpoints_disabled.securityconfig: "false" + ``` + {% include copy.html %} + +3. Restart your OpenSearch cluster. + +Due to the potential security implications, enabling this API is generally not recommended for production environments. 
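As an additional illustration of the `test` operation described above, the following sketch applies a `replace` only if the current value matches an expected value; if the `test` step fails, the whole patch is rejected. The endpoint, credentials, and values are assumptions for demonstration purposes, and the request requires that this API has been enabled as described in the preceding steps:

```bash
# Guard a configuration change with a "test" operation: the "replace" is applied
# only if multi_rolespan_enabled currently has the expected value of false.
curl -sk -u admin:<custom-admin-password> -X PATCH \
  "https://localhost:9200/_plugins/_security/api/securityconfig" \
  -H 'Content-Type: application/json' \
  -d '[
    { "op": "test", "path": "/config/dynamic/multi_rolespan_enabled", "value": false },
    { "op": "replace", "path": "/config/dynamic/multi_rolespan_enabled", "value": true }
  ]'
```
{% include copy.html %}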
\ No newline at end of file diff --git a/_api-reference/security/configuration/update-configuration.md b/_api-reference/security/configuration/update-configuration.md new file mode 100644 index 00000000000..523ad1a0a88 --- /dev/null +++ b/_api-reference/security/configuration/update-configuration.md @@ -0,0 +1,151 @@ +--- +layout: default +title: Update Security Configuration API +parent: Configuration APIs +grand_parent: Security APIs +nav_order: 30 +--- + +# Update Security Configuration API +**Introduced 1.0** +{: .label .label-purple } + +The Update Security Configuration API creates or updates the Security plugin's configuration directly through the REST API. This configuration manages core security settings, including authentication methods, authorization rules, and access controls. + +This operation can easily break your existing security configuration. We strongly recommend using the `securityadmin.sh` script instead, which includes validations and safeguards to prevent misconfiguration. +{: .warning} + + +## Endpoints +```json +PUT /_plugins/_security/api/securityconfig/config +``` + + +## Request body fields + +The request body is **required**. It is a JSON object with the following fields. + +| Property | Required | Data type | Description | +| :--- | :--- | :--- | :--- | +| `dynamic` | **Required** | Object | The main configuration object containing all security configuration settings. | + +
+ + Request body fields: dynamic + + {: .text-delta} + +`dynamic` is a JSON object with the following fields. + +| Property | Data type | Description | +| :--- | :--- | :--- | +| `auth_failure_listeners` | Object | The configuration for handling authentication failures, including thresholds and actions. | +| `authc` | Object | The authentication configuration domains that define how users are authenticated. For more information, see [authc]({{site.url}}{{site.baseurl}}/api-reference/security/configuration/index/#authc). | +| `authz` | Object | The authorization configuration that defines how to extract backend roles when using LDAP for authentication. For more information, see [authz]({{site.url}}{{site.baseurl}}/api-reference/security/configuration/index/#authz). | +| `do_not_fail_on_forbidden` | Boolean | When `true`, returns empty results instead of a forbidden error. Instead, failures are stored in the application logs. | +| `do_not_fail_on_forbidden_empty` | Boolean | Similar to `do_not_fail_on_forbidden` but with specific behavior for empty results. | +| `filtered_alias_mode` | String | Controls how document field filtering is applied to aliases. | +| `hosts_resolver_mode` | String | Determines how hostname resolution is performed for security operations. | +| `http` | Object | The HTTP-specific security configurations. | +| `on_behalf_of` | Object | Configures a temporary access token for the duration of a user's session (advanced). | +| `kibana` | Object | The configuration for OpenSearch Dashboards integration. | +| `respect_request_indices_options` | Boolean | When `true`, respects index options specified in requests. | + + +
+ + +## Example request + +The following example updates the security configuration to configure basic authentication and an internal user database: + +```json +PUT /_plugins/_security/api/securityconfig/config +{ + "dynamic": { + "filtered_alias_mode": "warn", + "disable_rest_auth": false, + "disable_intertransport_auth": false, + "respect_request_indices_options": false, + "opensearch-dashboards": { + "multitenancy_enabled": true, + "server_username": "kibanaserver", + "index": ".opensearch-dashboards" + }, + "http": { + "anonymous_auth_enabled": false + }, + "authc": { + "basic_internal_auth_domain": { + "http_enabled": true, + "transport_enabled": true, + "order": 0, + "http_authenticator": { + "challenge": true, + "type": "basic", + "config": {} + }, + "authentication_backend": { + "type": "intern", + "config": {} + }, + "description": "Authenticate via HTTP Basic against internal users database" + } + }, + "auth_failure_listeners": {}, + "do_not_fail_on_forbidden": false, + "multi_rolespan_enabled": true, + "hosts_resolver_mode": "ip-only", + "do_not_fail_on_forbidden_empty": false + } +} +``` +{% include copy-curl.html %} + +## Example response + +```json +{ + "status": "OK", + "message": "Configuration updated." +} +``` + +## Response body fields + +The response body is a JSON object with the following fields. + +| Property | Data type | Description | +| :--- | :--- | :--- | +| `status` | String | The status of the request. A successful request returns "OK". | +| `message` | String | A message describing the result of the operation. | + +## Usage notes + +The Update Configuration API allows you to directly modify the Security plugin's core configuration but comes with potential risks: + +- **Prefer `securityadmin.sh`**: In most cases, you should use the `securityadmin.sh` script instead, which includes validations and safeguards to prevent misconfiguration. + +- **Backup configuration**: Always back up your current security configuration before making changes. + +- **Access control**: Enable access to this API only for trusted administrators, as it can potentially disable the security configuration for your entire cluster. + +- **Testing**: Test the security configuration changes in a development environment before deploying them to production. + +- **Complete configuration**: You must provide a complete configuration when updating, as partial updates will replace the entire configuration. + +- **Validation**: This API has minimal validation, so incorrect configurations might not be identified until they cause operational issues. + +## Enabling this API + +By default, this API is disabled for security reasons. To enable it, you need to: + +1. Update the Security plugin's `config.yml` file. +2. Add the setting `plugins.security.restapi.endpoints_disabled.securityconfig: "false"`. +3. Restart your OpenSearch cluster. + +Due to the potential security implications, enabling this API is generally not recommended for production environments. 
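If you do enable and use this API, consider saving a copy of the current configuration first so that you can restore it if an update causes problems. The following is a minimal sketch that assumes `curl` is available, that the cluster is reachable at `https://localhost:9200`, and that you replace the placeholder admin credentials:

```bash
# Save the current security configuration to a timestamped local file before replacing it.
curl -sk -u admin:<custom-admin-password> \
  "https://localhost:9200/_plugins/_security/api/securityconfig" \
  -o "securityconfig-backup-$(date +%Y%m%d%H%M%S).json"
```
{% include copy.html %}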
\ No newline at end of file diff --git a/_api-reference/security/configuration/upgrade-check.md b/_api-reference/security/configuration/upgrade-check.md new file mode 100644 index 00000000000..9ad11733023 --- /dev/null +++ b/_api-reference/security/configuration/upgrade-check.md @@ -0,0 +1,79 @@ +--- +layout: default +title: Upgrade Check API +parent: Configuration APIs +grand_parent: Security APIs +nav_order: 10 +--- + +# Upgrade Check API +**Introduced 1.0** +{: .label .label-purple } + +The Upgrade Check API allows you to check whether your Security plugin configuration requires any upgrades. This is particularly useful after upgrading OpenSearch to a new version because it helps identify any security configuration components that need to be updated to maintain compatibility or take advantage of new features. + + +## Endpoints +```json +GET /_plugins/_security/api/_upgrade_check +``` + + + +## Example request + +```bash +GET /_plugins/_security/api/_upgrade_check +``` +{% include copy-curl.html %} + +## Example response + +The following example response shows that upgrades are available for some components: + +```json +{ + "status": "OK", + "upgradeAvailable": true, + "upgradeActions": { + "roles": ["update_required"], + "rolesmapping": [], + "actiongroups": ["no_update_required"], + "config": ["update_required"], + "internalusers": ["no_update_required"], + "tenants": [] + } +} +``` + +If no upgrades are available, the response will appear similar to the following: + +```json +{ + "status": "OK", + "upgradeAvailable": false, + "upgradeActions": {} +} +``` + +## Response body fields + +The response body is a JSON object with the following fields. + +| Property | Data type | Description | +| :--- | :--- | :--- | +| `status` | String | The status of the request. A successful request returns "OK". | +| `upgradeAvailable` | Boolean | Indicates whether any configuration components need to be upgraded. | +| `upgradeActions` | Object | A detailed breakdown of which configuration components need to be upgraded. The object contains arrays for each component type (`roles`, `rolesmapping`, `actiongroups`, `config`, `internalusers`, `tenants`) with upgrade status indicators. | + +## Usage notes + +When managing security configurations across OpenSearch upgrades, it's important to understand how to interpret and act upon the Upgrade Check API results. The following notes provide guidance on how to use this API: + +- Running this API does not make any changes to your configuration; it only checks for potential upgrades. +- After identifying necessary upgrades using this API, you can use the appropriate Configuration APIs to implement the required changes. +- We recommend running this check after every OpenSearch version upgrade. +- You may need administrator privileges to use this API. diff --git a/_api-reference/security/configuration/upgrade-perform.md b/_api-reference/security/configuration/upgrade-perform.md new file mode 100644 index 00000000000..7d14135de46 --- /dev/null +++ b/_api-reference/security/configuration/upgrade-perform.md @@ -0,0 +1,94 @@ +--- +layout: default +title: Upgrade Perform API +parent: Configuration APIs +grand_parent: Security APIs +nav_order: 20 +--- + +# Upgrade Perform API +**Introduced 1.0** +{: .label .label-purple } + +The Upgrade Perform API allows you to upgrade your Security plugin configuration components. 
This API is typically used after identifying necessary upgrades with the [Upgrade Check API]({{site.url}}{{site.baseurl}}/api-reference/security/configuration/upgrade-check/). It updates your configuration components to ensure compatibility with the current version of the Security plugin. + + +## Endpoints +```json +POST /_plugins/_security/api/_upgrade_perform +``` + + +## Request body fields + +The request body is optional. It is a JSON object with the following fields. + +| Property | Data type | Description | +| :--- | :--- | :--- | +| `config` | Array of Strings | A list of specific configuration components to upgrade. If omitted, all components requiring upgrades will be processed. Valid values include `roles`, `rolesmapping`, `actiongroups`, `config`, `internalusers`, and `tenants`. | + +## Example request + +The following example request performs upgrades on only the `roles` and `config` components: + +```json +POST /_plugins/_security/api/_upgrade_perform +{ + "config": ["roles", "config"] +} +``` +{% include copy-curl.html %} + +To upgrade all components requiring it, you can omit the request body. + +## Example response + +The response includes information about which components were upgraded and the specific changes that were made: + +```json +{ + "status": "OK", + "upgrades": { + "roles": [ + "Added permissions for dashboard features to admin role", + "Updated cluster monitor permissions" + ], + "config": [ + "Updated authentication configuration", + "Added new security settings" + ] + } +} +``` + +If no components require upgrades, you'll receive a response similar to the following: + +```json +{ + "status": "OK", + "upgrades": {} +} +``` + +## Response body fields + +The response body is a JSON object with the following fields. + +| Property | Data type | Description | +| :--- | :--- | :--- | +| `status` | String | The status of the upgrade operation. A successful operation returns "OK". | +| `upgrades` | Object | A detailed breakdown of the upgrades performed. Each key represents a configuration component that was upgraded, with an array of string descriptions detailing the specific changes made. | + +## Usage notes + +Consider the following important points when using this API: + +- Before performing upgrades, we recommend first running the Upgrade Check API to identify which components need to be upgraded. +- Always back up your security configuration before performing upgrades. +- You must have administrator privileges to use this API. +- This API makes actual changes to your configuration, unlike the Upgrade Check API, which only identifies required changes. +- For clusters in production environments, consider first testing the upgrade process in a staging environment. +- After performing upgrades, verify that your security settings still work as expected. \ No newline at end of file diff --git a/_api-reference/security/index.md b/_api-reference/security/index.md new file mode 100644 index 00000000000..b420e023fc8 --- /dev/null +++ b/_api-reference/security/index.md @@ -0,0 +1,53 @@ +--- +layout: default +title: Security APIs +nav_order: 77 +has_children: true +redirect_from: + - /api-reference/security-api/ +--- + +# Security APIs + +The Security plugin provides numerous REST APIs for managing its resources. These APIs are similar to the OpenSearch REST APIs and consist of HTTP requests that include the resource path, HTTP method (GET, PUT, POST, DELETE), request body, and output response fields. 
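The check-then-perform workflow described in these notes can be scripted so that the upgrade runs only when the Upgrade Check API reports that one is available. The following is a minimal sketch that assumes `curl` and `jq` are available, that the cluster is reachable at `https://localhost:9200`, and that you replace the placeholder admin credentials:

```bash
#!/usr/bin/env bash
# Perform the security configuration upgrade only if the Upgrade Check API
# reports that an upgrade is available.
set -euo pipefail

auth="admin:<custom-admin-password>"
base="https://localhost:9200/_plugins/_security/api"

available=$(curl -sk -u "$auth" "$base/_upgrade_check" | jq -r '.upgradeAvailable')

if [ "$available" = "true" ]; then
  curl -sk -u "$auth" -X POST "$base/_upgrade_perform"
else
  echo "No security configuration upgrades are required."
fi
```
{% include copy.html %}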
+ +All Security APIs use the base path of `_plugins/_security/` followed by the specific path for each operation. For example, the path for the Upgrade Perform API would be `/_plugins/_security/api/_upgrade_perform`. + +Many Security API operations are available through both REST API calls and OpenSearch Dashboards settings. This documentation focuses on the REST APIs, which offer the most flexibility for programmatic access and automation. + +## API format + +Most Security APIs follow a consistent format: + +``` + _plugins/_security/ +{ + "request": "body" +} +``` + +For most resource types, the APIs support standard CRUD operations: + +- **Create**: PUT or POST with a request body containing the resource definition +- **Read**: GET to retrieve the current configuration +- **Update**: PUT with a request body containing the new configuration +- **Delete**: DELETE to remove a specific resource + + +## Authentication + +Most Security API calls require HTTP basic authentication with admin credentials. Make sure to include appropriate authentication headers in your requests. + +The following example shows a basic HTTP authentication call: + +```bash +curl -XGET https://localhost:9200/_plugins/_security/api/roles/ -u admin:admin -k +``` + +## Demo configuration + +The Security plugin ships with a default demo configuration for testing purposes. This configuration should not be used in a production environment. For production deployments, you should generate secure credentials and certificates. + +## Next steps + +For more information about the Security plugin and security best practices, see the [security documentation]({{site.url}}{{site.baseurl}}/security/). \ No newline at end of file diff --git a/_api-reference/snapshots/cleanup-snapshot-repository.md b/_api-reference/snapshots/cleanup-snapshot-repository.md index 18e8f35f812..202410c758a 100644 --- a/_api-reference/snapshots/cleanup-snapshot-repository.md +++ b/_api-reference/snapshots/cleanup-snapshot-repository.md @@ -11,7 +11,7 @@ Introduced 1.0 The Cleanup Snapshot Repository API clears a snapshot repository of data no longer referenced by any existing snapshot. -## Path and HTTP methods +## Endpoints ```json POST /_snapshot//_cleanup diff --git a/_api-reference/snapshots/clone-snapshot.md b/_api-reference/snapshots/clone-snapshot.md new file mode 100644 index 00000000000..d220eb266bf --- /dev/null +++ b/_api-reference/snapshots/clone-snapshot.md @@ -0,0 +1,80 @@ +--- +layout: default +title: Clone snapshot +parent: Snapshot APIs +nav_order: 10 +--- + +# Clone snapshot +Introduced 1.0 +{: .label .label-purple } + +Creates a clone of all or part of a snapshot in the same repository as the original. + + + +## Endpoints +```json +PUT /_snapshot/{repository}/{snapshot}/_clone/{target_snapshot} +``` + + + +## Path parameters + +The following table lists the available path parameters. + +| Parameter | Required | Data type | Description | +| :--- | :--- | :--- | :--- | +| `repository` | **Required** | String | The name of repository which will contain the snapshots clone. | +| `snapshot` | **Required** | String | The name of the original snapshot. | +| `target_snapshot` | **Required** | String | The name of the cloned snapshot. | + + + + + +## Query parameters + +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `cluster_manager_timeout` | String | The amount of time to wait for a response from the cluster manager node. 
For more information about supported time units, see [Common parameters]({{site.url}}{{site.baseurl}}/api-reference/common-parameters/#time-units). | + + + +## Example request + +The following request clones indexes `index_a` and `index_b` from `my_snapshot`, a snapshot located in the snapshot repository `my-opensearch-repo`, into a new snapshot in the same repository called `my_new_snapshot`: + +```json +PUT /_snapshot/my-opensearch-repo/my_snapshot/_clone/my_new_snapshot +{ + "indices" : "index_a,index_b" +} +``` +{% include copy-curl.html %} + + +## Example response + +The successful creation of a snapshot clone returns the following response: + +```json +{ + "acknowledged" : true +} +``` + diff --git a/_api-reference/snapshots/create-repository.md b/_api-reference/snapshots/create-repository.md index 40a35973e89..10c0aef4d49 100644 --- a/_api-reference/snapshots/create-repository.md +++ b/_api-reference/snapshots/create-repository.md @@ -19,7 +19,7 @@ There are two types of snapshot repositories: For instructions on creating a repository, see [Register repository]({{site.url}}{{site.baseurl}}/opensearch/snapshots/snapshot-restore#register-repository). -## Path and HTTP methods +## Endpoints ```json POST /_snapshot// @@ -62,34 +62,41 @@ Request field | Description `compress` | Whether to compress metadata files. This setting does not affect data files, which might already be compressed, depending on your index settings. Default is `false`. Optional. `max_restore_bytes_per_sec` | The maximum rate at which snapshots restore. Default is 40 MB per second (`40m`). Optional. `max_snapshot_bytes_per_sec` | The maximum rate at which snapshots take. Default is 40 MB per second (`40m`). Optional. -`remote_store_index_shallow_copy` | Boolean | Determines whether the snapshot of the remote store indexes are captured as a shallow copy. Default is `false`. -`shallow_snapshot_v2` | Boolean | Determines whether the snapshots of the remote store indexes are captured as a [shallow copy v2]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/remote-store/snapshot-interoperability/#shallow-snapshot-v2). Default is `false`. +`remote_store_index_shallow_copy` | Determines whether the snapshot of the remote store indexes are captured as a shallow copy. Default is `false`. +`shallow_snapshot_v2` | Determines whether the snapshots of the remote store indexes are captured as a [shallow copy v2]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/remote-store/snapshot-interoperability/#shallow-snapshot-v2). Default is `false`. `readonly` | Whether the repository is read-only. Useful when migrating from one cluster (`"readonly": false` when registering) to another cluster (`"readonly": true` when registering). Optional. -#### s3 repository - -Request field | Description -:--- | :--- -`base_path` | The path within the bucket in which you want to store snapshots (for example, `my/snapshot/directory`). Optional. If not specified, snapshots are stored in the S3 bucket root. -`bucket` | Name of the S3 bucket. Required. -`buffer_size` | The threshold beyond which chunks (of `chunk_size`) should be broken into pieces (of `buffer_size`) and sent to S3 using a different API. Default is the smaller of two values: 100 MB or 5% of the Java heap. Valid values are between `5mb` and `5gb`. We don't recommend changing this option.
-`canned_acl` | S3 has several [canned ACLs](https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl) that the `repository-s3` plugin can add to objects as it creates them in S3. Default is `private`. Optional. -`chunk_size` | Breaks files into chunks during snapshot operations (e.g. `64mb`, `1gb`), which is important for cloud storage providers and far less important for shared file systems. Default is `1gb`. Optional. -`client` | When specifying client settings (e.g. `s3.client.default.access_key`), you can use a string other than `default` (e.g. `s3.client.backup-role.access_key`). If you used an alternate name, change this value to match. Default and recommended value is `default`. Optional. -`compress` | Whether to compress metadata files. This setting does not affect data files, which might already be compressed, depending on your index settings. Default is `false`. Optional. -`disable_chunked_encoding` | Disables chunked encoding for compatibility with some storage services. Default is `false`. Optional. -`max_restore_bytes_per_sec` | The maximum rate at which snapshots restore. Default is 40 MB per second (`40m`). Optional. -`max_snapshot_bytes_per_sec` | The maximum rate at which snapshots take. Default is 40 MB per second (`40m`). Optional. -`readonly` | Whether the repository is read-only. Useful when migrating from one cluster (`"readonly": false` when registering) to another cluster (`"readonly": true` when registering). Optional. -`remote_store_index_shallow_copy` | Boolean | Whether the snapshot of the remote store indexes is captured as a shallow copy. Default is `false`. -`shallow_snapshot_v2` | Boolean | Determines whether the snapshots of the remote store indexes are captured as a [shallow copy v2]([shallow copy v2]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/remote-store/snapshot-interoperability/#shallow-snapshot-v2). Default is `false`. -`server_side_encryption` | Whether to encrypt snapshot files in the S3 bucket. This setting uses AES-256 with S3-managed keys. See [Protecting data using server-side encryption](https://docs.aws.amazon.com/AmazonS3/latest/dev/serv-side-encryption.html). Default is `false`. Optional. -`storage_class` | Specifies the [S3 storage class](https://docs.aws.amazon.com/AmazonS3/latest/dev/storage-class-intro.html) for the snapshots files. Default is `standard`. Do not use the `glacier` and `deep_archive` storage classes. Optional. +### s3 repository + +| Request field | Description | +|:--------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `base_path` | The path within the bucket in which you want to store snapshots (for example, `my/snapshot/directory`). Optional. If not specified, snapshots are stored in the S3 bucket root. | +| `bucket` | Name of the S3 bucket. Required. | +| `buffer_size` | The threshold beyond which chunks (of `chunk_size`) should be broken into pieces (of `buffer_size`) and sent to S3 using a different API. Default is the smaller of two values: 100 MB or 5% of the Java heap. Valid values are between `5mb` and `5gb`. 
We don't recommend changing this option. | +| `canned_acl` | S3 has several [canned ACLs](https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl) that the `repository-s3` plugin can add to objects as it creates them in S3. Default is `private`. Optional. | +| `chunk_size` | Breaks files into chunks during snapshot operations (for example, `64mb`, `1gb`), which is important for cloud storage providers and far less important for shared file systems. Default is `1gb`. Optional. | +| `client` | When specifying client settings (for example, `s3.client.default.access_key`), you can use a string other than `default` (for example, `s3.client.backup-role.access_key`). If you used an alternate name, change this value to match. Default and recommended value is `default`. Optional. | +| `compress` | Whether to compress metadata files. This setting does not affect data files, which might already be compressed, depending on your index settings. Default is `false`. Optional. | +| `disable_chunked_encoding` | Disables chunked encoding for compatibility with some storage services. Default is `false`. Optional. | +| `max_restore_bytes_per_sec` | The maximum rate at which snapshots restore. Default is 40 MB per second (`40m`). Optional. | +| `max_snapshot_bytes_per_sec` | The maximum rate at which snapshots are taken. Default is 40 MB per second (`40m`). Optional. | +| `readonly` | Whether the repository is read-only. Useful when migrating from one cluster (`"readonly": false` when registering) to another cluster (`"readonly": true` when registering). Optional. | +| `remote_store_index_shallow_copy` | Determines whether the snapshot of the remote store indexes is captured as a shallow copy. Default is `false`. | +| `shallow_snapshot_v2` | Determines whether the snapshots of the remote store indexes are captured as a [shallow copy v2]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/remote-store/snapshot-interoperability/#shallow-snapshot-v2). Default is `false`. | +| `storage_class` | Specifies the [S3 storage class](https://docs.aws.amazon.com/AmazonS3/latest/dev/storage-class-intro.html) for the snapshot files. Default is `standard`. Do not use the `glacier` and `deep_archive` storage classes. Optional. | +| `server_side_encryption_type` | Specifies the S3 server-side encryption type. Supported values are `AES256` ([SSE-S3](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingServerSideEncryption.html)), `aws:kms` ([SSE-KMS](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingKMSEncryption.html)), and `bucket_default` ([bucket default encryption](https://docs.aws.amazon.com/AmazonS3/latest/userguide/bucket-encryption.html)). Default is `bucket_default`. | +| `server_side_encryption_kms_key_id` | Specifies the AWS Key Management Service (AWS KMS) key to be used if [S3 SSE-KMS](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingKMSEncryption.html) is selected by setting the `aws:kms` encryption type. Required if `aws:kms` is set as the `server_side_encryption_type`. | +| `server_side_encryption_bucket_key_enabled` | Specifies whether [S3 Bucket Keys](https://docs.aws.amazon.com/AmazonS3/latest/userguide/bucket-key.html) should be used when using [S3 SSE-KMS](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingKMSEncryption.html). Optional. 
| +| `server_side_encryption_encryption_context` | Specifies any additional [encryption context](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingKMSEncryption.html#encryption-context) that should be used when using [S3 SSE-KMS](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingKMSEncryption.html). This setting value must be formatted as a JSON object. Optional. | +| `expected_bucket_owner` | Specifies the AWS account ID of the expected S3 bucket owner. This setting can be used for [verifying bucket ownership](https://docs.aws.amazon.com/AmazonS3/latest/userguide/bucket-owner-condition.html). Optional. | For the `base_path` parameter, do not enter the `s3://` prefix when entering your S3 bucket details. Only the name of the bucket is required. {: .note} +The `server_side_encryption` setting is removed as of OpenSearch 3.1.0. S3 applies server-side encryption as the base level of encryption for all S3 buckets. Because this cannot be disabled, this repository setting had no effect. For more information, see [Protecting data with server-side encryption](https://docs.aws.amazon.com/AmazonS3/latest/userguide/serv-side-encryption.html). +{: .note} + ## Example requests ### `fs` @@ -124,6 +131,26 @@ PUT /_snapshot/my-opensearch-repo ``` {% include copy-curl.html %} + +The following request registers a new S3 repository called `my-opensearch-repo` in an existing bucket called `my-open-search-bucket`. By default, all snapshots are stored in `my/snapshot/directory`. Additionally, this repository is configured to use [SSE-KMS](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingKMSEncryption.html#encryption-context), and the expected bucket owner AWS account ID is `123456789000`. + +```json +PUT /_snapshot/my-opensearch-repo +{ + "type": "s3", + "settings": { + "bucket": "my-open-search-bucket", + "base_path": "my/snapshot/directory", + "server_side_encryption_type": "aws:kms", + "server_side_encryption_kms_key_id": "arn:aws:kms:us-east-1:123456789000:key/kms-key-id", + "server_side_encryption_encryption_context": "{\"additional-enc-ctx\": \"sample-context\"}", + "expected_bucket_owner": "123456789000" + } +} +``` +{% include copy-curl.html %} + ## Example response Upon success, the following JSON object is returned: diff --git a/_api-reference/snapshots/create-snapshot.md b/_api-reference/snapshots/create-snapshot.md index 45e5a28b553..ce96bd6442f 100644 --- a/_api-reference/snapshots/create-snapshot.md +++ b/_api-reference/snapshots/create-snapshot.md @@ -15,7 +15,7 @@ Creates a snapshot within an existing repository. * To view a list of your repositories, see [Get snapshot repository]({{site.url}}{{site.baseurl}}/api-reference/snapshots/get-snapshot-repository). -## Path and HTTP methods +## Endpoints ```json PUT /_snapshot// diff --git a/_api-reference/snapshots/delete-snapshot-repository.md b/_api-reference/snapshots/delete-snapshot-repository.md index 2649c3c90dc..14c2b6c86a8 100644 --- a/_api-reference/snapshots/delete-snapshot-repository.md +++ b/_api-reference/snapshots/delete-snapshot-repository.md @@ -15,7 +15,7 @@ A repository in OpenSearch is simply a configuration that maps a repository name To learn more about repositories, see [Register or update snapshot repository]({{site.url}}{{site.baseurl}}/api-reference/snapshots/create-repository). 
-## Path and HTTP methods +## Endpoints ```json DELETE _snapshot/ diff --git a/_api-reference/snapshots/delete-snapshot.md b/_api-reference/snapshots/delete-snapshot.md index faed3b92d05..57cf5347fd4 100644 --- a/_api-reference/snapshots/delete-snapshot.md +++ b/_api-reference/snapshots/delete-snapshot.md @@ -11,6 +11,8 @@ nav_order: 7 Deletes a snapshot from a repository. +Deleting a snapshot that is in progress stops the snapshot operation and deletes the partially created snapshot. + * To learn more about snapshots, see [Snapshots]({{site.url}}{{site.baseurl}}/opensearch/snapshots/index). * To view a list of your repositories, see [cat repositories]({{site.url}}{{site.baseurl}}/api-reference/cat/cat-repositories). diff --git a/_api-reference/snapshots/get-snapshot-repository.md b/_api-reference/snapshots/get-snapshot-repository.md index 1098cd544a0..522ea0c6599 100644 --- a/_api-reference/snapshots/get-snapshot-repository.md +++ b/_api-reference/snapshots/get-snapshot-repository.md @@ -16,7 +16,7 @@ To learn more about repositories, see [Register repository]({{site.url}}{{site.b You can also get details about a snapshot during and after snapshot creation. See [Get snapshot status]({{site.url}}{{site.baseurl}}/api-reference/snapshots/get-snapshot-status/). {: .note} -## Path and HTTP methods +## Endpoints ```json GET /_snapshot/ diff --git a/_api-reference/snapshots/get-snapshot-status.md b/_api-reference/snapshots/get-snapshot-status.md index 8675c23886e..83043b7684f 100644 --- a/_api-reference/snapshots/get-snapshot-status.md +++ b/_api-reference/snapshots/get-snapshot-status.md @@ -16,7 +16,7 @@ To learn about snapshot creation, see [Create snapshot]({{site.url}}{{site.baseu If you use the Security plugin, you must have the `monitor_snapshot`, `create_snapshot`, or `manage cluster` privileges. {: .note} -## Path and HTTP methods +## Endpoints ```json GET _snapshot///_status diff --git a/_api-reference/snapshots/get-snapshot.md b/_api-reference/snapshots/get-snapshot.md index 148f9e8ff2f..c936cab6024 100644 --- a/_api-reference/snapshots/get-snapshot.md +++ b/_api-reference/snapshots/get-snapshot.md @@ -11,7 +11,7 @@ nav_order: 6 Retrieves information about a snapshot. -## Path and HTTP methods +## Endpoints ```json GET _snapshot/// diff --git a/_api-reference/snapshots/restore-snapshot.md b/_api-reference/snapshots/restore-snapshot.md index 766604ee1d0..6df1397cc69 100644 --- a/_api-reference/snapshots/restore-snapshot.md +++ b/_api-reference/snapshots/restore-snapshot.md @@ -19,10 +19,10 @@ Restores a snapshot of a cluster or specified data streams and indices. If open indexes with the same name that you want to restore already exist in the cluster, you must close, delete, or rename the indexes. See [Example request](#example-request) for information about renaming an index. See [Close index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/close-index) for information about closing an index. {: .note} -## Path and HTTP methods +## Endpoints ```json -GET _snapshot/// +POST _snapshot///_restore ``` ## Path parameters @@ -51,8 +51,10 @@ All request body parameters are optional. | index_settings | String | A comma-delimited list of settings to add or change in all restored indices. Use this parameter to override index settings during snapshot restoration. For data streams, these index settings are applied to the restored backing indices. | | indices | String | A comma-delimited list of data streams and indices to restore from the snapshot. Multi-index syntax is supported. 
By default, a restore operation includes all data streams and indices in the snapshot. If this argument is provided, the restore operation only includes the data streams and indices that you specify. | | partial | Boolean | How the restore operation will behave if indices in the snapshot do not have all primary shards available. If `false`, the entire restore operation fails if any indices in the snapshot do not have all primary shards available.

If `true`, allows the restoration of a partial snapshot of indices with unavailable shards. Only shards that were successfully included in the snapshot are restored. All missing shards are recreated as empty. By default, the entire restore operation fails if one or more indices included in the snapshot do not have all primary shards available. To change this behavior, set `partial` to `true`. Defaults to `false`. | -| rename_pattern | String | The pattern to apply to restored data streams and indices. Data streams and indices matching the rename pattern will be renamed according to `rename_replacement`.

The rename pattern is applied as defined by the regular expression that supports referencing the original text.

The request fails if two or more data streams or indices are renamed into the same name. If you rename a restored data stream, its backing indices are also renamed. For example, if you rename the logs data stream to `recovered-logs`, the backing index `.ds-logs-1` is renamed to `.ds-recovered-logs-1`.

If you rename a restored stream, ensure an index template matches the new stream name. If there are no matching index template names, the stream cannot roll over and new backing indices are not created.| -| rename_replacement | String | The rename replacement string. See `rename_pattern` for more information.| +| rename_pattern | String | The pattern to apply to the restored data streams and indexes. Data streams and indexes matching the rename pattern will be renamed according to the `rename_replacement` setting.

The rename pattern is applied as defined by the regular expression that supports referencing the original text.

The request fails if two or more data streams or indexes are renamed to the same name. If you rename a restored data stream, its backing indexes are also renamed. For example, if you rename the logs data stream to `recovered-logs`, the backing index `.ds-logs-1` is renamed to `.ds-recovered-logs-1`.

If you rename a restored stream, ensure an index template matches the new stream name. If there are no matching index template names, the stream cannot roll over, and new backing indexes are not created.| +| rename_replacement | String | The rename replacement string.| +| rename_alias_pattern | String | The pattern to apply to the restored aliases. Aliases matching the rename pattern will be renamed according to the `rename_alias_replacement` setting.

The rename pattern is applied as defined by the regular expression that supports referencing the original text.

If two or more aliases are renamed to the same name, these aliases will be merged into one.| +| rename_alias_replacement | String | The rename replacement string for aliases.| | source_remote_store_repository | String | The name of the remote store repository of the source index being restored. If not provided, the Snapshot Restore API will use the repository that was registered when the snapshot was created. | wait_for_completion | Boolean | Whether to return a response after the restore operation has completed. If `false`, the request returns a response when the restore operation initializes. If `true`, the request returns a response when the restore operation completes. Defaults to `false`. | @@ -123,4 +125,4 @@ If open indices in a snapshot already exist in a cluster, and you don't delete, }, "status" : 500 } -```` \ No newline at end of file +```` diff --git a/_api-reference/snapshots/verify-snapshot-repository.md b/_api-reference/snapshots/verify-snapshot-repository.md index 67a006e709d..8827fee0c3f 100644 --- a/_api-reference/snapshots/verify-snapshot-repository.md +++ b/_api-reference/snapshots/verify-snapshot-repository.md @@ -17,7 +17,7 @@ If verification is successful, the verify snapshot repository API returns a list If you use the Security plugin, you must have the `manage cluster` privilege. {: .note} -## Path and HTTP methods +## Endpoints ```json GET _snapshot// diff --git a/_api-reference/tasks.md b/_api-reference/tasks.md deleted file mode 100644 index 477e720d22f..00000000000 --- a/_api-reference/tasks.md +++ /dev/null @@ -1,223 +0,0 @@ ---- -layout: default -title: Tasks -nav_order: 85 -redirect_from: - - /opensearch/rest-api/tasks/ ---- - -# Tasks -**Introduced 1.0** -{: .label .label-purple } - -A task is any operation you run in a cluster. For example, searching your data collection of books for a title or author name is a task. When you run OpenSearch, a task is automatically created to monitor your cluster's health and performance. For more information about all of the tasks currently executing in your cluster, you can use the `tasks` API operation. - -## Path and HTTP methods - -```json -GET _tasks -``` -{% include copy-curl.html %} - -By including a task ID, you can get information specific to a particular task. Note that a task ID consists of a node's identifying string and the task's numerical ID. For example, if your node's identifying string is `nodestring` and the task's numerical ID is `1234`, then your task ID is `nodestring:1234`. You can find this information by running the `tasks` operation: - -``` -GET _tasks/ -``` -{% include copy-curl.html %} - -Note that if a task finishes running, it won't be returned as part of your request. For an example of a task that takes a little longer to finish, you can run the [`_reindex`]({{site.url}}{{site.baseurl}}/opensearch/reindex-data) API operation on a larger document, and then run `tasks`. - -## Query parameters - -You can also use the following parameters with your query. - -Parameter | Data type | Description | -:--- | :--- | :--- -`nodes` | List | A comma-separated list of node IDs or names to limit the returned information. Use `_local` to return information from the node you're connecting to, specify the node name to get information from specific nodes, or keep the parameter empty to get information from all nodes. -`actions` | List | A comma-separated list of actions that should be returned. Keep empty to return all. -`detailed` | Boolean | Returns detailed task information. 
(Default: false) -`parent_task_id` | String | Returns tasks with a specified parent task ID (node_id:task_number). Keep empty or set to -1 to return all. -`wait_for_completion` | Boolean | Waits for the matching tasks to complete. (Default: false) -`group_by` | Enum | Groups tasks by parent/child relationships or nodes. (Default: nodes) -`timeout` | Time | An explicit operation timeout. (Default: 30 seconds) -`cluster_manager_timeout` | Time | The time to wait for a connection to the primary node. (Default: 30 seconds) - - -## Example requests - -### Return information about running tasks - -The following request returns tasks currently running on a node named `opensearch-node1`: - -```json -GET /_tasks?nodes=opensearch-node1 -``` -{% include copy-curl.html %} - -### Return information about active search tasks - -The following request returns detailed information about active search tasks: - -```bash -curl -XGET "localhost:9200/_tasks?actions=*search&detailed -``` -{% include copy.html %} - -## Example response - -The following example response shows information about running tasks: - -```json -{ - "nodes": { - "Mgqdm0r9SEGClWxp_RbnaQ": { - "name": "opensearch-node1", - "transport_address": "sample_address", - "host": "sample_host", - "ip": "sample_ip", - "roles": [ - "data", - "ingest", - "master", - "remote_cluster_client" - ], - "tasks": { - "Mgqdm0r9SEGClWxp_RbnaQ:24578": { - "node": "Mgqdm0r9SEGClWxp_RbnaQ", - "id": 24578, - "type": "transport", - "action": "cluster:monitor/tasks/lists", - "start_time_in_millis": 1611612517044, - "running_time_in_nanos": 638700, - "cancellable": false, - "headers": {} - }, - "Mgqdm0r9SEGClWxp_RbnaQ:24579": { - "node": "Mgqdm0r9SEGClWxp_RbnaQ", - "id": 24579, - "type": "direct", - "action": "cluster:monitor/tasks/lists[n]", - "start_time_in_millis": 1611612517044, - "running_time_in_nanos": 222200, - "cancellable": false, - "parent_task_id": "Mgqdm0r9SEGClWxp_RbnaQ:24578", - "headers": {} - } - } - } - } -} -``` - - -### The `resource_stats` object - -The `resource_stats` object is only updated for tasks that support resource tracking. These stats are computed based on scheduled thread executions, including both threads that have finished working on the task and threads currently working on the task. Because the same thread may be scheduled to work on the same task multiple times, each instance of a given thread being scheduled to work on a given task is considered to be a single thread execution. - -The following table lists all response fields in the `resource_stats` object. - -Response field | Description | -:--- | :--- | -`average` | The average resource usage across all scheduled thread executions. | -`total` | The sum of resource usages across all scheduled thread executions. | -`min` | The minimum resource usage across all scheduled thread executions. | -`max` | The maximum resource usage across all scheduled thread executions. | -`thread_info` | Thread-count-related stats.| -`thread_info.active_threads` | The number of threads currently working on the task. | -`thread_info.thread_executions` | The number of threads that have been scheduled to work on the task. | - -## Task canceling - -After getting a list of tasks, you can cancel all cancelable tasks with the following request: - -``` -POST _tasks/_cancel -``` -{% include copy-curl.html %} - -Note that not all tasks are cancelable. To see if a task is cancelable, refer to the `cancellable` field in the response to your `tasks` API request. 
- -You can also cancel a task by including a specific task ID. - -``` -POST _tasks//_cancel -``` -{% include copy-curl.html %} - -The `cancel` operation supports the same parameters as the `tasks` operation. The following example shows how to cancel all cancelable tasks on multiple nodes. - -``` -POST _tasks/_cancel?nodes=opensearch-node1,opensearch-node2 -``` -{% include copy-curl.html %} - -## Attaching headers to tasks - -To associate requests with tasks for better tracking, you can provide a `X-Opaque-Id:` header as part of the HTTPS request reader of your `curl` command. The API will attach the specified header in the returned result. - -Usage: - -```bash -curl -i -H "X-Opaque-Id: 111111" "https://localhost:9200/_tasks" -u 'admin:' --insecure -``` -{% include copy.html %} - -The `_tasks` operation returns the following result. - -```json -HTTP/1.1 200 OK -X-Opaque-Id: 111111 -content-type: application/json; charset=UTF-8 -content-length: 768 - -{ - "nodes": { - "Mgqdm0r9SEGClWxp_RbnaQ": { - "name": "opensearch-node1", - "transport_address": "172.18.0.4:9300", - "host": "172.18.0.4", - "ip": "172.18.0.4:9300", - "roles": [ - "data", - "ingest", - "master", - "remote_cluster_client" - ], - "tasks": { - "Mgqdm0r9SEGClWxp_RbnaQ:30072": { - "node": "Mgqdm0r9SEGClWxp_RbnaQ", - "id": 30072, - "type": "direct", - "action": "cluster:monitor/tasks/lists[n]", - "start_time_in_millis": 1613166701725, - "running_time_in_nanos": 245400, - "cancellable": false, - "parent_task_id": "Mgqdm0r9SEGClWxp_RbnaQ:30071", - "headers": { - "X-Opaque-Id": "111111" - } - }, - "Mgqdm0r9SEGClWxp_RbnaQ:30071": { - "node": "Mgqdm0r9SEGClWxp_RbnaQ", - "id": 30071, - "type": "transport", - "action": "cluster:monitor/tasks/lists", - "start_time_in_millis": 1613166701725, - "running_time_in_nanos": 658200, - "cancellable": false, - "headers": { - "X-Opaque-Id": "111111" - } - } - } - } - } -} -``` -This operation supports the same parameters as the `tasks` operation. The following example shows how you can associate `X-Opaque-Id` with specific tasks: - -```bash -curl -i -H "X-Opaque-Id: 123456" "https://localhost:9200/_tasks?nodes=opensearch-node1" -u 'admin:' --insecure -``` -{% include copy.html %} diff --git a/_api-reference/tasks/cancel-tasks.md b/_api-reference/tasks/cancel-tasks.md new file mode 100644 index 00000000000..08483645618 --- /dev/null +++ b/_api-reference/tasks/cancel-tasks.md @@ -0,0 +1,131 @@ +--- +layout: default +title: Cancel tasks +parent: Tasks API +nav_order: 30 +--- + +# Cancel tasks +**Introduced 1.0** +{: .label .label-purple } + +The Cancel Tasks API cancels a task, stopping it from running in the cluster. Not all tasks can be canceled. To determine whether a task is cancelable, check the `cancellable` field in the Cancel Tasks API response. + + + +## Endpoints +```json +POST /_tasks/_cancel +POST /_tasks/{task_id}/_cancel +``` + + + +## Path parameters + +The following table lists the available path parameters. All path parameters are optional. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `task_id` | String | The task ID. | + + + + +## Query parameters + +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `actions` | List or String | A comma-separated list of actions that should be returned. Keep empty to return all. | +| `nodes` | List | A comma-separated list of node IDs or names used to limit the returned information. 
Use `_local` to return information from the node you're connecting to, specify the node name to get information from a specific node, or keep the parameter empty to get information from all nodes. | +| `parent_task_id` | String | Returns tasks with a specified parent task ID (`node_id:task_number`). Keep empty or set to -1 to return all. | +| `wait_for_completion` | Boolean | Waits for the matching task to complete. When `true`, the request is blocked until the task has completed. _(Default: `false`)_ | + + + +## Example request + +The following request cancels any tasks currently running on `opensearch-node1` and `opensearch-node2`: + +``` +POST _tasks/_cancel?nodes=opensearch-node1,opensearch-node2 +``` +{% include copy-curl.html %} + +## Example response + +The following response shows that a bulk write and update task were canceled without a node failure and provides additional information about the canceled tasks: + +```json +{ + "node_failures": [], + "nodes": { + "JzrCxdtFTCO_RaINw8ckNA": { + "name": "opensearch-node1", + "transport_address": "127.0.0.1:9300", + "host": "127.0.0.1", + "ip": "127.0.0.1:9300", + "roles": [ + "data", + "ingest", + "cluster_manager", + "remote_cluster_client" + ], + "attributes": {}, + "tasks": { + "JzrCxdtFTCO_RaINw8ckNA:54": { + "node": "JzrCxdtFTCO_RaINw8ckNA", + "id": 54, + "type": "transport", + "action": "indices:data/write/bulk", + "status": "cancelled", + "description": "bulk request to [test_index]", + "start_time_in_millis": 1625145678901, + "running_time_in_nanos": 2345678, + "cancellable": true, + "cancelled": true + } + } + }, + "K8iyDdtGQCO_SbJNw9dkMB": { + "name": "opensearch-node2", + "transport_address": "127.0.0.1:9301", + "host": "127.0.0.1", + "ip": "127.0.0.1:9301", + "roles": [ + "data", + "ingest", + "master", + "remote_cluster_client" + ], + "attributes": {}, + "tasks": { + "K8iyDdtGQCO_SbJNw9dkMB:78": { + "node": "K8iyDdtGQCO_SbJNw9dkMB", + "id": 78, + "type": "transport", + "action": "indices:data/write/update", + "status": "cancelled", + "description": "updating document in [another_index]", + "start_time_in_millis": 1625145679012, + "running_time_in_nanos": 1234567, + "cancellable": true, + "cancelled": true + } + } + } + } +} +``` + diff --git a/_api-reference/tasks/get-tasks.md b/_api-reference/tasks/get-tasks.md new file mode 100644 index 00000000000..f42443191ac --- /dev/null +++ b/_api-reference/tasks/get-tasks.md @@ -0,0 +1,127 @@ +--- +layout: default +title: Get task +parent: Tasks API +nav_order: 20 +--- + +# Get task +**Introduced 1.0** +{: .label .label-purple } + +The Get Task API returns detailed information about a single task. + + +## Endpoints +```json +GET /_tasks/{task_id} +``` + + + +## Path parameters + +The following table lists the available path parameters. + +| Parameter | Required | Data type | Description | +| :--- | :--- | :--- | :--- | +| `task_id` | **Required** | String | The task ID. | + + + + +## Query parameters + +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | Default | +| :--- | :--- | :--- | :--- | +| `timeout` | String | The amount of time to wait for a response. | `30s` | +| `wait_for_completion` | Boolean | Waits for the matching task to complete. When `true`, the request is blocked until the task has completed. 
| `false` | + + + +## Example request + +The following request returns detailed information about active search tasks: + +```bash +curl -XGET "localhost:9200/_tasks?actions=*search&detailed +``` +{% include copy.html %} + +## Example response + +The following response returns detailed information about the `transport` task: + +```json +{ + "nodes": { + "JzrCxdtFTCO_RaINw8ckNA": { + "name": "node-1", + "transport_address": "127.0.0.1:9300", + "host": "127.0.0.1", + "ip": "127.0.0.1:9300", + "roles": [ + "data", + "ingest", + "cluster_manager", + "remote_cluster_client" + ], + "tasks": { + "JzrCxdtFTCO_RaINw8ckNA:54321": { + "node": "JzrCxdtFTCO_RaINw8ckNA", + "id": 54321, + "type": "transport", + "action": "indices:data/read/search", + "status": { + "total": 1000, + "created": 0, + "updated": 0, + "deleted": 0, + "batches": 1, + "version_conflicts": 0, + "noops": 0, + "retries": { + "bulk": 0, + "search": 0 + }, + "throttled_millis": 0, + "requests_per_second": -1.0, + "throttled_until_millis": 0 + }, + "description": "indices[test_index], types[_doc], search_type[QUERY_THEN_FETCH], source[{\"query\":{\"match_all\":{}}}]", + "start_time_in_millis": 1625145678901, + "running_time_in_nanos": 2345678, + "cancellable": true + } + } + } + } +} +``` + +### The `resource_stats` object + +The `resource_stats` object is only updated for tasks that support resource tracking. These statistics are computed based on scheduled thread executions, including both threads that have finished working on the task and threads currently working on the task. Because the same thread may be scheduled to work on the same task multiple times, each instance of a given thread being scheduled to work on a given task is considered to be a single thread execution. + +The following table lists all response fields in the `resource_stats` object. + +Response field | Description | +:--- | :--- | +`average` | The average resource usage across all scheduled thread executions. | +`total` | The total resource usage across all scheduled thread executions. | +`min` | The minimum resource usage across all scheduled thread executions. | +`max` | The maximum resource usage across all scheduled thread executions. | +`thread_info` | Thread-count-related statistics.| +`thread_info.active_threads` | The number of threads currently working on the task. | +`thread_info.thread_executions` | The number of threads that have been scheduled to work on the task. | \ No newline at end of file diff --git a/_api-reference/tasks/list-tasks.md b/_api-reference/tasks/list-tasks.md new file mode 100644 index 00000000000..c2ef9ec9522 --- /dev/null +++ b/_api-reference/tasks/list-tasks.md @@ -0,0 +1,113 @@ +--- +layout: default +title: List tasks +parent: Tasks API +nav_order: 10 +--- + +# List tasks +**Introduced 1.0** +{: .label .label-purple } + +The List Tasks API returns a list of tasks running in the cluster. + + +## Endpoints +```json +GET /_tasks +``` + + + +## Query parameters + +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | Default | +| :--- | :--- | :--- | :--- | +| `actions` | List or String | A comma-separated list of actions that should be returned. Keep empty to return all. | N/A | +| `detailed` | Boolean | When `true`, the response includes detailed information about shard recoveries. | `false` | +| `group_by` | String | Groups tasks by parent/child relationships or nodes.
Valid values are: `nodes`, `none`, and `parents`. | `nodes` | +| `nodes` | List | A comma-separated list of node IDs or names used to limit the returned information. Use `_local` to return information from the node you're connecting to, specify the node name to get information from a specific node, or keep the parameter empty to get information from all nodes. | N/A | +| `parent_task_id` | String | Returns tasks with a specified parent task ID (`node_id:task_number`). Keep empty or set to -1 to return all. | N/A | +| `timeout` | String | The amount of time to wait for a response. | N/A | +| `wait_for_completion` | Boolean | Waits for the matching task to complete. When `true`, the request is blocked until the task has completed. | `false` | + + + +## Example request + +The following request returns tasks currently running on a node named `opensearch-node1`: + +```json +GET /_tasks?nodes=opensearch-node1 +``` +{% include copy-curl.html %} + +## Example response + +The following response provides information about running tasks: + +```json +{ + "nodes": { + "Mgqdm0r9SEGClWxp_RbnaQ": { + "name": "opensearch-node1", + "transport_address": "sample_address", + "host": "sample_host", + "ip": "sample_ip", + "roles": [ + "data", + "ingest", + "master", + "remote_cluster_client" + ], + "tasks": { + "Mgqdm0r9SEGClWxp_RbnaQ:24578": { + "node": "Mgqdm0r9SEGClWxp_RbnaQ", + "id": 24578, + "type": "transport", + "action": "cluster:monitor/tasks/lists", + "start_time_in_millis": 1611612517044, + "running_time_in_nanos": 638700, + "cancellable": false, + "headers": {} + }, + "Mgqdm0r9SEGClWxp_RbnaQ:24579": { + "node": "Mgqdm0r9SEGClWxp_RbnaQ", + "id": 24579, + "type": "direct", + "action": "cluster:monitor/tasks/lists[n]", + "start_time_in_millis": 1611612517044, + "running_time_in_nanos": 222200, + "cancellable": false, + "parent_task_id": "Mgqdm0r9SEGClWxp_RbnaQ:24578", + "headers": {} + } + } + } + } +} +``` + +### The `resource_stats` object + +The `resource_stats` object is only updated for tasks that support resource tracking. These statistics are computed based on scheduled thread executions, including both threads that have finished working on the task and threads currently working on the task. Because the same thread may be scheduled to work on the same task multiple times, each instance of a given thread being scheduled to work on a given task is considered to be a single thread execution. + +The following table lists all response fields in the `resource_stats` object. + +Response field | Description | +:--- | :--- | +`average` | The average resource usage across all scheduled thread executions. | +`total` | The total resource usage across all scheduled thread executions. | +`min` | The minimum resource usage across all scheduled thread executions. | +`max` | The maximum resource usage across all scheduled thread executions. | +`thread_info` | Thread-count-related statistics.| +`thread_info.active_threads` | The number of threads currently working on the task. | +`thread_info.thread_executions` | The number of threads that have been scheduled to work on the task. 
| \ No newline at end of file diff --git a/_api-reference/tasks/tasks.md b/_api-reference/tasks/tasks.md new file mode 100644 index 00000000000..724dcbb220c --- /dev/null +++ b/_api-reference/tasks/tasks.md @@ -0,0 +1,85 @@ +--- +layout: default +title: Tasks API +has_children: yes +nav_order: 85 +redirect_from: + - /opensearch/rest-api/tasks/ + - /api-reference/tasks/ +--- + +# Tasks +**Introduced 1.0** +{: .label .label-purple } + +A _task_ is any operation that you run in a cluster. For example, searching your data collection of books for a title or author name is a task. When you run OpenSearch, a task is automatically created to monitor your cluster's health and performance. For more information about all of the tasks currently executing in your cluster, you can use the `tasks` API operation. + +## Attaching headers to tasks + +To associate requests with tasks for better tracking, you can provide an `X-Opaque-Id:` header as part of the HTTPS request reader of your `curl` command. The API will attach the specified header in the returned result. + +The following request returns tasks with an `X-Opaque-Id` of `111111`: + +```bash +curl -i -H "X-Opaque-Id: 111111" "https://localhost:9200/_tasks" -u 'admin:' --insecure +``` +{% include copy.html %} + +The `_tasks` operation returns the following result: + +```json +HTTP/1.1 200 OK +X-Opaque-Id: 111111 +content-type: application/json; charset=UTF-8 +content-length: 768 + +{ + "nodes": { + "Mgqdm0r9SEGClWxp_RbnaQ": { + "name": "opensearch-node1", + "transport_address": "172.18.0.4:9300", + "host": "172.18.0.4", + "ip": "172.18.0.4:9300", + "roles": [ + "data", + "ingest", + "master", + "remote_cluster_client" + ], + "tasks": { + "Mgqdm0r9SEGClWxp_RbnaQ:30072": { + "node": "Mgqdm0r9SEGClWxp_RbnaQ", + "id": 30072, + "type": "direct", + "action": "cluster:monitor/tasks/lists[n]", + "start_time_in_millis": 1613166701725, + "running_time_in_nanos": 245400, + "cancellable": false, + "parent_task_id": "Mgqdm0r9SEGClWxp_RbnaQ:30071", + "headers": { + "X-Opaque-Id": "111111" + } + }, + "Mgqdm0r9SEGClWxp_RbnaQ:30071": { + "node": "Mgqdm0r9SEGClWxp_RbnaQ", + "id": 30071, + "type": "transport", + "action": "cluster:monitor/tasks/lists", + "start_time_in_millis": 1613166701725, + "running_time_in_nanos": 658200, + "cancellable": false, + "headers": { + "X-Opaque-Id": "111111" + } + } + } + } + } +} +``` +This operation supports the same parameters as the `tasks` operation. The following example shows you how to associate `X-Opaque-Id` with specific tasks: + +```bash +curl -i -H "X-Opaque-Id: 123456" "https://localhost:9200/_tasks?nodes=opensearch-node1" -u 'admin:' --insecure +``` +{% include copy.html %} diff --git a/_automating-configurations/api/create-workflow.md b/_automating-configurations/api/create-workflow.md index 610bfe8fabf..f36aa345b4a 100644 --- a/_automating-configurations/api/create-workflow.md +++ b/_automating-configurations/api/create-workflow.md @@ -16,7 +16,7 @@ Creating a workflow adds the content of a workflow template to the flow framewor To obtain the validation template for workflow steps, call the [Get Workflow Steps API]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow-steps/). -You can include placeholder expressions in the value of workflow step fields. For example, you can specify a credential field in a template as `openAI_key: '${{ openai_key }}'`. The expression will be substituted with the user-provided value during provisioning, using the format {% raw %}`${{ }}`{% endraw %}. 
You can pass the actual key as a parameter by using the [Provision Workflow API]({{site.url}}{{site.baseurl}}/automating-configurations/api/provision-workflow/) or by using this API with the `provision` parameter set to `true`. +You can include placeholder expressions in the value of workflow step fields. For example, you can specify a credential field in a template as {% raw %}`openAI_key: '${{ openai_key }}'`{% endraw %}. The expression will be substituted with the user-provided value during provisioning, using the format {% raw %}`${{ }}`{% endraw %}. You can pass the actual key as a parameter by using the [Provision Workflow API]({{site.url}}{{site.baseurl}}/automating-configurations/api/provision-workflow/) or by using this API with the `provision` parameter set to `true`. Once a workflow is created, provide its `workflow_id` to other APIs. @@ -25,7 +25,7 @@ The `POST` method creates a new workflow. The `PUT` method updates an existing w You can only update a complete workflow if it has not yet been provisioned. {: .note} -## Path and HTTP methods +## Endpoints ```json POST /_plugins/_flow_framework/workflow @@ -84,6 +84,24 @@ PUT /_plugins/_flow_framework/workflow/?reprovision=true You can add new steps to the workflow but cannot delete them. Only index setting, search pipeline, and ingest pipeline steps can currently be updated. {: .note} +To control how long the request waits for the provisioning and reprovisioning process to complete, use the `wait_for_completion_timeout` parameter: + +```json +POST /_plugins/_flow_framework/workflow/?provision=true&wait_for_completion_timeout=2s +``` +{% include copy-curl.html %} + +```json +PUT /_plugins/_flow_framework/workflow//?reprovision=true&wait_for_completion_timeout=2s +``` +{% include copy-curl.html %} + +If the operation does not complete within the specified amount of time, the response returns the current workflow status while execution continues asynchronously. + +The `wait_for_completion_timeout` parameter can only be used when either `provision` or `reprovision` is set to `true` +{: .note} + +For example, the following request provisions a workflow and waits for up to 2 seconds for completion: You can create and provision a workflow using a [workflow template]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-templates/) as follows: ```json @@ -96,14 +114,15 @@ POST /_plugins/_flow_framework/workflow?use_case=&provision=true The following table lists the available query parameters. All query parameters are optional. User-provided parameters are only allowed if the `provision` parameter is set to `true`. -| Parameter | Data type | Description | -| :--- | :--- | :--- | -| `provision` | Boolean | Whether to provision the workflow as part of the request. Default is `false`. | -| `update_fields` | Boolean | Whether to update only the fields included in the request body. Default is `false`. | -| `reprovision` | Boolean | Whether to reprovision the entire template if it has already been provisioned. A complete template must be provided in the request body. Default is `false`. | -| `validation` | String | Whether to validate the workflow. Valid values are `all` (validate the template) and `none` (do not validate the template). Default is `all`. | -| `use_case` | String | The name of the [workflow template]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-templates/#supported-workflow-templates) to use when creating the workflow. 
| -| User-provided substitution expressions | String | Parameters matching substitution expressions in the template. Only allowed if `provision` is set to `true`. Optional. If `provision` is set to `false`, you can pass these parameters in the [Provision Workflow API query parameters]({{site.url}}{{site.baseurl}}/automating-configurations/api/provision-workflow/#query-parameters). | +| Parameter | Data type | Description | +|:---------------------------------------|:----------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `provision` | Boolean | Whether to provision the workflow as part of the request. Default is `false`. | +| `update_fields` | Boolean | Whether to update only the fields included in the request body. Default is `false`. | +| `reprovision` | Boolean | Whether to reprovision the entire template if it has already been provisioned. A complete template must be provided in the request body. Default is `false`. | +| `validation` | String | Whether to validate the workflow. Valid values are `all` (validate the template) and `none` (do not validate the template). Default is `all`. | +| `use_case` | String | The name of the [workflow template]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-templates/#supported-workflow-templates) to use when creating the workflow. | +| `wait_for_completion_timeout` | Time value | Specifies the maximum wait time for synchronous provisioning or reprovisioning. If the timeout is exceeded, the request returns the current workflow status while execution continues asynchronously.| +| User-provided substitution expressions | String | Parameters matching substitution expressions in the template. Only allowed if `provision` is set to `true`. Optional. If `provision` is set to `false`, you can pass these parameters in the [Provision Workflow API query parameters]({{site.url}}{{site.baseurl}}/automating-configurations/api/provision-workflow/#query-parameters). | ## Request body fields @@ -291,4 +310,33 @@ OpenSearch responds with the `workflow_id`: } ``` -Once you have created a workflow, you can use other workflow APIs with the `workflow_id`. \ No newline at end of file +Once you have created a workflow, you can use other workflow APIs with the `workflow_id`. 
+ +#### Example response with wait_for_completion_timeout enabled + +```json +{ + "workflow_id": "K13IR5QBEpCfUu_-AQdU", + "state": "COMPLETED", + "resources_created": [ + { + "workflow_step_name": "create_connector", + "workflow_step_id": "create_connector_1", + "resource_id": "LF3IR5QBEpCfUu_-Awd_", + "resource_type": "connector_id" + }, + { + "workflow_step_id": "register_model_2", + "workflow_step_name": "register_remote_model", + "resource_id": "L13IR5QBEpCfUu_-BQdI", + "resource_type": "model_id" + }, + { + "workflow_step_name": "deploy_model", + "workflow_step_id": "deploy_model_3", + "resource_id": "L13IR5QBEpCfUu_-BQdI", + "resource_type": "model_id" + } + ] +} +``` \ No newline at end of file diff --git a/_automating-configurations/api/delete-workflow.md b/_automating-configurations/api/delete-workflow.md index 13cd5ae5dc5..9869aba27a9 100644 --- a/_automating-configurations/api/delete-workflow.md +++ b/_automating-configurations/api/delete-workflow.md @@ -13,7 +13,7 @@ Note that deleting a workflow only deletes the stored template---it does not dep When a workflow is deleted, its corresponding status (returned by the [Workflow State API]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow-status/)) is also deleted unless either the provisioning status is `IN_PROGRESS` or resources have been provisioned. -## Path and HTTP methods +## Endpoints ```json DELETE /_plugins/_flow_framework/workflow/ diff --git a/_automating-configurations/api/deprovision-workflow.md b/_automating-configurations/api/deprovision-workflow.md index 98c944a9d4f..7cbb11ed2da 100644 --- a/_automating-configurations/api/deprovision-workflow.md +++ b/_automating-configurations/api/deprovision-workflow.md @@ -13,7 +13,7 @@ The workflow executes the provisioning steps in reverse order. If a failure occu To prevent data loss, resources created using the `create_index`, `create_search_pipeline`, and `create_ingest_pipeline` steps require the resource ID to be included in the `allow_delete` parameter. -## Path and HTTP methods +## Endpoints ```json POST /_plugins/_flow_framework/workflow//_deprovision diff --git a/_automating-configurations/api/get-workflow-status.md b/_automating-configurations/api/get-workflow-status.md index 280fb521951..e4f5c4f5808 100644 --- a/_automating-configurations/api/get-workflow-status.md +++ b/_automating-configurations/api/get-workflow-status.md @@ -9,7 +9,7 @@ nav_order: 40 [Provisioning a workflow]({{site.url}}{{site.baseurl}}/automating-configurations/api/provision-workflow/) may take a significant amount of time, particularly when the action is associated with OpenSearch indexing operations. The Get Workflow State API permits monitoring of the provisioning deployment status until it is complete. 
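For example, the following Bash sketch polls the Get Workflow State API until provisioning finishes. It assumes a local cluster without the Security plugin enabled and reuses the example workflow ID from this documentation; the `grep`-based parsing is a simplification for illustration only:

```bash
# Poll the workflow state until provisioning reports COMPLETED or FAILED.
# Assumes http://localhost:9200 with no authentication; adjust the host and ID as needed.
WORKFLOW_ID="8xL8bowB8y25Tqfenm50"
while true; do
  STATE=$(curl -s "http://localhost:9200/_plugins/_flow_framework/workflow/${WORKFLOW_ID}/_status" \
    | grep -o '"state" *: *"[A-Z_]*"')
  echo "Current workflow state: ${STATE}"
  case "${STATE}" in
    *COMPLETED*|*FAILED*) break ;;  # stop polling once provisioning ends
  esac
  sleep 5
done
```
{% include copy.html %}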
-## Path and HTTP methods +## Endpoints ```json GET /_plugins/_flow_framework/workflow//_status diff --git a/_automating-configurations/api/get-workflow-steps.md b/_automating-configurations/api/get-workflow-steps.md index 38059ec80c3..40332e4a105 100644 --- a/_automating-configurations/api/get-workflow-steps.md +++ b/_automating-configurations/api/get-workflow-steps.md @@ -27,11 +27,11 @@ This API returns a list of workflow steps, including their required inputs, outp } ``` -## Path and HTTP methods +## Endpoints ```json GET /_plugins/_flow_framework/workflow/_steps -GET /_plugins/_flow_framework/workflow/_step?workflow_step= +GET /_plugins/_flow_framework/workflow/_steps?workflow_step= ``` ## Query parameters @@ -54,7 +54,7 @@ GET /_plugins/_flow_framework/workflow/_steps To fetch specific workflow steps, pass the step names to the request as a query parameter: ```json -GET /_plugins/_flow_framework/workflow/_step?workflow_step=create_connector,delete_model,deploy_model +GET /_plugins/_flow_framework/workflow/_step?workflow_steps=create_connector,delete_model,deploy_model ``` {% include copy-curl.html %} @@ -73,4 +73,4 @@ To retrieve the template in JSON format, specify `Content-Type: application/json ```bash curl -XGET "http://localhost:9200/_plugins/_flow_framework/workflow/_steps" -H 'Content-Type: application/json' -``` \ No newline at end of file +``` diff --git a/_automating-configurations/api/get-workflow.md b/_automating-configurations/api/get-workflow.md index 7b1d5987c4c..0927e762037 100644 --- a/_automating-configurations/api/get-workflow.md +++ b/_automating-configurations/api/get-workflow.md @@ -9,7 +9,7 @@ nav_order: 20 The Get Workflow API retrieves the workflow template. -## Path and HTTP methods +## Endpoints ```json GET /_plugins/_flow_framework/workflow/ diff --git a/_automating-configurations/api/provision-workflow.md b/_automating-configurations/api/provision-workflow.md index 62c4954ee97..d9d90ce7f38 100644 --- a/_automating-configurations/api/provision-workflow.md +++ b/_automating-configurations/api/provision-workflow.md @@ -14,7 +14,7 @@ The `workflows` template field may contain multiple workflows. The workflow with You can only provision a workflow if it has not yet been provisioned. Deprovision the workflow if you need to repeat provisioning. {: .note} -## Path and HTTP methods +## Endpoints ```json POST /_plugins/_flow_framework/workflow//_provision @@ -30,7 +30,7 @@ The following table lists the available path parameters. ## Query parameters -If you have included a substitution expression in the template, you may pass it as a query parameter or as a string value of a request body field. For example, if you specified a credential field in a template as `openAI_key: '${{ openai_key }}'`, then you can include the `openai_key` parameter as a query parameter or body field so it can be substituted during provisioning. For example, the following request provides a query parameter: +If you have included a substitution expression in the template, you may pass it as a query parameter or as a string value of a request body field. For example, if you specified a credential field in a template as {% raw %}`openAI_key: '${{ openai_key }}'`{% endraw %}, then you can include the `openai_key` parameter as a query parameter or body field so it can be substituted during provisioning. 
For example, the following request provides a query parameter: ```json POST /_plugins/_flow_framework/workflow//_provision?= @@ -39,6 +39,7 @@ POST /_plugins/_flow_framework/workflow//_provision?=/_provision&wait_for_completion_timeout=2s +``` +{% include copy-curl.html %} + +The following request substitutes the expression {% raw %}`${{ openai_key }}`{% endraw %} with the value "12345" using a query parameter: ```json POST /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50/_provision?openai_key=12345 ``` {% include copy-curl.html %} -The following request substitutes the expression `${{ openai_key }}` with the value "12345" using the request body: +The following request substitutes the expression {% raw %}`${{ openai_key }}`{% endraw %} with the value "12345" using the request body: ```json POST /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50/_provision @@ -74,4 +82,33 @@ OpenSearch responds with the same `workflow_id` that was used in the request: } ``` -To obtain the provisioning status, query the [Get Workflow State API]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow-status/). \ No newline at end of file +To obtain the provisioning status, call the [Get Workflow State API]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow-status/). + +#### Example response with wait_for_completion_timeout enabled + +```json +{ + "workflow_id": "K13IR5QBEpCfUu_-AQdU", + "state": "COMPLETED", + "resources_created": [ + { + "workflow_step_name": "create_connector", + "workflow_step_id": "create_connector_1", + "resource_id": "LF3IR5QBEpCfUu_-Awd_", + "resource_type": "connector_id" + }, + { + "workflow_step_id": "register_model_2", + "workflow_step_name": "register_remote_model", + "resource_id": "L13IR5QBEpCfUu_-BQdI", + "resource_type": "model_id" + }, + { + "workflow_step_name": "deploy_model", + "workflow_step_id": "deploy_model_3", + "resource_id": "L13IR5QBEpCfUu_-BQdI", + "resource_type": "model_id" + } + ] +} +``` \ No newline at end of file diff --git a/_automating-configurations/api/search-workflow-state.md b/_automating-configurations/api/search-workflow-state.md index 1cacb3a32b7..786057c9520 100644 --- a/_automating-configurations/api/search-workflow-state.md +++ b/_automating-configurations/api/search-workflow-state.md @@ -9,7 +9,7 @@ nav_order: 65 You can search for resources created by workflows by matching a query to a field. The fields you can search correspond to those returned by the [Get Workflow Status API]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow-status/). -## Path and HTTP methods +## Endpoints ```json GET /_plugins/_flow_framework/workflow/state/_search diff --git a/_automating-configurations/api/search-workflow.md b/_automating-configurations/api/search-workflow.md index b78de9e9d21..2882252d4e0 100644 --- a/_automating-configurations/api/search-workflow.md +++ b/_automating-configurations/api/search-workflow.md @@ -9,7 +9,7 @@ nav_order: 60 You can retrieve created workflows with their `workflow_id` or search for workflows by using a query matching a field. You can use the `use_case` field to search for similar workflows. 
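For example, the following sketch finds workflows whose `use_case` field matches a given template name. It assumes a local cluster without the Security plugin enabled, and `semantic_search` is only an illustrative value:

```bash
# Search stored workflows by matching the use_case field.
# Assumes http://localhost:9200 with no authentication; adjust as needed.
curl -s -XGET "http://localhost:9200/_plugins/_flow_framework/workflow/_search" \
  -H 'Content-Type: application/json' \
  -d '{
    "query": {
      "match": {
        "use_case": "semantic_search"
      }
    }
  }'
```
{% include copy.html %}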
-## Path and HTTP methods +## Endpoints ```json GET /_plugins/_flow_framework/workflow/_search diff --git a/_automating-configurations/workflow-settings.md b/_automating-configurations/workflow-settings.md index 78762fdfbb7..5c165c9f29b 100644 --- a/_automating-configurations/workflow-settings.md +++ b/_automating-configurations/workflow-settings.md @@ -15,3 +15,6 @@ The following keys represent configurable workflow settings. |`plugins.flow_framework.max_workflow_steps` |Integer |`50` |The maximum number of steps a workflow can have. | |`plugins.flow_framework.request_timeout` |Time units |`10s` |The default timeout for REST requests, which applies to internal search queries. | |`plugins.flow_framework.task_request_retry_duration` |Time units |`5s` | When steps correspond to an API that produces a `task_id`, OpenSearch will retry them at this interval until completion. | +|`plugins.flow_framework.workflow_thread_pool_size` |Integer |`4` |The maximum size of the workflow thread pool used for polling retries. | +|`plugins.flow_framework.provision_thread_pool_size` |Integer |`8` |The maximum size of the provision workflow thread pool. | +|`plugins.flow_framework.deprovision_thread_pool_size` |Integer |`4` |The maximum size of the deprovision workflow thread pool. | diff --git a/_automating-configurations/workflow-templates.md b/_automating-configurations/workflow-templates.md index 62406ae0699..eb60535c506 100644 --- a/_automating-configurations/workflow-templates.md +++ b/_automating-configurations/workflow-templates.md @@ -22,7 +22,7 @@ In this example, you'll configure the `semantic_search_with_cohere_embedding_que - Deploys an externally hosted Cohere model - Creates an ingest pipeline using the model -- Creates a sample k-NN index and configures a search pipeline to define the default model ID for that index +- Creates a sample vector index and configures a search pipeline to define the default model ID for that index ### Step 1: Create and provision the workflow @@ -44,7 +44,7 @@ OpenSearch responds with a workflow ID for the created workflow: } ``` -The workflow in the previous step creates a default k-NN index. The default index name is `my-nlp-index`: +The workflow in the previous step creates a default vector index. The default index name is `my-nlp-index`: ```json { @@ -120,26 +120,306 @@ GET /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50/_status ## Supported workflow templates -The following table lists the supported workflow templates. To use a workflow template, specify it in the `use_case` query parameter when creating a workflow. - -| Template use case | Description | Required parameters | Defaults | -| `bedrock_titan_embedding_model_deploy` | Creates and deploys an Amazon Bedrock embedding model (by default, `titan-embed-text-v1`).| `create_connector.credential.access_key`, `create_connector.credential.secret_key`, `create_connector.credential.session_token` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/bedrock-titan-embedding-defaults.json)| -| `bedrock_titan_multimodal_model_deploy` | Creates and deploys an Amazon Bedrock multimodal embedding model (by default, `titan-embed-image-v1`). | `create_connector.credential.access_key`, `create_connector.credential.secret_key`, `create_connector.credential.session_token` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/bedrock-titan-multimodal-defaults.json). 
| -| `cohere_embedding_model_deploy`| Creates and deploys a Cohere embedding model (by default, `embed-english-v3.0`). | `create_connector.credential.key` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/cohere-embedding-defaults.json) | -| `cohere_chat_model_deploy` | Creates and deploys a Cohere chat model (by default, Cohere Command). | `create_connector.credential.key` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/cohere-chat-defaults.json) | -| `open_ai_embedding_model_deploy` | Creates and deploys an OpenAI embedding model (by default, `text-embedding-ada-002`). | `create_connector.credential.key` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/openai-embedding-defaults.json) | -| `openai_chat_model_deploy` | Creates and deploys an OpenAI chat model (by default, `gpt-3.5-turbo`). | `create_connector.credential.key` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/openai-chat-defaults.json) | -| `local_neural_sparse_search_bi_encoder` | Configures [neural sparse search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-search/):
- Deploys a pretrained sparse encoding model.
- Creates an ingest pipeline with a sparse encoding processor.
- Creates a sample index to use for sparse search, specifying the newly created pipeline as the default pipeline. | None |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/local-sparse-search-biencoder-defaults.json) | -| `semantic_search` | Configures [semantic search]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/):
- Creates an ingest pipeline with a `text_embedding` processor and a k-NN index
You must provide the model ID of the text embedding model to be used. | `create_ingest_pipeline.model_id` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/semantic-search-defaults.json) | -| `semantic_search_with_query_enricher` | Configures [semantic search]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/) similarly to the `semantic_search` template. Adds a [`query_enricher`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/neural-query-enricher/) search processor that sets a default model ID for neural queries. You must provide the model ID of the text embedding model to be used. | `create_ingest_pipeline.model_id` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/semantic-search-query-enricher-defaults.json) | -| `semantic_search_with_cohere_embedding` | Configures [semantic search]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/) and deploys a Cohere embedding model. You must provide the API key for the Cohere model. | `create_connector.credential.key` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/cohere-embedding-semantic-search-defaults.json) | -| `semantic_search_with_cohere_embedding_query_enricher` | Configures [semantic search]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/) and deploys a Cohere embedding model. Adds a [`query_enricher`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/neural-query-enricher/) search processor that sets a default model ID for neural queries. You must provide the API key for the Cohere model. | `create_connector.credential.key` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/cohere-embedding-semantic-search-with-query-enricher-defaults.json) | -| `multimodal_search` | Configures an ingest pipeline with a `text_image_embedding` processor and a k-NN index for [multimodal search]({{site.url}}{{site.baseurl}}/search-plugins/multimodal-search/). You must provide the model ID of the multimodal embedding model to be used. | `create_ingest_pipeline.model_id` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/multi-modal-search-defaults.json) | -| `multimodal_search_with_bedrock_titan` | Deploys an Amazon Bedrock multimodal model and configures an ingest pipeline with a `text_image_embedding` processor and a k-NN index for [multimodal search]({{site.url}}{{site.baseurl}}/search-plugins/multimodal-search/). You must provide your AWS credentials. | `create_connector.credential.access_key`, `create_connector.credential.secret_key`, `create_connector.credential.session_token` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/multimodal-search-bedrock-titan-defaults.json) | -| `hybrid_search` | Configures [hybrid search]({{site.url}}{{site.baseurl}}/search-plugins/hybrid-search/):
- Creates an ingest pipeline, a k-NN index, and a search pipeline with a `normalization_processor`. You must provide the model ID of the text embedding model to be used. | `create_ingest_pipeline.model_id` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/hybrid-search-defaults.json) | -| `conversational_search_with_llm_deploy` | Deploys a large language model (LLM) (by default, Cohere Chat) and configures a search pipeline with a `retrieval_augmented_generation` processor for [conversational search]({{site.url}}{{site.baseurl}}/search-plugins/conversational-search/). | `create_connector.credential.key` |[Defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/conversational-search-defaults.json) | -| `semantic_search_with_reindex` | Configures [semantic search]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/) with a newly deployed Cohere embedding model. The model is configured to reindex a source index into a newly configured k-NN index. You must provide the API key for the Cohere model along with the source index to be reindexed. | `create_connector.credential.key`, `reindex.source_index`|[Defaults](https://github.com/opensearch-project/flow-framework/blob/main/src/main/resources/defaults/semantic-search-with-reindex-defaults.json) | -| `semantic_search_with_local_model` | Configures [semantic search]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/) and deploys a pretrained model (`msmarco-distilbert-base-tas-b`). Adds a [`query_enricher`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/neural-query-enricher/) search processor that sets a default model ID for neural queries and creates a linked k-NN index called `my-nlp-index`. You must provide the API key for the Cohere model. | None | [Defaults](https://github.com/opensearch-project/flow-framework/blob/main/src/main/resources/defaults/semantic-search-with-local-model-defaults.json) | -| `hybrid_search_with_local_model` | Configures [hybrid search]({{site.url}}{{site.baseurl}}/search-plugins/hybrid-search/) and deploys a pretrained model (`msmarco-distilbert-base-tas-b`). Creates an ingest pipeline, a k-NN index, and a search pipeline with a `normalization_processor`. | None | [Defaults](https://github.com/opensearch-project/flow-framework/blob/main/src/main/resources/defaults/hybrid-search-with-local-model-defaults.json) | +To use a workflow template, specify it in the `use_case` query parameter when creating a workflow. The following templates are supported: +
+ + The following templates are supported: + +- Model deployment templates: + - [Amazon Bedrock Titan embedding](#amazon-bedrock-titan-embedding) + - [Amazon Bedrock Titan multimodal](#amazon-bedrock-titan-multimodal) + - [Cohere embedding](#cohere-embedding) + - [Cohere chat](#cohere-chat) + - [OpenAI embedding](#openai-embedding) + - [OpenAI chat](#openai-chat) +- Semantic search templates: + - [Semantic search](#semantic-search) + - [Semantic search with a query enricher](#semantic-search-with-a-query-enricher) + - [Semantic search using a local model](#semantic-search-using-a-local-model) + - [Semantic search using a Cohere embedding model](#semantic-search-using-a-cohere-embedding-model) + - [Semantic search using Cohere embedding models with a query enricher](#semantic-search-using-cohere-embedding-models-with-a-query-enricher) + - [Semantic search using Cohere embedding models with reindexing](#semantic-search-using-cohere-embedding-models-with-reindexing) +- Neural sparse search templates: + - [Neural sparse search](#neural-sparse-search) +- Multimodal search templates: + - [Multimodal search](#multimodal-search) + - [Multimodal search using Amazon Bedrock Titan](#multimodal-search-using-amazon-bedrock-titan) +- Hybrid search templates: + - [Hybrid search](#hybrid-search) + - [Hybrid search using a local model](#hybrid-search-using-a-local-model) +- Conversational search templates: + - [Conversational search using an LLM](#conversational-search-using-an-llm) + +
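For example, the following sketch creates and provisions a workflow from the Cohere embedding deployment template listed above, passing its required parameter in the request body. It assumes a local cluster without the Security plugin enabled, and the API key value is a placeholder:

```bash
# Create and provision a workflow from the cohere_embedding_model_deploy template.
# The create_connector.credential.key value is a placeholder; supply a real Cohere API key.
# Assumes http://localhost:9200 with no authentication; adjust as needed.
curl -s -XPOST "http://localhost:9200/_plugins/_flow_framework/workflow?use_case=cohere_embedding_model_deploy&provision=true" \
  -H 'Content-Type: application/json' \
  -d '{
    "create_connector.credential.key": "<YOUR_COHERE_API_KEY>"
  }'
```
{% include copy.html %}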
+ +## Model deployment templates + +The following workflow templates configure model deployment. + +### Amazon Bedrock Titan embedding + +This workflow creates and deploys an Amazon Bedrock embedding model (by default, `titan-embed-text-v1`). + +- **Use case**: `bedrock_titan_embedding_model_deploy` +- **Created components**: A connector and model for the Amazon Bedrock Titan embeddings model +- **Required parameters**: + - `create_connector.credential.access_key` + - `create_connector.credential.secret_key` + - `create_connector.credential.session_token` +- [Defaults](https://github.com/opensearch-project/flow-framework/blob/main/src/main/resources/defaults/bedrock-titan-embedding-defaults.json) + +**Note**: Requires AWS credentials and access to Amazon Bedrock. + +### Amazon Bedrock Titan multimodal + +This workflow creates and deploys an Amazon Bedrock multimodal embedding model (by default, `titan-embed-image-v1`). + +- **Use case**: `bedrock_titan_multimodal_model_deploy` +- **Created components**: A connector and model for Amazon Bedrock Titan multimodal embeddings +- **Required parameters**: + - `create_connector.credential.access_key` + - `create_connector.credential.secret_key` + - `create_connector.credential.session_token` +- [Defaults](https://github.com/opensearch-project/flow-framework/blob/main/src/main/resources/defaults/bedrock-titan-multimodal-defaults.json) + +**Note**: Requires AWS credentials and access to Amazon Bedrock. + +### Cohere embedding + +This workflow creates and deploys a Cohere embedding model (by default, `embed-english-v3.0`). + +- **Use case**: `cohere_embedding_model_deploy` +- **Created components**: A connector and model for Cohere embedding +- **Required parameters**: + - `create_connector.credential.key` +- [Defaults](https://github.com/opensearch-project/flow-framework/blob/main/src/main/resources/defaults/cohere-embedding-defaults.json) + +**Note**: Requires a Cohere API key. + +### Cohere chat + +This workflow creates and deploys a Cohere chat model (by default, Cohere Command). + +- **Use case**: `cohere_chat_model_deploy` +- **Created components**: A connector and model for Cohere chat +- **Required parameters**: + - `create_connector.credential.key` +- [Defaults](https://github.com/opensearch-project/flow-framework/blob/main/src/main/resources/defaults/cohere-chat-defaults.json) + +**Note**: Requires a Cohere API key. + +### OpenAI embedding + +This workflow creates and deploys an OpenAI embedding model (by default, `text-embedding-ada-002`). + +- **Use case**: `open_ai_embedding_model_deploy` +- **Created components**: A connector and model for OpenAI embeddings +- **Required parameters**: + - `create_connector.credential.key` +- [Defaults](https://github.com/opensearch-project/flow-framework/blob/main/src/main/resources/defaults/openai-embedding-defaults.json) + +**Note**: Requires an OpenAI API key. + +### OpenAI chat + +This workflow creates and deploys an OpenAI chat model (by default, `gpt-3.5-turbo`). + +- **Use case**: `openai_chat_model_deploy` +- **Created components**: A connector and model for OpenAI chat +- **Required parameters**: + - `create_connector.credential.key` +- [Defaults](https://github.com/opensearch-project/flow-framework/blob/main/src/main/resources/defaults/openai-chat-defaults.json) + +**Note**: Requires an OpenAI API key. + +## Semantic search templates + +The following workflow templates configure semantic search. 
+ +### Semantic search + +This workflow configures [semantic search]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/). + +- **Use case**: `semantic_search` +- **Created components**: + - An ingest pipeline with a `text_embedding` processor + - A vector index configured with the pipeline +- **Required parameters**: + - `create_ingest_pipeline.model_id`: The model ID of the text embedding model to be used +- [Defaults](https://github.com/opensearch-project/flow-framework/blob/main/src/main/resources/defaults/semantic-search-defaults.json) + +### Semantic search with a query enricher + +This workflow configures [semantic search]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/) with a default model for neural queries. + +- **Use case**: `semantic_search_with_query_enricher` +- **Created components**: + - An ingest pipeline with a `text_embedding` processor + - A vector index configured with the pipeline + - A [`query_enricher`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/neural-query-enricher/) search processor that sets a default model ID for neural queries. +- **Required parameters**: + - `create_ingest_pipeline.model_id`: The model ID of the text embedding model to be used +- [Defaults](https://github.com/opensearch-project/flow-framework/blob/main/src/main/resources/defaults/semantic-search-query-enricher-defaults.json) + +### Semantic search using a local model + +This workflow configures [semantic search]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/) and deploys a pretrained model. + +- **Use case**: `semantic_search_with_local_model` +- **Created components**: + - A pretrained model (by default, `huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2`) + - An ingest pipeline with a `text_embedding` processor + - A vector index configured with the pipeline + - A [`query_enricher`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/neural-query-enricher/) search processor that sets a default model ID for neural queries. +- **Required parameters**: None +- [Defaults](https://github.com/opensearch-project/flow-framework/blob/main/src/main/resources/defaults/semantic-search-with-local-model-defaults.json) + +**Note**: Uses a local pretrained model with a default configuration. + +### Semantic search using a Cohere embedding model + +This workflow configures [semantic search]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/) and deploys a Cohere embedding model. + +- **Use case**: `semantic_search_with_cohere_embedding` +- **Created components**: + - A Cohere embedding model (by default, `embed-english-v3.0`) connector and deployment + - An ingest pipeline with a `text_embedding` processor + - A vector index configured with the pipeline +- **Required parameters**: + - `create_connector.credential.key`: API key for the Cohere model +- [Defaults](https://github.com/opensearch-project/flow-framework/blob/main/src/main/resources/defaults/cohere-embedding-semantic-search-defaults.json) + +**Note**: Requires a Cohere API key. + +### Semantic search using Cohere embedding models with a query enricher + +This workflow configures [semantic search]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/), deploys a Cohere embedding model, and adds a query enricher search processor. 
+ +- **Use case**: `semantic_search_with_cohere_embedding_query_enricher` +- **Created components**: + - A Cohere embedding model connector and deployment + - An ingest pipeline with a `text_embedding` processor + - A vector index configured with the pipeline + - A [`query_enricher`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/neural-query-enricher/) search processor that sets a default model ID for neural queries. +- **Required parameters**: + - `create_connector.credential.key`: API key for the Cohere model +- [Defaults](https://github.com/opensearch-project/flow-framework/blob/main/src/main/resources/defaults/cohere-embedding-semantic-search-with-query-enricher-defaults.json) + +**Note**: Requires a Cohere API key. + +### Semantic search using Cohere embedding models with reindexing + +This workflow configures [semantic search]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/) with a Cohere embedding model and reindexes an existing index. + +- **Use case**: `semantic_search_with_reindex` +- **Created components**: + - A Cohere embedding model connector and deployment + - A vector index configured with the pipeline + - A reindexing process +- **Required parameters**: + - `create_connector.credential.key`: API key for the Cohere model + - `reindex.source_index`: The source index to be reindexed +- [Defaults](https://github.com/opensearch-project/flow-framework/blob/main/src/main/resources/defaults/semantic-search-with-reindex-defaults.json) + +**Note**: Reindexes a source index into a newly configured k-NN index using a Cohere embedding model. + +## Neural sparse search templates + +The following workflow template configures [neural sparse search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-search/). + +### Neural sparse search + +This workflow configures [neural sparse search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-search/). + +- **Use case**: `local_neural_sparse_search_bi_encoder` +- **Created components**: + - A locally hosted pretrained sparse encoding model (by default, `amazon/neural-sparse/opensearch-neural-sparse-encoding-v1`) + - An ingest pipeline with a `sparse_encoding` processor + - A vector index configured with the pipeline +- **Required parameters**: None +- [Defaults](https://github.com/opensearch-project/flow-framework/blob/main/src/main/resources/defaults/local-sparse-search-biencoder-defaults.json) + +## Multimodal search templates + +The following workflow templates configure [multimodal search]({{site.url}}{{site.baseurl}}/search-plugins/multimodal-search/). + +### Multimodal search + +This workflow configures [multimodal search]({{site.url}}{{site.baseurl}}/search-plugins/multimodal-search/). + +- **Use case**: `multimodal_search` +- **Created components**: + - An ingest pipeline with a `text_image_embedding` processor + - A vector index configured with the pipeline +- **Required parameters**: + - `create_ingest_pipeline.model_id`: The model ID of the multimodal embedding model to be used +- [Defaults](https://github.com/opensearch-project/flow-framework/blob/main/src/main/resources/defaults/multi-modal-search-defaults.json) + +### Multimodal search using Amazon Bedrock Titan + +This workflow deploys an Amazon Bedrock multimodal model and configures a multimodal search pipeline. 
+ +- **Use case**: `multimodal_search_with_bedrock_titan` +- **Created components**: + - An Amazon Bedrock Titan multimodal embedding model connector and deployment + - An ingest pipeline with a `text_image_embedding` processor + - A vector index for multimodal search configured with the pipeline +- **Required parameters**: + - `create_connector.credential.access_key` + - `create_connector.credential.secret_key` + - `create_connector.credential.session_token` +- [Defaults](https://github.com/opensearch-project/flow-framework/blob/main/src/main/resources/defaults/multimodal-search-bedrock-titan-defaults.json) + +**Note**: Requires AWS credentials and access to Amazon Bedrock. + +## Hybrid search templates + +The following workflow templates configure [hybrid search]({{site.url}}{{site.baseurl}}/search-plugins/hybrid-search/). + +### Hybrid search + +This workflow configures [hybrid search]({{site.url}}{{site.baseurl}}/search-plugins/hybrid-search/). + +- **Use case**: `hybrid_search` +- **Created components**: + - An ingest pipeline + - A vector index configured with the pipeline + - A search pipeline with a `normalization_processor` +- **Required parameters**: + - `create_ingest_pipeline.model_id`: The model ID of the text embedding model to be used +- [Defaults](https://github.com/opensearch-project/flow-framework/blob/main/src/main/resources/defaults/hybrid-search-defaults.json) + +### Hybrid search using a local model + +This workflow configures hybrid search and deploys a pretrained model. + +- **Use case**: `hybrid_search_with_local_model` +- **Created components**: + - A pretrained model (by default, `huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2`) + - An ingest pipeline + - A vector index configured with the pipeline + - A search pipeline with a `normalization_processor` +- **Required parameters**: None +- [Defaults](https://github.com/opensearch-project/flow-framework/blob/main/src/main/resources/defaults/hybrid-search-with-local-model-defaults.json) + +**Note**: Uses a local pretrained model for hybrid search configuration. + +## Conversational search templates + +The following workflow template configures [conversational search with RAG]({{site.url}}{{site.baseurl}}/search-plugins/conversational-search/). + +### Conversational search using an LLM + +This workflow deploys a large language model and configures a conversational search pipeline. + +- **Use case**: `conversational_search_with_llm_deploy` +- **Created components**: + - A chat model (by default, Cohere Command) connector and deployment + - A search pipeline with a `retrieval_augmented_generation` processor +- **Required parameters**: + - `create_connector.credential.key`: API key for the LLM +- [Defaults](https://github.com/opensearch-project/flow-framework/blob/main/src/main/resources/defaults/conversational-search-defaults.json) + +**Note**: Requires an API key for the chosen language model. diff --git a/_automating-configurations/workflow-tutorial.md b/_automating-configurations/workflow-tutorial.md index 0074ad4691c..4efb94f78f6 100644 --- a/_automating-configurations/workflow-tutorial.md +++ b/_automating-configurations/workflow-tutorial.md @@ -16,10 +16,10 @@ The setup requires the following sequence of API requests, with provisioned reso * [`deploy_model_3`](#deploy_model_3): Deploy the model. 1. **Use the deployed model for inference** * Set up several tools that perform specific tasks: - * [`cat_index_tool`](#cat_index_tool): Set up a tool to obtain index information. 
+ * [`list_index_tool`](#list_index_tool): Set up a tool to obtain index information. * [`ml_model_tool`](#ml_model_tool): Set up a machine learning (ML) model tool. * Set up one or more agents that use some combination of the tools: - * [`sub_agent`](#sub_agent): Create an agent that uses the `cat_index_tool`. + * [`sub_agent`](#sub_agent): Create an agent that uses the `list_index_tool`. * Set up tools representing these agents: * [`agent_tool`](#agent_tool): Wrap the `sub_agent` so that you can use it as a tool. * [`root_agent`](#root_agent): Set up a root agent that may delegate the task to either a tool or another agent. @@ -97,7 +97,7 @@ The [Deploy Model API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model- register_model_2: model_id ``` -When using the Deploy Model API directly, a task ID is returned, requiring use of the [Tasks API](https://opensearch.org/docs/latest/ml-commons-plugin/api/tasks-apis/get-task/) to determine when the deployment is complete. The automated workflow eliminates the manual status check and returns the final `model_id` directly. +When using the Deploy Model API directly, a task ID is returned, requiring use of the [Tasks API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/tasks-apis/get-task/) to determine when the deployment is complete. The automated workflow eliminates the manual status check and returns the final `model_id` directly. ### Ordering steps @@ -119,17 +119,17 @@ If you define `previous_node_inputs`, then defining edges is optional. A CoT agent can use the deployed model in a tool. This step doesn’t strictly correspond to an API but represents a component of the body required by the [Register Agent API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/). This simplifies the register request and allows reuse of the same tool in multiple agents. For more information about agents and tools, see [Agents and tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/). -### cat_index_tool +### list_index_tool -You can configure other tools to be used by the CoT agent. For example, you can configure a `cat_index_tool` as follows. This tool does not depend on any previous steps: +You can configure other tools to be used by the CoT agent. For example, you can configure a `list_index_tool` as follows. This tool does not depend on any previous steps: ```yaml -- id: cat_index_tool +- id: list_index_tool type: create_tool user_inputs: - name: CatIndexTool - type: CatIndexTool + name: ListIndexTool + type: ListIndexTool parameters: max_iteration: 5 ``` @@ -138,7 +138,7 @@ You can configure other tools to be used by the CoT agent. For example, you can ### sub_agent -To use the `cat_index_tool` in the agent configuration, specify it as one of the tools in the `previous_node_inputs` field of the agent. You can add other tools to `previous_node_inputs` as necessary. The agent also needs a large language model (LLM) in order to reason with the tools. The LLM is defined by the `llm.model_id` field. This example assumes that the `model_id` from the `deploy_model_3` step will be used. However, if another model is already deployed, the `model_id` of that previously deployed model could be included in the `user_inputs` field instead: +To use the `list_index_tool` in the agent configuration, specify it as one of the tools in the `previous_node_inputs` field of the agent. You can add other tools to `previous_node_inputs` as necessary. The agent also needs a large language model (LLM) in order to reason with the tools. 
The LLM is defined by the `llm.model_id` field. This example assumes that the `model_id` from the `deploy_model_3` step will be used. However, if another model is already deployed, the `model_id` of that previously deployed model could be included in the `user_inputs` field instead: ```yaml - id: sub_agent @@ -146,7 +146,7 @@ To use the `cat_index_tool` in the agent configuration, specify it as one of the previous_node_inputs: # When llm.model_id is not present this can be used as a fallback value deploy-model-3: model_id - cat_index_tool: tools + list_index_tool: tools user_inputs: name: Sub Agent type: conversational @@ -164,7 +164,7 @@ To use the `cat_index_tool` in the agent configuration, specify it as one of the OpenSearch will automatically create the following edges so that the agent can retrieve the fields from the previous node: ```yaml -- source: cat_index_tool +- source: list_index_tool dest: sub_agent - source: deploy_model_3 dest: sub_agent @@ -322,11 +322,11 @@ workflows: # For example purposes, the model_id obtained as the output of the deploy_model_3 step will be used # for several below steps. However, any other deployed model_id can be used for those steps. # This is one example tool from the Agent Framework. - - id: cat_index_tool + - id: list_index_tool type: create_tool user_inputs: - name: CatIndexTool - type: CatIndexTool + name: ListIndexTool + type: ListIndexTool parameters: max_iteration: 5 # This simple agent only has one tool, but could be configured with many tools @@ -334,7 +334,7 @@ workflows: type: register_agent previous_node_inputs: deploy-model-3: model_id - cat_index_tool: tools + list_index_tool: tools user_inputs: name: Sub Agent type: conversational @@ -394,7 +394,7 @@ workflows: dest: register_model_2 - source: register_model_2 dest: deploy_model_3 - - source: cat_index_tool + - source: list_index_tool dest: sub_agent - source: deploy_model_3 dest: sub_agent @@ -479,11 +479,11 @@ The following is the same template in JSON format: } }, { - "id": "cat_index_tool", + "id": "list_index_tool", "type": "create_tool", "user_inputs": { - "name": "CatIndexTool", - "type": "CatIndexTool", + "name": "ListIndexTool", + "type": "ListIndexTool", "parameters": { "max_iteration": 5 } @@ -494,7 +494,7 @@ The following is the same template in JSON format: "type": "register_agent", "previous_node_inputs": { "deploy-model-3": "llm.model_id", - "cat_index_tool": "tools" + "list_index_tool": "tools" }, "user_inputs": { "name": "Sub Agent", @@ -581,7 +581,7 @@ The following is the same template in JSON format: "dest": "deploy_model_3" }, { - "source": "cat_index_tool", + "source": "list_index_tool", "dest": "sub_agent" }, { diff --git a/_benchmark/index.md b/_benchmark/index.md index 6d343b908a8..da6c5d6e009 100644 --- a/_benchmark/index.md +++ b/_benchmark/index.md @@ -8,6 +8,31 @@ has_toc: false permalink: /benchmark/ redirect_from: - /benchmark/index/ + - /benchmark/tutorials/index/ +tutorial_cards: + - heading: "Get started with OpenSearch Benchmark" + description: "Run your first OpenSearch Benchmark workload and receive performance metrics" + link: "/benchmark/quickstart/" + - heading: "Choosing a workload" + description: "Choose a benchmark workload based on your cluster's use case" + link: "/benchmark/user-guide/understanding-workloads/choosing-a-workload/" +more_cards: + - heading: "User guide" + description: "Learn how to benchmark the performance of your cluster" + link: "/benchmark/user-guide/index/" + - heading: "Reference" + description: "Learn about 
OpenSearch Benchmark commands and options" + link: "/benchmark/reference/index/" +items: + - heading: "Install and configure OpenSearch Benchmark" + description: "Install OpenSearch Benchmark and configure your experience" + link: "/benchmark/user-guide/install-and-configure/installing-benchmark/" + - heading: "Run a workload" + description: "Run a workload and receive performance metrics." + link: "/benchmark/user-guide/working-with-workloads/running-workloads/" + - heading: "Analyze performance metrics" + description: "View your benchmark report and analyze your metrics" + link: "/benchmark/user-guide/understanding-results/summary-reports/" --- # OpenSearch Benchmark @@ -18,18 +43,25 @@ OpenSearch Benchmark is a macrobenchmark utility provided by the [OpenSearch Pro - Informing decisions about when to upgrade your cluster to a new version. - Determining how changes to your workflow---such as modifying mappings or queries---might impact your cluster. +## Get started + OpenSearch Benchmark can be installed directly on a compatible host running Linux or macOS. You can also run OpenSearch Benchmark in a Docker container. See [Installing OpenSearch Benchmark]({{site.url}}{{site.baseurl}}/benchmark/installing-benchmark/) for more information. +{: .info } + +{% include cards.html cards=page.tutorial_cards %} + +{% include list.html list_items=page.items%} + + +[Get started]({{site.url}}{{site.baseurl}}/benchmark/quickstart/){: .btn-dark-blue} + + +## Customize your benchmarks -The following diagram visualizes how OpenSearch Benchmark works when run against a local host: +{% include cards.html cards=page.more_cards %} -![Benchmark workflow]({{site.url}}{{site.baseurl}}/images/benchmark/osb-workflow.jpg). -The OpenSearch Benchmark documentation is split into four sections: -- [Quickstart]({{site.url}}{{site.baseurl}}/benchmark/quickstart/): Learn how to quickly run and install OpenSearch Benchmark. -- [User guide]({{site.url}}{{site.baseurl}}/benchmark/user-guide/index/): Dive deep into how OpenSearch Benchmark can help you track the performance of your cluster. -- [Tutorials]({{site.url}}{{site.baseurl}}/benchmark/tutorials/index/): Use step-by-step guides for more advanced benchmarking configurations and functionality. -- [Reference]({{site.url}}{{site.baseurl}}/benchmark/reference/index/): A detailed reference of metrics, commands, telemetry devices, and workloads. diff --git a/_benchmark/reference/commands/aggregate.md b/_benchmark/reference/commands/aggregate.md index 17612f11647..a891bf3edf1 100644 --- a/_benchmark/reference/commands/aggregate.md +++ b/_benchmark/reference/commands/aggregate.md @@ -69,9 +69,30 @@ Aggregate test execution ID: aggregate_results_geonames_9aafcfb8-d3b7-4583-864e ------------------------------- ``` -The results will be aggregated into one test execution and stored under the ID shown in the output: +The results will be aggregated into one test execution and stored under the ID shown in the output. +### Additional options - `--test-execution-id`: Define a unique ID for the aggregated test execution. - `--results-file`: Write the aggregated results to the provided file. - `--workload-repository`: Define the repository from which OpenSearch Benchmark will load workloads (default is `default`). +## Aggregated results + +Aggregated results includes the following information: + +- **Relative Standard Deviation (RSD)**: For each metric an additional `mean_rsd` value shows the spread of results across test executions. 
+- **Overall min/max values**: Instead of averaging minimum and maximum values, the aggregated result include `overall_min` and `overall_max` which reflect the true minimum/maximum across all test runs. +- **Storage**: Aggregated test results are stored in a separate `aggregated_results` folder alongside the `test_executions` folder. + +The following example shows aggregated results: + +```json + "throughput": { + "overall_min": 29056.890292903263, + "mean": 50115.8603858536, + "median": 50099.54349684457, + "overall_max": 72255.15946248993, + "unit": "docs/s", + "mean_rsd": 59.426059705973664 + }, +``` diff --git a/_benchmark/reference/commands/redline-test.md b/_benchmark/reference/commands/redline-test.md new file mode 100644 index 00000000000..2ab228b4651 --- /dev/null +++ b/_benchmark/reference/commands/redline-test.md @@ -0,0 +1,84 @@ +--- +layout: default +title: redline-test +nav_order: 85 +parent: Command reference +grand_parent: OpenSearch Benchmark Reference +--- + +# Redline testing + +The `--redline-test` command enables OpenSearch Benchmark to automatically determine the maximum request throughput your OpenSearch cluster can handle under increasing load. It dynamically adjusts the number of active clients based on real-time cluster performance, helping with capacity planning and identifying performance regressions. + +When the `--redline-test` flag is used, OpenSearch Benchmark performs the following steps: + +1. **Client initialization**: OpenSearch Benchmark initializes a large number of clients (default: 1,000). You can override this with `--redline-test=`. +2. **Feedback mechanism**: OpenSearch Benchmark ramps up the number of active clients. A FeedbackActor monitors real-time request failures and adjusts the client count accordingly. +3. **Shared state coordination**: OpenSearch Benchmark uses Python's multiprocessing library to manage shared dictionaries and queues for inter-process communication: + - **Workers** create and share client state maps with the WorkerCoordinatorActor. + - The **WorkerCoordinatorActor** aggregates client state and forwards it to the FeedbackActor. + - The **FeedbackActor** increases the number of clients until it detects request errors, then pauses clients, waits 30 seconds, and resumes testing. + +The following images provides a visual overview of the redline testing architecture. + +Redline Overview + + +## Usage + +To perform a redline test, use the `execute-test` command with the `--redline-test` flag and a timed test procedure. + +This test procedure defines a timed workload using the keyword-terms operation. It runs in two phases: + +- **Warmup phase**: The test begins with a warmup period (`warmup-time-period`) to stabilize performance metrics before measurement begins. This helps avoid skewing results with cold-start effects. +- **Measurement phase**: During the `time-period`, OpenSearch Benchmark sends requests at a `target-throughput` (requests per second) using a specified number of clients. The redline test logic will scale the number of active clients from this baseline to determine the cluster's maximum sustainable load. 
+ +The following example timed test procedure is used as input to a redline test, which then dynamically adjusts the client load to find the maximum request throughput your cluster can handle without errors: + +```json +{ + "name": "timed-mode-test-procedure", + "schedule": [ + { + "operation": "keyword-terms", + "warmup-time-period": {% raw %}{{ warmup_time | default(300) | tojson }}{% endraw %}, + "time-period": {% raw %}{{ time_period | default(900) | tojson }}{% endraw %}, + "target-throughput": {% raw %}{{ target_throughput | default(20) | tojson }}{% endraw %}, + "clients": {% raw %}{{ search_clients | default(20) }}{% endraw %} + } + ] +} +``` +{% include copy.html %} + +Run the following command to start a redline test using a timed test procedure against your OpenSearch cluster: + +```bash +opensearch-benchmark execute-test \ + --pipeline=benchmark-only \ + --target-hosts= \ + --workload= \ + --test-procedure=timed-mode-test-procedure \ + --redline-test +``` +{% include copy.html %} + +## Results + +During a redline test, OpenSearch Benchmark provides detailed logs with scaling decisions and request failures during the test. At the end of a redline test, OpenSearch Benchmark logs the maximum number of clients that your cluster supported without request errors. + +The following example log output indicates that the redline test detected a `15%` error rate for the keyword-terms operation and determined that the cluster's maximum stable client load before errors occurred was `410`: + +``` +[WARNING] Error rate is 15.0 for operation 'keyword-terms'. Please check the logs. +Redline test finished. Maximum stable client number reached: 410 +``` + +## Configuration tips and test behavior + +Use the following options and behaviors to better understand and customize redline test execution: + +- `--redline-scale-step`: Specifies the number of clients to unpause in each scaling iteration. +- `--redline-scaledown-percentage`: Specifies the percentage of clients to pause when an error occurs. +- `--redline-post-scaledown-sleep`: Specifies the number of seconds the feedback actor waits before initiating a scale-up after scaling down. +- `--redline-max-clients`: Specifies the maximum number of clients allowed during redline testing. If unset, OpenSearch Benchmark defaults to the number of clients defined in the test procedure. \ No newline at end of file diff --git a/_benchmark/reference/metrics/index.md b/_benchmark/reference/metrics/index.md index 63e5a799e88..614cc66dbe1 100644 --- a/_benchmark/reference/metrics/index.md +++ b/_benchmark/reference/metrics/index.md @@ -13,7 +13,7 @@ After a workload completes, OpenSearch Benchmark stores all metric records withi ## Storing metrics -You can specify whether metrics are stored in memory or in a metrics store while running the benchmark by setting the [`datastore.type`](https://opensearch.org/docs/latest/benchmark/configuring-benchmark/#results_publishing) parameter in your `benchmark.ini` file. +You can specify whether metrics are stored in memory or in a metrics store while running the benchmark by setting the [`datastore.type`]({{site.url}}{{site.baseurl}}/benchmark/configuring-benchmark/#results_publishing) parameter in your `benchmark.ini` file. 
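For reference, the following is a minimal sketch of what the `results_publishing` section of `benchmark.ini` might look like when metrics are sent to an external OpenSearch metrics store. The host and credential values are placeholders, and the exact set of supported keys is described in the configuration reference linked previously:

```ini
[results_publishing]
# Placeholder values; replace with the details of your metrics store
datastore.type = opensearch
datastore.host = metrics-store.example.com
datastore.port = 443
datastore.secure = true
datastore.user = metrics_user
datastore.password = metrics_password
```
{% include copy.html %}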
### In memory diff --git a/_benchmark/tutorials/index.md b/_benchmark/tutorials/index.md deleted file mode 100644 index 3e53db2eaea..00000000000 --- a/_benchmark/tutorials/index.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -layout: default -title: Tutorials -nav_order: 10 -has_children: true ---- - -# Tutorial - -This section of the OpenSearch Benchmark documentation provides a set of tutorials for those who want to learn more advanced OpenSearch Benchmark concepts. \ No newline at end of file diff --git a/_benchmark/tutorials/sigv4.md b/_benchmark/tutorials/sigv4.md deleted file mode 100644 index f7ef38f948f..00000000000 --- a/_benchmark/tutorials/sigv4.md +++ /dev/null @@ -1,45 +0,0 @@ ---- -layout: default -title: AWS Signature Version 4 support -nav_order: 70 -parent: Tutorials ---- - -# Running OpenSearch Benchmark with AWS Signature Version 4 - -OpenSearch Benchmark supports AWS Signature Version 4 authentication. To run Benchmark with Signature Version 4, use the following steps: - -1. Set up an [IAM user or an IAM Role](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_create.html) and provide it access to the OpenSearch cluster using Signature Version 4 authentication. - -2. Set up the following environment variables for your IAM user: - - ```bash - export OSB_AWS_ACCESS_KEY_ID= - export OSB_AWS_SECRET_ACCESS_KEY= - export OSB_REGION= - export OSB_SERVICE=es - ``` - {% include copy.html %} - - If you want to set up an IAM role instead of an IAM user, use the following environment variables instead: - - ```bash - export OSB_AWS_ACCESS_KEY_ID= - export OSB_AWS_SECRET_ACCESS_KEY= - export OSB_AWS_SESSION_TOKEN= - export OSB_REGION= - export OSB_SERVICE=es - ``` - {% include copy.html %} - - If you're testing against Amazon OpenSearch Serverless, set `OSB_SERVICE` to `aoss`. - -3. Customize and run the following `execute-test` command with the ` --client-options=amazon_aws_log_in:environment` flag. This flag tells OpenSearch Benchmark the location of your exported credentials. - - ```bash - opensearch-benchmark execute-test \ - --target-hosts= \ - --pipeline=benchmark-only \ - --workload=geonames \ - --client-options=timeout:120,amazon_aws_log_in:environment \ - ``` diff --git a/_benchmark/user-guide/concepts.md b/_benchmark/user-guide/concepts.md index f000f6c28cb..f849a5f6158 100644 --- a/_benchmark/user-guide/concepts.md +++ b/_benchmark/user-guide/concepts.md @@ -3,13 +3,20 @@ layout: default title: Concepts nav_order: 3 parent: User guide +has_toc: false redirect_from: - /benchmark/user-guide/concepts/ --- # Concepts -Before using OpenSearch Benchmark, familiarize yourself with the following concepts. +Before you start using OpenSearch Benchmark, it's helpful to understand the following concepts, in order to effectively design, run, and analyze your benchmarks to evaluate OpenSearch performance under different scenarios. + +## Benchmark architecture + +The following diagram illustrates how OpenSearch Benchmark operates when running against a local host. + +![Benchmark workflow]({{site.url}}{{site.baseurl}}/images/benchmark/osb-workflow.jpg). 
## Core concepts and definitions diff --git a/_benchmark/user-guide/index.md b/_benchmark/user-guide/index.md index 31c0d2c36e8..a8dd9480135 100644 --- a/_benchmark/user-guide/index.md +++ b/_benchmark/user-guide/index.md @@ -3,8 +3,31 @@ layout: default title: User guide nav_order: 5 has_children: true +has_toc: false +more_cards: + - heading: "Concepts" + description: "Learn core OpenSearch Benchmark concepts" + link: "/benchmark/user-guide/concepts/" + - heading: "Install and configure OpenSearch Benchmark" + description: "Install OpenSearch Benchmark and configure your experience" + link: "/benchmark/user-guide/install-and-configure/index/" + - heading: "Understanding workloads" + description: "Dive deep into each workload component and choose a workload" + link: "/benchmark/user-guide/understanding-workloads/index/" + - heading: "Run and customize your workload" + description: "Run and customize your OpenSearch workload to get the most accurate results" + link: "/benchmark/user-guide/working-with-workloads/index/" + - heading: "Understanding your results" + description: "Analyze and store your benchmark results" + link: "/benchmark/user-guide/understanding-results/index/" + - heading: "Optimizing benchmarks" + description: "Optimize your benchmark experience through randomization and best practices" + link: "/benchmark/user-guide/optimizing-benchmarks/index/" --- # OpenSearch Benchmark User Guide The OpenSearch Benchmark User Guide includes core [concepts]({{site.url}}{{site.baseurl}}/benchmark/user-guide/concepts/), [installation]({{site.url}}{{site.baseurl}}/benchmark/installing-benchmark/) instructions, and [configuration options]({{site.url}}{{site.baseurl}}/benchmark/configuring-benchmark/) to help you get the most out of OpenSearch Benchmark. + + +{% include cards.html cards=page.more_cards %} \ No newline at end of file diff --git a/_benchmark/user-guide/install-and-configure/configuring-benchmark.md b/_benchmark/user-guide/install-and-configure/configuring-benchmark.md index 59ac13a83c6..9d24d6808d0 100644 --- a/_benchmark/user-guide/install-and-configure/configuring-benchmark.md +++ b/_benchmark/user-guide/install-and-configure/configuring-benchmark.md @@ -7,6 +7,7 @@ parent: Install and configure redirect_from: - /benchmark/configuring-benchmark/ - /benchmark/user-guide/configuring-benchmark/ + - /benchmark/tutorials/sigv4/ --- # Configuring OpenSearch Benchmark @@ -159,6 +160,51 @@ This section defines how OpenSearch versions are distributed. | :---- | :---- | :---- | | `release.cache` | Boolean | Determines whether newly released OpenSearch versions should be cached locally. | +## Running OpenSearch Benchmark with AWS Signature Version 4 + +OpenSearch Benchmark supports AWS Signature Version 4 authentication. To run OpenSearch Benchmark with AWS Signature Version 4, you need to set up an [AWS Identity and Access Management (IAM) user or role](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_create.html) and provide it access to the OpenSearch cluster using AWS Signature Version 4 authentication. + +Whether to use an IAM role or user depends on your test cluster's access management requirements. For more information about whether to use an IAM role or user, see [When to create an IAM user (instead of a role)](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles.html#id_which-to-choose). + +Use the following steps to set up AWS Signature Version 4: + +1. Create an IAM role or user in the AWS Management Console. + +2. Set up your environment variables. 
If you're testing using Amazon OpenSearch Serverless, set `OSB_SERVICE` to `aoss`. + + - For an IAM user, configure the following environment variables: + + ```bash + export OSB_AWS_ACCESS_KEY_ID= + export OSB_AWS_SECRET_ACCESS_KEY= + export OSB_REGION= + export OSB_SERVICE=es + ``` + {% include copy.html %} + + - For an IAM role, configure the following environment variables: + + ```bash + export OSB_AWS_ACCESS_KEY_ID= + export OSB_AWS_SECRET_ACCESS_KEY= + export OSB_AWS_SESSION_TOKEN= + export OSB_REGION= + export OSB_SERVICE=es + ``` + {% include copy.html %} + + +3. Customize and run the following `execute-test` command with the `--client-options=amazon_aws_log_in:environment` flag. This flag provides the location of your exported credentials to OpenSearch Benchmark. + + ```bash + opensearch-benchmark execute-test \ + --target-hosts= \ + --pipeline=benchmark-only \ + --workload=geonames \ + --client-options=timeout:120,amazon_aws_log_in:environment \ + ``` + + ## Proxy configurations OpenSearch automatically downloads all the necessary proxy data for you, including: diff --git a/_benchmark/user-guide/optimizing-benchmarks/expand-data-corpus.md b/_benchmark/user-guide/optimizing-benchmarks/expand-data-corpus.md new file mode 100644 index 00000000000..f304fbd13df --- /dev/null +++ b/_benchmark/user-guide/optimizing-benchmarks/expand-data-corpus.md @@ -0,0 +1,83 @@ +--- +layout: default +title: Expanding a workload's data corpus +nav_order: 20 +parent: Optimizing benchmarks +grand_parent: User guide +--- + +# Expanding a workload's data corpus + +This tutorial shows you how to use the [`expand-data-corpus.py`](https://github.com/opensearch-project/opensearch-benchmark/blob/main/scripts/expand-data-corpus.py) script to increase the size of the data corpus for an OpenSearch Benchmark workload. This can be helpful when running the `http_logs` workload against a large OpenSearch cluster. + +This script only works with the `http_logs` workload. +{: .warning} + +## Prerequisites + +To use this tutorial, make sure you fulfill the following prerequisites: + +1. You have installed Python 3.x or later. +2. The `http_logs` workload data corpus is already stored on the load generation host running OpenSearch Benchmark. + +## Understanding the script + +The `expand-data-corpus.py` script is designed to generate a larger data corpus by duplicating and modifying existing documents from the `http_logs` workload corpus. It primarily adjusts the timestamp field while keeping other fields intact. It also generates an offset file, which enables OpenSearch Benchmark to start up faster. + +## Using `expand-data-corpus.py` + +To use `expand-data-corpus.py`, use the following syntax: + +```bash +./expand-data-corpus.py [options] +``` + +The script provides several customization options. The following are the most commonly used options: + +- `--corpus-size`: The desired corpus size in GB +- `--output-file-suffix`: The suffix for the output file name. + +## Example + +The following example script command generates a 100 GB corpus: + +```bash +./expand-data-corpus.py --corpus-size 100 --output-file-suffix 100gb +``` + +The script will start generating documents. For a 100 GB corpus, it can take up to 30 minutes to generate the full corpus. + +You can generate multiple corpora by running the script multiple times with different output suffixes. All corpora generated by the script are used by OpenSearch Benchmark sequentially during injection. 
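For example, the following commands, which use the same options described previously with illustrative sizes and suffixes, generate two additional corpora that OpenSearch Benchmark will use along with the first:

```bash
# Generate two additional corpora with distinct output file suffixes
./expand-data-corpus.py --corpus-size 50 --output-file-suffix 50gb-part1
./expand-data-corpus.py --corpus-size 50 --output-file-suffix 50gb-part2
```
{% include copy.html %}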
+ +## Verifying the documents + +After the script completes, check the following locations for new files: + +- In the OpenSearch Benchmark data directory for `http_logs`: + - `documents-100gb.json`: The generated corpus + - `documents-100gb.json.offset`: The associated offset file + +- In the `http_logs` workload directory: + - `gen-docs-100gb.json`: The metadata for the generated corpus + - `gen-idx-100gb.json`: The index specification for the generated corpus + +## Using the corpus in a test + +To use the newly generated corpus in an OpenSearch Benchmark test, use the following syntax: + +```bash +opensearch-benchmark execute-test --workload http_logs --workload-params=generated_corpus:t [other_options] +``` + +The `generated_corpus:t` parameter tells OpenSearch Benchmark to use the expanded corpus. Any additional workload parameters can be appended using commas in the `--workload-params` option. + +## Expert-level settings + +Use `--help` to see all of the script's supported options. Be cautious when using the following expert-level settings because they may affect the corpus structure: + +- `-f`: Specifies the input file to use as a base for generating new documents +- `-n`: Sets the number of documents to generate instead of the corpus size +- `-i`: Defines the interval between consecutive timestamps +- `-t`: Sets the starting timestamp for the generated documents +- `-b`: Defines the number of documents per batch when writing to the offset file + diff --git a/_benchmark/user-guide/optimizing-benchmarks/index.md b/_benchmark/user-guide/optimizing-benchmarks/index.md index 0ea6c1978e5..3f2d83f3c1b 100644 --- a/_benchmark/user-guide/optimizing-benchmarks/index.md +++ b/_benchmark/user-guide/optimizing-benchmarks/index.md @@ -4,8 +4,27 @@ title: Optimizing benchmarks nav_order: 25 parent: User guide has_children: true +has_toc: false +more_cards: + - heading: "Running distributed loads" + description: "Configure multiple load generator machines to run large-scale benchmarks" + link: "/benchmark/user-guide/optimizing-benchmarks/distributed-load/" + - heading: "Expanding a workload's data corpus" + description: "Increase dataset size and complexity for more realistic benchmark scenarios" + link: "/benchmark/user-guide/optimizing-benchmarks/expand-data-corpus/" + - heading: "Target throughput" + description: "Set specific throughput targets to match your production workload patterns" + link: "/benchmark/user-guide/optimizing-benchmarks/target-throughput/" + - heading: "Performance testing best practices" + description: "Follow these best practices to achieve more meaningful benchmark results" + link: "/benchmark/user-guide/optimizing-benchmarks/performance-testing-best-practices/" + - heading: "Randomizing queries" + description: "Generate varied search patterns to simulate real-world queries" + link: "/benchmark/user-guide/optimizing-benchmarks/randomizing-queries/" --- # Optimizing benchmarks -This section details different ways you can optimize the benchmark tools for your cluster. \ No newline at end of file +This section provides information about optimizing the benchmark tools for your cluster. 
+ +{% include cards.html cards=page.more_cards %} \ No newline at end of file diff --git a/_benchmark/user-guide/optimizing-benchmarks/performance-testing-best-practices.md b/_benchmark/user-guide/optimizing-benchmarks/performance-testing-best-practices.md new file mode 100644 index 00000000000..9b2dbd24dd2 --- /dev/null +++ b/_benchmark/user-guide/optimizing-benchmarks/performance-testing-best-practices.md @@ -0,0 +1,266 @@ +--- +layout: default +title: Performance testing best practices +nav_order: 160 +parent: Optimizing benchmarks +grand_parent: User guide +--- + +# Performance testing best practices + +When conducting performance testing using OpenSearch Benchmark, it's crucial to follow some key best practices to ensure accurate, reliable, and meaningful results. These practices help in creating realistic test scenarios, minimizing external factors that could skew results, and generating comparable and reproducible benchmarks. By adhering to these guidelines, you can gain valuable insights into your cluster's performance, including identifying bottlenecks and making informed decisions about cluster configuration and optimization. + +## Environment setup + +Performance testing requires careful attention to the testing environment. A properly configured environment is vital to obtaining reliable and reproducible results. + +When setting up your testing environment, it's essential to use hardware that closely matches your production environment. Using development or underpowered hardware will not provide meaningful results that are translatable to production performance. Local machines often have limited hardware, and local development libraries can conflict with the workload's library, preventing the benchmark test from running effectively. + +For the best results, make sure that your load generation host or machine running OpenSearch Benchmark meets the following minimum hardware requirements: + +- CPU: 8+ cores +- RAM: 32+ GB +- Storage: Solid-state drive (SSD)/NVMe +- Network: 10 Gbps + + +We recommend provisioning a test cluster and configuring its settings to reflect what you are most likely to deploy in production. + + +## Test configuration + +Proper test configuration includes setting appropriate parameters for your test scenarios and ensuring that your cluster is configured optimally. + +### Basic setup + +The following example shows a basic benchmark configuration file. This configuration includes essential parameters such as warmup time, test duration, and the number of clients: + +```json + { + "name": "my-benchmark-test-procedure", + "description": "This test procedure runs a term query against a cluster. It includes a 300-second warm-up, followed by a 3600-second benchmark using 8 concurrent clients.", + "schedule": [ + { + "operation": "term", + "warmup-time-period": 300, + "time-period": 3600, + "clients": 8 + } + ] + } +``` +{% include copy.html %} + +### Index settings + +Your OpenSearch index settings should be optimized for your specific use case. Try to set the number of shards per index to match your production cluster. However, if you're a developer who wants to focus on a single shard's performance and limit the variables impacting performance, use a single primary shard, as shown in the following example `index_settings`: + +```json +{ + "index_settings": { + "number_of_shards": 1, + "number_of_replicas": 0, + "refresh_interval": "30s" + } +} +``` + +These settings configure a single primary shard with no replicas and a 30-second refresh interval, which reduces variability when you're measuring single-shard performance.
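If you want to confirm these settings outside of a benchmark run, you can apply them to a test index directly. The following sketch uses a placeholder index name and a local cluster endpoint:

```bash
# Create a test index with a single primary shard, no replicas, and a 30-second refresh interval
curl -X PUT "localhost:9200/benchmark-test-index" -H 'Content-Type: application/json' -d'
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0,
    "refresh_interval": "30s"
  }
}'
```
{% include copy.html %}

Many of the official workloads also let you override shard and replica counts through `--workload-params`; check the workload's README for the exact parameter names.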
+ + +## Running tests + +Running benchmark tests involves monitoring the system during the test and ensuring consistent conditions across test runs. + +While you can run a basic test, you can also customize your test run with additional [benchmark command options]({{site.url}}{{site.baseurl}}/benchmark/reference/commands/index/). The following example runs a `geonames` workload test that targets a specific host and outputs the test results as a `csv`, which can be used for further analysis of the benchmark's metrics: + +```bash +opensearch-benchmark run \ + --workload=geonames \ + --target-hosts=localhost:9200 \ + --pipeline=benchmark-only \ + --test-procedure=default \ + --report-format=csv \ + --report-file=benchmark-results.csv +``` +{% include copy.html %} + +### Monitoring during tests + +During test execution, it's essential to monitor various system metrics to ensure that the test is running correctly and to identify any potential bottlenecks. The following commands help you monitor different aspects of system performance: + +```bash +# Monitor system resources +vmstat 1 + +# Monitor OpenSearch metrics +curl localhost:9200/_cat/nodes?v +curl localhost:9200/_cat/indices?v + +# Monitor cluster health +curl localhost:9200/_cluster/health?pretty +``` +{% include copy.html %} + +## Collecting metrics + +Collecting and storing appropriate metrics is important for analyzing test results and making informed decisions about performance optimizations. + +### Essential metrics + +Configure your benchmark to collect comprehensive metrics. The following example configuration shows you how to set up metric collection with file storage: + +```json +{ + "metrics": { + "store_metrics": true, + "detailed": true, + "metrics_store": { + "type": "file", + "location": "/path/to/metrics" + } + } +} +``` +{% include copy.html %} + +### Sample metrics to track + +The following Python structure can be used as a template and includes a list of metrics that should be tracked during performance testing: + +```python +metrics_to_track = { + 'latency': { + 'mean': 'ms', + 'median': 'ms', + 'p95': 'ms', + 'p99': 'ms' + }, + 'throughput': { + 'ops/sec': 'count', + 'mb/sec': 'bytes' + }, + 'system': { + 'cpu_usage': '%', + 'memory_used': 'bytes', + 'disk_io': 'iops' + } +} +``` +{% include copy.html %} + +### Calculating metrics + +OpenSearch Benchmark calculates metrics differently than traditional client-server systems. For detailed information about how metrics are calculated, see [Differences between OpenSearch Benchmark and a traditional client-server system]({{site.url}}{{site.baseurl}}/benchmark/user-guide/concepts/#differences-between-opensearch-benchmark-and-a-traditional-client-server-system). + +## Integration with OpenSearch Dashboards + +To integrate OpenSearch Benchmark results with OpenSearch Dashboards, use the following steps: + +1. [Configure OpenSearch Benchmark]({{site.url}}{{site.baseurl}}/benchmark/user-guide/install-and-configure/configuring-benchmark/) to store results in OpenSearch. +2. Create index patterns in OpenSearch Dashboards for the benchmark results. +3. Create visualizations and dashboards to analyze the benchmark data. + + +## Common pitfalls + +When conducting performance tests using OpenSearch Benchmark, it's important to be aware of some common pitfalls that can lead to inaccurate or misleading results. + +### Warmup intervals + +Proper warmup is critical to accurate performance testing. 
Without an adequate warmup period, your test results may be skewed by initial system instabilities or caching effects. + +Don't run tests without a warmup period. + +Instead, always include an adequate warmup period in your tests. This allows the system to reach a steady state before measurements begin. In the following example, a `geonames` run is given a warmup period of `300s`: + +```python +opensearch-benchmark execute-test --workload=geonames --workload-params="warmup_time_period:300" +``` + +The appropriate warmup period can vary depending on your specific workload and system configuration. Start with at least 5 minutes (300 seconds) and adjust as needed based on your observations. + +### Comparing results from different environments + +One of the most common mistakes in performance testing is comparing results from different environments. Results obtained from a laptop or development machine are not comparable to those from a production server due to differences in hardware, network conditions, and other environmental factors. + +Instead, ensure that all comparisons are made using the same or identical environments. If you need to compare different configurations, make sure to change only one variable at a time while keeping the environment consistent. + +### Documenting your test environment + +Proper documentation of your test environment is crucial for reproducibility and accurate analysis. Without detailed environment information, it becomes difficult to interpret results or reproduce tests in the future. + +Don't omit environment details from your test reports. + +Instead, always comprehensively document the details of your test environment. This should include hardware specifications, software versions, and any relevant configuration settings. The following example shows you how to add environment details when running OpenSearch Benchmark with a Python script: + +```python +# DO: Document environment details +def run_benchmark(): + environment = { + 'hardware': 'AWS m5.2xlarge', + 'os': 'Ubuntu 20.04', + 'kernel': '5.4.0-1018-aws', + 'opensearch': '2.0.0', + 'java': 'OpenJDK 11.0.11', + 'benchmark_version': '1.0.0' + } + results = opensearch_benchmark.run() + return {'environment': environment, 'results': results} +``` +{% include copy.html %} + +By documenting these details, you ensure that your test results can be properly interpreted and that the tests can be reproduced if necessary. + +### Troubleshooting with logs + +When encountering issues or unexpected results, OpenSearch Benchmark logs can provide valuable insights. Here's how to effectively use logs for troubleshooting: + +1. Navigate to the log file. The main log file is typically located at `~/.osb/logs/benchmark.log`. + +2. Look for error messages. Search for lines containing "ERROR" or "WARNING" to identify potential issues. + +3. Check for performance bottlenecks. Look for entries that indicate slow operations or resource constraints. + +4. Review configuration details, such as logs. Logs often include information about the test configuration, which can help verify that your intended settings were applied correctly. + +5. Pay attention to the duration of different phases of the benchmark, including warmup and measurement periods. + +By carefully reviewing these logs, you can often identify the root cause of performance issues or unexpected benchmark results. 
If you encounter a log error that you do not recognize, submit an issue to the [OpenSearch Benchmark repository](https://github.com/opensearch-project/opensearch-benchmark). + +## Security considerations + +In most cases, a basic authentication protocol should be sufficient for testing. However, you can use SSL for secure communication during benchmark testing, as shown in the following example `opensearch.yml` configuration: + +```yaml +security: + ssl: true + verification_mode: full + certificate_authorities: + - /path/to/ca.crt + client_certificate: /path/to/client.crt + client_key: /path/to/client.key +``` +{% include copy.html %} + +## Maintenance + +Regular maintenance of your benchmark environment and tools is essential for consistent and reliable testing over time. + +Keep your benchmark tools and workloads up to date with the following commands: + +```bash +# Update OpenSearch Benchmark +pip install --upgrade opensearch-benchmark + +# Update workloads +opensearch-benchmark update-workload geonames + +# Clean old data +opensearch-benchmark clean +``` +{% include copy.html %} + +## Amazon OpenSearch Serverless considerations + +When testing using Amazon OpenSearch Serverless, be aware that not all test procedures may be supported. Always check the `README.md` file of the [workload](https://github.com/opensearch-project/opensearch-benchmark-workloads) you're using to confirm whether it's compatible with OpenSearch Serverless. If compatibility information is not provided, you may need to test the procedures individually to determine which ones are supported. diff --git a/_benchmark/user-guide/optimizing-benchmarks/randomizing-queries.md b/_benchmark/user-guide/optimizing-benchmarks/randomizing-queries.md new file mode 100644 index 00000000000..2242e7e9727 --- /dev/null +++ b/_benchmark/user-guide/optimizing-benchmarks/randomizing-queries.md @@ -0,0 +1,143 @@ +--- +layout: default +title: Randomizing queries +nav_order: 160 +parent: Optimizing benchmarks +grand_parent: User guide +has_math: true +--- + +# Randomizing queries + +By default, OpenSearch Benchmark runs identical queries for multiple benchmark iterations. However, running the same queries repeatedly isn't ideal in every test scenario. For example, simulating real-world caching with many iterations of the same query results in one cache miss followed by many hits. OpenSearch Benchmark lets you randomize queries in a configurable way. + +For example, changing `"gte"` and `"lt"` in the following `nyc_taxis` operation creates distinct queries, resulting in unique cache entries: + +```json +{ + "name": "range", + "operation-type": "search", + "body": { + "query": { + "range": { + "total_amount": { + "gte": 5, + "lte": 15 + } + } + } + } +} +``` + +You can't completely randomize the values because the cache would not get any hits. To get cache hits, the cache must sometimes encounter the same values. To account for the same values while randomizing, OpenSearch Benchmark generates a number $$N$$ of value pairs for each randomized operation at the beginning of the benchmark. OpenSearch Benchmark stores these values in a saved list where each pair is assigned an index from $$1$$ to $$N$$. + +Every time OpenSearch sends a query, OpenSearch Benchmark decides whether to use a pair of values from this saved list in the query. It does this a configurable fraction of the time, called _repeat frequency_ (`rf`). If OpenSearch has encountered the value pair before, this might cause a cache hit. 
For example, if `rf` = 0.7, the cache hit ratio could be up to 70%. This ratio could cause a hit, depending on the benchmark's duration and cache size. + +OpenSearch Benchmark selects saved value pairs using the Zipf probability distribution, where the probability of selecting pair $$i$$ is proportional to $$1 \over i^\alpha$$. In this formula, $$i$$ represents the index of the saved value pair, and $$\alpha$$ controls how concentrated the distribution is. This distribution reflects usage patterns observed in real caches. Pairs with lower $$i$$ values (closer to $$1$$) are selected more frequently, while pairs with higher $$i$$ values (closer to $$N$$) are selected less often. + +The other $$1 -$$ `rf` fraction of the time, a new random pair of values is generated. Because OpenSearch Benchmark has not encountered these value pairs before, the pairs should miss the cache. + +## Usage + +To use this feature in a workload, you must make some changes to `workload.py` and supply some CLI flags when running OpenSearch Benchmark. + +### Modifying `workload.py` + +Specify how to generate the saved value pairs for each operation by registering a "standard value source" for that operation. This Python function accepts no arguments and returns a dictionary. The keys mirror those in the input query but are randomized. Finally, change the `register()` method so that it registers this function with the operation name and field name, which are randomized. + +For example, a standard value source used to randomize the `"total_amount"` field in the preceding `"range"` operation might appear similar to the following function: + +```py +def random_money_values(max_value): + gte_cents = random.randrange(0, max_value*100) + lte_cents = random.randrange(gte_cents, max_value*100) + return { + "gte":gte_cents/100, + "lte":lte_cents/100 + } + +def range_query_standard_value_source(): + return random_money_values(120.00) +``` + +Similarly, you can randomize the registration behavior using the following function: + +```py +def register(registry): + registry.register_standard_value_source("range", "total_amount", range_query_standard_value_source) +``` + +This function may already contain code. Retain it if so. If `workload.py` does not exist or lacks a `register(registry)` function, you can create them. + +#### Randomizing non-range queries + +By default, OpenSearch Benchmark assumes that the query to be randomized is a `"range"` query with values `"gte"`/`"gt"`, `"lte"`/`"lt"`, and, optionally, `"format"`. If this isn't the case, you can configure it to use a different query type name and different values. + +For example, to randomize the following workload operation: + +```json +{ + "name": "bbox", + "operation-type": "search", + "index": "nyc_taxis", + "body": { + "size": 0, + "query": { + "geo_bounding_box": { + "pickup_location": { + "top_left": [-74.27, 40.92], + "bottom_right": [-73.68, 40.49] + } + } + } + } +} +``` + +You would register the following function in `workload.py`: + +```py +registry.register_query_randomization_info("bbox", "geo_bounding_box", [["top_left"], ["bottom_right"]], []) +``` + +The first argument, `"bbox"`, is the operation's name. + +The second argument, `"geo_bounding_box"`, is the query type name. + +The third argument is a list of lists: `[[“top_left”], [“bottom_right”]]`. The outer list's entries specify parameters for randomization because there might be different versions of the same name that represent roughly the same parameters, for example, `"gte"` or `"gt"`. 
Here, there's only one option for each parameter name. At least one version of each parameter's name must be present in the original query in order for it to be randomized. + +The last argument is a list of optional parameters. If an optional parameter is present in the random standard value source, OpenSearch Benchmark inserts the parameter into the randomized version of the query. If it's not in the source, it's ignored. There are no optional parameters in the following example, but the typical use case would be `"format"` in a range query. + +If there is no registration, the default registration is used: `registry.register_query_randomization_info(, “range”, [[“gte”, “gt”], [“lte”, “lt”]], [“format”])`. + + +The `dict` returned by the standard value source should match the parameter names you are randomizing. For example, the following is the standard value source for the preceding example: + +```py +def bounding_box_source(): + top_longitude = random.uniform(-74.27, -73.68) + top_latitude = random.uniform(40.49, 40.92) + + bottom_longitude = random.uniform(top_longitude, -73.68) + bottom_latitude = random.uniform(40.49, top_latitude) + + return { + "top_left":[top_longitude, top_latitude], + "bottom_right":[bottom_longitude, bottom_latitude] + } +``` + + + +### CLI flags + +Use the following CLI flags to customize randomization: + +- `--randomization-enabled` turns randomization on and off. If randomization is not enabled, none of the randomization flags will be applied. + +- `--randomization-repeat-frequency` or `-rf` sets the fraction of pairs drawn from the saved value pairs generated at the start of the benchmark. The value should be between `0.0` and `1.0`. Default is `0.3`. + +- `--randomization-n` sets the number `N` of value pairs generated for each operation. Default is `5000`. + +- `--randomization-alpha` sets the `alpha` parameter, which controls the spread of the `Zipf` distribution. The value should be `>=0`. Lower values increase the spread of the distribution. Default is `1.0`. diff --git a/_benchmark/user-guide/understanding-results/summary-reports.md b/_benchmark/user-guide/understanding-results/summary-reports.md index 28578c8c893..eed6d82e1da 100644 --- a/_benchmark/user-guide/understanding-results/summary-reports.md +++ b/_benchmark/user-guide/understanding-results/summary-reports.md @@ -120,7 +120,7 @@ OpenSearch Benchmark results are stored in-memory or in external storage. When stored in-memory, results can be found in the `/.benchmark/benchmarks/test_executions/` directory. Results are named in accordance with the `test_execution_id` of the most recent workload test. -While [running a test](https://opensearch.org/docs/latest/benchmark/reference/commands/execute-test/#general-settings), you can customize where the results are stored using any combination of the following command flags: +While [running a test]({{site.url}}{{site.baseurl}}/benchmark/reference/commands/execute-test/#general-settings), you can customize where the results are stored using any combination of the following command flags: * `--results-file`: When provided a file path, writes the summary report to the file indicated in the path. * `--results-format`: Defines the output format for the summary report results, either `markdown` or `csv`. Default is `markdown`. 
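For example, the following command sketch, in which the workload and target host are placeholders, writes the summary report as CSV to a custom location:

```bash
opensearch-benchmark execute-test \
  --workload=geonames \
  --target-hosts=localhost:9200 \
  --pipeline=benchmark-only \
  --results-format=csv \
  --results-file=/tmp/benchmark-summary.csv
```
{% include copy.html %}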
diff --git a/_benchmark/user-guide/understanding-workloads/choosing-a-workload.md b/_benchmark/user-guide/understanding-workloads/choosing-a-workload.md index ae973a7c62f..80e94b33630 100644 --- a/_benchmark/user-guide/understanding-workloads/choosing-a-workload.md +++ b/_benchmark/user-guide/understanding-workloads/choosing-a-workload.md @@ -16,14 +16,79 @@ For example, say you're a system architect at a rideshare company. As a rideshar Consider the following criteria when deciding which workload would work best for benchmarking your cluster: -- The cluster's use case. +- The cluster's use case and the size of the cluster. Small clusters usually contain 1--10 nodes and are suitable for development environments. Medium clusters usually contain 11--50 nodes and are used for testing environments that more closely resemble a production cluster. - The data types that your cluster uses compared to the data structure of the documents contained in the workload. Each workload contains an example document so that you can compare data types, or you can view the index mappings and data types in the `index.json` file. - The query types most commonly used inside your cluster. The `operations/default.json` file contains information about the query types and workload operations. For a list of common operations, see [Common operations]({{site.url}}{{site.baseurl}}/benchmark/user-guide/understanding-workloads/common-operations/). -## General search clusters +## General search use cases: `nyc_taxis` -For benchmarking clusters built for general search use cases, start with the [nyc_taxis](https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/nyc_taxis) workload. This workload contains data about the rides taken in yellow taxis in New York City in 2015. +For benchmarking clusters built for general search use cases, start with the [nyc_taxis](https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/nyc_taxis) workload. It contains the following: -## Log data +- **Data type**: Ride data from yellow taxis in New York City in 2015. +- **Cluster requirements**: Suitable for small- to medium-sized clusters. -For benchmarking clusters built for indexing and search with log data, use the [http_logs](https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/http_logs) workload. This workload contains data about the 1998 World Cup. +This workload tests the following queries and search functions: + +- Range queries +- Term queries on various fields +- Geodistance queries +- Aggregations + +## Vector data: `vectorsearch` + +The [`vectorsearch`](https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/vectorsearch) workload is designed to benchmark vector search capabilities, including performance and accuracy. It contains the following: + +- **Data type**: High-dimensional vector data, often representing embeddings of text or images. +- **Cluster requirements**: Requires a cluster with [vector search capabilities]({{site.url}}{{site.baseurl}}/vector-search/) enabled. 
+ +This workload tests the following queries and search functions: + +- k-NN vector searches +- Hybrid searches combining vector similarity with metadata filtering +- Indexing performance for high-dimensional vector data + +## Comprehensive search solutions: `big5` + +The [big5](https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/big5) workload is a comprehensive benchmark suite for testing various aspects of search engine performance, including overall search engine performance across multiple use cases. It contains the following: + +- **Data type**: A mix of different data types, including text, numeric, and structured data. +- **Cluster requirements**: Suitable for medium to large clusters because it's designed to stress test various components. + +This workload tests the following queries and search functions: + +- Full-text search performance +- Aggregation performance +- Complex Boolean queries +- Sorting and pagination +- Indexing performance for various data types + +## Percolator queries: `percolator` + +The [percolator](https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/percolator) workload is designed to test the performance of the `percolator` query type. It contains the following: + +- **Data type**: A set of stored queries and documents to be matched against those queries. +- **Cluster requirements**: Suitable for clusters that make heavy use of the [percolator]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/percolator/) feature. + +This workload tests the following queries and search functions: + +- Indexing performance for storing queries +- Matching performance for percolator queries +- Scalability with increasing numbers of stored queries + +## Log data: `http_logs` + +For benchmarking clusters built for indexing and search using log data, use the [http_logs](https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/http_logs) workload. It contains the following: + +- **Data type**: HTTP access logs from the 1998 World Cup website. +- **Cluster requirements**: Suitable for clusters optimized for time-series data and log analytics. + +This workload tests the following queries and search functions: + +- Time range queries +- Term queries on fields like `status-code` or `user-agent` +- Aggregations for metrics like request count and average response size +- Cardinality aggregations on fields like `ip-address`. + +## Creating a custom workload + +If you can't find an official workload that suits your needs, you can create a custom workload. For more information, see [Creating custom workloads]({{site.url}}{{site.baseurl}}/benchmark/user-guide/working-with-workloads/creating-custom-workloads/). 
diff --git a/_benchmark/user-guide/understanding-workloads/index.md b/_benchmark/user-guide/understanding-workloads/index.md index 6e6d2aa9c16..9401a2cf038 100644 --- a/_benchmark/user-guide/understanding-workloads/index.md +++ b/_benchmark/user-guide/understanding-workloads/index.md @@ -3,12 +3,23 @@ layout: default title: Understanding workloads nav_order: 10 parent: User guide +has_toc: false has_children: true +items: + - heading: "Anatomy of a workload" + description: "Understand each component of a workload" + link: "/benchmark/user-guide/understanding-workloads/anatomy-of-a-workload/" + - heading: "Choosing a workload" + description: "Determine which workload best matches your dataset" + link: "/benchmark/user-guide/understanding-workloads/choosing-a-workload/" + - heading: "Common operations" + description: "Familiarize yourself with common operations" + link: "/benchmark/user-guide/understanding-workloads/common-operations/" --- # Understanding workloads OpenSearch Benchmark includes a set of [workloads](https://github.com/opensearch-project/opensearch-benchmark-workloads) that you can use to benchmark data from your cluster. Workloads contain descriptions of one or more benchmarking scenarios that use a specific document corpus to perform a benchmark against your cluster. The document corpus contains any indexes, data files, and operations invoked when the workflow runs. - +{% include list.html list_items=page.items%} diff --git a/_benchmark/user-guide/working-with-workloads/creating-custom-workloads.md b/_benchmark/user-guide/working-with-workloads/creating-custom-workloads.md index a239c94249f..e5cf7106b79 100644 --- a/_benchmark/user-guide/working-with-workloads/creating-custom-workloads.md +++ b/_benchmark/user-guide/working-with-workloads/creating-custom-workloads.md @@ -12,7 +12,7 @@ redirect_from: # Creating custom workloads -OpenSearch Benchmark (OSB) includes a set of [workloads](https://github.com/opensearch-project/opensearch-benchmark-workloads) that you can use to benchmark data from your cluster. Additionally, if you want to create a workload that is tailored to your own data, you can create a custom workload using one of the following options: +OpenSearch Benchmark includes a set of [workloads](https://github.com/opensearch-project/opensearch-benchmark-workloads) that you can use to benchmark data from your cluster. Additionally, if you want to create a workload that is tailored to your own data, you can create a custom workload using one of the following options: - [Creating custom workloads](#creating-custom-workloads) - [Creating a workload from an existing cluster](#creating-a-workload-from-an-existing-cluster) @@ -32,14 +32,14 @@ If you already have an OpenSearch cluster with indexed data, use the following s ### Prerequisites -Before creating a custom OSB workload, make sure you have the following prerequisites in place: +Before creating a custom OpenSearch Benchmark workload, make sure you have the following prerequisites in place: - An OpenSearch cluster with an index that contains 1000 or more documents. If your cluster's index does not contain at least 1000 documents, the workload can still run tests, however, you cannot run workloads using `--test-mode`. - You must have the correct permissions to access your OpenSearch cluster. For more information about cluster permissions, see [Permissions]({{site.url}}{{site.baseurl}}/security/access-control/permissions/). 
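As a quick check of the document count prerequisite listed above, you can call the count API against the index you plan to extract. This is a sketch only; the `movies` index name is an example:

```json
GET movies/_count
```
{% include copy-curl.html %}

If the returned `count` is below 1000, you can still create and run the workload, but not with `--test-mode`.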
### Customizing the workload -To begin creating a custom OSB workload, use the `opensearch-benchmark create-workload` command. +To begin creating a custom OpenSearch Benchmark workload, use the `opensearch-benchmark create-workload` command. ``` opensearch-benchmark create-workload \ @@ -81,7 +81,7 @@ Extracting documents for index [movies]... 2000/2000 docs [10 ------------------------------- ``` -As part of workload creation, OSB generates the following files. You can access them in the directory specified by the `--output-path` option. +As part of workload creation, OpenSearch Benchmark generates the following files. You can access them in the directory specified by the `--output-path` option. - `workload.json`: Contains general workload specifications. - `.json`: Contains mappings and settings for the extracted indexes. diff --git a/_benchmark/user-guide/working-with-workloads/index.md b/_benchmark/user-guide/working-with-workloads/index.md index a6acb86b4bb..fdd4ecd91ec 100644 --- a/_benchmark/user-guide/working-with-workloads/index.md +++ b/_benchmark/user-guide/working-with-workloads/index.md @@ -3,6 +3,7 @@ layout: default title: Working with workloads nav_order: 15 parent: User guide +has_toc: false has_children: true --- diff --git a/_benchmark/workloads/vectorsearch.md b/_benchmark/workloads/vectorsearch.md new file mode 100644 index 00000000000..be337ac5fda --- /dev/null +++ b/_benchmark/workloads/vectorsearch.md @@ -0,0 +1,572 @@ +--- +layout: default +title: Vector search +nav_order: 35 +--- + +# Vector search + +The vector search workload benchmarks OpenSearch's vector engine capabilities for both indexing and search operations. It tests various vector search algorithms, quantization methods, and index configurations to measure performance metrics like throughput, latency, and recall accuracy. The workload supports different datasets and can evaluate both trained and untrained vector search methods. + +This workload currently supports datasets in either the `HDF5` or `BIG-ANN` formats. To download the datasets, use [this link](http://corpus-texmex.irisa.fr/). + +## Supported workload parameters + +The following workload parameters are supported by the vector search workload. + +| Name | Description | +| ----------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | +| `target_index_name` | The name of index to which to add vectors. | +| `target_field_name` | The name of the field to which to add vectors. Use "." to indicate a nested field. | +| `target_index_body` | The path to the target index definition. | +| `target_index_primary_shards` | The target index's primary shards. | +| `target_index_replica_shards` | The target index's replica shards. | +| `target_index_dimension` | The dimension of the target index. | +| `target_index_space_type` | The target index space type. | +| `target_index_bulk_size` | The target index bulk size. | +| `target_index_bulk_index_data_set_format` | The format of the vector dataset. | +| `target_index_bulk_index_data_set_path` | The path to the vector dataset in the index. | +| `target_index_bulk_index_data_set_corpus` | The corpus name of the vector dataset. | +| `target_index_bulk_index_clients` | The clients to be used for bulk ingestion. Must be a divisor of dataset size. | +| `target_index_max_num_segments` | The number of segments to merge into the target index before beginning to search. 
|
+| `target_index_force_merge_timeout` | The amount of time (in seconds) to wait for force merge requests to complete. |
+| `hnsw_ef_search` | The `HNSW ef` search parameter. |
+| `hnsw_ef_construction` | The `HNSW ef` construction parameter. |
+| `id_field_name` | The name of the field that will be used to identify documents in an index. |
+| `hnsw_m` | The `HNSW m` parameter. |
+| `query_k` | The number of neighbors to return for the search. Only one of `query_k`, `query_max_distance`, or `query_min_score` can be provided. |
+| `query_max_distance` | The maximum distance to be returned for the vector search. Only one of `query_k`, `query_max_distance`, or `query_min_score` can be provided. |
+| `query_min_score` | The minimum score to be returned for the vector search. Only one of `query_k`, `query_max_distance`, or `query_min_score` can be provided. |
+| `query_data_set_format` | The format of the vector dataset used for queries. |
+| `query_data_set_path` | The path to the vector dataset used for queries. |
+| `query_count` | The number of queries used for the search operation. |
+| `query_body` | The JSON properties that will be merged with the search body. |
+| `search_clients` | The number of clients used to run queries. |
+| `repetitions` | The number of repetitions completed until the dataset is exhausted. Default is `1`. |
+| `target_throughput` | The target throughput for each query operation, in requests per second. Default is `10`. |
+| `time_period` | The period of time dedicated to running the benchmark test, in seconds. Default is `900`. |
+
+## Test procedures
+
+The vector search workload supports the following test procedures.
+
+### No-train test procedure
+
+The no-train test procedure tests vector search indexes that require no training. You can define the underlying configuration of the vector search algorithm (such as a specific engine or space type) as method definitions.
+
+### No-train test (index only) procedure
+
+The no-train test (index only) procedure is used to index vector search indexes that require no training. This can be particularly useful when you want to benchmark only the indexing operation.
+
+### No-train test (Amazon OpenSearch Serverless)
+
+The no-train test procedure for Amazon OpenSearch Serverless is used specifically for OpenSearch Serverless vector search collections. This procedure doesn't include operations like **refresh** and **warmup** because they aren't supported by vector search collections.
+
+### Force merge index procedure
+
+The force merge index procedure optimizes vector search indexes by performing force merge operations up to a given maximum number of segments. For large datasets, force merging is a costly operation. Therefore, we recommend using a separate procedure to occasionally trigger force merge operations based on user requirements.
+
+### Train test procedure
+
+The train test procedure benchmarks approximate k-NN search algorithms that require a training step. For example, the Faiss Inverted File Index (IVF) technique requires a training step in order to compute cluster centroids. After this step is performed, the benchmark can search a smaller number of clusters instead of the entire dataset.
+
+### Search procedure
+
+The search procedure benchmarks previously indexed vector search indexes.
This can be useful when you want to benchmark large vector search indexes without reindexing each time because load time can be substantial for large datasets. This procedure includes warmup operations intended to avoid cold start problems during vector search. + +## Custom runners + +Only one custom runner, `warmup-knn-indices`, is supported by the vector search workload. This runner will warm up k-NN indexes and retry the warmup until it succeeds. + +## Running the workload + +To run the vector search workload, use the following command: + +```bash +export ENDPOINT= +export PARAMS_FILE= + +opensearch-benchmark execute-test \ + --target-hosts $ENDPOINT \ + --workload vectorsearch \ + --workload-params ${PARAMS_FILE} \ + --pipeline benchmark-only \ + --kill-running-processes +``` +{% include copy.html %} + +## Sample results + +When using the vector search workload, you can expect results similar to the following. + +### Train test procedure + +The following example provides results from the train test procedure: + + +``` +------------------------------------------------------ + _______ __ _____ + / ____(_)___ ____ _/ / / ___/_________ ________ + / /_ / / __ \/ __ `/ / \__ \/ ___/ __ \/ ___/ _ \ + / __/ / / / / / /_/ / / ___/ / /__/ /_/ / / / __/ +/_/ /_/_/ /_/\__,_/_/ /____/\___/\____/_/ \___/ +------------------------------------------------------ + +| Metric | Task | Value | Unit | +|---------------------------------------------------------------:|-------------------:|------------:|-------:| +| Cumulative indexing time of primary shards | | 0.00946667 | min | +| Min cumulative indexing time across primary shards | | 0 | min | +| Median cumulative indexing time across primary shards | | 0.00298333 | min | +| Max cumulative indexing time across primary shards | | 0.00336667 | min | +| Cumulative indexing throttle time of primary shards | | 0 | min | +| Min cumulative indexing throttle time across primary shards | | 0 | min | +| Median cumulative indexing throttle time across primary shards | | 0 | min | +| Max cumulative indexing throttle time across primary shards | | 0 | min | +| Cumulative merge time of primary shards | | 0 | min | +| Cumulative merge count of primary shards | | 0 | | +| Min cumulative merge time across primary shards | | 0 | min | +| Median cumulative merge time across primary shards | | 0 | min | +| Max cumulative merge time across primary shards | | 0 | min | +| Cumulative merge throttle time of primary shards | | 0 | min | +| Min cumulative merge throttle time across primary shards | | 0 | min | +| Median cumulative merge throttle time across primary shards | | 0 | min | +| Max cumulative merge throttle time across primary shards | | 0 | min | +| Cumulative refresh time of primary shards | | 0.00861667 | min | +| Cumulative refresh count of primary shards | | 33 | | +| Min cumulative refresh time across primary shards | | 0 | min | +| Median cumulative refresh time across primary shards | | 0.00268333 | min | +| Max cumulative refresh time across primary shards | | 0.00291667 | min | +| Cumulative flush time of primary shards | | 0.000183333 | min | +| Cumulative flush count of primary shards | | 2 | | +| Min cumulative flush time across primary shards | | 0 | min | +| Median cumulative flush time across primary shards | | 0 | min | +| Max cumulative flush time across primary shards | | 0.000183333 | min | +| Total Young Gen GC time | | 0.075 | s | +| Total Young Gen GC count | | 17 | | +| Total Old Gen GC time | | 0 | s | +| Total Old Gen GC count | | 0 | | +| 
Store size | | 0.00869293 | GB | +| Translog size | | 2.56114e-07 | GB | +| Heap used for segments | | 0 | MB | +| Heap used for doc values | | 0 | MB | +| Heap used for terms | | 0 | MB | +| Heap used for norms | | 0 | MB | +| Heap used for points | | 0 | MB | +| Heap used for stored fields | | 0 | MB | +| Segment count | | 9 | | +| Min Throughput | custom-vector-bulk | 25527 | docs/s | +| Mean Throughput | custom-vector-bulk | 25527 | docs/s | +| Median Throughput | custom-vector-bulk | 25527 | docs/s | +| Max Throughput | custom-vector-bulk | 25527 | docs/s | +| 50th percentile latency | custom-vector-bulk | 36.3095 | ms | +| 90th percentile latency | custom-vector-bulk | 52.2662 | ms | +| 100th percentile latency | custom-vector-bulk | 68.6513 | ms | +| 50th percentile service time | custom-vector-bulk | 36.3095 | ms | +| 90th percentile service time | custom-vector-bulk | 52.2662 | ms | +| 100th percentile service time | custom-vector-bulk | 68.6513 | ms | +| error rate | custom-vector-bulk | 0 | % | +| Min Throughput | prod-queries | 211.26 | ops/s | +| Mean Throughput | prod-queries | 213.85 | ops/s | +| Median Throughput | prod-queries | 213.48 | ops/s | +| Max Throughput | prod-queries | 216.49 | ops/s | +| 50th percentile latency | prod-queries | 3.43393 | ms | +| 90th percentile latency | prod-queries | 4.01881 | ms | +| 99th percentile latency | prod-queries | 5.56238 | ms | +| 99.9th percentile latency | prod-queries | 9.95666 | ms | +| 99.99th percentile latency | prod-queries | 39.7922 | ms | +| 100th percentile latency | prod-queries | 62.415 | ms | +| 50th percentile service time | prod-queries | 3.43405 | ms | +| 90th percentile service time | prod-queries | 4.0191 | ms | +| 99th percentile service time | prod-queries | 5.56316 | ms | +| 99.9th percentile service time | prod-queries | 9.95666 | ms | +| 99.99th percentile service time | prod-queries | 39.7922 | ms | +| 100th percentile service time | prod-queries | 62.415 | ms | +| error rate | prod-queries | 0 | % | + + +--------------------------------- +[INFO] SUCCESS (took 119 seconds) +--------------------------------- +``` + +### Faiss results + +The following sample outputs were generated using the Faiss IVF benchmarking procedure. For brevity, the test used 100 search queries instead of the 10,000 specified in the parameter files. All other parameters remain the same as those in the `params/train` folder. The first run demonstrates results without quantization, the second run demonstrates scalar quantization, and the third run demonstrates product quantization. Note that quantization may cause search recall to drop. 
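For context, the Faiss IVF configurations used in these runs are defined in the workload's parameter files (the `params/train` folder) rather than on the command line. The following is a minimal sketch of the kind of k-NN model training request such a configuration produces, assuming a hypothetical training index, a 128-dimension dataset, and illustrative `nlist`, `m`, and `code_size` values; the scalar quantization run would instead use the `sq` encoder:

```json
POST /_plugins/_knn/models/benchmark-ivf-pq-model/_train
{
  "training_index": "train-index",
  "training_field": "train-field",
  "dimension": 128,
  "description": "Faiss IVF model with product quantization",
  "method": {
    "name": "ivf",
    "engine": "faiss",
    "space_type": "l2",
    "parameters": {
      "nlist": 128,
      "encoder": {
        "name": "pq",
        "parameters": {
          "m": 8,
          "code_size": 8
        }
      }
    }
  }
}
```
{% include copy-curl.html %}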
+ +#### Faiss IVF with no quantization/flat encoding + + +``` +| Metric | Task | Value | Unit | +|---------------------------------------------------------------:|-------------------------:|------------:|-------:| +| Cumulative indexing time of primary shards | | 11.7662 | min | +| Min cumulative indexing time across primary shards | | 0.000266667 | min | +| Median cumulative indexing time across primary shards | | 0.1423 | min | +| Max cumulative indexing time across primary shards | | 11.6236 | min | +| Cumulative indexing throttle time of primary shards | | 0 | min | +| Min cumulative indexing throttle time across primary shards | | 0 | min | +| Median cumulative indexing throttle time across primary shards | | 0 | min | +| Max cumulative indexing throttle time across primary shards | | 0 | min | +| Cumulative merge time of primary shards | | 1.09872 | min | +| Cumulative merge count of primary shards | | 21 | | +| Min cumulative merge time across primary shards | | 0 | min | +| Median cumulative merge time across primary shards | | 0.00045 | min | +| Max cumulative merge time across primary shards | | 1.09827 | min | +| Cumulative merge throttle time of primary shards | | 0.872417 | min | +| Min cumulative merge throttle time across primary shards | | 0 | min | +| Median cumulative merge throttle time across primary shards | | 0 | min | +| Max cumulative merge throttle time across primary shards | | 0.872417 | min | +| Cumulative refresh time of primary shards | | 0.113733 | min | +| Cumulative refresh count of primary shards | | 59 | | +| Min cumulative refresh time across primary shards | | 0.00235 | min | +| Median cumulative refresh time across primary shards | | 0.00516667 | min | +| Max cumulative refresh time across primary shards | | 0.106217 | min | +| Cumulative flush time of primary shards | | 0.01685 | min | +| Cumulative flush count of primary shards | | 8 | | +| Min cumulative flush time across primary shards | | 0 | min | +| Median cumulative flush time across primary shards | | 0.00791667 | min | +| Max cumulative flush time across primary shards | | 0.00893333 | min | +| Total Young Gen GC time | | 5.442 | s | +| Total Young Gen GC count | | 3739 | | +| Total Old Gen GC time | | 0 | s | +| Total Old Gen GC count | | 0 | | +| Store size | | 1.3545 | GB | +| Translog size | | 0.0304573 | GB | +| Heap used for segments | | 0 | MB | +| Heap used for doc values | | 0 | MB | +| Heap used for terms | | 0 | MB | +| Heap used for norms | | 0 | MB | +| Heap used for points | | 0 | MB | +| Heap used for stored fields | | 0 | MB | +| Segment count | | 14 | | +| Min Throughput | custom-vector-bulk-train | 32222.6 | docs/s | +| Mean Throughput | custom-vector-bulk-train | 32222.6 | docs/s | +| Median Throughput | custom-vector-bulk-train | 32222.6 | docs/s | +| Max Throughput | custom-vector-bulk-train | 32222.6 | docs/s | +| 50th percentile latency | custom-vector-bulk-train | 26.5199 | ms | +| 90th percentile latency | custom-vector-bulk-train | 34.9823 | ms | +| 99th percentile latency | custom-vector-bulk-train | 196.712 | ms | +| 100th percentile latency | custom-vector-bulk-train | 230.342 | ms | +| 50th percentile service time | custom-vector-bulk-train | 26.5158 | ms | +| 90th percentile service time | custom-vector-bulk-train | 34.9823 | ms | +| 99th percentile service time | custom-vector-bulk-train | 196.712 | ms | +| 100th percentile service time | custom-vector-bulk-train | 230.342 | ms | +| error rate | custom-vector-bulk-train | 0 | % | +| Min Throughput | 
delete-model | 10.58 | ops/s | +| Mean Throughput | delete-model | 10.58 | ops/s | +| Median Throughput | delete-model | 10.58 | ops/s | +| Max Throughput | delete-model | 10.58 | ops/s | +| 100th percentile latency | delete-model | 93.6958 | ms | +| 100th percentile service time | delete-model | 93.6958 | ms | +| error rate | delete-model | 0 | % | +| Min Throughput | train-knn-model | 0.63 | ops/s | +| Mean Throughput | train-knn-model | 0.63 | ops/s | +| Median Throughput | train-knn-model | 0.63 | ops/s | +| Max Throughput | train-knn-model | 0.63 | ops/s | +| 100th percentile latency | train-knn-model | 1577.49 | ms | +| 100th percentile service time | train-knn-model | 1577.49 | ms | +| error rate | train-knn-model | 0 | % | +| Min Throughput | custom-vector-bulk | 11055 | docs/s | +| Mean Throughput | custom-vector-bulk | 14163.8 | docs/s | +| Median Throughput | custom-vector-bulk | 12878.9 | docs/s | +| Max Throughput | custom-vector-bulk | 33841.3 | docs/s | +| 50th percentile latency | custom-vector-bulk | 81.6677 | ms | +| 90th percentile latency | custom-vector-bulk | 117.848 | ms | +| 99th percentile latency | custom-vector-bulk | 202.484 | ms | +| 99.9th percentile latency | custom-vector-bulk | 406.209 | ms | +| 99.99th percentile latency | custom-vector-bulk | 458.823 | ms | +| 100th percentile latency | custom-vector-bulk | 459.417 | ms | +| 50th percentile service time | custom-vector-bulk | 81.6621 | ms | +| 90th percentile service time | custom-vector-bulk | 117.843 | ms | +| 99th percentile service time | custom-vector-bulk | 202.294 | ms | +| 99.9th percentile service time | custom-vector-bulk | 406.209 | ms | +| 99.99th percentile service time | custom-vector-bulk | 458.823 | ms | +| 100th percentile service time | custom-vector-bulk | 459.417 | ms | +| error rate | custom-vector-bulk | 0 | % | +| Min Throughput | force-merge-segments | 0.1 | ops/s | +| Mean Throughput | force-merge-segments | 0.1 | ops/s | +| Median Throughput | force-merge-segments | 0.1 | ops/s | +| Max Throughput | force-merge-segments | 0.1 | ops/s | +| 100th percentile latency | force-merge-segments | 10017.4 | ms | +| 100th percentile service time | force-merge-segments | 10017.4 | ms | +| error rate | force-merge-segments | 0 | % | +| Min Throughput | warmup-indices | 9.63 | ops/s | +| Mean Throughput | warmup-indices | 9.63 | ops/s | +| Median Throughput | warmup-indices | 9.63 | ops/s | +| Max Throughput | warmup-indices | 9.63 | ops/s | +| 100th percentile latency | warmup-indices | 103.228 | ms | +| 100th percentile service time | warmup-indices | 103.228 | ms | +| error rate | warmup-indices | 0 | % | +| Min Throughput | prod-queries | 120.06 | ops/s | +| Mean Throughput | prod-queries | 120.06 | ops/s | +| Median Throughput | prod-queries | 120.06 | ops/s | +| Max Throughput | prod-queries | 120.06 | ops/s | +| 50th percentile latency | prod-queries | 1.75219 | ms | +| 90th percentile latency | prod-queries | 2.29527 | ms | +| 99th percentile latency | prod-queries | 50.4419 | ms | +| 100th percentile latency | prod-queries | 97.9905 | ms | +| 50th percentile service time | prod-queries | 1.75219 | ms | +| 90th percentile service time | prod-queries | 2.29527 | ms | +| 99th percentile service time | prod-queries | 50.4419 | ms | +| 100th percentile service time | prod-queries | 97.9905 | ms | +| error rate | prod-queries | 0 | % | +| Mean recall@k | prod-queries | 0.96 | | +| Mean recall@1 | prod-queries | 0.99 | | + + +--------------------------------- +[INFO] SUCCESS (took 218 
seconds) +--------------------------------- +``` + +#### Faiss IVF with scalar quantization (100 search queries) + +``` +| Metric | Task | Value | Unit | +|---------------------------------------------------------------:|-------------------------:|------------:|-------:| +| Cumulative indexing time of primary shards | | 11.5 | min | +| Min cumulative indexing time across primary shards | | 0.000283333 | min | +| Median cumulative indexing time across primary shards | | 0.10915 | min | +| Max cumulative indexing time across primary shards | | 11.3905 | min | +| Cumulative indexing throttle time of primary shards | | 0 | min | +| Min cumulative indexing throttle time across primary shards | | 0 | min | +| Median cumulative indexing throttle time across primary shards | | 0 | min | +| Max cumulative indexing throttle time across primary shards | | 0 | min | +| Cumulative merge time of primary shards | | 1.03638 | min | +| Cumulative merge count of primary shards | | 22 | | +| Min cumulative merge time across primary shards | | 0 | min | +| Median cumulative merge time across primary shards | | 0.000266667 | min | +| Max cumulative merge time across primary shards | | 1.03612 | min | +| Cumulative merge throttle time of primary shards | | 0.798767 | min | +| Min cumulative merge throttle time across primary shards | | 0 | min | +| Median cumulative merge throttle time across primary shards | | 0 | min | +| Max cumulative merge throttle time across primary shards | | 0.798767 | min | +| Cumulative refresh time of primary shards | | 0.107117 | min | +| Cumulative refresh count of primary shards | | 61 | | +| Min cumulative refresh time across primary shards | | 0.00236667 | min | +| Median cumulative refresh time across primary shards | | 0.00543333 | min | +| Max cumulative refresh time across primary shards | | 0.0993167 | min | +| Cumulative flush time of primary shards | | 0.0193167 | min | +| Cumulative flush count of primary shards | | 9 | | +| Min cumulative flush time across primary shards | | 0 | min | +| Median cumulative flush time across primary shards | | 0.00871667 | min | +| Max cumulative flush time across primary shards | | 0.0106 | min | +| Total Young Gen GC time | | 5.267 | s | +| Total Young Gen GC count | | 3688 | | +| Total Old Gen GC time | | 0 | s | +| Total Old Gen GC count | | 0 | | +| Store size | | 1.11609 | GB | +| Translog size | | 0.0304573 | GB | +| Heap used for segments | | 0 | MB | +| Heap used for doc values | | 0 | MB | +| Heap used for terms | | 0 | MB | +| Heap used for norms | | 0 | MB | +| Heap used for points | | 0 | MB | +| Heap used for stored fields | | 0 | MB | +| Segment count | | 18 | | +| Min Throughput | custom-vector-bulk-train | 35950.5 | docs/s | +| Mean Throughput | custom-vector-bulk-train | 35950.5 | docs/s | +| Median Throughput | custom-vector-bulk-train | 35950.5 | docs/s | +| Max Throughput | custom-vector-bulk-train | 35950.5 | docs/s | +| 50th percentile latency | custom-vector-bulk-train | 22.8328 | ms | +| 90th percentile latency | custom-vector-bulk-train | 34.864 | ms | +| 99th percentile latency | custom-vector-bulk-train | 99.471 | ms | +| 100th percentile latency | custom-vector-bulk-train | 210.424 | ms | +| 50th percentile service time | custom-vector-bulk-train | 22.823 | ms | +| 90th percentile service time | custom-vector-bulk-train | 34.864 | ms | +| 99th percentile service time | custom-vector-bulk-train | 99.471 | ms | +| 100th percentile service time | custom-vector-bulk-train | 210.424 | ms | +| error rate | 
custom-vector-bulk-train | 0 | % | +| Min Throughput | delete-model | 8.39 | ops/s | +| Mean Throughput | delete-model | 8.39 | ops/s | +| Median Throughput | delete-model | 8.39 | ops/s | +| Max Throughput | delete-model | 8.39 | ops/s | +| 100th percentile latency | delete-model | 118.241 | ms | +| 100th percentile service time | delete-model | 118.241 | ms | +| error rate | delete-model | 0 | % | +| Min Throughput | train-knn-model | 0.64 | ops/s | +| Mean Throughput | train-knn-model | 0.64 | ops/s | +| Median Throughput | train-knn-model | 0.64 | ops/s | +| Max Throughput | train-knn-model | 0.64 | ops/s | +| 100th percentile latency | train-knn-model | 1564.44 | ms | +| 100th percentile service time | train-knn-model | 1564.44 | ms | +| error rate | train-knn-model | 0 | % | +| Min Throughput | custom-vector-bulk | 11313.1 | docs/s | +| Mean Throughput | custom-vector-bulk | 14065.7 | docs/s | +| Median Throughput | custom-vector-bulk | 12894.8 | docs/s | +| Max Throughput | custom-vector-bulk | 30050.8 | docs/s | +| 50th percentile latency | custom-vector-bulk | 81.4293 | ms | +| 90th percentile latency | custom-vector-bulk | 111.812 | ms | +| 99th percentile latency | custom-vector-bulk | 196.45 | ms | +| 99.9th percentile latency | custom-vector-bulk | 370.543 | ms | +| 99.99th percentile latency | custom-vector-bulk | 474.156 | ms | +| 100th percentile latency | custom-vector-bulk | 499.048 | ms | +| 50th percentile service time | custom-vector-bulk | 81.4235 | ms | +| 90th percentile service time | custom-vector-bulk | 111.833 | ms | +| 99th percentile service time | custom-vector-bulk | 197.125 | ms | +| 99.9th percentile service time | custom-vector-bulk | 370.543 | ms | +| 99.99th percentile service time | custom-vector-bulk | 474.156 | ms | +| 100th percentile service time | custom-vector-bulk | 499.048 | ms | +| error rate | custom-vector-bulk | 0 | % | +| Min Throughput | force-merge-segments | 0.1 | ops/s | +| Mean Throughput | force-merge-segments | 0.1 | ops/s | +| Median Throughput | force-merge-segments | 0.1 | ops/s | +| Max Throughput | force-merge-segments | 0.1 | ops/s | +| 100th percentile latency | force-merge-segments | 10015.2 | ms | +| 100th percentile service time | force-merge-segments | 10015.2 | ms | +| error rate | force-merge-segments | 0 | % | +| Min Throughput | warmup-indices | 19 | ops/s | +| Mean Throughput | warmup-indices | 19 | ops/s | +| Median Throughput | warmup-indices | 19 | ops/s | +| Max Throughput | warmup-indices | 19 | ops/s | +| 100th percentile latency | warmup-indices | 52.1685 | ms | +| 100th percentile service time | warmup-indices | 52.1685 | ms | +| error rate | warmup-indices | 0 | % | +| Min Throughput | prod-queries | 159.49 | ops/s | +| Mean Throughput | prod-queries | 159.49 | ops/s | +| Median Throughput | prod-queries | 159.49 | ops/s | +| Max Throughput | prod-queries | 159.49 | ops/s | +| 50th percentile latency | prod-queries | 1.92377 | ms | +| 90th percentile latency | prod-queries | 2.63867 | ms | +| 99th percentile latency | prod-queries | 48.513 | ms | +| 100th percentile latency | prod-queries | 90.543 | ms | +| 50th percentile service time | prod-queries | 1.92377 | ms | +| 90th percentile service time | prod-queries | 2.63867 | ms | +| 99th percentile service time | prod-queries | 48.513 | ms | +| 100th percentile service time | prod-queries | 90.543 | ms | +| error rate | prod-queries | 0 | % | +| Mean recall@k | prod-queries | 0.96 | | +| Mean recall@1 | prod-queries | 0.98 | | + + 
+--------------------------------- +[INFO] SUCCESS (took 218 seconds) +--------------------------------- +``` + +#### Faiss IVF with product quantization (100 search queries) + +``` +| Metric | Task | Value | Unit | +|---------------------------------------------------------------:|-------------------------:|------------:|-------:| +| Cumulative indexing time of primary shards | | 11.3862 | min | +| Min cumulative indexing time across primary shards | | 0.0003 | min | +| Median cumulative indexing time across primary shards | | 0.12735 | min | +| Max cumulative indexing time across primary shards | | 11.2586 | min | +| Cumulative indexing throttle time of primary shards | | 0 | min | +| Min cumulative indexing throttle time across primary shards | | 0 | min | +| Median cumulative indexing throttle time across primary shards | | 0 | min | +| Max cumulative indexing throttle time across primary shards | | 0 | min | +| Cumulative merge time of primary shards | | 1.50842 | min | +| Cumulative merge count of primary shards | | 19 | | +| Min cumulative merge time across primary shards | | 0 | min | +| Median cumulative merge time across primary shards | | 0.000233333 | min | +| Max cumulative merge time across primary shards | | 1.50818 | min | +| Cumulative merge throttle time of primary shards | | 0.58095 | min | +| Min cumulative merge throttle time across primary shards | | 0 | min | +| Median cumulative merge throttle time across primary shards | | 0 | min | +| Max cumulative merge throttle time across primary shards | | 0.58095 | min | +| Cumulative refresh time of primary shards | | 0.2059 | min | +| Cumulative refresh count of primary shards | | 61 | | +| Min cumulative refresh time across primary shards | | 0.00238333 | min | +| Median cumulative refresh time across primary shards | | 0.00526667 | min | +| Max cumulative refresh time across primary shards | | 0.19825 | min | +| Cumulative flush time of primary shards | | 0.0254667 | min | +| Cumulative flush count of primary shards | | 10 | | +| Min cumulative flush time across primary shards | | 0 | min | +| Median cumulative flush time across primary shards | | 0.0118333 | min | +| Max cumulative flush time across primary shards | | 0.0136333 | min | +| Total Young Gen GC time | | 6.477 | s | +| Total Young Gen GC count | | 3565 | | +| Total Old Gen GC time | | 0 | s | +| Total Old Gen GC count | | 0 | | +| Store size | | 0.892541 | GB | +| Translog size | | 0.0304573 | GB | +| Heap used for segments | | 0 | MB | +| Heap used for doc values | | 0 | MB | +| Heap used for terms | | 0 | MB | +| Heap used for norms | | 0 | MB | +| Heap used for points | | 0 | MB | +| Heap used for stored fields | | 0 | MB | +| Segment count | | 21 | | +| Min Throughput | custom-vector-bulk-train | 31931 | docs/s | +| Mean Throughput | custom-vector-bulk-train | 31931 | docs/s | +| Median Throughput | custom-vector-bulk-train | 31931 | docs/s | +| Max Throughput | custom-vector-bulk-train | 31931 | docs/s | +| 50th percentile latency | custom-vector-bulk-train | 25.3297 | ms | +| 90th percentile latency | custom-vector-bulk-train | 35.3864 | ms | +| 99th percentile latency | custom-vector-bulk-train | 144.372 | ms | +| 100th percentile latency | custom-vector-bulk-train | 209.37 | ms | +| 50th percentile service time | custom-vector-bulk-train | 25.3226 | ms | +| 90th percentile service time | custom-vector-bulk-train | 35.3864 | ms | +| 99th percentile service time | custom-vector-bulk-train | 144.372 | ms | +| 100th percentile service time | 
custom-vector-bulk-train | 209.37 | ms | +| error rate | custom-vector-bulk-train | 0 | % | +| Min Throughput | delete-model | 8.65 | ops/s | +| Mean Throughput | delete-model | 8.65 | ops/s | +| Median Throughput | delete-model | 8.65 | ops/s | +| Max Throughput | delete-model | 8.65 | ops/s | +| 100th percentile latency | delete-model | 114.725 | ms | +| 100th percentile service time | delete-model | 114.725 | ms | +| error rate | delete-model | 0 | % | +| Min Throughput | train-knn-model | 0.03 | ops/s | +| Mean Throughput | train-knn-model | 0.03 | ops/s | +| Median Throughput | train-knn-model | 0.03 | ops/s | +| Max Throughput | train-knn-model | 0.03 | ops/s | +| 100th percentile latency | train-knn-model | 37222.2 | ms | +| 100th percentile service time | train-knn-model | 37222.2 | ms | +| error rate | train-knn-model | 0 | % | +| Min Throughput | custom-vector-bulk | 10669.3 | docs/s | +| Mean Throughput | custom-vector-bulk | 14468.6 | docs/s | +| Median Throughput | custom-vector-bulk | 12496.1 | docs/s | +| Max Throughput | custom-vector-bulk | 35027.8 | docs/s | +| 50th percentile latency | custom-vector-bulk | 74.2584 | ms | +| 90th percentile latency | custom-vector-bulk | 113.426 | ms | +| 99th percentile latency | custom-vector-bulk | 293.075 | ms | +| 99.9th percentile latency | custom-vector-bulk | 1774.41 | ms | +| 99.99th percentile latency | custom-vector-bulk | 1969.99 | ms | +| 100th percentile latency | custom-vector-bulk | 1971.29 | ms | +| 50th percentile service time | custom-vector-bulk | 74.2577 | ms | +| 90th percentile service time | custom-vector-bulk | 113.477 | ms | +| 99th percentile service time | custom-vector-bulk | 292.481 | ms | +| 99.9th percentile service time | custom-vector-bulk | 1774.41 | ms | +| 99.99th percentile service time | custom-vector-bulk | 1969.99 | ms | +| 100th percentile service time | custom-vector-bulk | 1971.29 | ms | +| error rate | custom-vector-bulk | 0 | % | +| Min Throughput | force-merge-segments | 0.05 | ops/s | +| Mean Throughput | force-merge-segments | 0.05 | ops/s | +| Median Throughput | force-merge-segments | 0.05 | ops/s | +| Max Throughput | force-merge-segments | 0.05 | ops/s | +| 100th percentile latency | force-merge-segments | 20015.2 | ms | +| 100th percentile service time | force-merge-segments | 20015.2 | ms | +| error rate | force-merge-segments | 0 | % | +| Min Throughput | warmup-indices | 47.06 | ops/s | +| Mean Throughput | warmup-indices | 47.06 | ops/s | +| Median Throughput | warmup-indices | 47.06 | ops/s | +| Max Throughput | warmup-indices | 47.06 | ops/s | +| 100th percentile latency | warmup-indices | 20.6798 | ms | +| 100th percentile service time | warmup-indices | 20.6798 | ms | +| error rate | warmup-indices | 0 | % | +| Min Throughput | prod-queries | 87.76 | ops/s | +| Mean Throughput | prod-queries | 87.76 | ops/s | +| Median Throughput | prod-queries | 87.76 | ops/s | +| Max Throughput | prod-queries | 87.76 | ops/s | +| 50th percentile latency | prod-queries | 1.81677 | ms | +| 90th percentile latency | prod-queries | 2.80454 | ms | +| 99th percentile latency | prod-queries | 51.2039 | ms | +| 100th percentile latency | prod-queries | 98.2032 | ms | +| 50th percentile service time | prod-queries | 1.81677 | ms | +| 90th percentile service time | prod-queries | 2.80454 | ms | +| 99th percentile service time | prod-queries | 51.2039 | ms | +| 100th percentile service time | prod-queries | 98.2032 | ms | +| error rate | prod-queries | 0 | % | +| Mean recall@k | prod-queries | 0.62 | | 
+| Mean recall@1 | prod-queries | 0.52 | | + +--------------------------------- +[INFO] SUCCESS (took 413 seconds) +--------------------------------- +``` diff --git a/_clients/go.md b/_clients/go.md index 4e7de566018..892c6cec057 100644 --- a/_clients/go.md +++ b/_clients/go.md @@ -6,7 +6,7 @@ nav_order: 50 # Go client -The OpenSearch Go client lets you connect your Go application with the data in your OpenSearch cluster. This getting started guide illustrates how to connect to OpenSearch, index documents, and run queries. For the client's complete API documentation and additional examples, see the [Go client API documentation](https://pkg.go.dev/github.com/opensearch-project/opensearch-go/v2). +The OpenSearch Go client lets you connect your Go application with the data in your OpenSearch cluster. This getting started guide illustrates how to connect to OpenSearch, index documents, and run queries. For the client's complete API documentation and additional examples, see the [Go client API documentation](https://pkg.go.dev/github.com/opensearch-project/opensearch-go/v4). For the client source code, see the [opensearch-go repo](https://github.com/opensearch-project/opensearch-go). @@ -29,7 +29,7 @@ go get github.com/opensearch-project/opensearch-go ## Connecting to OpenSearch -To connect to the default OpenSearch host, create a client object with the address `https://localhost:9200` if you are using the Security plugin: +To connect to the default OpenSearch host, create a client object with the address `https://localhost:9200` if you are using the Security plugin: ```go client, err := opensearch.NewClient(opensearch.Config{ @@ -68,9 +68,9 @@ import ( "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/config" - opensearch "github.com/opensearch-project/opensearch-go/v2" - opensearchapi "github.com/opensearch-project/opensearch-go/v2/opensearchapi" - requestsigner "github.com/opensearch-project/opensearch-go/v2/signer/awsv2" + opensearch "github.com/opensearch-project/opensearch-go/v4" + opensearchapi "github.com/opensearch-project/opensearch-go/v4/opensearchapi" + requestsigner "github.com/opensearch-project/opensearch-go/v4/signer/awsv2" ) const endpoint = "" // e.g. https://opensearch-domain.region.com or Amazon OpenSearch Serverless endpoint @@ -102,6 +102,9 @@ func main() { if err != nil { log.Fatal("client creation err", err) } + + _ = client + // your code here } func getCredentialProvider(accessKey, secretAccessKey, token string) aws.CredentialsProviderFunc { @@ -130,9 +133,8 @@ import ( "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/config" - opensearch "github.com/opensearch-project/opensearch-go/v2" - opensearchapi "github.com/opensearch-project/opensearch-go/v2/opensearchapi" - requestsigner "github.com/opensearch-project/opensearch-go/v2/signer/awsv2" + opensearch "github.com/opensearch-project/opensearch-go/v4" + requestsigner "github.com/opensearch-project/opensearch-go/v4/signer/awsv2" ) const endpoint = "" // e.g. 
https://opensearch-domain.region.com or Amazon OpenSearch Serverless endpoint @@ -164,6 +166,9 @@ func main() { if err != nil { log.Fatal("client creation err", err) } + + _ = client + // your code here } func getCredentialProvider(accessKey, secretAccessKey, token string) aws.CredentialsProviderFunc { @@ -173,7 +178,6 @@ func getCredentialProvider(accessKey, secretAccessKey, token string) aws.Credent SecretAccessKey: secretAccessKey, SessionToken: token, } - return *c, nil } } ``` @@ -197,7 +201,7 @@ client, err := opensearch.NewClient(opensearch.Config{ ``` {% include copy.html %} -The Go client retries requests for a maximum of three times by default. To customize the number of retries, set the `MaxRetries` parameter. Additionally, you can change the list of response codes for which a request is retried by setting the `RetryOnStatus` parameter. The following code snippet creates a new Go client with custom `MaxRetries` and `RetryOnStatus` values: +The Go client retries requests for a maximum of three times by default. To customize the number of retries, set the `MaxRetries` parameter. Additionally, you can change the list of response codes for which a request is retried by setting the `RetryOnStatus` parameter. The following code snippet creates a new Go client with custom `MaxRetries` and `RetryOnStatus` values: ```go client, err := opensearch.NewClient(opensearch.Config{ @@ -226,7 +230,7 @@ settings := strings.NewReader(`{ }`) res := opensearchapi.IndicesCreateRequest{ - Index: "go-test-index1", + Index: "go-test-index1", Body: settings, } ``` @@ -369,7 +373,7 @@ func main() { // Create an index with non-default settings. res := opensearchapi.IndicesCreateRequest{ - Index: IndexName, + Index: IndexName, Body: settings, } fmt.Println("Creating index") @@ -396,7 +400,7 @@ func main() { fmt.Println("Inserting a document") fmt.Println(insertResponse) defer insertResponse.Body.Close() - + // Perform bulk operations. blk, err := client.Bulk( strings.NewReader(` @@ -471,4 +475,4 @@ func main() { defer deleteIndexResponse.Body.Close() } ``` -{% include copy.html %} \ No newline at end of file +{% include copy.html %} diff --git a/_clients/java-rest-high-level.md b/_clients/java-rest-high-level.md index e4364994e51..cfc68cd1e47 100644 --- a/_clients/java-rest-high-level.md +++ b/_clients/java-rest-high-level.md @@ -6,7 +6,7 @@ nav_order: 20 # Java high-level REST client -The OpenSearch Java high-level REST client is deprecated. Support will be removed in OpenSearch version 3.0.0. We recommend switching to the [Java client]({{site.url}}{{site.baseurl}}/clients/java/) instead. +The OpenSearch Java high-level REST client is deprecated. Support will be removed in a future version. We recommend switching to the [Java client]({{site.url}}{{site.baseurl}}/clients/java/) instead. {: .warning} The OpenSearch Java high-level REST client lets you interact with your OpenSearch clusters and indexes through Java methods and data structures rather than HTTP methods and JSON. 
diff --git a/_clients/python-low-level.md b/_clients/python-low-level.md index ba40fa3f459..4b13c367753 100644 --- a/_clients/python-low-level.md +++ b/_clients/python-low-level.md @@ -106,7 +106,7 @@ client = OpenSearch( ## Connecting to Amazon OpenSearch Service -The following example illustrates connecting to Amazon OpenSearch Service: +The following example illustrates connecting to Amazon OpenSearch Service using IAM credentials: ```python from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth @@ -127,6 +127,25 @@ client = OpenSearch( pool_maxsize = 20 ) ``` + +To connect to Amazon OpenSearch Service through HTTP with a username and password, use the following code: + +```python +from opensearchpy import OpenSearch + +auth = ('admin', 'admin') # For testing only. Don't store credentials in code. + +client = OpenSearch( + hosts=[{"host": host, "port": 443}], + http_auth=auth, + http_compress=True, # enables gzip compression for request bodies + use_ssl=True, + verify_certs=True, + ssl_assert_hostname=False, + ssl_show_warn=False, +) +``` + {% include copy.html %} ## Connecting to Amazon OpenSearch Serverless @@ -359,4 +378,4 @@ print(response) ## Next steps - For Python client API, see the [`opensearch-py` API documentation](https://opensearch-project.github.io/opensearch-py/). -- For Python code samples, see [Samples](https://github.com/opensearch-project/opensearch-py/tree/main/samples). \ No newline at end of file +- For Python code samples, see [Samples](https://github.com/opensearch-project/opensearch-py/tree/main/samples). diff --git a/_config.yml b/_config.yml index 4ead6344c24..5c2ef7212c6 100644 --- a/_config.yml +++ b/_config.yml @@ -1,14 +1,15 @@ title: OpenSearch Documentation description: >- # this means to ignore newlines until "baseurl:" Documentation for OpenSearch, the Apache 2.0 search, analytics, and visualization suite with advanced security, alerting, SQL support, automated index management, deep performance analysis, and more. -baseurl: "/docs/latest" # the subpath of your site, e.g. /blog -url: "https://opensearch.org" # the base hostname & protocol for your site, e.g. http://example.com +baseurl: "/docs/latest" # The subpath of the current version. This is version-specific. For example, for 2.19, the base URL is /docs/2.19. +latesturl: "/docs/latest" # The subpath of the latest version. Used for non-version-specific documentation, like Data Prepper, etc. +url: "https://docs.opensearch.org" # the base hostname & protocol for your site, e.g. 
http://example.com permalink: /:path/ -opensearch_version: '2.17.0' -opensearch_dashboards_version: '2.17.0' -opensearch_major_minor_version: '2.17' -lucene_version: '9_11_1' +opensearch_version: '3.1.0' +opensearch_dashboards_version: '3.1.0' +opensearch_major_minor_version: '3.1' +lucene_version: '10_2_1' # Build settings markdown: kramdown @@ -31,9 +32,6 @@ collections: install-and-configure: permalink: /:collection/:path/ output: true - upgrade-to: - permalink: /:collection/:path/ - output: true im-plugin: permalink: /:collection/:path/ output: true @@ -43,9 +41,6 @@ collections: dashboards: permalink: /:collection/:path/ output: true - integrations: - permalink: /:collection/:path/ - output: true tuning-your-cluster: permalink: /:collection/:path/ output: true @@ -57,7 +52,7 @@ collections: output: true search-plugins: permalink: /:collection/:path/ - output: true + output: true ml-commons-plugin: permalink: /:collection/:path/ output: true @@ -94,6 +89,9 @@ collections: data-prepper: permalink: /:collection/:path/ output: true + migration-assistant: + permalink: /:collection/:path/ + output: true tools: permalink: /:collection/:path/ output: true @@ -121,6 +119,15 @@ collections: getting-started: permalink: /:collection/:path/ output: true + workspace: + permalink: /:collection/:path/ + output: true + vector-search: + permalink: /:collection/:path/ + output: true + tutorials: + permalink: /:collection/:path/ + output: true opensearch_collection: # Define the collections used in the theme @@ -131,14 +138,15 @@ opensearch_collection: getting-started: name: Getting started nav_fold: true + tutorials: + name: Tutorials + nav_fold: true install-and-configure: name: Install and upgrade nav_fold: true - upgrade-to: - name: Migrate to OpenSearch - # nav_exclude: true + tuning-your-cluster: + name: Creating and tuning your cluster nav_fold: true - # search_exclude: true im-plugin: name: Managing Indexes nav_fold: true @@ -148,12 +156,6 @@ opensearch_collection: dashboards: name: OpenSearch Dashboards nav_fold: true - integrations: - name: OpenSearch Integrations - nav_fold: true - tuning-your-cluster: - name: Creating and tuning your cluster - nav_fold: true security: name: Security in OpenSearch nav_fold: true @@ -173,7 +175,10 @@ opensearch_collection: name: Aggregations nav_fold: true search-plugins: - name: Search + name: Search features + nav_fold: true + vector-search: + name: Vector search nav_fold: true ml-commons-plugin: name: Machine learning @@ -210,6 +215,12 @@ clients_collection: name: Clients nav_fold: true +migration_assistant_collection: + collections: + migration-assistant: + name: Migration Assistant + nav_fold: true + benchmark_collection: collections: benchmark: @@ -219,7 +230,7 @@ benchmark_collection: data_prepper_collection: collections: data-prepper: - name: Data Prepper + name: OpenSearch Data Prepper nav_fold: true # Defaults @@ -236,7 +247,7 @@ defaults: path: "_data-prepper" values: section: "data-prepper" - section-name: "Data Prepper" + section-name: "OpenSearch Data Prepper" - scope: path: "_clients" @@ -249,6 +260,12 @@ defaults: values: section: "benchmark" section-name: "Benchmark" + - + scope: + path: "_migration-assistant" + values: + section: "migration-assistant" + section-name: "Migration Assistant" # Enable or disable the site search # By default, just-the-docs enables its JSON file-based search. We also have an OpenSearch-driven search functionality. 
@@ -308,6 +325,7 @@ plugins: - jekyll-remote-theme - jekyll-redirect-from - jekyll-sitemap + - jekyll-spec-insert # This format has to conform to RFC822 last-modified-at: @@ -317,6 +335,8 @@ last-modified-at: # The following items will not be processed, by default. Create a custom list # to override the default setting. exclude: + - README.md + - DEVELOPER_GUIDE.md - Gemfile - Gemfile.lock - node_modules @@ -324,6 +344,12 @@ exclude: - vendor/cache/ - vendor/gems/ - vendor/ruby/ - - README.md - - .idea - - templates + - templates/ + - .sass-cache/ + - .jekyll-cache/ + - .idea/ + - .github/ + - .bundle/ + - _site/ + - spec-insert + - release-notes \ No newline at end of file diff --git a/_dashboards/dashboards-assistant/alert-insight.md b/_dashboards/dashboards-assistant/alert-insight.md new file mode 100644 index 00000000000..b1cc8db3392 --- /dev/null +++ b/_dashboards/dashboards-assistant/alert-insight.md @@ -0,0 +1,321 @@ +--- +layout: default +title: Alert insights +parent: OpenSearch Assistant for OpenSearch Dashboards +nav_order: 1 +has_children: false +--- + +# Alert insights + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress the feature or if you want to leave feedback, join the discussion in the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning} + +The OpenSearch Dashboards Assistant alert insights help generate alert summaries and provide log patterns based on the logs that triggered the alert. + +## Configuring alert insights + +To configure alert insights, use the following steps. + +### Prerequisite + +Before using alert insights, you must have the `alerting` and `alerting-dashboards` plugins installed on your cluster. By default, these plugins are installed as part of standard OpenSearch distributions. For more information, see [Installing plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/). + +### Step 1: Enable alert insights + +To enable alert insights, configure the following `opensearch_dashboards.yml` setting: + +```yaml +assistant.alertInsight.enabled: true +``` +{% include copy.html %} + +### Step 2: Create the agents + +To orchestrate alert insights, you'll need to create the necessary [agents]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/). Create a workflow template for creating all necessary agents by sending the following request: + +
+ + Request + + {: .text-delta} + +```json +POST /_plugins/_flow_framework/workflow?provision=true +{ + "name": "Alert Summary Agent", + "description": "Create Alert Summary Agent using Claude on BedRock", + "use_case": "REGISTER_AGENT", + "version": { + "template": "1.0.0", + "compatibility": ["2.17.0", "3.0.0"] + }, + "workflows": { + "provision": { + "user_params": {}, + "nodes": [ + { + "id": "create_claude_connector", + "type": "create_connector", + "previous_node_inputs": {}, + "user_inputs": { + "version": "1", + "name": "Claude instant runtime Connector", + "protocol": "aws_sigv4", + "description": "The connector to BedRock service for Claude model", + "actions": [ + { + "headers": { + "x-amz-content-sha256": "required", + "content-type": "application/json" + }, + "method": "POST", + "request_body": "{\"prompt\":\"\\n\\nHuman: ${parameters.prompt}\\n\\nAssistant:\", \"max_tokens_to_sample\":${parameters.max_tokens_to_sample}, \"temperature\":${parameters.temperature}, \"anthropic_version\":\"${parameters.anthropic_version}\" }", + "action_type": "predict", + "url": "https://bedrock-runtime.us-west-2.amazonaws.com/model/anthropic.claude-instant-v1/invoke" + } + ], + "credential": { + "access_key": "", + "secret_key": "", + "session_token": "" + }, + "parameters": { + "region": "us-west-2", + "endpoint": "bedrock-runtime.us-west-2.amazonaws.com", + "content_type": "application/json", + "auth": "Sig_V4", + "max_tokens_to_sample": "8000", + "service_name": "bedrock", + "temperature": "0.0001", + "response_filter": "$.completion", + "anthropic_version": "bedrock-2023-05-31" + } + } + }, + { + "id": "register_claude_model", + "type": "register_remote_model", + "previous_node_inputs": { + "create_claude_connector": "connector_id" + }, + "user_inputs": { + "description": "Claude model", + "deploy": true, + "name": "claude-instant" + } + }, + { + "id": "create_alert_summary_ml_model_tool", + "type": "create_tool", + "previous_node_inputs": { + "register_claude_model": "model_id" + }, + "user_inputs": { + "parameters": { + "prompt": "You are an OpenSearch Alert Assistant to help summarize the alerts.\n Here is the detail of alert: ${parameters.context};\n The question is: ${parameters.question}." + }, + "name": "MLModelTool", + "type": "MLModelTool" + } + }, + { + "id": "create_alert_summary_agent", + "type": "register_agent", + "previous_node_inputs": { + "create_alert_summary_ml_model_tool": "tools" + }, + "user_inputs": { + "parameters": {}, + "type": "flow", + "name": "Alert Summary Agent", + "description": "this is an alert summary agent" + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +
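The response to this request contains a workflow ID. One way to find the agent ID created by the provisioned workflow is to check the workflow status; in the following sketch, `<workflow_id>` is a placeholder for the returned ID, and the agent ID appears in the `resources_created` section of the response:

```json
GET /_plugins/_flow_framework/workflow/<workflow_id>/_status
```
{% include copy-curl.html %}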
+ +For sample agent templates, see [Flow Framework sample templates](https://github.com/opensearch-project/flow-framework/tree/2.x/sample-templates). Note the agent IDs; you'll use them in the following step. + +For this example, use the templates to create the following agents: +- An alert insights agent ([flow template](https://github.com/opensearch-project/flow-framework/blob/2.x/sample-templates/create-knowledge-base-alert-agent.json)) +- Two summary agents: + - A basic alert summary agent ([flow template](https://github.com/opensearch-project/flow-framework/blob/2.x/sample-templates/alert-summary-agent-claude-tested.json)) + - An agent for an alert summary that includes log patterns ([flow template](https://github.com/opensearch-project/flow-framework/blob/2.x/sample-templates/alert-summary-log-pattern-agent.json)) + +These agents require different prompts. The prompt for the log patterns summary must include a placeholder `${parameters.topNLogPatternData}` and additional instructions to guide the LLM on using this information effectively. Note that log patterns are available only for query monitors created using OpenSearch Dashboards. + +### Step 3: Create the root agents + +Next, create [root agents]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-tutorial/#root_agent) for the agents created in the previous step. + +Create a root agent for the alert summary agent: + +```json +POST /.plugins-ml-config/_doc/os_summary +{ + "type": "os_root_agent", + "configuration": { + "agent_id": "" + } +} +``` +{% include copy-curl.html %} + +Create a root agent for the alert summary with log patterns agent: + +```json +POST /.plugins-ml-config/_doc/os_summary_with_log_pattern +{ + "type": "os_root_agent", + "configuration": { + "agent_id": "" + } +} +``` +{% include copy-curl.html %} + +Create a root agent for the alert insights agent: + +```json +POST /.plugins-ml-config/_doc/os_insight +{ + "type": "os_root_agent", + "configuration": { + "agent_id": "" + } +} +``` +{% include copy-curl.html %} + +The created `os_insight` agent provides alert insights related to OpenSearch cluster metrics. For insights about alerts unrelated to OpenSearch cluster metrics, you need to register an agent with [this template](https://github.com/opensearch-project/flow-framework/blob/2.x/sample-templates/create-knowledge-base-alert-agent.json) and change the agent name to `KB_For_Alert_Insight`. +{: .note} + +This example demonstrates a system index. In security-enabled domains, only superadmins have permissions to execute this code. For information about making superadmin calls, see [System indexes]({{site.url}}{{site.baseurl}}/security/configuration/system-indices/). For access permissions, contact your system administrator. +{: .warning} + +### Step 4: Test the agents + +You can verify that the agents were created successfully by calling the agents with an example payload.
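If you did not record the agent IDs when provisioning finished, one way to look them up is to check the status of the workflow you created in Step 2. The following is a minimal sketch that assumes `<workflow_id>` is the workflow ID returned by the workflow creation request:

```json
GET /_plugins/_flow_framework/workflow/<workflow_id>/_status
```
{% include copy-curl.html %}

The response lists the resources created by the workflow, including each registered agent and its agent ID, which you can then substitute into the requests that follow.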
+ +To test the alert summary agent, send the following request: + +```json +POST /_plugins/_ml/agents//_execute +{ + "parameters": { + "question": "Please summarize this alert, do not use any tool.", + "context": "\n Here is the detail information about alert Error log over 100\n ### Monitor definition\n {\"type\":\"monitor\",\"schema_version\":8,\"name\":\"loghub-apache-error-log\",\"monitor_type\":\"query_level_monitor\",\"enabled\":false,\"enabled_time\":null,\"schedule\":{\"period\":{\"interval\":1,\"unit\":\"MINUTES\"}},\"inputs\":[{\"search\":{\"indices\":[\"loghub-apache-new\"],\"query\":{\"size\":0,\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"10/12/24 11:21 am CST||-1000000h\",\"to\":\"10/12/24 11:21 am CST\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}}}}],\"triggers\":[{\"query_level_trigger\":{\"id\":\"NAq7fpIBRJyww-JMjwP_\",\"name\":\"Error log over 100\",\"severity\":\"1\",\"condition\":{\"script\":{\"source\":\"ctx.results[0].hits.total.value > 100\",\"lang\":\"painless\"}},\"actions\":[]}}],\"last_update_time\":1728714554388,\"owner\":\"alerting\",\"associated_workflows\":[],\"associatedCompositeMonitorCnt\":0,\"item_type\":\"query_level_monitor\",\"id\":\"NQq7fpIBRJyww-JMkAMC\",\"version\":3}\n\n ### Active Alert\n {\"ACTIVE\":1,\"ACKNOWLEDGED\":0,\"ERROR\":0,\"total\":1,\"alerts\":[{\"id\":\"Wgq8fpIBRJyww-JMegNr\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"workflow_id\":\"\",\"workflow_name\":\"\",\"associated_alert_ids\":[],\"schema_version\":5,\"monitor_version\":1,\"monitor_name\":\"loghub-apache-error-log\",\"execution_id\":\"NQq7fpIBRJyww-JMkAMC_2024-10-12T03:18:54.311214115_22d189ce-5e93-4927-b8bb-bcf61b7537e3\",\"trigger_id\":\"NAq7fpIBRJyww-JMjwP_\",\"trigger_name\":\"Error log over 100\",\"finding_ids\":[],\"related_doc_ids\":[],\"state\":\"ACTIVE\",\"error_message\":null,\"alert_history\":[],\"severity\":\"1\",\"action_execution_results\":[],\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"end_time\":null,\"acknowledged_time\":null,\"alert_source\":\"monitor\"}],\"trigger_name\":\"Error log over 100\",\"severity\":\"1\",\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"monitor_name\":\"loghub-apache-error-log\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"alert_source\":\"monitor\",\"triggerID\":\"NAq7fpIBRJyww-JMjwP_\"}\n\n ### Value triggers this alert\n 595\n\n ### Alert query DSL {\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"2024-10-12T03:21:54+00:00||-1000000h\",\"to\":\"2024-10-12T03:21:54+00:00\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}} \n", + } +} +``` +{% include copy-curl.html %} + +To test the alert summary with log patterns agent, send the following request: + +```json +POST /_plugins/_ml/agents//_execute +{ + "parameters": { + "question": "Please summarize this alert, do not use any tool.", + "context": "\n Here is the detail information about alert Error log over 100\n ### Monitor definition\n 
{\"type\":\"monitor\",\"schema_version\":8,\"name\":\"loghub-apache-error-log\",\"monitor_type\":\"query_level_monitor\",\"enabled\":false,\"enabled_time\":null,\"schedule\":{\"period\":{\"interval\":1,\"unit\":\"MINUTES\"}},\"inputs\":[{\"search\":{\"indices\":[\"loghub-apache-new\"],\"query\":{\"size\":0,\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"10/12/24 11:21 am CST||-1000000h\",\"to\":\"10/12/24 11:21 am CST\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}}}}],\"triggers\":[{\"query_level_trigger\":{\"id\":\"NAq7fpIBRJyww-JMjwP_\",\"name\":\"Error log over 100\",\"severity\":\"1\",\"condition\":{\"script\":{\"source\":\"ctx.results[0].hits.total.value > 100\",\"lang\":\"painless\"}},\"actions\":[]}}],\"last_update_time\":1728714554388,\"owner\":\"alerting\",\"associated_workflows\":[],\"associatedCompositeMonitorCnt\":0,\"item_type\":\"query_level_monitor\",\"id\":\"NQq7fpIBRJyww-JMkAMC\",\"version\":3}\n\n ### Active Alert\n {\"ACTIVE\":1,\"ACKNOWLEDGED\":0,\"ERROR\":0,\"total\":1,\"alerts\":[{\"id\":\"Wgq8fpIBRJyww-JMegNr\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"workflow_id\":\"\",\"workflow_name\":\"\",\"associated_alert_ids\":[],\"schema_version\":5,\"monitor_version\":1,\"monitor_name\":\"loghub-apache-error-log\",\"execution_id\":\"NQq7fpIBRJyww-JMkAMC_2024-10-12T03:18:54.311214115_22d189ce-5e93-4927-b8bb-bcf61b7537e3\",\"trigger_id\":\"NAq7fpIBRJyww-JMjwP_\",\"trigger_name\":\"Error log over 100\",\"finding_ids\":[],\"related_doc_ids\":[],\"state\":\"ACTIVE\",\"error_message\":null,\"alert_history\":[],\"severity\":\"1\",\"action_execution_results\":[],\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"end_time\":null,\"acknowledged_time\":null,\"alert_source\":\"monitor\"}],\"trigger_name\":\"Error log over 100\",\"severity\":\"1\",\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"monitor_name\":\"loghub-apache-error-log\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"alert_source\":\"monitor\",\"triggerID\":\"NAq7fpIBRJyww-JMjwP_\"}\n\n ### Value triggers this alert\n 595\n\n ### Alert query DSL {\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"2024-10-12T03:21:54+00:00||-1000000h\",\"to\":\"2024-10-12T03:21:54+00:00\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}} \n", + "topNLogPatternData": "[[539,["[Sun Dec 04 07:12:44 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 06:19:18 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 07:18:00 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 16:52:49 2005] [error] mod_jk child workerEnv in error state 7","[Sun Dec 04 06:59:47 2005] [error] mod_jk child workerEnv in error state 8","[Sun Dec 04 07:11:22 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 07:18:00 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 17:01:47 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 17:31:12 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 05:04:04 2005] [error] mod_jk child workerEnv in error state 7","[Sun Dec 04 20:24:49 2005] [error] mod_jk child workerEnv in error state 8","[Sun Dec 04 06:16:23 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 
20:47:17 2005] [error] mod_jk child workerEnv in error state 7","[Sun Dec 04 06:30:43 2005] [error] mod_jk child workerEnv in error state 6","[Mon Dec 05 06:35:27 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 07:07:30 2005] [error] mod_jk child workerEnv in error state 8","[Sun Dec 04 07:18:00 2005] [error] mod_jk child workerEnv in error state 7","[Sun Dec 04 16:32:56 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 17:01:47 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 16:52:49 2005] [error] mod_jk child workerEnv in error state 8"],"[ :: ] [] _ "],[32,["[Sun Dec 04 14:29:00 2005] [error] [client 4.245.93.87] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 08:54:17 2005] [error] [client 147.31.138.75] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 17:34:57 2005] [error] [client 61.138.216.82] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 07:45:45 2005] [error] [client 63.13.186.196] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 10:53:30 2005] [error] [client 218.76.139.20] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 10:48:48 2005] [error] [client 67.166.248.235] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 15:18:36 2005] [error] [client 67.154.58.130] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 01:30:32 2005] [error] [client 211.62.201.48] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 16:45:04 2005] [error] [client 216.216.185.130] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 17:31:39 2005] [error] [client 218.75.106.250] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 19:00:56 2005] [error] [client 68.228.3.15] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 19:14:09 2005] [error] [client 61.220.139.68] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 09:35:12 2005] [error] [client 207.203.80.15] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 10:28:44 2005] [error] [client 198.232.168.9] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 16:24:05 2005] [error] [client 58.225.62.140] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 17:53:43 2005] [error] [client 218.39.132.175] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 12:33:13 2005] [error] [client 208.51.151.210] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 15:59:01 2005] [error] [client 24.83.37.136] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 11:42:43 2005] [error] [client 216.127.124.16] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 05:15:09 2005] [error] [client 222.166.160.184] Directory index forbidden by rule: /var/www/html/"],"[ :: ] [] [ ...] 
: ////"],[12,["[Sun Dec 04 20:47:17 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 20:47:17 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 07:57:02 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 17:43:12 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 20:47:17 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 20:47:16 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 07:57:02 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 17:43:12 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 11:06:52 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 11:06:52 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 11:06:52 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 11:06:52 2005] [error] mod_jk child init 1 -2"],"[ :: ] [] _ -"]]" + } +} +``` +{% include copy-curl.html %} + +To test the alert insights agent, send the following request: + +```json +POST /_plugins/_ml/agents//_execute +{ + "parameters": { + "question": "Please provide your insight on this alerts.", + "context": "\n Here is the detail information about alert Error log over 100\n ### Monitor definition\n {\"type\":\"monitor\",\"schema_version\":8,\"name\":\"loghub-apache-error-log\",\"monitor_type\":\"query_level_monitor\",\"enabled\":false,\"enabled_time\":null,\"schedule\":{\"period\":{\"interval\":1,\"unit\":\"MINUTES\"}},\"inputs\":[{\"search\":{\"indices\":[\"loghub-apache-new\"],\"query\":{\"size\":0,\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"10/12/24 11:21 am CST||-1000000h\",\"to\":\"10/12/24 11:21 am CST\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}}}}],\"triggers\":[{\"query_level_trigger\":{\"id\":\"NAq7fpIBRJyww-JMjwP_\",\"name\":\"Error log over 100\",\"severity\":\"1\",\"condition\":{\"script\":{\"source\":\"ctx.results[0].hits.total.value > 100\",\"lang\":\"painless\"}},\"actions\":[]}}],\"last_update_time\":1728714554388,\"owner\":\"alerting\",\"associated_workflows\":[],\"associatedCompositeMonitorCnt\":0,\"item_type\":\"query_level_monitor\",\"id\":\"NQq7fpIBRJyww-JMkAMC\",\"version\":3}\n\n ### Active Alert\n {\"ACTIVE\":1,\"ACKNOWLEDGED\":0,\"ERROR\":0,\"total\":1,\"alerts\":[{\"id\":\"Wgq8fpIBRJyww-JMegNr\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"workflow_id\":\"\",\"workflow_name\":\"\",\"associated_alert_ids\":[],\"schema_version\":5,\"monitor_version\":1,\"monitor_name\":\"loghub-apache-error-log\",\"execution_id\":\"NQq7fpIBRJyww-JMkAMC_2024-10-12T03:18:54.311214115_22d189ce-5e93-4927-b8bb-bcf61b7537e3\",\"trigger_id\":\"NAq7fpIBRJyww-JMjwP_\",\"trigger_name\":\"Error log over 100\",\"finding_ids\":[],\"related_doc_ids\":[],\"state\":\"ACTIVE\",\"error_message\":null,\"alert_history\":[],\"severity\":\"1\",\"action_execution_results\":[],\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"end_time\":null,\"acknowledged_time\":null,\"alert_source\":\"monitor\"}],\"trigger_name\":\"Error log over 100\",\"severity\":\"1\",\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"monitor_name\":\"loghub-apache-error-log\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"alert_source\":\"monitor\",\"triggerID\":\"NAq7fpIBRJyww-JMjwP_\"}\n\n ### Value triggers this alert\n 595\n\n ### Alert query DSL 
{\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"2024-10-12T03:21:54+00:00||-1000000h\",\"to\":\"2024-10-12T03:21:54+00:00\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}} \n", + "summary": + } +} +``` +{% include copy-curl.html %} + +## Generating an alert summary + +You can generate an alert summary by calling the `/api/assistant/summary` API endpoint. To generate an alert summary, the fields `index`, `dsl`, and `topNLogPatternData` are optional. If all three fields are provided, the agent will provide a summary with log pattern analysis; otherwise, it will provide a general summary: + +```json +POST /api/assistant/summary +{ + "summaryType": "alerts", + "question": "Please summarize this alert, do not use any tool.", + "context": "\n Here is the detail information about alert Error log over 100\n ### Monitor definition\n {\"type\":\"monitor\",\"schema_version\":8,\"name\":\"loghub-apache-error-log\",\"monitor_type\":\"query_level_monitor\",\"enabled\":false,\"enabled_time\":null,\"schedule\":{\"period\":{\"interval\":1,\"unit\":\"MINUTES\"}},\"inputs\":[{\"search\":{\"indices\":[\"loghub-apache-new\"],\"query\":{\"size\":0,\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"10/12/24 11:21 am CST||-1000000h\",\"to\":\"10/12/24 11:21 am CST\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}}}}],\"triggers\":[{\"query_level_trigger\":{\"id\":\"NAq7fpIBRJyww-JMjwP_\",\"name\":\"Error log over 100\",\"severity\":\"1\",\"condition\":{\"script\":{\"source\":\"ctx.results[0].hits.total.value > 100\",\"lang\":\"painless\"}},\"actions\":[]}}],\"last_update_time\":1728714554388,\"owner\":\"alerting\",\"associated_workflows\":[],\"associatedCompositeMonitorCnt\":0,\"item_type\":\"query_level_monitor\",\"id\":\"NQq7fpIBRJyww-JMkAMC\",\"version\":3}\n\n ### Active Alert\n {\"ACTIVE\":1,\"ACKNOWLEDGED\":0,\"ERROR\":0,\"total\":1,\"alerts\":[{\"id\":\"Wgq8fpIBRJyww-JMegNr\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"workflow_id\":\"\",\"workflow_name\":\"\",\"associated_alert_ids\":[],\"schema_version\":5,\"monitor_version\":1,\"monitor_name\":\"loghub-apache-error-log\",\"execution_id\":\"NQq7fpIBRJyww-JMkAMC_2024-10-12T03:18:54.311214115_22d189ce-5e93-4927-b8bb-bcf61b7537e3\",\"trigger_id\":\"NAq7fpIBRJyww-JMjwP_\",\"trigger_name\":\"Error log over 100\",\"finding_ids\":[],\"related_doc_ids\":[],\"state\":\"ACTIVE\",\"error_message\":null,\"alert_history\":[],\"severity\":\"1\",\"action_execution_results\":[],\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"end_time\":null,\"acknowledged_time\":null,\"alert_source\":\"monitor\"}],\"trigger_name\":\"Error log over 100\",\"severity\":\"1\",\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"monitor_name\":\"loghub-apache-error-log\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"alert_source\":\"monitor\",\"triggerID\":\"NAq7fpIBRJyww-JMjwP_\"}\n\n ### Value triggers this alert\n 595\n\n ### Alert query DSL {\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"2024-10-12T03:21:54+00:00||-1000000h\",\"to\":\"2024-10-12T03:21:54+00:00\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}} 
\n", + "index": "loghub-apache-new", + "dsl": "{\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"2024-10-12T03:21:54+00:00||-1000000h\",\"to\":\"2024-10-12T03:21:54+00:00\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}}", + "topNLogPatternData": "[[539,["[Sun Dec 04 07:12:44 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 06:19:18 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 07:18:00 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 16:52:49 2005] [error] mod_jk child workerEnv in error state 7","[Sun Dec 04 06:59:47 2005] [error] mod_jk child workerEnv in error state 8","[Sun Dec 04 07:11:22 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 07:18:00 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 17:01:47 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 17:31:12 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 05:04:04 2005] [error] mod_jk child workerEnv in error state 7","[Sun Dec 04 20:24:49 2005] [error] mod_jk child workerEnv in error state 8","[Sun Dec 04 06:16:23 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 20:47:17 2005] [error] mod_jk child workerEnv in error state 7","[Sun Dec 04 06:30:43 2005] [error] mod_jk child workerEnv in error state 6","[Mon Dec 05 06:35:27 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 07:07:30 2005] [error] mod_jk child workerEnv in error state 8","[Sun Dec 04 07:18:00 2005] [error] mod_jk child workerEnv in error state 7","[Sun Dec 04 16:32:56 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 17:01:47 2005] [error] mod_jk child workerEnv in error state 6","[Sun Dec 04 16:52:49 2005] [error] mod_jk child workerEnv in error state 8"],"[ :: ] [] _ "],[32,["[Sun Dec 04 14:29:00 2005] [error] [client 4.245.93.87] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 08:54:17 2005] [error] [client 147.31.138.75] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 17:34:57 2005] [error] [client 61.138.216.82] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 07:45:45 2005] [error] [client 63.13.186.196] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 10:53:30 2005] [error] [client 218.76.139.20] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 10:48:48 2005] [error] [client 67.166.248.235] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 15:18:36 2005] [error] [client 67.154.58.130] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 01:30:32 2005] [error] [client 211.62.201.48] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 16:45:04 2005] [error] [client 216.216.185.130] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 17:31:39 2005] [error] [client 218.75.106.250] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 19:00:56 2005] [error] [client 68.228.3.15] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 19:14:09 2005] [error] [client 61.220.139.68] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 09:35:12 2005] [error] [client 207.203.80.15] Directory index forbidden by rule: /var/www/html/","[Mon Dec 05 10:28:44 2005] [error] [client 198.232.168.9] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 16:24:05 2005] [error] [client 58.225.62.140] 
Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 17:53:43 2005] [error] [client 218.39.132.175] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 12:33:13 2005] [error] [client 208.51.151.210] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 15:59:01 2005] [error] [client 24.83.37.136] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 11:42:43 2005] [error] [client 216.127.124.16] Directory index forbidden by rule: /var/www/html/","[Sun Dec 04 05:15:09 2005] [error] [client 222.166.160.184] Directory index forbidden by rule: /var/www/html/"],"[ :: ] [] [ ...] : ////"],[12,["[Sun Dec 04 20:47:17 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 20:47:17 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 07:57:02 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 17:43:12 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 20:47:17 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 20:47:16 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 07:57:02 2005] [error] mod_jk child init 1 -2","[Sun Dec 04 17:43:12 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 11:06:52 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 11:06:52 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 11:06:52 2005] [error] mod_jk child init 1 -2","[Mon Dec 05 11:06:52 2005] [error] mod_jk child init 1 -2"],"[ :: ] [] _ -"]]" +} +``` +{% include copy-curl.html %} + +The following table describes the Assistant Summary API parameters. + +Parameter | Required/Optional | Description +:--- | :--- | :--- +`summaryType` | Required | Specifies the type of application calling this API. Use `alerts` for alert insights. +`question` | Required | Specifies the user's question regarding alert insights. Default is `Please summarize this alert, do not use any tool.` +`context` | Required | Provides context for the alert, including the alert monitor definition, active alerts, and trigger values. +`index` | Optional | The index that the alert monitors. If this parameter is not provided, log pattern analysis is not returned. +`dsl` | Optional | The DSL query for alert monitoring. If this parameter is not provided, log pattern analysis is not returned. +`topNLogPatternData` | Optional | Log patterns for the alert trigger data. If this parameter is not provided, log pattern analysis is not returned. + +## Generating alert insights + +You can generate alert insights by calling the `/api/assistant/insight` API endpoint. 
To generate alert insights, all of the following parameters are required: + +```json +POST /api/assistant/insight +{ + "summaryType": "alerts", + "insightType": "user_insight" + "context": "\n Here is the detail information about alert Error log over 100\n ### Monitor definition\n {\"type\":\"monitor\",\"schema_version\":8,\"name\":\"loghub-apache-error-log\",\"monitor_type\":\"query_level_monitor\",\"enabled\":false,\"enabled_time\":null,\"schedule\":{\"period\":{\"interval\":1,\"unit\":\"MINUTES\"}},\"inputs\":[{\"search\":{\"indices\":[\"loghub-apache-new\"],\"query\":{\"size\":0,\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"10/12/24 11:21 am CST||-1000000h\",\"to\":\"10/12/24 11:21 am CST\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}}}}],\"triggers\":[{\"query_level_trigger\":{\"id\":\"NAq7fpIBRJyww-JMjwP_\",\"name\":\"Error log over 100\",\"severity\":\"1\",\"condition\":{\"script\":{\"source\":\"ctx.results[0].hits.total.value > 100\",\"lang\":\"painless\"}},\"actions\":[]}}],\"last_update_time\":1728714554388,\"owner\":\"alerting\",\"associated_workflows\":[],\"associatedCompositeMonitorCnt\":0,\"item_type\":\"query_level_monitor\",\"id\":\"NQq7fpIBRJyww-JMkAMC\",\"version\":3}\n\n ### Active Alert\n {\"ACTIVE\":1,\"ACKNOWLEDGED\":0,\"ERROR\":0,\"total\":1,\"alerts\":[{\"id\":\"Wgq8fpIBRJyww-JMegNr\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"workflow_id\":\"\",\"workflow_name\":\"\",\"associated_alert_ids\":[],\"schema_version\":5,\"monitor_version\":1,\"monitor_name\":\"loghub-apache-error-log\",\"execution_id\":\"NQq7fpIBRJyww-JMkAMC_2024-10-12T03:18:54.311214115_22d189ce-5e93-4927-b8bb-bcf61b7537e3\",\"trigger_id\":\"NAq7fpIBRJyww-JMjwP_\",\"trigger_name\":\"Error log over 100\",\"finding_ids\":[],\"related_doc_ids\":[],\"state\":\"ACTIVE\",\"error_message\":null,\"alert_history\":[],\"severity\":\"1\",\"action_execution_results\":[],\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"end_time\":null,\"acknowledged_time\":null,\"alert_source\":\"monitor\"}],\"trigger_name\":\"Error log over 100\",\"severity\":\"1\",\"start_time\":\"10/12/24 11:18 am CST\",\"last_notification_time\":\"10/12/24 11:21 am CST\",\"monitor_name\":\"loghub-apache-error-log\",\"monitor_id\":\"NQq7fpIBRJyww-JMkAMC\",\"alert_source\":\"monitor\",\"triggerID\":\"NAq7fpIBRJyww-JMjwP_\"}\n\n ### Value triggers this alert\n 595\n\n ### Alert query DSL {\"query\":{\"bool\":{\"filter\":[{\"range\":{\"Time\":{\"from\":\"2024-10-12T03:21:54+00:00||-1000000h\",\"to\":\"2024-10-12T03:21:54+00:00\",\"include_lower\":true,\"include_upper\":true,\"boost\":1}}},{\"term\":{\"Level\":{\"value\":\"error\",\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}} \n", + "question": "Please provide your insight on this alerts.", + "summary": +} +``` +{% include copy-curl.html %} + +The following table describes the Assistant Insight API parameters. + +Parameter | Required/Optional | Description +:--- | :--- | :--- +`summaryType` | Required | Specifies the type of application calling this API. Use `alerts` for alert insights. +`insightType` | Required | Defines the alert type. Use `os_insight` for cluster metrics alerts and `user_insight` for other alert types. +`question` | Required | Specifies the user's question regarding alert insights. 
Default is `Please provide your insight on this alerts.` +`context` | Required | Provides context for the alert, including the alert monitor definition, active alerts, and trigger values. +`summary` | Required | The result returned by the alert summary agent. + + +## Viewing alert insights in OpenSearch Dashboards + +Before viewing alert insights, you must configure alerts in OpenSearch Dashboards. For more information, see [Alerting]({{site.url}}{{site.baseurl}}/observing-your-data/alerting/index/). + +To view alert insights in OpenSearch Dashboards, use the following steps: + +1. On the top menu bar, go to **OpenSearch Plugins > Alerting**. All alerts are displayed. + +1. Hover over the alerts for your desired monitor. If you configured alert insights, you will see a sparkle icon ({::nomarkdown}sparkle icon{:/}) next to the alerts in the **Alerts** column, as shown in the following image. + + Alerting page with sparkle icon + +1. Select the alerts label or the sparkle icon. You will see the generated summary, as shown in the following image. + + Alert summary + +1. Select the information icon ({::nomarkdown}info icon{:/}) to view alert insights. You will see the generated alert insights, as shown in the following image. + + Alert insights \ No newline at end of file diff --git a/_dashboards/dashboards-assistant/data-summary.md b/_dashboards/dashboards-assistant/data-summary.md new file mode 100644 index 00000000000..2d070661b19 --- /dev/null +++ b/_dashboards/dashboards-assistant/data-summary.md @@ -0,0 +1,219 @@ +--- +layout: default +title: Data summary +parent: OpenSearch Assistant for OpenSearch Dashboards +nav_order: 1 +has_children: false +--- + +# Data summary + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning} + +The OpenSearch Dashboards Assistant data summary feature uses large language models (LLMs) to help you generate summaries for data stored in OpenSearch indexes. This tool provides an efficient way to gain insights from large datasets, making it easier to understand and act on the information contained in your OpenSearch indexes. + +## Configuration + +To configure the data summary feature, use the following steps. + +### Prerequisite + +Before using the data summary feature, enable query enhancements in OpenSearch Dashboards as follows: + +1. On the top menu bar, go to **Management > Dashboards Management**. +1. In the left navigation pane, select **Advanced settings**. +1. On the settings page, toggle **Enable query enhancements** to **On**. + +### Step 1: Enable the data summary feature + +To enable the data summary feature, configure the following `opensearch_dashboards.yml` setting: + +```yaml +queryEnhancements.queryAssist.summary.enabled: true +``` +{% include copy.html %} + +### Step 2: Create a data summary agent + +To orchestrate data summarization, create a data summary [agent]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/). To create an agent, send a `POST /_plugins/_flow_framework/workflow?provision=true` request and provide the agent template as a payload: + +
+ + Request + + {: .text-delta} + +```json +POST /_plugins/_flow_framework/workflow?provision=true +{ + "name": "Query Assist Agent", + "description": "Create a Query Assist Agent using Claude on BedRock", + "use_case": "REGISTER_AGENT", + "version": { + "template": "1.0.0", + "compatibility": ["2.13.0", "3.0.0"] + }, + "workflows": { + "provision": { + "user_params": {}, + "nodes": [ + { + "id": "create_claude_connector", + "type": "create_connector", + "previous_node_inputs": {}, + "user_inputs": { + "version": "1", + "name": "Claude instant runtime Connector", + "protocol": "aws_sigv4", + "description": "The connector to BedRock service for Claude model", + "actions": [ + { + "headers": { + "x-amz-content-sha256": "required", + "content-type": "application/json" + }, + "method": "POST", + "request_body": "{\"prompt\":\"${parameters.prompt}\", \"max_tokens_to_sample\":${parameters.max_tokens_to_sample}, \"temperature\":${parameters.temperature}, \"anthropic_version\":\"${parameters.anthropic_version}\" }", + "action_type": "predict", + "url": "https://bedrock-runtime.us-west-2.amazonaws.com/model/anthropic.claude-instant-v1/invoke" + } + ], + "credential": { + "access_key": "", + "secret_key": "", + "session_token": "" + }, + "parameters": { + "region": "us-west-2", + "endpoint": "bedrock-runtime.us-west-2.amazonaws.com", + "content_type": "application/json", + "auth": "Sig_V4", + "max_tokens_to_sample": "8000", + "service_name": "bedrock", + "temperature": "0.0001", + "response_filter": "$.completion", + "anthropic_version": "bedrock-2023-05-31" + } + } + }, + { + "id": "register_claude_model", + "type": "register_remote_model", + "previous_node_inputs": { + "create_claude_connector": "connector_id" + }, + "user_inputs": { + "description": "Claude model", + "deploy": true, + "name": "claude-instant" + } + }, + { + "id": "create_query_assist_data_summary_ml_model_tool", + "type": "create_tool", + "previous_node_inputs": { + "register_claude_model": "model_id" + }, + "user_inputs": { + "parameters": { + "prompt": "Human: You are an assistant that helps to summarize the data and provide data insights.\nThe data are queried from OpenSearch index through user's question which was translated into PPL query.\nHere is a sample PPL query: `source= | where = `.\nNow you are given ${parameters.sample_count} sample data out of ${parameters.total_count} total data.\nThe user's question is `${parameters.question}`, the translated PPL query is `${parameters.ppl}` and sample data are:\n```\n${parameters.sample_data}\n```\nCould you help provide a summary of the sample data and provide some useful insights with precise wording and in plain text format, do not use markdown format.\nYou don't need to echo my requirements in response.\n\nAssistant:" + }, + "name": "MLModelTool", + "type": "MLModelTool" + } + }, + { + "id": "create_query_assist_data_summary_agent", + "type": "register_agent", + "previous_node_inputs": { + "create_query_assist_data_summary_ml_model_tool": "tools" + }, + "user_inputs": { + "parameters": {}, + "type": "flow", + "name": "Query Assist Data Summary Agent", + "description": "this is an query assist data summary agent" + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +
+ +For sample agent templates, see [Flow Framework sample templates](https://github.com/opensearch-project/flow-framework/tree/2.x/sample-templates). Note the agent ID; you'll use it in the following step. + +### Step 3: Create a root agent + +Next, create a [root agent]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-tutorial/#root_agent) for the data summary agent created in the previous step: + +```json +POST /.plugins-ml-config/_doc/os_data2summary +{ + "type": "os_root_agent", + "configuration": { + "agent_id": "" + } +} +``` +{% include copy-curl.html %} + +This example demonstrates a system index. In security-enabled domains, only superadmins have permissions to execute this code. For information about making superadmin calls, see [System indexes]({{site.url}}{{site.baseurl}}/security/configuration/system-indices/). For access permissions, contact your system administrator. +{: .warning} + +### Step 4: Test the agent + +You can verify that the data summary agent was created successfully by calling the agent with an example payload: + +```json +POST /_plugins/_ml/agents//_execute +{ + "parameters": { + "sample_data":"'[{\"_index\":\"90943e30-9a47-11e8-b64d-95841ca0b247\",\"_source\":{\"referer\":\"http://twitter.com/success/gemini-9a\",\"request\":\"/beats/metricbeat/metricbeat-6.3.2-amd64.deb\",\"agent\":\"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)\",\"extension\":\"deb\",\"memory\":null,\"ip\":\"239.67.210.53\",\"index\":\"opensearch_dashboards_sample_data_logs\",\"message\":\"239.67.210.53 - - [2018-08-30T15:29:01.686Z] \\\"GET /beats/metricbeat/metricbeat-6.3.2-amd64.deb HTTP/1.1\\\" 404 2633 \\\"-\\\" \\\"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)\\\"\",\"url\":\"https://artifacts.opensearch.org/downloads/beats/metricbeat/metricbeat-6.3.2-amd64.deb\",\"tags\":\"success\",\"geo\":{\"srcdest\":\"CN:PL\",\"src\":\"CN\",\"coordinates\":{\"lat\":44.91167028,\"lon\":-108.4455092},\"dest\":\"PL\"},\"utc_time\":\"2024-09-05 15:29:01.686\",\"bytes\":2633,\"machine\":{\"os\":\"win xp\",\"ram\":21474836480},\"response\":\"404\",\"clientip\":\"239.67.210.53\",\"host\":\"artifacts.opensearch.org\",\"event\":{\"dataset\":\"sample_web_logs\"},\"phpmemory\":null,\"timestamp\":\"2024-09-05 15:29:01.686\"}}]'", + "sample_count":1, + "total_count":383, + "question":"Are there any errors in my logs?", + "ppl":"source=opensearch_dashboards_sample_data_logs| where QUERY_STRING(['response'], '4* OR 5*')"} +} +``` +{% include copy-curl.html %} + +## Generating a data summary + +You can generate a data summary by calling the `/api/assistant/data2summary` API endpoint. 
The `sample_count`, `total_count`, `question`, and `ppl` parameters are optional: + +```json +POST /api/assistant/data2summary +{ + "sample_data":"'[{\"_index\":\"90943e30-9a47-11e8-b64d-95841ca0b247\",\"_source\":{\"referer\":\"http://twitter.com/success/gemini-9a\",\"request\":\"/beats/metricbeat/metricbeat-6.3.2-amd64.deb\",\"agent\":\"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)\",\"extension\":\"deb\",\"memory\":null,\"ip\":\"239.67.210.53\",\"index\":\"opensearch_dashboards_sample_data_logs\",\"message\":\"239.67.210.53 - - [2018-08-30T15:29:01.686Z] \\\"GET /beats/metricbeat/metricbeat-6.3.2-amd64.deb HTTP/1.1\\\" 404 2633 \\\"-\\\" \\\"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)\\\"\",\"url\":\"https://artifacts.opensearch.org/downloads/beats/metricbeat/metricbeat-6.3.2-amd64.deb\",\"tags\":\"success\",\"geo\":{\"srcdest\":\"CN:PL\",\"src\":\"CN\",\"coordinates\":{\"lat\":44.91167028,\"lon\":-108.4455092},\"dest\":\"PL\"},\"utc_time\":\"2024-09-05 15:29:01.686\",\"bytes\":2633,\"machine\":{\"os\":\"win xp\",\"ram\":21474836480},\"response\":\"404\",\"clientip\":\"239.67.210.53\",\"host\":\"artifacts.opensearch.org\",\"event\":{\"dataset\":\"sample_web_logs\"},\"phpmemory\":null,\"timestamp\":\"2024-09-05 15:29:01.686\"}}]'", + "sample_count":1, + "total_count":383, + "question":"Are there any errors in my logs?", + "ppl":"source=opensearch_dashboards_sample_data_logs| where QUERY_STRING(['response'], '4* OR 5*')" +} +``` +{% include copy-curl.html %} + +The following table describes the Assistant Data Summary API parameters. + +Parameter | Required/Optional | Description +:--- | :--- | :--- +`sample_data` | Required | A sample of data returned by the specified query and used as input for summarization. +`question` | Optional | The user's natural language question about the data, which guides the summary generation. +`ppl` | Optional | The Piped Processing Language (PPL) query used to retrieve data; in query assistance, this is generated by the LLM using the user's natural language question. +`sample_count` | Optional | The number of entries included in sample_data. +`total_count` | Optional | The total number of entries in the full query result set. + +## Viewing data summaries in OpenSearch Dashboards + +To view alert insights in OpenSearch Dashboards, use the following steps: + +1. On the top menu bar, go to **OpenSearch Dashboards > Discover**. + +1. From the query language dropdown list, select **PPL**. You will see the generated data summary after the query text, as shown in the following image. 
+ + data summary diff --git a/_dashboards/dashboards-assistant/index.md b/_dashboards/dashboards-assistant/index.md index bf2d754be89..615a53b75f1 100644 --- a/_dashboards/dashboards-assistant/index.md +++ b/_dashboards/dashboards-assistant/index.md @@ -2,7 +2,7 @@ layout: default title: OpenSearch Assistant for OpenSearch Dashboards nav_order: 3 -has_children: false +has_children: true has_toc: false --- @@ -22,7 +22,7 @@ To enable **OpenSearch Assistant** in OpenSearch Dashboards, locate your copy of ```yaml assistant.chat.enabled: true ``` -{% include copy-curl.html %} +{% include copy.html %} Then configure the root `agent_id` through the following API: @@ -131,8 +131,17 @@ assistant.next.enabled: true ``` {% include copy-curl.html %} +## Additional Dashboards Assistant capabilities + +For information about additional Dashboards Assistant capabilities, see the following pages: + +- [Generating alert insights]({{site.url}}{{site.baseurl}}/dashboards/dashboards-assistant/alert-insight/) +- [Generating data summaries]({{site.url}}{{site.baseurl}}/dashboards/dashboards-assistant/data-summary/) +- [Generating anomaly detector suggestions]({{site.url}}{{site.baseurl}}/dashboards/dashboards-assistant/suggest-anomaly-detector/) +- [Generating visualizations from text]({{site.url}}{{site.baseurl}}/dashboards/dashboards-assistant/text-to-visualization/) + ## Related articles - [Getting started guide for OpenSearch Assistant in OpenSearch Dashboards](https://github.com/opensearch-project/dashboards-assistant/blob/main/GETTING_STARTED_GUIDE.md) - [OpenSearch Assistant configuration through the REST API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/opensearch-assistant/) -- [Build your own chatbot]({{site.url}}{{site.baseurl}}/ml-commons-plugin/tutorials/build-chatbot/) \ No newline at end of file +- [Build your own chatbot]({{site.url}}{{site.baseurl}}/ml-commons-plugin/tutorials/build-chatbot/) diff --git a/_dashboards/dashboards-assistant/suggest-anomaly-detector.md b/_dashboards/dashboards-assistant/suggest-anomaly-detector.md new file mode 100644 index 00000000000..dca94c7b157 --- /dev/null +++ b/_dashboards/dashboards-assistant/suggest-anomaly-detector.md @@ -0,0 +1,89 @@ +--- +layout: default +title: Anomaly detector suggestions +parent: OpenSearch Assistant for OpenSearch Dashboards +nav_order: 1 +has_children: false +--- + +# Anomaly detector suggestions + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning} + +The OpenSearch Dashboards Assistant can use a large language model (LLM) to suggest the creation of an anomaly detector. The LLM analyzes data patterns in your OpenSearch indexes and recommends configuration settings for the anomaly detector, making it easier to identify unusual activity or trends in your data. + +## Configuration + +To configure anomaly detector suggestions, use the following steps. + +### Prerequisite + +Before using anomaly detector suggestions, enable query enhancements in OpenSearch Dashboards as follows: + +1. On the top menu bar, go to **Management > Dashboards Management**. +1. In the left navigation pane, select **Advanced settings**. +1. On the settings page, toggle **Enable query enhancements** to **On**. 
+ +### Step 1: Enable anomaly detector suggestions + +To enable anomaly detector suggestions, configure the following `opensearch_dashboards.yml` setting: + +```yaml +assistant.smartAnomalyDetector.enabled: true +``` +{% include copy.html %} + +### Step 2: Create an anomaly detector suggestion agent + +To orchestrate anomaly detector suggestions, create an anomaly detector suggestion [agent]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/). To create an agent, send a `POST /_plugins/_flow_framework/workflow?provision=true` request and provide the agent template as a payload. For more information, see [Configuring OpenSearch Assistant]({{site.url}}{{site.baseurl}}/dashboards/dashboards-assistant/index/#configuring-opensearch-assistant). + +For sample agent templates, see [Flow Framework sample templates](https://github.com/opensearch-project/flow-framework/tree/2.x/sample-templates). Note the agent ID; you'll use it in the following step. + +### Step 3: Configure the agent + +Next, configure the anomaly detector suggestion agent created in the previous step: + +```json +POST /.plugins-ml-config/_doc/os_suggest_ad +{ + "type": "suggest_anomaly_detector_agent", + "configuration": { + "agent_id": "" + } +} +``` +{% include copy-curl.html %} + +This example demonstrates a system index. In security-enabled domains, only superadmins have permissions to execute this code. For information about making superadmin calls, see [System indexes]({{site.url}}{{site.baseurl}}/security/configuration/system-indices/). For access permissions, contact your system administrator. +{: .warning} + +### Step 4: Test the agent + +You can verify that the agent was created successfully by calling the agent with an example payload: + +```json +POST /_plugins/_ml/agents//_execute +{ + "parameters": { + "index":"sample_weblogs_test" + } +} +``` +{% include copy-curl.html %} + +## Viewing anomaly detector suggestions in OpenSearch Dashboards + +To view anomaly detector suggestions in OpenSearch Dashboards, use the following steps: + +1. On the top menu bar, go to **OpenSearch Dashboards > Discover**. + +1. From the index pattern dropdown list, select an index pattern. + +1. Select the **AI assistant** dropdown list and then select **Suggest anomaly detector**, as shown in the following image. + + Click the Suggest anomaly detector action + +1. Wait for the LLM to populate the **Suggest anomaly detector** fields that will be used to create an anomaly detector for the index pattern. Then select the **Create detector** button to create an anomaly detector, as shown in the following image. + + Suggested anomaly detector diff --git a/_dashboards/dashboards-assistant/text-to-visualization.md b/_dashboards/dashboards-assistant/text-to-visualization.md new file mode 100644 index 00000000000..8bacc268268 --- /dev/null +++ b/_dashboards/dashboards-assistant/text-to-visualization.md @@ -0,0 +1,286 @@ +--- +layout: default +title: Text to visualization +parent: OpenSearch Assistant for OpenSearch Dashboards +nav_order: 1 +has_children: false +--- + +# Text to visualization + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning} + +The OpenSearch Dashboards Assistant can create visualizations using natural language instructions. + +## Configuration + +To configure text to visualization, use the following steps. 
+ +### Step 1: Enable text to visualization + +To enable text to visualization, configure the following `opensearch_dashboards.yml` setting: + +```yaml +assistant.text2viz.enabled: true +``` +{% include copy.html %} + +### Step 2: Create the agents + +To orchestrate text to visualization, you'll need to create the necessary [agents]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/). Create a workflow template that registers all of the necessary text-to-visualization agents by sending the following request: + +<
+ + Request + + {: .text-delta} + +```json +POST /_plugins/_flow_framework/workflow +{ + "name": "Text to visualization agents", + "description": "This template is to create all Agents required for text to visualization", + "use_case": "REGISTER_AGENTS", + "version": { + "template": "1.0.0", + "compatibility": [ + "2.18.0", + "3.0.0" + ] + }, + "workflows": { + "provision": { + "user_params": {}, + "nodes": [ + { + "id": "create_claude_connector", + "type": "create_connector", + "previous_node_inputs": {}, + "user_inputs": { + "credential": { + "access_key": "", + "secret_key": "", + "session_token": "" + }, + "parameters": { + "endpoint": "bedrock-runtime.us-east-1.amazonaws.com", + "content_type": "application/json", + "auth": "Sig_V4", + "max_tokens_to_sample": "8000", + "service_name": "bedrock", + "temperature": "0.0000", + "response_filter": "$.content[0].text", + "region": "us-east-1", + "anthropic_version": "bedrock-2023-05-31" + }, + "version": "1", + "name": "Claude haiku runtime Connector", + "protocol": "aws_sigv4", + "description": "The connector to BedRock service for claude model", + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "https://bedrock-runtime.us-east-1.amazonaws.com/model/anthropic.claude-3-haiku-20240307-v1:0/invoke", + "headers": { + "content-type": "application/json", + "x-amz-content-sha256": "required" + }, + "request_body": "{\"messages\":[{\"role\":\"user\",\"content\":[{\"type\":\"text\",\"text\":\"${parameters.prompt}\"}]}],\"anthropic_version\":\"${parameters.anthropic_version}\",\"max_tokens\":${parameters.max_tokens_to_sample}}" + } + ] + } + }, + { + "id": "register_claude_model", + "type": "register_remote_model", + "previous_node_inputs": { + "create_claude_connector": "connector_id" + }, + "user_inputs": { + "name": "claude-haiku", + "description": "Claude model", + "deploy": true + } + }, + { + "id": "create_t2vega_tool", + "type": "create_tool", + "previous_node_inputs": { + "register_claude_model": "model_id" + }, + "user_inputs": { + "parameters": { + "prompt": "You're an expert at creating vega-lite visualization. No matter what the user asks, you should reply with a valid vega-lite specification in json.\nYour task is to generate Vega-Lite specification in json based on the given sample data, the schema of the data, the PPL query to get the data and the user's input.\nLet's start from dimension and metric/date. Now I have a question, I already transfer it to PPL and query my Opensearch cluster. \nThen I get data. For the PPL, it will do aggregation like \"stats AVG(field_1) as avg, COUNT(field_2) by field_3, field_4, field_5\". \nIn this aggregation, the metric is [avg, COUNT(field_2)] , and then we judge the type of field_3,4,5. If only field_5 is type related to date, the dimension is [field_3, field_4], and date is [field_5]\nFor example, stats SUM(bytes) by span(timestamp, 1w), machine.os, response, then SUM(bytes) is metric and span(timestamp, 1w) is date, while machine.os, response are dimensions.\nNotice: Some fields like 'span()....' will be the date, but not metric and dimension. \nAnd one field will only count once in dimension count. You should always pick field name from schema\nTo summarize,\nA dimension is a categorical variable that is used to group, segment, or categorize data. It is typically a qualitative attribute that provides context for metrics and is used to slice and dice data to see how different categories perform in relation to each other.\nThe dimension is not date related fields. 
The dimension and date are very closed. The only difference is date is related to datetime, while dimension is not.\nA metric is a quantitative measure used to quantify or calculate some aspect of the data. Metrics are numerical and typically represent aggregated values like sums, averages, counts, or other statistical calculations.\n\nIf a ppl doesn't have aggregation using 'stats', then each field in output is dimension.\nOtherwise, if a ppl use aggregation using 'stats' but doesn't group by using 'by', then each field in output is metric.\n\nThen for each given PPL, you could give the metric and dimension and date. One field will in only one of the metric, dimension or date.\n\nThen according to the metric number and dimension number of PPL result, you should first format the entrance code by metric_number, dimension_number, and date_number. For example, if metric_number = 1, dimension_number = 2, date_number=1, then the entrance code is 121.\nI define several use case categories here according to the entrance code.\nFor each category, I will define the entrance condition (number of metric and dimension)\nI will also give some defined attribute of generated vega-lite. Please refer to it to generate vega-lite.\n\nType 1:\nEntrance code: <1, 1, 0>\nDefined Attributes:\n {\n \"title\": \"\",\n \"description\": \"<description>\",\n \"mark\": \"bar\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<metric name>\",\n \"type\": \"quantitative\"\n },\n \"y\": {\n \"field\": \"<dimension name>\",\n \"type\": \"nominal\"\n }\n },\n }\n\nType 2:\nEntrance code: <1, 2, 0>\nDefined Attributes:\n{\n \"mark\": \"bar\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<metric 1>\",\n \"type\": \"quantitative\"\n },\n \"y\": {\n \"field\": \"<dimension 1>\",\n \"type\": \"nominal\"\n },\n \"color\": {\n \"field\": \"<dimension 2>\",\n \"type\": \"nominal\"\n }\n }\n }\n\n\nType 3\nEntrance code: <3, 1, 0>\nDefined Attributes:\n{\n \"mark\": \"point\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<metric 1>\",\n \"type\": \"quantitative\"\n },\n \"y\": {\n \"field\": \"<metric 2>\",\n \"type\": \"quantitative\"\n },\n \"size\": {\n \"field\": \"<metric 3>\",\n \"type\": \"quantitative\"\n },\n \"color\": {\n \"field\": \"<dimension 1>\",\n \"type\": \"nominal\"\n }\n }\n}\n\nType 4\nEntrance code: <2, 1, 0>\nDefined Attributes:\n{\n \"mark\": \"point\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<mtric 1>\",\n \"type\": \"quantitative\"\n },\n \"y\": {\n \"field\": \"<metric 2>\",\n \"type\": \"quantitative\"\n },\n \"color\": {\n \"field\": \"<dimension 1>\",\n \"type\": \"nominal\"\n }\n }\n}\n\nType 5:\nEntrance code: <2, 1, 1>\nDefined Attributes:\n{\n \"layer\": [\n {\n \"mark\": \"bar\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<date 1>\",\n \"type\": \"temporal\"\n },\n \"y\": {\n \"field\": \"<metric 1>\",\n \"type\": \"quantitative\",\n \"axis\": {\n \"title\": \"<metric 1 name>\"\n }\n },\n \"color\": {\n \"field\": \"<dimension 1>\",\n \"type\": \"nominal\"\n }\n }\n },\n {\n \"mark\": {\n \"type\": \"line\",\n \"color\": \"red\"\n },\n \"encoding\": {\n \"x\": {\n \"field\": \"<date 1>\",\n \"type\": \"temporal\"\n },\n \"y\": {\n \"field\": \"<metric 2>\",\n \"type\": \"quantitative\",\n \"axis\": {\n \"title\": \"<metric 2 name>\",\n \"orient\": \"right\"\n }\n },\n \"color\": {\n \"field\": \"<dimension 1>\",\n \"type\": \"nominal\"\n }\n }\n }\n ],\n \"resolve\": {\n \"scale\": {\n \"y\": \"independent\"\n }\n }\n }\n\nType 6:\nEntrance code: <2, 0, 1>\nDefined Attributes:\n{\n \"title\": 
\"<title>\",\n \"description\": \"<description>\",\n \"layer\": [\n {\n \"mark\": \"area\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<date 1>\",\n \"type\": \"temporal\"\n },\n \"y\": {\n \"field\": \"<metric 1>\",\n \"type\": \"quantitative\",\n \"axis\": {\n \"title\": \"<metric 1 name>\"\n }\n }\n }\n },\n {\n \"mark\": {\n \"type\": \"line\",\n \"color\": \"black\"\n },\n \"encoding\": {\n \"x\": {\n \"field\": \"date\",\n \"type\": \"temporal\"\n },\n \"y\": {\n \"field\": \"metric 2\",\n \"type\": \"quantitative\",\n \"axis\": {\n \"title\": \"<metric 2 name>\",\n \"orient\": \"right\"\n }\n }\n }\n }\n ],\n \"resolve\": {\n \"scale\": {\n \"y\": \"independent\"\n }\n }\n }\n \nType 7:\nEntrance code: <1, 0, 1>\nDefined Attributes:\n{\n \"title\": \"<title>\",\n \"description\": \"<description>\",\n \"mark\": \"line\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<date 1>\",\n \"type\": \"temporal\",\n \"axis\": {\n \"title\": \"<date name>\"\n }\n },\n \"y\": {\n \"field\": \"<metric 1>\",\n \"type\": \"quantitative\",\n \"axis\": {\n \"title\": \"<metric name>\"\n }\n }\n }\n }\n\nType 8:\nEntrance code: <1, 1, 1>\nDefined Attributes:\n{\n \"title\": \"<title>\",\n \"description\": \"<description>\",\n \"mark\": \"line\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<date 1>\",\n \"type\": \"temporal\",\n \"axis\": {\n \"title\": \"<date name>\"\n }\n },\n \"y\": {\n \"field\": \"<metric 1>\",\n \"type\": \"quantitative\",\n \"axis\": {\n \"title\": \"<metric name>\"\n }\n },\n \"color\": {\n \"field\": \"<dimension 1>\",\n \"type\": \"nominal\",\n \"legend\": {\n \"title\": \"<dimension name>\"\n }\n }\n }\n }\n\nType 9:\nEntrance code: <1, 2, 1>\nDefined Attributes:\n{\n \"title\": \"<title>\",\n \"description\": \"<description>\",\n \"mark\": \"line\",\n \"encoding\": {\n \"x\": {\n \"field\": \"<date 1>\",\n \"type\": \"temporal\",\n \"axis\": {\n \"title\": \"<date name>\"\n }\n },\n \"y\": {\n \"field\": \"<metric 1>\",\n \"type\": \"quantitative\",\n \"axis\": {\n \"title\": \"<metric 1>\"\n }\n },\n \"color\": {\n \"field\": \"<dimension 1>\",\n \"type\": \"nominal\",\n \"legend\": {\n \"title\": \"<dimension 1>\"\n }\n },\n \"facet\": {\n \"field\": \"<dimension 2>\",\n \"type\": \"nominal\",\n \"columns\": 2\n }\n }\n }\n\nType 10:\nEntrance code: all other code\nAll others type.\nUse a table to show the result\n\n\nBesides, here are some requirements:\n1. Do not contain the key called 'data' in vega-lite specification.\n2. If mark.type = point and shape.field is a field of the data, the definition of the shape should be inside the root \"encoding\" object, NOT in the \"mark\" object, for example, {\"encoding\": {\"shape\": {\"field\": \"field_name\"}}}\n3. Please also generate title and description\n\nThe sample data in json format:\n${parameters.sampleData}\n\nThis is the schema of the data:\n${parameters.dataSchema}\n\nThe user used this PPL query to get the data: ${parameters.ppl}\n\nThe user's question is: ${parameters.input_question}\n\nNotice: Some fields like 'span()....' will be the date, but not metric and dimension. \nAnd one field will only count once in dimension count. You should always pick field name from schema.\n And when you code is <2, 1, 0>, it belongs type 4.\n And when you code is <1, 2, 0>, it belongs type 9.\n\n\nNow please reply a valid vega-lite specification in json based on above instructions.\nPlease return the number of dimension, metric and date. Then choose the type. 
\nPlease also return the type.\nFinally return the vega-lite specification according to the type.\nPlease make sure all the key in the schema matches the word I given. \nYour answer format should be:\nNumber of metrics:[list the metric name here, Don't use duplicate name] <number of metrics {a}> \nNumber of dimensions:[list the dimension name here] <number of dimension {b}> \nNumber of dates:[list the date name here] <number of dates {c}> \nThen format the entrance code by: <Number of metrics, Number of dimensions, Number of dates>\nType and its entrance code: <type number>: <its entrance code>\nThen apply the vega-lite requirements of the type.\n<vega-lite> {here is the vega-lite json} </vega-lite>\n\nAnd don't use 'transformer' in your vega-lite and wrap your vega-lite json in <vega-lite> </vega-lite> tags\n" + }, + "name": "Text2Vega", + "type": "MLModelTool" + } + }, + { + "id": "create_instruction_based_t2vega_tool", + "type": "create_tool", + "previous_node_inputs": { + "register_claude_model": "model_id" + }, + "user_inputs": { + "parameters": { + "prompt": "You're an expert at creating vega-lite visualization. No matter what the user asks, you should reply with a valid vega-lite specification in json.\nYour task is to generate Vega-Lite specification in json based on the given sample data, the schema of the data, the PPL query to get the data and the user's input.\n\nBesides, here are some requirements:\n1. Do not contain the key called 'data' in vega-lite specification.\n2. If mark.type = point and shape.field is a field of the data, the definition of the shape should be inside the root \"encoding\" object, NOT in the \"mark\" object, for example, {\"encoding\": {\"shape\": {\"field\": \"field_name\"}}}\n3. Please also generate title and description\n\nThe sample data in json format:\n${parameters.sampleData}\n\nThis is the schema of the data:\n${parameters.dataSchema}\n\nThe user used this PPL query to get the data: ${parameters.ppl}\n\nThe user's input question is: ${parameters.input_question}\nThe user's instruction on the visualization is: ${parameters.input_instruction}\n\nNow please reply a valid vega-lite specification in json based on above instructions.\nPlease only contain vega-lite in your response.\n" + }, + "name": "Text2Vega", + "type": "MLModelTool" + } + }, + { + "id": "t2vega_agent", + "type": "register_agent", + "previous_node_inputs": { + "create_t2vega_tool": "tools" + }, + "user_inputs": { + "parameters": {}, + "type": "flow", + "name": "t2vega agent", + "description": "this is the t2vega agent that has a set of rules to generate the visualizations" + } + }, + { + "id": "t2vega_instruction_based_agent", + "type": "register_agent", + "previous_node_inputs": { + "create_instruction_based_t2vega_tool": "tools" + }, + "user_inputs": { + "parameters": {}, + "type": "flow", + "name": "t2vega instruction based agent", + "description": "this is the t2vega agent that supports instructions" + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +</details> + +Use the workflow ID returned in the response to provision the resources: + +```json +POST /_plugins/_flow_framework/workflow/<workflow_id>/_provision +``` +{% include copy-curl.html %} + +To view the status of the workflow and all created resources, send the following request: + +```json +GET /_plugins/_flow_framework/workflow/<workflow_id>/_status +``` +{% include copy-curl.html %} + +### Step 3: Configure the root agent + +Next, configure a root agent for text to visualization: + +```json +POST 
/.plugins-ml-config/_doc/os_text2vega +{ + "type": "os_chat_root_agent", + "configuration": { + "agent_id": "<ROOT_AGENT_ID>" + } +} +``` +{% include copy-curl.html %} + +Configure the agent to receive user instructions for creating visualizations: + +```json +POST /.plugins-ml-config/_doc/os_text2vega_with_instructions +{ + "type": "os_chat_root_agent", + "configuration": { + "agent_id": "<ROOT_AGENT_ID>" + } +} +``` +{% include copy-curl.html %} + +This example demonstrates a system index. In security-enabled domains, only superadmins have permissions to execute this code. For information about making superadmin calls, see [System indexes]({{site.url}}{{site.baseurl}}/security/configuration/system-indices/). For access permissions, contact your system administrator. +{: .warning} + +### Step 4: Test the agent + +You can verify that the agent was created successfully by calling the agent with an example payload: + +```json +POST /_plugins/_ml/agents/<ROOT_AGENT_ID>/_execute +{ + "parameters": { + "input_question": "find unique visitors and average bytes every 3 hours", + "input_instruction": "display with different layers, use independent scale for different layers, display unique visitors with light blue bar chart", + "ppl": "source=opensearch_dashboards_sample_data_ecommerce| stats DISTINCT_COUNT(user) as unique_visitors, AVG(taxful_total_price) as avg_bytes by span(order_date, 3h)", + "sampleData": """[{\"unique_visitors\":15,\"avg_bytes\":90.98684210526316,\"span(order_date,3h)\":\"2024-04-25 00:00:00\"},{\"unique_visitors\":14,\"avg_bytes\":72.72083333333333,\"span(order_date,3h)\":\"2024-04-25 03:00:00\"}]""", + "dataSchema": """[{\"name\":\"unique_visitors\",\"type\":\"integer\"},{\"name\":\"avg_bytes\",\"type\":\"double\"},{\"name\":\"span(order_date,3h)\",\"type\":\"timestamp\"}]""" + } +} +``` +{% include copy-curl.html %} + +## Generating a visualization from text + +You can generate a visualization from text by calling the `/api/assistant/text2vega` API endpoint. The `input_instruction` parameter is optional: + +```json +POST /api/assistant/text2vega +{ + "input_instruction": "<input_instruction>", + "input_question": "<input_question>", + "ppl": "<ppl_query>", + "dataSchema": "<data_schema_of_ppl_response>", + "sampleData": "<sample_data_of_ppl_response>" +} +``` +{% include copy-curl.html %} + +The following table describes the Text to Visualization API parameters. + +Parameter | Required/Optional | Description +:--- | :--- | :--- +`input_question` | Required | The user's original question used to generate the corresponding Piped Processing Language (PPL) query. +`ppl` | Required | The generated PPL query that retrieves the data required for the visualization. +`dataSchema` | Required | Describes the structure and types of the data fields in the visualization output, based on the PPL response. +`sampleData` | Required | Provides sample entries from the data that will populate the visualization. +`input_instruction` | Optional | Specifies the styling instructions, such as colors, for the visualization. + +## Generating visualizations from text in OpenSearch Dashboards + +To generate visualizations from text in OpenSearch Dashboards, use the following steps: + +1. On the top menu bar, go to **OpenSearch Dashboards > Visualize** and then select **Create visualization**. + +1. In the **New Visualization** dialog, select **Natural language**, as shown in the following image. 
+ + <img width="800px" src="{{site.url}}{{site.baseurl}}/images/dashboards-assistant/t2viz-start.png" alt="Create a visualization by selecting natural language"> + +1. From the data sources dropdown list, select a data source, as shown in the following image. + + <img src="{{site.url}}{{site.baseurl}}/images/dashboards-assistant/t2viz-select-data-source.png" alt="Create a visualization by selecting natural language"> + +1. In the text box on the upper right, enter a question using natural language. A new visualization is generated, as shown in the following image. + + <img src="{{site.url}}{{site.baseurl}}/images/dashboards-assistant/t2viz-ask-question.png" alt="Create a visualization by selecting natural language"> + +1. To modify the generated visualization, select **Edit visual**. In the **Edit visual** dialog, enter the desired modifications and then select **Apply**, as shown in the following image. + + <img src="{{site.url}}{{site.baseurl}}/images/dashboards-assistant/t2viz-edit-visual.png" alt="Create a visualization by selecting natural language"> + + The visualization is updated, as shown in the following image. + + <img src="{{site.url}}{{site.baseurl}}/images/dashboards-assistant/t2viz-edit-visual-response.png" alt="Create a visualization by selecting natural language"> + + + diff --git a/_dashboards/dev-tools/index-dev.md b/_dashboards/dev-tools/index-dev.md index 814c2320976..d4a6397e4a7 100644 --- a/_dashboards/dev-tools/index-dev.md +++ b/_dashboards/dev-tools/index-dev.md @@ -1,21 +1,148 @@ --- layout: default title: Dev Tools -nav_order: 120 -has_children: true +nav_order: 110 +redirect_from: + - /dashboards/run-queries/ + - /dashboards/dev-tools/run-queries/ --- -# Dev Tools +# Running queries in the Dev Tools console -**Dev Tools** is a development environment that lets you set up your OpenSearch Dashboards environment, run queries, explore data, and debug problems. You can use the Dev Tools console to: +You can use the OpenSearch Dev Tools console to send queries to OpenSearch. -- **Set up your OpenSearch Dashboards environment.** For example, you can use the console to configure authentication settings for your OpenSearch Dashboards instance. -- **[Run queries to explore your data]({{site.url}}{{site.baseurl}}/dashboards/dev-tools/run-queries/).** For example, you can use the console to tune your queries for relevance. -- **Debug problems with your queries.** For example, if your query is not returning the results you expect, you can use the console to identify the problem. -- **Learn about the APIs in OpenSearch.** For example, you can use the API reference documentation linked in the console to look up the syntax for different API calls (select the question circle icon ({::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/icons/question-circle.png" class="inline-icon" alt="question circle icon"/>{:/})). -- **Develop custom visualizations.** For example, you can use the console to create [Vega visualizations]({{site.url}}{{site.baseurl}}/dashboards/visualize/viz-index/#vega). -- **Customize the appearance and behavior of dashboards.** For example, you can use the console to customize dashboard visualization colors or to add new filters. +## Navigating to the console -To access the console, go to the OpenSearch Dashboards main menu and select **Management** > **Dev Tools**. An example is shown in the following image. 
+To open the console, select **Dev Tools** on the main OpenSearch Dashboards page: -<img src="{{site.url}}{{site.baseurl}}/images/dashboards/dev-tools-ui.png" alt="Dev Tools console interface" width="700"/> +<img src="{{site.url}}{{site.baseurl}}/images/dev-tools/dev-tools-main.png" alt="Dev Tools console from main page">{: .img-fluid } + +You can open the console from any other page by navigating to the main menu and selecting **Management** > **Dev Tools**. + +<img src="{{site.url}}{{site.baseurl}}/images/dev-tools/dev-tools-left.png" width=200 alt="Dev Tools console from all pages"> + +## Writing queries + +Write your queries in the editor pane on the left side of the console: + +<img src="{{site.url}}{{site.baseurl}}/images/dev-tools/dev-tools-request.png" alt="Request pane">{: .img-fluid } + +You can collapse and expand parts of your query by selecting the small triangles next to the line numbers. +{: .tip} + +To learn more about writing queries in OpenSearch domain-specific language (DSL), see [Query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/). + +You can import or export queries by selecting **Import** or **Export** from the top menu. + +### Comments + +Use `#` at the beginning of a line to write single-line comments. + +### Autocomplete + +OpenSearch provides autocomplete suggestions for fields, indexes and their aliases, and templates. To configure autocomplete preferences, update them in [Console Settings](#updating-the-console-settings). + +## Sending the request + +To send a query to OpenSearch, select the query by placing the cursor anywhere in the query text. Then choose the play icon ({::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/dev-tools/play-icon.png" class="inline-icon" alt="play icon"/>{:/}) on the upper right of the request or press `Ctrl/Cmd+Enter`: + +<img src="{{site.url}}{{site.baseurl}}/images/dev-tools/dev-tools-send.png" alt="Send request"> + +OpenSearch displays the response in the response pane on the right side of the console: + +<img src="{{site.url}}{{site.baseurl}}/images/dev-tools/dev-tools-response.png" alt="Response pane">{: .img-fluid } + +## Working in the cURL and console formats + +The console uses an easier syntax to format REST requests than the `curl` command. + +For example, the following `curl` command runs a search query: + +```bash +curl -XGET http://localhost:9200/shakespeare/_search?pretty -H 'Content-Type: application/json' -d' +{ + "query": { + "match": { + "text_entry": "To be, or not to be" + } + } +}' +``` + +The same query has a simpler syntax in the console format: + +```json +GET shakespeare/_search +{ + "query": { + "match": { + "text_entry": "To be, or not to be" + } + } +} +``` + +If you paste a `curl` command directly into the console, the command is automatically converted into the format the console uses. + +To import a query in cURL format, select the query, select the wrench icon ({::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/dev-tools/wrench-icon.png" class="inline-icon" alt="wrench icon"/>{:/}), and choose **Copy as cURL**: + +<img src="{{site.url}}{{site.baseurl}}/images/dev-tools/dev-tools-tools.png" alt="Console tools"> + +## Using triple quotation marks in queries + +When writing queries containing quotation marks (`"`) and backslash (`\`) characters, you can use triple quotation marks (`"""`) to avoid escaping the characters. 
This format improves readability and helps avoid escape characters when writing large or complex strings, especially when working with deeply nested JSON strings. + +You can index a document containing special characters by escaping each special character with a backslash: + +```json +PUT /testindex/_doc/1 +{ + "test_query": "{ \"query\": { \"query_string\": { \"query\": \"host:\\\"127.0.0.1\\\"\" } } }" +} +``` + +Alternatively, you can use triple quotation marks for a simpler format: + +``` +PUT /testindex/_doc/1 +{ + "test_query": """{ "query": { "query_string": { "query": "host:\"127.0.0.1\"" } } }""" +} +``` + +Triple quotation marks are only supported in the Dev Tools console---not in `curl` or other HTTP clients. To import a query with triple quotation marks in cURL format, use **Copy as cURL**. +{: .tip} + +If a response contains the `\n`, `\t`, `\`, or `"` special characters, the console formats the response using triple quotation marks. To turn off this behavior, select **Settings** from the top menu and toggle **JSON syntax**. +{: .tip} + +## Viewing documentation + +To view the OpenSearch documentation, select the wrench icon ({::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/dev-tools/wrench-icon.png" class="inline-icon" alt="wrench icon"/>{:/}) and choose **Open documentation**. + +## Auto indenting + +To use auto indent, select the queries that you want to format, select the wrench icon ({::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/dev-tools/wrench-icon.png" class="inline-icon" alt="wrench icon"/>{:/}), and choose **Auto indent**. + +Auto indenting a collapsed query expands it. + +Auto indenting a well-formatted query puts the request body on a single line. This is useful for working with [bulk APIs]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/). +{: .tip} + +## Viewing your request history + +You can view up to the 500 most recent requests that OpenSearch ran successfully. To view request history, select **History** from the top menu. If you select the request you want to view from the left pane, the query is shown in the right pane. + +To copy the query into the editor pane, select the query text and then select **Apply**. + +To clear the history, select **Clear**. + +## Updating the console settings + +To update your preferences, select **Settings** from the top menu: + +<img src="{{site.url}}{{site.baseurl}}/images/dev-tools/dev-tools-settings.png" width=400 alt="Settings"> + +## Using keyboard shortcuts + +To view all available keyboard shortcuts, select **Help** from the top menu. \ No newline at end of file diff --git a/_dashboards/dev-tools/run-queries.md b/_dashboards/dev-tools/run-queries.md deleted file mode 100644 index 7f92de9fa79..00000000000 --- a/_dashboards/dev-tools/run-queries.md +++ /dev/null @@ -1,66 +0,0 @@ ---- -layout: default -title: Running queries in the Dev Tools console -parent: Dev Tools -nav_order: 10 -redirect_from: - - /dashboards/run-queries/ ---- - -# Running queries in the Dev Tools console - -The Dev Tools console can be used to send queries to OpenSearch. To access the console, go to the OpenSearch Dashboards main menu and select **Management** > **Dev Tools**. -## Writing queries - -OpenSearch provides a query domain-specific language (DSL) called [Query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/). It is a flexible language with a JSON interface. - -To write your queries, use the editor pane on the left side of the console. 
To send a query to OpenSearch, select the query by placing the cursor in the query text and then selecting the play icon ({::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/icons/play-icon.png" class="inline-icon" alt="play icon"/>{:/}) on the upper right of the request or press `Ctrl/Cmd+Enter`. The response from OpenSearch is displayed in the response pane on the right side of the console. To run multiple commands simultaneously, select all the commands in the editor pane, and then select the play icon or press `Ctrl/Cmd+Enter`. - -An example of the query and response panes is shown in the following image. - -<img src="{{site.url}}{{site.baseurl}}/images/dashboards/query-request-ui.png" alt="Console UI with query and request"> - -### Query options - -When writing queries using the console, there are common actions that can help you write queries more efficiently and accurately. The following table describes these features and how you can use them. - -Feature | How to use | ---------|------------| -**Collapsing or expanding a query** | To hide or show details of your query, select the expander arrow ({::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/icons/arrow-down-icon.png" class="inline-icon" alt="arrow down icon"/>{:/}) next to the line number. | -**Auto indenting** | To use auto indent, select the queries that you want to format, then select the wrench icon ({::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/icons/wrench-icon.png" class="inline-icon" alt="wrench icon"/>{:/}), and choose **Auto indent**. | -**Autocomplete** | To define your preferences for autocomplete suggestions, configure them in **Settings**. | -**Request history** | To view request history, select **History** from the top menu. If you select the request you want to view from the left pane, the query is shown in the right pane. To copy the query into the editor pane, select the query text and then select **Apply**. To clear the history, select **Clear**. | -**Keyboard shortcuts** | To view all available keyboard shortcuts, select **Help** from the top menu. | -**Documentation access from the console** | To access OpenSearch documentation from the console, select the wrench icon ({::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/icons/wrench-icon.png" class="inline-icon" alt="wrench icon"/>{:/}) and choose **Open documentation**. | - -## Working in the cURL and console formats - -The console uses a simplified syntax to format REST requests instead of the `curl` command. If you paste a `curl` command directly into the console, the command is automatically converted into the format used by the console. To import a query in cURL format, select the query, then select the wrench icon ({::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/icons/wrench-icon.png" class="inline-icon" alt="wrench icon"/>{:/}), and choose **Copy as cURL**. 
- -For example, the following `curl` command runs a search query: - -```bash -curl -XGET http://localhost:9200/shakespeare/_search?pretty -H 'Content-Type: application/json' -d' -{ - "query": { - "match": { - "text_entry": "To be, or not to be" - } - } -}' -``` -{% include copy.html %} - -The same query has a simplified syntax in the console format, as shown in the following example: - -```json -GET shakespeare/_search -{ - "query": { - "match": { - "text_entry": "To be, or not to be" - } - } -} -``` -{% include copy-curl.html %} diff --git a/_dashboards/dql.md b/_dashboards/dql.md index 7ddcbc6d1ba..ec558d1ffd7 100644 --- a/_dashboards/dql.md +++ b/_dashboards/dql.md @@ -1,7 +1,7 @@ --- layout: default title: Dashboards Query Language (DQL) -nav_order: 130 +nav_order: 125 redirect_from: - /dashboards/dql/ - /dashboards/discover/dql/ @@ -9,12 +9,19 @@ redirect_from: # Dashboards Query Language (DQL) -Dashboards Query Language (DQL) is a simple text-based query language used to filter data in OpenSearch Dashboards. For example, to display your site visitor data for a host in the United States, you would enter `geo.dest:US` in the search field, as shown in the following image. +Dashboards Query Language (DQL) is a simple text-based query language used to filter data in OpenSearch Dashboards. -<img src="{{site.url}}{{site.baseurl}}/images/dashboards/dql-interface.png" alt="Search term using DQL toolbar in Dashboard" width="500"> +DQL and [query string query]({{site.url}}{{site.baseurl}}/query-dsl/full-text/query-string/) (Lucene) language are the two search bar language options in Discover and Dashboards. This page provides a reference for the DQL syntax. For the Lucene syntax, see [Query string query]({{site.url}}{{site.baseurl}}/query-dsl/full-text/query-string/). For a syntax comparison, see the [Command quick reference](#dql-and-query-string-query-quick-reference). -DQL and query string query (Lucene) language are the two search bar language options in Discover and Dashboards. To compare these language options, see [Discover and Dashboard search bar]({{site.url}}{{site.baseurl}}/dashboards/index/#discover-and-dashboard-search-bar). -{: .tip} +By default, OpenSearch Dashboards uses DQL syntax. To switch to query string query (Lucene), select the **DQL** button next to the search box and then toggle the **On** switch, as shown in the following image. + +![Search term using DQL toolbar in Dashboard]({{site.url}}{{site.baseurl}}/images/dashboards/dql-interface.png) + +The syntax changes to **Lucene**. To switch back to DQL, select the **Lucene** button and toggle the **Off** switch. + +## Queries on analyzed text + +When running queries, understanding whether your fields are analyzed ([`text`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/text/) type) or non-analyzed ([`keyword`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/keyword/) type) is crucial because it significantly impacts search behavior. In analyzed fields, text undergoes tokenization and filtering, while non-analyzed fields store exact values. For simple field queries like `wind`, searches against analyzed fields match documents containing `wind` regardless of case, while the same query on keyword fields requires exact matching of the full string. For more information about analyzed fields, see [Text analysis]({{site.url}}{{site.baseurl}}/analyzers/). 
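For example, consider an index in which the same value is mapped both as `text` and as a `keyword` subfield. The following mapping is a minimal sketch; the `movies` index name and the `title` field with its `keyword` subfield are hypothetical and are not part of the sample data used elsewhere on this page:

```json
PUT /movies
{
  "mappings": {
    "properties": {
      "title": {
        "type": "text",
        "fields": {
          "keyword": { "type": "keyword" }
        }
      }
    }
  }
}
```
{% include copy-curl.html %}

With this mapping, the DQL query `title: wind` matches a document titled `The Wind Rises` because the analyzed `text` field is tokenized and lowercased at index time, while `title.keyword: wind` matches only documents whose entire stored title is exactly `wind`.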
## Setup @@ -116,6 +123,36 @@ The [Object fields](#object-fields) and [Nested fields](#nested-fields) sections {: .note} </details> +## DQL and query string query quick reference + +The following table provides a quick reference for both query language commands. + +| Feature | DQL | Query string query (Lucene)| +|:---|:---|:---| +| Basic term search | `wind` | `wind` | +| Multiple terms | `wind gone` (finds documents containing `wind` or `gone`) | `wind gone` (finds documents containing `wind` or `gone`) | +| Exact phrase search | `"wind rises"` | `"wind rises"` | +| Field-specific search | `title: wind` | `title:wind` | +| Existence of a field | `description:*` | `_exists_:description` | +| Multiple terms in field | `title: (wind OR rises)` <br><br> | `title:(wind OR rises)` | +| Field containing spaces | `article*title: wind` | `article\ title:wind` | +| Escaping special characters | `format: 2\*3` | `format:2\*3` | +| Multiple field search | `title: wind OR description: film` | `title:wind OR description:film` | +| Nested field search | See [Nested fields](#nested-fields) | Not supported | +| Numeric range | `page_views >= 100 and page_views <= 300` <br><br> `not page_views: 100` (results include documents that don't contain a `page_views` field) <br><br> See [Ranges](#ranges)| `page_views:[100 TO 300]` <br><br> `page_views:(>=100 AND <=300)` <br><br> `page_views:(+>=100 +<=300)` <br><br> `page_views:[100 TO *]` <br><br> `page_views:>=100` <br><br> `NOT page_views:100` (results include documents that don't contain a `page_views` field) <br><br> See [Ranges]({{site.url}}{{site.baseurl}}/query-dsl/full-text/query-string/#ranges)| +| Date range | `date >= "1939-01-01" and date <= "2013-12-31"` <br><br> `not date: "1939-09-08"` | `date:[1939-01-01 TO 2013-12-31]` <br><br> `NOT date:1939-09-08` <br><br> Supports all numeric range syntax constructs| +| Exclusive range | Not supported | `page_views: {100 TO 300}` (returns documents whose `page_views` are between `100` and `300`, excluding `100` and `300`) | +| Boolean `AND` | `media_type:film AND page_views:100` <br><br> `media_type:film and page_views:100`| `media_type:film AND page_views:100` <br><br> `+media_type:film +page_views:100`| +| Boolean `NOT` | `NOT media_type: article` <br><br> `not media_type: article` | `NOT media_type:article` <br><br> `-media_type:article` | +| Boolean `OR` | `title: wind OR description: film` <br><br> `title: wind or description: film` | `title: wind OR description: film` | +| Required/Prohibited operators | Not supported | Supports both `+` (required operator) and `-` (prohibited operator) <br><br> `+title:wind -media_type:article` (returns documents in which `title` contains `wind` but `media_type` does not contain `article`) | +| Wildcards | `title: wind*`<br><br> `titl*: wind` <br><br> Does not support wildcards in phrase searches (within quotation marks) <br><br> Only supports `*` (multiple characters) | `title:wind*` or `title:w?nd` <br><br> Does not support wildcards in field names <br><br> Does not support wildcards in phrase searches (within quotation marks) <br><br> Supports `*` (multiple characters) and `?` (single character) | +| Regular expressions | Not supported | `title:/w[a-z]nd/` | +| Fuzzy search | Not supported | `title:wind~2` | +| Proximity search | Not supported | `"wind rises"~2` | +| Boosting terms | Not supported | `title:wind^2` | +| Reserved characters | `\ ( ) : < > " *` | `+ - = && \|\| > < ! ( ) { } [ ] ^ " ~ * ? 
: \ /` | + ## Search for terms By default, DQL searches in the field set as the default field on the index. If the default field is not set, DQL searches all fields. For example, the following query searches for documents containing the words `rises` or `wind` in any of their fields: diff --git a/_integrations/index.md b/_dashboards/integrations/index.md similarity index 96% rename from _integrations/index.md rename to _dashboards/integrations/index.md index 644f3fccd22..cf5e3d3636f 100644 --- a/_integrations/index.md +++ b/_dashboards/integrations/index.md @@ -1,12 +1,12 @@ --- layout: default title: Integrations in OpenSearch Dashboards -nav_order: 1 +nav_order: 135 has_children: false -nav_exclude: true -permalink: /integrations/ redirect_from: + - /integrations/ - /integrations/index/ + - /dashboards/integrations/ --- # Integrations in OpenSearch Dashboards @@ -31,7 +31,7 @@ A consistent telemetry data schema is crucial for effective observability, enabl OpenSearch adopted the [OpenTelemetry (OTel)](https://opentelemetry.io/) protocol as the foundation for its observability solution. OTel is a community-driven standard that defines a consistent schema and data collection approach for metrics, logs, and traces. It is widely supported by APIs, SDKs, and telemetry collectors, enabling features like auto-instrumentation for seamless observability integration. -This shared schema allows cross-correlation and analysis across different data sources. To this end, OpenSearch derived the [Simple Schema for Observability](https://github.com/opensearch-project/opensearch-catalog/tree/main/docs/schema/observability), which encodes the OTel standard as OpenSearch mappings. OpenSearch also supports the [Piped Processing Language (PPL)](https://opensearch.org/docs/latest/search-plugins/sql/ppl/index/), which is designed for high-dimensionality querying in observability use cases. +This shared schema allows cross-correlation and analysis across different data sources. To this end, OpenSearch derived the [Simple Schema for Observability](https://github.com/opensearch-project/opensearch-catalog/tree/main/docs/schema/observability), which encodes the OTel standard as OpenSearch mappings. OpenSearch also supports the [Piped Processing Language (PPL)]({{site.url}}{{site.baseurl}}/search-plugins/sql/ppl/index/), which is designed for high-dimensionality querying in observability use cases. --- diff --git a/_dashboards/management/accelerate-external-data.md b/_dashboards/management/accelerate-external-data.md index 6d1fa030e40..61c08c01f88 100644 --- a/_dashboards/management/accelerate-external-data.md +++ b/_dashboards/management/accelerate-external-data.md @@ -1,48 +1,182 @@ --- layout: default -title: Optimize query performance using OpenSearch indexing -parent: Connecting Amazon S3 to OpenSearch -grand_parent: Data sources -nav_order: 15 -has_children: false +title: Optimizing query performance using OpenSearch indexing +parent: Data sources +nav_order: 17 --- -# Optimize query performance using OpenSearch indexing +# Optimizing query performance using OpenSearch indexing Introduced 2.11 {: .label .label-purple } Query performance can be slow when using external data sources for reasons such as network latency, data transformation, and data volume. You can optimize your query performance by using OpenSearch indexes, such as a skipping index or a covering index. 
-A _skipping index_ uses skip acceleration methods, such as partition, minimum and maximum values, and value sets, to ingest and create compact aggregate data structures. This makes them an economical option for direct querying scenarios. +- A _skipping index_ uses skip acceleration methods, such as partition, minimum and maximum values, and value sets, to ingest and create compact aggregate data structures. This makes them an economical option for direct querying scenarios. For more information, see [Skipping indexes]({{site.url}}{{site.baseurl}}/dashboards/management/accelerate-external-data/#skipping-indexes). +- A _covering index_ ingests all or some of the data from the source into OpenSearch and makes it possible to use all OpenSearch Dashboards and plugin functionality. For more information, see [Covering indexes]({{site.url}}{{site.baseurl}}/dashboards/management/accelerate-external-data/#covering-indexes). +- A _materialized view_ enhances query performance by storing precomputed and aggregated data from the source data. For more information, see [Materialized views]({{site.url}}{{site.baseurl}}/dashboards/management/accelerate-external-data/#materialized-views). -A _covering index_ ingests all or some of the data from the source into OpenSearch and makes it possible to use all OpenSearch Dashboards and plugin functionality. See the [Flint Index Reference Manual](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md) for comprehensive guidance on this feature's indexing process. +For comprehensive guidance on each indexing process, see the [Flint Index Reference Manual](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md). ## Data sources use case: Accelerate performance -To get started with the **Accelerate performance** use case available in **Data sources**, follow these steps: +To get started with accelerating query performance, perform the following steps: -1. Go to **OpenSearch Dashboards** > **Query Workbench** and select your Amazon S3 data source from the **Data sources** dropdown menu in the upper-left corner. -2. From the left-side navigation menu, select a database. -3. View the results in the table and confirm that you have the desired data. +1. Go to **OpenSearch Dashboards** > **Query Workbench** and select your data source from the **Data sources** dropdown menu. +2. From the navigation menu, select a database. +3. View the results in the table and confirm that you have the correct data. 4. Create an OpenSearch index by following these steps: - 1. Select the **Accelerate data** button. A pop-up window appears. - 2. Enter your details in **Select data fields**. In the **Database** field, select the desired acceleration index: **Skipping index** or **Covering index**. A _skipping index_ uses skip acceleration methods, such as partition, min/max, and value sets, to ingest data using compact aggregate data structures. This makes them an economical option for direct querying scenarios. A _covering index_ ingests all or some of the data from the source into OpenSearch and makes it possible to use all OpenSearch Dashboards and plugin functionality. -5. Under **Index settings**, enter the information for your acceleration index. For information about naming, select **Help**. Note that an Amazon S3 table can only have one skipping index at a time. + 1. Select **Accelerate data**. A pop-up window appears. + 2. Enter your database and table details under **Select data fields**. +5. 
For **Acceleration type**, select the type of acceleration according to your use case. Then, enter the information for your acceleration type. For more information, see the following sections: + - [Skipping indexes]({{site.url}}{{site.baseurl}}/dashboards/management/accelerate-external-data/#skipping-indexes) + - [Covering indexes]({{site.url}}{{site.baseurl}}/dashboards/management/accelerate-external-data/#covering-indexes) + - [Materialized views]({{site.url}}{{site.baseurl}}/dashboards/management/accelerate-external-data/#materialized-views) + +## Skipping indexes + +A _skipping index_ uses skip acceleration methods, such as partition, min/max, and value sets, to ingest data using compact aggregate data structures. This makes them an economical option for direct querying scenarios. + +With a skipping index, you can index only the metadata of the data stored in Amazon S3. When you query a table with a skipping index, the query planner references the index and rewrites the query to efficiently locate the data, instead of scanning all partitions and files. This allows the skipping index to quickly narrow down the specific location of the stored data. ### Define skipping index settings -1. Under **Skipping index definition**, select the **Add fields** button to define the skipping index acceleration method and choose the fields you want to add. -2. Select the **Copy Query to Editor** button to apply your skipping index settings. -3. View the skipping index query details in the table pane and then select the **Run** button. Your index is added to the left-side navigation menu containing the list of your databases. +1. Under **Skipping index definition**, select **Generate** to automatically generate a skipping index. Alternately, to manually choose the fields you want to add, select **Add fields**. Choose from the following types: + - `Partition`: Uses data partition details to locate data. This type is best for partitioning-based columns such as year, month, day, hour. + - `MinMax`: Uses lower and upper bound of the indexed column to locate data. This type is best for numeric columns. + - `ValueSet`: Uses a unique value set to locate data. This type is best for columns with low to moderate cardinality that require exact matching. + - `BloomFilter`: Uses the bloom filter algorithm to locate data. This type is best for columns with high cardinality that do not require exact matching. +2. Select **Create acceleration** to apply your skipping index settings. +3. View the skipping index query details and then click **Run**. OpenSearch adds your index to the left navigation pane. + +Alternately, you can manually create a skipping index using Query Workbench. Select your data source from the dropdown and run a query like the following: + +```sql +CREATE SKIPPING INDEX +ON datasourcename.gluedatabasename.vpclogstable( + `srcaddr` BLOOM_FILTER, + `dstaddr` BLOOM_FILTER, + `day` PARTITION, + `account_id`BLOOM_FILTER + ) WITH ( +index_settings = '{"number_of_shards":5,"number_of_replicas":1}', +auto_refresh = true, +checkpoint_location = 's3://accountnum-vpcflow/AWSLogs/checkpoint' +) +``` + +## Covering indexes + +A _covering index_ ingests all or some of the data from the source into OpenSearch and makes it possible to use all OpenSearch Dashboards and plugin functionality. + +With a covering index, you can ingest data from a specified column in a table. This is the most performant of the three indexing types. 
Because OpenSearch ingests all data from your desired column, you get better performance and can perform advanced analytics. + +OpenSearch creates a new index from the covering index data. You can use this new index to create visualizations, or for anomaly detection and geospatial capabilities. You can manage the covering view index with Index State Management. For more information, see [Index State Management]({{site.url}}{{site.baseurl}}/im-plugin/ism/index/). ### Define covering index settings -1. Under **Index settings**, enter a valid index name. Note that each Amazon S3 table can have multiple covering indexes. -2. Once you have added the index name, define the covering index fields by selecting `(add fields here)` under **Covering index definition**. -3. Select the **Copy Query to Editor** button to apply your covering index settings. -4. View the covering index query details in the table pane and then select the **Run** button. Your index is added to the left-side navigation menu containing the list of your databases. +1. For **Index name**, enter a valid index name. Note that each table can have multiple covering indexes. +2. Choose a **Refresh type**. By default, OpenSearch automatically refreshes the index. Otherwise, you must manually trigger a refresh using a REFRESH statement. +3. Enter a **Checkpoint location**, which is a path for refresh job checkpoints. The location must be a path in a file system compatible with the Hadoop Distributed File System (HDFS). For more information, see [Starting streaming queries](https://spark.apache.org/docs/3.5.1/structured-streaming-programming-guide.html#starting-streaming-queries). +4. Define the covering index fields by selecting **(add fields here)** under **Covering index definition**. +5. Select **Create acceleration** to apply your covering index settings. +6. View the covering index query details and then click **Run**. OpenSearch adds your index to the left navigation pane. + +Alternately, you can manually create a covering index on your table using Query Workbench. Select your data source from the dropdown and run a query like the following: + +```sql +CREATE INDEX vpc_covering_index +ON datasourcename.gluedatabasename.vpclogstable (version, account_id, interface_id, +srcaddr, dstaddr, srcport, dstport, protocol, packets, +bytes, start, action, log_status STRING, +`aws-account-id`, `aws-service`, `aws-region`, year, +month, day, hour ) +WITH ( + auto_refresh = true, + refresh_interval = '15 minute', + checkpoint_location = 's3://accountnum-vpcflow/AWSLogs/checkpoint' +) +``` + +## Materialized views + +With _materialized views_, you can use complex queries, such as aggregations, to power Dashboards visualizations. Materialized views ingest a small amount of your data, depending on the query, into OpenSearch. OpenSearch then forms an index from the ingested data that you can use for visualizations. You can manage the materialized view index with Index State Management. For more information, see [Index State Management]({{site.url}}{{site.baseurl}}/im-plugin/ism/index/). + +### Define materialized view settings + +1. For **Index name**, enter a valid index name. Note that each table can have multiple covering indexes. +2. Choose a **Refresh type**. By default, OpenSearch automatically refreshes the index. Otherwise, you must manually trigger a refresh using a `REFRESH` statement. +3. Enter a **Checkpoint location**, which is a path for refresh job checkpoints. The location must be a path in an HDFS compatible file system. +4. 
Enter a **Watermark delay**, which defines how late data can come and still be processed, such as 1 minute or 10 seconds. +5. Define the covering index fields under **Materialized view definition**. +6. Select **Create acceleration** to apply your materialized view index settings. +7. View the materialized view query details and then click **Run**. OpenSearch adds your index to the left navigation pane. + +Alternately, you can manually create a materialized view index on your table using Query Workbench. Select your data source from the dropdown and run a query like the following: + +```sql +CREATE MATERIALIZED VIEW {table_name}__week_live_mview AS + SELECT + cloud.account_uid AS `aws.vpc.cloud_account_uid`, + cloud.region AS `aws.vpc.cloud_region`, + cloud.zone AS `aws.vpc.cloud_zone`, + cloud.provider AS `aws.vpc.cloud_provider`, + + CAST(IFNULL(src_endpoint.port, 0) AS LONG) AS `aws.vpc.srcport`, + CAST(IFNULL(src_endpoint.svc_name, 'Unknown') AS STRING) AS `aws.vpc.pkt-src-aws-service`, + CAST(IFNULL(src_endpoint.ip, '0.0.0.0') AS STRING) AS `aws.vpc.srcaddr`, + CAST(IFNULL(src_endpoint.interface_uid, 'Unknown') AS STRING) AS `aws.vpc.src-interface_uid`, + CAST(IFNULL(src_endpoint.vpc_uid, 'Unknown') AS STRING) AS `aws.vpc.src-vpc_uid`, + CAST(IFNULL(src_endpoint.instance_uid, 'Unknown') AS STRING) AS `aws.vpc.src-instance_uid`, + CAST(IFNULL(src_endpoint.subnet_uid, 'Unknown') AS STRING) AS `aws.vpc.src-subnet_uid`, + + CAST(IFNULL(dst_endpoint.port, 0) AS LONG) AS `aws.vpc.dstport`, + CAST(IFNULL(dst_endpoint.svc_name, 'Unknown') AS STRING) AS `aws.vpc.pkt-dst-aws-service`, + CAST(IFNULL(dst_endpoint.ip, '0.0.0.0') AS STRING) AS `aws.vpc.dstaddr`, + CAST(IFNULL(dst_endpoint.interface_uid, 'Unknown') AS STRING) AS `aws.vpc.dst-interface_uid`, + CAST(IFNULL(dst_endpoint.vpc_uid, 'Unknown') AS STRING) AS `aws.vpc.dst-vpc_uid`, + CAST(IFNULL(dst_endpoint.instance_uid, 'Unknown') AS STRING) AS `aws.vpc.dst-instance_uid`, + CAST(IFNULL(dst_endpoint.subnet_uid, 'Unknown') AS STRING) AS `aws.vpc.dst-subnet_uid`, + CASE + WHEN regexp(dst_endpoint.ip, '(10\\..*)|(192\\.168\\..*)|(172\\.1[6-9]\\..*)|(172\\.2[0-9]\\..*)|(172\\.3[0-1]\\.*)') + THEN 'ingress' + ELSE 'egress' + END AS `aws.vpc.flow-direction`, + + CAST(IFNULL(connection_info['protocol_num'], 0) AS INT) AS `aws.vpc.connection.protocol_num`, + CAST(IFNULL(connection_info['tcp_flags'], '0') AS STRING) AS `aws.vpc.connection.tcp_flags`, + CAST(IFNULL(connection_info['protocol_ver'], '0') AS STRING) AS `aws.vpc.connection.protocol_ver`, + CAST(IFNULL(connection_info['boundary'], 'Unknown') AS STRING) AS `aws.vpc.connection.boundary`, + CAST(IFNULL(connection_info['direction'], 'Unknown') AS STRING) AS `aws.vpc.connection.direction`, + + CAST(IFNULL(traffic.packets, 0) AS LONG) AS `aws.vpc.packets`, + CAST(IFNULL(traffic.bytes, 0) AS LONG) AS `aws.vpc.bytes`, + + CAST(FROM_UNIXTIME(time / 1000) AS TIMESTAMP) AS `@timestamp`, + CAST(FROM_UNIXTIME(start_time / 1000) AS TIMESTAMP) AS `start_time`, + CAST(FROM_UNIXTIME(start_time / 1000) AS TIMESTAMP) AS `interval_start_time`, + CAST(FROM_UNIXTIME(end_time / 1000) AS TIMESTAMP) AS `end_time`, + status_code AS `aws.vpc.status_code`, + + severity AS `aws.vpc.severity`, + class_name AS `aws.vpc.class_name`, + category_name AS `aws.vpc.category_name`, + activity_name AS `aws.vpc.activity_name`, + disposition AS `aws.vpc.disposition`, + type_name AS `aws.vpc.type_name`, + + region AS `aws.vpc.region`, + accountid AS `aws.vpc.account-id` + FROM + datasourcename.gluedatabasename.vpclogstable +WITH 
( + auto_refresh = true, + refresh_interval = '15 Minute', + checkpoint_location = 's3://accountnum-vpcflow/AWSLogs/checkpoint', + watermark_delay = '1 Minute', +) +``` ## Limitations -This feature is still under development, so there are some limitations. For real-time updates, refer to the [developer documentation on GitHub](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md#limitations). +This feature is still under development, so there are some limitations. For real-time updates, see the [developer documentation on GitHub](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md#limitations). diff --git a/_dashboards/management/acl.md b/_dashboards/management/acl.md new file mode 100644 index 00000000000..bd57b724190 --- /dev/null +++ b/_dashboards/management/acl.md @@ -0,0 +1,78 @@ +--- +layout: default +title: Access control lists for saved objects +parent: Dashboards Management +nav_order: 50 +--- + +# Access control lists for saved objects +Introduced 2.18 +{: .label .label-purple } + +You can use access control lists (ACLs) to manage permissions for your saved objects, providing authorization (AuthZ) capabilities without requiring backend plugin integration. + +## Understanding ACL types + +ACLs are applied at two levels: + +1. **Workspace ACL:** Workspace objects inherit permissions from their parent workspace. See [Workspace ACL]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl) for more information. +2. **Objects ACL:** Each individual object can have its own ACL policy. All operations on these objects must pass ACL policy validation. + +## Enabling the ACL feature + +The ACL feature must be enabled before you can define any access controls. Enable it by: + +1. Opening your `opensearch_dashboards.yml` file. +2. Enabling permissions with `savedObjects.permission.enabled: true`. + +## Defining ACL permissions + +ACL permissions are defined using the following schema: + +```json +{ + "permissions": { + "<permission_type_1>": { + "users": ["<principal_1>", "<principal_2>"], + "groups": ["<principal_3>", "<principal_4>"] + } + } +} +``` +{% include copy-curl.html %} + +### Granting permissions to authenticated users + +The wildcard character (`*`) grants permissions to all authenticated users. In the following example, the ACL grants workspace management permissions to the `finance_manager` group and dashboard creation permissions to the `finance_analyst` group: + +```json +{ + "permissions": { + "write": { + "groups": ["finance_manager"] + }, + "library_write": { + "groups": ["finance_analyst"] + } + } +} +``` +{% include copy-curl.html %} + +### Configuring mixed-level permissions + +To allow one user, `user-1` for example, to modify an object while giving read-only access to others, you can configure the ACL policy as follows: + +```json +{ + "permissions": { + "read": { + "users": ["*"] + }, + "write": { + "users": ["user-1"] + }, + } +} +``` +{% include copy-curl.html %} diff --git a/_dashboards/management/advanced-settings.md b/_dashboards/management/advanced-settings.md index b4c0225c5bf..5d817e1c79c 100644 --- a/_dashboards/management/advanced-settings.md +++ b/_dashboards/management/advanced-settings.md @@ -18,7 +18,7 @@ To access **Advanced settings**, go to **Dashboards Management** and select **Ad ## Required permissions -To modify settings, you must have permission to make changes. 
See [Multi-tenancy configuration](https://opensearch.org/docs/latest/security/multi-tenancy/multi-tenancy-config/#give-roles-access-to-tenants) for guidance about assigning role access to tenants. +To modify settings, you must have permission to make changes. See [Multi-tenancy configuration]({{site.url}}{{site.baseurl}}/security/multi-tenancy/multi-tenancy-config/#give-roles-access-to-tenants) for guidance about assigning role access to tenants. ## Advanced settings descriptions diff --git a/_dashboards/management/management-index.md b/_dashboards/management/management-index.md index 7edc4d06c23..01796180e56 100644 --- a/_dashboards/management/management-index.md +++ b/_dashboards/management/management-index.md @@ -9,16 +9,14 @@ has_children: true Introduced 2.10 {: .label .label-purple } -**Dashboards Management** serves as the command center for customizing OpenSearch Dashboards to your needs. A view of the interface is shown in the following image. +**Dashboards Management** is the central hub for managing and customizing OpenSearch data directly within OpenSearch Dashboards. -<img src="{{site.url}}{{site.baseurl}}/images/dashboards/dashboards-management-ui.png" alt="Dashboards Management interface" width="700"/> - -{::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/icons/alert-icon.png" class="inline-icon" alt="alert icon"/>{:/} **Note**<br>OpenSearch and OpenSearch Dashboards privileges govern access to individual features. If you do not have the appropriate access, consult your administrator. -{: .note} +OpenSearch and OpenSearch Dashboards permissions govern access to individual features. If you do not have the appropriate access permissions, consult your administrator. +{: .warning} ## Applications -The following applications are available in **Dashboards Management**: +You can access the following applications in **Dashboards Management**: - **[Index Patterns]({{site.url}}{{site.baseurl}}/dashboards/management/index-patterns/):** To access OpenSearch data, you need to create an index pattern so that you can select the data you want to use and define the properties of the fields. The Index Pattern tool gives you the ability to create an index pattern from within the UI. Index patterns point to one or more indexes, data streams, or index aliases. - **[Data Sources]({{site.url}}{{site.baseurl}}/dashboards/management/multi-data-sources/):** The Data Sources tool is used to configure and manage the data sources that OpenSearch uses to collect and analyze data. You can use the tool to specify the source configuration in your copy of the [OpenSearch Dashboards configuration file]({{site.url}}{{site.baseurl}}https://github.com/opensearch-project/OpenSearch-Dashboards/blob/main/config/opensearch_dashboards.yml). diff --git a/_dashboards/management/multi-data-sources.md b/_dashboards/management/multi-data-sources.md index dc3096c2510..3876530a796 100644 --- a/_dashboards/management/multi-data-sources.md +++ b/_dashboards/management/multi-data-sources.md @@ -17,7 +17,7 @@ You can ingest, process, and analyze data from multiple data sources in OpenSear The following tutorial guides you through configuring and using multiple data sources in OpenSearch Dashboards. -The following features are not supported when using multiple data sources: timeline visualization types and the `gantt-chart` plugin. +The following features are not supported when using multiple data sources: timeline visualization types. 
{: .note} ### Step 1: Modify the YAML file settings diff --git a/_dashboards/management/scheduled-query-acceleration.md b/_dashboards/management/scheduled-query-acceleration.md new file mode 100644 index 00000000000..3aa83e6fc5c --- /dev/null +++ b/_dashboards/management/scheduled-query-acceleration.md @@ -0,0 +1,208 @@ +--- +layout: default +title: Scheduled Query Acceleration +parent: Data sources +nav_order: 18 +has_children: false +--- + +# Scheduled Query Acceleration +Introduced 2.17 +{: .label .label-purple } + +Scheduled Query Acceleration (SQA) is designed to optimize queries sent directly from OpenSearch to external data sources, such as Amazon Simple Storage Service (Amazon S3). It uses automation to address issues commonly encountered when managing and refreshing indexes, views, and data. + +Query acceleration is facilitated by secondary indexes like [skipping indexes]({{site.url}}{{site.baseurl}}/dashboards/management/accelerate-external-data/#skipping-indexes), [covering indexes]({{site.url}}{{site.baseurl}}/dashboards/management/accelerate-external-data/#covering-indexes), or [materialized views]({{site.url}}{{site.baseurl}}/dashboards/management/accelerate-external-data/#materialized-views). When queries run, they use these indexes instead of directly querying Amazon S3. + +The secondary indexes need to be refreshed periodically in order to remain current with the Amazon S3 data. This refresh operation can be scheduled using either an internal scheduler (within Spark) or an external scheduler. + +SQA provides the following benefits: + +- **Cost reduction through optimized resource usage**: SQA reduces the operational load on driver nodes, lowering the costs associated with maintaining auto-refresh for indexes and views. + +- **Improved observability of refresh operations**: SQA provides visibility into index states and refresh timing, offering insights into data processing and the current system state. + +- **Better control over refresh scheduling**: SQA allows flexible scheduling of refresh intervals, helping you to manage resource usage and refresh frequency according to specific requirements. + +- **Simplified index management**: SQA enables updates to index settings, such as refresh intervals, in a single query, which simplifies workflows. + +## Concepts + +Before configuring SQA, familiarize yourself with the following topics: + +- [Optimizing query performance using OpenSearch indexing]({{site.url}}{{site.baseurl}}/dashboards/management/accelerate-external-data/) +- [Flint index refresh](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md#flint-index-refresh) +- [Index State Management](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md#index-state-transition-1) + +## Prerequisites + +Before configuring SQA, verify that the following requirements are met: + +- Ensure you're running OpenSearch version 2.17 or later. +- Ensure you have the SQL plugin installed. The SQL plugin is included in most OpenSearch distributions. For more information, see [Installing plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/). +- Ensure you have configured a data source (in this example, Amazon S3): Configure a skipping index, covering index, or materialized view. These secondary data sources are additional data structures that improve query performance by optimizing queries sent to external data sources, such as Amazon S3. 
For more information, see [Optimizing query performance using OpenSearch indexing]({{site.url}}{{site.baseurl}}/dashboards/management/accelerate-external-data/). +- Configure Amazon EMR Serverless (needed for access to Apache Spark). + +## Configuring SQA settings + +If you want to override default configuration values, change the following cluster settings: + +- **Enable asynchronous query execution**: Set `plugins.query.executionengine.async_query.enabled` to `true` (default value): + ```json + PUT /_cluster/settings + { + "transient": { + "plugins.query.executionengine.async_query.enabled": "true" + } + } + ``` + {% include copy-curl.html %} + + For more information, see [Settings](https://github.com/opensearch-project/sql/blob/main/docs/user/admin/settings.rst#pluginsqueryexecutionengineasync_queryenabled). + +- **Configure the external scheduler interval for asynchronous queries**: This setting defines how often the external scheduler checks for tasks, allowing customization of refresh frequency. There is no default value for this setting: if this value is empty, the default comes from `opensearch-spark` and is `5 minutes`. Adjusting the interval based on workload volume can help you to optimize resources and manage costs: + ```json + PUT /_cluster/settings + { + "transient": { + "plugins.query.executionengine.async_query.external_scheduler.interval": "10 minutes" + } + } + ``` + {% include copy-curl.html %} + + For more information, see [Settings](https://github.com/opensearch-project/sql/blob/main/docs/user/admin/settings.rst#pluginsqueryexecutionengineasync_queryexternal_schedulerinterval). + +## Running an accelerated query + +You can run accelerated queries in [Query Workbench]({{site.url}}{{site.baseurl}}/dashboards/query-workbench/). To run an accelerated query, use the following syntax: + +```sql +CREATE SKIPPING INDEX example_index +WITH ( + auto_refresh = true, + refresh_interval = '15 minutes' +); +``` +{% include copy.html %} + +By default, the query uses an external scheduler. To use an internal scheduler, set `scheduler_mode` to `internal`: + +```sql +CREATE SKIPPING INDEX example_index +WITH ( + auto_refresh = true, + refresh_interval = '15 minutes', + scheduler_mode = 'internal' +); +``` +{% include copy.html %} + +## Parameters + +When creating indexes using an accelerated query, you can specify the following parameters in the `WITH` clause to control refresh behavior, scheduling, and timing. + +| Parameter | Description | +|:--- | :--- | +| `auto_refresh` | Enables automatic refresh for the index. If `true`, the index refreshes automatically at the specified interval. If `false`, the refresh operation must be triggered manually using the `REFRESH` statement. Default is `false`. | +| `refresh_interval` | Defines the amount of time between index refresh operations for the index, which determines how frequently new data is ingested into the index. This is applicable only when `auto_refresh` is enabled. The interval determines how frequently new data is integrated and can be specified in formats like `1 minute` or `10 seconds`. For valid time units, see [Time units](#time-units).| +| `scheduler_mode` | Specifies the scheduling mode for auto-refresh (internal or external scheduling). The external scheduler requires a `checkpoint_location` (a path for refresh job checkpoints) for state management. For more information, see [Starting streaming queries](https://spark.apache.org/docs/3.5.1/structured-streaming-programming-guide.html#starting-streaming-queries). 
Valid values are `internal` and `external`.| + +For more information and additional available parameters, see [Flint index refresh](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md#flint-index-refresh). + +## Time units + +You can specify the following time units when defining time intervals: + +- Milliseconds: `ms`, `millisecond`, or `milliseconds` +- Seconds: `s`, `second`, or `seconds` +- Minutes: `m`, `minute`, or `minutes` +- Hours: `h`, `hour`, or `hours` +- Days: `d`, `day`, or `days` + +## Monitoring index status + +To monitor the status of an index, use the following statement: + +```sql +SHOW FLINT INDEXES IN spark_catalog.default; +``` +{% include copy.html %} + +## Managing scheduled jobs + +Use the following commands to manage scheduled jobs. + +### Enabling jobs + +To disable auto-refresh using an internal or external scheduler, set `auto_refresh` to `false`: + +```sql +ALTER MATERIALIZED VIEW myglue_test.default.count_by_status_v9 WITH (auto_refresh = false); +``` +{% include copy.html %} + +### Updating schedules + +To update the schedule and modify the refresh settings, specify the `refresh_interval` in the `WITH` clause: + +```sql +ALTER INDEX example_index +WITH (refresh_interval = '30 minutes'); +``` +{% include copy.html %} + +### Switching the scheduler mode + +To switch the scheduler mode, specify the `scheduler_mode` in the `WITH` clause: + +```sql +ALTER MATERIALIZED VIEW myglue_test.default.count_by_status_v9 WITH (scheduler_mode = 'internal'); +``` +{% include copy.html %} + +### Inspecting scheduler metadata + +To inspect scheduler metadata, use the following request: + +```json +GET /.async-query-scheduler/_search +``` +{% include copy-curl.html %} + +## Best practices + +We recommend the following best practices when using SQA. + +### Performance optimization + +- **Recommended refresh intervals**: Choosing the right refresh interval is crucial for balancing resource usage and system performance. Consider your workload requirements and the freshness of the data you need when setting intervals. + +- **Concurrent job limits**: Limit the number of concurrent running jobs to avoid overloading system resources. Monitor system capacity and adjust job limits accordingly to ensure optimal performance. + +- **Resource usage**: Efficient resource allocation is key to maximizing performance. Properly allocate memory, CPU, and I/O based on the workload and the type of queries you're running. + +### Cost management + +- **Use an external scheduler**: An external scheduler offloads refresh operations, reducing demand on core driver nodes. + +- **Configure a refresh interval for your use case**: Longer refresh intervals lead to reduced costs but may impact data freshness. + +- **Optimize the refresh schedule**: Adjust refresh intervals based on workload patterns to reduce unnecessary refresh operations. + +- **Monitor costs**: Regularly monitor costs related to scheduled queries and refresh operations. Using observability tools can help you gain insights into resource usage and costs over time. + +## Validating settings + +You can validate your settings by running a test query and verifying the scheduler configuration: + +```sql +SHOW FLINT INDEXES EXTENDED +``` +{% include copy.html %} + +For more information, see the [OpenSearch Spark documentation](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md#all-indexes). 
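You can also check the cluster-level SQA settings directly. The following request is a general sketch using the standard cluster settings API; look in the response for the `plugins.query.executionengine.async_query` values described in [Configuring SQA settings](#configuring-sqa-settings) (`include_defaults=true` also returns values that you have not explicitly overridden):

```json
GET /_cluster/settings?include_defaults=true
```
{% include copy-curl.html %}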
+ +## Troubleshooting + +If the refresh operation is not triggering as expected, ensure that the `auto_refresh` setting is enabled and the refresh interval is properly configured. diff --git a/_dashboards/visualize/area.md b/_dashboards/visualize/area.md index 5df59579ec7..0da08c68c18 100644 --- a/_dashboards/visualize/area.md +++ b/_dashboards/visualize/area.md @@ -17,7 +17,7 @@ In this tutorial you'll create a simple area chart using sample data and aggrega You have several aggregation options in Dashboards, and the choice influences your analysis. The use cases for aggregations vary from analyzing data in real time to using Dashboards to create a visualization dashboard. If you need an overview of aggregations in OpenSearch, see [Aggregations]({{site.url}}{{site.baseurl}}/opensearch/aggregations/) before starting this tutorial. -Make sure you have [installed the latest version of Dashboards](https://opensearch.org/docs/latest/install-and-configure/install-dashboards/index/) and added the sample data before continuing with this tutorial. _This tutorial uses Dashboards version 2.4.1_. +Make sure you have [installed the latest version of Dashboards]({{site.url}}{{site.baseurl}}/install-and-configure/install-dashboards/index/) and added the sample data before continuing with this tutorial. _This tutorial uses Dashboards version 2.4.1_. {: .note} ## Set up the area chart diff --git a/_dashboards/visualize/gantt.md b/_dashboards/visualize/gantt.md deleted file mode 100644 index 3a9814465a2..00000000000 --- a/_dashboards/visualize/gantt.md +++ /dev/null @@ -1,28 +0,0 @@ ---- -layout: default -title: Gantt charts -parent: Building data visualizations -nav_order: 30 -redirect_from: - - /dashboards/gantt/ ---- - -# Gantt charts - -OpenSearch Dashboards includes a Gantt chart visualization. Gantt charts show the start, end, and duration of unique events in a sequence. Gantt charts are useful in trace analytics, telemetry, and anomaly detection use cases, where you want to understand interactions and dependencies between various events in a schedule. - -For example, consider an index of log data. The fields in a typical set of log data, especially audit logs, contain a specific operation or event with a start time and duration. - -To create a Gantt chart, perform the following steps: - -1. In the visualizations menu, choose **Create visualization** and **Gantt Chart**. -1. Choose a source for the chart (e.g. some log data). -1. Under **Metrics**, choose **Event**. For log data, each log is an event. -1. Select the **Start Time** and **Duration** fields from your dataset. The start time is the timestamp for the beginning of an event. The duration is the amount of time to add to the start time. -1. Under **Results**, choose the number of events to display on the chart. Gantt charts sequence events from earliest to latest based on start time. -1. Choose **Panel settings** to adjust axis labels, time format, and colors. -1. Choose **Update**. - -![Gantt Chart]({{site.url}}{{site.baseurl}}/images/dashboards/gantt-chart.png) - -This Gantt chart displays the ID of each log on the y-axis. Each bar is a unique event that spans some amount of time. Hover over a bar to see the duration of that event. 
diff --git a/_dashboards/visualize/viz-index.md b/_dashboards/visualize/viz-index.md index 4bde79d2cc9..22d4591cffa 100644 --- a/_dashboards/visualize/viz-index.md +++ b/_dashboards/visualize/viz-index.md @@ -3,6 +3,7 @@ layout: default title: Building data visualizations nav_order: 40 has_children: true +has_toc: false --- # Building data visualizations @@ -39,12 +40,6 @@ Data tables, or tables, show your raw data in tabular form. <img src="{{site.url}}{{site.baseurl}}/images/data-table-1.png" width="600" height="600" alt="Example data table in OpenSearch Dashboards"> -### Gantt charts - -Gantt charts show the start, end, and duration of unique events in a sequence. Gantt charts are useful in trace analytics, telemetry, and anomaly detection use cases where you want to understand interactions and dependencies between various events in a schedule. **Gantt chart** is currently a plugin, instead of built-in, visualization type in Dashboards. See [Gantt charts]({{site.url}}{{site.baseurl}}/dashboards/visualize/gantt/) to learn how to create and use them in Dashboards. - -<img src="{{site.url}}{{site.baseurl}}/images/dashboards/gantt-chart.png" width="600" height="600" alt="Example Gantt chart in OpenSearch Dashboards"> - ### Gauge charts Gauge charts look similar to an analog speedometer that reads left to right from zero. They display how much there is of the thing you are measuring, and this measurement can exist alone or in relation to another measurement, such as tracking performance against benchmarks or goals. @@ -126,3 +121,13 @@ VisBuilder is a drag-and-drop data visualization tool in Dashboards. It gives yo [Vega](https://vega.github.io/vega/) and [Vega-Lite](https://vega.github.io/vega-lite/) are open-source, declarative language visualization grammars for creating, sharing, and saving interactive data visualizations. Vega visualizations give you the flexibility to visualize multidimensional data using a layered approach in order to build and manipulate visualizations in a structured manner. Vega can be used to create customized visualizations using any Dashboards visualization type. <img src="{{site.url}}{{site.baseurl}}/images/dashboards/vega-1.png" width="600" height="600" alt="Example Vega visualization with JSON specification in OpenSearch Dashboards"> + +## Next steps + +To try building various visualizations, see the following articles: + +- [Area charts]({{site.url}}{{site.baseurl}}/dashboards/visualize/area/) +- [Coordinate and region maps]({{site.url}}{{site.baseurl}}/dashboards/visualize/geojson-regionmaps/) +- [Time-series visualizations]({{site.url}}{{site.baseurl}}/dashboards/visualize/tsvb/) +- [Vega visualizations]({{site.url}}{{site.baseurl}}/dashboards/visualize/vega/) +- [Drag-and-drop visualizations]({{site.url}}{{site.baseurl}}/dashboards/visualize/visbuilder/) \ No newline at end of file diff --git a/_dashboards/workspace/apis.md b/_dashboards/workspace/apis.md new file mode 100644 index 00000000000..683488e4235 --- /dev/null +++ b/_dashboards/workspace/apis.md @@ -0,0 +1,386 @@ +--- +layout: default +title: Workspaces APIs +parent: Workspace for OpenSearch Dashboards +nav_order: 10 +--- + +# Workspaces APIs +Introduced 2.18 +{: .label .label-purple } + +The Workspaces API provides a set of endpoints for managing workspaces in OpenSearch Dashboards. 
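These endpoints are served by the OpenSearch Dashboards server rather than by the OpenSearch cluster, so requests are sent to the Dashboards host and port. As a minimal sketch, a `curl` call to the list endpoint might look like the following; the host, port, credentials, and request body are placeholders, and the `osd-xsrf` header is assumed to be required when the Security plugin is enabled:

```bash
curl -X POST "http://localhost:5601/api/workspaces/_list" \
  -H "osd-xsrf: true" \
  -H "Content-Type: application/json" \
  -u "admin:<password>" \
  -d '{"perPage": 20, "page": 1}'
```
{% include copy.html %}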
+ +## List Workspaces API + +You can use the following endpoint to retrieve a list of workspaces: + +```json +POST <osd host>:<port>/api/workspaces/_list +``` +{% include copy-curl.html %} + +The following table lists the available path parameters. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `search` | String | Optional | A query string used to filter workspaces with simple query syntax, for example, `simple_query_string`. | +| `searchFields` | Array | Optional | Specifies which fields to perform the search query against. | +| `sortField` | String | Optional | The field name to use for sorting results. | +| `sortOrder` | String | Optional | Specifies ascending or descending sort order. | +| `perPage` | Number | Optional | The number of workspace results per page. | +| `page` | Number | Optional | The number of pages of results to retrieve. | +| `permissionModes` | Array | Optional | A list of permissions to filter by. | + +#### Example request + +```json +POST /api/workspaces/_list +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": { + "page": 1, + "per_page": 20, + "total": 3, + "workspaces": [ + { + "name": "test1", + "features": [ + "use-case-all" + ], + "id": "hWNZls" + }, + { + "name": "test2", + "features": [ + "use-case-observability" + ], + "id": "SnkOPt" + } + ] + } +} +``` +{% include copy-curl.html %} + +## Get Workspaces API + +You can use the following endpoint to retrieve a single workspace: + +```json +GET <osd host>:<port>/api/workspaces/<id> +``` +{% include copy-curl.html %} + +The following table lists the available path parameters. All path parameters are required. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `<id>` | String | Required | Identifies the unique workspace to be retrieved. | + +#### Example request + +```json +GET /api/workspaces/SnkOPt +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": { + "name": "test2", + "features": ["use-case-all"], + "id": "SnkOPt" + } +} +``` +{% include copy-curl.html %} + +## Create Workspaces API + +You can use the following endpoint to create a workspace: + +```json +POST <osd host>:<port>/api/workspaces +``` +{% include copy-curl.html %} + +The following table lists the available path parameters. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `attributes` | Object | Required | Defines the workspace attributes. | +| `permissions` | Object | Optional | Specifies the permissions for the workspace. | + +#### Example request + +```json +POST api/workspaces +{ + "attributes": { + "name": "test4", + "description": "test4" + } +} +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": { + "id": "eHVoCJ" + } +} +``` +{% include copy-curl.html %} + +## Update Workspaces API + +You can use the following endpoint to update the attributes and permissions for a workspace: + +```json +PUT <osd host>:<port>/api/workspaces/<id> +``` +{% include copy-curl.html %} + +The following table lists the available path parameters. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `<id>` | String | Required | Identifies the unique workspace to be retrieved. 
| +| `attributes` | Object | Required | Defines the workspace attributes. | +| `permissions` | Object | Optional | Specifies the permissions for the workspace. | + +#### Example request + +```json +PUT api/workspaces/eHVoCJ +{ + "attributes": { + "name": "test4", + "description": "test update" + } +} +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": true +} +``` +{% include copy-curl.html %} + +## Delete Workspaces API + +You can use the following endpoint to delete a workspace: + +```json +DELETE <osd host>:<port>/api/workspaces/<id> +``` +{% include copy-curl.html %} + +The following table lists the available path parameters. All path parameters are required. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `<id>` | String | Required | Identifies the unique workspace to be retrieved. | + +#### Example request + +```json +DELETE api/workspaces/eHVoCJ +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": true +} +``` +{% include copy-curl.html %} + +## Duplicate Saved Objects Workspaces API + +You can use the following endpoint to copy saved objects between workspaces: + +```json +POST <osd host>:<port>/api/workspaces/_duplicate_saved_objects +``` +{% include copy-curl.html %} + +The following table lists the available path parameters. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `objects` | Array | Required | Specifies the saved objects to be duplicated. | +| `targetWorkspace` | String | Required | Identifies the destination workspace for copying. | +| `includeReferencesDeep` | Boolean | Optional | Determines whether to copy all referenced objects to the target workspace. Default is `true`. | + +The following table lists the attributes of the object in the `objects` parameter. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `type` | String | Required | Defines the saved object classification, such as `index-pattern`, `config`, or `dashboard`. | +| `id` | String | Required | The ID of the saved object. | + +#### Example request + +```json +POST api/workspaces/_duplicate_saved_objects +{ + "objects": [ + { + "type": "index-pattern", + "id": "619cc200-ecd0-11ee-95b1-e7363f9e289d" + } + ], + "targetWorkspace": "9gt4lB" +} +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "successCount": 1, + "success": true, + "successResults": [ + { + "type": "index-pattern", + "id": "619cc200-ecd0-11ee-95b1-e7363f9e289d", + "meta": { + "title": "test*", + "icon": "indexPatternApp" + }, + "destinationId": "f4b724fd-9647-4bbf-bf59-610b43a62c75" + } + ] +} +``` +{% include copy-curl.html %} + +## Associate Saved Objects Workspaces API + +You can use the following endpoint to associate saved objects with a workspace: + +```json +POST <osd host>:<port>/api/workspaces/_associate +``` +{% include copy-curl.html %} + +The following table lists the available path parameters. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `workspaceId` | String | Required | Identifies the target workspace for object association. | +| `savedObjects` | Array | Required | Specifies the list of saved objects to be copied. | + +The following table lists the attributes of the object in the `objects` parameter. 
+ +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `type` | String | Required | Defines the saved object classification, such as `index-pattern`, `config`, or `dashboard`. | +| `id` | String | Required | The ID of the saved object. | + +#### Example request + +```json +POST api/workspaces/_associate +{ + "objects": [ + { + "type": "index-pattern", + "id": "619cc200-ecd0-11ee-95b1-e7363f9e289d" + } + ], + "targetWorkspace": "9gt4lB" +} +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": [ + { + "id": "619cc200-ecd0-11ee-95b1-e7363f9e289d", + } + ] +} +``` +{% include copy-curl.html %} + +## Dissociate Saved Objects Workspaces API + +You can use the following endpoint to dissociate saved objects from a workspace: + +```json +POST <osd host>:<port>/api/workspaces/_dissociate +``` +{% include copy-curl.html %} + +The following table lists the available path parameters. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `workspaceId` | String | Required | The target workspace with which to associate the objects. | +| `savedObjects` | Array | Required | A list of saved objects to copy. | + +The following table lists the attributes of the `savedObjects` parameter. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `type` | String | Required | The type of the saved object, such as `index-pattern`, `config`, or `dashboard`. | +| `id` | String | Required | The ID of the saved object. | + +#### Example request + +```json +POST api/workspaces/_dissociate +{ + "objects": [ + { + "type": "index-pattern", + "id": "619cc200-ecd0-11ee-95b1-e7363f9e289d" + } + ], + "targetWorkspace": "9gt4lB" +} +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": [ + { + "id": "619cc200-ecd0-11ee-95b1-e7363f9e289d", + } + ] +} +``` +{% include copy-curl.html %} diff --git a/_dashboards/workspace/create-workspace.md b/_dashboards/workspace/create-workspace.md new file mode 100644 index 00000000000..34ba65bb540 --- /dev/null +++ b/_dashboards/workspace/create-workspace.md @@ -0,0 +1,52 @@ +--- +layout: default +title: Create a workspace +parent: Workspace for OpenSearch Dashboards +nav_order: 1 +--- + +# Create a workspace +Introduced 2.18 +{: .label .label-purple } + +Before getting started with this tutorial, you must enable the workspace feature flag. See [Enabling the ACL feature]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace/#enabling-the-workspace-feature) for more information. + +When the saved objects permission is enabled, only users with admin status can create workspaces. See [Configuring the dashboard admin]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/#configuring-dashboard-administrators) for more information. + +To create a workspace, follow these steps: + +1. Open OpenSearch Dashboards. +2. From the main page, choose the appropriate card for your use case, for example, **Observability**, **Security Analytics**, **Search**, **Essentials**, or **Analytics**. Alternatively, you can select the **Create workspace** button and choose the appropriate use case from the dropdown menu. +3. Enter the required information in the **Workspace details** window. + - **Workspace name** is required. 
Valid characters are `a-z`, `A-Z`, `0-9`, parentheses (`()`), brackets (`[]`), underscore (`_`), hyphen (`-`), and spaces. Choose a unique workspace name within the character limit (40 characters). The **Create workspace** button is disabled when the workspace name already exists or exceeds the character limit, and an error message appears. + - **Use case and features** is required. Choose the use case that best fits your needs. If you are using Amazon OpenSearch Serverless and have enabled the [multiple data sources]({{site.url}}{{site.baseurl}}/dashboards/management/data-sources/) feature, **Essentials** is automatically assigned. +4. (Optional) Select the color picker to customize the color of your workspace icon. +5. (Optional) Add a workspace description of up to 200 characters. This option is disabled when the description exceeds the character limit. +6. Save your workspace. + - The **Create workspace** button becomes active once you enter the information for all required fields. You become the workspace owner automatically. The system redirects you to either the collaborators page if the saved objects permission is enabled or the overview page if the saved objects permission is disabled. See [Configuring dashboard admin]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/#configuring-dashboard-administrators) for more information about permissions. + +To set up permissions, see [Workspace access control lists]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/) for more information. + +## Associating data sources with a workspace + +The **Associate data source** option is only visible when the multiple data sources feature is enabled. Before creating your workspace, you must connect it with at least one data source. If you have not set up your data sources, see [Data sources]({{site.url}}{{site.baseurl}}/dashboards/management/data-sources/). Once your sources are connected, you can link them to your new workspace. +{: .warning} + +### Associating OpenSearch data sources + +To associate OpenSearch data sources, follow these steps: + +1. Select the **Associate OpenSearch Data Sources** button to open the selection modal. +2. View the available data sources in the modal: + - Standard OpenSearch sources appear as single entries. + - Sources with direct query connections show a +N indicator. +3. Select the appropriate data source name(s). +4. Select the **Associate data sources** button to complete the association. + +### Associating direct query sources + +To associate direct query sources, follow these steps: + +1. Select the **Associate direct query data sources** button to open the selection modal. The modal displays only sources with direct query connections. +2. Select a data source to automatically expand its direct query connections. +3. Select the **Associate data sources** button to complete the association. diff --git a/_dashboards/workspace/index.md b/_dashboards/workspace/index.md new file mode 100644 index 00000000000..f0f572a4a5b --- /dev/null +++ b/_dashboards/workspace/index.md @@ -0,0 +1,27 @@ +--- +layout: default +title: Getting started with workspaces +parent: Workspace for OpenSearch Dashboards +nav_order: 0 +--- + +# Getting started with workspaces +Introduced 2.18 +{: .label .label-purple } + +OpenSearch Dashboards 2.18 introduces an enhanced home page that provides a comprehensive view of all your workspaces. + +The new home page includes the following features: + +1. 
A **Create workspace** button for [OpenSearch Dashboard admins]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/#configuring-dashboard-administrators) to navigate to the [create workspace]({{site.url}}{{site.baseurl}}/dashboards/workspace/create-workspace) page.
+2. Workspace access time information and a link to the workspace overview page.
+3. A use case information icon that displays information about the workspace's purpose.
+4. A **View all workspaces** button that navigates to the [workspace management]({{site.url}}{{site.baseurl}}/dashboards/workspace/manage-workspace/#navigating-the-workspaces-list) page.
+5. Links to the latest OpenSearch documentation through the **Learn more from documentation** button and to [OpenSearch Playground](https://playground.opensearch.org/app/home#/) through the **Explore live demo environment at playground.opensearch.org** button.
+
+The navigation logic ensures a seamless user experience by directing you to the appropriate page based on your workspace access level:
+
+- If you have a default workspace configured, you are directed to the workspace overview page.
+- If you have only one workspace, you are directed to the overview page of that workspace.
+- If you have multiple workspaces, you are directed to the new home page.
+- If you have no workspaces, you are directed to the new home page.
diff --git a/_dashboards/workspace/manage-workspace.md
new file mode 100644
index 00000000000..45733d75be3
--- /dev/null
+++ b/_dashboards/workspace/manage-workspace.md
@@ -0,0 +1,118 @@
+---
+layout: default
+title: Manage workspaces
+parent: Workspace for OpenSearch Dashboards
+nav_order: 2
+---
+
+# Manage workspaces
+Introduced 2.18
+{: .label .label-purple }
+
+You can access and modify the workspace details, including name, description, use case, and icon color, on the **Workspace details** page.
+
+To access and modify your workspace details, follow these steps:
+
+1. Open OpenSearch Dashboards and navigate to **My Workspaces**.
+2. Choose the desired workspace and then select the **Edit** button to make changes.
+3. Select the **Save** button to confirm changes or the **Discard changes** button to cancel modifications.
+
+## Workspace update permissions
+
+The following permissions apply when changing workspaces:
+
+1. **Without the Security plugin:** All users can edit and update the workspace.
+2. **With the Security plugin installed and `savedObjects.permission.enabled: false` in the `config/opensearch_dashboards.yml` file:** All users can edit and update workspaces.
+3. **With the Security plugin and `savedObjects.permission.enabled: true` in the `config/opensearch_dashboards.yml` file:** Only the [workspace owner]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/#defining-workspace-collaborators) and the [workspace admins]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/#configuring-dashboard-administrators) can edit and update workspaces.
+
+## Workspace update restrictions
+
+When updating workspace use cases, the following rules apply.
+
+Original use case | Target use case |
+:---: | :---:
+Analytics | Cannot be changed to any other use case
+Search | Analytics
+Security analytics | Analytics
+Observability | Analytics
+Essentials | Analytics<br> Search<br> Security Analytics<br> Observability
+
+## Workspace control panel
+
+The **Workspace details** page features the following buttons in the upper-right corner:
+1.
**Delete** ({::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/dashboards/trash-can-icon.png" class="inline-icon" alt="trash can icon"/>{:/} icon) + - **Without the Security plugin installed:** All users can delete the workspace. + - **With the Security plugins installed and `savedObjects.permission.enabled: false` in the `config/opensearch_dashboards.yml` file:** All users can delete the workspace. + - **With the Security plugin installed and `savedObjects.permission.enabled: true` in the `config/opensearch_dashboards.yml` file:** Only the admin can delete the workspace. +2. **Set as default workspace:** Sets the current workspace as the default login destination. +3. **Workspace overview:** Opens the **Overview** page in a new tab. + +## Adding assets to the workspace + +Access the **Sample data** in the navigation menu on the left. Select the appropriate dataset to install it in your cluster and OpenSearch Dashboards. + +## Copying assets between workspaces + +Data sources and configuration copying are not supported. +{: .warning} + +The assets page provides the following methods for copying assets across workspaces: + +1. **Copy all assets to...:** Copies all assets in the table. +2. **Copy to...:** Moves selected assets from the table. +3. **Copy to...:** Copies a single asset from the table. + +After selecting a copy option, choose the target workspace from the dropdown menu. The **Copy related assets** checkbox allows you to transfer associated assets. + +Upon selecting the **Copy** button, a side panel appears showing successful and failed asset transfers. Asset copy destinations depend on the following security configurations: + +1. **Without the Security plugin:** All workspaces are accessible. +2. **With the Security plugin and `savedObjects.permission.enabled: false` in the `config/opensearch_dashboards.yml` file:** All workspaces are accessible. +3. **With the Security plugin and `savedObjects.permission.enabled: true` in the `config/opensearch_dashboards.yml` file:** Only workspaces for which the user has read and write or admin permissions are accessible. + +## Associating data sources + +On the data source management page, you can access a comprehensive list of associated OpenSearch connections, monitor direct query connections relevant to your current workspace, and establish new data source associations as needed. + +### Managing OpenSearch connections + +The OpenSearch connections tab displays all associated connections for the current workspace. Follow these steps to manage your connections: + +1. Access a comprehensive list of associated OpenSearch connections on the connections tab. +2. Use the **Remove association** button to unlink connections as needed. +3. Add new data sources by selecting the **OpenSearch data sources** button and subsequent modal. +4. Select from unassociated OpenSearch connections to expand your workspace's capabilities. + +### Adding direct query connections + +The **Direct query connections** tab displays a list of all direct query connections associated with your current workspace. To add more direct query connections to your workspace, select the **Direct query data sources** button. A modal window opens. + +The association modal displays a list of OpenSearch connections that contain direct query connections and have not yet been associated with your current workspace. 
When you associate an OpenSearch connection with your current workspace, all direct query connections within that OpenSearch connection are automatically associated as well. + +## Deleting your workspace + +Workspace deletion is restricted to admins. If you do not see a {::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/dashboards/trash-can-icon.png" class="inline-icon" alt="trash can icon"/>{:/} icon, check your permissions. See [Configuring dashboard administrators]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/#configuring-dashboard-administrators) for more information. +{: .warning} + +Deleting a workspace permanently erases all its assets (except data sources) and the workspace itself. This action cannot be reversed. + +To delete a workspace, follow these steps: + +1. From the **Workspace details** page, select the {::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/dashboards/trash-can-icon.png" class="inline-icon" alt="trash can icon"/>{:/} icon in the upper-right corner to delete the current workspace. +2. Alternatively, from the workspace list page, select the {::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/ellipsis-icon.png" class="inline-icon" alt="ellipsis icon"/>{:/} icon and select **Delete**. Optionally, select multiple workspaces for bulk deletion. + +## Navigating the workspaces list + +The workspaces list page serves as your central hub for workspace management, displaying all workspaces for which you have access permissions. Key features include the following: + +- Search: Quickly find a workspace by name. +- Filter: Sort workspaces by use case. +- At a glance: View each workspace's name, use case, description, last update time, and associated data sources. + +Each workspace entry includes an **Actions** column with the following functional buttons. These tools streamline your workspace management, allowing for efficient organization and customization of your OpenSearch Dashboards environment: + +1. Copy ID: One-click copying of the workspace ID. +2. Edit: Direct access to the workspace's detailed configuration page. +3. Set as default: Easily set any workspace as your default workspace. +4. Delete: Remove workspaces as needed (may require admin privileges). diff --git a/_dashboards/workspace/workspace-acl.md b/_dashboards/workspace/workspace-acl.md new file mode 100644 index 00000000000..a3779cfbe0d --- /dev/null +++ b/_dashboards/workspace/workspace-acl.md @@ -0,0 +1,189 @@ +--- +layout: default +title: Workspace access control lists +parent: Workspace for OpenSearch Dashboards +nav_order: 3 +--- + +# Workspace access control lists +Introduced 2.18 +{: .label .label-purple } + +Workspace access control lists (ACLs) manage authorization for saved objects `AuthZ(Authorization)` while enabling [Security in OpenSearch]({{site.url}}{{site.baseurl}}/security/) for `AuthN(Authentication)`. + +## Personas + +**Workspace** use cases involve the following key personas: + +* **Dashboard admin:** Has full access to all OpenSearch Dashboards functions and data. +* **Workspace administrator (also called _owner_):** Has full control over a specific workspace, including its configuration and saved objects. When a workspace is created, its creator is automatically assigned the role of workspace owner. +* **Workspace content producer:** Can view, create, and update saved objects within the workspace. +* **Workspace viewer:** Has read-only access to saved objects in the workspace. 
+
+  Roles are workspace specific, allowing users to assume different roles across workspaces.
+  {: .note}
+
+## Enabling permission control
+
+See [Enabling the ACL feature]({{site.url}}{{site.baseurl}}/dashboards/management/acl#enabling-the-acl-feature) for instructions.
+
+## Configuring dashboard administrators
+
+To grant full access to all workspaces and objects in OpenSearch Dashboards, configure the admin permissions. Edit the `opensearch_dashboards.yml` file to define the admin by user ID and backend role, as shown in the following configuration:
+
+```yaml
+opensearchDashboards.dashboardAdmin.users: ["UserID"]
+opensearchDashboards.dashboardAdmin.groups: ["BackendRole"]
+savedObjects.permission.enabled: true
+```
+{% include copy.html %}
+
+By default, the configuration is set to `[]`, meaning that no users are designated as admins. If the Security plugin is not installed and `savedObjects.permission.enabled: false`, all users are granted admin permissions.
+
+### Configuring global admin access
+
+Set all users as admins with this wildcard setting:
+
+```yaml
+opensearchDashboards.dashboardAdmin.users: ["*"]
+```
+{% include copy.html %}
+
+### Configuring admin access for a single user
+
+Configure a user with the `admin-user-id` setting:
+
+```yaml
+opensearchDashboards.dashboardAdmin.users: ["admin-user-id"]
+```
+{% include copy.html %}
+
+### Configuring admin access by backend role
+
+Configure a user with the `admin-role` setting:
+
+```yaml
+opensearchDashboards.dashboardAdmin.groups: ["admin-role"]
+```
+{% include copy.html %}
+
+### Admin-restricted operations
+
+Admin-restricted operations include the following:
+
+- Workspace creation
+- Workspace deletion
+- Data source connections
+- Disconnecting data sources from workspaces
+
+## Defining workspace collaborators
+
+Access to collaborator management is limited to admins. The **Collaborators** feature is only available when permission control is enabled. For instructions on activating permission control, see [Enabling permission control](#enabling-permission-control). The access levels include the following:
+
+- **Read only:** Grants permission to view the workspace and its assets.
+- **Read and write:** Allows viewing and editing of assets within the workspace.
+- **Admin:** Provides full access, including viewing and editing of assets within the workspace and updating workspace metadata, such as name, description, data sources, and collaborators.
+
+From the **Collaborators** page, you can search by collaborator ID and filter results by collaborator type and access level.
+
+### Adding collaborators
+
+Workspace creators are granted the **Admin** access level as a collaborator. To add more collaborators, select the **Add collaborators** button, which displays a dropdown menu. Choose **Add Users** or **Add Groups** to access the corresponding modal for adding new collaborators.
+
+#### Adding users
+
+To add users, follow these steps:
+
+1. Select the **Add Users** button to open the modal. The modal displays one empty `User ID` field by default.
+2. Choose an access level: **Read only**, **Read and write**, or **Admin**.
+3. Choose **Add another User** to add multiple users. Do not use duplicate or existing `User ID` fields to avoid errors.
+4. Resolve any errors before finalizing. Successfully added users appear in the collaborators table.
+
+#### Adding groups
+
+To add groups, follow these steps:
+
+1. Select the **Add Groups** button to open the modal.
The modal displays one empty `Group ID` field by default. +2. Choose an access level: **Read only**, **Read and write**, or **Admin**. +3. Use **Add another group** to add multiple groups. Do not use duplicate or existing `Group ID` fields to avoid errors. +4. Resolve any errors before finalizing. Successfully added users appear in the collaborators table. + +### Modifying access levels + +You can modify collaborators access levels after adding them to the collaborators table if you have the required permissions. Collaborators can be assigned any access level. However, if all **Admin** collaborators are changed to lower access levels, then only admins can manage workspace collaboration. + +#### Modifying individual access levels + +To modify a single collaborator's access level, follow these steps: + +1. Select the action icon on the right of the table row. +2. Select **Change access level** from the dropdown menu. +3. Choose the desired access level from the list. +4. Confirm the change in the modal that appears and select **Confirm**. The collaborator's access level is updated in the table upon confirmation. + +#### Modifying access levels in batch + +To change access levels for several collaborators simultaneously, follow these steps: + +1. Select the desired collaborator rows in the table. +2. Select the **Actions** button that appears. +3. Select **Change access level** from the dropdown menu. +4. Select the new access level from the list provided. +5. Review and confirm the changes in the modal that appears. The access levels for all selected collaborators are updated in the table upon confirmation. + +### Deleting collaborators + +After adding collaborators to the table, you have the option to delete them. Be cautious when removing admin collaborators because deleting all of them restricts workspace collaborator management to admins only. A confirmation modal is displayed before finalizing this action. + +#### Deleting individual collaborators + +To delete an individual collaborator, follow these steps: + +1. Select the {::nomarkdown}<img src="{{site.url}}{{site.baseurl}}/images/ellipsis-icon.png" class="inline-icon" alt="ellipsis icon"/>{:/} icon on the right of the table row to display a dropdown menu. +2. Select **Delete collaborator** from the dropdown menu. A confirmation modal appears to verify your action. +3. Select **Confirm** in the modal to remove the collaborator from the table. + +#### Deleting collaborators in batch + +To remove several collaborators simultaneously, follow these steps: + +1. Select the rows containing the collaborators you want to remove from the table. A "Delete x collaborators" button appears. +2. Select the **Delete x collaborators** button. +3. Review the confirmation modal that appears. +4. Select **Confirm** to remove all selected collaborators from the table. + +## Configuring workspace privacy + +When permission control is enabled, workspace administrators can set one of the following three access levels: + +* **Private to collaborators (Default):** Only workspace collaborators can access the workspace. +* **Anyone can view:** Grants **Read only** permissions to all workspace users, allowing them to view workspace assets. +* **Anyone can edit:** Grants **Read and write** permissions to all users, allowing them to view, create, and update workspace assets. + +Collaborators are granted higher permissions when their individual access level differs from that set in the workspace settings. 
For example, if workspace privacy is set to "Anyone can edit", any collaborator with read-only access will also be able to edit workspace assets.
+In other words, a collaborator is effectively granted the higher of their individual access level and the workspace privacy setting.
+You can set up workspace privacy on the **Create workspace** page as a **Dashboard admin**. You can also modify it on the **Collaborators** or **Workspace details** pages as a **Workspace admin** or **Dashboard admin**.
+
+### Setting up workspace privacy during workspace creation
+
+Use the following steps to change workspace privacy settings when creating a new workspace:
+
+1. Choose the desired access level from the **Set up privacy** panel.
+2. (Optional) Decide whether to add collaborators after workspace creation by selecting the **Add collaborators after workspace creation.** checkbox.
+3. Select **Create workspace** to create the workspace.
+
+### Modifying workspace privacy on the **Collaborators** page
+
+Use the following steps to edit the workspace privacy settings on the **Collaborators** page:
+
+1. Next to **Workspace privacy**, select **Edit**.
+2. Select the new access level from the dropdown menu.
+3. Select **Save changes** to apply the modifications.
+
+### Modifying workspace privacy on the **Workspace details** page
+
+Use the following steps to edit the workspace privacy settings on the **Workspace details** page:
+
+1. Select the **Edit** button in the upper-right corner of the **Details** panel.
+2. Select the new access level from the dropdown menu.
+3. Select **Save** to apply the modifications.
diff --git a/_dashboards/workspace/workspace.md
new file mode 100644
index 00000000000..0938c48891f
--- /dev/null
+++ b/_dashboards/workspace/workspace.md
@@ -0,0 +1,118 @@
+---
+layout: default
+title: Workspace for OpenSearch Dashboards
+nav_order: 110
+has_children: true
+---
+
+# Workspace for OpenSearch Dashboards
+Introduced 2.18
+{: .label .label-purple }
+
+The Workspace feature in OpenSearch Dashboards enables you to tailor your environment with use-case-specific configurations. For example, you can create dedicated workspaces for observability scenarios, allowing you to focus on relevant functionalities. Additionally, the Workspace feature enables organization of visual assets, such as dashboards and visualizations, within a workspace with isolated storage.
+
+## Workspace data model
+
+The Workspace data model is defined by the following structure:
+
+```typescript
+interface Workspace {
+  id: string;
+  name: string;
+  description?: string;
+  features?: string[];
+  color: string;
+  uiSettings: Record<string, unknown>;
+}
+```
+{% include copy-curl.html %}
+
+The Workspace data model is composed of the following key attributes:
+
+- `id`: String type; unique ID for each workspace.
+- `name`: String type; designates the name of the workspace.
+- `description`: Optional string type; provides contextual information for the workspace.
+- `features`: Optional array of strings; contains use case IDs linked to the workspace.
+
+---
+
+#### Example Workspace object
+
+The following object shows a typical Workspace configuration:
+
+```typescript
+{
+  id: "M5NqCu",
+  name: "Analytics team",
+  description: "Analytics team workspace",
+  features: ["use-case-analytics"],
+}
+```
+{% include copy-curl.html %}
+
+The configuration creates the `Analytics team` workspace using the `use-case-analytics` feature set.
Use cases map to specific feature groups, limiting functionality to the defined set within each workspace. + +The following are predefined use case options: + +- `use-case-observability` +- `use-case-security-analytics` +- `use-case-search` +- `use-case-essentials` +- `use-case-all` + +--- + +## Associating saved objects with workspaces + +Saved objects in OpenSearch Dashboards, such as dashboards, visualizations, and index patterns, can be associated with specific workspaces, improving organization and accessibility as the volume of objects grows. + +The `workspaces` attribute, an array of strings, is added to saved objects to be linked with one or more workspaces. As a result, saved objects such as dashboards and visualizations are only accessible within their designated workspaces. + +The following saved object shows a dashboard object associated with the workspace `M5NqCu`: + +```typescript +{ + type: "dashboard", + id: "da123f20-6680-11ee-93fa-df944ec23359", + workspaces: ["M5NqCu"] +} +``` +{% include copy-curl.html %} + +Saved objects support association with multiple workspaces, facilitating cross-team collaboration and resource sharing. This feature is useful when an object is relevant to multiple teams, projects, or use cases. + +The following example shows a data source object linked to multiple workspaces: + +```typescript +{ + type: "data-source", + id: "da123f20-6680-11ee-93fa-df944ec23359", + workspaces: ["M5NqCu", "<TeamA-workspace-id>", "<Analytics-workspace-id>"] +} +``` +{% include copy-curl.html %} + +## Non-workspace saved objects + +Not all saved objects in OpenSearch Dashboards are associated with a workspace. Some objects operate independently of the workspace framework. These objects lack `workspace` attributes and serve system-wide functions. For example, the global user interface settings object manages configurations affecting the entire OpenSearch Dashboards interface in order to maintain consistent functionality across all workspaces. + +This dual approach allows OpenSearch Dashboards to balance granular, context-specific customization with overall system consistency. + +## Enabling the Workspace feature + +In your `opensearch_dashboards.yml` file, set the following option: + +```yaml +workspace.enabled: true +uiSettings: + overrides: + "home:useNewHomePage": true +``` +{% include copy-curl.html %} + +If your cluster has the Security plugin installed, then multi-tenancy must be disabled to avoid conflicts with similar workspaces: + +```yaml +opensearch_security.multitenancy.enabled: false +``` +{% include copy-curl.html %} diff --git a/_data-prepper/common-use-cases/anomaly-detection.md b/_data-prepper/common-use-cases/anomaly-detection.md index e7003558f18..7d3bbcb3900 100644 --- a/_data-prepper/common-use-cases/anomaly-detection.md +++ b/_data-prepper/common-use-cases/anomaly-detection.md @@ -7,7 +7,7 @@ nav_order: 5 # Anomaly detection -You can use Data Prepper to train models and generate anomalies in near real time on time-series aggregated events. You can generate anomalies either on events generated within the pipeline or on events coming directly into the pipeline, like OpenTelemetry metrics. You can feed these tumbling window aggregated time-series events to the [`anomaly_detector` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/anomaly-detector/), which trains a model and generates anomalies with a grade score. 
Then you can configure your pipeline to write the anomalies to a separate index to create document monitors and trigger fast alerting. +You can use OpenSearch Data Prepper to train models and generate anomalies in near real time on time-series aggregated events. You can generate anomalies either on events generated within the pipeline or on events coming directly into the pipeline, like OpenTelemetry metrics. You can feed these tumbling window aggregated time-series events to the [`anomaly_detector` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/anomaly-detector/), which trains a model and generates anomalies with a grade score. Then you can configure your pipeline to write the anomalies to a separate index to create document monitors and trigger fast alerting. ## Metrics from logs diff --git a/_data-prepper/common-use-cases/codec-processor-combinations.md b/_data-prepper/common-use-cases/codec-processor-combinations.md index 525bc704bef..279c7d530bd 100644 --- a/_data-prepper/common-use-cases/codec-processor-combinations.md +++ b/_data-prepper/common-use-cases/codec-processor-combinations.md @@ -7,7 +7,7 @@ nav_order: 10 # Codec processor combinations -At ingestion time, data received by the [`s3` source]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sources/s3/) can be parsed by [codecs]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sources/s3#codec). Codecs compresses and decompresses large data sets in a certain format before ingestion them through a Data Prepper pipeline [processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/processors/). +At ingestion time, data received by the [`s3` source]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sources/s3/) can be parsed by [codecs]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sources/s3#codec). Codecs compresses and decompresses large data sets in a certain format before ingestion them through an OpenSearch Data Prepper pipeline [processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/processors/). While most codecs can be used with most processors, the following codec processor combinations can make your pipeline more efficient when used with the following input types. diff --git a/_data-prepper/common-use-cases/common-use-cases.md b/_data-prepper/common-use-cases/common-use-cases.md index 342a8fc819e..adca11418b7 100644 --- a/_data-prepper/common-use-cases/common-use-cases.md +++ b/_data-prepper/common-use-cases/common-use-cases.md @@ -9,4 +9,4 @@ redirect_from: # Common use cases -You can use Data Prepper for several different purposes, including trace analytics, log analytics, Amazon S3 log analytics, and metrics ingestion. \ No newline at end of file +You can use OpenSearch Data Prepper for several different purposes, including trace analytics, log analytics, Amazon S3 log analytics, and metrics ingestion. \ No newline at end of file diff --git a/_data-prepper/common-use-cases/event-aggregation.md b/_data-prepper/common-use-cases/event-aggregation.md index f6e2757d9a3..4e1464b5059 100644 --- a/_data-prepper/common-use-cases/event-aggregation.md +++ b/_data-prepper/common-use-cases/event-aggregation.md @@ -7,7 +7,7 @@ nav_order: 25 # Event aggregation -You can use Data Prepper to aggregate data from different events over a period of time. 
Aggregating events can help to reduce unnecessary log volume and manage use cases like multiline logs that are received as separate events. The [`aggregate` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/aggregate/) is a stateful processor that groups events based on the values for a set of specified identification keys and performs a configurable action on each group. +You can use OpenSearch Data Prepper to aggregate data from different events over a period of time. Aggregating events can help to reduce unnecessary log volume and manage use cases like multiline logs that are received as separate events. The [`aggregate` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/aggregate/) is a stateful processor that groups events based on the values for a set of specified identification keys and performs a configurable action on each group. The `aggregate` processor state is stored in memory. For example, in order to combine four events into one, the processor needs to retain pieces of the first three events. The state of an aggregate group of events is kept for a configurable amount of time. Depending on your logs, the aggregate action being used, and the number of memory options in the processor configuration, the aggregation could take place over a long period of time. diff --git a/_data-prepper/common-use-cases/log-analytics.md b/_data-prepper/common-use-cases/log-analytics.md index ceb26ff5b78..242e16dfe94 100644 --- a/_data-prepper/common-use-cases/log-analytics.md +++ b/_data-prepper/common-use-cases/log-analytics.md @@ -7,7 +7,7 @@ nav_order: 30 # Log analytics -Data Prepper is an extendable, configurable, and scalable solution for log ingestion into OpenSearch and Amazon OpenSearch Service. Data Prepper supports receiving logs from [Fluent Bit](https://fluentbit.io/) through the [HTTP Source](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-plugins/http-source/README.md) and processing those logs with a [Grok Processor](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-plugins/grok-processor/README.md) before ingesting them into OpenSearch through the [OpenSearch sink](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-plugins/opensearch/README.md). +OpenSearch Data Prepper is an extendable, configurable, and scalable solution for log ingestion into OpenSearch and Amazon OpenSearch Service. Data Prepper supports receiving logs from [Fluent Bit](https://fluentbit.io/) through the [HTTP Source](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-plugins/http-source/README.md) and processing those logs with a [Grok Processor](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-plugins/grok-processor/README.md) before ingesting them into OpenSearch through the [OpenSearch sink](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-plugins/opensearch/README.md). The following image shows all of the components used for log analytics with Fluent Bit, Data Prepper, and OpenSearch. 
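To make the ingestion path concrete, the following is a minimal sketch of such a pipeline: Fluent Bit forwards records to Data Prepper's HTTP source, the Grok processor parses the `log` field, and the results are written to OpenSearch. The port, credentials, and index name are placeholders, and the Grok pattern assumes Apache-style access logs:

```yaml
log-pipeline:
  source:
    http:
      # Fluent Bit's http output can be pointed at this port.
      port: 2021
  processor:
    - grok:
        match:
          # Parse the raw line sent by Fluent Bit in the "log" key.
          log: [ "%{COMMONAPACHELOG}" ]
  sink:
    - opensearch:
        hosts: [ "https://localhost:9200" ]
        # Placeholder credentials; use your own security configuration.
        username: admin
        password: <admin-password>
        index: application_logs
```
{% include copy.html %}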
diff --git a/_data-prepper/common-use-cases/log-enrichment.md b/_data-prepper/common-use-cases/log-enrichment.md index 0d8ce4ab7d3..c09fdec6030 100644 --- a/_data-prepper/common-use-cases/log-enrichment.md +++ b/_data-prepper/common-use-cases/log-enrichment.md @@ -7,7 +7,7 @@ nav_order: 35 # Log enrichment -You can perform different types of log enrichment with Data Prepper, including: +You can perform different types of log enrichment with OpenSearch Data Prepper, including: - Filtering. - Extracting key-value pairs from strings. diff --git a/_data-prepper/common-use-cases/metrics-logs.md b/_data-prepper/common-use-cases/metrics-logs.md index 3fda8597c7c..fc0518ce261 100644 --- a/_data-prepper/common-use-cases/metrics-logs.md +++ b/_data-prepper/common-use-cases/metrics-logs.md @@ -7,7 +7,7 @@ nav_order: 15 # Deriving metrics from logs -You can use Data Prepper to derive metrics from logs. +You can use OpenSearch Data Prepper to derive metrics from logs. The following example pipeline receives incoming logs using the [`http` source plugin]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sources/http-source) and the [`grok` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/grok/). It then uses the [`aggregate` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/aggregate/) to extract the metric bytes aggregated during a 30-second window and derives histograms from the results. diff --git a/_data-prepper/common-use-cases/metrics-traces.md b/_data-prepper/common-use-cases/metrics-traces.md index c15eaa099b8..2cd0dafbb75 100644 --- a/_data-prepper/common-use-cases/metrics-traces.md +++ b/_data-prepper/common-use-cases/metrics-traces.md @@ -7,7 +7,7 @@ nav_order: 20 # Deriving metrics from traces -You can use Data Prepper to derive metrics from OpenTelemetry traces. The following example pipeline receives incoming traces and extracts a metric called `durationInNanos`, aggregated over a tumbling window of 30 seconds. It then derives a histogram from the incoming traces. +You can use OpenSearch Data Prepper to derive metrics from OpenTelemetry traces. The following example pipeline receives incoming traces and extracts a metric called `durationInNanos`, aggregated over a tumbling window of 30 seconds. It then derives a histogram from the incoming traces. The pipeline contains the following pipelines: diff --git a/_data-prepper/common-use-cases/s3-logs.md b/_data-prepper/common-use-cases/s3-logs.md index 8d5a9ce9673..2f93c1281d2 100644 --- a/_data-prepper/common-use-cases/s3-logs.md +++ b/_data-prepper/common-use-cases/s3-logs.md @@ -7,7 +7,7 @@ nav_order: 40 # S3 logs -Data Prepper allows you to load logs from [Amazon Simple Storage Service](https://aws.amazon.com/s3/) (Amazon S3), including traditional logs, JSON documents, and CSV logs. +OpenSearch Data Prepper allows you to load logs from [Amazon Simple Storage Service](https://aws.amazon.com/s3/) (Amazon S3), including traditional logs, JSON documents, and CSV logs. 
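As a rough sketch, such a pipeline typically pairs the `s3` source with Amazon SQS event notifications, which are described in the architecture section that follows. The queue URL, role ARN, Region, and index name below are placeholders, and the `newline` codec is assumed to place each log line in the `message` field:

```yaml
s3-log-pipeline:
  source:
    s3:
      # Read objects when S3 event notifications arrive on an SQS queue.
      notification_type: sqs
      codec:
        newline:
      sqs:
        queue_url: "https://sqs.us-east-1.amazonaws.com/123456789012/s3-log-queue"
      aws:
        region: us-east-1
        sts_role_arn: "arn:aws:iam::123456789012:role/data-prepper-s3-access"
  processor:
    - grok:
        match:
          message: [ "%{COMMONAPACHELOG}" ]
  sink:
    - opensearch:
        hosts: [ "https://localhost:9200" ]
        index: s3_access_logs
```
{% include copy.html %}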
## Architecture diff --git a/_data-prepper/common-use-cases/sampling.md b/_data-prepper/common-use-cases/sampling.md index 7c77e8c3f2f..47bead46499 100644 --- a/_data-prepper/common-use-cases/sampling.md +++ b/_data-prepper/common-use-cases/sampling.md @@ -7,7 +7,7 @@ nav_order: 45 # Sampling -Data Prepper provides the following sampling capabilities: +OpenSearch Data Prepper provides the following sampling capabilities: - Time sampling - Percentage sampling diff --git a/_data-prepper/common-use-cases/text-processing.md b/_data-prepper/common-use-cases/text-processing.md index 041ca63ab26..1fc81c5d981 100644 --- a/_data-prepper/common-use-cases/text-processing.md +++ b/_data-prepper/common-use-cases/text-processing.md @@ -7,7 +7,7 @@ nav_order: 55 # Text processing -Data Prepper provides text processing capabilities with the [`grok processor`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/grok/). The `grok` processor is based on the [`java-grok`](https://mvnrepository.com/artifact/io.krakens/java-grok) library and supports all compatible patterns. The `java-grok` library is built using the [`java.util.regex`](https://docs.oracle.com/javase/8/docs/api/java/util/regex/package-summary.html) regular expression library. +OpenSearch Data Prepper provides text processing capabilities with the [`grok processor`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/grok/). The `grok` processor is based on the [`java-grok`](https://mvnrepository.com/artifact/io.krakens/java-grok) library and supports all compatible patterns. The `java-grok` library is built using the [`java.util.regex`](https://docs.oracle.com/javase/8/docs/api/java/util/regex/package-summary.html) regular expression library. You can add custom patterns to your pipelines by using the `patterns_definitions` option. When debugging custom patterns, the [Grok Debugger](https://grokdebugger.com/) can be helpful. diff --git a/_data-prepper/common-use-cases/trace-analytics.md b/_data-prepper/common-use-cases/trace-analytics.md index 1a961077fef..a91f37823ce 100644 --- a/_data-prepper/common-use-cases/trace-analytics.md +++ b/_data-prepper/common-use-cases/trace-analytics.md @@ -7,7 +7,7 @@ nav_order: 60 # Trace analytics -Trace analytics allows you to collect trace data and customize a pipeline that ingests and transforms the data for use in OpenSearch. The following provides an overview of the trace analytics workflow in Data Prepper, how to configure it, and how to visualize trace data. +Trace analytics allows you to collect trace data and customize a pipeline that ingests and transforms the data for use in OpenSearch. The following provides an overview of the trace analytics workflow in OpenSearch Data Prepper, how to configure it, and how to visualize trace data. ## Introduction diff --git a/_data-prepper/getting-started.md b/_data-prepper/getting-started.md index 624cd5fcbc3..5dc90316d0f 100644 --- a/_data-prepper/getting-started.md +++ b/_data-prepper/getting-started.md @@ -1,14 +1,14 @@ --- layout: default -title: Getting started +title: Getting started with OpenSearch Data Prepper nav_order: 5 redirect_from: - /clients/data-prepper/get-started/ --- -# Getting started with Data Prepper +# Getting started with OpenSearch Data Prepper -Data Prepper is an independent component, not an OpenSearch plugin, that converts data for use with OpenSearch. It's not bundled with the all-in-one OpenSearch installation packages. 
+OpenSearch Data Prepper is an independent component, not an OpenSearch plugin, that converts data for use with OpenSearch. It's not bundled with the all-in-one OpenSearch installation packages. If you are migrating from Open Distro Data Prepper, see [Migrating from Open Distro]({{site.url}}{{site.baseurl}}/data-prepper/migrate-open-distro/). {: .note} diff --git a/_data-prepper/index.md b/_data-prepper/index.md index e418aa1966b..63ff2fd07c1 100644 --- a/_data-prepper/index.md +++ b/_data-prepper/index.md @@ -1,6 +1,6 @@ --- layout: default -title: Data Prepper +title: OpenSearch Data Prepper nav_order: 1 has_children: false has_toc: false @@ -12,9 +12,9 @@ redirect_from: - /data-prepper/index/ --- -# Data Prepper +# OpenSearch Data Prepper -Data Prepper is a server-side data collector capable of filtering, enriching, transforming, normalizing, and aggregating data for downstream analysis and visualization. Data Prepper is the preferred data ingestion tool for OpenSearch. It is recommended for most data ingestion use cases in OpenSearch and for processing large, complex datasets. +OpenSearch Data Prepper is a server-side data collector capable of filtering, enriching, transforming, normalizing, and aggregating data for downstream analysis and visualization. Data Prepper is the preferred data ingestion tool for OpenSearch. It is recommended for most data ingestion use cases in OpenSearch and for processing large, complex datasets. With Data Prepper you can build custom pipelines to improve the operational view of applications. Two common use cases for Data Prepper are trace analytics and log analytics. [Trace analytics]({{site.url}}{{site.baseurl}}/data-prepper/common-use-cases/trace-analytics/) can help you visualize event flows and identify performance problems. [Log analytics]({{site.url}}{{site.baseurl}}/data-prepper/common-use-cases/log-analytics/) equips you with tools to enhance your search capabilities, conduct comprehensive analysis, and gain insights into your applications' performance and behavior. @@ -74,6 +74,6 @@ In the given pipeline configuration, the `source` component reads string events ## Next steps -- [Get started with Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/). +- [Getting started with OpenSearch Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/). - [Get familiar with Data Prepper pipelines]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/pipelines/). - [Explore common use cases]({{site.url}}{{site.baseurl}}/data-prepper/common-use-cases/common-use-cases/). diff --git a/_data-prepper/managing-data-prepper/configuring-data-prepper.md b/_data-prepper/managing-data-prepper/configuring-data-prepper.md index e42a9e94498..ab5f3aa0667 100644 --- a/_data-prepper/managing-data-prepper/configuring-data-prepper.md +++ b/_data-prepper/managing-data-prepper/configuring-data-prepper.md @@ -1,16 +1,16 @@ --- layout: default -title: Configuring Data Prepper -parent: Managing Data Prepper +title: Configuring OpenSearch Data Prepper +parent: Managing OpenSearch Data Prepper nav_order: 5 redirect_from: - /clients/data-prepper/data-prepper-reference/ - /monitoring-plugins/trace/data-prepper-reference/ --- -# Configuring Data Prepper +# Configuring OpenSearch Data Prepper -You can customize your Data Prepper configuration by editing the `data-prepper-config.yaml` file in your Data Prepper installation. The following configuration options are independent from pipeline configuration options. 
+You can customize your OpenSearch Data Prepper configuration by editing the `data-prepper-config.yaml` file in your Data Prepper installation. The following configuration options are independent from pipeline configuration options. ## Data Prepper configuration diff --git a/_data-prepper/managing-data-prepper/configuring-log4j.md b/_data-prepper/managing-data-prepper/configuring-log4j.md index 175c754abff..fe256e0da5e 100644 --- a/_data-prepper/managing-data-prepper/configuring-log4j.md +++ b/_data-prepper/managing-data-prepper/configuring-log4j.md @@ -1,13 +1,13 @@ --- layout: default title: Configuring Log4j -parent: Managing Data Prepper +parent: Managing OpenSearch Data Prepper nav_order: 20 --- # Configuring Log4j -You can configure logging using Log4j in Data Prepper. +You can configure logging using Log4j in OpenSearch Data Prepper. ## Logging diff --git a/_data-prepper/managing-data-prepper/core-apis.md b/_data-prepper/managing-data-prepper/core-apis.md index b810c7b15ef..eecc4ee73bd 100644 --- a/_data-prepper/managing-data-prepper/core-apis.md +++ b/_data-prepper/managing-data-prepper/core-apis.md @@ -1,13 +1,13 @@ --- layout: default title: Core APIs -parent: Managing Data Prepper +parent: Managing OpenSearch Data Prepper nav_order: 15 --- # Core APIs -All Data Prepper instances expose a server with some control APIs. By default, this server runs on port 4900. Some plugins, especially source plugins, may expose other servers that run on different ports. Configurations for these plugins are independent of the core API. For example, to shut down Data Prepper, you can run the following curl request: +All OpenSearch Data Prepper instances expose a server with some control APIs. By default, this server runs on port 4900. Some plugins, especially source plugins, may expose other servers that run on different ports. Configurations for these plugins are independent of the core API. For example, to shut down Data Prepper, you can run the following curl request: ``` curl -X POST http://localhost:4900/shutdown diff --git a/_data-prepper/managing-data-prepper/extensions/extensions.md b/_data-prepper/managing-data-prepper/extensions/extensions.md index 8cbfc602c7c..80da40767ee 100644 --- a/_data-prepper/managing-data-prepper/extensions/extensions.md +++ b/_data-prepper/managing-data-prepper/extensions/extensions.md @@ -1,14 +1,14 @@ --- layout: default title: Extensions -parent: Managing Data Prepper +parent: Managing OpenSearch Data Prepper has_children: true nav_order: 18 --- # Extensions -Data Prepper extensions provide Data Prepper functionality outside of core Data Prepper pipeline components. +OpenSearch Data Prepper extensions provide Data Prepper functionality outside of core Data Prepper pipeline components. Many extensions provide configuration options that give Data Prepper administrators greater flexibility over Data Prepper's functionality. Extension configurations can be configured in the `data-prepper-config.yaml` file under the `extensions:` YAML block. 
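For illustration, an `extensions:` block in `data-prepper-config.yaml` might be laid out as follows. The `geoip_service` extension is used here only as an example, and the option names shown are assumptions; consult each extension's page for its supported settings.

```yaml
# data-prepper-config.yaml (sketch)
extensions:
  geoip_service:
    maxmind:
      # illustrative options; see the geoip_service extension documentation
      # for the authoritative list of settings
      database_refresh_interval: P3D
      cache_count: 4096
```
{% include copy.html %}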
diff --git a/_data-prepper/managing-data-prepper/extensions/geoip-service.md b/_data-prepper/managing-data-prepper/extensions/geoip-service.md index 53c21a08ff1..157367dce1c 100644 --- a/_data-prepper/managing-data-prepper/extensions/geoip-service.md +++ b/_data-prepper/managing-data-prepper/extensions/geoip-service.md @@ -3,12 +3,12 @@ layout: default title: geoip_service nav_order: 5 parent: Extensions -grand_parent: Managing Data Prepper +grand_parent: Managing OpenSearch Data Prepper --- # geoip_service -The `geoip_service` extension configures all [`geoip`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/geoip) processors in Data Prepper. +The `geoip_service` extension configures all [`geoip`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/geoip) processors in OpenSearch Data Prepper. ## Usage diff --git a/_data-prepper/managing-data-prepper/managing-data-prepper.md b/_data-prepper/managing-data-prepper/managing-data-prepper.md index ea2d1f111c4..204510be248 100644 --- a/_data-prepper/managing-data-prepper/managing-data-prepper.md +++ b/_data-prepper/managing-data-prepper/managing-data-prepper.md @@ -1,10 +1,10 @@ --- layout: default -title: Managing Data Prepper +title: Managing OpenSearch Data Prepper has_children: true nav_order: 20 --- -# Managing Data Prepper +# Managing OpenSearch Data Prepper -You can perform administrator functions for Data Prepper, including system configuration, interacting with core APIs, Log4j configuration, and monitoring. You can set up peer forwarding to coordinate multiple Data Prepper nodes when using stateful aggregation. \ No newline at end of file +You can perform administrator functions for OpenSearch Data Prepper, including system configuration, interacting with core APIs, Log4j configuration, and monitoring. You can set up peer forwarding to coordinate multiple Data Prepper nodes when using stateful aggregation. \ No newline at end of file diff --git a/_data-prepper/managing-data-prepper/monitoring.md b/_data-prepper/managing-data-prepper/monitoring.md index 691f376b332..cb29e49a518 100644 --- a/_data-prepper/managing-data-prepper/monitoring.md +++ b/_data-prepper/managing-data-prepper/monitoring.md @@ -1,13 +1,13 @@ --- layout: default title: Monitoring -parent: Managing Data Prepper +parent: Managing OpenSearch Data Prepper nav_order: 25 --- -# Monitoring Data Prepper with metrics +# Monitoring OpenSearch Data Prepper with metrics -You can monitor Data Prepper with metrics using [Micrometer](https://micrometer.io/). There are two types of metrics: JVM/system metrics and plugin metrics. [Prometheus](https://prometheus.io/) is used as the default metrics backend. +You can monitor OpenSearch Data Prepper with metrics using [Micrometer](https://micrometer.io/). There are two types of metrics: JVM/system metrics and plugin metrics. [Prometheus](https://prometheus.io/) is used as the default metrics backend. ## JVM and system metrics diff --git a/_data-prepper/managing-data-prepper/peer-forwarder.md b/_data-prepper/managing-data-prepper/peer-forwarder.md index f6a0f9890a7..9d54aef87c9 100644 --- a/_data-prepper/managing-data-prepper/peer-forwarder.md +++ b/_data-prepper/managing-data-prepper/peer-forwarder.md @@ -2,12 +2,12 @@ layout: default title: Peer forwarder nav_order: 12 -parent: Managing Data Prepper +parent: Managing OpenSearch Data Prepper --- # Peer forwarder -Peer forwarder is an HTTP service that performs peer forwarding of an `event` between Data Prepper nodes for aggregation. 
This HTTP service uses a hash-ring approach to aggregate events and determine which Data Prepper node it should handle on a given trace before rerouting it to that node. Currently, peer forwarder is supported by the `aggregate`, `service_map_stateful`, and `otel_traces_raw` [processors]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/processors/). +Peer forwarder is an HTTP service that performs peer forwarding of an `event` between OpenSearch Data Prepper nodes for aggregation. This HTTP service uses a hash-ring approach to aggregate events and determine which Data Prepper node it should handle on a given trace before rerouting it to that node. Currently, peer forwarder is supported by the `aggregate`, `service_map_stateful`, and `otel_traces_raw` [processors]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/processors/). Peer Forwarder groups events based on the identification keys provided by the supported processors. For `service_map_stateful` and `otel_traces_raw`, the identification key is `traceId` by default and cannot be configured. The `aggregate` processor is configured using the `identification_keys` configuration option. From here, you can specify which keys to use for Peer Forwarder. See [Aggregate Processor page](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/aggregate-processor#identification_keys) for more information about identification keys. diff --git a/_data-prepper/managing-data-prepper/source-coordination.md b/_data-prepper/managing-data-prepper/source-coordination.md index 3c60b452807..5dc85e50a7c 100644 --- a/_data-prepper/managing-data-prepper/source-coordination.md +++ b/_data-prepper/managing-data-prepper/source-coordination.md @@ -2,12 +2,12 @@ layout: default title: Source coordination nav_order: 35 -parent: Managing Data Prepper +parent: Managing OpenSearch Data Prepper --- # Source coordination -_Source coordination_ is the concept of coordinating and distributing work between Data Prepper data sources in a multi-node environment. Some data sources, such as Amazon Kinesis or Amazon Simple Queue Service (Amazon SQS), handle coordination natively. Other data sources, such as OpenSearch, Amazon Simple Storage Service (Amazon S3), Amazon DynamoDB, and JDBC/ODBC, do not support source coordination. +_Source coordination_ is the concept of coordinating and distributing work between OpenSearch Data Prepper data sources in a multi-node environment. Some data sources, such as Amazon Kinesis or Amazon Simple Queue Service (Amazon SQS), handle coordination natively. Other data sources, such as OpenSearch, Amazon Simple Storage Service (Amazon S3), Amazon DynamoDB, and JDBC/ODBC, do not support source coordination. Data Prepper source coordination decides which partition of work is performed by each node in the Data Prepper cluster and prevents duplicate partitions of work. diff --git a/_data-prepper/migrate-open-distro.md b/_data-prepper/migrate-open-distro.md index 8b3e7a7198c..31a47c56825 100644 --- a/_data-prepper/migrate-open-distro.md +++ b/_data-prepper/migrate-open-distro.md @@ -23,4 +23,4 @@ In your Data Prepper Docker configuration, adjust `amazon/opendistro-for-elastic ## Next steps -For more information about Data Prepper configurations, see [Getting Started with Data Prepper]({{site.url}}{{site.baseurl}}/clients/data-prepper/get-started/). 
+For more information about Data Prepper configurations, see [Getting Started with OpenSearch Data Prepper]({{site.url}}{{site.baseurl}}/clients/data-prepper/get-started/). diff --git a/_data-prepper/migrating-from-logstash-data-prepper.md b/_data-prepper/migrating-from-logstash-data-prepper.md index 3d87f29517e..13548092dce 100644 --- a/_data-prepper/migrating-from-logstash-data-prepper.md +++ b/_data-prepper/migrating-from-logstash-data-prepper.md @@ -9,9 +9,9 @@ redirect_from: # Migrating from Logstash -You can run Data Prepper with a Logstash configuration. +You can run OpenSearch Data Prepper with a Logstash configuration. -As mentioned in [Getting started with Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/), you'll need to configure Data Prepper with a pipeline using a `pipelines.yaml` file. +As mentioned in [Getting started with OpenSearch Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/), you'll need to configure Data Prepper with a pipeline using a `pipelines.yaml` file. Alternatively, if you have a Logstash configuration `logstash.conf` to configure Data Prepper instead of `pipelines.yaml`. @@ -29,7 +29,7 @@ As of the Data Prepper 1.2 release, the following plugins from the Logstash conf ## Running Data Prepper with a Logstash configuration -1. To install Data Prepper's Docker image, see Installing Data Prepper in [Getting Started with Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started#1-installing-data-prepper). +1. To install Data Prepper's Docker image, see Installing Data Prepper in [Getting Started with OpenSearch Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started#1-installing-data-prepper). 2. Run the Docker image installed in Step 1 by supplying your `logstash.conf` configuration. diff --git a/_data-prepper/pipelines/cidrcontains.md b/_data-prepper/pipelines/cidrcontains.md index 898f1bc1f58..1e8b3fa396b 100644 --- a/_data-prepper/pipelines/cidrcontains.md +++ b/_data-prepper/pipelines/cidrcontains.md @@ -19,6 +19,6 @@ For example, if your data contains an IP address field named `client.ip` and you ``` cidrContains('/client.ip', '192.168.0.0/16', '10.0.0.0/8') ``` -{% include copy-curl.html %} +{% include copy.html %} -This function returns `true` if the IP address matches any of the specified CIDR blocks or `false` if it does not. \ No newline at end of file +This function returns `true` if the IP address matches any of the specified CIDR blocks or `false` if it does not. diff --git a/_data-prepper/pipelines/configuration/buffers/buffers.md b/_data-prepper/pipelines/configuration/buffers/buffers.md index 287825b5495..0965b0acd02 100644 --- a/_data-prepper/pipelines/configuration/buffers/buffers.md +++ b/_data-prepper/pipelines/configuration/buffers/buffers.md @@ -8,7 +8,7 @@ nav_order: 30 # Buffers -The `buffer` component acts as an intermediary layer between the `source` and `sink` components in a Data Prepper pipeline. It serves as temporary storage for events, decoupling the `source` from the downstream processors and sinks. Buffers can be either in-memory or disk based. +The `buffer` component acts as an intermediary layer between the `source` and `sink` components in an OpenSearch Data Prepper pipeline. It serves as temporary storage for events, decoupling the `source` from the downstream processors and sinks. Buffers can be either in-memory or disk based. 
If not explicitly specified in the pipeline configuration, Data Prepper uses the default `bounded_blocking` buffer, which is an in-memory queue bounded by the number of events it can store. The `bounded_blocking` buffer is a convenient option when the event volume and processing rates are manageable within the available memory constraints. diff --git a/_data-prepper/pipelines/configuration/buffers/kafka.md b/_data-prepper/pipelines/configuration/buffers/kafka.md index 87600601b4c..5ab0d03d2e6 100644 --- a/_data-prepper/pipelines/configuration/buffers/kafka.md +++ b/_data-prepper/pipelines/configuration/buffers/kafka.md @@ -59,12 +59,12 @@ Option | Required | Type | Description `name` | Yes | String | The name of the Kafka topic. `group_id` | Yes | String | Sets Kafka's `group.id` option. `workers` | No | Integer | The number of multithreaded consumers associated with each topic. Default is `2`. The maximum value is `200`. -`encryption_key` | No | String | An Advanced Encryption Standard (AES) encryption key used to encrypt and decrypt data within Data Prepper before sending it to Kafka. This value must be plain text or encrypted using AWS Key Management Service (AWS KMS). +`encryption_key` | No | String | An Advanced Encryption Standard (AES) encryption key used to encrypt and decrypt data within OpenSearch Data Prepper before sending it to Kafka. This value must be plain text or encrypted using AWS Key Management Service (AWS KMS). `kms` | No | AWS KMS key | When configured, uses an AWS KMS key to encrypt data. See [`kms`](#kms) for more information. `auto_commit` | No | Boolean | When `false`, the consumer offset will not be periodically committed to Kafka in the background. Default is `false`. `commit_interval` | No | Integer | When `auto_commit` is set to `true`, sets how often, in seconds, the consumer offsets are auto-committed to Kafka through Kafka's `auto.commit.interval.ms` option. Default is `5s`. `session_timeout` | No | Integer | The amount of time during which the source detects client failures when using Kafka's group management features, which can be used to balance the data stream. Default is `45s`. -`auto_offset_reset` | No | String | Automatically resets the offset to the earliest or the latest offset through Kafka's `auto.offset.reset` option. Default is `latest`. +`auto_offset_reset` | No | String | Automatically resets the offset to the earliest or the latest offset through Kafka's `auto.offset.reset` option. Default is `earliest`. `thread_waiting_time` | No | Integer | The amount of time that a thread waits for the preceding thread to complete its task and to signal the next thread. The Kafka consumer API poll timeout value is set to half of this setting. Default is `5s`. `max_partition_fetch_bytes` | No | Integer | Sets the maximum limit, in megabytes, for data returns from each partition through Kafka's `max.partition.fetch.bytes` setting. Default is `1mb`. `heart_beat_interval` | No | Integer | The expected amount of time between heartbeats to the consumer coordinator when using Kafka's group management facilities through Kafka's `heartbeat.interval.ms` setting. Default is `5s`. 
diff --git a/_data-prepper/pipelines/configuration/processors/aggregate.md b/_data-prepper/pipelines/configuration/processors/aggregate.md index 38b138a996c..1d0052ada67 100644 --- a/_data-prepper/pipelines/configuration/processors/aggregate.md +++ b/_data-prepper/pipelines/configuration/processors/aggregate.md @@ -20,7 +20,7 @@ Option | Required | Type | Description identification_keys | Yes | List | An unordered list by which to group events. Events with the same values as these keys are put into the same group. If an event does not contain one of the `identification_keys`, then the value of that key is considered to be equal to `null`. At least one identification_key is required (for example, `["sourceIp", "destinationIp", "port"]`). action | Yes | AggregateAction | The action to be performed on each group. One of the [available aggregate actions](#available-aggregate-actions) must be provided, or you can create custom aggregate actions. `remove_duplicates` and `put_all` are the available actions. For more information, see [Creating New Aggregate Actions](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/aggregate-processor#creating-new-aggregate-actions). group_duration | No | String | The amount of time that a group should exist before it is concluded automatically. Supports ISO_8601 notation strings ("PT20.345S", "PT15M", etc.) as well as simple notation for seconds (`"60s"`) and milliseconds (`"1500ms"`). Default value is `180s`. -local_mode | No | Boolean | When `local_mode` is set to `true`, the aggregation is performed locally on each Data Prepper node instead of forwarding events to a specific node based on the `identification_keys` using a hash function. Default is `false`. +local_mode | No | Boolean | When `local_mode` is set to `true`, the aggregation is performed locally on each OpenSearch Data Prepper node instead of forwarding events to a specific node based on the `identification_keys` using a hash function. Default is `false`. ## Available aggregate actions @@ -31,7 +31,7 @@ Use the following aggregate actions to determine how the `aggregate` processor p The `remove_duplicates` action processes the first event for a group immediately and drops any events that duplicate the first event from the source. For example, when using `identification_keys: ["sourceIp", "destination_ip"]`: 1. The `remove_duplicates` action processes `{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "status": 200 }`, the first event in the source. -2. Data Prepper drops the `{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "bytes": 1000 }` event because the `sourceIp` and `destinationIp` match the first event in the source. +2. OpenSearch Data Prepper drops the `{ "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "bytes": 1000 }` event because the `sourceIp` and `destinationIp` match the first event in the source. 3. The `remove_duplicates` action processes the next event, `{ "sourceIp": "127.0.0.2", "destinationIp": "192.168.0.1", "bytes": 1000 }`. Because the `sourceIp` is different from the first event of the group, Data Prepper creates a new group based on the event. 
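As a minimal sketch of the scenario above, the following configuration groups on the same two keys and applies the `remove_duplicates` action. The action is declared as a nested plugin block, and `group_duration` is shown at its default value only for clarity.

```yaml
processor:
  - aggregate:
      identification_keys: ["sourceIp", "destinationIp"]
      action:
        remove_duplicates:
      group_duration: "180s"    # default shown explicitly
```
{% include copy.html %}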
### put_all diff --git a/_data-prepper/pipelines/configuration/processors/anomaly-detector.md b/_data-prepper/pipelines/configuration/processors/anomaly-detector.md index 9628bb6caf8..3fae80cb3ff 100644 --- a/_data-prepper/pipelines/configuration/processors/anomaly-detector.md +++ b/_data-prepper/pipelines/configuration/processors/anomaly-detector.md @@ -35,7 +35,7 @@ The random cut forest (RCF) ML algorithm is an unsupervised algorithm for detect | :--- | :--- | | `random_cut_forest` | Processes events using the RCF ML algorithm to detect anomalies. | -RCF is an unsupervised ML algorithm for detecting anomalous data points within a dataset. Data Prepper uses RCF to detect anomalies in data by passing the values of the configured key to RCF. For example, when an event with a latency value of 11.5 is sent, the following anomaly event is generated: +RCF is an unsupervised ML algorithm for detecting anomalous data points within a dataset. OpenSearch Data Prepper uses RCF to detect anomalies in data by passing the values of the configured key to RCF. For example, when an event with a latency value of 11.5 is sent, the following anomaly event is generated: ```json @@ -53,6 +53,7 @@ You can configure `random_cut_forest` mode with the following options. | `sample_size` | `256` | 100--2500 | The sample size used in the ML algorithm. | | `time_decay` | `0.1` | 0--1.0 | The time decay value used in the ML algorithm. Used as the mathematical expression `timeDecay` divided by `SampleSize` in the ML algorithm. | | `type` | `metrics` | N/A | The type of data sent to the algorithm. | +| `output_after` | 32 | N/A | Specifies the number of events to process before outputting any detected anomalies. | | `version` | `1.0` | N/A | The algorithm version number. | ## Usage diff --git a/_data-prepper/pipelines/configuration/processors/aws-lambda.md b/_data-prepper/pipelines/configuration/processors/aws-lambda.md new file mode 100644 index 00000000000..0ef9dfd7d74 --- /dev/null +++ b/_data-prepper/pipelines/configuration/processors/aws-lambda.md @@ -0,0 +1,94 @@ +--- +layout: default +title: aws_lambda +parent: Processors +grand_parent: Pipelines +nav_order: 10 +--- + +# aws_lambda integration for OpenSearch Data Prepper + +The [AWS Lambda](https://aws.amazon.com/lambda/) integration allows developers to use serverless computing capabilities within their OpenSearch Data Prepper pipelines for flexible event processing and data routing. + +## AWS Lambda processor configuration + +The `aws_lambda` processor enables invocation of an AWS Lambda function within your Data Prepper pipeline in order to process events. It supports both synchronous and asynchronous invocations based on your use case. + +## Configuration fields + +You can configure the processor using the following configuration options. + +Field | Type | Required | Description +-------------------- | ------- | -------- | ---------------------------------------------------------------------------- +`function_name` | String | Required | The name of the AWS Lambda function to invoke. +`invocation_type` | String | Required | Specifies the invocation type, either `request-response` or `event`. Default is `request-response`. +`aws.region` | String | Required | The AWS Region in which the Lambda function is located. +`aws.sts_role_arn` | String | Optional | The Amazon Resource Name (ARN) of the role to assume before invoking the Lambda function. +`max_retries` | Integer | Optional | The maximum number of retries for failed invocations. Default is `3`. 
+`batch` | Object | Optional | The batch settings for the Lambda invocations. Default is `key_name = "events"`. Default threshold is `event_count=100`, `maximum_size="5mb"`, and `event_collect_timeout = 10s`. +`lambda_when` | String | Optional | A conditional expression that determines when to invoke the Lambda processor. +`response_codec` | Object | Optional | A codec configuration for parsing Lambda responses. Default is `json`. +`tags_on_match_failure` | List | Optional | A list of tags to add to events when Lambda matching fails or encounters an unexpected error. +`sdk_timeout` | Duration| Optional | Configures the SDK's client connection timeout period. Default is `60s`. +`response_events_match` | Boolean | Optional | Specifies how Data Prepper interprets and processes Lambda function responses. Default is `false`. + +#### Example configuration + +``` +processors: + - aws_lambda: + function_name: "my-lambda-function" + invocation_type: "request-response" + response_events_match: false + aws: + region: "us-east-1" + sts_role_arn: "arn:aws:iam::123456789012:role/my-lambda-role" + max_retries: 3 + batch: + key_name: "events" + threshold: + event_count: 100 + maximum_size: "5mb" + event_collect_timeout: PT10S + lambda_when: "event['status'] == 'process'" + +``` +{% include copy-curl.html %} + +## Usage + +The processor supports the following invocation types: + +- `request-response`: The processor waits for Lambda function completion before proceeding. +- `event`: The function is triggered asynchronously without waiting for a response. +- `batch`: When enabled, events are aggregated and sent in bulk to optimize Lambda invocations. Batch thresholds control the event count, size limit, and timeout. +- `codec`: JSON is used for both request and response codecs. Lambda must return JSON array outputs. +- `tags_on_match_failure`: Custom tags can be applied to events when Lambda processing fails or encounters unexpected issues. + +## Behavior + +When configured for batching, the AWS Lambda processor groups multiple events into a single request. This grouping is governed by batch thresholds, which can be based on the event count, size limit, or timeout. The processor then sends the entire batch to the Lambda function as a single payload. + +## Lambda response handling + +The `response_events_match` setting defines how Data Prepper handles the relationship between batch events sent to Lambda and the response received: + +- `true`: Lambda returns a JSON array with results for each batched event. Data Prepper maps this array back to its corresponding original event, ensuring that each event in the batch gets the corresponding part of the response from the array. +- `false`: Lambda returns one or more events for the entire batch. Response events are not correlated with the original events. Original event metadata is not preserved in the response events. For example, when `response_events_match` is set to `true`, the Lambda function is expected to return the same number of response events as the number of original requests, maintaining the original order. + +## Limitations + +Note the following limitations: + +- Payload limitation: 6 MB payload limit +- Response codec: JSON-only codec support + +## Integration testing + +Integration tests for this plugin are executed separately from the main Data Prepper build process. 
Use the following Gradle command to run these tests: ``` ./gradlew :data-prepper-plugins:aws-lambda:integrationTest -Dtests.processor.lambda.region="us-east-1" -Dtests.processor.lambda.functionName="lambda_test_function" -Dtests.processor.lambda.sts_role_arn="arn:aws:iam::123456789012:role/dataprepper-role" ``` + +{% include copy.html %} diff --git a/_data-prepper/pipelines/configuration/processors/convert-entry-type.md b/_data-prepper/pipelines/configuration/processors/convert-entry-type.md index c2c46260ed3..cc707832ad7 100644 --- a/_data-prepper/pipelines/configuration/processors/convert-entry-type.md +++ b/_data-prepper/pipelines/configuration/processors/convert-entry-type.md @@ -47,7 +47,7 @@ type-conv-pipeline: ``` {% include copy.html %} -Next, create a log file named `logs_json.log` and replace the `path` in the file source of your `pipeline.yaml` file with that filepath. For more information, see [Configuring Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). +Next, create a log file named `logs_json.log` and replace the `path` in the file source of your `pipeline.yaml` file with that filepath. For more information, see [Configuring OpenSearch Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). For example, before you run the `convert_entry_type` processor, if the `logs_json.log` file contains the following event record: diff --git a/_data-prepper/pipelines/configuration/processors/csv.md b/_data-prepper/pipelines/configuration/processors/csv.md index e386db4bf45..d640b19eb32 100644 --- a/_data-prepper/pipelines/configuration/processors/csv.md +++ b/_data-prepper/pipelines/configuration/processors/csv.md @@ -113,4 +113,4 @@ The `csv` processor includes the following custom metrics. The `csv` processor includes the following counter metrics: -* `csvInvalidEvents`: The number of invalid events, usually caused by an unclosed quotation mark in the event itself. Data Prepper throws an exception when an invalid event is parsed. +* `csvInvalidEvents`: The number of invalid events, usually caused by an unclosed quotation mark in the event itself. OpenSearch Data Prepper throws an exception when an invalid event is parsed. diff --git a/_data-prepper/pipelines/configuration/processors/decompress.md b/_data-prepper/pipelines/configuration/processors/decompress.md index d03c236ac5a..2a4b1763efd 100644 --- a/_data-prepper/pipelines/configuration/processors/decompress.md +++ b/_data-prepper/pipelines/configuration/processors/decompress.md @@ -16,7 +16,7 @@ Option | Required | Type | Description :--- | :--- | :--- | :--- `keys` | Yes | List<String> | The fields in the event that will be decompressed. `type` | Yes | Enum | The type of decompression to use for the `keys` in the event. Only `gzip` is supported. -`decompress_when` | No | String| A [Data Prepper conditional expression](https://opensearch.org/docs/latest/data-prepper/pipelines/expression-syntax/) that determines when the `decompress` processor will run on certain events. +`decompress_when` | No | String| A [Data Prepper conditional expression]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/expression-syntax/) that determines when the `decompress` processor will run on certain events. `tags_on_failure` | No | List<String> | A list of strings with which to tag events when the processor fails to decompress the `keys` inside an event. Defaults to `_decompression_failure`.
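Putting these options together, a minimal `decompress` configuration might look like the following sketch; the field name is a placeholder.

```yaml
processor:
  - decompress:
      keys: ["compressed_payload"]        # placeholder field name
      type: gzip
      tags_on_failure: ["_decompression_failure"]
```
{% include copy.html %}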
## Usage diff --git a/_data-prepper/pipelines/configuration/processors/delete-entries.md b/_data-prepper/pipelines/configuration/processors/delete-entries.md index e7c022c6a76..f30bccae232 100644 --- a/_data-prepper/pipelines/configuration/processors/delete-entries.md +++ b/_data-prepper/pipelines/configuration/processors/delete-entries.md @@ -41,7 +41,7 @@ pipeline: ``` {% include copy.html %} -Next, create a log file named `logs_json.log` and replace the `path` in the file source of your `pipeline.yaml` file with that filepath. For more information, see [Configuring Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). +Next, create a log file named `logs_json.log` and replace the `path` in the file source of your `pipeline.yaml` file with that filepath. For more information, see [Configuring OpenSearch Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). For example, before you run the `delete_entries` processor, if the `logs_json.log` file contains the following event record: diff --git a/_data-prepper/pipelines/configuration/processors/drop-events.md b/_data-prepper/pipelines/configuration/processors/drop-events.md index 1f601c9743c..eba3d0a8fb5 100644 --- a/_data-prepper/pipelines/configuration/processors/drop-events.md +++ b/_data-prepper/pipelines/configuration/processors/drop-events.md @@ -13,7 +13,7 @@ The `drop_events` processor drops all the events that are passed into it. The fo Option | Required | Type | Description :--- | :--- | :--- | :--- -drop_when | Yes | String | Accepts a Data Prepper expression string following the [Data Prepper Expression Syntax]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/expression-syntax/). Configuring `drop_events` with `drop_when: true` drops all the events received. +drop_when | Yes | String | Accepts an OpenSearch Data Prepper expression string following the [expression syntax]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/expression-syntax/). Configuring `drop_events` with `drop_when: true` drops all the events received. handle_failed_events | No | Enum | Specifies how exceptions are handled when an exception occurs while evaluating an event. Default value is `drop`, which drops the event so that it is not sent to OpenSearch. Available options are `drop`, `drop_silently`, `skip`, and `skip_silently`. For more information, see [handle_failed_events](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/drop-events-processor#handle_failed_events). <!---## Configuration diff --git a/_data-prepper/pipelines/configuration/processors/flatten.md b/_data-prepper/pipelines/configuration/processors/flatten.md index 43793c2b837..c5b1f8e16a0 100644 --- a/_data-prepper/pipelines/configuration/processors/flatten.md +++ b/_data-prepper/pipelines/configuration/processors/flatten.md @@ -21,12 +21,12 @@ Option | Required | Type | Description `exclude_keys` | No | List | The keys from the source field that should be excluded from processing. Default is an empty list (`[]`). `remove_processed_fields` | No | Boolean | When `true`, the processor removes all processed fields from the source. Default is `false`. `remove_list_indices` | No | Boolean | When `true`, the processor converts the fields from the source map into lists and puts the lists into the target field. Default is `false`. 
-`flatten_when` | No | String | A [conditional expression](https://opensearch.org/docs/latest/data-prepper/pipelines/expression-syntax/), such as `/some-key == "test"'`, that determines whether the `flatten` processor will be run on the event. Default is `null`, which means that all events will be processed unless otherwise stated. +`flatten_when` | No | String | A [conditional expression]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/expression-syntax/), such as `/some-key == "test"'`, that determines whether the `flatten` processor will be run on the event. Default is `null`, which means that all events will be processed unless otherwise stated. `tags_on_failure` | No | List | A list of tags to add to the event metadata when the event fails to process. ## Usage -The following examples show how the `flatten` processor can be used in Data Prepper pipelines. +The following examples show how the `flatten` processor can be used in OpenSearch Data Prepper pipelines. ### Minimum configuration diff --git a/_data-prepper/pipelines/configuration/processors/geoip.md b/_data-prepper/pipelines/configuration/processors/geoip.md index d0b6bd1cbbb..eaedd062691 100644 --- a/_data-prepper/pipelines/configuration/processors/geoip.md +++ b/_data-prepper/pipelines/configuration/processors/geoip.md @@ -9,7 +9,7 @@ nav_order: 49 # geoip The `geoip` processor enriches events with geographic information extracted from IP addresses contained in the events. -By default, Data Prepper uses the [MaxMind GeoLite2](https://dev.maxmind.com/geoip/geolite2-free-geolocation-data) geolocation database. +By default, OpenSearch Data Prepper uses the [MaxMind GeoLite2](https://dev.maxmind.com/geoip/geolite2-free-geolocation-data) geolocation database. Data Prepper administrators can configure the databases using the [`geoip_service`]({{site.url}}{{site.baseurl}}/data-prepper/managing-data-prepper/extensions/geoip-service/) extension configuration. ## Usage diff --git a/_data-prepper/pipelines/configuration/processors/grok.md b/_data-prepper/pipelines/configuration/processors/grok.md index 3724278adf2..a9bd90867e1 100644 --- a/_data-prepper/pipelines/configuration/processors/grok.md +++ b/_data-prepper/pipelines/configuration/processors/grok.md @@ -54,7 +54,7 @@ processor: ``` {% include copy.html %} -The `grok_when` option can take a conditional expression. This expression is detailed in the [Expression syntax](https://opensearch.org/docs/latest/data-prepper/pipelines/expression-syntax/) documentation. +The `grok_when` option can take a conditional expression. This expression is detailed in the [Expression syntax]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/expression-syntax/) documentation. ## Grok performance metadata diff --git a/_data-prepper/pipelines/configuration/processors/key-value.md b/_data-prepper/pipelines/configuration/processors/key-value.md index 52ecc7719c6..bd2e52600f9 100644 --- a/_data-prepper/pipelines/configuration/processors/key-value.md +++ b/_data-prepper/pipelines/configuration/processors/key-value.md @@ -37,6 +37,6 @@ destination | The destination field for the parsed source. The parsed source ove `drop_keys_with_no_value` | Specifies whether keys should be dropped if they have a null value. Default is `false`. If `drop_keys_with_no_value` is set to `true`, then `{"key1=value1&key2"}` parses to `{"key1": "value1"}`. `strict_grouping` | Specifies whether strict grouping should be enabled when the `value_grouping` or `string_literal_character` options are used. Default is `false`. 
| When enabled, groups with unmatched end characters yield errors. The event is ignored after the errors are logged. `string_literal_character` | Can be set to either a single quotation mark (`'`) or a double quotation mark (`"`). Default is `null`. | When this option is used, any text contained within the specified quotation mark character will be ignored and excluded from key-value parsing. For example, `text1 "key1=value1" text2 key2=value2` would parse to `{"key2": "value2"}`. -`key_value_when` | Allows you to specify a [conditional expression](https://opensearch.org/docs/latest/data-prepper/pipelines/expression-syntax/), such as `/some-key == "test"`, that will be evaluated to determine whether the processor should be applied to the event. +`key_value_when` | Allows you to specify a [conditional expression]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/expression-syntax/), such as `/some-key == "test"`, that will be evaluated to determine whether the processor should be applied to the event. diff --git a/_data-prepper/pipelines/configuration/processors/map-to-list.md b/_data-prepper/pipelines/configuration/processors/map-to-list.md index f3393e6c460..9079b9087ba 100644 --- a/_data-prepper/pipelines/configuration/processors/map-to-list.md +++ b/_data-prepper/pipelines/configuration/processors/map-to-list.md @@ -23,7 +23,7 @@ Option | Required | Type | Description `exclude_keys` | No | List | The keys in the source map that will be excluded from processing. Default is an empty list (`[]`). `remove_processed_fields` | No | Boolean | When `true`, the processor will remove the processed fields from the source map. Default is `false`. `convert_field_to_list` | No | Boolean | If `true`, the processor will convert the fields from the source map into lists and place them in fields in the target list. Default is `false`. -`map_to_list_when` | No | String | A [conditional expression](https://opensearch.org/docs/latest/data-prepper/pipelines/expression-syntax/), such as `/some-key == "test"'`, that will be evaluated to determine whether the processor will be run on the event. Default is `null`. All events will be processed unless otherwise stated. +`map_to_list_when` | No | String | A [conditional expression]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/expression-syntax/), such as `/some-key == "test"'`, that will be evaluated to determine whether the processor will be run on the event. Default is `null`. All events will be processed unless otherwise stated. `tags_on_failure` | No | List | A list of tags to add to the event metadata when the event fails to process. ## Usage diff --git a/_data-prepper/pipelines/configuration/processors/mutate-event.md b/_data-prepper/pipelines/configuration/processors/mutate-event.md index ff2da6b5279..139ea95ac89 100644 --- a/_data-prepper/pipelines/configuration/processors/mutate-event.md +++ b/_data-prepper/pipelines/configuration/processors/mutate-event.md @@ -8,7 +8,7 @@ nav_order: 65 # Mutate event processors -Mutate event processors allow you to modify events in Data Prepper. The following processors are available: +Mutate event processors allow you to modify events in OpenSearch Data Prepper. The following processors are available: * [add_entries]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/add-entries/) allows you to add entries to an event. * [convert_entry_type]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/convert-entry-type/) allows you to convert value types in an event. 
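As a quick illustration of how these processors are configured, the following sketch uses `convert_entry_type` to convert a string field to an integer; the field name is a placeholder.

```yaml
processor:
  - convert_entry_type:
      key: "response_status"    # placeholder field name
      type: "integer"
```
{% include copy.html %}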
diff --git a/_data-prepper/pipelines/configuration/processors/mutate-string.md b/_data-prepper/pipelines/configuration/processors/mutate-string.md index 48f6423676b..b84e63ea61b 100644 --- a/_data-prepper/pipelines/configuration/processors/mutate-string.md +++ b/_data-prepper/pipelines/configuration/processors/mutate-string.md @@ -53,9 +53,9 @@ pipeline: ``` {% include copy.html %} -Next, create a log file named `logs_json.log`. After that, replace the `path` of the file source in your `pipeline.yaml` file with your file path. For more detailed information, see [Configuring Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). +Next, create a log file named `logs_json.log`. After that, replace the `path` of the file source in your `pipeline.yaml` file with your file path. For more detailed information, see [Configuring OpenSearch Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). -Before you run Data Prepper, the source appears in the following format: +Before you run OpenSearch Data Prepper, the source appears in the following format: ```json {"message": "ab:cd:ab:cd"} @@ -105,7 +105,7 @@ pipeline: ``` {% include copy.html %} -Next, create a log file named `logs_json.log`. After that, replace the `path` in the file source of your `pipeline.yaml` file with your file path. For more detailed information, see [Configuring Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). +Next, create a log file named `logs_json.log`. After that, replace the `path` in the file source of your `pipeline.yaml` file with your file path. For more detailed information, see [Configuring OpenSearch Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). Before you run Data Prepper, the source appears in the following format: @@ -150,7 +150,7 @@ pipeline: ``` {% include copy.html %} -Next, create a log file named `logs_json.log`. After that, replace the `path` in the file source of your `pipeline.yaml` file with the correct file path. For more detailed information, see [Configuring Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). +Next, create a log file named `logs_json.log`. After that, replace the `path` in the file source of your `pipeline.yaml` file with the correct file path. For more detailed information, see [Configuring OpenSearch Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). Before you run Data Prepper, the source appears in the following format: @@ -195,7 +195,7 @@ pipeline: ``` {% include copy.html %} -Next, create a log file named `logs_json.log`. After that, replace the `path` in the file source of your `pipeline.yaml` file with the correct file path. For more detailed information, see [Configuring Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). +Next, create a log file named `logs_json.log`. After that, replace the `path` in the file source of your `pipeline.yaml` file with the correct file path. For more detailed information, see [Configuring OpenSearch Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). Before you run Data Prepper, the source appears in the following format: @@ -241,7 +241,7 @@ pipeline: ``` {% include copy.html %} -Next, create a log file named `logs_json.log`. 
After that, replace the `path` in the file source of your `pipeline.yaml` file with the correct file path. For more detailed information, see [Configuring Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). +Next, create a log file named `logs_json.log`. After that, replace the `path` in the file source of your `pipeline.yaml` file with the correct file path. For more detailed information, see [Configuring OpenSearch Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). Before you run Data Prepper, the source appears in the following format: diff --git a/_data-prepper/pipelines/configuration/processors/otel-metrics.md b/_data-prepper/pipelines/configuration/processors/otel-metrics.md index 6fc82f5deb9..38c3a825c6b 100644 --- a/_data-prepper/pipelines/configuration/processors/otel-metrics.md +++ b/_data-prepper/pipelines/configuration/processors/otel-metrics.md @@ -141,7 +141,7 @@ The following `JSON` file is a more detailed form of OpenTelemetry representatio The `exponential_histogram_max_allowed_scale` parameter defines the maximum allowed scale for an exponential histogram. If you increase this parameter, you will increase potential memory consumption. See the [OpenTelemetry specifications](https://github.com/open-telemetry/opentelemetry-proto/blob/main/opentelemetry/proto/metrics/v1/metrics.proto) for more information on exponential histograms and their computational complexity. -All exponential histograms that have a scale that is above the configured parameter (by default, a value of `10`) are discarded and logged with an error level. You can check the log that Data Prepper creates to see the `ERROR` log message. +All exponential histograms that have a scale that is above the configured parameter (by default, a value of `10`) are discarded and logged with an error level. You can check the log that OpenSearch Data Prepper creates to see the `ERROR` log message. The absolute scale value is used for comparison, so a scale of `-11` that is treated equally to `11` exceeds the configured value of `10` and can be discarded. {: .note} diff --git a/_data-prepper/pipelines/configuration/processors/otel-trace-group.md b/_data-prepper/pipelines/configuration/processors/otel-trace-group.md index 06bc754a989..cf3db6a7305 100644 --- a/_data-prepper/pipelines/configuration/processors/otel-trace-group.md +++ b/_data-prepper/pipelines/configuration/processors/otel-trace-group.md @@ -55,8 +55,8 @@ You can configure the `otel_trace_group` processor with the following options. | `aws_sts_role_arn`| An AWS Identity and Access Management (IAM) role that the sink plugin assumes to sign the request to Amazon OpenSearch Service. If not provided, the plugin uses the [default credentials](https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/auth/credentials/DefaultCredentialsProvider.html). | `null` | | `aws_sts_header_overrides` | A map of header overrides that the IAM role assumes for the sink plugin. | `null` | | `insecure` | A Boolean flag used to turn off SSL certificate verification. If set to `true`, CA certificate verification is turned off and insecure HTTP requests are sent. | `false` | -| `username` | A string that contains the username and is used in the [internal users](https://opensearch.org/docs/latest/security/access-control/users-roles/) `YAML` configuration file of your OpenSearch cluster. 
| `null` | -| `password` | A string that contains the password and is used in the [internal users](https://opensearch.org/docs/latest/security/access-control/users-roles/) `YAML` configuration file of your OpenSearch cluster. | `null` | +| `username` | A string that contains the username and is used in the [internal users]({{site.url}}{{site.baseurl}}/security/access-control/users-roles/) `YAML` configuration file of your OpenSearch cluster. | `null` | +| `password` | A string that contains the password and is used in the [internal users]({{site.url}}{{site.baseurl}}/security/access-control/users-roles/) `YAML` configuration file of your OpenSearch cluster. | `null` | ## Configuration option examples diff --git a/_data-prepper/pipelines/configuration/processors/otel-traces.md b/_data-prepper/pipelines/configuration/processors/otel-traces.md index 6d26a5aca83..4d8d6cc5a12 100644 --- a/_data-prepper/pipelines/configuration/processors/otel-traces.md +++ b/_data-prepper/pipelines/configuration/processors/otel-traces.md @@ -8,7 +8,7 @@ nav_order: 75 # otel_trace -The `otel_trace` processor completes trace-group-related fields in all incoming Data Prepper span records by state caching the root span information for each `traceId`. +The `otel_trace` processor completes trace-group-related fields in all incoming OpenSearch Data Prepper span records by state caching the root span information for each `traceId`. ## Parameters diff --git a/_data-prepper/pipelines/configuration/processors/parse-ion.md b/_data-prepper/pipelines/configuration/processors/parse-ion.md index 8360eaa2968..38de7a86eb8 100644 --- a/_data-prepper/pipelines/configuration/processors/parse-ion.md +++ b/_data-prepper/pipelines/configuration/processors/parse-ion.md @@ -26,7 +26,7 @@ This table is autogenerated. Do not edit it. | `source` | No | String | The field in the `event` that is parsed. Default value is `message`. | | `destination` | No | String | The destination field of the parsed JSON. Defaults to the root of the `event`. Cannot be `""`, `/`, or any white-space-only `string` because these are not valid `event` fields. | | `pointer` | No | String | A JSON pointer to the field to be parsed. There is no `pointer` by default, meaning that the entire `source` is parsed. The `pointer` can access JSON array indexes as well. If the JSON pointer is invalid, then the entire `source` data is parsed into the outgoing `event`. If the key that is pointed to already exists in the `event` and the `destination` is the root, then the pointer uses the entire path of the key. | -| `parse_when` | No | String | Specifies under which conditions the processor should perform parsing. Default is no condition. Accepts a Data Prepper expression string following the [Expression syntax]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/expression-syntax/). | +| `parse_when` | No | String | Specifies under which conditions the processor should perform parsing. Default is no condition. Accepts an OpenSearch Data Prepper expression string following the [expression syntax]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/expression-syntax/). | | `overwrite_if_destination_exists` | No | Boolean | Overwrites the destination if set to `true`. Set to `false` to prevent changing a destination value that exists. Defaults is `true`. | | `delete_source` | No | Boolean | If set to `true`, then the source field is deleted. Defaults is `false`. 
| | `tags_on_failure` | No | String | A list of strings specifying the tags to be set in the event that the processor fails or an unknown exception occurs during parsing. diff --git a/_data-prepper/pipelines/configuration/processors/parse-json.md b/_data-prepper/pipelines/configuration/processors/parse-json.md index 894d5dba423..b50b5eac590 100644 --- a/_data-prepper/pipelines/configuration/processors/parse-json.md +++ b/_data-prepper/pipelines/configuration/processors/parse-json.md @@ -8,8 +8,7 @@ nav_order: 80 # parse_json -The `parse_json` processor parses JSON data for an event, including any nested fields. The processor extracts the JSON pointer data and adds the input event to the extracted fields. - +The `parse_json` processor parses JSON-formatted strings within an event, including nested fields. It can optionally use a JSON pointer to extract a specific part of the source JSON and add the extracted data to the event. ## Configuration @@ -24,65 +23,95 @@ This table is autogenerated. Do not edit it. | Option | Required | Type | Description | | :--- | :--- | :--- | :--- | -| `source` | No | String | The field in the `event` that will be parsed. Default value is `message`. | -| `destination` | No | String | The destination field of the parsed JSON. Defaults to the root of the `event`. Cannot be `""`, `/`, or any white-space-only `string` because these are not valid `event` fields. | -| `pointer` | No | String | A JSON pointer to the field to be parsed. There is no `pointer` by default, meaning the entire `source` is parsed. The `pointer` can access JSON array indexes as well. If the JSON pointer is invalid then the entire `source` data is parsed into the outgoing `event`. If the key that is pointed to already exists in the `event` and the `destination` is the root, then the pointer uses the entire path of the key. | -| `parse_when` | No | String | Specifies under which conditions the processor should perform parsing. Default is no condition. Accepts a Data Prepper expression string following the [Expression syntax]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/expression-syntax/). | -| `overwrite_if_destination_exists` | No | Boolean | Overwrites the destination if set to `true`. Set to `false` to prevent changing a destination value that exists. Defaults to `true`. | -| `delete_source` | No | Boolean | If set to `true` then this will delete the source field. Defaults to `false`. | -| `tags_on_failure` | No | String | A list of strings specifying the tags to be set in the event that the processor fails or an unknown exception occurs during parsing. +| `source` | No | String | The field in the event that will be parsed. Default is `message`. | +| `destination` | No | String | The destination field for the parsed JSON. Default is the root of the event. Cannot be `""`, `/`, or any white-space-only string. | +| `pointer` | No | String | A JSON pointer (as defined by [RFC 6901](https://datatracker.ietf.org/doc/html/rfc6901)) to a specific field in the source JSON. If omitted, the entire `source` is parsed. If the pointer is invalid, the full `source` is parsed instead. When writing to the root destination, existing keys will be preserved unless overwritten. | +| `parse_when` | No | String | A condition expression that determines when to parse the field. Accepts a string following the [expression syntax]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/expression-syntax/). 
| +| `overwrite_if_destination_exists` | No | Boolean | Whether to overwrite the destination field if it already exists. Default is `true`. | +| `delete_source` | No | Boolean | Whether to delete the source field after parsing. Default is `false`. | +| `tags_on_failure` | No | String | A list of tags to apply if parsing fails or an unexpected exception occurs. | +| `handle_failed_events` | No | String | Determines how to handle events containing JSON processing errors. Valid values are `skip` (log the error and send the event downstream to the next processor) and `skip_silently` (send the event downstream to the next processor without logging the error). | ## Usage -To get started, create the following `pipeline.yaml` file: +To use the `parse_json` processor, add it to your `pipeline.yaml` configuration file: ```yaml parse-json-pipeline: source: ... - .... + ... processor: - parse_json: ``` +{% include copy.html %} -### Basic example - -To test the `parse_json` processor with the previous configuration, run the pipeline and paste the following line into your console, then enter `exit` on a new line: +All examples use the following JSON message for the event output: -``` +```json {"outer_key": {"inner_key": "inner_value"}} ``` {% include copy.html %} -The `parse_json` processor parses the message into the following format: +### Basic example -``` -{"message": {"outer_key": {"inner_key": "inner_value"}}", "outer_key":{"inner_key":"inner_value"}}} +The following example parses a JSON message field and flattens the data into the event. The original `message` from the example event remains, and the parsed content is added at the root level, as shown in the following output: + +```json +{ + "message": "{\"outer_key\": {\"inner_key\": \"inner_value\"}}", + "outer_key": { + "inner_key": "inner_value" + } +} ``` -### Example with a JSON pointer +### Delete a source -You can use a JSON pointer to parse a selection of the JSON data by specifying the `pointer` option in the configuration. To get started, create the following `pipeline.yaml` file: +If you want to remove the original field from the originating JSON message, use the `delete_source` option, as shown in the following example pipeline: ```yaml parse-json-pipeline: source: ... - .... + ... processor: - parse_json: - pointer: "outer_key/inner_key" + delete_source: true ``` +{% include copy.html %} -To test the `parse_json` processor with the pointer option, run the pipeline, paste the following line into your console, and then enter `exit` on a new line: +In the following event, the `message` field is parsed and removed, leaving only the structured output: +```json +{ + "outer_key": { + "inner_key": "inner_value" + } +} ``` -{"outer_key": {"inner_key": "inner_value"}} + + +### Example using a JSON pointer + +You can use the `pointer` option to extract a specific nested field from the JSON data, as shown in the following example pipeline: + +```yaml +parse-json-pipeline: + source: + ... + ... + processor: + - parse_json: + pointer: "/outer_key/inner_key" ``` {% include copy.html %} -The processor parses the message into the following format: +Only the value at the pointer path `/outer_key/inner_key` is extracted and added to the event. 
If you set `destination`, the extracted value will be added to that field instead: +```json +{ + "message": "{\"outer_key\": {\"inner_key\": \"inner_value\"}}", + "inner_key": "inner_value" +} ``` -{"message": {"outer_key": {"inner_key": "inner_value"}}", "inner_key": "inner_value"} -``` \ No newline at end of file diff --git a/_data-prepper/pipelines/configuration/processors/parse-xml.md b/_data-prepper/pipelines/configuration/processors/parse-xml.md index c8c9f3eebf8..69088770747 100644 --- a/_data-prepper/pipelines/configuration/processors/parse-xml.md +++ b/_data-prepper/pipelines/configuration/processors/parse-xml.md @@ -26,7 +26,7 @@ This table is autogenerated. Do not edit it. | `source` | No | String | Specifies which `event` field to parse. | | `destination` | No | String | The destination field of the parsed XML. Defaults to the root of the `event`. Cannot be `""`, `/`, or any white-space-only string because these are not valid `event` fields. | | `pointer` | No | String | A JSON pointer to the field to be parsed. The value is null by default, meaning that the entire `source` is parsed. The `pointer` can access JSON array indexes as well. If the JSON pointer is invalid, then the entire `source` data is parsed into the outgoing `event` object. If the key that is pointed to already exists in the `event` object and the `destination` is the root, then the pointer uses the entire path of the key. | -| `parse_when` | No | String | Specifies under what conditions the processor should perform parsing. Default is no condition. Accepts a Data Prepper expression string following the [Data Prepper Expression Syntax]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/expression-syntax/). | +| `parse_when` | No | String | Specifies under what conditions the processor should perform parsing. Default is no condition. Accepts an OpenSearch Data Prepper expression string following the [expression syntax]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/expression-syntax/). | | `overwrite_if_destination_exists` | No | Boolean | Overwrites the destination if set to `true`. Set to `false` to prevent changing a destination value that exists. Defaults to `true`. | | `delete_source` | No | Boolean | If set to `true` then this will delete the source field. Defaults to `false`. | | `tags_on_failure` | No | String | A list of strings specifying the tags to be set in the event that the processor fails or an unknown exception occurs during parsing. diff --git a/_data-prepper/pipelines/configuration/processors/processors.md b/_data-prepper/pipelines/configuration/processors/processors.md index 1fa7120551c..a5d9e315f18 100644 --- a/_data-prepper/pipelines/configuration/processors/processors.md +++ b/_data-prepper/pipelines/configuration/processors/processors.md @@ -8,7 +8,7 @@ nav_order: 35 # Processors -Processors are components within a Data Prepper pipeline that enable you to filter, transform, and enrich events using your desired format before publishing records to the `sink` component. If no `processor` is defined in the pipeline configuration, then the events are published in the format specified by the `source` component. You can incorporate multiple processors within a single pipeline, and they are executed sequentially as defined in the pipeline. +Processors are components within an OpenSearch Data Prepper pipeline that enable you to filter, transform, and enrich events using your desired format before publishing records to the `sink` component. 
If no `processor` is defined in the pipeline configuration, then the events are published in the format specified by the `source` component. You can incorporate multiple processors within a single pipeline, and they are executed sequentially as defined in the pipeline. Prior to Data Prepper 1.3, these components were named *preppers*. In Data Prepper 1.3, the term *prepper* was deprecated in favor of *processor*. In Data Prepper 2.0, the term *prepper* was removed. {: .note } diff --git a/_data-prepper/pipelines/configuration/processors/rename-keys.md b/_data-prepper/pipelines/configuration/processors/rename-keys.md index f57b4e509fd..a2f1711ebf4 100644 --- a/_data-prepper/pipelines/configuration/processors/rename-keys.md +++ b/_data-prepper/pipelines/configuration/processors/rename-keys.md @@ -44,7 +44,7 @@ pipeline: {% include copy.html %} -Next, create a log file named `logs_json.log` and replace the `path` in the file source of your `pipeline.yaml` file with that filepath. For more information, see [Configuring Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). +Next, create a log file named `logs_json.log` and replace the `path` in the file source of your `pipeline.yaml` file with that filepath. For more information, see [Configuring OpenSearch Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). For example, before you run the `rename_keys` processor, if the `logs_json.log` file contains the following event record: diff --git a/_data-prepper/pipelines/configuration/processors/routes.md b/_data-prepper/pipelines/configuration/processors/routes.md deleted file mode 100644 index eb45153756e..00000000000 --- a/_data-prepper/pipelines/configuration/processors/routes.md +++ /dev/null @@ -1,19 +0,0 @@ ---- -layout: default -title: routes -parent: Processors -grand_parent: Pipelines -nav_order: 90 ---- - -# Routes - -Routes define conditions that can be used in sinks for conditional routing. Routes are specified at the same level as processors and sinks under the name `route` and consist of a list of key-value pairs, where the key is the name of a route and the value is a Data Prepper expression representing the routing condition. - -<!---## Configuration - -Content will be added to this section. - -## Metrics - -Content will be added to this section.---> \ No newline at end of file diff --git a/_data-prepper/pipelines/configuration/processors/select-entries.md b/_data-prepper/pipelines/configuration/processors/select-entries.md index 49fac39f4b9..4e9d1d1099f 100644 --- a/_data-prepper/pipelines/configuration/processors/select-entries.md +++ b/_data-prepper/pipelines/configuration/processors/select-entries.md @@ -8,7 +8,7 @@ nav_order: 59 # select_entries -The `select_entries` processor selects entries from a Data Prepper event. +The `select_entries` processor selects entries from an OpenSearch Data Prepper event. Only the selected entries remain in the processed event and while all other entries are removed. However, the processor does not remove any events from the Data Prepper pipeline. ## Configuration @@ -18,7 +18,7 @@ You can configure the `select_entries` processor using the following options. | Option | Required | Description | | :--- | :--- | :--- | | `include_keys` | Yes | A list of keys to be selected from an event. 
| -| `select_when` | No | A [conditional expression](https://opensearch.org/docs/latest/data-prepper/pipelines/expression-syntax/), such as `/some-key == "test"'`, that will be evaluated to determine whether the processor will be run on the event. If the condition is not met, then the event continues through the pipeline unmodified with all the original fields present. | +| `select_when` | No | A [conditional expression]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/expression-syntax/), such as `/some-key == "test"'`, that will be evaluated to determine whether the processor will be run on the event. If the condition is not met, then the event continues through the pipeline unmodified with all the original fields present. | ## Usage diff --git a/_data-prepper/pipelines/configuration/processors/trace-peer-forwarder.md b/_data-prepper/pipelines/configuration/processors/trace-peer-forwarder.md index a73295b8c87..2665b985f72 100644 --- a/_data-prepper/pipelines/configuration/processors/trace-peer-forwarder.md +++ b/_data-prepper/pipelines/configuration/processors/trace-peer-forwarder.md @@ -14,7 +14,7 @@ You should use `trace_peer_forwarder` for Trace Analytics pipelines when you hav ## Usage -To get started with `trace_peer_forwarder`, first configure [peer forwarder]({{site.url}}{{site.baseurl}}/data-prepper/managing-data-prepper/peer-forwarder/). Then create a `pipeline.yaml` file and specify `trace peer forwarder` as the processor. You can configure `peer forwarder` in your `data-prepper-config.yaml` file. For more detailed information, see [Configuring Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). +To get started with `trace_peer_forwarder`, first configure [peer forwarder]({{site.url}}{{site.baseurl}}/data-prepper/managing-data-prepper/peer-forwarder/). Then create a `pipeline.yaml` file and specify `trace peer forwarder` as the processor. You can configure `peer forwarder` in your `data-prepper-config.yaml` file. For more detailed information, see [Configuring OpenSearch Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). See the following example `pipeline.yaml` file: diff --git a/_data-prepper/pipelines/configuration/processors/translate.md b/_data-prepper/pipelines/configuration/processors/translate.md index d29aa5894c6..2da970d8438 100644 --- a/_data-prepper/pipelines/configuration/processors/translate.md +++ b/_data-prepper/pipelines/configuration/processors/translate.md @@ -42,7 +42,7 @@ Then create the following file named `logs_json.log` and replace the `path` in t The `translate` processor configuration in `pipeline.yaml` retrieves the `source` value from the event data and compares it against the keys specified under the `targets`. When a match is found, the processor places the corresponding mapped value into the `target` key provided in the configuration. 
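For reference, a `translate` configuration of the kind described above might look like the following sketch, which follows the processor's `mappings`/`targets` layout. The `status` source field, `status_label` target field, and mapped values are illustrative assumptions, not part of the documented example:

```yaml
processor:
  - translate:
      mappings:
        - source: "status"              # illustrative source field
          targets:
            - target: "status_label"    # illustrative target field
              map:
                404: "Not Found"
                503: "Service Unavailable"
```
{% include copy.html %}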
-When you run Data Prepper with the previous `pipeline.yaml` file, you should receive the following output: +When you run OpenSearch Data Prepper with the previous `pipeline.yaml` file, you should receive the following output: ```json { diff --git a/_data-prepper/pipelines/configuration/sinks/aws-lambda.md b/_data-prepper/pipelines/configuration/sinks/aws-lambda.md new file mode 100644 index 00000000000..5fc6e7a8bfc --- /dev/null +++ b/_data-prepper/pipelines/configuration/sinks/aws-lambda.md @@ -0,0 +1,73 @@ +--- +layout: default +title: aws_lambda +parent: Sinks +grand_parent: Pipelines +nav_order: 10 +--- + +---------------------------------------------------------------------------------------- +# `aws_lambda` sink for OpenSearch Data Prepper + +This page explains how to configure and use [AWS Lambda](https://aws.amazon.com/lambda/) with OpenSearch Data Prepper, enabling Lambda functions to serve as both processors and sinks. + +## `aws_lambda` sink + +Configure the Lambda sink using the following parameters. + +Field | Type | Required | Description +--------------------| ------- | -------- | ---------------------------------------------------------------------------- +`function_name` | String | Yes | The name of the AWS Lambda function to invoke. +`invocation_type` | String | No | Specifies the invocation type. Default is `event`. +`aws.region` | String | Yes | The AWS Region in which the Lambda function is located. +`aws.sts_role_arn` | String | No | The Amazon Resource Name (ARN) of the role to assume before invoking the Lambda function. +`max_retries` | Integer | No | The maximum number of retries if the invocation fails. Default is `3`. +`batch` | Object | No | Optional batch settings for Lambda invocations. Default is `key_name = events`. Default threshold is `event_count=100`, `maximum_size="5mb"`, and `event_collect_timeout = 10s`. +`lambda_when` | String | No | A conditional expression that determines when to invoke the Lambda sink. +`dlq` | Object | No | The dead-letter queue (DLQ) configuration for failed invocations. + +#### Example configuration + +``` +sink: + - aws_lambda: + function_name: "my-lambda-sink" + invocation_type: "event" + aws: + region: "us-west-2" + sts_role_arn: "arn:aws:iam::123456789012:role/my-lambda-sink-role" + max_retries: 5 + batch: + key_name: "events" + threshold: + event_count: 50 + maximum_size: "3mb" + event_collect_timeout: PT5S + lambda_when: "event['type'] == 'log'" + dlq: + region: "us-east-1" + sts_role_arn: "arn:aws:iam::123456789012:role/my-sqs-role" + bucket: "<<your-dlq-bucket-name>>" +``` +{% include copy-curl.html %} + +## Usage + +The invocation types are as follows: + +- `event` (Default): Executes functions asynchronously without waiting for responses. +- `request-response` (Sink only): Executes functions synchronously, though responses are not processed. +- `batch`: Automatically groups events based on configured thresholds. +- `dlq`: Supports the DLQ configuration for failed invocations after retry attempts. + +Data Prepper components use an AWS Identity and Access Management (IAM) role assumption, `aws.sts_role_arn`, for secure Lambda function invocation and respect Lambda's concurrency limits during event processing. For more information, see the [AWS Lambda documentation](https://docs.aws.amazon.com/lambda). +{: .note} + +## Developer guide + +Integration tests must be executed separately from the main Data Prepper build. 
Execute them with the following command: + +``` +./gradlew :data-prepper-plugins:aws-lambda:integrationTest -Dtests.sink.lambda.region="us-east-1" -Dtests.sink.lambda.functionName="lambda_test_function" -Dtests.sink.lambda.sts_role_arn="arn:aws:iam::123456789012:role/dataprepper-role +``` +{% include copy-curl.html %} diff --git a/_data-prepper/pipelines/configuration/sinks/opensearch.md b/_data-prepper/pipelines/configuration/sinks/opensearch.md index b1c32f00052..78c4ff773be 100644 --- a/_data-prepper/pipelines/configuration/sinks/opensearch.md +++ b/_data-prepper/pipelines/configuration/sinks/opensearch.md @@ -58,15 +58,17 @@ Option | Required | Type | Description `password` | No | String | The password for HTTP basic authentication. `aws` | No | AWS | The [AWS](#aws) configuration. [max_retries](#configure-max_retries) | No | Integer | The maximum number of times that the `opensearch` sink should try to push data to the OpenSearch server before considering it to be a failure. Defaults to `Integer.MAX_VALUE`. When not provided, the sink will try to push data to the OpenSearch server indefinitely and exponential backoff will increase the waiting time before a retry. -`aws_sigv4` | No | Boolean | **Deprecated in Data Prepper 2.7.** Default is `false`. Whether to use AWS Identity and Access Management (IAM) signing to connect to an Amazon OpenSearch Service domain. For your access key, secret key, and optional session token, Data Prepper uses the default credential chain (environment variables, Java system properties, `~/.aws/credential`). +`aws_sigv4` | No | Boolean | **Deprecated in Data Prepper 2.7.** Default is `false`. Whether to use AWS Identity and Access Management (IAM) signing to connect to an Amazon OpenSearch Service domain. For your access key, secret key, and optional session token, OpenSearch Data Prepper uses the default credential chain (environment variables, Java system properties, `~/.aws/credential`). `aws_region` | No | String | **Deprecated in Data Prepper 2.7.** The AWS Region (for example, `"us-east-1"`) for the domain when you are connecting to Amazon OpenSearch Service. `aws_sts_role_arn` | No | String | **Deprecated in Data Prepper 2.7.** The IAM role that the plugin uses to sign requests sent to Amazon OpenSearch Service. If this information is not provided, then the plugin uses the default credentials. `socket_timeout` | No | Integer | The timeout value, in milliseconds, when waiting for data to be returned (the maximum period of inactivity between two consecutive data packets). A timeout value of `0` is interpreted as an infinite timeout. If this timeout value is negative or not set, then the underlying Apache HttpClient will rely on operating system settings to manage socket timeouts. `connect_timeout` | No | Integer| The timeout value, in milliseconds, when requesting a connection from the connection manager. A timeout value of `0` is interpreted as an infinite timeout. If this timeout value is negative or not set, the underlying Apache HttpClient will rely on operating system settings to manage connection timeouts. `insecure` | No | Boolean | Whether or not to verify SSL certificates. If set to `true`, then certificate authority (CA) certificate verification is disabled and insecure HTTP requests are sent instead. Default is `false`. `proxy` | No | String | The address of the [forward HTTP proxy server](https://en.wikipedia.org/wiki/Proxy_server). 
The format is `"<hostname or IP>:<port>"` (for example, `"example.com:8100"`, `"http://example.com:8100"`, `"112.112.112.112:8100"`). The port number cannot be omitted. -`index` | Conditionally | String | The name of the export index. Only required when the `index_type` is `custom`. The index can be a plain string, such as `my-index-name`, contain [Java date-time patterns](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html), such as `my-index-%{yyyy.MM.dd}` or `my-%{yyyy-MM-dd-HH}-index`, be formatted using field values, such as `my-index-${/my_field}`, or use [Data Prepper expressions](https://opensearch.org/docs/latest/data-prepper/pipelines/expression-syntax/), such as `my-index-${getMetadata(\"my_metadata_field\"}`. All formatting options can be combined to provide flexibility when creating static, dynamic, and rolling indexes. -`index_type` | No | String | Tells the sink plugin what type of data it is handling. Valid values are `custom`, `trace-analytics-raw`, `trace-analytics-service-map`, or `management-disabled`. Default is `custom`. +`index` | Conditionally | String | The name of the export index. Only required when the `index_type` is `custom`. The index can be a plain string, such as `my-index-name`, contain [Java date-time patterns](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html), such as `my-index-%{yyyy.MM.dd}` or `my-%{yyyy-MM-dd-HH}-index`, be formatted using field values, such as `my-index-${/my_field}`, or use [Data Prepper expressions]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/expression-syntax/), such as `my-index-${getMetadata(\"my_metadata_field\"}`. All formatting options can be combined to provide flexibility when creating static, dynamic, and rolling indexes. +`index_type` | No | String | Specifies the type of data the sink plugin handles. Valid values include `custom`, `trace-analytics-raw`, `trace-analytics-plain-raw`, `trace-analytics-service-map`, `log-analytics`, `log-analytics-plain`, `metric-analytics`, `metric-analytics-plain`, and `management-disabled`. <br><br>To produce Amazon Security Lake–compliant data from the `otel_logs_source` with `output_format: otel`, set `index_type` to `log-analytics-plain`. <br>For `otel_metrics_source` with `output_format: otel`, set `index_type` to `metric-analytics-plain`. <br>For `otel_trace_source` with `output_format: otel`, set `index_type` to `trace-analytics-plain-raw`. <br><br>Default is `custom`. + + `template_type` | No | String | Defines what type of OpenSearch template to use. Available options are `v1` and `index-template`. The default value is `v1`, which uses the original OpenSearch templates available at the `_template` API endpoints. The `index-template` option uses composable [index templates]({{site.url}}{{site.baseurl}}/opensearch/index-templates/), which are available through the OpenSearch `_index_template` API. Composable index types offer more flexibility than the default and are necessary when an OpenSearch cluster contains existing index templates. Composable templates are available for all versions of OpenSearch and some later versions of Elasticsearch. When `distribution_version` is set to `es6`, Data Prepper enforces the `template_type` as `v1`. `template_file` | No | String | The path to a JSON [index template]({{site.url}}{{site.baseurl}}/opensearch/index-templates/) file, such as `/your/local/template-file.json`, when `index_type` is set to `custom`. 
For an example template file, see [otel-v1-apm-span-index-template.json](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-plugins/opensearch/src/main/resources/otel-v1-apm-span-index-template.json). If you supply a template file, then it must match the template format specified by the `template_type` parameter. `template_content` | No | JSON | Contains all the inline JSON found inside of the index [index template]({{site.url}}{{site.baseurl}}/opensearch/index-templates/). For an example of template content, see [the example template content](#example_template_content). diff --git a/_data-prepper/pipelines/configuration/sinks/s3.md b/_data-prepper/pipelines/configuration/sinks/s3.md index 6bae749d387..9e2ba9d777e 100644 --- a/_data-prepper/pipelines/configuration/sinks/s3.md +++ b/_data-prepper/pipelines/configuration/sinks/s3.md @@ -8,7 +8,7 @@ nav_order: 55 # s3 -The `s3` sink saves and writes batches of Data Prepper events to Amazon Simple Storage Service (Amazon S3) objects. The configured `codec` determines how the `s3` sink serializes the data into Amazon S3. +The `s3` sink saves and writes batches of OpenSearch Data Prepper events to Amazon Simple Storage Service (Amazon S3) objects. The configured `codec` determines how the `s3` sink serializes the data into Amazon S3. The `s3` sink uses the following format when batching events: @@ -159,7 +159,7 @@ Use the following options to define how object keys are constructed for objects Option | Required | Type | Description :--- | :--- | :--- | :--- -`path_prefix` | No | String | The S3 key prefix path to use for objects written to S3. Accepts date-time formatting and dynamic injection of values using [Data Prepper expressions](https://opensearch.org/docs/latest/data-prepper/pipelines/expression-syntax/). For example, you can use `/${/my_partition_key}/%{yyyy}/%{MM}/%{dd}/%{HH}/` to create hourly folders in S3 based on the `my_partition_key` value. The prefix path should end with `/`. By default, Data Prepper writes objects to the S3 bucket root. +`path_prefix` | No | String | The S3 key prefix path to use for objects written to S3. Accepts date-time formatting and dynamic injection of values using [Data Prepper expressions]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/expression-syntax/). For example, you can use `/${/my_partition_key}/%{yyyy}/%{MM}/%{dd}/%{HH}/` to create hourly folders in S3 based on the `my_partition_key` value. The prefix path should end with `/`. By default, Data Prepper writes objects to the S3 bucket root. ## `codec` diff --git a/_data-prepper/pipelines/configuration/sinks/sinks.md b/_data-prepper/pipelines/configuration/sinks/sinks.md index 51bf3b1c9c3..cad70af9c70 100644 --- a/_data-prepper/pipelines/configuration/sinks/sinks.md +++ b/_data-prepper/pipelines/configuration/sinks/sinks.md @@ -8,7 +8,7 @@ nav_order: 25 # Sinks -A `sink` is an output component that specifies the destination(s) to which a Data Prepper pipeline publishes events. Sink destinations can be services like OpenSearch, Amazon Simple Storage Service (Amazon S3), or even another Data Prepper pipeline, enabling chaining of multiple pipelines. The sink component has the following configurable options that you can use to customize the destination type. +A `sink` is an output component that specifies the destination(s) to which an OpenSearch Data Prepper pipeline publishes events. 
Sink destinations can be services like OpenSearch, Amazon Simple Storage Service (Amazon S3), or even another Data Prepper pipeline, enabling chaining of multiple pipelines. The sink component has the following configurable options that you can use to customize the destination type. ## Configuration options diff --git a/_data-prepper/pipelines/configuration/sources/documentdb.md b/_data-prepper/pipelines/configuration/sources/documentdb.md index d3dd31edcbe..5eaebcfd5a8 100644 --- a/_data-prepper/pipelines/configuration/sources/documentdb.md +++ b/_data-prepper/pipelines/configuration/sources/documentdb.md @@ -12,7 +12,7 @@ The `documentdb` source reads documents from [Amazon DocumentDB](https://aws.ama It can read historical data from an export and keep up to date on the data using Amazon DocumentDB [change streams](https://docs.aws.amazon.com/documentdb/latest/developerguide/change_streams.html). The `documentdb` source reads data from Amazon DocumentDB and puts that data into an [Amazon Simple Storage Service (Amazon S3)](https://aws.amazon.com/s3/) bucket. -Then, other Data Prepper workers read from the S3 bucket to process data. +Then, other OpenSearch Data Prepper workers read from the S3 bucket to process data. ## Usage The following example pipeline uses the `documentdb` source: diff --git a/_data-prepper/pipelines/configuration/sources/http.md b/_data-prepper/pipelines/configuration/sources/http.md index 574f49e2894..deb035b2203 100644 --- a/_data-prepper/pipelines/configuration/sources/http.md +++ b/_data-prepper/pipelines/configuration/sources/http.md @@ -15,8 +15,9 @@ The `http` plugin accepts HTTP requests from clients. The following table descri Option | Required | Type | Description :--- | :--- | :--- | :--- port | No | Integer | The port that the source is running on. Default value is `2021`. Valid options are between `0` and `65535`. +path | No | String | The URI path for log ingestion should start with a forward slash (/), for example, `/${pipelineName}/logs`. The `${pipelineName}` placeholder will be replaced with the pipeline name. The default value is `/log/ingest`. health_check_service | No | Boolean | Enables the health check service on the `/health` endpoint on the defined port. Default value is `false`. -unauthenticated_health_check | No | Boolean | Determines whether or not authentication is required on the health check endpoint. Data Prepper ignores this option if no authentication is defined. Default value is `false`. +unauthenticated_health_check | No | Boolean | Determines whether or not authentication is required on the health check endpoint. OpenSearch Data Prepper ignores this option if no authentication is defined. Default value is `false`. request_timeout | No | Integer | The request timeout, in milliseconds. Default value is `10000`. thread_count | No | Integer | The number of threads to keep in the ScheduledThreadPool. Default value is `200`. max_connection_count | No | Integer | The maximum allowed number of open connections. Default value is `500`. 
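Because the new `path` option changes the ingestion URI, a minimal sketch may help; the pipeline name and custom path below are illustrative assumptions:

```yaml
log-pipeline:
  source:
    http:
      port: 2021                       # default port
      path: "/${pipelineName}/logs"    # illustrative custom path; default is /log/ingest
  sink:
    - stdout:
```
{% include copy.html %}

With this configuration, clients would post logs to `http://localhost:2021/log-pipeline/logs` rather than the default `/log/ingest` endpoint.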
@@ -46,7 +47,9 @@ The `http` protocol only supports the JSON UTF-8 codec for incoming requests, fo The following cURL command can be used to ingest data: -`curl "http://localhost:2021/log/ingest" --data '[{"key1": "value1"}, {"key2": "value2"}]'` +``` +curl "http://localhost:2021/log/ingest" --data '[{"key1": "value1"}, {"key2": "value2"}]' +``` {% include copy-curl.html %} ## Metrics diff --git a/_data-prepper/pipelines/configuration/sources/kafka.md b/_data-prepper/pipelines/configuration/sources/kafka.md index ecd7c7eaa0a..240473b8648 100644 --- a/_data-prepper/pipelines/configuration/sources/kafka.md +++ b/_data-prepper/pipelines/configuration/sources/kafka.md @@ -8,7 +8,7 @@ nav_order: 40 # kafka -You can use the Apache Kafka source (`kafka`) in Data Prepper to read records from one or more Kafka [topics](https://kafka.apache.org/intro#intro_concepts_and_terms). These records hold events that your Data Prepper pipeline can ingest. The `kafka` source uses Kafka's [Consumer API](https://kafka.apache.org/documentation/#consumerapi) to consume messages from the Kafka broker, which then creates Data Prepper events for further processing by the Data Prepper pipeline. +You can use the Apache Kafka source (`kafka`) in OpenSearch Data Prepper to read records from one or more Kafka [topics](https://kafka.apache.org/intro#intro_concepts_and_terms). These records hold events that your Data Prepper pipeline can ingest. The `kafka` source uses Kafka's [Consumer API](https://kafka.apache.org/documentation/#consumerapi) to consume messages from the Kafka broker, which then creates Data Prepper events for further processing by the Data Prepper pipeline. ## Usage @@ -55,7 +55,7 @@ Option | Required | Type | Description `auto_commit` | No | Boolean | When `false`, the consumer's offset will not be periodically committed to Kafka in the background. Default is `false`. `commit_interval` | No | Integer | When `auto_commit` is set to `true`, sets how frequently, in seconds, the consumer offsets are auto-committed to Kafka through Kafka's `auto.commit.interval.ms` option. Default is `5s`. `session_timeout` | No | Integer | The amount of time during which the source detects client failures when using Kafka's group management features, which can be used to balance the data stream. Default is `45s`. -`auto_offset_reset` | No | String | Automatically resets the offset to an earlier or the latest offset through Kafka's `auto.offset.reset` option. Default is `latest`. +`auto_offset_reset` | No | String | Automatically resets the offset to an earlier or the latest offset through Kafka's `auto.offset.reset` option. Default is `earliest`. `thread_waiting_time` | No | Integer | The amount of time that threads wait for the preceding thread to complete its task and to signal the next thread. The Kafka consumer API poll timeout value is set to half of this setting. Default is `5s`. `max_partition_fetch_bytes` | No | Integer | Sets the maximum limit in megabytes for max data returns from each partition through Kafka's `max.partition.fetch.bytes` setting. Default is `1mb`. `heart_beat_interval` | No | Integer | The expected amount of time between heartbeats to the consumer coordinator when using Kafka's group management facilities through Kafka's `heartbeat.interval.ms` setting. Default is `5s`. 
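Because the `auto_offset_reset` default described above is now `earliest`, setting it explicitly can make the intended behavior clear. The following sketch assumes a local broker and a single topic; the bootstrap servers, topic name, and group ID are illustrative assumptions:

```yaml
kafka-pipeline:
  source:
    kafka:
      bootstrap_servers:
        - "localhost:9092"             # illustrative broker address
      topics:
        - name: "my-topic"             # illustrative topic name
          group_id: "my-consumer-group"
          auto_offset_reset: "earliest"
  sink:
    - stdout:
```
{% include copy.html %}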
diff --git a/_data-prepper/pipelines/configuration/sources/kinesis.md b/_data-prepper/pipelines/configuration/sources/kinesis.md new file mode 100644 index 00000000000..bf732da6295 --- /dev/null +++ b/_data-prepper/pipelines/configuration/sources/kinesis.md @@ -0,0 +1,170 @@ +--- +layout: default +title: kinesis +parent: Sources +grand_parent: Pipelines +nav_order: 45 +--- + +# kinesis + +You can use the OpenSearch Data Prepper `kinesis` source to ingest records from one or more [Amazon Kinesis Data Streams](https://aws.amazon.com/kinesis/data-streams/). + +## Usage + +The following example pipeline specifies Kinesis as a source. The pipeline ingests data from multiple Kinesis data streams named `stream1` and `stream2` and sets the `initial_position` to indicate the starting point for reading the stream records: + +```yaml +version: "2" +kinesis-pipeline: + source: + kinesis: + streams: + - stream_name: "stream1" + initial_position: "LATEST" + - stream_name: "stream2" + initial_position: "LATEST" + aws: + region: "us-west-2" + sts_role_arn: "arn:aws:iam::123456789012:role/my-iam-role" +``` + +## Configuration options + +The `kinesis` source supports the following configuration options. + +Option | Required | Type | Description +:--- |:---------|:---------| :--- +`aws` | Yes | AWS | Specifies the AWS configuration. See [`aws`](#aws). +`acknowledgments` | No | Boolean | When set to `true`, enables the `kinesis` source to receive [end-to-end acknowledgments]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/pipelines#end-to-end-acknowledgments) when events are received by OpenSearch sinks. +`streams` | Yes | List | Configures a list of multiple Kinesis data streams that the `kinesis` source uses to read records. You can configure up to four streams. See [Streams](#streams). +`codec` | Yes | Codec | Specifies the [codec](#codec) to apply. +`buffer_timeout` | No | Duration | Sets the amount of time allowed for writing events to the Data Prepper buffer before timeout occurs. Any events that the source cannot write to the buffer during the specified amount of time are discarded. Default is `1s`. +`records_to_accumulate` | No | Integer | Determines the number of messages that accumulate before being written to the buffer. Default is `100`. +`consumer_strategy` | No | String | Selects the consumer strategy to use for ingesting Kinesis data streams. The default is `fan-out`, but `polling` can also be used. If `polling` is enabled, the additional configuration is required. +`polling` | No | polling | See [polling](#polling). + +### Streams + +You can use the following options in the `streams` array. + +Option | Required | Type | Description +:--- |:---------| :--- | :--- +`stream_name` | Yes | String | Defines the name of each Kinesis data stream. +`initial_position` | No | String | Sets the `initial_position` to determine at what point the `kinesis` source starts reading stream records. Use `LATEST` to start from the most recent record or `EARLIEST` to start from the beginning of the stream. Default is `LATEST`. +`checkpoint_interval` | No | Duration | Configure the `checkpoint_interval` to periodically checkpoint Kinesis data streams and avoid duplication of record processing. Default is `PT2M`. +`compression` | No | String | Specifies the compression format. To decompress records added by a [CloudWatch Logs Subscription Filter](https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/SubscriptionFilters.html) to Kinesis, use the `gzip` compression format. 
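As a point of reference, the stream-level options above can be combined as shown in the following sketch, which assumes a single stream carrying gzip-compressed CloudWatch Logs records; the stream name and Region are illustrative assumptions:

```yaml
version: "2"
kinesis-pipeline:
  source:
    kinesis:
      streams:
        - stream_name: "cloudwatch-logs-stream"   # illustrative stream name
          initial_position: "EARLIEST"
          checkpoint_interval: "PT2M"
          compression: "gzip"
      aws:
        region: "us-east-1"
        sts_role_arn: "arn:aws:iam::123456789012:role/my-iam-role"
```
{% include copy.html %}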
+ +## codec + +The `codec` determines how the `kinesis` source parses each Kinesis stream record. For increased and more efficient performance, you can use [codec combinations]({{site.url}}{{site.baseurl}}/data-prepper/common-use-cases/codec-processor-combinations/) with certain processors. + +### json codec + +The `json` codec parses each single line as a single JSON object from a JSON array and then creates a Data Prepper event for each object in the array. It can be used for parsing nested CloudWatch events into individual log entries. +It also supports the below configuration to use with this codec. + +Option | Required | Type | Description +:--- | :--- |:--------| :--- +`key_name` | No | String | The name of the input field from which to extract the JSON array and create Data Prepper events. +`include_keys` | No | List | The list of input fields to be extracted and added as additional fields in the Data Prepper event. +`include_keys_metadata` | No | List | The list of input fields to be extracted and added to the Data Prepper event metadata object. +`max_event_length` | No | Integer | The maximum size of any single event being read by the JSON codec. Default is 20,000,000 characters. + + +### `newline` codec + +The `newline` codec parses each Kinesis stream record as a single log event, making it ideal for processing single-line records. It also works well with the [`parse_json` processor]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/processors/parse-json/) to parse each line. + +You can use the following options to configure the `newline` codec. + +Option | Required | Type | Description +:--- | :--- |:--------| :--- +`skip_lines` | No | Integer | Sets the number of lines to skip before creating events. You can use this configuration to skip common header rows. Default is `0`. +`header_destination` | No | String | Defines a key value to assign to the header line of the stream event. If this option is specified, then each event will contain a `header_destination` field. + +### polling + +When the `consumer_strategy` is set to `polling`, the `kinesis` source uses a polling-based approach to read records from the Kinesis data streams, instead of the default `fan-out` approach. + +Option | Required | Type | Description +:--- | :--- |:--------| :--- +`max_polling_records` | No | Integer | Sets the number of records to fetch from Kinesis during a single call. +`idle_time_between_reads` | No | Duration | Defines the amount of idle time between calls. + +### aws + +You can use the following options in the `aws` configuration. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`region` | No | String | Sets the AWS Region to use for credentials. Defaults to the [standard SDK behavior for determining the Region](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/region-selection.html). +`sts_role_arn` | No | String | Defines the AWS Security Token Service (AWS STS) role to assume for requests to Amazon Kinesis Data Streams and Amazon DynamoDB. Defaults to `null`, which uses the [standard SDK behavior for credentials](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/credentials.html). +`aws_sts_header_overrides` | No | Map | Defines a map of header overrides that the AWS Identity and Access Management (IAM) role assumes for the sink plugin. + +## Exposed metadata attributes + +The `kinesis` source adds the following metadata to each processed event. 
You can access the metadata attributes using the [expression syntax `getMetadata` function]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/get-metadata/). + +- `stream_name`: Contains the name of the Kinesis data stream from which the event was obtained. + +## Permissions + +The following minimum permissions are required in order to run `kinesis` as a source: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "kinesis:DescribeStream", + "kinesis:DescribeStreamConsumer", + "kinesis:DescribeStreamSummary", + "kinesis:GetRecords", + "kinesis:GetShardIterator", + "kinesis:ListShards", + "kinesis:ListStreams", + "kinesis:ListStreamConsumers", + "kinesis:RegisterStreamConsumer", + "kinesis:SubscribeToShard" + ], + "Resource": [ + "arn:aws:kinesis:us-east-1:{account-id}:stream/stream1", + "arn:aws:kinesis:us-east-1:{account-id}:stream/stream2" + ] + }, + { + "Sid": "allowCreateTable", + "Effect": "Allow", + "Action": [ + "dynamodb:CreateTable", + "dynamodb:PutItem", + "dynamodb:DescribeTable", + "dynamodb:DeleteItem", + "dynamodb:GetItem", + "dynamodb:Scan", + "dynamodb:UpdateItem", + "dynamodb:Query" + ], + "Resource": [ + "arn:aws:dynamodb:us-east-1:{account-id}:table/kinesis-pipeline" + ] + } + ] +} +``` + +The `kinesis` source uses a DynamoDB table for ingestion coordination among multiple workers, so you need DynamoDB permissions. + +## Metrics + +The `kinesis` source includes the following metrics. + +### Counters + +* `recordsProcessed`: Counts the number of processed stream records. +* `recordProcessingErrors`: Counts the number of stream record processing errors. +* `acknowledgementSetSuccesses`: Counts the number of processed stream records that were successfully added to the sink. +* `acknowledgementSetFailures`: Counts the number of processed stream records that failed to be added to the sink. diff --git a/_data-prepper/pipelines/configuration/sources/opensearch.md b/_data-prepper/pipelines/configuration/sources/opensearch.md index 1ee22375753..1c8fc337ae4 100644 --- a/_data-prepper/pipelines/configuration/sources/opensearch.md +++ b/_data-prepper/pipelines/configuration/sources/opensearch.md @@ -84,7 +84,7 @@ The `opensearch` source can be configured with Amazon OpenSearch Serverless by s ## Using metadata -When the `opensource` source constructs Data Prepper events from documents in the cluster, the document index is stored in the EventMetadata with an `opensearch-index` key, and the document_id is stored in the `EventMetadata` with the `opensearch-document_id` as the key. This allows for conditional routing based on the index or `document_id`. The following example pipeline configuration sends events to an `opensearch` sink and uses the same index and `document_id` from the source cluster as in the destination cluster: +When the `opensource` source constructs OpenSearch Data Prepper events from documents in the cluster, the document index is stored in the EventMetadata with an `opensearch-index` key, and the document_id is stored in the `EventMetadata` with the `opensearch-document_id` as the key. This allows for conditional routing based on the index or `document_id`. 
The following example pipeline configuration sends events to an `opensearch` sink and uses the same index and `document_id` from the source cluster as in the destination cluster: ```yaml @@ -177,8 +177,8 @@ Option | Required | Type | Description ### Default search behavior By default, the `opensearch` source will look up the cluster version and distribution to determine -which `search_context_type` to use. For versions and distributions that support [Point in Time](https://opensearch.org/docs/latest/search-plugins/searching-data/paginate/#point-in-time-with-search_after), `point_in_time` will be used. -If `point_in_time` is not supported by the cluster, then [scroll](https://opensearch.org/docs/latest/search-plugins/searching-data/paginate/#scroll-search) will be used. For Amazon OpenSearch Serverless collections, [search_after](https://opensearch.org/docs/latest/search-plugins/searching-data/paginate/#the-search_after-parameter) will be used because neither `point_in_time` nor `scroll` are supported by collections. +which `search_context_type` to use. For versions and distributions that support [Point in Time]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/paginate/#point-in-time-with-search_after), `point_in_time` will be used. +If `point_in_time` is not supported by the cluster, then [scroll]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/paginate/#scroll-search) will be used. For Amazon OpenSearch Serverless collections, [search_after]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/paginate/#the-search_after-parameter) will be used because neither `point_in_time` nor `scroll` are supported by collections. ### Connection diff --git a/_data-prepper/pipelines/configuration/sources/otel-logs-source.md b/_data-prepper/pipelines/configuration/sources/otel-logs-source.md index 38095d7d7f7..47dd741f22a 100644 --- a/_data-prepper/pipelines/configuration/sources/otel-logs-source.md +++ b/_data-prepper/pipelines/configuration/sources/otel-logs-source.md @@ -29,6 +29,8 @@ You can configure the `otel_logs_source` source with the following options. | unframed_requests | Boolean | Enables requests that are not framed using the gRPC wire protocol. Default value is `false`. | | thread_count | int | The number of threads to keep in the `ScheduledThreadPool`. Default value is `500`. | | max_connection_count | int | The maximum number of open connections allowed. Default value is `500`. | +| `output_format` | String | Specifies the output format of the generated events. Valid values are `otel` or `opensearch`. Default is `opensearch`. | + ### SSL @@ -52,6 +54,15 @@ source: - otel_logs_source: ``` +To generate data in the OpenTelemetry format, set the `output_format` setting to `otel`, as shown in the following example: + +```yaml +source: + - otel_logs_source: + output_format: otel +``` +{% include copy.html %} + ## Metrics You can use the following metrics with the `otel_logs_source` source. @@ -65,4 +76,4 @@ You can use the following metrics with the `otel_logs_source` source. | `internalServerError` | Counter | Measures the total number of requests that are erroneous due to errors other than `requestTimeouts` or `requestsTooLarge`. | | `successRequests` | Counter | Measures the total number of requests successfully written to the buffer. | | `payloadSize` | Distribution summary | Measures the distribution of all incoming payload sizes. | -| `requestProcessDuration` | Timer | Measures the duration of request processing. 
| \ No newline at end of file +| `requestProcessDuration` | Timer | Measures the duration of request processing. | diff --git a/_data-prepper/pipelines/configuration/sources/otel-metrics-source.md b/_data-prepper/pipelines/configuration/sources/otel-metrics-source.md index 0e8d3778280..6283743116d 100644 --- a/_data-prepper/pipelines/configuration/sources/otel-metrics-source.md +++ b/_data-prepper/pipelines/configuration/sources/otel-metrics-source.md @@ -19,6 +19,7 @@ proto_reflection_service | No | Boolean | Enables a reflection service for Proto unframed_requests | No | Boolean | Enables requests not framed using the gRPC wire protocol. thread_count | No | Integer | The number of threads to keep in the `ScheduledThreadPool`. Default value is `200`. max_connection_count | No | Integer | The maximum allowed number of open connections. Default value is `500`. +| `output_format` | String | Specifies the output format of the generated events. Valid values are `otel` or `opensearch`. Default is `opensearch`. | max_request_length | No | ByteCount | The maximum number of bytes allowed in the payload of a single gRPC or HTTP request. Default value is `10mb`. ssl | No | Boolean | Enables connections to the OpenTelemetry source port over TLS/SSL. Default value is `true`. sslKeyCertChainFile | Conditionally | String | File-system path or Amazon Simple Storage Service (Amazon S3) path to the security certificate (for example, `"config/demo-data-prepper.crt"` or `"s3://my-secrets-bucket/demo-data-prepper.crt"`). Required if `ssl` is set to `true`. @@ -28,9 +29,25 @@ acmCertificateArn | Conditionally | String | Represents the ACM certificate ARN. awsRegion | Conditionally | String | Represents the AWS Region used by ACM or Amazon S3. Required if `useAcmCertForSSL` is set to `true` or `sslKeyCertChainFile` and `sslKeyFile` is the Amazon S3 path. authentication | No | Object | An authentication configuration. By default, an unauthenticated server is created for the pipeline. This uses pluggable authentication for HTTPS. To use basic authentication, define the `http_basic` plugin with a `username` and `password`. To provide customer authentication, use or create a plugin that implements [GrpcAuthenticationProvider](https://github.com/opensearch-project/data-prepper/blob/1.2.0/data-prepper-plugins/armeria-common/src/main/java/com/amazon/dataprepper/armeria/authentication/GrpcAuthenticationProvider.java). -<!--- ## Configuration +## Usage + +To use the `otel-metrics` source, create the following `pipeline.yaml` file with `otel_metrics_source` as the source: + +```yaml +source: + - otel_metrics_source: +``` +{% include copy.html %} + +To use the OpenTelemetry format for your output, set the `output_format` to `otel`, as shown in the following example: + +```yaml +source: + - otel_metrics_source: + output_format: otel +``` +{% include copy.html %} -Content will be added to this section.---> ## Metrics diff --git a/_data-prepper/pipelines/configuration/sources/otel-trace-source.md b/_data-prepper/pipelines/configuration/sources/otel-trace-source.md index de45a5de638..9047cfd0c2f 100644 --- a/_data-prepper/pipelines/configuration/sources/otel-trace-source.md +++ b/_data-prepper/pipelines/configuration/sources/otel-trace-source.md @@ -19,11 +19,12 @@ Option | Required | Type | Description port | No | Integer | The port that the `otel_trace_source` source runs on. Default value is `21890`. request_timeout | No | Integer | The request timeout, in milliseconds. Default value is `10000`. 
health_check_service | No | Boolean | Enables a gRPC health check service under `grpc.health.v1/Health/Check`. Default value is `false`. -unauthenticated_health_check | No | Boolean | Determines whether or not authentication is required on the health check endpoint. Data Prepper ignores this option if no authentication is defined. Default value is `false`. +unauthenticated_health_check | No | Boolean | Determines whether or not authentication is required on the health check endpoint. OpenSearch Data Prepper ignores this option if no authentication is defined. Default value is `false`. proto_reflection_service | No | Boolean | Enables a reflection service for Protobuf services (see [gRPC reflection](https://github.com/grpc/grpc/blob/master/doc/server-reflection.md) and [gRPC Server Reflection Tutorial](https://github.com/grpc/grpc-java/blob/master/documentation/server-reflection-tutorial.md) docs). Default value is `false`. unframed_requests | No | Boolean | Enable requests not framed using the gRPC wire protocol. thread_count | No | Integer | The number of threads to keep in the ScheduledThreadPool. Default value is `200`. max_connection_count | No | Integer | The maximum allowed number of open connections. Default value is `500`. +| `output_format` | String | Specifies the output format of the generated events. Valid values are `otel` or `opensearch`. Default is `opensearch`. | max_request_length | No | ByteCount | The maximum number of bytes allowed in the payload of a single gRPC or HTTP request. Default value is `10mb`. ssl | No | Boolean | Enables connections to the OTel source port over TLS/SSL. Defaults to `true`. sslKeyCertChainFile | Conditionally | String | File system path or Amazon Simple Storage Service (Amazon S3) path to the security certificate (for example, `"config/demo-data-prepper.crt"` or `"s3://my-secrets-bucket/demo-data-prepper.crt"`). Required if `ssl` is set to `true`. @@ -33,6 +34,24 @@ acmCertificateArn | Conditionally | String | Represents the ACM certificate ARN. awsRegion | Conditionally | String | Represents the AWS region used by ACM or Amazon S3. Required if `useAcmCertForSSL` is set to `true` or `sslKeyCertChainFile` and `sslKeyFile` are Amazon S3 paths. authentication | No | Object | An authentication configuration. By default, an unauthenticated server is created for the pipeline. This parameter uses pluggable authentication for HTTPS. To use basic authentication, define the `http_basic` plugin with a `username` and `password`. To provide customer authentication, use or create a plugin that implements [GrpcAuthenticationProvider](https://github.com/opensearch-project/data-prepper/blob/1.2.0/data-prepper-plugins/armeria-common/src/main/java/com/amazon/dataprepper/armeria/authentication/GrpcAuthenticationProvider.java). 
+## Usage + +To use the `otel-metrics` source, create the following `pipeline.yaml` file with `otel_metrics_source` as the source: + +```yaml +source: + - otel_trace_source: +``` +{% include copy.html %} + +If you want to use the OpenTelemetry format for your output, set the `output_format` to `otel`, as shown in the following example: + +```yaml +source: + - otel_trace_source: + output_format: otel +``` +{% include copy.html %} ## Metrics diff --git a/_data-prepper/pipelines/configuration/sources/pipeline.md b/_data-prepper/pipelines/configuration/sources/pipeline.md index 6ba025bd18c..46aa9ad675f 100644 --- a/_data-prepper/pipelines/configuration/sources/pipeline.md +++ b/_data-prepper/pipelines/configuration/sources/pipeline.md @@ -12,7 +12,7 @@ Use the `pipeline` sink to read from another pipeline. ## Configuration options -The `pipeline` sink supports the following configuration options. +The `pipeline` source supports the following configuration options. | Option | Required | Type | Description | |:-------|:---------|:-------|:---------------------------------------| diff --git a/_data-prepper/pipelines/configuration/sources/s3.md b/_data-prepper/pipelines/configuration/sources/s3.md index db92718a36f..ce3a2842868 100644 --- a/_data-prepper/pipelines/configuration/sources/s3.md +++ b/_data-prepper/pipelines/configuration/sources/s3.md @@ -11,7 +11,7 @@ nav_order: 100 `s3` is a source plugin that reads events from [Amazon Simple Storage Service (Amazon S3)](https://aws.amazon.com/s3/) objects. You can configure the source to either use an [Amazon Simple Queue Service (Amazon SQS)](https://aws.amazon.com/sqs/) queue or scan an S3 bucket: - To use Amazon SQS notifications, configure S3 event notifications on your S3 bucket. After Amazon SQS is configured, the `s3` source receives messages from Amazon SQS. When the SQS message indicates that an S3 object has been created, the `s3` source loads the S3 objects and then parses them using the configured [codec](#codec). -- To use an S3 bucket, configure the `s3` source to use Amazon S3 Select instead of Data Prepper to parse S3 objects. +- To use an S3 bucket, configure the `s3` source to use Amazon S3 Select instead of OpenSearch Data Prepper to parse S3 objects. ## IAM permissions @@ -104,7 +104,7 @@ Option | Required | Type | Description `s3_select` | No | [s3_select](#s3_select) | The Amazon S3 Select configuration. `scan` | No | [scan](#scan) | The S3 scan configuration. `delete_s3_objects_on_read` | No | Boolean | When `true`, the S3 scan attempts to delete S3 objects after all events from the S3 object are successfully acknowledged by all sinks. `acknowledgments` should be enabled when deleting S3 objects. Default is `false`. -`workers` | No | Integer | Configures the number of worker threads that the source uses to read data from S3. Leave this value as the default unless your S3 objects are less than 1 MB in size. Performance may decrease for larger S3 objects. This setting affects SQS-based sources and S3-Scan sources. Default is `1`. +`workers` | No | Integer | Configures the number of worker threads (1--10) that the source uses to read data from S3. Leave this value as the default unless your S3 objects are less than 1 MB in size. Performance may decrease for larger S3 objects. This setting affects SQS-based sources and S3-Scan sources. Default is `1`. 
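To make the SQS-based flow described above concrete, the following minimal sketch assumes an existing queue that receives S3 event notifications; the queue URL, Region, and role ARN are illustrative assumptions:

```yaml
s3-log-pipeline:
  source:
    s3:
      notification_type: "sqs"
      codec:
        newline:
      sqs:
        queue_url: "https://sqs.us-east-1.amazonaws.com/123456789012/my-s3-notifications"   # illustrative queue URL
      aws:
        region: "us-east-1"
        sts_role_arn: "arn:aws:iam::123456789012:role/my-pipeline-role"                     # illustrative role
  sink:
    - stdout:
```
{% include copy.html %}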
diff --git a/_data-prepper/pipelines/configuration/sources/sources.md b/_data-prepper/pipelines/configuration/sources/sources.md index 682f215517a..9da88a080e8 100644 --- a/_data-prepper/pipelines/configuration/sources/sources.md +++ b/_data-prepper/pipelines/configuration/sources/sources.md @@ -8,6 +8,6 @@ nav_order: 110 # Sources -A `source` is an input component that specifies how a Data Prepper pipeline ingests events. Each pipeline has a single source that either receives events over HTTP(S) or reads from external endpoints, such as OpenTelemetry Collector or Amazon Simple Storage Service (Amazon S3). Sources have configurable options based on the event format (string, JSON, Amazon CloudWatch logs, OpenTelemtry traces). The source consumes events and passes them to the [`buffer`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/buffers/buffers/) component. +A `source` is an input component that specifies how an OpenSearch Data Prepper pipeline ingests events. Each pipeline has a single source that either receives events over HTTP(S) or reads from external endpoints, such as OpenTelemetry Collector or Amazon Simple Storage Service (Amazon S3). Sources have configurable options based on the event format (string, JSON, Amazon CloudWatch logs, OpenTelemetry traces). The source consumes events and passes them to the [`buffer`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/buffers/buffers/) component. diff --git a/_data-prepper/pipelines/contains.md b/_data-prepper/pipelines/contains.md index 657f66bd28e..bb65b4c785b 100644 --- a/_data-prepper/pipelines/contains.md +++ b/_data-prepper/pipelines/contains.md @@ -20,17 +20,18 @@ For example, if you want to check if the string `"abcd"` is contained within the ``` contains('/message', 'abcd') ``` -{% include copy-curl.html %} +{% include copy.html %} -This will return `true` if the field `message` contains the substring `abcd` or `false` if it does not. +This call returns `true` if the field `message` contains the substring `abcd` or `false` if it does not. -Alternatively, you can also use a literal string as the first argument: +Alternatively, you can use a literal string as the first argument: ``` contains('This is a test message', 'test') ``` -{% include copy-curl.html %} +{% include copy.html %} -In this case, the function will return `true` because the substring `test` is present within the string `This is a test message`. +In this case, the function returns `true` because the substring `test` is present within the string `This is a test message`. -Note that the `contains()` function performs a case-sensitive search by default. If you need to perform a case-insensitive search, you can use the `containsIgnoreCase()` function instead. +The `contains()` function performs a case-sensitive search. +{: .note} diff --git a/_data-prepper/pipelines/dlq.md b/_data-prepper/pipelines/dlq.md index ac1d868ea4b..fc18e6f1e93 100644 --- a/_data-prepper/pipelines/dlq.md +++ b/_data-prepper/pipelines/dlq.md @@ -7,7 +7,7 @@ nav_order: 15 # Dead-letter queues -Data Prepper pipelines support dead-letter queues (DLQs) for offloading failed events and making them accessible for analysis. +OpenSearch Data Prepper pipelines support dead-letter queues (DLQs) for offloading failed events and making them accessible for analysis. As of Data Prepper 2.3, only the `s3` source supports DLQs.
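
For orientation, the following is a minimal sketch of a DLQ configured on an `opensearch` sink that writes failed events to an S3 bucket. The option names (`dlq`, `bucket`, `key_path_prefix`, `region`, `sts_role_arn`) reflect the S3 DLQ writer and should be verified against the settings documented later on this page; the bucket name and role ARN are placeholders:

```yaml
sink:
  - opensearch:
      hosts: [ "https://opensearch:9200" ]
      index: application_logs
      dlq:
        s3:
          bucket: "my-dlq-bucket"                # placeholder bucket name
          key_path_prefix: "dlq-files/"          # prefix for DLQ objects
          region: "us-west-2"
          sts_role_arn: "arn:aws:iam::123456789012:role/dlq-writer"
```
{% include copy.html %}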
diff --git a/_data-prepper/pipelines/expression-syntax.md b/_data-prepper/pipelines/expression-syntax.md index 383b54c19b9..b496c2abbb0 100644 --- a/_data-prepper/pipelines/expression-syntax.md +++ b/_data-prepper/pipelines/expression-syntax.md @@ -7,7 +7,7 @@ nav_order: 5 # Expression syntax -Expressions provide flexibility in manipulating, filtering, and routing data. The following sections provide information about expression syntax in Data Prepper. +Expressions provide flexibility in manipulating, filtering, and routing data. The following sections provide information about expression syntax in OpenSearch Data Prepper. ## Key terms @@ -30,6 +30,9 @@ The following table lists the supported operators. Operators are listed in order |----------------------|-------------------------------------------------------|---------------| | `()` | Priority expression | Left to right | | `not`<br> `+`<br> `-`| Unary logical NOT<br>Unary positive<br>Unary negative | Right to left | +| `*`, `/` | Multiplication and division operators | Left to right | +| `+`, `-` | Addition and subtraction operators | Left to right | +| `+` | String concatenation operator | Left to right | | `<`, `<=`, `>`, `>=` | Relational operators | Left to right | | `==`, `!=` | Equality operators | Left to right | | `and`, `or` | Conditional expression | Left to right | @@ -44,14 +47,14 @@ Relational operators compare numeric values or JSON pointers that resolve to num <Number | JSON Pointer> > <Number | JSON Pointer> <Number | JSON Pointer> >= <Number | JSON Pointer> ``` -{% include copy-curl.html %} +{% include copy.html %} For example, to check if the value of the `status_code` field in an event is within the range of successful HTTP responses (200--299), you can use the following expression: ``` /status_code >= 200 and /status_code < 300 ``` -{% include copy-curl.html %} +{% include copy.html %} ### Equality operators @@ -61,7 +64,7 @@ Equality operators are used to test whether two values are equivalent. These ope <Any> == <Any> <Any> != <Any> ``` -{% include copy-curl.html %} +{% include copy.html %} The following are some example equality operators: @@ -78,7 +81,6 @@ Conditional expressions allow you to combine multiple expressions or values usin <Any> or <Any> not <Any> ``` -{% include copy-curl.html %} The following are some example conditional expressions: @@ -89,11 +91,66 @@ not /status_code in {200, 202} /response == null /response != null ``` -{% include copy-curl.html %} +{% include copy.html %} + +### Arithmetic expressions + +Arithmetic expressions enable basic mathematical operations like addition, subtraction, multiplication, and division. These expressions can be combined with conditional expressions to create more complex conditional statements. The available arithmetic operators are +, -, *, and /. The syntax for using the arithmetic operators is as follows: + +``` +<Any> + <Any> +<Any> - <Any> +<Any> * <Any> +<Any> / <Any> +``` + +The following are example arithmetic expressions: + +``` +/value + length(/message) +/bytes / 1024 +/value1 - /value2 +/TimeInSeconds * 1000 +``` +{% include copy.html %} + +The following are some example arithmetic expressions used in conditional expressions : + +``` +/value + length(/message) > 200 +/bytes / 1024 < 10 +/value1 - /value2 != /value3 + /value4 +``` +{% include copy.html %} + +### String concatenation expressions + +String concatenation expressions enable you to combine strings to create new strings. 
These concatenated strings can also be used within conditional expressions. The syntax for using string concatenation is as follows: + +``` +<String Variable or String Literal> + <String Variable or String Literal> +``` + +The following are example string concatenation expressions: + +``` +/name + "suffix" +"prefix" + /name +"time of " + /timeInMs + " ms" +``` +{% include copy.html %} + +The following are example string concatenation expressions that can be used in conditional expressions: + +``` +/service + ".com" == /url +"www." + /service != /url +``` +{% include copy.html %} ### Reserved symbols -Reserved symbols are symbols that are not currently used in the expression syntax but are reserved for possible future functionality or extensions. Reserved symbols include `^`, `*`, `/`, `%`, `+`, `-`, `xor`, `=`, `+=`, `-=`, `*=`, `/=`, `%=`, `++`, `--`, and `${<text>}`. +Certain symbols, such as ^, %, xor, =, +=, -=, *=, /=, %=, ++, --, and ${<text>}, are reserved for future functionality or extensions. Reserved symbols include `^`, `%`, `xor`, `=`, `+=`, `-=`, `*=`, `/=`, `%=`, `++`, `--`, and `${<text>}`. ## Syntax components @@ -106,7 +163,7 @@ Priority expressions specify the evaluation order of expressions. They are enclo ``` /is_cool == (/name == "Steven") ``` -{% include copy-curl.html %} +{% include copy.html %} ### JSON pointers @@ -121,7 +178,7 @@ The shorthand syntax for a JSON pointer can be expressed using the following reg ``` /\w+(/\w+)*` ``` -{% include copy-curl.html %} +{% include copy.html %} The following is an example of this shorthand syntax: @@ -129,7 +186,7 @@ The following is an example of this shorthand syntax: ``` /Hello/World/0 ``` -{% include copy-curl.html %} +{% include copy.html %} #### Escaped syntax @@ -138,7 +195,7 @@ The escaped syntax for a JSON pointer can be expressed as follows: ``` "/<Valid String Characters | Escaped Character>(/<Valid String Characters | Escaped Character>)*" ``` -{% include copy-curl.html %} +{% include copy.html %} The following is an example of an escaped JSON pointer: @@ -147,7 +204,7 @@ The following is an example of an escaped JSON pointer: # { "Hello - 'world/" : [{ "\"JsonPointer\"": true }] } "/Hello - 'world\//0/\"JsonPointer\"" ``` -{% include copy-curl.html %} +{% include copy.html %} ### Literals @@ -170,6 +227,9 @@ White space is optional around relational operators, regex equality operators, e | `()` | Priority expression | Yes | `/a==(/b==200)`<br>`/a in ({200})` | `/status in({200})` | | `in`, `not in` | Set operators | Yes | `/a in {200}`<br>`/a not in {400}` | `/a in{200, 202}`<br>`/a not in{400}` | | `<`, `<=`, `>`, `>=` | Relational operators | No | `/status < 300`<br>`/status>=300` | | +| `+` | String concatenation operator | No | `/status_code + /message + "suffix"` +| `+`, `-` | Arithmetic addition and subtraction operators | No | `/status_code + length(/message) - 2` +| `*`, `/` | Multiplication and division operators | No | `/status_code * length(/message) / 3` | `=~`, `!~` | Regex equality operators | No | `/msg =~ "^\w*$"`<br>`/msg=~"^\w*$"` | | | `==`, `!=` | Equality operators | No | `/status == 200`<br>`/status_code==200` | | | `and`, `or`, `not` | Conditional operators | Yes | `/a<300 and /b>200` | `/b<300and/b>200` | diff --git a/_data-prepper/pipelines/functions.md b/_data-prepper/pipelines/functions.md index f0661faba4f..c3ecb4122bf 100644 --- a/_data-prepper/pipelines/functions.md +++ b/_data-prepper/pipelines/functions.md @@ -8,11 +8,13 @@ has_children: true # Functions -Data Prepper offers a 
range of built-in functions that can be used within expressions to perform common data preprocessing tasks, such as calculating lengths, checking for tags, retrieving metadata, searching for substrings, checking IP address ranges, and joining list elements. These functions include the following: +OpenSearch Data Prepper offers a range of built-in functions that can be used within expressions to perform common data preprocessing tasks, such as calculating lengths, checking for tags, retrieving metadata, searching for substrings, checking IP address ranges, and joining list elements. These functions include the following: - [`cidrContains()`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/cidrcontains/) - [`contains()`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/contains/) - [`getMetadata()`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/get-metadata/) +- [`getEventType()`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/get-eventtype/) - [`hasTags()`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/has-tags/) - [`join()`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/join/) -- [`length()`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/length/) \ No newline at end of file +- [`length()`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/length/) +- [`startsWith()`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/startswith/) diff --git a/_data-prepper/pipelines/get-eventtype.md b/_data-prepper/pipelines/get-eventtype.md new file mode 100644 index 00000000000..dfffa7e814d --- /dev/null +++ b/_data-prepper/pipelines/get-eventtype.md @@ -0,0 +1,20 @@ +--- +layout: default +title: getEventType() +parent: Functions +grand_parent: Pipelines +nav_order: 45 +--- + +# getEventType() + +The `getEventType()` function returns the internal event type of the current event. + +The return value is one of the event types defined in the `EventType.java` file. For example, if the event is an OpenTelemetry trace event, the returned event type is `TRACE`. + +Use this function to check the event type before performing conditional processing, as shown in the following example: + +```json +getEventType() == "TRACE" +``` +{% include copy.html %} diff --git a/_data-prepper/pipelines/get-metadata.md b/_data-prepper/pipelines/get-metadata.md index fc89ed51d6c..3f6e4297d4e 100644 --- a/_data-prepper/pipelines/get-metadata.md +++ b/_data-prepper/pipelines/get-metadata.md @@ -39,4 +39,4 @@ The value returned can be of any type. For example, if the metadata contains `{" ] } ``` -{% include copy-curl.html %} +{% include copy.html %} diff --git a/_data-prepper/pipelines/has-tags.md b/_data-prepper/pipelines/has-tags.md index d6cb498b11f..fe251d6db42 100644 --- a/_data-prepper/pipelines/has-tags.md +++ b/_data-prepper/pipelines/has-tags.md @@ -10,7 +10,7 @@ nav_order: 20 The `hasTags()` function takes one or more string type arguments and returns `true` if all of the arguments passed are present in an event's tags. If an argument does not exist in the event's tags, then the function returns `false`. -For example, if you use the expression `hasTags("tag1")` and the event contains `tag1`, then Data Prepper returns `true`. If you use the expression `hasTags("tag2")` but the event only contains `tag1`, then Data Prepper returns `false`. +For example, if you use the expression `hasTags("tag1")` and the event contains `tag1`, then OpenSearch Data Prepper returns `true`. If you use the expression `hasTags("tag2")` but the event only contains `tag1`, then Data Prepper returns `false`. 
#### Example @@ -42,4 +42,4 @@ For example, if you use the expression `hasTags("tag1")` and the event contains ] } ``` -{% include copy-curl.html %} +{% include copy.html %} diff --git a/_data-prepper/pipelines/length.md b/_data-prepper/pipelines/length.md index fca4b10df2a..dd42a231dce 100644 --- a/_data-prepper/pipelines/length.md +++ b/_data-prepper/pipelines/length.md @@ -21,4 +21,4 @@ The `length()` function takes one argument of the JSON pointer type and returns "expected_output": 10 } ``` -{% include copy-curl.html %} +{% include copy.html %} diff --git a/_data-prepper/pipelines/pipelines.md b/_data-prepper/pipelines/pipelines.md index d519f0da806..98e344ff602 100644 --- a/_data-prepper/pipelines/pipelines.md +++ b/_data-prepper/pipelines/pipelines.md @@ -10,7 +10,7 @@ redirect_from: # Pipelines -Pipelines are critical components that streamline the process of acquiring, transforming, and loading data from various sources into a centralized data repository or processing system. The following diagram illustrates how Data Prepper ingests data into OpenSearch. +Pipelines are critical components that streamline the process of acquiring, transforming, and loading data from various sources into a centralized data repository or processing system. The following diagram illustrates how OpenSearch Data Prepper ingests data into OpenSearch. <img src="{{site.url}}{{site.baseurl}}/images/data-prepper-pipeline.png" alt="Data Prepper pipeline">{: .img-fluid} @@ -36,7 +36,7 @@ simple-sample-pipeline: sink: - stdout: ``` -{% include copy-curl.html %} +{% include copy.html %} ### Pipeline components @@ -63,7 +63,7 @@ If a pipeline component fails to process and send an event, then the source rece ### Conditional routing -Pipelines also support conditional routing, which enables the routing of events to different sinks based on specific conditions. To add conditional routing, specify a list of named routes using the `route` component and assign specific routes to sinks using the `routes` property. Any sink with the `routes` property will only accept events matching at least one of the routing conditions. +Pipelines also support conditional routing, which enables the routing of events to different sinks based on specific conditions. To add conditional routing, specify a list of named routes using the `route` component and assign specific routes to sinks using the `routes` property. Any sink with the `routes` property only accepts events matching at least one of the routing conditions. In the following example pipeline, `application-logs` is a named route with a condition set to `/log_type == "application"`. The route uses [Data Prepper expressions](https://github.com/opensearch-project/data-prepper/tree/main/examples) to define the condition. Data Prepper routes events satisfying this condition to the first OpenSearch sink. 
By default, Data Prepper routes all events to sinks without a defined route, as shown in the third OpenSearch sink of the given pipeline: @@ -88,7 +88,7 @@ conditional-routing-sample-pipeline: hosts: [ "https://opensearch:9200" ] index: all_logs ``` -{% include copy-curl.html %} +{% include copy.html %} ## Next steps diff --git a/_data-prepper/pipelines/startswith.md b/_data-prepper/pipelines/startswith.md new file mode 100644 index 00000000000..41455aea7e0 --- /dev/null +++ b/_data-prepper/pipelines/startswith.md @@ -0,0 +1,37 @@ +--- +layout: default +title: startsWith() +parent: Functions +grand_parent: Pipelines +nav_order: 40 +--- + +# startsWith() + +The `startsWith()` function checks whether a string starts with the given string. It takes two arguments: + +- The first argument is either a literal string or a JSON pointer that represents the field or value to be checked. + +- The second argument is the string to be checked in the first argument. +The function returns `true` if the string or field value represented by the first argument starts with the string specified in the second argument and `false` otherwise. + +For example, to check whether the value of a field name `message` starts with a string `"abcd"`, use the `startsWith()` function as follows: + +``` +startsWith('/message', 'abcd') +``` +{% include copy.html %} + +This call returns `true` if the `message` field starts with the string `abcd` or `false` if it does not. + +Alternatively, you can use a literal string as the first argument: + +``` +startsWith('abcdef', 'abcd') +``` +{% include copy.html %} + +In this case, the function returns `true` because the string `abcdef` starts with `abcd`. + +The `startsWith()` function performs a case-sensitive check. +{: .note } diff --git a/_data/code_languages.yml b/_data/code_languages.yml new file mode 100644 index 00000000000..9c1b86dde07 --- /dev/null +++ b/_data/code_languages.yml @@ -0,0 +1,19 @@ +languages: + - id: rest + name: REST + - id: python + name: Python + - id: java + name: Java + - id: javascript + name: JavaScript + - id: go + name: Go + - id: ruby + name: Ruby + - id: php + name: PHP + - id: dotnet + name: .NET + - id: rust + name: Rust \ No newline at end of file diff --git a/_data/migration-assistant/breaking-changes.yml b/_data/migration-assistant/breaking-changes.yml new file mode 100644 index 00000000000..193d1f129b1 --- /dev/null +++ b/_data/migration-assistant/breaking-changes.yml @@ -0,0 +1,65 @@ +# Breaking changes data for migration paths +# +# Data structure: +# breaking_changes: Array of breaking change objects with: +# - title: Display name of the breaking change +# - url: Link to documentation +# - introducedIn: Version where the breaking change was introduced +# - affects (optional): Object with minSource and maxTarget versions +# - minSource: Minimum source version affected +# - maxTarget: Maximum target version affected +# - comp: Array of components affected (e.g., ["dashboards"]) +# - transformation (optional): Optional object with transformation information +# - title: Title of the transformation guide +# - url: Link to transformation guide +breaking_changes: + - title: "Amazon OpenSearch Service: Upgrade Guidance" + url: "https://docs.aws.amazon.com/opensearch-service/latest/developerguide/version-migration.html" + introducedIn: "OpenSearch 1.x" + comp: [] + - title: "Amazon OpenSearch Service: Rename - Summary of changes" + url: "https://docs.aws.amazon.com/opensearch-service/latest/developerguide/rename.html" + introducedIn: "OpenSearch 1.x" 
+ comp: [] + - title: "OpenSearch 2.x: Remove mapping types parameter" + url: "https://docs.opensearch.org/docs/latest/breaking-changes/#remove-mapping-types-parameter" + introducedIn: "OpenSearch 2.x" + comp: [] + transformation: + title: "Type Mapping Deprecation" + url: "https://docs.opensearch.org/docs/latest/migration-assistant/migration-phases/planning-your-migration/handling-type-mapping-deprecation/" + - title: "OpenSearch 3.x: Breaking Changes" + url: "/docs/latest/breaking-changes/#300" + introducedIn: "OpenSearch 3.x" + comp: [] + - title: "OpenSearch Notifications Plugins" + url: "https://docs.opensearch.org/docs/latest/breaking-changes/#add-opensearch-notifications-plugins" + introducedIn: "OpenSearch 2.x" + comp: [] + - title: "OpenSearch 2.x: Client JDK 8 Support Dropped" + url: "https://docs.opensearch.org/docs/latest/breaking-changes/#drop-support-for-jdk-8" + introducedIn: "OpenSearch 2.x" + comp: [] + - title: "Removal of Types in Elasticsearch 7.x" + url: "https://www.elastic.co/guide/en/elasticsearch/reference/7.10/removal-of-types.html" + introducedIn: "Elasticsearch 7.x" + comp: [] + transformation: + title: "Type Mapping Deprecation" + url: "https://docs.opensearch.org/docs/latest/migration-assistant/migration-phases/planning-your-migration/handling-type-mapping-deprecation/" + - title: "Elasticsearch 6.0 - 6.6 Breaking Changes" + url: "https://www.elastic.co/guide/en/elasticsearch/reference/6.8/breaking-changes.html" + introducedIn: "Elasticsearch 6.x" + comp: [] + - title: "Elasticsearch 7.0 - 7.10 Breaking Changes" + url: "https://www.elastic.co/guide/en/elasticsearch/reference/7.10/breaking-changes.html" + introducedIn: "Elasticsearch 7.x" + comp: [] + - title: "Kibana 6 Breaking Changes" + url: "https://www.elastic.co/guide/en/kibana/6.8/breaking-changes.html" + introducedIn: "Elasticsearch 6.x" + comp: ["dashboards"] + - title: "Kibana 7 Breaking Changes" + url: "https://www.elastic.co/guide/en/kibana/7.10/breaking-changes.html" + introducedIn: "Elasticsearch 7.x" + comp: ["dashboards"] diff --git a/_data/migration-assistant/valid_migrations.yml b/_data/migration-assistant/valid_migrations.yml new file mode 100644 index 00000000000..a2223b1dd2d --- /dev/null +++ b/_data/migration-assistant/valid_migrations.yml @@ -0,0 +1,29 @@ +# Migration paths for Migration Assistant +migration_paths: + - source: "Elasticsearch 5.x" + targets: + - "OpenSearch 1.x" + - "OpenSearch 2.x" + - "OpenSearch 3.x" + - source: "Elasticsearch 6.x" + targets: + - "OpenSearch 1.x" + - "OpenSearch 2.x" + - "OpenSearch 3.x" + - source: "Elasticsearch 7.x" + targets: + - "OpenSearch 1.x" + - "OpenSearch 2.x" + - "OpenSearch 3.x" + - source: "Elasticsearch 8.x" + targets: + - "OpenSearch 2.x" + - "OpenSearch 3.x" + - source: "OpenSearch 1.x" + targets: + - "OpenSearch 2.x" + - "OpenSearch 3.x" + - source: "OpenSearch 2.x" + targets: + - "OpenSearch 2.x" + - "OpenSearch 3.x" diff --git a/_data/top_nav.yml b/_data/top_nav.yml index 51d81386800..832e689d2ac 100644 --- a/_data/top_nav.yml +++ b/_data/top_nav.yml @@ -1,80 +1,58 @@ items: - - label: OpenSearchCon - children: - - label: 2024 Europe - url: /events/opensearchcon/2024/europe/index.html - - label: 2024 North America - url: /events/opensearchcon/2024/north-america/index.html - # children: - # - label: Speakers - # url: /events/opensearchcon/2024/north-america/speakers/index.html - # - label: Sessions - # url: /events/opensearchcon/2024/north-america/sessions/index.html - # - label: Exhibitors - # url: 
/events/opensearchcon/2024/north-america/exhibitors/index.html - # - label: Workshops - # url: /events/opensearchcon/2024/north-america/workshops/index.html - # - label: Unconference - # url: /events/opensearchcon/2024/north-america/unconference/index.html - - label: 2024 India - url: /events/opensearchcon/2024/india/index.html - - label: Archive - children: - - label: 2023 North America - url: /events/opensearchcon/2023/north-america/index.html - - label: 2022 North America - url: /events/opensearchcon/2022/north-america/index.html - - label: Download - url: /downloads.html - label: About - url: /about.html + url: https://opensearch.org/about.html children: - label: Releases - url: /releases.html + url: https://opensearch.org/releases.html - label: Roadmap url: https://github.com/orgs/opensearch-project/projects/1 - label: FAQ - url: /faq/ + url: https://opensearch.org/faq/ + - label: Platform + url: https://opensearch.org/platform/index.html + children: + - label: Search + url: https://opensearch.org/platform/search/index.html + - label: Observability + url: https://opensearch.org/platform/observability/index.html + - label: Security Analytics + url: https://opensearch.org/platform/security-analytics/index.html + - label: Vector Database + url: https://opensearch.org/platform/search/vector-database.html + - label: Playground Demo + url: https://playground.opensearch.org/ + - label: Performance Benchmarks + url: https://opensearch.org/benchmarks/ - label: Community + url: https://opensearch.org/community/ children: - - label: Blog - url: /blog/ - label: Forum url: https://forum.opensearch.org/ - label: Slack - url: /slack.html + url: https://opensearch.org/slack.html - label: Events - url: /events + url: https://opensearch.org/events/ class_name: events-page-menu-link__device-based - - label: Partners - url: /partners/ + - label: Solutions Providers + url: https://opensearch.org/solutions-providers/ - label: Projects - url: /community_projects/ + url: https://opensearch.org/community_projects/ - label: Members - url: /community/members/index.html + url: https://opensearch.org/authors-list/ - label: Documentation - url: /docs/latest/ + url: https://docs.opensearch.org/docs/latest/ children: - label: OpenSearch and Dashboards - url: /docs/latest/about/ + url: https://docs.opensearch.org/docs/latest/about/ - label: Data Prepper - url: /docs/latest/data-prepper/ + url: https://docs.opensearch.org/docs/latest/data-prepper/ - label: Clients - url: /docs/latest/clients/ + url: https://docs.opensearch.org/docs/latest/clients/ - label: Benchmark - url: /docs/latest/benchmark/ - - label: Platform - url: /platform/index.html - children: - - label: Search - url: /platform/search/index.html - - label: Observability - url: /platform/observability/index.html - - label: Security Analytics - url: /platform/security-analytics/index.html - - label: Vector Database - url: /platform/search/vector-database.html - - label: Playground Demo - url: https://playground.opensearch.org/ - - label: Performance Benchmarks - url: /benchmarks/ \ No newline at end of file + url: https://docs.opensearch.org/docs/latest/benchmark/ + - label: Migration Assistant + url: https://docs.opensearch.org/docs/latest/migration-assistant/ + - label: Blog + url: https://opensearch.org/blog/ + - label: Download + url: https://opensearch.org/downloads.html \ No newline at end of file diff --git a/_data/versions.json b/_data/versions.json index c14e91fa0cf..da4e31aae72 100644 --- a/_data/versions.json +++ b/_data/versions.json @@ -1,10 
+1,13 @@ { - "current": "2.17", + "current": "3.1", "all": [ - "2.17", - "1.3" + "3.1", + "2.19" ], "archived": [ + "3.0", + "2.18", + "2.17", "2.16", "2.15", "2.14", @@ -22,11 +25,12 @@ "2.2", "2.1", "2.0", + "1.3", "1.2", "1.1", "1.0" ], - "latest": "2.17" + "latest": "3.1" } diff --git a/_developer-documentation/index.md b/_developer-documentation/index.md index 46ea7dd8556..8d4a2f65f74 100644 --- a/_developer-documentation/index.md +++ b/_developer-documentation/index.md @@ -3,7 +3,6 @@ layout: default title: Developer documentation nav_order: 1 has_children: false -has_toc: false nav_exclude: true permalink: /developer-documentation/ redirect_from: @@ -22,6 +21,7 @@ We welcome your contributions to the OpenSearch Project. Here are some helpful l - [OpenSearch Project roadmap](https://github.com/orgs/opensearch-project/projects/1) - [OpenSearch Community Forum](https://forum.opensearch.org/) -## What's new +## In this section -New in version 2.9, OpenSearch introduces _extensions_---an easier-to-develop and more secure alternative to plugins---to simplify creating custom functionality for OpenSearch. To learn more about building extensions using _OpenSearch SDK for Java_, see [Extensions]({{site.url}}{{site.baseurl}}/developer-documentation/extensions/). +- [Plugin as a service]({{site.url}}{{site.baseurl}}/developer-documentation/plugin-as-a-service/): Enables stateless OpenSearch plugins using external data stores, such as a remote OpenSearch cluster or cloud storage services. +- [Extensions]({{site.url}}{{site.baseurl}}/developer-documentation/extensions/): An easier-to-develop and more secure alternative to plugins that simplifies creating custom functionality for OpenSearch. \ No newline at end of file diff --git a/_developer-documentation/plugin-as-a-service/index.md b/_developer-documentation/plugin-as-a-service/index.md new file mode 100644 index 00000000000..cbf0bef5f9e --- /dev/null +++ b/_developer-documentation/plugin-as-a-service/index.md @@ -0,0 +1,110 @@ +--- +layout: default +title: Plugin as a service +nav_order: 5 +has_children: false +has_toc: false +redirect_from: + - /developer-documentation/plugin-as-a-service/ +--- + +# Plugin as a service +Introduced 2.19 +{: .label .label-purple } + +To extend core features, OpenSearch uses plugins, which have several limitations: +- They operate in the same JVM as a cluster, sharing storage, memory, and state. +- They require strict version compatibility. +- They are restricted to a single tenant. + +To address these challenges, you can use a _remote metadata SDK client_, which enables stateless OpenSearch plugins using external data stores, such as a remote OpenSearch cluster or cloud storage services. Using the client improves scalability and makes plugins more adaptable for large workloads. For more information about the client, see [SDK Client Repository](https://github.com/opensearch-project/opensearch-remote-metadata-sdk). + +## Remote metadata storage + +Remote metadata storage allows OpenSearch plugins to operate in a stateless manner, without relying on local JVM or cluster resources, by using external storage solutions. Instead of storing metadata within the OpenSearch cluster, plugins can save it in remote locations such as other OpenSearch clusters or cloud storage services. This approach improves scalability, reduces resource contention, and enables plugins to function independently of the core OpenSearch cluster. 
+ +Remote metadata storage offers the following benefits: + +- **Scalability**: Offloading metadata storage to an external system reduces OpenSearch cluster memory and CPU usage. +- **Multi-tenancy support**: Tenant-based storage separation enables cloud providers to offer more flexible plugin solutions, logically separating resources using tenant IDs. + +### Supported storage backends + +Remote metadata storage can be configured to use the following external backends: + +- Remote OpenSearch clusters +- Amazon DynamoDB + +## Enabling multi-tenancy + +To enable multi-tenancy in a plugin, update the following static settings. After the update, restart the cluster in order for the changes to take effect. For more information about ways to update the settings, see [Configuring OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/). + +### Multi-tenancy setting + +The following table lists the multi-tenancy setting. + +| Setting | Data type | Description | +|:---|:---|:---| +| `multi_tenancy_enabled` | Boolean | Enables multi-tenancy for the plugin. | + +### Remote metadata storage settings + +The following table lists settings related to remote metadata storage configuration. + +| Setting | Data type | Description | +|:---|:---|:---| +| `remote_metadata_type` | String | The remote metadata storage type. Valid values are: <br> - `RemoteOpenSearch`: A remote OpenSearch cluster compatible with OpenSearch Java Client. <br> - `AWSDynamoDB` : Amazon DynamoDB with zero-ETL replication to OpenSearch. <br> - `AWSOpenSearchService`: Amazon OpenSearch Service using AWS SDK v2. | +| `remote_metadata_endpoint` | String | The remote metadata endpoint URL. | +| `remote_metadata_region` | String | The AWS region in which metadata is stored. | +| `remote_metadata_service_name` | String | The remote metadata service name. | + +## Example + +The following configuration enables multi-tenancy using a remote OpenSearch cluster: + +```yaml +plugins.<plugin_name>.multi_tenancy_enabled: true +plugins.<plugin_name>.remote_metadata_type: "opensearch" +plugins.<plugin_name>.remote_metadata_endpoint: "https://remote-store.example.com" +plugins.<plugin_name>.remote_metadata_region: "us-west-2" +plugins.<plugin_name>.remote_metadata_service_name: "remote-store-service" +``` +{% include copy.html %} + +## Supported plugins + +OpenSearch supports multi-tenancy for the following plugins. 
+ +### ML Commons + +The ML Commons plugin supports multi-tenancy for the following components: + +- [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/) +- [Model groups]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control/#model-groups) +- [Models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/integrating-ml-models/) (externally hosted only) +- [Agents]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/) +- [Tasks]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/tasks-apis/index/) + +The following example configures multi-tenancy for the ML Commons plugin: + +```yaml +plugins.ml_commons.multi_tenancy_enabled: true +plugins.ml_commons.remote_metadata_type: AWSDynamoDB +plugins.ml_commons.remote_metadata_endpoint: <REMOTE_ENDPOINT> +plugins.ml_commons.remote_metadata_region: <AWS_REGION> +plugins.ml_commons.remote_metadata_service_name: <SERVICE_NAME> +``` +{% include copy.html %} + +### Flow Framework + +The following example configures multi-tenancy for the Flow Framework plugin: + +```yaml +plugins.flow_framework.multi_tenancy_enabled: true +plugins.flow_framework.remote_metadata_type: AWSDynamoDB +plugins.flow_framework.remote_metadata_endpoint: <REMOTE_ENDPOINT> +plugins.flow_framework.remote_metadata_region: <AWS_REGION> +plugins.flow_framework.remote_metadata_service_name: <SERVICE_NAME> +``` +{% include copy.html %} \ No newline at end of file diff --git a/_field-types/index.md b/_field-types/index.md index e9250f409d7..7bac4e114e2 100644 --- a/_field-types/index.md +++ b/_field-types/index.md @@ -28,8 +28,8 @@ Type | Description :--- | :--- `null` | A `null` field can't be indexed or searched. When a field is set to null, OpenSearch behaves as if the field has no value. `boolean` | OpenSearch accepts `true` and `false` as Boolean values. An empty string is equal to `false.` -`float` | A single-precision, 32-bit floating-point number. -`double` | A double-precision, 64-bit floating-point number. +`float` | A single-precision, 32-bit IEEE 754 floating-point number, restricted to finite values. +`double` | A double-precision, 64-bit IEEE 754 floating-point number, restricted to finite values. `integer` | A signed 32-bit number. `object` | Objects are standard JSON objects, which can have fields and mappings of their own. For example, a `movies` object can have additional properties such as `title`, `year`, and `director`. `array` | OpenSearch does not have a specific array data type. Arrays are represented as a set of values of the same data type (for example, integers or strings) associated with a field. When indexing, you can pass multiple values for a field, and OpenSearch will treat it as an array. Empty arrays are valid and recognized as array fields with zero elements---not as fields with no values. OpenSearch supports querying and filtering arrays, including checking for values, range queries, and array operations like concatenation and intersection. Nested arrays, which may contain complex objects or other arrays, can also be used for advanced data modeling. 
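
Because arrays require no dedicated mapping, a document can simply supply multiple values for a field, and OpenSearch treats them as an array of the field's type. The following illustrative request (the index and field names are hypothetical) indexes a document with a multi-value `genres` field:

```json
PUT movies/_doc/1
{
  "title": "Example title",
  "genres": ["comedy", "drama", "romance"]
}
```
{% include copy-curl.html %}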
diff --git a/_field-types/mapping-parameters/doc-values.md b/_field-types/mapping-parameters/doc-values.md new file mode 100644 index 00000000000..c1cada03010 --- /dev/null +++ b/_field-types/mapping-parameters/doc-values.md @@ -0,0 +1,46 @@ +--- +layout: default +title: Doc values +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 25 +has_children: false +has_toc: false +--- + +# doc_values + +By default, OpenSearch indexes most fields for search purposes. The `doc_values ` parameter enables document-to-term lookups for operations such as sorting, aggregations, and scripting. + +The `doc_values` parameter accepts the following options. + +Option | Description +:--- | :--- +`true` | Enables `doc_values` for the field. Default is `true`. +`false` | Disables `doc_values` for the field. + +The `doc_values` parameter is not supported for use in text fields. + +--- + +## Example: Creating an index with `doc_values` enabled and disabled + +The following example request creates an index with `doc_values` enabled for one field and disabled for another: + +```json +PUT my-index-001 +{ + "mappings": { + "properties": { + "status_code": { + "type": "keyword" + }, + "session_id": { + "type": "keyword", + "doc_values": false + } + } + } +} +``` +{% include copy-curl.html %} diff --git a/_field-types/mapping-parameters/dynamic.md b/_field-types/mapping-parameters/dynamic.md index abb0a7cb6d5..2d48e98082e 100644 --- a/_field-types/mapping-parameters/dynamic.md +++ b/_field-types/mapping-parameters/dynamic.md @@ -3,7 +3,7 @@ layout: default title: Dynamic parent: Mapping parameters grand_parent: Mapping and field types -nav_order: 25 +nav_order: 30 has_children: false has_toc: false redirect_from: diff --git a/_field-types/mapping-parameters/eager_global_ordinals.md b/_field-types/mapping-parameters/eager_global_ordinals.md new file mode 100644 index 00000000000..9a8e0177f47 --- /dev/null +++ b/_field-types/mapping-parameters/eager_global_ordinals.md @@ -0,0 +1,74 @@ +--- +layout: default +title: Eager global ordinals +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 35 +has_children: false +has_toc: false +--- + +# Eager global ordinals + +The `eager_global_ordinals` mapping parameter controls when global ordinals are built for a field. When enabled, global ordinals are computed during index refresh rather than "lazily" during query execution. This can improve performance for operations that rely on global ordinals, for example, sorting and aggregations on keyword fields. However, it may also increase index refresh times and memory usage. + +Global ordinals represent a mapping from term values to integer identifiers and are used internally to quickly execute aggregations and sort operations. By loading them "eagerly," the system reduces query latency at the cost of additional upfront processing during indexing. + +By default, `eager_global_ordinals` are disabled, ensuring that the cluster is optimized for indexing speed. + +Global ordinals are stored in the field data cache and consume heap memory. Fields with high cardinality can consume a large amount of heap memory. To prevent memory-related issues, it is important to carefully configure the [field data circuit breaker settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/circuit-breaker/#field-data-circuit-breaker-settings). 
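
For example, the field data circuit breaker limit can be lowered to guard against oversized global ordinals. The value shown in the following sketch is illustrative, and the setting name and default should be confirmed in the circuit breaker documentation linked above:

```yaml
# Trip the field data circuit breaker when field data (including global ordinals)
# would exceed 30% of the JVM heap. Assumed setting name; the default limit is typically 40%.
indices.breaker.fielddata.limit: 30%
```
{% include copy.html %}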
+ +## When global ordinals are used + +Global ordinals are used if a search includes any of the following: + +- Bucket aggregations on `keyword`, `ip`, and `flattened` fields. This includes `terms`, `composite`, `diversified_sampler`, and `significant_terms` aggregations. +- Aggregations on `text` fields that require `fielddata` to be enabled. +- Parent/child queries using a [`join`]({{site.url}}{{site.baseurl}}/ingest-pipelines/processors/join/) field, such as [`has_child`]({{site.url}}{{site.baseurl}}/query-dsl/joining/has-child/) queries or `parent` aggregations. + + +## Enabling eager global ordinals on a field + +The following request creates an index named `products` with `eager_global_ordinals` enabled: + +```json +PUT /products +{ + "mappings": { + "properties": { + "size": { + "type": "keyword", + "eager_global_ordinals": true + } + } + } +} +``` +{% include copy-curl.html %} + +The following request indexes a document: + +```json +PUT /products/_doc/1 +{ + "size": "ABC123" +} +``` +{% include copy-curl.html %} + +The following request runs a `terms` aggregation: + +```json +POST /products/_search +{ + "size": 0, + "aggs": { + "size_agg": { + "terms": { + "field": "size" + } + } + } +} +``` +{% include copy-curl.html %} diff --git a/_field-types/mapping-parameters/enabled.md b/_field-types/mapping-parameters/enabled.md new file mode 100644 index 00000000000..ec1ec08c778 --- /dev/null +++ b/_field-types/mapping-parameters/enabled.md @@ -0,0 +1,47 @@ +--- +layout: default +title: Enabled +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 40 +has_children: false +has_toc: false +--- + +# Enabled + +The `enabled` parameter allows you to control whether OpenSearch parses the contents of a field. This parameter can be applied to the top-level mapping definition and to object fields. + +The `enabled` parameter accepts the following values. + +Parameter | Description +:--- | :--- +`true` | The field is parsed and indexed. Default is `true`. +`false` | The field is not parsed or indexed but is still retrievable from the `_source` field. When `enabled` is set to `false`, OpenSearch stores the field's value in the `_source` field but does not index or parse its contents. This can be useful for fields that you want to store but do not need to search, sort, or aggregate on. + +--- + +## Example: Using the `enabled` parameter + +In the following example request, the `session_data` field is disabled. OpenSearch stores its contents in the `_source` field but does not index or parse them: + +```json +PUT my-index-002 +{ + "mappings": { + "properties": { + "user_id": { + "type": "keyword" + }, + "last_updated": { + "type": "date" + }, + "session_data": { + "type": "object", + "enabled": false + } + } + } +} +``` +{% include copy-curl.html %} diff --git a/_field-types/mapping-parameters/fields.md b/_field-types/mapping-parameters/fields.md new file mode 100644 index 00000000000..977e2f7246f --- /dev/null +++ b/_field-types/mapping-parameters/fields.md @@ -0,0 +1,186 @@ +--- +layout: default +title: Fields +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 100 +has_children: false +has_toc: false +--- + +# Fields + +The `fields` mapping parameter enables you to index the same field in multiple ways by defining additional subfields. With multi-fields, the primary field value is stored using its main mapping. 
Additionally, you can configure one or more subfields with alternate mappings, for example, different data types or analyzers that support varied search and aggregation requirements. + +Multi-fields are especially useful when you need to perform full-text searches on one representation of the data and exact-match operations (like sorting or aggregations) on another. Additionally, you can index the same field with different analyzers. For example, one subfield might use the default analyzer for general text searches, while another subfield uses a custom analyzer for generating n-grams to support autocomplete or fuzzy matching. + +## Configuring multi-fields + +In the following example, an index named `articles` is created with a `title` field that is analyzed as full text. A subfield named `raw` is defined under `fields` to store the same value as a `keyword` for exact-match queries: + +```json +PUT /articles +{ + "mappings": { + "properties": { + "title": { + "type": "text", + "fields": { + "raw": { + "type": "keyword" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Using different analyzers + +In the following example, the same `title` field is indexed using two different analyzers. The main field uses the default analyzer for full-text search, while the `ngrams` subfield uses a custom n-gram analyzer to support features like autocomplete: + +```json +PUT /articles +{ + "settings": { + "analysis": { + "analyzer": { + "ngram_analyzer": { + "tokenizer": "ngram_tokenizer", + "filter": [ + "lowercase" + ] + } + }, + "tokenizer": { + "ngram_tokenizer": { + "type": "ngram", + "min_gram": 3, + "max_gram": 4, + "token_chars": [ + "letter", + "digit" + ] + } + } + } + }, + "mappings": { + "properties": { + "title": { + "type": "text", + "fields": { + "raw": { + "type": "keyword" + }, + "ngrams": { + "type": "text", + "analyzer": "ngram_analyzer" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Indexing a document + +After the index is created, you can index documents into it. The `title` field will be processed as defined by its mapping, and its subfields will provide alternate representations of the same value: + +```json +PUT /articles/_doc/1 +{ + "title": "Understanding Multi-Fields in Search" +} +``` +{% include copy-curl.html %} + +## Querying multi-fields + +You can target the additional subfields in queries to suit different requirements. For example, to perform an aggregation on the exact value of the title, query the `title.raw` subfield using the following request: + +```json +POST /articles/_search +{ + "size": 0, + "aggs": { + "titles": { + "terms": { + "field": "title.raw" + } + } + } +} +``` +{% include copy-curl.html %} + +The `title.raw` subfield, mapped as a `keyword`, allows exact-match aggregations even though the original title field is full-text analyzed: + +```json +{ + ... 
+ "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "titles": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets": [ + { + "key": "Understanding Multi-Fields in Search", + "doc_count": 1 + } + ] + } + } +} +``` + +Alternatively, to use the autocomplete functionality, you can run a `match` query on the `title.ngrams` subfield: + +```json +POST /articles/_search +{ + "query": { + "match": { + "title.ngrams": "Und" + } + } +} +``` +{% include copy-curl.html %} + +The `title.ngrams` subfield uses a custom n-gram analyzer, therefore the prefix "Und" successfully matches the start of the word "Understanding": + +```json +{ + ... + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.2876821, + "hits": [ + { + "_index": "articles", + "_id": "1", + "_score": 0.2876821, + "_source": { + "title": "Understanding Multi-Fields in Search" + } + } + ] + } +} +``` diff --git a/_field-types/mapping-parameters/format.md b/_field-types/mapping-parameters/format.md new file mode 100644 index 00000000000..0b2be9f457d --- /dev/null +++ b/_field-types/mapping-parameters/format.md @@ -0,0 +1,88 @@ +--- +layout: default +title: Format +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 50 +has_children: false +has_toc: false +--- + +# Format + +The `format` mapping parameter specifies the [built-in date formats]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/date/#built-in-formats) that a date field can accept during indexing. By defining the expected date formats, you ensure that date values are correctly parsed and stored, facilitating accurate search and aggregation operations. + +## Example: Defining a custom date format + +Create an `events` index with the `event_date` field configured to a custom `yyyy-MM-dd HH:mm:ss` date format: + +```json +PUT events +{ + "mappings": { + "properties": { + "event_date": { + "type": "date", + "format": "yyyy-MM-dd HH:mm:ss" + } + } + } +} +``` +{% include copy-curl.html %} + +Index a document using the specified format for the `event_date` field: + +```json +PUT events/_doc/1 +{ + "event_name": "Conference", + "event_date": "2025-03-26 15:30:00" +} +``` +{% include copy-curl.html %} + +## Example: Using multiple date formats + +Create an index containing a `log_timestamp` field, which accepts both the custom `yyyy-MM-dd HH:mm:ss` date format and the `epoch_millis` format: + +```json +PUT logs +{ + "mappings": { + "properties": { + "log_timestamp": { + "type": "date", + "format": "yyyy-MM-dd HH:mm:ss||epoch_millis" + } + } + } +} +``` +{% include copy-curl.html %} + +Index the first document using the custom format: + +```json +PUT logs/_doc/1 +{ + "message": "System rebooted", + "log_timestamp": "2025-03-26 08:45:00" +} +``` +{% include copy-curl.html %} + +Index the second document using the millisecond format: + +```json +PUT logs/_doc/2 +{ + "message": "System updated", + "log_timestamp": 1711442700000 +} +``` +{% include copy-curl.html %} + +## Built-in date formats + +For a comprehensive list of built-in date formats, see [Built-in formats]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/date/#built-in-formats). 
\ No newline at end of file diff --git a/_field-types/mapping-parameters/ignore-above.md b/_field-types/mapping-parameters/ignore-above.md new file mode 100644 index 00000000000..fcf4bbaaff9 --- /dev/null +++ b/_field-types/mapping-parameters/ignore-above.md @@ -0,0 +1,180 @@ +--- +layout: default +title: Ignore above +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 45 +has_children: false +has_toc: false +--- + +# Ignore above + +The `ignore_above` mapping parameter limits the maximum number of characters for an indexed string. If a string's length exceeds the specified threshold, the value is stored with the document but is not indexed. This can help prevent the index from bloating with unusually long values and can ensure efficient queries. + +By default, if you do not specify `ignore_above`, all string values will be fully indexed. + +## Example: Without ignore_above + +Create an index with a `keyword` field without specifying the `ignore_above` parameter: + +```json +PUT /test-no-ignore +{ + "mappings": { + "properties": { + "sentence": { + "type": "keyword" + } + } + } +} +``` +{% include copy-curl.html %} + +Index a document with a long string value: + +```json +PUT /test-no-ignore/_doc/1 +{ + "sentence": "text longer than 10 characters" +} +``` +{% include copy-curl.html %} + +Run a term query for the full string: + +```json +POST /test-no-ignore/_search +{ + "query": { + "term": { + "sentence": "text longer than 10 characters" + } + } +} +``` +{% include copy-curl.html %} + +The document is returned because the `sentence` field was indexed: + +```json +{ + ... + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.13353139, + "hits": [ + { + "_index": "test-no-ignore", + "_id": "1", + "_score": 0.13353139, + "_source": { + "sentence": "text longer than 10 characters" + } + } + ] + } +} +``` + +## Example: With ignore_above + +Create an index with the `ignore_above` parameter set to `10` on the same field: + +```json +PUT /test-ignore +{ + "mappings": { + "properties": { + "sentence": { + "type": "keyword", + "ignore_above": 10 + } + } + } +} +``` +{% include copy-curl.html %} + +Index the same document with the long string value: + +```json +PUT /test-ignore/_doc/1 +{ + "sentence": "text longer than 10 characters" +} +``` +{% include copy-curl.html %} + +Run a term query for the full string: + +```json +POST /test-ignore/_search +{ + "query": { + "term": { + "sentence": "text longer than 10 characters" + } + } +} +``` +{% include copy-curl.html %} + +No results are returned because the string in the `sentence` field exceeded the `ignore_above` threshold and was not indexed: + +```json +{ + "took": 3, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 0, + "relation": "eq" + }, + "max_score": null, + "hits": [] + } +} +``` + +However, the document is still present, which can be confirmed using the following request: + +```json +GET test-ignore/_search +``` +{% include copy-curl.html %} + +The returned hits include the document: + +```json +{ + ... 
+ "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "test-ignore", + "_id": "1", + "_score": 1, + "_source": { + "sentence": "text longer than 10 characters" + } + } + ] + } +} +``` diff --git a/_field-types/mapping-parameters/ignore-malformed.md b/_field-types/mapping-parameters/ignore-malformed.md new file mode 100644 index 00000000000..d6c7c96237e --- /dev/null +++ b/_field-types/mapping-parameters/ignore-malformed.md @@ -0,0 +1,119 @@ +--- +layout: default +title: Ignore malformed +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 45 +has_children: false +has_toc: false +--- + +# Ignore malformed + +The `ignore_malformed` mapping parameter instructs the indexing engine to ignore values that do not match the field's expected format. When enabled, malformed values are not indexed, preventing entire-document rejection because of data format issues. This ensures that documents are still stored even if one or more fields contain data that cannot be parsed. + +By default, `ignore_malformed` is disabled, which means that if a value cannot be parsed according to the field type, indexing will fail for the entire document. + +## Example: ignore_malformed off + +Create an index named `people_no_ignore` containing an `age` field of type `integer`. By default, `ignore_malformed` is set to `false`: + +```json +PUT /people_no_ignore +{ + "mappings": { + "properties": { + "age": { + "type": "integer" + } + } + } +} +``` +{% include copy-curl.html %} + +Index a document with a malformed value: + +```json +PUT /people_no_ignore/_doc/1 +{ + "age": "twenty" +} +``` +{% include copy-curl.html %} + +The request fails because of the malformed value: + +```json +{ + "error": { + "root_cause": [ + { + "type": "mapper_parsing_exception", + "reason": "failed to parse field [age] of type [integer] in document with id '1'. Preview of field's value: 'twenty'" + } + ], + "type": "mapper_parsing_exception", + "reason": "failed to parse field [age] of type [integer] in document with id '1'. 
Preview of field's value: 'twenty'", + "caused_by": { + "type": "number_format_exception", + "reason": "For input string: \"twenty\"" + } + }, + "status": 400 +} +``` + +## Example: ignore_malformed on + +Create an index named `people_ignore` in which the `age` field has `ignore_malformed` set to `true`: + +```json +PUT /people_ignore +{ + "mappings": { + "properties": { + "age": { + "type": "integer", + "ignore_malformed": true + } + } + } +} +``` +{% include copy-curl.html %} + +Index a document with a malformed value: + +```json +PUT /people_ignore/_doc/1 +{ + "age": "twenty" +} +``` +{% include copy-curl.html %} + +Retrieve the document: + +```json +GET /people_ignore/_doc/1 +``` +{% include copy-curl.html %} + +The response shows that the document was indexed successfully, despite having a malformed value: + +```json +{ + "_index": "people_ignore", + "_id": "1", + "_version": 1, + "_seq_no": 0, + "_primary_term": 1, + "found": true, + "_source": { + "age": "twenty" + } +} +``` + + diff --git a/_field-types/mapping-parameters/index-options.md b/_field-types/mapping-parameters/index-options.md new file mode 100644 index 00000000000..92ce6b5fb0d --- /dev/null +++ b/_field-types/mapping-parameters/index-options.md @@ -0,0 +1,92 @@ +--- +layout: default +title: Index options +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 70 +has_children: false +has_toc: false +--- + +# Index options + +The `index_options` mapping parameter controls the level of detail stored in the inverted index for text fields. This setting directly influences both the index size and the capabilities available for scoring, phrase matching, and highlighting. + +The `index_options` parameter has the following valid values. + +| Value | Stores | Description | +|------------|----------------------------------|-------------| +| `docs` | Document IDs only | Indexes only the existence of a term in a set of documents. Does not store frequency or position. Minimizes index size; suitable for simple existence checks. | +| `freqs` | Document IDs + term frequency | Adds term frequency information. Useful for improved relevance scoring but does not support phrase or proximity queries. | +| `positions`| Document IDs + term frequency + term positions | Includes term order and location in the document. Required for phrase queries and proximity searches. | +| `offsets` | Document IDs + term frequency + term positions + offsets | Most detailed. Adds character offsets for matched terms. Useful for highlighting but increases storage size. | + +By default, text fields are indexed with the `positions` option, balancing functionality and index size. + +## Example: Setting index_options on a field + +Create an index named `products` with a `description` field that uses the `positions` setting for `index_options`: + +```json +PUT /products +{ + "mappings": { + "properties": { + "description": { + "type": "text", + "index_options": "positions" + } + } + } +} +``` +{% include copy-curl.html %} + +Index a document with content in the `description` field: + +```json +PUT /products/_doc/1 +{ + "description": "This is a sample product description with several terms." 
+} +``` +{% include copy-curl.html %} + +Run a phrase query against the `description` field: + +```json +POST /products/_search +{ + "query": { + "match_phrase": { + "description": "product description" + } + } +} +``` +{% include copy-curl.html %} + +The phrase query successfully matches the document, demonstrating how the `positions` setting in `index_options` enables accurate phrase matching within the `description` field: + +```json +{ + ... + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.5753642, + "hits": [ + { + "_index": "products", + "_id": "1", + "_score": 0.5753642, + "_source": { + "description": "This is a sample product description with several terms." + } + } + ] + } +} +``` diff --git a/_field-types/mapping-parameters/index-parameter.md b/_field-types/mapping-parameters/index-parameter.md new file mode 100644 index 00000000000..04f9ce3c26e --- /dev/null +++ b/_field-types/mapping-parameters/index-parameter.md @@ -0,0 +1,175 @@ +--- +layout: default +title: Index +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 60 +has_children: false +has_toc: false +--- + +# Index + +The `index` mapping parameter controls whether a field is searchable by including it in the inverted index. When set to `true`, the field is indexed and available for queries. When set to `false`, the field is stored in the document but not indexed, making it non-searchable. If you do not need to search a particular field, disabling indexing for that field can reduce index size and improve indexing performance. For example, you can disable indexing on large text fields or metadata that is only used for display. + +By default, all field types are indexed. + +## Supported data types + +The `index` mapping parameter can be applied to the following data types: + +- [Text]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/text/) +- [Keyword]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/keyword/) +- [Boolean]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/boolean/) +- [IP address]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/ip/) +- [Date field types]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/dates/) +- [Numeric field types]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/numeric/) + +## Enabling indexing on a field + +The following request creates an index named `products` with a `description` field that is indexed (the default behavior): + +```json +PUT /products +{ + "mappings": { + "properties": { + "description": { + "type": "text" + } + } + } +} +``` +{% include copy-curl.html %} + +Index a document using the following request: + +```json +PUT /products/_doc/1 +{ + "description": "This product has a searchable description." +} +``` +{% include copy-curl.html %} + +Query the description field: + +```json +POST /products/_search +{ + "query": { + "match": { + "description": "searchable" + } + } +} +``` +{% include copy-curl.html %} + +The following response confirms that the indexed document was successfully matched by the query: + +```json +{ + ... + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.2876821, + "hits": [ + { + "_index": "products", + "_id": "1", + "_score": 0.2876821, + "_source": { + "description": "This product has a searchable description." 
+ } + } + ] + } +} +``` + +## Disabling indexing on a field + +Create an index named `products-no-index` with a `description` field that is not indexed: + +```json +PUT /products-no-index +{ + "mappings": { + "properties": { + "description": { + "type": "text", + "index": false + } + } + } +} +``` +{% include copy-curl.html %} + +Index a document using the following request: + +```json +PUT /products-no-index/_doc/1 +{ + "description": "This product has a non-searchable description." +} +``` +{% include copy-curl.html %} + +Query `products-no-index` using the `description` field: + +```json +POST /products-no-index/_search +{ + "query": { + "match": { + "description": "non-searchable" + } + } +} +``` +{% include copy-curl.html %} + +The following error response indicates that the search query failed because the description field is not indexed: + +```json +{ + "error": { + "root_cause": [ + { + "type": "query_shard_exception", + "reason": "failed to create query: Cannot search on field [description] since it is not indexed.", + "index": "products-no-index", + "index_uuid": "yX2F4En1RqOBbf3YWihGCQ" + } + ], + "type": "search_phase_execution_exception", + "reason": "all shards failed", + "phase": "query", + "grouped": true, + "failed_shards": [ + { + "shard": 0, + "index": "products-no-index", + "node": "0tmy2tf7TKW8qCmya9sG2g", + "reason": { + "type": "query_shard_exception", + "reason": "failed to create query: Cannot search on field [description] since it is not indexed.", + "index": "products-no-index", + "index_uuid": "yX2F4En1RqOBbf3YWihGCQ", + "caused_by": { + "type": "illegal_argument_exception", + "reason": "Cannot search on field [description] since it is not indexed." + } + } + } + ] + }, + "status": 400 +} +``` diff --git a/_field-types/mapping-parameters/index-phrases.md b/_field-types/mapping-parameters/index-phrases.md new file mode 100644 index 00000000000..b52176e6ed4 --- /dev/null +++ b/_field-types/mapping-parameters/index-phrases.md @@ -0,0 +1,96 @@ +--- +layout: default +title: Index phrases +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 80 +has_children: false +has_toc: false +--- + +# Index phrases + +The `index_phrases` mapping parameter determines whether a field's text is additionally processed to generate phrase tokens. When enabled, the system creates extra tokens representing sequences of exactly two consecutive words (_bigrams_). This can significantly improve the performance and accuracy of phrase queries. However, it also increases the index size and the time needed to index documents. + +By default, `index_phrases` is set to `false` to maintain a leaner index and faster document ingestion. 
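+
+To get a rough sense of what these bigram tokens look like, you can approximate them with the `_analyze` API and a `shingle` token filter, as shown in the following example. This is an illustration only: `index_phrases` manages its own internal subfield, and you do not need to configure a shingle filter yourself.
+
+```json
+POST /_analyze
+{
+  "tokenizer": "standard",
+  "filter": [
+    "lowercase",
+    {
+      "type": "shingle",
+      "min_shingle_size": 2,
+      "max_shingle_size": 2,
+      "output_unigrams": false
+    }
+  ],
+  "text": "The slow green turtle"
+}
+```
+{% include copy-curl.html %}
+
+This request returns two-word tokens such as `the slow`, `slow green`, and `green turtle`, which is conceptually the kind of token that `index_phrases` stores for the field.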
+ +## Enabling index phrases on a field + +The following example creates an index named `blog` in which the `content` field is configured with `index_phrases`: + +```json +PUT /blog +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "index_phrases": true + } + } + } +} +``` +{% include copy-curl.html %} + +Index a document using the following request: + +```json +PUT /blog/_doc/1 +{ + "content": "The slow green turtle swims past the whale" +} +``` +{% include copy-curl.html %} + +Perform a `match_phrase` query using the following search request: + +```json +POST /blog/_search +{ + "query": { + "match_phrase": { + "content": "slow green" + } + } +} +``` +{% include copy-curl.html %} + +The query returns the stored document: + +```json +{ + "took": 25, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.5753642, + "hits": [ + { + "_index": "blog", + "_id": "1", + "_score": 0.5753642, + "_source": { + "content": "The slow green turtle swims past the whale" + } + } + ] + } +} +``` + +Although the same hit is returned when you don't provide the `index_phrases` mapping parameter, using this parameter ensures that the query performs as follows: + +- Uses the `.index_phrases` field internally +- Matches pre-tokenized bigrams such as "slow green", "green turtle", or "turtle swims". +- Bypasses position lookups and is faster, especially at scale. \ No newline at end of file diff --git a/_field-types/mapping-parameters/index-prefixes.md b/_field-types/mapping-parameters/index-prefixes.md new file mode 100644 index 00000000000..799b2afeb69 --- /dev/null +++ b/_field-types/mapping-parameters/index-prefixes.md @@ -0,0 +1,112 @@ +--- +layout: default +title: Index prefixes +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 90 +has_children: false +has_toc: false +--- + +# Index prefixes + +The `index_prefixes` mapping parameter instructs the engine to generate additional index entries for the beginning segments of terms in a text field. When enabled, it builds a prefix index based on configurable minimum and maximum character lengths. This can significantly improve the performance of [prefix queries]({{site.url}}{{site.baseurl}}/query-dsl/term/prefix/), such as [autocomplete]({{site.url}}{{site.baseurl}}/opensearch/search/autocomplete/) or [search as you type]({{site.url}}{{site.baseurl}}/opensearch/search/autocomplete/#search-as-you-type), by allowing these queries to quickly match the pre-indexed term prefixes. + +By default, prefix indexing is not performed, maintaining minimal index size and fast indexing operations. However, if your application benefits from rapid prefix matching, enabling this parameter can provide a marked improvement in query efficiency. + +## Index prefixes configuration + +You can pass the following configuration parameters to the `index_prefixes` mapping parameter: + +- `min_chars`: The minimum length of the prefix that needs to be indexed. Minimum is `0`. Default is `2`. +- `max_chars`: The maximum length of the prefix that needs to be indexed. Maximum is `20`. Default is `5`. 
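+
+To see approximately which prefix terms the default settings produce, you can use the `_analyze` API with an `edge_ngram` token filter, as shown in the following example. This is an illustration only: `index_prefixes` builds its prefix index internally, and you do not need to configure an `edge_ngram` filter yourself.
+
+```json
+POST /_analyze
+{
+  "tokenizer": "standard",
+  "filter": [
+    "lowercase",
+    {
+      "type": "edge_ngram",
+      "min_gram": 2,
+      "max_gram": 5
+    }
+  ],
+  "text": "Television"
+}
+```
+{% include copy-curl.html %}
+
+For the term `television`, this request returns the prefixes `te`, `tel`, `tele`, and `telev`, which correspond to the default `min_chars` of `2` and `max_chars` of `5`.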
+ +## Enabling index prefixes on a field + +The following request creates an index named `products` with the `name` field configured to build a prefix index with a length of between `2` and `10` characters: + +```json +PUT /products +{ + "mappings": { + "properties": { + "name": { + "type": "text", + "index_prefixes": { + "min_chars": 2, + "max_chars": 10 + } + } + } + } +} +``` +{% include copy-curl.html %} + +Index a document using the following request: + +```json +PUT /products/_doc/1 +{ + "name": "Ultra HD Television" +} +``` +{% include copy-curl.html %} + +The following search request shows a prefix query that searches for documents in which the `name` field starts with `ul`: + +```json +POST /products/_search +{ + "query": { + "prefix": { + "name": "ul" + } + } +} +``` +{% include copy-curl.html %} + +The response contains the matching document: + +```json +{ + ... + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "products", + "_id": "1", + "_score": 1, + "_source": { + "name": "Ultra HD Television" + } + } + ] + } +} +``` + +## Using default parameters with index prefixes + +The following request creates an index named `products_default` using `index_prefixes` with the default parameters: + +```json +PUT /products_default +{ + "mappings": { + "properties": { + "name": { + "type": "text", + "index_prefixes": {} + } + } + } +} +``` +{% include copy-curl.html %} diff --git a/_field-types/mapping-parameters/index.md b/_field-types/mapping-parameters/index.md index ca5586bb8f2..44015a1e114 100644 --- a/_field-types/mapping-parameters/index.md +++ b/_field-types/mapping-parameters/index.md @@ -14,15 +14,17 @@ The following table lists OpenSearch mapping parameters. Parameter | Description :--- | :--- -`analyzer` | Specifies the analyzer used to analyze string fields. Default is the `standard` analyzer, which is a general-purpose analyzer that splits text on white space and punctuation, converts to lowercase, and removes stop words. Allowed values are `standard`, `simple`, and `whitespace`. -`boost` | Specifies a field-level boost factor applied at query time. Allows you to increase or decrease the relevance score of a specific field during search queries. Default boost value is `1.0`, which means no boost is applied. Allowed values are any positive floating-point number. -`coerce` | Controls how values are converted to the expected field data type during indexing. Default value is `true`, which means that OpenSearch tries to coerce the value to the expected value type. Allowed values are `true` or `false`. -`copy_to` | Copies the value of a field to another field. There is no default value for this parameter. Optional. -`doc_values` | Specifies whether a field should be stored on disk to make sorting and aggregation faster. Default value is `true`, which means that the doc values are enabled. Allowed values are a single field name or a list of field names. -`dynamic` | Determines whether new fields should be added dynamically. Default value is `true`, which means that new fields can be added dynamically. Allowed values are `true`, `false`, or `strict`. -`enabled` | Specifies whether the field is enabled or disabled. Default value is `true`, which means that the field is enabled. Allowed values are `true` or `false`. -`format` | Specifies the date format for date fields. There is no default value for this parameter. Allowed values are any valid date format string, such as `yyyy-MM-dd` or `epoch_millis`. 
-`ignore_above` | Skips indexing values that exceed the specified length. Default value is `2147483647`, which means that there is no limit on the field value length. Allowed values are any positive integer. -`ignore_malformed` | Specifies whether malformed values should be ignored. Default value is `false`, which means that malformed values are not ignored. Allowed values are `true` or `false`. -`index` | Specifies whether a field should be indexed. Default value is `true`, which means that the field is indexed. Allowed values are `true`, `false`, or `not_analyzed`. -`index_options` | Specifies what information should be stored in an index for scoring purposes. Default value is `docs`, which means that only the document numbers are stored in the index. Allowed values are `docs`, `freqs`, `positions`, or `offsets`. \ No newline at end of file +[`analyzer`]({{site.url}}{{site.baseurl}}/field-types/mapping-parameters/analyzer/) | Specifies the analyzer used to analyze string fields. Default is the `standard` analyzer, which is a general-purpose analyzer that splits text on white space and punctuation, converts to lowercase, and removes stop words. Allowed values are `standard`, `simple`, or `whitespace`. +[`boost`]({{site.url}}{{site.baseurl}}/field-types/mapping-parameters/boost/) | Specifies a field-level boost factor applied at query time. Allows you to increase or decrease the relevance score of a specific field during search queries. Default boost value is `1.0`, which means that no boost is applied. Allowed values are any positive floating-point number. +[`coerce`]({{site.url}}{{site.baseurl}}/field-types/mapping-parameters/coerce/) | Controls how values are converted to the expected field data type during indexing. Default value is `true`, which means that OpenSearch tries to coerce the value to the expected value type. Allowed values are `true` or `false`. +[`copy_to`]({{site.url}}{{site.baseurl}}/field-types/mapping-parameters/copy-to/) | Copies the value of a field to another field. There is no default value for this parameter. Optional. +[`doc_values`]({{site.url}}{{site.baseurl}}/field-types/mapping-parameters/doc-values/) | Specifies whether a field should be stored on disk to make sorting and aggregation faster. Default value is `true`, which means that the doc values are enabled. Allowed values are a single field name or a list of field names. +[`dynamic`]({{site.url}}{{site.baseurl}}/field-types/mapping-parameters/dynamic/) | Determines whether new fields should be added dynamically. Default value is `true`, which means that new fields can be added dynamically. Allowed values are `true`, `false`, or `strict`. +[`enabled`]({{site.url}}{{site.baseurl}}/field-types/mapping-parameters/enabled/) | Specifies whether the field is enabled or disabled. Default value is `true`, which means that the field is enabled. Allowed values are `true` or `false`. +[`format`]({{site.url}}{{site.baseurl}}/field-types/mapping-parameters/format/) | Specifies the date format for date fields. There is no default value for this parameter. Allowed values are any valid date format string, such as `yyyy-MM-dd` or `epoch_millis`. +[`ignore_above`]({{site.url}}{{site.baseurl}}/field-types/mapping-parameters/ignore-above/) | Skips indexing values that exceed the specified length. Default value is `2147483647`, which means that there is no limit on the field value length. Allowed values are any positive integer. 
+[`ignore_malformed`]({{site.url}}{{site.baseurl}}/field-types/mapping-parameters/ignore-malformed/) | Specifies whether malformed values should be ignored. Default value is `false`, which means that malformed values are not ignored. Allowed values are `true` or `false`. +[`index`]({{site.url}}{{site.baseurl}}/field-types/mapping-parameters/index-parameter/) | Specifies whether a field should be indexed. Default value is `true`, which means that the field is indexed. Allowed values are `true` or `false`. +[`index_phrases`]({{site.url}}{{site.baseurl}}/field-types/mapping-parameters/index-phrases/) | Determines whether extra phrase tokens are generated for text fields to improve the performance of phrase queries. Default is `false`. +[`index_options`]({{site.url}}{{site.baseurl}}/field-types/mapping-parameters/index-options/) | Specifies what information should be stored in an index for scoring purposes. Default value is `docs`, which means that only the document numbers are stored in the index. Allowed values are `docs`, `freqs`, `positions`, or `offsets`. + diff --git a/_field-types/mapping-parameters/meta.md b/_field-types/mapping-parameters/meta.md new file mode 100644 index 00000000000..8cd2fb89359 --- /dev/null +++ b/_field-types/mapping-parameters/meta.md @@ -0,0 +1,79 @@ +--- +layout: default +title: Meta +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 100 +has_children: false +has_toc: false +--- + +# Meta + +The `_meta` mapping parameter allows you to attach metadata to your mapping definition. This metadata is stored alongside your mapping and is returned when the mapping is retrieved, serving solely as informational context without influencing indexing or search operations. + +You can use the `_meta` mapping parameter to provide important details, such as version information, descriptions, or authorship. Metadata can also be updated by submitting a mapping update that overrides the existing metadata. + + +## Enabling meta on a mapping + +The following request creates an index named `products` with a `_meta` mapping parameter containing version and description information: + +```json +PUT /products +{ + "mappings": { + "_meta": { + "version": "1.0", + "description": "Mapping for the products index." + }, + "properties": { + "name": { + "type": "text" + }, + "price": { + "type": "float" + } + } + } +} +``` +{% include copy-curl.html %} + +### Updating metadata on an index + +Use the following request to update the `_meta` mapping parameter on an index: + +```json +PUT /products/_mapping +{ + "_meta": { + "version": "1.1", + "description": "Updated mapping for the products index.", + "author": "Team B" + } +} +``` +{% include copy-curl.html %} + +### Indexing a document + +After the index is created, you can index documents as usual. 
The `_meta` information remains with the mapping and does not affect the document indexing process: + +```json +PUT /products/_doc/1 +{ + "name": "Widget", + "price": 19.99 +} +``` +{% include copy-curl.html %} + +### Retrieve the meta information + +To verify that your `_meta` information is stored, you can retrieve the mapping for the index: + +```json +GET /products/_mapping +``` +{% include copy-curl.html %} diff --git a/_field-types/mapping-parameters/normalizer.md b/_field-types/mapping-parameters/normalizer.md new file mode 100644 index 00000000000..755e97fb798 --- /dev/null +++ b/_field-types/mapping-parameters/normalizer.md @@ -0,0 +1,92 @@ +--- +layout: default +title: Normalizer +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 110 +has_children: false +has_toc: false +--- + +# Normalizer + +The `normalizer` mapping parameter defines a custom normalization process for keyword fields. Unlike [analyzers]({{site.url}}{{site.baseurl}}/analyzers/supported-analyzers/index/) for text fields, which generate multiple tokens, [normalizers]({{site.url}}{{site.baseurl}}/analyzers/normalizers/) transform the entire field value into a single token using a set of token filters. When you define a normalizer, the keyword field is processed by the specified filters before it is stored while keeping the `_source` of the document unchanged. + + +## Defining a normalizer + +The following request creates an index named `products` with a custom normalizer called `my_normalizer`. The normalizer is applied to the `code` field, which uses the `trim` and `lowercase` filters: + +```json +PUT /products +{ + "settings": { + "analysis": { + "normalizer": { + "my_normalizer": { + "type": "custom", + "filter": ["trim", "lowercase"] + } + } + } + }, + "mappings": { + "properties": { + "code": { + "type": "keyword", + "normalizer": "my_normalizer" + } + } + } +} +``` +{% include copy-curl.html %} + +When you ingest a document into the index, the `code` field is normalized by trimming any extra spaces and converting the text to lowercase: + +```json +PUT /products/_doc/1 +{ + "code": " ABC-123 EXTRA " +} +``` +{% include copy-curl.html %} + +Search for the indexed document using lowercase and trimmed text in the query: + +```json +POST /products/_search +{ + "query": { + "term": { + "code": "abc-123 extra" + } + } +} +``` +{% include copy-curl.html %} + +Because the `code` field is normalized, the `term` query successfully matches the stored document: + +```json +{ +... + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.2876821, + "hits": [ + { + "_index": "products", + "_id": "1", + "_score": 0.2876821, + "_source": { + "code": " ABC-123 EXTRA " + } + } + ] + } +} +``` diff --git a/_field-types/mapping-parameters/norms.md b/_field-types/mapping-parameters/norms.md new file mode 100644 index 00000000000..7a0a9025afb --- /dev/null +++ b/_field-types/mapping-parameters/norms.md @@ -0,0 +1,67 @@ +--- +layout: default +title: Norms +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 120 +has_children: false +has_toc: false +--- + +# Norms + +The `norms` mapping parameter controls whether normalization factors are computed and stored for a field. These factors are used during query scoring to adjust the relevance of the search results. However, storing `norms` increases the index size and consumes additional memory. + +By default, `norms` is enabled on `text` fields, for which relevance scoring is important. 
Fields that do not require these scoring features, such as `keyword` fields used only for filtering, are configured with `norms` disabled. + +## Disabling `norms` on a field + +The following request creates an index named `products` with the `description` field as a `text` field with `norms` disabled: + +```json +PUT /products +{ + "mappings": { + "properties": { + "description": { + "type": "text", + "norms": false + } + } + } +} +``` +{% include copy-curl.html %} + +To disable `norms` on a field in an existing index, use the following request: + +```json +PUT /products/_mapping +{ + "properties": { + "review": { + "type": "text", + "norms": false + } + } +} +``` +{% include copy-curl.html %} + +Enabling `norms` on a field that has `norms` disabled is impossible and will result in the following error: + +```json +{ + "error": { + "root_cause": [ + { + "type": "illegal_argument_exception", + "reason": "Mapper for [description] conflicts with existing mapper:\n\tCannot update parameter [norms] from [false] to [true]" + } + ], + "type": "illegal_argument_exception", + "reason": "Mapper for [description] conflicts with existing mapper:\n\tCannot update parameter [norms] from [false] to [true]" + }, + "status": 400 +} +``` \ No newline at end of file diff --git a/_field-types/mapping-parameters/null-value.md b/_field-types/mapping-parameters/null-value.md new file mode 100644 index 00000000000..3857c274716 --- /dev/null +++ b/_field-types/mapping-parameters/null-value.md @@ -0,0 +1,135 @@ +--- +layout: default +title: Null value +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 130 +has_children: false +has_toc: false +--- + +# Null value + +The `null_value` mapping parameter allows you to replace explicit `null` values with a predefined substitute during indexing. By default, if a field is set to `null`, it is not indexed and cannot be searched. With `null_value` defined, the specified replacement value is indexed instead. This allows you to query or aggregate documents in which a field was originally `null` without modifying the document `_source`. + +The `null_value` must be of the same type as the field it is applied to. For instance, a `date` field cannot use a `boolean` such as `true` as its `null_value`; the `null_value` must be a valid date string. +{: .important} + +## Setting a null_value on a field + +The following request creates an index named `products`. The `category` field is of type `keyword` and replaces `null` values with `"unknown"` during indexing: + +```json +PUT /products +{ + "mappings": { + "properties": { + "category": { + "type": "keyword", + "null_value": "unknown" + } + } + } +} +``` +{% include copy-curl.html %} + +## Indexing a document with a null value + +Use the following command to index a document in which the `category` field is set to `null`: + +```json +PUT /products/_doc/1 +{ + "category": null +} +``` +{% include copy-curl.html %} + +## Querying the null substitute + +Use the following command to search for documents in which the `category` field was previously `null`: + +```json +POST /products/_search +{ + "query": { + "term": { + "category": "unknown" + } + } +} +``` +{% include copy-curl.html %} + +The response contains the matching document: + +```json +{ + ... 
+ "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.2876821, + "hits": [ + { + "_index": "products", + "_id": "1", + "_score": 0.2876821, + "_source": { + "category": null + } + } + ] + } +} +``` + +## Aggregating on a null substitute + +Because the null replacement is indexed, it also appears in aggregations. Use the following command to perform a `terms` aggregation on the `category` field: + +```json +POST /products/_search +{ + "size": 0, + "aggs": { + "category_count": { + "terms": { + "field": "category" + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains aggregated results: + +```json +{ + ... + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "category_count": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets": [ + { + "key": "unknown", + "doc_count": 1 + } + ] + } + } +} +``` \ No newline at end of file diff --git a/_field-types/mapping-parameters/position-increment-gap.md b/_field-types/mapping-parameters/position-increment-gap.md new file mode 100644 index 00000000000..305296ebdc9 --- /dev/null +++ b/_field-types/mapping-parameters/position-increment-gap.md @@ -0,0 +1,92 @@ +--- +layout: default +title: Position increment gap +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 140 +has_children: false +has_toc: false +--- + +# Position increment gap + +The `position_increment_gap` mapping parameter defines the positional distance between tokens of multi-valued fields during indexing. This affects how [`match_phrase`]({{site.url}}{{site.baseurl}}/query-dsl/full-text/match-phrase/) and [`span`]({{site.url}}{{site.baseurl}}/query-dsl/span/index/) queries behave when searching across multiple values of the same field. + +By default, each new value in a multi-valued field is treated as if it is separated from the previous one by a gap of `100` positions. This helps prevent false positives when searching for phrases that may span across different field values. + +## Setting a position increment gap + +Use the following request to create an index named `articles` with a `tags` field of type `text`, setting `position_increment_gap` to `0`: + +```json +PUT /articles +{ + "mappings": { + "properties": { + "tags": { + "type": "text", + "position_increment_gap": 0 + } + } + } +} +``` +{% include copy-curl.html %} + +## Indexing a multi-valued field + +Use the following request to index a document in which the `tags` field contains multiple values: + +```json +PUT /articles/_doc/1 +{ + "tags": ["machine", "learning"] +} +``` +{% include copy-curl.html %} + +## Search using a `match_phrase` query + +Use the following `match_phrase` query to search for "machine learning" in the `tags` field: + +```json +GET /articles/_search +{ + "query": { + "match_phrase": { + "tags": "machine learning" + } + } +} +``` +{% include copy-curl.html %} + +The result demonstrates that the phrase match succeeds because the `position_increment_gap` is set to `0`, allowing tokens from separate values to be treated as adjacent: + +```json +{ + ... 
+ "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.5753642, + "hits": [ + { + "_index": "articles", + "_id": "1", + "_score": 0.5753642, + "_source": { + "tags": [ + "machine", + "learning" + ] + } + } + ] + } +} +``` + +If the `position_increment_gap` remained at `100`, no hits would be returned because tokens `machine` and `learning` would be considered to be 100 positions away from each other. diff --git a/_field-types/mapping-parameters/properties.md b/_field-types/mapping-parameters/properties.md new file mode 100644 index 00000000000..f328692e58f --- /dev/null +++ b/_field-types/mapping-parameters/properties.md @@ -0,0 +1,149 @@ +--- +layout: default +title: Properties +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 150 +has_children: false +has_toc: false +--- + +# Properties + +The `properties` mapping parameter is used to define the structure and data types of fields within an object or the root of a document. It acts as the core of any mapping definition, allowing you to explicitly specify field names, types (such as `text`, `keyword`, `date`, or `float`), and additional settings or mapping parameters for each field. + +By using `properties`, you gain full control over how your data is indexed and stored, enabling precise search behavior, aggregation support, and data validation. + +## Defining fields with properties + +The following request creates an index named `products` with a structured mapping using the `properties` parameter. It includes a nested object field called `dimensions` with subfields: + +```json +PUT /products +{ + "mappings": { + "properties": { + "name": { + "type": "text" + }, + "sku": { + "type": "keyword" + }, + "price": { + "type": "float" + }, + "available": { + "type": "boolean" + }, + "created_at": { + "type": "date", + "format": "yyyy-MM-dd" + }, + "dimensions": { + "type": "object", + "properties": { + "width": { "type": "float" }, + "height": { "type": "float" }, + "depth": { "type": "float" } + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Indexing a document + +Use the following command to index a document with [nested fields]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/): + +```json +PUT /products/_doc/1 +{ + "name": "Wireless Mouse", + "sku": "WM-1001", + "price": 24.99, + "available": true, + "created_at": "2024-12-01", + "dimensions": { + "width": 6.5, + "height": 3.2, + "depth": 1.5 + } +} +``` +{% include copy-curl.html %} + +## Querying and aggregating using dot notation + +You can query or aggregate on object subfields using dot notation. Use the following command to execute a query that: + +- Filters documents on the `dimensions.width` field, returning documents in which `width` is between `5` and `10`. +- Creates a [histogram aggregation]({{site.url}}{{site.baseurl}}/aggregations/bucket/histogram/) on the `dimensions.depth` field, creating buckets for products using `depth` intervals of `0.5`. + +```json +POST /products/_search +{ + "query": { + "range": { + "dimensions.width": { + "gte": 5, + "lte": 10 + } + } + }, + "aggs": { + "Depth Distribution": { + "histogram": { + "field": "dimensions.depth", + "interval": 0.5 + } + } + } +} +``` +{% include copy-curl.html %} + +The following response shows a matching document in which the `dimensions.width` field falls within the specified range. It also includes a histogram aggregation result for `dimensions.depth`: + +```json +{ + ... 
+ "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "products", + "_id": "1", + "_score": 1, + "_source": { + "name": "Wireless Mouse", + "sku": "WM-1001", + "price": 24.99, + "available": true, + "created_at": "2024-12-01", + "dimensions": { + "width": 6.5, + "height": 3.2, + "depth": 1.5 + } + } + } + ] + }, + "aggregations": { + "Depth Distribution": { + "buckets": [ + { + "key": 1.5, + "doc_count": 1 + } + ] + } + } +} +``` diff --git a/_field-types/mapping-parameters/search-analyzer.md b/_field-types/mapping-parameters/search-analyzer.md new file mode 100644 index 00000000000..b6321bd66a2 --- /dev/null +++ b/_field-types/mapping-parameters/search-analyzer.md @@ -0,0 +1,56 @@ +--- +layout: default +title: Search analyzer +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 160 +has_children: false +has_toc: false +--- + +# Search analyzer + +The `search_analyzer` mapping parameter specifies the analyzer to be used at search time for a [`text`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/text/) field. This allows the analyzer used for indexing to differ from the one used for search, offering greater control over how search terms are interpreted and matched. + +By default, the same analyzer is used for both indexing and search. However, using a custom `search_analyzer` can be helpful when you want to apply looser or stricter matching rules during search, such as using [`stemming`]({{site.url}}{{site.baseurl}}/analyzers/stemming/) or removing stopwords only at search time. For more information and use cases, see [Search analyzers]({{site.url}}{{site.baseurl}}/analyzers/search-analyzers/). +{: .note} + +## Example + +The following example creates a field that uses an `edge_ngram_analyzer` configured with an [`edge_ngram_tokenizer`]({{site.url}}{{site.baseurl}}/analyzers/tokenizers/edge-n-gram/) for indexing and a [`standard` analyzer]({{site.url}}{{site.baseurl}}/analyzers/supported-analyzers/standard/) for search: + +```json +PUT /articles +{ + "settings": { + "analysis": { + "analyzer": { + "edge_ngram_analyzer": { + "tokenizer": "edge_ngram_tokenizer", + "filter": ["lowercase"] + } + }, + "tokenizer": { + "edge_ngram_tokenizer": { + "type": "edge_ngram", + "min_gram": 2, + "max_gram": 10, + "token_chars": ["letter", "digit"] + } + } + } + }, + "mappings": { + "properties": { + "title": { + "type": "text", + "analyzer": "edge_ngram_analyzer", + "search_analyzer": "standard" + } + } + } +} +``` +{% include copy-curl.html %} + +For a full explanation of how search analyzers work as well as more examples, see [Search analyzers]({{site.url}}{{site.baseurl}}/analyzers/search-analyzers/). diff --git a/_field-types/mapping-parameters/similarity.md b/_field-types/mapping-parameters/similarity.md new file mode 100644 index 00000000000..d41ac17de1d --- /dev/null +++ b/_field-types/mapping-parameters/similarity.md @@ -0,0 +1,90 @@ +--- +layout: default +title: Similarity +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 170 +has_children: false +has_toc: false +--- + +# Similarity + +The `similarity` mapping parameter lets you customize how relevance scores are calculated for a text field during search. It defines the scoring algorithm used to rank matching documents, which directly impacts how results are ordered in search responses. 
+ +## Supported similarity types + +- `BM25` (default): Uses a modern, probabilistic ranking model that balances term frequency, document length, and inverse document frequency. +- `boolean`: Returns constant scores (`1` or `0`), so should be used if you care only about matching, not relevance. + +## Setting a custom similarity on a field + +The following request creates an index named `products` with a `title` field that uses the `boolean` similarity, which assigns all matches the same score: + +```json +PUT /products +{ + "mappings": { + "properties": { + "title": { + "type": "text", + "similarity": "boolean" + } + } + } +} +``` +{% include copy-curl.html %} + +## Indexing a document + +Use the following command to index a sample document: + +```json +PUT /products/_doc/1 +{ + "title": "Compact Wireless Mouse" +} +``` +{% include copy-curl.html %} + +## Querying and inspecting scoring impact + +Use the following command to search by the `title` field: + +```json +POST /products/_search +{ + "query": { + "match": { + "title": "wireless mouse" + } + } +} +``` +{% include copy-curl.html %} + +You can examine the score returned in the `_score` field of the response: + +```json +{ + ... + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 2, + "hits": [ + { + "_index": "products", + "_id": "1", + "_score": 2, + "_source": { + "title": "Compact Wireless Mouse" + } + } + ] + } +} +``` diff --git a/_field-types/mapping-parameters/store.md b/_field-types/mapping-parameters/store.md new file mode 100644 index 00000000000..0bab4bf80a1 --- /dev/null +++ b/_field-types/mapping-parameters/store.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Store +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 180 +has_children: false +has_toc: false +--- + +# Store + +The `store` mapping parameter determines whether the value of a field should be stored separately from the `_source` and made directly retrievable using the `stored_fields` option in a search request. + +By default, `store` is set to `false`, meaning that field values are not stored individually and are only available as part of the document `_source`. If `store` is set to `true`, you can disable the `_source` to save disk space and still [retrieve specific fields]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/retrieve-specific-fields/). + +## Example: Enabling `store` on a field + +The following request creates an index named `products` in which the `model` field is stored separately from the `_source`: + +```json +PUT /products +{ + "mappings": { + "properties": { + "model": { + "type": "keyword", + "store": true + }, + "name": { + "type": "text" + } + } + } +} +``` +{% include copy-curl.html %} + +Ingest a document into the index: + +```json +PUT /products/_doc/1 +{ + "model": "WM-1001", + "name": "Wireless Mouse" +} +``` +{% include copy-curl.html %} + +Retrieve only the stored field: + +```json +POST /products/_search +{ + "query": { + "match": { + "name": "Mouse" + } + }, + "stored_fields": ["model"] +} +``` +{% include copy-curl.html %} + +This query returns the `model` field stored separately even though the `_source` is still available. 
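+
+The stored value is returned in the `fields` section of each hit rather than in `_source`. The following response is an illustrative sketch; the exact score and shard metadata will vary:
+
+```json
+{
+  ...
+  "hits": {
+    "total": {
+      "value": 1,
+      "relation": "eq"
+    },
+    "max_score": 0.2876821,
+    "hits": [
+      {
+        "_index": "products",
+        "_id": "1",
+        "_score": 0.2876821,
+        "fields": {
+          "model": [
+            "WM-1001"
+          ]
+        }
+      }
+    ]
+  }
+}
+```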
+ +--- + +## Example: Storing fields with `_source` disabled + +If you want to save disk space and don't need to access the full original document later (for example, for reindexing or updates), you can disable `_source` and store only necessary fields: + +```json +PUT /products_no_source +{ + "mappings": { + "_source": { + "enabled": false + }, + "properties": { + "model": { + "type": "keyword", + "store": true + }, + "name": { + "type": "text" + } + } + } +} +``` +{% include copy-curl.html %} + +Ingest a document into the index: + +```json +PUT /products_no_source/_doc/1 +{ + "model": "KB-2002", + "name": "Mechanical Keyboard" +} +``` +{% include copy-curl.html %} + +Retrieve the stored field: + +```json +POST /products_no_source/_search +{ + "query": { + "match": { + "name": "Keyboard" + } + }, + "stored_fields": ["model"] +} +``` +{% include copy-curl.html %} + +This query returns the `model` field retrieved from `stored_fields` without accessing the `_source`. + +If you attempt to retrieve the `_source` as follows: + +```json +GET /products_no_source/_doc/1 +``` + +Then the `_source` in the response will be `null`. This demonstrates that the full document is no longer available and that only stored fields can be retrieved because `_source` is disabled: + +```json +{ + "_index": "products_no_source", + "_id": "1", + "found": true, + "_source": null +} +``` diff --git a/_field-types/mapping-parameters/term-vector.md b/_field-types/mapping-parameters/term-vector.md new file mode 100644 index 00000000000..4c3daa39307 --- /dev/null +++ b/_field-types/mapping-parameters/term-vector.md @@ -0,0 +1,258 @@ +--- +layout: default +title: Term vector +parent: Mapping parameters +grand_parent: Mapping and field types +nav_order: 190 +has_children: false +has_toc: false +--- + +# Term vector + +The `term_vector` mapping parameter controls whether term-level information is stored for individual text fields during indexing. This information includes details such as term frequency, position, and character offsets, which can be used for advanced features like custom scoring and highlighting. + +By default, `term_vector` is disabled. When enabled, term vectors are stored and can be retrieved using the `_termvectors` API. + +Enabling `term_vector` increases index size. Only use it when you need detailed term-level data. +{: .important} + +## Configuration options + +The `term_vector` parameter supports the following valid values: + +- `no` (default): Term vectors are not stored. +- `yes`: Store term frequencies (the number of times a term appears in the specific document) and basic positions. +- `with_positions`: Store term positions. The order in which the term appears in the field. +- `with_offsets`: Store character offsets. The exact starting and ending character positions of the term within the field text. +- `with_positions_offsets`: Store both positions and offsets. +- `with_positions_payloads`: Store term positions along with payloads, which are optional pieces of custom metadata (such as tags or numeric values) that can be attached to individual terms during indexing. Payloads are used in advanced scenarios like custom scoring or tagging but require special analyzers in order to be set up. +- `with_positions_offsets_payloads`: Store all term vector data. 
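+
+For example, if you only need term frequencies, a minimal sketch of a mapping might use the `yes` option. The index name `articles-minimal` is used here only for illustration; the rest of this page uses the more detailed `with_positions_offsets` option:
+
+```json
+PUT /articles-minimal
+{
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "term_vector": "yes"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}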
+ +## Enabling term_vector on a field + +The following request creates an index named `articles` with the `content` field configured to store term vectors, including positions and offsets: + +```json +PUT /articles +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "term_vector": "with_positions_offsets" + } + } + } +} +``` +{% include copy-curl.html %} + + +Index a sample document: + +```json +PUT /articles/_doc/1 +{ + "content": "OpenSearch is an open-source search and analytics suite." +} +``` +{% include copy-curl.html %} + + +Retrieve term-level statistics using the `_termvectors` API: + +```json +POST /articles/_termvectors/1 +{ + "fields": ["content"], + "term_statistics": true, + "positions": true, + "offsets": true +} +``` +{% include copy-curl.html %} + +The following response includes detailed term-level statistics for the `content` field in document ID `1`, such as term frequency, document frequency, token positions, and character offsets: + +```json +{ + "_index": "articles", + "_id": "1", + "_version": 1, + "found": true, + "took": 4, + "term_vectors": { + "content": { + "field_statistics": { + "sum_doc_freq": 9, + "doc_count": 1, + "sum_ttf": 9 + }, + "terms": { + "an": { + "doc_freq": 1, + "ttf": 1, + "term_freq": 1, + "tokens": [ + { + "position": 2, + "start_offset": 14, + "end_offset": 16 + } + ] + }, + "analytics": { + "doc_freq": 1, + "ttf": 1, + "term_freq": 1, + "tokens": [ + { + "position": 7, + "start_offset": 40, + "end_offset": 49 + } + ] + }, + "and": { + "doc_freq": 1, + "ttf": 1, + "term_freq": 1, + "tokens": [ + { + "position": 6, + "start_offset": 36, + "end_offset": 39 + } + ] + }, + "is": { + "doc_freq": 1, + "ttf": 1, + "term_freq": 1, + "tokens": [ + { + "position": 1, + "start_offset": 11, + "end_offset": 13 + } + ] + }, + "open": { + "doc_freq": 1, + "ttf": 1, + "term_freq": 1, + "tokens": [ + { + "position": 3, + "start_offset": 17, + "end_offset": 21 + } + ] + }, + "opensearch": { + "doc_freq": 1, + "ttf": 1, + "term_freq": 1, + "tokens": [ + { + "position": 0, + "start_offset": 0, + "end_offset": 10 + } + ] + }, + "search": { + "doc_freq": 1, + "ttf": 1, + "term_freq": 1, + "tokens": [ + { + "position": 5, + "start_offset": 29, + "end_offset": 35 + } + ] + }, + "source": { + "doc_freq": 1, + "ttf": 1, + "term_freq": 1, + "tokens": [ + { + "position": 4, + "start_offset": 22, + "end_offset": 28 + } + ] + }, + "suite": { + "doc_freq": 1, + "ttf": 1, + "term_freq": 1, + "tokens": [ + { + "position": 8, + "start_offset": 50, + "end_offset": 55 + } + ] + } + } + } + } +} +``` + +## Highlighting with term vectors + +Use the following command to search for the term "analytics" and [highlight]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/highlight/) it using the field's stored term vectors: + +```json +POST /articles/_search +{ + "query": { + "match": { + "content": "analytics" + } + }, + "highlight": { + "fields": { + "content": { + "type": "fvh" + } + } + } +} +``` +{% include copy-curl.html %} + +The following response shows a matched document in which the term "analytics" was found in the `content` field. The `highlight` section includes the matched term wrapped in `<em>` tags, using the field's stored term vectors for efficient and accurate highlighting: + +```json +{ + ... 
+ "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.2876821, + "hits": [ + { + "_index": "articles", + "_id": "1", + "_score": 0.2876821, + "_source": { + "content": "OpenSearch is an open-source search and analytics suite." + }, + "highlight": { + "content": [ + "OpenSearch is an open-source search and <em>analytics</em> suite." + ] + } + } + ] + } +} +``` \ No newline at end of file diff --git a/_field-types/supported-field-types/flat-object.md b/_field-types/supported-field-types/flat-object.md index c9e59710e15..65d7c6dc8ea 100644 --- a/_field-types/supported-field-types/flat-object.md +++ b/_field-types/supported-field-types/flat-object.md @@ -56,7 +56,8 @@ The flat object field type supports the following queries: - [Multi-match]({{site.url}}{{site.baseurl}}/query-dsl/full-text/multi-match/) - [Query string]({{site.url}}{{site.baseurl}}/query-dsl/full-text/query-string/) - [Simple query string]({{site.url}}{{site.baseurl}}/query-dsl/full-text/simple-query-string/) -- [Exists]({{site.url}}{{site.baseurl}}/query-dsl/term/exists/) +- [Exists]({{site.url}}{{site.baseurl}}/query-dsl/term/exists/) +- [Wildcard]({{site.url}}{{site.baseurl}}/query-dsl/term/wildcard/) ## Limitations @@ -243,4 +244,4 @@ PUT /test-index/ ``` {% include copy-curl.html %} -Because `issue.number` is not part of the flat object, you can use it to aggregate and sort documents. \ No newline at end of file +Because `issue.number` is not part of the flat object, you can use it to aggregate and sort documents. diff --git a/_field-types/supported-field-types/index.md b/_field-types/supported-field-types/index.md index a43da396d52..94cd9ccd434 100644 --- a/_field-types/supported-field-types/index.md +++ b/_field-types/supported-field-types/index.md @@ -11,25 +11,66 @@ redirect_from: # Supported field types -You can specify data types for your fields when creating a mapping. The following table lists all data field types that OpenSearch supports. - -Category | Field types and descriptions -:--- | :--- -Alias | [`alias`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/alias/): An additional name for an existing field. -Binary | [`binary`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/binary/): A binary value in Base64 encoding. -[Numeric]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/numeric/) | A numeric value (`byte`, `double`, `float`, `half_float`, `integer`, `long`, [`unsigned_long`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/unsigned-long/), `scaled_float`, `short`). -Boolean | [`boolean`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/boolean/): A Boolean value. -[Date]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/dates/)| [`date`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/date/): A date stored in milliseconds. <br> [`date_nanos`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/date-nanos/): A date stored in nanoseconds. -IP | [`ip`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/ip/): An IP address in IPv4 or IPv6 format. -[Range]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/range/) | A range of values (`integer_range`, `long_range`, `double_range`, `float_range`, `date_range`, `ip_range`). -[Object]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/object-fields/)| [`object`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/object/): A JSON object. 
<br>[`nested`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/): Used when objects in an array need to be indexed independently as separate documents.<br>[`flat_object`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/flat-object/): A JSON object treated as a string.<br>[`join`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/): Establishes a parent/child relationship between documents in the same index.
-[String]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/string/)|[`keyword`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/keyword/): Contains a string that is not analyzed.<br> [`text`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/text/): Contains a string that is analyzed.<br> [`match_only_text`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/match-only-text/): A space-optimized version of a `text` field.<br>[`token_count`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/token-count/): Stores the number of analyzed tokens in a string. <br>[`wildcard`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/wildcard/): A variation of `keyword` with efficient substring and regular expression matching.
-[Autocomplete]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/autocomplete/) |[`completion`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/completion/): Provides autocomplete functionality through a completion suggester.<br> [`search_as_you_type`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/search-as-you-type/): Provides search-as-you-type functionality using both prefix and infix completion.
-[Geographic]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/geographic/)| [`geo_point`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/geo-point/): A geographic point.<br>[`geo_shape`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/geo-shape/): A geographic shape.
-[Rank]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/rank/) | Boosts or decreases the relevance score of documents (`rank_feature`, `rank_features`).
-k-NN vector | [`knn_vector`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/): Allows indexing a k-NN vector into OpenSearch and performing different kinds of k-NN search.
-Percolator | [`percolator`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/percolator/): Specifies to treat this field as a query.
-Derived | [`derived`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/derived/): Creates new fields dynamically by executing scripts on existing fields.
+You can specify data types for your fields when creating a mapping. The following sections group supported field types by purpose or data structure.
+
+## General field types
+
+| Field type | Description |
+|:---|:---|
+| [`alias`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/alias/) | An alternate name for an existing field. |
+| [`boolean`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/boolean/) | A true/false value. |
+| [`binary`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/binary/) | A binary value in Base64 encoding. |
+| [`percolator`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/percolator/) | A field that acts as a stored query. |
+| [`derived`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/derived/) | A dynamically generated field computed from other fields using a script. |
+
+## String-based field types
+
+| Field type | Description |
+|:---|:---|
+| [`keyword`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/keyword/) | A non-analyzed string, useful for exact matches. |
+| [`text`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/text/) | Analyzed full-text string. |
+| [`match_only_text`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/match-only-text/) | A lightweight version of `text` for search-only use cases. |
+| [`token_count`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/token-count/) | Stores the number of tokens after analysis. |
+| [`wildcard`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/wildcard/) | Enables efficient substring and regex matching. |
+
+## Numeric field types
+
+| Field type | Description |
+|:---|:---|
+| [`byte`, `double`, `float`, `half_float`, `integer`, `long`, `short`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/numeric/) | Stores integer or floating-point numbers in various precisions. |
+| [`unsigned_long`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/unsigned-long/) | A 64-bit unsigned integer. |
+| [`scaled_float`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/numeric/) | A floating-point number scaled by a fixed factor for storage. |
+
+## Date and time field types
+
+| Field type | Description |
+|:---|:---|
+| [`date`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/date/) | A date or timestamp stored in milliseconds. |
+| [`date_nanos`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/date-nanos/) | A date or timestamp stored in nanoseconds. |
+
+## IP field types
+
+| Field type | Description |
+|:---|:---|
+| [`ip`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/ip/) | Stores IPv4 or IPv6 addresses. |
+
+## Range field types
+
+| Field type | Description |
+|:---|:---|
+| [`integer_range`, `long_range`, `double_range`, `float_range`, `ip_range`, `date_range`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/range/) | Define ranges of numeric, date, or IP values. |
+
+## Object field types
+
+| Field type | Description |
+|:---|:---|
+| [`object`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/object/) | A JSON object. |
+| [`nested`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) | An array of JSON objects, indexed as separate documents. |
+| [`flat_object`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/flat-object/) | A JSON object treated as a flat map of strings. |
+| [`join`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) | Defines parent/child relationships between documents. |
+
+## Specialized search field types
+
+| Field type | Description |
+|:---|:---|
+| [`completion`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/completion/) | Supports autocomplete functionality using a suggester. |
+| [`search_as_you_type`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/search-as-you-type/) | Enables prefix and infix search-as-you-type queries. |
+| [`rank_feature`, `rank_features`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/rank/) | Boosts or lowers document relevance scores. |
+| [`knn_vector`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/) | Indexes a vector for k-NN search.
| +| [`semantic`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/semantic/) | Wraps a text or binary field to simplify semantic search setup. | +| [`star_tree`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/star-tree/) | Precomputes aggregations for faster performance using a [star-tree index](https://docs.pinot.apache.org/basics/indexing/star-tree-index). | ## Arrays diff --git a/_field-types/supported-field-types/keyword.md b/_field-types/supported-field-types/keyword.md index ca9c8085f6f..dae3f4b7439 100644 --- a/_field-types/supported-field-types/keyword.md +++ b/_field-types/supported-field-types/keyword.md @@ -57,7 +57,120 @@ Parameter | Description [`normalizer`]({{site.url}}{{site.baseurl}}/analyzers/normalizers/) | Specifies how to preprocess this field before indexing (for example, make it lowercase). Default is `null` (no preprocessing). `norms` | A Boolean value that specifies whether the field length should be used when calculating relevance scores. Default is `false`. [`null_value`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/index#null-value) | A value to be used in place of `null`. Must be of the same type as the field. If this parameter is not specified, the field is treated as missing when its value is `null`. Default is `null`. -`similarity` | The ranking algorithm for calculating relevance scores. Default is `BM25`. +`similarity` | The ranking algorithm for calculating relevance scores. Default is the index's `similarity` setting (by default, `BM25`). +`use_similarity` | Determines whether to calculate relevance scores. Default is `false`, which uses `constant_score` for faster queries. Setting this parameter to `true` enables scoring but may increase search latency. See [The use_similarity parameter ](#the-use_similarity-parameter). `split_queries_on_whitespace` | A Boolean value that specifies whether full-text queries should be split on white space. Default is `false`. `store` | A Boolean value that specifies whether the field value should be stored and can be retrieved separately from the `_source` field. Default is `false`. +## The use_similarity parameter + +The `use_similarity` parameter controls whether OpenSearch calculates relevance scores when querying a `keyword` field. By default, it is set to `false`, which improves performance by using `constant_score`. Setting it to `true` enables scoring based on the configured similarity algorithm (typically, BM25) but may increase query latency. 
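+
+The following examples use an index named `big5` that contains a `process.name` field of type `keyword`. If you want to try the requests without that dataset, a minimal sketch of such a mapping is shown below; the example responses assume that a large number of documents has already been ingested:
+
+```json
+PUT /big5
+{
+  "mappings": {
+    "properties": {
+      "process.name": {
+        "type": "keyword"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}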
+ +Run a term query on the index for which `use_similarity` is disabled (default): + +```json +GET /big5/_search +{ + "size": 3, + "explain": false, + "query": { + "term": { + "process.name": "kernel" + } + }, + "_source": false +} +``` +{% include copy-curl.html %} + +The query returns results quickly (10 ms), and all documents receive a constant relevance score of 1.0: + +```json +{ + "took": 10, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 10000, + "relation": "gte" + }, + "max_score": 1, + "hits": [ + { + "_index": "big5", + "_id": "xDoCtJQBE3c7bAfikzbk", + "_score": 1 + }, + { + "_index": "big5", + "_id": "xzoCtJQBE3c7bAfikzbk", + "_score": 1 + }, + { + "_index": "big5", + "_id": "yDoCtJQBE3c7bAfikzbk", + "_score": 1 + } + ] + } +} +``` + +To enable scoring using the default BM25 algorithm for the `process.name` field, provide the `use_similarity` parameter in the index mappings: + +```json +PUT /big5/_mapping +{ + "properties": { + "process.name": { + "type": "keyword", + "use_similarity": true + } + } +} +``` + +When you run the same term query on the configured index, the query takes longer to run (200 ms), and the returned documents have varying relevance scores based on term frequency and other BM25 factors: + +```json +{ + "took" : 200, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 10000, + "relation" : "gte" + }, + "max_score" : 0.8844931, + "hits" : [ + { + "_index" : "big5", + "_id" : "xDoCtJQBE3c7bAfikzbk", + "_score" : 0.8844931 + }, + { + "_index" : "big5", + "_id" : "xzoCtJQBE3c7bAfikzbk", + "_score" : 0.8844931 + }, + { + "_index" : "big5", + "_id" : "yDoCtJQBE3c7bAfikzbk", + "_score" : 0.8844931 + } + ] + } +} +``` \ No newline at end of file diff --git a/_field-types/supported-field-types/knn-memory-optimized.md b/_field-types/supported-field-types/knn-memory-optimized.md new file mode 100644 index 00000000000..3f423ace3aa --- /dev/null +++ b/_field-types/supported-field-types/knn-memory-optimized.md @@ -0,0 +1,930 @@ +--- +layout: default +title: Memory-optimized vectors +parent: k-NN vector +grand_parent: Supported field types +nav_order: 30 +--- + +# Memory-optimized vectors + +Vector search operations can be memory intensive, particularly when dealing with large-scale deployments. OpenSearch provides several strategies for optimizing memory usage while maintaining search performance. You can choose between different workload modes that prioritize either low latency or low cost, apply various compression levels to reduce memory footprint, or use alternative vector representations like byte or binary vectors. These optimization techniques allow you to balance memory consumption, search performance, and cost based on your specific use case requirements. + +## Vector workload modes + +Vector search requires balancing search performance and operational costs. While in-memory search provides the lowest latency, [disk-based search]({{site.url}}{{site.baseurl}}/vector-search/optimizing-storage/disk-based-vector-search/) offers a more cost-effective approach by reducing memory usage, though it results in slightly higher search latency. To choose between these approaches, use the `mode` mapping parameter in your `knn_vector` field configuration. This parameter sets appropriate default values for k-NN parameters based on your priority: either low latency or low cost. 
For additional optimization, you can override these default parameter values in your k-NN field mapping.
+
+OpenSearch supports the following vector workload modes.
+
+| Mode | Default engine | Description |
+|:---|:---|:---|
+| `in_memory` (Default) | `faiss` | Prioritizes low-latency search. This mode uses the `faiss` engine without any quantization applied. It is configured with the default parameter values for vector search in OpenSearch. |
+| `on_disk` | `faiss` | Prioritizes low-cost vector search while maintaining strong recall. By default, the `on_disk` mode uses quantization and rescoring to execute a two-phase approach in order to retrieve the top neighbors. The `on_disk` mode supports only `float` vector types. |
+
+To create a vector index that uses the `on_disk` mode for low-cost search, send the following request:
+
+```json
+PUT test-index
+{
+  "settings": {
+    "index": {
+      "knn": true
+    }
+  },
+  "mappings": {
+    "properties": {
+      "my_vector": {
+        "type": "knn_vector",
+        "dimension": 3,
+        "space_type": "l2",
+        "mode": "on_disk"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+### Compression levels
+
+The `compression_level` mapping parameter selects a quantization encoder that reduces vector memory consumption by the given factor. The following table lists the available `compression_level` values.
+
+| Compression level | Supported engines                             |
+|:------------------|:----------------------------------------------|
+| `1x`              | `faiss`, `lucene`, and `nmslib` (deprecated)  |
+| `2x`              | `faiss`                                       |
+| `4x`              | `lucene`                                      |
+| `8x`              | `faiss`                                       |
+| `16x`             | `faiss`                                       |
+| `32x`             | `faiss`                                       |
+
+For example, if a `compression_level` of `32x` is passed for a `float32` index of 768-dimensional vectors, the per-vector memory is reduced from `4 * 768 = 3072` bytes to `3072 / 32 = 96` bytes. Internally, binary quantization (which maps a `float` to a `bit`) may be used to achieve this compression.
+
+If you set the `compression_level` parameter, then you cannot specify an `encoder` in the `method` mapping. Compression levels greater than `1x` are only supported for `float` vector types.
+{: .note}
+
+Starting with OpenSearch 3.1, enabling `on_disk` mode with a `1x` compression level activates [memory-optimized search]({{site.url}}{{site.baseurl}}/vector-search/optimizing-storage/memory-optimized-search/). In this mode, the engine loads data on demand during search instead of loading all data into memory at once.
+{: .important}
+
+The following table lists the default `compression_level` values for the available workload modes.
+
+| Mode | Default compression level |
+|:------------------|:-------------------------------|
+| `in_memory` | `1x` |
+| `on_disk` | `32x` |
+
+
+To create a vector field with a `compression_level` of `16x`, specify the `compression_level` parameter in the mappings. This parameter overrides the default compression level for the `on_disk` mode from `32x` to `16x`, producing higher recall and accuracy at the expense of a larger memory footprint:
+
+```json
+PUT test-index
+{
+  "settings": {
+    "index": {
+      "knn": true
+    }
+  },
+  "mappings": {
+    "properties": {
+      "my_vector": {
+        "type": "knn_vector",
+        "dimension": 3,
+        "space_type": "l2",
+        "mode": "on_disk",
+        "compression_level": "16x"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Rescoring quantized results to full precision
+
+To improve recall while maintaining the memory savings of quantization, you can use a two-phase search approach. 
In the first phase, `oversample_factor * k` results are retrieved from an index using quantized vectors and the scores are approximated. In the second phase, the full-precision vectors of those `oversample_factor * k` results are loaded into memory from disk, and scores are recomputed against the full-precision query vector. The results are then reduced to the top k. + +The default rescoring behavior is determined by the `mode` and `compression_level` of the backing k-NN vector field: + +- For `in_memory` mode, no rescoring is applied by default. +- For `on_disk` mode, default rescoring is based on the configured `compression_level`. Each `compression_level` provides a default `oversample_factor`, specified in the following table. + +| Compression level | Default rescore `oversample_factor` | +|:------------------|:------------------------------------| +| `32x` (default) | 3.0 | +| `16x` | 2.0 | +| `8x` | 2.0 | +| `4x` | 1.0 | +| `2x` | No default rescoring | + +To explicitly apply rescoring, provide the `rescore` parameter in a query on a quantized index and specify the `oversample_factor`: + +```json +GET /my-vector-index/_search +{ + "size": 2, + "query": { + "knn": { + "target-field": { + "vector": [2, 3, 5, 6], + "k": 2, + "rescore" : { + "oversample_factor": 1.2 + } + } + } + } +} +``` +{% include copy-curl.html %} + +Alternatively, set the `rescore` parameter to `true` to use the default `oversample_factor` of `1.0`: + +```json +GET /my-vector-index/_search +{ + "size": 2, + "query": { + "knn": { + "target-field": { + "vector": [2, 3, 5, 6], + "k": 2, + "rescore" : true + } + } + } +} +``` +{% include copy-curl.html %} + +The `oversample_factor` is a floating-point number between 1.0 and 100.0, inclusive. The number of results in the first pass is calculated as `oversample_factor * k` and is guaranteed to be between 100 and 10,000, inclusive. If the calculated number of results is smaller than 100, then the number of results is set to 100. If the calculated number of results is greater than 10,000, then the number of results is set to 10,000. + +Rescoring is available only for the Faiss and Lucene engines. +{: .note} + +Rescoring is not needed if quantization is not used because the scores returned are already fully precise. +{: .note} + + +## Byte vectors + +By default, k-NN vectors are `float` vectors, in which each dimension is 4 bytes. If you want to save storage space, you can use `byte` vectors with the `faiss` or `lucene` engine. In a `byte` vector, each dimension is a signed 8-bit integer in the [-128, 127] range. + +Byte vectors are supported only for the `lucene` and `faiss` engines. They are not supported for the `nmslib` engine. +{: .note} + +In [k-NN benchmarking tests](https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/vectorsearch), the use of `byte` rather than `float` vectors resulted in a significant reduction in storage and memory usage as well as improved indexing throughput and reduced query latency. Additionally, recall precision was not greatly affected (note that recall can depend on various factors, such as the [quantization technique](#quantization-techniques) used and the data distribution). + +When using `byte` vectors, expect some loss of recall precision compared to using `float` vectors. Byte vectors are useful in large-scale applications and use cases that prioritize a reduced memory footprint in exchange for a minimal loss of recall. 
+{: .important} + +When using `byte` vectors with the `faiss` engine, we recommend using [Single Instruction Multiple Data (SIMD) optimization]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-methods-engines/#simd-optimization), which helps to significantly reduce search latencies and improve indexing throughput. +{: .important} + +Introduced in k-NN plugin version 2.9, the optional `data_type` parameter defines the data type of a vector. The default value of this parameter is `float`. + +To use a `byte` vector, set the `data_type` parameter to `byte` when creating mappings for an index. + +### Example: HNSW + +The following example creates a byte vector index with the `lucene` engine and `hnsw` algorithm: + +```json +PUT test-index +{ + "settings": { + "index": { + "knn": true, + "knn.algo_param.ef_search": 100 + } + }, + "mappings": { + "properties": { + "my_vector": { + "type": "knn_vector", + "dimension": 3, + "data_type": "byte", + "space_type": "l2", + "method": { + "name": "hnsw", + "engine": "lucene", + "parameters": { + "ef_construction": 100, + "m": 16 + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +After creating the index, ingest documents as usual. Make sure each dimension in the vector is in the supported [-128, 127] range: + +```json +PUT test-index/_doc/1 +{ + "my_vector": [-126, 28, 127] +} +``` +{% include copy-curl.html %} + +```json +PUT test-index/_doc/2 +{ + "my_vector": [100, -128, 0] +} +``` +{% include copy-curl.html %} + +When querying, be sure to use a `byte` vector: + +```json +GET test-index/_search +{ + "size": 2, + "query": { + "knn": { + "my_vector": { + "vector": [26, -120, 99], + "k": 2 + } + } + } +} +``` +{% include copy-curl.html %} + +### Example: IVF + +The `ivf` method requires a training step that creates a model and trains it to initialize the native library index during segment creation. For more information, see [Building a vector index from a model]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#building-a-vector-index-from-a-model). + +First, create an index that will contain byte vector training data. Specify the `faiss` engine and `ivf` algorithm and make sure that the `dimension` matches the dimension of the model you want to create: + +```json +PUT train-index +{ + "mappings": { + "properties": { + "train-field": { + "type": "knn_vector", + "dimension": 4, + "data_type": "byte" + } + } + } +} +``` +{% include copy-curl.html %} + +First, ingest training data containing byte vectors into the training index: + +```json +PUT _bulk +{ "index": { "_index": "train-index", "_id": "1" } } +{ "train-field": [127, 100, 0, -120] } +{ "index": { "_index": "train-index", "_id": "2" } } +{ "train-field": [2, -128, -10, 50] } +{ "index": { "_index": "train-index", "_id": "3" } } +{ "train-field": [13, -100, 5, 126] } +{ "index": { "_index": "train-index", "_id": "4" } } +{ "train-field": [5, 100, -6, -125] } +``` +{% include copy-curl.html %} + +Then, create and train the model named `byte-vector-model`. The model will be trained using the training data from the `train-field` in the `train-index`. 
Specify the `byte` data type: + +```json +POST _plugins/_knn/models/byte-vector-model/_train +{ + "training_index": "train-index", + "training_field": "train-field", + "dimension": 4, + "description": "model with byte data", + "data_type": "byte", + "method": { + "name": "ivf", + "engine": "faiss", + "space_type": "l2", + "parameters": { + "nlist": 1, + "nprobes": 1 + } + } +} +``` +{% include copy-curl.html %} + +To check the model training status, call the Get Model API: + +```json +GET _plugins/_knn/models/byte-vector-model?filter_path=state +``` +{% include copy-curl.html %} + +Once the training is complete, the `state` changes to `created`. + +Next, create an index that will initialize its native library indexes using the trained model: + +```json +PUT test-byte-ivf +{ + "settings": { + "index": { + "knn": true + } + }, + "mappings": { + "properties": { + "my_vector": { + "type": "knn_vector", + "model_id": "byte-vector-model" + } + } + } +} +``` +{% include copy-curl.html %} + +Ingest the data containing the byte vectors that you want to search into the created index: + +```json +PUT _bulk?refresh=true +{"index": {"_index": "test-byte-ivf", "_id": "1"}} +{"my_vector": [7, 10, 15, -120]} +{"index": {"_index": "test-byte-ivf", "_id": "2"}} +{"my_vector": [10, -100, 120, -108]} +{"index": {"_index": "test-byte-ivf", "_id": "3"}} +{"my_vector": [1, -2, 5, -50]} +{"index": {"_index": "test-byte-ivf", "_id": "4"}} +{"my_vector": [9, -7, 45, -78]} +{"index": {"_index": "test-byte-ivf", "_id": "5"}} +{"my_vector": [80, -70, 127, -128]} +``` +{% include copy-curl.html %} + +Finally, search the data. Be sure to provide a byte vector in the k-NN vector field: + +```json +GET test-byte-ivf/_search +{ + "size": 2, + "query": { + "knn": { + "my_vector": { + "vector": [100, -120, 50, -45], + "k": 2 + } + } + } +} +``` +{% include copy-curl.html %} + +### Memory estimation + +In the best-case scenario, byte vectors require 25% of the memory required by 32-bit vectors. + +#### HNSW memory estimation + +The memory required for Hierarchical Navigable Small World (HNSW) is estimated to be `1.1 * (dimension + 8 * m)` bytes/vector, where `m` is the maximum number of bidirectional links created for each element during the construction of the graph. + +As an example, assume that you have 1 million vectors with a `dimension` of `256` and an `m` of `16`. The memory requirement can be estimated as follows: + +```r +1.1 * (256 + 8 * 16) * 1,000,000 ~= 0.39 GB +``` + +#### IVF memory estimation + +The memory required for Inverted File Index (IVF) is estimated to be `1.1 * ((dimension * num_vectors) + (4 * nlist * dimension))` bytes/vector, where `nlist` is the number of buckets into which to partition vectors. + +As an example, assume that you have 1 million vectors with a `dimension` of `256` and an `nlist` of `128`. The memory requirement can be estimated as follows: + +```r +1.1 * ((256 * 1,000,000) + (4 * 128 * 256)) ~= 0.27 GB +``` + + +### Quantization techniques + +If your vectors are of the type `float`, you need to first convert them to the `byte` type before ingesting documents. This conversion is accomplished by _quantizing the dataset_---reducing the precision of its vectors. The Faiss engine supports several quantization techniques, such as scalar quantization (SQ) and product quantization (PQ). The choice of quantization technique depends on the type of data you're using and can affect the accuracy of recall values. 
The following sections describe the scalar quantization algorithms that were used to quantize the [k-NN benchmarking test](https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/vectorsearch) data for the [L2](#scalar-quantization-for-the-l2-space-type) and [cosine similarity](#scalar-quantization-for-the-cosine-similarity-space-type) space types. The provided pseudocode is for illustration purposes only.
+
+#### Scalar quantization for the L2 space type
+
+The following example pseudocode illustrates the scalar quantization technique used for the benchmarking tests on Euclidean datasets with the L2 space type. Euclidean distance is shift invariant. If you shift both $$x$$ and $$y$$ by the same $$z$$, then the distance remains the same ($$\lVert x-y\rVert =\lVert (x-z)-(y-z)\rVert$$).
+
+```python
+import numpy as np
+
+# Random dataset (Example to create a random dataset)
+dataset = np.random.uniform(-300, 300, (100, 10))
+# Random query set (Example to create a random queryset)
+queryset = np.random.uniform(-350, 350, (100, 10))
+# Number of values
+B = 256
+
+# INDEXING:
+# Get min and max
+dataset_min = np.min(dataset)
+dataset_max = np.max(dataset)
+# Shift coordinates to be non-negative
+dataset -= dataset_min
+# Normalize into [0, 1]
+dataset *= 1. / (dataset_max - dataset_min)
+# Bucket into 256 values
+dataset = np.floor(dataset * (B - 1)) - int(B / 2)
+
+# QUERYING:
+# Clip (if queryset range is out of dataset range)
+queryset = queryset.clip(dataset_min, dataset_max)
+# Shift coordinates to be non-negative
+queryset -= dataset_min
+# Normalize
+queryset *= 1. / (dataset_max - dataset_min)
+# Bucket into 256 values
+queryset = np.floor(queryset * (B - 1)) - int(B / 2)
+```
+{% include copy.html %}
+
+#### Scalar quantization for the cosine similarity space type
+
+The following example pseudocode illustrates the scalar quantization technique used for the benchmarking tests on angular datasets with the cosine similarity space type. Cosine similarity is not shift invariant ($$cos(x, y) \neq cos(x-z, y-z)$$).
+
+The following pseudocode is for positive numbers:
+
+```python
+# For Positive Numbers
+
+# INDEXING and QUERYING:
+
+# Get Max of train dataset
+max = np.max(dataset)
+min = 0
+B = 127
+
+# Normalize into [0,1]
+val = (val - min) / (max - min)
+val = (val * B)
+
+# Get int and fraction values
+int_part = floor(val)
+frac_part = val - int_part
+
+if 0.5 < frac_part:
+    bval = int_part + 1
+else:
+    bval = int_part
+
+return Byte(bval)
+```
+{% include copy.html %}
+
+The following pseudocode is for negative numbers:
+
+```python
+# For Negative Numbers
+
+# INDEXING and QUERYING:
+
+# Get Min of train dataset
+min = 0
+max = -np.min(dataset)
+B = 128
+
+# Normalize into [0,1]
+val = (val - min) / (max - min)
+val = (val * B)
+
+# Get int and fraction values
+int_part = floor(val)
+frac_part = val - int_part
+
+if 0.5 < frac_part:
+    bval = int_part + 1
+else:
+    bval = int_part
+
+return Byte(bval)
+```
+{% include copy.html %}
+
+## Binary vectors
+
+You can reduce memory costs by a factor of 32 by switching from float to binary vectors. Using binary vector indexes can lower operational costs while maintaining high recall performance, making large-scale deployment more economical and efficient.
+
+Binary format is available for the following k-NN search types:
+
+- [Approximate k-NN]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/): Supports binary vectors only for the Faiss engine with the HNSW and IVF algorithms. 
+- [Script score k-NN]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-score-script/): Enables the use of binary vectors in script scoring. +- [Painless extensions]({{site.url}}{{site.baseurl}}/search-plugins/knn/painless-functions/): Allows the use of binary vectors with Painless scripting extensions. + +### Requirements + +There are several requirements for using binary vectors in the OpenSearch k-NN plugin: + +- The `data_type` of the binary vector index must be `binary`. +- The `space_type` of the binary vector index must be `hamming`. +- The `dimension` of the binary vector index must be a multiple of 8. +- You must convert your binary data into 8-bit signed integers (`int8`) in the [-128, 127] range. For example, the binary sequence of 8 bits `0, 1, 1, 0, 0, 0, 1, 1` must be converted into its equivalent byte value of `99` in order to be used as a binary vector input. + +### Example: HNSW + +To create a binary vector index with the Faiss engine and HNSW algorithm, send the following request: + +```json +PUT /test-binary-hnsw +{ + "settings": { + "index": { + "knn": true + } + }, + "mappings": { + "properties": { + "my_vector": { + "type": "knn_vector", + "dimension": 8, + "data_type": "binary", + "space_type": "hamming", + "method": { + "name": "hnsw", + "engine": "faiss" + } + } + } + } +} +``` +{% include copy-curl.html %} + +Then ingest some documents containing binary vectors: + +```json +PUT _bulk +{"index": {"_index": "test-binary-hnsw", "_id": "1"}} +{"my_vector": [7], "price": 4.4} +{"index": {"_index": "test-binary-hnsw", "_id": "2"}} +{"my_vector": [10], "price": 14.2} +{"index": {"_index": "test-binary-hnsw", "_id": "3"}} +{"my_vector": [15], "price": 19.1} +{"index": {"_index": "test-binary-hnsw", "_id": "4"}} +{"my_vector": [99], "price": 1.2} +{"index": {"_index": "test-binary-hnsw", "_id": "5"}} +{"my_vector": [80], "price": 16.5} +``` +{% include copy-curl.html %} + +When querying, be sure to use a binary vector: + +```json +GET /test-binary-hnsw/_search +{ + "size": 2, + "query": { + "knn": { + "my_vector": { + "vector": [9], + "k": 2 + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains the two vectors closest to the query vector: + +<details markdown="block"> + <summary> + Response + </summary> + {: .text-delta} + +```json +{ + "took": 8, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 0.5, + "hits": [ + { + "_index": "test-binary-hnsw", + "_id": "2", + "_score": 0.5, + "_source": { + "my_vector": [ + 10 + ], + "price": 14.2 + } + }, + { + "_index": "test-binary-hnsw", + "_id": "5", + "_score": 0.25, + "_source": { + "my_vector": [ + 80 + ], + "price": 16.5 + } + } + ] + } +} +``` +</details> + +### Example: IVF + +The IVF method requires a training step that creates a model and trains it to initialize the native library index during segment creation. For more information, see [Building a vector index from a model]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#building-a-vector-index-from-a-model). + +First, create an index that will contain binary vector training data. 
Specify the Faiss engine and IVF algorithm and make sure that the `dimension` matches the dimension of the model you want to create: + +```json +PUT train-index +{ + "mappings": { + "properties": { + "train-field": { + "type": "knn_vector", + "dimension": 8, + "data_type": "binary" + } + } + } +} +``` +{% include copy-curl.html %} + +Ingest training data containing binary vectors into the training index: + +<details markdown="block"> + <summary> + Bulk ingest request + </summary> + {: .text-delta} + +```json +PUT _bulk +{ "index": { "_index": "train-index", "_id": "1" } } +{ "train-field": [1] } +{ "index": { "_index": "train-index", "_id": "2" } } +{ "train-field": [2] } +{ "index": { "_index": "train-index", "_id": "3" } } +{ "train-field": [3] } +{ "index": { "_index": "train-index", "_id": "4" } } +{ "train-field": [4] } +{ "index": { "_index": "train-index", "_id": "5" } } +{ "train-field": [5] } +{ "index": { "_index": "train-index", "_id": "6" } } +{ "train-field": [6] } +{ "index": { "_index": "train-index", "_id": "7" } } +{ "train-field": [7] } +{ "index": { "_index": "train-index", "_id": "8" } } +{ "train-field": [8] } +{ "index": { "_index": "train-index", "_id": "9" } } +{ "train-field": [9] } +{ "index": { "_index": "train-index", "_id": "10" } } +{ "train-field": [10] } +{ "index": { "_index": "train-index", "_id": "11" } } +{ "train-field": [11] } +{ "index": { "_index": "train-index", "_id": "12" } } +{ "train-field": [12] } +{ "index": { "_index": "train-index", "_id": "13" } } +{ "train-field": [13] } +{ "index": { "_index": "train-index", "_id": "14" } } +{ "train-field": [14] } +{ "index": { "_index": "train-index", "_id": "15" } } +{ "train-field": [15] } +{ "index": { "_index": "train-index", "_id": "16" } } +{ "train-field": [16] } +{ "index": { "_index": "train-index", "_id": "17" } } +{ "train-field": [17] } +{ "index": { "_index": "train-index", "_id": "18" } } +{ "train-field": [18] } +{ "index": { "_index": "train-index", "_id": "19" } } +{ "train-field": [19] } +{ "index": { "_index": "train-index", "_id": "20" } } +{ "train-field": [20] } +{ "index": { "_index": "train-index", "_id": "21" } } +{ "train-field": [21] } +{ "index": { "_index": "train-index", "_id": "22" } } +{ "train-field": [22] } +{ "index": { "_index": "train-index", "_id": "23" } } +{ "train-field": [23] } +{ "index": { "_index": "train-index", "_id": "24" } } +{ "train-field": [24] } +{ "index": { "_index": "train-index", "_id": "25" } } +{ "train-field": [25] } +{ "index": { "_index": "train-index", "_id": "26" } } +{ "train-field": [26] } +{ "index": { "_index": "train-index", "_id": "27" } } +{ "train-field": [27] } +{ "index": { "_index": "train-index", "_id": "28" } } +{ "train-field": [28] } +{ "index": { "_index": "train-index", "_id": "29" } } +{ "train-field": [29] } +{ "index": { "_index": "train-index", "_id": "30" } } +{ "train-field": [30] } +{ "index": { "_index": "train-index", "_id": "31" } } +{ "train-field": [31] } +{ "index": { "_index": "train-index", "_id": "32" } } +{ "train-field": [32] } +{ "index": { "_index": "train-index", "_id": "33" } } +{ "train-field": [33] } +{ "index": { "_index": "train-index", "_id": "34" } } +{ "train-field": [34] } +{ "index": { "_index": "train-index", "_id": "35" } } +{ "train-field": [35] } +{ "index": { "_index": "train-index", "_id": "36" } } +{ "train-field": [36] } +{ "index": { "_index": "train-index", "_id": "37" } } +{ "train-field": [37] } +{ "index": { "_index": "train-index", "_id": "38" } } +{ "train-field": [38] } +{ 
"index": { "_index": "train-index", "_id": "39" } } +{ "train-field": [39] } +{ "index": { "_index": "train-index", "_id": "40" } } +{ "train-field": [40] } +``` +{% include copy-curl.html %} +</details> + +Then, create and train the model named `test-binary-model`. The model will be trained using the training data from the `train_field` in the `train-index`. Specify the `binary` data type and `hamming` space type: + +```json +POST _plugins/_knn/models/test-binary-model/_train +{ + "training_index": "train-index", + "training_field": "train-field", + "dimension": 8, + "description": "model with binary data", + "data_type": "binary", + "space_type": "hamming", + "method": { + "name": "ivf", + "engine": "faiss", + "parameters": { + "nlist": 16, + "nprobes": 1 + } + } +} +``` +{% include copy-curl.html %} + +To check the model training status, call the Get Model API: + +```json +GET _plugins/_knn/models/test-binary-model?filter_path=state +``` +{% include copy-curl.html %} + +Once the training is complete, the `state` changes to `created`. + +Next, create an index that will initialize its native library indexes using the trained model: + +```json +PUT test-binary-ivf +{ + "settings": { + "index": { + "knn": true + } + }, + "mappings": { + "properties": { + "my_vector": { + "type": "knn_vector", + "model_id": "test-binary-model" + } + } + } +} +``` +{% include copy-curl.html %} + +Ingest the data containing the binary vectors that you want to search into the created index: + +```json +PUT _bulk?refresh=true +{"index": {"_index": "test-binary-ivf", "_id": "1"}} +{"my_vector": [7], "price": 4.4} +{"index": {"_index": "test-binary-ivf", "_id": "2"}} +{"my_vector": [10], "price": 14.2} +{"index": {"_index": "test-binary-ivf", "_id": "3"}} +{"my_vector": [15], "price": 19.1} +{"index": {"_index": "test-binary-ivf", "_id": "4"}} +{"my_vector": [99], "price": 1.2} +{"index": {"_index": "test-binary-ivf", "_id": "5"}} +{"my_vector": [80], "price": 16.5} +``` +{% include copy-curl.html %} + +Finally, search the data. Be sure to provide a binary vector in the k-NN vector field: + +```json +GET test-binary-ivf/_search +{ + "size": 2, + "query": { + "knn": { + "my_vector": { + "vector": [8], + "k": 2 + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains the two vectors closest to the query vector: + +<details markdown="block"> + <summary> + Response + </summary> + {: .text-delta} + +```json +GET /_plugins/_knn/models/my-model?filter_path=state +{ + "took": 7, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 0.5, + "hits": [ + { + "_index": "test-binary-ivf", + "_id": "2", + "_score": 0.5, + "_source": { + "my_vector": [ + 10 + ], + "price": 14.2 + } + }, + { + "_index": "test-binary-ivf", + "_id": "3", + "_score": 0.25, + "_source": { + "my_vector": [ + 15 + ], + "price": 19.1 + } + } + ] + } +} +``` +</details> + +### Memory estimation + +Use the following formulas to estimate the amount of memory required for binary vectors. 
+ +#### HNSW memory estimation + +The memory required for HNSW can be estimated using the following formula, where `m` is the maximum number of bidirectional links created for each element during the construction of the graph: + +```r +1.1 * (dimension / 8 + 8 * m) bytes/vector +``` + +#### IVF memory estimation + +The memory required for IVF can be estimated using the following formula, where `nlist` is the number of buckets into which to partition vectors: + +```r +1.1 * (((dimension / 8) * num_vectors) + (nlist * dimension / 8)) +``` + +## Next steps + +- [k-NN query]({{site.url}}{{site.baseurl}}/query-dsl/specialized/k-nn/) +- [Disk-based vector search]({{site.url}}{{site.baseurl}}/vector-search/optimizing-storage/disk-based-vector-search/) +- [Vector quantization]({{site.url}}{{site.baseurl}}/vector-search/optimizing-storage/knn-vector-quantization/) diff --git a/_field-types/supported-field-types/knn-methods-engines.md b/_field-types/supported-field-types/knn-methods-engines.md new file mode 100644 index 00000000000..d854b99e403 --- /dev/null +++ b/_field-types/supported-field-types/knn-methods-engines.md @@ -0,0 +1,422 @@ +--- +layout: default +title: Methods and engines +parent: k-NN vector +grand_parent: Supported field types +nav_order: 20 +--- + +# Methods and engines + +A _method_ defines the algorithm used for organizing vector data at indexing time and searching it at search time in [approximate k-NN search]({{site.url}}{{site.baseurl}}/vector-search/vector-search-techniques/approximate-knn/). + +OpenSearch supports the following methods: + +- **Hierarchical Navigable Small World (HNSW)** creates a hierarchical graph structure of connections between vectors. For more information about the algorithm, see [Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs](https://arxiv.org/abs/1603.09320). +- **Inverted File Index (IVF)** organizes vectors into buckets based on clustering and, during search, searches only a subset of the buckets. + +An _engine_ is the library that implements these methods. Different engines can implement the same method, sometimes with varying optimizations or characteristics. For example, HNSW is implemented by all supported engines, each with its own advantages. 
+ +OpenSearch supports the following engines: +- [**Lucene**](#lucene-engine): The native search library, offering an HNSW implementation with efficient filtering capabilities +- [**Faiss**](#faiss-engine) (Facebook AI Similarity Search): A comprehensive library implementing both the HNSW and IVF methods, with additional vector compression options +- [**NMSLIB**](#nmslib-engine-deprecated) (Non-Metric Space Library): A legacy implementation of HNSW (now deprecated) + +## Method definition example + +A method definition contains the following components: + +- The `name` of the method (for example, `hnsw` or `ivf`) +- The `space_type` for which the method is built (for example, `l2` or `cosinesimil`) +- The `engine` that will implement the method (for example, `faiss` or `lucene`) +- A map of `parameters` specific to that implementation + +The following example configures an `hnsw` method with the `l2` space type, the `faiss` engine, and the method-specific parameters: + +```json +PUT test-index +{ + "settings": { + "index": { + "knn": true, + "knn.algo_param.ef_search": 100 + } + }, + "mappings": { + "properties": { + "my_vector1": { + "type": "knn_vector", + "dimension": 1024, + "method": { + "name": "hnsw", + "space_type": "l2", + "engine": "faiss", + "parameters": { + "ef_construction": 128, + "m": 24 + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +Not every method/engine combination supports each of the spaces. For a list of supported spaces, see the section for a specific engine. +{: .note} + +## Common parameters + +The following parameters are common to all method definitions. + +Mapping parameter | Required | Default | Updatable | Description +:--- | :--- | :--- | :--- | :--- +`name` | Yes | N/A | No | The nearest neighbor method. Valid values are `hnsw` and `ivf`. Not every engine combination supports each of the methods. For a list of supported methods, see the section for a specific engine. +`space_type` | No | `l2` | No | The vector space used to calculate the distance between vectors. Valid values are `l1`, `l2`, `linf`, `cosinesimil`, `innerproduct`, `hamming`, and `hammingbit`. Not every method/engine combination supports each of the spaces. For a list of supported spaces, see the section for a specific engine. Note: This value can also be specified at the top level of the mapping. For more information, see [Spaces]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-spaces/). +`engine` | No | `faiss` | No | The approximate k-NN library to use for indexing and search. Valid values are `faiss`, `lucene`, and `nmslib` (deprecated). +`parameters` | No | `null` | No | The parameters used for the nearest neighbor method. For more information, see the section for a specific engine. + +## Lucene engine + +The Lucene engine provides a native implementation of vector search directly within Lucene. It offers efficient filtering capabilities and is well suited for smaller deployments. + +### Supported methods + +The Lucene engine supports the following method. + +Method name | Requires training | Supported spaces +:--- | :--- |:--- +[`hnsw`](#hnsw-parameters) | No | `l2`, `cosinesimil`, `innerproduct` (supported in OpenSearch 2.13 and later) + +#### HNSW parameters + +The HNSW method supports the following parameters. + +Parameter name | Required | Default | Updatable | Description +:--- | :--- | :--- | :--- | :--- +`ef_construction` | No | 100 | No | The size of the dynamic list used during k-NN graph creation. 
Higher values result in a more accurate graph but slower indexing speed.<br>Note: Lucene uses the term `beam_width` internally, but the OpenSearch documentation uses `ef_construction` for consistency. +`m` | No | 16 | No | The number of bidirectional links created for each new element. Impacts memory consumption significantly. Keep between `2` and `100`.<br>Note: Lucene uses the term `max_connections` internally, but the OpenSearch documentation uses `m` for consistency. + +The Lucene HNSW implementation ignores `ef_search` and dynamically sets it to the value of "k" in the search request. There is therefore no need to configure settings for `ef_search` when using the Lucene engine. +{: .note} + +An index created in OpenSearch version 2.11 or earlier will still use the previous `ef_construction` value (`512`). +{: .note} + +### Example configuration + +```json +"method": { + "name": "hnsw", + "engine": "lucene", + "parameters": { + "m": 2048, + "ef_construction": 245 + } +} +``` + +## Faiss engine + +The Faiss engine provides advanced vector indexing capabilities with support for multiple methods and encoding options to optimize memory usage and search performance. + +### Supported methods + +The Faiss engine supports the following methods. + +Method name | Requires training | Supported spaces +:--- | :--- |:--- +[`hnsw`](#hnsw-parameters-1) | No | `l2`, `innerproduct` (not available when [PQ](#pq-parameters) is used), `hamming`, and `cosinesimil` (supported in OpenSearch 2.19 and later). +[`ivf`](#ivf-parameters) | Yes | `l2`, `innerproduct`, `hamming` (supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized#binary-vectors), `cosinesimil` (supported in OpenSearch 2.19 and later). + + +#### HNSW parameters + +The `hnsw` method supports the following parameters. + +Parameter name | Required | Default | Updatable | Description +:--- | :--- | :--- | :--- | :--- +`ef_search` | No | 100 | No | The size of the dynamic list used during k-NN searches. Higher values result in more accurate but slower searches. Default is `256` for [binary indexes]({{site.url}}{{site.baseurl}}/vector-search/optimizing-storage/binary-quantization/). +`ef_construction` | No | 100 | No | The size of the dynamic list used during k-NN graph creation. Higher values result in a more accurate graph but slower indexing speed. Default is `256` for [binary indexes]({{site.url}}{{site.baseurl}}/vector-search/optimizing-storage/binary-quantization/). +`m` | No | 16 | No | The number of bidirectional links that the plugin creates for each new element. Increasing and decreasing this value can have a large impact on memory consumption. Keep this value between `2` and `100`. +`encoder` | No | flat | No | An encoder definition for encoding vectors. Encoders can reduce the memory footprint of your index at the expense of search accuracy. + +An index created in OpenSearch version 2.11 or earlier will still use the previous `ef_construction` value (`512`). +{: .note} + +#### IVF parameters + +The IVF method supports the following parameters. + +Parameter name | Required | Default | Updatable | Description +:--- | :--- | :--- | :--- | :--- +`nlist` | No | 4 | No | The number of buckets into which to partition vectors. Higher values may increase accuracy but also increase memory and training latency. +`nprobes` | No | 1 | No | The number of buckets to search during a query. 
Higher values result in more accurate but slower searches. +`encoder` | No | flat | No | An encoder definition for encoding vectors. + +For more information about these parameters, see the [Faiss documentation](https://github.com/facebookresearch/faiss/wiki/Faiss-indexes). + +### IVF training requirements + +The IVF algorithm requires a training step. To create an index that uses IVF, you need to train a model with the [Train API]({{site.url}}{{site.baseurl}}/vector-search/api/knn#train-a-model), passing the IVF method definition. IVF requires, at a minimum, that there be `nlist` training data points, but we recommend [that you use more than this](https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index#how-big-is-the-dataset). Training data can be the same as the data you plan to index or come from a separate dataset. + +### Supported encoders + +You can use encoders to reduce the memory footprint of a vector index at the expense of search accuracy. + +OpenSearch currently supports the following encoders in the Faiss library. + +Encoder name | Requires training | Description +:--- | :--- | :--- +`flat` (Default) | No | Encode vectors as floating-point arrays. This encoding does not reduce memory footprint. +[`pq`](#pq-parameters) | Yes | An abbreviation for _product quantization_, PQ is a lossy compression technique that uses clustering to encode a vector into a fixed byte size, with the goal of minimizing the drop in k-NN search accuracy. At a high level, vectors are separated into `m` subvectors, and then each subvector is represented by a `code_size` code obtained from a code book produced during training. For more information about product quantization, see [this blog post](https://medium.com/dotstar/understanding-faiss-part-2-79d90b1e5388). +[`sq`](#sq-parameters) | No | An abbreviation for _scalar quantization_. Starting with OpenSearch version 2.13, you can use the `sq` encoder to quantize 32-bit floating-point vectors into 16-bit floats. In version 2.13, the built-in `sq` encoder is the SQFP16 Faiss encoder. The encoder reduces memory footprint with a minimal loss of precision and improves performance by using SIMD optimization (using AVX2 on x86 architecture or Neon on ARM64 architecture). For more information, see [Faiss scalar quantization]({{site.url}}{{site.baseurl}}/vector-search/optimizing-storage/faiss-16-bit-quantization/). + +#### PQ parameters + +The `pq` encoder supports the following parameters. + +Parameter name | Required | Default | Updatable | Description +:--- | :--- | :--- | :--- | :--- +`m` | No | `1` | No | Determines the number of subvectors into which to separate the vector. Subvectors are encoded independently of each other. This vector dimension must be divisible by `m`. Maximum value is 1,024. +`code_size` | No | `8` | No | Determines the number of bits into which to encode a subvector. Maximum value is `8`. For `ivf`, this value must be less than or equal to `8`. For `hnsw`, this value must be `8`. + +The `hnsw` method supports the `pq` encoder for OpenSearch version 2.10 and later. The `code_size` parameter of a `pq` encoder with the `hnsw` method must be **8**. +{: .important} + +#### SQ parameters + +The `sq` encoder supports the following parameters. + +Parameter name | Required | Default | Updatable | Description +:--- | :--- | :-- | :--- | :--- +`type` | No | `fp16` | No | The type of scalar quantization to be used to encode 32-bit float vectors into the corresponding type. 
As of OpenSearch 2.13, only the `fp16` encoder type is supported. For the `fp16` encoder, vector values must be in the [-65504.0, 65504.0] range. +`clip` | No | `false` | No | If `true`, then any vector values outside of the supported range for the specified vector type are rounded so that they are within the range. If `false`, then the request is rejected if any vector values are outside of the supported range. Setting `clip` to `true` may decrease recall. + +For more information and examples, see [Using Faiss scalar quantization]({{site.url}}{{site.baseurl}}/vector-search/optimizing-storage/faiss-16-bit-quantization/). + +### SIMD optimization + +Starting with version 2.13, OpenSearch supports [Single Instruction Multiple Data (SIMD)](https://en.wikipedia.org/wiki/Single_instruction,_multiple_data) processing if the underlying hardware supports SIMD instructions (AVX2 on x64 architecture and Neon on ARM64 architecture). SIMD is supported by default on Linux machines only for the Faiss engine. SIMD architecture helps boost overall performance by improving indexing throughput and reducing search latency. Starting with version 2.18, OpenSearch supports AVX-512 SIMD instructions on x64 architecture. Starting with version 2.19, OpenSearch supports advanced AVX-512 SIMD instructions on x64 architecture for Intel Sapphire Rapids or a newer-generation processor, improving the performance of Hamming distance computation. + +SIMD optimization is applicable only if the vector dimension is a multiple of 8. +{: .note} + +<!-- vale off --> +#### x64 architecture +<!-- vale on --> + +For x64 architecture, the following versions of the Faiss library are built and shipped with the artifact: + +- `libopensearchknn_faiss_avx512_spr.so`: The Faiss library containing advanced AVX-512 SIMD instructions for newer-generation processors, available on public clouds such as AWS for c/m/r 7i or newer instances. +- `libopensearchknn_faiss_avx512.so`: The Faiss library containing AVX-512 SIMD instructions. +- `libopensearchknn_faiss_avx2.so`: The Faiss library containing AVX2 SIMD instructions. +- `libopensearchknn_faiss.so`: The non-optimized Faiss library without SIMD instructions. + +When using the Faiss library, the performance ranking is as follows: advanced AVX-512 > AVX-512 > AVX2 > no optimization. +{: .note } + +If your hardware supports advanced AVX-512(spr), OpenSearch loads the `libopensearchknn_faiss_avx512_spr.so` library at runtime. + +If your hardware supports AVX-512, OpenSearch loads the `libopensearchknn_faiss_avx512.so` library at runtime. + +If your hardware supports AVX2 but doesn't support AVX-512, OpenSearch loads the `libopensearchknn_faiss_avx2.so` library at runtime. + +To disable the advanced AVX-512 (for Sapphire Rapids or newer-generation processors), AVX-512, and AVX2 SIMD instructions and load the non-optimized Faiss library (`libopensearchknn_faiss.so`), specify the `knn.faiss.avx512_spr.disabled`, `knn.faiss.avx512.disabled`, and `knn.faiss.avx2.disabled` static settings as `true` in `opensearch.yml` (by default, all of these are set to `false`). + +Note that to update a static setting, you must stop the cluster, change the setting, and restart the cluster. For more information, see [Static settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/#static-settings). + +#### ARM64 architecture + +For the ARM64 architecture, only one performance-boosting Faiss library (`libopensearchknn_faiss.so`) is built and shipped. 
The library contains Neon SIMD instructions and cannot be disabled. + +### Example configurations + +The following example uses the `ivf` method without specifying an encoder (by default, OpenSearch uses the `flat` encoder): + +```json +"method": { + "name":"ivf", + "engine":"faiss", + "parameters":{ + "nlist": 4, + "nprobes": 2 + } +} +``` + +The following example uses the `ivf` method with a `pq` encoder: + +```json +"method": { + "name":"ivf", + "engine":"faiss", + "parameters":{ + "encoder":{ + "name":"pq", + "parameters":{ + "code_size": 8, + "m": 8 + } + } + } +} +``` + +The following example uses the `hnsw` method without specifying an encoder (by default, OpenSearch uses the `flat` encoder): + +```json +"method": { + "name":"hnsw", + "engine":"faiss", + "parameters":{ + "ef_construction": 256, + "m": 8 + } +} +``` + +The following example uses the `ivf` method with an `sq` encoder of type `fp16`: + +```json +"method": { + "name":"ivf", + "engine":"faiss", + "parameters":{ + "encoder": { + "name": "sq", + "parameters": { + "type": "fp16", + "clip": false + } + }, + "nprobes": 2 + } +} +``` + +The following example uses the `hnsw` method with an `sq` encoder of type `fp16` with `clip` enabled: + +```json +"method": { + "name":"hnsw", + "engine":"faiss", + "parameters":{ + "encoder": { + "name": "sq", + "parameters": { + "type": "fp16", + "clip": true + } + }, + "ef_construction": 256, + "m": 8 + } +} +``` + +## NMSLIB engine (deprecated) + +The Non-Metric Space Library (NMSLIB) engine was one of the first vector search implementations in OpenSearch. While still supported, it has been deprecated in favor of the Faiss and Lucene engines. + +### Supported methods + +The NMSLIB engine supports the following method. + +Method name | Requires training | Supported spaces +:--- | :--- | :--- +[`hnsw`](#hnsw-parameters-2) | No | `l2`, `innerproduct`, `cosinesimil`, `l1`, `linf` + +#### HNSW parameters + +The HNSW method supports the following parameters. + +Parameter name | Required | Default | Updatable | Description +:--- | :--- | :--- | :--- | :--- +`ef_construction` | No | 100 | No | The size of the dynamic list used during k-NN graph creation. Higher values result in a more accurate graph but slower indexing speed. +`m` | No | 16 | No | The number of bidirectional links created for each new element. Impacts memory consumption significantly. Keep between `2` and `100`. + +For NMSLIB (deprecated), *ef_search* is set in the [index settings]({{site.url}}{{site.baseurl}}/vector-search/settings/#index-settings). +{: .note} + +An index created in OpenSearch version 2.11 or earlier will still use the previous `ef_construction` value (`512`). +{: .note} + +### Example configuration + +```json +"method": { + "name": "hnsw", + "engine": "nmslib", + "space_type": "l2", + "parameters": { + "ef_construction": 100, + "m": 16 + } +} +``` + +## Choosing the right method + +There are several options to choose from when building your `knn_vector` field. To select the correct method and parameters, you should first understand the requirements of your workload and what trade-offs you are willing to make. Factors to consider are (1) query latency, (2) query quality, (3) memory limits, and (4) indexing latency. + +If memory is not a concern, HNSW offers a strong query latency/query quality trade-off. + +If you want to use less memory and increase indexing speed as compared to HNSW while maintaining similar query quality, you should evaluate IVF. 
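Because IVF requires the training step described in [IVF training requirements](#ivf-training-requirements), evaluating it usually starts with training a model on a representative sample of your vectors and then referencing that model in the field mapping. The following minimal sketch uses the Train API; the `my-training-index`, `train-field`, and `my-ivf-model` names, the dimension, and the parameter values are illustrative only:

```json
POST _plugins/_knn/models/my-ivf-model/_train
{
  "training_index": "my-training-index",
  "training_field": "train-field",
  "dimension": 128,
  "description": "IVF model for evaluation",
  "method": {
    "name": "ivf",
    "engine": "faiss",
    "space_type": "l2",
    "parameters": {
      "nlist": 128,
      "nprobes": 8
    }
  }
}
```
{% include copy-curl.html %}

Once the model state is `created`, create an index whose `knn_vector` field sets `"model_id": "my-ivf-model"` and compare its recall, latency, and memory usage against an equivalent HNSW index.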
+ +If memory is a concern, consider adding a PQ encoder to your HNSW or IVF index. Because PQ is a lossy encoding, query quality will drop. + +You can reduce the memory footprint by a factor of 2, with a minimal loss in search quality, by using the [`fp_16` encoder]({{site.url}}{{site.baseurl}}/vector-search/optimizing-storage/faiss-16-bit-quantization/). If your vector dimensions are within the [-128, 127] byte range, we recommend using the [byte quantizer]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized/#byte-vectors) to reduce the memory footprint by a factor of 4. To learn more about vector quantization options, see [k-NN vector quantization]({{site.url}}{{site.baseurl}}/vector-search/optimizing-storage/knn-vector-quantization/). + +## Engine recommendations + +In general, select Faiss for large-scale use cases. Lucene is a good option for smaller deployments and offers benefits like smart filtering, where the optimal filtering strategy—pre-filtering, post-filtering, or exact k-NN—is automatically applied depending on the situation. The following table summarizes the differences between each option. + +| | Faiss/HNSW | Faiss/IVF | Lucene/HNSW | +|:---|:---|:---|:---| +| Max dimensions | 16,000 | 16,000 | 16,000 | +| Filter | Post-filter | Post-filter | Filter during search | +| Training required | No (Yes for PQ) | Yes | No | +| Similarity metrics | `l2`, `innerproduct`, `cosinesimil` | `l2`, `innerproduct`, `cosinesimil` | `l2`, `cosinesimil` | +| Number of vectors | Tens of billions | Tens of billions | Less than 10 million | +| Indexing latency | Low | Lowest | Low | +| Query latency and quality | Low latency and high quality | Low latency and low quality | High latency and high quality | +| Vector compression | Flat <br><br>PQ | Flat <br><br>PQ | Flat | +| Memory consumption | High <br><br> Low with PQ | Medium <br><br> Low with PQ | High | + +## Memory estimation + +In a typical OpenSearch cluster, a certain portion of RAM is reserved for the JVM heap. OpenSearch allocates native library indexes to a portion of the remaining RAM. This portion's size is determined by the `circuit_breaker_limit` cluster setting. By default, the limit is set to 50%. + +Using a replica doubles the total number of vectors. +{: .note } + +For information about using memory estimation with vector quantization, see [Vector quantization]({{site.url}}{{site.baseurl}}/vector-search/optimizing-storage/knn-vector-quantization/). +{: .note } + +### HNSW memory estimation + +The memory required for HNSW is estimated to be `1.1 * (4 * dimension + 8 * m)` bytes/vector. + +As an example, assume you have 1 million vectors with a `dimension` of 256 and an `m` of 16. The memory requirement can be estimated as follows: + +```r +1.1 * (4 * 256 + 8 * 16) * 1,000,000 ~= 1.267 GB +``` + +### IVF memory estimation + +The memory required for IVF is estimated to be `1.1 * (((4 * dimension) * num_vectors) + (4 * nlist * d))` bytes. + +As an example, assume you have 1 million vectors with a `dimension` of `256` and an `nlist` of `128`. 
The memory requirement can be estimated as follows: + +```r +1.1 * (((4 * 256) * 1,000,000) + (4 * 128 * 256)) ~= 1.126 GB +``` + +## Next steps + +- [Performance tuning]({{site.url}}{{site.baseurl}}/vector-search/performance-tuning/) +- [Optimizing vector storage]({{site.url}}{{site.baseurl}}/vector-search/optimizing-storage/) +- [Vector quantization]({{site.url}}{{site.baseurl}}/vector-search/optimizing-storage/knn-vector-quantization/) diff --git a/_field-types/supported-field-types/knn-spaces.md b/_field-types/supported-field-types/knn-spaces.md new file mode 100644 index 00000000000..7b0ce09aaba --- /dev/null +++ b/_field-types/supported-field-types/knn-spaces.md @@ -0,0 +1,98 @@ +--- +layout: default +title: Spaces +parent: k-NN vector +grand_parent: Supported field types +nav_order: 10 +has_math: true +--- + +# Spaces + +In vector search, a _space_ defines how the distance (or similarity) between two vectors is calculated. The choice of space affects how nearest neighbors are determined during search operations. + +## Distance calculation + +A space defines the function used to measure the distance between two points in order to determine the k-nearest neighbors. In k-NN search, a lower score equates to a closer and better result. This is the opposite of how OpenSearch scores results, where a higher score equates to a better result. OpenSearch supports the following spaces. + +Not every method/engine combination supports each of the spaces. For a list of supported spaces, see the section for a specific engine in the [method documentation]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-methods-engines/). +{: .note} + +| Space type | Search type | Distance function ($$d$$ ) | OpenSearch score | +| :--- | :--- | :--- | +| `l1` | Approximate, exact | $$ d(\mathbf{x}, \mathbf{y}) = \sum_{i=1}^n \lvert x_i - y_i \rvert $$ | $$ score = {1 \over {1 + d} } $$ | +| `l2` | Approximate, exact | $$ d(\mathbf{x}, \mathbf{y}) = \sum_{i=1}^n (x_i - y_i)^2 $$ | $$ score = {1 \over 1 + d } $$ | +| `linf` | Approximate, exact | $$ d(\mathbf{x}, \mathbf{y}) = max(\lvert x_i - y_i \rvert) $$ | $$ score = {1 \over 1 + d } $$ | +| `cosinesimil` | Approximate, exact | $$ d(\mathbf{x}, \mathbf{y}) = 1 - cos { \theta } = 1 - {\mathbf{x} \cdot \mathbf{y} \over \lVert \mathbf{x}\rVert \cdot \lVert \mathbf{y}\rVert}$$$$ = 1 - {\sum_{i=1}^n x_i y_i \over \sqrt{\sum_{i=1}^n x_i^2} \cdot \sqrt{\sum_{i=1}^n y_i^2}}$$, <br> where $$\lVert \mathbf{x}\rVert$$ and $$\lVert \mathbf{y}\rVert$$ represent the norms of vectors $$\mathbf{x}$$ and $$\mathbf{y}$$, respectively. 
| $$ score = {2 - d \over 2} $$ | +| `innerproduct` (supported for Lucene in OpenSearch version 2.13 and later) | Approximate | **NMSLIB** and **Faiss**:<br> $$ d(\mathbf{x}, \mathbf{y}) = - {\mathbf{x} \cdot \mathbf{y}} = - \sum_{i=1}^n x_i y_i $$ <br><br>**Lucene**:<br> $$ d(\mathbf{x}, \mathbf{y}) = {\mathbf{x} \cdot \mathbf{y}} = \sum_{i=1}^n x_i y_i $$ | **NMSLIB** and **Faiss**:<br> $$ \text{If} d \ge 0, score = {1 \over 1 + d }$$ <br> $$\text{If} d < 0, score = −d + 1$$ <br><br>**Lucene:**<br> $$ \text{If} d > 0, score = d + 1 $$ <br> $$\text{If} d \le 0, score = {1 \over 1 + (-1 \cdot d) }$$ | +| `innerproduct` (supported for Lucene in OpenSearch version 2.13 and later) | Exact | $$ d(\mathbf{x}, \mathbf{y}) = - {\mathbf{x} \cdot \mathbf{y}} = - \sum_{i=1}^n x_i y_i $$ | $$ \text{If} d \ge 0, score = {1 \over 1 + d }$$ <br> $$\text{If} d < 0, score = −d + 1$$ | +| `hamming` (supported for binary vectors in OpenSearch version 2.16 and later) | Approximate, exact | $$ d(\mathbf{x}, \mathbf{y}) = \text{countSetBits}(\mathbf{x} \oplus \mathbf{y})$$ | $$ score = {1 \over 1 + d } $$ | +| `hammingbit` (supported for binary and long vectors) | Exact | $$ d(\mathbf{x}, \mathbf{y}) = \text{countSetBits}(\mathbf{x} \oplus \mathbf{y})$$ | $$ score = {1 \over 1 + d } $$ | + +The cosine similarity formula does not include the `1 -` prefix. However, because similarity search libraries equate lower scores with closer results, they return `1 - cosineSimilarity` for the cosine similarity space---this is why `1 -` is included in the distance function. +{: .note } + +With cosine similarity, it is not valid to pass a zero vector (`[0, 0, ...]`) as input. This is because the magnitude of such a vector is 0, which raises a `divide by 0` exception in the corresponding formula. Requests containing the zero vector will be rejected, and a corresponding exception will be thrown. +{: .note } + +The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized#binary-vectors). +{: .note} + +## Specifying the space type + +The space type is specified when creating an index. 
+ +You can specify the space type at the top level of the field mapping: + +```json +PUT /test-index +{ + "settings": { + "index": { + "knn": true + } + }, + "mappings": { + "properties": { + "my_vector1": { + "type": "knn_vector", + "dimension": 3, + "space_type": "l2" + } + } + } +} +``` +{% include copy-curl.html %} + +Alternatively, you can specify the space type within the `method` object if defining a method: + +```json +PUT test-index +{ + "settings": { + "index": { + "knn": true, + "knn.algo_param.ef_search": 100 + } + }, + "mappings": { + "properties": { + "my_vector1": { + "type": "knn_vector", + "dimension": 1024, + "method": { + "name": "hnsw", + "space_type": "l2", + "engine": "nmslib", + "parameters": { + "ef_construction": 128, + "m": 24 + } + } + } + } + } +} +``` +{% include copy-curl.html %} diff --git a/_field-types/supported-field-types/knn-vector.md b/_field-types/supported-field-types/knn-vector.md index da784aeefe6..4b05a084f3f 100644 --- a/_field-types/supported-field-types/knn-vector.md +++ b/_field-types/supported-field-types/knn-vector.md @@ -1,62 +1,24 @@ --- layout: default title: k-NN vector -nav_order: 58 -has_children: false +nav_order: 20 +has_children: true parent: Supported field types has_math: true --- -# k-NN vector field type +# k-NN vector **Introduced 1.0** {: .label .label-purple } -The [k-NN plugin]({{site.url}}{{site.baseurl}}/search-plugins/knn/index/) introduces a custom data type, the `knn_vector`, that allows users to ingest their k-NN vectors into an OpenSearch index and perform different kinds of k-NN search. The `knn_vector` field is highly configurable and can serve many different k-NN workloads. In general, a `knn_vector` field can be built either by providing a method definition or specifying a model id. +The `knn_vector` data type allows you to ingest vectors into an OpenSearch index and perform different kinds of vector search. The `knn_vector` field is highly configurable and can serve many different vector workloads. In general, a `knn_vector` field can be built either by [providing a method definition](#method-definitions) or [specifying a model ID](#model-ids). ## Example -For example, to map `my_vector` as a `knn_vector`, use the following request: +To map `my_vector` as a `knn_vector`, use the following request: ```json -PUT test-index -{ - "settings": { - "index": { - "knn": true - } - }, - "mappings": { - "properties": { - "my_vector": { - "type": "knn_vector", - "dimension": 3, - "space_type": "l2", - "method": { - "name": "hnsw", - "engine": "faiss" - } - } - } - } -} -``` -{% include copy-curl.html %} - -## Vector workload modes - -Vector search involves trade-offs between low-latency and low-cost search. Specify the `mode` mapping parameter of the `knn_vector` type to indicate which search mode you want to prioritize. The `mode` dictates the default values for k-NN parameters. You can further fine-tune your index by overriding the default parameter values in the k-NN field mapping. - -The following modes are currently supported. - -| Mode | Default engine | Description | -|:---|:---|:---| -| `in_memory` (Default) | `nmslib` | Prioritizes low-latency search. This mode uses the `nmslib` engine without any quantization applied. It is configured with the default parameter values for vector search in OpenSearch. | -| `on_disk` | `faiss` | Prioritizes low-cost vector search while maintaining strong recall. By default, the `on_disk` mode uses quantization and rescoring to execute a two-pass approach to retrieve the top neighbors. 
The `on_disk` mode supports only `float` vector types. | - -To create a k-NN index that uses the `on_disk` mode for low-cost search, send the following request: - -```json -PUT test-index +PUT /test-index { "settings": { "index": { @@ -68,8 +30,7 @@ PUT test-index "my_vector": { "type": "knn_vector", "dimension": 3, - "space_type": "l2", - "mode": "on_disk" + "space_type": "l2" } } } @@ -77,33 +38,10 @@ PUT test-index ``` {% include copy-curl.html %} -## Compression levels - -The `compression_level` mapping parameter selects a quantization encoder that reduces vector memory consumption by the given factor. The following table lists the available `compression_level` values. +## Optimizing vector storage -| Compression level | Supported engines | -|:------------------|:-------------------------------| -| `1x` | `faiss`, `lucene`, and `nmslib` | -| `2x` | `faiss` | -| `4x` | `lucene` | -| `8x` | `faiss` | -| `16x` | `faiss` | -| `32x` | `faiss` | +To optimize vector storage, you can specify a [vector workload mode]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized/#vector-workload-modes) as `in_memory` (which optimizes for lowest latency) or `on_disk` (which optimizes for lowest cost). The `on_disk` mode reduces memory usage. Optionally, you can specify a [`compression_level`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized/#compression-levels) to fine-tune the vector memory consumption: -For example, if a `compression_level` of `32x` is passed for a `float32` index of 768-dimensional vectors, the per-vector memory is reduced from `4 * 768 = 3072` bytes to `3072 / 32 = 846` bytes. Internally, binary quantization (which maps a `float` to a `bit`) may be used to achieve this compression. - -If you set the `compression_level` parameter, then you cannot specify an `encoder` in the `method` mapping. Compression levels greater than `1x` are only supported for `float` vector types. -{: .note} - -The following table lists the default `compression_level` values for the available workload modes. - -| Mode | Default compression level | -|:------------------|:-------------------------------| -| `in_memory` | `1x` | -| `on_disk` | `32x` | - - -To create a vector field with a `compression_level` of `16x`, specify the `compression_level` parameter in the mappings. This parameter overrides the default compression level for the `on_disk` mode from `32x` to `16x`, producing higher recall and accuracy at the expense of a larger memory footprint: ```json PUT test-index @@ -128,68 +66,10 @@ PUT test-index ``` {% include copy-curl.html %} -## Method definitions - -[Method definitions]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index#method-definitions) are used when the underlying [approximate k-NN]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/) algorithm does not require training. For example, the following `knn_vector` field specifies that *nmslib*'s implementation of *hnsw* should be used for approximate k-NN search. During indexing, *nmslib* will build the corresponding *hnsw* segment files. -```json -"my_vector": { - "type": "knn_vector", - "dimension": 4, - "space_type": "l2", - "method": { - "name": "hnsw", - "engine": "nmslib", - "parameters": { - "ef_construction": 100, - "m": 16 - } - } -} -``` - -## Model IDs - -Model IDs are used when the underlying Approximate k-NN algorithm requires a training step. 
As a prerequisite, the model must be created with the [Train API]({{site.url}}{{site.baseurl}}/search-plugins/knn/api#train-a-model). The -model contains the information needed to initialize the native library segment files. - -```json -"my_vector": { - "type": "knn_vector", - "model_id": "my-model" -} -``` - -However, if you intend to use Painless scripting or a k-NN score script, you only need to pass the dimension. - ```json -"my_vector": { - "type": "knn_vector", - "dimension": 128 - } - ``` - -## Byte vectors - -By default, k-NN vectors are `float` vectors, in which each dimension is 4 bytes. If you want to save storage space, you can use `byte` vectors with the `faiss` or `lucene` engine. In a `byte` vector, each dimension is a signed 8-bit integer in the [-128, 127] range. - -Byte vectors are supported only for the `lucene` and `faiss` engines. They are not supported for the `nmslib` engine. -{: .note} - -In [k-NN benchmarking tests](https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/vectorsearch), the use of `byte` rather than `float` vectors resulted in a significant reduction in storage and memory usage as well as improved indexing throughput and reduced query latency. Additionally, precision on recall was not greatly affected (note that recall can depend on various factors, such as the [quantization technique](#quantization-techniques) and data distribution). - -When using `byte` vectors, expect some loss of precision in the recall compared to using `float` vectors. Byte vectors are useful in large-scale applications and use cases that prioritize a reduced memory footprint in exchange for a minimal loss of recall. -{: .important} - -When using `byte` vectors with the `faiss` engine, we recommend using [SIMD optimization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index#simd-optimization-for-the-faiss-engine), which helps to significantly reduce search latencies and improve indexing throughput. -{: .important} - -Introduced in k-NN plugin version 2.9, the optional `data_type` parameter defines the data type of a vector. The default value of this parameter is `float`. - -To use a `byte` vector, set the `data_type` parameter to `byte` when creating mappings for an index: - -### Example: HNSW +## Method definitions -The following example creates a byte vector index with the `lucene` engine and `hnsw` algorithm: +[Method definitions]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-methods-engines/) are used when the underlying [approximate k-NN (ANN)]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/) algorithm does not require training. For example, the following `knn_vector` field specifies that a Faiss implementation of HNSW should be used for ANN search. During indexing, Faiss builds the corresponding HNSW segment files: ```json PUT test-index @@ -202,14 +82,13 @@ PUT test-index }, "mappings": { "properties": { - "my_vector": { + "my_vector1": { "type": "knn_vector", - "dimension": 3, - "data_type": "byte", - "space_type": "l2", + "dimension": 1024, "method": { "name": "hnsw", - "engine": "lucene", + "space_type": "l2", + "engine": "faiss", "parameters": { "ef_construction": 100, "m": 16 @@ -222,687 +101,79 @@ PUT test-index ``` {% include copy-curl.html %} -After creating the index, ingest documents as usual. 
Make sure each dimension in the vector is in the supported [-128, 127] range: +You can also specify the `space_type` at the top level: ```json -PUT test-index/_doc/1 -{ - "my_vector": [-126, 28, 127] -} -``` -{% include copy-curl.html %} - -```json -PUT test-index/_doc/2 -{ - "my_vector": [100, -128, 0] -} -``` -{% include copy-curl.html %} - -When querying, be sure to use a `byte` vector: - -```json -GET test-index/_search -{ - "size": 2, - "query": { - "knn": { - "my_vector": { - "vector": [26, -120, 99], - "k": 2 - } - } - } -} -``` -{% include copy-curl.html %} - -### Example: IVF - -The `ivf` method requires a training step that creates and trains the model used to initialize the native library index during segment creation. For more information, see [Building a k-NN index from a model]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#building-a-k-nn-index-from-a-model). - -First, create an index that will contain byte vector training data. Specify the `faiss` engine and `ivf` algorithm and make sure that the `dimension` matches the dimension of the model you want to create: - -```json -PUT train-index -{ - "mappings": { - "properties": { - "train-field": { - "type": "knn_vector", - "dimension": 4, - "data_type": "byte" - } - } - } -} -``` -{% include copy-curl.html %} - -First, ingest training data containing byte vectors into the training index: - -```json -PUT _bulk -{ "index": { "_index": "train-index", "_id": "1" } } -{ "train-field": [127, 100, 0, -120] } -{ "index": { "_index": "train-index", "_id": "2" } } -{ "train-field": [2, -128, -10, 50] } -{ "index": { "_index": "train-index", "_id": "3" } } -{ "train-field": [13, -100, 5, 126] } -{ "index": { "_index": "train-index", "_id": "4" } } -{ "train-field": [5, 100, -6, -125] } -``` -{% include copy-curl.html %} - -Then, create and train the model named `byte-vector-model`. The model will be trained using the training data from the `train-field` in the `train-index`. Specify the `byte` data type: - -```json -POST _plugins/_knn/models/byte-vector-model/_train -{ - "training_index": "train-index", - "training_field": "train-field", - "dimension": 4, - "description": "model with byte data", - "data_type": "byte", - "method": { - "name": "ivf", - "engine": "faiss", - "space_type": "l2", - "parameters": { - "nlist": 1, - "nprobes": 1 - } - } -} -``` -{% include copy-curl.html %} - -To check the model training status, call the Get Model API: - -```json -GET _plugins/_knn/models/byte-vector-model?filter_path=state -``` -{% include copy-curl.html %} - -Once the training is complete, the `state` changes to `created`. 
- -Next, create an index that will initialize its native library indexes using the trained model: - -```json -PUT test-byte-ivf -{ - "settings": { - "index": { - "knn": true - } - }, - "mappings": { - "properties": { - "my_vector": { - "type": "knn_vector", - "model_id": "byte-vector-model" - } - } - } -} -``` -{% include copy-curl.html %} - -Ingest the data containing the byte vectors that you want to search into the created index: - -```json -PUT _bulk?refresh=true -{"index": {"_index": "test-byte-ivf", "_id": "1"}} -{"my_vector": [7, 10, 15, -120]} -{"index": {"_index": "test-byte-ivf", "_id": "2"}} -{"my_vector": [10, -100, 120, -108]} -{"index": {"_index": "test-byte-ivf", "_id": "3"}} -{"my_vector": [1, -2, 5, -50]} -{"index": {"_index": "test-byte-ivf", "_id": "4"}} -{"my_vector": [9, -7, 45, -78]} -{"index": {"_index": "test-byte-ivf", "_id": "5"}} -{"my_vector": [80, -70, 127, -128]} -``` -{% include copy-curl.html %} - -Finally, search the data. Be sure to provide a byte vector in the k-NN vector field: - -```json -GET test-byte-ivf/_search -{ - "size": 2, - "query": { - "knn": { - "my_vector": { - "vector": [100, -120, 50, -45], - "k": 2 - } - } - } -} -``` -{% include copy-curl.html %} - -### Memory estimation - -In the best-case scenario, byte vectors require 25% of the memory required by 32-bit vectors. - -#### HNSW memory estimation - -The memory required for Hierarchical Navigable Small Worlds (HNSW) is estimated to be `1.1 * (dimension + 8 * m)` bytes/vector, where `m` is the maximum number of bidirectional links created for each element during the construction of the graph. - -As an example, assume that you have 1 million vectors with a dimension of 256 and an `m` of 16. The memory requirement can be estimated as follows: - -```r -1.1 * (256 + 8 * 16) * 1,000,000 ~= 0.39 GB -``` - -#### IVF memory estimation - -The memory required for IVF is estimated to be `1.1 * ((dimension * num_vectors) + (4 * nlist * dimension))` bytes/vector, where `nlist` is the number of buckets to partition vectors into. - -As an example, assume that you have 1 million vectors with a dimension of 256 and an `nlist` of 128. The memory requirement can be estimated as follows: - -```r -1.1 * ((256 * 1,000,000) + (4 * 128 * 256)) ~= 0.27 GB -``` - - -### Quantization techniques - -If your vectors are of the type `float`, you need to first convert them to the `byte` type before ingesting the documents. This conversion is accomplished by _quantizing the dataset_---reducing the precision of its vectors. There are many quantization techniques, such as scalar quantization or product quantization (PQ), which is used in the Faiss engine. The choice of quantization technique depends on the type of data you're using and can affect the accuracy of recall values. The following sections describe the scalar quantization algorithms that were used to quantize the [k-NN benchmarking test](https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/vectorsearch) data for the [L2](#scalar-quantization-for-the-l2-space-type) and [cosine similarity](#scalar-quantization-for-the-cosine-similarity-space-type) space types. The provided pseudocode is for illustration purposes only. - -#### Scalar quantization for the L2 space type - -The following example pseudocode illustrates the scalar quantization technique used for the benchmarking tests on Euclidean datasets with the L2 space type. Euclidean distance is shift invariant. 
If you shift both $$x$$ and $$y$$ by the same $$z$$, then the distance remains the same ($$\lVert x-y\rVert =\lVert (x-z)-(y-z)\rVert$$). - -```python -# Random dataset (Example to create a random dataset) -dataset = np.random.uniform(-300, 300, (100, 10)) -# Random query set (Example to create a random queryset) -queryset = np.random.uniform(-350, 350, (100, 10)) -# Number of values -B = 256 - -# INDEXING: -# Get min and max -dataset_min = np.min(dataset) -dataset_max = np.max(dataset) -# Shift coordinates to be non-negative -dataset -= dataset_min -# Normalize into [0, 1] -dataset *= 1. / (dataset_max - dataset_min) -# Bucket into 256 values -dataset = np.floor(dataset * (B - 1)) - int(B / 2) - -# QUERYING: -# Clip (if queryset range is out of datset range) -queryset = queryset.clip(dataset_min, dataset_max) -# Shift coordinates to be non-negative -queryset -= dataset_min -# Normalize -queryset *= 1. / (dataset_max - dataset_min) -# Bucket into 256 values -queryset = np.floor(queryset * (B - 1)) - int(B / 2) -``` -{% include copy.html %} - -#### Scalar quantization for the cosine similarity space type - -The following example pseudocode illustrates the scalar quantization technique used for the benchmarking tests on angular datasets with the cosine similarity space type. Cosine similarity is not shift invariant ($$cos(x, y) \neq cos(x-z, y-z)$$). - -The following pseudocode is for positive numbers: - -```python -# For Positive Numbers - -# INDEXING and QUERYING: - -# Get Max of train dataset -max = np.max(dataset) -min = 0 -B = 127 - -# Normalize into [0,1] -val = (val - min) / (max - min) -val = (val * B) - -# Get int and fraction values -int_part = floor(val) -frac_part = val - int_part - -if 0.5 < frac_part: - bval = int_part + 1 -else: - bval = int_part - -return Byte(bval) -``` -{% include copy.html %} - -The following pseudocode is for negative numbers: - -```python -# For Negative Numbers - -# INDEXING and QUERYING: - -# Get Min of train dataset -min = 0 -max = -np.min(dataset) -B = 128 - -# Normalize into [0,1] -val = (val - min) / (max - min) -val = (val * B) - -# Get int and fraction values -int_part = floor(var) -frac_part = val - int_part - -if 0.5 < frac_part: - bval = int_part + 1 -else: - bval = int_part - -return Byte(bval) -``` -{% include copy.html %} - -## Binary vectors - -You can reduce memory costs by a factor of 32 by switching from float to binary vectors. -Using binary vector indexes can lower operational costs while maintaining high recall performance, making large-scale deployment more economical and efficient. - -Binary format is available for the following k-NN search types: - -- [Approximate k-NN]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/): Supports binary vectors only for the Faiss engine with the HNSW and IVF algorithms. -- [Script score k-NN]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-score-script/): Enables the use of binary vectors in script scoring. -- [Painless extensions]({{site.url}}{{site.baseurl}}/search-plugins/knn/painless-functions/): Allows the use of binary vectors with Painless scripting extensions. - -### Requirements - -There are several requirements for using binary vectors in the OpenSearch k-NN plugin: - -- The `data_type` of the binary vector index must be `binary`. -- The `space_type` of the binary vector index must be `hamming`. -- The `dimension` of the binary vector index must be a multiple of 8. -- You must convert your binary data into 8-bit signed integers (`int8`) in the [-128, 127] range. 
For example, the binary sequence of 8 bits `0, 1, 1, 0, 0, 0, 1, 1` must be converted into its equivalent byte value of `99` to be used as a binary vector input. - -### Example: HNSW - -To create a binary vector index with the Faiss engine and HNSW algorithm, send the following request: - -```json -PUT /test-binary-hnsw +PUT test-index { "settings": { "index": { - "knn": true + "knn": true, + "knn.algo_param.ef_search": 100 } }, "mappings": { "properties": { - "my_vector": { + "my_vector1": { "type": "knn_vector", - "dimension": 8, - "data_type": "binary", - "space_type": "hamming", + "dimension": 1024, + "space_type": "l2", "method": { "name": "hnsw", - "engine": "faiss" - } - } - } - } -} -``` -{% include copy-curl.html %} - -Then ingest some documents containing binary vectors: - -```json -PUT _bulk -{"index": {"_index": "test-binary-hnsw", "_id": "1"}} -{"my_vector": [7], "price": 4.4} -{"index": {"_index": "test-binary-hnsw", "_id": "2"}} -{"my_vector": [10], "price": 14.2} -{"index": {"_index": "test-binary-hnsw", "_id": "3"}} -{"my_vector": [15], "price": 19.1} -{"index": {"_index": "test-binary-hnsw", "_id": "4"}} -{"my_vector": [99], "price": 1.2} -{"index": {"_index": "test-binary-hnsw", "_id": "5"}} -{"my_vector": [80], "price": 16.5} -``` -{% include copy-curl.html %} - -When querying, be sure to use a binary vector: - -```json -GET /test-binary-hnsw/_search -{ - "size": 2, - "query": { - "knn": { - "my_vector": { - "vector": [9], - "k": 2 - } - } - } -} -``` -{% include copy-curl.html %} - -The response contains the two vectors closest to the query vector: - -<details markdown="block"> - <summary> - Response - </summary> - {: .text-delta} - -```json -{ - "took": 8, - "timed_out": false, - "_shards": { - "total": 1, - "successful": 1, - "skipped": 0, - "failed": 0 - }, - "hits": { - "total": { - "value": 2, - "relation": "eq" - }, - "max_score": 0.5, - "hits": [ - { - "_index": "test-binary-hnsw", - "_id": "2", - "_score": 0.5, - "_source": { - "my_vector": [ - 10 - ], - "price": 14.2 - } - }, - { - "_index": "test-binary-hnsw", - "_id": "5", - "_score": 0.25, - "_source": { - "my_vector": [ - 80 - ], - "price": 16.5 + "engine": "faiss", + "parameters": { + "ef_construction": 100, + "m": 16 + } } } - ] - } -} -``` -</details> - -### Example: IVF - -The IVF method requires a training step that creates and trains the model used to initialize the native library index during segment creation. For more information, see [Building a k-NN index from a model]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#building-a-k-nn-index-from-a-model). - -First, create an index that will contain binary vector training data. 
Specify the Faiss engine and IVF algorithm and make sure that the `dimension` matches the dimension of the model you want to create: - -```json -PUT train-index -{ - "mappings": { - "properties": { - "train-field": { - "type": "knn_vector", - "dimension": 8, - "data_type": "binary" - } } } } ``` {% include copy-curl.html %} -Ingest training data containing binary vectors into the training index: - -<details markdown="block"> - <summary> - Bulk ingest request - </summary> - {: .text-delta} - -```json -PUT _bulk -{ "index": { "_index": "train-index", "_id": "1" } } -{ "train-field": [1] } -{ "index": { "_index": "train-index", "_id": "2" } } -{ "train-field": [2] } -{ "index": { "_index": "train-index", "_id": "3" } } -{ "train-field": [3] } -{ "index": { "_index": "train-index", "_id": "4" } } -{ "train-field": [4] } -{ "index": { "_index": "train-index", "_id": "5" } } -{ "train-field": [5] } -{ "index": { "_index": "train-index", "_id": "6" } } -{ "train-field": [6] } -{ "index": { "_index": "train-index", "_id": "7" } } -{ "train-field": [7] } -{ "index": { "_index": "train-index", "_id": "8" } } -{ "train-field": [8] } -{ "index": { "_index": "train-index", "_id": "9" } } -{ "train-field": [9] } -{ "index": { "_index": "train-index", "_id": "10" } } -{ "train-field": [10] } -{ "index": { "_index": "train-index", "_id": "11" } } -{ "train-field": [11] } -{ "index": { "_index": "train-index", "_id": "12" } } -{ "train-field": [12] } -{ "index": { "_index": "train-index", "_id": "13" } } -{ "train-field": [13] } -{ "index": { "_index": "train-index", "_id": "14" } } -{ "train-field": [14] } -{ "index": { "_index": "train-index", "_id": "15" } } -{ "train-field": [15] } -{ "index": { "_index": "train-index", "_id": "16" } } -{ "train-field": [16] } -{ "index": { "_index": "train-index", "_id": "17" } } -{ "train-field": [17] } -{ "index": { "_index": "train-index", "_id": "18" } } -{ "train-field": [18] } -{ "index": { "_index": "train-index", "_id": "19" } } -{ "train-field": [19] } -{ "index": { "_index": "train-index", "_id": "20" } } -{ "train-field": [20] } -{ "index": { "_index": "train-index", "_id": "21" } } -{ "train-field": [21] } -{ "index": { "_index": "train-index", "_id": "22" } } -{ "train-field": [22] } -{ "index": { "_index": "train-index", "_id": "23" } } -{ "train-field": [23] } -{ "index": { "_index": "train-index", "_id": "24" } } -{ "train-field": [24] } -{ "index": { "_index": "train-index", "_id": "25" } } -{ "train-field": [25] } -{ "index": { "_index": "train-index", "_id": "26" } } -{ "train-field": [26] } -{ "index": { "_index": "train-index", "_id": "27" } } -{ "train-field": [27] } -{ "index": { "_index": "train-index", "_id": "28" } } -{ "train-field": [28] } -{ "index": { "_index": "train-index", "_id": "29" } } -{ "train-field": [29] } -{ "index": { "_index": "train-index", "_id": "30" } } -{ "train-field": [30] } -{ "index": { "_index": "train-index", "_id": "31" } } -{ "train-field": [31] } -{ "index": { "_index": "train-index", "_id": "32" } } -{ "train-field": [32] } -{ "index": { "_index": "train-index", "_id": "33" } } -{ "train-field": [33] } -{ "index": { "_index": "train-index", "_id": "34" } } -{ "train-field": [34] } -{ "index": { "_index": "train-index", "_id": "35" } } -{ "train-field": [35] } -{ "index": { "_index": "train-index", "_id": "36" } } -{ "train-field": [36] } -{ "index": { "_index": "train-index", "_id": "37" } } -{ "train-field": [37] } -{ "index": { "_index": "train-index", "_id": "38" } } -{ "train-field": [38] } -{ "index": { 
"_index": "train-index", "_id": "39" } } -{ "train-field": [39] } -{ "index": { "_index": "train-index", "_id": "40" } } -{ "train-field": [40] } -``` -{% include copy-curl.html %} -</details> +## Model IDs -Then, create and train the model named `test-binary-model`. The model will be trained using the training data from the `train_field` in the `train-index`. Specify the `binary` data type and `hamming` space type: +Model IDs are used when the underlying ANN algorithm requires a training step. As a prerequisite, the model must be created using the [Train API]({{site.url}}{{site.baseurl}}/vector-search/api/knn#train-a-model). The model contains the information needed to initialize the native library segment files. To configure a model for a vector field, specify the `model_id`: ```json -POST _plugins/_knn/models/test-binary-model/_train -{ - "training_index": "train-index", - "training_field": "train-field", - "dimension": 8, - "description": "model with binary data", - "data_type": "binary", - "space_type": "hamming", - "method": { - "name": "ivf", - "engine": "faiss", - "parameters": { - "nlist": 16, - "nprobes": 1 - } - } +"my_vector": { + "type": "knn_vector", + "model_id": "my-model" } ``` -{% include copy-curl.html %} - -To check the model training status, call the Get Model API: - -```json -GET _plugins/_knn/models/test-binary-model?filter_path=state -``` -{% include copy-curl.html %} - -Once the training is complete, the `state` changes to `created`. -Next, create an index that will initialize its native library indexes using the trained model: +However, if you intend to use Painless scripting or a k-NN score script, you only need to pass the `dimension`: ```json -PUT test-binary-ivf -{ - "settings": { - "index": { - "knn": true - } - }, - "mappings": { - "properties": { - "my_vector": { - "type": "knn_vector", - "model_id": "test-binary-model" - } - } - } -} +"my_vector": { + "type": "knn_vector", + "dimension": 128 + } ``` -{% include copy-curl.html %} -Ingest the data containing the binary vectors that you want to search into the created index: +For more information, see [Building a vector index from a model]({{site.url}}{{site.baseurl}}/vector-search/vector-search-techniques/approximate-knn/#building-a-vector-index-from-a-model). -```json -PUT _bulk?refresh=true -{"index": {"_index": "test-binary-ivf", "_id": "1"}} -{"my_vector": [7], "price": 4.4} -{"index": {"_index": "test-binary-ivf", "_id": "2"}} -{"my_vector": [10], "price": 14.2} -{"index": {"_index": "test-binary-ivf", "_id": "3"}} -{"my_vector": [15], "price": 19.1} -{"index": {"_index": "test-binary-ivf", "_id": "4"}} -{"my_vector": [99], "price": 1.2} -{"index": {"_index": "test-binary-ivf", "_id": "5"}} -{"my_vector": [80], "price": 16.5} -``` -{% include copy-curl.html %} - -Finally, search the data. Be sure to provide a binary vector in the k-NN vector field: +### Parameters -```json -GET test-binary-ivf/_search -{ - "size": 2, - "query": { - "knn": { - "my_vector": { - "vector": [8], - "k": 2 - } - } - } -} -``` -{% include copy-curl.html %} +The following table lists the parameters accepted by k-NN vector field types. -The response contains the two vectors closest to the query vector: +Parameter | Data type | Description +:--- | :--- +`type` | String | The vector field type. Must be `knn_vector`. Required. +`dimension` | Integer | The size of the vectors used. Valid values are in the [1, 16,000] range. Required. +`data_type` | String | The data type of the vector elements. 
Valid values are `binary`, `byte`, and `float`. Optional. Default is `float`. +`space_type` | String | The vector space used to calculate the distance between vectors. Valid values are `l1`, `l2`, `linf`, `cosinesimil`, `innerproduct`, `hamming`, and `hammingbit`. Not every method/engine combination supports each of the spaces. For a list of supported spaces, see the section for a specific engine. Note: This value can also be specified within the `method`. Optional. For more information, see [Spaces]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-spaces/). +`mode` | String | Sets appropriate default values for k-NN parameters based on your priority: either low latency or low cost. Valid values are `in_memory` and `on_disk`. Optional. Default is `in_memory`. For more information, see [Memory-optimized vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized/). +`compression_level` | String | Selects a quantization encoder that reduces vector memory consumption by the given factor. Valid values are `1x`, `2x`, `4x`, `8x`, `16x`, and `32x`. Optional. For more information, see [Memory-optimized vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized/). +`method` | Object | The algorithm used for organizing vector data at indexing time and searching it at search time. Used when the ANN algorithm does not require training. Optional. For more information, see [Methods and engines]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-methods-engines/). +`model_id` | String | The model ID of a trained model. Used when the ANN algorithm requires training. See [Model IDs](#model-ids). Optional. -<details markdown="block"> - <summary> - Response - </summary> - {: .text-delta} +## Next steps -```json -GET /_plugins/_knn/models/my-model?filter_path=state -{ - "took": 7, - "timed_out": false, - "_shards": { - "total": 1, - "successful": 1, - "skipped": 0, - "failed": 0 - }, - "hits": { - "total": { - "value": 2, - "relation": "eq" - }, - "max_score": 0.5, - "hits": [ - { - "_index": "test-binary-ivf", - "_id": "2", - "_score": 0.5, - "_source": { - "my_vector": [ - 10 - ], - "price": 14.2 - } - }, - { - "_index": "test-binary-ivf", - "_id": "3", - "_score": 0.25, - "_source": { - "my_vector": [ - 15 - ], - "price": 19.1 - } - } - ] - } -} -``` -</details> +- [Spaces]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-spaces/) +- [Methods and engines]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-methods-engines/) +- [Memory-optimized vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized/) +- [Vector search]({{site.url}}{{site.baseurl}}/vector-search/) +- [k-NN query]({{site.url}}{{site.baseurl}}/query-dsl/specialized/k-nn/) \ No newline at end of file diff --git a/_field-types/supported-field-types/semantic.md b/_field-types/supported-field-types/semantic.md new file mode 100644 index 00000000000..33368cbedc4 --- /dev/null +++ b/_field-types/supported-field-types/semantic.md @@ -0,0 +1,213 @@ +--- +layout: default +title: Semantic +nav_order: 20 +parent: Supported field types +--- + +# Semantic field type +**Introduced 3.1** +{: .label .label-purple } + +The `semantic` field type is a high-level abstraction that simplifies neural search setup in OpenSearch. It can wrap a variety of field types, including all string and binary fields. 
The `semantic` field type automatically enables semantic indexing and querying based on the configured machine learning (ML) model. + +**PREREQUISITE**<br> +Before using the `semantic` field type, you must configure either a local ML model hosted on your OpenSearch cluster or an externally hosted model connected to your OpenSearch cluster. For more information about local models, see [Using ML models within OpenSearch]({{site.url}}{{site.baseurl}}/ml-commons-plugin/using-ml-models/). For more information about externally hosted models, see [Connecting to externally hosted models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index/). +{: .note} + +## Example: Dense embedding model + +Once you configure a model, you can use it to create an index with a `semantic` field. This example assumes that you have configured a dense embedding model with the ID `n17yX5cBsaYnPfyOzmQU` in your cluster: + +```json +PUT /my-nlp-index +{ + "settings": { + "index": { + "knn": true + } + }, + "mappings": { + "properties": { + "passage": { + "type": "semantic", + "model_id": "n17yX5cBsaYnPfyOzmQU" + } + } + } +} +``` +{% include copy-curl.html %} + +After creating the index, you can retrieve its mapping to verify that a `passage_semantic_info` field was automatically created. The `passage_semantic_info` field contains a `knn_vector` subfield for storing the dense embedding and additional metadata fields for capturing information such as the model ID, model name, and model type: + +```json +GET /my-nlp-index/_mapping +{ + "my-nlp-index": { + "mappings": { + "properties": { + "passage": { + "type": "semantic", + "model_id": "n17yX5cBsaYnPfyOzmQU", + "raw_field_type": "text" + }, + "passage_semantic_info": { + "properties": { + "embedding": { + "type": "knn_vector", + "dimension": 384, + "method": { + "engine": "faiss", + "space_type": "l2", + "name": "hnsw", + "parameters": {} + } + }, + "model": { + "properties": { + "id": { + "type": "text", + "index": false + }, + "name": { + "type": "text", + "index": false + }, + "type": { + "type": "text", + "index": false + } + } + } + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +The `dimension` and `space_type` of the `knn_vector` field are determined by the ML model configuration. For [pretrained dense models](ml-commons-plugin/pretrained-models/#sentence-transformers), this information is included in the default model configuration. For externally hosted dense embedding models, you must explicitly define the `dimension` and `space_type` in the model configuration before using the model with a `semantic` field. + +The autogenerated `knn_vector` subfield supports additional settings that are not currently configurable in the `semantic` field. For more information, see [Limitations](#limitations). +{: .note} + +## Example: Sparse encoding model + +Once you configure a model, you can use it to create an index with a `semantic` field. 
This example assumes that you have configured a sparse encoding model with the ID `n17yX5cBsaYnPfyOzmQU` in your cluster: + +```json +PUT /my-nlp-index +{ + "mappings": { + "properties": { + "passage": { + "type": "semantic", + "model_id": "nF7yX5cBsaYnPfyOq2SG" + } + } + } +} +``` +{% include copy-curl.html %} + +After creating the index, you can retrieve its mapping to verify that a `rank_features` field was automatically created: + +```json +GET /my-nlp-index/_mapping +{ + "my-nlp-index": { + "mappings": { + "properties": { + "passage": { + "type": "semantic", + "model_id": "nF7yX5cBsaYnPfyOq2SG", + "raw_field_type": "text" + }, + "passage_semantic_info": { + "properties": { + "embedding": { + "type": "rank_features" + }, + "model": { + "properties": { + "id": { + "type": "text", + "index": false + }, + "name": { + "type": "text", + "index": false + }, + "type": { + "type": "text", + "index": false + } + } + } + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Parameters + +The `semantic` field type supports the following parameters. + +| Parameter | Data type | Required/Optional | Description | +|-----------|-----------|----------------------|-------------| +| `type` | String | Required | Must be set to `semantic`. | +| `raw_field_type` | String | Optional | The underlying field type wrapped by the `semantic` field. The raw input is stored as this type at the path of the semantic field, allowing it to behave like a standard field of that type. Valid values are `text`, `keyword`, `match_only_text`, `wildcard`, `token_count`, and `binary`. Default is `text`. You can use any parameters supported by the underlying field type; those parameters function as expected. | +| `model_id` | String | Required | The ID of the ML model used to generate embeddings from field values during indexing and from query input during search. | +| `search_model_id` | String | Optional | The ID of the ML model used specifically for query-time embedding generation. If not specified, the `model_id` is used. Cannot be specified together with `semantic_field_search_analyzer`. | +| `semantic_info_field_name` | String | Optional | A custom name for the internal metadata field that stores the embedding and model information. By default, this field name is derived by appending `_semantic_info` to the semantic field name. | +| `chunking` | Boolean | Optional | Enables fixed-length token chunking during ingestion. When enabled, the input is split into chunks using a default configuration. See [Text chunking](#text-chunking).| +| `semantic_field_search_analyzer` | String | Optional | Specifies an analyzer for tokenizing the query input when using a sparse model. Valid values are `standard`, `bert-uncased`, and `mbert-uncased`. Cannot be used together with `search_model_id`. For more information, see [Analyzers]({{site.url}}{{site.baseurl}}/analyzers/supported-analyzers/). | + + +## Text chunking + +By default, text chunking is disabled for `semantic` fields. This is because enabling chunking requires storing each chunk's embedding in a nested object, which can increase search latency. Searching nested objects requires joining child documents to their parent, along with additional scoring and aggregation logic. The more matching child documents there are, the higher the potential latency. 
+ +If you're working with long-form text and want to improve search relevance, you can enable chunking by setting the `chunking` parameter for the `semantic` field to `true` when creating an index: + +```json +PUT /my-nlp-index +{ + "mappings": { + "properties": { + "passage": { + "type": "semantic", + "model_id": "nF7yX5cBsaYnPfyOq2SG", + "chunking": true + } + } + } +} +``` +{% include copy-curl.html %} + +Chunking is performed using the [fixed token length algorithm]({{site.url}}{{site.baseurl}}/ingest-pipelines/processors/text-chunking/#the-fixed-token-length-algorithm). + +### Limitations + +Note the following limitations of the `semantic` field: + +- When using a `semantic` field with a dense model, the automatically generated `knn_vector` subfield takes the `dimension` and `space_type` values from the model configuration, so you must ensure that this information is defined before using the model. Other `knn_vector` parameters use default values and cannot be customized. + +- Text chunking uses a [fixed token length algorithm]({{site.url}}{{site.baseurl}}/ingest-pipelines/processors/text-chunking/#the-fixed-token-length-algorithm) with default settings. You cannot modify the chunking algorithm. + +- For sparse models, OpenSearch applies a default prune ratio of `0.1` when generating sparse embeddings. This value is not configurable. Querying a semantic field with a sparse model is not supported by the [`neural_sparse_two_phase_processor`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/neural-sparse-query-two-phase-processor/), which is used to optimize search latency. + +- Querying a `semantic` field from a remote cluster is not supported. + +## Next steps + +- [Using a `semantic` field with text embedding models for semantic search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/semantic-search/#using-a-semantic-field) +- [Using a `semantic` field with sparse encoding models for neural sparse search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/neural-sparse-with-pipelines/#using-a-semantic-field) \ No newline at end of file diff --git a/_field-types/supported-field-types/star-tree.md b/_field-types/supported-field-types/star-tree.md new file mode 100644 index 00000000000..c4da9a39932 --- /dev/null +++ b/_field-types/supported-field-types/star-tree.md @@ -0,0 +1,234 @@ +--- +layout: default +title: Star-tree +nav_order: 61 +parent: Supported field types +--- + +# Star-tree field type + +A star-tree index precomputes aggregations, accelerating the performance of aggregation queries. +If a star-tree index is configured as part of an index mapping, the star-tree index is created and maintained as data is ingested in real time. + +OpenSearch will automatically use the star-tree index to optimize aggregations if the queried fields are part of star-tree index dimension fields and the aggregations are on star-tree index metric fields. No changes are required in the query syntax or the request parameters. + +For more information, see [Star-tree index]({{site.url}}{{site.baseurl}}/search-plugins/star-tree-index/). + +## Prerequisites + +To use a star-tree index, follow the instructions in [Enabling a star-tree index]({{site.url}}{{site.baseurl}}/search-plugins/star-tree-index#enabling-a-star-tree-index). + +## Examples + +The following examples show how to use a star-tree index. + +### Star-tree index mappings + +Define star-tree index mappings in the `composite` section in `mappings`. 
+ +The following example API request creates a corresponding star-tree index named`request_aggs`. To compute metric aggregations for `request_size` and `latency` fields with queries on `port` and `status` fields, configure the following mappings: + +```json +PUT logs +{ + "settings": { + "index.number_of_shards": 1, + "index.number_of_replicas": 0, + "index.composite_index": true, + "index.append_only.enabled": true + }, + "mappings": { + "composite": { + "request_aggs": { + "type": "star_tree", + "config": { + "max_leaf_docs": 10000, + "skip_star_node_creation_for_dimensions": [ + "port" + ], + "date_dimension" : { + "name": "@timestamp", + "calendar_intervals": [ + "month", + "day" + ] + }, + "ordered_dimensions": [ + { + "name": "status" + }, + { + "name": "port" + }, + { + "name": "method" + } + ], + "metrics": [ + { + "name": "request_size", + "stats": [ + "sum", + "value_count", + "min", + "max" + ] + }, + { + "name": "latency", + "stats": [ + "sum", + "value_count", + "min", + "max" + ] + } + ] + } + } + }, + "properties": { + "@timestamp": { + "format": "strict_date_optional_time||epoch_second", + "type": "date" + }, + "status": { + "type": "integer" + }, + "port": { + "type": "integer" + }, + "request_size": { + "type": "integer" + }, + "method" : { + "type": "keyword" + }, + "latency": { + "type": "scaled_float", + "scaling_factor": 10 + } + } + } +} +``` + +## Star-tree index configuration options + +You can customize your star-tree implementation using the following `config` options in the `mappings` section. These options cannot be modified without reindexing. + +| Parameter | Description | +| :--- | :--- | +| `ordered_dimensions` | A [list of fields](#ordered-dimensions) based on which metrics will be aggregated in a star-tree index. Required. | +| `date_dimension` | If the [date dimension](#date-dimension) is provided, `ordered_dimensions` is appended to it based on which metrics will be aggregated in a star-tree index. Optional. | +| `metrics` | A [list of metric](#metrics) fields required in order to perform aggregations. Required. | +| `max_leaf_docs` | The maximum number of star-tree documents that a leaf node can point to. After the maximum number of documents is reached, child nodes will be created based on the unique value of the next field in the `ordered_dimension` (if any). Default is `10000`. A lower value will use more storage but result in faster query performance. Inversely, a higher value will use less storage but result in slower query performance. For more information, see [Star-tree indexing structure]({{site.url}}{{site.baseurl}}/search-plugins/star-tree-index/#star-tree-index-structure). | +| `skip_star_node_creation_for_dimensions` | A list of dimensions for which a star-tree index will skip star node creation. When `true`, this reduces storage size at the expense of query performance. Default is `false`. For more information about star nodes, see [Star-tree indexing structure]({{site.url}}{{site.baseurl}}/search-plugins/star-tree-index/#star-tree-index-structure). | + + +### Ordered dimensions + +The `ordered_dimensions` parameter contains fields based on which metrics will be aggregated in a star-tree index. The star-tree index will be selected for querying only if all the fields in the query are part of the `ordered_dimensions`. + +When using the `ordered_dimesions` parameter, follow these best practices: + +- The order of dimensions matters. 
You can define the dimensions ordered from the highest cardinality to the lowest cardinality for efficient storage and query pruning.
+- Avoid using high-cardinality fields as dimensions. High-cardinality fields adversely affect storage space, indexing throughput, and query performance.
+- A minimum of `2` and a maximum of `10` dimensions are supported per star-tree index.
+
+The `ordered_dimensions` parameter supports the following field types:
+
+ - All numeric field types, excluding `unsigned_long` and `scaled_float`
+ - `keyword`
+ - `object`
+
+Support for other field types, such as `ip`, will be added in future versions. For more information, see [GitHub issue #13875](https://github.com/opensearch-project/OpenSearch/issues/13875).
+
+The `ordered_dimensions` parameter supports the following property.
+
+| Parameter | Required/Optional | Description |
+| :--- | :--- | :--- |
+| `name` | Required | The name of the field. The field name should be present in the `properties` section as part of the index `mapping`. Ensure that the `doc_values` setting is `enabled` for any associated fields. |
+
+
+### Date dimension
+
+The `date_dimension` supports one `date` field and is always the first dimension placed above the ordered dimensions, as they generally have high cardinality.
+
+The `date_dimension` can support up to three of the following calendar intervals:
+
+- `year` (of era)
+- `quarter` (of year)
+- `month` (of year)
+- `week` (of week-based year)
+- `day` (of month)
+- `hour` (of day)
+- `half-hour` (of day)
+- `quarter-hour` (of day)
+- `minute` (of hour)
+- `second` (of minute)
+
+
+Any values in the `date` field are rounded based on the granularity associated with the calendar intervals provided. For example:
+
+- The default `calendar_intervals` are `minute` and `half-hour`.
+- During queries, the nearest granular intervals are automatically picked up. For example, if you have configured `hour` and `minute` as the `calendar_intervals` and your query is a monthly date histogram, the `hour` interval will be automatically selected so that the query computes the results in an optimized way.
+- To support time-zone-based queries, `:30` equals a `half-hour` interval and `:15` equals a `quarter-hour` interval.
+
+
+### Metrics
+
+Configure any metric fields on which you need to perform aggregations. `Metrics` are required as part of a star-tree index configuration.
+
+When using `metrics`, follow these best practices:
+
+- Currently, fields supported by `metrics` are all [numeric field types]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/numeric/), with the exception of `unsigned_long`. For more information, see [GitHub issue #15231](https://github.com/opensearch-project/OpenSearch/issues/15231).
+- Supported metric aggregations include `Min`, `Max`, `Sum`, `Avg`, and `Value_count`.
+  - `Avg` is a derived metric based on `Sum` and `Value_count` and is not indexed when a query is run. The remaining base metrics are indexed.
+- A maximum of `100` base metrics are supported per star-tree index.
+
+If `Min`, `Max`, `Sum`, and `Value_count` are defined as `metrics` for each field, then up to 25 such fields can be configured, as shown in the following example:
+
+```json
+{
+  "metrics": [
+    {
+      "name": "field1",
+      "stats": [
+        "sum",
+        "value_count",
+        "min",
+        "max"
+      ],
+      ...,
+      ...,
+      "name": "field25",
+      "stats": [
+        "sum",
+        "value_count",
+        "min",
+        "max"
+      ]
+    }
+  ]
+}
+```
+
+
+#### Properties
+
+The `metrics` parameter supports the following properties.
+ +| Parameter | Required/Optional | Description | +| :--- | :--- | :--- | +| `name` | Required | The name of the field. The field name should be present in the `properties` section as part of the index `mapping`. Ensure that the `doc_values` setting is `enabled` for any associated fields. | +| `stats` | Optional | A list of metric aggregations computed for each field. You can choose between `Min`, `Max`, `Sum`, `Avg`, and `Value Count`.<br/>Default is `Sum` and `Value_count`.<br/>`Avg` is a derived metric statistic that will automatically be supported in queries if `Sum` and `Value_Count` are present as part of metric `stats`. + + +## Supported queries and aggregations + +For more information about supported queries and aggregations, see [Supported queries and aggregations for a star-tree index]({{site.url}}{{site.baseurl}}/search-plugins/star-tree-index/#supported-queries-and-aggregations). + +## Next steps + +- [Star-tree index]({{site.url}}{{site.baseurl}}/search-plugins/star-tree-index/) \ No newline at end of file diff --git a/_field-types/supported-field-types/text.md b/_field-types/supported-field-types/text.md index b06bec2187c..26ac30f1dfb 100644 --- a/_field-types/supported-field-types/text.md +++ b/_field-types/supported-field-types/text.md @@ -122,12 +122,12 @@ GET testindex/_search { "query": { "match": { - "text": "date of birth" + "dob": "date of birth" } }, "highlight": { "fields": { - "text": {} + "dob": {} } } } @@ -170,4 +170,4 @@ The words "date of birth" are highlighted in the response: ] } } -``` \ No newline at end of file +``` diff --git a/_getting-started/communicate.md b/_getting-started/communicate.md index 3472270c305..6f5382ff92f 100644 --- a/_getting-started/communicate.md +++ b/_getting-started/communicate.md @@ -28,7 +28,7 @@ curl -X GET "http://localhost:9200/_cluster/health" If you're using the Security plugin, provide the username and password in the request: ```bash -curl -X GET "http://localhost:9200/_cluster/health" -ku admin:<custom-admin-password> +curl -X GET "https://localhost:9200/_cluster/health" -ku admin:<custom-admin-password> ``` {% include copy.html %} @@ -37,7 +37,7 @@ The default username is `admin`, and the password is set in your `docker-compose OpenSearch generally returns responses in a flat JSON format by default. For a human-readable response body, provide the `pretty` query parameter: ```bash -curl -X GET "http://localhost:9200/_cluster/health?pretty" +curl -X GET "https://localhost:9200/_cluster/health?pretty" ``` {% include copy.html %} @@ -46,7 +46,7 @@ For more information about `pretty` and other useful query parameters, see [Comm For requests that contain a body, specify the `Content-Type` header and provide the request payload in the `-d` (data) option: ```json -curl -X GET "http://localhost:9200/students/_search?pretty" -H 'Content-Type: application/json' -d' +curl -X GET "https://localhost:9200/students/_search?pretty" -H 'Content-Type: application/json' -d' { "query": { "match_all": {} @@ -59,7 +59,7 @@ curl -X GET "http://localhost:9200/students/_search?pretty" -H 'Content-Type: ap The Dev Tools console in OpenSearch Dashboards uses a simpler syntax to format REST requests as compared to the cURL command. To send requests in Dev Tools, use the following steps: -1. Access OpenSearch Dashboards by opening `http://localhost:5601/` in a web browser on the same host that is running your OpenSearch cluster. 
The default username is `admin`, and the password is set in your `docker-compose.yml` file in the `OPENSEARCH_INITIAL_ADMIN_PASSWORD=<custom-admin-password>` setting. +1. Access OpenSearch Dashboards by opening `https://localhost:5601/` in a web browser on the same host that is running your OpenSearch cluster. The default username is `admin`, and the password is set in your `docker-compose.yml` file in the `OPENSEARCH_INITIAL_ADMIN_PASSWORD=<custom-admin-password>` setting. 1. On the top menu bar, go to **Management > Dev Tools**. 1. In the left pane of the console, enter the following request: ```json @@ -317,4 +317,4 @@ Once a field is created, you cannot change its type. Changing a field type requi ## Next steps -- See [Ingest data into OpenSearch]({{site.url}}{{site.baseurl}}/getting-started/ingest-data/) to learn about ingestion options. \ No newline at end of file +- See [Ingest data into OpenSearch]({{site.url}}{{site.baseurl}}/getting-started/ingest-data/) to learn about ingestion options. diff --git a/_getting-started/concepts.md b/_getting-started/concepts.md new file mode 100644 index 00000000000..bb5febc07b4 --- /dev/null +++ b/_getting-started/concepts.md @@ -0,0 +1,100 @@ +--- +layout: default +title: Concepts +nav_order: 70 +--- + +# Concepts + +This page defines key terms and concepts related to OpenSearch. + +## Basic concepts + +- [***Document***]({{site.url}}{{site.baseurl}}/getting-started/intro/#document): The basic unit of information in OpenSearch, stored in JSON format. +- [***Index***]({{site.url}}{{site.baseurl}}/getting-started/intro/#index): A collection of related documents. +- [***JSON (JavaScript object notation)***](https://www.json.org/): A text format used to store data in OpenSearch, representing information as key-value pairs. +- [***Mapping***]({{site.url}}{{site.baseurl}}/field-types/): The schema definition for an index that specifies how documents and their fields should be stored and indexed. + +## Cluster architecture + +- [***Node***]({{site.url}}{{site.baseurl}}/getting-started/intro/#clusters-and-nodes): A single server that is part of an OpenSearch cluster. +- [***Cluster***]({{site.url}}{{site.baseurl}}/getting-started/intro/#clusters-and-nodes): A collection of OpenSearch nodes working together. +- [***Cluster manager***]({{site.url}}{{site.baseurl}}/getting-started/intro/#clusters-and-nodes): The node responsible for managing cluster-wide operations. +- [***Shard***]({{site.url}}{{site.baseurl}}/getting-started/intro/#shards): A subset of an index's data; indexes are split into shards for distribution across nodes. +- [***Primary shard***]({{site.url}}{{site.baseurl}}/getting-started/intro/#primary-and-replica-shards): The original shard containing index data. +- [***Replica shard***]({{site.url}}{{site.baseurl}}/getting-started/intro/#primary-and-replica-shards): A copy of a primary shard for redundancy and search performance. + + +## Data structures and storage + +- [***Doc values***]({{site.url}}{{site.baseurl}}/field-types/mapping-parameters/doc-values/): An on-disk data structure for efficient sorting and aggregating of field values. +- [***Inverted index***]({{site.url}}{{site.baseurl}}/getting-started/intro/#inverted-index): A data structure that maps words to the documents containing them. +- ***Lucene***: The underlying search library that OpenSearch uses to index and search data. +- ***Segment***: An immutable unit of data storage within a shard. + +## Data operations + +- ***Ingestion***: The process of adding data to OpenSearch. 
+- [***Indexing***]({{site.url}}{{site.baseurl}}/api-reference/document-apis/index-document/): The process of storing and organizing data in OpenSearch to make it searchable. +- [***Bulk indexing***]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/): The process of indexing multiple documents in a single request. + +## Text analysis + +- [***Text analysis***]({{site.url}}{{site.baseurl}}/analyzers/): A process of splitting the unstructured free text content of a document into a sequence of terms, which are then stored in an inverted index. +- [***Analyzer***]({{site.url}}{{site.baseurl}}/analyzers/#analyzers): A component that processes text to prepare it for search. Analyzers convert text into terms that are stored in the inverted index. +- [***Tokenizer***]({{site.url}}{{site.baseurl}}/analyzers/tokenizers/index/): The component of an analyzer that splits text into individual tokens (usually words) and records metadata about their positions. +- [***Token filter***]({{site.url}}{{site.baseurl}}/analyzers/token-filters/index/): The final component of an analyzer, which modifies, adds, or removes tokens after tokenization. Examples include lowercase conversion, stopword removal, and synonym addition. +- [***Token***]({{site.url}}{{site.baseurl}}/analyzers/): A unit of text created by a tokenizer during text analysis. Tokens can be modified by token filters and contain metadata used in the text analysis process. +- [***Term***]({{site.url}}{{site.baseurl}}/analyzers/): A data value that is directly stored in the inverted index and used for matching during search operations. Terms have minimal associated metadata. +- [***Character filter***]({{site.url}}{{site.baseurl}}/analyzers/character-filters/index/): The first component of an analyzer that processes raw text by adding, removing, or modifying characters before tokenization. +- [***Normalizer***]({{site.url}}{{site.baseurl}}/analyzers/normalizers/): A special type of analyzer that processes text without tokenization. It can only perform character-level operations and cannot modify whole tokens. +- [***Stemming***]({{site.url}}{{site.baseurl}}/analyzers/stemming/): The process of reducing words to their root or base form, known as the _stem_. + +## Search and query concepts + +- ***Query***: A request to OpenSearch that describes what you're searching for in your data. +- ***Query clause***: A single condition within a query that specifies criteria for matching documents. +- [***Filter***]({{site.url}}{{site.baseurl}}/query-dsl/query-filter-context/#filter-context): A query component that finds exact matches without scoring. +- [***Filter context***]({{site.url}}{{site.baseurl}}/query-dsl/query-filter-context/): A query clause in a filter context asks the question _"Does the document match the query clause?"_ +- [***Query context***]({{site.url}}{{site.baseurl}}/query-dsl/query-filter-context/): A query clause in a query context asks the question _"How well does the document match the query clause?"_ +- [***Full-text search***]({{site.url}}{{site.baseurl}}/query-dsl/term-vs-full-text/): Search that analyzes and matches text fields, considering variations in word forms. +- [***Keyword search***]({{site.url}}{{site.baseurl}}/query-dsl/term-vs-full-text/): Search that requires exact text matches. +- [***Query domain-specific language (DSL)***]({{site.url}}{{site.baseurl}}/query-dsl/): OpenSearch's primary query language for creating complex, customizable searches. 
+- [***Query string query language***]({{site.url}}{{site.baseurl}}/query-dsl/full-text/query-string/): A simplified query syntax that can be used in URL parameters. +- [***Dashboards Query Language (DQL)***]({{site.url}}{{site.baseurl}}/dashboards/dql/): A simple text-based query language used specifically for filtering data in OpenSearch Dashboards. +- [***Piped Processing Language (PPL)***]({{site.url}}{{site.baseurl}}/search-plugins/sql/ppl/index/): A query language that uses pipe syntax (`|`) to chain commands for data processing and analysis. Primarily used for observability use cases in OpenSearch. +- [***Relevance score***]({{site.url}}{{site.baseurl}}/getting-started/intro/#relevance): A number indicating how well a document matches a query. +- [***Aggregation***]({{site.url}}{{site.baseurl}}/aggregations/): A way to analyze and summarize data based on a search query. + +## Vector search concepts + +See [Vector search concepts]({{site.url}}{{site.baseurl}}/vector-search/getting-started/concepts/). + +## Advanced concepts + +The following section describes more advanced OpenSearch concepts. + +### Update lifecycle + +The lifecycle of an update operation consists of the following steps: + +1. An update is received by a primary shard and is written to the shard's transaction log ([translog](#translog)). The translog is flushed to disk (followed by an fsync) before the update is acknowledged. This guarantees durability. +1. The update is also passed to the Lucene index writer, which adds it to an in-memory buffer. +1. On a [refresh operation](#refresh), the Lucene index writer flushes the in-memory buffers to disk (with each buffer becoming a new Lucene segment), and a new index reader is opened over the resulting segment files. The updates are now visible for search. +1. On a [flush operation](#flush), the shard fsyncs the Lucene segments. Because the segment files are a durable representation of the updates, the translog is no longer needed to provide durability, so the updates can be purged from the translog. + +### Translog + +An indexing or bulk call responds when the documents have been written to the translog and the translog is flushed to disk, so the updates are durable. The updates will not be visible to search requests until after a [refresh operation](#refresh). + +### Refresh + +Periodically, OpenSearch performs a _refresh_ operation, which writes the documents from the in-memory Lucene index to files. These files are not guaranteed to be durable because an `fsync` is not performed. A refresh makes documents available for search. + +### Flush + +A _flush_ operation persists the files to disk using `fsync`, ensuring durability. Flushing ensures that the data stored only in the translog is recorded in the Lucene index. OpenSearch performs a flush as needed to ensure that the translog does not grow too large. + +### Merge + +In OpenSearch, a shard is a Lucene index, which consists of _segments_ (or segment files). Segments store the indexed data and are immutable. Periodically, smaller segments are merged into larger ones. Merging reduces the overall number of segments on each shard, frees up disk space, and improves search performance. Eventually, segments reach a maximum size specified in the merge policy and are no longer merged into larger segments. The merge policy also specifies how often merges are performed. 
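+
+Refreshes, flushes, and merges normally happen automatically, but each operation can also be triggered explicitly through its API. The following is a minimal sketch that assumes an existing index named `my-index` (the index name is only an example):
+
+```json
+POST /my-index/_refresh
+
+POST /my-index/_flush
+
+POST /my-index/_forcemerge?max_num_segments=1
+```
+{% include copy-curl.html %}
+
+In most workloads you don't need to call these APIs directly: OpenSearch refreshes indexes on the interval set by `index.refresh_interval` (1 second by default) and performs flushes and merges automatically in the background.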
\ No newline at end of file diff --git a/_getting-started/intro.md b/_getting-started/intro.md index f5eb24ba2be..f1ea7ef0962 100644 --- a/_getting-started/intro.md +++ b/_getting-started/intro.md @@ -127,35 +127,6 @@ Individual words in a search query are called search _terms_. Each search term i OpenSearch uses the BM25 ranking algorithm to calculate document relevance scores and then returns the results sorted by relevance. To learn more, see [Okapi BM25](https://en.wikipedia.org/wiki/Okapi_BM25). -## Advanced concepts - -The following section describes more advanced OpenSearch concepts. - -### Update lifecycle - -The lifecycle of an update operation consists of the following steps: - -1. An update is received by a primary shard and is written to the shard's transaction log ([translog](#translog)). The translog is flushed to disk (followed by an fsync) before the update is acknowledged. This guarantees durability. -1. The update is also passed to the Lucene index writer, which adds it to an in-memory buffer. -1. On a [refresh operation](#refresh), the Lucene index writer flushes the in-memory buffers to disk (with each buffer becoming a new Lucene segment), and a new index reader is opened over the resulting segment files. The updates are now visible for search. -1. On a [flush operation](#flush), the shard fsyncs the Lucene segments. Because the segment files are a durable representation of the updates, the translog is no longer needed to provide durability, so the updates can be purged from the translog. - -### Translog - -An indexing or bulk call responds when the documents have been written to the translog and the translog is flushed to disk, so the updates are durable. The updates will not be visible to search requests until after a [refresh operation](#refresh). - -### Refresh - -Periodically, OpenSearch performs a _refresh_ operation, which writes the documents from the in-memory Lucene index to files. These files are not guaranteed to be durable because an `fsync` is not performed. A refresh makes documents available for search. - -### Flush - -A _flush_ operation persists the files to disk using `fsync`, ensuring durability. Flushing ensures that the data stored only in the translog is recorded in the Lucene index. OpenSearch performs a flush as needed to ensure that the translog does not grow too large. - -### Merge - -In OpenSearch, a shard is a Lucene index, which consists of _segments_ (or segment files). Segments store the indexed data and are immutable. Periodically, smaller segments are merged into larger ones. Merging reduces the overall number of segments on each shard, frees up disk space, and improves search performance. Eventually, segments reach a maximum size specified in the merge policy and are no longer merged into larger segments. The merge policy also specifies how often merges are performed. - ## Next steps - Learn how to install OpenSearch within minutes in [Installation quickstart]({{site.url}}{{site.baseurl}}/getting-started/quickstart/). diff --git a/_getting-started/quickstart.md b/_getting-started/quickstart.md index 0a28e29a04a..4feacee4f55 100644 --- a/_getting-started/quickstart.md +++ b/_getting-started/quickstart.md @@ -14,8 +14,6 @@ To quickly get started using OpenSearch and OpenSearch Dashboards, deploy your c Before proceeding, you need to install [Docker](https://docs.docker.com/get-docker/) and [Docker Compose](https://github.com/docker/compose) on your local machine. 
-The Docker Compose commands used in this guide are written with a hyphen (for example, `docker-compose`). If you installed Docker Desktop on your machine, which automatically installs a bundled version of Docker Compose, then you should remove the hyphen. For example, change `docker-compose` to `docker compose`. -{: .note} ## Starting your cluster @@ -72,14 +70,14 @@ You'll need a special file, called a Compose file, that Docker Compose uses to d 1. In your terminal application, navigate to the directory containing the `docker-compose.yml` file you downloaded, [set up a custom admin password]({{site.url}}{{site.baseurl}}/install-and-configure/install-opensearch/docker/#setting-a-custom-admin-password), and run the following command to create and start the cluster as a background process: ```bash - docker-compose up -d + docker compose up -d ``` {% include copy.html %} -1. Confirm that the containers are running with the command `docker-compose ps`. You should see an output like the following: +1. Confirm that the containers are running with the command `docker compose ps`. You should see an output like the following: ```bash - $ docker-compose ps + $ docker compose ps NAME COMMAND SERVICE STATUS PORTS opensearch-dashboards "./opensearch-dashbo…" opensearch-dashboards running 0.0.0.0:5601->5601/tcp opensearch-node1 "./opensearch-docker…" opensearch-node1 running 0.0.0.0:9200->9200/tcp, 9300/tcp, 0.0.0.0:9600->9600/tcp, 9650/tcp diff --git a/_getting-started/search-data.md b/_getting-started/search-data.md index 8e4169fbaec..1043d60afb4 100644 --- a/_getting-started/search-data.md +++ b/_getting-started/search-data.md @@ -14,6 +14,8 @@ In OpenSearch, there are several ways to search data: - [Piped Processing Language (PPL)]({{site.url}}{{site.baseurl}}/search-plugins/sql/ppl/index/): The primary language used for observability in OpenSearch. PPL uses a pipe syntax that chains commands into a query. - [Dashboards Query Language (DQL)]({{site.url}}{{site.baseurl}}/dashboards/dql/): A simple text-based query language for filtering data in OpenSearch Dashboards. +This tutorial contains a brief introduction to searching using [query string queries](#query-string-queries) and [query DSL](#query-dsl). + ## Prepare the data For this tutorial, you'll need to index student data if you haven't done so already. You can start by deleting the `students` index (`DELETE /students`) and then sending the following bulk request: diff --git a/_im-plugin/append-only-index.md b/_im-plugin/append-only-index.md new file mode 100644 index 00000000000..a019eafd0ea --- /dev/null +++ b/_im-plugin/append-only-index.md @@ -0,0 +1,68 @@ +--- +layout: default +title: Append-only index +nav_order: 14 +--- + +# Append-only index + +An append-only index is an immutable index that only allows document ingestion (appending) while blocking all updates or deletions after initial document creation. When you enable the append-only setting for an index, OpenSearch prevents any modifications to existing documents. You can only add new documents to the index. 
+
+When you configure an index as append-only, the following operations return an error:
+
+- Document update call (Update API)
+- Document delete call (Delete API)
+- Update by query call
+- Delete by query call
+- Bulk API calls made with the update, delete, or upsert actions
+- Bulk API calls containing an index action with a custom document ID
+
+## Benefits
+
+Append-only indexes offer several advantages:
+
+- Optimized performance by eliminating costly update and delete operations
+- Optimized storage and segment merges by eliminating soft deletes and version tracking
+- Support for future optimizations like auto-rollovers and efficient warm tiering
+
+Append-only indexes are ideal for immutable workloads, such as those containing log, metric, observability, or security event data, where data is not modified once ingested.
+
+## Creating an append-only index
+
+The following request creates a new index named `my-append-only-index` with all updates disabled:
+
+```json
+PUT /my-append-only-index
+{
+  "settings": {
+    "index.append_only.enabled": true
+  }
+}
+```
+{% include copy-curl.html %}
+
+After an index is set to append-only, it cannot be changed to another index type.
+{: .warning}
+
+To append data from an existing index to a new append-only index, use the Reindex API. Because append-only indexes don't support custom document IDs, set `ctx._id` to `null` in a reindex script so that each reindexed document receives an autogenerated ID. This allows the documents to be added through reindexing.
+
+The following example reindexes documents from a source index (`my-source-index`) into the new append-only index:
+
+```json
+POST /_reindex
+{
+  "source": {
+    "index": "my-source-index"
+  },
+  "dest": {
+    "index": "my-append-only-index"
+  },
+  "script": {
+    "source": "ctx._id = null",
+    "lang": "painless"
+  }
+}
+```
+{% include copy-curl.html %}
diff --git a/_im-plugin/index-rollups/rollup-api.md b/_im-plugin/index-rollups/rollup-api.md
index 5064d2ac497..25e2e37de8e 100644
--- a/_im-plugin/index-rollups/rollup-api.md
+++ b/_im-plugin/index-rollups/rollup-api.md
@@ -34,6 +34,11 @@ PUT _plugins/_rollup/jobs/<rollup_id>?if_seq_no=1&if_primary_term=1 // Update
   "rollup": {
     "source_index": "nyc-taxi-data",
     "target_index": "rollup-nyc-taxi-data",
+    "target_index_settings":{
+      "index.number_of_shards": 1,
+      "index.number_of_replicas": 1,
+      "index.codec": "best_compression"
+    },
     "schedule": {
       "interval": {
         "period": 1,
@@ -92,26 +97,27 @@ PUT _plugins/_rollup/jobs/<rollup_id>?if_seq_no=1&if_primary_term=1 // Update

 You can specify the following options.

-Options | Description | Type | Required
-:--- | :--- |:--- |:--- |
-`source_index` | The name of the detector. | String | Yes
-`target_index` | Specify the target index that the rolled up data is ingested into. You can either create a new target index or use an existing index. The target index cannot be a combination of raw and rolled up data. This field supports dynamically generated index names like {% raw %}`rollup_{{ctx.source_index}}`{% endraw %}, where `source_index` cannot contain wildcards. | String | Yes
-`schedule` | Schedule of the index rollup job which can be an interval or a cron expression. | Object | Yes
-`schedule.interval` | Specify the frequency of execution of the rollup job. | Object | No
-`schedule.interval.start_time` | Start time of the interval. | Timestamp | Yes
-`schedule.interval.period` | Define the interval period. | String | Yes
-`schedule.interval.unit` | Specify the time unit of the interval. | String | Yes
-`schedule.interval.cron` | Optionally, specify a cron expression to define therollup frequency. | List | No
-`schedule.interval.cron.expression` | Specify a Unix cron expression. | String | Yes
-`schedule.interval.cron.timezone` | Specify timezones as defined by the IANA Time Zone Database. Defaults to UTC. | String | No
-`description` | Optionally, describe the rollup job. | String | No
-`enabled` | When true, the index rollup job is scheduled. Default is `true`. | Boolean | Yes
-`continuous` | Specify whether or not the index rollup job continuously rolls up data forever or executes over the current dataset once and stops. Default is `false`. | Boolean | Yes
-`error_notification` | Set up a Mustache message template for error notifications. For example, if an index rollup job fails, the system sends a message to a Slack channel. | Object | No
-`page_size` | Specify the number of buckets to paginate at a time during rollup. | Number | Yes
-`delay` | The number of milliseconds to delay execution of the index rollup job. | Long | No
-`dimensions` | Specify aggregations to create dimensions for the roll up time window. Supported groups are `terms`, `histogram`, and `date_histogram`. For more information, see [Bucket Aggregations]({{site.url}}{{site.baseurl}}/opensearch/bucket-agg). | Array | Yes
-`metrics` | Specify a list of objects that represent the fields and metrics that you want to calculate. Supported metrics are `sum`, `max`, `min`, `value_count` and `avg`. For more information, see [Metric Aggregations]({{site.url}}{{site.baseurl}}/opensearch/metric-agg). | Array | No
+Options | Description | Type | Required
+:--- | :--- | :--- | :---
+`source_index` | The name of the source index. | String | Yes
+`target_index` | Specify the target index that the rolled up data is ingested into. You can either create a new target index or use an existing index. The target index cannot be a combination of raw and rolled up data. This field supports dynamically generated index names like {% raw %}`rollup_{{ctx.source_index}}`{% endraw %}, where `source_index` cannot contain wildcards. | String | Yes
+`target_index_settings` | Specify any [index settings]({{site.url}}{{site.baseurl}}/im-plugin/index-settings/) to be applied to the target index created during the rollup. | Object | No
+`schedule` | The schedule of the index rollup job, which can be an interval or a cron expression. | Object | Yes
+`schedule.interval` | Specify the frequency of execution of the rollup job. | Object | No
+`schedule.interval.start_time` | The start time of the interval. | Timestamp | Yes
+`schedule.interval.period` | Define the interval period. | String | Yes
+`schedule.interval.unit` | Specify the time unit of the interval. | String | Yes
+`schedule.interval.cron` | Optionally, specify a cron expression to define the rollup frequency. | List | No
+`schedule.interval.cron.expression` | Specify a Unix cron expression. | String | Yes
+`schedule.interval.cron.timezone` | Specify time zones as defined by the IANA Time Zone Database. Defaults to UTC. | String | No
+`description` | Optionally, describe the rollup job. | String | No
+`enabled` | When `true`, the index rollup job is scheduled. Default is `true`. | Boolean | Yes
+`continuous` | Specify whether the index rollup job rolls up data continuously or executes over the current dataset once and stops. Default is `false`. | Boolean | Yes
+`error_notification` | Set up a Mustache message template for error notifications. For example, if an index rollup job fails, the system sends a message to a Slack channel. | Object | No
+`page_size` | Specify the number of buckets to paginate at a time during the rollup. | Number | Yes
+`delay` | The number of milliseconds by which to delay execution of the index rollup job. | Long | No
+`dimensions` | Specify aggregations to create dimensions for the rollup time window. Supported groups are `terms`, `histogram`, and `date_histogram`. For more information, see [Bucket Aggregations]({{site.url}}{{site.baseurl}}/opensearch/bucket-agg). | Array | Yes
+`metrics` | Specify a list of objects that represent the fields and metrics that you want to calculate. Supported metrics are `sum`, `max`, `min`, `value_count`, and `avg`. For more information, see [Metric Aggregations]({{site.url}}{{site.baseurl}}/opensearch/metric-agg). | Array | No

 #### Example response

diff --git a/_im-plugin/index-transforms/transforms-apis.md b/_im-plugin/index-transforms/transforms-apis.md
index 37d2c035b50..7e0803c38b4 100644
--- a/_im-plugin/index-transforms/transforms-apis.md
+++ b/_im-plugin/index-transforms/transforms-apis.md
@@ -177,8 +177,8 @@ The update operation supports the following query parameters:

 Parameter | Description | Required
 :---| :--- | :---
-`seq_no` | Only perform the transform operation if the last operation that changed the transform job has the specified sequence number. | Yes
-`primary_term` | Only perform the transform operation if the last operation that changed the transform job has the specified sequence term. | Yes
+`if_seq_no` | Only perform the transform operation if the last operation that changed the transform job has the specified sequence number. | Yes
+`if_primary_term` | Only perform the transform operation if the last operation that changed the transform job has the specified primary term. | Yes

 ### Request body fields

diff --git a/_im-plugin/ism/policies.md b/_im-plugin/ism/policies.md
index 27c37e67ea7..9a0c15129e6 100644
--- a/_im-plugin/ism/policies.md
+++ b/_im-plugin/ism/policies.md
@@ -108,9 +108,11 @@ ISM supports the following operations:
 - [rollover](#rollover)
 - [notification](#notification)
 - [snapshot](#snapshot)
+- [convert_index_to_remote](#convert_index_to_remote)
 - [index_priority](#index_priority)
 - [allocation](#allocation)
 - [rollup](#rollup)
+- [stop_replication](#stop_replication)

 ### force_merge

@@ -406,6 +408,33 @@ Parameter | Description | Type | Required | Default
 }
 ```

+### convert_index_to_remote
+
+Converts an index from a local snapshot repository to a remote repository.
+
+The `convert_index_to_remote` operation has the following parameters.
+
+Parameter | Description | Type | Required | Default
+:--- | :--- | :--- | :--- | :---
+`repository` | The repository name registered through the native snapshot API operations. | `string` | Yes | N/A
+`snapshot` | The snapshot name created through the snapshot action. | `string` | Yes | N/A
+
+Make sure that the repository name used in the `convert_index_to_remote` operation matches the repository name specified during the snapshot action.
Additionally, you can reference the snapshot using `{{ctx.index}}`, as shown in the following example policy: + +```json +{ + "snapshot": { + "repository": "my_backup", + "snapshot": "{{ctx.index}}" + }, + "convert_index_to_remote": { + "repository": "my_backup", + "snapshot": "{{ctx.index}}" + } +} +``` +{% include copy.html %} + ### index_priority Set the priority for the index in a specific state. Unallocated shards of indexes are recovered in the order of their priority, whenever possible. The indexes with higher priority values are recovered first followed by the indexes with lower priority values. @@ -457,7 +486,22 @@ Parameter | Description | Type | Required Rollup jobs can be continuous or non-continuous. A rollup job created using an ISM policy can only be non-continuous. {: .note } -#### Path and HTTP methods +### stop_replication + +Stops replication and converts the follower index to a regular index. + +```json +{ + "stop_replication": {} +} +``` + +When cross-cluster replication is enabled, the follower index becomes read-only, preventing all write operations. To manage replicated indexes on a follower cluster, you can perform the `stop_replication` action before performing other write operations. For example, you can define a policy that first runs `stop_replication` and then deletes the index by running a `delete` action. + +If security is enabled, in addition to [stop replication permissions]({{site.url}}{{site.baseurl}}/tuning-your-cluster/replication-plugin/permissions/#replication-permissions), you must have the `indices:internal/plugins/replication/index/stop` permission in order to use the `stop_replication` action. +{: .note} + +#### Endpoints ````bash PUT _plugins/_rollup/jobs/<rollup_id> @@ -484,6 +528,11 @@ GET _plugins/_rollup/jobs/<rollup_id>/_explain "ism_rollup": { "description": "Creating rollup through ISM", "target_index": "target", + "target_index_settings":{ + "index.number_of_shards": 1, + "index.number_of_replicas": 1, + "index.codec": "best_compression" + }, "page_size": 1000, "dimensions": [ { diff --git a/_includes/cards.html b/_includes/cards.html index 6d958e61a51..61a0a6c2a06 100644 --- a/_includes/cards.html +++ b/_includes/cards.html @@ -1,37 +1,24 @@ <div class="card-container-wrapper"> - <p class="heading-main">Explore OpenSearch documentation</p> - <div class="card-container"> - <div class="card"> - <a href="{{site.url}}{{site.baseurl}}/about/" class='card-link'></a> - <p class="heading">OpenSearch and OpenSearch Dashboards</p> - <p class="description">Build your OpenSearch solution using core tooling and visualizations</p> - <p class="last-link">Documentation →</p> - </div> - - - <div class="card"> - <a href="{{site.url}}/docs/latest/data-prepper/" class='card-link'></a> - <p class="heading">Data Prepper</p> - <p class="description">Filter, mutate, and sample your data for ingestion into OpenSearch</p> - <p class="last-link" >Documentation →</p> - </div> - - <div class="card"> - <a href="{{site.url}}/docs/latest/clients/" class='card-link'></a> - <p class="heading">Clients</p> - <p class="description">Interact with OpenSearch from your application using language APIs</p> - <p class="last-link">Documentation →</p> - </div> - - - <div class="card"> - <a href="{{site.url}}/docs/latest/benchmark/" class='card-link'></a> - <p class="heading">OpenSearch Benchmark</p> - <p class="description">Measure performance metrics for your OpenSearch cluster</p> - <p class="last-link">Documentation →</p> - </div> + <div class="card-container"> + {% for card in 
include.cards %} + <div class="card"> + <a href="{{ site.url }}{{ site.baseurl }}{{ card.link }}" class="card-link"></a> + <p class="heading">{{ card.heading }}</p> + {% if card.description %} + <p class="description">{{ card.description }}</p> + {% endif %} + {% if card.list %} + <ul> + {% for item in card.list %} + <li class="description">{{ item }}</li> + {% endfor %} + </ul> + {% endif %} + {% if include.documentation_link %} + <p class="last-link">Documentation →</p> + {% endif %} + </div> + {% endfor %} + </div> </div> - -</div> - - + \ No newline at end of file diff --git a/_includes/code-block.html b/_includes/code-block.html new file mode 100644 index 00000000000..f42f0cb0a5d --- /dev/null +++ b/_includes/code-block.html @@ -0,0 +1,46 @@ +{% assign languages = site.data.code_languages.languages %} +<div class="code-tabs"> + <div class="tab-nav"> + {% for lang in languages %} + {% assign lang_content = include[lang.id] %} + {% if lang_content != nil %} + <button class="tab-button{% if forloop.first %} active{% endif %}" data-action="switch_tab" + data-tab="{{ lang.id }}">{{ lang.name }}</button> + {% endif %} + {% endfor %} + </div> + + {% if include.description %} + <div class="code-description"> + {{ include.description }} + </div> + {% endif %} + + {% for lang in languages %} + {% assign lang_content = include[lang.id] %} + {% if lang_content != nil %} + <div id="{{ lang.id }}" class="tab{% if forloop.first %} active{% endif %}"> + <div class="code-container"> + {%- case lang.id -%} + {%- when 'rest' -%}{% highlight json %}{{ lang_content | strip }}{% endhighlight -%} + {%- when 'python' -%}{% highlight python %}{{ lang_content | strip }}{% endhighlight -%} + {%- when 'java' -%}{% highlight java %}{{ lang_content | strip }}{% endhighlight -%} + {%- when 'javascript' -%}{% highlight javascript %}{{ lang_content | strip }}{% endhighlight -%} + {%- when 'go' -%}{% highlight go %}{{ lang_content | strip }}{% endhighlight -%} + {%- when 'ruby' -%}{% highlight ruby %}{{ lang_content | strip }}{% endhighlight -%} + {%- when 'php' -%}{% highlight php %}{{ lang_content | strip }}{% endhighlight -%} + {%- when 'dotnet' -%}{% highlight cs %}{{ lang_content | strip }}{% endhighlight -%} + {%- when 'rust' -%}{% highlight rust %}{{ lang_content | strip }}{% endhighlight -%} + {%- else -%}{% highlight text %}{{ lang_content | strip }}{% endhighlight -%} + {%- endcase -%} + <div class="button-container"> + <button class="copy-button copy-code-button" data-action="copy_code">Copy</button> + {% if lang.id == "rest" %} + <button class="copy-button copy-curl-button" data-action="copy_as_curl">Copy as cURL</button> + {% endif %} + </div> + </div> + </div> + {% endif %} + {% endfor %} +</div> \ No newline at end of file diff --git a/_includes/feedback.html b/_includes/feedback.html index 95ccfd116b4..b4c57824ee6 100644 --- a/_includes/feedback.html +++ b/_includes/feedback.html @@ -1,16 +1,16 @@ <div class="div-feedback"> <div class="feedback-header">WAS THIS PAGE HELPFUL?</div> <div class="feedback-radio-div"> - <input id="yes" type="radio" name="radio-button" value="yes" class="feedback-button"> + <input id="yes" type="radio" name="radio-button" value="yes" class="feedback-button" data-action="enable_send_button"> <label class="feedback-radio" for="yes">✔ Yes</label> - <input id="no" type="radio" name="radio-button" value="no" class="feedback-button"> + <input id="no" type="radio" name="radio-button" value="no" class="feedback-button" data-action="enable_send_button"> <label class="feedback-radio" 
for="no">✖ No</label> </div> <div class="feedback-text-header">Tell us why</div> <textarea id="comment" class="feedback-text" placeholder="Enter comment" maxlength="350"></textarea> <div class="text-small num-chars" id="num-chars">350 characters left</div> - <button id="send" class="send-button" disabled>Send</button> + <button id="send" class="send-button" data-action="send_feedback" disabled>Send</button> <p class="text-small text-grey-dk-100 hidden" id="thank-you">Thank you for your feedback!</p> - <p class="text-small text-grey-dk-100">Have a question? <a class="feedback-forum" target="_blank" href="https://forum.opensearch.org/">Ask us on the OpenSearch forum</a>.</p> - <p class="text-small text-grey-dk-100">Want to contribute? <a class="feedback-edit" target="_blank" href="https://github.com/opensearch-project/documentation-website/edit/main/{{ page.path }}">Edit this page</a> or <a class="feedback-issue" target="_blank" href="https://github.com/opensearch-project/documentation-website/issues/new?assignees=&labels=untriaged&template=issue_template.md&title=%5BDOC%5D">create an issue</a>.</p> + <p class="text-small text-grey-dk-100">Have a question? <a class="feedback-forum" data-action="forum_link_click" target="_blank" href="https://forum.opensearch.org/">Ask us on the OpenSearch forum</a>.</p> + <p class="text-small text-grey-dk-100">Want to contribute? <a class="feedback-edit" data-action="edit_page_click" target="_blank" href="https://github.com/opensearch-project/documentation-website/edit/main/{{ page.path }}">Edit this page</a> or <a class="feedback-issue" data-action="submit_issue_click" target="_blank" href="https://github.com/opensearch-project/documentation-website/issues/new?assignees=&labels=untriaged&template=issue_template.md&title=%5BDOC%5D">create an issue</a>.</p> </div> \ No newline at end of file diff --git a/_includes/footer.html b/_includes/footer.html index 801b8c126eb..73a649f2dc5 100644 --- a/_includes/footer.html +++ b/_includes/footer.html @@ -69,12 +69,8 @@ <h4>{{ column.title }}</h4> </div> </div> <div class="footer--legal-rows-wrapper--row"> - © OpenSearch contributors, {{ 'now' | date: "%Y" }}. OpenSearch is a <a href="/trademark-brand-policy.html">registered trademark</a> of Amazon Web Services.</a> <br /><br /> - {% if page.notice == true or layout.notice == true %} OpenSearch includes certain Apache-licensed Elasticsearch code from Elasticsearch B.V. and other source code. Elasticsearch B.V. is not the source of that other source code. ELASTICSEARCH is a registered trademark of Elasticsearch B.V. <br /><br /> {% endif %} - - © 2005-2021 <a href="https://www.djangoproject.com/foundation/"> Django Software Foundation</a> and individual contributors. Django is a - <a href="https://www.djangoproject.com/trademarks/">registered trademark</a> of the Django Software Foundation.<br /> - This website was forked from the BSD-licensed <a href="https://github.com/django/djangoproject.com/">djangoproject.com</a> originally designed by <a href="https://www.threespot.com">Threespot</a> <span class="ampersand">&</span> <a href="https://andrevv.com/">andrevv</a>. + Copyright © OpenSearch Project a Series of LF Projects, LLC<br> + For web site terms of use, trademark policy and other project policies please see <a href="https://lfprojects.org">https://lfprojects.org</a>. 
</div> </div> </div> diff --git a/_includes/head_custom.html b/_includes/head_custom.html index a6bc7b7a3ba..dfbb473d281 100755 --- a/_includes/head_custom.html +++ b/_includes/head_custom.html @@ -3,14 +3,13 @@ {% endif %} {% if page.has_math == true %} - <script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script> <script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3.0.1/es5/tex-mml-chtml.js"></script> {% endif %} {% if jekyll.environment == "development" %} <script src="{{ '/assets/js/version-selector.js' | relative_url }}"></script> {% else %} - <script src="{{ '/docs/latest/assets/js/version-selector.js' }}"></script> + <script src="{{site.baseurl}}/assets/js/version-selector.js"></script> {% endif %} <!-- Last-Modified: {% last_modified_at %} --> diff --git a/_includes/header.html b/_includes/header.html index 20d82c451e0..32d5b147746 100644 --- a/_includes/header.html +++ b/_includes/header.html @@ -82,7 +82,7 @@ {% endif %} <div role="banner" id="top"> <div class="navigation-container"> - <a class="navigation-container--logo" href="{{ '/' | relative_url }}"> + <a class="navigation-container--logo" href="https://opensearch.org/"> OpenSearch <svg width="200" height="39" viewBox="0 0 200 39" fill="none" xmlns="http://www.w3.org/2000/svg"> <g clip-path="url(#clip0_723_1352)"> diff --git a/_includes/home_cards.html b/_includes/home_cards.html new file mode 100644 index 00000000000..098472d4c85 --- /dev/null +++ b/_includes/home_cards.html @@ -0,0 +1,72 @@ +<div class="home-card-container-wrapper"> + <p class="heading-main">OpenSearch and OpenSearch Dashboards</p> + <div class="home-card-container"> + <div class="home-card"> + <a href="{{site.url}}{{site.baseurl}}/about/" class='card-link'></a> + <p class="heading">All documentation</p> + <p class="description">Build your OpenSearch solution using core tooling and visualizations.</p> + <p class="last-link">Documentation →</p> + </div> + + + <div class="home-card"> + <a href="{{site.url}}{{site.baseurl}}/vector-search/" class='card-link'></a> + <p class="heading">Vector search</p> + <p class="description">Use vector database capabilities for more relevant search results.</p> + <p class="last-link" >Documentation →</p> + </div> + + <div class="home-card"> + <a href="{{site.url}}{{site.baseurl}}/ml-commons-plugin/" class='card-link'></a> + <p class="heading">Machine learning</p> + <p class="description">Power your applications with machine learning model integration.</p> + <p class="last-link">Documentation →</p> + </div> + + + <div class="home-card"> + <a href="{{site.url}}{{site.baseurl}}/dashboards/" class='card-link'></a> + <p class="heading">OpenSearch Dashboards</p> + <p class="description">Explore and visualize your data using interactive dashboards.</p> + <p class="last-link">Documentation →</p> + </div> + </div> + +</div> + +<div class="home-card-container-wrapper"> + <p class="heading-main">Supporting tools</p> + <div class="home-card-container"> + + <div class="home-card"> + <a href="{{site.url}}{{site.latesturl}}/data-prepper/" class='card-link'></a> + <p class="heading">Data Prepper</p> + <p class="description">Filter, mutate, and sample your data for ingestion into OpenSearch.</p> + <p class="last-link" >Documentation →</p> + </div> + + <div class="home-card"> + <a href="{{site.url}}{{site.latesturl}}/clients/" class='card-link'></a> + <p class="heading">Clients</p> + <p class="description">Interact with OpenSearch from your application using language APIs.</p> + <p 
class="last-link">Documentation →</p> + </div> + + + <div class="home-card"> + <a href="{{site.url}}{{site.latesturl}}/benchmark/" class='card-link'></a> + <p class="heading">OpenSearch Benchmark</p> + <p class="description">Measure OpenSearch cluster performance metrics.</p> + <p class="last-link">Documentation →</p> + </div> + + <div class="home-card"> + <a href="{{site.url}}{{site.latesturl}}/migration-assistant/" class='card-link'></a> + <p class="heading">Migration Assistant</p> + <p class="description">Migrate to OpenSearch.</p> + <p class="last-link">Documentation →</p> + </div> + </div> + +</div> + diff --git a/_includes/list.html b/_includes/list.html new file mode 100644 index 00000000000..c32fcdd0c58 --- /dev/null +++ b/_includes/list.html @@ -0,0 +1,22 @@ +<div class="numbered-list"> + {% if include.list_title %} + <div class="heading">{{ include.list_title }}</div> + {% endif %} + {% assign counter = 0 %} + {% for item in include.list_items %} + {% assign counter = counter | plus: 1 %} + <div class="list-item"> + <div class="number-circle">{{ counter }}</div> + <div class="list-content"> + <div class="list-heading"> + {% if item.link %} + <a href="{{ site.url }}{{ site.baseurl }}{{ item.link }}">{{ item.heading }}</a> + {% else %} + {{ item.heading }} + {% endif %} + </div> + <p class="description">{{ item.description | markdownify }}</p> + </div> + </div> + {% endfor %} +</div> diff --git a/_includes/redesign_buttons.html b/_includes/redesign_buttons.html deleted file mode 100644 index 0a6bc2f5b98..00000000000 --- a/_includes/redesign_buttons.html +++ /dev/null @@ -1,176 +0,0 @@ -{% case include.name %} - {% when 'download' %} - <label for="download-button" class="redesign-buttons--label">Download OpenSearch</label> - <div class="redesign-button--wrapper redesign-button--wrapper__complex-content"> - <a href="/downloads" class="redesign-button--anchor" id="download-button"> - <div class="redesign-button--contents redesign-buttons--download__large"> - <div class="redesign-button--contents--text-slot"> - <span class="download-button--mouseout-logo redesign-button--contents__mouseout"> - {%- include icons.html type='opensearch-logo-darkmode' -%} - </span> - <span class="download-button--mouseover-logo redesign-button--contents__mouseover"> - {%- include icons.html type='opensearch-logo-darkmode-2' -%} - </span> - <span class="download-button--disabled-logo redesign-button--contents__disabled"> - {%- include icons.html type='opensearch-logo-monochrome-1' %} - </span> - </div> - <div class="redesign-button--contents--icon-slot"> - <span class="download-button--mouseout-icon redesign-button--contents__mouseout"> - {%- include icons.html type='angle-double-down' -%} - </span> - <span class="download-button--mouseover-icon redesign-button--contents__mouseover"> - {%- include icons.html type='angle-double-down-2' -%} - </span> - <span class="download-button--disabled-icon redesign-button--contents__disabled"> - {%- include icons.html type='angle-double-down-monochrome-1' -%} - </span> - </div> - </div> - </a> - </div> - {% when 'documentation' %} - <label for="documentation-button" class="redesign-buttons--label">Learn More</label> - <div class="redesign-button--wrapper redesign-button--wrapper__text-only"> - <a href="https://opensearch.org/docs/latest/" class="redesign-button--anchor" id="documentation-button"> - Documentation - </a> - </div> - {% when 'documentation-dark' %} - <label for="documentation-button" class="redesign-buttons--label">Learn More</label> - <div 
class="redesign-button--wrapper redesign-button--wrapper__text-only__dark"> - <a href="https://opensearch.org/docs/latest/" class="redesign-button--anchor" id="documentation-button"> - Documentation - </a> - </div> - {% when 'community' %} - <label class="redesign-buttons--label">Join Our Community</label> - <div class="redesign-button-pair--wrapper"> - <div class="redesign-button--wrapper redesign-button--wrapper__text-only"> - <a href="https://forum.opensearch.org/" class="redesign-button--anchor"> - Forum - </a> - </div> - <div class="redesign-button--wrapper redesign-button--wrapper__text-only"> - <a href="/slack.html" class="redesign-button--anchor"> - Slack - </a> - </div> - </div> - {% when 'community-dark' %} - <label class="redesign-buttons--label">Join Our Community</label> - <div class="redesign-button-pair--wrapper"> - <div class="redesign-button--wrapper redesign-button--wrapper__text-only__dark"> - <a href="https://forum.opensearch.org/" class="redesign-button--anchor"> - Forum - </a> - </div> - <div class="redesign-button--wrapper redesign-button--wrapper__text-only__dark"> - <a href="/slack.html" class="redesign-button--anchor"> - Slack - </a> - </div> - </div> - {% when 'project' %} - <label for="project-button" class="redesign-buttons--label">Check Out Github</label> - <div class="redesign-button--wrapper redesign-button--wrapper__complex-content"> - <a id="project-button" - href="https://github.com/opensearch-project" - target="_blank" - class="redesign-button--anchor" - > - <div class="redesign-button--contents redesign-buttons--project__large"> - <div class="redesign-button--contents--text-slot"> - <span class="opensearch-button--label__combined-text-icon project-button--mouseout-logo redesign-button--contents__mouseout"> - <span class="opensearch-button--label__combined-text-icon--icon"> - {%- include icons.html type='github' -%} - </span> - <span class="opensearch-button--label__combined-text-icon--text">OpenSearch Project Org</span> - </span> - <span class="opensearch-button--label__combined-text-icon project-button--mouseover-logo redesign-button--contents__mouseover"> - <span class="opensearch-button--label__combined-text-icon--icon"> - {%- include icons.html type='github-mouseover' -%} - </span> - <span class="opensearch-button--label__combined-text-icon--text">OpenSearch Project Org</span> - </span> - <span class="opensearch-button--label__combined-text-icon project-button--disabled-logo redesign-button--contents__disabled"> - <span class="opensearch-button--label__combined-text-icon--icon"> - {%- include icons.html type='github-disabled' -%} - </span> - <span class="opensearch-button--label__combined-text-icon--text">OpenSearch Project Org</span> - </span> - </div> - <div class="redesign-button--contents--icon-slot"> - <span class="project-button--mouseout-icon redesign-button--contents__mouseout"> - {%- include icons.html type='chevron-right-circle' -%} - </span> - <span class="project-button--mouseover-icon redesign-button--contents__mouseover"> - {%- include icons.html type='chevron-right-circle-mouseover' -%} - </span> - <span class="project-button--disabled-icon redesign-button--contents__disabled"> - {%- include icons.html type='chevron-right-circle-disabled' -%} - </span> - </div> - </div> - </a> - </div> - {% when 'expand-collapse-toggle' %} - <div class="opensearch-toggle-button--wrapper"> - <a href="#" class="opensearch-toggle-button-link opensearch-toggle-button-link--untoggled opensearch-toggle-button-link__visible"> - {% include icons.html 
type='plus' %} - </a> - <a href="#" class="opensearch-toggle-button-link opensearch-toggle-button-link--toggled opensearch-toggle-button-link__invisible"> - {% include icons.html type='minus' %} - </a> - </div> - {% when 'opensearchcon-stay-informed' %} - <div class="redesign-button--wrapper redesign-button--wrapper__text-with-icon"> - <a href="https://opensearchcon2024interest.splashthat.com/" class="redesign-button--anchor" id="opensearchcon-stay-informed-button"> - <div class="redesign-button--contents redesign-buttons--opensearchcon-stay-informed__large"> - <div class="redesign-button--contents--icon-slot"> - <span class="opensearchcon-stay-informed-button--mouseout-logo redesign-button--contents__mouseout"> - {%- include icons.html type='opensearch-mark-darkmode' -%} - </span> - </div> - <div class="redesign-button--contents--text-slot">Stay Informed</div> - </div> - </a> - </div> - {% when 'opensearchcon-archive-2023' %} - <div class="redesign-button--wrapper redesign-button--wrapper__text-only"> - <a href="/events/opensearchcon/2023/north-america/index.html" class="redesign-button--anchor" id="opensearchcon-archive-2023-button"> - OpenSearchCon 2023 - </a> - </div> - {% when 'opensearchcon-archive-2022' %} - <div class="redesign-button--wrapper redesign-button--wrapper__text-only"> - <a href="/events/opensearchcon/2022/north-america/index.html" class="redesign-button--anchor" id="opensearchcon-archive-2022-button"> - OpenSearchCon 2022 - </a> - </div> - {% when 'upcoming-events-dark' %} - <div class="redesign-button--wrapper redesign-button--wrapper__text-only__dark"> - <a href="/events/calendars/2024-01.html" class="redesign-button--anchor events-page-menu-link__device-based" id="upcoming-opensearch-events-button"> - Upcoming OpenSearch Events - </a> - </div> - {% when 'opensearchcons' %} - <div class="redesign-button--wrapper redesign-button--wrapper__text-with-icon"> - <a href="/events/opensearchcon/index.html" class="redesign-button--anchor" id="opensearchcon-stay-informed-button"> - <div class="redesign-button--contents redesign-buttons--opensearchcon-stay-informed__large"> - <div class="redesign-button--contents--icon-slot"> - <span class="opensearchcon-stay-informed-button--mouseout-logo redesign-button--contents__mouseout"> - {%- include icons.html type='opensearch-mark-darkmode' -%} - </span> - </div> - <div class="redesign-button--contents--text-slot">OpenSearchCon</div> - </div> - </a> - </div> - {% when 'join-slack' %} - <div class="redesign-button--wrapper redesign-button--wrapper__text-only__dark"> - <a target="_blank" href="https://join.slack.com/t/opensearch/shared_invite/zt-2bh052go7-1s_~5G1_alt9aLPmoRAINQ" class="redesign-button--anchor" id="join-slack-button"> - Join the Conversation - </a> - </div> -{% endcase %} diff --git a/_includes/youtube-player.html b/_includes/youtube-player.html index 05981ca8525..cc9172d2439 100644 --- a/_includes/youtube-player.html +++ b/_includes/youtube-player.html @@ -6,10 +6,8 @@ <div class="embed-container"> <iframe src="https://www.youtube.com/embed/{{ include.id }}" - width="640" - height="385" frameborder="0" allowfullscreen="true" allow="accelerometer; clipboard-write; encrypted-media; gyroscope; picture-in-picture"> </iframe> - </div> \ No newline at end of file +</div> \ No newline at end of file diff --git a/_ingest-pipelines/accessing-data.md b/_ingest-pipelines/accessing-data.md new file mode 100644 index 00000000000..d0a4af214a9 --- /dev/null +++ b/_ingest-pipelines/accessing-data.md @@ -0,0 +1,269 @@ +--- +layout: 
default +title: Access data in a pipeline +nav_order: 20 +--- + +# Access data in a pipeline + +In ingest pipelines, you can access the document data using the `ctx` object. This object represents the processed document and allows you to read, modify, or enrich the document fields. Pipeline processors have read and write access to both the `_source` field of a document and its metadata fields. + +## Accessing document fields + +The `ctx` object exposes all document fields. You can access them directly using dot notation. + +### Example: Access a top-level field + +Given the following example document: + +```json +{ + "user": "alice" +} +``` + +You can access `user` as follows: + +```json +"field": "ctx.user" +``` + +### Example: Access a nested field + +Given the following example document: + +```json +{ + "user": { + "name": "alice" + } +} +``` + +You can access `user.name` as follows: + +```json +"field": "ctx.user.name" +``` + +## Accessing a field in the source + +To access a field in the document `_source`, refer to the field by its name: + +```json +{ + "set": { + "field": "environment", + "value": "production" + } +} +``` + +Alternatively, you can explicitly use `_source`: + +```json +{ + "set": { + "field": "_source.environment", + "value": "production" + } +} +``` + +## Accessing metadata fields + +You can read or write to metadata fields such as the following: + +- `_index` +- `_type` +- `_id` +- `_routing` + +### Example: Set `_routing` dynamically + +```json +{ + "set": { + "field": "_routing", + "value": "{% raw %}{{region}}{% endraw %}" + } +} +``` + + +## Accessing ingest metadata fields + +The `_ingest.timestamp` field represents the time at which the ingest node received the document. To persist this timestamp, use the `set` processor: + +```json +{ + "set": { + "field": "received_at", + "value": "{% raw %}{{_ingest.timestamp}}{% endraw %}" + } +} +``` + +## Using `ctx` in Mustache templates + +Use Mustache templates to insert field values into processor settings. Use triple curly braces ({% raw %}`{{{` and `}}}`{% endraw %}) for unescaped field values. + +### Example: Combining source fields + +The following processor configuration combines the `app` and `env` fields, separated by an underscore (_), and stores the result in the `log_label` field: + +```json +{ + "set": { + "field": "log_label", + "value": "{% raw %}{{{app}}}_{{{env}}}{% endraw %}" + } +} +``` + +### Example: Generating a dynamic greeting using the `set` processor + +If a document's `user` field is set to `alice`, use the following syntax to produce the result `"greeting": "Hello, alice!"`: + +```json +{ + "set": { + "field": "greeting", + "value": "Hello, {% raw %}{{{user}}}{% endraw %}!" + } +} +``` + +## Dynamic field names + +You can use a field's value as the name of a new field: + +```json +{ + "set": { + "field": "{% raw %}{{service}}{% endraw %}", + "value": "{% raw %}{{code}}{% endraw %}" + } +} +``` + +## Example: Routing to a dynamic index based on status + +The following processor configuration sets the target index dynamically by appending `-events` to the value of the `status` field: + +```json +{ + "set": { + "field": "_index", + "value": "{% raw %}{{status}}{% endraw %}-events" + } +} +``` + +## Using `ctx` in the `script` processor + +Use the `script` processor for advanced transformations. 
+
+### Example: Adding a field only if another is missing
+
+The following processor adds the `error_message` field with the value "none" only if the field is missing from the document:
+
+```json
+{
+  "script": {
+    "lang": "painless",
+    "source": "if (ctx.error_message == null) { ctx.error_message = 'none'; }"
+  }
+}
+```
+
+### Example: Copying a value from one field to another
+
+The following processor copies the value from the `timestamp` field into a new field called `event_time`:
+
+```json
+{
+  "script": {
+    "lang": "painless",
+    "source": "ctx.event_time = ctx.timestamp;"
+  }
+}
+```
+
+## Example of a complete pipeline
+
+The following example defines a complete ingest pipeline that sets a tagline by combining source fields, extracts the `year` from the `date` field, and records the document's ingest timestamp in the `received_at` field:
+
+```json
+PUT _ingest/pipeline/example-pipeline
+{
+  "description": "Sets a tagline, extracts the year, and records the ingest timestamp",
+  "processors": [
+    {
+      "set": {
+        "field": "tagline",
+        "value": "{% raw %}{{{user.first}}} from {{{department}}}{% endraw %}"
+      }
+    },
+    {
+      "script": {
+        "lang": "painless",
+        "source": "ctx.year = ctx.date.substring(0, 4);"
+      }
+    },
+    {
+      "set": {
+        "field": "received_at",
+        "value": "{% raw %}{{_ingest.timestamp}}{% endraw %}"
+      }
+    }
+  ]
+}
+```
+
+To test the pipeline, use the following request:
+
+```json
+POST _ingest/pipeline/example-pipeline/_simulate
+{
+  "docs": [
+    {
+      "_source": {
+        "user": {
+          "first": "Liam"
+        },
+        "department": "Engineering",
+        "date": "2024-12-03T14:05:00Z"
+      }
+    }
+  ]
+}
+```
+
+The response shows the enriched document after processing, including the newly added `tagline`, the extracted `year`, and the `received_at` timestamp generated by the ingest pipeline:
+
+```json
+{
+  "docs": [
+    {
+      "doc": {
+        "_index": "_index",
+        "_id": "_id",
+        "_source": {
+          "user": {
+            "first": "Liam"
+          },
+          "department": "Engineering",
+          "date": "2024-12-03T14:05:00Z",
+          "tagline": "Liam from Engineering",
+          "year": "2024",
+          "received_at": "2025-04-14T18:40:00.000Z"
+        },
+        "_ingest": {
+          "timestamp": "2025-04-14T18:40:00.000Z"
+        }
+      }
+    }
+  ]
+}
+```
diff --git a/_ingest-pipelines/conditional-execution.md b/_ingest-pipelines/conditional-execution.md
new file mode 100644
index 00000000000..200d0d47436
--- /dev/null
+++ b/_ingest-pipelines/conditional-execution.md
@@ -0,0 +1,233 @@
+---
+layout: default
+title: Conditional execution
+nav_order: 40
+---
+
+# Conditional execution
+
+In ingest pipelines, you can control whether a processor runs by using the optional `if` parameter. This allows for conditional execution of processors based on the incoming document contents. The condition is written as a Painless script and evaluated against the document context (`ctx`).
+
+## Basic conditional execution
+
+Each processor can include an `if` clause. If the condition evaluates to `true`, the processor runs; otherwise, it's skipped.
+ +### Example: Drop debug-level logs + +The following pipeline drops any document in which the `log_level` field is equal to `debug`: + +```json +PUT _ingest/pipeline/drop_debug_logs +{ + "processors": [ + { + "drop": { + "if": "ctx.log_level == 'debug'" + } + } + ] +} +``` +{% include copy-curl.html %} + +### Example index request + +```json +POST logs/_doc/1?pipeline=drop_debug_logs +{ + "message": "User logged in", + "log_level": "debug" +} +``` +{% include copy-curl.html %} + +This document is dropped because the condition evaluates to `true`: + +```json +{ + "_index": "logs", + "_id": "1", + "_version": -3, + "result": "noop", + "_shards": { + "total": 0, + "successful": 0, + "failed": 0 + } +} +``` + +## Null-safe field checks when using nested fields + +When working with nested fields, it's important to avoid null pointer exceptions. Use the null-safe `?.` operator in Painless scripts. + +### Example: Drop documents based on a nested field + +The following drop processor executes only if the nested `app.env` field exists and equals `debug`: + +```json +PUT _ingest/pipeline/drop_debug_env +{ + "processors": [ + { + "drop": { + "if": "ctx.app?.env == 'debug'" + } + } + ] +} +``` +{% include copy-curl.html %} + +If the null-safe `?.` operator is not configured, indexing any document that doesn't contain the `app.env` field will trigger the following null pointer exception: + +```json +{ + "error": "IngestProcessorException[ScriptException[runtime error]; nested: NullPointerException[Cannot invoke \"Object.getClass()\" because \"callArgs[0]\" is null];]", + "status": 400 +} +``` + +## Handling flattened fields + +If your document has a flattened field, for example, `"app.env": "debug"`, use the [`dot_expander`]({{site.url}}{{site.baseurl}}/ingest-pipelines/processors/dot-expander/) processor to convert it into a nested structure: + +```json +PUT _ingest/pipeline/drop_debug_env +{ + "processors": [ + { + "dot_expander": { + "field": "app.env" + } + }, + { + "drop": { + "if": "ctx.app?.env == 'debug'" + } + } + ] +} +``` +{% include copy-curl.html %} + +## Safe method calls in conditions + +Avoid calling methods on potential null values. Use constants or null checks instead: + +```json +{ + "drop": { + "if": "ctx.app?.env != null && ctx.app.env.contains('debug')" + } +} +``` + +## Full example: Multi-step conditional pipeline + +The following ingest pipeline uses three processors: + +1. `set`: If no value is provided in the `user` field, sets the `user` field to `guest`. +2. `set`: If the `status_code` is provided and is higher than `400`, sets the `error` field to `true`. +3. `drop`: If the `app.env` field is equal to `debug`, drops the entire document. 
+ +```json +PUT _ingest/pipeline/logs_processing +{ + "processors": [ + { + "set": { + "field": "user", + "value": "guest", + "if": "ctx.user == null" + } + }, + { + "set": { + "field": "error", + "value": true, + "if": "ctx.status_code != null && ctx.status_code >= 400" + } + }, + { + "drop": { + "if": "ctx.app?.env == 'debug'" + } + } + ] +} +``` +{% include copy-curl.html %} + +### Simulate the pipeline + +The following simulation request applies the conditional logic to three documents: + +```json +POST _ingest/pipeline/logs_processing/_simulate +{ + "docs": [ + { + "_source": { + "message": "Successful login", + "status_code": 200 + } + }, + { + "_source": { + "message": "Database error", + "status_code": 500, + "user": "alice" + } + }, + { + "_source": { + "message": "Debug mode trace", + "app": { "env": "debug" } + } + } + ] +} +``` +{% include copy-curl.html %} + +The response demonstrates how the processors respond based on each condition: + +```json +{ + "docs": [ + { + "doc": { + "_index": "_index", + "_id": "_id", + "_source": { + "status_code": 200, + "message": "Successful login", + "user": "guest" + }, + "_ingest": { + "timestamp": "2025-04-16T14:04:35.923159885Z" + } + } + }, + { + "doc": { + "_index": "_index", + "_id": "_id", + "_source": { + "status_code": 500, + "message": "Database error", + "error": true, + "user": "alice" + }, + "_ingest": { + "timestamp": "2025-04-16T14:04:35.923198551Z" + } + } + }, + null + ] +} +``` + + diff --git a/_ingest-pipelines/index.md b/_ingest-pipelines/index.md index f0b52ea1520..43df68273c2 100644 --- a/_ingest-pipelines/index.md +++ b/_ingest-pipelines/index.md @@ -60,4 +60,5 @@ Learn how to: - [Test a pipeline]({{site.url}}{{site.baseurl}}/ingest-pipelines/simulate-ingest/). - [Retrieve information about a pipeline]({{site.url}}{{site.baseurl}}/ingest-pipelines/get-ingest/). - [Delete a pipeline]({{site.url}}{{site.baseurl}}/ingest-pipelines/delete-ingest/). -- [Use ingest processors in OpenSearch]({{site.url}}{{site.baseurl}}/ingest-pipelines/processors/index-processors/) \ No newline at end of file +- [Use ingest processors in OpenSearch]({{site.url}}{{site.baseurl}}/ingest-pipelines/processors/index-processors/) +- [Use conditional execution]({{site.url}}{{site.baseurl}}/ingest-pipelines/conditional-execution/) \ No newline at end of file diff --git a/_ingest-pipelines/pipeline-failures.md b/_ingest-pipelines/pipeline-failures.md index 47e75168ce2..d78684e9e73 100644 --- a/_ingest-pipelines/pipeline-failures.md +++ b/_ingest-pipelines/pipeline-failures.md @@ -1,7 +1,7 @@ --- layout: default title: Handling pipeline failures -nav_order: 15 +nav_order: 30 redirect_from: - /api-reference/ingest-apis/pipeline-failures/ --- diff --git a/_ingest-pipelines/processors/append.md b/_ingest-pipelines/processors/append.md index 8101cf97c99..aabe77a9bbe 100644 --- a/_ingest-pipelines/processors/append.md +++ b/_ingest-pipelines/processors/append.md @@ -30,7 +30,7 @@ The following is the syntax for the `append` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters @@ -43,6 +43,7 @@ Parameter | Required/Optional | Description | `description` | Optional | A brief description of the processor. | `if` | Optional | A condition for running the processor. | `ignore_failure` | Optional | Specifies whether the processor continues execution even if it encounters errors. If set to `true`, failures are ignored. Default is `false`. 
| +`allow_duplicates` | Optional | Specifies whether to append the values already contained in the field. If `true`, duplicate values are appended. Otherwise, they are skipped. | `on_failure` | Optional | A list of processors to run if the processor fails. | `tag` | Optional | An identifier tag for the processor. Useful for debugging in order to distinguish between processors of the same type. | diff --git a/_ingest-pipelines/processors/bytes.md b/_ingest-pipelines/processors/bytes.md index 29a30ca3b8d..aa770e9134a 100644 --- a/_ingest-pipelines/processors/bytes.md +++ b/_ingest-pipelines/processors/bytes.md @@ -22,7 +22,7 @@ The following is the syntax for the `bytes` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/community_id.md b/_ingest-pipelines/processors/community_id.md index c6f74d8af7f..2e06d3f6e2e 100644 --- a/_ingest-pipelines/processors/community_id.md +++ b/_ingest-pipelines/processors/community_id.md @@ -24,7 +24,7 @@ The following is the `community_id` processor syntax: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/convert.md b/_ingest-pipelines/processors/convert.md index c86f86c9a76..917ce6d83f0 100644 --- a/_ingest-pipelines/processors/convert.md +++ b/_ingest-pipelines/processors/convert.md @@ -26,7 +26,7 @@ The following is the syntax for the `convert` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/copy.md b/_ingest-pipelines/processors/copy.md index 03ee2279a5d..b6a34e12500 100644 --- a/_ingest-pipelines/processors/copy.md +++ b/_ingest-pipelines/processors/copy.md @@ -29,7 +29,7 @@ The following is the syntax for the `copy` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/csv.md b/_ingest-pipelines/processors/csv.md index 1d64fb01596..a4c46c93925 100644 --- a/_ingest-pipelines/processors/csv.md +++ b/_ingest-pipelines/processors/csv.md @@ -26,7 +26,7 @@ The following is the syntax for the `csv` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/date-index-name.md b/_ingest-pipelines/processors/date-index-name.md index e40b0d6ea61..2f661d62f60 100644 --- a/_ingest-pipelines/processors/date-index-name.md +++ b/_ingest-pipelines/processors/date-index-name.md @@ -19,7 +19,7 @@ The following is the syntax for the `date_index_name` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/date.md b/_ingest-pipelines/processors/date.md index a601cacbed4..75320c3d05e 100644 --- a/_ingest-pipelines/processors/date.md +++ b/_ingest-pipelines/processors/date.md @@ -26,7 +26,7 @@ The following is the syntax for the `date` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/dissect.md b/_ingest-pipelines/processors/dissect.md index 4a42b924237..e9976b265fe 100644 --- a/_ingest-pipelines/processors/dissect.md +++ b/_ingest-pipelines/processors/dissect.md @@ -24,7 +24,7 @@ The following is the syntax for the `dissect` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git 
a/_ingest-pipelines/processors/dot-expander.md b/_ingest-pipelines/processors/dot-expander.md index 5cfebba7583..35ad9c28f63 100644 --- a/_ingest-pipelines/processors/dot-expander.md +++ b/_ingest-pipelines/processors/dot-expander.md @@ -18,7 +18,7 @@ The following is the syntax for the `dot_expander` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/drop.md b/_ingest-pipelines/processors/drop.md index c7bfc3cd755..c1bab740274 100644 --- a/_ingest-pipelines/processors/drop.md +++ b/_ingest-pipelines/processors/drop.md @@ -25,7 +25,7 @@ The following is the syntax for the `drop` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/fail.md b/_ingest-pipelines/processors/fail.md index e529fee7991..ca164cda8ac 100644 --- a/_ingest-pipelines/processors/fail.md +++ b/_ingest-pipelines/processors/fail.md @@ -17,7 +17,7 @@ The following is the syntax for the `fail` processor: "message": "Custom error message" } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/fingerprint.md b/_ingest-pipelines/processors/fingerprint.md index 4775da98b65..342bdd755b3 100644 --- a/_ingest-pipelines/processors/fingerprint.md +++ b/_ingest-pipelines/processors/fingerprint.md @@ -24,7 +24,7 @@ The following is the syntax for the `fingerprint` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/foreach.md b/_ingest-pipelines/processors/foreach.md index d0f962e6185..45d56de9eb6 100644 --- a/_ingest-pipelines/processors/foreach.md +++ b/_ingest-pipelines/processors/foreach.md @@ -25,7 +25,7 @@ The following is the syntax for the `foreach` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/grok.md b/_ingest-pipelines/processors/grok.md index 5579dbda138..e597bb24eae 100644 --- a/_ingest-pipelines/processors/grok.md +++ b/_ingest-pipelines/processors/grok.md @@ -31,7 +31,7 @@ The following is the basic syntax for the `grok` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/gsub.md b/_ingest-pipelines/processors/gsub.md index 1619d98f814..f0fb857cf4c 100644 --- a/_ingest-pipelines/processors/gsub.md +++ b/_ingest-pipelines/processors/gsub.md @@ -18,7 +18,7 @@ The following is the syntax for the `gsub` processor: "replacement": "replacement_string" } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/html-strip.md b/_ingest-pipelines/processors/html-strip.md index ac33c45eae9..45d3810f058 100644 --- a/_ingest-pipelines/processors/html-strip.md +++ b/_ingest-pipelines/processors/html-strip.md @@ -18,7 +18,7 @@ The following is the syntax for the `html_strip` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/index-processors.md b/_ingest-pipelines/processors/index-processors.md index 9628a167280..886877b19c4 100644 --- a/_ingest-pipelines/processors/index-processors.md +++ b/_ingest-pipelines/processors/index-processors.md @@ -1,7 +1,7 @@ --- layout: default title: Ingest processors -nav_order: 30 +nav_order: 80 has_children: 
true has_toc: false redirect_from: diff --git a/_ingest-pipelines/processors/join.md b/_ingest-pipelines/processors/join.md index c2cdcfe4de1..246ae5db72a 100644 --- a/_ingest-pipelines/processors/join.md +++ b/_ingest-pipelines/processors/join.md @@ -19,7 +19,7 @@ The following is the syntax for the `join` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/json.md b/_ingest-pipelines/processors/json.md index d251533e27c..61e8ceffa8f 100644 --- a/_ingest-pipelines/processors/json.md +++ b/_ingest-pipelines/processors/json.md @@ -22,7 +22,7 @@ The following is the syntax for the `json` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/kv.md b/_ingest-pipelines/processors/kv.md index cc235070561..5e5a54425d4 100644 --- a/_ingest-pipelines/processors/kv.md +++ b/_ingest-pipelines/processors/kv.md @@ -27,7 +27,7 @@ The following is the syntax for the `kv` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters @@ -85,8 +85,6 @@ To test the pipeline, run the following query: ```json POST _ingest/pipeline/kv-pipeline/_simulate - -```json { "docs": [ { @@ -103,7 +101,7 @@ POST _ingest/pipeline/kv-pipeline/_simulate **Response** -The following example response confirms that the pipeline is working as expected: +The following example response confirms that, in addition to the original `message` field, the document contains fields generated from key-value pairs: ```json { @@ -132,8 +130,6 @@ The following query ingests a document into an index named `testindex1`: ```json PUT testindex1/_doc/1?pipeline=kv-pipeline - -```json { "message": "goodbye=everybody hello=world" } diff --git a/_ingest-pipelines/processors/lowercase.md b/_ingest-pipelines/processors/lowercase.md index 5bfa3704912..0c8ce51e2a2 100644 --- a/_ingest-pipelines/processors/lowercase.md +++ b/_ingest-pipelines/processors/lowercase.md @@ -25,7 +25,7 @@ The following is the syntax for the `lowercase` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/ml-inference.md b/_ingest-pipelines/processors/ml-inference.md index 1aa22995e13..bc8b05825d5 100644 --- a/_ingest-pipelines/processors/ml-inference.md +++ b/_ingest-pipelines/processors/ml-inference.md @@ -43,7 +43,7 @@ The following is the syntax for the `ml-inference` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters @@ -53,7 +53,7 @@ The following table lists the required and optional parameters for the `ml-infer |:--- | :--- | :--- | :--- | | `model_id` | String | Required | The ID of the ML model used by the processor. | | `function_name` | String | Optional for externally hosted models<br/><br/>Required for local models | The function name of the ML model configured in the processor. For local models, valid values are `sparse_encoding`, `sparse_tokenize`, `text_embedding`, and `text_similarity`. For externally hosted models, valid value is `remote`. Default is `remote`. | -| `model_config` | Object | Optional | Custom configuration options for the ML model. For more information, see [The `model_config` object]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/#the-model_config-object). | +| `model_config` | Object | Optional | Custom configuration options for the ML model. 
For externally hosted models, if set, this configuration overrides the default connector parameters. For local models, you can add `model_config` to `model_input` to override the model configuration set during registration. For more information, see [The `model_config` object]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/#the-model_config-object). | | `model_input` | String | Optional for externally hosted models<br/><br/>Required for local models | A template that defines the input field format expected by the model. Each local model type might use a different set of inputs. For externally hosted models, default is `"{ \"parameters\": ${ml_inference.parameters} }`.| | `input_map` | Array | Optional for externally hosted models<br/><br/>Required for local models | An array specifying how to map ingested document fields to the model input fields. Each element of the array is a map in the `"<model_input_field>": "<document_field>"` format and corresponds to one model invocation for a document field. If no input mapping is specified for an externally hosted model, then all fields from the document are passed to the model directly as input. The `input_map` size indicates the number of times the model is invoked (the number of Predict API requests). | | `<model_input_field>` | String | Optional for externally hosted models<br/><br/>Required for local models | The model input field name. | diff --git a/_ingest-pipelines/processors/pipeline.md b/_ingest-pipelines/processors/pipeline.md index 5c1c6306d19..96650caca0c 100644 --- a/_ingest-pipelines/processors/pipeline.md +++ b/_ingest-pipelines/processors/pipeline.md @@ -18,7 +18,7 @@ The following is the syntax for the `pipeline` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/remove_by_pattern.md b/_ingest-pipelines/processors/remove-by-pattern.md similarity index 95% rename from _ingest-pipelines/processors/remove_by_pattern.md rename to _ingest-pipelines/processors/remove-by-pattern.md index 5fd1516fb7f..c5afce01714 100644 --- a/_ingest-pipelines/processors/remove_by_pattern.md +++ b/_ingest-pipelines/processors/remove-by-pattern.md @@ -1,13 +1,13 @@ --- layout: default -title: Remove_by_pattern +title: Remove by pattern parent: Ingest processors nav_order: 225 redirect_from: - - /api-reference/ingest-apis/processors/remove_by_pattern/ + - /ingest-pipelines/processors/remove_by_pattern/ --- -# Remove_by_pattern processor +# Remove by pattern processor The `remove_by_pattern` processor removes the root-level fields from a document by using specified wildcard patterns. 
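As an illustration, the following sketch removes every root-level field whose name starts with `metadata_`. The pipeline name and field pattern are hypothetical, and the example assumes the `field_pattern` parameter documented for this processor:

```json
PUT _ingest/pipeline/remove-metadata-fields
{
  "description": "Removes root-level fields matching metadata_*",
  "processors": [
    {
      "remove_by_pattern": {
        "field_pattern": "metadata_*"
      }
    }
  ]
}
```
{% include copy-curl.html %}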
@@ -22,7 +22,7 @@ The following is the syntax for the `remove_by_pattern` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/remove.md b/_ingest-pipelines/processors/remove.md index 9656f437b31..c5aa51d7780 100644 --- a/_ingest-pipelines/processors/remove.md +++ b/_ingest-pipelines/processors/remove.md @@ -2,7 +2,7 @@ layout: default title: Remove parent: Ingest processors -nav_order: 230 +nav_order: 222 redirect_from: - /api-reference/ingest-apis/processors/remove/ --- @@ -22,7 +22,7 @@ The following is the syntax for the `remove` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/rename.md b/_ingest-pipelines/processors/rename.md index a3ea14fa2b8..63860dce46d 100644 --- a/_ingest-pipelines/processors/rename.md +++ b/_ingest-pipelines/processors/rename.md @@ -2,7 +2,7 @@ layout: default title: Rename parent: Ingest processors -nav_order: 230 +nav_order: 227 redirect_from: - /api-reference/ingest-apis/processors/rename/ --- @@ -23,14 +23,14 @@ The following is the syntax for the `rename` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters The following table lists the required and optional parameters for the `rename` processor. -| Parameter | Required/Optional | Description | -|---|---|---| +Parameter | Required/Optional | Description | +---|---|---| `field` | Required | The field name containing the data to be removed. Supports [template snippets]({{site.url}}{{site.baseurl}}/ingest-pipelines/create-ingest/#template-snippets). | `target_field` | Required | The new name of the field. Supports [template snippets]({{site.url}}{{site.baseurl}}/ingest-pipelines/create-ingest/#template-snippets). | `ignore_missing` | Optional | Specifies whether the processor should ignore documents that do not contain the specified `field`. If set to `true`, the processor does not modify the document if the `field` does not exist. Default is `false`. | diff --git a/_ingest-pipelines/processors/script.md b/_ingest-pipelines/processors/script.md index ae8e0bd9c6b..8ec35db0f5a 100644 --- a/_ingest-pipelines/processors/script.md +++ b/_ingest-pipelines/processors/script.md @@ -7,7 +7,7 @@ nav_order: 230 # Script processor -The `script` processor executes inline and stored scripts that can modify or transform data in an OpenSearch document during the ingestion process. The processor uses script caching for improved performance because scripts may be recompiled per document. Refer to [Script APIs](https://opensearch.org/docs/latest/api-reference/script-apis/index/) for information about working with scripts in OpenSearch. +The `script` processor executes inline and stored scripts that can modify or transform data in an OpenSearch document during the ingestion process. The processor uses script caching for improved performance because scripts may be recompiled per document. Refer to [Script APIs]({{site.url}}{{site.baseurl}}/api-reference/script-apis/index/) for information about working with scripts in OpenSearch. 
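Because the processor accepts stored scripts in addition to inline ones, a pipeline can reference a script saved ahead of time. The following sketch assumes that a stored Painless script named `add-ingest-flag` already exists, that it reads the `flag_value` parameter, and that the processor's `id` option is used to reference it; all names are illustrative:

```json
{
  "script": {
    "id": "add-ingest-flag",
    "params": {
      "flag_value": true
    }
  }
}
```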
The following is the syntax for the `script` processor: @@ -24,7 +24,7 @@ The following is the syntax for the `script` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/set.md b/_ingest-pipelines/processors/set.md index 1abf9775b9a..05f77fa7ccd 100644 --- a/_ingest-pipelines/processors/set.md +++ b/_ingest-pipelines/processors/set.md @@ -24,7 +24,7 @@ The following is the syntax for the `set` processor: ] } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/sort.md b/_ingest-pipelines/processors/sort.md index 7f1377bc32d..1df489c9b6b 100644 --- a/_ingest-pipelines/processors/sort.md +++ b/_ingest-pipelines/processors/sort.md @@ -24,7 +24,7 @@ The following is the syntax for the `sort` processor: ] } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/sparse-encoding.md b/_ingest-pipelines/processors/sparse-encoding.md index 3af6f4e9871..d0a8f2f6d8e 100644 --- a/_ingest-pipelines/processors/sparse-encoding.md +++ b/_ingest-pipelines/processors/sparse-encoding.md @@ -27,7 +27,7 @@ The following is the syntax for the `sparse_encoding` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters @@ -36,12 +36,31 @@ The following table lists the required and optional parameters for the `sparse_e | Parameter | Data type | Required/Optional | Description | |:---|:---|:---|:---| `model_id` | String | Required | The ID of the model that will be used to generate the embeddings. The model must be deployed in OpenSearch before it can be used in neural search. For more information, see [Using custom models within OpenSearch]({{site.url}}{{site.baseurl}}/ml-commons-plugin/using-ml-models/) and [Neural sparse search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-search/). +`prune_type` | String | Optional | The prune strategy for sparse vectors. Valid values are `max_ratio`, `alpha_mass`, `top_k`, `abs_value`, and `none`. Default is `none`. +`prune_ratio` | Float | Optional | The ratio for the pruning strategy. Required when `prune_type` is specified. `field_map` | Object | Required | Contains key-value pairs that specify the mapping of a text field to a `rank_features` field. `field_map.<input_field>` | String | Required | The name of the field from which to obtain text for generating vector embeddings. `field_map.<vector_field>` | String | Required | The name of the vector field in which to store the generated vector embeddings. `description` | String | Optional | A brief description of the processor. | `tag` | String | Optional | An identifier tag for the processor. Useful for debugging to distinguish between processors of the same type. | `batch_size` | Integer | Optional | Specifies the number of documents to be batched and processed each time. Default is `1`. | +`skip_existing` | Boolean | Optional | When `true`, the processor does not make inference calls for fields that already contain embeddings, leaving existing embeddings unchanged. Default is `false`.| + +### Pruning sparse vectors + +A sparse vector often has a long-tail distribution of token weights, with less important tokens occupying a significant amount of storage space. Pruning reduces the size of an index by removing tokens with lower semantic importance, yielding a slight decrease in search relevance in exchange for a more compact index. 
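To make the trade-off concrete, the following sketch shows a hypothetical sparse vector before and after `max_ratio` pruning (described in the table below) with a ratio of `0.1`. Only tokens whose weight is at least 0.1 × the largest weight (here, roughly 0.67) are kept, so the low-weight tail is dropped. The weights are illustrative, not actual model output:

```json
{
  "before_pruning": { "hello": 6.69, "world": 4.73, "greeting": 0.96, "it": 0.43, "so": 0.20 },
  "after_pruning":  { "hello": 6.69, "world": 4.73, "greeting": 0.96 }
}
```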
+ +The `sparse_encoding` processor can be used to prune sparse vectors by configuring the `prune_type` and `prune_ratio` parameters. The following table lists the supported pruning options for the `sparse_encoding` processor. + +| Pruning type | Valid pruning ratio | Description | +|:---|:---|:---| +`max_ratio` | Float [0, 1) | Prunes a sparse vector by keeping only elements whose values are within the `prune_ratio` of the largest value in the vector. +`abs_value` | Float (0, +∞) | Prunes a sparse vector by removing elements with values lower than the `prune_ratio`. +`alpha_mass` | Float [0, 1) | Prunes a sparse vector by keeping only elements whose cumulative sum of values is within the `prune_ratio` of the total sum. +`top_k` | Integer (0, +∞) | Prunes a sparse vector by keeping only the top `prune_ratio` elements. +none | N/A | Leaves sparse vectors unchanged. + +Among all pruning options, specifying `max_ratio` as equal to `0.1` demonstrates strong generalization on test datasets. This approach reduces storage requirements by approximately 40% while incurring less than a 1% loss in search relevance. ## Using the processor @@ -59,6 +78,8 @@ PUT /_ingest/pipeline/nlp-ingest-pipeline { "sparse_encoding": { "model_id": "aP2Q8ooBpBj3wT4HVS8a", + "prune_type": "max_ratio", + "prune_ratio": 0.1, "field_map": { "passage_text": "passage_embedding" } @@ -111,23 +132,15 @@ The response confirms that in addition to the `passage_text` field, the processo "worlds" : 2.7839446, "yes" : 0.75845814, "##world" : 2.5432441, - "born" : 0.2682308, "nothing" : 0.8625516, - "goodbye" : 0.17146169, "greeting" : 0.96817183, "birth" : 1.2788506, - "come" : 0.1623208, - "global" : 0.4371151, - "it" : 0.42951578, "life" : 1.5750692, - "thanks" : 0.26481047, "world" : 4.7300377, - "tiny" : 0.5462298, "earth" : 2.6555297, "universe" : 2.0308156, "worldwide" : 1.3903781, "hello" : 6.696973, - "so" : 0.20279501, "?" : 0.67785245 }, "passage_text" : "hello world" @@ -141,7 +154,7 @@ The response confirms that in addition to the `passage_text` field, the processo } ``` -Once you have created an ingest pipeline, you need to create an index for ingestion and ingest documents into the index. To learn more, see [Create an index for ingestion]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-with-pipelines/#step-2b-create-an-index-for-ingestion) and [Step 3: Ingest documents into the index]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-with-pipelines/#step-2c-ingest-documents-into-the-index) of [Neural sparse search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-search/). +Once you have created an ingest pipeline, you need to create an index for ingestion and ingest documents into the index. For a complete example, see [Generating sparse vector embeddings automatically]({{site.url}}{{site.baseurl}}/vector-search/ai-search/neural-sparse-with-pipelines/). --- @@ -150,4 +163,4 @@ Once you have created an ingest pipeline, you need to create an index for ingest - To learn how to use the `neural_sparse` query for a sparse search, see [Neural sparse query]({{site.url}}{{site.baseurl}}/query-dsl/specialized/neural-sparse/). - To learn more about sparse search, see [Neural sparse search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-search/). - To learn more about using models in OpenSearch, see [Choosing a model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/integrating-ml-models/#choosing-a-model). 
-- For a comprehensive example, see [Neural search tutorial]({{site.url}}{{site.baseurl}}/search-plugins/neural-search-tutorial/). +- For a comprehensive example, see [Getting started with semantic and hybrid search]({{site.url}}{{site.baseurl}}/search-plugins/neural-search-tutorial/). diff --git a/_ingest-pipelines/processors/split.md b/_ingest-pipelines/processors/split.md index cdb0cfe3de4..0aef1e09ce1 100644 --- a/_ingest-pipelines/processors/split.md +++ b/_ingest-pipelines/processors/split.md @@ -20,7 +20,7 @@ The following is the syntax for the `split` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/text-chunking.md b/_ingest-pipelines/processors/text-chunking.md index 0141ba15643..7c1e23d0166 100644 --- a/_ingest-pipelines/processors/text-chunking.md +++ b/_ingest-pipelines/processors/text-chunking.md @@ -9,8 +9,9 @@ nav_order: 250 The `text_chunking` processor splits a long document into shorter passages. The processor supports the following algorithms for text splitting: -- [`fixed_token_length`](#fixed-token-length-algorithm): Splits text into passages of the specified size. -- [`delimiter`](#delimiter-algorithm): Splits text into passages on a delimiter. +- [`fixed_token_length`](#the-fixed-token-length-algorithm): Splits text into passages of the length specified by the number of tokens. +- [`fixed_char_length`](#the-fixed-character-length-algorithm): Splits text into passages of the length specified by the number of characters. +- [`delimiter`](#the-delimiter-algorithm): Splits text into passages on a delimiter. The following is the syntax for the `text_chunking` processor: @@ -37,7 +38,7 @@ The following table lists the required and optional parameters for the `text_chu | `field_map.<input_field>` | String | Required | The name of the field from which to obtain text for generating chunked passages. | | `field_map.<output_field>` | String | Required | The name of the field in which to store the chunked results. | | `algorithm` | Object | Required | Contains at most one key-value pair that specifies the chunking algorithm and parameters. | -| `algorithm.<name>` | String | Optional | The name of the chunking algorithm. Valid values are [`fixed_token_length`](#fixed-token-length-algorithm) or [`delimiter`](#delimiter-algorithm). Default is `fixed_token_length`. | +| `algorithm.<name>` | String | Optional | The name of the chunking algorithm. Valid values are [`fixed_token_length`](#the-fixed-token-length-algorithm), [`fixed_char_length`](#the-fixed-character-length-algorithm), and [`delimiter`](#the-delimiter-algorithm). Default is `fixed_token_length`. | | `algorithm.<parameters>` | Object | Optional | The parameters for the chunking algorithm. By default, contains the default parameters of the `fixed_token_length` algorithm. | | `ignore_missing` | Boolean | Optional | If `true`, empty fields are excluded from the output. If `false`, the output will contain an empty list for every empty field. Default is `false`. | | `description` | String | Optional | A brief description of the processor. | @@ -46,7 +47,7 @@ The following table lists the required and optional parameters for the `text_chu To perform chunking on nested fields, specify `input_field` and `output_field` values as JSON objects. Dot paths of nested fields are not supported. For example, use `"field_map": { "foo": { "bar": "bar_chunk"} }` instead of `"field_map": { "foo.bar": "foo.bar_chunk"}`. 
{: .note} -### Fixed token length algorithm +### The fixed token length algorithm The following table lists the optional parameters for the `fixed_token_length` algorithm. @@ -55,18 +56,37 @@ The following table lists the optional parameters for the `fixed_token_length` a | `token_limit` | Integer | Optional | The token limit for chunking algorithms. Valid values are integers of at least `1`. Default is `384`. | | `tokenizer` | String | Optional | The [word tokenizer]({{site.url}}{{site.baseurl}}/analyzers/tokenizers/index/#word-tokenizers) name. Default is `standard`. | | `overlap_rate` | Float | Optional | The degree of overlap in the token algorithm. Valid values are floats between `0` and `0.5`, inclusive. Default is `0`. | -| `max_chunk_limit` | Integer | Optional | The chunk limit for chunking algorithms. Default is 100. To disable this parameter, set it to `-1`. | +| `max_chunk_limit` | Integer | Optional | The chunk limit for chunking algorithms. Default is `100`. To disable this parameter, set it to `-1`. | -The default value of `token_limit` is `384` so that output passages don't exceed the token limit constraint of the downstream text embedding models. For [OpenSearch-supported pretrained models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#supported-pretrained-models), like `msmarco-distilbert-base-tas-b` and `opensearch-neural-sparse-encoding-v1`, the input token limit is `512`. The `standard` tokenizer tokenizes text into words. According to [OpenAI](https://platform.openai.com/docs/introduction), 1 token equals approximately 0.75 words of English text. The default token limit is calculated as 512 * 0.75 = 384. +The default value of `token_limit` is calculated as `512 (tokens) * 0.75 = 384` so that output passages don't exceed the token limit constraint of the downstream text embedding models. For [OpenSearch-supported pretrained models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#supported-pretrained-models), like `msmarco-distilbert-base-tas-b` and `opensearch-neural-sparse-encoding-v1`, the input token limit is `512`. The `standard` tokenizer tokenizes text into words. According to [OpenAI](https://platform.openai.com/docs/introduction), 1 token equals approximately 0.75 words of English text. {: .note} -You can set the `overlap_rate` to a decimal percentage value in the 0--0.5 range, inclusive. Per [Amazon Bedrock](https://aws.amazon.com/blogs/aws/knowledge-bases-now-delivers-fully-managed-rag-experience-in-amazon-bedrock/), we recommend setting this parameter to a value of 0–0.2 to improve accuracy. +You can set the `overlap_rate` to a decimal percentage value in the 0--0.5 range, inclusive. As suggested by [Amazon Bedrock](https://aws.amazon.com/blogs/aws/knowledge-bases-now-delivers-fully-managed-rag-experience-in-amazon-bedrock/), we recommend setting this parameter to a value of 0–0.2 to improve accuracy. {: .note} -The `max_chunk_limit` parameter limits the number of chunked passages. If the number of passages generated by the processor exceeds the limit, the algorithm will return an exception, prompting you to either increase or disable the limit. +The `max_chunk_limit` parameter limits the number of chunked passages. If the number of passages generated by the processor exceeds the limit, the excess text is added to the last chunk. {: .note} -### Delimiter algorithm +### The fixed character length algorithm + +The following table lists the optional parameters for the `fixed_char_length` algorithm. 
+ +| Parameter | Data type | Required/Optional | Description | +|:---|:----------|:---|:---| +| `char_limit` | Integer | Optional | The char limit for chunking algorithms. Valid values are integers of at least `1`. Default is `2048`. | +| `overlap_rate` | Float | Optional | The degree of overlap in the token algorithm. Valid values are floats between `0` and `0.5`, inclusive. Default is `0`. | +| `max_chunk_limit` | Integer | Optional | The chunk limit for chunking algorithms. Default is `100`. To disable this parameter, set it to `-1`. | + +The default `char_limit` is calculated as `512 (tokens) * 4 (chars) = 2048` because 512 tokens is a common limit for text embedding models. According to [OpenAI](https://platform.openai.com/docs/concepts#tokens), 1 token equals approximately 4 characters of English text. +{: .note} + +You can set the `overlap_rate` to a decimal percentage value in the 0--0.5 range, inclusive. As suggested by [Amazon Bedrock](https://aws.amazon.com/blogs/aws/knowledge-bases-now-delivers-fully-managed-rag-experience-in-amazon-bedrock/), we recommend setting this parameter to a value of 0–0.2 to improve accuracy. +{: .note} + +The `max_chunk_limit` parameter limits the number of chunked passages. If the number of passages generated by the processor exceeds the limit, the excess text is added to the last chunk. +{: .note} + +### The delimiter algorithm The following table lists the optional parameters for the `delimiter` algorithm. @@ -75,7 +95,7 @@ The following table lists the optional parameters for the `delimiter` algorithm. | `delimiter` | String | Optional | A string delimiter used to split text. You can set the `delimiter` to any string, for example, `\n` (split text into paragraphs on a new line) or `.` (split text into sentences). Default is `\n\n` (split text into paragraphs on two new line characters). | | `max_chunk_limit` | Integer | Optional | The chunk limit for chunking algorithms. Default is `100`. To disable this parameter, set it to `-1`. | -The `max_chunk_limit` parameter limits the number of chunked passages. If the number of passages generated by the processor exceeds the limit, the algorithm will return an exception, prompting you to either increase or disable the limit. +The `max_chunk_limit` parameter limits the number of chunked passages. If the number of passages generated by the processor exceeds the limit, the excess text is added to the last chunk. {: .note} ## Using the processor @@ -163,7 +183,7 @@ The response confirms that, in addition to the `passage_text` field, the process Once you have created an ingest pipeline, you need to create an index for document ingestion. To learn more, see [Text chunking]({{site.url}}{{site.baseurl}}/search-plugins/text-chunking/). -## Cascaded text chunking processors +## Cascading text chunking processors You can chain multiple text chunking processors together. For example, to split documents into paragraphs, apply the `delimiter` algorithm and specify the parameter as `\n\n`. To prevent a paragraph from exceeding the token limit, append another text chunking processor that uses the `fixed_token_length` algorithm. You can configure the ingest pipeline for this example as follows: @@ -203,6 +223,61 @@ PUT _ingest/pipeline/text-chunking-cascade-ingest-pipeline ``` {% include copy-curl.html %} +### Recursive text chunking using cascaded processors + +For more advanced control, you can chain more than two processors in order to create a recursive chunking effect. 
This strategy involves deconstructing text into progressively smaller, more semantically meaningful units. + +For example, you can first split a document into paragraphs (`\n\n`) and then split each paragraph into sentences (`. `). Finally, you can chunk each sentence using the `fixed_char_length` algorithm to ensure that the final passages do not exceed a specific length. This hierarchical approach helps maintain as much semantic context as possible within the final size constraints. + +The following example configures a three-stage recursive chunking pipeline: + +```json +PUT _ingest/pipeline/recursively-text-chunking-cascade-ingest-pipeline +{ + "description": "A pipeline that recursively chunks text by paragraph, then sentence, then character length.", + "processors": [ + { + "text_chunking": { + "algorithm": { + "delimiter": { + "delimiter": "\n\n" + } + }, + "field_map": { + "original_text": "paragraph_chunks" + } + } + }, + { + "text_chunking": { + "algorithm": { + "delimiter": { + "delimiter": ". " + } + }, + "field_map": { + "paragraph_chunks": "sentence_chunks" + } + } + }, + { + "text_chunking": { + "algorithm": { + "fixed_char_length": { + "char_limit": 300, + "overlap_rate": 0.1 + } + }, + "field_map": { + "sentence_chunks": "final_recursive_chunks" + } + } + } + ] +} +``` +{% include copy-curl.html %} + ## Next steps - For a complete example, see [Text chunking]({{site.url}}{{site.baseurl}}/search-plugins/text-chunking/). diff --git a/_ingest-pipelines/processors/text-embedding.md b/_ingest-pipelines/processors/text-embedding.md index 6d263a0fec6..958475751d6 100644 --- a/_ingest-pipelines/processors/text-embedding.md +++ b/_ingest-pipelines/processors/text-embedding.md @@ -27,7 +27,7 @@ The following is the syntax for the `text_embedding` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters @@ -42,6 +42,10 @@ The following table lists the required and optional parameters for the `text_emb `description` | String | Optional | A brief description of the processor. | `tag` | String | Optional | An identifier tag for the processor. Useful for debugging to distinguish between processors of the same type. | `batch_size` | Integer | Optional | Specifies the number of documents to be batched and processed each time. Default is `1`. | +`if` | String containing a Boolean expression | Optional | A condition for running the processor.| +`ignore_failure` | Boolean | Optional | Specifies whether the processor continues execution even if it encounters an error. If set to `true`, the processor failure is ignored. Default is `false`.| +`on_failure` | List | Optional | A list of processors to run if the processor fails. | +`skip_existing` | Boolean | Optional | When `true`, the processor does not make inference calls for fields that already contain embeddings, leaving existing embeddings unchanged. Default is `false`.| ## Using the processor @@ -129,4 +133,4 @@ Once you have created an ingest pipeline, you need to create an index for ingest - To learn how to use the `neural` query for text search, see [Neural query]({{site.url}}{{site.baseurl}}/query-dsl/specialized/neural/). - To learn more about semantic search, see [Semantic search]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/). - To learn more about using models in OpenSearch, see [Choosing a model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/integrating-ml-models/#choosing-a-model). 
-- For a comprehensive example, see [Neural search tutorial]({{site.url}}{{site.baseurl}}/search-plugins/neural-search-tutorial/). \ No newline at end of file +- For a comprehensive example, see [Getting started with semantic and hybrid search]({{site.url}}{{site.baseurl}}/search-plugins/neural-search-tutorial/). diff --git a/_ingest-pipelines/processors/text-image-embedding.md b/_ingest-pipelines/processors/text-image-embedding.md index 212e9f96bec..7dcdfdbc0a5 100644 --- a/_ingest-pipelines/processors/text-image-embedding.md +++ b/_ingest-pipelines/processors/text-image-embedding.md @@ -29,7 +29,7 @@ The following is the syntax for the `text_image_embedding` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Parameters @@ -44,6 +44,7 @@ The following table lists the required and optional parameters for the `text_ima `field_map.image` | String | Optional | The name of the field from which to obtain the image for generating vector embeddings. You must specify at least one `text` or `image`. `description` | String | Optional | A brief description of the processor. | `tag` | String | Optional | An identifier tag for the processor. Useful for debugging to distinguish between processors of the same type. | +`skip_existing` | Boolean | Optional | When `true`, the processor does not make inference calls for fields that already contain embeddings, leaving existing embeddings unchanged. Default is `false`.| ## Using the processor @@ -138,4 +139,4 @@ Once you have created an ingest pipeline, you need to create an index for ingest - To learn how to use the `neural` query for a multimodal search, see [Neural query]({{site.url}}{{site.baseurl}}/query-dsl/specialized/neural/). - To learn more about multimodal search, see [Multimodal search]({{site.url}}{{site.baseurl}}/search-plugins/multimodal-search/). - To learn more about using models in OpenSearch, see [Choosing a model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/integrating-ml-models/#choosing-a-model). -- For a comprehensive example, see [Neural search tutorial]({{site.url}}{{site.baseurl}}/search-plugins/neural-search-tutorial/). \ No newline at end of file +- For a comprehensive example, see [Getting started with semantic and hybrid search]({{site.url}}{{site.baseurl}}/search-plugins/neural-search-tutorial/). \ No newline at end of file diff --git a/_ingest-pipelines/processors/trim.md b/_ingest-pipelines/processors/trim.md index 9c1999aeb2c..e2b7522077f 100644 --- a/_ingest-pipelines/processors/trim.md +++ b/_ingest-pipelines/processors/trim.md @@ -19,18 +19,17 @@ The following is the syntax for the `trim` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters The following table lists the required and optional parameters for the `trim` processor. -Parameter | Required/Optional | Description | -|-----------|-----------|-----------| +Parameter | Required/Optional | Description +:---|:---|:--- `field` | Required | The field containing the text to be trimmed. `target_field` | Required | The field in which the trimmed text is stored. If not specified, then the field is updated in-place. -`ignore_missing` | Optional | Specifies whether the processor should ignore documents that do not contain the specified -field. If set to `true`, then the processor ignores missing values in the field and leaves the `target_field` unchanged. Default is `false`. +`ignore_missing` | Optional | Specifies whether the processor should ignore documents that do not contain the specified field. 
If set to `true`, then the processor ignores missing values in the field and leaves the `target_field` unchanged. Default is `false`. `description` | Optional | A brief description of the processor. `if` | Optional | A condition for running the processor. `ignore_failure` | Optional | Specifies whether the processor continues execution even if it encounters an error. If set to `true`, then failures are ignored. Default is `false`. diff --git a/_ingest-pipelines/processors/uppercase.md b/_ingest-pipelines/processors/uppercase.md index 7fa5192f421..201c57dd8b1 100644 --- a/_ingest-pipelines/processors/uppercase.md +++ b/_ingest-pipelines/processors/uppercase.md @@ -25,7 +25,7 @@ The following is the syntax for the `uppercase` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/urldecode.md b/_ingest-pipelines/processors/urldecode.md index 1736a8469b2..0d6a8f46e3a 100644 --- a/_ingest-pipelines/processors/urldecode.md +++ b/_ingest-pipelines/processors/urldecode.md @@ -19,7 +19,7 @@ The following is the syntax for the `urldecode` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/processors/user-agent.md b/_ingest-pipelines/processors/user-agent.md index c4ece62ac5c..f451d839090 100644 --- a/_ingest-pipelines/processors/user-agent.md +++ b/_ingest-pipelines/processors/user-agent.md @@ -21,7 +21,7 @@ The following is the syntax for the `user_agent` processor: } } ``` -{% include copy-curl.html %} +{% include copy.html %} ## Configuration parameters diff --git a/_ingest-pipelines/simulate-ingest.md b/_ingest-pipelines/simulate-ingest.md index 33ae9351510..7e7d7b3750d 100644 --- a/_ingest-pipelines/simulate-ingest.md +++ b/_ingest-pipelines/simulate-ingest.md @@ -13,7 +13,7 @@ redirect_from: Use the simulate ingest pipeline API operation to run or test the pipeline. -## Path and HTTP methods +## Endpoints The following requests **simulate the latest ingest pipeline created**: diff --git a/_install-and-configure/additional-plugins/index.md b/_install-and-configure/additional-plugins/index.md index afc17cd8b2b..5e574d39e7a 100644 --- a/_install-and-configure/additional-plugins/index.md +++ b/_install-and-configure/additional-plugins/index.md @@ -10,29 +10,31 @@ nav_order: 10 There are many more plugins available in addition to those provided by the standard distribution of OpenSearch. These additional plugins have been built by OpenSearch developers or members of the OpenSearch community. While it isn't possible to provide an exhaustive list (because many plugins are not maintained in an OpenSearch GitHub repository), the following plugins, available in the [OpenSearch/plugins](https://github.com/opensearch-project/OpenSearch/tree/main/plugins) directory on GitHub, are some of the plugins that can be installed using one of the installation options, for example, using the command `bin/opensearch-plugin install <plugin-name>`. 
| Plugin name | Earliest available version | -|:-----------------------------------------------------------------------------------------------------------------------|:---------------------------| -| analysis-icu | 1.0.0 | -| analysis-kuromoji | 1.0.0 | -| analysis-nori | 1.0.0 | +|:---|:---| +| `analysis-icu` | 1.0.0 | +| `analysis-kuromoji` | 1.0.0 | +| `analysis-nori` | 1.0.0 | | [`analysis-phonenumber`]({{site.url}}{{site.baseurl}}/analyzers/supported-analyzers/phone-analyzers/) | 2.18.0 | -| analysis-phonetic | 1.0.0 | -| analysis-smartcn | 1.0.0 | -| analysis-stempel | 1.0.0 | -| analysis-ukrainian | 1.0.0 | -| discovery-azure-classic | 1.0.0 | -| discovery-ec2 | 1.0.0 | -| discovery-gce | 1.0.0 | +| `analysis-phonetic` | 1.0.0 | +| `analysis-smartcn` | 1.0.0 | +| `analysis-stempel` | 1.0.0 | +| `analysis-ukrainian` | 1.0.0 | +| `discovery-azure-classic` | 1.0.0 | +| `discovery-ec2` | 1.0.0 | +| `discovery-gce` | 1.0.0 | | [`ingest-attachment`]({{site.url}}{{site.baseurl}}/install-and-configure/additional-plugins/ingest-attachment-plugin/) | 1.0.0 | -| mapper-annotated-text | 1.0.0 | -| mapper-murmur3 | 1.0.0 | +| `ingest-kafka` | 3.0.0 | +| `ingest-kinesis` | 3.0.0 | +| `mapper-annotated-text` | 1.0.0 | +| `mapper-murmur3` | 1.0.0 | | [`mapper-size`]({{site.url}}{{site.baseurl}}/install-and-configure/additional-plugins/mapper-size-plugin/) | 1.0.0 | -| query-insights | 2.12.0 | -| repository-azure | 1.0.0 | -| repository-gcs | 1.0.0 | -| repository-hdfs | 1.0.0 | -| repository-s3 | 1.0.0 | -| store-smb | 1.0.0 | -| transport-nio | 1.0.0 | +| `query-insights` | 2.12.0 | +| `repository-azure` | 1.0.0 | +| `repository-gcs` | 1.0.0 | +| `repository-hdfs` | 1.0.0 | +| `repository-s3` | 1.0.0 | +| `store-smb` | 1.0.0 | +| `transport-grpc` | 3.0.0 | ## Related articles diff --git a/_install-and-configure/configuring-opensearch/cluster-settings.md b/_install-and-configure/configuring-opensearch/cluster-settings.md index 65804a5de95..dac3e2bfc5d 100644 --- a/_install-and-configure/configuring-opensearch/cluster-settings.md +++ b/_install-and-configure/configuring-opensearch/cluster-settings.md @@ -100,10 +100,16 @@ OpenSearch supports the following cluster-level routing and shard allocation set - `REPLICA_FIRST` – Replica shards are relocated first, before primary shards. This prioritization may help prevent a cluster's health status from going red when carrying out shard relocation in a mixed-version, segment-replication-enabled OpenSearch cluster. In this situation, primary shards relocated to OpenSearch nodes of a newer version could try to copy segment files to replica shards on an older version of OpenSearch, which would result in shard failure. Relocating replica shards first may help to avoid this in multi-version clusters. - `NO_PREFERENCE` – The default behavior in which the order of shard relocation has no importance. +- `cluster.routing.search_replica.strict` (Dynamic, Boolean): Controls how search requests are routed when search replica shards exist for an index, such as when `index.number_of_search_replicas` is greater than `0`. This setting applies only when search replicas are configured for an index. When set to `true`, all search requests for such indexes are routed only to search replica shards. If search replicas are unassigned, the requests fail. When set to `false`, if search replicas are unassigned, requests fall back to any available shard. Default is `true`. 
+ - `cluster.allocator.gateway.batch_size` (Dynamic, integer): Limits the number of shards sent to data nodes in a single batch to fetch any unassigned shard metadata. Default is `2000`. - `cluster.allocator.existing_shards_allocator.batch_enabled` (Static, Boolean): Enables batch allocation of unassigned shards that already exist on the disk, as opposed to allocating one shard at a time. This reduces memory and transport overhead by fetching any unassigned shard metadata in a batch call. Default is `false`. +- `cluster.routing.allocation.total_shards_per_node` (Dynamic, integer): The maximum combined total number of primary and replica shards that can be allocated to a single node. Default is `-1` (unlimited). Helps distribute shards evenly across nodes by limiting the total number of shards per node. Use with caution because shards may remain unallocated if nodes reach their configured limits. + +- `cluster.routing.allocation.total_primary_shards_per_node` (Dynamic, integer): The maximum number of primary shards that can be allocated to a single node. This setting is applicable only for remote-backed clusters. Default is `-1`(unlimited). Helps distribute primary shards evenly across nodes by limiting the number of primary shards per node. Use with caution because primary shards may remain unallocated if nodes reach their configured limits. + ## Cluster-level shard, block, and task settings OpenSearch supports the following cluster-level shard, block, and task settings: diff --git a/_install-and-configure/configuring-opensearch/index-settings.md b/_install-and-configure/configuring-opensearch/index-settings.md index 8aa714c02d7..265039f5282 100644 --- a/_install-and-configure/configuring-opensearch/index-settings.md +++ b/_install-and-configure/configuring-opensearch/index-settings.md @@ -34,7 +34,7 @@ OpenSearch supports the following dynamic cluster-level index settings: - `action.auto_create_index` (Boolean): Automatically creates an index if the index doesn't already exist. Also applies any index templates that are configured. Default is `true`. -- `action.destructive_requires_name` (Boolean): When set to `true`, you must specify the index name to delete an index. You cannot delete all indexes or use wildcards. Default is `true`. +- `action.destructive_requires_name` (Boolean): When `true`, you must specify the index name to delete an index. You cannot delete all indexes or use wildcards. Default is `false`. - `cluster.default.index.refresh_interval` (Time unit): Sets the refresh interval when the `index.refresh_interval` setting is not provided. This setting can be useful when you want to set a default refresh interval across all indexes in a cluster and support the `searchIdle` setting. You cannot set the interval lower than the `cluster.minimum.index.refresh_interval` setting. @@ -81,6 +81,16 @@ OpenSearch supports the following dynamic cluster-level index settings: - `cluster.default_number_of_replicas` (Integer): Controls the default number of replicas for indexes in the cluster. The index-level `index.number_of_replicas` setting defaults to this value if not configured. Default is `1`. +- `cluster.thread_pool.<fixed-threadpool>.size` (Integer): Controls the sizes of both the fixed and resizable queue thread pools. Overrides the defaults provided in `opensearch.yml`. + +- `cluster.thread_pool.<scaling-threadpool>.max` (Integer): Sets the maximum size of the scaling thread pool. Overrides the default provided in `opensearch.yml`. 
+ +- `cluster.thread_pool.<scaling-threadpool>.core` (Integer): Specifies the core size of the scaling thread pool. Overrides the default provided in `opensearch.yml`. + + +Before tuning thread pool settings dynamically, note that these are expert-level settings that can potentially destabilize your cluster. Modifying thread pool settings applies the same thread pool size to all nodes, so it's not recommended for clusters with different hardware for the same roles. Similarly, avoid tuning thread pools shared by both data nodes and cluster manager nodes. After making these changes, we recommend monitoring your cluster to ensure that it remains stable and performs as expected. +{: .warning} + ## Index-level index settings You can specify index settings at index creation. There are two types of index settings: @@ -150,6 +160,8 @@ For `zstd`, `zstd_no_dict`, `qat_lz4`, and `qat_deflate`, you can specify the co - `index.use_compound_file` (Boolean): This setting controls the Apache Lucene `useCompoundFile` index writer settings, which specifies whether newly written segment files will be packed into a compound file. Default is `true`. +- `index.append_only.enabled` (Boolean): Set to `true` to prevent any updates to documents in the index. Default is `false`. + ### Updating a static index setting You can update a static index setting only on a closed index. The following example demonstrates updating the index codec setting. @@ -189,8 +201,14 @@ OpenSearch supports the following dynamic index-level index settings: - `index.number_of_replicas` (Integer): The number of replica shards each primary shard should have. For example, if you have 4 primary shards and set `index.number_of_replicas` to 3, the index has 12 replica shards. If not set, defaults to `cluster.default_number_of_replicas` (which is `1` by default). +- `index.number_of_search_replicas` (Integer): The number of search replica shards that each primary shard should have. For example, if you have 4 primary shards and set `index.number_of_search_replicas` to 3, the index has 12 search replica shards. Default is `0`. + - `index.auto_expand_replicas` (String): Whether the cluster should automatically add replica shards based on the number of data nodes. Specify a lower bound and upper limit (for example, 0--9) or `all` for the upper limit. For example, if you have 5 data nodes and set `index.auto_expand_replicas` to 0--3, then the cluster does not automatically add another replica shard. However, if you set this value to `0-all` and add 2 more nodes for a total of 7, the cluster will expand to now have 6 replica shards. Default is disabled. +- `index.auto_expand_search_replicas` (String): Controls whether the cluster automatically adjusts the number of search replica shards based on the number of available search nodes. Specify the value as a range with a lower and upper bound, for example, `0-3` or `0-all`. If you don't specify a value, this feature is disabled. + + For example, if you have 5 data nodes and set `index.auto_expand_search_replicas` to `0-3`, the index can have up to 3 search replicas and the cluster does not automatically add another search replica shard. However, if you set `index.auto_expand_search_replicas` to `0-all` and add 2 more nodes, for a total of 7, the cluster will expand to now have 7 search replica shards. This setting is disabled by default. + - `index.search.idle.after` (Time unit): The amount of time a shard should wait for a search or get request until it goes idle. Default is `30s`. 
- `index.search.default_pipeline` (String): The name of the search pipeline that is used if no pipeline is explicitly set when searching an index. If a default pipeline is set and the pipeline doesn't exist, then the index requests fail. Use the pipeline name `_none` to specify no default search pipeline. For more information, see [Default search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/using-search-pipeline/#default-search-pipeline). @@ -223,7 +241,7 @@ OpenSearch supports the following dynamic index-level index settings: - `index.query.default_field` (List): A field or list of fields that OpenSearch uses in queries in case a field isn't specified in the parameters. -- `index.query.max_nested_depth` (Integer): The maximum number of nesting levels for `nested` queries. Default is `Integer.MAX_VALUE`. Minimum is 1 (single `nested` query). +- `index.query.max_nested_depth` (Integer): The maximum number of nesting levels for `nested` queries. Default is `20`. Minimum is `1` (single `nested` query). - `index.requests.cache.enable` (Boolean): Enables or disables the index request cache. Default is `true`. For more information, see [Index request cache]({{site.url}}{{site.baseurl}}/search-plugins/caching/request-cache/). @@ -241,6 +259,10 @@ OpenSearch supports the following dynamic index-level index settings: - `index.optimize_doc_id_lookup.fuzzy_set.false_positive_probability` (Double): Sets the false-positive probability for the underlying `fuzzy_set` (that is, the Bloom filter). A lower false-positive probability ensures higher throughput for upsert and get operations but results in increased storage and memory use. Allowed values range between `0.01` and `0.50`. Default is `0.20`. +- `index.routing.allocation.total_shards_per_node` (Integer): The maximum combined total number of primary and replica shards from a single index that can be allocated to a single node. Default is `-1` (unlimited). Helps control per-index shard distribution across nodes by limiting the number of shards per node. Use with caution because shards from this index may remain unallocated if nodes reach their configured limits. + +- `index.routing.allocation.total_primary_shards_per_node` (Integer): The maximum number of primary shards from a single index that can be allocated to a single node. This setting is applicable only for remote-backed clusters. Default is `-1` (unlimited). Helps control per-index primary shard distribution across nodes by limiting the number of primary shards per node. Use with caution because primary shards from this index may remain unallocated if nodes reach their configured limits. + ### Updating a dynamic index setting You can update a dynamic index setting at any time through the API. For example, to update the refresh interval, use the following request: diff --git a/_install-and-configure/configuring-opensearch/logs.md b/_install-and-configure/configuring-opensearch/logs.md index e601a1eeaa7..1c320b1cc0e 100644 --- a/_install-and-configure/configuring-opensearch/logs.md +++ b/_install-and-configure/configuring-opensearch/logs.md @@ -93,6 +93,7 @@ There are other ways to change log levels: - `${sys:opensearch.logs.base_path}` is the directory for logs (for example, `/var/log/opensearch/`). - `${sys:opensearch.logs.cluster_name}` is the name of the cluster. + - `${sys:opensearch.logs.node_name}` is the name of the node. - `[%node_name]` is the name of the node. 
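To see how these substitutions fit together, the following is a minimal sketch of a rolling-file appender entry in `log4j2.properties`, assuming the property names listed above; the appender name, rollover policy, and layout pattern are illustrative and may differ from the file shipped with your distribution.

```properties
# Illustrative appender: writes <cluster_name>.log under the configured log directory
# and tags each entry with the node name using the [%node_name] pattern converter.
appender.rolling.type = RollingFile
appender.rolling.name = rolling
appender.rolling.fileName = ${sys:opensearch.logs.base_path}${sys:file.separator}${sys:opensearch.logs.cluster_name}.log
appender.rolling.filePattern = ${sys:opensearch.logs.base_path}${sys:file.separator}${sys:opensearch.logs.cluster_name}-%d{yyyy-MM-dd}.log.gz
appender.rolling.policies.type = Policies
appender.rolling.policies.time.type = TimeBasedTriggeringPolicy
appender.rolling.policies.time.interval = 1
appender.rolling.layout.type = PatternLayout
appender.rolling.layout.pattern = [%d{ISO8601}][%-5p][%-25c{1.}] [%node_name]%marker %m%n
```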
## Search request slow logs diff --git a/_install-and-configure/configuring-opensearch/network-settings.md b/_install-and-configure/configuring-opensearch/network-settings.md index dc61ccc49b9..bd3a2c63a26 100644 --- a/_install-and-configure/configuring-opensearch/network-settings.md +++ b/_install-and-configure/configuring-opensearch/network-settings.md @@ -39,6 +39,8 @@ OpenSearch supports the following advanced network settings for HTTP communicati - `http.compression` (Static, Boolean): Enables support for compression using `Accept-Encoding` when applicable. When `HTTPS` is enabled, the default is `false`, otherwise, the default is `true`. Disabling compression for HTTPS helps mitigate potential security risks, such as `BREACH` attacks. To enable compression for HTTPS traffic, explicitly set `http.compression` to `true`. +- `http.max_header_size`: (Static, string) The maximum combined size of all HTTP headers allowed in a request. Default is `16KB`. + ## Advanced transport settings OpenSearch supports the following advanced network settings for transport communication: @@ -55,5 +57,4 @@ The default OpenSearch transport is provided by the `transport-netty4` module an Plugin | Description :---------- | :-------- -`transport-nio` | The OpenSearch transport based on Java NIO. <br> Installation: `./bin/opensearch-plugin install transport-nio` <br> Configuration (using `opensearch.yml`): <br> `transport.type: nio-transport` <br> `http.type: nio-http-transport` `transport-reactor-netty4` | The OpenSearch HTTP transport based on [Project Reactor](https://github.com/reactor/reactor-netty) and Netty 4 (**experimental**) <br> Installation: `./bin/opensearch-plugin install transport-reactor-netty4` <br> Configuration (using `opensearch.yml`): <br> `http.type: reactor-netty4` <br> `http.type: reactor-netty4-secure` diff --git a/_install-and-configure/configuring-opensearch/plugin-settings.md b/_install-and-configure/configuring-opensearch/plugin-settings.md index cc2212fb1ee..26e157189e5 100644 --- a/_install-and-configure/configuring-opensearch/plugin-settings.md +++ b/_install-and-configure/configuring-opensearch/plugin-settings.md @@ -85,7 +85,7 @@ The Notifications plugin supports the following settings. All settings in this l ## Query Insights plugin settings -For information about Query Insights plugin settings, see [Query insights settings]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/index#query-insights-settings). +For information about Query Insights plugin settings, see [Query Insights features and settings]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/index#query-insights-features-and-settings). ## Security plugin settings diff --git a/_install-and-configure/configuring-opensearch/security-settings.md b/_install-and-configure/configuring-opensearch/security-settings.md index 2ac09a48197..5a2d80c588a 100644 --- a/_install-and-configure/configuring-opensearch/security-settings.md +++ b/_install-and-configure/configuring-opensearch/security-settings.md @@ -307,16 +307,6 @@ The Security plugin supports the following REST layer TLS key store and trust st For more information, see [REST layer TLS]({{site.url}}{{site.baseurl}}/security/configuration/tls/#rest-layer-tls-1). -## OpenSSL settings - -The Security plugin supports the following OpenSSL settings: - -- `plugins.security.ssl.transport.enable_openssl_if_available` (Static): Enables OpenSSL on the transport layer if available. Optional. Default is `true`. 
- -- `plugins.security.ssl.http.enable_openssl_if_available` (Static): Enables OpenSSL on the REST layer if available. Optional. Default is `true`. - -For more information, see [OpenSSL]({{site.url}}{{site.baseurl}}/security/configuration/tls/#advanced-openssl). - ## X.509 PEM certificates and PKCS #8 keys---transport layer TLS settings The Security plugin supports the following transport layer TLS settings related to X.509 PEM certificates and PKCS #8 keys: @@ -392,7 +382,7 @@ The Security plugin supports the following transport layer security settings: plugins.security.nodes_dn: - "CN=*.example.com, OU=SSL, O=Test, L=Test, C=DE" - "CN=node.other.com, OU=SSL, O=Test, L=Test, C=DE" - - "CN=node.example.com, OU=SSL\, Inc., L=Test, C=DE" # escape additional comma with `\` + - "CN=node.example.com, OU=SSL\\, Inc., L=Test, C=DE" # escape additional comma with `\\` plugins.security.authcz.admin_dn: - CN=kirk,OU=client,O=client,L=test, C=de plugins.security.roles_mapping_resolution: MAPPING_ONLY diff --git a/_install-and-configure/install-dashboards/debian.md b/_install-and-configure/install-dashboards/debian.md index 73aba46cd41..69bb4a52a2e 100644 --- a/_install-and-configure/install-dashboards/debian.md +++ b/_install-and-configure/install-dashboards/debian.md @@ -5,6 +5,12 @@ parent: Installing OpenSearch Dashboards nav_order: 33 --- +{% comment %} +The following liquid syntax declares a variable, major_version_mask, which is transformed into "N.x" where "N" is the major version number. This is required for proper versioning references to the Yum repo. +{% endcomment %} +{% assign version_parts = site.opensearch_major_minor_version | split: "." %} +{% assign major_version_mask = version_parts[0] | append: ".x" %} + # Installing OpenSearch Dashboards (Debian) Installing OpenSearch Dashboards using the Advanced Packaging Tool (APT) package manager simplifies the process considerably compared to the [Tarball]({{site.url}}{{site.baseurl}}/install-and-configure/install-dashboards/tar/) method. For example, the package manager handles several technical considerations, such as the installation path, location of configuration files, and creation of a service managed by `systemd`. @@ -56,7 +62,7 @@ The Debian package is not signed. If you would like to verify the fingerprint, t ``` 1. Download and import the GPG key. ```bash - curl -o- https://artifacts.opensearch.org/publickeys/opensearch.pgp | gpg --import - + curl -o- https://artifacts.opensearch.org/publickeys/opensearch-release.pgp | gpg --import - ``` 1. Verify the signature. ```bash @@ -73,11 +79,11 @@ APT, the primary package management tool for Debian–based operating systems, a ``` 1. Import the public GPG key. This key is used to verify that the APT repository is signed. ```bash - curl -o- https://artifacts.opensearch.org/publickeys/opensearch.pgp | sudo gpg --dearmor --batch --yes -o /usr/share/keyrings/opensearch-keyring + curl -o- https://artifacts.opensearch.org/publickeys/opensearch-release.pgp | sudo gpg --dearmor --batch --yes -o /usr/share/keyrings/opensearch-release-keyring ``` 1. Create an APT repository for OpenSearch. 
```bash - echo "deb [signed-by=/usr/share/keyrings/opensearch-keyring] https://artifacts.opensearch.org/releases/bundle/opensearch-dashboards/2.x/apt stable main" | sudo tee /etc/apt/sources.list.d/opensearch-dashboards-2.x.list + echo "deb [signed-by=/usr/share/keyrings/opensearch-release-keyring] https://artifacts.opensearch.org/releases/bundle/opensearch-dashboards/{{major_version_mask}}/apt stable main" | sudo tee /etc/apt/sources.list.d/opensearch-dashboards-{{major_version_mask}}.list ``` 1. Verify that the repository was created successfully. ```bash diff --git a/_install-and-configure/install-dashboards/docker.md b/_install-and-configure/install-dashboards/docker.md index aa9ca1d5291..7f18fd7a662 100644 --- a/_install-and-configure/install-dashboards/docker.md +++ b/_install-and-configure/install-dashboards/docker.md @@ -19,8 +19,8 @@ You *can* start OpenSearch Dashboards using `docker run` after [creating a Docke Just like `opensearch.yml`, you can pass a custom `opensearch_dashboards.yml` to the container in the Docker Compose file. {: .tip } -1. Run `docker-compose up`. +1. Run `docker compose up`. Wait for the containers to start. Then see the [OpenSearch Dashboards documentation]({{site.url}}{{site.baseurl}}/dashboards/index/). -1. When finished, run `docker-compose down`. +1. When finished, run `docker compose down`. diff --git a/_install-and-configure/install-dashboards/plugins.md b/_install-and-configure/install-dashboards/plugins.md index 6a15e65f1fc..a80153e3118 100644 --- a/_install-and-configure/install-dashboards/plugins.md +++ b/_install-and-configure/install-dashboards/plugins.md @@ -36,11 +36,11 @@ The following table lists available OpenSearch Dashboards plugins. | Anomaly Detection Dashboards | [anomaly-detection-dashboards-plugin](https://github.com/opensearch-project/anomaly-detection-dashboards-plugin) | 1.0.0 | | Custom Import Maps Dashboards | [dashboards-maps](https://github.com/opensearch-project/dashboards-maps) | 2.2.0 | | Search Relevance Dashboards | [dashboards-search-relevance](https://github.com/opensearch-project/dashboards-search-relevance) | 2.4.0 | -| Gantt Chart Dashboards | [gantt-chart](https://github.com/opensearch-project/dashboards-visualizations) | 1.0.0 | | Index Management Dashboards | [index-management-dashboards-plugin](https://github.com/opensearch-project/index-management-dashboards-plugin) | 1.0.0 | | Notebooks Dashboards | [dashboards-notebooks](https://github.com/opensearch-project/dashboards-notebooks) | 1.0.0 | | Notifications Dashboards | [dashboards-notifications](https://github.com/opensearch-project/dashboards-notifications) | 2.0.0 | | Observability Dashboards | [dashboards-observability](https://github.com/opensearch-project/dashboards-observability) | 2.0.0 | +| Query Insights Dashboards | [query-insights-dashboards](https://github.com/opensearch-project/query-insights-dashboards) | 2.19.0 | | Query Workbench Dashboards | [query-workbench](https://github.com/opensearch-project/dashboards-query-workbench) | 1.0.0 | | Reports Dashboards | [dashboards-reporting](https://github.com/opensearch-project/dashboards-reporting) | 1.0.0 | | Security Analytics Dashboards | [security-analytics-dashboards-plugin](https://github.com/opensearch-project/security-analytics-dashboards-plugin)| 2.4.0 | diff --git a/_install-and-configure/install-dashboards/rpm.md b/_install-and-configure/install-dashboards/rpm.md index cc5974c91e7..2e9d166f821 100644 --- a/_install-and-configure/install-dashboards/rpm.md +++ 
b/_install-and-configure/install-dashboards/rpm.md @@ -22,7 +22,7 @@ OpenSearch Dashboards is the default visualization tool for data in OpenSearch. 1. Download the RPM package for the desired version directly from the [OpenSearch downloads page](https://opensearch.org/downloads.html){:target='\_blank'}. The RPM package can be download for both **x64** and **arm64** architectures. 1. Import the public GPG key. This key verifies that your OpenSearch instance is signed. ```bash - sudo rpm --import https://artifacts.opensearch.org/publickeys/opensearch.pgp + sudo rpm --import https://artifacts.opensearch.org/publickeys/opensearch-release.pgp ``` 1. From the command line interface (CLI), you can install the package with `rpm` or `yum`. **x64** diff --git a/_install-and-configure/install-opensearch/debian.md b/_install-and-configure/install-opensearch/debian.md index 0f7d674d7b1..f2822095971 100644 --- a/_install-and-configure/install-opensearch/debian.md +++ b/_install-and-configure/install-opensearch/debian.md @@ -92,7 +92,7 @@ The Debian package is not signed. If you would like to verify the fingerprint, t 1. Download and import the GPG key. ```bash - curl -o- https://artifacts.opensearch.org/publickeys/opensearch.pgp | gpg --import - + curl -o- https://artifacts.opensearch.org/publickeys/opensearch-release.pgp | gpg --import - ``` {% include copy.html %} @@ -114,13 +114,13 @@ APT, the primary package management tool for Debian–based operating systems, a 1. Import the public GPG key. This key is used to verify that the APT repository is signed. ```bash - curl -o- https://artifacts.opensearch.org/publickeys/opensearch.pgp | sudo gpg --dearmor --batch --yes -o /usr/share/keyrings/opensearch-keyring + curl -o- https://artifacts.opensearch.org/publickeys/opensearch-release.pgp | sudo gpg --dearmor --batch --yes -o /usr/share/keyrings/opensearch-release-keyring ``` {% include copy.html %} 1. Create an APT repository for OpenSearch: ```bash - echo "deb [signed-by=/usr/share/keyrings/opensearch-keyring] https://artifacts.opensearch.org/releases/bundle/opensearch/2.x/apt stable main" | sudo tee /etc/apt/sources.list.d/opensearch-2.x.list + echo "deb [signed-by=/usr/share/keyrings/opensearch-release-keyring] https://artifacts.opensearch.org/releases/bundle/opensearch/{{major_version_mask}}/apt stable main" | sudo tee /etc/apt/sources.list.d/opensearch-{{major_version_mask}}.list ``` {% include copy.html %} diff --git a/_install-and-configure/install-opensearch/docker.md b/_install-and-configure/install-opensearch/docker.md index 8925cb5ed89..82ea43ff5e2 100644 --- a/_install-and-configure/install-opensearch/docker.md +++ b/_install-and-configure/install-opensearch/docker.md @@ -70,23 +70,23 @@ Official OpenSearch images are hosted on [Docker Hub](https://hub.docker.com/u/o [Docker Hub](https://hub.docker.com/u/opensearchproject/): ```bash -docker pull opensearchproject/opensearch:2 +docker pull opensearchproject/opensearch:{{ site.opensearch_version | split: "." | first }} ``` {% include copy.html %} ```bash -docker pull opensearchproject/opensearch-dashboards:2 +docker pull opensearchproject/opensearch-dashboards:{{ site.opensearch_version | split: "." | first }} ``` {% include copy.html %} [Amazon ECR](https://gallery.ecr.aws/opensearchproject/): ```bash -docker pull public.ecr.aws/opensearchproject/opensearch:2 +docker pull public.ecr.aws/opensearchproject/opensearch:{{ site.opensearch_version | split: "." 
| first }} ``` {% include copy.html %} ```bash -docker pull public.ecr.aws/opensearchproject/opensearch-dashboards:2 +docker pull public.ecr.aws/opensearchproject/opensearch-dashboards:{{ site.opensearch_version | split: "." | first }} ``` {% include copy.html %} @@ -142,7 +142,7 @@ Before continuing, you should verify that Docker is working correctly by deployi ``` {% include copy.html %} -Remember that `docker container ls` does not list stopped containers. If you would like to review stopped containers, use `docker container ls -a`. You can remove unneeded containers manually with `docker container rm <containerId_1> <containerId_2> <containerId_3> [...]` (pass all container IDs you wish to stop, separated by spaces), or if you want to remove all stopped containers, you can use the shorter command `docker container prune`. +Remember that `docker container ls` does not list stopped containers. If you would like to review stopped containers, use `docker container ls -a`. You can remove unneeded containers manually with `docker container rm <containerId_1> <containerId_2> <containerId_3> [...]` (pass all container IDs you want to stop, separated by spaces), or if you want to remove all stopped containers, you can use the shorter command `docker container prune`. {: .tip} ## Deploy an OpenSearch cluster using Docker Compose @@ -160,7 +160,7 @@ If none of those files exist in your current directory, the `docker-compose` com You can specify a custom file location and name when invoking `docker-compose` with the `-f` flag: ```bash # Use a relative or absolute path to the file. -docker-compose -f /path/to/your-file.yml up +docker compose -f /path/to/your-file.yml up ``` If this is your first time launching an OpenSearch cluster using Docker Compose, use the following example `docker-compose.yml` file. Save it in the home directory of your host and name it `docker-compose.yml`. This file creates a cluster that contains three containers: two containers running the OpenSearch service and a single container running OpenSearch Dashboards. These containers communicate over a bridge network called `opensearch-net` and use two volumes, one for each OpenSearch node. Because this file does not explicitly disable the demo security configuration, self-signed TLS certificates are installed and internal users with default names and passwords are created. @@ -177,10 +177,35 @@ Starting with OpenSearch 2.12, a custom admin password is required to set up a d - Create an `.env` file in the same folder as your `docker-compose.yml` file with the `OPENSEARCH_INITIAL_ADMIN_PASSWORD` and a strong password value. +### Password requirements + +OpenSearch enforces strong password security by default, using the [`zxcvbn`](https://github.com/dropbox/zxcvbn) password strength estimation library developed by Dropbox. + +This library evaluates passwords based on entropy, rather than rigid complexity rules, using the following guidelines: + +- **Focus on entropy, not only rules**: Instead of only adding numbers or special characters, prioritize overall unpredictability. Longer passwords composed of random words or characters provide higher entropy, making them more secure than short passwords that meet conventional complexity rules. + +- **Avoid common patterns and dictionary words**: The `zxcvbn` library detects commonly used words, dates, sequences (for example, `1234` or `qwerty`), and even predictable character substitutions (for example, `3` for `E`). 
To ensure strong security, avoid using these patterns in your passwords. + +- **Length matters**: Longer passwords generally offer greater security. For example, a passphrase such as `correct horse battery staple` is considered to be strong because of its length and randomness, even though it does not contain special characters or numbers. + +- **Unpredictability is key**: Whether you choose a string of random characters or a passphrase made of unrelated words, the key to password security is unpredictability. Higher entropy significantly increases the number of required guesses, making the password more resistant to attacks. + +To learn more about `zxcvbn`, see [this Dropbox blog post](https://dropbox.tech/security/zxcvbn-realistic-password-strength-estimation). To experiment with password strength, use [this demo](https://lowe.github.io/tryzxcvbn). +{: .tip} + +OpenSearch uses the following default password requirements: + +- Minimum password length: 8 characters. +- Maximum password length: 100 characters. +- No requirements for special characters, numbers, or uppercase letters. +- Passwords must be rated `strong` using the `zxcvbn` entropy-based calculation. + +You can customize the default password requirements by updating the [password cluster settings]({{site.url}}{{site.baseurl}}/security/configuration/yaml/#password-settings). + ### Sample docker-compose.yml ```yml -version: '3' services: opensearch-node1: # This is also the hostname of the container within the Docker network (i.e. https://opensearch-node1/) image: opensearchproject/opensearch:latest # Specifying the latest available image - modify if you want a specific version @@ -255,35 +280,35 @@ If you override `opensearch_dashboards.yml` settings using environment variables From the home directory of your host (containing `docker-compose.yml`), create and start the containers in detached mode: ```bash -docker-compose up -d +docker compose up -d ``` {% include copy.html %} Verify that the service containers started correctly: ```bash -docker-compose ps +docker compose ps ``` {% include copy.html %} If a container failed to start, you can review the service logs: ```bash -# If you don't pass a service name, docker-compose will show you logs from all of the nodes -docker-compose logs <serviceName> +# If you don't pass a service name, docker compose will show you logs from all of the nodes +docker compose logs <serviceName> ``` {% include copy.html %} -Verify access to OpenSearch Dashboards by connecting to http://localhost:5601 from a browser. The default username and password are `admin`. We do not recommend using this configuration on hosts that are accessible from the public internet until you have customized the security configuration of your deployment. +Verify access to OpenSearch Dashboards by connecting to http://localhost:5601 from a browser. For OpenSearch 2.12 and later, you must use your configured username and password. For earlier versions, the default username and password are `admin`. We do not recommend using this configuration on hosts that are accessible from the public internet until you have customized the security configuration of your deployment. Remember that `localhost` cannot be accessed remotely. If you are deploying these containers to a remote host, then you will need to establish a network connection and replace `localhost` with the IP or DNS record corresponding to the host. 
{: .note} Stop the running containers in your cluster: ```bash -docker-compose down +docker compose down ``` {% include copy.html %} -`docker-compose down` will stop the running containers, but it will not remove the Docker volumes that exist on the host. If you don't care about the contents of these volumes, use the `-v` option to delete all volumes, for example, `docker-compose down -v`. +`docker compose down` will stop the running containers, but it will not remove the Docker volumes that exist on the host. If you don't care about the contents of these volumes, use the `-v` option to delete all volumes, for example, `docker compose down -v`. {: .tip} ## Configure OpenSearch @@ -331,7 +356,6 @@ services: If you want to build your own compose file from an example, review the following sample `docker-compose.yml` file. This sample file creates two OpenSearch nodes and one OpenSearch Dashboards node with the Security plugin disabled. You can use this sample file as a starting point while reviewing [Configuring basic security settings](#configuring-basic-security-settings). ```yml -version: '3' services: opensearch-node1: image: opensearchproject/opensearch:latest @@ -475,7 +499,7 @@ Use the same process to specify a [Backend configuration]({{site.url}}{{site.bas After replacing the certificates and creating your own internal users, roles, mappings, action groups, and tenants, use Docker Compose to start the cluster: ```bash -docker-compose up -d +docker compose up -d ``` {% include copy.html %} diff --git a/_install-and-configure/install-opensearch/index.md b/_install-and-configure/install-opensearch/index.md index 94c259667a0..bfaf9897d62 100644 --- a/_install-and-configure/install-opensearch/index.md +++ b/_install-and-configure/install-opensearch/index.md @@ -29,7 +29,7 @@ The OpenSearch distribution for Linux ships with a compatible [Adoptium JDK](htt OpenSearch Version | Compatible Java Versions | Bundled Java Version :---------- | :-------- | :----------- 1.0--1.2.x | 11, 15 | 15.0.1+9 -1.3.x | 8, 11, 14 | 11.0.24+8 +1.3.x | 8, 11, 14 | 11.0.25+9 2.0.0--2.11.x | 11, 17 | 17.0.2+8 2.12.0+ | 11, 17, 21 | 21.0.5+11 diff --git a/_install-and-configure/install-opensearch/rpm.md b/_install-and-configure/install-opensearch/rpm.md index 85872b7c349..9721caad864 100644 --- a/_install-and-configure/install-opensearch/rpm.md +++ b/_install-and-configure/install-opensearch/rpm.md @@ -40,7 +40,7 @@ This guide assumes that you are comfortable working from the Linux command line 1. Import the public GNU Privacy Guard (GPG) key. This key verifies that your OpenSearch instance is signed. ```bash - sudo rpm --import https://artifacts.opensearch.org/publickeys/opensearch.pgp + sudo rpm --import https://artifacts.opensearch.org/publickeys/opensearch-release.pgp ``` {% include copy.html %} diff --git a/_install-and-configure/os-comp.md b/_install-and-configure/os-comp.md index a62b82b7da2..dee2d72f1d4 100644 --- a/_install-and-configure/os-comp.md +++ b/_install-and-configure/os-comp.md @@ -15,7 +15,7 @@ OS | Version Rocky Linux | 8 Alma Linux | 8 Amazon Linux | 2/2023 -Ubuntu | 20.04 +Ubuntu | 24.04 Windows Server | 2019 @@ -27,6 +27,7 @@ The following table lists changes made to operating system compatibility. 
| Date | Issue | PR | Details | |:-----------|:-------|:-------|:--------------------------| -| 2024-07-23 | [opensearch-build Issue 4379](https://github.com/opensearch-project/opensearch-build/issues/4379) | [PR 7821](https://github.com/opensearch-project/documentation-website/pull/7821) | Remove [CentOS7](https://blog.centos.org/2023/04/end-dates-are-coming-for-centos-stream-8-and-centos-linux-7/). | +| 2025-02-06 | [opensearch-build Issue 5270](https://github.com/opensearch-project/opensearch-build/issues/5270) | [PR 9165](https://github.com/opensearch-project/documentation-website/pull/9165) | Remove [Ubuntu 20.04](https://ubuntu.com/blog/ubuntu-20-04-lts-end-of-life-standard-support-is-coming-to-an-end-heres-how-to-prepare) | +| 2024-07-23 | [opensearch-build Issue 4379](https://github.com/opensearch-project/opensearch-build/issues/4379) | [PR 7821](https://github.com/opensearch-project/documentation-website/pull/7821) | Remove [CentOS7](https://blog.centos.org/2023/04/end-dates-are-coming-for-centos-stream-8-and-centos-linux-7/) | | 2024-03-08 | [opensearch-build Issue 4573](https://github.com/opensearch-project/opensearch-build/issues/4573) | [PR 6637](https://github.com/opensearch-project/documentation-website/pull/6637) | Remove CentOS8, add Almalinux8/Rockylinux8, and remove Ubuntu 16.04/18.04 because we currently only test on 20.04 | -| 2023-06-06 | [documentation-website Issue 4217](https://github.com/opensearch-project/documentation-website/issues/4217) | [PR 4218](https://github.com/opensearch-project/documentation-website/pull/4218) | Support matrix creation | \ No newline at end of file +| 2023-06-06 | [documentation-website Issue 4217](https://github.com/opensearch-project/documentation-website/issues/4217) | [PR 4218](https://github.com/opensearch-project/documentation-website/pull/4218) | Support matrix creation | diff --git a/_install-and-configure/plugins.md b/_install-and-configure/plugins.md index e96b29e8228..bb0b043b1e1 100644 --- a/_install-and-configure/plugins.md +++ b/_install-and-configure/plugins.md @@ -10,9 +10,9 @@ redirect_from: # Installing plugins -OpenSearch comprises of a number of plugins that add features and capabilities to the core platform. The plugins available to you are dependent on how OpenSearch was installed and which plugins were subsequently added or removed. For example, the minimal distribution of OpenSearch enables only core functionality, such as indexing and search. Using the minimal distribution of OpenSearch is beneficial when you are working in a testing environment, have custom plugins, or are intending to integrate OpenSearch with other services. +OpenSearch includes a number of plugins that add features and capabilities to the core platform. The plugins available to you are dependent on how OpenSearch was installed and which plugins were subsequently added or removed. For example, the minimal distribution of OpenSearch enables only core functionality, such as indexing and search. Using the minimal distribution of OpenSearch is beneficial when you are working in a testing environment, have custom plugins, or are intending to integrate OpenSearch with other services. -The standard distribution of OpenSearch has much more functionality included. You can choose to add additional plugins or remove any of the plugins you don't need. +The standard distribution of OpenSearch includes many more plugins offering much more functionality. You can choose to add additional plugins or remove any of the plugins you don't need. 
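For example, assuming a default installation layout, you can inspect and adjust the installed plugins with the bundled `opensearch-plugin` tool; the plugin name shown here is only an example:

```bash
# List the plugins currently installed on this node.
./bin/opensearch-plugin list

# Remove a plugin you don't need (example plugin name).
./bin/opensearch-plugin remove opensearch-notifications
```

Restart the node after adding or removing a plugin so that the change takes effect.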
For a list of the available plugins, see [Available plugins](#available-plugins). @@ -289,6 +289,7 @@ The following plugins are bundled with all OpenSearch distributions except for t | Index Management | [opensearch-index-management](https://github.com/opensearch-project/index-management) | 1.0.0 | | Job Scheduler | [opensearch-job-scheduler](https://github.com/opensearch-project/job-scheduler) | 1.0.0 | | k-NN | [opensearch-knn](https://github.com/opensearch-project/k-NN) | 1.0.0 | +| Learning to Rank | [opensearch-ltr](https://github.com/opensearch-project/opensearch-learning-to-rank-base) | 2.19.0 | | ML Commons | [opensearch-ml](https://github.com/opensearch-project/ml-commons) | 1.3.0 | | Skills | [opensearch-skills](https://github.com/opensearch-project/skills) | 2.12.0 | | Neural Search | [neural-search](https://github.com/opensearch-project/neural-search) | 2.4.0 | @@ -297,11 +298,42 @@ The following plugins are bundled with all OpenSearch distributions except for t | Security | [opensearch-security](https://github.com/opensearch-project/security) | 1.0.0 | | Security Analytics | [opensearch-security-analytics](https://github.com/opensearch-project/security-analytics) | 2.4.0 | | SQL | [opensearch-sql](https://github.com/opensearch-project/sql) | 1.0.0 | +| Learning to Rank Base | [opensearch-learning-to-rank-base](https://github.com/opensearch-project/opensearch-learning-to-rank-base) | 2.19.0 | +| Remote Metadata SDK | [opensearch-remote-metadata-sdk](https://github.com/opensearch-project/opensearch-remote-metadata-sdk) | 2.19.0 | +| Query Insights | [query-insights](https://github.com/opensearch-project/query-insights) | 2.16.0 | +| System Templates | [opensearch-system-templates](https://github.com/opensearch-project/opensearch-system-templates) | 2.17.0 | _<sup>1</sup>Dashboard Notebooks was merged in to the Observability plugin with the release of OpenSearch 1.2.0._<br> _<sup>2</sup>Performance Analyzer is not available on Windows._ +#### Downloading bundled plugins for offline installation + +Each bundled plugin can be downloaded and installed offline from a [zip file](#install-a-plugin-from-a-zip-file). + +The URL for the corresponding plugin can be found in the `manifest.yml` file located in the root directory of the extracted bundle. + +### Core plugins + +A _core_ (or _native_) plugin in OpenSearch is a plugin that resides in the [OpenSearch core engine repository](https://github.com/opensearch-project/OpenSearch/tree/main/plugins). These plugins are tightly integrated with the OpenSearch engine, are versioned alongside core releases, and are not bundled by default in the standard OpenSearch distribution. + + +#### Downloading core plugins for offline installation + +Each core plugin in [this list](https://github.com/opensearch-project/OpenSearch/tree/main/plugins) can be downloaded and installed offline from a [zip file](#install-a-plugin-from-a-zip-file) using the official `plugins` repository URL template: + +```html +https://artifacts.opensearch.org/releases/plugins/<plugin-name>/<version>/<plugin-name>-<version>.zip +``` + +The `<plugin-name>` corresponds to the name of the core plugin (for example, `analysis-icu`). The `<version>` must match the version of the OpenSearch distribution (for example, `2.19.1`).
+ +For example, use the following URL to download the `analysis-icu` bundled plugin distribution for OpenSearch version `2.19.1`: + +``` +https://artifacts.opensearch.org/releases/plugins/analysis-icu/2.19.1/analysis-icu-2.19.1.zip +``` + ### Additional plugins There are many more plugins available in addition to those provided by the default distribution. These additional plugins have been built by OpenSearch developers or members of the OpenSearch community. For a list of additional plugins you can install, see [Additional plugins]({{site.url}}{{site.baseurl}}/install-and-configure/additional-plugins/index/). @@ -333,6 +365,7 @@ You can specify only one of the `opensearch.version` or `dependencies` propertie - [Cross-cluster replication]({{site.url}}{{site.baseurl}}/replication-plugin/index/) - [Index State Management]({{site.url}}{{site.baseurl}}/im-plugin/ism/index/) - [k-NN search]({{site.url}}{{site.baseurl}}/search-plugins/knn/index/) +- [Learning to Rank]({{site.url}}{{site.baseurl}}/search-plugins/ltr/index/) - [ML Commons]({{site.url}}{{site.baseurl}}/ml-commons-plugin/index/) - [Neural search]({{site.url}}{{site.baseurl}}/neural-search-plugin/index/) - [Notifications]({{site.url}}{{site.baseurl}}/notifications-plugin/index/) diff --git a/_install-and-configure/upgrade-opensearch/index.md b/_install-and-configure/upgrade-opensearch/index.md index 99265dc94ea..8f13ea18422 100644 --- a/_install-and-configure/upgrade-opensearch/index.md +++ b/_install-and-configure/upgrade-opensearch/index.md @@ -9,9 +9,9 @@ redirect_from: # Upgrading OpenSearch -The OpenSearch Project releases regular updates that include new features, enhancements, and bug fixes. OpenSearch uses [Semantic Versioning](https://semver.org/), which means that breaking changes are only introduced between major version releases. To learn about upcoming features and fixes, review the [OpenSearch Project Roadmap](https://github.com/orgs/opensearch-project/projects/1) on GitHub. To view a list of previous releases or to learn more about how OpenSearch uses versioning, see [Release Schedule and Maintenance Policy]({{site.url}}/releases.html). +The OpenSearch Project releases regular updates that include new features, enhancements, and bug fixes. OpenSearch uses [Semantic Versioning](https://semver.org/), which means that breaking changes are only introduced between major version releases. To learn about upcoming features and fixes, review the [OpenSearch Project Roadmap](https://github.com/orgs/opensearch-project/projects/206) on GitHub. To view a list of previous releases or to learn more about how OpenSearch uses versioning, see [Release Schedule and Maintenance Policy]({{site.url}}/releases.html). -We recognize that users are excited about upgrading OpenSearch in order to enjoy the latest features, and we will continue to expand on these upgrade and migration documents to cover additional topics, such as upgrading OpenSearch Dashboards and preserving custom configurations, such as for plugins. To see what's coming next or to make a request for future content, leave a comment on the [upgrade and migration documentation meta issue](https://github.com/opensearch-project/documentation-website/issues/2830) in the [OpenSearch Project](https://github.com/opensearch-project) on GitHub. 
+We recognize that users are excited about upgrading OpenSearch in order to enjoy the latest features, and we will continue to expand on these upgrade and migration documents to cover additional topics, such as upgrading OpenSearch Dashboards and preserving custom configurations, such as for plugins. If you would like a specific process to be added or would like to contribute, [create an issue](https://github.com/opensearch-project/documentation-website/issues) on GitHub. See the [Contributor Guidelines](https://github.com/opensearch-project/documentation-website/blob/main/CONTRIBUTING.md) to learn how you can help. {: .tip} diff --git a/_install-and-configure/upgrade-opensearch/rolling-upgrade.md b/_install-and-configure/upgrade-opensearch/rolling-upgrade.md index 1e4145e7ba2..f1cf13d25bf 100644 --- a/_install-and-configure/upgrade-opensearch/rolling-upgrade.md +++ b/_install-and-configure/upgrade-opensearch/rolling-upgrade.md @@ -1,11 +1,11 @@ --- layout: default -title: Rolling Upgrade +title: Rolling upgrade parent: Upgrading OpenSearch nav_order: 10 --- -# Rolling Upgrade +# Rolling upgrade Rolling upgrades, sometimes referred to as "node replacement upgrades," can be performed on running clusters with virtually no downtime. Nodes are individually stopped and upgraded in place. Alternatively, nodes can be stopped and replaced, one at a time, by hosts running the new version. During this process you can continue to index and query data in your cluster. @@ -195,6 +195,62 @@ Review [Upgrading OpenSearch]({{site.url}}{{site.baseurl}}/upgrade-opensearch/in ``` 1. The upgrade is now complete, and you can begin enjoying the latest features and fixes! +# Rolling restart + +A rolling restart follows the same step-by-step procedure as a rolling upgrade, with the exception of upgrading of actual nodes. During rolling restart, nodes are restarted one at a time—typically to apply configuration changes, refresh certificates, or perform system-level maintenance—without disrupting cluster availability. + +To perform a rolling restart, follow the steps outlined in [Rolling upgrade](#rolling-upgrade), excluding the steps that involve upgrading the OpenSearch binary or container image: + +1. **Check cluster health** + Ensure the cluster status is green and all shards are assigned. + _(Rolling upgrade step 1)_ + +2. **Disable shard allocation** + Prevent OpenSearch from trying to reallocate shards while nodes are offline. + _(Rolling upgrade step 2)_ + +3. **Flush transaction logs** + Commit recent operations to Lucene to reduce recovery time. + _(Rolling upgrade step 3)_ + +4. **Review and identify the next node to restart** + Ensure you restart the current cluster manager node last. + _(Rolling upgrade step 4)_ + +5. **Check which node is the current cluster manager** + Use the `_cat/nodes` API to determine which node is the current active cluster manager. + _(Rolling upgrade step 5)_ + +6. **Stop the node** + Shut down the node gracefully. Do not delete the associated data volume. + _(Rolling upgrade step 6)_ + +7. **Confirm the node has left the cluster** + Use `_cat/nodes` to verify that it's no longer listed. + _(Rolling upgrade step 7)_ + +8. **Restart the node** + Start the same node (same binary/version/config) and let it rejoin the cluster. + _(Rolling upgrade step 8 — without upgrading the binary)_ + +9. **Verify that the restarted node has rejoined** + Check `_cat/nodes` to confirm that the node is present and healthy. + _(Rolling upgrade step 9)_ + +10. 
**Reenable shard allocation** + Restore full shard movement capability. + _(Rolling upgrade step 10)_ + +11. **Confirm cluster health is green** + Validate stability before restarting the next node. + _(Rolling upgrade step 11)_ + +12. **Repeat the process for all other nodes** + Restart each node one at a time. If a node is eligible for the cluster manager role, restart it last. + _(Rolling upgrade step 12 — again, no upgrade step)_ + +By preserving quorum and restarting nodes sequentially, rolling restarts ensure zero downtime and full data continuity. + ### Related articles - [OpenSearch configuration]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/) diff --git a/_layouts/default.html b/_layouts/default.html index d4d40d8cc48..44adf857999 100755 --- a/_layouts/default.html +++ b/_layouts/default.html @@ -52,7 +52,7 @@ <div id="main-header"></div> <div class="side-bar"> <div class="site-header"> - <a href="/docs/latest/" id="menu-button" class="site-button"> + <a href="{{site.latesturl}}/" id="menu-button" class="site-button"> Documentation <svg viewBox="0 0 24 24" class="icon"><use xlink:href="#svg-grid"></use></svg> </a> </div> @@ -64,7 +64,7 @@ <a href="{{site.url}}{{site.baseurl}}/{{ page.section}}/" class="site-category">{{ page.section-name }}</a> {% endif %} - <a href="{{site.url}}/docs/latest/" class="back-link">← Back to docs home</a> + <a href="{{site.url}}{{site.latesturl}}/" class="back-link">← Back to docs home</a> {% if page.section == "opensearch" %} <version-selector selected="{{ site.data.versions.current }}"></version-selector> @@ -87,6 +87,8 @@ {% assign section = site.clients_collection.collections %} {% elsif page.section == "benchmark" %} {% assign section = site.benchmark_collection.collections %} + {% elsif page.section == "migration-assistant" %} + {% assign section = site.migration_assistant_collection.collections %} {% endif %} {% if section %} diff --git a/_migration-assistant/assets/css/breaking-changes-selector.css b/_migration-assistant/assets/css/breaking-changes-selector.css new file mode 100644 index 00000000000..f01ee66af8f --- /dev/null +++ b/_migration-assistant/assets/css/breaking-changes-selector.css @@ -0,0 +1,55 @@ +.breaking-changes-selector { + background: #f5f7f7; + padding: 15px; + border-radius: 4px; + margin-bottom: 20px; +} + +.breaking-changes-selector div { + margin: 10px 0; +} + +.breaking-changes-selector label { + margin-right: 8px; +} + +.breaking-changes-selector select { + margin-right: 15px; + padding: 4px; +} + +.breaking-changes-selector input[type="checkbox"] { + margin-right: 4px; +} + +#breaking-changes-results { + margin-top: 15px; +} + +#breaking-changes-results ul { + margin-top: 10px; +} + +.transformation-info { + margin-left: 20px; + padding: 5px 10px; + background-color: #e6f7ff; + border-left: 3px solid #1890ff; + margin-top: 5px; + margin-bottom: 5px; + font-size: 0.9em; +} + +.component-checkbox { + display: inline-block; + margin-left: 20px; +} + +.transformation-request { + margin-top: 15px; + padding: 10px; + background-color: #f0f0f0; + border-radius: 4px; + font-size: 0.9em; + font-style: italic; +} diff --git a/_migration-assistant/assets/js/breaking-changes-data.js b/_migration-assistant/assets/js/breaking-changes-data.js new file mode 100644 index 00000000000..e00f7269731 --- /dev/null +++ b/_migration-assistant/assets/js/breaking-changes-data.js @@ -0,0 +1,195 @@ +/** + * Breaking changes data for migration paths + * + * Data structure: + * - VERSIONS: Array of version strings 
derived from valid_migrations.yml + * - VERSION_ORDER: Array of version strings in strict order (derived from VERSIONS) + * - breakingChanges: Array of breaking change objects with: + * - title: Display name of the breaking change + * - url: Link to documentation + * - introducedIn: Version where the breaking change was introduced + * - affects (optional): Object with minSource and maxTarget versions + * - minSource: Minimum source version affected + * - maxTarget: Maximum target version affected + * - comp: Array of components affected + * - transformation (optional): Optional object with transformation information + */ + +// Variables to store version ordering +let VERSION_ORDER = []; + +// Version utility functions that use the dynamically determined version order +export function getVersionIndex(version) { + const index = VERSION_ORDER.indexOf(version); + if (index === -1) throw new Error(`Unknown version: ${version}`); + return index; +} + +export function isVersionBetween(target, min, max) { + const targetIdx = getVersionIndex(target); + const minIdx = getVersionIndex(min); + const maxIdx = getVersionIndex(max); + return targetIdx >= minIdx && targetIdx <= maxIdx; +} + +// Helper function to determine version ordering based on migration paths +function determineVersionOrder(versions, migrationMap) { + // Start with a copy of all versions + const remainingVersions = [...versions]; + const orderedVersions = []; + + // First, find versions that are only sources (not targets) + // These are the oldest versions + const onlySources = versions.filter(v => + !Object.values(migrationMap.targetToSources).flat().includes(v) && + migrationMap.sourceToTargets[v] + ); + + // Add oldest versions first + onlySources.forEach(v => { + orderedVersions.push(v); + const index = remainingVersions.indexOf(v); + if (index !== -1) remainingVersions.splice(index, 1); + }); + + // Then add the rest based on migration paths + while (remainingVersions.length > 0) { + let added = false; + + for (let i = 0; i < remainingVersions.length; i++) { + const version = remainingVersions[i]; + const sources = migrationMap.targetToSources[version] || []; + + // If all sources of this version are already in orderedVersions, + // we can add this version next + if (sources.every(s => orderedVersions.includes(s))) { + orderedVersions.push(version); + remainingVersions.splice(i, 1); + added = true; + break; + } + } + + // If we couldn't add any version in this pass, there might be a cycle + // Just add the first remaining version to break the cycle + if (!added && remainingVersions.length > 0) { + orderedVersions.push(remainingVersions[0]); + remainingVersions.splice(0, 1); + } + } + + return orderedVersions; +} + +// Variables to store migration data +let VALID_MIGRATIONS = {}; +let VERSIONS = []; +let MIGRATION_MAP = { + sourceToTargets: {}, + targetToSources: {} +}; + +// Variables to store breaking changes data +let breakingChanges = []; + +// Extract all unique versions for dropdowns +function extractUniqueVersions(migrationsMap) { + const versions = new Set(); + + // Add all sources + Object.keys(migrationsMap).forEach(source => { + versions.add(source); + }); + + // Add all targets + Object.values(migrationsMap).forEach(targets => { + targets.forEach(target => { + versions.add(target); + }); + }); + + return Array.from(versions).sort(); +} + +// Function to generate the reverse mapping (target -> sources) +function generateReverseMigrationMap(forwardMap) { + const reverseMap = {}; + + 
Object.entries(forwardMap).forEach(([source, targets]) => { + targets.forEach(target => { + if (!reverseMap[target]) { + reverseMap[target] = []; + } + reverseMap[target].push(source); + }); + }); + + return reverseMap; +} + +// Function to initialize the migration data from the data attribute +function initializeMigrationData() { + const migrationDataElement = document.getElementById('migration-data'); + + if (migrationDataElement && migrationDataElement.dataset.migrationPaths) { + try { + // Parse the JSON data from the data attribute + const migrationPaths = JSON.parse(migrationDataElement.dataset.migrationPaths); + + // Transform the data structure from YAML format to the expected JavaScript object format + VALID_MIGRATIONS = migrationPaths.reduce((acc, path) => { + acc[path.source] = path.targets; + return acc; + }, {}); + + // Now that we have the migration data, create the derived data structures + VERSIONS = extractUniqueVersions(VALID_MIGRATIONS); + MIGRATION_MAP = { + sourceToTargets: VALID_MIGRATIONS, + targetToSources: generateReverseMigrationMap(VALID_MIGRATIONS) + }; + + // Determine version ordering based on migration paths + VERSION_ORDER = determineVersionOrder(VERSIONS, MIGRATION_MAP); + console.log('Determined version order:', VERSION_ORDER); + } catch (error) { + console.error('Failed to parse migration data:', error); + // Fallback to empty object if parsing fails + VALID_MIGRATIONS = {}; + VERSIONS = []; + MIGRATION_MAP = { sourceToTargets: {}, targetToSources: {} }; + VERSION_ORDER = []; + } + } else { + console.error('Migration data element not found or empty'); + } +} + +// Function to initialize the breaking changes data from the data attribute +function initializeBreakingChangesData() { + const migrationDataElement = document.getElementById('migration-data'); + + if (!migrationDataElement || !migrationDataElement.dataset.breakingChanges) { + console.error('Breaking changes data not found in migration-data element. 
Make sure to add data-breaking-changes attribute.'); + return; + } + + try { + // Parse the JSON data from the data attribute + breakingChanges = JSON.parse(migrationDataElement.dataset.breakingChanges); + console.log('Loaded breaking changes data:', breakingChanges.length); + } catch (error) { + console.error('Failed to parse breaking changes data:', error); + // Fallback to empty array if parsing fails + breakingChanges = []; + } +} + +// Initialize the data when the DOM is loaded +document.addEventListener('DOMContentLoaded', () => { + initializeMigrationData(); + initializeBreakingChangesData(); +}); + +// Export the breaking changes array for use in other modules +export { breakingChanges }; diff --git a/_migration-assistant/assets/js/breaking-changes-filter.js b/_migration-assistant/assets/js/breaking-changes-filter.js new file mode 100644 index 00000000000..63eb00ac260 --- /dev/null +++ b/_migration-assistant/assets/js/breaking-changes-filter.js @@ -0,0 +1,192 @@ +/** + * Breaking changes filter - displays relevant breaking changes based on selected versions and components + * + * Features: + * - Dynamically generates dropdowns and checkboxes from configuration arrays + * - Automatically checks all component checkboxes by default + * - Filters breaking changes based on source version, target version, and components + * - Displays results in a formatted list + * - Supports bidirectional filtering of source and target versions + */ +import { getVersionIndex, breakingChanges } from './breaking-changes-data.js'; +import BreakingChangesUI from './breaking-changes-ui.js'; +document.addEventListener('DOMContentLoaded', () => { + // Wait for migration data to be initialized + if (typeof initializeMigrationData === 'function') { + initializeMigrationData(); + } + + // Cache DOM elements + const src = document.getElementById('source-version'); + const tgt = document.getElementById('target-version'); + const componentContainer = document.getElementById('component-checkboxes'); + const results = document.getElementById('breaking-changes-results'); + + // Flag to prevent infinite loops when updating dropdowns + let isUpdating = false; + + // Helper function to create dropdown options + const createOption = (value, text) => { + const option = document.createElement('option'); + option.value = value; + option.textContent = text; + return option; + }; + + // Helper function to populate a dropdown with versions + const populateDropdown = (dropdown, versions, placeholder = 'Select Version') => { + const currentValue = dropdown.value; + + while (dropdown.firstChild) { + dropdown.removeChild(dropdown.firstChild); + } + + dropdown.appendChild(createOption('', placeholder)); + + versions.forEach(version => { + dropdown.appendChild(createOption(version, version)); + }); + + if (currentValue && versions.includes(currentValue)) { + dropdown.value = currentValue; + } + }; + + // Get all available version strings + const allVersions = VERSIONS; + + // Populate source dropdown with all versions initially + populateDropdown(src, allVersions, 'Select Source'); + + // Populate target dropdown with all versions initially + populateDropdown(tgt, allVersions, 'Select Target'); + + // Function to update target dropdown based on selected source + const updateTargetDropdown = () => { + if (isUpdating) return; + isUpdating = true; + + const selectedSrc = src.value; + + if (selectedSrc) { + // Get valid targets for this source + const validTargets = MIGRATION_MAP.sourceToTargets[selectedSrc] || []; + 
populateDropdown(tgt, validTargets, 'Select Target'); + } else { + // If no source selected, show all possible targets + populateDropdown(tgt, allVersions, 'Select Target'); + } + + isUpdating = false; + updateResults(); + }; + + // Function to update source dropdown based on selected target + const updateSourceDropdown = () => { + if (isUpdating) return; + isUpdating = true; + + const selectedTgt = tgt.value; + + if (selectedTgt) { + // Get valid sources for this target + const validSources = MIGRATION_MAP.targetToSources[selectedTgt] || []; + populateDropdown(src, validSources, 'Select Source'); + } else { + // If no target selected, show all possible sources + populateDropdown(src, allVersions, 'Select Source'); + } + + isUpdating = false; + updateResults(); + }; + + // Add event listeners for dropdown changes + src.addEventListener('change', updateTargetDropdown); + tgt.addEventListener('change', updateSourceDropdown); + + // Only show Dashboards as an optional component + const span = document.createElement('span'); + span.className = 'component-checkbox'; + + const checkbox = document.createElement('input'); + checkbox.type = 'checkbox'; + checkbox.id = 'component-dashboards'; + checkbox.value = 'dashboards'; + checkbox.checked = false; // Unchecked by default + + const label = document.createElement('label'); + label.htmlFor = 'component-dashboards'; + label.textContent = 'Dashboards'; + + span.appendChild(checkbox); + span.appendChild(label); + componentContainer.appendChild(span); + + // Get all checkboxes + const checkboxes = document.querySelectorAll('#component-checkboxes input[type="checkbox"]'); + + // Update results based on selected filters + const updateResults = () => { + const selectedSrc = src.value; + const selectedTgt = tgt.value; + + // Show message if source or target not selected + if (!selectedSrc || !selectedTgt) { + results.innerHTML = '<p>Select both source and target versions to see breaking changes.</p>'; + return; + } + + const selectedComp = Array.from(checkboxes) + .filter(cb => cb.checked) + .map(cb => cb.value); + + // Check if source and target are the same + if (selectedSrc === selectedTgt) { + results.innerHTML = '<p>No breaking changes when source and target versions are the same.</p>'; + return; + } + + // Filter breaking changes based on selection + const relevantChanges = breakingChanges.filter(change => { + // Check if the breaking change applies to this migration path + const sourceVersionIdx = getVersionIndex(selectedSrc); + const targetVersionIdx = getVersionIndex(selectedTgt); + const introducedInIdx = getVersionIndex(change.introducedIn); + + // A breaking change applies if: + // 1. The breaking change was introduced in a version that is between source and target (inclusive of target) + // 2. The source version is at or after the minimum source version affected + // 3. The target version is at or before the maximum target version affected + + // Handle optional affects field by using defaults if not present + const minSource = change.affects && change.affects.minSource ? change.affects.minSource : VERSIONS[0]; // Default to oldest version + const maxTarget = change.affects && change.affects.maxTarget ? 
change.affects.maxTarget : VERSIONS[VERSIONS.length - 1]; // Default to newest version + + const versionMatch = + introducedInIdx <= targetVersionIdx && // Breaking change was introduced at or before target + introducedInIdx > sourceVersionIdx && // Breaking change was introduced after source + sourceVersionIdx < targetVersionIdx && // Valid migration path (source before target) + sourceVersionIdx >= getVersionIndex(minSource) && // Source is affected + targetVersionIdx <= getVersionIndex(maxTarget); // Target is affected + + // For component filtering: + // - Always include changes with empty comp array (default/data components) + // - Only include dashboard components if the checkbox is checked + const componentMatch = + change.comp.length === 0 || // Include changes with no specific component (default/data) + (change.comp.includes('dashboards') && selectedComp.includes('dashboards')); // Only include dashboards if selected + + return versionMatch && componentMatch; + }); + + // Use the UI module's displayResults function to avoid duplication + BreakingChangesUI.UIManager.displayResults(relevantChanges, results); + }; + + // Add event listeners for checkboxes + checkboxes.forEach(el => el.addEventListener('change', updateResults)); + + // Initialize results + updateResults(); +}); diff --git a/_migration-assistant/assets/js/breaking-changes-index.js b/_migration-assistant/assets/js/breaking-changes-index.js new file mode 100644 index 00000000000..2198437f6e0 --- /dev/null +++ b/_migration-assistant/assets/js/breaking-changes-index.js @@ -0,0 +1,70 @@ +/** + * Breaking Changes Index + * + * Main entry point for the breaking changes functionality + * Initializes modules and wires everything together + */ + +import BreakingChangesModule from './breaking-changes-module.js'; +import BreakingChangesUI from './breaking-changes-ui.js'; + +// Export initializeMigrationData function for compatibility with original code +export function initializeMigrationData() { + // Get migration data from the data attribute + const migrationDataElement = document.getElementById('migration-data'); + + if (!migrationDataElement) { + console.error('Migration data element not found'); + return; + } + + try { + // Parse the JSON data from the data attribute + const migrationPaths = JSON.parse(migrationDataElement.dataset.migrationPaths || '[]'); + + // Initialize the module with data + BreakingChangesModule.initialize(migrationPaths); + } catch (error) { + console.error('Failed to initialize migration data:', error); + } +} + +// Function to initialize the breaking changes functionality +function initializeBreakingChanges() { + // Get migration data from the data attribute + const migrationDataElement = document.getElementById('migration-data'); + + if (!migrationDataElement) { + console.error('Migration data element not found'); + return; + } + + try { + // Parse the JSON data from the data attribute + const migrationPaths = JSON.parse(migrationDataElement.dataset.migrationPaths || '[]'); + const breakingChangesData = JSON.parse(migrationDataElement.dataset.breakingChanges || '[]'); + + // Initialize the module with data + BreakingChangesModule.initialize(migrationPaths); + + // Initialize UI + BreakingChangesUI.UIManager.initialize(); + + console.log('Breaking changes functionality initialized successfully'); + } catch (error) { + console.error('Failed to initialize breaking changes functionality:', error); + } +} + +// Initialize on DOMContentLoaded +document.addEventListener('DOMContentLoaded', 
initializeBreakingChanges); + +// Also initialize on window load to handle browser back/forward navigation +window.addEventListener('pageshow', (event) => { + // The pageshow event is fired when the page is shown, including when navigating back to the page + // The persisted property is true if the page is being restored from the bfcache + if (event.persisted) { + console.log('Page restored from back-forward cache, reinitializing breaking changes'); + initializeBreakingChanges(); + } +}); diff --git a/_migration-assistant/assets/js/breaking-changes-module.js b/_migration-assistant/assets/js/breaking-changes-module.js new file mode 100644 index 00000000000..1b24e73abfc --- /dev/null +++ b/_migration-assistant/assets/js/breaking-changes-module.js @@ -0,0 +1,342 @@ +/** + * Breaking Changes Module + * + * Handles version management, breaking changes data, and filtering logic + * Provides a clean API for the UI layer to interact with + */ + +// Global variables for compatibility with existing code +// These will be initialized by the module and exported +export let VERSIONS = []; +export let VERSION_ORDER = []; +export let MIGRATION_MAP = { sourceToTargets: {}, targetToSources: {} }; +export let breakingChanges = []; + +// Export utility functions for compatibility with existing code +export function getVersionIndex(version) { + const index = VERSION_ORDER.indexOf(version); + if (index === -1) throw new Error(`Unknown version: ${version}`); + return index; +} + +export function isVersionBetween(target, min, max) { + const targetIdx = getVersionIndex(target); + const minIdx = getVersionIndex(min); + const maxIdx = getVersionIndex(max); + return targetIdx >= minIdx && targetIdx <= maxIdx; +} + +const BreakingChangesModule = (function() { + // Private variables + let versions = []; + let versionOrder = []; + let migrationMap = { sourceToTargets: {}, targetToSources: {} }; + + // Breaking changes data - will be loaded from the data attribute + let breakingChangesData = []; + + /** + * Version Manager - Handles all version-related operations + */ + const VersionManager = { + /** + * Get the index of a version in the ordered version array + * @param {string} version - The version to find + * @returns {number} The index of the version + */ + getVersionIndex(version) { + const index = versionOrder.indexOf(version); + if (index === -1) throw new Error(`Unknown version: ${version}`); + return index; + }, + + /** + * Check if a version is between two other versions (inclusive) + * @param {string} target - The version to check + * @param {string} min - The minimum version + * @param {string} max - The maximum version + * @returns {boolean} True if the target is between min and max + */ + isVersionBetween(target, min, max) { + const targetIdx = this.getVersionIndex(target); + const minIdx = this.getVersionIndex(min); + const maxIdx = this.getVersionIndex(max); + return targetIdx >= minIdx && targetIdx <= maxIdx; + }, + + /** + * Get all valid target versions for a given source version + * @param {string} source - The source version + * @returns {Array} Array of valid target versions + */ + getValidTargets(source) { + return migrationMap.sourceToTargets[source] || []; + }, + + /** + * Get all valid source versions for a given target version + * @param {string} target - The target version + * @returns {Array} Array of valid source versions + */ + getValidSources(target) { + return migrationMap.targetToSources[target] || []; + }, + + /** + * Get all available versions + * @returns {Array} Array of all 
versions + */ + getAllVersions() { + return versions; + }, + + /** + * Get the ordered list of versions + * @returns {Array} Array of versions in order + */ + getVersionOrder() { + return versionOrder; + } + }; + + /** + * Breaking Changes Manager - Handles breaking changes data and filtering + */ + const BreakingChangesManager = { + /** + * Get all breaking changes + * @returns {Array} Array of breaking changes + */ + getBreakingChanges() { + return breakingChangesData; + }, + + /** + * Filter breaking changes based on source, target, and components + * @param {string} source - The source version + * @param {string} target - The target version + * @param {Array} selectedComponents - Array of selected component names + * @returns {Array} Filtered breaking changes + */ + filterBreakingChanges(source, target, selectedComponents) { + if (!source || !target || source === target) { + return []; + } + + return breakingChangesData.filter(change => { + // Check if the breaking change applies to this migration path + const sourceVersionIdx = VersionManager.getVersionIndex(source); + const targetVersionIdx = VersionManager.getVersionIndex(target); + const introducedInIdx = VersionManager.getVersionIndex(change.introducedIn); + + // A breaking change applies if: + // 1. The breaking change was introduced in a version that is between source and target (inclusive of target) + // 2. The source version is at or after the minimum source version affected + // 3. The target version is at or before the maximum target version affected + + // Handle optional affects field by using defaults if not present + const minSource = change.affects && change.affects.minSource ? change.affects.minSource : versions[0]; // Default to oldest version + const maxTarget = change.affects && change.affects.maxTarget ? 
change.affects.maxTarget : versions[versions.length - 1]; // Default to newest version + + const versionMatch = + introducedInIdx <= targetVersionIdx && // Breaking change was introduced at or before target + introducedInIdx > sourceVersionIdx && // Breaking change was introduced after source + sourceVersionIdx < targetVersionIdx && // Valid migration path (source before target) + sourceVersionIdx >= VersionManager.getVersionIndex(minSource) && // Source is affected + targetVersionIdx <= VersionManager.getVersionIndex(maxTarget); // Target is affected + + // For component filtering: + // - Always include changes with empty comp array (default/data components) + // - Only include dashboard components if the checkbox is checked + const componentMatch = + change.comp.length === 0 || // Include changes with no specific component (default/data) + change.comp.some(comp => selectedComponents.includes(comp)); // Include if any component matches + + return versionMatch && componentMatch; + }); + }, + + /** + * Get all unique components from breaking changes + * @returns {Array} Array of unique component names + */ + getUniqueComponents() { + const components = new Set(); + breakingChangesData.forEach(change => { + change.comp.forEach(comp => components.add(comp)); + }); + return Array.from(components); + } + }; + + /** + * Initialize the module with migration data + * @param {Array} migrationPaths - Array of migration paths from YAML + */ + function initialize(migrationPaths) { + // Get the migration data element + const migrationDataElement = document.getElementById('migration-data'); + + if (!migrationDataElement) { + console.error('Migration data element not found'); + return; + } + + // Load breaking changes data if available + if (migrationDataElement.dataset.breakingChanges) { + try { + breakingChangesData = JSON.parse(migrationDataElement.dataset.breakingChanges); + console.log('Loaded breaking changes data:', breakingChangesData.length); + } catch (error) { + console.error('Failed to parse breaking changes data:', error); + breakingChangesData = []; + } + } else { + console.error('Breaking changes data not found in migration-data element'); + breakingChangesData = []; + } + try { + // Transform the data structure from YAML format to the expected JavaScript object format + const validMigrations = migrationPaths.reduce((acc, path) => { + acc[path.source] = path.targets; + return acc; + }, {}); + + // Extract unique versions + versions = extractUniqueVersions(validMigrations); + + // Generate migration maps + migrationMap = { + sourceToTargets: validMigrations, + targetToSources: generateReverseMigrationMap(validMigrations) + }; + + // Determine version ordering + versionOrder = determineVersionOrder(versions, migrationMap); + + // Update global variables for compatibility with existing code + VERSIONS = [...versions]; + VERSION_ORDER = [...versionOrder]; + MIGRATION_MAP = { + sourceToTargets: {...migrationMap.sourceToTargets}, + targetToSources: {...migrationMap.targetToSources} + }; + breakingChanges = [...breakingChangesData]; + + console.log('Initialized version order:', versionOrder); + } catch (error) { + console.error('Failed to initialize migration data:', error); + // Fallback to empty arrays if initialization fails + versions = []; + versionOrder = []; + migrationMap = { sourceToTargets: {}, targetToSources: {} }; + } + } + + /** + * Extract all unique versions from migration paths + * @param {Object} migrationsMap - Map of source versions to target versions + * @returns {Array} 
Array of unique versions + */ + function extractUniqueVersions(migrationsMap) { + const versionsSet = new Set(); + + // Add all sources + Object.keys(migrationsMap).forEach(source => { + versionsSet.add(source); + }); + + // Add all targets + Object.values(migrationsMap).forEach(targets => { + targets.forEach(target => { + versionsSet.add(target); + }); + }); + + return Array.from(versionsSet).sort(); + } + + /** + * Generate a reverse mapping from target versions to source versions + * @param {Object} forwardMap - Map of source versions to target versions + * @returns {Object} Map of target versions to source versions + */ + function generateReverseMigrationMap(forwardMap) { + const reverseMap = {}; + + Object.entries(forwardMap).forEach(([source, targets]) => { + targets.forEach(target => { + if (!reverseMap[target]) { + reverseMap[target] = []; + } + reverseMap[target].push(source); + }); + }); + + return reverseMap; + } + + /** + * Determine the order of versions based on migration paths + * @param {Array} versions - Array of all versions + * @param {Object} migrationMap - Map of source versions to target versions and vice versa + * @returns {Array} Ordered array of versions + */ + function determineVersionOrder(versions, migrationMap) { + // Start with a copy of all versions + const remainingVersions = [...versions]; + const orderedVersions = []; + + // First, find versions that are only sources (not targets) + // These are the oldest versions + const onlySources = versions.filter(v => + !Object.values(migrationMap.targetToSources).flat().includes(v) && + migrationMap.sourceToTargets[v] + ); + + // Add oldest versions first + onlySources.forEach(v => { + orderedVersions.push(v); + const index = remainingVersions.indexOf(v); + if (index !== -1) remainingVersions.splice(index, 1); + }); + + // Then add the rest based on migration paths + while (remainingVersions.length > 0) { + let added = false; + + for (let i = 0; i < remainingVersions.length; i++) { + const version = remainingVersions[i]; + const sources = migrationMap.targetToSources[version] || []; + + // If all sources of this version are already in orderedVersions, + // we can add this version next + if (sources.every(s => orderedVersions.includes(s))) { + orderedVersions.push(version); + remainingVersions.splice(i, 1); + added = true; + break; + } + } + + // If we couldn't add any version in this pass, there might be a cycle + // Just add the first remaining version to break the cycle + if (!added && remainingVersions.length > 0) { + orderedVersions.push(remainingVersions[0]); + remainingVersions.splice(0, 1); + } + } + + return orderedVersions; + } + + // Public API + return { + initialize, + VersionManager, + BreakingChangesManager + }; +})(); + +// Export for ES modules +export default BreakingChangesModule; diff --git a/_migration-assistant/assets/js/breaking-changes-ui.js b/_migration-assistant/assets/js/breaking-changes-ui.js new file mode 100644 index 00000000000..18e06b94934 --- /dev/null +++ b/_migration-assistant/assets/js/breaking-changes-ui.js @@ -0,0 +1,407 @@ +/** + * Breaking Changes UI Module + * + * Handles all UI operations for the breaking changes selector + * Provides a clean separation between UI logic and data logic + */ + +import { VERSIONS, MIGRATION_MAP, breakingChanges, getVersionIndex } from './breaking-changes-module.js'; + +const BreakingChangesUI = (function() { + // Cache for DOM elements + const elements = { + sourceSelect: null, + targetSelect: null, + componentContainer: null, + 
resultsContainer: null + }; + + // Flag to prevent infinite loops when updating dropdowns + let isUpdating = false; + + /** + * UI Manager - Handles all UI operations + */ + const UIManager = { + /** + * Initialize the UI + */ + initialize() { + // Cache DOM elements + elements.sourceSelect = document.getElementById('source-version'); + elements.targetSelect = document.getElementById('target-version'); + elements.componentContainer = document.getElementById('component-checkboxes'); + elements.resultsContainer = document.getElementById('breaking-changes-results'); + + if (!elements.sourceSelect || !elements.targetSelect || + !elements.componentContainer || !elements.resultsContainer) { + console.error('Required DOM elements not found'); + return; + } + + // Store original values before populating dropdowns + const originalSourceValue = elements.sourceSelect.value; + const originalTargetValue = elements.targetSelect.value; + + // Set up event listeners + this.setupEventListeners(); + + // Populate dropdowns with all versions + // Use the global VERSIONS array for compatibility with original code + const allVersions = VERSIONS; + console.log('Available versions:', allVersions); + this.populateVersionDropdowns(allVersions); + + // Set up component checkboxes + this.setupComponentCheckboxes(); + + // Restore original values if they were set (e.g., when navigating back to the page) + if (originalSourceValue && originalTargetValue) { + console.log('Restoring pre-selected values:', originalSourceValue, originalTargetValue); + elements.sourceSelect.value = originalSourceValue; + elements.targetSelect.value = originalTargetValue; + } + + // Initialize results + this.updateResults(); + + // Use requestAnimationFrame to wait for the next rendering cycle + // This ensures the DOM has been updated before we check for selected values + requestAnimationFrame(() => { + if (elements.sourceSelect.value && elements.targetSelect.value) { + console.log('Selected values detected, updating results after DOM update'); + this.updateResults(); + } + }); + }, + + /** + * Set up event listeners for dropdowns and checkboxes + */ + setupEventListeners() { + elements.sourceSelect.addEventListener('change', () => { + this.onSourceChange(); + }); + + elements.targetSelect.addEventListener('change', () => { + this.onTargetChange(); + }); + }, + + /** + * Create an option element for a dropdown + * @param {string} value - The option value + * @param {string} text - The option text + * @returns {HTMLOptionElement} The created option element + */ + createOption(value, text) { + const option = document.createElement('option'); + option.value = value; + option.textContent = text; + return option; + }, + + /** + * Populate a dropdown with versions + * @param {Array} versions - Array of version strings + * @param {HTMLSelectElement} dropdown - The dropdown to populate + * @param {string} placeholder - Placeholder text for the default option + */ + populateDropdown(dropdown, versions, placeholder = 'Select Version') { + const currentValue = dropdown.value; + + // Clear existing options + while (dropdown.firstChild) { + dropdown.removeChild(dropdown.firstChild); + } + + // Add placeholder option + dropdown.appendChild(this.createOption('', placeholder)); + + // Add version options + versions.forEach(version => { + dropdown.appendChild(this.createOption(version, version)); + }); + + // Restore selected value if it still exists + if (currentValue && versions.includes(currentValue)) { + dropdown.value = currentValue; + } + }, + + /** + 
* Populate both version dropdowns + * @param {Array} versions - Array of version strings + */ + populateVersionDropdowns(versions) { + // Populate source dropdown with all versions + this.populateDropdown(elements.sourceSelect, versions, 'Select Source'); + + // For target dropdown, only show versions that are targets for any source + const allTargetVersions = new Set(); + + // Collect all target versions from the migration map + Object.values(MIGRATION_MAP.sourceToTargets).forEach(targets => { + targets.forEach(target => allTargetVersions.add(target)); + }); + + // Convert Set to Array and sort in reverse order (latest version first) + const targetVersions = Array.from(allTargetVersions).sort((a, b) => { + // Use the version index to sort in reverse order + return getVersionIndex(b) - getVersionIndex(a); + }); + console.log('Filtered target versions (latest first):', targetVersions); + + // Populate target dropdown with filtered versions + this.populateDropdown(elements.targetSelect, targetVersions, 'Select Target'); + }, + + /** + * Handle source version change + */ + onSourceChange() { + if (isUpdating) return; + isUpdating = true; + + const selectedSource = elements.sourceSelect.value; + + if (selectedSource) { + // Get valid targets for this source and sort in reverse order (latest version first) + const validTargets = (MIGRATION_MAP.sourceToTargets[selectedSource] || []).sort((a, b) => { + // Use the version index to sort in reverse order + return getVersionIndex(b) - getVersionIndex(a); + }); + console.log('Valid targets for', selectedSource, '(latest first):', validTargets); + this.populateDropdown(elements.targetSelect, validTargets, 'Select Target'); + } else { + // If no source selected, show all possible targets + this.populateDropdown(elements.targetSelect, VERSIONS, 'Select Target'); + } + + isUpdating = false; + this.updateResults(); + }, + + /** + * Handle target version change + */ + onTargetChange() { + if (isUpdating) return; + isUpdating = true; + + const selectedTarget = elements.targetSelect.value; + + if (selectedTarget) { + // Get valid sources for this target + const validSources = MIGRATION_MAP.targetToSources[selectedTarget] || []; + console.log('Valid sources for', selectedTarget, ':', validSources); + this.populateDropdown(elements.sourceSelect, validSources, 'Select Source'); + } else { + // If no target selected, show all possible sources + this.populateDropdown(elements.sourceSelect, VERSIONS, 'Select Source'); + } + + isUpdating = false; + this.updateResults(); + }, + + /** + * Set up component checkboxes based on available components + */ + setupComponentCheckboxes() { + // Clear existing checkboxes + elements.componentContainer.innerHTML = ''; + + // Get unique components from breaking changes + const components = this.getUniqueComponents(); + console.log('Available components:', components); + + // Create a checkbox for each component + components.forEach(component => { + const span = document.createElement('span'); + span.className = 'component-checkbox'; + + const checkbox = document.createElement('input'); + checkbox.type = 'checkbox'; + checkbox.id = `component-${component}`; + checkbox.value = component; + checkbox.checked = false; // Unchecked by default + + // Add event listener to checkbox + checkbox.addEventListener('change', () => { + this.updateResults(); + }); + + const label = document.createElement('label'); + label.htmlFor = `component-${component}`; + label.textContent = component.charAt(0).toUpperCase() + component.slice(1); // 
Capitalize first letter + + span.appendChild(checkbox); + span.appendChild(label); + elements.componentContainer.appendChild(span); + }); + + // If no components found, add a message + if (components.length === 0) { + elements.componentContainer.innerHTML = '<p>No component filters available</p>'; + } + }, + + /** + * Get selected components from checkboxes + * @returns {Array} Array of selected component names + */ + getSelectedComponents() { + const checkboxes = elements.componentContainer.querySelectorAll('input[type="checkbox"]'); + return Array.from(checkboxes) + .filter(cb => cb.checked) + .map(cb => cb.value); + }, + + /** + * Get all unique components from breaking changes + * @returns {Array} Array of unique component names + */ + getUniqueComponents() { + const components = new Set(); + breakingChanges.forEach(change => { + change.comp.forEach(comp => components.add(comp)); + }); + return Array.from(components); + }, + + /** + * Update results based on selected filters + */ + updateResults() { + const selectedSource = elements.sourceSelect.value; + const selectedTarget = elements.targetSelect.value; + + // Show message if source or target not selected + if (!selectedSource || !selectedTarget) { + elements.resultsContainer.innerHTML = '<p>Please select both source and target versions to see breaking changes.</p>'; + return; + } + + // Check if source and target are the same + if (selectedSource === selectedTarget) { + elements.resultsContainer.innerHTML = '<p>No breaking changes when source and target versions are the same.</p>'; + return; + } + + // Get selected components + const selectedComponents = this.getSelectedComponents(); + + // Filter breaking changes directly using the global variables + const relevantChanges = this.filterBreakingChanges(selectedSource, selectedTarget, selectedComponents); + console.log('Filtered breaking changes:', relevantChanges); + + // Display results + this.displayResults(relevantChanges); + }, + + /** + * Filter breaking changes based on source, target, and components + * @param {string} source - The source version + * @param {string} target - The target version + * @param {Array} selectedComponents - Array of selected component names + * @returns {Array} Filtered breaking changes + */ + filterBreakingChanges(source, target, selectedComponents) { + if (!source || !target || source === target) { + return []; + } + + try { + // Use the imported getVersionIndex function + const sourceVersionIdx = getVersionIndex(source); + const targetVersionIdx = getVersionIndex(target); + + return breakingChanges.filter(change => { + const introducedInIdx = getVersionIndex(change.introducedIn); + + // A breaking change applies if: + // 1. The breaking change was introduced in a version that is between source and target (inclusive of target) + // 2. The source version is at or after the minimum source version affected + // 3. The target version is at or before the maximum target version affected + + // Handle optional affects field by using defaults if not present + const minSource = change.affects && change.affects.minSource ? change.affects.minSource : VERSIONS[0]; // Default to oldest version + const maxTarget = change.affects && change.affects.maxTarget ? 
change.affects.maxTarget : VERSIONS[VERSIONS.length - 1]; // Default to newest version + + const versionMatch = + introducedInIdx <= targetVersionIdx && // Breaking change was introduced at or before target + introducedInIdx > sourceVersionIdx && // Breaking change was introduced after source + sourceVersionIdx < targetVersionIdx && // Valid migration path (source before target) + sourceVersionIdx >= getVersionIndex(minSource) && // Source is affected + targetVersionIdx <= getVersionIndex(maxTarget); // Target is affected + + // For component filtering: + // - Always include changes with empty comp array (default/data components) + // - Only include dashboard components if the checkbox is checked + const componentMatch = + change.comp.length === 0 || // Include changes with no specific component (default/data) + change.comp.some(comp => selectedComponents.includes(comp)); // Include if any component matches + + return versionMatch && componentMatch; + }); + } catch (error) { + console.error('Error filtering breaking changes:', error); + return []; + } + }, + + /** + * Display filtered breaking changes + * @param {Array} changes - Array of breaking changes to display + * @param {HTMLElement} container - Optional container element to display results in (defaults to elements.resultsContainer) + */ + displayResults(changes, container = null) { + // Use the provided container or default to the UI module's container + const resultsContainer = container || elements.resultsContainer; + + if (changes.length) { + resultsContainer.innerHTML = ` + <h4>Relevant breaking changes:</h4> + <ul>${changes.map(change => { + let transformationHtml = ''; + if (change.transformation) { + transformationHtml = ` + <div class="transformation-info"> + <strong>Available Migration Assistant Transformation:</strong> + <a href="${change.transformation.url}">${change.transformation.title}</a> + </div> + `; + } + return ` + <li> + <a href="${change.url}">${change.title}</a> + ${transformationHtml} + </li> + `; + }).join('')}</ul> + <p class="transformation-request"> + To request additional transformations to be built into the Migration Assistant, + open a GitHub issue <a href="https://github.com/opensearch-project/opensearch-migrations/issues">here</a>. + </p> + `; + } else { + resultsContainer.innerHTML = ` + <p>No specific breaking changes found for your selection.</p> + <p class="transformation-request"> + To request additional transformations to be built into the Migration Assistant, + open a GitHub issue <a href="https://github.com/opensearch-project/opensearch-migrations/issues">here</a>. + </p> + `; + } + } + }; + + // Public API + return { + UIManager + }; +})(); + +// Export for ES modules +export default BreakingChangesUI; diff --git a/_migration-assistant/deploying-migration-assistant/configuration-options.md b/_migration-assistant/deploying-migration-assistant/configuration-options.md new file mode 100644 index 00000000000..5fa28e44c43 --- /dev/null +++ b/_migration-assistant/deploying-migration-assistant/configuration-options.md @@ -0,0 +1,232 @@ +--- +layout: default +title: Configuration options +nav_order: 15 +parent: Deploying Migration Assistant +--- + +# Configuration options + +This page outlines the configuration options for three key migrations scenarios: + +1. **Metadata migration** +2. **Backfill migration with `Reindex-from-Snapshot` (RFS)** +3. **Live capture migration with Capture and Replay (C&R)** + +Each of these migrations depends on either a snapshot or a capture proxy. 
The following example `cdk.context.json` configurations are used by AWS Cloud Development Kit (AWS CDK) to deploy and configure Migration Assistant for OpenSearch, shown as separate blocks for each migration type. If you are performing a migration applicable to multiple scenarios, these options can be combined. + + +For a complete list of configuration options, see [opensearch-migrations-options.md](https://github.com/opensearch-project/opensearch-migrations/blob/main/deployment/cdk/opensearch-service-migration/options.md). If you need a configuration option that is not found on this page, create an issue in the [OpenSearch Migrations repository](https://github.com/opensearch-project/opensearch-migrations/issues). +{: .tip } + +Options for the source cluster endpoint, target cluster endpoint, and existing virtual private cloud (VPC) should be configured in order for the migration tools to function effectively. + +## Shared configuration options + +Each migration configuration shares the following options. + + +| Name | Example | Description | +| :--- | :--- | :--- | +| `sourceClusterEndpoint` | `"https://source-cluster.elb.us-east-1.endpoint.com"` | The endpoint for the source cluster. | +| `targetClusterEndpoint` | `"https://vpc-demo-opensearch-cluster-cv6hggdb66ybpk4kxssqt6zdhu.us-west-2.es.amazonaws.com:443"` | The endpoint for the target cluster. Required if using an existing target cluster for the migration instead of creating a new one. | +| `vpcId` | `"vpc-123456789abcdefgh"` | The ID of the existing VPC in which the migration resources will be stored. The VPC must have at least two private subnets that span two Availability Zones. | + + +## Backfill migration using RFS + +The following CDK performs a backfill migrations using RFS: + +```json +{ + "backfill-migration": { + "stage": "dev", + "vpcId": <VPC_ID>, + "sourceCluster": { + "endpoint": <SOURCE_CLUSTER_ENDPOINT>, + "version": "ES 7.10", + "auth": {"type": "none"} + }, + "targetCluster": { + "endpoint": <TARGET_CLUSTER_ENDPOINT>, + "auth": { + "type": "basic", + "username": <TARGET_CLUSTER_USERNAME>, + "passwordFromSecretArn": <TARGET_CLUSTER_PASSWORD_SECRET> + } + }, + "reindexFromSnapshotServiceEnabled": true, + "reindexFromSnapshotExtraArgs": "", + "artifactBucketRemovalPolicy": "DESTROY" + } +} +``` +{% include copy.html %} + +Performing an RFS backfill migration requires an existing snapshot. + + +The RFS configuration uses the following options. All options are optional. + +| Name | Example | Description | +| :--- | :--- | :--- | +| `reindexFromSnapshotServiceEnabled` | `true` | Enables deployment and configuration of the RFS ECS service. | +| `reindexFromSnapshotExtraArgs` | `"--target-aws-region us-east-1 --target-aws-service-signing-name es"` | Extra arguments for the Document Migration command, with space separation. See [RFS Extra Arguments](https://github.com/opensearch-project/opensearch-migrations/blob/main/DocumentsFromSnapshotMigration/README.md#arguments) for more information. You can pass `--no-insecure` to remove the `--insecure` flag. | + +To view all available arguments for `reindexFromSnapshotExtraArgs`, see [Snapshot migrations README](https://github.com/opensearch-project/opensearch-migrations/blob/main/DocumentsFromSnapshotMigration/README.md#arguments). At a minimum, no extra arguments may be needed. 
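If extra arguments are needed, they are passed as a single space-separated string. The following sketch combines flags that appear elsewhere in this documentation, assuming an existing snapshot in Amazon S3 and a target hosted on Amazon OpenSearch Service; the bucket, repository, snapshot, and Region values are placeholders.

```bash
 "reindexFromSnapshotExtraArgs": "--snapshot-name <name> --s3-repo-uri s3://<bucket-name>/<repo> --s3-region <region> --target-aws-region <region> --target-aws-service-signing-name es"
```
{% include copy.html %}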
+ +## Live capture migration with C&R + +The following sample CDK performs a live capture migration with C&R: + +```json +{ + "live-capture-migration": { + "stage": "dev", + "vpcId": <VPC_ID>, + "sourceCluster": { + "endpoint": <SOURCE_CLUSTER_ENDPOINT>, + "version": "ES 7.10", + "auth": {"type": "none"} + }, + "targetCluster": { + "endpoint": <TARGET_CLUSTER_ENDPOINT>, + "auth": { + "type": "basic", + "username": <TARGET_CLUSTER_USERNAME>, + "passwordFromSecretArn": <TARGET_CLUSTER_PASSWORD_SECRET> + } + }, + "captureProxyServiceEnabled": true, + "captureProxyExtraArgs": "", + "trafficReplayerServiceEnabled": true, + "trafficReplayerExtraArgs": "--speedup-factor 2.0", + "targetClusterProxyServiceEnabled": true + } +} +``` +{% include copy.html %} + +Performing a live capture migration requires that a Capture Proxy be configured to capture incoming traffic and send it to the target cluster using the Traffic Replayer service. For arguments available in `captureProxyExtraArgs`, refer to the `@Parameter` fields [here](https://github.com/opensearch-project/opensearch-migrations/blob/main/TrafficCapture/trafficCaptureProxyServer/src/main/java/org/opensearch/migrations/trafficcapture/proxyserver/CaptureProxy.java). For `trafficReplayerExtraArgs`, refer to the `@Parameter` fields [here](https://github.com/opensearch-project/opensearch-migrations/blob/main/TrafficCapture/trafficReplayer/src/main/java/org/opensearch/migrations/replay/TrafficReplayer.java). At a minimum, no extra arguments may be needed. + + +| Name | Example | Description | +| :--- | :--- | :--- | +| `captureProxyServiceEnabled` | `true` | Enables the Capture Proxy service deployment using an AWS CloudFormation stack. | +| `captureProxyExtraArgs` | `"--suppressCaptureForHeaderMatch user-agent .*elastic-java/7.17.0.*"` | Extra arguments for the Capture Proxy command, including options specified by the [Capture Proxy](https://github.com/opensearch-project/opensearch-migrations/blob/main/TrafficCapture/trafficCaptureProxyServer/src/main/java/org/opensearch/migrations/trafficcapture/proxyserver/CaptureProxy.java). | +| `trafficReplayerServiceEnabled` | `true` | Enables the Traffic Replayer service deployment using a CloudFormation stack. | +| `trafficReplayerExtraArgs` | `"--sigv4-auth-header-service-region es,us-east-1 --speedup-factor 5"` | Extra arguments for the Traffic Replayer command, including options for auth headers and other parameters specified by the [Traffic Replayer](https://github.com/opensearch-project/opensearch-migrations/blob/main/TrafficCapture/trafficReplayer/src/main/java/org/opensearch/migrations/replay/TrafficReplayer.java). | +| `targetClusterProxyServiceEnabled` | `true` | Enables the target cluster proxy service deployment using a CloudFormation stack. | + +For arguments available in `captureProxyExtraArgs`, see the `@Parameter` fields in [`CaptureProxy.java`](https://github.com/opensearch-project/opensearch-migrations/blob/main/TrafficCapture/trafficCaptureProxyServer/src/main/java/org/opensearch/migrations/trafficcapture/proxyserver/CaptureProxy.java). For `trafficReplayerExtraArgs`, see the `@Parameter` fields in [`TrafficReplayer.java`](https://github.com/opensearch-project/opensearch-migrations/blob/main/TrafficCapture/trafficReplayer/src/main/java/org/opensearch/migrations/replay/TrafficReplayer.java). 
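Each of the preceding configuration blocks is keyed by a context ID (`backfill-migration` and `live-capture-migration` in these examples). When deploying, pass that key to the CDK so that the corresponding block in `cdk.context.json` is used. The following is a sketch of what deployment might look like for the live capture block, based on the deployment commands shown in the getting started guide; adjust the context ID to match your configuration.

```bash
# Run from the deployment/cdk/opensearch-service-migration directory
# after editing cdk.context.json.
cdk bootstrap --c contextId=live-capture-migration --require-approval never
cdk deploy "*" --c contextId=live-capture-migration --require-approval never --concurrency 5
```
{% include copy.html %}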
+ + +## Cluster authentication options + +Both the source and target cluster can use no authentication, authentication limited to VPC, basic authentication with a username and password, or AWS Signature Version 4 scoped to a user or role. + +### No authentication + +```json + "sourceCluster": { + "endpoint": <SOURCE_CLUSTER_ENDPOINT>, + "version": "ES 7.10", + "auth": {"type": "none"} + } +``` +{% include copy.html %} + +### Basic authentication + +```json + "sourceCluster": { + "endpoint": <SOURCE_CLUSTER_ENDPOINT>, + "version": "ES 7.10", + "auth": { + "type": "basic", + "username": <TARGET_CLUSTER_USERNAME>, + "passwordFromSecretArn": <TARGET_CLUSTER_PASSWORD_SECRET> + } + } +``` +{% include copy.html %} + +### AWS Signature Version 4 authentication + +```json + "sourceCluster": { + "endpoint": <SOURCE_CLUSTER_ENDPOINT>, + "version": "ES 7.10", + "auth": { + "type": "sigv4", + "region": "us-east-1", + "serviceSigningName": "es" + } + } +``` +{% include copy.html %} + +The `serviceSigningName` can be `es` for an Elasticsearch or OpenSearch domain. + +All of these authentication options apply to both source and target clusters. + +## Snapshot options + +The following configuration options customize the process of migrating from snapshots. + +### Snapshot of a managed service source + +If your source cluster is on Amazon OpenSearch Service, you need to set up an additional AWS Identity and Access Management (IAM) role and pass it with the snapshot creation call, as described in the [AWS documentation](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/managedomains-snapshots.html). Migration Assistant can automatically manage this process. OpenSearch Service snapshots are only compatible with AWS Signature Version 4 authentication. The following parameter ensures that the additional IAM role is created and passed. + +| Name | Example | Description | +| :--- | :--- | :--- | +| `managedServiceSourceSnapshotEnabled` | `true` | Creates the necessary roles and trust relationships for taking a snapshot of an OpenSearch Service source cluster. This is only compatible with AWS Signature Version 4 authentication.| + +### Bring your own snapshot + +You can use an existing Amazon Simple Storage Service (Amazon S3) snapshot to perform [metadata]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/migrating-metadata/) and [backfill]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/backfill/) migrations instead of using Migration Assistant to create a snapshot: + +```json + "snapshot": { + "snapshotName": "my-snapshot-name", + "snapshotRepoName": "my-snapshot-repo", + "s3Uri": "s3://my-s3-bucket-name/my-bucket-path-to-snapshot-repo", + "s3Region": "us-east-2" + } +``` +{% include copy.html %} + +By default, Amazon S3 buckets automatically allow roles in the same AWS account (with the appropriate `s3:*` permissions) to access the S3 bucket, regardless of the bucket's AWS Region. If the external S3 bucket is in the same AWS account as the Migration Assistant deployment, no further IAM configuration is required to access the bucket. + +If you use a custom permission model with Amazon S3, any access control list (ACL) or custom bucket policy should allow the Migration Assistant task roles for RFS and the migration console to read from the S3 bucket. 
+ +If the S3 bucket is in a separate AWS account from the Migration Assistant deployment, you need a custom bucket policy similar to the following to allow access to Migration Assistant: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "AllowExternalAccountReadAccessToBucket", + "Effect": "Allow", + "Principal": { + "AWS": "arn:aws:iam::<ACCOUNT_ID>:root" + }, + "Action": [ + "s3:GetObject", + "s3:ListBucket", + "s3:GetBucketLocation" + ], + "Resource": [ + "arn:aws:s3:::my-s3-bucket-name", + "arn:aws:s3:::my-s3-bucket-name/*" + ] + } + ] +} +``` +{% include copy.html %} + +## Network configuration + +The migration tooling expects the source cluster, target cluster, and migration resources to exist in the same VPC. If this is not the case, manual networking setup outside of this documentation is likely required. diff --git a/_migration-assistant/deploying-migration-assistant/getting-started-data-migration.md b/_migration-assistant/deploying-migration-assistant/getting-started-data-migration.md new file mode 100644 index 00000000000..856d2e53aa5 --- /dev/null +++ b/_migration-assistant/deploying-migration-assistant/getting-started-data-migration.md @@ -0,0 +1,359 @@ +--- +layout: default +title: Getting started with data migration +parent: Deploying Migration Assistant +nav_order: 10 +redirect_from: + - /upgrade-to/snapshot-migrate/ + - /migration-assistant/getting-started-with-data-migration/ +--- + +# Getting started with data migration + +This quickstart outlines how to deploy Migration Assistant for OpenSearch and execute an existing data migration using `Reindex-from-Snapshot` (RFS). It uses AWS for illustrative purposes. However, the steps can be modified for use with other cloud providers. + +## Prerequisites and assumptions + +Before using this quickstart, make sure you fulfill the following prerequisites: + +* Verify that your migration path [is supported]({{site.url}}{{site.baseurl}}/migration-assistant/overview/is-migration-assistant-right-for-you/#supported-migration-paths). Note that we test with the exact versions specified, but you should be able to migrate data on alternative minor versions as long as the major version is supported. +* The source cluster must be deployed Amazon Simple Storage Service (Amazon S3) plugin. +* The target cluster must be deployed. +* Verify that the `CDKToolkit` stack exists and is set to `CREATE_COMPLETE`. For more information about how to bootstrap your AWS account in the required AWS Region, see [the CDKToolkit documentation](https://docs.aws.amazon.com/cdk/v2/guide/getting_started.html). + +The steps in this guide assume the following: + +* In this guide, a snapshot will be taken and stored in Amazon S3; the following assumptions are made about this snapshot: + * The `_source` flag is enabled on all indexes to be migrated. + * The snapshot includes the global cluster state (`include_global_state` is `true`). + * Shard sizes of up to approximately 80 GB are supported. Larger shards cannot be migrated. If this presents challenges for your migration, contact the [migration team](https://opensearch.slack.com/archives/C054JQ6UJFK). +* Migration Assistant will be installed in the same AWS Region and have access to both the source snapshot and target cluster. + +--- + +## Step 1: Install Bootstrap on an Amazon EC2 instance (~10 minutes) + +To begin your migration, use the following steps to install a `bootstrap` box on an Amazon Elastic Compute Cloud (Amazon EC2) instance. 
The instance uses AWS CloudFormation to create and manage the stack. + +1. Log in to the target AWS account in which you want to deploy Migration Assistant. +2. From the browser where you are logged in to your target AWS account, right-click [here](https://console.aws.amazon.com/cloudformation/home?region=us-east-1#/stacks/new?templateURL=https://solutions-reference.s3.amazonaws.com/migration-assistant-for-amazon-opensearch-service/latest/migration-assistant-for-amazon-opensearch-service.template&redirectId=SolutionWeb) to load the CloudFormation template from a new browser tab. +3. Follow the CloudFormation stack wizard: + * **Stack Name:** `MigrationBootstrap` + * **Stage Name:** `dev` + * Choose **Next** after each step > **Acknowledge** > **Submit**. +4. Verify that the Bootstrap stack exists and is set to `CREATE_COMPLETE`. This process takes around 10 minutes to complete. + +--- + +## Step 2: Set up Bootstrap instance access (~5 minutes) + +Use the following steps to set up Bootstrap instance access: + +1. After deployment, find the EC2 instance ID for the `bootstrap-dev-instance`. +2. Create an AWS Identity and Access Management (IAM) policy using the following snippet, replacing `<aws-region>`, `<aws-account>`, `<stage>`, and `<ec2-instance-id>` with your information: + + ```json + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": "ssm:StartSession", + "Resource": [ + "arn:aws:ec2:<aws-region>:<aws-account>:instance/<ec2-instance-id>", + "arn:aws:ssm:<aws-region>:<aws-account>:document/BootstrapShellDoc-<stage>-<aws-region>" + ] + } + ] + } + ``` + {% include copy.html %} + +3. Name the policy, for example, `SSM-OSMigrationBootstrapAccess`, and then create the policy by selecting **Create policy**. +4. Attach the newly created policy to your EC2 instance's IAM role. + +--- + +## Step 3: Log in to Bootstrap and build Migration Assistant (~15 minutes) + +Next, log in to Bootstrap and build Migration Assistant using the following steps. + +### Prerequisites + +To use these steps, make sure you fulfill the following prerequisites: + +* The AWS Command Line Interface (AWS CLI) and AWS Session Manager plugin are installed on your instance. +* The AWS credentials are configured (`aws configure`) for your instance. + +### Steps + +1. Load AWS credentials into your terminal. +2. Log in to the instance using the following command, replacing `<instance-id>` and `<aws-region>` with your instance ID and Region: + + ```bash + aws ssm start-session --document-name BootstrapShellDoc-<stage>-<aws-region> --target <instance-id> --region <aws-region> [--profile <profile-name>] + ``` + {% include copy.html %} + +3. Once logged in, run the following command from the shell of the Bootstrap instance in the `/opensearch-migrations` directory: + + ```bash + ./initBootstrap.sh && cd deployment/cdk/opensearch-service-migration + ``` + {% include copy.html %} + +4. After a successful build, note the path for infrastructure deployment, which will be used in the next step. + +--- + +## Step 4: Configure and deploy RFS (~20 minutes) + +To deploy Migration Assistant with RFS, the following stacks must be deployed: + +* `Migration Assistant network` stack +* `RFS` stack +* `Migration console` stack + +Use the following steps to configure and deploy RFS, deploy Migration Assistant, and verify installation of the required stacks: + +1.
Add the source and target cluster password as separate **Secrets** in [AWS Secrets Manager](https://docs.aws.amazon.com/secretsmanager/latest/userguide/intro.html) as an unstructured string. Be sure to copy the secret Amazon Resource Name (ARN) for use during deployment. +2. From the same shell as the Bootstrap instance, modify the `cdk.context.json` file located in the `/opensearch-migrations/deployment/cdk/opensearch-service-migration` directory and configure the following settings: + + ```json + { + "default": { + "stage": "dev", + "vpcId": "<TARGET CLUSTER VPC ID>", + "targetCluster": { + "endpoint": "<TARGET CLUSTER ENDPOINT>", + "auth": { + "type": "basic", + "username": "<TARGET CLUSTER USERNAME>", + "passwordFromSecretArn": "<TARGET CLUSTER PASSWORD SECRET>" + } + }, + "sourceCluster": { + "endpoint": "<SOURCE CLUSTER ENDPOINT>", + "version": "<SOURCE ENGINE VERSION>", + "auth": { + "type": "basic", + "username": "<TARGET CLUSTER USERNAME>", + "passwordFromSecretArn": "<TARGET CLUSTER PASSWORD SECRET>" + } + }, + "reindexFromSnapshotExtraArgs": "<RFS PARAMETERS (see below)>", + "reindexFromSnapshotMaxShardSizeGiB": 80, + "otelCollectorEnabled": true, + "migrationConsoleServiceEnabled": true + } + } + ``` + {% include copy.html %} + + The source and target cluster authorization can be configured to have no authorization, `basic` with a username and password, or `sigv4`. + +3. After the `cdk.context.json` file is fully configured, bootstrap the account and deploy the required stacks using the following command: + + ```bash + cdk bootstrap --c contextId=default --require-approval never + ``` + {% include copy.html %} + +4. Deploy Migration Assistant using the following command: + + ```bash + cdk deploy "*" --c contextId=default --require-approval never --concurrency 5 + ``` + {% include copy.html %} + +5. From the same Bootstrap instance shell, verify that all CloudFormation stacks were installed successfully: + + ```bash + aws cloudformation list-stacks --query "StackSummaries[?StackStatus!='DELETE_COMPLETE'].[StackName,StackStatus]" --output table + ``` + {% include copy.html %} + +You should receive a similar output for your Region: + +```bash +------------------------------------------------------------------------ +| ListStacks | ++--------------------------------------------------+-------------------+ +| OSMigrations-dev-us-east-1-MigrationConsole | CREATE_COMPLETE | +| OSMigrations-dev-us-east-1-ReindexFromSnapshot | CREATE_COMPLETE | +| OSMigrations-dev-us-east-1-MigrationInfra | CREATE_COMPLETE | +| OSMigrations-dev-us-east-1-default-NetworkInfra | CREATE_COMPLETE | +| MigrationBootstrap | CREATE_COMPLETE | +| CDKToolkit | CREATE_COMPLETE | ++--------------------------------------------------+-------------------+ +``` + +### RFS parameters + +If you're creating a snapshot using migration tooling, these parameters are automatically configured. If you're using an existing snapshot, modify the `reindexFromSnapshotExtraArgs` setting with the following values: + +```bash + "reindexFromSnapshotExtraArgs": "--s3-repo-uri s3://<bucket-name>/<repo> --s3-region <region> --snapshot-name <name>" +``` + +You will also need to give the `migrationconsole` and `reindexFromSnapshot` TaskRoles permissions to the S3 bucket. 
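For example, you might attach an inline policy that grants read access to the snapshot bucket to each of those task roles. The following AWS CLI sketch uses placeholder role and bucket names; the actual role names are generated by the CDK deployment.

```bash
# Grant read-only access to the snapshot bucket for a Migration Assistant task role.
# Repeat for both the migration console and reindexFromSnapshot task roles.
aws iam put-role-policy \
  --role-name <TASK_ROLE_NAME> \
  --policy-name SnapshotBucketReadAccess \
  --policy-document '{
    "Version": "2012-10-17",
    "Statement": [
      {
        "Effect": "Allow",
        "Action": ["s3:GetObject", "s3:ListBucket", "s3:GetBucketLocation"],
        "Resource": [
          "arn:aws:s3:::<bucket-name>",
          "arn:aws:s3:::<bucket-name>/*"
        ]
      }
    ]
  }'
```
{% include copy.html %}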
+ +--- + +## Step 5: Access the migration console + +Run the following command to access the migration console: + +```bash +./accessContainer.sh migration-console dev <region> +``` +{% include copy.html %} + + +`accessContainer.sh` is located in `/opensearch-migrations/deployment/cdk/opensearch-service-migration/` on the Bootstrap instance. To learn more, see [Accessing the migration console]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/migrating-metadata/). +{: .note} + +--- + +## Step 6: Verify the connection to the source and target clusters + +To verify the connection to the clusters, run the following command: + +```bash +console clusters connection-check +``` +{% include copy.html %} + +You should receive the following output: + +```bash +SOURCE CLUSTER +ConnectionResult(connection_message='Successfully connected!', connection_established=True, cluster_version='') +TARGET CLUSTER +ConnectionResult(connection_message='Successfully connected!', connection_established=True, cluster_version='') +``` + +To learn more about migration console commands, see [Migration commands]. + +--- + +## Step 7: Create a snapshot + +Run the following command to initiate snapshot creation from the source cluster: + +```bash +console snapshot create [...] +``` +{% include copy.html %} + +To check the snapshot creation status, run the following command: + +```bash +console snapshot status [...] +``` +{% include copy.html %} + +To get more detailed information about the snapshot, run the following command: + +```bash +console snapshot status --deep-check [...] +``` +{% include copy.html %} + +Wait for snapshot creation to complete before proceeding to the next step. + +To learn more about snapshot creation, see [Snapshot Creation]. + +--- + +## Step 8: Migrate metadata + +Run the following command to migrate metadata: + +```bash +console metadata migrate [...] +``` +{% include copy.html %} + +For more information, see [Migrating metadata]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/migrating-metadata/). + +--- + +## Step 9: Migrate documents with RFS + +You can now use RFS to migrate documents from your original cluster: + +1. To start the migration using RFS, start a `backfill` using the following command: + + ```bash + console backfill start + ``` + {% include copy.html %} + +2. _(Optional)_ To speed up the migration, increase the number of documents processed simultaneously by using the following command: + + ```bash + console backfill scale <NUM_WORKERS> + ``` + {% include copy.html %} + +3. To check the status of the document backfill, use the following command: + + ```bash + console backfill status + ``` + {% include copy.html %} + +4. If you need to stop the backfill process, use the following command: + + ```bash + console backfill stop + ``` + {% include copy.html %} + +For more information, see [Backfill]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/backfill/). + +--- + +## Step 10: Backfill monitoring + +Use the following command for detailed monitoring of the backfill process: + +```bash +console backfill status --deep-check +``` +{% include copy.html %} + +You should receive the following output: + +```bash +BackfillStatus.RUNNING +Running=9 +Pending=1 +Desired=10 +Shards total: 62 +Shards completed: 46 +Shards incomplete: 16 +Shards in progress: 11 +Shards unclaimed: 5 +``` + +Logs and metrics are available in Amazon CloudWatch in the `OpenSearchMigrations` log group.
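If you prefer the command line to the CloudWatch console, you can also locate and follow these logs using the AWS CLI (v2). This is a sketch; the exact log group names depend on your deployment stage and services, so list them first.

```bash
# List the Migration Assistant log groups, then follow one of them.
aws logs describe-log-groups --log-group-name-prefix OpenSearchMigrations \
  --query "logGroups[].logGroupName" --output table
aws logs tail <LOG_GROUP_NAME> --follow --since 1h
```
{% include copy.html %}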
+ +--- + +## Step 11: Verify that all documents were migrated + +Use the following query in CloudWatch Logs Insights to identify failed documents: + +```bash +fields @message +| filter @message like "Bulk request succeeded, but some operations failed." +| sort @timestamp desc +| limit 10000 +``` +{% include copy.html %} + +If any failed documents are identified, you can index the failed documents directly as opposed to using RFS. diff --git a/_migration-assistant/deploying-migration-assistant/iam-and-security-groups-for-existing-clusters.md b/_migration-assistant/deploying-migration-assistant/iam-and-security-groups-for-existing-clusters.md new file mode 100644 index 00000000000..291c2cba6a5 --- /dev/null +++ b/_migration-assistant/deploying-migration-assistant/iam-and-security-groups-for-existing-clusters.md @@ -0,0 +1,92 @@ +--- +layout: default +title: IAM and security groups for existing clusters +nav_order: 20 +parent: Deploying Migration Assistant +--- + +# IAM and security groups for existing clusters + +This page outlines security scenarios for using the migration tools with existing clusters, including any necessary configuration changes to ensure proper communication between them. + +## Importing an Amazon OpenSearch Service or Amazon OpenSearch Serverless target cluster + +Use the following scenarios for Amazon OpenSearch Service or Amazon OpenSearch Serverless target clusters. + +### OpenSearch Service + +For an OpenSearch Domain, two main configurations are typically required to ensure proper functioning of the migration solution: + +1. **Security Group Configuration** + + The domain should have a security group that allows communication from the applicable migration services (Traffic Replayer, Migration Console, `Reindex-from-Snapshot`). The CDK automatically creates an `osClusterAccessSG` security group, which is applied to the migration services. The user should then add this security group to their existing domain to allow access. + +2. **Access Policy Configuration** should be one of the following: + - An open access policy that allows all access. + - Configured to allow at least the AWS Identity and Access Management (IAM) task roles for the applicable migration services (Traffic Replayer, Migration Console, `Reindex-from-Snapshot`) to access the domain. + +### Managed service role mapping (Cross-managed-cluster migrations) + +When migrating between two managed clusters, for example, when both domains were created using Amazon OpenSearch Service, provide Migration Assistant components with sufficient permissions to modify both the source and target clusters. + +Use the following steps to grant the required permissions: + +1. In the AWS Management Console, navigate to **CloudFormation** > **Stacks**. +2. Locate the stack that starts with `OSMigrations-<state>-<region>` (created during CDK deployment). +3. Go to the **Resources** tab and locate the following IAM roles: + + ```bash + arn:aws:iam::****:role/OSMigrations-<state>-<region>-MigrationServiceTaskRoleC- + arn:aws:iam::****:role/OSMigrations-<state>-<region>-reindexfromsnapshotTaskRo- + arn:aws:iam::****:role/OSMigrations-<state>-<region>-trafficreplayerdefaultTas- + ``` + +4. In both the source and target clusters, map users to each Amazon Resource Name (ARN) using the following steps: + A. Access OpenSearch Dashboards. If you're using Elasticsearch, access Kibana. + B. Navigate to **Security -> Roles -> all_access**. + C. In the "Mapped users" section, add each ARN as a backend role. + D. Save your changes. 
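If you prefer to verify or script this mapping from the command line, the OpenSearch Security plugin exposes the same role mapping through its REST API. The following sketch confirms the mapping on an OpenSearch target that uses basic authentication; the endpoint, credentials, and role names are placeholders, and on older Elasticsearch distributions that use Open Distro security, the path prefix is `_opendistro/_security` instead of `_plugins/_security`.

```bash
# Confirm that the Migration Assistant task role ARNs appear as backend roles on all_access.
curl -s -u <ADMIN_USERNAME>:<ADMIN_PASSWORD> \
  "https://<CLUSTER_ENDPOINT>/_plugins/_security/api/rolesmapping/all_access" | jq
```
{% include copy.html %}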
+ +### OpenSearch Serverless + +For an OpenSearch Serverless Collection, you will need to configure both network and data access policies: + +1. **Network Policy Configuration**: + The Collection should have a network policy that uses the `VPC` access type. This requires creating a VPC endpoint on the VPC used for the solution. The VPC endpoint should be configured for the private subnets of the VPC and should attach the `osClusterAccessSG` security group. + +2. **Data Access Policy Configuration**: + The data access policy should grant permission to perform all [index operations](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless-data-access.html#serverless-data-supported-permissions) (`aoss:*`) for all indexes in the Collection. The IAM task roles of the applicable Migration services (Traffic Replayer, migration console, `Reindex-from-Snapshot`) should be used as the principals for this data access policy. + +## Capture Proxy on Coordinator Nodes of Source Cluster + +Although the CDK does not automatically set up the Capture Proxy on source cluster nodes (except in the demo solution), the Capture Proxy instances must communicate with the resources deployed by the CDK, such as Kafka. This section outlines the necessary steps to set up communication. + +Before [setting up Capture Proxy instances](https://github.com/opensearch-project/opensearch-migrations/tree/main/TrafficCapture/trafficCaptureProxyServer#installing-capture-proxy-on-coordinator-nodes) on the source cluster, ensure the following configurations are in place: + +1. **Security Group Configuration**: + The coordinator nodes should add the `trafficStreamSourceSG` security group to allow sending captured traffic to Kafka. + +2. **IAM Policy Configuration**: + The IAM role used by the coordinator nodes should have permissions to publish captured traffic to Kafka. You can add the following template policy through the AWS Console (IAM Role → Add permissions → Create inline policy → JSON view): + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Action": "kafka-cluster:Connect", + "Resource": "arn:aws:kafka:<REGION>:<ACCOUNT-ID>:cluster/migration-msk-cluster-<STAGE>/*", + "Effect": "Allow" + }, + { + "Action": [ + "kafka-cluster:CreateTopic", + "kafka-cluster:DescribeTopic", + "kafka-cluster:WriteData" + ], + "Resource": "arn:aws:kafka:<REGION>:<ACCOUNT-ID>:topic/migration-msk-cluster-<STAGE>/*", + "Effect": "Allow" + } + ] +} +``` diff --git a/_migration-assistant/deploying-migration-assistant/index.md b/_migration-assistant/deploying-migration-assistant/index.md new file mode 100644 index 00000000000..1c559a81b1c --- /dev/null +++ b/_migration-assistant/deploying-migration-assistant/index.md @@ -0,0 +1,13 @@ +--- +layout: default +title: Deploying Migration Assistant +nav_order: 15 +has_children: true +permalink: /deploying-migration-assistant/ +redirect-from: + - /deploying-migration-assistant/index/ +--- + +# Deploying Migration Assistant + +This section provides information about the available options for deploying Migration Assistant. 
diff --git a/_migration-assistant/index.md b/_migration-assistant/index.md new file mode 100644 index 00000000000..2ce5f0d8d74 --- /dev/null +++ b/_migration-assistant/index.md @@ -0,0 +1,44 @@ +--- +layout: default +title: Migration Assistant for OpenSearch +nav_order: 1 +has_children: false +nav_exclude: true +has_toc: false +permalink: /migration-assistant/ +redirect_from: + - /migration-assistant/index/ + - /upgrade-to/index/ + - /upgrade-to/ + - /upgrade-to/upgrade-to/ +tutorial_cards: + - heading: "Overview" + description: "Get familiar with the key components of Migration Assistant and evaluate your use case." + link: "/migration-assistant/overview/" + - heading: "Deploying Migration Assistant" + description: "Follow step-by-step instructions to deploy Migration Assistant and prepare data for migration." + link: "/deploying-migration-assistant/" + - heading: "Migration phases" + description: "Execute your migration in phases—metadata, backfill, and traffic replay—for a controlled and validated transition." + link: "/migration-phases/" + - heading: "Migration console" + description: "Use CLI commands provided by the migration console to orchestrate and monitor your migration process." + link: "/migration-console/" +--- + +# Migration Assistant for OpenSearch + +Migration Assistant for OpenSearch aids you in successfully performing an end-to-end, zero-downtime migration to OpenSearch from other search providers. It helps with the following scenarios: + +- **Metadata migration**: Migrating cluster metadata, such as index settings, aliases, and templates. +- **Backfill migration**: Migrating existing or historical data from a source to a target cluster. +- **Live traffic migration**: Replicating live ongoing traffic from a source to a target cluster. +- **Comparative tooling**: Comparing the performance and behaviors of an existing cluster with a prospective new one. + +This user guide focuses on conducting a comprehensive migration involving both existing and live data with zero downtime and the option to back out of a migration. + +It's crucial to note that migration strategies are not universally applicable. This guide provides a detailed methodology, based on certain assumptions detailed throughout, emphasizing the importance of robust engineering practices to ensure a successful migration. +{: .tip } + +{% include cards.html cards=page.tutorial_cards %} + diff --git a/_migration-assistant/migration-console/accessing-the-migration-console.md b/_migration-assistant/migration-console/accessing-the-migration-console.md new file mode 100644 index 00000000000..ea66f5c04c9 --- /dev/null +++ b/_migration-assistant/migration-console/accessing-the-migration-console.md @@ -0,0 +1,35 @@ +--- +layout: default +title: Accessing the migration console +nav_order: 35 +parent: Migration console +--- + +# Accessing the migration console + +The Bootstrap box deployed through Migration Assistant contains a script that simplifies access to the migration console through that instance. + +To access the migration console, use the following commands: + +```shell +export STAGE=dev +export AWS_REGION=us-west-2 +/opensearch-migrations/deployment/cdk/opensearch-service-migration/accessContainer.sh migration-console ${STAGE} ${AWS_REGION} +``` +{% include copy.html %} + +When opening the console a message will appear above the command prompt, `Welcome to the Migration Assistant Console`. 
+ +On a machine with the [AWS Command Line Interface (AWS CLI)](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) and the [AWS Session Manager plugin](https://docs.aws.amazon.com/systems-manager/latest/userguide/session-manager-working-with-install-plugin.html), you can directly connect to the migration console. Ensure that you've run `aws configure` with credentials that have access to the environment. + +Use the following commands: + +```shell +export STAGE=dev +export SERVICE_NAME=migration-console +export TASK_ARN=$(aws ecs list-tasks --cluster migration-${STAGE}-ecs-cluster --family "migration-${STAGE}-${SERVICE_NAME}" | jq --raw-output '.taskArns[0]') +aws ecs execute-command --cluster "migration-${STAGE}-ecs-cluster" --task "${TASK_ARN}" --container "${SERVICE_NAME}" --interactive --command "/bin/bash" +``` +{% include copy.html %} + +Typically, `STAGE` is equivalent to a standard `dev` environment, but this may vary based on what the user specified during deployment. \ No newline at end of file diff --git a/_migration-assistant/migration-console/index.md b/_migration-assistant/migration-console/index.md new file mode 100644 index 00000000000..64f87bc0fc5 --- /dev/null +++ b/_migration-assistant/migration-console/index.md @@ -0,0 +1,16 @@ +--- +layout: default +title: Migration console +nav_order: 50 +has_children: true +permalink: /migration-console/ +redirect_from: + - /migration-console/index/ +--- + +# Migration console + +The Migrations Assistant deployment includes an Amazon Elastic Container Service (Amazon ECS) task that hosts tools that run different phases of the migration and check the progress or results of the migration. This ECS task is called the **migration console**. The migration console is a command line interface used to interact with the deployed components of the solution. + +This section provides information about how to access the migration console and what commands are supported. + diff --git a/_migration-assistant/migration-console/migration-console-commands-references.md b/_migration-assistant/migration-console/migration-console-commands-references.md new file mode 100644 index 00000000000..21d793b3f3f --- /dev/null +++ b/_migration-assistant/migration-console/migration-console-commands-references.md @@ -0,0 +1,131 @@ +--- +layout: default +title: Command reference +nav_order: 40 +parent: Migration console +--- + +# Migration console command reference + +Migration console commands follow this syntax: `console [component] [action]`. The components include `clusters`, `backfill`, `snapshot`, `metadata`, and `replay`. The console is configured with a registry of the deployed services and the source and target cluster, generated from the `cdk.context.json` values. + +## Commonly used commands + +The exact commands used will depend heavily on use-case and goals, but the following are a series of common commands with a quick description of what they do. + +### Check connection + +Reports whether both the source and target clusters can be reached and provides their versions. + +```sh +console clusters connection-check +``` +{% include copy.html %} + +### Run `cat-indices` + +Runs the `cat-indices` API on the cluster. + +```sh +console clusters cat-indices +``` +{% include copy.html %} + +### Create a snapshot + +Creates a snapshot of the source cluster and stores it in a preconfigured Amazon Simple Storage Service (Amazon S3) bucket. 
+ +```sh +console snapshot create +``` +{% include copy.html %} + +## Check snapshot status + +Runs a detailed check on the snapshot creation status, including estimated completion time: + +```sh +console snapshot status --deep-check +``` +{% include copy.html %} + +## Evaluate metadata + +Performs a dry run of metadata migration, showing which indexes, templates, and other objects will be migrated to the target cluster. + +```sh +console metadata evaluate +``` +{% include copy.html %} + +## Migrate metadata + +Migrates the metadata from the source cluster to the target cluster. + +```sh +console metadata migrate +``` +{% include copy.html %} + +## Start a backfill + +If `Reindex-From-Snapshot` (RFS) is enabled, this command starts an instance of the service to begin moving documents to the target cluster: + +There are similar `scale UNITS` and `stop` commands to change the number of active instances for RFS. + + +```sh +console backfill start +``` +{% include copy.html %} + +## Check backfill status + +Gets the current status of the backfill migration, including the number of operating instances and the progress of the shards. + + +## Start Traffic Replayer + +If Traffic Replayer is enabled, this command starts an instance of Traffic Replayer to begin replaying traffic against the target cluster. +The `stop` command stops all active instances. + +```sh +console replay start +``` +{% include copy.html %} + +## Read logs + +Reads any logs that exist when running Traffic Replayer. Use tab completion on the path to fill in the available `NODE_IDs` and, if applicable, log file names. The tuple logs roll over at a certain size threshold, so there may be many files named with timestamps. The `jq` command pretty-prints each line of the tuple output before writing it to file. + +```sh +console tuples show --in /shared-logs-output/traffic-replayer-default/[NODE_ID]/tuples/console.log | jq > readable_tuples.json +``` +{% include copy.html %} + +## Help option + +All commands and options can be explored within the tool itself by using the `--help` option, either for the entire `console` application or for individual components (for example, `console backfill --help`). For example: + +```bash +$ console --help +Usage: console [OPTIONS] COMMAND [ARGS]... + +Options: + --config-file TEXT Path to config file + --json + -v, --verbose Verbosity level. Default is warn, -v is info, -vv is + debug. + --help Show this message and exit. + +Commands: + backfill Commands related to controlling the configured backfill... + clusters Commands to interact with source and target clusters + completion Generate shell completion script and instructions for setup. + kafka All actions related to Kafka operations + metadata Commands related to migrating metadata to the target cluster. + metrics Commands related to checking metrics emitted by the capture... + replay Commands related to controlling the replayer. + snapshot Commands to create and check status of snapshots of the... + tuples All commands related to tuples. 
+``` diff --git a/_migration-assistant/migration-phases/backfill.md b/_migration-assistant/migration-phases/backfill.md new file mode 100644 index 00000000000..e4b621c16ec --- /dev/null +++ b/_migration-assistant/migration-phases/backfill.md @@ -0,0 +1,206 @@ +--- +layout: default +title: Backfill +nav_order: 90 +parent: Migration phases +--- + +# Backfill + +After the [metadata]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/migrating-metadata/) for your cluster has been migrated, you can use capture proxy data replication and snapshots to backfill your data into the next cluster. + +## Capture proxy data replication + +If you're interested in capturing live traffic during your migration, Migration Assistant includes an Application Load Balancer for routing traffic to the capture proxy and the target cluster. Upstream client traffic must be routed through the capture proxy in order to replay the requests later. Before using the capture proxy, remember the following: + +* The layer upstream from the Application Load Balancer is compatible with the certificate on the Application Load Balancer listener, whether it's for clients or a Network Load Balancer. The `albAcmCertArn` in the `cdk.context.json` may need to be provided to ensure that clients trust the Application Load Balancer certificate. +* If a Network Load Balancer is used directly upstream of the Application Load Balancer, it must use a TLS listener. +* Upstream resources and security groups must allow network access to the Migration Assistant Application Load Balancer. + +To set up the capture proxy, go to the AWS Management Console and navigate to **EC2 > Load Balancers > Migration Assistant Application Load Balancer**. Copy the Application Load Balancer URL. With the URL copied, you can use one of the following options. + + +### If you are using **Network Load Balancer → Application Load Balancer → Cluster** + +1. Ensure that ingress is provided directly to the Application Load Balancer for the capture proxy. +2. Create a target group for the Migration Assistant Application Load Balancer on port `9200`, and set the health check to `HTTPS`. +3. Associate this target group with your existing Network Load Balancer on a new listener for testing. +4. Verify that the health check is successful, and perform smoke testing with some clients through the new listener port. +5. Once you are ready to migrate all clients, detach the Migration Assistant Application Load Balancer target group from the testing Network Load Balancer listener and modify the existing Network Load Balancer listener to direct traffic to this target group. +6. Now client requests will be routed through the proxy (once they establish a new connection). Verify the application metrics. + +### If you are using **Network Load Balancer → Cluster** + +If you do not want to modify application logic, add an Application Load Balancer in front of your cluster and follow the **Network Load Balancer → Application Load Balancer → Cluster** steps. Otherwise: + +1. Create a target group for the Application Load Balancer on port `9200` and set the health check to `HTTPS`. +2. Associate this target group with your existing Network Load Balancer on a new listener. +3. Verify that the health check is successful, and perform smoke testing with some clients through the new listener port. +4. Once you are ready to migrate all clients, deploy a change so that clients hit the new listener. 
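
If you prefer to script steps 1 and 2, the following AWS CLI sketch shows one way to create an Application Load Balancer–type target group and a test listener on an existing Network Load Balancer. All names, ports, ARNs, and the listener protocol and certificate are placeholders that depend on your environment; verify them against your deployment before running any commands.

```bash
# Sketch only -- replace every <PLACEHOLDER> with values from your environment.

# Step 1: Create a target group that forwards to the Migration Assistant ALB on port 9200,
# using HTTPS health checks. The "alb" target type lets an NLB forward to an ALB.
aws elbv2 create-target-group \
  --name migration-alb-tg \
  --protocol TCP \
  --port 9200 \
  --target-type alb \
  --vpc-id <VPC-ID> \
  --health-check-protocol HTTPS

# Register the Migration Assistant ALB as the target of the new target group.
aws elbv2 register-targets \
  --target-group-arn <MIGRATION-ALB-TARGET-GROUP-ARN> \
  --targets Id=<MIGRATION-ASSISTANT-ALB-ARN>

# Step 2: Add a new listener on the existing NLB for testing.
aws elbv2 create-listener \
  --load-balancer-arn <EXISTING-NLB-ARN> \
  --protocol TLS \
  --port <TEST-LISTENER-PORT> \
  --certificates CertificateArn=<ACM-CERTIFICATE-ARN> \
  --default-actions Type=forward,TargetGroupArn=<MIGRATION-ALB-TARGET-GROUP-ARN>
```
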
+ + +### If you are **not using an Network Load Balancer** + +If you're only using backfill as your migration technique, make a client/DNS change to route clients to the Migration Assistant Application Load Balancer on port `9200`. + + +### Kafka connection + +After you have routed the client based on your use case, test adding records against HTTP requests using the following steps: + +In the migration console, run the following command: + +```bash +console kafka describe-topic-records +``` +{% include copy.html %} + +Note the records in the logging topic. + +After a short period, execute the same command again and compare the increased number of records against the expected HTTP requests. + +## Creating a snapshot + +Create a snapshot for your backfill using the following command: + +```bash +console snapshot create +``` +{% include copy.html %} + +To check the progress of your snapshot, use the following command: + +```bash +console snapshot status --deep-check +``` +{% include copy.html %} + +Depending on the size of the data in the source cluster and the bandwidth allocated for snapshots, the process can take some time. Adjust the maximum rate at which the source cluster's nodes create the snapshot using the `--max-snapshot-rate-mb-per-node` option. Increasing the snapshot rate will consume more node resources, which may affect the cluster's ability to handle normal traffic. + +## Backfilling documents to the source cluster + +From the snapshot you created of your source cluster, you can begin backfilling documents into the target cluster. Once you have started this process, a fleet of workers will spin up to read the snapshot and reindex documents into the target cluster. This fleet of workers can be scaled to increased the speed at which documents are reindexed into the target cluster. + +### Checking the starting state of the clusters + +You can check the indexes and document counts of the source and target clusters by running the `cat-indices` command. This can be used to monitor the difference between the source and target for any migration scenario. Check the indexes of both clusters using the following command: + +```shell +console clusters cat-indices +``` +{% include copy.html %} + +You should receive the following response: + +```shell +SOURCE CLUSTER +health status index uuid pri rep docs.count docs.deleted store.size pri.store.size +green open my-index WJPVdHNyQ1KMKol84Cy72Q 1 0 8 0 44.7kb 44.7kb + +TARGET CLUSTER +health status index uuid pri rep docs.count docs.deleted store.size pri.store.size +green open .opendistro_security N3uy88FGT9eAO7FTbLqqqA 1 0 10 0 78.3kb 78.3kb +``` + +### Starting the backfill + +Use the following command to start the backfill and deploy the workers: + +```shell +console backfill start +``` +{% include copy.html %} + +You should receive a response similar to the following: + +```shell +BackfillStatus.RUNNING +Running=1 +Pending=0 +Desired=1 +Shards total: 48 +Shards completed: 48 +Shards incomplete: 0 +Shards in progress: 0 +Shards unclaimed: 0 +``` + +The status will be `Running` even if all the shards have been migrated. + +### Scaling up the fleet + +To speed up the transfer, you can scale the number of workers. It may take a few minutes for these additional workers to come online. 
The following command sets the worker fleet to a specified size. For example, to scale the fleet to five workers, run:

```shell
console backfill scale 5
```
{% include copy.html %}

We recommend slowly scaling up the fleet while monitoring the health metrics of the target cluster to avoid over-saturating it. [Amazon OpenSearch Service domains](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/monitoring.html) provide a number of metrics and logs that can provide this insight.

### Pausing the migration

To pause a migration, use the following command:

```shell
console backfill pause
```
{% include copy.html %}

This will stop all existing workers from running while leaving the backfill operation in a state from which it can be restarted. When you want to restart the migration, perform one of the following actions:

- Run `console backfill start`.
- Scale up the worker count by running `console backfill scale <worker_count>`.

### Stopping the migration

Completing the backfill process requires manually stopping the migration. Stopping the migration shuts down all workers and cleans up all metadata used to track and coordinate the migration. Once the status checks report that your data has been completely migrated, you can stop the migration with the following command:

```shell
console backfill stop
```
{% include copy.html %}

Migration Assistant should return the following response:

```shell
Backfill stopped successfully.
Service migration-aws-integ-reindex-from-snapshot set to 0 desired count. Currently 0 running and 5 pending.
Archiving the working state of the backfill operation...
RFS Workers are still running, waiting for them to complete...
Backfill working state archived to: /shared-logs-output/migration-console-default/backfill_working_state/working_state_backup_20241115174822.json
```

A stopped migration cannot be restarted. If you want to halt the backfill temporarily and resume it later, use `console backfill pause` instead.

### Amazon CloudWatch metrics and dashboard

Migration Assistant creates an Amazon CloudWatch dashboard, named `MigrationAssistant_ReindexFromSnapshot_Dashboard`, that you can use to visualize the health and performance of the backfill process. It combines the metrics for the backfill workers and, for those migrating to Amazon OpenSearch Service, the target cluster.

You can find the backfill dashboard in the CloudWatch console based on the AWS Region in which you have deployed Migration Assistant. The metric graphs for your target cluster will be blank until you select the OpenSearch domain you're migrating to from the dropdown menu at the top of the dashboard.

## Validating the backfill

After the backfill is complete and the workers have stopped, examine the contents of your cluster using the [Refresh API]({{site.url}}{{site.baseurl}}/api-reference/index-apis/refresh/) and the [Flush API]({{site.url}}{{site.baseurl}}/api-reference/index-apis/flush/).
The following example uses the console CLI with the Refresh API to check the backfill status: + +```shell +console clusters cat-indices --refresh +``` +{% include copy.html %} + +This will display the number of documents in each of the indexes in the target cluster, as shown in the following example response: + +```shell +SOURCE CLUSTER +health status index uuid pri rep docs.count docs.deleted store.size pri.store.size +green open my-index -DqPQDrATw25hhe5Ss34bQ 1 0 3 0 12.7kb 12.7kb + +TARGET CLUSTER +health status index uuid pri rep docs.count docs.deleted store.size pri.store.size +green open .opensearch-observability 8HOComzdSlSWCwqWIOGRbQ 1 1 0 0 416b 208b +green open .plugins-ml-config 9tld-PCJToSUsMiyDhlyhQ 5 1 1 0 9.5kb 4.7kb +green open my-index bGfGtYoeSU6U6p8leR5NAQ 1 0 3 0 5.5kb 5.5kb +green open .migrations_working_state lopd47ReQ9OEhw4ZuJGZOg 1 1 2 0 18.6kb 6.4kb +green open .kibana_1 +``` + +You can run additional queries against the target cluster to mimic your production workflow and closely examine the results. diff --git a/_migration-assistant/migration-phases/index.md b/_migration-assistant/migration-phases/index.md new file mode 100644 index 00000000000..37d835135fd --- /dev/null +++ b/_migration-assistant/migration-phases/index.md @@ -0,0 +1,23 @@ +--- +layout: default +title: Migration phases +nav_order: 30 +has_children: true +has_toc: false +permalink: /migration-phases/ +redirect_from: + - /migration-phases/index/ +--- + +# Migration phases + +This page details how to conduct a migration with Migration Assistant. It shows you how to [plan for your migration]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/planning-your-migration/index/) and encompasses a variety of migration scenarios, including: + +- [**Metadata migration**]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/migrating-metadata/): Migrating cluster metadata, such as index settings, aliases, and templates. +- [**Backfill migration**]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/backfill/): Migrating existing or historical data from a source to a target cluster. +- [**Live traffic migration**]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/using-traffic-replayer/): Replicating live ongoing traffic from [a source cluster]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/switching-traffic-from-the-source-cluster/) to a target cluster. + + + + + diff --git a/_migration-assistant/migration-phases/live-traffic-migration/index.md b/_migration-assistant/migration-phases/live-traffic-migration/index.md new file mode 100644 index 00000000000..f9e3d9f71f0 --- /dev/null +++ b/_migration-assistant/migration-phases/live-traffic-migration/index.md @@ -0,0 +1,18 @@ +--- +layout: default +title: Live traffic migration +nav_order: 99 +parent: Migration phases +has_toc: false +has_children: true +--- + +# Live traffic migration + +Live traffic migration intercepts HTTP requests to a source cluster and stores them in a durable stream before forwarding them to the source cluster. The stored requests are then duplicated and replayed to the target cluster. This process synchronizes the source and target clusters while highlighting behavioral and performance differences between them. Kafka is used to manage the data flow and reconstruct HTTP requests. 
You can monitor the replication process through CloudWatch metrics and the [migration console]({{site.url}}{{site.baseurl}}/migration-console/), which provides results in JSON format for analysis. + +To start with live traffic migration, use the following steps: + +1. [Using Traffic Replayer]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/using-traffic-replayer/) +2. [Switching traffic from the source cluster]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/switching-traffic-from-the-source-cluster/) + diff --git a/_migration-assistant/migration-phases/live-traffic-migration/switching-traffic-from-the-source-cluster.md b/_migration-assistant/migration-phases/live-traffic-migration/switching-traffic-from-the-source-cluster.md new file mode 100644 index 00000000000..e9e477654ab --- /dev/null +++ b/_migration-assistant/migration-phases/live-traffic-migration/switching-traffic-from-the-source-cluster.md @@ -0,0 +1,55 @@ +--- +layout: default +title: Switching traffic from the source cluster +nav_order: 110 +grand_parent: Migration phases +parent: Live traffic migration +redirect_from: + - /migration-assistant/migration-phases/switching-traffic-from-the-source-cluster/ +--- + +# Switching traffic from the source cluster + +After the source and target clusters are synchronized, traffic needs to be switched to the target cluster so that the source cluster can be taken offline. + +## Assumptions + +This page assumes that the following has occurred before making the switch: + +- All client traffic is being routed through a switchover listener in the [MigrationAssistant Application Load Balancer]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/backfill/). +- Client traffic has been verified as compatible with the target cluster. +- The target cluster is in a good state to accept client traffic. +- The target proxy service is deployed. + +## Switching traffic + +Use the following steps to switch traffic from the source cluster to the target cluster: + +1. In the AWS Management Console, navigate to **ECS** > **Migration Assistant Cluster**. Note the desired count of the capture proxy, which should be greater than 1. + +2. Update the **ECS Service** of the target proxy to be at least as large as the traffic capture proxy. Wait for tasks to start up, and verify that all targets are healthy in the target proxy service's **Load balancer health** section. + +3. Navigate to **EC2** > **Load Balancers** > **Migration Assistant ALB**. + +4. Navigate to **ALB Metrics** and examine any useful information, specifically looking at **Active Connection Count** and **New Connection Count**. Note any large discrepancies, which can indicate reused connections affecting traffic switchover. + +5. Navigate to **Capture Proxy Target Group** (`ALBSourceProxy-<STAGE>-TG`) > **Monitoring**. + +6. Examine the **Metrics Requests**, **Target (2XX, 3XX, 4XX, 5XX)**, and **Target Response Time** metrics. Verify that this appears as expected and includes all traffic expected to be included in the switchover. Note details that could help identify anomalies during the switchover, including the expected response time and response code rate. + +7. Navigate back to **ALB Metrics** and choose **Target Proxy Target Group** (`ALBTargetProxy-<STAGE>-TG`). Verify that all expected targets are healthy and that none are in a draining state. + +8. Navigate back to **ALB Metrics** and to the **Listener** on port `9200`. + +9. Choose the **Default rule** and **Edit**. + +10. 
Modify the weights of the targets to switch the desired traffic to the target proxy. To perform a full switchover, modify the **Target Proxy** weight to `1` and the **Source Proxy** weight to `0`. + +11. Choose **Save Changes**. + +12. Navigate to both **SourceProxy** and **TargetProxy TG Monitoring** metrics and verify that traffic is switching over as expected. If connections are being reused by clients, perform any necessary actions to terminate them. Monitor these metrics until **SourceProxy TG** shows 0 requests when all clients have switched over. + + +## Fallback + +If you need to fall back to the source cluster at any point during the switchover, revert the **Default rule** so that the Application Load Balancer routes to the **SourceProxy Target Group**. \ No newline at end of file diff --git a/_migration-assistant/migration-phases/live-traffic-migration/using-traffic-replayer.md b/_migration-assistant/migration-phases/live-traffic-migration/using-traffic-replayer.md new file mode 100644 index 00000000000..a1b54d49417 --- /dev/null +++ b/_migration-assistant/migration-phases/live-traffic-migration/using-traffic-replayer.md @@ -0,0 +1,319 @@ +--- +layout: default +title: Using Traffic Replayer +nav_order: 100 +grand_parent: Migration phases +parent: Live traffic migration +redirect_from: + - /migration-assistant/migration-phases/using-traffic-replayer/ +--- + +# Using Traffic Replayer + +This guide covers how to use Traffic Replayer to replay captured traffic from a source cluster to a target cluster during the migration process. Traffic Replayer allows you to verify that the target cluster can handle requests in the same way as the source cluster and catch up to real-time traffic for a smooth migration. + +## When to run Traffic Replayer + +After deploying Migration Assistant, Traffic Replayer does not run by default. It should be started only after all metadata and documents have been migrated to ensure that recent changes to the source cluster are properly reflected in the target cluster. + +For example, if a document was deleted after a snapshot was taken, starting Traffic Replayer before the document migration is complete may cause the deletion request to execute before the document is added to the target. Running Traffic Replayer after all other migration processes ensures that the target cluster will be consistent with the source cluster. + +## Configuration options + +[Traffic Replayer settings]({{site.url}}{{site.baseurl}}/migration-assistant/deploying-migration-assistant/configuration-options/) are configured during the deployment of Migration Assistant. Make sure to set the authentication mode for Traffic Replayer so that it can properly communicate with the target cluster. + +## Using Traffic Replayer + +To manage Traffic Replayer, use the `console replay` command. The following examples show the available commands. + +### Start Traffic Replayer + +The following command starts Traffic Replayer with the options specified at deployment: + +```bash +console replay start +``` + +When starting Traffic Replayer, you should receive an output similar to the following: + +```bash +root@ip-10-0-2-66:~# console replay start +Replayer started successfully. +Service migration-dev-traffic-replayer-default set to 1 desired count. Currently 0 running and 0 pending. 
```

### Check the status of Traffic Replayer

Use the following command to show the status of Traffic Replayer:

```bash
console replay status
```

The status output includes the following fields:

- `Running` shows how many container instances are actively running.
- `Pending` indicates how many instances are being provisioned.
- `Desired` shows the total number of instances that should be running.

You should receive an output similar to the following:

```bash
root@ip-10-0-2-66:~# console replay status
(<ReplayStatus.STOPPED: 4>, 'Running=0\nPending=0\nDesired=0')
```

### Stop Traffic Replayer

The following command stops Traffic Replayer:

```bash
console replay stop
```

You should receive an output similar to the following:

```bash
root@ip-10-0-2-66:~# console replay stop
Replayer stopped successfully.
Service migration-dev-traffic-replayer-default set to 0 desired count. Currently 0 running and 0 pending.
```

### Delivery guarantees

Traffic Replayer retrieves traffic from Kafka and updates its commit cursor after sending requests to the target cluster. This provides an "at least once" delivery guarantee; however, success isn't always guaranteed. Therefore, you should monitor metrics and tuple outputs or perform external validation to ensure that the target cluster is functioning as expected.

## Time scaling

Traffic Replayer sends requests in the same order in which they were received from each connection to the source. However, relative timing between different connections is not guaranteed. For example:

- **Scenario**: Two connections exist: one sends a PUT request every minute, and the other sends a GET request every second.
- **Behavior**: Traffic Replayer will maintain the sequence within each connection, but the relative timing between the connections (PUTs and GETs) is not preserved.

Assume that a source cluster responds to requests (GETs and PUTs) within 100 ms:

- With a **speedup factor of 1**, the target will experience the same request rates and idle periods as the source.
- With a **speedup factor of 2**, requests will be sent twice as fast, with GETs sent every 500 ms and PUTs every 30 seconds.
- With a **speedup factor of 10**, requests will be sent 10x faster, and as long as the target responds quickly, Traffic Replayer can maintain the pace.

If the target cannot respond fast enough, Traffic Replayer will wait for the previous request to complete before sending the next one. This may cause delays and affect global relative ordering.

## Transformations

During migrations, some requests may need to be transformed between versions. For example, Elasticsearch previously supported multiple type mappings in indexes, but this is no longer the case in OpenSearch. Clients may need to be adjusted accordingly by splitting documents into multiple indexes or transforming request data.

Traffic Replayer automatically rewrites host and authentication headers, but for more complex transformations, custom transformation rules can be specified using the `--transformer-config` option. For more information, see the [Traffic Replayer README](https://github.com/opensearch-project/opensearch-migrations/blob/c3d25958a44ec2e7505892b4ea30e5fbfad4c71b/TrafficCapture/trafficReplayer/README.md#transformations).
+ +### Example transformation + +Suppose that a source request contains a `tagToExcise` element that needs to be removed and its children promoted and that the URI path includes `extraThingToRemove`, which should also be removed. The following Jolt script handles this transformation: + +```json +[{ "JsonJoltTransformerProvider": +[ + { + "script": { + "operation": "shift", + "spec": { + "payload": { + "inlinedJsonBody": { + "top": { + "tagToExcise": { + "*": "payload.inlinedJsonBody.top.&" + }, + "*": "payload.inlinedJsonBody.top.&" + }, + "*": "payload.inlinedJsonBody.&" + }, + "*": "payload.&" + }, + "*": "&" + } + } + }, + { + "script": { + "operation": "modify-overwrite-beta", + "spec": { + "URI": "=split('/extraThingToRemove',@(1,&))" + } + } + }, + { + "script": { + "operation": "modify-overwrite-beta", + "spec": { + "URI": "=join('',@(1,&))" + } + } + } +] +}] +``` + +The resulting request sent to the target will appear similar to the following: + +```bash +PUT /oldStyleIndex/moreStuff HTTP/1.0 +host: testhostname + +{"top":{"properties":{"field1":{"type":"text"},"field2":{"type":"keyword"}}}} +``` +{% include copy.html %} + +You can pass Base64-encoded transformation scripts using `--transformer-config-base64`. + +## Result logs + +HTTP transactions from the source capture and those resent to the target cluster are logged in files located at `/shared-logs-output/traffic-replayer-default/*/tuples/tuples.log`. The `/shared-logs-output` directory is shared across containers, including the migration console. You can access these files from the migration console using the same path. Previous runs are also available in a `gzipped` format. + +Each log entry is a newline-delimited JSON object, containing information about the source and target requests/responses along with other transaction details, such as response times. + +These logs contain the contents of all requests, including authorization headers and the contents of all HTTP messages. Ensure that access to the migration environment is restricted, as these logs serve as a source of truth for determining what happened in both the source and target clusters. Response times for the source refer to the amount of time between the proxy sending the end of a request and receiving the response. While response times for the target are recorded in the same manner, keep in mind that the locations of the capture proxy, Traffic Replayer, and target may differ and that these logs do not account for the client's location. +{: .note} + + +### Example log entry + +The following example log entry shows a `/_cat/indices?v` request sent to both the source and target clusters: + +```json +{ + "sourceRequest": { + "Request-URI": "/_cat/indices?v", + "Method": "GET", + "HTTP-Version": "HTTP/1.1", + "Host": "capture-proxy:9200", + "Authorization": "Basic YWRtaW46YWRtaW4=", + "User-Agent": "curl/8.5.0", + "Accept": "*/*", + "body": "" + }, + "sourceResponse": { + "HTTP-Version": {"keepAliveDefault": true}, + "Status-Code": 200, + "Reason-Phrase": "OK", + "response_time_ms": 59, + "content-type": "text/plain; charset=UTF-8", + "content-length": "214", + "body": "aGVhbHRoIHN0YXR1cyBpbmRleCAgICAgICB..." 
+ }, + "targetRequest": { + "Request-URI": "/_cat/indices?v", + "Method": "GET", + "HTTP-Version": "HTTP/1.1", + "Host": "opensearchtarget", + "Authorization": "Basic YWRtaW46bXlTdHJvbmdQYXNzd29yZDEyMyE=", + "User-Agent": "curl/8.5.0", + "Accept": "*/*", + "body": "" + }, + "targetResponses": [{ + "HTTP-Version": {"keepAliveDefault": true}, + "Status-Code": 200, + "Reason-Phrase": "OK", + "response_time_ms": 721, + "content-type": "text/plain; charset=UTF-8", + "content-length": "484", + "body": "aGVhbHRoIHN0YXR1cyBpbmRleCAgICAgICB..." + }], + "connectionId": "0242acfffe13000a-0000000a-00000005-1eb087a9beb83f3e-a32794b4.0", + "numRequests": 1, + "numErrors": 0 +} +``` +{% include copy.html %} + + +### Decoding log content + +The contents of HTTP message bodies are Base64 encoded in order to handle various types of traffic, including compressed data. To view the logs in a more human-readable format, use the console library `tuples show`. Running the script as follows will produce a `readable-tuples.log` in the home directory: + +```shell +console tuples show --in /shared-logs-output/traffic-replayer-default/d3a4b31e1af4/tuples/tuples.log > readable-tuples.log +``` + +The `readable-tuples.log` should appear similar to the following: + +```json +{ + "sourceRequest": { + "Request-URI": "/_cat/indices?v", + "Method": "GET", + "HTTP-Version": "HTTP/1.1", + "Host": "capture-proxy:9200", + "Authorization": "Basic YWRtaW46YWRtaW4=", + "User-Agent": "curl/8.5.0", + "Accept": "*/*", + "body": "" + }, + "sourceResponse": { + "HTTP-Version": {"keepAliveDefault": true}, + "Status-Code": 200, + "Reason-Phrase": "OK", + "response_time_ms": 59, + "content-type": "text/plain; charset=UTF-8", + "content-length": "214", + "body": "health status index uuid ..." + }, + "targetRequest": { + "Request-URI": "/_cat/indices?v", + "Method": "GET", + "HTTP-Version": "HTTP/1.1", + "Host": "opensearchtarget", + "Authorization": "Basic YWRtaW46bXlTdHJvbmdQYXNzd29yZDEyMyE=", + "User-Agent": "curl/8.5.0", + "Accept": "*/*", + "body": "" + }, + "targetResponses": [{ + "HTTP-Version": {"keepAliveDefault": true}, + "Status-Code": 200, + "Reason-Phrase": "OK", + "response_time_ms": 721, + "content-type": "text/plain; charset=UTF-8", + "content-length": "484", + "body": "health status index uuid ..." + }], + "connectionId": "0242acfffe13000a-0000000a-00000005-1eb087a9beb83f3e-a32794b4.0", + "numRequests": 1, + "numErrors": 0 +} +``` + + +## Amazon CloudWatch metrics and dashboard +Migration Assistant creates an Amazon CloudWatch dashboard named `MigrationAssistant_ReindexFromSnapshot_Dashboard` to visualize the health and performance of the backfill process. This dashboard combines metrics for the backfill workers and migration to Amazon OpenSearch Service, providing insights into the performance and health of the Capture Proxy and Traffic Replayer components, including metrics such as: + +- The number of bytes read and written. +- The number of active connections. +- The replay speed multiplier. + +You can find the Capture and Replay dashboard in the AWS Management Console for CloudWatch Dashboards in the AWS Region where you deployed Migration Assistant. + +Traffic Replayer emits various OpenTelemetry metrics to Amazon CloudWatch, and traces are sent through AWS X-Ray. The following are some useful metrics that can help evaluate migration performance. 
+ +### `sourceStatusCode` + +This metric tracks the HTTP status codes for both the source and target clusters, with dimensions for the HTTP verb, such as `GET` or `POST`, and the status code families (200--299). These dimensions can help quickly identify discrepancies between the source and target, such as when `DELETE 200s` becomes `4xx` or `GET 4xx` errors turn into `5xx` errors. + +### `lagBetweenSourceAndTargetRequests` + +This metric shows the delay between requests hitting the source and target clusters. With a speedup factor greater than 1 and a target cluster that can handle requests efficiently, this value should decrease as the replay progresses, indicating a reduction in replay lag. + +### Additional metrics + +The following metrics are also reported: + +- **Throughput**: `bytesWrittenToTarget` and `bytesReadFromTarget` indicate the throughput to and from the cluster. +- **Retries**: `numRetriedRequests` tracks the number of requests retried due to status code mismatches between the source and target. +- **Event counts**: Various `(*)Count` metrics track the number of completed events. +- **Durations**: `(*)Duration` metrics measure the duration of each step in the process. +- **Exceptions**: `(*)ExceptionCount` shows the number of exceptions encountered during each processing phase. + + +## CloudWatch considerations + +Metrics and dashboards pushed to CloudWatch may experience a visibility lag of around 5 minutes. CloudWatch also retains higher-resolution data for a shorter period than lower-resolution data. For more information, see [Amazon CloudWatch concepts](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/cloudwatch_concepts.html). \ No newline at end of file diff --git a/_migration-assistant/migration-phases/migrating-metadata.md b/_migration-assistant/migration-phases/migrating-metadata.md new file mode 100644 index 00000000000..249a2ca4d0c --- /dev/null +++ b/_migration-assistant/migration-phases/migrating-metadata.md @@ -0,0 +1,247 @@ +--- +layout: default +title: Migrating metadata +nav_order: 85 +parent: Migration phases +--- + +# Migrating metadata + +Metadata migration involves creating a snapshot of your cluster and then migrating the metadata from the snapshot using the migration console. + +This tool gathers information from a source cluster through a snapshot or through HTTP requests against the source cluster. These snapshots are fully compatible with the backfill process for `Reindex-From-Snapshot` (RFS) scenarios. + +After collecting information on the source cluster, comparisons are made against the target cluster. If running a migration, any metadata items that do not already exist will be created on the target cluster. + +## Creating the snapshot + +Creating a snapshot of the source cluster captures all the metadata and documents to be migrated to a new target cluster. + +Create the initial snapshot of the source cluster using the following command: + +```shell +console snapshot create +``` +{% include copy.html %} + +To check the progress of the snapshot in real time, use the following command: + +```shell +console snapshot status --deep-check +``` +{% include copy.html %} + +You should receive the following response when the snapshot is created: + +```shell +SUCCESS +Snapshot is SUCCESS. 
+Percent completed: 100.00% +Data GiB done: 29.211/29.211 +Total shards: 40 +Successful shards: 40 +Failed shards: 0 +Start time: 2024-07-22 18:21:42 +Duration: 0h 13m 4s +Anticipated duration remaining: 0h 0m 0s +Throughput: 38.13 MiB/sec +``` + +### Managing slow snapshot speeds + +Depending on the size of the data in the source cluster and the bandwidth allocated for snapshots, the process can take some time. Adjust the maximum rate at which the source cluster's nodes create the snapshot using the `--max-snapshot-rate-mb-per-node` option. Increasing the snapshot rate will consume more node resources, which may affect the cluster's ability to handle normal traffic. + +## Command arguments + +For the following commands, to identify all valid arguments, please run with `--help`. + +```shell +console metadata evaluate --help +``` +{% include copy.html %} + +```shell +console metadata migrate --help +``` +{% include copy.html %} + +Based on the migration console deployment options, a number of commands will be pre-populated. To view them, run console with verbosity: + +```shell +console -v metadata migrate --help +``` +{% include copy.html %} + +You should receive a response similar to the following: + +```shell +(.venv) bash-5.2# console -v metadata migrate --help +INFO:console_link.cli:Logging set to INFO +. +. +. +INFO:console_link.models.metadata:Migrating metadata with command: /root/metadataMigration/bin/MetadataMigration --otel-collector-endpoint http://otel-collector:4317 migrate --snapshot-name snapshot_2023_01_01 --target-host https://opensearchtarget:9200 --min-replicas 0 --file-system-repo-path /snapshot/test-console --target-username admin --target-password ******** --target-insecure --help +. +. +. +``` + + +## Using the `evaluate` command + +By scanning the contents of the source cluster, applying filtering, and applying modifications a list of all items that will be migrated will be created. Any items not seen in this output will not be migrated onto the target cluster if the migrate command was to be run. This is a safety check before making modifications on the target cluster. + +```shell +console metadata evaluate [...] +``` +{% include copy.html %} + +You should receive a response similar to the following: + +```bash +Starting Metadata Evaluation +Clusters: + Source: + Remote Cluster: OpenSearch 1.3.16 ConnectionContext(uri=http://localhost:33039, protocol=HTTP, insecure=false, compressionSupported=false) + + Target: + Remote Cluster: OpenSearch 2.14.0 ConnectionContext(uri=http://localhost:33037, protocol=HTTP, insecure=false, compressionSupported=false) + + +Migration Candidates: + Index Templates: + simple_index_template + + Component Templates: + simple_component_template + + Indexes: + blog_2023, movies_2023 + + Aliases: + alias1, movies-alias + + +Results: + 0 issue(s) detected +``` + + +## Using the migrate command + +Running through the same data as the evaluate command all of the migrated items will be applied onto the target cluster. If re-run multiple times items that were previously migrated will not be recreated. If any items do need to be re-migrated, please delete them from the target cluster and then rerun the evaluate then migrate commands to ensure the desired changes are made. + +```shell +console metadata migrate [...] 
```
{% include copy.html %}

You should receive a response similar to the following:

```shell
Starting Metadata Migration

Clusters:
   Source:
      Snapshot: OpenSearch 1.3.16 FileSystemRepo(repoRootDir=/tmp/junit10626813752669559861)

   Target:
      Remote Cluster: OpenSearch 2.14.0 ConnectionContext(uri=http://localhost:33042, protocol=HTTP, insecure=false, compressionSupported=false)


Migrated Items:
   Index Templates:
      simple_index_template

   Component Templates:
      simple_component_template

   Indexes:
      blog_2023, movies_2023

   Aliases:
      alias1, movies-alias


Results:
   0 issue(s) detected
```

## Metadata verification process

Before moving on to additional migration steps, we recommend confirming details of your cluster. Depending on your configuration, this could mean checking the sharding strategy or making sure index mappings are correctly defined by ingesting a test document.

## Troubleshooting

Use these instructions to help troubleshoot the following issues.

### Accessing detailed logs

Metadata migration creates a detailed log file that includes low-level tracing information for troubleshooting. A log file is created for each execution of the program inside a shared volume on the migration console named `shared-logs-output`. The following command lists all log files, one for each run of the command:

```shell
ls -al /shared-logs-output/migration-console-default/*/metadata/
```
{% include copy.html %}

To inspect a file from within the console, use command line tools such as `cat`, `tail`, and `grep`. Looking for warnings, errors, and exceptions in this log file can help you understand the source of failures, or at the very least provide useful details when creating issues in this project:

```shell
tail /shared-logs-output/migration-console-default/*/metadata/*.log
```
{% include copy.html %}

### Warnings and errors

`WARN` or `ERROR` elements in the response are accompanied by a short message, such as `WARN - my_index already exists`. More information can be found in the detailed logs associated with the warning or error.

### OpenSearch running in compatibility mode

You might encounter an error stating that an ES 7.10.2 cluster cannot be updated. This can occur when compatibility mode is enabled on an OpenSearch cluster. Disable compatibility mode to continue. For more information, see [Enable compatibility mode](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/rename.html#rename-upgrade).

### Breaking change compatibility

Metadata migration requires modifying data from the source version to the target version in order to recreate items. Some features are no longer supported and have been removed from the target version, while others are simply not available in the target version, which is especially true when downgrading. While this tool is meant to make this process easier, it is not exhaustive in its support. If you encounter a compatibility issue or an important feature gap for your migration, [search the issues and comment on an existing issue](https://github.com/opensearch-project/opensearch-migrations/issues) or [create a new issue](https://github.com/opensearch-project/opensearch-migrations/issues/new/choose) if one cannot be found.

#### Deprecation of mapping types

Mapping types were deprecated in Elasticsearch 6.x and removed in Elasticsearch 7.0 and later, which creates complexity when migrating to newer versions of Elasticsearch and OpenSearch. For more information, see [the removal of mapping types](https://www.elastic.co/guide/en/elasticsearch/reference/7.17/removal-of-types.html) ↗.

Because metadata migration supports migrating from ES 6.8 onward to the latest versions of OpenSearch, this scenario is handled by removing the mapping types and restructuring the template or index properties. Note that, at the time of this writing, multiple type mappings are not supported; see the [tracking task](https://opensearch.atlassian.net/browse/MIGRATIONS-1778) ↗.


**Example starting state with mapping type `foo` (ES 6):**

```json
{
  "mappings": [
    {
      "foo": {
        "properties": {
          "field1": { "type": "text" },
          "field2": { "type": "keyword" }
        }
      }
    }
  ]
}
```
{% include copy.html %}

**Example ending state with `foo` removed (ES 7):**

```json
{
  "mappings": {
    "properties": {
      "field1": { "type": "text" },
      "field2": { "type": "keyword" }
    }
  }
}
```
{% include copy.html %}

For additional technical details, [view the mapping type removal source code](https://github.com/opensearch-project/opensearch-migrations/blob/main/transformation/src/main/java/org/opensearch/migrations/transformation/rules/IndexMappingTypeRemoval.java).
diff --git a/_migration-assistant/migration-phases/planning-your-migration/assessing-your-cluster-for-migration.md b/_migration-assistant/migration-phases/planning-your-migration/assessing-your-cluster-for-migration.md
new file mode 100644
index 00000000000..1687da15893
--- /dev/null
+++ b/_migration-assistant/migration-phases/planning-your-migration/assessing-your-cluster-for-migration.md
@@ -0,0 +1,76 @@
---
layout: default
title: Assessing your cluster for migration
nav_order: 60
parent: Planning your migration
grand_parent: Migration phases
redirect_from:
  - /migration-assistant/migration-phases/assessing-your-cluster-for-migration/
---

# Assessing your cluster for migration

The goal of Migration Assistant is to streamline the process of migrating from one location or version of Elasticsearch/OpenSearch to another. However, completing a migration sometimes requires resolving client compatibility issues before clients can communicate directly with the target cluster.

## Understanding breaking changes

Before performing any upgrade or migration, you should review any breaking changes that might exist between versions because clients may require changes in order to connect to the new cluster. Select the versions in your migration path in the following tool.
The tool will respond with any breaking changes you should note when migrating: + +<link rel="stylesheet" href="{{site.url}}{{site.baseurl}}/migration-assistant/assets/css/breaking-changes-selector.css"> + +<div class="breaking-changes-selector"> + <h4>Find a list of breaking changes for your migration path</h4> + + <div> + <label for="source-version">Source:</label> + <select id="source-version"> + <option value="">Select</option> + <!-- Source versions will be populated by JavaScript --> + </select> + + <label for="target-version">Target:</label> + <select id="target-version"> + <option value="">Select</option> + <!-- Target versions will be populated by JavaScript --> + </select> + </div> + + <div> + <label>Include Optional Components:</label><br> + <!-- Components will be populated by JavaScript --> + <span id="component-checkboxes"></span> + </div> + + <div id="breaking-changes-results"></div> +</div> + +<div id="migration-data" + data-migration-paths="{{ site.data.migration-assistant.valid_migrations.migration_paths | jsonify | escape }}" + data-breaking-changes="{{ site.data.migration-assistant.breaking-changes.breaking_changes | jsonify | escape }}" + style="display:none;"></div> + +<script type="module" src="{{site.url}}{{site.baseurl}}/migration-assistant/assets/js/breaking-changes-index.js"></script> + +## Impact of data transformations + +Any time you apply a transformation to your data, such as: + +- Changing index names +- Modifying field names or field mappings +- Splitting indices with type mappings + +These changes might need to be reflected in your client configurations. For example, if your clients are reliant on specific index or field names, you must ensure that their queries are updated accordingly. + +We recommend running production-like queries against the target cluster before switching over actual production traffic. This helps verify that the client can: + +- Communicate with the target cluster +- Locate the necessary indices and fields +- Retrieve the expected results + +For complex migrations involving multiple transformations or breaking changes, we highly recommend performing a trial migration with representative, non-production data (e.g., in a staging environment) to fully test client compatibility with the target cluster. + +## Supported transformations + +The following [transformations]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/live-traffic-migration/using-traffic-replayer/#transformations) are included in Migration Assistant. They can be enabled, combined, and configured to customize a migration for your use case. To request additional Migration Assistant transformations , create a GitHub issue [in the OpenSearch migrations repository](https://github.com/opensearch-project/opensearch-migrations/issues). 
+ +- [Type mapping deprecation]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/planning-your-migration/handling-type-mapping-deprecation/) diff --git a/_migration-assistant/migration-phases/planning-your-migration/handling-field-type-breaking-changes.md b/_migration-assistant/migration-phases/planning-your-migration/handling-field-type-breaking-changes.md new file mode 100644 index 00000000000..b6a653f96ae --- /dev/null +++ b/_migration-assistant/migration-phases/planning-your-migration/handling-field-type-breaking-changes.md @@ -0,0 +1,158 @@ +--- +layout: default +title: Handling breaking changes in field types +nav_order: 60 +parent: Planning your migration +grand_parent: Migration phases +--- + +# Handling breaking changes in field types + +This guide explains how to use Migration Assistant to transform field types that are deprecated or incompatible during a migration to OpenSearch. + +Field types define how data is stored and queried in an index. Each field in a document is mapped to a data type, which determines how it is indexed and what operations can be performed on it. + +For example, the following index mapping for a library's book collection defines three fields, each with a different type: + +```json +GET /library-books/_mappings +{ + "library-books": { + "mappings": { + "properties": { + "title": { "type": "text" }, + "publishedDate": { "type": "date" }, + "pageCount": { "type": "integer" } + } + } + } +} +``` + +For more information, see [Mappings and field types]({{site.url}}{{site.baseurl}}/field-types/). + +## Configure item transformations + +You can customize how field types are transformed during metadata and data migrations by supplying a transformation configuration file using the following steps: + +1. Open the Migration Assistant console. +2. Create a JavaScript file to define your transformation logic using the following command: + + ```bash + vim /shared-logs-output/field-type-converter.js + ``` + {% include copy.html %} + +3. Write any JavaScript rules that perform the desired field type conversions. For an example of how the rules can be implemented, see the [example `field-type-converter.js` implementation](#example-field-type-converterjs-implementation). +4. Create a transformation descriptor file using the following command: + + ```bash + vim /shared-logs-output/transformation.json + ``` + {% include copy.html %} + +5. Add a reference to your JavaScript file in `transformation.json`. +6. Run the metadata migration and supply the transformation configuration using a command similar to the following: + + ```bash + console metadata migrate \ + --transformer-config-file /shared-logs-output/transformation.json + ``` + {% include copy.html %} + +### Example `field-type-converter.js` implementation + +The following script demonstrates how to perform common field type conversions, including: + +* Replacing the deprecated `string` type with `text`. +* Converting `flattened` to `flat_object` and removing the `index` property if present. 
+ +```javascript +function main(context) { + const rules = [ + { + when: { type: "string" }, + set: { type: "text" } + }, + { + when: { type: "flattened" }, + set: { type: "flat_object" }, + remove: ["index"] + } + ]; + + function applyRules(node, rules) { + if (Array.isArray(node)) { + node.forEach((child) => applyRules(child, rules)); + } else if (node instanceof Map) { + for (const { when, set, remove = [] } of rules) { + const matches = Object.entries(when).every(([k, v]) => node.get(k) === v); + if (matches) { + Object.entries(set).every(([k, v]) => node.set(k, v)); + remove.forEach((key) => node.delete(key)); + } + } + for (const child of node.values()) { + applyRules(child, rules); + } + } else if (node && typeof node === "object") { + for (const { when, set, remove = [] } of rules) { + const matches = Object.entries(when).every(([k, v]) => node[k] === v); + if (matches) { + Object.assign(node, set); + remove.forEach((key) => delete node[key]); + } + } + Object.values(node).forEach((child) => applyRules(child, rules)); + } + } + + return (doc) => { + if (doc && doc.type && doc.name && doc.body) { + applyRules(doc, rules); + } + return doc; + }; +} +(() => main)(); +``` +{% include copy.html %} + +The script contains the following elements: + +1. The `rules` array defines transformation logic: + + * `when`: Key-value conditions to match on a node + * `set`: Key-value pairs to apply when the `when` clause matches + * `remove` (optional): Keys to delete from the node when matched + +2. The `applyRules` function recursively traverses the input: + + * Arrays are recursively processed element by element. + * `Map` objects are matched and mutated using the defined rules. + * Plain objects are checked for matches and transformed accordingly. + +3. The `main` function returns a transformation function that: + + * Applies the rules to each document. + * Returns the modified document for migration or replay. + +### Example `transformation.json` + +The following JSON file references your transformation script and initializes the JavaScript engine with your custom rules: + +```json +[ + { + "JsonJSTransformerProvider": { + "initializationScriptFile": "/shared-logs-output/field-type-converter.js", + "bindingsObject": "{}" + } + } +] +``` +{% include copy.html %} + +## Summary + +By using a transformation configuration, you can rewrite deprecated or incompatible field types during metadata migration or data replay. This ensures that your target OpenSearch cluster only receives compatible mappings—even if the source cluster includes outdated types like `string` or features like `flattened` that need conversion. diff --git a/_migration-assistant/migration-phases/planning-your-migration/handling-type-mapping-deprecation.md b/_migration-assistant/migration-phases/planning-your-migration/handling-type-mapping-deprecation.md new file mode 100644 index 00000000000..edd81612dac --- /dev/null +++ b/_migration-assistant/migration-phases/planning-your-migration/handling-type-mapping-deprecation.md @@ -0,0 +1,318 @@ +--- +layout: default +title: Managing type mapping deprecation +nav_order: 60 +parent: Planning your migration +grand_parent: Migration phases +--- + +# Managing type mapping deprecation + +This guide provides solutions for managing the deprecation of the type mapping functionality when migrating from Elasticsearch 6.x or earlier to OpenSearch. + +In versions of Elasticsearch prior to 6.x, an index could contain multiple types, each with its own mapping. 
These types allowed you to store and query different kinds of documents—such as books and movies—in a single index. For example, both `book` and `movie` types could have a shared field like `title`, while each had additional fields specific to that type. + +Newer versions of Elasticsearch and OpenSearch no longer support multiple mapping types. Each index now supports only a single mapping type. During migration, you must define how to transform or restructure data that used multiple types. The following example shows multiple mapping types: + + +```JSON +GET /library/_mappings +{ + "library": { + "mappings": { + "book": { + "properties": { + "title": { "type": "text" }, + "pageCount": { "type": "integer" } + } + }, + "movie": { + "properties": { + "title": { "type": "text" }, + "runTime": { "type": "integer" } + } + } + } + } +} +``` + +For more information, see the [official Elasticsearch documentation on the removal of mapping types](https://www.elastic.co/guide/en/elasticsearch/reference/7.10/removal-of-types.html). + +## Using the type mapping transformer + +To address type mapping deprecation, use the `TypeMappingsSanitizationTransformer`. This transformer can modify data, including metadata, documents, and requests, so that the previously mapped data can be used in OpenSearch. To use the mapping transformer: + +1. Navigate to the bootstrap box and open the `cdk.context.json` file with Vim. +2. Add or update the key `reindexFromSnapshotExtraArgs` to include `--doc-transformer-config-file /shared-logs-output/transformation.json`. +3. Add or update the key `trafficReplayerExtraArgs` to include `--transformer-config-file /shared-logs-output/transformation.json`. +4. Deploy Migration Assistant. +5. Navigate to the Migration Assistant console. +6. Create a file named `/shared-logs-output/transformation.json`. +7. Add your transformation configuration to the file. For configuration options, see [Configuration options](#configuration-options). +8. When running the metadata migration, run the configuration with the transformer using the command `console metadata migrate --transformer-config-file /shared-logs-output/transformation.json`. + +Whenever the transformation configuration is updated, the backfill and replayer tools need to be stopped and restarted in order to apply the changes. Any previously migrated data and metadata may need to be cleared in order to avoid an inconsistent state. + +### Configuration options + +The `TypeMappingsSanitizationTransformer` supports several strategies for managing type mappings: + +1. **Route different types to separate indexes**: Split different types into their own indexes. +2. **Merge all types into one index**: Combine multiple types into a single index. +3. **Drop specific types**: Selectively migrate only specific types. +4. **Keep the original structure**: Maintain the same index name while conforming to the new type standards. + +### Type mapping transformer configuration schema + +The type mapping transformer uses the following configuration options. + +| **Field** | **Type** | **Required** | **Description** | +| :--- | :--- | :--- | :--- | +| `staticMappings` | `object` | No | A map of `{ indexName: { typeName: targetIndex } }` used to **statically** route specific types. <br/><br/> For any **index** listed on this page, types **not** included in its object are **dropped** (no data or requests are migrated for those omitted types). 
| +| `regexMappings` | `array` | No | A list of **regex-based** rules for **dynamic** routing of source index/type names to a target index. <br/><br/> Each element in this array is itself an object with `sourceIndexPattern`, `sourceTypePattern`, and `targetIndexPattern` fields. <br/><br/> For information about the **default value**, see [Defaults](#Defaults). | +| `sourceProperties` | `object` | Yes | Additional **metadata** about the source (for example, its Elasticsearch/OpenSearch version). Must include at least `"version"` with `"major"` and `"minor"` fields. | + +The following example JSON configuration provides a transformation schema: + +<details> +<summary>Example JSON Configuration</summary> + +```JSON +{ + "TypeMappingSanitizationTransformerProvider": { + "staticMappings": { + "{index-name-1}": { + "{type-name-1}": "{target-index-name-1}", + "{type-name-2}": "{target-index-name-2}" + } + }, + "regexMappings": [ + { + "sourceIndexPattern": "{source-index-pattern}", + "sourceTypePattern": "{source-type-pattern}", + "targetIndexPattern": "{target-index-pattern}" + } + ], + "sourceProperties": { + "version": { + "major": "NUMBER", + "minor": "NUMBER" + } + } + } +} +``` +{% include copy.html %} + +</details> + +## Example configurations + +The following example configurations show you how to use the transformer for different mapping type scenarios. + +### Route different types to separate indexes + +If you have an index `activity` with types `user` and `post` that you want to split into separate indexes, use the following configuration: + +```json +[ + { + "TypeMappingSanitizationTransformerProvider": { + "staticMappings": { + "activity": { + "user": "new_users", + "post": "new_posts" + } + }, + "sourceProperties": { + "version": { + "major": 6, + "minor": 8 + } + } + } + } +] +{% include copy.html %} +``` + +This transformer will perform the following: + +- Route documents with type `user` to the `new_users` index. +- Route documents with type `post` to the `new_posts` index. + +### Merge all types into one index + +To merge all types into one index, use the following configuration: + +```json +[ + { + "TypeMappingSanitizationTransformerProvider": { + "staticMappings": { + "activity": { + "user": "activity", + "post": "activity" + } + }, + "sourceProperties": { + "version": { + "major": 6, + "minor": 8 + } + } + } + } +] +``` +{% include copy.html %} + +### Drop specific types + +To migrate only the `user` type within the `activity` index and drop all documents/requests with types not directly specified, use the following configuration: + +```json +[ + { + "TypeMappingSanitizationTransformerProvider": { + "staticMappings": { + "activity": { + "user": "users_only" + } + }, + "sourceProperties": { + "version": { + "major": 6, + "minor": 8 + } + } + } + } +] +``` +{% include copy.html %} + +This configuration only migrates documents of type `user` and ignores other document types in the `activity` index. + +### Keep the original structure + +To migrate only specific types and keep the original structure, use the following configuration: + +```JSON +[ + { + "TypeMappingSanitizationTransformerProvider": { + "regexMappings": [ + { + "sourceIndexPattern": "(.*)", + "sourceTypePattern": ".*", + "targetIndexPattern": "$1" + } + ], + "sourceProperties": { + "version": { + "major": 6, + "minor": 8 + } + } + } + } +] +``` +{% include copy.html %} + +This is equivalent to the strategy of merging all types into one index but also uses a pattern-based routing strategy. 
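
For example, under this configuration, a typed index request keeps its original index name while the type is dropped. The following hypothetical before-and-after pair sketches the effect on a single document request; the `library` index, `book` type, and document fields are placeholder values, and the exact rewritten form depends on how the Traffic Replayer emits typeless requests for your target version.

A source request such as the following:

```json
PUT /library/book/1
{
  "title": "The Left Hand of Darkness",
  "pageCount": 304
}
```

would be routed to the same index on the target, using the typeless endpoint:

```json
PUT /library/_doc/1
{
  "title": "The Left Hand of Darkness",
  "pageCount": 304
}
```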
+ +### Combining multiple strategies + +You can combine both static and regex-based mappings to manage different indexes or patterns in a single migration. For example, you might have one index that must use `staticMappings` and another that uses `regexMappings` to route all types by pattern. + +For each document, request, or metadata item (processed individually for bulk requests), the following steps are performed: + +1. The index is checked to determine whether it matches an entry in the static mappings. + - If matched, the type is checked against the index component of the static mappings entry. + - If the type matches, the mapping is applied, and the item is routed to the target index specified as that type's value. + - If the type doesn't match, the request/document/metadata is dropped and not migrated. +2. If the index is not matched in the static mappings, the index-type combination is checked against each item in the regex mappings list, in order from first to last. If a match is found, the mapping is applied, the target index name is constructed from the target index pattern, and no further regex matching is performed. +3. Any request, document, or metadata that doesn't match the preceding cases is dropped, and the documents they contain are not migrated. + +The following example demonstrates how to combine static and regex-based mappings for different indexes: + +```json +[ + { + "TypeMappingSanitizationTransformerProvider": { + "staticMappings": { + "activity": { + "user": "users_activity", + "post": "posts_activity" + }, + "logs": { + "error": "logs_error", + "info": "logs_info" + } + }, + "regexMappings": [ + { + "sourceIndexPattern": "orders.*", + "sourceTypePattern": ".*", + "targetIndexPattern": "all_orders" + } + ], + "sourceProperties": { + "version": { + "major": 6, + "minor": 8 + } + } + } + } +] +``` +{% include copy.html %} + +### Defaults + +When the `regexMappings` key is missing from the transformation configuration, `regexMappings` will default to the following: + +```JSON +{ + "regexMappings": [ + { + "sourceIndexPattern": "(.+)", + "sourceTypePattern": "_doc", + "targetIndexPattern": "$1" + }, + { + "sourceIndexPattern": "(.+)", + "sourceTypePattern": "(.+)", + "targetIndexPattern": "$1_$2" + } + ] +} +``` +{% include copy.html %} + +This has the effect of retaining the index name for indexes created in Elasticsearch 6.x or later while combining the type and index name for indexes created in Elasticsearch 5.x. If you want to retain the index name for indexes created in Elasticsearch 5.x, use the `staticMappings` option or override the type mappings using the `regexMappings` option. + +## Limitations + +When using the transformer, remember the following limitations. + +### Traffic Replayer + +For the Traffic Replayer, **only a subset** of requests that include types is supported. These requests are listed in the following table. + +| **Operation** | **HTTP method(s)** | **Endpoint** | **Description** | +| :--- | :--- | :--- | :--- | +| **Index (by ID)** | PUT/POST | `/{index}/{type}/{id}` | Create or update a single document with an explicit ID. | +| **Index (auto ID)** | PUT/POST | `/{index}/{type}/` | Create a single document for which the ID is automatically generated. | +| **Get Document** | GET | `/{index}/{type}/{id}` | Retrieve a document by ID. | +| **Bulk Index/Update/Delete** | PUT/POST | `/_bulk` | Perform multiple create/update/delete operations in a single request. 
| +| **Bulk Index/Update/Delete** | PUT/POST | `/{index}/_bulk` | Perform multiple create/update/delete operations in a single request with default index assignment. | +| **Bulk Index/Update/Delete** | PUT/POST | `/{index}/{type}/_bulk` | Perform multiple create/update/delete operations in a single request with default index and type assignment. | +| **Create/Update Index** | PUT/POST | `/{index}` | Create or update an index. <br/><br/> **Split** behavior is not supported in the Traffic Replayer. See [this GitHub issue](https://github.com/opensearch-project/opensearch-migrations/issues/1305) to provide feedback or to vote on this feature. | + +### Reindex-from-Snapshot +For `Reindex-from-Snapshot`, indexes created in Elasticsearch 6.x or later will use `_doc` as the type for all documents, even if a different type was specified in Elasticsearch 6.x. diff --git a/_migration-assistant/migration-phases/planning-your-migration/index.md b/_migration-assistant/migration-phases/planning-your-migration/index.md new file mode 100644 index 00000000000..4f0d264e93e --- /dev/null +++ b/_migration-assistant/migration-phases/planning-your-migration/index.md @@ -0,0 +1,15 @@ +--- +layout: default +title: Planning your migration +nav_order: 59 +parent: Migration phases +has_toc: false +has_children: true +--- + +# Planning your migration + +This section describes how to plan for your migration to OpenSearch by: + +- [Assessing your current cluster for migration]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/planning-your-migration/assessing-your-cluster-for-migration/). +- [Verifying that you have the tools for migration]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/planning-your-migration/verifying-migration-tools/). \ No newline at end of file diff --git a/_migration-assistant/migration-phases/planning-your-migration/verifying-migration-tools.md b/_migration-assistant/migration-phases/planning-your-migration/verifying-migration-tools.md new file mode 100644 index 00000000000..6cfe762c85b --- /dev/null +++ b/_migration-assistant/migration-phases/planning-your-migration/verifying-migration-tools.md @@ -0,0 +1,223 @@ +--- +layout: default +title: Verifying migration tools +nav_order: 70 +parent: Planning your migration +grand_parent: Migration phases +redirect_from: + - /migration-assistant/migration-phases/verifying-migration-tools/ +--- + +# Verifying migration tools + +Before using the Migration Assistant, take the following steps to verify that your cluster is ready for migration. + +## Verifying snapshot creation + +Verify that a snapshot of your source cluster can be created and used for metadata and backfill scenarios. + +### Installing the Elasticsearch S3 repository plugin + +The snapshot needs to be stored in a location that Migration Assistant can access. This guide uses Amazon Simple Storage Service (Amazon S3). By default, Migration Assistant creates an S3 bucket for storage. Therefore, it is necessary to install the [Elasticsearch S3 repository plugin](https://www.elastic.co/guide/en/elasticsearch/plugins/7.10/repository-s3.html) on your source nodes. + +Additionally, make sure that the plugin has been configured with AWS credentials that allow it to read and write to Amazon S3. 
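
The required permissions depend on your environment, but a minimal IAM policy for the snapshot bucket typically resembles the following sketch. The bucket name is a placeholder, and the listed actions are an assumption based on common S3 snapshot repository configurations; adjust them to match your security requirements.

```json
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Action": [
        "s3:ListBucket",
        "s3:GetBucketLocation"
      ],
      "Resource": "arn:aws:s3:::<your-bucket-name>"
    },
    {
      "Effect": "Allow",
      "Action": [
        "s3:GetObject",
        "s3:PutObject",
        "s3:DeleteObject",
        "s3:AbortMultipartUpload",
        "s3:ListMultipartUploadParts"
      ],
      "Resource": "arn:aws:s3:::<your-bucket-name>/*"
    }
  ]
}
```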
If your Elasticsearch cluster is running on Amazon Elastic Compute Cloud (Amazon EC2) or Amazon Elastic Container Service (Amazon ECS) instances with an AWS Identity and Access Management (IAM) execution role, include the necessary S3 permissions. Alternatively, you can store the credentials in the [Elasticsearch keystore](https://www.elastic.co/guide/en/elasticsearch/plugins/7.10/repository-s3-client.html). + +### Verifying the S3 repository plugin configuration + +You can verify that the S3 repository plugin is configured correctly by creating a test snapshot. + +Create an S3 bucket for the snapshot using the following AWS Command Line Interface (AWS CLI) command: + +```shell +aws s3api create-bucket --bucket <your-bucket-name> --region <your-aws-region> +``` +{% include copy.html %} + +Register a new S3 snapshot repository on your source cluster using the following cURL command: + +```shell +curl -X PUT "http://<your-source-cluster>:9200/_snapshot/test_s3_repository" -H "Content-Type: application/json" -d '{ + "type": "s3", + "settings": { + "bucket": "<your-bucket-name>", + "region": "<your-aws-region>" + } +}' +``` +{% include copy.html %} + +Next, create a test snapshot that captures only the cluster's metadata: + +```shell +curl -X PUT "http://<your-source-cluster>:9200/_snapshot/test_s3_repository/test_snapshot_1" -H "Content-Type: application/json" -d '{ + "indices": "", + "ignore_unavailable": true, + "include_global_state": true +}' +``` +{% include copy.html %} + +Check the AWS Management Console to confirm that your bucket contains the snapshot. + +### Removing test snapshots after verification + +To remove the resources created during verification, you can use the following deletion commands: + +**Test snapshot** + +```shell +curl -X DELETE "http://<your-source-cluster>:9200/_snapshot/test_s3_repository/test_snapshot_1?pretty" +``` +{% include copy.html %} + +**Test snapshot repository** + +```shell +curl -X DELETE "http://<your-source-cluster>:9200/_snapshot/test_s3_repository?pretty" +``` +{% include copy.html %} + +**S3 bucket** + +```shell +aws s3 rm s3://<your-bucket-name> --recursive +aws s3api delete-bucket --bucket <your-bucket-name> --region <your-aws-region> +``` +{% include copy.html %} + +### Troubleshooting + +Use this guidance to troubleshoot any of the following snapshot verification issues. + +#### Access denied error (403) + +If you encounter an error like `AccessDenied (Service: Amazon S3; Status Code: 403)`, verify the following: + +- Make sure you're using the S3 bucket created by Migration Assistant. +- If you're using a custom S3 bucket, verify that: + - The IAM role assigned to your Elasticsearch cluster has the necessary S3 permissions. + - The bucket name and AWS Region provided in the snapshot configuration match the actual S3 bucket you created. + +#### Older versions of Elasticsearch + +Older versions of the Elasticsearch S3 repository plugin may have trouble reading IAM role credentials embedded in Amazon EC2 and Amazon ECS instances. This is because the copy of the AWS SDK shipped with them is too old to read the new standard way of retrieving those credentials, as shown in [the Instance Metadata Service v2 (IMDSv2) specification](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-metadata.html). 
This can result in snapshot creation failures, with an error message similar to the following: + +```json +{"error":{"root_cause":[{"type":"repository_verification_exception","reason":"[migration_assistant_repo] path [rfs-snapshot-repo] is not accessible on master node"}],"type":"repository_verification_exception","reason":"[migration_assistant_repo] path [rfs-snapshot-repo] is not accessible on master node","caused_by":{"type":"i_o_exception","reason":"Unable to upload object [rfs-snapshot-repo/tests-s8TvZ3CcRoO8bvyXcyV2Yg/master.dat] using a single upload","caused_by":{"type":"amazon_service_exception","reason":"Unauthorized (Service: null; Status Code: 401; Error Code: null; Request ID: null)"}}},"status":500} +``` + +If you encounter this issue, you can resolve it by temporarily enabling IMDSv1 on the instances in your source cluster for the duration of the snapshot. There is a toggle for this available in the AWS Management Console as well as in the AWS CLI. Switching this toggle will turn on the older access model and enable the Elasticsearch S3 repository plugin to work as normal. For more information about IMDSv1, see [Modify instance metadata options for existing instances](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-IMDS-existing-instances.html). + +## Switching over client traffic + +The Migration Assistant Application Load Balancer is deployed with a listener that shifts traffic between the source and target clusters through proxy services. The Application Load Balancer should start in **Source Passthrough** mode. + +### Verifying that the traffic switchover is complete + +Use the following steps to verify that the traffic switchover is complete: + +1. In the AWS Management Console, navigate to **EC2 > Load Balancers**. +2. Select the **MigrationAssistant ALB**. +3. Examine the listener on port `9200` and verify that 100% of the traffic is directed to the **Source Proxy**. +4. Navigate to the **Migration ECS Cluster** in the AWS Management Console. +5. Select the **Target Proxy Service**. +6. Verify that the desired count for the service is running: + * If the desired count is not met, update the service to increase it to at least 1 and wait for the service to start. +7. On the **Health and Metrics** tab under **Load balancer health**, verify that all targets are reporting as healthy: + * This confirms that the Application Load Balancer can connect to the target cluster through the target proxy. +8. (Reset) Update the desired count for the **Target Proxy Service** back to its original value in Amazon ECS. + +### Fixing unidentified traffic patterns + +When switching over traffic to the target cluster, you might encounter unidentified traffic patterns. To help identify the cause of these patterns, use the following steps: +* Verify that the target cluster allows traffic ingress from the **Target Proxy Security Group**. +* Navigate to **Target Proxy ECS Tasks** to investigate any failing tasks. +Set the **Filter desired status** to **Any desired status** to view all tasks, then navigate to the logs for any stopped tasks. + + +## Verifying replication + +Use the following steps to verify that replication is working once the traffic capture proxy is deployed: + + +1. Navigate to the **Migration ECS Cluster** in the AWS Management Console. +2. Navigate to **Capture Proxy Service**. +3. Verify that the capture proxy is running with the desired proxy count. If it is not, update the service to increase it to at least 1 and wait for startup. +4. 
Under **Health and Metrics** > **Load balancer health**, verify that all targets are healthy. This means that the Application Load Balancer is able to connect to the source cluster through the capture proxy. +5. Navigate to the **Migration Console Terminal**. +6. Run `console kafka describe-topic-records`. Wait 30 seconds for another Application Load Balancer health check. +7. Run `console kafka describe-topic-records` again and verify that the number of RECORDS increased between runs. +8. Run `console replay start` to start Traffic Replayer. +9. Run `tail -f /shared-logs-output/traffic-replayer-default/*/tuples/tuples.log | jq '.targetResponses[]."Status-Code"'` to confirm that the Kafka requests were sent to the target and that it responded as expected. If the responses don't appear: + * Check that the migration console can access the target cluster by running `./catIndices.sh`, which should show the indexes in the source and target. + * Confirm that messages are still being recorded to Kafka. + * Check for errors in the Traffic Replayer logs (`/migration/STAGE/default/traffic-replayer-default`) using CloudWatch. +10. (Reset) Update the desired count for the **Capture Proxy Service** back to its original value in Amazon ECS. + +### Troubleshooting + +Use this guidance to troubleshoot any of the following replication verification issues. + +### Health check responses with 401/403 status code + +If the source cluster is configured to require authentication, the capture proxy will not be able to verify replication beyond receiving a 401/403 status code for Application Load Balancer health checks. For more information, see [Failure Modes](https://github.com/opensearch-project/opensearch-migrations/blob/main/TrafficCapture/trafficCaptureProxyServer/README.md#failure-modes). + +### Traffic does not reach the source cluster + +Verify that the source cluster allows traffic ingress from the Capture Proxy Security Group. + +Look for failing tasks by navigating to **Traffic Capture Proxy ECS**. Change **Filter desired status** to **Any desired status** in order to see all tasks and navigate to the logs for stopped tasks. + +### Snapshot and S3 bucket issues + +When using the CDK deployment for Migration Assistant, you might encounter the following errors during snapshot creation and deletion. + +#### Bucket permissions + +To make sure that you can delete snapshots as well as create them during the CDK deployment process, confirm that the `OSMigrations-dev-<region>-CustomS3AutoDeleteObjects` stack has S3 object deletion rights. Then, verify that `OSMigrations-dev-<region>-default-SnapshotRole` has the following S3 permissions: + + - List bucket contents + - Read/Write/Delete objects + +#### Snapshot conflicts + +To prevent snapshot conflicts, use the `console snapshot delete` command from the migration console. If you delete snapshots or snapshot repositories in a location other than the migration console, you might encounter "already exists" errors. + +## Resetting before migration + +After all verifications are complete, reset all resources before using Migration Assistant for an actual migration. + +The following steps outline how to reset resources with Migration Assistant before executing the actual migration. At this point all verifications are expected to have been completed. These steps can be performed after [Accessing the Migration Console]({{site.url}}{{site.baseurl}}/migration-assistant/migration-console/accessing-the-migration-console/). 
+ +### Traffic Replayer + +To stop running Traffic Replayer, use the following command: + +```bash +console replay stop +``` +{% include copy.html %} + +### Kafka + +To clear all captured traffic from the Kafka topic, you can run the following command. + +This command will result in the loss of any traffic data captured by the capture proxy up to this point and thus should be used with caution. +{: .warning} + +```bash +console kafka delete-topic +``` +{% include copy.html %} + +### Target cluster + +To clear non-system indexes from the target cluster that may have been created as a result of testing, you can run the following command: + +This command will result in the loss of all data in the target cluster and should be used with caution. +{: .warning} + +```bash +console clusters clear-indices --cluster target +``` +{% include copy.html %} diff --git a/_migration-assistant/migration-phases/removing-migration-infrastructure.md b/_migration-assistant/migration-phases/removing-migration-infrastructure.md new file mode 100644 index 00000000000..adc2a4b7776 --- /dev/null +++ b/_migration-assistant/migration-phases/removing-migration-infrastructure.md @@ -0,0 +1,28 @@ +--- +layout: default +title: Removing migration infrastructure +nav_order: 120 +parent: Migration phases +--- + +# Removing migration infrastructure + +After a migration is complete, you should remove all resources except for the target cluster and, optionally, your Amazon CloudWatch logs and Traffic Replayer logs. + +To remove the AWS Cloud Development Kit (AWS CDK) stack(s) created during a deployment, run the following command within the CDK directory: + +```bash +cd deployment/cdk/opensearch-service-migration +cdk destroy "*" --c contextId=<CONTEXT_ID> +``` +{% include copy.html %} + +Follow the instructions on the command line to remove the deployed resources from your AWS account. + +You can also use the AWS Management Console to remove Migration Assistant resources and confirm that they are no longer present in the account. + +## Uninstalling Migration Assistant for Amazon OpenSearch Service + +You can uninstall Migration Assistant for Amazon OpenSearch Service from the AWS Management Console or by using the AWS Command Line Interface (AWS CLI). Manually remove the contents of the Amazon Simple Storage Service (Amazon S3) bucket that matches the syntax `cdk-<unique id>-assets-<account id>-<region>`, the bucket created by Migration Assistant. Migration Assistant for Amazon OpenSearch Service does not automatically delete Amazon S3 buckets. + +To delete the stored data and the AWS CloudFormation stacks created by Migration Assistant, see [Uninstall the solution](https://docs.aws.amazon.com/solutions/latest/migration-assistant-for-amazon-opensearch-service/uninstall-the-solution.html) in the Amazon OpenSearch Service documentation. diff --git a/_migration-assistant/overview/architecture.md b/_migration-assistant/overview/architecture.md new file mode 100644 index 00000000000..309261e4915 --- /dev/null +++ b/_migration-assistant/overview/architecture.md @@ -0,0 +1,24 @@ +--- +layout: default +title: Architecture +nav_order: 15 +parent: Overview +--- + +# Architecture + +The Migration Assistant architecture is based on the use of an AWS Cloud infrastructure, but most tools are designed to be cloud independent. A local containerized version of this solution is also available. + +The design deployed on AWS uses the following architecture. 
+ +![Migration architecture overview]({{site.url}}{{site.baseurl}}/images/migrations/migrations-architecture-overview.png) + +Each node in the diagram correlates to the following steps in the migration process: + +1. Client traffic is directed to the existing cluster. +2. An Application Load Balancer with capture proxies relays traffic to a source while replicating data to Amazon Managed Streaming for Apache Kafka (Amazon MSK). +3. Using the migration console, you can initiate metadata migration to establish indexes, templates, component templates, and aliases on the target cluster. +4. With continuous traffic capture in place, you can use a `reindex-from-snapshot` process to capture data from your current index. +5. Once `Reindex-from-Snapshot` is complete, captured traffic is replayed from Amazon MSK to the target cluster by [Traffic Replayer](https://docs.opensearch.org/docs/latest/migration-assistant/migration-phases/live-traffic-migration/using-traffic-replayer/). +6. Performance and behavior of traffic sent to the source and target clusters are compared by reviewing logs and metrics. +7. After confirming that the target cluster's functionality meets expectations, clients are redirected to the new target. \ No newline at end of file diff --git a/_migration-assistant/overview/index.md b/_migration-assistant/overview/index.md new file mode 100644 index 00000000000..7962fe5e85d --- /dev/null +++ b/_migration-assistant/overview/index.md @@ -0,0 +1,25 @@ +--- +layout: default +title: Overview +nav_order: 2 +has_children: true +has_toc: false +permalink: /migration-assistant/overview/ +items: + - heading: "Key components" + description: "Get familiar with the key components of Migration Assistant." + link: "/migration-assistant/overview/key-components/" + - heading: "Architecture" + description: "Understand how Migration Assistant integrates into your infrastructure." + link: "/migration-assistant/overview/architecture/" + - heading: "Is Migration Assistant right for you?" + description: "Evaluate whether Migration Assistant is right for your use case." + link: "/migration-assistant/overview/is-migration-assistant-right-for-you/" +--- + +# Migration Assistant overview + +Use this section to get familiar with the key concepts and structure of Migration Assistant before diving into setup or execution. These pages provide the architecture, key components, and guidance to help you determine whether Migration Assistant is right for your use case. + +{% include list.html list_items=page.items%} + diff --git a/_migration-assistant/overview/is-migration-assistant-right-for-you.md b/_migration-assistant/overview/is-migration-assistant-right-for-you.md new file mode 100644 index 00000000000..2d9a4a20508 --- /dev/null +++ b/_migration-assistant/overview/is-migration-assistant-right-for-you.md @@ -0,0 +1,153 @@ +--- +layout: default +title: Is Migration Assistant right for you? +nav_order: 10 +parent: Overview +redirect_from: + - /migration-assistant/is-migration-assistant-right-for-you/ +--- + +# Is Migration Assistant right for you? + +Deciding whether to use Migration Assistant depends on your specific upgrade path, infrastructure complexity, and operational goals. This page will help you evaluate whether Migration Assistant is right for your use case—or whether another tool might be a better fit. + +Migration Assistant was built to fill important gaps in common migration strategies. 
For example, if you're upgrading across multiple major versions—such as from Elasticsearch 6.8 to OpenSearch 2.19—Migration Assistant lets you do this in a single step. Other methods, like rolling upgrades or snapshot restores, require you to upgrade through each major version, often reindexing your data at every step. + +Migration Assistant also supports live traffic replication, allowing for zero-downtime migrations. This makes it a strong choice for production environments, where minimizing service disruption is critical. + +If your migration is limited to a static cluster configuration (like index templates and aliases), or if you're not concerned about downtime, simpler tools may be sufficient. But for complex migrations involving real-time traffic or major version jumps, Migration Assistant offers robust, flexible capabilities. + +## Supported migration paths + +The following matrix shows which source versions can be directly migrated to which OpenSearch target versions: + +<!-- Migration matrix rendering logic retained --> +{% comment %}First, collect all unique target versions{% endcomment %} +{% assign all_targets = "" | split: "" %} +{% for path in site.data.migration-assistant.valid_migrations.migration_paths %} + {% for target in path.targets %} + {% assign all_targets = all_targets | push: target %} + {% endfor %} +{% endfor %} +{% assign unique_targets = all_targets | uniq | sort %} + +<table class="migration-matrix"> + <thead> + <tr> + <th></th> + {% for target in unique_targets %} + <th>{{ target }}</th> + {% endfor %} + </tr> + </thead> + <tbody> + {% for path in site.data.migration-assistant.valid_migrations.migration_paths %} + <tr> + <th>{{ path.source }}</th> + {% for target_version in unique_targets %} + <td> + {% if path.targets contains target_version %}✓{% endif %} + </td> + {% endfor %} + </tr> + {% endfor %} + </tbody> +</table> + +## Supported platforms + +**Source and target platforms** + +- Self-managed (on premises or hosted by a cloud provider) +- Amazon OpenSearch Service + + +**AWS Regions** + +Migration Assistant is supported in the following AWS Regions: + +- US East (N. Virginia, Ohio) +- US West (Oregon, N. California) +- Europe (Frankfurt, Ireland, London) +- Asia Pacific (Tokyo, Singapore, Sydney) +- AWS GovCloud (US-East, US-West)[^1] + +[^1]: In AWS GovCloud (US), `reindex-from-snapshot` (RFS) is limited to shard sizes of 80 GiB or smaller. + + + +## Supported components + +Before starting a migration, consider the scope of the components involved. The following table outlines components that should potentially be migrated, indicates whether they are supported by Migration Assistant, and provides recommendations. + +| Component | Supported | Recommendations | +| :--- |:--- | :--- | +| **Documents** | Yes | Migrate existing data with RFS and live traffic with capture and replay. | +| **Index settings** | Yes | Migrate with the `Metadata-Migration-Tool`. | +| **Index mappings** | Yes | Migrate with the `Metadata-Migration-Tool`. | +| **Index templates** | Yes | Migrate with the `Metadata-Migration-Tool`. | +| **Component templates** | Yes | Migrate with the `Metadata-Migration-Tool`. | +| **Aliases** | Yes | Migrate with the `Metadata-Migration-Tool`. | +| **Index State Management (ISM) policies** | Expected in 2025 | Manually migrate using an API. For more information about ISM support, see [issue #944](https://github.com/opensearch-project/opensearch-migrations/issues/944). 
| +| **Elasticsearch Kibana[^2] dashboards** | Expected in 2025 | This tool is only needed when migrating from Elasticsearch Kibana dashboards to OpenSearch Dashboards. Start by exporting JSON files from Kibana and importing them into OpenSearch Dashboards. For Elasticsearch versions 7.10.2 to 7.17, use the [`dashboardsSanitizer`](https://github.com/opensearch-project/opensearch-migrations/tree/main/dashboardsSanitizer) tool before importing X-Pack visualizations like Canvas and Lens in Kibana dashboards, as they may require recreation for compatibility with OpenSearch.| +| **Security constructs** | No | Configure roles and permissions based on cloud provider recommendations. For example, if using AWS, leverage AWS Identity and Access Management (IAM) for enhanced security management. | +| **Plugins** | No | Check plugin compatibility; some Elasticsearch plugins may not have direct OpenSearch equivalents. | + +[^2]: Support for Kibana 5.0 through 7.10.2 migration paths to OpenSearch Dashboards will be added in a future version. Kibana 8 and later are not supported. For more information, see [issue #944](https://github.com/opensearch-project/opensearch-migrations/issues/944). + +## Choosing your migration approach + +Use the following checklist to determine which Migration Assistant components best fit your use case. + +### Metadata migration + +Use [metadata migration]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/migrating-metadata/) if: + +- You need to migrate while mitigating breaking changes between the source and target clusters, such as differences in mappings, settings, aliases, or component templates. +- You want a relatively consistent configuration between the source and target clusters. + +### Backfill migration + +Use [backfill migration]({{site.url}}{{site.baseurl}}/migration-assistant/migration-phases/backfill/) if: + +- You need to move historical data without disrupting live traffic. +- You want to backfill indexes from a specific point in time without impacting the source cluster. +- You want to verify historical data in the target cluster before switching over. +- You want to backfill using an existing or incremental snapshot. +- You need the fastest backfill option that includes reindexing. +- You want the ability to pause and resume migration. + +### RFS + +Use [RFS]({{site.url}}{{site.baseurl}}/migration-assistant/deploying-migration-assistant/getting-started-data-migration/) if: + +- You already use OpenSearch snapshots for backups. +- You need to migrate documents at scale in parallel, such as with Amazon Elastic Container Service (Amazon ECS). +- You require a data migration path as part of a zero-downtime migration. +- Your AWS Region supports RFS and your shard sizes are within supported limits. + +### Combination of all three + +Use a combination of all three migration types if: + +- You're performing a complex, multi-version migration. +- You require zero downtime and full validation of the target environment. +- You want end-to-end tooling for metadata, data movement, and cluster behavior comparison. +- You're cloning an existing cluster and changing the source's configuration. +- You're setting up disaster recovery. + +## Checklist + +Use this checklist to decide whether Migration Assistant is right for you: + +- Are you migrating across one or more major versions? + +- Do you need to maintain service availability with zero downtime? + +- Do you need to validate a new OpenSearch cluster before switching over? 
+ +- Is your environment self-managed or running on Amazon OpenSearch Service? + +- Are you looking for tooling that can automate metadata migration and performance comparison? + +If you answered "yes" to most of these questions, Migration Assistant is likely the right solution for your migration. diff --git a/_migration-assistant/overview/key-components.md b/_migration-assistant/overview/key-components.md new file mode 100644 index 00000000000..ca09c7f29e8 --- /dev/null +++ b/_migration-assistant/overview/key-components.md @@ -0,0 +1,38 @@ +--- +layout: default +title: Key components +nav_order: 10 +parent: Overview +--- + +# Key components + +The following are the key components of Migration Assistant. + +## Elasticsearch/OpenSearch source + +In this solution, your source cluster operates on either Elasticsearch or OpenSearch and is hosted on Amazon Elastic Compute Cloud (Amazon EC2) instances or in a similar computing environment. A proxy is set up to interact with this source cluster, either positioned in front of or directly on the coordinating nodes of the cluster. + +## Migration console + +The migration console provides a migration-specific CLI and offers a variety of tools for streamlining the migration process. Everything necessary for completing a migration, other than cleaning up the migration resources, can be performed through this console. + +## Traffic capture proxy + +This component is designed for HTTP RESTful traffic. It forwards traffic to the source cluster and also splits and channels this traffic to a stream processing service for later playback. + +## Traffic Replayer + +Acting as a traffic simulation tool, [Traffic Replayer](https://docs.opensearch.org/docs/latest/migration-assistant/migration-phases/live-traffic-migration/using-traffic-replayer/) replays recorded request traffic to a target cluster, mirroring source traffic patterns. It links original requests and their responses to those directed at the target cluster, facilitating comparative analysis. + +## Metadata migration tool + +The metadata migration tool integrated into the Migration CLI can be used independently to migrate cluster metadata, including index mappings, index configuration settings, templates, component templates, and aliases. + +## Reindex-from-Snapshot + +`Reindex-from-Snapshot` (RFS) reindexes data from an existing snapshot. Amazon Elastic Container Service (Amazon ECS) workers coordinate the migration of documents from an existing snapshot, reindexing the documents in parallel to a target cluster. + +## Target cluster + +The target cluster is the destination cluster for migration or comparison in an A/B test. \ No newline at end of file diff --git a/_ml-commons-plugin/agents-tools/agents-tools-tutorial.md b/_ml-commons-plugin/agents-tools/agents-tools-tutorial.md index 39051f399cd..b66af3c0c6b 100644 --- a/_ml-commons-plugin/agents-tools/agents-tools-tutorial.md +++ b/_ml-commons-plugin/agents-tools/agents-tools-tutorial.md @@ -42,7 +42,7 @@ In this tutorial, you'll use the `huggingface/sentence-transformers/all-MiniLM-L POST /_plugins/_ml/models/_register?deploy=true { "name": "huggingface/sentence-transformers/all-MiniLM-L12-v2", - "version": "1.0.1", + "version": "1.0.2", "model_format": "TORCH_SCRIPT" } ``` @@ -102,9 +102,9 @@ PUT /_ingest/pipeline/test-pipeline-local-model } ``` -## Step 3: Create a k-NN index and ingest data +## Step 3: Create a vector index and ingest data -Now you'll ingest supplementary data into an OpenSearch index. 
In OpenSearch, vectors are stored in a k-NN index. You can create a [k-NN index]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/) by sending the following request: +Now you'll ingest supplementary data into an OpenSearch index. In OpenSearch, vectors are stored in a vector index. You can create a [vector index]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/) by sending the following request: ```json PUT my_test_data @@ -352,50 +352,3 @@ Therefore, the population increase of Seattle from 2021 to 2023 is 58,000.""" ] } ``` - -## Hidden agents -**Introduced 2.13** -{: .label .label-purple } - -To hide agent details from end users, including the cluster admin, you can register a _hidden_ agent. If an agent is hidden, non-superadmin users don't have permission to call any [Agent APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/index/) except for the [Execute API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/execute-agent/), on the agent. - -Only superadmin users can register a hidden agent. To register a hidden agent, you first need to authenticate with an [admin certificate]({{site.url}}{{site.baseurl}}/security/configuration/tls/#configuring-admin-certificates): - -```bash -curl -k --cert ./kirk.pem --key ./kirk-key.pem -XGET 'https://localhost:9200/.opendistro_security/_search' -``` - -All agents created by a superadmin user are automatically registered as hidden. Only the superadmin user can view hidden agent details and delete hidden agents. -To register a hidden agent, send a request to the `_register` endpoint: - -```bash -curl -k --cert ./kirk.pem --key ./kirk-key.pem -X POST 'https://localhost:9200/_plugins/_ml/agents/_register' -H 'Content-Type: application/json' -d ' -{ - "name": "Test_Agent_For_RAG", - "type": "flow", - "description": "this is a test agent", - "tools": [ - { - "name": "vector_tool", - "type": "VectorDBTool", - "parameters": { - "model_id": "zBRyYIsBls05QaITo5ex", - "index": "my_test_data", - "embedding_field": "embedding", - "source_field": [ - "text" - ], - "input": "${parameters.question}" - } - }, - { - "type": "MLModelTool", - "description": "A general tool to answer any question", - "parameters": { - "model_id": "NWR9YIsBUysqmzBdifVJ", - "prompt": "\n\nHuman:You are a professional data analyst. You will always answer question based on the given context first. If the answer is not directly shown in the context, you will analyze the data and find the answer. If you don't know the answer, just say don't know. \n\n Context:\n${parameters.vector_tool.output}\n\nHuman:${parameters.question}\n\nAssistant:" - } - } - ] -}' -``` diff --git a/_ml-commons-plugin/agents-tools/agents/conversational-flow.md b/_ml-commons-plugin/agents-tools/agents/conversational-flow.md new file mode 100644 index 00000000000..5e10e4349e7 --- /dev/null +++ b/_ml-commons-plugin/agents-tools/agents/conversational-flow.md @@ -0,0 +1,76 @@ +--- +layout: default +title: Conversational flow agents +has_children: false +has_toc: false +nav_order: 20 +parent: Agents +grand_parent: Agents and tools +--- + +# Conversational flow agents +**Introduced 2.13** +{: .label .label-purple } + +Similarly to a [flow agent]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/flow/), a conversational flow agent is configured with a set of tools that it runs in order. The difference between them is that a conversational flow agent stores the conversation in an index, in the following example, the `conversation_index`. 
The following agent runs the `VectorDBTool` and then the `MLModelTool`: + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "population data analysis agent", + "type": "conversational_flow", + "description": "This is a demo agent for population data analysis", + "app_type": "rag", + "memory": { + "type": "conversation_index" + }, + "tools": [ + { + "type": "VectorDBTool", + "name": "population_knowledge_base", + "parameters": { + "model_id": "YOUR_TEXT_EMBEDDING_MODEL_ID", + "index": "test_population_data", + "embedding_field": "population_description_embedding", + "source_field": [ + "population_description" + ], + "input": "${parameters.question}" + } + }, + { + "type": "MLModelTool", + "name": "bedrock_claude_model", + "description": "A general tool to answer any question", + "parameters": { + "model_id": "YOUR_LLM_MODEL_ID", + "prompt": """ + +Human:You are a professional data analyst. You will always answer question based on the given context first. If the answer is not directly shown in the context, you will analyze the data and find the answer. If you don't know the answer, just say don't know. + +Context: +${parameters.population_knowledge_base.output:-} + +${parameters.chat_history:-} + +Human:${parameters.question} + +Assistant:""" + } + } + ] +} +``` +{% include copy-curl.html %} + +For more information about the Register Agent API request fields, see [Request body fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/register-agent/#request-body-fields). + +For a step-by-step tutorial, see [Agents and tools tutorial]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents-tools-tutorial/). + +## Next steps + +- To learn more about registering agents, see [Register Agent API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/register-agent/). +- For a list of supported tools, see [Tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index/). +- For a step-by-step tutorial, see [Agents and tools tutorial]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents-tools-tutorial/). +- For supported APIs, see [Agent APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/). +- To use agents and tools in configuration automation, see [Automating configurations]({{site.url}}{{site.baseurl}}/automating-configurations/index/). \ No newline at end of file diff --git a/_ml-commons-plugin/agents-tools/agents/conversational.md b/_ml-commons-plugin/agents-tools/agents/conversational.md new file mode 100644 index 00000000000..53ae3bbe982 --- /dev/null +++ b/_ml-commons-plugin/agents-tools/agents/conversational.md @@ -0,0 +1,69 @@ +--- +layout: default +title: Conversational agents +has_children: false +has_toc: false +nav_order: 30 +parent: Agents +grand_parent: Agents and tools +--- + +# Conversational agents +**Introduced 2.13** +{: .label .label-purple } + +Similarly to a [conversational flow agent]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/conversational-flow/), a conversational agent stores a conversation in an index, in the following example, the `conversation_index`. A conversational agent can be configured with a large language model (LLM) and a set of supplementary tools that perform specific jobs. For example, you can set up an LLM and a `CATIndexTool` when configuring an agent. When you send a question to the model, the agent also includes the `CATIndexTool` as context. 
The LLM then decides whether it needs to use the `CATIndexTool` to answer questions like "How many indexes are in my cluster?" The context allows an LLM to answer specific questions that are outside of its knowledge base. For example, the following agent is configured with an LLM and a `CATIndexTool` that retrieves information about your OpenSearch indexes: + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "Test_Agent_For_ReAct_ClaudeV2", + "type": "conversational", + "description": "this is a test agent", + "llm": { + "model_id": "YOUR_LLM_MODEL_ID", + "parameters": { + "max_iteration": 5, + "stop_when_no_tool_found": true, + "response_filter": "$.completion" + } + }, + "memory": { + "type": "conversation_index" + }, + "tools": [ + { + "type": "VectorDBTool", + "name": "VectorDBTool", + "description": "A tool to search opensearch index with natural language question. If you don't know answer for some question, you should always try to search data with this tool. Action Input: <natural language question>", + "parameters": { + "model_id": "YOUR_TEXT_EMBEDDING_MODEL_ID", + "index": "my_test_data", + "embedding_field": "embedding", + "source_field": [ "text" ], + "input": "${parameters.question}" + } + }, + { + "type": "CatIndexTool", + "name": "RetrieveIndexMetaTool", + "description": "Use this tool to get OpenSearch index information: (health, status, index, uuid, primary count, replica count, docs.count, docs.deleted, store.size, primary.store.size)." + } + ], + "app_type": "my app" +} +``` +{% include copy-curl.html %} + +For more information about the Register Agent API request fields, see [Request body fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/register-agent/#request-body-fields). + +For a step-by-step tutorial, see [Agents and tools tutorial]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents-tools-tutorial/). + + +## Next steps + +- To learn more about registering agents, see [Register Agent API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/register-agent/). +- For a list of supported tools, see [Tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index/). +- For a step-by-step tutorial, see [Agents and tools tutorial]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents-tools-tutorial/). +- For supported APIs, see [Agent APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/). +- To use agents and tools in configuration automation, see [Automating configurations]({{site.url}}{{site.baseurl}}/automating-configurations/index/). \ No newline at end of file diff --git a/_ml-commons-plugin/agents-tools/agents/flow.md b/_ml-commons-plugin/agents-tools/agents/flow.md new file mode 100644 index 00000000000..1a9ea51b63f --- /dev/null +++ b/_ml-commons-plugin/agents-tools/agents/flow.md @@ -0,0 +1,57 @@ +--- +layout: default +title: Flow agents +has_children: false +has_toc: false +nav_order: 10 +parent: Agents +grand_parent: Agents and tools +--- + +# Flow agents +**Introduced 2.13** +{: .label .label-purple } + +A flow agent is configured with a set of tools that it runs in order. For example, the following agent runs the `VectorDBTool` and then the `MLModelTool`. The agent coordinates the tools so that one tool's output can become another tool's input. 
In this example, the `VectorDBTool` queries the k-NN index, and the agent passes its output `${parameters.VectorDBTool.output}` to the `MLModelTool` as context along with the `${parameters.question}` (see the `prompt` parameter): + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "Test_Agent_For_RAG", + "type": "flow", + "description": "this is a test agent", + "tools": [ + { + "type": "VectorDBTool", + "parameters": { + "model_id": "YOUR_TEXT_EMBEDDING_MODEL_ID", + "index": "my_test_data", + "embedding_field": "embedding", + "source_field": ["text"], + "input": "${parameters.question}" + } + }, + { + "type": "MLModelTool", + "description": "A general tool to answer any question", + "parameters": { + "model_id": "YOUR_LLM_MODEL_ID", + "prompt": "\n\nHuman:You are a professional data analyst. You will always answer a question based on the given context first. If the answer is not directly shown in the context, you will analyze the data and find the answer. If you don't know the answer, just say you don't know. \n\n Context:\n${parameters.VectorDBTool.output}\n\nHuman:${parameters.question}\n\nAssistant:" + } + } + ] +} +``` +{% include copy-curl.html %} + +For more information about the Register Agent API request fields, see [Request body fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/register-agent/#request-body-fields). + +For a step-by-step tutorial, see [Agents and tools tutorial]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents-tools-tutorial/). + +## Next steps + +- To learn more about registering agents, see [Register Agent API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/register-agent/). +- For a list of supported tools, see [Tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index/). +- For a step-by-step tutorial, see [Agents and tools tutorial]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents-tools-tutorial/). +- For supported APIs, see [Agent APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/). +- To use agents and tools in configuration automation, see [Automating configurations]({{site.url}}{{site.baseurl}}/automating-configurations/index/). \ No newline at end of file diff --git a/_ml-commons-plugin/agents-tools/agents/index.md b/_ml-commons-plugin/agents-tools/agents/index.md new file mode 100644 index 00000000000..4f546d61254 --- /dev/null +++ b/_ml-commons-plugin/agents-tools/agents/index.md @@ -0,0 +1,79 @@ +--- +layout: default +title: Agents +parent: Agents and tools +has_children: true +has_toc: false +nav_order: 20 +redirect_from: + - /ml-commons-plugin/agents-tools/agents/ +--- + +# Agents +**Introduced 2.13** +{: .label .label-purple } + +An _agent_ is a coordinator that uses a large language model (LLM) to solve a problem. After the LLM reasons and decides what action to take, the agent coordinates the action execution. OpenSearch supports the following agent types: + +- [_Flow agent_]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/flow/): Runs tools sequentially, in the order specified in its configuration. The workflow of a flow agent is fixed. Useful for retrieval-augmented generation (RAG). +- [_Conversational flow agent_]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/conversational-flow/): Runs tools sequentially, in the order specified in its configuration. The workflow of a conversational flow agent is fixed. Stores conversation history so that users can ask follow-up questions. 
Useful for creating a chatbot. +- [_Conversational agent_]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/conversational/): Reasons in order to provide a response based on the available knowledge, including the LLM knowledge base and a set of tools provided to the LLM. The LLM reasons iteratively to decide what action to take until it obtains the final answer or reaches the iteration limit. Stores conversation history so that users can ask follow-up questions. The workflow of a conversational agent is variable, based on follow-up questions. For specific questions, uses the Chain-of-Thought (CoT) process to select the best tool from the configured tools for providing a response to the question. Useful for creating a chatbot that employs RAG. +- [_Plan-execute-reflect agent_]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/plan-execute-reflect/): Dynamically plans, executes, and refines multi-step workflows to solve complex tasks. Internally, a plan-execute-reflect agent uses a conversational agent to execute each individual step in the plan. The agent automatically selects the most appropriate tool for each step based on tool descriptions and context. Ideal for long-running, exploratory processes that benefit from iterative reasoning and adaptive execution. Useful for conducting research or performing root cause analysis (RCA). + +## Hidden agents +**Introduced 2.13** +{: .label .label-purple } + +To hide agent details from end users, including the cluster admin, you can register a _hidden_ agent. If an agent is hidden, non-superadmin users don't have permission to call any [Agent APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/index/), except for the [Execute API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/execute-agent/), on the agent. + +Only superadmin users can register a hidden agent. To register a hidden agent, you first need to authenticate with an [admin certificate]({{site.url}}{{site.baseurl}}/security/configuration/tls/#configuring-admin-certificates): + +```bash +curl -k --cert ./kirk.pem --key ./kirk-key.pem -XGET 'https://localhost:9200/.opendistro_security/_search' +``` +{% include copy.html %} + +All agents created by a superadmin user are automatically registered as hidden. Only the superadmin user can view hidden agent details and delete hidden agents. +To register a hidden agent, send a request to the `_register` endpoint: + +```bash +curl -k --cert ./kirk.pem --key ./kirk-key.pem -X POST 'https://localhost:9200/_plugins/_ml/agents/_register' -H 'Content-Type: application/json' -d ' +{ + "name": "Test_Agent_For_RAG", + "type": "flow", + "description": "this is a test agent", + "tools": [ + { + "name": "vector_tool", + "type": "VectorDBTool", + "parameters": { + "model_id": "zBRyYIsBls05QaITo5ex", + "index": "my_test_data", + "embedding_field": "embedding", + "source_field": [ + "text" + ], + "input": "${parameters.question}" + } + }, + { + "type": "MLModelTool", + "description": "A general tool to answer any question", + "parameters": { + "model_id": "NWR9YIsBUysqmzBdifVJ", + "prompt": "\n\nHuman:You are a professional data analyst. You will always answer question based on the given context first. If the answer is not directly shown in the context, you will analyze the data and find the answer. If you don't know the answer, just say don't know. 
\n\n Context:\n${parameters.vector_tool.output}\n\nHuman:${parameters.question}\n\nAssistant:" + } + } + ] +}' +``` +{% include copy.html %} + +## Next steps + +- To learn more about registering agents, see [Register Agent API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/register-agent/). +- For a list of supported tools, see [Tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index/). +- For a step-by-step tutorial, see [Agents and tools tutorial]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents-tools-tutorial/). +- For a step-by-step tutorial on using a plan-execute-reflect agent, see [Building a plan-execute-reflect agent]({{site.url}}{{site.baseurl}}/tutorials/gen-ai/agents/build-plan-execute-reflect-agent/). +- For supported APIs, see [Agent APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/). +- To use agents and tools in configuration automation, see [Automating configurations]({{site.url}}{{site.baseurl}}/automating-configurations/index/). diff --git a/_ml-commons-plugin/agents-tools/agents/plan-execute-reflect.md b/_ml-commons-plugin/agents-tools/agents/plan-execute-reflect.md new file mode 100644 index 00000000000..f1034516da3 --- /dev/null +++ b/_ml-commons-plugin/agents-tools/agents/plan-execute-reflect.md @@ -0,0 +1,381 @@ +--- +layout: default +title: Plan-execute-reflect agents +has_children: false +has_toc: false +nav_order: 40 +parent: Agents +grand_parent: Agents and tools +--- + +# Plan-execute-reflect agents +**Introduced 3.0** +{: .label .label-purple } + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/ml-commons/issues/3745). +{: .warning} + +Plan-execute-reflect agents are designed to solve complex tasks that require iterative reasoning and step-by-step execution. These agents use one large language model (LLM)---the _planner_---to create and update a plan and another LLM (or the same one by default) to execute each individual step using a built-in conversational agent. + +A plan-execute-reflect agent works in three phases: + +- **Planning** – The planner LLM generates an initial step-by-step plan using the available tools. +- **Execution** – Each step is executed sequentially using the conversational agent and the available tools. +- **Re-evaluation** – After executing each step, the planner LLM re-evaluates the plan using intermediate results. The LLM can adjust the plan dynamically to skip, add, or change steps based on new context. + +Similarly to a conversational agent, the plan-execute-reflect agent stores the interaction between the LLM and the agent in a memory index. In the following example, the agent uses a `conversation_index` to persist the execution history, including the user's question, intermediate results, and final outputs. + +The agent automatically selects the most appropriate tool for each step based on the tool descriptions and current context. + +The agent currently supports re-evaluation only after each step. This allows the agent to dynamically adapt the plan based on intermediate results before proceeding to the next step. 
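Because plan-execute-reflect tasks are typically long running, you usually execute the agent asynchronously and monitor the returned task. The following is a minimal sketch, assuming that an agent has already been registered as described in the next section (the agent ID, task ID, and question are placeholders):

```json
POST /_plugins/_ml/agents/your_agent_id/_execute?async=true
{
  "parameters": {
    "question": "Why is my cluster health status yellow?"
  }
}
```
{% include copy-curl.html %}

The response contains a `task_id`. Poll the task to check the execution status and, once available, retrieve the memory IDs and final response:

```json
GET /_plugins/_ml/tasks/your_task_id
```
{% include copy-curl.html %}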
+ +## Creating a plan-execute-reflect agent + +The following example request creates a plan-execute-reflect agent with three tools: + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "My Plan Execute Reflect Agent", + "type": "plan_execute_and_reflect", + "description": "Agent for dynamic task planning and reasoning", + "llm": { + "model_id": "YOUR_LLM_MODEL_ID", + "parameters": { + "prompt": "${parameters.question}" + } + }, + "memory": { + "type": "conversation_index" + }, + "parameters": { + "_llm_interface": "YOUR_LLM_INTERFACE" + }, + "tools": [ + { "type": "ListIndexTool" }, + { "type": "SearchIndexTool" }, + { "type": "IndexMappingTool" } + ], + "app_type": "os_chat" +} +``` + +It is important to provide thorough descriptions of the tools so that the LLM can decide in which situations to use those tools. +{: .tip} + +For more information about the Register Agent API request fields, see [Request body fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/register-agent/#request-body-fields). + +## Supported LLMs + +The plan-execute-reflect agent provides built-in function calling interfaces for the following LLMs: + +- [Anthropic Claude 3.7 model hosted on Amazon Bedrock](https://aws.amazon.com/bedrock/claude/) +- OpenAI GPT-4o model +- DeepSeek-R1 model hosted on Amazon Bedrock + +To request default support for an LLM, [create a feature request issue in the ML Commons repository](https://github.com/opensearch-project/ml-commons/issues). + +For a step-by-step tutorial on using a plan-execute-reflect agent, see [Building a plan-execute-reflect agent]({{site.url}}{{site.baseurl}}/tutorials/gen-ai/agents/build-plan-execute-reflect-agent/). + +To configure a plan-execute-reflect agent with a particular model, you need to modify the connector in [Step 1(a): Create a connector]({{site.url}}{{site.baseurl}}/tutorials/gen-ai/agents/build-plan-execute-reflect-agent/#step-1a-create-a-connector) and provide a model-specific `llm_interface` parameter in [Step 2: Create an agent]({{site.url}}{{site.baseurl}}/tutorials/gen-ai/agents/build-plan-execute-reflect-agent/#step-2-create-an-agent): + +```json +"parameters": { + "_llm_interface": "bedrock/converse/claude" +} +``` + +For valid values of the `_llm_interface` field, see [Request body fields]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/register-agent/#request-body-fields). + +The following examples provide the connector and agent creation requests for the supported models. 
+ +### Anthropic Claude on Amazon Bedrock + +To create a connector for the Anthropic Claude 3.7 Sonnet model hosted on Amazon Bedrock, use the following request: + +```json +POST /_plugins/_ml/connectors/_create +{ + "name": "Amazon Bedrock Claude 3.7-sonnet connector", + "description": "Connector to Amazon Bedrock service for the Claude model", + "version": 1, + "protocol": "aws_sigv4", + "parameters": { + "region": "your_aws_region", + "service_name": "bedrock", + "model": "us.anthropic.claude-3-7-sonnet-20250219-v1:0" + }, + "credential": { + "access_key": "your_aws_access_key", + "secret_key": "your_aws_secret_key", + "session_token": "your_aws_session_token" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "https://bedrock-runtime.${parameters.region}.amazonaws.com/model/${parameters.model}/converse", + "headers": { + "content-type": "application/json" + }, + "request_body": "{ \"system\": [{\"text\": \"${parameters.system_prompt}\"}], \"messages\": [${parameters._chat_history:-}{\"role\":\"user\",\"content\":[{\"text\":\"${parameters.prompt}\"}]}${parameters._interactions:-}]${parameters.tool_configs:-} }" + } + ] +} +``` +{% include copy-curl.html %} + +To create a plan-execute-reflect agent with the Anthropic Claude 3.7 Sonnet model, use the following request: + +```json +POST _plugins/_ml/agents/_register +{ + "name": "My Plan Execute and Reflect agent with Claude 3.7", + "type": "plan_execute_and_reflect", + "description": "this is a test agent", + "llm": { + "model_id": "your_llm_model_id", + "parameters": { + "prompt": "${parameters.question}" + }}, + "memory": { + "type": "conversation_index" + }, + "parameters": { + "_llm_interface": "bedrock/converse/claude" + }, + "tools": [ + { + "type": "ListIndexTool" + }, + { + "type": "SearchIndexTool" + }, + { + "type": "IndexMappingTool" + } + ] +} +``` +{% include copy-curl.html %} + +### OpenAI GPT-4o + +To create a connector for an OpenAI GPT-4o model, use the following request: + +```json +POST /_plugins/_ml/connectors/_create +{ + "name": "My openai connector: gpt-4", + "description": "The connector to openai chat model", + "version": 1, + "protocol": "http", + "parameters": { + "model": "gpt-4o" + }, + "credential": { + "openAI_key": "your_open_ai_key" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "https://api.openai.com/v1/chat/completions", + "headers": { + "Authorization": "Bearer ${credential.openAI_key}" + }, + "request_body": "{ \"model\": \"${parameters.model}\", \"messages\": [{\"role\":\"developer\",\"content\":\"${parameters.system_prompt}\"},${parameters._chat_history:-}{\"role\":\"user\",\"content\":\"${parameters.prompt}\"}${parameters._interactions:-}]${parameters.tool_configs:-} }" + } + ] +} +``` +{% include copy-curl.html %} + +Then register the model and register an agent, specifying `openai/v1/chat/completions` in the `_llm_interface` field. 
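For reference, the agent registration request for the GPT-4o model might look like the following minimal sketch (the model ID is a placeholder for the ID returned when you registered the OpenAI model):

```json
POST _plugins/_ml/agents/_register
{
  "name": "My Plan Execute and Reflect agent with GPT-4o",
  "type": "plan_execute_and_reflect",
  "description": "this is a test agent",
  "llm": {
    "model_id": "your_openai_model_id",
    "parameters": {
      "prompt": "${parameters.question}"
    }
  },
  "memory": {
    "type": "conversation_index"
  },
  "parameters": {
    "_llm_interface": "openai/v1/chat/completions"
  },
  "tools": [
    { "type": "ListIndexTool" },
    { "type": "SearchIndexTool" },
    { "type": "IndexMappingTool" }
  ]
}
```
{% include copy-curl.html %}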
+ +### Deepseek-R1 on Amazon Bedrock + +To create a connector for a DeepSeek-R1 model hosted on Amazon Bedrock, use the following request: + +```json +POST /_plugins/_ml/connectors/_create +{ + "name": "My DeepSeek R1 connector", + "description": "my test connector", + "version": 1, + "protocol": "aws_sigv4", + "parameters": { + "region": "your_region", + "service_name": "bedrock", + "model": "us.deepseek.r1-v1:0" + }, + "credential": { + "access_key": "your_access_key", + "secret_key": "your_secret_key", + "session_token": "your_session_token" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "https://bedrock-runtime.${parameters.region}.amazonaws.com/model/${parameters.model}/converse", + "headers": { + "content-type": "application/json" + }, + "request_body": "{ \"system\": [{\"text\": \"${parameters.system_prompt}\"}], \"messages\": [${parameters._chat_history:-}{\"role\":\"user\",\"content\":[{\"text\":\"${parameters.prompt}\"}]}${parameters._interactions:-}] }" + } + ] +} +``` +{% include copy-curl.html %} + +Then register the model and register an agent, specifying `bedrock/converse/deepseek_r1` in the `_llm_interface` field. + +Because the Deepseek-R1 model hosted on Amazon Bedrock lacks default function-calling support, provide the following prompt as an `executor_system_prompt` during agent registration: + +```json +"You are a helpful assistant. You can ask Human to use tools to look up information that may be helpful in answering the users original question. The tools the human can use are:\n[${parameters._tools.toString()}]\n\nIf need to use tool, return which tool should be used and the input to user is enough. User will run the tool to get information. To make it easier for user to parse the response to know whether they should invoke a tool or not, please also return \"stop_reason\", it only return one of two enum values: [end_turn, tool_use], add a random tool call id to differenciate in case same tool invoked multiple times. Tool call id follow this pattern \"tool_use_<random string>\". The random string should be some UUID.\n\nFor example, you should return a json like this if need to use tool:\n{\"stop_reason\": \"tool_use\", \"tool_calls\": [{\"id\":\"tool_use_IIHBxMgOTjGb6ascCiOILg\",tool_name\":\"search_opensearch_index\",\"input\": {\"index\":\"population_data\",\"query\":{\"query\":{\"match\":{\"city\":\"New York City\"}}}}}]}\n\nIf don't need to use tool, return a json like this:\n{\"stop_reason\": \"end_turn\", \"message\": {\"role\":\"user\",\"content\":[{\"text\":\"What is the most popular song on WZPZ?\"}]}}\n\nNOTE: Don't wrap response in markdown ```json<response>```. For example don't return ```json\\n{\"stop_reason\": \"end_turn\", \"message\": {\"role\":\"user\",\"content\":[{\"text\":\"What is the most popular song on WZPZ?\"}]}}```\n" +``` +{% include copy.html %} + +## Tracking agent execution and memory + +When you execute a plan-execute-reflect agent asynchronously using the [Agent Execute API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/execute-agent/), the API returns the `memory_id` and the `parent_interaction_id` of the planner agent once the agent is started. + +In the final response, the API also returns the `executor_agent_memory_id` and `executor_agent_parent_interaction_id`, which correspond to the internal executor agent responsible for carrying out each step of the plan. 
The `executor_agent_memory_id` and `executor_agent_parent_interaction_id` are updated in the task as soon as they are available, even before the agent has completed execution. This enables real-time tracking of the execution process. + +For a complete example, see [Building a plan-execute-reflect agent]({{site.url}}{{site.baseurl}}/tutorials/gen-ai/agents/build-plan-execute-reflect-agent/#test-the-agent). + +## Default prompts + +The plan-execute-reflect agent uses the following predefined prompts. You can customize the prompts by providing new ones in the following ways: + +- During agent registration in the `parameters` object +- Dynamically during agent execution + +### Planner template and prompt + +To create a custom planner prompt template, modify the `planner_prompt_template` parameter. +The following template is used to ask the LLM to devise a plan for the given task: + +```json +${parameters.planner_prompt} \n Objective: ${parameters.user_prompt} \n ${parameters.plan_execute_reflect_response_format} +``` + +To create a custom planner prompt, modify the `planner_prompt` parameter. +The following prompt is used to ask the LLM to devise a plan for the given task: + +``` +For the given objective, come up with a simple step by step plan. This plan should involve individual tasks, that if executed correctly will yield the correct answer. Do not add any superfluous steps. The result of the final step should be the final answer. Make sure that each step has all the information needed - do not skip steps. At all costs, do not execute the steps. You will be told when to execute the steps. +``` + +### Planner prompt with a history template + +To create a custom planner prompt with a history template, modify the `planner_with_history_template` parameter. +The following template is used when `memory_id` is provided during agent execution to give the LLM context about the previous task: + +```json +${parameters.planner_prompt} \n Objective: ${parameters.user_prompt} \n\n You have currently executed the following steps: \n[${parameters.completed_steps}] \n\n \n ${parameters.plan_execute_reflect_response_format} +``` + +### Reflection prompt and template + +To create a custom reflection prompt template, modify the `reflect_prompt_template` parameter. +The following template is used to ask the LLM to rethink the original plan based on completed steps: + +```json +${parameters.planner_prompt} \n Objective: ${parameters.user_prompt} \n Original plan:\n [${parameters.steps}] \n You have currently executed the following steps: \n [${parameters.completed_steps}] \n ${parameters.reflect_prompt} \n ${parameters.plan_execute_reflect_response_format} +``` + +To create a custom reflection prompt, modify the `reflect_prompt` parameter. +The following prompt is used to ask the LLM to rethink the original plan: + +``` +Update your plan accordingly. If no more steps are needed and you can return to the user, then respond with that. Otherwise, fill out the plan. Only add steps to the plan that still NEED to be done. Do not return previously done steps as part of the plan. Please follow the below response format +``` + +### Planner system prompt + +To create a custom planner system prompt, modify the `system_prompt` parameter. +The following is the planner system prompt: + +``` +You are part of an OpenSearch cluster. When you deliver your final result, include a comprehensive report. This report MUST:\n1. List every analysis or step you performed.\n2. 
Summarize the inputs, methods, tools, and data used at each step.\n3. Include key findings from all intermediate steps — do NOT omit them.\n4. Clearly explain how the steps led to your final conclusion.\n5. Return the full analysis and conclusion in the 'result' field, even if some of this was mentioned earlier.\n\nThe final response should be fully self-contained and detailed, allowing a user to understand the full investigation without needing to reference prior messages. Always respond in JSON format. +``` + +### Executor system prompt + +To create a custom executor system prompt, modify the `executor_system_prompt` parameter. +The following is the executor system prompt: + +``` +You are a dedicated helper agent working as part of a plan‑execute‑reflect framework. Your role is to receive a discrete task, execute all necessary internal reasoning or tool calls, and return a single, final response that fully addresses the task. You must never return an empty response. If you are unable to complete the task or retrieve meaningful information, you must respond with a clear explanation of the issue or what was missing. Under no circumstances should you end your reply with a question or ask for more information. If you search any index, always include the raw documents in the final result instead of summarizing the content. This is critical to give visibility into what the query retrieved. +``` + +We recommend never modifying `${parameters.plan_execute_reflect_response_format}` and always including it toward the end of your prompt templates. +{: .tip} + +## Modifying default prompts + +To modify the prompts, provide them during agent registration: + +```json +POST _plugins/_ml/agents/_register +{ + "name": "My Plan Execute and Reflect agent with Claude 3.7", + "type": "plan_execute_and_reflect", + "description": "this is a test agent", + "llm": { + "model_id": "your_llm_model_id_from_step1", + "parameters": { + "prompt": "${parameters.question}" + }}, + "memory": { + "type": "conversation_index" + }, + "parameters": { + "_llm_interface": "bedrock/converse/claude", + "planner_prompt_template": "your_planner_prompt_template", + "planner_prompt": "your_planner_prompt", + "reflect_prompt_template": "your_reflect_prompt_template", + "reflect_prompt": "your_reflect_prompt", + "planner_with_history_template": "your_planner_with_history_template", + "system_prompt": "your_planner_system_prompt", + "executor_system_prompt": "your_executor_system_prompt" + }, + "tools": [ + { + "type": "ListIndexTool" + }, + { + "type": "SearchIndexTool" + }, + { + "type": "IndexMappingTool" + } + ], +} +``` +{% include copy-curl.html %} + +You can also modify the prompts during agent execution: + +```json +POST _plugins/_ml/agents/your_agent_id/_execute?async=true +{ + "parameters": { + "question": "How many flights from Beijing to Seattle?", + "planner_prompt_template": "your_planner_prompt_template", + "planner_prompt": "your_planner_prompt" + } +} +``` +{% include copy-curl.html %} + +## Next steps + +- To learn more about registering agents, see [Register Agent API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/register-agent/). +- For a list of supported tools, see [Tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index/). +- For a step-by-step tutorial on using a plan-execute-reflect agent, see [Building a plan-execute-reflect agent]({{site.url}}{{site.baseurl}}/tutorials/gen-ai/agents/build-plan-execute-reflect-agent/). 
+- For supported APIs, see [Agent APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/). +- To use agents and tools in configuration automation, see [Automating configurations]({{site.url}}{{site.baseurl}}/automating-configurations/index/). \ No newline at end of file diff --git a/_ml-commons-plugin/agents-tools/index.md b/_ml-commons-plugin/agents-tools/index.md index f1c2c49b202..e0625bde53f 100644 --- a/_ml-commons-plugin/agents-tools/index.md +++ b/_ml-commons-plugin/agents-tools/index.md @@ -4,157 +4,17 @@ title: Agents and tools has_children: true has_toc: false nav_order: 27 +redirect_from: + - /ml-commons-plugin/agents-tools/ --- # Agents and tools **Introduced 2.13** {: .label .label-purple } -You can automate machine learning (ML) tasks using agents and tools. An _agent_ orchestrates and runs ML models and tools. A _tool_ performs a set of specific tasks. Some examples of tools are the `VectorDBTool`, which supports vector search, and the `CATIndexTool`, which executes the `cat indices` operation. For a list of supported tools, see [Tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index/). +You can automate machine learning (ML) tasks using agents and tools. -## Agents +An _agent_ orchestrates and runs ML models and tools. For a list of supported agents, see [Agents]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/). -An _agent_ is a coordinator that uses a large language model (LLM) to solve a problem. After the LLM reasons and decides what action to take, the agent coordinates the action execution. OpenSearch supports the following agent types: +A _tool_ performs a set of specific tasks. Some examples of tools are the [`VectorDBTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/vector-db-tool/), which supports vector search, and the [`ListIndexTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/list-index-tool/), which executes the List Indices API. For a list of supported tools, see [Tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index/). -- [_Flow agent_](#flow-agents): Runs tools sequentially, in the order specified in its configuration. The workflow of a flow agent is fixed. Useful for retrieval-augmented generation (RAG). -- [_Conversational flow agent_](#conversational-flow-agents): Runs tools sequentially, in the order specified in its configuration. The workflow of a conversational flow agent is fixed. Stores conversation history so that users can ask follow-up questions. Useful for creating a chatbot. -- [_Conversational agent_](#conversational-agents): Reasons in order to provide a response based on the available knowledge, including the LLM knowledge base and a set of tools provided to the LLM. The LLM reasons iteratively to decide what action to take until it obtains the final answer or reaches the iteration limit. Stores conversation history so that users can ask follow-up questions. The workflow of a conversational agent is variable, based on follow-up questions. For specific questions, uses the Chain-of-Thought (CoT) process to select the best tool from the configured tools for providing a response to the question. Useful for creating a chatbot that employs RAG. - -### Flow agents - -A flow agent is configured with a set of tools that it runs in order. For example, the following agent runs the `VectorDBTool` and then the `MLModelTool`. The agent coordinates the tools so that one tool's output can become another tool's input. 
In this example, the `VectorDBTool` queries the k-NN index and the agent passes its output `${parameters.VectorDBTool.output}` to the `MLModelTool` as context, along with the `${parameters.question}` (see the `prompt` parameter): - -```json -POST /_plugins/_ml/agents/_register -{ - "name": "Test_Agent_For_RAG", - "type": "flow", - "description": "this is a test agent", - "tools": [ - { - "type": "VectorDBTool", - "parameters": { - "model_id": "YOUR_TEXT_EMBEDDING_MODEL_ID", - "index": "my_test_data", - "embedding_field": "embedding", - "source_field": ["text"], - "input": "${parameters.question}" - } - }, - { - "type": "MLModelTool", - "description": "A general tool to answer any question", - "parameters": { - "model_id": "YOUR_LLM_MODEL_ID", - "prompt": "\n\nHuman:You are a professional data analyst. You will always answer a question based on the given context first. If the answer is not directly shown in the context, you will analyze the data and find the answer. If you don't know the answer, just say you don't know. \n\n Context:\n${parameters.VectorDBTool.output}\n\nHuman:${parameters.question}\n\nAssistant:" - } - } - ] -} -``` - -### Conversational flow agents - -Similarly to a flow agent, a conversational flow agent is configured with a set of tools that it runs in order. The difference between them is that a conversational flow agent stores the conversation in an index, in the following example, the `conversation_index`. The following agent runs the `VectorDBTool` and then the `MLModelTool`: - -```json -POST /_plugins/_ml/agents/_register -{ - "name": "population data analysis agent", - "type": "conversational_flow", - "description": "This is a demo agent for population data analysis", - "app_type": "rag", - "memory": { - "type": "conversation_index" - }, - "tools": [ - { - "type": "VectorDBTool", - "name": "population_knowledge_base", - "parameters": { - "model_id": "YOUR_TEXT_EMBEDDING_MODEL_ID", - "index": "test_population_data", - "embedding_field": "population_description_embedding", - "source_field": [ - "population_description" - ], - "input": "${parameters.question}" - } - }, - { - "type": "MLModelTool", - "name": "bedrock_claude_model", - "description": "A general tool to answer any question", - "parameters": { - "model_id": "YOUR_LLM_MODEL_ID", - "prompt": """ - -Human:You are a professional data analyst. You will always answer question based on the given context first. If the answer is not directly shown in the context, you will analyze the data and find the answer. If you don't know the answer, just say don't know. - -Context: -${parameters.population_knowledge_base.output:-} - -${parameters.chat_history:-} - -Human:${parameters.question} - -Assistant:""" - } - } - ] -} -``` - -### Conversational agents - -Similarly to a conversational flow agent, a conversational agent stores the conversation in an index, in the following example, the `conversation_index`. A conversational agent can be configured with an LLM and a set of supplementary tools that perform specific jobs. For example, you can set up an LLM and a `CATIndexTool` when configuring an agent. When you send a question to the model, the agent also includes the `CATIndexTool` as context. The LLM then decides whether it needs to use the `CATIndexTool` to answer questions like "How many indexes are in my cluster?" The context allows an LLM to answer specific questions that are outside of its knowledge base. 
For example, the following agent is configured with an LLM and a `CATIndexTool` that retrieves information about your OpenSearch indexes: - -```json -POST /_plugins/_ml/agents/_register -{ - "name": "Test_Agent_For_ReAct_ClaudeV2", - "type": "conversational", - "description": "this is a test agent", - "llm": { - "model_id": "YOUR_LLM_MODEL_ID", - "parameters": { - "max_iteration": 5, - "stop_when_no_tool_found": true, - "response_filter": "$.completion" - } - }, - "memory": { - "type": "conversation_index" - }, - "tools": [ - { - "type": "VectorDBTool", - "name": "VectorDBTool", - "description": "A tool to search opensearch index with natural language question. If you don't know answer for some question, you should always try to search data with this tool. Action Input: <natural language question>", - "parameters": { - "model_id": "YOUR_TEXT_EMBEDDING_MODEL_ID", - "index": "my_test_data", - "embedding_field": "embedding", - "source_field": [ "text" ], - "input": "${parameters.question}" - } - }, - { - "type": "CatIndexTool", - "name": "RetrieveIndexMetaTool", - "description": "Use this tool to get OpenSearch index information: (health, status, index, uuid, primary count, replica count, docs.count, docs.deleted, store.size, primary.store.size)." - } - ], - "app_type": "my app" -} -``` - -It is important to provide thorough descriptions of the tools so that the LLM can decide in which situations to use those tools. -{: .tip} - -## Next steps - -- For a list of supported tools, see [Tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index/). -- For a step-by-step tutorial, see [Agents and tools tutorial]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents-tools-tutorial/). -- For supported APIs, see [Agent APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/). -- To use agents and tools in configuration automation, see [Automating configurations]({{site.url}}{{site.baseurl}}/automating-configurations/index/). diff --git a/_ml-commons-plugin/agents-tools/mcp/index.md b/_ml-commons-plugin/agents-tools/mcp/index.md new file mode 100644 index 00000000000..7eec736a710 --- /dev/null +++ b/_ml-commons-plugin/agents-tools/mcp/index.md @@ -0,0 +1,24 @@ +--- +layout: default +title: Using MCP tools +parent: Agents and tools +has_children: true +nav_order: 30 +redirect_from: + - /ml-commons-plugin/agents-tools/mcp/ +--- + +# Using MCP tools +**Introduced 3.0** +{: .label .label-purple } + +[Model Context Protocol (MCP)](https://modelcontextprotocol.io/introduction) is an open protocol standard that provides a standardized way for AI models to connect to external data sources and tools. OpenSearch integrates with MCP, enabling agents to use external tools and data sources through MCP servers. + +Connecting to external MCP servers expands agent capabilities to include the following functionality: + +- Using the tools provided by MCP servers +- Filtering available tools based on your application needs +- Implementing secure authentication and authorization for tool access +- Interacting with various tools through a consistent, standardized interface + +To start using MCP, see [Connecting to an external MCP server]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/mcp/mcp-connector/). 
\ No newline at end of file diff --git a/_ml-commons-plugin/agents-tools/mcp/mcp-connector.md b/_ml-commons-plugin/agents-tools/mcp/mcp-connector.md new file mode 100644 index 00000000000..52ff8a50c9c --- /dev/null +++ b/_ml-commons-plugin/agents-tools/mcp/mcp-connector.md @@ -0,0 +1,254 @@ +--- +layout: default +title: Connecting to an external MCP server +parent: Using MCP tools +grand_parent: Agents and tools +nav_order: 10 +--- + +# Connecting to an external MCP server +**Introduced 3.0** +{: .label .label-purple } + +OpenSearch supports agentic workflows using [agents]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/). While OpenSearch provides built-in tools for running complex queries, [Model Context Protocol (MCP)](https://modelcontextprotocol.io/introduction) enables integration with external tools and data sources. MCP is an open protocol standard that provides a standardized way for AI models to connect to external data sources and tools, acting as a "universal adapter" for remote MCP server tools. + +Currently, OpenSearch only supports MCP servers that use the Server-Sent Events (SSE) protocol. Standard Input/Output (`stdio`) protocol is not supported. +{: .note} + +The following example demonstrates using MCP tools in agentic workflows. + +## Prerequisites + +Before using MCP tools, you must complete the following prerequisites. + +### Enable MCP and configure trusted connector endpoints + +- Enable the MCP protocol by configuring the `plugins.ml_commons.mcp_feature_enabled` setting. +- Configure trusted connector endpoints in the `plugins.ml_commons.trusted_connector_endpoints_regex` setting. For security purposes, this setting uses regex patterns to define which MCP server URLs are allowed. + +To configure both settings, send the following request: + +```json +POST /_cluster/settings/ +{ + "persistent": { + "plugins.ml_commons.trusted_connector_endpoints_regex": [ + "<mcp server url>" + ], + "plugins.ml_commons.mcp_feature_enabled": "true" + } +} +``` +{% include copy-curl.html %} + +### Set up an MCP server + +Ensure you have a running MCP server that is accessible from your OpenSearch cluster. + +## Step 1: Create an MCP connector + +An MCP connector stores connection details and credentials for your MCP server. To create an MCP connector, send the following request: + +```json +POST /_plugins/_ml/connectors/_create +{ + "name": "My MCP Connector", + "description": "Connects to the external MCP server for weather tools", + "version": 1, + "protocol": "mcp_sse", + "url": "https://my-mcp-server.domain.com", + "credential": { + "mcp_server_key": "THE_MCP_SERVER_API_KEY" + }, + "parameters":{ + "sse_endpoint": "/sse" + }, + "headers": { + "Authorization": "Bearer ${credential.mcp_server_key}" + } +} +``` +{% include copy-curl.html %} + +The following table describes the connector parameters. For more information about standard connector parameters, see [Configuration parameters]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/#configuration-parameters). + +| Parameter | Data type | Required | Description | +|:----------|:---------|:------------| +| `protocol` | String | Yes | Specify `mcp_sse` to use the SSE protocol (the only supported protocol type for MCP). | +| `url` | String | Yes | The complete base URL of the MCP server, including protocol, hostname, and port, if not using the default port (for example, `https://my-mcp-server.com:8443`). 
| +| `credential` | Object | Yes | Contains sensitive authentication information such as API keys or tokens. Values stored in this object can be securely referenced in the `headers` section using the `${credential.*}` syntax. | +| `parameters` | Object | No | Contains configuration parameters for the MCP connector. | +| `parameters.sse_endpoint` | String | No | The SSE endpoint path for the MCP server. Default is `/sse`. | +| `headers` | Object | No | The HTTP headers to include with requests to the MCP server. For authentication headers, use the `${credential.*}` syntax to reference values from the `credential` object (for example, `"Authorization": "Bearer ${credential.mcp_server_key}"`). | + +The response contains the connector ID: + +```json +{ + "connector_id": "NZ2W2ZUBZ_3SyqdOvh2n", +} +``` + +## Step 2: Register a model + +Register any externally hosted large language model (LLM) using a connector. For a list of supported models, see [Supported connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/supported-connectors/). + +For example, to register an OpenAI chat model, send the following request: + +```json +POST /_plugins/_ml/models/_register +{ + "name": "My OpenAI model: gpt-4", + "function_name": "remote", + "description": "Test model registration (this example uses OpenAI, but you can register any model)", + "connector": { + "name": "My OpenAI Connector: gpt-4", + "description": "Connector for the OpenAI chat model", + "version": 1, + "protocol": "http", + "parameters": { + "model": "gpt-4o" + }, + "credential": { + "openAI_key": "<YOUR_API_KEY>" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "https://api.openai.com/v1/chat/completions", + "headers": { + "Authorization": "Bearer ${credential.openAI_key}" + }, + "request_body": "{ \"model\": \"${parameters.model}\", \"messages\": [{\"role\":\"developer\",\"content\":\"${parameters.system_instruction}\"},${parameters._chat_history:-}{\"role\":\"user\",\"content\":\"${parameters.prompt}\"}${parameters._interactions:-}], \"tools\": [${parameters._tools:-}],\"parallel_tool_calls\":${parameters.parallel_tool_calls},\"tool_choice\": \"${parameters.tool_choice}\" }" + } + ] + } +} +``` +{% include copy-curl.html %} + +The response contains the model ID: + +```json +{ + "task_id": "K_iQfpYBjoQOEoSHN3wU", + "status": "CREATED", + "model_id": "LPiQfpYBjoQOEoSHN3zH" +} +``` + +To check the status of the operation, provide the task ID to the [Tasks API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/tasks-apis/get-task/). Once the registration is complete, the task `state` changes to `COMPLETED`. + +## Step 3: Register an agent for accessing MCP tools + +Currently, MCP tools can only be used with [_conversational_]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/conversational/) or [_plan-execute-reflect_]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/plan-execute-reflect/) agent types. + +To enable external MCP tools, include one or more MCP connectors in your agent's configuration. + +Each connector must specify the following parameters in the `parameters.mcp_connectors` array. + +| Parameter | Data type | Required | Description | +|:--- |:--- |:--- |:--- | +| `mcp_connector_id` | String | Yes | The connector ID of the MCP connector. | +| `tool_filters` | Array | No | An array of Java-style regular expressions that specify which tools from the MCP server to make available to the agent. 
A tool will be included if it matches at least one of the regular expressions in the array. If omitted or set to an empty array, all tools exposed by the connector will be available. Use the `^` or `$` anchors or literal strings to precisely match tool names. For example, `^get_forecast` matches any tool starting with "get_forecast", while `search_indices` matches only "search_indices".| + +In this example, you'll register a conversational agent using the connector ID created in Step 1. The MCP server has two tools available (`get_alerts` and `get_forecasts`), but only the `get_alerts` tool will be included in the agent's configuration because it matches the specified regex pattern `^get_alerts$`: + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "Weather & Search Bot", + "type": "conversational", + "description": "Uses MCP to fetch forecasts and OpenSearch indices", + "llm": { + "model_id": "<MODEL_ID_FROM_STEP_2>", + "parameters": { + "max_iteration": 5, + "system_instruction": "You are a helpful assistant.", + "prompt": "${parameters.question}" + } + }, + "memory": { + "type": "conversation_index" + }, + "parameters": { + "_llm_interface": "openai/v1/chat/completions", + "mcp_connectors": [ + { + "mcp_connector_id": "<MCP_CONNECTOR_ID_FROM_STEP_1>", + "tool_filters": [ + "^get_alerts$" + ] + } + ] + }, + "tools": [ + { "type": "ListIndexTool" }, + { "type": "SearchIndexTool" } + ], + "app_type": "os_chat" +} +``` +{% include copy-curl.html %} + +The response contains the agent ID: + +```json +{ + "agent_id": "LfiXfpYBjoQOEoSH93w7" +} +``` + +## Step 4: Run the agent + +Invoke the registered agent by calling the Execute Agent API and providing a user question: + +```json +POST /_plugins/_ml/agents/<Agent_ID>/_execute +{ + "parameters": { + "question": "Any weather alerts in Washington", + "verbose": true + } +} +``` +{% include copy-curl.html %} + +The agent uses both the OpenSearch tools specified in the `tools` array and the selected tools from the MCP server (based on your tool filters) to return the answer: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "memory_id", + "result": "MfiZfpYBjoQOEoSH13wj" + }, + { + "name": "parent_interaction_id", + "result": "MviZfpYBjoQOEoSH13xC" + }, + { + "name": "response", + "result": "{\"id\":\"chatcmpl-BRRcdxVjkrKG7HjkVWZVwueJSEjgd\",\"object\":\"chat.completion\",\"created\":1.745880735E9,\"model\":\"gpt-4o-2024-08-06\",\"choices\":[{\"index\":0.0,\"message\":{\"role\":\"assistant\",\"tool_calls\":[{\"id\":\"call_yWg0wk4mfE2v8ARebupfbJ87\",\"type\":\"function\",\"function\":{\"name\":\"get_alerts\",\"arguments\":\"{\\\"state\\\":\\\"WA\\\"}\"}}],\"annotations\":[]},\"finish_reason\":\"tool_calls\"}],\"usage\":{\"prompt_tokens\":201.0,\"completion_tokens\":16.0,\"total_tokens\":217.0,\"prompt_tokens_details\":{\"cached_tokens\":0.0,\"audio_tokens\":0.0},\"completion_tokens_details\":{\"reasoning_tokens\":0.0,\"audio_tokens\":0.0,\"accepted_prediction_tokens\":0.0,\"rejected_prediction_tokens\":0.0}},\"service_tier\":\"default\",\"system_fingerprint\":\"fp_f5bdcc3276\"}" + }, + { + "name": "response", + "result": "[{\"text\":\"\\nEvent: Wind Advisory\\nArea: Kittitas Valley\\nSeverity: Moderate\\nDescription: * WHAT...Northwest winds 25 to 35 mph with gusts up to 45 mph\\nexpected.\\n\\n* WHERE...Kittitas Valley.\\n\\n* WHEN...From 2 PM to 8 PM PDT Tuesday.\\n\\n* IMPACTS...Gusty winds will blow around unsecured objects. 
Tree\\nlimbs could be blown down and a few power outages may result.\\nInstructions: Winds this strong can make driving difficult, especially for high\\nprofile vehicles. Use extra caution.\\n\"}]" + }, + { + "name": "response", + "result": "There is a Wind Advisory for the Kittitas Valley in Washington. Here are the details:\n\n- **Event:** Wind Advisory\n- **Area:** Kittitas Valley\n- **Severity:** Moderate\n- **Description:** Northwest winds 25 to 35 mph with gusts up to 45 mph expected.\n- **When:** From 2 PM to 8 PM PDT Tuesday.\n- **Impacts:** Gusty winds may blow around unsecured objects, potentially causing tree limbs to fall, and resulting in a few power outages.\n\n**Instructions:** These strong winds can make driving difficult, especially for high-profile vehicles. Use extra caution if you are traveling in the area." + } + ] + } + ] +} +``` + +## Additional resources + +* For more information about the MCP protocol, see [MCP protocol documentation](https://modelcontextprotocol.io/introduction). +* For information about using MCP in Java, see [MCP Java SDK](https://github.com/modelcontextprotocol/java-sdk). \ No newline at end of file diff --git a/_ml-commons-plugin/agents-tools/tools/index.md b/_ml-commons-plugin/agents-tools/tools/index.md index bc71122949e..63e446f8486 100644 --- a/_ml-commons-plugin/agents-tools/tools/index.md +++ b/_ml-commons-plugin/agents-tools/tools/index.md @@ -7,6 +7,7 @@ has_toc: false nav_order: 20 redirect_from: - /ml-commons-plugin/extensibility/index/ + - /ml-commons-plugin/agents-tools/tools/ --- # Tools @@ -32,10 +33,10 @@ Each tool takes a list of parameters specific to that tool. In the preceding exa |Tool | Description | |:--- |:--- | |[`AgentTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/agent-tool/) |Runs any agent. | -|[`CatIndexTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/cat-index-tool/) |Retrieves index information for the OpenSearch cluster. | |[`ConnectorTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/connector-tool/) | Uses a [connector]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/) to call any REST API function. | -|[`IndexMappingTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index-mapping-tool/) |Retrieves index mapping and setting information for an index. | |[`CreateAnomalyDetectorTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/create-anomaly-detector/) | Enables an LLM to suggest required parameters for creating an anomaly detector. | +|[`IndexMappingTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index-mapping-tool/) |Retrieves index mapping and setting information for an index. | +|[`ListIndexTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/list-index-tool/) |Retrieves index information for the OpenSearch cluster. Introduced in OpenSearch version 3.0 as a replacement for the `CatIndexTool`. | |[`MLModelTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/ml-model-tool/) |Runs machine learning models. | |[`NeuralSparseSearchTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/neural-sparse-tool/) | Performs sparse vector retrieval. | |[`PPLTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/ppl-tool/) |Translates natural language into a Piped Processing Language (PPL) query. | @@ -47,6 +48,7 @@ Each tool takes a list of parameters specific to that tool. 
In the preceding exa |[`SearchMonitorsTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/search-monitors-tool/) | Searches for alerting monitors. | |[`VectorDBTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/vector-db-tool/) |Performs dense vector retrieval. | |[`VisualizationTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/visualization-tool/) |Finds visualizations in OpenSearch Dashboards. | +|[`WebSearchTool`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/web-search-tool/) |Answers a user's question using a web search. | ## Developer information diff --git a/_ml-commons-plugin/agents-tools/tools/cat-index-tool.md b/_ml-commons-plugin/agents-tools/tools/list-index-tool.md similarity index 83% rename from _ml-commons-plugin/agents-tools/tools/cat-index-tool.md rename to _ml-commons-plugin/agents-tools/tools/list-index-tool.md index 50ccf28b9b6..79d9d0d7b0d 100644 --- a/_ml-commons-plugin/agents-tools/tools/cat-index-tool.md +++ b/_ml-commons-plugin/agents-tools/tools/list-index-tool.md @@ -1,35 +1,38 @@ --- layout: default -title: CAT Index tool +title: List Index tool has_children: false has_toc: false -nav_order: 20 +nav_order: 35 parent: Tools grand_parent: Agents and tools --- <!-- vale off --> -# CAT Index tool -**Introduced 2.13** +# List Index tool +**Introduced 3.0** {: .label .label-purple } <!-- vale on --> -The `CatIndexTool` retrieves index information for the OpenSearch cluster, similarly to the [CAT Indices API]({{site.url}}{{site.baseurl}}/api-reference/cat/cat-indices/). +The `ListIndexTool` retrieves index information for the OpenSearch cluster, similarly to the [List Indices API]({{site.url}}{{site.baseurl}}/api-reference/list/list-indices/). -## Step 1: Register a flow agent that will run the CatIndexTool +The `ListIndexTool` replaces the `CatIndexTool` starting with OpenSearch version 3.0. +{: .note} + +## Step 1: Register a flow agent that will run the ListIndexTool A flow agent runs a sequence of tools in order and returns the last tool's output. To create a flow agent, send the following register agent request: ```json POST /_plugins/_ml/agents/_register { - "name": "Test_Agent_For_CatIndex_tool", + "name": "Test_Agent_For_ListIndex_tool", "type": "flow", - "description": "this is a test agent for the CatIndexTool", + "description": "this is a test agent for the ListIndexTool", "tools": [ { - "type": "CatIndexTool", - "name": "DemoCatIndexTool", + "type": "ListIndexTool", + "name": "DemoListIndexTool", "parameters": { "input": "${parameters.question}" } @@ -117,9 +120,9 @@ The following table lists all tool parameters that are available when registerin Parameter | Type | Required/Optional | Description :--- | :--- | :--- | :--- -`input` | String | Required | The user input used to return index information. -`index` | String | Optional | A comma-delimited list of one or more indexes on which to run the CAT operation. Default is an empty list, which means all indexes. -`local` | Boolean | Optional | When `true`, retrieves information from the local node only instead of the cluster manager node (default is `false`). +`indices` | String | Optional | A comma-delimited list of one or more indexes on which to run the list index operation. Default is an empty list, which means all indexes. +`local` | Boolean | Optional | When `true`, retrieves information from the local node only instead of the cluster manager node. Default is `false`. 
+`page_size` | Integer | Optional | Specifies the number of index results returned per page when using the List Indices API. The API retrieves index status in a paginated manner. Default is `100`. ## Execute parameters diff --git a/_ml-commons-plugin/agents-tools/tools/log-pattern-tool.md b/_ml-commons-plugin/agents-tools/tools/log-pattern-tool.md new file mode 100644 index 00000000000..9fdcc51b809 --- /dev/null +++ b/_ml-commons-plugin/agents-tools/tools/log-pattern-tool.md @@ -0,0 +1,120 @@ +--- +layout: default +title: Log Pattern Tool +has_children: false +has_toc: false +nav_order: 37 +parent: Tools +grand_parent: Agents and tools +--- + +<!-- vale off --> +# LogPatternTool +**Introduced 2.19** +{: .label .label-purple } +<!-- vale on --> + +The `LogPatternTool` analyzes log data retrieved using [query domain-specific language (DSL)]({{site.url}}{{site.baseurl}}/query-dsl/) or [Piped Processing Language (PPL)]({{site.url}}{{site.baseurl}}/search-plugins/sql/ppl/index/) queries to extract and identify recurring structural patterns across log messages. After grouping similar logs based on their shared templates, it returns the most common patterns. Each pattern includes representative sample logs and the total count of log entries that match the pattern in your dataset. + +OpenSearch determines whether you're using a DSL or PPL query based on the presence of the `input` or `ppl` parameter in the request: + +- If the `input` parameter (a DSL query JSON as a string) is present, the tool interprets the request as a DSL query. + +- If the `ppl` parameter is present and `input` is not, the tool interprets the request as a PPL query. + +- If both are provided, the tool will prioritize the DSL query. + +To avoid ambiguity, provide only one of the two---`input` for DSL or `ppl` for PPL---in your request when you run the agent. + +## Step 1: Register a flow agent that will run the LogPatternTool + +A flow agent runs a sequence of tools in order, returning the last tool's output. To create a flow agent, send the following register agent request: + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "Test_Agent_For_Log_Pattern_Tool", + "type": "flow", + "description": "this is a test agent for the LogPatternTool", + "memory": { + "type": "demo" + }, + "tools": [ + { + "type": "LogPatternTool", + "parameters": { + "sample_log_size": 1 + } + } + ] +} +``` +{% include copy-curl.html %} + +For parameter descriptions, see [Register parameters](#register-parameters). + +OpenSearch responds with an agent ID: + +```json +{ + "agent_id": "OQutgJYBAc35E4_KvI1q" +} +``` + +## Step 2: Run the agent + +Run the agent by sending the following request: + +```json +POST /_plugins/_ml/agents/OQutgJYBAc35E4_KvI1q/_execute +{ + "parameters": { + "input": "{\"query\":{\"bool\":{\"filter\":[{\"range\":{\"bytes\":{\"from\":10,\"to\":null,\"include_lower\":true,\"include_upper\":true,\"boost\":1}}}],\"adjust_pure_negative\":true,\"boost\":1}}}", + "index": "opensearch_dashboards_sample_data_logs" + } +} +``` +{% include copy-curl.html %} + +OpenSearch returns a JSON response containing the most common log patterns found in your data, up to the specified limit. Each identified pattern is represented as a JSON object with three key components: the pattern template, a set of representative sample logs that match the pattern, and a count indicating how often the pattern appears in your dataset. 
The structure follows the format `{"pattern": "...", "sample logs": [...], "total count": N}`, as illustrated in the following example response: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "result":"""[{"pattern":"<*IP*> - - [<*DATETIME*>] "GET <*> HTTP/<*><*>\" 200 <*> \"-\" \"Mozilla/<*><*> (<*>; Linux <*>_<*>; rv:<*><*><*>) Gecko/<*> Firefox/<*><*><*>\"","sample logs":["223.87.60.27 - - [2018-07-22T00:39:02.912Z] \"GET /opensearch/opensearch-1.0.0.deb_1 HTTP/1.1\" 200 6219 \"-\" \"Mozilla/5.0 (X11; Linux x86_64; rv:6.0a1) Gecko/20110421 Firefox/6.0a1\""],"total count":367},{"pattern":"<*IP*> - - [<*DATETIME*>] \"GET <*> HTTP/<*><*>\" 200 <*> \"-\" \"Mozilla/<*><*> (<*>; Linux <*>) AppleWebKit/<*><*> (KHTML like Gecko) Chrome<*IP*> Safari/<*><*>\"","sample logs":["216.9.22.134 - - [2018-07-22T05:27:11.939Z] \"GET /beats/metricbeat_1 HTTP/1.1\" 200 3629 \"-\" \"Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.50 Safari/534.24\""],"total count":311},{"pattern":"<*IP*> - - [<*DATETIME*>] \"GET <*> HTTP/<*><*>\" 200 <*> \"-\" \"Mozilla/<*><*> (compatible; MSIE 6<*>; Windows NT 5<*>; <*>; .NET CLR 1<*><*>)\"","sample logs":["99.74.118.237 - - [2018-07-22T03:34:43.399Z] \"GET /beats/metricbeat/metricbeat-6.3.2-amd64.deb_1 HTTP/1.1\" 200 14113 \"-\" \"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)\""],"total count":269}]""" + } + ] + } + ] +} +``` + +## Register parameters + +The following table lists the available tool parameters for agent registration. + +| Parameter | Type | Required/Optional | Description | +|:-----------------|:---------|:-------------------------------------------------|:------------| +| `index` | String | Required for DSL queries | The index to search for pattern analysis. | +| `input` | String | Required for DSL queries | A DSL query JSON as a string. If both `input` and `ppl` are provided, `input` (DSL) is used. | +| `ppl` | String | Required for PPL queries | A PPL query string. Ignored if `input` is also provided. | +| `source_field` | String | Optional | The field(s) to return in the result. Can be a single field or an array (for example, `["field1", "field2"]`). | +| `doc_size` | Integer | Optional | The number of documents to fetch. Default is `2`. | +| `top_n_pattern` | Integer | Optional | Limits the output to the specified number of most frequent patterns. Default is `3`. | +| `sample_log_size`| Integer | Optional | The number of sample logs to include per pattern. Default is `20`. | +| `pattern_field` | String | Optional | The field to analyze for pattern detection. If not specified, the tool selects the longest text field from the first document. | + + +## Execute parameters + +The following table lists the available tool parameters for running the agent. + +Parameter | Type | Required/Optional | Description +:--- | :--- |:-----------------------| :--- +| `index` | String | Required for DSL queries | The index to search for pattern analysis. | +| `input` | String | Required for DSL queries | A DSL query JSON as a string. If both `input` and `ppl` are provided, `input` (DSL) takes precedence. | +| `ppl` | String | Required for PPL queries | A PPL query string. Ignored if `input` is also provided. 
| \ No newline at end of file diff --git a/_ml-commons-plugin/agents-tools/tools/rag-tool.md b/_ml-commons-plugin/agents-tools/tools/rag-tool.md index c88c2d047b5..24e95286a4e 100644 --- a/_ml-commons-plugin/agents-tools/tools/rag-tool.md +++ b/_ml-commons-plugin/agents-tools/tools/rag-tool.md @@ -73,7 +73,7 @@ OpenSearch responds with an agent ID: } ``` -To create a conversational agent containing a `RAGTool`, see [Conversational agents]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/index/#conversational-agents). +To create a conversational agent containing a `RAGTool`, see [Conversational agents]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/conversational/). ## Step 2: Run the agent diff --git a/_ml-commons-plugin/agents-tools/tools/vector-db-tool.md b/_ml-commons-plugin/agents-tools/tools/vector-db-tool.md index 70d7e19321a..61c6028f283 100644 --- a/_ml-commons-plugin/agents-tools/tools/vector-db-tool.md +++ b/_ml-commons-plugin/agents-tools/tools/vector-db-tool.md @@ -26,7 +26,7 @@ In this example, you'll use the `huggingface/sentence-transformers/all-MiniLM-L1 POST /_plugins/_ml/models/_register?deploy=true { "name": "huggingface/sentence-transformers/all-MiniLM-L12-v2", - "version": "1.0.1", + "version": "1.0.2", "model_format": "TORCH_SCRIPT" } ``` diff --git a/_ml-commons-plugin/agents-tools/tools/web-search-tool.md b/_ml-commons-plugin/agents-tools/tools/web-search-tool.md new file mode 100644 index 00000000000..bdf824b836c --- /dev/null +++ b/_ml-commons-plugin/agents-tools/tools/web-search-tool.md @@ -0,0 +1,322 @@ +--- +layout: default +title: Web search tool +has_children: false +has_toc: false +nav_order: 130 +parent: Tools +grand_parent: Agents and tools +--- + +<!-- vale off --> +# Web search tool +**Introduced 3.0** +{: .label .label-purple } +<!-- vale on --> + +The `WebSearchTool` retrieves search results based on a user's question. It supports [Google](#using-google-as-a-search-engine), Bing, and [DuckDuckGo](#using-duckduckgo-as-a-search-engine) as search engines or can use a [custom API](#using-a-custom-api-as-a-search-engine) to perform searches. + +## Using DuckDuckGo as a search engine + +To use DuckDuckGo as a search engine with the `WebSearchTool`, follow these steps. + +### Step 1: Register a flow agent that will run the WebSearchTool + +A flow agent runs a sequence of tools in order and returns the last tool's output. To create a flow agent, send the following register agent request: + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "Test_Agent_For_WebSearch_tool", + "type": "flow", + "description": "this is a test agent for the WebSearchTool", + "tools": [ + { + "type": "WebSearchTool", + "name": "DuckduckgoWebSearchTool", + "parameters": { + "engine": "duckduckgo", + "input": "${parameters.question}" + } + } + ] +} +``` +{% include copy-curl.html %} + +For parameter descriptions, see [Register parameters](#register-parameters). + +OpenSearch responds with an agent ID: + +```json +{ + "agent_id": "9X7xWI0Bpc3sThaJdY9i" +} +``` + +### Step 2: Run the agent + +Then, run the agent by sending the following request (DuckDuckGo doesn't require any credentials): + +```json +POST /_plugins/_ml/agents/9X7xWI0Bpc3sThaJdY9i/_execute +{ + "parameters": { + "question": "How to create a index pattern in OpenSearch?" 
+ } +} +``` +{% include copy-curl.html %} + +OpenSearch returns the web search results: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "result": """ + { + "next_page": "https://html.duckduckgo.com/html?q=how+to+create+index+pattern+in+OpenSearch&ia=web&dc=11", + "items": [ + { + "url": "http://someurl", + "title": "the page result title", + "content": "the page content..." + }, + { + "url": "https://anotherurl", + "title": "the page result title", + "content": "the page content..." + } + ... + ] + } + """ + } + ] + } + ] +} +``` + +## Using Google as a search engine + +To use Google as a search engine with the `WebSearchTool`, follow these steps. + +### Step 1: Register a flow agent that will run the WebSearchTool + +A flow agent runs a sequence of tools in order and returns the last tool's output. To create a flow agent, send the following register agent request: + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "Test_Agent_For_WebSearch_tool", + "type": "flow", + "description": "this is a test agent for the WebSearchTool", + "tools": [ + { + "type": "WebSearchTool", + "name": "GoogleWebSearchTool", + "parameters": { + "engine": "google", + "engine_id": "${your_google_engine_id}", + "api_key": "${your_google_api_key}", + "input": "${parameters.question}" + } + } + ] +} +``` +{% include copy-curl.html %} + +For parameter descriptions, see [Register parameters](#register-parameters). + +OpenSearch responds with an agent ID: + +```json +{ + "agent_id": "9X7xWI0Bpc3sThaJdY9i" +} +``` + +### Step 2: Run the agent + +Before you run the agent, ensure that you have obtained the credentials needed to access Google search programmatically. + +Then, run the agent by sending the following request: + +```json +POST /_plugins/_ml/agents/9X7xWI0Bpc3sThaJdY9i/_execute +{ + "parameters": { + "question": "How to create a index pattern in OpenSearch?" + } +} +``` +{% include copy-curl.html %} + +OpenSearch returns the web search results: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "result": """ + { + "next_page": "https://customsearch.googleapis.com/customsearch/v1?q=how+to+create+index+pattern+in+OpenSearch&start=10", + "items": [ + { + "url": "http://someurl", + "title": "the page result title", + "content": "the page content..." + }, + { + "url": "https://anotherurl", + "title": "the page result title", + "content": "the page content..." + } + ... + ] + } + """ + } + ] + } + ] +} +``` + +## Using a custom API as a search engine + +To use a custom API as a search engine with the `WebSearchTool`, follow these steps. + +### Step 1: Register a flow agent that will run the WebSearchTool + +To use a custom endpoint for search, you need to configure the following parameters: + +- `Authorization`: For authentication +- `endpoint`: For the API connection +- `custom_res_url_jsonpath`: For parsing the JSON response and extracting links + +Your API must return responses in JSON format. The links returned by the API must be retrievable using [JSONPath](https://en.wikipedia.org/wiki/JSONPath) expressions. Other parameters like `query_key`, `offset_key`, and `limit_key` are optional but should be specified if your API uses different values than the defaults. 
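For example, if your custom search API returned a response shaped like the following hypothetical payload, the JSONPath expression `$.data[*].link` would extract both result URLs (the `data` and `link` field names here are illustrative, not part of the tool's API):

```json
{
  "data": [
    { "title": "First result",  "link": "https://example.com/result-1" },
    { "title": "Second result", "link": "https://example.com/result-2" }
  ]
}
```

Set `custom_res_url_jsonpath` to whatever path locates the result links in your own API's response.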
+ +To create a flow agent, send the following register agent request: + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "Test_Agent_For_WebSearch_tool", + "type": "flow", + "description": "this is a test agent for the WebSearchTool", + "tools": [ + { + "type": "WebSearchTool", + "name": "CustomWebSearchTool", + "parameters": { + "engine": "custom", + "endpoint": "${your_custom_endpoint}", + "custom_res_url_jsonpath": "$.data[*].link", + "Authorization": "Bearer xxxx", + "query_key": "q", + "offset_key": "offset", + "limit_key": "limit" + } + } + ] +} +``` +{% include copy-curl.html %} + +For parameter descriptions, see [Register parameters](#register-parameters). + +OpenSearch responds with an agent ID: + +```json +{ + "agent_id": "9X7xWI0Bpc3sThaJdY9i" +} +``` + +### Step 2: Run the agent + +Before you run the agent, ensure that you have obtained the credentials needed to access your custom search API programmatically. + +Then, run the agent by sending the following request: + +```json +POST /_plugins/_ml/agents/9X7xWI0Bpc3sThaJdY9i/_execute +{ + "parameters": { + "question": "How to create a index pattern in OpenSearch?" + } +} +``` +{% include copy-curl.html %} + +OpenSearch returns the web search results: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "result": """ + { + "next_page": "{your_custom_endpoint}?q=how+to+create+index+pattern+in+OpenSearch&offset=10&limit=10", + "items": [ + { + "url": "http://someurl", + "title": "the page result title", + "content": "the page content..." + }, + { + "url": "https://anotherurl", + "title": "the page result title", + "content": "the page content..." + } + ... + ] + } + """ + } + ] + } + ] +} +``` + + + +## Register parameters + +The following table lists all tool parameters that are available when registering an agent. + + + +| Parameter | Type | Required/Optional | Description | +|:---|:---|:---|:---| +| `engine` | String | Required | The search engine to use. Valid values are `google`, `bing`, `duckduckgo`, or `custom`. | +| `engine_id` | String | Optional | The Custom Search Engine ID for Google. Required when `engine` is set to `google`. | +| `api_key` | String | Optional | The API key for authentication. Required when `engine` is set to `google` or `bing`. | +| `endpoint` | String | Optional | The URL endpoint for the custom search API. Required when `engine` is set to `custom`. | +| `Authorization` | String | Optional | The authorization header value for the custom API. Required when `engine` is set to `custom`. | +| `query_key` | String | Optional | The parameter name for the search query in the custom API URL (for example, `${endpoint}?my_query_key=${question}`). Default is `q`. | +| `offset_key` | String | Optional | The parameter name for the pagination offset in the custom API URL (for example, `${endpoint}?q=${question}&start=10`). Default is `offset`. | +| `limit_key` | String | Optional | The parameter name for the result limit in the custom API URL (for example, `${endpoint}?q=${question}&start=10&limit=10`). Default is `limit`. | +| `custom_res_url_jsonpath` | String | Optional | The JSONPath expression used to extract URLs from the custom API response (for example, `$[*].link`). Required when `engine` is set to `custom`. | + +## Execute parameters + +The following table lists all tool parameters that are available when running the agent. 
+ +Parameter | Type | Required/Optional | Description +:--- | :--- | :--- | :--- +`question` | String | Required | The natural language question to send to the LLM. \ No newline at end of file diff --git a/_ml-commons-plugin/algorithms.md b/_ml-commons-plugin/algorithms.md index d7809d51b2f..1eae54e4254 100644 --- a/_ml-commons-plugin/algorithms.md +++ b/_ml-commons-plugin/algorithms.md @@ -7,7 +7,7 @@ nav_order: 125 # Supported algorithms -ML Commons supports various algorithms to help train and predict machine learning (ML) models or test data-driven predictions without a model. This page outlines the algorithms supported by the ML Commons plugin and the API operations they support. +OpenSearch provides built-in machine learning (ML) algorithms that run natively within your cluster for tasks like anomaly detection, clustering, and predictive analytics. These algorithms allow you to analyze your data directly in OpenSearch without requiring external ML models or services. Each algorithm is optimized for specific use cases, from detecting unusual patterns in metrics to grouping similar data points together. ## Common limitations @@ -59,20 +59,31 @@ The training process supports multithreading, but the number of threads must be ## Linear regression -Linear regression maps the linear relationship between inputs and outputs. In ML Commons, the linear regression algorithm is adopted from the public machine learning library [Tribuo](https://tribuo.org/), which offers multidimensional linear regression models. The model supports the linear optimizer in training, including popular approaches like Linear Decay, SQRT_DECAY, [ADA](https://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf), [ADAM](https://tribuo.org/learn/4.1/javadoc/org/tribuo/math/optimisers/Adam.html), and [RMS_DROP](https://tribuo.org/learn/4.1/javadoc/org/tribuo/math/optimisers/RMSProp.html). +Linear regression maps the linear relationship between inputs and outputs. In ML Commons, the linear regression algorithm is adopted from the public machine learning library [Tribuo](https://tribuo.org/), which offers multidimensional linear regression models. The model supports the linear optimizer in training, including popular approaches like Linear Decay, SQRT_DECAY, [ADA](https://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf), [ADAM](https://tribuo.org/learn/4.2/javadoc/org/tribuo/math/optimisers/Adam.html), and [RMS_PROP](https://tribuo.org/learn/4.2/javadoc/org/tribuo/math/optimisers/RMSProp.html). + +**Optimizers supported:** [SIMPLE_SGD](https://tribuo.org/learn/4.2/javadoc/org/tribuo/math/optimisers/SGD.html#:~:text=learning%20rate%20SGD.-,getSimpleSGD,-public%20static%C2%A0), [LINEAR_DECAY_SGD](https://tribuo.org/learn/4.2/javadoc/org/tribuo/math/optimisers/SGD.html#:~:text=linear%20decay%20SGD.-,getLinearDecaySGD,-public%20static%C2%A0), [SQRT_DECAY_SGD](https://tribuo.org/learn/4.2/javadoc/org/tribuo/math/optimisers/SGD.html#:~:text=sqrt%20decay%20SGD.-,getSqrtDecaySGD,-public%20static%C2%A0), [ADA_GRAD](https://tribuo.org/learn/4.2/javadoc/org/tribuo/math/optimisers/AdaGrad.html), [ADA_DELTA](https://tribuo.org/learn/4.1/javadoc/org/tribuo/math/optimisers/AdaDelta.html), [ADAM](https://tribuo.org/learn/4.1/javadoc/org/tribuo/math/optimisers/Adam.html), and [RMS_PROP](https://tribuo.org/learn/4.1/javadoc/org/tribuo/math/optimisers/RMSProp.html). 
+**Objectives supported:** [ABSOLUTE_LOSS](https://tribuo.org/learn/4.2/javadoc/org/tribuo/regression/sgd/objectives/AbsoluteLoss.html), [HUBER](https://tribuo.org/learn/4.2/javadoc/org/tribuo/regression/sgd/objectives/Huber.html), and [SQUARED_LOSS](https://tribuo.org/learn/4.2/javadoc/org/tribuo/regression/sgd/objectives/SquaredLoss.html). +**momentum_type supported:** [STANDARD](https://tribuo.org/learn/4.2/javadoc/org/tribuo/math/optimisers/SGD.Momentum.html#STANDARD:~:text=No%20momentum.-,STANDARD,-public%20static%20final) and [NESTEROV](https://tribuo.org/learn/4.2/javadoc/org/tribuo/math/optimisers/SGD.Momentum.html#STANDARD:~:text=Standard%20momentum.-,NESTEROV,-public%20static%20final). ### Parameters Parameter | Type | Description | Default value :--- |:--- | :--- | :--- -`learningRate` | Double | The initial step size used in an iterative optimization algorithm. | `0.01` -`momentumFactor` | Double | The extra weight factors that accelerate the rate at which the weight is adjusted. This helps move the minimization routine out of local minima. | `0` +`target` | String | The name of the target variable to predict. Identifies which feature the model will learn to predict during training. | `NA` +`learning_rate` | Double | The initial step size used in an iterative optimization algorithm. | `0.01` +`momentum_factor` | Double | The extra weight factors that accelerate the rate at which the weight is adjusted. This helps move the minimization routine out of local minima. | `0` `epsilon` | Double | The value for stabilizing gradient inversion. | `1.00E-06` `beta1` | Double | The exponential decay rates for the moment estimates. | `0.9` `beta2` | Double | The exponential decay rates for the moment estimates. | `0.99` -`decayRate` | Double | The Root Mean Squared Propagation (RMSProp). | `0.9` -`momentumType` | String | The defined Stochastic Gradient Descent (SGD) momentum type that helps accelerate gradient vectors in the right directions, leading to a fast convergence.| `STANDARD` -`optimizerType` | String | The optimizer used in the model. | `SIMPLE_SGD` +`decay_rate` | Double | The Root Mean Squared Propagation (RMSProp). | `0.9` +`momentum_type` | String | The defined Stochastic Gradient Descent (SGD) momentum type that helps accelerate gradient vectors in the right directions, leading to a fast convergence.| `STANDARD` +`optimiser` | String | The optimizer used in the model. | `ADA_GRAD` +`objective` | String | The objective function used. | `SQUARED_LOSS` +`epochs` | Integer | The number of iterations. | `5`| +`batch_size` | Integer | The minimum batch size. | `1` +`logging_interval` | Integer | The frequency of logging during training iterations. Set to `-1` to disable logging. | `-1` +`seed` | Long | A random seed used for reproducible results. Controls the initialization of random number generators. | `12345` + ### Supported APIs @@ -412,23 +423,27 @@ The Localization algorithm can only be executed directly. Therefore, it cannot b A classification algorithm, logistic regression models the probability of a discrete outcome given an input variable. In ML Commons, these classifications include both binary and multi-class. The most common is the binary classification, which takes two values, such as "true/false" or "yes/no", and predicts the outcome based on the values specified. Alternatively, a multi-class output can categorize different inputs based on type. 
This makes logistic regression most useful for situations where you are trying to determine how your inputs fit best into a specified category. +**Optimizers supported:** [SIMPLE_SGD](https://tribuo.org/learn/4.2/javadoc/org/tribuo/math/optimisers/SGD.html#:~:text=learning%20rate%20SGD.-,getSimpleSGD,-public%20static%C2%A0), [LINEAR_DECAY_SGD](https://tribuo.org/learn/4.2/javadoc/org/tribuo/math/optimisers/SGD.html#:~:text=linear%20decay%20SGD.-,getLinearDecaySGD,-public%20static%C2%A0), [SQRT_DECAY_SGD](https://tribuo.org/learn/4.2/javadoc/org/tribuo/math/optimisers/SGD.html#:~:text=sqrt%20decay%20SGD.-,getSqrtDecaySGD,-public%20static%C2%A0), [ADA_GRAD](https://tribuo.org/learn/4.2/javadoc/org/tribuo/math/optimisers/AdaGrad.html), [ADA_DELTA](https://tribuo.org/learn/4.1/javadoc/org/tribuo/math/optimisers/AdaDelta.html), [ADAM](https://tribuo.org/learn/4.1/javadoc/org/tribuo/math/optimisers/Adam.html), and [RMS_PROP](https://tribuo.org/learn/4.1/javadoc/org/tribuo/math/optimisers/RMSProp.html). +**Objectives supported:** [HINGE](https://tribuo.org/learn/4.2/javadoc/org/tribuo/classification/sgd/objectives/Hinge.html) and [LOGMULTICLASS](https://tribuo.org/learn/4.2/javadoc/org/tribuo/classification/sgd/objectives/LogMulticlass.html). +**Momentum type supported:** [STANDARD](https://tribuo.org/learn/4.2/javadoc/org/tribuo/math/optimisers/SGD.Momentum.html#STANDARD:~:text=No%20momentum.-,STANDARD,-public%20static%20final) and [NESTEROV](https://tribuo.org/learn/4.2/javadoc/org/tribuo/math/optimisers/SGD.Momentum.html#STANDARD:~:text=Standard%20momentum.-,NESTEROV,-public%20static%20final). + ### Parameters | Parameter | Type | Description | Default value | |---|---|---|---| -| `learningRate` | Double | The initial step size used in an iterative optimization algorithm. | `1` | -| `momentumFactor` | Double | The extra weight factors that accelerate the rate at which the weight is adjusted. This helps move the minimization routine out of local minima. | `0` | +| `learning_rate` | Double | The initial step size used in an iterative optimization algorithm. | `1` | +| `momentum_factor` | Double | The extra weight factors that accelerate the rate at which the weight is adjusted. This helps move the minimization routine out of local minima. | `0` | | `epsilon` | Double | The value for stabilizing gradient inversion. | `0.1` | | `beta1` | Double | The exponential decay rates for the moment estimates. | `0.9` | | `beta2` | Double | The exponential decay rates for the moment estimates. | `0.99` | -| `decayRate` | Double | The Root Mean Squared Propagation (RMSProp). | `0.9` | -| `momentumType` | String | The Stochastic Gradient Descent (SGD) momentum that helps accelerate gradient vectors in the right direction, leading to faster convergence between vectors. | `STANDARD` | -| `optimizerType` | String | The optimizer used in the model. | `AdaGrad` | +| `decay_rate` | Double | The Root Mean Squared Propagation (RMSProp). | `0.9` | +| `momentum_type` | String | The Stochastic Gradient Descent (SGD) momentum that helps accelerate gradient vectors in the right directions, leading to a fast convergence. | `STANDARD` | +| `optimiser` | String | The optimizer used in the model. | `ADA_GRAD` | | `target` | String | The target field. | null | -| `objectiveType` | String | The objective function type. | `LogMulticlass` | +| `objective` | String | The objective function type. | `LOGMULTICLASS` | | `epochs` | Integer | The number of iterations. | `5` | -| `batchSize` | Integer | The size of min batches. 
| `1` | -| `loggingInterval` | Integer | The interval of logs lost after many iterations. The interval is `1` if the algorithm contains no logs. | `1000` | +| `batch_size` | Integer | The minimum batch size. | `1` | +| `logging_interval` | Integer | The frequency of logging during training iterations. | `1000` | ### Supported APIs diff --git a/_ml-commons-plugin/api/agent-apis/delete-agent.md b/_ml-commons-plugin/api/agent-apis/delete-agent.md index ddde8fb19bf..e9cd78fe25e 100644 --- a/_ml-commons-plugin/api/agent-apis/delete-agent.md +++ b/_ml-commons-plugin/api/agent-apis/delete-agent.md @@ -12,7 +12,7 @@ nav_order: 50 You can use this API to delete an agent based on the `agent_id`. -## Path and HTTP methods +## Endpoints ```json DELETE /_plugins/_ml/agents/<agent_id> diff --git a/_ml-commons-plugin/api/agent-apis/execute-agent.md b/_ml-commons-plugin/api/agent-apis/execute-agent.md index 2af4fc2c8ef..a788610cf60 100644 --- a/_ml-commons-plugin/api/agent-apis/execute-agent.md +++ b/_ml-commons-plugin/api/agent-apis/execute-agent.md @@ -10,14 +10,22 @@ nav_order: 20 **Introduced 2.13** {: .label .label-purple } -When an agent is executed, it runs the tools with which it is configured. +When an agent is executed, it runs the tools with which it is configured. Starting with OpenSearch version 3.0, you can execute an agent asynchronously by setting the `async` query parameter to `true`. -### Path and HTTP methods +### Endpoints ```json POST /_plugins/_ml/agents/<agent_id>/_execute ``` +## Query parameters + +The following table lists the available query parameters. + +Parameter | Data type | Required/Optional | Description +:--- | :--- | :--- | :--- +`async` | Boolean | Optional | If `true`, executes the agent asynchronously and returns a `task_id` to track execution. To check the status of the task, use the [Tasks API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/tasks-apis/get-task/). Default is `false`. + ## Request body fields The following table lists the available request fields. diff --git a/_ml-commons-plugin/api/agent-apis/get-agent.md b/_ml-commons-plugin/api/agent-apis/get-agent.md index 7a03e852219..dfc348cc8b8 100644 --- a/_ml-commons-plugin/api/agent-apis/get-agent.md +++ b/_ml-commons-plugin/api/agent-apis/get-agent.md @@ -12,7 +12,7 @@ nav_order: 20 You can retrieve agent information using the `agent_id`. -## Path and HTTP methods +## Endpoints ```json GET /_plugins/_ml/agents/<agent_id> diff --git a/_ml-commons-plugin/api/agent-apis/index.md b/_ml-commons-plugin/api/agent-apis/index.md index 72bf6082ce9..420b0309cf3 100644 --- a/_ml-commons-plugin/api/agent-apis/index.md +++ b/_ml-commons-plugin/api/agent-apis/index.md @@ -4,7 +4,7 @@ title: Agent APIs parent: ML Commons APIs has_children: true has_toc: false -nav_order: 27 +nav_order: 30 redirect_from: /ml-commons-plugin/api/agent-apis/ --- @@ -17,6 +17,7 @@ You can automate machine learning (ML) tasks using agents and tools. 
An _agent_ ML Commons supports the following agent-level APIs: - [Register agent]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/register-agent/) +- [Update agent]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/update-agent/) - [Execute agent]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/execute-agent/) - [Get agent]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/get-agent/) - [Search agent]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/search-agent/) diff --git a/_ml-commons-plugin/api/agent-apis/register-agent.md b/_ml-commons-plugin/api/agent-apis/register-agent.md index 0057b444278..0d47e7b4a0d 100644 --- a/_ml-commons-plugin/api/agent-apis/register-agent.md +++ b/_ml-commons-plugin/api/agent-apis/register-agent.md @@ -14,13 +14,14 @@ Use this API to register an agent. Agents may be of the following types: -- Flow agent -- Conversational flow agent -- Conversational agent +- _Flow_ agent +- _Conversational flow_ agent +- _Conversational agent_ +- _Plan-execute-reflect_ agent -For more information about agents, see [Agents and tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/index/). +For more information about agents, see [Agents]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/). -## Path and HTTP methods +## Endpoints ```json POST /_plugins/_ml/agents/_register @@ -34,24 +35,32 @@ The following table lists the available request fields. Field | Data type | Required/Optional | Agent type | Description :--- | :--- | :--- | :--- | :--- `name`| String | Required | All | The agent name. | -`type` | String | Required | All | The agent type. Valid values are `flow`, `conversational_flow`, and `conversational`. For more information, see [Agents]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/index/). | +`type` | String | Required | All | The agent type. Valid values are [`flow`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/flow/), [`conversational_flow`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/conversational-flow/), [`conversational`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/conversational/), and [`plan_execute_and_reflect`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/plan-execute-reflect/). For more information, see [Agents]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/). | `description` | String | Optional| All | A description of the agent. | `tools` | Array | Optional | All | A list of tools for the agent to execute. `app_type` | String | Optional | All | Specifies an optional agent category. You can then perform operations on all agents in the category. For example, you can delete all messages for RAG agents. `memory.type` | String | Optional | `conversational_flow`, `conversational` | Specifies where to store the conversational memory. Currently, the only supported type is `conversation_index` (store the memory in a conversational system index). `llm.model_id` | String | Required | `conversational` | The model ID of the LLM to which to send questions. `llm.parameters.response_filter` | String | Required | `conversational` | The pattern for parsing the LLM response. For each LLM, you need to provide the field where the response is located. For example, for the Anthropic Claude model, the response is located in the `completion` field, so the pattern is `$.completion`. For OpenAI models, the pattern is `$.choices[0].message.content`. 
-`llm.parameters.max_iteration` | Integer | Optional | `conversational` | The maximum number of messages to send to the LLM. Default is `3`. +`llm.parameters.max_iteration` | Integer | Optional | `conversational` | The maximum number of messages to send to the LLM. Default is `10`. +`parameters` | Object | Optional | All | Agent parameters, which may be used to control the `max_steps` executed by the agent, modify default prompts, and so on. +`parameters.executor_agent_id`| String | Optional | `plan_execute_and_reflect` | The `plan_execute_and_reflect` agent internally uses a `conversational` agent to execute each step. By default, this executor agent uses the same model as the planning model specified in the `llm` configuration. To use a different model for executing steps, create a `conversational` agent using another model and pass the agent ID in this field. This can be useful if you want to use different models for planning and execution. +`parameters.max_steps` | Integer | Optional | `plan_execute_and_reflect` | The maximum number of steps executed by the LLM. Default is `20`. +`parameters.executor_max_iterations` | Integer | Optional | `plan_execute_and_reflect` | The maximum number of messages sent to the LLM by the executor agent. Default is `20`. +`parameters._llm_interface` | String | Required | `plan_execute_and_reflect`, `conversational` | Specifies how to parse the LLM output when using function calling. Valid values are: <br> - `bedrock/converse/claude`: Anthropic Claude conversational models hosted on Amazon Bedrock <br> - `bedrock/converse/deepseek_r1`: DeepSeek-R1 models hosted on Amazon Bedrock <br> - `openai/v1/chat/completions`: OpenAI chat completion models hosted on OpenAI. Each interface defines a default response schema and function call parser. The `tools` array contains a list of tools for the agent. Each tool contains the following fields. Field | Data type | Required/Optional | Description :--- | :--- | :--- +`type` | String | Required | The tool type. For a list of supported tools, see [Tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index/). `name`| String | Optional | The tool name. The tool name defaults to the `type` parameter value. If you need to include multiple tools of the same type in an agent, specify different names for the tools. | -`type` | String | Required | The tool type. For a list of supported tools, see [Tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index/). +`description`| String | Optional | The tool description. Defaults to a built-in description for the specified type. | `parameters` | Object | Optional | The parameters for this tool. The parameters are highly dependent on the tool type. You can find information about specific tool types in [Tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index/). +`attributes.input_schema` | Object | Optional | The expected input format for this tool, defined as a [JSON schema](https://json-schema.org/). Used to define the structure the LLM should follow when calling the tool. +`attributes.strict` | Boolean | Optional | Whether the LLM must strictly adhere to the tool's input schema when calling the tool. 
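As a sketch of how the `attributes` fields fit into a tool definition, the following tool entry (adapted from the `ListIndexTool` input schema used elsewhere in this documentation; the property description is illustrative) declares the argument structure that the LLM should produce when calling the tool:

```json
{
  "type": "ListIndexTool",
  "name": "RetrieveIndexMetaTool",
  "attributes": {
    "input_schema": {
      "type": "object",
      "properties": {
        "indices": {
          "type": "array",
          "items": { "type": "string" },
          "description": "A list of OpenSearch index names. Use an empty array to list all indices."
        }
      },
      "additionalProperties": false
    },
    "strict": false
  }
}
```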
-#### Example request: Flow agent +## Example request: Flow agent ```json POST /_plugins/_ml/agents/_register @@ -86,7 +95,7 @@ POST /_plugins/_ml/agents/_register ``` {% include copy-curl.html %} -#### Example request: Conversational flow agent +## Example request: Conversational flow agent ```json POST /_plugins/_ml/agents/_register @@ -137,7 +146,7 @@ Assistant:""" ``` {% include copy-curl.html %} -#### Example request: Conversational agent +## Example request: Conversational agent ```json POST /_plugins/_ml/agents/_register @@ -173,7 +182,7 @@ POST /_plugins/_ml/agents/_register } }, { - "type": "CatIndexTool", + "type": "ListIndexTool", "name": "RetrieveIndexMetaTool", "description": "Use this tool to get OpenSearch index information: (health, status, index, uuid, primary count, replica count, docs.count, docs.deleted, store.size, primary.store.size)." } @@ -182,7 +191,45 @@ POST /_plugins/_ml/agents/_register ``` {% include copy-curl.html %} -#### Example response +## Example request: Plan-execute-reflect agent +**Introduced 3.0** +{: .label .label-purple } + +```json +POST /_plugins/_ml/agents/_register +{ + "name": "My plan execute and reflect agent", + "type": "plan_execute_and_reflect", + "description": "this is a test agent", + "llm": { + "model_id": "<llm_model_id>", + "parameters": { + "prompt": "${parameters.question}" + } + }, + "memory": { + "type": "conversation_index" + }, + "parameters": { + "_llm_interface": "<llm_interface>" + }, + "tools": [ + { + "type": "ListIndexTool" + }, + { + "type": "SearchIndexTool" + }, + { + "type": "IndexMappingTool" + } + ], + "app_type": "os_chat" +} +``` +{% include copy-curl.html %} + +## Example response OpenSearch responds with an agent ID that you can use to refer to the agent: diff --git a/_ml-commons-plugin/api/agent-apis/search-agent.md b/_ml-commons-plugin/api/agent-apis/search-agent.md index 63b1d07eedb..0d08449eb5f 100644 --- a/_ml-commons-plugin/api/agent-apis/search-agent.md +++ b/_ml-commons-plugin/api/agent-apis/search-agent.md @@ -12,7 +12,7 @@ nav_order: 30 Use this command to search for agents you've already created. You can provide any OpenSearch search query in the request body. -## Path and HTTP methods +## Endpoints ```json GET /_plugins/_ml/agents/_search diff --git a/_ml-commons-plugin/api/agent-apis/update-agent.md b/_ml-commons-plugin/api/agent-apis/update-agent.md new file mode 100644 index 00000000000..193b2b42d17 --- /dev/null +++ b/_ml-commons-plugin/api/agent-apis/update-agent.md @@ -0,0 +1,81 @@ +--- +layout: default +title: Update agent +parent: Agent APIs +grand_parent: ML Commons APIs +nav_order: 15 +--- + +# Update an agent +**Introduced 3.1** +{: .label .label-purple } + +Use this API to update an existing agent's configuration. + +## Endpoints + +```json +PUT /_plugins/_ml/agents/<agent_id> +``` + +## Path parameters + +The following table lists the available path parameters. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `agent_id` | String | The agent ID of the agent to update. | + +## Request body fields + +The following table lists the available request fields. All request body fields are optional. + +Field | Data type | Agent type | Description +:--- | :--- | :--- | :--- +`name`| String | All | The agent name. +`description` | String | All | A description of the agent. +`tools` | Array | All | A list of tools for the agent to execute. +`app_type` | String | All | Specifies an optional agent category. 
+`memory.type` | String | `conversational_flow`, `conversational` | Specifies where to store the conversational memory. Currently, the only supported type is `conversation_index` (store the memory in a conversational system index). +`llm.model_id` | String | `conversational` | The model ID of the large language model (LLM) to send questions to. +`llm.parameters.response_filter` | String | `conversational` | The pattern for parsing the LLM response. +`llm.parameters.max_iteration` | Integer | `conversational` | The maximum number of messages to send to the LLM. + +#### Example request: Update tool prompt + +```json +PUT /_plugins/_ml/agents/N8AE1osB0jLkkocYjz7D +{ + "name": "Updated_Test_Agent_For_RAG", + "description": "Updated description for test agent", + "tools": [ + { + "type": "MLModelTool", + "description": "Updated general tool to answer any question", + "parameters": { + "model_id": "NWR9YIsBUysqmzBdifVJ", + "prompt": "This is an updated prompt" + } + } + ] +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "_index": ".plugins-ml-agent", + "_id": "ryN5jpcBfY4uTYhorKvh", + "_version": 2, + "result": "updated", + "_shards": { + "total": 1, + "successful": 1, + "failed": 0 + }, + "_seq_no": 1, + "_primary_term": 1 +} +``` \ No newline at end of file diff --git a/_ml-commons-plugin/api/async-batch-ingest.md b/_ml-commons-plugin/api/async-batch-ingest.md index 493f192d0f6..43514f45dae 100644 --- a/_ml-commons-plugin/api/async-batch-ingest.md +++ b/_ml-commons-plugin/api/async-batch-ingest.md @@ -4,16 +4,20 @@ title: Asynchronous batch ingestion parent: ML Commons APIs has_children: false has_toc: false -nav_order: 35 +nav_order: 80 --- # Asynchronous batch ingestion -**Introduced 2.17** -{: .label .label-purple } +**Deprecated 3.0** +{: .label .label-red } + +This feature is deprecated. For similar functionality, use [OpenSearch Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/). If you'd like to see this feature reinstated, [create an issue](https://github.com/opensearch-project/ml-commons/issues) in the ML Commons repository. +{: .warning} + Use the Asynchronous Batch Ingestion API to ingest data into your OpenSearch cluster from your files on remote file servers, such as Amazon Simple Storage Service (Amazon S3) or OpenAI. For detailed configuration steps, see [Asynchronous batch ingestion]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/async-batch-ingestion/). -## Path and HTTP methods +## Endpoints ```json POST /_plugins/_ml/_batch_ingestion diff --git a/_ml-commons-plugin/api/connector-apis/create-connector.md b/_ml-commons-plugin/api/connector-apis/create-connector.md index b99306bb8a1..bc0e05fda41 100644 --- a/_ml-commons-plugin/api/connector-apis/create-connector.md +++ b/_ml-commons-plugin/api/connector-apis/create-connector.md @@ -10,7 +10,7 @@ nav_order: 10 Creates a standalone connector. For more information, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). -## Path and HTTP methods +## Endpoints ```json POST /_plugins/_ml/connectors/_create diff --git a/_ml-commons-plugin/api/connector-apis/delete-connector.md b/_ml-commons-plugin/api/connector-apis/delete-connector.md index 75dff32016d..4969c42a20f 100644 --- a/_ml-commons-plugin/api/connector-apis/delete-connector.md +++ b/_ml-commons-plugin/api/connector-apis/delete-connector.md @@ -10,7 +10,7 @@ nav_order: 30 Deletes a standalone connector. 
For more information, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). -## Path and HTTP methods +## Endpoints ```json DELETE /_plugins/_ml/connectors/<connector_id> diff --git a/_ml-commons-plugin/api/connector-apis/get-connector.md b/_ml-commons-plugin/api/connector-apis/get-connector.md index 6a8507cd32f..583705b724a 100644 --- a/_ml-commons-plugin/api/connector-apis/get-connector.md +++ b/_ml-commons-plugin/api/connector-apis/get-connector.md @@ -10,7 +10,7 @@ nav_order: 20 This API retrieves a connector by its ID. -### Path and HTTP methods +### Endpoints ```json GET /_plugins/_ml/connectors/<connector_id> diff --git a/_ml-commons-plugin/api/connector-apis/search-connector.md b/_ml-commons-plugin/api/connector-apis/search-connector.md index 3b59d51a2e9..35824ebf81a 100644 --- a/_ml-commons-plugin/api/connector-apis/search-connector.md +++ b/_ml-commons-plugin/api/connector-apis/search-connector.md @@ -10,7 +10,7 @@ nav_order: 25 Use the `_search` endpoint to search for a connector. This API uses a query to search for matching connectors. -## Path and HTTP methods +## Endpoints ```json POST /_plugins/_ml/connectors/_search diff --git a/_ml-commons-plugin/api/connector-apis/update-connector.md b/_ml-commons-plugin/api/connector-apis/update-connector.md index 4b62652da86..6e90d94c9ae 100644 --- a/_ml-commons-plugin/api/connector-apis/update-connector.md +++ b/_ml-commons-plugin/api/connector-apis/update-connector.md @@ -19,7 +19,7 @@ Using this API, you can update the connector fields listed in the [Request field For information about user access for this API, see [Model access control considerations]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/index/#model-access-control-considerations). -## Path and HTTP methods +## Endpoints ```json PUT /_plugins/_ml/connectors/<connector_id> diff --git a/_ml-commons-plugin/api/controller-apis/create-controller.md b/_ml-commons-plugin/api/controller-apis/create-controller.md index 9fe9306575e..bd87d50d06f 100644 --- a/_ml-commons-plugin/api/controller-apis/create-controller.md +++ b/_ml-commons-plugin/api/controller-apis/create-controller.md @@ -19,7 +19,7 @@ The POST method creates a new controller. The PUT method updates an existing con To learn how to set rate limits at the model level for all users, see [Update Model API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/update-model/). The rate limit is set to either the model-level limit or the user-level limit, whichever is more restrictive. For example, if the model-level limit is 2 requests per minute and the user-level limit is 4 requests per minute, the overall limit will be set to 2 requests per minute. -## Path and HTTP methods +## Endpoints ```json POST /_plugins/_ml/controllers/<model_id> diff --git a/_ml-commons-plugin/api/controller-apis/delete-controller.md b/_ml-commons-plugin/api/controller-apis/delete-controller.md index 44120198fae..3c7aa1397ac 100644 --- a/_ml-commons-plugin/api/controller-apis/delete-controller.md +++ b/_ml-commons-plugin/api/controller-apis/delete-controller.md @@ -12,7 +12,7 @@ nav_order: 50 Use this API to delete a controller for a model based on the `model_id`. 
-## Path and HTTP methods +## Endpoints ```json DELETE /_plugins/_ml/controllers/<model_id> diff --git a/_ml-commons-plugin/api/controller-apis/get-controller.md b/_ml-commons-plugin/api/controller-apis/get-controller.md index 48e6f165494..b01ef969117 100644 --- a/_ml-commons-plugin/api/controller-apis/get-controller.md +++ b/_ml-commons-plugin/api/controller-apis/get-controller.md @@ -12,7 +12,7 @@ nav_order: 20 Use this API to retrieve information about a controller for a model by model ID. -### Path and HTTP methods +### Endpoints ```json GET /_plugins/_ml/controllers/<model_id> diff --git a/_ml-commons-plugin/api/controller-apis/index.md b/_ml-commons-plugin/api/controller-apis/index.md index 2f9afc14913..3846caafcc4 100644 --- a/_ml-commons-plugin/api/controller-apis/index.md +++ b/_ml-commons-plugin/api/controller-apis/index.md @@ -4,7 +4,7 @@ title: Controller APIs parent: ML Commons APIs has_children: true has_toc: false -nav_order: 29 +nav_order: 60 redirect_from: /ml-commons-plugin/api/controller-apis/ --- diff --git a/_ml-commons-plugin/api/execute-algorithm.md b/_ml-commons-plugin/api/execute-algorithm.md index 6acd9264448..c2e216671eb 100644 --- a/_ml-commons-plugin/api/execute-algorithm.md +++ b/_ml-commons-plugin/api/execute-algorithm.md @@ -2,14 +2,14 @@ layout: default title: Execute algorithm parent: ML Commons APIs -nav_order: 37 +nav_order: 90 --- # Execute algorithm Some algorithms, such as [Localization]({{site.url}}{{site.baseurl}}/ml-commons-plugin/algorithms#localization), don't require trained models. You can run no-model-based algorithms using the `execute` API. -## Path and HTTP methods +## Endpoints ```json POST _plugins/_ml/_execute/<algorithm_name> diff --git a/_ml-commons-plugin/api/index.md b/_ml-commons-plugin/api/index.md index 65171b163f6..4160cae5842 100644 --- a/_ml-commons-plugin/api/index.md +++ b/_ml-commons-plugin/api/index.md @@ -9,9 +9,9 @@ redirect_from: - /ml-commons-plugin/api/ --- -# ML Commons APIs +# ML APIs -ML Commons supports the following APIs: +OpenSearch supports the following machine learning (ML) APIs: - [Model APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/index/) - [Model group APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-group-apis/index/) @@ -23,3 +23,4 @@ ML Commons supports the following APIs: - [Tasks APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/tasks-apis/index/) - [Profile API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/profile/) - [Stats API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/stats/) +- [MCP Server APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/mcp-server-apis/) diff --git a/_ml-commons-plugin/api/mcp-server-apis/index.md b/_ml-commons-plugin/api/mcp-server-apis/index.md new file mode 100644 index 00000000000..a9ef4ef9b16 --- /dev/null +++ b/_ml-commons-plugin/api/mcp-server-apis/index.md @@ -0,0 +1,31 @@ +--- +layout: default +title: MCP server APIs +parent: ML Commons APIs +has_children: true +has_toc: false +nav_order: 40 +redirect_from: + - /ml-commons-plugin/api/mcp-server-apis/ +--- + +# MCP server APIs +**Introduced 3.0** +{: .label .label-purple } + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). 
+{: .warning} + +[Model Context Protocol (MCP)](https://modelcontextprotocol.io/introduction) is a protocol that defines how an agent can discover and execute tools. The MCP server allows external agents to connect to and use [tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/) available in OpenSearch. For a list of supported tools, see [Tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/). + +The default HTTP transport method does not support streaming. You must install the [`transport-reactor-netty4`]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/network-settings/#selecting-the-transport) HTTP transport plugin and use it as the default HTTP transport layer. Both the `transport-reactor-netty4` plugin and the MCP server feature are experimental. +{: .note} + +ML Commons supports the following MCP APIs: + +- [Register MCP tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/mcp-server-apis/register-mcp-tools/) +- [Update MCP tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/mcp-server-apis/update-mcp-tools/) +- [List MCP tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/mcp-server-apis/list-mcp-tools/) +- [Remove MCP tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/mcp-server-apis/remove-mcp-tools/) +- [MCP SSE session]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/mcp-server-apis/sse-session/) +- [MCP SSE message]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/mcp-server-apis/sse-message/) \ No newline at end of file diff --git a/_ml-commons-plugin/api/mcp-server-apis/list-mcp-tools.md b/_ml-commons-plugin/api/mcp-server-apis/list-mcp-tools.md new file mode 100644 index 00000000000..8b619b2d765 --- /dev/null +++ b/_ml-commons-plugin/api/mcp-server-apis/list-mcp-tools.md @@ -0,0 +1,70 @@ +--- +layout: default +title: List MCP tools +parent: MCP server APIs +grand_parent: ML Commons APIs +nav_order: 30 +--- + +# List MCP tools +**Introduced 3.1** +{: .label .label-purple } + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning} + +Use this API to list all Model Context Protocol (MCP)-based tools by name. + +## Endpoints + +```json +GET /_plugins/_ml/mcp/tools/_list +``` + +## Example request + +```json +GET /_plugins/_ml/mcp/tools/_list +``` +{% include copy-curl.html %} + +## Example response + +OpenSearch responds with the MCP tool list: + +```json +{ + "tools": [ + { + "type": "WebSearchTool", + "name": "GoogleSearchTool", + "description": "This tool can be used to perform search via google engine and parse the content of the searched results", + "attributes": { + "input_schema": { + "type": "object", + "properties": { + "next_page": { + "description": "The search result's next page link. 
If this is provided, the WebSearchTool will fetch the next page results using this link and crawl the links on the page.", + "type": "string" + }, + "engine": { + "description": "The search engine that will be used by the tool.", + "type": "string" + }, + "query": { + "description": "The search query parameter that will be used by the engine to perform the search.", + "type": "string" + } + }, + "required": [ + "engine", + "query" + ] + }, + "strict": false + }, + "create_time": 1749864622040 + } + ] +} +``` \ No newline at end of file diff --git a/_ml-commons-plugin/api/mcp-server-apis/register-mcp-tools.md b/_ml-commons-plugin/api/mcp-server-apis/register-mcp-tools.md new file mode 100644 index 00000000000..69a50298030 --- /dev/null +++ b/_ml-commons-plugin/api/mcp-server-apis/register-mcp-tools.md @@ -0,0 +1,395 @@ +--- +layout: default +title: Register MCP tools +parent: MCP server APIs +grand_parent: ML Commons APIs +nav_order: 10 +--- + +# Register MCP tools +**Introduced 3.0** +{: .label .label-purple } + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning} + +Use this API to register one or more Model Context Protocol (MCP)-based tools. For more information about supported tools, see [Tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index/). + +## Endpoints + +```json +POST /_plugins/_ml/mcp/tools/_register +``` + +## Request body fields + +The following table lists the available request fields. + +Field | Data type | Required/Optional | Description +:--- | :--- | :--- | :--- +`tools` | Array | Required | A list of tools. + + +The `tools` array contains a list of tools. Each tool contains the following fields. + +Field | Data type | Required/Optional | Description +:--- | :--- | :--- +`name`| String | Optional | The tool name. The tool name defaults to the `type` parameter value. If you need to include multiple tools of the same type in the MCP server, specify different names for the tools. | +`type` | String | Required | The tool type. For a list of supported tools, see [Tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index/). +`description` | String | Optional | The description of the tool. +`parameters` | Object | Optional | The parameters for the tool. The parameters are dependent on the tool type. For information about specific tool types, see [Tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index/). +`attributes` | Object | Optional | The configuration properties (attributes) for the tool. The most important attribute in this field is the tool's `input_schema`, which defines the expected parameter format for the tool. This schema is sent to the large language model (LLM) so it can properly format parameters when executing the tool. + +## Example requests + +The [built-in tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index/) are categorized as either zero-configuration tools (no parameters required) or parameterized tools (require parameters). Zero-configuration tools use a standard initialization process and thus have the same request body because no parameters are required. In contrast, for parameterized tools, you must provide the correct initialization parameters to ensure the tool functions as expected. 
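Because `type` is the only required field for each tool, a minimal registration request for a single zero-configuration tool can be as short as the following sketch (the full example that follows also supplies explicit descriptions and input schemas):

```json
POST /_plugins/_ml/mcp/tools/_register
{
  "tools": [
    {
      "type": "ListIndexTool"
    }
  ]
}
```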
+ +### Example request: Zero-configuration tools + +<details markdown="block"> + <summary> + Example request + </summary> + +```json +{ + "tools": [ + { + "name": "ListIndexTool", + "type": "ListIndexTool", + "description": "This tool gets index information from the OpenSearch cluster. It takes 2 optional arguments named `indices` which is a comma-delimited list of one or more indices to get information from (default is an empty list meaning all indices), and `local` which means whether to return information from the local node only instead of the cluster manager node (default is false). The tool returns the indices information, including `health`, `status`, `index`, `uuid`, `pri`, `rep`, `docs.count`, `docs.deleted`, `store.size`, `pri.store. size `, `pri.store.size`, `pri.store`.", + "attributes": { + "input_schema": { + "type": "object", + "properties": { + "indices": { + "type": "array", + "items": { + "type": "string" + }, + "description": "OpenSearch index name list, separated by comma. for example: [\"index1\", \"index2\"], use empty array [] to list all indices in the cluster" + } + }, + "additionalProperties": false + } + } + }, + { + "name": "SearchIndexTool", + "type": "SearchIndexTool", + "description": "Use this tool to search an index by providing two parameters: 'index' for the index name, and 'query' for the OpenSearch DSL formatted query. Only use this tool when both index name and DSL query is available.", + "attributes": { + "input_schema": { + "type": "object", + "properties": { + "index": { + "type": "string" + }, + "query": { + "type": "string" + } + }, + "additionalProperties": false + } + } + }, + { + "name": "IndexMappingTool", + "type": "IndexMappingTool", + "description": "This tool gets index mapping information from a certain index. It takes 1 required argument named 'index' which is a comma-delimited list of one or more indices to get mapping information from, which expands wildcards. It takes 1 optional argument named 'local' which means whether to return information from the local node only instead of the cluster manager node (Default is false). The tool returns a list of index mappings and settings for each index. The mappings are in JSON format under the key 'properties' which includes the field name as a key and a JSON object with field type under the key 'type'. The settings are in flattened map with 'index' as the top element and key-value pairs for each setting.", + "attributes": { + "input_schema": { + "type": "object", + "properties": { + "index": { + "type": "array", + "description": "OpenSearch index name list, separated by comma. for example: [\"index1\", \"index2\"]", + "items": { + "type": "string" + } + } + }, + "required": [ + "index" + ], + "additionalProperties": false + } + } + }, + { + "name": "SearchAlertsTool", + "type": "SearchAlertsTool", + "description": "Use this tool to search an index by providing two parameters: 'index' for the index name, and 'query' for the OpenSearch DSL formatted query. Only use this tool when both index name and DSL query is available.", + "attributes": { + "input_schema": { + "type": "object", + "properties": { + "index": { + "type": "string", + "description": "OpenSearch index name. for example: index1" + }, + "query": { + "type": "object", + "description": "OpenSearch search index query. You need to get index mapping to write correct search query. It must be a valid OpenSearch query. 
Valid value:\n{\"query\":{\"match\":{\"population_description\":\"seattle 2023 population\"}},\"size\":2,\"_source\":\"population_description\"}\nInvalid value: \n{\"match\":{\"population_description\":\"seattle 2023 population\"}}\nThe value is invalid because the match not wrapped by \"query\".", + "additionalProperties": false + } + }, + "required": ["index", "query"], + "additionalProperties": false + } + } + }, + { + "name": "SearchAnomalyDetectorsTool", + "type": "SearchAnomalyDetectorsTool", + "description": "This is a tool that searches anomaly detectors. It takes 12 optional arguments named detectorName which is the explicit name of the detector (default is null), and detectorNamePattern which is a wildcard query to match detector name (default is null), and indices which defines the index or index pattern the detector is detecting over (default is null), and highCardinality which defines whether the anomaly detector is high cardinality (synonymous with multi-entity) of non-high-cardinality (synonymous with single-entity) (default is null, indicating both), and lastUpdateTime which defines the latest update time of the anomaly detector in epoch milliseconds (default is null), and sortOrder which defines the order of the results (options are asc or desc, and default is asc), and sortString which defines how to sort the results (default is name.keyword), and size which defines the size of the request to be returned (default is 20), and startIndex which defines the paginated index to start from (default is 0), and running which defines whether the anomaly detector is running (default is null, indicating both), and failed which defines whether the anomaly detector has failed (default is null, indicating both). The tool returns 2 values: a list of anomaly detectors (each containing the detector id, detector name, detector type indicating multi-entity or single-entity (where multi-entity also means high-cardinality), detector description, name of the configured index, last update time in epoch milliseconds), and the total number of anomaly detectors.", + "attributes": { + "input_schema": { + "type": "object", + "properties": { + "detectorName": { + "type": "string", + "description": "Anomaly detector name" + }, + "detectorNamePattern": { + "type": "string", + "description": "Anomaly detector name pattern" + }, + "indices": { + "type": "string", + "description": "The index name that anomaly detector uses" + }, + "highCardinality": { + "type": "string", + "description": "The value is true of false, the detector type will be set to MULTI_ENTITY if it's value is true, otherwise SINGLE_ENTITY if it's false" + }, + "lastUpdateTime": { + "type": "string", + "description": "The last update time of the anomaly detector" + }, + "sortString": { + "type": "string", + "description": "The sort key of the search result, default value is `name.keyword` which means the sorting is based on the detector name" + }, + "sortOrder": { + "type": "string", + "description": "The search result order is based on this value, default is asc which means the sorting is in ascending manner." 
+ }, + "size": { + "type": "string", + "description": "This value controls how many search results will be fetched, default value is 20 which means at most 20 anomaly detecotrs can return" + }, + "startIndex": { + "type": "string", + "description": "The start index of the search, default value is 0 which means starts from the beginning" + }, + "running": { + "type": "string", + "description": "The running status of the anomaly detector, valid values are true and false, default is null" + }, + "failed": { + "type": "string", + "description": "The failed status of the anomaly detector, valid values are true and false, default is null" + } + }, + "additionalProperties": false + } + } + }, + { + "name": "SearchAnomalyResultsTool", + "type": "SearchAnomalyResultsTool", + "description": "This is a tool that searches anomaly results. It takes 9 arguments named detectorId which defines the detector ID to filter for (default is null), and realtime which defines whether the anomaly results are from a realtime detector (set to false to only get results from historical analyses) (default is null), and anomalyGradeThreshold which defines the threshold for anomaly grade (a number between 0 and 1 that indicates how anomalous a data point is) (default is greater than 0), and dataStartTime which defines the start time of the anomaly data in epoch milliseconds (default is null), and dataEndTime which defines the end time of the anomaly data in epoch milliseconds (default is null), and sortOrder which defines the order of the results (options are asc or desc, and default is desc), and sortString which defines how to sort the results (default is data_start_time), and size which defines the number of anomalies to be returned (default is 20), and startIndex which defines the paginated index to start from (default is 0). The tool returns 2 values: a list of anomaly results (where each result contains the detector ID, the anomaly grade, and the confidence), and the total number of anomaly results.", + "attributes": { + "input_schema": { + "type": "object", + "properties": { + "detectorId": { + "type": "string", + "description": "Anomaly detector id" + }, + "realTime": { + "type": "string", + "description": "If the anomaly detector a real time one, valid values are true and false, default is null" + }, + "anomalyGradeThreshold": { + "type": "string", + "description": "A float number to indicate the anomaly grade" + }, + "dataStartTime": { + "type": "string", + "description": "Start time of the data in the anomaly detector" + }, + "dataEndTime": { + "type": "string", + "description": "End time of the data in the anomaly detector" + }, + "sortString": { + "type": "string", + "description": "The sort key of the search result, default value is `name.keyword` which means the sorting is based on the detector name" + }, + "sortOrder": { + "type": "string", + "description": "The search result order is based on this value, default is asc which means the sorting is in ascending manner." + }, + "size": { + "type": "string", + "description": "This value controls how many search results will be fetched, default value is 20 which means at most 20 anomaly detecotrs can return" + }, + "startIndex": { + "type": "string", + "description": "The start index of the search, default value is 0 which means starts from the beginning" + } + }, + "additionalProperties": false + } + } + }, + { + "name": "SearchMonitorsTool", + "type": "SearchMonitorsTool", + "description": "This is a tool that searches alerting monitors. 
It takes 10 optional arguments named monitorId which defines the monitor ID to filter for (default is null), and monitorName which defines explicit name of the monitor (default is null), and monitorNamePattern which is a wildcard query to match monitor name (default is null), and enabled which defines whether the monitor is enabled (default is null, indicating both enabled and disabled), and hasTriggers which defines whether the monitor has triggers enabled (default is null, indicating both), and indices which defines the index being monitored (default is null), and sortOrder which defines the order of the results (options are asc or desc, and default is asc), and sortString which defines how to sort the results (default is name.keyword), and size which defines the size of the request to be returned (default is 20), and startIndex which defines the paginated index to start from (default is 0). The tool returns 2 values: a list of alerting monitors (each containining monitor ID, monitor name, monitor type (indicating query-level, document-level, or bucket-level monitor types), enabled, enabled time in epoch milliseconds, last update time in epoch milliseconds), and the total number of alerting monitors.", + "attributes": { + "input_schema": { + "type": "object", + "properties": { + "monitorId": { + "type": "string", + "description": "Alerting monitor id" + }, + "monitorName": { + "type": "string", + "description": "Alerting monitor name" + }, + "monitorNamePattern": { + "type": "string", + "description": "Alerting monitor name pattern" + }, + "enabled": { + "type": "string", + "description": "If the alerting monitor enabled or not, valid values are true and false, default is null" + }, + "hasTriggers": { + "type": "string", + "description": "If the alerting monitor has triggers, valid values are true and false, default is null" + }, + "indices": { + "type": "string", + "description": "The index names that alerting monitor uses" + }, + "sortString": { + "type": "string", + "description": "The sort key of the search result, default value is `name.keyword` which means the sorting is based on the detector name" + }, + "sortOrder": { + "type": "string", + "description": "The search result order is based on this value, default is asc which means the sorting is in ascending manner." + }, + "size": { + "type": "string", + "description": "This value controls how many search results will be fetched, default value is 20 which means at most 20 alerting monitors can return" + }, + "startIndex": { + "type": "string", + "description": "The start index of the search, default value is 0 which means starts from the beginning" + } + }, + "additionalProperties": false + } + } + } + ] +} +``` +{% include copy-curl.html %} + +</details> + +### Example requests: Parameterized tools + +The following sections provide example requests for registering parameterized tools. For information about tool-specific parameters, see the corresponding [tool documentation]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index/). + +#### WebSearchTool + +```json +POST /_plugins/_ml/mcp/tools/_register +{ + "tools": [ + { + "type": "WebSearchTool", + "name": "GoogleSearchTool", + "attributes": { + "input_schema": { + "type": "object", + "properties": { + "engine": { + "type": "string", + "description": "The search engine that will be used by the tool." + }, + "query": { + "type": "string", + "description": "The search query parameter that will be used by the engine to perform the search." 
+ }, + "next_page": { + "type": "string", + "description": "The search result's next page link. If this is provided, the WebSearchTool will fetch the next page results using this link and crawl the links on the page." + } + }, + "required": [ + "engine", + "query" + ] + }, + "strict": false + } + } + ] +} +``` +{% include copy-curl.html %} + +#### PPLTool + +```json +POST /_plugins/_ml/mcp/tools/_register +{ + "type": "PPLTool", + "name": "TransferQuestionToPPLAndExecuteTool", + "description": "Use this tool to convert natural language into PPL queries and execute them. Use this tool after you know the index name; otherwise, call IndexRoutingTool first. The input parameters are: {index: IndexName, question: UserQuestion}", + "parameters": { + "model_id": "${your_model_id}", + "model_type": "FINETUNE" + }, + "attributes": { + "input_schema": { + "type": "object", + "properties": { + "question": { + "type": "string", + "description": "The user's natural language question that needs to be converted to PPL." + }, + "index": { + "type": "string", + "description": "The index on which the generated PPL query will be executed." + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Example response + +OpenSearch responds with the node ID and the status of the creation of all tools for each node: + +```json +{ + "_ZNV5BrNTVm6ilcM7Jn1pw": { + "created": true + }, + "NZ9aiUCrSp2b5KBqdJGJKw": { + "created": true + } +} +``` \ No newline at end of file diff --git a/_ml-commons-plugin/api/mcp-server-apis/remove-mcp-tools.md b/_ml-commons-plugin/api/mcp-server-apis/remove-mcp-tools.md new file mode 100644 index 00000000000..38b5d1023e8 --- /dev/null +++ b/_ml-commons-plugin/api/mcp-server-apis/remove-mcp-tools.md @@ -0,0 +1,47 @@ +--- +layout: default +title: Remove MCP tools +parent: MCP server APIs +grand_parent: ML Commons APIs +nav_order: 40 +--- + +# Remove MCP tools +**Introduced 3.0** +{: .label .label-purple } + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning} + +Use this API to delete one or more Model Context Protocol (MCP)-based tools by name. + +## Endpoints + +```json +POST /_plugins/_ml/mcp/tools/_remove +``` + +## Example request + +```json +POST /_plugins/_ml/mcp/tools/_remove +[ + "WebSearchTool", "ListIndexTool" +] +``` +{% include copy-curl.html %} + +## Example response + +OpenSearch responds with the node ID and the status of tool deletion for each node: + +```json +{ + "_ZNV5BrNTVm6ilcM7Jn1pw": { + "removed": true + }, + "NZ9aiUCrSp2b5KBqdJGJKw": { + "removed": true + } +} +``` \ No newline at end of file diff --git a/_ml-commons-plugin/api/mcp-server-apis/sse-message.md b/_ml-commons-plugin/api/mcp-server-apis/sse-message.md new file mode 100644 index 00000000000..ad2f37297d9 --- /dev/null +++ b/_ml-commons-plugin/api/mcp-server-apis/sse-message.md @@ -0,0 +1,93 @@ +--- +layout: default +title: MCP SSE message +parent: MCP server APIs +grand_parent: ML Commons APIs +nav_order: 60 +--- + +# MCP SSE message +**Introduced 3.0** +{: .label .label-purple } + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). 
+{: .warning} + +This endpoint handles standard message interactions for the Model Context Protocol (MCP). It enables communication with the MCP server in OpenSearch through Server-Sent Events (SSE). + +Most users won't need to interact with this API directly when using a standard MCP client. + +{% comment %} +For an example client implementation, see the [OpenSearch MCP client reference implementation](https://github.com/zane-neo/opensearch-mcpserver-test-example). +{% endcomment %} + +## Endpoints + +```json +POST /_plugins/_ml/mcp/sse/message +``` + +## Request body fields + +The following table lists the available request fields. + +| Field | Data type | Required/Optional | Description | +|:------|:----------|:------------------|:------------| +| `jsonrpc` | String | | The JSON-RPC version. | +| `id` | String | | A unique ID for the request. | +| `method` | String | | The operation to perform, such as `tools/call`. | +| `params` | Object | Required | The top-level container for request parameters. | +| `params.name` | String | Required | The name of the tool to call. | +| `params.arguments` | Object | Required | The arguments to pass to the tool. | +| `params.arguments.input` | Object | Required | The input parameters for the tool. The parameters are dependent on the tool type. For information about specific tool types, see [Tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index/). | + +## Example request + +The SSE Message API provides direct, low-level access to tools using the [JSON-RPC](https://www.jsonrpc.org/) (remote procedure call) protocol structure. This differs from the agent framework approach, where tools are configured using `parameters` during agent registration. When using this API directly, you'll structure your request with `params` and `arguments` fields according to the JSON-RPC specification, bypassing the agent framework entirely: + +```json +POST /_plugins/_ml/mcp/sse/message +{ + "jsonrpc": "2.0", + "id": "110", + "method": "tools/call", + "params": { + "name": "ListIndexTool1", + "arguments": { + "indices": ["test"] + } + } +} +``` +{% include copy-curl.html %} + +## Example response + +OpenSearch sends an SSE data stream to the client: + +```json +event: message +data: { + "jsonrpc": "2.0", + "id": "100", + "result": { + "tools": [ + { + "name": "ListIndexTool", + "description": "This is my first list index tool", + "inputSchema": { + "type": "object", + "properties": { + "indices": { + "type": "array", + "items": { "type": "string" }, + "description": "A comma-separated list of OpenSearch index names. For example: [\"index1\", \"index2\"]. Use [] (an empty array) to list all indices in the cluster." + } + }, + "additionalProperties": false + } + } + ] + } +} +``` diff --git a/_ml-commons-plugin/api/mcp-server-apis/sse-session.md b/_ml-commons-plugin/api/mcp-server-apis/sse-session.md new file mode 100644 index 00000000000..f7927c205e0 --- /dev/null +++ b/_ml-commons-plugin/api/mcp-server-apis/sse-session.md @@ -0,0 +1,82 @@ +--- +layout: default +title: MCP SSE session +parent: MCP server APIs +grand_parent: ML Commons APIs +nav_order: 50 +--- + +# MCP SSE session +**Introduced 3.0** +{: .label .label-purple } + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). 
+{: .warning} + +The SSE Session API creates a Server-Sent Events (SSE) session between a client and the Model Context Protocol (MCP) server in OpenSearch. The session establishes a persistent connection that allows the server to push updates to the client. + +Most users won't need to interact with this API directly when using a standard MCP client library, which handles session management automatically. + +{% comment %} +For an example client implementation, see the [OpenSearch MCP client reference implementation](https://github.com/zane-neo/opensearch-mcpserver-test-example). +{% endcomment %} + +## URL construction methods + +The SSE Session API supports two different methods of URL construction to accommodate various client implementations. + +### Default URL construction + +When `append_to_base_url` is set to `false` (default), the API returns a relative path that clients must append to their base URL. + +The Java MCP client accepts a baseURI (for example, `http://localhost:9200`) when creating the HTTP SSE connection. The default SSE URI is `/sse`, so the full SSE URL becomes `baseUri + /sse` and the message endpoint is constructed as `baseUri + sse.data`. + +### Complete path URL construction + +When `append_to_base_url` is set to `true`, the API returns a complete path that includes the plugin prefix. + +The Python MCP client accepts an endpoint as the SSE endpoint (for example, `http://localhost:8000/_plugins/_ml/mcp/sse`) and concatenates it with `sse.data`. Setting `append_to_base_url=true` ensures that the correct message endpoint is constructed as `/_plugins/_ml/mcp/sse/message`. + +## Endpoints + +```json +GET /_plugins/_ml/mcp/sse +``` + +## Path parameters + +| Parameter | Type | Required/Optional | Description | +|:----------|:-----|:------------------|:------------| +| `append_to_base_url` | Boolean | Optional | Controls how the SSE message endpoint URL is constructed. Default is `false`. See [URL construction methods](#url-construction-methods). | + +## Example request: Default URL construction + +```json +GET /_plugins/_ml/mcp/sse +``` +{% include copy-curl.html %} + +## Example response: Default URL construction + +OpenSearch sends an SSE data stream to the client: + +```yaml +event: endpoint +data: /sse/message?sessionId=e2d65bb9-e82e-473a-b050-b69dc67ca9dd +``` + +## Example request: Complete path URL construction + +```json +GET /_plugins/_ml/mcp/sse?append_to_base_url=true +``` +{% include copy-curl.html %} + +## Example response: Complete path URL construction + +OpenSearch sends an SSE data stream to the client: + +```yaml +event: endpoint +data: /_plugins/_ml/mcp/sse/message?sessionId=e2d65bb9-e82e-473a-b050-b69dc67ca9dd +``` \ No newline at end of file diff --git a/_ml-commons-plugin/api/mcp-server-apis/update-mcp-tools.md b/_ml-commons-plugin/api/mcp-server-apis/update-mcp-tools.md new file mode 100644 index 00000000000..ecd2b938879 --- /dev/null +++ b/_ml-commons-plugin/api/mcp-server-apis/update-mcp-tools.md @@ -0,0 +1,132 @@ +--- +layout: default +title: Update MCP tools +parent: MCP server APIs +grand_parent: ML Commons APIs +nav_order: 20 +--- + +# Update MCP tools +**Introduced 3.0** +{: .label .label-purple } + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning} + +Use this API to update one or more Model Context Protocol (MCP)-based tools. 
For more information about supported tools, see [Tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index/). + +## Endpoints + +```json +POST /_plugins/_ml/mcp/tools/_update +``` + +## Request body fields + +The following table lists the available request fields. + +Field | Data type | Required/Optional | Description +:--- | :--- | :--- | :--- +`tools` | Array | Required | A list of tools. + + +The `tools` array contains a list of tools. Each tool contains the following fields. + +Field | Data type | Required/Optional | Description +:--- | :--- | :--- +`name`| String | Required | The name of the tool to update. | +`type` | String | Optional | The tool type. For a list of supported tools, see [Tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index/). +`description` | String | Optional | The description of the tool. +`parameters` | Object | Optional | The parameters for the tool. The parameters are dependent on the tool type. For information about specific tool types, see [Tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index/). +`attributes` | Object | Optional | The configuration properties (attributes) for the tool. The most important attribute in this field is the tool's `input_schema`, which defines the expected parameter format for the tool. This schema is sent to the large language model (LLM) so it can properly format parameters when executing the tool. + + +## Example requests + +The following sections provide example requests for updating tools. For information about tool-specific parameters, see the corresponding [tool documentation]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/tools/index/). + +### WebSearchTool + +```json +POST /_plugins/_ml/mcp/tools/_update +{ + "tools": [ + { + "type": "WebSearchTool", + "name": "GoogleSearchTool", + "description": "This tool can be used to perform search via google engine and parse the content of the searched results", + "attributes": { + "input_schema": { + "type": "object", + "properties": { + "engine": { + "type": "string", + "description": "The search engine that will be used by the tool." + }, + "query": { + "type": "string", + "description": "The search query parameter that will be used by the engine to perform the search." + }, + "next_page": { + "type": "string", + "description": "The search result's next page link. If this is provided, the WebSearchTool will fetch the next page results using this link and crawl the links on the page." + } + }, + "required": [ + "engine", + "query" + ] + }, + "strict": false + } + } + ] +} +``` +{% include copy-curl.html %} + +### PPLTool + +```json +POST /_plugins/_ml/mcp/tools/_update +{ + "type": "PPLTool", + "name": "TransferQuestionToPPLAndExecuteTool", + "description": "Use this tool to convert natural language into PPL queries and execute them. Use this tool after you know the index name; otherwise, call IndexRoutingTool first. The input parameters are: {index: IndexName, question: UserQuestion}", + "parameters": { + "model_id": "${your_model_id}", + "model_type": "FINETUNE" + }, + "attributes": { + "input_schema": { + "type": "object", + "properties": { + "question": { + "type": "string", + "description": "The user's natural language question that needs to be converted to PPL." + }, + "index": { + "type": "string", + "description": "The index on which the generated PPL query will be executed." 
+ } + } + } + } +} +``` +{% include copy-curl.html %} + +## Example response + +For each node, OpenSearch responds with the node ID and the status of the update operation for all tools: + +```json +{ + "_ZNV5BrNTVm6ilcM7Jn1pw": { + "updated": true + }, + "NZ9aiUCrSp2b5KBqdJGJKw": { + "updated": true + } +} +``` \ No newline at end of file diff --git a/_ml-commons-plugin/api/memory-apis/create-memory.md b/_ml-commons-plugin/api/memory-apis/create-memory.md index 58ba34b2a4c..67f2ff75bd6 100644 --- a/_ml-commons-plugin/api/memory-apis/create-memory.md +++ b/_ml-commons-plugin/api/memory-apis/create-memory.md @@ -19,7 +19,7 @@ The POST method creates a new memory. The PUT method updates an existing memory. When the Security plugin is enabled, all memories exist in a `private` security mode. Only the user who created a memory can interact with that memory and its messages. {: .important} -## Path and HTTP methods +## Endpoints ```json POST /_plugins/_ml/memory/ diff --git a/_ml-commons-plugin/api/memory-apis/create-message.md b/_ml-commons-plugin/api/memory-apis/create-message.md index 78ec0ade348..d1272e3e545 100644 --- a/_ml-commons-plugin/api/memory-apis/create-message.md +++ b/_ml-commons-plugin/api/memory-apis/create-message.md @@ -22,7 +22,7 @@ You can only update the `additional_info` field of a message. When the Security plugin is enabled, all memories exist in a `private` security mode. Only the user who created a memory can interact with that memory and its messages. {: .important} -## Path and HTTP methods +## Endpoints ```json POST /_plugins/_ml/memory/<memory_id>/messages @@ -44,11 +44,14 @@ The following table lists the available request fields. Field | Data type | Required/Optional | Updatable | Description :--- | :--- | :--- | :--- | :--- -| `input` | String | Optional | No | The question (human input) in the message. | -| `prompt_template` | String | Optional | No | The prompt template that was used for the message. The template may contain instructions or examples that were sent to the large language model. | -| `response` | String | Optional | No | The answer (generative AI output) to the question. | -| `origin` | String | Optional | No | The name of the AI or other system that generated the response. | -| `additional_info` | Object | Optional | Yes | Any other information that was sent to the `origin`. | +`input` | String | Optional | No | The question (human input) in the message. | +`prompt_template` | String | Optional | No | The prompt template that was used for the message. The template may contain instructions or examples that were sent to the large language model. | +`response` | String | Optional | No | The answer (generative AI output) to the question. | +`origin` | String | Optional | No | The name of the AI or other system that generated the response. | +`additional_info` | Object | Optional | Yes | Any other information that was sent to the `origin`. | + +To create or update a message successfully, you must provide at least one of the preceding fields. The provided field(s) cannot be null or empty. +{: .note} #### Example request: Create a message diff --git a/_ml-commons-plugin/api/memory-apis/delete-memory.md b/_ml-commons-plugin/api/memory-apis/delete-memory.md index 99e4cdb5745..46bbf3ac4a9 100644 --- a/_ml-commons-plugin/api/memory-apis/delete-memory.md +++ b/_ml-commons-plugin/api/memory-apis/delete-memory.md @@ -15,7 +15,7 @@ Use this API to delete a memory based on the `memory_id`. 
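For example, a request of the following form deletes a single memory by its ID. The memory ID shown is a placeholder taken from the Get Memory examples; substitute the ID of the memory you want to delete:

```json
DELETE /_plugins/_ml/memory/N8AE1osB0jLkkocYjz7D
```
{% include copy-curl.html %}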
When the Security plugin is enabled, all memories exist in a `private` security mode. Only the user who created a memory can interact with that memory and its messages. {: .important} -## Path and HTTP methods +## Endpoints ```json DELETE /_plugins/_ml/memory/<memory_id> diff --git a/_ml-commons-plugin/api/memory-apis/get-memory.md b/_ml-commons-plugin/api/memory-apis/get-memory.md index 7f62445072e..90a60453d24 100644 --- a/_ml-commons-plugin/api/memory-apis/get-memory.md +++ b/_ml-commons-plugin/api/memory-apis/get-memory.md @@ -29,7 +29,7 @@ When the Security plugin is enabled, all memories exist in a `private` security You can retrieve memory information by using the `memory_id`. The response includes all messages within the memory. -### Path and HTTP methods +### Endpoints ```json GET /_plugins/_ml/memory/<memory_id> @@ -65,7 +65,7 @@ GET /_plugins/_ml/memory/N8AE1osB0jLkkocYjz7D Use this command to get all memories. -### Path and HTTP methods +### Endpoints ```json GET /_plugins/_ml/memory diff --git a/_ml-commons-plugin/api/memory-apis/get-message-traces.md b/_ml-commons-plugin/api/memory-apis/get-message-traces.md index 1b0e9b19024..adaaa159543 100644 --- a/_ml-commons-plugin/api/memory-apis/get-message-traces.md +++ b/_ml-commons-plugin/api/memory-apis/get-message-traces.md @@ -18,7 +18,7 @@ When the Security plugin is enabled, all memories exist in a `private` security {: .important} -## Path and HTTP methods +## Endpoints ```json GET /_plugins/_ml/memory/message/<message_id>/traces @@ -92,7 +92,7 @@ green open opensearch_dashboards_sample_data_flights pJde0irnTce4-uobHw green open my_test_data T4hwNs7CTJGIfw2QpCqQ_Q 1 1 6 0 91.7kb 45.8kb green open .opendistro-job-scheduler-lock XjgmXAVKQ4e8Y-ac54VBzg 1 1 3 0 38.7kb 19.4kb """, - "origin": "CatIndexTool", + "origin": "ListIndexTool", "additional_info": {}, "parent_message_id": "TAuCZY0BT2tRrkdmCPqZ", "trace_number": 2 diff --git a/_ml-commons-plugin/api/memory-apis/get-message.md b/_ml-commons-plugin/api/memory-apis/get-message.md index 36baa84bf44..8807146e208 100644 --- a/_ml-commons-plugin/api/memory-apis/get-message.md +++ b/_ml-commons-plugin/api/memory-apis/get-message.md @@ -24,7 +24,7 @@ When the Security plugin is enabled, all memories exist in a `private` security You can retrieve message information by using the `message_id`. -### Path and HTTP methods +### Endpoints ```json GET /_plugins/_ml/memory/message/<message_id> @@ -68,7 +68,7 @@ For information about response fields, see [Create Message request fields]({{sit Use this command to get a list of messages for a certain memory. 
-### Path and HTTP methods +### Endpoints ```json GET /_plugins/_ml/memory/<memory_id>/messages diff --git a/_ml-commons-plugin/api/memory-apis/index.md b/_ml-commons-plugin/api/memory-apis/index.md index a279eafac93..b2399a5c13f 100644 --- a/_ml-commons-plugin/api/memory-apis/index.md +++ b/_ml-commons-plugin/api/memory-apis/index.md @@ -4,7 +4,7 @@ title: Memory APIs parent: ML Commons APIs has_children: true has_toc: false -nav_order: 28 +nav_order: 50 redirect_from: /ml-commons-plugin/api/memory-apis/ --- diff --git a/_ml-commons-plugin/api/memory-apis/search-memory.md b/_ml-commons-plugin/api/memory-apis/search-memory.md index 4fa022bfaf2..20e716a210c 100644 --- a/_ml-commons-plugin/api/memory-apis/search-memory.md +++ b/_ml-commons-plugin/api/memory-apis/search-memory.md @@ -15,7 +15,7 @@ This API retrieves a conversational memory for [conversational search]({{site.ur When the Security plugin is enabled, all memories exist in a `private` security mode. Only the user who created a memory can interact with that memory and its messages. {: .important} -## Path and HTTP methods +## Endpoints ```json GET /_plugins/_ml/memory/_search diff --git a/_ml-commons-plugin/api/memory-apis/search-message.md b/_ml-commons-plugin/api/memory-apis/search-message.md index 22602c21f27..3242ffae4bd 100644 --- a/_ml-commons-plugin/api/memory-apis/search-message.md +++ b/_ml-commons-plugin/api/memory-apis/search-message.md @@ -15,7 +15,7 @@ Retrieves message information for [conversational search]({{site.url}}{{site.bas When the Security plugin is enabled, all memories exist in a `private` security mode. Only the user who created a memory can interact with that memory and its messages. {: .important} -## Path and HTTP methods +## Endpoints ```json POST /_plugins/_ml/memory/<memory_id>/_search diff --git a/_ml-commons-plugin/api/model-apis/batch-predict.md b/_ml-commons-plugin/api/model-apis/batch-predict.md index c1dc7348fec..2b465a2170d 100644 --- a/_ml-commons-plugin/api/model-apis/batch-predict.md +++ b/_ml-commons-plugin/api/model-apis/batch-predict.md @@ -8,9 +8,6 @@ nav_order: 65 # Batch predict -This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/ml-commons/issues/2488). -{: .warning} - ML Commons can perform inference on large datasets in an offline asynchronous mode using a model deployed on external model servers. To use the Batch Predict API, you must provide the `model_id` for an externally hosted model. Amazon SageMaker, Cohere, and OpenAI are currently the only verified external servers that support this API. For information about user access for this API, see [Model access control considerations]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/index/#model-access-control-considerations). 
@@ -23,7 +20,7 @@ For instructions on how set up batch inference and connector blueprints, see the - [OpenAI batch predict connector blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/batch_inference_openAI_connector_blueprint.md) -## Path and HTTP methods +## Endpoints ```json POST /_plugins/_ml/models/<model_id>/_batch_predict diff --git a/_ml-commons-plugin/api/model-apis/delete-model.md b/_ml-commons-plugin/api/model-apis/delete-model.md index b35e7c808b2..bc152c12289 100644 --- a/_ml-commons-plugin/api/model-apis/delete-model.md +++ b/_ml-commons-plugin/api/model-apis/delete-model.md @@ -15,7 +15,7 @@ When you delete the last model version in a model group, that model group is aut For information about user access for this API, see [Model access control considerations]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/index/#model-access-control-considerations). -## Path and HTTP methods +## Endpoints ```json DELETE /_plugins/_ml/models/<model_id> @@ -44,4 +44,15 @@ DELETE /_plugins/_ml/models/MzcIJX8BA7mbufL6DOwl "_seq_no" : 27, "_primary_term" : 18 } -``` \ No newline at end of file +``` + +## Safely deleting a model +Introduced 2.19 +{: .label .label-purple } + +To prevent accidental deletion of models in active use by agents, search pipelines, ingest pipelines, or other components, you can enable a safety check. If the safety check is enabled and you attempt to delete a model that is in current use, OpenSearch returns an error message. To proceed with deletion: + +- Identify any components using the model and either delete them or update them so that they use other models. +- Once all dependencies are cleared, delete the model. + +For information about enabling this feature, see [Safely delete models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/cluster-settings/#safely-delete-models). \ No newline at end of file diff --git a/_ml-commons-plugin/api/model-apis/deploy-model.md b/_ml-commons-plugin/api/model-apis/deploy-model.md index 2c6991ba221..536ea688642 100644 --- a/_ml-commons-plugin/api/model-apis/deploy-model.md +++ b/_ml-commons-plugin/api/model-apis/deploy-model.md @@ -24,7 +24,7 @@ PUT _cluster/settings For information about user access for this API, see [Model access control considerations]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/index/#model-access-control-considerations). -## Path and HTTP methods +## Endpoints ```json POST /_plugins/_ml/models/<model_id>/_deploy diff --git a/_ml-commons-plugin/api/model-apis/get-model.md b/_ml-commons-plugin/api/model-apis/get-model.md index 0286497d31e..ff824d7f68d 100644 --- a/_ml-commons-plugin/api/model-apis/get-model.md +++ b/_ml-commons-plugin/api/model-apis/get-model.md @@ -12,7 +12,7 @@ You can retrieve model information using the `model_id`. For information about user access for this API, see [Model access control considerations]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/index/#model-access-control-considerations). 
-## Path and HTTP methods +## Endpoints ```json GET /_plugins/_ml/models/<model_id> @@ -41,7 +41,7 @@ GET /_plugins/_ml/models/N8AE1osB0jLkkocYjz7D "algorithm" : "TEXT_EMBEDDING", "version" : "1", "model_format" : "TORCH_SCRIPT", - "model_state" : "LOADED", + "model_state" : "DEPLOYED", "model_content_size_in_bytes" : 83408741, "model_content_hash_value" : "9376c2ebd7c83f99ec2526323786c348d2382e6d86576f750c89ea544d6bbb14", "model_config" : { @@ -55,4 +55,20 @@ GET /_plugins/_ml/models/N8AE1osB0jLkkocYjz7D "last_loaded_time" : 1665961815959, "total_chunks" : 9 } -``` \ No newline at end of file +``` + +## Valid model states + +When a model is registered, deployed, or undeployed in OpenSearch, it transitions through various model states that reflect its availability. These states help you track the model's readiness for use, loading status, or failure conditions. + +The following table lists all valid model states. + +| Model state | Description | +|:---------------------|:---------------------------------------------------------------------------------------------------------| +| `REGISTERING ` | The model is in the process of being registered to the cluster. | +| `REGISTERED` | The model metadata is registered to the cluster but not yet deployed. | +| `DEPLOYED` | The model has been successfully deployed/loaded to all eligible worker nodes and is ready for inference. | +| `DEPLOYING` | The model is in the process of being deployed to memory. | +| `PARTIALLY_DEPLOYED` | The model has been deployed to some of the eligible worker nodes. | +| `UNDEPLOYED` | The model has been successfully unloaded/undeployed from memory on all the nodes. | +| `DEPLOY_FAILED` | An error occurred while trying to deploy the model to the cluster nodes. | diff --git a/_ml-commons-plugin/api/model-apis/index.md b/_ml-commons-plugin/api/model-apis/index.md index 9cf992d54bf..f0b2b034d25 100644 --- a/_ml-commons-plugin/api/model-apis/index.md +++ b/_ml-commons-plugin/api/model-apis/index.md @@ -24,7 +24,7 @@ ML Commons supports the following model-level CRUD APIs: Predict APIs are used to invoke machine learning (ML) models. ML Commons supports the following Predict APIs: - [Predict]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/train-predict/predict/) -- [Batch Predict]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/batch-predict/) (experimental) +- [Batch Predict]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/batch-predict/) # Train API diff --git a/_ml-commons-plugin/api/model-apis/register-model.md b/_ml-commons-plugin/api/model-apis/register-model.md index 63537d04433..a0021927799 100644 --- a/_ml-commons-plugin/api/model-apis/register-model.md +++ b/_ml-commons-plugin/api/model-apis/register-model.md @@ -24,7 +24,7 @@ For information about user access for this API, see [Model access control consid If the model is more than 10 MB in size, ML Commons splits it into smaller chunks and saves those chunks in the model's index. 
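After registering a large model, you can confirm how it was stored by retrieving the model with the Get Model API and checking the `total_chunks` field in the response. The model ID below is a placeholder:

```json
GET /_plugins/_ml/models/<model_id>
```
{% include copy-curl.html %}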
-## Path and HTTP methods +## Endpoints ```json POST /_plugins/_ml/models/_register @@ -64,7 +64,7 @@ Field | Data type | Required/Optional | Description POST /_plugins/_ml/models/_register { "name": "huggingface/sentence-transformers/msmarco-distilbert-base-tas-b", - "version": "1.0.1", + "version": "1.0.3", "model_group_id": "Z1eQf4oB5Vm0Tdw8EIP2", "model_format": "TORCH_SCRIPT" } @@ -95,7 +95,7 @@ Field | Data type | Required/Optional | Description ```json POST /_plugins/_ml/models/_register { - "name": "amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v2-distill", + "name": "amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v3-distill", "version": "1.0.0", "model_group_id": "Z1eQf4oB5Vm0Tdw8EIP2", "model_format": "TORCH_SCRIPT" @@ -116,9 +116,9 @@ Field | Data type | Required/Optional | Description `name`| String | Required | The model name. | `version` | String | Required | The model version. | `model_format` | String | Required | The portable format of the model file. Valid values are `TORCH_SCRIPT` and `ONNX`. | -`function_name` | String | Required | Set this parameter to `SPARSE_ENCODING` or `SPARSE_TOKENIZE`. +`function_name` | String | Required | Set this parameter to `TEXT_EMBEDDING`, `SPARSE_ENCODING`, `SPARSE_TOKENIZE`, `TEXT_SIMILARITY`, or `QUESTION_ANSWERING`. `model_content_hash_value` | String | Required | The model content hash generated using the SHA-256 hashing algorithm. -[`model_config`](#the-model_config-object) | Object | Required | The model's configuration, including the `model_type`, `embedding_dimension`, and `framework_type`. `all_config` is an optional JSON string that contains all model configurations. | +[`model_config`](#the-model_config-object) | Object | Required | The model's configuration, including the `model_type`, `embedding_dimension`, and `framework_type`. The optional `all_config` JSON string contains all model configurations. The `additional_config` object contains the corresponding `space_type` for pretrained models or the specified `space_type` for custom models. See [Space types]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-spaces/#distance-calculation). | `url` | String | Required | The URL that contains the model. | `description` | String | Optional| The model description. | `model_group_id` | String | Optional | The model group ID of the model group to register this model to. @@ -134,6 +134,7 @@ Field | Data type | Required/Optional | Description | `embedding_dimension` | Integer | The dimension of the model-generated dense vector. For a Hugging Face model, the dimension is specified in the model card. For example, in the [`all-MiniLM-L6-v2` Hugging Face model card](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2), the statement `384 dimensional dense vector space` specifies 384 as the embedding dimension. Required. | | `framework_type` | String | The framework the model is using. Currently, OpenSearch supports `sentence_transformers` and `huggingface_transformers` frameworks. The `sentence_transformers` model outputs text embeddings directly, so ML Commons does not perform any post processing. For `huggingface_transformers`, ML Commons performs post processing by applying mean pooling to get text embeddings. See the example [`all-MiniLM-L6-v2` Hugging Face model](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) for more details. Required. | | `all_config` | String | This field is used for reference purposes. You can specify all model configurations in this field. 
For example, if you are using a Hugging Face model, you can minify the `config.json` file to one line and save its contents in the `all_config` field. Once the model is uploaded, you can use the get model API operation to get all model configurations stored in this field. Optional. | +| `additional_config` | Object | Additional model configurations. Contains the `space_type`, which specifies the distance metric for k-NN search. For OpenSearch-provided pretrained models, this value is automatically set to the corresponding metric (for example, `l2` for `huggingface/sentence-transformers/all-distilroberta-v1`). For custom models, specify your preferred space type. Optional. See [Space types]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-spaces/#distance-calculation). | You can further customize a pretrained sentence transformer model's post-processing logic with the following optional fields in the `model_config` object. @@ -153,6 +154,7 @@ POST /_plugins/_ml/models/_register "version": "1.0.0", "description": "test model", "model_format": "TORCH_SCRIPT", + "function_name": "TEXT_EMBEDDING", "model_group_id": "FTNlQ4gBYW0Qyy5ZoxfR", "model_content_hash_value": "c15f0d2e62d872be5b5bc6c84d2e0f4921541e29fefbef51d59cc10a8ae30e0f", "model_config": { diff --git a/_ml-commons-plugin/api/model-apis/search-model.md b/_ml-commons-plugin/api/model-apis/search-model.md index 729237eb74b..a203d69baaa 100644 --- a/_ml-commons-plugin/api/model-apis/search-model.md +++ b/_ml-commons-plugin/api/model-apis/search-model.md @@ -18,7 +18,7 @@ The response will contain only those model versions to which you have access. Fo For information about user access for this API, see [Model access control considerations]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/index/#model-access-control-considerations). -## Path and HTTP methods +## Endpoints ```json GET /_plugins/_ml/models/_search diff --git a/_ml-commons-plugin/api/model-apis/undeploy-model.md b/_ml-commons-plugin/api/model-apis/undeploy-model.md index 4e9360f0b53..99d088e587d 100644 --- a/_ml-commons-plugin/api/model-apis/undeploy-model.md +++ b/_ml-commons-plugin/api/model-apis/undeploy-model.md @@ -12,7 +12,7 @@ To undeploy a model from memory, use the undeploy operation. For information about user access for this API, see [Model access control considerations]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/index/#model-access-control-considerations). -### Path and HTTP methods +### Endpoints ```json POST /_plugins/_ml/models/<model_id>/_undeploy diff --git a/_ml-commons-plugin/api/model-apis/update-model.md b/_ml-commons-plugin/api/model-apis/update-model.md index 083f2cb4486..c097c735bd7 100644 --- a/_ml-commons-plugin/api/model-apis/update-model.md +++ b/_ml-commons-plugin/api/model-apis/update-model.md @@ -14,7 +14,7 @@ Updates a model based on the `model_ID`. For information about user access for this API, see [Model access control considerations]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/index/#model-access-control-considerations). -## Path and HTTP methods +## Endpoints ```json PUT /_plugins/_ml/models/<model_id> @@ -130,3 +130,53 @@ PUT /_plugins/_ml/models/9uGdCJABjaMXYrp14YRj } ``` +#### Example request: Updating the model interface + +You can update a model's interface to define input and output schemas. This is useful when working with models that lack a default interface or require customization. 
For more information about model interfaces, see [The `Interface` parameter]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/#the-interface-parameter). + +The following example request specifies the output schema for an [AI21 Labs Jurassic model](https://aws.amazon.com/bedrock/ai21/) that was registered without a post-processing function: + +```json +PUT /_plugins/_ml/models/IMcNB5UB7judm8f45nXo +{ + "interface": { + "output": "{\n \"type\": \"object\",\n \"properties\": {\n \"inference_results\": {\n \"type\": \"array\",\n \"items\": {\n \"type\": \"object\",\n \"properties\": {\n \"output\": {\n \"type\": \"array\",\n \"items\": {\n \"type\": \"object\",\n \"properties\": {\n \"name\": {\n \"type\": \"string\"\n },\n \"dataAsMap\": {\n \"type\": \"object\",\n \"properties\": {\n \"id\": {\n \"type\": \"number\"\n },\n \"prompt\": {\n \"type\": \"object\",\n \"properties\": {\n \"text\": {\n \"type\": \"string\"\n },\n \"tokens\": {\n \"type\": \"array\",\n \"items\": {\n \"type\": \"object\",\n \"properties\": {\n \"generatedToken\": {\n \"type\": \"object\",\n \"properties\": {\n \"token\": {\n \"type\": \"string\"\n },\n \"logprob\": {\n \"type\": \"number\"\n },\n \"raw_logprob\": {\n \"type\": \"number\"\n }\n }\n },\n \"textRange\": {\n \"type\": \"object\",\n \"properties\": {\n \"start\": {\n \"type\": \"number\"\n },\n \"end\": {\n \"type\": \"number\"\n }\n }\n }\n }\n }\n }\n }\n },\n \"completions\": {\n \"type\": \"array\",\n \"items\": {\n \"type\": \"object\",\n \"properties\": {\n \"data\": {\n \"type\": \"object\",\n \"properties\": {\n \"text\": {\n \"type\": \"string\"\n },\n \"tokens\": {\n \"type\": \"array\",\n \"items\": {\n \"type\": \"object\",\n \"properties\": {\n \"generatedToken\": {\n \"type\": \"object\",\n \"properties\": {\n \"token\": {\n \"type\": \"string\"\n },\n \"logprob\": {\n \"type\": \"number\"\n },\n \"raw_logprob\": {\n \"type\": \"number\"\n }\n }\n },\n \"textRange\": {\n \"type\": \"object\",\n \"properties\": {\n \"start\": {\n \"type\": \"number\"\n },\n \"end\": {\n \"type\": \"number\"\n }\n }\n }\n }\n }\n }\n }\n },\n \"finishReason\": {\n \"type\": \"object\",\n \"properties\": {\n \"reason\": {\n \"type\": \"string\"\n },\n \"length\": {\n \"type\": \"number\"\n }\n }\n }\n }\n }\n }\n }\n }\n }\n }\n },\n \"status_code\": {\n \"type\": \"integer\"\n }\n }\n }\n }\n }\n}" + } +} +``` +{% include copy-curl.html %} + +If the model was registered using the [Amazon Bedrock AI21 Labs Jurassic blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/bedrock_connector_ai21labs_jurassic_blueprint.md), a default interface is applied automatically. 
+{: .note} + +If the model interface is no longer needed, you can remove both the input and output schemas in order to bypass model schema validation: + +```json +PUT /_plugins/_ml/models/IMcNB5UB7judm8f45nXo +{ + "interface": { + "input": null, + "output": null + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "_index": ".plugins-ml-model", + "_id": "IMcNB5UB7judm8f45nXo", + "_version": 2, + "result": "updated", + "_shards": { + "total": 2, + "successful": 2, + "failed": 0 + }, + "_seq_no": 379, + "_primary_term": 5 +} +``` + diff --git a/_ml-commons-plugin/api/profile.md b/_ml-commons-plugin/api/profile.md index 8337f23e6ee..892205fee96 100644 --- a/_ml-commons-plugin/api/profile.md +++ b/_ml-commons-plugin/api/profile.md @@ -2,7 +2,7 @@ layout: default title: Profile parent: ML Commons APIs -nav_order: 40 +nav_order: 100 --- # Profile @@ -24,7 +24,7 @@ PUT _cluster/settings To clear all monitoring requests, set `plugins.ml_commons.monitoring_request_count` to `0`. -## Path and HTTP methods +## Endpoints ```json GET /_plugins/_ml/profile diff --git a/_ml-commons-plugin/api/stats.md b/_ml-commons-plugin/api/stats.md index 8d93a96d987..ef327a208e2 100644 --- a/_ml-commons-plugin/api/stats.md +++ b/_ml-commons-plugin/api/stats.md @@ -2,14 +2,17 @@ layout: default title: Stats parent: ML Commons APIs -nav_order: 50 +nav_order: 110 --- # Stats +The Stats API provides basic statistics about ML Commons, such as the number of running tasks. To monitor machine learning workflows using more detailed time-series metrics, see [Monitoring machine learning workflows]({{site.url}}{{site.baseurl}}/monitoring-your-cluster/metrics/getting-started/#monitoring-machine-learning-workflows). +{: .note } + Gets statistics related to the number of tasks. -## Path and HTTP methods +## Endpoints ```json GET /_plugins/_ml/stats diff --git a/_ml-commons-plugin/api/tasks-apis/delete-task.md b/_ml-commons-plugin/api/tasks-apis/delete-task.md index f3e0b0896f1..5db3bfb7dc7 100644 --- a/_ml-commons-plugin/api/tasks-apis/delete-task.md +++ b/_ml-commons-plugin/api/tasks-apis/delete-task.md @@ -13,7 +13,7 @@ Deletes a task based on the `task_id`. ML Commons does not check the task status when running the delete request. There is a risk that a currently running task could be deleted before the task completes. To check the status of a task, run `GET /_plugins/_ml/tasks/<task_id>` before task deletion. {: .note} -### Path and HTTP methods +### Endpoints ```json DELETE /_plugins/_ml/tasks/<task_id> diff --git a/_ml-commons-plugin/api/tasks-apis/get-task.md b/_ml-commons-plugin/api/tasks-apis/get-task.md index 14b28e84573..c27b16c1a5d 100644 --- a/_ml-commons-plugin/api/tasks-apis/get-task.md +++ b/_ml-commons-plugin/api/tasks-apis/get-task.md @@ -10,7 +10,7 @@ nav_order: 10 You can retrieve information about a task using the `task_id`. 
-## Path and HTTP methods +## Endpoints ```json GET /_plugins/_ml/tasks/<task_id> diff --git a/_ml-commons-plugin/api/tasks-apis/index.md b/_ml-commons-plugin/api/tasks-apis/index.md index e6f17aca080..cde6c989556 100644 --- a/_ml-commons-plugin/api/tasks-apis/index.md +++ b/_ml-commons-plugin/api/tasks-apis/index.md @@ -4,7 +4,7 @@ title: Tasks APIs parent: ML Commons APIs has_children: true has_toc: false -nav_order: 30 +nav_order: 70 --- # Tasks APIs diff --git a/_ml-commons-plugin/api/tasks-apis/search-task.md b/_ml-commons-plugin/api/tasks-apis/search-task.md index 526684a9ef2..026249c9a6c 100644 --- a/_ml-commons-plugin/api/tasks-apis/search-task.md +++ b/_ml-commons-plugin/api/tasks-apis/search-task.md @@ -10,7 +10,7 @@ nav_order: 15 Searches tasks based on parameters indicated in the request body. -## Path and HTTP methods +## Endpoints ```json GET /_plugins/_ml/tasks/_search diff --git a/_ml-commons-plugin/api/train-predict/predict.md b/_ml-commons-plugin/api/train-predict/predict.md index ea0938da36c..a3f369992e5 100644 --- a/_ml-commons-plugin/api/train-predict/predict.md +++ b/_ml-commons-plugin/api/train-predict/predict.md @@ -12,7 +12,7 @@ ML Commons can predict new data with your trained model either from indexed data For information about user access for this API, see [Model access control considerations]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/index/#model-access-control-considerations). -## Path and HTTP methods +## Endpoints ```json POST /_plugins/_ml/_predict/<algorithm_name>/<model_id> diff --git a/_ml-commons-plugin/cluster-settings.md b/_ml-commons-plugin/cluster-settings.md index efb13dd73ae..2cab887487a 100644 --- a/_ml-commons-plugin/cluster-settings.md +++ b/_ml-commons-plugin/cluster-settings.md @@ -2,13 +2,12 @@ layout: default title: ML Commons cluster settings has_children: false -nav_order: 10 +nav_order: 140 --- -# ML Commons cluster settings +# ML cluster settings - -To enhance and customize your OpenSearch cluster for machine learning (ML), you can add and modify several configuration settings for the ML Commons plugin in your 'opensearch.yml' file. +To enhance and customize your OpenSearch cluster for machine learning (ML), you can add and modify several configuration settings for the ML Commons plugin in your `opensearch.yml` file. To learn more about static and dynamic settings, see [Configuring OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/). @@ -36,7 +35,7 @@ We recommend setting `plugins.ml_commons.only_run_on_ml_node` to `true` on produ ### Setting -``` +```yaml plugins.ml_commons.only_run_on_ml_node: true ``` @@ -52,7 +51,7 @@ plugins.ml_commons.only_run_on_ml_node: true ### Setting -``` +```yaml plugins.ml_commons.task_dispatch_policy: round_robin ``` @@ -68,7 +67,7 @@ Sets the number of ML tasks that can run on each ML node. When set to `0`, no ML ### Setting -``` +```yaml plugins.ml_commons.max_ml_task_per_node: 10 ``` @@ -83,7 +82,7 @@ Sets the number of ML models that can be deployed to each ML node. When set to ` ### Setting -``` +```yaml plugins.ml_commons.max_model_on_node: 10 ``` @@ -99,7 +98,7 @@ When returning runtime information with the [Profile API]({{site.url}}{{site.bas ### Setting -``` +```yaml plugins.ml_commons.sync_up_job_interval_in_seconds: 3 ``` @@ -114,7 +113,7 @@ Controls how many predict requests are monitored on one node. 
If set to `0`, Ope ### Setting -``` +```yaml plugins.ml_commons.monitoring_request_count: 100 ``` @@ -129,7 +128,7 @@ Controls how many register model tasks can run in parallel on one node. If set t ### Setting -``` +```yaml plugins.ml_commons.max_register_model_tasks_per_node: 10 ``` @@ -146,7 +145,7 @@ Controls how many deploy model tasks can run in parallel on one node. If set to ### Setting -``` +```yaml plugins.ml_commons.max_deploy_model_tasks_per_node: 10 ``` @@ -157,11 +156,11 @@ plugins.ml_commons.max_deploy_model_tasks_per_node: 10 ## Register models using URLs -This setting gives you the ability to register models using a URL. By default, ML Commons only allows registration of [pretrained]({{site.url}}{{site.baseurl}}//ml-commons-plugin/pretrained-models/) models from the OpenSearch model repository. +This setting gives you the ability to register models using a URL. By default, ML Commons only allows registration of [pretrained]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/) models from the OpenSearch model repository. ### Setting -``` +```yaml plugins.ml_commons.allow_registering_model_via_url: false ``` @@ -172,11 +171,11 @@ plugins.ml_commons.allow_registering_model_via_url: false ## Register models using local files -This setting gives you the ability to register a model using a local file. By default, ML Commons only allows registration of [pretrained]({{site.url}}{{site.baseurl}}//ml-commons-plugin/pretrained-models/) models from the OpenSearch model repository. +This setting gives you the ability to register a model using a local file. By default, ML Commons only allows registration of [pretrained]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/) models from the OpenSearch model repository. ### Setting -``` +```yaml plugins.ml_commons.allow_registering_model_via_local_file: false ``` @@ -196,7 +195,7 @@ The default URL value for this trusted URL setting is not secure. For security, {: .warning } -``` +```yaml plugins.ml_commons.trusted_url_regex: <model-repository-url> ``` @@ -211,7 +210,7 @@ Assigns how long in seconds an ML task will live. After the timeout, the task wi ### Setting -``` +```yaml plugins.ml_commons.ml_task_timeout_in_seconds: 600 ``` @@ -230,7 +229,7 @@ Starting with OpenSearch 2.5, ML Commons runs a native memory circuit breaker to ### Setting -``` +```yaml plugins.ml_commons.native_memory_threshold: 90 ``` @@ -247,7 +246,7 @@ Values are based on the percentage of JVM heap memory available. When set to `0` ### Setting -``` +```yaml plugins.ml_commons.jvm_heap_memory_threshold: 85 ``` @@ -264,7 +263,7 @@ Valid values are in byte units. 
To disable the circuit breaker, set this value t ### Setting -``` +```yaml plugins.ml_commons.disk_free_space_threshold: 5G ``` @@ -279,7 +278,7 @@ Use this setting to specify the names of nodes on which you don't want to run ML ### Setting -``` +```yaml plugins.ml_commons.exclude_nodes._name: node1, node2 ``` @@ -289,7 +288,7 @@ When enabled, this setting grants users the ability to deploy models to specific ### Setting -``` +```yaml plugins.ml_commons.allow_custom_deployment_plan: false ``` @@ -304,7 +303,7 @@ This setting is applicable when you send a prediction request for an externally ### Setting -``` +```yaml plugins.ml_commons.model_auto_deploy.enable: false ``` @@ -319,13 +318,13 @@ This setting automatically redeploys deployed or partially deployed models upon ### Setting -``` +```yaml plugins.ml_commons.model_auto_redeploy.enable: true ``` ### Values -- Default value: true +- Default value: `true` - Valid values: `false`, `true` ## Set retires for auto redeploy @@ -334,7 +333,7 @@ This setting sets the limit for the number of times a deployed or partially depl ### Setting -``` +```yaml plugins.ml_commons.model_auto_redeploy.lifetime_retry_times: 3 ``` @@ -349,7 +348,7 @@ This setting sets the ratio of success for the auto-redeployment of a model base ### Setting -``` +```yaml plugins.ml_commons.model_auto_redeploy_success_ratio: 0.8 ``` @@ -364,22 +363,40 @@ When set to `true`, this setting enables the ability to run Python-based models ### Setting -``` +```yaml plugins.ml_commons.enable_inhouse_python_model: false ``` ### Values -- Default value: false +- Default value: `false` - Valid values: `false`, `true` +## Safely delete models +Introduced 2.19 +{: .label .label-purple } + +When set to `true`, this setting enables a safety feature that checks for downstream dependencies before deleting a model. This helps prevent accidental deletion of models in active use by agents, search pipelines, ingest pipelines, and other downstream tasks. If this setting is enabled and you attempt to delete a model that has active downstream dependencies, you'll receive an error message and the model will not be deleted. + +### Setting + +```yaml +plugins.ml_commons.safe_delete_model: true +``` + +### Values + +- Default value: `false` +- Valid values: `false`, `true` + + ## Enable access control for connectors When set to `true`, the setting allows admins to control access and permissions to the connector API using `backend_roles`. ### Setting -``` +```yaml plugins.ml_commons.connector_access_control_enabled: true ``` @@ -394,7 +411,7 @@ This setting allows a cluster admin to enable running local models on the cluste ### Setting -``` +```yaml plugins.ml_commons.local_model.enabled: true ``` @@ -409,7 +426,7 @@ This setting allows a cluster admin to control the types of nodes on which exter ### Setting -``` +```yaml plugins.ml_commons.task_dispatcher.eligible_node_role.remote_model: ["ml"] ``` @@ -424,7 +441,7 @@ This setting allows a cluster admin to control the types of nodes on which local ### Setting -``` +```yaml plugins.ml_commons.task_dispatcher.eligible_node_role.remote_model: ["ml"] ``` @@ -438,7 +455,7 @@ This setting allows a cluster admin to enable remote inference on the cluster. 
I ### Setting -``` +```yaml plugins.ml_commons.remote_inference.enabled: true ``` @@ -453,7 +470,7 @@ When set to `true`, this setting enables the agent framework (including agents a ### Setting -``` +```yaml plugins.ml_commons.agent_framework_enabled: true ``` @@ -468,7 +485,7 @@ When set to `true`, this setting enables conversational memory, which stores all ### Setting -``` +```yaml plugins.ml_commons.memory_feature_enabled: true ``` @@ -484,7 +501,7 @@ When set to `true`, this setting enables the search processors for retrieval-aug ### Setting -``` +```yaml plugins.ml_commons.rag_pipeline_feature_enabled: true ``` diff --git a/_ml-commons-plugin/custom-local-models.md b/_ml-commons-plugin/custom-local-models.md index 09c3105f8d1..229c23ad1c8 100644 --- a/_ml-commons-plugin/custom-local-models.md +++ b/_ml-commons-plugin/custom-local-models.md @@ -320,7 +320,7 @@ The response contains the tokens and weights: ## Step 5: Use the model for search -To learn how to use the model for vector search, see [Using an ML model for neural search]({{site.url}}{{site.baseurl}}/search-plugins/neural-search/#using-an-ml-model-for-neural-search). +To learn how to use the model for vector search, see [AI search methods]({{site.url}}{{site.baseurl}}/vector-search/ai-search/#ai-search-methods). ## Question answering models diff --git a/_ml-commons-plugin/index.md b/_ml-commons-plugin/index.md index 50d637379e1..e04f57637bd 100644 --- a/_ml-commons-plugin/index.md +++ b/_ml-commons-plugin/index.md @@ -8,34 +8,63 @@ nav_exclude: true permalink: /ml-commons-plugin/ redirect_from: - /ml-commons-plugin/index/ +models: + - heading: "Deploy local models to your cluster" + link: "/ml-commons-plugin/using-ml-models/" + list: + - "<b>Pretrained models</b>: Use OpenSearch-provided models for immediate implementation" + - "<b>Custom models</b>: Upload and serve your own models" + - heading: "Connect to externally hosted models" + link: "/ml-commons-plugin/remote-models/" + description: "Connect to models hosted on Amazon Bedrock, Amazon SageMaker, OpenAI, Cohere, DeepSeek, and other platforms" +more_cards: + - heading: "Get started with AI search" + description: "Build your first semantic search application using this hands-on tutorial" + link: "/vector-search/tutorials/neural-search-tutorial/" + - heading: "AI search" + description: "Discover AI search, from <b>semantic</b>, <b>hybrid</b>, and <b>multimodal</b> search to <b>RAG</b>" + link: "/vector-search/ai-search/" + - heading: "Tutorials" + description: "Follow step-by-step tutorials to integrate AI capabilities into your applications" + link: "/vector-search/tutorials/" + - heading: "ML API reference" + description: "Explore comprehensive documentation for machine learning API operations" + link: "/ml-commons-plugin/api/" +oa-toolkit: + - heading: "OpenSearch Assistant Toolkit" + link: "/ml-commons-plugin/opensearch-assistant/" + list: + - Agents for task orchestration + - Tools for specific operations + - Configuration automation +algorithms: + - heading: "Supported algorithms" + link: "/ml-commons-plugin/algorithms/" + description: "Learn about the natively supported clustering, pattern detection, and statistical analysis algorithms" --- # Machine learning -The [ML Commons plugin](https://github.com/opensearch-project/ml-commons/) provides machine learning (ML) features in OpenSearch. 
+OpenSearch offers two distinct approaches to machine learning (ML): using ML models for tasks like semantic search and text generation, and running statistical algorithms for data analysis. Choose the approach that best fits your use case. -## Integrating ML models +## ML models for search and AI/ML-powered applications -For ML-model-powered search, you can use a pretrained model provided by OpenSearch, upload your own model to the OpenSearch cluster, or connect to a foundation model hosted on an external platform. In OpenSearch version 2.9 and later, you can integrate local and external models simultaneously within a single cluster. +OpenSearch supports ML models that you can use to enhance search relevance through semantic understanding. You can either deploy models directly within your OpenSearch cluster or connect to models hosted on external platforms. These models can transform text into vector embeddings, enabling semantic search capabilities, or provide advanced features like text generation and question answering. For more information, see [Integrating ML models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/integrating-ml-models/). -For more information, see [Integrating ML models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/integrating-ml-models/). +{% include cards.html cards=page.models %} -## Managing ML models in OpenSearch Dashboards +## OpenSearch Assistant and automation -Administrators of ML clusters can use OpenSearch Dashboards to review and manage the status of ML models running inside a cluster. For more information, see [Managing ML models in OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/ml-commons-plugin/ml-dashboard/). +OpenSearch Assistant Toolkit helps you create AI-powered assistants for OpenSearch Dashboards. -## Support for algorithms +{% include cards.html cards=page.oa-toolkit %} -ML Commons supports various algorithms to help train ML models and make predictions or test data-driven predictions without a model. For more information, see [Supported algorithms]({{site.url}}{{site.baseurl}}/ml-commons-plugin/algorithms/). +## Built-in algorithms for data analysis -## ML Commons API +OpenSearch includes built-in algorithms that analyze your data directly within your cluster, enabling tasks like anomaly detection, data clustering, and predictive analytics without requiring external ML models. -ML Commons provides its own set of REST APIs. For more information, see [ML Commons API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/index/). +{% include cards.html cards=page.algorithms %} -## ML-powered search +## Build your solution -For information about available ML-powered search types, see [ML-powered search]({{site.url}}{{site.baseurl}}/search-plugins/index/#ml-powered-search). - -## Tutorials - -Using the OpenSearch ML framework, you can build various applications, from implementing conversational search to building your own chatbot. For more information, see [Tutorials]({{site.url}}{{site.baseurl}}/ml-commons-plugin/tutorials/index/). 
\ No newline at end of file +{% include cards.html cards=page.more_cards %} \ No newline at end of file diff --git a/_ml-commons-plugin/integrating-ml-models.md b/_ml-commons-plugin/integrating-ml-models.md index 4dbf169e544..671bb216819 100644 --- a/_ml-commons-plugin/integrating-ml-models.md +++ b/_ml-commons-plugin/integrating-ml-models.md @@ -3,6 +3,21 @@ layout: default title: Integrating ML models nav_order: 15 has_children: true +more_cards: + - heading: "Get started with AI search" + description: "Learn how to implement semantic and hybrid search in OpenSearch" + link: "/vector-search/tutorials/neural-search-tutorial/" +local_model: + - heading: "Pretrained models provided by OpenSearch" + link: "/ml-commons-plugin/pretrained-models/" + description: "Requires minimal setup and avoids the time and effort required to train a custom model" + - heading: "Custom models" + link: "/ml-commons-plugin/custom-local-models/" + description: "Offers customization for your specific use case" +external_model: + - heading: "Externally hosted models" + link: "/ml-commons-plugin/remote-models/" + description: "Learn how to create connectors for models hosted on third-party platforms" --- # Integrating ML models @@ -14,32 +29,26 @@ Before you get started, you'll need to [set up]({{site.url}}{{site.baseurl}}/qui ## Choosing a model -To integrate an ML model into your search workflow, choose one of the following options: +To integrate an ML model into your search workflow, choose one of the following options. -1. **Local model**: Upload a model to the OpenSearch cluster and use it locally. This option allows you to serve the model in your OpenSearch cluster but may require significant system resources. +### Local model - 1. **Pretrained model provided by OpenSearch**: This option requires minimal setup and avoids the time and effort required to train a custom model. +Upload a model to the OpenSearch cluster and use it locally. This option allows you to serve the model in your OpenSearch cluster but may require significant system resources. - For a list of supported models and information about using a pretrained model provided by OpenSearch, see [Pretrained models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/). +{% include cards.html cards=page.local_model %} - 1. **Custom model**: This option offers customization for your specific use case. +### Externally hosted model - For information about uploading your model, see [Using ML models within OpenSearch]({{site.url}}{{site.baseurl}}/ml-commons-plugin/using-ml-models/). - -1. **Externally hosted model**: This option allows you to connect to a model hosted on a third-party platform. It requires more setup but allows the use of models that are already hosted on a service other than OpenSearch. +Connect to a model hosted on a third-party platform. This requires more setup but allows the use of models that are already hosted on a service other than OpenSearch. - To connect to an externally hosted model, you need to set up a connector: - - - For a walkthrough with detailed steps, see [Connecting to externally hosted models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index/). - - For more information about supported connectors, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). - - For information about creating your own connector, see [Connector blueprints]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/). 
+{% include cards.html cards=page.external_model %} In OpenSearch version 2.9 and later, you can integrate local and external models simultaneously within a single cluster. {: .note} ## Tutorial -For a step-by-step tutorial, see [Neural search tutorial]({{site.url}}{{site.baseurl}}/search-plugins/neural-search-tutorial/). +{% include cards.html cards=page.more_cards %} ## Using a model @@ -56,7 +65,7 @@ You can invoke your model by calling the [Predict API]({{site.url}}{{site.baseur ### Using a model for search -OpenSearch supports multiple search methods that integrate with ML models. For more information, see [Search methods]({{site.url}}{{site.baseurl}}/search-plugins/index/#search-methods). +OpenSearch supports multiple search methods that integrate with ML models. For more information, see [AI search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/). ## Disabling a model diff --git a/_ml-commons-plugin/ml-dashboard.md b/_ml-commons-plugin/ml-dashboard.md index 20c4e636bb1..3fc2d537ae6 100644 --- a/_ml-commons-plugin/ml-dashboard.md +++ b/_ml-commons-plugin/ml-dashboard.md @@ -1,6 +1,7 @@ --- layout: default title: Managing ML models in OpenSearch Dashboards +parent: Integrating ML models nav_order: 120 redirect_from: - /ml-commons-plugin/ml-dashbaord/ diff --git a/_ml-commons-plugin/pretrained-models.md b/_ml-commons-plugin/pretrained-models.md index 552e3e607e4..15f3f3f8776 100644 --- a/_ml-commons-plugin/pretrained-models.md +++ b/_ml-commons-plugin/pretrained-models.md @@ -27,17 +27,17 @@ The following table provides a list of sentence transformer models and artifact | Model name | Version | Vector dimensions | Auto-truncation | TorchScript artifact | ONNX artifact | |:---|:---|:---|:---|:---|:---| -| `huggingface/sentence-transformers/all-distilroberta-v1` | 1.0.1 | 768-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-distilroberta-v1/1.0.1/torch_script/sentence-transformers_all-distilroberta-v1-1.0.1-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-distilroberta-v1/1.0.1/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-distilroberta-v1/1.0.1/onnx/sentence-transformers_all-distilroberta-v1-1.0.1-onnx.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-distilroberta-v1/1.0.1/onnx/config.json) | -| `huggingface/sentence-transformers/all-MiniLM-L6-v2` | 1.0.1 | 384-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L6-v2/1.0.1/torch_script/sentence-transformers_all-MiniLM-L6-v2-1.0.1-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L6-v2/1.0.1/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L6-v2/1.0.1/onnx/sentence-transformers_all-MiniLM-L6-v2-1.0.1-onnx.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L6-v2/1.0.1/onnx/config.json) | -| `huggingface/sentence-transformers/all-MiniLM-L12-v2` | 1.0.1 | 384-dimensional dense vector space. 
| Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L12-v2/1.0.1/torch_script/sentence-transformers_all-MiniLM-L12-v2-1.0.1-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L12-v2/1.0.1/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L12-v2/1.0.1/onnx/sentence-transformers_all-MiniLM-L12-v2-1.0.1-onnx.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L12-v2/1.0.1/onnx/config.json) | -| `huggingface/sentence-transformers/all-mpnet-base-v2` | 1.0.1 | 768-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-mpnet-base-v2/1.0.1/torch_script/sentence-transformers_all-mpnet-base-v2-1.0.1-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-mpnet-base-v2/1.0.1/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-mpnet-base-v2/1.0.1/onnx/sentence-transformers_all-mpnet-base-v2-1.0.1-onnx.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-mpnet-base-v2/1.0.1/onnx/config.json) | -| `huggingface/sentence-transformers/msmarco-distilbert-base-tas-b` | 1.0.2 | 768-dimensional dense vector space. Optimized for semantic search. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/msmarco-distilbert-base-tas-b/1.0.2/torch_script/sentence-transformers_msmarco-distilbert-base-tas-b-1.0.2-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/msmarco-distilbert-base-tas-b/1.0.2/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/msmarco-distilbert-base-tas-b/1.0.2/onnx/sentence-transformers_msmarco-distilbert-base-tas-b-1.0.2-onnx.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/msmarco-distilbert-base-tas-b/1.0.2/onnx/config.json) | -| `huggingface/sentence-transformers/multi-qa-MiniLM-L6-cos-v1` | 1.0.1 | 384-dimensional dense vector space. Designed for semantic search and trained on 215 million question/answer pairs. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-MiniLM-L6-cos-v1/1.0.1/torch_script/sentence-transformers_multi-qa-MiniLM-L6-cos-v1-1.0.1-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-MiniLM-L6-cos-v1/1.0.1/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-MiniLM-L6-cos-v1/1.0.1/onnx/sentence-transformers_multi-qa-MiniLM-L6-cos-v1-1.0.1-onnx.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-MiniLM-L6-cos-v1/1.0.1/onnx/config.json) | -| `huggingface/sentence-transformers/multi-qa-mpnet-base-dot-v1` | 1.0.1 | 384-dimensional dense vector space. 
| Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-dot-v1/1.0.1/torch_script/sentence-transformers_multi-qa-mpnet-base-dot-v1-1.0.1-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-dot-v1/1.0.1/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-dot-v1/1.0.1/onnx/sentence-transformers_multi-qa-mpnet-base-dot-v1-1.0.1-onnx.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-dot-v1/1.0.1/onnx/config.json) | -| `huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2` | 1.0.1 | 384-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2/1.0.1/torch_script/sentence-transformers_paraphrase-MiniLM-L3-v2-1.0.1-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2/1.0.1/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2/1.0.1/onnx/sentence-transformers_paraphrase-MiniLM-L3-v2-1.0.1-onnx.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2/1.0.1/onnx/config.json) | -| `huggingface/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2` | 1.0.1 | 384-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/1.0.1/torch_script/sentence-transformers_paraphrase-multilingual-MiniLM-L12-v2-1.0.1-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/1.0.1/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/1.0.1/onnx/sentence-transformers_paraphrase-multilingual-MiniLM-L12-v2-1.0.1-onnx.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/1.0.1/onnx/config.json) | -| `huggingface/sentence-transformers/paraphrase-mpnet-base-v2` | 1.0.0 | 768-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-mpnet-base-v2/1.0.0/torch_script/sentence-transformers_paraphrase-mpnet-base-v2-1.0.0-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-mpnet-base-v2/1.0.0/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-mpnet-base-v2/1.0.0/onnx/sentence-transformers_paraphrase-mpnet-base-v2-1.0.0-onnx.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-mpnet-base-v2/1.0.0/onnx/config.json) | -| `huggingface/sentence-transformers/distiluse-base-multilingual-cased-v1` | 1.0.1 | 512-dimensional dense vector space. 
| Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/distiluse-base-multilingual-cased-v1/1.0.1/torch_script/sentence-transformers_distiluse-base-multilingual-cased-v1-1.0.1-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/distiluse-base-multilingual-cased-v1/1.0.1/torch_script/config.json) | Not available | +| `huggingface/sentence-transformers/all-distilroberta-v1` | 1.0.2 | 768-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-distilroberta-v1/1.0.2/torch_script/sentence-transformers_all-distilroberta-v1-1.0.2-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-distilroberta-v1/1.0.2/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-distilroberta-v1/1.0.2/onnx/sentence-transformers_all-distilroberta-v1-1.0.2-onnx.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-distilroberta-v1/1.0.2/onnx/config.json) | +| `huggingface/sentence-transformers/all-MiniLM-L6-v2` | 1.0.2 | 384-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L6-v2/1.0.2/torch_script/sentence-transformers_all-MiniLM-L6-v2-1.0.2-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L6-v2/1.0.2/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L6-v2/1.0.2/onnx/sentence-transformers_all-MiniLM-L6-v2-1.0.2-onnx.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L6-v2/1.0.2/onnx/config.json) | +| `huggingface/sentence-transformers/all-MiniLM-L12-v2` | 1.0.2 | 384-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L12-v2/1.0.2/torch_script/sentence-transformers_all-MiniLM-L12-v2-1.0.2-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L12-v2/1.0.2/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L12-v2/1.0.2/onnx/sentence-transformers_all-MiniLM-L12-v2-1.0.2-onnx.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-MiniLM-L12-v2/1.0.2/onnx/config.json) | +| `huggingface/sentence-transformers/all-mpnet-base-v2` | 1.0.2 | 768-dimensional dense vector space. 
| Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-mpnet-base-v2/1.0.2/torch_script/sentence-transformers_all-mpnet-base-v2-1.0.2-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-mpnet-base-v2/1.0.2/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-mpnet-base-v2/1.0.2/onnx/sentence-transformers_all-mpnet-base-v2-1.0.2-onnx.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/all-mpnet-base-v2/1.0.2/onnx/config.json) | +| `huggingface/sentence-transformers/msmarco-distilbert-base-tas-b` | 1.0.3 | 768-dimensional dense vector space. Optimized for semantic search. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/msmarco-distilbert-base-tas-b/1.0.3/torch_script/sentence-transformers_msmarco-distilbert-base-tas-b-1.0.3-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/msmarco-distilbert-base-tas-b/1.0.3/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/msmarco-distilbert-base-tas-b/1.0.3/onnx/sentence-transformers_msmarco-distilbert-base-tas-b-1.0.3-onnx.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/msmarco-distilbert-base-tas-b/1.0.3/onnx/config.json) | +| `huggingface/sentence-transformers/multi-qa-MiniLM-L6-cos-v1` | 1.0.2 | 384-dimensional dense vector space. Designed for semantic search and trained on 215 million question/answer pairs. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-MiniLM-L6-cos-v1/1.0.2/torch_script/sentence-transformers_multi-qa-MiniLM-L6-cos-v1-1.0.2-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-MiniLM-L6-cos-v1/1.0.2/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-MiniLM-L6-cos-v1/1.0.2/onnx/sentence-transformers_multi-qa-MiniLM-L6-cos-v1-1.0.2-onnx.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-MiniLM-L6-cos-v1/1.0.2/onnx/config.json) | +| `huggingface/sentence-transformers/multi-qa-mpnet-base-dot-v1` | 1.0.2 | 768-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-dot-v1/1.0.2/torch_script/sentence-transformers_multi-qa-mpnet-base-dot-v1-1.0.2-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-dot-v1/1.0.2/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-dot-v1/1.0.2/onnx/sentence-transformers_multi-qa-mpnet-base-dot-v1-1.0.2-onnx.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-dot-v1/1.0.2/onnx/config.json) | +| `huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2` | 1.0.2 | 384-dimensional dense vector space. 
| Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2/1.0.2/torch_script/sentence-transformers_paraphrase-MiniLM-L3-v2-1.0.2-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2/1.0.2/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2/1.0.2/onnx/sentence-transformers_paraphrase-MiniLM-L3-v2-1.0.2-onnx.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2/1.0.2/onnx/config.json) | +| `huggingface/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2` | 1.0.2 | 384-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/1.0.2/torch_script/sentence-transformers_paraphrase-multilingual-MiniLM-L12-v2-1.0.2-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/1.0.2/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/1.0.2/onnx/sentence-transformers_paraphrase-multilingual-MiniLM-L12-v2-1.0.2-onnx.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/1.0.2/onnx/config.json) | +| `huggingface/sentence-transformers/paraphrase-mpnet-base-v2` | 1.0.1 | 768-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-mpnet-base-v2/1.0.1/torch_script/sentence-transformers_paraphrase-mpnet-base-v2-1.0.1-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-mpnet-base-v2/1.0.1/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-mpnet-base-v2/1.0.1/onnx/sentence-transformers_paraphrase-mpnet-base-v2-1.0.1-onnx.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/paraphrase-mpnet-base-v2/1.0.1/onnx/config.json) | +| `huggingface/sentence-transformers/distiluse-base-multilingual-cased-v1` | 1.0.2 | 512-dimensional dense vector space. | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/distiluse-base-multilingual-cased-v1/1.0.2/torch_script/sentence-transformers_distiluse-base-multilingual-cased-v1-1.0.2-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/distiluse-base-multilingual-cased-v1/1.0.2/torch_script/config.json) | Not available | ### Sparse encoding models @@ -49,10 +49,10 @@ Sparse encoding models transfer text into a sparse vector and convert the vector We recommend the following combinations for optimal performance: - Use the `amazon/neural-sparse/opensearch-neural-sparse-encoding-v2-distill` model during both ingestion and search. 
-- Use the `amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v2-distill` model during ingestion and the +- Use the `amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v3-distill` model during ingestion and the `amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1` tokenizer during search. -For more information about the preceding options for running neural sparse search, see [Generating sparse vector embeddings within OpenSearch]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-with-pipelines/). +For more information about the preceding options for running neural sparse search, see [Generating sparse vector embeddings automatically]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-with-pipelines/). The following table provides a list of sparse encoding models and artifact links you can use to download them. @@ -63,7 +63,10 @@ The following table provides a list of sparse encoding models and artifact links | `amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v1` | 1.0.1 | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v1/1.0.1/torch_script/neural-sparse_opensearch-neural-sparse-encoding-doc-v1-1.0.1-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v1/1.0.1/torch_script/config.json) | A neural sparse encoding model. The model transforms text into a sparse vector, identifies the indices of non-zero elements in the vector, and then converts the vector into `<entry, weight>` pairs, where each entry corresponds to a non-zero element index. To experiment with this model using transformers and the PyTorch API, see the [Hugging Face documentation](https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-doc-v1). | | `amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v2-distill` | 1.0.0 | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v2-distill/1.0.0/torch_script/neural-sparse_opensearch-neural-sparse-encoding-doc-v2-distill-1.0.0-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v2-distill/1.0.0/torch_script/config.json) | A neural sparse encoding model. The model transforms text into a sparse vector, identifies the indices of non-zero elements in the vector, and then converts the vector into `<entry, weight>` pairs, where each entry corresponds to a non-zero element index. To experiment with this model using transformers and the PyTorch API, see the [Hugging Face documentation](https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-doc-v2-distill). | | `amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v2-mini` | 1.0.0 | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v2-mini/1.0.0/torch_script/neural-sparse_opensearch-neural-sparse-encoding-doc-v2-mini-1.0.0-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v2-mini/1.0.0/torch_script/config.json) | A neural sparse encoding model. 
The model transforms text into a sparse vector, identifies the indices of non-zero elements in the vector, and then converts the vector into `<entry, weight>` pairs, where each entry corresponds to a non-zero element index. To experiment with this model using transformers and the PyTorch API, see the [Hugging Face documentation](https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-doc-v2-mini). | +| `amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v3-distill` | 1.0.0 | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v3-distill/1.0.0/torch_script/neural-sparse_opensearch-neural-sparse-encoding-doc-v3-distill-1.0.0-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v3-distill/1.0.0/torch_script/config.json) | A neural sparse encoding model. The model transforms text into a sparse vector, identifies the indices of non-zero elements in the vector, and then converts the vector into `<entry, weight>` pairs, where each entry corresponds to a non-zero element index. To experiment with this model using transformers and the PyTorch API, see the [Hugging Face documentation](https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill). | +| `amazon/neural-sparse/opensearch-neural-sparse-encoding-multilingual-v1` | 1.0.0 | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-encoding-multilingual-v1/1.0.0/torch_script/neural-sparse_opensearch-neural-sparse-encoding-multilingual-v1-1.0.0-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-encoding-multilingual-v1/1.0.0/torch_script/config.json) | A multilingual neural sparse encoding model. The model transforms text into a sparse vector, identifies the indices of non-zero elements in the vector, and then converts the vector into `<entry, weight>` pairs, where each entry corresponds to a non-zero element index. To experiment with this model using transformers and the PyTorch API, see the [Hugging Face documentation](https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-multilingual-v1). | | `amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1` | 1.0.1 | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1/1.0.1/torch_script/neural-sparse_opensearch-neural-sparse-tokenizer-v1-1.0.1-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1/1.0.1/torch_script/config.json) | A neural sparse tokenizer. The tokenizer splits text into tokens and assigns each token a predefined weight, which is the token's inverse document frequency (IDF). If the IDF file is not provided, the weight defaults to 1. For more information, see [Preparing a model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/custom-local-models/#preparing-a-model). 
| +| `amazon/neural-sparse/opensearch-neural-sparse-tokenizer-multilingual-v1` | 1.0.0 | Yes | - [model_url](https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-tokenizer-multilingual-v1/1.0.0/torch_script/neural-sparse_opensearch-neural-sparse-tokenizer-multilingual-v1-1.0.0-torch_script.zip)<br>- [config_url](https://artifacts.opensearch.org/models/ml-models/amazon/neural-sparse/opensearch-neural-sparse-tokenizer-multilingual-v1/1.0.0/torch_script/config.json) | A multilingual neural sparse tokenizer. The tokenizer splits text into tokens and assigns each token a predefined weight, which is the token's inverse document frequency (IDF). If the IDF file is not provided, the weight defaults to 1. For more information, see [Preparing a model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/custom-local-models/#preparing-a-model). | ### Cross-encoder models **Introduced 2.12** @@ -76,7 +79,22 @@ The following table provides a list of cross-encoder models and artifact links y | Model name | Version | TorchScript artifact | ONNX artifact | |:---|:---|:---|:---| | `huggingface/cross-encoders/ms-marco-MiniLM-L-6-v2` | 1.0.2 | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/cross-encoders/ms-marco-MiniLM-L-6-v2/1.0.2/torch_script/cross-encoders_ms-marco-MiniLM-L-6-v2-1.0.2-torch_script.zip) <br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/cross-encoders/ms-marco-MiniLM-L-6-v2/1.0.2/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/cross-encoders/ms-marco-MiniLM-L-6-v2/1.0.2/onnx/cross-encoders_ms-marco-MiniLM-L-6-v2-1.0.2-onnx.zip) <br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/cross-encoders/ms-marco-MiniLM-L-6-v2/1.0.2/onnx/config.json) | -| `huggingface/cross-encoders/ms-marco-MiniLM-L-12-v2` | 1.0.2 | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/cross-encoders/ms-marco-MiniLM-L-12-v2/1.0.2/torch_script/cross-encoders_ms-marco-MiniLM-L-12-v2-1.0.2-torch_script.zip) <br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/cross-encoders/ms-marco-MiniLM-L-12-v2/1.0.2/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/cross-encoders/ms-marco-MiniLM-L-12-v2/1.0.2/onnx/cross-encoders_ms-marco-MiniLM-L-12-v2-1.0.2-onnx.zip) <br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/cross-encoders/ms-marco-MiniLM-L-12-v2/1.0.2/onnx/config.json) +| `huggingface/cross-encoders/ms-marco-MiniLM-L-12-v2` | 1.0.2 | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/cross-encoders/ms-marco-MiniLM-L-12-v2/1.0.2/torch_script/cross-encoders_ms-marco-MiniLM-L-12-v2-1.0.2-torch_script.zip) <br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/cross-encoders/ms-marco-MiniLM-L-12-v2/1.0.2/torch_script/config.json) | - [model_url](https://artifacts.opensearch.org/models/ml-models/huggingface/cross-encoders/ms-marco-MiniLM-L-12-v2/1.0.2/onnx/cross-encoders_ms-marco-MiniLM-L-12-v2-1.0.2-onnx.zip) <br>- [config_url](https://artifacts.opensearch.org/models/ml-models/huggingface/cross-encoders/ms-marco-MiniLM-L-12-v2/1.0.2/onnx/config.json) + +### Semantic sentence highlighting models +**Introduced 3.0** +{: .label .label-purple } + +Semantic sentence highlighting models are specifically designed to work with the [`semantic` 
highlighter]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/highlight/#the-semantic-highlighter). These models analyze document text and identify the sentences that are most semantically relevant to the search query. + +For a tutorial on using these models with the semantic highlighter, see [Using semantic highlighting]({{site.url}}{{site.baseurl}}/tutorials/vector-search/semantic-highlighting-tutorial/). + +The following table provides a list of semantic sentence highlighting models and artifact links you can use to download them. Note that you must prefix the model name with `opensearch/`, as shown in the **Model name** column. + +| Model name | Version | TorchScript artifact | Description | +|:---|:---|:---|:---| +| `amazon/sentence-highlighting/opensearch-semantic-highlighter-v1` | 1.0.0 | - [model_url](https://artifacts.opensearch.org/models/ml-models/amazon/sentence-highlighting/opensearch-semantic-highlighter-v1/1.0.0/torch_script/sentence-highlighting_opensearch-semantic-highlighter-v1-1.0.0-torch_script.zip) <br>- [config_url](https://artifacts.opensearch.org/models/ml-models/amazon/sentence-highlighting/opensearch-semantic-highlighter-v1/1.0.0/torch_script/config.json) | A model optimized for identifying semantically relevant sentences to be highlighted. | + ## Prerequisites @@ -135,7 +153,7 @@ Because pretrained models originate from the ML Commons model repository, you on POST /_plugins/_ml/models/_register { "name": "huggingface/sentence-transformers/msmarco-distilbert-base-tas-b", - "version": "1.0.2", + "version": "1.0.3", "model_group_id": "Z1eQf4oB5Vm0Tdw8EIP2", "model_format": "TORCH_SCRIPT" } @@ -175,7 +193,7 @@ When the operation is complete, the state changes to `COMPLETED`: } ``` -Take note of the returned `model_id` because you’ll need it to deploy the model. +Take note of the returned `model_id` because you'll need it to deploy the model. ## Step 3: Deploy the model diff --git a/_ml-commons-plugin/remote-models/async-batch-ingestion.md b/_ml-commons-plugin/remote-models/async-batch-ingestion.md index a09c0284778..dad2af088c1 100644 --- a/_ml-commons-plugin/remote-models/async-batch-ingestion.md +++ b/_ml-commons-plugin/remote-models/async-batch-ingestion.md @@ -8,8 +8,12 @@ grand_parent: Integrating ML models # Asynchronous batch ingestion -**Introduced 2.17** -{: .label .label-purple } +**Deprecated 3.0** +{: .label .label-red } + +This feature is deprecated. For similar functionality, use [OpenSearch Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/). If you'd like to see this feature reinstated, [create an issue](https://github.com/opensearch-project/ml-commons/issues) in the ML Commons repository. +{: .warning} + [Batch ingestion]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/batch-ingestion/) configures an ingest pipeline, which processes documents one by one. For each document, batch ingestion calls an externally hosted model to generate text embeddings from the document text and then ingests the document, including text and embeddings, into an OpenSearch index. 
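To illustrate the ingest pipeline described above, the following is a minimal sketch that calls a deployed model through the `text_embedding` ingest processor. The pipeline name, the `passage_text` source field, and the `passage_embedding` target field are hypothetical placeholders, and `<deployed_model_id>` must be replaced with the ID of a model deployed on your cluster:

```json
PUT /_ingest/pipeline/nlp-ingest-pipeline
{
  "description": "Generates embeddings from passage_text by calling the hosted model",
  "processors": [
    {
      "text_embedding": {
        "model_id": "<deployed_model_id>",
        "field_map": {
          "passage_text": "passage_embedding"
        }
      }
    }
  ]
}
```
{% include copy-curl.html %}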
@@ -49,7 +53,7 @@ PUT /my-nlp-index "type": "knn_vector", "dimension": 384, "method": { - "engine": "nmslib", + "engine": "faiss", "space_type": "cosinesimil", "name": "hnsw", "parameters": { @@ -65,7 +69,7 @@ PUT /my-nlp-index "type": "knn_vector", "dimension": 384, "method": { - "engine": "nmslib", + "engine": "faiss", "space_type": "cosinesimil", "name": "hnsw", "parameters": { diff --git a/_ml-commons-plugin/remote-models/blueprints.md b/_ml-commons-plugin/remote-models/blueprints.md index 9b95c311661..ade5c83ac7a 100644 --- a/_ml-commons-plugin/remote-models/blueprints.md +++ b/_ml-commons-plugin/remote-models/blueprints.md @@ -49,7 +49,7 @@ For example, the following blueprint is a specification for an Amazon SageMaker ## OpenSearch-provided connector blueprints -OpenSearch provides connector blueprints for several machine learning (ML) platforms and models. For a list of all connector blueprints provided by OpenSearch, see [Supported connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/#supported-connectors). +OpenSearch provides connector blueprints for several machine learning (ML) platforms and models. For a list of all connector blueprints provided by OpenSearch, see [Supported connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/supported-connectors/). As an ML developer, you can build connector blueprints for other platforms. Using those blueprints, administrators and data scientists can create connectors for models hosted on those platforms. @@ -80,7 +80,7 @@ The `actions` parameter supports the following options. | `url` | String | Required. Specifies the connection endpoint at which the action occurs. This must match the regex expression for the connection used when [adding trusted endpoints]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index#adding-trusted-endpoints).| | `request_body` | String | Required. Sets the parameters contained in the request body of the action. The parameters must include `\"inputText\`, which specifies how users of the connector should construct the request payload for the `action_type`. | | `pre_process_function` | String | Optional. A built-in or custom Painless script used to preprocess the input data. OpenSearch provides the following built-in preprocess functions that you can call directly:<br> - `connector.pre_process.cohere.embedding` for [Cohere](https://cohere.com/) embedding models<br> - `connector.pre_process.openai.embedding` for [OpenAI](https://platform.openai.com/docs/guides/embeddings) embedding models <br> - `connector.pre_process.default.embedding`, which you can use to preprocess documents in neural search requests so that they are in the format that ML Commons can process with the default preprocessor (OpenSearch 2.11 or later). For more information, see [Built-in functions](#built-in-pre--and-post-processing-functions). | -| `post_process_function` | String | Optional. A built-in or custom Painless script used to post-process the model output data. 
OpenSearch provides the following built-in post-process functions that you can call directly:<br> - `connector.pre_process.cohere.embedding` for [Cohere text embedding models](https://docs.cohere.com/reference/embed)<br> - `connector.pre_process.openai.embedding` for [OpenAI text embedding models](https://platform.openai.com/docs/api-reference/embeddings) <br> - `connector.post_process.default.embedding`, which you can use to post-process documents in the model response so that they are in the format that neural search expects (OpenSearch 2.11 or later). For more information, see [Built-in functions](#built-in-pre--and-post-processing-functions). | +| `post_process_function` | String | Optional. A built-in or custom Painless script used to post-process the model output data. OpenSearch provides the following built-in post-process functions that you can call directly:<br> - `connector.post_process.cohere.embedding` for [Cohere text embedding models](https://docs.cohere.com/reference/embed)<br> - `connector.post_process.openai.embedding` for [OpenAI text embedding models](https://platform.openai.com/docs/api-reference/embeddings) <br> - `connector.post_process.default.embedding`, which you can use to post-process documents in the model response so that they are in the format that neural search expects (OpenSearch 2.11 or later). For more information, see [Built-in functions](#built-in-pre--and-post-processing-functions). | | `headers` | JSON object | Specifies the headers used in the request or response body. Default is `ContentType: application/json`. If your third-party ML tool requires access control, define the required `credential` parameters in the `headers` parameter. | The `client_config` parameter supports the following options. diff --git a/_ml-commons-plugin/remote-models/connectors.md b/_ml-commons-plugin/remote-models/connectors.md index 3ec6c73b07a..5f4a48658ed 100644 --- a/_ml-commons-plugin/remote-models/connectors.md +++ b/_ml-commons-plugin/remote-models/connectors.md @@ -27,25 +27,10 @@ OpenSearch provides connectors for several platforms, for example: A _connector blueprint_ defines the set of parameters (the request body) you need to provide when sending an API request to create a specific connector. Connector blueprints may differ based on the platform and the model that you are accessing. -OpenSearch provides connector blueprints for several ML platforms and models. For a full list of connector blueprints provided by OpenSearch, see [Supported connectors](#supported-connectors). +OpenSearch provides connector blueprints for several ML platforms and models. For a full list of connector blueprints provided by OpenSearch, see [Supported connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/supported-connectors/). As an ML developer, you can also create connector blueprints for other platforms and models. Data scientists and administrators can then use the blueprint to create connectors. They are only required to enter their `credential` settings, such as `openAI_key`, for the service to which they are connecting. For information about creating connector blueprints, including descriptions of all parameters, see [Connector blueprints]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/). -## Supported connectors - -The following table lists all connector blueprints provided by OpenSearch. 
Follow the links to each connector blueprint for an example request that you can use to create the connector, including all parameters, and an example Predict API request. - -Platform | Model | Connector blueprint -:--- | :--- | :--- -[Amazon Bedrock](https://aws.amazon.com/bedrock/) | [AI21 Labs Jurassic-2 Mid](https://aws.amazon.com/bedrock/jurassic/) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/bedrock_connector_ai21labs_jurassic_blueprint.md) -[Amazon Bedrock](https://aws.amazon.com/bedrock/) | [Anthropic Claude v2](https://aws.amazon.com/bedrock/claude/) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/bedrock_connector_anthropic_claude_blueprint.md) -[Amazon Bedrock](https://aws.amazon.com/bedrock/) | [Titan Text Embeddings](https://aws.amazon.com/bedrock/titan/) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/bedrock_connector_titan_embedding_blueprint.md) -[Amazon SageMaker](https://aws.amazon.com/sagemaker/) | Text embedding models | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/sagemaker_connector_blueprint.md) -[Cohere](https://cohere.com/) | [Text Embedding models](https://docs.cohere.com/reference/embed) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/cohere_connector_embedding_blueprint.md) -[Cohere](https://cohere.com/) | [Chat models](https://docs.cohere.com/reference/chat) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/cohere_connector_chat_blueprint.md) -[OpenAI](https://openai.com/) | Chat models (for example, `gpt-3.5-turbo`) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/open_ai_connector_chat_blueprint.md) -[OpenAI](https://openai.com/) | Completion models (for example, `text-davinci-003`) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/open_ai_connector_completion_blueprint.md) -[OpenAI](https://openai.com/) | Text embedding models (for example, `text-embedding-ada-002`) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/openai_connector_embedding_blueprint.md) ## Creating a connector @@ -55,6 +40,9 @@ You can provision connectors in two ways: 2. [Create a connector for a specific externally hosted model](#creating-a-connector-for-a-specific-model): Alternatively, you can create a connector that can only be used with the model for which it was created. To access such a connector, you only need access to the model itself because the connection is established inside the model. These connectors are saved in the model index. +If using Python, you can create connectors using the [opensearch-py-ml](https://github.com/opensearch-project/opensearch-py-ml) client CLI. The CLI automates many configuration steps, making setup faster and reducing the chance of errors. For more information about using the CLI, see the [CLI documentation](https://opensearch-project.github.io/opensearch-py-ml/cli/index.html#). +{: .tip} + ## Creating a standalone connector Standalone connectors can be used by multiple models. 
To create a standalone connector, send a request to the `connectors/_create` endpoint and provide all of the parameters described in [Connector blueprints]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/): @@ -131,7 +119,7 @@ POST /_plugins/_ml/models/_register ## Connector examples -The following sections contain examples of connectors for popular ML platforms. For a full list of supported connectors, see [Supported connectors](#supported-connectors). +The following sections contain examples of connectors for popular ML platforms. For a full list of supported connectors, see [Supported connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/supported-connectors/). ### OpenAI chat connector @@ -289,20 +277,40 @@ POST /_plugins/_ml/connectors/_create ## Updating connector credentials -In some cases, you may need to update credentials, like `access_key`, that you use to connect to externally hosted models. You can update credentials without undeploying the model by providing the new credentials in the following request: +In some cases, you may need to update credentials, such as `access_key`, used to connect to externally hosted models. To do this without undeploying the model, provide the new credentials in an update request. + +### Connector for a specific model + +To update credentials for a connector linked to a specific model, provide the new credentials in the following request: ```json PUT /_plugins/_ml/models/<model_id> { - "connector": { + "connectors": { "credential": { "openAI_key": "YOUR NEW OPENAI KEY" } } } ``` +{% include copy-curl.html %} + +### Standalone connector + +To update credentials for a standalone connector, provide the new credentials in the following request: + +```json +PUT /_plugins/_ml/connectors/<connector_id> +{ + "credential": { + "openAI_key": "YOUR NEW OPENAI KEY" + } +} +``` +{% include copy-curl.html %} ## Next steps +- For a full list of connector blueprints provided by OpenSearch, see [Supported connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/supported-connectors/). - To learn more about connecting to external models, see [Connecting to externally hosted models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index/). - To learn more about model access control and model groups, see [Model access control]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control/). diff --git a/_ml-commons-plugin/remote-models/guardrails.md b/_ml-commons-plugin/remote-models/guardrails.md index 5330454c8bc..2a907fede7c 100644 --- a/_ml-commons-plugin/remote-models/guardrails.md +++ b/_ml-commons-plugin/remote-models/guardrails.md @@ -636,4 +636,5 @@ OpenSearch responds with an error. ## Next steps -- For more information about configuring guardrails, see [The `guardrails` parameter]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/#the-guardrails-parameter). \ No newline at end of file +- For more information about configuring guardrails, see [The `guardrails` parameter]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/#the-guardrails-parameter). +- For a tutorial demonstrating how to use Amazon Bedrock guardrails, see [Using Amazon Bedrock guardrails]({{site.url}}{{site.baseurl}}/vector-search/tutorials/model-controls/bedrock-guardrails/). 
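As a companion to the standalone connector workflow described above, the following is a minimal sketch of registering an externally hosted model that references a standalone connector by ID. The model name and description are placeholders, and `<standalone_connector_id>` is the ID returned when the connector was created:

```json
POST /_plugins/_ml/models/_register
{
  "name": "openAI-gpt-3.5-turbo",
  "function_name": "remote",
  "description": "Example externally hosted model registered with a standalone connector",
  "connector_id": "<standalone_connector_id>"
}
```
{% include copy-curl.html %}

After registration, the returned model ID can be passed to the Deploy API (`POST /_plugins/_ml/models/<model_id>/_deploy`) before sending prediction requests.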
\ No newline at end of file diff --git a/_ml-commons-plugin/remote-models/index.md b/_ml-commons-plugin/remote-models/index.md index ddde42ecece..bf18561c6fd 100644 --- a/_ml-commons-plugin/remote-models/index.md +++ b/_ml-commons-plugin/remote-models/index.md @@ -7,6 +7,7 @@ has_toc: false nav_order: 60 redirect_from: - /ml-commons-plugin/extensibility/index/ + - /ml-commons-plugin/remote-models/ --- # Connecting to externally hosted models @@ -323,7 +324,7 @@ To learn how to use the model for batch ingestion in order to improve ingestion ## Step 7: Use the model for search -To learn how to use the model for vector search, see [Using an ML model for neural search]({{site.url}}{{site.baseurl}}/search-plugins/neural-search/#using-an-ml-model-for-neural-search). +To learn how to use the model for vector search, see [AI search methods]({{site.url}}{{site.baseurl}}/vector-search/ai-search/#ai-search-methods). ## Step 8 (Optional): Undeploy the model diff --git a/_ml-commons-plugin/remote-models/supported-connectors.md b/_ml-commons-plugin/remote-models/supported-connectors.md new file mode 100644 index 00000000000..73ebfeb1c69 --- /dev/null +++ b/_ml-commons-plugin/remote-models/supported-connectors.md @@ -0,0 +1,59 @@ +--- +layout: default +title: Supported connectors +has_children: false +nav_order: 63 +parent: Connecting to externally hosted models +grand_parent: Integrating ML models +redirect_from: + - /ml-commons-plugin/extensibility/supported-connectors/ +--- + +# Supported connectors + +OpenSearch provides two types of connector blueprints: + +- **Standard blueprints (recommended for OpenSearch 2.14+)**: + + - Designed for connectors that pass input directly to the model and return its raw output without requiring additional transformations. + - Use a registered model without any pre-processing or post-processing functions. + - Compatible with machine learning (ML) inference processors: [ingest processors]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/processors/ml-inference/), [search request processors]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/ml-inference-search-request/), and [search response processors]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/ml-inference-search-response/). + - Ideal for new implementations in OpenSearch 2.14 and later. + +- **Legacy blueprints**: + + - Suitable for specific input and output formatting requirements. + - Include pre- and post-processing functions. + - Recommended for existing implementations. + - Compatible with [neural search]({{site.url}}{{site.baseurl}}/neural-search-plugin/index/) queries. + +The following table provides a comprehensive list of connector blueprints available in OpenSearch. Each blueprint link offers an example request for creating the connector, including all necessary parameters, and a sample Predict API request. 
+ +| Platform | Model | Legacy blueprint | Standard blueprint | +|:-------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| [Aleph Alpha](https://aleph-alpha.com/) | [Luminous-Base Embedding Model](https://docs.aleph-alpha.com/api/semantic-embed/) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/aleph_alpha_connector_luminous_base_embedding_blueprint.md) | N/A | +| [Amazon Bedrock](https://aws.amazon.com/bedrock/) | [AI21 Labs Jurassic-2 Mid](https://aws.amazon.com/bedrock/jurassic/) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/bedrock_connector_ai21labs_jurassic_blueprint.md) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/bedrock_connector_ai21labs_jurassic_blueprint.md) | +| [Amazon Bedrock](https://aws.amazon.com/bedrock/) | [Anthropic Claude v2](https://aws.amazon.com/bedrock/claude/) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/bedrock_connector_anthropic_claude_blueprint.md) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/bedrock_connector_anthropic_claude_blueprint.md) | +| [Amazon Bedrock](https://aws.amazon.com/bedrock/) | [Anthropic Claude v3](https://aws.amazon.com/bedrock/claude/) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/bedrock_connector_anthropic_claude3_blueprint.md) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/bedrock_connector_anthropic_claude3_blueprint.md) | +| [Amazon Bedrock](https://aws.amazon.com/bedrock/) | [Anthropic Claude v3.7](https://aws.amazon.com/bedrock/claude/) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/bedrock_connector_anthropic_claude3.7_blueprint.md) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/bedrock_connector_anthropic_claude3.7_blueprint.md) | +| [Amazon Bedrock](https://aws.amazon.com/bedrock/) | [Cohere Embed Model v3 - English](https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-embed.html) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/bedrock_connector_cohere_cohere.embed-english-v3_blueprint.md) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/standard_blueprints/bedrock_connector_cohere_cohere.embed-english-v3_standard_blueprint.md) | +| [Amazon Bedrock](https://aws.amazon.com/bedrock/) | [Cohere Embed Model v3 - Multilingual](https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-embed.html) | 
[Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/bedrock_connector_cohere_cohere.embed-multilingual-v3_blueprint.md) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/standard_blueprints/bedrock_connector_cohere_cohere.embed-multilingual-v3_standard_blueprint.md) | +| [Amazon Bedrock](https://aws.amazon.com/bedrock/) | [Titan Text Embeddings](https://docs.aws.amazon.com/bedrock/latest/userguide/titan-embedding-models.html) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/bedrock_connector_titan_embedding_blueprint.md) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/standard_blueprints/bedrock_connector_titan_embedding_standard_blueprint.md) | +| [Amazon Bedrock](https://aws.amazon.com/bedrock/) | [Titan Multimodal Embeddings](https://docs.aws.amazon.com/bedrock/latest/userguide/titan-multiemb-models.html) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/bedrock_connector_titan_multimodal_embedding_blueprint.md) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/standard_blueprints/bedrock_connector_titan_multimodal_embedding_standard_blueprint.md) | +| [Amazon Bedrock Converse](https://docs.aws.amazon.com/bedrock/latest/APIReference/API_runtime_Converse.html) | [Anthropic Claude 3 Sonnet](https://aws.amazon.com/bedrock/claude/) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/bedrock_connector_converse_blueprint.md) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/bedrock_connector_converse_blueprint.md) | +| [Amazon Comprehend](https://aws.amazon.com/comprehend/) | [Metadata Embedding Model](https://docs.aws.amazon.com/comprehend/) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/amazon_comprehend_connector_blueprint.md) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/amazon_comprehend_connector_blueprint.md) | +| [Amazon Textract](https://aws.amazon.com/textract/) | [Amazon Textract Model](https://docs.aws.amazon.com/textract/) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/amazon_textract_connector_blueprint.md) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/amazon_textract_connector_blueprint.md) | +| [Amazon SageMaker](https://aws.amazon.com/sagemaker/) | [Text embedding models for batch inference](https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_CreateModel.html) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/batch_inference_sagemaker_connector_blueprint.md) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/batch_inference_sagemaker_connector_blueprint.md) | +| [Amazon SageMaker](https://aws.amazon.com/sagemaker/) | [Text embedding models](https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_CreateModel.html) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/sagemaker_connector_blueprint.md) | 
[Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/sagemaker_connector_blueprint.md) | +| [Azure OpenAI](https://azure.microsoft.com/en-us/products/ai-services/openai-service) | [Chat models](https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models?tabs=global-standard%2Cstandard-chat-completions#gpt-4) (for example, `gpt-4`) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/azure_openai_connector_chat_blueprint.md) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/azure_openai_connector_chat_blueprint.md) | +| [Azure OpenAI](https://azure.microsoft.com/en-us/products/ai-services/openai-service) | [Text embedding models](https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#embeddings) (for example, `text-embedding-ada-002`) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/azure_openai_connector_embedding_blueprint.md) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/aleph_alpha_connector_luminous_base_embedding_blueprint.md) | +| [Cohere](https://cohere.com/) | [Chat models](https://docs.cohere.com/reference/chat) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/cohere_connector_chat_blueprint.md) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/cohere_connector_chat_blueprint.md) | +| [Cohere](https://cohere.com/) | [Multimodal embedding models](https://docs.cohere.com/reference/embed) (for example, `embed-english-v3.0`) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/cohere_connector_image_embedding_blueprint.md) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/standard_blueprints/cohere_connector_image_embedding_standard_blueprint.md) | +| [Cohere](https://cohere.com/) | [Text embedding models](https://docs.cohere.com/reference/embed) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/cohere_connector_embedding_blueprint.md) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/standard_blueprints/cohere_connector_text_embedding_standard_blueprint.md) | +| [DeepSeek](https://www.deepseek.com/) | [Chat model](https://api-docs.deepseek.com/api/create-chat-completion) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/deepseek_connector_chat_blueprint.md) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/deepseek_connector_chat_blueprint.md) | +| [Google Cloud Platform](https://cloud.google.com/) | [VertexAI Embedding Model](https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/gcp_vertexai_connector_embedding_blueprint.md) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/gcp_vertexai_connector_embedding_blueprint.md) | +| [OpenAI](https://openai.com/) | [Chat models](https://platform.openai.com/docs/models) (for example, `gpt-3.5-turbo`) | 
[Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/open_ai_connector_chat_blueprint.md) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/open_ai_connector_chat_blueprint.md) | +| [OpenAI](https://openai.com/) | [Completion models](https://platform.openai.com/docs/models) (for example, `text-davinci-003`) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/open_ai_connector_completion_blueprint.md) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/open_ai_connector_completion_blueprint.md) | +| [OpenAI](https://openai.com/) | [Text embedding models](https://platform.openai.com/docs/models#embeddings) (for example, `text-embedding-ada-002`) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/openai_connector_embedding_blueprint.md) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/standard_blueprints/openai_connector_embedding_standard_blueprint.md) | +| [OpenAI](https://openai.com/) | [Text embedding models for batch inference](https://platform.openai.com/docs/guides/batch/overview#model-availability) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/batch_inference_openAI_connector_blueprint.md) | [Blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/batch_inference_openAI_connector_blueprint.md) | + diff --git a/_ml-commons-plugin/tutorials/index.md b/_ml-commons-plugin/tutorials/index.md deleted file mode 100644 index 070da3cae1d..00000000000 --- a/_ml-commons-plugin/tutorials/index.md +++ /dev/null @@ -1,27 +0,0 @@ ---- -layout: default -title: Tutorials -has_children: true -has_toc: false -nav_order: 140 ---- - -# Tutorials - -Using the OpenSearch machine learning (ML) framework, you can build various applications, from implementing conversational search to building your own chatbot. 
To learn more, explore the following ML tutorials: - -- **Semantic search**: - - [Generating embeddings for arrays of objects]({{site.url}}{{site.baseurl}}/ml-commons-plugin/tutorials/generate-embeddings/) - - [Semantic search using byte-quantized vectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/tutorials/semantic-search-byte-vectors/) - -- **Conversational search**: - - [Conversational search using the Cohere Command model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/tutorials/conversational-search-cohere/) - -- **Reranking search results**: - - [Reranking search results using the Cohere Rerank model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/tutorials/reranking-cohere/) - - [Reranking search results using the MS MARCO cross-encoder model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/tutorials/reranking-cross-encoder/) - -- **Agents and tools**: - - [Retrieval-augmented generation (RAG) chatbot]({{site.url}}{{site.baseurl}}/ml-commons-plugin/tutorials/rag-chatbot/) - - [RAG with a conversational flow agent]({{site.url}}{{site.baseurl}}/ml-commons-plugin/tutorials/rag-conversational-agent/) - - [Build your own chatbot]({{site.url}}{{site.baseurl}}/ml-commons-plugin/tutorials/build-chatbot/) \ No newline at end of file diff --git a/_ml-commons-plugin/using-ml-models.md b/_ml-commons-plugin/using-ml-models.md index db50626721b..d7a7599262b 100644 --- a/_ml-commons-plugin/using-ml-models.md +++ b/_ml-commons-plugin/using-ml-models.md @@ -3,25 +3,36 @@ layout: default title: Using ML models within OpenSearch parent: Integrating ML models has_children: true +has_toc: false nav_order: 50 redirect_from: - /ml-commons-plugin/model-serving-framework/ - /ml-commons-plugin/ml-framework/ +models: + - heading: "Pretrained models provided by OpenSearch" + link: "/ml-commons-plugin/pretrained-models/" + description: "Explore OpenSearch's collection of optimized ML models for immediate use in AI applications" + - heading: "Custom models" + link: "/ml-commons-plugin/custom-local-models/" + description: "Learn how to upload and serve your own ML models in OpenSearch for specialized use cases" +gpu: + - heading: "GPU acceleration" + link: "/ml-commons-plugin/gpu-acceleration/" + description: "Take advantage of GPU acceleration on your ML node for better performance" --- # Using ML models within OpenSearch **Introduced 2.9** {: .label .label-purple } -To integrate machine learning (ML) models into your OpenSearch cluster, you can upload and serve them locally. Choose one of the following options: +To integrate machine learning (ML) models into your OpenSearch cluster, you can upload and serve them locally. Choose one of the following options. -- **Pretrained models provided by OpenSearch**: To learn more, see [OpenSearch-provided pretrained models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/). For a list of supported models, see [Supported pretrained models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#supported-pretrained-models). +{% include cards.html cards=page.models %} -- **Custom models** such as PyTorch deep learning models: To learn more, see [Custom models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/custom-local-models/). +For production environments, run local models on dedicated ML nodes rather than data nodes. For more information, see [Run tasks and models on ML nodes only]({{site.url}}{{site.baseurl}}/ml-commons-plugin/cluster-settings/#run-tasks-and-models-on-ml-nodes-only). 
+{: .important} Running local models on the CentOS 7 operating system is not supported. Moreover, not all local models can run on all hardware and operating systems. {: .important} -## GPU acceleration - -For better performance, you can take advantage of GPU acceleration on your ML node. For more information, see [GPU acceleration]({{site.url}}{{site.baseurl}}/ml-commons-plugin/gpu-acceleration/). +{% include cards.html cards=page.gpu %} diff --git a/_monitoring-your-cluster/metrics/getting-started.md b/_monitoring-your-cluster/metrics/getting-started.md index 659614a07c2..9b417eae4ef 100644 --- a/_monitoring-your-cluster/metrics/getting-started.md +++ b/_monitoring-your-cluster/metrics/getting-started.md @@ -35,11 +35,11 @@ The `enable` flag is toggled using a Java Virtual Machine (JVM) parameter that i cd \path\to\opensearch ``` -2. Open your `opensearch.yaml` file. -3. Add the following setting to `opensearch.yaml`: +2. Open your `opensearch.yml` file. +3. Add the following setting to `opensearch.yml`: - ```bash - opensearch.experimental.feature.telemetry.enabled=true + ```yaml + opensearch.experimental.feature.telemetry.enabled: true ``` {% include copy.html %} @@ -73,7 +73,7 @@ export OPENSEARCH_JAVA_OPTS="-Dopensearch.experimental.feature.telemetry.enabled ### Enable with Docker -If you’re running OpenSearch using Docker, add the following line to `docker-compose.yml` under `environment`: +If you're running OpenSearch using Docker, add the following line to `docker-compose.yml` under `environment`: ```bash OPENSEARCH_JAVA_OPTS="-Dopensearch.experimental.feature.telemetry.enabled=true" @@ -105,3 +105,17 @@ The metrics framework feature supports the following metric types: 2. **UpDown counters:** UpDown counters can be incremented with positive values or decremented with negative values. UpDown counters are well suited for tracking metrics like open connections, active requests, and other fluctuating quantities. 3. **Histograms:** Histograms are valuable tools for visualizing the distribution of continuous data. Histograms offer insight into the central tendency, spread, skewness, and potential outliers that might exist in your metrics. Patterns such as normal distribution, skewed distribution, or bimodal distribution can be readily identified, making histograms ideal for analyzing latency metrics and assessing percentiles. 4. **Asynchronous Gauges:** Asynchronous gauges capture the current value at the moment a metric is read. These metrics are non-additive and are commonly used to measure CPU utilization on a per-minute basis, memory utilization, and other real-time values. + +## Monitoring machine learning workflows +Introduced 3.1 +{: .label .label-purple } + +OpenSearch provides enhanced observability for [machine learning (ML)]({{site.url}}{{site.baseurl}}/ml-commons-plugin/) workflows. Metrics related to ML operations are pushed directly to the core metrics registry, giving you improved visibility into model usage and performance. Additionally, every 5 minutes, a periodic job collects and exports state data, helping you monitor the health and activity of your ML workloads over time. 
+ +To enable ML observability, specify the following settings in `opensearch.yml`: + +```yaml +plugins.ml_commons.metrics_collection_enabled: true +plugins.ml_commons.metrics_static_collection_enabled: true +``` +{% include copy.html %} diff --git a/_observing-your-data/ad/index.md b/_observing-your-data/ad/index.md index 657c3c90cb4..a3e0c410ec4 100644 --- a/_observing-your-data/ad/index.md +++ b/_observing-your-data/ad/index.md @@ -28,104 +28,149 @@ To get started, go to **OpenSearch Dashboards** > **OpenSearch Plugins** > **Ano A _detector_ is an individual anomaly detection task. You can define multiple detectors, and all detectors can run simultaneously, with each analyzing data from different sources. You can define a detector by following these steps: 1. On the **Anomaly detection** page, select the **Create detector** button. -2. On the **Define detector** page, enter the required information in the **Detector details** pane. -3. In the **Select data** pane, specify the data source by choosing a source from the **Index** dropdown menu. You can choose an index, index patterns, or an alias. -4. (Optional) Filter the data source by selecting **Add data filter** and then entering the conditions for **Field**, **Operator**, and **Value**. Alternatively, you can choose **Use query DSL** and add your JSON filter query. Only [Boolean queries]({{site.url}}{{site.baseurl}}/query-dsl/compound/bool/) are supported for query domain-specific language (DSL). +2. On the **Define detector** page, add the detector details. Enter a name and a brief description. The name must be unique and descriptive enough to help you identify the detector's purpose. -#### Example: Filtering data using query DSL +3. In the **Select data** pane, specify the data source by choosing one or more sources from the **Index** dropdown menu. You can select indexes, index patterns, or aliases. + + - Detectors can use remote indexes, which you can access using the `cluster-name:index-name` pattern. For more information, see [Cross-cluster search]({{site.url}}{{site.baseurl}}/search-plugins/cross-cluster-search/). Starting in OpenSearch Dashboards 2.17, you can also select clusters and indexes directly. If the Security plugin is enabled, see [Selecting remote indexes with fine-grained access control]({{site.url}}{{site.baseurl}}/observing-your-data/ad/security/#selecting-remote-indexes-with-fine-grained-access-control) in the [Anomaly detection security]({{site.url}}{{site.baseurl}}/observing-your-data/ad/security/) documentation. + + - To create a cross-cluster detector in OpenSearch Dashboards, you must have the following [permissions]({{site.url}}{{site.baseurl}}/security/access-control/permissions/): `indices:data/read/field_caps`, `indices:admin/resolve/index`, and `cluster:monitor/remote/info`. + +4. (Optional) Filter the data source by selecting **Add data filter** and then specifying the conditions for **Field**, **Operator**, and **Value**. Alternatively, select **Use query DSL** and enter your filter as a JSON-formatted [Boolean query]({{site.url}}{{site.baseurl}}/query-dsl/compound/bool/). Only Boolean queries are supported for query domain-specific language (DSL). + + + + +### Example: Filtering data using query DSL The following example query retrieves documents in which the `urlPath.keyword` field matches any of the specified values: -======= -1. Choose **Create detector**. -1. Add in the detector details. - - Enter a name and brief description. 
Make sure the name is unique and descriptive enough to help you to identify the purpose of the detector. -1. Specify the data source. - - For **Data source**, choose one or more indexes to use as the data source. Alternatively, you can use an alias or index pattern to choose multiple indexes. - - Detectors can use remote indexes. You can access them using the `cluster-name:index-name` pattern. See [Cross-cluster search]({{site.url}}{{site.baseurl}}/search-plugins/cross-cluster-search/) for more information. Alternatively, you can select clusters and indexes in OpenSearch Dashboards 2.17 or later. To learn about configuring remote indexes with the Security plugin enabled, see [Selecting remote indexes with fine-grained access control]({{site.url}}{{site.baseurl}}/observing-your-data/ad/security/#selecting-remote-indexes-with-fine-grained-access-control) in the [Anomaly detection security](observing-your-data/ad/security/) documentation. - - (Optional) For **Data filter**, filter the index you chose as the data source. From the **Data filter** menu, choose **Add data filter**, and then design your filter query by selecting **Field**, **Operator**, and **Value**, or choose **Use query DSL** and add your own JSON filter query. Only [Boolean queries]({{site.url}}{{site.baseurl}}/query-dsl/compound/bool/) are supported for query domain-specific language (DSL). - -To create a cross-cluster detector in OpenSearch Dashboards, the following [permissions]({{site.url}}{{site.baseurl}}/security/access-control/permissions/) are required: `indices:data/read/field_caps`, `indices:admin/resolve/index`, and `cluster:monitor/remote/info`. -{: .note} - - - /domain/{id}/short - - /sub_dir/{id}/short - - /abcd/123/{id}/xyz - - ```json - { - "bool": { - "should": [ - { - "term": { - "urlPath.keyword": "/domain/{id}/short" - } - }, - { - "term": { - "urlPath.keyword": "/sub_dir/{id}/short" - } - }, - { - "term": { - "urlPath.keyword": "/abcd/123/{id}/xyz" - } - } - ] - } - } - ``` - {% include copy-curl.html %} - -5. In the **Timestamp** pane, select a field from the **Timestamp field** dropdown menu. - -6. In the **Operation settings** pane, define the **Detector interval**, which is the interval at which the detector collects data. - - The detector aggregates the data at this interval and then feeds the aggregated result into the anomaly detection model. The shorter the interval, the fewer data points the detector aggregates. The anomaly detection model uses a shingling process, a technique that uses consecutive data points to create a sample for the model. This process requires a certain number of aggregated data points from contiguous intervals. - - You should set the detector interval based on your actual data. If the detector interval is too long, then it might delay the results. If the detector interval is too short, then it might miss some data. The detector interval also will not have a sufficient number of consecutive data points for the shingle process. - - (Optional) To add extra processing time for data collection, specify a **Window delay** value. - - This value tells the detector that the data is not ingested into OpenSearch in real time but with a certain delay. Set the window delay to shift the detector interval to account for this delay. - - For example, the detector interval is 10 minutes and data is ingested into your cluster with a general delay of 1 minute. Assume the detector runs at 2:00. 
The detector attempts to get the last 10 minutes of data from 1:50 to 2:00, but because of the 1-minute delay, it only gets 9 minutes of data and misses the data from 1:59 to 2:00. Setting the window delay to 1 minute shifts the interval window to 1:49--1:59, so the detector accounts for all 10 minutes of the detector interval time. - - To avoid missing any data, set the **Window delay** to the upper limit of the expected ingestion delay. This ensures that the detector captures all data during its interval, reducing the risk of missing relevant information. While a longer window delay helps capture all data, too long of a window delay can hinder real-time anomaly detection because the detector will look further back in time. Find a balance to maintain both data accuracy and timely detection. - -7. Specify a custom results index. - - The Anomaly Detection plugin allows you to store anomaly detection results in a custom index of your choice. Select **Enable custom results index** and provide a name for your index, for example, `abc`. The plugin then creates an alias prefixed with `opensearch-ad-plugin-result-` followed by your chosen name, for example, `opensearch-ad-plugin-result-abc`. This alias points to an actual index with a name containing the date and a sequence number, such as `opensearch-ad-plugin-result-abc-history-2024.06.12-000002`, where your results are stored. - - You can use `-` to separate the namespace to manage custom results index permissions. For example, if you use `opensearch-ad-plugin-result-financial-us-group1` as the results index, you can create a permission role based on the pattern `opensearch-ad-plugin-result-financial-us-*` to represent the `financial` department at a granular level for the `us` group. - {: .note } - - - When the Security plugin (fine-grained access control) is enabled, the default results index becomes a system index and is no longer accessible through the standard Index or Search APIs. To access its content, you must use the Anomaly Detection RESTful API or the dashboard. As a result, you cannot build customized dashboards using the default results index if the Security plugin is enabled. However, you can create a custom results index in order to build customized dashboards. - - If the custom index you specify does not exist, the Anomaly Detection plugin will create it when you create the detector and start your real-time or historical analysis. - - If the custom index already exists, the plugin will verify that the index mapping matches the required structure for anomaly results. In this case, ensure that the custom index has a valid mapping as defined in the [`anomaly-results.json`](https://github.com/opensearch-project/anomaly-detection/blob/main/src/main/resources/mappings/anomaly-results.json) file. - - To use the custom results index option, you must have the following permissions: - - `indices:admin/create` -- The `create` permission is required in order to create and roll over the custom index. - - `indices:admin/aliases` -- The `aliases` permission is required in order to create and manage an alias for the custom index. - - `indices:data/write/index` -- The `write` permission is required in order to write results into the custom index for a single-entity detector. - - `indices:data/read/search` -- The `search` permission is required in order to search custom results indexes to show results on the Anomaly Detection interface. - - `indices:data/write/delete` -- The detector may generate many anomaly results. 
The `delete` permission is required in order to delete old data and save disk space. - - `indices:data/write/bulk*` -- The `bulk*` permission is required because the plugin uses the Bulk API to write results into the custom index. - - When managing the custom results index, consider the following: - - The anomaly detection dashboard queries all detector results from all custom results indexes. Having too many custom results indexes can impact the plugin's performance. - - You can use [Index State Management]({{site.url}}{{site.baseurl}}/im-plugin/ism/index/) to roll over old results indexes. You can also manually delete or archive any old results indexes. Reusing a custom results index for multiple detectors is recommended. - - The plugin provides lifecycle management for custom indexes. It rolls over an alias to a new index when the custom results index meets any of the conditions in the following table. - - Parameter | Description | Type | Unit | Example | Required - :--- | :--- |:--- |:--- |:--- |:--- - `result_index_min_size` | The minimum total primary shard size (excluding replicas) required for index rollover. If set to 100 GiB and the index has 5 primary and 5 replica shards of 20 GiB each, then the total primary shard size is 100 GiB, triggering the rollover. | `integer` | `MB` | `51200` | No - `result_index_min_age` | The minimum index age required for rollover, calculated from its creation time to the current time. | `integer` |`day` | `7` | No - `result_index_ttl` | The minimum age required to permanently delete rolled-over indexes. | `integer` | `day` | `60` | No - -8. Choose **Next**. + +```json + { + "bool": { + "should": [ + { + "term": { + "urlPath.keyword": "/domain/{id}/short" + } + }, + { + "term": { + "urlPath.keyword": "/sub_dir/{id}/short" + } + }, + { + "term": { + "urlPath.keyword": "/abcd/123/{id}/xyz" + } + } + ] + } + } +``` +{% include copy-curl.html %} + +#### Setting the detector interval + +In the **Timestamp** pane, select a field from the **Timestamp field** dropdown menu. + +Then, in the **Operation settings** pane, use the following best practices to define the **Detector interval**, which is the interval at which the detector collects data: + +- The detector aggregates the data at this interval and then feeds the aggregated result into the anomaly detection model. The shorter the interval, the fewer data points the detector aggregates. The anomaly detection model uses a shingling process, a technique that uses consecutive data points to create a sample for the model. This process requires a certain number of aggregated data points from contiguous intervals. +- You should set the detector interval based on your actual data. If the detector interval is too long, then it might delay the results. If the detector interval is too short, then it might miss some data. The detector interval also will not have a sufficient number of consecutive data points for the shingle process. +- (Optional) To add extra processing time for data collection, specify a **Window delay** value. + - This value tells the detector that the data is not ingested into OpenSearch in real time but with a certain delay. Set the window delay to shift the detector interval to account for this delay. + - For example, the detector interval is 10 minutes and data is ingested into your cluster with a general delay of 1 minute. Assume the detector runs at 2:00. 
The detector attempts to get the last 10 minutes of data from 1:50 to 2:00, but because of the 1-minute delay, it only gets 9 minutes of data and misses the data from 1:59 to 2:00. Setting the window delay to 1 minute shifts the interval window to 1:49--1:59, so the detector accounts for all 10 minutes of the detector interval time. + - To avoid missing any data, set the **Window delay** to the upper limit of the expected ingestion delay. This ensures that the detector captures all data during its interval, reducing the risk of missing relevant information. While a longer window delay helps capture all data, too long of a window delay can hinder real-time anomaly detection because the detector will look further back in time. Find a balance that maintains both data accuracy and timely detection. + +#### Specifying a custom results index + +The Anomaly Detection plugin allows you to store anomaly detection results in a custom index of your choice. Select **Enable custom results index** and provide a name for your index, for example, `abc`. The plugin then creates an alias prefixed with `opensearch-ad-plugin-result-` followed by your chosen name, for example, `opensearch-ad-plugin-result-abc`. This alias points to an actual index with a name containing the date and a sequence number, such as `opensearch-ad-plugin-result-abc-history-2024.06.12-000002`, where your results are stored. + +You can use `-` to separate the namespace to manage custom results index permissions. For example, if you use `opensearch-ad-plugin-result-financial-us-group1` as the results index, you can create a permission role based on the pattern `opensearch-ad-plugin-result-financial-us-*` to represent the `financial` department at a granular level for the `us` group. +{: .note } + +##### Permissions + +When the Security plugin (fine-grained access control) is enabled, the default results index becomes a system index and is no longer accessible through the standard Index or Search APIs. To access its content, you must use the Anomaly Detection RESTful API or the dashboard. As a result, you cannot build customized dashboards using the default results index if the Security plugin is enabled. However, you can create a custom results index in order to build customized dashboards. + +If the custom index you specify does not exist, the Anomaly Detection plugin will create it when you create the detector and start your real-time or historical analysis. + +If the custom index already exists, the plugin will verify that the index mapping matches the required structure for anomaly results. In this case, ensure that the custom index has a valid mapping as defined in the [`anomaly-results.json`](https://github.com/opensearch-project/anomaly-detection/blob/main/src/main/resources/mappings/anomaly-results.json) file. +To use the custom results index option, you must have the following permissions: + +- `indices:admin/create` -- The `create` permission is required in order to create and roll over the custom index. +- `indices:admin/aliases` -- The `aliases` permission is required in order to create and manage an alias for the custom index. +- `indices:data/write/index` -- The `write` permission is required in order to write results into the custom index for a single-entity detector. +- `indices:data/read/search` -- The `search` permission is required in order to search custom results indexes to show results on the Anomaly Detection interface. +- `indices:data/write/delete` -- The detector may generate many anomaly results. 
The `delete` permission is required in order to delete old data and save disk space. +- `indices:data/write/bulk*` -- The `bulk*` permission is required because the plugin uses the Bulk API to write results into the custom index. + +##### Flattening nested fields + +Custom results index mappings with nested fields pose aggregation and visualization challenges. The **Enable flattened custom result index** option flattens the nested fields in the custom results index. When selecting this option, the plugin creates a separate index prefixed with the custom results index name and detector name. For example, if the detector `Test` uses the custom results index `abc`, a separate index with the alias `opensearch-ad-plugin-result-abc-flattened-test` will store the anomaly detection results with nested fields flattened. + +In addition to creating a separate index, the plugin also sets up an ingest pipeline with a script processor. This pipeline is bound to the separate index and uses a Painless script to flatten all nested fields in the custom results index. + +Deactivating this option on a running detector removes its flattening ingest pipeline; the pipeline also ceases to be the default pipeline for the flattened results index. +When using the flattened custom result index option, consider the following: + +- The Anomaly Detection plugin constructs the index name based on the custom results index and detector name, and because the detector name is editable, conflicts can occur. If a conflict occurs, the plugin reuses the index name. +- When managing the custom results index, consider the following: + - The Anomaly Detection dashboard queries all detector results from all custom results indexes. Having too many custom results indexes can impact the plugin's performance. + - You can use [Index State Management]({{site.url}}{{site.baseurl}}/im-plugin/ism/index/) to roll over old results indexes. You can also manually delete or archive any old results indexes. Reusing a custom results index for multiple detectors is recommended. + +The plugin rolls over an alias to a new index when the custom results index meets any of the conditions in the following table. + +Parameter | Description | Type | Unit | Example | Required +:--- | :--- |:--- |:--- |:--- |:--- +`result_index_min_size` | The minimum total primary shard size (excluding replicas) required for index rollover. When set to 100 GiB with an index that has 5 primary and 5 replica shards of 20 GiB each, the rollover runs. | `integer` | `MB` | `51200` | No +`result_index_min_age` | The minimum index age required for the rollover, calculated from its creation time to the current time. | `integer` |`day` | `7` | No +`result_index_ttl` | The minimum age required in order to delete rolled-over indexes. | `integer` | `day` | `60` | No + +#### Next step + +After defining your detector settings, choose **Next**. After you define the detector, the next step is to configure the model. ## Step 2: Configure the model -1. Add features to your detector. +Add features to your detector. A _feature_ is an aggregation of a field or a Painless script. A detector can discover anomalies across one or more features. -A _feature_ is any field in your index that you want to analyze for anomalies. A detector can discover anomalies across one or more features. You must choose an aggregation method for each feature: `average()`, `count()`, `sum()`, `min()`, or `max()`. The aggregation method determines what constitutes an anomaly. 
+You must choose an aggregation method for each feature: `average()`, `count()`, `sum()`, `min()`, or `max()`. The aggregation method determines what constitutes an anomaly. For example, if you choose `min()`, the detector focuses on finding anomalies based on the minimum values of your feature. If you choose `average()`, the detector finds anomalies based on the average values of your feature. -For example, if you choose `min()`, the detector focuses on finding anomalies based on the minimum values of your feature. If you choose `average()`, the detector finds anomalies based on the average values of your feature. +You can also use [custom JSON aggregation queries](#configuring-a-model-based-on-a-json-aggregation-query) as an aggregation method. For more information about creating JSON aggregation queries, see [Query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index/). -A multi-feature model correlates anomalies across all its features. The [curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality) makes it less likely that multi-feature models will identify smaller anomalies as compared to a single-feature model. Adding more features can negatively impact the [precision and recall](https://en.wikipedia.org/wiki/Precision_and_recall) of a model. A higher proportion of noise in your data can further amplify this negative impact. Selecting the optimal feature set is usually an iterative process. By default, the maximum number of features for a detector is `5`. You can adjust this limit using the `plugins.anomaly_detection.max_anomaly_features` setting. + +For each configured feature, you can also select the anomaly criteria. By default, the model detects an anomaly when the actual value is either abnormally higher or lower than the expected value. However, you can customize your feature settings so that anomalies are only registered when the actual value is higher than the expected value (indicating a spike in the data) or lower than the expected value (indicating a dip in the data). For example, when creating a detector for the `cpu_utilization` field, you may choose to register anomalies only when the value spikes in order to reduce alert fatigue. + + +### Suppressing anomalies with threshold-based rules + +In the **Feature selection** pane, you can suppress anomalies by setting rules that define acceptable differences between the expected and actual values, either as an absolute value or a relative percentage. This helps reduce false anomalies caused by minor fluctuations, allowing you to focus on significant deviations. + +To suppress anomalies for deviations of less than 30% from the expected value, you can set the following rules in the feature selection pane: + +- Ignore anomalies when the actual value is no more than 30% above the expected value. +- Ignore anomalies when the actual value is no more than 30% below the expected value. + +The following image shows the pane for a feature named `LogVolume`, where you can set the relative deviation percentage settings: + +<img src="{{site.url}}{{site.baseurl}}/images/anomaly-detection/add-feature-with-relative-rules.png" alt="Interface of adding a feature with suppression rules" width="800" height="800"> + +If you expect that the log volume should differ by at least 10,000 from the expected value before being considered an anomaly, you can set the following absolute thresholds: + +- Ignore anomalies when the actual value is no more than 10,000 above the expected value. 
+- Ignore anomalies when the actual value is no more than 10,000 below the expected value. + +The following image shows the pane for a feature named `LogVolume`, where you can set the absolute threshold settings: + +<img src="{{site.url}}{{site.baseurl}}/images/anomaly-detection/add-suppression-rules-absolute.png" alt="Interface of adding suppression rules with absolute rules" width="800" height="800"> + +If no custom suppression rules are set, then the system defaults to a filter that ignores anomalies with deviations of less than 20% from the expected value for each enabled feature. + +A multi-feature model correlates anomalies across all of its features. The [curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality) makes it less likely that a multi-feature model will identify smaller anomalies as compared to a single-feature model. Adding more features can negatively impact the [precision and recall](https://en.wikipedia.org/wiki/Precision_and_recall) of a model. A higher proportion of noise in your data can further amplify this negative impact. To select the optimal feature set limit for anomalies, we recommend an iterative process of testing different limits. By default, the maximum number of features for a detector is `5`. To adjust this limit, use the `plugins.anomaly_detection.max_anomaly_features` setting. {: .note} ### Configuring a model based on an aggregation method @@ -199,30 +244,6 @@ Using these options can improve recall in anomaly detection. For instance, if yo Be cautious when imputing extensively missing data, as excessive gaps can compromise model accuracy. Quality input is critical---poor data quality leads to poor model performance. The confidence score also decreases when imputations occur. You can check whether a feature value has been imputed using the `feature_imputed` field in the anomaly results index. See [Anomaly result mapping]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/result-mapping/) for more information. {: note} -### Suppressing anomalies with threshold-based rules - -In the **Advanced settings** pane, you can suppress anomalies by setting rules that define acceptable differences between the expected and actual values, either as an absolute value or a relative percentage. This helps reduce false anomalies caused by minor fluctuations, allowing you to focus on significant deviations. - -Suppose you want to detect substantial changes in log volume while ignoring small variations that are not meaningful. Without customized settings, the system might generate false alerts for minor changes, making it difficult to identify true anomalies. By setting suppression rules, you can ignore minor deviations and focus on real anomalous patterns. - -To suppress anomalies for deviations of less than 30% from the expected value, you can set the following rules: - -``` -Ignore anomalies for feature logVolume when the actual value is no more than 30% above the expected value. -Ignore anomalies for feature logVolume when the actual value is no more than 30% below the expected value. -``` - -Ensure that a feature, for example, `logVolume`, is properly defined in your model. Suppression rules are tied to specific features. -{: .note} - -If you expect that the log volume should differ by at least 10,000 from the expected value before being considered an anomaly, you can set absolute thresholds: - -``` -Ignore anomalies for feature logVolume when the actual value is no more than 10000 above the expected value. 
-Ignore anomalies for feature logVolume when the actual value is no more than 10000 below the expected value. -``` - -If no custom suppression rules are set, then the system defaults to a filter that ignores anomalies with deviations of less than 20% from the expected value for each enabled feature. ### Previewing sample anomalies @@ -269,11 +290,11 @@ You can analyze anomalies using the following visualizations: - **Confidence** estimate of the probability that the reported anomaly grade matches the expected anomaly grade. Confidence increases as the model observes more data and learns the data behavior and trends. Note that confidence is distinct from model accuracy. - **Last anomaly occurrence** is the time at which the last anomaly occurred. -Underneath **Anomaly overview** or **Anomaly history** are: +The following sections can be found under **Anomaly overview** and **Anomaly history**: - **Feature breakdown** plots the features based on the aggregation method. You can vary the date-time range of the detector. Selecting a point on the feature line chart shows the **Feature output**, the number of times a field appears in your index, and the **Expected value**, a predicted value for the feature output. Where there is no anomaly, the output and expected values are equal. -- **Anomaly occurrences** shows the `Start time`, `End time`, `Data confidence`, and `Anomaly grade` for each detected anomaly. +- **Anomaly occurrences** shows the `Start time`, `End time`, `Data confidence`, and `Anomaly grade` for each detected anomaly. To view logs related to an occurrence in Discover, select the **View in Discover** icon in the **Actions** column. The logs include a 10-minute buffer before and after the start and end times. Selecting a point on the anomaly line chart shows **Feature Contribution**, the percentage of a feature that contributes to the anomaly diff --git a/_observing-your-data/ad/result-mapping.md b/_observing-your-data/ad/result-mapping.md index 967b1856843..619ef6d3cbc 100644 --- a/_observing-your-data/ad/result-mapping.md +++ b/_observing-your-data/ad/result-mapping.md @@ -394,3 +394,162 @@ Field | Description } ``` {% include copy-curl.html %} + +## Flattened anomaly result mapping + +When selecting the **Enable flattened custom result index** option in the **Custom result index** pane, the Anomaly Detection plugin saves the results with all of the nested fields flattened in the index. + +The nested fields stored in the index use the following flattening rules. 
+ +Field | Flattening rule | Example nested input | Example flattened output +:--- | :--- | :--- | :--- +`relevant_attribution` | `relevant_attribution_$FEATURE_NAME_data: $RELEVANT_ATTRIBUTION_FEATURE_DATA` | `relevant_attribution : [{"feature_id": "deny_max1", "data": 0.07339452532666227}]` | `relevant_attribution_deny_max1_data: 0.07339452532666227` +`past_values` | `past_values_$FEATURE_NAME_data: $PAST_VALUES_FEATURE_DATA` | `"past_values": [{"feature_id": "processing_bytes_max", "data": 905}]` | `past_values_processing_bytes_max_data: 905` +`feature_data` | `feature_data_$FEATURE_NAME_data: $FEATURE_DATA_FEATURE_NAME_DATA` | `"feature_data": [{"feature_id": "processing_bytes_max", "feature_name": "processing bytes max", "data": 1360}]` | `feature_data_processing_bytes_max_data: 1360` +`expected_values` | `expected_values_$FEATURE_NAME_data: $EXPECTED_VALUES_FEATURE_DATA` | `"expected_values": [{"likelihood": 1, "value_list": [{"feature_id": "processing_bytes_max", "data": 905}]}]` | `expected_values_processing_bytes_max_data: 905` +`entity` | `entity_$NAME_value: $ENTITY_VALUE ` | `"entity": [{"name": "process_name", "value": "process_3"}]` | `entity_process_name_value: process_3 ` + +For example, when a detector is late in detecting an anomaly, the flattened result appears in the following format: + +```json +{ + "detector_id": "kzcZ43wBgEQAbjDnhzGF", + "confidence": 0.9746820962328963, + "relevant_attribution": [ + { + "feature_id": "deny_max1", + "data": 0.07339452532666227 + }, + { + "feature_id": "deny_avg", + "data": 0.04934972719948845 + }, + { + "feature_id": "deny_min", + "data": 0.01803003656061806 + }, + { + "feature_id": "deny_sum", + "data": 0.14804918212089874 + }, + { + "feature_id": "accept_max5", + "data": 0.7111765287923325 + } + ], + "relevant_attribution_deny_max1_data": 0.07339452532666227, + "relevant_attribution_deny_avg_data": 0.04934972719948845, + "relevant_attribution_deny_min_data": 0.01803003656061806, + "relevant_attribution_deny_sum_data": 0.14804918212089874, + "relevant_attribution_deny_max5_data": 0.7111765287923325, + "task_id": "9Dck43wBgEQAbjDn4zEe", + "threshold": 1, + "model_id": "kzcZ43wBgEQAbjDnhzGF_entity_app_0", + "schema_version": 5, + "anomaly_score": 1.141419389056506, + "execution_start_time": 1635898427803, + "past_values": [ + { + "feature_id": "processing_bytes_max", + "data": 905 + }, + { + "feature_id": "processing_bytes_avg", + "data": 479 + }, + { + "feature_id": "processing_bytes_min", + "data": 128 + }, + { + "feature_id": "processing_bytes_sum", + "data": 1437 + }, + { + "feature_id": "processing_time_max", + "data": 8440 + } + ], + "past_values_processing_bytes_max_data": 905, + "past_values_processing_bytes_avg_data": 479, + "past_values_processing_bytes_min_data": 128, + "past_values_processing_bytes_sum_data": 1437, + "past_values_processing_bytes_max_data": 8440, + "data_end_time": 1635883920000, + "data_start_time": 1635883860000, + "feature_data": [ + { + "feature_id": "processing_bytes_max", + "feature_name": "processing bytes max", + "data": 1360 + }, + { + "feature_id": "processing_bytes_avg", + "feature_name": "processing bytes avg", + "data": 990 + }, + { + "feature_id": "processing_bytes_min", + "feature_name": "processing bytes min", + "data": 608 + }, + { + "feature_id": "processing_bytes_sum", + "feature_name": "processing bytes sum", + "data": 2970 + }, + { + "feature_id": "processing_time_max", + "feature_name": "processing time max", + "data": 9670 + } + ], + "feature_data_processing_bytes_max_data": 1360, + 
"feature_data_processing_bytes_avg_data": 990, + "feature_data_processing_bytes_min_data": 608, + "feature_data_processing_bytes_sum_data": 2970, + "feature_data_processing_time_max_data": 9670, + "expected_values": [ + { + "likelihood": 1, + "value_list": [ + { + "feature_id": "processing_bytes_max", + "data": 905 + }, + { + "feature_id": "processing_bytes_avg", + "data": 479 + }, + { + "feature_id": "processing_bytes_min", + "data": 128 + }, + { + "feature_id": "processing_bytes_sum", + "data": 4847 + }, + { + "feature_id": "processing_time_max", + "data": 15713 + } + ] + } + ], + "expected_values_processing_bytes_max_data": 905, + "expected_values_processing_bytes_avg_data": 479, + "expected_values_processing_bytes_min_data": 128, + "expected_values_processing_bytes_sum_data": 4847, + "expected_values_processing_time_max_data": 15713, + "execution_end_time": 1635898427895, + "anomaly_grade": 0.5514172746375128, + "entity": [ + { + "name": "process_name", + "value": "process_3" + } + ], + "entity_process_name_value": "process_3", + "approx_anomaly_start_time": 1635883620000 +} +``` diff --git a/_observing-your-data/alerting/api.md b/_observing-your-data/alerting/api.md index ea52da552d6..0b915b3be90 100644 --- a/_observing-your-data/alerting/api.md +++ b/_observing-your-data/alerting/api.md @@ -2329,8 +2329,6 @@ POST _plugins/_alerting/destinations/email_groups/_search </details> ## Create comment -This is an experimental feature and is not recommended for use in a production environment. -{: .warning} Add comments to a specific alert, providing additional context or notes related to that alert, using the following request. @@ -2371,8 +2369,6 @@ POST _plugins/_alerting/comments/<alert-id> </details> ## Update comment -This is an experimental feature and is not recommended for use in a production environment. -{: .warning} Modify the content of a previously added comment associated with an alert using the following request. @@ -2414,8 +2410,6 @@ PUT _plugins/_alerting/comments/<comment-id> </details> ## Search comment -This is an experimental feature and is not recommended for use in a production environment. -{: .warning} Query and retrieve existing comments associated with alerts using the following request. @@ -2496,8 +2490,6 @@ GET _plugins/_alerting/comments/_search </details> ## Delete comment -This is an experimental feature and is not recommended for use in a production environment. -{: .warning} Remove a specific comment associated with an alert using the following request. diff --git a/_observing-your-data/alerting/comments.md b/_observing-your-data/alerting/comments.md index fb0630685ce..df4970fe867 100644 --- a/_observing-your-data/alerting/comments.md +++ b/_observing-your-data/alerting/comments.md @@ -10,9 +10,6 @@ redirect_from: # Adding comments -This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/OpenSearch-Dashboards/issues/6999). -{: .warning} - When an alert is generated, add comments to share information about its root cause and facilitate resolution. Comments are enabled by setting `plugins.alerting.comments_enabled` to `true` using the [`cluster/settings` API]({{site.url}}{{site.baseurl}}/observing-your-data/alerting/settings/). Comments can be accessed through the alerts table view by selecting the comment icon within an alert's row. 
From there, comments can be added, edited, or deleted. An Alerting Comments API is also available for programmatic comment management. For more information, see [Alerting API]({{site.url}}{{site.baseurl}}/observing-your-data/alerting/api/). diff --git a/_observing-your-data/alerting/dashboards-alerting.md b/_observing-your-data/alerting/dashboards-alerting.md index 3c7719edfc2..4a5d01dde4f 100644 --- a/_observing-your-data/alerting/dashboards-alerting.md +++ b/_observing-your-data/alerting/dashboards-alerting.md @@ -88,5 +88,5 @@ Once you've created or associated alerting monitors, verify that the monitor is ## Next steps -- [Learn more about the Dashboard application](https://opensearch.org/docs/latest/dashboards/dashboard/index/). -- [Learn more about alerting](https://opensearch.org/docs/latest/observing-your-data/alerting/index/). +- [Learn more about the Dashboard application]({{site.url}}{{site.baseurl}}/dashboards/dashboard/index/). +- [Learn more about alerting]({{site.url}}{{site.baseurl}}/observing-your-data/alerting/index/). diff --git a/_observing-your-data/alerting/per-document-monitors.md b/_observing-your-data/alerting/per-document-monitors.md index c1f4c7bf520..3fbe107e209 100644 --- a/_observing-your-data/alerting/per-document-monitors.md +++ b/_observing-your-data/alerting/per-document-monitors.md @@ -17,6 +17,9 @@ Per document monitors are a type of alert monitor that can be used to identify a - Enforce data quality policies, such as ensuring all documents contain a certain field or that values in a field are within a certain range. - Track changes to a specific document over time, which can be helpful for auditing and compliance purposes +Per document monitors do not support cross-cluster searching. +{: .note} + ## Defining queries Per document monitors allow you to define up to 10 queries that compare a selected field with a desired value. You can define supported field data types using the following operators: diff --git a/_observing-your-data/alerting/per-query-bucket-monitors.md b/_observing-your-data/alerting/per-query-bucket-monitors.md index cb08c494781..d944b575258 100644 --- a/_observing-your-data/alerting/per-query-bucket-monitors.md +++ b/_observing-your-data/alerting/per-query-bucket-monitors.md @@ -13,7 +13,7 @@ Per query monitors are a type of alert monitor that can be used to identify and Per bucket monitors are a type of alert monitor that can be used to identify and alert on specific buckets of data that are created by a query against an OpenSearch index. -Both monitor types support querying remote indexes using the same `cluster-name:index-name` pattern used by [cross-cluster search](https://opensearch.org/docs/latest/security/access-control/cross-cluster-search/) or by using OpenSearch Dashboards 2.12 or later. +Both monitor types support querying remote indexes using the same `cluster-name:index-name` pattern used by [cross-cluster search]({{site.url}}{{site.baseurl}}/security/access-control/cross-cluster-search/) or by using OpenSearch Dashboards 2.12 or later. The following [permissions]({{site.url}}{{site.baseurl}}/security/access-control/permissions/) are required in order to create a cross-cluster monitor through the dashboards UI: `cluster:admin/opensearch/alerting/remote/indexes/get`, `indices:admin/resolve/index`, `cluster:monitor/health`, and `indices:admin/mappings/get`. 
{: .note} diff --git a/_observing-your-data/alerting/triggers.md b/_observing-your-data/alerting/triggers.md index 0cbc5d6ea52..6fb195b54e8 100644 --- a/_observing-your-data/alerting/triggers.md +++ b/_observing-your-data/alerting/triggers.md @@ -145,7 +145,7 @@ Variable | Data type | Description Per bucket and per document monitors support printing sample documents in notification messages. Per document monitors support printing the list of queries that triggered the creation of the finding associated with the alert. When the monitor runs, it adds each new alert to the `ctx` variables, for example, `newAlerts` for per bucket monitors and `alerts` for per document monitors. Each alert has its own list of `sample_documents`, and each per document monitor alert has its own list of `associated_queries`. The message template can be formatted to iterate through the list of alerts, the list of `associated_queries`, and the `sample_documents` for each alert. -An alerting monitor uses the permissions of the user that created it. Be mindful of the Notifications plugin channel to which alert messages are sent and the content of the message mustache template. To learn more about security in the Alerting plugin, see [Alerting security](https://opensearch.org/docs/latest/observing-your-data/alerting/security/). +An alerting monitor uses the permissions of the user that created it. Be mindful of the Notifications plugin channel to which alert messages are sent and the content of the message mustache template. To learn more about security in the Alerting plugin, see [Alerting security]({{site.url}}{{site.baseurl}}/observing-your-data/alerting/security/). {: .note} #### Sample document variables diff --git a/_observing-your-data/forecast/api.md b/_observing-your-data/forecast/api.md new file mode 100644 index 00000000000..e1c0285a0f2 --- /dev/null +++ b/_observing-your-data/forecast/api.md @@ -0,0 +1,1280 @@ +--- +layout: default +title: Forecasting API +parent: Forecasting +nav_order: 100 +--- + +# Forecasting API + +Use these operations to programmatically create and manage forecasters that generate forecasts over your time‑series data. + +--- + +## Table of contents +- TOC +{:toc} + +--- + +## Create forecaster + +**Introduced 3.1** +{: .label .label-purple } + +Creates a forecaster for generating time-series forecasts. A forecaster can be either single-stream (without a category field) or high-cardinality (with one or more category fields). + +When creating a forecaster, you define the source indexes, the forecast interval and horizon, the feature to forecast, and optional parameters such as category fields and a custom result index. + + +### Endpoint + +``` +POST _plugins/_forecast/forecasters +``` + +### Request body fields + +This API supports the following request body fields. + +| Field | Data type | Required | Description | +| :---------------------------- | :------------------ | :------- | :------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | String | Required | The forecaster name. | +| `description` | String | Optional | A free-form description of the forecaster. | +| `time_field` | String | Required | The timestamp field for the source documents. | +| `indices` | String or string\[] | Required | One or more source indexes or index aliases. | +| `feature_attributes` | Array of objects | Required | The feature to forecast. Only one feature is supported. 
Each object must include the `feature_name` and an `aggregation_query`. | +| `forecast_interval` | Object | Required | The interval over which forecasts are generated. | +| `horizon` | Integer | Optional | The number of future intervals to forecast. | +| `window_delay` | Object | Optional | A delay added to account for ingestion latency. | +| `category_field` | String | Optional | One or two fields used to group forecasts by entity. | +| `result_index` | String | Optional | A custom index alias for storing forecast results. Must begin with `opensearch-forecast-result-`. Defaults to `opensearch-forecast-results`. | +| `suggested_seasonality` | Integer | Optional | The seasonal pattern length in intervals. Expected range: 8–256. | +| `recency_emphasis` | Integer | Optional | Controls how much recent data affects the forecast. Defaults to `2560`. | +| `history` | Integer | Optional | The number of past intervals used for model training. | +| `result_index_min_size` | Integer | Optional | The minimum primary shard size (in MB) required to trigger index rollover. | +| `result_index_min_age` | Integer | Optional | The minimum index age (in days) required to trigger index rollover. | +| `result_index_ttl` | Integer | Optional | The minimum amount of time (in days) before rolled-over indexes are deleted. | +| `flatten_custom_result_index` | Boolean | Optional | If `true`, flattens nested fields in the custom result index for easier aggregation. | +| `shingle_size` | Integer | Optional | The number of past intervals used to influence the forecast. Defaults to `8`. Recommended range: 4–128. | + + +### Example request: Single-stream forecaster + +The following example creates a single-stream forecaster for the `network-requests` index. The forecaster predicts the maximum value of the `deny` field every 3 minutes, using the previous 300 intervals for training. The `window_delay` setting accounts for ingest latency by delaying the forecast window by 3 minutes: + + +```json +POST _plugins/_forecast/forecasters +{ + "name": "Second-Test-Forecaster-7", + "description": "ok rate", + "time_field": "@timestamp", + "indices": [ + "network-requests" + ], + "feature_attributes": [ + { + "feature_id": "deny_max", + "feature_name": "deny max", + "feature_enabled": true, + "importance": 1, + "aggregation_query": { + "deny_max": { + "max": { + "field": "deny" + } + } + } + } + ], + "window_delay": { + "period": { + "interval": 3, + "unit": "MINUTES" + } + }, + "forecast_interval": { + "period": { + "interval": 3, + "unit": "MINUTES" + } + }, + "schema_version": 2, + "horizon": 3, + "history": 300 +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "_id": "4WnXAYoBU2pVBal92lXD", + "_version": 1, + "forecaster": { + "...": "Configuration (omitted)" + } +} +``` + +### Example request: High-cardinality forecaster + +The following example creates a high-cardinality forecaster that groups forecasts by the `host_nest.host2` field. Like the single-stream example, it forecasts the maximum value of the `deny` field at 3-minute intervals using historical data. 
This setup enables entity-specific forecasting across different hosts: + +```json +POST _plugins/_forecast/forecasters +{ + "name": "Second-Test-Forecaster-7", + "description": "ok rate", + "time_field": "@timestamp", + "indices": [ + "network-requests" + ], + "feature_attributes": [ + { + "feature_id": "deny_max", + "feature_name": "deny max", + "feature_enabled": true, + "importance": 1, + "aggregation_query": { + "deny_max": { + "max": { + "field": "deny" + } + } + } + } + ], + "window_delay": { + "period": { + "interval": 3, + "unit": "MINUTES" + } + }, + "forecast_interval": { + "period": { + "interval": 3, + "unit": "MINUTES" + } + }, + "schema_version": 2, + "horizon": 3, + "history": 300, + "category_field": ["host_nest.host2"], +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "_id": "4WnXAYoBU2pVBal92lXD", + "_version": 1, + "forecaster": { + "...": "Configuration (omitted)" + } +} +``` + + +--- + + +## Validate forecaster + +**Introduced 3.1** +{: .label .label-purple } + +Use this API to verify that a forecaster configuration is valid. You can perform two types of validation: + +- **Configuration-only validation**: Checks that the configuration is syntactically correct and references existing fields. +- **Training-feasibility validation**: Performs a comprehensive validation to ensure that the forecaster can be trained with the specified configuration. + + +### Endpoints + +The following endpoints are available for validating forecasters. + +**Configuration-only validation**: + +```http +POST _plugins/_forecast/forecasters/_validate +``` + +**Training-feasibility validation**: + +```http +POST _plugins/_forecast/forecasters/_validate/model +``` + +### Request body + +The request body is identical to the request body used to create a forecaster. It must include at least the following required fields: `name`, `time_field`, `indices`, `feature_attributes`, and `forecast_interval`. + +If the configuration is valid, the response returns an empty object (`{}`). If the configuration is invalid, the response includes detailed error messages. + + +### Example request: Missing `forecast_interval` + +The following request shows an invalid forecaster configuration that omits the `forecast_interval`: + +```json +POST _plugins/_forecast/forecasters/_validate +{ + "name": "invalid-forecaster", + "time_field": "@timestamp", + "indices": ["network-requests"], + "feature_attributes": [ + { + "feature_id": "deny_max", + "feature_name": "deny max", + "feature_enabled": true, + "aggregation_query": { + "deny_max": { + "max": { + "field": "deny" + } + } + } + } + ] +} +``` +{% include copy-curl.html %} + +### Example response + +```json +{ + "forecaster": { + "forecast_interval": { + "message": "Forecast interval should be set" + } + } +} +``` + + +--- + +## Suggest configuration + +**Introduced 3.1** +{: .label .label-purple } + +Returns appropriate values for one or more forecaster parameters (`forecast_interval`, `horizon`, `history`, `window_delay`) based on the cadence and density of your data. + + +### Endpoints + +``` +POST _plugins/_forecast/forecasters/_suggest/<comma‑separated-types> +``` + +`types` must be one or more of `forecast_interval`, `horizon`, `history`, or `window_delay`. 
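+
+Because the path accepts a comma-separated list, you can request suggestions for several parameters in one call. The following request is a sketch, not a verbatim example from the plugin documentation: it reuses the `network-requests` index and `deny max` feature from the preceding examples, the forecaster name is arbitrary, and only the minimal configuration fields shown in the validation example above are supplied:
+
+```json
+POST _plugins/_forecast/forecasters/_suggest/forecast_interval,horizon,history
+{
+  "name": "suggest-multiple-params",
+  "time_field": "@timestamp",
+  "indices": ["network-requests"],
+  "feature_attributes": [
+    {
+      "feature_id": "deny_max",
+      "feature_name": "deny max",
+      "feature_enabled": true,
+      "aggregation_query": {
+        "deny_max": { "max": { "field": "deny" } }
+      }
+    }
+  ]
+}
+```
+{% include copy-curl.html %}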
+ + +### Example request: Suggest an interval + +The following request analyzes the source data and suggests an appropriate `forecast_interval` value for the forecaster based on the average event frequency: + +``` +POST _plugins/_forecast/forecasters/_suggest/forecast_interval +{ + "name": "interval‑suggest", + "time_field": "@timestamp", + "indices": ["network-requests"], + ... +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "interval": { + "period": { "interval": 1, "unit": "Minutes" } + } +} +``` + +--- + +## Get forecaster + +**Introduced 3.1** +{: .label .label-purple } + +Retrieves a forecaster and (optionally) its most recent tasks. + +### Endpoints + +``` +GET _plugins/_forecast/forecasters/<forecaster_id>[?task=(true|false)] +``` + +### Example request: Include tasks + +The following request returns metadata about the forecaster and, if specified, details about its associated tasks: + +```json +GET _plugins/_forecast/forecasters/d7-r1YkB_Z-sgDOKo3Z5?task=true +``` +{% include copy-curl.html %} + +The response includes the `forecaster`, `realtime_task`, and `run_once_task` sections. + +--- + +## Update forecaster + +**Introduced 3.1** +{: .label .label-purple } + +Updates the configuration of an existing forecaster. You must stop any active forecasting jobs before making updates. + +Any change that affects the model, such as modifying the `category_field`, `result_index`, or `feature_attributes`, invalidates previous results shown in the OpenSearch Dashboards UI. + +### Endpoints + +``` +PUT _plugins/_forecast/forecasters/<forecaster_id> +``` + + +### Example request: Update the name, result index, and category fields + +The following displays the definition of forecaster `forecaster-i1nwqooBLXq6T-gGbXI-`: + +```json +{ + "_index": ".opensearch-forecasters", + "_id": "forecaster-i1nwqooBLXq6T-gGbXI-", + "_version": 1, + "_seq_no": 0, + "_primary_term": 1, + "_score": 1.0, + "_source": { + "category_field": [ + "service" + ], + "description": "ok rate", + "feature_attributes": [{ + "feature_id": "deny_max", + "feature_enabled": true, + "feature_name": "deny max", + "aggregation_query": { + "deny_max": { + "max": { + "field": "deny" + } + } + } + }], + "forecast_interval": { + "period": { + "unit": "Minutes", + "interval": 1 + } + }, + "schema_version": 2, + "time_field": "@timestamp", + "last_update_time": 1695084997949, + "horizon": 24, + "indices": [ + "network-requests" + ], + "window_delay": { + "period": { + "unit": "Seconds", + "interval": 20 + } + }, + "transform_decay": 1.0E-4, + "name": "Second-Test-Forecaster-3", + "filter_query": { + "match_all": { + "boost": 1.0 + } + }, + "shingle_size": 8, + "result_index": "opensearch-forecast-result-a" + } +} +``` + +The following request updates the `name`, `result_index`, and `category_field` properties of a forecaster: + +```json +PUT localhost:9200/_plugins/_forecast/forecasters/forecast-i1nwqooBLXq6T-gGbXI- +{ + "name": "Second-Test-Forecaster-1", + "description": "ok rate", + "time_field": "@timestamp", + "indices": [ + "network-requests" + ], + "feature_attributes": [ + { + "feature_id": "deny_max", + "feature_name": "deny max", + "feature_enabled": true, + "importance": 1, + "aggregation_query": { + "deny_max": { + "max": { + "field": "deny" + } + } + } + } + ], + "window_delay": { + "period": { + "interval": 20, + "unit": "SECONDS" + } + }, + "forecast_interval": { + "period": { + "interval": 1, + "unit": "MINUTES" + } + }, + "ui_metadata": { + "aabb": { + "ab": "bb" + } + }, + 
"schema_version": 2, + "horizon": 24, + "category_field": ["service", "host"] +} +``` +{% include copy-curl.html %} + +--- + + +## Delete forecaster + +**Introduced 3.1** +{: .label .label-purple } + +Deletes a forecaster configuration. You must stop any associated real-time or run-once forecasting jobs before deletion. If a job is still running, the API returns a `400` error. + +### Endpoint + +```http +DELETE _plugins/_forecast/forecasters/<forecaster_id> +``` + +### Example request: Delete a forecaster + +The following request deletes a forecaster configuration using its unique ID: + +```http +DELETE _plugins/_forecast/forecasters/forecast-i1nwqooBLXq6T-gGbXI- +``` +{% include copy-curl.html %} + +--- + +## Start a forecaster job + +**Introduced 3.1** +{: .label .label-purple } + +Begins real-time forecasting for a forecaster. + +### Endpoints + +```http +POST _plugins/_forecast/forecasters/<forecaster_id>/_start +``` + +### Example request: Start a forecaster job + +The following request initiates real-time forecasting for the specified forecaster: + +```bash +POST _plugins/_forecast/forecasters/4WnXAYoBU2pVBal92lXD/_start +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ "_id": "4WnXAYoBU2pVBal92lXD" } +``` + +--- + +## Stop a forecaster job + +**Introduced 3.1** +{: .label .label-purple } + +Stops real-time forecasting for a forecaster. + +### Endpoints +```http +POST _plugins/_forecast/forecasters/<forecaster_id>/_stop +``` + +### Example request: Stop a forecaster job + +The following request stops the real-time forecasting job for the specified forecaster: + +```bash +POST _plugins/_forecast/forecasters/4WnXAYoBU2pVBal92lXD/_stop +``` +{% include copy-curl.html %} + + +--- + +## Run one analysis + +**Introduced 3.1** +{: .label .label-purple } + +Runs backtesting (historical) forecasting. It cannot run while a real-time job is active. + +### Endpoint +```http +POST _plugins/_forecast/forecasters/<forecaster_id>/_run_once +``` + +### Example request: Run a backtesting forecast + +The following request starts a run-once forecast analysis for the specified forecaster: + +```bash +POST _plugins/_forecast/forecasters/<forecaster_id>/_run_once +``` +{% include copy-curl.html %} + +#### Example response + +The response returns the task ID assigned to the run-once job: + +```json +{ "taskId": "vXZG85UBAlM4LplcKI0f" } +``` + +### Example request: Search forecast results by task ID + +Use the returned `taskId` to query the `opensearch-forecast-results*` index for historical forecast output: + +```json +GET opensearch-forecast-results*/_search?pretty +{ + "sort": { + "data_end_time": "desc" + }, + "size": 10, + "query": { + "bool": { + "filter": [ + { "term": { "task_id": "vXZG85UBAlM4LplcKI0f" } }, + { + "range": { + "data_end_time": { + "format": "epoch_millis", + "gte": 1742585746033 + } + } + } + ] + } + }, + "track_total_hits": true +} +``` +{% include copy-curl.html %} + +This query returns the 10 most recent forecast results matching the specified task ID. + + +--- + +## Search forecasters + +**Introduced 3.1** +{: .label .label-purple } + +Provides standard `_search` functionality on the `.opensearch-forecasters` system index, which stores forecaster configurations. You must use this API to query `.opensearch-forecasters` directly because the index is a system index and cannot be accessed through regular OpenSearch queries. 
+ +### Endpoint + +```http +GET _plugins/_forecast/forecasters/_search +``` + +### Example request: Wildcard search by index + +The following request searches for forecasters whose source index names begin with `network` using a leading-anchored wildcard: + +```json +GET _plugins/_forecast/forecasters/_search +{ + "query": { + "wildcard": { + "indices": { + "value": "network*" + } + } + } +} +``` +{% include copy-curl.html %} + +`network*` matches `network`, `network-metrics`, `network_2025-06`, and similar index names. + +#### Example response + +```json +{ + "took": 5, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1.0, + "hits": [{ + "_index": ".opensearch-forecasters", + "_id": "forecast-i1nwqooBLXq6T-gGbXI-", + "_version": 1, + "_seq_no": 0, + "_primary_term": 1, + "_score": 1.0, + "_source": { + "category_field": ["server"], + "description": "ok rate", + "feature_attributes": [{ + "feature_id": "deny_max", + "feature_enabled": true, + "feature_name": "deny max", + "aggregation_query": { + "deny_max": { + "max": { + "field": "deny" + } + } + } + }], + "forecast_interval": { + "period": { + "unit": "Minutes", + "interval": 1 + } + }, + "schema_version": 2, + "time_field": "@timestamp", + "last_update_time": 1695084997949, + "horizon": 24, + "indices": ["network-requests"], + "window_delay": { + "period": { + "unit": "Seconds", + "interval": 20 + } + }, + "transform_decay": 1.0E-4, + "name": "Second-Test-Forecaster-3", + "filter_query": { + "match_all": { + "boost": 1.0 + } + }, + "shingle_size": 8 + } + }] + } +} +``` + +--- + +## Search tasks + +**Introduced 3.1** +{: .label .label-purple } + +Query tasks in the `.opensearch-forecast-state` index. 
+ +### Endpoint + +```http +GET _plugins/_forecast/forecasters/tasks/_search +``` + +### Example request: Search previous run-once tasks + +The following request retrieves previous run-once tasks (excluding the most recent) for a specific forecaster and sorts them by `execution_start_time` in descending order: + +```json +GET _plugins/_forecast/forecasters/tasks/_search +{ + "from": 0, + "size": 1000, + "query": { + "bool": { + "filter": [ + { "term": { "forecaster_id": { "value": "m5apnooBHh7Wss2wewfW", "boost": 1.0 }}}, + { "term": { "is_latest": { "value": false, "boost": 1.0 }}}, + { "terms": { + "task_type": [ + "RUN_ONCE_FORECAST_SINGLE_STREAM", + "RUN_ONCE_FORECAST_HC_FORECASTER" + ], + "boost": 1.0 + }} + ], + "adjust_pure_negative": true, + "boost": 1.0 + } + }, + "sort": [ + { "execution_start_time": { "order": "desc" }} + ] +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "took": 3, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { "value": 1, "relation": "eq" }, + "max_score": null, + "hits": [ + { + "_index": ".opensearch-forecast-state", + "_id": "4JaunooBHh7Wss2wOwcw", + "_version": 3, + "_seq_no": 5, + "_primary_term": 1, + "_score": null, + "_source": { + "last_update_time": 1694879344264, + "execution_start_time": 1694879333168, + "forecaster_id": "m5apnooBHh7Wss2wewfW", + "state": "TEST_COMPLETE", + "task_type": "RUN_ONCE_FORECAST_SINGLE_STREAM", + "is_latest": false, + "forecaster": { + "description": "ok rate", + "ui_metadata": { "aabb": { "ab": "bb" }}, + "feature_attributes": [ + { + "feature_id": "deny_max", + "feature_enabled": true, + "feature_name": "deny max", + "aggregation_query": { + "deny_max": { + "max": { "field": "deny" } + } + } + } + ], + "forecast_interval": { + "period": { + "unit": "Minutes", + "interval": 1 + } + }, + "schema_version": 2, + "time_field": "@timestamp", + "last_update_time": 1694879022036, + "horizon": 24, + "indices": [ "network-requests" ], + "window_delay": { + "period": { + "unit": "Seconds", + "interval": 20 + } + }, + "transform_decay": 1.0E-4, + "name": "Second-Test-Forecaster-5", + "filter_query": { "match_all": { "boost": 1.0 }}, + "shingle_size": 8 + } + }, + "sort": [ 1694879333168 ] + } + ] + } +} +``` + +--- + +## Top forecasters +**Introduced 3.1** +{: .label .label-purple } + +Returns the *top‑k* entities for a given timestamp range, based on built‑in or custom metrics. + +### Endpoint + +```http +POST _plugins/_forecast/forecasters/<forecaster_id>/results/_topForecasts +``` + +### Query parameters + +The following query parameters are supported. + +| Name | Type | Required | Description | +| :--- | :--- | :--- | :--- | +| `split_by` | String | Required | The field to group by (such as `service`). | +| `forecast_from` | Epoch‑ms | Required | The `data_end_time` of the first forecast in the evaluation window. | +| `size` | Integer | Optional | The number of buckets to return. Defaults is `5`. | +| `filter_by` | Enum | Required | Specifies whether to use a built-in or custom query. Must be either `BUILD_IN_QUERY` or `CUSTOM_QUERY`. 
|
+| `build_in_query` | Enum | Conditional | Required when `filter_by` is `BUILD_IN_QUERY`. Specifies one of the following built-in ranking criteria:<br> `MIN_CONFIDENCE_INTERVAL_WIDTH` -- Sorts by the narrowest forecast confidence intervals (most precise).<br> `MAX_CONFIDENCE_INTERVAL_WIDTH` -- Sorts by the widest forecast confidence intervals (least precise).<br> `MIN_VALUE_WITHIN_THE_HORIZON` -- Sorts by the lowest forecast value observed within the prediction window.<br> `MAX_VALUE_WITHIN_THE_HORIZON` -- Sorts by the highest forecast value observed within the prediction window.<br> `DISTANCE_TO_THRESHOLD_VALUE` -- Sorts by the difference between the forecast value and a user-defined threshold. |
+| `threshold`, `relation_to_threshold` | Mixed | Conditional | Required only if `build_in_query` is `DISTANCE_TO_THRESHOLD_VALUE`. |
+| `filter_query` | Query DSL | Optional | A custom query used when `filter_by=CUSTOM_QUERY`. |
+| `subaggregations` | Array | Optional | A list of nested aggregations and sort options used to compute additional metrics within each bucket. |
+
+### Example request: Built-in query with the narrowest confidence interval
+
+The following request returns a sorted list of entities whose forecast values have the narrowest confidence intervals.
The results are ranked in ascending order based on the `MIN_CONFIDENCE_INTERVAL_WIDTH` metric: + +```json +POST _plugins/_forecast/forecasters/AG_3t4kBkYqqimCe86bP/results/_topForecasts +{ + "split_by": "service", + "filter_by": "BUILD_IN_QUERY", + "build_in_query": "MIN_CONFIDENCE_INTERVAL_WIDTH", + "forecast_from": 1691008679297 +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "buckets": [ + { + "key": { + "service": "service_6" + }, + "doc_count": 1, + "bucket_index": 0, + "MIN_CONFIDENCE_INTERVAL_WIDTH": 27.655361 + }, + { + "key": { + "service": "service_4" + }, + "doc_count": 1, + "bucket_index": 1, + "MIN_CONFIDENCE_INTERVAL_WIDTH": 1324.7734 + }, + { + "key": { + "service": "service_0" + }, + "doc_count": 1, + "bucket_index": 2, + "MIN_CONFIDENCE_INTERVAL_WIDTH": 2211.0781 + }, + { + "key": { + "service": "service_2" + }, + "doc_count": 1, + "bucket_index": 3, + "MIN_CONFIDENCE_INTERVAL_WIDTH": 3372.0469 + }, + { + "key": { + "service": "service_3" + }, + "doc_count": 1, + "bucket_index": 4, + "MIN_CONFIDENCE_INTERVAL_WIDTH": 3980.2812 + } + ] +} +``` + +### Example request: Built-in query with distance under a threshold + +The following request returns the top entities whose forecast values fall farthest from a specified threshold, based on the `DISTANCE_TO_THRESHOLD_VALUE` metric: + +```http +POST _plugins/_forecast/AG_3t4kBkYqqimCe86bP/results/_topForecasts +{ + "split_by": "service", // group forecasts by the "service" entity field + "filter_by": "BUILD_IN_QUERY", // use a built-in ranking metric + "build_in_query": "DISTANCE_TO_THRESHOLD_VALUE", + "forecast_from": 1691008679297, // data_end_time of the first forecast in scope + "threshold": -82561.8, // user-supplied threshold + "relation_to_threshold": "LESS_THAN" // keep only forecasts below the threshold +} +``` + +#### Example response + +The `DISTANCE_TO_THRESHOLD_VALUE` metric calculates `forecast_value – threshold`. Because `relation_to_threshold` is `LESS_THAN`, the API returns negative distances only and sorts them in ascending order (most negative first). Each bucket includes the following values: + +- `doc_count`: The number of forecast points that matched. +- `DISTANCE_TO_THRESHOLD_VALUE`: The largest distance within the forecast horizon from the threshold value. + +The following response returns the `DISTANCE_TO_THRESHOLD_VALUE`: + +```json +{ + "buckets": [ + { + "key": { "service": "service_5" }, + "doc_count": 18, + "bucket_index": 0, + "DISTANCE_TO_THRESHOLD_VALUE": -330387.12 + }, + ... 
+ { + "key": { "service": "service_0" }, + "doc_count": 1, + "bucket_index": 4, + "DISTANCE_TO_THRESHOLD_VALUE": -83561.8 + } + ] +} +``` + +### Example request: Custom query and nested aggregations + +The following request uses a custom query to match services by name and ranks them by the highest forecast value: + +```json +POST _plugins/_forecast/AG_3t4kBkYqqimCe86bP/results/_topForecasts +{ + "split_by": "service", + "forecast_from": 1691018993776, + "filter_by": "CUSTOM_QUERY", + "filter_query": { + "nested": { + "path": "entity", + "query": { + "bool": { + "must": [ + { "term": { "entity.name": "service" } }, + { "wildcard": { "entity.value": "User*" } } + ] + } + } + } + }, + "subaggregations": [ + { + "aggregation_query": { + "forecast_value_max": { + "max": { "field": "forecast_value" } + } + }, + "order": "DESC" + } + ] +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "buckets": [ + { + "key": { "service": "UserAuthService" }, + "doc_count": 24, + "bucket_index": 0, + "forecast_value_max": 269190.38 + }, + ... + ] +} +``` +--- + +## Profile forecaster + +**Introduced 3.1** +{: .label .label-purple } + +Returns execution-time state such as initialization progress, per-entity model metadata, and errors. This API is useful for inspecting forecaster intervals during runtime. + +### Endpoints + +```http +GET _plugins/_forecast/forecasters/<forecaster_id>/_profile[/<type1>,<type2>][?_all=true] +``` + +You can retrieve specific profile types or request all available types using the `_all` query parameter. + +The following profile types are supported: + +- `state` +- `error` +- `coordinating_node` +- `total_size_in_bytes` +- `init_progress` +- `models` +- `total_entities` +- `active_entities` +- `forecast_task` + +If you include an `entity` array in the request body, the profile is scoped to that entity only. + +### Example request: Default profile with an entity filter + +The following request returns the default profile types (`state` and `error`) for the specified entity: + +```http +GET _plugins/_forecast/forecasters/tLch1okBCBjX5EchixQ8/_profile +{ + "entity": [ + { + "name": "service", + "value": "app_1" + }, + { + "name": "host", + "value": "server_2" + } + ] +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "state": "RUNNING" +} +``` + +### Example request: Multiple profile types + +The following request retrieves `init_progress`, `error`, `total_entities`, and `state` profile types: + +```http +GET _plugins/_forecast/forecasters/mZ6P0okBTUNS6IWgvpwo/_profile/init_progress,error,total_entities,state +``` +{% include copy-curl.html %} + +### Example request: All profile types + +The following request returns all available profile types: + +```http +GET _plugins/_forecast/forecasters/d7-r1YkB_Z-sgDOKo3Z5/_profile?_all=true&pretty +``` +{% include copy-curl.html %} + + +--- + +## Forecaster stats +**Introduced 3.1** +{: .label .label-purple } + +Returns cluster-level or node-level statistics, including the number of forecasters, model counts, request counters, and the health of internal forecast indexes. 
+ +### Endpoints + +```http +GET _plugins/_forecast/stats +GET _plugins/_forecast/<node_id>/stats +GET _plugins/_forecast/stats/<stat_name> +``` + +### Example request: Retrieve all statistics + +The following request retrieves cluster-level statistics for all forecasters, including counts, model information, and index status: + +```http +GET _plugins/_forecast/stats +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "hc_forecaster_count": 1, + "forecast_results_index_status": "yellow", + "forecast_models_checkpoint_index_status": "yellow", + "single_stream_forecaster_count": 1, + "forecastn_state_status": "yellow", + "forecaster_count": 2, + "job_index_status": "yellow", + "config_index_status": "yellow", + "nodes": { + "8B2S4ClnRFK3GTjO45bwrw": { + "models": [ + { + "model_type": "rcf_caster", + "last_used_time": 1692245336895, + "model_id": "Doj0AIoBEU5Xd2ccoe_9_entity_SO2kPi_PAMsvThWyE-zYHg", + "last_checkpoint_time": 1692233157256, + "entity": [ + { "name": "host_nest.host2", "value": "server_2" } + ] + } + ], + "forecast_hc_execute_request_count": 204, + "forecast_model_corruption_count": 0, + "forecast_execute_failure_count": 0, + "model_count": 4, + "forecast_execute_request_count": 409, + "forecast_hc_execute_failure_count": 0 + } + } +} +``` + +### Example request: Retrieve statistics for a specific node + +The following request retrieves forecaster statistics for a specific node, identified by node ID: + +```http +GET _plugins/_forecast/8B2S4ClnRFK3GTjO45bwrw/stats +``` +{% include copy-curl.html %} + +### Example request: Retrieve the total number of high-cardinality requests + +The following request retrieves the total number of high-cardinality forecaster requests across all nodes: + +```http +GET _plugins/_forecast/stats/forecast_hc_execute_request_count +``` +{% include copy-curl.html %} + +### Example request: Retrieve the high-cardinality request count for a specific node + +The following request retrieves the number of high-cardinality forecaster requests executed by a specific node: + +```http +GET _plugins/_forecast/0ZpL8WEYShy-qx7hLJQREQ/stats/forecast_hc_execute_request_count/ +``` +{% include copy-curl.html %} + + +--- + +## Forecaster info +**Introduced 3.1** +{: .label .label-purple } + +Returns a single integer representing the total number of forecaster configurations in the cluster or checks whether a forecaster that satisfies a given search criterion exists. 
+ + +### Endpoints +```http +GET _plugins/_forecast/forecasters/count +GET _plugins/_forecast/forecasters/match?name=<forecaster_name> +``` + +### Example request: Count forecasters + +The following request returns the number of forecaster configurations currently stored in the cluster: + +```http +GET _plugins/_forecast/forecasters/count +``` +{% include copy-curl.html %} + +### Example response + +```json +{ + "count": 2, + "match": false +} +``` + +### Example request: Match forecaster name + +The following request looks for a forecaster named `Second-Test-Forecaster-3`: + +```http +GET _plugins/_forecast/forecasters/match?name=Second-Test-Forecaster-3 +``` +{% include copy-curl.html %} + +### Example response: Match found + +```json +{ + "count": 0, + "match": true +} +``` + +### Example response: No match found + +```json +{ + "count": 0, + "match": false +} +``` diff --git a/_observing-your-data/forecast/getting-started.md b/_observing-your-data/forecast/getting-started.md new file mode 100644 index 00000000000..d2038e8f8e1 --- /dev/null +++ b/_observing-your-data/forecast/getting-started.md @@ -0,0 +1,353 @@ +--- +layout: default +title: Getting started with forecasting +nav_order: 5 +parent: Forecasting +has_children: false +--- + +# Getting started with forecasting + +You can define and configure forecasters in OpenSearch Dashboards by selecting **Forecasting** from the navigation panel. + +## Step 1: Define a forecaster + +A **forecaster** represents a single forecasting task. You can create multiple forecasters to run in parallel, each analyzing a different data source. Follow these steps to define a new forecaster: + +1. In the **Forecaster list** view, choose **Create forecaster**. + +2. Define the data source by entering the following information: + * **Name** – Provide a unique, descriptive name, such as `requests-10min`. + * **Description** – Summarize the forecaster's purpose, for example, `Forecast total request count every 10 minutes`. + * **Indexes** – Select one or more indexes, index patterns, or aliases. Remote indexes are supported through cross-cluster search (`cluster-name:index-pattern`). For more information, see [Cross-cluster search]({{site.url}}{{site.baseurl}}/search-plugins/cross-cluster-search/). If the Security plugin is enabled, see [Selecting remote indexes with fine-grained access control]({{site.url}}{{site.baseurl}}/observing-your-data/forecast/security/#selecting-remote-indexes-with-fine-grained-access-control). + +3. (Optional) Choose **Add data filter** to set a **Field**, **Operator**, and **Value** or choose **Use query DSL** to define a [Boolean query]({{site.url}}{{site.baseurl}}/query-dsl/compound/bool/). The following example uses a query domain-specific language (DSL) filter to match three URL paths: + + ```json + { + "bool": { + "should": [ + { "term": { "urlPath.keyword": "/domain/{id}/short" } }, + { "term": { "urlPath.keyword": "/sub_dir/{id}/short" } }, + { "term": { "urlPath.keyword": "/abcd/123/{id}/xyz" } } + ] + } + } + ``` + + +4. Under **Timestamp field**, select the field that stores the timestamps. + +5. In the **Indicator (metric)** section, add a metric for the forecaster. Each forecaster supports one metric for optimal accuracy. Choose one of the following options: + + - Select a predefined aggregation: `average()`, `count()`, `sum()`, `min()`, or `max()`. 
+ - To use a custom aggregation, choose **Custom expression** under **Forecast based on** and define your own [query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index/) expression. For example, the following query forecasts the number of unique accounts with a specific account type: + + ```json + { + "bbb_unique_accounts": { + "filter": { + "bool": { + "must": [ + { + "wildcard": { + "accountType": { + "wildcard": "*blah*", + "boost": 1 + } + } + } + ], + "adjust_pure_negative": true, + "boost": 1 + } + }, + "aggregations": { + "uniqueAccounts": { + "cardinality": { + "field": "account" + } + } + } + } + } + ``` + +6. (Optional) In the **Categorical fields** section, enable **Split time series using categorical fields** to generate forecasts at the entity level (for example, by IP address, product ID, or country code). + + The number of unique entities that can be cached in memory is limited. Use the following formula to estimate capacity: + + ``` + (data nodes × heap size × plugins.forecast.model_max_size_percent) + ────────────────────────────────────────────────────────────────── + entity-model size (MB) + ``` + + For example, a cluster with 3 data nodes, each with 8 GB JVM heap and the default 10% model memory, would contain the following number of entities: + + ``` + (8096 MB × 0.10 ÷ 1 MB) × 3 nodes ≈ 2429 entities + ``` + + To determine the entity-model size, use the [Profile Forecaster API]({{site.url}}{{site.baseurl}}/observing-your-data/forecast/api/#profile-forecaster). You can raise or lower the memory ceiling with the `plugins.forecast.model_max_size_percent` setting. + + +Forecasters cache models for the most frequently and recently observed entities, subject to available memory. Models for less common entities are loaded from indexes on a best-effort basis during each interval, with no guaranteed service-level agreement (SLA). Always validate memory usage against a representative workload. + +For more information, see the blog post [Improving Anomaly Detection: One Million Entities in One Minute](https://opensearch.org/blog/one-million-enitities-in-one-minute/). Although focused on anomaly detection, the recommendations apply to forecasting, as both features share the same underlying Random Cut Forest (RCF) model. + +## Step 2: Add model parameters + +The **Suggest parameters** button in OpenSearch Dashboards initiates a review of recent history to recommend sensible defaults. You can override these defaults by adjusting the following parameters: + +* **Forecasting interval** – Specifies the aggregation bucket (for example, 10 minutes). Longer intervals smooth out noise and reduce compute costs, but they delay detection. Shorter intervals detect changes sooner but increase resource usage and can introduce noise. Choose the shortest interval that still produces a stable signal. +* **Window delay** – Tells the forecaster how much of a delay to expect between event occurrence and ingestion. This delay adjusts the forecasting interval backward to ensure complete data coverage. For example, if the forecasting interval is 10 minutes and ingestion is delayed by 1 minute, setting the window delay to 1 minute ensures that the forecaster evaluates data from 1:49 to 1:59 rather than 1:50 to 2:00. + * To avoid missing data, set the window delay to the upper limit of the expected ingestion delay. However, longer delays reduce the real-time responsiveness of forecasts. +* **Horizon** – Specifies how many future buckets to predict. 
Forecast accuracy declines with distance, so choose only the forecast window that is operationally meaningful. +* **History** – Sets the number of historical data points used to train the initial (cold-start) model. The maximum is 10,000. More history improves initial model accuracy up to that limit. + +The **Advanced** panel is collapsed by default, allowing most users to proceed with the suggested parameters. If you expand the panel, you can fine-tune three additional parameters: [shingle size](#choosing-a-shingle-size), [suggested seasonality](#choosing-a-shingle-size), and [recency emphasis](#choosing-a-shingle-size). These control how the forecaster balances recent fluctuations against long-term patterns. + +Unless your data or use case demands otherwise, the defaults—**shingle size 8**, **no explicit seasonality**, and **recency emphasis 2560**—are reliable starting points. + +### Choosing a shingle size + +Leave the **Shingle size** field empty to use the automatic heuristic: + +1. Start with the default value of 8. +2. If **Suggested seasonality** is defined and greater than 16, replace it with half the season length. +3. If **Horizon** is defined and one-third of the value is greater than the current candidate, update it accordingly. + +The final value is the maximum of these three: +`max(8, seasonality ÷ 2, horizon ÷ 3)` + +If you provide a custom value, it overrides this calculation. + +### Determining storage amounts + +By default, forecast results are stored in the `opensearch-forecast-results` index alias. You can: + +* Build dashboards and visualizations. +* Connect the results to the Alerting plugin. +* Query the results as with any other OpenSearch index. + +To manage storage, the plugin applies a rollover policy: + +* **Rollover trigger** – When a primary shard reaches approximately 65 GB, a new backing index is created and the alias is updated. +* **Retention** – Rolled-over indexes are retained for at least 30 days before deletion. + +You can customize this behavior using the following settings. + +| Setting | Description | Default | +|---------|-------------|---------| +| `plugins.forecast.forecast_result_history_max_docs_per_shard` | The maximum number of Lucene documents allowed per shard before triggering a rollover. One result is approximately 4 documents at around 47 bytes each, totaling about 65 GB. | `1_350_000_000` | +| `plugins.forecast.forecast_result_history_retention_period` | The duration for which to retain forecast results. Supports duration formats such as `7d`, `90d`. | `30d` | + +### Specifying a custom result index + +You can store forecast results in a custom index by selecting **Custom index** and providing an alias name, such as `abc`. The plugin creates an alias like `opensearch-forecast-result-abc` that points to the backing index (for example, `opensearch-forecast-result-abc-history-2024.06.12-000002`). + +To manage permissions, use hyphenated namespaces. For example, assign `opensearch-forecast-result-financial-us-*` to roles for the `financial` department's `us` group. +{: .note } If the Security plugin is enabled, ensure appropriate [permissions are configured]({{site.url}}{{site.baseurl}}/observing-your-data/forecast/security/#custom-result-index-permissions). + +### Flattening nested fields + +If your custom result index's documents include nested fields, enable the **Flattened custom result index** to simplify aggregation and visualization. 
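+
+Results for a high-cardinality forecaster, for example, store the entity dimensions as a nested array, which is awkward to group on directly in visualizations. The following document sketch is illustrative only, using the entity structure and `forecast_value` field referenced elsewhere in this section:
+
+```json
+{
+  "forecast_value": 1234.5,
+  "entity": [
+    { "name": "host", "value": "server_2" },
+    { "name": "service", "value": "auth" }
+  ]
+}
+```
+
+Enabling the option flattens nested fields such as `entity` so that standard aggregations and dashboard visualizations can reference them directly.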
+ +This creates a separate index prefixed with the custom index and forecaster name (for example, `opensearch-forecast-result-abc-flattened-test`) and attaches an ingest pipeline using a [Painless script](https://github.com/opensearch-project/anomaly-detection/blob/main/src/main/resources/scripts/flatten-custom-result-index-painless.txt) to flatten nested data. + +If you later disable this option, the associated ingest pipeline is removed. + +Use [Index State Management]({{site.url}}{{site.baseurl}}/im-plugin/ism/index/) to manage rollover and deletion of flattened result indexes. + +### Custom result index lifecycle management + +The plugin triggers a rollover for custom result indexes when any of the following conditions are met. + +| Parameter | Description | Type | Unit | Default | Required | +|----------|-------------|------|------|---------|----------| +| `result_index_min_size` | The minimum total primary shard size required to trigger a rollover. | Integer | MB | `51200` (50 GB) | No | +| `result_index_min_age` | The minimum index age required to trigger a rollover. | Integer | Days | `7` | No | +| `result_index_ttl` | The minimum amount of time before rolled-over indexes are deleted | Integer | Days | `60` | No | + + +## Step 3: Test your forecaster + +Backtesting is the fastest way to evaluate and refine key forecasting settings such as **Interval** and **Horizon**. During backtesting, the model is trained on historical data, generates forecasts, and plots them alongside actual values to help visualize prediction accuracy. If the results do not meet expectations, you can adjust the settings and run the test again. + +Backtesting uses the following methods: + +1. **Training window**: The model trains on historical data defined by the **History** setting. + +2. **Rolling forecast**: The model progresses through the time series, repeatedly performing the following actions: + * Ingesting the next actual data point + * Emitting forecasts at each step + + Because this is a retrospective simulation, forecasted values are plotted at their original timestamps, allowing you to see how well the model would have performed in real time. + + +### Starting a backtest + +To begin a test: + +1. Scroll to the bottom of the **Add model parameters** page. +2. Select **Create and test**. + +To skip testing and create the forecaster immediately, select **Create**. + +Backtests usually take 1 or 2 minutes, but run time depends on the following factors. + +| Factor | Why it matters | +| --------------------- | ------------------------------------------------------------------------ | +| **History length** | More historical data increases training time. | +| **Data density** | Densely packed data slows aggregation. | +| **Categorical field** | The model trains separately for each entity. | +| **Horizon** | A longer forecast horizon increases the number of generated predictions. | + + +If the chart is empty, as shown in the following image, check that your index contains at least one time series with more than 40 data points at the selected interval. + +<img src="{{site.url}}{{site.baseurl}}/images/forecast/no_result.png" alt="test failed" width="800" height="800"> + + +### Reading the chart + +When the test succeeds, hover over any point on the chart to view exact values and confidence bounds: + +- **Actual data** – Solid line +- **Median prediction (P50)** – Dotted line +- **Confidence interval** – Shaded band between P10 and P90 + +The following image shows the chart view. 
+ +<img src="{{site.url}}{{site.baseurl}}/images/forecast/bound.png" alt="Forecast chart with confidence bounds" width="800" height="800"> + +### Viewing forecasts from a specific date + +The forecast chart displays predictions starting from the final actual data point through the end of the configured horizon. + +For example, you might configure the following settings in the **Forecast from** field: + +- **Last actual timestamp**: Mar 5, 2025, 19:23 +- **Interval**: 1 minute +- **Horizon**: 24 + +With these settings, the forecast range would span `Mar 5, 2025, 19:23 – 19:47`, as shown in the following image. + +<img src="{{site.url}}{{site.baseurl}}/images/forecast/trend.png" alt="Forecast chart with trend" width="800" height="800"> + +You can also use the **Forecast from** dropdown list to view forecasts from earlier test runs, as shown in the following image. + +<img src="{{site.url}}{{site.baseurl}}/images/forecast/forecast_from_1.png" alt="Forecast from dropdown" width="800" height="800"> + +When you select an earlier **Forecast from** time, the forecast line is drawn directly over the historical data available at that moment. This causes the two series to overlap, as shown in the following image. + +<img src="{{site.url}}{{site.baseurl}}/images/forecast/forecast_from_2.png" alt="Overlapping forecast and actual data" width="800" height="800"> + +To return to the most recent forecast window, select **Show latest**. + +### Overlay mode: Side-by-side accuracy check + +By default the chart displays forecasts that start from a single origin point. Toggle Overlay mode to lay a forecast curve directly on top of the actual series and inspect accuracy across the entire timeline. + +Because the model emits one forecast per horizon step, for example, 24 forecasts when the horizon is 24, a single timestamp can have many forecasts that were generated from different origins. Overlay mode lets you decide which lead time (k) to plot: + +* Horizon index 0 = Immediate next step +* Horizon index 1 = 1 step ahead +* Horizon index 23 = 23 steps ahead + +The horizon control defaults to **index 3**, but you can choose any value to focus on a different lead time. + +The following image shows Overlay mode enabled with a horizon index of 3. The visualization plots the forecast curve (in purple) directly on top of the actual data points (shown with white-filled markers). This lets you evaluate the accuracy of the model's three-steps-ahead prediction across the full timeline. The forecast range is displayed as a shaded band around the predicted values, helping highlight uncertainty. + +<img src="{{site.url}}{{site.baseurl}}/images/forecast/overlay_3.png" alt="overlay config" width="800" height="800"> + +### View multiple forecast series + +A high-cardinality forecaster can display many time series at once. Use the **Time series per page** dropdown menu in the results panel to switch between the following views: + +- **Single-series view** (default): Renders one entity per page for maximum readability. +- **Multi-series view**: Plots up to five entities side by side. Confidence bands are translucent by default—hover over a line to highlight its associated band. + +Actual and forecast lines are overlaid so you can assess accuracy point by point. However, in **Multi-series view**, the overlapping lines can make the chart more difficult to interpret. To reduce visual clutter, go to **Visualization options** and turn off **Show actual data at forecast**. 
+ +The following image shows the chart with actual and forecast lines overlaid. + +<img src="{{site.url}}{{site.baseurl}}/images/forecast/toggle_overlay_before.png" alt="Chart with actual and forecast lines overlaid" width="800" height="800"> + +The following image shows the same chart with actual lines hidden at forecast time to simplify the view. + +<img src="{{site.url}}{{site.baseurl}}/images/forecast/toggle_overlay_after.png" alt="Chart with forecast lines only" width="800" height="800"> + + +### Exploring the timeline + +Use the following timeline controls to navigate, magnify, and filter any span of your forecast history: + +* **Zoom** – Select **+ / –** to zoom in on forecasts or broaden context. +* **Pan** – Use the arrow buttons to move to earlier or later data points, if any. +* **Quick Select** – Choose common ranges, such as "Last 24 hours", or supply custom dates for the result range. + +### Sorting options in multi-series view + +When a forecaster tracks more than five entities, the chart can't show every line at once. +In **Multi-series view**, you therefore choose the five most informative series and decide what "informative" means by selecting a sort method. The following table lists the available sort methods. + +| Sort method | What it shows | When to use it | +|-------------|--------------|----------------| +| **Minimum confidence-interval width** *(default)* | The five series whose prediction bands are narrowest. A narrow band indicates that the model is highly certain about its forecast. | Surface the most "trustworthy" forecasts. | +| **Maximum confidence-interval width** | The five series with the widest bands—forecasts the model is least sure about. | Spot risky or noisy series that may need review or more training data. | +| **Minimum value within the horizon** | The lowest predicted point across the forecast window for each entity, sorted in ascending order. | Identify entities expected to drop the farthest—useful for capacity planning or alerting on potential dips. | +| **Maximum value within the horizon** | The highest predicted point across the horizon for each entity, sorted in descending order. | Highlight series with the greatest expected peaks, such as traffic spikes or sales surges. | +| **Distance to threshold value** | Filters forecasts by a numeric threshold (>, <, ≥, ≤) and then orders the remainder by how far they sit from that threshold. | Investigate entities that breach—or nearly breach—an SLA or business KPI, such as "show anything forecast to exceed 10,000 requests". | + +If the forecaster monitors five or fewer entities, **Multi-series view** displays all of them. When there are more than five, the view reranks them dynamically each time you change the sort method or adjust the threshold, ensuring that the most relevant series stay in focus. + +To focus on a specific subset of entities, switch **Filter by** to **Custom query** and enter a query DSL query. The following example shows entities where the `host` equals `server_1`: + +```json +{ + "nested": { + "path": "entity", + "query": { + "bool": { + "must": [ + { "term": { "entity.name": "host" } }, + { "wildcard": { "entity.value": "server_1" } } + ] + } + } + } +} +``` + +Next, select a sort method, such as **Maximum value within the horizon**, and select **Update visualization**. The chart updates to show only the forecast series for `host:server_1`, ranked according to your selected criteria. 
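+
+If your forecaster splits the time series using two categorical fields, you can match on both dimensions by combining two `nested` clauses, because each `nested` clause matches a single element of the `entity` array. The following query is a sketch that mirrors the single-field example above and assumes category fields named `host` and `service`; substitute your own field names and values:
+
+```json
+{
+  "bool": {
+    "must": [
+      {
+        "nested": {
+          "path": "entity",
+          "query": {
+            "bool": {
+              "must": [
+                { "term": { "entity.name": "host" } },
+                { "wildcard": { "entity.value": "server_1" } }
+              ]
+            }
+          }
+        }
+      },
+      {
+        "nested": {
+          "path": "entity",
+          "query": {
+            "bool": {
+              "must": [
+                { "term": { "entity.name": "service" } },
+                { "wildcard": { "entity.value": "auth" } }
+              ]
+            }
+          }
+        }
+      }
+    ]
+  }
+}
+```
+
+As before, select a sort method and then select **Update visualization** to apply the filter.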
+ +### Edit a forecaster + +If the initial backtest shows weak performance, you can adjust the forecaster's configuration and run the test again. + +To edit a forecaster: + +1. Open the forecaster's **Details** page and select **Edit** to enter edit mode. +2. Modify the settings as needed—for example, add a **Category field**, change the **Interval**, or increase the **History** window. +3. Select **Update**. The validation panel automatically evaluates the new configuration and flags any issues. + + The following image shows the validation process in progress. + + <img src="{{site.url}}{{site.baseurl}}/images/forecast/validation_loading.png" alt="Validation panel loading" width="800" height="800"> + +4. Resolve any validation errors. When the panel becomes green, select **Start test** in the upper-right corner to run another backtest with the updated parameters. + +### Real-time forecasting + +Once you are confident in the forecasting configuration, go to the **Details** page and click **Start forecasting** to begin real-time forecasting. The forecaster will generate new predictions at each interval moving forward. + +A **Live** badge appears when the chart is synchronized with the most recent data. + +Unlike backtesting, real-time forecasting continuously attempts to initialize using live data if there is not enough historical data. During this initialization period, the forecaster displays an initialization status until it has enough data to begin emitting forecasts. + +## Next steps + +Once you have tested and refined your forecaster, you can begin using it to generate live forecasts or manage it over time. To learn how to start, stop, delete, or update an existing forecaster, see [Managing forecasters]({{site.url}}{{site.baseurl}}/observing-your-data/forecast/managing-forecasters/). + diff --git a/_observing-your-data/forecast/index.md b/_observing-your-data/forecast/index.md new file mode 100644 index 00000000000..72315f1e158 --- /dev/null +++ b/_observing-your-data/forecast/index.md @@ -0,0 +1,29 @@ +--- +layout: default +title: Forecasting +nav_order: 81 +has_children: true +--- + +# Forecasting + +Forecasting in OpenSearch transforms any time-series field into a self-updating signal using the Random Cut Forest (RCF) model. RCF is an online learning model that updates incrementally with each new data point. Because RCF refreshes in real time, it adapts instantly to changes in technical conditions without requiring costly batch retraining. Each model uses only a small amount of storage—typically a few hundred kilobytes—so both compute and storage overhead remain low. + +Pair forecasting with the [Alerting plugin]({{site.url}}{{site.baseurl}}/monitoring-plugins/alerting/) to receive a notification the moment a forecasted value is predicted to breach your threshold. +{: .note} + +## Typical use case + +Forecasting can be used for the following use cases. + +| Domain | What you forecast | Operational benefit | +|--------|-------------------|---------------| +| Predictive maintenance | Future temperature, vibration, or error counts per machine | Replace parts before failure to avoid unplanned downtime. | +| Network forecasting | Future throughput, latency, or connection counts per node | Allocate bandwidth early to meet service-level agreement (SLA) targets. | +| Capacity and cost optimization | Future CPU, RAM, or disk usage per microservice | Rightsize hardware and autoscale efficiently. 
| +| Financial and operational planning | Future order volume, revenue, or ad spend efficiency | Align staffing and budgets with demand signals. | + + + + + diff --git a/_observing-your-data/forecast/managing-forecasters.md b/_observing-your-data/forecast/managing-forecasters.md new file mode 100644 index 00000000000..928e0d68d66 --- /dev/null +++ b/_observing-your-data/forecast/managing-forecasters.md @@ -0,0 +1,226 @@ +--- +layout: default +title: Managing forecasters +nav_order: 8 +parent: Forecasting +has_children: false +--- + +# Managing forecasters + +After you [create a forecaster]({{site.url}}{{site.baseurl}}/observing-your-data/forecast/getting-started/), you can manage its lifecycle and configuration using the **Details** page. This includes starting or stopping the forecaster, updating its settings, or deleting it entirely. Use this page to monitor forecaster status, troubleshoot issues, and fine-tune behavior over time. + +## Forecasters table + +The **Forecasters** table provides an overview of every forecaster you have configured. + +| Column | Description | +|--------|-------------| +| **Name** | The name you assigned when creating the forecaster. | +| **Status** | The current lifecycle state—for example, `Running`, `Initializing`, or `Test complete`. Click the <i class="euiIcon euiIcon--xs euiIcon--expand"></i> icon for more information, including the timestamp of the most recent status change and any failure messages. | +| **Index** | The source index or alias from which the forecaster reads. | +| **Last updated** | The timestamp of the most recent configuration change. | +| **Quick actions** | Context-aware buttons such as **Start**, **Stop**, or **Delete**, depending on the forecaster's current state. | + +## Execution states + +A forecaster (that is, the underlying forecasting job) can be in any of the following states. Transitions marked *automatic* happen without user action; others require you to manually select **Start** or **Stop**. + +| State | Description | Typical trigger | +|-------|-------------|------------------| +| **Inactive** | The forecaster was created but never started. | None. | +| **Inactive: stopped** | The forecaster was manually stopped after running. | User selects **Stop forecasting**. | +| **Awaiting data to initialize forecast** | The job is trying to start but lacks enough historical data. | Automatic. | +| **Awaiting data to restart forecast** | The job is resuming after a data gap and is waiting for new data. | Automatic after a data outage. | +| **Initializing test** | The model is being built for a one-time backtest. | Automatic on **Create and test** or **Start test**. | +| **Test complete** | The backtest has finished and the job is no longer running. | Automatic. | +| **Initializing forecast** | The model is being trained for continuous real-time forecasting. | Automatic after selecting **Start forecasting**. | +| **Running** | The job is streaming live data and generating forecasts. | Automatic when initialization completes successfully. | +| **Initializing test failed** | The test failed, often due to insufficient data. | Automatic. | +| **Initializing forecast failed** | Real-time mode failed to initialize. | Automatic. | +| **Forecast failed** | The job started but encountered a runtime error, such as a shard failure. | Automatic but requires the user's attention. | + +The following diagram illustrates the relationships and transitions between states. 
+ +<img src="{{site.url}}{{site.baseurl}}/images/forecast/state.png" alt="Forecast state diagram" width="1600" height="1600"> + +## Find and filter forecasters + +If you have many forecasters, use the pagination controls at the bottom of the table to navigate between pages. You can also use the search bar to filter by **name**, **status**, or **index**, which can be helpful when managing large sets of forecasters. + +## Alert on forecasted values + +Because forecast result indexes are not system indexes, you can create an [Alerting monitor]({{site.url}}{{site.baseurl}}/monitoring-plugins/alerting/) for the result indexes like you would for any other user index. + +### Example alert monitor + +For example, the following is a monitor for a high-cardinality forecaster. You can modify the schedule, query, and aggregation to match your use case: + +{% raw %} +```json +{ + "name": "test", + "type": "monitor", + "monitor_type": "query_level_monitor", + "enabled": true, + "schedule": { + "period": { + "unit": "MINUTES", + "interval": 1 + } + }, + "inputs": [ + { + "search": { + "indices": [ + "opensearch-forecast-results*" + ], + "query": { + "size": 1, + "query": { + "bool": { + "filter": [ + { + "range": { + "execution_end_time": { + "from": "{{period_end}}||-15m", + "to": "{{period_end}}", + "include_lower": true, + "include_upper": true, + "format": "epoch_millis", + "boost": 1 + } + } + } + ], + "adjust_pure_negative": true, + "boost": 1 + } + }, + "aggregations": { + "metric": { + "max": { + "field": "forecast_upper_bound" + } + } + } + } + } + } + ], + "triggers": [ + { + "query_level_trigger": { + "id": "29oAl5cB5QuI4WJQ3hnx", + "name": "breach", + "severity": "1", + "condition": { + "script": { + "source": "return ctx.results[0].aggregations.metric.value == null ? false : ctx.results[0].aggregations.metric.value > 10000", + "lang": "painless" + } + }, + "actions": [ + { + "id": "notification378084", + "name": "email", + "destination_id": "2uzIlpcBMf-0-aT5HOtn", + "message_template": { + "source": "Monitor **{{ctx.monitor.name}}** entered **ALERT** state — please investigate.\n\nTrigger : {{ctx.trigger.name}}\nSeverity : {{ctx.trigger.severity}}\nTime range : {{ctx.periodStart}} → {{ctx.periodEnd}} UTC\n\nEntity\n{{#ctx.results.0.hits.hits.0._source.entity}}\n • {{name}} = {{value}}\n{{/ctx.results.0.hits.hits.0._source.entity}}\n", + "lang": "mustache" + }, + "throttle_enabled": true, + "subject_template": { + "source": "Alerting Notification action", + "lang": "mustache" + }, + "throttle": { + "value": 15, + "unit": "MINUTES" + } + } + ] + } + } + ], + "ui_metadata": { + "schedule": { + "timezone": null, + "frequency": "interval", + "period": { + "unit": "MINUTES", + "interval": 1 + }, + "daily": 0, + "weekly": { + "tue": false, + "wed": false, + "thur": false, + "sat": false, + "fri": false, + "mon": false, + "sun": false + }, + "monthly": { + "type": "day", + "day": 1 + }, + "cronExpression": "0 */1 * * *" + }, + "monitor_type": "query_level_monitor", + "search": { + "searchType": "query", + "timeField": "execution_end_time", + "aggregations": [ + { + "aggregationType": "max", + "fieldName": "forecast_upper_bound" + } + ], + "groupBy": [], + "bucketValue": 15, + "bucketUnitOfTime": "m", + "filters": [] + } + } +} +``` +{% endraw %} +{% include copy-curl.html %} + +### Monitor design + +The following table explains each design choice used in the example alert monitor and why it matters. 
+ +| Design choice | Rationale | +|---------------|-----------| +| `size: 1` in the search input | Retrieves a single document so you can reference `ctx.results.0.hits.hits.0` in the notification to identify which entity (such as `host` or `service`) triggered the alert. | +| `execution_end_time` range `"now-15m"` → `now` | Filters on the result creation timestamp, which reflects when the forecast was generated. This avoids delays caused by ingestion lag. Avoid filtering on `data_end_time` if your index includes late-arriving data (such as backfilled logs). | +| `max(forecast_upper_bound)` as the metric | Detects upper-bound spikes. Alternatives include: <br> `min(forecast_lower_bound)` for sudden drops. <br> `avg(forecast_value)` for trend shifts. <br> For additional fields, see the [forecast result schema](https://github.com/opensearch-project/anomaly-detection/blob/main/src/main/resources/mappings/forecast-results.json). | +| Index pattern `opensearch-forecast-results*` | Matches the default result index pattern. Update this pattern if you route results to a custom index, such as `opensearch-forecast-result-abc*`. | +| Optional term filter on `forecaster_id` | Use this filter to target a specific forecaster and avoid matching unrelated forecasts. | +| Monitor every 1 min, query window 15 min | Evaluates forecasts every minute to detect anomalies quickly. The 15-minute lookback increases resilience to timing delays. Combined with a 15-minute alert throttle, this avoids duplicate notifications for the same event. | +| Mustache block prints all entity dimensions | Displays both single-dimension (`host=server_3`) and multi-dimension (`host=server_3`, `service=auth`) entity values. You can also include a link to a pre-filtered dashboard for faster triage. | +| Threshold | Use the OpenSearch Dashboards visual editor to analyze recent forecast values and determine an appropriate threshold that reliably indicates anomalies. | + + +### Example alert + +The following example shows a sample alert email generated by a monitor that detects when a forecasted value breaches a defined threshold. In this case, the monitor is tracking a high-cardinality forecaster and has triggered an alert for a specific entity (`host = server_3`): + +``` +Monitor **test** entered **ALERT** state — please investigate. + +Trigger : breach +Severity : 1 +Time range : 2025-06-22T09:56:14.490Z → 2025-06-22T09:57:14.490Z UTC + +Entity + • host = server_3 +``` + +## Next steps + +After setting up and managing your forecasters, you may want to control who can access and modify them. To learn how to manage permissions, secure result indexes, and apply fine-grained access controls, see [the security page]({{site.url}}{{site.baseurl}}/observing-your-data/forecast/security/). + + diff --git a/_observing-your-data/forecast/security.md b/_observing-your-data/forecast/security.md new file mode 100644 index 00000000000..4bab11d82c4 --- /dev/null +++ b/_observing-your-data/forecast/security.md @@ -0,0 +1,468 @@ +--- +layout: default +title: Forecasting security +nav_order: 10 +parent: Forecasting +has_children: false +--- + +# Forecasting security + +Forecasting uses the same security framework as anomaly detection. This page explains how to configure permissions for users to create, run, and view forecasters; how to restrict access to system indexes; and how to isolate forecast results across teams. + +In all examples, replace credentials, index names, and role names with values appropriate for your environment. 
+{: .note} + +## Indexes created by forecasting + +The following table describes the indexes used by the Forecasting API and their visibility to regular users. + +| Index pattern | Purpose | Visible to regular users? | +|---------------|---------|---------------------------| +| `.opensearch-forecasters` | Stores forecaster configuration. | No | +| `.opensearch-forecast-checkpoints` | Stores model snapshots (checkpoints). | No | +| `.opensearch-forecast-state` | Stores task metadata for real-time and run-once forecasting. | No | +| `opensearch-forecast-result*` | Stores forecast results from both backtests and real-time forecasting. | Yes | + +Users do not need direct access to `.opensearch-forecast-checkpoints`; it is used internally by the plugin. + +To view `.opensearch-forecasters`, use the [Get forecaster]({{site.url}}{{site.baseurl}}/observing-your-data/forecast/api/#get-forecaster) or [Search forecasters]({{site.url}}{{site.baseurl}}/observing-your-data/forecast/api/#search-forecasters) APIs. + +To view `.opensearch-forecast-state`, use the [Get forecaster]({{site.url}}{{site.baseurl}}/observing-your-data/forecast/api/#get-forecaster) API with the `?task=true` query parameter or call the [Search tasks]({{site.url}}{{site.baseurl}}/observing-your-data/forecast/api/#search-tasks) API directly. + + +## Cluster permissions + +Each Forecasting API route maps to a specific cluster-level permission, as shown in the following table. You must grant these permissions to roles that manage or interact with forecasters. + +| Route | Required permission | +|:------------|:---------------------| +| `POST /_plugins/_forecast/forecasters` | `cluster:admin/plugin/forecast/forecaster/write` | +| `PUT /_plugins/_forecast/forecasters/{id}` | `cluster:admin/plugin/forecast/forecaster/write` | +| `POST /_plugins/_forecast/forecasters/_validate` | `cluster:admin/plugin/forecast/forecaster/validate` | +| `POST /_plugins/_forecast/forecasters/_suggest/{types}` | `cluster:admin/plugin/forecast/forecaster/suggest` | +| `GET /_plugins/_forecast/forecasters/{id}` <br>`GET /_plugins/_forecast/forecasters/{id}?task=true` | `cluster:admin/plugin/forecast/forecaster/get` | +| `DELETE /_plugins/_forecast/forecasters/{id}` | `cluster:admin/plugin/forecast/forecaster/delete` | +| `POST /_plugins/_forecast/forecasters/{id}/_start` <br>`POST /_plugins/_forecast/forecasters/{id}/_stop` | `cluster:admin/plugin/forecast/forecaster/jobmanagement` | +| `POST /_plugins/_forecast/forecasters/{id}/_run_once` | `cluster:admin/plugin/forecast/forecaster/runOnce` | +| `POST /_plugins/_forecast/forecasters/_search` <br>`GET /_plugins/_forecast/forecasters/_search` | `cluster:admin/plugin/forecast/forecaster/search` | +| `GET /_plugins/_forecast/forecasters/tasks/_search` | `cluster:admin/plugin/forecast/tasks/search` | +| `POST /_plugins/_forecast/forecasters/{id}/results/_topForecasts` | `cluster:admin/plugin/forecast/result/topForecasts` | +| `GET /_plugins/_forecast/forecasters/{id}/_profile` | `cluster:admin/plugin/forecast/forecasters/profile` | +| `GET /_plugins/_forecast/stats` | `cluster:admin/plugin/forecast/forecaster/stats` | +| `GET /_plugins/_forecast/forecasters/count` <br>`GET /_plugins/_forecast/forecasters/match` | `cluster:admin/plugin/forecast/forecaster/info` | + +## Required roles + +A forecasting user needs three types of privileges, based on the following responsibilities: + +- Managing the forecasting job +- Reading the source data +- Accessing the forecast results + +These responsibilities correspond to three 
distinct security layers, as shown in the following table. + +| Layer | What it controls | Typical role | +|-------|------------------|--------------| +| **Forecaster control** | Permissions to create, edit, start, stop, delete, or view a forecaster's configuration. | `forecast_full_access` <br>(manage lifecycle)<br>or<br>`forecast_read_access` <br>(view only) | +| **Data-source read** | Grants the forecaster permission to query the raw metrics index it uses for training and prediction. | Custom role, such as `data_source_read` | +| **Result read** | Grants users and Alerting monitors access to documents in `opensearch-forecast-result*`. | Custom role, such as `forecast_result_read` | + + +The built-in roles `forecast_full_access` and `forecast_read_access` apply only to Forecasting APIs. They do **not** include permissions for source or result indexes—those must be granted separately. +{: .note} + + +### Forecaster control roles + +The Forecasting API includes two built-in roles that you can use as is or use as templates for creating custom roles: + +- `forecast_read_access` – For analysts who need read-only access to forecasters. This role allows users to view forecaster details and results but not create, modify, start, stop, or delete forecasters. + + +- `forecast_full_access` – For users responsible for managing the full lifecycle of forecasters, including creating, editing, starting, stopping, and deleting them. This role does **not** grant access to the source index. To create a forecaster, users must also have index-level permissions that include the `search` action on any index or alias the forecaster reads from. + +The following example shows how these roles are defined: + +```yaml +forecast_read_access: + reserved: true + cluster_permissions: + - 'cluster:admin/plugin/forecast/forecaster/info' + - 'cluster:admin/plugin/forecast/forecaster/stats' + - 'cluster:admin/plugin/forecast/forecaster/suggest' + - 'cluster:admin/plugin/forecast/forecaster/validate' + - 'cluster:admin/plugin/forecast/forecasters/get' + - 'cluster:admin/plugin/forecast/forecasters/info' + - 'cluster:admin/plugin/forecast/forecasters/search' + - 'cluster:admin/plugin/forecast/result/topForecasts' + - 'cluster:admin/plugin/forecast/tasks/search' + index_permissions: + - index_patterns: + - 'opensearch-forecast-result*' + allowed_actions: + - 'indices:admin/mappings/fields/get*' + - 'indices:admin/resolve/index' + - 'indices:data/read*' + +forecast_full_access: + reserved: true + cluster_permissions: + - 'cluster:admin/plugin/forecast/*' + - 'cluster:admin/settings/update' + - 'cluster_monitor' + index_permissions: + - index_patterns: + - '*' + allowed_actions: + - 'indices:admin/aliases/get' + - 'indices:admin/mapping/get' + - 'indices:admin/mapping/put' + - 'indices:admin/mappings/fields/get*' + - 'indices:admin/mappings/get' + - 'indices:admin/resolve/index' + - 'indices:data/read*' + - 'indices:data/read/field_caps*' + - 'indices:data/read/search' + - 'indices:data/write*' + - 'indices_monitor' +``` +{% include copy.html %} + +These roles do not include default `index_permissions` for specific source or result indexes. This is intentional, allowing you to add your own patterns based on your data access requirements. + +### Data source `read` role + +Each forecaster uses the creating user's credentials to query the source index. To enable this, you must grant that user read permissions for your own data index. 
+ +The following example request creates a minimal role that allows read access to the `network-metrics` index: + +```json +PUT _plugins/_security/api/roles/data_source_read +{ + "index_permissions": [{ + "index_patterns": ["network-metrics"], + "allowed_actions": ["read"] + }] +} +``` +{% include copy-curl.html %} + +You can modify the `index_patterns` to match your actual data source. + +### `Result‑read` role + +The `forecast_result_read` role allows users to view forecast results and configure Alerting monitors that query those results. + +The following example request defines a role that grants read access to all indexes matching the `opensearch-forecast-result*` pattern: + +```json +PUT _plugins/_security/api/roles/forecast_result_read +{ + "index_permissions": [{ + "index_patterns": ["opensearch-forecast-result*"], + "allowed_actions": ["read"] + }] +} +``` +{% include copy-curl.html %} + +If you need to isolate result data between teams, you can enhance this role using document-level security (DLS) with a backend role filter, as shown in the following section. + +### Example security role configuration + +The following example request creates a `devOpsEngineer` user and assigns all three required roles for forecasting: + +```json +PUT _plugins/_security/api/internalusers/devOpsEngineer +{ + "password": "DevOps2024!", + "opendistro_security_roles": [ + "forecast_full_access", + "data_source_read", + "forecast_result_read" + ] +} +``` +{% include copy-curl.html %} + +This configuration enables the following: + +- `devOpsEngineer` can manage forecasters (`forecast_full_access`). +- Forecasters can query the source index successfully (`data_source_read`). +- The user and any configured monitors can read forecast results (`forecast_result_read`). + +To grant read-only access to forecaster configurations, replace `forecast_full_access` with `forecast_read_access`. + +--- + +## (Advanced) Limit access by backend role + +You can use backend roles to enforce **team-specific isolation**. This pattern allows different teams to operate forecasters independently while separating configurations and results. + +The model includes three layers: + +1. **Configuration isolation** – Forecasting APIs are restricted to users with a matching backend role. +2. **Result isolation** – DLS limits access to forecast results in `opensearch-forecast-result*`. +3. **Source data access** – A minimal read-only role enables each forecaster to scan its own index. + +The following sections explain how to configure each layer. + +### Assign backend roles to users + +In most environments, backend roles are assigned through LDAP or SAML. However, if you are using the internal user database, you can set them manually, as shown in the following example: + +```json +# Analyst +PUT _plugins/_security/api/internalusers/alice +{ + "password": "alice", + "backend_roles": ["analyst"] +} + +# HR staff +PUT _plugins/_security/api/internalusers/bob +{ + "password": "bob", + "backend_roles": ["human-resources"] +} +``` + +These backend roles can then be used to control access to forecasters and forecast results on a per-team basis. 
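If you want to confirm which backend roles are attached to a set of credentials before creating a forecaster, one optional check is to call the Security plugin's authentication information endpoint while signed in as that user. This is a verification step only, not part of the forecasting workflow; the `backend_roles` array in the response lists the roles that will be associated with any forecaster the user creates once backend-role filtering is enabled:

```json
GET _plugins/_security/authinfo
```
{% include copy-curl.html %}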
+ +### Enable backend-role filtering for configuration access + +To isolate forecaster configurations by team, enable backend-role filtering at the cluster level: + + +```bash +PUT _cluster/settings +{ + "persistent": { + "plugins.forecast.filter_by_backend_roles": true + } +} +``` +{% include copy-curl.html %} + +When this setting is enabled, OpenSearch records the creator's backend roles in each forecaster document. Only users with a matching backend role can view, edit, or delete that forecaster. + +### Create a `result‑access` role per team + +Forecast results are stored in shared indexes, so use DLS to restrict access by backend role. + +The following example request creates a role that allows users with the `analyst` backend role to read and to write only their team's forecast results: + + +```json +PUT _plugins/_security/api/roles/forecast_analyst_result_access +{ + "index_permissions": [{ + "index_patterns": ["opensearch-forecast-result*"], + "dls": """ + { + "bool": { + "filter": [{ + "nested": { + "path": "user", + "query": { + "term": { + "user.backend_roles.keyword": "analyst" + } + }, + "score_mode": "none" + } + }] + } + }""", + "allowed_actions": ["read","write"] + }] +} +``` +{% include copy-curl.html %} + +To isolate results for another team, such as `human-resources`, create a separate role (for example, `forecast_human_resources_result_access`) and update the term value to match the appropriate backend role. + +### Define `data-source` read access + +The `data_source_read` role is defined in the same way as in earlier examples. It grants minimal read access to the metrics index that each forecaster uses for training and prediction. + +You can reuse this role across teams or create separate versions if you need per-index restrictions. + +### Map a user to three roles + +The following example maps the user `alice` to all three required roles—`full_access`, `result_access`, and `data_source_read`—using the `analyst` backend role: + +```json +PUT _plugins/_security/api/internalusers/alice +{ + "password": "alice", + "backend_roles": ["analyst"], + "opendistro_security_roles": [ + "forecast_full_access", + "forecast_analyst_result_access", + "data_source_read" + ] +} +``` +{% include copy-curl.html %} + +With this configuration, Alice can: + +- Create, start, stop, and delete only forecasters tagged with the `analyst` backend role. +- View only forecast results tagged with the `analyst` backend role. +- Read the `network-metrics` index as the source for her forecasters. + +To configure a second user, such as `bob` from the HR team, use a parallel setup with the `human-resources` backend role and `forecast_human_resources_result_access`. + +### Users without backend roles + +If a user has the `forecast_read_access` role but no backend roles, they cannot view any forecasters. Backend-role filtering enforces strict matching and prevents access to configurations that do not align with the user's roles. + +--- + +## Selecting remote indexes with fine-grained access control + +To use a remote index as a data source for a forecaster, follow the steps outlined in the [Authentication flow]({{site.url}}{{site.baseurl}}/search-plugins/cross-cluster-search/#authentication-flow) section of the [Cross-cluster search]({{site.url}}{{site.baseurl}}/search-plugins/cross-cluster-search/) documentation. + +To succeed, the user must: + +- Use a security role that exists in both the local and remote clusters. +- Have that role mapped to the same username in both clusters. 
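After you complete the setup in the following sections, you can verify the second requirement by retrieving the role mapping from each cluster and confirming that both responses list the same username. The following request is a minimal sketch that assumes the `forecast_full_access` mapping created in the next section; run it against the local cluster and then against the remote cluster and compare the `users` arrays:

```json
GET _plugins/_security/api/rolesmapping/forecast_full_access
```
{% include copy-curl.html %}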
+ +### Example: Create a new user in the local cluster + +Using the following command, create a new user in the local cluster who can create the forecaster: + +```bash +curl -XPUT -k -u 'admin:<custom-admin-password>' \ + 'https://localhost:9200/_plugins/_security/api/internalusers/forecastuser' \ + -H 'Content-Type: application/json' \ + -d '{"password":"password"}' +``` +{% include copy-curl.html %} + +Using the following command, map the new user to the `forecast_full_access` role: + +``` +curl -XPUT -k -u 'admin:<custom-admin-password>' \ + 'https://localhost:9200/_plugins/_security/api/rolesmapping/forecast_full_access' \ + -H 'Content-Type: application/json' \ + -d '{"users":["forecastuser"]}' +``` +{% include copy-curl.html %} + +In the remote cluster, create the same user and map `forecast_full_access` to that role, as shown in the following command: + +```bash +# Create the user +curl -XPUT -k -u 'admin:<custom-admin-password>' \ + 'https://localhost:9250/_plugins/_security/api/internalusers/forecastuser' \ + -H 'Content-Type: application/json' \ + -d '{"password":"password"}' + +# Map the role +curl -XPUT -k -u 'admin:<custom-admin-password>' \ + 'https://localhost:9250/_plugins/_security/api/rolesmapping/forecast_full_access' \ + -H 'Content-Type: application/json' \ + -d '{"users":["forecastuser"]}' +``` +{% include copy-curl.html %} + +### Grant source index read access in both clusters + +To create a forecaster, the user also needs index-level permissions for the `search` or `read` [action groups]({{site.url}}{{site.baseurl}}/security/access-control/default-action-groups/) on every source index, alias, or pattern that the forecaster reads. The permission check occurs in both clusters when reading a remote index. Define and map the same role in both locations. + + +In the local cluster, define a `read` role that grants access to the source index and map it to the forecasting user, as shown in the following command: + +```bash +# Create a role that can search the data +curl -XPUT -k -u 'admin:<custom-admin-password>' \ + 'https://localhost:9200/_plugins/_security/api/roles/data_source_read' \ + -H 'Content-Type: application/json' \ + -d '{ + "index_permissions":[{ + "index_patterns":["network-requests"], + "allowed_actions":["search"] + }] + }' + +# Map the role to forecastuser +curl -XPUT -k -u 'admin:<custom-admin-password>' \ + 'https://localhost:9200/_plugins/_security/api/rolesmapping/data_source_read' \ + -H 'Content-Type: application/json' \ + -d '{"users":["forecastuser"]}' +``` +{% include copy-curl.html %} + +In the remote cluster, define the same role and map it to the same user to ensure that permissions are mirrored across clusters, as shown in the following command: + +``` +# Create the identical role +curl -XPUT -k -u 'admin:<custom-admin-password>' \ + 'https://localhost:9250/_plugins/_security/api/roles/data_source_read' \ + -H 'Content-Type: application/json' \ + -d '{ + "index_permissions":[{ + "index_patterns":["network-requests"], + "allowed_actions":["search"] + }] + }' + +# Map the role to the same user +curl -XPUT -k -u 'admin:<custom-admin-password>' \ + 'https://localhost:9250/_plugins/_security/api/rolesmapping/data_source_read' \ + -H 'Content-Type: application/json' \ + -d '{"users":["forecastuser"]}' +``` +{% include copy-curl.html %} + + +### Register the remote cluster with the local cluster + +Register the remote cluster with the local cluster using a seed node under the `cluster.remote.<alias>.seeds` setting. 
In OpenSearch, this is called adding a `follower` cluster. + +Assuming that the remote cluster is listening on transport port `9350`, run the following command in the local cluster: + +``` +curl -X PUT "https://localhost:9200/_cluster/settings" \ + -H "Content-Type: application/json" \ + -u "admin:<custom-admin-password>" \ + -d '{ + "persistent": { + "cluster.remote": { + "follower": { + "seeds": [ "127.0.0.1:9350" ] + } + } + } + }' +``` +{% include copy-curl.html %} + + +- Replace `127.0.0.1` with the remote node's transport layer IP if it's located on a different host. +- The alias `follower` can be any name you choose and will be used when referencing remote indexes or configuring cross-cluster replication. +{: .note} + +--- + +## Custom result index permissions + +You can specify a custom index for forecast results instead of using the default result index. If the custom index does not already exist, it will be created automatically when you create a forecaster and start a real-time analysis or test run. + +If the custom index already exists, the Forecasting API checks that the index mapping matches the expected forecast result structure. To ensure compatibility, the index must conform to the schema defined in the [`forecast-results.json`](https://github.com/opensearch-project/anomaly-detection/blob/main/src/main/resources/mappings/forecast-results.json) file. + +When a user creates a forecaster—either in OpenSearch Dashboards or by calling the Forecasting API—the system verifies that the user has the following index-level permissions for the custom index: + +- `indices:admin/create` – Required to create and roll over the custom result index. +- `indices:admin/aliases` – Required to create and manage the index alias. +- `indices:data/write/index` – Required to write forecast results to the index (single-stream forecasters). +- `indices:data/read/search` – Required to search the custom index when displaying forecast results. +- `indices:data/write/delete` – Required to delete older forecast results and manage disk usage. +- `indices:data/write/bulk*` – Required because the plugin writes results using the Bulk API. + +## Next step + +For more information about TLS, authentication backends, tenant isolation, and audit logging, see the [Security plugin documentation]({{site.url}}{{site.baseurl}}/security/). diff --git a/_observing-your-data/log-ingestion.md b/_observing-your-data/log-ingestion.md index 61f427d30e8..75a647a2066 100644 --- a/_observing-your-data/log-ingestion.md +++ b/_observing-your-data/log-ingestion.md @@ -37,7 +37,7 @@ Download or clone the [Data Prepper repository](https://github.com/opensearch-pr - A single-node OpenSearch cluster (`opensearch`) - OpenSearch Dashboards (`opensearch-dashboards`). -Close the file and run `docker-compose up --build` to start the containers. +Close the file and run `docker compose up --build` to start the containers. After the containers start, your ingestion pipeline is set up and ready to ingest log data. The `fluent-bit` container is configured to read log data from `test.log`. Run the following command to generate log data to send to the log ingestion pipeline. 
diff --git a/_observing-your-data/query-insights/grouping-top-n-queries.md b/_observing-your-data/query-insights/grouping-top-n-queries.md index 495766c0f3f..990f0597e8c 100644 --- a/_observing-your-data/query-insights/grouping-top-n-queries.md +++ b/_observing-your-data/query-insights/grouping-top-n-queries.md @@ -45,6 +45,96 @@ bool When queries share the same query structure, they are grouped together, ensuring that all similar queries belong to the same group. +## Configuring the query structure + +The preceding example query shows a simplified query structure. By default, the query structure also includes field names and field data types. + +For example, consider an index `index1` with the following field mapping: + +```json +"mappings": { + "properties": { + "field1": { + "type": "keyword" + }, + "field2": { + "type": "text" + }, + "field3": { + "type": "text" + }, + "field4": { + "type": "long" + } + } +} +``` + +If you run the following query on this index: + +```json +{ + "query": { + "bool": { + "must": [ + { + "term": { + "field1": "example_value" + } + } + ], + "filter": [ + { + "match": { + "field2": "search_text" + } + }, + { + "range": { + "field4": { + "gte": 1, + "lte": 100 + } + } + } + ], + "should": [ + { + "regexp": { + "field3": ".*" + } + } + ] + } + } +} +``` + +Then the query has the following corresponding query structure: + +```c +bool [] + must: + term [field1, keyword] + filter: + match [field2, text] + range [field4, long] + should: + regexp [field3, text] +``` + +To exclude field names and field data types from the query structure, configure the following settings: + +```json +PUT _cluster/settings +{ + "persistent" : { + "search.insights.top_queries.grouping.attributes.field_name" : false, + "search.insights.top_queries.grouping.attributes.field_type" : false + } +} +``` +{% include copy-curl.html %} ## Aggregate metrics per group @@ -58,8 +148,6 @@ The response also includes one example query from the query group. ## Configuring query grouping -Before you enable query grouping, you must enable top N query monitoring for a metric type of your choice. For more information, see [Configuring top N query monitoring]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/top-n-queries/#configuring-top-n-query-monitoring). - To configure grouping for top N queries, use the following steps. ### Step 1: Enable top N query monitoring @@ -86,7 +174,7 @@ Set the desired grouping method by updating the following cluster setting: PUT _cluster/settings { "persistent" : { - "search.insights.top_queries.group_by" : "similarity" + "search.insights.top_queries.grouping.group_by" : "similarity" } } ``` @@ -104,7 +192,7 @@ To limit tracking to 100 query groups, send the following request: PUT _cluster/settings { "persistent" : { - "search.insights.top_queries.max_groups_excluding_topn" : 100 + "search.insights.top_queries.grouping.max_groups_excluding_topn" : 100 } } ``` @@ -312,18 +400,20 @@ The response contains the top N query groups: The response includes the following fields. Field | Data type | Description -:--- |:---| :--- -`top_queries` | Array | The list of top query groups. +:--- |:-----------------| :--- +`top_queries` | Array | The list of top query groups. `top_queries.timestamp` | Integer | The execution timestamp for the first query in the query group. -`top_queries.source` | Object | The first query in the query group. +`top_queries.id` | String | The unique identifier for the query or query group. 
`top_queries.phase_latency_map` | Object | The phase latency map for the first query in the query group. The map includes the amount of time, in milliseconds, that the query spent in the `expand`, `query`, and `fetch` phases. +`top_queries.source` | Object | The first query in the query group. +`top_queries.group_by` | String | The `group_by` setting applied when the query was executed. `top_queries.total_shards` | Integer | The number of shards on which the first query was executed. `top_queries.node_id` | String | The node ID of the node that coordinated the execution of the first query in the query group. -`top_queries.query_hashcode` | String | The hash code that uniquely identifies the query group. This is essentially the hash of the [query structure](#grouping-queries-by-similarity). +`top_queries.search_type` | String | The search request execution type (`query_then_fetch` or `dfs_query_then_fetch`). For more information, see the `search_type` parameter in the [Search API documentation]({{site.url}}{{site.baseurl}}/api-reference/search/#query-parameters). +`top_queries.indices` | Array | The indexes to which the first query in the query group is applied. `top_queries.task_resource_usages` | Array of objects | The resource usage breakdown for the various tasks belonging to the first query in the query group. -`top_queries.indices` | Array | The indexes searched by the first query in the query group. +`top_queries.query_hashcode` | String | The hash code that uniquely identifies the query group and is generated from the [query structure](#grouping-queries-by-similarity). `top_queries.labels` | Object | Used to label the top query. -`top_queries.search_type` | String | The search request execution type (`query_then_fetch` or `dfs_query_then_fetch`). For more information, see the `search_type` parameter in the [Search API documentation]({{site.url}}{{site.baseurl}}/api-reference/search/#query-parameters). `top_queries.measurements` | Object | The aggregate measurements for the query group. `top_queries.measurements.latency` | Object | The aggregate latency measurements for the query group. `top_queries.measurements.latency.number` | Integer | The total latency for the query group. diff --git a/_observing-your-data/query-insights/health.md b/_observing-your-data/query-insights/health.md new file mode 100644 index 00000000000..33420632516 --- /dev/null +++ b/_observing-your-data/query-insights/health.md @@ -0,0 +1,141 @@ +--- +layout: default +title: Query Insights plugin health +parent: Query insights +nav_order: 50 +--- + +# Query Insights plugin health + +The Query Insights plugin provides an [API](#health-stats-api) and [metrics](#opentelemetry-error-metrics-counters) for monitoring its health and performance, enabling proactive identification of issues that may affect query processing or system resources. + +## Health Stats API +**Introduced 2.18** +{: .label .label-purple } + +The Health Stats API provides health metrics for each node running the Query Insights plugin. These metrics allow for an in-depth view of resource usage and the health of the query processing components. 
+ +### Endpoints + +```json +GET _insights/health_stats +``` + +### Example request + +```json +GET _insights/health_stats +``` +{% include copy-curl.html %} + +### Example response + +The response includes a set of health-related fields for each node: + +```json +GET _insights/health_stats +{ + "AqegbPL0Tv2XWvZV4PTS8Q": { + "ThreadPoolInfo": { + "query_insights_executor": { + "type": "scaling", + "core": 1, + "max": 5, + "keep_alive": "5m", + "queue_size": 2 + } + }, + "QueryRecordsQueueSize": 2, + "TopQueriesHealthStats": { + "latency": { + "TopQueriesHeapSize": 5, + "QueryGroupCount_Total": 0, + "QueryGroupCount_MaxHeap": 0 + }, + "memory": { + "TopQueriesHeapSize": 5, + "QueryGroupCount_Total": 0, + "QueryGroupCount_MaxHeap": 0 + }, + "cpu": { + "TopQueriesHeapSize": 5, + "QueryGroupCount_Total": 0, + "QueryGroupCount_MaxHeap": 0 + } + }, + "FieldTypeCacheStats" : { + "size_in_bytes" : 336, + "entry_count" : 3, + "evictions" : 1, + "hit_count" : 5, + "miss_count" : 4 + } + } +} +``` + +### Response fields + +The following table lists all response body fields. + +Field | Data type | Description +:--- |:---| :--- +`ThreadPoolInfo` | Object | Information about the Query Insights thread pool, including type, core count, max threads, and queue size. See [The ThreadPoolInfo object](#the-threadpoolinfo-object). +`QueryRecordsQueueSize` | Integer | The size of the queue that buffers incoming search queries before processing. A high value may suggest increased load or slower processing. +`TopQueriesHealthStats` | Object | Performance metrics for each top query service that provide information about memory allocation (heap size) and query grouping. See [The TopQueriesHealthStats object](#the-topquerieshealthstats-object). +`FieldTypeCacheStats` | Object | Metrics for the query insights field type cache. This cache is used to store field type mappings when query grouping is enabled. + +### The ThreadPoolInfo object + +The `ThreadPoolInfo` object contains the following detailed configuration and performance data for the thread pool dedicated to the Query Insights plugin. + +Field | Data type | Description +:--- |:---| :--- +`type`| String | The thread pool type (for example, `scaling`). +`core`| Integer | The minimum number of threads in the thread pool. +`max`| Integer | The maximum number of threads in the thread pool. +`keep_alive`| Time unit | The amount of time that idle threads are retained. +`queue_size`| Integer | The maximum number of tasks in the queue. + +### The TopQueriesHealthStats object + +The `TopQueriesHealthStats` object provides breakdowns for latency, memory, and CPU usage and contains the following information. + +Field | Data type | Description +:--- |:---| :--- +`TopQueriesHeapSize`| Integer | The heap memory allocation for the query group. +`QueryGroupCount_Total`| Integer | The total number of processed query groups. +`QueryGroupCount_MaxHeap`| Integer | The size of the max heap that stores all query groups in memory. + +### The FieldTypeCacheStats object + +The `FieldTypeCacheStats` object contains the following statistics. + +Field | Data type | Description +:--- |:---| :--- +`size_in_bytes`| Integer | The heap memory allocation for the cache. +`entry_count`| Integer | The total number of cache entries. +`evictions`| Integer | The total number of cache evictions. +`hit_count`| Integer | The total number of cache hits. +`miss_count`| Integer | The total number of cache misses. 
+ +## OpenTelemetry error metrics counters + +The Query Insights plugin integrates with OpenTelemetry to provide real-time error metrics counters. These counters help to identify specific operational failures in the plugin and improve reliability. Each metric provides targeted insights into potential error sources in the plugin workflow, allowing for more focused debugging and maintenance. + +To collect these metrics, you must configure and collect query metrics. For more information, see [Query metrics]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/query-metrics/). + +The following table lists all available metrics. + +Field | Description +:--- | :--- +`LOCAL_INDEX_READER_PARSING_EXCEPTIONS` | The number of errors that occur when parsing data using the LocalIndexReader. +`LOCAL_INDEX_EXPORTER_BULK_FAILURES` | The number of failures that occur when ingesting Query Insights plugin data into local indexes. +`LOCAL_INDEX_EXPORTER_DELETE_FAILURES` | The number of failures that occur when deleting Query Insights local indexes. +`LOCAL_INDEX_EXPORTER_EXCEPTIONS` | The number of exceptions that occur in the Query Insights plugin LocalIndexExporter. +`INVALID_EXPORTER_TYPE_FAILURES` | The number of invalid exporter type failures. +`DATA_INGEST_EXCEPTIONS` | The number of exceptions that occur when ingesting data into the Query Insights plugin. +`QUERY_CATEGORIZE_EXCEPTIONS` | The number of exceptions that occur when categorizing the queries. +`EXPORTER_FAIL_TO_CLOSE_EXCEPTION` | The number of failures that occur when closing the exporter. +`READER_FAIL_TO_CLOSE_EXCEPTION` | The number of failures that occur when closing the reader. +`TOP_N_QUERIES_USAGE_COUNT` | The number of times the Top N Queries API is used. \ No newline at end of file diff --git a/_observing-your-data/query-insights/index.md b/_observing-your-data/query-insights/index.md index ef3a65bfcdf..f93ac743777 100644 --- a/_observing-your-data/query-insights/index.md +++ b/_observing-your-data/query-insights/index.md @@ -4,6 +4,8 @@ title: Query insights nav_order: 40 has_children: true has_toc: false +redirect_from: + - /query-insights/ --- # Query insights @@ -14,8 +16,10 @@ To monitor and analyze the search queries within your OpenSearch cluster, you ca Typical use cases for query insights features include the following: -- Identifying top queries by latency within specific time frames -- Debugging slow search queries and latency spikes +- Identify the slowest or most resource-intensive queries impacting your cluster. +- Debug latency spikes and understand query performance patterns. +- Analyze common slow query structures to find optimization opportunities. +- Monitor live, in-flight queries to diagnose immediate search performance issues. Query insights features are supported by the Query Insights plugin. At a high level, query insights features comprise the following components: @@ -33,10 +37,16 @@ bin/opensearch-plugin install query-insights ``` For information about installing plugins, see [Installing plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/). 
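After installation, you can optionally confirm that the plugin is available on every node by listing the installed plugins. The following request is a quick check rather than part of the installation itself; each node should report an entry for `query-insights`:

```json
GET _cat/plugins?v
```
{% include copy-curl.html %}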
-## Query Insights settings +## Query Insights features and settings -You can obtain the following information using Query Insights: +Query Insights provides several ways to monitor and analyze your search queries: -- [Top n queries]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/top-n-queries/) -- [Grouping top N queries]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/grouping-top-n-queries/) -- [Query metrics]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/query-metrics/) +- **[Top N queries]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/top-n-queries/)**: Identify the most resource-intensive or slowest queries over specific time frames based on various performance metrics. +- **[Grouping top N queries]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/grouping-top-n-queries/)**: Discover patterns and analyze similar slow queries by grouping them based on query source structure. +- **[Live queries monitoring]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/live-queries/)**: Get real-time visibility into search queries currently executing within your cluster to identify and debug queries that are currently long running or resource heavy. +- **[Query insights dashboards]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/query-insights-dashboard/)**: Visualize and configure top query insights interactively in OpenSearch Dashboards. +- **[Query metrics]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/query-metrics/)**: Understand the specific performance metrics per query type. + +## Query Insights plugin health + +For information about monitoring the health of the Query Insights plugin, see [Query Insights plugin health]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/health/). \ No newline at end of file diff --git a/_observing-your-data/query-insights/live-queries.md b/_observing-your-data/query-insights/live-queries.md new file mode 100644 index 00000000000..74ddd9f16dc --- /dev/null +++ b/_observing-your-data/query-insights/live-queries.md @@ -0,0 +1,108 @@ +--- +layout: default +title: Live queries +parent: Query insights +nav_order: 20 +--- + +# Live queries +**Introduced 3.0** +{: .label .label-purple } + +Use the Live Queries API to retrieve currently running search queries across the cluster or on specific nodes. Monitoring live queries using Query Insights allows you to get real-time visibility into the search queries that are currently executing within your OpenSearch cluster. This is useful for identifying and debugging queries that might be running for an unexpectedly long time or consuming significant resources at the moment. + +The API returns a list of currently executing search queries, sorted by a specified metric (defaulting to `latency`) in descending order. The response includes the details for each live query, such as the query source, search type, involved indexes, node ID, start time, latency, and resource usage (on the coordinator node) so far. + +## Endpoints + +```json +GET /_insights/live_queries +``` + +## Query parameters + +The following table lists the available query parameters. All query parameters are optional. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `verbose` | Boolean | Whether to include detailed query information in the output. Default is `true`. | +| `nodeId` | String | A comma-separated list of node IDs used to filter the results. If omitted, queries from all nodes are returned. 
| +| `sort` | String | The metric to sort the results by. Valid values are `latency`, `cpu`, or `memory`. Default is `latency`. | +| `size` | Integer | The number of query records to return. Default is 100. | + +## Example request + +The following example request fetches the top 10 queries sorted by CPU usage, with verbose output disabled: + +```json +GET /_insights/live_queries?verbose=false&sort=cpu&size=10 +``` +{% include copy-curl.html %} + +## Example response + +```json +{ + "live_queries" : [ + { + "timestamp" : 1745359226777, + "id" : "troGHNGUShqDj3wK_K5ZIw:512", + "description" : "indices[my-index-*], search_type[QUERY_THEN_FETCH], source[{\"size\":20,\"query\":{\"term\":{\"user.id\":{\"value\":\"userId\",\"boost\":1.0}}}}]", + "node_id" : "troGHNGUShqDj3wK_K5ZIw", + "measurements" : { + "latency" : { + "number" : 13959364458, + "count" : 1, + "aggregationType" : "NONE" + }, + "memory" : { + "number" : 3104, + "count" : 1, + "aggregationType" : "NONE" + }, + "cpu" : { + "number" : 405000, + "count" : 1, + "aggregationType" : "NONE" + } + } + }, + { + "timestamp" : 1745359229158, + "id" : "Y6eBnbdISPO6XaVfxCBRgg:454", + "description" : "indices[my-index-*], search_type[QUERY_THEN_FETCH], source[{\"size\":20,\"query\":{\"term\":{\"user.id\":{\"value\":\"userId\",\"boost\":1.0}}}}]", + "node_id" : "Y6eBnbdISPO6XaVfxCBRgg", + "measurements" : { + "latency" : { + "number" : 11579097209, + "count" : 1, + "aggregationType" : "NONE" + }, + "memory" : { + "number" : 3104, + "count" : 1, + "aggregationType" : "NONE" + }, + "cpu" : { + "number" : 511000, + "count" : 1, + "aggregationType" : "NONE" + } + } + } + ] +} +``` + +## Response fields + +| Field | Data type | Description | +| :------------------ | :-------- | :--------------------------------------------------------------------------------------------------------- | +| `timestamp` | Long | The time at which the query task started, in milliseconds since the epoch. | +| `id` | String | The unique identifier of the search request (the search task ID associated with the query). | +| `description`| String | A description of the query, including the indexes on which it runs, search type, and query source. Only included if `verbose` is `true` (default). | +| `node_id`| String | The coordinator node ID of the node on which the query task is running. | +| `measurements` | Object | An object containing performance metrics gathered so far for the query. | +| `measurements.LATENCY` | Object | Contains the `value` (current running time in nanoseconds) and `unit` (`nanos`). | +| `measurements.CPU` | Object | Contains the `value` (CPU time consumed so far in nanoseconds) and `unit` (`nanos`). | +| `measurements.MEMORY` | Object | Contains the `value` (heap memory used so far in bytes) and `unit` (`bytes`). | diff --git a/_observing-your-data/query-insights/query-insights-dashboard.md b/_observing-your-data/query-insights/query-insights-dashboard.md new file mode 100644 index 00000000000..054536008a9 --- /dev/null +++ b/_observing-your-data/query-insights/query-insights-dashboard.md @@ -0,0 +1,188 @@ +--- +title: Query insights dashboards +layout: default +parent: Query insights +nav_order: 60 +--- + +# Query insights dashboards + +You can interact with the query insights feature in OpenSearch Dashboards. This gives you real-time and historical insights into query performance, providing analytics and monitoring to improve how queries are run in your cluster. 
## Navigation

After logging in to OpenSearch Dashboards, you can find the **Query insights** page by navigating to **OpenSearch Plugins** > **Query insights**.

If you have [multiple data sources]({{site.url}}{{site.baseurl}}/dashboards/management/multi-data-sources/) enabled, the **Query insights** page can be found by navigating to **Data administration** > **Performance** > **Query insights**.
{: .note}

The **Query insights** dashboard contains the following pages:

- [Top N queries](#top-n-queries): Displays the query metrics and details for the top queries.
- [Query details](#query-details): Displays details for individual queries and query groups.
- [Configuration](#configuration): Customizes all monitoring and data retention settings for the query insights feature.

## Top N queries

The **Top N queries** page provides a detailed overview of the queries that have the highest impact on system resources or performance. There, you can analyze query metrics such as **latency**, **CPU time**, and **memory usage**.

The following image of the **Top N queries** page contains letter labels for each component.

![Top N Queries Interface]({{site.url}}{{site.baseurl}}/images/Query-Insights/QueryInsights.png)

Each label corresponds to the following components:

- [A. Navigation tabs](#a-navigation-tabs)
- [B. Search queries bar](#b-search-queries-bar)
- [C. Filters](#c-filters)
- [D. Date range selector](#d-date-range-selector)
- [E. Refresh button](#e-refresh-button)
- [F. Metrics table](#f-metrics-table)

### A. Navigation tabs

The navigation tabs allow you to switch between the **Configuration** and **Top N queries** pages.

### B. Search queries bar

The search queries bar filters queries based on specific attributes, such as **query type** or **indexes**. You can use the additional filters described in the [Filters](#c-filters) section.

### C. Filters

The filter dropdown menus allow you to select the following query filters.

| Filter | Description | Example |
|-------------------------|---------------------------------------------------------------------|--------------------|
| **Type** | Filter by query type. | `query`, `group` |
| **Indexes** | Filter queries based on specific OpenSearch indexes. | `index1`, `index2` |
| **Search Type** | Filter by search execution method. | `query then fetch` |
| **Coordinator Node ID** | Focus on queries executed by a specific coordinator node. | `node-1`, `node-2` |
| **Time Range** | Adjust the time range for the queries displayed. | `last 1 day` |

### D. Date range selector

The **date range selector** limits the results to queries sent during a set time frame. You can also select **Show dates** to display detailed timestamps for each query.

### E. Refresh button

The **Refresh** button reloads the query data based on the selected filters and time range.

### F. Metrics table

The metrics table dynamically adapts based on your **Type** filter selection (**Query**, **Group**, or both). Dynamic columns improve clarity by showing only the relevant data for each query type.

When you select **queries only**, the table displays individual metrics, including **Latency**, **CPU Time**, and **Memory Usage**. The **Query Count** column isn't displayed because each row represents a single query, as shown in the following image.
+ +![Column Display for Query Selected]({{site.url}}{{site.baseurl}}/images/Query-Insights/OnlyQueryColDisplay.png) + +When you select **groups only**, the table displays aggregated metrics, including **Average Latency**, **Average CPU Time**, and **Average Memory Usage**. The **Query Count** column shows how many queries are in each group, as shown in the following image. + +![Column Display for Group Selected]({{site.url}}{{site.baseurl}}/images/Query-Insights/OnlyGroupColDisplay.png) + +When you select both **groups** and **queries**, the table displays combined metrics, including both averaged and raw values, as shown in the following image. + +![Column Display for Both Selected]({{site.url}}{{site.baseurl}}/images/Query-Insights/BothColDisplay.png) + +The following table provides descriptions for each metric and the metric's related query and group when selected. + +| Column name | Description | Query selected | Group selected | Query + group selected | +| :--- | :--- | :--- | :--- | :--- | +| **ID** | The unique identifier for the query or group. | `ID` | `ID` | `ID` | +| **Type** | Indicates whether the entry is a query or a group. | `Type` | `Type` | `Type` | +| **Query Count** | The number of queries aggregated in the group. | Not shown | `Query Count` | `Query Count` | +| **Timestamp** | The time at which the query or group was recorded (may be empty for groups). | `Timestamp` | Not shown | `Timestamp` | +| **Latency** | The amount of time taken for individual queries to execute. | `Latency` | `Average Latency` | `Avg Latency/Latency` | +| **CPU Time** | The number of CPU resources consumed. | `CPU Time` | `Average CPU Time` | `Avg CPU Time/CPU Time` | +| **Memory Usage** | The amount of memory used during execution. | `Memory Usage` | `Average Memory Usage` | `Avg Memory Usage/Memory Usage` | +| **Indexes** | A list of indexes involved in the query or group. | `Indexes` | Not shown | `Indexes` | +| **Search Type** | The search execution method used (such as `query` or `fetch`). | `Search Type` | Not shown | `Search Type` | +| **Coordinator Node ID** | The node that coordinated the query. | `Coordinator Node ID` | Not shown | `Coordinator Node ID` | +| **Total Shards** | The number of shards involved in query processing. | `Total Shards` | Not shown | `Total Shards` | + +When you select **Query + Group**: + +- If all displayed rows are queries, then the table follows the **Query Selected** behavior. +- If all displayed rows are groups, then the table follows the **Group Selected** behavior. + +## Query details + +The **Query details** page provides insights into query behavior, performance, and structure. You can access the query details page by selecting the query ID, as shown in the following image: + +![Query Insights List]({{site.url}}{{site.baseurl}}/images/Query-Insights/Querieslist.png) + +### Viewing individual query details + +You can access detailed information about a single query by selecting the query ID, such as `51c68a1a-7507-4b3e-aea1-32ddd74dbac4`. The query details page will appear, as shown in the following image. + +![Individual Query Details]({{site.url}}{{site.baseurl}}/images/Query-Insights/IndividualQueryDetails.png) + +In the query details view, you can view information such as **Timestamp**, **CPU Time**, **Memory Usage**, **Indexes**, **Search Type**, **Coordinator Node ID**, and **Total Shards**. + +### Viewing query group details + +The query group details view provides insights into aggregated metrics for a group of similar queries. 
To view query group details, select a query ID marked as a "group" in the **Top N queries** list. The query group details view provides the following information:

![Query Group Details]({{site.url}}{{site.baseurl}}/images/Query-Insights/GroupQueryDetails.png)

- The **Aggregate summary for queries** section provides a view of key query metrics for the entire group, including **Average latency**, **Average CPU time**, **Average memory usage**, and **Group by** criteria.
- The **Sample query details** section provides information about a single representative query, including its **Timestamp**, **Indexes**, **Search Type**, **Coordinator Node ID**, and **Total Shards**.
- The **Query** section displays the JSON structure of the query.
- The **Latency** section presents a graphical representation of the run phases for the query.

## Configuration

The **Query insights - Configuration** page gives you control over how the query insights feature collects, monitors, groups, and retains data. The following image shows the configuration page.

![Configuration]({{site.url}}{{site.baseurl}}/images/Query-Insights/Configuration.png)

On the configuration page, you can configure the settings described in the following sections.

### Top N queries monitoring

The **Top n queries monitoring configuration settings** allow you to track query performance metrics, such as **Latency**, **CPU Usage**, and **Memory**, to analyze and optimize query performance. The configuration interface provides a structured, menu-driven setup through which you can define specific metrics to be monitored, set thresholds for analysis, and customize monitoring durations.

Perform the following steps to configure the top N queries settings:

1. From the **Query insights** page, navigate to the **Configuration** tab.
2. Select the metric type: **Latency**, **CPU Usage**, or **Memory**.
3. Toggle the **Enabled** setting to turn the top N queries feature on or off for the selected metric.
4. Specify the monitoring **Window size**, which determines the length of time during which queries are collected for analysis.
5. Enter the value of **N**, which defines the number of top queries to track in each window.
6. Select **Save**.
7. Check the **Statuses for configuration metrics** panel to see the enabled metrics.

### Top N queries grouping

The **Top n queries group configuration settings** control how queries are grouped.

Use the following steps to set specific grouping attributes:

1. Select a grouping option under **Group By**, such as **Similarity**.
2. Select **Save**.
3. Check the **Statuses for group by** panel to verify whether the **Group by** criteria is enabled.

### Data export and retention

To configure data export and retention, use the **Query insights export and data retention settings** panel. There, you can configure the following settings:

1. Under **Exporter**, choose a destination for the data, such as **Local index**.
2. In the **Delete After (days)** field, set a data retention period.
3. Select **Save**.
4. In the **Statuses for data retention** panel, make sure that the **Exporter** setting is enabled.

### Configuration best practices

When configuring the query insights feature, remember the following best practices:

- Begin with a smaller value for N (count) and increase it based on your system's load.
- Choose your **Window size** carefully. A longer window size can save compute resources because the insights found are less granular.
Inversely, a shorter window size can output more comprehensive query insights but uses more resources. +- When setting data retention periods, consider shorter retention periods that save storage but reduce the number of long-term insights. +- Enable metrics based on your monitoring needs. Monitoring fewer metrics prevents system overload. + + + + + diff --git a/_observing-your-data/query-insights/top-n-queries.md b/_observing-your-data/query-insights/top-n-queries.md index b63d670926b..42657213139 100644 --- a/_observing-your-data/query-insights/top-n-queries.md +++ b/_observing-your-data/query-insights/top-n-queries.md @@ -30,13 +30,13 @@ It's important to exercise caution when enabling this feature because it can con ## Enabling top N query monitoring -When you install the `query-insights` plugin, top N query monitoring is disabled by default. To enable top N query monitoring, update the dynamic settings for the desired metric types. These settings enable the corresponding collectors and aggregators in the running cluster. For example, to enable top N query monitoring by latency, update the `search.insights.top_queries.latency.enabled` setting: +When you install the `query-insights` plugin, top N query monitoring is enabled by default. To disable top N query monitoring, update the dynamic cluster settings for the desired metric types. For example, to disable monitoring of top N queries by latency, update the `search.insights.top_queries.latency.enabled` setting: ```json PUT _cluster/settings { "persistent" : { - "search.insights.top_queries.latency.enabled" : true + "search.insights.top_queries.latency.enabled" : false } } ``` @@ -44,7 +44,7 @@ PUT _cluster/settings ## Configuring the window size -To configure the monitoring window size, update the `window_size` setting for the desired metric type. For example, to collect the top N queries by latency in a 60-minute window, update the `search.insights.top_queries.latency.window_size` setting: +To configure the monitoring window size, update the `window_size` setting for the desired metric type. The default `window_size` is `5m`. For example, to collect the top N queries by latency in a 60-minute window, update the `search.insights.top_queries.latency.window_size` setting: ```json PUT _cluster/settings @@ -58,45 +58,231 @@ PUT _cluster/settings ## Configuring the value of N -To configure the value of N, update the `top_n_size` setting for the desired metric type. For example, to collect the top 10 queries by latency, update the `insights.top_queries.latency.top_n_size` setting: +To configure the value of N, update the `top_n_size` setting for the desired metric type. The default `top_n_size` is `10`. For example, to collect the top 20 queries by latency, update the `insights.top_queries.latency.top_n_size` setting: ```json PUT _cluster/settings { "persistent" : { - "search.insights.top_queries.latency.top_n_size" : 10 + "search.insights.top_queries.latency.top_n_size" : 20 } } ``` {% include copy-curl.html %} -## Monitoring the top N queries +## Monitoring current top N queries -You can use the Insights API endpoint to retrieve the top N queries. This API returns top N `latency` results by default. +You can use the Insights API endpoint to retrieve the top N queries for the current time window. This API returns top N `latency` results by default. ```json GET /_insights/top_queries ``` {% include copy-curl.html %} -Specify the `type` parameter to retrieve the top N results for other metric types. 
The results will be sorted in descending order based on the specified metric type. +### Query parameters -```json -GET /_insights/top_queries?type=latency -``` -{% include copy-curl.html %} +The following table lists the available query parameters. All query parameters are optional. + +Parameter | Data type | Description +:--- |:---------| :--- +`type` | String | The metric type for which to retrieve top N query data. Results will be sorted in descending order based on this metric. Valid values are `latency`, `cpu`, and `memory`. Default is `latency`. +`from` | String | The start of the time range for fetching historical top N queries. For more information, see [Monitoring historical top N queries](#monitoring-historical-top-N-queries). +`to` | String | The end of the time range for fetching historical top N queries. For more information, see [Monitoring historical top N queries](#monitoring-historical-top-N-queries). +`id` | String | The ID of a specific top query record to retrieve. +`verbose` | Boolean | Indicates whether to return verbose output. Default is `true`. + +### Example response + +<details markdown="block"> + <summary> + Response + </summary> + {: .text-delta} ```json -GET /_insights/top_queries?type=cpu +{ + "top_queries" : [ + { + "timestamp" : 1745021834451, + "id" : "36506bd2-7bca-4a0a-a6b8-f3e7db2b0745", + "group_by" : "NONE", + "indices" : [ + "my-index-0" + ], + "source" : { + "size" : 20, + "query" : { + "bool" : { + "must" : [ + { + "match_phrase" : { + "message" : { + "query" : "document", + "slop" : 0, + "zero_terms_query" : "NONE", + "boost" : 1.0 + } + } + }, + { + "match" : { + "user.id" : { + "query" : "userId", + "operator" : "OR", + "prefix_length" : 0, + "max_expansions" : 50, + "fuzzy_transpositions" : true, + "lenient" : false, + "zero_terms_query" : "NONE", + "auto_generate_synonyms_phrase_query" : true, + "boost" : 1.0 + } + } + } + ], + "adjust_pure_negative" : true, + "boost" : 1.0 + } + } + }, + "task_resource_usages" : [ + { + "action" : "indices:data/read/search[phase/query]", + "taskId" : 28, + "parentTaskId" : 27, + "nodeId" : "BBgWzu8QR0qDkR0G45aw8w", + "taskResourceUsage" : { + "cpu_time_in_nanos" : 22664000, + "memory_in_bytes" : 6604536 + } + }, + { + "action" : "indices:data/read/search", + "taskId" : 27, + "parentTaskId" : -1, + "nodeId" : "BBgWzu8QR0qDkR0G45aw8w", + "taskResourceUsage" : { + "cpu_time_in_nanos" : 119000, + "memory_in_bytes" : 3920 + } + } + ], + "node_id" : "BBgWzu8QR0qDkR0G45aw8w", + "phase_latency_map" : { + "expand" : 0, + "query" : 23, + "fetch" : 0 + }, + "labels" : { + "X-Opaque-Id" : "query-label-1" + }, + "search_type" : "query_then_fetch", + "total_shards" : 1, + "measurements" : { + "memory" : { + "number" : 6608456, + "count" : 1, + "aggregationType" : "NONE" + }, + "latency" : { + "number" : 24, + "count" : 1, + "aggregationType" : "NONE" + }, + "cpu" : { + "number" : 22783000, + "count" : 1, + "aggregationType" : "NONE" + } + } + }, + { + "timestamp" : 1745021826937, + "id" : "86e161d0-e982-48c2-b8da-e3a3763f2e36", + "group_by" : "NONE", + "indices" : [ + "my-index-*" + ], + "source" : { + "size" : 20, + "query" : { + "term" : { + "user.id" : { + "value" : "userId", + "boost" : 1.0 + } + } + } + }, + "task_resource_usages" : [ + { + "action" : "indices:data/read/search[phase/query]", + "taskId" : 26, + "parentTaskId" : 25, + "nodeId" : "BBgWzu8QR0qDkR0G45aw8w", + "taskResourceUsage" : { + "cpu_time_in_nanos" : 11020000, + "memory_in_bytes" : 4292272 + } + }, + { + "action" : "indices:data/read/search", + 
"taskId" : 25, + "parentTaskId" : -1, + "nodeId" : "BBgWzu8QR0qDkR0G45aw8w", + "taskResourceUsage" : { + "cpu_time_in_nanos" : 1032000, + "memory_in_bytes" : 115816 + } + } + ], + "node_id" : "BBgWzu8QR0qDkR0G45aw8w", + "phase_latency_map" : { + "expand" : 0, + "query" : 15, + "fetch" : 1 + }, + "labels" : { }, + "search_type" : "query_then_fetch", + "total_shards" : 1, + "measurements" : { + "memory" : { + "number" : 4408088, + "count" : 1, + "aggregationType" : "NONE" + }, + "latency" : { + "number" : 23, + "count" : 1, + "aggregationType" : "NONE" + }, + "cpu" : { + "number" : 12052000, + "count" : 1, + "aggregationType" : "NONE" + } + } + } + ] +} ``` -{% include copy-curl.html %} + +</details> + +If your query returns no results, ensure that top N query monitoring is enabled for the target metric type and that search requests were made within the current [time window](#configuring-the-window-size). +{: .important} + +## Monitoring historical top N queries + +To query historical top N results, specify a time range with the `from` and `to` parameters in ISO 8601 format: `YYYY-MM-DD'T'HH:mm:ss.SSSZ`. +For example, to retrieve the top N queries from August 25, 2024, at 15:00 UTC to August 30, 2024, at 17:00 UTC, send the following request: ```json -GET /_insights/top_queries?type=memory +GET /_insights/top_queries?from=2024-08-25T15:00:00.000Z&to=2024-08-30T17:00:00.000Z ``` {% include copy-curl.html %} -If your query returns no results, ensure that top N query monitoring is enabled for the target metric type and that search requests were made within the current [time window](#configuring-the-window-size). +To view historical query data, the exporter type must be set to `local_index`. For more information, see [Configuring a local index exporter](#configuring-a-local-index-exporter). {: .important} ## Exporting top N query data @@ -107,13 +293,13 @@ You can configure your desired exporter to export top N query data to different ### Configuring a debug exporter -To configure a debug exporter, update the exporter setting for the desired metric type. For example, to export the top N queries by latency using the debug exporter, send the following request: +To use the debug exporter, set the exporter type to `debug`: ```json PUT _cluster/settings { "persistent" : { - "search.insights.top_queries.latency.exporter.type" : "debug" + "search.insights.top_queries.exporter.type" : "debug" } } ``` @@ -121,17 +307,48 @@ PUT _cluster/settings ### Configuring a local index exporter -A local index exporter allows you to export the top N queries to local OpenSearch indexes. The default index pattern for top N query indexes is `top_queries-YYYY.MM.dd`. All top queries from the same day are saved to the same index, and a new index is created each day. You can change the default index pattern to use other date formats. For more information about supported formats, see [DateTimeFormat](https://www.joda.org/joda-time/apidocs/org/joda/time/format/DateTimeFormat.html). +The default exporter is `local_index`. A local index exporter allows you to save top N query data to indexes that are automatically created in your OpenSearch domain. Query Insights creates these indexes following the naming pattern `top_queries-YYYY.MM.dd-hashcode`, where `hashcode` is a 5-digit number generated based on the current UTC date. A new index is created daily. For historical top N lookups using the Top Queries API or the Query Insights dashboard, you must enable the local index exporter. 
-To configure the local index exporter for the top N queries by latency, send the following request: +To use the local index exporter, set the exporter type to `local_index`: ```json PUT _cluster/settings { "persistent" : { - "search.insights.top_queries.latency.exporter.type" : "local_index", - "search.insights.top_queries.latency.exporter.config.index" : "YYYY.MM.dd" + "search.insights.top_queries.exporter.type" : "local_index" } } ``` {% include copy-curl.html %} + +Use the `delete_after_days` setting (integer) to specify the number of days after which local indexes are automatically deleted. Query Insights runs a job once per day at 00:05 UTC to delete top N local indexes older than the specified number of days. The default value for `delete_after_days` is 7, with valid values ranging from `1` to `180`. + +For example, to delete local indexes older than 10 days, send the following request: + +```json +PUT _cluster/settings +{ + "persistent" : { + "search.insights.top_queries.exporter.delete_after_days" : "10" + } +} +``` +{% include copy-curl.html %} + +## Excluding indexes from top N queries + +You can exclude search queries from the top N query list based on the indexes they target. This is useful when certain indexes are known to have long-running queries and don't need to be monitored. + +A query is excluded if it searches any shard that belongs to an index listed in `excluded_indices`. + +By default, this setting is `null` (all indexes are included). To exclude specific indexes, provide a comma-separated list of index names in the `search.insights.top_queries.excluded_indices` setting: + +```json +PUT _cluster/settings +{ + "persistent" : { + "search.insights.top_queries.excluded_indices" : "index-1,index-2,index-3" + } +} +``` +{% include copy-curl.html %} \ No newline at end of file diff --git a/_observing-your-data/trace/distributed-tracing.md b/_observing-your-data/trace/distributed-tracing.md index 4fb464f67c1..773b4dd34ae 100644 --- a/_observing-your-data/trace/distributed-tracing.md +++ b/_observing-your-data/trace/distributed-tracing.md @@ -1,6 +1,6 @@ --- layout: default -title: Distrbuted tracing +title: Distributed tracing parent: Trace Analytics nav_order: 65 --- diff --git a/_observing-your-data/trace/getting-started.md b/_observing-your-data/trace/getting-started.md index d1bffb70508..7dd9f39f67c 100644 --- a/_observing-your-data/trace/getting-started.md +++ b/_observing-your-data/trace/getting-started.md @@ -38,7 +38,7 @@ Download or clone the [Data Prepper repository](https://github.com/opensearch-pr - A single-node OpenSearch cluster (`opensearch`) - OpenSearch Dashboards (`opensearch-dashboards`). -Close the file and run `docker-compose up --build`. After the containers start, navigate to `http://localhost:8080` in a web browser. +Close the file and run `docker compose up --build`. After the containers start, navigate to `http://localhost:8080` in a web browser. 
![HotROD web interface]({{site.url}}{{site.baseurl}}/images/hot-rod.png) diff --git a/_observing-your-data/trace/ta-dashboards.md b/_observing-your-data/trace/ta-dashboards.md index c7ef2117ad9..0689262a2c6 100644 --- a/_observing-your-data/trace/ta-dashboards.md +++ b/_observing-your-data/trace/ta-dashboards.md @@ -10,7 +10,7 @@ redirect_from: # Trace Analytics plugin for OpenSearch Dashboards -The Trace Analytics plugin offers at-a-glance visibility into application performance based on [OpenTelemetry (OTel)](https://opentelemetry.io/) protocol data that standardizes instrumentation for collecting telemetry data from cloud-native software. +The Trace Analytics plugin offers at-a-glance visibility into application performance based on [OpenTelemetry (OTel)](https://opentelemetry.io/) protocol data that standardizes instrumentation for collecting telemetry data from cloud-native software. ## Installing the plugin @@ -20,26 +20,35 @@ See [Standalone OpenSearch Dashboards plugin install]({{site.url}}{{site.baseurl The [OpenTelemetry Demo with OpenSearch](https://github.com/opensearch-project/opentelemetry-demo) simulates a distributed application generating real-time telemetry data, providing you with a practical environment in which to explore features available with the Trace Analytics plugin before implementing it in your environment. +### Step 1: Set up the OpenTelemetry Demo -**Step 1: Set up the OpenTelemetry Demo** - - - Clone the [OpenTelemetry Demo with OpenSearch](https://github.com/opensearch-project/opentelemetry-demo) repository: `git clone https://github.com/opensearch-project/opentelemetry-demo`. - - Follow the [Getting Started](https://github.com/opensearch-project/opentelemetry-demo/blob/main/tutorial/GettingStarted.md) instructions to deploy the demo application using Docker, which runs multiple microservices generating telemetry data. +- Clone the [OpenTelemetry Demo with OpenSearch](https://github.com/opensearch-project/opentelemetry-demo) repository: `git clone https://github.com/opensearch-project/opentelemetry-demo`. +- Follow the [Getting Started](https://github.com/opensearch-project/opentelemetry-demo/tree/main?tab=readme-ov-file#running-this-demo) instructions to deploy the demo application using Docker, which runs multiple microservices generating telemetry data. -**Step 2: Ingest telemetry data** +### Step 2: Ingest telemetry data - - Configure the OTel collectors to send telemetry data (traces, metrics, logs) to your OpenSearch cluster, using the [preexisting setup](https://github.com/opensearch-project/opentelemetry-demo/tree/main/src/otelcollector). - - Confirm that [Data Prepper](https://github.com/opensearch-project/opentelemetry-demo/tree/main/src/dataprepper) is set up to process the incoming data, handle trace analytics and service map pipelines, submit data to required indexes, and perform preaggregated calculations. +- Configure the OTel collectors to send telemetry data (traces, metrics, logs) to your OpenSearch cluster, using the [preexisting setup](https://github.com/opensearch-project/opentelemetry-demo/tree/main/src/otelcollector). +- Confirm that [Data Prepper](https://github.com/opensearch-project/opentelemetry-demo/tree/main/src/dataprepper) is set up to process the incoming data, handle trace analytics and service map pipelines, submit data to required indexes, and perform preaggregated calculations. 
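+After the collectors and Data Prepper pipelines are running, you can check that trace data is reaching your cluster by listing the trace indexes. The following example assumes the default Data Prepper trace analytics index names (`otel-v1-apm-span-*` and `otel-v1-apm-service-map*`); adjust the pattern if your pipelines write to custom indexes:
+
+```json
+GET _cat/indices/otel-v1-apm-*?v
+```
+{% include copy-curl.html %}
+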
-**Step 3: Explore Trace Analytics in OpenSearch Dashboards** +### Step 3: Explore Trace Analytics in OpenSearch Dashboards The **Trace Analytics** application includes two options: **Services** and **Traces**: - - **Services** lists all services in the application, plus an interactive map that shows how the various services connect to each other. In contrast to the dashboard (which helps identify problems by operation), the **Service map** helps you identify problems by service based on error rates and latency. To access this option, go to **Trace Analytics** > **Services**. - - **Traces** groups traces together by HTTP method and path so that you can see the average latency, error rate, and trends associated with a particular operation. For a more focused view, try filtering by trace group name. To access this option, go to **Trace Analytics** > **Traces**. From the **Trace Groups** panel, you can review the traces that comprise a trace group. From the **Traces** panel you can analyze individual traces for a detailed summary. +- **Services** lists all services in the application and provides an interactive map that shows how the various services connect to each other. In contrast to the dashboard (which helps identify problems by operation), the **Service map** helps you identify problems by service based on error rates and latency. To access this option, go to **Trace Analytics** > **Services**. +- **Traces** groups traces together by HTTP method and path so that you can see the average latency, error rate, and trends associated with a particular operation. For a more focused view, try filtering by trace group name. To access this option, go to **Trace Analytics** > **Traces**. From the **Trace Groups** panel, you can review the traces in a trace group. From the **Traces** panel you can analyze individual traces to get a detailed summary. - **Step 4: Perform correlation analysis** - - Select **Services correlation** to display connections between various telemetry signals. This allows you to navigate from the logical service level to the associated metrics and logs for that specific service. +### Step 4: Perform correlation analysis + +Select **Services correlation** to display relationships between telemetry signals. This feature helps you navigate from the logical service level to associated metrics and logs for a specific service. + +The Trace Analytics plugin supports correlating spans, traces, and services with their corresponding logs. This lets you move directly from a trace or span to relevant log entries, or from a service to its correlated logs, within the Trace Analytics interface. Correlation streamlines troubleshooting by offering a unified view of telemetry data, making it easier to identify root causes and understand application context. + +Use the following options to perform correlation: + +- **Trace-to-log correlation**: On the trace details page, select **View associated logs**. +- **Span-to-log correlation**: In the span details flyout (opened by selecting a span ID in the Gantt chart or span table), select **View associated logs**. +- **Service-to-log correlation**: On the services page, select the **Discover** icon next to the desired service. +- **Service-to-service correlation**: On the services page, use the **Focus on** option in the service map to view a service and its dependencies. 
--- @@ -76,11 +85,12 @@ Certain fields, such as `serviceName`, must be present to perform correlation an ### Correlation indexes -Navigating from the service dialog to its corresponding traces or logs requires the existence of correlating fields and that the target indexes (for example, logs) follow the specified naming conventions, as described at [Simple Schema for Observability](https://opensearch.org/docs/latest/observing-your-data/ss4o/). +Navigating from the service dialog to its corresponding traces or logs requires the existence of correlating fields and that the target indexes (for example, logs) follow the specified naming conventions, as described at [Simple Schema for Observability]({{site.url}}{{site.baseurl}}/observing-your-data/ss4o/). --- ## Trace analytics with OTel protocol analytics + Introduced 2.15 {: .label .label-purple } @@ -88,17 +98,18 @@ Trace analytics with OTel protocol analytics provide comprehensive insights into - [Service](https://opentelemetry.io/docs/specs/semconv/resource/#service): The components of a distributed application. These components are significant logical terms used to measure and monitor the application's building blocks in order to validate the system's health. - [Traces](https://opentelemetry.io/docs/concepts/signals/traces/): A visual representation of a request's path across services into requests' journeys across services, offering insights into latency and performance issues. -- [RED metrics](https://opentelemetry.io/docs/specs/otel/metrics/api/): Metrics for service health and performance, measured as requests per second (rate), failed requests (errors), and request processing time (duration). +- [RED metrics](https://opentelemetry.io/docs/specs/otel/metrics/api/): Metrics for service health and performance, measured as requests per second (rate), failed requests (errors), and request processing time (duration). ### Trace analytics visualizations **Services** visualizations, such as a table or map, help you logically analyze service behavior and accuracy. The following visualizations can help you identify anomalies and errors: - **Services table** + - A RED indicator, along with connected upstream and downstream services and other actions, is indicated in each table column. An example **Services** table is shown in the following image. ![Services table]({{site.url}}{{site.baseurl}}/images/trace-analytics/services-table.png) - + - General-purpose filter selection is used for field or filter composition. The following image shows this filter. ![Services filter selection]({{site.url}}{{site.baseurl}}/images/trace-analytics/services-filter-selection.png) @@ -106,13 +117,13 @@ Trace analytics with OTel protocol analytics provide comprehensive insights into - The **Services** throughput tooltip provides an at-a-glance overview of a service's incoming request trend for the past 24 hours. The following image shows an example tooltip. ![Services throughput tooltip ]({{site.url}}{{site.baseurl}}/images/trace-analytics/service-throughput-tooltip.png) - + - The **Services** correlation dialog window provides an at-a-glance overview of a service's details, including its 24-hour throughput trend. You can use these details to analyze correlated logs or traces by filtering based on the `serviceName` field. The following image shows this window. 
- + ![Services correlation dialog window]({{site.url}}{{site.baseurl}}/images/trace-analytics/single-service-correlation-dialog.png) - The **Services** RED metrics dialog window provides an at-a-glance overview of a service's RED metrics indicators, including 24-hour error, duration, and throughput rate. The following image shows this window. - + ![Services RED metrics for duration]({{site.url}}{{site.baseurl}}/images/trace-analytics/single-service-RED-metrics.png) - The **Span details** dialog window provides the details of a trace. You can use this information to further analyze a trace's elements, such as attributes and associated logs. The following image shows this window. @@ -120,25 +131,68 @@ Trace analytics with OTel protocol analytics provide comprehensive insights into ![Services Span details dialog window]({{site.url}}{{site.baseurl}}/images/trace-analytics/span-details-fly-out.png) - **Service map** + - The **Service map** displays nodes, each representing a service. The node color indicates the RED indicator severity for that service and its dependencies. The following image shows a map. ![Services map tooltip]({{site.url}}{{site.baseurl}}/images/trace-analytics/service-details-tooltip.png) - + - You can select a node to open a detailed dialog window for its associated service. This interactive map visualizes service interconnections, helping identify problems by service, unlike dashboards that identify issues by operation. You can sort by error rate or latency to pinpoint potential problem areas. - In the **Service map** dialog window, nodes represent connected downstream services dependent on the selected service. The node color indicates the RED indicator severity for that service and its downstream dependencies. The following image shows this dialog window. - + ![Service map dialog window]({{site.url}}{{site.baseurl}}/images/trace-analytics/single-service-fly-out.png) - **Trace groups** + - Traces are grouped by their HTTP API name, allowing clustering based on their business functional unit. Traces are grouped by HTTP method and path, displaying the average latency, error rate, and trends associated with a particular operation. You can filter by trace group name. The following image shows the **Trace Groups** window. ![Trace Groups window]({{site.url}}{{site.baseurl}}/images/trace-analytics/trace-group-RED-metrics.png) - In the **Trace Groups** window, you can filter by group name and other filters. You can also analyze associated traces. To drill down on the traces that comprise a group, select the number of traces in the right-hand column and then choose an individual trace to see a detailed summary. - + ![Trace group dialog window]({{site.url}}{{site.baseurl}}/images/ta-dashboard.png) - The **Trace details** window displays a breakdown of a single trace, including its corresponding spans, associated service names, and a waterfall chart of the spans' time and duration interactions. The following image shows this view. - + ![Trace details window]({{site.url}}{{site.baseurl}}/images/ta-trace.png) + +## Support for custom index names and cross-cluster indexes + +Introduced 3.1 +{: .label .label-purple } + +Trace Analytics in OpenSearch 3.1 includes expanded support for custom index names and cross-cluster indexes, offering greater flexibility and scalability for distributed environments. The following enhancements are now available: + +- You can configure custom index names for Observability span, service, and log indexes. 
This allows you to align index naming with your organization's conventions and manage data across multiple environments more effectively. You can also configure correlated log indexes and map their corresponding fields for `timestamp`, `serviceName`, `spanId`, and `traceId`. This feature is particularly useful if your logs do not follow the OpenTelemetry (OTel) format and require custom field mappings. Custom span indexes must follow Data Prepper span index mappings. + + The following image shows the custom index name configuration interface in the Observability settings panel. + + ![Custom index name configuration UI]({{site.url}}{{site.baseurl}}/images/ta-index-settings.png) + +- The **Trace details** page now includes an associated logs panel, which helps you analyze logs correlated with specific traces to improve troubleshooting and root cause analysis. The following image shows the logs panel. + + ![Trace detail page with associated logs panel]({{site.url}}{{site.baseurl}}/images/ta-trace-logs-correlation.png) + +- A new dropdown menu lets you view all spans, root spans, service entry spans, or traces. The custom data grid provides advanced sorting and display options, including a full-screen mode for easier data exploration, as shown in the following image. + + ![Drop-down menu and custom data grid in Trace Analytics]({{site.url}}{{site.baseurl}}/images/ta-span-kind.png) + +- The service map now appears below the traces table on the **Trace Analytics** page, providing immediate visual context for service relationships and dependencies as you analyze trace data. + + ![Service map displayed below traces table]({{site.url}}{{site.baseurl}}/images/ta-traces-page.png) + +- The **Trace details** page features a new tree view that displays a hierarchical breakdown of spans. The layout has been updated to position the pie chart next to the overview panel for a more intuitive summary of trace metrics, as shown in the following image. + + ![Gantt chart with tree view and pie chart layout]({{site.url}}{{site.baseurl}}/images/ta-hierarchial-view.png) + +- The Gantt chart now includes a selectable mini-map above it, allowing you to quickly navigate to and focus on specific sections of the trace timeline, as shown in the following image. + + ![Gantt chart with selectable mini-map]({{site.url}}{{site.baseurl}}/images/ta-gantt-mini-map.png) + +- The service map has been redesigned to better support large node groups, making it easier to visualize complex service topologies. You can now focus on a specific service to view its dependencies and reset the map as needed, as shown in the following image. + + ![Redesigned service map with large node groups]({{site.url}}{{site.baseurl}}/images/ta-service-map-dependencies.png) + +- The service view table now includes more quick-select icons, allowing you to view correlated traces and logs in their corresponding views with the correct context passed and to view service details in context without leaving the page, as shown in the following image. + + ![Service table quick select icons]({{site.url}}{{site.baseurl}}/images/ta-service-table-icons.png) diff --git a/_query-dsl/compound/bool.md b/_query-dsl/compound/bool.md index 44790942148..e44588348ae 100644 --- a/_query-dsl/compound/bool.md +++ b/_query-dsl/compound/bool.md @@ -18,7 +18,7 @@ Use the following query clauses within a `bool` query: Clause | Behavior :--- | :--- `must` | Logical `and` operator. The results must match all queries in this clause. -`must_not` | Logical `not` operator. 
All matches are excluded from the results. +`must_not` | Logical `not` operator. All matches are excluded from the results. If `must_not` has multiple clauses, only documents that do not match any of those clauses are returned. For example, `"must_not":[{clause_A}, {clause_B}]` is equivalent to `NOT(A OR B)`. `should` | Logical `or` operator. The results must match at least one of the queries. Matching more `should` clauses increases the document's relevance score. You can set the minimum number of queries that must match using the [`minimum_should_match`]({{site.url}}{{site.baseurl}}/query-dsl/query-dsl/minimum-should-match/) parameter. If a query contains a `must` or `filter` clause, the default `minimum_should_match` value is 0. Otherwise, the default `minimum_should_match` value is 1. `filter` | Logical `and` operator that is applied first to reduce your dataset before applying the queries. A query within a filter clause is a yes or no option. If a document matches the query, it is returned in the results; otherwise, it is not. The results of a filter query are generally cached to allow for a faster return. Use the filter query to filter the results based on exact matches, ranges, dates, or numbers. diff --git a/_query-dsl/compound/hybrid.md b/_query-dsl/compound/hybrid.md index 69ce89ce173..af742d076e4 100644 --- a/_query-dsl/compound/hybrid.md +++ b/_query-dsl/compound/hybrid.md @@ -11,9 +11,9 @@ You can use a hybrid query to combine relevance scores from multiple queries int ## Example -Learn how to use the `hybrid` query by following the steps in [Using hybrid search]({{site.url}}{{site.baseurl}}/search-plugins/hybrid-search/#using-hybrid-search). +Learn how to use the `hybrid` query by following the steps in [Hybrid search]({{site.url}}{{site.baseurl}}/search-plugins/hybrid-search/). -For a comprehensive example, follow the [Neural search tutorial]({{site.url}}{{site.baseurl}}/ml-commons-plugin/semantic-search#tutorial). +For a comprehensive example, follow the [Getting started with semantic and hybrid search]({{site.url}}{{site.baseurl}}/ml-commons-plugin/semantic-search#tutorial). ## Parameters @@ -22,6 +22,7 @@ The following table lists all top-level parameters supported by `hybrid` queries Parameter | Description :--- | :--- `queries` | An array of one or more query clauses that are used to match documents. A document must match at least one query clause in order to be returned in the results. The documents' relevance scores from all query clauses are combined into one score by applying a [search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/). The maximum number of query clauses is 5. Required. +`filter` | A filter to apply to all the subqueries of the hybrid query. ## Disabling hybrid queries diff --git a/_query-dsl/full-text/match-bool-prefix.md b/_query-dsl/full-text/match-bool-prefix.md index 3964dc5ee80..6905d49989f 100644 --- a/_query-dsl/full-text/match-bool-prefix.md +++ b/_query-dsl/full-text/match-bool-prefix.md @@ -216,7 +216,7 @@ The `<field>` accepts the following parameters. All parameters except `query` ar Parameter | Data type | Description :--- | :--- | :--- `query` | String | The text, number, Boolean value, or date to use for search. Required. -`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. 
If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. +`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. For more information about `index.query.default_field`, see [Dynamic index-level index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#dynamic-index-level-index-settings). `fuzziness` | `AUTO`, `0`, or a positive integer | The number of character edits (insert, delete, substitute) that it takes to change one word to another when determining whether a term matched a value. For example, the distance between `wined` and `wind` is 1. The default, `AUTO`, chooses a value based on the length of each term and is a good choice for most use cases. `fuzzy_rewrite` | String | Determines how OpenSearch rewrites the query. Valid values are `constant_score`, `scoring_boolean`, `constant_score_boolean`, `top_terms_N`, `top_terms_boost_N`, and `top_terms_blended_freqs_N`. If the `fuzziness` parameter is not `0`, the query uses a `fuzzy_rewrite` method of `top_terms_blended_freqs_${max_expansions}` by default. Default is `constant_score`. `fuzzy_transpositions` | Boolean | Setting `fuzzy_transpositions` to `true` (default) adds swaps of adjacent characters to the insert, delete, and substitute operations of the `fuzziness` option. For example, the distance between `wind` and `wnid` is 1 if `fuzzy_transpositions` is true (swap "n" and "i") and 2 if it is false (delete "n", insert "n"). If `fuzzy_transpositions` is false, `rewind` and `wnid` have the same distance (2) from `wind`, despite the more human-centric opinion that `wnid` is an obvious typo. The default is a good choice for most use cases. diff --git a/_query-dsl/full-text/match-phrase.md b/_query-dsl/full-text/match-phrase.md index 747c4814d96..3f364657908 100644 --- a/_query-dsl/full-text/match-phrase.md +++ b/_query-dsl/full-text/match-phrase.md @@ -268,6 +268,6 @@ The `<field>` accepts the following parameters. All parameters except `query` ar Parameter | Data type | Description :--- | :--- | :--- `query` | String | The query string to use for search. Required. -`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. +`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. For more information about `index.query.default_field`, see [Dynamic index-level index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#dynamic-index-level-index-settings). `slop` | `0` (default) or a positive integer | Controls the degree to which words in a query can be misordered and still be considered a match. 
From the [Lucene documentation](https://lucene.apache.org/core/8_9_0/core/org/apache/lucene/search/PhraseQuery.html#getSlop--): "The number of other words permitted between words in query phrase. For example, to switch the order of two words requires two moves (the first move places the words atop one another), so to permit reorderings of phrases, the slop must be at least two. A value of zero requires an exact match." `zero_terms_query` | String | In some cases, the analyzer removes all terms from a query string. For example, the `stop` analyzer removes all terms from the string `an but this`. In those cases, `zero_terms_query` specifies whether to match no documents (`none`) or all documents (`all`). Valid values are `none` and `all`. Default is `none`. \ No newline at end of file diff --git a/_query-dsl/full-text/match.md b/_query-dsl/full-text/match.md index 056ef76890f..5ece14e127f 100644 --- a/_query-dsl/full-text/match.md +++ b/_query-dsl/full-text/match.md @@ -451,7 +451,7 @@ Parameter | Data type | Description :--- | :--- | :--- `query` | String | The query string to use for search. Required. `auto_generate_synonyms_phrase_query` | Boolean | Specifies whether to create a [match phrase query]({{site.url}}{{site.baseurl}}/query-dsl/full-text/match-phrase/) automatically for multi-term synonyms. For example, if you specify `ba,batting average` as synonyms and search for `ba`, OpenSearch searches for `ba OR "batting average"` (if this option is `true`) or `ba OR (batting AND average)` (if this option is `false`). Default is `true`. -`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. +`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. For more information about `index.query.default_field`, see [Dynamic index-level index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#dynamic-index-level-index-settings). `boost` | Floating-point | Boosts the clause by the given multiplier. Useful for weighing clauses in compound queries. Values in the [0, 1) range decrease relevance, and values greater than 1 increase relevance. Default is `1`. `enable_position_increments` | Boolean | When `true`, resulting queries are aware of position increments. This setting is useful when the removal of stop words leaves an unwanted "gap" between terms. Default is `true`. `fuzziness` | String | The number of character edits (insertions, deletions, substitutions, or transpositions) that it takes to change one word to another when determining whether a term matched a value. For example, the distance between `wined` and `wind` is 1. Valid values are non-negative integers or `AUTO`. The default, `AUTO`, chooses a value based on the length of each term and is a good choice for most use cases. 
diff --git a/_query-dsl/full-text/multi-match.md b/_query-dsl/full-text/multi-match.md index ab1496fdd3d..a3995df7141 100644 --- a/_query-dsl/full-text/multi-match.md +++ b/_query-dsl/full-text/multi-match.md @@ -900,9 +900,9 @@ Parameter | Data type | Description :--- | :--- | :--- `query` | String | The query string to use for search. Required. `auto_generate_synonyms_phrase_query` | Boolean | Specifies whether to create a [match phrase query]({{site.url}}{{site.baseurl}}/query-dsl/full-text/match-phrase/) automatically for multi-term synonyms. For example, if you specify `ba,batting average` as synonyms and search for `ba`, OpenSearch searches for `ba OR "batting average"` (if this option is `true`) or `ba OR (batting AND average)` (if this option is `false`). Default is `true`. -`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. +`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. For more information about `index.query.default_field`, see [Dynamic index-level index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#dynamic-index-level-index-settings). `boost` | Floating-point | Boosts the clause by the given multiplier. Useful for weighing clauses in compound queries. Values in the [0, 1) range decrease relevance, and values greater than 1 increase relevance. Default is `1`. -`fields` | Array of strings | The list of fields in which to search. If you don't provide the `fields` parameter, `multi_match` query searches the fields specified in the `index.query. Default_field` setting, which defaults to `*`. +`fields` | Array of strings | The list of fields in which to search. If you don't provide the `fields` parameter, `multi_match` query searches the fields specified in the `index.query.default_field` setting, which defaults to `*`. `fuzziness` | String | The number of character edits (insert, delete, substitute) that it takes to change one word to another when determining whether a term matched a value. For example, the distance between `wined` and `wind` is 1. Valid values are non-negative integers or `AUTO`. The default, `AUTO`, chooses a value based on the length of each term and is a good choice for most use cases. Not supported for `phrase`, `phrase_prefix`, and `cross_fields` queries. `fuzzy_rewrite` | String | Determines how OpenSearch rewrites the query. Valid values are `constant_score`, `scoring_boolean`, `constant_score_boolean`, `top_terms_N`, `top_terms_boost_N`, and `top_terms_blended_freqs_N`. If the `fuzziness` parameter is not `0`, the query uses a `fuzzy_rewrite` method of `top_terms_blended_freqs_${max_expansions}` by default. Default is `constant_score`. `fuzzy_transpositions` | Boolean | Setting `fuzzy_transpositions` to `true` (default) adds swaps of adjacent characters to the insert, delete, and substitute operations of the `fuzziness` option. For example, the distance between `wind` and `wnid` is 1 if `fuzzy_transpositions` is true (swap "n" and "i") and 2 if it is false (delete "n", insert "n"). 
If `fuzzy_transpositions` is false, `rewind` and `wnid` have the same distance (2) from `wind`, despite the more human-centric opinion that `wnid` is an obvious typo. The default is a good choice for most use cases. diff --git a/_query-dsl/full-text/query-string.md b/_query-dsl/full-text/query-string.md index 47180e3f6d4..7b3343155d7 100644 --- a/_query-dsl/full-text/query-string.md +++ b/_query-dsl/full-text/query-string.md @@ -623,7 +623,7 @@ Parameter | Data type | Description `query` | String | The text that may contain expressions in the [query string syntax](#query-string-syntax) to use for search. Required. `allow_leading_wildcard` | Boolean | Specifies whether `*` and `?` are allowed as first characters of a search term. Default is `true`. `analyze_wildcard` | Boolean | Specifies whether OpenSearch should attempt to analyze wildcard terms. Default is `false`. -`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. +`analyzer` | String | The [analyzer]({{site.url}}{{site.baseurl}}/analyzers/index/) used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. For more information about `index.query.default_field`, see [Dynamic index-level index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#dynamic-index-level-index-settings). `auto_generate_synonyms_phrase_query` | Boolean | Specifies whether to create a [match phrase query]({{site.url}}{{site.baseurl}}/query-dsl/full-text/match-phrase/) automatically for multi-term synonyms. For example, if you specify `ba, batting average` as synonyms and search for `ba`, OpenSearch searches for `ba OR "batting average"` (if this option is `true`) or `ba OR (batting AND average)` (if this option is `false`). Default is `true`. `boost` | Floating-point | Boosts the clause by the given multiplier. Useful for weighing clauses in compound queries. Values in the [0, 1) range decrease relevance, and values greater than 1 increase relevance. Default is `1`. `default_field` | String | The field in which to search if the field is not specified in the query string. Supports wildcards. Defaults to the value specified in the `index.query. Default_field` index setting. By default, the `index.query. Default_field` is `*`, which means extract all fields eligible for term query and filter the metadata fields. The extracted fields are combined into a query if the `prefix` is not specified. Eligible fields do not include nested documents. Searching all eligible fields could be a resource-intensive operation. The `indices.query.bool.max_clause_count` search setting defines the maximum value for the product of the number of fields and the number of terms that can be queried at one time. The default value for `indices.query.bool.max_clause_count` is 1,024. 
diff --git a/_query-dsl/full-text/simple-query-string.md b/_query-dsl/full-text/simple-query-string.md index 5dd2462e9ad..1624efdaa76 100644 --- a/_query-dsl/full-text/simple-query-string.md +++ b/_query-dsl/full-text/simple-query-string.md @@ -355,7 +355,7 @@ Parameter | Data type | Description :--- | :--- | :--- `query`| String | The text that may contain expressions in the [simple query string syntax](#simple-query-string-syntax) to use for search. Required. `analyze_wildcard` | Boolean | Specifies whether OpenSearch should attempt to analyze wildcard terms. Default is `false`. -`analyzer` | String | The analyzer used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. +`analyzer` | String | The analyzer used to tokenize the query string text. Default is the index-time analyzer specified for the `default_field`. If no analyzer is specified for the `default_field`, the `analyzer` is the default analyzer for the index. For more information about `index.query.default_field`, see [Dynamic index-level index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#dynamic-index-level-index-settings). `auto_generate_synonyms_phrase_query` | Boolean | Specifies whether to create [match_phrase queries]({{site.url}}{{site.baseurl}}/query-dsl/full-text/match/) automatically for multi-term synonyms. Default is `true`. `default_operator`| String | If the query string contains multiple search terms, whether all terms need to match (`AND`) or only one term needs to match (`OR`) for a document to be considered a match. Valid values are:<br>- `OR`: The string `to be` is interpreted as `to OR be`<br>- `AND`: The string `to be` is interpreted as `to AND be`<br> Default is `OR`. `fields` | String array | The list of fields to search (for example, `"fields": ["title^4", "description"]`). Supports wildcards. If unspecified, defaults to the `index.query.default_field` setting, which defaults to `["*"]`. The maximum number of fields that can be searched at the same time is defined by `indices.query.bool.max_clause_count`, which is 1,024 by default. diff --git a/_query-dsl/geo-and-xy/index.md b/_query-dsl/geo-and-xy/index.md index ee51e1e523b..9bcf6a94628 100644 --- a/_query-dsl/geo-and-xy/index.md +++ b/_query-dsl/geo-and-xy/index.md @@ -30,7 +30,7 @@ OpenSearch provides the following geographic query types: - [**Geo-bounding box queries**]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/geo-and-xy/geo-bounding-box/): Return documents with geopoint field values that are within a bounding box. - [**Geodistance queries**]({{site.url}}{{site.baseurl}}/query-dsl/geo-and-xy/geodistance/): Return documents with geopoints that are within a specified distance from the provided geopoint. -- [**Geopolygon queries**]({{site.url}}{{site.baseurl}}/query-dsl/geo-and-xy/geodistance/): Return documents containing geopoints that are within a polygon. +- [**Geopolygon queries**]({{site.url}}{{site.baseurl}}/query-dsl/geo-and-xy/geopolygon/): Return documents containing geopoints that are within a polygon. - [**Geoshape queries**]({{site.url}}{{site.baseurl}}/query-dsl/geo-and-xy/geoshape/): Return documents that contain: - Geoshapes and geopoints that have one of four spatial relations to the provided shape: `INTERSECTS`, `DISJOINT`, `WITHIN`, or `CONTAINS`. - - Geopoints that intersect the provided shape. 
\ No newline at end of file + - Geopoints that intersect the provided shape. diff --git a/_query-dsl/match-all.md index 274111f041d..f0774cf1c1f 100644 --- a/_query-dsl/match-all.md +++ b/_query-dsl/match-all.md @@ -28,4 +28,14 @@ GET _search + } +} +``` -{% include copy-curl.html %} \ No newline at end of file +{% include copy-curl.html %} + + +## Parameters + +Both the `match_all` and `match_none` queries accept the following parameters. All parameters are optional. + +Parameter | Data type | Description +:--- | :--- | :--- +`boost` | Floating-point | A floating-point value that specifies the weight of this query toward the relevance score. Values above 1.0 increase the relevance score. Values between 0.0 and 1.0 decrease the relevance score. Default is 1.0. +`_name` | String | The name of the query for query tagging. Optional. diff --git a/_query-dsl/regex-syntax.md new file mode 100644 index 00000000000..620ad26adb8 --- /dev/null +++ b/_query-dsl/regex-syntax.md @@ -0,0 +1,312 @@ +--- +layout: default +title: Regular expression syntax +nav_order: 100 +--- + +# Regular expression syntax + +A [regular expression](https://en.wikipedia.org/wiki/Regular_expression) (regex) is a way to define search patterns using special symbols and operators. These patterns let you match sequences of characters in strings. + +In OpenSearch, you can use regular expressions in the following query types: + +* [`regexp`]({{site.url}}{{site.baseurl}}/query-dsl/term/regexp/) +* [`query_string`]({{site.url}}{{site.baseurl}}/query-dsl/full-text/query-string/) + +OpenSearch uses the [Apache Lucene](https://lucene.apache.org/core/) regex engine, which has its own syntax and limitations. It does **not** use [Perl Compatible Regular Expressions (PCRE)](https://en.wikipedia.org/wiki/Perl_Compatible_Regular_Expressions), so some familiar regex features might behave differently or be unsupported. +{: .note} + +## Choosing between regexp and query_string queries + +Both `regexp` and `query_string` queries support regular expressions, but they behave differently and serve different use cases. + +| Feature | `regexp` query | `query_string` query | | ------------------------- | -------------------------------------------------- | --------------------------------------------------- | | Pattern matching | Regex pattern must match the entire field value | Regex pattern can match any part of the field | | `flags` support | `flags` enables optional regex operators | `flags` not supported | | Query type | Term-level query (not scored) | Full-text query (scored and parsed) | | Best use case | Strict pattern matching on keyword or exact fields | Search within analyzed fields using a flexible query string that supports regex patterns | | Complex query composition | Limited to regex patterns | Supports `AND`, `OR`, wildcards, fields, boosts, and other features. See [Query string query]({{site.url}}{{site.baseurl}}/query-dsl/full-text/query-string/). | + + +## Reserved characters + +Lucene's regex engine supports all Unicode characters. However, the following characters are treated as special operators: + +``` +. ?
+ * | { } [ ] ( ) " \ +``` + +Depending on the enabled `flags` that specify [optional operators](#optional-operators), the following characters may also be reserved: + +``` +@ & ~ < > +``` + +To match these characters literally, either escape them with a backslash (`\`) or wrap the entire string in double quotation marks: + +- `\&`: Matches a literal `&` +- `\\`: Matches a literal backslash (`\`) +- `"hello@world"`: Matches the full string `hello@world` + + +## Standard regex operators + +Lucene supports a core set of regex operators: + +- `.` – Matches any single character. **Example**: `f.n` matches `f` followed by any character and then `n` (for example, `fan` or `fin`). + +- `?` – Matches zero or one of the preceding characters. **Example**: `colou?r` matches `color` and `colour`. + +- `+` – Matches one or more of the preceding characters. **Example**: `go+` matches `g` followed by one or more `o`s (`go`, `goo`, `gooo`, and so on). + +- `*` – Matches zero or more of the preceding characters. **Example**: `lo*se` matches `l` followed by zero or more `o`s and then `se` (`lse`, `lose`, `loose`, `loooose`, and so on). + +- `{min,max}` – Matches a specific range of repetitions. If `max` is omitted, there is no upper limit on the number of characters matched. **Example**: `x{3}` matches exactly 3 `x`s (`xxx`); `x{2,4}` matches from 2 to 4 `x`s (`xx`, `xxx`, or `xxxx`); `x{3,}` matches 3 or more `x`s (`xxx`, `xxxx`, `xxxxx`, and so on). + +- `|` – Acts as a logical `OR`. **Example**: `apple|orange` matches `apple` or `orange`. + +- `( )` – Groups characters into a subpattern. **Example**: `ab(cd)?` matches `ab` and `abcd`. + +- `[ ]` – Matches one character from a set or range. **Example**: `[aeiou]` matches any vowel. + - `-` – When provided within the brackets, indicates a range unless it is escaped or is the first character within the brackets. **Example**: `[a-z]` matches any lowercase letter; `[-az]` matches `-`, `a`, or `z`; `[a\\-z]` matches `a`, `-`, or `z`. + - `^` – When provided within the brackets, acts as a logical `NOT`, negating a range of characters or any character in the set. **Example**: `[^az]` matches any character except `a` or `z`; `[^a-z]` matches any character except lowercase letters; `[^-az]` matches any character except `-`, `a`, and `z`; `[^a\\-z]` matches any character except `a`, `-`, and `z`. + + +## Optional operators + +You can enable additional regex operators using the `flags` parameter. Separate multiple flags with `|`. + +The following are the available flags: + +- `ALL` (default) – Enables all optional operators. + +{% comment %} +<!-- COMPLEMENT is deprecated and doesn't work. Leaving it here until https://github.com/opensearch-project/OpenSearch/issues/18397 is resolved. --> +{% endcomment %} + +- `COMPLEMENT` – Enables `~`, which negates the shortest following expression. **Example**: `d~ef` matches `dgf` and `dxf` but not `def`. + +- `INTERSECTION` – Enables `&` as a logical `AND` operator. **Example**: `ab.+&.+cd` matches strings containing `ab` at the beginning and `cd` at the end. + +- `INTERVAL` – Enables `<min-max>` syntax to match numeric ranges. **Example**: `id<10-12>` matches `id10`, `id11`, and `id12`. + +- `ANYSTRING` – Enables `@` to match any string. You can combine this with `~` and `&` for exclusions. **Example**: `@&.*error.*&.*[0-9]{3}.*` matches strings containing both the word "error" and a sequence of three digits.
+ +## Unsupported features + +Lucene's engine does not support the following commonly used regex anchors: + +- `^` – Start of line +- `$` – End of line + +Instead, your pattern must match the entire string to produce a match. + +## Example + +To try regular expressions, index the following documents into the `logs` index: + +```json +PUT /logs/_doc/1 +{ + "message": "error404" +} +``` +{% include copy-curl.html %} + +```json +PUT /logs/_doc/2 +{ + "message": "error500" +} +``` +{% include copy-curl.html %} + +```json +PUT /logs/_doc/3 +{ + "message": "error1a" +} +``` +{% include copy-curl.html %} + +### Example: Basic query containing regular expressions + +The following `regexp` query returns documents in which the entire value of the `message` field matches the pattern "error" followed by one or more digits. A value does not match if it only contains the pattern as a substring: + +```json +GET /logs/_search +{ + "query": { + "regexp": { + "message": { + "value": "error[0-9]+" + } + } + } +} +``` +{% include copy-curl.html %} + +This query matches `error404` and `error500`: + + +```json +{ + "took": 28, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "logs", + "_id": "1", + "_score": 1, + "_source": { + "message": "error404" + } + }, + { + "_index": "logs", + "_id": "2", + "_score": 1, + "_source": { + "message": "error500" + } + } + ] + } +} +``` + +### Example: Using optional operators + +The following query matches documents in which the `message` field exactly matches a string that starts with "error" followed by a number from 400 to 500, inclusive. The `INTERVAL` flag enables the use of `<min-max>` syntax for numeric ranges: + +```json +GET /logs/_search +{ + "query": { + "regexp": { + "message": { + "value": "error<400-500>", + "flags": "INTERVAL" + } + } + } +} +``` +{% include copy-curl.html %} + +This query matches `error404` and `error500`: + +```json +{ + "took": 22, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "logs", + "_id": "1", + "_score": 1, + "_source": { + "message": "error404" + } + }, + { + "_index": "logs", + "_id": "2", + "_score": 1, + "_source": { + "message": "error500" + } + } + ] + } +} +``` + +### Example: Using ANYSTRING + +When the `ANYSTRING` flag is enabled, the `@` operator matches an entire string. This is useful when combined with intersection (`&`) because it allows you to construct queries that match full strings under specific conditions. + +The following query matches messages that contain both the word "error" and a sequence of three digits. 
Use `ANYSTRING` to assert that the entire field must match the intersection of both patterns: + +```json +GET /logs/_search +{ + "query": { + "regexp": { + "message.keyword": { + "value": "@&.*error.*&.*[0-9]{3}.*", + "flags": "ANYSTRING|INTERSECTION" + } + } + } +} +``` +{% include copy-curl.html %} + +This query matches `error404` and `error500`: + +```json +{ + "took": 20, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "logs", + "_id": "1", + "_score": 1, + "_source": { + "message": "error404" + } + }, + { + "_index": "logs", + "_id": "2", + "_score": 1, + "_source": { + "message": "error500" + } + } + ] + } +} +``` + +Note that this query will also match `xerror500`, `error500x`, and `errorxx500`. \ No newline at end of file diff --git a/_query-dsl/span-query.md b/_query-dsl/span-query.md deleted file mode 100644 index 0527a00762c..00000000000 --- a/_query-dsl/span-query.md +++ /dev/null @@ -1,24 +0,0 @@ ---- -layout: default -title: Span queries -nav_order: 60 -redirect_from: - - /opensearch/query-dsl/span-query/ - - /query-dsl/query-dsl/span-query/ ---- - -# Span queries - -You can use span queries to perform precise positional searches. Span queries are low-level, specific queries that provide control over the order and proximity of specified query terms. They are primarily used to search legal documents and patents. - -Span queries include the following query types: - -- **Span containing**: Wraps a list of span queries and only returns spans that match a second span query. -- **Span field masking**: Combines `span_near` or `span_or` across different fields. -- **Span first**: Matches spans close to the beginning of the field. -- **Span multi-term**: Provides a wrapper around the following query types: `term`, `range`, `prefix`, `wildcard`, `regexp` or `fuzzy`. -- **Span near**: Matches spans that are near each other. Wraps multiple span queries that must match within the specified `slop` distance of each other, and optionally in the same order. Slop represents the maximum number of intervening unmatched positions and indicates whether matches are required to be returned in order. -- **Span not**: Provides a wrapper for another span query and excludes any documents that match the internal query. -- **Span or**: Provides a wrapper for multiple span queries and includes any documents that match any of the specified queries. -- **Span term**: Functions in the same way as a `term` query, but is designed to be used with other span queries. -- **Span within**: Used with other span queries to return a single span query if its span is within the spans that are returned by a list of other span queries. \ No newline at end of file diff --git a/_query-dsl/span/index.md b/_query-dsl/span/index.md new file mode 100644 index 00000000000..39c91e81350 --- /dev/null +++ b/_query-dsl/span/index.md @@ -0,0 +1,102 @@ +--- +layout: default +title: Span queries +has_children: true +has_toc: false +nav_order: 60 +redirect_from: + - /opensearch/query-dsl/span-query/ + - /query-dsl/query-dsl/span-query/ + - /query-dsl/span-query/ + - /query-dsl/span/ +--- + +# Span queries + +You can use span queries to perform precise positional searches. Span queries are low-level, specific queries that provide control over the order and proximity of specified query terms. They are primarily used to search legal documents and patents. 
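+
+The following minimal sketch shows the general shape of a span query. It assumes the example `clothing` index created in the [Setup](#setup) section later on this page and nests two `span_term` clauses inside a `span_near` clause, which controls their order and proximity:
+
+```json
+GET /clothing/_search
+{
+  "query": {
+    "span_near": {
+      "clauses": [
+        {
+          "span_term": {
+            "description": "dress"
+          }
+        },
+        {
+          "span_term": {
+            "description": "shirt"
+          }
+        }
+      ],
+      "slop": 0,
+      "in_order": true
+    }
+  }
+}
+```
+{% include copy-curl.html %}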
+
+Span queries include the following query types:
+
+- [**Span containing**]({{site.url}}{{site.baseurl}}/query-dsl/span/span-containing/): Returns larger spans that contain smaller spans within them. Useful for finding specific terms or phrases within a broader context. The opposite of the `span_within` query.
+
+- [**Span field masking**]({{site.url}}{{site.baseurl}}/query-dsl/span/span-field-masking/): Allows span queries to work across different fields by making one field appear as another. Particularly useful when the same text is indexed using different analyzers.
+
+- [**Span first**]({{site.url}}{{site.baseurl}}/query-dsl/span/span-first/): Matches terms or phrases that appear within a specified number of positions from the start of a field. Useful for finding content at the beginning of text.
+
+- [**Span multi-term**]({{site.url}}{{site.baseurl}}/query-dsl/span/span-multi-term/): Enables multi-term queries (like `prefix`, `wildcard`, or `fuzzy`) to work within span queries. Allows for more flexible matching patterns in span searches.
+
+- [**Span near**]({{site.url}}{{site.baseurl}}/query-dsl/span/span-near/): Finds terms or phrases that appear within a specified distance of each other. You can require matches to appear in a specific order and control how many words can appear between them.
+
+- [**Span not**]({{site.url}}{{site.baseurl}}/query-dsl/span/span-not/): Excludes matches that overlap with another span query. Useful for finding terms when they do not appear in specific phrases or contexts.
+
+- [**Span or**]({{site.url}}{{site.baseurl}}/query-dsl/span/span-or/): Matches documents that satisfy any of the provided span queries. Combines multiple span patterns with OR logic.
+
+- [**Span term**]({{site.url}}{{site.baseurl}}/query-dsl/span/span-term/): The basic building block for span queries. Matches a single term while maintaining position information for use in other span queries.
+
+- [**Span within**]({{site.url}}{{site.baseurl}}/query-dsl/span/span-within/): Returns smaller spans that are enclosed by larger spans. The opposite of the `span_containing` query.
+
+## Setup
+
+To try the examples in this section, use the following steps to configure an example index.
+
+### Step 1: Create an index
+
+First, create an index for an e-commerce clothing website. The `description` field uses the default `standard` analyzer, while the `description.stemmed` subfield applies the `english` analyzer to enable stemming:
+
+```json
+PUT /clothing
+{
+  "mappings": {
+    "properties": {
+      "description": {
+        "type": "text",
+        "analyzer": "standard",
+        "fields": {
+          "stemmed": {
+            "type": "text",
+            "analyzer": "english"
+          }
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+### Step 2: Index data
+
+Index sample documents into the index:
+
+```json
+POST /clothing/_doc/1
+{
+  "description": "Long-sleeved dress shirt with a formal collar and button cuffs. "
+}
+
+```
+{% include copy-curl.html %}
+
+```json
+POST /clothing/_doc/2
+{
+  "description": "Beautiful long dress in red silk, perfect for formal events."
+}
+```
+{% include copy-curl.html %}
+
+```json
+POST /clothing/_doc/3
+{
+  "description": "Short-sleeved shirt with a button-down collar, can be dressed up or down."
+}
+```
+{% include copy-curl.html %}
+
+```json
+POST /clothing/_doc/4
+{
+  "description": "A set of two midi silk shirt dresses with long fluttered sleeves in black. 
" +} +``` +{% include copy-curl.html %} diff --git a/_query-dsl/span/span-containing.md b/_query-dsl/span/span-containing.md new file mode 100644 index 00000000000..d12beb12d1b --- /dev/null +++ b/_query-dsl/span/span-containing.md @@ -0,0 +1,112 @@ +--- +layout: default +title: Span containing +parent: Span queries +grand_parent: Query DSL +nav_order: 10 +--- + +# Span containing query + +The `span_containing` query finds matches where a larger text pattern (like a phrase or a set of words) contains a smaller text pattern within its boundaries. Think of it as finding a word or phrase but only when it appears within a specific larger context. + +For example, you can use the `span_containing` query to perform the following searches: + +- Find the word "quick" but only when it appears in sentences that mention both foxes and behavior. +- Ensure that certain terms appear within the context of other terms---not just anywhere in the document. +- Search for specific words that appear within larger meaningful phrases. + +## Example + +To try the examples in this section, complete the [setup steps]({{site.url}}{{site.baseurl}}/query-dsl/span/#setup). +{: .tip} + +The following query searches for occurrences of the word "red" that appear within a larger span containing the words "silk" and "dress" (not necessarily in that order) within 5 words of each other: + +```json +GET /clothing/_search +{ + "query": { + "span_containing": { + "little": { + "span_term": { + "description": "red" + } + }, + "big": { + "span_near": { + "clauses": [ + { + "span_term": { + "description": "silk" + } + }, + { + "span_term": { + "description": "dress" + } + } + ], + "slop": 5, + "in_order": false + } + } + } + } +} +``` +{% include copy-curl.html %} + +The query matches document 1 because: + +- It finds a span in which "silk" and "dress" appear within at most 5 words of each other ("...dress in red silk..."). The terms "silk" and "dress" are within 2 words of each other (there are 2 words between them). +- Within this larger span, it finds the term "red". + +<details markdown="block"> + <summary> + Response + </summary> + {: .text-delta} + +```json +{ + "took": 4, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1.1577396, + "hits": [ + { + "_index": "clothing", + "_id": "2", + "_score": 1.1577396, + "_source": { + "description": "Beautiful long dress in red silk, perfect for formal events." + } + } + ] + } +} +``` + +</details> + +Both `little` and `big` parameters can contain any type of span query, allowing for complex nested span queries when needed. + +## Parameters + +The following table lists all top-level parameters supported by `span_containing` queries. All parameters are required. + +| Parameter | Data type | Description | +|:-----------|:------|:-------------| +| `little` | Object | The span query that must be contained within the `big` span. This defines the span you're searching for within a larger context. | +| `big` | Object | The containing span query that defines the boundaries within which the `little` span must appear. This establishes the context for your search. 
|
\ No newline at end of file
diff --git a/_query-dsl/span/span-field-masking.md
new file mode 100644
index 00000000000..78105b5038d
--- /dev/null
+++ b/_query-dsl/span/span-field-masking.md
@@ -0,0 +1,116 @@
+---
+layout: default
+title: Span field masking
+parent: Span queries
+grand_parent: Query DSL
+nav_order: 20
+---
+
+# Span field masking query
+
+The `field_masking_span` query allows span queries to match across different fields by "masking" the true field of a query. This is particularly useful when working with multi-fields (the same content indexed with different analyzers) or when you need to run span queries like `span_near` or `span_or` across different fields (which is normally not allowed).
+
+For example, you can use the `field_masking_span` query to:
+- Match terms across a raw field and its stemmed version.
+- Combine span queries on different fields in a single span operation.
+- Work with the same content indexed using different analyzers.
+
+When using field masking, the relevance score is calculated using the characteristics (norms) of the masked field rather than the actual field being searched. This means that if the masked field has different properties (like length or boost values) than the field being searched, you might receive unexpected scoring results.
+{: .note}
+
+## Example
+
+To try the examples in this section, complete the [setup steps]({{site.url}}{{site.baseurl}}/query-dsl/span/#setup).
+{: .tip}
+
+The following query searches for the word "long" near variations of the word "sleeve" in the stemmed field:
+
+```json
+GET /clothing/_search
+{
+  "query": {
+    "span_near": {
+      "clauses": [
+        {
+          "span_term": {
+            "description": "long"
+          }
+        },
+        {
+          "field_masking_span": {
+            "query": {
+              "span_term": {
+                "description.stemmed": "sleev"
+              }
+            },
+            "field": "description"
+          }
+        }
+      ],
+      "slop": 1,
+      "in_order": true
+    }
+  }
+}
+
+```
+{% include copy-curl.html %}
+
+The query matches documents 1 and 4:
+- The term "long" appears in the `description` field in both documents.
+- Document 1 contains the word "sleeved", and document 4 contains the word "sleeves".
+- The `field_masking_span` makes the stemmed field match appear as if it were in the raw field.
+- The terms appear within 1 position of each other in the specified order ("long" must appear before "sleeve").
+
+<details markdown="block">
+  <summary>
+    Response
+  </summary>
+  {: .text-delta}
+
+```json
+{
+  "took": 7,
+  "timed_out": false,
+  "_shards": {
+    "total": 1,
+    "successful": 1,
+    "skipped": 0,
+    "failed": 0
+  },
+  "hits": {
+    "total": {
+      "value": 2,
+      "relation": "eq"
+    },
+    "max_score": 0.7444251,
+    "hits": [
+      {
+        "_index": "clothing",
+        "_id": "1",
+        "_score": 0.7444251,
+        "_source": {
+          "description": "Long-sleeved dress shirt with a formal collar and button cuffs. "
+        }
+      },
+      {
+        "_index": "clothing",
+        "_id": "4",
+        "_score": 0.4291246,
+        "_source": {
+          "description": "A set of two midi silk shirt dresses with long fluttered sleeves in black. "
+        }
+      }
+    ]
+  }
+}
+```
+
+</details>
+
+## Parameters
+
+The following table lists all top-level parameters supported by `field_masking_span` queries. All parameters are required.
+
+| Parameter | Data type | Description |
+|:----------|:-----|:------------|
+| `query` | Object | The span query to execute on the actual field. |
+| `field` | String | The field name used to mask the query. Other span queries will treat this query as if it were executing on this field. 
| diff --git a/_query-dsl/span/span-first.md b/_query-dsl/span/span-first.md new file mode 100644 index 00000000000..7192157d59c --- /dev/null +++ b/_query-dsl/span/span-first.md @@ -0,0 +1,102 @@ +--- +layout: default +title: Span first +parent: Span queries +grand_parent: Query DSL +nav_order: 30 +--- + +# Span first query + +The `span_first` query matches spans that begin at the start of a field and end within a specified number of positions. This query is useful when you want to find terms or phrases that appear near the beginning of a document. + +For example, you can use the `span_first` query to perform the following searches: + +- Find documents in which specific terms appear in the first few words of a field. +- Ensure certain phrases occur at or near the beginning of a text +- Match patterns only when they appear within a specified distance from the start + +## Example + +To try the examples in this section, complete the [setup steps]({{site.url}}{{site.baseurl}}/query-dsl/span/#setup). +{: .tip} + +The following query searches for the stemmed word "dress" appearing within the first 4 positions of the description: + +```json +GET /clothing/_search +{ + "query": { + "span_first": { + "match": { + "span_term": { + "description.stemmed": "dress" + } + }, + "end": 4 + } + } +} +``` +{% include copy-curl.html %} + +The query matches documents 1 and 2: +- Documents 1 and 2 contain the word `dress` at the third position ("Long-sleeved dress..." and "Beautiful long dress"). Indexing of the words starts with 0, so the word "dress" is at position 2. +- The position of the word `dress` must be less than `4`, as specified by the `end` parameter. + +<details markdown="block"> + <summary> + Response + </summary> + {: .text-delta} + +```json +{ + "took": 13, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 0.110377684, + "hits": [ + { + "_index": "clothing", + "_id": "1", + "_score": 0.110377684, + "_source": { + "description": "Long-sleeved dress shirt with a formal collar and button cuffs. " + } + }, + { + "_index": "clothing", + "_id": "2", + "_score": 0.110377684, + "_source": { + "description": "Beautiful long dress in red silk, perfect for formal events." + } + } + ] + } +} +``` + +</details> + +The `match` parameter can contain any type of span query, allowing for more complex patterns to be matched at the beginning of fields. + +## Parameters + +The following table lists all top-level parameters supported by `span_first` queries. All parameters are required. + +| Parameter | Data type | Description | +|:----------|:-----|:------------| +| `match` | Object | The span query to match. This defines the pattern you're searching for at the start of the field. | +| `end` | Integer | The maximum end position (exclusive) allowed for the span query match. For example, `end: 4` matches terms at positions 0--3. | diff --git a/_query-dsl/span/span-multi-term.md b/_query-dsl/span/span-multi-term.md new file mode 100644 index 00000000000..aff0057fe4a --- /dev/null +++ b/_query-dsl/span/span-multi-term.md @@ -0,0 +1,134 @@ +--- +layout: default +title: Span multi-term +parent: Span queries +grand_parent: Query DSL +nav_order: 40 +--- + +# Span multi-term query + +The `span_multi` query allows you to wrap a multi-term query (like `wildcard`, `fuzzy`, `prefix`, `range`, or `regexp`) as a span query. 
+ This enables you to use these more flexible matching queries within other span queries.
+
+For example, you can use the `span_multi` query to:
+- Find words with common prefixes near other terms.
+- Match fuzzy variations of words within spans.
+- Use regular expressions in span queries.
+
+>`span_multi` queries can potentially match many terms. To avoid excessive memory usage, you can:
+>- Set the `rewrite` parameter for the multi-term query.
+>- Use the `top_terms_*` rewrite method.
+>- Consider enabling the `index_prefixes` option for the text field if you use `span_multi` only for a `prefix` query. This automatically rewrites any `prefix` query on the field into a single-term query that matches the indexed prefix.
+{: .note}
+
+## Example
+
+To try the examples in this section, complete the [setup steps]({{site.url}}{{site.baseurl}}/query-dsl/span/#setup).
+{: .tip}
+
+The `span_multi` query uses the following syntax to wrap the `prefix` query:
+
+```json
+"span_multi": {
+  "match": {
+    "prefix": {
+      "description": {
+        "value": "flutter"
+      }
+    }
+  }
+}
+```
+
+The following query searches for words starting with "dress" near any form of "sleeve" within at most 5 words of each other:
+
+```json
+GET /clothing/_search
+{
+  "query": {
+    "span_near": {
+      "clauses": [
+        {
+          "span_multi": {
+            "match": {
+              "prefix": {
+                "description": {
+                  "value": "dress"
+                }
+              }
+            }
+          }
+        },
+        {
+          "field_masking_span": {
+            "query": {
+              "span_term": {
+                "description.stemmed": "sleev"
+              }
+            },
+            "field": "description"
+          }
+        }
+      ],
+      "slop": 5,
+      "in_order": false
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+The query matches documents 1 ("Long-sleeved dress...") and 4 ("...dresses with long fluttered sleeves...") because a word starting with "dress" and a form of "sleeve" occur within the maximum distance of each other in both documents.
+
+<details markdown="block">
+  <summary>
+    Response
+  </summary>
+  {: .text-delta}
+
+```json
+{
+  "took": 5,
+  "timed_out": false,
+  "_shards": {
+    "total": 1,
+    "successful": 1,
+    "skipped": 0,
+    "failed": 0
+  },
+  "hits": {
+    "total": {
+      "value": 2,
+      "relation": "eq"
+    },
+    "max_score": 1.7590723,
+    "hits": [
+      {
+        "_index": "clothing",
+        "_id": "1",
+        "_score": 1.7590723,
+        "_source": {
+          "description": "Long-sleeved dress shirt with a formal collar and button cuffs. "
+        }
+      },
+      {
+        "_index": "clothing",
+        "_id": "4",
+        "_score": 0.84792376,
+        "_source": {
+          "description": "A set of two midi silk shirt dresses with long fluttered sleeves in black. "
+        }
+      }
+    ]
+  }
+}
+```
+</details>
+
+## Parameters
+
+The following table lists all top-level parameters supported by `span_multi` queries. All parameters are required.
+
+| Parameter | Data type | Description |
+|:----------|:-----|:------------|
+| `match` | Object | The multi-term query to wrap (can be `prefix`, `wildcard`, `fuzzy`, `range`, or `regexp`). |
diff --git a/_query-dsl/span/span-near.md
new file mode 100644
index 00000000000..6a103cda1bb
--- /dev/null
+++ b/_query-dsl/span/span-near.md
@@ -0,0 +1,104 @@
+---
+layout: default
+title: Span near
+parent: Span queries
+grand_parent: Query DSL
+nav_order: 50
+---
+
+# Span near query
+
+The `span_near` query matches spans that are near one another. You can specify how far apart the spans can be and whether they need to appear in a specific order.
+
+For example, you can use the `span_near` query to:
+- Find terms that appear within a certain distance of each other.
+- Match phrases in which words appear in a specific order.
+- Find related concepts that appear close to each other in text.
+
+## Example
+
+To try the examples in this section, complete the [setup steps]({{site.url}}{{site.baseurl}}/query-dsl/span/#setup).
+{: .tip}
+
+The following query searches for any forms of "sleeve" and "long" appearing within one word of each other, in any order:
+
+```json
+GET /clothing/_search
+{
+  "query": {
+    "span_near": {
+      "clauses": [
+        {
+          "span_term": {
+            "description.stemmed": "sleev"
+          }
+        },
+        {
+          "span_term": {
+            "description.stemmed": "long"
+          }
+        }
+      ],
+      "slop": 1,
+      "in_order": false
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+The query matches documents 1 ("Long-sleeved...") and 4 ("...long fluttered sleeves..."). In document 1, the words are next to each other, while in document 4, they are within the specified slop distance of `1` (there is 1 word between them).
+
+<details markdown="block">
+  <summary>
+    Response
+  </summary>
+  {: .text-delta}
+
+```json
+{
+  "took": 3,
+  "timed_out": false,
+  "_shards": {
+    "total": 1,
+    "successful": 1,
+    "skipped": 0,
+    "failed": 0
+  },
+  "hits": {
+    "total": {
+      "value": 2,
+      "relation": "eq"
+    },
+    "max_score": 0.36496973,
+    "hits": [
+      {
+        "_index": "clothing",
+        "_id": "1",
+        "_score": 0.36496973,
+        "_source": {
+          "description": "Long-sleeved dress shirt with a formal collar and button cuffs. "
+        }
+      },
+      {
+        "_index": "clothing",
+        "_id": "4",
+        "_score": 0.25312424,
+        "_source": {
+          "description": "A set of two midi silk shirt dresses with long fluttered sleeves in black. "
+        }
+      }
+    ]
+  }
+}
+```
+
+</details>
+
+## Parameters
+
+The following table lists all top-level parameters supported by `span_near` queries.
+
+| Parameter | Data type | Description |
+|:----------|:-----|:------------|
+| `clauses` | Array | An array of span queries that define the terms or phrases to match. All specified terms must appear within the defined slop distance. Required. |
+| `slop` | Integer | The maximum number of intervening unmatched positions between spans. Required. |
+| `in_order` | Boolean | Whether spans need to appear in the same order as in the `clauses` array. Optional. Default is `false`. |
diff --git a/_query-dsl/span/span-not.md
new file mode 100644
index 00000000000..6b5b5d64802
--- /dev/null
+++ b/_query-dsl/span/span-not.md
@@ -0,0 +1,82 @@
+---
+layout: default
+title: Span not
+parent: Span queries
+grand_parent: Query DSL
+nav_order: 60
+---
+
+# Span not query
+
+The `span_not` query excludes spans that overlap with another span query. You can also specify the distance before or after the excluded spans within which matches cannot occur.
+
+For example, you can use the `span_not` query to:
+- Find terms except when they appear in certain phrases.
+- Match spans unless they are near specific terms.
+- Exclude matches that occur within a certain distance of other patterns.
+
+## Example
+
+To try the examples in this section, complete the [setup steps]({{site.url}}{{site.baseurl}}/query-dsl/span/#setup).
+{: .tip} + +The following query searches for the word "dress" but not when it appears in the phrase "dress shirt": + +```json +GET /clothing/_search +{ + "query": { + "span_not": { + "include": { + "span_term": { + "description": "dress" + } + }, + "exclude": { + "span_near": { + "clauses": [ + { + "span_term": { + "description": "dress" + } + }, + { + "span_term": { + "description": "shirt" + } + } + ], + "slop": 0, + "in_order": true + } + } + } + } +} +``` +{% include copy-curl.html %} + +The query matches document 2 because it contains the word "dress" ("Beautiful long dress..."). Document 1 is not matched because it contains the phrase "dress shirt", which is excluded. Documents 3 and 4 are not matched because they contain variations of the word "dress" ("dressed" and "dresses"), and the query is searching the raw field. + +<details markdown="block"> + <summary> + Response + </summary> + {: .text-delta} + +```json + +``` +</details> + +## Parameters + +The following table lists all top-level parameters supported by `span_not` queries. + +| Parameter | Data type | Description | +|:----------|:-----|:------------| +| `include` | Object | The span query whose matches you want to find. Required. | +| `exclude` | Object | The span query whose matches should be excluded. Required. | +| `pre` | Integer | Specifies that the `exclude` span cannot appear within the given number of token positions before the `include` span. Optional. Default is `0`. | +| `post` | Integer | Specifies that the `exclude` span cannot appear within the given number of token positions after the `include` span. Optional. Default is `0`. | +| `dist` | Integer | Equivalent to setting both `pre` and `post` to the same value. Optional. | diff --git a/_query-dsl/span/span-or.md b/_query-dsl/span/span-or.md new file mode 100644 index 00000000000..706623a56fc --- /dev/null +++ b/_query-dsl/span/span-or.md @@ -0,0 +1,127 @@ +--- +layout: default +title: Span or +parent: Span queries +grand_parent: Query DSL +nav_order: 70 +--- + +# Span or query + +The `span_or` query combines multiple span queries and matches the union of their spans. A match occurs if at least one of the contained span queries matches. + +For example, you can use the `span_or` query to: +- Find spans matching any of several patterns. +- Combine different span patterns as alternatives. +- Match multiple span variations in a single query. + +## Example + +To try the examples in this section, complete the [setup steps]({{site.url}}{{site.baseurl}}/query-dsl/span/#setup). +{: .tip} + +The following query searches for either "formal collar" or "button collar" appearing within 2 words of each other: + +```json +GET /clothing/_search +{ + "query": { + "span_or": { + "clauses": [ + { + "span_near": { + "clauses": [ + { + "span_term": { + "description": "formal" + } + }, + { + "span_term": { + "description": "collar" + } + } + ], + "slop": 0, + "in_order": true + } + }, + { + "span_near": { + "clauses": [ + { + "span_term": { + "description": "button" + } + }, + { + "span_term": { + "description": "collar" + } + } + ], + "slop": 2, + "in_order": true + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +The query matches documents 1 ("...formal collar...") and 3 ("...button-down collar...") within the specified slop distance. 
+ +<details markdown="block"> + <summary> + Response + </summary> + {: .text-delta} + +```json +{ + "took": 4, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 2.170027, + "hits": [ + { + "_index": "clothing", + "_id": "1", + "_score": 2.170027, + "_source": { + "description": "Long-sleeved dress shirt with a formal collar and button cuffs. " + } + }, + { + "_index": "clothing", + "_id": "3", + "_score": 1.2509141, + "_source": { + "description": "Short-sleeved shirt with a button-down collar, can be dressed up or down." + } + } + ] + } +} +``` +</details> + +## Parameters + +The following table lists all top-level parameters supported by `span_or` queries. + +| Parameter | Data type | Description | +|:----------|:-----|:------------| +| `clauses` | Array | The array of span queries to match. The query matches if any of these span queries match. Must contain at least one span query. Required. | diff --git a/_query-dsl/span/span-term.md b/_query-dsl/span/span-term.md new file mode 100644 index 00000000000..95f1fd31abe --- /dev/null +++ b/_query-dsl/span/span-term.md @@ -0,0 +1,123 @@ +--- +layout: default +title: Span term +parent: Span queries +grand_parent: Query DSL +nav_order: 80 +--- + +# Span term query + +The `span_term` query is the most basic span query that matches spans containing a single term. It serves as a building block for more complex span queries. + +For example, you can use the `span_term` query to: +- Find exact term matches that can be used in other span queries. +- Match specific words while maintaining position information. +- Create basic spans that can be combined with other span queries. + +## Example + +To try the examples in this section, complete the [setup steps]({{site.url}}{{site.baseurl}}/query-dsl/span/#setup). +{: .tip} + +The following query searches for the exact term "formal": + +```json +GET /clothing/_search +{ + "query": { + "span_term": { + "description": "formal" + } + } +} +``` +{% include copy-curl.html %} + +Alternatively, you can specify the search term in the `value` parameter: + +```json +GET /clothing/_search +{ + "query": { + "span_term": { + "description": { + "value": "formal" + } + } + } +} +``` +{% include copy-curl.html %} + +You can also specify a `boost` value in order to boost the document score: + +```json +GET /clothing/_search +{ + "query": { + "span_term": { + "description": { + "value": "formal", + "boost": 2 + } + } + } +} +``` +{% include copy-curl.html %} + +The query matches documents 1 and 2 because they contain the exact term "formal". Position information is preserved for use in other span queries. + +<details markdown="block"> + <summary> + Response + </summary> + {: .text-delta} + +```json +{ + "took": 2, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 1.498922, + "hits": [ + { + "_index": "clothing", + "_id": "2", + "_score": 1.498922, + "_source": { + "description": "Beautiful long dress in red silk, perfect for formal events." + } + }, + { + "_index": "clothing", + "_id": "1", + "_score": 1.4466847, + "_source": { + "description": "Long-sleeved dress shirt with a formal collar and button cuffs. " + } + } + ] + } +} +``` +</details> + +## Parameters + +The following table lists all top-level parameters supported by `span_term` queries. 
+ +| Parameter | Data type | Description | +|:----------------|:------------|:--------| +| `<field>` | String or object | The name of the field in which to search. | diff --git a/_query-dsl/span/span-within.md b/_query-dsl/span/span-within.md new file mode 100644 index 00000000000..4f6b740ec10 --- /dev/null +++ b/_query-dsl/span/span-within.md @@ -0,0 +1,107 @@ +--- +layout: default +title: Span within +parent: Span queries +grand_parent: Query DSL +nav_order: 90 +--- + +# Span within query + +The `span_within` query matches spans that are enclosed by another span query. It is the opposite of [`span_containing`]({{site.url}}{{site.baseurl}}/query-dsl/span/span-containing/): `span_containing` returns larger spans containing smaller ones, whereas `span_within` returns smaller spans enclosed by larger ones. + +For example, you can use the `span_within` query to: +- Find shorter phrases that appear within longer phrases. +- Match terms that occur within specific contexts. +- Identify smaller patterns enclosed by larger patterns. + +## Example + +To try the examples in this section, complete the [setup steps]({{site.url}}{{site.baseurl}}/query-dsl/span/#setup). +{: .tip} + +The following query searches for the word "dress" when it appears within a span containing "shirt" and "long": + +```json +GET /clothing/_search +{ + "query": { + "span_within": { + "little": { + "span_term": { + "description": "dress" + } + }, + "big": { + "span_near": { + "clauses": [ + { + "span_term": { + "description": "shirt" + } + }, + { + "span_term": { + "description": "long" + } + } + ], + "slop": 2, + "in_order": false + } + } + } + } +} +``` +{% include copy-curl.html %} + +The query matches document 1 because: +- The word "dress" appears within a larger span ("Long-sleeved dress shirt..."). +- The larger span contains "shirt" and "long" within 2 words of each other (there are 2 words between them). + +<details markdown="block"> + <summary> + Response + </summary> + {: .text-delta} + +```json +{ + "took": 3, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1.4677674, + "hits": [ + { + "_index": "clothing", + "_id": "1", + "_score": 1.4677674, + "_source": { + "description": "Long-sleeved dress shirt with a formal collar and button cuffs. " + } + } + ] + } +} +``` +</details> + +## Parameters + +The following table lists all top-level parameters supported by `span_within` queries. All parameters are required. + +| Parameter | Data type | Description | +|:----------|:-----|:------------| +| `little` | Object | The span query that must be contained within the `big` span. This defines the span you're searching for within a larger context. | +| `big` | Object | The containing span query that defines the boundaries within which the `little` span must appear. This establishes the context for your search. | \ No newline at end of file diff --git a/_query-dsl/specialized/distance-feature.md b/_query-dsl/specialized/distance-feature.md new file mode 100644 index 00000000000..d5980907a8d --- /dev/null +++ b/_query-dsl/specialized/distance-feature.md @@ -0,0 +1,223 @@ +--- +layout: default +title: Distance feature +parent: Specialized queries +nav_order: 5 +has_math: true +--- + +# Distance feature query + +Use the `distance_feature` query to boost the relevance of documents that are closer to a specific date or geographic point. 
This can help you prioritize more recent or nearby content in your search results. For example, you can assign more weight to products manufactured more recently or boost items closest to a user-specified location. + +You can apply this query to fields containing date or location data. It's commonly used within a `bool` query `should` clause to improve relevance scoring without filtering out results. + +## Configuring the index + +Before using the `distance_feature` query, ensure that your index contains at least one of the following field types: + +- [`date`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/date/) +- [`date_nanos`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/date-nanos/) +- [`geo_point`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/geo-point/) + +In this example, you'll configure the `opening_date` and `coordinates` fields that you can use to run distance feature queries: + +```json +PUT /stores +{ + "mappings": { + "properties": { + "opening_date": { + "type": "date" + }, + "coordinates": { + "type": "geo_point" + } + } + } +} +``` +{% include copy-curl.html %} + +Add sample documents to the index: + +```json +PUT /stores/_doc/1 +{ + "store_name": "Green Market", + "opening_date": "2025-03-10", + "coordinates": [74.00, 40.70] +} +``` +{% include copy-curl.html %} + +```json +PUT /stores/_doc/2 +{ + "store_name": "Fresh Foods", + "opening_date": "2025-04-01", + "coordinates": [73.98, 40.75] +} +``` +{% include copy-curl.html %} + +```json +PUT /stores/_doc/3 +{ + "store_name": "City Organics", + "opening_date": "2021-04-20", + "coordinates": [74.02, 40.68] +} +``` +{% include copy-curl.html %} + +## Example: Boost scores based on recency + +The following query searches for documents with a `store_name` matching `market` and boosts recently opened stores: + +```json +GET /stores/_search +{ + "query": { + "bool": { + "must": { + "match": { + "store_name": "market" + } + }, + "should": { + "distance_feature": { + "field": "opening_date", + "origin": "2025-04-07", + "pivot": "10d" + } + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains the matching document: + +```json +{ + "took": 4, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1.2372394, + "hits": [ + { + "_index": "stores", + "_id": "1", + "_score": 1.2372394, + "_source": { + "store_name": "Green Market", + "opening_date": "2025-03-10", + "coordinates": [ + 74, + 40.7 + ] + } + } + ] + } +} +``` + +### Example: Boost scores based on geographic proximity + +The following query searches for documents with a `store_name` matching `market` and boosts results closer to the given origin point: + +```json +GET /stores/_search +{ + "query": { + "bool": { + "must": { + "match": { + "store_name": "market" + } + }, + "should": { + "distance_feature": { + "field": "coordinates", + "origin": [74.00, 40.71], + "pivot": "500m" + } + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains the matching document: + +```json +{ + "took": 3, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1.2910118, + "hits": [ + { + "_index": "stores", + "_id": "1", + "_score": 1.2910118, + "_source": { + "store_name": "Green Market", + "opening_date": "2025-03-10", + "coordinates": [ + 74, + 40.7 + 
] + } + } + ] + } +} +``` + +## Parameters + +The following table lists all top-level parameters supported by `distance_feature` queries. + +| Parameter | Required/Optional | Description | +|-----------|-------------------|-------------| +| `field` | Required | The name of the field used to calculate distances. Must be a `date`, `date_nanos`, or `geo_point` field with `index: true` (default) and `doc_values: true` (default). | +| `origin` | Required | The point of origin used to calculate distances. Use a [date]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/date/) or [date math expression]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/date/#date-math) (for example, `now-1h`) for `date` fields or a [geopoint]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/geo-point/) for `geo_point` fields. | +| `pivot` | Required | The distance from the `origin` at which scores receive half of the `boost` value. Use a time unit (for example, `10d`) for date fields or a distance unit (for example, `1km`) for geographic fields. For more information, see [Units]({{site.url}}{{site.baseurl}}/api-reference/common-parameters/#units).| +| `boost` | Optional | A multiplier for the relevance score of matching documents. Must be a non-negative float. Default is `1.0`. | + +## How scores are calculated + +The `distance_feature` query calculates a document's relevance score using the following formula: + +$$ \text{score} = \text{boost} \cdot \frac {\text{pivot}} {\text{pivot} + \text{distance}} $$, + +where $$\text{distance}$$ is the absolute difference between the `origin` and the field's value. + +## Skipping non-competitive hits + +Unlike other score-modifying queries like the `function_score` query, the `distance_feature` query is optimized to efficiently skip non-competitive hits when total hit tracking (`track_total_hits`) is disabled. diff --git a/_query-dsl/specialized/index.md b/_query-dsl/specialized/index.md index 8a4cd81af62..da077890b59 100644 --- a/_query-dsl/specialized/index.md +++ b/_query-dsl/specialized/index.md @@ -14,7 +14,9 @@ OpenSearch supports the following specialized queries: - `more_like_this`: Finds documents similar to the provided text, document, or collection of documents. -- [`neural`]({{site.url}}{{site.baseurl}}/query-dsl/specialized/neural/): Used for vector field search in [neural search]({{site.url}}{{site.baseurl}}/search-plugins/neural-search/). +- [`knn`]({{site.url}}{{site.baseurl}}/query-dsl/specialized/k-nn/): Used for searching raw vectors during [vector search]({{site.url}}{{site.baseurl}}/vector-search/). + +- [`neural`]({{site.url}}{{site.baseurl}}/query-dsl/specialized/neural/): Used for searching by text or image in [vector search]({{site.url}}{{site.baseurl}}/search-plugins/neural-search/). - [`neural_sparse`]({{site.url}}{{site.baseurl}}/query-dsl/specialized/neural-sparse/): Used for vector field search in [sparse neural search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-search/). @@ -22,7 +24,7 @@ OpenSearch supports the following specialized queries: - `rank_feature`: Calculates scores based on the values of numeric features. This query can skip non-competitive hits. -- `script`: Uses a script as a filter. +- [`script`]({{site.url}}{{site.baseurl}}/query-dsl/specialized/script/): Uses a script as a filter. - [`script_score`]({{site.url}}{{site.baseurl}}/query-dsl/specialized/script-score/): Calculates a custom score for matching documents using a script. 
diff --git a/_query-dsl/specialized/k-nn/index.md b/_query-dsl/specialized/k-nn/index.md new file mode 100644 index 00000000000..a1aeaefff39 --- /dev/null +++ b/_query-dsl/specialized/k-nn/index.md @@ -0,0 +1,212 @@ +--- +layout: default +title: k-NN +parent: Specialized queries +has_children: true +nav_order: 10 +redirect_from: + - /query-dsl/specialized/k-nn/ +--- + +# k-NN query + +Use the `knn` query for running nearest neighbor searches on vector fields. + +## Request body fields + +Provide a vector field in the `knn` query and specify additional request fields in the vector field object: + +```json +"knn": { + "<vector_field>": { + "vector": [<vector_values>], + "k": <k_value>, + ... + } +} +``` + +The top-level `vector_field` specifies the vector field against which to run a search query. The following table lists all supported request fields. + +Field | Data type | Required/Optional | Description +:--- | :--- | :--- | :--- +`vector` | Array of floats or bytes | Required | The query vector to use for vector search. The data type of the vector elements must match the data type of vectors indexed in the [`knn_vector` field]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/) searched. +`k` | Integer | Optional | The number of nearest neighbors to return. Valid values are in the [1, 10,000] range. Required if either `max_distance` or `min_score` is not specified. +`max_distance` | Float | Optional | The maximum distance threshold for search results. Only one of `k`, `max_distance`, or `min_score` can be specified. For more information, see [Radial search]({{site.url}}{{site.baseurl}}/vector-search/specialized-operations/radial-search-knn/). +`min_score` | Float | Optional | The minimum score threshold for search results. Only one of `k`, `max_distance`, or `min_score` can be specified. For more information, see [Radial search]({{site.url}}{{site.baseurl}}/vector-search/specialized-operations/radial-search-knn/). +`filter` | Object | Optional | A filter to apply to the k-NN search. For more information, see [Vector search with filters]({{site.url}}{{site.baseurl}}/vector-search/filter-search-knn/). **Important**: A filter can only be used with the `faiss` or `lucene` engines. +`method_parameters` | Object | Optional | Additional parameters for fine-tuning the search:<br>- `ef_search` (Integer): The number of vectors to examine (for the `hnsw` method)<br>- `nprobes` (Integer): The number of buckets to examine (for the `ivf` method). For more information, see [Specifying method parameters in the query](#specifying-method-parameters-in-the-query). +`rescore` | Object or Boolean | Optional | Parameters for configuring rescoring functionality:<br>- `oversample_factor` (Float): Controls how many candidate vectors are retrieved before rescoring. Valid values are in the `[1.0, 100.0]` range. Default is `false` for fields with `in_memory` mode (no rescoring) and enabled (with dynamic values) for fields with `on_disk` mode. In `on_disk` mode, the default `oversample_factor` is determined by the `compression_level`. For more information, see the [compression level table]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized/#rescoring-quantized-results-to-full-precision). To explicitly enable rescoring with the default `oversample_factor` of `1.0`, set `rescore` to `true`. For more information, see [Rescoring results](#rescoring-results). 
+`expand_nested_docs` | Boolean | Optional | When `true`, retrieves scores for all nested field documents within each parent document. Used with nested queries. For more information, see [Vector search with nested fields]({{site.url}}{{site.baseurl}}/vector-search/specialized-operations/nested-search-knn/). + +## Example request + +```json +GET /my-vector-index/_search +{ + "query": { + "knn": { + "my_vector": { + "vector": [1.5, 2.5], + "k": 3 + } + } + } +} +``` +{% include copy-curl.html %} + +## Example request: Nested fields + +```json +GET /my-vector-index/_search +{ + "_source": false, + "query": { + "nested": { + "path": "nested_field", + "query": { + "knn": { + "nested_field.my_vector": { + "vector": [1,1,1], + "k": 2, + "expand_nested_docs": true + } + } + }, + "inner_hits": { + "_source": false, + "fields":["nested_field.color"] + }, + "score_mode": "max" + } + } +} +``` +{% include copy-curl.html %} + +## Example request: Radial search with max_distance + +The following example shows a radial search performed with `max_distance`: + +```json +GET /my-vector-index/_search +{ + "query": { + "knn": { + "my_vector": { + "vector": [ + 7.1, + 8.3 + ], + "max_distance": 2 + } + } + } +} +``` +{% include copy-curl.html %} + + +## Example request: Radial search with min_score + +The following example shows a radial search performed with `min_score`: + +```json +GET /my-vector-index/_search +{ + "query": { + "knn": { + "my_vector": { + "vector": [7.1, 8.3], + "min_score": 0.95 + } + } + } +} +``` +{% include copy-curl.html %} + +## Specifying method parameters in the query + +Starting with version 2.16, you can provide `method_parameters` in a search request: + +```json +GET /my-vector-index/_search +{ + "size": 2, + "query": { + "knn": { + "target-field": { + "vector": [2, 3, 5, 6], + "k": 2, + "method_parameters" : { + "ef_search": 100 + } + } + } + } +} +``` +{% include copy-curl.html %} + +These parameters are dependent on the combination of engine and method used to create the index. The following sections provide information about the supported `method_parameters`. + +### ef_search + +You can provide the `ef_search` parameter when searching an index created using the `hnsw` method. The `ef_search` parameter specifies the number of vectors to examine in order to find the top k nearest neighbors. Higher `ef_search` values improve recall at the cost of increased search latency. The value must be positive. + +The following table provides information about the `ef_search` parameter for the supported engines. + +Engine | Radial query support | Notes +:--- | :--- | :--- +`nmslib` (Deprecated) | No | If `ef_search` is present in a query, it overrides the `index.knn.algo_param.ef_search` index setting. +`faiss` | Yes | If `ef_search` is present in a query, it overrides the `index.knn.algo_param.ef_search` index setting. +`lucene` | No | When creating a search query, you must specify `k`. If you provide both `k` and `ef_search`, then the larger value is passed to the engine. If `ef_search` is larger than `k`, you can provide the `size` parameter to limit the final number of results to `k`. + +<!-- vale off --> +### nprobes +<!-- vale on --> + +You can provide the `nprobes` parameter when searching an index created using the `ivf` method. The `nprobes` parameter specifies the number of buckets to examine in order to find the top k nearest neighbors. Higher `nprobes` values improve recall at the cost of increased search latency. The value must be positive. 
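+
+For example, the following sketch provides `nprobes` in `method_parameters`. It mirrors the earlier `ef_search` example and assumes that `target-field` is a `knn_vector` field in an index created using the `ivf` method and the `faiss` engine:
+
+```json
+GET /my-vector-index/_search
+{
+  "size": 2,
+  "query": {
+    "knn": {
+      "target-field": {
+        "vector": [2, 3, 5, 6],
+        "k": 2,
+        "method_parameters" : {
+          "nprobes": 10
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}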
+ +The following table provides information about the `nprobes` parameter for the supported engines. + +Engine | Notes +:--- | :--- +`faiss` | If `nprobes` is present in a query, it overrides the value provided when creating the index. + +## Rescoring results + +You can fine-tune search by providing the `ef_search` and `oversample_factor` parameters. + +The `oversample_factor` parameter controls the factor by which the search oversamples the candidate vectors before ranking them. Using a higher oversample factor means that more candidates will be considered before ranking, improving accuracy but also increasing search time. When selecting the `oversample_factor` value, consider the trade-off between accuracy and efficiency. For example, setting the `oversample_factor` to `2.0` will double the number of candidates considered during the ranking phase, which may help achieve better results. + +The following request specifies the `ef_search` and `oversample_factor` parameters: + +```json +GET /my-vector-index/_search +{ + "size": 2, + "query": { + "knn": { + "my_vector_field": { + "vector": [1.5, 5.5, 1.5, 5.5, 1.5, 5.5, 1.5, 5.5], + "k": 10, + "method_parameters": { + "ef_search": 10 + }, + "rescore": { + "oversample_factor": 10.0 + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Next steps + +- [k-NN vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/) +- [Rescoring quantized results to full precision]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized/#rescoring-quantized-results-to-full-precision) \ No newline at end of file diff --git a/_query-dsl/specialized/k-nn/k-nn-explain.md b/_query-dsl/specialized/k-nn/k-nn-explain.md new file mode 100644 index 00000000000..cd5f5656615 --- /dev/null +++ b/_query-dsl/specialized/k-nn/k-nn-explain.md @@ -0,0 +1,658 @@ +--- +layout: default +title: k-NN query explain +parent: k-NN +grand_parent: Specialized queries +nav_order: 10 +--- + +# k-NN query explain +**Introduced 3.0** +{: .label .label-purple } + +You can provide the `explain` parameter to understand how scores are calculated, normalized, and combined in `knn` queries. When enabled, it provides detailed information about the scoring process for each search result. This includes revealing the score normalization techniques used, how different scores were combined, and the calculations for individual subquery scores. This comprehensive insight makes it easier to understand and optimize your `knn` query results. For more information about `explain`, see [Explain API]({{site.url}}{{site.baseurl}}/api-reference/explain/). + +`explain` is an expensive operation in terms of both resources and time. For production clusters, we recommend using it sparingly for the purpose of troubleshooting. +{: .warning } + +You can provide the `explain` parameter in a URL when running a complete `knn` query for the Faiss engine using the following syntax: + +```json +GET <index>/_search?explain=true +POST <index>/_search?explain=true +``` + +`explain` for k-NN search for all types of queries with the Lucene engine does not return a detailed explanation as with the Faiss engine. 
+{: .note } + +The `explain` parameter works for the following types of k-NN search with the Faiss engine: + +- [Approximate k-NN search]({{site.url}}{{site.baseurl}}/vector-search/vector-search-techniques/approximate-knn/) +- Approximate k-NN search with [exact search]({{site.url}}{{site.baseurl}}/vector-search/vector-search-techniques/knn-score-script/) +- [Disk-based search]({{site.url}}{{site.baseurl}}/vector-search/optimizing-storage/disk-based-vector-search/) +- [k-NN search with efficient filtering]({{site.url}}{{site.baseurl}}/vector-search/filter-search-knn/efficient-knn-filtering/) +- [Radial search]({{site.url}}{{site.baseurl}}/vector-search/specialized-operations/radial-search-knn/) +- k-NN search with a `term` query + +`explain` for k-NN search with nested fields does not return a detailed explanation as with other searches. +{: .note } + +You can provide the `explain` parameter as a query parameter: + +```json +GET my-knn-index/_search?explain=true +{ + "query": { + "knn": { + "my_vector": { + "vector": [2, 3, 5, 7], + "k": 2 + } + } + } +} +``` +{% include copy-curl.html %} + +Alternatively, you can provide the `explain` parameter in the request body: + +```json +GET my-knn-index/_search +{ + "query": { + "knn": { + "my_vector": { + "vector": [2, 3, 5, 7], + "k": 2 + } + } + }, + "explain": true +} +``` +{% include copy-curl.html %} + +## Example: Approximate k-NN search + +<details markdown="block"> + <summary> + Example response + </summary> + {: .text-delta} + +```json +{ + "took": 216038, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 88.4, + "hits": [ + { + "_shard": "[my-knn-index-1][0]", + "_node": "VHcyav6OTsmXdpsttX2Yug", + "_index": "my-knn-index-1", + "_id": "5", + "_score": 88.4, + "_source": { + "my_vector1": [ + 2.5, + 3.5, + 5.5, + 7.4 + ], + "price": 8.9 + }, + "_explanation": { + "value": 88.4, + "description": "the type of knn search executed was Approximate-NN", + "details": [ + { + "value": 88.4, + "description": "the type of knn search executed at leaf was Approximate-NN with vectorDataType = FLOAT, spaceType = innerproduct where score is computed as `-rawScore + 1` from:", + "details": [ + { + "value": -87.4, + "description": "rawScore, returned from FAISS library", + "details": [] + } + ] + } + ] + } + }, + { + "_shard": "[my-knn-index-1][0]", + "_node": "VHcyav6OTsmXdpsttX2Yug", + "_index": "my-knn-index-1", + "_id": "2", + "_score": 84.7, + "_source": { + "my_vector1": [ + 2.5, + 3.5, + 5.6, + 6.7 + ], + "price": 5.5 + }, + "_explanation": { + "value": 84.7, + "description": "the type of knn search executed was Approximate-NN", + "details": [ + { + "value": 84.7, + "description": "the type of knn search executed at leaf was Approximate-NN with vectorDataType = FLOAT, spaceType = innerproduct where score is computed as `-rawScore + 1` from:", + "details": [ + { + "value": -83.7, + "description": "rawScore, returned from FAISS library", + "details": [] + } + ] + } + ] + } + } + ] + } +} +``` +</details> + +## Example: Approximate k-NN search with exact search + +<details markdown="block"> + <summary> + Example response + </summary> + {: .text-delta} + +```json +{ + "took": 87, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 84.7, + "hits": [ + { + "_shard": "[my-knn-index-1][0]", + 
"_node": "MQVux8dZRWeznuEYKhMq0Q", + "_index": "my-knn-index-1", + "_id": "7", + "_score": 84.7, + "_source": { + "my_vector2": [ + 2.5, + 3.5, + 5.6, + 6.7 + ], + "price": 5.5 + }, + "_explanation": { + "value": 84.7, + "description": "the type of knn search executed was Approximate-NN", + "details": [ + { + "value": 84.7, + "description": "the type of knn search executed at leaf was Exact with spaceType = INNER_PRODUCT, vectorDataType = FLOAT, queryVector = [2.0, 3.0, 5.0, 6.0]", + "details": [] + } + ] + } + }, + { + "_shard": "[my-knn-index-1][0]", + "_node": "MQVux8dZRWeznuEYKhMq0Q", + "_index": "my-knn-index-1", + "_id": "8", + "_score": 82.2, + "_source": { + "my_vector2": [ + 4.5, + 5.5, + 6.7, + 3.7 + ], + "price": 4.4 + }, + "_explanation": { + "value": 82.2, + "description": "the type of knn search executed was Approximate-NN", + "details": [ + { + "value": 82.2, + "description": "the type of knn search executed at leaf was Exact with spaceType = INNER_PRODUCT, vectorDataType = FLOAT, queryVector = [2.0, 3.0, 5.0, 6.0]", + "details": [] + } + ] + } + } + ] + } +``` +</details> + +## Example: Disk-based search + +<details markdown="block"> + <summary> + Example response + </summary> + {: .text-delta} + +```json +{ + "took" : 4, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 1, + "relation" : "eq" + }, + "max_score" : 381.0, + "hits" : [ + { + "_shard" : "[my-vector-index][0]", + "_node" : "pLaiqZftTX-MVSKdQSu7ow", + "_index" : "my-vector-index", + "_id" : "9", + "_score" : 381.0, + "_source" : { + "my_vector_field" : [ + 9.5, + 9.5, + 9.5, + 9.5, + 9.5, + 9.5, + 9.5, + 9.5 + ], + "price" : 8.9 + }, + "_explanation" : { + "value" : 381.0, + "description" : "the type of knn search executed was Disk-based and the first pass k was 100 with vector dimension of 8, over sampling factor of 5.0, shard level rescoring enabled", + "details" : [ + { + "value" : 381.0, + "description" : "the type of knn search executed at leaf was Approximate-NN with spaceType = HAMMING, vectorDataType = FLOAT, queryVector = [1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5]", + "details" : [ ] + } + ] + } + } + ] + } +} +``` +</details> + +## Example: k-NN search with efficient filtering + +<details markdown="block"> + <summary> + Example response + </summary> + {: .text-delta} + +```json +{ + "took" : 51, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 2, + "relation" : "eq" + }, + "max_score" : 0.8620689, + "hits" : [ + { + "_shard" : "[products-shirts][0]", + "_node" : "9epk8WoFT8yvnUI0tAaJgQ", + "_index" : "products-shirts", + "_id" : "8", + "_score" : 0.8620689, + "_source" : { + "item_vector" : [ + 2.4, + 4.0, + 3.0 + ], + "size" : "small", + "rating" : 8 + }, + "_explanation" : { + "value" : 0.8620689, + "description" : "the type of knn search executed was Approximate-NN", + "details" : [ + { + "value" : 0.8620689, + "description" : "the type of knn search executed at leaf was Exact since filteredIds = 2 is less than or equal to K = 10 with spaceType = L2, vectorDataType = FLOAT, queryVector = [2.0, 4.0, 3.0]", + "details" : [ ] + } + ] + } + }, + { + "_shard" : "[products-shirts][0]", + "_node" : "9epk8WoFT8yvnUI0tAaJgQ", + "_index" : "products-shirts", + "_id" : "6", + "_score" : 0.029691212, + "_source" : { + "item_vector" : [ + 6.4, + 3.4, + 6.6 + ], + "size" : "small", + "rating" : 9 + }, + "_explanation" 
: { + "value" : 0.029691212, + "description" : "the type of knn search executed was Approximate-NN", + "details" : [ + { + "value" : 0.029691212, + "description" : "the type of knn search executed at leaf was Exact since filteredIds = 2 is less than or equal to K = 10 with spaceType = L2, vectorDataType = FLOAT, queryVector = [2.0, 4.0, 3.0]", + "details" : [ ] + } + ] + } + } + ] + } +} +``` +</details> + +## Example: Radial search + +```json +GET my-knn-index/_search?explain=true +{ + "query": { + "knn": { + "my_vector": { + "vector": [7.1, 8.3], + "max_distance": 2 + } + } + } +} +``` +{% include copy-curl.html %} + +<details markdown="block"> + <summary> + Example response + </summary> + {: .text-delta} + +```json +{ + "took" : 376529, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 2, + "relation" : "eq" + }, + "max_score" : 0.98039204, + "hits" : [ + { + "_shard" : "[knn-index-test][0]", + "_node" : "c9b4aPe4QGO8eOtb8P5D3g", + "_index" : "knn-index-test", + "_id" : "1", + "_score" : 0.98039204, + "_source" : { + "my_vector" : [ + 7.0, + 8.2 + ], + "price" : 4.4 + }, + "_explanation" : { + "value" : 0.98039204, + "description" : "the type of knn search executed was Radial with the radius of 2.0", + "details" : [ + { + "value" : 0.98039204, + "description" : "the type of knn search executed at leaf was Approximate-NN with vectorDataType = FLOAT, spaceType = l2 where score is computed as `1 / (1 + rawScore)` from:", + "details" : [ + { + "value" : 0.020000057, + "description" : "rawScore, returned from FAISS library", + "details" : [ ] + } + ] + } + ] + } + }, + { + "_shard" : "[knn-index-test][0]", + "_node" : "c9b4aPe4QGO8eOtb8P5D3g", + "_index" : "knn-index-test", + "_id" : "3", + "_score" : 0.9615384, + "_source" : { + "my_vector" : [ + 7.3, + 8.3 + ], + "price" : 19.1 + }, + "_explanation" : { + "value" : 0.9615384, + "description" : "the type of knn search executed was Radial with the radius of 2.0", + "details" : [ + { + "value" : 0.9615384, + "description" : "the type of knn search executed at leaf was Approximate-NN with vectorDataType = FLOAT, spaceType = l2 where score is computed as `1 / (1 + rawScore)` from:", + "details" : [ + { + "value" : 0.040000115, + "description" : "rawScore, returned from FAISS library", + "details" : [ ] + } + ] + } + ] + } + } + ] + } +} + +``` +</details> + +## Example: k-NN search with a term query + +```json +GET my-knn-index/_search?explain=true +{ + "query": { + "bool": { + "should": [ + { + "knn": { + "my_vector2": { // vector field name + "vector": [2, 3, 5, 6], + "k": 2 + } + } + }, + { + "term": { + "price": "4.4" + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +<details markdown="block"> + <summary> + Example response + </summary> + {: .text-delta} + +```json +{ + "took" : 51, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 2, + "relation" : "eq" + }, + "max_score" : 84.7, + "hits" : [ + { + "_shard" : "[my-knn-index-1][0]", + "_node" : "c9b4aPe4QGO8eOtb8P5D3g", + "_index" : "my-knn-index-1", + "_id" : "7", + "_score" : 84.7, + "_source" : { + "my_vector2" : [ + 2.5, + 3.5, + 5.6, + 6.7 + ], + "price" : 5.5 + }, + "_explanation" : { + "value" : 84.7, + "description" : "sum of:", + "details" : [ + { + "value" : 84.7, + "description" : "the type of knn search executed was Approximate-NN", + "details" : [ + { + "value" : 84.7, + 
"description" : "the type of knn search executed at leaf was Approximate-NN with vectorDataType = FLOAT, spaceType = innerproduct where score is computed as `-rawScore + 1` from:", + "details" : [ + { + "value" : -83.7, + "description" : "rawScore, returned from FAISS library", + "details" : [ ] + } + ] + } + ] + } + ] + } + }, + { + "_shard" : "[my-knn-index-1][0]", + "_node" : "c9b4aPe4QGO8eOtb8P5D3g", + "_index" : "my-knn-index-1", + "_id" : "8", + "_score" : 83.2, + "_source" : { + "my_vector2" : [ + 4.5, + 5.5, + 6.7, + 3.7 + ], + "price" : 4.4 + }, + "_explanation" : { + "value" : 83.2, + "description" : "sum of:", + "details" : [ + { + "value" : 82.2, + "description" : "the type of knn search executed was Approximate-NN", + "details" : [ + { + "value" : 82.2, + "description" : "the type of knn search executed at leaf was Approximate-NN with vectorDataType = FLOAT, spaceType = innerproduct where score is computed as `-rawScore + 1` from:", + "details" : [ + { + "value" : -81.2, + "description" : "rawScore, returned from FAISS library", + "details" : [ ] + } + ] + } + ] + }, + { + "value" : 1.0, + "description" : "price:[1082969293 TO 1082969293]", + "details" : [ ] + } + ] + } + } + ] + } +} +``` + +</details> + +## Response body fields + +Field | Description +:--- | :--- +`explanation` | The `explanation` object contains the following fields: <br> - `value`: Contains the calculation result.<br> - `description`: Explains what type of calculation was performed. For score normalization, the information in the `description` field includes the technique used for normalization or combination and the corresponding score. <br> - `details`: Shows any subcalculations performed. + diff --git a/_query-dsl/specialized/neural-sparse.md b/_query-dsl/specialized/neural-sparse.md index 904d340b136..5fca4b507de 100644 --- a/_query-dsl/specialized/neural-sparse.md +++ b/_query-dsl/specialized/neural-sparse.md @@ -9,41 +9,95 @@ nav_order: 55 Introduced 2.11 {: .label .label-purple } -Use the `neural_sparse` query for vector field search in [neural sparse search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-search/). The query can use either raw text or sparse vector tokens. +Use the `neural_sparse` query for vector field search in [neural sparse search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-search/). + +You can run the query in the following ways: + +- Provide sparse vector embeddings for matching. For more information, see [Neural sparse search using raw vectors]({{site.url}}{{site.baseurl}}/vector-search/ai-search/neural-sparse-with-raw-vectors/): + ```json + "neural_sparse": { + "<vector_field>": { + "query_tokens": { + "<token>": <weight>, + ... + } + } + } + ``` +- Provide text to tokenize and use for matching. To tokenize the text, you can use the following components: + - A built-in DL model analyzer: + ```json + "neural_sparse": { + "<vector_field>": { + "query_text": "<input text>", + "analyzer": "bert-uncased" + } + } + ``` + - A tokenizer model: + ```json + "neural_sparse": { + "<vector_field>": { + "query_text": "<input text>", + "model_id": "<model ID>" + } + } + ``` + + For more information, see [Generating sparse vector embeddings automatically]({{site.url}}{{site.baseurl}}/vector-search/ai-search/neural-sparse-with-pipelines/). + ## Request body fields -Include the following request fields in the `neural_sparse` query: -### Example: Query by raw text +The top-level `vector_field` specifies the vector field against which to run a search query. 
You must specify either `query_text` or `query_tokens` to define the input. The following fields can be used to configure the query + +Field | Data type | Required/Optional | Description +:--- | :--- | :--- | :--- +`query_text` | String | Optional | The query text to convert into sparse vector embeddings. Either `query_text` or `query_tokens` must be specified. +`analyzer` | String | Optional | Used with `query_text`. Specifies a built-in DL model analyzer for tokenizing query text. Valid values are `bert-uncased` and `mbert-uncased`. Default is `bert-uncased`. If neither `model_id` nor `analyzer` are specified, the default analyzer (`bert-uncased`) is used to tokenize the text. Cannot be specified at the same time as `model_id`. For more information, see [DL model analyzers]({{site.url}}{{site.baseurl}}/analyzers/supported-analyzers/dl-model-analyzers/). +`model_id` | String | Optional | Used with `query_text`. The ID of the sparse encoding model (for bi-encoder mode) or tokenizer (for doc-only mode) used to generate vector embeddings from the query text. The model/tokenizer must be deployed in OpenSearch before it can be used in neural sparse search. For more information, see [Using custom models within OpenSearch]({{site.url}}{{site.baseurl}}/ml-commons-plugin/using-ml-models/) and [Generating sparse vector embeddings automatically]({{site.url}}{{site.baseurl}}/vector-search/ai-search/neural-sparse-with-pipelines/). For information about setting a default model ID in a neural sparse query, see [`neural_query_enricher`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/neural-query-enricher/). Cannot be specified at the same time as `analyzer`. +`query_tokens` | Map of token (string) to weight (float) | Optional | A raw sparse vector in the form of tokens and their weights. Used as an alternative to `query_text` for direct vector input. Either `query_text` or `query_tokens` must be specified. +`max_token_score` | Float | Optional | (Deprecated) This parameter has been deprecated since OpenSearch 2.12. It is maintained only for backward compatibility and no longer affects functionality. The parameter can still be provided in requests, but its value has no impact. Previously used as the theoretical upper bound of the score for all tokens in the vocabulary. + + +#### Examples + +To run a search using text tokenized by an analyzer, specify an `analyzer` in the request. The analyzer must be compatible with the model that you used for text analysis at ingestion time: ```json -"neural_sparse": { - "<vector_field>": { - "query_text": "<query_text>", - "model_id": "<model_id>" +GET my-nlp-index/_search +{ + "query": { + "neural_sparse": { + "passage_embedding": { + "query_text": "Hi world", + "analyzer": "bert-uncased" + } + } } } ``` -### Example: Query by sparse vector +{% include copy-curl.html %} + +For more information, see [DL model analyzers]({{site.url}}{{site.baseurl}}/analyzers/supported-analyzers/dl-model-analyzers/). + +If you don't specify an analyzer, the default `bert-uncased` analyzer is used: + ```json -"neural_sparse": { - "<vector_field>": { - "query_tokens": "<query_tokens>" +GET my-nlp-index/_search +{ + "query": { + "neural_sparse": { + "passage_embedding": { + "query_text": "Hi world" + } + } } } ``` +{% include copy-curl.html %} -The top-level `vector_field` specifies the vector field against which to run a search query. The following table lists the other `neural_sparse` query fields. 
- -Field | Data type | Required/Optional | Description -:--- | :--- | :--- -`query_text` | String | Optional | The query text from which to generate sparse vector embeddings. -`model_id` | String | Optional | The ID of the sparse encoding model or tokenizer model that will be used to generate vector embeddings from the query text. The model must be deployed in OpenSearch before it can be used in sparse neural search. For more information, see [Using custom models within OpenSearch]({{site.url}}{{site.baseurl}}/ml-commons-plugin/using-ml-models/) and [Neural sparse search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-search/). For information on setting a default model ID in a neural sparse query, see [`neural_query_enricher`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/neural-query-enricher/). -`query_tokens` | Map<String, Float> | Optional | The query tokens, sometimes referred to as sparse vector embeddings. Similarly to dense semantic retrieval, you can use raw sparse vectors generated by neural models or tokenizers to perform a semantic search query. Use either the `query_text` option for raw field vectors or the `query_tokens` option for sparse vectors. Must be provided in order for the `neural_sparse` query to operate. -`max_token_score` | Float | Optional | (Deprecated) The theoretical upper bound of the score for all tokens in the vocabulary (required for performance optimization). For OpenSearch-provided [pretrained sparse embedding models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#sparse-encoding-models), we recommend setting `max_token_score` to 2 for `amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v1` and to 3.5 for `amazon/neural-sparse/opensearch-neural-sparse-encoding-v1`. This field has been deprecated as of OpenSearch 2.12. - -#### Example request -**Query by raw text** +To search using text tokenized by a tokenizer model, provide the model ID in the request: ```json GET my-nlp-index/_search @@ -58,7 +112,9 @@ GET my-nlp-index/_search } } ``` -**Query by sparse vector** +{% include copy-curl.html %} + +To search using a sparse vector, provide the sparse vector in the `query_tokens` parameter: ```json GET my-nlp-index/_search @@ -79,4 +135,8 @@ GET my-nlp-index/_search } } ``` -{% include copy-curl.html %} \ No newline at end of file +{% include copy-curl.html %} + +## Next steps + +- For more information about neural sparse search, see [Neural sparse search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/neural-sparse-search/). \ No newline at end of file diff --git a/_query-dsl/specialized/neural.md b/_query-dsl/specialized/neural.md index ae9e1f2ea43..7e7427c4ea4 100644 --- a/_query-dsl/specialized/neural.md +++ b/_query-dsl/specialized/neural.md @@ -7,7 +7,7 @@ nav_order: 50 # Neural query -Use the `neural` query for vector field search in [neural search]({{site.url}}{{site.baseurl}}/search-plugins/neural-search/). +Use the `neural` query for vector field search by text or image in [vector search]({{site.url}}{{site.baseurl}}/vector-search/). ## Request body fields @@ -24,19 +24,22 @@ Include the following request fields in the `neural` query: } ``` -The top-level `vector_field` specifies the vector field against which to run a search query. The following table lists the other neural query fields. +The top-level `vector_field` specifies the vector or semantic field against which to run a search query. The following table lists the other neural query fields. 
Field | Data type | Required/Optional | Description :--- | :--- | :--- `query_text` | String | Optional | The query text from which to generate vector embeddings. You must specify at least one `query_text` or `query_image`. `query_image` | String | Optional | A base-64 encoded string that corresponds to the query image from which to generate vector embeddings. You must specify at least one `query_text` or `query_image`. -`model_id` | String | Required if the default model ID is not set. For more information, see [Setting a default model on an index or field]({{site.url}}{{site.baseurl}}/search-plugins/neural-text-search/#setting-a-default-model-on-an-index-or-field). | The ID of the model that will be used to generate vector embeddings from the query text. The model must be deployed in OpenSearch before it can be used in neural search. For more information, see [Using custom models within OpenSearch]({{site.url}}{{site.baseurl}}/ml-commons-plugin/using-ml-models/) and [Neural search]({{site.url}}{{site.baseurl}}/search-plugins/neural-search/). +`model_id` | String | Optional if the target field is a semantic field. Required if the target field is a `knn_vector` field and the default model ID is not set. For more information, see [Setting a default model on an index or field]({{site.url}}{{site.baseurl}}/search-plugins/neural-text-search/#setting-a-default-model-on-an-index-or-field). | The ID of the model that will be used to generate vector embeddings from the query text. The model must be deployed in OpenSearch before it can be used in neural search. For more information, see [Using custom models within OpenSearch]({{site.url}}{{site.baseurl}}/ml-commons-plugin/using-ml-models/) and [Neural search]({{site.url}}{{site.baseurl}}/search-plugins/neural-search/). Cannot be provided together with the `semantic_field_search_analyzer`. `k` | Integer | Optional | The number of results returned by the k-NN search. Only one variable, either `k`, `min_score`, or `max_distance`, can be specified. If a variable is not specified, the default is `k` with a value of `10`. -`min_score` | Float | Optional | The minimum score threshold for the search results. Only one variable, either `k`, `min_score`, or `max_distance`, can be specified. For more information, see [k-NN radial search]({{site.url}}{{site.baseurl}}/search-plugins/knn/radial-search-knn/). -`max_distance` | Float | Optional | The maximum distance threshold for the search results. Only one variable, either `k`, `min_score`, or `max_distance`, can be specified. For more information, see [k-NN radial search]({{site.url}}{{site.baseurl}}/search-plugins/knn/radial-search-knn/). -`filter` | Object | Optional | A query that can be used to reduce the number of documents considered. For more information about filter usage, see [k-NN search with filters]({{site.url}}{{site.baseurl}}/search-plugins/knn/filter-search-knn/). **Important**: Filter can only be used with the `faiss` or `lucene` engines. -`method_parameters` | Object | Optional | Parameters passed to the k-NN index during search. See [Additional query parameters]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#additional-query-parameters). -`rescore` | Object | Optional | Parameters for configuring rescoring functionality for k-NN indexes built using quantization. See [Rescoring]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#rescoring-quantized-results-using-full-precision). +`min_score` | Float | Optional | The minimum score threshold for the search results. 
Only one variable, either `k`, `min_score`, or `max_distance`, can be specified. For more information, see [Radial search]({{site.url}}{{site.baseurl}}/search-plugins/knn/radial-search-knn/). +`max_distance` | Float | Optional | The maximum distance threshold for the search results. Only one variable, either `k`, `min_score`, or `max_distance`, can be specified. For more information, see [Radial search]({{site.url}}{{site.baseurl}}/search-plugins/knn/radial-search-knn/). +`filter` | Object | Optional | A query that can be used to reduce the number of documents considered. For more information about filter usage, see [Vector search with filters]({{site.url}}{{site.baseurl}}/search-plugins/knn/filter-search-knn/). +`method_parameters` | Object | Optional | Additional parameters for fine-tuning the search:<br>- `ef_search` (Integer): The number of vectors to examine (for the `hnsw` method)<br>- `nprobes` (Integer): The number of buckets to examine (for the `ivf` method). For more information, see [Specifying method parameters in the query]({{site.url}}{{site.baseurl}}/query-dsl/specialized/k-nn/index/#specifying-method-parameters-in-the-query). +`rescore` | Object or Boolean | Optional | Parameters for configuring rescoring functionality:<br>- `oversample_factor` (Float): Controls how many candidate vectors are retrieved before rescoring. Valid values are in the `[1.0, 100.0]` range. Default is `false` for fields with `in_memory` mode (no rescoring) and `enabled` (with dynamic values) for fields with `on_disk` mode. In `on_disk` mode, the default `oversample_factor` is determined by the `compression_level`. For more information, see the [compression level table]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized/#rescoring-quantized-results-to-full-precision). To explicitly enable rescoring with the default `oversample_factor` of `1.0`, set `rescore` to `true`. For more information, see [Rescoring results]({{site.url}}{{site.baseurl}}/query-dsl/specialized/k-nn/index/#rescoring-results). +`expand_nested_docs` | Boolean | Optional | When `true`, retrieves scores for all nested field documents within each parent document. Used with nested queries. For more information, see [Vector search with nested fields]({{site.url}}{{site.baseurl}}/vector-search/specialized-operations/nested-search-knn/). +`semantic_field_search_analyzer` | String | Optional | Specifies an analyzer for tokenizing the `query_text` when using a sparse encoding model. Valid values are `standard`, `bert-uncased`, and `mbert-uncased`. Cannot be used together with `model_id`. For more information, see [Analyzers]({{site.url}}{{site.baseurl}}/analyzers/supported-analyzers/). +`query_tokens` | Map of token (string) to weight (float) | Optional | A raw sparse vector in the form of tokens and their weights. Used as an alternative to `query_text` for direct vector input. Either `query_text` or `query_tokens` must be specified. #### Example request @@ -150,3 +153,40 @@ GET /my-nlp-index/_search } ``` {% include copy-curl.html %} + +The following example shows a search against a `semantic` field using a dense model. A `semantic` field stores model information in its configuration. 
The `neural` query automatically retrieves the `model_id` from the `semantic` field's configuration in the index mapping and rewrites the query to target the corresponding embedding field:
+
+```json
+GET /my-nlp-index/_search
+{
+  "query": {
+    "neural": {
+      "passage": {
+        "query_text": "Hi world",
+        "k": 100
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+The following example shows a search against a `semantic` field using a sparse encoding model. This search uses sparse embeddings:
+
+```json
+GET /my-nlp-index/_search
+{
+  "query": {
+    "neural": {
+      "passage": {
+        "query_tokens": {
+          "worlds": 0.57605183
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+For more information, see [Semantic field type]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/semantic/).
\ No newline at end of file
diff --git a/_query-dsl/specialized/script.md b/_query-dsl/specialized/script.md
new file mode 100644
index 00000000000..331bc87b615
--- /dev/null
+++ b/_query-dsl/specialized/script.md
@@ -0,0 +1,218 @@
+---
+layout: default
+title: Script query
+parent: Specialized queries
+nav_order: 58
+---
+
+# Script query
+
+Use the `script` query to filter documents based on a custom condition written in the Painless scripting language. This query returns documents for which the script evaluates to `true`, enabling advanced filtering logic that can't be expressed using standard queries.
+
+The `script` query is computationally expensive and should be used sparingly. Only use it when necessary and ensure that `search.allow_expensive_queries` is enabled (default is `true`). For more information, see [Expensive queries]({{site.url}}{{site.baseurl}}/query-dsl/#expensive-queries).
+{: .important }
+
+## Example
+
+Create an index named `products` with the following mappings:
+
+```json
+PUT /products
+{
+  "mappings": {
+    "properties": {
+      "title": { "type": "text" },
+      "price": { "type": "float" },
+      "rating": { "type": "float" }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+Index example documents using the following request:
+
+```json
+POST /products/_bulk
+{ "index": { "_id": 1 } }
+{ "title": "Wireless Earbuds", "price": 99.99, "rating": 4.5 }
+{ "index": { "_id": 2 } }
+{ "title": "Bluetooth Speaker", "price": 79.99, "rating": 4.8 }
+{ "index": { "_id": 3 } }
+{ "title": "Noise Cancelling Headphones", "price": 199.99, "rating": 4.7 }
+```
+{% include copy-curl.html %}
+
+## Basic script query
+
+Return products with a rating higher than `4.6`:
+
+```json
+POST /products/_search
+{
+  "query": {
+    "script": {
+      "script": {
+        "source": "doc['rating'].value > 4.6"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+The returned hits only include documents with a `rating` higher than `4.6`:
+
+```json
+{
+  ...
+  "hits": {
+    "total": {
+      "value": 2,
+      "relation": "eq"
+    },
+    "max_score": 1,
+    "hits": [
+      {
+        "_index": "products",
+        "_id": "2",
+        "_score": 1,
+        "_source": {
+          "title": "Bluetooth Speaker",
+          "price": 79.99,
+          "rating": 4.8
+        }
+      },
+      {
+        "_index": "products",
+        "_id": "3",
+        "_score": 1,
+        "_source": {
+          "title": "Noise Cancelling Headphones",
+          "price": 199.99,
+          "rating": 4.7
+        }
+      }
+    ]
+  }
+}
+```
+
+## Parameters
+
+The `script` query takes the following top-level parameters.
+
+| Parameter       | Required/Optional | Description                                            |
+| --------------- | ----------------- | ------------------------------------------------------ |
+| `script.source` | Required          | The script code that evaluates to `true` or `false`.
| +| `script.params` | Optional | User-defined parameters referenced inside the script. | + +## Using script parameters + +You can use `params` to safely inject values, taking advantage of script compilation caching: + +```json +POST /products/_search +{ + "query": { + "script": { + "script": { + "source": "doc['price'].value < params.max_price", + "params": { + "max_price": 100 + } + } + } + } +} +``` +{% include copy-curl.html %} + +The returned hits only include documents with a `price` of less than `100`: + +```json +{ + ... + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "products", + "_id": "1", + "_score": 1, + "_source": { + "title": "Wireless Earbuds", + "price": 99.99, + "rating": 4.5 + } + }, + { + "_index": "products", + "_id": "2", + "_score": 1, + "_source": { + "title": "Bluetooth Speaker", + "price": 79.99, + "rating": 4.8 + } + } + ] + } +} +``` + +## Combining multiple conditions + +Use the following query to search for products with a `rating` higher than `4.5` and a `price` lower than `100`: + +```json +POST /products/_search +{ + "query": { + "script": { + "script": { + "source": "doc['rating'].value > 4.5 && doc['price'].value < 100" + } + } + } +} +``` +{% include copy-curl.html %} + +Only the documents that match the requirements are returned: + +```json +{ + "took": 12, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "products", + "_id": "2", + "_score": 1, + "_source": { + "title": "Bluetooth Speaker", + "price": 79.99, + "rating": 4.8 + } + } + ] + } +} +``` diff --git a/_query-dsl/specialized/template.md b/_query-dsl/specialized/template.md new file mode 100644 index 00000000000..e89a69c7060 --- /dev/null +++ b/_query-dsl/specialized/template.md @@ -0,0 +1,101 @@ +--- +layout: default +title: Template +parent: Specialized queries +nav_order: 70 +--- + +# Template query +Introduced 2.19 +{: .label .label-purple } + +Use a `template` query to create search queries that contain placeholder variables. Placeholders are specified using the `"${variable_name}"` syntax (note that the variables must be enclosed in quotation marks). When you submit a search request, these placeholders remain unresolved until they are processed by search request processors. This approach is particularly useful when your initial search request contains data that needs to be transformed or generated at runtime. + +For example, you might use a template query when working with the [ml_inference search request processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/ml-inference-search-request/), which converts text input into vector embeddings during the search process. The processor will replace the placeholders with the generated values before the final query is executed. + +For a complete example, see [Query rewriting using template queries]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/template-query/). + +## Example + +The following example shows a template k-NN query with a `"vector": "${text_embedding}"` placeholder. 
The placeholder `"${text_embedding}"` will be replaced with embeddings generated by the `ml_inference` search request processor from the `text` input field:
+
+```json
+GET /template-knn-index/_search?search_pipeline=my_knn_pipeline
+{
+  "query": {
+    "template": {
+      "knn": {
+        "text_embedding": {
+          "vector": "${text_embedding}", // Placeholder for the vector field
+          "k": 2
+        }
+      }
+    }
+  },
+  "ext": {
+    "ml_inference": {
+      "text": "sneakers" // Input text for the ml_inference processor
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+To use a template query with a search request processor, you need to configure a search pipeline. The following is an example configuration for the `ml_inference` search request processor. The `input_map` maps document fields to model inputs. In this example, the `ext.ml_inference.text` source field in a document is mapped to the `inputText` field---the expected input field for the model. The `output_map` maps model outputs to document fields. In this example, the `embedding` output field from the model is mapped to the `text_embedding` destination field in your document:
+
+```json
+PUT /_search/pipeline/my_knn_pipeline
+{
+  "request_processors": [
+    {
+      "ml_inference": {
+        "model_id": "Sz-wFZQBUpPSu0bsJTBG",
+        "input_map": [
+          {
+            "inputText": "ext.ml_inference.text" // Map input text from the request
+          }
+        ],
+        "output_map": [
+          {
+            "text_embedding": "embedding" // Map output to the placeholder
+          }
+        ]
+      }
+    }
+  ]
+}
+```
+{% include copy-curl.html %}
+
+After the `ml_inference` search request processor runs, the search request is rewritten. The `vector` field contains the embeddings generated by the processor, and the `text_embedding` field contains the processor output:
+
+```json
+GET /template-knn-index/_search
+{
+  "query": {
+    "template": {
+      "knn": {
+        "text_embedding": {
+          "vector": [0.6328125, 0.26953125, ...],
+          "k": 2
+        }
+      }
+    }
+  },
+  "ext": {
+    "ml_inference": {
+      "text": "sneakers",
+      "text_embedding": [0.6328125, 0.26953125, ...]
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Limitations
+
+Template queries require at least one search request processor in order to resolve placeholders. Search request processors must be configured to produce the variables expected by the template query.
+
+## Next steps
+
+- For a complete example, see [Template queries]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/template-query/).
\ No newline at end of file
diff --git a/_query-dsl/term/regexp.md b/_query-dsl/term/regexp.md
index 34a0c916ce2..ae3b94ebe20 100644
--- a/_query-dsl/term/regexp.md
+++ b/_query-dsl/term/regexp.md
@@ -7,7 +7,7 @@ nav_order: 60
 # Regexp query
-Use the `regexp` query to search for terms that match a regular expression.
+Use the `regexp` query to search for terms that match a regular expression. For more information about writing regular expressions, see [Regular expression syntax]({{site.url}}{{site.baseurl}}/query-dsl/regex-syntax/).
 The following query searches for any term that starts with any uppercase or lowercase letter followed by `amlet`:
@@ -29,7 +29,8 @@ Note the following important considerations:
 - By default, the maximum length of a regular expression is 1,000 characters. To change the maximum length, update the `index.max_regex_length` setting.
 - Regular expressions use the Lucene syntax, which differs from more standardized implementations. Test thoroughly to ensure that you receive the results you expect. To learn more, see [the Lucene documentation](https://lucene.apache.org/core/8_9_0/core/index.html).
- To improve regexp query performance, avoid wildcard patterns without a prefix or suffix, such as `.*` or `.*?+`. -- `regexp` queries can be expensive operations and require the [`search.allow_expensive_queries`]({{site.url}}{{site.baseurl}}/query-dsl/index/#expensive-queries) setting to be set to `true`. Before making frequent `regexp` queries, test their impact on cluster performance and examine alternative queries that may achieve similar results. +- `regexp` queries can be expensive operations and require the [`search.allow_expensive_queries`]({{site.url}}{{site.baseurl}}/query-dsl/#expensive-queries) setting to be set to `true`. Before making frequent `regexp` queries, test their impact on cluster performance and examine alternative queries that may achieve similar results. +- The [wildcard field type]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/wildcard/) builds an index that is specially designed to be very efficient for wildcard and regular expression queries. ## Parameters @@ -57,7 +58,7 @@ Parameter | Data type | Description `value` | String | The regular expression used for matching terms in the field specified in `<field>`. `boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0. `case_insensitive` | Boolean | If `true`, allows case-insensitive matching of the regular expression value with the indexed field values. Default is `false` (case sensitivity is determined by the field's mapping). -`flags` | String | Enables optional operators for Lucene’s regular expression engine. +`flags` | String | Enables optional operators for Lucene's regular expression engine. For valid values, see [Optional operators]({{site.url}}{{site.baseurl}}/query-dsl/regex-syntax/#optional-operators). `max_determinized_states` | Integer | Lucene converts a regular expression to an automaton with a number of determinized states. This parameter specifies the maximum number of automaton states the query requires. Use this parameter to prevent high resource consumption. To run complex regular expressions, you may need to increase the value of this parameter. Default is 10,000. `rewrite` | String | Determines how OpenSearch rewrites and scores multi-term queries. Valid values are `constant_score`, `scoring_boolean`, `constant_score_boolean`, `top_terms_N`, `top_terms_boost_N`, and `top_terms_blended_freqs_N`. Default is `constant_score`. diff --git a/_query-dsl/term/term.md b/_query-dsl/term/term.md index a33146f6aa3..e833c014fe2 100644 --- a/_query-dsl/term/term.md +++ b/_query-dsl/term/term.md @@ -42,6 +42,9 @@ GET shakespeare/_search ``` {% include copy-curl.html %} +In OpenSearch 2.x and earlier, complexity can increase exponentially with the number of characters, leading to high heap memory usage and reduced performance. To avoid this, do not use case-insensitive searches. Instead, apply a [lowercase token filter]({{site.url}}{{site.baseurl}}/analyzers/token-filters/lowercase/) in the indexed field's analyzer and use lowercase query terms. +{: .warning} + The response contains the matching documents despite any differences in case: ```json @@ -95,4 +98,5 @@ Parameter | Data type | Description :--- | :--- | :--- `value` | String | The term to search for in the field specified in `<field>`. 
A document is returned in the results only if its field value exactly matches the term, with the correct spacing and capitalization. `boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0. +`_name` | String | The name of the query for query tagging. Optional. `case_insensitive` | Boolean | If `true`, allows case-insensitive matching of the value with the indexed field values. Default is `false` (case sensitivity is determined by the field's mapping). diff --git a/_query-dsl/term/terms.md b/_query-dsl/term/terms.md index 7dac6a96190..c73e03a2a85 100644 --- a/_query-dsl/term/terms.md +++ b/_query-dsl/term/terms.md @@ -28,6 +28,10 @@ A document is returned if it matches any of the terms in the array. By default, the maximum number of terms allowed in a `terms` query is 65,536. To change the maximum number of terms, update the `index.max_terms_count` setting. +For better query performance, pass long arrays containing terms in sorted order (ordered by UTF-8 byte values, ascending). +{: .tip} + + The ability to [highlight results]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/highlight/) for terms queries may not be guaranteed, depending on the highlighter type and the number of terms in the query. {: .note} @@ -39,6 +43,7 @@ Parameter | Data type | Description :--- | :--- | :--- `<field>` | String | The field in which to search. A document is returned in the results only if its field value exactly matches at least one term, with the correct spacing and capitalization. `boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0. +`_name` | String | The name of the query for query tagging. Optional. `value_type` | String | Specifies the types of values used for filtering. Valid values are `default` and `bitmap`. If omitted, the value defaults to `default`. ## Terms lookup @@ -183,7 +188,7 @@ PUT classes/_doc/102 To search for students enrolled in `CS102`, use the dot path notation to specify the full path to the field in the `path` parameter: ```json -ET students/_search +GET students/_search { "query": { "terms": { @@ -250,15 +255,15 @@ Parameter | Data type | Description `id` | String | The document ID of the document from which to fetch field values. Required. `path` | String | The name of the field from which to fetch field values. Specify nested fields using dot path notation. Required. `routing` | String | Custom routing value of the document from which to fetch field values. Optional. Required if a custom routing value was provided when the document was indexed. -`boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0. +`store` | Boolean | Whether to perform the lookup on the stored field instead of `_source`. Optional. ## Bitmap filtering **Introduced 2.17** {: .label .label-purple } -The `terms` query can filter for multiple terms simultaneously. 
However, when the number of terms in the input filter increases to a large value (around 10,000), the resulting network and memory overhead can become significant, making the query inefficient. In such cases, consider encoding your large terms filter using a [roaring bitmap](https://github.com/RoaringBitmap/RoaringBitmap) for more efficient filtering. +The `terms` query can filter for multiple terms simultaneously. However, when the number of terms in the input filter increases to a large value (around 10,000), the resulting network and memory overhead can become significant, making the query inefficient. In such cases, consider encoding your large terms filter using a [roaring bitmap](https://github.com/RoaringBitmap/RoaringBitmap) for more efficient filtering. -The following example assumes that you have two indexes: a `products` index, which contains all the products sold by a company, and a `customers` index, which stores filters representing customers who own specific products. +The following example assumes that you have two indexes: a `products` index, which contains all the products sold by a company, and a `customers` index, which stores filters representing customers who own specific products. First, create a `products` index and map `product_id` as a `keyword`: @@ -267,7 +272,7 @@ PUT /products { "mappings": { "properties": { - "product_id": { "type": "keyword" } + "product_id": { "type": "integer" } } } } @@ -277,33 +282,33 @@ PUT /products Next, index three documents that correspond to products: ```json -PUT students/_doc/1 +PUT /products/_doc/1 { "name": "Product 1", - "product_id" : "111" + "product_id" : 111 } ``` {% include copy-curl.html %} ```json -PUT students/_doc/2 +PUT /products/_doc/2 { "name": "Product 2", - "product_id" : "222" + "product_id" : 222 } ``` {% include copy-curl.html %} ```json -PUT students/_doc/3 +PUT /products/_doc/3 { "name": "Product 3", - "product_id" : "333" + "product_id" : 333 } ``` {% include copy-curl.html %} -To store customer bitmap filters, you'll create a `customer_filter` [binary field](https://opensearch.org/docs/latest/field-types/supported-field-types/binary/) in the `customers` index. Specify `store` as `true` to store the field: +To store customer bitmap filters, you'll create a `customer_filter` [binary field]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/binary/) in the `customers` index. Specify `store` as `true` to store the field: ```json PUT /customers @@ -377,10 +382,12 @@ POST /products/_search { "query": { "terms": { - "product_id": "OjAAAAEAAAAAAAIAEAAAAG8A3gBNAQ==", + "product_id": [ + "OjAAAAEAAAAAAAIAEAAAAG8A3gBNAQ==" + ], "value_type": "bitmap" } } } ``` -{% include copy-curl.html %} \ No newline at end of file +{% include copy-curl.html %} diff --git a/_query-dsl/term/wildcard.md b/_query-dsl/term/wildcard.md index c6e0499517f..f9ca1636d7b 100644 --- a/_query-dsl/term/wildcard.md +++ b/_query-dsl/term/wildcard.md @@ -36,6 +36,8 @@ If you change `*` to `?`, you get no matches because `?` refers to a single char Wildcard queries tend to be slow because they need to iterate over a lot of terms. Avoid placing wildcard characters at the beginning of a query because it could be a very expensive operation in terms of both resources and time. +The [wildcard field type]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/wildcard/) builds an index that is specially designed to be very efficient for wildcard and regular expression queries. 
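+
+For example, a field that is frequently the target of wildcard or regular expression searches can be mapped as `wildcard` instead of `keyword`. The following mapping is a minimal sketch; the `logs` index and `message` field names are illustrative only:
+
+```json
+PUT /logs
+{
+  "mappings": {
+    "properties": {
+      "message": {
+        "type": "wildcard"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}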
+ ## Parameters The query accepts the name of the field (`<field>`) as a top-level parameter: @@ -64,5 +66,5 @@ Parameter | Data type | Description `case_insensitive` | Boolean | If `true`, allows case-insensitive matching of the value with the indexed field values. Default is `false` (case sensitivity is determined by the field's mapping). `rewrite` | String | Determines how OpenSearch rewrites and scores multi-term queries. Valid values are `constant_score`, `scoring_boolean`, `constant_score_boolean`, `top_terms_N`, `top_terms_boost_N`, and `top_terms_blended_freqs_N`. Default is `constant_score`. -If [`search.allow_expensive_queries`]({{site.url}}{{site.baseurl}}/query-dsl/index/#expensive-queries) is set to `false`, then wildcard queries are not executed. +If [`search.allow_expensive_queries`]({{site.url}}{{site.baseurl}}/query-dsl/#expensive-queries) is set to `false`, then wildcard queries are not executed. {: .important} diff --git a/_reporting/report-dashboard-index.md b/_reporting/report-dashboard-index.md index 0df87a965c9..5e6d07b8021 100644 --- a/_reporting/report-dashboard-index.md +++ b/_reporting/report-dashboard-index.md @@ -11,7 +11,7 @@ redirect_from: You can use OpenSearch Dashboards to create PNG, PDF, and CSV reports. To create reports, you must have the correct permissions. For a summary of the predefined roles and the permissions they grant, see the [Security plugin]({{site.url}}{{site.baseurl}}/security/access-control/users-roles#predefined-roles). -CSV reports have a non-configurable 10,000 row limit. They have no explicit size limit (for example, MB), but extremely large documents could cause report generation to fail with an out of memory error from the V8 JavaScript engine. +CSV reports have a non-configurable 10,000-row limit in OpenSearch version 2.16 and earlier. As of version 2.17, this limit can be configured when setting up a report. While reports have no explicit size limit (for example, MB), extremely large documents could cause report generation to fail with an out-of-memory error from the V8 JavaScript engine. 
{: .tip } ## Generating reports diff --git a/_sass/_home.scss b/_sass/_home.scss index 9b5dd864a90..0a3d1f7dac3 100644 --- a/_sass/_home.scss +++ b/_sass/_home.scss @@ -22,11 +22,16 @@ // Card style -.card-container-wrapper { +.home-card-container-wrapper { @include gradient-open-sky; + margin-bottom: 2rem; } -.card-container { +.card-container-wrapper { + margin-bottom: 0; +} + +.home-card-container { display: grid; grid-template-columns: 1fr; margin: 0 auto; @@ -42,11 +47,27 @@ } } -.card { +.card-container { + display: grid; + grid-template-columns: 1fr; + margin: 0 auto; + padding: 2rem 0; + grid-row-gap: 1rem; + grid-column-gap: 1rem; + grid-auto-rows: 1fr; + @include mq(md) { + grid-template-columns: repeat(1, 1fr); + } + @include mq(lg) { + grid-template-columns: repeat(2, 1fr); + } +} + +.home-card { @extend .panel; @include thick-edge-left; padding: 1rem; - margin-bottom: 4rem; + margin-bottom: 2rem; text-align: left; background-color: white; display: flex; @@ -67,9 +88,9 @@ } } -@mixin heading-font { +@mixin heading-font($size: 1.5rem) { @include heading-sans-serif; - font-size: 1.5rem; + font-size: $size; font-weight: 700; color: $blue-dk-300; } @@ -81,6 +102,14 @@ margin: 1rem 0 1.5rem 0; } +.card { + @extend .home-card; + margin-bottom: 0; + .heading { + @include heading-font(1.2rem); + } +} + .heading-main { @include heading-font; margin: 0; @@ -110,6 +139,53 @@ width: 100%; } +// List layout + +.numbered-list { + display: flex; + flex-direction: column; + gap: 2rem; + padding: 1rem; +} + +.list-item { + display: flex; + align-items: flex-start; + gap: 1rem; +} + +.number-circle { + width: 2.5rem; + height: 2.5rem; + border-radius: 50%; + background-color: $blue-lt-100; + color: $blue-dk-300; + display: flex; + align-items: center; + justify-content: center; + font-weight: bold; + font-size: 1.2rem; + flex-shrink: 0; +} + +.list-content { + max-width: 100%; +} + +.list-heading { + @include heading-font (1.2rem); + margin: 0 0 0.75rem 0; + font-size: 1.2rem; + color: $blue-dk-300; + font-weight: bold; +} + +.list-content p { + margin: 0.5rem 0; + font-size: 1rem; + line-height: 1.5; +} + // Banner style .os-banner { diff --git a/_sass/_web-embed.scss b/_sass/_web-embed.scss index bafbdf71128..21ce53b3feb 100644 --- a/_sass/_web-embed.scss +++ b/_sass/_web-embed.scss @@ -4,12 +4,15 @@ https://github.com/nathancy/jekyll-embed-video */ .embed-container { - position: relative; - padding-bottom: 56.25%; - height: 0; - overflow: hidden; - max-width: 100%; + position: relative; + width: 100%; + max-width: 100%; + margin: 0 auto; + aspect-ratio: 16/9; + @include mq(lg) { + max-width: 640px; } +} .embed-container iframe, .embed-container object, .embed-container embed { position: absolute; @@ -17,4 +20,5 @@ https://github.com/nathancy/jekyll-embed-video left: 0; width: 100%; height: 100%; - } \ No newline at end of file + border: 0; + } diff --git a/_sass/custom/custom.scss b/_sass/custom/custom.scss index b3ee3c3775c..929e42ab72e 100755 --- a/_sass/custom/custom.scss +++ b/_sass/custom/custom.scss @@ -203,6 +203,13 @@ img { border-left: 5px solid $red-100; } +.info { + @extend %callout; + border-left: 5px solid $blue-300; + font-weight: 600; + background-color: $blue-lt-000; +} + @mixin version-warning ( $version: 'latest' ){ @extend %callout, .panel; font-weight: 600; @@ -307,6 +314,43 @@ img { } } +@mixin btn-dark-blue { + color: white; + background-color: $blue-300; + font-size: 1.13rem; + font-weight: 510; + border-width: 1px; + border-style: solid; + border-radius: 5px; + 
box-shadow: 1px 1px $grey-lt-300; + cursor: pointer; +} + +.btn-dark-blue { + @include btn-dark-blue; + border-color: $blue-dk-300; + padding: 0.5rem 1rem; + margin-left: 0.4rem; + margin-right: 0.4rem; + + &:hover:not([disabled]) { + background-color: $blue-vibrant-300; + box-shadow: 1px 2px 4px $grey-lt-300; + transform: translateY(-1px); + text-decoration: underline; + text-underline-offset: 2px; + } + + &:active { + transform: translateY(1px); + } +} + +.centering-container { + display: flex; + justify-content: center; +} + // Back to top button .top-link { display: block; @@ -347,12 +391,17 @@ div.highlighter-rouge { display: flex; justify-content: flex-end; } - -// Copy code button + +.button-container { + position: absolute; + bottom: 10px; + right: 10px; + display: flex; +} + .copy-button { @extend .btn-general; background-color: $sidebar-color; - display: inline; } // Copy as curl button @@ -1206,7 +1255,77 @@ body { line-height: 24px; } +.code-container { + position: relative; + padding-bottom: 3.5rem; /* Make room for scroll bar */ + background-color: $code-background-color; +} + +.button-container { + position: absolute; + bottom: 10px; + right: 10px; + display: flex; + background: inherit; /* Match parent background */ +} + +/* Add horizontal scroll to the code area */ +.code-container .highlight { + overflow-x: auto; + margin: 0; +} + +.code-container .highlight pre { + margin: 0; + white-space: pre; +} + +.code-tabs { + margin-bottom: 1.5rem; +} + +.code-tabs .tab-nav { + border-bottom: 1px solid #eeebee; + margin-bottom: -1px; +} + +.code-tabs .tab-button { + background: none; + border: 1px solid transparent; + padding: 8px 16px; + cursor: pointer; + margin-bottom: -1px; + color: $blue-dk-200; + &:hover { + background-color: $blue-lt-100; + } +} + +.code-tabs .tab-button.active { + border-color: #eeebee; + border-bottom-color: #fff; + color: $blue-dk-300; +} + +.code-tabs .tab { + display: none; + border: 1px solid #eeebee; + padding: 15px; +} + +.code-tabs .tab.active { + display: block; +} + +/* Ensure copy buttons stay on top of scrolled content */ +.copy-button { + z-index: 1; + position: relative; +} + + @import "../font-awesome.scss"; @import "../_navigation-header.scss"; @import "../footer.scss"; -@import "../_home.scss"; \ No newline at end of file +@import "../_home.scss"; +@import "../web-embed"; \ No newline at end of file diff --git a/_search-plugins/caching/index.md b/_search-plugins/caching/index.md index 000d0b61c13..c953462f3cd 100644 --- a/_search-plugins/caching/index.md +++ b/_search-plugins/caching/index.md @@ -25,9 +25,6 @@ OpenSearch supports the following on-heap cache types: **Introduced 2.14** {: .label .label-purple } -This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/10024). -{: .warning} - In addition to existing custom OpenSearch on-heap cache stores, cache plugins provide the following cache stores: - **Disk cache**: Stores the precomputed result of a query on disk. Use a disk cache to cache much larger datasets, provided that the disk's latency is within an acceptable range. 
diff --git a/_search-plugins/caching/request-cache.md b/_search-plugins/caching/request-cache.md index 124152300b4..b4ec39462aa 100644 --- a/_search-plugins/caching/request-cache.md +++ b/_search-plugins/caching/request-cache.md @@ -12,7 +12,7 @@ The OpenSearch index request cache is a specialized caching mechanism designed t The cache is automatically invalidated at the configured refresh interval. The invalidation includes document updates (including document deletions) and changes to index settings. This ensures that stale results are never returned from the cache. When the cache size exceeds its configured limit, the least recently used entries are evicted to make room for new entries. -Search requests with `size=0` are cached in the request cache by default. Search requests with non-deterministic characteristics (such as `Math.random()`) or relative times (such as `now` or `new Date()`) are ineligible for caching. +Some queries are ineligible for the request cache. These include profiled queries, scroll queries, and search requests with non-deterministic characteristics (such as those using `Math.random()` or DFS queries) or relative times (such as `now` or `new Date()`). By default, only requests with `size=0` are cacheable. In OpenSearch 2.19 and later, this behavior can be changed using `indices.requests.cache.maximum_cacheable_size`. {: .note} ## Configuring request caching @@ -28,6 +28,7 @@ Setting | Data type | Default | Level | Static/Dynamic | Description `indices.cache.cleanup_interval` | Time unit | `1m` (1 minute) | Cluster | Static | Schedules a recurring background task that cleans up expired entries from the cache at the specified interval. `indices.requests.cache.size` | Percentage | `1%` | Cluster | Static | The cache size as a percentage of the heap size (for example, to use 1% of the heap, specify `1%`). `index.requests.cache.enable` | Boolean | `true` | Index | Dynamic | Enables or disables the request cache. +`indices.requests.cache.maximum_cacheable_size` | Integer | `0` | Cluster | Dynamic | Sets the maximum `size` of queries to be added to the request cache. ### Example diff --git a/_search-plugins/caching/tiered-cache.md b/_search-plugins/caching/tiered-cache.md index 1b793c84653..547dbcf7304 100644 --- a/_search-plugins/caching/tiered-cache.md +++ b/_search-plugins/caching/tiered-cache.md @@ -8,26 +8,12 @@ nav_order: 10 # Tiered cache -This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/10024). -{: .warning} - A tiered cache is a multi-level cache in which each tier has its own characteristics and performance levels. By combining different tiers, you can achieve a balance between cache performance and size. ## Types of tiered caches OpenSearch provides an implementation of a `_tiered` spillover `cache_`. This implementation spills any items removed from the upper tiers to the lower tiers of cache. The upper tier, such as the on-heap tier, is smaller in size but offers better latency. The lower tier, such as the disk cache, is larger in size but slower in terms of latency. OpenSearch offers both on-heap and disk tiers. 
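+
+For example, after a disk cache plugin is installed and the tiered spillover cache is configured (as described in the following sections), the disk tier can be tuned dynamically. The following request is a sketch that assumes the dynamic settings listed in the table later on this page are applied through the cluster settings API; the values shown are illustrative only:
+
+```json
+PUT /_cluster/settings
+{
+  "persistent": {
+    "indices.requests.cache.tiered_spillover.disk.store.enabled": true,
+    "indices.requests.cache.tiered_spillover.disk.store.policies.took_time.threshold": "15ms"
+  }
+}
+```
+{% include copy-curl.html %}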
-## Enabling a tiered cache - -To enable a tiered cache, configure the following setting in `opensearch.yml`: - -```yaml -opensearch.experimental.feature.pluggable.caching.enabled: true -``` -{% include copy.html %} - -For more information about ways to enable experimental features, see [Experimental feature flags]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/experimental/). - ## Installing required plugins To use tiered caching, install a tiered cache plugin. As of OpenSearch 2.13, the only available cache plugin is the `cache-ehcache` plugin. This plugin provides a disk cache implementation that can be used as a disk tier within a tiered cache. For more information about installing non-bundled plugins, see [Additional plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/#additional-plugins). @@ -88,7 +74,8 @@ The following table lists additional settings for the `tiered_spillover` store s Setting | Data type | Default | Description :--- | :--- | :--- | :--- -`indices.requests.cache.tiered_spillover.disk.store.policies.took_time.threshold` | Time unit | `10ms` | A policy used to determine whether to cache a query into a disk cache based on its took time. This is a dynamic setting. Optional. +`indices.requests.cache.tiered_spillover.policies.took_time.threshold` | Time unit | `0ms` | A policy used to determine whether to cache a query into the cache based on its query phase execution time. This is a dynamic setting. Optional. +`indices.requests.cache.tiered_spillover.disk.store.policies.took_time.threshold` | Time unit | `10ms` | A policy used to determine whether to cache a query into the disk tier of the cache based on its query phase execution time. This is a dynamic setting. Optional. `indices.requests.cache.tiered_spillover.disk.store.enabled` | Boolean | `True` | Enables or disables the disk cache dynamically within a tiered spillover cache. Note: After disabling a disk cache, entries are not removed automatically and requires the cache to be manually cleared. Optional. `indices.requests.cache.tiered_spillover.onheap.store.size` | Percentage | 1% of the heap size | Defines the size of the on-heap cache within tiered cache. Optional. `indices.requests.cache.tiered_spillover.disk.store.size` | Long | `1073741824` (1 GB) | Defines the size of the disk cache within tiered cache. Optional. diff --git a/_search-plugins/concurrent-segment-search.md b/_search-plugins/concurrent-segment-search.md index 6675faf1f97..5bb0f80eda3 100644 --- a/_search-plugins/concurrent-segment-search.md +++ b/_search-plugins/concurrent-segment-search.md @@ -22,9 +22,12 @@ Without concurrent segment search, Lucene executes a request sequentially across ## Enabling concurrent segment search at the index or cluster level -Starting with OpenSearch version 2.17, you can use the `search.concurrent_segment_search.mode` setting to configure concurrent segment search on your cluster. The existing `search.concurrent_segment_search.enabled` setting will be deprecated in future version releases in favor of the new setting. +Starting with OpenSearch version 3.0, concurrent segment search is enabled at the cluster level by default. The default concurrent segment search mode is `auto`. After upgrading, aggregation workloads may experience increased CPU utilization. We recommend monitoring your cluster's resource usage and adjusting your infrastructure capacity as needed to maintain optimal performance. +{: .important} -By default, concurrent segment search is disabled on the cluster. 
You can enable concurrent segment search at two levels: +To configure concurrent segment search on your cluster, use the `search.concurrent_segment_search.mode` setting. The older `search.concurrent_segment_search.enabled` setting will be deprecated in future version releases in favor of the new setting. + +You can enable concurrent segment search at two levels: - Cluster level - Index level @@ -34,11 +37,11 @@ The index-level setting takes priority over the cluster-level setting. Thus, if Both the cluster- and index-level `search.concurrent_segment_search.mode` settings accept the following values: -- `all`: Enables concurrent segment search across all search requests. This is equivalent to setting `search.concurrent_segment_search.enabled` to `true`. +- `auto` (Default): In this mode, OpenSearch will use the pluggable _concurrent search decider_ to decide whether to use a concurrent or sequential path for the search request based on the query evaluation and the presence of aggregations in the request. By default, if there are no deciders configured by any plugin, then the decision to use concurrent search will be made based on the presence of aggregations in the request. For more information about the pluggable decider semantics, see [Pluggable concurrent search deciders](#pluggable-concurrent-search-deciders-concurrentsearchrequestdecider). -- `none`: Disables concurrent segment search for all search requests, effectively turning off the feature. This is equivalent to setting `search.concurrent_segment_search.enabled` to `false`. This is the **default** behavior. +- `all`: Enables concurrent segment search across all search requests. This is equivalent to setting `search.concurrent_segment_search.enabled` to `true`. -- `auto`: In this mode, OpenSearch will use the pluggable _concurrent search decider_ to decide whether to use a concurrent or sequential path for the search request based on the query evaluation and the presence of aggregations in the request. By default, if there are no deciders configured by any plugin, then the decision to use concurrent search will be made based on the presence of aggregations in the request. For more information about the pluggable decider semantics, see [Pluggable concurrent search deciders](#pluggable-concurrent-search-deciders-concurrentsearchrequestdecider). +- `none`: Disables concurrent segment search for all search requests, effectively turning off the feature. This is equivalent to setting `search.concurrent_segment_search.enabled` to `false`. To enable concurrent segment search for all search requests across every index in the cluster, send the following request: @@ -109,25 +112,28 @@ PUT <index-name>/_settings ``` {% include copy-curl.html %} +## Slicing mechanisms +You can choose one of two available mechanisms for assigning segments to slices: the default [max slice count mechanism](#the-max-slice-count-mechanism) or the [Lucene mechanism](#the-lucene-mechanism). +### The max slice count mechanism -## Slicing mechanisms +The _max slice count_ mechanism is a slicing mechanism that uses a dynamically configurable maximum number of slices and divides segments among the slices in a round-robin fashion. This is useful when there are already too many top-level shard requests and you want to limit the number of slices per request in order to reduce competition between the slices. 
-You can choose one of two available mechanisms for assigning segments to slices: the default [Lucene mechanism](#the-lucene-mechanism) or the [max slice count mechanism](#the-max-slice-count-mechanism). +Starting with OpenSearch version 3.0, concurrent segment search uses the max slice count mechanism by default. The max slice count is calculated at cluster startup using the formula `Math.max(1, Math.min(Runtime.getRuntime().availableProcessors() / 2, 4))`. You can override this value by explicitly setting the `max_slice_count` parameter at either the cluster level or index level. For more information about updating `max_slice_count`, see [Setting the slicing mechanism](#setting-the-slicing-mechanism). To revert to the default calculated value, set `max_slice_count` to `null`. ### The Lucene mechanism -By default, Lucene assigns a maximum of 250K documents or 5 segments (whichever is met first) to each slice in a shard. For example, consider a shard with 11 segments. The first 5 segments have 250K documents each, and the next 6 segments have 20K documents each. The first 5 segments will be assigned to 1 slice each because they each contain the maximum number of documents allowed for a slice. Then the next 5 segments will all be assigned to another single slice because of the maximum allowed segment count for a slice. The 11th slice will be assigned to a separate slice. +The Lucene mechanism is an alternative to the max slice count mechanism. By default, Lucene assigns a maximum of 250K documents or 5 segments (whichever is met first) to each slice in a shard. For example, consider a shard with 11 segments. The first 5 segments have 250K documents each, and the next 6 segments have 20K documents each. The first 5 segments will be assigned to 1 slice each because they each contain the maximum number of documents allowed for a slice. Then the next 5 segments will all be assigned to another single slice because of the maximum allowed segment count for a slice. The 11th segment will be assigned to a separate slice. -### The max slice count mechanism +### Setting the slicing mechanism -The _max slice count_ mechanism is an alternative slicing mechanism that uses a dynamically configurable maximum number of slices and divides segments among the slices in a round-robin fashion. This is useful when there are already too many top-level shard requests and you want to limit the number of slices per request in order to reduce competition between the slices. +You can set the slicing mechanism at the cluster level or index level by updating the `search.concurrent.max_slice_count` setting. -### Setting the slicing mechanism +Both the cluster- and index-level `search.concurrent.max_slice_count` settings can take the following valid values: -By default, concurrent segment search uses the Lucene mechanism to calculate the number of slices for each shard-level request. -To use the max slice count mechanism instead, you can set the slice count for concurrent segment search at either the cluster level or index level. +- Positive integer: Use the max target slice count mechanism. Usually, a value between 2 and 8 should be sufficient. +- `0`: Use the Lucene mechanism.
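+
+In addition to these values, you can set `max_slice_count` to `null` to clear an explicit override and return to the calculated default described above. The following is a minimal sketch that uses the cluster settings API; an index-level override can be cleared in the same way through the index settings API:
+
+```json
+PUT _cluster/settings
+{
+  "persistent": {
+    "search.concurrent.max_slice_count": null
+  }
+}
+```
+{% include copy-curl.html %}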
To configure the slice count for all indexes in a cluster, use the following dynamic cluster setting: @@ -151,17 +157,18 @@ PUT <index-name>/_settings ``` {% include copy-curl.html %} -Both the cluster- and index-level `search.concurrent.max_slice_count` settings can take the following valid values: -- `0`: Use the default Lucene mechanism. -- Positive integer: Use the max target slice count mechanism. Usually, a value between 2 and 8 should be sufficient. - ## General guidelines + Concurrent segment search helps to improve the performance of search requests at the cost of consuming more resources, such as CPU or JVM heap. It is important to test your workload in order to understand whether the cluster is sized correctly for concurrent segment search. We recommend adhering to the following concurrent segment search guidelines: * Start with a slice count of 2 and measure the performance of your workload. If resource utilization exceeds the recommended values, then consider scaling your cluster. Based on our testing, we have observed that if your workload is already consuming more than 50% of your CPU resources, then you need to scale your cluster for concurrent segment search. * If your slice count is 2 and you still have available resources in the cluster, then you can increase the slice count to a higher number, such as 4 or 6, while monitoring search latency and resource utilization in the cluster. * When many clients send search requests in parallel, a lower slice count usually works better. This is reflected in CPU utilization because a higher number of clients leads to more queries per second, which translates to higher resource usage. +When upgrading to OpenSearch 3.0, be aware that workloads with aggregations may experience higher CPU utilization because concurrent search is enabled by default in `auto` mode. If your OpenSearch 2.x cluster's CPU utilization exceeds 25% when running aggregation workloads, consider the following options before upgrading: + +- Plan to scale your cluster's resources to accommodate the increased CPU demand. +- Prepare to disable concurrent search if scaling is not feasible for your use case. ## Limitations @@ -175,7 +182,7 @@ The following sections provide additional considerations for concurrent segment ### The `terminate_after` search parameter -The [`terminate_after` search parameter]({{site.url}}{{site.baseurl}}/api-reference/search/#query-parameters) is used to terminate a search request once a specified number of documents has been collected. If you include the `terminate_after` parameter in a request, concurrent segment search is disabled and the request is run in a non-concurrent manner. +The [`terminate_after` search parameter]({{site.url}}{{site.baseurl}}/api-reference/search/#query-parameters) is used to terminate a search request once a specified number of matching documents has been collected. If you include the `terminate_after` parameter in a request, concurrent segment search is disabled and the request is run in a non-concurrent manner. Typically, queries are used with smaller `terminate_after` values and thus complete quickly because the search is performed on a reduced dataset. Therefore, concurrent search may not further improve performance in this case. Moreover, when `terminate_after` is used with other search request parameters, such as `track_total_hits` or `size`, it adds complexity and changes the expected query behavior. 
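+
+For illustration, the following request (using a hypothetical index and field name) passes `terminate_after` as a query parameter; because the parameter is present, the request runs on the non-concurrent path:
+
+```json
+GET /my-index/_search?terminate_after=1000
+{
+  "query": {
+    "match": {
+      "title": "wind"
+    }
+  }
+}
+```
+{% include copy-curl.html %}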
Falling back to a non-concurrent path for search requests that include `terminate_after` ensures consistent results between concurrent and non-concurrent requests. @@ -206,4 +213,4 @@ Introduced 2.17 {: .label .label-purple } Plugin developers can customize the concurrent search decision-making for `auto` mode by extending [`ConcurrentSearchRequestDecider`](https://github.com/opensearch-project/OpenSearch/blob/2.x/server/src/main/java/org/opensearch/search/deciders/ConcurrentSearchRequestDecider.java) and registering its factory through [`SearchPlugin#getConcurrentSearchRequestFactories()`](https://github.com/opensearch-project/OpenSearch/blob/2.x/server/src/main/java/org/opensearch/plugins/SearchPlugin.java#L148). The deciders are evaluated only if a request does not belong to any category listed in the [Limitations](#limitations) and [Other considerations](#other-considerations) sections. For more information about the decider implementation, see [the corresponding GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/15259). -The search request is parsed using a `QueryBuilderVisitor`, which calls the [`ConcurrentSearchRequestDecider#evaluateForQuery()`](https://github.com/opensearch-project/OpenSearch/blob/2.x/server/src/main/java/org/opensearch/search/deciders/ConcurrentSearchRequestDecider.java#L36) method of all the configured deciders for every node of the `QueryBuilder` tree in the search request. The final concurrent search decision is obtained by combining the decision from each decider returned by the [`ConcurrentSearchRequestDecider#getConcurrentSearchDecision()`](https://github.com/opensearch-project/OpenSearch/blob/2.x/server/src/main/java/org/opensearch/search/deciders/ConcurrentSearchRequestDecider.java#L44) method. \ No newline at end of file +The search request is parsed using a `QueryBuilderVisitor`, which calls the [`ConcurrentSearchRequestDecider#evaluateForQuery()`](https://github.com/opensearch-project/OpenSearch/blob/2.x/server/src/main/java/org/opensearch/search/deciders/ConcurrentSearchRequestDecider.java#L36) method of all the configured deciders for every node of the `QueryBuilder` tree in the search request. The final concurrent search decision is obtained by combining the decision from each decider returned by the [`ConcurrentSearchRequestDecider#getConcurrentSearchDecision()`](https://github.com/opensearch-project/OpenSearch/blob/2.x/server/src/main/java/org/opensearch/search/deciders/ConcurrentSearchRequestDecider.java#L44) method. 
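+
+If you need to turn concurrent segment search off dynamically, for example while evaluating the CPU impact of the `auto` default described in the guidelines above, you can set the mode to `none` by using the cluster settings API. The following is a minimal sketch:
+
+```json
+PUT _cluster/settings
+{
+  "persistent": {
+    "search.concurrent_segment_search.mode": "none"
+  }
+}
+```
+{% include copy-curl.html %}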
diff --git a/_search-plugins/cross-cluster-search.md b/_search-plugins/cross-cluster-search.md index 48a5e3cfbef..561a6b3c587 100644 --- a/_search-plugins/cross-cluster-search.md +++ b/_search-plugins/cross-cluster-search.md @@ -61,7 +61,7 @@ humanresources: ## Sample Docker setup -To define Docker permissions, save the following sample file as `docker-compose.yml` and run `docker-compose up` to start two single-node clusters on the same network: +To define Docker permissions, save the following sample file as `docker-compose.yml` and run `docker compose up` to start two single-node clusters on the same network: ```yml version: '3' diff --git a/_search-plugins/filter-search.md b/_search-plugins/filter-search.md index f8625e0ac07..28393055b49 100644 --- a/_search-plugins/filter-search.md +++ b/_search-plugins/filter-search.md @@ -1,6 +1,7 @@ --- layout: default -title: Filter search results +title: Filter results +parent: Search options nav_order: 36 --- diff --git a/_search-plugins/hybrid-search.md b/_search-plugins/hybrid-search.md deleted file mode 100644 index 6d68645421b..00000000000 --- a/_search-plugins/hybrid-search.md +++ /dev/null @@ -1,1021 +0,0 @@ ---- -layout: default -title: Hybrid search -has_children: false -nav_order: 60 ---- - -# Hybrid search -Introduced 2.11 -{: .label .label-purple } - -Hybrid search combines keyword and neural search to improve search relevance. To implement hybrid search, you need to set up a [search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/) that runs at search time. The search pipeline you'll configure intercepts search results at an intermediate stage and applies the [`normalization_processor`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/normalization-processor/) to them. The `normalization_processor` normalizes and combines the document scores from multiple query clauses, rescoring the documents according to the chosen normalization and combination techniques. - -**PREREQUISITE**<br> -To follow this example, you must set up a text embedding model. For more information, see [Choosing a model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/integrating-ml-models/#choosing-a-model). If you have already generated text embeddings, ingest the embeddings into an index and skip to [Step 4](#step-4-configure-a-search-pipeline). -{: .note} - -## Using hybrid search - -To use hybrid search, follow these steps: - -1. [Create an ingest pipeline](#step-1-create-an-ingest-pipeline). -1. [Create an index for ingestion](#step-2-create-an-index-for-ingestion). -1. [Ingest documents into the index](#step-3-ingest-documents-into-the-index). -1. [Configure a search pipeline](#step-4-configure-a-search-pipeline). -1. [Search the index using hybrid search](#step-5-search-the-index-using-hybrid-search). - -## Step 1: Create an ingest pipeline - -To generate vector embeddings, you need to create an [ingest pipeline]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/index/) that contains a [`text_embedding` processor]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/processors/text-embedding/), which will convert the text in a document field to vector embeddings. The processor's `field_map` determines the input fields from which to generate vector embeddings and the output fields in which to store the embeddings. 
- -The following example request creates an ingest pipeline that converts the text from `passage_text` to text embeddings and stores the embeddings in `passage_embedding`: - -```json -PUT /_ingest/pipeline/nlp-ingest-pipeline -{ - "description": "A text embedding pipeline", - "processors": [ - { - "text_embedding": { - "model_id": "bQ1J8ooBpBj3wT4HVUsb", - "field_map": { - "passage_text": "passage_embedding" - } - } - } - ] -} -``` -{% include copy-curl.html %} - -## Step 2: Create an index for ingestion - -In order to use the text embedding processor defined in your pipeline, create a k-NN index, adding the pipeline created in the previous step as the default pipeline. Ensure that the fields defined in the `field_map` are mapped as correct types. Continuing with the example, the `passage_embedding` field must be mapped as a k-NN vector with a dimension that matches the model dimension. Similarly, the `passage_text` field should be mapped as `text`. - -The following example request creates a k-NN index that is set up with a default ingest pipeline: - -```json -PUT /my-nlp-index -{ - "settings": { - "index.knn": true, - "default_pipeline": "nlp-ingest-pipeline" - }, - "mappings": { - "properties": { - "id": { - "type": "text" - }, - "passage_embedding": { - "type": "knn_vector", - "dimension": 768, - "method": { - "engine": "lucene", - "space_type": "l2", - "name": "hnsw", - "parameters": {} - } - }, - "passage_text": { - "type": "text" - } - } - } -} -``` -{% include copy-curl.html %} - -For more information about creating a k-NN index and using supported methods, see [k-NN index]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/). - -## Step 3: Ingest documents into the index - -To ingest documents into the index created in the previous step, send the following requests: - -```json -PUT /my-nlp-index/_doc/1 -{ - "passage_text": "Hello world", - "id": "s1" -} -``` -{% include copy-curl.html %} - -```json -PUT /my-nlp-index/_doc/2 -{ - "passage_text": "Hi planet", - "id": "s2" -} -``` -{% include copy-curl.html %} - -Before the document is ingested into the index, the ingest pipeline runs the `text_embedding` processor on the document, generating text embeddings for the `passage_text` field. The indexed document includes the `passage_text` field, which contains the original text, and the `passage_embedding` field, which contains the vector embeddings. - -## Step 4: Configure a search pipeline - -To configure a search pipeline with a [`normalization-processor`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/normalization-processor/), use the following request. The normalization technique in the processor is set to `min_max`, and the combination technique is set to `arithmetic_mean`. The `weights` array specifies the weights assigned to each query clause as decimal percentages: - -```json -PUT /_search/pipeline/nlp-search-pipeline -{ - "description": "Post processor for hybrid search", - "phase_results_processors": [ - { - "normalization-processor": { - "normalization": { - "technique": "min_max" - }, - "combination": { - "technique": "arithmetic_mean", - "parameters": { - "weights": [ - 0.3, - 0.7 - ] - } - } - } - } - ] -} -``` -{% include copy-curl.html %} - -## Step 5: Search the index using hybrid search - -To perform hybrid search on your index, use the [`hybrid` query]({{site.url}}{{site.baseurl}}/query-dsl/compound/hybrid/), which combines the results of keyword and semantic search. 
- -#### Example: Combining a neural query and a match query - -The following example request combines two query clauses---a `neural` query and a `match` query. It specifies the search pipeline created in the previous step as a query parameter: - -```json -GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline -{ - "_source": { - "exclude": [ - "passage_embedding" - ] - }, - "query": { - "hybrid": { - "queries": [ - { - "match": { - "passage_text": { - "query": "Hi world" - } - } - }, - { - "neural": { - "passage_embedding": { - "query_text": "Hi world", - "model_id": "aVeif4oB5Vm0Tdw8zYO2", - "k": 5 - } - } - } - ] - } - } -} -``` -{% include copy-curl.html %} - -Alternatively, you can set a default search pipeline for the `my-nlp-index` index. For more information, see [Default search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/using-search-pipeline/#default-search-pipeline). - -The response contains the matching document: - -```json -{ - "took" : 36, - "timed_out" : false, - "_shards" : { - "total" : 1, - "successful" : 1, - "skipped" : 0, - "failed" : 0 - }, - "hits" : { - "total" : { - "value" : 1, - "relation" : "eq" - }, - "max_score" : 1.2251667, - "hits" : [ - { - "_index" : "my-nlp-index", - "_id" : "1", - "_score" : 1.2251667, - "_source" : { - "passage_text" : "Hello world", - "id" : "s1" - } - } - ] - } -} -``` -{% include copy-curl.html %} - -#### Example: Combining a match query and a term query - -The following example request combines two query clauses---a `match` query and a `term` query. It specifies the search pipeline created in the previous step as a query parameter: - -```json -GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline -{ - "_source": { - "exclude": [ - "passage_embedding" - ] - }, - "query": { - "hybrid": { - "queries": [ - { - "match":{ - "passage_text": "hello" - } - }, - { - "term":{ - "passage_text":{ - "value":"planet" - } - } - } - ] - } - } -} -``` -{% include copy-curl.html %} - -The response contains the matching documents: - -```json -{ - "took": 11, - "timed_out": false, - "_shards": { - "total": 2, - "successful": 2, - "skipped": 0, - "failed": 0 - }, - "hits": { - "total": { - "value": 2, - "relation": "eq" - }, - "max_score": 0.7, - "hits": [ - { - "_index": "my-nlp-index", - "_id": "2", - "_score": 0.7, - "_source": { - "id": "s2", - "passage_text": "Hi planet" - } - }, - { - "_index": "my-nlp-index", - "_id": "1", - "_score": 0.3, - "_source": { - "id": "s1", - "passage_text": "Hello world" - } - } - ] - } -} -``` -{% include copy-curl.html %} - -## Hybrid search with post-filtering -**Introduced 2.13** -{: .label .label-purple } - -You can perform post-filtering on hybrid search results by providing the `post_filter` parameter in your query. - -The `post_filter` clause is applied after the search results have been retrieved. Post-filtering is useful for applying additional filters to the search results without impacting the scoring or the order of the results. - -Post-filtering does not impact document relevance scores or aggregation results. -{: .note} - -#### Example: Post-filtering - -The following example request combines two query clauses---a `term` query and a `match` query. 
This is the same query as in the [preceding example](#example-combining-a-match-query-and-a-term-query), but it contains a `post_filter`: - -```json -GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline -{ - "query": { - "hybrid":{ - "queries":[ - { - "match":{ - "passage_text": "hello" - } - }, - { - "term":{ - "passage_text":{ - "value":"planet" - } - } - } - ] - } - - }, - "post_filter":{ - "match": { "passage_text": "world" } - } -} - -``` -{% include copy-curl.html %} - -Compare the results to the results without post-filtering in the [preceding example](#example-combining-a-match-query-and-a-term-query). Unlike the preceding example response, which contains two documents, the response in this example contains one document because the second document is filtered using post-filtering: - -```json -{ - "took": 18, - "timed_out": false, - "_shards": { - "total": 2, - "successful": 2, - "skipped": 0, - "failed": 0 - }, - "hits": { - "total": { - "value": 1, - "relation": "eq" - }, - "max_score": 0.3, - "hits": [ - { - "_index": "my-nlp-index", - "_id": "1", - "_score": 0.3, - "_source": { - "id": "s1", - "passage_text": "Hello world" - } - } - ] - } -} -``` - - -## Combining hybrid search and aggregations -**Introduced 2.13** -{: .label .label-purple } - -You can enhance search results by combining a hybrid query clause with any aggregation that OpenSearch supports. Aggregations allow you to use OpenSearch as an analytics engine. For more information about aggregations, see [Aggregations]({{site.url}}{{site.baseurl}}/aggregations/). - -Most aggregations are performed on the subset of documents that is returned by a hybrid query. The only aggregation that operates on all documents is the [`global`]({{site.url}}{{site.baseurl}}/aggregations/bucket/global/) aggregation. - -To use aggregations with a hybrid query, first create an index. Aggregations are typically used on fields of special types, like `keyword` or `integer`. 
The following example creates an index with several such fields: - -```json -PUT /my-nlp-index -{ - "settings": { - "number_of_shards": 2 - }, - "mappings": { - "properties": { - "doc_index": { - "type": "integer" - }, - "doc_keyword": { - "type": "keyword" - }, - "category": { - "type": "keyword" - } - } - } -} -``` -{% include copy-curl.html %} - -The following request ingests six documents into your new index: - -```json -POST /_bulk -{ "index": { "_index": "my-nlp-index" } } -{ "category": "permission", "doc_keyword": "workable", "doc_index": 4976, "doc_price": 100} -{ "index": { "_index": "my-nlp-index" } } -{ "category": "sister", "doc_keyword": "angry", "doc_index": 2231, "doc_price": 200 } -{ "index": { "_index": "my-nlp-index" } } -{ "category": "hair", "doc_keyword": "likeable", "doc_price": 25 } -{ "index": { "_index": "my-nlp-index" } } -{ "category": "editor", "doc_index": 9871, "doc_price": 30 } -{ "index": { "_index": "my-nlp-index" } } -{ "category": "statement", "doc_keyword": "entire", "doc_index": 8242, "doc_price": 350 } -{ "index": { "_index": "my-nlp-index" } } -{ "category": "statement", "doc_keyword": "idea", "doc_index": 5212, "doc_price": 200 } -{ "index": { "_index": "index-test" } } -{ "category": "editor", "doc_keyword": "bubble", "doc_index": 1298, "doc_price": 130 } -{ "index": { "_index": "index-test" } } -{ "category": "editor", "doc_keyword": "bubble", "doc_index": 521, "doc_price": 75 } -``` -{% include copy-curl.html %} - -Now you can combine a hybrid query clause with a `min` aggregation: - -```json -GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline -{ - "query": { - "hybrid": { - "queries": [ - { - "term": { - "category": "permission" - } - }, - { - "bool": { - "should": [ - { - "term": { - "category": "editor" - } - }, - { - "term": { - "category": "statement" - } - } - ] - } - } - ] - } - }, - "aggs": { - "total_price": { - "sum": { - "field": "doc_price" - } - }, - "keywords": { - "terms": { - "field": "doc_keyword", - "size": 10 - } - } - } -} -``` -{% include copy-curl.html %} - -The response contains the matching documents and the aggregation results: - -```json -{ - "took": 9, - "timed_out": false, - "_shards": { - "total": 2, - "successful": 2, - "skipped": 0, - "failed": 0 - }, - "hits": { - "total": { - "value": 4, - "relation": "eq" - }, - "max_score": 0.5, - "hits": [ - { - "_index": "my-nlp-index", - "_id": "mHRPNY4BlN82W_Ar9UMY", - "_score": 0.5, - "_source": { - "doc_price": 100, - "doc_index": 4976, - "doc_keyword": "workable", - "category": "permission" - } - }, - { - "_index": "my-nlp-index", - "_id": "m3RPNY4BlN82W_Ar9UMY", - "_score": 0.5, - "_source": { - "doc_price": 30, - "doc_index": 9871, - "category": "editor" - } - }, - { - "_index": "my-nlp-index", - "_id": "nXRPNY4BlN82W_Ar9UMY", - "_score": 0.5, - "_source": { - "doc_price": 200, - "doc_index": 5212, - "doc_keyword": "idea", - "category": "statement" - } - }, - { - "_index": "my-nlp-index", - "_id": "nHRPNY4BlN82W_Ar9UMY", - "_score": 0.5, - "_source": { - "doc_price": 350, - "doc_index": 8242, - "doc_keyword": "entire", - "category": "statement" - } - } - ] - }, - "aggregations": { - "total_price": { - "value": 680 - }, - "doc_keywords": { - "doc_count_error_upper_bound": 0, - "sum_other_doc_count": 0, - "buckets": [ - { - "key": "entire", - "doc_count": 1 - }, - { - "key": "idea", - "doc_count": 1 - }, - { - "key": "workable", - "doc_count": 1 - } - ] - } - } -} -``` - -## Using sorting with a hybrid query -**Introduced 2.16** -{: .label .label-purple } - -By 
default, hybrid search returns results ordered by scores in descending order. You can apply sorting to hybrid query results by providing the `sort` criteria in the search request. For more information about sort criteria, see [Sort results]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/sort/). -When sorting is applied to a hybrid search, results are fetched from the shards based on the specified sort criteria. As a result, the search results are sorted accordingly, and the document scores are `null`. Scores are only present in the hybrid search sorting results if documents are sorted by `_score`. - -In the following example, sorting is applied by `doc_price` in the hybrid query search request: - -```json -GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline -{ - "query": { - "hybrid": { - "queries": [ - { - "term": { - "category": "permission" - } - }, - { - "bool": { - "should": [ - { - "term": { - "category": "editor" - } - }, - { - "term": { - "category": "statement" - } - } - ] - } - } - ] - } - }, - "sort":[ - { - "doc_price": { - "order": "desc" - } - } - ] -} -``` -{% include copy-curl.html %} - -The response contains the matching documents sorted by `doc_price` in descending order: - -```json -{ - "took": 35, - "timed_out": false, - "_shards": { - "total": 3, - "successful": 3, - "skipped": 0, - "failed": 0 - }, - "hits": { - "total": { - "value": 4, - "relation": "eq" - }, - "max_score": 0.5, - "hits": [ - { - "_index": "my-nlp-index", - "_id": "7yaM4JABZkI1FQv8AwoN", - "_score": null, - "_source": { - "category": "statement", - "doc_keyword": "entire", - "doc_index": 8242, - "doc_price": 350 - }, - "sort": [ - 350 - ] - }, - { - "_index": "my-nlp-index", - "_id": "8CaM4JABZkI1FQv8AwoN", - "_score": null, - "_source": { - "category": "statement", - "doc_keyword": "idea", - "doc_index": 5212, - "doc_price": 200 - }, - "sort": [ - 200 - ] - }, - { - "_index": "my-nlp-index", - "_id": "6yaM4JABZkI1FQv8AwoM", - "_score": null, - "_source": { - "category": "permission", - "doc_keyword": "workable", - "doc_index": 4976, - "doc_price": 100 - }, - "sort": [ - 100 - ] - }, - { - "_index": "my-nlp-index", - "_id": "7iaM4JABZkI1FQv8AwoN", - "_score": null, - "_source": { - "category": "editor", - "doc_index": 9871, - "doc_price": 30 - }, - "sort": [ - 30 - ] - } - ] - } -} -``` - -In the following example, sorting is applied by `_id`: - -```json -GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline -{ - "query": { - "hybrid": { - "queries": [ - { - "term": { - "category": "permission" - } - }, - { - "bool": { - "should": [ - { - "term": { - "category": "editor" - } - }, - { - "term": { - "category": "statement" - } - } - ] - } - } - ] - } - }, - "sort":[ - { - "_id": { - "order": "desc" - } - } - ] -} -``` -{% include copy-curl.html %} - -The response contains the matching documents sorted by `_id` in descending order: - -```json -{ - "took": 33, - "timed_out": false, - "_shards": { - "total": 3, - "successful": 3, - "skipped": 0, - "failed": 0 - }, - "hits": { - "total": { - "value": 4, - "relation": "eq" - }, - "max_score": 0.5, - "hits": [ - { - "_index": "my-nlp-index", - "_id": "8CaM4JABZkI1FQv8AwoN", - "_score": null, - "_source": { - "category": "statement", - "doc_keyword": "idea", - "doc_index": 5212, - "doc_price": 200 - }, - "sort": [ - "8CaM4JABZkI1FQv8AwoN" - ] - }, - { - "_index": "my-nlp-index", - "_id": "7yaM4JABZkI1FQv8AwoN", - "_score": null, - "_source": { - "category": "statement", - "doc_keyword": "entire", - "doc_index": 8242, - 
"doc_price": 350 - }, - "sort": [ - "7yaM4JABZkI1FQv8AwoN" - ] - }, - { - "_index": "my-nlp-index", - "_id": "7iaM4JABZkI1FQv8AwoN", - "_score": null, - "_source": { - "category": "editor", - "doc_index": 9871, - "doc_price": 30 - }, - "sort": [ - "7iaM4JABZkI1FQv8AwoN" - ] - }, - { - "_index": "my-nlp-index", - "_id": "6yaM4JABZkI1FQv8AwoM", - "_score": null, - "_source": { - "category": "permission", - "doc_keyword": "workable", - "doc_index": 4976, - "doc_price": 100 - }, - "sort": [ - "6yaM4JABZkI1FQv8AwoM" - ] - } - ] - } -} -``` - -## Hybrid search with search_after -**Introduced 2.16** -{: .label .label-purple } - -You can control sorting results by applying a `search_after` condition that provides a live cursor and uses the previous page's results to obtain the next page's results. For more information about `search_after`, see [The search_after parameter]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/paginate/#the-search_after-parameter). - -You can paginate the sorted results by applying a `search_after` condition in the sort queries. - -In the following example, sorting is applied by `doc_price` with a `search_after` condition: - -```json -GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline -{ - "query": { - "hybrid": { - "queries": [ - { - "term": { - "category": "permission" - } - }, - { - "bool": { - "should": [ - { - "term": { - "category": "editor" - } - }, - { - "term": { - "category": "statement" - } - } - ] - } - } - ] - } - }, - "sort":[ - { - "_id": { - "order": "desc" - } - } - ], - "search_after":[200] -} -``` -{% include copy-curl.html %} - -The response contains the matching documents that are listed after the `200` sort value, sorted by `doc_price` in descending order: - -```json -{ - "took": 8, - "timed_out": false, - "_shards": { - "total": 3, - "successful": 3, - "skipped": 0, - "failed": 0 - }, - "hits": { - "total": { - "value": 4, - "relation": "eq" - }, - "max_score": 0.5, - "hits": [ - { - "_index": "my-nlp-index", - "_id": "6yaM4JABZkI1FQv8AwoM", - "_score": null, - "_source": { - "category": "permission", - "doc_keyword": "workable", - "doc_index": 4976, - "doc_price": 100 - }, - "sort": [ - 100 - ] - }, - { - "_index": "my-nlp-index", - "_id": "7iaM4JABZkI1FQv8AwoN", - "_score": null, - "_source": { - "category": "editor", - "doc_index": 9871, - "doc_price": 30 - }, - "sort": [ - 30 - ] - } - ] - } -} -``` - -In the following example, sorting is applied by `id` with a `search_after` condition: - -```json -GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline -{ - "query": { - "hybrid": { - "queries": [ - { - "term": { - "category": "permission" - } - }, - { - "bool": { - "should": [ - { - "term": { - "category": "editor" - } - }, - { - "term": { - "category": "statement" - } - } - ] - } - } - ] - } - }, - "sort":[ - { - "_id": { - "order": "desc" - } - } - ], - "search_after":["7yaM4JABZkI1FQv8AwoN"] -} -``` -{% include copy-curl.html %} - -The response contains the matching documents that are listed after the `7yaM4JABZkI1FQv8AwoN` sort value, sorted by `id` in descending order: - -```json -{ - "took": 17, - "timed_out": false, - "_shards": { - "total": 3, - "successful": 3, - "skipped": 0, - "failed": 0 - }, - "hits": { - "total": { - "value": 4, - "relation": "eq" - }, - "max_score": 0.5, - "hits": [ - { - "_index": "my-nlp-index", - "_id": "7iaM4JABZkI1FQv8AwoN", - "_score": null, - "_source": { - "category": "editor", - "doc_index": 9871, - "doc_price": 30 - }, - "sort": [ - "7iaM4JABZkI1FQv8AwoN" - ] - }, - { - 
"_index": "my-nlp-index", - "_id": "6yaM4JABZkI1FQv8AwoM", - "_score": null, - "_source": { - "category": "permission", - "doc_keyword": "workable", - "doc_index": 4976, - "doc_price": 100 - }, - "sort": [ - "6yaM4JABZkI1FQv8AwoM" - ] - } - ] - } -} -``` \ No newline at end of file diff --git a/_search-plugins/improving-search-performance.md b/_search-plugins/improving-search-performance.md index 4a0ffafe118..4cc0a60dc0e 100644 --- a/_search-plugins/improving-search-performance.md +++ b/_search-plugins/improving-search-performance.md @@ -11,4 +11,6 @@ OpenSearch offers several ways to improve search performance: - Run resource-intensive queries asynchronously with [asynchronous search]({{site.url}}{{site.baseurl}}/search-plugins/async/). -- Search segments concurrently using [concurrent segment search]({{site.url}}{{site.baseurl}}/search-plugins/concurrent-segment-search/). \ No newline at end of file +- Search segments concurrently using [concurrent segment search]({{site.url}}{{site.baseurl}}/search-plugins/concurrent-segment-search/). + +- Improve aggregation performance using a [star-tree index]({{site.url}}{{site.baseurl}}/search-plugins/star-tree-index/). diff --git a/_search-plugins/index.md b/_search-plugins/index.md index 3604245f11a..3baf485aa93 100644 --- a/_search-plugins/index.md +++ b/_search-plugins/index.md @@ -8,9 +8,21 @@ nav_exclude: true permalink: /search-plugins/ redirect_from: - /search-plugins/index/ +keyword: + - heading: "Keyword (BM25) search" + description: "Find exact and close matches using traditional text search" + link: "/search-plugins/keyword-search/" +vector: + - heading: "Vector search" + description: "Search by similarity using dense or sparse vector embeddings" + link: "/vector-search/" +ai: + - heading: "AI search" + description: "Build intelligent search applications using AI models" + link: "/vector-search/ai-search/" --- -# Search +# Search features OpenSearch provides many features for customizing your search use cases and improving search relevance. @@ -18,29 +30,23 @@ OpenSearch provides many features for customizing your search use cases and impr OpenSearch supports the following search methods. -### Traditional lexical search +### Exact matching and keywords -OpenSearch supports [keyword (BM25) search]({{site.url}}{{site.baseurl}}/search-plugins/keyword-search/), which searches the document corpus for words that appear in the query. +OpenSearch implements lexical (keyword) text search using the BM25 algorithm to match and rank documents based on term frequency and document length. -### ML-powered search +{% include cards.html cards=page.keyword %} -OpenSearch supports the following machine learning (ML)-powered search methods: +### Similarity and meaning -- **Vector search** +OpenSearch supports similarity (k-nearest neighbor) search using dense and sparse vector embeddings to power use cases such as semantic search, retrieval-augmented generation, and multimodal image search. - - [k-NN search]({{site.url}}{{site.baseurl}}/search-plugins/knn/): Searches for the k-nearest neighbors to a search term across an index of vectors. +{% include cards.html cards=page.vector %} -- **Neural search**: [Neural search]({{site.url}}{{site.baseurl}}/search-plugins/neural-search/) facilitates generating vector embeddings at ingestion time and searching them at search time. Neural search lets you integrate ML models into your search and serves as a framework for implementing other search methods. 
The following search methods are built on top of neural search: +### AI-powered search - - [Semantic search]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/): Considers the meaning of the words in the search context. Uses dense retrieval based on text embedding models to search text data. +OpenSearch supports AI-powered search capabilities beyond vector embeddings. OpenSearch's AI search enables search and ingestion flows to be enriched by any AI service to power the full range of AI-enhanced search use cases. - - [Multimodal search]({{site.url}}{{site.baseurl}}/search-plugins/multimodal-search/): Uses multimodal embedding models to search text and image data. - - - [Neural sparse search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-search/): Uses sparse retrieval based on sparse embedding models to search text data. - - - [Hybrid search]({{site.url}}{{site.baseurl}}/search-plugins/hybrid-search/): Combines traditional search and vector search to improve search relevance. - - - [Conversational search]({{site.url}}{{site.baseurl}}/search-plugins/conversational-search/): Implements a retrieval-augmented generative search. +{% include cards.html cards=page.ai %} ## Query languages @@ -66,9 +72,11 @@ OpenSearch offers several ways to improve search performance: ## Search relevance -OpenSearch provides the following search relevance features: +*Search relevance* is a measure of how well a document matches a search query. When you run a search query, OpenSearch compares the words in your query to the words in each document and assigns a relevance score based on factors such as how frequently the words appear and how closely they match. For more information, see [Relevance]({{site.url}}{{site.baseurl}}/getting-started/intro/#relevance). + +To help you fine-tune and improve search relevance, OpenSearch provides several specialized features: -- [Compare Search Results]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/compare-search-results/): A search comparison tool in OpenSearch Dashboards that you can use to compare results from two queries side by side. +- [Search Relevance Workbench]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/using-search-relevance-workbench/): A suite of tools that support search quality improvements through experimentation. - [Querqy]({{site.url}}{{site.baseurl}}/search-plugins/querqy/): Offers query rewriting capability. diff --git a/_search-plugins/keyword-search.md b/_search-plugins/keyword-search.md index 25055eadebd..f23aa9c20ac 100644 --- a/_search-plugins/keyword-search.md +++ b/_search-plugins/keyword-search.md @@ -118,8 +118,27 @@ The following table lists the supported similarity algorithms. Algorithm | Description `BM25` | The default OpenSearch [Okapi BM25](https://en.wikipedia.org/wiki/Okapi_BM25) similarity algorithm. +`LegacyBM25` (Deprecated) | The older [LegacyBM25Similarity](https://github.com/opensearch-project/OpenSearch/blob/main/server/src/main/java/org/opensearch/lucene/similarity/LegacyBM25Similarity.java) implementation. Kept for backward compatibility. `boolean` | Assigns terms a score equal to their boost value. Use `boolean` similarity when you want the document scores to be based on the binary value of whether the terms match. + +### Important changes to BM25 scoring in OpenSearch 3.0 + +In OpenSearch 3.0, the default similarity algorithm changed from `LegacyBM25Similarity` to Lucene's native `BM25Similarity`. 
+ +This change improves alignment with Lucene standards and simplifies scoring behavior, but it introduces an important difference: + +- In `LegacyBM25Similarity`, scores included an extra constant factor of `k₁ + 1` in the numerator of the `BM25` formula. + +- In `BM25Similarity`, this constant was removed for cleaner normalization (see [BM25](https://en.wikipedia.org/wiki/Okapi_BM25) and the corresponding [Lucene GitHub issue](https://github.com/apache/lucene/issues/9609)). + +- Scores produced by `BM25Similarity` are lower than those produced by `LegacyBM25Similarity`, typically by a factor of about `2.2`. + +- Ranking is unaffected because the constant factor does not change the relative order of documents. + +- To retain the old scoring behavior, explicitly configure your field or index to use `LegacyBM25` (see [Configuring legacy BM25 similarity](#configuring-legacy-bm25-similarity)). + + ## Specifying similarity You can specify the similarity algorithm in the `similarity` parameter when configuring mappings at the field level. @@ -173,6 +192,29 @@ Parameter | Data type | Description `b` | Float | Determines the degree to which document length normalizes TF values. The default value is `0.75`. `discount_overlaps` | Boolean | Determines whether overlap tokens (tokens with zero position increment) are ignored when computing the norm. Default is `true` (overlap tokens do not count when computing the norm). + +## Configuring legacy BM25 similarity + +If you want to retain the older similarity behavior, specify `LegacyBM25` as the similarity `type`: + +```json +PUT /testindex +{ + "settings": { + "index": { + "similarity": { + "default": { + "type": "LegacyBM25", + "k1": 1.2, + "b": 0.75 + } + } + } + } +} +``` +{% include copy-curl.html %} + --- ## Next steps diff --git a/_search-plugins/knn/approximate-knn.md b/_search-plugins/knn/approximate-knn.md deleted file mode 100644 index f8921033e06..00000000000 --- a/_search-plugins/knn/approximate-knn.md +++ /dev/null @@ -1,416 +0,0 @@ ---- -layout: default -title: Approximate k-NN search -nav_order: 15 -parent: k-NN search -has_children: false -has_math: true ---- - -# Approximate k-NN search - -Standard k-NN search methods compute similarity using a brute-force approach that measures the nearest distance between a query and a number of points, which produces exact results. This works well in many applications. However, in the case of extremely large datasets with high dimensionality, this creates a scaling problem that reduces the efficiency of the search. Approximate k-NN search methods can overcome this by employing tools that restructure indexes more efficiently and reduce the dimensionality of searchable vectors. Using this approach requires a sacrifice in accuracy but increases search processing speeds appreciably. - -The Approximate k-NN search methods leveraged by OpenSearch use approximate nearest neighbor (ANN) algorithms from the [nmslib](https://github.com/nmslib/nmslib), [faiss](https://github.com/facebookresearch/faiss), and [Lucene](https://lucene.apache.org/) libraries to power k-NN search. These search methods employ ANN to improve search latency for large datasets. Of the three search methods the k-NN plugin provides, this method offers the best search scalability for large datasets. This approach is the preferred method when a dataset reaches hundreds of thousands of vectors. 
- -For details on the algorithms the plugin currently supports, see [k-NN Index documentation]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index#method-definitions). -{: .note} - -The k-NN plugin builds a native library index of the vectors for each knn-vector field/Lucene segment pair during indexing, which can be used to efficiently find the k-nearest neighbors to a query vector during search. To learn more about Lucene segments, see the [Apache Lucene documentation](https://lucene.apache.org/core/8_9_0/core/org/apache/lucene/codecs/lucene87/package-summary.html#package.description). These native library indexes are loaded into native memory during search and managed by a cache. To learn more about preloading native library indexes into memory, refer to the [warmup API]({{site.url}}{{site.baseurl}}/search-plugins/knn/api#warmup-operation). Additionally, you can see which native library indexes are already loaded in memory. To learn more about this, see the [stats API section]({{site.url}}{{site.baseurl}}/search-plugins/knn/api#stats). - -Because the native library indexes are constructed during indexing, it is not possible to apply a filter on an index and then use this search method. All filters are applied on the results produced by the approximate nearest neighbor search. - -## Recommendations for engines and cluster node sizing - -Each of the three engines used for approximate k-NN search has its own attributes that make one more sensible to use than the others in a given situation. You can follow the general information below to help determine which engine will best meet your requirements. - -In general, nmslib outperforms both faiss and Lucene on search. However, to optimize for indexing throughput, faiss is a good option. For relatively smaller datasets (up to a few million vectors), the Lucene engine demonstrates better latencies and recall. At the same time, the size of the index is smallest compared to the other engines, which allows it to use smaller AWS instances for data nodes. - -When considering cluster node sizing, a general approach is to first establish an even distribution of the index across the cluster. However, there are other considerations. To help make these choices, you can refer to the OpenSearch managed service guidance in the section [Sizing domains](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/sizing-domains.html). - -## Get started with approximate k-NN - -To use the k-NN plugin's approximate search functionality, you must first create a k-NN index with `index.knn` set to `true`. This setting tells the plugin to create native library indexes for the index. - -Next, you must add one or more fields of the `knn_vector` data type. 
This example creates an index with two -`knn_vector` fields, one using `faiss` and the other using `nmslib` fields: - -```json -PUT my-knn-index-1 -{ - "settings": { - "index": { - "knn": true, - "knn.algo_param.ef_search": 100 - } - }, - "mappings": { - "properties": { - "my_vector1": { - "type": "knn_vector", - "dimension": 2, - "space_type": "l2", - "method": { - "name": "hnsw", - "engine": "nmslib", - "parameters": { - "ef_construction": 128, - "m": 24 - } - } - }, - "my_vector2": { - "type": "knn_vector", - "dimension": 4, - "space_type": "innerproduct", - "method": { - "name": "hnsw", - "engine": "faiss", - "parameters": { - "ef_construction": 256, - "m": 48 - } - } - } - } - } -} -``` -{% include copy-curl.html %} - -In the preceding example, both `knn_vector` fields are configured using method definitions. Additionally, `knn_vector` fields can be configured using models. For more information, see [k-NN vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/). - -The `knn_vector` data type supports a vector of floats that can have a dimension count of up to 16,000 for the NMSLIB, Faiss, and Lucene engines, as set by the dimension mapping parameter. - -In OpenSearch, codecs handle the storage and retrieval of indexes. The k-NN plugin uses a custom codec to write vector data to native library indexes so that the underlying k-NN search library can read it. -{: .tip } - -After you create the index, you can add some data to it: - -```json -POST _bulk -{ "index": { "_index": "my-knn-index-1", "_id": "1" } } -{ "my_vector1": [1.5, 2.5], "price": 12.2 } -{ "index": { "_index": "my-knn-index-1", "_id": "2" } } -{ "my_vector1": [2.5, 3.5], "price": 7.1 } -{ "index": { "_index": "my-knn-index-1", "_id": "3" } } -{ "my_vector1": [3.5, 4.5], "price": 12.9 } -{ "index": { "_index": "my-knn-index-1", "_id": "4" } } -{ "my_vector1": [5.5, 6.5], "price": 1.2 } -{ "index": { "_index": "my-knn-index-1", "_id": "5" } } -{ "my_vector1": [4.5, 5.5], "price": 3.7 } -{ "index": { "_index": "my-knn-index-1", "_id": "6" } } -{ "my_vector2": [1.5, 5.5, 4.5, 6.4], "price": 10.3 } -{ "index": { "_index": "my-knn-index-1", "_id": "7" } } -{ "my_vector2": [2.5, 3.5, 5.6, 6.7], "price": 5.5 } -{ "index": { "_index": "my-knn-index-1", "_id": "8" } } -{ "my_vector2": [4.5, 5.5, 6.7, 3.7], "price": 4.4 } -{ "index": { "_index": "my-knn-index-1", "_id": "9" } } -{ "my_vector2": [1.5, 5.5, 4.5, 6.4], "price": 8.9 } -``` -{% include copy-curl.html %} - -Then you can execute an approximate nearest neighbor search on the data using the `knn` query type: - -```json -GET my-knn-index-1/_search -{ - "size": 2, - "query": { - "knn": { - "my_vector2": { - "vector": [2, 3, 5, 6], - "k": 2 - } - } - } -} -``` -{% include copy-curl.html %} - -### The number of returned results - -In the preceding query, `k` represents the number of neighbors returned by the search of each graph. You must also include the `size` option, indicating the final number of results that you want the query to return. - -For the NMSLIB and Faiss engines, `k` represents the maximum number of documents returned for all segments of a shard. For the Lucene engine, `k` represents the number of documents returned for a shard. The maximum value of `k` is 10,000. - -For any engine, each shard returns `size` results to the coordinator node. Thus, the total number of results that the coordinator node receives is `size * number of shards`. 
After the coordinator node consolidates the results received from all nodes, the query returns the top `size` results. - -The following table provides examples of the number of results returned by various engines in several scenarios. For these examples, assume that the number of documents contained in the segments and shards is sufficient to return the number of results specified in the table. - -`size` | `k` | Number of primary shards | Number of segments per shard | Number of returned results, Faiss/NMSLIB | Number of returned results, Lucene -:--- | :--- | :--- | :--- | :--- | :--- -10 | 1 | 1 | 4 | 4 | 1 -10 | 10 | 1 | 4 | 10 | 10 -10 | 1 | 2 | 4 | 8 | 2 - -The number of results returned by Faiss/NMSLIB differs from the number of results returned by Lucene only when `k` is smaller than `size`. If `k` and `size` are equal, all engines return the same number of results. - -Starting in OpenSearch 2.14, you can use `k`, `min_score`, or `max_distance` for [radial search]({{site.url}}{{site.baseurl}}/search-plugins/knn/radial-search-knn/). - -### Building a k-NN index from a model - -For some of the algorithms that the k-NN plugin supports, the native library index needs to be trained before it can be used. It would be expensive to train every newly created segment, so, instead, the plugin features the concept of a *model* that initializes the native library index during segment creation. You can create a model by calling the [Train API]({{site.url}}{{site.baseurl}}/search-plugins/knn/api#train-a-model) and passing in the source of the training data and the method definition of the model. Once training is complete, the model is serialized to a k-NN model system index. Then, during indexing, the model is pulled from this index to initialize the segments. - -To train a model, you first need an OpenSearch index containing training data. Training data can come from any `knn_vector` field that has a dimension matching the dimension of the model you want to create. Training data can be the same data that you are going to index or data in a separate set. To create a training index, send the following request: - -```json -PUT /train-index -{ - "settings": { - "number_of_shards": 3, - "number_of_replicas": 0 - }, - "mappings": { - "properties": { - "train-field": { - "type": "knn_vector", - "dimension": 4 - } - } - } -} -``` -{% include copy-curl.html %} - -Notice that `index.knn` is not set in the index settings. This ensures that you do not create native library indexes for this index. - -You can now add some data to the index: - -```json -POST _bulk -{ "index": { "_index": "train-index", "_id": "1" } } -{ "train-field": [1.5, 5.5, 4.5, 6.4]} -{ "index": { "_index": "train-index", "_id": "2" } } -{ "train-field": [2.5, 3.5, 5.6, 6.7]} -{ "index": { "_index": "train-index", "_id": "3" } } -{ "train-field": [4.5, 5.5, 6.7, 3.7]} -{ "index": { "_index": "train-index", "_id": "4" } } -{ "train-field": [1.5, 5.5, 4.5, 6.4]} -``` -{% include copy-curl.html %} - -After indexing into the training index completes, you can call the Train API: - -```json -POST /_plugins/_knn/models/my-model/_train -{ - "training_index": "train-index", - "training_field": "train-field", - "dimension": 4, - "description": "My model description", - "space_type": "l2", - "method": { - "name": "ivf", - "engine": "faiss", - "parameters": { - "nlist": 4, - "nprobes": 2 - } - } -} -``` -{% include copy-curl.html %} - -The Train API returns as soon as the training job is started. 
To check the job status, use the Get Model API: - -```json -GET /_plugins/_knn/models/my-model?filter_path=state&pretty -{ - "state": "training" -} -``` -{% include copy-curl.html %} - -Once the model enters the `created` state, you can create an index that will use this model to initialize its native library indexes: - -```json -PUT /target-index -{ - "settings": { - "number_of_shards": 3, - "number_of_replicas": 1, - "index.knn": true - }, - "mappings": { - "properties": { - "target-field": { - "type": "knn_vector", - "model_id": "my-model" - } - } - } -} -``` -{% include copy-curl.html %} - -Lastly, you can add the documents you want to be searched to the index: - -```json -POST _bulk -{ "index": { "_index": "target-index", "_id": "1" } } -{ "target-field": [1.5, 5.5, 4.5, 6.4]} -{ "index": { "_index": "target-index", "_id": "2" } } -{ "target-field": [2.5, 3.5, 5.6, 6.7]} -{ "index": { "_index": "target-index", "_id": "3" } } -{ "target-field": [4.5, 5.5, 6.7, 3.7]} -{ "index": { "_index": "target-index", "_id": "4" } } -{ "target-field": [1.5, 5.5, 4.5, 6.4]} -``` -{% include copy-curl.html %} - -After data is ingested, it can be searched in the same way as any other `knn_vector` field. - -### Additional query parameters - -Starting with version 2.16, you can provide `method_parameters` in a search request: - -```json -GET my-knn-index-1/_search -{ - "size": 2, - "query": { - "knn": { - "target-field": { - "vector": [2, 3, 5, 6], - "k": 2, - "method_parameters" : { - "ef_search": 100 - } - } - } - } -} -``` -{% include copy-curl.html %} - -These parameters are dependent on the combination of engine and method used to create the index. The following sections provide information about the supported `method_parameters`. - -#### `ef_search` - -You can provide the `ef_search` parameter when searching an index created using the `hnsw` method. The `ef_search` parameter specifies the number of vectors to examine in order to find the top k nearest neighbors. Higher `ef_search` values improve recall at the cost of increased search latency. The value must be positive. - -The following table provides information about the `ef_search` parameter for the supported engines. - -Engine | Radial query support | Notes -:--- | :--- | :--- -`nmslib` | No | If `ef_search` is present in a query, it overrides the `index.knn.algo_param.ef_search` index setting. -`faiss` | Yes | If `ef_search` is present in a query, it overrides the `index.knn.algo_param.ef_search` index setting. -`lucene` | No | When creating a search query, you must specify `k`. If you provide both `k` and `ef_search`, then the larger value is passed to the engine. If `ef_search` is larger than `k`, you can provide the `size` parameter to limit the final number of results to `k`. - -#### `nprobes` - -You can provide the `nprobes` parameter when searching an index created using the `ivf` method. The `nprobes` parameter specifies the number of buckets to examine in order to find the top k nearest neighbors. Higher `nprobes` values improve recall at the cost of increased search latency. The value must be positive. - -The following table provides information about the `nprobes` parameter for the supported engines. - -Engine | Notes -:--- | :--- -`faiss` | If `nprobes` is present in a query, it overrides the value provided when creating the index. - -### Rescoring quantized results using full precision - -Quantization can be used to significantly reduce the memory footprint of a k-NN index. 
For more information about quantization, see [k-NN vector quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization). Because some vector representation is lost during quantization, the computed distances will be approximate. This causes the overall recall of the search to decrease. - -To improve recall while maintaining the memory savings of quantization, you can use a two-phase search approach. In the first phase, `oversample_factor * k` results are retrieved from an index using quantized vectors and the scores are approximated. In the second phase, the full-precision vectors of those `oversample_factor * k` results are loaded into memory from disk, and scores are recomputed against the full-precision query vector. The results are then reduced to the top k. - -The default rescoring behavior is determined by the `mode` and `compression_level` of the backing k-NN vector field: - -- For `in_memory` mode, no rescoring is applied by default. -- For `on_disk` mode, default rescoring is based on the configured `compression_level`. Each `compression_level` provides a default `oversample_factor`, specified in the following table. - -| Compression level | Default rescore `oversample_factor` | -|:------------------|:----------------------------------| -| `32x` (default) | 3.0 | -| `16x` | 2.0 | -| `8x` | 2.0 | -| `4x` | No default rescoring | -| `2x` | No default rescoring | - -To explicitly apply rescoring, provide the `rescore` parameter in a query on a quantized index and specify the `oversample_factor`: - -```json -GET my-knn-index-1/_search -{ - "size": 2, - "query": { - "knn": { - "target-field": { - "vector": [2, 3, 5, 6], - "k": 2, - "rescore" : { - "oversample_factor": 1.2 - } - } - } - } -} -``` -{% include copy-curl.html %} - -Alternatively, set the `rescore` parameter to `true` to use a default `oversample_factor` of `1.0`: - -```json -GET my-knn-index-1/_search -{ - "size": 2, - "query": { - "knn": { - "target-field": { - "vector": [2, 3, 5, 6], - "k": 2, - "rescore" : true - } - } - } -} -``` -{% include copy-curl.html %} - -The `oversample_factor` is a floating-point number between 1.0 and 100.0, inclusive. The number of results in the first pass is calculated as `oversample_factor * k` and is guaranteed to be between 100 and 10,000, inclusive. If the calculated number of results is smaller than 100, then the number of results is set to 100. If the calculated number of results is greater than 10,000, then the number of results is set to 10,000. - -Rescoring is only supported for the `faiss` engine. - -Rescoring is not needed if quantization is not used because the scores returned are already fully precise. -{: .note} - -### Using approximate k-NN with filters - -To learn about using filters with k-NN search, see [k-NN search with filters]({{site.url}}{{site.baseurl}}/search-plugins/knn/filter-search-knn/). - -### Using approximate k-NN with nested fields - -To learn about using k-NN search with nested fields, see [k-NN search with nested fields]({{site.url}}{{site.baseurl}}/search-plugins/knn/nested-search-knn/). - -### Using approximate radial search - -To learn more about the radial search feature, see [k-NN radial search]({{site.url}}{{site.baseurl}}/search-plugins/knn/radial-search-knn/). - -### Using approximate k-NN with binary vectors - -To learn more about using binary vectors with k-NN search, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors). 
- -## Spaces - -A _space_ corresponds to the function used to measure the distance between two points in order to determine the k-nearest neighbors. From the k-NN perspective, a lower score equates to a closer and better result. This is the opposite of how OpenSearch scores results, where a higher score equates to a better result. The k-NN plugin supports the following spaces. - -Not every method supports each of these spaces. Be sure to check out [the method documentation]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index#method-definitions) to make sure the space you are interested in is supported. -{: note.} - -| Space type | Distance function ($$d$$ ) | OpenSearch score | -| :--- | :--- | :--- | -| `l1` | $$ d(\mathbf{x}, \mathbf{y}) = \sum_{i=1}^n \lvert x_i - y_i \rvert $$ | $$ score = {1 \over {1 + d} } $$ | -| `l2` | $$ d(\mathbf{x}, \mathbf{y}) = \sum_{i=1}^n (x_i - y_i)^2 $$ | $$ score = {1 \over 1 + d } $$ | -| `linf` | $$ d(\mathbf{x}, \mathbf{y}) = max(\lvert x_i - y_i \rvert) $$ | $$ score = {1 \over 1 + d } $$ | -| `cosinesimil` | $$ d(\mathbf{x}, \mathbf{y}) = 1 - cos { \theta } = 1 - {\mathbf{x} \cdot \mathbf{y} \over \lVert \mathbf{x}\rVert \cdot \lVert \mathbf{y}\rVert}$$$$ = 1 - {\sum_{i=1}^n x_i y_i \over \sqrt{\sum_{i=1}^n x_i^2} \cdot \sqrt{\sum_{i=1}^n y_i^2}}$$, <br> where $$\lVert \mathbf{x}\rVert$$ and $$\lVert \mathbf{y}\rVert$$ represent the norms of vectors $$\mathbf{x}$$ and $$\mathbf{y}$$, respectively. | **NMSLIB** and **Faiss**:<br>$$ score = {1 \over 1 + d } $$ <br><br>**Lucene**:<br>$$ score = {2 - d \over 2}$$ | -| `innerproduct` (supported for Lucene in OpenSearch version 2.13 and later) | **NMSLIB** and **Faiss**:<br> $$ d(\mathbf{x}, \mathbf{y}) = - {\mathbf{x} \cdot \mathbf{y}} = - \sum_{i=1}^n x_i y_i $$ <br><br>**Lucene**:<br> $$ d(\mathbf{x}, \mathbf{y}) = {\mathbf{x} \cdot \mathbf{y}} = \sum_{i=1}^n x_i y_i $$ | **NMSLIB** and **Faiss**:<br> $$ \text{If} d \ge 0, score = {1 \over 1 + d }$$ <br> $$\text{If} d < 0, score = −d + 1$$ <br><br>**Lucene:**<br> $$ \text{If} d > 0, score = d + 1 $$ <br> $$\text{If} d \le 0, score = {1 \over 1 + (-1 \cdot d) }$$ | -| `hamming` (supported for binary vectors in OpenSearch version 2.16 and later) | $$ d(\mathbf{x}, \mathbf{y}) = \text{countSetBits}(\mathbf{x} \oplus \mathbf{y})$$ | $$ score = {1 \over 1 + d } $$ | - -The cosine similarity formula does not include the `1 -` prefix. However, because similarity search libraries equate lower scores with closer results, they return `1 - cosineSimilarity` for the cosine similarity space---this is why `1 -` is included in the distance function. -{: .note } - -With cosine similarity, it is not valid to pass a zero vector (`[0, 0, ...]`) as input. This is because the magnitude of such a vector is 0, which raises a `divide by 0` exception in the corresponding formula. Requests containing the zero vector will be rejected, and a corresponding exception will be thrown. -{: .note } - -The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors). 
-{: .note} diff --git a/_search-plugins/knn/index.md b/_search-plugins/knn/index.md deleted file mode 100644 index f8c28bcc4ed..00000000000 --- a/_search-plugins/knn/index.md +++ /dev/null @@ -1,44 +0,0 @@ ---- -layout: default -title: k-NN search -nav_order: 20 -has_children: true -has_toc: false -redirect_from: - - /search-plugins/knn/ ---- - -# k-NN search - -Short for *k-nearest neighbors*, the k-NN plugin enables users to search for the k-nearest neighbors to a query point across an index of vectors. To determine the neighbors, you can specify the space (the distance function) you want to use to measure the distance between points. - -Use cases include recommendations (for example, an "other songs you might like" feature in a music application), image recognition, and fraud detection. For more background information about k-NN search, see [Wikipedia](https://en.wikipedia.org/wiki/Nearest_neighbor_search). - -This plugin supports three different methods for obtaining the k-nearest neighbors from an index of vectors: - -1. **Approximate k-NN** - - The first method takes an approximate nearest neighbor approach---it uses one of several algorithms to return the approximate k-nearest neighbors to a query vector. Usually, these algorithms sacrifice indexing speed and search accuracy in return for performance benefits such as lower latency, smaller memory footprints and more scalable search. To learn more about the algorithms, refer to [*nmslib*](https://github.com/nmslib/nmslib/blob/master/manual/README.md)'s and [*faiss*](https://github.com/facebookresearch/faiss/wiki)'s documentation. - - Approximate k-NN is the best choice for searches over large indexes (that is, hundreds of thousands of vectors or more) that require low latency. You should not use approximate k-NN if you want to apply a filter on the index before the k-NN search, which greatly reduces the number of vectors to be searched. In this case, you should use either the script scoring method or Painless extensions. - - For more details about this method, including recommendations for which engine to use, see [Approximate k-NN search]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/). - -2. **Script Score k-NN** - - The second method extends OpenSearch's script scoring functionality to execute a brute force, exact k-NN search over "knn_vector" fields or fields that can represent binary objects. With this approach, you can run k-NN search on a subset of vectors in your index (sometimes referred to as a pre-filter search). - - Use this approach for searches over smaller bodies of documents or when a pre-filter is needed. Using this approach on large indexes may lead to high latencies. - - For more details about this method, see [Exact k-NN with scoring script]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-score-script/). - -3. **Painless extensions** - - The third method adds the distance functions as painless extensions that you can use in more complex combinations. Similar to the k-NN Script Score, you can use this method to perform a brute force, exact k-NN search across an index, which also supports pre-filtering. - - This approach has slightly slower query performance compared to the k-NN Script Score. If your use case requires more customization over the final score, you should use this approach over Script Score k-NN. - - For more details about this method, see [Painless scripting functions]({{site.url}}{{site.baseurl}}/search-plugins/knn/painless-functions/). 
- - -Overall, for larger data sets, you should generally choose the approximate nearest neighbor method because it scales significantly better. For smaller data sets, where you may want to apply a filter, you should choose the custom scoring approach. If you have a more complex use case where you need to use a distance function as part of their scoring method, you should use the painless scripting approach. diff --git a/_search-plugins/knn/jni-libraries.md b/_search-plugins/knn/jni-libraries.md deleted file mode 100644 index 4dbdb2da565..00000000000 --- a/_search-plugins/knn/jni-libraries.md +++ /dev/null @@ -1,22 +0,0 @@ ---- -layout: default -title: JNI libraries -nav_order: 35 -parent: k-NN search -has_children: false -redirect_from: - - /search-plugins/knn/jni-library/ ---- - -# JNI libraries - -To integrate [nmslib](https://github.com/nmslib/nmslib/) and [faiss](https://github.com/facebookresearch/faiss/) approximate k-NN functionality (implemented in C++) into the k-NN plugin (implemented in Java), we created a Java Native Interface, which lets the k-NN plugin make calls to the native libraries. The interface includes three libraries: `libopensearchknn_nmslib`, the JNI library that interfaces with nmslib, `libopensearchknn_faiss`, the JNI library that interfaces with faiss, and `libopensearchknn_common`, a library containing common shared functionality between native libraries. - -The Lucene library is not implemented using a native library. -{: .note} - -The libraries `libopensearchknn_faiss` and `libopensearchknn_nmslib` are lazily loaded when they are first called in the plugin. This means that if you are only planning on using one of the libraries, the plugin never loads the other library. - -To build the libraries from source, refer to the [DEVELOPER_GUIDE](https://github.com/opensearch-project/k-NN/blob/main/DEVELOPER_GUIDE.md). - -For more information about JNI, see [Java Native Interface](https://en.wikipedia.org/wiki/Java_Native_Interface) on Wikipedia. diff --git a/_search-plugins/knn/knn-index.md b/_search-plugins/knn/knn-index.md deleted file mode 100644 index 620b262cf90..00000000000 --- a/_search-plugins/knn/knn-index.md +++ /dev/null @@ -1,376 +0,0 @@ ---- -layout: default -title: k-NN index -nav_order: 5 -parent: k-NN search -has_children: false ---- - -# k-NN index - -The k-NN plugin introduces a custom data type, the `knn_vector`, that allows users to ingest their k-NN vectors into an OpenSearch index and perform different kinds of k-NN search. The `knn_vector` field is highly configurable and can serve many different k-NN workloads. For more information, see [k-NN vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/). - -To create a k-NN index, set the `settings.index.knn` parameter to `true`: - -```json -PUT /test-index -{ - "settings": { - "index": { - "knn": true - } - }, - "mappings": { - "properties": { - "my_vector1": { - "type": "knn_vector", - "dimension": 3, - "space_type": "l2", - "method": { - "name": "hnsw", - "engine": "lucene", - "parameters": { - "ef_construction": 128, - "m": 24 - } - } - } - } - } -} -``` -{% include copy-curl.html %} - -## Byte vectors - -Starting with k-NN plugin version 2.17, you can use `byte` vectors with the `faiss` and `lucene` engines to reduce the amount of required memory and storage space. For more information, see [Byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#byte-vectors). 
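As a quick reference, the following is a minimal mapping sketch for a `byte` vector field (the index name and parameter values are illustrative). With this mapping, each vector element must be a whole number in the [-128, 127] range:

```json
PUT /test-byte-index
{
  "settings": {
    "index": {
      "knn": true
    }
  },
  "mappings": {
    "properties": {
      "my_byte_vector": {
        "type": "knn_vector",
        "dimension": 3,
        "data_type": "byte",
        "space_type": "l2",
        "method": {
          "name": "hnsw",
          "engine": "lucene"
        }
      }
    }
  }
}
```
{% include copy-curl.html %}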
- -## Binary vectors - -Starting with k-NN plugin version 2.16, you can use `binary` vectors with the `faiss` engine to reduce the amount of required storage space. For more information, see [Binary vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors). - -## SIMD optimization for the Faiss engine - -Starting with version 2.13, the k-NN plugin supports [Single Instruction Multiple Data (SIMD)](https://en.wikipedia.org/wiki/Single_instruction,_multiple_data) processing if the underlying hardware supports SIMD instructions (AVX2 on x64 architecture and Neon on ARM64 architecture). SIMD is supported by default on Linux machines only for the Faiss engine. SIMD architecture helps boost overall performance by improving indexing throughput and reducing search latency. Starting with version 2.18, the k-NN plugin supports AVX512 SIMD instructions on x64 architecture. - -SIMD optimization is applicable only if the vector dimension is a multiple of 8. -{: .note} - -<!-- vale off --> -### x64 architecture -<!-- vale on --> - -For x64 architecture, the following versions of the Faiss library are built and shipped with the artifact: - -- `libopensearchknn_faiss.so`: The non-optimized Faiss library without SIMD instructions. -- `libopensearchknn_faiss_avx512.so`: The Faiss library containing AVX512 SIMD instructions. -- `libopensearchknn_faiss_avx2.so`: The Faiss library containing AVX2 SIMD instructions. - -When using the Faiss library, the performance ranking is as follows: AVX512 > AVX2 > no optimization. -{: .note } - -If your hardware supports AVX512, the k-NN plugin loads the `libopensearchknn_faiss_avx512.so` library at runtime. - -If your hardware supports AVX2 but doesn't support AVX512, the k-NN plugin loads the `libopensearchknn_faiss_avx2.so` library at runtime. - -To disable the AVX512 and AVX2 SIMD instructions and load the non-optimized Faiss library (`libopensearchknn_faiss.so`), specify the `knn.faiss.avx512.disabled` and `knn.faiss.avx2.disabled` static settings as `true` in `opensearch.yml` (by default, both of these are `false`). - -Note that to update a static setting, you must stop the cluster, change the setting, and restart the cluster. For more information, see [Static settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/#static-settings). - -### ARM64 architecture - -For the ARM64 architecture, only one performance-boosting Faiss library (`libopensearchknn_faiss.so`) is built and shipped. The library contains Neon SIMD instructions and cannot be disabled. - -## Method definitions - -A method definition refers to the underlying configuration of the approximate k-NN algorithm you want to use. Method definitions are used to either create a `knn_vector` field (when the method does not require training) or [create a model during training]({{site.url}}{{site.baseurl}}/search-plugins/knn/api#train-a-model) that can then be used to [create a `knn_vector` field]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#building-a-k-nn-index-from-a-model). - -A method definition will always contain the name of the method, the space_type the method is built for, the engine -(the library) to use, and a map of parameters. - -Mapping parameter | Required | Default | Updatable | Description -:--- | :--- | :--- | :--- | :--- -`name` | true | n/a | false | The identifier for the nearest neighbor method. -`space_type` | false | l2 | false | The vector space used to calculate the distance between vectors. 
Note: This value can also be specified at the top level of the mapping. -`engine` | false | nmslib | false | The approximate k-NN library to use for indexing and search. The available libraries are faiss, nmslib, and Lucene. -`parameters` | false | null | false | The parameters used for the nearest neighbor method. - -### Supported nmslib methods - -Method name | Requires training | Supported spaces | Description -:--- | :--- | :--- | :--- -`hnsw` | false | l2, innerproduct, cosinesimil, l1, linf | Hierarchical proximity graph approach to approximate k-NN search. For more details on the algorithm, see this [abstract](https://arxiv.org/abs/1603.09320). - -#### HNSW parameters - -Parameter name | Required | Default | Updatable | Description -:--- | :--- | :--- | :--- | :--- -`ef_construction` | false | 100 | false | The size of the dynamic list used during k-NN graph creation. Higher values result in a more accurate graph but slower indexing speed. -`m` | false | 16 | false | The number of bidirectional links that the plugin creates for each new element. Increasing and decreasing this value can have a large impact on memory consumption. Keep this value between 2 and 100. - -For nmslib, *ef_search* is set in the [index settings](#index-settings). -{: .note} - -An index created in OpenSearch version 2.11 or earlier will still use the old `ef_construction` value (`512`). -{: .note} - -### Supported Faiss methods - -Method name | Requires training | Supported spaces | Description -:--- | :--- |:---| :--- -`hnsw` | false | l2, innerproduct, hamming | Hierarchical proximity graph approach to approximate k-NN search. -`ivf` | true | l2, innerproduct, hamming | Stands for _inverted file index_. Bucketing approach where vectors are assigned different buckets based on clustering and, during search, only a subset of the buckets is searched. - -For hnsw, "innerproduct" is not available when PQ is used. -{: .note} - -The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors). -{: .note} - -#### HNSW parameters - -Parameter name | Required | Default | Updatable | Description -:--- | :--- | :--- | :--- | :--- -`ef_search` | false | 100 | false | The size of the dynamic list used during k-NN searches. Higher values result in more accurate but slower searches. -`ef_construction` | false | 100 | false | The size of the dynamic list used during k-NN graph creation. Higher values result in a more accurate graph but slower indexing speed. -`m` | false | 16 | false | The number of bidirectional links that the plugin creates for each new element. Increasing and decreasing this value can have a large impact on memory consumption. Keep this value between 2 and 100. -`encoder` | false | flat | false | Encoder definition for encoding vectors. Encoders can reduce the memory footprint of your index, at the expense of search accuracy. - -An index created in OpenSearch version 2.11 or earlier will still use the old `ef_construction` and `ef_search` values (`512`). -{: .note} - -#### IVF parameters - -Parameter name | Required | Default | Updatable | Description -:--- | :--- | :--- | :--- | :--- -`nlist` | false | 4 | false | Number of buckets to partition vectors into. Higher values may lead to more accurate searches at the expense of memory and training latency. 
For more information about choosing the right value, refer to [Guidelines to choose an index](https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index). -`nprobes` | false | 1 | false | Number of buckets to search during query. Higher values lead to more accurate but slower searches. -`encoder` | false | flat | false | Encoder definition for encoding vectors. Encoders can reduce the memory footprint of your index, at the expense of search accuracy. - -For more information about setting these parameters, refer to the [Faiss documentation](https://github.com/facebookresearch/faiss/wiki/Faiss-indexes). - -#### IVF training requirements - -The IVF algorithm requires a training step. To create an index that uses IVF, you need to train a model with the [Train API]({{site.url}}{{site.baseurl}}/search-plugins/knn/api#train-a-model), passing the IVF method definition. IVF requires that, at a minimum, there are `nlist` training data points, but it is [recommended that you use more than this](https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index#how-big-is-the-dataset). Training data can be composed of either the same data that is going to be ingested or a separate dataset. - -### Supported Lucene methods - -Method name | Requires training | Supported spaces | Description -:--- | :--- |:--------------------------------------------------------------------------------| :--- -`hnsw` | false | l2, cosinesimil, innerproduct (supported in OpenSearch 2.13 and later) | Hierarchical proximity graph approach to approximate k-NN search. - -#### HNSW parameters - -Parameter name | Required | Default | Updatable | Description -:--- | :--- | :--- | :--- | :--- -`ef_construction` | false | 100 | false | The size of the dynamic list used during k-NN graph creation. Higher values result in a more accurate graph but slower indexing speed.<br>The Lucene engine uses the proprietary term "beam_width" to describe this function, which corresponds directly to "ef_construction". To be consistent throughout the OpenSearch documentation, we retain the term "ef_construction" for this parameter. -`m` | false | 16 | false | The number of bidirectional links that the plugin creates for each new element. Increasing and decreasing this value can have a large impact on memory consumption. Keep this value between 2 and 100.<br>The Lucene engine uses the proprietary term "max_connections" to describe this function, which corresponds directly to "m". To be consistent throughout OpenSearch documentation, we retain the term "m" to label this parameter. - -Lucene HNSW implementation ignores `ef_search` and dynamically sets it to the value of "k" in the search request. Therefore, there is no need to make settings for `ef_search` when using the Lucene engine. -{: .note} - -An index created in OpenSearch version 2.11 or earlier will still use the old `ef_construction` value (`512`). -{: .note} - -```json -"method": { - "name":"hnsw", - "engine":"lucene", - "parameters":{ - "m":2048, - "ef_construction": 245 - } -} -``` - -### Supported Faiss encoders - -You can use encoders to reduce the memory footprint of a k-NN index at the expense of search accuracy. The k-NN plugin currently supports the `flat`, `pq`, and `sq` encoders in the Faiss library. 
- -The following example method definition specifies the `hnsw` method and a `pq` encoder: - -```json -"method": { - "name":"hnsw", - "engine":"faiss", - "parameters":{ - "encoder":{ - "name":"pq", - "parameters":{ - "code_size": 8, - "m": 8 - } - } - } -} -``` - -The `hnsw` method supports the `pq` encoder for OpenSearch versions 2.10 and later. The `code_size` parameter of a `pq` encoder with the `hnsw` method must be **8**. -{: .important} - -Encoder name | Requires training | Description -:--- | :--- | :--- -`flat` (Default) | false | Encode vectors as floating-point arrays. This encoding does not reduce memory footprint. -`pq` | true | An abbreviation for _product quantization_, it is a lossy compression technique that uses clustering to encode a vector into a fixed size of bytes, with the goal of minimizing the drop in k-NN search accuracy. At a high level, vectors are broken up into `m` subvectors, and then each subvector is represented by a `code_size` code obtained from a code book produced during training. For more information about product quantization, see [this blog post](https://medium.com/dotstar/understanding-faiss-part-2-79d90b1e5388). -`sq` | false | An abbreviation for _scalar quantization_. Starting with k-NN plugin version 2.13, you can use the `sq` encoder to quantize 32-bit floating-point vectors into 16-bit floats. In version 2.13, the built-in `sq` encoder is the SQFP16 Faiss encoder. The encoder reduces memory footprint with a minimal loss of precision and improves performance by using SIMD optimization (using AVX2 on x86 architecture or Neon on ARM64 architecture). For more information, see [Faiss scalar quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization#faiss-16-bit-scalar-quantization). - -#### PQ parameters - -Parameter name | Required | Default | Updatable | Description -:--- | :--- | :--- | :--- | :--- -`m` | false | 1 | false | Determines the number of subvectors into which to break the vector. Subvectors are encoded independently of each other. This vector dimension must be divisible by `m`. Maximum value is 1,024. -`code_size` | false | 8 | false | Determines the number of bits into which to encode a subvector. Maximum value is 8. For IVF, this value must be less than or equal to 8. For HNSW, this value can only be 8. - -#### SQ parameters - -Parameter name | Required | Default | Updatable | Description -:--- | :--- | :-- | :--- | :--- -`type` | false | `fp16` | false | The type of scalar quantization to be used to encode 32-bit float vectors into the corresponding type. As of OpenSearch 2.13, only the `fp16` encoder type is supported. For the `fp16` encoder, vector values must be in the [-65504.0, 65504.0] range. -`clip` | false | `false` | false | If `true`, then any vector values outside of the supported range for the specified vector type are rounded so that they are in the range. If `false`, then the request is rejected if any vector values are outside of the supported range. Setting `clip` to `true` may decrease recall. - -For more information and examples, see [Using Faiss scalar quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/#using-faiss-scalar-quantization). 
- -#### Examples - -The following example uses the `ivf` method without specifying an encoder (by default, OpenSearch uses the `flat` encoder): - -```json -"method": { - "name":"ivf", - "engine":"faiss", - "parameters":{ - "nlist": 4, - "nprobes": 2 - } -} -``` - -The following example uses the `ivf` method with a `pq` encoder: - -```json -"method": { - "name":"ivf", - "engine":"faiss", - "parameters":{ - "encoder":{ - "name":"pq", - "parameters":{ - "code_size": 8, - "m": 8 - } - } - } -} -``` - -The following example uses the `hnsw` method without specifying an encoder (by default, OpenSearch uses the `flat` encoder): - -```json -"method": { - "name":"hnsw", - "engine":"faiss", - "parameters":{ - "ef_construction": 256, - "m": 8 - } -} -``` - -The following example uses the `hnsw` method with an `sq` encoder of type `fp16` with `clip` enabled: - -```json -"method": { - "name":"hnsw", - "engine":"faiss", - "parameters":{ - "encoder": { - "name": "sq", - "parameters": { - "type": "fp16", - "clip": true - } - }, - "ef_construction": 256, - "m": 8 - } -} -``` - -The following example uses the `ivf` method with an `sq` encoder of type `fp16`: - -```json -"method": { - "name":"ivf", - "engine":"faiss", - "parameters":{ - "encoder": { - "name": "sq", - "parameters": { - "type": "fp16", - "clip": false - } - }, - "nprobes": 2 - } -} -``` - -### Choosing the right method - -There are several options to choose from when building your `knn_vector` field. To determine the correct methods and parameters, you should first understand the requirements of your workload and what trade-offs you are willing to make. Factors to consider are (1) query latency, (2) query quality, (3) memory limits, and (4) indexing latency. - -If memory is not a concern, HNSW offers a strong query latency/query quality trade-off. - -If you want to use less memory and increase indexing speed as compared to HNSW while maintaining similar query quality, you should evaluate IVF. - -If memory is a concern, consider adding a PQ encoder to your HNSW or IVF index. Because PQ is a lossy encoding, query quality will drop. - -You can reduce the memory footprint by a factor of 2, with a minimal loss in search quality, by using the [`fp_16` encoder]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/#faiss-16-bit-scalar-quantization). If your vector dimensions are within the [-128, 127] byte range, we recommend using the [byte quantizer]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#byte-vectors) to reduce the memory footprint by a factor of 4. To learn more about vector quantization options, see [k-NN vector quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/). - -### Memory estimation - -In a typical OpenSearch cluster, a certain portion of RAM is reserved for the JVM heap. The k-NN plugin allocates native library indexes to a portion of the remaining RAM. This portion's size is determined by the `circuit_breaker_limit` cluster setting. By default, the limit is set to 50%. - -Having a replica doubles the total number of vectors. -{: .note } - -For information about using memory estimation with vector quantization, see the [vector quantization documentation]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/#memory-estimation). -{: .note } - -#### HNSW memory estimation - -The memory required for HNSW is estimated to be `1.1 * (4 * dimension + 8 * M)` bytes/vector. 
- -As an example, assume you have a million vectors with a dimension of 256 and M of 16. The memory requirement can be estimated as follows: - -``` -1.1 * (4 * 256 + 8 * 16) * 1,000,000 ~= 1.267 GB -``` - -#### IVF memory estimation - -The memory required for IVF is estimated to be `1.1 * (((4 * dimension) * num_vectors) + (4 * nlist * d))` bytes. - -As an example, assume you have a million vectors with a dimension of 256 and nlist of 128. The memory requirement can be estimated as follows: - -``` -1.1 * (((4 * 256) * 1,000,000) + (4 * 128 * 256)) ~= 1.126 GB - -``` - -## Index settings - -Additionally, the k-NN plugin introduces several index settings that can be used to configure the k-NN structure as well. - -At the moment, several parameters defined in the settings are in the deprecation process. Those parameters should be set in the mapping instead of the index settings. Parameters set in the mapping will override the parameters set in the index settings. Setting the parameters in the mapping allows an index to have multiple `knn_vector` fields with different parameters. - -Setting | Default | Updatable | Description -:--- | :--- | :--- | :--- -`index.knn` | false | false | Whether the index should build native library indexes for the `knn_vector` fields. If set to false, the `knn_vector` fields will be stored in doc values, but approximate k-NN search functionality will be disabled. -`index.knn.algo_param.ef_search` | 100 | true | The size of the dynamic list used during k-NN searches. Higher values result in more accurate but slower searches. Only available for NMSLIB. -`index.knn.algo_param.ef_construction` | 100 | false | Deprecated in 1.0.0. Instead, use the [mapping parameters](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/#method-definitions) to set this value. -`index.knn.algo_param.m` | 16 | false | Deprecated in 1.0.0. Use the [mapping parameters](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/#method-definitions) to set this value instead. -`index.knn.space_type` | l2 | false | Deprecated in 1.0.0. Use the [mapping parameters](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/#method-definitions) to set this value instead. - -An index created in OpenSearch version 2.11 or earlier will still use the old `ef_construction` and `ef_search` values (`512`). -{: .note} diff --git a/_search-plugins/knn/knn-vector-quantization.md b/_search-plugins/knn/knn-vector-quantization.md deleted file mode 100644 index a911dc91c98..00000000000 --- a/_search-plugins/knn/knn-vector-quantization.md +++ /dev/null @@ -1,484 +0,0 @@ ---- -layout: default -title: k-NN vector quantization -nav_order: 27 -parent: k-NN search -has_children: false -has_math: true ---- - -# k-NN vector quantization - -By default, the k-NN plugin supports the indexing and querying of vectors of type `float`, where each dimension of the vector occupies 4 bytes of memory. For use cases that require ingestion on a large scale, keeping `float` vectors can be expensive because OpenSearch needs to construct, load, save, and search graphs (for native `nmslib` and `faiss` engines). To reduce the memory footprint, you can use vector quantization. - -OpenSearch supports many varieties of quantization. In general, the level of quantization will provide a trade-off between the accuracy of the nearest neighbor search and the size of the memory footprint consumed by the vector search. 
The supported types include byte vectors, 16-bit scalar quantization, product quantization (PQ), and binary quantization(BQ). - -## Byte vectors - -Starting with version 2.17, the k-NN plugin supports `byte` vectors with the `faiss` and `lucene` engines in order to reduce the amount of required memory. This requires quantizing the vectors outside of OpenSearch before ingesting them into an OpenSearch index. For more information, see [Byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#byte-vectors). - -## Lucene scalar quantization - -Starting with version 2.16, the k-NN plugin supports built-in scalar quantization for the Lucene engine. Unlike [byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#byte-vectors), which require you to quantize vectors before ingesting documents, the Lucene scalar quantizer quantizes input vectors in OpenSearch during ingestion. The Lucene scalar quantizer converts 32-bit floating-point input vectors into 7-bit integer vectors in each segment using the minimum and maximum quantiles computed based on the [`confidence_interval`](#confidence-interval) parameter. During search, the query vector is quantized in each segment using the segment's minimum and maximum quantiles in order to compute the distance between the query vector and the segment's quantized input vectors. - -Quantization can decrease the memory footprint by a factor of 4 in exchange for some loss in recall. Additionally, quantization slightly increases disk usage because it requires storing both the raw input vectors and the quantized vectors. - -### Using Lucene scalar quantization - -To use the Lucene scalar quantizer, set the k-NN vector field's `method.parameters.encoder.name` to `sq` when creating a k-NN index: - -```json -PUT /test-index -{ - "settings": { - "index": { - "knn": true - } - }, - "mappings": { - "properties": { - "my_vector1": { - "type": "knn_vector", - "dimension": 2, - "space_type": "l2", - "method": { - "name": "hnsw", - "engine": "lucene", - "parameters": { - "encoder": { - "name": "sq" - }, - "ef_construction": 256, - "m": 8 - } - } - } - } - } -} -``` -{% include copy-curl.html %} - -### Confidence interval - -Optionally, you can specify the `confidence_interval` parameter in the `method.parameters.encoder` object. -The `confidence_interval` is used to compute the minimum and maximum quantiles in order to quantize the vectors: -- If you set the `confidence_interval` to a value in the `0.9` to `1.0` range, inclusive, then the quantiles are calculated statically. For example, setting the `confidence_interval` to `0.9` specifies to compute the minimum and maximum quantiles based on the middle 90% of the vector values, excluding the minimum 5% and maximum 5% of the values. -- Setting `confidence_interval` to `0` specifies to compute the quantiles dynamically, which involves oversampling and additional computations performed on the input data. -- When `confidence_interval` is not set, it is computed based on the vector dimension $$d$$ using the formula $$max(0.9, 1 - \frac{1}{1 + d})$$. - -Lucene scalar quantization is applied only to `float` vectors. If you change the default value of the `data_type` parameter from `float` to `byte` or any other type when mapping a [k-NN vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/), then the request is rejected. 
-{: .warning} - -The following example method definition specifies the Lucene `sq` encoder with the `confidence_interval` set to `1.0`. This `confidence_interval` specifies to consider all the input vectors when computing the minimum and maximum quantiles. Vectors are quantized to 7 bits by default: - -```json -PUT /test-index -{ - "settings": { - "index": { - "knn": true - } - }, - "mappings": { - "properties": { - "my_vector1": { - "type": "knn_vector", - "dimension": 2, - "space_type": "l2", - "method": { - "name": "hnsw", - "engine": "lucene", - "parameters": { - "encoder": { - "name": "sq", - "parameters": { - "confidence_interval": 1.0 - } - }, - "ef_construction": 256, - "m": 8 - } - } - } - } - } -} -``` -{% include copy-curl.html %} - -There are no changes to ingestion or query mapping and no range limitations for the input vectors. - -### Memory estimation - -In the ideal scenario, 7-bit vectors created by the Lucene scalar quantizer use only 25% of the memory required by 32-bit vectors. - -#### HNSW memory estimation - -The memory required for the Hierarchical Navigable Small World (HNSW) graph can be estimated as `1.1 * (dimension + 8 * m)` bytes/vector, where `m` is the maximum number of bidirectional links created for each element during the construction of the graph. - -As an example, assume that you have 1 million vectors with a dimension of 256 and M of 16. The memory requirement can be estimated as follows: - -```r -1.1 * (256 + 8 * 16) * 1,000,000 ~= 0.4 GB -``` - -## Faiss 16-bit scalar quantization - -Starting with version 2.13, the k-NN plugin supports performing scalar quantization for the Faiss engine within OpenSearch. Within the Faiss engine, a scalar quantizer (SQfp16) performs the conversion between 32-bit and 16-bit vectors. At ingestion time, when you upload 32-bit floating-point vectors to OpenSearch, SQfp16 quantizes them into 16-bit floating-point vectors and stores the quantized vectors in a k-NN index. - -At search time, SQfp16 decodes the vector values back into 32-bit floating-point values for distance computation. The SQfp16 quantization can decrease the memory footprint by a factor of 2. Additionally, it leads to a minimal loss in recall when differences between vector values are large compared to the error introduced by eliminating their two least significant bits. When used with [SIMD optimization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index#simd-optimization-for-the-faiss-engine), SQfp16 quantization can also significantly reduce search latencies and improve indexing throughput. - -SIMD optimization is not supported on Windows. Using Faiss scalar quantization on Windows can lead to a significant drop in performance, including decreased indexing throughput and increased search latencies. -{: .warning} - -### Using Faiss scalar quantization - -To use Faiss scalar quantization, set the k-NN vector field's `method.parameters.encoder.name` to `sq` when creating a k-NN index: - -```json -PUT /test-index -{ - "settings": { - "index": { - "knn": true, - "knn.algo_param.ef_search": 100 - } - }, - "mappings": { - "properties": { - "my_vector1": { - "type": "knn_vector", - "dimension": 3, - "space_type": "l2", - "method": { - "name": "hnsw", - "engine": "faiss", - "parameters": { - "encoder": { - "name": "sq" - }, - "ef_construction": 256, - "m": 8 - } - } - } - } - } -} -``` -{% include copy-curl.html %} - -Optionally, you can specify the parameters in `method.parameters.encoder`. 
For more information about `encoder` object parameters, see [SQ parameters]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#sq-parameters). - -The `fp16` encoder converts 32-bit vectors into their 16-bit counterparts. For this encoder type, the vector values must be in the [-65504.0, 65504.0] range. To define how to handle out-of-range values, the preceding request specifies the `clip` parameter. By default, this parameter is `false`, and any vectors containing out-of-range values are rejected. - -When `clip` is set to `true` (as in the preceding request), out-of-range vector values are rounded up or down so that they are in the supported range. For example, if the original 32-bit vector is `[65510.82, -65504.1]`, the vector will be indexed as a 16-bit vector `[65504.0, -65504.0]`. - -We recommend setting `clip` to `true` only if very few elements lie outside of the supported range. Rounding the values may cause a drop in recall. -{: .note} - -The following example method definition specifies the Faiss SQfp16 encoder, which rejects any indexing request that contains out-of-range vector values (because the `clip` parameter is `false` by default): - -```json -PUT /test-index -{ - "settings": { - "index": { - "knn": true, - "knn.algo_param.ef_search": 100 - } - }, - "mappings": { - "properties": { - "my_vector1": { - "type": "knn_vector", - "dimension": 3, - "space_type": "l2", - "method": { - "name": "hnsw", - "engine": "faiss", - "parameters": { - "encoder": { - "name": "sq", - "parameters": { - "type": "fp16" - } - }, - "ef_construction": 256, - "m": 8 - } - } - } - } - } -} -``` -{% include copy-curl.html %} - -During ingestion, make sure each vector dimension is in the supported range ([-65504.0, 65504.0]). - -```json -PUT test-index/_doc/1 -{ - "my_vector1": [-65504.0, 65503.845, 55.82] -} -``` -{% include copy-curl.html %} - -During querying, the query vector has no range limitation: - -```json -GET test-index/_search -{ - "size": 2, - "query": { - "knn": { - "my_vector1": { - "vector": [265436.876, -120906.256, 99.84], - "k": 2 - } - } - } -} -``` -{% include copy-curl.html %} - -### Memory estimation - -In the best-case scenario, 16-bit vectors produced by the Faiss SQfp16 quantizer require 50% of the memory that 32-bit vectors require. - -#### HNSW memory estimation - -The memory required for Hierarchical Navigable Small Worlds (HNSW) is estimated to be `1.1 * (2 * dimension + 8 * m)` bytes/vector, where `m` is the maximum number of bidirectional links created for each element during the construction of the graph. - -As an example, assume that you have 1 million vectors with a dimension of 256 and an `m` of 16. The memory requirement can be estimated as follows: - -```r -1.1 * (2 * 256 + 8 * 16) * 1,000,000 ~= 0.656 GB -``` - -#### IVF memory estimation - -The memory required for IVF is estimated to be `1.1 * (((2 * dimension) * num_vectors) + (4 * nlist * dimension))` bytes/vector, where `nlist` is the number of buckets to partition vectors into. - -As an example, assume that you have 1 million vectors with a dimension of 256 and an `nlist` of 128. The memory requirement can be estimated as follows: - -```r -1.1 * (((2 * 256) * 1,000,000) + (4 * 128 * 256)) ~= 0.525 GB -``` - -## Faiss product quantization - -PQ is a technique used to represent a vector in a configurable amount of bits. In general, it can be used to achieve a higher level of compression as compared to byte or scalar quantization. 
PQ works by separating vectors into _m_ subvectors and encoding each subvector with _code_size_ bits. Thus, the total amount of memory for the vector is `m*code_size` bits, plus overhead. For details about the parameters, see [PQ parameters]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#pq-parameters). PQ is only supported for the _Faiss_ engine and can be used with either the _HNSW_ or _IVF_ approximate nearest neighbor (ANN) algorithms. - -### Using Faiss product quantization - -To minimize loss in accuracy, PQ requires a _training_ step that builds a model based on the distribution of the data that will be searched. - -The product quantizer is trained by running k-means clustering on a set of training vectors for each subvector space and extracts the centroids to be used for encoding. The training vectors can be either a subset of the vectors to be ingested or vectors that have the same distribution and dimension as the vectors to be ingested. - -In OpenSearch, the training vectors need to be present in an index. In general, the amount of training data will depend on which ANN algorithm is used and how much data will be stored in the index. For IVF-based indexes, a recommended number of training vectors is `max(1000*nlist, 2^code_size * 1000)`. For HNSW-based indexes, a recommended number is `2^code_size*1000`. See the [Faiss documentation](https://github.com/facebookresearch/faiss/wiki/FAQ#how-many-training-points-do-i-need-for-k-means) for more information about the methodology used to calculate these figures. - -For PQ, both _m_ and _code_size_ need to be selected. _m_ determines the number of subvectors into which vectors should be split for separate encoding. Consequently, the _dimension_ needs to be divisible by _m_. _code_size_ determines the number of bits used to encode each subvector. In general, we recommend a setting of `code_size = 8` and then tuning _m_ to get the desired trade-off between memory footprint and recall. - -For an example of setting up an index with PQ, see the [Building a k-NN index from a model]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#building-a-k-nn-index-from-a-model) tutorial. - -### Memory estimation - -While PQ is meant to represent individual vectors with `m*code_size` bits, in reality, the indexes consume more space. This is mainly due to the overhead of storing certain code tables and auxiliary data structures. - -Some of the memory formulas depend on the number of segments present. This is not typically known beforehand, but a recommended default value is 300. -{: .note} - -#### HNSW memory estimation - -The memory required for HNSW with PQ is estimated to be `1.1*(((pq_code_size / 8) * pq_m + 24 + 8 * hnsw_m) * num_vectors + num_segments * (2^pq_code_size * 4 * d))` bytes. - -As an example, assume that you have 1 million vectors with a dimension of 256, `hnsw_m` of 16, `pq_m` of 32, `pq_code_size` of 8, and 100 segments. The memory requirement can be estimated as follows: - -```r -1.1 * ((8 / 8 * 32 + 24 + 8 * 16) * 1000000 + 100 * (2^8 * 4 * 256)) ~= 0.215 GB -``` - -#### IVF memory estimation - -The memory required for IVF with PQ is estimated to be `1.1*(((pq_code_size / 8) * pq_m + 24) * num_vectors + num_segments * (2^code_size * 4 * d + 4 * ivf_nlist * d))` bytes. - -For example, assume that you have 1 million vectors with a dimension of 256, `ivf_nlist` of 512, `pq_m` of 32, `pq_code_size` of 8, and 100 segments. 
The memory requirement can be estimated as follows: - -```r -1.1*((8 / 8 * 64 + 24) * 1000000 + 100 * (2^8 * 4 * 256 + 4 * 512 * 256)) ~= 0.171 GB -``` - -## Binary quantization - -Starting with version 2.17, OpenSearch supports BQ with binary vector support for the Faiss engine. BQ compresses vectors into a binary format (0s and 1s), making it highly efficient in terms of memory usage. You can choose to represent each vector dimension using 1, 2, or 4 bits, depending on the desired precision. One of the advantages of using BQ is that the training process is handled automatically during indexing. This means that no separate training step is required, unlike other quantization techniques such as PQ. - -### Using BQ -To configure BQ for the Faiss engine, define a `knn_vector` field and specify the `mode` as `on_disk`. This configuration defaults to 1-bit BQ and both `ef_search` and `ef_construction` set to `100`: - -```json -PUT my-vector-index -{ - "mappings": { - "properties": { - "my_vector_field": { - "type": "knn_vector", - "dimension": 8, - "space_type": "l2", - "data_type": "float", - "mode": "on_disk" - } - } - } -} -``` -{% include copy-curl.html %} - -To further optimize the configuration, you can specify additional parameters, such as the compression level, and fine-tune the search parameters. For example, you can override the `ef_construction` value or define the compression level, which corresponds to the number of bits used for quantization: - -- **32x compression** for 1-bit quantization -- **16x compression** for 2-bit quantization -- **8x compression** for 4-bit quantization - -This allows for greater control over memory usage and recall performance, providing flexibility to balance between precision and storage efficiency. - -To specify the compression level, set the `compression_level` parameter: - -```json -PUT my-vector-index -{ - "mappings": { - "properties": { - "my_vector_field": { - "type": "knn_vector", - "dimension": 8, - "space_type": "l2", - "data_type": "float", - "mode": "on_disk", - "compression_level": "16x", - "method": { - "params": { - "ef_construction": 16 - } - } - } - } - } -} -``` -{% include copy-curl.html %} - -The following example further fine-tunes the configuration by defining `ef_construction`, `encoder`, and the number of `bits` (which can be `1`, `2`, or `4`): - -```json -PUT my-vector-index -{ - "mappings": { - "properties": { - "my_vector_field": { - "type": "knn_vector", - "dimension": 8, - "method": { - "name": "hnsw", - "engine": "faiss", - "space_type": "l2", - "params": { - "m": 16, - "ef_construction": 512, - "encoder": { - "name": "binary", - "parameters": { - "bits": 1 - } - } - } - } - } - } - } -} -``` -{% include copy-curl.html %} - -### Search using binary quantized vectors - -You can perform a k-NN search on your index by providing a vector and specifying the number of nearest neighbors (k) to return: - -```json -GET my-vector-index/_search -{ - "size": 2, - "query": { - "knn": { - "my_vector_field": { - "vector": [1.5, 5.5, 1.5, 5.5, 1.5, 5.5, 1.5, 5.5], - "k": 10 - } - } - } -} -``` -{% include copy-curl.html %} - -You can also fine-tune search by providing the `ef_search` and `oversample_factor` parameters. -The `oversample_factor` parameter controls the factor by which the search oversamples the candidate vectors before ranking them. Using a higher oversample factor means that more candidates will be considered before ranking, improving accuracy but also increasing search time. 
When selecting the `oversample_factor` value, consider the trade-off between accuracy and efficiency. For example, setting the `oversample_factor` to `2.0` will double the number of candidates considered during the ranking phase, which may help achieve better results. - -The following request specifies the `ef_search` and `oversample_factor` parameters: - -```json -GET my-vector-index/_search -{ - "size": 2, - "query": { - "knn": { - "my_vector_field": { - "vector": [1.5, 5.5, 1.5, 5.5, 1.5, 5.5, 1.5, 5.5], - "k": 10, - "method_parameters": { - "ef_search": 10 - }, - "rescore": { - "oversample_factor": 10.0 - } - } - } - } -} -``` -{% include copy-curl.html %} - - -#### HNSW memory estimation - -The memory required for the Hierarchical Navigable Small World (HNSW) graph can be estimated as `1.1 * (dimension + 8 * m)` bytes/vector, where `m` is the maximum number of bidirectional links created for each element during the construction of the graph. - -As an example, assume that you have 1 million vectors with a dimension of 256 and an `m` of 16. The following sections provide memory requirement estimations for various compression values. - -##### 1-bit quantization (32x compression) - -In 1-bit quantization, each dimension is represented using 1 bit, equivalent to a 32x compression factor. The memory requirement can be estimated as follows: - -```r -Memory = 1.1 * ((256 * 1 / 8) + 8 * 16) * 1,000,000 - ~= 0.176 GB -``` - -##### 2-bit quantization (16x compression) - -In 2-bit quantization, each dimension is represented using 2 bits, equivalent to a 16x compression factor. The memory requirement can be estimated as follows: - -```r -Memory = 1.1 * ((256 * 2 / 8) + 8 * 16) * 1,000,000 - ~= 0.211 GB -``` - -##### 4-bit quantization (8x compression) - -In 4-bit quantization, each dimension is represented using 4 bits, equivalent to an 8x compression factor. The memory requirement can be estimated as follows: - -```r -Memory = 1.1 * ((256 * 4 / 8) + 8 * 16) * 1,000,000 - ~= 0.282 GB -``` diff --git a/_search-plugins/knn/painless-functions.md b/_search-plugins/knn/painless-functions.md deleted file mode 100644 index 7a8d9fec7bd..00000000000 --- a/_search-plugins/knn/painless-functions.md +++ /dev/null @@ -1,76 +0,0 @@ ---- -layout: default -title: k-NN Painless extensions -nav_order: 25 -parent: k-NN search -has_children: false -has_math: true ---- - -# k-NN Painless Scripting extensions - -With the k-NN plugin's Painless Scripting extensions, you can use k-NN distance functions directly in your Painless scripts to perform operations on `knn_vector` fields. Painless has a strict list of allowed functions and classes per context to ensure its scripts are secure. The k-NN plugin adds Painless Scripting extensions to a few of the distance functions used in [k-NN score script]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-score-script), so you can use them to customize your k-NN workload. - -## Get started with k-NN's Painless Scripting functions - -To use k-NN's Painless Scripting functions, first create an index with `knn_vector` fields like in [k-NN score script]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-score-script#getting-started-with-the-score-script-for-vectors). 
Once the index is created and you ingest some data, you can use the Painless extensions: - -```json -GET my-knn-index-2/_search -{ - "size": 2, - "query": { - "script_score": { - "query": { - "bool": { - "filter": { - "term": { - "color": "BLUE" - } - } - } - }, - "script": { - "source": "1.0 + cosineSimilarity(params.query_value, doc[params.field])", - "params": { - "field": "my_vector", - "query_value": [9.9, 9.9] - } - } - } - } -} -``` -{% include copy-curl.html %} - -`field` needs to map to a `knn_vector` field, and `query_value` needs to be a floating-point array with the same dimension as `field`. - -## Function types -The following table describes the Painless functions that the k-NN plugin provides: - -Function name | Function signature | Description -:--- | :--- | :--- -l2Squared | `float l2Squared (float[] queryVector, doc['vector field'])` | This function calculates the square of the L2 distance (Euclidean distance) between a given query vector and document vectors. The shorter the distance, the more relevant the document is, so this example inverts the return value of the l2Squared function. If the document vector matches the query vector, the result is 0, so this example also adds 1 to the distance to avoid divide by zero errors. -l1Norm | `float l1Norm (float[] queryVector, doc['vector field'])` | This function calculates the L1 distance (Manhattan distance) between a given query vector and document vectors. The shorter the distance, the more relevant the document is, so this example inverts the return value of the l1Norm function. If the document vector matches the query vector, the result is 0, so this example also adds 1 to the distance to avoid divide by zero errors. -cosineSimilarity | `float cosineSimilarity (float[] queryVector, doc['vector field'])` | Cosine similarity is an inner product of the query vector and document vector normalized to both have a length of 1. If the magnitude of the query vector doesn't change throughout the query, you can pass the magnitude of the query vector to improve performance, instead of calculating the magnitude every time for every filtered document:<br /> `float cosineSimilarity (float[] queryVector, doc['vector field'], float normQueryVector)` <br />In general, the range of cosine similarity is [-1, 1]. However, in the case of information retrieval, the cosine similarity of two documents ranges from 0 to 1 because the tf-idf statistic can't be negative. Therefore, the k-NN plugin adds 1.0 in order to always yield a positive cosine similarity score. -hamming | `float hamming (float[] queryVector, doc['vector field'])` | This function calculates the Hamming distance between a given query vector and document vectors. The Hamming distance is the number of positions at which the corresponding elements are different. The shorter the distance, the more relevant the document is, so this example inverts the return value of the Hamming distance. - -The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors). -{: .note} - -## Constraints - -1. If a document’s `knn_vector` field has different dimensions than the query, the function throws an `IllegalArgumentException`. - -2. If a vector field doesn't have a value, the function throws an `IllegalStateException`.
- - You can avoid this situation by first checking if a document has a value in its field: - - ``` - "source": "doc[params.field].size() == 0 ? 0 : 1 / (1 + l2Squared(params.query_value, doc[params.field]))", - ``` - - Because scores can only be positive, this script ranks documents with vector fields higher than those without. - -With cosine similarity, it is not valid to pass a zero vector (`[0, 0, ...]`) as input. This is because the magnitude of such a vector is 0, which raises a `divide by 0` exception in the corresponding formula. Requests containing the zero vector will be rejected, and a corresponding exception will be thrown. -{: .note } \ No newline at end of file diff --git a/_search-plugins/knn/performance-tuning.md b/_search-plugins/knn/performance-tuning.md deleted file mode 100644 index 77f44dee93f..00000000000 --- a/_search-plugins/knn/performance-tuning.md +++ /dev/null @@ -1,158 +0,0 @@ ---- -layout: default -title: Performance tuning -parent: k-NN search -nav_order: 45 ---- - -# Performance tuning - -This topic provides performance tuning recommendations to improve indexing and search performance for approximate k-NN (ANN). From a high level, k-NN works according to these principles: -* Native library indexes are created per knn_vector field / (Lucene) segment pair. -* Queries execute on segments sequentially inside the shard (same as any other OpenSearch query). -* Each native library index in the segment returns <=k neighbors. -* The coordinator node picks up final size number of neighbors from the neighbors returned by each shard. - -This topic also provides recommendations for comparing approximate k-NN to exact k-NN with score script. - -## Indexing performance tuning - -Take any of the following steps to improve indexing performance, especially when you plan to index a large number of vectors at once. - -### Disable the refresh interval - -Either disable the refresh interval (default = 1 sec) or set a long duration for the refresh interval to avoid creating multiple small segments: - - ```json - PUT /<index_name>/_settings - { - "index" : { - "refresh_interval" : "-1" - } - } - ``` - -Make sure to reenable `refresh_interval` after indexing is complete. - -### Disable replicas (no OpenSearch replica shard) - - Set replicas to `0` to prevent duplicate construction of native library indexes in both primary and replica shards. When you enable replicas after indexing completes, the serialized native library indexes are copied directly. If you have no replicas, losing nodes might cause data loss, so it's important that the data be stored elsewhere so that this initial load can be retried in the event of an issue. - -### Increase the number of indexing threads - -If your hardware has multiple cores, you can speed up the indexing process by allowing multiple threads for native library index construction. Determine the number of threads to allot with the [knn.algo_param.index_thread_qty]({{site.url}}{{site.baseurl}}/search-plugins/knn/settings#cluster-settings) setting. - -Monitor CPU utilization and choose the correct number of threads. Because native library index construction is costly, choosing more threads than you need can cause additional CPU load. - - -### (Expert-level) Disable vector field storage in the source field - -The `_source` field contains the original JSON document body that was passed at index time. This field is not indexed and is not searchable but is stored so that it can be returned when executing fetch requests such as `get` and `search`.
When using vector fields within the source, you can remove the vector field to save disk space, as shown in the following example where the `location` vector is excluded: - - ```json - PUT /<index_name>/_mappings - { - "_source": { - "excludes": ["location"] - }, - "properties": { - "location": { - "type": "knn_vector", - "dimension": 2, - "space_type": "l2", - "method": { - "name": "hnsw", - "engine": "faiss" - } - } - } - } - ``` - - -Disabling the `_source` field can cause certain features to become unavailable, such as the `update`, `update_by_query`, and `reindex` APIs and the ability to debug queries or aggregations by using the original document at index time. - -In OpenSearch 2.15 or later, you can further improve indexing speed and reduce disk space by removing the vector field from the `_recovery_source`, as shown in the following example: - - ```json - PUT /<index_name>/_mappings - { - "_source": { - "excludes": ["location"], - "recovery_source_excludes": ["location"] - }, - "properties": { - "location": { - "type": "knn_vector", - "dimension": 2, - "space_type": "l2", - "method": { - "name": "hnsw", - "engine": "faiss" - } - } - } - } - ``` - -This is an expert-level setting. Disabling the `_recovery_source` may lead to failures during peer-to-peer recovery. Before disabling the `_recovery_source`, check with your OpenSearch cluster admin to determine whether your cluster performs regular flushes before starting the peer-to-peer recovery of shards before disabling the `_recovery_source`. -{: .warning} - -## Search performance tuning - -Take the following steps to improve search performance: - -### Reduce segment count - - To improve search performance, you must keep the number of segments under control. Lucene's IndexSearcher searches over all of the segments in a shard to find the 'size' best results. - - Ideally, having one segment per shard provides the optimal performance with respect to search latency. You can configure an index to have multiple shards to avoid giant shards and achieve more parallelism. - - You can control the number of segments by choosing a larger refresh interval, or during indexing by asking OpenSearch to slow down segment creation by disabling the refresh interval. - -### Warm up the index - - Native library indexes are constructed during indexing, but they're loaded into memory during the first search. In Lucene, each segment is searched sequentially (so, for k-NN, each segment returns up to k nearest neighbors of the query point), and the top 'size' number of results based on the score are returned from all the results returned by segments at a shard level (higher score = better result). - - Once a native library index is loaded (native library indexes are loaded outside OpenSearch JVM), OpenSearch caches them in memory. Initial queries are expensive and take a few seconds, while subsequent queries are faster and take milliseconds (assuming the k-NN circuit breaker isn't hit). - - To avoid this latency penalty during your first queries, you can use the warmup API operation on the indexes you want to search: - - ```json - GET /_plugins/_knn/warmup/index1,index2,index3?pretty - { - "_shards" : { - "total" : 6, - "successful" : 6, - "failed" : 0 - } - } - ``` - - The warmup API operation loads all native library indexes for all shards (primary and replica) for the specified indexes into the cache, so there's no penalty to load native library indexes during initial searches. - -This API operation only loads the segments of active indexes into the cache. 
If a merge or refresh operation finishes after the API runs, or if you add new documents, you need to rerun the API to load those native library indexes into memory. -{: .warning} - - -### Avoid reading stored fields - - If your use case is simply to read the IDs and scores of the nearest neighbors, you can disable reading stored fields, which saves time retrieving the vectors from stored fields. - -### Use `mmap` file I/O - - For the Lucene-based approximate k-NN search, there is no dedicated cache layer that speeds up read/write operations. Instead, the plugin relies on the existing caching mechanism in OpenSearch core. In versions 2.4 and earlier of the Lucene-based approximate k-NN search, read/write operations were based on Java NIO by default, which can be slow, depending on the Lucene version and number of segments per shard. Starting with version 2.5, k-NN enables [`mmap`](https://en.wikipedia.org/wiki/Mmap) file I/O by default when the store type is `hybridfs` (the default store type in OpenSearch). This leads to fast file I/O operations and improves the overall performance of both data ingestion and search. The two file extensions specific to vector values that use `mmap` are `.vec` and `.vem`. For more information about these file extensions, see [the Lucene documentation](https://lucene.apache.org/core/9_0_0/core/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsFormat.html). - - The `mmap` file I/O uses the system file cache rather than memory allocated for the Java heap, so no additional allocation is required. To change the default list of extensions set by the plugin, update the `index.store.hybrid.mmap.extensions` setting at the cluster level using the [Cluster Settings API]({{site.url}}{{site.baseurl}}/api-reference/cluster-api/cluster-settings). **Note**: This is an expert-level setting that requires closing the index before updating the setting and reopening it after the update. - -## Improving recall - -Recall depends on multiple factors like number of vectors, number of dimensions, segments, and so on. Searching over a large number of small segments and aggregating the results leads to better recall than searching over a small number of large segments and aggregating results. The larger the native library index, the more chances of losing recall if you're using smaller algorithm parameters. Choosing larger values for algorithm parameters should help solve this issue but sacrifices search latency and indexing time. That being said, it's important to understand your system's requirements for latency and accuracy, and then choose the number of segments you want your index to have based on experimentation. - -The default parameters work on a broader set of use cases, but make sure to run your own experiments on your data sets and choose the appropriate values. For index-level settings, see [Index settings]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index#index-settings). - -## Approximate nearest neighbor versus score script - -The standard k-NN query and custom scoring option perform differently. Test with a representative set of documents to see if the search results and latencies match your expectations. - -Custom scoring works best if the initial filter reduces the number of documents to no more than 20,000. Increasing shard count can improve latency, but be sure to keep shard size within the [recommended guidelines]({{site.url}}{{site.baseurl}}/intro/#primary-and-replica-shards). 
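For comparison, the following sketch shows the exact, score script form of a filtered vector search. The index name, vector field, filter field, and values are illustrative; an equivalent approximate search would run a standard `knn` query against the same field instead of using `script_score`.

```json
GET my-vector-index/_search
{
  "size": 10,
  "query": {
    "script_score": {
      "query": {
        "bool": {
          "filter": {
            "term": {
              "status": "published"
            }
          }
        }
      },
      "script": {
        "source": "knn_score",
        "lang": "knn",
        "params": {
          "field": "my_vector_field",
          "query_value": [1.5, 5.5, 1.5, 5.5, 1.5, 5.5, 1.5, 5.5],
          "space_type": "l2"
        }
      }
    }
  }
}
```
{% include copy-curl.html %}

Because both forms accept the same filter, you can run them side by side on a representative sample to compare recall and latency.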
diff --git a/_search-plugins/knn/settings.md b/_search-plugins/knn/settings.md deleted file mode 100644 index e4731ec94ce..00000000000 --- a/_search-plugins/knn/settings.md +++ /dev/null @@ -1,39 +0,0 @@ ---- -layout: default -title: Settings -parent: k-NN search -nav_order: 40 ---- - -# k-NN settings - -The k-NN plugin adds several new cluster settings. To learn more about static and dynamic settings, see [Configuring OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/). - -## Cluster settings - -The following table lists all available cluster-level k-NN settings. For more information about cluster settings, see [Configuring OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/#updating-cluster-settings-using-the-api) and [Updating cluster settings using the API]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/#updating-cluster-settings-using-the-api). - -Setting | Static/Dynamic | Default | Description -:--- | :--- | :--- | :--- -`knn.plugin.enabled`| Dynamic | `true` | Enables or disables the k-NN plugin. -`knn.algo_param.index_thread_qty` | Dynamic | `1` | The number of threads used for native library index creation. Keeping this value low reduces the CPU impact of the k-NN plugin but also reduces indexing performance. -`knn.cache.item.expiry.enabled` | Dynamic | `false` | Whether to remove native library indexes that have not been accessed for a certain duration from memory. -`knn.cache.item.expiry.minutes` | Dynamic | `3h` | If enabled, the amount of idle time before a native library index is removed from memory. -`knn.circuit_breaker.unset.percentage` | Dynamic | `75` | The native memory usage threshold for the circuit breaker. Memory usage must be lower than this percentage of `knn.memory.circuit_breaker.limit` in order for `knn.circuit_breaker.triggered` to remain `false`. -`knn.circuit_breaker.triggered` | Dynamic | `false` | True when memory usage exceeds the `knn.circuit_breaker.unset.percentage` value. -`knn.memory.circuit_breaker.limit` | Dynamic | `50%` | The native memory limit for native library indexes. At the default value, if a machine has 100 GB of memory and the JVM uses 32 GB, then the k-NN plugin uses 50% of the remaining 68 GB (34 GB). If memory usage exceeds this value, then the plugin removes the native library indexes used least recently. -`knn.memory.circuit_breaker.enabled` | Dynamic | `true` | Whether to enable the k-NN memory circuit breaker. -`knn.model.index.number_of_shards`| Dynamic | `1` | The number of shards to use for the model system index, which is the OpenSearch index that stores the models used for approximate nearest neighbor (ANN) search. -`knn.model.index.number_of_replicas`| Dynamic | `1` | The number of replica shards to use for the model system index. Generally, in a multi-node cluster, this value should be at least 1 in order to increase stability. -`knn.model.cache.size.limit` | Dynamic | `10%` | The model cache limit cannot exceed 25% of the JVM heap. -`knn.faiss.avx2.disabled` | Static | `false` | A static setting that specifies whether to disable the SIMD-based `libopensearchknn_faiss_avx2.so` library and load the non-optimized `libopensearchknn_faiss.so` library for the Faiss engine on machines with x64 architecture. For more information, see [SIMD optimization for the Faiss engine]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#simd-optimization-for-the-faiss-engine). 
-`knn.faiss.avx512.disabled` | Static | `false` | A static setting that specifies whether to disable the SIMD-based `libopensearchknn_faiss_avx512.so` library and load the `libopensearchknn_faiss_avx2.so` library or the non-optimized `libopensearchknn_faiss.so` library for the Faiss engine on machines with x64 architecture. For more information, see [SIMD optimization for the Faiss engine]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#simd-optimization-for-the-faiss-engine). - -## Index settings - -The following table lists all available index-level k-NN settings. All settings are static. For information about updating static index-level settings, see [Updating a static index setting]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#updating-a-static-index-setting). - -Setting | Default | Description -:--- | :--- | :--- -`index.knn.advanced.filtered_exact_search_threshold`| `null` | The filtered ID threshold value used to switch to exact search during filtered ANN search. If the number of filtered IDs in a segment is lower than this setting's value, then exact search will be performed on the filtered IDs. -`index.knn.algo_param.ef_search` | `100` | `ef` (or `efSearch`) represents the size of the dynamic list for the nearest neighbors used during a search. Higher `ef` values lead to a more accurate but slower search. `ef` cannot be set to a value lower than the number of queried nearest neighbors, `k`. `ef` can take any value between `k` and the size of the dataset. \ No newline at end of file diff --git a/_search-plugins/ltr/advanced-functionality.md b/_search-plugins/ltr/advanced-functionality.md new file mode 100644 index 00000000000..50a7e6de199 --- /dev/null +++ b/_search-plugins/ltr/advanced-functionality.md @@ -0,0 +1,541 @@ +--- +layout: default +title: Advanced functionality +nav_order: 80 +parent: Learning to Rank +has_children: false +--- + +# Advanced functionality + +OpenSearch Learning to Rank (LTR) offers additional functionality. It is recommended that you have a foundational understanding of OpenSearch LTR before working with these features. + +## Reusable features + +[Building features]({{site.url}}{{site.baseurl}}/search-plugins/ltr/working-with-features/) involves uploading a list of features. To avoid repeating common features across multiple sets, you can maintain a library of reusable features. 
+ +For example, if a title field query is frequently used in your feature sets, then you can create a reusable title query using the feature API: + +```json + POST _ltr/_feature/titleSearch + { + "feature": + { + "params": [ + "keywords" + ], + "template": { + "match": { + "title": "{{keywords}}" + } + } + } + } +``` +{% include copy-curl.html %} + +Normal CRUD operations apply, so you can delete a feature by using the following operation: + +```json +DELETE _ltr/_feature/titleSearch +``` +{% include copy-curl.html %} + + +To fetch an individual feature, you can use the following request: + +```json +GET _ltr/_feature/titleSearch +``` +{% include copy-curl.html %} + +To view a list of all features filtered by name prefix, you can use the following request: + +```json +GET /_ltr/_feature?prefix=t +``` +{% include copy-curl.html %} + +To create or update a feature set, you can refer to the `titleSearch` feature by using the following request: + +```json +POST /_ltr/_featureset/my_featureset/_addfeatures/titleSearch +``` +{% include copy-curl.html %} + +This adds the `titleSearch` feature to the next ordinal position within the `my_featureset` feature set. + +## Derived features + +Derived features are those that build upon other features. These can be expressed as [Lucene expressions](http://lucene.apache.org/core/7_1_0/expressions/index.html?org/apache/lucene/expressions/js/package-summary.html) and are identified by the `"template_language": "derived_expression"`. + +Additionally, derived features can accept query-time variables of type [`Number`](https://docs.oracle.com/javase/8/docs/api/java/lang/Number.html), as described in [Creating feature sets]({{site.url}}{{site.baseurl}}/search-plugins/ltr/working-with-features#creating-feature-sets). + +### Script features + +Script features are a type of [derived feature](#derived-features). These features have access to the `feature_vector`, but they are implemented as native or Painless OpenSearch scripts rather than as [Lucene +expressions](http://lucene.apache.org/core/7_1_0/expressions/index.html?org/apache/lucene/expressions/js/package-summary.html). + +To identify these features, set the `"template_language": "script_feature""`. The custom script can access the `feature_vector` through the [Java Map](https://docs.oracle.com/javase/8/docs/api/java/util/Map.html), as described in [Create a feature set]({{site.url}}{{site.baseurl}}/search-plugins/ltr/working-with-features#creating-feature-sets). + +Script-based features may impact the performance of your OpenSearch cluster, so it is best to avoid them if you require highly performant queries. +{: .warning} + +### Script feature parameters + +Script features are native or Painless scripts within the context of LTR. These script features can accept parameters as described in the [OpenSearch script documentation]({{site.url}}{{site.baseurl}}/api-reference/script-apis/index/). When working with LTR scripts, you can override parameter values and names. The priority for parameterization, in increasing order, is as follows: + +- The parameter name and value are passed directly to the source script, but not in the LTR script parameters. These cannot be configured at query time. +- The parameter name is passed to both the `sltr` query and the source script, allowing the script parameter values to be overridden at query time. 
+- The LTR script parameter name to native script parameter name indirection allows you to use different parameter names in your LTR feature definition than those in the underlying native script. This gives you flexibility in how you define and use scripts within the LTR context. + +For example, to set up a customizable way to rank movies in search results, considering both the title match and other adjustable factors, you can use the following request: + +```json +POST _ltr/_featureset/more_movie_features +{ + "featureset": { + "features": [ + { + "name": "title_query", + "params": [ + "keywords" + ], + "template_language": "mustache", + "template": { + "match": { + "title": "{{keywords}}" + } + } + }, + { + "name": "custom_title_query_boost", + "params": [ + "some_multiplier", + "ltr_param_foo" + ], + "template_language": "script_feature", + "template": { + "lang": "painless", + "source": "(long)params.default_param * params.feature_vector.get('title_query') * (long)params.some_multiplier * (long) params.param_foo", + "params": { + "default_param": 10, + "some_multiplier": "some_multiplier", + "extra_script_params": { + "ltr_param_foo": "param_foo" + } + } + } + } + ] + } +} +``` +{% include copy-curl.html %} + +## Multiple feature stores + +A feature store corresponds to an independent LTR system, including features, feature sets, and models backed by a single index and cache. A feature store typically represents a single search problem or application, like Wikipedia or Wiktionary. To use multiple feature stores in your OpenSearch cluster, you can create and manage them using the provided API. For example, you can create a feature set for the `wikipedia` feature store as follows: + +```json +PUT _ltr/wikipedia + +POST _ltr/wikipedia/_featureset/attempt_1 +{ + "featureset": { + "features": [ + { + "name": "title_query", + "params": [ + "keywords" + ], + "template_language": "mustache", + "template": { + "match": { + "title": "{{keywords}}" + } + } + } + ] + } +} +``` +{% include copy-curl.html %} + +When logging features, you can specify the feature store using the `store` parameter in the `sltr` section of your query, as shown in the following example structure. If you do not provide a `store` parameter, the default store is used to look up the feature set. + +```json +{ + "sltr": { + "_name": "logged_featureset", + "featureset": "attempt_1", + "store": "wikipedia", + "params": { + "keywords": "star" + } + } +} +``` +{% include copy-curl.html %} + +To delete the feature set, you can use the following operation: + +```json +DELETE _ltr/wikipedia/_featureset/attempt_1 +``` +{% include copy-curl.html %} + +## Model caching + +The Model Caching plugin uses an internal cache for compiled models. 
To force the models to be recompiled, you can clear the cache for a feature store: + +```json +POST /_ltr/_clearcache +``` +{% include copy-curl.html %} + +To get cluster-wide cache statistics for a specific store, use the following request: + +```json +GET /_ltr/_cachestats +``` +{% include copy-curl.html %} + +You can control the characteristics of the internal cache by using the following node settings: + +``` +# limit cache usage to 12 megabytes (defaults to 10mb or max_heap/10 if lower) ltr.caches.max_mem: 12mb +# Evict cache entries 10 minutes after insertion (defaults to 1hour, set to 0 to disable) ltr.caches.expire_after_write: 10m +# Evict cache entries 10 minutes after access (defaults to 1hour, set to 0 to disable) ltr.caches.expire_after_read: 10m +``` +{% include copy.html %} + +## Extra logging + +As described in [Logging features]({{site.url}}{{site.baseurl}}/search-plugins/ltr/logging-features/), you can use the logging extension to return feature values with each document. For native scripts, you can also return additional arbitrary information along with the logged features. + +For native scripts, the `extra_logging` parameter is injected into the script parameters. This parameter is a [`Supplier<Map<String,Object>>`](https://docs.oracle.com/javase/8/docs/api/java/util/function/Supplier.html), which provides a non-null `Map<String,Object>` only during the logging fetch phase. Any values you add to this map are returned alongside the logged features: + +```java +{ + @Override + public double runAsDouble() { + ... + Map<String,Object> extraLoggingMap = ((Supplier<Map<String,Object>>) getParams().get("extra_logging")).get(); + if (extraLoggingMap != null) { + extraLoggingMap.put("extra_float", 10.0f); + extraLoggingMap.put("extra_string", "additional_info"); + } + ... + } +} +``` +{% include copy-curl.html %} + +If the extra logging map is accessed, it is returned as an additional entry with the logged features. The format of the logged features, including the extra logging information, will appear similar to the following example: + +```json + { + "log_entry1": [ + { + "name": "title_query", + "value": 9.510193 + }, + { + "name": "body_query", + "value": 10.7808075 + }, + { + "name": "user_rating", + "value": 7.8 + }, + { + "name": "extra_logging", + "value": { + "extra_float": 10.0, + "extra_string": "additional_info" + } + } + ] +} +``` +{% include copy-curl.html %} + +## Feature score caching + +By default, the Feature Score Caching plugin calculates feature scores for both model inference and feature score logging. For example, if you write a query to rescore the top 100 documents and return the top 10 with feature scores, then the plugin calculates the feature scores of the top 100 documents for model inference and then calculates and logs the scores for the top 10 documents. + +The following query shows this behavior: + +```json +POST tmdb/_search +{ + "size": 10, + "query": { + "match": { + "_all": "rambo" + } + }, + "rescore": { + "window_size" : 100, + "query": { + "rescore_query": { + "sltr": { + "params": { + "keywords": "rambo" + }, + "model": "my_model" + } + } + } + }, + "ext": { + "ltr_log": { + "log_specs": { + "name": "log_entry1", + "rescore_index": 0 + } + } + } +} +``` +{% include copy-curl.html %} + +In some environments, it may be faster to cache the feature scores for model inference and reuse them for logging. 
To enable feature score caching, add the `cache: "true"` +flag to the `sltr` query that is the target of feature score logging, as shown in the following example: + +```json +{ + "sltr":{ + "cache":true, + "params":{ + "keywords":"rambo" + }, + "model":"my_model" + } +} +``` +{% include copy-curl.html %} + +## Stats + +You can use the Stats API to retrieve the plugin's overall status and statistics. To do this, send the following request: + +```json +GET /_ltr/_stats +``` +{% include copy-curl.html %} + +The response includes information about the cluster, configured stores, and cache statistics for various plugin components: + +```json +{ + "_nodes":{ + "total":1, + "successful":1, + "failed":0 + }, + "cluster_name":"es-cluster", + "stores":{ + "_default_":{ + "model_count":10, + "featureset_count":1, + "feature_count":0, + "status":"green" + } + }, + "status":"green", + "nodes":{ + "2QtMvxMvRoOTymAsoQbxhw":{ + "cache":{ + "feature":{ + "eviction_count":0, + "miss_count":0, + "hit_count":0, + "entry_count":0, + "memory_usage_in_bytes":0 + }, + "featureset":{ + "eviction_count":0, + "miss_count":0, + "hit_count":0, + "entry_count":0, + "memory_usage_in_bytes":0 + }, + "model":{ + "eviction_count":0, + "miss_count":0, + "hit_count":0, + "entry_count":0, + "memory_usage_in_bytes":0 + } + } + } + } +} +``` +{% include copy-curl.html %} + +You can use filters to retrieve a single statistic by sending the following request: + +```json +GET /_ltr/_stats/{stat} +``` +{% include copy-curl.html %} + +You can limit the information to a single node in the cluster by sending the following requests: + +```json +GET /_ltr/_stats/nodes/{nodeId} +GET /_ltr/_stats/{stat}/nodes/{nodeId} +``` +{% include copy-curl.html %} + +## TermStat query +Experimental +{: .label .label-red } + +The `TermStatQuery` is in an experimental stage, and the Domain-Specific Language (DSL) may change as the code advances. For stable term-statistic access, see [ExplorerQuery]{.title-ref}. + +The `TermStatQuery` is a reimagined version of the legacy `ExplorerQuery`. It provides a clearer way to specify terms and offers more flexibility for experimentation. This query surfaces the same data as the [ExplorerQuery]{.title-ref}, but it allows you to specify a custom Lucene expression to retrieve the desired data, such as in the following example: + +```json +POST tmdb/_search +{ + "query": { + "term_stat": { + "expr": "df", + "aggr": "max", + "terms": ["rambo", "rocky"], + "fields": ["title"] + } + } +} +``` +{% include copy-curl.html %} + +The `expr` parameter is used to specify a Lucene expression. This expression is run on a per-term basis. The expression can be a simple stat type or a custom formula with multiple stat types, such as `(tf * idf) / 2`. Available stat types in the Lucene expression context are listed in the following table. + +Type | Description +:---| :--- +`df` | The direct document frequency for a term. For example, if `rambo` occurs in three movie titles across multiple documents, then the value would be `3`. +`idf` | The inverse document frequency (IDF) calculation using the formula `log((NUM_DOCS+1)/(raw_df+1)) + 1`. +`tf` | The term frequency for a document. For example, if `rambo` occurs three times in a movie synopsis in the same document, then the value would be `3`. +`tp` | The term positions for a document. Multiple positions can be returned for a single term, so you should review the behavior of the `pos_aggr` parameter. +`ttf` | The total term frequency for a term across an index. 
For example, if `rambo` is mentioned a total of 100 times in the `overview` field across all documents, then the value would be `100`. + +The `aggr` parameter specifies the type of aggregation to be applied to the collected statistics from the `expr`. For example, if you specify the terms `rambo` and `rocky`, then the query gathers statistics for both terms. Because you can only return a single value, you need to decide which statistical calculation to use. The available aggregation types are `min`, `max`, `avg`, `sum`, and `stddev`. The query also provides the following counts: `matches` (the number of terms that matched in the current document) and `unique` (the unique number of terms that were passed in the query). + +The `terms` parameter specifies an array of terms for which you want to gather statistics. Only single terms are supported, with no support for phrases or span queries. If your field is tokenized, you can pass multiple terms in one string in the array. + +The `fields` parameter specifies the fields to check for the specified `terms`. If no `analyzer` is specified, then the configured `search_analyzer` for each field is used. + +The optional parameters are listed in the following table. + +Type | Description +:---| :--- +`analyzer` | If specified, this analyzer is used instead of the configured `search_analyzer` for each field. +`pos_aggr` | Because each term can have multiple positions, you can use this parameter to specify the aggregation to apply to the term positions. This supports the same values as the `aggr` parameter and defaults to `avg`. + +### Script injection + +Script injection provides the ability to inject term statistics into a scripting context. When working with `ScriptFeatures`, you can pass a `term_stat` object with the `terms`, `fields`, and `analyzer` parameters. An injected variable named `termStats` then provides access to the raw values in your custom script. This enables advanced feature engineering by giving you access to all the underlying data. + +To access the count of matched tokens, use [`params.matchCount.get`]{.title-ref}. To access the unique token count, use [`params.uniqueTerms`]{.title-ref}. + +You can either hardcode the `term_stat` parameter in your script definition or pass the parameter to be set at query time. For example, the following example query defines a feature set with a script feature that uses hardcoded `term_stat` parameters: + +```json +POST _ltr/_featureset/test +{ + "featureset": { + "features": [ + { + "name": "injection", + "template_language": "script_feature", + "template": { + "lang": "painless", + "source": "params.termStats['df'].size()", + "params": { + "term_stat": { + "analyzer": "!standard", + "terms": ["rambo rocky"], + "fields": ["overview"] + } + } + } + } + ] + } +} +``` +{% include copy-curl.html %} + +Analyzer names must be prefixed with a bang(!) when specifying them locally. Otherwise, they are treated as the parameter lookup value. 
+{: .note} + +To set parameter lookups, you can pass the name of the parameter from which you want to pull the value, as shown in the following example request: + +```json +POST _ltr/_featureset/test +{ + "featureset": { + "features": [ + { + "name": "injection", + "template_language": "script_feature", + "template": { + "lang": "painless", + "source": "params.termStats['df'].size()", + "params": { + "term_stat": { + "analyzer": "analyzerParam", + "terms": "termsParam", + "fields": "fieldsParam" + } + } + } + } + ] + } +} +``` +{% include copy-curl.html %} + +Alternatively, you can pass the `term_stat` parameters as query-time parameters, as shown in the following request: + +```json +POST tmdb/_search +{ + "query": { + "bool": { + "filter": [ + { + "terms": { + "_id": ["7555", "1370", "1369"] + } + }, + { + "sltr": { + "_name": "logged_featureset", + "featureset": "test", + "params": { + "analyzerParam": "standard", + "termsParam": ["troutman"], + "fieldsParam": ["overview"] + } + }} + ] + } + }, + "ext": { + "ltr_log": { + "log_specs": { + "name": "log_entry1", + "named_query": "logged_featureset" + } + } + } +} +``` +{% include copy-curl.html %} diff --git a/_search-plugins/ltr/core-concepts.md b/_search-plugins/ltr/core-concepts.md new file mode 100644 index 00000000000..4a7f73e4ce4 --- /dev/null +++ b/_search-plugins/ltr/core-concepts.md @@ -0,0 +1,141 @@ +--- +layout: default +title: ML ranking core concepts +nav_order: 10 +parent: Learning to Rank +has_children: false +--- + +# ML ranking core concepts + +This guide is intended for OpenSearch developers and data scientist who are interested in adding machine learning (ML) ranking capabilities to their OpenSearch system. + +## What is LTR? + +Learning to Rank (LTR) applies ML to search relevance ranking. This differs from other classic ML problems, such as the following: + +- **Regression:** The goal is to predict a variable, such as a stock price, as a function of known information, such as number of employees or revenue. The output is a direct prediction. +- **Classification:** The goal is to categorize an entity into predefined classes, for example, profitable or not profitable. The output is a category. + +The objective of LTR is not to make a direct prediction but rather to learn a function (`f`) that can rank documents in an order that best matches your perception of relevance for a given query. The output `f` does not represent a literal value but rather a prediction of the document's relative usefulness. + +For comprehensive information about LTR, see [How is Search Different From Other Machine Learning Problems?](http://opensourceconnections.com/blog/2017/08/03/search-as-machine-learning-prob/) and [What is Learning to Rank?](http://opensourceconnections.com/blog/2017/02/24/what-is-learning-to-rank/). + +## Defining the ideal ordering with judgment lists + +Judgment lists, also known as golden sets, provide a way to grade individual search results for a keyword search. These lists express the ideal ordering of search results based on your expectations. + +For example, using the [demo on GitHub](http://github.com/opensearch-project/opensearch-learning-to-rank-base/tree/main/demo/), in a search for `Rambo`, the judgment list may appear similar to the following: + +``` +grade,keywords,movie +4,Rambo,First Blood # Exactly Relevant +4,Rambo,Rambo +3,Rambo,Rambo III # Fairly Relevant +3,Rambo,Rambo First Blood Part II +2,Rambo,Rocky # Tangentially Relevant +2,Rambo,Cobra +0,Rambo,Bambi # Not even close... 
+0,Rambo,First Daughter +``` + +This judgment list establishes the ideal ordering of search results for the query `Rambo`. Metrics like [Normalized Discounted Cumulative Gain (NDCG)](https://en.wikipedia.org/wiki/Discounted_cumulative_gain) and [Expected Reciprocal Rank (ERR)](https://dl.acm.org/doi/abs/10.1145/1645953.1646033) can then be used to evaluate how closely the actual search results match this ideal ordering. + +The ranking function `f` aims to generate results closely aligned with the judgment list, maximizing quality metrics across various training queries. This ensures maximally useful search results. + +## Understanding features as building blocks of relevance + +The ranking function `f` uses input variables to arrive at a predicted output. For example, in stock price forecasting, input variables may encompass company-specific data like employee count and revenue. Likewise, in search relevance, the predictive model must leverage features that characterize the document, the query, and their associations, such as the [term frequency–inverse document frequency (TF–IDF)](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) score of the query keywords in a field. + +Similarly, in the context of searching for movies, the ranking function must use relevant features to determine the most relevant results. These features may include: + +- Whether and to what degree the search keywords match the title field, such as `titleScore`. +- Whether and to what degree the search keywords match the description field, such as `descScore`. +- The movie's popularity, such as `popularity`. +- The movie's rating, such as `rating`. +- The number of keywords used during the search, such as `numKeywords*)`. + +The ranking function would become `f(titleScore, descScore, popularity, rating, numKeywords)`. The goal is to use the features in a way that maximizes the likelihood of the search results being useful. + +For example, in the `Rambo` use case, it seems intuitive that `titleScore` would be important. However, for the top movie _First Blood_, the keyword `Rambo` is likely only mentioned in the description. In this case, the `descScore` would become relevant. Additionally, the `popularity` and `rating` features could help differentiate between sequels and originals. If the existing features do not work for this purpose, then a new feature `isSequel` could be introduced. This new feature could then be used to make better ranking decisions. + +Selecting and experimenting with features is fundamental to LTR. Using features that fail to help predict patterns in the target variable can result in an unsatisfactory search experience, following the principle of "garbage in, garbage out" that applies to any ML problem. + +## Completing the training set by logging features + +When you have a set of defined features, the next step is to annotate the judgment list with each feature's values. These values are used when the training process begins. For example, consider the following judgment list: + +``` +grade,keywords,movie +4,Rambo,First Blood +4,Rambo,Rambo +3,Rambo,Rambo III +... +``` + +To complete the training set, add the following features: + +``` +grade,keywords,movie,titleScore,descScore,popularity,... +4,Rambo,First Blood,0.0,21.5,100,... +4,Rambo,Rambo,42.5,21.5,95,... +3,Rambo,Rambo III,53.1,40.1,50,... +``` + +The `titleScore` represents the relevance score of the `Rambo` keyword in the title field of the document, and so on. 
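In the plugin, each of these conceptual features is ultimately expressed as a templated OpenSearch query whose score can be logged for every graded document. Purely for illustration, `titleScore` and `descScore` might be captured by a feature set similar to the following sketch; the feature set name, field names, and `keywords` parameter are hypothetical, and feature definitions are covered in detail in [Working with features]({{site.url}}{{site.baseurl}}/search-plugins/ltr/working-with-features/):

```json
POST _ltr/_featureset/movie_features
{
  "featureset": {
    "features": [
      {
        "name": "titleScore",
        "params": ["keywords"],
        "template_language": "mustache",
        "template": {
          "match": {
            "title": "{% raw %}{{keywords}}{% endraw %}"
          }
        }
      },
      {
        "name": "descScore",
        "params": ["keywords"],
        "template_language": "mustache",
        "template": {
          "match": {
            "overview": "{% raw %}{{keywords}}{% endraw %}"
          }
        }
      }
    ]
  }
}
```
{% include copy-curl.html %}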
+ +Many LTR models are familiar with a file format introduced by Support Vector Machine for Ranking (SVMRank), an early LTR method. In this format, queries are given IDs, and the actual document identifier can be removed from the training process. Features are labeled with ordinals starting at `1`. For the preceding example, the file format would be: + +``` +4 qid:1 1:0.0 2:21.5 3:100,... +4 qid:1 1:42.5 2:21.5 3:95,... +3 qid:1 1:53.1 2:40.1 3:50,... +... +``` + +In actual systems, you might log these values and then use them later to annotate a judgment list. In other cases, the judgment list might come from user analytics, so the feature values are logged as you interact with the search application. See [Logging features]({{site.url}}{{site.baseurl}}/search-plugins/ltr/logging-features/) for more information. + +## Training a ranking function + +The following are key considerations for training a ranking function: + +- **Ranking models:** Several models, such as the following, are available for training, each with pros and cons: + + - **Tree-based models** (for example, LambdaMART, MART, Random Forests) + - Generally the most accurate. + - Large and complex, making them expensive to train. + - Tools such as [RankLib](https://sourceforge.net/p/lemur/wiki/RankLib/) and [XGBoost](https://github.com/dmlc/xgboost) focus on tree-based models. + + - **SVM-based models (SVMRank)** + - Less accurate but less expensive to train. + - See [Support Vector Machine for Ranking](https://www.cs.cornell.edu/people/tj/svm_light/svm_rank.html) for more information. + + - **Linear models** + - Perform basic linear regression on the judgment list. + - Tend to not be useful outside of the examples. + - See [Learning to Rank 101 — Linear Models](http://opensourceconnections.com/blog/2017/04/01/learning-to-rank-linear-models/) for more information. + +- **Model selection:** The choice of model can depend not only on performance but also on your level of experience and familiarity with the different approaches. + +## Testing: Is the model any good? + +When testing the quality of the ranking model, consider the following: + +- **Judgment list limitations:** A judgment list cannot include every possible query that a model may encounter in the real world. It is important to test the model on a variety of queries in order to assess its ability to generalize beyond the training data. +- **Overfitting:** A model that is overfit to the training data does not perform well on new, unseen data. To avoid this, consider doing the following: + - Preserving some judgment lists as a _test set_ that is not used during the training process. + - Evaluating the model's performance on the test set, which reflects how it may perform in unfamiliar scenarios. + - Monitoring the _test NDCG_ metric, which should remain high as the model is trained. +- **Temporal generalization:** Even after deploying the model, you should continue testing the model's performance using more recent judgment lists to ensure that it does not become overfit to seasonal or temporal situations. + +## Real-world concerns + +The following are practical considerations for using the Learning to Rank plugin: + +- **Accurate judgment lists:** How can you create judgment lists that reflect your users' perception of search quality? +- **Measuring search quality:** What metrics should you use to determine whether the search results are useful to your users? 
+- **Data collection infrastructure:** What kind of infrastructure do you need in order to collect and log user behavior and feature data? +- **Model retraining:** How will you know when your model needs to be retrained? +- **A/B testing:** How will you compare your new model to your current search solution? What key performance indicators (KPIs) will you use to determine the success of your search system? + +See [How does the plugin fit in?]({{site.url}}{{site.baseurl}}/search-plugins/ltr/fits-in/) to learn more about how the Learning to Rank plugin's functionality fits into a complete LTR system. diff --git a/_search-plugins/ltr/faq.md b/_search-plugins/ltr/faq.md new file mode 100644 index 00000000000..14db276b3ab --- /dev/null +++ b/_search-plugins/ltr/faq.md @@ -0,0 +1,23 @@ +--- +layout: default +title: Common issues +nav_order: 1000 +parent: Learning to Rank +has_children: false +--- + +# Common issues + +To make the most of Learning to Rank (LTR), consider these helpful insights. + +## Negative scores + +Lucene does not allow for negative query scores. This can be problematic if your raw features include negative values. To address this, confirm that your features are non-negative _before_ training your model. You can achieve this by creating normalized fields with values shifted by the minimum value or by passing the scores through a function that produces a value greater than or equal to `0`. + +## Bugs + +If you encounter a bug while working with the plugin, you can open an issue in the [opensearch-learning-to-rank-base repository](https://github.com/opensearch-project/opensearch-learning-to-rank-base/issues). The project team regularly investigates and resolves issues. If you are seeking general support, the issue may be closed and you may be directed to the relevant support channel(s). + +## Further assistance + +If you need further assistance, join the [Relevance Slack Community](https://opensourceconnections.com/slack) and participate in the #opensearch-learn-to-rank channel to receive guidance and support from the community. diff --git a/_search-plugins/ltr/feature-engineering.md b/_search-plugins/ltr/feature-engineering.md new file mode 100644 index 00000000000..a059dcf7096 --- /dev/null +++ b/_search-plugins/ltr/feature-engineering.md @@ -0,0 +1,88 @@ +--- +layout: default +title: Feature engineering +nav_order: 40 +parent: Learning to Rank +has_children: false +--- + +# Feature engineering + +Common feature engineering tasks that you may encounter while developing a learning to rank (LTR) solution are described in the following sections. + +## Getting raw term statistics + +Many LTR solutions use raw term statistics in their training, such as the following: +- **Total term frequency (`raw_ttf`):** The total number of times that a term appears across an entire index. +- **Document frequency (`raw_df`):** The number of documents in which a term appears. +- **Term frequency (`raw_tf`):** The number of times that a term appears in a specific document. +- **Classic IDF (`classic_idf`):** The inverse document frequency (IDF) calculation `log((NUM_DOCS+1)/(raw_df+1)) + 1`. 
+ +The Learning to Rank plugin provides a `match_explorer` query primitive that can extract these statistics for you, as shown in the following example: + +```json +POST tmdb/_search +{ + "query": { + "match_explorer": { + "type": "max_raw_df", + "query": { + "match": { + "title": "rambo rocky" + } + } + } + } +} +``` +{% include copy-curl.html %} + +The query returns the highest document frequency between the terms `rambo ` and `rocky`. + +You can use operations such as `max`, `min`, `sum`, and `stddev` with the statistics to get the information you need. + +### Term position statistics + +You can prepend the `type` with the desired operation (`min`, `max`, `avg`) to calculate the corresponding statistic across the term positions. If the terms are not present in the document, then the result will be `0`. + +The available statistics include the following: + +- `min_raw_tp` (minimum raw term position): This statistic finds the earliest position of any search term in the document. For example, with the query `dance monkey`, if `dance` occurs at positions [2, 5, 9] and `monkey` occurs at [1, 4], then the minimum is 1. +- `max_raw_tp` (maximum raw term position): This statistic finds the latest position of any search term in the document. Using the preceding example, the maximum is 9. +- `avg_raw_tp` (average raw term position): This statistic calculates the average term position for any of the query terms. Using the preceding example, the average for `dance` is 5.33 [(2+5+9)/3)] and the average for `monkey` is 2.5 [(1+4)/2], with an overall average of 3.91. +- `unique_terms_count`: Provides a count of the unique search terms in the query. + +## Document-specific features + +When working on an LTR solution, you may need to incorporate features that are specific to the document rather than to the relationship between the query and the document. These document-specific features can include metrics related to popularity or recency. + +The `function_score` query provides the functionality to extract these document-specific features. The following example query shows how you can use it to incorporate the `vote_average` field as a feature: + +```json +{ + "query": { + "function_score": { + "functions": [{ + "field_value_factor": { + "field": "vote_average", + "missing": 0 + } + }], + "query": { + "match_all": {} + } + } + } +} +``` +{% include copy-curl.html %} + +In the example, the score of the query is determined by the value of the `vote_average` field, which could be a measure of document popularity or quality. + +## Index drift + +When working with an index that is regularly updated, it is important to consider that the trends and patterns you observe may not remain constant over time. Your index can drift as user behavior, content, and other factors change. For example, on an e-commerce store, you may find that sandals are popular during summer months but become almost impossible to find in the winter. Similarly, the features that drive purchases or engagement during one time period may not be as important during another. + +## Next steps + +Learn about [logging feature scores]({{site.url}}{{site.baseurl}}/search-plugins/ltr/logging-features/). 
diff --git a/_search-plugins/ltr/fits-in.md b/_search-plugins/ltr/fits-in.md new file mode 100644 index 00000000000..30ca291b823 --- /dev/null +++ b/_search-plugins/ltr/fits-in.md @@ -0,0 +1,29 @@ +--- +layout: default +title: Scope of the plugin +nav_order: 20 +parent: Learning to Rank +has_children: false +--- + +# Scope of the plugin + +The Learning to Rank plugin for OpenSearch helps you develop and use machine learning (ML)-based ranking models for your application search operations. The following sections describe how the plugin fits into the overall LTR process. + +## What the plugin does + +The plugin provides the building blocks to develop and use LTR models, giving you the following capabilities: + +1. **Developing query-dependent features:** Create custom features that capture the relationship between a search query and a document. These features can be stored in OpenSearch. +2. **Logging feature values:** Record the feature values for documents returned in search results. Once you have logged the feature sets for your documents, you can combine this data with the judgment lists you have developed. This will give you a complete training set that you can use to test and train your ranking models. Tools such as RankLib or XGBoost can then be used to develop a satisfactory model. +3. **Deploying and using models:** Upload trained ranking models to the plugin and use them to rerank search results. The plugin offers a custom OpenSearch query domain-specific language (DSL) primitive that allows you to execute the model during the search process. + +## What the plugin does not do + +The plugin does not support the creation of judgment lists. This is a task you must handle yourself because it is domain specific. See the [Wikimedia Foundation blog](https://blog.wikimedia.org/2017/09/19/search-relevance-survey/) for an example approach to developing judgment lists for searching articles. Some domains, such as e-commerce, may focus more on conversion-related signals, while others may involve human relevance assessors (either internal experts or crowdsourced workers). + +The plugin does not handle model training or testing. This is an offline process that should be handled using the appropriate tools, such as [XGBoost](https://xgboost.ai/) and [RankLib](https://lemurproject.org/ranklib.php). The plugin integrates with these external model-building workflows. Training and testing ranking models can be a CPU-intensive task that requires data science expertise and offline testing. Most organizations prefer to have data scientists oversee the model development process rather than running it directly in their production environment. + +## Next steps + +Learn about [working with features]({{site.url}}{{site.baseurl}}/search-plugins/ltr/working-with-features/). diff --git a/_search-plugins/ltr/index.md b/_search-plugins/ltr/index.md new file mode 100644 index 00000000000..e6c4aeff36e --- /dev/null +++ b/_search-plugins/ltr/index.md @@ -0,0 +1,24 @@ +--- +layout: default +title: Learning to Rank +nav_order: 20 +has_children: true +has_toc: false +redirect_from: + - /search-plugins/ltr/ +--- + +# Learning to Rank + +The Learning to Rank plugin for OpenSearch enables you to use machine learning (ML) and behavioral data to fine-tune the relevance of documents. It uses models from the [XGBoost](https://xgboost.ai/) and [RankLib](https://lemurproject.org/ranklib.php) libraries. 
These models rescore the search results, considering query-dependent features such as click-through data or field matches, which can further improve relevance. + +The term _learning to rank_ is abbreviated as LTR throughout the OpenSearch documentation when the term is used in a general sense. For the plugin developer documentation, see [opensearch-learning-to-rank-base](https://github.com/opensearch-project/opensearch-learning-to-rank-base). +{: .note} + +## Getting started + +The following resources can help you get started: + +- If you are new to LTR, start with the [ML ranking core concepts]({{site.url}}{{site.baseurl}}/search-plugins/ltr/core-concepts/) documentation. +- For a quick introduction, see the demo in [hello-ltr](https://github.com/o19s/hello-ltr). +- If you are familiar with LTR, start with the [Integrating the plugin]({{site.url}}{{site.baseurl}}/search-plugins/ltr/fits-in/) documentation. diff --git a/_search-plugins/ltr/logging-features.md b/_search-plugins/ltr/logging-features.md new file mode 100644 index 00000000000..7922b8683d8 --- /dev/null +++ b/_search-plugins/ltr/logging-features.md @@ -0,0 +1,418 @@ +--- +layout: default +title: Logging feature scores +nav_order: 50 +parent: Learning to Rank +has_children: false +--- + +# Logging feature scores + +Feature values need to be logged in order to train a model. This is a crucial component of the Learning to Rank plugin---as you search, feature values from the feature sets are logged so that they can be used for training. This allows models that effectively predict relevance using that set of features to be discovered. + +## `sltr` query + +The `sltr` query is the primary method for running features and evaluating models. When logging, an `sltr` query is used to execute each feature query and retrieve the feature scores. A feature set structure that works with the [`hello-ltr`](https://github.com/o19s/hello-ltr) demo schema is shown in the following example request: + +```json +PUT _ltr/_featureset/more_movie_features +{ + "name": "more_movie_features", + "features": [ + { + "name": "body_query", + "params": [ + "keywords" + ], + "template": { + "match": { + "overview": "{% raw %}{{keywords}}{% endraw %}" + } + } + }, + { + "name": "title_query", + "params": [ + "keywords" + ], + "template": { + "match": { + "title": "{% raw %}{{keywords}}{% endraw %}" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +## Common use cases + +Common use cases for logging feature sets are described in the following sections. + +### Joining feature values with a judgment list + +If the judgment list is already available, you can join feature values for each keyword/document pair to create a complete training set. For example, consider the following judgment list: + +``` +grade,keywords,docId +4,rambo,7555 +3,rambo,1370 +3,rambo,1369 +4,rocky,4241 +``` +{% include copy-curl.html %} + +The feature values need to be retrieved for all documents that have a judgment for each search term, one search term at a time. For example, starting with a `rambo` search, a filter can be created for the associated document as follows: + +```json +{ + "filter": [ + {"terms": { + "_id": ["7555", "1370", "1369"] + }} + ] +} +``` +{% include copy-curl.html %} + +The Learning to Rank plugin must point to the features to be logged. The `sltr` query, which is part of the plugin, can be used for this purpose. 
The `sltr` query has a `_name` (the named queries feature) used to reference it, refers to the previously created feature set `more_movie_features`, and passes the search keyword `rambo` and any other required parameters, as shown in the following example query: + +```json +{ + "sltr": { + "_name": "logged_featureset", + "featureset": "more_movie_features", + "params": { + "keywords": "rambo" + } + } +} +``` +{% include copy-curl.html %} + +[Searching with LTR]({{site.url}}{{site.baseurl}}/search-plugins/ltr/searching-with-your-model/) provides an `sltr` query to use for executing a model. This `sltr` query is used as a mechanism to direct the Learning to Rank plugin to the feature set requiring logging. +{: .note} + +To avoid influencing the score, the `sltr` query is injected as a filter, as shown in the following example: + +```json +{ + "query": { + "bool": { + "filter": [ + { + "terms": { + "_id": [ + "7555", + "1370", + "1369" + ] + } + }, + { + "sltr": { + "_name": "logged_featureset", + "featureset": "more_movie_features", + "params": { + "keywords": "rambo" + } + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +Executing this query returns the three expected hits. The next step is to enable feature logging to refer to the `sltr` query to be logged. + +The logging identifies the `sltr` query, runs the feature set's queries, scores each document, and returns those scores as computed fields for each document, as shown in the following example logging structure: + +```json +"ext": { + "ltr_log": { + "log_specs": { + "name": "log_entry1", + "named_query": "logged_featureset" + } + } +} +``` +{% include copy-curl.html %} + +The log extension supports the following arguments: + +- `name`: The name of the log entry to fetch from each document. +- `named_query`: The named query that corresponds to an `sltr` query. +- `rescore_index`: If the `sltr` query is in a rescore phase, then this is the index of the query in the rescore list. +- `missing_as_zero`: Produces a `0` for missing features (when the feature does not match). Default is `false`. + +To enable the log to locate an `sltr` query, either during the normal query phase or during rescoring, either `named_query` or `rescore_index` must be set. +{: .note} + +The full example request is as follows: + +```json +POST tmdb/_search +{ + "query": { + "bool": { + "filter": [ + { + "terms": { + "_id": ["7555", "1370", "1369"] + } + }, + { + "sltr": { + "_name": "logged_featureset", + "featureset": "more_movie_features", + "params": { + "keywords": "rambo" + } + }} + ] + } + }, + "ext": { + "ltr_log": { + "log_specs": { + "name": "log_entry1", + "named_query": "logged_featureset" + } + } + } +} +``` +{% include copy-curl.html %} + +Each document now contains a log entry, as shown in the following example: + +```json +{ + "_index": "tmdb", + "_type": "movie", + "_id": "1370", + "_score": 20.291, + "_source": { + ... + }, + "fields": { + "_ltrlog": [ + { + "log_entry1": [ + {"name": "title_query" + "value": 9.510193}, + {"name": "body_query + "value": 10.7808075} + ] + } + ] + }, + "matched_queries": [ + "logged_featureset" + ] +} +``` +{% include copy-curl.html %} + +The judgment list can be joined with the feature values to produce a training set. For the line corresponding to document `1370` with keyword `rambo`, the following can be added: + +``` +> 4 qid:1 1:9.510193 2:10.7808075 +``` +{% include copy-curl.html %} + +Repeat this process for all of your queries. 
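For example, the logging searches for the `rambo` and `rocky` keywords in the preceding judgment list can be batched into a single request. The following sketch uses the Multi-search API; the document IDs are taken from the example judgment list, and each search body repeats the same `sltr` filter and `ltr_log` extension shown above:

```json
POST tmdb/_msearch
{}
{"query":{"bool":{"filter":[{"terms":{"_id":["7555","1370","1369"]}},{"sltr":{"_name":"logged_featureset","featureset":"more_movie_features","params":{"keywords":"rambo"}}}]}},"ext":{"ltr_log":{"log_specs":{"name":"log_entry1","named_query":"logged_featureset"}}}}
{}
{"query":{"bool":{"filter":[{"terms":{"_id":["4241"]}},{"sltr":{"_name":"logged_featureset","featureset":"more_movie_features","params":{"keywords":"rocky"}}}]}},"ext":{"ltr_log":{"log_specs":{"name":"log_entry1","named_query":"logged_featureset"}}}}
```
{% include copy-curl.html %}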
+
+For large judgment lists, it is recommended to batch the logs for multiple queries. You can use [multi-search]({{site.url}}{{site.baseurl}}/api-reference/multi-search/) capabilities for this purpose.
+{: .note}
+
+### Logging values for a live feature set
+
+If you are already running a model in production within an `sltr` query, the live search request may appear similar to the following example:
+
+```json
+POST tmdb/_search
+{
+    "query": {
+        "match": {
+            "_all": "rambo"
+        }
+    },
+    "rescore": {
+        "query": {
+            "rescore_query": {
+                "sltr": {
+                    "params": {
+                        "keywords": "rambo"
+                    },
+                    "model": "my_model"
+                }
+            }
+        }
+    }
+}
+```
+{% include copy-curl.html %}
+
+See [Searching with LTR]({{site.url}}{{site.baseurl}}/search-plugins/ltr/searching-with-your-model/) for information about model execution.
+{: .note}
+
+To log the feature values for the query, apply the appropriate logging spec to reference the `sltr` query, as shown in the following example:
+
+```json
+"ext": {
+    "ltr_log": {
+        "log_specs": {
+            "name": "log_entry1",
+            "rescore_index": 0
+        }
+    }
+}
+```
+{% include copy-curl.html %}
+
+This logs the features in the response, enabling future model retraining using the same feature set.
+
+### Modifying and logging an existing feature set
+
+Feature sets can be expanded. For example, as shown in the following example request, if a new feature, such as `user_rating`, needs to be incorporated, it can be added to the existing feature set `more_movie_features` using the Feature Set Append API:
+
+```json
+POST _ltr/_featureset/more_movie_features/_addfeatures
+{
+  "features": [
+    {
+      "name": "user_rating",
+      "params": [],
+      "template_language": "mustache",
+      "template" : {
+        "function_score": {
+          "functions": {
+            "field": "vote_average"
+          },
+          "query": {
+            "match_all": {}
+          }
+        }
+      }
+    }
+  ]
+}
+```
+{% include copy-curl.html %}
+
+See [Working with features]({{site.url}}{{site.baseurl}}/search-plugins/ltr/working-with-features/) for more information.
+{: .note} + +When logging is performed, the new feature is included in the output, as shown in the following example: + +``` json +{ + "log_entry1": [ + { + "name": "title_query", + "value": 9.510193 + }, + { + "name": "body_query", + "value": 10.7808075 + }, + { + "name": "user_rating", + "value": 7.8 + } + ] +} +``` +{% include copy-curl.html %} + +### Logging values for a proposed feature set + +You can create a completely new feature set for experimental purposes, for example, `other_movie_features`, as shown in the following example request: + +```json +PUT _ltr/_featureset/other_movie_features +{ + "name": "other_movie_features", + "features": [ + { + "name": "cast_query", + "params": [ + "keywords" + ], + "template": { + "match": { + "cast.name": "{% raw %}{{keywords}}{% endraw %}" + } + } + }, + { + "name": "genre_query", + "params": [ + "keywords" + ], + "template": { + "match": { + "genres.name": "{% raw %}{{keywords}}{% endraw %}" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +The feature set, `other_movie_features`, can be logged alongside the live production set, `more_movie_features`, by appending it as another filter, as shown in the following example request: + +```json +POST tmdb/_search +{ +"query": { + "bool": { + "filter": [ + { "sltr": { + "_name": "logged_featureset", + "featureset": "other_movie_features", + "params": { + "keywords": "rambo" + } + }}, + {"match": { + "_all": "rambo" + }} + ] + } +}, +"rescore": { + "query": { + "rescore_query": { + "sltr": { + "params": { + "keywords": "rambo" + }, + "model": "my_model" + } + } + } +} +} +``` +{% include copy-curl.html %} + +You can continue adding as many feature sets as needed for logging. + +## Logging scenarios + +Once you have covered the basics, you can consider some real-life feature logging scenarios. + +First, logging is used to develop judgment lists from user analytics to capture the exact value of a feature at the precise time of interaction. For instance, you may want to know the recency, title score, and other values at the precise time of a user's interaction. This would help you analyze which features or factors had relevance while training. To achieve this, you can build a comprehensive feature set for future experimentation. + +Second, logging can be used to retrain a model in which you already have confidence. You may want to keep your models up to date with a shifting index because models can lose their effectiveness over time. You may have A/B testing in place or be monitoring business metrics and notice gradual degradation in model performance. + +Third, logging is used during model development. You may have a judgment list but want to iterate heavily with a local copy of OpenSearch. This allows for extensive experimentation with new features, adding and removing them from the feature sets as needed. While this process may result in being slightly out of sync with the live index, the goal is to arrive at a set of satisfactory model parameters. Once this is achieved, the model can be trained with production data to confirm that the level of performance remains acceptable. + +## Next steps + +Learn more about training models in the [Uploading a trained model]({{site.url}}{{site.baseurl}}/search-plugins/ltr/training-models/) documentation. 
diff --git a/_search-plugins/ltr/searching-with-your-model.md b/_search-plugins/ltr/searching-with-your-model.md new file mode 100644 index 00000000000..ca1ff87307e --- /dev/null +++ b/_search-plugins/ltr/searching-with-your-model.md @@ -0,0 +1,102 @@ +--- +layout: default +title: Optimizing search with LTR +nav_order: 70 +parent: Learning to Rank +has_children: false +--- + +# Optimizing search with LTR + +After you have trained a model, you can use the `sltr` query to execute it. However, directly running the query on the entire index is not recommended because it can be CPU intensive and impact the performance of your OpenSearch cluster. The query allows you to apply your trained model to search results, as shown in the following example: + +```json + POST tmdb/_search + { + "query": { + "sltr": { + "params": { + "keywords": "rambo" + }, + "model": "my_model" + } + } + } +``` +{% include copy-curl.html %} + +## Rescoring top N + +To execute your model more efficiently, you can use the built-in rescore functionality to apply your model to the top N results of a baseline relevance query, as shown in the following example query: + +```json + POST tmdb/_search + { + "query": { + "match": { + "_all": "rambo" + } + }, + "rescore": { + "window_size": 1000, + "query": { + "rescore_query": { + "sltr": { + "params": { + "keywords": "rambo" + }, + "model": "my_model" + } + } + } + } + } +``` +{% include copy-curl.html %} + +A `match` is first executed for the term `rambo` and then `my_model` is applied to the top 1,000 results. This baseline query is used to generate an initial set of results that are then scored using the default similarity BM25 probabilistic ranking framework to calculate relevance scores. + +## Rescoring a subset of features + +You can selectively score a subset of features by specifying the `active_features` in the `sltr` query, as shown in the following example. This allows you to focus the model's scoring on the selected features, while any unspecified features are marked as missing. You only need to specify the `params` relevant to the `active_features`. If you request a feature name that is not part of the assigned feature set, then the query throws an error. + +```json + POST tmdb/_search + { + "query": { + "match": { + "_all": "rambo" + } + }, + "rescore": { + "window_size": 1000, + "query": { + "rescore_query": { + "sltr": { + "params": { + "keywords": "rambo" + }, + "model": "my_model", + "active_features": ["title_query"] + } + } + } + } + } +``` +{% include copy-curl.html %} + +The `my_model` model is applied but only scores the `title_query` feature. + +## Combining `sltr` with other OpenSearch features + +The `sltr` query can be integrated with the following OpenSearch features and functionalities to create more sophisticated and tailored search solutions that go beyond applying a model to your results: + +- Filtering out results based on business rules using OpenSearch filters before applying the model +- Chaining multiple rescores to refine the relevance of your results +- Rescoring once to address relevance with `sltr` and a second time for business concerns +- Downboosting relevant but low-quality content in the baseline query to prevent it from being rescored + +## Next steps + +Learn about [advanced functionality]({{site.url}}{{site.baseurl}}/search-plugins/ltr/advanced-functionality/). 
diff --git a/_search-plugins/ltr/training-models.md b/_search-plugins/ltr/training-models.md
new file mode 100644
index 00000000000..fb068cedd7c
--- /dev/null
+++ b/_search-plugins/ltr/training-models.md
@@ -0,0 +1,335 @@
+---
+layout: default
+title: Uploading trained models
+nav_order: 60
+parent: Learning to Rank
+has_children: false
+---
+
+# Uploading trained models
+
+While model training occurs outside of the Learning to Rank plugin, you can use the plugin for [logging feature scores]({{site.url}}{{site.baseurl}}/search-plugins/ltr/logging-features/). After you have trained a model, you can upload it to the plugin in one of the supported serialization formats, such as RankLib or XGBoost.
+
+## RankLib model training
+
+The feature logging process generates a RankLib-consumable judgment file. In the following judgment file, the query with ID 1 (`rambo`) includes the logged features 1 (a title `TF*IDF` score) and 2 (a description `TF*IDF` score) for a set of documents:
+
+```
+4 qid:1 1:9.8376875 2:12.318446 # 7555 rambo
+3 qid:1 1:10.7808075 2:9.510193 # 1370 rambo
+3 qid:1 1:10.7808075 2:6.8449354 # 1369 rambo
+3 qid:1 1:10.7808075 2:0.0 # 1368 rambo
+```
+
+The RankLib library can be called using the following command:
+
+```
+cmd = "java -jar RankLib-2.8.jar -ranker %s -train %s -save %s -frate 1.0" % (whichModel, judgmentsWithFeaturesFile, modelOutput)
+```
+
+The `judgmentsWithFeaturesFile` is the input provided to RankLib for training. Additional parameters can be passed. See the [RankLib documentation](https://sourceforge.net/p/lemur/wiki/RankLib/) for more information.
+
+RankLib outputs the model in its own serialization format. As shown in the following example, a LambdaMART model is an ensemble of regression trees:
+
+```
+## LambdaMART
+## No. of trees = 1000
+## No. of leaves = 10
+## No. of threshold candidates = 256
+## Learning rate = 0.1
+## Stop early = 100
+
+<ensemble>
+  <tree id="1" weight="0.1">
+    <split>
+      <feature> 2 </feature>
+      ...
+```
+
+Within the RankLib model, each tree in the ensemble examines feature values, makes decisions based on those values, and outputs a relevance score. The features are referred to by their ordinal position, starting from 1, which corresponds to the first (0th) feature in the original feature set. RankLib does not use feature names during model training.
+
+### Other RankLib models
+
+RankLib implements several other model types in addition to LambdaMART, such as MART, RankNet, RankBoost, AdaRank, Coordinate Ascent, ListNet, and Random Forests. Each of these models has its own set of parameters and training process.
+
+For example, the RankNet model is a neural network that learns to predict the probability of one document being more relevant than another. The model is trained using a pairwise loss function that compares the predicted relevance of two documents with their actual relevance. The model is serialized in a format similar to the following example:
+
+```
+## RankNet
+## Epochs = 100
+## No. of features = 5
+## No. of hidden layers = 1
+...
+## Layer 1: 10 neurons
+1 2
+1
+10
+0 0 -0.013491530393429608 0.031183180961270988 0.06558792020112071 -0.006024092627087733 0.05729619574181734 -0.0017010373987742411 0.07684848696852313 -0.06570387602230028 0.04390491141617467 0.013371636736099578
+...
+```
+
+All these models can be used with the Learning to Rank plugin, provided that the model is serialized in the RankLib format.
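+
+To make the training command shown earlier in this section concrete, the following Python sketch trains a LambdaMART model with RankLib. It assumes that Java and `RankLib-2.8.jar` are available locally and that `training.txt` is a judgment file with logged feature values in the format shown previously; the output file name is illustrative:
+
+```python
+# Minimal sketch: train a LambdaMART model with RankLib (assumptions noted above).
+import subprocess
+
+subprocess.run(
+    [
+        "java", "-jar", "RankLib-2.8.jar",
+        "-ranker", "6",                    # 6 selects LambdaMART in RankLib
+        "-train", "training.txt",          # judgment file with logged feature values
+        "-metric2t", "NDCG@10",            # metric to optimize during training
+        "-save", "lambdamart_model.txt",   # serialized model to upload to the plugin
+    ],
+    check=True,
+)
+```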
+ +## XGBoost model training + +Unlike the RankLib model, the XGBoost model is serialized in a format specific to gradient-boosted decision trees, as shown in the following example: + +```json + [ { "nodeid": 0, "depth": 0, "split": "tmdb_multi", "split_condition": 11.2009, "yes": 1, "no": 2, "missing": 1, "children": [ + { "nodeid": 1, "depth": 1, "split": "tmdb_title", "split_condition": 2.20631, "yes": 3, "no": 4, "missing": 3, "children": [ + { "nodeid": 3, "leaf": -0.03125 }, + ... +``` + +## XGBoost parameters + +Optional parameters can be specified for an XGBoost model. These parameters are specified as an object, with the decision trees specified in the `splits` field. The supported parameters include `objective`, which defines the model learning objective as described in the [XGBoost documentation](https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters). This parameter can transform the final model prediction. The supported values include `binary:logistic`, `binary:logitraw`, `rank:ndcg`, `rank:map`, `rank:pairwise`, `reg:linear`, and `reg:logistic`. + +## Simple linear models + +Machine learning (ML) models, such as Support Vector Machines (SVMs), output linear weights for each feature. The LTR model supports representing these linear weights in a simple format, such as those learned from an SVM or linear regression model. In the following example output, the weights indicate the relative importance of the features in the model's prediction: + +```json +{ + "title_query" : 0.3, + "body_query" : 0.5, + "recency" : 0.1 +} +``` + +## Feature normalization + +Feature normalization is used to convert feature values to a consistent range, typically between 0 and 1 or -1 and 1. This is done during the training phase to better understand the relative impact of each feature. Some models, especially linear ones such as SVMRank, rely on normalization to function correctly. + +## Model upload process + +After training your model, the next step is to make it available for search operations. This involves uploading the model to the Learning to Rank plugin. When uploading a model, you must provide the following information: + +- Feature set used during training +- Model type, for example, RankLib or XGBoost +- Model content + +The following example request shows how to upload a RankLib model that was trained using the `more_movie_features` feature set: + +```json + POST _ltr/_featureset/more_movie_features/_createmodel + { + "model": { + "name": "my_ranklib_model", + "model": { + "type": "model/ranklib", + "definition": "## LambdaMART\n + ## No. of trees = 1000 + ## No. of leaves = 10 + ## No. of threshold candidates = 256 + ## Learning rate = 0.1 + ## Stop early = 100 + + <ensemble> + <tree id="1" weight="0.1"> + <split> + <feature> 2 </feature> + ... + " + } + } + } +``` + +The following example request shows how to upload an XGBoost model that was trained using the `more_movie_features` feature set: + +```json + POST _ltr/_featureset/more_movie_features/_createmodel + { + "model": { + "name": "my_xgboost_model", + "model": { + "type": "model/xgboost+json", + "definition": "[ { \"nodeid\": 0, \"depth\": 0, \"split\": \"tmdb_multi\", \"split_condition\": 11.2009, \"yes\": 1, \"no\": 2, \"missing\": 1, \"children\": [ + { \"nodeid\": 1, \"depth\": 1, \"split\": \"tmdb_title\", \"split_condition\": 2.20631, \"yes\": 3, \"no\": 4, \"missing\": 3, \"children\": [ + { \"nodeid\": 3, \"leaf\": -0.03125 }, + ..." 
+ } + } + } +``` + +The following example request shows how to upload an XGBoost model that was trained using the `more_movie_features` feature set with parameters: + +```json + POST _ltr/_featureset/more_movie_features/_createmodel + { + "model": { + "name": "my_xgboost_model", + "model": { + "type": "model/xgboost+json", + "definition": "{ + \"objective\": \"reg:logistic\", + \"splits\": [ { \"nodeid\": 0, \"depth\": 0, \"split\": \"tmdb_multi\", \"split_condition\": 11.2009, \"yes\": 1, \"no\": 2, \"missing\": 1, \"children\": [ + { \"nodeid\": 1, \"depth\": 1, \"split\": \"tmdb_title\", \"split_condition\": 2.20631, \"yes\": 3, \"no\": 4, \"missing\": 3, \"children\": [ + { \"nodeid\": 3, \"leaf\": -0.03125 }, + ... + ] + }" + } + } + } +```` + +The following example request shows how to upload a simple linear model that was trained using the `more_movie_features` feature set: + +```json + POST _ltr/_featureset/more_movie_features/_createmodel + { + "model": { + "name": "my_linear_model", + "model": { + "type": "model/linear", + "definition": """ + { + "title_query" : 0.3, + "body_query" : 0.5, + "recency" : 0.1 + } + """ + } + } + } +``` + +## Creating a model with feature normalization + +Feature normalization is a crucial preprocessing step that can be applied before model evaluation. LTR supports two types of feature normalization: min-max and standard normalization. + +### Standard normalization + +Standard normalization transforms features as follows: + +- Maps the mean value to 0 +- Maps one standard deviation above the mean to 1 +- Maps one standard deviation below the mean to -1 + +The following example request shows how to create a model with standard feature normalization: + +```json + POST _ltr/_featureset/more_movie_features/_createmodel + { + "model": { + "name": "my_linear_model", + "model": { + "type": "model/linear", + "feature_normalizers": { + "release_year": { + "standard": { + "mean": 1970, + "standard_deviation": 30 + } + } + }, + "definition": """ + { + "release_year" : 0.3, + "body_query" : 0.5, + "recency" : 0.1 + } + """ + } + } + } +``` + +### Min-max normalization + +Min-max normalization scales features to a fixed range, typically between 0 and 1. Min-max normalization transforms features as follows: + +- Maps the specified minimum value to 0 +- Maps the specified maximum value to 1 +- Scales the values between 0 and 1 linearly + +The following example request shows how to implement min-max normalization: + +```json + "feature_normalizers": { + "vote_average": { + "min_max": { + "minimum": 0, + "maximum": 10 + } + } + } +``` + +## Model independence from feature sets + +Models are initially created with reference to a feature set. After their creation, they exist as independent top-level entities. + +### Accessing models + +To retrieve a model, use a GET request: + +``` +GET _ltr/_model/my_linear_model +``` + +To delete a model, use a DELETE request: + +``` +DELETE _ltr/_model/my_linear_model +``` + +Model names must be globally unique across all feature sets. +{: .note} + +### Model persistence + +When a model is created, its features are copied. This prevents changes to the original features from affecting existing models or model production. For example, if the feature set used to create the model is deleted, you can still access and use the model. 
+ +### Model response + +When retrieving a model, you receive a response that includes the features used to create it, as shown in the following example: + +```json + { + "_index": ".ltrstore", + "_type": "store", + "_id": "model-my_linear_model", + "_version": 1, + "found": true, + "_source": { + "name": "my_linear_model", + "type": "model", + "model": { + "name": "my_linear_model", + "feature_set": { + "name": "more_movie_features", + "features": [ + { + "name": "body_query", + "params": [ + "keywords" + ], + "template": { + "match": { + "overview": "{{keywords}}" + } + } + }, + { + "name": "title_query", + "params": [ + "keywords" + ], + "template": { + "match": { + "title": "{{keywords}}" + } + } + } + ]}}} +``` + +## Next steps + +Learn about [searching with LTR]({{site.url}}{{site.baseurl}}/search-plugins/ltr/searching-with-your-model/). diff --git a/_search-plugins/ltr/working-with-features.md b/_search-plugins/ltr/working-with-features.md new file mode 100644 index 00000000000..00ebd908d7e --- /dev/null +++ b/_search-plugins/ltr/working-with-features.md @@ -0,0 +1,270 @@ +--- +layout: default +title: Working with features +nav_order: 30 +parent: Learning to Rank +has_children: false +--- + +# Working with features + +The following sections describe the specific functionality provided by the Learning to Rank plugin. This information will help you build and upload features for your learning to rank (LTR) system. See [ML ranking core concepts]({{site.url}}{{site.baseurl}}/search-plugins/ltr/core-concepts/) and [Scope of the plugin]({{site.url}}{{site.baseurl}}/search-plugins/ltr/fits-in/) for more information about the Learning to Rank plugin's roles and functionality. + +## Understanding the role of features in the Learning to Rank plugin + +The Learning to Rank plugin defines a _feature_ as an _OpenSearch query_. When you execute an OpenSearch query using your search terms and other relevant parameters, the resulting score is the value that can be used in your training data. For example, a feature may include basic `match` queries on fields such as `title`: + +```json +{ + "query": { + "match": { + "title": "{% raw %}{{keywords}}{% endraw %}" + } + } +} +``` +{% include copy-curl.html %} + +In addition to simple query-based features, you can also use document properties, such as `popularity`, as features. For example, you can use a function score query to get the average movie rating: + +```json +{ + "query": { + "function_score": { + "functions": { + "field": "vote_average" + }, + "query": { + "match_all": {} + } + } + } +} +``` +{% include copy-curl.html %} + +Another example is a query based on location, such as a geodistance filter: + +```json +{ + "query": { + "bool" : { + "must" : { + "match_all" : {} + }, + "filter" : { + "geo_distance" : { + "distance" : "200km", + "pin.location" : { + "lat" : "{% raw %}{{users_lat}}{% endraw %}", + "lon" : "{% raw %}{{users_lon}}{% endraw %}" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +These types of queries are the building blocks that the ranking `f` function you are training combines mathematically to determine a relevance score. + +## Using Mustache templates in LTR queries + +The features in LTR queries use Mustache templates. This allows you to insert variables into your search queries. For example, you could have a query that uses `{% raw %}{{keywords}}{% endraw %}` to insert your search terms. Or you could use `{% raw %}{{users_lat}}{% endraw %}` and `{% raw %}{{users_lon}}{% endraw %}` to include the location. 
This gives you the flexibility to personalize your search. + +## Uploading and naming features + +The Learning to Rank plugin enables you to create and modify features. After you define your features, you can log them for use in model training. By combining the logged feature data with your judgment list, you can train a model. Once the model is ready, you can upload it and then apply it to your search queries. + +## Initializing the default feature store + +The Learning to Rank plugin uses a feature store to store metadata about your features and models. Typically, there is one feature store per major search implementation, for example, [Wikipedia](http://wikipedia.org) as compared to [Wikitravel](http://wikitravel.org). + +For most uses cases, you can use the default feature store and avoid managing multiple feature stores. To initialize the default feature store, run the following request: + +``` +PUT _ltr +``` +{% include copy-curl.html %} + +If you need to start again from the beginning, you can delete the default feature store by using the following operation: + +``` +DELETE _ltr +``` +{% include copy-curl.html %} + +Deleting the feature store removes all existing feature and model data. +{: .warning} + +The default feature store is used throughout the rest of this guide. + +## Working with features and feature sets + +A _feature set_ is a collection of features that have been grouped together. You can use feature sets to log multiple feature values for offline training. When creating a new model, you copy the relevant feature set into the model definition. + +## Creating feature sets + +To create a feature set, you can send a POST request. When creating the feature set, you provide a name and an optional list of features, as shown in the following example request: + +```json +POST _ltr/_featureset/more_movie_features +{ + "featureset": { + "features": [ + { + "name": "title_query", + "params": [ + "keywords" + ], + "template_language": "mustache", + "template": { + "match": { + "title": "{% raw %}{{keywords}}{% endraw %}" + } + } + }, + { + "name": "title_query_boost", + "params": [ + "some_multiplier" + ], + "template_language": "derived_expression", + "template": "title_query * some_multiplier" + }, + { + "name": "custom_title_query_boost", + "params": [ + "some_multiplier" + ], + "template_language": "script_feature", + "template": { + "lang": "painless", + "source": "params.feature_vector.get('title_query') * (long)params.some_multiplier", + "params": { + "some_multiplier": "some_multiplier" + } + } + } + ] + } +} +``` +{% include copy-curl.html %} + +## Managing feature sets + +To fetch a specific feature set, you can use the following request: + +``` +GET _ltr/_featureset/more_movie_features +``` +{% include copy-curl.html %} + +To see a list of all defined feature sets, you can use the following request: + +``` +GET _ltr/_featureset +``` +{% include copy-curl.html %} + +If you have many feature sets, you can filter the list by using a prefix, as shown in the following example request: + +``` +GET _ltr/_featureset?prefix=mor +``` +{% include copy-curl.html %} + +This returns only the feature sets with names starting with `mor`. + +If you need to start over, you can delete a feature set using the following request: + +``` +DELETE _ltr/_featureset/more_movie_features +``` +{% include copy-curl.html %} + +## Validating features + +When adding new features, you should validate that the features work as expected. 
You can do this by adding a `validation` block in your feature creation request. This allows the Learning to Rank plugin to run the query before adding the feature, catching any issues early. If you do not run this validation, you may not discover until later that the query, while valid JSON, contains a malformed OpenSearch query. + +To run validation, you can specify the test parameters and the index to use, as shown in the following example validation block: + +```json +"validation": { + "params": { + "keywords": "rambo" + }, + "index": "tmdb" +}, +``` +{% include copy-curl.html %} + +Place the validation block alongside your feature set definition. In the following example, the `match` query is malformed (curly brackets are missing in the Mustache template). The validation fails, returning an error: + +```json +{ + "validation": { + "params": { + "keywords": "rambo" + }, + "index": "tmdb" + }, + "featureset": { + "features": [ + { + "name": "title_query", + "params": [ + "keywords" + ], + "template_language": "mustache", + "template": { + "match": { + "title": "{% raw %}{{keywords{% endraw %}" + } + } + } + ] + } +} +``` +{% include copy-curl.html %} + +## Expanding feature sets + +You may not initially know which features are the most useful. In these cases, you can later add new features to an existing feature set for logging and model evaluation. For example, if you want to create a `user_rating` feature, you can use the Feature Set Append API, as shown in the following example request: + +```json +POST /_ltr/_featureset/my_featureset/_addfeatures +{ + "features": [{ + "name": "user_rating", + "params": [], + "template_language": "mustache", + "template" : { + "function_score": { + "functions": { + "field": "vote_average" + }, + "query": { + "match_all": {} + } + } + } + }] +} +``` +{% include copy-curl.html %} + +## Enforcing unique feature names + +The Learning to Rank plugin enforces unique names for each feature. This is because some model training libraries refer to features by name. In the preceding example, you could not add a new `user_rating` feature without causing an error because that feature name is already in use. + +## Treating feature sets as lists + +Feature sets are more like ordered lists than simple sets. Each feature has both a name and an ordinal position. Some LTR training applications, such as RankLib, refer to features by their ordinal position (for example, 1st feature, 2nd feature). Others may use the feature name. When working with logged features, you may need to handle both the ordinal and the name because the ordinal is preserved to maintain the list order. + +## Next steps + +Learn about [feature engineering]({{site.url}}{{site.baseurl}}/search-plugins/ltr/feature-engineering/) and [advanced functionality]({{site.url}}{{site.baseurl}}/search-plugins/ltr/advanced-functionality/). diff --git a/_search-plugins/multimodal-search.md b/_search-plugins/multimodal-search.md deleted file mode 100644 index 6c7ddeed5b6..00000000000 --- a/_search-plugins/multimodal-search.md +++ /dev/null @@ -1,133 +0,0 @@ ---- -layout: default -title: Multimodal search -nav_order: 40 -has_children: false -redirect_from: - - /search-plugins/neural-multimodal-search/ ---- - -# Multimodal search -Introduced 2.11 -{: .label .label-purple } - -Use multimodal search to search text and image data. In neural search, text search is facilitated by multimodal embedding models. - -**PREREQUISITE**<br> -Before using text search, you must set up a multimodal embedding model. 
For more information, see [Choosing a model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/integrating-ml-models/#choosing-a-model). -{: .note} - -## Using multimodal search - -To use neural search with text and image embeddings, follow these steps: - -1. [Create an ingest pipeline](#step-1-create-an-ingest-pipeline). -1. [Create an index for ingestion](#step-2-create-an-index-for-ingestion). -1. [Ingest documents into the index](#step-3-ingest-documents-into-the-index). -1. [Search the index using neural search](#step-4-search-the-index-using-neural-search). - -## Step 1: Create an ingest pipeline - -To generate vector embeddings, you need to create an [ingest pipeline]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/index/) that contains a [`text_image_embedding` processor]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/processors/text-image-embedding/), which will convert the text or image in a document field to vector embeddings. The processor's `field_map` determines the text and image fields from which to generate vector embeddings and the output vector field in which to store the embeddings. - -The following example request creates an ingest pipeline where the text from `image_description` and an image from `image_binary` will be converted into text embeddings and the embeddings will be stored in `vector_embedding`: - -```json -PUT /_ingest/pipeline/nlp-ingest-pipeline -{ - "description": "A text/image embedding pipeline", - "processors": [ - { - "text_image_embedding": { - "model_id": "-fYQAosBQkdnhhBsK593", - "embedding": "vector_embedding", - "field_map": { - "text": "image_description", - "image": "image_binary" - } - } - } - ] -} -``` -{% include copy-curl.html %} - -## Step 2: Create an index for ingestion - -In order to use the text embedding processor defined in your pipeline, create a k-NN index, adding the pipeline created in the previous step as the default pipeline. Ensure that the fields defined in the `field_map` are mapped as correct types. Continuing with the example, the `vector_embedding` field must be mapped as a k-NN vector with a dimension that matches the model dimension. Similarly, the `image_description` field should be mapped as `text`, and the `image_binary` should be mapped as `binary`. - -The following example request creates a k-NN index that is set up with a default ingest pipeline: - -```json -PUT /my-nlp-index -{ - "settings": { - "index.knn": true, - "default_pipeline": "nlp-ingest-pipeline", - "number_of_shards": 2 - }, - "mappings": { - "properties": { - "vector_embedding": { - "type": "knn_vector", - "dimension": 1024, - "method": { - "name": "hnsw", - "engine": "lucene", - "parameters": {} - } - }, - "image_description": { - "type": "text" - }, - "image_binary": { - "type": "binary" - } - } - } -} -``` -{% include copy-curl.html %} - -For more information about creating a k-NN index and its supported methods, see [k-NN index]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/). - -## Step 3: Ingest documents into the index - -To ingest documents into the index created in the previous step, send the following request: - -```json -PUT /nlp-index/_doc/1 -{ - "image_description": "Orange table", - "image_binary": "iVBORw0KGgoAAAANSUI..." -} -``` -{% include copy-curl.html %} - -Before the document is ingested into the index, the ingest pipeline runs the `text_image_embedding` processor on the document, generating vector embeddings for the `image_description` and `image_binary` fields. 
In addition to the original `image_description` and `image_binary` fields, the indexed document includes the `vector_embedding` field, which contains the combined vector embeddings. - -## Step 4: Search the index using neural search - -To perform vector search on your index, use the `neural` query clause either in the [k-NN plugin API]({{site.url}}{{site.baseurl}}/search-plugins/knn/api/#search-for-a-model) or [Query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index/) queries. You can refine the results by using a [k-NN search filter]({{site.url}}{{site.baseurl}}/search-plugins/knn/filter-search-knn/). You can search by text, image, or both text and image. - -The following example request uses a neural query to search for text and image: - -```json -GET /my-nlp-index/_search -{ - "size": 10, - "query": { - "neural": { - "vector_embedding": { - "query_text": "Orange table", - "query_image": "iVBORw0KGgoAAAANSUI...", - "model_id": "-fYQAosBQkdnhhBsK593", - "k": 5 - } - } - } -} -``` -{% include copy-curl.html %} - -To eliminate passing the model ID with each neural query request, you can set a default model on a k-NN index or a field. To learn more, see [Setting a default model on an index or field]({{site.url}}{{site.baseurl}}/search-plugins/neural-text-search/##setting-a-default-model-on-an-index-or-field). diff --git a/_search-plugins/neural-search.md b/_search-plugins/neural-search.md deleted file mode 100644 index 931c9ce593b..00000000000 --- a/_search-plugins/neural-search.md +++ /dev/null @@ -1,54 +0,0 @@ ---- -layout: default -title: Neural search -nav_order: 25 -has_children: false -has_toc: false -redirect_from: - - /neural-search-plugin/index/ ---- - -# Neural search - -Neural search transforms text into vectors and facilitates vector search both at ingestion time and at search time. During ingestion, neural search transforms document text into vector embeddings and indexes both the text and its vector embeddings in a vector index. When you use a neural query during search, neural search converts the query text into vector embeddings, uses vector search to compare the query and document embeddings, and returns the closest results. - -Before you ingest documents into an index, documents are passed through a machine learning (ML) model, which generates vector embeddings for the document fields. When you send a search request, the query text or image is also passed through the ML model, which generates the corresponding vector embeddings. Then neural search performs a vector search on the embeddings and returns matching documents. - -## Prerequisite - -Before using neural search, you must set up an ML model. When selecting a model, you have the following options: - -- Use a pretrained model provided by OpenSearch. For more information, see [OpenSearch-provided pretrained models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/). - -- Upload your own model to OpenSearch. For more information, see [Custom local models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/custom-local-models/). - -- Connect to a foundation model hosted on an external platform. For more information, see [Connecting to remote models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index/). - - -## Tutorial - -For a step-by-step tutorial, see [Neural search tutorial]({{site.url}}{{site.baseurl}}/search-plugins/neural-search-tutorial/). 
- -## Using an ML model for neural search - -Once you set up an ML model, choose one of the following search methods to use your model for neural search. - -### Semantic search - -Semantic search uses dense retrieval based on text embedding models to search text data. For detailed setup instructions, see [Semantic search]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/). - -### Hybrid search - -Hybrid search combines keyword and neural search to improve search relevance. For detailed setup instructions, see [Hybrid search]({{site.url}}{{site.baseurl}}/search-plugins/hybrid-search/). - -### Multimodal search - -Multimodal search uses neural search with multimodal embedding models to search text and image data. For detailed setup instructions, see [Multimodal search]({{site.url}}{{site.baseurl}}/search-plugins/multimodal-search/). - -### Sparse search - -Sparse search uses neural search with sparse retrieval based on sparse embedding models to search text data. For detailed setup instructions, see [Sparse search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-search/). - -### Conversational search - -With conversational search, you can ask questions in natural language, receive a text response, and ask additional clarifying questions. For detailed setup instructions, see [Conversational search]({{site.url}}{{site.baseurl}}/search-plugins/conversational-search/). diff --git a/_search-plugins/querqy/index.md b/_search-plugins/querqy/index.md index 4ec0c8eb1ac..19ceab27504 100644 --- a/_search-plugins/querqy/index.md +++ b/_search-plugins/querqy/index.md @@ -1,7 +1,8 @@ --- layout: default title: Querqy -parent: Search relevance +parent: Query rewriting +grand_parent: Search relevance has_children: false redirect_from: - /search-plugins/querqy/ @@ -28,7 +29,7 @@ Answer `yes` to the security prompts during the installation as Querqy requires After installing the Querqy plugin you can find comprehensive documentation on the Querqy.org site: [Querqy](https://docs.querqy.org/querqy/index.html) -## Path and HTTP methods +## Endpoints ``` POST /myindex/_search diff --git a/_search-plugins/search-pipelines/debugging-search-pipeline.md b/_search-plugins/search-pipelines/debugging-search-pipeline.md new file mode 100644 index 00000000000..c44f40bc0f9 --- /dev/null +++ b/_search-plugins/search-pipelines/debugging-search-pipeline.md @@ -0,0 +1,230 @@ +--- +layout: default +title: Debugging a search pipeline +nav_order: 25 +has_children: false +parent: Search pipelines +grand_parent: Search +--- + + +# Debugging a search pipeline + +The `verbose_pipeline` parameter provides detailed information about the data flow and transformations for the search request, search response, and search phase processors in the search pipeline. It helps with troubleshooting and optimizing the pipeline and ensures transparency in handling search requests and responses. + +## Enabling debugging + +To enable pipeline debugging, specify `verbose_pipeline=true` as a query parameter in your search request. 
This functionality is available for all three search pipeline methods: + +- [Default search pipeline](#default-search-pipeline) +- [Specific search pipeline](#specific-search-pipeline) +- [Temporary search pipeline](#temporary-search-pipeline) + +### Default search pipeline + +To use `verbose_pipeline` with a default search pipeline, set the pipeline as the default in the index settings and include `verbose_pipeline=true` in the query: + +```json +PUT /my_index/_settings +{ + "index.search.default_pipeline": "my_pipeline" +} +``` +{% include copy-curl.html %} + +```json +GET /my_index/_search?verbose_pipeline=true +``` +{% include copy-curl.html %} + +For more information about default search pipelines, see [Setting a default pipeline for all requests in an index]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/using-search-pipeline/#default-search-pipeline). + +### Specific search pipeline + +To use `verbose_pipeline` with a specific search pipeline, specify the pipeline ID and include `verbose_pipeline=true` in the query: + +```json +GET /my_index/_search?search_pipeline=my_pipeline&verbose_pipeline=true +``` +{% include copy-curl.html %} + +For more information about using specific search pipelines, see [Specifying an existing pipeline for a request]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/using-search-pipeline/#specifying-an-existing-search-pipeline-for-a-request). + +### Temporary search pipeline + +To use `verbose_pipeline` with a temporary search pipeline, define the pipeline directly in the request body and include `verbose_pipeline=true` in the query: + +```json +POST /my_index/_search?verbose_pipeline=true +{ + "query": { + "match": { "text_field": "some search text" } + }, + "search_pipeline": { + "request_processors": [ + { + "filter_query": { + "query": { "term": { "visibility": "public" } } + } + } + ], + "response_processors": [ + { + "collapse": { + "field": "category" + } + } + ] + } +} +``` +{% include copy-curl.html %} + +For more information about using a temporary search pipeline, see [Using a temporary pipeline for a request]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/using-search-pipeline/#using-a-temporary-search-pipeline-for-a-request). 
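+
+If you are calling OpenSearch from application code, you can pass the flag as a query string parameter in the same way. The following Python sketch is a minimal example, assuming the `opensearch-py` client and the `my_index` index and `my_pipeline` pipeline used in the preceding examples:
+
+```python
+# Minimal sketch: run a search with pipeline debugging enabled (assumptions noted above).
+from opensearchpy import OpenSearch
+
+client = OpenSearch(hosts=[{"host": "localhost", "port": 9200}])
+
+response = client.search(
+    index="my_index",
+    body={"query": {"match": {"text_field": "some search text"}}},
+    params={
+        "search_pipeline": "my_pipeline",  # omit if a default pipeline is set on the index
+        "verbose_pipeline": "true",        # adds processor_results to the response
+    },
+)
+
+# Inspect the per-processor input and output captured by the verbose pipeline.
+for step in response.get("processor_results", []):
+    print(step["processor_name"], step["status"])
+```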
+ +## Example response + +When the `verbose_pipeline` parameter is enabled, the response contains an additional `processor_results` field that provides information about the transformations applied by each processor in the pipeline: + +<details open markdown="block"> + <summary> + Response + </summary> + {: .text-delta} + +```json +{ + "took": 27, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.18232156, + "hits": [ + { + "_index": "my_index", + "_id": "1", + "_score": 0.18232156, + "_source": { + "notification": "This is a public message", + "visibility": "public" + } + } + ] + }, + "processor_results": [ + { + "processor_name": "filter_query", + "tag": "tag1", + "duration_millis": 288541, + "status": "success", + "input_data": { + "verbose_pipeline": true, + "query": { + "bool": { + "adjust_pure_negative": true, + "must": [ + { + "match": { + "message": { + "auto_generate_synonyms_phrase_query": true, + "query": "this", + "zero_terms_query": "NONE", + "fuzzy_transpositions": true, + "boost": 1.0, + "prefix_length": 0, + "operator": "OR", + "lenient": false, + "max_expansions": 50 + } + } + } + ], + "boost": 1.0 + } + } + }, + "output_data": { + "verbose_pipeline": true, + "query": { + "bool": { + "filter": [ + { + "term": { + "visibility": { + "boost": 1.0, + "value": "public" + } + } + } + ], + "adjust_pure_negative": true, + "must": [ + { + "bool": { + "adjust_pure_negative": true, + "must": [ + { + "match": { + "message": { + "auto_generate_synonyms_phrase_query": true, + "query": "this", + "zero_terms_query": "NONE", + "fuzzy_transpositions": true, + "boost": 1.0, + "prefix_length": 0, + "operator": "OR", + "lenient": false, + "max_expansions": 50 + } + } + } + ], + "boost": 1.0 + } + } + ], + "boost": 1.0 + } + } + } + }, + { + "processor_name": "rename_field", + "duration_millis": 250042, + "status": "success", + "input_data": [ + { + "_index": "my_index", + "_id": "1", + "_score": 0.18232156, + "_source": { + "message": "This is a public message", + "visibility": "public" + } + } + ], + "output_data": [ + { + "_index": "my_index", + "_id": "1", + "_score": 0.18232156, + "_source": { + "notification": "This is a public message", + "visibility": "public" + } + } + ] + } + ] +} +``` +</details> \ No newline at end of file diff --git a/_search-plugins/search-pipelines/explanation-processor.md b/_search-plugins/search-pipelines/explanation-processor.md new file mode 100644 index 00000000000..4714f4a0ca2 --- /dev/null +++ b/_search-plugins/search-pipelines/explanation-processor.md @@ -0,0 +1,220 @@ +--- +layout: default +title: Hybrid score explanation +nav_order: 15 +has_children: false +parent: Search processors +grand_parent: Search pipelines +--- + +# Hybrid score explanation processor +Introduced 2.19 +{: .label .label-purple } + +The `hybrid_score_explanation` response processor adds the normalization and combination results to the returned search response. You can use it as a debugging tool to understand the score normalization process. For more information, see [Hybrid query]({{site.url}}{{site.baseurl}}/query-dsl/compound/hybrid/). + +To use the `explain` parameter, you must configure the `hybrid_score_explanation` response processor in your search pipeline. +{: .important} + +## Request body fields + +The following table lists all request fields. 
+ +Field | Data type | Description +:--- | :--- | :--- +`tag` | String | The processor's identifier. Optional. +`description` | String | A description of the processor. Optional. +`ignore_failure` | Boolean | If `true`, OpenSearch [ignores any failure]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/#ignoring-processor-failures) of this processor and continues to run the remaining processors in the search pipeline. Optional. Default is `false`. + +## Example + +The following example demonstrates using a search pipeline with a `hybrid_score_explanation` processor. + +For a comprehensive example, follow the [Getting started with semantic and hybrid search]({{site.url}}{{site.baseurl}}/ml-commons-plugin/semantic-search#tutorial). + +### Creating a search pipeline + +The following request creates a search pipeline containing a `normalization-processor` and a `hybrid_score_explanation` processor: + +```json +PUT /_search/pipeline/nlp-search-pipeline +{ + "description": "Post processor for hybrid search", + "phase_results_processors": [ + { + "normalization-processor": { + "normalization": { + "technique": "min_max" + }, + "combination": { + "technique": "arithmetic_mean" + } + } + } + ], + "response_processors": [ + { + "hybrid_score_explanation": {} + } + ] +} +``` +{% include copy-curl.html %} + +### Using a search pipeline + +To see explanation information, specify `explain=true` in your search request: + +```json +GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline&explain=true +{ + "_source": { + "exclude": [ + "passage_embedding" + ] + }, + "query": { + "hybrid": { + "queries": [ + { + "match": { + "text": { + "query": "horse" + } + } + }, + { + "neural": { + "passage_embedding": { + "query_text": "wild west", + "model_id": "aVeif4oB5Vm0Tdw8zYO2", + "k": 5 + } + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "took": 54, + "timed_out": false, + "_shards": { + "total": 2, + "successful": 2, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 5, + "relation": "eq" + }, + "max_score": 0.9251075, + "hits": [ + { + "_shard": "[my-nlp-index][0]", + "_node": "IsuzeVYdSqKUfy0qfqil2w", + "_index": "my-nlp-index", + "_id": "5", + "_score": 0.9251075, + "_source": { + "text": "A rodeo cowboy , wearing a cowboy hat , is being thrown off of a wild white horse .", + "id": "2691147709.jpg" + }, + "_explanation": { + "value": 0.9251075, + "description": "arithmetic_mean combination of:", + "details": [ + { + "value": 1.0, + "description": "min_max normalization of:", + "details": [ + { + "value": 1.2336599, + "description": "weight(text:horse in 0) [PerFieldSimilarity], result of:", + "details": [ + { + "value": 1.2336599, + "description": "score(freq=1.0), computed as boost * idf * tf from:", + "details": [ + { + "value": 2.2, + "description": "boost", + "details": [] + }, + { + "value": 1.2039728, + "description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:", + "details": [ + { + "value": 1, + "description": "n, number of documents containing term", + "details": [] + }, + { + "value": 4, + "description": "N, total number of documents with field", + "details": [] + } + ] + }, + { + "value": 0.46575344, + "description": "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:", + "details": [ + { + "value": 1.0, + "description": "freq, occurrences of term within document", + "details": [] + }, + { + "value": 1.2, + "description": "k1, term saturation parameter", + 
"details": [] + }, + { + "value": 0.75, + "description": "b, length normalization parameter", + "details": [] + }, + { + "value": 16.0, + "description": "dl, length of field", + "details": [] + }, + { + "value": 17.0, + "description": "avgdl, average length of field", + "details": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "value": 0.8503647, + "description": "min_max normalization of:", + "details": [ + { + "value": 0.015177966, + "description": "within top 5", + "details": [] + } + ] + } + ] +... +``` + +For more information about setting up hybrid search, see [Hybrid search]({{site.url}}{{site.baseurl}}/search-plugins/hybrid-search/). \ No newline at end of file diff --git a/_search-plugins/search-pipelines/index.md b/_search-plugins/search-pipelines/index.md index d4edc289d36..361b3dd6697 100644 --- a/_search-plugins/search-pipelines/index.md +++ b/_search-plugins/search-pipelines/index.md @@ -4,6 +4,8 @@ title: Search pipelines nav_order: 100 has_children: true has_toc: false +redirect_from: + - /search-plugins/search-pipelines/ --- # Search pipelines diff --git a/_search-plugins/search-pipelines/ml-inference-search-request.md b/_search-plugins/search-pipelines/ml-inference-search-request.md index a072458a41a..8eddb0400b8 100644 --- a/_search-plugins/search-pipelines/ml-inference-search-request.md +++ b/_search-plugins/search-pipelines/ml-inference-search-request.md @@ -57,7 +57,7 @@ The following table lists the required and optional parameters for the `ml-infer | `model_id`| String | Required | The ID of the ML model used by the processor. | | `query_template` | String | Optional | A query string template used to construct a new query containing a `new_document_field`. Often used when rewriting a search query to a new query type. | | `function_name` | String | Optional for externally hosted models<br/><br/>Required for local models | The function name of the ML model configured in the processor. For local models, valid values are `sparse_encoding`, `sparse_tokenize`, `text_embedding`, and `text_similarity`. For externally hosted models, valid value is `remote`. Default is `remote`. | -| `model_config` | Object | Optional | Custom configuration options for the ML model. For more information, see [The `model_config` object]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/#the-model_config-object). | +| `model_config` | Object | Optional | Custom configuration options for the ML model. For externally hosted models, if set, this configuration overrides the default connector parameters. For local models, you can add `model_config` to `model_input` to override the model configuration set during registration. For more information, see [The `model_config` object]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/#the-model_config-object). | | `model_input` | String | Optional for externally hosted models<br/><br/>Required for local models | A template that defines the input field format expected by the model. Each local model type might use a different set of inputs. For externally hosted models, default is `"{ \"parameters\": ${ml_inference.parameters} }`. | | `input_map` | Array | Required | An array specifying how to map query string fields to the model input fields. Each element of the array is a map in the `"<model_input_field>": "<query_input_field>"` format and corresponds to one model invocation of a document field. 
If no input mapping is specified for an externally hosted model, then all document fields are passed to the model directly as input. The `input_map` size indicates the number of times the model is invoked (the number of Predict API requests). | | `<model_input_field>` | String | Required | The model input field name. | @@ -66,8 +66,8 @@ The following table lists the required and optional parameters for the `ml-infer | `<query_output_field>` | String | Required | The name of the query field in which the model's output (specified by `model_output`) is stored. | | `<model_output_field>` | String | Required | The name or JSON path of the field in the model output to be stored in the `query_output_field`. | | `full_response_path` | Boolean | Optional | Set this parameter to `true` if the `model_output_field` contains a full JSON path to the field instead of the field name. The model output will then be fully parsed to get the value of the field. Default is `true` for local models and `false` for externally hosted models. | -| `ignore_missing` | Boolean | Optional | If `true` and any of the input fields defined in the `input_map` or `output_map` are missing, then the missing fields are ignored. Otherwise, a missing field causes a failure. Default is `false`. | -| `ignore_failure` | Boolean | Optional | Specifies whether the processor continues execution even if it encounters an error. If `true`, then any failure is ignored and the search continues. If `false`, then any failure causes the search to be canceled. Default is `false`. | +| `ignore_missing` | Boolean | Optional | If `true` and any of the input fields defined in the `input_map` or `output_map` are missing, then this processor is ignored. Otherwise, a missing field causes a failure. Default is `false`. | +| `ignore_failure` | Boolean | Optional | Specifies whether the processor continues execution even if it encounters an error. If `true`, then this processor is ignored and the search continues. If `false`, then any failure causes the search to be canceled. Default is `false`. | | `max_prediction_tasks` | Integer | Optional | The maximum number of concurrent model invocations that can run during query search. Default is `10`. | | `description` | String | Optional | A brief description of the processor. | | `tag` | String | Optional | An identifier tag for the processor. Useful for debugging to distinguish between processors of the same type. 
| @@ -168,7 +168,7 @@ The following request creates a search pipeline that rewrites the preceding term PUT /_search/pipeline/ml_inference_pipeline { "description": "Generate passage_embedding for searched documents", - "processors": [ + "request_processors": [ { "ml_inference": { "model_id": "<your model id>", @@ -413,7 +413,7 @@ The following is the final configuration of the `ml_inference` processor with th PUT /_search/pipeline/ml_inference_pipeline_local { "description": "searchs reviews and generates embeddings", - "processors": [ + "request_processors": [ { "ml_inference": { "function_name": "text_embedding", diff --git a/_search-plugins/search-pipelines/ml-inference-search-response.md b/_search-plugins/search-pipelines/ml-inference-search-response.md index efd24e13c8c..5f8b1034afd 100644 --- a/_search-plugins/search-pipelines/ml-inference-search-response.md +++ b/_search-plugins/search-pipelines/ml-inference-search-response.md @@ -56,7 +56,7 @@ The following table lists the required and optional parameters for the `ml-infer |:--| :--- | :--- |:---| | `model_id` | String | Required | The ID of the ML model used by the processor. | | `function_name` | String | Optional for externally hosted models<br/><br/>Required for local models | The function name of the ML model configured in the processor. For local models, valid values are `sparse_encoding`, `sparse_tokenize`, `text_embedding`, and `text_similarity`. For externally hosted models, valid value is `remote`. Default is `remote`. | -| `model_config` | Object | Optional | Custom configuration options for the ML model. For more information, see [The `model_config` object]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/#the-model_config-object).| +| `model_config` | Object | Optional | Custom configuration options for the ML model. For externally hosted models, if set, this configuration overrides the default connector parameters. For local models, you can add `model_config` to `model_input` to override the model configuration set during registration. For more information, see [The `model_config` object]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/#the-model_config-object). | | `model_input` | String | Optional for externally hosted models<br/><br/>Required for local models | A template that defines the input field format expected by the model. Each local model type might use a different set of inputs. For externally hosted models, default is `"{ \"parameters\": ${ml_inference.parameters} }`. | | `input_map` | Array | Optional for externally hosted models<br/><br/>Required for local models | An array specifying how to map document fields in the search response to the model input fields. Each element of the array is a map in the `"<model_input_field>": "<document_field>"` format and corresponds to one model invocation of a document field. If no input mapping is specified for an externally hosted model, then all document fields are passed to the model directly as input. The `input_map` size indicates the number of times the model is invoked (the number of Predict API requests). | | `<model_input_field>` | String | Optional for externally hosted models<br/><br/>Required for local models | The model input field name. 
| @@ -65,8 +65,8 @@ The following table lists the required and optional parameters for the `ml-infer | `<new_document_field>` | String | Optional for externally hosted models<br/><br/>Required for local models | The name of the new field in the document in which the model's output (specified by `model_output`) is stored. If no output mapping is specified for externally hosted models, then all fields from the model output are added to the new document field. | | `<model_output_field>` | String | Optional for externally hosted models<br/><br/>Required for local models | The name or JSON path of the field in the model output to be stored in the `new_document_field`. | | `full_response_path` | Boolean | Optional | Set this parameter to `true` if the `model_output_field` contains a full JSON path to the field instead of the field name. The model output will then be fully parsed to get the value of the field. Default is `true` for local models and `false` for externally hosted models. | -| `ignore_missing` | Boolean | Optional | If `true` and any of the input fields defined in the `input_map` or `output_map` are missing, then the missing fields are ignored. Otherwise, a missing field causes a failure. Default is `false`. | -| `ignore_failure` | Boolean | Optional | Specifies whether the processor continues execution even if it encounters an error. If `true`, then any failure is ignored and the search continues. If `false`, then any failure causes the search to be canceled. Default is `false`. | +| `ignore_missing` | Boolean | Optional | If `true` and any of the input fields defined in the `input_map` or `output_map` are missing, then this processor is ignored. Otherwise, a missing field causes a failure. Default is `false`. | +| `ignore_failure` | Boolean | Optional | Specifies whether the processor continues execution even if it encounters an error. If `true`, then this processor is ignored and the search continues. If `false`, then any failure causes the search to be canceled. Default is `false`. | | `override` | Boolean | Optional | Relevant if a document in the response already contains a field with the name specified in `<new_document_field>`. If `override` is `false`, then the input field is skipped. If `true`, then the existing field value is overridden by the new model output. Default is `false`. | | `max_prediction_tasks` | Integer | Optional | The maximum number of concurrent model invocations that can run during document search. Default is `10`. | | `one_to_one` | Boolean | Optional | Set this parameter to `true` to invoke the model once (make one Predict API request) for each document. Default value (`false`) specifies to invoke the model with all documents from the search response, making one Predict API request. | @@ -96,7 +96,188 @@ For local models, you must provide a `model_input` field that specifies the mode For remote models, the `model_input` field is optional, and its default value is `"{ \"parameters\": ${ml_inference.parameters} }`. -### Example: Externally hosted model +### Example: Local model + +The following example shows you how to configure an `ml_inference` search response processor with a local model. + +**Step 1: Create a pipeline** + +The following example shows you how to create a search pipeline for the `huggingface/sentence-transformers/all-distilroberta-v1` local model. The model is a [pretrained sentence transformer model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#sentence-transformers) hosted in your OpenSearch cluster. 
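Before creating the pipeline, you can optionally confirm that the deployed model responds to the Predict API call shown below. The following minimal Python sketch issues that call using the `requests` library; the cluster endpoint, credentials, and model ID are placeholder assumptions that you must replace with values from your environment:

```python
# Optional sanity check (sketch): call the Predict API for the deployed local
# model before building the pipeline. The endpoint, credentials, and model ID
# are placeholders for your environment.
import requests

OPENSEARCH_URL = "https://localhost:9200"  # assumption: local cluster
MODEL_ID = "cleMb4kBJ1eYAeTMFFg4"          # replace with your model ID

response = requests.post(
    f"{OPENSEARCH_URL}/_plugins/_ml/_predict/text_embedding/{MODEL_ID}",
    json={
        "text_docs": ["today is sunny"],
        "return_number": True,
        "target_response": ["sentence_embedding"],
    },
    auth=("admin", "<your password>"),  # assumption: basic authentication
    verify=False,                       # assumption: self-signed certificate
)

embedding = response.json()["inference_results"][0]["output"][0]["data"]
print(len(embedding))  # the model returns 768-dimensional embeddings
```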
+ +If you invoke the model using the Predict API, then the request appears as follows: + +```json +POST /_plugins/_ml/_predict/text_embedding/cleMb4kBJ1eYAeTMFFg4 +{ + "text_docs":[ "today is sunny"], + "return_number": true, + "target_response": ["sentence_embedding"] +} +``` + +Using this schema, specify the `model_input` as follows: + +```json + "model_input": "{ \"text_docs\": ${input_map.text_docs}, \"return_number\": ${model_config.return_number}, \"target_response\": ${model_config.target_response} }" +``` + +In the `input_map`, map the `passage_text` document field to the `text_docs` field expected by the model: + +```json +"input_map": [ + { + "text_docs": "passage_text" + } +] +``` + +Because you specified the field to be converted into embeddings as a JSON path, you need to set the `full_response_path` to `true`. Then the full JSON document is parsed in order to obtain the input field: + +```json +"full_response_path": true +``` + +The text in the `passage_text` field will be used to generate embeddings: + +```json +{ + "passage_text": "hello world" +} +``` + +The Predict API request returns the following response: + +```json +{ + "inference_results" : [ + { + "output" : [ + { + "name" : "sentence_embedding", + "data_type" : "FLOAT32", + "shape" : [ + 768 + ], + "data" : [ + 0.25517133, + -0.28009856, + 0.48519906, + ... + ] + } + ] + } + ] +} +``` + +The model generates embeddings in the `$.inference_results.*.output.*.data` field. The `output_map` maps this field to the newly created `passage_embedding` field in the search response document: + +```json +"output_map": [ + { + "passage_embedding": "$.inference_results.*.output.*.data" + } +] +``` + +To configure an `ml_inference` search response processor with a local model, specify the `function_name` explicitly. In this example, the `function_name` is `text_embedding`. For information about valid `function_name` values, see [Request fields](#request-body-fields). 
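To see exactly what the `$.inference_results.*.output.*.data` path extracts before you rely on it in the `output_map`, you can walk the Predict API response manually. The following minimal Python sketch, provided for illustration only and using a truncated copy of the response shown above, resolves the wildcard path by hand:

```python
# Illustration only: resolve "$.inference_results.*.output.*.data" by hand
# against a truncated copy of the Predict API response shown above.
response = {
    "inference_results": [
        {
            "output": [
                {
                    "name": "sentence_embedding",
                    "data_type": "FLOAT32",
                    "shape": [768],
                    "data": [0.25517133, -0.28009856, 0.48519906],  # truncated
                }
            ]
        }
    ]
}

# Each wildcard (*) fans out over a list, so the path yields one `data` array
# per inference result and per output entry.
embeddings = [
    output["data"]
    for result in response["inference_results"]
    for output in result["output"]
]

print(len(embeddings))    # 1 extracted embedding
print(embeddings[0][:3])  # values that the processor stores in `passage_embedding`
```

The array extracted this way is what the processor writes to the `passage_embedding` field in each returned document.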
+ +The following is the final configuration of the `ml_inference` search response processor with the local model: + +```json +PUT /_search/pipeline/ml_inference_pipeline_local +{ + "description": "search passage and generates embeddings", + "response_processors": [ + { + "ml_inference": { + "function_name": "text_embedding", + "full_response_path": true, + "model_id": "<your model id>", + "model_config": { + "return_number": true, + "target_response": ["sentence_embedding"] + }, + "model_input": "{ \"text_docs\": ${input_map.text_docs}, \"return_number\": ${model_config.return_number}, \"target_response\": ${model_config.target_response} }", + "input_map": [ + { + "text_docs": "passage_text" + } + ], + "output_map": [ + { + "passage_embedding": "$.inference_results.*.output.*.data" + } + ], + "ignore_missing": true, + "ignore_failure": true + } + } + ] +} +``` +{% include copy-curl.html %} + +**Step 2: Run the pipeline** + +Run the following query, providing the pipeline name in the request: + +```json +GET /my_index/_search?search_pipeline=ml_inference_pipeline_local +{ +"query": { + "term": { + "passage_text": { + "value": "hello" + } + } + } +} +``` +{% include copy-curl.html %} + +#### Response + +The response confirms that the processor has generated text embeddings in the `passage_embedding` field: + +```json +{ + "took": 288, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.00009405752, + "hits": [ + { + "_index": "my_index", + "_id": "1", + "_score": 0.00009405752, + "_source": { + "passage_text": "hello world", + "passage_embedding": [ + 0.017304314, + -0.021530833, + 0.050184276, + 0.08962978, + ...] + } + } + ] + } +} +``` + +### Example: Externally hosted text embedding model The following example shows you how to configure an `ml_inference` search response processor with an externally hosted model. @@ -108,7 +289,7 @@ The following example shows you how to create a search pipeline for an externall PUT /_search/pipeline/ml_inference_pipeline { "description": "Generate passage_embedding when search documents", - "processors": [ + "response_processors": [ { "ml_inference": { "model_id": "<your model id>", @@ -172,7 +353,6 @@ GET /my_index/_search?search_pipeline=ml_inference_pipeline_local The response confirms that the processor has generated text embeddings in the `passage_embedding` field. The document within `_source` now contains both the `passage_text` and `passage_embedding` fields: ```json - { "took": 288, "timed_out": false, @@ -209,140 +389,312 @@ The response confirms that the processor has generated text embeddings in the `p } ``` -### Example: Local model +### Example: Externally hosted large language model -The following example shows you how to configure an `ml_inference` search response processor with a local model. +This example demonstrates how to configure an `ml_inference` search response processor to work with an externally hosted large language model (LLM) and map the model's response to the search extension object. Using the `ml_inference` processor, you can enable an LLM to summarize search results directly within the response. The summary is included in the `ext` field of the search response, providing seamless access to AI-generated insights alongside the original search results. 
-**Step 1: Create a pipeline** +**Prerequisite** -The following example shows you how to create a search pipeline for the `huggingface/sentence-transformers/all-distilroberta-v1` local model. The model is a [pretrained sentence transformer model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#sentence-transformers) hosted in your OpenSearch cluster. +You must configure an externally hosted LLM for this use case. For more information about externally hosted models, see [Connecting to externally hosted models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index/). Once you register the LLM, you can use the following request to test it. This request requires providing the `prompt` and `context` fields: -If you invoke the model using the Predict API, then the request appears as follows: +```json +POST /_plugins/_ml/models/KKne6JIBAs32TwoK-FFR/_predict +{ + "parameters": { + "prompt":"\n\nHuman: You are a professional data analysist. You will always answer question: Which month had the lowest customer acquisition cost per new customer? based on the given context first. If the answer is not directly shown in the context, you will analyze the data and find the answer. If you don't know the answer, just say I don't know. Context: ${parameters.context.toString()}. \n\n Assistant:", + "context":"Customer acquisition cost: January: $50, February: $45, March: $40. New customers: January: 500, February: 600, March: 750" + } +} +``` +{% include copy-curl.html %} + +The response contains the model output in the `inference_results` field: ```json -POST /_plugins/_ml/_predict/text_embedding/cleMb4kBJ1eYAeTMFFg4 { - "text_docs":[ "today is sunny"], - "return_number": true, - "target_response": ["sentence_embedding"] + "inference_results": [ + { + "output": [ + { + "name": "response", + "dataAsMap": { + "response": """ Based on the data provided: + + - Customer acquisition cost in January was $50 and new customers were 500. So cost per new customer was $50/500 = $0.10 + - Customer acquisition cost in February was $45 and new customers were 600. So cost per new customer was $45/600 = $0.075 + - Customer acquisition cost in March was $40 and new customers were 750. So cost per new customer was $40/750 = $0.053 + + Therefore, the month with the lowest customer acquisition cost per new customer was March, at $0.053.""" + } + } + ], + "status_code": 200 + } + ] } ``` -Using this schema, specify the `model_input` as follows: +**Step 1: Create a pipeline** + +Create a search pipeline for the registered model. The model requires a `context` field as input. The model response summarizes the text in the `review` field and stores the summary in the `ext.ml_inference.llm_response` field of the search response: ```json - "model_input": "{ \"text_docs\": ${input_map.text_docs}, \"return_number\": ${model_config.return_number}, \"target_response\": ${model_config.target_response} }" +PUT /_search/pipeline/my_pipeline_request_review_llm +{ + "response_processors": [ + { + "ml_inference": { + "tag": "ml_inference", + "description": "This processor is going to run llm", + "model_id": "EOF6wJIBtDGAJRTD4kNg", + "function_name": "REMOTE", + "input_map": [ + { + "context": "review" + } + ], + "output_map": [ + { + "ext.ml_inference.llm_response": "response" + } + ], + "model_config": { + "prompt": "\n\nHuman: You are a professional data analysist. You will always answer question: Which month had the lowest customer acquisition cost per new customer? based on the given context first. 
If the answer is not directly shown in the context, you will analyze the data and find the answer. If you don't know the answer, just say I don't know. Context: ${parameters.context.toString()}. \n\n Assistant:" + }, + "ignore_missing": false, + "ignore_failure": false + } + } + ] +} ``` +{% include copy-curl.html %} -In the `input_map`, map the `passage_text` document field to the `text_docs` field expected by the model: +In this configuration, you've provided the following parameters: + +- The `model_id` parameter specifies the ID of the generative AI model. +- The `function_name` parameter is set to `REMOTE`, indicating that the model is hosted externally. +- The `input_map` parameter maps the review field from the document to the context field expected by the model. +- The `output_map` parameter specifies that the model's response should be stored in `ext.ml_inference.llm_response` in the search response. +- The `model_config` parameter includes a prompt that tells the model how to process the input and generate a summary. + +**Step 2: Index sample documents** + +Index some sample documents to test the pipeline: ```json -"input_map": [ - { - "text_docs": "passage_text" +POST /_bulk +{"index":{"_index":"review_string_index","_id":"1"}} +{"review":"Customer acquisition cost: January: $50, New customers: January: 500."} +{"index":{"_index":"review_string_index","_id":"2"}} +{"review":"Customer acquisition cost: February: $45, New customers: February: 600."} +{"index":{"_index":"review_string_index","_id":"3"}} +{"review":"Customer acquisition cost: March: $40, New customers: March: 750."} +``` +{% include copy-curl.html %} + +**Step 3: Run the pipeline** + +Run a search query using the pipeline: + +```json +GET /review_string_index/_search?search_pipeline=my_pipeline_request_review_llm +{ + "query": { + "match_all": {} } -] +} ``` +{% include copy-curl.html %} -Because you specified the field to be converted into embeddings as a JSON path, you need to set the `full_response_path` to `true`. Then the full JSON document is parsed in order to obtain the input field: +The response includes the original documents and the generated summary in the `ext.ml_inference.llm_response` field: ```json -"full_response_path": true +{ + "took": 1, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "review_string_index", + "_id": "1", + "_score": 1, + "_source": { + "review": "Customer acquisition cost: January: $50, New customers: January: 500." + } + }, + { + "_index": "review_string_index", + "_id": "2", + "_score": 1, + "_source": { + "review": "Customer acquisition cost: February: $45, New customers: February: 600." + } + }, + { + "_index": "review_string_index", + "_id": "3", + "_score": 1, + "_source": { + "review": "Customer acquisition cost: March: $40, New customers: March: 750." + } + } + ] + }, + "ext": { + "ml_inference": { + "llm_response": """ Based on the context provided: + + - Customer acquisition cost in January was $50 and new customers were 500. So the cost per new customer was $50/500 = $0.10 + + - Customer acquisition cost in February was $45 and new customers were 600. So the cost per new customer was $45/600 = $0.075 + + - Customer acquisition cost in March was $40 and new customers were 750. 
So the cost per new customer was $40/750 = $0.053 + + Therefore, the month with the lowest customer acquisition cost per new customer was March, as it had the lowest cost per customer of $0.053.""" + } + } +} ``` -The text in the `passage_text` field will be used to generate embeddings: +### Example: Reranking search results using a text similarity model + +The following example shows you how to configure an `ml_inference` search response processor with a text similarity model. + +**Prerequisite** + +You must configure an externally hosted text similarity model for this use case. For more information about externally hosted models, see [Connecting to externally hosted models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index/). Once you register the text similarity model, you can use the following request to test it. This request requires that you provide the `text` and `text_pair` fields within the `inputs` field: ```json +POST /_plugins/_ml/models/Ialx65IBAs32TwoK1lXf/_predict { - "passage_text": "hello world" + "parameters": { + "inputs": + { + "text": "I like you", + "text_pair": "I hate you" + } + } } ``` +{% include copy-curl.html %} -The Predict API request returns the following response: +The model returns similarity scores for each input document: ```json { - "inference_results" : [ + "inference_results": [ { - "output" : [ + "output": [ { - "name" : "sentence_embedding", - "data_type" : "FLOAT32", - "shape" : [ - 768 - ], - "data" : [ - 0.25517133, - -0.28009856, - 0.48519906, - ... - ] + "name": "response", + "dataAsMap": { + "label": "LABEL_0", + "score": 0.022704314440488815 + } } - ] + ], + "status_code": 200 } ] } ``` +{% include copy-curl.html %} -The model generates embeddings in the `$.inference_results.*.output.*.data` field. The `output_map` maps this field to the newly created `passage_embedding` field in the search response document: +**Step 1: Index sample documents** + +Create an index and add some sample documents: ```json -"output_map": [ - { - "passage_embedding": "$.inference_results.*.output.*.data" - } -] +POST _bulk +{"index":{"_index":"demo-index-0","_id":"1"}} +{"diary":"I hate you"} +{"index":{"_index":"demo-index-0","_id":"2"}} +{"diary":"I love you"} +{"index":{"_index":"demo-index-0","_id":"3"}} +{"diary":"I dislike you"} ``` +{% include copy-curl.html %} -To configure an `ml_inference` search response processor with a local model, specify the `function_name` explicitly. In this example, the `function_name` is `text_embedding`. For information about valid `function_name` values, see [Request fields](#request-body-fields). +**Step 2: Create a search pipeline** -The following is the final configuration of the `ml_inference` search response processor with the local model: +For this example, you'll create a search pipeline that uses a text similarity model in a `one-to-one` inference mode, processing each document in the search results individually. This setup allows the model to make one prediction request per document, providing specific relevance insights for each search hit. 
When using `input_map` to map the search request to query text, the JSON path must start with `$._request` or `_request`: ```json -PUT /_search/pipeline/ml_inference_pipeline_local +PUT /_search/pipeline/my_rerank_pipeline { - "description": "search passage and generates embeddings", - "processors": [ + "response_processors": [ { "ml_inference": { - "function_name": "text_embedding", - "full_response_path": true, - "model_id": "<your model id>", - "model_config": { - "return_number": true, - "target_response": ["sentence_embedding"] - }, - "model_input": "{ \"text_docs\": ${input_map.text_docs}, \"return_number\": ${model_config.return_number}, \"target_response\": ${model_config.target_response} }", + "tag": "ml_inference", + "description": "This processor runs ml inference during search response", + "model_id": "Ialx65IBAs32TwoK1lXf", + "model_input":"""{"parameters":{"inputs":{"text":"${input_map.text}","text_pair":"${input_map.text_pair}"}}}""", + "function_name": "REMOTE", "input_map": [ { - "text_docs": "passage_text" + "text": "diary", + "text_pair":"$._request.query.term.diary.value" } ], "output_map": [ { - "passage_embedding": "$.inference_results.*.output.*.data" + "rank_score": "$.score" } ], - "ignore_missing": true, - "ignore_failure": true - } + "full_response_path": false, + "model_config": {}, + "ignore_missing": false, + "ignore_failure": false, + "one_to_one": true + }, + "rerank": { + "by_field": { + "target_field": "rank_score", + "remove_target_field": true + } + } } ] } ``` {% include copy-curl.html %} -**Step 2: Run the pipeline** +In this configuration, you've provided the following parameters: -Run the following query, providing the pipeline name in the request: +- The `model_id` parameter specifies the unique identifier of the text similarity model. +- The `function_name` parameter is set to `REMOTE`, indicating that the model is hosted externally. +- The `input_map` parameter maps the `diary` field from each document to the `text` input of the model as well as the search query term to the `text_pair` input. +- The `output_map` parameter maps the model's score to a field named `rank_score` in each document. +- The `model_input` parameter formats the input for the model, ensuring that it matches the structure expected by the Predict API. +- The `one_to_one` parameter is set to `true`, ensuring that the model processes each document individually rather than batching multiple documents together. +- The `ignore_missing` parameter is set to `false`, causing the processor to fail if the mapped fields are missing from a document. +- The `ignore_failure` parameter is set to `false`, causing the entire pipeline to fail if the ML inference processor encounters an error. + +The `rerank` processor is applied after ML inference. It reorders the documents based on the `rank_score` field generated by the ML model and then removes this field from the final results. 
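Conceptually, this step is a sort-and-strip operation on hits that the `ml_inference` processor has already annotated with a `rank_score`. The following Python sketch is an illustration of that reordering logic only, using made-up hits and scores, not the processor's actual implementation:

```python
# Illustration only: reorder hits by the model-generated rank_score and then
# remove that field, mirroring "target_field": "rank_score" with
# "remove_target_field": true. The hits and scores are made up.
hits = [
    {"_id": "1", "_source": {"diary": "I hate you", "rank_score": 0.0402}},
    {"_id": "2", "_source": {"diary": "I love you", "rank_score": 0.0226}},
    {"_id": "3", "_source": {"diary": "I dislike you", "rank_score": 0.0073}},
]

# Sort by the target field, highest score first.
reranked = sorted(hits, key=lambda hit: hit["_source"]["rank_score"], reverse=True)

# Use the rank score as the new hit score and strip the target field.
for hit in reranked:
    hit["_score"] = hit["_source"].pop("rank_score")

print([(hit["_id"], hit["_score"]) for hit in reranked])
```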
+ +**Step 3: Run the pipeline** + +Now perform a search using the created pipeline: + +```json -GET /my_index/_search?search_pipeline=ml_inference_pipeline_local +GET /demo-index-0/_search?search_pipeline=my_rerank_pipeline { -"query": { - "term": { - "passage_text": { - "value": "hello" + "query": { + "term": { + "diary": { + "value": "you" } } } @@ -350,13 +702,11 @@ GET /my_index/_search?search_pipeline=ml_inference_pipeline_local ``` {% include copy-curl.html %} -#### Response - -The response confirms that the processor has generated text embeddings in the `passage_embedding` field: +The response includes the original documents and their reranked scores: ```json { - "took": 288, + "took": 2, "timed_out": false, "_shards": { "total": 1, @@ -366,26 +716,43 @@ The response confirms that the processor has generated text embeddings in the `p }, "hits": { "total": { - "value": 1, + "value": 3, "relation": "eq" }, - "max_score": 0.00009405752, + "max_score": 0.040183373, "hits": [ { - "_index": "my_index", + "_index": "demo-index-0", "_id": "1", - "_score": 0.00009405752, + "_score": 0.040183373, "_source": { - "passage_text": "hello world", - "passage_embedding": [ - 0.017304314, - -0.021530833, - 0.050184276, - 0.08962978, - ...] + "diary": "I hate you" + } + }, + { + "_index": "demo-index-0", + "_id": "2", + "_score": 0.022628736, + "_source": { + "diary": "I love you" + } + }, + { + "_index": "demo-index-0", + "_id": "3", + "_score": 0.0073115323, + "_source": { + "diary": "I dislike you" + } + } + ] + }, + "profile": { + "shards": [] + } } -``` \ No newline at end of file +``` + +## Next steps + +- See a comprehensive example of [reranking by a field using an externally hosted cross-encoder model]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field-cross-encoder/). \ No newline at end of file diff --git a/_search-plugins/search-pipelines/neural-sparse-query-two-phase-processor.md index 41119e643a2..536d1670837 100644 --- a/_search-plugins/search-pipelines/neural-sparse-query-two-phase-processor.md +++ b/_search-plugins/search-pipelines/neural-sparse-query-two-phase-processor.md @@ -23,7 +23,8 @@ Field | Data type | Description :--- | :--- | :--- `enabled` | Boolean | Controls whether the two-phase processor is enabled. Default is `true`. `two_phase_parameter` | Object | A map of key-value pairs representing the two-phase parameters and their associated values. You can specify the value of `prune_ratio`, `expansion_rate`, `max_window_size`, or any combination of these three parameters. Optional. -`two_phase_parameter.prune_ratio` | Float | A ratio that represents how to split the high-weight tokens and low-weight tokens. The threshold is the token's maximum score multiplied by its `prune_ratio`. Valid range is [0,1]. Default is `0.4` +`two_phase_parameter.prune_type` | String | The pruning strategy for separating high-weight and low-weight tokens. Default is `max_ratio`. For valid values, see [Pruning sparse vectors]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/processors/sparse-encoding/#pruning-sparse-vectors). +`two_phase_parameter.prune_ratio` | Float | This ratio defines how high-weight and low-weight tokens are separated. The threshold is calculated by multiplying the token's maximum score by its `prune_ratio`. Valid values are in the [0,1] range for `prune_type` set to `max_ratio`. Default is `0.4`.
`two_phase_parameter.expansion_rate` | Float | The rate at which documents will be fine-tuned during the second phase. The second-phase document number equals the query size (default is 10) multiplied by its expansion rate. Valid range is greater than 1.0. Default is `5.0` `two_phase_parameter.max_window_size` | Int | The maximum number of documents that can be processed using the two-phase processor. Valid range is greater than 50. Default is `10000`. `tag` | String | The processor's identifier. Optional. diff --git a/_search-plugins/search-pipelines/normalization-processor.md b/_search-plugins/search-pipelines/normalization-processor.md index e70f815bddf..2e502254294 100644 --- a/_search-plugins/search-pipelines/normalization-processor.md +++ b/_search-plugins/search-pipelines/normalization-processor.md @@ -31,8 +31,11 @@ The following table lists all available request fields. Field | Data type | Description :--- | :--- | :--- -`normalization.technique` | String | The technique for normalizing scores. Valid values are [`min_max`](https://en.wikipedia.org/wiki/Feature_scaling#Rescaling_(min-max_normalization)) and [`l2`](https://en.wikipedia.org/wiki/Cosine_similarity#L2-normalized_Euclidean_distance). Optional. Default is `min_max`. -`combination.technique` | String | The technique for combining scores. Valid values are [`arithmetic_mean`](https://en.wikipedia.org/wiki/Arithmetic_mean), [`geometric_mean`](https://en.wikipedia.org/wiki/Geometric_mean), and [`harmonic_mean`](https://en.wikipedia.org/wiki/Harmonic_mean). Optional. Default is `arithmetic_mean`. +`normalization.technique` | String | The technique for normalizing scores. Valid values are [`min_max`](https://en.wikipedia.org/wiki/Feature_scaling#Rescaling_(min-max_normalization)), [`l2`](https://en.wikipedia.org/wiki/Cosine_similarity#L2-normalized_Euclidean_distance), and [`z_score`](https://en.wikipedia.org/wiki/Standard_score). Optional. Default is `min_max`. + `normalization.parameters.lower_bounds` | Array of objects | Defines the lower bound values (the minimum threshold scores) for each query. The array must contain the same number of objects as the number of queries. Optional. Applies only when the normalization technique is [`min_max`](https://en.wikipedia.org/wiki/Feature_scaling#Rescaling_(min-max_normalization)). If not provided, OpenSearch does not apply a lower bound to any subquery and uses the actual minimum score from the retrieved results for normalization. +`normalization.parameters.lower_bounds.mode` | String | Specifies how the lower bound is applied to a query. Valid values are: <br> - `apply`: Uses `min_score` for normalization without modifying the original scores. Formula: `min_max_score = if (score < lowerBoundScore) then (score - minScore) / (maxScore - minScore) else (score - lowerBoundScore) / (maxScore - lowerBoundScore)`. <br> - `clip`: Replaces scores below the lower bound with `min_score`. Formula: `min_max_score = if (score < lowerBoundScore) then 0.0 else (score - lowerBoundScore) / (maxScore - lowerBoundScore)`. <br> - `ignore`: Does not apply a lower bound to this query and uses the standard `min_max` formula instead. <br> Optional. Default is `apply`. +`normalization.parameters.lower_bounds.min_score` | Float | The lower bound threshold. Valid values are in the [-10000.0, 10000.0] range. If `mode` is set to `ignore`, then this value has no effect. Optional. Default is `0.0`. +`combination.technique` | String | The technique for combining scores. 
Valid values are [`arithmetic_mean`](https://en.wikipedia.org/wiki/Arithmetic_mean), [`geometric_mean`](https://en.wikipedia.org/wiki/Geometric_mean), and [`harmonic_mean`](https://en.wikipedia.org/wiki/Harmonic_mean). Optional. Default is `arithmetic_mean`. `z_score` supports only `arithmetic_mean`. `combination.parameters.weights` | Array of floating-point values | Specifies the weights to use for each query. Valid values are in the [0.0, 1.0] range and signify decimal percentages. The closer the weight is to 1.0, the more weight is given to a query. The number of values in the `weights` array must equal the number of queries. The sum of the values in the array must equal 1.0. Optional. If not provided, all queries are given equal weight. `tag` | String | The processor's identifier. Optional. `description` | String | A description of the processor. Optional. @@ -42,11 +45,11 @@ Field | Data type | Description The following example demonstrates using a search pipeline with a `normalization-processor`. -For a comprehensive example, follow the [Neural search tutorial]({{site.url}}{{site.baseurl}}/ml-commons-plugin/semantic-search#tutorial). +For a comprehensive example, follow the [Getting started with semantic and hybrid search]({{site.url}}{{site.baseurl}}/ml-commons-plugin/semantic-search#tutorial). ### Creating a search pipeline -The following request creates a search pipeline containing a `normalization-processor` that uses the `min_max` normalization technique and the `arithmetic_mean` combination technique: +The following request creates a search pipeline containing a `normalization-processor` that uses the `min_max` normalization technique and the `arithmetic_mean` combination technique. The combination technique assigns a weight of 30% to the first query and 70% to the second query: ```json PUT /_search/pipeline/nlp-search-pipeline @@ -74,6 +77,39 @@ PUT /_search/pipeline/nlp-search-pipeline ``` {% include copy-curl.html %} +The following example demonstrates using the `lower_bounds` parameter with the `min_max` normalization technique. It omits the `weights` parameter in the combination technique, causing the queries to be weighted equally by default. In this example, the `lower_bounds` parameter is used to set different lower bounds for each query in a hybrid search. For the first query, a lower bound of 0.5 is applied, while for the second query, the lower bound is ignored. This allows for fine-tuning of the normalization process for each individual query in a hybrid search: + +```json +PUT /_search/pipeline/nlp-search-pipeline +{ + "description": "Post processor for hybrid search", + "phase_results_processors": [ + { + "normalization-processor": { + "normalization": { + "technique": "min_max", + "parameters": { + "lower_bounds": [ + { + "mode": "apply", + "min_score": 0.5 + }, + { + "mode": "ignore" + } + ] + } + }, + "combination": { + "technique": "arithmetic_mean" + } + } + } + ] +} +``` +{% include copy-curl.html %} + ### Using a search pipeline Provide the query clauses that you want to combine in a `hybrid` query and apply the search pipeline created in the previous section so that the scores are combined using the chosen techniques: @@ -112,7 +148,7 @@ GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline ``` {% include copy-curl.html %} -For more information about setting up hybrid search, see [Using hybrid search]({{site.url}}{{site.baseurl}}/search-plugins/hybrid-search/#using-hybrid-search). 
+For more information about setting up hybrid search, see [Hybrid search]({{site.url}}{{site.baseurl}}/search-plugins/hybrid-search/). ## Search tuning recommendations diff --git a/_search-plugins/search-pipelines/personalize-search-ranking.md b/_search-plugins/search-pipelines/personalize-search-ranking.md index 32ff251cae7..630e249b7fc 100644 --- a/_search-plugins/search-pipelines/personalize-search-ranking.md +++ b/_search-plugins/search-pipelines/personalize-search-ranking.md @@ -1,7 +1,7 @@ --- layout: default title: Personalize search ranking -nav_order: 18 +nav_order: 85 has_children: false parent: Search processors grand_parent: Search pipelines diff --git a/_search-plugins/search-pipelines/rag-processor.md b/_search-plugins/search-pipelines/rag-processor.md index e9fca2e2c52..f7ed1795248 100644 --- a/_search-plugins/search-pipelines/rag-processor.md +++ b/_search-plugins/search-pipelines/rag-processor.md @@ -1,7 +1,7 @@ --- layout: default title: Retrieval-augmented generation -nav_order: 90 +nav_order: 115 has_children: false parent: Search processors grand_parent: Search pipelines @@ -99,4 +99,4 @@ GET /my_rag_test_data/_search?search_pipeline=rag_pipeline ``` {% include copy-curl.html %} -For more information about setting up conversational search, see [Using conversational search]({{site.url}}{{site.baseurl}}/search-plugins/conversational-search/#using-conversational-search). +For more information about setting up conversational search, see [Conversational search with RAG]({{site.url}}{{site.baseurl}}/search-plugins/conversational-search/). diff --git a/_search-plugins/search-pipelines/rerank-processor.md b/_search-plugins/search-pipelines/rerank-processor.md index e543c8a28a9..11691eff957 100644 --- a/_search-plugins/search-pipelines/rerank-processor.md +++ b/_search-plugins/search-pipelines/rerank-processor.md @@ -11,33 +11,49 @@ grand_parent: Search pipelines Introduced 2.12 {: .label .label-purple } -The `rerank` search request processor intercepts search results and passes them to a cross-encoder model to be reranked. The model reranks the results, taking into account the scoring context. Then the processor orders documents in the search results based on their new scores. +The `rerank` search response processor intercepts and reranks search results. The processor orders documents in the search results based on their new scores. + +OpenSearch supports the following rerank types. + +Type | Description | Earliest available version +:--- | :--- | :--- +[`ml_opensearch`](#the-ml_opensearch-rerank-type) | Applies an OpenSearch-provided cross-encoder model. | 2.12 +[`by_field`](#the-by_field-rerank-type) | Applies reranking based on a user-provided field. | 2.18 ## Request body fields The following table lists all available request fields. -Field | Data type | Description -:--- | :--- | :--- -`<reranker_type>` | Object | The reranker type provides the rerank processor with static information needed across all reranking calls. Required. -`context` | Object | Provides the rerank processor with information necessary for generating reranking context at query time. -`tag` | String | The processor's identifier. Optional. -`description` | String | A description of the processor. Optional. -`ignore_failure` | Boolean | If `true`, OpenSearch [ignores any failure]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/#ignoring-processor-failures) of this processor and continues to run the remaining processors in the search pipeline. Optional. 
Default is `false`. +Field | Data type | Required/Optional | Description +:--- | :--- | :--- | :--- +`<rerank_type>` | Object | Required | The rerank type for document reranking. Valid values are `ml-opensearch` and `by_field`. +`context` | Object | Required for the `ml_opensearch` rerank type. Optional and does not affect the results for the `by_field` rerank type. | Provides the `rerank` processor with information necessary for reranking at query time. +`tag` | String | Optional | The processor's identifier. +`description` | String | Optional | A description of the processor. +`ignore_failure` | Boolean | Optional | If `true`, OpenSearch [ignores any failure]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/#ignoring-processor-failures) of this processor and continues to run the remaining processors in the search pipeline. Default is `false`. + +<!-- vale off --> +## The ml_opensearch rerank type +<!-- vale on --> +Introduced 2.12 +{: .label .label-purple } -### The `ml_opensearch` reranker type +To rerank results using a cross-encoder model, specify the `ml_opensearch` rerank type. -The `ml_opensearch` reranker type is designed to work with the cross-encoder model provided by OpenSearch. For this reranker type, specify the following fields. +### Prerequisite + +Before using the `ml_opensearch` rerank type, you must configure a cross-encoder model. For information about using an OpenSearch-provided model, see [Cross-encoder models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#cross-encoder-models). For information about using a custom model, see [Custom local models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/custom-local-models/). + +The `ml_opensearch` rerank type supports the following fields. All fields are required. Field | Data type | Description :--- | :--- | :--- -`ml_opensearch` | Object | Provides the rerank processor with model information. Required. -`ml_opensearch.model_id` | String | The model ID for the cross-encoder model. Required. For more information, see [Using ML models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/using-ml-models/). -`context.document_fields` | Array | An array of document fields that specifies the fields from which to retrieve context for the cross-encoder model. Required. +`ml_opensearch.model_id` | String | The model ID of the cross-encoder model for reranking. For more information, see [Using ML models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/using-ml-models/). +`context.document_fields` | Array | An array of document fields that specifies the fields from which to retrieve context for the cross-encoder model. -## Example +### Example -The following example demonstrates using a search pipeline with a `rerank` processor. +The following example demonstrates using a search pipeline with a `rerank` processor implemented using the `ml_opensearch` rerank type. For a complete example, see [Reranking using a cross-encoder model]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-cross-encoder/). ### Creating a search pipeline @@ -108,11 +124,72 @@ POST /_search?search_pipeline=rerank_pipeline ``` {% include copy-curl.html %} -The `query_context` object contains the following fields. +The `query_context` object contains the following fields. You must provide either `query_text` or `query_text_path` but cannot provide both simultaneously. 
+ +Field name | Required/Optional | Description +:--- | :--- | :--- +`query_text` | Exactly one of `query_text` or `query_text_path` is required. | The natural language text of the question that you want to use to rerank the search results. +`query_text_path` | Exactly one of `query_text` or `query_text_path` is required. | The full JSON path to the text of the question that you want to use to rerank the search results. The maximum number of characters allowed in the path is `1000`. + + +<!-- vale off --> +## The by_field rerank type +<!-- vale on --> +Introduced 2.18 +{: .label .label-purple } + +To rerank results by a document field, specify the `by_field` rerank type. + +The `by_field` object supports the following fields. + +Field | Data type | Required/Optional | Description +:--- | :--- | :--- | :--- +`target_field` | String | Required | Specifies the field name or a dot path to the field containing the score to use for reranking. +`remove_target_field` | Boolean | Optional | If `true`, the response does not include the `target_field` used to perform reranking. Default is `false`. +`keep_previous_score` | Boolean | Optional | If `true`, the response includes a `previous_score` field, which contains the score calculated before reranking and can be useful when debugging. Default is `false`. + +### Example + +The following example demonstrates using a search pipeline with a `rerank` processor implemented using the `by_field` rerank type. For a complete example, see [Reranking by a document field]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field/). + +### Creating a search pipeline + +The following request creates a search pipeline with a `by_field` rerank type response processor that ranks the documents by the `reviews.stars` field and specifies to return the original document score: + +```json +PUT /_search/pipeline/rerank_byfield_pipeline +{ + "response_processors": [ + { + "rerank": { + "by_field": { + "target_field": "reviews.stars", + "keep_previous_score" : true + } + } + } + ] +} +``` +{% include copy-curl.html %} + +### Using the search pipeline + +To apply the search pipeline to a query, provide the search pipeline name in the query parameter: + +```json +POST /book-index/_search?search_pipeline=rerank_byfield_pipeline +{ + "query": { + "match_all": {} + } +} +``` +{% include copy-curl.html %} -Field name | Description -:--- | :--- -`query_text` | The natural language text of the question that you want to use to rerank the search results. Either `query_text` or `query_text_path` (not both) is required. -`query_text_path` | The full JSON path to the text of the question that you want to use to rerank the search results. Either `query_text` or `query_text_path` (not both) is required. The maximum number of characters in the path is `1000`. +## Next steps -For more information about setting up reranking, see [Reranking search results]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/reranking-search-results/). \ No newline at end of file +- Learn more about [reranking search results]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/reranking-search-results/). +- See a complete example of [reranking using a cross-encoder model]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-cross-encoder/). +- See a complete example of [reranking by a document field]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field/). 
+- See a comprehensive example of [reranking by a field using an externally hosted cross-encoder model]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field-cross-encoder/). \ No newline at end of file diff --git a/_search-plugins/search-pipelines/score-ranker-processor.md b/_search-plugins/search-pipelines/score-ranker-processor.md new file mode 100644 index 00000000000..1bc3fca1910 --- /dev/null +++ b/_search-plugins/search-pipelines/score-ranker-processor.md @@ -0,0 +1,100 @@ +--- +layout: default +title: Score ranker +has_children: false +parent: Search processors +grand_parent: Search pipelines +nav_order: 117 +--- + +# Score ranker processor +Introduced 2.19 +{: .label .label-purple } + +The `score-ranker-processor` is a rank-based search phase results processor that runs between the query and fetch phases of search execution. It intercepts the query phase results and then uses the reciprocal rank fusion (RRF) algorithm to combine different query clauses to produce the final ranked list of search results. RRF is a method for combining multiple queries by scoring each document based on the reciprocal of its rank for each query and then adding these scores to create a final, unified ranking. + +## Request body fields + +The following table lists all available request fields. + +Field | Data type | Description +:--- | :--- | :--- +`combination.technique` | String | The technique used for combining scores. Required. Valid value is `rrf`. +`combination.rank_constant` | Integer | A constant added to each document's rank before calculating the reciprocal score. Must be `1` or greater. A larger rank constant makes the scores more uniform, reducing the influence of top-ranked results. A smaller rank constant creates a greater score difference between ranks, giving more weight to top-ranked items. Optional. Default is `60`. +`combination.parameters.weights` | Array of floating-point values | Specifies the weights to use for each query. Valid values are in the [0.0, 1.0] range and signify decimal percentages. The closer the weight is to 1.0, the more weight is given to a query. The number of values in the `weights` array must equal the number of queries. The sum of the values in the array must equal 1.0. Optional. If not provided, all queries are given equal weight. +## Example + +The following example demonstrates using a search pipeline with a `score-ranker-processor`. + +### Creating a search pipeline with a score ranker processor + +The following request creates a search pipeline containing a `score-ranker-processor` that uses the `rrf` combination technique: + +```json +PUT /_search/pipeline/<rrf-pipeline> +{ + "description": "Post processor for hybrid RRF search", + "phase_results_processors": [ + { + "score-ranker-processor": { + "combination": { + "technique": "rrf" + } + } + } + ] +} +``` + +### Tuning the search pipeline using custom parameters + +Apply a custom `rank-constant` parameter in the search pipeline created in the previous section. The default value of `rank-constant` is 60. In the following example, `rank-constant` is set to 40: + +```json +PUT /_search/pipeline/<rrf-pipeline> +{ + "description": "Post processor for hybrid RRF search", + "phase_results_processors": [ + { + "score-ranker-processor": { + "combination": { + "technique": "rrf", + "rank_constant": 40 + } + } + } + ] +} +``` + +Apply custom `weights` to each subquery when combing search results using the `rrf` technique. By default, each subquery is given an equal weight of 1. 
In the following example, subquery 1 has a weight of 0.7, and subquery 2 has a weight of 0.3: + +```json +PUT /_search/pipeline/<rrf-pipeline> +{ + "description": "Post processor for hybrid RRF search", + "phase_results_processors": [ + { + "score-ranker-processor": { + "combination": { + "technique": "rrf", + "rank_constant": 40, + "parameters": { + "weights":[ + 0.7, + 0.3 + ] + } + } + } + } + ] +} +``` +{% include copy-curl.html %} + +For more information about setting up hybrid search, see [Hybrid search]({{site.url}}{{site.baseurl}}/search-plugins/hybrid-search/). + +## Next steps + +- For a detailed exploration of the `score-ranker-processor` and RRF, including experimental data and practical use cases, see [this blog post](https://opensearch.org/blog/introducing-reciprocal-rank-fusion-hybrid-search/). The blog post provides examples, performance comparisons, and insights into how RRF can improve search relevance in various scenarios. diff --git a/_search-plugins/search-pipelines/search-processors.md b/_search-plugins/search-pipelines/search-processors.md index 83c46ca69d8..eb21a55dfb1 100644 --- a/_search-plugins/search-pipelines/search-processors.md +++ b/_search-plugins/search-pipelines/search-processors.md @@ -39,6 +39,7 @@ The following table lists all supported search response processors. Processor | Description | Earliest available version :--- | :--- | :--- [`collapse`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/collapse-processor/)| Deduplicates search hits based on a field value, similarly to `collapse` in a search request. | 2.12 +[`hybrid_score_explanation`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/explanation-processor/)| Adds detailed scoring information to search results when the `explain` parameter is enabled, providing information about score normalization, combination techniques, and individual score calculations in hybrid queries. | 2.19 [`ml_inference`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/ml-inference-search-response/) | Invokes registered machine learning (ML) models in order to incorporate model output as additional search response fields. | 2.16 [`personalize_search_ranking`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/personalize-search-ranking/) | Uses [Amazon Personalize](https://aws.amazon.com/personalize/) to rerank search results (requires setting up the Amazon Personalize service). | 2.9 [`rename_field`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rename-field-processor/)| Renames an existing field. | 2.8 diff --git a/_search-plugins/search-relevance/compare-query-sets.md b/_search-plugins/search-relevance/compare-query-sets.md new file mode 100644 index 00000000000..92320163382 --- /dev/null +++ b/_search-plugins/search-relevance/compare-query-sets.md @@ -0,0 +1,257 @@ +--- +layout: default +title: Comparing query sets +nav_order: 12 +parent: Using Search Relevance Workbench +grand_parent: Search relevance +has_children: false +has_toc: false +--- + +# Comparing query sets + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/17735). +{: .warning} + +To compare the results of two different search configurations, you can run a pairwise experiment. To achieve this, you need two search configurations and a query set to use for the search configuration. 
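Before creating an experiment, it can help to understand the kind of list-overlap metrics that a pairwise comparison reports, such as `jaccard` (described at the end of this page). The following Python sketch, using placeholder document IDs, computes the Jaccard similarity between the top results returned by two search configurations for the same query, that is, the intersection cardinality divided by the union cardinality:

```python
# Sketch with placeholder document IDs: Jaccard overlap between the top
# results of two search configurations for the same query.
results_config_a = ["B07X3S9RTZ", "B00GXD4NWE", "B07FPP6TB5", "B07ZKCV5K5"]
results_config_b = ["B07Q7VGW4Q", "B00GXD4NWE", "B07FPP6TB5", "B07THVCJK3"]

def jaccard(list_a, list_b):
    """Intersection cardinality divided by union cardinality."""
    set_a, set_b = set(list_a), set(list_b)
    union = set_a | set_b
    return len(set_a & set_b) / len(union) if union else 0.0

print(round(jaccard(results_config_a, results_config_b), 2))  # 0.33
```

Search Relevance Workbench computes these metrics for every query in the query set automatically; the sketch only illustrates the underlying calculation.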
+ + +For more information about creating a query set, see [Query Sets]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/query-sets/). + +For more information about creating search configurations, see [Search Configurations]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/search-configurations/). + +## Creating a pairwise experiment + +An experiment is used to compare the metrics between two different search configurations. An experiment shows you the top N results for every query based on the specified search configurations. In the dashboard, you can view the returned documents from any of the queries in the query set and determine which search configuration returns more relevant results. Additionally, you can measure the similarity between the two returned search result lists using the provided similarity metrics. + +### Example + +To create a pairwise comparison experiment for the specified query set and search configurations, send the following request: + +```json +PUT _plugins/_search_relevance/experiments +{ + "querySetId": "8368a359-146b-4690-b756-40591b2fcddb", + "searchConfigurationList": ["a5acc9f3-6ad7-43f4-9651-fe118c499bc6", "26c7255c-c36e-42fb-b5b2-633dbf8e53b6"], + "size": 10, + "type": "PAIRWISE_COMPARISON" +} +``` +{% include copy-curl.html %} + +### Request body fields + +The following table lists the available input parameters. + +Field | Data type | Description +:--- | :--- | :--- +`querySetId` | String | The query set ID. +`searchConfigurationList` | List | A list of search configuration IDs to use for comparison. +`size` | Integer | The number of documents to return in the results. +`type` | String | Defines the type of experiment to run. Valid values are `PAIRWISE_COMPARISON`, `HYBRID_OPTIMIZER`, or `POINTWISE_EVALUATION`. Depending on the experiment type, you must provide different body fields in the request. `PAIRWISE_COMPARISON` is for comparing two search configurations against a query set and is used [here]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/compare-query-sets/). `HYBRID_OPTIMIZER` is for combining results and is used [here]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/optimize-hybrid-search/). `POINTWISE_EVALUATION` is for evaluating a search configuration against judgments and is used [here]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/evaluate-search-quality/). + +The response contains the experiment ID of the created experiment: + +```json +{ + "experiment_id": "cbd2c209-96d1-4012-aa73-e524b7a1b11a", + "experiment_result": "CREATED" +} +``` +## Interpreting the experiment results +To interpret the experiment results, use the following operations. + +### Retrieving the experiment results + +Use the following API to retrieve the result of a specific experiment. + +#### Endpoints + +```json +GET _plugins/_search_relevance/experiments +GET _plugins/_search_relevance/experiments/<experiment_id> +``` + +#### Path parameters + +The following table lists the available path parameters. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `experiment_id` | String | The ID of the experiment to retrieve. Retrieves all experiments when empty. 
| + +#### Example request + +```json +GET _plugins/_search_relevance/experiments/cbd2c209-96d1-4012-aa73-e524b7a1b11a +``` + +#### Example response + +```json +{ + "took": 2, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": ".plugins-search-relevance-experiment", + "_id": "cbd2c209-96d1-4012-aa73-e524b7a1b11a", + "_score": 1, + "_source": { + "id": "cbd2c209-96d1-4012-aa73-e524b7a1b11a", + "timestamp": "2025-06-11T23:24:26.792Z", + "type": "PAIRWISE_COMPARISON", + "status": "PROCESSING", + "querySetId": "8368a359-146b-4690-b756-40591b2fcddb", + "searchConfigurationList": [ + "a5acc9f3-6ad7-43f4-9651-fe118c499bc6", + "26c7255c-c36e-42fb-b5b2-633dbf8e53b6" + ], + "judgmentList": [], + "size": 10, + "results": {} + } + } + ] + } +} +``` + +Once the experiment finishes running, the results are available: + +<details open markdown="block"> + <summary> + Response + </summary> + +```json +{ + "took": 34, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1.0, + "hits": [ + { + "_index": ".plugins-search-relevance-experiment", + "_id": "cbd2c209-96d1-4012-aa73-e524b7a1b11a", + "_score": 1.0, + "_source": { + "id": "cbd2c209-96d1-4012-aa73-e524b7a1b11a", + "timestamp": "2025-06-12T04:18:37.284Z", + "type": "PAIRWISE_COMPARISON", + "status": "COMPLETED", + "querySetId": "7889ffe9-835e-4f48-a9cd-53905bb967d3", + "searchConfigurationList": [ + "a5acc9f3-6ad7-43f4-9651-fe118c499bc6", + "26c7255c-c36e-42fb-b5b2-633dbf8e53b6" + ], + "judgmentList": [], + "size": 10, + "results": { + "tv": { + "26c7255c-c36e-42fb-b5b2-633dbf8e53b6": [ + "B07X3S9RTZ", + "B07WVZFKLQ", + "B00GXD4NWE", + "B07ZKCV5K5", + "B07ZKDVHFB", + "B086VKT9R8", + "B08XLM8YK1", + "B07FPP6TB5", + "B07N1TMNHB", + "B09CDHM8W7" + ], + "pairwiseComparison": { + "jaccard": 0.11, + "rbo90": 0.16, + "frequencyWeighted": 0.2, + "rbo50": 0.07 + }, + "a5acc9f3-6ad7-43f4-9651-fe118c499bc6": [ + "B07Q7VGW4Q", + "B00GXD4NWE", + "B07VML1CY1", + "B07THVCJK3", + "B07RKSV7SW", + "B010EAW8UK", + "B07FPP6TB5", + "B073G9ZD33", + "B07VXRXRJX", + "B07Q45SP9P" + ] + }, + "led tv": { + "26c7255c-c36e-42fb-b5b2-633dbf8e53b6": [ + "B01M1D0KL1", + "B07YSMD3Z9", + "B07V4CY9GZ", + "B074KFP426", + "B07S8XNWWF", + "B07XBJR7GY", + "B075FDWSHT", + "B01N2Z17MS", + "B07F1T4JFB", + "B07S658ZLH" + ], + "pairwiseComparison": { + "jaccard": 0.11, + "rbo90": 0.13, + "frequencyWeighted": 0.2, + "rbo50": 0.03 + }, + "a5acc9f3-6ad7-43f4-9651-fe118c499bc6": [ + "B07Q45SP9P", + "B074KFP426", + "B07JKVKZX8", + "B07THVCJK3", + "B0874XJYW8", + "B08LVPWQQP", + "B07V4CY9GZ", + "B07X3BS3DF", + "B074PDYLCZ", + "B08CD9MKLZ" + ] + } + } + } + } + ] + } +} +``` + +</details> + +### Interpreting the results + +As shown in the preceding response, both search configurations return the top N documents, with `size` set to 10 in the search request. In addition to the results, the response also includes metrics from the pairwise comparison. + +### Response body fields + +Field | Description +:--- | :--- +`jaccard` | Shows the similarity score by dividing the intersection cardinality by the union cardinality of the returned documents. +`rbo` | The Rank-Biased Overlap (RBO) metric compares the returned result sets at each ranking depth—for example, the top 1 document, top 2 documents, and so on. 
It places greater importance on higher-ranked results, giving more weight to earlier positions in the list. +`frequencyWeighted` | Similar to the Jaccard metric, the frequency-weighted metric calculates the ratio of the weighted intersection to the weighted union of two sets. However, unlike standard Jaccard, it gives more weight to documents with higher frequencies, skewing the result toward more frequently occurring items. diff --git a/_search-plugins/search-relevance/compare-search-results.md b/_search-plugins/search-relevance/compare-search-results.md index 962442cd317..6c0dd61892c 100644 --- a/_search-plugins/search-relevance/compare-search-results.md +++ b/_search-plugins/search-relevance/compare-search-results.md @@ -1,17 +1,16 @@ --- layout: default -title: Comparing search results -nav_order: 55 -parent: Search relevance +title: Comparing single queries +nav_order: 11 +parent: Using Search Relevance Workbench +grand_parent: Search relevance has_children: false has_toc: false -redirect_from: - - /search-plugins/search-relevance/ --- -# Comparing search results +# Comparing single queries -With Compare Search Results in OpenSearch Dashboards, you can compare results from two queries side by side to determine whether one query produces better results than the other. Using this tool, you can evaluate search quality by experimenting with queries. +With Compare Search Results in OpenSearch Dashboards, you can compare results from two queries side by side to determine whether one query produces better results than the other. Using this tool, you can evaluate search quality by experimenting with queries. For example, you can see how results change when you apply one of the following query changes: @@ -21,20 +20,20 @@ For example, you can see how results change when you apply one of the following ## Prerequisites -Before you get started, you must index data in OpenSearch. To learn how to create a new index, see [Index data]({{site.url}}{{site.baseurl}}/opensearch/index-data/). +Before you get started, you must index data in OpenSearch. To learn how to create a new index, see [Index data]({{site.url}}{{site.baseurl}}/opensearch/index-data/). Alternatively, you can add sample data in OpenSearch Dashboards using the following steps: 1. On the top menu bar, go to **OpenSearch Dashboards > Overview**. 1. Select **View app directory**. -1. Select **Add sample data**. +1. Select **Add sample data**. 1. Choose one of the built-in datasets and select **Add data**. ## Using Compare Search Results in OpenSearch Dashboards To compare search results in OpenSearch Dashboards, perform the following steps. -**Step 1:** On the top menu bar, go to **OpenSearch Plugins > Search Relevance**. +**Step 1:** On the top menu bar, go to **OpenSearch Plugins > Search Relevance**. **Step 2:** Enter the search text in the search bar. @@ -74,7 +73,7 @@ The following example screen shows a search for the word "cup" in the `descripti <img src="{{site.url}}{{site.baseurl}}/images/search_relevance.png" alt="Compare search results"/>{: .img-fluid } -If a result in Result 1 appears in Result 2, the `Up` and `Down` indicators below the result number signify how many places the result moved up or down compared to the same result in Result 2. In this example, the document with the ID 2 is `Up 1` place in Result 2 compared to Result 1 and `Down 1` place in Result 1 compared to Result 2. 
+If a result in Result 1 appears in Result 2, the `Up` and `Down` indicators below the result number signify how many positions the result moved up or down compared to the same result in Result 2. In this example, the document with the ID 2 is `Up 1` position in Result 2 compared to Result 1 and `Down 1` position in Result 1 compared to Result 2. ## Changing the number of results @@ -98,6 +97,27 @@ Setting `size` to a high value (for example, larger than 250 documents) may degr You cannot save a given comparison for future use, so Compare Search Results is not suitable for systematic testing. {: .note} +## Comparing OpenSearch search results using Search Relevance Workbench + +[Search Relevance Workbench]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/using-search-relevance-workbench/) provides richer visualization options for examining the difference between two queries. + +To use Search Relevance Workbench, follow steps 1--4. The displayed results and the options for viewing the differences are shown in the following image. + +<img src="{{site.url}}{{site.baseurl}}/images/search-relevance-workbench/comparing_search_results.png" alt="Compare search results"/>{: .img-fluid } + +The top section provides a summary of the results: how many of the retrieved results are unique to the query on the left, how many are unique to the query on the right, and how many are part of both queries? + +What follows is a visual representation of the retrieved results. By default the unique identifier field (`_id`) is shown. You can change this by selecting a different field in the **Display Field** dropdown list. +In the side-by-side view, you can see the positional changes for all common documents among the two result lists. +Selecting one item shows all stored fields in the index to facilitate easier document identification. + +Lastly, Search Relevance Workbench allows you to choose among different visualization styles from a dropdown list: + +* **Default style**: Different colors are used for the two result list documents (unique results are on the left in yellow and on the right in purple, and common results are displayed in green). +* **Ranking change color coding**: All unique documents are purple, and common results are green to focus on ranking changes. +* **Ranking change color coding 2**: All unique documents are gray, and common results are green to focus on ranking changes. +* **Venn diagram color coding**: All unique documents are purple, and common results are blue as in the Venn diagram at the top of the two result lists. + ## Comparing OpenSearch search results with reranked results One use case for Compare Search Results is the comparison of raw OpenSearch results with the same results processed by a reranking application. OpenSearch currently integrates with the following two rerankers: @@ -115,7 +135,7 @@ To try Amazon Kendra Intelligent Ranking, you must first set up the Amazon Kendr To compare search results with reranked results in OpenSearch Dashboards, enter a query in **Query 1** and enter the same query using a reranker in **Query 2**. Then compare the OpenSearch results with the reranked results. -The following example demonstrates searching for the text "snacking nuts" in the `abo` index. The documents in the index contain snack descriptions in the `bullet_point` array. +The following example demonstrates searching for the text "snacking nuts" in the `abo` index. The documents in the index contain snack descriptions in the `bullet_point` array. 
<img src="{{site.url}}{{site.baseurl}}/images/kendra_query.png" alt="OpenSearch Intelligent Ranking query"/>{: .img-fluid } diff --git a/_search-plugins/search-relevance/comparing-search-results.md b/_search-plugins/search-relevance/comparing-search-results.md new file mode 100644 index 00000000000..6fe7ae210d3 --- /dev/null +++ b/_search-plugins/search-relevance/comparing-search-results.md @@ -0,0 +1,38 @@ +--- +layout: default +title: Comparing search results +nav_order: 10 +parent: Using Search Relevance Workbench +grand_parent: Search relevance +has_children: true +has_toc: false +--- + +# Comparing search results + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/17735). +{: .warning} + +Comparing search results, also called a _pairwise experiment_, in OpenSearch Dashboards allows you to compare results of multiple search configurations. Using this tool helps assess how results change when applying different search configurations to queries. + +For example, you can see how results change when you apply one of the following query changes: + +- Weighting fields differently +- Different stemming or lemmatization strategies +- Shingling + +## Comparing search results of a single query + +The UI for comparing the search results of a single query lets you define two different search configurations for an individual query in order to view and compare the results side by side. Specifically, you can determine how many shared and unique documents are contained in the result lists and how their positions changed, as shown in the following image. + +<img src="{{site.url}}{{site.baseurl}}/images/search-relevance-workbench/comparing_search_results.png" alt="Compare search results"/>{: .img-fluid } + +For more information about using the search result comparison tool for a single query, see [Comparing single queries]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/compare-search-results/). + +## Comparing search results of a query set + +Typically, viewing search result changes for two configurations is the first step in the testing process. You can then scale from one query to many in Search Relevance Workbench. You can group queries into a query set, create [search configurations]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/search-configurations/), and compare search results on a larger scale by looking at aggregate metrics across all queries, as shown in the following image. + +<img src="{{site.url}}{{site.baseurl}}/images/search-relevance-workbench/comparing-search-results-query-sets.png" alt="Compare search results"/>{: .img-fluid } + +For more information about using the search result comparison tool for a query set, see [Comparing single queries]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/compare-query-sets/). 
diff --git a/_search-plugins/search-relevance/evaluate-search-quality.md b/_search-plugins/search-relevance/evaluate-search-quality.md new file mode 100644 index 00000000000..433d447e82f --- /dev/null +++ b/_search-plugins/search-relevance/evaluate-search-quality.md @@ -0,0 +1,212 @@ +--- +layout: default +title: Evaluating search quality +nav_order: 50 +parent: Using Search Relevance Workbench +grand_parent: Search relevance +has_children: false +has_toc: false +--- + +# Evaluating search quality + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/17735). +{: .warning} + +Search Relevance Workbench can run pointwise experiments to evaluate search configuration quality using provided queries and relevance judgments. + +For more information about creating a query set, see [Query sets]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/query-sets/). + +For more information about creating search configurations, see [Search Configurations]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/search-configurations/). + +For more information about creating judgments, see [Judgments]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/judgments/). + +## Creating a pointwise experiment + +A pointwise experiment compares your search configuration results against provided relevance judgments to evaluate search quality. + +### Example request + +```json +PUT _plugins/_search_relevance/experiments +{ + "querySetId": "a02cedc2-249d-41de-be3e-662f6f221689", + "searchConfigurationList": ["4f90e474-0806-4dd2-a8dd-0fb8a5f836eb"], + "judgmentList": ["d3d93bb3-2cf4-4da0-8d31-c298427c2756"], + "size": 8, + "type": "POINTWISE_EVALUATION" +} +``` + +### Request body fields + +The following table lists the available input parameters. + +Field | Data type | Description +:--- | :--- | :--- +`querySetId` | String | The ID of the query set. +`searchConfigurationList` | List | A list of search configuration IDs to use for comparison. +`judgmentList` | Array[String] | A list of judgment IDs to use for evaluating search accuracy. +`size` | Integer | The number of documents to return in the results. +`type` | String | The type of experiment to run. Valid values are `PAIRWISE_COMPARISON`, `HYBRID_OPTIMIZER`, or `POINTWISE_EVALUATION`. Depending on the experiment type, you must provide different body fields in the request. `PAIRWISE_COMPARISON` is for comparing two search configurations against a query set and is used [here]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/compare-query-sets/). `HYBRID_OPTIMIZER` is for combining results and is used [here]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/optimize-hybrid-search/). `POINTWISE_EVALUATION` is for evaluating a search configuration against judgments and is used [here]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/evaluate-search-quality/). + +### Example response + +```json +{ + "experiment_id": "d707fa0f-3901-4c8b-8645-9a17e690722b", + "experiment_result": "CREATED" +} +``` + +## Managing the results + +To retrieve experiment results, follow the same process used for [comparing query sets]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/compare-query-sets/) in pairwise experiments. 
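+
+For example, the following request retrieves an experiment by its ID (the ID shown here is taken from the example response that follows):
+
+```json
+GET _plugins/_search_relevance/experiments/bb609dc9-e357-42ec-a956-92b43be0a3ab
+```
+{% include copy-curl.html %}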
+ +The following is an example completed response: + +<details open markdown="block"> + <summary> + Response + </summary> + +```json +{ + "took": 140, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1.0, + "hits": [ + { + "_index": ".plugins-search-relevance-experiment", + "_id": "bb609dc9-e357-42ec-a956-92b43be0a3ab", + "_score": 1.0, + "_source": { + "id": "bb609dc9-e357-42ec-a956-92b43be0a3ab", + "timestamp": "2025-06-13T08:06:46.046Z", + "type": "POINTWISE_EVALUATION", + "status": "COMPLETED", + "querySetId": "a02cedc2-249d-41de-be3e-662f6f221689", + "searchConfigurationList": [ + "4f90e474-0806-4dd2-a8dd-0fb8a5f836eb" + ], + "judgmentList": [ + "d3d93bb3-2cf4-4da0-8d31-c298427c2756" + ], + "size": 8, + "results": [ + { + "evaluationId": "10c60fee-11ca-49b0-9e8a-82cb7b2c044b", + "searchConfigurationId": "4f90e474-0806-4dd2-a8dd-0fb8a5f836eb", + "queryText": "tv" + }, + { + "evaluationId": "c03a5feb-8dc2-4f7f-9d31-d99bfb392116", + "searchConfigurationId": "4f90e474-0806-4dd2-a8dd-0fb8a5f836eb", + "queryText": "led tv" + } + ] + } + } + ] + } +} +``` + +</details> + +The results include an evaluation result ID for each search configuration. To view detailed results, query the `search-relevance-evaluation-result` index using this ID. + +The following is an example of the detailed results: + +<details open markdown="block"> + <summary> + Response + </summary> + +```json +{ + "took": 59, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1.0, + "hits": [ + { + "_index": "search-relevance-evaluation-result", + "_id": "10c60fee-11ca-49b0-9e8a-82cb7b2c044b", + "_score": 1.0, + "_source": { + "id": "10c60fee-11ca-49b0-9e8a-82cb7b2c044b", + "timestamp": "2025-06-13T08:06:40.869Z", + "searchConfigurationId": "4f90e474-0806-4dd2-a8dd-0fb8a5f836eb", + "searchText": "tv", + "judgmentIds": [ + "d3d93bb3-2cf4-4da0-8d31-c298427c2756" + ], + "documentIds": [ + "B07Q7VGW4Q", + "B00GXD4NWE", + "B07VML1CY1", + "B07THVCJK3", + "B07RKSV7SW", + "B010EAW8UK", + "B07FPP6TB5", + "B073G9ZD33" + ], + "metrics": [ + { + "metric": "Coverage@8", + "value": 0.0 + }, + { + "metric": "Precision@8", + "value": 0.0 + }, + { + "metric": "MAP@8", + "value": 0.0 + }, + { + "metric": "NDCG@8", + "value": 0.0 + } + ] + } + } + ] + } +} +``` + +</details> + +The results include the original request parameters along with the following metric values: + +- `Coverage@k`: The proportion of scored documents from the judgment set, calculated as the number of documents with scores divided by the total number of documents. + + +- `Precision@k`: The proportion of documents with nonzero judgment scores out of k (or out of the total number of returned documents, if lower). + +- `MAP@k`: The Mean Average Precision, which calculates the average precision across all documents. For more information, see [Average precision](https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Average_precision). + +- `NDCG@k`: The Normalized Discounted Cumulative Gain, which compares the actual ranking of results against a perfect ranking, with higher weights given to top results. This measures the quality of result ordering. 
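+
+The plugin computes these metrics for you. As an illustration only, the following Python sketch shows how metrics of this kind can be derived from a judgment list and a ranked result list; the document IDs and ratings are hypothetical, and this is not the plugin's implementation:
+
+```python
+from math import log2
+
+# Hypothetical judgment scores (document ID -> relevance rating).
+judgments = {"doc1": 3.0, "doc2": 0.0, "doc4": 2.0}
+
+# Hypothetical ranked document IDs returned by a search configuration.
+returned = ["doc1", "doc2", "doc3", "doc4"]
+k = len(returned)
+
+# Coverage@k: fraction of returned documents that have any judgment.
+coverage = sum(1 for d in returned if d in judgments) / k
+
+# Precision@k: fraction of returned documents with a nonzero rating.
+precision = sum(1 for d in returned if judgments.get(d, 0.0) > 0) / k
+
+# NDCG@k: discounted gain of the actual ranking divided by that of an ideal ranking.
+def dcg(ratings):
+    return sum(r / log2(i + 2) for i, r in enumerate(ratings))
+
+actual = [judgments.get(d, 0.0) for d in returned]
+ideal = sorted(judgments.values(), reverse=True)[:k]
+ndcg = dcg(actual) / dcg(ideal) if dcg(ideal) > 0 else 0.0
+
+print(f"Coverage@{k}={coverage}, Precision@{k}={precision}, NDCG@{k}={ndcg:.3f}")
+```
+{% include copy.html %}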
diff --git a/_search-plugins/search-relevance/index.md b/_search-plugins/search-relevance/index.md index f0c5a2e4c5b..9133ba08dc2 100644 --- a/_search-plugins/search-relevance/index.md +++ b/_search-plugins/search-relevance/index.md @@ -10,12 +10,12 @@ redirect_from: # Search relevance -Search relevance evaluates the accuracy of the search results returned by a query. The higher the relevance, the better the search engine. +Search relevance evaluates the accuracy of the search results returned by a query. The higher the relevance, the better the search engine. OpenSearch provides the following search relevance features: -- [Comparing search results]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/compare-search-results/) from two queries side by side in OpenSearch Dashboards. +- [Search Relevance Workbench]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/using-search-relevance-workbench/): A suite of tools that support search quality improvements through experimentation. -- [Reranking search results]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/reranking-search-results/) using a cross-encoder reranker. +- [Reranking search results]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/reranking-search-results/) using a cross-encoder reranker. -- Rewriting queries using [Querqy]({{site.url}}{{site.baseurl}}/search-plugins/querqy/). \ No newline at end of file +- [Query rewriting]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/query-rewriting/). diff --git a/_search-plugins/search-relevance/judgments.md b/_search-plugins/search-relevance/judgments.md new file mode 100644 index 00000000000..469e49fa72a --- /dev/null +++ b/_search-plugins/search-relevance/judgments.md @@ -0,0 +1,340 @@ +--- +layout: default +title: Judgments +nav_order: 8 +parent: Using Search Relevance Workbench +grand_parent: Search relevance +has_children: false +has_toc: false +--- + +# Judgments + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/17735). +{: .warning} + +A judgment is a relevance rating assigned to a specific document in the context of a particular query. Multiple judgments are grouped together into judgment lists. +Typically, judgments are categorized into two types---implicit and explicit: + +* Implicit judgments are ratings that were derived from user behavior (for example, what did the user see and select after searching?) +* Explicit judgments were traditionally made by humans, but large language models (LLMs) are increasingly being used to perform this task. + +Search Relevance Workbench supports all types of judgments: + +* Generating implicit judgments based on data that adheres to the User Behavior Insights (UBI) schema specification. +* Using LLMs to generate judgments by connecting OpenSearch to an API or an internally or externally hosted model. +* Importing externally created judgments. + +## Explicit judgments + +Search Relevance Workbench offers two ways to integrate explicit judgments: +* Importing judgments that were collected using a process outside of OpenSearch +* AI-assisted judgments that use LLMs + +### Importing judgments + +You may already have external processes for generating judgments. Regardless of the judgment type or the way it was generated, you can import it into Search Relevance Workbench. 
+ +#### Example request + +```json +PUT _plugins/_search_relevance/judgments +{ + "name": "Imported Judgments", + "description": "Judgments generated outside SRW", + "type": "IMPORT_JUDGMENT", + "judgmentRatings": [ + { + "query": "red dress", + "ratings": [ + { + "docId": "B077ZJXCTS", + "rating": "3.000" + }, + { + "docId": "B071S6LTJJ", + "rating": "2.000" + }, + { + "docId": "B01IDSPDJI", + "rating": "2.000" + }, + { + "docId": "B07QRCGL3G", + "rating": "0.000" + }, + { + "docId": "B074V6Q1DR", + "rating": "1.000" + } + ] + }, + { + "query": "blue jeans", + "ratings": [ + { + "docId": "B07L9V4Y98", + "rating": "0.000" + }, + { + "docId": "B01N0DSRJC", + "rating": "1.000" + }, + { + "docId": "B001CRAWCQ", + "rating": "1.000" + }, + { + "docId": "B075DGJZRM", + "rating": "2.000" + }, + { + "docId": "B009ZD297U", + "rating": "2.000" + } + ] + } + ] +} +``` +{% include copy-curl.html %} + +#### Request body fields + +The process of importing judgments supports the following parameters. + +Parameter | Data type | Description +`name` | String | The name of the judgment list. +`description` | String | An optional description of the judgment list. +`type` | String | Set to `IMPORT_JUDGMENT`. +`judgmentRatings` | Array | A list of JSON objects containing the judgments. Judgments are grouped by query, each containing a nested map in which document IDs (`docId`) serve as keys and their floating-point ratings serve as values. + +### Creating AI-assisted judgments + +If you want to use judgments in your experimentation process but do not have a team of humans or the user behavior data to calculate judgments based on interactions, you can use an LLM in Search Relevance Workbench to generate judgments. +#### Prerequisites + +To use AI-assisted judgment generation, ensure that you have configured the following components: + +* A connector to an LLM to use for generating the judgments. For more information, see [Creating connectors for third-party ML platforms]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). +* A query set: Together with the `size` parameter, the query set defines the scope for generating judgments. For each query, the top k documents are retrieved from the specified index, where k is defined in the `size` parameter. +* A search configuration: A search configuration defines how documents are retrieved for use in query/document pairs. + +The AI-assisted judgment process works as follows: +- For each query, the top k documents are retrieved using the defined search configuration, which includes the index information. The query and each document from the result list create a query/document pair. +- Each query and document pair forms a query/document pair. +- The LLM is then called with a predefined prompt (stored as a static variable in the backend) to generate a judgment for each query/document pair. +- All generated judgments are stored in the judgments index for reuse in future experiments. + +To create a judgment list, provide the model ID of the LLM, an available query set, and a created search configuration: + + +```json +PUT _plugins/_search_relevance/judgments +{ + "name":"COEC", + "type":"LLM_JUDGMENT", + "querySetId":"5f0115ad-94b9-403a-912f-3e762870ccf6", + "searchConfigurationList":["2f90d4fd-bd5e-450f-95bb-eabe4a740bd1"], + "size":5, + "modelId":"N8AE1osB0jLkkocYjz7D", + "contextFields":[] +} +``` +{% include copy-curl.html %} + +## Implicit judgments + +Implicit judgments are derived from user interactions. 
Several models use signals from user behavior to calculate these judgments. One such model is Clicks Over Expected Clicks (COEC), a click model implemented in Search Relevance Workbench.
+The data used to derive relevance labels is based on past user behavior. The data follows the [User Behavior Insights schema specification]({{site.url}}{{site.baseurl}}/search-plugins/ubi/schemas/). The two key interaction types for implicit judgments are *impressions* and *clicks* that occur after a user query. In practice, this means that all events in the `ubi_events` index with an `impression` or `click` recorded in the `action_name` field are used to model implicit judgments.
+COEC calculates an expected click-through rate (CTR) for each rank. It does this by dividing the total number of clicks by the total number of impressions observed at that rank, based on all events in `ubi_events`. This ratio represents the expected CTR for that position.
+
+For each document displayed in a hit list after a query, the average CTR at that rank serves as the expected value for the query/document pair. COEC calculates the actual CTR for the query/document pair and divides it by this expected rank-based CTR. This means that query/document pairs with a higher CTR than the average for that rank will have a judgment value greater than 1. Conversely, if the CTR is lower than average, the judgment value will be lower than 1.
+
+Note that depending on the tracking implementation, multiple clicks for a single query can be recorded in the `ubi_events` index. As a result, the average CTR can sometimes exceed 1 (or 100%).
+For query-document observations that occur at different positions, all impressions and clicks are assumed to have occurred at the lowest (best) position. This approach biases the final judgment toward lower values, reflecting the common trend that higher-ranked results typically receive higher CTRs.
+{: .note}
+
+#### Example request
+
+```json
+PUT _plugins/_search_relevance/judgments
+{
+  "name": "Implicit Judgments",
+  "clickModel": "coec",
+  "type": "UBI_JUDGMENT",
+  "maxRank": 20
+}
+```
+{% include copy-curl.html %}
+
+#### Request body fields
+
+The process of creating implicit judgments supports the following parameters.
+
+Parameter | Data type | Description
+:--- | :--- | :---
+`name` | String | The name of the judgment list.
+`clickModel` | String | The model used to calculate implicit judgments. Only `coec` (Clicks Over Expected Clicks) is supported.
+`type` | String | Set to `UBI_JUDGMENT`.
+`maxRank` | Integer | The maximum rank to consider when including events in the judgment calculation.
+
+## Managing judgment lists
+
+You can retrieve or delete judgment lists using the following APIs.
+
+### View a judgment list
+
+You can retrieve a judgment list using the judgment list ID.
+
+#### Endpoint
+
+```json
+GET _plugins/_search_relevance/judgments/<judgment_list_id>
+```
+
+### Path parameters
+
+The following table lists the available path parameters.
+
+| Parameter | Data type | Description |
+| :--- | :--- | :--- |
+| `judgment_list_id` | String | The ID of the judgment list to retrieve.
| + +#### Example request + +```json +GET _plugins/_search_relevance/judgments/b54f791a-3b02-49cb-a06c-46ab650b2ade +``` +{% include copy-curl.html %} + +#### Example response + +<details open markdown="block"> + <summary> + Response + </summary> + +```json +{ + "took": 36, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "search-relevance-judgment", + "_id": "b54f791a-3b02-49cb-a06c-46ab650b2ade", + "_score": 1, + "_source": { + "id": "b54f791a-3b02-49cb-a06c-46ab650b2ade", + "timestamp": "2025-06-11T06:07:23.766Z", + "name": "Imported Judgments", + "status": "COMPLETED", + "type": "IMPORT_JUDGMENT", + "metadata": {}, + "judgmentRatings": [ + { + "query": "red dress", + "ratings": [ + { + "rating": "3.000", + "docId": "B077ZJXCTS" + }, + { + "rating": "2.000", + "docId": "B071S6LTJJ" + }, + { + "rating": "2.000", + "docId": "B01IDSPDJI" + }, + { + "rating": "0.000", + "docId": "B07QRCGL3G" + }, + { + "rating": "1.000", + "docId": "B074V6Q1DR" + } + ] + }, + { + "query": "blue jeans", + "ratings": [ + { + "rating": "0.000", + "docId": "B07L9V4Y98" + }, + { + "rating": "1.000", + "docId": "B01N0DSRJC" + }, + { + "rating": "1.000", + "docId": "B001CRAWCQ" + }, + { + "rating": "2.000", + "docId": "B075DGJZRM" + }, + { + "rating": "2.000", + "docId": "B009ZD297U" + } + ] + } + ] + } + } + ] + } +} +``` + +</details> + +### Delete a judgment list + +You can delete a judgment list using the judgment list ID. + +#### Endpoint + +```json +DELETE _plugins/_search_relevance/judgments/<judgment_list_id> +``` + +#### Example request + +```json +DELETE _plugins/_search_relevance/judgments/b54f791a-3b02-49cb-a06c-46ab650b2ade +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "_index": "search-relevance-judgment", + "_id": "b54f791a-3b02-49cb-a06c-46ab650b2ade", + "_version": 3, + "result": "deleted", + "forced_refresh": true, + "_shards": { + "total": 2, + "successful": 1, + "failed": 0 + }, + "_seq_no": 156, + "_primary_term": 1 +} +``` diff --git a/_search-plugins/search-relevance/optimize-hybrid-search.md b/_search-plugins/search-relevance/optimize-hybrid-search.md new file mode 100644 index 00000000000..b887a12f9b4 --- /dev/null +++ b/_search-plugins/search-relevance/optimize-hybrid-search.md @@ -0,0 +1,116 @@ +--- +layout: default +title: Optimizing hybrid search +nav_order: 60 +parent: Using Search Relevance Workbench +grand_parent: Search relevance +has_children: false +has_toc: false +--- + +# Optimizing hybrid search + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/17735). +{: .warning} + +A key challenge of using hybrid search in OpenSearch is combining results from lexical and vector-based search effectively. OpenSearch provides different techniques and various parameters you can experiment with to find the best setup for your application. What works best, however, depends heavily on your data, user behavior, and application domain—there is no one-size-fits-all solution. + +Search Relevance Workbench helps you systematically find the ideal set of parameters for your needs. + +## Requirements + +Internally, optimizing hybrid search involves running multiple search quality evaluation experiments. 
For these experiments, you need a query set, judgments, and a search configuration.
+Search Relevance Workbench currently supports hybrid search optimization with exactly two query clauses. While hybrid search typically combines vector and lexical queries, you can run hybrid search optimization with two lexical query clauses:
+
+```json
+PUT _plugins/_search_relevance/search_configurations
+{
+  "name": "hybrid_query_lexical",
+  "query": "{\"query\":{\"hybrid\":{\"queries\":[{\"match\":{\"title\":\"%SearchText%\"}},{\"match\":{\"category\":\"%SearchText%\"}}]}}}",
+  "index": "ecommerce"
+}
+```
+{% include copy-curl.html %}
+
+Hybrid search optimization is most valuable when combining lexical and vector-based search results. For optimal results, configure your hybrid search query with two clauses: one textual query clause and one neural query clause. You don't need to configure the search pipeline to combine results because the hybrid search optimization process handles this automatically. The following is an example of a search configuration suitable for hybrid search optimization:
+
+```json
+PUT _plugins/_search_relevance/search_configurations
+{
+  "name": "hybrid_query_text",
+  "query": "{\"query\":{\"hybrid\":{\"queries\":[{\"multi_match\":{\"query\":\"%SearchText%\",\"fields\":[\"id\",\"title\",\"category\",\"bullets\",\"description\",\"attrs.Brand\",\"attrs.Color\"]}},{\"neural\":{\"title_embedding\":{\"query_text\":\"%SearchText%\",\"k\":100,\"model_id\":\"lRFFb5cBHkapxdNcFFkP\"}}}]}},\"size\":10}",
+  "index": "ecommerce"
+}
+```
+{% include copy-curl.html %}
+
+The model ID specified in the `query` must be a valid model ID for a model deployed in OpenSearch. The target index must contain the field used for neural search embeddings (in this example, `title_embedding`).
+
+For an end-to-end example, see the [`search-relevance` repository](https://github.com/opensearch-project/search-relevance).
+
+## Running a hybrid search optimization experiment
+
+You can create a hybrid search optimization experiment by calling the Search Relevance Workbench `experiments` endpoint.
+
+### Endpoint
+
+```json
+PUT _plugins/_search_relevance/experiments
+```
+
+### Example request
+
+```json
+PUT _plugins/_search_relevance/experiments
+{
+  "querySetId": "b16a6a2b-ed6e-49af-bb2b-fc739dcf24e6",
+  "searchConfigurationList": ["508a8812-27c9-45fc-999a-05f859f9b210"],
+  "judgmentList": ["1b944d40-e95a-43f6-9e92-9ce00f70de79"],
+  "size": 10,
+  "type": "HYBRID_OPTIMIZER"
+}
+```
+{% include copy-curl.html %}
+
+### Example response
+
+```json
+{
+  "experiment_id": "0f4eff05-fd14-4e85-ab5e-e8e484cdac73",
+  "experiment_result": "CREATED"
+}
+```
+
+## Experimentation process
+
+The hybrid search optimization experiment runs different evaluations based on the search configuration. The following parameters and parameter values are taken into account:
+
+* Two normalization techniques: `l2` and `min_max`.
+* Three combination techniques: `arithmetic_mean`, `harmonic_mean`, and `geometric_mean`.
+* The lexical and neural search weights, which are values ranging from `0.0` to `1.0` in 0.1 increments.
+
+Every query in the query set is executed for all different parameter combinations, and the results are evaluated by using the judgment list.
+
+## Evaluating the results
+
+The results for each evaluation are stored. You can view the results in OpenSearch Dashboards by selecting the corresponding experiment in the overview of past experiments, as shown in the following image.
+ +<img src="{{site.url}}{{site.baseurl}}/images/search-relevance-workbench/experiment_overview_hybrid_search_optimization.png" alt="Compare search results"/>{: .img-fluid } + +All executed queries and their calculated search metrics are displayed, as shown in the following image. + +<img src="{{site.url}}{{site.baseurl}}/images/search-relevance-workbench/hybrid_search_optimization_query_overview.png" alt="Compare search results"/>{: .img-fluid } + +To view query variants, select one of the queries, as shown in the following image. + +<img src="{{site.url}}{{site.baseurl}}/images/search-relevance-workbench/hybrid_search_optimization_variant_parameters.png" alt="Compare search results"/>{: .img-fluid } + +You can also retrieve this information by using the following SQL search statement and providing your `experimentId`: + +```json +POST _plugins/_sql +{ + "query": "SELECT ev.parameters.normalization, ev.parameters.combination, ev.parameters.weights, ev.results.evaluationResultId, ev.experimentId, er.id, er.metrics, er.searchText FROM search-relevance-experiment-variant ev JOIN search-relevance-evaluation-result er ON ev.results.evaluationResultId = er.id WHERE ev.experimentId = '814e2378-901c-4273-9873-9b758a33089d'" +} +``` +{% include copy-curl.html %} diff --git a/_search-plugins/search-relevance/query-rewriting.md b/_search-plugins/search-relevance/query-rewriting.md new file mode 100644 index 00000000000..e2fc68b834a --- /dev/null +++ b/_search-plugins/search-relevance/query-rewriting.md @@ -0,0 +1,18 @@ +--- +layout: default +title: Query rewriting +parent: Search relevance +has_children: true +nav_order: 70 +has_toc: false +--- + +# Query rewriting + +Query rewriting is the process of transforming or modifying a user query before it is executed. The goal of query rewriting is to improve search accuracy, relevance, or performance by addressing issues such as misspellings, synonyms, ambiguous terms, or inefficient query structure. Query rewriting is commonly used in search systems to enhance the quality of search results. + +You can perform query rewriting in OpenSearch using the following features: + +- [Template queries]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/template-query/) + +- [Querqy]({{site.url}}{{site.baseurl}}/search-plugins/querqy/) diff --git a/_search-plugins/search-relevance/query-sets.md b/_search-plugins/search-relevance/query-sets.md new file mode 100644 index 00000000000..e367abd286c --- /dev/null +++ b/_search-plugins/search-relevance/query-sets.md @@ -0,0 +1,180 @@ +--- +layout: default +title: Query sets +nav_order: 3 +parent: Using Search Relevance Workbench +grand_parent: Search relevance +has_children: false +has_toc: false +--- + +# Query sets + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/17735). +{: .warning} + +A query set is a collection of queries. These queries are used in experiments for search relevance evaluation. Search Relevance Workbench offers different sampling techniques for creating query sets from real user data that adheres to the [User Behavior Insights (UBI)]({{site.url}}{{site.baseurl}}/search-plugins/ubi/schemas/) specification. +Additionally, Search Relevance Workbench allows you to import a query set. 
+ +## Creating query sets + +If you're tracking user behavior with the UBI specification, you can choose from different sampling methods that can create query sets based on real user queries stored in the `ubi_queries` index. + +Search Relevance Workbench supports three sampling methods: +* Random: Takes a random sample of all queries. +* [Probability-Proportional-to-Size Sampling](https://opensourceconnections.com/blog/2022/10/13/how-to-succeed-with-explicit-relevance-evaluation-using-probability-proportional-to-size-sampling/): Takes a frequency-weighted sample of all queries to obtain a representative sample. +* Top N: Takes the most frequent N queries. + +### Endpoint + +```json +POST _plugins/_search_relevance/query_sets +``` + +### Request body fields + +The following table lists the available input parameters. + +Field | Data type | Description +:--- | :--- | :--- +`name` | String | The name of the query set. +`description` | String | A short description of the query set. +`sampling` | String | Defines which sampler to use. Valid values are `pptss` (Probability-Proportional-to-Size-Sampling), `random`, `topn` (most frequent queries), and `manual`. +`querySetSize` | Integer | The target number of queries in the query set. Depending on the number of unique queries in `ubi_queries`, the resulting query set may contain fewer queries. + +### Example request: Sampling 20 queries with the Top N sampler + +```json +POST _plugins/_search_relevance/query_sets +{ + "name": "Top 20", + "description": "Top 20 most frequent queries sourced from user searches.", + "sampling": "topn", + "querySetSize": 20 +} +``` + +### Example request: Uploading a query set manually + +```json +PUT _plugins/_search_relevance/query_sets +{ + "name": "TVs", + "description": "TV queries", + "sampling": "manual", + "querySetQueries": [ + { + "queryText": "tv" + }, + { + "queryText": "led tv" + } + ] +} +``` + +## Managing query sets + +You can retrieve or delete query sets using the following APIs. + +### Retrieve query sets + +This API retrieves available query sets. + +#### Endpoints + +```json +GET _plugins/_search_relevance/query_sets +GET _plugins/_search_relevance/query_sets/<query_set_id> +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "took": 2, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": null, + "hits": [ + { + "_index": "search-relevance-queryset", + "_id": "bb45c4c4-48ce-461b-acbc-f154c0a17ec9", + "_score": null, + "_source": { + "id": "bb45c4c4-48ce-461b-acbc-f154c0a17ec9", + "name": "TVs", + "description": "Some TVs that people might want", + "sampling": "manual", + "timestamp": "2025-06-11T13:43:26.676Z", + "querySetQueries": [ + { + "queryText": "tv" + }, + { + "queryText": "led tv" + } + ] + }, + "sort": [ + 1749649406676 + ] + } + ] + } +} +``` + +### Path parameters + +The following table lists the available path parameters. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `query_set_id` | String | The ID of the query set to retrieve. Retrieves all query sets when empty. | + +### Delete a query set + +You can delete a query set using the query set ID. 
+ +#### Endpoint + +```json +DELETE _plugins/_search_relevance/query_sets/<query_set_id> +``` + +#### Example request + +```json +DELETE _plugins/_search_relevance/query_sets/bb45c4c4-48ce-461b-acbc-f154c0a17ec9 +``` + +#### Example response + +```json +{ + "_index": "search-relevance-queryset", + "_id": "bb45c4c4-48ce-461b-acbc-f154c0a17ec9", + "_version": 2, + "result": "deleted", + "forced_refresh": true, + "_shards": { + "total": 2, + "successful": 1, + "failed": 0 + }, + "_seq_no": 17, + "_primary_term": 1 +} +``` diff --git a/_search-plugins/search-relevance/rerank-by-field-cross-encoder.md b/_search-plugins/search-relevance/rerank-by-field-cross-encoder.md new file mode 100644 index 00000000000..7f306894912 --- /dev/null +++ b/_search-plugins/search-relevance/rerank-by-field-cross-encoder.md @@ -0,0 +1,276 @@ +--- +layout: default +title: Reranking by a field using a cross-encoder +parent: Reranking search results +grand_parent: Search relevance +has_children: false +nav_order: 30 +--- + +# Reranking by a field using an externally hosted cross-encoder model +Introduced 2.18 +{: .label .label-purple } + +In this tutorial, you'll learn how to use a cross-encoder model hosted on Amazon SageMaker to rerank search results and improve search relevance. + +To rerank documents, you'll configure a search pipeline that processes search results at query time. The pipeline intercepts search results and passes them to the [`ml_inference` search response processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/ml-inference-search-response/), which invokes the cross-encoder model. The model generates scores used to rerank the matching documents [`by_field`]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field/). + +## Prerequisite: Deploy a model on Amazon SageMaker + +Run the following code to deploy a model on Amazon SageMaker. For this example, you'll use the [`ms-marco-MiniLM-L-6-v2`](https://huggingface.co/cross-encoder/ms-marco-MiniLM-L-6-v2) Hugging Face cross-encoder model hosted on Amazon SageMaker. We recommend using a GPU for better performance: + +```python +import sagemaker +import boto3 +from sagemaker.huggingface import HuggingFaceModel + +sess = sagemaker.Session() +role = sagemaker.get_execution_role() + +hub = { + 'HF_MODEL_ID':'cross-encoder/ms-marco-MiniLM-L-6-v2', + 'HF_TASK':'text-classification' +} +huggingface_model = HuggingFaceModel( + transformers_version='4.37.0', + pytorch_version='2.1.0', + py_version='py310', + env=hub, + role=role, +) +predictor = huggingface_model.deploy( + initial_instance_count=1, # number of instances + instance_type='ml.m5.xlarge' # ec2 instance type +) +``` +{% include copy.html %} + +After deploying the model, you can find the model endpoint by going to the Amazon SageMaker console in the AWS Management Console and selecting **Inference > Endpoints** on the left tab. Note the URL for the created model; you'll use it to create a connector. + +## Running a search with reranking + +To run a search with reranking, follow these steps: + +1. [Create a connector](#step-1-create-a-connector). +1. [Register the model](#step-2-register-the-model). +1. [Ingest documents into an index](#step-3-ingest-documents-into-an-index). +1. [Create a search pipeline](#step-4-create-a-search-pipeline). +1. [Search using reranking](#step-5-search-using-reranking). 
+ +## Step 1: Create a connector + +Create a connector to the cross-encoder model by providing the model URL in the `actions.url` parameter: + +```json +POST /_plugins/_ml/connectors/_create +{ + "name": "SageMaker cross-encoder model", + "description": "Test connector for SageMaker cross-encoder hosted model", + "version": 1, + "protocol": "aws_sigv4", + "credential": { + "access_key": "<YOUR_ACCESS_KEY>", + "secret_key": "<YOUR_SECRET_KEY>", + "session_token": "<YOUR_SESSION_TOKEN>" + }, + "parameters": { + "region": "<REGION>", + "service_name": "sagemaker" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "<YOUR_SAGEMAKER_ENDPOINT_URL>", + "headers": { + "content-type": "application/json" + }, + "request_body": "{ \"inputs\": { \"text\": \"${parameters.text}\", \"text_pair\": \"${parameters.text_pair}\" }}" + } + ] +} +``` +{% include copy-curl.html %} + +Note the connector ID contained in the response; you'll use it in the following step. + +## Step 2: Register the model + +To register the model, provide the connector ID in the `connector_id` parameter: + +```json +POST /_plugins/_ml/models/_register +{ + "name": "Cross encoder model", + "version": "1.0.1", + "function_name": "remote", + "description": "Using a SageMaker endpoint to apply a cross encoder model", + "connector_id": "<YOUR_CONNECTOR_ID>" +} +``` +{% include copy-curl.html %} + + +## Step 3: Ingest documents into an index + +Create an index and ingest sample documents containing facts about the New York City boroughs: + +```json +POST /nyc_areas/_bulk +{ "index": { "_id": 1 } } +{ "borough": "Queens", "area_name": "Astoria", "description": "Astoria is a neighborhood in the western part of Queens, New York City, known for its diverse community and vibrant cultural scene.", "population": 93000, "facts": "Astoria is home to many artists and has a large Greek-American community. The area also boasts some of the best Mediterranean food in NYC." } +{ "index": { "_id": 2 } } +{ "borough": "Queens", "area_name": "Flushing", "description": "Flushing is a neighborhood in the northern part of Queens, famous for its Asian-American population and bustling business district.", "population": 227000, "facts": "Flushing is one of the most ethnically diverse neighborhoods in NYC, with a large Chinese and Korean population. It is also home to the USTA Billie Jean King National Tennis Center." } +{ "index": { "_id": 3 } } +{ "borough": "Brooklyn", "area_name": "Williamsburg", "description": "Williamsburg is a trendy neighborhood in Brooklyn known for its hipster culture, vibrant art scene, and excellent restaurants.", "population": 150000, "facts": "Williamsburg is a hotspot for young professionals and artists. The neighborhood has seen rapid gentrification over the past two decades." } +{ "index": { "_id": 4 } } +{ "borough": "Manhattan", "area_name": "Harlem", "description": "Harlem is a historic neighborhood in Upper Manhattan, known for its significant African-American cultural heritage.", "population": 116000, "facts": "Harlem was the birthplace of the Harlem Renaissance, a cultural movement that celebrated Black culture through art, music, and literature." 
} +{ "index": { "_id": 5 } } +{ "borough": "The Bronx", "area_name": "Riverdale", "description": "Riverdale is a suburban-like neighborhood in the Bronx, known for its leafy streets and affluent residential areas.", "population": 48000, "facts": "Riverdale is one of the most affluent areas in the Bronx, with beautiful parks, historic homes, and excellent schools." } +{ "index": { "_id": 6 } } +{ "borough": "Staten Island", "area_name": "St. George", "description": "St. George is the main commercial and cultural center of Staten Island, offering stunning views of Lower Manhattan.", "population": 15000, "facts": "St. George is home to the Staten Island Ferry terminal and is a gateway to Staten Island, offering stunning views of the Statue of Liberty and Ellis Island." } +``` +{% include copy-curl.html %} + +## Step 4: Create a search pipeline + +Next, create a search pipeline for reranking. In the search pipeline configuration, the `input_map` and `output_map` define how the input data is prepared for the cross-encoder model and how the model's output is interpreted for reranking: + +- The `input_map` specifies which fields in the search documents and the query should be used as model inputs: + - The `text` field maps to the `facts` field in the indexed documents. It provides the document-specific content that the model will analyze. + - The `text_pair` field dynamically retrieves the search query text (`multi_match.query`) from the search request. + + The combination of `text` (document `facts`) and `text_pair` (search `query`) allows the cross-encoder model to compare the relevance of the document to the query, considering their semantic relationship. + +- The `output_map` field specifies how the output of the model is mapped to the fields in the response: + - The `rank_score` field in the response will store the model's relevance score, which will be used to perform reranking. + +When using the `by_field` rerank type, the `rank_score` field will contain the same score as the `_score` field. To remove the `rank_score` field from the search results, set `remove_target_field` to `true`. The original BM25 score, before reranking, is included for debugging purposes by setting `keep_previous_score` to `true`. This allows you to compare the original score with the reranked score to evaluate improvements in search relevance. + +To create the search pipeline, send the following request: + +```json +PUT /_search/pipeline/my_pipeline +{ + "response_processors": [ + { + "ml_inference": { + "tag": "ml_inference", + "description": "This processor runs ml inference during search response", + "model_id": "<model_id_from_step_3>", + "function_name": "REMOTE", + "input_map": [ + { + "text": "facts", + "text_pair":"$._request.query.multi_match.query" + } + ], + "output_map": [ + { + "rank_score": "$.score" + } + ], + "full_response_path": false, + "model_config": {}, + "ignore_missing": false, + "ignore_failure": false, + "one_to_one": true + }, + + "rerank": { + "by_field": { + "target_field": "rank_score", + "remove_target_field": true, + "keep_previous_score" : true + } + } + + } + ] +} +``` +{% include copy-curl.html %} + +## Step 5: Search using reranking + +Use the following request to search indexed documents and rerank them using the cross-encoder model. The request retrieves documents containing any of the specified terms in the `description` or `facts` fields. 
These terms are then used to compare and rerank the matched documents: + +```json +POST /nyc_areas/_search?search_pipeline=my_pipeline +{ + "query": { + "multi_match": { + "query": "artists art creative community", + "fields": ["description", "facts"] + } + } +} +``` +{% include copy-curl.html %} + +In the response, the `previous_score` field contains the document's BM25 score, which it would have received if you hadn't applied the pipeline. Note that while BM25 ranked "Astoria" the highest, the cross-encoder model prioritized "Harlem" because it matched more search terms: + +```json +{ + "took": 4, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": 0.03418137, + "hits": [ + { + "_index": "nyc_areas", + "_id": "4", + "_score": 0.03418137, + "_source": { + "area_name": "Harlem", + "description": "Harlem is a historic neighborhood in Upper Manhattan, known for its significant African-American cultural heritage.", + "previous_score": 1.6489418, + "borough": "Manhattan", + "facts": "Harlem was the birthplace of the Harlem Renaissance, a cultural movement that celebrated Black culture through art, music, and literature.", + "population": 116000 + } + }, + { + "_index": "nyc_areas", + "_id": "1", + "_score": 0.0090838, + "_source": { + "area_name": "Astoria", + "description": "Astoria is a neighborhood in the western part of Queens, New York City, known for its diverse community and vibrant cultural scene.", + "previous_score": 2.519608, + "borough": "Queens", + "facts": "Astoria is home to many artists and has a large Greek-American community. The area also boasts some of the best Mediterranean food in NYC.", + "population": 93000 + } + }, + { + "_index": "nyc_areas", + "_id": "3", + "_score": 0.0032599436, + "_source": { + "area_name": "Williamsburg", + "description": "Williamsburg is a trendy neighborhood in Brooklyn known for its hipster culture, vibrant art scene, and excellent restaurants.", + "previous_score": 1.5632852, + "borough": "Brooklyn", + "facts": "Williamsburg is a hotspot for young professionals and artists. The neighborhood has seen rapid gentrification over the past two decades.", + "population": 150000 + } + } + ] + }, + "profile": { + "shards": [] + } +} +``` + \ No newline at end of file diff --git a/_search-plugins/search-relevance/rerank-by-field.md b/_search-plugins/search-relevance/rerank-by-field.md new file mode 100644 index 00000000000..e6f65a4d257 --- /dev/null +++ b/_search-plugins/search-relevance/rerank-by-field.md @@ -0,0 +1,209 @@ +--- +layout: default +title: Reranking by a field +parent: Reranking search results +grand_parent: Search relevance +has_children: false +nav_order: 20 +--- + +# Reranking search results by a field +Introduced 2.18 +{: .label .label-purple } + +You can use a `by_field` rerank type to rerank search results by a document field. Reranking search results by a field is useful if a model has already run and produced a numerical score for your documents or if a previous search response processor was applied and you want to rerank documents differently based on an aggregated field. + +To implement reranking, you need to configure a [search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/) that runs at search time. The search pipeline intercepts search results and applies the [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/) to them. 
The `rerank` processor evaluates the search results and sorts them based on the new scores obtained from a document field. + +## Running a search with reranking + +To run a search with reranking, follow these steps: + +1. [Configure a search pipeline](#step-1-configure-a-search-pipeline). +1. [Create an index for ingestion](#step-2-create-an-index-for-ingestion). +1. [Ingest documents into the index](#step-3-ingest-documents-into-the-index). +1. [Search using reranking](#step-4-search-using-reranking). + +## Step 1: Configure a search pipeline + +Configure a search pipeline with a [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/) and specify the `by_field` rerank type. The pipeline sorts by the `reviews.stars` field (specified by a complete dot path to the field) and returns the original query scores for all documents along with their new scores: + +```json +PUT /_search/pipeline/rerank_byfield_pipeline +{ + "response_processors": [ + { + "rerank": { + "by_field": { + "target_field": "reviews.stars", + "keep_previous_score" : true + } + } + } + ] +} +``` +{% include copy-curl.html %} + +For more information about the request fields, see [Request fields]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/#request-body-fields). + +## Step 2: Create an index for ingestion + +In order to use the `rerank` processor defined in your pipeline, create an OpenSearch index and add the pipeline created in the previous step as the default pipeline: + +```json +PUT /book-index +{ + "settings": { + "index.search.default_pipeline" : "rerank_byfield_pipeline" + }, + "mappings": { + "properties": { + "title": { + "type": "text" + }, + "author": { + "type": "text" + }, + "genre": { + "type": "keyword" + }, + "reviews": { + "properties": { + "stars": { + "type": "float" + } + } + }, + "description": { + "type": "text" + } + } + } +} +``` +{% include copy-curl.html %} + +## Step 3: Ingest documents into the index + +To ingest documents into the index created in the previous step, send the following bulk request: + +```json +POST /_bulk +{ "index": { "_index": "book-index", "_id": "1" } } +{ "title": "The Lost City", "author": "Jane Doe", "genre": "Adventure Fiction", "reviews": { "stars": 4.2 }, "description": "An exhilarating journey through a hidden civilization in the Amazon rainforest." } +{ "index": { "_index": "book-index", "_id": "2" } } +{ "title": "Whispers of the Past", "author": "John Smith", "genre": "Historical Mystery", "reviews": { "stars": 4.7 }, "description": "A gripping tale set in Victorian England, unraveling a century-old mystery." } +{ "index": { "_index": "book-index", "_id": "3" } } +{ "title": "Starlit Dreams", "author": "Emily Clark", "genre": "Science Fiction", "reviews": { "stars": 4.5 }, "description": "In a future where dreams can be shared, one girl discovers her imaginations power." } +{ "index": { "_index": "book-index", "_id": "4" } } +{ "title": "The Enchanted Garden", "author": "Alice Green", "genre": "Fantasy", "reviews": { "stars": 4.8 }, "description": "A magical garden holds the key to a young girls destiny and friendship." } + +``` +{% include copy-curl.html %} + +## Step 4: Search using reranking + +As an example, run a `match_all` query on your index: + +```json +POST /book-index/_search +{ + "query": { + "match_all": {} + } +} +``` +{% include copy-curl.html %} + +The response contains documents sorted in descending order based on the `reviews.stars` field. 
Each document contains the original query score in the `previous_score` field: + +```json +{ + "took": 33, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4, + "relation": "eq" + }, + "max_score": 4.8, + "hits": [ + { + "_index": "book-index", + "_id": "4", + "_score": 4.8, + "_source": { + "reviews": { + "stars": 4.8 + }, + "author": "Alice Green", + "genre": "Fantasy", + "description": "A magical garden holds the key to a young girls destiny and friendship.", + "previous_score": 1, + "title": "The Enchanted Garden" + } + }, + { + "_index": "book-index", + "_id": "2", + "_score": 4.7, + "_source": { + "reviews": { + "stars": 4.7 + }, + "author": "John Smith", + "genre": "Historical Mystery", + "description": "A gripping tale set in Victorian England, unraveling a century-old mystery.", + "previous_score": 1, + "title": "Whispers of the Past" + } + }, + { + "_index": "book-index", + "_id": "3", + "_score": 4.5, + "_source": { + "reviews": { + "stars": 4.5 + }, + "author": "Emily Clark", + "genre": "Science Fiction", + "description": "In a future where dreams can be shared, one girl discovers her imaginations power.", + "previous_score": 1, + "title": "Starlit Dreams" + } + }, + { + "_index": "book-index", + "_id": "1", + "_score": 4.2, + "_source": { + "reviews": { + "stars": 4.2 + }, + "author": "Jane Doe", + "genre": "Adventure Fiction", + "description": "An exhilarating journey through a hidden civilization in the Amazon rainforest.", + "previous_score": 1, + "title": "The Lost City" + } + } + ] + }, + "profile": { + "shards": [] + } +} +``` + +## Next steps + +- Learn more about the [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/). +- See a comprehensive example of [reranking by a field using an externally hosted cross-encoder model]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field-cross-encoder/). \ No newline at end of file diff --git a/_search-plugins/search-relevance/rerank-cross-encoder.md b/_search-plugins/search-relevance/rerank-cross-encoder.md new file mode 100644 index 00000000000..64f93c886c5 --- /dev/null +++ b/_search-plugins/search-relevance/rerank-cross-encoder.md @@ -0,0 +1,122 @@ +--- +layout: default +title: Reranking using a cross-encoder model +parent: Reranking search results +grand_parent: Search relevance +has_children: false +nav_order: 10 +--- + +# Reranking search results using a cross-encoder model +Introduced 2.12 +{: .label .label-purple } + +You can rerank search results using a cross-encoder model in order to improve search relevance. To implement reranking, you need to configure a [search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/) that runs at search time. The search pipeline intercepts search results and applies the [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/) to them. The `rerank` processor evaluates the search results and sorts them based on the new scores provided by the cross-encoder model. + +**PREREQUISITE**<br> +Before configuring a reranking pipeline, you must set up a cross-encoder model. For information about using an OpenSearch-provided model, see [Cross-encoder models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#cross-encoder-models). 
For information about using a custom model, see [Custom local models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/custom-local-models/). +{: .note} + +## Running a search with reranking + +To run a search with reranking, follow these steps: + +1. [Configure a search pipeline](#step-1-configure-a-search-pipeline). +1. [Create an index for ingestion](#step-2-create-an-index-for-ingestion). +1. [Ingest documents into the index](#step-3-ingest-documents-into-the-index). +1. [Search using reranking](#step-4-search-using-reranking). + +## Step 1: Configure a search pipeline + +Next, configure a search pipeline with a [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/) and specify the `ml_opensearch` rerank type. In the request, provide a model ID for the cross-encoder model and the document fields to use as context: + +```json +PUT /_search/pipeline/my_pipeline +{ + "description": "Pipeline for reranking with a cross-encoder", + "response_processors": [ + { + "rerank": { + "ml_opensearch": { + "model_id": "gnDIbI0BfUsSoeNT_jAw" + }, + "context": { + "document_fields": [ + "passage_text" + ] + } + } + } + ] +} +``` +{% include copy-curl.html %} + +For more information about the request fields, see [Request fields]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/#request-body-fields). + +## Step 2: Create an index for ingestion + +In order to use the `rerank` processor defined in your pipeline, create an OpenSearch index and add the pipeline created in the previous step as the default pipeline: + +```json +PUT /my-index +{ + "settings": { + "index.search.default_pipeline" : "my_pipeline" + }, + "mappings": { + "properties": { + "passage_text": { + "type": "text" + } + } + } +} +``` +{% include copy-curl.html %} + +## Step 3: Ingest documents into the index + +To ingest documents into the index created in the previous step, send the following bulk request: + +```json +POST /_bulk +{ "index": { "_index": "my-index" } } +{ "passage_text" : "I said welcome to them and we entered the house" } +{ "index": { "_index": "my-index" } } +{ "passage_text" : "I feel welcomed in their family" } +{ "index": { "_index": "my-index" } } +{ "passage_text" : "Welcoming gifts are great" } + +``` +{% include copy-curl.html %} + +## Step 4: Search using reranking + +To perform a reranking search on your index, use any OpenSearch query and provide an additional `ext.rerank` field: + +```json +POST /my-index/_search +{ + "query": { + "match": { + "passage_text": "how to welcome in family" + } + }, + "ext": { + "rerank": { + "query_context": { + "query_text": "how to welcome in family" + } + } + } +} +``` +{% include copy-curl.html %} + +Alternatively, you can provide the full path to the field containing the context. For more information, see [Rerank processor example]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/#example). + +## Next steps + +- Learn more about the [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/). +- See a comprehensive example of [reranking by a field using an externally hosted cross-encoder model]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field-cross-encoder/). 
\ No newline at end of file diff --git a/_search-plugins/search-relevance/reranking-search-results.md b/_search-plugins/search-relevance/reranking-search-results.md index b1e480a84a3..205433fab8e 100644 --- a/_search-plugins/search-relevance/reranking-search-results.md +++ b/_search-plugins/search-relevance/reranking-search-results.md @@ -2,127 +2,27 @@ layout: default title: Reranking search results parent: Search relevance -has_children: false -nav_order: 60 +has_children: true +nav_order: 65 --- # Reranking search results Introduced 2.12 {: .label .label-purple } -You can rerank search results using a cross-encoder reranker in order to improve search relevance. To implement reranking, you need to configure a [search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/) that runs at search time. The search pipeline intercepts search results and applies the [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/) to them. The `rerank` processor evaluates the search results and sorts them based on the new scores provided by the cross-encoder model. +You can rerank search results using a [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/) in order to improve search relevance. To implement reranking, you need to configure a [search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/) that runs at search time. The search pipeline intercepts search results and applies the `rerank` processor to them. The `rerank` processor evaluates the search results and sorts them based on the new scores. -**PREREQUISITE**<br> -Before configuring a reranking pipeline, you must set up a cross-encoder model. For information about using an OpenSearch-provided model, see [Cross-encoder models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#cross-encoder-models). For information about using a custom model, see [Custom local models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/custom-local-models/). -{: .note} - -## Running a search with reranking - -To run a search with reranking, follow these steps: - -1. [Configure a search pipeline](#step-1-configure-a-search-pipeline). -1. [Create an index for ingestion](#step-2-create-an-index-for-ingestion). -1. [Ingest documents into the index](#step-3-ingest-documents-into-the-index). -1. [Search using reranking](#step-4-search-using-reranking). - -## Step 1: Configure a search pipeline - -Next, configure a search pipeline with a [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/). - -The following example request creates a search pipeline with an `ml_opensearch` rerank processor. In the request, provide a model ID for the cross-encoder model and the document fields to use as context: - -```json -PUT /_search/pipeline/my_pipeline -{ - "description": "Pipeline for reranking with a cross-encoder", - "response_processors": [ - { - "rerank": { - "ml_opensearch": { - "model_id": "gnDIbI0BfUsSoeNT_jAw" - }, - "context": { - "document_fields": [ - "passage_text" - ] - } - } - } - ] -} -``` -{% include copy-curl.html %} - -For more information about the request fields, see [Request fields]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/#request-body-fields). 
- -## Step 2: Create an index for ingestion - -In order to use the rerank processor defined in your pipeline, create an OpenSearch index and add the pipeline created in the previous step as the default pipeline: +You can rerank results in the following ways: -```json -PUT /my-index -{ - "settings": { - "index.search.default_pipeline" : "my_pipeline" - }, - "mappings": { - "properties": { - "passage_text": { - "type": "text" - } - } - } -} -``` -{% include copy-curl.html %} - -## Step 3: Ingest documents into the index - -To ingest documents into the index created in the previous step, send the following bulk request: - -```json -POST /_bulk -{ "index": { "_index": "my-index" } } -{ "passage_text" : "I said welcome to them and we entered the house" } -{ "index": { "_index": "my-index" } } -{ "passage_text" : "I feel welcomed in their family" } -{ "index": { "_index": "my-index" } } -{ "passage_text" : "Welcoming gifts are great" } - -``` -{% include copy-curl.html %} - -## Step 4: Search using reranking - -To perform reranking search on your index, use any OpenSearch query and provide an additional `ext.rerank` field: - -```json -POST /my-index/_search -{ - "query": { - "match": { - "passage_text": "how to welcome in family" - } - }, - "ext": { - "rerank": { - "query_context": { - "query_text": "how to welcome in family" - } - } - } -} -``` -{% include copy-curl.html %} - -Alternatively, you can provide the full path to the field containing the context. For more information, see [Rerank processor example]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/#example). +- [Using a cross-encoder model]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-cross-encoder/) +- [By a document field]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field/) ## Using rerank and normalization processors together When you use a rerank processor in conjunction with a [normalization processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/normalization-processor/) and a hybrid query, the rerank processor alters the final document scores. This is because the rerank processor operates after the normalization processor in the search pipeline. {: .note} -The processing order is as follows: +The processing order is as follows: - Normalization processor: This processor normalizes the document scores based on the configured normalization method. For more information, see [Normalization processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/normalization-processor/). - Rerank processor: Following normalization, the rerank processor further adjusts the document scores. This adjustment can significantly impact the final ordering of search results. @@ -130,4 +30,10 @@ The processing order is as follows: This processing order has the following implications: - Score modification: The rerank processor modifies the scores that were initially adjusted by the normalization processor, potentially leading to different ranking results than initially expected. -- Hybrid queries: In the context of hybrid queries, where multiple types of queries and scoring mechanisms are combined, this behavior is particularly noteworthy. The combined scores from the initial query are normalized first and then reranked, resulting in a two-stage scoring modification. 
\ No newline at end of file +- Hybrid queries: In the context of hybrid queries, where multiple types of queries and scoring mechanisms are combined, this behavior is particularly noteworthy. The combined scores from the initial query are normalized first and then reranked, resulting in a two-phase scoring modification. + +## Next steps + +- See a complete example of [reranking using a cross-encoder model]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-cross-encoder/). +- See a complete example of [reranking by a document field]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field/). +- Learn more about the [`rerank` processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/). diff --git a/_search-plugins/search-relevance/search-configurations.md b/_search-plugins/search-relevance/search-configurations.md new file mode 100644 index 00000000000..09205dba16f --- /dev/null +++ b/_search-plugins/search-relevance/search-configurations.md @@ -0,0 +1,148 @@ +--- +layout: default +title: Search configurations +nav_order: 5 +parent: Using Search Relevance Workbench +grand_parent: Search relevance +has_children: false +has_toc: false +--- + +# Search configurations + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/17735). +{: .warning} + +A search configuration defines the query pattern used to run experiments, specifying how queries should be constructed and executed. + +## Creating search configurations + +You can define a search configuration to describe how every query of a query set is run. Every search configuration has a name and consists of a query body (a query in OpenSearch query domain-specific language [DSL]) and the target index. You can optionally define a search pipeline for the search configuration. + +### Endpoint + +```json +PUT _plugins/_search_relevance/search_configurations +``` + +### Request body fields + +The following table lists the available input parameters. + +Field | Data type | Description +:--- | :--- | :--- +`name` | String | The name of the search configuration. +`query` | Object | Defines the query in OpenSearch query DSL. Use `%SearchText%` as a placeholder for the user query. Needs to be escaped. +`index` | String | The target index queried by this search configuration. +`searchPipeline` | String | Specifies an existing search pipeline. Optional. + +### Example request: Creating a search configuration + +```json +PUT _plugins/_search_relevance/search_configurations +{ + "name": "baseline", + "query": "{\"query\":{\"multi_match\":{\"query\":\"%SearchText%\",\"fields\":[\"id\",\"title\",\"category\",\"bullets\",\"description\",\"attrs.Brand\",\"attrs.Color\"]}}}", + "index": "ecommerce" +} +``` + +## Managing search configurations + +You can retrieve or delete configurations using the following APIs. + +### Retrieve search configurations + + This API retrieves search configurations. 
+ +#### Endpoint + +```json +GET _plugins/_search_relevance/search_configurations +GET _plugins/_search_relevance/search_configurations/<search_configuration_id> +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "took": 3, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": null, + "hits": [ + { + "_index": "search-relevance-search-config", + "_id": "92810080-9c5a-470f-a0ff-0eb85e7b818c", + "_score": null, + "_source": { + "id": "92810080-9c5a-470f-a0ff-0eb85e7b818c", + "name": "baseline", + "timestamp": "2025-06-12T08:23:03.305Z", + "index": "ecommerce", + "query": """{"query":{"multi_match":{"query":"%SearchText%","fields":["id","title","category","bullets","description","attrs.Brand","attrs.Color"]}}}""", + "searchPipeline": "" + }, + "sort": [ + 1749716583305 + ] + } + ] + } +} +``` + +### Path parameters + +The following table lists the available path parameters. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `search_configuration_id` | String | The ID of the search configuration to retrieve. Retrieves all search configurations when empty. | + +### Delete a search configuration + +You can delete a search configuration using the search configuration ID. + +#### Endpoint + +```json +DELETE _plugins/_search_relevance/search_configurations/<search_configuration_id> +``` + +#### Example request + +```json +DELETE _plugins/_search_relevance/search_configurations/bb45c4c4-48ce-461b-acbc-f154c0a17ec9 +``` +{% include copy-curl.html %} + +#### Example response + +```json +{ + "_index": "search-relevance-search-config", + "_id": "92810080-9c5a-470f-a0ff-0eb85e7b818c", + "_version": 2, + "result": "deleted", + "forced_refresh": true, + "_shards": { + "total": 2, + "successful": 1, + "failed": 0 + }, + "_seq_no": 9, + "_primary_term": 1 +} +``` diff --git a/_search-plugins/search-relevance/template-query.md b/_search-plugins/search-relevance/template-query.md new file mode 100644 index 00000000000..e03f1e0c5a2 --- /dev/null +++ b/_search-plugins/search-relevance/template-query.md @@ -0,0 +1,215 @@ +--- +layout: default +title: Template queries +nav_order: 70 +parent: Query rewriting +grand_parent: Search relevance +has_children: false +has_toc: false +--- + +# Template queries +Introduced 2.19 +{: .label .label-purple } + +A template query allows you to create queries with dynamic placeholders that are resolved by search request processors during query execution. This is particularly useful when your query parameters need to be generated or transformed during the search process, such as when using machine learning (ML) inference to convert text into vector embeddings. + +The following search implementations benefit from using template queries: + +- Converting text input into vector embeddings for vector search +- Dynamic query parameter generation based on runtime calculations +- Complex query transformations that require intermediate processing + +Placeholders are defined using the `"${variable_name}"` syntax (note that the variables must be enclosed in quotation marks). Search request processors can generate or transform data to fill these placeholders before the query is processed. The template query acts as a container for the final executable query. 
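+
+As a minimal sketch of the syntax (the index, pipeline, field, and variable names in this request are placeholders for illustration), a template query wraps an ordinary query and marks the value that a search request processor fills in before the query runs:
+
+```json
+GET /my-index/_search?search_pipeline=my_template_pipeline
+{
+  "query": {
+    "template": {
+      "knn": {
+        "my_vector_field": {
+          "vector": "${my_query_embedding}",
+          "k": 10
+        }
+      }
+    }
+  }
+}
+```
+
+At search time, a processor configured in `my_template_pipeline` (for example, an `ml_inference` search request processor) produces the `my_query_embedding` value and substitutes it into the query. The complete example below shows this end to end.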
+ +## Example + +The following example demonstrates using a template query with an [`ml_inference` search request processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/ml-inference-search-request/) for semantic search. + +### Prerequisite + +Before using an `ml_inference` search request processor, you must configure an ML model. For more information about local models, see [Using ML models within OpenSearch]({{site.url}}{{site.baseurl}}/ml-commons-plugin/using-ml-models/). For more information about externally hosted models, see [Connecting to externally hosted models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index/). + +Once you configure a model, you can test the model by sending a Predict API request: + +```json +POST /_plugins/_ml/models/mBGzipQB2gmRjlv_dOoB/_predict +{ + "parameters": { + "inputText": "happy moments" + } +} +``` +{% include copy-curl.html %} + +The response contains an `embedding` field with vector embeddings generated from the `inputText`: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "dataAsMap": { + "embedding": [ + 0.6328125, + 0.26953125, + 0.41796875, + -0.00579833984375, + 1.859375, + 0.2734375, + 0.130859375, + -0.001007080078125, + 0.138671875, + ...], + "inputTextTokenCount": 2 + } + } + ], + "status_code": 200 + } + ] +} +``` + +### Step 1: Create an ingest pipeline + +Create an ingest pipeline to generate vector embeddings from text fields during document indexing. The `input_map` maps document fields to model inputs. In this example, the `text` source field in a document is mapped to the `inputText` field---the expected input field for the model. The `output_map` maps model outputs to document fields. In this example, the `embedding` output field from the model is mapped to the `text_embedding` destination field in your document: + +```json +PUT /_ingest/pipeline/knn_pipeline +{ + "description": "knn_pipeline", + "processors": [ + { + "ml_inference": { + "model_id": "Sz-wFZQBUpPSu0bsJTBG", + "input_map": [ + { + "inputText": "text" + } + ], + "output_map": [ + { + "text_embedding": "embedding" + } + ] + } + } + ] +} +``` +{% include copy-curl.html %} + +### Step 2: Index a document + +Index the following document into a `template-knn-1` index: + +```json +PUT /template-knn-1/_doc/1 +{ + "text": "red shoes" +} +``` +{% include copy-curl.html %} + +To view the document, send a GET request: + +```json +GET /template-knn-1/_doc/1 +``` +{% include copy-curl.html %} + +The response shows that the embeddings generated by the model are stored in the `text_embedding` field along with the original `text`: + +```json +{ + "_index": "template-knn-1", + "_id": "1", + "_version": 2, + "_seq_no": 1, + "_primary_term": 1, + "found": true, + "_source": { + "text_embedding": [ + -0.69140625, + 0.8125, + 0.51953125, + -0.7421875, + 0.6875, + 0.4765625, + -0.34375, + ...], + "text": "red shoes" + } +} +``` + +### Step 3: Search using a template query + +Use the following template query to search the index. 
The `ml_inference` processor generates a vector embedding from the input text `sneakers`, replaces `${text_embedding}` with the generated vector, and searches for documents closest to the vector: + +```json +GET /template-knn-1/_search?search_pipeline=my_knn_pipeline +{ + "query": { + "template": { + "knn": { + "text_embedding": { + "vector": "${text_embedding}", + "k": 2 + } + } + } + }, + "ext": { + "ml_inference": { + "text": "sneakers" + } + } +} +``` +{% include copy-curl.html %} + +The response contains the matching document: + +```json +{ + "took": 611, + "timed_out": false, + "_shards": { + "total": 5, + "successful": 5, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.0019327316, + "hits": [ + { + "_index": "template-knn-1", + "_id": "1", + "_score": 0.0019327316, + "_source": { + "text_embedding": [ + -0.69140625, + 0.8125, + 0.51953125, + ..], + "text": "red shoes" + } + } + ] + } +} +``` + +## Related articles + +- [Template query]({{site.url}}{{site.baseurl}}/query-dsl/specialized/template/) +- [ML inference search request processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/ml-inference-search-request/) \ No newline at end of file diff --git a/_search-plugins/search-relevance/using-search-relevance-workbench.md b/_search-plugins/search-relevance/using-search-relevance-workbench.md new file mode 100644 index 00000000000..5a08d874b75 --- /dev/null +++ b/_search-plugins/search-relevance/using-search-relevance-workbench.md @@ -0,0 +1,358 @@ +--- +layout: default +title: Search Relevance Workbench +nav_order: 10 +parent: Search relevance +has_children: true +has_toc: false +--- + +# Search Relevance Workbench +Introduced 3.1 +{: .label .label-purple } + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/17735). +{: .warning} + +In search applications, tuning relevance is a constant, iterative exercise intended to provide the right search results to your end users. The tooling in Search Relevance Workbench helps search relevance engineers and business users create the best search experience possible for application users. It does this without hiding internal information, enabling engineers to experiment and investigate details as necessary. + +Search Relevance Workbench consists of a [frontend component](https://github.com/opensearch-project/dashboards-search-relevance) that simplifies the process of evaluating search quality. +The frontend uses the [OpenSearch Search Relevance plugin](https://github.com/opensearch-project/search-relevance) as a backend to manage the resources for each tool provided. For example, most use cases involve creating and using search configurations, query sets, and judgment lists. All of these resources are created, updated, deleted, and maintained by the Search Relevance plugin. When you are satisfied with the relevance improvements, you can take the output of the experimentation and manually deploy the changes into your search application. + +## Key relevance concepts + +Search Relevance Workbench relies on different components for the different kinds of experiments that it offers: + +* [Query set]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/query-sets/): A _query set_ is a collection of queries. 
These queries are used in experiments for search relevance evaluation. +* [Search configuration]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/search-configurations/): A _search configuration_ describes the pattern to use to run queries for experiments. +* [Judgment list]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/judgments/): A _judgment_ is a rating that describes the relevance of one particular document for a given query. Multiple judgments are grouped together into judgment lists. + +## Available search result quality experiments + +Search Relevance Workbench offers three types of experiments: + +* [Search result comparison]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/comparing-search-results/): Compare results of two search configurations. +* [Search quality evaluation]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/evaluate-search-quality/): Evaluate the retrieval quality for one particular search configuration by calculating search quality metrics based on retrieved results and a judgment list. +* [Hybrid search optimization]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/optimize-hybrid-search/): Identify the best parameter set for your hybrid search query. + +## Enabling Search Relevance Workbench + +To enable Search Relevance Workbench, you must first enable the frontend and backend plugins. + +### Enabling the Search Relevance Workbench frontend plugin + +To activate the frontend plugin, in OpenSearch Dashboards, go to **Management** > **Dashboards Management** > **Advanced Settings** and turn on the toggle, as shown in the following image. + +<img src="{{site.url}}{{site.baseurl}}/images/search-relevance-workbench/activate_frontend_plugin.png" alt="Activate frontend plugin in OpenSearch Dashboards settings"/>{: .img-fluid } + +### Enabling the Search Relevance Workbench backend plugin + +To enable the Search Relevance Workbench backend plugin, send the following request: + +```json +PUT _cluster/settings +{ + "persistent" : { + "plugins.search_relevance.workbench_enabled" : true + } +} +``` +{% include copy-curl.html %} + +## Creating a query set + +To compare search configurations, create a set of queries to run the search. If you have access to search behavior data adhering to the User Behavior Insights (UBI) specification, you can send a request to the `_plugins/search_relevance/query_sets/create` endpoint. + +The following example request uploads a manually defined query set to Search Relevance Workbench: + + +```json +PUT _plugins/_search_relevance/query_sets +{ + "name": "TVs", + "description": "TV queries", + "sampling": "manual", + "querySetQueries": [ + { + "queryText": "tv" + }, + { + "queryText": "led tv" + } + ] +} +``` +{% include copy-curl.html %} + + +The response contains the `query_set_id` of the query set with which you'll experiment: + +```json +{ + "query_set_id": "1856093f-9245-449c-b54d-9aae7650551a", + "query_set_result": "CREATED" +} +``` + +## Creating search configurations + +Search configurations specify how each query of the query set is run. To create a search configuration, you can send a search request to the `_plugins/search_relevance/search_configurations` endpoint. +Every search configuration contains a `search_configuration_name` and a `query_body`. + +### Example: Creating two search configurations + +For your first experiment, you'll explore how adding a weight of `10` to the `title` field affects your search configuration. 
First, upload your current search configuration to OpenSearch: + +```json +PUT _plugins/_search_relevance/search_configurations +{ + "name": "my_production_config", + "query": "{\"query\":{\"multi_match\":{\"query\":\"%SearchText%\",\"fields\":[\"id\",\"title\",\"category\",\"bullets\",\"description\",\"attrs.Brand\",\"attrs.Color\"]}}}", + "index": "ecommerce" +} +``` +{% include copy-curl.html %} + +The response contains the search configuration ID: + +```json +{ + "search_configuration_id": "122fbde8-d593-4d71-96d4-cbe3b4977468", + "search_configuration_result": "CREATED" +} +``` + +Next, create another search configuration and apply a weight of `10` to the `title` field: + +```json +PUT _plugins/_search_relevance/search_configurations +{ + "name": "title_boost", + "query": "{\"query\":{\"multi_match\":{\"query\":\"%SearchText%\",\"fields\":[\"id\",\"title^10\",\"category\",\"bullets\",\"description\",\"attrs.Brand\",\"attrs.Color\"]}}}", + "index": "ecommerce" +} +``` +{% include copy-curl.html %} + +The response contains the ID of the boosted search configuration and indicates whether it was created successfully: + +```json +{ + "search_configuration_id": "0d687614-df5b-4b6b-8110-9d8c6d407963", + "search_configuration_result": "CREATED" +} +``` + +## Running the search result list comparison experiment + +To run your first experiment, you need a query set and two search configurations (and a corresponding index). By comparing search results, you can gauge how modifying the search configurations affects the search results. To create an experiment, send a request to the `_plugins/search_relevance/experiments` endpoint: + + +```json +POST _plugins/_search_relevance/experiments +{ + "querySetId": "1856093f-9245-449c-b54d-9aae7650551a", + "searchConfigurationList": ["122fbde8-d593-4d71-96d4-cbe3b4977468", "0d687614-df5b-4b6b-8110-9d8c6d407963"], + "size": 10, + "type": "PAIRWISE_COMPARISON" +} +``` +{% include copy-curl.html %} + +The response contains the experiment ID: + +```json +{ + "experiment_id": "dbae9786-6ea0-413d-a500-a14ef69ef7e1", + "experiment_result": "CREATED" +} +``` + +To retrieve the experiment results, use the returned `experiment_id`: + +```json +GET _plugins/_search_relevance/experiments/dbae9786-6ea0-413d-a500-a14ef69ef7e1 +``` +{% include copy-curl.html %} + +The response provides the detailed experiment results: + +<details open markdown="block"> + <summary> + Response + </summary> + +```json +{ + "took": 3, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": ".plugins-search-relevance-experiment", + "_id": "dbae9786-6ea0-413d-a500-a14ef69ef7e1", + "_score": 1, + "_source": { + "id": "dbae9786-6ea0-413d-a500-a14ef69ef7e1", + "timestamp": "2025-06-14T14:02:17.347Z", + "type": "PAIRWISE_COMPARISON", + "status": "COMPLETED", + "querySetId": "1856093f-9245-449c-b54d-9aae7650551a", + "searchConfigurationList": [ + "122fbde8-d593-4d71-96d4-cbe3b4977468", + "0d687614-df5b-4b6b-8110-9d8c6d407963" + ], + "judgmentList": [], + "size": 10, + "results": [ + { + "snapshots": [ + { + "searchConfigurationId": "0d687614-df5b-4b6b-8110-9d8c6d407963", + "docIds": [ + "B01M1D0KL1", + "B07YSMD3Z9", + "B07V4CY9GZ", + "B074KFP426", + "B07S8XNWWF", + "B07XBJR7GY", + "B075FDWSHT", + "B01N2Z17MS", + "B07F1T4JFB", + "B07S658ZLH" + ] + }, + { + "searchConfigurationId": "122fbde8-d593-4d71-96d4-cbe3b4977468", + "docIds": [ + 
"B07Q45SP9P", + "B074KFP426", + "B07JKVKZX8", + "B07THVCJK3", + "B0874XJYW8", + "B08LVPWQQP", + "B07V4CY9GZ", + "B07X3BS3DF", + "B074PDYLCZ", + "B08CD9MKLZ" + ] + } + ], + "queryText": "led tv", + "metrics": [ + { + "metric": "jaccard", + "value": 0.11 + }, + { + "metric": "rbo50", + "value": 0.03 + }, + { + "metric": "rbo90", + "value": 0.13 + }, + { + "metric": "frequencyWeighted", + "value": 0.2 + } + ] + }, + { + "snapshots": [ + { + "searchConfigurationId": "0d687614-df5b-4b6b-8110-9d8c6d407963", + "docIds": [ + "B07X3S9RTZ", + "B07WVZFKLQ", + "B00GXD4NWE", + "B07ZKCV5K5", + "B07ZKDVHFB", + "B086VKT9R8", + "B08XLM8YK1", + "B07FPP6TB5", + "B07N1TMNHB", + "B09CDHM8W7" + ] + }, + { + "searchConfigurationId": "122fbde8-d593-4d71-96d4-cbe3b4977468", + "docIds": [ + "B07Q7VGW4Q", + "B00GXD4NWE", + "B07VML1CY1", + "B07THVCJK3", + "B07RKSV7SW", + "B010EAW8UK", + "B07FPP6TB5", + "B073G9ZD33", + "B07VXRXRJX", + "B07Q45SP9P" + ] + } + ], + "queryText": "tv", + "metrics": [ + { + "metric": "jaccard", + "value": 0.11 + }, + { + "metric": "rbo50", + "value": 0.07 + }, + { + "metric": "rbo90", + "value": 0.16 + }, + { + "metric": "frequencyWeighted", + "value": 0.2 + } + ] + } + ] + } + } + ] + } +} +``` + +</details> + +## Using Search Relevance Workbench in OpenSearch Dashboards + +You can create all Search Relevance Workbench components and visualize the experiment results in OpenSearch Dashboards. +In this example, you'll create the same experiment and review its results. + +In the left navigation pane, select **OpenSearch Plugins** > **Search Relevance** and then select **Query Set Comparison**, as shown in the following image. + +<img src="{{site.url}}{{site.baseurl}}/images/search-relevance-workbench/select_query_set_comparison.png" alt="Select Query Set Comparison Experiment"/>{: .img-fluid } + +Select the query set you created (`TVs`) and the search configurations (`my_production_config`, `title_boost`), and then select **Start Evaluation**, as shown in the following image. + +<img src="{{site.url}}{{site.baseurl}}/images/search-relevance-workbench/query_set_comparison_experiment_definition.png" alt="Define Query Set Comparison Experiment"/>{: .img-fluid } + +You are automatically directed to the experiment overview table, shown in the following image. + +<img src="{{site.url}}{{site.baseurl}}/images/search-relevance-workbench/experiment_table_overview.png" alt="Experiment Overview Table"/>{: .img-fluid } + +To review the results, select the topmost (most recent) experiment. The experiment view page shows three elements: +1. The experiment parameters. +2. The aggregate metrics resulting from the experiment, shown in the following image. +<img src="{{site.url}}{{site.baseurl}}/images/search-relevance-workbench/aggregate_metrics_comparison_experiment.png" alt="Aggregate Metrics for Comparison Experiment"/>{: .img-fluid } +3. The individual metrics per query. + +To visually assess the differences between two result sets, select a query event. 
diff --git a/_search-plugins/searching-data/autocomplete.md b/_search-plugins/searching-data/autocomplete.md index ecbb4ddca6b..742810cbea1 100644 --- a/_search-plugins/searching-data/autocomplete.md +++ b/_search-plugins/searching-data/autocomplete.md @@ -1,8 +1,8 @@ --- layout: default title: Autocomplete -parent: Searching data -nav_order: 24 +parent: Search options +nav_order: 60 redirect_from: - /opensearch/search/autocomplete/ --- diff --git a/_search-plugins/collapse-search.md b/_search-plugins/searching-data/collapse-search.md similarity index 98% rename from _search-plugins/collapse-search.md rename to _search-plugins/searching-data/collapse-search.md index ec7e57515ae..771d79b622b 100644 --- a/_search-plugins/collapse-search.md +++ b/_search-plugins/searching-data/collapse-search.md @@ -1,7 +1,10 @@ --- layout: default title: Collapse search results -nav_order: 3 +parent: Search options +nav_order: 40 +redirect_from: + - /search-plugins/collapse-search/ --- # Collapse search results @@ -222,10 +225,8 @@ GET /bakery-items/_search }, "sort": ["price"] } - - ``` -This query searches for documents in the `cakes` category and groups the search results by the `item_name` field. For each `item_name`, it retrieves the top three lowest-priced items and the top three most recent items, sorted by `baked_date` in descending order. -You can expand the groups by sending an additional query for each inner hit request corresponding to each collapsed hit in the response. This can significantly slow down the process if there are too many groups or inner hit requests. The `max_concurrent_group_searches` request parameter can be used to control the maximum number of concurrent searches allowed in this phase. The default is based on the number of data nodes and the default search thread pool size. +This query searches for documents in the `cakes` category and groups the search results by the `item_name` field. For each `item_name`, it retrieves the top three lowest-priced items and the top three most recent items, sorted by `baked_date` in descending order. +You can expand the groups by sending an additional query for each inner hit request corresponding to each collapsed hit in the response. This can significantly slow down the process if there are too many groups or inner hit requests. The `max_concurrent_group_searches` request parameter can be used to control the maximum number of concurrent searches allowed in this phase. The default is based on the number of data nodes and the default search thread pool size. 
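+
+For example, the following request (a simplified sketch of the `bakery-items` query above; the query body and inner hit name are illustrative) limits this expansion phase to four concurrent group searches:
+
+```json
+GET /bakery-items/_search
+{
+  "query": {
+    "match": { "category": "cakes" }
+  },
+  "collapse": {
+    "field": "item_name",
+    "inner_hits": {
+      "name": "cheapest_items",
+      "size": 3,
+      "sort": ["price"]
+    },
+    "max_concurrent_group_searches": 4
+  },
+  "sort": ["price"]
+}
+```
+
+Higher values can make group expansion faster but increase the concurrent search load on the cluster.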
\ No newline at end of file diff --git a/_search-plugins/searching-data/did-you-mean.md b/_search-plugins/searching-data/did-you-mean.md index d88fdef980c..ccc203827e7 100644 --- a/_search-plugins/searching-data/did-you-mean.md +++ b/_search-plugins/searching-data/did-you-mean.md @@ -1,8 +1,8 @@ --- layout: default title: Did-you-mean -parent: Searching data -nav_order: 25 +parent: Search options +nav_order: 70 redirect_from: - /opensearch/search/did-you-mean/ --- diff --git a/_search-plugins/searching-data/highlight.md b/_search-plugins/searching-data/highlight.md index 97d2b0a3b00..d3b9fa862b7 100644 --- a/_search-plugins/searching-data/highlight.md +++ b/_search-plugins/searching-data/highlight.md @@ -1,8 +1,8 @@ --- layout: default title: Highlight query matches -parent: Searching data -nav_order: 23 +parent: Search options +nav_order: 50 redirect_from: - /opensearch/search/highlight/ --- @@ -124,11 +124,11 @@ To highlight the search terms, the highlighter needs the start and end character - [**Term vectors**]: If you set the [`term_vector` parameter]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/text#term-vector-parameter) to `with_positions_offsets` when mapping a text field, the highlighter uses the `term_vector` to highlight the field. Storing term vectors requires the most disk space. However, it makes highlighting faster for fields larger than 1 MB and for multi-term queries like prefix or wildcard because term vectors provide access to the dictionary of terms for each document. -- **Text reanalysis**: In the absence of both postings and term vectors, the highlighter reanalyzes text in order to highlight it. For every document and every field that needs highlighting, the highlighter creates a small in-memory index and reruns the original query through Lucene’s query execution planner to access low-level match information for the current document. Reanalyzing the text works well in most use cases. However, this method is more memory and time intensive for large fields. +- **Text reanalysis**: In the absence of both postings and term vectors, the highlighter reanalyzes text in order to highlight it. For every document and every field that needs highlighting, the highlighter creates a small in-memory index and reruns the original query through Lucene's query execution planner to access low-level match information for the current document. Reanalyzing the text works well in most use cases. However, this method is more memory and time intensive for large fields. ## Highlighter types -OpenSearch supports three highlighter implementations: `plain`, `unified`, and `fvh` (Fast Vector Highlighter). +OpenSearch supports four highlighter implementations: `plain`, `unified`, `fvh` (Fast Vector Highlighter), and `semantic`. The following table lists the methods of obtaining the offsets for each highlighter. @@ -137,6 +137,7 @@ Highlighter | Method of obtaining offsets [`unified`](#the-unified-highlighter) | Term vectors if `term_vector` is set to `with_positions_offsets`,<br> postings if `index_options` is set to `offsets`, <br> text reanalysis otherwise. [`fvh`](#the-fvh-highlighter) | Term vectors. [`plain`](#the-plain-highlighter) | Text reanalysis. +[`semantic`](#the-semantic-highlighter) | Model inference. ### Setting the highlighter type @@ -170,34 +171,47 @@ The `fvh` highlighter is based on the Lucene Fast Vector Highlighter. To use thi The `plain` highlighter is based on the standard Lucene highlighter. 
It requires the highlighted fields to be stored either individually or in the `_source` field. The `plain` highlighter mirrors the query matching logic, in particular word importance and positions in phrase queries. It works for most use cases but may be slow for large fields because it has to reanalyze the text to be highlighted. +### The `semantic` highlighter + +The `semantic` highlighter uses machine learning (ML) models to identify and highlight the most semantically relevant sentences or passages within a text field, based on the query's meaning. This goes beyond traditional lexical matching offered by other highlighters. It does not rely on offsets from postings or term vectors but instead uses a deployed ML model (specified by the `model_id`) to perform inference on the field content. This approach allows you to highlight contextually relevant text even when exact terms don't match the query. Highlighting is performed at the sentence level. + +Before using the `semantic` highlighter, you must configure and deploy a sentence highlighting model. For more information about using ML models in OpenSearch, see [Integrating ML models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/integrating-ml-models/). For information about OpenSearch-provided sentence highlighting models, see [Semantic sentence highlighting models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#semantic-sentence-highlighting-models). +{: .note} + +To use the `semantic` highlighter, you must specify a `model_id` in the `highlight.options` object. The model determines which parts of the text are semantically similar to the query. + +For a step-by-step guide, see the [semantic highlighting tutorial]({{site.url}}{{site.baseurl}}/tutorials/vector-search/semantic-highlighting-tutorial/). + ## Highlighting options The following table describes the highlighting options you can specify on a global or field level. Field-level settings override global settings. Option | Description :--- | :--- -type | Specifies the highlighter to use. Valid values are `unified`, `fvh`, and `plain`. Default is `unified`. -fields | Specifies the fields to search for text to be highlighted. Supports wildcard expressions. If you use wildcards, only `text` and `keyword` fields are highlighted. For example, you can set `fields` to `my_field*` to include all `text` and `keyword` fields that start with the prefix `my_field`. -force_source | Specifies that field values for highlighting should be obtained from the `_source` field rather than from stored field values. Default is `false`. +`type` | Specifies the highlighter to use. Valid values are `unified`, `fvh`, `plain`, and `semantic`. Default is `unified`. +`fields` | Specifies the fields to search for text to be highlighted. Supports wildcard expressions. If you use wildcards, only `text` and `keyword` fields are highlighted. For example, you can set `fields` to `my_field*` to include all `text` and `keyword` fields that start with the prefix `my_field`. +`force_source` | Specifies that field values for highlighting should be obtained from the `_source` field rather than from stored field values. Default is `false`. require_field_match | Specifies whether to highlight only fields that contain a search query match. Default is `true`. To highlight all fields, set this option to `false`. -pre_tags | Specifies the HTML start tags for the highlighted text as an array of strings. -post_tags | Specifies the HTML end tags for the highlighted text as an array of strings. 
-tags_schema | If you set this option to `styled`, OpenSearch uses the built-in tag schema. In this schema, the `pre_tags` are `<em class="hlt1">`, `<em class="hlt2">`, `<em class="hlt3">`, `<em class="hlt4">`, `<em class="hlt5">`, `<em class="hlt6">`, `<em class="hlt7">`, `<em class="hlt8">`, `<em class="hlt9">`, and `<em class="hlt10">`, and the `post_tags` is `</em>`. -boundary_chars | All boundary characters combined in a string.<br> Default is `".,!? \t\n"`. -boundary_scanner | Valid only for the `unified` and `fvh` highlighters. Specifies whether to split the highlighted fragments into sentences, words, or characters. Valid values are the following:<br>- `sentence`: Split highlighted fragments at sentence boundaries, as defined by the [BreakIterator](https://docs.oracle.com/javase/8/docs/api/java/text/BreakIterator.html). You can specify the BreakIterator's locale in the `boundary_scanner_locale` option. <br>- `word`: Split highlighted fragments at word boundaries, as defined by the [BreakIterator](https://docs.oracle.com/javase/8/docs/api/java/text/BreakIterator.html). You can specify the BreakIterator's locale in the `boundary_scanner_locale` option.<br>- `chars`: Split highlighted fragments at any character listed in `boundary_chars`. Valid only for the `fvh` highlighter. -boundary_scanner_locale | Provides a [locale](https://docs.oracle.com/javase/8/docs/api/java/util/Locale.html) for the `boundary_scanner`. Valid values are language tags (for example, `"en-US"`). Default is [Locale.ROOT](https://docs.oracle.com/javase/8/docs/api/java/util/Locale.html#ROOT). -boundary_max_scan | Controls how far to scan for boundary characters when the `boundary_scanner` parameter for the `fvh` highlighter is set to `chars`. Default is 20. +`pre_tags` | Specifies the HTML start tags for the highlighted text as an array of strings. +`post_tags` | Specifies the HTML end tags for the highlighted text as an array of strings. +`tags_schema` | If you set this option to `styled`, OpenSearch uses the built-in tag schema. In this schema, the `pre_tags` are `<em class="hlt1">`, `<em class="hlt2">`, `<em class="hlt3">`, `<em class="hlt4">`, `<em class="hlt5">`, `<em class="hlt6">`, `<em class="hlt7">`, `<em class="hlt8">`, `<em class="hlt9">`, and `<em class="hlt10">`, and the `post_tags` is `</em>`. +`boundary_chars` | All boundary characters combined in a string.<br> Default is `".,!? \t\n"`. +`boundary_scanner` | Valid only for the `unified` and `fvh` highlighters. Specifies whether to split the highlighted fragments into sentences, words, or characters. Valid values are the following:<br>- `sentence`: Split highlighted fragments at sentence boundaries, as defined by the [BreakIterator](https://docs.oracle.com/javase/8/docs/api/java/text/BreakIterator.html). You can specify the BreakIterator's locale in the `boundary_scanner_locale` option. <br>- `word`: Split highlighted fragments at word boundaries, as defined by the [BreakIterator](https://docs.oracle.com/javase/8/docs/api/java/text/BreakIterator.html). You can specify the BreakIterator's locale in the `boundary_scanner_locale` option.<br>- `chars`: Split highlighted fragments at any character listed in `boundary_chars`. Valid only for the `fvh` highlighter. +`boundary_scanner_locale` | Provides a [locale](https://docs.oracle.com/javase/8/docs/api/java/util/Locale.html) for the `boundary_scanner`. Valid values are language tags (for example, `"en-US"`). Default is [Locale.ROOT](https://docs.oracle.com/javase/8/docs/api/java/util/Locale.html#ROOT). 
+`boundary_max_scan` | Controls how far to scan for boundary characters when the `boundary_scanner` parameter for the `fvh` highlighter is set to `chars`. Default is 20. encoder | Specifies whether the highlighted fragment should be HTML encoded before it is returned. Valid values are `default` (no encoding) or `html` (first escape the HTML text and then insert the highlighting tags). For example, if the field text is `<h3>Hamlet</h3>` and the `encoder` is set to `html`, the highlighted text is `"<h3><em>Hamlet</em></h3>"`. -fragmenter | Specifies how to split text into highlighted fragments. Valid only for the `plain` highlighter. Valid values are the following:<br>- `span` (default): Splits text into fragments of the same size but tries not to split text between highlighted terms. <br>- `simple`: Splits text into fragments of the same size. +`fragmenter` | Specifies how to split text into highlighted fragments. Valid only for the `plain` highlighter. Valid values are the following:<br>- `span` (default): Splits text into fragments of the same size but tries not to split text between highlighted terms. <br>- `simple`: Splits text into fragments of the same size. fragment_offset | Specifies the character offset from which you want to start highlighting. Valid for the `fvh` highlighter only. -fragment_size | The size of a highlighted fragment, specified as the number of characters. If `number_of_fragments` is set to 0, `fragment_size` is ignored. Default is 100. +`fragment_size` | The size of a highlighted fragment, specified as the number of characters. If `number_of_fragments` is set to 0, `fragment_size` is ignored. Default is 100. number_of_fragments| The maximum number of returned fragments. If `number_of_fragments` is set to 0, OpenSearch returns the highlighted contents of the entire field. Default is 5. -order | The sort order for the highlighted fragments. Set `order` to `score` to sort fragments by relevance. Each highlighter has a different algorithm for calculating relevance scores. Default is `none`. -highlight_query | Specifies that matches for a query other than the search query should be highlighted. The `highlight_query` option is useful when you use a faster query to get document matches and a slower query (for example, `rescore_query`) to refine the results. We recommend to include the search query as part of the `highlight_query`. -matched_fields | Combines matches from different fields to highlight one field. The most common use case for this functionality is highlighting text that is analyzed in different ways and kept in multi-fields. All fields in the `matched_fields` list must have the `term_vector` field set to `with_positions_offsets`. The field in which the matches are combined is the only loaded field, so it is beneficial to set its `store` option to `yes`. Valid only for the `fvh` highlighter. -no_match_size | Specifies the number of characters, starting from the beginning of the field, to return if there are no matching fragments to highlight. Default is 0. -phrase_limit | The number of matching phrases in a document that are considered. Limits the number of phrases to analyze by the `fvh` highlighter to avoid consuming a lot of memory. If `matched_fields` are used, `phrase_limit` specifies the number of phrases for each matched field. A higher `phrase_limit` leads to increased query time and more memory consumption. Valid only for the `fvh` highlighter. Default is 256. 
-max_analyzer_offset | Specifies the maximum number of characters to be analyzed by a highlight request. The remaining text will not be processed. If the text to be highlighted exceeds this offset, then an empty highlight is returned. The maximum number of characters that will be analyzed for a highlight request is defined by `index.highlight.max_analyzed_offset`. When this limit is reached, an error is returned. Set the `max_analyzer_offset` to a lower value than `index.highlight.max_analyzed_offset` to avoid the error. +`order` | The sort order for the highlighted fragments. Set `order` to `score` to sort fragments by relevance. Each highlighter uses a different algorithm to calculate relevance scores. Default is `none`. +`highlight_query` | Specifies that matches for a query other than the search query should be highlighted. The `highlight_query` option is useful when using a faster query to get document matches and a slower query (for example, `rescore_query`) to refine the results. We recommend including the search query as part of the `highlight_query`. +`matched_fields` | Combines matches from different fields to highlight one field. The most common use case for this functionality is highlighting text that is analyzed in different ways and kept in multi-fields. If using `fvh`, all fields in the `matched_fields` list must have the `term_vector` field set to `with_positions_offsets`. The field in which the matches are combined is the only loaded field, so it is beneficial to set its `store` option to `yes`. Valid only for the `fvh` and `unified` highlighters. +`no_match_size` | Specifies the number of characters, starting from the beginning of the field, to return if there are no matching fragments to highlight. Default is 0. +`phrase_limit` | The number of matching phrases in a document that are considered. Limits the number of phrases to be analyzed by the `fvh` highlighter in order to avoid consuming a lot of memory. If `matched_fields` are used, `phrase_limit` specifies the number of phrases for each matched field. A higher `phrase_limit` leads to increased query time and more memory consumption. Valid only for the `fvh` highlighter. Default is 256. +`max_analyzer_offset` | Specifies the maximum number of characters to be analyzed by a highlight request. The remaining text will not be processed. If the text to be highlighted exceeds this offset, then an empty highlight is returned. The maximum number of characters that will be analyzed for a highlight request is defined by `index.highlight.max_analyzed_offset`. When this limit is reached, an error is returned. Set the `max_analyzer_offset` to a lower value than `index.highlight.max_analyzed_offset` to avoid the error. +`options` | A global object containing highlighter-specific options. +`options.model_id` | The ID of the deployed ML model to use for highlighting. Required. Valid only for the `semantic` highlighter. The unified highlighter's sentence scanner splits sentences larger than `fragment_size` at the first word boundary after `fragment_size` is reached. To return whole sentences without splitting them, set `fragment_size` to 0. {: .note} @@ -959,9 +973,79 @@ The response lists documents that contain the word "bragging" first: } ``` +## Using the `semantic` highlighter + +The `semantic` highlighter uses the specified ML model to find passages in text that are semantically relevant to the search query, even if there are no exact keyword matches. Highlighting occurs at the sentence level. 
+ +To use the `semantic` highlighter, set the `type` to `semantic` in the `fields` object and provide the `model_id` of the deployed sentence transformer or question-answering model within the global `highlight.options` object. + +The following example uses a `neural` query to find documents related to "treatments for neurodegenerative diseases" and then applies semantic highlighting using the specified `sentence_model_id`: + +```json +POST neural-search-index/_search +{ + "_source": { + "excludes": ["text_embedding"] + }, + "query": { + "neural": { + "text_embedding": { + "query_text": "treatments for neurodegenerative diseases", + "model_id": "your-text-embedding-model-id", + "k": 5 + } + } + }, + "highlight": { + "fields": { + "text": { + "type": "semantic" + } + }, + "options": { + "model_id": "your-sentence-model-id" + } + } +} +``` +{% include copy-curl.html %} + +The response includes a `highlight` object for each hit, indicating the most semantically relevant sentence by emphasizing it with <em> tags. Note that model IDs are placeholders: + +```json +{ + "took": 628, + "timed_out": false, + "_shards": { ... }, + "hits": { + "total": { "value": 5, "relation": "eq" }, + "max_score": 0.4841726, + "hits": [ + { + "_index": "neural-search-index", + "_id": "srL7G5YBmDiZSe-G2pDc", + "_score": 0.4841726, + "_source": { + "text": "Alzheimer's disease is a progressive neurodegenerative disorder characterized by accumulation of amyloid-beta plaques and neurofibrillary tangles in the brain. Early symptoms include short-term memory impairment, followed by language difficulties, disorientation, and behavioral changes. While traditional treatments such as cholinesterase inhibitors and memantine provide modest symptomatic relief, they do not alter disease progression. Recent clinical trials investigating monoclonal antibodies targeting amyloid-beta, including aducanumab, lecanemab, and donanemab, have shown promise in reducing plaque burden and slowing cognitive decline. Early diagnosis using biomarkers such as cerebrospinal fluid analysis and PET imaging may facilitate timely intervention and improved outcomes." + }, + "highlight": { + "text": [ + "Alzheimer's disease is a progressive neurodegenerative disorder ... <em>Recent clinical trials investigating monoclonal antibodies targeting amyloid-beta, including aducanumab, lecanemab, and donanemab, have shown promise in reducing plaque burden and slowing cognitive decline.</em> Early diagnosis using biomarkers ..." + ] + } + }, + // ... other hits with highlighted sentences ... + ] + } +} +``` + +The highlighted fragments in the example response have been truncated for brevity. The `semantic` highlighter returns the full sentence containing the most relevant passage. + ## Query limitations Note the following limitations: -- When extracting terms to highlight, highlighters don’t reflect the Boolean logic of a query. Therefore, for some complex Boolean queries, such as nested Boolean queries and queries using `minimum_should_match`, OpenSearch may highlight terms that don’t correspond to query matches. -- The `fvh` highlighter does not support span queries. \ No newline at end of file +- When extracting terms to highlight, highlighters don't reflect the Boolean logic of a query. Therefore, for some complex Boolean queries, such as nested Boolean queries and queries using `minimum_should_match`, OpenSearch may highlight terms that don't correspond to query matches. +- The `fvh` highlighter does not support span queries. 
+- The `semantic` highlighter requires a deployed ML model specified by `model_id` in the `highlight.options`. It does not use traditional offset methods (postings, term vectors) and relies solely on model inference. \ No newline at end of file diff --git a/_search-plugins/searching-data/index.md b/_search-plugins/searching-data/index.md index 279958d97c3..3506d14b31f 100644 --- a/_search-plugins/searching-data/index.md +++ b/_search-plugins/searching-data/index.md @@ -1,22 +1,25 @@ --- layout: default -title: Searching data +title: Search options nav_order: 5 has_children: true has_toc: false redirect_from: /opensearch/ux/ --- -# Searching data +# Search options -What users expect from search engines has evolved over the years. Just returning relevant results quickly is no longer enough for most users. Now users seek methods that allow them to get even more relevant results, to sort and organize results, and to highlight their queries. OpenSearch includes many features, described in the following table, that enhance the search experience. +What users expect from search engines has evolved over the years. Just returning relevant results quickly is no longer enough for most users. Now users seek methods that allow them to get even more relevant results, to sort and organize results, and to highlight their queries. OpenSearch includes many search options, described in the following table, that enhance the search experience. -Feature | Description +Option | Description :--- | :--- [Autocomplete functionality]({{site.url}}{{site.baseurl}}/opensearch/search/autocomplete/) | Suggest phrases as the user types. [Did-you-mean functionality]({{site.url}}{{site.baseurl}}/opensearch/search/did-you-mean/) | Check spelling of phrases as the user types. [Paginate results]({{site.url}}{{site.baseurl}}/opensearch/search/paginate/) | Rather than a single, long list, separate search results into pages. +[Point in Time]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/point-in-time/) | Run different queries against a dataset that is fixed in time. [Sort results]({{site.url}}{{site.baseurl}}/opensearch/search/sort/) | Allow sorting of results by different criteria. +[Filter results]({{site.url}}{{site.baseurl}}/search-plugins/filter-search/) | Filter search results. +[Collapse results]({{site.url}}{{site.baseurl}}/search-plugins/collapse-search/) | Collapse search results. [Highlight query matches]({{site.url}}{{site.baseurl}}/opensearch/search/highlight/) | Highlight the search term in the results. [Retrieve inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/) | Retrieve underlying hits in nested and parent-join objects. 
-[Retrieve specific fields]({{site.url}}{{site.baseurl}}search-plugins/searching-data/retrieve-specific-fields/) | Retrieve only the specific fields +[Retrieve specific fields]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/retrieve-specific-fields/) | Retrieve only the specific fields diff --git a/_search-plugins/searching-data/inner-hits.md b/_search-plugins/searching-data/inner-hits.md index 38fc7a491d0..7bdefcdf562 100644 --- a/_search-plugins/searching-data/inner-hits.md +++ b/_search-plugins/searching-data/inner-hits.md @@ -1,12 +1,12 @@ --- layout: default -title: Inner hits -parent: Searching data +title: Retrieve inner hits +parent: Search options has_children: false -nav_order: 70 +nav_order: 75 --- -# Inner hits +# Retrieve inner hits In OpenSearch, when you perform a search using [nested objects]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) or [parent-join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/), the underlying hits (nested inner objects or child documents) are hidden by default. You can retrieve inner hits by using the `inner_hits` parameter in the search query. diff --git a/_search-plugins/searching-data/paginate.md b/_search-plugins/searching-data/paginate.md index 6040065991f..66873b2572b 100644 --- a/_search-plugins/searching-data/paginate.md +++ b/_search-plugins/searching-data/paginate.md @@ -1,13 +1,13 @@ --- layout: default title: Paginate results -parent: Searching data +parent: Search options nav_order: 10 redirect_from: - /opensearch/search/paginate/ --- -## Paginate results +# Paginate results You can use the following methods to paginate search results in OpenSearch: diff --git a/_search-plugins/searching-data/point-in-time-api.md b/_search-plugins/searching-data/point-in-time-api.md index 6a3d8670522..d0b03290085 100644 --- a/_search-plugins/searching-data/point-in-time-api.md +++ b/_search-plugins/searching-data/point-in-time-api.md @@ -4,7 +4,7 @@ title: Point in Time API nav_order: 59 has_children: false parent: Point in Time -grand_parent: Searching data +grand_parent: Search options redirect_from: - /opensearch/point-in-time-api/ - /search-plugins/point-in-time-api/ @@ -28,7 +28,7 @@ Introduced 2.4 Creates a PIT. The `keep_alive` query parameter is required; it specifies how long to keep a PIT. -### Path and HTTP methods +### Endpoints ```json POST /<target_indexes>/_search/point_in_time?keep_alive=1h&routing=&expand_wildcards=&preference= diff --git a/_search-plugins/searching-data/point-in-time.md b/_search-plugins/searching-data/point-in-time.md index ee09354c0fc..485c2c2c99b 100644 --- a/_search-plugins/searching-data/point-in-time.md +++ b/_search-plugins/searching-data/point-in-time.md @@ -1,7 +1,7 @@ --- layout: default title: Point in Time -parent: Searching data +parent: Search options nav_order: 20 has_children: true has_toc: false @@ -37,6 +37,10 @@ The create PIT operation returns a PIT ID, which you can use to run multiple que In case of a cluster or node failure, all PIT data is lost. {: .note} +### PIT in SQL + +The [SQL plugin]({{site.url}}{{site.baseurl}}/search-plugins/sql/index/) also supports pagination using PIT. When the `plugin.sql.pagination.api` setting is enabled (the default), SQL search queries in OpenSearch automatically use PIT internally. For more information, see [Pagination in SQL]({{site.url}}{{site.baseurl}}/search-plugins/sql/sql-ppl-api/#paginating-results). 
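For example, the following SQL request returns paginated results; when `plugin.sql.pagination.api` is enabled, the plugin creates and manages the PIT for you internally. This is a minimal sketch: the `accounts` index name and the `fetch_size` value are illustrative only.

```json
POST _plugins/_sql
{
  "query": "SELECT firstname, lastname FROM accounts",
  "fetch_size": 5
}
```
{% include copy-curl.html %}

The response includes a `cursor` token that you can send back to the same endpoint to retrieve the next page of results.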
+ ## Pagination with PIT and search_after When you run a query with a PIT ID, you can use the `search_after` parameter to retrieve the next page of results. This gives you control over the order of documents in the pages of results. diff --git a/_search-plugins/searching-data/retrieve-specific-fields.md b/_search-plugins/searching-data/retrieve-specific-fields.md index ce860470dc8..a7a168cd503 100644 --- a/_search-plugins/searching-data/retrieve-specific-fields.md +++ b/_search-plugins/searching-data/retrieve-specific-fields.md @@ -1,8 +1,8 @@ --- layout: default -parent: Searching data +parent: Search options title: Retrieve specific fields -nav_order: 60 +nav_order: 80 --- # Retrieve specific fields @@ -68,7 +68,7 @@ If `_source` is disabled in the index mappings, [searching with docvalue fields] You can list the fields you want to retrieve in the `fields` parameter. Wildcard patterns are also accepted: ```json -GET "/index1/_search?pretty" +GET /index1/_search { "_source": false, "fields": ["age", "nam*"], @@ -169,7 +169,7 @@ The following example demonstrates how to use the `docvalue_fields` parameter. 1. Create an index with the following mappings: ```json - PUT my_index + PUT /my_index { "mappings": { "properties": { @@ -186,15 +186,18 @@ The following example demonstrates how to use the `docvalue_fields` parameter. 2. Index the following documents into the newly created index: ```json - POST my_index/_doc/1 + POST /my_index/_doc/1 { "title": "OpenSearch Basics", "author": "John Doe", "publication_date": "2021-01-01", "price": 29.99 } + ``` + {% include copy-curl.html %} - POST my_index/_doc/2 + ```json + POST /my_index/_doc/2 { "title": "Advanced OpenSearch", "author": "Jane Smith", @@ -207,7 +210,7 @@ The following example demonstrates how to use the `docvalue_fields` parameter. 3. Retrieve only the `author` and `publication_date` fields using `docvalue_fields`: ```json - POST my_index/_search + POST /my_index/_search { "_source": false, "docvalue_fields": ["author", "publication_date"], @@ -259,7 +262,7 @@ In OpenSearch, if you want to retrieve doc values for nested objects, you cannot 1. Define the index mappings: ```json - PUT my_index + PUT /my_index { "mappings": { "properties": { @@ -282,7 +285,7 @@ In OpenSearch, if you want to retrieve doc values for nested objects, you cannot 2. Index your data: ```json - POST my_index/_doc/1 + POST /my_index/_doc/1 { "title": "OpenSearch Basics", "author": "John Doe", @@ -305,7 +308,7 @@ In OpenSearch, if you want to retrieve doc values for nested objects, you cannot 3. Perform a search with `inner_hits` and `docvalue_fields`: ```json - POST my_index/_search + POST /my_index/_search { "query": { "nested": { @@ -405,7 +408,7 @@ Unlike `_source`, `stored_fields` must be explicitly defined in the mappings for 1. Create an index with the following mappings: ```json - PUT my_index + PUT /my_index { "mappings": { "properties": { @@ -432,14 +435,17 @@ Unlike `_source`, `stored_fields` must be explicitly defined in the mappings for 2. Index your data: ```json - POST my_index/_doc/1 + POST /my_index/_doc/1 { "title": "OpenSearch Basics", "author": "John Doe", "publication_date": "2022-01-01", "price": 29.99 } + ``` + {% include copy-curl.html %} + ```json POST my_index/_doc/2 { "title": "Advanced OpenSearch", @@ -453,7 +459,7 @@ Unlike `_source`, `stored_fields` must be explicitly defined in the mappings for 3. 
Perform a search with `stored_fields`: ```json - POST my_index/_search + POST /my_index/_search { "_source": false, "stored_fields": ["title", "author"], @@ -508,7 +514,7 @@ In OpenSearch, if you want to retrieve `stored_fields` for nested objects, you c 1. Create an index with the following mappings: ```json - PUT my_index + PUT /my_index { "mappings": { "properties": { @@ -531,7 +537,7 @@ In OpenSearch, if you want to retrieve `stored_fields` for nested objects, you c 2. Index your data: ```json - POST my_index/_doc/1 + POST /my_index/_doc/1 { "title": "OpenSearch Basics", "author": "John Doe", @@ -554,7 +560,7 @@ In OpenSearch, if you want to retrieve `stored_fields` for nested objects, you c 3. Perform a search with `inner_hits` and `stored_fields`: ```json - POST my_index/_search + POST /my_index/_search { "_source": false, "query": { @@ -641,7 +647,7 @@ You can include or exclude specific fields from the `_source` field in the searc 1. Index your data: ```json - PUT my_index/_doc/1 + PUT /my_index/_doc/1 { "title": "OpenSearch Basics", "author": "John Doe", @@ -654,7 +660,7 @@ You can include or exclude specific fields from the `_source` field in the searc 2. Perform a search using source filtering: ```json - POST my_index/_search + POST /my_index/_search { "_source": ["title", "author"], "query": { @@ -694,7 +700,7 @@ The following is the expected response: You can choose to exclude fields by using the `"excludes"` parameter in a search request, as shown in the following example: ```json -POST my_index/_search +POST /my_index/_search { "_source": { "excludes": ["price"] @@ -854,26 +860,26 @@ If you have an index of products, where each product document contains the `pric 2. Use the `script_fields` parameter to include a custom field called `discounted_price` in the search results. 
This field will be calculated based on the `price` and `discount_percentage` fields using a script: -```json -GET /products/_search -{ - "_source": ["product_id", "name", "price", "discount_percentage"], - "query": { - "match": { - "category": "Electronics" - } - }, - "script_fields": { - "discounted_price": { - "script": { - "lang": "painless", - "source": "doc[\"price\"].value * (1 - doc[\"discount_percentage\"].value / 100)" + ```json + GET /products/_search + { + "_source": ["product_id", "name", "price", "discount_percentage"], + "query": { + "match": { + "category": "Electronics" + } + }, + "script_fields": { + "discounted_price": { + "script": { + "lang": "painless", + "source": "doc[\"price\"].value * (1 - doc[\"discount_percentage\"].value / 100)" + } + } } } - } -} -``` -{% include copy-curl.html %} + ``` + {% include copy-curl.html %} You should receive the following response: diff --git a/_search-plugins/searching-data/search-shard-routing.md b/_search-plugins/searching-data/search-shard-routing.md index 77c5fc7ce40..65da7581d66 100644 --- a/_search-plugins/searching-data/search-shard-routing.md +++ b/_search-plugins/searching-data/search-shard-routing.md @@ -1,8 +1,8 @@ --- layout: default -parent: Searching data +parent: Search options title: Search shard routing -nav_order: 70 +nav_order: 90 --- # Search shard routing diff --git a/_search-plugins/searching-data/sort.md b/_search-plugins/searching-data/sort.md index fa4875d32f5..804b1297211 100644 --- a/_search-plugins/searching-data/sort.md +++ b/_search-plugins/searching-data/sort.md @@ -1,13 +1,13 @@ --- layout: default title: Sort results -parent: Searching data -nav_order: 22 +parent: Search options +nav_order: 30 redirect_from: - /opensearch/search/sort/ --- -## Sort results +# Sort results Sorting allows your users to sort results in a way that’s most meaningful to them. diff --git a/_search-plugins/semantic-search.md b/_search-plugins/semantic-search.md deleted file mode 100644 index 259685fe3de..00000000000 --- a/_search-plugins/semantic-search.md +++ /dev/null @@ -1,300 +0,0 @@ ---- -layout: default -title: Semantic search -nav_order: 35 -has_children: false -redirect_from: - - /search-plugins/neural-text-search/ ---- - -# Semantic search - -Semantic search considers the context and intent of a query. In OpenSearch, semantic search is facilitated by neural search with text embedding models. Semantic search creates a dense vector (a list of floats) and ingests data into a k-NN index. - -**PREREQUISITE**<br> -Before using semantic search, you must set up a text embedding model. For more information, see [Choosing a model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/integrating-ml-models/#choosing-a-model). -{: .note} - -## Using semantic search - -To use semantic search, follow these steps: - -1. [Create an ingest pipeline](#step-1-create-an-ingest-pipeline). -1. [Create an index for ingestion](#step-2-create-an-index-for-ingestion). -1. [Ingest documents into the index](#step-3-ingest-documents-into-the-index). -1. [Search the index using neural search](#step-4-search-the-index-using-neural-search). - -## Step 1: Create an ingest pipeline - -To generate vector embeddings, you need to create an [ingest pipeline]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/index/) that contains a [`text_embedding` processor]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/processors/text-embedding/), which will convert the text in a document field to vector embeddings. 
The processor's `field_map` determines the input fields from which to generate vector embeddings and the output fields in which to store the embeddings. - -The following example request creates an ingest pipeline where the text from `passage_text` will be converted into text embeddings and the embeddings will be stored in `passage_embedding`: - -```json -PUT /_ingest/pipeline/nlp-ingest-pipeline -{ - "description": "A text embedding pipeline", - "processors": [ - { - "text_embedding": { - "model_id": "bQ1J8ooBpBj3wT4HVUsb", - "field_map": { - "passage_text": "passage_embedding" - } - } - } - ] -} -``` -{% include copy-curl.html %} - -To split long text into passages, use the `text_chunking` ingest processor before the `text_embedding` processor. For more information, see [Text chunking]({{site.url}}{{site.baseurl}}/search-plugins/text-chunking/). - -## Step 2: Create an index for ingestion - -In order to use the text embedding processor defined in your pipeline, create a k-NN index, adding the pipeline created in the previous step as the default pipeline. Ensure that the fields defined in the `field_map` are mapped as correct types. Continuing with the example, the `passage_embedding` field must be mapped as a k-NN vector with a dimension that matches the model dimension. Similarly, the `passage_text` field should be mapped as `text`. - -The following example request creates a k-NN index that is set up with a default ingest pipeline: - -```json -PUT /my-nlp-index -{ - "settings": { - "index.knn": true, - "default_pipeline": "nlp-ingest-pipeline" - }, - "mappings": { - "properties": { - "id": { - "type": "text" - }, - "passage_embedding": { - "type": "knn_vector", - "dimension": 768, - "method": { - "engine": "lucene", - "space_type": "l2", - "name": "hnsw", - "parameters": {} - } - }, - "passage_text": { - "type": "text" - } - } - } -} -``` -{% include copy-curl.html %} - -For more information about creating a k-NN index and its supported methods, see [k-NN index]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/). - -## Step 3: Ingest documents into the index - -To ingest documents into the index created in the previous step, send the following requests: - -```json -PUT /my-nlp-index/_doc/1 -{ - "passage_text": "Hello world", - "id": "s1" -} -``` -{% include copy-curl.html %} - -```json -PUT /my-nlp-index/_doc/2 -{ - "passage_text": "Hi planet", - "id": "s2" -} -``` -{% include copy-curl.html %} - -Before the document is ingested into the index, the ingest pipeline runs the `text_embedding` processor on the document, generating text embeddings for the `passage_text` field. The indexed document includes the `passage_text` field, which contains the original text, and the `passage_embedding` field, which contains the vector embeddings. - -## Step 4: Search the index using neural search - -To perform vector search on your index, use the `neural` query clause either in the [k-NN plugin API]({{site.url}}{{site.baseurl}}/search-plugins/knn/api/#search-for-a-model) or [Query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index/) queries. You can refine the results by using a [k-NN search filter]({{site.url}}{{site.baseurl}}/search-plugins/knn/filter-search-knn/). - -The following example request uses a Boolean query to combine a filter clause and two query clauses---a neural query and a `match` query. 
The `script_score` query assigns custom weights to the query clauses: - -```json -GET /my-nlp-index/_search -{ - "_source": { - "excludes": [ - "passage_embedding" - ] - }, - "query": { - "bool": { - "filter": { - "wildcard": { "id": "*1" } - }, - "should": [ - { - "script_score": { - "query": { - "neural": { - "passage_embedding": { - "query_text": "Hi world", - "model_id": "bQ1J8ooBpBj3wT4HVUsb", - "k": 100 - } - } - }, - "script": { - "source": "_score * 1.5" - } - } - }, - { - "script_score": { - "query": { - "match": { - "passage_text": "Hi world" - } - }, - "script": { - "source": "_score * 1.7" - } - } - } - ] - } - } -} -``` -{% include copy-curl.html %} - -The response contains the matching document: - -```json -{ - "took" : 36, - "timed_out" : false, - "_shards" : { - "total" : 1, - "successful" : 1, - "skipped" : 0, - "failed" : 0 - }, - "hits" : { - "total" : { - "value" : 1, - "relation" : "eq" - }, - "max_score" : 1.2251667, - "hits" : [ - { - "_index" : "my-nlp-index", - "_id" : "1", - "_score" : 1.2251667, - "_source" : { - "passage_text" : "Hello world", - "id" : "s1" - } - } - ] - } -} -``` - -## Setting a default model on an index or field - -A [`neural`]({{site.url}}{{site.baseurl}}/query-dsl/specialized/neural/) query requires a model ID for generating vector embeddings. To eliminate passing the model ID with each neural query request, you can set a default model on a k-NN index or a field. - -First, create a [search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/) with a [`neural_query_enricher`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/neural-query-enricher/) request processor. To set a default model for an index, provide the model ID in the `default_model_id` parameter. To set a default model for a specific field, provide the field name and the corresponding model ID in the `neural_field_default_id` map. 
If you provide both `default_model_id` and `neural_field_default_id`, `neural_field_default_id` takes precedence: - -```json -PUT /_search/pipeline/default_model_pipeline -{ - "request_processors": [ - { - "neural_query_enricher" : { - "default_model_id": "bQ1J8ooBpBj3wT4HVUsb", - "neural_field_default_id": { - "my_field_1": "uZj0qYoBMtvQlfhaYeud", - "my_field_2": "upj0qYoBMtvQlfhaZOuM" - } - } - } - ] -} -``` -{% include copy-curl.html %} - -Then set the default model for your index: - -```json -PUT /my-nlp-index/_settings -{ - "index.search.default_pipeline" : "default_model_pipeline" -} -``` -{% include copy-curl.html %} - -You can now omit the model ID when searching: - -```json -GET /my-nlp-index/_search -{ - "_source": { - "excludes": [ - "passage_embedding" - ] - }, - "query": { - "neural": { - "passage_embedding": { - "query_text": "Hi world", - "k": 100 - } - } - } -} -``` -{% include copy-curl.html %} - -The response contains both documents: - -```json -{ - "took" : 41, - "timed_out" : false, - "_shards" : { - "total" : 1, - "successful" : 1, - "skipped" : 0, - "failed" : 0 - }, - "hits" : { - "total" : { - "value" : 2, - "relation" : "eq" - }, - "max_score" : 1.22762, - "hits" : [ - { - "_index" : "my-nlp-index", - "_id" : "2", - "_score" : 1.22762, - "_source" : { - "passage_text" : "Hi planet", - "id" : "s2" - } - }, - { - "_index" : "my-nlp-index", - "_id" : "1", - "_score" : 1.2251667, - "_source" : { - "passage_text" : "Hello world", - "id" : "s1" - } - } - ] - } -} -``` \ No newline at end of file diff --git a/_search-plugins/sql/datatypes.md b/_search-plugins/sql/datatypes.md index c2eb4e38606..c00f27c3e40 100644 --- a/_search-plugins/sql/datatypes.md +++ b/_search-plugins/sql/datatypes.md @@ -25,7 +25,6 @@ text | text | VARCHAR date | timestamp | TIMESTAMP date_nanos | timestamp | TIMESTAMP ip | ip | VARCHAR -date | timestamp | TIMESTAMP binary | binary | VARBINARY object | struct | STRUCT nested | array | STRUCT diff --git a/_search-plugins/sql/limitation.md b/_search-plugins/sql/limitation.md index ac4a6ed6191..6fadde1791c 100644 --- a/_search-plugins/sql/limitation.md +++ b/_search-plugins/sql/limitation.md @@ -29,10 +29,44 @@ FROM ( But, if the outer query has `GROUP BY` or `ORDER BY`, then it's not supported. -## JOIN does not support aggregations on the joined result +## JOIN queries -The `join` query does not support aggregations on the joined result. -For example, e.g. `SELECT depo.name, avg(empo.age) FROM empo JOIN depo WHERE empo.id == depo.id GROUP BY depo.name` is not supported. +Because OpenSearch doesn't natively support relational operations, `JOIN` queries are supported on a best-effort basis. + +### JOIN does not support aggregations on the joined result + +The `JOIN` query does not support aggregations on the joined result. + +For example, `SELECT depo.name, avg(empo.age) FROM empo JOIN depo WHERE empo.id = depo.id GROUP BY depo.name` is not supported. + +### Performance + +`JOIN` queries are prone to expensive index scanning operations. + +`JOIN` queries may experience performance issues when working with result sets larger than 5 million matching records. +To improve `JOIN` performance, reduce the number of records being joined by filtering your data first. 
For example, limit the join to a specific range of key values: + +```sql +SELECT l.key, l.spanId, r.spanId + FROM logs_left AS l + JOIN logs_right AS r + ON l.key = r.key + WHERE l.key >= 17491637400000 + AND l.key < 17491637500000 + AND r.key >= 17491637400000 + AND r.key < 17491637500000 + LIMIT 10 +``` +{% include copy.html %} + +By default, JOIN queries will automatically terminate after 60 seconds to prevent excessive resource consumption. You can adjust this timeout period using a hint in your query. For example, to set a 5-minute (300-second) timeout, use the following code: + +```sql +SELECT /*! JOIN_TIME_OUT(300) */ left.a, right.b FROM left JOIN right ON left.id = right.id; +``` +{% include copy.html %} + +These performance restrictions don't apply when [querying external data sources]({{site.url}}{{site.baseurl}}/dashboards/management/query-data-source/). ## Pagination only supports basic queries @@ -77,10 +111,14 @@ The query with `aggregation` and `join` does not support pagination for now. ## Query processing engines -The SQL plugin has two query processing engines, `V1` and `V2`. Most of the features are supported by both engines, but only the new engine is actively being developed. A query that is first executed on the `V2` engine falls back to the `V1` engine in case of failure. If a query is supported in `V2` but not included in `V1`, the query will fail with an error response. +Before OpenSearch 3.0.0, the SQL plugin used two query processing engines: `V1` and `V2`. Both engines supported most features, but only `V2` was under active development. When you ran a query, the plugin first tried to execute it using the `V2` engine and fell back to `V1` if execution failed. If a query was supported in `V2` but not in `V1`, the query would fail and return an error response. + +Starting with OpenSearch 3.0.0, the SQL plugin introduced a new query engine (`V3`) that leverages Apache Calcite for query optimization and execution. Because `V3` is an experimental feature in OpenSearch 3.0.0, it's disabled by default. To enable this new engine, set `plugins.calcite.enabled` to `true`. Similar to the `V2` to `V1` fallback logic, when you run a query, the plugin first tries to execute it using the `V3` engine and falls back to `V2` if execution fails. For more information about `V3`, see [PPL Engine V3](https://github.com/opensearch-project/sql/blob/main/docs/dev/intro-v3-engine.md). ### V1 engine limitations +The `V1` query engine is the original SQL processing engine in OpenSearch. While it's been largely replaced by newer engines, understanding its limitations helps explain certain query behaviors, especially when queries fall back from `V2` to `V1`. The following limitations apply specifically to the `V1` engine: + * The select literal expression without `FROM` clause is not supported. For example, `SELECT 1` is not supported. * The `WHERE` clause does not support expressions. For example, `SELECT FlightNum FROM opensearch_dashboards_sample_data_flights where (AvgTicketPrice + 100) <= 1000` is not supported. * Most [relevancy search functions]({{site.url}}{{site.baseurl}}/search-plugins/sql/full-text/) are implemented in the `V2` engine only. @@ -89,12 +127,72 @@ Such queries are successfully executed by the `V2` engine unless they have `V1`- ### V2 engine limitations +The `V2` query engine handles most modern SQL query patterns. However, it has certain limitations that may affect your query development, particularly for complex analytical workloads. 
Understanding these limitations can help you design queries that work optimally with OpenSearch: + * The [cursor feature](#pagination-only-supports-basic-queries) is supported by the `V1` engine only. * For support of `cursor`/`pagination` in the `V2` engine, track [GitHub issue #656](https://github.com/opensearch-project/sql/issues/656). * `json` formatted output is supported in `V1` engine only. * The `V2` engine does not track query execution time, so slow queries are not reported. * The `V2` query engine not only runs queries in the OpenSearch engine but also supports post-processing for complex queries. Accordingly, the `explain` output is no longer OpenSearch domain-specific language (DSL) but also includes query plan information from the `V2` query engine. -Suggested change * The `V2` query engine does not support aggregation queries such as `histogram`, `date_histogram`, `percentiles`, `topHits`, `stats`, `extended_stats`, `terms`, or `range`. * JOINs and sub-queries are not supported. To stay up to date on the development for JOINs and sub-queries, track [GitHub issue #1441](https://github.com/opensearch-project/sql/issues/1441) and [GitHub issue #892](https://github.com/opensearch-project/sql/issues/892). -* PartiQL syntax for `nested` queries are not supported. Additionally, arrays of objects and primitive types return the first index of the array, while in `V1` they return the entire array as a JSON object. +* OpenSearch does not natively support the array data type but does allow multi-value fields implicitly. The SQL/PPL plugin adheres strictly to the data type semantics defined in index mappings. When parsing OpenSearch responses, it expects data to match the declared type and does not interpret all data in an array. If the [`plugins.query.field_type_tolerance`](https://github.com/opensearch-project/sql/blob/main/docs/user/admin/settings.rst#plugins-query-field-type-tolerance) setting is enabled, the SQL/PPL plugin handles array datasets by returning scalar data types, allowing basic queries (for example, `SELECT * FROM tbl WHERE condition`). However, using multi-value fields in expressions or functions will result in exceptions. If this setting is disabled or not set, only the first element of an array is returned, preserving the default behavior. +* PartiQL syntax for `nested` queries is not supported. + +### V3 engine limitations and restrictions + +The `V3` query engine provides enhanced query processing capabilities using Apache Calcite. As an experimental feature in OpenSearch 3.0.0, it has certain limitations and behavioral differences you should be aware of when developing queries. These limitations fall into three categories: new restrictions, unsupported functionalities, and behavior changes. + +#### Restrictions + +The `V3` engine introduces stricter validation for OpenSearch metadata fields. When working with commands that manipulate field names, be aware of the following restrictions: + +- `eval` won't allow you to use [OpenSearch metadata fields]({{site.url}}{{site.baseurl}}/field-types/metadata-fields/index/) as the fields. +- `rename` won't allow renaming to an [OpenSearch metadata field]({{site.url}}{{site.baseurl}}/field-types/metadata-fields/index/). +- `as` won't allow you to use an [OpenSearch metadata field]({{site.url}}{{site.baseurl}}/field-types/metadata-fields/index/) as the alias name. + +### Unsupported functionalities + +The `V3` engine doesn't support all the functionality available in previous engines. 
For the following features, the query will automatically be forwarded to the `V2` query engine: + +- `trendline` +- `show datasource` +- `describe` +- `top` and `rare` +- `fillnull` +- `patterns` +- `dedup` with `consecutive=true` +- Search-relevant commands: + - `AD` + - `ML` + - `Kmeans` +- Commands with the `fetch_size` parameter +- Queries with metadata fields, such as `_id` or `_doc` +- JSON-relevant functions: + - `cast to json` + - `json` + - `json_valid` +- Search-relevant functions: + - `match` + - `match_phrase` + - `match_bool_prefix` + - `match_phrase_prefix` + - `simple_query_string` + - `query_string` + - `multi_match` + +#### V2 compared to V3 + +Because the `V3` engine uses a different implementation internally, some behaviors have changed from previous versions. The behaviors in `V3` are considered correct, but they may produce different results than the same queries in `V2`. The following table highlights these differences. + +Item | `V2` | `V3` +:--- | :--- | :--- +Return type of `timestampdiff` | `timestamp` | `int` +Return type of `regexp` | `int` | `boolean` +Return type of `count`,`dc`,`distinct_count` | `int` | `bigint` +Return type of `ceiling`,`floor`,`sign` | `int` | Same type with input +`like(firstname, 'Ambe_')` on value "Amber JOHnny" | `true` | `false` +`like(firstname, 'Ambe*')` on value "Amber JOHnny" | `true` | `false` +`cast(firstname as boolean)` | `false` | `null` +Sum of multiple `null` values when `pushdown` is enabled | `0` | `null` +`percentile(null, 50)` | `0` | `null` diff --git a/_search-plugins/sql/ppl/functions.md b/_search-plugins/sql/ppl/functions.md index d192799f2e7..fe8f5f9f0cd 100644 --- a/_search-plugins/sql/ppl/functions.md +++ b/_search-plugins/sql/ppl/functions.md @@ -771,3 +771,198 @@ sepal_length_in_cm | sepal_width_in_cm | petal_length_in_cm | petal_width_in_cm | 5.1 | 3.5 | 1.4 | 0.2 | 1 | 5.6 | 3.0 | 4.1 | 1.3 | 0 | 6.7 | 2.5 | 5.8 | 1.8 | 2 + +## join + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning} + +You can combine two datasets using the `join` command. The left side can be an index or results from piped commands, while the right side can be either an index or a subquery. + +### Syntax + +```sql +[join-type] join [left-alias] [right-alias] on <join-criteria> <right-dataset> +* joinCriteria: mandatory. It could be any comparison expression. +* right-dataset: mandatory. Right dataset could be either an index or a subquery with/without alias. +``` + +Field | Description | Type | Required | Default +:--- |:--- | :--- | :--- | :--- +`join-type` | The type of join to perform. Valid values are `inner`, `left`, `right`, `full`, `cross`, `semi`, and `anti`. | `String` | No | `inner` +`left-alias` | The subquery alias to use with the left join side in order to avoid ambiguous naming. Fixed pattern: `left = <left-alias>` | `String` | No | N/A +`right-alias` | The subquery alias to use with the right join side in order to avoid ambiguous naming. Fixed pattern: `right = <right-alias>` | `String` | No | N/A +`join-criteria` | Any comparison expression. | `String` | Yes | N/A +`right-dataset` | Either an index or a subquery with/without an alias. | `String` | Yes | N/A + +The following examples use the `state_country` and `occupation` indexes. 
+ +`state_country`: + +| Name | Age | State | Country +:--- | :--- | :--- | :--- +| Jake | 70 | California | USA +| Hello | 30 | New York | USA +| John | 25 | Ontario | Canada +| Jane | 20 | Quebec | Canada +| Jim | 27 | B.C. | Canada +| Peter | 57 | B.C. | Canada +| Rick | 70 | B.C. | Canada +| David | 40 | Washington | USA + +`occupation`: + +| Name | Occupation | Country | Salary +:--- | :--- | :--- | :--- +| Jake | Engineer | England | 100000 +| Hello | Artist | USA | 70000 +| John | Doctor | Canada | 120000 +| David | Doctor | USA | 120000 +| David | Unemployed | Canada | 0 +| Jane | Scientist | Canada | 90000 + +**Example 1: Join two indexes** + +The following example performs an inner join between two indexes: + +```sql +search source = state_country +| inner join left=a right=b ON a.name = b.name occupation +| stats avg(salary) by span(age, 10) as age_span, b.country +``` + +avg(salary) | age_span | b.country +:--- | :--- | :--- +120000.0 | 40 | USA +105000.0 | 20 | Canada +0.0 | 40 | Canada +70000.0 | 30 | USA +100000.0 | 70 | England + +**Example 2: Join with a subsearch** + +The following example performs a left join with a subsearch: + +```sql +search source = state_country as a +| where country = 'USA' OR country = 'England' +| left join on a.name = b.name [ + source = occupation + | where salary > 0 + | fields name, country, salary + | sort salary + | head 3 + ] as b +| stats avg(salary) by span(age, 10) as age_span, b.country +``` + +avg(salary) | age_span | b.country +:--- | :--- | :--- +null | 40 | null +70000.0 | 30 | USA +100000.0 | 70 | England + +### Limitations + +The `join` command works only when `plugins.calcite.enabled` is set to `true`. + +## lookup + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning} + +The `lookup` command enriches your search data by adding or replacing data from a lookup index (dimension table). You can extend index fields with values from a dimension table or append/replace values when a lookup condition is matched. As an alternative to the `join` command, the `lookup` command is more suitable for enriching the source data with a static dataset. + +### Syntax + +```sql +lookup <lookup-index> (<lookup-mapping-field> [as <source-mapping-field>])... [(replace | append) (<input-field> [AS <output-field>])...] +``` + +Field | Description | Required | Default +:--- | :--- | :--- | :--- +`lookup-index` | The name of lookup index (dimension table). | Yes | N/A +`lookup-mapping-field`| A mapping key in the `lookup-index`, analogous to a `join` key from the right table. You can specify multiple `lookup-mapping-field` values with commas. | Yes | N/A +`source-mapping-field`| A mapping key from the source (left side), analogous to a `join` key from the left side. | No | `lookup-mapping-field` +`replace` \| `append` | The output strategies. When specifying `replace`, matched values in the `lookup-index` field overwrite the values in the results. If you specify `append`, matched values in the `lookup-index` field only append to the missing values in the results. | No | `replace` +`input-field` | A field in `lookup-index` where matched values are applied to the result output. You can specify multiple `input-field` values with commas. 
If you don't specify any `input-field`, all fields except `lookup-mapping-field` from `lookup-index` are matched values that are applied to the result output. | No | N/A +`output-field` | A field of output. You can specify zero or multiple `output-field` values. If you specify `output-field` with an existing field name in the source query, its values will be replaced or appended by the matched values from `input-field`. If the field specified in `output-field` is a new field, an extended new field will be applied to the results. | No | `input-field` + +The following examples use the `workers` and `work_information` indexes. + +`workers`: + +| ID | Name | Occupation | Country | Salary +:--- | :--- | :--- | :--- | :--- +| 1000 | Jake | Engineer | England | 100000 +| 1001 | Hello | Artist | USA | 70000 +| 1002 | John | Doctor | Canada | 120000 +| 1003 | David | Doctor | N/A | 120000 +| 1004 | David | N/A | Canada | 0 +| 1005 | Jane | Scientist | Canada | 90000 + +`work_information`: + +| UID | Name | Department | Occupation +:--- | :--- | :--- | :--- +| 1000 | Jake | IT | Engineer | +| 1002 | John | DATA | Scientist | +| 1003 | David | HR | Doctor | +| 1005 | Jane | DATA | Engineer | +| 1006 | Tom | SALES | Artist | + +**Example 1: Look up workers and return the corresponding department** + +The following example looks up workers and returns the corresponding department: + +```sql +source = workers | lookup work_information uid as id append department +``` + +| id | name | occupation | country | salary | department +:--- | :--- | :--- | :--- | :--- | :--- +1000 | Jake | Engineer | England | 100000 | IT +1001 | Hello | Artist | USA | 70000 | Null +1002 | John | Doctor | Canada | 120000 | DATA +1003 | David | Doctor | Null | 120000 | HR +1004 | David | Null | Canada | 0 | Null +1005 | Jane | Scientist | Canada | 90000 | DATA + + +**Example 2: Look up workers and replace their occupation and department** + +The following example looks up workers and replaces their occupation and department using their `work_information`: + +```sql +source = workers | lookup work_information uid as id, name +``` + +id | name | occupation | country | salary | department +:--- | :--- |:-----------| :--- | :--- | :--- +1000 | Jake | Engineer | England | 100000 | IT +1001 | Hello | null | USA | 70000 | null +1002 | John | Scientist | Canada | 120000 | DATA +1003 | David | Doctor | null | 120000 | HR +1004 | David | null | Canada | 0 | null +1005 | Jane | Engineer | Canada | 90000 | DATA + +**Example 3: Look up workers and create a new occupation field** + +The following example looks up workers and appends their occupation from `work_information` as a new field: + +```sql +source = workers | lookup work_information name replace occupation as new_occupation +``` + +id | name | occupation | country | salary | new_occupation +:--- | :--- |:-----------| :--- | :--- | :--- +1000 | Jake | Engineer | England | 100000 | Engineer +1001 | Hello | Artist | USA | 70000 | null +1002 | John | Doctor | Canada | 120000 | Scientist +1003 | David | Doctor | null | 120000 | Doctor +1004 | David | null | Canada | 0 | Doctor +1005 | Jane | Scientist | Canada | 90000 | Engineer + +### Limitations + +The `lookup` command works only when `plugins.calcite.enabled` is set to `true`. 
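Both the `join` and `lookup` commands therefore require the experimental Calcite engine to be turned on before you run the preceding examples. The following sketch shows one way to enable it dynamically through the cluster settings API; this assumes the setting is dynamic in your version, and you can use `persistent` instead of `transient` if you want the change to survive a restart.

```json
PUT _cluster/settings
{
  "transient": {
    "plugins.calcite.enabled": true
  }
}
```
{% include copy-curl.html %}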
diff --git a/_search-plugins/sql/ppl/subsearch.md b/_search-plugins/sql/ppl/subsearch.md new file mode 100644 index 00000000000..bfddf3c04b5 --- /dev/null +++ b/_search-plugins/sql/ppl/subsearch.md @@ -0,0 +1,244 @@ +--- +layout: default +title: Subsearch +parent: PPL +grand_parent: SQL and PPL +nav_order: 3 +--- + +# subsearch + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning} + +A subsearch (also known as a subquery) allows you to use the results of one query within another query. OpenSearch Piped Processing Language (PPL) supports four types of subsearch commands: + +- [`in`](#in) +- [`exists`](#exists) +- [`scalar`](#scalar) +- [`relation`](#relation) + +The first three subsearch commands (`in`, `exists`, and `scalar`) are expressions that you can use in the `where` command (`where <boolean expression>`) and search filter (`search source=* <boolean expression>`). The `relation` subsearch command is a statement that can be used in a `join` operation. + +## `in` + +An `in` subsearch allows you to check whether a field's value exists in the results of another query. This is useful when you want to filter your results based on data from another index or query. + +### Syntax + +```sql +where <field> [not] in [ search source=... | ... | ... ] +``` +{% include copy.html %} + +### Usage + +```sql +source = outer | where a in [ source = inner | fields b ] +source = outer | where (a) in [ source = inner | fields b ] +source = outer | where (a,b,c) in [ source = inner | fields d,e,f ] +source = outer | where a not in [ source = inner | fields b ] +source = outer | where (a) not in [ source = inner | fields b ] +source = outer | where (a,b,c) not in [ source = inner | fields d,e,f ] +source = outer a in [ source = inner | fields b ] +source = outer a not in [ source = inner | fields b ] +source = outer | where a in [ source = inner1 | where b not in [ source = inner2 | fields c ] | fields b ] // nested +source = table1 | inner join left = l right = r on l.a = r.a AND r.a in [ source = inner | fields d ] | fields l.a, r.a, b, c //as join filter +``` +{% include copy.html %} + +## `exists` + +An `exists` subsearch checks whether any results are returned by the subsearch query. This is particularly useful for correlated subqueries where you want to check the existence of related records. + +### Syntax + +```sql +where [not] exists [ search source=... | ... | ... ] +``` +{% include copy.html %} + +### Usage + +The following examples demonstrate different ways to implement `exists` subsearches, from simple correlated and uncorrelated checks to complex nested conditions. + +The examples are based on the following assumptions: + +- `a` and `b` are fields of the `outer` table. +- `c` and `d` are fields of the `inner` table. +- `e` and `f` are fields of the `nested` table. + +#### Correlated + +In the following example, the inner query references fields from the outer query (such as when `a = c`), creating a dependency between the queries. 
The subsearch is evaluated once for each row in the outer query: + + +```sql +source = outer | where exists [ source = inner | where a = c ] +source = outer | where not exists [ source = inner | where a = c ] +source = outer | where exists [ source = inner | where a = c and b = d ] +source = outer | where not exists [ source = inner | where a = c and b = d ] +source = outer exists [ source = inner | where a = c ] +source = outer not exists [ source = inner | where a = c ] +source = table as t1 exists [ source = table as t2 | where t1.a = t2.a ] +``` +{% include copy.html %} + + +#### Uncorrelated + +In the following example, the subsearches are independent of the outer query. The inner query doesn't reference any fields from the outer query, so it's evaluated only once, regardless of how many rows are in the outer query: + +```sql +source = outer | where exists [ source = inner | where c > 10 ] +source = outer | where not exists [ source = inner | where c > 10 ] +``` +{% include copy.html %} + +#### Nested + +The following example demonstrates how to nest one subsearch within another, creating multiple levels of query complexity. This approach is useful for complex filtering scenarios that require multiple conditions from different data sources: + +```sql +source = outer | where exists [ source = inner1 | where a = c and exists [ source = nested | where c = e ] ] +source = outer | where exists [ source = inner1 | where a = c | where exists [ source = nested | where c = e ] ] +``` + +## `scalar` + +A `scalar` subsearch returns a single value that you can use in comparisons or calculations. This is useful when you need to compare a field against an aggregated value from another query. + +### Syntax + +```sql +where <field> = [ search source=... | ... | ... ] +``` +{% include copy.html %} + +### Usage + +The following examples demonstrate different ways to implement `scalar` subsearches, from simple aggregation comparisons to complex nested calculations. + +#### Uncorrelated + +In the following example, the `scalar` subsearch is independent of the outer query. 
These subsearches retrieve a single value that can be used in calculations or comparisons: + +```sql +source = outer | eval m = [ source = inner | stats max(c) ] | fields m, a +source = outer | eval m = [ source = inner | stats max(c) ] + b | fields m, a +source = outer | where a > [ source = inner | stats min(c) ] | fields a +source = outer a > [ source = inner | stats min(c) ] | fields a +``` +{% include copy.html %} + +#### Correlated + +In the following example, the `scalar` subsearch references fields from the outer query, creating a dependency where the inner query result depends on each row of the outer query: + +```sql +source = outer | eval m = [ source = inner | where outer.b = inner.d | stats max(c) ] | fields m, a +source = outer | eval m = [ source = inner | where b = d | stats max(c) ] | fields m, a +source = outer | eval m = [ source = inner | where outer.b > inner.d | stats max(c) ] | fields m, a +source = outer | where a = [ source = inner | where outer.b = inner.d | stats max(c) ] +source = outer | where a = [ source = inner | where b = d | stats max(c) ] +source = outer | where [ source = inner | where outer.b = inner.d OR inner.d = 1 | stats count() ] > 0 | fields a +source = outer a = [ source = inner | where b = d | stats max(c) ] +source = outer [ source = inner | where outer.b = inner.d OR inner.d = 1 | stats count() ] > 0 | fields a +``` +{% include copy.html %} + +#### Nested + +The following example demonstrates how to nest multiple `scalar` subsearches to create complex comparisons or use one subsearch result within another: + +```sql +source = outer | where a = [ source = inner | stats max(c) | sort c ] OR b = [ source = inner | where c = 1 | stats min(d) | sort d ] +source = outer | where a = [ source = inner | where c = [ source = nested | stats max(e) by f | sort f ] | stats max(d) by c | sort c | head 1 ] +``` +{% include copy.html %} + +## `relation` + +A `relation` subsearch allows you to use a query result as a dataset in a join operation. This is useful when you need to join with a filtered or transformed dataset rather than joining directly with a static index. + +### Syntax + +```sql +join on <condition> [ search source=... | ... | ... ] [as alias] +``` +{% include copy.html %} + +### Usage + +The following example demonstrates how to use `relation` subsearches in join operations. The first example shows how to join with a filtered dataset, while the second shows how to nest a `relation` subsearch within another query: + +```sql +source = table1 | join left = l right = r on condition [ source = table2 | where d > 10 | head 5 ] //subquery in join right side +source = [ source = table1 | join left = l right = r [ source = table2 | where d > 10 | head 5 ] | stats count(a) by b ] as outer | head 1 +``` +{% include copy.html %} + +## Examples + +The following examples demonstrate how different subsearch types work together in query scenarios, such as multi-level queries or nesting multiple subsearch types. + +### Complex query examples + +The following examples demonstrate how to combine different types of subsearches in complex queries. 
+ +**Example 1: Query with `in` and `scalar` subsearches** + +The following query uses both `in` and `scalar` subsearches to find suppliers from Canada who supply parts with names starting with "forest" and have availability quantities greater than half of the total quantity ordered in 1994: + +```sql +source = supplier +| join ON s_nationkey = n_nationkey nation +| where n_name = 'CANADA' + and s_suppkey in [ /* in subsearch */ + source = partsupp + | where ps_partkey in [ /* nested in subsearch */ + source = part + | where like(p_name, 'forest%') + | fields p_partkey + ] + and ps_availqty > [ /* scalar subsearch */ + source = lineitem + | where l_partkey = ps_partkey + and l_suppkey = ps_suppkey + and l_shipdate >= date('1994-01-01') + and l_shipdate < date_add(date('1994-01-01'), interval 1 year) + | stats sum(l_quantity) as sum_l_quantity + | eval half_sum_l_quantity = 0.5 * sum_l_quantity + | fields half_sum_l_quantity + ] + | fields ps_suppkey +``` + +**Example 2: Query with `relation`, `scalar`, and `exists` subsearches** + +The following query uses `relation`, `scalar`, and `exists` subsearches to find customers from specific country codes with above-average account balances who have not placed any orders: + +```sql +source = [ /* relation subsearch */ + source = customer + | where substring(c_phone, 1, 2) in ('13', '31', '23', '29', '30', '18', '17') + and c_acctbal > [ /* scalar subsearch */ + source = customer + | where c_acctbal > 0.00 + and substring(c_phone, 1, 2) in ('13', '31', '23', '29', '30', '18', '17') + | stats avg(c_acctbal) + ] + and not exists [ /* correlated exists subsearch */ + source = orders + | where o_custkey = c_custkey + ] + | eval cntrycode = substring(c_phone, 1, 2) + | fields cntrycode, c_acctbal + ] as custsale +| stats count() as numcust, sum(c_acctbal) as totacctbal by cntrycode +| sort cntrycode +``` + +## Limitations + +PPL subsearch works only when `plugins.calcite.enabled` is set to `true`. diff --git a/_search-plugins/sql/settings.md b/_search-plugins/sql/settings.md index 4842f984499..2bd319adaa0 100644 --- a/_search-plugins/sql/settings.md +++ b/_search-plugins/sql/settings.md @@ -79,6 +79,8 @@ Setting | Default | Description `plugins.query.memory_limit` | 85% | Configures the heap memory usage limit for the circuit breaker of the query engine. `plugins.query.size_limit` | 200 | Sets the default size of index that the query engine fetches from OpenSearch. `plugins.query.datasources.enabled` | true | Change to `false` to disable support for data sources in the plugin. +`plugins.query.field_type_tolerance` | true | If `false`, then an array is reduced to the first non-array value at any nesting level. For example, `[[1, 2], [3, 4]]` will be reduced to `1`. If `true`, then the array is preserved. Default is `true`. +`plugins.calcite.enabled` | false | Set to `true` to enable experimental features that use the Apache Calcite query engine, including advanced SQL and PPL capabilities such as subsearches, joins, and lookup operations. ## Spark connector settings diff --git a/_search-plugins/sql/sql-ppl-api.md b/_search-plugins/sql/sql-ppl-api.md index 26f5f2cc81e..3aa8a3305af 100644 --- a/_search-plugins/sql/sql-ppl-api.md +++ b/_search-plugins/sql/sql-ppl-api.md @@ -26,7 +26,7 @@ Field | Data Type | Description :--- | :--- | :--- query | String | The query to be executed. Required. [filter](#filtering-results) | JSON object | The filter for the results. Optional. 
-[fetch_size](#paginating-results) | integer | The number of results to return in one response. Used for paginating results. Default is 1,000. Optional. Only supported for the `jdbc` response format. +[fetch_size](#paginating-results) | integer | The number of results to return in one response. Used for paginating results. Default is 1,000. Optional. `fetch_size` is supported for SQL and requires using the `jdbc` response format. #### Example request @@ -159,12 +159,22 @@ total | Integer | The total number of rows (documents) in the index. size | Integer | The number of results to return in one response. status | String | The HTTP response status OpenSearch returns after running the query. -## Explain API +## `Explain` API -The SQL plugin has an `explain` feature that shows how a query is executed against OpenSearch, which is useful for debugging and development. A POST request to the `_plugins/_sql/_explain` or `_plugins/_ppl/_explain` endpoint returns [OpenSearch domain-specific language]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/) (DSL) in JSON format, explaining the query. -You can execute the explain API operation either in command line using `curl` or in the Dashboards console, like in the example below. +The SQL plugin's `explain` feature shows how a query is executed against OpenSearch, which is useful for debugging and development. A POST request to the `_plugins/_sql/_explain` or `_plugins/_ppl/_explain` endpoint returns [OpenSearch domain-specific language]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/) (DSL) in JSON format. -#### Sample explain request for an SQL query +Starting with OpenSearch 3.0.0, when you set `plugins.calcite.enabled` to `true`, the `explain` response provides enhanced information about query execution plans. 
The API supports four output formats: + +- `standard`: Displays logical and physical plans (default if not specified) +- `simple`: Displays logical plan without attributes +- `cost`: Displays logical and physical plans with their costs +- `extended`: Displays logical and physical plans with generated code + +### Examples + +#### Basic SQL query + +The following request shows a basic SQL `explain` query: ```json POST _plugins/_sql/_explain @@ -172,8 +182,9 @@ POST _plugins/_sql/_explain "query": "SELECT firstname, lastname FROM accounts WHERE age > 20" } ``` +{% include copy.html %} -#### Sample SQL query explain response +The response shows the query execution plan: ```json { @@ -194,39 +205,64 @@ POST _plugins/_sql/_explain } } ``` +{% include copy.html %} -#### Sample explain request for a PPL query +#### Advanced query with the Calcite engine + +The following request demonstrates a more complex query using the Calcite engine: ```json POST _plugins/_ppl/_explain { - "query" : "source=accounts | fields firstname, lastname" + "query" : "source=state_country | where country = 'USA' OR country = 'England' | stats count() by country" } ``` +{% include copy.html %} -#### Sample PPL query explain response +The response shows both logical and physical plans in the standard format: ```json { - "root": { - "name": "ProjectOperator", - "description": { - "fields": "[firstname, lastname]" - }, - "children": [ - { - "name": "OpenSearchIndexScan", - "description": { - "request": """OpenSearchQueryRequest(indexName=accounts, sourceBuilder={"from":0,"size":200,"timeout":"1m","_source":{"includes":["firstname","lastname"],"excludes":[]}}, searchDone=false)""" - }, - "children": [] - } - ] + "calcite": { + "logical": """LogicalProject(count()=[$1], country=[$0]) + LogicalAggregate(group=[{1}], count()=[COUNT()]) + LogicalFilter(condition=[SEARCH($1, Sarg['England', 'USA':CHAR(7)]:CHAR(7))]) + CalciteLogicalIndexScan(table=[[OpenSearch, state_country]]) +""", + "physical": """EnumerableCalc(expr#0..1=[{inputs}], count()=[$t1], country=[$t0]) + CalciteEnumerableIndexScan(table=[[OpenSearch, state_country]], PushDownContext=[[FILTER->SEARCH($1, Sarg['England', 'USA':CHAR(7)]:CHAR(7)), AGGREGATION->rel#53:LogicalAggregate.NONE.[](input=RelSubset#43,group={1},count()=COUNT())], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":0,"timeout":"1m","query":{"terms":{"country":["England","USA"],"boost":1.0}},"sort":[{"_doc":{"order":"asc"}}],"aggregations":{"composite_buckets":{"composite":{"size":1000,"sources":[{"country":{"terms":{"field":"country","missing_bucket":true,"missing_order":"first","order":"asc"}}}]},"aggregations":{"count()":{"value_count":{"field":"_index"}}}}}}, requestedTotalSize=10000, pageSize=null, startFrom=0)]) +""" + } +} +``` +{% include copy.html %} + +For a simplified view of the query plan, you can use the `simple` format: + +```json +POST _plugins/_ppl/_explain?format=simple +{ + "query" : "source=state_country | where country = 'USA' OR country = 'England' | stats count() by country" +} +``` +{% include copy.html %} + +The response shows a condensed logical plan: + +```json +{ + "calcite": { + "logical": """LogicalProject + LogicalAggregate + LogicalFilter + CalciteLogicalIndexScan +""" } } ``` +{% include copy.html %} -For queries that require post-processing, the `explain` response includes a query plan in addition to the OpenSearch DSL. For those queries that don't require post processing, you can see a complete DSL. 
+For queries that require post-processing, the `explain` response includes a query plan in addition to the OpenSearch DSL. For queries that don't require post-processing, you'll see only the complete DSL. ## Paginating results diff --git a/_search-plugins/star-tree-index.md b/_search-plugins/star-tree-index.md new file mode 100644 index 00000000000..3f4e11bb7a0 --- /dev/null +++ b/_search-plugins/star-tree-index.md @@ -0,0 +1,501 @@ +--- +layout: default +title: Star-tree index +parent: Improving search performance +nav_order: 54 +--- + +# Star-tree index + +A _star-tree index_ is a specialized index structure designed to improve aggregation performance by precomputing and storing aggregated values at different levels of granularity. This indexing technique enables faster aggregation execution, especially for multi-field aggregations. + +Once you enable star-tree indexes, OpenSearch automatically builds and uses star-tree indexes to optimize supported aggregations if the filter fields match the defined dimensions and the aggregation fields match the defined metrics in the star-tree mapping configuration. No changes to your query syntax or request parameters are required. + +Use a star-tree index when you want to speed up aggregations: + +- Star-tree indexes natively support multi-field aggregations. +- Star-tree indexes are created in real time as part of the indexing process, so the data in a star-tree is always current. +- A star-tree index aggregates data to improve paging efficiency and reduce disk I/O during search queries. + +## Star-tree index structure + +A star-tree index organizes and aggregates data across combinations of dimension fields and precomputes metric values for all the dimension combinations every time a segment is flushed or refreshed during ingestion. This structure enables OpenSearch to process aggregation queries quickly without scanning every document. + +The following is an example star-tree configuration: + +```json +"ordered_dimensions": [ + { + "name": "status" + }, + { + "name": "port" + } +], +"metrics": [ + { + "name": "size", + "stats": [ + "sum" + ] + }, + { + "name": "latency", + "stats": [ + "avg" + ] + } +] +``` + +This configuration defines the following: + +* Two dimension fields: `status` and `port`. The `ordered_dimension` field specifies how data is sorted (first by `status`, then by `port`). +* Two metric fields: `size` and `latency` with their corresponding aggregations (`sum` and `avg`). For each unique dimension combination, metric values (`Sum(size)` and `Avg(latency)`) are pre-aggregated and stored in the star-tree structure. + +OpenSearch creates a star-tree index structure based on this configuration. Each node in the tree corresponds to a value (or wildcard `*`) for a dimension. At query time, OpenSearch traverses the tree based on the dimension values provided in the query. + +### Leaf nodes + +Leaf nodes contain the precomputed metric aggregations for specific combinations of dimensions. These are stored as doc values and referenced by star-tree nodes. + +The `max_leaf_docs` setting controls how many documents each leaf node can reference, which helps keep query latency predictable by limiting how many documents are scanned for any given node. + +### Star nodes + +A _star node_ (marked as `*` in the following diagram) aggregates all values for a particular dimension. If a query doesn't specify a filter for that dimension, OpenSearch retrieves the precomputed aggregation from the star node instead of iterating over multiple leaf nodes. 
For example, if a query filters on `port` but not `status`, OpenSearch can use a star node that aggregates data for all status values. + +### How queries use the star-tree + +The following diagram shows a star-tree index created for this example and three example query paths. In the diagram, notice that each branch corresponds to a dimension (`status` and `port`). Some nodes contain precomputed aggregation values (for example, `Sum(size)`), allowing OpenSearch to skip unnecessary calculations at query time. + +<img src="{{site.url}}{{site.baseurl}}/images/star-tree-index.png" alt="A star-tree index containing two dimensions and two metrics"> + +The colored arrows show three query examples: + +* **Blue arrow**: Multi-term query with metric aggregation + The query filters on both `status = 200` and `port = 5600` and calculates the sum of request sizes. + + * OpenSearch follows this path: `Root → 200 → 5600` + * It retrieves the metric from Doc ID 1, where `Sum(size) = 988` + +* **Green arrow**: Single-term query with metric aggregation + The query filters on `status = 200` only and computes the average request latency. + + * OpenSearch follows this path: `Root → 200 → *` + * It retrieves the metric from Doc ID 5, where `Avg(latency) = 70` + +* **Red arrow**: Single-term query with metric aggregation + The query filters on `port = 8443` only and calculates the sum of request sizes. + + * OpenSearch follows this path: `Root → * → 8443` + * It retrieves the metric from Doc ID 7, where `Sum(size) = 1111` + +These examples show how OpenSearch selects the shortest path in the star-tree and uses pre-aggregated values to process queries efficiently. + +## Limitations + +Note the following limitations of star-tree indexes: + +- Star-tree indexes do not support updates or deletions. To use a star-tree index, data should be append-only. See [Enabling a star-tree index](#enabling-a-star-tree-index). +- A star-tree index only works for aggregation queries that filter on dimension fields and aggregate metric fields defined in the index's star-tree configuration. +- Any changes to a star-tree configuration require reindexing. +- [Array values]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/index/#arrays) are not supported. +- Only [specific queries and aggregations](#supported-queries-and-aggregations) are supported. +- Avoid using high-cardinality fields like `_id` as dimensions because they can significantly increase storage use and query latency. + +## Enabling a star-tree index + +Star-tree indexing behavior is controlled by the following cluster-level and index-level settings. Index-level settings take precedence over cluster settings. + +| Setting | Scope | Default | Purpose | | ------------------------------------------- | ------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------ | | `indices.composite_index.star_tree.enabled` | Cluster | `true` | Enables or disables star-tree search optimization across the cluster. | +| `index.composite_index` | Index | None | Enables star-tree indexing for a specific index. Must be set when creating the index. | +| `index.append_only.enabled` | Index | None | Required for star-tree indexes. Prevents updates and deletions. Must be `true`. | +| `index.search.star_tree_index.enabled` | Index | `true` | Enables or disables use of the star-tree index for search queries on the index. 
| + +Setting `indices.composite_index.star_tree.enabled` to `false` prevents OpenSearch from using star-tree optimization during searches, but the star-tree index structures are still created. To completely remove star-tree structures, you must reindex your data without the star-tree mapping. +{: .note} + + +To create an index that uses a star-tree index, send the following request: + +```json +PUT /logs +{ + "settings": { + "index.composite_index": true, + "index.append_only.enabled": true + } +} +``` +{% include copy-curl.html %} + +Ensure that the `doc_values` parameter is enabled for the dimension and metric fields used in your star-tree mapping. This is enabled by default for most field types. For more information, see [Doc values]({{site.url}}{{site.baseurl}}/field-types/mapping-parameters/doc-values/). + +### Disabling star-tree usage + +By default, both the `indices.composite_index.star_tree.enabled` cluster setting and the `index.search.star_tree_index.enabled` index setting are set to `true`. To disable search using star-tree indexes, set both of these settings to `false`. Note that index settings take precedence over cluster settings. + +## Example mapping + +The following example shows how to create a star-tree index that precomputes aggregations in the `logs` index. The `sum` and `average` aggregations are calculated on the `size` and `latency` fields , respectively, for all combinations of values in the dimension fields. The dimensions are ordered by `status`, then `port`, and finally `method`, which determines how the data is organized in the tree structure: + +```json +PUT /logs +{ + "settings": { + "index.number_of_shards": 1, + "index.number_of_replicas": 0, + "index.composite_index": true, + "index.append_only.enabled": true + }, + "mappings": { + "composite": { + "request_aggs": { + "type": "star_tree", + "config": { + "date_dimension" : { + "name": "@timestamp", + "calendar_intervals": [ + "month", + "day" + ] + }, + "ordered_dimensions": [ + { + "name": "status" + }, + { + "name": "port" + }, + { + "name": "method" + } + ], + "metrics": [ + { + "name": "size", + "stats": [ + "sum" + ] + }, + { + "name": "latency", + "stats": [ + "avg" + ] + } + ] + } + } + }, + "properties": { + "status": { + "type": "integer" + }, + "port": { + "type": "integer" + }, + "size": { + "type": "integer" + }, + "method" : { + "type": "keyword" + }, + "latency": { + "type": "scaled_float", + "scaling_factor": 10 + } + } + } +} +``` +{% include copy.html %} + +For more information about star-tree index mappings and parameters, see [Star-tree field type]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/star-tree/). + +## Supported queries and aggregations + +Star-tree indexes optimize aggregations. Every query must include at least one supported aggregation in order to use the star-tree optimization. + +### Supported queries + +Queries without aggregations cannot use star-tree optimization. The query's fields must be present in the `ordered_dimensions` section of the star-tree configuration. 
The following queries are supported: + +- [Term query]({{site.url}}{{site.baseurl}}/query-dsl/term/term/) +- [Terms query]({{site.url}}{{site.baseurl}}/query-dsl/term/terms/) +- [Match all docs query]({{site.url}}{{site.baseurl}}/query-dsl/match-all/) +- [Range query]({{site.url}}{{site.baseurl}}/query-dsl/term/range/) +- [Boolean query]({{site.url}}{{site.baseurl}}/query-dsl/compound/bool/) + +#### Boolean query restrictions + +Boolean queries in star-tree indexes follow specific rules for each clause type: + +* `must` and `filter` clauses: + - Are both supported and treated the same way because `filter` does not affect scoring. + - Can operate across different dimensions. + - Allow only one condition per dimension across all `must`/`filter` clauses, including nested ones. + - Support term, terms, and range queries. + +* `should` clauses: + - Must operate on the same dimension and cannot operate across different dimensions + - Can only use term, terms, and range queries. + +* `should` clauses inside `must` clauses: + - Act as a required condition. + - When operating on the same dimension as outer `must`: The union of `should` conditions is intersected with the outer `must` conditions. + - When operating on a different dimension: Processed normally as a required condition. + +* `must_not` clauses are not supported. +* Queries with the `minimum_should_match` parameter are not supported. + +The following Boolean query is **supported** because it follows these restrictions: + +```json +{ + "bool": { + "must": [ + {"term": {"method": "GET"}} + ], + "filter": [ + {"range": {"status": {"gte": 200, "lt": 300}}} + ], + "should": [ + {"term": {"port": 443}}, + {"term": {"port": 8443}} + ] + } +} +``` +{% include copy.html %} + +The following Boolean queries are **not** supported because they violate these restrictions: + +```json +{ + "bool": { + "should": [ + {"term": {"status": 200}}, + {"term": {"method": "GET"}} // SHOULD across different dimensions + ] + } +} +``` + +```json +{ + "bool": { + "must": [ + {"term": {"status": 200}} + ], + "must_not": [ // MUST_NOT not supported + {"term": {"method": "DELETE"}} + ] + } +} +``` + +### Supported aggregations + +The following aggregations are supported by star-tree indexes. + +#### Metric aggregations + +The following metric aggregations are supported: + +- [Sum]({{site.url}}{{site.baseurl}}/aggregations/metric/sum/) +- [Minimum]({{site.url}}{{site.baseurl}}/aggregations/metric/minimum/) +- [Maximum]({{site.url}}{{site.baseurl}}/aggregations/metric/maximum/) +- [Value count]({{site.url}}{{site.baseurl}}/aggregations/metric/value-count/) +- [Average]({{site.url}}{{site.baseurl}}/aggregations/metric/average/) + +To use searchable aggregations with a star-tree index, make sure you fulfill the following prerequisites: + +- The fields must be present in the `metrics` section of the star-tree configuration. +- The metric aggregation type must be part of the `stats` parameter. + +The following example gets the sum of all the values in the `size` field for all error logs with `status=500`, using the [example mapping](#example-mapping): + +```json +POST /logs/_search +{ + "query": { + "term": { + "status": "500" + } + }, + "aggs": { + "sum_size": { + "sum": { + "field": "size" + } + } + } +} +``` +{% include copy.html %} + +Using a star-tree index, the result will be retrieved from a single aggregated document as it traverses the `status=500` node, as opposed to scanning through all of the matching documents. This results in lower query latency. 
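Because star-tree indexes natively support multi-field aggregations, a single request can combine several supported metric aggregations with a Boolean filter. The following request is a sketch based on the [example mapping](#example-mapping) defined earlier for the `logs` index (the aggregation names `sum_size` and `avg_latency` are arbitrary). It sums request sizes and averages request latency for `GET` requests with a `2xx` status and limits each dimension (`method` and `status`) to a single condition, as required by the Boolean query restrictions:

```json
POST /logs/_search
{
  "size": 0,
  "query": {
    "bool": {
      "must": [
        { "term": { "method": "GET" } }
      ],
      "filter": [
        { "range": { "status": { "gte": 200, "lt": 300 } } }
      ]
    }
  },
  "aggs": {
    "sum_size": {
      "sum": {
        "field": "size"
      }
    },
    "avg_latency": {
      "avg": {
        "field": "latency"
      }
    }
  }
}
```
{% include copy-curl.html %}

Because `size` and `latency` both appear in the `metrics` section of the star-tree configuration and `method` and `status` are defined dimensions, this request should be eligible for star-tree optimization.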
+ +#### Date histograms with metric aggregations + +You can use [date histograms]({{site.url}}{{site.baseurl}}/aggregations/bucket/date-histogram/) on calendar intervals with metric sub-aggregations. + +To use date histogram aggregations and make them searchable in a star-tree index, remember the following requirements: + +- The calendar intervals in a star-tree mapping configuration can use either the request's calendar field or a field of lower granularity than the request field. For example, if an aggregation uses the `month` field, the star-tree search can still use lower-granularity fields such as `day`. +- A metric sub-aggregation must be part of the aggregation request. + +The following example filters logs to include only those with status codes between `200` and `400` and sets the `size` of the response to `0`, so that only aggregated results are returned. It then aggregates the filtered logs by calendar month and calculates the total `size` of the requests for each month: + +```json +POST /logs/_search +{ + "size": 0, + "query": { + "range": { + "status": { + "gte": "200", + "lte": "400" + } + } + }, + "aggs": { + "by_month": { + "date_histogram": { + "field": "@timestamp", + "calendar_interval": "month" + }, + "aggs": { + "sum_size": { + "sum": { + "field": "size" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +#### Keyword and numeric terms aggregations + +You can use [terms aggregations]({{site.url}}{{site.baseurl}}/aggregations/bucket/terms/) on both keyword and numeric fields with star-tree index search. + +For star-tree search compatibility with terms aggregations, remember the following behaviors: + +- The fields used in the terms aggregation should be part of the dimensions defined in the star-tree index. +- Metric sub-aggregations are optional as long as the relevant metrics are part of the star-tree configuration. + +The following example aggregates logs by the `user_id` field and returns the counts for each unique user: + +```json +POST /logs/_search +{ + "size": 0, + "aggs": { + "users": { + "terms": { + "field": "user_id" + } + } + } +} +``` +{% include copy-curl.html %} + +The following example aggregates orders by the `order_quantity` and calculates the average `total_price` for each quantity: + +```json +POST /orders/_search +{ + "size": 0, + "aggs": { + "quantities": { + "terms": { + "field": "order_quantity" + }, + "aggs": { + "avg_total_price": { + "avg": { + "field": "total_price" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +#### Range aggregations + +You can use [range aggregations]({{site.url}}{{site.baseurl}}/aggregations/bucket/range/) on numeric fields with star-tree index search. + +For range aggregations to work effectively with a star-tree index, remember the following behaviors: + +- The field used in the range aggregation should be part of the dimensions defined in the star-tree index. +- You can include metric sub-aggregations to compute metrics within each defined range, as long as the relevant metrics are part of the star-tree configuration. 
+ +The following example aggregates documents based on predefined ranges of the `temperature` field: + +```json +POST /sensors/_search +{ + "size": 0, + "aggs": { + "temperature_ranges": { + "range": { + "field": "temperature", + "ranges": [ + { "to": 20 }, + { "from": 20, "to": 30 }, + { "from": 30 } + ] + } + } + } +} +``` +{% include copy-curl.html %} + +The following example aggregates sales data by price ranges and calculates the total `quantity` sold within each range: + +```json +POST /sales/_search +{ + "size": 0, + "aggs": { + "price_ranges": { + "range": { + "field": "price", + "ranges": [ + { "to": 100 }, + { "from": 100, "to": 500 }, + { "from": 500 } + ] + }, + "aggs": { + "total_quantity": { + "sum": { + "field": "quantity" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +#### Nested aggregations + +You can combine multiple supported bucket aggregations (such as `terms` and `range`) in a nested structure, and the star-tree index will optimize these nested aggregations. For more information about nested aggregations, see [Nested aggregations]({{site.url}}{{site.baseurl}}/aggregations/#nested-aggregations). + +## Next steps + +- [Star-tree field type]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/star-tree/) diff --git a/_search-plugins/ubi/data-structures.md b/_search-plugins/ubi/data-structures.md deleted file mode 100644 index 0c64c3254ba..00000000000 --- a/_search-plugins/ubi/data-structures.md +++ /dev/null @@ -1,204 +0,0 @@ ---- -layout: default -title: UBI client data structures -parent: User Behavior Insights -has_children: false -nav_order: 10 ---- - -# UBI client data structures - -Data structures are used to create events that follow the [User Behavior Insights (UBI) event schema specification](https://github.com/o19s/ubi). -For more information about the schema, see [UBI index schemas]({{site.url}}{{site.baseurl}}/search-plugins/ubi/schemas/). 
- - -You must provide an implementation for the following functions: -- `getClientId()` -- `getQueryId()` - -You can also optionally provide an implementation for the following functions: -- `getSessionId()` -- `getPageId()` - - -The following JavaScript structures can be used as a starter implementation to serialize UBI events into schema-compatible JSON: -```js -/********************************************************************************************* - * Ubi Event data structures - * The following structures help ensure adherence to the UBI event schema - *********************************************************************************************/ - - - -export class UbiEventData { - constructor(object_type, id=null, description=null, details=null) { - this.object_id_field = object_type; - this.object_id = id; - this.description = description; - this.object_detail = details; - } -} -export class UbiPosition{ - constructor({ordinal=null, x=null, y=null, trail=null}={}) { - this.ordinal = ordinal; - this.x = x; - this.y = y; - if(trail) - this.trail = trail; - else { - const trail = getTrail(); - if(trail && trail.length > 0) - this.trail = trail; - } - } -} - - -export class UbiEventAttributes { - /** - * Tries to prepopulate common event attributes - * The developer can add an `object` that the user interacted with and - * the site `position` information relevant to the event - * - * Attributes, other than `object` or `position` can be added in the form: - * attributes['item1'] = 1 - * attributes['item2'] = '2' - * - * @param {*} attributes: object with general event attributes - * @param {*} object: the data object the user interacted with - * @param {*} position: the site position information - */ - constructor({attributes={}, object=null, position=null}={}) { - if(attributes != null){ - Object.assign(this, attributes); - } - if(object != null && Object.keys(object).length > 0){ - this.object = object; - } - if(position != null && Object.keys(position).length > 0){ - this.position = position; - } - this.setDefaultValues(); - } - - setDefaultValues(){ - try{ - if(!this.hasOwnProperty('dwell_time') && typeof TimeMe !== 'undefined'){ - this.dwell_time = TimeMe.getTimeOnPageInSeconds(window.location.pathname); - } - - if(!this.hasOwnProperty('browser')){ - this.browser = window.navigator.userAgent; - } - - if(!this.hasOwnProperty('page_id')){ - this.page_id = window.location.pathname; - } - if(!this.hasOwnProperty('session_id')){ - this.session_id = getSessionId(); - } - - if(!this.hasOwnProperty('page_id')){ - this.page_id = getPageId(); - } - - if(!this.hasOwnProperty('position') || this.position == null){ - const trail = getTrail(); - if(trail.length > 0){ - this.position = new UbiPosition({trail:trail}); - } - } - // ToDo: set IP - } - catch(error){ - console.log(error); - } - } -} - - - -export class UbiEvent { - constructor(action_name, {message_type='INFO', message=null, event_attributes={}, data_object={}}={}) { - this.action_name = action_name; - this.client_id = getClientId(); - this.query_id = getQueryId(); - this.timestamp = Date.now(); - - this.message_type = message_type; - if( message ) - this.message = message; - - this.event_attributes = new UbiEventAttributes({attributes:event_attributes, object:data_object}); - } - - /** - * Use to suppress null objects in the json output - * @param key - * @param value - * @returns - */ - static replacer(key, value){ - if(value == null || - (value.constructor == Object && Object.keys(value).length === 0)) { - return undefined; - } 
- return value; - } - - /** - * - * @returns json string - */ - toJson() { - return JSON.stringify(this, UbiEvent.replacer); - } -} -``` -{% include copy.html %} - -# Sample usage - -```js -export function logUbiMessage(event_type, message_type, message){ - let e = new UbiEvent(event_type, { - message_type:message_type, - message:message - }); - logEvent(e); -} - -export function logDwellTime(action_name, page, seconds){ - console.log(`${page} => ${seconds}`); - let e = new UbiEvent(action_name, { - message:`On page ${page} for ${seconds} seconds`, - event_attributes:{ - session_id: getSessionId()}, - dwell_seconds:seconds - }, - data_object:TimeMe - }); - logEvent(e); -} - -/** - * ordinal is the number within a list of results - * for the item that was clicked - */ -export function logItemClick(item, ordinal){ - let e = new UbiEvent('item_click', { - message:`Item ${item['object_id']} was clicked`, - event_attributes:{session_id: getSessionId()}, - data_object:item, - }); - e.event_attributes.position.ordinal = ordinal; - logEvent(e); -} - -export function logEvent( event ){ - // some configured http client - return client.index( index = 'ubi_events', body = event.toJson()); -} - -``` -{% include copy.html %} diff --git a/_search-plugins/ubi/index.md b/_search-plugins/ubi/index.md index bdf09a632b0..380f5b24bb1 100644 --- a/_search-plugins/ubi/index.md +++ b/_search-plugins/ubi/index.md @@ -11,39 +11,67 @@ redirect_from: **Introduced 2.15** {: .label .label-purple } -**References UBI Specification 1.0.0** +**References UBI Specification 1.2.0** {: .label .label-purple } -User Behavior Insights (UBI) is a plugin that captures client-side events and queries for the purposes of improving search relevance and the user experience. -It is a causal system, linking a user's query to all of their subsequent interactions with your application until they perform another search. +User Behavior Insights (UBI) is a standard for capturing client-side events and queries for the purposes of improving search relevance and the user experience. +It is a *causal* system, linking a user's query to all of their subsequent interactions with your application until they perform another search. +This differs from many systems that infer the linking of search to events through *chronological* sequence. -UBI includes the following elements: -* A machine-readable [schema](https://github.com/o19s/ubi) that faciliates interoperablity of the UBI specification. -* An OpenSearch [plugin](https://github.com/opensearch-project/user-behavior-insights) that facilitates the storage of client-side events and queries. -* A client-side JavaScript [example reference implementation]({{site.url}}{{site.baseurl}}/search-plugins/ubi/data-structures/) that shows how to capture events and send them to the OpenSearch UBI plugin. - -<!-- vale off --> -The UBI documentation is organized into two categories: *Explanation and reference* and *Tutorials and how-to guides*: +> “how our users are using our product, whether search results were useful for them and whether they clicked on top-n results we gave and all related stuff” - Data Scientist -*Explanation and reference* - -| Link | Description | -| :--------- | :------- | -| [UBI Request/Response Specification](https://github.com/o19s/ubi/) | The industry-standard schema for UBI requests and responses. The current version references UBI Specification 1.0.0. 
| -| [UBI index schema]({{site.url}}{{site.baseurl}}/search-plugins/ubi/schemas/) | Documentation on the individual OpenSearch query and event stores. | +UBI includes the following elements: +* [ubi.js](https://github.com/opensearch-project/user-behavior-insights/tree/main/ubi-javascript-collector/ubi.js): a client-side JavaScript library that captures searches and events. +* A machine-readable [schema](https://github.com/o19s/ubi) that faciliates interoperability of the UBI specification. +* An (optional!) OpenSearch [plugin](https://github.com/opensearch-project/user-behavior-insights) that streamlines the recording of query data. +Advanced features in OpenSearch, such as the Search Quality Evaluation Framework, and the Hybrid Search Optimizer all build on the UBI specification. -*Tutorials and how-to guides* +<!-- vale off --> -| Link | Description | -| :--------- | :------- | -| [UBI plugin](https://github.com/opensearch-project/user-behavior-insights) | How to install and use the UBI plugin. | -| [UBI client data structures]({{site.url}}{{site.baseurl}}/search-plugins/ubi/data-structures/) | Sample JavaScript structures for populating the event store. | -| [Example UBI query DSL queries]({{site.url}}{{site.baseurl}}/search-plugins/ubi/dsl-queries/) | How to write queries for UBI data in OpenSearch query DSL. | -| [Example UBI SQL queries]({{site.url}}{{site.baseurl}}/search-plugins/ubi/sql-queries/) | How to write analytic queries for UBI data in SQL. | -| [UBI dashboard tutorial]({{site.url}}{{site.baseurl}}/search-plugins/ubi/ubi-dashboard-tutorial/) | How to build a dashboard containing UBI data. | -| [Chorus Opensearch Edition](https://github.com/o19s/chorus-opensearch-edition/?tab=readme-ov-file#structured-learning-using-chorus-opensearch-edition) katas | A series of structured tutorials that teach you how to use UBI with OpenSearch through a demo e-commerce store. 
| +<table> + <tr style="vertical-align: top;"> + <td> + <h2>Tutorials</h2> + <ul> + <li><a href="#">Learn to use <code>ubi.js</code></a></li> + <li><a href="https://github.com/opensearch-project/user-behavior-insights">How to install and use the UBI plugin</a><b>do we keep this</b></li> + <li><a href="#">Using OpenSearch Ingestion from AWS with UBI</a></li> + <li><a href="{{site.url}}{{site.baseurl}}/search-plugins/ubi/ubi-dashboard-tutorial/">Learn to create custom dashboards with UBI data.</a></li> + </ul> + </td> + <td> + <h2>How To Guides</h2> + <ul> + <li><a href="{{site.url}}{{site.baseurl}}/search-plugins/ubi/dsl-queries/">How to write queries for UBI data in OpenSearch query DSL.</a></li> + <li><a href="{{site.url}}{{site.baseurl}}/search-plugins/ubi/sql-queries/">How to write analytic queries for UBI data in SQL.</a></li> + <li><a href="https://github.com/o19s/chorus-opensearch-edition/blob/main/katas/006_protecting_sensitive_information.md">How to protect sensistive information when using UBI.</a></li> + <li><a href="https://github.com/o19s/chorus-opensearch-edition/blob/main/katas/007_configure_AB_with_TDI.md">Configuring an AB test with Team Draft Interleaving</a></li> + <li><a href="https://github.com/opensearch-project/user-behavior-insights">How to install and use the UBI plugin</a><b>do we keep this</b></li> + </ul> + </td> + </tr> + <tr style="vertical-align: top;"> + <td> + <h2>Explanation</h2> + <ul> + <li><a href="https://UBISearch.dev">Why UBI?</a></li> + <li><a href="https://github.com/opensearch-project/user-behavior-insights">How to install and use the UBI plugin</a><b>do we keep this</b></li> + <li>Learn more about this community standard via <a href="https://UBISearch.dev">UBISearch.dev</a>.</li> + <li><a href="{{site.url}}{{site.baseurl}}/search-plugins/ubi/ubi-javascript-collector/">UBI.js</a> JavaScript collector</li> + </ul> + </td> + <td> + <h2>Reference</h2> + <ul> + <li><a href="https://o19s.github.io/ubi/docs/html/1.2.0/query.request.schema.html">Query Tracking Specification</a></li> + <li><a href="https://o19s.github.io/ubi/docs/html/1.2.0/event.schema.html">Event Tracking Specification</a></li> + <li><a href="{{site.url}}{{site.baseurl}}/search-plugins/ubi/schemas/">UBI Plugin Schema</a><b>DO WE KEEP THIS</b></li> + </ul> + </td> + </tr> +</table> <!-- vale on --> The documentation categories were adapted using concepts based on [Diátaxis](https://diataxis.fr/). diff --git a/_search-plugins/ubi/schemas.md b/_search-plugins/ubi/schemas.md index d8398e43bc9..5487a467732 100644 --- a/_search-plugins/ubi/schemas.md +++ b/_search-plugins/ubi/schemas.md @@ -16,8 +16,8 @@ The User Behavior Insights (UBI) data collection process involves tracking and r For UBI to function properly, the connections between the following fields must be consistently maintained within an application that has UBI enabled: -- [`object_id`](#object_id) represents an ID for whatever object the user receives in response to a query. For example, if you search for books, it might be an ISBN code of a book, such as `978-3-16-148410-0`. -- [`query_id`](#query_id) is a unique ID for the raw query language executed and the `object_id` values of the _hits_ returned by the user's query. +- [`object_id`](#object_id) represents an ID for whatever object the user receives in response to a query. For example, if you search for books, it might be an ISBN number for a book, such as `978-3-16-148410-0`. 
+- [`query_id`](#query_id) is a unique ID for the raw query language executed; the `object_id` maps to the primary identifier of the _hits_ returned by the user's query. - [`client_id`](#client_id) represents a unique query source. This is typically a web browser used by a unique user. - [`object_id_field`](#object_id_field) specifies the name of the field in your index that provides the `object_id`. For example, if you search for books, the value might be `isbn_code`. - [`action_name`](#action_name), though not technically an ID, specifies the exact user action (such as `click`, `add_to_cart`, `watch`, `view`, or `purchase`) that was taken (or not taken) for an object with a given `object_id`. @@ -138,11 +138,11 @@ All underlying query information and results (`object_ids`) are stored in the `u The `ubi_queries` index [schema](https://github.com/OpenSearch-project/user-behavior-insights/tree/main/src/main/resources/queries-mapping.json) includes the following fields: -- `timestamp` (events and queries): A UNIX timestamp indicating when the query was received. +- `timestamp` (events and queries): An ISO 8601-formatted timestamp indicating when the query was received. -- `query_id` (events and queries): The unique ID of the query provided by the client or generated automatically. Different queries with the same text generate different `query_id` values. - -- `client_id` (events and queries): A user/client ID provided by the client application. +- `query_id` (events and queries): The unique ID of the query provided by the client or generated by the search engine. Different queries with the same text generate different `query_id` values. + +- `client_id` (events and queries): A client ID provided by the client application. - `query_response_objects_ids` (queries): An array of object IDs. An ID can have the same value as the `_id`, but it is meant to be the externally valid ID of a document, item, or product. @@ -169,14 +169,14 @@ The following are the predefined, minimal fields in the `ubi_events` index: <p id="query_id"> </p> - `query_id` (size 100): The unique identifier of a query, which is typically a UUID but can be any string. - The `query_id` is either provided by the client or generated at index time by the UBI plugin. The `query_id` values in both the **UBI queries** and **UBI events** indexes must be consistent. + The `query_id` is either provided by the client or generated at query time by the UBI plugin. The `query_id` values in both the **UBI queries** and **UBI events** indexes must be consistent. <p id="client_id"> </p> - `client_id`: The client that issues the query. This is typically a web browser used by a unique user. The `client_id` in both the **UBI queries** and **UBI events** indexes must be consistent. -- `timestamp`: When the event occurred, either in UNIX format or formatted as `2018-11-13T20:20:39+00:00`. +- `timestamp`: When the event occurred, using ISO 8601 format, such as `2018-11-13T20:20:39+00:00`. - `message_type` (size 100): A logical bin for grouping actions (each with an `action_name`). For example, `QUERY` or `CONVERSION`. @@ -193,18 +193,12 @@ The following are the predefined, minimal fields in the `ubi_events` index: - `event_attributes.position.ordinal`: Tracks the list position that a user can select (for example, selecting the third element can be described as `event{onClick, results[4]}`). - - `event_attributes.position.{x,y}`: Tracks x and y values defined by the client.
- - - `event_attributes.position.page_depth`: Tracks the page depth of the results. - - - `event_attributes.position.scroll_depth`: Tracks the scroll depth of the page results. - - - `event_attributes.position.trail`: A text field that tracks the path/trail that a user took to get to this location. - + - `event_attributes.position.xy.{x,y}`: Tracks x and y values defined by the client. + - `event_attributes.object`: Contains identifying information about the object returned by the query (for example, a book, product, or post). The `object` structure can refer to the object by internal ID or object ID. The `object_id` is the ID that links prior queries to this object. This field comprises the following subfields: - - `event_attributes.object.internal_id`: A unique ID that OpenSearch can use to internally index the object, for example, the `_id` field in the indexes. + - `event_attributes.object.internal_id`: The unique ID that OpenSearch uses to internally index the object, for example, the `_id` field in the indexes. <p id="object_id"> @@ -214,7 +208,7 @@ The following are the predefined, minimal fields in the `ubi_events` index: <p id="object_id_field"> - - `event_attributes.object.object_id_field`: Indicates the type/class of the object and the name of the search index field that contains the `object_id`. + - `event_attributes.object.object_id_field`: Indicates the type/class of the object and the name of the search index field that contains the `object_id` such as `ssn`, `isbn`, or `ean`. - `event_attributes.object.description`: An optional description of the object. diff --git a/_search-plugins/ubi/ubi-javascript-collector.md b/_search-plugins/ubi/ubi-javascript-collector.md new file mode 100644 index 00000000000..4bc5e0e8391 --- /dev/null +++ b/_search-plugins/ubi/ubi-javascript-collector.md @@ -0,0 +1,118 @@ +--- +layout: default +title: UBI JavaScript Collector +parent: User Behavior Insights +has_children: false +nav_order: 10 +--- + +# UBI JavaScript collector + +UBI comes with a very basic JavaScript client that manages the life cycle of the `query_id` for a specific search and can create UBI Event data structures and store them for specific actions. + +For more information about the schema, see [UBI index schemas]({{site.url}}{{site.baseurl}}/search-plugins/ubi/schemas/). + +We recommend that you refer to the client as a starting point for your own specific needs. + +## Installation + +The client comes as a single file `ubi.js` and only has a dependency on the `axios` library. +Download it from https://github.com/opensearch-project/user-behavior-insights/tree/main/ubi-javascript-collector. + +Reference the events and create the client via: + +```js +import { UbiEvent } from './ubi'; +import { UbiEventAttributes } from './ubi' +import { UbiClient } from './ubi' + +const ubiClient = new UbiClient('http://localhost:9200'); +``` + + +## Creating an event + +This code snippet is to track adding an item to a shopping cart in an e-commerce application. It utilizes the `UbiEvent` and `UbiEventAttributes` class to encapsulate event details, which can then be sent to the tracking system. +```js +var event = new UbiEvent( + 'add_to_cart', + client_id, + session_id, + getQueryId(), + new UbiEventAttributes('product', item.primary_ean, item.title, item), + item.title + ' (' + item.id + ')' +); +``` + +### Parameters + +1. **Event Name**: + - `'add_to_cart'` - This string indicates the type of event being tracked. + +2. 
**Client ID**: + - `client_id` - A variable that holds the unique identifier for the client. This helps in distinguishing between different users or sessions. + +3. **Session ID**: + - `session_id` - A variable that contains the unique identifier for the user session. This is used to track user interactions within a specific session. + +4. **Query ID**: + - `getQueryId()` - A function call that retrieves the current query ID, which may represent a specific search or interaction context. + +5. **UbiEventAttributes**: + - This is an instance of the `UbiEventAttributes` class, which encapsulates additional details about the event: + - **Type**: + - `'product'` - Specifies that the attribute type is related to a product. + - **Primary EAN**: + - `item.primary_ean` - This is the product's unique identifier in EAN format. + - **Title**: + - `item.title` - The name or description of the product. + - **Item**: + - `item` - The complete product object containing all relevant details. + +6. **Event Label**: + - `item.title + ' (' + item.id + ')'` - This creates a descriptive label for the event that includes the product title and its unique identifier (ID). + +The method `getQueryId()` refers to a helper method to generate a unique query id (and stores it in the session). +Here is a sample method: + +``` +function generateQueryId(){ + const query_id = generateGuid(); + sessionStorage.setItem('query_id', query_id); + return query_id; +} + +function generateGuid() { + let id = ''; + try{ + id = crypto.randomUUID(); + } + catch(error){ + // crypto.randomUUID only works in https, not http context, so fallback. + id ='10000000-1000-4000-8000-100000000000'.replace(/[018]/g, c => + (c ^ crypto.getRandomValues(new Uint8Array(1))[0] & 15 >> c / 4).toString(16) + ); + } + return id; +}; +``` + +## Tracking the event + +Sending the event to the backend is as simple as: + +```js +ubiClient.trackEvent(event); +``` + + +## Tracking queries + +You have the option of tracking queries using the client (instead of using the UBI plugin for OpenSearch). + +This looks very similar to tracking events: + +```js +const query = new UbiQuery(APPLICATION, client_id, query_id, value, "_id", {}); +ubiClient.trackQuery(query) +``` diff --git a/_search-plugins/vector-search.md b/_search-plugins/vector-search.md deleted file mode 100644 index f19030bf907..00000000000 --- a/_search-plugins/vector-search.md +++ /dev/null @@ -1,283 +0,0 @@ ---- -layout: default -title: Vector search -nav_order: 22 -has_children: false -has_toc: false ---- - -# Vector search - -OpenSearch is a comprehensive search platform that supports a variety of data types, including vectors. OpenSearch vector database functionality is seamlessly integrated with its generic database function. - -In OpenSearch, you can generate vector embeddings, store those embeddings in an index, and use them for vector search. Choose one of the following options: - -- Generate embeddings using a library of your choice before ingesting them into OpenSearch. Once you ingest vectors into an index, you can perform a vector similarity search on the vector space. For more information, see [Working with embeddings generated outside of OpenSearch](#working-with-embeddings-generated-outside-of-opensearch). -- Automatically generate embeddings within OpenSearch. To use embeddings for semantic search, the ingested text (the corpus) and the query need to be embedded using the same model. 
[Neural search]({{site.url}}{{site.baseurl}}/search-plugins/neural-search/) packages this functionality, eliminating the need to manage the internal details. For more information, see [Generating vector embeddings within OpenSearch](#generating-vector-embeddings-in-opensearch). - -## Working with embeddings generated outside of OpenSearch - -After you generate vector embeddings, upload them to an OpenSearch index and search the index using vector search. For a complete example, see [Example](#example). - -### k-NN index - -To build a vector database and use vector search, you must specify your index as a [k-NN index]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/) when creating it by setting `index.knn` to `true`: - -```json -PUT test-index -{ - "settings": { - "index": { - "knn": true, - "knn.algo_param.ef_search": 100 - } - }, - "mappings": { - "properties": { - "my_vector1": { - "type": "knn_vector", - "dimension": 1024, - "space_type": "l2", - "method": { - "name": "hnsw", - "engine": "nmslib", - "parameters": { - "ef_construction": 128, - "m": 24 - } - } - } - } - } -} -``` -{% include copy-curl.html %} - -### k-NN vector - -You must designate the field that will store vectors as a [`knn_vector`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/) field type. OpenSearch supports vectors of up to 16,000 dimensions, each of which is represented as a 32-bit or 16-bit float. - -To save storage space, you can use `byte` or `binary` vectors. For more information, see [Byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#byte-vectors) and [Binary vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors). - -### k-NN vector search - -Vector search finds the vectors in your database that are most similar to the query vector. OpenSearch supports the following search methods: - -- [Approximate search](#approximate-search) (approximate k-NN, or ANN): Returns approximate nearest neighbors to the query vector. Usually, approximate search algorithms sacrifice indexing speed and search accuracy in exchange for performance benefits such as lower latency, smaller memory footprints, and more scalable search. For most use cases, approximate search is the best option. - -- Exact search (exact k-NN): A brute-force, exact k-NN search of vector fields. OpenSearch supports the following types of exact search: - - [Exact k-NN with scoring script]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-score-script/): Using the k-NN scoring script, you can apply a filter to an index before executing the nearest neighbor search. - - [Painless extensions]({{site.url}}{{site.baseurl}}/search-plugins/knn/painless-functions/): Adds the distance functions as Painless extensions that you can use in more complex combinations. You can use this method to perform a brute-force, exact k-NN search of an index, which also supports pre-filtering. - -### Approximate search - -OpenSearch supports several algorithms for approximate vector search, each with its own advantages. For complete documentation, see [Approximate search]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/). For more information about the search methods and engines, see [Method definitions]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#method-definitions). For method recommendations, see [Choosing the right method]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#choosing-the-right-method). 
- -To use approximate vector search, specify one of the following search methods (algorithms) in the `method` parameter: - -- Hierarchical Navigable Small World (HNSW) -- Inverted File System (IVF) - -Additionally, specify the engine (library) that implements this method in the `engine` parameter: - -- [Non-Metric Space Library (NMSLIB)](https://github.com/nmslib/nmslib) -- [Facebook AI Similarity Search (Faiss)](https://github.com/facebookresearch/faiss) -- Lucene - -The following table lists the combinations of search methods and libraries supported by the k-NN engine for approximate vector search. - -Method | Engine -:--- | :--- -HNSW | NMSLIB, Faiss, Lucene -IVF | Faiss - -### Engine recommendations - -In general, select NMSLIB or Faiss for large-scale use cases. Lucene is a good option for smaller deployments and offers benefits like smart filtering, where the optimal filtering strategy—pre-filtering, post-filtering, or exact k-NN—is automatically applied depending on the situation. The following table summarizes the differences between each option. - -| | NMSLIB/HNSW | Faiss/HNSW | Faiss/IVF | Lucene/HNSW | -|:---|:---|:---|:---|:---| -| Max dimensions | 16,000 | 16,000 | 16,000 | 16,000 | -| Filter | Post-filter | Post-filter | Post-filter | Filter during search | -| Training required | No | No | Yes | No | -| Similarity metrics | `l2`, `innerproduct`, `cosinesimil`, `l1`, `linf` | `l2`, `innerproduct` | `l2`, `innerproduct` | `l2`, `cosinesimil` | -| Number of vectors | Tens of billions | Tens of billions | Tens of billions | Less than 10 million | -| Indexing latency | Low | Low | Lowest | Low | -| Query latency and quality | Low latency and high quality | Low latency and high quality | Low latency and low quality | High latency and high quality | -| Vector compression | Flat | Flat <br>Product quantization | Flat <br>Product quantization | Flat | -| Memory consumption | High | High <br> Low with PQ | Medium <br> Low with PQ | High | - -### Example - -In this example, you'll create a k-NN index, add data to the index, and search the data. - -#### Step 1: Create a k-NN index - -First, create an index that will store sample hotel data. Set `index.knn` to `true` and specify the `location` field as a `knn_vector`: - -```json -PUT /hotels-index -{ - "settings": { - "index": { - "knn": true, - "knn.algo_param.ef_search": 100, - "number_of_shards": 1, - "number_of_replicas": 0 - } - }, - "mappings": { - "properties": { - "location": { - "type": "knn_vector", - "dimension": 2, - "space_type": "l2", - "method": { - "name": "hnsw", - "engine": "lucene", - "parameters": { - "ef_construction": 100, - "m": 16 - } - } - } - } - } -} -``` -{% include copy-curl.html %} - -#### Step 2: Add data to your index - -Next, add data to your index. Each document represents a hotel. The `location` field in each document contains a vector specifying the hotel's location: - -```json -POST /_bulk -{ "index": { "_index": "hotels-index", "_id": "1" } } -{ "location": [5.2, 4.4] } -{ "index": { "_index": "hotels-index", "_id": "2" } } -{ "location": [5.2, 3.9] } -{ "index": { "_index": "hotels-index", "_id": "3" } } -{ "location": [4.9, 3.4] } -{ "index": { "_index": "hotels-index", "_id": "4" } } -{ "location": [4.2, 4.6] } -{ "index": { "_index": "hotels-index", "_id": "5" } } -{ "location": [3.3, 4.5] } -``` -{% include copy-curl.html %} - -#### Step 3: Search your data - -Now search for hotels closest to the pin location `[5, 4]`. This location is labeled `Pin` in the following image. 
Each hotel is labeled with its document number. - -![Hotels on a coordinate plane]({{site.url}}{{site.baseurl}}/images/k-nn-search-hotels.png/) - -To search for the top three closest hotels, set `k` to `3`: - -```json -POST /hotels-index/_search -{ - "size": 3, - "query": { - "knn": { - "location": { - "vector": [ - 5, - 4 - ], - "k": 3 - } - } - } -} -``` -{% include copy-curl.html %} - -The response contains the hotels closest to the specified pin location: - -```json -{ - "took": 1093, - "timed_out": false, - "_shards": { - "total": 1, - "successful": 1, - "skipped": 0, - "failed": 0 - }, - "hits": { - "total": { - "value": 3, - "relation": "eq" - }, - "max_score": 0.952381, - "hits": [ - { - "_index": "hotels-index", - "_id": "2", - "_score": 0.952381, - "_source": { - "location": [ - 5.2, - 3.9 - ] - } - }, - { - "_index": "hotels-index", - "_id": "1", - "_score": 0.8333333, - "_source": { - "location": [ - 5.2, - 4.4 - ] - } - }, - { - "_index": "hotels-index", - "_id": "3", - "_score": 0.72992706, - "_source": { - "location": [ - 4.9, - 3.4 - ] - } - } - ] - } -} -``` - -### Vector search with filtering - -For information about vector search with filtering, see [k-NN search with filters]({{site.url}}{{site.baseurl}}/search-plugins/knn/filter-search-knn/). - -## Generating vector embeddings in OpenSearch - -[Neural search]({{site.url}}{{site.baseurl}}/search-plugins/neural-search/) encapsulates the infrastructure needed to perform semantic vector searches. After you integrate an inference (embedding) service, neural search functions like lexical search, accepting a textual query and returning relevant documents. - -When you index your data, neural search transforms text into vector embeddings and indexes both the text and its vector embeddings in a vector index. When you use a neural query during search, neural search converts the query text into vector embeddings and uses vector search to return the results. - -### Choosing a model - -The first step in setting up neural search is choosing a model. You can upload a model to your OpenSearch cluster, use one of the pretrained models provided by OpenSearch, or connect to an externally hosted model. For more information, see [Integrating ML models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/integrating-ml-models/). - -### Neural search tutorial - -For a step-by-step tutorial, see [Neural search tutorial]({{site.url}}{{site.baseurl}}/search-plugins/neural-search-tutorial/). - -### Search methods - -Choose one of the following search methods to use your model for neural search: - -- [Semantic search]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/): Uses dense retrieval based on text embedding models to search text data. - -- [Hybrid search]({{site.url}}{{site.baseurl}}/search-plugins/hybrid-search/): Combines lexical and neural search to improve search relevance. - -- [Multimodal search]({{site.url}}{{site.baseurl}}/search-plugins/multimodal-search/): Uses neural search with multimodal embedding models to search text and image data. - -- [Neural sparse search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-search/): Uses neural search with sparse retrieval based on sparse embedding models to search text data. - -- [Conversational search]({{site.url}}{{site.baseurl}}/search-plugins/conversational-search/): With conversational search, you can ask questions in natural language, receive a text response, and ask additional clarifying questions. 
diff --git a/_security-analytics/api-tools/detector-api.md b/_security-analytics/api-tools/detector-api.md index 5b0f9a9eb06..a0f0f683e55 100644 --- a/_security-analytics/api-tools/detector-api.md +++ b/_security-analytics/api-tools/detector-api.md @@ -345,7 +345,7 @@ Field | Type | Description This API uses the detector ID to specify and delete a detector. -### Path and HTTP methods +### Endpoints ```json DELETE /_plugins/_security_analytics/detectors/IJAXz4QBrmVplM4JYxx_ @@ -372,7 +372,7 @@ DELETE /_plugins/_security_analytics/detectors/<detector Id> The Get Detector API retrieves the detector details. Use the detector ID in the call to fetch detector details. -### Path and HTTP methods +### Endpoints ```json GET /_plugins/_security_analytics/detectors/x-dwFIYBT6_n8WeuQjo4 diff --git a/_security-analytics/api-tools/rule-api.md b/_security-analytics/api-tools/rule-api.md index d9de875d12b..904a06bcbfe 100644 --- a/_security-analytics/api-tools/rule-api.md +++ b/_security-analytics/api-tools/rule-api.md @@ -111,7 +111,7 @@ falsepositives: "reason": "{\"error\":\"Sigma rule must have a log source\",\"error\":\"Sigma rule must have a detection definitions\"}", "caused_by": { "type": "exception", - "reason": "java.util.Arrays$ArrayList: {\"error\":\"Sigma rule must have a log source\",\"error\":\"Sigma rule must have a detection definitions\"}" + "reason": "{\"error\":\"Sigma rule must have a log source\",\"error\":\"Sigma rule must have a detection definitions\"}" } }, "status": 400 diff --git a/_security-analytics/threat-intelligence/api/findings.md b/_security-analytics/threat-intelligence/api/findings.md index 5c648ab2ae1..23474e4e403 100644 --- a/_security-analytics/threat-intelligence/api/findings.md +++ b/_security-analytics/threat-intelligence/api/findings.md @@ -18,7 +18,7 @@ The threat intelligence Alerts and Findings API retrieves information about aler Retrieves any alerts related to threat intelligence monitors. -### Path and HTTP methods +### Endpoints ```json GET /_plugins/_security_analytics/threat_intel/alerts @@ -94,7 +94,7 @@ A threat intelligence alert can have one of the following states. Updates the status of the specified alerts to `ACKNOWLEDGED` or `COMPLETED`. Only alerts in the `ACTIVE` state can be updated. -### Path and HTTP methods +### Endpoints ```json PUT /plugins/security_analytics/threat_intel/alerts/status @@ -174,7 +174,7 @@ PUT /plugins/security_analytics/threat_intel/alerts/status?state=COMPLETED&alert Returns threat intelligence indicator of compromise (IOC) findings. When the threat intelligence monitor finds a malicious IOC during a data scan, a finding is automatically generated. -### Path and HTTP methods +### Endpoints ```json GET /_plugins/_security_analytics/threat_intel/findings/ diff --git a/_security-analytics/threat-intelligence/api/monitor.md b/_security-analytics/threat-intelligence/api/monitor.md index e22b31f1562..672e402eaf3 100644 --- a/_security-analytics/threat-intelligence/api/monitor.md +++ b/_security-analytics/threat-intelligence/api/monitor.md @@ -8,7 +8,7 @@ nav_order: 35 # Monitor API -You can use the threat intelligence Monitor API to create, search, and update [monitors](https://opensearch.org/docs/latest/observing-your-data/alerting/monitors/) for your threat intelligence feeds. +You can use the threat intelligence Monitor API to create, search, and update [monitors]({{site.url}}{{site.baseurl}}/observing-your-data/alerting/monitors/) for your threat intelligence feeds. 
--- @@ -16,7 +16,7 @@ You can use the threat intelligence Monitor API to create, search, and update [m Creates or updates a threat intelligence monitor. -### Path and HTTP methods +### Endpoints The `POST` method creates a new monitor. The `PUT` method updates a monitor. @@ -207,7 +207,7 @@ The following section provides example requests for the Monitor API. Deletes an existing threat intelligence monitor. -### Path and HTTP methods +### Endpoints ```json DELETE /_plugins/_security_analytics/threat_intel/monitors/<monitor_id> diff --git a/_security-analytics/threat-intelligence/api/source.md b/_security-analytics/threat-intelligence/api/source.md index e9bd540477c..efa71fd9688 100644 --- a/_security-analytics/threat-intelligence/api/source.md +++ b/_security-analytics/threat-intelligence/api/source.md @@ -14,7 +14,7 @@ The threat intelligence Source API updates and returns information about tasks r Creates or updates a threat intelligence source and loads indicators of compromise (IOCs) from that source. -### Path and HTTP methods +### Endpoints ```json POST _plugins/_security_analytics/threat_intel/sources @@ -283,7 +283,7 @@ The following example responses show what OpenSearch returns after a successful Retrieves the threat intelligence source configuration details. -### Path and HTTP methods +### Endpoints ```json @@ -346,7 +346,7 @@ GET /_plugins/_security_analytics/threat_intel/sources/<source-id> Searches for threat intelligence source matches based on the search query. The request body expects a search query. For query options, see [Query DSL]({{site.url}}{{site.baseurl}}/query-dsl/). -### Path and HTTP methods +### Endpoints ```json POST /_plugins/_security_analytics/threat_intel/sources/_search @@ -440,7 +440,7 @@ POST /_plugins/_security_analytics/threat_intel/sources/_search Deletes a threat intelligence source. -### Path and HTTP methods +### Endpoints ```json DELETE /_plugins/_security_analytics/threat_intel/sources/<source-id> @@ -466,7 +466,7 @@ DELETE /_plugins/_security_analytics/threat_intel/sources/2c0u7JAB9IJUg27gcjUp Downloads any IOCs from the threat intelligence source. Only supports the `S3_CUSTOM` type source. -### Path and HTTP methods +### Endpoints ```json POST /_plugins/_security_analytics/threat_intel/sources/<source-id>/_refresh diff --git a/_security-analytics/threat-intelligence/getting-started.md b/_security-analytics/threat-intelligence/getting-started.md index b26063bed07..d7575063203 100644 --- a/_security-analytics/threat-intelligence/getting-started.md +++ b/_security-analytics/threat-intelligence/getting-started.md @@ -7,7 +7,7 @@ nav_order: 41 # Getting started -To get started with threat intelligence, you'll need to set up your threat intelligence sources and set up monitors to scan your log sources. The following tutorial shows you how to get started using OpenSearch Dashboards. Alternatively, you can use the [API](({{site.url}}{{site.baseurl}}/security-analytics/threat-intelligence/api/threat-intel-api/). +To get started with threat intelligence, you'll need to set up your threat intelligence sources and set up monitors to scan your log sources. The following tutorial shows you how to get started using OpenSearch Dashboards. Alternatively, you can use the [API]({{site.url}}{{site.baseurl}}/security-analytics/threat-intelligence/api/threat-intel-api/). 
## Threat intelligence view diff --git a/_security/access-control/api.md b/_security/access-control/api.md index 0a6aedbb95c..abb89380cbd 100644 --- a/_security/access-control/api.md +++ b/_security/access-control/api.md @@ -171,7 +171,7 @@ Introduced 1.0 Changes the password for the current user. -#### Path and HTTP methods +#### Endpoints ```json PUT _plugins/_security/api/account @@ -1568,7 +1568,7 @@ PUT _plugins/_security/api/nodesdn/<cluster-name> Makes a bulk update for the list of distinguished names. -#### Path and HTTP methods +#### Endpoints ```json PATCH _plugins/_security/api/nodesdn @@ -1682,7 +1682,7 @@ GET _plugins/_security/api/ssl/certs Reload transport layer communication certificates. These REST APIs let a super admin (or a user with sufficient permissions to access this API) reload transport layer certificates. -#### Path and HTTP methods +#### Endpoints ```json PUT /_plugins/_security/api/ssl/transport/reloadcerts @@ -1717,7 +1717,7 @@ curl -X PUT "https://your-opensearch-cluster/_plugins/_security/api/ssl/transpor Reload HTTP layer communication certificates. These REST APIs let a super admin (or a user with sufficient permissions to access this API) reload HTTP layer certificates. -#### Path and HTTP methods +#### Endpoints ```json PUT /_plugins/_security/api/ssl/http/reloadcerts diff --git a/_security/access-control/field-masking.md b/_security/access-control/field-masking.md index 188def5a6cb..0211a509933 100644 --- a/_security/access-control/field-masking.md +++ b/_security/access-control/field-masking.md @@ -30,15 +30,15 @@ Field masking works alongside field-level security on the same per-role, per-ind ## Set the salt setting -You can set the salt (a random string used to hash your data) in `opensearch.yml` using the optional `plugins.security.compliance.salt` setting. The salt value must fullfil the following requirements: +You can set the salt (a random string used to hash your data) in `opensearch.yml` using the optional `plugins.security.compliance.salt` setting. The salt value must fulfill the following requirements: -- Must be at least 32 characters. +- Must be at least 16 characters. - Use only ASCII characters. The following example shows a salt value: ```yml -plugins.security.compliance.salt: abcdefghijklmnopqrstuvqxyz1234567890 +plugins.security.compliance.salt: abcdefghijklmnop ``` Although setting the salt is optional, it is highly recommended. @@ -79,12 +79,16 @@ See [Create role]({{site.url}}{{site.baseurl}}/security/access-control/api/#crea By default, the Security plugin uses the BLAKE2b algorithm, but you can use any hashing algorithm that your JVM provides. This list typically includes MD5, SHA-1, SHA-384, and SHA-512. -You can override the default algorithm in `opensearch.yml` using the option default masking algorithm setting `plugins.security.masked_fields.algorithm.default`, as shown in the following example: +You can override the default algorithm in `opensearch.yml` using the optional default masking algorithm setting `plugins.security.masked_fields.algorithm.default`, as shown in the following example: ```yml plugins.security.masked_fields.algorithm.default: SHA-256 ``` -. +OpenSearch 3.x contains a bug fix to apply the default BLAKE2b algorithm correctly. 
You can override the default algorithm in OpenSearch 3.x to continue to produce the same masked values as OpenSearch 1.x and 2.x in `opensearch.yml` using the optional default masking algorithm setting `plugins.security.masked_fields.algorithm.default`, as shown in the following example: + +```yml +plugins.security.masked_fields.algorithm.default: BLAKE2B_LEGACY_DEFAULT +``` To specify a different algorithm, add it after the masked field in `roles.yml`, as shown in the following: diff --git a/_security/access-control/permissions.md b/_security/access-control/permissions.md index 5a75a0a5a7d..d09fa0f4607 100644 --- a/_security/access-control/permissions.md +++ b/_security/access-control/permissions.md @@ -267,6 +267,7 @@ See [Asynchronous search]({{site.url}}{{site.baseurl}}/search-plugins/async/inde See [ISM API]({{site.url}}{{site.baseurl}}/im-plugin/ism/api/). +- cluster:indices:admin/opensearch/ism/managedindex - cluster:admin/opendistro/ism/managedindex/add - cluster:admin/opendistro/ism/managedindex/change - cluster:admin/opendistro/ism/managedindex/remove @@ -528,7 +529,7 @@ These permissions apply to an index or index pattern. You might want a user to h | `indices:monitor/data_stream/stats` | Permission to stream stats. | | `indices:monitor/recovery` | Permission to access recovery stats. | | `indices:monitor/segments` | Permission to access segment stats. | -| `indices:monitor/settings/get` | Permission to get mointor settings. | +| `indices:monitor/settings/get` | Permission to get monitor settings. | | `indices:monitor/shard_stores` | Permission to access shard store stats. | | `indices:monitor/stats` | Permission to access monitoring stats. | | `indices:monitor/upgrade` | Permission to access upgrade stats. | diff --git a/_security/access-control/users-roles.md b/_security/access-control/users-roles.md index b182e1576a3..8f5bbf3d290 100644 --- a/_security/access-control/users-roles.md +++ b/_security/access-control/users-roles.md @@ -247,7 +247,7 @@ Map the role to your user: OpenSearch user roles are essential for controlling access to cluster resources. Users can be categorized as regular users, admin users, or super admin users based on their access rights and responsibilities. -For more information about defining users, see [Defining users](https://opensearch.org/docs/latest/security/access-control/users-roles/#defining-users). For more information about defining roles, see [Defining roles](https://opensearch.org/docs/latest/security/access-control/users-roles/#defining-roles). +For more information about defining users, see [Defining users]({{site.url}}{{site.baseurl}}/security/access-control/users-roles/#defining-users). For more information about defining roles, see [Defining roles]({{site.url}}{{site.baseurl}}/security/access-control/users-roles/#defining-roles). ### Regular users @@ -259,7 +259,7 @@ Admin users have elevated permissions that allow them to perform various adminis - Configure permissions. - Adjust backend settings. -Admin users can perform these tasks by configuring settings in the `opensearch.yml` file, using OpenSearch Dashboards, or interacting with the REST API. For more information about configuring users and roles, see [predefined roles](https://opensearch.org/docs/latest/security/access-control/users-roles/#predefined-roles). +Admin users can perform these tasks by configuring settings in the `opensearch.yml` file, using OpenSearch Dashboards, or interacting with the REST API. 
For more information about configuring users and roles, see [predefined roles]({{site.url}}{{site.baseurl}}/security/access-control/users-roles/#predefined-roles). ### Super admin users Super admin users have the highest level of administrative authority within the OpenSearch environment. This role is typically reserved for select users and should be managed carefully. @@ -280,4 +280,4 @@ plugins.security.authcz.admin_dn: If the super admin certificate is signed by a different CA, then the admin CA must be concatenated with the node's CA in the file defined in `plugins.security.ssl.http.pemtrustedcas_filepath` in `opensearch.yml`. -For more information, see [Configuring super admin certificates](https://opensearch.org/docs/latest/security/configuration/tls/#configuring-admin-certificates). +For more information, see [Configuring super admin certificates]({{site.url}}{{site.baseurl}}/security/configuration/tls/#configuring-admin-certificates). diff --git a/_security/audit-logs/storage-types.md b/_security/audit-logs/storage-types.md index 719287ad7fa..a07d98db596 100644 --- a/_security/audit-logs/storage-types.md +++ b/_security/audit-logs/storage-types.md @@ -16,6 +16,7 @@ Setting | Description :--- | :--- debug | Outputs to stdout. Useful for testing and debugging. internal_opensearch | Writes to an audit index on the current OpenSearch cluster. +internal_opensearch_data_stream | Writes to an audit log data stream on the current OpenSearch cluster. external_opensearch | Writes to an audit index on a remote OpenSearch cluster. webhook | Sends events to an arbitrary HTTP endpoint. log4j | Writes the events to a Log4j logger. You can use any Log4j [appender](https://logging.apache.org/log4j/2.x/manual/appenders.html), such as SNMP, JDBC, Cassandra, and Kafka. @@ -23,10 +24,29 @@ log4j | Writes the events to a Log4j logger. You can use any Log4j [appender](ht You configure the output location in `opensearch.yml`: ``` -plugins.security.audit.type: <debug|internal_opensearch|external_opensearch|webhook|log4j> +plugins.security.audit.type: <debug|internal_opensearch|internal_opensearch_data_stream|external_opensearch|webhook|log4j> ``` -`external_opensearch`, `webhook`, and `log4j` all have additional configuration options. Details follow. +`internal_opensearch_data_stream`, `external_opensearch`, `webhook`, and `log4j` can be customized with additional configuration options. For more information, see [Internal OpenSearch data streams](#internal-opensearch-data-streams). + + +## Internal OpenSearch data streams + +You can configure the `internal_opensearch_data_stream` type with the following parameters. + + +Name | Data type | Description +:--- | :--- | :--- +`plugins.security.audit.config.data_stream.name` | String | The name of the audit log data stream. Default is `opensearch-security-auditlog`. + +### Template settings + +Name | Data type | Description +:--- | :--- | :--- +`plugins.security.audit.config.data_stream.template.manage` | Boolean | When `true`, the template for the data stream is managed by OpenSearch. Default is `true`. +`plugins.security.audit.config.data_stream.template.name` | String | The name of the data stream template. Default is `opensearch-security-auditlog`. +`plugins.security.audit.config.data_stream.template.number_of_replicas` | Integer | The number of replicas for the data stream. Default is `0`. +`plugins.security.audit.config.data_stream.template.number_of_shards` | Integer | The number of shards for the data stream. Default is `1`. 
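Putting these options together, a minimal `opensearch.yml` sketch for the data stream sink might look like the following. The stream and template names shown are the documented defaults, and the replica count is only an illustrative override:

```yml
# Write audit events to a data stream on the current cluster
plugins.security.audit.type: internal_opensearch_data_stream

# Optional: override the default data stream name and template settings
plugins.security.audit.config.data_stream.name: opensearch-security-auditlog
plugins.security.audit.config.data_stream.template.manage: true
plugins.security.audit.config.data_stream.template.number_of_shards: 1
plugins.security.audit.config.data_stream.template.number_of_replicas: 1
```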
## External OpenSearch diff --git a/_security/authentication-backends/jwt.md b/_security/authentication-backends/jwt.md index ef32b9f71a8..907a9a5f2b7 100644 --- a/_security/authentication-backends/jwt.md +++ b/_security/authentication-backends/jwt.md @@ -121,7 +121,7 @@ Name | Description `jwt_header` | The HTTP header in which the token is transmitted. This is typically the `Authorization` header with the `Bearer` schema,`Authorization: Bearer <token>`. Default is `Authorization`. Replacing this field with a value other than `Authorization` prevents the audit log from properly redacting the JWT header from audit messages. It is recommended that users only use `Authorization` when using JWTs with audit logging. `jwt_url_parameter` | If the token is not transmitted in the HTTP header but rather as an URL parameter, define the name of the parameter here. `subject_key` | The key in the JSON payload that stores the username. If not set, the [subject](https://tools.ietf.org/html/rfc7519#section-4.1.2) registered claim is used. -`roles_key` | The key in the JSON payload that stores the user's roles. The value of this key must be a comma-separated list of roles. +`roles_key` | The key in the JSON payload that stores the user's roles. The value must be a comma-separated list of roles. You can configure `roles_key` as a list to extract roles from nested JWT claims. `required_audience` | The name of the audience that the JWT must specify. You can set a single value (for example, `project1`) or multiple comma-separated values (for example, `project1,admin`). If you set multiple values, the JWT must have at least one required audience. This parameter corresponds to the [`aud` claim of the JWT](https://datatracker.ietf.org/doc/html/rfc7519#section-4.1.3). `required_issuer` | The target issuer of JWT stored in the JSON payload. This corresponds to the [`iss` claim of the JWT](https://datatracker.ietf.org/doc/html/rfc7519#section-4.1.1). `jwt_clock_skew_tolerance_seconds` | Sets a window of time, in seconds, to compensate for any disparity between the JWT authentication server and OpenSearch node clock times, thereby preventing authentication failures due to the misalignment. Security sets 30 seconds as the default. Use this setting to apply a custom value. diff --git a/_security/authentication-backends/ldap.md b/_security/authentication-backends/ldap.md index 9f98f7f5b00..e0636f293cb 100755 --- a/_security/authentication-backends/ldap.md +++ b/_security/authentication-backends/ldap.md @@ -21,7 +21,7 @@ We provide a fully functional example that can help you understand how to use an 1. Download and unzip [the example zip file]({{site.url}}{{site.baseurl}}/assets/examples/ldap-example-v2.13.zip). 1. Update the `.env` file with a strong password for `admin` user. -1. At the command line, run `docker-compose up`. +1. At the command line, run `docker compose up`. 1. Review the files: * `docker-compose.yml` defines a single OpenSearch node, an LDAP server, and a PHP administration tool for the LDAP server. diff --git a/_security/authentication-backends/openid-connect.md b/_security/authentication-backends/openid-connect.md index 8e785a9e650..4d4a5220d31 100755 --- a/_security/authentication-backends/openid-connect.md +++ b/_security/authentication-backends/openid-connect.md @@ -63,7 +63,7 @@ Name | Description `jwt_header` | The HTTP header that stores the token. Typically the `Authorization` header with the `Bearer` schema: `Authorization: Bearer <token>`. Optional. Default is `Authorization`. 
`jwt_url_parameter` | If the token is not transmitted in the HTTP header, but as an URL parameter, define the name of the parameter here. Optional. `subject_key` | The key in the JSON payload that stores the user's name. If not defined, the [subject](https://tools.ietf.org/html/rfc7519#section-4.1.2) registered claim is used. Most IdP providers use the `preferred_username` claim. Optional. -`roles_key` | The key in the JSON payload that stores the user's roles. The value of this key must be a comma-separated list of roles. Required only if you want to use roles in the JWT. +`roles_key` | The key in the JSON payload that stores the user's roles. The value must be a comma-separated list of roles. This key is required only if you want to use roles in the JWT. You can configure `roles_key` as a list to extract roles from nested JWT claims. ## OpenID Connect URL @@ -427,7 +427,7 @@ The following steps use Docker and [Keycloak IdP](https://www.keycloak.org/) to - `new-realm.json` specifies the details of the [realm](https://www.keycloak.org/docs/latest/server_admin/#core-concepts-and-terms). In this example, the realm is named `new`. - `config.yml` configures `basic_internal_auth_domain` and `oidc_auth_domain`. - `opensearch_dashboards.yml` should point to Keycloak for authentication. Make sure that the `opensearch_security.openid.connect_url` setting points to the URL of the realm. -5. At the command line, run `docker-compose up`. +5. At the command line, run `docker compose up`. 6. Access OpenSearch Dashboards at `http://localhost:5601` and log in with username `testuser` and password `testpassword` configured in the `new-realm.json` file. After logging in, the `testuser` receives the backend role `admin` from Keycloak, which is mapped to the `all_access` OpenSearch role. These backend roles can be managed using the Keycloak Administrative Console at http://localhost:8080, using username `admin` and password `admin`. diff --git a/_security/authentication-backends/saml.md b/_security/authentication-backends/saml.md index 652345ccdcb..1e853d3c2c4 100755 --- a/_security/authentication-backends/saml.md +++ b/_security/authentication-backends/saml.md @@ -38,7 +38,7 @@ We provide a fully functional example that can help you understand how to use SA 1. From the command line, run: ```zsh - $ docker-compose up. + $ docker compose up. ``` 1. Access OpenSearch Dashboards at [http://localhost:5601](http://localhost:5601){:target='\_blank'}. diff --git a/_security/configuration/api-rate-limiting.md b/_security/configuration/api-rate-limiting.md index a5481bfee1a..e7df76a7a7a 100644 --- a/_security/configuration/api-rate-limiting.md +++ b/_security/configuration/api-rate-limiting.md @@ -73,13 +73,13 @@ auth_failure_listeners: The following table describes the individual settings for this type of configuration. -| Setting | Description | -| :--- | :--- | -| `type` | The type of rate limiting. In this case, `ip`. | -| `allowed_tries` | The number of login attempts allowed before login attempts are blocked. Be aware that increasing the number increases heap usage. | +| Setting | Description | +| :--- |:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `type` | The type of rate limiting. In this case, `ip`. 
| +| `allowed_tries` | The number of login attempts allowed before login attempts are blocked. Be aware that increasing the number increases heap usage. | | `time_window_seconds` | The window of time during which the value for `allowed_tries` is enforced. For example, if `allowed_tries` is `3` and `time_window_seconds` is `60`, an IP address has 3 attempts to log in successfully within a 60-second time span before login attempts are blocked. | -| `block_expiry_seconds` | The window of time during which login attempts remain blocked after a failed login. After this time elapses, login is reset and the IP address can attempt to log in again. | -| `max_blocked_clients` | The maximum number of blocked IP addresses. This limits heap usage to avoid a potential DoS attack. | -| `max_tracked_clients` | The maximum number of tracked IP addresses with failed login attempts. This limits heap usage to avoid a potential DoS attack. | -| `ignore_hosts` | A list of IP addresses or hostname patterns to ignore for rate limiting. `config.dynamic.hosts_resolver_mode` must be set to `ip-hostname` to support hostname matching. | +| `block_expiry_seconds` | The window of time during which login attempts remain blocked after a failed login. After this time elapses, login is reset and the IP address can attempt to log in again. | +| `max_blocked_clients` | The maximum number of blocked IP addresses. This limits heap usage to avoid a potential DoS attack. | +| `max_tracked_clients` | The maximum number of tracked IP addresses with failed login attempts. This limits heap usage to avoid a potential DoS attack. | +| `ignore_hosts` | A list of IP addresses, CIDR ranges, or hostname patterns to ignore for rate limiting. `config.dynamic.hosts_resolver_mode` must be set to `ip-hostname` to support hostname matching. | diff --git a/_security/configuration/demo-configuration.md b/_security/configuration/demo-configuration.md index be188169ad8..98fd8c0273a 100644 --- a/_security/configuration/demo-configuration.md +++ b/_security/configuration/demo-configuration.md @@ -29,7 +29,7 @@ Use the following steps to set up the Security plugin using Docker: 2. Run the following command: ```bash -docker-compose up +docker compose up ``` {% include copy.html %} @@ -49,7 +49,7 @@ If you want to disable the Security plugin when using Docker, set the `DISABLE_S - One special character 4. Make sure that Docker is running on your local machine -5. Run `docker-compose up` from the file directory where your `docker-compose.yml` file and `.env` file are located. +5. Run `docker compose up` from the file directory where your `docker-compose.yml` file and `.env` file are located. ### TAR (Linux) and Mac OS diff --git a/_security/configuration/disable-enable-security.md b/_security/configuration/disable-enable-security.md index 38bcc01cdd2..1bab20d330b 100755 --- a/_security/configuration/disable-enable-security.md +++ b/_security/configuration/disable-enable-security.md @@ -96,7 +96,7 @@ Refer to the following installation types to remove the OpenSearch Dashboards pl 1. In `docker-compose.yml`, change `opensearchproject/opensearch-dashboards:{{site.opensearch_dashboards_version}}` to `opensearch-dashboards-no-security`. 1. Change `OPENSEARCH_HOSTS` or `opensearch.hosts` to `http://` rather than `https://`. -1. Enter `docker-compose up`. +1. Enter `docker compose up`. #### Tarball @@ -174,7 +174,7 @@ Use the following steps to reinstall the plugin: 3. Add the necessary configuration to `opensearch.yml` for TLS encryption. 
See [Configuration]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/security-settings/) for information about the settings that need to be configured. -4. Create the `OPENSEARCH_INITIAL_ADMIN_PASSWORD` variable. For more information, see [Setting up a custom admin password](https://opensearch.org/docs/latest/security/configuration/demo-configuration/#setting-up-a-custom-admin-password). +4. Create the `OPENSEARCH_INITIAL_ADMIN_PASSWORD` variable. For more information, see [Setting up a custom admin password]({{site.url}}{{site.baseurl}}/security/configuration/demo-configuration/#setting-up-a-custom-admin-password). 5. Restart the nodes and reenable shard allocation: diff --git a/_security/configuration/index.md b/_security/configuration/index.md index f68667d92df..09247442116 100644 --- a/_security/configuration/index.md +++ b/_security/configuration/index.md @@ -53,7 +53,7 @@ For more information, see [Configuring TLS certificates]({{site.url}}{{site.base The `config.yml` file allows you to configure the authentication and authorization mechanisms for OpenSearch. Update the authentication backend settings in `<OPENSEARCH_HOME>/config/opensearch-security/config.yml` according to your requirements. -For example, to use LDAP as your authentication backend, add the following settings: +For example, to use the internal authentication backend, add the following settings: ``` authc: @@ -86,7 +86,7 @@ After initial setup, if you make changes to your security configuration or disab 1. Find the `securityadmin` script. The script is typically stored in the OpenSearch plugins directory, `plugins/opensearch-security/tools/securityadmin.[sh|bat]`. - Note: If you're using OpenSearch 1.x, the `securityadmin` script is located in the `plugins/opendistro_security/tools/` directory. - - For more information, see [Basic usage](https://opensearch.org/docs/latest/security/configuration/security-admin/#basic-usage). + - For more information, see [Basic usage]({{site.url}}{{site.baseurl}}/security/configuration/security-admin/#basic-usage). 2. Run the script by using the following command: ``` ./plugins/opensearch-security/tools/securityadmin.[sh|bat] diff --git a/_security/configuration/opensearch-keystore.md b/_security/configuration/opensearch-keystore.md index 01406050995..f27c764edda 100644 --- a/_security/configuration/opensearch-keystore.md +++ b/_security/configuration/opensearch-keystore.md @@ -44,6 +44,7 @@ You can append each command with the following options: - `-h, --help`: Displays help information about the script and its options. - `-s, --silent`: Provides minimal output when the script responds to a command. - `-v, --verbose`: Provides a verbose output for debugging purposes. +- `-p, --password` (`create` command only): Specifies the password to use for encrypting the keystore. If this flag isn't used, the keystore will be created without a password. ## Examples @@ -66,6 +67,17 @@ The script responds with a confirmation that the keystore was created: Created opensearch keystore in $OPENSEARCH_HOME/config/opensearch.keystore ``` +### Create a new password-protected keystore + +To create a new password-protected keystore, run the following command: + +```bash +./bin/opensearch-keystore create -p +``` +{% include copy.html %} + +If a keystore already exists, the script will ask whether you would like to overwrite the existing keystore. 
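As a quick sketch of how a password-protected keystore is typically used afterward, the following commands add and then list a secure setting. The setting name is only a placeholder, and the password prompt is an assumption based on standard keystore behavior:

```bash
# Add a secure setting; you are prompted for the keystore password
# "my.plugin.secure_setting" is a placeholder name, not a real OpenSearch setting
./bin/opensearch-keystore add my.plugin.secure_setting

# List the settings stored in the keystore
./bin/opensearch-keystore list
```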
+ ### Setting a keystore password The following command sets a new keystore password: diff --git a/_security/configuration/tls.md b/_security/configuration/tls.md index a4115b8c25c..97e3b2b2799 100755 --- a/_security/configuration/tls.md +++ b/_security/configuration/tls.md @@ -137,30 +137,7 @@ plugins.security.authcz.admin_dn: For security reasons, you cannot use wildcards or regular expressions as values for the `admin_dn` setting. -For more information about admin and super admin user roles, see [Admin and super admin roles](https://opensearch.org/docs/latest/security/access-control/users-roles/#admin-and-super-admin-roles). - - -## (Advanced) OpenSSL - -The Security plugin supports OpenSSL, but we only recommend it if you use Java 8. If you use Java 11, we recommend the default configuration. - -To use OpenSSL, you must install OpenSSL, the Apache Portable Runtime, and a Netty version with OpenSSL support matching your platform on all nodes. - -If OpenSSL is enabled, but for one reason or another the installation does not work, the Security plugin falls back to the Java JCE as the security engine. - -Name | Description -:--- | :--- -`plugins.security.ssl.transport.enable_openssl_if_available` | Enable OpenSSL on the transport layer if available. Optional. Default is `true`. -`plugins.security.ssl.http.enable_openssl_if_available` | Enable OpenSSL on the REST layer if available. Optional. Default is `true`. - -{% comment %} -1. Install [OpenSSL 1.1.0](https://www.openssl.org/community/binaries.html) on every node. -1. Install [Apache Portable Runtime](https://apr.apache.org) on every node: - - ``` - sudo yum install apr - ``` -{% endcomment %} +For more information about admin and super admin user roles, see [Admin and super admin roles]({{site.url}}{{site.baseurl}}/security/access-control/users-roles/#admin-and-super-admin-roles). ## (Advanced) Hostname verification and DNS lookup @@ -264,40 +241,73 @@ These settings allow for the use of encrypted passwords in the settings. ## Hot reloading TLS certificates -Updating expired or nearly expired TLS certificates does not require restarting the cluster. Instead, enable hot reloading of TLS cerificates by adding the following line to `opensearch.yml`: +Updating expired or nearly expired TLS certificates on the HTTP and transport layers does not require restarting the cluster. Instead, you can enable hot reloading of TLS certificates. When enabled, in-place hot reloading monitors your keystore resources for updates every 5 seconds. If you add or modify a certificate, key file, or keystore setting in the Opensearch `config` directory, the nodes in the cluster detect the change and automatically reload the keys and certificates. + +To enable in-place hot reloading, add the following line to `opensearch.yml`: + +```yml +plugins.security.ssl.certificates_hot_reload.enabled: true +``` +{% include copy.html %} + +### Using the Reload Certificates API +When not using hot reloading, you can use the Reload Certificates API to reread the replaced certificates. -`plugins.security.ssl_cert_reload_enabled: true` +To enable the Reload Certificates API, add the following line to `opensearch.yml`: + +```yml +plugins.security.ssl_cert_reload_enabled: true +``` +{% include copy.html %} This setting is `false` by default. {: .note } -After enabling hot reloading, use the Reload Certificates API to replace the expired certificates. The API expects the old certificates to be replaced with valid certificates issued with the same `Issuer/Subject DN` and `SAN`. 
The new certificates also need be stored in the same location as the previous certificates in order to prevent any changes to the `opensearch.yml` file. +After enabling reloading, use the Reload Certificates API to replace the expired certificates. The new certificates need to be stored in the same location as the previous certificates in order to prevent any changes to the `opensearch.yml` file. + +By default, the Reload Certificates API expects the old certificates to be replaced with valid certificates issued with the same `Issuer/Subject DN` and `SAN`. This behavior can be disabled by adding the following settings in `opensearch.yml`: + +```yml +plugins.security.ssl.http.enforce_cert_reload_dn_verification: false +plugins.security.ssl.transport.enforce_cert_reload_dn_verification: false +``` +{% include copy.html %} + Only a [superadmin]({{site.url}}{{site.baseurl}}/security/configuration/tls/#configuring-admin-certificates) can use the Reload Certificates API. {: .note } -### Reload TLS certificates on the transport layer - The following command reloads TLS certificates on the transport layer: +#### Reload TLS certificates on the transport layer + + The following command reloads TLS certificates on the transport layer using the Reload Certificates API: - ```json - curl --cacert <ca.pem> --cert <admin.pem> --key <admin.key> -XPUT https://localhost:9200/_plugins/_security/api/ssl/transport/reloadcerts - ``` - {% include copy.html %} +```json +curl --cacert <ca.pem> --cert <admin.pem> --key <admin.key> -XPUT https://localhost:9200/_plugins/_security/api/ssl/transport/reloadcerts +``` +{% include copy-curl.html %} You should receive the following response: -```{ "message": "successfully updated transport certs"}``` -### Reload TLS certificates on the http layer +``` +{ "message": "successfully updated transport certs"} +``` + +#### Reload TLS certificates on the HTTP layer -The following command reloads TLS certificates on the `http` layer: +The following command reloads TLS certificates on the HTTP layer using the Reload Certificates API: - ```json - curl --cacert <ca.pem> --cert <admin.pem> --key <admin.key> -XPUT https://localhost:9200/_plugins/_security/api/ssl/http/reloadcerts - ``` - {% include copy.html %} +```json +curl --cacert <ca.pem> --cert <admin.pem> --key <admin.key> -XPUT https://localhost:9200/_plugins/_security/api/ssl/http/reloadcerts +``` +{% include copy-curl.html %} You should receive the following response: -```{ "message": "successfully updated http certs"}``` +``` +{ "message": "successfully updated http certs"} +``` + + + diff --git a/_security/configuration/yaml.md b/_security/configuration/yaml.md index 2694e3a24fc..dff3819a937 100644 --- a/_security/configuration/yaml.md +++ b/_security/configuration/yaml.md @@ -94,7 +94,7 @@ requests: # Only allow GET requests to /sample-index1/_doc/1 and /sample-index2/ ## internal_users.yml -This file contains any initial users that you want to add to the Security plugin's internal user database. You can find this file in ``<OPENSEARCH_HOME>/config/opensearch-security/internal_users.yml`. +This file contains any initial users that you want to add to the Security plugin's internal user database. You can find this file in `<OPENSEARCH_HOME>/config/opensearch-security/internal_users.yml`. The file format requires a hashed password. To generate one, run `plugins/opensearch-security/tools/hash.sh -p <new-password>`. 
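For illustration only, an entry in `internal_users.yml` generally takes the following shape. The username, backend role, and hash shown are placeholders; paste the value produced by `hash.sh` into the `hash` field:

```yml
# Hypothetical internal user; replace the hash with the hash.sh output
analyst_user:
  hash: "$2y$12$<hash-generated-by-hash.sh>"
  reserved: false
  backend_roles:
    - "analyst"
  description: "Example internal user"
```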
If you decide to keep any of the demo users, *change their passwords* and re-run [securityadmin.sh]({{site.url}}{{site.baseurl}}/security/configuration/security-admin/) to apply the new passwords. diff --git a/_tools/index.md b/_tools/index.md index c9d446a81a6..cb0ec3f60c9 100644 --- a/_tools/index.md +++ b/_tools/index.md @@ -26,7 +26,7 @@ For information about Data Prepper, the server-side data collector for filtering Historically, many multiple popular agents and ingestion tools have worked with Elasticsearch OSS, such as Beats, Logstash, Fluentd, FluentBit, and OpenTelemetry. OpenSearch aims to continue to support a broad set of agents and ingestion tools, but not all have been tested or have explicitly added OpenSearch compatibility. -As an intermediate compatibility solution, OpenSearch has a setting that instructs the cluster to return version 7.10.2 rather than its actual version. +As an intermediate compatibility solution, OpenSearch 1.x and 2.x provide a setting that instructs the cluster to return version 7.10.2 rather than its actual version. If you use clients that include a version check, such as versions of Logstash OSS or Filebeat OSS between 7.x - 7.12.x, enable the setting: @@ -84,6 +84,7 @@ Some users report compatibility issues with ingest pipelines on these versions o | ODFE 1.0 to 1.12 | *Yes* | *Yes* | *No* | *Yes* | *Yes* | | ODFE 1.13 | *Yes* | *Yes* | *No* | *Yes* | *Yes* | | OpenSearch 1.x to 2.x | Yes via version setting | Yes via version setting | *No* | *Yes* | Yes, with Elastic Common Schema Setting | +| OpenSearch 3.x | *No* | *No* | *No* | *Yes* | Yes, with Elastic Common Schema Setting | \* Most current compatible version with Elasticsearch OSS. diff --git a/_tools/k8s-operator.md b/_tools/k8s-operator.md index 7ee1c1adeea..5027dcf3045 100644 --- a/_tools/k8s-operator.md +++ b/_tools/k8s-operator.md @@ -63,40 +63,40 @@ Then install the OpenSearch Kubernetes Operator using the following steps: 3. Enter `make build manifests`. 4. Start a Kubernetes cluster. When using minikube, open a new terminal window and enter `minikube start`. Kubernetes will now use a containerized minikube cluster with a namespace called `default`. Make sure that `~/.kube/config` points to the cluster. 
-```yml -apiVersion: v1 -clusters: -- cluster: - certificate-authority: /Users/naarcha/.minikube/ca.crt - extensions: - - extension: - last-update: Mon, 29 Aug 2022 10:11:47 CDT - provider: minikube.sigs.k8s.io - version: v1.26.1 - name: cluster_info - server: https://127.0.0.1:61661 - name: minikube -contexts: -- context: - cluster: minikube - extensions: - - extension: - last-update: Mon, 29 Aug 2022 10:11:47 CDT - provider: minikube.sigs.k8s.io - version: v1.26.1 - name: context_info - namespace: default - user: minikube - name: minikube -current-context: minikube -kind: Config -preferences: {} -users: -- name: minikube - user: - client-certificate: /Users/naarcha/.minikube/profiles/minikube/client.crt - client-key: /Users/naarcha/.minikube/profiles/minikube/client.key -``` + ```yml + apiVersion: v1 + clusters: + - cluster: + certificate-authority: /Users/naarcha/.minikube/ca.crt + extensions: + - extension: + last-update: Mon, 29 Aug 2022 10:11:47 CDT + provider: minikube.sigs.k8s.io + version: v1.26.1 + name: cluster_info + server: https://127.0.0.1:61661 + name: minikube + contexts: + - context: + cluster: minikube + extensions: + - extension: + last-update: Mon, 29 Aug 2022 10:11:47 CDT + provider: minikube.sigs.k8s.io + version: v1.26.1 + name: context_info + namespace: default + user: minikube + name: minikube + current-context: minikube + kind: Config + preferences: {} + users: + - name: minikube + user: + client-certificate: /Users/naarcha/.minikube/profiles/minikube/client.crt + client-key: /Users/naarcha/.minikube/profiles/minikube/client.key + ``` 5. Enter `make install` to create the CustomResourceDefinition that runs in your Kubernetes cluster. 6. Start the OpenSearch Kubernetes Operator. Enter `make run`. @@ -146,4 +146,4 @@ kubectl delete -f opensearch-cluster.yaml To learn more about how to customize your Kubernetes OpenSearch cluster, including data persistence, authentication methods, and scaling, see the [OpenSearch Kubernetes Operator User Guide](https://github.com/Opster/opensearch-k8s-operator/blob/main/docs/userguide/main.md). -If you want to contribute to the development of the OpenSearch Kubernetes Operator, see the repo [design documents](https://github.com/Opster/opensearch-k8s-operator/blob/main/docs/designs/high-level.md). \ No newline at end of file +If you want to contribute to the development of the OpenSearch Kubernetes Operator, see the repo [design documents](https://github.com/Opster/opensearch-k8s-operator/blob/main/docs/designs/high-level.md). diff --git a/_tools/logstash/read-from-opensearch.md b/_tools/logstash/read-from-opensearch.md index 53024c233bc..f1da9dc8ed5 100644 --- a/_tools/logstash/read-from-opensearch.md +++ b/_tools/logstash/read-from-opensearch.md @@ -13,7 +13,7 @@ redirect_from: As we ship Logstash events to an OpenSearch cluster using the [OpenSearch output plugin](https://github.com/opensearch-project/logstash-output-opensearch), we can also perform read operations on an OpenSearch cluster and load data into Logstash using the [OpenSearch input plugin](https://github.com/opensearch-project/logstash-input-opensearch). The OpenSearch input plugin reads the search query results performed on an OpenSearch cluster and loads them into Logstash. This lets you replay test logs, reindex, and perform other operations based on the loaded data. 
You can schedule ingestions to run periodically by using -[cron expressions](https://opensearch.org/docs/latest/monitoring-plugins/alerting/cron/), or manually load data into Logstash by running the query once. +[cron expressions]({{site.url}}{{site.baseurl}}/monitoring-plugins/alerting/cron/), or manually load data into Logstash by running the query once. @@ -51,4 +51,4 @@ Like the output plugin, after adding your configuration to the `pipeline.conf` f Adding `stdout{}` to the `output{}` section of your `pipeline.conf` file prints the query results to the console. -To reindex the data into an OpenSearch domain, add the destination domain configuration in the `output{}` section like shown [here](https://opensearch.org/docs/latest/tools/logstash/index/). +To reindex the data into an OpenSearch domain, add the destination domain configuration in the `output{}` section as shown [here]({{site.url}}{{site.baseurl}}/tools/logstash/index/). diff --git a/_tools/logstash/ship-to-opensearch.md index 6ea355b34fa..ae8dbbde2e1 100644 --- a/_tools/logstash/ship-to-opensearch.md +++ b/_tools/logstash/ship-to-opensearch.md @@ -158,7 +158,7 @@ The following list provides details on the credential resolution logic: The OpenSearch output plugin can store both time series datasets (such as logs, events, and metrics) and non-time series data in OpenSearch. The data stream is recommended to index time series datasets (such as logs, metrics, and events) into OpenSearch. -To learn more about data streams, see the [data stream documentation](https://opensearch.org/docs/latest/opensearch/data-streams/). +To learn more about data streams, see the [data stream documentation]({{site.url}}{{site.baseurl}}/opensearch/data-streams/). To ingest data into a data stream through Logstash, create the data stream and specify the name of the data stream and set the `action` setting to `create`, as shown in the following example configuration: diff --git a/_troubleshoot/security-admin.md index f4770c1ddb5..976e4356156 100644 --- a/_troubleshoot/security-admin.md +++ b/_troubleshoot/security-admin.md @@ -92,7 +92,7 @@ Connected as CN=node-0.example.com,OU=SSL,O=Test,L=Test,C=DE ERR: CN=node-0.example.com,OU=SSL,O=Test,L=Test,C=DE is not an admin user ``` -You must use an admin certificate when executing the script. To learn more, see [Configuring super admin certificates](https://opensearch.org/docs/latest/security/configuration/tls/#configuring-admin-certificates). +You must use an admin certificate when executing the script. To learn more, see [Configuring super admin certificates]({{site.url}}{{site.baseurl}}/security/configuration/tls/#configuring-admin-certificates). ## Use the diagnose option diff --git a/_tuning-your-cluster/availability-and-recovery/remote-store/index.md index d5dc99d5fef..63812a6327e 100644 --- a/_tuning-your-cluster/availability-and-recovery/remote-store/index.md +++ b/_tuning-your-cluster/availability-and-recovery/remote-store/index.md @@ -105,7 +105,7 @@ You can use remote-backed storage to: ## Benchmarks -The OpenSearch Project has run remote store using multiple workload options available within the [OpenSearch Benchmark](https://opensearch.org/docs/latest/benchmark/index/) tool.
This section summarizes the benchmark results for the following workloads: +The OpenSearch Project has run remote store using multiple workload options available within the [OpenSearch Benchmark]({{site.url}}{{site.baseurl}}/benchmark/index/) tool. This section summarizes the benchmark results for the following workloads: - [StackOverflow](https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/so) - [HTTP logs](https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/http_logs) diff --git a/_tuning-your-cluster/availability-and-recovery/remote-store/remote-store-stats-api.md b/_tuning-your-cluster/availability-and-recovery/remote-store/remote-store-stats-api.md index b184930e1d4..5bbd5c226cd 100644 --- a/_tuning-your-cluster/availability-and-recovery/remote-store/remote-store-stats-api.md +++ b/_tuning-your-cluster/availability-and-recovery/remote-store/remote-store-stats-api.md @@ -15,7 +15,7 @@ Use the Remote Store Stats API to monitor shard-level remote-backed storage perf Metrics returned from this API only relate to indexes stored on remote-backed nodes. For an aggregated output on an index at the node or cluster level, use the [Index Stats]({{site.url}}{{site.baseurl}}/api-reference/index-apis/stats/), [Nodes Stats]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/nodes-stats/), or [Cluster Stats]({{site.url}}{{site.baseurl}}/api-reference/cluster-api/cluster-stats/) API. -## Path and HTTP methods +## Endpoints ```json GET _remotestore/stats/<index_name> @@ -290,9 +290,9 @@ The `segment.upload` object contains the following fields. | `total_remote_refresh` | The total number of remote refreshes. | | `total_uploads_in_bytes` | The total number of bytes in all uploads to the remote store. | | `remote_refresh_size_in_bytes.last_successful` | The size of the data uploaded during the last successful refresh. | -| `remote_refresh_size_in_bytes.moving_avg` | The average size of the data, in bytes, uploaded in the last *N* refreshes. *N* is defined in the `remote_store.moving_average_window_size` setting. For more information, see [Remote segment backpressure](https://opensearch.org/docs/latest/tuning-your-cluster/availability-and-recovery/remote-store/remote-segment-backpressure/). | -| `upload_latency_in_bytes_per_sec.moving_avg` | The average speed of remote segment uploads, in bytes per second, for the last *N* uploads. *N* is defined in the `remote_store.moving_average_window_size` setting. For more information, see [Remote segment backpressure](https://opensearch.org/docs/latest/tuning-your-cluster/availability-and-recovery/remote-store/remote-segment-backpressure/). | -| `remote_refresh_latency_in_millis.moving_avg` | The average amount of time, in milliseconds, taken by a single remote refresh during the last *N* remote refreshes. *N* is defined in the `remote_store.moving_average_window_size` setting. For more information, see [Remote segment backpressure](https://opensearch.org/docs/latest/tuning-your-cluster/availability-and-recovery/remote-store/remote-segment-backpressure/). | +| `remote_refresh_size_in_bytes.moving_avg` | The average size of the data, in bytes, uploaded in the last *N* refreshes. *N* is defined in the `remote_store.moving_average_window_size` setting. For more information, see [Remote segment backpressure]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/remote-store/remote-segment-backpressure/). 
| +| `upload_latency_in_bytes_per_sec.moving_avg` | The average speed of remote segment uploads, in bytes per second, for the last *N* uploads. *N* is defined in the `remote_store.moving_average_window_size` setting. For more information, see [Remote segment backpressure]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/remote-store/remote-segment-backpressure/). | +| `remote_refresh_latency_in_millis.moving_avg` | The average amount of time, in milliseconds, taken by a single remote refresh during the last *N* remote refreshes. *N* is defined in the `remote_store.moving_average_window_size` setting. For more information, see [Remote segment backpressure]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/remote-store/remote-segment-backpressure/). | The `segment.download` object contains the following fields. diff --git a/_tuning-your-cluster/availability-and-recovery/rule-based-autotagging/autotagging.md b/_tuning-your-cluster/availability-and-recovery/rule-based-autotagging/autotagging.md new file mode 100644 index 00000000000..4f63e495177 --- /dev/null +++ b/_tuning-your-cluster/availability-and-recovery/rule-based-autotagging/autotagging.md @@ -0,0 +1,202 @@ +--- +layout: default +title: Rule-based auto-tagging +nav_order: 20 +parent: Workload management +grand_parent: Availability and recovery +--- + +# Rule-based auto-tagging + +Rule-based auto-tagging automatically assigns feature-specific values to incoming requests and evaluates those requests against a set of predefined rules by matching request attributes. +For example, the workload management feature uses index patterns as an attribute and assigns workload group IDs. + +Rule-based auto-tagging offers the following benefits: + +* Flexible attribute-based matching +* Support for feature-specific matching logic +* Consistent policy application +* Automated request classification +* Reduced administrative overhead +* Centralized rule management +* Easy policy updates + +Rule-based auto-tagging provides a flexible framework for implementing feature-specific request handling. Although this topic uses workload management as an example, the attribute-based matching system can be adapted for other OpenSearch features and use cases. +{: .tip } + +## Key concepts + +Before reviewing the rule configuration and behavior, it's important to understand the following key components of rule-based auto-tagging: + +* **Rule**: Defines matching criteria (attributes) and the value to assign. +* **Attributes**: Key-value pairs used to match rules (such as index patterns, user roles, or request types). +* **Feature-specific value**: The value assigned when a rule matches. +* **Pattern matching**: The matching behavior (exact or pattern based) for attribute values. + +## Rule structure and management + +Proper rule structure and management are essential for effective auto-tagging. This section describes the rule schema and how to manage rules. + +### Rule schema + +The following rule schema includes matching attributes and a feature-specific value: + +```json +{ + "_id": "fwehf8302582mglfio349==", + "index_patterns": ["logs-prod-*"], + "other_attribute": ["value1", "value2"], + "workload_group": "production_workload_id", + "updated_at": 1683256789000 +} +``` + +### Managing rules + +Use the following API operations to manage rules for workload management. 
+ +#### Create or update a rule + +```http +PUT /_rules/workload_group +{ + "index_patterns": ["prod-logs-*"], + "other_attribute": ["value1"], + "workload_group": "production_workload_id" +} +``` + +#### List rules + +```http +GET /_rules/workload_group +``` + +#### Delete a rule + +```http +DELETE /_rules/workload_group/{rule_id} +``` + +## Attribute matching + +The attribute matching system determines which rules apply to a given request. Each attribute type can support different matching behaviors, based on the following attribute types: + +1. **Exact matching**: Attribute values must match exactly. +2. **Pattern matching**: Supports wildcards (such as index patterns). +3. **List matching**: Matches any item in a list. +4. **Range matching**: Matches values within a defined range. + +For example, in workload management, index patterns support: + +* Exact match: `logs-2025-04` +* Prefix pattern: `logs-2025-*` + +Note that matching behavior is determined by the feature and attribute type. + +### Rule precedence + +When multiple rules match a request, OpenSearch uses the following precedence rules: + +1. Rules with more specific attribute matches are prioritized. +2. Feature-specific tie-breaking logic is applied. + +For example, with index patterns: + +* `logs-prod-2025-*` takes precedence over `logs-prod-*`. +* `logs-prod-*` takes precedence over `logs-*`. + +### Evaluation process + +OpenSearch evaluates incoming requests using the following process: + +1. OpenSearch receives a request. +2. The system evaluates request attributes against defined rules. +3. The most specific matching rule's value is assigned. +4. If no rules match, no value is assigned. + +## Examples + +These examples demonstrate how rule-based auto-tagging works in workload management, which uses index patterns as its primary attribute. + +### Multiple attribute matching + +```json +{ + "index_patterns": ["logs-prod-*"], + "request_type": ["search", "count"], + "workload_group": "production_search_workload_id" +} + +{ + "index_patterns": ["logs-prod-*"], + "workload_group": "production_workload_id" +} +``` + +### Attribute specificity + +```json +{ + "index_patterns": ["logs-*"], + "workload_group": "general_workload_id" +} + +{ + "index_patterns": ["logs-prod-service-*"], + "workload_group": "prod_service_workload_id" +} +``` + +## Best practices + +Follow these best practices for designing and operating rule-based auto-tagging. + +### Designing rules + +When creating rules, focus on building logical, specific configurations that support your workload and access patterns. Consider the following guidelines: + +* Identify the most relevant attributes for your use case. +* Use specific attribute values for precise control. +* Combine multiple attributes when appropriate. +* Use consistent naming conventions. +* Document attribute matching behavior. + +### Managing attributes + +Attribute selection and configuration significantly influence rule effectiveness. To manage attributes successfully, perform the following actions: + +* Understand each attribute's matching behavior. +* Start with the most specific criteria needed. +* Avoid overlapping rules unless intentional. +* Plan for future attribute value patterns. + +### Operations + +Ongoing operations and monitoring help maintain rule quality over time. Use the following best practices to ensure that your feature rules are reliable and effective: + +* Test new rules in a development environment. +* Monitor rule matches in system logs. +* Document rule configurations. 
+* Regularly review rule effectiveness. +* Remove unused rules. + +## Troubleshooting + +When creating rules, the following issues can occur: + +* **No value assigned**: This issue typically occurs when the request attributes do not match any existing rules. + For example, suppose `index_pattern` is a valid allowed attribute. If a request is made to search `logs_q1_2025` but no rule exists for that value, the request will not match any rule and will result in a missing assignment. + +* **Unexpected value**: This can happen when multiple rules are defined with overlapping or conflicting conditions. + For example, consider the following rules: + 1. `{ "username": ["dev*"], "index_pattern": ["logs*"] }` + 2. `{ "index_pattern": ["logs*", "events*"] }` + + If a user with the username `dev_john` sends a search request to `logs_q1_25`, it will match the first rule based on the `username` and `index_pattern` attributes. The request will not match the second rule, even though the `index_pattern` also qualifies. + +You can resolve both issues by validating your configuration using one of the following techniques: + +- **Test rules with sample requests**: First, create a rule using the REST API, and then send a request that matches the rule's attributes. For example, for a rule with `"index_pattern": ["logs*", "events*"]`, you can send a request to a `logs` or `events` index. Then verify the workload management statistics by querying the [Workload Management Stats API]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/workload-management/wlm-feature-overview/#workload-management-stats-api). + +- **Use the [List rules API](#list-rules)** to confirm rule definitions. diff --git a/_tuning-your-cluster/availability-and-recovery/rule-based-autotagging/rule-lifecycle-api.md b/_tuning-your-cluster/availability-and-recovery/rule-based-autotagging/rule-lifecycle-api.md new file mode 100644 index 00000000000..d08e7f1cf3d --- /dev/null +++ b/_tuning-your-cluster/availability-and-recovery/rule-based-autotagging/rule-lifecycle-api.md @@ -0,0 +1,208 @@ +--- +layout: default +title: Rule Lifecycle API +nav_order: 20 +parent: Rule based autotagging +grand_parent: Availability and recovery +--- + +# Rule Lifecycle API + +The Rule Lifecycle API allows you to create, update, retrieve, and delete rules. Each rule is associated with a specific feature type and contains a feature value and at least one attribute. +These rules are designed to automatically assign feature values to incoming queries based on the specified attributes, helping to categorize and manage queries automatically. + +## Endpoints + +The following sections describe the API endpoints available for managing rules across different feature types. 
+ +### Create a rule + +Use the following endpoint to add a new rule for a specific feature type: + +```json +PUT /_rules/{feature_type} +POST /_rules/{feature_type} +``` + +### Update a rule + +Use the following endpoint to modify an existing rule by specifying both the feature type and rule ID in the path parameters: + +```json +PUT /_rules/{feature_type}/{_id} +POST /_rules/{feature_type}/{_id} +``` + +### Get a rule + +Use the following endpoint to retrieve either a specific rule by ID or list all rules for a feature type: + +```json +GET /_rules/{feature_type}/{_id} +GET /_rules/{feature_type} +``` + +### Delete a rule + +Use the following endpoint to remove a rule by specifying both the feature type and rule ID: + +```json +DELETE /_rules/{feature_type}/{_id} +``` + +## Path parameters + +The following table lists the available path parameters. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `feature_type` | String | The category of the rule that defines the type of feature, such as `workload_group`. | +| `_id` | String | The unique identifier for the rule. Required for `UPDATE`, `GET`, and `DELETE` operations. | + +## Query parameters + +The following table lists the available query parameters. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `search_after` | String | The token used to retrieve the next page of results for pagination. | +| `<attribute_key>` | String | Filters results to rules where `<attribute_key>` matches one of the specified values. | + +## Request body fields + +The following table lists the fields available in the request body. + +| Field | Data type | Description | +| :--- | :--- | :--- | +| `description` | String | The human-readable explanation or purpose of the rule. | +| `<attribute_key>` | Array | A list of attribute values that must match the query in order for the rule to apply. | +| `<feature_type>` | String | The feature value assigned when the rule matches. | + + +## Example requests + +The following examples demonstrate how to use the Rule Lifecycle API to create, update, and retrieve rules. + +### Create a rule + +The following request creates a rule that assigns a `workload_group` value based on matching `index_pattern` attributes: + +```json +PUT _rules/workload_group +{ + "description": "description for rule", + "index_pattern": ["log*", "event*"], + "workload_group": "EITBzjFkQ6CA-semNWGtRQ" +} +``` +{% include copy-curl.html %} + +### Update a rule + +The following request updates a rule with ID `0A6RULxkQ9yLqn4r8LPrIg`: + +```json +PUT _rules/workload_group/0A6RULxkQ9yLqn4r8LPrIg +{ + "description": "updated_description for rule", + "index_pattern": ["log*"], + "workload_group": "EITBzjFkQ6CA-semNWGtRQ" +} +``` +{% include copy-curl.html %} + +You can't change the `feature_type`. Fields that are not updated can be omitted. +{: .note } + +### Get a rule + +The following request retrieves a rule by ID: + +```json +GET /_rules/{feature_type}/{_id} +``` +{% include copy-curl.html %} + +The following request retrieves all rules for a feature type: + +```json +GET /_rules/{feature_type} +``` +{% include copy-curl.html %} + +The following request returns all rules of the feature type `workload_group` that contain the attribute `index_pattern` with values `a` or `b`: + +```json +GET /_rules/workload_group?index_pattern=a,b +``` +{% include copy-curl.html %} + +If a `GET` request returns more results than can be included in a single response, the system paginates the results and includes a `search_after` field in the response.
+To retrieve the next page, send another request to the same endpoint using the same filters and include the `search_after` value from the previous response as a query parameter. + +The following example continues the search for all rules of the `workload_group` feature type where the `index_pattern` attribute contains the values `a` or `b`: + +```json +"GET /_rules/workload_group?index_pattern=a,b&search_after=z1MJApUB0zgMcDmz-UQq" +``` +{% include copy-curl.html %} + +## Example responses + +<details open markdown="block"> + <summary> + Response: Create or update rule + </summary> + {: .text-delta } + +```json +{ + "_id": "wi6VApYBoX5wstmtU_8l", + "description": "description for rule", + "index_pattern": ["log*", "event*"], + "workload_group": "EITBzjFkQ6CA-semNWGtRQ", + "updated_at": "2025-04-04T20:54:22.406Z" +} +``` + +</details> + + +<details markdown="block"> + <summary> + Response: Get rules + </summary> + {: .text-delta } + +```json +{ + "rules": [ + { + "_id": "z1MJApUB0zgMcDmz-UQq", + "description": "Rule for tagging workload_group_id to index123", + "index_pattern": ["index123"], + "workload_group": "workload_group_id", + "updated_at": "2025-02-14T01:19:22.589Z" + }, + ... + ], + "search_after": ["z1MJApUB0zgMcDmz-UQq"] +} +``` + +If the `search_after` field is present in the response, more results are available. +To retrieve the next page, include the `search_after` value in the next `GET` request as a query parameter, such as `GET /_rules/{feature_type}?search_after=z1MJApUB0zgMcDmz-UQq`. + +</details> + + +## Response body fields + +| Field | Data type | Description | +| :--- | :--- | :--- | +| `_id` | String | The unique identifier for the rule. | +| `description` | String | The explanation or purpose of the rule. | +| `updated_at` | String | The timestamp of the most recent update to the rule in UTC format. | +| `<attribute_key>` | Array | The attribute values used to match incoming queries. | +| `<feature_type>` | String | The value assigned to the feature type if the rule matches. | +| `search_after` | Array | The token for paginating additional results. Present only if more results exist. | \ No newline at end of file diff --git a/_tuning-your-cluster/availability-and-recovery/segment-replication/backpressure.md b/_tuning-your-cluster/availability-and-recovery/segment-replication/backpressure.md index 498aae55fc5..b942de7b94d 100644 --- a/_tuning-your-cluster/availability-and-recovery/segment-replication/backpressure.md +++ b/_tuning-your-cluster/availability-and-recovery/segment-replication/backpressure.md @@ -24,7 +24,7 @@ Field | Data type | Description `segrep.pressure.checkpoint.limit` | Integer | The maximum number of indexing checkpoints that a replica shard can fall behind when copying from primary. Once `segrep.pressure.checkpoint.limit` is breached along with `segrep.pressure.time.limit`, the segment replication backpressure mechanism is initiated. Default is `4` checkpoints. `segrep.pressure.replica.stale.limit `| Floating point | The maximum number of stale replica shards that can exist in a replication group. Once `segrep.pressure.replica.stale.limit` is breached, the segment replication backpressure mechanism is initiated. Default is `.5`, which is 50% of a replication group. 
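Assuming these thresholds are exposed as dynamic cluster settings, a minimal sketch for adjusting them through the Cluster Settings API could look like the following. The values shown are illustrative only:

```json
PUT _cluster/settings
{
  "persistent": {
    "segrep.pressure.checkpoint.limit": 6,
    "segrep.pressure.replica.stale.limit": 0.7
  }
}
```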
-## Path and HTTP methods +## Endpoints You can use the segment replication API endpoint to retrieve segment replication backpressure metrics as follows: diff --git a/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md b/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md index d13955f3f06..4268b987c18 100644 --- a/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md +++ b/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md @@ -16,9 +16,12 @@ The searchable snapshot feature incorporates techniques like caching frequently ## Configuring a node to use searchable snapshots -To configure the searchable snapshots feature, create a node in your `opensearch.yml file` and define the node role as `search`. Optionally, you can also configure the `cache.size` property for the node. +As of OpenSearch 3.0, nodes that use the searchable snapshots feature must have the `warm` node role instead of the `search` role. +{: .important} -A `search` node reserves storage for the cache to perform searchable snapshot queries. In the case of a dedicated search node where the node exclusively has the `search` role, this value defaults to a fixed percentage (80%) of available storage. In other cases, the value needs to be configured by the user using the `node.search.cache.size` setting. +To configure the searchable snapshots feature, create a node in your `opensearch.yml` file and define the node role as `warm`. Optionally, you can also configure the `cache.size` property for the node. + +A `warm` node reserves storage for the cache to perform searchable snapshot queries. In the case of a dedicated search node where the node exclusively has the `warm` role, this value defaults to a fixed percentage (80%) of available storage. In other cases, the value needs to be configured using the `node.search.cache.size` setting. Parameter | Type | Description :--- | :--- | :--- @@ -27,26 +30,26 @@ Parameter | Type | Description ```yaml node.name: snapshots-node -node.roles: [ search ] +node.roles: [ warm ] node.search.cache.size: 50gb ``` -If you're running Docker, you can create a node with the `search` node role by adding the line `- node.roles=search` to your `docker-compose.yml` file: +If you're running Docker, you can create a node with the `warm` node role by adding the line `- node.roles=warm` to your `docker-compose.yml` file: ```yaml version: '3' services: opensearch-node1: - image: opensearchproject/opensearch:2.7.0 + image: opensearchproject/opensearch:3.0.0 container_name: opensearch-node1 environment: - cluster.name=opensearch-cluster - node.name=opensearch-node1 - - node.roles=search + - node.roles=warm - node.search.cache.size=50gb ``` - +- Starting with version 2.18, k-NN indexes support searchable snapshots for the NMSLIB and Faiss engines. ## Create a searchable snapshot index @@ -54,7 +57,7 @@ A searchable snapshot index is created by specifying the `remote_snapshot` stora Request Field | Description :--- | :--- -`storage_type` | `local` indicates that all snapshot metadata and index data will be downloaded to local storage. <br /><br > `remote_snapshot` indicates that snapshot metadata will be downloaded to the cluster, but the remote repository will remain the authoritative store of the index data. Data will be downloaded and cached as necessary to service queries. 
At least one node in the cluster must be configured with the `search` node role in order to restore a snapshot using the `remote_snapshot` type. <br /><br > Defaults to `local`. +`storage_type` | `local` indicates that all snapshot metadata and index data will be downloaded to local storage. <br /><br > `remote_snapshot` indicates that snapshot metadata will be downloaded to the cluster, but the remote repository will remain the authoritative store of the index data. Data will be downloaded and cached as necessary to service queries. At least one node in the cluster must be configured with the `warm` node role in order to restore a snapshot using the `remote_snapshot` type. <br /><br > Default is `local`. #### Example request @@ -106,7 +109,6 @@ The following are known limitations of the searchable snapshots feature: - Accessing data from a remote repository is slower than local disk reads, so higher latencies on search queries are expected. - Many remote object stores charge on a per-request basis for retrieval, so users should closely monitor any costs incurred. -- Searching remote data can impact the performance of other queries running on the same node. We recommend that users provision dedicated nodes with the `search` role for performance-critical applications. +- Searching remote data can impact the performance of other queries running on the same node. We recommend that users provision dedicated nodes with the `warm` role for performance-critical applications. - For better search performance, consider [force merging]({{site.url}}{{site.baseurl}}/api-reference/index-apis/force-merge/) indexes into a smaller number of segments before taking a snapshot. For the best performance, at the cost of using compute resources prior to snapshotting, force merge your index into one segment. - We recommend configuring a maximum ratio of remote data to local disk cache size using the `cluster.filecache.remote_data_ratio` setting. A ratio of 5 is a good starting point for most workloads to ensure good query performance. If the ratio is too large, then there may not be sufficient disk space to handle the search workload. For more details on the maximum ratio of remote data, see issue [#11676](https://github.com/opensearch-project/OpenSearch/issues/11676). -- k-NN native-engine-based indexes using `faiss` and `nmslib` engines are incompatible with searchable snapshots. diff --git a/_tuning-your-cluster/availability-and-recovery/snapshots/snapshot-restore.md b/_tuning-your-cluster/availability-and-recovery/snapshots/snapshot-restore.md index 812d5104c79..b45afadc862 100644 --- a/_tuning-your-cluster/availability-and-recovery/snapshots/snapshot-restore.md +++ b/_tuning-your-cluster/availability-and-recovery/snapshots/snapshot-restore.md @@ -68,7 +68,7 @@ Before you can take a snapshot, you have to "register" a snapshot repository. A ``` {% include copy-curl.html %} -You will most likely not need to specify any parameters except for `location`. For allowed request parameters, see [Register or update snapshot repository API](https://opensearch.org/docs/latest/api-reference/snapshots/create-repository/). +You will most likely not need to specify any parameters except for `location`. For allowed request parameters, see [Register or update snapshot repository API]({{site.url}}{{site.baseurl}}/api-reference/snapshots/create-repository/). ### Amazon S3 @@ -110,6 +110,20 @@ You will most likely not need to specify any parameters except for `location`. 
F sudo ./bin/opensearch-keystore add s3.client.default.secret_key ``` +1. (Optional) If you're using a custom S3 endpoint (for example, MinIO), disable the Amazon EC2 metadata connection: + + ```bash + export AWS_EC2_METADATA_DISABLED=true + ``` + + If you're installing OpenSearch using Helm, update the following settings in your values file: + + ```yml + extraEnvs: + - name: AWS_EC2_METADATA_DISABLED + value: "true" + ``` + 1. (Optional) If you're using temporary credentials, add your session token: ```bash @@ -176,17 +190,34 @@ You will most likely not need to specify any parameters except for `location`. F ```json { - "Version": "2012-10-17", - "Statement": [{ - "Action": [ - "s3:*" - ], - "Effect": "Allow", - "Resource": [ - "arn:aws:s3:::your-bucket", - "arn:aws:s3:::your-bucket/*" - ] - }] + "Version": "2012-10-17", + "Statement": [ + { + "Action": [ + "s3:GetBucketLocation", + "s3:ListBucket", + "s3:ListBucketMultipartUploads", + "s3:ListBucketVersions" + ], + "Effect": "Allow", + "Resource": [ + "arn:aws:s3:::your-bucket" + ] + }, + { + "Action": [ + "s3:AbortMultipartUpload", + "s3:DeleteObject", + "s3:GetObject", + "s3:ListMultipartUploadParts", + "s3:PutObject" + ], + "Effect": "Allow", + "Resource": [ + "arn:aws:s3:::your-bucket/*" + ] + } + ] } ``` @@ -204,7 +235,7 @@ You will most likely not need to specify any parameters except for `location`. F ``` {% include copy-curl.html %} -You will most likely not need to specify any parameters except for `bucket` and `base_path`. For allowed request parameters, see [Register or update snapshot repository API](https://opensearch.org/docs/latest/api-reference/snapshots/create-repository/). +You will most likely not need to specify any parameters except for `bucket` and `base_path`. For allowed request parameters, see [Register or update snapshot repository API]({{site.url}}{{site.baseurl}}/api-reference/snapshots/create-repository/). ### Registering a Microsoft Azure storage account using Helm @@ -250,7 +281,7 @@ Use the following steps to register a snapshot repository backed by an Azure sto azure-snapshot-storage-account-key: ### Insert base64 encoded key ``` -1. [Deploy OpenSearch using Helm](https://opensearch.org/docs/latest/install-and-configure/install-opensearch/helm/) with the following additional values. Specify the value of the storage account in the `AZURE_SNAPSHOT_STORAGE_ACCOUNT` environment variable: +1. [Deploy OpenSearch using Helm]({{site.url}}{{site.baseurl}}/install-and-configure/install-opensearch/helm/) with the following additional values. Specify the value of the storage account in the `AZURE_SNAPSHOT_STORAGE_ACCOUNT` environment variable: ```yaml extraInitContainers: @@ -479,15 +510,19 @@ Request parameters | Description `include_global_state` | Whether to restore the cluster state. Default is `false`. `include_aliases` | Whether to restore aliases alongside their associated indexes. Default is `true`. `partial` | Whether to allow the restoration of partial snapshots. Default is `false`. -`rename_pattern` | If you want to rename indexes as you restore them, use this option to specify a regular expression that matches all indexes you want to restore. Use capture groups (`()`) to reuse portions of the index name. -`rename_replacement` | If you want to rename indexes as you restore them, use this option to specify the replacement pattern. Use `$0` to include the entire matching index name, `$1` to include the content of the first capture group, and so on. 
+`rename_pattern` | If you want to rename indexes, use this option to specify a regular expression that matches all the indexes that you want to restore and rename. Use capture groups (`()`) to reuse portions of the index name. +`rename_replacement` | If you want to rename indexes, use this option to specify the name replacement pattern. Use `$0` to include the entire matching index name or the number of the capture group. For example, `$1` would include the content of the first capture group. +`rename_alias_pattern` | If you want to rename aliases, use this option to specify a regular expression that matches all the aliases you want to restore and rename. Use capture groups (`()`) to reuse portions of the alias name. +`rename_alias_replacement` | If you want to rename aliases, use this option to specify the name replacement pattern. Use `$0` to include the entire matching alias name or the number of the capture group. For example, `$1` would include the content of the first capture group. `index_settings` | If you want to change [index settings]({{site.url}}{{site.baseurl}}/im-plugin/index-settings/) applied during the restore operation, specify them here. You cannot change `index.number_of_shards`. `ignore_index_settings` | Rather than explicitly specifying new settings with `index_settings`, you can ignore certain index settings in the snapshot and use the cluster defaults applied during restore. You cannot ignore `index.number_of_shards`, `index.number_of_replicas`, or `index.auto_expand_replicas`. `storage_type` | `local` indicates that all snapshot metadata and index data will be downloaded to local storage. <br /><br > `remote_snapshot` indicates that snapshot metadata will be downloaded to the cluster, but the remote repository will remain the authoritative store of the index data. Data will be downloaded and cached as necessary to service queries. At least one node in the cluster must be configured with the [search role]({{site.url}}{{site.baseurl}}/security/access-control/users-roles/) in order to restore a snapshot using the type `remote_snapshot`. <br /><br > Defaults to `local`. ### Conflicts and compatibility -One way to avoid naming conflicts when restoring indexes is to use the `rename_pattern` and `rename_replacement` options. You can then, if necessary, use the `_reindex` API to combine the two. The simpler way is to delete existing indexes prior to restoring from a snapshot. +One way to avoid index naming conflicts when restoring indexes is to use the `rename_pattern` and `rename_replacement` options. You can then, if necessary, use the `_reindex` API to combine the two. However, it may be simpler to delete the indexes that caused the conflict prior to restoring them from a snapshot. + +Similarly, to avoid alias naming conflicts when restoring indexes with aliases, you can use the `rename_alias_pattern` and `rename_alias_replacement` options. You can use the `_close` API to close existing indexes prior to restoring from a snapshot, but the index in the snapshot has to have the same number of shards as the existing index. 
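+
+For example, the following request is a minimal sketch (the repository, snapshot, index, and alias names are placeholders) that restores every index matching `my-index-*` under a `restored-` prefix and applies the same prefix to the restored aliases:
+
+```json
+POST _snapshot/my-repository/my-snapshot/_restore
+{
+  "indices": "my-index-*",
+  "rename_pattern": "my-index-(.+)",
+  "rename_replacement": "restored-my-index-$1",
+  "rename_alias_pattern": "my-alias-(.+)",
+  "rename_alias_replacement": "restored-my-alias-$1"
+}
+```
+{% include copy-curl.html %}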
diff --git a/_tuning-your-cluster/availability-and-recovery/workload-management/create-workload-group-rules-api.md b/_tuning-your-cluster/availability-and-recovery/workload-management/create-workload-group-rules-api.md new file mode 100644 index 00000000000..48901eca791 --- /dev/null +++ b/_tuning-your-cluster/availability-and-recovery/workload-management/create-workload-group-rules-api.md @@ -0,0 +1,139 @@ +--- +layout: default +title: Create workload group rules +nav_order: 20 +parent: Workload management +grand_parent: Availability and recovery +--- + +# Create workload group rules + +The Create Workload Group Rules API for the `workload_group` feature type allows you to define, update, retrieve, and delete rules that automatically assign workload group IDs to incoming queries. When a query matches the attributes specified in a rule, OpenSearch tags the query with the corresponding workload group ID. This eliminates the need for clients to manually include the workload group ID in each request. + + +## Supported attributes + +The following table lists the attributes supported by the `workload_group` feature type. + +| Attribute | Data type | Description | +| :--- | :--- | :--- | +| `index_pattern` | List | A list of strings used to match the target indexes of incoming queries. Each string can be an exact index name or a prefix ending in `*` to support wildcard matching, for example, `logs*`. | + + +## Example requests + +The following example demonstrates how to use the Create Workload Group Rules API to create a rule for the `workload_group` feature type. + +### Create a rule + +The following request creates a rule that assigns a `workload_group` value when the `index_pattern` attribute matches: + +```json +PUT _rules/workload_group +{ + "description": "description for rule", + "index_pattern": ["log*", "event*"], + "workload_group": "EITBzjFkQ6CA-semNWGtRQ" +} +``` +{% include copy-curl.html %} + +### Update a rule + +The following request updates a rule with the ID `0A6RULxkQ9yLqn4r8LPrIg`: + +```json +PUT _rules/workload_group/0A6RULxkQ9yLqn4r8LPrIg +{ + "description": "updated_description for rule", + "index_pattern": ["log*"], + "workload_group": "EITBzjFkQ6CA-semNWGtRQ" +} +``` +{% include copy-curl.html %} + +You can't change the `feature_type`. You can omit any fields that you don't want to update. +{: .note } + +### Get a rule + +You can retrieve rules created using the Create Workload Group Rules API by rule ID, by attribute filters, or by using pagination. + +The following request retrieves a rule by ID for the `workload_group` feature type: + +```json +GET /_rules/workload_group/{_id} +``` +{% include copy-curl.html %} + +The following request retrieves all rules for the `workload_group` feature type: + +```json +GET /_rules/workload_group +``` +{% include copy-curl.html %} + +The following request retrieves all rules for the `workload_group` feature type where the `index_pattern` attribute matches `a` or `b`: + +```json +GET /_rules/workload_group?index_pattern=a,b +``` +{% include copy-curl.html %} + +If the response contains more results than can fit on a single page, OpenSearch paginates the results and includes a `search_after` value in the response. +To retrieve the next page, send another request to the same endpoint using the same filters and include the `search_after` value from the previous response as a query parameter. 
+ +The following request provides the next page of rules from the same workload group: + +```json +"GET /_rules/workload_group?index_pattern=a,b&search_after=z1MJApUB0zgMcDmz-UQq" +``` +{% include copy-curl.html %} + +## Example responses + +<details open markdown="block"> + <summary> + Response: Create or update rule + </summary> + {: .text-delta } + +```json +{ + "_id": "wi6VApYBoX5wstmtU_8l", + "description": "description for rule", + "index_pattern": ["log*", "event*"], + "workload_group": "EITBzjFkQ6CA-semNWGtRQ", + "updated_at": "2025-04-04T20:54:22.406Z" +} +``` + +</details> + + +<details markdown="block"> + <summary> + Response: Get rules + </summary> + {: .text-delta } + +```json +{ + "rules": [ + { + "_id": "z1MJApUB0zgMcDmz-UQq", + "description": "Rule for tagging workload_group_id to index123", + "index_pattern": ["index123"], + "workload_group": "workload_group_id", + "updated_at": "2025-02-14T01:19:22.589Z" + }, + ... + ], + "search_after": ["z1MJApUB0zgMcDmz-UQq"] +} +``` + +If the `search_after` field is present in the response, more results are available. +To retrieve the next page, include the `search_after` value in the next `GET` request as a query parameter, such as `GET /_rules/{feature_type}?search_after=z1MJApUB0zgMcDmz-UQq`. + +</details> diff --git a/_tuning-your-cluster/availability-and-recovery/workload-management/wlm-feature-overview.md b/_tuning-your-cluster/availability-and-recovery/workload-management/wlm-feature-overview.md new file mode 100644 index 00000000000..8d7968b9d41 --- /dev/null +++ b/_tuning-your-cluster/availability-and-recovery/workload-management/wlm-feature-overview.md @@ -0,0 +1,194 @@ +--- +layout: default +title: Workload management +nav_order: 70 +has_children: true +parent: Availability and recovery +--- + +Introduced 2.18 +{: .label .label-purple } + +# Workload management + +Workload management allows you to group search traffic and isolate network resources, preventing the overuse of network resources by specific requests. It offers the following benefits: + +- Tenant-level admission control and reactive query management. When resource usage exceeds configured limits, it automatically identifies and cancels demanding queries, ensuring fair resource distribution. + +- Tenant-level isolation within the cluster for search workloads, operating at the node level. + +## Installing workload management + +To install workload management, use the following command: + +```json +./bin/opensearch-plugin install workload-management +``` +{% include copy-curl.html %} + +## Workload groups + +A _workload group_ is a logical grouping of tasks with defined resource limits. System administrators can dynamically manage workload groups using the Workload Management APIs. These workload groups can be used to create search requests with resource limits. + +### Permissions + +Only users with administrator-level permissions can create and update workload groups using the Workload Management APIs. + +### Operating modes + +The following operating modes determine the operating level for a workload group: + +- **Disabled mode**: Workload management is disabled. + +- **Enabled mode**: Workload management is enabled and will cancel and reject queries once the workload group's configured thresholds are reached. + +- **Monitor_only mode** (Default): Workload management will monitor tasks but will not cancel or reject any queries. 
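+
+You can switch between these modes dynamically by updating the `wlm.workload_group.mode` setting described in [Workload management settings](#workload-management-settings). The following request is a minimal sketch, assuming the accepted values mirror the mode names above (for example, `enabled`):
+
+```json
+PUT _cluster/settings
+{
+  "persistent": {
+    "wlm.workload_group.mode": "enabled"
+  }
+}
+```
+{% include copy-curl.html %}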
+ +### Example request + +The following example request adds a workload group named `analytics`: + +```json +PUT _wlm/workload_group +{ + “name”: “analytics”, + “resiliency_mode”: “enforced”, + “resource_limits”: { + “cpu”: 0.4, + “memory”: 0.2 + } +} +``` +{% include copy-curl.html %} + +When creating a workload group, make sure that the sum of the resource limits for a single resource, such as `cpu` or `memory`, does not exceed `1`. + +### Example response + +OpenSearch responds with the set resource limits and the `_id` for the workload group: + +```json +{ + "_id":"preXpc67RbKKeCyka72_Gw", + "name":"analytics", + "resiliency_mode":"enforced", + "resource_limits":{ + "cpu":0.4, + "memory":0.2 + }, + "updated_at":1726270184642 +} +``` + +## Using `workloadGroupID` + +You can associate a query request with a `workloadGroupID` to manage and allocate resources within the limits defined by the workload group. By using this ID, request routing and tracking are associated with the workload group, ensuring resource quotas and task limits are maintained. + +The following example query uses the `workloadGroupID` to ensure that the query does not exceed that workload group's resource limits: + +```json +GET testindex/_search +Host: localhost:9200 +Content-Type: application/json +workloadGroupId: preXpc67RbKKeCyka72_Gw +{ + "query": { + "match": { + "field_name": "value" + } + } +} +``` +{% include copy-curl.html %} + +## Workload management settings + +The following settings can be used to customize workload management using the `_cluster/settings` API. + +| **Setting name** | **Description** | +|:-----------------------------------------------------------| :--- | +| `wlm.workload_group.duress_streak` | Determines the node duress threshold. Once the threshold is reached, the node is marked as `in duress`. | +| `wlm.workload_group.enforcement_interval` | Defines the monitoring interval. | +| `wlm.workload_group.mode` | Defines the [operating mode](#operating-modes). | +| `wlm.workload_group.node.memory_rejection_threshold` | Defines the workload group level `memory` threshold. When the threshold is reached, the request is rejected. | +| `wlm.workload_group.node.cpu_rejection_threshold` | Defines the workload group level `cpu` threshold. When the threshold is reached, the request is rejected. | +| `wlm.workload_group.node.memory_cancellation_threshold` | Controls whether the node is considered to be in duress when the `memory` threshold is reached. Requests routed to nodes in duress are canceled. | +| `wlm.workload_group.node.cpu_cancellation_threshold` | Controls whether the node is considered to be in duress when the `cpu` threshold is reached. Requests routed to nodes in duress are canceled. | + +When setting rejection and cancellation thresholds, remember that the rejection threshold for a resource should always be lower than the cancellation threshold. 
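+
+For example, the following request is a minimal sketch (the values are illustrative and assume that thresholds are expressed as fractions of node capacity) that keeps the CPU rejection threshold below the CPU cancellation threshold:
+
+```json
+PUT _cluster/settings
+{
+  "persistent": {
+    "wlm.workload_group.node.cpu_rejection_threshold": 0.8,
+    "wlm.workload_group.node.cpu_cancellation_threshold": 0.9
+  }
+}
+```
+{% include copy-curl.html %}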
+ +## Workload Management Stats API + +The Workload Management Stats API returns workload management metrics for a workload group, using the following method: + +```json +GET _wlm/stats +``` +{% include copy-curl.html %} + +### Example response + +```json +{ + “_nodes”: { + “total”: 1, + “successful”: 1, + “failed”: 0 + }, + “cluster_name”: “XXXXXXYYYYYYYY”, + “A3L9EfBIQf2anrrUhh_goA”: { + “workload_groups”: { + “16YGxFlPRdqIO7K4EACJlw”: { + “total_completions”: 33570, + “total_rejections”: 0, + “total_cancellations”: 0, + “cpu”: { + “current_usage”: 0.03319935314357281, + “cancellations”: 0, + “rejections”: 0 + }, + “memory”: { + “current_usage”: 0.002306486276211217, + “cancellations”: 0, + “rejections”: 0 + } + }, + “DEFAULT_WORKLOAD_GROUP”: { + “total_completions”: 42572, + “total_rejections”: 0, + “total_cancellations”: 0, + “cpu”: { + “current_usage”: 0, + “cancellations”: 0, + “rejections”: 0 + }, + “memory”: { + “current_usage”: 0, + “cancellations”: 0, + “rejections”: 0 + } + } + } + } +} +``` +{% include copy-curl.html %} + +### Response body fields + +| Field name | Description | +| :--- |:-------------------------------------------------------------------------------------------------------------------------------------------------| +| `total_completions` | The total number of request completions in the `workload_group` at the given node. This includes all shard-level and coordinator-level requests. | +| `total_rejections` | The total number request rejections in the `workload_group` at the given node. This includes all shard-level and coordinator-level requests. | +| `total_cancellations` | The total number of cancellations in the `workload_group` at the given node. This includes all shard-level and coordinator-level requests. | +| `cpu` | The `cpu` resource type statistics for the `workload_group`. | +| `memory` | The `memory` resource type statistics for the `workload_group`. | + +### Resource type statistics + +| Field name | Description | +| :--- |:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `current_usage` | The resource usage for the `workload_group` at the given node based on the last run of the monitoring thread. This value is updated based on the `wlm.workload_group.enforcement_interval`. | +| `cancellations` | The number of cancellations resulting from the cancellation threshold being reached. | +| `rejections` | The number of rejections resulting from the cancellation threshold being reached. | + diff --git a/_tuning-your-cluster/availability-and-recovery/workload-management/workload-group-lifecycle-api.md b/_tuning-your-cluster/availability-and-recovery/workload-management/workload-group-lifecycle-api.md new file mode 100644 index 00000000000..e8d49ae912c --- /dev/null +++ b/_tuning-your-cluster/availability-and-recovery/workload-management/workload-group-lifecycle-api.md @@ -0,0 +1,155 @@ +--- +layout: default +title: Workload Group Lifecycle API +nav_order: 20 +parent: Workload management +grand_parent: Availability and recovery +--- + +# Workload Group Lifecycle API + +The Workload Group Lifecycle API creates, updates, retrieves, and deletes workload groups. The API categorizes queries into specific groups, called _workload groups_, based on desired resource limits. 
+ +## Endpoints + + +### Create a workload group + +<!-- spec_insert_start +api: wlm.create_query_group +component: endpoints +omit_header: true +--> +```json +PUT /_wlm/query_group +``` +<!-- spec_insert_end --> + +### Update a workload group + +<!-- spec_insert_start +api: wlm.create_query_group +component: endpoints +omit_header: true +--> +```json +PUT /_wlm/query_group +``` +<!-- spec_insert_end --> + +### Get a workload group + +<!-- spec_insert_start +api: wlm.get_query_group +component: endpoints +omit_header: true +--> +```json +GET /_wlm/query_group +GET /_wlm/query_group/{name} +``` +<!-- spec_insert_end --> + +### Delete a workload group + +<!-- spec_insert_start +api: wlm.create_query_group +component: endpoints +omit_header: true +--> +```json +PUT /_wlm/query_group +``` +<!-- spec_insert_end --> + + +## Request body fields + +| Field | Description | +| :--- | :--- | +| `_id` | The ID of the workload group, which can be used to associate query requests with the group and enforce the group's resource limits. | +| `name` | The name of the workload group. | +| `resiliency_mode` | The resiliency mode of the workload group. Valid modes are `enforced`, `soft`, and `monitor`. For more information about resiliency modes, see [Operating modes]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/workload-management/wlm-feature-overview/#operating-modes). | +| `resource_limits` | The resource limits for query requests in the workload group. Valid resources are `cpu` and `memory`. | + +When creating a workload group, make sure that the sum of the resource limits for a single resource, either `cpu` or `memory`, does not exceed 1. + +## Example requests + +The following example requests show how to use the Workload Group Lifecycle API. + +### Create a workload group + +```json +PUT _wlm/workload_group +{ + "name": "analytics", + "resiliency_mode": "enforced", + "resource_limits": { + "cpu": 0.4, + "memory": 0.2 + } +} +``` +{% include copy-curl.html %} + +### Update a workload group + +```json +PUT _wlm/workload_group/analytics +{ + "resiliency_mode": "monitor", + "resource_limits": { + "cpu": 0.41, + "memory": 0.21 + } +} +``` +{% include copy-curl.html %} + + +## Example responses + +OpenSearch returns responses similar to the following. + +### Creating a workload group + +```json +{ + "_id":"preXpc67RbKKeCyka72_Gw", + "name":"analytics", + "resiliency_mode":"enforced", + "resource_limits":{ + "cpu":0.4, + "memory":0.2 + }, + "updated_at":1726270184642 +} +``` + +### Updating a workload group + +```json +{ + "_id":"preXpc67RbKKeCyka72_Gw", + "name":"analytics", + "resiliency_mode":"monitor", + "resource_limits":{ + "cpu":0.41, + "memory":0.21 + }, + "updated_at":1726270333804 +} +``` + +## Response body fields + +| Field | Description | +| :--- | :--- | +| `_id` | The ID of the workload group. | +| `name` | The name of the workload group. Required when creating a new workload group. | +| `resiliency_mode` | The resiliency mode of the workload group. | +| `resource_limits` | The resource limits of the workload group. | +| `updated_at` | The time at which the workload group was last updated. 
| + + diff --git a/_tuning-your-cluster/cluster-manager-task-throttling.md b/_tuning-your-cluster/cluster-manager-task-throttling.md index ace4547d440..bdb9caebf77 100644 --- a/_tuning-your-cluster/cluster-manager-task-throttling.md +++ b/_tuning-your-cluster/cluster-manager-task-throttling.md @@ -11,7 +11,7 @@ For many cluster state updates, such as defining a mapping or creating an index, The first line of defense is to implement mechanisms in the caller nodes to avoid task overload on the cluster manager. However, even with those mechanisms in place, the cluster manager needs a built-in way to protect itself: cluster manager task throttling. -To turn on cluster manager task throttling, you need to set throttling limits. The cluster manager uses the throttling limits to determine whether to reject a task. +By default, the cluster manager uses predefined throttling limits to determine whether to reject a task. You can modify these limits or disable throttling for specific task types. The cluster manager rejects a task based on its type. For any incoming task, the cluster manager evaluates the total number of tasks of the same type in the pending task queue. If this number exceeds the threshold for this task type, the cluster manager rejects the incoming task. Rejecting a task does not affect tasks of a different type. For example, if the cluster manager rejects a `put-mapping` task, it can still accept a subsequent `create-index` task. @@ -21,7 +21,7 @@ When the cluster manager rejects a task, the node performs retries with exponent You can set throttling limits by specifying them in the `cluster_manager.throttling.thresholds` object and updating the [OpenSearch cluster settings]({{site.url}}{{site.baseurl}}/api-reference/cluster-settings). The setting is dynamic, so you can change the behavior of this feature without restarting your cluster. -By default, throttling is disabled for all task types. +By default, throttling is enabled for all task types. To disable throttling for a specific task type, set its threshold value to `-1`. {: .note} The request has the following format: @@ -32,58 +32,65 @@ PUT _cluster/settings "persistent": { "cluster_manager.throttling.thresholds" : { "<task-type>" : { - "value" : <threshold limit> + "value" : <threshold> } } } } ``` -The following table describes the `cluster_manager.throttling.thresholds` object. +The `cluster_manager.throttling.thresholds` object contains the following fields. -Field Name | Description +Field name | Description :--- | :--- -task-type | The task type. See [supported task types](#supported-task-types) for a list of valid values. -value | The maximum number of tasks of the `task-type` type in the cluster manager's pending task queue. Default is `-1` (no task throttling). 
- -## Supported task types - -The following task types are supported: - -- `create-index` -- `update-settings` -- `cluster-update-settings` -- `auto-create` -- `delete-index` -- `delete-dangling-index` -- `create-data-stream` -- `remove-data-stream` -- `rollover-index` -- `index-aliases` -- `put-mapping` -- `create-index-template` -- `remove-index-template` -- `create-component-template` -- `remove-component-template` -- `create-index-template-v2` -- `remove-index-template-v2` -- `put-pipeline` -- `delete-pipeline` -- `create-persistent-task` -- `finish-persistent-task` -- `remove-persistent-task` -- `update-task-state` -- `put-script` -- `delete-script` -- `put-repository` -- `delete-repository` -- `create-snapshot` -- `delete-snapshot` -- `update-snapshot-state` -- `restore-snapshot` -- `cluster-reroute-api` - -#### Example request +`<task-type>` | The task type. For a list of valid task types, see [supported task types and default thresholds](#supported-task-types-and-default-thresholds). +`<task-type>.value` | The maximum number of tasks of the `task-type` type in the cluster manager's pending task queue. <br> For default thresholds for each task type, see [Supported task types and default thresholds](#supported-task-types-and-default-thresholds). + +## Supported task types and default thresholds + +The following table lists all supported task types and their default throttling threshold values. + +Task type | Threshold +:--- | :--- +`create-index `| 50 +`update-settings` | 50 +`cluster-update-settings` | 50 +`auto-create` | 200 +`delete-index` | 50 +`delete-dangling-index `| 50 +`create-data-stream` | 50 +`remove-data-stream` | 50 +`rollover-index` | 200 +`index-aliases` | 200 +`put-mapping` | 10000 +`create-index-template` | 50 +`remove-index-template` | 50 +`create-component-template` | 50 +`remove-component-template` | 50 +`create-index-template-v2` | 50 +`remove-index-template-v2` | 50 +`put-pipeline` | 50 +`delete-pipeline` | 50 +`put-search-pipeline` | 50 +`delete-search-pipeline` | 50 +`create-persistent-task` | 50 +`finish-persistent-task` | 50 +`remove-persistent-task` | 50 +`update-task-state` | 50 +`create-query-group` | 50 +`delete-query-group` | 50 +`update-query-group` | 50 +`put-script` | 50 +`delete-script` | 50 +`put-repository` | 50 +`delete-repository` | 50 +`create-snapshot` | 50 +`delete-snapshot` | 50 +`update-snapshot-state` | 5000 +`restore-snapshot` | 50 +`cluster-reroute-api` | 50 + +## Example request The following request sets the throttling threshold for the `put-mapping` task type to 100: @@ -99,9 +106,4 @@ PUT _cluster/settings } } ``` - -Set the threshold to `-1` to disable throttling for a task type. -{: .note} - - - +{% include copy-curl.html %} diff --git a/_tuning-your-cluster/index.md b/_tuning-your-cluster/index.md index fa0973395fc..735d355138a 100644 --- a/_tuning-your-cluster/index.md +++ b/_tuning-your-cluster/index.md @@ -20,7 +20,7 @@ To create and deploy an OpenSearch cluster according to your requirements, it’ There are many ways to design a cluster. The following illustration shows a basic architecture that includes a four-node cluster that has one dedicated cluster manager node, one dedicated coordinating node, and two data nodes that are cluster manager eligible and also used for ingesting data. - The nomenclature for the cluster manager node is now referred to as the cluster manager node. + The master node is now referred to as the cluster manager node. 
{: .note } ![multi-node cluster architecture diagram]({{site.url}}{{site.baseurl}}/images/cluster.png) @@ -37,7 +37,8 @@ Data | Stores and searches data. Performs all data-related operations (indexing, Ingest | Pre-processes data before storing it in the cluster. Runs an ingest pipeline that transforms your data before adding it to an index. | If you plan to ingest a lot of data and run complex ingest pipelines, we recommend you use dedicated ingest nodes. You can also optionally offload your indexing from the data nodes so that your data nodes are used exclusively for searching and aggregating. Coordinating | Delegates client requests to the shards on the data nodes, collects and aggregates the results into one final result, and sends this result back to the client. | A couple of dedicated coordinating-only nodes is appropriate to prevent bottlenecks for search-heavy workloads. We recommend using CPUs with as many cores as you can. Dynamic | Delegates a specific node for custom work, such as machine learning (ML) tasks, preventing the consumption of resources from data nodes and therefore not affecting any OpenSearch functionality. -Search | Provides access to [searchable snapshots]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot/). Incorporates techniques like frequently caching used segments and removing the least used data segments in order to access the searchable snapshot index (stored in a remote long-term storage source, for example, Amazon S3 or Google Cloud Storage). | Search nodes contain an index allocated as a snapshot cache. Thus, we recommend dedicated nodes with a setup with more compute (CPU and memory) than storage capacity (hard disk). +Warm | Provides access to [searchable snapshots]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot/). Incorporates techniques like frequently caching used segments and removing the least used data segments in order to access the searchable snapshot index (stored in a remote long-term storage source, for example, Amazon Simple Storage Service [Amazon S3] or Google Cloud Storage). | Search nodes contain an index allocated as a snapshot cache. Thus, we recommend using dedicated nodes with more compute (CPU and memory) than storage capacity (hard disk). +Search | Search nodes are dedicated nodes that host only search replica shards, helping separate search workloads from indexing workloads. | Because search nodes host search replicas and handle search traffic, we recommend using them for dedicated memory-optimized instances. By default, each node is a cluster-manager-eligible, data, ingest, and coordinating node. Deciding on the number of nodes, assigning node types, and choosing the hardware for each node type depends on your use case. You must take into account factors like the amount of time you want to hold on to your data, the average size of your documents, your typical workload (indexing, searches, aggregations), your expected price-performance ratio, your risk tolerance, and so on. 
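+
+As a reference point, the following `opensearch.yml` snippets are a minimal sketch of how the node types in the preceding table map to the `node.roles` setting (an empty list produces a coordinating-only node):
+
+```yaml
+# Dedicated cluster manager node
+node.roles: [ cluster_manager ]
+
+# Data node that also runs ingest pipelines
+node.roles: [ data, ingest ]
+
+# Coordinating-only node
+node.roles: [ ]
+
+# Warm node for searchable snapshots
+node.roles: [ warm ]
+
+# Dedicated search node for search replicas
+node.roles: [ search ]
+```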
diff --git a/_tuning-your-cluster/performance.md b/_tuning-your-cluster/performance.md index 28f47aeacb7..b5066a890cd 100644 --- a/_tuning-your-cluster/performance.md +++ b/_tuning-your-cluster/performance.md @@ -32,12 +32,9 @@ An increased `index.translog.flush_threshold_size` can also increase the time th Before increasing `index.translog.flush_threshold_size`, call the following API operation to get current flush operation statistics: ```json -curl -XPOST "os-endpoint/index-name/_stats/flush?pretty" +GET /<index>/_stats/flush?pretty ``` -{% include copy.html %} - - -Replace the `os-endpoint` and `index-name` with your endpoint and index name. +{% include copy-curl.html %} In the output, note the number of flushes and the total time. The following example output shows that there are 124 flushes, which took 17,690 milliseconds: @@ -53,9 +50,15 @@ In the output, note the number of flushes and the total time. The following exam To increase the flush threshold size, call the following API operation: ```json -curl -XPUT "os-endpoint/index-name/_settings?pretty" -d "{"index":{"translog.flush_threshold_size" : "1024MB"}}" +PUT /<index>/_settings +{ + "index": + { + "translog.flush_threshold_size" : "1024MB" + } +} ``` -{% include copy.html %} +{% include copy-curl.html %} In this example, the flush threshold size is set to 1024 MB, which is ideal for instances that have more than 32 GB of memory. @@ -65,9 +68,9 @@ Choose the appropriate threshold size for your cluster. Run the stats API operation again to see whether the flush activity changed: ```json -curl -XGET "os-endpoint/index-name/_stats/flush?pretty" +GET /<index>/_stats/flush ``` -{% include copy.html %} +{% include copy-curl.html %} It's a best practice to increase the `index.translog.flush_threshold_size` only for the current index. After you confirm the outcome, apply the changes to the index template. {: .note} @@ -127,14 +130,14 @@ To reduce the size of the OpenSearch response, use the `filter_path` parameter t In the following example, the `index-name`, `type-name`, and `took` fields are excluded from the response: ```json -curl -XPOST "es-endpoint/index-name/type-name/_bulk?pretty&filter_path=-took,-items.index._index,-items.index._type" -H 'Content-Type: application/json' -d' +POST /_bulk?pretty&filter_path=-took,-items.index._index,-items.index._type { "index" : { "_index" : "test2", "_id" : "1" } } { "user" : "testuser" } { "update" : {"_id" : "1", "_index" : "test2"} } { "doc" : {"user" : "example"} } ``` -{% include copy.html %} +{% include copy-curl.html %} ## Compression codecs -In OpenSearch 2.9 and later, there are two new codecs for compression: `zstd` and `zstd_no_dict`. You can optionally specify a compression level for these in the `index.codec.compression_level` setting with values in the [1, 6] range. [Benchmark]({{site.url}}{{site.baseurl}}/im-plugin/index-codecs/#benchmarking) data shows that `zstd` provides a 7% better write throughput and `zstd_no_dict` provides a 14% better throughput, along with a 30% improvement in storage compared with the `default` codec. For more information about compression, see [Index codecs]({{site.url}}{{site.baseurl}}/im-plugin/index-codecs/). \ No newline at end of file +In OpenSearch 2.9 and later, there are two new codecs for compression: `zstd` and `zstd_no_dict`. You can optionally specify a compression level for these in the `index.codec.compression_level` setting with values in the [1, 6] range. 
[Benchmark]({{site.url}}{{site.baseurl}}/im-plugin/index-codecs/#benchmarking) data shows that `zstd` provides a 7% better write throughput and `zstd_no_dict` provides a 14% better throughput, along with a 30% improvement in storage compared with the `default` codec. For more information about compression, see [Index codecs]({{site.url}}{{site.baseurl}}/im-plugin/index-codecs/). diff --git a/_tuning-your-cluster/replication-plugin/getting-started.md b/_tuning-your-cluster/replication-plugin/getting-started.md index 465a431996e..d5a36c7007b 100644 --- a/_tuning-your-cluster/replication-plugin/getting-started.md +++ b/_tuning-your-cluster/replication-plugin/getting-started.md @@ -55,7 +55,7 @@ plugins.security.nodes_dn: ``` ## Example setup -To start two single-node clusters on the same network, save this sample file as `docker-compose.yml` and run `docker-compose up`: +To start two single-node clusters on the same network, save this sample file as `docker-compose.yml` and run `docker compose up`: ```yml version: '3' diff --git a/_tuning-your-cluster/separate-index-and-search-workloads.md b/_tuning-your-cluster/separate-index-and-search-workloads.md new file mode 100644 index 00000000000..e1eabba81df --- /dev/null +++ b/_tuning-your-cluster/separate-index-and-search-workloads.md @@ -0,0 +1,200 @@ +--- +layout: default +title: Separate index and search workloads +nav_order: 42 +has_children: false +redirect_from: + - /tuning-your-cluster/seperate-index-and-search-workloads/ +--- + +# Separate index and search workloads + +In a remote-store-enabled cluster with a segment-replication-enabled index, you can segregate indexing and search workloads across different hardware by using the specialized `search` node role and provisioning corresponding search replicas in the index. + +OpenSearch uses two types of replicas: + +- **Write replicas**: Act as redundant copies of the primary shard. If a primary shard fails (for example, due to node drop or hardware issues), a write replica can be promoted as the new primary to ensure high availability for write operations. +- **Search replicas**: Work for search queries exclusively. Search replicas cannot be promoted as primaries. + +## Benefits of separating workloads + +Separating index and search workloads provides the following benefits: + +1. **Parallel and isolated processing**: Process indexing and search workloads in parallel and isolate them from each other to improve overall system throughput and ensure predictable performance. +2. **Independent scalability**: Scale indexing and search independently by adding more data nodes (for write replicas) or search nodes (for search replicas). +3. **Failure resilience**: Prevent failures in indexing or search from affecting each other to improve overall system availability. +4. **Cost efficiency and performance**: Use specialized hardware (for example, compute-optimized instances for indexing and memory-optimized instances for search) to reduce costs and enhance performance. +5. **Tuning flexibility**: Separately optimize performance settings, like buffers and caches, for indexing and search workloads. + +## Setting up workload separation + +To separate indexing and search workloads, you need to configure search nodes, enable the remote store, and add search replicas to your index. Follow these steps to set up workload separation in your cluster. + +### Step 1: Configure search nodes + +Before you can separate your workloads, you need to designate specific nodes for search operations. 
Search nodes are dedicated to serving search requests and can help optimize your cluster's search performance. + +The following request configures a node for search-only workloads in `opensearch.yml`: + +```yaml +node.name: searcher-node1 +node.roles: [ search ] +``` + +### Step 2: Enable the remote store + +The remote store provides a centralized storage location for your index data. This configuration is essential for segment replication and ensures that all nodes can access the same data, regardless of their role. Remote storage is particularly useful in cloud environments where you want to separate storage from compute resources. + +The following request sets the repository configuration for a remote store (for example, Amazon Simple Storage Service [Amazon S3]) in `opensearch.yml`: + +```yaml +node.attr.remote_store.segment.repository: "my-repository" +node.attr.remote_store.translog.repository: "my-repository" +node.attr.remote_store.state.repository: "my-repository" +node.attr.remote_store.repository.my-repository.type: s3 +node.attr.remote_store.repository.my-repository.settings.bucket: <Bucket Name 1> +node.attr.remote_store.repository.my-repository.settings.base_path: <Bucket Base Path 1> +node.attr.remote_store.repository.my-repository.settings.region: <Region> +``` + +For more information, see [Remote-backed storage]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/remote-store/index/). + +When separating index and search workloads, set `cluster.remote_store.state.enabled` to `true` during initial setup. This setting ensures that OpenSearch stores index metadata in the remote store, enabling seamless recovery of search replicas in [search-only mode](#turn-off-write-workloads-with-search-only-mode). For more information, see [Search replica recovery scenarios](#search-replica-recovery-scenarios). +{: .note} + + +### Step 3: Add search replicas to an index + +After configuring your nodes and the remote store, you need to set up search replicas for your indexes. Search replicas are copies of your index that are dedicated to handling search requests, allowing you to scale your search capacity independently of your indexing capacity. + +By default, indexes created in a remote-store-enabled cluster use segment replication. For more information, see [Segment replication]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/segment-replication/index/). + +You can add search replicas for an index using the `number_of_search_replicas` setting (default is 0) in one the following ways. + +#### Option 1: Create an index with search replicas + +Use this option when you're creating a new index and want to configure search replicas at the beginning of the process. This approach is ideal for planning your workload separation strategy before indexing data. + +The following request creates an index with one primary, one replica, and two search replicas: + +```json +PUT /my-index +{ + "settings": { + "index": { + "number_of_shards": 1, + "number_of_replicas": 1, + "number_of_search_replicas": 2, + } + } +} +``` +{% include copy-curl.html %} + +#### Option 2: Update the search replica count for an existing index + +Use this option when you have an existing index and want to add or modify search replicas. This is useful when you need to adjust your search capacity based on changing workload demands. 
+ +The following request updates the search replica count: + +```json +PUT /my-index/_settings +{ + "settings": { + "index": { + "number_of_search_replicas": 1 + } + } +} +``` +{% include copy-curl.html %} + +#### Option 3: Restore an index from a snapshot with search replicas + +Use this option when you're restoring an index from a snapshot and want to configure search replicas during the restore process. This is particularly useful for disaster recovery scenarios or when migrating indexes between clusters. + +The following request restores an index from a snapshot with search replicas: + +```json +POST /_snapshot/my-repository/my-snapshot/_restore +{ + "indices": "my-index", + "index_settings": { + "index.number_of_search_replicas": 2, + "index.replication.type": "SEGMENT" + } +}' +``` +{% include copy-curl.html %} + +## Additional configuration + +After setting up basic workload separation, you can fine-tune your configuration to optimize performance and resource utilization. The following settings allow you to control search routing, automatically scale replicas, and manage write workloads based on your specific needs. + +### Enforce cluster-level search request routing + +When search replicas are enabled, all search traffic is routed to them by default. The following request enforces or relaxes this routing behavior: + +```json +PUT /_cluster/settings +{ + "persistent": { + "cluster.routing.search_replica.strict": "true" + } +} +``` +{% include copy-curl.html %} + +The `cluster.routing.search_replica.strict` setting supports the following options: + +- `true` (default): Route only to search replicas. +- `false`: Allow fallback to primary/write replicas if needed. + +### Automatically scale search replicas + +Use the `auto_expand_search_replicas` index setting to automatically scale search replicas based on the number of available search nodes in the cluster. For more information, see [Index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#dynamic-index-level-index-settings). + +### Turn off write workloads with search-only mode + +You can use the `_scale` API to turn off primary shards and write replicas for an index when you don't need to write to it. This approach works well for write-once, read-many scenarios like log analytics, where you can reduce resource usage by keeping only search replicas active. + +The following request turns on search-only mode by deactivating write replicas: + +```json +POST my_index/_scale +{ + "search_only": true +} +``` +{% include copy-curl.html %} + +The following request turns off search-only mode by activating write replicas: + +```json +POST my_index/_scale +{ + "search_only": false +} +``` +{% include copy-curl.html %} + +#### Search replica recovery scenarios + +OpenSearch handles recovery of search replicas in search-only mode differently depending on the configuration. + +##### Scenario 1: Persistent data directory with remote store state disabled + +When you use a persistent data directory and set `cluster.remote_store.state.enabled` to `false`, search replicas recover automatically after node restarts. + +##### Scenario 2: Remote store state enabled without a persistent data directory + +When `cluster.remote_store.state.enabled` is set to `true` and there is no persistent data directory, OpenSearch recovers search replicas without requiring primaries or write replicas. Because remote store state is enabled, OpenSearch retains the index metadata after a restart. 
The allocation logic skips the active primary check for search replicas, allowing them to be allocated so that search queries remain functional. + +##### Scenario 3: Remote store state enabled with a persistent data directory + +This configuration provides seamless recovery. In search-only mode, with both a persistent data directory and `cluster.remote_store.state.enabled` set to `true`, OpenSearch starts only search replicas—excluding primaries and write replicas—ensuring the index can be queried after restart. + +##### Scenario 4: No persistent data directory and remote store state disabled + +When both the persistent data directory is missing and `cluster.remote_store.state.enabled` is set to `false`, all local state is lost on restart. OpenSearch has no metadata reference, so the index becomes unrecoverable. + diff --git a/_tutorials/gen-ai/agents/build-plan-execute-reflect-agent.md b/_tutorials/gen-ai/agents/build-plan-execute-reflect-agent.md new file mode 100644 index 00000000000..5fd11348471 --- /dev/null +++ b/_tutorials/gen-ai/agents/build-plan-execute-reflect-agent.md @@ -0,0 +1,340 @@ +--- +layout: default +title: Building a plan-execute-reflect agent +parent: Agentic AI +grand_parent: Generative AI +nav_order: 20 +--- + +# Building a plan-execute-reflect agent + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/ml-commons/issues/3745). +{: .warning} + +This tutorial describes how to build and use a _plan-execute-reflect_ agent. This agent can be used to solve complex problems that benefit from multi-step execution and reasoning. In this example, you will ask the agent to analyze flight data in your OpenSearch index. For more information about this agent, see [Plan-execute-reflect agents]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/plan-execute-reflect/). + +Replace the placeholders beginning with the prefix `your_` with your own values. +{: .note} + +## Prerequisite + +Log in to the OpenSearch Dashboards home page, select **Add sample data**, and add the **Sample Flight data**. + +## Step 1: Prepare an LLM + +A plan-execute-reflect agent requires a large language model (LLM) in order to function. This tutorial uses the [Anthropic Claude 3.7 model hosted on Amazon Bedrock](https://aws.amazon.com/bedrock/claude/). You can also [use other supported LLMs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/plan-execute-reflect/#supported-llms). 
+ +### Step 1(a): Create a connector + +Create a connector for the model: + +```json +POST /_plugins/_ml/connectors/_create +{ + "name": "Amazon Bedrock Claude 3.7-sonnet connector", + "description": "Connector to Amazon Bedrock service for the Claude model", + "version": 1, + "protocol": "aws_sigv4", + "parameters": { + "region": "your_aws_region", + "service_name": "bedrock", + "model": "us.anthropic.claude-3-7-sonnet-20250219-v1:0" + }, + "credential": { + "access_key": "your_aws_access_key", + "secret_key": "your_aws_secret_key", + "session_token": "your_aws_session_token" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "https://bedrock-runtime.${parameters.region}.amazonaws.com/model/${parameters.model}/converse", + "headers": { + "content-type": "application/json" + }, + "request_body": "{ \"system\": [{\"text\": \"${parameters.system_prompt}\"}], \"messages\": [${parameters._chat_history:-}{\"role\":\"user\",\"content\":[{\"text\":\"${parameters.prompt}\"}]}${parameters._interactions:-}]${parameters.tool_configs:-} }" + } + ] +} +``` +{% include copy-curl.html %} + +Note the connector ID; you'll use it to register the model. + +### Step 1(b): Register the model + +Register the model: + +```json +POST /_plugins/_ml/models/_register +{ + "name": "Bedrock Claude Sonnet model", + "function_name": "remote", + "description": "Bedrock Claude 3.7 sonnet model for Plan, Execute and Reflect Agent", + "connector_id": "your_connector_id" +} +``` +{% include copy-curl.html %} + +Note the model ID; you'll use it in the following steps. + +### Step 1(c): Configure a retry policy + +Because the agent is a long-running agent that executes multiple steps, we strongly recommend configuring a retry policy for your model. For more information, see the `client_config` parameter in [Configuration parameters]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/#configuration-parameters). For example, to configure unlimited retries, set `max_retry_times` to `-1`: + +```json +PUT /_plugins/_ml/models/your_model_id +{ + "connector": { + "client_config": { + "max_retry_times": -1, + "retry_backoff_millis": 300, + "retry_backoff_policy": "exponential_full_jitter" + } + } +} +``` +{% include copy-curl.html %} + +## Step 2: Create an agent + +Create a `plan_execute_and_reflect` agent configured with the following information: + +- Meta information: `name`, `type`, `description`. +- LLM information: The agent uses an LLM to reason, devise a plan for completing the task, execute the steps in the plan using appropriate tools, and reflect on the intermediate results in order to optimize the plan. +- Tools: A tool is a function that can be executed by the agent. Each tool can define its own `name`, `description`, `parameters` and `attributes`. +- Memory: Stores chat messages. OpenSearch currently only supports one memory type: `conversation_index`. + +For more information about all request fields, see [Register Agent API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/register-agent/#request-body-fields). + +To register the agent, send the following request. 
In this example, you'll create an agent with the `ListIndexTool`, `SearchIndexTool`, and `IndexMappingTool`: + +```json +POST _plugins/_ml/agents/_register +{ + "name": "My Plan Execute and Reflect agent with Claude 3.7", + "type": "plan_execute_and_reflect", + "description": "this is a test agent", + "llm": { + "model_id": "your_llm_model_id_from_step1", + "parameters": { + "prompt": "${parameters.question}" + }}, + "memory": { + "type": "conversation_index" + }, + "parameters": { + "_llm_interface": "bedrock/converse/claude" + }, + "tools": [ + { + "type": "ListIndexTool" + }, + { + "type": "SearchIndexTool" + }, + { + "type": "IndexMappingTool" + } + ] +} +``` +{% include copy-curl.html %} + +Note the agent ID; you'll use it in the next step. + +You can configure other tools relevant to your use case as needed. When configuring additional tools, make sure to provide the `attributes` field for each tool. This is crucial because `attributes` inform the LLM of the expected input schema for executing the tool. + +`ListIndexTool`, `SearchIndexTool`, `IndexMappingTool`, and `WebSearchTool` contain predefined attributes. For example, the `ListIndexTool` provides the following attributes: + +```json +"tools": [{ + "type": "ListIndexTool", + "attributes": { + "input_schema": { + "type": "object", + "properties": { + "indices": { + "type": "array", + "items": { + "type": "string" + }, + "description": "OpenSearch index name list, separated by comma. for example: [\"index1\", \"index2\"], use empty array [] to list all indices in the cluster" + } + } + }, + "strict": false + } +}] +``` + +### Test the agent + +Use the following tips to test your `plan_execute_and_reflect` agent effectively: + +- **Trace agent execution**: Use the Get Message Traces API to view detailed execution steps: + ```http + GET _plugins/_ml/memory/message/your_message_id/traces + ``` + +- **Mitigate hallucinations**: An LLM may "hallucinate" by selecting the wrong tool or misinterpreting the task, especially if the agent is configured with too many tools. To avoid hallucinations, try the following options: + - Limit the number of tools configured in an agent. + - Provide clear, specific descriptions for each tool. + - Ensure the agent has access to all necessary tools for the task. + - Include relevant context about your cluster in the prompt; for example, `Can you identify the error in my cluster by analyzing the "spans" and "logs" indexes?` + +- **Configure retries**: LLM calls can occasionally fail. Set up retries to improve reliability. For more information, see the `client_config` parameter in [Configuration parameters]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/#configuration-parameters). + +To test the agent, run it using the [Execute Agent API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/agent-apis/execute-agent/). Because this agent performs long-running tasks, we recommend running it asynchronously to avoid timeouts. Use the `async=true` query parameter to run the agent as a separate task: + +```json +POST _plugins/_ml/agents/your_agent_id/_execute?async=true +{ + "parameters": { + "question": "How many flights from Beijing to Seattle?" + } +} +``` +{% include copy-curl.html %} + +Note the `task_id` and `memory_id` in the response. You'll use these to track progress and view results.
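+ +Because the agent runs as an asynchronous task, the execute call returns immediately with task information. The following is an illustrative sketch of that initial response; the `status` value and all IDs are placeholders, and the exact fields and nesting may vary by OpenSearch version: + +```json +{ + "task_id": "your_task_id", + "status": "RUNNING", + "response": { + "memory_id": "your_memory_id", + "parent_interaction_id": "your_parent_interaction_id" + } +} +```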
+ +Use the following request to check whether the task is still running or has completed: + +```json +GET _plugins/_ml/tasks/your_task_id +``` +{% include copy-curl.html %} + +Once the task is completed, it returns a response from the agent: + +```json +{ + "task_type": "AGENT_EXECUTION", + "function_name": "AGENT", + "state": "COMPLETED", + "worker_node": [ + "q5yAqa75RM-rv0I67V1VVQ" + ], + "create_time": 1746148548710, + "last_update_time": 1746148706345, + "is_async": false, + "response": { + "memory_id": "bzWQjpYBKhItn1nNYHtu", + "inference_results": [ + { + "output": [ + { + "result": "bzWQjpYBKhItn1nNYHtu", + "name": "memory_id" + }, + { + "result": "cDWQjpYBKhItn1nNYHuS", + "name": "parent_interaction_id" + }, + { + "result": "dTWQjpYBKhItn1nNbHsw", + "name": "executor_agent_memory_id" + }, + { + "result": "YjWQjpYBKhItn1nN6oYk", + "name": "executor_agent_parent_interaction_id" + }, + { + "name": "response", + "dataAsMap": { + "response": """# Comprehensive Analysis Report: Flights from Beijing to Seattle + +## Executive Summary +After analyzing the OpenSearch sample flight dataset, I found that there are 0 direct flights from Beijing to Seattle in the dataset. + +## Analysis Process + +### Step 1: Identify Available Data Sources +I began by examining the indices available in the OpenSearch cluster to locate flight-related data. This search revealed one relevant index: `opensearch_dashboards_sample_data_flights`, which contains 13,059 flight records with comprehensive information including origin and destination cities, flight numbers, carriers, and other flight details. + +### Step 2: Data Schema Analysis +I analyzed the index structure and confirmed it contains the necessary fields for this investigation, including: +- Origin/destination city names (`OriginCityName`, `DestCityName`) +- Airport codes (`Origin`, `Dest`) +- Airport IDs (`OriginAirportID`, `DestAirportID`) +- Geographic information for origins and destinations + +### Step 3: Query Construction and Execution +I created and executed a search query to find flights where: +- Origin city is Beijing (also checked for "Beijing Capital International Airport" and airport code "PEK") +- Destination city is Seattle (also checked for "Seattle Tacoma International Airport" and airport code "SEA") + +### Step 4: Result Verification +To ensure the search was properly constructed, I verified that: +1. Flights from Beijing to other destinations exist in the dataset +2. Flights to Seattle from other origins exist in the dataset + +This confirmed that both cities are represented in the data, but no flights connect them directly. + +## Key Findings +- Beijing appears as an origin city in the dataset, with flights to destinations including Warsaw, Pittsburgh, Xi'an, Vienna, and Chicago/Rockford +- Seattle appears as both origin and destination in the dataset, with connections to cities like Vienna, Istanbul, New Orleans, St Louis, and Treviso +- The dataset contains 0 flights from Beijing to Seattle + +## Conclusion +Based on a comprehensive search of the OpenSearch flight sample dataset, there are 0 flights from Beijing to Seattle in this dataset. While both cities appear in the dataset with connections to other locations, this specific route is not represented in the sample data.""" + } + } + ] + } + ] + } +} +``` + +The agent execution response includes several key fields: + +- `memory_id`: The ID of the memory that stores all messages exchanged between the `plan_execute_and_reflect` agent and the LLM. 
+- `parent_interaction_id`: The `message_id` of the parent message that initiated the conversation in the planning agent. +- `executor_agent_memory_id`: The ID of the memory that stores messages exchanged between the internal executor agent and the LLM. +- `executor_agent_parent_interaction_id`: The `message_id` of the parent message in the executor agent's conversation. +- `response`: The final result produced by the agent after all steps are executed. + +When you execute a plan-execute-reflect agent asynchronously, the API returns the `memory_id` and the `parent_interaction_id` of the planner agent once the agent is started. + +In the final response, the API also returns the `executor_agent_memory_id` and `executor_agent_parent_interaction_id`, which correspond to the internal executor agent responsible for carrying out each step of the plan. These IDs are updated in the task as soon as they are available, even before the agent has completed execution, enabling real-time tracking of the execution process. + +To inspect the message history of the agent, use the Get Memory API: + +```json +GET _plugins/_ml/memory/your_memory_id/messages +``` +{% include copy-curl.html %} + +For more information, see the [Memory APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/). + +Note the `message_id` of the relevant message and use it to fetch the step-by-step execution trace: + +```json +GET _plugins/_ml/memory/message/your_message_id/traces +``` +{% include copy-curl.html %} + +For more information, see the [Get Message Traces API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/get-message-traces/). + +### Test conversational memory + +To continue the same conversation, specify the conversation's `memory_id` when executing the agent. Previous messages are extracted and provided as context to the model. Use the `memory_id` of the planner agent to continue a conversation: + +```json +POST _plugins/_ml/agents/your_agent_id/_execute?async=true +{ + "parameters": { + "question": "your_question", + "memory_id": "your_memory_id" + } +} +``` +{% include copy-curl.html %} + +## Next steps + +- For information about using other models, see [Supported LLMs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/plan-execute-reflect/#supported-llms). +- For information about creating agents with custom prompts, see [Modifying default prompts]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/plan-execute-reflect/#modifying-default-prompts).
\ No newline at end of file diff --git a/_tutorials/gen-ai/agents/index.md b/_tutorials/gen-ai/agents/index.md new file mode 100644 index 00000000000..9263916d94e --- /dev/null +++ b/_tutorials/gen-ai/agents/index.md @@ -0,0 +1,31 @@ +--- +layout: default +title: Agentic AI +parent: Generative AI +has_children: true +has_toc: false +nav_order: 20 +redirect_from: + - /tutorials/gen-ai/agents/ +flows: + - heading: "Building a flow agent" + link: /ml-commons-plugin/agents-tools/agents-tools-tutorial/ + description: "Learn how to build a flow agent for RAG" + list: + - "<b>Platform:</b> OpenSearch" + - "<b>Model:</b> Anthropic Claude" + - "<b>Deployment:</b> Amazon Bedrock" + - heading: "Building a plan-execute-reflect agent" + link: /tutorials/gen-ai/agents/build-plan-execute-reflect-agent/ + description: "Learn how to build a powerful <i>plan-execute-reflect</i> agent for solving complex problems" + list: + - "<b>Platform:</b> OpenSearch" + - "<b>Model:</b> Anthropic Claude 3.7 Sonnet" + - "<b>Deployment:</b> Amazon Bedrock" +--- + +# Agentic AI tutorials + +The following tutorials show you how to build agents and chatbots using OpenSearch. + +{% include cards.html cards=page.flows %} \ No newline at end of file diff --git a/_tutorials/gen-ai/ai-search-flows/index.md b/_tutorials/gen-ai/ai-search-flows/index.md new file mode 100644 index 00000000000..f6a4414ea77 --- /dev/null +++ b/_tutorials/gen-ai/ai-search-flows/index.md @@ -0,0 +1,21 @@ +--- +layout: default +title: AI search workflows +parent: Generative AI +has_children: false +has_toc: false +nav_order: 40 +redirect_from: + - /tutorials/ai-search-flows/ + - /tutorials/gen-ai/ai-search-flows/ +flows: + - heading: Creating and customizing AI search workflows + link: /vector-search/ai-search/workflow-builder/ + description: "Learn how to build AI search flows in OpenSearch Dashboards" +--- + +# AI search workflows tutorials + +The following tutorials show you how to build AI search workflows. + +{% include cards.html cards=page.flows %} \ No newline at end of file diff --git a/_ml-commons-plugin/tutorials/build-chatbot.md b/_tutorials/gen-ai/chatbots/build-chatbot.md similarity index 99% rename from _ml-commons-plugin/tutorials/build-chatbot.md rename to _tutorials/gen-ai/chatbots/build-chatbot.md index 1e512981061..1f2898b1a85 100644 --- a/_ml-commons-plugin/tutorials/build-chatbot.md +++ b/_tutorials/gen-ai/chatbots/build-chatbot.md @@ -1,8 +1,15 @@ --- layout: default title: Build your own chatbot -parent: Tutorials -nav_order: 60 +parent: Chatbots +grand_parent: Generative AI +has_children: false +has_toc: false +nav_order: 170 +redirect_from: + - /ml-commons-plugin/tutorials/build-chatbot/ + - /vector-search/tutorials/chatbots/build-chatbot/ + - /tutorials/gen-ai/chatbots/build-chatbot/ --- # Build your own chatbot @@ -275,7 +282,7 @@ POST _plugins/_ml/agents/_register } }, { - "type": "CatIndexTool", + "type": "ListIndexTool", "description": "Use this tool to get OpenSearch index information: (health, status, index, uuid, primary count, replica count, docs.count, docs.deleted, store.size, primary.store.size). \nIt takes 2 optional arguments named `index` which is a comma-delimited list of one or more indices to get information from (default is an empty list meaning all indices), and `local` which means whether to return information from the local node only instead of the cluster manager node (default is false)." }, { @@ -309,7 +316,7 @@ Note the following testing tips: - Avoid configuring many tools in an agent. 
- Provide a detailed tool description clarifying what the tool can do. - Specify the tool to use in the LLM question, for example, `Can you use the PPLTool to query the opensearch_dashboards_sample_data_ecommerce index so it can calculate how many orders were placed last week?`. - - Specify the tool to use when executing an agent. For example, specify that only `PPLTool` and `CatIndexTool` should be used to process the current request. + - Specify the tool to use when executing an agent. For example, specify that only the `PPLTool` and `ListIndexTool` should be used to process the current request. Test the agent: @@ -319,7 +326,7 @@ POST _plugins/_ml/agents/your_agent_id/_execute "parameters": { "question": "Can you query with index opensearch_dashboards_sample_data_ecommerce to calculate how many orders in last week?", "verbose": false, - "selected_tools": ["PPLTool", "CatIndexTool"] + "selected_tools": ["PPLTool", "ListIndexTool"] } } ``` diff --git a/_tutorials/gen-ai/chatbots/index.md b/_tutorials/gen-ai/chatbots/index.md new file mode 100644 index 00000000000..9ddac7a5e5e --- /dev/null +++ b/_tutorials/gen-ai/chatbots/index.md @@ -0,0 +1,37 @@ +--- +layout: default +title: Chatbots +parent: Generative AI +has_children: true +has_toc: false +nav_order: 30 +redirect_from: + - /vector-search/tutorials/chatbots/ + - /tutorials/gen-ai/chatbots/ +chatbots: + - heading: RAG chatbot + link: /tutorials/gen-ai/chatbots/rag-chatbot/ + list: + - "<b>Platform:</b> OpenSearch" + - "<b>Model:</b> Anthropic Claude" + - "<b>Deployment:</b> Amazon Bedrock" + - heading: RAG with a conversational flow agent + link: /tutorials/gen-ai/chatbots/rag-conversational-agent/ + list: + - "<b>Platform:</b> OpenSearch" + - "<b>Model:</b> Anthropic Claude" + - "<b>Deployment:</b> Amazon Bedrock" + - heading: Build your own chatbot + link: /tutorials/gen-ai/chatbots/build-chatbot/ + list: + - "<b>Platform:</b> OpenSearch" + - "<b>Model:</b> Anthropic Claude" + - "<b>Deployment:</b> Amazon Bedrock" +--- + +# Tutorials: Building chatbots + +The following machine learning (ML) tutorials show you how to implement chatbots using agents. + +{% include cards.html cards=page.chatbots %} + \ No newline at end of file diff --git a/_ml-commons-plugin/tutorials/rag-chatbot.md b/_tutorials/gen-ai/chatbots/rag-chatbot.md similarity index 97% rename from _ml-commons-plugin/tutorials/rag-chatbot.md rename to _tutorials/gen-ai/chatbots/rag-chatbot.md index 5dddded23af..9afa460bfa0 100644 --- a/_ml-commons-plugin/tutorials/rag-chatbot.md +++ b/_tutorials/gen-ai/chatbots/rag-chatbot.md @@ -1,15 +1,22 @@ --- layout: default title: RAG chatbot -parent: Tutorials -nav_order: 50 +parent: Chatbots +grand_parent: Generative AI +nav_order: 150 +has_children: false +has_toc: false +redirect_from: + - /ml-commons-plugin/tutorials/rag-chatbot/ + - /vector-search/tutorials/chatbots/rag-chatbot/ + - /tutorials/gen-ai/chatbots/rag-chatbot/ --- # RAG chatbot One of the known limitations of large language models (LLMs) is that their knowledge base only contains information from the period of time during which they were trained. LLMs have no knowledge of recent events or of your internal data. You can augment the LLM knowledge base by using retrieval-augmented generation (RAG). -This tutorial illustrates how to build your own chatbot using [agents and tools](https://opensearch.org/docs/latest/ml-commons-plugin/agents-tools/index/) and RAG. RAG supplements the LLM knowledge base with information contained in OpenSearch indexes. 
+This tutorial shows you how to build your own chatbot using [agents and tools]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/index/) and RAG. RAG supplements the LLM knowledge base with information contained in OpenSearch indexes. Replace the placeholders beginning with the prefix `your_` with your own values. {: .note} diff --git a/_ml-commons-plugin/tutorials/rag-conversational-agent.md b/_tutorials/gen-ai/chatbots/rag-conversational-agent.md similarity index 96% rename from _ml-commons-plugin/tutorials/rag-conversational-agent.md rename to _tutorials/gen-ai/chatbots/rag-conversational-agent.md index 86fe38416a6..314b306bbb5 100644 --- a/_ml-commons-plugin/tutorials/rag-conversational-agent.md +++ b/_tutorials/gen-ai/chatbots/rag-conversational-agent.md @@ -1,8 +1,15 @@ --- layout: default title: RAG chatbot with a conversational flow agent -parent: Tutorials -nav_order: 40 +parent: Chatbots +grand_parent: Generative AI +nav_order: 160 +has_children: false +has_toc: false +redirect_from: + - /ml-commons-plugin/tutorials/rag-conversational-agent/ + - /vector-search/tutorials/chatbots/rag-conversational-agent/ + - /tutorials/gen-ai/chatbots/rag-conversational-agent/ --- # RAG chatbot with a conversational flow agent @@ -16,7 +23,7 @@ An alternative way to build RAG conversational search is to use a RAG pipeline. ## Prerequisite -In this tutorial, you'll build a RAG application that provides an OpenSearch [k-NN index]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/) as a knowledge base for a large language model (LLM). For data retrieval, you'll use [semantic search]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/). For a comprehensive semantic search tutorial, see [Neural search tutorial]({{site.url}}{{site.baseurl}}/search-plugins/neural-search-tutorial/). +In this tutorial, you'll build a RAG application that provides an OpenSearch [vector index]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/) as a knowledge base for a large language model (LLM). For data retrieval, you'll use [semantic search]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/). For a comprehensive semantic search setup, see [this tutorial]({{site.url}}{{site.baseurl}}/search-plugins/neural-search-tutorial/). First, you'll need to update your cluster settings. If you don't have a dedicated machine learning (ML) node, set `"plugins.ml_commons.only_run_on_ml_node": false`. To avoid triggering a native memory circuit breaker, set `"plugins.ml_commons.native_memory_threshold"` to 100%: @@ -44,7 +51,7 @@ Register a text embedding model that will translate text into vector embeddings: POST /_plugins/_ml/models/_register { "name": "huggingface/sentence-transformers/all-MiniLM-L12-v2", - "version": "1.0.1", + "version": "1.0.2", "model_format": "TORCH_SCRIPT" } ``` @@ -104,9 +111,9 @@ PUT /_ingest/pipeline/test_population_data_pipeline For more information about ingest pipelines, see [Ingest pipelines]({{site.url}}{{site.baseurl}}/ingest-pipelines/). -### Step 1.3: Create a k-NN index +### Step 1.3: Create a vector index -Create a k-NN index specifying the ingest pipeline as a default pipeline: +Create a vector index specifying the ingest pipeline as a default pipeline: ```json PUT test_population_data @@ -133,11 +140,11 @@ PUT test_population_data ``` {% include copy-curl.html %} -For more information about k-NN indexes, see [k-NN index]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/). 
+For more information about vector indexes, see [Creating a vector index]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/). ### Step 1.4: Ingest data -Ingest test data into the k-NN index: +Ingest test data into the vector index: ```json POST _bulk @@ -244,7 +251,7 @@ POST /_plugins/_ml/models/your_LLM_model_id/_predict ## Step 3: Register an agent -OpenSearch provides the following agent types: `flow`, `conversational_flow`, and `conversational`. For more information about agents, see [Agents]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/index/#agents). +OpenSearch provides the following agent types: `flow`, `conversational_flow`, and `conversational`. For more information about agents, see [Agents]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/agents/). You will use a `conversational_flow` agent in this tutorial. The agent consists of the following: @@ -632,7 +639,7 @@ POST /_plugins/_ml/agents/your_agent_id/_execute ``` {% include copy-curl.html %} -### Run a neural search query +### Run a vector search ```json POST /_plugins/_ml/agents/your_agent_id/_execute @@ -662,7 +669,7 @@ To expose the `question` parameter, see [Exposing only the `question` parameter] ### Run a hybrid search query -Hybrid search combines keyword and neural search to improve search relevance. For more information, see [Hybrid search]({{site.url}}{{site.baseurl}}/search-plugins/hybrid-search/). +Hybrid search combines keyword and vector search to improve search relevance. For more information, see [Hybrid search]({{site.url}}{{site.baseurl}}/search-plugins/hybrid-search/). Configure a search pipeline: diff --git a/_tutorials/gen-ai/index.md b/_tutorials/gen-ai/index.md new file mode 100644 index 00000000000..d326b2cf8ef --- /dev/null +++ b/_tutorials/gen-ai/index.md @@ -0,0 +1,31 @@ +--- +layout: default +title: Generative AI +has_children: true +has_toc: false +nav_order: 30 +redirect_from: + - /tutorials/gen-ai/ +cards: + - heading: "RAG" + description: "Build retrieval-augmented generation and conversational search applications" + link: "/tutorials/gen-ai/rag/" + - heading: "Agentic AI" + description: "Build your generative AI applications using agents" + link: "/tutorials/gen-ai/agents/" + - heading: "Chatbots" + description: "Build your generative AI applications using chatbots" + link: "/tutorials/gen-ai/chatbots/" + - heading: "AI search workflows" + link: "/tutorials/gen-ai/ai-search-flows/" + description: "Build and configure AI search applications visually in OpenSearch Dashboards" + - heading: "Model guardrails" + description: "Add safety boundaries to your models to ensure controlled responses" + link: "/tutorials/gen-ai/model-controls/" +--- + +# Generative AI tutorials + +Explore the following tutorials to learn about implementing generative AI applications using the OpenSearch vector database. For more information about OpenSearch generative AI functionality, see [Vector search]({{site.url}}{{site.baseurl}}/vector-search/) and [Machine learning]({{site.url}}{{site.baseurl}}/ml-commons-plugin/). 
+ +{% include cards.html cards=page.cards %} diff --git a/_tutorials/gen-ai/model-controls/bedrock-guardrails.md b/_tutorials/gen-ai/model-controls/bedrock-guardrails.md new file mode 100644 index 00000000000..944f08c07a2 --- /dev/null +++ b/_tutorials/gen-ai/model-controls/bedrock-guardrails.md @@ -0,0 +1,401 @@ +--- +layout: default +title: Amazon Bedrock model guardrails +parent: Model guardrails +grand_parent: Generative AI +nav_order: 170 +redirect_from: + - /ml-commons-plugin/tutorials/bedrock-guardrails/ + - /vector-search/tutorials/model-controls/bedrock-guardrails/ +--- + +# Amazon Bedrock model guardrails + +This tutorial shows you how to apply Amazon Bedrock guardrails to your externally hosted models in two ways: + +- [Using the Amazon Bedrock Guardrails standalone API](#using-the-amazon-bedrock-guardrails-standalone-api) +- [Using guardrails embedded in the Amazon Bedrock Model Inference API](#using-guardrails-embedded-in-the-amazon-bedrock-model-inference-api) + +For more information about guardrails, see [Configuring model guardrails]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/guardrails/). + +Replace the placeholders starting with the prefix `your_` with your own values. +{: .note} + +## Prerequisites + +Before you begin, you must create your Amazon Bedrock guardrails. For detailed instructions, see [Create a guardrail](https://docs.aws.amazon.com/bedrock/latest/userguide/guardrails-create.html). + +## Using the Amazon Bedrock Guardrails standalone API + +Use the following steps to call the Amazon Bedrock Guardrails standalone API. + +### Step 1: Create a connector for your Amazon Bedrock guardrail endpoint + +First, create a connector that will interface with your Amazon Bedrock guardrail endpoint. This connector will handle authentication and communication with the guardrail service: + +```json +POST _plugins/_ml/connectors/_create +{ + "name": "BedRock Guardrail Connector", + "description": "BedRock Guardrail Connector", + "version": 1, + "protocol": "aws_sigv4", + "parameters": { + "region": "your_aws_region like us-east-1", + "service_name": "bedrock", + "source": "INPUT" + }, + "credential": { + "access_key": "your_aws_access_key", + "secret_key": "your_aws_secret_key", + "session_token": "your_aws_session_token" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "https://bedrock-runtime.${parameters.region}.amazonaws.com/guardrail/your_guardrailIdentifier/version/1/apply", + "headers": { + "content-type": "application/json" + }, + "request_body": "{\"source\":\"${parameters.source}\", \"content\":[ { \"text\":{\"text\": \"${parameters.question}\"} } ] }" + } + ] +} +``` +{% include copy-curl.html %} + +### Step 2: Register the guardrail model + +Now that you've created a connector, register it as a remote guardrail model that will be used to validate inputs: + +```json +POST _plugins/_ml/models/_register +{ + "name": "bedrock test guardrail API", + "function_name": "remote", + "description": "guardrail test model", + "connector_id": "your_guardrail_connector_id" +} +``` +{% include copy-curl.html %} + +### Step 3: Test the guardrail model + +Verify that the guardrail is properly filtering inappropriate content: + +```json +POST _plugins/_ml/models/your_model_id/_predict +{ + "parameters": { + "question": "\n\nHuman:How to rob a bank\n\nAssistant:" + } +} +``` +{% include copy-curl.html %} + +The response shows that the guardrail blocks the request when it detects inappropriate content: + +```json +{ + 
"inference_results": [ + { + "output": [ + { + "name": "response", + "dataAsMap": { + "action": "GUARDRAIL_INTERVENED", + "assessments": [ + { + "contentPolicy": { + "filters": [ + { + "action": "BLOCKED", + "confidence": "HIGH", + "type": "VIOLENCE" + }, + { + "action": "BLOCKED", + "confidence": "HIGH", + "type": "PROMPT_ATTACK" + } + ] + }, + "wordPolicy": { + "customWords": [ + { + "action": "BLOCKED", + "match": "rob" + } + ] + } + } + ], + "blockedResponse": "Sorry, the model cannot answer this question.", + "output": [ + { + "text": "Sorry, the model cannot answer this question." + } + ], + "outputs": [ + { + "text": "Sorry, the model cannot answer this question." + } + ], + "usage": { + "contentPolicyUnits": 1.0, + "contextualGroundingPolicyUnits": 0.0, + "sensitiveInformationPolicyFreeUnits": 0.0, + "sensitiveInformationPolicyUnits": 0.0, + "topicPolicyUnits": 1.0, + "wordPolicyUnits": 1.0 + } + } + } + ], + "status_code": 200 + } + ] +} +``` + +### Step 4: Create a Claude model connector + +To use the guardrails with an Amazon Bedrock Claude model, first create a connector for the Claude endpoint: + +```json +POST _plugins/_ml/connectors/_create +{ + "name": "BedRock claude Connector", + "description": "BedRock claude Connector", + "version": 1, + "protocol": "aws_sigv4", + "parameters": { + "region": "your_aws_region like us-east-1", + "service_name": "bedrock", + "anthropic_version": "bedrock-2023-05-31", + "max_tokens_to_sample": 8000, + "temperature": 0.0001, + "response_filter": "$.completion" + }, + "credential": { + "access_key": "your_aws_access_key", + "secret_key": "your_aws_secret_key", + "session_token": "your_aws_session_token" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "https://bedrock-runtime.us-east-1.amazonaws.com/model/anthropic.claude-v2/invoke", + "headers": { + "content-type": "application/json", + "x-amz-content-sha256": "required" + }, + "request_body": "{\"prompt\":\"${parameters.prompt}\", \"max_tokens_to_sample\":${parameters.max_tokens_to_sample}, \"temperature\":${parameters.temperature}, \"anthropic_version\":\"${parameters.anthropic_version}\" }" + } + ] +} +``` +{% include copy-curl.html %} + +### Step 5: Register the Claude model + +Register the Claude model with input guardrails enabled. This configuration ensures that all requests sent to the model are first validated by the guardrails: + +```json +POST /_plugins/_ml/models/_register?deploy=true +{ + "name": "Bedrock Claude V2 model", + "function_name": "remote", + "description": "Bedrock Claude V2 model", + "connector_id": "your_connector_id", + "guardrails": { + "input_guardrail": { + "model_id": "your_guardrail_model_id", + "response_filter":"$.action", + "response_validation_regex": "^\"NONE\"$" + }, + "type": "model" + } +} +``` +{% include copy-curl.html %} + +### Step 6: Test the model + +First, test the model with acceptable input: + +```json +POST /_plugins/_ml/models/your_model_id/_predict +{ + "parameters": { + "prompt": "\n\nHuman:${parameters.question}\n\nnAssistant:", + "question": "hello" + } +} +``` +{% include copy-curl.html %} + +The response shows that the call was successful: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "dataAsMap": { + "response": " Hello!" 
+ } + } + ], + "status_code": 200 + } + ] +} +``` + +Next, test the model with inappropriate input: + +```json +POST /_plugins/_ml/models/your_model_id/_predict +{ + "parameters": { + "prompt": "\n\nHuman:${parameters.question}\n\nnAssistant:", + "question": "how to rob a bank" + } +} +``` +{% include copy-curl.html %} + +The response shows that the inappropriate input was blocked: + +```json +{ + "error": { + "root_cause": [ + { + "type": "illegal_argument_exception", + "reason": "guardrails triggered for user input" + } + ], + "type": "illegal_argument_exception", + "reason": "guardrails triggered for user input" + }, + "status": 400 +} +``` + +## Using guardrails embedded in the Amazon Bedrock Model Inference API + +Use the following steps to use the guardrails embedded in the Model Inference API. + +### Step 1: Create a connector for an Amazon Bedrock model containing guardrail headers + +Create a connector that includes guardrail headers in its configuration. In this approach, the guardrail checks are embedded directly in the model inference process. The `post_process_function` is required in order to define the logic used by the model to block inappropriate input: + +```json +POST /_plugins/_ml/connectors/_create +{ + "name": "BedRock claude Connector", + "description": "BedRock claude Connector", + "version": 1, + "protocol": "aws_sigv4", + "parameters": { + "region": "your_aws_region like us-east-1", + "service_name": "bedrock", + "max_tokens_to_sample": 8000, + "temperature": 0.0001 + }, + "credential": { + "access_key": "your_aws_access_key", + "secret_key": "your_aws_secret_key", + "session_token": "your_aws_session_token" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "https://bedrock-runtime.us-east-1.amazonaws.com/model/anthropic.claude-v2/invoke", + "headers": { + "content-type": "application/json", + "x-amz-content-sha256": "required", + "X-Amzn-Bedrock-Trace": "ENABLED", + "X-Amzn-Bedrock-GuardrailIdentifier": "your_GuardrailIdentifier", + "X-Amzn-Bedrock-GuardrailVersion": "your_bedrock_guardrail_version" + }, + "request_body": "{\"prompt\":\"${parameters.prompt}\", \"max_tokens_to_sample\":${parameters.max_tokens_to_sample}, \"temperature\":${parameters.temperature}, \"anthropic_version\":\"${parameters.anthropic_version}\" }", + "post_process_function": "\n if (params['amazon-bedrock-guardrailAction']=='INTERVENED') throw new IllegalArgumentException(\"test guardrail from post process function\");\n " + } + ] +} +``` +{% include copy-curl.html %} + +### Step 2: Register the model + +Register the model using the connector with embedded guardrails: + +```json +POST _plugins/_ml/models/_register +{ + "name": "bedrock model with guardrails", + "function_name": "remote", + "description": "guardrails test model", + "connector_id": "your_connector_id" +} +``` +{% include copy-curl.html %} + +### Step 3: Test the model + +Verify that the embedded guardrails are functioning by testing them with potentially inappropriate input: + +```json +POST _plugins/_ml/models/your_model_id/_predict +{ + "parameters": { + "input": "\n\nHuman:how to rob a bank\n\nAssistant:" + } +} +``` +{% include copy-curl.html %} + +The response shows that the inappropriate input was blocked: + +```json +{ + "error": { + "root_cause": [ + { + "type": "m_l_exception", + "reason": "Fail to execute predict in aws connector" + } + ], + "type": "m_l_exception", + "reason": "Fail to execute predict in aws connector", + "caused_by": { + "type": "script_exception", + "reason": 
"runtime error", + "script_stack": [ + "throw new IllegalArgumentException(\"test guardrail from post process function\");\n ", + " ^---- HERE" + ], + "script": " ...", + "lang": "painless", + "position": { + "offset": 73, + "start": 67, + "end": 152 + }, + "caused_by": { + "type": "illegal_argument_exception", + "reason": "test guardrail from post process function" + } + } + }, + "status": 500 +} +``` \ No newline at end of file diff --git a/_tutorials/gen-ai/model-controls/index.md b/_tutorials/gen-ai/model-controls/index.md new file mode 100644 index 00000000000..552f21b911a --- /dev/null +++ b/_tutorials/gen-ai/model-controls/index.md @@ -0,0 +1,24 @@ +--- +layout: default +title: Model guardrails +parent: Generative AI +has_children: true +has_toc: false +nav_order: 50 +redirect_from: + - /vector-search/tutorials/model-controls/ + - /tutorials/gen-ai/model-controls/ +model_controls: + - heading: Amazon Bedrock guardrails + link: /tutorials/gen-ai/model-controls/bedrock-guardrails/ + list: + - "<b>Platform:</b> OpenSearch" + - "<b>Model:</b> Anthropic Claude" + - "<b>Deployment:</b> Amazon Bedrock" +--- + +# Model guardrails tutorials + +The following tutorials show you how to implement model guardrails. + +{% include cards.html cards=page.model_controls %} \ No newline at end of file diff --git a/_tutorials/gen-ai/rag/conversational-search-claude-bedrock.md b/_tutorials/gen-ai/rag/conversational-search-claude-bedrock.md new file mode 100644 index 00000000000..94de5bce2eb --- /dev/null +++ b/_tutorials/gen-ai/rag/conversational-search-claude-bedrock.md @@ -0,0 +1,561 @@ +--- +layout: default +title: Conversational search using Anthropic Claude on Amazon Bedrock +parent: RAG +grand_parent: Generative AI +nav_order: 160 +redirect_from: + - /vector-search/tutorials/conversational-search/conversational-search-claude-bedrock/ + - /tutorials/vector-search/rag/conversational-search/conversational-search-claude-bedrock/ +--- + +# Conversational search using Anthropic Claude on Amazon Bedrock + +This tutorial shows you how to configure conversational search with retrieval-augmented generation (RAG) using Anthropic Claude models hosted on Amazon Bedrock. For more information, see [Conversational search]({{site.url}}{{site.baseurl}}/search-plugins/conversational-search/). + +Replace the placeholders beginning with the prefix `your_` with your own values. +{: .note} + +Alternatively, you can build a RAG/conversational search using agents and tools. For more information, see [Retrieval-augmented generation chatbot]({{site.url}}{{site.baseurl}}/ml-commons-plugin/tutorials/rag-conversational-agent/). + +## Prerequisite + +Ingest test data: + +```json +POST _bulk +{"index": {"_index": "qa_demo", "_id": "1"}} +{"text": "Chart and table of population level and growth rate for the Ogden-Layton metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\nThe current metro area population of Ogden-Layton in 2023 is 750,000, a 1.63% increase from 2022.\nThe metro area population of Ogden-Layton in 2022 was 738,000, a 1.79% increase from 2021.\nThe metro area population of Ogden-Layton in 2021 was 725,000, a 1.97% increase from 2020.\nThe metro area population of Ogden-Layton in 2020 was 711,000, a 2.16% increase from 2019."} +{"index": {"_index": "qa_demo", "_id": "2"}} +{"text": "Chart and table of population level and growth rate for the New York City metro area from 1950 to 2023. 
United Nations population projections are also included through the year 2035.\\nThe current metro area population of New York City in 2023 is 18,937,000, a 0.37% increase from 2022.\\nThe metro area population of New York City in 2022 was 18,867,000, a 0.23% increase from 2021.\\nThe metro area population of New York City in 2021 was 18,823,000, a 0.1% increase from 2020.\\nThe metro area population of New York City in 2020 was 18,804,000, a 0.01% decline from 2019."} +{"index": {"_index": "qa_demo", "_id": "3"}} +{"text": "Chart and table of population level and growth rate for the Chicago metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of Chicago in 2023 is 8,937,000, a 0.4% increase from 2022.\\nThe metro area population of Chicago in 2022 was 8,901,000, a 0.27% increase from 2021.\\nThe metro area population of Chicago in 2021 was 8,877,000, a 0.14% increase from 2020.\\nThe metro area population of Chicago in 2020 was 8,865,000, a 0.03% increase from 2019."} +{"index": {"_index": "qa_demo", "_id": "4"}} +{"text": "Chart and table of population level and growth rate for the Miami metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of Miami in 2023 is 6,265,000, a 0.8% increase from 2022.\\nThe metro area population of Miami in 2022 was 6,215,000, a 0.78% increase from 2021.\\nThe metro area population of Miami in 2021 was 6,167,000, a 0.74% increase from 2020.\\nThe metro area population of Miami in 2020 was 6,122,000, a 0.71% increase from 2019."} +{"index": {"_index": "qa_demo", "_id": "5"}} +{"text": "Chart and table of population level and growth rate for the Austin metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of Austin in 2023 is 2,228,000, a 2.39% increase from 2022.\\nThe metro area population of Austin in 2022 was 2,176,000, a 2.79% increase from 2021.\\nThe metro area population of Austin in 2021 was 2,117,000, a 3.12% increase from 2020.\\nThe metro area population of Austin in 2020 was 2,053,000, a 3.43% increase from 2019."} +{"index": {"_index": "qa_demo", "_id": "6"}} +{"text": "Chart and table of population level and growth rate for the Seattle metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of Seattle in 2023 is 3,519,000, a 0.86% increase from 2022.\\nThe metro area population of Seattle in 2022 was 3,489,000, a 0.81% increase from 2021.\\nThe metro area population of Seattle in 2021 was 3,461,000, a 0.82% increase from 2020.\\nThe metro area population of Seattle in 2020 was 3,433,000, a 0.79% increase from 2019."} +``` +{% include copy-curl.html %} + +You can configure conversational search using the following Amazon Bedrock APIs: + +1. [Converse API](#option-1-amazon-bedrock-converse-api) +2. [Invoke API](#option-2-amazon-bedrock-invoke-api) + +<!-- vale off --> +## Option 1: Amazon Bedrock Converse API +<!-- vale on --> + +Follow these steps to use the Amazon Bedrock Converse API for conversational search. + +### Step 1.1: Create a connector and register the model + +First, create a connector for the Claude model. 
In this example, you'll use Anthropic Claude 3.5 Sonnet: + +```json +POST _plugins/_ml/connectors/_create +{ + "name": "Amazon Bedrock claude v3", + "description": "Test connector for Amazon Bedrock claude v3", + "version": 1, + "protocol": "aws_sigv4", + "credential": { + "access_key": "your_access_key", + "secret_key": "your_secret_key", + "session_token": "your_session_token" + }, + "parameters": { + "region": "your_aws_region", + "service_name": "bedrock", + "model": "anthropic.claude-3-5-sonnet-20240620-v1:0", + "system_prompt": "you are a helpful assistant.", + "temperature": 0.0, + "top_p": 0.9, + "max_tokens": 1000 + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "headers": { + "content-type": "application/json" + }, + "url": "https://bedrock-runtime.${parameters.region}.amazonaws.com/model/${parameters.model}/converse", + "request_body": "{ \"system\": [{\"text\": \"${parameters.system_prompt}\"}], \"messages\": ${parameters.messages} , \"inferenceConfig\": {\"temperature\": ${parameters.temperature}, \"topP\": ${parameters.top_p}, \"maxTokens\": ${parameters.max_tokens}} }" + } + ] +} +``` +{% include copy-curl.html %} + +To use Claude 2, specify `anthropic.claude-v2` instead of `anthropic.claude-3-5-sonnet-20240620-v1:0` as the `model`. + +Note the connector ID; you'll use it to register the model. + +Next, register the model: + +```json +POST /_plugins/_ml/models/_register?deploy=true +{ + "name": "Bedrock Claude3.5 model", + "description": "Bedrock Claude3.5 model", + "function_name": "remote", + "connector_id": "your_connector_id" +} +``` +{% include copy-curl.html %} + +Note the model ID; you'll use it in the following steps. + +Test the model: + +```json +POST /_plugins/_ml/models/your_model_id/_predict +{ + "parameters": { + "messages": [ + { + "role": "user", + "content": [ + { + "text": "hello" + } + ] + } + ] + } +} +``` +{% include copy-curl.html %} + + +The response contains the text generated by the model: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "dataAsMap": { + "metrics": { + "latencyMs": 955.0 + }, + "output": { + "message": { + "content": [ + { + "text": "Hello! How can I assist you today? Feel free to ask me any questions or let me know if you need help with anything." + } + ], + "role": "assistant" + } + }, + "stopReason": "end_turn", + "usage": { + "inputTokens": 14.0, + "outputTokens": 30.0, + "totalTokens": 44.0 + } + } + } + ], + "status_code": 200 + } + ] +} +``` + +### Step 1.2: Configure RAG + +To configure RAG, create a search pipeline containing a RAG processor: + +```json +PUT /_search/pipeline/my-conversation-search-pipeline-claude +{ + "response_processors": [ + { + "retrieval_augmented_generation": { + "tag": "Demo pipeline", + "description": "Demo pipeline Using Bedrock Claude", + "model_id": "your_model_id", + "context_field_list": [ + "text" + ], + "system_prompt": "You are a helpful assistant", + "user_instructions": "Generate a concise and informative answer in less than 100 words for the given question" + } + } + ] +} +``` +{% include copy-curl.html %} + +Run a basic RAG search without storing conversation history: + +```json +GET /qa_demo/_search?search_pipeline=my-conversation-search-pipeline-claude +{ + "query": { + "match": { + "text": "What's the population increase of New York City from 2021 to 2023?" 
+ } + }, + "size": 1, + "_source": [ + "text" + ], + "ext": { + "generative_qa_parameters": { + "llm_model": "bedrock-converse/anthropic.claude-3-sonnet-20240229-v1:0", + "llm_question": "What's the population increase of New York City from 2021 to 2023?", + "context_size": 5 + } + } +} +``` +{% include copy-curl.html %} + +The response contains the model answer and related document: + +```json +{ + "took": 1, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 6, + "relation": "eq" + }, + "max_score": 9.042081, + "hits": [ + { + "_index": "qa_demo", + "_id": "2", + "_score": 9.042081, + "_source": { + "text": "Chart and table of population level and growth rate for the New York City metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of New York City in 2023 is 18,937,000, a 0.37% increase from 2022.\\nThe metro area population of New York City in 2022 was 18,867,000, a 0.23% increase from 2021.\\nThe metro area population of New York City in 2021 was 18,823,000, a 0.1% increase from 2020.\\nThe metro area population of New York City in 2020 was 18,804,000, a 0.01% decline from 2019." + } + } + ] + }, + "ext": { + "retrieval_augmented_generation": { + "answer": "The population of the New York City metro area increased by 114,000 people from 2021 to 2023. In 2021, the population was 18,823,000. By 2023, it had grown to 18,937,000. This represents a total increase of about 0.61% over the two-year period, with growth rates of 0.23% from 2021 to 2022 and 0.37% from 2022 to 2023." + } + } +} +``` + +### Step 1.3: Configure conversational search + +Follow these steps to configure conversational search by storing conversation history in a memory. + +1. Create a memory: + + ```json + POST /_plugins/_ml/memory/ + { + "name": "Conversation about NYC population" + } + ``` + {% include copy-curl.html %} + + The response contains the memory ID: + + ```json + { + "memory_id": "sBAqY5UBSzdNxlHvrSJK" + } + ``` + +2. To save the conversation history, include a memory ID in your search request: + + ```json + GET /qa_demo/_search?search_pipeline=my-conversation-search-pipeline-claude + { + "query": { + "match": { + "text": "What's the population increase of New York City from 2021 to 2023?" + } + }, + "size": 1, + "_source": [ + "text" + ], + "ext": { + "generative_qa_parameters": { + "llm_model": "bedrock-converse/anthropic.claude-3-sonnet-20240229-v1:0", + "llm_question": "What's the population increase of New York City from 2021 to 2023?", + "context_size": 5, + "memory_id": "sBAqY5UBSzdNxlHvrSJK" + } + } + } + ``` + {% include copy-curl.html %} + + The response contains the model answer and related document: + + ```json + { + "took": 1, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 6, + "relation": "eq" + }, + "max_score": 9.042081, + "hits": [ + { + "_index": "qa_demo", + "_id": "2", + "_score": 9.042081, + "_source": { + "text": "Chart and table of population level and growth rate for the New York City metro area from 1950 to 2023. 
United Nations population projections are also included through the year 2035.\\nThe current metro area population of New York City in 2023 is 18,937,000, a 0.37% increase from 2022.\\nThe metro area population of New York City in 2022 was 18,867,000, a 0.23% increase from 2021.\\nThe metro area population of New York City in 2021 was 18,823,000, a 0.1% increase from 2020.\\nThe metro area population of New York City in 2020 was 18,804,000, a 0.01% decline from 2019." + } + } + ] + }, + "ext": { + "retrieval_augmented_generation": { + "answer": "The population of the New York City metro area increased by 114,000 people from 2021 to 2023. In 2021, the population was 18,823,000. By 2023, it had grown to 18,937,000. This represents a total increase of about 0.61% over the two-year period, with growth rates of 0.23% from 2021 to 2022 and 0.37% from 2022 to 2023.", + "message_id": "sRAqY5UBSzdNxlHvzCIL" + } + } + } + ``` + +3. To continue the conversation, provide the same memory ID in the next search: + + ```json + GET /qa_demo/_search?search_pipeline=my-conversation-search-pipeline-claude + { + "query": { + "match": { + "text": "What's the population increase of Chicago from 2021 to 2023?" + } + }, + "size": 1, + "_source": [ + "text" + ], + "ext": { + "generative_qa_parameters": { + "llm_model": "bedrock-converse/anthropic.claude-3-sonnet-20240229-v1:0", + "llm_question": "can you compare the population increase of Chicago with New York City", + "context_size": 5, + "memory_id": "sBAqY5UBSzdNxlHvrSJK" + } + } + } + ``` + {% include copy-curl.html %} + + Using the conversation history from memory, the model compares Chicago's population data with the previously discussed New York City statistics: + + ```json + { + "took": 1, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 6, + "relation": "eq" + }, + "max_score": 3.6660428, + "hits": [ + { + "_index": "qa_demo", + "_id": "3", + "_score": 3.6660428, + "_source": { + "text": "Chart and table of population level and growth rate for the Chicago metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of Chicago in 2023 is 8,937,000, a 0.4% increase from 2022.\\nThe metro area population of Chicago in 2022 was 8,901,000, a 0.27% increase from 2021.\\nThe metro area population of Chicago in 2021 was 8,877,000, a 0.14% increase from 2020.\\nThe metro area population of Chicago in 2020 was 8,865,000, a 0.03% increase from 2019." + } + } + ] + }, + "ext": { + "retrieval_augmented_generation": { + "answer": "Based on the provided data for Chicago, we can compare its population increase to New York City from 2021 to 2023:\n\nChicago's population increased from 8,877,000 in 2021 to 8,937,000 in 2023, a total increase of 60,000 people or about 0.68%.\n\nNew York City's population increased by 114,000 people or 0.61% in the same period.\n\nWhile New York City had a larger absolute increase, Chicago experienced a slightly higher percentage growth rate during this two-year period.", + "message_id": "shArY5UBSzdNxlHvQyL-" + } + } + } + ``` + +<!-- vale off --> +## Option 2: Amazon Bedrock Invoke API +<!-- vale on --> + +Follow these steps to use the Amazon Bedrock Invoke API for conversational search. + +Anthropic Claude 3.x models are not supported by the Amazon Bedrock Invoke API because they require a different interface. 
+{: .important} + +### Step 2.1: Create a connector and register the model + +First, create a connector for the Claude model. In this example, you'll use Anthropic Claude v2: + +```json +POST _plugins/_ml/connectors/_create +{ + "name": "Bedrock Claude2", + "description": "Connector for Bedrock Claude2", + "version": 1, + "protocol": "aws_sigv4", + "credential": { + "access_key": "your_access_key", + "secret_key": "your_secret_key", + "session_token": "your_session_token" + }, + "parameters": { + "region": "your_aws_region", + "service_name": "bedrock", + "model": "anthropic.claude-v2" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "headers": { + "content-type": "application/json" + }, + "url": "https://bedrock-runtime.${parameters.region}.amazonaws.com/model/${parameters.model}/invoke", + "request_body": "{\"prompt\":\"\\n\\nHuman: ${parameters.inputs}\\n\\nAssistant:\",\"max_tokens_to_sample\":300,\"temperature\":0.5,\"top_k\":250,\"top_p\":1,\"stop_sequences\":[\"\\\\n\\\\nHuman:\"]}" + } + ] +} +``` +{% include copy-curl.html %} + +Note the connector ID; you'll use it to register the model. + +Next, register the model: + +```json +POST /_plugins/_ml/models/_register?deploy=true +{ + "name": "Bedrock Claude2 model", + "function_name": "remote", + "description": "Bedrock Claude2 model", + "connector_id": "your_connector_id" +} +``` +{% include copy-curl.html %} + +Note the model ID; you'll use it in the following steps. + +Test the model: + +```json +POST /_plugins/_ml/models/your_model_id/_predict +{ + "parameters": { + "inputs": "Who won the world series in 2020?" + } +} +``` +{% include copy-curl.html %} + +The response contains the text generated by the model: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "dataAsMap": { + "type": "completion", + "completion": " The Los Angeles Dodgers won the 2020 World Series, defeating the Tampa Bay Rays 4 games to 2. The World Series was played at a neutral site in Arlington, Texas due to the COVID-19 pandemic. It was the Dodgers' first World Series championship since 1988.", + "stop_reason": "stop_sequence", + "stop": "\n\nHuman:" + } + } + ], + "status_code": 200 + } + ] +} +``` + +### Step 2.2: Configure RAG + +To configure RAG, create a search pipeline containing a RAG processor: + +```json +PUT /_search/pipeline/my-conversation-search-pipeline-claude2 +{ + "response_processors": [ + { + "retrieval_augmented_generation": { + "tag": "Demo pipeline", + "description": "Demo pipeline Using Bedrock Claude2", + "model_id": "your_model_id", + "context_field_list": [ + "text" + ], + "system_prompt": "You are a helpful assistant", + "user_instructions": "Generate a concise and informative answer in less than 100 words for the given question" + } + } + ] +} +``` +{% include copy-curl.html %} + +Run a basic RAG search without storing conversation history: + +```json +GET /qa_demo/_search?search_pipeline=my-conversation-search-pipeline-claude2 +{ + "query": { + "match": { + "text": "What's the population increase of New York City from 2021 to 2023?" + } + }, + "size": 1, + "_source": [ + "text" + ], + "ext": { + "generative_qa_parameters": { + "llm_model": "bedrock/claude", + "llm_question": "What's the population increase of New York City from 2021 to 2023?", + "context_size": 5, + "timeout": 15 + } + } +} +``` +{% include copy-curl.html %} + +The response is similar to the one in [Step 1.2](#step-12-configure-rag). 
+ +### Step 2.3: Configure conversational search + +Continue to [Step 1.3](#step-13-configure-conversational-search) to configure conversational search. \ No newline at end of file diff --git a/_ml-commons-plugin/tutorials/conversational-search-cohere.md b/_tutorials/gen-ai/rag/conversational-search-cohere.md similarity index 94% rename from _ml-commons-plugin/tutorials/conversational-search-cohere.md rename to _tutorials/gen-ai/rag/conversational-search-cohere.md index e02f576b7c7..3efd67bd774 100644 --- a/_ml-commons-plugin/tutorials/conversational-search-cohere.md +++ b/_tutorials/gen-ai/rag/conversational-search-cohere.md @@ -1,13 +1,18 @@ --- layout: default -title: Conversational search with Cohere Command -parent: Tutorials -nav_order: 20 +title: Conversational search using Cohere Command +parent: RAG +grand_parent: Generative AI +nav_order: 150 +redirect_from: + - /ml-commons-plugin/tutorials/conversational-search-cohere/ + - /vector-search/tutorials/conversational-search/conversational-search-cohere/ + - /tutorials/vector-search/rag/conversational-search/conversational-search-cohere/ --- -# Conversational search using the Cohere Command model +# Conversational search using Cohere Command -This tutorial illustrates how to configure conversational search using the Cohere Command model. For more information, see [Conversational search]({{site.url}}{{site.baseurl}}/search-plugins/conversational-search/). +This tutorial shows you how to configure conversational search with RAG using the Cohere Command model. For more information, see [Conversational search]({{site.url}}{{site.baseurl}}/search-plugins/conversational-search/). Replace the placeholders beginning with the prefix `your_` with your own values. {: .note} diff --git a/_tutorials/gen-ai/rag/conversational-search-openai.md b/_tutorials/gen-ai/rag/conversational-search-openai.md new file mode 100644 index 00000000000..795e2e39f44 --- /dev/null +++ b/_tutorials/gen-ai/rag/conversational-search-openai.md @@ -0,0 +1,398 @@ +--- +layout: default +title: Conversational search using OpenAI +parent: RAG +grand_parent: Generative AI +nav_order: 170 +redirect_from: + - /vector-search/tutorials/conversational-search/conversational-search-openai/ + - /tutorials/vector-search/rag/conversational-search/conversational-search-openai/ +--- + +# Conversational search using OpenAI + +This tutorial shows you how to configure conversational search with retrieval-augmented generation (RAG) using the OpenAI `gpt-4o` model. For more information, see [Conversational search]({{site.url}}{{site.baseurl}}/search-plugins/conversational-search/). + +Replace the placeholders beginning with the prefix `your_` with your own values. +{: .note} + +Alternatively, you can build a RAG/conversational search using agents and tools. For more information, see [Retrieval-augmented generation chatbot]({{site.url}}{{site.baseurl}}/ml-commons-plugin/tutorials/rag-conversational-agent/). + +## Prerequisite + +Ingest test data: + +```json +POST _bulk +{"index": {"_index": "qa_demo", "_id": "1"}} +{"text": "Chart and table of population level and growth rate for the Ogden-Layton metro area from 1950 to 2023. 
United Nations population projections are also included through the year 2035.\nThe current metro area population of Ogden-Layton in 2023 is 750,000, a 1.63% increase from 2022.\nThe metro area population of Ogden-Layton in 2022 was 738,000, a 1.79% increase from 2021.\nThe metro area population of Ogden-Layton in 2021 was 725,000, a 1.97% increase from 2020.\nThe metro area population of Ogden-Layton in 2020 was 711,000, a 2.16% increase from 2019."} +{"index": {"_index": "qa_demo", "_id": "2"}} +{"text": "Chart and table of population level and growth rate for the New York City metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of New York City in 2023 is 18,937,000, a 0.37% increase from 2022.\\nThe metro area population of New York City in 2022 was 18,867,000, a 0.23% increase from 2021.\\nThe metro area population of New York City in 2021 was 18,823,000, a 0.1% increase from 2020.\\nThe metro area population of New York City in 2020 was 18,804,000, a 0.01% decline from 2019."} +{"index": {"_index": "qa_demo", "_id": "3"}} +{"text": "Chart and table of population level and growth rate for the Chicago metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of Chicago in 2023 is 8,937,000, a 0.4% increase from 2022.\\nThe metro area population of Chicago in 2022 was 8,901,000, a 0.27% increase from 2021.\\nThe metro area population of Chicago in 2021 was 8,877,000, a 0.14% increase from 2020.\\nThe metro area population of Chicago in 2020 was 8,865,000, a 0.03% increase from 2019."} +{"index": {"_index": "qa_demo", "_id": "4"}} +{"text": "Chart and table of population level and growth rate for the Miami metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of Miami in 2023 is 6,265,000, a 0.8% increase from 2022.\\nThe metro area population of Miami in 2022 was 6,215,000, a 0.78% increase from 2021.\\nThe metro area population of Miami in 2021 was 6,167,000, a 0.74% increase from 2020.\\nThe metro area population of Miami in 2020 was 6,122,000, a 0.71% increase from 2019."} +{"index": {"_index": "qa_demo", "_id": "5"}} +{"text": "Chart and table of population level and growth rate for the Austin metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of Austin in 2023 is 2,228,000, a 2.39% increase from 2022.\\nThe metro area population of Austin in 2022 was 2,176,000, a 2.79% increase from 2021.\\nThe metro area population of Austin in 2021 was 2,117,000, a 3.12% increase from 2020.\\nThe metro area population of Austin in 2020 was 2,053,000, a 3.43% increase from 2019."} +{"index": {"_index": "qa_demo", "_id": "6"}} +{"text": "Chart and table of population level and growth rate for the Seattle metro area from 1950 to 2023. 
United Nations population projections are also included through the year 2035.\\nThe current metro area population of Seattle in 2023 is 3,519,000, a 0.86% increase from 2022.\\nThe metro area population of Seattle in 2022 was 3,489,000, a 0.81% increase from 2021.\\nThe metro area population of Seattle in 2021 was 3,461,000, a 0.82% increase from 2020.\\nThe metro area population of Seattle in 2020 was 3,433,000, a 0.79% increase from 2019."} +``` +{% include copy-curl.html %} + +## Step 1: Create a connector and register the model + +First, create a connector for the OpenAI `gpt-4o` model: + +```json +POST _plugins/_ml/connectors/_create +{ + "name": "OpenAI GPT-4o", + "description": "Connector of OpenAI GPT-4o", + "version": "1.0", + "protocol": "http", + "parameters": { + "endpoint": "api.openai.com", + "model": "gpt-4o" + }, + "credential": { + "openAI_key": "your_openai_key" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "https://${parameters.endpoint}/v1/chat/completions", + "headers": { + "Authorization": "Bearer ${credential.openAI_key}" + }, + "request_body": "{ \"model\": \"${parameters.model}\", \"messages\": ${parameters.messages} }" + } + ] +} +``` +{% include copy-curl.html %} + +For more information, see [this blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/open_ai_connector_chat_blueprint.md). + +Note the connector ID; you'll use it to register the model. + +Next, register the model: + +```json +POST /_plugins/_ml/models/_register?deploy=true +{ + "name": "OpenAI GPT-4o model", + "function_name": "remote", + "description": "OpenAI GPT-4o model", + "connector_id": "your_connector_id" +} +``` +{% include copy-curl.html %} + +Note the model ID; you'll use it in the following steps. + +Test the model: + +```json +POST /_plugins/_ml/models/your_model_id/_predict +{ + "parameters": { + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Who won the world series in 2020?" + } + ] + } +} +``` +{% include copy-curl.html %} + +The response contains the text generated by the model: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "dataAsMap": { + "id": "chatcmpl-A9Rtgkyk4PVlLil2u4JRUH2oXb25v", + "object": "chat.completion", + "created": 1.726815552E9, + "model": "gpt-4o-2024-05-13", + "choices": [ + { + "index": 0.0, + "message": { + "role": "assistant", + "content": "The Los Angeles Dodgers won the World Series in 2020. 
They defeated the Tampa Bay Rays in six games to secure their first championship since 1988.", + "refusal": null + }, + "logprobs": null, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 27.0, + "completion_tokens": 32.0, + "total_tokens": 59.0, + "completion_tokens_details": { + "reasoning_tokens": 0.0 + } + }, + "system_fingerprint": "fp_52a7f40b0b" + } + } + ], + "status_code": 200 + } + ] +} +``` + +## Step 2: Configure RAG + +To configure RAG, create a search pipeline containing a RAG processor: + +```json +PUT /_search/pipeline/my-conversation-search-pipeline-openai +{ + "response_processors": [ + { + "retrieval_augmented_generation": { + "tag": "Demo pipeline", + "description": "Demo pipeline Using Cohere", + "model_id": "your_model_id_created_in_step1", + "context_field_list": [ + "text" + ], + "system_prompt": "You are a helpful assistant", + "user_instructions": "Generate a concise and informative answer in less than 100 words for the given question" + } + } + ] +} +``` +{% include copy-curl.html %} + +Run a basic RAG search without storing conversation history: + +```json +GET /qa_demo/_search?search_pipeline=my-conversation-search-pipeline-openai +{ + "query": { + "match": { + "text": "What's the population increase of New York City from 2021 to 2023?" + } + }, + "size": 1, + "_source": [ + "text" + ], + "ext": { + "generative_qa_parameters": { + "llm_model": "gpt-4o", + "llm_question": "What's the population increase of New York City from 2021 to 2023?", + "context_size": 5, + "timeout": 15 + } + } +} +``` +{% include copy-curl.html %} + +The response contains the model answer and related document: + +```json +{ + "took": 1, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 6, + "relation": "eq" + }, + "max_score": 9.042081, + "hits": [ + { + "_index": "qa_demo", + "_id": "2", + "_score": 9.042081, + "_source": { + "text": "Chart and table of population level and growth rate for the New York City metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of New York City in 2023 is 18,937,000, a 0.37% increase from 2022.\\nThe metro area population of New York City in 2022 was 18,867,000, a 0.23% increase from 2021.\\nThe metro area population of New York City in 2021 was 18,823,000, a 0.1% increase from 2020.\\nThe metro area population of New York City in 2020 was 18,804,000, a 0.01% decline from 2019." + } + } + ] + }, + "ext": { + "retrieval_augmented_generation": { + "answer": "The population of the New York City metro area increased by 114,000 from 2021 to 2023, rising from 18,823,000 in 2021 to 18,937,000 in 2023." + } + } +} +``` +{% include copy-curl.html %} + +## Step 3: Configure conversational search + +Follow these steps to configure conversational search by storing conversation history in a memory. + +1. Create a memory: + + ```json + POST /_plugins/_ml/memory/ + { + "name": "Conversation about NYC population" + } + ``` + {% include copy-curl.html %} + + The response contains the memory ID: + + ```json + { + "memory_id": "rBAbY5UBSzdNxlHvIyI3" + } + ``` + +2. To save the conversation history, include a memory ID in your search request: + + ```json + GET /qa_demo/_search?search_pipeline=my-conversation-search-pipeline-openai + { + "query": { + "match": { + "text": "What's the population increase of New York City from 2021 to 2023?" 
+ } + }, + "size": 1, + "_source": [ + "text" + ], + "ext": { + "generative_qa_parameters": { + "llm_model": "gpt-4o", + "llm_question": "What's the population increase of New York City from 2021 to 2023?", + "context_size": 5, + "timeout": 15, + "memory_id": "rBAbY5UBSzdNxlHvIyI3" + } + } + } + ``` + {% include copy-curl.html %} + + The response contains the model answer and related document: + + ```json + { + "took": 1, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 6, + "relation": "eq" + }, + "max_score": 9.042081, + "hits": [ + { + "_index": "qa_demo", + "_id": "2", + "_score": 9.042081, + "_source": { + "text": "Chart and table of population level and growth rate for the New York City metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of New York City in 2023 is 18,937,000, a 0.37% increase from 2022.\\nThe metro area population of New York City in 2022 was 18,867,000, a 0.23% increase from 2021.\\nThe metro area population of New York City in 2021 was 18,823,000, a 0.1% increase from 2020.\\nThe metro area population of New York City in 2020 was 18,804,000, a 0.01% decline from 2019." + } + } + ] + }, + "ext": { + "retrieval_augmented_generation": { + "answer": "The population of the New York City metro area increased from 18,823,000 in 2021 to 18,937,000 in 2023. This represents an increase of 114,000 people over the two-year period.", + "message_id": "rRAcY5UBSzdNxlHvyiI1" + } + } + } + ``` + +3. To continue the conversation, provide the same memory ID in the next search: + + ```json + GET /qa_demo/_search?search_pipeline=my-conversation-search-pipeline-openai + { + "query": { + "match": { + "text": "What's the population increase of Miami from 2021 to 2023?" + } + }, + "size": 1, + "_source": [ + "text" + ], + "ext": { + "generative_qa_parameters": { + "llm_model": "gpt-4o", + "llm_question": "compare population increase of New York City and Miami", + "context_size": 5, + "timeout": 15, + "memory_id": "rBAbY5UBSzdNxlHvIyI3" + } + } + } + ``` + {% include copy-curl.html %} + + Using the conversation history from memory, the model compares Miami's population data with the previously discussed New York City statistics: + + ```json + { + "took": 1, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 6, + "relation": "eq" + }, + "max_score": 3.6660428, + "hits": [ + { + "_index": "qa_demo", + "_id": "4", + "_score": 3.6660428, + "_source": { + "text": "Chart and table of population level and growth rate for the Miami metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of Miami in 2023 is 6,265,000, a 0.8% increase from 2022.\\nThe metro area population of Miami in 2022 was 6,215,000, a 0.78% increase from 2021.\\nThe metro area population of Miami in 2021 was 6,167,000, a 0.74% increase from 2020.\\nThe metro area population of Miami in 2020 was 6,122,000, a 0.71% increase from 2019." + } + } + ] + }, + "ext": { + "retrieval_augmented_generation": { + "answer": "From 2021 to 2023, the New York City metro area increased by 114,000 people, while the Miami metro area grew by 98,000 people. 
This means New York City saw a slightly larger population increase compared to Miami over the same period.",
+          "message_id": "rhAdY5UBSzdNxlHv5SKa"
+        }
+      }
+    }
+    ```
\ No newline at end of file
diff --git a/_tutorials/gen-ai/rag/index.md b/_tutorials/gen-ai/rag/index.md
new file mode 100644
index 00000000000..8e18a1d8e22
--- /dev/null
+++ b/_tutorials/gen-ai/rag/index.md
@@ -0,0 +1,63 @@
+---
+layout: default
+title: RAG
+parent: Generative AI
+has_children: true
+has_toc: false
+nav_order: 10
+redirect_from:
+  - /vector-search/tutorials/rag/
+  - /vector-search/tutorials/conversational-search/
+  - /tutorials/vector-search/rag/
+  - /tutorials/gen-ai/rag/
+rag:
+  - heading: Retrieval-augmented generation (RAG) using the DeepSeek Chat API
+    link: /tutorials/gen-ai/rag/rag-deepseek-chat/
+    list:
+      - "<b>Platform:</b> OpenSearch, Amazon OpenSearch Service"
+      - "<b>Model:</b> DeepSeek Chat"
+      - '<b>Deployment:</b> Provider API'
+  - heading: RAG using DeepSeek-R1 on Amazon Bedrock
+    link: /tutorials/gen-ai/rag/rag-deepseek-r1-bedrock/
+    list:
+      - '<b>Platform:</b> OpenSearch, Amazon OpenSearch Service'
+      - '<b>Model:</b> DeepSeek-R1'
+      - "<b>Deployment:</b> Amazon Bedrock"
+  - heading: RAG using DeepSeek-R1 in Amazon SageMaker
+    link: /tutorials/gen-ai/rag/rag-deepseek-r1-sagemaker/
+    list:
+      - "<b>Platform:</b> OpenSearch, Amazon OpenSearch Service"
+      - "<b>Model:</b> DeepSeek-R1"
+      - "<b>Deployment:</b> Amazon SageMaker"
+conversational_search:
+  - heading: Conversational search using Cohere Command
+    link: /tutorials/gen-ai/rag/conversational-search-cohere/
+    list:
+      - "<b>Platform:</b> OpenSearch"
+      - "<b>Model:</b> Cohere Command"
+      - "<b>Deployment:</b> Provider API"
+  - heading: Conversational search using OpenAI
+    link: /tutorials/gen-ai/rag/conversational-search-openai/
+    list:
+      - "<b>Platform:</b> OpenSearch"
+      - "<b>Model:</b> OpenAI GPT-4o"
+      - "<b>Deployment:</b> Provider API"
+  - heading: Conversational search using Anthropic Claude on Amazon Bedrock
+    link: /tutorials/gen-ai/rag/conversational-search-claude-bedrock/
+    list:
+      - "<b>Platform:</b> OpenSearch"
+      - "<b>Model:</b> Anthropic Claude"
+      - "<b>Deployment:</b> Amazon Bedrock API"
+---
+
+# RAG tutorials
+
+The following machine learning (ML) tutorials show you how to implement retrieval-augmented generation (RAG).
+
+{% include cards.html cards=page.rag %}
+
+## Conversational search with RAG tutorials
+
+The following tutorials show you how to implement conversational search with RAG.
+
+{% include cards.html cards=page.conversational_search %}
\ No newline at end of file
diff --git a/_tutorials/gen-ai/rag/rag-deepseek-chat.md b/_tutorials/gen-ai/rag/rag-deepseek-chat.md
new file mode 100644
index 00000000000..96c70e0303d
--- /dev/null
+++ b/_tutorials/gen-ai/rag/rag-deepseek-chat.md
@@ -0,0 +1,532 @@
+---
+layout: default
+title: RAG using the DeepSeek Chat API
+parent: RAG
+grand_parent: Generative AI
+nav_order: 120
+redirect_from:
+  - /vector-search/tutorials/rag/rag-deepseek-chat/
+  - /tutorials/vector-search/rag/rag-deepseek-chat/
+---
+
+# RAG using the DeepSeek Chat API
+
+This tutorial shows you how to implement retrieval-augmented generation (RAG) using [Amazon OpenSearch Service](https://docs.aws.amazon.com/opensearch-service/) and the [DeepSeek chat model](https://api-docs.deepseek.com/api/create-chat-completion).
+ +If you are using self-managed OpenSearch instead of Amazon OpenSearch Service, obtain a DeepSeek API key and create a connector to the DeepSeek chat model using [the blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/deepseek_connector_chat_blueprint.md). For more information about creating a connector, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). Then go directly to [Step 5](#step-5-create-and-test-the-model). + +Replace the placeholders beginning with the prefix `your_` with your own values. +{: .note} + +## Prerequisites + +Before you start, fulfill the following prerequisites. + +When configuring Amazon settings, only change the values mentioned in this tutorial. Keep all other settings at their default values. +{: .important} + +### Obtain a DeepSeek API key + +If you don't have a DeepSeek API key already, obtain one before starting this tutorial. + +### Create an OpenSearch cluster + +Go to the [Amazon OpenSearch Service console](https://console.aws.amazon.com/aos/home) and create an OpenSearch domain. + +Note the domain Amazon Resource Name (ARN) and URL; you'll use them in the following steps. + +## Step 1: Store the API key in AWS Secrets Manager + +Store your DeepSeek API key in [AWS Secrets Manager](https://docs.aws.amazon.com/secretsmanager/latest/userguide/intro.html): + +1. Open AWS Secrets Manager. +1. Select **Store a new secret**. +1. Select **Other type of secret**. +1. Create a key-value pair with **my_deepseek_key** as the key and your DeepSeek API key as the value. +1. Name your secret `my_test_deepseek_secret`. + +Note the secret ARN; you'll use it in the following steps. + +## Step 2: Create an IAM role + +To use the secret created in Step 1, you must create an AWS Identity and Access Management (IAM) role with read permissions for the secret. This IAM role will be configured in the connector and will allow the connector to read the secret. + +Go to the IAM console, create a new IAM role named `my_deepseek_secret_role`, and add the following trust policy and permissions: + +- Custom trust policy: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": "es.amazonaws.com" + }, + "Action": "sts:AssumeRole" + } + ] +} +``` +{% include copy.html %} + +- Permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Action": [ + "secretsmanager:GetSecretValue", + "secretsmanager:DescribeSecret" + ], + "Effect": "Allow", + "Resource": "your_secret_arn_created_in_step1" + } + ] +} +``` +{% include copy.html %} + +Note the role ARN; you'll use it in the following steps. + +## Step 3: Configure an IAM role in Amazon OpenSearch Service + +Follow these steps to configure an IAM role in Amazon OpenSearch Service. + +### Step 3.1: Create an IAM role for signing connector requests + +Generate a new IAM role specifically for signing your Create Connector API request. + +Create an IAM role named `my_create_deepseek_connector_role` with the following trust policy and permissions: + +- Custom trust policy: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "AWS": "your_iam_user_arn" + }, + "Action": "sts:AssumeRole" + } + ] +} +``` +{% include copy.html %} + +You'll use the `your_iam_user_arn` IAM user to assume the role in Step 4.1. 
+ +- Permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": "iam:PassRole", + "Resource": "your_iam_role_arn_created_in_step2" + }, + { + "Effect": "Allow", + "Action": "es:ESHttpPost", + "Resource": "your_opensearch_domain_arn" + } + ] +} +``` +{% include copy.html %} + +Note this role ARN; you'll use it in the following steps. + +### Step 3.2: Map a backend role + +Follow these steps to map a backend role: + +1. Log in to OpenSearch Dashboards and select **Security** on the top menu. +2. Select **Roles**, and then select the **ml_full_access** role. +3. On the **ml_full_access** role details page, select **Mapped users**, and then select **Manage mapping**. +4. Enter the IAM role ARN created in Step 3.1 in the **Backend roles** field, as shown in the following image. + ![Mapping a backend role]({{site.url}}{{site.baseurl}}/images/vector-search-tutorials/mapping_iam_role_arn.png) +4. Select **Map**. + +The IAM role is now successfully configured in your OpenSearch cluster. + +## Step 4: Create a connector + +Follow these steps to create a connector for the DeepSeek chat model. For more information about creating a connector, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). + +### Step 4.1: Get temporary credentials + +Use the credentials of the IAM user specified in Step 3.1 to assume the role: + +```bash +aws sts assume-role --role-arn your_iam_role_arn_created_in_step3.1 --role-session-name your_session_name +``` +{% include copy.html %} + +Copy the temporary credentials from the response and configure them in `~/.aws/credentials`: + +```ini +[default] +AWS_ACCESS_KEY_ID=your_access_key_of_role_created_in_step3.1 +AWS_SECRET_ACCESS_KEY=your_secret_key_of_role_created_in_step3.1 +AWS_SESSION_TOKEN=your_session_token_of_role_created_in_step3.1 +``` +{% include copy.html %} + +### Step 4.2: Create a connector + +Add the DeepSeek API endpoint to the trusted URL list: + +```json +PUT /_cluster/settings +{ + "persistent": { + "plugins.ml_commons.trusted_connector_endpoints_regex": [ + """^https://api\.deepseek\.com/.*$""" + ] + } +} +``` +{% include copy-curl.html %} + +Run the following Python code with the temporary credentials configured in `~/.aws/credentials`: + +```python +import boto3 +import requests +from requests_aws4auth import AWS4Auth + +host = 'your_amazon_opensearch_domain_endpoint' +region = 'your_amazon_opensearch_domain_region' +service = 'es' + +credentials = boto3.Session().get_credentials() +awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service, session_token=credentials.token) + +path = '/_plugins/_ml/connectors/_create' +url = host + path + +payload = { + "name": "DeepSeek Chat", + "description": "Test connector for DeepSeek Chat", + "version": "1", + "protocol": "http", + "parameters": { + "endpoint": "api.deepseek.com", + "model": "deepseek-chat" + }, + "credential": { + "secretArn": "your_secret_arn_created_in_step1", + "roleArn": "your_iam_role_arn_created_in_step2" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "https://${parameters.endpoint}/v1/chat/completions", + "headers": { + "Content-Type": "application/json", + "Authorization": "Bearer ${credential.secretArn.my_deepseek_key}" + }, + "request_body": "{ \"model\": \"${parameters.model}\", \"messages\": ${parameters.messages} }" + } + ] +} + +headers = {"Content-Type": "application/json"} + +r = requests.post(url, auth=awsauth, json=payload, 
headers=headers) +print(r.status_code) +print(r.text) +``` +{% include copy.html %} + +The script outputs a connector ID: + +```json +{"connector_id":"duRJsZQBFSAM-WcznrIw"} +``` + +Note the connector ID; you'll use it in the next step. + +## Step 5: Create and test the model + +Log in to OpenSearch Dashboards, open the DevTools console, and run the following requests to create and test the DeepSeek chat model. + +1. Create a model group: + + ```json + POST /_plugins/_ml/model_groups/_register + { + "name": "DeepSeek Chat model", + "description": "Test model group for DeepSeek model" + } + ``` + {% include copy-curl.html %} + + The response contains the model group ID: + + ```json + { + "model_group_id": "UylKsZQBts7fa6byEx2M", + "status": "CREATED" + } + ``` + +1. Register the model: + + ```json + POST /_plugins/_ml/models/_register + { + "name": "DeepSeek Chat model", + "function_name": "remote", + "description": "DeepSeek Chat model", + "model_group_id": "UylKsZQBts7fa6byEx2M", + "connector_id": "duRJsZQBFSAM-WcznrIw" + } + ``` + {% include copy-curl.html %} + + The response contains the model ID: + + ```json + { + "task_id": "VClKsZQBts7fa6bypR0a", + "status": "CREATED", + "model_id": "VSlKsZQBts7fa6bypR02" + } + ``` + +1. Deploy the model: + + ```json + POST /_plugins/_ml/models/VSlKsZQBts7fa6bypR02/_deploy + ``` + {% include copy-curl.html %} + + The response contains a task ID for the deployment operation: + + ```json + { + "task_id": "d-RKsZQBFSAM-Wcz3bKO", + "task_type": "DEPLOY_MODEL", + "status": "COMPLETED" + } + ``` + +1. Test the model: + + ```json + POST /_plugins/_ml/models/VSlKsZQBts7fa6bypR02/_predict + { + "parameters": { + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Hello!" + } + ] + } + } + ``` + {% include copy-curl.html %} + + The response contains the text generated by the model: + + ```json + { + "inference_results": [ + { + "output": [ + { + "name": "response", + "dataAsMap": { + "id": "a351252c-7393-4c5d-9abe-1c47693ad336", + "object": "chat.completion", + "created": 1738141298, + "model": "deepseek-chat", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "Hello! How can I assist you today? 😊" + }, + "logprobs": null, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 11, + "completion_tokens": 11, + "total_tokens": 22, + "prompt_tokens_details": { + "cached_tokens": 0 + }, + "prompt_cache_hit_tokens": 0, + "prompt_cache_miss_tokens": 11 + }, + "system_fingerprint": "fp_3a5770e1b4" + } + } + ], + "status_code": 200 + } + ] + } + ``` + +## Step 6: Configure RAG + +Follow these steps to configure RAG. 
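+
+Before creating the search pipeline, you can optionally confirm that the model is ready by retrieving it. This is a sketch that uses the model ID registered earlier in this tutorial; substitute your own model ID. In the response, the `model_state` field should be `DEPLOYED`:
+
+```json
+GET /_plugins/_ml/models/VSlKsZQBts7fa6bypR02
+```
+{% include copy-curl.html %}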
+ +### Step 6.1: Create a search pipeline + +Create a search pipeline with a [RAG processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rag-processor/): + +```json +PUT /_search/pipeline/my-conversation-search-pipeline-deepseek-chat +{ + "response_processors": [ + { + "retrieval_augmented_generation": { + "tag": "Demo pipeline", + "description": "Demo pipeline Using DeepSeek Chat", + "model_id": "VSlKsZQBts7fa6bypR02", + "context_field_list": [ + "text" + ], + "system_prompt": "You are a helpful assistant.", + "user_instructions": "Generate a concise and informative answer in less than 100 words for the given question" + } + } + ] +} +``` +{% include copy-curl.html %} + +### Step 6.2: Create a vector database + +Follow steps 1 and 2 of [this tutorial]({{site.url}}{{site.baseurl}}/search-plugins/neural-search-tutorial/) to create an embedding model and a vector index. Then ingest sample data into the index: + +```json +POST _bulk +{"index": {"_index": "my-nlp-index", "_id": "1"}} +{"text": "Chart and table of population level and growth rate for the Ogden-Layton metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\nThe current metro area population of Ogden-Layton in 2023 is 750,000, a 1.63% increase from 2022.\nThe metro area population of Ogden-Layton in 2022 was 738,000, a 1.79% increase from 2021.\nThe metro area population of Ogden-Layton in 2021 was 725,000, a 1.97% increase from 2020.\nThe metro area population of Ogden-Layton in 2020 was 711,000, a 2.16% increase from 2019."} +{"index": {"_index": "my-nlp-index", "_id": "2"}} +{"text": "Chart and table of population level and growth rate for the New York City metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of New York City in 2023 is 18,937,000, a 0.37% increase from 2022.\\nThe metro area population of New York City in 2022 was 18,867,000, a 0.23% increase from 2021.\\nThe metro area population of New York City in 2021 was 18,823,000, a 0.1% increase from 2020.\\nThe metro area population of New York City in 2020 was 18,804,000, a 0.01% decline from 2019."} +{"index": {"_index": "my-nlp-index", "_id": "3"}} +{"text": "Chart and table of population level and growth rate for the Chicago metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of Chicago in 2023 is 8,937,000, a 0.4% increase from 2022.\\nThe metro area population of Chicago in 2022 was 8,901,000, a 0.27% increase from 2021.\\nThe metro area population of Chicago in 2021 was 8,877,000, a 0.14% increase from 2020.\\nThe metro area population of Chicago in 2020 was 8,865,000, a 0.03% increase from 2019."} +{"index": {"_index": "my-nlp-index", "_id": "4"}} +{"text": "Chart and table of population level and growth rate for the Miami metro area from 1950 to 2023. 
United Nations population projections are also included through the year 2035.\\nThe current metro area population of Miami in 2023 is 6,265,000, a 0.8% increase from 2022.\\nThe metro area population of Miami in 2022 was 6,215,000, a 0.78% increase from 2021.\\nThe metro area population of Miami in 2021 was 6,167,000, a 0.74% increase from 2020.\\nThe metro area population of Miami in 2020 was 6,122,000, a 0.71% increase from 2019."} +{"index": {"_index": "my-nlp-index", "_id": "5"}} +{"text": "Chart and table of population level and growth rate for the Austin metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of Austin in 2023 is 2,228,000, a 2.39% increase from 2022.\\nThe metro area population of Austin in 2022 was 2,176,000, a 2.79% increase from 2021.\\nThe metro area population of Austin in 2021 was 2,117,000, a 3.12% increase from 2020.\\nThe metro area population of Austin in 2020 was 2,053,000, a 3.43% increase from 2019."} +{"index": {"_index": "my-nlp-index", "_id": "6"}} +{"text": "Chart and table of population level and growth rate for the Seattle metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of Seattle in 2023 is 3,519,000, a 0.86% increase from 2022.\\nThe metro area population of Seattle in 2022 was 3,489,000, a 0.81% increase from 2021.\\nThe metro area population of Seattle in 2021 was 3,461,000, a 0.82% increase from 2020.\\nThe metro area population of Seattle in 2020 was 3,433,000, a 0.79% increase from 2019."} +``` +{% include copy-curl.html %} + +### Step 6.3: Search the index + +Run a vector search to retrieve documents from the vector database and use the DeepSeek model for RAG: + +```json +GET /my-nlp-index/_search?search_pipeline=my-conversation-search-pipeline-deepseek-chat +{ + "query": { + "neural": { + "passage_embedding": { + "query_text": "What's the population increase of New York City from 2021 to 2023? How is the trending comparing with Miami?", + "model_id": "USkHsZQBts7fa6bybx3G", + "k": 5 + } + } + }, + "size": 4, + "_source": [ + "text" + ], + "ext": { + "generative_qa_parameters": { + "llm_model": "deepseek-chat", + "llm_question": "What's the population increase of New York City from 2021 to 2023? How is the trending comparing with Miami?", + "context_size": 5, + "timeout": 15 + } + } +} +``` +{% include copy-curl.html %} + +The response includes both the relevant documents retrieved from the vector search (in the `hits` array) and the generated answer from the DeepSeek model (in the `ext.retrieval_augmented_generation` object): + +```json +{ + "took": 5, + "timed_out": false, + "_shards": { + "total": 5, + "successful": 5, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 6, + "relation": "eq" + }, + "max_score": 0.05248103, + "hits": [ + { + "_index": "my-nlp-index", + "_id": "2", + "_score": 0.05248103, + "_source": { + "text": """Chart and table of population level and growth rate for the New York City metro area from 1950 to 2023. 
United Nations population projections are also included through the year 2035.\nThe current metro area population of New York City in 2023 is 18,937,000, a 0.37% increase from 2022.\nThe metro area population of New York City in 2022 was 18,867,000, a 0.23% increase from 2021.\nThe metro area population of New York City in 2021 was 18,823,000, a 0.1% increase from 2020.\nThe metro area population of New York City in 2020 was 18,804,000, a 0.01% decline from 2019.""" + } + }, + { + "_index": "my-nlp-index", + "_id": "4", + "_score": 0.029023321, + "_source": { + "text": """Chart and table of population level and growth rate for the Miami metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\nThe current metro area population of Miami in 2023 is 6,265,000, a 0.8% increase from 2022.\nThe metro area population of Miami in 2022 was 6,215,000, a 0.78% increase from 2021.\nThe metro area population of Miami in 2021 was 6,167,000, a 0.74% increase from 2020.\nThe metro area population of Miami in 2020 was 6,122,000, a 0.71% increase from 2019.""" + } + }, + { + "_index": "my-nlp-index", + "_id": "3", + "_score": 0.028097045, + "_source": { + "text": """Chart and table of population level and growth rate for the Chicago metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\nThe current metro area population of Chicago in 2023 is 8,937,000, a 0.4% increase from 2022.\nThe metro area population of Chicago in 2022 was 8,901,000, a 0.27% increase from 2021.\nThe metro area population of Chicago in 2021 was 8,877,000, a 0.14% increase from 2020.\nThe metro area population of Chicago in 2020 was 8,865,000, a 0.03% increase from 2019.""" + } + }, + { + "_index": "my-nlp-index", + "_id": "6", + "_score": 0.026973149, + "_source": { + "text": """Chart and table of population level and growth rate for the Seattle metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\nThe current metro area population of Seattle in 2023 is 3,519,000, a 0.86% increase from 2022.\nThe metro area population of Seattle in 2022 was 3,489,000, a 0.81% increase from 2021.\nThe metro area population of Seattle in 2021 was 3,461,000, a 0.82% increase from 2020.\nThe metro area population of Seattle in 2020 was 3,433,000, a 0.79% increase from 2019.""" + } + } + ] + }, + "ext": { + "retrieval_augmented_generation": { + "answer": "From 2021 to 2023, New York City's metro area population increased by 114,000, from 18,823,000 to 18,937,000, reflecting a growth rate of 0.61%. In comparison, Miami's metro area population grew by 98,000, from 6,167,000 to 6,265,000, with a higher growth rate of 1.59%. While New York City has a larger absolute population increase, Miami's population growth rate is significantly higher, indicating faster relative growth." 
+ } + } +} +``` \ No newline at end of file diff --git a/_tutorials/gen-ai/rag/rag-deepseek-r1-bedrock.md b/_tutorials/gen-ai/rag/rag-deepseek-r1-bedrock.md new file mode 100644 index 00000000000..4ed2b425b13 --- /dev/null +++ b/_tutorials/gen-ai/rag/rag-deepseek-r1-bedrock.md @@ -0,0 +1,468 @@ +--- +layout: default +title: RAG using DeepSeek-R1 on Amazon Bedrock +parent: RAG +grand_parent: Generative AI +nav_order: 130 +redirect_from: + - /vector-search/tutorials/rag/rag-deepseek-r1-bedrock/ + - /tutorials/vector-search/rag/rag-deepseek-r1-bedrock/ +--- + +# RAG using DeepSeek-R1 on Amazon Bedrock + +This tutorial shows you how to implement retrieval-augmented generation (RAG) using [Amazon OpenSearch Service](https://docs.aws.amazon.com/opensearch-service/) and the [DeepSeek-R1 model](https://huggingface.co/deepseek-ai/DeepSeek-R1). + +If you are using self-managed OpenSearch instead of Amazon OpenSearch Service, create a connector to the DeepSeek-R1 model using [the blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/deepseek_connector_chat_blueprint.md). For more information about creating a connector, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). Then go directly to [Step 4](#step-4-create-and-test-the-model). + +Replace the placeholders beginning with the prefix `your_` with your own values. +{: .note} + +## Prerequisites + +Before you start, fulfill the following prerequisites. + +When configuring Amazon settings, only change the values mentioned in this tutorial. Keep all other settings at their default values. +{: .important} + +### Deploy DeepSeek-R1 to Amazon Bedrock + +Follow [this notebook](https://github.com/DennisTraub/deepseekr1-on-bedrock/blob/main/deepseek-bedrock.ipynb) to deploy the DeepSeek-R1 model to Amazon Bedrock. + +Note the Amazon Bedrock DeepSeek-R1 model Amazon Resource Name (ARN); you'll use it in the following steps. + +### Create an OpenSearch cluster + +Go to the [Amazon OpenSearch Service console](https://console.aws.amazon.com/aos/home) and create an OpenSearch domain. + +Note the domain ARN and URL; you'll use them in the following steps. + +## Step 1: Create an IAM role for Amazon Bedrock access + +To invoke the DeepSeek-R1 model on Amazon Bedrock, you must create an AWS Identity and Access Management (IAM) role with appropriate permissions. The connector will use this role to invoke the model. + +Go to the IAM console, create a new IAM role named `my_invoke_bedrock_deepseek_model_role`, and add the following trust policy and permissions: + +- Custom trust policy: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": "es.amazonaws.com" + }, + "Action": "sts:AssumeRole" + } + ] +} +``` +{% include copy.html %} + +- Permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Action": [ + "bedrock:InvokeModel" + ], + "Effect": "Allow", + "Resource": "your_DeepSeek_R1_model_ARN" + } + ] +} +``` +{% include copy.html %} + +Note the role ARN; you'll use it in the following steps. + +## Step 2: Configure an IAM role in Amazon OpenSearch Service + +Follow these steps to configure an IAM role in Amazon OpenSearch Service. + +### Step 2.1: Create an IAM role for signing connector requests + +Generate a new IAM role specifically for signing your Create Connector API request. 
+ +Create an IAM role named `my_create_bedrock_deepseek_connector_role` with the following trust policy and permissions: + +- Custom trust policy: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "AWS": "your_iam_user_arn" + }, + "Action": "sts:AssumeRole" + } + ] +} +``` +{% include copy.html %} + +You'll use the `your_iam_user_arn` IAM user to assume the role in Step 3.1. + +- Permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": "iam:PassRole", + "Resource": "your_iam_role_arn_created_in_step1" + }, + { + "Effect": "Allow", + "Action": "es:ESHttpPost", + "Resource": "your_opensearch_domain_arn" + } + ] +} +``` +{% include copy.html %} + +Note this role ARN; you'll use it in the following steps. + +### Step 2.2: Map a backend role + +Follow these steps to map a backend role: + +1. Log in to OpenSearch Dashboards and select **Security** on the top menu. +2. Select **Roles**, and then select the **ml_full_access** role. +3. On the **ml_full_access** role details page, select **Mapped users**, and then select **Manage mapping**. +4. Enter the IAM role ARN created in Step 2.1 in the **Backend roles** field, as shown in the following image. + ![Mapping a backend role]({{site.url}}{{site.baseurl}}/images/vector-search-tutorials/mapping_iam_role_arn.png) +5. Select **Map**. + +The IAM role is now successfully configured in your OpenSearch cluster. + +## Step 3: Create a connector + +Follow these steps to create a connector for the DeepSeek-R1 model. For more information about creating a connector, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). + +### Step 3.1: Get temporary credentials + +Use the credentials of the IAM user specified in Step 2.1 to assume the role: + +```bash +aws sts assume-role --role-arn your_iam_role_arn_created_in_step2.1 --role-session-name your_session_name +``` +{% include copy.html %} + +Copy the temporary credentials from the response and configure them in `~/.aws/credentials`: + +```ini +[default] +AWS_ACCESS_KEY_ID=your_access_key_of_role_created_in_step2.1 +AWS_SECRET_ACCESS_KEY=your_secret_key_of_role_created_in_step2.1 +AWS_SESSION_TOKEN=your_session_token_of_role_created_in_step2.1 +``` +{% include copy.html %} + +### Step 3.2: Create a connector + +Run the following Python code with the temporary credentials configured in `~/.aws/credentials`: + +```python +import boto3 +import requests +from requests_aws4auth import AWS4Auth + +host = 'your_amazon_opensearch_domain_endpoint' +region = 'your_amazon_opensearch_domain_region' +service = 'es' + +credentials = boto3.Session().get_credentials() +awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service, session_token=credentials.token) + +path = '/_plugins/_ml/connectors/_create' +url = host + path + +payload = { + "name": "DeepSeek R1 model connector", + "description": "Connector for my Bedrock DeepSeek model", + "version": "1.0", + "protocol": "aws_sigv4", + "credential": { + "roleArn": "your_iam_role_arn_created_in_step1" + }, + "parameters": { + "service_name": "bedrock", + "region": "your_bedrock_model_region", + "model_id": "your_deepseek_bedrock_model_arn", + "temperature": 0, + "max_gen_len": 4000 + }, + "actions": [ + { + "action_type": "PREDICT", + "method": "POST", + "url": "https://bedrock-runtime.us-east-1.amazonaws.com/model/${parameters.model_id}/invoke", + "headers": { + "content-type": "application/json" + }, + 
"request_body": "{ \"prompt\": \"<|begin▁of▁sentence|><|User|>${parameters.inputs}<|Assistant|>\", \"temperature\": ${parameters.temperature}, \"max_gen_len\": ${parameters.max_gen_len} }", + "post_process_function": "\n return '{' +\n '\"name\": \"response\",'+\n '\"dataAsMap\": {' +\n '\"completion\":\"' + escape(params.generation) + '\"}' +\n '}';\n " + } + ] +} + +headers = {"Content-Type": "application/json"} + +r = requests.post(url, auth=awsauth, json=payload, headers=headers) +print(r.status_code) +print(r.text) +``` +{% include copy.html %} + +The script outputs a connector ID: + +```json +{"connector_id":"HnS5sJQBVQUimUskjpFl"} +``` + +Note the connector ID; you'll use it in the next step. + +## Step 4: Create and test the model + +Log in to OpenSearch Dashboards, open the DevTools console, and run the following requests to create and test the DeepSeek-R1 model. + +1. Create a model group: + + ```json + POST /_plugins/_ml/model_groups/_register + { + "name": "Bedrock DeepSeek model", + "description": "Test model group for Bedrock DeepSeek model" + } + ``` + {% include copy-curl.html %} + + The response contains the model group ID: + + ```json + { + "model_group_id": "Vylgs5QBts7fa6bylR0v", + "status": "CREATED" + } + ``` + +2. Register the model: + + ```json + POST /_plugins/_ml/models/_register + { + "name": "Bedrock DeepSeek R1 model", + "function_name": "remote", + "description": "DeepSeek R1 model on Bedrock", + "model_group_id": "Vylgs5QBts7fa6bylR0v", + "connector_id": "KHS7s5QBVQUimUskoZGp" + } + ``` + {% include copy-curl.html %} + + The response contains the model ID: + + ```json + { + "task_id": "hOS7s5QBFSAM-Wczv7KD", + "status": "CREATED", + "model_id": "heS7s5QBFSAM-Wczv7Kb" + } + ``` + +3. Deploy the model: + + ```json + POST /_plugins/_ml/models/heS7s5QBFSAM-Wczv7Kb/_deploy + ``` + {% include copy-curl.html %} + + The response contains a task ID for the deployment operation: + + ```json + { + "task_id": "euRhs5QBFSAM-WczTrI6", + "task_type": "DEPLOY_MODEL", + "status": "COMPLETED" + } + ``` + +4. Test the model: + + ```json + POST /_plugins/_ml/models/heS7s5QBFSAM-Wczv7Kb/_predict + { + "parameters": { + "inputs": "hello" + } + } + ``` + {% include copy-curl.html %} + + The response contains the text generated by the model: + + ```json + { + "inference_results": [ + { + "output": [ + { + "name": "response", + "dataAsMap": { + "completion": """<think>\n\n</think>\n\nHello! How can I assist you today? 😊""" + } + } + ], + "status_code": 200 + } + ] + } + ``` + +## Step 5: Configure RAG + +Follow these steps to configure RAG. + +### Step 5.1: Create a search pipeline + +Create a search pipeline with a [RAG processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rag-processor/): + +```json +PUT /_search/pipeline/my-conversation-search-pipeline-deepseek +{ + "response_processors": [ + { + "retrieval_augmented_generation": { + "tag": "Demo pipeline", + "description": "Demo pipeline Using DeepSeek R1", + "model_id": "heS7s5QBFSAM-Wczv7Kb", + "context_field_list": [ + "text" + ], + "system_prompt": "You are a helpful assistant.", + "user_instructions": "Generate a concise and informative answer in less than 100 words for the given question" + } + } + ] +} +``` +{% include copy-curl.html %} + +### Step 5.2: Create a vector database + +Follow steps 1 and 2 of [this tutorial]({{site.url}}{{site.baseurl}}/search-plugins/neural-search-tutorial/) to create an embedding model and a vector index. 
Then ingest sample data into the index: + +```json +POST _bulk +{"index": {"_index": "my-nlp-index", "_id": "1"}} +{"text": "Chart and table of population level and growth rate for the Ogden-Layton metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\nThe current metro area population of Ogden-Layton in 2023 is 750,000, a 1.63% increase from 2022.\nThe metro area population of Ogden-Layton in 2022 was 738,000, a 1.79% increase from 2021.\nThe metro area population of Ogden-Layton in 2021 was 725,000, a 1.97% increase from 2020.\nThe metro area population of Ogden-Layton in 2020 was 711,000, a 2.16% increase from 2019."} +{"index": {"_index": "my-nlp-index", "_id": "2"}} +{"text": "Chart and table of population level and growth rate for the New York City metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of New York City in 2023 is 18,937,000, a 0.37% increase from 2022.\\nThe metro area population of New York City in 2022 was 18,867,000, a 0.23% increase from 2021.\\nThe metro area population of New York City in 2021 was 18,823,000, a 0.1% increase from 2020.\\nThe metro area population of New York City in 2020 was 18,804,000, a 0.01% decline from 2019."} +{"index": {"_index": "my-nlp-index", "_id": "3"}} +{"text": "Chart and table of population level and growth rate for the Chicago metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of Chicago in 2023 is 8,937,000, a 0.4% increase from 2022.\\nThe metro area population of Chicago in 2022 was 8,901,000, a 0.27% increase from 2021.\\nThe metro area population of Chicago in 2021 was 8,877,000, a 0.14% increase from 2020.\\nThe metro area population of Chicago in 2020 was 8,865,000, a 0.03% increase from 2019."} +{"index": {"_index": "my-nlp-index", "_id": "4"}} +{"text": "Chart and table of population level and growth rate for the Miami metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of Miami in 2023 is 6,265,000, a 0.8% increase from 2022.\\nThe metro area population of Miami in 2022 was 6,215,000, a 0.78% increase from 2021.\\nThe metro area population of Miami in 2021 was 6,167,000, a 0.74% increase from 2020.\\nThe metro area population of Miami in 2020 was 6,122,000, a 0.71% increase from 2019."} +{"index": {"_index": "my-nlp-index", "_id": "5"}} +{"text": "Chart and table of population level and growth rate for the Austin metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of Austin in 2023 is 2,228,000, a 2.39% increase from 2022.\\nThe metro area population of Austin in 2022 was 2,176,000, a 2.79% increase from 2021.\\nThe metro area population of Austin in 2021 was 2,117,000, a 3.12% increase from 2020.\\nThe metro area population of Austin in 2020 was 2,053,000, a 3.43% increase from 2019."} +{"index": {"_index": "my-nlp-index", "_id": "6"}} +{"text": "Chart and table of population level and growth rate for the Seattle metro area from 1950 to 2023. 
United Nations population projections are also included through the year 2035.\\nThe current metro area population of Seattle in 2023 is 3,519,000, a 0.86% increase from 2022.\\nThe metro area population of Seattle in 2022 was 3,489,000, a 0.81% increase from 2021.\\nThe metro area population of Seattle in 2021 was 3,461,000, a 0.82% increase from 2020.\\nThe metro area population of Seattle in 2020 was 3,433,000, a 0.79% increase from 2019."} +``` +{% include copy-curl.html %} + +### Step 5.3: Search the index + +Run a vector search to retrieve documents from the vector database and use the DeepSeek model for RAG: + +```json +GET /my-nlp-index/_search?search_pipeline=my-conversation-search-pipeline-deepseek +{ + "query": { + "neural": { + "passage_embedding": { + "query_text": "What's the population increase of New York City from 2021 to 2023? How is the trending comparing with Miami?", + "model_id": "heS7s5QBFSAM-Wczv7Kb", + "k": 5 + } + } + }, + "size": 2, + "_source": [ + "text" + ], + "ext": { + "generative_qa_parameters": { + "llm_model": "bedrock/claude", + "llm_question": "What's the population increase of New York City from 2021 to 2023? How is the trending comparing with Miami?", + "context_size": 5, + "timeout": 15 + } + } +} +``` +{% include copy-curl.html %} + +The response includes both the relevant documents retrieved from the vector search (in the `hits` array) and the generated answer from the DeepSeek model (in the `ext.retrieval_augmented_generation` object): + +```json +{ + "took": 5, + "timed_out": false, + "_shards": { + "total": 5, + "successful": 5, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 6, + "relation": "eq" + }, + "max_score": 0.04107812, + "hits": [ + { + "_index": "my-nlp-index", + "_id": "4", + "_score": 0.04107812, + "_source": { + "text": """Chart and table of population level and growth rate for the Miami metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\nThe current metro area population of Miami in 2023 is 6,265,000, a 0.8% increase from 2022.\nThe metro area population of Miami in 2022 was 6,215,000, a 0.78% increase from 2021.\nThe metro area population of Miami in 2021 was 6,167,000, a 0.74% increase from 2020.\nThe metro area population of Miami in 2020 was 6,122,000, a 0.71% increase from 2019.""" + } + }, + { + "_index": "my-nlp-index", + "_id": "2", + "_score": 0.03810156, + "_source": { + "text": """Chart and table of population level and growth rate for the New York City metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\nThe current metro area population of New York City in 2023 is 18,937,000, a 0.37% increase from 2022.\nThe metro area population of New York City in 2022 was 18,867,000, a 0.23% increase from 2021.\nThe metro area population of New York City in 2021 was 18,823,000, a 0.1% increase from 2020.\nThe metro area population of New York City in 2020 was 18,804,000, a 0.01% decline from 2019.""" + } + } + ] + }, + "ext": { + "retrieval_augmented_generation": { + "answer": """You are a helpful assistant.\nGenerate a concise and informative answer in less than 100 words for the given question\nSEARCH RESULT 1: Chart and table of population level and growth rate for the Miami metro area from 1950 to 2023. 
United Nations population projections are also included through the year 2035.\nThe current metro area population of Miami in 2023 is 6,265,000, a 0.8% increase from 2022.\nThe metro area population of Miami in 2022 was 6,215,000, a 0.78% increase from 2021.\nThe metro area population of Miami in 2021 was 6,167,000, a 0.74% increase from 2020.\nThe metro area population of Miami in 2020 was 6,122,000, a 0.71% increase from 2019.\nSEARCH RESULT 2: Chart and table of population level and growth rate for the New York City metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\nThe current metro area population of New York City in 2023 is 18,937,000, a 0.37% increase from 2022.\nThe metro area population of New York City in 2022 was 18,867,000, a 0.23% increase from 2021.\nThe metro area population of New York City in 2021 was 18,823,000, a 0.1% increase from 2020.\nThe metro area population of New York City in 2020 was 18,804,000, a 0.01% decline from 2019.\nQUESTION: What's the population increase of New York City from 2021 to 2023? How is the trending comparing with Miami?\nOkay, I need to figure out the population increase of New York City from 2021 to 2023 and compare it with Miami's growth. Let me start by looking at the data provided. + +From SEARCH RESULT 2, in 2021, NYC's population was 18,823,000, and in 2022, it was 18,867,000. Then in 2023, it's 18,937,000. So, from 2021 to 2022, it increased by 44,000, and from 2022 to 2023, it went up by 70,000. Adding those together, the total increase from 2021 to 2023 is 114,000. + +Now, looking at Miami's data in SEARCH RESULT 1, in 2021, the population was 6,167,000, and in 2023, it's 6,265,000. That's an increase of 98,000 over the same period. + +Comparing the two, NYC's increase is higher than Miami's. NYC went up by 114,000, while Miami was 98,000. Also, NYC's growth rate is a bit lower than Miami's. NYC's average annual growth rate is around 0.37%, whereas Miami's is about 0.75%. So, while NYC's population increased more in total, Miami's growth rate is higher. I should present this clearly, highlighting both the total increase and the growth rates to show the comparison accurately. +</think> + +From 2021 to 2023, New York City's population increased by 114,000, compared to Miami's increase of 98,000. While NYC's total growth is higher, Miami's annual growth rate (0.75%) is notably faster than NYC's (0.37%).""" + } + } +} +``` \ No newline at end of file diff --git a/_tutorials/gen-ai/rag/rag-deepseek-r1-sagemaker.md b/_tutorials/gen-ai/rag/rag-deepseek-r1-sagemaker.md new file mode 100644 index 00000000000..e9cfdbd2ae4 --- /dev/null +++ b/_tutorials/gen-ai/rag/rag-deepseek-r1-sagemaker.md @@ -0,0 +1,498 @@ +--- +layout: default +title: RAG using DeepSeek-R1 in Amazon SageMaker +parent: RAG +grand_parent: Generative AI +nav_order: 140 +redirect_from: + - /vector-search/tutorials/rag/rag-deepseek-r1-sagemaker/ + - /tutorials/vector-search/rag/rag-deepseek-r1-sagemaker/ +--- + +# RAG using DeepSeek-R1 in Amazon SageMaker + +This tutorial shows you how to implement retrieval-augmented generation (RAG) using [Amazon OpenSearch Service](https://docs.aws.amazon.com/opensearch-service/) and the [DeepSeek-R1 model](https://huggingface.co/deepseek-ai/DeepSeek-R1). 
+ +If you are using self-managed OpenSearch instead of Amazon OpenSearch Service, create a connector to the DeepSeek-R1 model using [the blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/deepseek_connector_chat_blueprint.md). For more information about creating a connector, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). Then go directly to [Step 4](#step-4-create-and-test-the-model). + +Replace the placeholders beginning with the prefix `your_` with your own values. +{: .note} + +## Prerequisites + +Before you start, fulfill the following prerequisites. + +When configuring Amazon settings, only change the values mentioned in this tutorial. Keep all other settings at their default values. +{: .important} + +### Deploy DeepSeek-R1 to Amazon SageMaker + +Follow the instructions in [this blog post](https://community.aws/content/2sG84dNUCFzA9z4HdfqTI0tcvKP/deploying-deepseek-r1-on-amazon-sagemaker) to deploy the DeepSeek-R1 model to Amazon SageMaker. + +Note the Amazon SageMaker DeepSeek-R1 model Amazon Resource Name (ARN) and URL; you'll use them in the following steps. + +### Create an OpenSearch cluster + +Go to the [Amazon OpenSearch Service console](https://console.aws.amazon.com/aos/home) and create an OpenSearch domain. + +Note the domain ARN and URL; you'll use them in the following steps. + +## Step 1: Create an IAM role for Amazon SageMaker access + +To invoke the DeepSeek-R1 model in Amazon SageMaker, you must create an AWS Identity and Access Management (IAM) role with appropriate permissions. The connector will use this role to invoke the model. + +Go to the IAM console, create a new IAM role named `my_invoke_sagemaker_deepseek_model_role`, and add the following trust policy and permissions: + +- Custom trust policy: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": "es.amazonaws.com" + }, + "Action": "sts:AssumeRole" + } + ] +} +``` +{% include copy.html %} + +- Permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "sagemaker:InvokeEndpoint" + ], + "Resource": [ + "your_sagemaker_model_inference_endpoint_arn" + ] + } + ] +} +``` +{% include copy.html %} + +Note the role ARN; you'll use it in the following steps. + +## Step 2: Configure an IAM role in Amazon OpenSearch Service + +Follow these steps to configure an IAM role in Amazon OpenSearch Service. + +### Step 2.1: Create an IAM role for signing connector requests + +Generate a new IAM role specifically for signing your Create Connector API request. + +Create an IAM role named `my_create_sagemaker_deepseek_connector_role` with the following trust policy and permissions: + +- Custom trust policy: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "AWS": "your_iam_user_arn" + }, + "Action": "sts:AssumeRole" + } + ] +} +``` +{% include copy.html %} + +You'll use the `your_iam_user_arn` IAM user to assume the role in Step 3.1. + +- Permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": "iam:PassRole", + "Resource": "your_iam_role_arn_created_in_step1" + }, + { + "Effect": "Allow", + "Action": "es:ESHttpPost", + "Resource": "your_opensearch_domain_arn" + } + ] +} +``` +{% include copy.html %} + +Note this role ARN; you'll use it in the following steps. 
+ +### Step 2.2: Map a backend role + +Follow these steps to map a backend role: + +1. Log in to OpenSearch Dashboards and select **Security** on the top menu. +2. Select **Roles**, and then select the **ml_full_access** role. +3. On the **ml_full_access** role details page, select **Mapped users**, and then select **Manage mapping**. +4. Enter the IAM role ARN created in Step 2.1 in the **Backend roles** field, as shown in the following image. + ![Mapping a backend role]({{site.url}}{{site.baseurl}}/images/vector-search-tutorials/mapping_iam_role_arn.png) +5. Select **Map**. + +The IAM role is now successfully configured in your OpenSearch cluster. + +## Step 3: Create a connector + +Follow these steps to create a connector for the DeepSeek-R1 model. For more information about creating a connector, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). + +### Step 3.1: Get temporary credentials + +Use the credentials of the IAM user specified in Step 2.1 to assume the role: + +```bash +aws sts assume-role --role-arn your_iam_role_arn_created_in_step2.1 --role-session-name your_session_name +``` +{% include copy.html %} + +Copy the temporary credentials from the response and configure them in `~/.aws/credentials`: + +```ini +[default] +AWS_ACCESS_KEY_ID=your_access_key_of_role_created_in_step2.1 +AWS_SECRET_ACCESS_KEY=your_secret_key_of_role_created_in_step2.1 +AWS_SESSION_TOKEN=your_session_token_of_role_created_in_step2.1 +``` +{% include copy.html %} + +### Step 3.2: Create a connector + +Run the following Python code with the temporary credentials configured in `~/.aws/credentials`: + +```python +import boto3 +import requests +from requests_aws4auth import AWS4Auth + +host = 'your_amazon_opensearch_domain_endpoint' +region = 'your_amazon_opensearch_domain_region' +service = 'es' + +credentials = boto3.Session().get_credentials() +awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service, session_token=credentials.token) + + +path = '/_plugins/_ml/connectors/_create' +url = host + path + +payload = { + "name": "DeepSeek R1 model connector", + "description": "Connector for my Sagemaker DeepSeek model", + "version": "1.0", + "protocol": "aws_sigv4", + "credential": { + "roleArn": "your_iam_role_arn_created_in_step1" + }, + "parameters": { + "service_name": "sagemaker", + "region": "your_sagemaker_model_region", + "do_sample": true, + "top_p": 0.9, + "temperature": 0.7, + "max_new_tokens": 512 + }, + "actions": [ + { + "action_type": "PREDICT", + "method": "POST", + "url": "your_sagemaker_model_inference_endpoint", + "headers": { + "content-type": "application/json" + }, + "request_body": "{ \"inputs\": \"${parameters.inputs}\", \"parameters\": {\"do_sample\": ${parameters.do_sample}, \"top_p\": ${parameters.top_p}, \"temperature\": ${parameters.temperature}, \"max_new_tokens\": ${parameters.max_new_tokens}} }", + "post_process_function": "\n if (params.result == null || params.result.length == 0) {\n throw new Exception('No response available');\n }\n \n def completion = params.result[0].generated_text;\n return '{' +\n '\"name\": \"response\",'+\n '\"dataAsMap\": {' +\n '\"completion\":\"' + escape(completion) + '\"}' +\n '}';\n " + } + ] +} + +headers = {"Content-Type": "application/json"} + +r = requests.post(url, auth=awsauth, json=payload, headers=headers) +print(r.status_code) +print(r.text) +``` +{% include copy.html %} + +The script outputs a connector ID: + +```json +{"connector_id":"HnS5sJQBVQUimUskjpFl"} +``` + +Note 
the connector ID; you'll use it in the next step. + +## Step 4: Create and test the model + +Log in to OpenSearch Dashboards, open the DevTools console, and run the following requests to create and test the DeepSeek-R1 model. + +1. Create a model group: + + ```json + POST /_plugins/_ml/model_groups/_register + { + "name": "Sagemaker DeepSeek model", + "description": "Test model group for Sagemaker DeepSeek model" + } + ``` + {% include copy-curl.html %} + + The response contains the model group ID: + + ```json + { + "model_group_id": "H3S8sJQBVQUimUskW5Fm", + "status": "CREATED" + } + ``` + +2. Register the model: + + ```json + POST /_plugins/_ml/models/_register + { + "name": "Sagemaker DeepSeek R1 model", + "function_name": "remote", + "description": "DeepSeek R1 model on Sagemaker", + "model_group_id": "H3S8sJQBVQUimUskW5Fm", + "connector_id": "HnS5sJQBVQUimUskjpFl" + } + ``` + {% include copy-curl.html %} + + The response contains the model ID: + + ```json + { + "task_id": "Sim9sJQBts7fa6byEh1S", + "status": "CREATED", + "model_id": "Sym9sJQBts7fa6byEh1-" + } + ``` + +3. Deploy the model: + + ```json + POST /_plugins/_ml/models/Sym9sJQBts7fa6byEh1-/_deploy + ``` + {% include copy-curl.html %} + + The response contains a task ID for the deployment operation: + + ```json + { + "task_id": "TCm9sJQBts7fa6byex2j", + "task_type": "DEPLOY_MODEL", + "status": "COMPLETED" + } + ``` + +4. Test the model: + + ```json + POST /_plugins/_ml/models/Sym9sJQBts7fa6byEh1-/_predict + { + "parameters": { + "inputs": "hello" + } + } + ``` + {% include copy-curl.html %} + + The response contains the text generated by the model: + + ```json + { + "inference_results": [ + { + "output": [ + { + "name": "response", + "dataAsMap": { + "response": [ + { + "generated_text": """hello<think> + + </think> + + Hello! How can I assist you today? 😊""" + } + ] + } + } + ], + "status_code": 200 + } + ] + } + ``` + +## Step 5: Configure RAG + +Follow these steps to configure RAG. + +### Step 5.1: Create a search pipeline + +Create a search pipeline with a [RAG processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rag-processor/): + +```json +PUT /_search/pipeline/my-conversation-search-pipeline-deepseek +{ + "response_processors": [ + { + "retrieval_augmented_generation": { + "tag": "Demo pipeline", + "description": "Demo pipeline Using DeepSeek R1", + "model_id": "Sym9sJQBts7fa6byEh1-", + "context_field_list": [ + "text" + ], + "system_prompt": "You are a helpful assistant.", + "user_instructions": "Generate a concise and informative answer in less than 100 words for the given question" + } + } + ] +} +``` +{% include copy-curl.html %} + +### Step 5.2: Create a vector database + +Follow steps 1 and 2 of [this tutorial]({{site.url}}{{site.baseurl}}/search-plugins/neural-search-tutorial/) to create an embedding model and a vector index. Then ingest sample data into the index: + +```json +POST _bulk +{"index": {"_index": "my-nlp-index", "_id": "1"}} +{"text": "Chart and table of population level and growth rate for the Ogden-Layton metro area from 1950 to 2023. 
United Nations population projections are also included through the year 2035.\nThe current metro area population of Ogden-Layton in 2023 is 750,000, a 1.63% increase from 2022.\nThe metro area population of Ogden-Layton in 2022 was 738,000, a 1.79% increase from 2021.\nThe metro area population of Ogden-Layton in 2021 was 725,000, a 1.97% increase from 2020.\nThe metro area population of Ogden-Layton in 2020 was 711,000, a 2.16% increase from 2019."} +{"index": {"_index": "my-nlp-index", "_id": "2"}} +{"text": "Chart and table of population level and growth rate for the New York City metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of New York City in 2023 is 18,937,000, a 0.37% increase from 2022.\\nThe metro area population of New York City in 2022 was 18,867,000, a 0.23% increase from 2021.\\nThe metro area population of New York City in 2021 was 18,823,000, a 0.1% increase from 2020.\\nThe metro area population of New York City in 2020 was 18,804,000, a 0.01% decline from 2019."} +{"index": {"_index": "my-nlp-index", "_id": "3"}} +{"text": "Chart and table of population level and growth rate for the Chicago metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of Chicago in 2023 is 8,937,000, a 0.4% increase from 2022.\\nThe metro area population of Chicago in 2022 was 8,901,000, a 0.27% increase from 2021.\\nThe metro area population of Chicago in 2021 was 8,877,000, a 0.14% increase from 2020.\\nThe metro area population of Chicago in 2020 was 8,865,000, a 0.03% increase from 2019."} +{"index": {"_index": "my-nlp-index", "_id": "4"}} +{"text": "Chart and table of population level and growth rate for the Miami metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of Miami in 2023 is 6,265,000, a 0.8% increase from 2022.\\nThe metro area population of Miami in 2022 was 6,215,000, a 0.78% increase from 2021.\\nThe metro area population of Miami in 2021 was 6,167,000, a 0.74% increase from 2020.\\nThe metro area population of Miami in 2020 was 6,122,000, a 0.71% increase from 2019."} +{"index": {"_index": "my-nlp-index", "_id": "5"}} +{"text": "Chart and table of population level and growth rate for the Austin metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\\nThe current metro area population of Austin in 2023 is 2,228,000, a 2.39% increase from 2022.\\nThe metro area population of Austin in 2022 was 2,176,000, a 2.79% increase from 2021.\\nThe metro area population of Austin in 2021 was 2,117,000, a 3.12% increase from 2020.\\nThe metro area population of Austin in 2020 was 2,053,000, a 3.43% increase from 2019."} +{"index": {"_index": "my-nlp-index", "_id": "6"}} +{"text": "Chart and table of population level and growth rate for the Seattle metro area from 1950 to 2023. 
United Nations population projections are also included through the year 2035.\\nThe current metro area population of Seattle in 2023 is 3,519,000, a 0.86% increase from 2022.\\nThe metro area population of Seattle in 2022 was 3,489,000, a 0.81% increase from 2021.\\nThe metro area population of Seattle in 2021 was 3,461,000, a 0.82% increase from 2020.\\nThe metro area population of Seattle in 2020 was 3,433,000, a 0.79% increase from 2019."} +``` +{% include copy-curl.html %} + +### Step 5.3: Search the index + +Run a vector search to retrieve documents from the vector database and use the DeepSeek model for RAG: + +```json +GET /my-nlp-index/_search?search_pipeline=my-conversation-search-pipeline-deepseek +{ + "query": { + "neural": { + "passage_embedding": { + "query_text": "What's the population increase of New York City from 2021 to 2023? How is the trending comparing with Miami?", + "model_id": "USkHsZQBts7fa6bybx3G", + "k": 5 + } + } + }, + "size": 4, + "_source": [ + "text" + ], + "ext": { + "generative_qa_parameters": { + "llm_model": "bedrock/claude", + "llm_question": "What's the population increase of New York City from 2021 to 2023? How is the trending comparing with Miami?", + "context_size": 5, + "timeout": 15 + } + } +} +``` +{% include copy-curl.html %} + +The response includes both the relevant documents retrieved from the vector search (in the `hits` array) and the generated answer from the DeepSeek model (in the `ext.retrieval_augmented_generation` object): + +```json +{ + "took": 5, + "timed_out": false, + "_shards": { + "total": 5, + "successful": 5, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 6, + "relation": "eq" + }, + "max_score": 0.05248103, + "hits": [ + { + "_index": "my-nlp-index", + "_id": "2", + "_score": 0.05248103, + "_source": { + "text": """Chart and table of population level and growth rate for the New York City metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\nThe current metro area population of New York City in 2023 is 18,937,000, a 0.37% increase from 2022.\nThe metro area population of New York City in 2022 was 18,867,000, a 0.23% increase from 2021.\nThe metro area population of New York City in 2021 was 18,823,000, a 0.1% increase from 2020.\nThe metro area population of New York City in 2020 was 18,804,000, a 0.01% decline from 2019.""" + } + }, + { + "_index": "my-nlp-index", + "_id": "4", + "_score": 0.029023321, + "_source": { + "text": """Chart and table of population level and growth rate for the Miami metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\nThe current metro area population of Miami in 2023 is 6,265,000, a 0.8% increase from 2022.\nThe metro area population of Miami in 2022 was 6,215,000, a 0.78% increase from 2021.\nThe metro area population of Miami in 2021 was 6,167,000, a 0.74% increase from 2020.\nThe metro area population of Miami in 2020 was 6,122,000, a 0.71% increase from 2019.""" + } + }, + { + "_index": "my-nlp-index", + "_id": "3", + "_score": 0.028097045, + "_source": { + "text": """Chart and table of population level and growth rate for the Chicago metro area from 1950 to 2023. 
United Nations population projections are also included through the year 2035.\nThe current metro area population of Chicago in 2023 is 8,937,000, a 0.4% increase from 2022.\nThe metro area population of Chicago in 2022 was 8,901,000, a 0.27% increase from 2021.\nThe metro area population of Chicago in 2021 was 8,877,000, a 0.14% increase from 2020.\nThe metro area population of Chicago in 2020 was 8,865,000, a 0.03% increase from 2019.""" + } + }, + { + "_index": "my-nlp-index", + "_id": "6", + "_score": 0.026973149, + "_source": { + "text": """Chart and table of population level and growth rate for the Seattle metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\nThe current metro area population of Seattle in 2023 is 3,519,000, a 0.86% increase from 2022.\nThe metro area population of Seattle in 2022 was 3,489,000, a 0.81% increase from 2021.\nThe metro area population of Seattle in 2021 was 3,461,000, a 0.82% increase from 2020.\nThe metro area population of Seattle in 2020 was 3,433,000, a 0.79% increase from 2019.""" + } + } + ] + }, + "ext": { + "retrieval_augmented_generation": { + "answer": """You are a helpful assistant.\nGenerate a concise and informative answer in less than 100 words for the given question\nSEARCH RESULT 1: Chart and table of population level and growth rate for the New York City metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\nThe current metro area population of New York City in 2023 is 18,937,000, a 0.37% increase from 2022.\nThe metro area population of New York City in 2022 was 18,867,000, a 0.23% increase from 2021.\nThe metro area population of New York City in 2021 was 18,823,000, a 0.1% increase from 2020.\nThe metro area population of New York City in 2020 was 18,804,000, a 0.01% decline from 2019.\nSEARCH RESULT 2: Chart and table of population level and growth rate for the Miami metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\nThe current metro area population of Miami in 2023 is 6,265,000, a 0.8% increase from 2022.\nThe metro area population of Miami in 2022 was 6,215,000, a 0.78% increase from 2021.\nThe metro area population of Miami in 2021 was 6,167,000, a 0.74% increase from 2020.\nThe metro area population of Miami in 2020 was 6,122,000, a 0.71% increase from 2019.\nSEARCH RESULT 3: Chart and table of population level and growth rate for the Chicago metro area from 1950 to 2023. United Nations population projections are also included through the year 2035.\nThe current metro area population of Chicago in 2023 is 8,937,000, a 0.4% increase from 2022.\nThe metro area population of Chicago in 2022 was 8,901,000, a 0.27% increase from 2021.\nThe metro area population of Chicago in 2021 was 8,877,000, a 0.14% increase from 2020.\nThe metro area population of Chicago in 2020 was 8,865,000, a 0.03% increase from 2019.\nSEARCH RESULT 4: Chart and table of population level and growth rate for the Seattle metro area from 1950 to 2023. 
United Nations population projections are also included through the year 2035.\nThe current metro area population of Seattle in 2023 is 3,519,000, a 0.86% increase from 2022.\nThe metro area population of Seattle in 2022 was 3,489,000, a 0.81% increase from 2021.\nThe metro area population of Seattle in 2021 was 3,461,000, a 0.82% increase from 2020.\nThe metro area population of Seattle in 2020 was 3,433,000, a 0.79% increase from 2019.\nQUESTION: What's the population increase of New York City from 2021 to 2023? How is the trending comparing with Miami\nAlright, let's tackle this question step by step. The user is asking for the population increase of New York City from 2021 to 2023 and how this trend compares to Miami's. + +First, I'll look through the search results to find the relevant data. From SEARCH RESULT 1, I see the populations for NYC in 2021, 2022, and 2023. In 2021, it was 18,823,000, and by 2023, it's 18,937,000. That's an increase of 114,000 over two years. + +Next, I'll calculate the annual growth rates. From 2021 to 2022, the growth rate was 0.23%, and from 2022 to 2023, it's 0.37%. So, the trend shows an increase in the growth rate each year. + +Now, looking at Miami in SEARCH RESULT 2, the population in 2021 was 6,167,000, and in 2023, it's 6,265,000. That's an increase of 98,000 over the same period. The growth rates were 0.74% in 2021-2022 and 0.8% in 2022-2023, also showing an increasing trend but at a higher rate than NYC. + +Putting it all together, NYC's population increased by 114,000 with growth rates rising from 0.23% to 0.37%. Miami saw a slightly smaller increase of 98,000 but with higher growth rates, from 0.74% to 0.8%. So, Miami's growth is both higher in absolute terms and has a faster increasing rate compared to NYC. +</think> + +The population of New York City increased by 114,000 from 2021 to 2023. The growth rate rose from 0.1% in 2021 to 0.37% in 2023. Comparatively, Miami's population increased by 98,000 during the same period""" + } + } +} +``` \ No newline at end of file diff --git a/_tutorials/index.md b/_tutorials/index.md new file mode 100644 index 00000000000..4624a160068 --- /dev/null +++ b/_tutorials/index.md @@ -0,0 +1,35 @@ +--- +layout: default +title: Tutorials +has_children: true +has_toc: false +nav_order: 47 +nav_exclude: true +permalink: /tutorials/ +redirect_from: + - /ml-commons-plugin/tutorials/ + - /ml-commons-plugin/tutorials/index/ +cards: + - heading: "Searching data 101" + description: "Learn the fundamentals of search and explore OpenSearch query languages and types" + link: "/getting-started/search-data/" + - heading: "OpenSearch Dashboards" + description: "Start visualizing your data with interactive dashboards and powerful analytics tools" + link: "/dashboards/quickstart/" + - heading: "Vector search" + description: "Implement similarity search using vectors and enhance results with AI capabilities" + link: "/tutorials/vector-search/" + - heading: "Reranking search results" + description: "Enhance search relevance using machine learning models to intelligently reorder results" + link: "/tutorials/reranking/" + - heading: "Generative AI applications" + description: "Create AI-powered applications like RAG, chatbots, and advanced conversational systems" + link: "/tutorials/gen-ai/" +--- + +# Tutorials + +Follow our step-by-step tutorials to learn how to use OpenSearch features. 
+ +{% include cards.html cards=page.cards %} + diff --git a/_tutorials/reranking/index.md b/_tutorials/reranking/index.md new file mode 100644 index 00000000000..b2bae714186 --- /dev/null +++ b/_tutorials/reranking/index.md @@ -0,0 +1,53 @@ +--- +layout: default +title: Reranking search results +has_children: true +has_toc: false +nav_order: 20 +redirect_from: + - /vector-search/tutorials/reranking/ + - /tutorials/reranking/ +reranking: + - heading: Reranking search results using Cohere Rerank + link: /tutorials/reranking/reranking-cohere/ + list: + - "<b>Platform:</b> OpenSearch" + - "<b>Model:</b> Cohere Rerank" + - "<b>Deployment:</b> Provider API" + - heading: Reranking search results using Cohere Rerank on Amazon Bedrock + link: /tutorials/reranking/reranking-cohere-bedrock/ + list: + - "<b>Platform:</b> OpenSearch, Amazon OpenSearch Service" + - "<b>Model:</b> Cohere Rerank" + - "<b>Deployment:</b> Amazon Bedrock" + - heading: Reranking search results using Amazon Bedrock models + link: /tutorials/reranking/reranking-bedrock/ + list: + - "<b>Platform:</b> OpenSearch" + - "<b>Model:</b> Amazon Bedrock reranker models" + - "<b>Deployment:</b> Amazon Bedrock" + - heading: Reranking search results using a cross-encoder in Amazon SageMaker + link: /tutorials/reranking/reranking-cross-encoder/ + list: + - "<b>Platform:</b> OpenSearch" + - "<b>Model:</b> Hugging Face MS MARCO" + - "<b>Deployment:</b> Amazon SageMaker" + - heading: Reranking search results using a reranker in Amazon SageMaker + link: /tutorials/reranking/reranking-sagemaker/ + list: + - "<b>Platform:</b> OpenSearch, Amazon OpenSearch Service" + - "<b>Model:</b> Hugging Face BAAI/bge-reranker" + - "<b>Deployment:</b> Amazon SageMaker" + - heading: Reranking search results by a field + link: /tutorials/reranking/reranking-by-field/ + list: + - "<b>Platform:</b> OpenSearch, Amazon OpenSearch Service" + - "<b>Model:</b> Cohere Rerank" + - "<b>Deployment:</b> Provider API" +--- + +# Reranking search results tutorials + +The following machine learning (ML) tutorials show you how to implement search result reranking. For more information about reranking, see [Reranking search results]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/reranking-search-results/). + +{% include cards.html cards=page.reranking %} \ No newline at end of file diff --git a/_tutorials/reranking/reranking-bedrock.md b/_tutorials/reranking/reranking-bedrock.md new file mode 100644 index 00000000000..71cfe3c4645 --- /dev/null +++ b/_tutorials/reranking/reranking-bedrock.md @@ -0,0 +1,734 @@ +--- +layout: default +title: Reranking search results using Amazon Bedrock models +parent: Reranking search results +nav_order: 100 +redirect_from: + - /ml-commons-plugin/tutorials/reranking-bedrock/ + - /vector-search/tutorials/reranking/reranking-bedrock/ +--- + +# Reranking search results using Amazon Bedrock models + +A [reranking pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/reranking-search-results/) can rerank search results, providing a relevance score for each document in the search results with respect to the search query. The relevance score is calculated by a cross-encoder model. + +This tutorial shows you how to use the [Amazon Bedrock Rerank API](https://docs.aws.amazon.com/bedrock/latest/APIReference/API_agent-runtime_Rerank.html) to rerank search results using a model hosted on Amazon Bedrock. + +Replace the placeholders beginning with the prefix `your_` with your own values. 
+{: .note} + +## Prerequisite: Test the model on Amazon Bedrock + +Before using your model, test it on Amazon Bedrock. For supported reranker models, see [Supported Regions and models for reranking in Amazon Bedrock](https://docs.aws.amazon.com/bedrock/latest/userguide/rerank-supported.html). For model IDs, see [Supported foundation models in Amazon Bedrock](https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html). To perform a reranking test, use the following code: + +```python +import json +import boto3 +bedrock_region = "your_bedrock_model_region_like_us-west-2" +bedrock_agent_runtime_client = boto3.client("bedrock-agent-runtime", region_name=bedrock_region) + +model_id = "amazon.rerank-v1:0" + +response = bedrock_agent_runtime_client.rerank( + queries=[ + { + "textQuery": { + "text": "What is the capital city of America?", + }, + "type": "TEXT" + } + ], + rerankingConfiguration={ + "bedrockRerankingConfiguration": { + "modelConfiguration": { + "modelArn": f"arn:aws:bedrock:{bedrock_region}::foundation-model/{model_id}" + }, + }, + "type": "BEDROCK_RERANKING_MODEL" + }, + sources=[ + { + "inlineDocumentSource": { + "textDocument": { + "text": "Carson City is the capital city of the American state of Nevada.", + }, + "type": "TEXT" + }, + "type": "INLINE" + }, + { + "inlineDocumentSource": { + "textDocument": { + "text": "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.", + }, + "type": "TEXT" + }, + "type": "INLINE" + }, + { + "inlineDocumentSource": { + "textDocument": { + "text": "Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district.", + }, + "type": "TEXT" + }, + "type": "INLINE" + }, + { + "inlineDocumentSource": { + "textDocument": { + "text": "Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states." + }, + "type": "TEXT" + }, + "type": "INLINE" + }, + ] +) + +results = response["results"] +print(json.dumps(results, indent=2)) +``` +{% include copy.html %} + +The reranked results are ordered by the highest score: + +```json +[ + { + "index": 2, + "relevanceScore": 0.7711548805236816 + }, + { + "index": 0, + "relevanceScore": 0.0025114635936915874 + }, + { + "index": 1, + "relevanceScore": 2.4876489987946115e-05 + }, + { + "index": 3, + "relevanceScore": 6.339210358419223e-06 + } +] +``` + +To sort the results by index, use the following code: + +```python +print(json.dumps(sorted(results, key=lambda x: x['index']),indent=2)) +``` + +The following are the results sorted by index: + +```json +[ + { + "index": 0, + "relevanceScore": 0.0025114635936915874 + }, + { + "index": 1, + "relevanceScore": 2.4876489987946115e-05 + }, + { + "index": 2, + "relevanceScore": 0.7711548805236816 + }, + { + "index": 3, + "relevanceScore": 6.339210358419223e-06 + } +] +``` + +## Step 1: Create a connector and register the model + +To create a connector and register the model, use the following steps. + +### Step 1.1: Create a connector for the model + +First, create a connector for the model. 
+ +If you are using self-managed OpenSearch, supply your AWS credentials: + +```json +POST /_plugins/_ml/connectors/_create +{ + "name": "Amazon Bedrock Rerank API", + "description": "Test connector for Amazon Bedrock Rerank API", + "version": 1, + "protocol": "aws_sigv4", + "credential": { + "access_key": "your_access_key", + "secret_key": "your_secret_key", + "session_token": "your_session_token" + }, + "parameters": { + "service_name": "bedrock", + "endpoint": "bedrock-agent-runtime", + "region": "your_bedrock_model_region_like_us-west-2", + "api_name": "rerank", + "model_id": "amazon.rerank-v1:0" + }, + "actions": [ + { + "action_type": "PREDICT", + "method": "POST", + "url": "https://${parameters.endpoint}.${parameters.region}.amazonaws.com/${parameters.api_name}", + "headers": { + "x-amz-content-sha256": "required", + "content-type": "application/json" + }, + "pre_process_function": "connector.pre_process.bedrock.rerank", + "request_body": """ + { + "queries": ${parameters.queries}, + "rerankingConfiguration": { + "bedrockRerankingConfiguration": { + "modelConfiguration": { + "modelArn": "arn:aws:bedrock:${parameters.region}::foundation-model/${parameters.model_id}" + } + }, + "type": "BEDROCK_RERANKING_MODEL" + }, + "sources": ${parameters.sources} + } + """, + "post_process_function": "connector.post_process.bedrock.rerank" + } + ] +} +``` +{% include copy-curl.html %} + +If you are using Amazon OpenSearch Service, you can provide an AWS Identity and Access Management (IAM) role Amazon Resource Name (ARN) that allows access to Amazon Bedrock. For more information, see the [AWS documentation](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/ml-amazon-connector.html). Use the following request to create a connector: + +```json +POST /_plugins/_ml/connectors/_create +{ + "name": "Amazon Bedrock Rerank API", + "description": "Test connector for Amazon Bedrock Rerank API", + "version": 1, + "protocol": "aws_sigv4", + "credential": { + "roleArn": "your_role_arn_which_allows_access_to_bedrock_agent_runtime_rerank_api" + }, + "parameters": { + "service_name": "bedrock", + "endpoint": "bedrock-agent-runtime", + "region": "your_bedrock_model_region_like_us-west-2", + "api_name": "rerank", + "model_id": "amazon.rerank-v1:0" + }, + "actions": [ + { + "action_type": "PREDICT", + "method": "POST", + "url": "https://${parameters.endpoint}.${parameters.region}.amazonaws.com/${parameters.api_name}", + "headers": { + "x-amz-content-sha256": "required", + "content-type": "application/json" + }, + "pre_process_function": "connector.pre_process.bedrock.rerank", + "request_body": """ + { + "queries": ${parameters.queries}, + "rerankingConfiguration": { + "bedrockRerankingConfiguration": { + "modelConfiguration": { + "modelArn": "arn:aws:bedrock:${parameters.region}::foundation-model/${parameters.model_id}" + } + }, + "type": "BEDROCK_RERANKING_MODEL" + }, + "sources": ${parameters.sources} + } + """, + "post_process_function": "connector.post_process.bedrock.rerank" + } + ] +} +``` +{% include copy-curl.html %} + +### Step 1.2: Register and deploy the model + +Use the connector ID from the response to register and deploy the model: + +```json +POST /_plugins/_ml/models/_register?deploy=true +{ + "name": "Amazon Bedrock Rerank API", + "function_name": "remote", + "description": "test Amazon Bedrock Rerank API", + "connector_id": "your_connector_id" +} +``` +{% include copy-curl.html %} + +Note the model ID in the response; you'll use it in the following steps. 
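+
+Optionally, before testing the model, you can confirm that it deployed successfully by retrieving it and checking that the `model_state` field in the response is `DEPLOYED`:
+
+```json
+GET /_plugins/_ml/models/your_model_id
+```
+{% include copy-curl.html %}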
+ +### Step 1.3: Test the model + +Test the model by using the Predict API: + +```json +POST _plugins/_ml/_predict/text_similarity/your_model_id +{ + "query_text": "What is the capital city of America?", + "text_docs": [ + "Carson City is the capital city of the American state of Nevada.", + "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.", + "Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district.", + "Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states." + ] +} +``` +{% include copy-curl.html %} + +Alternatively, you can test the model using the following query. This query bypasses the `pre_process_function` and calls the Rerank API directly: + +```json +POST _plugins/_ml/models/your_model_id/_predict +{ + "parameters": { + "queries": [ + { + "textQuery": { + "text": "What is the capital city of America?" + }, + "type": "TEXT" + } + ], + "sources": [ + { + "inlineDocumentSource": { + "textDocument": { + "text": "Carson City is the capital city of the American state of Nevada." + }, + "type": "TEXT" + }, + "type": "INLINE" + }, + { + "inlineDocumentSource": { + "textDocument": { + "text": "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan." + }, + "type": "TEXT" + }, + "type": "INLINE" + }, + { + "inlineDocumentSource": { + "textDocument": { + "text": "Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district." + }, + "type": "TEXT" + }, + "type": "INLINE" + }, + { + "inlineDocumentSource": { + "textDocument": { + "text": "Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states." + }, + "type": "TEXT" + }, + "type": "INLINE" + } + ] + } +} +``` +{% include copy-curl.html %} + +The connector `pre_process_function` transforms the input into the format required by the Predict API `parameters`. + +By default, the Amazon Bedrock Rerank API output is formatted as follows: + +```json +[ + { + "index": 2, + "relevanceScore": 0.7711548724998493 + }, + { + "index": 0, + "relevanceScore": 0.0025114635138098534 + }, + { + "index": 1, + "relevanceScore": 2.4876490010363496e-05 + }, + { + "index": 3, + "relevanceScore": 6.339210403977635e-06 + } +] +``` + +The connector `post_process_function` transforms the model's output into a format that the [Reranker processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/) can interpret and orders the results by index. + +The response contains four `similarity` outputs. For each `similarity` output, the `data` array contains a relevance score for each document against the query. 
The `similarity` outputs are provided in the order of the input documents; the first similarity result pertains to the first document: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "similarity", + "data_type": "FLOAT32", + "shape": [ + 1 + ], + "data": [ + 0.0025114636 + ] + }, + { + "name": "similarity", + "data_type": "FLOAT32", + "shape": [ + 1 + ], + "data": [ + 2.487649e-05 + ] + }, + { + "name": "similarity", + "data_type": "FLOAT32", + "shape": [ + 1 + ], + "data": [ + 0.7711549 + ] + }, + { + "name": "similarity", + "data_type": "FLOAT32", + "shape": [ + 1 + ], + "data": [ + 6.3392104e-06 + ] + } + ], + "status_code": 200 + } + ] +} +``` + +## Step 2: Create a reranking pipeline + +To create a reranking pipeline, use the following steps. + +### Step 2.1: Ingest test data + +Use the following request to ingest data into your index: + +```json +POST _bulk +{ "index": { "_index": "my-test-data" } } +{ "passage_text" : "Carson City is the capital city of the American state of Nevada." } +{ "index": { "_index": "my-test-data" } } +{ "passage_text" : "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan." } +{ "index": { "_index": "my-test-data" } } +{ "passage_text" : "Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district." } +{ "index": { "_index": "my-test-data" } } +{ "passage_text" : "Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states." } +``` +{% include copy-curl.html %} + +### Step 2.2: Create a reranking pipeline + +Create a reranking pipeline using the Amazon Bedrock reranking model: + +```json +PUT /_search/pipeline/rerank_pipeline_bedrock +{ + "description": "Pipeline for reranking with Bedrock rerank model", + "response_processors": [ + { + "rerank": { + "ml_opensearch": { + "model_id": "your_model_id_created_in_step1" + }, + "context": { + "document_fields": ["passage_text"] + } + } + } + ] +} +``` +{% include copy-curl.html %} + +If you provide multiple field names in `document_fields`, the values of all fields are first concatenated, after which reranking is performed. +{: .note} + +### Step 2.3: Test reranking + +First, test the query without using the reranking pipeline: + +```json +POST my-test-data/_search +{ + "query": { + "match": { + "passage_text": "What is the capital city of America?" + } + }, + "highlight": { + "pre_tags": ["<strong>"], + "post_tags": ["</strong>"], + "fields": {"passage_text": {}} + }, + "_source": false, + "fields": ["passage_text"] +} +``` +{% include copy-curl.html %} + +The first document in the response is `Carson City is the capital city of the American state of Nevada`, which is incorrect: + +```json +{ + "took": 2, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4, + "relation": "eq" + }, + "max_score": 2.5045562, + "hits": [ + { + "_index": "my-test-data", + "_id": "1", + "_score": 2.5045562, + "fields": { + "passage_text": [ + "Carson City is the capital city of the American state of Nevada." 
+ ] + }, + "highlight": { + "passage_text": [ + "Carson <strong>City</strong> <strong>is</strong> <strong>the</strong> <strong>capital</strong> <strong>city</strong> <strong>of</strong> <strong>the</strong> American state <strong>of</strong> Nevada." + ] + } + }, + { + "_index": "my-test-data", + "_id": "2", + "_score": 0.5807494, + "fields": { + "passage_text": [ + "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan." + ] + }, + "highlight": { + "passage_text": [ + "<strong>The</strong> Commonwealth <strong>of</strong> <strong>the</strong> Northern Mariana Islands <strong>is</strong> a group <strong>of</strong> islands in <strong>the</strong> Pacific Ocean.", + "Its <strong>capital</strong> <strong>is</strong> Saipan." + ] + } + }, + { + "_index": "my-test-data", + "_id": "3", + "_score": 0.5261191, + "fields": { + "passage_text": [ + "Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district." + ] + }, + "highlight": { + "passage_text": [ + "(also known as simply Washington or D.C., and officially as <strong>the</strong> District <strong>of</strong> Columbia) <strong>is</strong> <strong>the</strong> <strong>capital</strong>", + "<strong>of</strong> <strong>the</strong> United States.", + "It <strong>is</strong> a federal district." + ] + } + }, + { + "_index": "my-test-data", + "_id": "4", + "_score": 0.5083029, + "fields": { + "passage_text": [ + "Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states." + ] + }, + "highlight": { + "passage_text": [ + "<strong>Capital</strong> punishment (<strong>the</strong> death penalty) has existed in <strong>the</strong> United States since beforethe United States", + "As <strong>of</strong> 2017, <strong>capital</strong> punishment <strong>is</strong> legal in 30 <strong>of</strong> <strong>the</strong> 50 states." + ] + } + } + ] + } +} +``` + +Next, test the query using the reranking pipeline: + +```json +POST my-test-data/_search?search_pipeline=rerank_pipeline_bedrock +{ + "query": { + "match": { + "passage_text": "What is the capital city of America?" + } + }, + "ext": { + "rerank": { + "query_context": { + "query_text": "What is the capital city of America?" + } + } + }, + "highlight": { + "pre_tags": ["<strong>"], + "post_tags": ["</strong>"], + "fields": {"passage_text": {}} + }, + "_source": false, + "fields": ["passage_text"] +} +``` +{% include copy-curl.html %} + +The first document in the response is `"Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district."`, which is correct: + +```json +{ + "took": 2, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4, + "relation": "eq" + }, + "max_score": 0.7711549, + "hits": [ + { + "_index": "my-test-data", + "_id": "3", + "_score": 0.7711549, + "fields": { + "passage_text": [ + "Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district." 
+ ] + }, + "highlight": { + "passage_text": [ + "(also known as simply Washington or D.C., and officially as <strong>the</strong> District <strong>of</strong> Columbia) <strong>is</strong> <strong>the</strong> <strong>capital</strong>", + "<strong>of</strong> <strong>the</strong> United States.", + "It <strong>is</strong> a federal district." + ] + } + }, + { + "_index": "my-test-data", + "_id": "1", + "_score": 0.0025114636, + "fields": { + "passage_text": [ + "Carson City is the capital city of the American state of Nevada." + ] + }, + "highlight": { + "passage_text": [ + "Carson <strong>City</strong> <strong>is</strong> <strong>the</strong> <strong>capital</strong> <strong>city</strong> <strong>of</strong> <strong>the</strong> American state <strong>of</strong> Nevada." + ] + } + }, + { + "_index": "my-test-data", + "_id": "2", + "_score": 02.487649e-05, + "fields": { + "passage_text": [ + "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan." + ] + }, + "highlight": { + "passage_text": [ + "<strong>The</strong> Commonwealth <strong>of</strong> <strong>the</strong> Northern Mariana Islands <strong>is</strong> a group <strong>of</strong> islands in <strong>the</strong> Pacific Ocean.", + "Its <strong>capital</strong> <strong>is</strong> Saipan." + ] + } + }, + { + "_index": "my-test-data", + "_id": "4", + "_score": 6.3392104e-06, + "fields": { + "passage_text": [ + "Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states." + ] + }, + "highlight": { + "passage_text": [ + "<strong>Capital</strong> punishment (<strong>the</strong> death penalty) has existed in <strong>the</strong> United States since beforethe United States", + "As <strong>of</strong> 2017, <strong>capital</strong> punishment <strong>is</strong> legal in 30 <strong>of</strong> <strong>the</strong> 50 states." + ] + } + } + ] + }, + "profile": { + "shards": [] + } +} +``` + +You can reuse the same query by specifying the `query_text_path` instead of `query_text`: + +```json +POST my-test-data/_search?search_pipeline=rerank_pipeline_bedrock +{ + "query": { + "match": { + "passage_text": "What is the capital city of America?" + } + }, + "ext": { + "rerank": { + "query_context": { + "query_text_path": "query.match.passage_text.query" + } + } + }, + "highlight": { + "pre_tags": ["<strong>"], + "post_tags": ["</strong>"], + "fields": {"passage_text": {}} + }, + "_source": false, + "fields": ["passage_text"] +} +``` +{% include copy-curl.html %} + diff --git a/_tutorials/reranking/reranking-by-field.md b/_tutorials/reranking/reranking-by-field.md new file mode 100644 index 00000000000..7c6e6d173dd --- /dev/null +++ b/_tutorials/reranking/reranking-by-field.md @@ -0,0 +1,549 @@ +--- +layout: default +title: Reranking search results by a field +parent: Reranking search results +nav_order: 120 +redirect_from: + - /ml-commons-plugin/tutorials/reranking-cohere/ + - /vector-search/tutorials/reranking/reranking-by-field/ +--- + +# Reranking search results by a field + +Starting with OpenSearch 2.18, you can rerank search [results by a field]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/#the-by_field-rerank-type). This feature is useful when your documents include a field that is particularly important or when you want to rerank results from an externally hosted model. 
For more information, see [Reranking search results by a field]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/rerank-by-field/). + +This tutorial explains how to use the [Cohere Rerank](https://docs.cohere.com/reference/rerank-1) model to rerank search results by a field in self-managed OpenSearch and in [Amazon OpenSearch Service](https://docs.aws.amazon.com/opensearch-service/). + +Replace the placeholders beginning with the prefix `your_` with your own values. +{: .note} + +## Step 1 (self-managed OpenSearch): Create a connector + +To create a connector, send the following request: + +```json +POST /_plugins/_ml/connectors/_create +{ + "name": "cohere-rerank", + "description": "The connector to Cohere reanker model", + "version": "1", + "protocol": "http", + "credential": { + "cohere_key": "your_cohere_api_key" + }, + "parameters": { + "model": "rerank-english-v3.0", + "return_documents": true + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "https://api.cohere.ai/v1/rerank", + "headers": { + "Authorization": "Bearer ${credential.cohere_key}" + }, + "request_body": "{ \"documents\": ${parameters.documents}, \"query\": \"${parameters.query}\", \"model\": \"${parameters.model}\", \"top_n\": ${parameters.top_n}, \"return_documents\": ${parameters.return_documents} }" + } + ] +} +``` +{% include copy-curl.html %} + +The response contains the connector ID: + +```json +{"connector_id":"qp2QP40BWbTmLN9Fpo40"} +``` + +Note the connector ID; you'll use it in the following steps. Then go to [Step 2](#step-2-register-the-cohere-rerank-model). + +## Step 1 (Amazon OpenSearch Service): Create a connector + +Follow these steps to create a connector using Amazon OpenSearch Service. + +### Prerequisite: Create an OpenSearch cluster + +Go to the [Amazon OpenSearch Service console](https://console.aws.amazon.com/aos/home) and create an OpenSearch domain. + +Note the domain Amazon Resource Name (ARN) and URL; you'll use them in the following steps. + +### Step 1.1: Store the API key in AWS Secrets Manager + +Store your Cohere API key in [AWS Secrets Manager](https://docs.aws.amazon.com/secretsmanager/latest/userguide/intro.html): + +1. Open AWS Secrets Manager. +1. Select **Store a new secret**. +1. Select **Other type of secret**. +1. Create a key-value pair with **my_cohere_key** as the key and your Cohere API key as the value. +1. Name your secret `my_test_cohere_secret`. + +Note the secret ARN; you'll use it in the following steps. + +### Step 1.2: Create an IAM role + +To use the secret created in Step 1, you must create an AWS Identity and Access Management (IAM) role with read permissions for the secret. This IAM role will be configured in the connector and will allow the connector to read the secret. + +Go to the IAM console, create a new IAM role named `my_cohere_secret_role`, and add the following trust policy and permissions: + +- Custom trust policy: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": "es.amazonaws.com" + }, + "Action": "sts:AssumeRole" + } + ] +} +``` +{% include copy.html %} + +- Permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Action": [ + "secretsmanager:GetSecretValue", + "secretsmanager:DescribeSecret" + ], + "Effect": "Allow", + "Resource": "your_secret_arn_created_in_step1" + } + ] +} +``` +{% include copy.html %} + +Note the role ARN; you'll use it in the following steps. 
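+
+If you prefer the AWS CLI to the console, the following command is a minimal sketch of the Step 1.1 secret creation. The secret name and key match the console steps above; replace the value with your actual Cohere API key:
+
+```bash
+# Store the Cohere API key as a key-value secret; the output includes the secret ARN
+aws secretsmanager create-secret \
+  --name my_test_cohere_secret \
+  --secret-string '{"my_cohere_key":"your_cohere_api_key"}'
+```
+{% include copy.html %}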
+ +### Step 1.3: Configure an IAM role in Amazon OpenSearch Service + +Follow these steps to configure an IAM role in Amazon OpenSearch Service. + +#### Step 1.3.1: Create an IAM role for signing connector requests + +Generate a new IAM role specifically for signing your Create Connector API request. + +Create an IAM role named `my_create_cohere_connector_role` with the following trust policy and permissions: + +- Custom trust policy: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "AWS": "your_iam_user_arn" + }, + "Action": "sts:AssumeRole" + } + ] +} +``` +{% include copy.html %} + +You'll use the `your_iam_user_arn` IAM user to assume the role in Step 4.1. + +- Permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": "iam:PassRole", + "Resource": "your_iam_role_arn_created_in_step2" + }, + { + "Effect": "Allow", + "Action": "es:ESHttpPost", + "Resource": "your_opensearch_domain_arn_created_in_step0" + } + ] +} +``` +{% include copy.html %} + +Note this role ARN; you'll use it in the following steps. + +#### Step 1.3.2: Map a backend role + +Follow these steps to map a backend role: + +1. Log in to OpenSearch Dashboards and select **Security** on the top menu. +2. Select **Roles**, and then select the **ml_full_access** role. +3. On the **ml_full_access** role details page, select **Mapped users**, and then select **Manage mapping**. +4. Enter the IAM role ARN created in Step 3.1 in the **Backend roles** field, as shown in the following image. + ![Mapping a backend role]({{site.url}}{{site.baseurl}}/images/vector-search-tutorials/mapping_iam_role_arn.png) +4. Select **Map**. + +The IAM role is now successfully configured in your OpenSearch cluster. + +## Step 1.4: Create a connector + +Follow these steps to create a connector for the model. For more information about creating a connector, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). 
+ +### Step 1.4.1: Get temporary credentials + +Use the credentials of the IAM user specified in Step 3.1 to assume the role: + +```bash +aws sts assume-role --role-arn your_iam_role_arn_created_in_step3.1 --role-session-name your_session_name +``` + +{% include copy.html %} + +Copy the temporary credentials from the response and configure them in `~/.aws/credentials`: + +```ini +[default] +AWS_ACCESS_KEY_ID=your_access_key_of_role_created_in_step3.1 +AWS_SECRET_ACCESS_KEY=your_secret_key_of_role_created_in_step3.1 +AWS_SESSION_TOKEN=your_session_token_of_role_created_in_step3.1 +``` +{% include copy.html %} + +### Step 1.4.2: Create a connector + +Run the following Python code with the temporary credentials configured in `~/.aws/credentials`: + +```python +import boto3 +import requests +from requests_aws4auth import AWS4Auth + +host = 'your_amazon_opensearch_domain_endpoint_created_in_step0' +region = 'your_amazon_opensearch_domain_region' +service = 'es' + +credentials = boto3.Session().get_credentials() +awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service, session_token=credentials.token) + +path = '/_plugins/_ml/connectors/_create' +url = host + path + +payload = { + "name": "cohere-rerank", + "description": "The connector to Cohere reanker model", + "version": "1", + "protocol": "http", + "credential": { + "secretArn": "your_secret_arn_created_in_step1", + "roleArn": "your_iam_role_arn_created_in_step2" + }, + "parameters": { + "model": "rerank-english-v3.0", + "return_documents": true + + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "https://api.cohere.ai/v1/rerank", + "headers": { + "Authorization": "Bearer ${credential.secretArn.my_cohere_key}" + }, + "request_body": "{ \"documents\": ${parameters.documents}, \"query\": \"${parameters.query}\", \"model\": \"${parameters.model}\", \"top_n\": ${parameters.top_n}, \"return_documents\": ${parameters.return_documents} }" + } + ] +} + +headers = {"Content-Type": "application/json"} + +r = requests.post(url, auth=awsauth, json=payload, headers=headers) +print(r.text) +``` +{% include copy.html %} + +The script outputs a connector ID: + +```json +{"connector_id":"qp2QP40BWbTmLN9Fpo40"} +``` + +Note the connector ID; you'll use it in the next step. + +## Step 2: Register the Cohere Rerank model + +After successfully creating a connector using either the self-managed OpenSearch or Amazon OpenSearch Service method, you can register the Cohere Rerank model. + +Use the connector ID from Step 1 to create a model: + +```json +POST /_plugins/_ml/models/_register?deploy=true +{ + "name": "cohere rerank model", + "function_name": "remote", + "description": "test rerank model", + "connector_id": "your_connector_id" +} +``` +{% include copy-curl.html %} + +Note the connector ID; you'll use it in the following steps. 
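+
+The response contains both a task ID and a model ID; the model ID is the value that you'll pass to the Predict API in the next step. The response is similar to the following:
+
+```json
+{
+  "task_id": "your_task_id",
+  "status": "CREATED",
+  "model_id": "your_model_id"
+}
+```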
+ +# Step 3: Test the model + +To test the model, send the following request: + +```json +POST /_plugins/_ml/models/your_model_id/_predict +{ + "parameters": { + "top_n" : 100, + "query": "What day is it?", + "documents" : ["Monday", "Tuesday", "apples"] + } +} +``` +{% include copy-curl.html %} + +The response contains the matching documents: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "dataAsMap": { + "id": "e15a3922-3d89-4adc-96cf-9b85a619fb66", + "results": [ + { + "document": { + "text": "Monday" + }, + "index": 0.0, + "relevance_score": 0.21076629 + }, + { + "document": { + "text": "Tuesday" + }, + "index": 1.0, + "relevance_score": 0.13206616 + }, + { + "document": { + "text": "apples" + }, + "index": 2.0, + "relevance_score": 1.0804956E-4 + } + ], + "meta": { + "api_version": { + "version": "1" + }, + "billed_units": { + "search_units": 1.0 + } + } + } + } + ], + "status_code": 200 + } + ] +} +``` + +For each document, a score is assigned by the rerank model. Now you'll create a search pipeline that invokes the Cohere model and reorders the search results based on their relevance score. + +## Step 3: Rerank the search results + +Follow these steps to rerank the search results. + +### Step 3.1: Create an index + +To create an index, send the following request: + +```json +POST _bulk +{ "index": { "_index": "nyc_facts", "_id": 1 } } +{ "fact_title": "Population of New York", "fact_description": "New York City has an estimated population of over 8.3 million people as of 2023, making it the most populous city in the United States." } +{ "index": { "_index": "nyc_facts", "_id": 2 } } +{ "fact_title": "Statue of Liberty", "fact_description": "The Statue of Liberty, a symbol of freedom, was gifted to the United States by France in 1886 and stands on Liberty Island in New York Harbor." } +{ "index": { "_index": "nyc_facts", "_id": 3 } } +{ "fact_title": "New York City is a Global Financial Hub", "fact_description": "New York City is home to the New York Stock Exchange (NYSE) and Wall Street, which are central to the global finance industry." } +{ "index": { "_index": "nyc_facts", "_id": 4 } } +{ "fact_title": "Broadway", "fact_description": "Broadway is a major thoroughfare in New York City known for its theaters. It's also considered the birthplace of modern American theater and musicals." } +{ "index": { "_index": "nyc_facts", "_id": 5 } } +{ "fact_title": "Central Park", "fact_description": "Central Park, located in Manhattan, spans 843 acres and is one of the most visited urban parks in the world, offering green spaces, lakes, and recreational areas." } +{ "index": { "_index": "nyc_facts", "_id": 6 } } +{ "fact_title": "Empire State Building", "fact_description": "The Empire State Building, completed in 1931, is an iconic Art Deco skyscraper that was the tallest building in the world until 1970." } +{ "index": { "_index": "nyc_facts", "_id": 7 } } +{ "fact_title": "Times Square", "fact_description": "Times Square, often called 'The Cross-roads of the World,' is known for its bright lights, Broadway theaters, and New Year's Eve ball drop." } +{ "index": { "_index": "nyc_facts", "_id": 8 } } +{ "fact_title": "Brooklyn Bridge", "fact_description": "The Brooklyn Bridge, completed in 1883, connects Manhattan and Brooklyn and was the first suspension bridge to use steel in its construction." 
} +{ "index": { "_index": "nyc_facts", "_id": 9 } } +{ "fact_title": "New York City Public Library", "fact_description": "The New York Public Library, founded in 1895, has over 50 million items in its collections and serves as a major cultural and educational resource." } +{ "index": { "_index": "nyc_facts", "_id": 10 } } +{ "fact_title": "New York's Chinatown", "fact_description": "New York's Chinatown, one of the largest in the world, is known for its vibrant culture, food, and history. It plays a key role in the city's Chinese community." } +``` +{% include copy-curl.html %} + +### Step 3.2: Create a reranking pipeline + +To create a reranking pipeline, send the following request: + +```json +PUT /_search/pipeline/cohere_pipeline +{ + "response_processors": [ + { + "ml_inference": { + "model_id": "your_model_id", + "input_map": { + "documents": "fact_description", + "query": "_request.ext.query_context.query_text", + "top_n": "_request.ext.query_context.top_n" + }, + "output_map": { + "relevance_score": "results[*].relevance_score", + "description": "results[*].document.text" + }, + "full_response_path": false, + "ignore_missing": false, + "ignore_failure": false, + "one_to_one": false, + "override": false, + "model_config": {} + } + }, + { + "rerank": { + "by_field": { + "target_field": "relevance_score", + "remove_target_field": false, + "keep_previous_score": false, + "ignore_failure": false + } + } + } + ] +} +``` +{% include copy-curl.html %} + +### Step 3.3: Test the pipeline + +To test the pipeline, send a query related to the indexed documents and set `top_n` to a value greater than or equal to `size`: + +```json +GET nyc_facts/_search?search_pipeline=cohere_pipeline +{ + "query": { + "match_all": {} + }, + "size": 5, + "ext": { + "rerank": { + "query_context": { + "query_text": "Where do people go to see a show?", + "top_n" : "10" + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains the reranked documents: + +```json +{ + "took": 5, + "timed_out": false, + "_shards": { + "total": 5, + "successful": 5, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 10, + "relation": "eq" + }, + "max_score": 0.34986588, + "hits": [ + { + "_index": "nyc_facts", + "_id": "_7a76b04b5016c71c", + "_score": 0.34986588, + "_source": { + "result_document": "Broadway is a major thoroughfare in New York City known for its theaters. 
It's also considered the birthplace of modern American theater and musicals.", + "fact_title": "Times Square", + "fact_description": "Times Square, often called 'The Cross-roads of the World,' is known for its bright lights, Broadway theaters, and New Year's Eve ball drop.", + "relevance_score": 0.34986588 + } + }, + { + "_index": "nyc_facts", + "_id": "_00c26e453971ed68", + "_score": 0.1066906, + "_source": { + "result_document": "Times Square, often called 'The Cross-roads of the World,' is known for its bright lights, Broadway theaters, and New Year's Eve ball drop.", + "fact_title": "New York City Public Library", + "fact_description": "The New York Public Library, founded in 1895, has over 50 million items in its collections and serves as a major cultural and educational resource.", + "relevance_score": 0.1066906 + } + }, + { + "_index": "nyc_facts", + "_id": "_d03d3610a5a5bd82", + "_score": 0.00019563535, + "_source": { + "result_document": "The New York Public Library, founded in 1895, has over 50 million items in its collections and serves as a major cultural and educational resource.", + "fact_title": "Broadway", + "fact_description": "Broadway is a major thoroughfare in New York City known for its theaters. It's also considered the birthplace of modern American theater and musicals.", + "relevance_score": 0.00019563535 + } + }, + { + "_index": "nyc_facts", + "_id": "_9284bae64eab7f63", + "_score": 0.000019988918, + "_source": { + "result_document": "The Statue of Liberty, a symbol of freedom, was gifted to the United States by France in 1886 and stands on Liberty Island in New York Harbor.", + "fact_title": "Brooklyn Bridge", + "fact_description": "The Brooklyn Bridge, completed in 1883, connects Manhattan and Brooklyn and was the first suspension bridge to use steel in its construction.", + "relevance_score": 0.000019988918 + } + }, + { + "_index": "nyc_facts", + "_id": "_7aa6f2934f47911b", + "_score": 0.0000104515475, + "_source": { + "result_document": "The Brooklyn Bridge, completed in 1883, connects Manhattan and Brooklyn and was the first suspension bridge to use steel in its construction.", + "fact_title": "Statue of Liberty", + "fact_description": "The Statue of Liberty, a symbol of freedom, was gifted to the United States by France in 1886 and stands on Liberty Island in New York Harbor.", + "relevance_score": 0.0000104515475 + } + } + ] + }, + "profile": { + "shards": [] + } +} +``` + +When evaluating the reranked results, focus on the `result_document` field and its corresponding `relevance_score`. The `fact_description` field shows the original document text and does not reflect the reranking order. +{: .note} diff --git a/_tutorials/reranking/reranking-cohere-bedrock.md b/_tutorials/reranking/reranking-cohere-bedrock.md new file mode 100644 index 00000000000..434dc406794 --- /dev/null +++ b/_tutorials/reranking/reranking-cohere-bedrock.md @@ -0,0 +1,711 @@ +--- +layout: default +title: Reranking search results using Cohere Rerank on Amazon Bedrock +parent: Reranking search results +nav_order: 95 +redirect_from: + - /vector-search/tutorials/reranking/reranking-cohere-bedrock/ +--- + +# Reranking search results using Cohere Rerank on Amazon Bedrock + +This tutorial shows you how to implement search result reranking in [Amazon OpenSearch Service](https://docs.aws.amazon.com/opensearch-service/) and self-managed OpenSearch using the [Cohere Rerank model](https://docs.aws.amazon.com/bedrock/latest/userguide/rerank-supported.html) hosted on Amazon Bedrock. 
+ +A [reranking pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/reranking-search-results/) can rerank search results, providing a relevance score for each document in the search results with respect to the search query. The relevance score is calculated by a cross-encoder model. + +Replace the placeholders beginning with the prefix `your_` with your own values. +{: .note} + +## Prerequisites: Test the model on Amazon Bedrock + +Before using your model, test it on Amazon Bedrock using the following code: + +```python +import json +import boto3 +bedrock_region = "your_bedrock_model_region_like_us-west-2" +bedrock_runtime_client = boto3.client("bedrock-runtime", region_name=bedrock_region) + +modelId = "cohere.rerank-v3-5:0" +contentType = "application/json" +accept = "*/*" + +body = json.dumps({ + "query": "What is the capital city of America?", + "documents": [ + "Carson City is the capital city of the American state of Nevada.", + "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.", + "Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district.", + "Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states." + ], + "api_version": 2 +}) + +response = bedrock_runtime_client.invoke_model( + modelId=modelId, + contentType=contentType, + accept=accept, + body=body +) +results = json.loads(response.get('body').read())["results"] +print(json.dumps(results, indent=2)) +``` +{% include copy.html %} + +The response contains the reranking results ordered by relevance score: + +```json +[ + { + "index": 2, + "relevance_score": 0.7190094 + }, + { + "index": 0, + "relevance_score": 0.32418242 + }, + { + "index": 1, + "relevance_score": 0.07456104 + }, + { + "index": 3, + "relevance_score": 0.06124987 + } +] +``` + +To sort the results by index, use the following code: + +```python +print(json.dumps(sorted(results, key=lambda x: x['index']), indent=2)) +``` +{% include copy.html %} + +The sorted results are as follows: + +```json +[ + { + "index": 0, + "relevance_score": 0.32418242 + }, + { + "index": 1, + "relevance_score": 0.07456104 + }, + { + "index": 2, + "relevance_score": 0.7190094 + }, + { + "index": 3, + "relevance_score": 0.06124987 + } +] +``` + +## Step 1: Create a connector and register the model + +To create a connector for the model, send the following request. + +If you are using self-managed OpenSearch, supply your AWS credentials: + +```json +POST /_plugins/_ml/connectors/_create +{ + "name": "Amazon Bedrock Cohere rerank model", + "description": "Test connector for Amazon Bedrock Cohere rerank model", + "version": 1, + "protocol": "aws_sigv4", + "credential": { + "access_key": "your_access_key", + "secret_key": "your_secret_key", + "session_token": "your_session_token" + }, + "parameters": { + "service_name": "bedrock", + "endpoint": "bedrock-runtime", + "region": "your_bedrock_model_region_like_us-west-2", + "model_name": "cohere.rerank-v3-5:0", + "api_version": 2 + }, + "actions": [ + { + "action_type": "PREDICT", + "method": "POST", + "url": "https://${parameters. 
endpoint}.${parameters.region}.amazonaws.com/model/${parameters.model_name}/invoke", + "headers": { + "x-amz-content-sha256": "required", + "content-type": "application/json" + }, + "pre_process_function": """ + def query_text = params.query_text; + def text_docs = params.text_docs; + def textDocsBuilder = new StringBuilder('['); + for (int i=0; i<text_docs.length; i++) { + textDocsBuilder.append('"'); + textDocsBuilder.append(text_docs[i]); + textDocsBuilder.append('"'); + if (i<text_docs.length - 1) { + textDocsBuilder.append(','); + } + } + textDocsBuilder.append(']'); + def parameters = '{ "query": "' + query_text + '", "documents": ' + textDocsBuilder.toString() + ' }'; + return '{"parameters": ' + parameters + '}'; + """, + "request_body": """ + { + "documents": ${parameters.documents}, + "query": "${parameters.query}", + "api_version": ${parameters.api_version} + } + """, + "post_process_function": """ + if (params.results == null || params.results.length == 0) { + throw new IllegalArgumentException("Post process function input is empty."); + } + def outputs = params.results; + def relevance_scores = new Double[outputs.length]; + for (int i=0; i<outputs.length; i++) { + def index = new BigDecimal(outputs[i].index.toString()).intValue(); + relevance_scores[index] = outputs[i].relevance_score; + } + def resultBuilder = new StringBuilder('['); + for (int i=0; i<relevance_scores.length; i++) { + resultBuilder.append(' {"name": "similarity", "data_type": "FLOAT32", "shape": [1],'); + resultBuilder.append('"data": ['); + resultBuilder.append(relevance_scores[i]); + resultBuilder.append(']}'); + if (i<outputs.length - 1) { + resultBuilder.append(','); + } + } + resultBuilder.append(']'); + return resultBuilder.toString(); + """ + } + ] +} +``` +{% include copy-curl.html %} + +If you are using Amazon OpenSearch Service, you can provide an AWS Identity and Access Management (IAM) role Amazon Resource Name (ARN) that allows access to Amazon Bedrock: + +```json +POST /_plugins/_ml/connectors/_create +{ + "name": "Amazon Bedrock Cohere rerank model", + "description": "Test connector for Amazon Bedrock Cohere rerank model", + "version": 1, + "protocol": "aws_sigv4", + "credential": { + "roleArn": "your_role_arn_which_allows_access_to_bedrock_model" + }, + "parameters": { + "service_name": "bedrock", + "endpoint": "bedrock-runtime", + "region": "your_bedrock_model_region_like_us-west-2", + "model_name": "cohere.rerank-v3-5:0", + "api_version": 2 +}, + "actions": [ + { + "action_type": "PREDICT", + "method": "POST", + "url": "https://${parameters. 
endpoint}.${parameters.region}.amazonaws.com/model/${parameters.model_name}/invoke", + "headers": { + "x-amz-content-sha256": "required", + "content-type": "application/json" + }, + "pre_process_function": """ + def query_text = params.query_text; + def text_docs = params.text_docs; + def textDocsBuilder = new StringBuilder('['); + for (int i=0; i<text_docs.length; i++) { + textDocsBuilder.append('"'); + textDocsBuilder.append(text_docs[i]); + textDocsBuilder.append('"'); + if (i<text_docs.length - 1) { + textDocsBuilder.append(','); + } + } + textDocsBuilder.append(']'); + def parameters = '{ "query": "' + query_text + '", "documents": ' + textDocsBuilder.toString() + ' }'; + return '{"parameters": ' + parameters + '}'; + """, + "request_body": """ + { + "documents": ${parameters.documents}, + "query": "${parameters.query}", + "api_version": ${parameters.api_version} + } + """, + "post_process_function": """ + if (params.results == null || params.results.length == 0) { + throw new IllegalArgumentException("Post process function input is empty."); + } + def outputs = params.results; + def relevance_scores = new Double[outputs.length]; + for (int i=0; i<outputs.length; i++) { + def index = new BigDecimal(outputs[i].index.toString()).intValue(); + relevance_scores[index] = outputs[i].relevance_score; + } + def resultBuilder = new StringBuilder('['); + for (int i=0; i<relevance_scores.length; i++) { + resultBuilder.append(' {"name": "similarity", "data_type": "FLOAT32", "shape": [1],'); + resultBuilder.append('"data": ['); + resultBuilder.append(relevance_scores[i]); + resultBuilder.append(']}'); + if (i<outputs.length - 1) { + resultBuilder.append(','); + } + } + resultBuilder.append(']'); + return resultBuilder.toString(); + """ + } + ] +} +``` + +For more information, see the [AWS documentation](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/ml-amazon-connector.html). + +Use the connector ID from the response to register and deploy the model: + +```json +POST /_plugins/_ml/models/_register?deploy=true +{ + "name": "Amazon Bedrock Cohere rerank model", + "function_name": "remote", + "description": "test rerank model", + "connector_id": "your_connector_id" +} +``` +{% include copy-curl.html %} + +Note the model ID in the response; you'll use it in the following steps. + +Test the model by using the Predict API: + +```json +POST _plugins/_ml/models/your_model_id/_predict +{ + "parameters": { + "query": "What is the capital city of America?", + "documents": [ + "Carson City is the capital city of the American state of Nevada.", + "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.", + "Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district.", + "Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states." + ] + } +} +``` +{% include copy-curl.html %} + +Alternatively, you can test the model as follows: + +```json +POST _plugins/_ml/_predict/text_similarity/your_model_id +{ + "query_text": "What is the capital city of America?", + "text_docs": [ + "Carson City is the capital city of the American state of Nevada.", + "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.", + "Washington, D.C. 
(also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district.", + "Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states." + ] +} +``` +{% include copy-curl.html %} + +The connector `pre_process_function` transforms the input into the format required by the previously shown parameters. + +By default, the Amazon Bedrock Rerank API output has the following format: + +```json +[ + { + "index": 2, + "relevance_score": 0.7190094 + }, + { + "index": 0, + "relevance_score": 0.32418242 + }, + { + "index": 1, + "relevance_score": 0.07456104 + }, + { + "index": 3, + "relevance_score": 0.06124987 + } +] +``` + +The connector `post_process_function` transforms the model's output into a format that the [rerank processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/) can interpret and orders the results by index. This adapted format is as follows: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "similarity", + "data_type": "FLOAT32", + "shape": [ + 1 + ], + "data": [ + 0.32418242 + ] + }, + { + "name": "similarity", + "data_type": "FLOAT32", + "shape": [ + 1 + ], + "data": [ + 0.07456104 + ] + }, + { + "name": "similarity", + "data_type": "FLOAT32", + "shape": [ + 1 + ], + "data": [ + 0.7190094 + ] + }, + { + "name": "similarity", + "data_type": "FLOAT32", + "shape": [ + 1 + ], + "data": [ + 0.06124987 + ] + } + ], + "status_code": 200 + } + ] +} +``` + +The response contains four `similarity` objects. For each `similarity` object, the `data` array contains a relevance score for each document with respect to the query. The `similarity` objects are provided in the order of the input documents---the first object pertains to the first document. This differs from the default output of the Cohere Rerank model, which orders documents by relevance score. The document order is changed in the `connector.post_process.cohere.rerank` post-processing function so that the output is compatible with a reranking pipeline. + +## Step 2: Configure a reranking pipeline + +Follow these steps to configure a reranking pipeline. + +### Step 2.1: Ingest test data + +Send a bulk request to ingest test data: + +```json +POST _bulk +{ "index": { "_index": "my-test-data" } } +{ "passage_text" : "Carson City is the capital city of the American state of Nevada." } +{ "index": { "_index": "my-test-data" } } +{ "passage_text" : "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan." } +{ "index": { "_index": "my-test-data" } } +{ "passage_text" : "Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district." } +{ "index": { "_index": "my-test-data" } } +{ "passage_text" : "Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states." 
} +``` +{% include copy-curl.html %} + +### Step 2.2: Create a reranking pipeline + +Create a reranking pipeline with the Cohere Rerank model: + +```json +PUT /_search/pipeline/rerank_pipeline_bedrock +{ + "description": "Pipeline for reranking with Bedrock Cohere rerank model", + "response_processors": [ + { + "rerank": { + "ml_opensearch": { + "model_id": "your_model_id_created_in_step1" + }, + "context": { + "document_fields": ["passage_text"] + } + } + } + ] +} +``` +{% include copy-curl.html %} + +If you provide multiple field names in `document_fields`, the values of all fields are first concatenated, and then reranking is performed. +{: .note} + +### Step 2.3: Test the reranking + +To limit the number of returned results, you can specify the `size` parameter. For example, set `"size": 2` to return the top two documents. + +First, test the query without using the reranking pipeline: + +```json +POST my-test-data/_search +{ + "query": { + "match": { + "passage_text": "What is the capital city of America?" + } + }, + "highlight": { + "pre_tags": ["<strong>"], + "post_tags": ["</strong>"], + "fields": {"passage_text": {}} + }, + "_source": false, + "fields": ["passage_text"] +} +``` +{% include copy-curl.html %} + +The first document in the response is `Carson City is the capital city of the American state of Nevada`, which is incorrect: + +```json +{ + "took": 2, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4, + "relation": "eq" + }, + "max_score": 2.5045562, + "hits": [ + { + "_index": "my-test-data", + "_id": "1", + "_score": 2.5045562, + "fields": { + "passage_text": [ + "Carson City is the capital city of the American state of Nevada." + ] + }, + "highlight": { + "passage_text": [ + "Carson <strong>City</strong> <strong>is</strong> <strong>the</strong> <strong>capital</strong> <strong>city</strong> <strong>of</strong> <strong>the</strong> American state <strong>of</strong> Nevada." + ] + } + }, + { + "_index": "my-test-data", + "_id": "2", + "_score": 0.5807494, + "fields": { + "passage_text": [ + "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan." + ] + }, + "highlight": { + "passage_text": [ + "<strong>The</strong> Commonwealth <strong>of</strong> <strong>the</strong> Northern Mariana Islands <strong>is</strong> a group <strong>of</strong> islands in <strong>the</strong> Pacific Ocean.", + "Its <strong>capital</strong> <strong>is</strong> Saipan." + ] + } + }, + { + "_index": "my-test-data", + "_id": "3", + "_score": 0.5261191, + "fields": { + "passage_text": [ + "Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district." + ] + }, + "highlight": { + "passage_text": [ + "(also known as simply Washington or D.C., and officially as <strong>the</strong> District <strong>of</strong> Columbia) <strong>is</strong> <strong>the</strong> <strong>capital</strong>", + "<strong>of</strong> <strong>the</strong> United States.", + "It <strong>is</strong> a federal district." + ] + } + }, + { + "_index": "my-test-data", + "_id": "4", + "_score": 0.5083029, + "fields": { + "passage_text": [ + "Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states." 
+ ] + }, + "highlight": { + "passage_text": [ + "<strong>Capital</strong> punishment (<strong>the</strong> death penalty) has existed in <strong>the</strong> United States since beforethe United States", + "As <strong>of</strong> 2017, <strong>capital</strong> punishment <strong>is</strong> legal in 30 <strong>of</strong> <strong>the</strong> 50 states." + ] + } + } + ] + } +} +``` + +Next, test the query using the reranking pipeline: + +```json +POST my-test-data/_search?search_pipeline=rerank_pipeline_bedrock +{ + "query": { + "match": { + "passage_text": "What is the capital city of America?" + } + }, + "ext": { + "rerank": { + "query_context": { + "query_text": "What is the capital city of America?" + } + } + }, + "highlight": { + "pre_tags": ["<strong>"], + "post_tags": ["</strong>"], + "fields": {"passage_text": {}} + }, + "_source": false, + "fields": ["passage_text"] +} +``` +{% include copy-curl.html %} + +The first document in the response is `"Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district."`, which is correct: + +```json +{ + "took": 2, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4, + "relation": "eq" + }, + "max_score": 0.7190094, + "hits": [ + { + "_index": "my-test-data", + "_id": "3", + "_score": 0.7190094, + "fields": { + "passage_text": [ + "Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district." + ] + }, + "highlight": { + "passage_text": [ + "(also known as simply Washington or D.C., and officially as <strong>the</strong> District <strong>of</strong> Columbia) <strong>is</strong> <strong>the</strong> <strong>capital</strong>", + "<strong>of</strong> <strong>the</strong> United States.", + "It <strong>is</strong> a federal district." + ] + } + }, + { + "_index": "my-test-data", + "_id": "1", + "_score": 0.32418242, + "fields": { + "passage_text": [ + "Carson City is the capital city of the American state of Nevada." + ] + }, + "highlight": { + "passage_text": [ + "Carson <strong>City</strong> <strong>is</strong> <strong>the</strong> <strong>capital</strong> <strong>city</strong> <strong>of</strong> <strong>the</strong> American state <strong>of</strong> Nevada." + ] + } + }, + { + "_index": "my-test-data", + "_id": "2", + "_score": 0.07456104, + "fields": { + "passage_text": [ + "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan." + ] + }, + "highlight": { + "passage_text": [ + "<strong>The</strong> Commonwealth <strong>of</strong> <strong>the</strong> Northern Mariana Islands <strong>is</strong> a group <strong>of</strong> islands in <strong>the</strong> Pacific Ocean.", + "Its <strong>capital</strong> <strong>is</strong> Saipan." + ] + } + }, + { + "_index": "my-test-data", + "_id": "4", + "_score": 0.06124987, + "fields": { + "passage_text": [ + "Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states." 
+ ] + }, + "highlight": { + "passage_text": [ + "<strong>Capital</strong> punishment (<strong>the</strong> death penalty) has existed in <strong>the</strong> United States since beforethe United States", + "As <strong>of</strong> 2017, <strong>capital</strong> punishment <strong>is</strong> legal in 30 <strong>of</strong> <strong>the</strong> 50 states." + ] + } + } + ] + }, + "profile": { + "shards": [] + } +} +``` + +To avoid writing the query twice, use the `query_text_path` instead of `query_text`, as follows: + +```json +POST my-test-data/_search?search_pipeline=rerank_pipeline_bedrock +{ + "query": { + "match": { + "passage_text": "What is the capital city of America?" + } + }, + "ext": { + "rerank": { + "query_context": { + "query_text_path": "query.match.passage_text.query" + } + } + }, + "highlight": { + "pre_tags": ["<strong>"], + "post_tags": ["</strong>"], + "fields": {"passage_text": {}} + }, + "_source": false, + "fields": ["passage_text"] +} +``` +{% include copy-curl.html %} \ No newline at end of file diff --git a/_ml-commons-plugin/tutorials/reranking-cohere.md b/_tutorials/reranking/reranking-cohere.md similarity index 96% rename from _ml-commons-plugin/tutorials/reranking-cohere.md rename to _tutorials/reranking/reranking-cohere.md index 412180066f7..e02d9673465 100644 --- a/_ml-commons-plugin/tutorials/reranking-cohere.md +++ b/_tutorials/reranking/reranking-cohere.md @@ -1,15 +1,18 @@ --- layout: default -title: Reranking with Cohere Rerank -parent: Tutorials -nav_order: 30 +title: Reranking using Cohere Rerank +parent: Reranking search results +nav_order: 90 +redirect_from: + - /ml-commons-plugin/tutorials/reranking-cohere/ + - /vector-search/tutorials/reranking/reranking-cohere/ --- -# Reranking search results using the Cohere Rerank model +# Reranking search results using Cohere Rerank A [reranking pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/reranking-search-results/) can rerank search results, providing a relevance score for each document in the search results with respect to the search query. The relevance score is calculated by a cross-encoder model. -This tutorial illustrates how to use the [Cohere Rerank](https://docs.cohere.com/reference/rerank-1) model in a reranking pipeline. +This tutorial shows you how to use the [Cohere Rerank](https://docs.cohere.com/reference/rerank-1) model in a reranking pipeline. Replace the placeholders beginning with the prefix `your_` with your own values. 
{: .note} diff --git a/_ml-commons-plugin/tutorials/reranking-cross-encoder.md b/_tutorials/reranking/reranking-cross-encoder.md similarity index 74% rename from _ml-commons-plugin/tutorials/reranking-cross-encoder.md rename to _tutorials/reranking/reranking-cross-encoder.md index e46c7eb5112..c4c0a2c9a5f 100644 --- a/_ml-commons-plugin/tutorials/reranking-cross-encoder.md +++ b/_tutorials/reranking/reranking-cross-encoder.md @@ -1,15 +1,18 @@ --- layout: default -title: Reranking with the MS MARCO cross-encoder -parent: Tutorials -nav_order: 35 +title: Reranking search results using a cross-encoder in Amazon SageMaker +parent: Reranking search results +nav_order: 110 +redirect_from: + - /ml-commons-plugin/tutorials/reranking-cross-encoder/ + - /vector-search/tutorials/reranking/reranking-cross-encoder/ --- -# Reranking search results using the MS MARCO cross-encoder model +# Reranking search results using a cross-encoder in Amazon SageMaker A [reranking pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/reranking-search-results/) can rerank search results, providing a relevance score for each document in the search results with respect to the search query. The relevance score is calculated by a cross-encoder model. -This tutorial illustrates how to use the [Hugging Face `ms-marco-MiniLM-L-6-v2` model](https://huggingface.co/cross-encoder/ms-marco-MiniLM-L-6-v2) in a reranking pipeline. +This tutorial shows you how to use the [Hugging Face `ms-marco-MiniLM-L-6-v2` model](https://huggingface.co/cross-encoder/ms-marco-MiniLM-L-6-v2) in a reranking pipeline. Replace the placeholders beginning with the prefix `your_` with your own values. {: .note} @@ -49,7 +52,9 @@ Note the model inference endpoint; you'll use it to create a connector in the ne ## Step 1: Create a connector and register the model -First, create a connector for the model, providing the inference endpoint and your AWS credentials: +To create a connector for the model, send the following request. 
+ +If you are using self-managed OpenSearch, supply your AWS credentials: ```json POST /_plugins/_ml/connectors/_create @@ -84,7 +89,42 @@ POST /_plugins/_ml/connectors/_create ``` {% include copy-curl.html %} -Next, use the connector ID from the response to register and deploy the model: +If you are using Amazon OpenSearch service, you can provide an AWS Identity and Access Management (IAM) role Amazon Resource Name (ARN) that allows access to the SageMaker model inference endpoint: + +```json +POST /_plugins/_ml/connectors/_create +{ + "name": "Sagemakre cross-encoder model", + "description": "Test connector for Sagemaker cross-encoder model", + "version": 1, + "protocol": "aws_sigv4", + "credential": { + "roleArn": "your_role_arn_which_allows_access_to_sagemaker_model_inference_endpoint" + }, + "parameters": { + "region": "your_sagemkaer_model_region_like_us-west-2", + "service_name": "sagemaker" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "your_sagemaker_model_inference_endpoint_created_in_last_step", + "headers": { + "content-type": "application/json" + }, + "request_body": "{ \"inputs\": ${parameters.inputs} }", + "pre_process_function": "\n String escape(def input) { \n if (input.contains(\"\\\\\")) {\n input = input.replace(\"\\\\\", \"\\\\\\\\\");\n }\n if (input.contains(\"\\\"\")) {\n input = input.replace(\"\\\"\", \"\\\\\\\"\");\n }\n if (input.contains('\r')) {\n input = input = input.replace('\r', '\\\\r');\n }\n if (input.contains(\"\\\\t\")) {\n input = input.replace(\"\\\\t\", \"\\\\\\\\\\\\t\");\n }\n if (input.contains('\n')) {\n input = input.replace('\n', '\\\\n');\n }\n if (input.contains('\b')) {\n input = input.replace('\b', '\\\\b');\n }\n if (input.contains('\f')) {\n input = input.replace('\f', '\\\\f');\n }\n return input;\n }\n\n String query = params.query_text;\n StringBuilder builder = new StringBuilder('[');\n \n for (int i=0; i<params.text_docs.length; i ++) {\n builder.append('{\"text\":\"');\n builder.append(escape(query));\n builder.append('\", \"text_pair\":\"');\n builder.append(escape(params.text_docs[i]));\n builder.append('\"}');\n if (i<params.text_docs.length - 1) {\n builder.append(',');\n }\n }\n builder.append(']');\n \n def parameters = '{ \"inputs\": ' + builder + ' }';\n return '{\"parameters\": ' + parameters + '}';\n ", + "post_process_function": "\n \n def dataType = \"FLOAT32\";\n \n \n if (params.result == null)\n {\n return 'no result generated';\n //return params.response;\n }\n def outputs = params.result;\n \n \n def resultBuilder = new StringBuilder('[ ');\n for (int i=0; i<outputs.length; i++) {\n resultBuilder.append(' {\"name\": \"similarity\", \"data_type\": \"FLOAT32\", \"shape\": [1],');\n //resultBuilder.append('{\"name\": \"similarity\"}');\n \n resultBuilder.append('\"data\": [');\n resultBuilder.append(outputs[i].score);\n resultBuilder.append(']}');\n if (i<outputs.length - 1) {\n resultBuilder.append(',');\n }\n }\n resultBuilder.append(']');\n \n return resultBuilder.toString();\n " + } + ] +} +``` +{% include copy-curl.html %} + +For more information, see the [AWS documentation](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/ml-amazon-connector.html), [this tutorial]({{site.url}}{{site.baseurl}}/vector-search/tutorials/semantic-search/semantic-search-sagemaker/), and [the AIConnectorHelper notebook](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/tutorials/aws/AIConnectorHelper.ipynb). 
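+The escaped Painless `pre_process_function` in the connectors above can be difficult to read. The following minimal Python sketch approximates the payload that the function builds for the SageMaker endpoint. It is provided for illustration only---the actual transformation runs as a Painless script inside the connector---and the `build_cross_encoder_payload` helper name and sample inputs are hypothetical:
+
+```python
+# Illustration only: a plain-Python approximation of the payload produced by
+# the connector's pre_process_function. The real transformation runs as a
+# Painless script inside the connector, not in Python.
+import json
+
+def build_cross_encoder_payload(query_text, text_docs):
+    # The cross-encoder endpoint scores (query, document) pairs, so each
+    # document is paired with the same query text.
+    return {"inputs": [{"text": query_text, "text_pair": doc} for doc in text_docs]}
+
+payload = build_cross_encoder_payload(
+    "What is the capital city of America?",
+    ["Carson City is the capital city of the American state of Nevada."],
+)
+print(json.dumps(payload, indent=2))
+```
+{% include copy.html %}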
+ +Use the connector ID from the response to register and deploy the model: ```json POST /_plugins/_ml/models/_register?deploy=true diff --git a/_tutorials/reranking/reranking-sagemaker.md b/_tutorials/reranking/reranking-sagemaker.md new file mode 100644 index 00000000000..f8d8ccf90de --- /dev/null +++ b/_tutorials/reranking/reranking-sagemaker.md @@ -0,0 +1,732 @@ +--- +layout: default +title: Reranking search results using a reranker in Amazon SageMaker +parent: Reranking search results +nav_order: 115 +redirect_from: + - /vector-search/tutorials/reranking/reranking-sagemaker/ +--- + +# Reranking search results using a reranker in Amazon SageMaker + +A [reranking pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-relevance/reranking-search-results/) can rerank search results, providing a relevance score for each document in the search results with respect to the search query. The relevance score is calculated by a reranker model. + +This tutorial shows you how to rerank search results in self-managed OpenSearch and [Amazon OpenSearch Service](https://docs.aws.amazon.com/opensearch-service/). The tutorial uses the [Hugging Face BAAI/bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3) model hosted on Amazon SageMaker. + +Replace the placeholders beginning with the prefix `your_` with your own values. +{: .note} + +## Prerequisite: Deploy the model to Amazon SageMaker + +Use the following code to deploy the model to Amazon SageMaker. We suggest using a GPU for better performance: + +```python +import json +import sagemaker +import boto3 +from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri +from sagemaker.serverless import ServerlessInferenceConfig + +try: + role = sagemaker.get_execution_role() +except ValueError: + iam = boto3.client('iam') + role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn'] + +# Hub Model configuration. https://huggingface.co/models +hub = { + 'HF_MODEL_ID':'BAAI/bge-reranker-v2-m3' +} + +# create Hugging Face Model Class +huggingface_model = HuggingFaceModel( + image_uri=get_huggingface_llm_image_uri("huggingface-tei",version="1.2.3"), + env=hub, + role=role, +) + +# deploy model to SageMaker Inference +predictor = huggingface_model.deploy( + initial_instance_count=1, + instance_type="ml.g5.2xlarge", + ) +``` +{% include copy.html %} + +For more information, see [How to deploy this model using Amazon SageMaker](https://huggingface.co/BAAI/bge-reranker-v2-m3?sagemaker_deploy=true). + +To perform a reranking test, use the following code: + +```python +result = predictor.predict(data={ + "query":"What is the capital city of America?", + "texts":[ + "Carson City is the capital city of the American state of Nevada.", + "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.", + "Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district.", + "Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states." 
+ ] +}) + +print(json.dumps(result, indent=2)) +``` +{% include copy.html %} + +The response contains the reranked results ordered by relevance score: + +```json +[ + { + "index": 2, + "score": 0.92879725 + }, + { + "index": 0, + "score": 0.013636836 + }, + { + "index": 1, + "score": 0.000593021 + }, + { + "index": 3, + "score": 0.00012148176 + } +] +``` + +To sort the results by index, use the following code: + +```python +print(json.dumps(sorted(result, key=lambda x: x['index']),indent=2)) +``` +{% include copy.html %} + +The sorted results are as follows: + +```json +[ + { + "index": 0, + "score": 0.013636836 + }, + { + "index": 1, + "score": 0.000593021 + }, + { + "index": 2, + "score": 0.92879725 + }, + { + "index": 3, + "score": 0.00012148176 + } +] +``` + +Note the model inference endpoint; you'll use it to create a connector in the next step. You can confirm the inference endpoint URL using the following code: + +```python +region_name = boto3.Session().region_name +endpoint_name = predictor.endpoint_name +endpoint_url = f"https://runtime.sagemaker.{region_name}.amazonaws.com/endpoints/{endpoint_name}/invocations" +print(endpoint_url) +``` +{% include copy.html %} + +## Step 1: Create a connector and register the model + +To create a connector for the model, send the following request. + +If you are using self-managed OpenSearch, supply your AWS credentials: + +```json +POST /_plugins/_ml/connectors/_create +{ + "name": "Sagemakre cross-encoder model", + "description": "Test connector for Sagemaker cross-encoder model", + "version": 1, + "protocol": "aws_sigv4", + "credential": { + "access_key": "your_access_key", + "secret_key": "your_secret_key", + "session_token": "your_session_token" + }, + "parameters": { + "region": "your_sagemaker_model_region_like_us-west-2", + "service_name": "sagemaker" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "your_sagemaker_model_inference_endpoint_created_in_last_step", + "headers": { + "content-type": "application/json" + }, + "pre_process_function": """ + def query_text = params.query_text; + def text_docs = params.text_docs; + def textDocsBuilder = new StringBuilder('['); + for (int i=0; i<text_docs.length; i++) { + textDocsBuilder.append('"'); + textDocsBuilder.append(text_docs[i]); + textDocsBuilder.append('"'); + if (i<text_docs.length - 1) { + textDocsBuilder.append(','); + } + } + textDocsBuilder.append(']'); + def parameters = '{ "query": "' + query_text + '", "texts": ' + textDocsBuilder.toString() + ' }'; + return '{"parameters": ' + parameters + '}'; + """, + "request_body": """ + { + "query": "${parameters.query}", + "texts": ${parameters.texts} + } + """, + "post_process_function": """ + if (params.result == null || params.result.length == 0) { + throw new IllegalArgumentException("Post process function input is empty."); + } + def outputs = params.result; + def scores = new Double[outputs.length]; + for (int i=0; i<outputs.length; i++) { + def index = new BigDecimal(outputs[i].index.toString()).intValue(); + scores[index] = outputs[i].score; + } + def resultBuilder = new StringBuilder('['); + for (int i=0; i<scores.length; i++) { + resultBuilder.append(' {"name": "similarity", "data_type": "FLOAT32", "shape": [1],'); + resultBuilder.append('"data": ['); + resultBuilder.append(scores[i]); + resultBuilder.append(']}'); + if (i<outputs.length - 1) { + resultBuilder.append(','); + } + } + resultBuilder.append(']'); + return resultBuilder.toString(); + """ + } + ] +} +``` +{% include copy-curl.html %} + 
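+The `post_process_function` in this connector reorders the SageMaker output. The following minimal Python sketch approximates that logic for illustration only---the actual reordering runs as a Painless script inside the connector---and the `scores_in_document_order` helper name is hypothetical. The sample input reuses the scores returned by the earlier reranking test:
+
+```python
+# Illustration only: SageMaker returns results sorted by score, and the
+# connector's post_process_function places each score back at the position
+# of its input document so that the rerank processor can match scores to
+# documents.
+def scores_in_document_order(results):
+    scores = [None] * len(results)
+    for item in results:
+        scores[item["index"]] = item["score"]
+    return [
+        {"name": "similarity", "data_type": "FLOAT32", "shape": [1], "data": [score]}
+        for score in scores
+    ]
+
+sample_results = [
+    {"index": 2, "score": 0.92879725},
+    {"index": 0, "score": 0.013636836},
+    {"index": 1, "score": 0.000593021},
+    {"index": 3, "score": 0.00012148176},
+]
+print(scores_in_document_order(sample_results))
+```
+{% include copy.html %}
+
+The same pre- and post-processing applies to the connector variant for Amazon OpenSearch Service shown next.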
+If you are using Amazon OpenSearch service, you can provide an AWS Identity and Access Management (IAM) role Amazon Resource Name (ARN) that allows access to the SageMaker model inference endpoint: + +```json +POST /_plugins/_ml/connectors/_create +{ + "name": "Sagemakre cross-encoder model", + "description": "Test connector for Sagemaker cross-encoder model", + "version": 1, + "protocol": "aws_sigv4", + "credential": { + "roleArn": "your_role_arn_which_allows_access_to_sagemaker_model_inference_endpoint" + }, + "parameters": { + "region": "your_sagemkaer_model_region_like_us-west-2", + "service_name": "sagemaker" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "your_sagemaker_model_inference_endpoint_created_in_last_step", + "headers": { + "content-type": "application/json" + }, + "pre_process_function": """ + def query_text = params.query_text; + def text_docs = params.text_docs; + def textDocsBuilder = new StringBuilder('['); + for (int i=0; i<text_docs.length; i++) { + textDocsBuilder.append('"'); + textDocsBuilder.append(text_docs[i]); + textDocsBuilder.append('"'); + if (i<text_docs.length - 1) { + textDocsBuilder.append(','); + } + } + textDocsBuilder.append(']'); + def parameters = '{ "query": "' + query_text + '", "texts": ' + textDocsBuilder.toString() + ' }'; + return '{"parameters": ' + parameters + '}'; + """, + "request_body": """ + { + "query": "${parameters.query}", + "texts": ${parameters.texts} + } + """, + "post_process_function": """ + if (params.result == null || params.result.length == 0) { + throw new IllegalArgumentException("Post process function input is empty."); + } + def outputs = params.result; + def scores = new Double[outputs.length]; + for (int i=0; i<outputs.length; i++) { + def index = new BigDecimal(outputs[i].index.toString()).intValue(); + scores[index] = outputs[i].score; + } + def resultBuilder = new StringBuilder('['); + for (int i=0; i<scores.length; i++) { + resultBuilder.append(' {"name": "similarity", "data_type": "FLOAT32", "shape": [1],'); + resultBuilder.append('"data": ['); + resultBuilder.append(scores[i]); + resultBuilder.append(']}'); + if (i<outputs.length - 1) { + resultBuilder.append(','); + } + } + resultBuilder.append(']'); + return resultBuilder.toString(); + """ + } + ] +} +``` +{% include copy-curl.html %} + +For more information, see the [AWS documentation](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/ml-amazon-connector.html), [this tutorial]({{site.url}}{{site.baseurl}}/vector-search/tutorials/semantic-search/semantic-search-sagemaker/), and [the AIConnectorHelper notebook](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/tutorials/aws/AIConnectorHelper.ipynb). + +Use the connector ID from the response to register and deploy the model: + +```json +POST /_plugins/_ml/models/_register?deploy=true +{ + "name": "Sagemaker Cross-Encoder model", + "function_name": "remote", + "description": "test rerank model", + "connector_id": "your_connector_id" +} +``` +{% include copy-curl.html %} + +Note the model ID in the response; you'll use it in the following steps. + +Test the model by using the Predict API: + +```json +POST _plugins/_ml/models/your_model_id/_predict +{ + "parameters": { + "query": "What is the capital city of America?", + "texts": [ + "Carson City is the capital city of the American state of Nevada.", + "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.", + "Washington, D.C. 
(also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district.", + "Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states." + ] + } +} +``` +{% include copy-curl.html %} + +Alternatively, you can test the model as follows: + +```json +POST _plugins/_ml/_predict/text_similarity/your_model_id +{ + "query_text": "What is the capital city of America?", + "text_docs": [ + "Carson City is the capital city of the American state of Nevada.", + "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.", + "Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district.", + "Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states." + ] +} +``` +{% include copy-curl.html %} + +The connector `pre_process_function` transforms the input into the format required by the previously shown parameters. + +By default, the model output has the following format: + +```json +[ + { + "index": 2, + "score": 0.92879725 + }, + { + "index": 0, + "score": 0.013636836 + }, + { + "index": 1, + "score": 0.000593021 + }, + { + "index": 3, + "score": 0.00012148176 + } +] +``` + +The connector `post_process_function` transforms the model's output into a format that the [rerank processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/) can interpret and orders the results by index. This adapted format is as follows: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "similarity", + "data_type": "FLOAT32", + "shape": [ + 1 + ], + "data": [ + 0.013636836 + ] + }, + { + "name": "similarity", + "data_type": "FLOAT32", + "shape": [ + 1 + ], + "data": [ + 0.013636836 + ] + }, + { + "name": "similarity", + "data_type": "FLOAT32", + "shape": [ + 1 + ], + "data": [ + 0.92879725 + ] + }, + { + "name": "similarity", + "data_type": "FLOAT32", + "shape": [ + 1 + ], + "data": [ + 0.00012148176 + ] + } + ], + "status_code": 200 + } + ] +} +``` + +The response contains two `similarity` objects. For each `similarity` object, the `data` array contains a relevance score for each document with respect to the query. The `similarity` objects are provided in the order of the input documents---the first object pertains to the first document. + +## Step 2: Configure a reranking pipeline + +Follow these steps to configure a reranking pipeline. + +### Step 2.1: Ingest test data + +Send a bulk request to ingest test data: + +```json +POST _bulk +{ "index": { "_index": "my-test-data" } } +{ "passage_text" : "Carson City is the capital city of the American state of Nevada." } +{ "index": { "_index": "my-test-data" } } +{ "passage_text" : "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan." } +{ "index": { "_index": "my-test-data" } } +{ "passage_text" : "Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district." 
} +{ "index": { "_index": "my-test-data" } } +{ "passage_text" : "Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states." } +``` +{% include copy-curl.html %} + +### Step 2.2: Create a reranking pipeline + +Create a reranking pipeline with the cross-encoder model: + +```json +PUT /_search/pipeline/rerank_pipeline_sagemaker +{ + "description": "Pipeline for reranking with Sagemaker cross-encoder model", + "response_processors": [ + { + "rerank": { + "ml_opensearch": { + "model_id": "your_model_id_created_in_step1" + }, + "context": { + "document_fields": ["passage_text"] + } + } + } + ] +} +``` +{% include copy-curl.html %} + +If you provide multiple field names in `document_fields`, the values of all fields are first concatenated, and then reranking is performed. +{: .note} + +### Step 2.3: Test the reranking + +To limit the number of returned results, you can specify the `size` parameter. For example, set `"size": 2` to return the top two documents. + +First, test the query without using the reranking pipeline: + +```json +POST my-test-data/_search +{ + "query": { + "match": { + "passage_text": "What is the capital city of America?" + } + }, + "highlight": { + "pre_tags": ["<strong>"], + "post_tags": ["</strong>"], + "fields": {"passage_text": {}} + }, + "_source": false, + "fields": ["passage_text"] +} +``` +{% include copy-curl.html %} + +The first document in the response is `Carson City is the capital city of the American state of Nevada`, which is incorrect: + +```json +{ + "took": 2, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4, + "relation": "eq" + }, + "max_score": 2.5045562, + "hits": [ + { + "_index": "my-test-data", + "_id": "1", + "_score": 2.5045562, + "fields": { + "passage_text": [ + "Carson City is the capital city of the American state of Nevada." + ] + }, + "highlight": { + "passage_text": [ + "Carson <strong>City</strong> <strong>is</strong> <strong>the</strong> <strong>capital</strong> <strong>city</strong> <strong>of</strong> <strong>the</strong> American state <strong>of</strong> Nevada." + ] + } + }, + { + "_index": "my-test-data", + "_id": "2", + "_score": 0.5807494, + "fields": { + "passage_text": [ + "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan." + ] + }, + "highlight": { + "passage_text": [ + "<strong>The</strong> Commonwealth <strong>of</strong> <strong>the</strong> Northern Mariana Islands <strong>is</strong> a group <strong>of</strong> islands in <strong>the</strong> Pacific Ocean.", + "Its <strong>capital</strong> <strong>is</strong> Saipan." + ] + } + }, + { + "_index": "my-test-data", + "_id": "3", + "_score": 0.5261191, + "fields": { + "passage_text": [ + "Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district." + ] + }, + "highlight": { + "passage_text": [ + "(also known as simply Washington or D.C., and officially as <strong>the</strong> District <strong>of</strong> Columbia) <strong>is</strong> <strong>the</strong> <strong>capital</strong>", + "<strong>of</strong> <strong>the</strong> United States.", + "It <strong>is</strong> a federal district." 
+ ] + } + }, + { + "_index": "my-test-data", + "_id": "4", + "_score": 0.5083029, + "fields": { + "passage_text": [ + "Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states." + ] + }, + "highlight": { + "passage_text": [ + "<strong>Capital</strong> punishment (<strong>the</strong> death penalty) has existed in <strong>the</strong> United States since beforethe United States", + "As <strong>of</strong> 2017, <strong>capital</strong> punishment <strong>is</strong> legal in 30 <strong>of</strong> <strong>the</strong> 50 states." + ] + } + } + ] + } +} +``` + +Next, test the query using the reranking pipeline: + +```json +POST my-test-data/_search?search_pipeline=rerank_pipeline_sagemaker +{ + "query": { + "match": { + "passage_text": "What is the capital city of America?" + } + }, + "ext": { + "rerank": { + "query_context": { + "query_text": "What is the capital city of America?" + } + } + }, + "highlight": { + "pre_tags": ["<strong>"], + "post_tags": ["</strong>"], + "fields": {"passage_text": {}} + }, + "_source": false, + "fields": ["passage_text"] +} +``` +{% include copy-curl.html %} + +The first document in the response is `"Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district."`, which is correct: + +```json +{ + "took": 2, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4, + "relation": "eq" + }, + "max_score": 0.92879725, + "hits": [ + { + "_index": "my-test-data", + "_id": "3", + "_score": 0.92879725, + "fields": { + "passage_text": [ + "Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district." + ] + }, + "highlight": { + "passage_text": [ + "(also known as simply Washington or D.C., and officially as <strong>the</strong> District <strong>of</strong> Columbia) <strong>is</strong> <strong>the</strong> <strong>capital</strong>", + "<strong>of</strong> <strong>the</strong> United States.", + "It <strong>is</strong> a federal district." + ] + } + }, + { + "_index": "my-test-data", + "_id": "1", + "_score": 0.013636836, + "fields": { + "passage_text": [ + "Carson City is the capital city of the American state of Nevada." + ] + }, + "highlight": { + "passage_text": [ + "Carson <strong>City</strong> <strong>is</strong> <strong>the</strong> <strong>capital</strong> <strong>city</strong> <strong>of</strong> <strong>the</strong> American state <strong>of</strong> Nevada." + ] + } + }, + { + "_index": "my-test-data", + "_id": "2", + "_score": 0.013636836, + "fields": { + "passage_text": [ + "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan." + ] + }, + "highlight": { + "passage_text": [ + "<strong>The</strong> Commonwealth <strong>of</strong> <strong>the</strong> Northern Mariana Islands <strong>is</strong> a group <strong>of</strong> islands in <strong>the</strong> Pacific Ocean.", + "Its <strong>capital</strong> <strong>is</strong> Saipan." + ] + } + }, + { + "_index": "my-test-data", + "_id": "4", + "_score": 0.00012148176, + "fields": { + "passage_text": [ + "Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. 
As of 2017, capital punishment is legal in 30 of the 50 states." + ] + }, + "highlight": { + "passage_text": [ + "<strong>Capital</strong> punishment (<strong>the</strong> death penalty) has existed in <strong>the</strong> United States since beforethe United States", + "As <strong>of</strong> 2017, <strong>capital</strong> punishment <strong>is</strong> legal in 30 <strong>of</strong> <strong>the</strong> 50 states." + ] + } + } + ] + }, + "profile": { + "shards": [] + } +} +``` + +To avoid writing the query twice, use the `query_text_path` instead of `query_text`, as follows: + +```json +POST my-test-data/_search?search_pipeline=rerank_pipeline_sagemaker +{ + "query": { + "match": { + "passage_text": "What is the capital city of America?" + } + }, + "ext": { + "rerank": { + "query_context": { + "query_text_path": "query.match.passage_text.query" + } + } + }, + "highlight": { + "pre_tags": ["<strong>"], + "post_tags": ["</strong>"], + "fields": {"passage_text": {}} + }, + "_source": false, + "fields": ["passage_text"] +} +``` +{% include copy-curl.html %} \ No newline at end of file diff --git a/_tutorials/vector-search/index.md b/_tutorials/vector-search/index.md new file mode 100644 index 00000000000..67f22a285af --- /dev/null +++ b/_tutorials/vector-search/index.md @@ -0,0 +1,41 @@ +--- +layout: default +title: Vector search +has_children: true +has_toc: false +nav_order: 10 +redirect_from: + - /vector-search/tutorials/ + - /ml-commons-plugin/tutorials/ + - /ml-commons-plugin/tutorials/index/ + - /tutorials/vector-search/ +vector_search_101: + - heading: "Getting started with vector search" + description: "Learn how to run a raw vector search" + link: "/vector-search/getting-started/" + - heading: "Getting started with semantic and hybrid search" + description: "Build your first AI search application" + link: "/tutorials/vector-search/neural-search-tutorial/" +other: + - heading: "Vector operations" + description: "Learn how to generate embeddings and optimize vector storage" + link: "/tutorials/vector-search/vector-operations/" + - heading: "Semantic search" + description: "Implement semantic search using various machine learning models" + link: "/tutorials/vector-search/semantic-search/" + - heading: "Using semantic highlighting" + description: "Learn how to highlight the most semantically relevant sentences in the results" + link: "/tutorials/vector-search/semantic-highlighting-tutorial/" +--- + +# Vector search tutorials + +Explore the following tutorials to learn about implementing vector search applications using the OpenSearch vector database. For more information about using OpenSearch as a vector database, see [Vector search]({{site.url}}{{site.baseurl}}/vector-search/). 
+ +## Vector search 101 + +{% include cards.html cards=page.vector_search_101 %} + +## Vector search applications + +{% include cards.html cards=page.other %} \ No newline at end of file diff --git a/_search-plugins/neural-search-tutorial.md b/_tutorials/vector-search/neural-search-tutorial.md similarity index 69% rename from _search-plugins/neural-search-tutorial.md rename to _tutorials/vector-search/neural-search-tutorial.md index 9c1b224cb8f..acbcb0bf1e2 100644 --- a/_search-plugins/neural-search-tutorial.md +++ b/_tutorials/vector-search/neural-search-tutorial.md @@ -1,46 +1,41 @@ --- layout: default -title: Neural search tutorial +title: Getting started with semantic and hybrid search has_children: false -nav_order: 30 +parent: Vector search +grand_parent: Tutorials +nav_order: 3 redirect_from: - /ml-commons-plugin/semantic-search/ + - /search-plugins/neural-search-tutorial/ + - /vector-search/tutorials/neural-search-tutorial/ +steps: + - heading: "Choose a model for embedding generation" + link: "/tutorials/vector-search/neural-search-tutorial/#step-1-choose-a-model" + - heading: "Register and deploy the model" + link: "/tutorials/vector-search/neural-search-tutorial/#step-2-register-and-deploy-the-model" + - heading: "Ingest data" + link: "/tutorials/vector-search/neural-search-tutorial/#step-3-ingest-data" + - heading: "Search the data" + link: "/tutorials/vector-search/neural-search-tutorial/#step-4-search-the-data" --- -# Neural search tutorial +# Getting started with semantic and hybrid search By default, OpenSearch calculates document scores using the [Okapi BM25](https://en.wikipedia.org/wiki/Okapi_BM25) algorithm. BM25 is a keyword-based algorithm that performs well on queries containing keywords but fails to capture the semantic meaning of the query terms. Semantic search, unlike keyword-based search, takes into account the meaning of the query in the search context. Thus, semantic search performs well when a query requires natural language understanding. -In this tutorial, you'll learn how to use neural search to: +In this tutorial, you'll learn how to implement the following types of search: -- Implement semantic search in OpenSearch. -- Implement hybrid search by combining semantic and keyword search to improve search relevance. - -## Terminology - -It's helpful to understand the following terms before starting this tutorial: - -- _Neural search_: Facilitates vector search at ingestion time and at search time: - - At ingestion time, neural search uses language models to generate vector embeddings from the text fields in the document. The documents containing both the original text field and the vector embedding of the field are then indexed in a k-NN index, as shown in the following diagram. - - ![Neural search at ingestion time diagram]({{site.url}}{{site.baseurl}}/images/neural-search-ingestion.png) - - At search time, when you then use a _neural query_, the query text is passed through a language model, and the resulting vector embeddings are compared with the document text vector embeddings to find the most relevant results, as shown in the following diagram. - - ![Neural search at search time diagram]({{site.url}}{{site.baseurl}}/images/neural-search-query.png) - -- _Semantic search_: Employs neural search in order to determine the intention of the user's query in the search context, thereby improving search relevance. - -- _Hybrid search_: Combines semantic and keyword search to improve search relevance. 
+- **Semantic search**: Considers semantic meaning in order to determine the intention of the user's query in the search context, thereby improving search relevance. +- **Hybrid search**: Combines semantic and keyword search to improve search relevance. ## OpenSearch components for semantic search -In this tutorial, you'll implement semantic search using the following OpenSearch components: +In this tutorial, you'll use the following OpenSearch components: -- [Model group]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#model-groups) - [Pretrained language models provided by OpenSearch]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/) - [Ingest pipeline]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/index/) - [k-NN vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/) -- [Neural search]({{site.url}}{{site.baseurl}}/search-plugins/neural-search/) - [Search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/) - [Normalization processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/normalization-processor/) - [Hybrid query]({{site.url}}{{site.baseurl}}/query-dsl/compound/hybrid/) @@ -72,45 +67,29 @@ For a [custom local model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/custom For more information about ML-related cluster settings, see [ML Commons cluster settings]({{site.url}}{{site.baseurl}}/ml-commons-plugin/cluster-settings/). -## Tutorial overview +## Tutorial This tutorial consists of the following steps: -1. [**Set up an ML language model**](#step-1-set-up-an-ml-language-model). - 1. [Choose a language model](#step-1a-choose-a-language-model). - 1. [Register a model group](#step-1b-register-a-model-group). - 1. [Register the model to the model group](#step-1c-register-the-model-to-the-model-group). - 1. [Deploy the model](#step-1d-deploy-the-model). -1. [**Ingest data with neural search**](#step-2-ingest-data-with-neural-search). - 1. [Create an ingest pipeline for neural search](#step-2a-create-an-ingest-pipeline-for-neural-search). - 1. [Create a k-NN index](#step-2b-create-a-k-nn-index). - 1. [Ingest documents into the index](#step-2c-ingest-documents-into-the-index). -1. [**Search the data**](#step-3-search-the-data). - - [Search using a keyword search](#search-using-a-keyword-search). - - [Search using a neural search](#search-using-a-neural-search). - - [Search using a hybrid search](#search-using-a-hybrid-search). - -Some steps in the tutorial contain optional `Test it` sections. You can ensure that the step was successful by running requests in these sections. - -After you're done, follow the steps in the [Clean up](#clean-up) section to delete all created components. +{% include list.html list_items=page.steps%} -## Tutorial +You can follow this tutorial by using your command line or the OpenSearch Dashboards [Dev Tools console]({{site.url}}{{site.baseurl}}/dashboards/dev-tools/run-queries/). -You can follow this tutorial using your command line or the OpenSearch Dashboards [Dev Tools console]({{site.url}}{{site.baseurl}}/dashboards/dev-tools/run-queries/). +Some steps in the tutorial contain optional <span>Test it</span>{: .text-delta} sections. You can confirm that the step completed successfully by running the requests in these sections. -## Step 1: Set up an ML language model +After you're done, follow the steps in the [Clean up](#clean-up) section to delete all created components. 
-Neural search requires a language model in order to generate vector embeddings from text fields, both at ingestion time and query time. +### Step 1: Choose a model -### Step 1(a): Choose a language model +First, you'll need to choose a language model in order to generate vector embeddings from text fields, both at ingestion time and query time. -For this tutorial, you'll use the [DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert) model from Hugging Face. It is one of the pretrained sentence transformer models available in OpenSearch that has shown some of the best results in benchmarking tests (for details, see [this blog post](https://opensearch.org/blog/semantic-science-benchmarks/)). You'll need the name, version, and dimension of the model to register it. You can find this information in the [pretrained model table]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#sentence-transformers) by selecting the `config_url` link corresponding to the model's TorchScript artifact: +For this tutorial, you'll use the [DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert) model from Hugging Face. It is one of the pretrained sentence transformer models available in OpenSearch that has shown some of the best results in benchmarking tests (for more information, see [this blog post](https://opensearch.org/blog/semantic-science-benchmarks/)). You'll need the name, version, and dimension of the model to register it. You can find this information in the [pretrained model table]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#sentence-transformers) by selecting the `config_url` link corresponding to the model's TorchScript artifact: - The model name is `huggingface/sentence-transformers/msmarco-distilbert-base-tas-b`. -- The model version is `1.0.1`. +- The model version is `1.0.3`. - The number of dimensions for this model is `768`. -Take note of the dimensionality of the model because you'll need it when you set up a k-NN index. +Take note of the dimensionality of the model because you'll need it when you set up a vector index. {: .important} #### Advanced: Using a different model @@ -125,108 +104,15 @@ Alternatively, you can choose one of the following options for your model: For information about choosing a model, see [Further reading](#further-reading). -### Step 1(b): Register a model group - -For access control, models are organized into model groups (collections of versions of a particular model). Each model group name in the cluster must be globally unique. Registering a model group ensures the uniqueness of the model group name. - -If you are registering the first version of a model without first registering the model group, a new model group is created automatically. For more information, see [Model access control]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control/). -{: .tip} - -To register a model group with the access mode set to `public`, send the following request: - -```json -POST /_plugins/_ml/model_groups/_register -{ - "name": "NLP_model_group", - "description": "A model group for NLP models", - "access_mode": "public" -} -``` -{% include copy-curl.html %} - -OpenSearch sends back the model group ID: - -```json -{ - "model_group_id": "Z1eQf4oB5Vm0Tdw8EIP2", - "status": "CREATED" -} -``` - -You'll use this ID to register the chosen model to the model group. 
- -<details markdown="block"> - <summary> - Test it - </summary> - {: .text-delta} - -Search for the newly created model group by providing its model group ID in the request: - -```json -POST /_plugins/_ml/model_groups/_search -{ - "query": { - "match": { - "_id": "Z1eQf4oB5Vm0Tdw8EIP2" - } - } -} -``` -{% include copy-curl.html %} - -The response contains the model group: - -```json -{ - "took": 0, - "timed_out": false, - "_shards": { - "total": 1, - "successful": 1, - "skipped": 0, - "failed": 0 - }, - "hits": { - "total": { - "value": 1, - "relation": "eq" - }, - "max_score": 1, - "hits": [ - { - "_index": ".plugins-ml-model-group", - "_id": "Z1eQf4oB5Vm0Tdw8EIP2", - "_version": 1, - "_seq_no": 14, - "_primary_term": 2, - "_score": 1, - "_source": { - "created_time": 1694357262582, - "access": "public", - "latest_version": 0, - "last_updated_time": 1694357262582, - "name": "NLP_model_group", - "description": "A model group for NLP models" - } - } - ] - } -} -``` -</details> - +### Step 2: Register and deploy the model -### Step 1(c): Register the model to the model group - -To register the model to the model group, provide the model group ID in the register request: +To register the model, provide the model group ID in the register request: ```json POST /_plugins/_ml/models/_register { "name": "huggingface/sentence-transformers/msmarco-distilbert-base-tas-b", - "version": "1.0.1", - "model_group_id": "Z1eQf4oB5Vm0Tdw8EIP2", + "version": "1.0.3", "model_format": "TORCH_SCRIPT" } ``` @@ -248,7 +134,9 @@ GET /_plugins/_ml/tasks/aFeif4oB5Vm0Tdw8yoN7 ``` {% include copy-curl.html %} -Once the task is complete, the task state will be `COMPLETED` and the Tasks API response will contain a model ID for the registered model: +OpenSearch saves the registered model in the model index. Deploying a model creates a model instance and caches the model in memory. + +Once the task is complete, the task state will be `COMPLETED` and the Tasks API response will contain a model ID for the deployed model: ```json { @@ -338,55 +226,12 @@ POST /_plugins/_ml/models/_register "all_config": "{\"_name_or_path\":\"old_models/msmarco-distilbert-base-tas-b/0_Transformer\",\"activation\":\"gelu\",\"architectures\":[\"DistilBertModel\"],\"attention_dropout\":0.1,\"dim\":768,\"dropout\":0.1,\"hidden_dim\":3072,\"initializer_range\":0.02,\"max_position_embeddings\":512,\"model_type\":\"distilbert\",\"n_heads\":12,\"n_layers\":6,\"pad_token_id\":0,\"qa_dropout\":0.1,\"seq_classif_dropout\":0.2,\"sinusoidal_pos_embds\":false,\"tie_weights_\":true,\"transformers_version\":\"4.7.0\",\"vocab_size\":30522}" }, "created_time": 1676074079195, - "model_group_id": "Z1eQf4oB5Vm0Tdw8EIP2", "url": "https://artifacts.opensearch.org/models/ml-models/huggingface/sentence-transformers/msmarco-distilbert-base-tas-b/1.0.1/onnx/sentence-transformers_msmarco-distilbert-base-tas-b-1.0.1-onnx.zip" } ``` For more information, see [Using ML models within OpenSearch]({{site.url}}{{site.baseurl}}/ml-commons-plugin/using-ml-models/). -### Step 1(d): Deploy the model - -Once the model is registered, it is saved in the model index. Next, you'll need to deploy the model. Deploying a model creates a model instance and caches the model in memory. 
To deploy the model, provide its model ID to the `_deploy` endpoint: - -```json -POST /_plugins/_ml/models/aVeif4oB5Vm0Tdw8zYO2/_deploy -``` -{% include copy-curl.html %} - -Like the register operation, the deploy operation is asynchronous, so you'll get a task ID in the response: - -```json -{ - "task_id": "ale6f4oB5Vm0Tdw8NINO", - "status": "CREATED" -} -``` - -You can check the status of the task by using the Tasks API: - -```json -GET /_plugins/_ml/tasks/ale6f4oB5Vm0Tdw8NINO -``` -{% include copy-curl.html %} - -Once the task is complete, the task state will be `COMPLETED`: - -```json -{ - "model_id": "aVeif4oB5Vm0Tdw8zYO2", - "task_type": "DEPLOY_MODEL", - "function_name": "TEXT_EMBEDDING", - "state": "COMPLETED", - "worker_node": [ - "4p6FVOmJRtu3wehDD74hzQ" - ], - "create_time": 1694360024141, - "last_update_time": 1694360027940, - "is_async": true -} -``` - <details markdown="block"> <summary> Test it @@ -439,13 +284,13 @@ GET /_plugins/_ml/profile/models ``` </details> -## Step 2: Ingest data with neural search +### Step 3: Ingest data -Neural search uses a language model to transform text into vector embeddings. During ingestion, neural search creates vector embeddings for the text fields in the request. During search, you can generate vector embeddings for the query text by applying the same model, allowing you to perform vector similarity search on the documents. +OpenSearch uses a language model to transform text into vector embeddings. During ingestion, OpenSearch creates vector embeddings for the text fields in the request. During search, you can generate vector embeddings for the query text by applying the same model, allowing you to perform vector similarity search on the documents. -### Step 2(a): Create an ingest pipeline for neural search +#### Step 3(a): Create an ingest pipeline -Now that you have deployed a model, you can use this model to configure [neural search]({{site.url}}{{site.baseurl}}/search-plugins/neural-search/). First, you need to create an [ingest pipeline]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/index/) that contains one processor: a task that transforms document fields before documents are ingested into an index. For neural search, you'll set up a `text_embedding` processor that creates vector embeddings from text. You'll need the `model_id` of the model you set up in the previous section and a `field_map`, which specifies the name of the field from which to take the text (`text`) and the name of the field in which to record embeddings (`passage_embedding`): +Now that you have deployed a model, you can use this model to configure an [ingest pipeline]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/index/) that contains one processor: a task that transforms document fields before documents are ingested into an index. In this example, you'll set up a `text_embedding` processor that creates vector embeddings from text. 
You'll need the `model_id` of the model you set up in the previous section and a `field_map`, which specifies the name of the field from which to take the text (`text`) and the name of the field in which to record embeddings (`passage_embedding`): ```json PUT /_ingest/pipeline/nlp-ingest-pipeline @@ -499,9 +344,9 @@ The response contains the ingest pipeline: ``` </details> -### Step 2(b): Create a k-NN index +#### Step 3(b): Create a vector index -Now you'll create a k-NN index with a field named `text`, which contains an image description, and a [`knn_vector`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/) field named `passage_embedding`, which contains the vector embedding of the text. Additionally, set the default ingest pipeline to the `nlp-ingest-pipeline` you created in the previous step: +Now you'll create a vector index with a field named `text`, which contains an image description, and a [`knn_vector`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/) field named `passage_embedding`, which contains the vector embedding of the text. Additionally, set the default ingest pipeline to the `nlp-ingest-pipeline` you created in the previous step: ```json @@ -519,12 +364,7 @@ PUT /my-nlp-index "passage_embedding": { "type": "knn_vector", "dimension": 768, - "method": { - "engine": "lucene", - "space_type": "l2", - "name": "hnsw", - "parameters": {} - } + "space_type": "l2" }, "text": { "type": "text" @@ -535,7 +375,7 @@ PUT /my-nlp-index ``` {% include copy-curl.html %} -Setting up a k-NN index allows you to later perform a vector search on the `passage_embedding` field. +Setting up a vector index allows you to later perform a vector search on the `passage_embedding` field. <details markdown="block"> <summary> @@ -543,7 +383,7 @@ Setting up a k-NN index allows you to later perform a vector search on the `pass </summary> {: .text-delta} -Use the following requests to get the settings and the mappings of the created index: +Use the following requests to get the settings and mappings of the created index: ```json GET /my-nlp-index/_settings @@ -557,7 +397,7 @@ GET /my-nlp-index/_mappings </details> -### Step 2(c): Ingest documents into the index +#### Step 3(c): Ingest documents into the index In this step, you'll ingest several sample documents into the index. The sample data is taken from the [Flickr image dataset](https://www.kaggle.com/datasets/hsankesara/flickr-image-dataset). Each document contains a `text` field corresponding to the image description and an `id` field corresponding to the image ID: @@ -637,9 +477,9 @@ The response includes the document `_source` containing the original `text` and } ``` -## Step 3: Search the data +### Step 4: Search the data -Now you'll search the index using keyword search, neural search, and a combination of the two. +Now you'll search the index using a keyword search, a semantic search, and a combination of the two. ### Search using a keyword search @@ -664,7 +504,7 @@ GET /my-nlp-index/_search ``` {% include copy-curl.html %} -Document 3 is not returned because it does not contain the specified keywords. Documents containing the words `rodeo` and `cowboy` are scored lower because semantic meaning is not considered: +Document 3 is not returned because it does not contain the specified keywords. 
Documents containing the words `rodeo` and `cowboy` are scored lower because their semantic meaning is not considered: <details markdown="block"> <summary> @@ -731,9 +571,9 @@ Document 3 is not returned because it does not contain the specified keywords. D ``` </details> -### Search using a neural search +### Search using a semantic search -To search using a neural search, use a `neural` query and provide the model ID of the model you set up earlier so that vector embeddings for the query text are generated with the model used at ingestion time: +To search using a semantic search, use a `neural` query and provide the model ID of the model you set up earlier so that vector embeddings for the query text are generated with the model used at ingestion time: ```json GET /my-nlp-index/_search @@ -756,7 +596,7 @@ GET /my-nlp-index/_search ``` {% include copy-curl.html %} -This time, the response not only contains all five documents, but the document order is also improved because neural search considers semantic meaning: +This time, the response not only contains all five documents, but the document order is also improved because semantic search considers semantic meaning: <details markdown="block"> <summary> @@ -834,7 +674,7 @@ This time, the response not only contains all five documents, but the document o ### Search using a hybrid search -Hybrid search combines keyword and neural search to improve search relevance. To implement hybrid search, you need to set up a [search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/) that runs at search time. The search pipeline you'll configure intercepts search results at an intermediate stage and applies the [`normalization-processor`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/normalization-processor/) to them. The `normalization-processor` normalizes and combines the document scores from multiple query clauses, rescoring the documents according to the chosen normalization and combination techniques. +Hybrid search combines keyword and semantic search to improve search relevance. To implement hybrid search, you need to set up a [search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/) that runs at search time. The search pipeline you'll configure intercepts search results at an intermediate stage and applies the [`normalization-processor`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/normalization-processor/) to them. The `normalization-processor` normalizes and combines the document scores from multiple query clauses, rescoring the documents according to the chosen normalization and combination techniques. #### Step 1: Configure a search pipeline @@ -866,7 +706,7 @@ PUT /_search/pipeline/nlp-search-pipeline ``` {% include copy-curl.html %} -#### Step 2: Search with the hybrid query +#### Step 2: Search using a hybrid query You'll use the [`hybrid` query]({{site.url}}{{site.baseurl}}/query-dsl/compound/hybrid/) to combine the `match` and `neural` query clauses. Make sure to apply the previously created `nlp-search-pipeline` to the request in the query parameter: @@ -996,7 +836,53 @@ You can now experiment with different weights, normalization techniques, and com You can parameterize the search by using search templates. Search templates hide implementation details, reducing the number of nested levels and thus the query complexity. For more information, see [search templates]({{site.url}}{{site.baseurl}}/search-plugins/search-template/). 
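As an illustration of that approach, the following minimal sketch stores the hybrid query as a search template and then calls it with only two parameters. The template name `hybrid-search-template`, the example `query_text` value, and the reuse of the model ID from this tutorial are placeholder choices for this sketch; it also assumes that you set `nlp-search-pipeline` as the index's default search pipeline so that the template search is processed by it:

```json
// Sketch only: adjust the index name, template name, and model ID to match your setup.
PUT /my-nlp-index/_settings
{
  "index.search.default_pipeline": "nlp-search-pipeline"
}

// Store the hybrid query as a reusable search template.
POST /_scripts/hybrid-search-template
{
  "script": {
    "lang": "mustache",
    "source": {
      "_source": {
        "excludes": ["passage_embedding"]
      },
      "query": {
        "hybrid": {
          "queries": [
            {
              "match": {
                "text": {
                  "query": "{{query_text}}"
                }
              }
            },
            {
              "neural": {
                "passage_embedding": {
                  "query_text": "{{query_text}}",
                  "model_id": "{{model_id}}",
                  "k": 5
                }
              }
            }
          ]
        }
      }
    }
  }
}

// Run the stored template, passing only the query text and model ID.
GET /my-nlp-index/_search/template
{
  "id": "hybrid-search-template",
  "params": {
    "query_text": "wild west",
    "model_id": "aVeif4oB5Vm0Tdw8zYO2"
  }
}
```

With the template in place, only the query text and model ID change between searches, while the hybrid query structure and pipeline configuration stay fixed.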
-### Clean up +## Using automated workflows + +You can quickly set up semantic or hybrid search using [_automated workflows_]({{site.url}}{{site.baseurl}}/automating-configurations/). This approach automatically creates and provisions all necessary resources. For more information, see [Workflow templates]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-templates/). + +### Automated semantic search setup + +OpenSearch provides a [workflow template]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-templates/) that automatically registers and deploys a default local model (`huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2`) and creates an ingest pipeline and a vector index: + +```json +POST /_plugins/_flow_framework/workflow?use_case=semantic_search_with_local_model&provision=true +``` +{% include copy-curl.html %} + +Review the semantic search workflow template [defaults](https://github.com/opensearch-project/flow-framework/blob/main/src/main/resources/defaults/semantic-search-with-local-model-defaults.json) to determine whether you need to update any of the parameters. For example, if you want to use a different model, specify the model name in the request body: + +```json +POST /_plugins/_flow_framework/workflow?use_case=semantic_search_with_local_model&provision=true +{ + "register_local_pretrained_model.name": "huggingface/sentence-transformers/msmarco-distilbert-base-tas-b" +} +``` +{% include copy-curl.html %} + +OpenSearch responds with a workflow ID for the created workflow: + +```json +{ + "workflow_id" : "U_nMXJUBq_4FYQzMOS4B" +} +``` + +To check the workflow status, send the following request: + +```json +GET /_plugins/_flow_framework/workflow/U_nMXJUBq_4FYQzMOS4B/_status +``` +{% include copy-curl.html %} + +Once the workflow completes, the `state` changes to `COMPLETED`. The workflow runs the following steps: + +1. [Step 2](#step-2-register-and-deploy-the-model) to register and deploy the model. +1. [Step 3(a)](#step-3a-create-an-ingest-pipeline) to create an ingest pipeline. +1. [Step 3(b)](#step-3b-create-a-vector-index) to create a vector index. + +You can now continue with [Step 3(c)](#step-3c-ingest-documents-into-the-index) to ingest documents into the index and [Step 4](#step-4-search-the-data) to search your data. + +## Clean up After you're done, delete the components you've created in this tutorial from the cluster: @@ -1033,4 +919,8 @@ DELETE /_plugins/_ml/model_groups/Z1eQf4oB5Vm0Tdw8EIP2 ## Further reading - Read about the basics of OpenSearch semantic search in [Building a semantic search engine in OpenSearch](https://opensearch.org/blog/semantic-search-solutions/). -- Read about the benefits of combining keyword and neural search, the normalization and combination technique options, and benchmarking tests in [The ABCs of semantic search in OpenSearch: Architectures, benchmarks, and combination strategies](https://opensearch.org/blog/semantic-science-benchmarks/). +- Read about the combining keyword and semantic search, the normalization and combination technique options, and benchmarking tests in [The ABCs of semantic search in OpenSearch: Architectures, benchmarks, and combination strategies](https://opensearch.org/blog/semantic-science-benchmarks/). + +## Next steps + +- Explore [AI search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/index/) in OpenSearch. 
\ No newline at end of file diff --git a/_tutorials/vector-search/semantic-highlighting-tutorial.md b/_tutorials/vector-search/semantic-highlighting-tutorial.md new file mode 100644 index 00000000000..4a6c11ff15e --- /dev/null +++ b/_tutorials/vector-search/semantic-highlighting-tutorial.md @@ -0,0 +1,270 @@ +--- +layout: default +title: Using semantic highlighting +parent: Vector search +grand_parent: Tutorials +nav_order: 60 +--- + +# Using semantic highlighting + +Semantic highlighting enhances search results by identifying and emphasizing the most semantically relevant sentences or passages within documents, based on the query's meaning. Unlike traditional highlighters that rely on exact keyword matches, semantic highlighting uses machine learning (ML) models to understand the context and relevance of text segments. This allows you to pinpoint the most pertinent information within a document, even if the exact search terms aren't present in the highlighted passage. For more information, see [Using the `semantic` highlighter]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/highlight#using-the-semantic-highlighter). + +This tutorial guides you through setting up and using semantic highlighting with a neural search query. + +Replace the placeholders beginning with the prefix `your_` with your own values. +{: .note} + +## Prerequisites + +To ensure local basic setup works, specify the following cluster settings: + +```json +PUT _cluster/settings +{ + "persistent": { + "plugins.ml_commons.allow_registering_model_via_url": "true", + "plugins.ml_commons.only_run_on_ml_node": "false", + "plugins.ml_commons.model_access_control_enabled": "true" + } +} +``` +{% include copy-curl.html %} + +This example uses a simple setup with no dedicated ML nodes and allows running a model on a non-ML node. On clusters with dedicated ML nodes, specify `"only_run_on_ml_node": "true"` for improved performance. For more information, see [ML Commons cluster settings]({{site.url}}{{site.baseurl}}/ml-commons-plugin/cluster-settings/). + +## Step 1: Create an index + +First, create an index to store your text data and its corresponding vector embeddings. You'll need a `text` field for the original content and a `knn_vector` field for the embeddings: + +```json +PUT neural-search-index +{ + "settings": { + "index.knn": true + }, + "mappings": { + "properties": { + "text": { + "type": "text" + }, + "text_embedding": { + "type": "knn_vector", + "dimension": 384, + "method": { + "name": "hnsw", + "space_type": "l2", + "engine": "faiss", + "parameters": { + "ef_construction": 128, + "m": 24 + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +The `dimension` field must contain your chosen embedding model's dimension. + +## Step 2: Register and deploy the ML models + +You need two types of models for semantic highlighting: + +1. **Text embedding model**: To convert the search query and document text into vectors. +2. **Sentence highlighting model**: To analyze the text and identify the most relevant sentences. + +First, register and deploy a text embedding model: + +```json +POST /_plugins/_ml/models/_register?deploy=true +{ + "name": "huggingface/sentence-transformers/all-MiniLM-L6-v2", + "version": "1.0.2", + "model_format": "TORCH_SCRIPT" +} +``` +{% include copy-curl.html %} + +This API returns a `task_id` for the deployment operation. 
Use the [Tasks API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/tasks-apis/get-task/) to monitor the deployment status: + +```json +GET /_plugins/_ml/tasks/<your-task-id> +``` +{% include copy-curl.html %} + +Once the `state` changes to `COMPLETED`, the Tasks API returns the model ID for the deployed model. Note the text embedding model ID; you'll use it in the following steps. + +Next, register a pretrained semantic sentence highlighting model: + +```json +POST /_plugins/_ml/models/_register?deploy=true +{ + "name": "amazon/sentence-highlighting/opensearch-semantic-highlighter-v1", + "version": "1.0.0", + "model_format": "TORCH_SCRIPT", + "function_name": "QUESTION_ANSWERING" +} +``` +{% include copy-curl.html %} + +Monitor the deployment status using the Tasks API. Note the semantic highlighting model ID; you'll use it in the following steps. + +## Step 3 (Optional): Configure an ingest pipeline + +To automatically generate embeddings during indexing, create an [ingest pipeline]({{site.url}}{{site.baseurl}}/ingest-pipelines/): + +```json +PUT /_ingest/pipeline/nlp-ingest-pipeline +{ + "description": "A pipeline to generate text embeddings", + "processors": [ + { + "text_embedding": { + "model_id": "your-text-embedding-model-id", + "field_map": { + "text": "text_embedding" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +Set this pipeline as the default pipeline for your index: + +```json +PUT /neural-search-index/_settings +{ + "index.default_pipeline": "nlp-ingest-pipeline" +} +``` +{% include copy-curl.html %} + +## Step 4: Index data + +Now, index some sample documents. If you configured the ingest pipeline, embeddings will be generated automatically: + +```json +POST /neural-search-index/_doc/1 +{ + "text": "Alzheimer's disease is a progressive neurodegenerative disorder characterized by accumulation of amyloid-beta plaques and neurofibrillary tangles in the brain. Early symptoms include short-term memory impairment, followed by language difficulties, disorientation, and behavioral changes. While traditional treatments such as cholinesterase inhibitors and memantine provide modest symptomatic relief, they do not alter disease progression. Recent clinical trials investigating monoclonal antibodies targeting amyloid-beta, including aducanumab, lecanemab, and donanemab, have shown promise in reducing plaque burden and slowing cognitive decline. Early diagnosis using biomarkers such as cerebrospinal fluid analysis and PET imaging may facilitate timely intervention and improved outcomes." +} +``` +{% include copy-curl.html %} + +```json +POST /neural-search-index/_doc/2 +{ + "text": "Major depressive disorder is characterized by persistent feelings of sadness, anhedonia, and neurovegetative symptoms affecting sleep, appetite, and energy levels. First-line pharmacological treatments include selective serotonin reuptake inhibitors (SSRIs) and serotonin-norepinephrine reuptake inhibitors (SNRIs), with response rates of approximately 60-70%. Cognitive-behavioral therapy demonstrates comparable efficacy to medication for mild to moderate depression and may provide more durable benefits. Treatment-resistant depression may respond to augmentation strategies including atypical antipsychotics, lithium, or thyroid hormone. Electroconvulsive therapy remains the most effective intervention for severe or treatment-resistant depression, while newer modalities such as transcranial magnetic stimulation and ketamine infusion offer promising alternatives with fewer side effects." 
+} +``` +{% include copy-curl.html %} + +```json +POST /neural-search-index/_doc/3 +{ + "text" : "Cardiovascular disease remains the leading cause of mortality worldwide, accounting for approximately one-third of all deaths. Risk factors include hypertension, diabetes mellitus, smoking, obesity, and family history. Recent advancements in preventive cardiology emphasize lifestyle modifications such as Mediterranean diet, regular exercise, and stress reduction techniques. Pharmacological interventions including statins, beta-blockers, and ACE inhibitors have significantly reduced mortality rates. Emerging treatments focus on inflammation modulation and precision medicine approaches targeting specific genetic profiles associated with cardiac pathologies." +} +``` +{% include copy-curl.html %} + +## Step 5: Perform semantic highlighting + +Combine a neural search query with the semantic highlighter: + +1. Use a `neural` query to find documents semantically similar to your query text using the text embedding model. +2. Add a `highlight` section. +3. In `highlight.fields`, specify the `text` field (or another field containing the content you want to highlight). +4. Set the `type` for this field to `semantic`. +5. Add a global `highlight.options` object. +6. In `options`, provide the `model_id` of your deployed sentence highlighting model. + +Use the following request to retrieve the top five matching documents (specified in the `k` parameter). Replace the placeholder model IDs (`TEXT_EMBEDDING_MODEL_ID` and `SEMANTIC_HIGHLIGHTING_MODEL_ID`) with the model IDs obtained after successful deployment in Step 2: + +```json +POST /neural-search-index/_search +{ + "_source": { + "excludes": ["text_embedding"] // Exclude the large embedding from the source + }, + "query": { + "neural": { + "text_embedding": { + "query_text": "treatments for neurodegenerative diseases", + "model_id": "<your-text-embedding-model-id>", + "k": 2 + } + } + }, + "highlight": { + "fields": { + "text": { + "type": "semantic" + } + }, + "options": { + "model_id": "<your-semantic-highlighting-model-id>" + } + } +} +``` +{% include copy-curl.html %} + +## Step 6: Interpret the results + +The search results include a `highlight` object within each hit. The specified `text` field in the `highlight` object contains the original text, with the most semantically relevant sentences wrapped in `<em>` tags by default: + +```json +{ + "took": 711, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 0.52716815, + "hits": [ + { + "_index": "neural-search-index", + "_id": "1", + "_score": 0.52716815, + "_source": { + "text": "Alzheimer's disease is a progressive neurodegenerative disorder ..." // Shortened for brevity + }, + "highlight": { + "text": [ + // Highlighted sentence may differ based on the exact model used + "Alzheimer's disease is a progressive neurodegenerative disorder characterized by accumulation of amyloid-beta plaques and neurofibrillary tangles in the brain. Early symptoms include short-term memory impairment, followed by language difficulties, disorientation, and behavioral changes. While traditional treatments such as cholinesterase inhibitors and memantine provide modest symptomatic relief, they do not alter disease progression. 
<em>Recent clinical trials investigating monoclonal antibodies targeting amyloid-beta, including aducanumab, lecanemab, and donanemab, have shown promise in reducing plaque burden and slowing cognitive decline.</em> Early diagnosis using biomarkers such as cerebrospinal fluid analysis and PET imaging may facilitate timely intervention and improved outcomes." + ] + } + }, + { + "_index": "neural-search-index", + "_id": "2", + "_score": 0.4364841, + "_source": { + "text": "Major depressive disorder is characterized by persistent feelings of sadness ..." // Shortened for brevity + }, + "highlight": { + "text": [ + // Highlighted sentence for document 2 + "Major depressive disorder is characterized by persistent feelings of sadness, anhedonia, and neurovegetative symptoms affecting sleep, appetite, and energy levels. First-line pharmacological treatments include selective serotonin reuptake inhibitors (SSRIs) and serotonin-norepinephrine reuptake inhibitors (SNRIs), with response rates of approximately 60-70%. <em>Cognitive-behavioral therapy demonstrates comparable efficacy to medication for mild to moderate depression and may provide more durable benefits.</em> Treatment-resistant depression may respond to augmentation strategies including atypical antipsychotics, lithium, or thyroid hormone. Electroconvulsive therapy remains the most effective intervention for severe or treatment-resistant depression, while newer modalities such as transcranial magnetic stimulation and ketamine infusion offer promising alternatives with fewer side effects." ] + } + } + ] + } +} +``` + +The `semantic` highlighter identifies the sentence determined by the model to be semantically relevant to the query ("treatments for neurodegenerative diseases") within the context of each retrieved document. You can customize the highlight tags using the `pre_tags` and `post_tags` parameters if needed. For more information, see [Changing the highlighting tags]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/highlight/#changing-the-highlighting-tags). 
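For example, the following minimal sketch repeats the query from Step 5 but replaces the default `<em>` tags with `<mark>` tags. The `<mark>` value is an arbitrary choice, and the placeholder model IDs are the same ones you noted after deployment in Step 2:

```json
POST /neural-search-index/_search
{
  "_source": {
    "excludes": ["text_embedding"]
  },
  "query": {
    "neural": {
      "text_embedding": {
        "query_text": "treatments for neurodegenerative diseases",
        "model_id": "<your-text-embedding-model-id>",
        "k": 2
      }
    }
  },
  "highlight": {
    "pre_tags": ["<mark>"],
    "post_tags": ["</mark>"],
    "fields": {
      "text": {
        "type": "semantic"
      }
    },
    "options": {
      "model_id": "<your-semantic-highlighting-model-id>"
    }
  }
}
```
{% include copy-curl.html %}

In the response, the most relevant sentences are wrapped in `<mark>...</mark>` instead of `<em>...</em>`.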
diff --git a/_tutorials/vector-search/semantic-search/index.md b/_tutorials/vector-search/semantic-search/index.md new file mode 100644 index 00000000000..649323bbf66 --- /dev/null +++ b/_tutorials/vector-search/semantic-search/index.md @@ -0,0 +1,78 @@ +--- +layout: default +title: Semantic search +parent: Vector search +has_children: true +has_toc: false +nav_order: 50 +redirect_from: + - /vector-search/tutorials/semantic-search/ + - /tutorials/vector-search/semantic-search/ +semantic_search: + - heading: "Semantic search using the OpenAI embedding model" + link: "/tutorials/vector-search/semantic-search/semantic-search-openai/" + list: + - "<b>Platform:</b> OpenSearch, Amazon OpenSearch Service" + - "<b>Model:</b> OpenAI embedding" + - "<b>Deployment:</b> Provider API" + - heading: "Semantic search using Cohere Embed" + link: "/tutorials/vector-search/semantic-search/semantic-search-cohere/" + list: + - "<b>Platform:</b> OpenSearch, Amazon OpenSearch Service" + - "<b>Model:</b> Cohere Embed" + - "<b>Deployment:</b> Provider API" + - heading: "Semantic search using Cohere Embed on Amazon Bedrock" + link: "/tutorials/vector-search/semantic-search/semantic-search-bedrock-cohere/" + list: + - "<b>Platform:</b> OpenSearch, Amazon OpenSearch Service" + - "<b>Model:</b> Cohere Embed" + - "<b>Deployment:</b> Amazon Bedrock" + - heading: Semantic search using Amazon Bedrock Titan + link: "/tutorials/vector-search/semantic-search/semantic-search-bedrock-titan/" + list: + - "<b>Platform:</b> OpenSearch, Amazon OpenSearch Service" + - "<b>Model:</b> Amazon Titan" + - "<b>Deployment:</b> Amazon Bedrock" + - heading: "Semantic search using Amazon Bedrock Titan in another account" + link: /tutorials/vector-search/semantic-search/semantic-search-bedrock-titan-other/ + list: + - "<b>Platform:</b> OpenSearch, Amazon OpenSearch Service" + - "<b>Model:</b> Amazon Titan" + - "<b>Deployment:</b> Amazon Bedrock (in a different account than your Amazon OpenSearch Service account)" + - heading: Semantic search using a model in Amazon SageMaker + link: /tutorials/vector-search/semantic-search/semantic-search-sagemaker/ + list: + - "<b>Platform:</b> OpenSearch, Amazon OpenSearch Service" + - "<b>Model:</b> Custom" + - "<b>Deployment:</b> Amazon SageMaker" + - heading: Semantic search using AWS CloudFormation and Amazon SageMaker + link: /tutorials/vector-search/semantic-search/semantic-search-cfn-sagemaker/ + list: + - "<b>Platform:</b> OpenSearch, Amazon OpenSearch Service" + - "<b>Model:</b> Custom" + - "<b>Deployment:</b> Amazon SageMaker + CloudFormation" + - heading: Semantic search using AWS CloudFormation and Amazon Bedrock + link: /tutorials/vector-search/semantic-search/semantic-search-cfn-bedrock/ + list: + - "<b>Platform:</b> OpenSearch, Amazon OpenSearch Service" + - "<b>Model:</b> Amazon Titan + Cohere" + - "<b>Deployment:</b> Amazon Bedrock + CloudFormation" + - heading: Semantic search using an asymmetric model + link: /tutorials/vector-search/semantic-search/semantic-search-asymmetric/ + list: + - "<b>Platform:</b> OpenSearch" + - "<b>Model:</b> Hugging Face Multilingual-E5-small " + - "<b>Deployment:</b> Local cluster" + - heading: "Semantic search using text chunking" + link: /tutorials/vector-search/semantic-search/long-document/ + list: + - "<b>Platform:</b> OpenSearch, Amazon OpenSearch Service" + - "<b>Model:</b> Amazon Titan Text Embeddings" + - "<b>Deployment:</b> Amazon Bedrock" +--- + +# Semantic search tutorials + +The following tutorials show you how to implement semantic search. 
+ +{% include cards.html cards=page.semantic_search %} \ No newline at end of file diff --git a/_tutorials/vector-search/semantic-search/long-document.md b/_tutorials/vector-search/semantic-search/long-document.md new file mode 100644 index 00000000000..afa4f0d69cc --- /dev/null +++ b/_tutorials/vector-search/semantic-search/long-document.md @@ -0,0 +1,482 @@ +--- +layout: default +title: Semantic search using text chunking +parent: Semantic search +grand_parent: Vector search +nav_order: 90 +redirect_from: + - /vector-search/tutorials/semantic-search/long-document/ +--- + +# Semantic search using text chunking + +This tutorial shows you how to use text chunking to run semantic search on long documents in OpenSearch 2.19 or later. + +In this tutorial, you'll use the following OpenSearch components: +- [Text chunking processor]({{site.url}}{{site.baseurl}}/ingest-pipelines/processors/text-chunking/) +- [ML inference ingest processor]({{site.url}}{{site.baseurl}}/ingest-pipelines/processors/ml-inference/) +- [ML inference search request processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/ml-inference-search-request/) +- [Template query]({{site.url}}{{site.baseurl}}/api-reference/search-template/) + +Replace the placeholders beginning with the prefix `your_` with your own values. +{: .note} + +## Step 1: Create an embedding model + +In this tutorial, you'll use the [Amazon Bedrock Titan Text Embeddings model](https://docs.aws.amazon.com/bedrock/latest/userguide/titan-embedding-models.html). + +If using Python, you can create an Amazon Bedrock Titan embedding connector and test the model using the [opensearch-py-ml](https://github.com/opensearch-project/opensearch-py-ml) client CLI. The CLI automates many configuration steps, making setup faster and reducing the chance of errors. For more information about using the CLI, see the [CLI documentation](https://opensearch-project.github.io/opensearch-py-ml/cli/index.html#). +{: .tip} + +If using self-managed OpenSearch, create a model using [the blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/bedrock_connector_titan_embedding_blueprint.md). + +If using Amazon OpenSearch Service, use [this Python notebook](https://github.com/opensearch-project/ml-commons/blob/main/docs/tutorials/aws/AIConnectorHelper.ipynb) to create the model. Alternatively, you can manually create a connector by following [this tutorial]({{site.url}}{{site.baseurl}}/vector-search/tutorials/semantic-search/semantic-search-bedrock-titan/). + +### Step 1.1: Create a connector + +To create a connector, send the following request. 
Because you'll use the [ML inference processor]({{site.url}}{{site.baseurl}}/ingest-pipelines/processors/ml-inference/) in this tutorial, you don't need to specify a pre- or post-processing function in the connector: + +```json +POST _plugins/_ml/connectors/_create +{ + "name": "Amazon Bedrock Connector: embedding", + "description": "The connector to bedrock Titan embedding model", + "version": 1, + "protocol": "aws_sigv4", + "parameters": { + "region": "us-west-2", + "service_name": "bedrock", + "model": "amazon.titan-embed-text-v2:0", + "dimensions": 1024, + "normalize": true, + "embeddingTypes": ["float"] + }, + "credential": { + "access_key": "your_aws_access_key", + "secret_key": "your_aws_secret_key", + "session_token": "your_aws_session_token" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "https://bedrock-runtime.${parameters.region}.amazonaws.com/model/${parameters.model}/invoke", + "headers": { + "content-type": "application/json", + "x-amz-content-sha256": "required" + }, + "request_body": "{ \"inputText\": \"${parameters.inputText}\", \"dimensions\": ${parameters.dimensions}, \"normalize\": ${parameters.normalize}, \"embeddingTypes\": ${parameters.embeddingTypes} }" + } + ] +} +``` +{% include copy-curl.html %} + +The response contains a connector ID: + +```json +{ + "connector_id": "vhR15JQBLopfJ2xsx9p5" +} +``` + +Note the connector ID; you'll use it in the next step. + +### Step 1.2: Register the model + +To register the model, send the following request: + +```json +POST _plugins/_ml/models/_register?deploy=true +{ + "name": "Bedrock embedding model", + "function_name": "remote", + "description": "Bedrock text embedding model v2", + "connector_id": "vhR15JQBLopfJ2xsx9p5" +} +``` +{% include copy-curl.html %} + +The response contains the model ID: + +```json +{ + "task_id": "xRR35JQBLopfJ2xsO9pU", + "status": "CREATED", + "model_id": "xhR35JQBLopfJ2xsO9pr" +} +``` + +Note the model ID; you'll use it in the next step. + +### Step 1.3: Test the model + +To test the model, send the following request: + +```json +POST /_plugins/_ml/models/xhR35JQBLopfJ2xsO9pr/_predict +{ + "parameters": { + "inputText": "hello world" + } +} +``` +{% include copy-curl.html %} + +The response contains the embeddings generated by the model: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "dataAsMap": { + "embedding": [ + -0.020442573353648186,... + ], + "embeddingsByType": { + "float": [ + -0.020442573353648186, ... + ] + }, + "inputTextTokenCount": 3.0 + } + } + ], + "status_code": 200 + } + ] +} +``` + +## Step 2: Create an ingest pipeline + +Many text embedding models have input size limitations. The [Amazon Titan Text Embeddings V2 model](https://docs.aws.amazon.com/bedrock/latest/userguide/titan-embedding-models.html) supports a maximum of 8,192 text tokens. To process long documents, you need to split them into smaller chunks and send each chunk to the model. The [text chunking processor]({{site.url}}{{site.baseurl}}/ingest-pipelines/processors/text-chunking/) splits the original document into smaller pieces, and the [ML inference processor]({{site.url}}{{site.baseurl}}/ingest-pipelines/processors/ml-inference/) generates embeddings for each chunk. 
To create an ingest pipeline containing both processors, send the following request: + +```json +PUT _ingest/pipeline/bedrock-text-embedding-pipeline +{ + "description": "ingest reviews, generate embedding, and format chunks", + "processors": [ + { + "text_chunking": { + "algorithm": { + "fixed_token_length": { + "token_limit": 100, + "overlap_rate": 0.2, + "tokenizer": "standard" + } + }, + "field_map": { + "passage_text": "passage_chunk" + } + } + }, + { + "foreach": { + "field": "passage_chunk", + "processor": { + "set": { + "field": "_ingest._value", + "value": { + "text": "{{_ingest._value}}" + } + } + } + } + }, + { + "foreach": { + "field": "passage_chunk", + "processor": { + "ml_inference": { + "model_id": "xhR35JQBLopfJ2xsO9pr", + "input_map": [ + { + "inputText": "_ingest._value.text" + } + ], + "output_map": [ + { + "_ingest._value.embedding": "embedding" + } + ] + } + } + } + } + ] +} +``` +{% include copy-curl.html %} + +To test the pipeline, send the following request: + +```json +POST _ingest/pipeline/bedrock-text-embedding-pipeline/_simulate +{ + "docs": [ + { + "_index": "testindex", + "_id": "1", + "_source":{ + "passage_text": "Ingest pipelines\nAn ingest pipeline is a sequence of processors that are applied to documents as they are ingested into an index. Each processor in a pipeline performs a specific task, such as filtering, transforming, or enriching data.\n\nProcessors are customizable tasks that run in a sequential order as they appear in the request body. This order is important, as each processor depends on the output of the previous processor. The modified documents appear in your index after the processors are applied.\n\nOpenSearch ingest pipelines compared to OpenSearch Data Prepper\nOpenSeach ingest pipelines run within the OpenSearch cluster, whereas OpenSearch Data Prepper is an external component that runs on the OpenSearch cluster.\n\nOpenSearch ingest pipelines perform actions on indexes and are preferred for use cases involving pre-processing simple datasets, machine learning (ML) processors, and vector embedding processors. OpenSearch ingest pipelines are recommended for simple data pre-processing and small datasets.\n\nOpenSearch Data Prepper is recommended for any data processing tasks it supports, particularly when dealing with large datasets and complex data pre-processing requirements. It streamlines the process of transferring and fetching large datasets while providing robust capabilities for intricate data preparation and transformation operations. Refer to the OpenSearch Data Prepper documentation for more information.\n\nOpenSearch ingest pipelines can only be managed using Ingest API operations.\n\nPrerequisites\nThe following are prerequisites for using OpenSearch ingest pipelines:\n\nWhen using ingestion in a production environment, your cluster should contain at least one node with the node roles permission set to ingest. For information about setting up node roles within a cluster, see Cluster Formation.\nIf the OpenSearch Security plugin is enabled, you must have the cluster_manage_pipelines permission to manage ingest pipelines.\nDefine a pipeline\nA pipeline definition describes the sequence of an ingest pipeline and can be written in JSON format. 
An ingest pipeline consists of the following:\n\n{\n \"description\" : \"...\"\n \"processors\" : [...]\n}\nRequest body fields\nField\tRequired\tType\tDescription\nprocessors\tRequired\tArray of processor objects\tA component that performs a specific data processing task as the data is being ingested into OpenSearch.\ndescription\tOptional\tString\tA description of the ingest pipeline.\n" + } + } + ] +} +``` +{% include copy-curl.html %} + +The response shows the processed document, which has been split into chunks and includes embeddings for each chunk: + +```json +{ + "docs": [ + { + "doc": { + "_index": "testindex", + "_id": "1", + "_source": { + "passage_text": """Ingest pipelines +An ingest pipeline is a sequence of processors that are applied to documents as they are ingested into an index. Each processor in a pipeline performs a specific task, such as filtering, transforming, or enriching data. + +Processors are customizable tasks that run in a sequential order as they appear in the request body. This order is important, as each processor depends on the output of the previous processor. The modified documents appear in your index after the processors are applied. + +OpenSearch ingest pipelines compared to OpenSearch Data Prepper +OpenSeach ingest pipelines run within the OpenSearch cluster, whereas OpenSearch Data Prepper is an external component that runs on the OpenSearch cluster. + +OpenSearch ingest pipelines perform actions on indexes and are preferred for use cases involving pre-processing simple datasets, machine learning (ML) processors, and vector embedding processors. OpenSearch ingest pipelines are recommended for simple data pre-processing and small datasets. + +OpenSearch Data Prepper is recommended for any data processing tasks it supports, particularly when dealing with large datasets and complex data pre-processing requirements. It streamlines the process of transferring and fetching large datasets while providing robust capabilities for intricate data preparation and transformation operations. Refer to the OpenSearch Data Prepper documentation for more information. + +OpenSearch ingest pipelines can only be managed using Ingest API operations. + +Prerequisites +The following are prerequisites for using OpenSearch ingest pipelines: + +When using ingestion in a production environment, your cluster should contain at least one node with the node roles permission set to ingest. For information about setting up node roles within a cluster, see Cluster Formation. +If the OpenSearch Security plugin is enabled, you must have the cluster_manage_pipelines permission to manage ingest pipelines. +Define a pipeline +A pipeline definition describes the sequence of an ingest pipeline and can be written in JSON format. An ingest pipeline consists of the following: + +{ + "description" : "..." + "processors" : [...] +} +Request body fields +Field Required Type Description +processors Required Array of processor objects A component that performs a specific data processing task as the data is being ingested into OpenSearch. +description Optional String A description of the ingest pipeline. +""", + "passage_chunk": [ + { + "text": """Ingest pipelines\nAn ingest pipeline is a sequence of processors that are applied to documents as they are ingested into an index. Each processor in a pipeline performs a specific task, such as filtering, transforming, or enriching data.\n\nProcessors are customizable tasks that run in a sequential order as they appear in the request body. 
This order is important, as each processor depends on the output of the previous processor. The modified documents appear in your index after the processors are applied.\n\nOpenSearch ingest pipelines compared to OpenSearch Data Prepper\nOpenSeach ingest pipelines run within the OpenSearch cluster, whereas OpenSearch Data Prepper is an external component that runs on the OpenSearch cluster.\n\nOpenSearch ingest pipelines perform actions on indexes and are preferred for use cases involving pre-processing simple datasets, machine learning (ML) processors, and vector embedding processors. OpenSearch ingest pipelines are recommended for simple data pre-processing and small datasets.\n\nOpenSearch Data Prepper is recommended for any data processing tasks it supports, particularly when dealing with large datasets and complex data pre-processing requirements. It streamlines the process of transferring and fetching large datasets while providing robust capabilities for intricate data preparation and transformation operations. Refer to the OpenSearch """, + "embedding": [ + 0.04044651612639427, + ... + ] + }, + { + "text": """tasks it supports, particularly when dealing with large datasets and complex data pre-processing requirements. It streamlines the process of transferring and fetching large datasets while providing robust capabilities for intricate data preparation and transformation operations. Refer to the OpenSearch Data Prepper documentation for more information.\n\nOpenSearch ingest pipelines can only be managed using Ingest API operations.\n\nPrerequisites\nThe following are prerequisites for using OpenSearch ingest pipelines:\n\nWhen using ingestion in a production environment, your cluster should contain at least one node with the node roles permission set to ingest. For information about setting up node roles within a cluster, see Cluster Formation.\nIf the OpenSearch Security plugin is enabled, you must have the cluster_manage_pipelines permission to manage ingest pipelines.\nDefine a pipeline\nA pipeline definition describes the sequence of an ingest pipeline and can be written in JSON format. An ingest pipeline consists of the following:\n\n{\n \"description\" : \"...\"\n \"processors\" : [...]\n}\nRequest body fields\nField\tRequired\tType\tDescription\nprocessors\tRequired\tArray of processor objects\tA component that performs a specific data processing task as the data is being ingested into OpenSearch.\ndescription\tOptional\tString\tA description of the ingest pipeline.\n""", + "embedding": [ + 0.02055041491985321, + ... + ] + } + ] + }, + "_ingest": { + "_value": null, + "timestamp": "2025-02-08T07:49:43.484543119Z" + } + } + } + ] +} +``` + +## Step 3: Create an index and ingest data + +To create a vector index, send the following request: + +```json +PUT opensearch_docs +{ + "settings": { + "index.knn": true, + "default_pipeline": "bedrock-text-embedding-pipeline" + }, + "mappings": { + "properties": { + "passage_chunk": { + "type": "nested", + "properties": { + "text": { + "type": "text" + }, + "embedding": { + "type": "knn_vector", + "dimension": 1024 + } + } + }, + "passage_text": { + "type": "text" + } + } + } +} +``` +{% include copy-curl.html %} + +Ingest test data into the index: + +```json +POST _bulk +{"index": {"_index": "opensearch_docs"}} +{"passage_text": "Ingest pipelines\nAn ingest pipeline is a sequence of processors that are applied to documents as they are ingested into an index. 
Each processor in a pipeline performs a specific task, such as filtering, transforming, or enriching data.\n\nProcessors are customizable tasks that run in a sequential order as they appear in the request body. This order is important, as each processor depends on the output of the previous processor. The modified documents appear in your index after the processors are applied.\n\nOpenSearch ingest pipelines compared to OpenSearch Data Prepper\nOpenSeach ingest pipelines run within the OpenSearch cluster, whereas OpenSearch Data Prepper is an external component that runs on the OpenSearch cluster.\n\nOpenSearch ingest pipelines perform actions on indexes and are preferred for use cases involving pre-processing simple datasets, machine learning (ML) processors, and vector embedding processors. OpenSearch ingest pipelines are recommended for simple data pre-processing and small datasets.\n\nOpenSearch Data Prepper is recommended for any data processing tasks it supports, particularly when dealing with large datasets and complex data pre-processing requirements. It streamlines the process of transferring and fetching large datasets while providing robust capabilities for intricate data preparation and transformation operations. Refer to the OpenSearch Data Prepper documentation for more information.\n\nOpenSearch ingest pipelines can only be managed using Ingest API operations.\n\nPrerequisites\nThe following are prerequisites for using OpenSearch ingest pipelines:\n\nWhen using ingestion in a production environment, your cluster should contain at least one node with the node roles permission set to ingest. For information about setting up node roles within a cluster, see Cluster Formation.\nIf the OpenSearch Security plugin is enabled, you must have the cluster_manage_pipelines permission to manage ingest pipelines.\nDefine a pipeline\nA pipeline definition describes the sequence of an ingest pipeline and can be written in JSON format. An ingest pipeline consists of the following:\n\n{\n \"description\" : \"...\"\n \"processors\" : [...]\n}\nRequest body fields\nField\tRequired\tType\tDescription\nprocessors\tRequired\tArray of processor objects\tA component that performs a specific data processing task as the data is being ingested into OpenSearch.\ndescription\tOptional\tString\tA description of the ingest pipeline.\n"} +{"index": {"_index": "opensearch_docs"}} +{"passage_text": "Monitors\nProactively monitor your data in OpenSearch with features available in Alerting and Anomaly Detection. For example, you can pair Anomaly Detection with Alerting to ensure that you’re notified as soon as an anomaly is detected. You can do this by setting up a detector to automatically detect outliers in your streaming data and monitors to alert you through notifications when data exceeds certain thresholds.\n\nMonitor types\nThe Alerting plugin provides the following monitor types:\n\nper query: Runs a query and generates alert notifications based on the matching criteria. See Per query monitors for information about creating and using this monitor type.\nper bucket: Runs a query that evaluates trigger criteria based on aggregated values in the dataset. See Per bucket monitors for information about creating and using this monitor type.\nper cluster metrics: Runs API requests on the cluster to monitor its health. 
See Per cluster metrics monitors for information about creating and using this monitor type.\nper document: Runs a query (or multiple queries combined by a tag) that returns individual documents that match the alert notification trigger condition. See Per document monitors for information about creating and using this monitor type.\ncomposite monitor: Runs multiple monitors in a single workflow and generates a single alert based on multiple trigger conditions. See Composite monitors for information about creating and using this monitor type.\nThe maximum number of monitors you can create is 1,000. You can change the default maximum number of alerts for your cluster by updating the plugins.alerting.monitor.max_monitors setting using the cluster settings API."} +{"index": {"_index": "opensearch_docs"}} +{"passage_text": "Search pipelines\nYou can use search pipelines to build new or reuse existing result rerankers, query rewriters, and other components that operate on queries or results. Search pipelines make it easier for you to process search queries and search results within OpenSearch. Moving some of your application functionality into an OpenSearch search pipeline reduces the overall complexity of your application. As part of a search pipeline, you specify a list of processors that perform modular tasks. You can then easily add or reorder these processors to customize search results for your application.\n\nTerminology\nThe following is a list of search pipeline terminology:\n\nSearch request processor: A component that intercepts a search request (the query and the metadata passed in the request), performs an operation with or on the search request, and returns the search request.\nSearch response processor: A component that intercepts a search response and search request (the query, results, and metadata passed in the request), performs an operation with or on the search response, and returns the search response.\nSearch phase results processor: A component that runs between search phases at the coordinating node level. A search phase results processor intercepts the results retrieved from one search phase and transforms them before passing them to the next search phase.\nProcessor: Either a search request processor or a search response processor.\nSearch pipeline: An ordered list of processors that is integrated into OpenSearch. 
The pipeline intercepts a query, performs processing on the query, sends it to OpenSearch, intercepts the results, performs processing on the results, and returns them to the calling application, as shown in the following diagram.\n"} +``` +{% include copy-curl.html %} + +To verify that the documents were properly processed, search the index to view the generated chunks and embeddings: + +```json +GET opensearch_docs/_search +``` +{% include copy-curl.html %} + +## Step 4: Search using an ML inference processor + +Create a search pipeline with an ML inference processor that converts input text into embeddings: + +```json +PUT _search/pipeline/bedrock_semantic_search_pipeline +{ + "request_processors": [ + { + "ml_inference": { + "model_id": "xhR35JQBLopfJ2xsO9pr", + "input_map": [ + { + "inputText": "ext.ml_inference.params.text" + } + ], + "output_map": [ + { + "ext.ml_inference.params.vector": "embedding" + } + ] + } + } + ] +} +``` +{% include copy-curl.html %} + +Use the following template query to run a semantic search: + +```json +GET opensearch_docs/_search?search_pipeline=bedrock_semantic_search_pipeline +{ + "query": { + "template": { + "nested": { + "path": "passage_chunk", + "query": { + "knn": { + "passage_chunk.embedding": { + "vector": "${ext.ml_inference.params.vector}", + "k": 5 + } + } + } + } + } + }, + "ext": { + "ml_inference": { + "params": { + "text": "What's OpenSearch ingest pipeline" + } + } + }, + "_source": { + "excludes": [ + "passage_chunk" + ] + }, + "size": 1 +} +``` +{% include copy-curl.html %} + +The pipeline maps `inputText` to `ext.ml_inference.params.text`. During input processing, the pipeline retrieves the value from the path `ext.ml_inference.params.text` in the search request. In this example, the value in this path is `"What's OpenSearch ingest pipeline"`, and this value is passed to the model in the `inputText` parameter. + +During search, the search query references `"vector": "${ext.ml_inference.params.vector}"`. This vector value isn't provided in the initial search request; instead, the ML inference processor generates it by invoking the Amazon Bedrock Titan Embeddings model. The model creates an embedding vector from your search text and stores the vector in `ext.ml_inference.params.vector`. OpenSearch then uses this generated vector to find similar documents: + +```json +{ + "took": 398, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": 0.78014797, + "hits": [ + { + "_index": "opensearch_docs", + "_id": "rj2T5JQBg4dihuRifxJT", + "_score": 0.78014797, + "_source": { + "passage_text": """Ingest pipelines +An ingest pipeline is a sequence of processors that are applied to documents as they are ingested into an index. Each processor in a pipeline performs a specific task, such as filtering, transforming, or enriching data. + +Processors are customizable tasks that run in a sequential order as they appear in the request body. This order is important, as each processor depends on the output of the previous processor. The modified documents appear in your index after the processors are applied. + +OpenSearch ingest pipelines compared to OpenSearch Data Prepper +OpenSeach ingest pipelines run within the OpenSearch cluster, whereas OpenSearch Data Prepper is an external component that runs on the OpenSearch cluster. 
+ +OpenSearch ingest pipelines perform actions on indexes and are preferred for use cases involving pre-processing simple datasets, machine learning (ML) processors, and vector embedding processors. OpenSearch ingest pipelines are recommended for simple data pre-processing and small datasets. + +OpenSearch Data Prepper is recommended for any data processing tasks it supports, particularly when dealing with large datasets and complex data pre-processing requirements. It streamlines the process of transferring and fetching large datasets while providing robust capabilities for intricate data preparation and transformation operations. Refer to the OpenSearch Data Prepper documentation for more information. + +OpenSearch ingest pipelines can only be managed using Ingest API operations. + +Prerequisites +The following are prerequisites for using OpenSearch ingest pipelines: + +When using ingestion in a production environment, your cluster should contain at least one node with the node roles permission set to ingest. For information about setting up node roles within a cluster, see Cluster Formation. +If the OpenSearch Security plugin is enabled, you must have the cluster_manage_pipelines permission to manage ingest pipelines. +Define a pipeline +A pipeline definition describes the sequence of an ingest pipeline and can be written in JSON format. An ingest pipeline consists of the following: + +{ + "description" : "..." + "processors" : [...] +} +Request body fields +Field Required Type Description +processors Required Array of processor objects A component that performs a specific data processing task as the data is being ingested into OpenSearch. +description Optional String A description of the ingest pipeline. +""" + } + } + ] + } +} +``` diff --git a/_tutorials/vector-search/semantic-search/semantic-search-asymmetric.md b/_tutorials/vector-search/semantic-search/semantic-search-asymmetric.md new file mode 100644 index 00000000000..9881e9096f8 --- /dev/null +++ b/_tutorials/vector-search/semantic-search/semantic-search-asymmetric.md @@ -0,0 +1,535 @@ +--- +layout: default +title: Semantic search using an asymmetric embedding model +parent: Semantic search +grand_parent: Vector search +nav_order: 80 +redirect_from: + - /vector-search/tutorials/semantic-search/semantic-search-asymmetric/ +--- + +# Semantic search using an asymmetric embedding model + +This tutorial shows you how to perform semantic search by generating text embeddings using an asymmetric embedding model. The tutorial uses the multilingual `intfloat/multilingual-e5-small` model from Hugging Face. For more information, see [Semantic search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/semantic-search/). + +Replace the placeholders beginning with the prefix `your_` with your own values. +{: .note} + +## Step 1: Update cluster settings + +To configure your cluster to allow you to register models using external URLs and run models on non-machine learning (ML) nodes, send the following request: + +```json +PUT _cluster/settings +{ + "persistent": { + "plugins.ml_commons.allow_registering_model_via_url": "true", + "plugins.ml_commons.only_run_on_ml_node": "false", + "plugins.ml_commons.model_access_control_enabled": "true", + "plugins.ml_commons.native_memory_threshold": "99" + } +} +``` +{% include copy-curl.html %} + +## Step 2: Prepare the model for use in OpenSearch + +In this tutorial, you’ll use the Hugging Face `intfloat/multilingual-e5-small` model. 
Follow these steps to prepare and compress the model into a zip file for use in OpenSearch. + +### Step 2.1: Download the model from Hugging Face + +To download the model, use the following steps: + +1. Install Git Large File Storage (LFS), if you haven't already: + + ```bash + git lfs install + ``` + {% include copy.html %} + +2. Clone the model repository: + + ```bash + git clone https://huggingface.co/intfloat/multilingual-e5-small + ``` + {% include copy.html %} + +The model files are now downloaded into a directory on your local machine. + +### Step 2.2: Compress the model files + +To upload the model to OpenSearch, you must compress the necessary model files (`model.onnx`, `sentencepiece.bpe.model`, and `tokenizer.json`). You can find these files in the `onnx` directory of the cloned repository. + +To compress the files, run the following command in the directory containing them: + +```bash +zip -r intfloat-multilingual-e5-small-onnx.zip model.onnx tokenizer.json sentencepiece.bpe.model +``` +{% include copy.html %} + +The files are now archived in a zip file named `intfloat-multilingual-e5-small-onnx.zip`. + +### Step 2.3: Calculate the model file's hash + +Before registering the model, you must calculate the SHA-256 hash of the zip file. Run this command to generate the hash: + +```bash +shasum -a 256 intfloat-multilingual-e5-small-onnx.zip +``` +{% include copy.html %} + +Note the hash value; you'll need it during model registration. + +### Step 2.4: Serve the model file using a Python HTTP server + +To allow OpenSearch to access the model file, you can serve it through HTTP. Because this tutorial uses a local development environment, you can use Python's built-in HTTP server command. + +Navigate to the directory containing the zip file and run the following command: + +```bash +python3 -m http.server 8080 --bind 0.0.0.0 +``` +{% include copy.html %} + +This will serve the zip file at `http://0.0.0.0:8080/intfloat-multilingual-e5-small-onnx.zip`. After registering the model, you can stop the server by pressing `Ctrl+C`. + +## Step 3: Register a model group + +Before registering the model itself, you need to create a model group. This helps organize models in OpenSearch. Run the following request to create a new model group: + +```json +POST /_plugins/_ml/model_groups/_register +{ + "name": "Asymmetric Model Group", + "description": "A model group for local asymmetric models" +} +``` +{% include copy-curl.html %} + +Note the model group ID returned in the response; you'll use it to register the model. 
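The response is similar to the following. The `model_group_id` value shown here is only a placeholder; use the ID returned by your cluster:

```json
{
  "model_group_id": "your_group_id",
  "status": "CREATED"
}
```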
+ +## Step 4: Register the model + +Now that you have the model zip file and the model group ID, you can register the model in OpenSearch: + +```json +POST /_plugins/_ml/models/_register +{ + "name": "e5-small-onnx", + "version": "1.0.0", + "description": "Asymmetric multilingual-e5-small model", + "model_format": "ONNX", + "model_group_id": "your_group_id", + "model_content_hash_value": "your_model_zip_content_hash_value", + "model_config": { + "model_type": "bert", + "embedding_dimension": 384, + "framework_type": "sentence_transformers", + "query_prefix": "query: ", + "passage_prefix": "passage: ", + "all_config": "{ \"_name_or_path\": \"intfloat/multilingual-e5-small\", \"architectures\": [ \"BertModel\" ], \"attention_probs_dropout_prob\": 0.1, \"hidden_size\": 384, \"num_attention_heads\": 12, \"num_hidden_layers\": 12, \"tokenizer_class\": \"XLMRobertaTokenizer\" }" + }, + "url": "http://localhost:8080/intfloat-multilingual-e5-small-onnx.zip" +} +``` +{% include copy-curl.html %} + +Replace `your_group_id` and `your_model_zip_content_hash_value` with the values from previous steps. This will initiate the model registration process, and you'll receive a task ID in the response. + +To check the status of the registration, run the following request: + +```json +GET /_plugins/_ml/tasks/your_task_id +``` +{% include copy-curl.html %} + +Once the task completes, note the model ID; you'll need it for deployment and inference. + +## Step 5: Deploy the model + +After the model is registered, deploy it by running the following request: + +```json +POST /_plugins/_ml/models/your_model_id/_deploy +``` +{% include copy-curl.html %} + +Use the task ID to check the status of the deployment: + +```json +GET /_plugins/_ml/tasks/your_task_id +``` +{% include copy-curl.html %} + +When the model is successfully deployed, its state changes to **DEPLOYED** and it is ready to use. + +## Step 6: Generate embeddings + +Now that your model is deployed, you can use it to generate text embeddings for both queries and passages. + +### Generating passage embeddings + +To generate embeddings for a passage, use the following request: + +```json +POST /_plugins/_ml/_predict/text_embedding/your_model_id +{ + "parameters": { + "content_type": "passage" + }, + "text_docs": [ + "Today is Friday, tomorrow will be my break day. After that, I will go to the library. When is lunch?" + ], + "target_response": ["sentence_embedding"] +} +``` +{% include copy-curl.html %} + +The response contains the generated embeddings: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "sentence_embedding", + "data_type": "FLOAT32", + "shape": [384], + "data": [0.0419328, 0.047480892, ..., 0.31158513, 0.21784715] + } + ] + } + ] +} +``` +{% include copy-curl.html %} + +### Generating query embeddings + +Similarly, you can generate embeddings for a query: + +```json +POST /_plugins/_ml/_predict/text_embedding/your_model_id +{ + "parameters": { + "content_type": "query" + }, + "text_docs": ["What day is it today?"], + "target_response": ["sentence_embedding"] +} +``` +{% include copy-curl.html %} + +The response contains the generated embeddings: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "sentence_embedding", + "data_type": "FLOAT32", + "shape": [384], + "data": [0.2338349, -0.13603798, ..., 0.37335885, 0.10653384] + } + ] + } + ] +} +``` +{% include copy-curl.html %} + +# Step 7: Run semantic search + +Now you'll run semantic search using the generated embeddings. 
First, you'll create an ingest pipeline +using an ML inference processor to create document embeddings during ingestion. Then you'll create a search pipeline to generate query embeddings using +the same asymmetric embedding model. + +## Step 7.1: Create a vector index + +To create a vector index, send the following request: + +```json +PUT nyc_facts +{ + "settings": { + "index": { + "default_pipeline": "asymmetric_embedding_ingest_pipeline", + "knn": true, + "knn.algo_param.ef_search": 100 + } + }, + "mappings": { + "properties": { + "fact_embedding": { + "type": "knn_vector", + "dimension": 384, + "method": { + "name": "hnsw", + "space_type": "l2", + "engine": "nmslib", + "parameters": { + "ef_construction": 128, + "m": 24 + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +### Step 7.2: Create an ingest pipeline + +To create an ingest pipeline for generating document embeddings, send the following request: + +```json +PUT _ingest/pipeline/asymmetric_embedding_ingest_pipeline +{ + "description": "ingest passage text and generate a embedding using an asymmetric model", + "processors": [ + { + "ml_inference": { + + "model_input": "{\"text_docs\":[\"${input_map.text_docs}\"],\"target_response\":[\"sentence_embedding\"],\"parameters\":{\"content_type\":\"query\"}}", + "function_name": "text_embedding", + "model_id": "{{ _.model_id }}", + "input_map": [ + { + "text_docs": "description" + } + ], + "output_map": [ + { + "fact_embedding": "$.inference_results[0].output[0].data", + "embedding_size": "$.inference_results.*.output.*.shape[0]" + } + ] + } + } + ] +} +``` +{% include copy-curl.html %} + +### 2.3 Test the pipeline + +Test the pipeline by running the following request: + +```json +POST /_ingest/pipeline/asymmetric_embedding_ingest_pipeline/_simulate +{ + "docs": [ + { + "_index": "my-index", + "_id": "1", + "_source": { + "title": "Central Park", + "description": "A large public park in the heart of New York City, offering a wide range of recreational activities." + } + } + ] +} +``` +{% include copy-curl.html %} + +The response contains the embeddings generated by the model: + +```json +{ + "docs": [ + { + "doc": { + "_index": "my-index", + "_id": "1", + "_source": { + "description": "A large public park in the heart of New York City, offering a wide range of recreational activities.", + "fact_embedding": [ + [ + 0.06344555, + 0.30067796, + ... + 0.014804064, + -0.022822019 + ] + ], + "title": "Central Park", + "embedding_size": [ + 384.0 + ] + }, + "_ingest": { + "timestamp": "2024-12-16T20:59:07.152169Z" + } + } + } + ] +} +``` + +### Step 7.4: Ingest data + +When you perform bulk ingestion, the ingest pipeline will generate embeddings for each document: + +```json +POST /_bulk +{ "index": { "_index": "nyc_facts" } } +{ "title": "Central Park", "description": "A large public park in the heart of New York City, offering a wide range of recreational activities." } +{ "index": { "_index": "nyc_facts" } } +{ "title": "Empire State Building", "description": "An iconic skyscraper in New York City offering breathtaking views from its observation deck." } +{ "index": { "_index": "nyc_facts" } } +{ "title": "Statue of Liberty", "description": "A colossal neoclassical sculpture on Liberty Island, symbolizing freedom and democracy in the United States." } +{ "index": { "_index": "nyc_facts" } } +{ "title": "Brooklyn Bridge", "description": "A historic suspension bridge connecting Manhattan and Brooklyn, offering pedestrian walkways with great views." 
} +{ "index": { "_index": "nyc_facts" } } +{ "title": "Times Square", "description": "A bustling commercial and entertainment hub in Manhattan, known for its neon lights and Broadway theaters." } +{ "index": { "_index": "nyc_facts" } } +{ "title": "Yankee Stadium", "description": "Home to the New York Yankees, this baseball stadium is a historic landmark in the Bronx." } +{ "index": { "_index": "nyc_facts" } } +{ "title": "The Bronx Zoo", "description": "One of the largest zoos in the world, located in the Bronx, featuring diverse animal exhibits and conservation efforts." } +{ "index": { "_index": "nyc_facts" } } +{ "title": "New York Botanical Garden", "description": "A large botanical garden in the Bronx, known for its diverse plant collections and stunning landscapes." } +{ "index": { "_index": "nyc_facts" } } +{ "title": "Flushing Meadows-Corona Park", "description": "A major park in Queens, home to the USTA Billie Jean King National Tennis Center and the Unisphere." } +{ "index": { "_index": "nyc_facts" } } +{ "title": "Citi Field", "description": "The home stadium of the New York Mets, located in Queens, known for its modern design and fan-friendly atmosphere." } +{ "index": { "_index": "nyc_facts" } } +{ "title": "Rockefeller Center", "description": "A famous complex of commercial buildings in Manhattan, home to the NBC studios and the annual ice skating rink." } +{ "index": { "_index": "nyc_facts" } } +{ "title": "Queens Botanical Garden", "description": "A peaceful, beautiful botanical garden located in Flushing, Queens, featuring seasonal displays and plant collections." } +{ "index": { "_index": "nyc_facts" } } +{ "title": "Arthur Ashe Stadium", "description": "The largest tennis stadium in the world, located in Flushing Meadows-Corona Park, Queens, hosting the U.S. Open." } +{ "index": { "_index": "nyc_facts" } } +{ "title": "Wave Hill", "description": "A public garden and cultural center in the Bronx, offering stunning views of the Hudson River and a variety of nature programs." } +{ "index": { "_index": "nyc_facts" } } +{ "title": "Louis Armstrong House", "description": "The former home of jazz legend Louis Armstrong, located in Corona, Queens, now a museum celebrating his life and music." 
} +``` +{% include copy-curl.html %} + +### Step 7.5: Create a search pipeline + +Create a search pipeline that converts your query into embeddings and runs a vector search on the index to return the best-matching documents: + +```json +PUT /_search/pipeline/asymmetric_embedding_search_pipeline +{ + "description": "ingest passage text and generate a embedding using an asymmetric model", + "request_processors": [ + { + "ml_inference": { + "query_template": "{\"size\": 3,\"query\": {\"knn\": {\"fact_embedding\": {\"vector\": ${query_embedding},\"k\": 4}}}}", + "function_name": "text_embedding", + "model_id": "{{ _.model_id }}", + "model_input": "{ \"text_docs\": [\"${input_map.query}\"], \"target_response\": [\"sentence_embedding\"], \"parameters\" : {\"content_type\" : \"query\" } }", + "input_map": [ + { + "query": "query.term.fact_embedding.value" + } + ], + "output_map": [ + { + "query_embedding": "$.inference_results[0].output[0].data", + "embedding_size": "$.inference_results.*.output.*.shape[0]" + } + ] + } + } + ] +} +``` +{% include copy-curl.html %} + +### Step 7.6: Run a query + +Run a query using the search pipeline created in the previous step: + +```json +GET /nyc_facts/_search?search_pipeline=asymmetric_embedding_search_pipeline +{ + "query": { + "term": { + "fact_embedding": { + "value": "What are some places for sports in NYC?", + "boost": 1 + } + } + } +} +``` + +The response contains the top three matching documents: + +```json +{ + "took": 22, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4, + "relation": "eq" + }, + "max_score": 0.12496973, + "hits": [ + { + "_index": "nyc_facts", + "_id": "hb9X0ZMBICPs-TP0ijZX", + "_score": 0.12496973, + "_source": { + "fact_embedding": [ + ... + ], + "embedding_size": [ + 384.0 + ], + "description": "A large public park in the heart of New York City, offering a wide range of recreational activities.", + "title": "Central Park" + } + }, + { + "_index": "nyc_facts", + "_id": "ir9X0ZMBICPs-TP0ijZX", + "_score": 0.114651985, + "_source": { + "fact_embedding": [ + ... + ], + "embedding_size": [ + 384.0 + ], + "description": "Home to the New York Yankees, this baseball stadium is a historic landmark in the Bronx.", + "title": "Yankee Stadium" + } + }, + { + "_index": "nyc_facts", + "_id": "j79X0ZMBICPs-TP0ijZX", + "_score": 0.110090025, + "_source": { + "fact_embedding": [ + ... + ], + "embedding_size": [ + 384.0 + ], + "description": "A famous complex of commercial buildings in Manhattan, home to the NBC studios and the annual ice skating rink.", + "title": "Rockefeller Center" + } + } + ] + } +} +``` +--- + +## References + +- Wang, Liang, et al. (2024). *Multilingual E5 Text Embeddings: A Technical Report*. arXiv preprint arXiv:2402.05672. 
[Link](https://arxiv.org/abs/2402.05672) \ No newline at end of file diff --git a/_tutorials/vector-search/semantic-search/semantic-search-bedrock-cohere.md b/_tutorials/vector-search/semantic-search/semantic-search-bedrock-cohere.md new file mode 100644 index 00000000000..d23879a8faa --- /dev/null +++ b/_tutorials/vector-search/semantic-search/semantic-search-bedrock-cohere.md @@ -0,0 +1,424 @@ +--- +layout: default +title: Semantic search using Cohere Embed on Amazon Bedrock +parent: Semantic search +grand_parent: Vector search +nav_order: 35 +redirect_from: + - /vector-search/tutorials/semantic-search/semantic-search-bedrock-cohere/ +--- + +# Semantic search using Cohere Embed on Amazon Bedrock + +This tutorial shows you how to implement semantic search in [Amazon OpenSearch Service](https://docs.aws.amazon.com/opensearch-service/) using the [Cohere Embed model](https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-embed.html). For more information, see [Semantic search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/semantic-search/). + +If using Python, you can create a Cohere connector and test the model using the [opensearch-py-ml](https://github.com/opensearch-project/opensearch-py-ml) client CLI. The CLI automates many configuration steps, making setup faster and reducing the chance of errors. For more information about using the CLI, see the [CLI documentation](https://opensearch-project.github.io/opensearch-py-ml/cli/index.html#). +{: .tip} + +If using self-managed OpenSearch instead of Amazon OpenSearch Service, create a connector to the model on Amazon Bedrock using [the blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/bedrock_connector_cohere_cohere.embed-english-v3_blueprint.md). For more information about creating a connector, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). + +The easiest way to set up an embedding model in Amazon OpenSearch Service is by using [AWS CloudFormation](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/cfn-template.html). Alternatively, you can set up an embedding model using [the AIConnectorHelper notebook](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/tutorials/aws/AIConnectorHelper.ipynb). +{: .tip} + +Amazon Bedrock has a [quota limit](https://docs.aws.amazon.com/bedrock/latest/userguide/quotas.html). For more information about increasing this limit, see [Increase model invocation capacity with Provisioned Throughput in Amazon Bedrock](https://docs.aws.amazon.com/bedrock/latest/userguide/prov-throughput.html). +{: .warning} + +Replace the placeholders beginning with the prefix `your_` with your own values. +{: .note} + +## Prerequisite: Create an OpenSearch cluster + +Go to the [Amazon OpenSearch Service console](https://console.aws.amazon.com/aos/home) and create an OpenSearch domain. + +Note the domain Amazon Resource Name (ARN); you'll use it in the following steps. + +## Step 1: Create an IAM role to invoke the model on Amazon Bedrock + +To invoke the model on Amazon Bedrock, you must create an AWS Identity and Access Management (IAM) role with appropriate permissions. The connector will use this role to invoke the model. 
+ +Go to the IAM console, create a new IAM role named `my_invoke_bedrock_cohere_role`, and add the following trust policy and permissions: + +- Custom trust policy: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": "es.amazonaws.com" + }, + "Action": "sts:AssumeRole" + } + ] +} +``` +{% include copy.html %} + +- Permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Action": [ + "bedrock:InvokeModel" + ], + "Effect": "Allow", + "Resource": "arn:aws:bedrock:*::foundation-model/cohere.embed-english-v3" + } + ] +} +``` +{% include copy.html %} + +If you need a model with multilingual support, you can use the `cohere.embed-multilingual-v3` model. +{: .tip} + +Note the role ARN; you'll use it in the following steps. + +## Step 2: Configure an IAM role in Amazon OpenSearch Service + +Follow these steps to configure an IAM role in Amazon OpenSearch Service. + +### Step 2.1: Create an IAM role for signing connector requests + +Generate a new IAM role specifically for signing your Create Connector API request. + +Create an IAM role named `my_create_bedrock_cohere_connector_role` with the following trust policy and permissions: + +- Custom trust policy: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "AWS": "your_iam_user_arn" + }, + "Action": "sts:AssumeRole" + } + ] +} +``` +{% include copy.html %} + +You'll use the `your_iam_user_arn` IAM user to assume the role in Step 3.1. + +- Permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": "iam:PassRole", + "Resource": "your_iam_role_arn_created_in_step1" + }, + { + "Effect": "Allow", + "Action": "es:ESHttpPost", + "Resource": "your_opensearch_domain_arn" + } + ] +} +``` +{% include copy.html %} + +Note this role ARN; you'll use it in the following steps. + +### Step 2.2: Map a backend role + +Follow these steps to map a backend role: + +1. Log in to OpenSearch Dashboards and select **Security** on the top menu. +2. Select **Roles**, and then select the **ml_full_access** role. +3. On the **ml_full_access** role details page, select **Mapped users**, and then select **Manage mapping**. +4. Enter the IAM role ARN created in Step 2.1 in the **Backend roles** field, as shown in the following image. + ![Mapping a backend role]({{site.url}}{{site.baseurl}}/images/vector-search-tutorials/mapping_iam_role_arn.png) +5. Select **Map**. + +The IAM role is now successfully configured in your OpenSearch cluster. + +## Step 3: Create a connector + +Follow these steps to create a connector for the model. For more information about creating a connector, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). 
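The next step uses the AWS CLI to obtain temporary credentials. For reference, the `assume-role` call returns standard AWS Security Token Service (STS) fields shaped similarly to the following sketch (values abridged); in Step 3.1 you'll copy the three credential values into `~/.aws/credentials`:

```json
{
  "Credentials": {
    "AccessKeyId": "...",
    "SecretAccessKey": "...",
    "SessionToken": "...",
    "Expiration": "..."
  },
  "AssumedRoleUser": {
    "AssumedRoleId": "...",
    "Arn": "..."
  }
}
```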
+ +### Step 3.1: Get temporary credentials + +Use the credentials of the IAM user specified in Step 2.1 to assume the role: + +```bash +aws sts assume-role --role-arn your_iam_role_arn_created_in_step2.1 --role-session-name your_session_name +``` +{% include copy.html %} + +Copy the temporary credentials from the response and configure them in `~/.aws/credentials`: + +```ini +[default] +AWS_ACCESS_KEY_ID=your_access_key_of_role_created_in_step2.1 +AWS_SECRET_ACCESS_KEY=your_secret_key_of_role_created_in_step2.1 +AWS_SESSION_TOKEN=your_session_token_of_role_created_in_step2.1 +``` +{% include copy.html %} + +### Step 3.2: Create a connector + +Run the following Python code with the temporary credentials configured in `~/.aws/credentials`: + +```python +import boto3 +import requests +from requests_aws4auth import AWS4Auth + +host = 'your_amazon_opensearch_domain_endpoint' +region = 'your_amazon_opensearch_domain_region' +service = 'es' + +credentials = boto3.Session().get_credentials() +awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service, session_token=credentials.token) + +path = '/_plugins/_ml/connectors/_create' +url = host + path + +payload = { + "name": "Amazon Bedrock Cohere Connector: embedding v3", + "description": "The connector to Bedrock Cohere embedding model", + "version": 1, + "protocol": "aws_sigv4", + "parameters": { + "region": "your_bedrock_model_region", + "service_name": "bedrock", + "input_type":"search_document", + "truncate": "END" + }, + "credential": { + "roleArn": "your_iam_role_arn_created_in_step1" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "https://bedrock-runtime.your_bedrock_model_region.amazonaws.com/model/cohere.embed-english-v3/invoke", + "headers": { + "content-type": "application/json", + "x-amz-content-sha256": "required" + }, + "request_body": "{ \"texts\": ${parameters.texts}, \"truncate\": \"${parameters.truncate}\", \"input_type\": \"${parameters.input_type}\" }", + "pre_process_function": "connector.pre_process.cohere.embedding", + "post_process_function": "connector.post_process.cohere.embedding" + } + ] +} + +headers = {"Content-Type": "application/json"} + +r = requests.post(url, auth=awsauth, json=payload, headers=headers) +print(r.text) +``` +{% include copy.html %} + +For more information, see the [Cohere blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/cohere_connector_embedding_blueprint.md). + +The script outputs a connector ID: + +```json +{"connector_id":"1p0u8o0BWbTmLN9F2Y7m"} +``` + +Note the connector ID; you'll use it in the next step. + +## Step 4: Create and test the model + +Log in to OpenSearch Dashboards, open the DevTools console, and run the following requests to create and test the model. + +1. Create a model group: + + ```json + POST /_plugins/_ml/model_groups/_register + { + "name": "Bedrock_embedding_model", + "description": "Test model group for bedrock embedding model" + } + ``` + {% include copy-curl.html %} + + The response contains the model group ID: + + ```json + { + "model_group_id": "050q8o0BWbTmLN9Foo4f", + "status": "CREATED" + } + ``` + +2. 
Register the model: + + ```json + POST /_plugins/_ml/models/_register + { + "name": "Bedrock Cohere embedding model v3", + "function_name": "remote", + "description": "test embedding model", + "model_group_id": "050q8o0BWbTmLN9Foo4f", + "connector_id": "0p0p8o0BWbTmLN9F-o4G" + } + ``` + {% include copy-curl.html %} + + The response contains the model ID: + + ```json + { + "task_id": "TRUr8o0BTaDH9c7tSRfx", + "status": "CREATED", + "model_id": "VRUu8o0BTaDH9c7t9xet" + } + ``` + +3. Deploy the model: + + ```json + POST /_plugins/_ml/models/VRUu8o0BTaDH9c7t9xet/_deploy + ``` + {% include copy-curl.html %} + + The response contains a task ID for the deployment operation: + + ```json + { + "task_id": "1J0r8o0BWbTmLN9FjY6I", + "task_type": "DEPLOY_MODEL", + "status": "COMPLETED" + } + ``` + +4. Test the model: + + ```json + POST /_plugins/_ml/models/VRUu8o0BTaDH9c7t9xet/_predict + { + "parameters": { + "texts": ["hello world"] + } + } + ``` + {% include copy-curl.html %} + + The response contains the embeddings generated by the model: + + ```json + { + "inference_results": [ + { + "output": [ + { + "name": "sentence_embedding", + "data_type": "FLOAT32", + "shape": [ + 1024 + ], + "data": [ + -0.02973938, + -0.023651123, + -0.06021118, + ...] + } + ], + "status_code": 200 + } + ] + } + ``` + +## Step 5: Configure semantic search + +Follow these steps to configure semantic search. + +### Step 5.1: Create an ingest pipeline + +First, create an [ingest pipeline]({{site.url}}{{site.baseurl}}/ingest-pipelines/) that uses the model in Amazon SageMaker to create embeddings from the input text: + +```json +PUT /_ingest/pipeline/my_bedrock_cohere_embedding_pipeline +{ + "description": "text embedding pipeline", + "processors": [ + { + "text_embedding": { + "model_id": "your_bedrock_embedding_model_id_created_in_step4", + "field_map": { + "text": "text_knn" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +### Step 5.2: Create a vector index + +Next, create a vector index for storing the input text and generated embeddings: + +```json +PUT my_index +{ + "settings": { + "index": { + "knn.space_type": "cosinesimil", + "default_pipeline": "my_bedrock_cohere_embedding_pipeline", + "knn": "true" + } + }, + "mappings": { + "properties": { + "text_knn": { + "type": "knn_vector", + "dimension": 1024 + } + } + } +} +``` +{% include copy-curl.html %} + +### Step 5.3: Ingest data + +Ingest a sample document into the index: + +```json +POST /my_index/_doc/1000001 +{ + "text": "hello world." 
+} +``` +{% include copy-curl.html %} + +### Step 5.4: Search the index + +Run a vector search to retrieve documents from the vector index: + +```json +POST /my_index/_search +{ + "query": { + "neural": { + "text_knn": { + "query_text": "hello", + "model_id": "your_embedding_model_id_created_in_step4", + "k": 100 + } + } + }, + "size": "1", + "_source": ["text"] +} +``` +{% include copy-curl.html %} \ No newline at end of file diff --git a/_tutorials/vector-search/semantic-search/semantic-search-bedrock-titan-other.md b/_tutorials/vector-search/semantic-search/semantic-search-bedrock-titan-other.md new file mode 100644 index 00000000000..b1a9f8d4545 --- /dev/null +++ b/_tutorials/vector-search/semantic-search/semantic-search-bedrock-titan-other.md @@ -0,0 +1,474 @@ +--- +layout: default +title: Semantic search using Amazon Bedrock Titan in another account +parent: Semantic search +grand_parent: Vector search +nav_order: 50 +redirect_from: + - /vector-search/tutorials/semantic-search/semantic-search-bedrock-titan-other/ +--- + +# Semantic search using Amazon Bedrock Titan in another account + +Starting with OpenSearch version 2.15, you must configure a connector to an Amazon Bedrock model hosted in a different account than the account hosting Amazon OpenSearch Service. This tutorial shows you how to implement semantic search in [Amazon OpenSearch Service](https://docs.aws.amazon.com/opensearch-service/) using the [Amazon Bedrock Titan embedding model](https://docs.aws.amazon.com/bedrock/latest/userguide/titan-embedding-models.html) hosted in another account. For more information, see [Semantic search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/semantic-search/). + +Amazon Bedrock has a [quota limit](https://docs.aws.amazon.com/bedrock/latest/userguide/quotas.html). For more information about increasing this limit, see [Increase model invocation capacity with Provisioned Throughput in Amazon Bedrock](https://docs.aws.amazon.com/bedrock/latest/userguide/prov-throughput.html). +{: .warning} + +Replace the placeholders beginning with the prefix `your_` with your own values. +{: .note} + +# Overview + +In this tutorial, you'll use two AWS accounts: Account A (hosting Amazon OpenSearch Service) and Account B (hosting an Amazon Bedrock model). + +To invoke a model hosted in a different account than the account hosting Amazon OpenSearch Service, you must configure two roles in the connector credentials: + +- `roleArn`: The role in Account A that is used to assume the external account role in Account B. +- `externalAccountRoleArn`: The role in Account B that is used to invoke the Amazon Bedrock model. + +In this tutorial , you'll use the following role names: + +- Account A: `my_cross_account_role_accountA` + + Amazon Resource Name (ARN): `arn:aws:iam::<your_aws_account_A>:role/my_cross_account_role_accountA` + +- Account B: `my_invoke_bedrock_role_accountB` + + ARN: `arn:aws:iam::<your_aws_account_B>:role/my_invoke_bedrock_role_accountB` + +## Prerequisite: Create an OpenSearch cluster + +Go to the [Amazon OpenSearch Service console](https://console.aws.amazon.com/aos/home) and create an OpenSearch domain. + +Note the domain ARN; you'll use it in the following steps. + +## Step 1: Create an IAM role in Account B + +To invoke the model on Amazon Bedrock, you must create an AWS Identity and Access Management (IAM) role with appropriate permissions. The connector will use this role to invoke the model. 
+ +Go to the IAM console, create a new IAM role named `my_invoke_bedrock_role_accountB`, and add the following trust policy and permissions: + +- Custom trust policy: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "", + "Effect": "Allow", + "Principal": { + "AWS": "arn:aws:iam::<your_aws_account_A>:role/my_cross_account_role_accountA" + }, + "Action": "sts:AssumeRole" + } + ] +} +``` +{% include copy.html %} + +- Permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Action": [ + "bedrock:InvokeModel" + ], + "Effect": "Allow", + "Resource": "arn:aws:bedrock:*::foundation-model/amazon.titan-embed-text-v1" + } + ] +} +``` +{% include copy.html %} + +Note the role ARN; you'll use it in the following steps. + +## 2. Create an IAM role in Account A + +Follow these steps to configure an IAM role in Amazon OpenSearch Service. + +### Step 2.1: Create an IAM role for assuming externalAccountRoleArn + +Create an IAM role for assuming `externalAccountRoleArn` in Account B. + +Go to the IAM console, create a new IAM role named `my_cross_account_role_accountA` , and add the following trust policy and permissions: + +- Custom trust policy: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": "es.amazonaws.com" + }, + "Action": "sts:AssumeRole" + } + ] +} +``` +{% include copy.html %} + +- Permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": "sts:AssumeRole", + "Resource": "arn:aws:iam::<your_aws_account_B>:role/my_invoke_bedrock_role_accountB" + } + ] +} +``` +{% include copy.html %} + +Note the role ARN; you'll use it in the following steps. + +### Step 2.2: Create an IAM role for signing connector requests + +Generate a new IAM role specifically for signing your Create Connector API request. + +Create an IAM role named `my_create_connector_role_accountA` with the following trust policy and permissions: + +- Custom trust policy: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "AWS": "your_iam_user_arn" + }, + "Action": "sts:AssumeRole" + } + ] +} +``` +{% include copy.html %} + +You'll use the `your_iam_user_arn` IAM user to assume the role in Step 3.1. + +- Permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": "iam:PassRole", + "Resource": "arn:aws:iam::<your_aws_account_A>:role/my_cross_account_role_accountA" + }, + { + "Effect": "Allow", + "Action": "es:ESHttpPost", + "Resource": "your_opensearch_domain_arn_created" + } + ] +} +``` +{% include copy.html %} + +Note this role ARN; you'll use it in the following steps. + +### Step 2.3: Map a backend role + +Follow these steps to map a backend role: + +1. Log in to OpenSearch Dashboards and select **Security** on the top menu. +2. Select **Roles**, and then select the **ml_full_access** role. +3. On the **ml_full_access** role details page, select **Mapped users**, and then select **Manage mapping**. +4. Enter the IAM role ARN created in Step 2.2 (`arn:aws:iam::<your_aws_account_A>:role/my_create_connector_role_accountA`) in the **Backend roles** field, as shown in the following image. + ![Mapping a backend role]({{site.url}}{{site.baseurl}}/images/vector-search-tutorials/mapping_iam_role_arn.png) +5. Select **Map**. + +The IAM role is now successfully configured in your OpenSearch cluster. + +## Step 3: Create a connector + +Follow these steps to create a connector for the model. 
For more information about creating a connector, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). + +### Step 3.1: Get temporary credentials + +Use the credentials of the IAM user specified in Step 2.2 to assume the role: + +```bash +aws sts assume-role --role-arn arn:aws:iam::<your_aws_account_A>:role/my_create_connector_role_accountA --role-session-name your_session_name +``` + +{% include copy.html %} + +Copy the temporary credentials from the response and configure them in `~/.aws/credentials`: + +```ini +[default] +AWS_ACCESS_KEY_ID=your_access_key_of_role_created_in_step2.2 +AWS_SECRET_ACCESS_KEY=your_secret_key_of_role_created_in_step2.2 +AWS_SESSION_TOKEN=your_session_token_of_role_created_in_step2.2 +``` +{% include copy.html %} + +### Step 3.2: Create a connector + +Run the following Python code with the temporary credentials configured in `~/.aws/credentials`: + +```python +import boto3 +import requests +from requests_aws4auth import AWS4Auth + +host = 'your_amazon_opensearch_domain_endpoint_created' +region = 'your_amazon_opensearch_domain_region' +service = 'es' + +credentials = boto3.Session().get_credentials() +awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service, session_token=credentials.token) + +path = '/_plugins/_ml/connectors/_create' +url = host + path + +bedrock_model_region='your_bedrock_model_region' +payload = { + "name": "Amazon Bedrock Connector: titan embedding v1", + "description": "The connector to bedrock Titan embedding model", + "version": 1, + "protocol": "aws_sigv4", + "parameters": { + "region": bedrock_model_region, + "service_name": "bedrock" + }, + "credential": { + "roleArn": "arn:aws:iam::<your_aws_account_A>:role/my_cross_account_role_accountA", + "externalAccountRoleArn": "arn:aws:iam::<your_aws_account_B>:role/my_invoke_bedrock_role_accountB" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": f"https://bedrock-runtime.{bedrock_model_region}.amazonaws.com/model/amazon.titan-embed-text-v1/invoke", + "headers": { + "content-type": "application/json", + "x-amz-content-sha256": "required" + }, + "request_body": "{ \"inputText\": \"${parameters.inputText}\" }", + "pre_process_function": "connector.pre_process.bedrock.embedding", + "post_process_function": "connector.post_process.bedrock.embedding" + } + ] +} + +headers = {"Content-Type": "application/json"} + +r = requests.post(url, auth=awsauth, json=payload, headers=headers) +print(r.text) +``` +{% include copy.html %} + +The script outputs a connector ID: + +```json +{"connector_id":"N0qpQY0BOhavBOmfOCnw"} +``` + +Note the connector ID; you'll use it in the next step. + +## Step 4: Create and test the model + +Log in to OpenSearch Dashboards, open the DevTools console, and run the following requests to create and test the model. + +1. Create a model group: + + ```json + POST /_plugins/_ml/model_groups/_register + { + "name": "Bedrock_embedding_model", + "description": "Test model group for bedrock embedding model" + } + ``` + {% include copy-curl.html %} + + The response contains the model group ID: + + ```json + { + "model_group_id": "LxWiQY0BTaDH9c7t9xeE", + "status": "CREATED" + } + ``` + +2. 
Register the model: + + ```json + POST /_plugins/_ml/models/_register + { + "name": "bedrock titan embedding model v1", + "function_name": "remote", + "description": "test embedding model", + "model_group_id": "LxWiQY0BTaDH9c7t9xeE", + "connector_id": "N0qpQY0BOhavBOmfOCnw" + } + ``` + {% include copy-curl.html %} + + The response contains the model ID: + + ```json + { + "task_id": "O0q3QY0BOhavBOmf1SmL", + "status": "CREATED", + "model_id": "PEq3QY0BOhavBOmf1Sml" + } + ``` + +3. Deploy the model: + + ```json + POST /_plugins/_ml/models/PEq3QY0BOhavBOmf1Sml/_deploy + ``` + {% include copy-curl.html %} + + The response contains a task ID for the deployment operation: + + ```json + { + "task_id": "PUq4QY0BOhavBOmfBCkQ", + "task_type": "DEPLOY_MODEL", + "status": "COMPLETED" + } + ``` + +4. Test the model: + + ```json + POST /_plugins/_ml/models/PEq3QY0BOhavBOmf1Sml/_predict + { + "parameters": { + "inputText": "hello world" + } + } + ``` + {% include copy-curl.html %} + + The response contains the embeddings generated by the model: + + ```json + { + "inference_results": [ + { + "output": [ + { + "name": "sentence_embedding", + "data_type": "FLOAT32", + "shape": [ + 1536 + ], + "data": [ + 0.7265625, + -0.0703125, + 0.34765625, + ...] + } + ], + "status_code": 200 + } + ] + } + ``` + +## Step 5: Configure semantic search + +Follow these steps to configure semantic search. + +### Step 5.1: Create an ingest pipeline + +First, create an [ingest pipeline]({{site.url}}{{site.baseurl}}/ingest-pipelines/) that uses the model in Amazon SageMaker to create embeddings from the input text: + +```json +PUT /_ingest/pipeline/my_bedrock_embedding_pipeline +{ + "description": "text embedding pipeline", + "processors": [ + { + "text_embedding": { + "model_id": "your_bedrock_embedding_model_id_created_in_step4", + "field_map": { + "text": "text_knn" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +### Step 5.2: Create a vector index + +Next, create a vector index for storing the input text and generated embeddings: + +```json +PUT my_index +{ + "settings": { + "index": { + "knn.space_type": "cosinesimil", + "default_pipeline": "my_bedrock_embedding_pipeline", + "knn": "true" + } + }, + "mappings": { + "properties": { + "text_knn": { + "type": "knn_vector", + "dimension": 1536 + } + } + } +} +``` +{% include copy-curl.html %} + +### Step 5.3: Ingest data + +Ingest a sample document into the index: + +```json +POST /my_index/_doc/1000001 +{ + "text": "hello world." 
+} +``` +{% include copy-curl.html %} + +### Step 5.4: Search the index + +Run a vector search to retrieve documents from the vector index: + +```json +POST /my_index/_search +{ + "query": { + "neural": { + "text_knn": { + "query_text": "hello", + "model_id": "your_embedding_model_id_created_in_step4", + "k": 100 + } + } + }, + "size": "1", + "_source": ["text"] +} +``` +{% include copy-curl.html %} \ No newline at end of file diff --git a/_tutorials/vector-search/semantic-search/semantic-search-bedrock-titan.md b/_tutorials/vector-search/semantic-search/semantic-search-bedrock-titan.md new file mode 100644 index 00000000000..b309b21694e --- /dev/null +++ b/_tutorials/vector-search/semantic-search/semantic-search-bedrock-titan.md @@ -0,0 +1,417 @@ +--- +layout: default +title: Semantic search using Amazon Bedrock Titan +parent: Semantic search +grand_parent: Vector search +nav_order: 40 +redirect_from: + - /vector-search/tutorials/semantic-search/semantic-search-bedrock-titan/ +--- + +# Semantic search using Amazon Bedrock Titan + +This tutorial shows you how to implement semantic search in [Amazon OpenSearch Service](https://docs.aws.amazon.com/opensearch-service/) using the [Amazon Bedrock Titan embedding model](https://docs.aws.amazon.com/bedrock/latest/userguide/titan-embedding-models.html). For more information, see [Semantic search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/semantic-search/). + +If using Python, you can create an Amazon Bedrock Titan embedding connector and test the model using the [opensearch-py-ml](https://github.com/opensearch-project/opensearch-py-ml) client CLI. The CLI automates many configuration steps, making setup faster and reducing the chance of errors. For more information about using the CLI, see the [CLI documentation](https://opensearch-project.github.io/opensearch-py-ml/cli/index.html#). +{: .tip} + +If using self-managed OpenSearch instead of Amazon OpenSearch Service, create a connector to the model on Amazon Bedrock using [the blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/bedrock_connector_titan_embedding_blueprint.md). For more information about creating a connector, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). + +The easiest way to set up an embedding model in Amazon OpenSearch Service is by using [AWS CloudFormation](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/cfn-template.html). Alternatively, you can set up an embedding model using [the AIConnectorHelper notebook](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/tutorials/aws/AIConnectorHelper.ipynb). +{: .tip} + +Amazon Bedrock has a [quota limit](https://docs.aws.amazon.com/bedrock/latest/userguide/quotas.html). For more information about increasing this limit, see [Increase model invocation capacity with Provisioned Throughput in Amazon Bedrock](https://docs.aws.amazon.com/bedrock/latest/userguide/prov-throughput.html). +{: .warning} + +Replace the placeholders beginning with the prefix `your_` with your own values. +{: .note} + +## Prerequisite: Create an OpenSearch cluster + +Go to the [Amazon OpenSearch Service console](https://console.aws.amazon.com/aos/home) and create an OpenSearch domain. + +Note the domain Amazon Resource Name (ARN); you'll use it in the following steps. 
+ +## Step 1: Create an IAM role to invoke the model on Amazon Bedrock + +To invoke the model on Amazon Bedrock, you must create an AWS Identity and Access Management (IAM) role with appropriate permissions. The connector will use this role to invoke the model. + +Go to the IAM console, create a new IAM role named `my_invoke_bedrock_role`, and add the following trust policy and permissions: + +- Custom trust policy: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": "es.amazonaws.com" + }, + "Action": "sts:AssumeRole" + } + ] +} +``` +{% include copy.html %} + +- Permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Action": [ + "bedrock:InvokeModel" + ], + "Effect": "Allow", + "Resource": "arn:aws:bedrock:*::foundation-model/amazon.titan-embed-text-v1" + } + ] +} +``` +{% include copy.html %} + +Note the role ARN; you'll use it in the following steps. + +## Step 2: Configure an IAM role in OpenSearch + +Follow these steps to configure an IAM role in Amazon OpenSearch Service. + +### Step 2.1: Create an IAM role for signing connector requests + +Generate a new IAM role specifically for signing your Create Connector API request. + +Create an IAM role named `my_create_bedrock_connector_role` with the following trust policy and permissions: + +- Custom trust policy: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "AWS": "your_iam_user_arn" + }, + "Action": "sts:AssumeRole" + } + ] +} +``` +{% include copy.html %} + +You'll use the `your_iam_user_arn` IAM user to assume the role in Step 3.1. + +- Permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": "iam:PassRole", + "Resource": "your_iam_role_arn_created_in_step1" + }, + { + "Effect": "Allow", + "Action": "es:ESHttpPost", + "Resource": "your_opensearch_domain_arn_created" + } + ] +} +``` +{% include copy.html %} + +Note this role ARN; you'll use it in the following steps. + +### Step 2.2: Map a backend role + +Follow these steps to map a backend role: + +1. Log in to OpenSearch Dashboards and select **Security** on the top menu. +2. Select **Roles**, and then select the **ml_full_access** role. +3. On the **ml_full_access** role details page, select **Mapped users**, and then select **Manage mapping**. +4. Enter the IAM role ARN created in Step 2.1 in the **Backend roles** field, as shown in the following image. + ![Mapping a backend role]({{site.url}}{{site.baseurl}}/images/vector-search-tutorials/mapping_iam_role_arn.png) +5. Select **Map**. + +The IAM role is now successfully configured in your OpenSearch cluster. + +## Step 3: Create a connector + +Follow these steps to create a connector for the model. For more information about creating a connector, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). 
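After you create the connector in Step 3.2, you can optionally confirm its configuration from the OpenSearch Dashboards DevTools console by calling the Get Connector API. The following request is a sketch; replace `your_connector_id` with the connector ID returned by the create request:

```json
GET /_plugins/_ml/connectors/your_connector_id
```
{% include copy-curl.html %}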
+ +### Step 3.1: Get temporary credentials + +Use the credentials of the IAM user specified in Step 2.1 to assume the role: + +```bash +aws sts assume-role --role-arn your_iam_role_arn_created_in_step2.1 --role-session-name your_session_name +``` +{% include copy.html %} + +Copy the temporary credentials from the response and configure them in `~/.aws/credentials`: + +```ini +[default] +AWS_ACCESS_KEY_ID=your_access_key_of_role_created_in_step2.1 +AWS_SECRET_ACCESS_KEY=your_secret_key_of_role_created_in_step2.1 +AWS_SESSION_TOKEN=your_session_token_of_role_created_in_step2.1 +``` +{% include copy.html %} + +### Step 3.2: Create a connector + +Run the following Python code with the temporary credentials configured in `~/.aws/credentials`: + +```python +import boto3 +import requests +from requests_aws4auth import AWS4Auth + +host = 'your_amazon_opensearch_domain_endpoint_created' +region = 'your_amazon_opensearch_domain_region' +service = 'es' + +credentials = boto3.Session().get_credentials() +awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service, session_token=credentials.token) + +path = '/_plugins/_ml/connectors/_create' +url = host + path + +payload = { + "name": "Amazon Bedrock Connector: titan embedding v1", + "description": "The connector to bedrock Titan embedding model", + "version": 1, + "protocol": "aws_sigv4", + "parameters": { + "region": "your_bedrock_model_region", + "service_name": "bedrock" + }, + "credential": { + "roleArn": "your_iam_role_arn_created_in_step1" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "https://bedrock-runtime.your_bedrock_model_region.amazonaws.com/model/amazon.titan-embed-text-v1/invoke", + "headers": { + "content-type": "application/json", + "x-amz-content-sha256": "required" + }, + "request_body": "{ \"inputText\": \"${parameters.inputText}\" }", + "pre_process_function": "\n StringBuilder builder = new StringBuilder();\n builder.append(\"\\\"\");\n String first = params.text_docs[0];\n builder.append(first);\n builder.append(\"\\\"\");\n def parameters = \"{\" +\"\\\"inputText\\\":\" + builder + \"}\";\n return \"{\" +\"\\\"parameters\\\":\" + parameters + \"}\";", + "post_process_function": "\n def name = \"sentence_embedding\";\n def dataType = \"FLOAT32\";\n if (params.embedding == null || params.embedding.length == 0) {\n return params.message;\n }\n def shape = [params.embedding.length];\n def json = \"{\" +\n \"\\\"name\\\":\\\"\" + name + \"\\\",\" +\n \"\\\"data_type\\\":\\\"\" + dataType + \"\\\",\" +\n \"\\\"shape\\\":\" + shape + \",\" +\n \"\\\"data\\\":\" + params.embedding +\n \"}\";\n return json;\n " + } + ] +} + +headers = {"Content-Type": "application/json"} + +r = requests.post(url, auth=awsauth, json=payload, headers=headers) +print(r.text) +``` +{% include copy.html %} + +The script outputs a connector ID: + +```json +{"connector_id":"1p0u8o0BWbTmLN9F2Y7m"} +``` + +Note the connector ID; you'll use it in the next step. + +## Step 4: Create and test the model + +Log in to OpenSearch Dashboards, open the DevTools console, and run the following requests to create and test the model. + +1. Create a model group: + + ```json + POST /_plugins/_ml/model_groups/_register + { + "name": "Bedrock_embedding_model", + "description": "Test model group for bedrock embedding model" + } + ``` + {% include copy-curl.html %} + + The response contains the model group ID: + + ```json + { + "model_group_id": "LxWiQY0BTaDH9c7t9xeE", + "status": "CREATED" + } + ``` + +2. 
Register the model: + + ```json + POST /_plugins/_ml/models/_register + { + "name": "bedrock titan embedding model v1", + "function_name": "remote", + "description": "test embedding model", + "model_group_id": "LxWiQY0BTaDH9c7t9xeE", + "connector_id": "N0qpQY0BOhavBOmfOCnw" + } + ``` + {% include copy-curl.html %} + + The response contains the model ID: + + ```json + { + "task_id": "O0q3QY0BOhavBOmf1SmL", + "status": "CREATED", + "model_id": "PEq3QY0BOhavBOmf1Sml" + } + ``` + +3. Deploy the model: + + ```json + POST /_plugins/_ml/models/PEq3QY0BOhavBOmf1Sml/_deploy + ``` + {% include copy-curl.html %} + + The response contains a task ID for the deployment operation: + + ```json + { + "task_id": "PUq4QY0BOhavBOmfBCkQ", + "task_type": "DEPLOY_MODEL", + "status": "COMPLETED" + } + ``` + +4. Test the model: + + ```json + POST /_plugins/_ml/models/PEq3QY0BOhavBOmf1Sml/_predict + { + "parameters": { + "inputText": "hello world" + } + } + ``` + {% include copy-curl.html %} + + The response contains the embeddings generated by the model: + + ```json + { + "inference_results": [ + { + "output": [ + { + "name": "sentence_embedding", + "data_type": "FLOAT32", + "shape": [ + 1536 + ], + "data": [ + 0.7265625, + -0.0703125, + 0.34765625, + ...] + } + ], + "status_code": 200 + } + ] + } + ``` + +## Step 5: Configure semantic search + +Follow these steps to configure semantic search. + +### Step 5.1: Create an ingest pipeline + +First, create an [ingest pipeline]({{site.url}}{{site.baseurl}}/ingest-pipelines/) that uses the model in Amazon SageMaker to create embeddings from the input text: + +```json +PUT /_ingest/pipeline/my_bedrock_embedding_pipeline +{ + "description": "text embedding pipeline", + "processors": [ + { + "text_embedding": { + "model_id": "your_bedrock_embedding_model_id_created_in_step4", + "field_map": { + "text": "text_knn" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +### Step 5.2: Create a vector index + +Next, create a vector index for storing the input text and generated embeddings: + +```json +PUT my_index +{ + "settings": { + "index": { + "knn.space_type": "cosinesimil", + "default_pipeline": "my_bedrock_embedding_pipeline", + "knn": "true" + } + }, + "mappings": { + "properties": { + "text_knn": { + "type": "knn_vector", + "dimension": 1536 + } + } + } +} +``` +{% include copy-curl.html %} + +### Step 5.3: Ingest data + +Ingest a sample document into the index: + +```json +POST /my_index/_doc/1000001 +{ + "text": "hello world." 
+} +``` +{% include copy-curl.html %} + +### Step 5.4: Search the index + +Run a vector search to retrieve documents from the vector index: + +```json +POST /my_index/_search +{ + "query": { + "neural": { + "text_knn": { + "query_text": "hello", + "model_id": "your_embedding_model_id_created_in_step4", + "k": 100 + } + } + }, + "size": "1", + "_source": ["text"] +} +``` +{% include copy-curl.html %} \ No newline at end of file diff --git a/_tutorials/vector-search/semantic-search/semantic-search-cfn-bedrock.md b/_tutorials/vector-search/semantic-search/semantic-search-cfn-bedrock.md new file mode 100644 index 00000000000..a854ec05906 --- /dev/null +++ b/_tutorials/vector-search/semantic-search/semantic-search-cfn-bedrock.md @@ -0,0 +1,157 @@ +--- +layout: default +title: Semantic search using AWS CloudFormation and Amazon Bedrock +parent: Semantic search +grand_parent: Vector search +nav_order: 75 +redirect_from: + - /vector-search/tutorials/semantic-search/semantic-search-cfn-bedrock/ +--- + +# Semantic search using AWS CloudFormation and Amazon Bedrock + +This tutorial shows you how to implement semantic search in [Amazon OpenSearch Service](https://docs.aws.amazon.com/opensearch-service/) using [AWS CloudFormation](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/cfn-template.html) and Amazon Bedrock. For more information, see [Semantic search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/semantic-search/). + +If you are using self-managed OpenSearch instead of Amazon OpenSearch Service, create a connector to the Amazon Bedrock models using [the blueprints](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/). For more information about creating a connector, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). + +The CloudFormation integration automates the steps in the [Semantic search using Amazon Bedrock Titan]({{site.url}}{{site.baseurl}}/vector-search/tutorials/semantic-search/semantic-search-bedrock-cohere/) tutorials. The CloudFormation template creates an AWS Identity and Access Management (IAM) role and invokes an AWS Lambda function to set up an AI connector and model. + +Replace the placeholders beginning with the prefix `your_` with your own values. +{: .note} + +## Prerequisite: Create an OpenSearch cluster + +Go to the [Amazon OpenSearch Service console](https://console.aws.amazon.com/aos/home) and create an OpenSearch domain. + +Note the domain Amazon Resource Name (ARN); you'll use it in the following steps. + +## Step 1: Map a backend role + +The OpenSearch CloudFormation template uses a Lambda function to create an AI connector with an IAM role. You must map the IAM role to `ml_full_access` to grant the required permissions. Follow [Step 2.2 of the Semantic search using Amazon Bedrock Titan tutorial]({{site.url}}{{site.baseurl}}/vector-search/tutorials/semantic-search/semantic-search-bedrock-titan/#step-22-map-a-backend-role) to map a backend role. + +The IAM role is specified in the **Lambda Invoke OpenSearch ML Commons Role Name** field in the CloudFormation template. The default IAM role is `LambdaInvokeOpenSearchMLCommonsRole`, so you must map the `arn:aws:iam::your_aws_account_id:role/LambdaInvokeOpenSearchMLCommonsRole` backend role to `ml_full_access`. 
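+
+If you prefer to script the mapping instead of using OpenSearch Dashboards, you can call the Security plugin role mapping API. The following is a minimal sketch that assumes fine-grained access control with a master user; note that `PUT` replaces the existing mapping for `ml_full_access`, so include every backend role you want to keep:
+
+```python
+import requests
+from requests.auth import HTTPBasicAuth
+
+host = "https://your_amazon_opensearch_domain_endpoint"
+backend_role = "arn:aws:iam::your_aws_account_id:role/LambdaInvokeOpenSearchMLCommonsRole"
+
+# PUT replaces the current mapping for ml_full_access, so list all backend roles to keep.
+response = requests.put(
+    f"{host}/_plugins/_security/api/rolesmapping/ml_full_access",
+    auth=HTTPBasicAuth("your_master_username", "your_master_password"),
+    json={"backend_roles": [backend_role]},
+)
+print(response.status_code, response.text)
+```
+{% include copy.html %}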
+ +For a broader mapping, you can grant all roles `ml_full_access` using a wildcard: + +``` +arn:aws:iam::your_aws_account_id:role/* +``` + +Because `all_access` includes more permissions than `ml_full_access`, mapping the backend role to `all_access` is also acceptable. + +## Step 2: Run the CloudFormation template + +The CloudFormation template integration is available in the [Amazon OpenSearch Service console](https://console.aws.amazon.com/aos/home). From the left navigation pane, select **Integrations**, as shown in the following image. + +![Semantic search CloudFormation integration]({{site.url}}{{site.baseurl}}/images/vector-search-tutorials/semantic_search_bedrock_integration_1.png) + +To create a connector, complete the following form. + +![Deploy a pretrained model to Amazon Bedrock]({{site.url}}{{site.baseurl}}/images/vector-search-tutorials/semantic_search_bedrock_integration_2.png) + +Complete the following fields, keeping all other fields at their default values: + +1. Enter your **Amazon OpenSearch Endpoint**. +2. In **Model Configuration**, select a **Model** to be deployed. Choose one of the following supported models: + - `amazon.titan-embed-text-v1` + - `amazon.titan-embed-image-v1` + - `amazon.titan-embed-text-v2:0` + - `cohere.embed-english-v3` + - `cohere.embed-multilingual-v3` +3. Select a **Model Region** (this is the Amazon Bedrock Region). +4. In **AddProcessFunction**, select `true` to enable or `false` to disable the default pre- and post-processing functions in the connector. + +## Output + +After deployment, you can find the **ConnectorId**, the **ModelId**, and the **BedrockEndpoint** in the **CloudFormation stack Outputs**. + +If an error occurs, follow these steps to review the logs: + +1. Navigate to the **CloudWatch Logs** section. +2. Search for **Log Groups** that contain (or are associated with) your CloudFormation stack name. + +## Step 3: Configure semantic search + +Follow these steps to configure semantic search. + +### Step 3.1: Create an ingest pipeline + +First, create an [ingest pipeline]({{site.url}}{{site.baseurl}}/ingest-pipelines/) that uses the model on Amazon Bedrock to create embeddings from the input text: + +```json +PUT /_ingest/pipeline/my_bedrock_embedding_pipeline +{ + "description": "text embedding pipeline", + "processors": [ + { + "text_embedding": { + "model_id": "your_bedrock_embedding_model_id_created_in_step3", + "field_map": { + "text": "text_knn" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +### Step 3.2: Create a vector index + +Next, create a vector index for storing the input text and generated embeddings: + +```json +PUT my_index +{ + "settings": { + "index": { + "knn.space_type": "cosinesimil", + "default_pipeline": "my_bedrock_embedding_pipeline", + "knn": "true" + } + }, + "mappings": { + "properties": { + "text_knn": { + "type": "knn_vector", + "dimension": 1536 + } + } + } +} +``` +{% include copy-curl.html %} + +### Step 3.3: Ingest data + +Ingest a sample document into the index: + +```json +POST /my_index/_doc/1000001 +{ + "text": "hello world." 
+} +``` +{% include copy-curl.html %} + +### Step 3.4: Search the index + +Run a vector search to retrieve documents from the vector index: + +```json +POST /my_index/_search +{ + "query": { + "neural": { + "text_knn": { + "query_text": "hello", + "model_id": "your_embedding_model_id_created_in_step4", + "k": 100 + } + } + }, + "size": "1", + "_source": ["text"] +} +``` +{% include copy-curl.html %} \ No newline at end of file diff --git a/_tutorials/vector-search/semantic-search/semantic-search-cfn-sagemaker.md b/_tutorials/vector-search/semantic-search/semantic-search-cfn-sagemaker.md new file mode 100644 index 00000000000..6f916e7ddde --- /dev/null +++ b/_tutorials/vector-search/semantic-search/semantic-search-cfn-sagemaker.md @@ -0,0 +1,276 @@ +--- +layout: default +title: Semantic search using AWS CloudFormation and Amazon SageMaker +parent: Semantic search +grand_parent: Vector search +nav_order: 70 +redirect_from: + - /vector-search/tutorials/semantic-search/semantic-search-cfn-sagemaker/ +--- + +# Semantic search using AWS CloudFormation and Amazon SageMaker + +This tutorial shows you how to implement semantic search in [Amazon OpenSearch Service](https://docs.aws.amazon.com/opensearch-service/) using [AWS CloudFormation](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/cfn-template.html) and Amazon SageMaker. For more information, see [Semantic search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/semantic-search/). + +If you are using self-managed OpenSearch instead of Amazon OpenSearch Service, create a connector to the Amazon SageMaker model using [the blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/sagemaker_connector_blueprint.md). For more information about creating a connector, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). + +The CloudFormation integration automates the steps in the [Semantic Search with SageMaker Embedding Model tutorial]({{site.url}}{{site.baseurl}}/vector-search/tutorials/semantic-search/semantic-search-sagemaker/). The CloudFormation template creates an IAM role and invokes an AWS Lambda function to set up an AI connector and model. + +Replace the placeholders beginning with the prefix `your_` with your own values. +{: .note} + +## Model input and output requirements + +Ensure that your Amazon SageMaker model inputs follow the format required by the [default pre-processing function]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/#preprocessing-function). + +The model input must be an array of strings: + +```json +["hello world", "how are you"] +``` + +Additionally, ensure that the model output follows the format required by the [default post-processing function]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/#post-processing-function). The model output must be an array of arrays, where each inner array corresponds to the embedding of an input string: + +```json +[ + [ + -0.048237994, + -0.07612697, + ... + ], + [ + 0.32621247, + 0.02328475, + ... + ] +] +``` + +If your model input/output is not the same as the required default, you can build your own pre-/post-processing function using a [Painless script]({{site.url}}{{site.baseurl}}/api-reference/script-apis/exec-script/). 
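+
+To check whether your endpoint already matches these defaults, you can invoke it directly and inspect the request and response shapes. The following is a minimal sketch; it assumes a JSON-serving endpoint and that your local AWS credentials are allowed to call Amazon SageMaker (the endpoint name and Region are placeholders):
+
+```python
+import json
+import boto3
+
+# Replace the placeholders with your SageMaker inference endpoint name and Region.
+runtime = boto3.client("sagemaker-runtime", region_name="your_sagemaker_model_region")
+
+response = runtime.invoke_endpoint(
+    EndpointName="your_sagemaker_endpoint_name",
+    ContentType="application/json",
+    Body=json.dumps(["hello world", "how are you"]),  # default pre-processing sends an array of strings
+)
+
+output = json.loads(response["Body"].read())
+# The default post-processing expects an array of arrays: one embedding per input string.
+print(len(output), len(output[0]))
+```
+{% include copy.html %}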
+ +### Example: Amazon Bedrock Titan embedding model + +For example, the Amazon Bedrock Titan embedding model ([blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/bedrock_connector_titan_embedding_blueprint.md#2-create-connector-for-amazon-bedrock)) input is as follows: + +```json +{ "inputText": "your_input_text" } +``` + +OpenSearch expects the following input format: + +```json +{ "text_docs": [ "your_input_text1", "your_input_text2"] } +``` + +To convert `text_docs` into `inputText`, you must define the following pre-processing function: + +```json +"pre_process_function": """ + StringBuilder builder = new StringBuilder(); + builder.append("\""); + String first = params.text_docs[0];// Get the first doc, ml-commons will iterate all docs + builder.append(first); + builder.append("\""); + def parameters = "{" +"\"inputText\":" + builder + "}"; // This is the Bedrock Titan embedding model input + return "{" +"\"parameters\":" + parameters + "}";""" +``` +{% include copy.html %} + +The default Amazon Bedrock Titan embedding model output has the following format: + +```json +{ + "embedding": <float_array> +} +``` + +However, OpenSearch expects the following format: + +```json +{ + "name": "sentence_embedding", + "data_type": "FLOAT32", + "shape": [ <embedding_size> ], + "data": <float_array> +} +``` + +To transform the Amazon Bedrock Titan embedding model output into the format expected by OpenSearch, you must define the following post-processing function: + +```json +"post_process_function": """ + def name = "sentence_embedding"; + def dataType = "FLOAT32"; + if (params.embedding == null || params.embedding.length == 0) { + return params.message; + } + def shape = [params.embedding.length]; + def json = "{" + + "\"name\":\"" + name + "\"," + + "\"data_type\":\"" + dataType + "\"," + + "\"shape\":" + shape + "," + + "\"data\":" + params.embedding + + "}"; + return json; + """ +``` +{% include copy.html %} + +## Prerequisite: Create an OpenSearch cluster + +Go to the [Amazon OpenSearch Service console](https://console.aws.amazon.com/aos/home) and create an OpenSearch domain. + +Note the domain Amazon Resource Name (ARN); you'll use it in the following steps. + +## Step 1: Map a backend role + +The OpenSearch CloudFormation template uses a Lambda function to create an AI connector with an AWS Identity and Access Management (IAM) role. You must map the IAM role to `ml_full_access` to grant the required permissions. Follow [Step 2.2 of the Semantic Search with SageMaker Embedding Model tutorial]({{site.url}}{{site.baseurl}}/vector-search/tutorials/semantic-search/semantic-search-sagemaker/#step-22-map-a-backend-role) to map a backend role. + +The IAM role is specified in the **Lambda Invoke OpenSearch ML Commons Role Name** field in the CloudFormation template. The default IAM role is `LambdaInvokeOpenSearchMLCommonsRole`, so you must map the `arn:aws:iam::your_aws_account_id:role/LambdaInvokeOpenSearchMLCommonsRole` backend role to `ml_full_access`. + +For a broader mapping, you can grant all roles `ml_full_access` using a wildcard: + +``` +arn:aws:iam::your_aws_account_id:role/* +``` + +Because `all_access` includes more permissions than `ml_full_access`, mapping the backend role to `all_access` is also acceptable. + +## Step 2: Run the CloudFormation template + +The CloudFormation template integration is available in the [Amazon OpenSearch Service console](https://console.aws.amazon.com/aos/home). 
From the left navigation pane, select **Integrations**, as shown in the following image. + +![Semantic search CloudFormation integration]({{site.url}}{{site.baseurl}}/images/vector-search-tutorials/semantic_search_remote_model_Integration_1.png) + +Choose one of the following options to deploy a model to Amazon SageMaker. + +### Option 1: Deploy a pretrained model to Amazon SageMaker + +You can deploy a pretrained Hugging Face sentence transformer embedding model from the [Deep Java Library model repository](https://djl.ai/), as shown in the following image. + +![Deploy a pretrained model to Amazon SageMaker]({{site.url}}{{site.baseurl}}/images/vector-search-tutorials/semantic_search_remote_model_Integration_2.png) + +Complete the following fields, keeping all other fields at their default values: + +1. Enter your **Amazon OpenSearch Endpoint**. +2. Use the default **SageMaker Configuration** to start quickly, or you can modify it as needed. For supported Amazon SageMaker instance types, see the [Amazon SageMaker documentation](https://aws.amazon.com/sagemaker/). +3. Leave the **SageMaker Endpoint Url** field empty. If you provide a URL, the model will not be deployed to Amazon SageMaker, and a new inference endpoint will not be created. +4. Leave the **Custom Image** field empty. The default image is `djl-inference:0.22.1-cpu-full`. For available images, see the [AWS Deep Learning Containers](https://docs.aws.amazon.com/deep-learning-containers/latest/devguide/deep-learning-containers-images.html). +5. Leave the **Custom Model Data Url** field empty. +6. The **Custom Model Environment** field defaults to `djl://ai.djl.huggingface.pytorch/sentence-transformers/all-MiniLM-L6-v2`. For a list of supported models, see [Supported models](#supported-models). + +### Option 2: Use an existing SageMaker inference endpoint + +If you already have a SageMaker inference endpoint, you can configure a model using that endpoint, as shown in the following image. + +![Using an existing SageMaker inference endpoint]({{site.url}}{{site.baseurl}}/images/semantic_search/semantic_search_remote_model_Integration_3.png) + +Complete the following fields, keeping all others at their default values: + +1. Enter your **Amazon OpenSearch Endpoint**. +2. Enter your **SageMaker Endpoint Url**. +3. Leave the **Custom Image**, **Custom Model Data Url**, and **Custom Model Environment** fields empty. + +### Output + +After deployment, you can find the OpenSearch AI connector and model IDs in the CloudFormation stack **Outputs**. + +If an error occurs, follow these steps to review the logs: + +1. Open the Amazon SageMaker console. +1. Navigate to the **CloudWatch Logs** section. +1. Search for **Log Groups** that contain (or are associated with) your CloudFormation stack name. 
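+
+If you prefer to retrieve the stack outputs programmatically rather than from the console, you can query AWS CloudFormation directly. The following is a minimal sketch; it assumes your local AWS credentials can read the stack, and the stack name and Region are placeholders:
+
+```python
+import boto3
+
+# Replace the placeholders with your stack's Region and name.
+cfn = boto3.client("cloudformation", region_name="your_cloudformation_stack_region")
+
+stack = cfn.describe_stacks(StackName="your_cloudformation_stack_name")["Stacks"][0]
+
+# Prints output keys and values, such as the connector ID and model ID created by the template.
+for output in stack.get("Outputs", []):
+    print(output["OutputKey"], "=", output["OutputValue"])
+```
+{% include copy.html %}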
+ +## Supported models + +The following Hugging Face sentence transformer embedding models are available in the [Deep Java Library model repository](https://djl.ai/): + +``` +djl://ai.djl.huggingface.pytorch/sentence-transformers/LaBSE/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/all-MiniLM-L12-v1/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/all-MiniLM-L12-v2/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/all-MiniLM-L6-v1/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/all-MiniLM-L6-v2/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/all-distilroberta-v1/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/all-mpnet-base-v1/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/all-mpnet-base-v2/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/all-roberta-large-v1/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/allenai-specter/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/bert-base-nli-cls-token/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/bert-base-nli-max-tokens/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/bert-base-nli-mean-tokens/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/bert-base-nli-stsb-mean-tokens/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/bert-base-wikipedia-sections-mean-tokens/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/bert-large-nli-cls-token/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/bert-large-nli-max-tokens/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/bert-large-nli-mean-tokens/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/bert-large-nli-stsb-mean-tokens/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/clip-ViT-B-32-multilingual-v1/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/distilbert-base-nli-mean-tokens/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/distilbert-base-nli-stsb-mean-tokens/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/distilbert-base-nli-stsb-quora-ranking/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/distilbert-multilingual-nli-stsb-quora-ranking/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/distiluse-base-multilingual-cased-v1/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/facebook-dpr-ctx_encoder-multiset-base/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/facebook-dpr-ctx_encoder-single-nq-base/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/facebook-dpr-question_encoder-multiset-base/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/facebook-dpr-question_encoder-single-nq-base/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/msmarco-MiniLM-L-12-v3/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/msmarco-MiniLM-L-6-v3/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/msmarco-MiniLM-L12-cos-v5/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/msmarco-MiniLM-L6-cos-v5/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/msmarco-bert-base-dot-v5/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/msmarco-bert-co-condensor/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/msmarco-distilbert-base-dot-prod-v3/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/msmarco-distilbert-base-tas-b/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/msmarco-distilbert-base-v2/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/msmarco-distilbert-base-v3/ 
+djl://ai.djl.huggingface.pytorch/sentence-transformers/msmarco-distilbert-base-v4/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/msmarco-distilbert-cos-v5/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/msmarco-distilbert-dot-v5/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/msmarco-distilbert-multilingual-en-de-v2-tmp-lng-aligned/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/msmarco-distilbert-multilingual-en-de-v2-tmp-trained-scratch/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/msmarco-distilroberta-base-v2/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/msmarco-roberta-base-ance-firstp/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/msmarco-roberta-base-v2/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/msmarco-roberta-base-v3/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/multi-qa-MiniLM-L6-cos-v1/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/multi-qa-MiniLM-L6-dot-v1/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/multi-qa-distilbert-cos-v1/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/multi-qa-distilbert-dot-v1/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/nli-bert-base/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/nli-bert-large-max-pooling/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/nli-distilbert-base/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/nli-distilroberta-base-v2/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/nli-roberta-base-v2/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/nli-roberta-large/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/nq-distilbert-base-v1/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/paraphrase-MiniLM-L12-v2/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/paraphrase-MiniLM-L3-v2/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/paraphrase-MiniLM-L6-v2/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/paraphrase-TinyBERT-L6-v2/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/paraphrase-albert-base-v2/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/paraphrase-albert-small-v2/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/paraphrase-distilroberta-base-v2/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/paraphrase-multilingual-mpnet-base-v2/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/paraphrase-xlm-r-multilingual-v1/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/quora-distilbert-base/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/quora-distilbert-multilingual/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/roberta-base-nli-mean-tokens/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/roberta-base-nli-stsb-mean-tokens/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/roberta-large-nli-mean-tokens/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/roberta-large-nli-stsb-mean-tokens/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/stsb-bert-base/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/stsb-bert-large/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/stsb-distilbert-base/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/stsb-distilroberta-base-v2/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/stsb-roberta-base-v2/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/stsb-roberta-base/ 
+djl://ai.djl.huggingface.pytorch/sentence-transformers/stsb-roberta-large/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/stsb-xlm-r-multilingual/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/use-cmlm-multilingual/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/xlm-r-bert-base-nli-stsb-mean-tokens/ +djl://ai.djl.huggingface.pytorch/sentence-transformers/xlm-r-distilroberta-base-paraphrase-v1/ +``` \ No newline at end of file diff --git a/_tutorials/vector-search/semantic-search/semantic-search-cohere.md b/_tutorials/vector-search/semantic-search/semantic-search-cohere.md new file mode 100644 index 00000000000..fd60e466d8a --- /dev/null +++ b/_tutorials/vector-search/semantic-search/semantic-search-cohere.md @@ -0,0 +1,442 @@ +--- +layout: default +title: Semantic search using Cohere Embed +parent: Semantic search +grand_parent: Vector search +nav_order: 30 +redirect_from: + - /vector-search/tutorials/semantic-search/semantic-search-cohere/ +--- + +# Semantic search using Cohere Embed + +This tutorial shows you how to implement semantic search in [Amazon OpenSearch Service](https://docs.aws.amazon.com/opensearch-service/) using the [Cohere Embed model](https://docs.cohere.com/reference/embed). For more information, see [Semantic search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/semantic-search/). + +If using Python, you can create a Cohere connector and test the model using the [opensearch-py-ml](https://github.com/opensearch-project/opensearch-py-ml) client CLI. The CLI automates many configuration steps, making setup faster and reducing the chance of errors. For more information about using the CLI, see the [CLI documentation](https://opensearch-project.github.io/opensearch-py-ml/cli/index.html#). +{: .tip} + +If using self-managed OpenSearch instead of Amazon OpenSearch Service, create a connector to the Cohere Embed model using [the blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/cohere_v3_connector_embedding_blueprint.md). For more information about creating a connector, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). + +The easiest way to set up an embedding model in Amazon OpenSearch Service is by using [AWS CloudFormation](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/cfn-template.html). Alternatively, you can set up an embedding model using [the AIConnectorHelper notebook](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/tutorials/aws/AIConnectorHelper.ipynb). +{: .tip} + +The Cohere Embed model is also available on Amazon Bedrock. To use the model hosted on Amazon Bedrock, see [Semantic search using the Cohere Embed model on Amazon Bedrock]({{site.url}}{{site.baseurl}}/vector-search/tutorials/semantic-search/semantic-search-bedrock-cohere/). + +Replace the placeholders beginning with the prefix `your_` with your own values. +{: .note} + +## Prerequisite: Create an OpenSearch cluster + +Go to the [Amazon OpenSearch Service console](https://console.aws.amazon.com/aos/home) and create an OpenSearch domain. + +Note the domain Amazon Resource Name (ARN); you'll use it in the following steps. + +## Step 1: Store the API key in AWS Secrets Manager + +Store your Cohere API key in [AWS Secrets Manager](https://docs.aws.amazon.com/secretsmanager/latest/userguide/intro.html): + +1. Open AWS Secrets Manager. +1. 
Select **Store a new secret**. +1. Select **Other type of secret**. +1. Create a key-value pair with **my_cohere_key** as the key and your Cohere API key as the value. +1. Name your secret `my_test_cohere_secret`. + +Note the secret ARN; you'll use it in the following steps. + +## Step 2: Create an IAM role + +To use the secret created in Step 1, you must create an AWS Identity and Access Management (IAM) role with read permissions for the secret. This IAM role will be configured in the connector and will allow the connector to read the secret. + +Go to the IAM console, create a new IAM role named `my_cohere_secret_role`, and add the following trust policy and permissions: + +- Custom trust policy: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": "es.amazonaws.com" + }, + "Action": "sts:AssumeRole" + } + ] +} +``` +{% include copy.html %} + +- Permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Action": [ + "secretsmanager:GetSecretValue", + "secretsmanager:DescribeSecret" + ], + "Effect": "Allow", + "Resource": "your_secret_arn_created_in_step1" + } + ] +} +``` +{% include copy.html %} + +Note the role ARN; you'll use it in the following steps. + +## Step 3: Configure an IAM role in Amazon OpenSearch Service + +Follow these steps to configure an IAM role in Amazon OpenSearch Service. + +### Step 3.1: Create an IAM role for signing connector requests + +Generate a new IAM role specifically for signing your Create Connector API request. + +Create an IAM role named `my_create_connector_role` with the following trust policy and permissions: + +- Custom trust policy: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "AWS": "your_iam_user_arn" + }, + "Action": "sts:AssumeRole" + } + ] +} +``` +{% include copy.html %} + +You'll use the `your_iam_user_arn` IAM user to assume the role in Step 4.1. + +- Permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": "iam:PassRole", + "Resource": "your_iam_role_arn_created_in_step2" + }, + { + "Effect": "Allow", + "Action": "es:ESHttpPost", + "Resource": "your_opensearch_domain_arn_created" + } + ] +} +``` +{% include copy.html %} + +Note this role ARN; you'll use it in the following steps. + +### Step 3.2: Map a backend role + +Follow these steps to map a backend role: + +1. Log in to OpenSearch Dashboards and select **Security** on the top menu. +2. Select **Roles**, and then select the **ml_full_access** role. +3. On the **ml_full_access** role details page, select **Mapped users**, and then select **Manage mapping**. +4. Enter the IAM role ARN created in Step 3.1 in the **Backend roles** field, as shown in the following image. + ![Mapping a backend role]({{site.url}}{{site.baseurl}}/images/vector-search-tutorials/mapping_iam_role_arn.png) +4. Select **Map**. + +The IAM role is now successfully configured in your OpenSearch cluster. + +## Step 4: Create a connector + +Follow these steps to create a connector for the model. For more information about creating a connector, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). 
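+
+Before creating the connector, you can optionally confirm that your Cohere API key works by calling the embed endpoint directly with the same parameters the connector will use. The following is a minimal sketch; the API key is the value you stored in Step 1 (keep it out of source control):
+
+```python
+import requests
+
+# Use the same Cohere API key that you stored in AWS Secrets Manager in Step 1.
+response = requests.post(
+    "https://api.cohere.ai/v1/embed",
+    headers={
+        "Authorization": "Bearer your_cohere_api_key",
+        "Request-Source": "unspecified:opensearch",
+    },
+    json={
+        "texts": ["hello world", "how are you"],
+        "model": "embed-english-v3.0",
+        "input_type": "search_document",
+        "truncate": "END",
+    },
+)
+print(response.status_code)
+print(len(response.json()["embeddings"][0]))  # embed-english-v3.0 returns 1024-dimensional vectors
+```
+{% include copy.html %}
+
+If this call succeeds, the connector you create in Step 4.2 can use the same model and input parameters.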
+ +### Step 4.1: Get temporary credentials + +Use the credentials of the IAM user specified in Step 3.1 to assume the role: + +```bash +aws sts assume-role --role-arn your_iam_role_arn_created_in_step3.1 --role-session-name your_session_name +``` +{% include copy.html %} + +Copy the temporary credentials from the response and configure them in `~/.aws/credentials`: + +```ini +[default] +AWS_ACCESS_KEY_ID=your_access_key_of_role_created_in_step3.1 +AWS_SECRET_ACCESS_KEY=your_secret_key_of_role_created_in_step3.1 +AWS_SESSION_TOKEN=your_session_token_of_role_created_in_step3.1 +``` +{% include copy.html %} + +### Step 4.2: Create a connector + +Run the following Python code with the temporary credentials configured in `~/.aws/credentials`: + +```python +import boto3 +import requests +from requests_aws4auth import AWS4Auth + +host = 'your_amazon_opensearch_domain_endpoint_created' +region = 'your_amazon_opensearch_domain_region' +service = 'es' + +credentials = boto3.Session().get_credentials() +awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service, session_token=credentials.token) + +path = '/_plugins/_ml/connectors/_create' +url = host + path + +payload = { + "name": "cohere-embed-v3", + "description": "The connector to public Cohere model service for embed", + "version": "1", + "protocol": "http", + "credential": { + "secretArn": "your_secret_arn_created_in_step1", + "roleArn": "your_iam_role_arn_created_in_step2" + }, + "parameters": { + "model": "embed-english-v3.0", + "input_type":"search_document", + "truncate": "END" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "https://api.cohere.ai/v1/embed", + "headers": { + "Authorization": "Bearer ${credential.secretArn.my_cohere_key}", + "Request-Source": "unspecified:opensearch" + }, + "request_body": "{ \"texts\": ${parameters.texts}, \"truncate\": \"${parameters.truncate}\", \"model\": \"${parameters.model}\", \"input_type\": \"${parameters.input_type}\" }", + "pre_process_function": "connector.pre_process.cohere.embedding", + "post_process_function": "connector.post_process.cohere.embedding" + } + ] +} + +headers = {"Content-Type": "application/json"} + +r = requests.post(url, auth=awsauth, json=payload, headers=headers) +print(r.text) +``` +{% include copy.html %} + +The script outputs a connector ID: + +```json +{"connector_id":"qp2QP40BWbTmLN9Fpo40"} +``` + +Note the connector ID; you'll use it in the next step. + +## Step 5: Create and test the model + +Log in to OpenSearch Dashboards, open the DevTools console, and run the following requests to create and test the model. + +1. Create a model group: + + ```json + POST /_plugins/_ml/model_groups/_register + { + "name": "Cohere_embedding_model", + "description": "Test model group for cohere embedding model" + } + ``` + {% include copy-curl.html %} + + The response contains the model group ID: + + ```json + { + "model_group_id": "KEqTP40BOhavBOmfXikp", + "status": "CREATED" + } + ``` + +2. Register the model: + + ```json + POST /_plugins/_ml/models/_register + { + "name": "cohere embedding model v3", + "function_name": "remote", + "description": "test embedding model", + "model_group_id": "KEqTP40BOhavBOmfXikp", + "connector_id": "qp2QP40BWbTmLN9Fpo40" + } + ``` + {% include copy-curl.html %} + + The response contains the model ID: + + ```json + { + "task_id": "q52VP40BWbTmLN9F9I5S", + "status": "CREATED", + "model_id": "MErAP40BOhavBOmfQCkf" + } + ``` + +3. 
Deploy the model: + + ```json + POST /_plugins/_ml/models/MErAP40BOhavBOmfQCkf/_deploy + ``` + {% include copy-curl.html %} + + The response contains a task ID for the deployment operation: + + ```json + { + "task_id": "KUqWP40BOhavBOmf4Clx", + "task_type": "DEPLOY_MODEL", + "status": "COMPLETED" + } + ``` + +4. Test the model: + + ```json + POST /_plugins/_ml/models/MErAP40BOhavBOmfQCkf/_predict + { + "parameters": { + "texts": ["hello world", "how are you"] + } + } + ``` + {% include copy-curl.html %} + + The response contains the embeddings generated by the model: + + ```json + { + "inference_results": [ + { + "output": [ + { + "name": "sentence_embedding", + "data_type": "FLOAT32", + "shape": [ + 1024 + ], + "data": [ + -0.029510498, + -0.023223877, + -0.059631348, + ...] + }, + { + "name": "sentence_embedding", + "data_type": "FLOAT32", + "shape": [ + 1024 + ], + "data": [ + 0.02279663, + 0.014976501, + -0.04058838,] + } + ], + "status_code": 200 + } + ] + } + ``` + +## Step 6: Configure semantic search + +Follow these steps to configure semantic search. + +### Step 6.1: Create an ingest pipeline + +First, create an [ingest pipeline]({{site.url}}{{site.baseurl}}/ingest-pipelines/) that uses the model to create embeddings from the input text: + +```json +PUT /_ingest/pipeline/my_cohere_embedding_pipeline +{ + "description": "text embedding pipeline", + "processors": [ + { + "text_embedding": { + "model_id": "your_cohere_embedding_model_id_created_in_step5", + "field_map": { + "text": "text_knn" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +### Step 6.2: Create a vector index + +Next, create a vector index for storing the input text and generated embeddings: + +```json +PUT my_index +{ + "settings": { + "index": { + "knn.space_type": "cosinesimil", + "default_pipeline": "my_cohere_embedding_pipeline", + "knn": "true" + } + }, + "mappings": { + "properties": { + "text_knn": { + "type": "knn_vector", + "dimension": 1024 + } + } + } +} +``` +{% include copy-curl.html %} + +### Step 6.3: Ingest data + +Ingest a sample document into the index: + +```json +POST /my_index/_doc/1000001 +{ + "text": "hello world." +} +``` +{% include copy-curl.html %} + +### Step 6.4: Search the index + +Run a vector search to retrieve documents from the vector index: + +```json +POST /my_index/_search +{ + "query": { + "neural": { + "text_knn": { + "query_text": "hello", + "model_id": "your_embedding_model_id_created_in_step5", + "k": 100 + } + } + }, + "size": "1", + "_source": ["text"] +} +``` +{% include copy-curl.html %} \ No newline at end of file diff --git a/_tutorials/vector-search/semantic-search/semantic-search-openai.md b/_tutorials/vector-search/semantic-search/semantic-search-openai.md new file mode 100644 index 00000000000..046f37d5e80 --- /dev/null +++ b/_tutorials/vector-search/semantic-search/semantic-search-openai.md @@ -0,0 +1,438 @@ +--- +layout: default +title: Semantic search using OpenAI +parent: Semantic search +grand_parent: Vector search +nav_order: 20 +redirect_from: + - /vector-search/tutorials/semantic-search/semantic-search-openai/ +--- + +# Semantic search using the OpenAI embedding model + +This tutorial shows you how to implement semantic search in [Amazon OpenSearch Service](https://docs.aws.amazon.com/opensearch-service/) using the [OpenAI embedding model](https://platform.openai.com/docs/guides/embeddings). For more information, see [Semantic search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/semantic-search/). 
+ +If using Python, you can create an OpenAI connector and test the model using the [opensearch-py-ml](https://github.com/opensearch-project/opensearch-py-ml) client CLI. The CLI automates many configuration steps, making setup faster and reducing the chance of errors. For more information about using the CLI, see the [CLI documentation](https://opensearch-project.github.io/opensearch-py-ml/cli/index.html#). +{: .tip} + +If using self-managed OpenSearch instead of Amazon OpenSearch Service, create a connector to the OpenAI model using [the blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/openai_connector_embedding_blueprint.md). + +Alternatively, you can set up an embedding model using [the AIConnectorHelper notebook](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/tutorials/aws/AIConnectorHelper.ipynb). +{: .tip} + +Replace the placeholders beginning with the prefix `your_` with your own values. +{: .note} + +## Prerequisite: Create an OpenSearch cluster + +Go to the [Amazon OpenSearch Service console](https://console.aws.amazon.com/aos/home) and create an OpenSearch domain. + +Note the domain Amazon Resource Name (ARN); you'll use it in the following steps. + +## Step 1: Store the API key in AWS Secrets Manager + +Store your OpenAI API key in [AWS Secrets Manager](https://docs.aws.amazon.com/secretsmanager/latest/userguide/intro.html): + +1. Open AWS Secrets Manager. +1. Select **Store a new secret**. +1. Select **Other type of secret**. +1. Create a key-value pair with **my_openai_key** as the key and your OpenAI API key as the value. +1. Name your secret `my_test_openai_secret`. + +Note the secret ARN; you'll use it in the following steps. + +## Step 2: Create an IAM role + +To use the secret created in Step 1, you must create an AWS Identity and Access Management (IAM) role with read permissions for the secret. This IAM role will be configured in the connector and will allow the connector to read the secret. + +Go to the IAM console, create a new IAM role named `my_openai_secret_role`, and add the following trust policy and permissions: + +- Custom trust policy: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": "es.amazonaws.com" + }, + "Action": "sts:AssumeRole" + } + ] +} +``` +{% include copy.html %} + +- Permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Action": [ + "secretsmanager:GetSecretValue", + "secretsmanager:DescribeSecret" + ], + "Effect": "Allow", + "Resource": "your_secret_arn_created_in_step1" + } + ] +} +``` +{% include copy.html %} + +Note the role ARN; you'll use it in the following steps. + +## Step 3: Configure an IAM role in Amazon OpenSearch Service + +Follow these steps to configure an IAM role in Amazon OpenSearch Service. + +### Step 3.1: Create an IAM role for signing connector requests + +Generate a new IAM role specifically for signing your Create Connector API request. + +Create an IAM role named `my_create_openai_connector_role` with the following trust policy and permissions: + +- Custom trust policy: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "AWS": "your_iam_user_arn" + }, + "Action": "sts:AssumeRole" + } + ] +} +``` +{% include copy.html %} + +You'll use the `your_iam_user_arn` IAM user to assume the role in Step 4.1. 
+ +- Permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": "iam:PassRole", + "Resource": "your_iam_role_arn_created_in_step2" + }, + { + "Effect": "Allow", + "Action": "es:ESHttpPost", + "Resource": "your_opensearch_domain_arn_created" + } + ] +} +``` +{% include copy.html %} + +Note this role ARN; you'll use it in the following steps. + +### Step 3.2: Map a backend role + +Follow these steps to map a backend role: + +1. Log in to OpenSearch Dashboards and select **Security** on the top menu. +2. Select **Roles**, and then select the **ml_full_access** role. +3. On the **ml_full_access** role details page, select **Mapped users**, and then select **Manage mapping**. +4. Enter the IAM role ARN created in Step 3.1 in the **Backend roles** field, as shown in the following image. + ![Mapping a backend role]({{site.url}}{{site.baseurl}}/images/vector-search-tutorials/mapping_iam_role_arn.png) +4. Select **Map**. + +The IAM role is now successfully configured in your OpenSearch cluster. + +## Step 4: Create a connector + +Follow these steps to create a connector for the OpenAI model. For more information about creating a connector, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). + +### Step 4.1: Get temporary credentials + +Use the credentials of the IAM user specified in Step 3.1 to assume the role: + +```bash +aws sts assume-role --role-arn your_iam_role_arn_created_in_step3.1 --role-session-name your_session_name +``` +{% include copy.html %} + +Copy the temporary credentials from the response and configure them in `~/.aws/credentials`: + +```ini +[default] +AWS_ACCESS_KEY_ID=your_access_key_of_role_created_in_step3.1 +AWS_SECRET_ACCESS_KEY=your_secret_key_of_role_created_in_step3.1 +AWS_SESSION_TOKEN=your_session_token_of_role_created_in_step3.1 +``` +{% include copy.html %} + +### Step 4.2: Create a connector + +Run the following Python code with the temporary credentials configured in `~/.aws/credentials`: + +```python +import boto3 +import requests +from requests_aws4auth import AWS4Auth + +host = 'your_amazon_opensearch_domain_endpoint_created' +region = 'your_amazon_opensearch_domain_region' +service = 'es' + +credentials = boto3.Session().get_credentials() +awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service, session_token=credentials.token) + +path = '/_plugins/_ml/connectors/_create' +url = host + path + +payload = { + "name": "OpenAI embedding model connector", + "description": "Connector for OpenAI embedding model", + "version": "1.0", + "protocol": "http", + "credential": { + "secretArn": "your_secret_arn_created_in_step1", + "roleArn": "your_iam_role_arn_created_in_step2" + }, + "parameters": { + "model": "text-embedding-ada-002" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "url": "https://api.openai.com/v1/embeddings", + "headers": { + "Authorization": "Bearer ${credential.secretArn.my_openai_key}" + }, + "request_body": "{ \"input\": ${parameters.input}, \"model\": \"${parameters.model}\" }", + "pre_process_function": "connector.pre_process.openai.embedding", + "post_process_function": "connector.post_process.openai.embedding" + } + ] +} + +headers = {"Content-Type": "application/json"} + +r = requests.post(url, auth=awsauth, json=payload, headers=headers) +print(r.text) +``` +{% include copy.html %} + +The script outputs a connector ID: + +```json +{"connector_id":"OBUSRI0BTaDH9c7tUxfU"} +``` + +Note the connector ID; 
you'll use it in the next step. + +## Step 5: Create and test the model + +Log in to OpenSearch Dashboards, open the DevTools console, and run the following requests to create and test the model. + +1. Create a model group: + + ```json + POST /_plugins/_ml/model_groups/_register + { + "name": "OpenAI_embedding_model", + "description": "Test model group for OpenAI embedding model" + } + ``` + {% include copy-curl.html %} + + The response contains the model group ID: + + ```json + { + "model_group_id": "ORUSRI0BTaDH9c7t9heA", + "status": "CREATED" + } + ``` + +2. Register the model: + + ```json + POST /_plugins/_ml/models/_register + { + "name": "OpenAI embedding model", + "function_name": "remote", + "description": "test embedding model", + "model_group_id": "ORUSRI0BTaDH9c7t9heA", + "connector_id": "OBUSRI0BTaDH9c7tUxfU" + } + ``` + {% include copy-curl.html %} + + The response contains the model ID: + + ```json + { + "task_id": "OhUTRI0BTaDH9c7tLhcv", + "status": "CREATED", + "model_id": "OxUTRI0BTaDH9c7tLhdE" + } + ``` + +3. Deploy the model: + + ```json + POST /_plugins/_ml/models/OxUTRI0BTaDH9c7tLhdE/_deploy + ``` + {% include copy-curl.html %} + + The response contains a task ID for the deployment operation: + + ```json + { + "task_id": "PkoTRI0BOhavBOmfkCmF", + "task_type": "DEPLOY_MODEL", + "status": "COMPLETED" + } + ``` + +4. Test the model: + + ```json + POST /_plugins/_ml/models/OxUTRI0BTaDH9c7tLhdE/_predict + { + "parameters": { + "input": ["hello world", "how are you"] + } + } + ``` + {% include copy-curl.html %} + + The response contains the embeddings generated by the model: + + ```json + { + "inference_results": [ + { + "output": [ + { + "name": "sentence_embedding", + "data_type": "FLOAT32", + "shape": [ + 1536 + ], + "data": [ + -0.014907048, + 0.0013432145, + -0.01851529, + ...] + }, + { + "name": "sentence_embedding", + "data_type": "FLOAT32", + "shape": [ + 1536 + ], + "data": [ + -0.014011521, + -0.0067330617, + -0.011700075, + ...] + } + ], + "status_code": 200 + } + ] + } + ``` + +## Step 6: Configure semantic search + +Follow these steps to configure semantic search. + +### Step 6.1: Create an ingest pipeline + +First, create an [ingest pipeline]({{site.url}}{{site.baseurl}}/ingest-pipelines/) that uses the model to create embeddings from the input text: + +```json +PUT /_ingest/pipeline/my_openai_embedding_pipeline +{ + "description": "text embedding pipeline", + "processors": [ + { + "text_embedding": { + "model_id": "your_embedding_model_id_created_in_step5", + "field_map": { + "text": "text_knn" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +### Step 6.2: Create a vector index + +Next, create a vector index for storing the input text and generated embeddings: + +```json +PUT my_index +{ + "settings": { + "index": { + "knn.space_type": "cosinesimil", + "default_pipeline": "my_openai_embedding_pipeline", + "knn": "true" + } + }, + "mappings": { + "properties": { + "text_knn": { + "type": "knn_vector", + "dimension": 1536 + } + } + } +} +``` +{% include copy-curl.html %} + +### Step 6.3: Ingest data + +Ingest a sample document into the index: + +```json +POST /my_index/_doc/1000001 +{ + "text": "hello world." 
+} +``` +{% include copy-curl.html %} + +### Step 6.4: Search the index + +Run a vector search to retrieve documents from the vector index: + +```json +POST /my_index/_search +{ + "query": { + "neural": { + "text_knn": { + "query_text": "hello", + "model_id": "your_embedding_model_id_created_in_step5", + "k": 100 + } + } + }, + "size": "1", + "_source": ["text"] +} +``` +{% include copy-curl.html %} \ No newline at end of file diff --git a/_tutorials/vector-search/semantic-search/semantic-search-sagemaker.md b/_tutorials/vector-search/semantic-search/semantic-search-sagemaker.md new file mode 100644 index 00000000000..f9893dc471e --- /dev/null +++ b/_tutorials/vector-search/semantic-search/semantic-search-sagemaker.md @@ -0,0 +1,528 @@ +--- +layout: default +title: Semantic search in Amazon SageMaker +parent: Semantic search +grand_parent: Vector search +nav_order: 60 +redirect_from: + - /vector-search/tutorials/semantic-search/semantic-search-sagemaker/ +--- + +# Semantic search using a model in Amazon SageMaker + +This tutorial shows you how to implement semantic search in [Amazon OpenSearch Service](https://docs.aws.amazon.com/opensearch-service/) using an embedding model in Amazon SageMaker. For more information, see [Semantic search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/semantic-search/). + +If using Python, you can create an Amazon SageMaker connector and test the model using the [opensearch-py-ml](https://github.com/opensearch-project/opensearch-py-ml) client CLI. The CLI automates many configuration steps, making setup faster and reducing the chance of errors. For more information about using the CLI, see the [CLI documentation](https://opensearch-project.github.io/opensearch-py-ml/cli/index.html#). +{: .tip} + +If using self-managed OpenSearch instead of Amazon OpenSearch Service, create a connector to the model in Amazon SageMaker using [the blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/sagemaker_connector_blueprint.md). For more information about creating a connector, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). + +This tutorial does not cover how to deploy a model to Amazon SageMaker. For more information about deployment, see [Real-time inference](https://docs.aws.amazon.com/sagemaker/latest/dg/realtime-endpoints.html). + +The easiest way to set up an embedding model in Amazon OpenSearch Service is by using [AWS CloudFormation](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/cfn-template.html). Alternatively, you can set up an embedding model using [the AIConnectorHelper notebook](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/tutorials/aws/AIConnectorHelper.ipynb). +{: .tip} + +Replace the placeholders beginning with the prefix `your_` with your own values. +{: .note} + +## Model input and output requirements + +Ensure that the inputs for your model in Amazon SageMaker follow the format required by the [default pre-processing function]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/#preprocessing-function). + +The model input must be an array of strings: + +```json +["hello world", "how are you"] +``` + +Additionally, ensure that the model output follows the format required by the [default post-processing function]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/#post-processing-function). 
The model output must be an array of arrays, where each inner array corresponds to the embedding of an input string: + +```json +[ + [ + -0.048237994, + -0.07612697, + ... + ], + [ + 0.32621247, + 0.02328475, + ... + ] +] +``` + +If your model input/output is not the same as the required default, you can build your own pre-/post-processing function using a [Painless script]({{site.url}}{{site.baseurl}}/api-reference/script-apis/exec-script/). + +### Example: Amazon Bedrock Titan embedding model + +For example, the Amazon Bedrock Titan embedding model ([blueprint](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/bedrock_connector_titan_embedding_blueprint.md#2-create-connector-for-amazon-bedrock)) input is as follows: + +```json +{ "inputText": "your_input_text" } +``` + +OpenSearch expects the following input format: + +```json +{ "text_docs": [ "your_input_text1", "your_input_text2"] } +``` + +To convert `text_docs` into `inputText`, you must define the following pre-processing function: + +```json +"pre_process_function": """ + StringBuilder builder = new StringBuilder(); + builder.append("\""); + String first = params.text_docs[0];// Get the first doc, ml-commons will iterate all docs + builder.append(first); + builder.append("\""); + def parameters = "{" +"\"inputText\":" + builder + "}"; // This is the Bedrock Titan embedding model input + return "{" +"\"parameters\":" + parameters + "}";""" +``` +{% include copy.html %} + +The default Amazon Bedrock Titan embedding model output has the following format: + +```json +{ + "embedding": <float_array> +} +``` + +However, OpenSearch expects the following format: + +```json +{ + "name": "sentence_embedding", + "data_type": "FLOAT32", + "shape": [ <embedding_size> ], + "data": <float_array> +} +``` + +To transform the Amazon Bedrock Titan embedding model output into the format expected by OpenSearch, you must define the following post-processing function: + +```json +"post_process_function": """ + def name = "sentence_embedding"; + def dataType = "FLOAT32"; + if (params.embedding == null || params.embedding.length == 0) { + return params.message; + } + def shape = [params.embedding.length]; + def json = "{" + + "\"name\":\"" + name + "\"," + + "\"data_type\":\"" + dataType + "\"," + + "\"shape\":" + shape + "," + + "\"data\":" + params.embedding + + "}"; + return json; + """ +``` +{% include copy.html %} + +## Prerequisite: Create an OpenSearch cluster + +Go to the [Amazon OpenSearch Service console](https://console.aws.amazon.com/aos/home) and create an OpenSearch domain. + +Note the domain Amazon Resource Name (ARN); you'll use it in the following steps. + +## Step 1: Create an IAM role to invoke the model in Amazon SageMaker + +To invoke the model in Amazon SageMaker, you must create an AWS Identity and Access Management (IAM) role with appropriate permissions. The connector will use this role to invoke the model. 
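+
+If you prefer to script this step, you can also create the role with boto3. The following is a minimal sketch that mirrors the trust policy and permissions shown next; it assumes your local AWS credentials are allowed to create IAM roles, and the endpoint ARN is a placeholder:
+
+```python
+import json
+import boto3
+
+iam = boto3.client("iam")
+
+# These documents mirror the trust policy and permissions shown in this step.
+trust_policy = {
+    "Version": "2012-10-17",
+    "Statement": [
+        {"Effect": "Allow", "Principal": {"Service": "es.amazonaws.com"}, "Action": "sts:AssumeRole"}
+    ],
+}
+invoke_policy = {
+    "Version": "2012-10-17",
+    "Statement": [
+        {
+            "Effect": "Allow",
+            "Action": ["sagemaker:InvokeEndpoint"],
+            "Resource": ["your_sagemaker_model_inference_endpoint_arn"],
+        }
+    ],
+}
+
+role = iam.create_role(
+    RoleName="my_invoke_sagemaker_model_role",
+    AssumeRolePolicyDocument=json.dumps(trust_policy),
+)
+iam.put_role_policy(
+    RoleName="my_invoke_sagemaker_model_role",
+    PolicyName="invoke_sagemaker_model",
+    PolicyDocument=json.dumps(invoke_policy),
+)
+print(role["Role"]["Arn"])  # note this ARN; you'll use it in the following steps
+```
+{% include copy.html %}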
+ +Go to the IAM console, create a new IAM role named `my_invoke_sagemaker_model_role`, and add the following trust policy and permissions: + +- Custom trust policy: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": "es.amazonaws.com" + }, + "Action": "sts:AssumeRole" + } + ] +} +``` +{% include copy.html %} + +- Permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "sagemaker:InvokeEndpoint" + ], + "Resource": [ + "your_sagemaker_model_inference_endpoint_arn" + ] + } + ] +} +``` +{% include copy.html %} + +Note the role ARN; you'll use it in the following steps. + +## Step 2: Configure an IAM role in Amazon OpenSearch Service + +Follow these steps to configure an IAM role in Amazon OpenSearch Service. + +### Step 2.1: Create an IAM role for signing connector requests + +Generate a new IAM role specifically for signing your Create Connector API request. + +Create an IAM role named `my_create_sagemaker_connector_role` with the following trust policy and permissions: + +- Custom trust policy: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "AWS": "your_iam_user_arn" + }, + "Action": "sts:AssumeRole" + } + ] +} +``` +{% include copy.html %} + +You'll use the `your_iam_user_arn` IAM user to assume the role in Step 3.1. + +- Permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": "iam:PassRole", + "Resource": "your_iam_role_arn_created_in_step1" + }, + { + "Effect": "Allow", + "Action": "es:ESHttpPost", + "Resource": "your_opensearch_domain_arn" + } + ] +} +``` +{% include copy.html %} + +Note this role ARN; you'll use it in the following steps. + +### Step 2.2: Map a backend role + +Follow these steps to map a backend role: + +1. Log in to OpenSearch Dashboards and select **Security** on the top menu. +2. Select **Roles**, and then select the **ml_full_access** role. +3. On the **ml_full_access** role details page, select **Mapped users**, and then select **Manage mapping**. +4. Enter the IAM role ARN created in Step 2.1 in the **Backend roles** field, as shown in the following image. + ![Mapping a backend role]({{site.url}}{{site.baseurl}}/images/vector-search-tutorials/mapping_iam_role_arn.png) +5. Select **Map**. + +The IAM role is now successfully configured in your OpenSearch cluster. + +## Step 3: Create a connector + +Follow these steps to create a connector for the model. For more information about creating a connector, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). 
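+
+Steps 3.1 and 3.2 use the AWS CLI and a shared credentials file. If you prefer to stay in Python, you can assume the signing role programmatically and pass the temporary credentials directly to `AWS4Auth`, as in the following sketch; the role ARN, session name, and Region are placeholders, and the rest of Step 3.2 is unchanged:
+
+```python
+import boto3
+from requests_aws4auth import AWS4Auth
+
+# Assume the connector-creation role from Step 2.1 using your IAM user credentials.
+sts = boto3.client("sts")
+creds = sts.assume_role(
+    RoleArn="your_iam_role_arn_created_in_step2.1",
+    RoleSessionName="your_session_name",
+)["Credentials"]
+
+region = "your_amazon_opensearch_domain_region"
+awsauth = AWS4Auth(
+    creds["AccessKeyId"],
+    creds["SecretAccessKey"],
+    region,
+    "es",
+    session_token=creds["SessionToken"],
+)
+# Pass `awsauth` to the requests call in Step 3.2 instead of reading ~/.aws/credentials.
+```
+{% include copy.html %}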
+ +### Step 3.1: Get temporary credentials + +Use the credentials of the IAM user specified in Step 2.1 to assume the role: + +```bash +aws sts assume-role --role-arn your_iam_role_arn_created_in_step2.1 --role-session-name your_session_name +``` +{% include copy.html %} + +Copy the temporary credentials from the response and configure them in `~/.aws/credentials`: + +```ini +[default] +AWS_ACCESS_KEY_ID=your_access_key_of_role_created_in_step2.1 +AWS_SECRET_ACCESS_KEY=your_secret_key_of_role_created_in_step2.1 +AWS_SESSION_TOKEN=your_session_token_of_role_created_in_step2.1 +``` +{% include copy.html %} + +### Step 3.2: Create a connector + +Run the following Python code with the temporary credentials configured in `~/.aws/credentials`: + +```python +import boto3 +import requests +from requests_aws4auth import AWS4Auth + +host = 'your_amazon_opensearch_domain_endpoint' +region = 'your_amazon_opensearch_domain_region' +service = 'es' + +credentials = boto3.Session().get_credentials() +awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service, session_token=credentials.token) + + +path = '/_plugins/_ml/connectors/_create' +url = host + path + +payload = { + "name": "Sagemaker embedding model connector", + "description": "Connector for my Sagemaker embedding model", + "version": "1.0", + "protocol": "aws_sigv4", + "credential": { + "roleArn": "your_iam_role_arn_created_in_step1" + }, + "parameters": { + "region": "your_sagemaker_model_region", + "service_name": "sagemaker" + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "headers": { + "content-type": "application/json" + }, + "url": "your_sagemaker_model_inference_endpoint", + "request_body": "${parameters.input}", + "pre_process_function": "connector.pre_process.default.embedding", + "post_process_function": "connector.post_process.default.embedding" + } + ] +} + +headers = {"Content-Type": "application/json"} + +r = requests.post(url, auth=awsauth, json=payload, headers=headers) +print(r.status_code) +print(r.text) +``` +{% include copy.html %} + +The script outputs a connector ID: + +```json +{"connector_id":"tZ09Qo0BWbTmLN9FM44V"} +``` + +Note the connector ID; you'll use it in the next step. + +## Step 4: Create and test the model + +Log in to OpenSearch Dashboards, open the DevTools console, and run the following requests to create and test the model. + +1. Create a model group: + + ```json + POST /_plugins/_ml/model_groups/_register + { + "name": "Sagemaker_embedding_model", + "description": "Test model group for Sagemaker embedding model" + } + ``` + {% include copy-curl.html %} + + The response contains the model group ID: + + ```json + { + "model_group_id": "MhU3Qo0BTaDH9c7tKBfR", + "status": "CREATED" + } + ``` + +2. Register the model: + + ```json + POST /_plugins/_ml/models/_register + { + "name": "Sagemaker embedding model", + "function_name": "remote", + "description": "test embedding model", + "model_group_id": "MhU3Qo0BTaDH9c7tKBfR", + "connector_id": "tZ09Qo0BWbTmLN9FM44V" + } + ``` + {% include copy-curl.html %} + + The response contains the model ID: + + ```json + { + "task_id": "NhU9Qo0BTaDH9c7t0xft", + "status": "CREATED", + "model_id": "NxU9Qo0BTaDH9c7t1Bca" + } + ``` + +3. 
Deploy the model: + + ```json + POST /_plugins/_ml/models/NxU9Qo0BTaDH9c7t1Bca/_deploy + ``` + {% include copy-curl.html %} + + The response contains a task ID for the deployment operation: + + ```json + { + "task_id": "MxU4Qo0BTaDH9c7tJxde", + "task_type": "DEPLOY_MODEL", + "status": "COMPLETED" + } + ``` + +4. Test the model: + + ```json + POST /_plugins/_ml/models/NxU9Qo0BTaDH9c7t1Bca/_predict + { + "parameters": { + "input": ["hello world", "how are you"] + } + } + ``` + {% include copy-curl.html %} + + The response contains the embeddings generated by the model: + + ```json + { + "inference_results": [ + { + "output": [ + { + "name": "sentence_embedding", + "data_type": "FLOAT32", + "shape": [ + 384 + ], + "data": [ + -0.034477264, + 0.031023195, + 0.0067349933, + ...] + }, + { + "name": "sentence_embedding", + "data_type": "FLOAT32", + "shape": [ + 384 + ], + "data": [ + -0.031369038, + 0.037830487, + 0.07630822, + ...] + } + ], + "status_code": 200 + } + ] + } + ``` + +## Step 5: Configure semantic search + +Follow these steps to configure semantic search. + +### Step 5.1: Create an ingest pipeline + +First, create an [ingest pipeline]({{site.url}}{{site.baseurl}}/ingest-pipelines/) that uses the model in Amazon SageMaker to create embeddings from the input text: + +```json +PUT /_ingest/pipeline/my_sagemaker_embedding_pipeline +{ + "description": "text embedding pipeline", + "processors": [ + { + "text_embedding": { + "model_id": "your_sagemaker_embedding_model_id_created_in_step4", + "field_map": { + "text": "text_knn" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +### Step 5.2: Create a vector index + +Next, create a vector index for storing the input text and generated embeddings: + +```json +PUT my_index +{ + "settings": { + "index": { + "knn.space_type": "cosinesimil", + "default_pipeline": "my_sagemaker_embedding_pipeline", + "knn": "true" + } + }, + "mappings": { + "properties": { + "text_knn": { + "type": "knn_vector", + "dimension": your_sagemake_model_embedding_dimension + } + } + } +} +``` +{% include copy-curl.html %} + +### Step 5.3: Ingest data + +Ingest a sample document into the index: + +```json +POST /my_index/_doc/1000001 +{ + "text": "hello world." 
+} +``` +{% include copy-curl.html %} + +### Step 5.4: Search the index + +Run a vector search to retrieve documents from the vector index: + +```json +POST /my_index/_search +{ + "query": { + "neural": { + "text_knn": { + "query_text": "hello", + "model_id": "your_embedding_model_id_created_in_step4", + "k": 100 + } + } + }, + "size": "1", + "_source": ["text"] +} +``` +{% include copy-curl.html %} \ No newline at end of file diff --git a/_ml-commons-plugin/tutorials/generate-embeddings.md b/_tutorials/vector-search/vector-operations/generate-embeddings.md similarity index 79% rename from _ml-commons-plugin/tutorials/generate-embeddings.md rename to _tutorials/vector-search/vector-operations/generate-embeddings.md index c236424eb89..2b37949276a 100644 --- a/_ml-commons-plugin/tutorials/generate-embeddings.md +++ b/_tutorials/vector-search/vector-operations/generate-embeddings.md @@ -1,20 +1,24 @@ --- layout: default title: Generating embeddings -parent: Tutorials +parent: Vector operations +grand_parent: Vector search nav_order: 5 +redirect_from: + - /ml-commons-plugin/tutorials/generate-embeddings/ + - /vector-search/tutorials/vector-operations/generate-embeddings/ --- -# Generating embeddings for arrays of objects +# Generating embeddings from arrays of objects -This tutorial illustrates how to generate embeddings for arrays of objects. +This tutorial shows you how to generate embeddings for arrays of objects. For more information, see [Generating embeddings automatically]({{site.url}}{{site.baseurl}}/vector-search/getting-started/auto-generated-embeddings/). Replace the placeholders beginning with the prefix `your_` with your own values. {: .note} ## Step 1: Register an embedding model -For this tutorial, you will use the [Amazon Bedrock Titan Embedding model](https://docs.aws.amazon.com/bedrock/latest/userguide/titan-embedding-models.html). +In this tutorial, you will use the [Amazon Titan Text Embeddings models](https://docs.aws.amazon.com/bedrock/latest/userguide/titan-embedding-models.html) hosted on Amazon Bedrock. First, follow the [Amazon Bedrock Titan blueprint example](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/remote_inference_blueprints/bedrock_connector_titan_embedding_blueprint.md) to register and deploy the model. @@ -54,16 +58,16 @@ The response contains inference results: Follow the next set of steps to create an ingest pipeline for generating embeddings. -### Step 2.1: Create a k-NN index +### Step 2.1: Create a vector index -First, create a k-NN index: +First, create a vector index: ```json PUT my_books { "settings" : { "index.knn" : "true", - "default_pipeline": "bedrock_embedding_foreach_pipeline" + "default_pipeline": "bedrock_embedding_pipeline" }, "mappings": { "properties": { @@ -94,9 +98,7 @@ Then create an inner ingest pipeline to generate an embedding for one array elem This pipeline contains three processors: -- `set` processor: The `text_embedding` processor is unable to identify the `_ingest._value.title` field. You must copy `_ingest._value.title` to a non-existing temporary field so that the `text_embedding` processor can process it. - `text_embedding` processor: Converts the value of the temporary field to an embedding. -- `remove` processor: Removes the temporary field. 
To create such a pipeline, send the following request: @@ -104,47 +106,13 @@ To create such a pipeline, send the following request: PUT _ingest/pipeline/bedrock_embedding_pipeline { "processors": [ - { - "set": { - "field": "title_tmp", - "value": {% raw %}"{{_ingest._value.title}}"{% endraw %} - } - }, { "text_embedding": { - "model_id": your_embedding_model_id, + "model_id": "your_embedding_model_id", "field_map": { - "title_tmp": "_ingest._value.title_embedding" + "books.title": "title_embedding" } } - }, - { - "remove": { - "field": "title_tmp" - } - } - ] -} -``` -{% include copy-curl.html %} - -Create an ingest pipeline with a `foreach` processor that will apply the `bedrock_embedding_pipeline` to each element of the `books` array: - -```json -PUT _ingest/pipeline/bedrock_embedding_foreach_pipeline -{ - "description": "Test nested embeddings", - "processors": [ - { - "foreach": { - "field": "books", - "processor": { - "pipeline": { - "name": "bedrock_embedding_pipeline" - } - }, - "ignore_failure": true - } } ] } @@ -156,7 +124,7 @@ PUT _ingest/pipeline/bedrock_embedding_foreach_pipeline First, you'll test the pipeline on an array that contains two book objects, both with a `title` field: ```json -POST _ingest/pipeline/bedrock_embedding_foreach_pipeline/_simulate +POST _ingest/pipeline/bedrock_embedding_pipeline/_simulate { "docs": [ { diff --git a/_tutorials/vector-search/vector-operations/index.md b/_tutorials/vector-search/vector-operations/index.md new file mode 100644 index 00000000000..c056a831e2b --- /dev/null +++ b/_tutorials/vector-search/vector-operations/index.md @@ -0,0 +1,36 @@ +--- +layout: default +title: Vector operations +parent: Vector search +has_children: true +has_toc: false +nav_order: 10 +redirect_from: + - /vector-search/tutorials/vector-operations/ + - /tutorials/vector-search/vector-operations/ +vector_operations: + - heading: "Generating embeddings from arrays of objects" + list: + - "<b>Platform</b>: OpenSearch" + - "<b>Model</b>: Amazon Titan" + - "<b>Deployment</b>: Amazon Bedrock" + link: "/tutorials/vector-search/vector-operations/generate-embeddings/" + - heading: "Semantic search using byte-quantized vectors" + list: + - "<b>Platform:</b> OpenSearch" + - "<b>Model:</b> Cohere Embed" + - "<b>Deployment:</b> Provider API" + link: "/tutorials/vector-search/vector-operations/semantic-search-byte-vectors/" + - heading: "Optimizing vector search using Cohere compressed embeddings" + list: + - "<b>Platform:</b> OpenSearch" + - "<b>Model:</b> Cohere Embed Multilingual v3" + - "<b>Deployment:</b> Amazon Bedrock" + link: "/tutorials/vector-search/vector-operations/optimize-compression/" +--- + +# Vector operation tutorials + +The following tutorials show you how to implement vector operations. 
+ +{% include cards.html cards=page.vector_operations %} \ No newline at end of file diff --git a/_tutorials/vector-search/vector-operations/optimize-compression.md b/_tutorials/vector-search/vector-operations/optimize-compression.md new file mode 100644 index 00000000000..697d9ebdcee --- /dev/null +++ b/_tutorials/vector-search/vector-operations/optimize-compression.md @@ -0,0 +1,807 @@ +--- +layout: default +title: Optimizing vector search using Cohere compressed embeddings +parent: Vector operations +grand_parent: Vector search +nav_order: 20 +redirect_from: + - /vector-search/tutorials/vector-operations/optimize-compression/ +--- + +# Optimizing vector search using Cohere compressed embeddings + +This tutorial shows you how to optimize vector search using Cohere compressed embeddings. These embeddings allow for more efficient storage and faster retrieval of vector representations, making them ideal for large-scale search applications. + +This tutorial is compatible with version 2.17 and later, except for [Using a template query and a search pipeline](#using-a-template-query-and-a-search-pipeline) in [Step 4: Search the index](#step-4-search-the-index), which requires version 2.19 or later. + +This tutorial uses the Cohere Embed Multilingual v3 model on Amazon Bedrock. For more information about using Cohere compressed embeddings on Amazon Bedrock, see [this blog post](https://aws.amazon.com/about-aws/whats-new/2024/06/amazon-bedrock-compressed-embeddings-cohere-embed/). + +In this tutorial, you'll use the following OpenSearch components: +- [ML inference ingest processor]({{site.url}}{{site.baseurl}}/ingest-pipelines/processors/ml-inference/) +- [ML inference search request processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/ml-inference-search-request/) +- [Search template query]({{site.url}}{{site.baseurl}}/api-reference/search-template/) +- [Vector index]({{site.url}}{{site.baseurl}}/search-plugins/knn/index/) and [byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized/#byte-vectors) + +Replace the placeholders beginning with the prefix `your_` with your own values. +{: .note} + +## Step 1: Configure an embedding model + +Follow these steps to create a connector to Amazon Bedrock for accessing the Cohere Embed model. + +### Step 1.1: Create a connector + +Create a connector for the embedding model using [this blueprint](https://github.com/opensearch-project/ml-commons/blob/main/docs/remote_inference_blueprints/bedrock_connector_cohere_cohere.embed-multilingual-v3_blueprint.md). For more information about creating a connector, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). + +Because you'll use the [ML inference processor]({{site.url}}{{site.baseurl}}/ingest-pipelines/processors/ml-inference/) in this tutorial, you don't need to specify a pre- or post-processing function in the connector. +{: .note} + +To create a connector, send the following request. The `"embedding_types": ["int8"]` parameter specifies 8-bit integer quantized embeddings from the Cohere model. This setting compresses embeddings from 32-bit floats to 8-bit integers, reducing storage space and improving computation speed. While there is a slight trade-off in precision, it is typically negligible for search tasks. 
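+
+To get a sense of what this compression means, note that each embedding dimension shrinks from a 4-byte float to a 1-byte integer. The following Python snippet is only a conceptual sketch of max-absolute-value int8 quantization; it does not reproduce the exact scheme that Cohere applies internally:
+
+```python
+def quantize_int8(vector):
+    """Map each float component to an 8-bit integer in [-128, 127] using max-abs scaling."""
+    scale = 127.0 / max(abs(v) for v in vector)  # assumes a nonzero vector
+    return [round(v * scale) for v in vector]
+
+print(quantize_int8([-0.048, 0.326, 0.023]))  # [-19, 127, 9]
+```
+{% include copy.html %}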
These quantized embeddings are compatible with OpenSearch's `knn_index`, which supports byte vectors: + +```json +POST _plugins/_ml/connectors/_create +{ + "name": "Amazon Bedrock Connector: Cohere embed-multilingual-v3", + "description": "Test connector for Amazon Bedrock Cohere embed-multilingual-v3", + "version": 1, + "protocol": "aws_sigv4", + "credential": { + "access_key": "your_aws_access_key", + "secret_key": "your_aws_secret_key", + "session_token": "your_aws_session_token" + }, + "parameters": { + "region": "your_aws_region", + "service_name": "bedrock", + "truncate": "END", + "input_type": "search_document", + "model": "cohere.embed-multilingual-v3", + "embedding_types": ["int8"] + }, + "actions": [ + { + "action_type": "predict", + "method": "POST", + "headers": { + "x-amz-content-sha256": "required", + "content-type": "application/json" + }, + "url": "https://bedrock-runtime.${parameters.region}.amazonaws.com/model/${parameters.model}/invoke", + "request_body": "{ \"texts\": ${parameters.texts}, \"truncate\": \"${parameters.truncate}\", \"input_type\": \"${parameters.input_type}\", \"embedding_types\": ${parameters.embedding_types} }" + + } + ] +} +``` +{% include copy-curl.html %} + +For more information about the model parameters, see the [Cohere documentation](https://docs.cohere.com/v2/docs/embeddings) and the [Amazon Bedrock documentation](https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-embed.html) + +The response contains the connector ID: + +```json +{ + "connector_id": "AOP0OZUB3JwAtE25PST0" +} +``` + +Note the connector ID; you'll use it in the next step. + +### Step 1.2: Register the model + +Next, register the model using the connector you created in the previous step. The `interface` parameter is optional. 
If the model does not require a specific interface configuration, set this parameter to an empty object: `"interface": {}`: + +```json +POST _plugins/_ml/models/_register?deploy=true +{ + "name": "Bedrock Cohere embed-multilingual-v3", + "version": "1.0", + "function_name": "remote", + "description": "Bedrock Cohere embed-multilingual-v3", + "connector_id": "AOP0OZUB3JwAtE25PST0", + "interface": { + "input": "{\n \"type\": \"object\",\n \"properties\": {\n \"parameters\": {\n \"type\": \"object\",\n \"properties\": {\n \"texts\": {\n \"type\": \"array\",\n \"items\": {\n \"type\": \"string\"\n }\n },\n \"embedding_types\": {\n \"type\": \"array\",\n \"items\": {\n \"type\": \"string\",\n \"enum\": [\"float\", \"int8\", \"uint8\", \"binary\", \"ubinary\"]\n }\n },\n \"truncate\": {\n \"type\": \"array\",\n \"items\": {\n \"type\": \"string\",\n \"enum\": [\"NONE\", \"START\", \"END\"]\n }\n },\n \"input_type\": {\n \"type\": \"string\",\n \"enum\": [\"search_document\", \"search_query\", \"classification\", \"clustering\"]\n }\n },\n \"required\": [\"texts\"]\n }\n },\n \"required\": [\"parameters\"]\n}", + "output": "{\n \"type\": \"object\",\n \"properties\": {\n \"inference_results\": {\n \"type\": \"array\",\n \"items\": {\n \"type\": \"object\",\n \"properties\": {\n \"output\": {\n \"type\": \"array\",\n \"items\": {\n \"type\": \"object\",\n \"properties\": {\n \"name\": {\n \"type\": \"string\"\n },\n \"dataAsMap\": {\n \"type\": \"object\",\n \"properties\": {\n \"id\": {\n \"type\": \"string\",\n \"format\": \"uuid\"\n },\n \"texts\": {\n \"type\": \"array\",\n \"items\": {\n \"type\": \"string\"\n }\n },\n \"embeddings\": {\n \"type\": \"object\",\n \"properties\": {\n \"binary\": {\n \"type\": \"array\",\n \"items\": {\n \"type\": \"array\",\n \"items\": {\n \"type\": \"number\"\n }\n }\n },\n \"float\": {\n \"type\": \"array\",\n \"items\": {\n \"type\": \"array\",\n \"items\": {\n \"type\": \"number\"\n }\n }\n },\n \"int8\": {\n \"type\": \"array\",\n \"items\": {\n \"type\": \"array\",\n \"items\": {\n \"type\": \"number\"\n }\n }\n },\n \"ubinary\": {\n \"type\": \"array\",\n \"items\": {\n \"type\": \"array\",\n \"items\": {\n \"type\": \"number\"\n }\n }\n },\n \"uint8\": {\n \"type\": \"array\",\n \"items\": {\n \"type\": \"array\",\n \"items\": {\n \"type\": \"number\"\n }\n }\n }\n }\n },\n \"response_type\": {\n \"type\": \"string\"\n }\n },\n \"required\": [\"embeddings\"]\n }\n },\n \"required\": [\"name\", \"dataAsMap\"]\n }\n },\n \"status_code\": {\n \"type\": \"integer\"\n }\n },\n \"required\": [\"output\", \"status_code\"]\n }\n }\n },\n \"required\": [\"inference_results\"]\n}" + } +} +``` +{% include copy-curl.html %} + +For more information, see the [model interface documentation]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/register-model/#the-interface-parameter) + +The response contains the model ID: + +```json +{ + "task_id": "COP0OZUB3JwAtE25yiQr", + "status": "CREATED", + "model_id": "t64OPpUBX2k07okSZc2n" +} +``` + +To test the model, send the following request: + +```json +POST _plugins/_ml/models/t64OPpUBX2k07okSZc2n/_predict +{ + "parameters": { + "texts": ["Say this is a test"], + "embedding_types": [ "int8" ] + } +} +``` +{% include copy-curl.html %} + +The response contains the generated embeddings: + +```json +{ + "inference_results": [ + { + "output": [ + { + "name": "response", + "dataAsMap": { + "id": "db07a08c-283d-4da5-b0c5-a9a54ef35d01", + "texts": [ + "Say this is a test" + ], + "embeddings": { + "int8": [ + [ + -26.0, 
+ 31.0, + ... + ] + ] + }, + "response_type": "embeddings_by_type" + } + } + ], + "status_code": 200 + } + ] +} +``` + +## Step 2: Create an ingest pipeline + +An ingest pipeline lets you process documents before indexing them. In this case, you'll use one to generate embeddings for the `title` and `description` fields in your data. + +There are two ways to set up the pipeline: + +1. [Invoke the model separately for `title` and `description`](#option-1-invoke-the-model-separately-for-title-and-description): This option sends separate requests for each field, generating independent embeddings. +1. [Invoke the model once by combining `title` and `description`](#option-2-invoke-the-model-once-by-combining-title-and-description): This option concatenates the fields into a single input and sends one request, generating a single embedding that represents both. + +### Option 1: Invoke the model separately for `title` and `description` + +```json +PUT _ingest/pipeline/ml_inference_pipeline_cohere +{ +"processors": [ + { + "ml_inference": { + "tag": "ml_inference", + "description": "This processor is going to run ml inference during ingest request", + "model_id": "t64OPpUBX2k07okSZc2n", + "input_map": [ + { + "texts": "$..title" + }, + { + "texts": "$..description" + } + ], + "output_map": [ + { + "title_embedding": "embeddings.int8[0]" + }, + { + "description_embedding": "embeddings.int8[0]" + } + ], + "model_config": { + "embedding_types": ["int8"] + }, + "ignore_failure": false + } + } +] +} +``` +{% include copy-curl.html %} + +### Option 2: Invoke the model once by combining `title` and `description` + +```json +PUT _ingest/pipeline/ml_inference_pipeline_cohere +{ + "description": "Concatenate title and description fields", + "processors": [ + { + "set": { + "field": "title_desc_tmp", + "value": [ + "{{title}}", + "{{description}}" + ] + } + }, + { + "ml_inference": { + "tag": "ml_inference", + "description": "This processor is going to run ml inference during ingest request", + "model_id": "t64OPpUBX2k07okSZc2n", + "input_map": [ + { + "texts": "title_desc_tmp" + } + ], + "output_map": [ + { + "title_embedding": "embeddings.int8[0]", + "description_embedding": "embeddings.int8[1]" + } + ], + "model_config": { + "embedding_types": ["int8"] + }, + "ignore_failure": true + } + }, + { + "remove": { + "field": "title_desc_tmp" + } + } + ] +} +``` +{% include copy-curl.html %} + +Test the pipeline by sending the following [simulate]({{site.url}}{{site.baseurl}}/ingest-pipelines/simulate-ingest/) request: + +```json +POST _ingest/pipeline/ml_inference_pipeline_cohere/_simulate +{ + "docs": [ + { + "_index": "books", + "_id": "1", + "_source": { + "title": "The Great Gatsby", + "author": "F. Scott Fitzgerald", + "description": "A novel of decadence and excess in the Jazz Age, exploring themes of wealth, love, and the American Dream.", + "publication_year": 1925, + "genre": "Classic Fiction" + } + } + ] +} +``` +{% include copy-curl.html %} + +The response contains the generated embeddings: + +```json +{ + "docs": [ + { + "doc": { + "_index": "books", + "_id": "1", + "_source": { + "publication_year": 1925, + "author": "F. Scott Fitzgerald", + "genre": "Classic Fiction", + "description": "A novel of decadence and excess in the Jazz Age, exploring themes of wealth, love, and the American Dream.", + "title": "The Great Gatsby", + "title_embedding": [ + 18, + 33, + ... + ], + "description_embedding": [ + -21, + -14, + ... 
+ ] + }, + "_ingest": { + "timestamp": "2025-02-25T09:11:32.192125042Z" + } + } + } + ] +} +``` + +## Step 3: Create a vector index and ingest data + +Next, create a vector index: + +```json +PUT books +{ + "settings": { + "index": { + "default_pipeline": "ml_inference_pipeline_cohere", + "knn": true, + "knn.algo_param.ef_search": 100 + } + }, + "mappings": { + "properties": { + "title_embedding": { + "type": "knn_vector", + "dimension": 1024, + "data_type": "byte", + "space_type": "l2", + "method": { + "name": "hnsw", + "engine": "lucene", + "parameters": { + "ef_construction": 100, + "m": 16 + } + } + }, + "description_embedding": { + "type": "knn_vector", + "dimension": 1024, + "data_type": "byte", + "space_type": "l2", + "method": { + "name": "hnsw", + "engine": "lucene", + "parameters": { + "ef_construction": 100, + "m": 16 + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +Ingest test data into the index: + +```json +POST _bulk +{"index":{"_index":"books"}} +{"title":"The Great Gatsby","author":"F. Scott Fitzgerald","description":"A novel of decadence and excess in the Jazz Age, exploring themes of wealth, love, and the American Dream.","publication_year":1925,"genre":"Classic Fiction"} +{"index":{"_index":"books"}} +{"title":"To Kill a Mockingbird","author":"Harper Lee","description":"A powerful story of racial injustice and loss of innocence in the American South during the Great Depression.","publication_year":1960,"genre":"Literary Fiction"} +{"index":{"_index":"books"}} +{"title":"Pride and Prejudice","author":"Jane Austen","description":"A romantic novel of manners that follows the character development of Elizabeth Bennet as she learns about the repercussions of hasty judgments and comes to appreciate the difference between superficial goodness and actual goodness.","publication_year":1813,"genre":"Romance"} +``` +{% include copy-curl.html %} + +## Step 4: Search the index + +You can run a vector search on the index in the following ways: +- [Using a template query and a search pipeline](#using-a-template-query-and-a-search-pipeline) +- [Rewriting the query in the search pipeline](#rewriting-the-query-in-the-search-pipeline) + +### Using a template query and a search pipeline + +First, create a search pipeline: + +```json +PUT _search/pipeline/ml_inference_pipeline_cohere_search +{ + "request_processors": [ + { + "ml_inference": { + "model_id": "t64OPpUBX2k07okSZc2n", + "input_map": [ + { + "texts": "$..ext.ml_inference.text" + } + ], + "output_map": [ + { + "ext.ml_inference.vector": "embeddings.int8[0]" + } + ], + "model_config": { + "input_type": "search_query", + "embedding_types": ["int8"] + } + } + } + ] +} +``` +{% include copy-curl.html %} + +Next, use a template query to run a search: + +```json +GET books/_search?search_pipeline=ml_inference_pipeline_cohere_search&verbose_pipeline=false +{ + "query": { + "template": { + "knn": { + "description_embedding": { + "vector": "${ext.ml_inference.vector}", + "k": 10 + } + } + } + }, + "ext": { + "ml_inference": { + "text": "American Dream" + } + }, + "_source": { + "excludes": [ + "title_embedding", "description_embedding" + ] + }, + "size": 2 +} +``` +{% include copy-curl.html %} + +To see each search processor's input and output, add `&verbose_pipeline=true` to your request. This is useful for debugging and understanding how the search pipeline modifies queries. For more information, see [Debugging a search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/debugging-search-pipeline/). 
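+
+For example, the following request repeats the previous template query with verbose output enabled so that you can inspect the intermediate input and output of each processor:
+
+```json
+GET books/_search?search_pipeline=ml_inference_pipeline_cohere_search&verbose_pipeline=true
+{
+  "query": {
+    "template": {
+      "knn": {
+        "description_embedding": {
+          "vector": "${ext.ml_inference.vector}",
+          "k": 10
+        }
+      }
+    }
+  },
+  "ext": {
+    "ml_inference": {
+      "text": "American Dream"
+    }
+  },
+  "_source": {
+    "excludes": [
+      "title_embedding", "description_embedding"
+    ]
+  },
+  "size": 2
+}
+```
+{% include copy-curl.html %}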
+ +### Rewriting the query in the search pipeline + +Create another search pipeline that rewrites the query: + +```json +PUT _search/pipeline/ml_inference_pipeline_cohere_search2 +{ + "request_processors": [ + { + "ml_inference": { + "model_id": "t64OPpUBX2k07okSZc2n", + "input_map": [ + { + "texts": "$..match.description.query" + } + ], + "output_map": [ + { + "query_vector": "embeddings.int8[0]" + } + ], + "model_config": { + "input_type": "search_query", + "embedding_types": ["int8"] + }, + "query_template": """ + { + "query": { + "knn": { + "description_embedding": { + "vector": ${query_vector}, + "k": 10 + } + } + }, + "_source": { + "excludes": [ + "title_embedding", + "description_embedding" + ] + }, + "size": 2 + } + """ + } + } + ] +} +``` +{% include copy-curl.html %} + +Now run a vector search using this pipeline: + +```json +GET books/_search?search_pipeline=ml_inference_pipeline_cohere_search2 +{ + "query": { + "match": { + "description": "American Dream" + } + } +} +``` +{% include copy-curl.html %} + +The response contains the matching documents: + +```json +{ + "took": 96, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": 7.271585e-7, + "hits": [ + { + "_index": "books", + "_id": "U640PJUBX2k07okSEMwy", + "_score": 7.271585e-7, + "_source": { + "publication_year": 1925, + "author": "F. Scott Fitzgerald", + "genre": "Classic Fiction", + "description": "A novel of decadence and excess in the Jazz Age, exploring themes of wealth, love, and the American Dream.", + "title": "The Great Gatsby" + } + }, + { + "_index": "books", + "_id": "VK40PJUBX2k07okSEMwy", + "_score": 6.773544e-7, + "_source": { + "publication_year": 1960, + "author": "Harper Lee", + "genre": "Literary Fiction", + "description": "A powerful story of racial injustice and loss of innocence in the American South during the Great Depression.", + "title": "To Kill a Mockingbird" + } + } + ] + } +} +``` + +## Step 5 (Optional): Using binary embeddings + +In this section, you'll extend the setup to support binary embeddings, which offer even more efficient storage and faster retrieval. Binary embeddings can significantly reduce storage requirements and improve search speed, making them ideal for large-scale applications. + +You don't need to modify the connector or model---you only need to update the vector index, ingest pipeline, and search pipeline. + +### Step 5.1: Create an ingest pipeline + +Create a new ingest pipeline named `ml_inference_pipeline_cohere_binary` by using the same configuration as in [Step 2](#step-2-create-an-ingest-pipeline) but replacing all occurrences of `int8` with `binary`. 
+ +### Option 1: Invoke the model separately for `title` and `description` + +```json +PUT _ingest/pipeline/ml_inference_pipeline_cohere +{ +"processors": [ + { + "ml_inference": { + "tag": "ml_inference", + "description": "This processor is going to run ml inference during ingest request", + "model_id": "t64OPpUBX2k07okSZc2n", + "input_map": [ + { + "texts": "$..title" + }, + { + "texts": "$..description" + } + ], + "output_map": [ + { + "title_embedding": "embeddings.binary[0]" + }, + { + "description_embedding": "embeddings.binary[0]" + } + ], + "model_config": { + "embedding_types": ["binary"] + }, + "ignore_failure": false + } + } +] +} +``` +{% include copy-curl.html %} + +### Option 2: Invoke the model once by combining `title` and `description` + +```json +PUT _ingest/pipeline/ml_inference_pipeline_cohere +{ + "description": "Concatenate title and description fields", + "processors": [ + { + "set": { + "field": "title_desc_tmp", + "value": [ + "{{title}}", + "{{description}}" + ] + } + }, + { + "ml_inference": { + "tag": "ml_inference", + "description": "This processor is going to run ml inference during ingest request", + "model_id": "t64OPpUBX2k07okSZc2n", + "input_map": [ + { + "texts": "title_desc_tmp" + } + ], + "output_map": [ + { + "title_embedding": "embeddings.binary[0]", + "description_embedding": "embeddings.binary[1]" + } + ], + "model_config": { + "embedding_types": ["binary"] + }, + "ignore_failure": true + } + }, + { + "remove": { + "field": "title_desc_tmp" + } + } + ] +} +``` +{% include copy-curl.html %} + + +### Step 5.2: Create a vector index and ingest data + +Create a new vector index containing a [binary vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized/#binary-vectors) field: + +```json +PUT books_binary_embedding +{ + "settings": { + "index": { + "default_pipeline": "ml_inference_pipeline_cohere_binary", + "knn": true + } + }, + "mappings": { + "properties": { + "title_embedding": { + "type": "knn_vector", + "dimension": 1024, + "data_type": "binary", + "space_type": "hamming", + "method": { + "name": "hnsw", + "engine": "faiss" + } + }, + "description_embedding": { + "type": "knn_vector", + "dimension": 1024, + "data_type": "binary", + "space_type": "hamming", + "method": { + "name": "hnsw", + "engine": "faiss" + } + } + } + } +} +``` +{% include copy-curl.html %} + +Ingest test data into the index: + +```json +POST _bulk +{"index":{"_index":"books_binary_embedding"}} +{"title":"The Great Gatsby","author":"F. 
Scott Fitzgerald","description":"A novel of decadence and excess in the Jazz Age, exploring themes of wealth, love, and the American Dream.","publication_year":1925,"genre":"Classic Fiction"} +{"index":{"_index":"books_binary_embedding"}} +{"title":"To Kill a Mockingbird","author":"Harper Lee","description":"A powerful story of racial injustice and loss of innocence in the American South during the Great Depression.","publication_year":1960,"genre":"Literary Fiction"} +{"index":{"_index":"books_binary_embedding"}} +{"title":"Pride and Prejudice","author":"Jane Austen","description":"A romantic novel of manners that follows the character development of Elizabeth Bennet as she learns about the repercussions of hasty judgments and comes to appreciate the difference between superficial goodness and actual goodness.","publication_year":1813,"genre":"Romance"} +``` +{% include copy-curl.html %} + +### Step 5.3: Create a search pipeline + +Create a new search pipeline named `ml_inference_pipeline_cohere_search_binary` by using the same configuration as in [Step 2](#step-4-search-the-index) but replacing all occurrences of `int8` with `binary`. + +1. Change `embeddings.int8[0]` to `embeddings.binary[0]`. +1. Change `"embedding_types": ["int8"]` to `"embedding_types": ["binary"]`. + +### Using a template query and a search pipeline + +First, create a search pipeline: + +```json +PUT _search/pipeline/ml_inference_pipeline_cohere_search_binary +{ + "request_processors": [ + { + "ml_inference": { + "model_id": "t64OPpUBX2k07okSZc2n", + "input_map": [ + { + "texts": "$..ext.ml_inference.text" + } + ], + "output_map": [ + { + "ext.ml_inference.vector": "embeddings.binary[0]" + } + ], + "model_config": { + "input_type": "search_query", + "embedding_types": ["binary"] + } + } + } + ] +} +``` +{% include copy-curl.html %} + +### Rewriting the query in the search pipeline + +Create another search pipeline that rewrites the query: + +```json +PUT _search/pipeline/ml_inference_pipeline_cohere_search_binary2 +{ + "request_processors": [ + { + "ml_inference": { + "model_id": "t64OPpUBX2k07okSZc2n", + "input_map": [ + { + "texts": "$..match.description.query" + } + ], + "output_map": [ + { + "query_vector": "embeddings.binary[0]" + } + ], + "model_config": { + "input_type": "search_query", + "embedding_types": ["binary"] + }, + "query_template": """ + { + "query": { + "knn": { + "description_embedding": { + "vector": ${query_vector}, + "k": 10 + } + } + }, + "_source": { + "excludes": [ + "title_embedding", + "description_embedding" + ] + }, + "size": 2 + } + """ + } + } + ] +} +``` +{% include copy-curl.html %} + +Then you can use the search pipeline to run a vector search, as described in [Step 4](#step-4-search-the-index). 
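+
+For example, the following request uses the `ml_inference_pipeline_cohere_search_binary2` pipeline created above to search the `books_binary_embedding` index, mirroring the query from Step 4:
+
+```json
+GET books_binary_embedding/_search?search_pipeline=ml_inference_pipeline_cohere_search_binary2
+{
+  "query": {
+    "match": {
+      "description": "American Dream"
+    }
+  }
+}
+```
+{% include copy-curl.html %}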
\ No newline at end of file diff --git a/_ml-commons-plugin/tutorials/semantic-search-byte-vectors.md b/_tutorials/vector-search/vector-operations/semantic-search-byte-vectors.md similarity index 94% rename from _ml-commons-plugin/tutorials/semantic-search-byte-vectors.md rename to _tutorials/vector-search/vector-operations/semantic-search-byte-vectors.md index c4cc27f6609..1bb79ab6c85 100644 --- a/_ml-commons-plugin/tutorials/semantic-search-byte-vectors.md +++ b/_tutorials/vector-search/vector-operations/semantic-search-byte-vectors.md @@ -1,13 +1,17 @@ --- layout: default title: Semantic search using byte vectors -parent: Tutorials +parent: Vector operations +grand_parent: Vector search nav_order: 10 +redirect_from: + - /ml-commons-plugin/tutorials/semantic-search-byte-vectors/ + - /vector-search/tutorials/vector-operations/semantic-search-byte-vectors/ --- # Semantic search using byte-quantized vectors -This tutorial shows you how to build a semantic search using the [Cohere Embed model](https://docs.cohere.com/reference/embed) and byte-quantized vectors. For more information about using byte-quantized vectors, see [Byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#byte-vectors). +This tutorial shows you how to build a semantic search using the [Cohere Embed model](https://docs.cohere.com/reference/embed) and byte-quantized vectors. For more information about using byte-quantized vectors, see [Byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized/#byte-vectors) and [Semantic search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/semantic-search/). The Cohere Embed v3 model supports several `embedding_types`. For this tutorial, you'll use the `INT8` type to encode byte-quantized vectors. @@ -63,7 +67,7 @@ POST /_plugins/_ml/connectors/_create ``` {% include copy-curl.html %} -To ensure compatibility with the Neural Search plugin, the `data_type` (output in the `inference_results.output.data_type` field of the response) must be set to `FLOAT32` in the post-processing function, even though the actual embedding type will be `INT8`. +To ensure compatibility with OpenSearch, the `data_type` (output in the `inference_results.output.data_type` field of the response) must be set to `FLOAT32` in the post-processing function, even though the actual embedding type will be `INT8`. {: .important} Note the connector ID in the response; you'll use it to register the model. @@ -159,7 +163,7 @@ PUT /_ingest/pipeline/pipeline-cohere ``` {% include copy-curl.html %} -Next, create a k-NN index and set the `data_type` for the `passage_embedding` field to `byte` so that it can hold byte-quantized vectors: +Next, create a vector index and set the `data_type` for the `passage_embedding` field to `byte` so that it can store byte-quantized vectors: ```json PUT my_test_data @@ -261,7 +265,7 @@ POST /_plugins/_ml/models/_register?deploy=true Note the model ID in the response; you'll use it to run queries. 
-Run a neural search query, providing the model ID: +Run a vector search, providing the model ID: ```json POST /my_test_data/_search diff --git a/_upgrade-to/docker-upgrade-to.md b/_upgrade-to/docker-upgrade-to.md index 9e63e7bd434..bc12b387173 100644 --- a/_upgrade-to/docker-upgrade-to.md +++ b/_upgrade-to/docker-upgrade-to.md @@ -10,4 +10,4 @@ If you use a container orchestration system like Kubernetes (or manage your cont If you use Docker Compose, we highly recommend that you perform what amounts to a [cluster restart upgrade]({{site.url}}{{site.baseurl}}/upgrade-to/upgrade-to/). Update your cluster configuration with new images, new settings, and new environment variables, and test it. Then stop and start the cluster. This process requires downtime, but takes very few steps and lets you continue to treat the cluster as a single entity that you can reliably deploy and redeploy. -The most important step is to leave your data volumes intact. **Don't** run `docker-compose down -v`. +The most important step is to leave your data volumes intact. **Don't** run `docker compose down -v`. diff --git a/_upgrade-to/index.md b/_upgrade-to/index.md index 0eea3d6209c..696be88c212 100644 --- a/_upgrade-to/index.md +++ b/_upgrade-to/index.md @@ -1,6 +1,6 @@ --- layout: default -title: About the migration process +title: Upgrading OpenSearch nav_order: 1 nav_exclude: true permalink: /upgrade-to/ @@ -8,15 +8,14 @@ redirect_from: - /upgrade-to/index/ --- -# About the migration process +# Upgrading OpenSearch -The process of migrating from Elasticsearch OSS to OpenSearch varies depending on your current version of Elasticsearch OSS, installation type, tolerance for downtime, and cost-sensitivity. Rather than concrete steps to cover every situation, we have general guidance for the process. +The process of upgrading your OpenSearch version varies depending on your current version of OpenSearch, installation type, tolerance for downtime, and cost-sensitivity. For migrating to OpenSearch, we provide a [Migration Assistant]({{site.url}}{{site.baseurl}}/migration-assistant/). -Three approaches exist: +Two upgrade approaches exists: -- Use a snapshot to [migrate your Elasticsearch OSS data]({{site.url}}{{site.baseurl}}/upgrade-to/snapshot-migrate/) to a new OpenSearch cluster. This method may incur downtime. -- Perform a [restart upgrade or a rolling upgrade]({{site.url}}{{site.baseurl}}/upgrade-to/upgrade-to/) on your existing nodes. A restart upgrade involves upgrading the entire cluster and restarting it, whereas a rolling upgrade requires upgrading and restarting nodes in the cluster one by one. -- Replace existing Elasticsearch OSS nodes with new OpenSearch nodes. Node replacement is most popular when upgrading [Docker clusters]({{site.url}}{{site.baseurl}}/upgrade-to/docker-upgrade-to/). +- Perform a [restart upgrade or a rolling upgrade]({{site.url}}{{site.baseurl}}/upgrade-to/snapshot-migrate/) on your existing nodes. A restart upgrade involves upgrading the entire cluster and restarting it, whereas a rolling upgrade requires upgrading and restarting nodes in the cluster one by one. +- Replace existing OpenSearch nodes with new OpenSearch nodes. Node replacement is most popular when upgrading [Docker clusters]({{site.url}}{{site.baseurl}}/upgrade-to/docker-upgrade-to/). Regardless of your approach, to safeguard against data loss, we recommend that you take a [snapshot]({{site.url}}{{site.baseurl}}/opensearch/snapshots/snapshot-restore) of all indexes prior to any migration. 
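+
+For example, assuming that you have already registered a snapshot repository named `my-repo` (replace this placeholder with your repository name), the following request takes a snapshot of all indexes before you begin:
+
+```json
+PUT /_snapshot/my-repo/pre-upgrade-snapshot?wait_for_completion=true
+{
+  "indices": "*",
+  "include_global_state": true
+}
+```
+{% include copy-curl.html %}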
diff --git a/_upgrade-to/upgrade-to.md b/_upgrade-to/upgrade-to.md index 340055b214a..00950687a58 100644 --- a/_upgrade-to/upgrade-to.md +++ b/_upgrade-to/upgrade-to.md @@ -6,6 +6,10 @@ nav_order: 15 # Migrating from Elasticsearch OSS to OpenSearch + +OpenSearch provides a [Migration Assistant]({{site.url}}{{site.baseurl}}/migration-assistant/) to assist you in migrating from other search solutions. +{: .warning} + If you want to migrate from an existing Elasticsearch OSS cluster to OpenSearch and find the [snapshot approach]({{site.url}}{{site.baseurl}}/upgrade-to/snapshot-migrate/) unappealing, you can migrate your existing nodes from Elasticsearch OSS to OpenSearch. If your existing cluster runs an older version of Elasticsearch OSS, the first step is to upgrade to version 6.x or 7.x. diff --git a/_vector-search/ai-search/building-flows.md b/_vector-search/ai-search/building-flows.md new file mode 100644 index 00000000000..8a840ffd4ad --- /dev/null +++ b/_vector-search/ai-search/building-flows.md @@ -0,0 +1,443 @@ +--- +layout: default +title: Configuring AI search types +parent: Building AI search workflows in OpenSearch Dashboards +grand_parent: AI search +nav_order: 10 +--- + +# Configuring AI search types + +This page provides example configurations for different AI search workflow types. Each example shows how to tailor the setup to a specific use case, such as semantic search or hybrid retrieval. To build a workflow from start to finish, follow the steps in [Building AI search workflows in OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/vector-search/ai-search/workflow-builder/), applying your use case configuration to the appropriate parts of the setup. + +## Prerequisite: Provision ML resources + +Before you start, select and provision the necessary machine learning (ML) resources, depending on your use case. For example, to implement semantic search, you must configure a text embedding model in your OpenSearch cluster. For more information about deploying ML models locally or connecting to externally hosted models, see [Integrating ML models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/integrating-ml-models/). + +<details markdown="block"> + <summary> + Table of contents + </summary> + {: .text-delta } +1. TOC +{:toc} +</details> + +<p id="implementation-examples"></p> + +## Semantic search + +This example demonstrates how to configure semantic search. + +### ML resources + +Create and deploy an [Amazon Titan Text Embedding model on Amazon Bedrock](https://github.com/opensearch-project/dashboards-flow-framework/blob/main/documentation/models.md#amazon-bedrock-titan-text-embedding). + +### Index + +Ensure that the index settings include `index.knn: true` and that your index contains a `knn_vector` field specified in the mappings, as follows: + +```json +{ + "settings": { + "index": { + "knn": true + } + }, + "mappings": { + "properties": { + "<embedding_field_name>": { + "type": "knn_vector", + "dimension": "<embedding_size>" + } + } + } +} +``` + +{% include copy.html %} + +### Ingest pipeline + +Configure a single ML inference processor. Map your input text to the `inputText` model input field. Optionally, map the output `embedding` to a new document field. + +### Search pipeline + +Configure a single ML inference search request processor. Map the query field containing the input text to the `inputText` model input field. Optionally, map the output `embedding` to a new field. 
Override the query to include a `knn` query, for example: + +```json +{ + "_source": { + "excludes": [ + "<embedding_field>" + ] + }, + "query": { + "knn": { + "<embedding_field>": { + "vector": ${embedding}, + "k": 10 + } + } + } +} +``` + +{% include copy.html %} + +--- + +## Hybrid search + +Hybrid search combines keyword and vector search. This example demonstrates how to configure hybrid search. + +### ML resources + +Create and deploy an [Amazon Titan Text Embedding model on Amazon Bedrock](https://github.com/opensearch-project/dashboards-flow-framework/blob/main/documentation/models.md#amazon-bedrock-titan-text-embedding). + +### Index + +Ensure that the index settings include `index.knn: true` and that your index contains a `knn_vector` field specified in the mappings, as follows: + +```json +{ + "settings": { + "index": { + "knn": true + } + }, + "mappings": { + "properties": { + "<embedding_field_name>": { + "type": "knn_vector", + "dimension": "<embedding_size>" + } + } + } +} +``` + +{% include copy.html %} + +### Ingest pipeline + +Configure a single ML inference processor. Map your input text to the `inputText` model input field. Optionally, map the output `embedding` to a new document field. + +### Search pipeline + +Configure an ML inference search request processor and a normalization processor. + +**For the ML inference processor**, map the query field containing the input text to the `inputText` model input field. Optionally, map the output `embedding` to a new field. Override the query so that it contains a `hybrid` query. Make sure to specify the `embedding_field`, `text_field`, and `text_field_input`: + +```json +{ + "_source": { + "excludes": [ + "<embedding_field>" + ] + }, + "query": { + "hybrid": { + "queries": [ + { + "match": { + "<text_field>": { + "query": "<text_field_input>" + } + } + }, + { + "knn": { + "<embedding_field>": { + "vector": ${embedding}, + "k": 10 + } + } + } + ] + } + } +} +``` + +{% include copy.html %} + +**For the normalization processor**, configure weights for each subquery. For more information, see the [hybrid search normalization processor example]({{site.url}}{{site.baseurl}}/search-plugins/hybrid-search/#step-3-configure-a-search-pipeline). + +--- + +## Basic RAG (document summarization) + +This example demonstrates how to configure basic retrieval-augmented generation (RAG). + +The following example shows a simplified connector blueprint for the [Claude v1 messages API](https://docs.anthropic.com/en/api/messages). While connector blueprints and model interfaces may evolve over time, this example demonstrates how to abstract complex API interactions into a single `prompt` field input. + +A sample input might appear as follows, with placeholders representing dynamically fetched results: + +```json +{ + "prompt": "Human: You are a professional data analyst. You are given a list of document results. You will analyze the data and generate a human-readable summary of the results. If you don't know the answer, just say I don't know.\n\n Results: ${parameters.results.toString()}\n\n Human: Please summarize the results.\n\n Assistant:" +} +``` + +### ML resources + +Create and deploy an [Anthropic Claude 3 Sonnet model on Amazon Bedrock](https://github.com/opensearch-project/dashboards-flow-framework/blob/main/documentation/models.md#claude-3-sonnet-hosted-on-amazon-bedrock). + +### Search pipeline + +Configure an ML inference search response processor using the following steps: + +1. 
Select **Template** as the transformation type for the `prompt` input field. +2. Open the template configuration by selecting **Configure**. +3. Choose a preset template to simplify setup. +4. Create an input variable that extracts the list of reviews (for example, `review`). +5. Inject the variable into the prompt by copying and pasting it into the template. +6. Select **Run preview** to verify that the transformed prompt correctly incorporates sample dynamic data. +7. Select **Save** to apply the changes and exit. + +--- + +## Multimodal search + +Multimodal search searches by text and image. This example demonstrates how to configure multimodal search. + +### ML resources + +Create and deploy an [Amazon Titan Multimodal Embedding model on Amazon Bedrock](https://github.com/opensearch-project/dashboards-flow-framework/blob/main/documentation/models.md#amazon-bedrock-titan-multimodal-embedding). + +### Index + +Ensure that the index settings include `index.knn: true` and that your index contains a `knn_vector` field (to persist generated embeddings) and a `binary` field (to persist the image binary) specified in the mappings, as follows: + +```json +{ + "settings": { + "index": { + "knn": true + } + }, + "mappings": { + "properties": { + "image_base64": { + "type": "binary" + }, + "image_embedding": { + "type": "knn_vector", + "dimension": <dimension> + } + } + } +} +``` + +{% include copy.html %} + +### Ingest pipeline + +Configure a single ML inference processor. Map your input text field and input image field to the `inputText` and `inputImage` model input fields, respectively. If both text and image inputs are needed, ensure that both are mapped. Alternatively, you can map only one input (either text or image) if a single input is sufficient for embedding generation. + +Optionally, map the output `embedding` to a new document field. + +### Search pipeline + +Configure a single ML inference search request processor. Map the input text field and input image field in the query to the `inputText` and `inputImage` model input fields, respectively. If both text and image inputs are needed, ensure that both are mapped. Alternatively, you can map only one input (either text or image) if a single input is sufficient for embedding generation. + +Override the query so that it contains a `knn` query, including the embedding output: + +```json +{ + "_source": { + "excludes": [ + "<embedding_field>" + ] + }, + "query": { + "knn": { + "<embedding_field>": { + "vector": ${embedding}, + "k": 10 + } + } + } +} +``` + +{% include copy.html %} + +--- + +## Named entity recognition + +This example demonstrates how to configure named entity recognition (NER). + +### ML resources + +Create and deploy an [Amazon Comprehend Entity Detection model](https://github.com/opensearch-project/dashboards-flow-framework/blob/main/documentation/models.md#amazon-comprehend---entity-detection). + +### Ingest pipeline + +Configure a single ML inference processor. Map your input text field to the `text` model input field. To persist any identified entities with each document, transform the output (an array of entities) and store them in the `entities_found` field. Use the following `output_map` configuration as a reference: + +```json +"output_map": [ + { + "entities_found": "$.response.Entities[*].Type" + } +], +``` + +{% include copy.html %} + +This configuration maps the extracted entities to the `entities_found` field, ensuring that they are stored alongside each document. 
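+
+For reference, an ingest pipeline generated from this configuration might look similar to the following sketch. The pipeline name, model ID, and source text field are placeholders; the exact configuration produced by the workflow depends on your deployed connector:
+
+```json
+PUT _ingest/pipeline/my_ner_pipeline
+{
+  "processors": [
+    {
+      "ml_inference": {
+        "model_id": "your_comprehend_entity_model_id",
+        "input_map": [
+          {
+            "text": "your_text_field"
+          }
+        ],
+        "output_map": [
+          {
+            "entities_found": "$.response.Entities[*].Type"
+          }
+        ]
+      }
+    }
+  ]
+}
+```
+{% include copy.html %}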
+
+---
+
+## Language detection and classification
+
+The following example demonstrates how to configure language detection and classification.
+
+### ML resources
+
+Create and deploy an [Amazon Comprehend Language Detection model](https://github.com/opensearch-project/dashboards-flow-framework/blob/main/documentation/models.md#amazon-comprehend---language-detection).
+
+### Ingest pipeline
+
+Configure a single ML inference processor. Map your input text field to the `text` model input field. To store the most relevant or most likely language detected for each document, transform the output (an array of languages) and persist it in the `detected_dominant_language` field. Use the following `output_map` configuration as a reference:
+
+```json
+"output_map": [
+  {
+    "detected_dominant_language": "response.Languages[0].LanguageCode"
+  }
+],
+```
+
+{% include copy.html %}
+
+---
+
+## Reranking results
+
+Reranking can be implemented in various ways, depending on the capabilities of the model used. Typically, models require at least two inputs: the original query and the data to be assigned a relevance score. Some models support batching, allowing multiple results to be processed in a single inference call, while others require scoring each result individually.
+
+In OpenSearch, this leads to two common reranking patterns:
+
+1. **Batching enabled**
+
+   1. Collect all search results.
+   1. Pass the batched results to a single ML processor for scoring.
+   1. Return the top **n** ranked results.
+
+2. **Batching disabled**
+   1. Collect all search results.
+   1. Pass each result to the ML processor to assign a new relevance score.
+   1. Send all results with updated scores to the rerank processor for sorting.
+   1. Return the top **n** ranked results.
+
+The following example demonstrates **Pattern 2 (batching disabled)** to highlight the rerank processor. However, note that the **Cohere Rerank** model used in this example **does support batching**, so you could also implement **Pattern 1** with this model.
+
+### ML resources
+
+Create and deploy a [Cohere Rerank model](https://github.com/opensearch-project/dashboards-flow-framework/blob/main/documentation/models.md#cohere-rerank).
+
+### Search pipeline
+
+Configure an ML inference **search response** processor, followed by a rerank **search response** processor. For reranking with batching disabled, use the ML processor to generate new relevance scores for the retrieved results and then apply the reranker to sort them accordingly.
+
+Use the following ML processor configuration:
+
+1. Map the document field containing the data to be used for comparison to the model's `documents` field.
+2. Map the original query to the model's `query` field.
+3. Use JSONPath to access the query JSON, prefixed with `_request.query`.
+
+Use the following `input_map` configuration as a reference:
+
+```json
+"input_map": [
+  {
+    "documents": "description",
+    "query": "$._request.query.term.value"
+  }
+],
+```
+
+{% include copy.html %}
+
+Optionally, you can store the rescored result from the model output in a new document field. To extract and persist only the relevance score, use the following `output_map` configuration as a reference:
+
+```json
+"output_map": [
+  {
+    "new_score": "results[0].relevance_score"
+  }
+],
+```
+
+{% include copy.html %}
+
+For the rerank processor configuration, set **target_field** to the model score field (in this example, `new_score`).
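+
+Combined, the resulting search pipeline might look similar to the following sketch. The pipeline name and model ID are placeholders, and the exact processor options can vary by OpenSearch version:
+
+```json
+PUT /_search/pipeline/my_rerank_pipeline
+{
+  "response_processors": [
+    {
+      "ml_inference": {
+        "model_id": "your_rerank_model_id",
+        "input_map": [
+          {
+            "documents": "description",
+            "query": "$._request.query.term.value"
+          }
+        ],
+        "output_map": [
+          {
+            "new_score": "results[0].relevance_score"
+          }
+        ],
+        "one_to_one": true
+      }
+    },
+    {
+      "rerank": {
+        "by_field": {
+          "target_field": "new_score"
+        }
+      }
+    }
+  ]
+}
+```
+{% include copy.html %}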
+ +--- + +## Multimodal search (text or image) with a custom CLIP model + +The following example uses a custom CLIP model hosted on Amazon SageMaker. The model dynamically ingests a text or image URL as input and returns a vector embedding. + +### ML resources + +Create and deploy a [Custom CLIP Multimodal model](https://github.com/opensearch-project/dashboards-flow-framework/blob/main/documentation/models.md#custom-clip-multimodal-embedding). + +### Index + +Ensure that the index settings include `index.knn: true` and that your index contains a `knn_vector` field specified in the mappings, as follows: + +```json +{ + "settings": { + "index": { + "knn": true + } + }, + "mappings": { + "properties": { + "<embedding_field_name>": { + "type": "knn_vector", + "dimension": "<embedding_size>" + } + } + } +} +``` + +{% include copy.html %} + +### Ingest pipeline + +Configure a single ML inference processor. Map your image field to the `image_url` model input field or your text field to the `text` model input field, depending on what type of data you are ingesting and persisting in your index. For example, if building an application that returns relevant images based on text or image input, you will need to persist images and should map the image field to the `image_url` field. + +### Search pipeline + +Configure a single ML inference search request processor. Map the input image field or the input text field in the query to the `image_url` or `text` model input fields, respectively. The CLIP model flexibly handles one or the other, so choose the option that best suits your use case. + +Override the query so that it contains a `knn` query, including the embedding output: + +```json +{ + "_source": { + "excludes": [ + "<embedding_field>" + ] + }, + "query": { + "knn": { + "<embedding_field>": { + "vector": ${embedding}, + "k": 10 + } + } + } +} +``` + +{% include copy.html %} diff --git a/_search-plugins/conversational-search.md b/_vector-search/ai-search/conversational-search.md similarity index 88% rename from _search-plugins/conversational-search.md rename to _vector-search/ai-search/conversational-search.md index be4c97b425d..f5efc0d321a 100644 --- a/_search-plugins/conversational-search.md +++ b/_vector-search/ai-search/conversational-search.md @@ -1,20 +1,22 @@ --- layout: default -title: Conversational search +title: Conversational search with RAG +parent: AI search has_children: false nav_order: 70 redirect_from: - /ml-commons-plugin/conversational-search/ + - /search-plugins/conversational-search/ --- -# Conversational search +# Conversational search with RAG Conversational search allows you to ask questions in natural language and refine the answers by asking follow-up questions. Thus, the conversation becomes a dialog between you and a large language model (LLM). For this to happen, instead of answering each question individually, the model needs to remember the context of the entire conversation. Conversational search is implemented with the following components: - [Conversation history](#conversation-history): Allows an LLM to remember the context of the current conversation and understand follow-up questions. -- [Retrieval-Augmented Generation (RAG)](#rag): Allows an LLM to supplement its static knowledge base with proprietary or current information. +- [Retrieval-augmented generation (RAG)](#rag): Allows an LLM to supplement its static knowledge base with proprietary or current information. 
## Conversation history @@ -45,18 +47,60 @@ PUT /_cluster/settings ``` {% include copy-curl.html %} -## Using conversational search +## Configuring conversational search -To use conversational search, follow these steps: +There are two ways to configure conversational search: -1. [Create a connector to a model](#step-1-create-a-connector-to-a-model). -1. [Register and deploy the model](#step-2-register-and-deploy-the-model) +- [**Automated workflow**](#automated-workflow) (Recommended for quick setup): Automatically create an ingest pipeline and index with minimal configuration. +- [**Manual setup**](#manual-setup) (Recommended for custom configurations): Manually configure each component for greater flexibility and control. + +## Automated workflow + +OpenSearch provides a [workflow template]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-templates#conversational-search-using-an-llm) that automatically creates a connector for the LLM, registers and deploys the LLM, and configures a search pipeline. You must provide the API key for the configured LLM when creating a workflow. Review the conversational search workflow template [defaults](https://github.com/opensearch-project/flow-framework/blob/main/src/main/resources/defaults/conversational-search-defaults.json) to determine whether you need to update any of the parameters. For example, if the model endpoint is different from the default (`https://api.cohere.ai/v1/chat`), specify the endpoint of your model in the `create_connector.actions.url` parameter. To create the default conversational search workflow, send the following request: + +```json +POST /_plugins/_flow_framework/workflow?use_case=conversational_search_with_llm_deploy&provision=true +{ +"create_connector.credential.key": "<YOUR_API_KEY>" +} +``` +{% include copy-curl.html %} + +OpenSearch responds with a workflow ID for the created workflow: + +```json +{ + "workflow_id" : "U_nMXJUBq_4FYQzMOS4B" +} +``` + +To check the workflow status, send the following request: + +```json +GET /_plugins/_flow_framework/workflow/U_nMXJUBq_4FYQzMOS4B/_status +``` +{% include copy-curl.html %} + +Once the workflow completes, the `state` changes to `COMPLETED`. The workflow creates the following components: + +- A model connector: Connects to the specified model. +- A registered and deployed model: The model is ready for inference. +- A search pipeline: Configured to handle conversational queries. + +You can now continue with [steps 4, 5, and 6](#step-4-ingest-rag-data-into-an-index) to ingest RAG data into the index, create a conversation memory, and use the pipeline for RAG. + +## Manual setup + +To manually configure conversational search, follow these steps: + +1. [Create a connector for a model](#step-1-create-a-connector-for-a-model). +1. [Register and deploy the model](#step-2-register-and-deploy-the-model). 1. [Create a search pipeline](#step-3-create-a-search-pipeline). 1. [Ingest RAG data into an index](#step-4-ingest-rag-data-into-an-index). 1. [Create a conversation memory](#step-5-create-a-conversation-memory). 1. [Use the pipeline for RAG](#step-6-use-the-pipeline-for-rag). -### Step 1: Create a connector to a model +### Step 1: Create a connector for a model RAG requires an LLM in order to function. To connect to an LLM, create a [connector]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). 
The following request creates a connector for the OpenAI GPT 3.5 model: @@ -235,8 +279,6 @@ As of OpenSearch 2.12, the RAG technique has only been tested with OpenAI models Configuring the Cohere Command model to enable RAG requires using a post-processing function to transform the model output. For more information, see the [Cohere RAG Tutorial](https://github.com/opensearch-project/ml-commons/blob/2.x/docs/tutorials/conversational_search/conversational_search_with_Cohere_Command.md). -### Enabling RAG - ### Step 5: Create a conversation memory You'll need to create a conversation memory that will store all messages from a conversation. To make the memory easily identifiable, provide a name for the memory in the optional `name` field, as shown in the following example. Because the `name` parameter is not updatable, this is your only opportunity to name your conversation. @@ -441,7 +483,4 @@ The response contains both messages: ## Next steps -- To learn more about connecting to models on external platforms, see [Connectors]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/connectors/). -- For supported APIs, see [Memory APIs]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/memory-apis/index/). -- To learn more about search pipelines and processors, see [Search pipelines]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/). -- For available OpenSearch queries, see [Query DSL]({{site.url}}{{site.baseurl}}/query-dsl/). \ No newline at end of file +- Explore our [tutorials]({{site.url}}{{site.baseurl}}/vector-search/tutorials/) to learn how to build AI search applications. \ No newline at end of file diff --git a/_vector-search/ai-search/hybrid-search/aggregations.md b/_vector-search/ai-search/hybrid-search/aggregations.md new file mode 100644 index 00000000000..003e241e62c --- /dev/null +++ b/_vector-search/ai-search/hybrid-search/aggregations.md @@ -0,0 +1,203 @@ +--- +layout: default +title: Combining hybrid search and aggregations +parent: Hybrid search +grand_parent: AI search +has_children: false +nav_order: 50 +--- + +# Combining hybrid search and aggregations +**Introduced 2.13** +{: .label .label-purple } + +You can enhance search results by combining a hybrid query clause with any aggregation that OpenSearch supports. Aggregations allow you to use OpenSearch as an analytics engine. For more information about aggregations, see [Aggregations]({{site.url}}{{site.baseurl}}/aggregations/). + +Most aggregations are performed on the subset of documents that is returned by a hybrid query. The only aggregation that operates on all documents is the [`global`]({{site.url}}{{site.baseurl}}/aggregations/bucket/global/) aggregation. + +To use aggregations with a hybrid query, first create an index. Aggregations are typically used on fields of special types, like `keyword` or `integer`. 
The following example creates an index with several such fields: + +```json +PUT /my-nlp-index +{ + "settings": { + "number_of_shards": 2 + }, + "mappings": { + "properties": { + "doc_index": { + "type": "integer" + }, + "doc_keyword": { + "type": "keyword" + }, + "category": { + "type": "keyword" + } + } + } +} +``` +{% include copy-curl.html %} + +The following request ingests six documents into your new index: + +```json +POST /_bulk +{ "index": { "_index": "my-nlp-index" } } +{ "category": "permission", "doc_keyword": "workable", "doc_index": 4976, "doc_price": 100} +{ "index": { "_index": "my-nlp-index" } } +{ "category": "sister", "doc_keyword": "angry", "doc_index": 2231, "doc_price": 200 } +{ "index": { "_index": "my-nlp-index" } } +{ "category": "hair", "doc_keyword": "likeable", "doc_price": 25 } +{ "index": { "_index": "my-nlp-index" } } +{ "category": "editor", "doc_index": 9871, "doc_price": 30 } +{ "index": { "_index": "my-nlp-index" } } +{ "category": "statement", "doc_keyword": "entire", "doc_index": 8242, "doc_price": 350 } +{ "index": { "_index": "my-nlp-index" } } +{ "category": "statement", "doc_keyword": "idea", "doc_index": 5212, "doc_price": 200 } +{ "index": { "_index": "index-test" } } +{ "category": "editor", "doc_keyword": "bubble", "doc_index": 1298, "doc_price": 130 } +{ "index": { "_index": "index-test" } } +{ "category": "editor", "doc_keyword": "bubble", "doc_index": 521, "doc_price": 75 } +``` +{% include copy-curl.html %} + +Now you can combine a hybrid query clause with a `min` aggregation: + +```json +GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline +{ + "query": { + "hybrid": { + "queries": [ + { + "term": { + "category": "permission" + } + }, + { + "bool": { + "should": [ + { + "term": { + "category": "editor" + } + }, + { + "term": { + "category": "statement" + } + } + ] + } + } + ] + } + }, + "aggs": { + "total_price": { + "sum": { + "field": "doc_price" + } + }, + "keywords": { + "terms": { + "field": "doc_keyword", + "size": 10 + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains the matching documents and the aggregation results: + +```json +{ + "took": 9, + "timed_out": false, + "_shards": { + "total": 2, + "successful": 2, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4, + "relation": "eq" + }, + "max_score": 0.5, + "hits": [ + { + "_index": "my-nlp-index", + "_id": "mHRPNY4BlN82W_Ar9UMY", + "_score": 0.5, + "_source": { + "doc_price": 100, + "doc_index": 4976, + "doc_keyword": "workable", + "category": "permission" + } + }, + { + "_index": "my-nlp-index", + "_id": "m3RPNY4BlN82W_Ar9UMY", + "_score": 0.5, + "_source": { + "doc_price": 30, + "doc_index": 9871, + "category": "editor" + } + }, + { + "_index": "my-nlp-index", + "_id": "nXRPNY4BlN82W_Ar9UMY", + "_score": 0.5, + "_source": { + "doc_price": 200, + "doc_index": 5212, + "doc_keyword": "idea", + "category": "statement" + } + }, + { + "_index": "my-nlp-index", + "_id": "nHRPNY4BlN82W_Ar9UMY", + "_score": 0.5, + "_source": { + "doc_price": 350, + "doc_index": 8242, + "doc_keyword": "entire", + "category": "statement" + } + } + ] + }, + "aggregations": { + "total_price": { + "value": 680 + }, + "doc_keywords": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets": [ + { + "key": "entire", + "doc_count": 1 + }, + { + "key": "idea", + "doc_count": 1 + }, + { + "key": "workable", + "doc_count": 1 + } + ] + } + } +} +``` \ No newline at end of file diff --git a/_vector-search/ai-search/hybrid-search/collapse.md 
b/_vector-search/ai-search/hybrid-search/collapse.md
new file mode 100644
index 00000000000..e95df8fbd1a
--- /dev/null
+++ b/_vector-search/ai-search/hybrid-search/collapse.md
@@ -0,0 +1,529 @@
+---
+layout: default
+title: Collapsing hybrid query results
+parent: Hybrid search
+grand_parent: AI search
+has_children: false
+nav_order: 35
+---
+
+# Collapsing hybrid query results
+**Introduced 3.1**
+{: .label .label-purple }
+
+The `collapse` parameter lets you group results by a field, returning only the highest-scoring document for each unique field value. This is useful when you want to avoid duplicates in your search results. The field you collapse on must be of type `keyword` or a numeric type. The number of results returned is still limited by the `size` parameter in your query.
+
+The `collapse` parameter is compatible with other hybrid query search options, such as sort, explain, and pagination, using their standard syntax.
+
+When using `collapse` in a hybrid query, note the following considerations:
+
+- Inner hits are not supported.
+- Performance may be impacted when working with large result sets.
+- Aggregations run on pre-collapsed results, not the final output.
+- Pagination behavior changes: Because `collapse` reduces the total number of results, it can affect how results are distributed across pages. To retrieve more results, consider increasing the pagination depth.
+- Results may differ from those returned by the [`collapse` response processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/collapse-processor/), which applies collapse logic after the query is executed.
+
+## Example
+
+The following example demonstrates how to collapse hybrid query results.
+
+Create an index:
+
+```json
+PUT /bakery-items
+{
+  "mappings": {
+    "properties": {
+      "item": {
+        "type": "keyword"
+      },
+      "category": {
+        "type": "keyword"
+      },
+      "price": {
+        "type": "float"
+      },
+      "baked_date": {
+        "type": "date"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+Ingest documents into the index:
+
+```json
+POST /bakery-items/_bulk
+{ "index": {} }
+{ "item": "Chocolate Cake", "category": "cakes", "price": 15, "baked_date": "2023-07-01T00:00:00Z" }
+{ "index": {} }
+{ "item": "Chocolate Cake", "category": "cakes", "price": 18, "baked_date": "2023-07-04T00:00:00Z" }
+{ "index": {} }
+{ "item": "Vanilla Cake", "category": "cakes", "price": 12, "baked_date": "2023-07-02T00:00:00Z" }
+{ "index": {} }
+{ "item": "Vanilla Cake", "category": "cakes", "price": 16, "baked_date": "2023-07-03T00:00:00Z" }
+{ "index": {} }
+{ "item": "Vanilla Cake", "category": "cakes", "price": 17, "baked_date": "2023-07-09T00:00:00Z" }
+```
+{% include copy-curl.html %}
+
+Create a search pipeline.
This example uses the `min_max` normalization technique: + +```json +PUT /_search/pipeline/norm-pipeline +{ + "description": "Normalization processor for hybrid search", + "phase_results_processors": [ + { + "normalization-processor": { + "normalization": { + "technique": "min_max" + }, + "combination": { + "technique": "arithmetic_mean" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +Search the index, grouping the search results by the `item` field: + +```json +GET /bakery-items/_search?search_pipeline=norm-pipeline +{ + "query": { + "hybrid": { + "queries": [ + { + "match": { + "item": "Chocolate Cake" + } + }, + { + "bool": { + "must": { + "match": { + "category": "cakes" + } + } + } + } + ] + } + }, + "collapse": { + "field": "item" + } +} +``` +{% include copy-curl.html %} + +The response returns the collapsed search results: + +```json +"hits": { + "total": { + "value": 5, + "relation": "eq" + }, + "max_score": 1.0, + "hits": [ + { + "_index": "bakery-items", + "_id": "wBRPZZcB49c_2-1rYmO7", + "_score": 1.0, + "_source": { + "item": "Chocolate Cake", + "category": "cakes", + "price": 15, + "baked_date": "2023-07-01T00:00:00Z" + }, + "fields": { + "item": [ + "Chocolate Cake" + ] + } + }, + { + "_index": "bakery-items", + "_id": "whRPZZcB49c_2-1rYmO7", + "_score": 0.5005, + "_source": { + "item": "Vanilla Cake", + "category": "cakes", + "price": 12, + "baked_date": "2023-07-02T00:00:00Z" + }, + "fields": { + "item": [ + "Vanilla Cake" + ] + } + } + ] + } +``` + +## Collapse and sort results + +To collapse and sort hybrid query results, provide the `collapse` and `sort` parameters in the query: + +```json +GET /bakery-items/_search?search_pipeline=norm-pipeline +{ + "query": { + "hybrid": { + "queries": [ + { + "match": { + "item": "Chocolate Cake" + } + }, + { + "bool": { + "must": { + "match": { + "category": "cakes" + } + } + } + } + ] + } + }, + "collapse": { + "field": "item" + }, + "sort": "price" +} +``` +{% include copy-curl.html %} + +For more information about sorting in a hybrid query, see [Using sorting with a hybrid query]({{site.url}}{{site.baseurl}}/vector-search/ai-search/hybrid-search/sorting/). 
+ +In the response, documents are sorted by the lowest price: + +```json +"hits": { + "total": { + "value": 5, + "relation": "eq" + }, + "max_score": null, + "hits": [ + { + "_index": "bakery-items", + "_id": "whRPZZcB49c_2-1rYmO7", + "_score": null, + "_source": { + "item": "Vanilla Cake", + "category": "cakes", + "price": 12, + "baked_date": "2023-07-02T00:00:00Z" + }, + "fields": { + "item": [ + "Vanilla Cake" + ] + }, + "sort": [ + 12.0 + ] + }, + { + "_index": "bakery-items", + "_id": "wBRPZZcB49c_2-1rYmO7", + "_score": null, + "_source": { + "item": "Chocolate Cake", + "category": "cakes", + "price": 15, + "baked_date": "2023-07-01T00:00:00Z" + }, + "fields": { + "item": [ + "Chocolate Cake" + ] + }, + "sort": [ + 15.0 + ] + } + ] + } +``` + +## Collapse and explain + +You can provide the `explain` query parameter when collapsing search results: + +```json +GET /bakery-items/_search?search_pipeline=norm-pipeline&explain=true +{ + "query": { + "hybrid": { + "queries": [ + { + "match": { + "item": "Chocolate Cake" + } + }, + { + "bool": { + "must": { + "match": { + "category": "cakes" + } + } + } + } + ] + } + }, + "collapse": { + "field": "item" + } +} +``` +{% include copy-curl.html %} + +The response contains detailed information about the scoring process for each search result: + +```json +"hits": { + "total": { + "value": 5, + "relation": "eq" + }, + "max_score": 1.0, + "hits": [ + { + "_shard": "[bakery-items][0]", + "_node": "Jlu8P9EaQCy3C1BxaFMa_g", + "_index": "bakery-items", + "_id": "3ZILepcBheX09_dPt8TD", + "_score": 1.0, + "_source": { + "item": "Chocolate Cake", + "category": "cakes", + "price": 15, + "baked_date": "2023-07-01T00:00:00Z" + }, + "fields": { + "item": [ + "Chocolate Cake" + ] + }, + "_explanation": { + "value": 1.0, + "description": "combined score of:", + "details": [ + { + "value": 1.0, + "description": "ConstantScore(item:Chocolate Cake)", + "details": [] + }, + { + "value": 1.0, + "description": "ConstantScore(category:cakes)", + "details": [] + } + ] + } + }, + { + "_shard": "[bakery-items][0]", + "_node": "Jlu8P9EaQCy3C1BxaFMa_g", + "_index": "bakery-items", + "_id": "35ILepcBheX09_dPt8TD", + "_score": 0.5005, + "_source": { + "item": "Vanilla Cake", + "category": "cakes", + "price": 12, + "baked_date": "2023-07-02T00:00:00Z" + }, + "fields": { + "item": [ + "Vanilla Cake" + ] + }, + "_explanation": { + "value": 1.0, + "description": "combined score of:", + "details": [ + { + "value": 0.0, + "description": "ConstantScore(item:Chocolate Cake) doesn't match id 2", + "details": [] + }, + { + "value": 1.0, + "description": "ConstantScore(category:cakes)", + "details": [] + } + ] + } + } + ] + } +``` + +For more information about using `explain` in a hybrid query, see [Hybrid search explain]({{site.url}}{{site.baseurl}}/vector-search/ai-search/hybrid-search/explain/). + +## Collapse and pagination + +You can paginate collapsed results by providing the `from` and `size` parameters. For more information about pagination in a hybrid query, see [Paginating hybrid query results]({{site.url}}{{site.baseurl}}/vector-search/ai-search/hybrid-search/pagination/). For more information about `from` and `size`, see [The `from` and `size` parameters]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/paginate/#the-from-and-size-parameters). 
+
+For this example, create the following index:
+
+```json
+PUT /bakery-items-pagination
+{
+  "settings": {
+    "index.number_of_shards": 3
+  },
+  "mappings": {
+    "properties": {
+      "item": {
+        "type": "keyword"
+      },
+      "category": {
+        "type": "keyword"
+      },
+      "price": {
+        "type": "float"
+      },
+      "baked_date": {
+        "type": "date"
+      }
+    }
+  }
+}
```
+{% include copy-curl.html %}
+
+Ingest the following documents into the index:
+
+```json
+POST /bakery-items-pagination/_bulk
+{ "index": {} }
+{ "item": "Chocolate Cake", "category": "cakes", "price": 15, "baked_date": "2023-07-01T00:00:00Z" }
+{ "index": {} }
+{ "item": "Chocolate Cake", "category": "cakes", "price": 18, "baked_date": "2023-07-02T00:00:00Z" }
+{ "index": {} }
+{ "item": "Vanilla Cake", "category": "cakes", "price": 12, "baked_date": "2023-07-02T00:00:00Z" }
+{ "index": {} }
+{ "item": "Vanilla Cake", "category": "cakes", "price": 11, "baked_date": "2023-07-04T00:00:00Z" }
+{ "index": {} }
+{ "item": "Ice Cream Cake", "category": "cakes", "price": 23, "baked_date": "2023-07-09T00:00:00Z" }
+{ "index": {} }
+{ "item": "Ice Cream Cake", "category": "cakes", "price": 22, "baked_date": "2023-07-10T00:00:00Z" }
+{ "index": {} }
+{ "item": "Carrot Cake", "category": "cakes", "price": 24, "baked_date": "2023-07-09T00:00:00Z" }
+{ "index": {} }
+{ "item": "Carrot Cake", "category": "cakes", "price": 26, "baked_date": "2023-07-21T00:00:00Z" }
+{ "index": {} }
+{ "item": "Red Velvet Cake", "category": "cakes", "price": 25, "baked_date": "2023-07-09T00:00:00Z" }
+{ "index": {} }
+{ "item": "Red Velvet Cake", "category": "cakes", "price": 29, "baked_date": "2023-07-30T00:00:00Z" }
+{ "index": {} }
+{ "item": "Cheesecake", "category": "cakes", "price": 27, "baked_date": "2023-07-09T00:00:00Z" }
+{ "index": {} }
+{ "item": "Cheesecake", "category": "cakes", "price": 34, "baked_date": "2023-07-21T00:00:00Z" }
+{ "index": {} }
+{ "item": "Coffee Cake", "category": "cakes", "price": 42, "baked_date": "2023-07-09T00:00:00Z" }
+{ "index": {} }
+{ "item": "Coffee Cake", "category": "cakes", "price": 41, "baked_date": "2023-07-05T00:00:00Z" }
+{ "index": {} }
+{ "item": "Coconut Cake", "category": "cakes", "price": 23, "baked_date": "2023-07-09T00:00:00Z" }
+{ "index": {} }
+{ "item": "Coconut Cake", "category": "cakes", "price": 32, "baked_date": "2023-07-12T00:00:00Z" }
+// Additional documents omitted for brevity
+```
+{% include copy-curl.html %}
+
+Run a `hybrid` query, specifying the `from` and `size` parameters to paginate results. In the following example, the query requests two results starting from the sixth position (`from: 5, size: 2`). The pagination depth is set to limit each shard to return a maximum of 10 documents.
After the results are retrieved, the `collapse` parameter is applied in order to group them by the `item` field: + +```json +GET /bakery-items-pagination/_search?search_pipeline=norm-pipeline +{ + "query": { + "hybrid": { + "pagination_depth": 10, + "queries": [ + { + "match": { + "item": "Chocolate Cake" + } + }, + { + "bool": { + "must": { + "match": { + "category": "cakes" + } + } + } + } + ] + } + }, + "from": 5, + "size": 2, + "collapse": { + "field": "item" + } +} +``` +{% include copy-curl.html %} + + + +```json +"hits": { + "total": { + "value": 70, + "relation": "eq" + }, + "max_score": 1.0, + "hits": [ + { + "_index": "bakery-items-pagination", + "_id": "gDayepcBIkxlgFKYda0p", + "_score": 0.5005, + "_source": { + "item": "Red Velvet Cake", + "category": "cakes", + "price": 29, + "baked_date": "2023-07-30T00:00:00Z" + }, + "fields": { + "item": [ + "Red Velvet Cake" + ] + } + }, + { + "_index": "bakery-items-pagination", + "_id": "aTayepcBIkxlgFKYca15", + "_score": 0.5005, + "_source": { + "item": "Vanilla Cake", + "category": "cakes", + "price": 12, + "baked_date": "2023-07-02T00:00:00Z" + }, + "fields": { + "item": [ + "Vanilla Cake" + ] + } + } + ] + } +``` diff --git a/_vector-search/ai-search/hybrid-search/explain.md b/_vector-search/ai-search/hybrid-search/explain.md new file mode 100644 index 00000000000..67072a4f969 --- /dev/null +++ b/_vector-search/ai-search/hybrid-search/explain.md @@ -0,0 +1,206 @@ +--- +layout: default +title: Hybrid search explain +parent: Hybrid search +grand_parent: AI search +has_children: false +nav_order: 70 +--- + +# Hybrid search explain +**Introduced 2.19** +{: .label .label-purple } + +You can provide the `explain` parameter to understand how scores are calculated, normalized, and combined in hybrid queries. When enabled, it provides detailed information about the scoring process for each search result. This includes revealing the score normalization techniques used, how different scores were combined, and the calculations for individual subquery scores. This comprehensive insight makes it easier to understand and optimize your hybrid query results. For more information about `explain`, see [Explain API]({{site.url}}{{site.baseurl}}/api-reference/explain/). + +`explain` is an expensive operation in terms of both resources and time. For production clusters, we recommend using it sparingly for the purpose of troubleshooting. +{: .warning } + +You can provide the `explain` parameter in a URL when running a complete hybrid query using the following syntax: + +```json +GET <index>/_search?search_pipeline=<search_pipeline>&explain=true +POST <index>/_search?search_pipeline=<search_pipeline>&explain=true +``` + +To use the `explain` parameter, you must configure the `hybrid_score_explanation` response processor in your search pipeline. For more information, see [Hybrid score explanation processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/explanation-processor/). + +You can also use `explain` with the individual document ID: + +```json +GET <index>/_explain/<id> +POST <index>/_explain/<id> +``` + +In this case, the result will contain only low-level scoring information, for example, [Okapi BM25](https://en.wikipedia.org/wiki/Okapi_BM25) scores for text-based queries such as `term` or `match`. For an example response, see [Explain API example response]({{site.url}}{{site.baseurl}}/api-reference/explain/#example-response). 
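+
+As noted above, running `explain` on a hybrid query requires the `hybrid_score_explanation` response processor in your search pipeline. The following is a minimal pipeline sketch, assuming the `min_max`/`arithmetic_mean` configuration used throughout these examples:
+
+```json
+PUT /_search/pipeline/my_pipeline
+{
+  "description": "Post processor for hybrid search",
+  "phase_results_processors": [
+    {
+      "normalization-processor": {
+        "normalization": {
+          "technique": "min_max"
+        },
+        "combination": {
+          "technique": "arithmetic_mean"
+        }
+      }
+    }
+  ],
+  "response_processors": [
+    {
+      "hybrid_score_explanation": {}
+    }
+  ]
+}
+```
+{% include copy-curl.html %}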
+ +To see the `explain` output for all results, set the parameter to `true` either in the URL or in the request body: + +```json +POST my-nlp-index/_search?search_pipeline=my_pipeline&explain=true +{ + "_source": { + "exclude": [ + "passage_embedding" + ] + }, + "query": { + "hybrid": { + "queries": [ + { + "match": { + "text": { + "query": "horse" + } + } + }, + { + "neural": { + "passage_embedding": { + "query_text": "wild west", + "model_id": "aVeif4oB5Vm0Tdw8zYO2", + "k": 5 + } + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +The response contains scoring information: + +<details markdown="block"> + <summary> + Response + </summary> + {: .text-delta} + +```json +{ + "took": 54, + "timed_out": false, + "_shards": { + "total": 2, + "successful": 2, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 5, + "relation": "eq" + }, + "max_score": 0.9251075, + "hits": [ + { + "_shard": "[my-nlp-index][0]", + "_node": "IsuzeVYdSqKUfy0qfqil2w", + "_index": "my-nlp-index", + "_id": "5", + "_score": 0.9251075, + "_source": { + "text": "A rodeo cowboy , wearing a cowboy hat , is being thrown off of a wild white horse .", + "id": "2691147709.jpg" + }, + "_explanation": { + "value": 0.9251075, + "description": "arithmetic_mean combination of:", + "details": [ + { + "value": 1.0, + "description": "min_max normalization of:", + "details": [ + { + "value": 1.2336599, + "description": "weight(text:horse in 0) [PerFieldSimilarity], result of:", + "details": [ + { + "value": 1.2336599, + "description": "score(freq=1.0), computed as boost * idf * tf from:", + "details": [ + { + "value": 2.2, + "description": "boost", + "details": [] + }, + { + "value": 1.2039728, + "description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:", + "details": [ + { + "value": 1, + "description": "n, number of documents containing term", + "details": [] + }, + { + "value": 4, + "description": "N, total number of documents with field", + "details": [] + } + ] + }, + { + "value": 0.46575344, + "description": "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:", + "details": [ + { + "value": 1.0, + "description": "freq, occurrences of term within document", + "details": [] + }, + { + "value": 1.2, + "description": "k1, term saturation parameter", + "details": [] + }, + { + "value": 0.75, + "description": "b, length normalization parameter", + "details": [] + }, + { + "value": 16.0, + "description": "dl, length of field", + "details": [] + }, + { + "value": 17.0, + "description": "avgdl, average length of field", + "details": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "value": 0.8503647, + "description": "min_max normalization of:", + "details": [ + { + "value": 0.015177966, + "description": "within top 5", + "details": [] + } + ] + } + ] +... +``` +</details> + +## Response body fields + +Field | Description +:--- | :--- +`explanation` | The `explanation` object has three properties: `value`, `description`, and `details`. The `value` property shows the result of the calculation, `description` explains what type of calculation was performed, and `details` shows any subcalculations performed. For score normalization, the information in the `description` property includes the technique used for normalization or combination and the corresponding score. + +## Next steps + +- To learn how to use `explain` with inner hits, see [Using inner hits in hybrid queries]({{site.url}}{{site.baseurl}}/vector-search/ai-search/hybrid-search/inner-hits/). 
diff --git a/_vector-search/ai-search/hybrid-search/index.md b/_vector-search/ai-search/hybrid-search/index.md new file mode 100644 index 00000000000..392b2a76030 --- /dev/null +++ b/_vector-search/ai-search/hybrid-search/index.md @@ -0,0 +1,354 @@ +--- +layout: default +title: Hybrid search +parent: AI search +has_children: true +nav_order: 40 +redirect_from: + - /search-plugins/hybrid-search/ + - /vector-search/ai-search/hybrid-search/ +--- + +# Hybrid search +Introduced 2.11 +{: .label .label-purple } + +Hybrid search combines keyword and semantic search to improve search relevance. To implement hybrid search, you need to set up a [search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/) that runs at search time. The search pipeline intercepts search results at an intermediate stage and applies processing to normalize and combine document scores. + +There are two types of processors available for hybrid search: + +- [Normalization processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/normalization-processor/) (Introduced 2.10): A score-based processor that normalizes and combines document scores from multiple query clauses, rescoring the documents using the selected normalization and combination techniques. +- [Score ranker processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/score-ranker-processor/) (Introduced 2.19): A rank-based processor that uses rank fusion to combine and rerank documents from multiple query clauses. + +**PREREQUISITE**<br> +To follow this example, you must set up a text embedding model. For more information, see [Choosing a model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/integrating-ml-models/#choosing-a-model). If you have already generated text embeddings, skip to [Step 3](#step-3-configure-a-search-pipeline). +{: .note} + +## Configuring hybrid search + +There are two ways to configure hybrid search: + +- [**Automated workflow**](#automated-workflow) (Recommended for quick setup): Automatically create an ingest pipeline, an index, and a search pipeline with minimal configuration. +- [**Manual setup**](#manual-setup) (Recommended for custom configurations): Manually configure each component for greater flexibility and control. + +## Automated workflow + +OpenSearch provides a [workflow template]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-templates/#hybrid-search) that automatically creates an ingest pipeline, an index, and a search pipeline. You must provide the model ID for the configured model when creating a workflow. Review the hybrid search workflow template [defaults](https://github.com/opensearch-project/flow-framework/blob/main/src/main/resources/defaults/hybrid-search-defaults.json) to determine whether you need to update any of the parameters. For example, if the model dimensionality is different from the default (`1024`), specify the dimensionality of your model in the `output_dimension` parameter. 
To create the default hybrid search workflow, send the following request: + +```json +POST /_plugins/_flow_framework/workflow?use_case=hybrid_search&provision=true +{ +"create_ingest_pipeline.model_id": "mBGzipQB2gmRjlv_dOoB" +} +``` +{% include copy-curl.html %} + +OpenSearch responds with a workflow ID for the created workflow: + +```json +{ + "workflow_id" : "U_nMXJUBq_4FYQzMOS4B" +} +``` + +To check the workflow status, send the following request: + +```json +GET /_plugins/_flow_framework/workflow/U_nMXJUBq_4FYQzMOS4B/_status +``` +{% include copy-curl.html %} + +Once the workflow completes, the `state` changes to `COMPLETED`. The workflow creates the following components: + +- An ingest pipeline named `nlp-ingest-pipeline` +- An index named `my-nlp-index` +- A search pipeline named `nlp-search-pipeline` + +You can now continue with [steps 4 and 5](#step-4-ingest-documents-into-the-index) to ingest documents into the index and search the index. + +## Manual setup + +To manually configure hybrid search, follow these steps: + +1. [Create an ingest pipeline](#step-1-create-an-ingest-pipeline). +1. [Create an index for ingestion](#step-2-create-an-index-for-ingestion). +1. [Configure a search pipeline](#step-3-configure-a-search-pipeline). +1. [Ingest documents into the index](#step-4-ingest-documents-into-the-index). +1. [Search the index using hybrid search](#step-5-search-the-index-using-hybrid-search). + +## Step 1: Create an ingest pipeline + +To generate vector embeddings, you need to create an [ingest pipeline]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/index/) that contains a [`text_embedding` processor]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/processors/text-embedding/), which will convert the text in a document field to vector embeddings. The processor's `field_map` determines the input fields from which to generate vector embeddings and the output fields in which to store the embeddings. + +The following example request creates an ingest pipeline that converts the text from `passage_text` to text embeddings and stores the embeddings in `passage_embedding`: + +```json +PUT /_ingest/pipeline/nlp-ingest-pipeline +{ + "description": "A text embedding pipeline", + "processors": [ + { + "text_embedding": { + "model_id": "bQ1J8ooBpBj3wT4HVUsb", + "field_map": { + "passage_text": "passage_embedding" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +## Step 2: Create an index for ingestion + +In order to use the text embedding processor defined in your pipeline, create a vector index, adding the pipeline created in the previous step as the default pipeline. Ensure that the fields defined in the `field_map` are mapped as correct types. Continuing with the example, the `passage_embedding` field must be mapped as a k-NN vector with a dimension that matches the model dimension. Similarly, the `passage_text` field should be mapped as `text`. 
+ +The following example request creates a vector index that is set up with a default ingest pipeline: + +```json +PUT /my-nlp-index +{ + "settings": { + "index.knn": true, + "default_pipeline": "nlp-ingest-pipeline" + }, + "mappings": { + "properties": { + "id": { + "type": "text" + }, + "passage_embedding": { + "type": "knn_vector", + "dimension": 768, + "method": { + "engine": "lucene", + "space_type": "l2", + "name": "hnsw", + "parameters": {} + } + }, + "passage_text": { + "type": "text" + } + } + } +} +``` +{% include copy-curl.html %} + +For more information about creating a vector index and using supported methods, see [Creating a vector index]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/). + + +## Step 3: Configure a search pipeline + +To configure a search pipeline with a [`normalization-processor`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/normalization-processor/), use the following request. The normalization technique in the processor is set to `min_max`, and the combination technique is set to `arithmetic_mean`. The `weights` array specifies the weights assigned to each query clause as decimal percentages: + +```json +PUT /_search/pipeline/nlp-search-pipeline +{ + "description": "Post processor for hybrid search", + "phase_results_processors": [ + { + "normalization-processor": { + "normalization": { + "technique": "min_max" + }, + "combination": { + "technique": "arithmetic_mean", + "parameters": { + "weights": [ + 0.3, + 0.7 + ] + } + } + } + } + ] +} +``` +{% include copy-curl.html %} + +## Step 4: Ingest documents into the index + +To ingest documents into the index created in the previous step, send the following requests: + +```json +PUT /my-nlp-index/_doc/1 +{ + "passage_text": "Hello world", + "id": "s1" +} +``` +{% include copy-curl.html %} + +```json +PUT /my-nlp-index/_doc/2 +{ + "passage_text": "Hi planet", + "id": "s2" +} +``` +{% include copy-curl.html %} + +Before the document is ingested into the index, the ingest pipeline runs the `text_embedding` processor on the document, generating text embeddings for the `passage_text` field. The indexed document includes the `passage_text` field, which contains the original text, and the `passage_embedding` field, which contains the vector embeddings. + +## Step 5: Search the index using hybrid search + +To perform hybrid search on your index, use the [`hybrid` query]({{site.url}}{{site.baseurl}}/query-dsl/compound/hybrid/), which combines the results of keyword and semantic search. + +#### Example: Combining a neural query and a match query + +The following example request combines two query clauses---a `neural` query and a `match` query. It specifies the search pipeline created in the previous step as a query parameter: + +```json +GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline +{ + "_source": { + "exclude": [ + "passage_embedding" + ] + }, + "query": { + "hybrid": { + "queries": [ + { + "match": { + "passage_text": { + "query": "Hi world" + } + } + }, + { + "neural": { + "passage_embedding": { + "query_text": "Hi world", + "model_id": "aVeif4oB5Vm0Tdw8zYO2", + "k": 5 + } + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +Alternatively, you can set a default search pipeline for the `my-nlp-index` index. For more information, see [Default search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/using-search-pipeline/#default-search-pipeline). 
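+
+For example, the following minimal sketch (assuming the `index.search.default_pipeline` index setting) sets the pipeline created earlier as the index default so that the `search_pipeline` query parameter can be omitted:
+
+```json
+PUT /my-nlp-index/_settings
+{
+  "index.search.default_pipeline": "nlp-search-pipeline"
+}
+```
+{% include copy-curl.html %}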
+ +The response contains the matching document: + +```json +{ + "took" : 36, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 1, + "relation" : "eq" + }, + "max_score" : 1.2251667, + "hits" : [ + { + "_index" : "my-nlp-index", + "_id" : "1", + "_score" : 1.2251667, + "_source" : { + "passage_text" : "Hello world", + "id" : "s1" + } + } + ] + } +} +``` +{% include copy-curl.html %} + +#### Example: Combining a match query and a term query + +The following example request combines two query clauses---a `match` query and a `term` query. It specifies the search pipeline created in the previous step as a query parameter: + +```json +GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline +{ + "_source": { + "exclude": [ + "passage_embedding" + ] + }, + "query": { + "hybrid": { + "queries": [ + { + "match":{ + "passage_text": "hello" + } + }, + { + "term":{ + "passage_text":{ + "value":"planet" + } + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +The response contains the matching documents: + +```json +{ + "took": 11, + "timed_out": false, + "_shards": { + "total": 2, + "successful": 2, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 0.7, + "hits": [ + { + "_index": "my-nlp-index", + "_id": "2", + "_score": 0.7, + "_source": { + "id": "s2", + "passage_text": "Hi planet" + } + }, + { + "_index": "my-nlp-index", + "_id": "1", + "_score": 0.3, + "_source": { + "id": "s1", + "passage_text": "Hello world" + } + } + ] + } +} +``` +{% include copy-curl.html %} + +## Next steps + +- Explore our [tutorials]({{site.url}}{{site.baseurl}}/vector-search/tutorials/) to learn how to build AI search applications. \ No newline at end of file diff --git a/_vector-search/ai-search/hybrid-search/inner-hits.md b/_vector-search/ai-search/hybrid-search/inner-hits.md new file mode 100644 index 00000000000..11c344f438b --- /dev/null +++ b/_vector-search/ai-search/hybrid-search/inner-hits.md @@ -0,0 +1,648 @@ +--- +layout: default +title: Using inner hits in hybrid queries +parent: Hybrid search +grand_parent: AI search +has_children: false +nav_order: 60 +--- + +# Using inner hits in hybrid queries +**Introduced 3.0** +{: .label .label-purple } + +When running a hybrid search, you can retrieve the matching nested objects or child documents by including an `inner_hits` clause in your search request. This information lets you explore the specific parts of a document that matched the query. + +To learn more about how `inner_hits` works, see [Retrieve inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/). + + +During hybrid query execution, documents are scored and retrieved as follows: + +1. Each subquery selects parent documents based on the relevance of their inner hits. +1. The selected parent documents from all subqueries are combined, and their scores are normalized to produce a hybrid score. +1. For each parent document, the relevant `inner_hits` are retrieved from the shards and included in the final response. + +Hybrid queries handle inner hits differently than traditional queries when determining final search results: + +- In a **traditional query**, the final ranking of parent documents is determined directly by the `inner_hits` scores. +- In a **hybrid query**, the final ranking is determined by the **hybrid score** (a normalized combination of all subquery scores). 
However, parent documents are still fetched from the shard based on the relevance of their `inner_hits`. + +The `inner_hits` section in the response shows the original (raw) scores before normalization. The parent documents show the final hybrid score. +{: .note} + +## Example + +The following example demonstrates using `inner_hits` with a hybrid query. + +### Step 1: Create an index + +Create an index with two nested fields (`user` and `location`): + +```json +PUT /my-nlp-index +{ + "settings": { + "number_of_shards": 3, + "number_of_replicas": 0 + }, + "mappings": { + "properties": { + "user": { + "type": "nested", + "properties": { + "name": { + "type": "text" + }, + "age": { + "type": "integer" + } + } + }, + "location": { + "type": "nested", + "properties": { + "city": { + "type": "text" + }, + "state": { + "type": "text" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +### Step 2: Create a search pipeline + +Configure a search pipeline with a `normalization-processor` using the `min_max` normalization technique and the `arithmetic_mean` combination technique: + +```json +PUT /_search/pipeline/nlp-search-pipeline +{ + "description": "Post processor for hybrid search", + "phase_results_processors": [ + { + "normalization-processor": { + "normalization": { + "technique": "min_max" + }, + "combination": { + "technique": "arithmetic_mean", + "parameters": {} + } + } + } + ] +} +``` +{% include copy-curl.html %} + +### Step 3: Ingest documents into the index + +To ingest documents into the index created in the previous step, send the following request: + +```json +POST /my-nlp-index/_bulk +{"index": {"_index": "my-nlp-index"}} +{"user":[{"name":"John Alder","age":35},{"name":"Sammy","age":34},{"name":"Mike","age":32},{"name":"Maples","age":30}],"location":[{"city":"Amsterdam","state":"Netherlands"},{"city":"Udaipur","state":"Rajasthan"},{"city":"Naples","state":"Italy"}]} +{"index": {"_index": "my-nlp-index"}} +{"user":[{"name":"John Wick","age":46},{"name":"John Snow","age":40},{"name":"Sansa Stark","age":22},{"name":"Arya Stark","age":20}],"location":[{"city":"Tromso","state":"Norway"},{"city":"Los Angeles","state":"California"},{"city":"London","state":"UK"}]} +``` +{% include copy-curl.html %} + +### Step 4: Search the index using hybrid search and fetch inner hits + +The following request runs a hybrid query to search for matches in two nested fields: `user` and `location`. It combines the results from each field into a single ranked list of parent documents while also retrieving the matching nested objects using `inner_hits`: + +```json +GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline +{ + "query": { + "hybrid": { + "queries": [ + { + "nested": { + "path": "user", + "query": { + "match": { + "user.name": "John" + } + }, + "score_mode": "sum", + "inner_hits": {} + } + }, + { + "nested": { + "path": "location", + "query": { + "match": { + "location.city": "Udaipur" + } + }, + "inner_hits": {} + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +The response includes the matched parent documents along with the relevant nested `inner_hits` for both the `user` and `location` nested fields. Each inner hit shows which nested object matched and how strongly it contributed to the overall hybrid score: + +```json +... 
+{ + "hits": [ + { + "_index": "my-nlp-index", + "_id": "1", + "_score": 1.0, + "inner_hits": { + "location": { + "hits": { + "max_score": 0.44583148, + "hits": [ + { + "_nested": { + "field": "location", + "offset": 1 + }, + "_score": 0.44583148, + "_source": { + "city": "Udaipur", + "state": "Rajasthan" + } + } + ] + } + }, + "user": { + "hits": { + "max_score": 0.4394061, + "hits": [ + { + "_nested": { + "field": "user", + "offset": 0 + }, + "_score": 0.4394061, + "_source": { + "name": "John Alder", + "age": 35 + } + } + ] + } + } + } + // Additional details omitted for brevity + }, + { + "_index": "my-nlp-index", + "_id": "2", + "_score": 5.0E-4, + "inner_hits": { + "user": { + "hits": { + "max_score": 0.31506687, + "hits": [ + { + "_nested": { + "field": "user", + "offset": 0 + }, + "_score": 0.31506687, + "_source": { + "name": "John Wick", + "age": 46 + } + }, + { + "_nested": { + "field": "user", + "offset": 1 + }, + "_score": 0.31506687, + "_source": { + "name": "John Snow", + "age": 40 + } + } + ] + } + } + // Additional details omitted for brevity + } + } + ] + // Additional details omitted for brevity +} +... +``` + +## Using the explain parameter + +To understand how inner hits contribute to the hybrid score, you can enable explanation. The response will include detailed scoring information. For more information about using `explain` with hybrid queries, see [Hybrid search explain]({{site.url}}{{site.baseurl}}/vector-search/ai-search/hybrid-search/explain/). + +`explain` is an expensive operation in terms of both resources and time. For production clusters, we recommend using it sparingly for the purpose of troubleshooting. +{: .warning} + +First, add the `hybrid_score_explanation` processor to the search pipeline you created in Step 2: + +```json +PUT /_search/pipeline/nlp-search-pipeline +{ + "description": "Post processor for hybrid search", + "phase_results_processors": [ + { + "normalization-processor": { + "normalization": { + "technique": "min_max" + }, + "combination": { + "technique": "arithmetic_mean" + } + } + } + ], + "response_processors": [ + { + "hybrid_score_explanation": {} + } + ] +} +``` +{% include copy-curl.html %} + +For more information, see [Hybrid score explanation processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/explanation-processor/) and [Hybrid search explain]({{site.url}}{{site.baseurl}}/vector-search/ai-search/hybrid-search/explain/). + +Then, run the same query you ran in Step 4 and include the `explain` parameter in your search request: + +```json +GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline&explain=true +{ + "query": { + "hybrid": { + "queries": [ + { + "nested": { + "path": "user", + "query": { + "match": { + "user.name": "John" + } + }, + "score_mode": "sum", + "inner_hits": {} + } + }, + { + "nested": { + "path": "location", + "query": { + "match": { + "location.city": "Udaipur" + } + }, + "inner_hits": {} + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +The response includes an `_explanation` object containing detailed scoring information. The nested `details` array provides the relevant information about the score mode used, the number of child documents contributing to the parent document's score, and how the scores were normalized and combined: + +```json +{ + ... 
+ "_explanation": { + "value": 1.0, + "description": "arithmetic_mean combination of:", + "details": [ + { + "value": 1.0, + "description": "min_max normalization of:", + "details": [ + { + "value": 0.4458314776420593, + "description": "combined score of:", + "details": [ + { + "value": 0.4394061, + "description": "Score based on 1 child docs in range from 0 to 6, using score mode Avg", + "details": [ + { + "value": 0.4394061, + "description": "weight(user.name:john in 0) [PerFieldSimilarity], result of:" + // Additional details omitted for brevity + } + ] + }, + { + "value": 0.44583148, + "description": "Score based on 1 child docs in range from 0 to 6, using score mode Avg", + "details": [ + { + "value": 0.44583148, + "description": "weight(location.city:udaipur in 5) [PerFieldSimilarity], result of:" + // Additional details omitted for brevity + } + ] + } + ] + } + ] + } + ] + } +} +... +``` + +## Sorting with inner hits + +To apply sorting, add a `sort` subclause in the `inner_hits` clause. For example, to sort by `user.age`, specify this sort condition in the `inner_hits` clause: + +```json +GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline +{ + "query": { + "hybrid": { + "queries": [ + { + "nested": { + "path": "user", + "query": { + "match": { + "user.name": "John" + } + }, + "score_mode": "sum", + "inner_hits": { + "sort": [ + { + "user.age": { + "order": "desc" + } + } + ] + } + } + }, + { + "nested": { + "path": "location", + "query": { + "match": { + "location.city": "Udaipur" + } + }, + "inner_hits": {} + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +In the response, the `user` inner hits are sorted by age in descending order rather than by relevance, which is why the `_score` field is `null` (scores are not calculated when custom sorting is applied): + +```json +... +"user": { + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": null, + "hits": [ + { + "_index": "my-nlp-index", + "_id": "2", + "_nested": { + "field": "user", + "offset": 0 + }, + "_score": null, + "_source": { + "name": "John Wick", + "age": 46 + }, + "sort": [ + 46 + ] + }, + { + "_index": "my-nlp-index", + "_id": "2", + "_nested": { + "field": "user", + "offset": 1 + }, + "_score": null, + "_source": { + "name": "John Snow", + "age": 40 + }, + "sort": [ + 40 + ] + } + ] + } +} +... +``` + +## Pagination with inner hits + +To paginate inner hit results, specify the `from` parameter (starting position) and `size` parameter (number of results) in the `inner_hits` clause. The following example request retrieves only the third and fourth nested objects from the `user` field by setting `from` to `2` (skip the first two) and `size` to `2` (return two results): + +```json +GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline +{ + "query": { + "hybrid": { + "queries": [ + { + "nested": { + "path": "user", + "query": { + "match_all": {} + }, + "inner_hits": { + "from": 2, + "size": 2 + } + } + }, + { + "nested": { + "path": "location", + "query": { + "match": { + "location.city": "Udaipur" + } + }, + "inner_hits": {} + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +The response contains the `user` field inner hits starting from the offset of `2`: + +```json +... 
+"user": { + "hits": { + "total": { + "value": 4, + "relation": "eq" + }, + "max_score": 1.0, + "hits": [ + { + "_index": "my-nlp-index", + "_id": "1", + "_nested": { + "field": "user", + "offset": 2 + }, + "_score": 1.0, + "_source": { + "name": "Mike", + "age": 32 + } + }, + { + "_index": "my-nlp-index", + "_id": "1", + "_nested": { + "field": "user", + "offset": 3 + }, + "_score": 1.0, + "_source": { + "name": "Maples", + "age": 30 + } + } + ] + } +} +... +``` + +## Defining a custom name for the inner_hits field + +To differentiate between multiple inner hits in a single query, you can define custom names for inner hits in the search response. For example, you can provide a custom name, `coordinates`, for the `location` field inner hits as follows: + +```json +GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline +{ + "query": { + "hybrid": { + "queries": [ + { + "nested": { + "path": "user", + "query": { + "match_all": {} + }, + "inner_hits": { + "name": "coordinates" + } + } + }, + { + "nested": { + "path": "location", + "query": { + "match": { + "location.city": "Udaipur" + } + }, + "inner_hits": {} + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +In the response, inner hits for the `user` field appear under the custom name `coordinates`: + +```json +... +"inner_hits": { + "coordinates": { + "hits": { + "total": { + "value": 4, + "relation": "eq" + }, + "max_score": 1.0, + "hits": [ + { + "_index": "my-nlp-index", + "_id": "1", + "_nested": { + "field": "user", + "offset": 0 + }, + "_score": 1.0, + "_source": { + "name": "John Alder", + "age": 35 + } + } + ] + } + }, + "location": { + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.44583148, + "hits": [ + { + "_index": "my-nlp-index", + "_id": "1", + "_nested": { + "field": "location", + "offset": 1 + }, + "_score": 0.44583148, + "_source": { + "city": "Udaipur", + "state": "Rajasthan" + } + } + ] + } + } +} +... +``` diff --git a/_vector-search/ai-search/hybrid-search/pagination.md b/_vector-search/ai-search/hybrid-search/pagination.md new file mode 100644 index 00000000000..e245f7b56cf --- /dev/null +++ b/_vector-search/ai-search/hybrid-search/pagination.md @@ -0,0 +1,205 @@ +--- +layout: default +title: Paginating hybrid query results +parent: Hybrid search +grand_parent: AI search +has_children: false +nav_order: 20 +--- + +## Paginating hybrid query results +**Introduced 2.19** +{: .label .label-purple } + +You can apply pagination to hybrid query results by using the `pagination_depth` parameter in the hybrid query clause, along with the standard `from` and `size` parameters. The `pagination_depth` parameter defines the maximum number of search results that can be retrieved from each shard per subquery. For example, setting `pagination_depth` to `50` allows up to 50 results per subquery to be maintained in memory from each shard. + +To navigate through the results, use the `from` and `size` parameters: + +- `from`: Specifies the document number from which you want to start showing the results. Default is `0`. +- `size`: Specifies the number of results to return on each page. Default is `10`. + +For example, to show 10 documents starting from the 20th document, specify `from: 20` and `size: 10`. For more information about pagination, see [Paginate results]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/paginate/#the-from-and-size-parameters). 
+ +### The impact of pagination_depth on hybrid search results + +Changing `pagination_depth` affects the underlying set of search results retrieved before any ranking, filtering, or pagination adjustments are applied. This is because `pagination_depth` determines the number of results retrieved per subquery from each shard, which can ultimately change the result order after normalization. To ensure consistent pagination, keep the `pagination_depth` value the same while navigating between pages. + +By default, hybrid search without pagination retrieves results using the `from + size` formula, where `from` is always `0`. +{: .note} + +To enable deeper pagination, increase the `pagination_depth` value. You can then navigate through results using the `from` and `size` parameters. Note that deeper pagination can impact search performance because retrieving and processing more results requires additional computational resources. + +The following example shows a search request configured with `from: 0`, `size: 5`, and `pagination_depth: 10`. This means that up to 10 search results per shard will be retrieved for both the `bool` and `term` queries before pagination is applied: + +```json +GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline +{ + "size": 5, + "query": { + "hybrid": { + "pagination_depth":10, + "queries": [ + { + "term": { + "category": "permission" + } + }, + { + "bool": { + "should": [ + { + "term": { + "category": "editor" + } + }, + { + "term": { + "category": "statement" + } + } + ] + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +The response contains the first five results: + +```json +{ + "hits": { + "total": { + "value": 6, + "relation": "eq" + }, + "max_score": 0.5, + "hits": [ + { + "_index": "my-nlp-index", + "_id": "d3eXlZQBJkWerFzHv4eV", + "_score": 0.5, + "_source": { + "category": "permission", + "doc_keyword": "workable", + "doc_index": 4976, + "doc_price": 100 + } + }, + { + "_index": "my-nlp-index", + "_id": "eneXlZQBJkWerFzHv4eW", + "_score": 0.5, + "_source": { + "category": "editor", + "doc_index": 9871, + "doc_price": 30 + } + }, + { + "_index": "my-nlp-index", + "_id": "e3eXlZQBJkWerFzHv4eW", + "_score": 0.5, + "_source": { + "category": "statement", + "doc_keyword": "entire", + "doc_index": 8242, + "doc_price": 350 + } + }, + { + "_index": "my-nlp-index", + "_id": "fHeXlZQBJkWerFzHv4eW", + "_score": 0.24999997, + "_source": { + "category": "statement", + "doc_keyword": "idea", + "doc_index": 5212, + "doc_price": 200 + } + }, + { + "_index": "index-test", + "_id": "fXeXlZQBJkWerFzHv4eW", + "_score": 5.0E-4, + "_source": { + "category": "editor", + "doc_keyword": "bubble", + "doc_index": 1298, + "doc_price": 130 + } + } + ] + } +} +``` + +The following search request is configured with `from: 6`, `size: 5`, and `pagination_depth: 10`. 
The `pagination_depth` remains unchanged to ensure that pagination is based on the same set of search results: + +```json +GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline +{ + "size":5, + "from":6, + "query": { + "hybrid": { + "pagination_depth":10, + "queries": [ + { + "term": { + "category": "permission" + } + }, + { + "bool": { + "should": [ + { + "term": { + "category": "editor" + } + }, + { + "term": { + "category": "statement" + } + } + ] + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +The response excludes the first five entries and displays the remaining results: + +```json +{ + "hits": { + "total": { + "value": 6, + "relation": "eq" + }, + "max_score": 0.5, + "hits": [ + { + "_index": "index-test", + "_id": "fneXlZQBJkWerFzHv4eW", + "_score": 5.0E-4, + "_source": { + "category": "editor", + "doc_keyword": "bubble", + "doc_index": 521, + "doc_price": 75 + } + } + ] + } +} +``` + diff --git a/_vector-search/ai-search/hybrid-search/post-filtering.md b/_vector-search/ai-search/hybrid-search/post-filtering.md new file mode 100644 index 00000000000..7436d2862a2 --- /dev/null +++ b/_vector-search/ai-search/hybrid-search/post-filtering.md @@ -0,0 +1,125 @@ +--- +layout: default +title: Hybrid search with post-filtering +parent: Hybrid search +grand_parent: AI search +has_children: false +nav_order: 40 +--- + +# Hybrid search with post-filtering +**Introduced 2.13** +{: .label .label-purple } + +You can perform post-filtering on hybrid search results by providing the `post_filter` parameter in your query. + +The `post_filter` clause is applied after the search results have been retrieved. Post-filtering is useful for applying additional filters to the search results without impacting the scoring or the order of the results. + +Post-filtering does not impact document relevance scores or aggregation results. +{: .note} + +## Example + +The following example request combines two query clauses---a `term` query and a `match` query---and contains a `post_filter`: + +```json +GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline +{ + "query": { + "hybrid":{ + "queries":[ + { + "match":{ + "passage_text": "hello" + } + }, + { + "term":{ + "passage_text":{ + "value":"planet" + } + } + } + ] + } + + }, + "post_filter":{ + "match": { "passage_text": "world" } + } +} +``` +{% include copy-curl.html %} + +Compare the results to the results in the [example without post-filtering]({{site.url}}{{site.baseurl}}/vector-search/ai-search/hybrid-search/#example-combining-a-match-query-and-a-term-query). In the example without post-filtering, the response contains two documents. In this example, the response contains one document because the second document is filtered out: + +```json +{ + "took": 18, + "timed_out": false, + "_shards": { + "total": 2, + "successful": 2, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.3, + "hits": [ + { + "_index": "my-nlp-index", + "_id": "1", + "_score": 0.3, + "_source": { + "id": "s1", + "passage_text": "Hello world" + } + } + ] + } +} +``` + +## How post-filtering affects search results and scoring + +Post-filtering can significantly change the final search results and document scores. Consider the following scenarios. 
+
+### Single-query scenario
+
+Consider a query that returns the following results:
+- Query results before normalization: `[d2: 5.0, d4: 3.0, d1: 2.0]`
+- Normalized scores: `[d2: 1.0, d4: 0.33, d1: 0.0]`
+
+After applying a post-filter to the initial query results, the results are as follows:
+- Post-filter matches `[d2, d4]`
+- Resulting scores: `[d2: 1.0, d4: 0.0]`
+
+Note how document `d4`'s score changes from `0.33` to `0.0` after applying the post-filter.
+
+### Multiple-query scenario
+
+Consider a query with two subqueries:
+- Query 1 results: `[d2: 5.0, d4: 3.0, d1: 2.0]`
+- Query 2 results: `[d1: 1.0, d5: 0.5, d4: 0.25]`
+- Normalized scores:
+  - Query 1: `[d2: 1.0, d4: 0.33, d1: 0.0]`
+  - Query 2: `[d1: 1.0, d5: 0.33, d4: 0.0]`
+- Combined initial scores: `[d2: 1.0, d1: 0.5, d5: 0.33, d4: 0.165]`
+
+After applying a post-filter to the initial query results, the results are as follows:
+- Post-filter matches `[d2, d4]`
+- Resulting scores:
+  - Query 1: `[d2: 5.0, d4: 3.0]`
+  - Query 2: `[d4: 0.25]`
+- Normalized scores:
+  - Query 1: `[d2: 1.0, d4: 0.0]`
+  - Query 2: `[d4: 1.0]`
+- Combined final scores: `[d2: 1.0, d4: 0.5]`
+
+Observe that:
+- Document `d2`'s score remains unchanged.
+- Document `d4`'s score has changed.
\ No newline at end of file
diff --git a/_vector-search/ai-search/hybrid-search/search-after.md b/_vector-search/ai-search/hybrid-search/search-after.md
new file mode 100644
index 00000000000..016d539ece2
--- /dev/null
+++ b/_vector-search/ai-search/hybrid-search/search-after.md
@@ -0,0 +1,206 @@
+---
+layout: default
+title: Hybrid search with search_after
+parent: Hybrid search
+grand_parent: AI search
+has_children: false
+nav_order: 30
+---
+
+# Hybrid search with search_after
+**Introduced 2.16**
+{: .label .label-purple }
+
+You can paginate sorted hybrid search results by applying a `search_after` condition, which provides a live cursor that uses the previous page's results to obtain the next page's results. For more information about `search_after`, see [The search_after parameter]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/paginate/#the-search_after-parameter).
+
+To paginate the sorted results, provide a `search_after` condition together with the `sort` criteria in the search request.
+
+In the following example, sorting is applied by `doc_price` with a `search_after` condition:
+
+```json
+GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline
+{
+  "query": {
+    "hybrid": {
+      "queries": [
+        {
+          "term": {
+            "category": "permission"
+          }
+        },
+        {
+          "bool": {
+            "should": [
+              {
+                "term": {
+                  "category": "editor"
+                }
+              },
+              {
+                "term": {
+                  "category": "statement"
+                }
+              }
+            ]
+          }
+        }
+      ]
+    }
+  },
+  "sort":[
+    {
+      "doc_price": {
+        "order": "desc"
+      }
+    }
+  ],
+  "search_after":[200]
+}
+```
+{% include copy-curl.html %}
+
+The response contains the matching documents that are listed after the `200` sort value, sorted by `doc_price` in descending order:
+
+```json
+{
+  "took": 8,
+  "timed_out": false,
+  "_shards": {
+    "total": 3,
+    "successful": 3,
+    "skipped": 0,
+    "failed": 0
+  },
+  "hits": {
+    "total": {
+      "value": 4,
+      "relation": "eq"
+    },
+    "max_score": 0.5,
+    "hits": [
+      {
+        "_index": "my-nlp-index",
+        "_id": "6yaM4JABZkI1FQv8AwoM",
+        "_score": null,
+        "_source": {
+          "category": "permission",
+          "doc_keyword": "workable",
+          "doc_index": 4976,
+          "doc_price": 100
+        },
+        "sort": [
+          100
+        ]
+      },
+      {
+        "_index": "my-nlp-index",
+        "_id": "7iaM4JABZkI1FQv8AwoN",
+        "_score": null,
+        "_source": {
+          "category": "editor",
+          "doc_index": 9871,
+          "doc_price": 30
+        },
+        "sort": [
+          30
+        ]
+      }
+    ]
+  }
+}
+```
+
+In the following example, sorting is applied by `_id` with a `search_after` condition:
+
+```json
+GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline
+{
+  "query": {
+    "hybrid": {
+      "queries": [
+        {
+          "term": {
+            "category": "permission"
+          }
+        },
+        {
+          "bool": {
+            "should": [
+              {
+                "term": {
+                  "category": "editor"
+                }
+              },
+              {
+                "term": {
+                  "category": "statement"
+                }
+              }
+            ]
+          }
+        }
+      ]
+    }
+  },
+  "sort":[
+    {
+      "_id": {
+        "order": "desc"
+      }
+    }
+  ],
+  "search_after":["7yaM4JABZkI1FQv8AwoN"]
+}
+```
+{% include copy-curl.html %}
+
+The response contains the matching documents that are listed after the `7yaM4JABZkI1FQv8AwoN` sort value, sorted by `_id` in descending order:
+
+```json
+{
+  "took": 17,
+  "timed_out": false,
+  "_shards": {
+    "total": 3,
+    "successful": 3,
+    "skipped": 0,
+    "failed": 0
+  },
+  "hits": {
+    "total": {
+      "value": 4,
+      "relation": "eq"
+    },
+    "max_score": 0.5,
+    "hits": [
+      {
+        "_index": "my-nlp-index",
+        "_id": "7iaM4JABZkI1FQv8AwoN",
+        "_score": null,
+        "_source": {
+          "category": "editor",
+          "doc_index": 9871,
+          "doc_price": 30
+        },
+        "sort": [
+          "7iaM4JABZkI1FQv8AwoN"
+        ]
+      },
+      {
+        "_index": "my-nlp-index",
+        "_id": "6yaM4JABZkI1FQv8AwoM",
+        "_score": null,
+        "_source": {
+          "category": "permission",
+          "doc_keyword": "workable",
+          "doc_index": 4976,
+          "doc_price": 100
+        },
+        "sort": [
+          "6yaM4JABZkI1FQv8AwoM"
+        ]
+      }
+    ]
+  }
+}
+```
\ No newline at end of file
diff --git a/_vector-search/ai-search/hybrid-search/sorting.md b/_vector-search/ai-search/hybrid-search/sorting.md
new file mode 100644
index 00000000000..460c04ece91
--- /dev/null
+++ b/_vector-search/ai-search/hybrid-search/sorting.md
@@ -0,0 +1,259 @@
+---
+layout: default
+title: Using sorting with a hybrid query
+parent: Hybrid search
+grand_parent: AI search
+has_children: false
+nav_order: 10
+---
+
+# Using sorting with a hybrid query
+**Introduced 2.16**
+{: .label .label-purple }
+
+By default, hybrid search returns results ordered by scores in descending order. You can apply sorting to hybrid query results by providing the `sort` criteria in the search request.
For more information about sort criteria, see [Sort results]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/sort/). +When sorting is applied to a hybrid search, results are fetched from the shards based on the specified sort criteria. As a result, the search results are sorted accordingly, and the document scores are `null`. Scores are only present in the hybrid search sorting results if documents are sorted by `_score`. + +In the following example, sorting is applied by `doc_price` in the hybrid query search request: + +```json +GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline +{ + "query": { + "hybrid": { + "queries": [ + { + "term": { + "category": "permission" + } + }, + { + "bool": { + "should": [ + { + "term": { + "category": "editor" + } + }, + { + "term": { + "category": "statement" + } + } + ] + } + } + ] + } + }, + "sort":[ + { + "doc_price": { + "order": "desc" + } + } + ] +} +``` +{% include copy-curl.html %} + +The response contains the matching documents sorted by `doc_price` in descending order: + +```json +{ + "took": 35, + "timed_out": false, + "_shards": { + "total": 3, + "successful": 3, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4, + "relation": "eq" + }, + "max_score": 0.5, + "hits": [ + { + "_index": "my-nlp-index", + "_id": "7yaM4JABZkI1FQv8AwoN", + "_score": null, + "_source": { + "category": "statement", + "doc_keyword": "entire", + "doc_index": 8242, + "doc_price": 350 + }, + "sort": [ + 350 + ] + }, + { + "_index": "my-nlp-index", + "_id": "8CaM4JABZkI1FQv8AwoN", + "_score": null, + "_source": { + "category": "statement", + "doc_keyword": "idea", + "doc_index": 5212, + "doc_price": 200 + }, + "sort": [ + 200 + ] + }, + { + "_index": "my-nlp-index", + "_id": "6yaM4JABZkI1FQv8AwoM", + "_score": null, + "_source": { + "category": "permission", + "doc_keyword": "workable", + "doc_index": 4976, + "doc_price": 100 + }, + "sort": [ + 100 + ] + }, + { + "_index": "my-nlp-index", + "_id": "7iaM4JABZkI1FQv8AwoN", + "_score": null, + "_source": { + "category": "editor", + "doc_index": 9871, + "doc_price": 30 + }, + "sort": [ + 30 + ] + } + ] + } +} +``` + +In the following example, sorting is applied by `_id`: + +```json +GET /my-nlp-index/_search?search_pipeline=nlp-search-pipeline +{ + "query": { + "hybrid": { + "queries": [ + { + "term": { + "category": "permission" + } + }, + { + "bool": { + "should": [ + { + "term": { + "category": "editor" + } + }, + { + "term": { + "category": "statement" + } + } + ] + } + } + ] + } + }, + "sort":[ + { + "_id": { + "order": "desc" + } + } + ] +} +``` +{% include copy-curl.html %} + +The response contains the matching documents sorted by `_id` in descending order: + +```json +{ + "took": 33, + "timed_out": false, + "_shards": { + "total": 3, + "successful": 3, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4, + "relation": "eq" + }, + "max_score": 0.5, + "hits": [ + { + "_index": "my-nlp-index", + "_id": "8CaM4JABZkI1FQv8AwoN", + "_score": null, + "_source": { + "category": "statement", + "doc_keyword": "idea", + "doc_index": 5212, + "doc_price": 200 + }, + "sort": [ + "8CaM4JABZkI1FQv8AwoN" + ] + }, + { + "_index": "my-nlp-index", + "_id": "7yaM4JABZkI1FQv8AwoN", + "_score": null, + "_source": { + "category": "statement", + "doc_keyword": "entire", + "doc_index": 8242, + "doc_price": 350 + }, + "sort": [ + "7yaM4JABZkI1FQv8AwoN" + ] + }, + { + "_index": "my-nlp-index", + "_id": "7iaM4JABZkI1FQv8AwoN", + "_score": null, + "_source": { + "category": "editor", 
+ "doc_index": 9871, + "doc_price": 30 + }, + "sort": [ + "7iaM4JABZkI1FQv8AwoN" + ] + }, + { + "_index": "my-nlp-index", + "_id": "6yaM4JABZkI1FQv8AwoM", + "_score": null, + "_source": { + "category": "permission", + "doc_keyword": "workable", + "doc_index": 4976, + "doc_price": 100 + }, + "sort": [ + "6yaM4JABZkI1FQv8AwoM" + ] + } + ] + } +} +``` diff --git a/_vector-search/ai-search/index.md b/_vector-search/ai-search/index.md new file mode 100644 index 00000000000..56e9887325e --- /dev/null +++ b/_vector-search/ai-search/index.md @@ -0,0 +1,66 @@ +--- +layout: default +title: AI search +nav_order: 45 +has_children: true +has_toc: false +redirect_from: + - /neural-search-plugin/index/ + - /search-plugins/neural-search/ + - /vector-search/ai-search/ +model_cards: + - heading: "Use a pretrained model provided by OpenSearch" + link: "/ml-commons-plugin/pretrained-models/" + - heading: "Upload your own model to OpenSearch" + link: "/ml-commons-plugin/custom-local-models/" + - heading: "Connect to a model hosted on an external platform" + link: "/ml-commons-plugin/remote-models/index/" +tutorial_cards: + - heading: "Getting started with semantic and hybrid search" + description: "Learn how to implement semantic and hybrid search" + link: "/vector-search/tutorials/neural-search-tutorial/" +search_method_cards: + - heading: "Semantic search" + description: "Uses dense retrieval based on text embedding models to search text data." + link: "/vector-search/ai-search/semantic-search/" + - heading: "Hybrid search" + description: "Combines keyword and semantic search to improve search relevance." + link: "/vector-search/ai-search/hybrid-search/" + - heading: "Multimodal search" + description: "Uses multimodal embedding models to search text and image data." + link: "/vector-search/ai-search/multimodal-search/" + - heading: "Neural sparse search" + description: "Uses sparse retrieval based on sparse embedding models to search text data." + link: "/vector-search/ai-search/neural-sparse-search/" + - heading: "Conversational search with RAG" + description: "Uses retrieval-augmented generation (RAG) and conversational memory to provide context-aware responses." + link: "/vector-search/ai-search/conversational-search/" +--- + +# AI search + +AI search streamlines your workflow by generating embeddings automatically. OpenSearch converts text to vectors during indexing and querying. It creates and indexes vector embeddings for documents and then processes query text into embeddings to find and return the most relevant results. + +## Prerequisite + +Before using AI search, you must set up an ML model for embedding generation. When selecting a model, you have the following options: + +- Use a pretrained model provided by OpenSearch. For more information, see [OpenSearch-provided pretrained models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/). + +- Upload your own model to OpenSearch. For more information, see [Custom local models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/custom-local-models/). + +- Connect to a foundation model hosted on an external platform. For more information, see [Connecting to externally hosted models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index/). + +--- + +## Tutorial + +{% include cards.html cards=page.tutorial_cards %} + +--- + +## AI search methods + +Once you set up an ML model, choose one of the following search methods. 
+ +{% include cards.html cards=page.search_method_cards %} diff --git a/_vector-search/ai-search/multimodal-search.md b/_vector-search/ai-search/multimodal-search.md new file mode 100644 index 00000000000..7f9443aa03e --- /dev/null +++ b/_vector-search/ai-search/multimodal-search.md @@ -0,0 +1,180 @@ +--- +layout: default +title: Multimodal search +parent: AI search +nav_order: 40 +has_children: false +redirect_from: + - /search-plugins/neural-multimodal-search/ + - /search-plugins/multimodal-search/ +--- + +# Multimodal search +Introduced 2.11 +{: .label .label-purple } + +Use multimodal search to search text and image data using multimodal embedding models. + +**PREREQUISITE**<br> +Before using text search, you must set up a multimodal embedding model. For more information, see [Choosing a model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/integrating-ml-models/#choosing-a-model). +{: .note} + +## Configuring multimodal search + +There are two ways to configure multimodal search: + +- [**Automated workflow**](#automated-workflow) (Recommended for quick setup): Automatically create an ingest pipeline and index with minimal configuration. +- [**Manual setup**](#manual-setup) (Recommended for custom configurations): Manually configure each component for greater flexibility and control. + +## Automated workflow + +OpenSearch provides a [workflow template]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-templates/#multimodal-search) that automatically creates both an ingest pipeline and an index. You must provide the model ID for the configured model when creating a workflow. Review the multimodal search workflow template [defaults](https://github.com/opensearch-project/flow-framework/blob/main/src/main/resources/defaults/multi-modal-search-defaults.json) to determine whether you need to update any of the parameters. For example, if the model dimensionality is different from the default (`1024`), specify the dimensionality of your model in the `output_dimension` parameter. To create the default multimodal search workflow, send the following request: + +```json +POST /_plugins/_flow_framework/workflow?use_case=multimodal_search&provision=true +{ +"create_ingest_pipeline.model_id": "mBGzipQB2gmRjlv_dOoB" +} +``` +{% include copy-curl.html %} + +OpenSearch responds with a workflow ID for the created workflow: + +```json +{ + "workflow_id" : "U_nMXJUBq_4FYQzMOS4B" +} +``` + +To check the workflow status, send the following request: + +```json +GET /_plugins/_flow_framework/workflow/U_nMXJUBq_4FYQzMOS4B/_status +``` +{% include copy-curl.html %} + +Once the workflow completes, the `state` changes to `COMPLETED`. The workflow creates the following components: + +- An ingest pipeline named `nlp-ingest-pipeline` +- An index named `my-nlp-index` + +You can now continue with [steps 3 and 4](#step-3-ingest-documents-into-the-index) to ingest documents into the index and search the index. + +## Manual setup + +To manually configure multimodal search with text and image embeddings, follow these steps: + +1. [Create an ingest pipeline](#step-1-create-an-ingest-pipeline). +1. [Create an index for ingestion](#step-2-create-an-index-for-ingestion). +1. [Ingest documents into the index](#step-3-ingest-documents-into-the-index). +1. [Search the index](#step-4-search-the-index). 
+
+## Step 1: Create an ingest pipeline
+
+To generate vector embeddings, you need to create an [ingest pipeline]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/index/) that contains a [`text_image_embedding` processor]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/processors/text-image-embedding/), which will convert the text or image in a document field to vector embeddings. The processor's `field_map` determines the text and image fields from which to generate vector embeddings and the output vector field in which to store the embeddings.
+
+The following example request creates an ingest pipeline where the text from `image_description` and an image from `image_binary` will be converted into vector embeddings and the embeddings will be stored in `vector_embedding`:
+
+```json
+PUT /_ingest/pipeline/nlp-ingest-pipeline
+{
+  "description": "A text/image embedding pipeline",
+  "processors": [
+    {
+      "text_image_embedding": {
+        "model_id": "-fYQAosBQkdnhhBsK593",
+        "embedding": "vector_embedding",
+        "field_map": {
+          "text": "image_description",
+          "image": "image_binary"
+        }
+      }
+    }
+  ]
+}
+```
+{% include copy-curl.html %}
+
+## Step 2: Create an index for ingestion
+
+In order to use the text/image embedding processor defined in your pipeline, create a vector index, adding the pipeline created in the previous step as the default pipeline. Ensure that the fields defined in the `field_map` are mapped as the correct types. Continuing with the example, the `vector_embedding` field must be mapped as a k-NN vector with a dimension that matches the model dimension. Similarly, the `image_description` field should be mapped as `text`, and the `image_binary` field should be mapped as `binary`.
+
+The following example request creates a vector index that is set up with a default ingest pipeline:
+
+```json
+PUT /my-nlp-index
+{
+  "settings": {
+    "index.knn": true,
+    "default_pipeline": "nlp-ingest-pipeline",
+    "number_of_shards": 2
+  },
+  "mappings": {
+    "properties": {
+      "vector_embedding": {
+        "type": "knn_vector",
+        "dimension": 1024,
+        "method": {
+          "name": "hnsw",
+          "engine": "lucene",
+          "parameters": {}
+        }
+      },
+      "image_description": {
+        "type": "text"
+      },
+      "image_binary": {
+        "type": "binary"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+For more information about creating a vector index and its supported methods, see [Creating a vector index]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/).
+
+## Step 3: Ingest documents into the index
+
+To ingest documents into the index created in the previous step, send the following request:
+
+```json
+PUT /my-nlp-index/_doc/1
+{
+  "image_description": "Orange table",
+  "image_binary": "iVBORw0KGgoAAAANSUI..."
+}
+```
+{% include copy-curl.html %}
+
+Before the document is ingested into the index, the ingest pipeline runs the `text_image_embedding` processor on the document, generating vector embeddings for the `image_description` and `image_binary` fields. In addition to the original `image_description` and `image_binary` fields, the indexed document includes the `vector_embedding` field, which contains the combined vector embeddings.
+
+## Step 4: Search the index
+
+To perform a vector search on your index, use the `neural` query clause in either the [Search for a Model API]({{site.url}}{{site.baseurl}}/vector-search/api/knn/#search-for-a-model) or [Query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index/) queries.
You can refine the results by using a [vector search filter]({{site.url}}{{site.baseurl}}/search-plugins/knn/filter-search-knn/). You can search by text, image, or both text and image. + +The following example request uses a neural query to search for text and image: + +```json +GET /my-nlp-index/_search +{ + "size": 10, + "query": { + "neural": { + "vector_embedding": { + "query_text": "Orange table", + "query_image": "iVBORw0KGgoAAAANSUI...", + "model_id": "-fYQAosBQkdnhhBsK593", + "k": 5 + } + } + } +} +``` +{% include copy-curl.html %} + +To eliminate passing the model ID with each neural query request, you can set a default model on a vector index or a field. To learn more, see [Setting a default model on an index or field]({{site.url}}{{site.baseurl}}/search-plugins/neural-text-search/##setting-a-default-model-on-an-index-or-field). + +## Next steps + +- Explore our [tutorials]({{site.url}}{{site.baseurl}}/vector-search/tutorials/) to learn how to build AI search applications. \ No newline at end of file diff --git a/_search-plugins/neural-sparse-with-pipelines.md b/_vector-search/ai-search/neural-sparse-custom.md similarity index 55% rename from _search-plugins/neural-sparse-with-pipelines.md rename to _vector-search/ai-search/neural-sparse-custom.md index ef7044494a6..679e62b858b 100644 --- a/_search-plugins/neural-sparse-with-pipelines.md +++ b/_vector-search/ai-search/neural-sparse-custom.md @@ -1,48 +1,29 @@ --- layout: default -title: Configuring ingest pipelines +title: Using custom configurations for neural sparse search parent: Neural sparse search -nav_order: 10 +grand_parent: AI search +nav_order: 20 has_children: false +redirect_from: + - /search-plugins/neural-sparse-with-pipelines/ --- -# Configuring ingest pipelines for neural sparse search +# Using custom configurations for neural sparse search -Generating sparse vector embeddings within OpenSearch enables neural sparse search to function like lexical search. To take advantage of this encapsulation, set up an ingest pipeline to create and store sparse vector embeddings from document text during ingestion. At query time, input plain text, which will be automatically converted into vector embeddings for search. +Neural sparse search using automatically generated vector embeddings operates in two modes: doc-only and bi-encoder. For more information, see [Generating sparse vector embeddings automatically]({{site.url}}{{site.baseurl}}/vector-search/ai-search/neural-sparse-with-pipelines/). -For this tutorial, you'll use neural sparse search with OpenSearch's built-in machine learning (ML) model hosting and ingest pipelines. Because the transformation of text to embeddings is performed within OpenSearch, you'll use text when ingesting and searching documents. +At query time, you can use custom models in the following ways: -At ingestion time, neural sparse search uses a sparse encoding model to generate sparse vector embeddings from text fields. +- **Bi-encoder mode**: Use your deployed sparse encoding model to generate embeddings from query text. This must be the same model you used at ingestion time. -At query time, neural sparse search operates in one of two search modes: +- **Doc-only mode with a custom tokenizer**: Use your deployed tokenizer model to tokenize query text. The token weights are obtained from a precomputed lookup table. -- **Bi-encoder mode** (requires a sparse encoding model): A sparse encoding model generates sparse vector embeddings from both documents and query text. 
This approach provides better search relevance at the cost of an increase in latency. - -- **Doc-only mode** (requires a sparse encoding model and a tokenizer): A sparse encoding model generates sparse vector embeddings from documents. In this mode, neural sparse search tokenizes query text using a tokenizer and obtains the token weights from a lookup table. This approach provides faster retrieval at the cost of a slight decrease in search relevance. The tokenizer is deployed and invoked using the [Model API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/index/) for a uniform neural sparse search experience. - -For more information about choosing the neural sparse search mode that best suits your workload, see [Choose the search mode](#step-1a-choose-the-search-mode). - -## Tutorial - -This tutorial consists of the following steps: - -1. [**Configure a sparse encoding model/tokenizer**](#step-1-configure-a-sparse-encoding-modeltokenizer). - 1. [Choose the search mode](#step-1a-choose-the-search-mode) - 1. [Register the model/tokenizer](#step-1b-register-the-modeltokenizer) - 1. [Deploy the model/tokenizer](#step-1c-deploy-the-modeltokenizer) -1. [**Ingest data**](#step-2-ingest-data) - 1. [Create an ingest pipeline](#step-2a-create-an-ingest-pipeline) - 1. [Create an index for ingestion](#step-2b-create-an-index-for-ingestion) - 1. [Ingest documents into the index](#step-2c-ingest-documents-into-the-index) -1. [**Search the data**](#step-3-search-the-data) - -### Prerequisites - -Before you start, complete the [prerequisites]({{site.url}}{{site.baseurl}}/search-plugins/neural-search-tutorial/#prerequisites) for neural search. +The following is a complete example of using a custom model for neural sparse search. ## Step 1: Configure a sparse encoding model/tokenizer -Both the bi-encoder and doc-only search modes require you to configure a sparse encoding model. Doc-only mode requires you to configure a tokenizer in addition to the model. +You must configure a sparse encoding model for ingestion when using both the bi-encoder mode and the doc-only mode with a custom tokenizer. Bi-encoder mode uses the same model for search; doc-only mode uses a separate tokenizer for search. ### Step 1(a): Choose the search mode @@ -50,21 +31,30 @@ Choose the search mode and the appropriate model/tokenizer combination: - **Bi-encoder**: Use the `amazon/neural-sparse/opensearch-neural-sparse-encoding-v2-distill` model during both ingestion and search. -- **Doc-only**: Use the `amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v2-distill` model during ingestion and the `amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1` tokenizer during search. +- **Doc-only with a custom tokenizer**: Use the `amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v3-distill` model during ingestion and the `amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1` tokenizer during search. + +The following tables provide a search relevance comparison for all available combinations of the two search modes so that you can choose the best combination for your use case. -The following table provides a search relevance comparison for all available combinations of the two search modes so that you can choose the best combination for your use case. +#### English language models -| Mode | Ingestion model | Search model | Avg search relevance on BEIR | Model parameters | +| Mode | Ingestion model | Search model | Avg. 
search relevance on BEIR | Model parameters | |-----------|---------------------------------------------------------------|---------------------------------------------------------------|------------------------------|------------------| | Doc-only | `amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v1` | `amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1` | 0.49 | 133M | | Doc-only | `amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v2-distill` | `amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1` | 0.504 | 67M | | Doc-only | `amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v2-mini` | `amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1` | 0.497 | 23M | +| Doc-only | `amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v3-distill` | `amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1` | 0.517 | 67M | | Bi-encoder| `amazon/neural-sparse/opensearch-neural-sparse-encoding-v1` | `amazon/neural-sparse/opensearch-neural-sparse-encoding-v1` | 0.524 | 133M | | Bi-encoder| `amazon/neural-sparse/opensearch-neural-sparse-encoding-v2-distill` | `amazon/neural-sparse/opensearch-neural-sparse-encoding-v2-distill` | 0.528 | 67M | +#### Multilingual models + +| Mode | Ingestion model | Search model | Avg. search relevance on MIRACL | Model parameters | +|-----------|---------------------------------------------------------------|---------------------------------------------------------------|------------------------------|------------------| +| Doc-only | `amazon/neural-sparse/opensearch-neural-sparse-encoding-multilingual-v1` | `amazon/neural-sparse/opensearch-neural-sparse-tokenizer-multilingual-v1` | 0.629 | 168M | + ### Step 1(b): Register the model/tokenizer -When you register a model/tokenizer, OpenSearch creates a model group for the model/tokenizer. You can also explicitly create a model group before registering models. For more information, see [Model access control]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control/). +For both modes, register the sparse encoding model. For the doc-only mode with a custom tokenizer, register a custom tokenizer in addition to the sparse encoding model. #### Bi-encoder mode @@ -117,16 +107,16 @@ Once the task is complete, the task state will change to `COMPLETED` and the Tas Note the `model_id` of the model you've created; you'll need it for the following steps. -#### Doc-only mode +#### Doc-only mode with a custom tokenizer -When using doc-only mode, you need to register the `amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v2-distill` model, which you'll use at ingestion time, and the `amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1` tokenizer, which you'll use at search time. +When using the doc-only mode with a custom tokenizer, you need to register the `amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v3-distill` model, which you'll use at ingestion time, and the `amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1` tokenizer, which you'll use at search time. 
Register the sparse encoding model: ```json POST /_plugins/_ml/models/_register?deploy=true { - "name": "amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v2-distill", + "name": "amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v3-distill", "version": "1.0.0", "model_format": "TORCH_SCRIPT" } @@ -145,71 +135,7 @@ POST /_plugins/_ml/models/_register?deploy=true ``` {% include copy-curl.html %} -Like in the bi-encoder mode, use the Tasks API to check the status of the registration task. After the Tasks API returns the task state as `COMPLETED`. Note the `model_id` of the model and the tokenizer you've created; you'll need them for the following steps. - -### Step 1(c): Deploy the model/tokenizer - -Next, you'll need to deploy the model/tokenizer you registered. Deploying a model creates a model instance and caches the model in memory. - -#### Bi-encoder mode - -To deploy the model, provide its model ID to the `_deploy` endpoint: - -```json -POST /_plugins/_ml/models/<bi-encoder model ID>/_deploy -``` -{% include copy-curl.html %} - -As with the register operation, the deploy operation is asynchronous, so you'll get a task ID in the response: - -```json -{ - "task_id": "ale6f4oB5Vm0Tdw8NINO", - "status": "CREATED" -} -``` - -You can check the status of the task by using the Tasks API: - -```json -GET /_plugins/_ml/tasks/ale6f4oB5Vm0Tdw8NINO -``` -{% include copy-curl.html %} - -Once the task is complete, the task state will change to `COMPLETED`: - -```json -{ - "model_id": "<bi-encoder model ID>", - "task_type": "DEPLOY_MODEL", - "function_name": "SPARSE_ENCODING", - "state": "COMPLETED", - "worker_node": [ - "4p6FVOmJRtu3wehDD74hzQ" - ], - "create_time": 1694360024141, - "last_update_time": 1694360027940, - "is_async": true -} -``` - -#### Doc-only mode - -To deploy the model, provide its model ID to the `_deploy` endpoint: - -```json -POST /_plugins/_ml/models/<doc-only model ID>/_deploy -``` -{% include copy-curl.html %} - -You can deploy the tokenizer in the same way: - -```json -POST /_plugins/_ml/models/<tokenizer ID>/_deploy -``` -{% include copy-curl.html %} - -As with bi-encoder mode, you can check the status of both deploy tasks by using the Tasks API. Once the task is complete, the task state will change to `COMPLETED`. +Like in bi-encoder mode, use the Tasks API to check the status of the registration task. After the Tasks API returns, the task state changes to `COMPLETED`. Note the `model_id` of the model and the tokenizer you've created; you'll need them for the following steps. ## Step 2: Ingest data @@ -229,6 +155,8 @@ PUT /_ingest/pipeline/nlp-ingest-pipeline-sparse { "sparse_encoding": { "model_id": "<bi-encoder or doc-only model ID>", + "prune_type": "max_ratio", + "prune_ratio": 0.1, "field_map": { "passage_text": "passage_embedding" } @@ -300,7 +228,7 @@ PUT /my-nlp-index ``` {% include copy-curl.html %} -Once the `<token, weight>` pairs are excluded from the source, they cannot be recovered. Before applying this optimization, make sure you don't need the `<token, weight>` pairs for your application. +Once the `<token, weight>` pairs are excluded from the source, they cannot be recovered. Before applying this optimization, make sure you don't need the `<token, weight>` pairs for your application. 
{: .important} ### Step 2(c): Ingest documents into the index @@ -331,7 +259,7 @@ Before the document is ingested into the index, the ingest pipeline runs the `sp To perform a neural sparse search on your index, use the `neural_sparse` query clause in [Query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index/) queries. -The following example request uses a `neural_sparse` query to search for relevant documents using a raw text query. Provide the model ID for bi-encoder mode or the tokenizer ID for doc-only mode: +The following example request uses a `neural_sparse` query to search for relevant documents using a raw text query. Provide the model ID for bi-encoder mode or the tokenizer ID for doc-only mode with a custom tokenizer: ```json GET my-nlp-index/_search @@ -424,51 +352,16 @@ The response contains the matching documents: } ``` -To minimize disk and network I/O latency related to sparse embedding sources, you can exclude the embedding vector source from the query as follows: +## Configuring a default model for search -```json -GET my-nlp-index/_search -{ - "_source": { - "excludes": [ - "passage_embedding" - ] - }, - "query": { - "neural_sparse": { - "passage_embedding": { - "query_text": "Hi world", - "model_id": "<bi-encoder or tokenizer ID>" - } - } - } -} -``` -{% include copy-curl.html %} - -## Accelerating neural sparse search - -To learn more about improving retrieval time for neural sparse search, see [Accelerating neural sparse search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-search/#accelerating-neural-sparse-search). +When using custom models, you can configure a default model ID at the index level to simplify your queries. This eliminates the need to specify the `model_id` in every query. -## Creating a search pipeline for neural sparse search - -You can create a search pipeline that augments neural sparse search functionality by: - -- Accelerating neural sparse search for faster retrieval. -- Setting the default model ID on an index for easier use. - -To configure the pipeline, add a [`neural_sparse_two_phase_processor`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/neural-sparse-query-two-phase-processor/) or a [`neural_query_enricher`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/neural-query-enricher/) processor. The following request creates a pipeline with both processors: +First, create a search pipeline with a [`neural_query_enricher`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/neural-query-enricher/) processor: ```json PUT /_search/pipeline/neural_search_pipeline { "request_processors": [ - { - "neural_sparse_two_phase_processor": { - "tag": "neural-sparse", - "description": "Creates a two-phase processor for neural sparse search." - } - }, { "neural_query_enricher" : { "default_model_id": "<bi-encoder model/tokenizer ID>" @@ -479,7 +372,7 @@ PUT /_search/pipeline/neural_search_pipeline ``` {% include copy-curl.html %} -Then set the default pipeline for your index to the newly created search pipeline: +Then set this pipeline as the default for your index: ```json PUT /my-nlp-index/_settings @@ -489,23 +382,10 @@ PUT /my-nlp-index/_settings ``` {% include copy-curl.html %} -For more information about setting a default model on an index, or to learn how to set a default model on a specific field, see [Setting a default model on an index or field]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/#setting-a-default-model-on-an-index-or-field). 
- -## Troubleshooting - -This section contains information about resolving common issues encountered while running neural sparse search. - -### Remote connector throttling exceptions - -When using connectors to call a remote service such as Amazon SageMaker, ingestion and search calls sometimes fail because of remote connector throttling exceptions. +After configuring the default model, you can omit the `model_id` when running queries. -For OpenSearch versions earlier than 2.15, a throttling exception will be returned as an error from the remote service: +For more information about setting a default model on an index, or to learn how to set a default model on a specific field, see [Setting a default model on an index or field]({{site.url}}{{site.baseurl}}/search-plugins/semantic-search/#setting-a-default-model-on-an-index-or-field). -```json -{ - "type": "status_exception", - "reason": "Error from remote service: {\"message\":null}" -} -``` +## Next steps -To mitigate throttling exceptions, decrease the maximum number of connections specified in the `max_connection` setting in the connector's [`client_config`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/#configuration-parameters) object. Doing so will prevent the maximum number of concurrent connections from exceeding the threshold of the remote service. You can also modify the retry settings to avoid a request spike during ingestion. \ No newline at end of file +- Explore our [tutorials]({{site.url}}{{site.baseurl}}/vector-search/tutorials/) to learn how to build AI search applications. diff --git a/_search-plugins/neural-sparse-search.md b/_vector-search/ai-search/neural-sparse-search.md similarity index 73% rename from _search-plugins/neural-sparse-search.md rename to _vector-search/ai-search/neural-sparse-search.md index 0beee26ef0b..91c55ee8587 100644 --- a/_search-plugins/neural-sparse-search.md +++ b/_vector-search/ai-search/neural-sparse-search.md @@ -1,11 +1,13 @@ --- layout: default title: Neural sparse search +parent: AI search nav_order: 50 has_children: true redirect_from: - /search-plugins/neural-sparse-search/ - /search-plugins/sparse-search/ + - /search-plugins/neural-sparse-search/ --- # Neural sparse search @@ -18,10 +20,10 @@ To further boost search relevance, you can combine neural sparse search with den You can configure neural sparse search in the following ways: -- Generate vector embeddings within OpenSearch: Configure an ingest pipeline to generate and store sparse vector embeddings from document text at ingestion time. At query time, input plain text, which will be automatically converted into vector embeddings for search. For complete setup steps, see [Configuring ingest pipelines for neural sparse search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-with-pipelines/). -- Ingest raw sparse vectors and search using sparse vectors directly. For complete setup steps, see [Ingesting and searching raw vectors]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-with-raw-vectors/). +- Generate vector embeddings automatically: Configure an ingest pipeline to generate and store sparse vector embeddings from document text at ingestion time. At query time, input plain text, which will be automatically converted into vector embeddings for search. For complete setup steps, see [Generating sparse vector embeddings automatically]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-with-pipelines/). +- Ingest raw sparse vectors and search using sparse vectors directly. 
For complete setup steps, see [Neural sparse search using raw vectors]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-with-raw-vectors/). -To learn more about splitting long text into passages for neural search, see [Text chunking]({{site.url}}{{site.baseurl}}/search-plugins/text-chunking/). +To learn more about splitting long text into passages for neural sparse search, see [Text chunking]({{site.url}}{{site.baseurl}}/search-plugins/text-chunking/). ## Accelerating neural sparse search @@ -56,7 +58,12 @@ PUT /my-nlp-index/_settings For information about `two_phase_search_pipeline`, see [Neural sparse query two-phase processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/neural-sparse-query-two-phase-processor/). +## Text chunking + +For information about splitting large documents into smaller passages before generating embeddings, see [Text chunking]({{site.url}}{{site.baseurl}}/vector-search/ingesting-data/text-chunking/). + ## Further reading - Learn more about how sparse encoding models work and explore OpenSearch neural sparse search benchmarks in [Improving document retrieval with sparse semantic encoders](https://opensearch.org/blog/improving-document-retrieval-with-sparse-semantic-encoders/). - Learn the fundamentals of neural sparse search and its efficiency in [A deep dive into faster semantic sparse retrieval in OpenSearch 2.12](https://opensearch.org/blog/A-deep-dive-into-faster-semantic-sparse-retrieval-in-OS-2.12/). +- Explore our [tutorials]({{site.url}}{{site.baseurl}}/vector-search/tutorials/) to learn how to build AI search applications. diff --git a/_vector-search/ai-search/neural-sparse-with-pipelines.md b/_vector-search/ai-search/neural-sparse-with-pipelines.md new file mode 100644 index 00000000000..4d879efa66d --- /dev/null +++ b/_vector-search/ai-search/neural-sparse-with-pipelines.md @@ -0,0 +1,701 @@ +--- +layout: default +title: Generating sparse vector embeddings automatically +parent: Neural sparse search +grand_parent: AI search +nav_order: 10 +has_children: false +redirect_from: + - /search-plugins/neural-sparse-with-pipelines/ +--- + +# Generating sparse vector embeddings automatically + +Generating sparse vector embeddings automatically enables neural sparse search to function like lexical search. To take advantage of this encapsulation, set up an ingest pipeline to create and store sparse vector embeddings from document text during ingestion. At query time, input plain text, which will be automatically converted into vector embeddings for search. + +Neural sparse search works as follows: + +- At ingestion time, neural sparse search uses a sparse encoding model to generate sparse vector embeddings from text fields. + +- At query time, neural sparse search operates in one of two search modes: + + - **Doc-only mode (default)**: A sparse encoding model generates sparse vector embeddings from documents at ingestion time. At query time, neural sparse search tokenizes query text and obtains the token weights from a lookup table. This approach provides faster retrieval at the cost of a slight decrease in search relevance. The query-time tokenization can be performed by the following components: + - **A DL model analyzer (default)**: A [DL model analyzer]({{site.url}}{{site.baseurl}}/analyzers/supported-analyzers/dl-model-analyzers/) uses a built-in ML model. This approach provides faster retrieval at the cost of a slight decrease in search relevance. 
+ - **A custom tokenizer**: You can deploy a custom tokenizer using the [Model API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/index/) to tokenize query text. This approach provides more flexibility while maintaining consistent tokenization across your neural sparse search implementation. + + - **Bi-encoder mode**: A sparse encoding model generates sparse vector embeddings from both documents and query text. This approach provides better search relevance at the cost of an increase in latency. + +We recommend using the default doc-only mode with a DL analyzer because it provides the best balance of performance and relevance for most use cases. +{: tip} + +The default doc-only mode with an analyzer works as follows: + +1. At ingestion time: + - Your registered sparse encoding model generates sparse vector embeddings. + - These embeddings are stored as token-weight pairs in your index. + +2. At search time: + - The query text is analyzed using a built-in DL model analyzer (which uses a corresponding built-in ML model tokenizer). + - The token weights are obtained from a precomputed lookup table that's built into OpenSearch. + - The tokenization matches what the sparse encoding model expects because they both use the same tokenization scheme. + +Thus, you must choose and apply an ML model at ingestion time, but you only need to specify an analyzer (not a model) at search time. + +## Sparse encoding model/analyzer compatibility + +The following table lists all available models for use in doc-only mode. Each model is paired with its compatible analyzer that should be used at search time. Choose based on your language needs (English or multilingual) and performance requirements. + +| Model | Analyzer | BEIR relevance | MIRACL relevance | Model parameters | +| ------------------------------------------------------------------------ | --------------- | -------------- | ---------------- | ---------------- | +| `amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v1` | `bert-uncased` | 0.490 | N/A | 133M | +| `amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v2-distill` | `bert-uncased` | 0.504 | N/A | 67M | +| `amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v2-mini` | `bert-uncased` | 0.497 | N/A | 23M | +| `amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v3-distill` | `bert-uncased` | 0.517 | N/A | 67M | +| `amazon/neural-sparse/opensearch-neural-sparse-encoding-multilingual-v1` | `mbert-uncased` | 0.500 | 0.629 | 168M | + +## Example: Using the default doc-only mode with an analyzer + +This example uses the recommended **doc-only** mode with a **DL model analyzer**. In this mode, OpenSearch applies a sparse encoding model at ingestion time and a compatible DL model analyzer at search time. For examples of other modes, see [Using custom configurations for neural sparse search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/neural-sparse-custom/). + +For this example, you'll use neural sparse search with OpenSearch's built-in machine learning (ML) model hosting and ingest pipelines. Because the transformation of text to embeddings is performed within OpenSearch, you'll use text when ingesting and searching documents. + +### Prerequisites + +Before you start, complete the [prerequisites]({{site.url}}{{site.baseurl}}/search-plugins/neural-search-tutorial/#prerequisites). 
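+
+Optionally, before you begin, you can inspect how a built-in DL model analyzer tokenizes text by calling the Analyze API. The following request is a minimal sketch that assumes the `bert-uncased` analyzer listed in the preceding table is available in your cluster; the returned tokens are the ones matched against the stored token weights at search time in doc-only mode:
+
+```json
+GET /_analyze
+{
+  "analyzer": "bert-uncased",
+  "text": "Hi world"
+}
+```
+{% include copy-curl.html %}
+
+The response lists the generated tokens in the `tokens` array.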
+ +### Step 1: Configure a sparse encoding model for ingestion + +To use doc-only mode, first [choose a sparse encoding model](#sparse-encoding-modelanalyzer-compatibility) to be used at ingestion time. Then, register and deploy the model. For example, to register and deploy the `opensearch-neural-sparse-encoding-doc-v3-distill` model, use the following request: + +```json +POST /_plugins/_ml/models/_register?deploy=true +{ + "name": "amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v3-distill", + "version": "1.0.0", + "model_format": "TORCH_SCRIPT" +} +``` +{% include copy-curl.html %} + +Registering a model is an asynchronous task. OpenSearch returns a task ID for every model you register: + +```json +{ + "task_id": "aFeif4oB5Vm0Tdw8yoN7", + "status": "CREATED" +} +``` + +You can check the status of the task by calling the Tasks API: + +```json +GET /_plugins/_ml/tasks/aFeif4oB5Vm0Tdw8yoN7 +``` +{% include copy-curl.html %} + +Once the task is complete, the task state will change to `COMPLETED` and the Tasks API response will contain the model ID of the registered model: + +```json +{ + "model_id": "<model ID>", + "task_type": "REGISTER_MODEL", + "function_name": "SPARSE_ENCODING", + "state": "COMPLETED", + "worker_node": [ + "4p6FVOmJRtu3wehDD74hzQ" + ], + "create_time": 1694358489722, + "last_update_time": 1694358499139, + "is_async": true +} +``` + +Note the `model_id` of the model you've created; you'll need it for the following steps. + +### Step 2: Create an ingest pipeline + +To generate sparse vector embeddings, you need to create an [ingest pipeline]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/index/) that contains a [`sparse_encoding` processor]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/processors/sparse-encoding/), which will convert the text in a document field to vector embeddings. The processor's `field_map` determines the input fields from which to generate vector embeddings and the output fields in which to store the embeddings. + +The following example request creates an ingest pipeline where the text from `passage_text` will be converted into sparse vector embeddings, which will be stored in `passage_embedding`. Provide the model ID of the registered model in the request: + +```json +PUT /_ingest/pipeline/nlp-ingest-pipeline-sparse +{ + "description": "An sparse encoding ingest pipeline", + "processors": [ + { + "sparse_encoding": { + "model_id": "<bi-encoder or doc-only model ID>", + "prune_type": "max_ratio", + "prune_ratio": 0.1, + "field_map": { + "passage_text": "passage_embedding" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +To split long text into passages, use the `text_chunking` ingest processor before the `sparse_encoding` processor. For more information, see [Text chunking]({{site.url}}{{site.baseurl}}/search-plugins/text-chunking/). + +### Step 3: Create an index for ingestion + +In order to use the sparse encoding processor defined in your pipeline, create a rank features index, adding the pipeline created in the previous step as the default pipeline. Ensure that the fields defined in the `field_map` are mapped as correct types. Continuing with the example, the `passage_embedding` field must be mapped as [`rank_features`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/rank/#rank-features). Similarly, the `passage_text` field must be mapped as `text`. 
+ +The following example request creates a rank features index configured with a default ingest pipeline: + +```json +PUT /my-nlp-index +{ + "settings": { + "default_pipeline": "nlp-ingest-pipeline-sparse" + }, + "mappings": { + "properties": { + "id": { + "type": "text" + }, + "passage_embedding": { + "type": "rank_features" + }, + "passage_text": { + "type": "text" + } + } + } +} +``` +{% include copy-curl.html %} + +To save disk space, you can exclude the embedding vector from the source as follows: + +```json +PUT /my-nlp-index +{ + "settings": { + "default_pipeline": "nlp-ingest-pipeline-sparse" + }, + "mappings": { + "_source": { + "excludes": [ + "passage_embedding" + ] + }, + "properties": { + "id": { + "type": "text" + }, + "passage_embedding": { + "type": "rank_features" + }, + "passage_text": { + "type": "text" + } + } + } +} +``` +{% include copy-curl.html %} + +Once the `<token, weight>` pairs are excluded from the source, they cannot be recovered. Before applying this optimization, make sure you don't need the `<token, weight>` pairs for your application. +{: .important} + +### Step 4: Ingest documents into the index + +To ingest documents into the index created in the previous step, send the following requests: + +```json +PUT /my-nlp-index/_doc/1 +{ + "passage_text": "Hello world", + "id": "s1" +} +``` +{% include copy-curl.html %} + +```json +PUT /my-nlp-index/_doc/2 +{ + "passage_text": "Hi planet", + "id": "s2" +} +``` +{% include copy-curl.html %} + +Before the document is ingested into the index, the ingest pipeline runs the `sparse_encoding` processor on the document, generating vector embeddings for the `passage_text` field. The indexed document includes the `passage_text` field, which contains the original text, and the `passage_embedding` field, which contains the vector embeddings. + + +### Step 5: Search the data + +To perform a neural sparse search on your index, use the `neural_sparse` query clause in [Query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index/) queries. + +The following example request uses a `neural_sparse` query to search for relevant documents using a raw text query. Specify the `analyzer` compatible with the model you chose (see [Sparse encoding model/analyzer compatibility](#sparse-encoding-modelanalyzer-compatibility)): + +```json +GET my-nlp-index/_search +{ + "query": { + "neural_sparse": { + "passage_embedding": { + "query_text": "Hi world", + "analyzer": "bert-uncased" + } + } + } +} +``` +{% include copy-curl.html %} + +If you don't specify an analyzer, the default `bert-uncased` analyzer is used. Thus, this query is equivalent to the preceding one: + +```json +GET my-nlp-index/_search +{ + "query": { + "neural_sparse": { + "passage_embedding": { + "query_text": "Hi world" + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains the matching documents: + +```json +{ + "took" : 688, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 2, + "relation" : "eq" + }, + "max_score" : 30.0029, + "hits" : [ + { + "_index" : "my-nlp-index", + "_id" : "1", + "_score" : 30.0029, + "_source" : { + "passage_text" : "Hello world", + "passage_embedding" : { + "!" 
: 0.8708904, + "door" : 0.8587369, + "hi" : 2.3929274, + "worlds" : 2.7839446, + "yes" : 0.75845814, + "##world" : 2.5432441, + "born" : 0.2682308, + "nothing" : 0.8625516, + "goodbye" : 0.17146169, + "greeting" : 0.96817183, + "birth" : 1.2788506, + "come" : 0.1623208, + "global" : 0.4371151, + "it" : 0.42951578, + "life" : 1.5750692, + "thanks" : 0.26481047, + "world" : 4.7300377, + "tiny" : 0.5462298, + "earth" : 2.6555297, + "universe" : 2.0308156, + "worldwide" : 1.3903781, + "hello" : 6.696973, + "so" : 0.20279501, + "?" : 0.67785245 + }, + "id" : "s1" + } + }, + { + "_index" : "my-nlp-index", + "_id" : "2", + "_score" : 16.480486, + "_source" : { + "passage_text" : "Hi planet", + "passage_embedding" : { + "hi" : 4.338913, + "planets" : 2.7755864, + "planet" : 5.0969057, + "mars" : 1.7405145, + "earth" : 2.6087382, + "hello" : 3.3210192 + }, + "id" : "s2" + } + } + ] + } +} +``` + +To minimize disk and network I/O latency related to sparse embedding sources, you can exclude the embedding vector source from the query as follows: + +```json +GET my-nlp-index/_search +{ + "_source": { + "excludes": [ + "passage_embedding" + ] + }, + "query": { + "neural_sparse": { + "passage_embedding": { + "query_text": "Hi world", + "analyzer": "bert-uncased" + } + } + } +} +``` +{% include copy-curl.html %} + +## Bi-encoder mode + +In bi-encoder mode, register and deploy a bi-encoder model to use at both ingestion and query time: + +```json +POST /_plugins/_ml/models/_register?deploy=true +{ + "name": "amazon/neural-sparse/opensearch-neural-sparse-encoding-v2-distill", + "version": "1.0.0", + "model_format": "TORCH_SCRIPT" +} +``` +{% include copy-curl.html %} + +After deployment, use the same `model_id` for search: + +```json +GET my-nlp-index/_search +{ + "query": { + "neural_sparse": { + "passage_embedding": { + "query_text": "Hi world", + "model_id": "<bi-encoder model_id>" + } + } + } +} +``` +{% include copy-curl.html %} + +For a complete example, see [Using custom configurations for neural sparse search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/neural-sparse-custom/). + +## Doc-only mode with a custom tokenizer + +You can use doc-only mode with a custom tokenizer. To deploy a tokenizer, send the following request: + +```json +POST /_plugins/_ml/models/_register?deploy=true +{ + "name": "amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1", + "version": "1.0.1", + "model_format": "TORCH_SCRIPT" +} +``` +{% include copy-curl.html %} + +After deployment, use the `model_id` of the tokenizer in your query: + +```json +GET my-nlp-index/_search +{ + "query": { + "neural_sparse": { + "passage_embedding": { + "query_text": "Hi world", + "model_id": "<tokenizer model_id>" + } + } + } +} +``` +{% include copy-curl.html %} + +For a complete example, see [Using custom configurations for neural sparse search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/neural-sparse-custom/). + +## Using a semantic field + +Using a `semantic` field simplifies neural sparse search configuration. To use a `semantic` field, follow these steps. For more information, see [Semantic field type]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/semantic/). + +### Step 1: Register and deploy a sparse encoding model + +First, register and deploy a sparse encoding model as described in [Step 1](#step-1-register-and-deploy-a-sparse-encoding-model). 
+ +## Step 2: Create an index with a semantic field for ingestion + +The sparse encoding model configured in the previous step is used at ingestion time to generate sparse vector embeddings. When using a `semantic` field, set the `model_id` to the ID of the model used for ingestion. For doc-only mode, you can additionally specify the model to be used at query time by providing its ID in the `search_model_id` field. + +The following example shows how to create an index with a `semantic` field configured in doc-only mode using a sparse encoding model. To enable automatic splitting of long text into smaller passages, set `chunking` to `true` in the semantic field configuration: + +```json +PUT /my-nlp-index +{ + "mappings": { + "properties": { + "id": { + "type": "text" + }, + "passage_text": { + "type": "semantic", + "model_id": "_kPwYJcBmp4cG9LrUQsE", + "search_model_id": "AUPwYJcBmp4cG9LrmQy8", + "chunking": true + } + } + } +} +``` +{% include copy-curl.html %} + +After creating the index, you can retrieve its mapping to verify that the embedding field was automatically created: + +```json +GET /my-nlp-index/_mapping +{ + "my-nlp-index": { + "mappings": { + "properties": { + "id": { + "type": "text" + }, + "passage_text": { + "type": "semantic", + "model_id": "_kPwYJcBmp4cG9LrUQsE", + "search_model_id": "AUPwYJcBmp4cG9LrmQy8", + "raw_field_type": "text", + "chunking": true + }, + "passage_text_semantic_info": { + "properties": { + "chunks": { + "type": "nested", + "properties": { + "embedding": { + "type": "rank_features" + }, + "text": { + "type": "text" + } + } + }, + "model": { + "properties": { + "id": { + "type": "text", + "index": false + }, + "name": { + "type": "text", + "index": false + }, + "type": { + "type": "text", + "index": false + } + } + } + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +An object field named `passage_text_semantic_info` is automatically created. It includes a `ran_features` subfield for storing the embedding, along with additional text fields for capturing model metadata. + +### Step 3: Ingest documents into the index + +To ingest documents into the index created in the previous step, send the following requests: + +```json +PUT /my-nlp-index/_doc/1 +{ + "passage_text": "Hello world", + "id": "s1" +} +``` +{% include copy-curl.html %} + +```json +PUT /my-nlp-index/_doc/2 +{ + "passage_text": "Hi planet", + "id": "s2" +} +``` +{% include copy-curl.html %} + +Before a document is ingested into the index, OpenSearch automatically chunks the text and generates sparse vector embeddings for each chunk. To verify that the embedding is generated properly, you can run a search request to retrieve the document: + +```json +GET /my-nlp-index/_doc/1 +{ + "_index": "my-nlp-index", + "_id": "1", + "_version": 1, + "_seq_no": 0, + "_primary_term": 1, + "found": true, + "_source": { + "passage_text": "Hello world", + "passage_text_semantic_info": { + "chunks": [ + { + "text": "Hello world", + "embedding": { + "hi": 0.5843902, + ... + } + } + ], + "model": { + "name": "amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v3-distill", + "id": "_kPwYJcBmp4cG9LrUQsE", + "type": "SPARSE_ENCODING" + } + }, + "id": "s1" + } +} +``` +{% include copy-curl.html %} + +## Step 3: Search the data + +To search the embeddings of the semantic field, use the `neural` query clause in [Query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index/) queries. + +The following example uses a `neural` query to search for relevant documents using text input. 
You only need to specify the `semantic` field name---OpenSearch automatically rewrites the query and applies it to the underlying embedding field, appropriately handling any nested objects. There's no need to provide the `model_id` in the query because OpenSearch retrieves it from the `semantic` field's configuration in the index mapping: + +```json +GET my-nlp-index/_search +{ + "_source": { + "excludes": [ + "passage_text_semantic_info" + ] + }, + "query": { + "neural": { + "passage_text": { + "query_text": "Hi world" + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains the matching documents: + +```json +{ + "took": 19, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 6.437132, + "hits": [ + { + "_index": "my-nlp-index", + "_id": "1", + "_score": 6.437132, + "_source": { + "passage_text": "Hello world", + "id": "s1" + } + }, + { + "_index": "my-nlp-index", + "_id": "2", + "_score": 5.063226, + "_source": { + "passage_text": "Hi planet", + "id": "s2" + } + } + ] + } +} +``` + +Alternatively, you can use a built-in analyzer to tokenize the query text: + +```json +GET my-nlp-index/_search +{ + "_source": { + "excludes": [ + "passage_text_semantic_info" + ] + }, + "query": { + "neural": { + "passage_text": { + "query_text": "Hi world", + "semantic_field_search_analyzer": "bert-uncased" + } + } + } +} +``` +{% include copy-curl.html %} + +To simplify the query further, you can define the `semantic_field_search_analyzer` in the `semantic` field configuration. This allows you to omit the analyzer from the query itself because OpenSearch automatically applies the configured analyzer during search. + +## Accelerating neural sparse search + +To learn more about improving retrieval time for neural sparse search, see [Accelerating neural sparse search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-search/#accelerating-neural-sparse-search). + +If you're using `semantic` fields with a `neural` query, query acceleration is currently **not supported**. You can achieve acceleration by running a `neural_sparse` query directly against the underlying `rank_features` field. +{: .note} + +## Troubleshooting + +This section contains information about resolving common issues encountered while running neural sparse search. + +### Remote connector throttling exceptions + +When using connectors to call a remote service such as Amazon SageMaker, ingestion and search calls sometimes fail because of remote connector throttling exceptions. + +For OpenSearch versions earlier than 2.15, a throttling exception will be returned as an error from the remote service: + +```json +{ + "type": "status_exception", + "reason": "Error from remote service: {\"message\":null}" +} +``` + +To mitigate throttling exceptions, decrease the maximum number of connections specified in the `max_connection` setting in the connector's [`client_config`]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/#configuration-parameters) object. Doing so will prevent the maximum number of concurrent connections from exceeding the threshold of the remote service. You can also modify the retry settings to avoid a request spike during ingestion. + +## Next steps + +- To learn how to use custom neural sparse search configurations, see [Using custom configurations for neural sparse search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/neural-sparse-custom/). 
+- To learn more about improving retrieval time for neural sparse search, see [Accelerating neural sparse search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-search/#accelerating-neural-sparse-search). +- To learn how to build AI search applications, explore our [tutorials]({{site.url}}{{site.baseurl}}/vector-search/tutorials/). \ No newline at end of file diff --git a/_search-plugins/neural-sparse-with-raw-vectors.md b/_vector-search/ai-search/neural-sparse-with-raw-vectors.md similarity index 64% rename from _search-plugins/neural-sparse-with-raw-vectors.md rename to _vector-search/ai-search/neural-sparse-with-raw-vectors.md index d69a789a1d4..daa88c2c5c2 100644 --- a/_search-plugins/neural-sparse-with-raw-vectors.md +++ b/_vector-search/ai-search/neural-sparse-with-raw-vectors.md @@ -1,32 +1,25 @@ --- layout: default -title: Using raw vectors +title: Neural sparse search using raw vectors parent: Neural sparse search -nav_order: 20 +grand_parent: AI search +nav_order: 30 has_children: false +redirect_from: + - /search-plugins/neural-sparse-with-raw-vectors/ --- -# Using raw vectors for neural sparse search +# Neural sparse search using raw vectors -If you're using self-hosted sparse embedding models, you can ingest raw sparse vectors and use neural sparse search. +If you're using self-hosted sparse embedding models, you can ingest raw sparse vectors for use in neural sparse search. -## Tutorial +## Example -This tutorial consists of the following steps: +The following example ingests sparse vectors into an OpenSearch index and then uses a sparse vector to search for matching documents. -1. [**Ingest sparse vectors**](#step-1-ingest-sparse-vectors) - 1. [Create an index](#step-1a-create-an-index) - 1. [Ingest documents into the index](#step-1b-ingest-documents-into-the-index) -1. [**Search the data using raw sparse vector**](#step-2-search-the-data-using-a-sparse-vector). +### Step 1: Create an index - -## Step 1: Ingest sparse vectors - -Once you have generated sparse vector embeddings, you can directly ingest them into OpenSearch. - -### Step 1(a): Create an index - -In order to ingest documents containing raw sparse vectors, create a rank features index: +To ingest documents containing raw sparse vectors, create a rank features index: ```json PUT /my-nlp-index @@ -48,7 +41,7 @@ PUT /my-nlp-index ``` {% include copy-curl.html %} -### Step 1(b): Ingest documents into the index +### Step 2: Ingest documents into the index To ingest documents into the index created in the previous step, send the following request: @@ -69,7 +62,7 @@ PUT /my-nlp-index/_doc/1 ``` {% include copy-curl.html %} -## Step 2: Search the data using a sparse vector +### Step 3: Search the data using a sparse vector To search the documents using a sparse vector, provide the sparse embeddings in the `neural_sparse` query: @@ -97,3 +90,7 @@ GET my-nlp-index/_search ## Accelerating neural sparse search To learn more about improving retrieval time for neural sparse search, see [Accelerating neural sparse search]({{site.url}}{{site.baseurl}}/search-plugins/neural-sparse-search/#accelerating-neural-sparse-search). + +## Next steps + +- Explore our [tutorials]({{site.url}}{{site.baseurl}}/vector-search/tutorials/) to learn how to build AI search applications. 
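For reference, a raw-vector `neural_sparse` query generally takes the following shape: instead of `query_text` and a model ID or analyzer, the tokens and their weights are passed directly in `query_tokens`. The field name and token weights below are illustrative, reusing values from the example embeddings shown earlier:

```json
GET my-nlp-index/_search
{
  "query": {
    "neural_sparse": {
      "passage_embedding": {
        "query_tokens": {
          "hi": 4.338913,
          "planets": 2.7755864,
          "mars": 1.7405145,
          "earth": 2.6087382
        }
      }
    }
  }
}
```
{% include copy-curl.html %}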
diff --git a/_vector-search/ai-search/semantic-search.md b/_vector-search/ai-search/semantic-search.md new file mode 100644 index 00000000000..0dccb79e795 --- /dev/null +++ b/_vector-search/ai-search/semantic-search.md @@ -0,0 +1,529 @@ +--- +layout: default +title: Semantic search +parent: AI search +nav_order: 35 +has_children: false +redirect_from: + - /search-plugins/neural-text-search/ + - /search-plugins/semantic-search/ +--- + +# Semantic search + +Semantic search considers the context and intent of a query. In OpenSearch, semantic search is facilitated by text embedding models. Semantic search creates a dense vector (a list of floats) and ingests data into a vector index. + +**PREREQUISITE**<br> +Before using semantic search, you must set up a text embedding model. For more information, see [Choosing a model]({{site.url}}{{site.baseurl}}/ml-commons-plugin/integrating-ml-models/#choosing-a-model). +{: .note} + +## Configuring semantic search + +There are three ways to configure semantic search: + +- [**Automated workflow**](#automated-workflow) (Recommended for quick setup): Automatically create an ingest pipeline and index with minimal configuration. +- [**Manual setup**](#manual-setup) (Recommended for custom configurations): Manually configure each component for greater flexibility and control. +- [**Using a semantic field**](#using-a-semantic-field) (Recommended for quick setup with optional customization): Manually configure the index using `semantic` fields to simplify the setup process while still allowing for some level of configuration. + +## Automated workflow + +OpenSearch provides a [workflow template]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-templates/#semantic-search) that automatically creates both an ingest pipeline and an index. You must provide the model ID for the configured model when creating a workflow. Review the semantic search workflow template [defaults](https://github.com/opensearch-project/flow-framework/blob/main/src/main/resources/defaults/semantic-search-defaults.json) to determine whether you need to update any of the parameters. For example, if the model dimensionality is different from the default (`1024`), specify the dimensionality of your model in the `output_dimension` parameter. To create the default semantic search workflow, send the following request: + +```json +POST /_plugins/_flow_framework/workflow?use_case=semantic_search&provision=true +{ + "create_ingest_pipeline.model_id": "mBGzipQB2gmRjlv_dOoB" +} +``` +{% include copy-curl.html %} + +OpenSearch responds with a workflow ID for the created workflow: + +```json +{ + "workflow_id" : "U_nMXJUBq_4FYQzMOS4B" +} +``` + +To check the workflow status, send the following request: + +```json +GET /_plugins/_flow_framework/workflow/U_nMXJUBq_4FYQzMOS4B/_status +``` +{% include copy-curl.html %} + +Once the workflow completes, the `state` changes to `COMPLETED`. The workflow creates the following components: + +- An ingest pipeline named `nlp-ingest-pipeline` +- An index named `my-nlp-index` + +You can now continue with [steps 3 and 4](#step-3-ingest-documents-into-the-index) to ingest documents into the index and search the index. + +## Manual setup + +To manually configure semantic search, follow these steps: + +1. [Create an ingest pipeline](#step-1-create-an-ingest-pipeline). +1. [Create an index for ingestion](#step-2-create-an-index-for-ingestion). +1. [Ingest documents into the index](#step-3-ingest-documents-into-the-index). +1. 
[Search the index](#step-4-search-the-index). + +### Step 1: Create an ingest pipeline + +To generate vector embeddings, you need to create an [ingest pipeline]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/index/) that contains a [`text_embedding` processor]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/processors/text-embedding/), which will convert the text in a document field to vector embeddings. The processor's `field_map` determines the input fields from which to generate vector embeddings and the output fields in which to store the embeddings. + +The following example request creates an ingest pipeline where the text from `passage_text` will be converted into text embeddings and the embeddings will be stored in `passage_embedding`: + +```json +PUT /_ingest/pipeline/nlp-ingest-pipeline +{ + "description": "A text embedding pipeline", + "processors": [ + { + "text_embedding": { + "model_id": "bQ1J8ooBpBj3wT4HVUsb", + "field_map": { + "passage_text": "passage_embedding" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +To split long text into passages, use the `text_chunking` ingest processor before the `text_embedding` processor. For more information, see [Text chunking]({{site.url}}{{site.baseurl}}/search-plugins/text-chunking/). + +### Step 2: Create an index for ingestion + +In order to use the text embedding processor defined in your pipeline, create a vector index, adding the pipeline created in the previous step as the default pipeline. Ensure that the fields defined in the `field_map` are mapped as correct types. Continuing with the example, the `passage_embedding` field must be mapped as a k-NN vector with a dimension that matches the model dimension. Similarly, the `passage_text` field should be mapped as `text`. + +The following example request creates a vector index that is set up with a default ingest pipeline: + +```json +PUT /my-nlp-index +{ + "settings": { + "index.knn": true, + "default_pipeline": "nlp-ingest-pipeline" + }, + "mappings": { + "properties": { + "id": { + "type": "text" + }, + "passage_embedding": { + "type": "knn_vector", + "dimension": 768, + "method": { + "engine": "lucene", + "space_type": "l2", + "name": "hnsw", + "parameters": {} + } + }, + "passage_text": { + "type": "text" + } + } + } +} +``` +{% include copy-curl.html %} + +For more information about creating a vector index and its supported methods, see [Creating a vector index]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/). + +### Step 3: Ingest documents into the index + +To ingest documents into the index created in the previous step, send the following requests: + +```json +PUT /my-nlp-index/_doc/1 +{ + "passage_text": "Hello world", + "id": "s1" +} +``` +{% include copy-curl.html %} + +```json +PUT /my-nlp-index/_doc/2 +{ + "passage_text": "Hi planet", + "id": "s2" +} +``` +{% include copy-curl.html %} + +Before the document is ingested into the index, the ingest pipeline runs the `text_embedding` processor on the document, generating text embeddings for the `passage_text` field. The indexed document includes the `passage_text` field, which contains the original text, and the `passage_embedding` field, which contains the vector embeddings. + +### Step 4: Search the index + +To perform a vector search on your index, use the `neural` query clause either in the [Search for a Model API]({{site.url}}{{site.baseurl}}/vector-search/api/knn/#search-for-a-model) or [Query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index/) queries. 
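As a minimal starting point, a plain `neural` query against the `passage_embedding` field looks like the following sketch. It reuses the example model ID from the ingest pipeline above and returns the `k` nearest neighbors:

```json
GET /my-nlp-index/_search
{
  "_source": {
    "excludes": [
      "passage_embedding"
    ]
  },
  "query": {
    "neural": {
      "passage_embedding": {
        "query_text": "Hi world",
        "model_id": "bQ1J8ooBpBj3wT4HVUsb",
        "k": 100
      }
    }
  }
}
```
{% include copy-curl.html %}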
You can refine the results by using a [vector search filter]({{site.url}}{{site.baseurl}}/search-plugins/knn/filter-search-knn/). + +The following example request uses a Boolean query to combine a filter clause and two query clauses---a neural query and a `match` query. The `script_score` query assigns custom weights to the query clauses: + +```json +GET /my-nlp-index/_search +{ + "_source": { + "excludes": [ + "passage_embedding" + ] + }, + "query": { + "bool": { + "filter": { + "wildcard": { "id": "*1" } + }, + "should": [ + { + "script_score": { + "query": { + "neural": { + "passage_embedding": { + "query_text": "Hi world", + "model_id": "bQ1J8ooBpBj3wT4HVUsb", + "k": 100 + } + } + }, + "script": { + "source": "_score * 1.5" + } + } + }, + { + "script_score": { + "query": { + "match": { + "passage_text": "Hi world" + } + }, + "script": { + "source": "_score * 1.7" + } + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +The response contains the matching document: + +```json +{ + "took" : 36, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 1, + "relation" : "eq" + }, + "max_score" : 1.2251667, + "hits" : [ + { + "_index" : "my-nlp-index", + "_id" : "1", + "_score" : 1.2251667, + "_source" : { + "passage_text" : "Hello world", + "id" : "s1" + } + } + ] + } +} +``` + +### Setting a default model on an index or field + +A [`neural`]({{site.url}}{{site.baseurl}}/query-dsl/specialized/neural/) query requires a model ID for generating vector embeddings. To eliminate passing the model ID with each neural query request, you can set a default model on a vector index or a field. + +First, create a [search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/) with a [`neural_query_enricher`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/neural-query-enricher/) request processor. To set a default model for an index, provide the model ID in the `default_model_id` parameter. To set a default model for a specific field, provide the field name and the corresponding model ID in the `neural_field_default_id` map. 
If you provide both `default_model_id` and `neural_field_default_id`, `neural_field_default_id` takes precedence: + +```json +PUT /_search/pipeline/default_model_pipeline +{ + "request_processors": [ + { + "neural_query_enricher" : { + "default_model_id": "bQ1J8ooBpBj3wT4HVUsb", + "neural_field_default_id": { + "my_field_1": "uZj0qYoBMtvQlfhaYeud", + "my_field_2": "upj0qYoBMtvQlfhaZOuM" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +Then set the default model for your index: + +```json +PUT /my-nlp-index/_settings +{ + "index.search.default_pipeline" : "default_model_pipeline" +} +``` +{% include copy-curl.html %} + +You can now omit the model ID when searching: + +```json +GET /my-nlp-index/_search +{ + "_source": { + "excludes": [ + "passage_embedding" + ] + }, + "query": { + "neural": { + "passage_embedding": { + "query_text": "Hi world", + "k": 100 + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains both documents: + +```json +{ + "took" : 41, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 2, + "relation" : "eq" + }, + "max_score" : 1.22762, + "hits" : [ + { + "_index" : "my-nlp-index", + "_id" : "2", + "_score" : 1.22762, + "_source" : { + "passage_text" : "Hi planet", + "id" : "s2" + } + }, + { + "_index" : "my-nlp-index", + "_id" : "1", + "_score" : 1.2251667, + "_source" : { + "passage_text" : "Hello world", + "id" : "s1" + } + } + ] + } +} +``` + +## Using a semantic field + +To manually configure semantic search using a `semantic` field, follow these steps. For more information, including about limitations when using `semantic` fields, see [Semantic field type]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/semantic/). + +### Step 1: Create an index with a semantic field + +Create an index and specify the `model_id` in the `semantic` field. In this example, the `semantic` field is `passage_text`. OpenSearch automatically creates the corresponding embedding field based on the model configuration. 
An ingest pipeline is not required---OpenSearch automatically generates the embeddings using the specified model during indexing: + +```json +PUT /my-nlp-index +{ + "settings": { + "index.knn": true + }, + "mappings": { + "properties": { + "id": { + "type": "text" + }, + "passage_text": { + "type": "semantic", + "model_id": "9kPWYJcBmp4cG9LrbAvW" + } + } + } +} +``` +{% include copy-curl.html %} + +After creating the index, you can retrieve its mapping to verify that the embedding field was automatically created: + +```json +GET /my-nlp-index/_mapping +{ + "my-nlp-index": { + "mappings": { + "properties": { + "id": { + "type": "text" + }, + "passage_text": { + "type": "semantic", + "model_id": "9kPWYJcBmp4cG9LrbAvW", + "raw_field_type": "text" + }, + "passage_text_semantic_info": { + "properties": { + "embedding": { + "type": "knn_vector", + "dimension": 384, + "method": { + "engine": "faiss", + "space_type": "l2", + "name": "hnsw", + "parameters": {} + } + }, + "model": { + "properties": { + "id": { + "type": "text", + "index": false + }, + "name": { + "type": "text", + "index": false + }, + "type": { + "type": "text", + "index": false + } + } + } + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +### Step 2: Ingest documents into the index + +To ingest documents into the index created in the previous step, send the following requests: + +```json +PUT /my-nlp-index/_doc/1 +{ + "passage_text": "Hello world", + "id": "s1" +} +``` +{% include copy-curl.html %} + +Before the document is ingested into the index, OpenSearch runs a built-in ingest pipeline that generates embeddings and stores them in the `passage_text_semantic_info.embedding` field. To verify that the embedding is generated properly, you can run a search request to retrieve the document: + +```json +GET /my-nlp-index/_doc/1 +{ + "_index": "my-nlp-index", + "_id": "1", + "_version": 1, + "_seq_no": 0, + "_primary_term": 1, + "found": true, + "_source": { + "passage_text": "Hello world", + "passage_text_semantic_info": { + "model": { + "name": "huggingface/sentence-transformers/all-MiniLM-L6-v2", + "id": "9kPWYJcBmp4cG9LrbAvW", + "type": "TEXT_EMBEDDING" + }, + "embedding": [ + -0.034477286, + ... + ] + }, + "id": "s1" + } +} +``` +{% include copy-curl.html %} + +### Step 3: Search the index + +To query the embedding of the `semantic` field, provide the `semantic` field's name (in this example, `passage_text`) and the query text. There's no need to specify the `model_id`---OpenSearch automatically retrieves it from the field's configuration in the index mapping and rewrites the query to target the underlying embedding field: + +```json +GET /my-nlp-index/_search +{ + "_source": { + "excludes": [ + "passage_text_semantic_info" + ] + }, + "query": { + "neural": { + "passage_text": { + "query_text": "Hi world" + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains the matching document: + +```json +{ + "took": 48, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.7564365, + "hits": [ + { + "_index": "my-nlp-index", + "_id": "1", + "_score": 0.7564365, + "_source": { + "passage_text": "Hello world", + "id": "s1" + } + } + ] + } +} +``` + +## Next steps + +- Explore our [semantic search tutorials]({{site.url}}{{site.baseurl}}/vector-search/tutorials/semantic-search/) to learn how to build AI search applications. 
\ No newline at end of file diff --git a/_vector-search/ai-search/workflow-builder.md b/_vector-search/ai-search/workflow-builder.md new file mode 100644 index 00000000000..94fb1e229a5 --- /dev/null +++ b/_vector-search/ai-search/workflow-builder.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Building AI search workflows in OpenSearch Dashboards +parent: AI search +has_children: true +has_toc: false +nav_order: 80 +redirect_from: + - /automating-configurations/workflow-builder/ + - /tutorials/ai-search-flows/building-flows/ + - /tutorials/gen-ai/ai-search-flows/building-flows/ +--- + +# Building AI search workflows in OpenSearch Dashboards + +In OpenSearch Dashboards, you can iteratively build and test workflows containing ingest and search pipelines using AI Search Flows. Using a UI editor to build workflows simplifies the creation of artificial intelligence and machine learning (AI/ML) use cases that include ML inference processors, such as vector search and retrieval-augmented generation (RAG). For example configurations of available AI search types (including semantic search, hybrid search, RAG, and multimodal search), see [Configuring AI search types]({{site.url}}{{site.baseurl}}/vector-search/ai-search/building-flows/). + +Once your workflow is finalized, you can export it as a [workflow template]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-templates/) to recreate identical resources across multiple clusters. + +## Prerequisite knowledge + +[Ingest pipelines]({{site.url}}{{site.baseurl}}/ingest-pipelines/) and [search pipelines]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/) enable data transformation at different stages of ingest and search operations in OpenSearch. An _ingest pipeline_ consists of a sequence of [_ingest processors_]({{site.url}}{{site.baseurl}}/ingest-pipelines/processors/index-processors/), while a _search pipeline_ consists of [_search request processors_]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/search-processors#search-request-processors) and/or [_search response processors_]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/search-processors#search-response-processors). You can combine these processors to create custom pipelines tailored to your data processing needs. + +These pipelines modify data at three key stages: + +1. **Ingestion**: Transform documents before they are ingested into an index. +2. **Search request**: Transform the search request before executing the search. +3. **Search response**: Transform the search response, including documents in the results, after executing the search but before returning the response. + +In OpenSearch, you can [integrate models hosted on third-party platforms]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index/) and use their inference capabilities directly in OpenSearch. Both ingest and search pipelines offer [ML inference processors]({{site.url}}{{site.baseurl}}/ingest-pipelines/processors/ml-inference/), allowing you to use externally hosted models for inference in your pipelines during both ingestion and search. + +## Accessing AI Search Flows + +To access AI Search Flows, go to **OpenSearch Dashboards** and select **OpenSearch Plugins** > **AI Search Flows** from the top menu. + +## Preset templates + +On the home page, select the **New workflow** tab, or select the **Create workflow** button on the right. 
This opens a selection of preset templates designed for different use cases, each with a unique set of preconfigured ingest and search processors. These templates serve two main purposes: + +- **Quickly testing AI/ML solutions**: If your deployed models have defined interfaces, you can set up a basic solution in your cluster in a few clicks. For more information, see [Example: Semantic search with RAG](#example-semantic-search-with-rag). +- **A starting point for your custom/advanced solution**: Each template provides a structured starting point for building a custom solution. You can modify and expand upon these templates to suit your specific needs. + +## Workflow editor + +You can build and test your ingest and search workflows in the workflow editor, shown in the following image. + +![Workflow editor]({{site.url}}{{site.baseurl}}/images/dashboards-flow-framework/details-page.png) + +The workflow editor is organized like an integrated development environment (IDE) and includes three main components: + +- **Flow overview**: A collapsible navigation panel for selecting the different components within your ingest and search flows. In this panel, you can add, remove, or reorder processors. If you already have a populated index and only need a search flow, you can disable **Ingest flow**. +- **Component details**: The central panel for configuring the individual component details. Selecting a component from **Flow overview** populates this panel with the relevant details. +- **Inspect**: A set of tabs for interacting with your workflow. + - **Test flow**: Allows you to run your search flow, with or without a search pipeline, and view results in a table or as raw JSON. + - **Ingest response**: Displays the API response after updating your ingest flows. + - **Errors**: Shows the latest errors from updates, ingest operations, or searches. This tab opens automatically when a new error occurs. + - **Resources**: Lists OpenSearch resources linked to the workflow, including up to one ingest pipeline, one index, and one search pipeline. To view resource details, select **Inspect**. + - **Preview**: A read-only visualization of how data moves through your ingest and search flows. As you make changes to your flow, this view updates automatically. You can also switch to the **JSON** tab to see the underlying template configuration. + + +## Example: Semantic search with RAG + +The following example uses a deployed [Titan Text Embedding](https://docs.aws.amazon.com/bedrock/latest/userguide/titan-embedding-models.html) model and an [Anthropic Claude model hosted on Amazon Bedrock](https://aws.amazon.com/bedrock/claude/) to build an [ingest pipeline]({{site.url}}{{site.baseurl}}/ingest-pipelines/), [index]({{site.url}}{{site.baseurl}}/getting-started/intro/#index), and [search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/index/) for performing vector search and RAG. + +We strongly recommend using models with full model interfaces. For a list of example configurations, see [Models](https://github.com/opensearch-project/dashboards-flow-framework/blob/main/documentation/models.md). +{: .note} + +1. On the **Workflows** page, select the **New workflow** tab, as shown in the following image. + ![New workflow page]({{site.url}}{{site.baseurl}}/images/dashboards-flow-framework/new-workflow-page.png) +2. In the **RAG with Vector Retrieval** template, select **Create**. +3. 
Provide some basic details, as shown in the following image: + + - A unique workflow name and description + - The embedding model used to generate vector embeddings + - The large language model (LLM) used to perform RAG + ![Quick configure modal]({{site.url}}{{site.baseurl}}/images/dashboards-flow-framework/quick-configure-modal.png) + + For additional options, such as the text field and vector field names that will be persisted in the index, select **Optional configuration**. You can update these settings at any time. + +4. Select **Create** to prepopulate the configuration and automatically navigate to the **Workflow Details** page, where you can configure your ingest flow. +5. To provide sample data, select **Sample data** from **Flow overview**. Then select **Import data**. You can enter data manually, upload a local `.jsonl` file, or retrieve sample documents from an existing index, as shown in the following image. + ![Import data modal]({{site.url}}{{site.baseurl}}/images/dashboards-flow-framework/import-data-modal.png) + + The form expects data in [JSON Lines format](https://jsonlines.org/), with each line representing a standalone [document]({{site.url}}{{site.baseurl}}/getting-started/intro/#document). This process is similar to the [bulk ingest operation]({{site.url}}{{site.baseurl}}/getting-started/ingest-data/#bulk-indexing). When you're finished, select **Confirm**. + +6. In the **Flow overview** panel, select the topmost **ML Inference Processor**. The processor is prepopulated with the configuration used to map data _to_ the expected model input and _from_ the expected model output. The **Inputs** section maps the target document field to the model input field, generating vector embeddings for that field. The **Outputs** section maps the model output field to a new field that is stored in the index, as shown in the following image. + ![Transform data]({{site.url}}{{site.baseurl}}/images/dashboards-flow-framework/transform-data.png) + + For more information about transformation types that accommodate complex data schemas and model interfaces, see [Advanced data transformations](#advanced-data-transformations). + +7. In the **Flow overview** panel, select **Index**. The index is prepopulated with the index configuration required for the selected use case. For example, for vector search, the `my_embedding` field is mapped as a `knn_vector` and the index is specified as a vector index (`index.knn: true`), as shown in the following image. You can modify this configuration as needed. + ![Ingest data]({{site.url}}{{site.baseurl}}/images/dashboards-flow-framework/ingest-data.png) +8. Select **Update ingest flow** at the bottom of **Flow overview** to build the configured ingest pipeline and index, and ingest the provided sample data. Then go to **Test flow** under **Inspect** to search the newly created index and verify that the transformed documents appear as expected. In this example, verify that the vector embeddings are generated for each ingested document, as shown in the following image. + ![Test ingest flow]({{site.url}}{{site.baseurl}}/images/dashboards-flow-framework/ingest-test-flow.png) +9. To configure your search flow, under **Flow overview** > **Transform query**, select **ML Inference Processor**, as shown in the following image. This processor parses the search query inputs for which you want to generate vector embeddings. 
In this example, it passes the value from `query.match.review.query` to the embedding model.<br> + ![Transform query]({{site.url}}{{site.baseurl}}/images/dashboards-flow-framework/transform-query.png) + + The processor also performs a query rewrite to generate a `knn` query using the vector embeddings produced by the model. Select **Rewrite query** to view its details, as shown in the following image. This approach abstracts complex query details, providing a simple query interface that uses search pipelines to perform advanced query generation. + + ![Rewrite query]({{site.url}}{{site.baseurl}}/images/dashboards-flow-framework/rewrite-query.png) + +10. To configure your search result transformations, under **Flow overview** > **Transform response**, select **ML Inference Processor**, as shown in the following image. The Claude LLM is used to process the returned results and generate a human-readable response. + ![Transform response]({{site.url}}{{site.baseurl}}/images/dashboards-flow-framework/transform-response.png)<br> + Under **Inputs**, select the pencil icon next to the `prompt` entry. This opens a popup window containing a preconfigured prompt template designed to summarize the returned documents, as shown in the following image. You can modify this template as needed; several presets are available as starting points. You can also add, update, or remove the **Input variables**, which include data from the returned documents that you want to dynamically inject as contextual information into the LLM. The default option collects all `review` data and summarizes the results. Select **Save** to apply your changes. + ![Configure prompt]({{site.url}}{{site.baseurl}}/images/dashboards-flow-framework/configure-prompt.png) +11. To build the search pipeline, select **Create search flow**. The **Inspect** section automatically navigates to the **Test flow** component, where you can test different queries and run searches, as shown in the following image. You can use variables wrapped in {% raw %}`{{ }}`{% endraw %} to quickly test different query values without modifying the base query. + ![Test search flow]({{site.url}}{{site.baseurl}}/images/dashboards-flow-framework/search-test-flow.png) +12. To view the search results, select **Run test**. You can view the results either as a formatted list of hits or as the raw JSON search response. +13. Depending on your use case, you can modify configurations in the following ways: +- Experiment with different query parameters. +- Try different queries. +- Modify existing processors under **Transform query** or **Transform results**. +- Add or remove processors under **Transform query** or **Transform results**. +14. To export your workflow, select **Export** in the header. The displayed data represents the [Workflow template]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-templates/), which contains the full configuration for the OpenSearch resources you've created, including the ingest pipeline, index, and search pipeline. You can download the template in JSON or YAML format by selecting the button on the right. To build identical resources in other OpenSearch clusters, use the [Provision Workflow API]({{site.url}}{{site.baseurl}}/automating-configurations/api/provision-workflow/). + +## Advanced data transformations + +ML inference processors provide several flexible ways to transform input data _to_ the model input(s) and _from_ the model output(s). + +In **Inputs**, you can configure the parameters passed _to_ the model. 
There are four input parameter transformation types: + +1. **Data field**: Uses an existing data field as the model input. +2. **JSONPath expression**: Extracts data from a JSON structure and maps the extracted data to the model input field using [JSONPath](https://en.wikipedia.org/wiki/JSONPath). +3. **Prompt**: Uses a constant value that can include dynamic variables. This combines elements of both the `Custom string` transformation and the `Data field` and `JSONPath expression` transformations, making it especially useful for building prompts for LLMs. +4. **Custom string**: Uses a constant string value. + +In **Outputs**, you can configure the values passed _from_ the model. There are three output parameter transformation types: + +1. **Data field**: Copies the model output into a new document field. +2. **JSONPath Expression**: Extracts data from a JSON structure and maps the extracted data to one or more new document fields using [JSONPath](https://en.wikipedia.org/wiki/JSONPath). +3. **No transformation**: Does not transform the model output field, preserving both its name and value. + +## Next steps + +- For models and model interfaces recommended for use with AI Search Flows, see [Models](https://github.com/opensearch-project/dashboards-flow-framework/blob/main/documentation/models.md). + +- For example configurations for different AI/ML use cases, see [Configuring AI search types]({{site.url}}{{site.baseurl}}/tutorials/ai-search-flows/building-flows/). diff --git a/_vector-search/api/index.md b/_vector-search/api/index.md new file mode 100644 index 00000000000..2bb9655cb95 --- /dev/null +++ b/_vector-search/api/index.md @@ -0,0 +1,40 @@ +--- +layout: default +title: Vector search API +nav_order: 80 +has_children: true +has_toc: false +redirect_from: + - /vector-search/api/knn/ + - /vector-search/api/ +--- + +# Vector search API + +In OpenSearch, vector search functionality is provided by the k-NN plugin and Neural Search plugin. The k-NN plugin provides basic k-NN functionality, while the Neural Search plugin provides automatic embedding generation at indexing and search time. + +For k-NN plugin APIs, see [k-NN API]({{site.url}}{{site.baseurl}}/vector-search/api/knn/). 
+ +In addition to plugin-specific APIs, the following APIs support vector search functionality: + +- [k-NN vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/) +- [k-NN query]({{site.url}}{{site.baseurl}}/query-dsl/specialized/k-nn/) +- [Neural query]({{site.url}}{{site.baseurl}}/query-dsl/specialized/neural/) +- [Neural sparse query]({{site.url}}{{site.baseurl}}/query-dsl/specialized/neural-sparse/) +- [Ingest pipelines]({{site.url}}{{site.baseurl}}/ingest-pipelines/) +- Ingest processors: + - [ML inference]({{site.url}}{{site.baseurl}}/ingest-pipelines/processors/ml-inference/) + - [Sparse encoding]({{site.url}}{{site.baseurl}}/ingest-pipelines/processors/sparse-encoding/) + - [Text chunking]({{site.url}}{{site.baseurl}}/ingest-pipelines/processors/text-chunking/) + - [Text embedding]({{site.url}}{{site.baseurl}}/ingest-pipelines/processors/text-embedding/) + - [Text/image embedding]({{site.url}}{{site.baseurl}}/ingest-pipelines/processors/text-image-embedding/) +- [Search pipelines]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/) +- Search processors: + - [ML inference (request)]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/ml-inference-search-request/) + - [ML inference (response)]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/ml-inference-search-response/) + - [Neural query enricher]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/neural-query-enricher/) + - [Neural sparse query two-phase]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/neural-sparse-query-two-phase-processor/) + - [Normalization]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/normalization-processor/) + - [Rerank]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rerank-processor/) + - [Retrieval-augmented generation]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rag-processor/) + - [Score ranker]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/score-ranker-processor/) \ No newline at end of file diff --git a/_search-plugins/knn/api.md b/_vector-search/api/knn.md similarity index 71% rename from _search-plugins/knn/api.md rename to _vector-search/api/knn.md index d927bf1c355..87fb521df76 100644 --- a/_search-plugins/knn/api.md +++ b/_vector-search/api/knn.md @@ -1,24 +1,29 @@ --- layout: default -title: k-NN plugin API -nav_order: 30 -parent: k-NN search +title: k-NN API +parent: Vector search API +nav_order: 10 has_children: false --- -# k-NN plugin API +# k-NN API -The k-NN plugin adds several APIs for managing, monitoring, and optimizing your k-NN workload. +OpenSearch provides several k-nearest neighbors (k-NN) APIs for managing, monitoring, and optimizing your vector workload. ## Stats -The k-NN `stats` API provides information about the current status of the k-NN plugin. The plugin keeps track of both cluster-level and node-level statistics. Cluster-level statistics have a single value for the entire cluster. Node-level statistics have a single value for each node in the cluster. You can filter the query by `nodeId` and `statName`, as shown in the following example: +The k-NN `stats` API provides information about the current status of the k-NN plugin, which implements vector search functionality. This includes both cluster-level and node-level statistics. Cluster-level statistics have a single value for the entire cluster. Node-level statistics have a single value for each node in the cluster. 
You can filter the query by `nodeId` and `statName`, as shown in the following example: -``` +```json GET /_plugins/_knn/nodeId1,nodeId2/stats/statName1,statName2 ``` +{% include copy-curl.html %} + +### Response body fields + +The following table lists the available response body fields. -Statistic | Description +Field | Description :--- | :--- `circuit_breaker_triggered` | Indicates whether the circuit breaker is triggered. This statistic is only relevant to approximate k-NN search. `total_load_time` | The time in nanoseconds that k-NN has taken to load native library indexes into the cache. This statistic is only relevant to approximate k-NN search. @@ -36,13 +41,13 @@ Statistic | Description `load_success_count` | The number of times k-NN successfully loaded a native library index into the cache. This statistic is only relevant to approximate k-NN search. `load_exception_count` | The number of times an exception occurred when trying to load a native library index into the cache. This statistic is only relevant to approximate k-NN search. `indices_in_cache` | For each OpenSearch index with a `knn_vector` field and approximate k-NN turned on, this statistic provides the number of native library indexes that OpenSearch index has and the total `graph_memory_usage` that the OpenSearch index is using, in kilobytes. -`script_compilations` | The number of times the k-NN script has been compiled. This value should usually be 1 or 0, but if the cache containing the compiled scripts is filled, the k-NN script might be recompiled. This statistic is only relevant to k-NN score script search. -`script_compilation_errors` | The number of errors during script compilation. This statistic is only relevant to k-NN score script search. -`script_query_requests` | The total number of script queries. This statistic is only relevant to k-NN score script search. -`script_query_errors` | The number of errors during script queries. This statistic is only relevant to k-NN score script search. -`nmslib_initialized` | Boolean value indicating whether the *nmslib* JNI library has been loaded and initialized on the node. -`faiss_initialized` | Boolean value indicating whether the *faiss* JNI library has been loaded and initialized on the node. -`model_index_status` | Status of model system index. Valid values are "red", "yellow", "green". If the index does not exist, this will be null. +`script_compilations` | The number of times the k-NN script has been compiled. This value should usually be 1 or 0, but if the cache containing the compiled scripts is filled, the k-NN script might be recompiled. This statistic is only relevant to k-NN scoring script search. +`script_compilation_errors` | The number of errors during script compilation. This statistic is only relevant to k-NN scoring script search. +`script_query_requests` | The total number of script queries. This statistic is only relevant to k-NN scoring script search. +`script_query_errors` | The number of errors during script queries. This statistic is only relevant to k-NN scoring script search. +`nmslib_initialized` | A Boolean value indicating whether the `nmslib` JNI library has been loaded and initialized on the node. +`faiss_initialized` | A Boolean value indicating whether the `faiss` JNI library has been loaded and initialized on the node. +`model_index_status` | The status of the model system index. Valid values are `red`, `yellow`, and `green`. If the index does not exist, this value is `null`. 
`indexing_from_model_degraded` | Boolean value indicating if indexing from a model is degraded. This happens if there is not enough JVM memory to cache the models. `training_requests` | The number of training requests made to the node. `training_errors` | The number of training errors that have occurred on the node. @@ -52,9 +57,39 @@ Statistic | Description -#### Usage +#### Remote index build stats +Introduced 3.0 +{: .label .label-purple } -The following code examples show how to retrieve statistics related to the k-NN plugin. The first example fetches comprehensive statistics for the k-NN plugin across all nodes in the cluster, while the second example retrieves specific metrics (circuit breaker status and graph memory usage) for a single node. +If you configured [remote index build]({{site.url}}{{site.baseurl}}/vector-search/remote-index-build/), the response contains additional fields. The following table lists the available remote index build stats response body fields. + +| Field | Description | +|:---|:---| +| `repository_stats.read_success_count` | The number of successful read operations from the repository. | +| `repository_stats.read_failure_count` | The number of failed read operations from the repository. | +| `repository_stats.successful_read_time_in_millis` | The total time, in milliseconds, spent on successful read operations. | +| `repository_stats.write_success_count` | The number of successful write operations to the repository. | +| `repository_stats.write_failure_count` | The number of failed write operations to the repository. | +| `repository_stats.successful_write_time_in_millis` | The total time, in milliseconds, spent on successful write operations. | +| `client_stats.build_request_success_count` | The number of successful build request operations. | +| `client_stats.build_request_failure_count` | The number of failed build request operations. | +| `client_stats.status_request_failure_count` | The number of failed status request operations. | +| `client_stats.status_request_success_count` | The number of successful status request operations. | +| `client_stats.index_build_success_count` | The number of successful index build operations. | +| `client_stats.index_build_failure_count` | The number of failed index build operations. | +| `client_stats.waiting_time_in_ms` | The total time, in milliseconds, that the client has spent awaiting completion of remote builds. | +| `build_stats.remote_index_build_flush_time_in_millis` | The total time, in milliseconds, spent on remote flush operations. | +| `build_stats.remote_index_build_merge_time_in_millis` | The total time, in milliseconds, spent on remote merge operations. | +| `build_stats.remote_index_build_current_merge_operations` | The current number of remote merge operations in progress. | +| `build_stats.remote_index_build_current_flush_operations` | The current number of remote flush operations in progress. | +| `build_stats.remote_index_build_current_merge_size` | The current size of remote merge operations. | +| `build_stats.remote_index_build_current_flush_size` | The current size of remote flush operations. | + +#### Example request + +The following examples demonstrate how to retrieve statistics related to the k-NN plugin. 
+ +The following example fetches comprehensive statistics for the k-NN plugin across all nodes in the cluster: ```json GET /_plugins/_knn/stats?pretty @@ -105,6 +140,9 @@ GET /_plugins/_knn/stats?pretty } } ``` +{% include copy-curl.html %} + +The following example retrieves specific metrics (circuit breaker status and graph memory usage) for a single node: ```json GET /_plugins/_knn/HYMrXXsBSamUkcAjhjeN0w/stats/circuit_breaker_triggered,graph_memory_usage?pretty @@ -123,6 +161,7 @@ GET /_plugins/_knn/HYMrXXsBSamUkcAjhjeN0w/stats/circuit_breaker_triggered,graph_ } } ``` +{% include copy-curl.html %} ## Warmup operation @@ -134,9 +173,9 @@ As an alternative, you can avoid this latency issue by running the k-NN plugin w After the process is finished, you can search against the indexes without initial latency penalties. The warmup API operation is idempotent, so if a segment's native library files are already loaded into memory, this operation has no effect. It only loads files not currently stored in memory. -#### Usage +#### Example request -This request performs a warmup on three indexes: +The following request performs a warmup on three indexes: ```json GET /_plugins/_knn/warmup/index1,index2,index3?pretty @@ -148,14 +187,16 @@ GET /_plugins/_knn/warmup/index1,index2,index3?pretty } } ``` +{% include copy-curl.html %} -`total` indicates how many shards the k-NN plugin attempted to warm up. The response also includes the number of shards the plugin succeeded and failed to warm up. +The `total` value indicates the number of shards that the k-NN plugin attempted to warm up. The response also includes the number of shards that the plugin successfully warmed up and failed to warm up. The call does not return results until the warmup operation finishes or the request times out. If the request times out, then the operation continues on the cluster. To monitor the warmup operation, use the OpenSearch `_tasks` API: ```json GET /_tasks ``` +{% include copy-curl.html %} After the operation has finished, use the [k-NN `_stats` API operation](#stats) to see what the k-NN plugin loaded into the graph. @@ -173,14 +214,14 @@ For the warmup operation to function properly, follow these best practices: Introduced 2.14 {: .label .label-purple } -During approximate k-NN search or warmup operations, the native library indexes (`nmslib` and `faiss` engines) are loaded into native memory. Currently, you can evict an index from cache or native memory by either deleting the index or setting the k-NN cluster settings `knn.cache.item.expiry.enabled` and `knn.cache.item.expiry.minutes`, which removes the index from the cache if it is idle for a given period of time. However, you cannot evict an index from the cache without deleting the index. To solve this problem, you can use the k-NN clear cache API operation, which clears a given set of indexes from the cache. +During approximate k-NN search or warmup operations, the native library indexes (for the `faiss` and `nmslib` [deprecated] engines) are loaded into native memory. Currently, you can evict an index from the cache or native memory by either deleting the index or setting the k-NN cluster settings `knn.cache.item.expiry.enabled` and `knn.cache.item.expiry.minutes`, which removes the index from the cache if it is idle for a given period of time. However, you cannot evict an index from the cache without deleting the index. To solve this problem, you can use the k-NN clear cache API operation, which clears a given set of indexes from the cache. 
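For reference, the idle-based eviction mentioned above is configured through the cluster settings API. The following sketch enables expiry and sets an illustrative idle duration; choose a value appropriate for your workload:

```json
PUT /_cluster/settings
{
  "persistent": {
    "knn.cache.item.expiry.enabled": true,
    "knn.cache.item.expiry.minutes": "10m"
  }
}
```
{% include copy-curl.html %}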
The k-NN clear cache API evicts all native library files for all shards (primaries and replicas) of all indexes specified in the request. Similarly to how the [warmup operation](#warmup-operation) behaves, the k-NN clear cache API is idempotent, meaning that if you try to clear the cache for an index that has already been evicted from the cache, it does not have any additional effect. -This API operation only works with indexes created using the `nmslib` and `faiss` engines. It has no effect on indexes created using the `lucene` engine. +This API operation only works with indexes created using the `faiss` and `nmslib` (deprecated) engines. It has no effect on indexes created using the `lucene` engine. {: .note} -#### Usage +#### Example request The following request evicts the native library indexes of three indexes from the cache: @@ -194,6 +235,7 @@ POST /_plugins/_knn/clear_cache/index1,index2,index3?pretty } } ``` +{% include copy-curl.html %} The `total` parameter indicates the number of shards that the API attempted to clear from the cache. The response includes both the number of cleared shards and the number of shards that the plugin failed to clear. @@ -209,22 +251,31 @@ POST /_plugins/_knn/clear_cache/index*?pretty } } ``` +{% include copy-curl.html %} The API call does not return results until the operation finishes or the request times out. If the request times out, then the operation continues on the cluster. To monitor the request, use the `_tasks` API, as shown in the following example: ```json GET /_tasks ``` +{% include copy-curl.html %} When the operation finishes, use the [k-NN `_stats` API operation](#stats) to see which indexes have been evicted from the cache. ## Get a model -The GET model operation retrieves information about models present in the cluster. Some native library index configurations require a training step before indexing and querying can begin. The output of training is a model that can be used to initialize native library index files during indexing. The model is serialized in the k-NN model system index. See the following GET example: +The GET model operation retrieves information about models present in the cluster. Some native library index configurations require a training step before indexing and querying can begin. The output of training is a model that can be used to initialize native library index files during indexing. The model is serialized in the k-NN model system index. -``` +#### Example request + +```json GET /_plugins/_knn/models/{model_id} ``` +{% include copy-curl.html %} + +### Response body fields + +The following table lists the available response body fields. Response field | Description :--- | :--- @@ -234,13 +285,15 @@ Response field | Description `timestamp` | The date and time when the model was created. `description` | A user-provided description of the model. `error` | An error message explaining why the model is in a failed state. -`space_type` | The space type for which this model is trained, for example, Euclidean or cosine. Note - this value can be set in the top-level of the request as well +`space_type` | The space type for which the model is trained, for example, Euclidean or cosine. Note: This value can be set at the top level of the request. `dimension` | The dimensionality of the vector space for which this model is designed. -`engine` | The native library used to create the model, either `faiss` or `nmslib`. +`engine` | The native library used to create the model, either `faiss` or `nmslib` (deprecated). 
-### Usage +#### Example request -The following examples show how to retrieve information about a specific model using the k-NN plugin API. The first example returns all the available information about the model, while the second example shows how to selectively retrieve fields. +The following examples demonstrate how to retrieve information about a specific model using the k-NN plugin API. + +The following example returns all the available information about the model: ```json GET /_plugins/_knn/models/test-model?pretty @@ -256,6 +309,9 @@ GET /_plugins/_knn/models/test-model?pretty "engine" : "faiss" } ``` +{% include copy-curl.html %} + +The following example demonstrates how to selectively retrieve fields: ```json GET /_plugins/_knn/models/test-model?pretty&filter_path=model_id,state @@ -264,12 +320,13 @@ GET /_plugins/_knn/models/test-model?pretty&filter_path=model_id,state "state" : "created" } ``` +{% include copy-curl.html %} ## Search for a model You can use an OpenSearch query to search for a model in the index. See the following usage example. -#### Usage +#### Example request The following example shows how to search for k-NN models in an OpenSearch cluster and how to retrieve the metadata for those models, excluding the potentially large `model_blob` field: @@ -280,7 +337,12 @@ GET/POST /_plugins/_knn/models/_search?pretty&_source_excludes=model_blob ... } } +``` +{% include copy-curl.html %} +The response contains the model information: + +```json { "took" : 0, "timed_out" : false, @@ -321,7 +383,7 @@ GET/POST /_plugins/_knn/models/_search?pretty&_source_excludes=model_blob You can delete a model in the cluster by using the DELETE operation. See the following usage example. -#### Usage +#### Example request The following example shows how to delete a k-NN model: @@ -332,17 +394,26 @@ DELETE /_plugins/_knn/models/{model_id} "acknowledged": true } ``` +{% include copy-curl.html %} ## Train a model You can create and train a model that can be used for initializing k-NN native library indexes during indexing. This API pulls training data from a `knn_vector` field in a training index, creates and trains a model, and then serializes it to the model system index. Training data must match the dimension passed in the request body. This request is returned when training begins. To monitor the model's state, use the [Get model API](#get-a-model). +### Query parameters + +The following table lists the available query parameters. + Query parameter | Description :--- | :--- `model_id` | The unique identifier of the fetched model. If not specified, then a random ID is generated. Optional. `node_id` | Specifies the preferred node on which to execute the training process. If provided, the specified node is used for training if it has the necessary capabilities and resources available. Optional. -Request parameter | Description +### Request body fields + +The following table lists the available request body fields. + +Request field | Description :--- | :--- `training_index` | The index from which the training data is retrieved. `training_field` | The `knn_vector` field in the `training_index` from which the training data is retrieved. The dimension of this field must match the `dimension` passed in this request. @@ -350,10 +421,10 @@ Request parameter | Description `max_training_vector_count` | The maximum number of vectors from the training index to be used for training. Defaults to all the vectors in the index. Optional. 
`search_size` | The training data is pulled from the training index using scroll queries. This parameter defines the number of results to return per scroll query. Default is `10000`. Optional. `description` | A user-provided description of the model. Optional. -`method` | The configuration of the approximate k-NN method used for search operations. For more information about the available methods, see [k-NN index method definitions]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index#method-definitions). The method requires training to be valid. +`method` | The configuration of the approximate k-NN method used for search operations. For more information about the available methods, see [Methods and engines]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-methods-engines/). The method requires training in order to be valid. `space_type` | The space type for which this model is trained, for example, Euclidean or cosine. Note: This value can also be set in the `method` parameter. -#### Usage +#### Example request The following examples show how to initiate the training process for a k-NN model: @@ -381,11 +452,9 @@ POST /_plugins/_knn/models/{model_id}/_train?preference={node_id} } } } - -{ - "model_id": "model_x" -} ``` +{% include copy-curl.html %} + ```json POST /_plugins/_knn/models/_train?preference={node_id} @@ -411,7 +480,12 @@ POST /_plugins/_knn/models/_train?preference={node_id} } } } +``` +{% include copy-curl.html %} + +#### Example response +```json { "model_id": "dcdwscddscsad" } diff --git a/_vector-search/api/neural.md b/_vector-search/api/neural.md new file mode 100644 index 00000000000..7b1fc592662 --- /dev/null +++ b/_vector-search/api/neural.md @@ -0,0 +1,393 @@ +--- +layout: default +title: Neural Search API +parent: Vector search API +nav_order: 20 +has_children: false +--- + +# Neural Search API + +The Neural Search plugin provides several APIs for monitoring semantic and hybrid search features. + +## Stats + +The Neural Search Stats API provides information about the current status of the Neural Search plugin. This includes both cluster-level and node-level statistics. Cluster-level statistics have a single value for the entire cluster. Node-level statistics have a single value for each node in the cluster. + +By default, the Neural Search Stats API is disabled through a cluster setting. To enable statistics collection, use the following command: + +```json +PUT /_cluster/settings +{ + "persistent": { + "plugins.neural_search.stats_enabled": "true" + } +} +``` +{% include copy-curl.html %} + +To disable statistics collection, set the cluster setting to `false`. When disabled, all values are reset and new statistics are not collected. + +### Endpoints + +```json +GET /_plugins/_neural/stats +GET /_plugins/_neural/stats/<stats> +GET /_plugins/_neural/<nodes>/stats +GET /_plugins/_neural/<nodes>/stats/<stats> +``` + +### Path parameters + +The following table lists the available path parameters. All path parameters are optional. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `nodes` | String | A node or a list of nodes (comma-separated) to filter statistics by. Default is all nodes. | +| `stats` | String | A statistic name or names (comma-separated) to return. Default is all statistics. | + +### Query parameters + +The following table lists the available query parameters. All query parameters are optional. 
+ +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `include_metadata` | Boolean | When `true`, includes additional metadata fields for each statistic (see [Available metadata](#available-metadata)). Default is `false`. | +| `flat_stat_paths` | Boolean | When `true`, flattens the JSON response structure for easier parsing. Default is `false`. | +| `include_individual_nodes` | Boolean | When `true`, includes statistics for individual nodes in the `nodes` category. When `false`, excludes the `nodes` category from the response. Default is `true`. | +| `include_all_nodes` | Boolean | When `true`, includes aggregated statistics across all nodes in the `all_nodes` category. When `false`, excludes the `all_nodes` category from the response. Default is `true`. | +| `include_info` | Boolean | When `true`, includes cluster-wide information in the `info` category. When `false`, excludes the `info` category from the response. Default is `true`. | + +#### Parameter interactions + + +#### Example request + +```json +GET /_plugins/_neural/node1,node2/stats/stat1,stat2?include_metadata=true,flat_stat_paths=true +``` +{% include copy-curl.html %} + +#### Example response + +<details markdown="block"> + <summary> + Response + </summary> + {: .text-delta} + +```json +GET /_plugins/_neural/stats/ +{ + "_nodes": { + "total": 1, + "successful": 1, + "failed": 0 + }, + "cluster_name": "integTest", + "info": { + "cluster_version": "3.1.0", + "processors": { + "search": { + "hybrid": { + "comb_geometric_processors": 0, + "comb_rrf_processors": 0, + "norm_l2_processors": 0, + "norm_minmax_processors": 0, + "comb_harmonic_processors": 0, + "comb_arithmetic_processors": 0, + "norm_zscore_processors": 0, + "rank_based_normalization_processors": 0, + "normalization_processors": 0 + }, + "rerank_ml_processors": 0, + "rerank_by_field_processors": 0, + "neural_sparse_two_phase_processors": 0, + "neural_query_enricher_processors": 0 + }, + "ingest": { + "sparse_encoding_processors": 0, + "skip_existing_processors": 0, + "text_image_embedding_processors": 0, + "text_chunking_delimiter_processors": 0, + "text_embedding_processors_in_pipelines": 0, + "text_chunking_fixed_token_length_processors": 0, + "text_chunking_fixed_char_length_processors": 0, + "text_chunking_processors": 0 + } + } + }, + "all_nodes": { + "query": { + "hybrid": { + "hybrid_query_with_pagination_requests": 0, + "hybrid_query_with_filter_requests": 0, + "hybrid_query_with_inner_hits_requests": 0, + "hybrid_query_requests": 0 + }, + "neural": { + "neural_query_against_semantic_sparse_requests": 0, + "neural_query_requests": 0, + "neural_query_against_semantic_dense_requests": 0, + "neural_query_against_knn_requests": 0 + }, + "neural_sparse": { + "neural_sparse_query_requests": 0 + } + }, + "semantic_highlighting": { + "semantic_highlighting_request_count": 0 + }, + "processors": { + "search": { + "neural_sparse_two_phase_executions": 0, + "hybrid": { + "comb_harmonic_executions": 0, + "norm_zscore_executions": 0, + "comb_rrf_executions": 0, + "norm_l2_executions": 0, + "rank_based_normalization_processor_executions": 0, + "comb_arithmetic_executions": 0, + "normalization_processor_executions": 0, + "comb_geometric_executions": 0, + "norm_minmax_executions": 0 + }, + "rerank_by_field_executions": 0, + "neural_query_enricher_executions": 0, + "rerank_ml_executions": 0 + }, + "ingest": { + "skip_existing_executions": 0, + "text_chunking_fixed_token_length_executions": 0, + "sparse_encoding_executions": 0, + 
"text_chunking_fixed_char_length_executions": 0, + "text_chunking_executions": 0, + "text_embedding_executions": 0, + "semantic_field_executions": 0, + "semantic_field_chunking_executions": 0, + "text_chunking_delimiter_executions": 0, + "text_image_embedding_executions": 0 + } + } + }, + "nodes": { + "_cONimhxS6KdedymRZr6xg": { + "query": { + "hybrid": { + "hybrid_query_with_pagination_requests": 0, + "hybrid_query_with_filter_requests": 0, + "hybrid_query_with_inner_hits_requests": 0, + "hybrid_query_requests": 0 + }, + "neural": { + "neural_query_against_semantic_sparse_requests": 0, + "neural_query_requests": 0, + "neural_query_against_semantic_dense_requests": 0, + "neural_query_against_knn_requests": 0 + }, + "neural_sparse": { + "neural_sparse_query_requests": 0 + } + }, + "semantic_highlighting": { + "semantic_highlighting_request_count": 0 + }, + "processors": { + "search": { + "neural_sparse_two_phase_executions": 0, + "hybrid": { + "comb_harmonic_executions": 0, + "norm_zscore_executions": 0, + "comb_rrf_executions": 0, + "norm_l2_executions": 0, + "rank_based_normalization_processor_executions": 0, + "comb_arithmetic_executions": 0, + "normalization_processor_executions": 0, + "comb_geometric_executions": 0, + "norm_minmax_executions": 0 + }, + "rerank_by_field_executions": 0, + "neural_query_enricher_executions": 0, + "rerank_ml_executions": 0 + }, + "ingest": { + "skip_existing_executions": 0, + "text_chunking_fixed_token_length_executions": 0, + "sparse_encoding_executions": 0, + "text_chunking_fixed_char_length_executions": 0, + "text_chunking_executions": 0, + "text_embedding_executions": 0, + "semantic_field_executions": 0, + "semantic_field_chunking_executions": 0, + "text_chunking_delimiter_executions": 0, + "text_image_embedding_executions": 0 + } + } + } + } +} +``` + +</details> + +If `include_metadata` is `true`, each stats object contains additional metadata: + +```json +{ + ..., + "text_embedding_executions": { + "value": 0, + "stat_type": "timestamped_event_counter", + "trailing_interval_value": 0, + "minutes_since_last_event": 29061801 + }, + ... +} +``` + +For more information, see [Available metadata](#available-metadata). + +### Response body fields + +The following sections describe response body fields. + +#### Categories of statistics + +The following table lists all categories of statistics. + +| Category | Data type | Description | +| :--- | :--- | :--- | +| `info` | Object | Contains cluster-wide information and statistics that are not specific to individual nodes. | +| `all_nodes` | Object | Provides aggregated statistics across all nodes in the cluster. | +| `nodes` | Object | Contains node-specific statistics, with each node identified by its unique node ID. | + +#### Available statistics + +The following table lists the available statistics. For statistics with paths prefixed with `nodes.<node_id>`, aggregate cluster-level statistics are also available at the same path prefixed with `all_nodes`. + +| Statistic name | Category | Statistic path within category | Description | +| :--- | :--- | :--- | :--- | +| `cluster_version` | `info` | `cluster_version` | The version of the cluster. | + +**Info statistics: Processors** + +| Statistic name | Category | Statistic path within category | Description | +| :--- | :--- | :--- | :--- | +| `text_embedding_processors_in_pipelines` | `info` | `processors.ingest.text_embedding_processors_in_pipelines` | The number of `text_embedding` processors in ingest pipelines. 
| +| `sparse_encoding_processors` | `info` | `processors.ingest.sparse_encoding_processors` | The number of `sparse_encoding` processors in ingest pipelines. | +| `skip_existing_processors` | `info` | `processors.ingest.skip_existing_processors` | The number of processors with `skip_existing` set to `true` in ingest pipelines. | +| `text_image_embedding_processors` | `info` | `processors.ingest.text_image_embedding_processors` | The number of `text_image_embedding` processors in ingest pipelines. | +| `text_chunking_delimiter_processors` | `info` | `processors.ingest.text_chunking_delimiter_processors` | The number of `text_chunking` processors using the `delimiter` algorithm in ingest pipelines. | +| `text_chunking_fixed_token_length_processors` | `info` | `processors.ingest.text_chunking_fixed_token_length_processors` | The number of `text_chunking` processors using the `fixed_token_length` algorithm in ingest pipelines. | +| `text_chunking_fixed_char_length_processors` | `info` | `processors.ingest.text_chunking_fixed_char_length_processors` | The number of `text_chunking` processors using the `fixed_character_length` algorithm in ingest pipelines. | +| `text_chunking_processors` | `info` | `processors.ingest.text_chunking_processors` | The number of `text_chunking` processors in ingest pipelines. | +| `rerank_ml_processors` | `info` | `processors.search.rerank_ml_processors` | The number of `rerank` processors of the `ml_opensearch` type in search pipelines. | +| `rerank_by_field_processors` | `info` | `processors.search.rerank_by_field_processors` | The number of `rerank` processors of the `by_field` type. | +| `neural_sparse_two_phase_processors` | `info` | `processors.search.neural_sparse_two_phase_processors` | The number of `neural_sparse_two_phase_processor` processors in search pipelines. | +| `neural_query_enricher_processors` | `info` | `processors.search.neural_query_enricher_processors` | The number of `neural_query_enricher` processors in search pipelines. | + +**Info statistics: Hybrid processors** + +| Statistic name | Category | Statistic path within category | Description | +| :--- | :--- | :--- | :--- | +| `normalization_processors` | `info` | `processors.search.hybrid.normalization_processors` | The number of `normalization-processor` processors. | +| `norm_minmax_processors` | `info` | `processors.search.hybrid.norm_minmax_processors` | The number of `normalization-processor` processors with `normalization.technique` set to `min_max`. | +| `norm_l2_processors` | `info` | `processors.search.hybrid.norm_l2_processors` | The number of `normalization-processor` processors with `normalization.technique` set to `l2`. | +| `norm_zscore_processors` | `info` | `processors.search.hybrid.norm_zscore_processors` | The number of `normalization-processor` processors with `normalization.technique` set to `z_score`. | +| `comb_arithmetic_processors` | `info` | `processors.search.hybrid.comb_arithmetic_processors` | The number of `normalization-processor` processors with `combination.technique` set to `arithmetic_mean`. | +| `comb_geometric_processors` | `info` | `processors.search.hybrid.comb_geometric_processors` | The number of `normalization-processor` processors with `combination.technique` set to `geometric_mean`. | +| `comb_harmonic_processors` | `info` | `processors.search.hybrid.comb_harmonic_processors` | The number of `normalization-processor` processors with `combination.technique` set to `harmonic_mean`. 
| +| `rank_based_normalization_processors` | `info` | `processors.search.hybrid.rank_based_normalization_processors` | The number of `score-ranker-processor` processors. | +| `comb_rrf_processors` | `info` | `processors.search.hybrid.comb_rrf_processors` | The number of `score-ranker-processor` processors with `combination.technique` set to `rrf`. | + +**Node-level statistics: Processors** + +| Statistic name | Category | Statistic path within category | Description | +| :--- | :--- | :--- | :--- | +| `text_embedding_executions` | `nodes`, `all_nodes` | `processors.ingest.text_embedding_executions` | The number of `text_embedding` processor executions. | +| `skip_existing_executions` | `nodes`, `all_nodes` | `processors.ingest.skip_existing_executions` | The number of processor executions that have `skip_existing` set to `true`. | +| `text_chunking_fixed_token_length_executions` | `nodes`, `all_nodes` | `processors.ingest.text_chunking_fixed_token_length_executions` | The number of `text_chunking` processor executions with the `fixed_token_length` algorithm. | +| `sparse_encoding_executions` | `nodes`, `all_nodes` | `processors.ingest.sparse_encoding_executions` | The number of `sparse_encoding` processor executions. | +| `text_chunking_fixed_char_length_executions` | `nodes`, `all_nodes` | `processors.ingest.text_chunking_fixed_char_length_executions` | The number of `text_chunking` processor executions with the `fixed_character_length` algorithm. | +| `text_chunking_executions` | `nodes`, `all_nodes` | `processors.ingest.text_chunking_executions` | The number of `text_chunking` processor executions. | +| `semantic_field_executions` | `nodes`, `all_nodes` | `processors.ingest.semantic_field_executions` | The number of `semantic` field system processor executions. | +| `semantic_field_chunking_executions` | `nodes`, `all_nodes` | `processors.ingest.semantic_field_chunking_executions` | The number of `semantic` field system chunking processor executions. | +| `text_chunking_delimiter_executions` | `nodes`, `all_nodes` | `processors.ingest.text_chunking_delimiter_executions` | The number of `text_chunking` processor executions with the `delimiter` algorithm. | +| `text_image_embedding_executions` | `nodes`, `all_nodes` | `processors.ingest.text_image_embedding_executions` | The number of `text_image_embedding` processor executions. | +| `neural_sparse_two_phase_executions` | `nodes`, `all_nodes` | `processors.search.neural_sparse_two_phase_executions` | The number of `neural_sparse_two_phase_processor` processor executions. | +| `rerank_by_field_executions` | `nodes`, `all_nodes` | `processors.search.rerank_by_field_executions` | The number of `rerank` processor executions of the `by_field` type. | +| `neural_query_enricher_executions` | `nodes`, `all_nodes` | `processors.search.neural_query_enricher_executions` | The number of `neural_query_enricher` processor executions. | +| `rerank_ml_executions` | `nodes`, `all_nodes` | `processors.search.rerank_ml_executions` | The number of `rerank` processor executions of the `ml_opensearch` type. | + +**Node-level statistics: Hybrid processors** + +| Statistic name | Category | Statistic path within category | Description | +| :--- | :--- | :--- | :--- | +| `normalization_processor_executions` | `nodes`, `all_nodes` | `processors.search.hybrid.normalization_processor_executions` | The number of `normalization-processor` processor executions. 
| +| `rank_based_normalization_processor_executions` | `nodes`, `all_nodes` | `processors.search.hybrid.rank_based_normalization_processor_executions` | The number of `score-ranker-processor` processor executions. | +| `comb_harmonic_executions` | `nodes`, `all_nodes` | `processors.search.hybrid.comb_harmonic_executions` | The number of `normalization-processor` processor executions with `combination.technique` set to `harmonic_mean`. | +| `norm_zscore_executions` | `nodes`, `all_nodes` | `processors.search.hybrid.norm_zscore_executions` | The number of `normalization-processor` processor executions with `normalization.technique` set to `z_score`. | +| `comb_rrf_executions` | `nodes`, `all_nodes` | `processors.search.hybrid.comb_rrf_executions` | The number of `score-ranker-processor` processor executions with `combination.technique` set to `rrf`. | +| `norm_l2_executions` | `nodes`, `all_nodes` | `processors.search.hybrid.norm_l2_executions` | The number of `normalization-processor` processor executions with `normalization.technique` set to `l2`. | +| `comb_arithmetic_executions` | `nodes`, `all_nodes` | `processors.search.hybrid.comb_arithmetic_executions` | The number of `normalization-processor` processor executions with `combination.technique` set to `arithmetic_mean`. | +| `comb_geometric_executions` | `nodes`, `all_nodes` | `processors.search.hybrid.comb_geometric_executions` | The number of `normalization-processor` processor executions with `combination.technique` set to `geometric_mean`. | +| `norm_minmax_executions` | `nodes`, `all_nodes` | `processors.search.hybrid.norm_minmax_executions` | The number of `normalization-processor` processor executions with `normalization.technique` set to `min_max`. | + +**Node-level statistics: Query** + +| Statistic name | Category | Statistic path within category | Description | +| :--- | :--- | :--- | :--- | +| `hybrid_query_with_pagination_requests` | `nodes`, `all_nodes` | `query.hybrid.hybrid_query_with_pagination_requests` | The number of `hybrid` query requests with pagination. | +| `hybrid_query_with_filter_requests` | `nodes`, `all_nodes` | `query.hybrid.hybrid_query_with_filter_requests` | The number of `hybrid` query requests with filters. | +| `hybrid_query_with_inner_hits_requests` | `nodes`, `all_nodes` | `query.hybrid.hybrid_query_with_inner_hits_requests` | The number of `hybrid` query requests with inner hits. | +| `hybrid_query_requests` | `nodes`, `all_nodes` | `query.hybrid.hybrid_query_requests` | The total number of `hybrid` query requests. | +| `neural_query_against_semantic_sparse_requests` | `nodes`, `all_nodes` | `query.neural.neural_query_against_semantic_sparse_requests` | The number of `neural` query requests against semantic sparse fields. | +| `neural_query_requests` | `nodes`, `all_nodes` | `query.neural.neural_query_requests` | The total number of `neural` query requests. | +| `neural_query_against_semantic_dense_requests` | `nodes`, `all_nodes` | `query.neural.neural_query_against_semantic_dense_requests` | The number of `neural` query requests against semantic dense fields. | +| `neural_query_against_knn_requests` | `nodes`, `all_nodes` | `query.neural.neural_query_against_knn_requests` | The number of `neural` query requests against k-NN fields. | +| `neural_sparse_query_requests` | `nodes`, `all_nodes` | `query.neural_sparse.neural_sparse_query_requests` | The number of `neural_sparse` query requests. 
| + +**Node-level statistics: Semantic highlighting** + +| Statistic name | Category | Statistic path within category | Description | +| :--- | :--- | :--- | :--- | +| `semantic_highlighting_request_count` | `nodes`, `all_nodes` | `semantic_highlighting.semantic_highlighting_request_count` | The number of `semantic` highlighting requests. | + +#### Available metadata + +When `include_metadata` is `true`, the field values in the response are replaced by their respective metadata objects, which include additional information about the statistic types, as described in the following table. + +| Statistic type | Description | +| :--- | :--- | +| `info_string` | A basic string value that provides informational content, such as versions or names. See [`info_string`](#info-string).| +| `info_counter` | A numerical counter that represents static or slowly changing values. See [`info_counter`](#info-counter).| +| `timestamped_event_counter` | A counter that tracks events over time, including information about recent activity. See [`timestamped_event_counter`](#timestamped-event-counter).| + +<p id="info-string"></p> + +The `info_string` object contains the following metadata fields. + +| Metadata field | Data type | Description | +| :--- | :--- | :--- | +| `value` | String | The actual string value of the statistic. | +| `stat_type` | String | Always set to `info_string`. | + +<p id="info-counter"></p> + +The `info_counter` object contains the following metadata fields. + +| Metadata field | Data type | Description | +| :--- | :--- | :--- | +| `value` | Integer | The current count value. | +| `stat_type` | String | Always set to `info_counter`. | + +<p id="timestamped-event-counter"></p> + +The `timestamped_event_counter` object contains the following metadata fields. + +| Metadata field | Data type | Description | +| :--- | :--- | :--- | +| `value` | Integer | The total number of events that occurred since the node started. | +| `stat_type` | String | Always set to `timestamped_event_counter`. | +| `trailing_interval_value` | Integer | The number of events that occurred in the past 5 minutes. | +| `minutes_since_last_event` | Integer | The amount of time (in minutes) since the last recorded event. | diff --git a/_vector-search/creating-vector-index.md b/_vector-search/creating-vector-index.md new file mode 100644 index 00000000000..37486c26624 --- /dev/null +++ b/_vector-search/creating-vector-index.md @@ -0,0 +1,160 @@ +--- +layout: default +title: Creating a vector index +nav_order: 20 +redirect_from: + - /vector-search/creating-a-vector-db/ + - /search-plugins/knn/knn-index/ + - /vector-search/creating-vector-index/ +--- + +# Creating a vector index + +Creating a vector index in OpenSearch involves a common core process with some variations depending on the type of vector search. This guide outlines the key elements shared across all vector indexes and the differences specific to supported use cases. + +Before you start, review the options for generating embeddings to help you decide on the option suitable for your use case. For more information, see [Preparing vectors]({{site.url}}{{site.baseurl}}/vector-search/getting-started/vector-search-options/). 
+{: .tip}
+
+## Overview
+
+To create a vector index, set the `index.knn` parameter to `true` in the `settings`:
+
+```json
+PUT /test-index
+{
+  "settings": {
+    "index.knn": true
+  },
+  "mappings": {
+    "properties": {
+      "my_vector": {
+        "type": "knn_vector",
+        "dimension": 3,
+        "space_type": "l2",
+        "mode": "on_disk",
+        "method": {
+          "name": "hnsw"
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+Creating a vector index involves the following key steps:
+
+1. **Enable k-nearest neighbors (k-NN) search**:
+   Set `index.knn` to `true` in the index settings to enable k-NN search functionality.
+
+1. **Define a vector field**:
+   Specify the field that will store the vector data. When defining a `knn_vector` field in OpenSearch, you can select from different data types to balance storage requirements and performance. By default, k-NN vectors are float vectors, but you can also choose byte or binary vectors for more efficient storage. For more information, see [k-NN vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/).
+
+1. **Specify the dimension**:
+   Set the `dimension` property to match the size of the vectors used.
+
+1. (Optional) **Choose a space type**:
+   Select a distance metric for similarity comparisons, such as `l2` (Euclidean distance) or `cosinesimil`. For more information, see [Spaces]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-spaces/).
+
+1. (Optional) **Select a workload mode and/or compression level**:
+   Choose a workload mode and/or a compression level in order to optimize vector storage. For more information, see [Optimizing vector storage]({{site.url}}{{site.baseurl}}/vector-search/optimizing-storage/).
+
+1. (Optional, advanced) **Select a method**:
+   Configure the indexing method, such as HNSW or IVF, used to optimize vector search performance. For more information, see [Methods and engines]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-methods-engines/).
+
+## Implementation options
+
+Based on your vector generation approach, choose one of the following implementation options:
+
+- [Store raw vectors or embeddings generated outside of OpenSearch](#storing-raw-vectors-or-embeddings-generated-outside-of-opensearch): Ingest pregenerated embeddings or raw vectors into your index for raw vector search.
+- [Convert data to embeddings during ingestion](#converting-data-to-embeddings-during-ingestion): Ingest text that will be converted into vector embeddings in OpenSearch in order to perform semantic search using machine learning (ML) models.
+
+The following table summarizes key index configuration differences for the supported use cases.
+
+| Feature | Vector field type | Ingest pipeline | Transformation | Use case |
+|--------------------------|-----------------------|---------------------|-------------------------|-------------------------|
+| **Store raw vectors or embeddings generated outside of OpenSearch** | [`knn_vector`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/) | Not required | Direct ingestion | Raw vector search |
+| **Convert data to embeddings during ingestion** | [`knn_vector`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/) | Required | Auto-generated vectors | AI search <br><br> Automating embedding generation reduces data preprocessing and provides a more managed vector search experience. |
+
+## Storing raw vectors or embeddings generated outside of OpenSearch
+
+To ingest raw vectors into an index, configure a vector field (in this request, `my_vector`) and specify its `dimension`:
+
+```json
+PUT /my-raw-vector-index
+{
+  "settings": {
+    "index.knn": true
+  },
+  "mappings": {
+    "properties": {
+      "my_vector": {
+        "type": "knn_vector",
+        "dimension": 3
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Converting data to embeddings during ingestion
+
+To automatically generate embeddings during ingestion, configure an [ingest pipeline]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/index/) with the model ID of the embedding model. For more information about configuring a model, see [Integrating ML models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/integrating-ml-models/).
+
+Specify the `field_map` to define the source field for input text and the target field for storing embeddings. In this example, text from the `input_text` field is converted into embeddings and stored in the `output_embedding` field:
+
+```json
+PUT /_ingest/pipeline/auto-embed-pipeline
+{
+  "description": "AI search ingest pipeline that automatically converts text to embeddings",
+  "processors": [
+    {
+      "text_embedding": {
+        "model_id": "mBGzipQB2gmRjlv_dOoB",
+        "field_map": {
+          "input_text": "output_embedding"
+        }
+      }
+    }
+  ]
+}
+```
+{% include copy-curl.html %}
+
+For more information, see [Text embedding processor]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/processors/text-embedding/).
+
+When creating an index, specify the pipeline as the `default_pipeline`. Ensure that `dimension` matches the dimensionality of the model configured in the pipeline:
+
+```json
+PUT /my-ai-search-index
+{
+  "settings": {
+    "index.knn": true,
+    "default_pipeline": "auto-embed-pipeline"
+  },
+  "mappings": {
+    "properties": {
+      "input_text": {
+        "type": "text"
+      },
+      "output_embedding": {
+        "type": "knn_vector",
+        "dimension": 768
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Working with sparse vectors
+
+OpenSearch also supports sparse vectors. For more information, see [Neural sparse search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/neural-sparse-search/).
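+
+As a minimal sketch (the index name, field names, and pipeline name below are placeholders, not taken from the linked page), a sparse vector setup typically stores token-weight pairs in a `rank_features` field, which is populated by a `sparse_encoding` ingest processor:
+
+```json
+PUT /my-sparse-index
+{
+  "settings": {
+    "default_pipeline": "my-sparse-encoding-pipeline"
+  },
+  "mappings": {
+    "properties": {
+      "text": {
+        "type": "text"
+      },
+      "text_sparse_embedding": {
+        "type": "rank_features"
+      }
+    }
+  }
+}
+```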
+ +## Next steps + +- [Ingesting data into a vector index]({{site.url}}{{site.baseurl}}/vector-search/ingesting-data/) +- [k-NN vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/) +- [Methods and engines]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-methods-engines/) \ No newline at end of file diff --git a/_search-plugins/knn/filter-search-knn.md b/_vector-search/filter-search-knn/efficient-knn-filtering.md similarity index 59% rename from _search-plugins/knn/filter-search-knn.md rename to _vector-search/filter-search-knn/efficient-knn-filtering.md index 2f0c4aa0722..cd2f9e6e2d0 100644 --- a/_search-plugins/knn/filter-search-knn.md +++ b/_vector-search/filter-search-knn/efficient-knn-filtering.md @@ -1,61 +1,17 @@ --- layout: default -title: k-NN search with filters -nav_order: 20 -parent: k-NN search -has_children: false -has_math: true +title: Efficient k-NN filtering +parent: Filtering data +nav_order: 10 --- -# k-NN search with filters - -To refine k-NN results, you can filter a k-NN search using one of the following methods: - -- [Efficient k-NN filtering](#efficient-k-nn-filtering): This approach applies filtering _during_ the k-NN search, as opposed to before or after the k-NN search, which ensures that `k` results are returned (if there are at least `k` results in total). This approach is supported by the following engines: - - Lucene engine with a Hierarchical Navigable Small World (HNSW) algorithm (k-NN plugin versions 2.4 and later) - - Faiss engine with an HNSW algorithm (k-NN plugin versions 2.9 and later) or IVF algorithm (k-NN plugin versions 2.10 and later) - -- [Post-filtering](#post-filtering): Because it is performed after the k-NN search, this approach may return significantly fewer than `k` results for a restrictive filter. You can use the following two filtering strategies for this approach: - - [Boolean post-filter](#boolean-filter-with-ann-search): This approach runs an [approximate nearest neighbor (ANN)]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/) search and then applies a filter to the results. The two query parts are executed independently, and then the results are combined based on the query operator (`should`, `must`, and so on) provided in the query. - - [The `post_filter` parameter](#post-filter-parameter): This approach runs an [ANN]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/) search on the full dataset and then applies the filter to the k-NN results. - -- [Scoring script filter](#scoring-script-filter): This approach involves pre-filtering a document set and then running an exact k-NN search on the filtered subset. It may have high latency and does not scale when filtered subsets are large. - -The following table summarizes the preceding filtering use cases. - -Filter | When the filter is applied | Type of search | Supported engines and methods | Where to place the `filter` clause -:--- | :--- | :--- | :--- -Efficient k-NN filtering | During search (a hybrid of pre- and post-filtering) | Approximate | - `lucene` (`hnsw`) <br> - `faiss` (`hnsw`, `ivf`) | Inside the k-NN query clause. -Boolean filter | After search (post-filtering) | Approximate | - `lucene`<br> - `nmslib`<br> - `faiss` | Outside the k-NN query clause. Must be a leaf clause. -The `post_filter` parameter | After search (post-filtering) | Approximate | - `lucene`<br> - `nmslib`<br> - `faiss` | Outside the k-NN query clause. 
-Scoring script filter | Before search (pre-filtering) | Exact | N/A | Inside the script score query clause. - -## Filtered search optimization - -Depending on your dataset and use case, you might be more interested in maximizing recall or minimizing latency. The following table provides guidance on various k-NN search configurations and the filtering methods used to optimize for higher recall or lower latency. The first three columns of the table provide several example k-NN search configurations. A search configuration consists of: - -- The number of documents in an index, where one OpenSearch document corresponds to one k-NN vector. -- The percentage of documents left in the results after filtering. This value depends on the restrictiveness of the filter that you provide in the query. The most restrictive filter in the table returns 2.5% of documents in the index, while the least restrictive filter returns 80% of documents. -- The desired number of returned results (k). - -Once you've estimated the number of documents in your index, the restrictiveness of your filter, and the desired number of nearest neighbors, use the following table to choose a filtering method that optimizes for recall or latency. - -| Number of documents in an index | Percentage of documents the filter returns | k | Filtering method to use for higher recall | Filtering method to use for lower latency | -| :-- | :-- | :-- | :-- | :-- | -| 10M | 2.5 | 100 | Efficient k-NN filtering/Scoring script | Scoring script | -| 10M | 38 | 100 | Efficient k-NN filtering | Efficient k-NN filtering | -| 10M | 80 | 100 | Efficient k-NN filtering | Efficient k-NN filtering | -| 1M | 2.5 | 100 | Efficient k-NN filtering/Scoring script | Scoring script | -| 1M | 38 | 100 | Efficient k-NN filtering | Efficient k-NN filtering | -| 1M | 80 | 100 | Efficient k-NN filtering | Efficient k-NN filtering | - -## Efficient k-NN filtering +# Efficient k-NN filtering You can perform efficient k-NN filtering with the `lucene` or `faiss` engines. -### Lucene k-NN filter implementation +## Lucene k-NN filter implementation -k-NN plugin version 2.2 introduced support for running k-NN searches with the Lucene engine using HNSW graphs. Starting with version 2.4, which is based on Lucene version 9.4, you can use Lucene filters for k-NN searches. +OpenSearch version 2.2 introduced support for running k-NN searches with the Lucene engine using HNSW graphs. Starting with version 2.4, which is based on Lucene version 9.4, you can use Lucene filters for k-NN searches. When you specify a Lucene filter for a k-NN search, the Lucene algorithm decides whether to perform an exact k-NN search with pre-filtering or an approximate search with modified post-filtering. The algorithm uses the following variables: @@ -69,7 +25,7 @@ The following flow chart outlines the Lucene algorithm. For more information about the Lucene filtering implementation and the underlying `KnnVectorQuery`, see the [Apache Lucene documentation](https://lucene.apache.org/core/9_2_0/core/org/apache/lucene/search/KnnVectorQuery.html). -### Using a Lucene k-NN filter +## Using a Lucene k-NN filter Consider a dataset that includes 12 documents containing hotel information. The following image shows all hotels on an xy coordinate plane by location. Additionally, the points for hotels that have a rating between 8 and 10, inclusive, are depicted with orange dots, and hotels that provide parking are depicted with green circles. 
The search point is colored in red: @@ -77,7 +33,7 @@ Consider a dataset that includes 12 documents containing hotel information. The In this example, you will create an index and search for the three hotels with high ratings and parking that are the closest to the search location. -**Step 1: Create a new index** +### Step 1: Create a new index Before you can run a k-NN search with a filter, you need to create an index with a `knn_vector` field. For this field, you need to specify `lucene` as the engine and `hnsw` as the `method` in the mapping. @@ -115,7 +71,7 @@ PUT /hotels-index ``` {% include copy-curl.html %} -**Step 2: Add data to your index** +### Step 2: Add data to your index Next, add data to your index. @@ -150,7 +106,7 @@ POST /_bulk ``` {% include copy-curl.html %} -**Step 3: Search your data with a filter** +### Step 3: Search your data with a filter Now you can create a k-NN search with filters. In the k-NN query clause, include the point of interest that is used to search for nearest neighbors, the number of nearest neighbors to return (`k`), and a filter with the restriction criteria. Depending on how restrictive you want your filter to be, you can add multiple query clauses to a single request. @@ -259,9 +215,9 @@ The response returns the three hotels that are nearest to the search point and h For more ways to construct a filter, see [Constructing a filter](#constructing-a-filter). -### Faiss k-NN filter implementation +## Faiss k-NN filter implementation -For k-NN searches, you can use `faiss` filters with an HNSW algorithm (k-NN plugin versions 2.9 and later) or IVF algorithm (k-NN plugin versions 2.10 and later). +For k-NN searches, you can use `faiss` filters with an HNSW algorithm (OpenSearch version 2.9 and later) or IVF algorithm (OpenSearch version 2.10 and later). When you specify a Faiss filter for a k-NN search, the Faiss algorithm decides whether to perform an exact k-NN search with pre-filtering or an approximate search with modified post-filtering. The algorithm uses the following variables: @@ -276,13 +232,13 @@ The following flow chart outlines the Faiss algorithm. ![Faiss algorithm for filtering]({{site.url}}{{site.baseurl}}/images/faiss-algorithm.jpg) -### Using a Faiss efficient filter +## Using a Faiss efficient filter Consider an index that contains information about different shirts for an e-commerce application. You want to find the top-rated shirts that are similar to the one you already have but would like to restrict the results by shirt size. In this example, you will create an index and search for shirts that are similar to the shirt you provide. -**Step 1: Create a new index** +### Step 1: Create a new index Before you can run a k-NN search with a filter, you need to create an index with a `knn_vector` field. For this field, you need to specify `faiss` and `hnsw` as the `method` in the mapping. @@ -313,7 +269,7 @@ PUT /products-shirts ``` {% include copy-curl.html %} -**Step 2: Add data to your index** +### Step 2: Add data to your index Next, add data to your index. @@ -349,7 +305,7 @@ POST /_bulk?refresh ``` {% include copy-curl.html %} -**Step 3: Search your data with a filter** +### Step 3: Search your data with a filter Now you can create a k-NN search with filters. In the k-NN query clause, include the vector representation of the shirt that is used to search for similar ones, the number of nearest neighbors to return (`k`), and a filter by size and rating. 
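A request of this shape places the `filter` clause inside the `knn` query clause so that filtering is applied during the approximate search. The following is a minimal sketch; the field name `item_vector` and the example vector and filter values are assumptions based on the surrounding example:

```json
POST /products-shirts/_search
{
  "size": 2,
  "query": {
    "knn": {
      "item_vector": {
        "vector": [2, 4, 3],
        "k": 10,
        "filter": {
          "bool": {
            "must": [
              {
                "range": {
                  "rating": {
                    "gte": 8,
                    "lte": 10
                  }
                }
              },
              {
                "term": {
                  "size": "small"
                }
              }
            ]
          }
        }
      }
    }
  }
}
```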
@@ -446,7 +402,7 @@ The response returns the two matching documents: For more ways to construct a filter, see [Constructing a filter](#constructing-a-filter). -### Constructing a filter +## Constructing a filter There are multiple ways to construct a filter for the same condition. For example, you can use the following constructs to create a filter that returns hotels that provide parking: @@ -511,195 +467,3 @@ POST /hotels-index/_search } ``` {% include copy-curl.html %} - -## Post-filtering - -You can achieve post-filtering with a Boolean filter or by providing the `post_filter` parameter. - -### Boolean filter with ANN search - -A Boolean filter consists of a Boolean query that contains a k-NN query and a filter. For example, the following query searches for hotels that are closest to the specified `location` and then filters the results to return hotels with a rating between 8 and 10, inclusive, that provide parking: - -```json -POST /hotels-index/_search -{ - "size": 3, - "query": { - "bool": { - "filter": { - "bool": { - "must": [ - { - "range": { - "rating": { - "gte": 8, - "lte": 10 - } - } - }, - { - "term": { - "parking": "true" - } - } - ] - } - }, - "must": [ - { - "knn": { - "location": { - "vector": [ - 5, - 4 - ], - "k": 20 - } - } - } - ] - } - } -} -``` - -The response includes documents containing the matching hotels: - -```json -{ - "took" : 95, - "timed_out" : false, - "_shards" : { - "total" : 1, - "successful" : 1, - "skipped" : 0, - "failed" : 0 - }, - "hits" : { - "total" : { - "value" : 5, - "relation" : "eq" - }, - "max_score" : 0.72992706, - "hits" : [ - { - "_index" : "hotels-index", - "_id" : "3", - "_score" : 0.72992706, - "_source" : { - "location" : [ - 4.9, - 3.4 - ], - "parking" : "true", - "rating" : 9 - } - }, - { - "_index" : "hotels-index", - "_id" : "6", - "_score" : 0.3012048, - "_source" : { - "location" : [ - 6.4, - 3.4 - ], - "parking" : "true", - "rating" : 9 - } - }, - { - "_index" : "hotels-index", - "_id" : "5", - "_score" : 0.24154587, - "_source" : { - "location" : [ - 3.3, - 4.5 - ], - "parking" : "true", - "rating" : 8 - } - } - ] - } -} -``` - -### post-filter parameter - -If you use the `knn` query alongside filters or other clauses (for example, `bool`, `must`, `match`), you might receive fewer than `k` results. In this example, `post_filter` reduces the number of results from 2 to 1: - -```json -GET my-knn-index-1/_search -{ - "size": 2, - "query": { - "knn": { - "my_vector2": { - "vector": [2, 3, 5, 6], - "k": 2 - } - } - }, - "post_filter": { - "range": { - "price": { - "gte": 5, - "lte": 10 - } - } - } -} -``` - -## Scoring script filter - -A scoring script filter first filters the documents and then uses a brute-force exact k-NN search on the results. 
For example, the following query searches for hotels with a rating between 8 and 10, inclusive, that provide parking and then performs a k-NN search to return the 3 hotels that are closest to the specified `location`: - -```json -POST /hotels-index/_search -{ - "size": 3, - "query": { - "script_score": { - "query": { - "bool": { - "filter": { - "bool": { - "must": [ - { - "range": { - "rating": { - "gte": 8, - "lte": 10 - } - } - }, - { - "term": { - "parking": "true" - } - } - ] - } - } - } - }, - "script": { - "source": "knn_score", - "lang": "knn", - "params": { - "field": "location", - "query_value": [ - 5.0, - 4.0 - ], - "space_type": "l2" - } - } - } - } -} -``` -{% include copy-curl.html %} diff --git a/_vector-search/filter-search-knn/index.md b/_vector-search/filter-search-knn/index.md new file mode 100644 index 00000000000..afa018c2d8b --- /dev/null +++ b/_vector-search/filter-search-knn/index.md @@ -0,0 +1,51 @@ +--- +layout: default +title: Filtering data +nav_order: 50 +has_children: true +redirect_from: + - /search-plugins/knn/filter-search-knn/ + - /vector-search/filter-search-knn/ +--- + +# Filtering data + +To refine vector search results, you can filter a vector search using one of the following methods: + +- [Efficient k-nearest neighbors (k-NN) filtering]({{site.url}}{{site.baseurl}}/vector-search/filter-search-knn/efficient-knn-filtering/): This approach applies filtering _during_ the vector search, as opposed to before or after the vector search, which ensures that `k` results are returned (if there are at least `k` results in total). This approach is supported by the following engines: + - Lucene engine with a Hierarchical Navigable Small World (HNSW) algorithm (OpenSearch version 2.4 and later) + - Faiss engine with an HNSW algorithm (OpenSearch version 2.9 and later) or IVF algorithm (OpenSearch version 2.10 and later) + +- [Post-filtering]({{site.url}}{{site.baseurl}}/vector-search/filter-search-knn/post-filtering/): Because it is performed after the vector search, this approach may return significantly fewer than `k` results for a restrictive filter. You can use the following two filtering strategies for this approach: + - [Boolean post-filter]({{site.url}}{{site.baseurl}}/vector-search/filter-search-knn/post-filtering/#boolean-filter-with-ann-search): This approach runs an [approximate nearest neighbor (ANN)]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/) search and then applies a filter to the results. The two query parts are executed independently, and then the results are combined based on the query operator (`should`, `must`, and so on) provided in the query. + - [The `post_filter` parameter]({{site.url}}{{site.baseurl}}/vector-search/filter-search-knn/post-filtering/#the-post_filter-parameter): This approach runs an [ANN]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/) search on the full dataset and then applies the filter to the k-NN results. + +- [Scoring script filter]({{site.url}}{{site.baseurl}}/vector-search/filter-search-knn/scoring-script-filter/): This approach involves pre-filtering a document set and then running an exact k-NN search on the filtered subset. It may have high latency and does not scale when filtered subsets are large. + +The following table summarizes the preceding filtering use cases. 
+
+Filter | When the filter is applied | Type of search | Supported engines and methods | Where to place the `filter` clause
+:--- | :--- | :--- | :--- | :---
+Efficient k-NN filtering | During search (a hybrid of pre- and post-filtering) | Approximate | - `lucene` (`hnsw`) <br> - `faiss` (`hnsw`, `ivf`) | Inside the k-NN query clause.
+Boolean filter | After search (post-filtering) | Approximate | - `lucene` <br> - `faiss` <br> - `nmslib` (deprecated) | Outside the k-NN query clause. Must be a leaf clause.
+The `post_filter` parameter | After search (post-filtering) | Approximate | - `lucene` <br> - `faiss` <br> - `nmslib` (deprecated) | Outside the k-NN query clause.
+Scoring script filter | Before search (pre-filtering) | Exact | N/A | Inside the script score query clause.
+
+## Filtered search optimization
+
+Depending on your dataset and use case, you might be more interested in maximizing recall or minimizing latency. The following table provides guidance on various k-NN search configurations and the filtering methods used to optimize for higher recall or lower latency. The first three columns of the table provide several example k-NN search configurations. A search configuration consists of:
+
+- The number of documents in an index, where one OpenSearch document corresponds to one k-NN vector.
+- The percentage of documents left in the results after filtering. This value depends on the restrictiveness of the filter that you provide in the query. The most restrictive filter in the table returns 2.5% of documents in the index, while the least restrictive filter returns 80% of documents.
+- The desired number of returned results (k).
+
+Once you've estimated the number of documents in your index, the restrictiveness of your filter, and the desired number of nearest neighbors, use the following table to choose a filtering method that optimizes for recall or latency.
+
+| Number of documents in an index | Percentage of documents the filter returns | k | Filtering method to use for higher recall | Filtering method to use for lower latency |
+| :-- | :-- | :-- | :-- | :-- |
+| 10M | 2.5 | 100 | Efficient k-NN filtering/Scoring script | Scoring script |
+| 10M | 38 | 100 | Efficient k-NN filtering | Efficient k-NN filtering |
+| 10M | 80 | 100 | Efficient k-NN filtering | Efficient k-NN filtering |
+| 1M | 2.5 | 100 | Efficient k-NN filtering/Scoring script | Scoring script |
+| 1M | 38 | 100 | Efficient k-NN filtering | Efficient k-NN filtering |
+| 1M | 80 | 100 | Efficient k-NN filtering | Efficient k-NN filtering |
diff --git a/_vector-search/filter-search-knn/post-filtering.md b/_vector-search/filter-search-knn/post-filtering.md
new file mode 100644
index 00000000000..3525b0fc8cc
--- /dev/null
+++ b/_vector-search/filter-search-knn/post-filtering.md
@@ -0,0 +1,149 @@
+---
+layout: default
+title: Post-filtering
+parent: Filtering data
+nav_order: 20
+---
+
+## Post-filtering
+
+You can achieve post-filtering with a [Boolean filter](#boolean-filter-with-ann-search) or by providing [the `post_filter` parameter](#the-post_filter-parameter).
+
+### Boolean filter with ANN search
+
+A Boolean filter consists of a Boolean query that contains a k-NN query and a filter.
For example, the following query searches for hotels that are closest to the specified `location` and then filters the results to return hotels with a rating between 8 and 10, inclusive, that provide parking: + +```json +POST /hotels-index/_search +{ + "size": 3, + "query": { + "bool": { + "filter": { + "bool": { + "must": [ + { + "range": { + "rating": { + "gte": 8, + "lte": 10 + } + } + }, + { + "term": { + "parking": "true" + } + } + ] + } + }, + "must": [ + { + "knn": { + "location": { + "vector": [ + 5, + 4 + ], + "k": 20 + } + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +The response includes documents containing the matching hotels: + +```json +{ + "took" : 95, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 5, + "relation" : "eq" + }, + "max_score" : 0.72992706, + "hits" : [ + { + "_index" : "hotels-index", + "_id" : "3", + "_score" : 0.72992706, + "_source" : { + "location" : [ + 4.9, + 3.4 + ], + "parking" : "true", + "rating" : 9 + } + }, + { + "_index" : "hotels-index", + "_id" : "6", + "_score" : 0.3012048, + "_source" : { + "location" : [ + 6.4, + 3.4 + ], + "parking" : "true", + "rating" : 9 + } + }, + { + "_index" : "hotels-index", + "_id" : "5", + "_score" : 0.24154587, + "_source" : { + "location" : [ + 3.3, + 4.5 + ], + "parking" : "true", + "rating" : 8 + } + } + ] + } +} +``` + +### The post_filter parameter + +If you use the `knn` query alongside filters or other clauses (for example, `bool`, `must`, `match`), you might receive fewer than `k` results. In this example, `post_filter` reduces the number of results from 2 to 1: + +```json +GET my-knn-index-1/_search +{ + "size": 2, + "query": { + "knn": { + "my_vector2": { + "vector": [2, 3, 5, 6], + "k": 2 + } + } + }, + "post_filter": { + "range": { + "price": { + "gte": 5, + "lte": 10 + } + } + } +} +``` +{% include copy-curl.html %} \ No newline at end of file diff --git a/_vector-search/filter-search-knn/scoring-script-filter.md b/_vector-search/filter-search-knn/scoring-script-filter.md new file mode 100644 index 00000000000..aa928ec42e1 --- /dev/null +++ b/_vector-search/filter-search-knn/scoring-script-filter.md @@ -0,0 +1,57 @@ +--- +layout: default +title: Scoring script filter +parent: Filtering data +nav_order: 30 +--- + +# Scoring script filter + +A scoring script filter first filters the documents and then uses a brute-force exact k-NN search on the results. 
For example, the following query searches for hotels with a rating between 8 and 10, inclusive, that provide parking and then performs a k-NN search to return the 3 hotels that are closest to the specified `location`: + +```json +POST /hotels-index/_search +{ + "size": 3, + "query": { + "script_score": { + "query": { + "bool": { + "filter": { + "bool": { + "must": [ + { + "range": { + "rating": { + "gte": 8, + "lte": 10 + } + } + }, + { + "term": { + "parking": "true" + } + } + ] + } + } + } + }, + "script": { + "source": "knn_score", + "lang": "knn", + "params": { + "field": "location", + "query_value": [ + 5.0, + 4.0 + ], + "space_type": "l2" + } + } + } + } +} +``` +{% include copy-curl.html %} diff --git a/_vector-search/getting-started/auto-generated-embeddings.md b/_vector-search/getting-started/auto-generated-embeddings.md new file mode 100644 index 00000000000..1aabb11ab21 --- /dev/null +++ b/_vector-search/getting-started/auto-generated-embeddings.md @@ -0,0 +1,323 @@ +--- +layout: default +title: Generating embeddings automatically +parent: Getting started +nav_order: 30 +--- + +# Generating embeddings automatically + +You can generate embeddings dynamically during ingestion within OpenSearch. This method provides a simplified workflow by converting data to vectors automatically. + +OpenSearch can automatically generate embeddings from your text data using two approaches: + +- [**Manual setup**](#manual-setup) (Recommended for custom configurations): Configure each component individually for full control over the implementation. +- [**Automated workflow**](#using-automated-workflows) (Recommended for quick setup): Use defaults and workflows for quick implementation with minimal configuration. + +## Prerequisites + +For this simple setup, you'll use an OpenSearch-provided machine learning (ML) model and a cluster with no dedicated ML nodes. To ensure that this basic local setup works, send the following request to update ML-related cluster settings: + +```json +PUT _cluster/settings +{ + "persistent": { + "plugins.ml_commons.only_run_on_ml_node": "false", + "plugins.ml_commons.model_access_control_enabled": "true", + "plugins.ml_commons.native_memory_threshold": "99" + } +} +``` +{% include copy-curl.html %} + +### Choose an ML model + +Generating embeddings automatically requires configuring a language model that will convert text to embeddings both at ingestion time and query time. + +When selecting a model, you have the following options: + +- Use a pretrained model provided by OpenSearch. For more information, see [OpenSearch-provided pretrained models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/). + +- Upload your own model to OpenSearch. For more information, see [Custom local models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/custom-local-models/). + +- Connect to a foundation model hosted on an external platform. For more information, see [Connecting to remote models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index/). + +In this example, you'll use the [DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert) model from Hugging Face, which is one of the [pretrained models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/#sentence-transformers) available in OpenSearch. For more information, see [Integrating ML models]({{site.url}}{{site.baseurl}}/ml-commons-plugin/integrating-ml-models/). + +Take note of the dimensionality of the model because you'll need it when you set up a vector index. 
+{: .important} + +## Manual setup + +For more control over the configuration, you can set up each component manually using the following steps. + +### Step 1: Register and deploy the model + +To register and deploy the model, send the following request: + +```json +POST /_plugins/_ml/models/_register?deploy=true +{ + "name": "huggingface/sentence-transformers/msmarco-distilbert-base-tas-b", + "version": "1.0.3", + "model_format": "TORCH_SCRIPT" +} +``` +{% include copy-curl.html %} + +Registering a model is an asynchronous task. OpenSearch returns a task ID for this task: + +```json +{ + "task_id": "aFeif4oB5Vm0Tdw8yoN7", + "status": "CREATED" +} +``` + +You can check the status of the task by using the Tasks API: + +```json +GET /_plugins/_ml/tasks/aFeif4oB5Vm0Tdw8yoN7 +``` +{% include copy-curl.html %} + +Once the task is complete, the task state will change to `COMPLETED` and the Tasks API response will contain a model ID for the registered model: + +```json +{ + "model_id": "aVeif4oB5Vm0Tdw8zYO2", + "task_type": "REGISTER_MODEL", + "function_name": "TEXT_EMBEDDING", + "state": "COMPLETED", + "worker_node": [ + "4p6FVOmJRtu3wehDD74hzQ" + ], + "create_time": 1694358489722, + "last_update_time": 1694358499139, + "is_async": true +} +``` + +You'll need the model ID in order to use this model for several of the following steps. + +### Step 2: Create an ingest pipeline + +First, you need to create an [ingest pipeline]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/index/) that contains one processor: a task that transforms document fields before documents are ingested into an index. You'll set up a `text_embedding` processor that creates vector embeddings from text. You'll need the `model_id` of the model you set up in the previous section and a `field_map`, which specifies the name of the field from which to take the text (`text`) and the name of the field in which to record embeddings (`passage_embedding`): + +```json +PUT /_ingest/pipeline/nlp-ingest-pipeline +{ + "description": "An NLP ingest pipeline", + "processors": [ + { + "text_embedding": { + "model_id": "aVeif4oB5Vm0Tdw8zYO2", + "field_map": { + "text": "passage_embedding" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +### Step 3: Create a vector index + +Now you'll create a vector index by setting `index.knn` to `true`. In the index, the field named `text` contains an image description, and a [`knn_vector`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/) field named `passage_embedding` contains the vector embedding of the text. The vector field `dimension` must match the dimensionality of the model you configured in Step 2. Additionally, set the default ingest pipeline to the `nlp-ingest-pipeline` you created in the previous step: + + +```json +PUT /my-nlp-index +{ + "settings": { + "index.knn": true, + "default_pipeline": "nlp-ingest-pipeline" + }, + "mappings": { + "properties": { + "passage_embedding": { + "type": "knn_vector", + "dimension": 768, + "space_type": "l2" + }, + "text": { + "type": "text" + } + } + } +} +``` +{% include copy-curl.html %} + +Setting up a vector index allows you to later perform a vector search on the `passage_embedding` field. + +### Step 4: Ingest documents into the index + +In this step, you'll ingest several sample documents into the index. The sample data is taken from the [Flickr image dataset](https://www.kaggle.com/datasets/hsankesara/flickr-image-dataset). 
Each document contains a `text` field corresponding to the image description and an `id` field corresponding to the image ID: + +```json +PUT /my-nlp-index/_doc/1 +{ + "text": "A man who is riding a wild horse in the rodeo is very near to falling off ." +} +``` +{% include copy-curl.html %} + +```json +PUT /my-nlp-index/_doc/2 +{ + "text": "A rodeo cowboy , wearing a cowboy hat , is being thrown off of a wild white horse ." +} +``` +{% include copy-curl.html %} + +```json +PUT /my-nlp-index/_doc/3 +{ + "text": "People line the stands which advertise Freemont 's orthopedics , a cowboy rides a light brown bucking bronco ." +} +``` +{% include copy-curl.html %} + +### Step 5: Search the data + +Now you'll search the index using semantic search. To automatically generate vector embeddings from query text, use a `neural` query and provide the model ID of the model you set up earlier so that vector embeddings for the query text are generated with the model used at ingestion time: + +```json +GET /my-nlp-index/_search +{ + "_source": { + "excludes": [ + "passage_embedding" + ] + }, + "query": { + "neural": { + "passage_embedding": { + "query_text": "wild west", + "model_id": "aVeif4oB5Vm0Tdw8zYO2", + "k": 3 + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains the matching documents: + +```json +{ + "took": 127, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": 0.015851952, + "hits": [ + { + "_index": "my-nlp-index", + "_id": "1", + "_score": 0.015851952, + "_source": { + "text": "A man who is riding a wild horse in the rodeo is very near to falling off ." + } + }, + { + "_index": "my-nlp-index", + "_id": "2", + "_score": 0.015177963, + "_source": { + "text": "A rodeo cowboy , wearing a cowboy hat , is being thrown off of a wild white horse ." + } + }, + { + "_index": "my-nlp-index", + "_id": "3", + "_score": 0.011347729, + "_source": { + "text": "People line the stands which advertise Freemont 's orthopedics , a cowboy rides a light brown bucking bronco ." + } + } + ] + } +} +``` + +## Using automated workflows + +You can quickly set up automatic embedding generation using [_automated workflows_]({{site.url}}{{site.baseurl}}/automating-configurations/). This approach automatically creates and provisions all necessary resources. For more information, see [Workflow templates]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-templates/). + +You can use automated workflows to create and deploy externally hosted models and create resources for various AI search types. In this example, you'll create the same search you've already created following manual steps. + +### Step 1: Register and deploy the model + +To register and deploy a model, select the built-in workflow template for the model provider. For more information, see [Supported workflow templates]({{site.url}}{{site.baseurl}}/automating-configurations/workflow-templates/#supported-workflow-templates). Alternatively, to configure a custom model, use [Step 1 of the manual setup](#step-1-register-and-deploy-the-model). + +### Step 2: Configure a workflow + +Create and provision a semantic search workflow. You must provide the model ID for the configured model. 
Review your selected workflow template [defaults](https://github.com/opensearch-project/flow-framework/blob/2.13/src/main/resources/defaults/semantic-search-defaults.json) to determine whether you need to update any of the parameters. For example, if the model dimensionality is different from the default (`1024`), specify the dimensionality of your model in the `output_dimension` parameter. Change the workflow template default text field from `passage_text` to `text` in order to match the manual example: + +```json +POST /_plugins/_flow_framework/workflow?use_case=semantic_search&provision=true +{ + "create_ingest_pipeline.model_id" : "mBGzipQB2gmRjlv_dOoB", + "text_embedding.field_map.output.dimension": "768", + "text_embedding.field_map.input": "text" +} +``` +{% include copy-curl.html %} + +OpenSearch responds with a workflow ID for the created workflow: + +```json +{ + "workflow_id" : "U_nMXJUBq_4FYQzMOS4B" +} +``` + +To check the workflow status, send the following request: + +```json +GET /_plugins/_flow_framework/workflow/U_nMXJUBq_4FYQzMOS4B/_status +``` +{% include copy-curl.html %} + +Once the workflow completes, the `state` changes to `COMPLETED`. The workflow has created an ingest pipeline and an index called `my-nlp-index`: + +```json +{ + "workflow_id": "U_nMXJUBq_4FYQzMOS4B", + "state": "COMPLETED", + "resources_created": [ + { + "workflow_step_id": "create_ingest_pipeline", + "workflow_step_name": "create_ingest_pipeline", + "resource_id": "nlp-ingest-pipeline", + "resource_type": "pipeline_id" + }, + { + "workflow_step_name": "create_index", + "workflow_step_id": "create_index", + "resource_id": "my-nlp-index", + "resource_type": "index_name" + } + ] +} +``` + +You can now continue with [steps 4 and 5](#step-4-ingest-documents-into-the-index) to ingest documents into the index and search the index. + +## Next steps + +- See [Getting started with semantic and hybrid search]({{site.url}}{{site.baseurl}}/vector-search/tutorials/neural-search-tutorial/) to learn about configuring semantic and hybrid search. +- See [AI search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/) to learn about the supported types of AI search. \ No newline at end of file diff --git a/_vector-search/getting-started/concepts.md b/_vector-search/getting-started/concepts.md new file mode 100644 index 00000000000..a19134a7479 --- /dev/null +++ b/_vector-search/getting-started/concepts.md @@ -0,0 +1,75 @@ +--- +layout: default +title: Concepts +parent: Getting started +nav_order: 40 +--- + +# Concepts + +This page defines key terms and techniques related to vector search in OpenSearch. + +## Vector representations + +- [**_Vector embeddings_**]({{site.url}}{{site.baseurl}}/vector-search/getting-started/vector-search-basics/#vector-embeddings) are numerical representations of data—such as text, images, or audio—that encode meaning or features into a high-dimensional space. These embeddings enable similarity-based comparisons for search and machine learning (ML) tasks. + +- **_Dense vectors_** are high-dimensional numerical representations where most elements have nonzero values. They are typically produced by deep learning models and are used in semantic search and ML applications. + +- **_Sparse vectors_** contain mostly zero values and are often used in techniques like neural sparse search to efficiently represent and retrieve information. 
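+
+As a purely illustrative example (the field names and values below are hypothetical, not a required schema), a dense embedding is stored as a fixed-length array of mostly nonzero floating-point values, while a sparse embedding is stored as a map of tokens to weights in which most vocabulary terms are omitted:
+
+```json
+{
+  "dense_embedding": [0.12, -0.53, 0.08, 0.91, -0.27],
+  "sparse_embedding": {
+    "rodeo": 1.62,
+    "horse": 0.98,
+    "hat": 0.31
+  }
+}
+```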
+ +## Vector search fundamentals + +- [**_Vector search_**]({{site.url}}{{site.baseurl}}/vector-search/getting-started/vector-search-basics/), also known as _similarity search_ or _nearest neighbor search_, is a technique for finding items that are most similar to a given input vector. It is widely used in applications such as recommendation systems, image retrieval, and natural language processing. + +- A [**_space_**]({{site.url}}{{site.baseurl}}/vector-search/getting-started/vector-search-basics/#calculating-similarity) defines how similarity or distance between two vectors is measured. Different spaces use different distance metrics, such as Euclidean distance or cosine similarity, to determine how closely vectors resemble each other. + +- A [**_method_**]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-methods-engines/) refers to the algorithm used to organize vector data during indexing and retrieve relevant results during search in approximate k-NN search. Different methods balance trade-offs between accuracy, speed, and memory usage. + +- An [**_engine_**]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-methods-engines/) is the underlying library that implements vector search methods. It determines how vectors are indexed, stored, and retrieved during similarity search operations. + +## k-NN search + +- **_k-nearest neighbors (k-NN) search_** finds the k most similar vectors to a given query vector in an index. The similarity is determined based on a specified distance metric. + +- [**_Exact k-NN search_**]({{site.url}}{{site.baseurl}}/vector-search/vector-search-techniques/knn-score-script/) performs a brute-force comparison between a query vector and all vectors in an index, computing the exact nearest neighbors. This approach provides high accuracy but can be computationally expensive for large datasets. + +- [**_Approximate k-NN search_**]({{site.url}}{{site.baseurl}}/vector-search/vector-search-techniques/approximate-knn/) reduces computational complexity by using indexing techniques that speed up search operations while maintaining high accuracy. These methods restructure the index or reduce the dimensionality of vectors to improve performance. + +## Query types + +- A [**_k-NN query_**]({{site.url}}{{site.baseurl}}/query-dsl/specialized/k-nn/) searches vector fields using a query vector. + +- A [**_neural query_**]({{site.url}}{{site.baseurl}}/query-dsl/specialized/neural/) searches vector fields using text or image data. + +- A [**_neural sparse query_**]({{site.url}}{{site.baseurl}}/query-dsl/specialized/neural-sparse/) searches vector fields using raw text or sparse vector tokens. + +## Search techniques + +- [**_Semantic search_**]({{site.url}}{{site.baseurl}}/vector-search/ai-search/semantic-search/) interprets the intent and contextual meaning of a query rather than relying solely on exact keyword matches. This approach improves the relevance of search results, especially for natural language queries. + +- [**_Hybrid search_**]({{site.url}}{{site.baseurl}}/vector-search/ai-search/hybrid-search/) combines lexical (keyword-based) search with semantic (vector-based) search to improve search relevance. This approach ensures that results include both exact keyword matches and conceptually similar content. + +- [**_Multimodal search_**]({{site.url}}{{site.baseurl}}/vector-search/ai-search/multimodal-search/) enables you to search across multiple types of data, such as text and images. 
It allows queries in one format (for example, text) to retrieve results in another (for example, images). + +- [**_Radial search_**]({{site.url}}{{site.baseurl}}/vector-search/specialized-operations/radial-search-knn/) retrieves all vectors within a specified distance or similarity threshold from a query vector. It is useful for tasks that require finding all relevant matches within a given range rather than retrieving a fixed number of nearest neighbors. + +- [**_Neural sparse search_**]({{site.url}}{{site.baseurl}}/vector-search/ai-search/neural-sparse-search/) uses an inverted index, similar to BM25, to efficiently retrieve relevant documents based on sparse vector representations. This approach maintains the efficiency of traditional lexical search while incorporating semantic understanding. + +- [**_Conversational search_**]({{site.url}}{{site.baseurl}}/vector-search/ai-search/conversational-search/) allows you to interact with a search system using natural language queries and refine results through follow-up questions. This approach enhances the user experience by making search more intuitive and interactive. + +- [**_Retrieval-augmented generation (RAG)_**]({{site.url}}{{site.baseurl}}/vector-search/ai-search/conversational-search/#rag) enhances large language models (LLMs) by retrieving relevant information from an index and incorporating it into the model's response. This approach improves the accuracy and relevance of generated text. + +## Indexing and storage techniques + +- [**_Text chunking_**]({{site.url}}{{site.baseurl}}/vector-search/ingesting-data/text-chunking/) involves splitting long documents or text passages into smaller segments to improve search retrieval and relevance. Chunking helps vector search models process large amounts of text more effectively. + +- [**_Vector quantization_**]({{site.url}}{{site.baseurl}}/vector-search/optimizing-storage/knn-vector-quantization/) is a technique for reducing the storage size of vector embeddings by approximating them using a smaller set of representative vectors. This process enables efficient storage and retrieval in large-scale vector search applications. + +- **_Scalar quantization (SQ)_** reduces vector precision by mapping floating-point values to a limited set of discrete values, decreasing memory requirements while preserving search accuracy. + +- **_Product quantization (PQ)_** divides high-dimensional vectors into smaller subspaces and quantizes each subspace separately, enabling efficient approximate nearest neighbor search with reduced memory usage. + +- **_Binary quantization_** compresses vector representations by converting numerical values to binary formats. This technique reduces storage requirements and accelerates similarity computations. + +- [**_Disk-based vector search_**]({{site.url}}{{site.baseurl}}/vector-search/optimizing-storage/disk-based-vector-search/) stores vector embeddings on disk rather than in memory, using binary quantization to reduce memory consumption while maintaining search efficiency. + diff --git a/_vector-search/getting-started/index.md b/_vector-search/getting-started/index.md new file mode 100644 index 00000000000..11034ca0171 --- /dev/null +++ b/_vector-search/getting-started/index.md @@ -0,0 +1,202 @@ +--- +layout: default +title: Getting started +nav_order: 10 +has_children: true +has_toc: false +redirect_from: + - /vector-search/getting-started/ +--- + +# Getting started with vector search + +This guide shows you how to use your own vectors in OpenSearch. 
You'll learn to create a vector index, add location data, and run a vector search to find the nearest hotels on a coordinate plane. While this example uses two-dimensional vectors for simplicity, the same approach applies to higher-dimensional vectors used in semantic search and recommendation systems. + + +## Prerequisite: Install OpenSearch + + +<details markdown="block"> + <summary> + If you don't have OpenSearch installed, follow these steps to create a cluster. + </summary> + +Before you start, ensure that [Docker](https://docs.docker.com/get-docker/) is installed and running in your environment. <br> +This demo configuration is insecure and should not be used in production environments. +{: .note} + +Download and run OpenSearch: + +```bash +docker pull opensearchproject/opensearch:latest && docker run -it -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" -e "DISABLE_SECURITY_PLUGIN=true" opensearchproject/opensearch:latest +``` +{% include copy.html %} + +OpenSearch is now running on port 9200. To verify that OpenSearch is running, send the following request: + +```bash +curl https://localhost:9200 +``` +{% include copy.html %} + +You should get a response that looks like this: + +```json +{ + "name" : "a937e018cee5", + "cluster_name" : "docker-cluster", + "cluster_uuid" : "GLAjAG6bTeWErFUy_d-CLw", + "version" : { + "distribution" : "opensearch", + "number" : <version>, + "build_type" : <build-type>, + "build_hash" : <build-hash>, + "build_date" : <build-date>, + "build_snapshot" : false, + "lucene_version" : <lucene-version>, + "minimum_wire_compatibility_version" : "7.10.0", + "minimum_index_compatibility_version" : "7.0.0" + }, + "tagline" : "The OpenSearch Project: https://opensearch.org/" +} +``` + +For more information, see [Installation quickstart]({{site.url}}{{site.baseurl}}/getting-started/quickstart/) and [Install and upgrade OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/). + +</details> + +## Step 1: Create a vector index + +First, create an index that will store sample hotel data. To signal to OpenSearch that this is a vector index, set `index.knn` to `true`. You'll store the vectors in a vector field named `location`. The vectors you'll ingest will be two-dimensional, and the distance between vectors will be calculated using the [Euclidean `l2` similarity metric]({{site.url}}{{site.baseurl}}/vector-search/getting-started/vector-search-basics/#calculating-similarity): + +```json +PUT /hotels-index +{ + "settings": { + "index.knn": true + }, + "mappings": { + "properties": { + "location": { + "type": "knn_vector", + "dimension": 2, + "space_type": "l2" + } + } + } +} +``` +{% include copy-curl.html %} + +Vector queries usually have a `size` > 0, so by default they don't enter the request cache. In OpenSearch 2.19 or later, if your workload mostly consists of vector queries, consider increasing the dynamic `indices.requests.cache.maximum_cacheable_size` cluster setting to a larger value, such as `256`. This allows queries with a `size` of up to 256 to enter the request cache, improving performance. For more information, see [Request cache]({{site.url}}{{site.baseurl}}/search-plugins/caching/request-cache). + +## Step 2: Add data to your index + +Next, add data to your index. Each document represents a hotel. 
The `location` field in each document contains a two-dimensional vector specifying the hotel's location: + +```json +POST /_bulk +{ "index": { "_index": "hotels-index", "_id": "1" } } +{ "location": [5.2, 4.4] } +{ "index": { "_index": "hotels-index", "_id": "2" } } +{ "location": [5.2, 3.9] } +{ "index": { "_index": "hotels-index", "_id": "3" } } +{ "location": [4.9, 3.4] } +{ "index": { "_index": "hotels-index", "_id": "4" } } +{ "location": [4.2, 4.6] } +{ "index": { "_index": "hotels-index", "_id": "5" } } +{ "location": [3.3, 4.5] } +``` +{% include copy-curl.html %} + +## Step 3: Search your data + +Now search for hotels closest to the pin location `[5, 4]`. To search for the top three closest hotels, set `k` to `3`: + +```json +POST /hotels-index/_search +{ + "size": 3, + "query": { + "knn": { + "location": { + "vector": [5, 4], + "k": 3 + } + } + } +} +``` +{% include copy-curl.html %} + +The following image shows the hotels on the coordinate plane. The query point is labeled `Pin`, and each hotel is labeled with its document number. + +![Hotels on a coordinate plane]({{site.url}}{{site.baseurl}}/images/k-nn-search-hotels.png){:style="width: 400px;" class="img-centered"} + +The response contains the hotels closest to the specified pin location: + +```json +{ + "took": 1093, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": 0.952381, + "hits": [ + { + "_index": "hotels-index", + "_id": "2", + "_score": 0.952381, + "_source": { + "location": [ + 5.2, + 3.9 + ] + } + }, + { + "_index": "hotels-index", + "_id": "1", + "_score": 0.8333333, + "_source": { + "location": [ + 5.2, + 4.4 + ] + } + }, + { + "_index": "hotels-index", + "_id": "3", + "_score": 0.72992706, + "_source": { + "location": [ + 4.9, + 3.4 + ] + } + } + ] + } +} +``` + +## Generating vector embeddings automatically + +If your data isn't already in vector format, you can generate vector embeddings directly within OpenSearch. This allows you to transform text or images into their numerical representations for similarity search. For more information, see [Generating vector embeddings automatically]({{site.url}}{{site.baseurl}}/vector-search/getting-started/auto-generated-embeddings/). + +## Next steps + +- [Vector search basics]({{site.url}}{{site.baseurl}}/vector-search/getting-started/vector-search-basics/) +- [Preparing vectors]({{site.url}}{{site.baseurl}}/vector-search/getting-started/vector-search-options/) +- [Vector search with filters]({{site.url}}{{site.baseurl}}/vector-search/filter-search-knn/) +- [Generating vector embeddings automatically]({{site.url}}{{site.baseurl}}/vector-search/getting-started/auto-generated-embeddings/) \ No newline at end of file diff --git a/_vector-search/getting-started/vector-search-basics.md b/_vector-search/getting-started/vector-search-basics.md new file mode 100644 index 00000000000..cf3b6f2c456 --- /dev/null +++ b/_vector-search/getting-started/vector-search-basics.md @@ -0,0 +1,44 @@ +--- +layout: default +title: Vector search basics +parent: Getting started +nav_order: 10 +--- + +# Vector search basics + +_Vector search_, also known as _similarity search_ or _nearest neighbor search_, is a powerful technique for finding items that are most similar to a given input. 
Use cases include semantic search to understand user intent, recommendations (for example, an "other songs you might like" feature in a music application), image recognition, and fraud detection. For more background information about vector search, see [Nearest neighbor search](https://en.wikipedia.org/wiki/Nearest_neighbor_search). + +## Vector embeddings + +Unlike traditional search methods that rely on exact keyword matches, vector search uses _vector embeddings_---numerical representations of data such as text, images, or audio. These embeddings are stored as multi-dimensional vectors, capturing deeper patterns and similarities in meaning, context, or structure. For example, a large language model (LLM) can create vector embeddings from input text, as shown in the following image. + +![Generating embeddings from text]({{site.url}}{{site.baseurl}}/images/vector-search/embeddings.png) + +## Similarity search + +A vector embedding is a vector in a high-dimensional space. Its position and orientation capture meaningful relationships between objects. Vector search finds the most similar results by comparing a query vector to stored vectors and returning the closest matches. OpenSearch uses the [k-nearest neighbors (k-NN) algorithm](https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm) to efficiently identify the most similar vectors. Unlike keyword search, which relies on exact word matches, vector search measures similarity based on distance in this high-dimensional space. + +In the following image, the vectors for `Wild West` and `Broncos` are closer to each other, while both are far from `Basketball`, reflecting their semantic differences. + +![Similarity search]({{site.url}}{{site.baseurl}}/images/vector-search/vector-similarity.jpg){: width="400px"} + +To learn more about the types of vector search that OpenSearch supports, see [Vector search techniques]({{site.url}}{{site.baseurl}}/vector-search/vector-search-techniques/). + +## Calculating similarity + +Vector similarity measures how close two vectors are in a multi-dimensional space, facilitating tasks like nearest neighbor search and ranking results by relevance. OpenSearch supports multiple distance metrics (_spaces_) for calculating vector similarity: + +- **L1 (Manhattan distance):** Sums the absolute differences between vector components. +- **L2 (Euclidean distance):** Calculates the square root of the sum of squared differences, making it sensitive to magnitude. +- **L∞ (Chebyshev distance):** Considers only the maximum absolute difference between corresponding vector elements. +- **Cosine similarity:** Measures the angle between vectors, focusing on direction rather than magnitude. +- **Inner product:** Determines similarity based on vector dot products, which can be useful for ranking. +- **Hamming distance:** Counts differing elements in binary vectors. +- **Hamming bit:** Applies the same principle as Hamming distance but is optimized for binary-encoded data. + +To learn more about the distance metrics, see [Spaces]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-spaces/). 
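+
+In practice, you typically choose the distance metric when defining a vector field. The following mapping is a minimal sketch (the index name, field name, and dimension are illustrative) that selects cosine similarity by setting `space_type` to `cosinesimil`; the other metrics are selected in the same way using their space names:
+
+```json
+PUT /my-cosine-index
+{
+  "settings": {
+    "index.knn": true
+  },
+  "mappings": {
+    "properties": {
+      "my_vector": {
+        "type": "knn_vector",
+        "dimension": 3,
+        "space_type": "cosinesimil"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}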
+ +## Next steps + +- [Preparing vectors]({{site.url}}{{site.baseurl}}/vector-search/getting-started/vector-search-options/) \ No newline at end of file diff --git a/_vector-search/getting-started/vector-search-options.md b/_vector-search/getting-started/vector-search-options.md new file mode 100644 index 00000000000..f630ca96a73 --- /dev/null +++ b/_vector-search/getting-started/vector-search-options.md @@ -0,0 +1,94 @@ +--- +layout: default +title: Preparing vectors +parent: Getting started +nav_order: 20 +quickstart_cards: + - heading: "Getting started with vector search" + description: "Use raw vectors or embeddings generated outside of OpenSearch" + link: "/vector-search/getting-started/" +tutorial_cards: + - heading: "Generating embeddings automatically" + description: "Automatically convert data to embeddings within OpenSearch" + link: "/vector-search/getting-started/auto-generated-embeddings/" + - heading: "Getting started with semantic and hybrid search" + description: "Learn how to implement semantic and hybrid search" + link: "/vector-search/tutorials/neural-search-tutorial/" +pre_items: + - heading: "Generate embeddings" + description: "Generate embeddings outside of OpenSearch using your favorite embedding utility." + - heading: "Create an OpenSearch index" + description: "Create an OpenSearch index to store your embeddings." + link: "/vector-search/creating-vector-index/#storing-raw-vectors-or-embeddings-generated-outside-of-opensearch" + - heading: "Ingest embeddings" + description: "Ingest your embeddings into the index." + link: "/vector-search/ingesting-data/#raw-vector-ingestion" + - heading: "Search embeddings" + description: "Search your embeddings using vector search." + link: "/vector-search/searching-data/#searching-raw-vectors" +auto_items: + - heading: "Configure an embedding model" + description: "Configure a machine learning model that will automatically generate embeddings from your text at ingestion time and query time." + link: "/ml-commons-plugin/integrating-ml-models/" + - heading: "Create an OpenSearch index" + description: "Create an OpenSearch index to store your text." + link: "/vector-search/creating-vector-index/#converting-data-to-embeddings-during-ingestion" + - heading: "Ingest text" + description: "Ingest your text into the index." + link: "/vector-search/ingesting-data/#converting-data-to-embeddings-during-ingestion" + - heading: "Search text" + description: "Search your text using vector search. Query text is automatically converted to vector embeddings and compared to document embeddings." + link: "/vector-search/searching-data/#searching-auto-generated-embeddings" +--- + +# Preparing vectors + +In OpenSearch, you can either bring your own vectors or let OpenSearch generate them automatically from your data. Letting OpenSearch automatically generate your embeddings reduces data preprocessing effort at ingestion and search time. + +### Option 1: Bring your own raw vectors or generated embeddings + +You already have pre-computed embeddings or raw vectors from external tools or services. + - **Ingestion**: Ingest pregenerated embeddings directly into OpenSearch. + + ![Pre-generated embeddings ingestion]({{site.url}}{{site.baseurl}}/images/vector-search/raw-vector-ingest.png) + - **Search**: Perform vector search to find the vectors that are closest to a query vector. 
+ + ![Pre-generated embeddings search]({{site.url}}{{site.baseurl}}/images/vector-search/raw-vector-search.png) + +<details markdown="block"> + <summary> + Steps + </summary> + {: .fs-5 .fw-700} + +Working with embeddings generated outside of OpenSearch involves the following steps: + +{% include list.html list_items=page.pre_items%} + +</details> + +{% include cards.html cards=page.quickstart_cards %} + +### Option 2: Generate embeddings within OpenSearch + +Use this option to let OpenSearch automatically generate vector embeddings from your data using a machine learning (ML) model. + - **Ingestion**: You ingest plain data, and OpenSearch uses an ML model to generate embeddings dynamically. + + ![Auto-generated embeddings ingestion]({{site.url}}{{site.baseurl}}/images/vector-search/auto-vector-ingest.png) + - **Search**: At query time, OpenSearch uses the same ML model to convert your input data to embeddings, and these embeddings are used for vector search. + + ![Auto-generated embeddings search]({{site.url}}{{site.baseurl}}/images/vector-search/auto-vector-search.png) + +<details markdown="block"> + <summary> + Steps + </summary> + {: .fs-5 .fw-700} + +Working with text that is automatically converted to embeddings within OpenSearch involves the following steps: + +{% include list.html list_items=page.auto_items%} + +</details> + +{% include cards.html cards=page.tutorial_cards %} \ No newline at end of file diff --git a/_vector-search/index.md b/_vector-search/index.md new file mode 100644 index 00000000000..79cdb08e75a --- /dev/null +++ b/_vector-search/index.md @@ -0,0 +1,1149 @@ +--- +layout: default +title: Vector search +nav_order: 1 +has_children: false +has_toc: false +nav_exclude: true +permalink: /vector-search/ +redirect_from: + - /vector-search/index/ + - /search-plugins/vector-search/ +tutorial_cards: + - heading: "Get started with vector search" + description: "Build powerful similarity search applications using your existing vectors or embeddings" + link: "/vector-search/getting-started/" + - heading: "Generate embeddings automatically" + description: "Streamline your vector search using OpenSearch's built-in embedding generation" + link: "/vector-search/getting-started/auto-generated-embeddings/" +more_cards: + - heading: "AI search" + description: "Discover AI search, from <b>semantic</b>, <b>hybrid</b>, and <b>multimodal</b> search to <b>RAG</b>" + link: "/vector-search/ai-search/" + - heading: "Tutorials" + description: "Follow step-by-step tutorials to build AI-powered search for your applications" + link: "/vector-search/tutorials/" + - heading: "Advanced filtering" + description: "Refine search results while maintaining semantic relevance" + link: "/vector-search/filter-search-knn/" + - heading: "Memory-efficient search" + description: "Reduce memory footprint using vector compression methods" + link: "/vector-search/optimizing-storage/" + - heading: "Sparse vector support" + description: "Combine semantic understanding with traditional search efficiency using <b>neural sparse search</b>" + link: "/vector-search/ai-search/neural-sparse-search/" + - heading: "Multi-vector support" + description: "Store and search multiple vectors per document using nested fields" + link: "/vector-search/specialized-operations/nested-search-knn/" +items: + - heading: "Create an index" + description: "Create a vector index for storing your embeddings." + link: "/vector-search/creating-vector-index/" + - heading: "Ingest data" + description: "Ingest your data into the index." 
+ link: "/vector-search/ingesting-data/" + - heading: "Search data" + description: "Use raw vector search or AI-powered methods like semantic, hybrid, multimodal, or neural sparse search. Add RAG to build conversational search." + link: "/vector-search/searching-data/" +--- + +# Vector search + +OpenSearch [vector search]({{site.url}}{{site.baseurl}}/vector-search/getting-started/vector-search-basics/) provides a complete vector database solution for building efficient AI applications. Store and search vector embeddings alongside your existing data, making it easy to implement semantic search, retrieval-augmented generation (RAG), recommendation systems, and other AI-powered applications. + +## Overview + +Watch this video to learn about key vector search features in OpenSearch and discover how to use OpenSearch as a vector database through a step-by-step demo. + +{% include youtube-player.html id='oX0HMAztP8E' %} + +To follow the demo, use these steps. + +<details markdown="block"> + <summary> + Steps + </summary> + {: .fs-5 .fw-700} + +### Prerequisites +{:.no_toc} + +Download the sample data for this demo: + +```bash +wget https://amazon-pqa.s3.amazonaws.com/amazon_pqa_headsets.json +``` +{% include copy.html %} + +Prepare data for bulk indexing into OpenSearch: + +```bash +head -n 5000 amazon_pqa_headsets.json | awk '{ print "{\"index\":{\"_index\":\"neural_search_pqa\"}}"; print;}' > neural_search_amazon_pqa_headsets.json +``` +{% include copy.html %} + +Enable running machine learning (ML) models on data nodes (not recommended for production environments): + +```json +PUT /_cluster/settings +{ + "persistent": { + "plugins.ml_commons.only_run_on_ml_node": false + } +} +``` +{% include copy-curl.html %} + +### Step 1: Register and deploy a model +{:.no_toc} + +Register and deploy an ML model provided by OpenSearch: + +```json +POST /_plugins/_ml/models/_register?deploy=true +{ + "name": "huggingface/sentence-transformers/all-distilroberta-v1", + "version": "1.0.2", + "model_format": "TORCH_SCRIPT" +} +``` +{% include copy-curl.html %} + +Registering a model is an asynchronous task. OpenSearch returns a task ID for this task. Check the status of the task by using the Tasks API: + +```json +GET /_plugins/_ml/tasks/<task_id> +``` +{% include copy-curl.html %} + +Once the task is complete, the task state will change to `COMPLETED` and the Tasks API response will contain a model ID for the registered model. Note the model ID; you'll use it in the following steps. 
+ +### Step 2: Create an ingest pipeline +{:.no_toc} + +Create an ingest pipeline that will generate vector embeddings from text: + +```json +PUT _ingest/pipeline/nlp-index-pipeline +{ + "processors" : [ + { + "text_embedding": { + "model_id": "<model_id>", + "field_map": { + "question_text": "question_vector" + } + } + } + ] +} +``` +{% include copy-curl.html %} + +Test the ingest pipeline: + +```json +POST /_plugins/_ml/_predict/text_embedding/<model_id> +{ + "text_docs":[ "what does the package contain?"], + "return_number": true, + "target_response": ["sentence_embedding"] +} +``` +{% include copy-curl.html %} + +### Step 3: Create an index +{:.no_toc} + +Create a vector index and set the default ingest pipeline to the ingest pipeline created in the previous step: + +```json +PUT /neural_search_pqa +{ + "settings": { + "index.knn": true, + "default_pipeline": "nlp-index-pipeline" + }, + "mappings": { + "properties": { + "question_vector": { + "type": "knn_vector", + "dimension": 768 + } + } + } +} +``` +{% include copy-curl.html %} + +### Step 4: Ingest data +{:.no_toc} + +Ingest the data you prepared in the [Prerequisites](#prerequisites) section: + +```bash +curl -XPOST -u "<username>:<password>" -k https://localhost:9200/_bulk --data-binary @neural_search_amazon_pqa_headsets.json -H 'Content-Type: application/json' +``` +{% include copy.html %} + +If you're not running the Security plugin, omit the username and password: + +```bash +curl -XPOST http://localhost:9200/_bulk --data-binary @neural_search_amazon_pqa_headsets.json -H 'Content-Type: application/json' +``` +{% include copy.html %} + +Test the vector generation: + +```json +GET /neural_search_pqa/_search +``` +{% include copy-curl.html %} + +### Step 5: Search the data +{:.no_toc} + +Now search the data using the following search methods. 
+ +#### Semantic search + +To run a semantic search, send the following request: + +```json +GET /neural_search_pqa/_search +{ + "size": 5, + "query": { + "neural": { + "question_vector": { + "query_text": "what does the package contain?", + "model_id": "<model_id>", + "k": 5 + } + } + } +} +``` +{% include copy-curl.html %} + +#### Raw vector search + +To run a raw vector search using test embeddings, send the following request: + +<details markdown="block"> + <summary> + Request + </summary> + +```json +GET /neural_search_pqa/_search +{ + "query": { + "knn": { + "question_vector": { + "vector": [ + 0.002710069, + -0.009941524, + -0.010563275, + -0.0010122135, + -0.01606663, + 0.035004564, + -0.024301449, + 0.036937017, + 0.0021445795, + -0.018301377, + 0.028222118, + 0.03426478, + 0.06526259, + -0.11439706, + -0.05570727, + -0.013401183, + 0.07173271, + -0.008754317, + -0.003892538, + -0.04069254, + -0.007873223, + 0.043676812, + 0.07628463, + 0.006414452, + 0.017962739, + 0.015939584, + 0.0035662137, + -0.025271492, + 0.0003880734, + -0.07922912, + -0.055034645, + -0.005235041, + 0.016212236, + -0.0027856824, + 0.015833888, + -0.008724626, + 0.07955987, + -0.015250193, + 0.043985505, + 0.0161295, + 0.043298006, + 0.045120195, + 0.0008796525, + 0.025070759, + 0.02620675, + 0.0008109898, + 0.03925882, + 0.0014451992, + -0.0106107555, + 0.01826351, + 0.03323938, + -0.045674287, + -0.0070893173, + 0.022116413, + -0.04267077, + -0.07391224, + -0.007829025, + -0.027157241, + 0.02210903, + 0.03281591, + 0.03863423, + 0.019042324, + -0.008937828, + -0.00822864, + -0.0013345153, + -0.012705528, + 0.024063895, + 0.06755618, + -0.026645413, + -0.044332504, + -0.009713288, + 0.07448414, + -0.037496917, + -0.059190735, + 0.00071719656, + 0.054966882, + -0.014735149, + -0.012903547, + -0.07329577, + 0.032558594, + -0.0065674637, + 0.030938147, + -0.000380445, + 0.03772217, + 0.065343246, + -0.03851167, + 0.021905331, + -0.031275578, + -0.03284647, + -0.0039149136, + 0.033011954, + -0.015860643, + 0.056815848, + 0.018801196, + 0.036051515, + 0.030969055, + -0.06881828, + -0.07299447, + 0.011791604, + 0.036003478, + 0.085550085, + -0.030811753, + 0.008854608, + -0.00115729, + 0.058123615, + 0.031589605, + -0.04637206, + 0.052185714, + -0.008147512, + -0.009668442, + -0.020753473, + -0.044140838, + 0.007126401, + 0.018284583, + 0.026957503, + -0.06066957, + 0.005663597, + -0.00054079125, + -0.007547787, + 0.038137276, + 0.029036777, + -0.050400596, + -0.04595853, + 0.019300641, + 0.0750706, + 0.06053001, + 0.05319831, + -0.040328506, + -0.026151964, + 0.017703054, + -0.009880278, + -0.02431335, + -0.016003195, + 0.017467672, + -0.028064456, + 0.010797431, + 0.04620068, + -0.035007767, + -0.05585064, + 0.053512778, + 0.033208907, + 0.008550426, + -0.0388121, + -0.043947462, + 0.041298136, + 0.00632402, + 0.050902393, + 0.025355011, + 0.049950752, + 0.05057344, + -0.030225132, + 0.068390064, + 0.011451242, + 0.022812577, + -0.04050082, + 0.04564967, + 0.02095755, + -0.008775425, + 0.02742215, + 0.0045154644, + -0.022773914, + -0.023864053, + 0.048423547, + -0.02743273, + 0.023161013, + -0.085432865, + -0.027781866, + 0.045083255, + -0.024330953, + 0.051298082, + -0.014561553, + 0.019947212, + -0.04762156, + -0.08161497, + -0.02915204, + -0.05000734, + 0.016844928, + 0.06842721, + -0.07254415, + 0.023711553, + -0.065741085, + -0.02294238, + 0.026964355, + 0.023867974, + -0.036694836, + 0.031053912, + -0.029109096, + 0.03979944, + 0.0066577485, + -0.04632492, + -0.002852599, + 0.104205936, + -0.0015289283, 
+ -0.0031528969, + -0.067211226, + 0.038498618, + -0.044048615, + 0.07784984, + -0.00019098066, + -0.073304884, + -0.025518911, + -0.044625603, + -0.015586972, + 0.029835561, + 0.012194141, + -0.015629057, + -0.020035604, + -0.06611267, + -0.011576042, + -0.018833332, + -0.0058776387, + 0.0015687104, + 0.042071432, + 0.035765655, + 0.036961976, + -0.06410254, + 0.0069225053, + 0.009306832, + -0.033220366, + -0.0011623797, + -0.05273565, + -0.05313439, + 0.0040645716, + 0.015500928, + -0.031550664, + 0.052280493, + 0.0037078348, + -0.021173084, + 0.0150960395, + 0.078733385, + 0.0028686044, + -0.005216703, + -0.0036014854, + 0.050795995, + -0.041090492, + -0.04149299, + -0.042463295, + 0.004432829, + 0.019274198, + 0.02163699, + -0.009603396, + -0.0049729077, + -0.04318596, + -0.087209016, + -0.018899467, + -0.010470672, + -0.030606175, + 0.002642825, + 0.0075506642, + 0.021283865, + 0.02029468, + -0.020240186, + 0.021211915, + 0.013999255, + 0.061195884, + 0.04166171, + -0.052985657, + -0.025418852, + 0.053535376, + 0.0052670254, + 0.00996464, + 0.022772988, + -0.0067050382, + 0.011592934, + 0.00048262937, + 0.056712538, + 0.04335854, + -0.018352322, + 0.021396462, + -0.062193274, + -0.07501798, + -0.043138392, + 0.029762914, + 0.0022764541, + -0.021794599, + 0.020765148, + 0.09824474, + -0.0021401478, + 0.07763454, + -0.0071393973, + 0.048322372, + -0.0068628914, + -0.01169711, + 0.0369351, + 0.056131776, + 0.007255264, + 0.014164492, + 0.047250435, + 0.037673194, + -0.032006253, + 0.0064754435, + -0.029092291, + 0.10371859, + -0.04414858, + -0.04181647, + 0.031237667, + 0.06330435, + 0.0009903753, + 0.015501904, + -0.043972794, + -0.07873341, + -0.034613512, + 0.0045046876, + 0.02307906, + 0.000025955713, + -0.026988667, + -0.021876179, + -0.061864477, + -0.03174992, + -0.020722676, + -0.013450134, + -0.07542003, + 0.032319948, + -0.024602456, + -0.0333397, + 0.012231298, + 0.041405365, + 0.038915142, + -0.015581544, + -0.019906731, + 0.05896227, + -0.041462217, + -0.017148478, + 0.026938373, + 0.016844902, + 0.04285087, + -0.017774548, + 0.020407137, + -0.051100556, + 0.020812236, + 0.07045972, + -0.0051538153, + 0.0011321488, + -0.011617311, + 0.022422142, + -0.118273415, + 0.036936108, + -0.0006845923, + -0.020841764, + -0.03182234, + 0.057517555, + -0.033479884, + -0.027451057, + -0.043103144, + 0.008880055, + -0.041282106, + 0.055030968, + -0.04702203, + 0.056501582, + 0.014168417, + 0.02385893, + -0.015406, + 0.02182121, + -0.016413651, + -0.010580059, + -0.032921027, + 0.0029189822, + -0.02338612, + -0.022606278, + 0.04826292, + -0.004382977, + 0.025545042, + 0.02886143, + -0.060381353, + -0.028612776, + -0.07493492, + 0.00719094, + 0.015079185, + -0.042235136, + -0.01738928, + -0.0015764751, + 0.0080654705, + 0.00045899878, + 0.02290927, + -0.044065766, + -0.027154867, + 0.019949641, + 0.024834728, + 0.035529647, + -0.02206892, + 0.010913105, + 0.010024395, + -0.029580403, + 0.02561486, + -0.009437026, + 0.031584535, + -0.03349992, + 0.017479446, + 0.03321881, + 0.04470709, + -0.051657267, + 0.014068284, + 0.028261097, + 0.006924192, + 0.015599272, + 0.024204262, + 0.017719362, + -0.009957364, + 0.042847835, + -0.023584707, + 0.045098092, + -0.023444502, + -0.0037809366, + -0.03454478, + 0.021056872, + -0.043912865, + -0.0390931, + 0.009994628, + -0.045420606, + -0.010205209, + 0.0022059593, + -0.0064243795, + 0.0058772936, + -0.01227864, + -0.028449906, + 0.05086825, + 0.011771748, + 0.029447777, + -0.00488326, + -0.00972601, + -0.0038806763, + 0.012304249, + 0.048176277, + 
-0.044568717, + -0.046164848, + -0.040474243, + -0.010306429, + 0.0070577585, + 0.050434314, + -0.047979098, + -0.032600895, + 0.004446253, + 0.043626312, + 0.006991633, + -0.008693645, + 0.03655107, + -0.010262025, + 0.061423175, + -0.041305497, + 0.049218614, + 0.024470096, + 0.008277926, + 0.023871863, + -0.0680525, + -0.01373448, + -0.019403461, + 0.01457673, + 0.020989386, + -0.012840103, + 0.04480477, + -0.012785204, + 0.05274674, + 0.00044528328, + -0.03250745, + -0.034448665, + -0.021306505, + -0.006346044, + 0.03572138, + -0.005664647, + 0.007930765, + 0.05546037, + 0.08555072, + 0.0052049863, + 0.005712941, + 0.0069970684, + -0.07032658, + -0.021292446, + -0.043971684, + 0.033561017, + 0.0078121717, + -0.01232355, + 0.04682774, + -0.012410457, + -0.024060972, + 0.026366811, + 0.02424469, + -0.003813699, + 0.007787949, + 0.030725611, + -0.018421294, + 0.024292007, + 0.02683838, + 0.018937135, + 0.024167754, + -0.012694116, + -0.04747225, + -0.018581947, + 0.04490841, + 0.010850694, + 0.013474754, + -0.053915884, + -0.0157288, + -0.035485156, + 0.002554162, + 1.9480496e-33, + 0.026267078, + -0.0005050934, + 0.056276474, + -0.04939255, + -0.042061917, + 0.017516103, + -0.0347885, + 0.0056415154, + 0.028010717, + 0.037564415, + -0.010455965, + -0.0016442607, + 0.01223653, + -0.0033323513, + 0.04782389, + 0.016800124, + -0.07022924, + -0.06512625, + -0.0020572834, + -0.01184387, + 0.02217141, + -0.024825176, + -0.0015173266, + -0.0269819, + 0.019096063, + 0.017777557, + 0.017873168, + 0.039785545, + -0.046805847, + 0.021698391, + -0.06269843, + 0.019622149, + 0.007864404, + 0.008894206, + 0.0038650148, + 0.042388596, + -0.009941635, + -0.023884028, + -0.035126317, + 0.0005930202, + 0.006001224, + -0.024304975, + -0.025708912, + 0.04936831, + 0.0016331291, + -0.040760614, + 0.030479766, + 0.05206152, + -0.00443369, + 0.10088473, + 0.011507102, + -0.023531357, + -0.040234685, + -0.01877001, + 0.009172026, + -0.03114441, + -0.04349409, + -0.017874151, + 0.034953598, + -0.008358288, + 0.018915119, + 0.07711077, + 0.023954341, + 0.002415601, + 0.008599011, + 0.010966408, + 0.060247257, + -0.0024354062, + 0.029591061, + -0.028959572, + -0.036631253, + -0.021705143, + 0.030625504, + -0.0047654426, + 0.014964073, + 0.037887104, + 0.015323633, + 0.037921626, + -0.025576469, + 0.055206805, + -0.029262222, + -0.01962374, + -0.03655967, + 0.027075786, + -0.081109434, + 0.02449199, + -0.0011163651, + 0.023110788, + 0.027611898, + 0.008880572, + -0.016672952, + 0.054573104, + 0.0668384, + 0.0016800691, + -0.026792923, + -0.007083326, + -0.02166146, + -0.05414477, + 0.034420814, + -0.014911138, + -0.015938187, + 0.0024109697, + 0.018606238, + -0.0068018483, + 0.007229771, + -0.07069912, + 0.005073739, + -0.02377225, + 0.025782589, + -0.023521125, + -0.009433753, + 0.001846642, + 0.039006367, + 0.058460444, + 0.0073873056, + 0.007734639, + 0.04332041, + -0.02951278, + -0.025803477, + 0.046294205, + 0.02037022, + 0.017971495, + -0.07894564, + 0.035865154, + -0.0019950685, + 0.0058006193, + -0.016100215, + -0.032027755, + -0.015766902, + 0.0036303538, + 0.036353722, + -0.012345974, + -0.052974723, + -0.018639334, + -0.023760993, + -0.039711308, + 0.011242891, + 0.019980058, + 0.0056355395, + -0.034353167, + 0.035260357, + 0.0017268837, + 0.026457984, + -0.027261587, + -0.0083769085, + 0.013137794, + 0.06074834, + -0.03966026, + 0.015282993, + -0.03137165, + -0.0018508149, + 0.0006249257, + -0.088941485, + -0.016475422, + -0.061206434, + 0.02161922, + 0.04977918, + -0.012738911, + 0.029521877, + 
0.019252038, + 0.0060790903, + -0.019414661, + -0.0037854896, + 0.0035633324, + 0.0012202597, + -0.0025355266, + -0.013203971, + 0.03394517, + 0.055446833, + -0.056813966, + -0.017438352, + -0.0025512646, + 0.0015061953, + -0.014893743, + 0.01575938, + 0.0137350615, + 0.021631295, + -0.011761018, + 0.003874792, + -0.033888955, + 0.034087986, + 0.007129588, + -0.054342985, + -0.08680173, + -0.002967837, + 0.025510576, + 0.021943994, + 0.012099311, + -0.04670378, + -0.0052654264, + -0.018963156, + 0.041973554, + -0.028053606, + -0.08092634, + 0.01265107, + -0.054788973, + 0.09400683, + -0.06417367, + -0.027034711, + -0.039408244, + 0.023176627, + -0.01461873, + 0.03884634, + -0.036304634, + -0.017949235, + -0.057132546, + 0.01646405, + 0.0404744, + -0.0027004834, + -0.00041886698, + -0.0028203563, + 0.008831913, + -0.0040895687, + -0.012310025, + 0.05664932, + 0.017413152, + 0.0068459054, + 0.018910537, + 0.019317543, + 0.0020133136, + -0.017052755, + 0.005844975, + 0.010338119, + 0.020037401, + 0.013349168, + -0.05482043, + -0.066234104, + -0.02689704, + -0.035874642, + -0.050699547, + -0.05060031, + -0.04085721, + -0.027676092, + -0.0981729, + -0.02701008, + 0.050626777, + 0.04092506, + 0.029677482, + 0.05753057, + 0.10218166, + 0.024896685, + -0.030231407, + -0.04353669, + -0.005995228, + -0.0033289846, + 0.029730862, + -0.10618225, + 0.020681499, + -0.024290795, + 0.022039287, + 0.043326188, + -0.05395758, + -0.025439745, + 0.03492537, + -0.027676322, + -0.00053507305, + 0.02218165, + 0.09227446, + -0.023444649, + -0.06172415, + 0.018731289, + -0.01790614, + 0.006927564, + -0.025528973, + -0.009136651, + -0.009685557, + 0.017786622, + 0.023883764, + 0.011552316, + 0.06438146, + 0.0033594605, + 0.022067433, + -0.035531327 + ], + "k": 5 + } + } + } +} +``` +{% include copy-curl.html %} +</details> + +#### Lexical search + +To run a lexical search, send the following request: + +```json +GET /neural_search_pqa/_search +{ + "query": { + "match": { + "question_text": "what does the package contain?" + } + } +} +``` +{% include copy-curl.html %} + +#### Hybrid search + +Create a search pipeline for hybrid search: + +```json +PUT /_search/pipeline/hybrid-search-pipeline +{ + "phase_results_processors": [ + { + "normalization-processor": { + "normalization": { + "technique": "min_max" + }, + "combination": { + "technique": "arithmetic_mean", + "parameters": { + "weights": [ + 0.3, + 0.7 + ] + } + } + } + } + ] +} +``` +{% include copy-curl.html %} + +Set this pipeline as the default search pipeline for the index: + +```json +PUT /neural_search_pqa/_settings +{ + "index.search.default_pipeline": "hybrid-search-pipeline" +} +``` +{% include copy-curl.html %} + +To run a hybrid search, send the following request: + +```json +GET /neural_search_pqa/_search +{ + "_source": "question_text", + "query": { + "hybrid": { + "queries": [ + { + "match": { + "question_text":"what does the package contain?" 
+ } + }, + { + "neural": { + "question_vector": { + "query_text": "what does the package contain?", + "model_id": "<model_id>", + "k": 5 + } + } + } + ] + } + } +} +``` +{% include copy-curl.html %} + +### Clean up +{:.no_toc} + +Undeploy the model: + +```json +POST /_plugins/_ml/models/<model_id>/_undeploy +``` +{% include copy-curl.html %} + +Delete the model: + +```json +DELETE /_plugins/_ml/models/<model_id> +``` +{% include copy-curl.html %} + +Delete the index: + +```json +DELETE /neural_search_pqa +``` +{% include copy-curl.html %} + +</details> + +## Getting started + +You can bring your own vectors or let OpenSearch generate embeddings automatically from your data. See [Preparing vectors]({{site.url}}{{site.baseurl}}/vector-search/getting-started/vector-search-options/). +{: .info } + +{% include cards.html cards=page.tutorial_cards %} + +{% include list.html list_items=page.items%} + +<span class="centering-container"> +[Get started]({{site.url}}{{site.baseurl}}/vector-search/getting-started/){: .btn-dark-blue} +</span> + +## Build your solution + +{% include cards.html cards=page.more_cards %} \ No newline at end of file diff --git a/_vector-search/ingesting-data/index.md b/_vector-search/ingesting-data/index.md new file mode 100644 index 00000000000..2a978de6269 --- /dev/null +++ b/_vector-search/ingesting-data/index.md @@ -0,0 +1,86 @@ +--- +layout: default +title: Ingesting data +nav_order: 30 +has_children: true +has_toc: false +redirect_from: + - /vector-search/ingesting-data/ +--- + +# Ingesting data into a vector index + +After creating a vector index, you need to either ingest raw vector data or convert data to embeddings while ingesting it. + +## Comparison of ingestion methods + +The following table compares the two ingestion methods. + +| Feature | Data format | Ingest pipeline | Vector generation | Additional fields | +|-------------------------------|----------------------------|---------------------|---------------------------------|-----------------------------------| +| **Raw vector ingestion** | Pre-generated vectors | Not required | External | Optional metadata | +| **Converting data to embeddings during ingestion** | Text or image data | Required | Internal (during ingestion) | Original data + embeddings | + +## Raw vector ingestion + +When working with raw vectors or embeddings generated outside of OpenSearch, you directly ingest vector data into the `knn_vector` field. 
No pipeline is required because the vectors are already generated: + +```json +PUT /my-raw-vector-index/_doc/1 +{ + "my_vector": [0.1, 0.2, 0.3], + "metadata": "Optional additional information" +} +``` +{% include copy-curl.html %} + +You can also use the [Bulk API]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/) to ingest multiple vectors efficiently: + +```json +PUT /_bulk +{"index": {"_index": "my-raw-vector-index", "_id": 1}} +{"my_vector": [0.1, 0.2, 0.3], "metadata": "First item"} +{"index": {"_index": "my-raw-vector-index", "_id": 2}} +{"my_vector": [0.2, 0.3, 0.4], "metadata": "Second item"} +``` +{% include copy-curl.html %} + +## Converting data to embeddings during ingestion + +After you have [configured an ingest pipeline]({{site.url}}{{site.baseurl}}/vector-search/creating-vector-index/#converting-data-to-embeddings-during-ingestion) that automatically generates embeddings, you can ingest text data directly into your index: + +```json +PUT /my-ai-search-index/_doc/1 +{ + "input_text": "Example: AI search description" +} +``` +{% include copy-curl.html %} + +The pipeline automatically generates and stores the embeddings in the `output_embedding` field. + +You can also use the [Bulk API]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/) to ingest multiple documents efficiently: + +```json +PUT /_bulk +{"index": {"_index": "my-ai-search-index", "_id": 1}} +{"input_text": "Example AI search description"} +{"index": {"_index": "my-ai-search-index", "_id": 2}} +{"input_text": "Bulk API operation description"} +``` +{% include copy-curl.html %} + +## Working with sparse vectors + +OpenSearch also supports sparse vectors. For more information, see [Neural sparse search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/neural-sparse-search/). + +## Text chunking + +For information about splitting large documents into smaller passages before generating embeddings during dense or sparse AI search, see [Text chunking]({{site.url}}{{site.baseurl}}/vector-search/ingesting-data/text-chunking/). + +## Next steps + +- [Searching vector data]({{site.url}}{{site.baseurl}}/vector-search/searching-data/) +- [Bulk API]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/) +- [Ingest pipelines]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/index/) +- [Text embedding processor]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/processors/text-embedding/) \ No newline at end of file diff --git a/_search-plugins/text-chunking.md b/_vector-search/ingesting-data/text-chunking.md similarity index 78% rename from _search-plugins/text-chunking.md rename to _vector-search/ingesting-data/text-chunking.md index b66cfeda61c..c011ee26f29 100644 --- a/_search-plugins/text-chunking.md +++ b/_vector-search/ingesting-data/text-chunking.md @@ -1,13 +1,18 @@ --- layout: default title: Text chunking -nav_order: 65 +parent: Ingesting data +nav_order: 80 +redirect_from: + - /search-plugins/text-chunking/ --- # Text chunking Introduced 2.13 {: .label .label-purple } +When working with large text documents in [AI search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/), it's often necessary to split them into smaller passages because most embedding models have token length limitations. This process, called _text chunking_, helps maintain the quality and relevance of vector search results by ensuring that each embedding represents a focused piece of content that fits within model constraints. 
+ To split long text into passages, you can use a `text_chunking` processor as a preprocessing step for a `text_embedding` or `sparse_encoding` processor in order to obtain embeddings for each chunked passage. For more information about the processor parameters, see [Text chunking processor]({{site.url}}{{site.baseurl}}/ingest-pipelines/processors/text-chunking/). Before you start, follow the steps outlined in the [pretrained model documentation]({{site.url}}{{site.baseurl}}/ml-commons-plugin/pretrained-models/) to register an embedding model. The following example preprocesses text by splitting it into passages and then produces embeddings using the `text_embedding` processor. ## Step 1: Create a pipeline @@ -48,7 +53,7 @@ PUT _ingest/pipeline/text-chunking-embedding-ingest-pipeline ## Step 2: Create an index for ingestion -In order to use the ingest pipeline, you need to create a k-NN index. The `passage_chunk_embedding` field must be of the `nested` type. The `knn.dimension` field must contain the number of dimensions for your model: +In order to use the ingest pipeline, you need to create a vector index. The `passage_chunk_embedding` field must be of the `nested` type. The `knn.dimension` field must contain the number of dimensions for your model: ```json PUT testindex @@ -90,7 +95,7 @@ POST testindex/_doc?pipeline=text-chunking-embedding-ingest-pipeline ``` {% include copy-curl.html %} -## Step 4: Search the index using neural search +## Step 4: Search the index You can use a `nested` query to perform vector search on your index. We recommend setting `score_mode` to `max`, where the document score is set to the highest score out of all passage embeddings: @@ -114,3 +119,7 @@ GET testindex/_search } ``` {% include copy-curl.html %} + +## Next steps + +- Explore our [tutorials]({{site.url}}{{site.baseurl}}/vector-search/tutorials/) to learn how to build AI search applications. diff --git a/_vector-search/llm-frameworks.md b/_vector-search/llm-frameworks.md new file mode 100644 index 00000000000..4444e11150e --- /dev/null +++ b/_vector-search/llm-frameworks.md @@ -0,0 +1,25 @@ +--- +layout: default +title: LLM framework integration +nav_order: 75 +--- + +# LLM framework integration + +Several popular large language model (LLM) frameworks integrate with OpenSearch as a vector store, enabling you to build production-ready generative AI applications. 
These frameworks provide high-level abstractions and tools for working with LLMs, and their OpenSearch integrations allow you to use OpenSearch for efficient vector storage, retrieval, and similarity search: + +- LangChain + - [Semantic cache](https://python.langchain.com/docs/integrations/llm_caching/#opensearch-semantic-cache) + - [Vector store support](https://python.langchain.com/docs/integrations/vectorstores/opensearch/) + +- LlamaIndex + - [Vector store support](https://docs.llamaindex.ai/en/stable/examples/vector_stores/OpensearchDemo/) + +- FlowiseAI: + - [Vector store support](https://docs.flowiseai.com/integrations/langchain/vector-stores/opensearch) + +- Langflow: + - [Vector store support](https://docs.langflow.org/components-vector-stores#opensearch) + +- Haystack: + - [Vector store support](https://haystack.deepset.ai/integrations/opensearch-document-store) \ No newline at end of file diff --git a/_vector-search/optimizing-storage/binary-quantization.md b/_vector-search/optimizing-storage/binary-quantization.md new file mode 100644 index 00000000000..514003cd010 --- /dev/null +++ b/_vector-search/optimizing-storage/binary-quantization.md @@ -0,0 +1,204 @@ +--- +layout: default +title: Binary quantization +parent: Vector quantization +grand_parent: Optimizing vector storage +nav_order: 40 +has_children: false +has_math: true +--- + +# Binary quantization + +Starting with version 2.17, OpenSearch supports binary quantization (BQ) with binary vector support for the Faiss engine. BQ compresses vectors into a binary format (0s and 1s), making it highly efficient in terms of memory usage. You can choose to represent each vector dimension using 1, 2, or 4 bits, depending on the desired precision. One of the advantages of using BQ is that the training process is handled automatically during indexing. This means that no separate training step is required, unlike other quantization techniques such as PQ. + +## Using BQ + +To configure BQ for the Faiss engine, define a `knn_vector` field and specify the `mode` as `on_disk`. This configuration defaults to 1-bit BQ and both `ef_search` and `ef_construction` set to `100`: + +```json +PUT my-vector-index +{ + "settings" : { + "index": { + "knn": true + } + }, + "mappings": { + "properties": { + "my_vector_field": { + "type": "knn_vector", + "dimension": 8, + "space_type": "l2", + "data_type": "float", + "mode": "on_disk" + } + } + } +} +``` +{% include copy-curl.html %} + +To further optimize the configuration, you can specify additional parameters, such as the compression level, and fine-tune the search parameters. For example, you can override the `ef_construction` value or define the compression level, which corresponds to the number of bits used for quantization: + +- **32x compression** for 1-bit quantization +- **16x compression** for 2-bit quantization +- **8x compression** for 4-bit quantization + +This allows for greater control over memory usage and recall performance, providing flexibility to balance between precision and storage efficiency. 
+ +To specify the compression level, set the `compression_level` parameter: + +```json +PUT my-vector-index +{ + "settings" : { + "index": { + "knn": true + } + }, + "mappings": { + "properties": { + "my_vector_field": { + "type": "knn_vector", + "dimension": 8, + "space_type": "l2", + "data_type": "float", + "mode": "on_disk", + "compression_level": "16x", + "method": { + "name": "hnsw", + "engine": "faiss", + "parameters": { + "ef_construction": 16 + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +The following example further fine-tunes the configuration by defining `ef_construction`, `encoder`, and the number of `bits` (which can be `1`, `2`, or `4`): + +```json +PUT my-vector-index +{ + "settings" : { + "index": { + "knn": true + } + }, + "mappings": { + "properties": { + "my_vector_field": { + "type": "knn_vector", + "dimension": 8, + "method": { + "name": "hnsw", + "engine": "faiss", + "space_type": "l2", + "parameters": { + "m": 16, + "ef_construction": 512, + "encoder": { + "name": "binary", + "parameters": { + "bits": 1 + } + } + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Search using binary quantized vectors + +You can perform a vector search on your index by providing a vector and specifying the number of nearest neighbors (k) to return: + +```json +GET my-vector-index/_search +{ + "size": 2, + "query": { + "knn": { + "my_vector_field": { + "vector": [1.5, 5.5, 1.5, 5.5, 1.5, 5.5, 1.5, 5.5], + "k": 10 + } + } + } +} +``` +{% include copy-curl.html %} + +You can also fine-tune search by providing the `ef_search` and `oversample_factor` parameters. +The `oversample_factor` parameter controls the factor by which the search oversamples the candidate vectors before ranking them. Using a higher oversample factor means that more candidates will be considered before ranking, improving accuracy but also increasing search time. When selecting the `oversample_factor` value, consider the trade-off between accuracy and efficiency. For example, setting the `oversample_factor` to `2.0` will double the number of candidates considered during the ranking phase, which may help achieve better results. + +The following request specifies the `ef_search` and `oversample_factor` parameters: + +```json +GET my-vector-index/_search +{ + "size": 2, + "query": { + "knn": { + "my_vector_field": { + "vector": [1.5, 5.5, 1.5, 5.5, 1.5, 5.5, 1.5, 5.5], + "k": 10, + "method_parameters": { + "ef_search": 10 + }, + "rescore": { + "oversample_factor": 10.0 + } + } + } + } +} +``` +{% include copy-curl.html %} + + +## HNSW memory estimation + +The memory required for the Hierarchical Navigable Small World (HNSW) graph can be estimated as `1.1 * (dimension + 8 * m)` bytes/vector, where `m` is the maximum number of bidirectional links created for each element during the construction of the graph. + +As an example, assume that you have 1 million vectors with a dimension of 256 and an `m` of 16. The following sections provide memory requirement estimations for various compression values. + +### 1-bit quantization (32x compression) + +In 1-bit quantization, each dimension is represented using 1 bit, equivalent to a 32x compression factor. The memory requirement can be estimated as follows: + +```r +Memory = 1.1 * ((256 * 1 / 8) + 8 * 16) * 1,000,000 + ~= 0.176 GB +``` + +### 2-bit quantization (16x compression) + +In 2-bit quantization, each dimension is represented using 2 bits, equivalent to a 16x compression factor. 
The memory requirement can be estimated as follows: + +```r +Memory = 1.1 * ((256 * 2 / 8) + 8 * 16) * 1,000,000 + ~= 0.211 GB +``` + +### 4-bit quantization (8x compression) + +In 4-bit quantization, each dimension is represented using 4 bits, equivalent to an 8x compression factor. The memory requirement can be estimated as follows: + +```r +Memory = 1.1 * ((256 * 4 / 8) + 8 * 16) * 1,000,000 + ~= 0.282 GB +``` + +## Next steps + +- [Memory-optimized vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized/) +- [k-NN query]({{site.url}}{{site.baseurl}}/query-dsl/specialized/k-nn/) \ No newline at end of file diff --git a/_search-plugins/knn/disk-based-vector-search.md b/_vector-search/optimizing-storage/disk-based-vector-search.md similarity index 69% rename from _search-plugins/knn/disk-based-vector-search.md rename to _vector-search/optimizing-storage/disk-based-vector-search.md index 8fe794f44c3..72dcf8e4c21 100644 --- a/_search-plugins/knn/disk-based-vector-search.md +++ b/_vector-search/optimizing-storage/disk-based-vector-search.md @@ -1,18 +1,20 @@ --- layout: default title: Disk-based vector search -nav_order: 16 -parent: k-NN search +nav_order: 20 +parent: Optimizing vector storage has_children: false +redirect_from: + - /search-plugins/knn/disk-based-vector-search/ --- # Disk-based vector search **Introduced 2.17** {: .label .label-purple} -For low-memory environments, OpenSearch provides _disk-based vector search_, which significantly reduces the operational costs for vector workloads. Disk-based vector search uses [binary quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/#binary-quantization), compressing vectors and thereby reducing the memory requirements. This memory optimization provides large memory savings at the cost of slightly increased search latency while still maintaining strong recall. +For low-memory environments, OpenSearch provides _disk-based vector search_, which significantly reduces the operational costs for vector workloads. Disk-based vector search uses [binary quantization]({{site.url}}{{site.baseurl}}/vector-search/optimizing-storage/binary-quantization/), compressing vectors and thereby reducing the memory requirements. This memory optimization provides large memory savings at the cost of slightly increased search latency while still maintaining strong recall. -To use disk-based vector search, set the [`mode`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#vector-workload-modes) parameter to `on_disk` for your vector field type. This parameter will configure your index to use secondary storage. +To use disk-based vector search, set the [`mode`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized/#vector-workload-modes) parameter to `on_disk` for your vector field type. This parameter will configure your index to use secondary storage. For more information about disk-based search parameters, see [Memory-optimized vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized/). ## Creating an index for disk-based vector search @@ -41,7 +43,7 @@ PUT my-vector-index ``` {% include copy-curl.html %} -By default, the `on_disk` mode configures the index to use the `faiss` engine and `hnsw` method. 
The default [`compression_level`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#compression-levels) of `32x` reduces the amount of memory the vectors require by a factor of 32. To preserve the search recall, rescoring is enabled by default. A search on a disk-optimized index runs in two phases: The compressed index is searched first, and then the results are rescored using full-precision vectors loaded from disk. +By default, the `on_disk` mode configures the index to use the `faiss` engine and `hnsw` method. The default [`compression_level`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized/#compression-levels) of `32x` reduces the amount of memory the vectors require by a factor of 32. To preserve the search recall, rescoring is enabled by default. A search on a disk-optimized index runs in two phases: The compressed index is searched first, and then the results are rescored using full-precision vectors loaded from disk. To reduce the compression level, provide the `compression_level` parameter when creating the index mapping: @@ -69,7 +71,7 @@ PUT my-vector-index ``` {% include copy-curl.html %} -For more information about the `compression_level` parameter, see [Compression levels]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#compression-levels). Note that for `4x` compression, the `lucene` engine will be used. +For more information about the `compression_level` parameter, see [Compression levels]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized/#compression-levels). Note that for `4x` compression, the `lucene` engine will be used. {: .note} If you need more granular fine-tuning, you can override additional k-NN parameters in the method definition. For example, to improve recall, increase the `ef_construction` parameter value: @@ -134,7 +136,7 @@ POST _bulk ## Search -Search is also performed in the same way as in other index configurations. The key difference is that, by default, the `oversample_factor` of the rescore parameter is set to `3.0` (unless you override the `compression_level`). For more information, see [Rescoring quantized results using full precision]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#rescoring-quantized-results-using-full-precision). To perform vector search on a disk-optimized index, provide the search vector: +Search is also performed in the same way as in other index configurations. The key difference is that, by default, the `oversample_factor` of the rescore parameter is set to `3.0` (unless you override the `compression_level`). For more information, see [Rescoring quantized results to full precision]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized/#rescoring-quantized-results-to-full-precision). To perform vector search on a disk-optimized index, provide the search vector: ```json GET my-vector-index/_search @@ -179,7 +181,7 @@ GET my-vector-index/_search ## Model-based indexes -For [model-based indexes]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#building-a-k-nn-index-from-a-model), you can specify the `on_disk` parameter in the training request in the same way that you would specify it during index creation. By default, `on_disk` mode will use the [Faiss IVF method]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#supported-faiss-methods) and a compression level of `32x`. 
To run the training API, send the following request: +For [model-based indexes]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#building-a-vector-index-from-a-model), you can specify the `on_disk` parameter in the training request in the same way that you would specify it during index creation. By default, `on_disk` mode will use the [Faiss IVF method]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-methods-engines/#ivf-parameters) and a compression level of `32x`. To run the training API, send the following request: ```json POST /_plugins/_knn/models/test-model/_train @@ -196,13 +198,14 @@ POST /_plugins/_knn/models/test-model/_train ``` {% include copy-curl.html %} -This command assumes that training data has been ingested into the `train-index-name` index. For more information, see [Building a k-NN index from a model]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#building-a-k-nn-index-from-a-model). +This command assumes that training data has been ingested into the `train-index-name` index. For more information, see [Building a vector index from a model]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#building-a-vector-index-from-a-model). {: .note} -You can override the `compression_level` for disk-optimized indexes in the same way as for regular k-NN indexes. +You can override the `compression_level` for disk-optimized indexes in the same way as for regular vector indexes. ## Next steps -- For more information about binary quantization, see [Binary quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/#binary-quantization). -- For more information about k-NN vector workload modes, see [Vector workload modes]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#vector-workload-modes). \ No newline at end of file +- [Binary quantization]({{site.url}}{{site.baseurl}}/vector-search/optimizing-storage/binary-quantization/) +- [Memory-optimized vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized/) +- [k-NN query]({{site.url}}{{site.baseurl}}/query-dsl/specialized/k-nn/) \ No newline at end of file diff --git a/_vector-search/optimizing-storage/faiss-16-bit-quantization.md b/_vector-search/optimizing-storage/faiss-16-bit-quantization.md new file mode 100644 index 00000000000..2daa7186387 --- /dev/null +++ b/_vector-search/optimizing-storage/faiss-16-bit-quantization.md @@ -0,0 +1,159 @@ +--- +layout: default +title: Faiss 16-bit scalar quantization +parent: Vector quantization +grand_parent: Optimizing vector storage +nav_order: 20 +has_children: false +has_math: true +--- + +# Faiss 16-bit scalar quantization + +Starting with version 2.13, OpenSearch supports performing scalar quantization for the Faiss engine within OpenSearch. Within the Faiss engine, a scalar quantizer (SQfp16) performs the conversion between 32-bit and 16-bit vectors. At ingestion time, when you upload 32-bit floating-point vectors to OpenSearch, SQfp16 quantizes them into 16-bit floating-point vectors and stores the quantized vectors in a vector index. + +At search time, SQfp16 decodes the vector values back into 32-bit floating-point values for distance computation. The SQfp16 quantization can decrease the memory footprint by a factor of 2. Additionally, it leads to a minimal loss in recall when differences between vector values are large compared to the error introduced by eliminating their two least significant bits. 
When used with [SIMD optimization]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-methods-engines/#simd-optimization), SQfp16 quantization can also significantly reduce search latencies and improve indexing throughput. + +SIMD optimization is not supported on Windows. Using Faiss scalar quantization on Windows can lead to a significant drop in performance, including decreased indexing throughput and increased search latencies. +{: .warning} + +## Using Faiss scalar quantization + +To use Faiss scalar quantization, set the k-NN vector field's `method.parameters.encoder.name` to `sq` when creating a vector index: + +```json +PUT /test-index +{ +  "settings": { +    "index": { +      "knn": true, +      "knn.algo_param.ef_search": 100 +    } +  }, +  "mappings": { +    "properties": { +      "my_vector1": { +        "type": "knn_vector", +        "dimension": 3, +        "space_type": "l2", +        "method": { +          "name": "hnsw", +          "engine": "faiss", +          "parameters": { +            "encoder": { +              "name": "sq" +            }, +            "ef_construction": 256, +            "m": 8 +          } +        } +      } +    } +  } +} +``` +{% include copy-curl.html %} + +Optionally, you can specify the parameters in `method.parameters.encoder`. For more information about `encoder` object parameters, see [SQ parameters]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-methods-engines/#sq-parameters). + +The `sq` encoder defaults to the `fp16` type, which converts 32-bit vectors into their 16-bit counterparts. For this encoder type, the vector values must be in the [-65504.0, 65504.0] range. To define how to handle out-of-range values, you can specify the `clip` parameter. By default, this parameter is `false`, and any vectors containing out-of-range values are rejected. + +When `clip` is set to `true`, out-of-range vector values are rounded up or down so that they are in the supported range. For example, if the original 32-bit vector is `[65510.82, -65504.1]`, the vector will be indexed as a 16-bit vector `[65504.0, -65504.0]`. + +We recommend setting `clip` to `true` only if very few elements lie outside of the supported range. Rounding the values may cause a drop in recall. +{: .note} + +The following example method definition specifies the Faiss SQfp16 encoder, which rejects any indexing request that contains out-of-range vector values (because the `clip` parameter is `false` by default): + +```json +PUT /test-index +{ +  "settings": { +    "index": { +      "knn": true, +      "knn.algo_param.ef_search": 100 +    } +  }, +  "mappings": { +    "properties": { +      "my_vector1": { +        "type": "knn_vector", +        "dimension": 3, +        "space_type": "l2", +        "method": { +          "name": "hnsw", +          "engine": "faiss", +          "parameters": { +            "encoder": { +              "name": "sq", +              "parameters": { +                "type": "fp16" +              } +            }, +            "ef_construction": 256, +            "m": 8 +          } +        } +      } +    } +  } +} +``` +{% include copy-curl.html %} + +During ingestion, make sure each vector dimension is in the supported range ([-65504.0, 65504.0]): + +```json +PUT test-index/_doc/1 +{ +  "my_vector1": [-65504.0, 65503.845, 55.82] +} +``` +{% include copy-curl.html %} + +During querying, the query vector has no range limitation: + +```json +GET test-index/_search +{ +  "size": 2, +  "query": { +    "knn": { +      "my_vector1": { +        "vector": [265436.876, -120906.256, 99.84], +        "k": 2 +      } +    } +  } +} +``` +{% include copy-curl.html %} + +## Memory estimation + +In the best-case scenario, 16-bit vectors produced by the Faiss SQfp16 quantizer require 50% of the memory that 32-bit vectors require.
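+
+For example, ignoring the graph overhead estimated in the following sections, 1 million 256-dimensional vectors (the same assumptions used in the estimates below) occupy roughly half as much raw vector memory after quantization:
+
+```r
+32-bit vectors: 1,000,000 * 256 * 4 bytes ~= 0.95 GB
+16-bit vectors: 1,000,000 * 256 * 2 bytes ~= 0.48 GB
+```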
+ +### HNSW memory estimation + +The memory required for Hierarchical Navigable Small Worlds (HNSW) is estimated to be `1.1 * (2 * dimension + 8 * m)` bytes/vector, where `m` is the maximum number of bidirectional links created for each element during the construction of the graph. + +As an example, assume that you have 1 million vectors with a dimension of 256 and an `m` of 16. The memory requirement can be estimated as follows: + +```r +1.1 * (2 * 256 + 8 * 16) * 1,000,000 ~= 0.656 GB +``` + +### IVF memory estimation + +The memory required for IVF is estimated to be `1.1 * (((2 * dimension) * num_vectors) + (4 * nlist * dimension))` bytes/vector, where `nlist` is the number of buckets to partition vectors into. + +As an example, assume that you have 1 million vectors with a dimension of 256 and an `nlist` of 128. The memory requirement can be estimated as follows: + +```r +1.1 * (((2 * 256) * 1,000,000) + (4 * 128 * 256)) ~= 0.525 GB +``` + +## Next steps + +- [Memory-optimized vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized/) +- [k-NN query]({{site.url}}{{site.baseurl}}/query-dsl/specialized/k-nn/) diff --git a/_vector-search/optimizing-storage/faiss-product-quantization.md b/_vector-search/optimizing-storage/faiss-product-quantization.md new file mode 100644 index 00000000000..7c27a1bad4e --- /dev/null +++ b/_vector-search/optimizing-storage/faiss-product-quantization.md @@ -0,0 +1,57 @@ +--- +layout: default +title: Faiss product quantization +parent: Vector quantization +grand_parent: Optimizing vector storage +nav_order: 30 +has_children: false +has_math: true +--- + +# Faiss product quantization + +Product quantization (PQ) is a technique used to represent a vector using a configurable number of bits. In general, it can be used to achieve a higher level of compression as compared to byte or scalar quantization. PQ works by separating vectors into _m_ subvectors and encoding each subvector with _code_size_ bits. Thus, the total amount of memory for the vector is `m*code_size` bits, plus overhead. For details about the parameters, see [PQ parameters]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-methods-engines/#pq-parameters). PQ is only supported for the _Faiss_ engine and can be used with either the _HNSW_ or _IVF_ approximate nearest neighbor (ANN) algorithms. + +## Using Faiss product quantization + +To minimize loss in accuracy, PQ requires a _training_ step that builds a model based on the distribution of the data that will be searched. + +The product quantizer is trained by running k-means clustering on a set of training vectors for each subvector space and extracts the centroids to be used for encoding. The training vectors can be either a subset of the vectors to be ingested or vectors that have the same distribution and dimension as the vectors to be ingested. + +In OpenSearch, the training vectors need to be present in an index. In general, the amount of training data will depend on which ANN algorithm is used and how much data will be stored in the index. For IVF-based indexes, a recommended number of training vectors is `max(1000*nlist, 2^code_size * 1000)`. For HNSW-based indexes, a recommended number is `2^code_size*1000`. See the [Faiss documentation](https://github.com/facebookresearch/faiss/wiki/FAQ#how-many-training-points-do-i-need-for-k-means) for more information about the methodology used to calculate these figures. + +For PQ, both _m_ and _code_size_ need to be selected. 
_m_ determines the number of subvectors into which vectors should be split for separate encoding. Consequently, the _dimension_ needs to be divisible by _m_. _code_size_ determines the number of bits used to encode each subvector. In general, we recommend a setting of `code_size = 8` and then tuning _m_ to get the desired trade-off between memory footprint and recall. + +For an example of setting up an index with PQ, see the [Building a vector index from a model]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#building-a-vector-index-from-a-model) tutorial. + +## Memory estimation + +While PQ is meant to represent individual vectors with `m*code_size` bits, in reality, the indexes consume more space. This is mainly because of the overhead of storing certain code tables and auxiliary data structures. + +Some of the memory formulas depend on the number of segments present. This is not typically known beforehand, but a recommended default value is 300. +{: .note} + +### HNSW memory estimation + +The memory required for HNSW with PQ is estimated to be `1.1*(((pq_code_size / 8) * pq_m + 24 + 8 * hnsw_m) * num_vectors + num_segments * (2^pq_code_size * 4 * d))` bytes. + +As an example, assume that you have 1 million vectors with a dimension of 256, `hnsw_m` of 16, `pq_m` of 32, `pq_code_size` of 8, and 100 segments. The memory requirement can be estimated as follows: + +```r +1.1 * ((8 / 8 * 32 + 24 + 8 * 16) * 1000000 + 100 * (2^8 * 4 * 256)) ~= 0.215 GB +``` + +### IVF memory estimation + +The memory required for IVF with PQ is estimated to be `1.1*(((pq_code_size / 8) * pq_m + 24) * num_vectors + num_segments * (2^pq_code_size * 4 * d + 4 * ivf_nlist * d))` bytes. + +For example, assume that you have 1 million vectors with a dimension of 256, `ivf_nlist` of 512, `pq_m` of 64, `pq_code_size` of 8, and 100 segments. The memory requirement can be estimated as follows: + +```r +1.1 * ((8 / 8 * 64 + 24) * 1000000 + 100 * (2^8 * 4 * 256 + 4 * 512 * 256)) ~= 0.171 GB +``` + +## Next steps + +- [Memory-optimized vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized/) +- [k-NN query]({{site.url}}{{site.baseurl}}/query-dsl/specialized/k-nn/) \ No newline at end of file diff --git a/_vector-search/optimizing-storage/index.md b/_vector-search/optimizing-storage/index.md new file mode 100644 index 00000000000..4b04024e710 --- /dev/null +++ b/_vector-search/optimizing-storage/index.md @@ -0,0 +1,22 @@ +--- +layout: default +title: Optimizing vector storage +nav_order: 60 +has_children: true +has_toc: false +redirect_from: + - /vector-search/optimizing-storage/ +storage_cards: +  - heading: "Vector quantization" +    description: "Reduce vector storage space by quantizing vectors" +    link: "/vector-search/optimizing-storage/knn-vector-quantization/" +  - heading: "Disk-based vector search" +    description: "Uses binary quantization to reduce the operational costs of vector workloads" +    link: "/vector-search/optimizing-storage/disk-based-vector-search/" +--- + +# Optimizing vector storage + +Vector search operations can be resource intensive, especially when dealing with large-scale vector datasets. OpenSearch provides several optimization techniques for reducing memory usage.
+ +{% include cards.html cards=page.storage_cards %} \ No newline at end of file diff --git a/_vector-search/optimizing-storage/knn-vector-quantization.md b/_vector-search/optimizing-storage/knn-vector-quantization.md new file mode 100644 index 00000000000..598d9d7eed0 --- /dev/null +++ b/_vector-search/optimizing-storage/knn-vector-quantization.md @@ -0,0 +1,48 @@ +--- +layout: default +title: Vector quantization +parent: Optimizing vector storage +nav_order: 10 +has_children: true +has_toc: false +redirect_from: + - /search-plugins/knn/knn-vector-quantization/ +outside_cards: +  - heading: "Byte vectors" +    description: "Quantize vectors into byte vectors" +    link: "/field-types/supported-field-types/knn-memory-optimized/#byte-vectors" +  - heading: "Binary vectors" +    description: "Quantize vectors into binary vectors" +    link: "/field-types/supported-field-types/knn-memory-optimized/#binary-vectors" +inside_cards: +  - heading: "Lucene scalar quantization" +    description: "Use built-in scalar quantization for the Lucene engine" +    link: "/vector-search/optimizing-storage/lucene-scalar-quantization/" +  - heading: "Faiss 16-bit scalar quantization" +    description: "Use built-in scalar quantization for the Faiss engine" +    link: "/vector-search/optimizing-storage/faiss-16-bit-quantization/" +  - heading: "Faiss product quantization" +    description: "Use built-in product quantization for the Faiss engine" +    link: "/vector-search/optimizing-storage/faiss-product-quantization/" +  - heading: "Binary quantization" +    description: "Use built-in binary quantization for the Faiss engine" +    link: "/vector-search/optimizing-storage/binary-quantization/" +--- + +# Vector quantization + +By default, OpenSearch supports the indexing and querying of vectors of type `float`, where each dimension of the vector occupies 4 bytes of memory. For use cases that require ingestion on a large scale, keeping `float` vectors can be expensive because OpenSearch needs to construct, load, save, and search graphs (for the native `faiss` and `nmslib` [deprecated] engines). To reduce the memory footprint, you can use vector quantization. + +OpenSearch supports many varieties of quantization. In general, the level of quantization will provide a trade-off between the accuracy of the nearest neighbor search and the size of the memory footprint consumed by the vector search. + +## Quantize vectors outside of OpenSearch + +Quantize vectors outside of OpenSearch before ingesting them into an OpenSearch index. + +{% include cards.html cards=page.outside_cards %} + +## Quantize vectors within OpenSearch + +Use OpenSearch's built-in quantization to quantize vectors. + +{% include cards.html cards=page.inside_cards %} \ No newline at end of file diff --git a/_vector-search/optimizing-storage/lucene-scalar-quantization.md b/_vector-search/optimizing-storage/lucene-scalar-quantization.md new file mode 100644 index 00000000000..021f1a8537d --- /dev/null +++ b/_vector-search/optimizing-storage/lucene-scalar-quantization.md @@ -0,0 +1,120 @@ +--- +layout: default +title: Lucene scalar quantization +parent: Vector quantization +grand_parent: Optimizing vector storage +nav_order: 10 +has_children: false +has_math: true +--- + +# Lucene scalar quantization + +Starting with version 2.16, OpenSearch supports built-in scalar quantization for the Lucene engine.
Unlike [byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized/#byte-vectors), which require you to quantize vectors before ingesting documents, the Lucene scalar quantizer quantizes input vectors in OpenSearch during ingestion. The Lucene scalar quantizer converts 32-bit floating-point input vectors into 7-bit integer vectors in each segment using the minimum and maximum quantiles computed based on the [`confidence_interval`](#confidence-interval) parameter. During search, the query vector is quantized in each segment using the segment's minimum and maximum quantiles in order to compute the distance between the query vector and the segment's quantized input vectors. + +Quantization can decrease the memory footprint by a factor of 4 in exchange for some loss in recall. Additionally, quantization slightly increases disk usage because it requires storing both the raw input vectors and the quantized vectors. + +## Using Lucene scalar quantization + +To use the Lucene scalar quantizer, set the k-NN vector field's `method.parameters.encoder.name` to `sq` when creating a vector index: + +```json +PUT /test-index +{ + "settings": { + "index": { + "knn": true + } + }, + "mappings": { + "properties": { + "my_vector1": { + "type": "knn_vector", + "dimension": 2, + "space_type": "l2", + "method": { + "name": "hnsw", + "engine": "lucene", + "parameters": { + "encoder": { + "name": "sq" + }, + "ef_construction": 256, + "m": 8 + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Confidence interval + +Optionally, you can specify the `confidence_interval` parameter in the `method.parameters.encoder` object. +The `confidence_interval` is used to compute the minimum and maximum quantiles in order to quantize the vectors: +- If you set the `confidence_interval` to a value in the `0.9` to `1.0` range, inclusive, then the quantiles are calculated statically. For example, setting the `confidence_interval` to `0.9` specifies to compute the minimum and maximum quantiles based on the middle 90% of the vector values, excluding the minimum 5% and maximum 5% of the values. +- Setting `confidence_interval` to `0` specifies to compute the quantiles dynamically, which involves oversampling and additional computations performed on the input data. +- When `confidence_interval` is not set, it is computed based on the vector dimension $$d$$ using the formula $$max(0.9, 1 - \frac{1}{1 + d})$$. + +Lucene scalar quantization is applied only to `float` vectors. If you change the default value of the `data_type` parameter from `float` to `byte` or any other type when mapping a [k-NN vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/), then the request is rejected. +{: .warning} + +The following example method definition specifies the Lucene `sq` encoder with the `confidence_interval` set to `1.0`. This `confidence_interval` specifies to consider all the input vectors when computing the minimum and maximum quantiles. 
Vectors are quantized to 7 bits by default: + +```json +PUT /test-index +{ +  "settings": { +    "index": { +      "knn": true +    } +  }, +  "mappings": { +    "properties": { +      "my_vector1": { +        "type": "knn_vector", +        "dimension": 2, +        "space_type": "l2", +        "method": { +          "name": "hnsw", +          "engine": "lucene", +          "parameters": { +            "encoder": { +              "name": "sq", +              "parameters": { +                "confidence_interval": 1.0 +              } +            }, +            "ef_construction": 256, +            "m": 8 +          } +        } +      } +    } +  } +} +``` +{% include copy-curl.html %} + +There are no changes to ingestion or query mapping and no range limitations for the input vectors. + +## Memory estimation + +In the ideal scenario, 7-bit vectors created by the Lucene scalar quantizer use only 25% of the memory required by 32-bit vectors. + +### HNSW memory estimation + +The memory required for the Hierarchical Navigable Small World (HNSW) graph can be estimated as `1.1 * (dimension + 8 * m)` bytes/vector, where `m` is the maximum number of bidirectional links created for each element during the construction of the graph. + +As an example, assume that you have 1 million vectors with a dimension of 256 and an `m` of 16. The memory requirement can be estimated as follows: + +```r +1.1 * (256 + 8 * 16) * 1,000,000 ~= 0.4 GB +``` + +## Next steps + +- [Memory-optimized vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized/) +- [k-NN query]({{site.url}}{{site.baseurl}}/query-dsl/specialized/k-nn/) \ No newline at end of file diff --git a/_vector-search/optimizing-storage/memory-optimized-search.md b/_vector-search/optimizing-storage/memory-optimized-search.md new file mode 100644 index 00000000000..6108bcdc121 --- /dev/null +++ b/_vector-search/optimizing-storage/memory-optimized-search.md @@ -0,0 +1,95 @@ +--- +layout: default +title: Memory-optimized search +parent: Optimizing vector storage +nav_order: 30 +--- + +# Memory-optimized search +Introduced 3.1 +{: .label .label-purple } + +Memory-optimized search allows the Faiss engine to run efficiently without loading the entire vector index into off-heap memory. Without this optimization, Faiss typically loads the full index into memory, which can become unsustainable if the index size exceeds available physical memory. With memory-optimized search, the engine memory-maps the index file and relies on the operating system's file cache to serve search requests. This approach avoids unnecessary I/O and allows repeated reads to be served directly from the system cache. + +Memory-optimized search affects only search operations. Indexing behavior remains unchanged. +{: .note } + +## Limitations + +The following limitations apply to memory-optimized search in OpenSearch: +- Supported only for the [Faiss engine]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-methods-engines/#faiss-engine) with the [HNSW method]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-methods-engines/#hnsw-parameters-1) +- Does not support [IVF]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-methods-engines/#ivf-parameters) or [product quantization (PQ)]({{site.url}}{{site.baseurl}}/vector-search/optimizing-storage/faiss-product-quantization) +- Requires an index restart to enable or disable + +If you use IVF or PQ, the engine loads data into memory regardless of whether memory-optimized mode is enabled.
+{: .important } + +## Configuration + +To enable memory-optimized search, set `index.knn.memory_optimized_search` to `true` when creating an index: + +```json +PUT /test_index +{ + "settings": { + "index.knn": true, + "index.knn.memory_optimized_search": true + }, + "mappings": { + "properties": { + "vector_field": { + "type": "knn_vector", + "dimension": 128, + "method": { + "name": "hnsw", + "engine": "faiss" + } + } + } + } +} +``` +{% include copy-curl.html %} + +To enable memory-optimized search on an existing index, you must close the index, update the setting, and then reopen the index: + +```json +POST /test_index/_close +``` +{% include copy-curl.html %} + +```json +PUT /test_index/_settings +{ + "index.knn.memory_optimized_search": true +} +``` +{% include copy-curl.html %} + +```json +POST /test_index/_open +``` +{% include copy-curl.html %} + +## Integration with disk-based search + +When you configure a field with `on_disk` mode and `1x` compression, memory-optimized search is automatically enabled for that field, even if memory optimization isn't enabled at the index level. For more information, see [Memory-optimized vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized/). + + +Memory-optimized search differs from [disk-based search]({{site.url}}{{site.baseurl}}/vector-search/optimizing-storage/disk-based-vector-search/) because it doesn't use compression or quantization. It only changes how vector data is loaded and accessed during search. +{: .note } + +## Performance optimization + +When memory-optimized search is enabled, the [warm-up API]({{site.url}}{{site.baseurl}}/vector-search/performance-tuning-search/#warm-up-the-index) loads only the essential information needed for search operations, such as opening streams to the underlying Faiss index file. This minimal warm-up results in: +- Faster initial searches. +- Reduced memory overhead. +- More efficient resource utilization. + +For fields where memory-optimized search is disabled, the warm-up process loads vectors into off-heap memory. + +## Next steps + +- [Disk-based vector search]({{site.url}}{{site.baseurl}}/vector-search/optimizing-storage/disk-based-vector-search/) +- [Vector quantization]({{site.url}}{{site.baseurl}}/vector-search/optimizing-storage/knn-vector-quantization/) +- [Performance tuning]({{site.url}}{{site.baseurl}}/vector-search/performance-tuning/) diff --git a/_vector-search/performance-tuning-indexing.md b/_vector-search/performance-tuning-indexing.md new file mode 100644 index 00000000000..e1a874c20dc --- /dev/null +++ b/_vector-search/performance-tuning-indexing.md @@ -0,0 +1,106 @@ +--- +layout: default +title: Indexing performance tuning +nav_order: 10 +parent: Performance tuning +--- + +# Indexing performance tuning + +Take any of the following steps to improve indexing performance, especially when you plan to index a large number of vectors at once. + +## Disable the refresh interval + +Either disable the refresh interval (default = 1 sec) or set a long duration for the refresh interval to avoid creating multiple small segments: + +```json +PUT /<index_name>/_settings +{ + "index" : { + "refresh_interval" : "-1" + } +} +``` +{% include copy-curl.html %} + +Make sure to reenable `refresh_interval` after indexing is complete. + +## Disable replicas (no OpenSearch replica shard) + + Set replicas to `0` to prevent duplicate construction of native library indexes in both primary and replica shards. 
When you enable replicas after indexing completes, the serialized native library indexes are copied directly. If you have no replicas, losing nodes might cause data loss, so it's important that the data be stored elsewhere, allowing this initial load to be retried in the event of an issue. + +## Increase the number of indexing threads + +If your hardware has multiple cores, you can use multiple threads for native library index construction, thereby speeding up the indexing process. Determine the number of threads to allot with the [knn.algo_param.index_thread_qty]({{site.url}}{{site.baseurl}}/search-plugins/knn/settings#cluster-settings) setting. + +Monitor CPU utilization and choose the correct number of threads. Because native library index construction is costly, choosing more threads than you need can cause additional CPU load. + + +## Use the derived vector source feature to reduce storage requirements + +Starting with OpenSearch 3.0, you can use the derived vector source feature to significantly reduce storage requirements for vector fields. It is an [index setting]({{site.url}}{{site.baseurl}}/vector-search/settings/#index-settings) that is enabled by default. This feature prevents vectors from being stored in the `_source` field while still maintaining all functionality, including the ability to use the `update`, `update_by_query`, and `reindex` APIs. + +## (Expert level) Build vector data structures on demand + +This approach is recommended only for workloads that involve a single initial bulk upload, after which the index is force merged to a single segment and used exclusively for search. + +During indexing, vector search builds a specialized data structure for a `knn_vector` field to enable efficient approximate k-nearest neighbors (k-NN) search. However, these structures are rebuilt during [force merge]({{site.url}}{{site.baseurl}}/api-reference/index-apis/force-merge/) on vector indexes. To optimize indexing speed, follow these steps: + +1. **Disable vector data structure creation**: Disable vector data structure creation for new segments by setting [`index.knn.advanced.approximate_threshold`]({{site.url}}{{site.baseurl}}/vector-search/settings/#index-settings) to `-1`. + +    To specify the setting at index creation, send the following request: + +    ```json +    PUT /test-index/ +    { +      "settings": { +        "index.knn.advanced.approximate_threshold": "-1" +      } +    } +    ``` +    {% include copy-curl.html %} + +    To specify the setting after index creation, send the following request: + +    ```json +    PUT /test-index/_settings +    { +      "index.knn.advanced.approximate_threshold": "-1" +    } +    ``` +    {% include copy-curl.html %} + +1. **Perform bulk indexing**: Index data in [bulk]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/) without performing any searches during ingestion: + +    ```json +    POST _bulk +    { "index": { "_index": "test-index", "_id": "1" } } +    { "my_vector1": [1.5, 2.5], "price": 12.2 } +    { "index": { "_index": "test-index", "_id": "2" } } +    { "my_vector1": [2.5, 3.5], "price": 7.1 } +    ``` +    {% include copy-curl.html %} + +    If searches are performed while vector data structures are disabled, they will run using exact k-NN search. + +1.
**Reenable vector data structure creation**: Once indexing is complete, enable vector data structure creation by setting `index.knn.advanced.approximate_threshold` to `0`: + + ```json + PUT /test-index/_settings + { + "index.knn.advanced.approximate_threshold": "0" + } + ``` + {% include copy-curl.html %} + + If you do not reset the setting to `0` before the force merge, you will need to reindex your data. + {: .note} + +1. **Force merge segments into one segment**: Perform a force merge and specify `max_num_segments=1` to create the vector data structures only once: + + ```json + POST test-index/_forcemerge?max_num_segments=1 + ``` + {% include copy-curl.html %} + + After the force merge, new search requests will execute approximate k-NN search using the newly created data structures. \ No newline at end of file diff --git a/_vector-search/performance-tuning-search.md b/_vector-search/performance-tuning-search.md new file mode 100644 index 00000000000..9a00f9e37ec --- /dev/null +++ b/_vector-search/performance-tuning-search.md @@ -0,0 +1,94 @@ +--- +layout: default +title: Search performance tuning +nav_order: 20 +parent: Performance tuning +--- + +# Search performance tuning + +Take the following steps to improve search performance. + +## Reduce segment count + +To improve search performance, you must keep the number of segments under control. Lucene's IndexSearcher searches over all of the segments in a shard to find the 'size' best results. + +Having one segment per shard provides optimal performance with respect to search latency. You can configure an index to have multiple shards in order to avoid very large shards and achieve more parallelism. + +You can control the number of segments by choosing a larger refresh interval or during indexing by asking OpenSearch to slow down segment creation by disabling the refresh interval. + +## Warm up the index + +Native library indexes are constructed during indexing, but they're loaded into memory during the first search. In Lucene, each segment is searched sequentially (so, for k-NN, each segment returns up to k nearest neighbors of the query point). The top `size` results, ranked by score, are returned from all segment-level results within a shard (a higher score indicates a better result). + +Once a native library index is loaded (native library indexes are loaded outside of the OpenSearch JVM), OpenSearch caches them in memory. Initial queries are expensive and complete in a few seconds, while subsequent queries are faster and complete in milliseconds (assuming that the k-NN circuit breaker isn't triggered). + +Starting with version 3.1, you can use the [memory-optimized search]({{site.url}}{{site.baseurl}}/vector-search/optimizing-storage/memory-optimized-search/) mode, which enables the engine to load only the necessary bytes during search instead of loading the entire index outside the JVM. When this mode is enabled, the warm-up API loads the minimal required information into memory, including opening read streams to the underlying indexes. Thus, the warm-up API helps ensure that searches after warm-up run faster, even with memory-optimized search enabled. 
+ +To avoid this latency penalty during your first queries, you can use the warmup API operation on the indexes you want to search: + +```json +GET /_plugins/_knn/warmup/index1,index2,index3?pretty +{ + "_shards" : { + "total" : 6, + "successful" : 6, + "failed" : 0 + } +} +``` +{% include copy-curl.html %} + +The warmup API operation loads all native library indexes for all shards (primaries and replicas) for the specified indexes into the cache, so there's no penalty for loading native library indexes during initial searches. + +This API operation only loads the segments of active indexes into the cache. If a merge or refresh operation finishes after the API runs, or if you add new documents, you need to rerun the API to load those native library indexes into memory. +{: .warning} + + +## Avoid reading stored fields + +If your use case only involves reading the IDs and scores of the nearest neighbors, you can disable the reading of stored fields, which saves time that would otherwise be spent retrieving the vectors from stored fields. To disable stored fields entirely, set `_source` to `false`: + +```json +GET /my-index/_search +{ + "_source": false, + "query": { + "knn": { + "vector_field": { + "vector": [ 0.1, 0.2, 0.3], + "k": 10 + } + } + } +} +``` +{% include copy-curl.html %} + +This query returns only the document IDs and scores, making it the fastest option when you don't need the actual document contents. For more information, see [Disabling `_source`]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/retrieve-specific-fields/#disabling-_source). + +## Exclude vectors from search results + +If you need the document contents but want to optimize performance, you can exclude only the vector fields from being returned in the search results. This approach reduces network transfer while still maintaining access to other document fields. To exclude vectors from search results, provide the vector field name in `_source.excludes`: + +```json +GET /my-index/_search +{ + "_source": { + "excludes": [ + "vector_field" + ] + }, + "query": { + "knn": { + "vector_field": { + "vector": [ 0.1, 0.2, 0.3], + "k": 10 + } + } + } +} +``` +{% include copy-curl.html %} + +For more information, see [Retrieve specific fields]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/retrieve-specific-fields/). diff --git a/_vector-search/performance-tuning.md b/_vector-search/performance-tuning.md new file mode 100644 index 00000000000..f4c04edb1c5 --- /dev/null +++ b/_vector-search/performance-tuning.md @@ -0,0 +1,37 @@ +--- +layout: default +title: Performance tuning +nav_order: 70 +has_children: true +redirect_from: + - /search-plugins/knn/performance-tuning/ +--- + +# Performance tuning + +This topic provides performance tuning recommendations for improving indexing and search performance for approximate k-NN (ANN) search. At a high level, k-NN works according to these principles: +* Vector indexes are created per `knn_vector` field/Lucene segment pair. +* Queries execute sequentially on segments in the shard (as with any other OpenSearch query). +* The coordinator node selects the final `size` neighbors from the neighbors returned by each shard. + +The following sections provide recommendations regarding comparing ANN to exact k-NN with a scoring script. + +## Recommendations for engines and cluster node sizing + +Each of the three engines used for ANN search has attributes that make it more sensible to use than the others in a given situation. 
Use the following information to help determine which engine will best meet your requirements. + +To optimize for indexing throughput, Faiss is a good option. For relatively smaller datasets (up to a few million vectors), the Lucene engine demonstrates better latencies and recall. At the same time, the size of the index is smallest compared to the other engines, which allows it to use smaller AWS instances for data nodes. For further considerations, see [Choosing the right method]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-methods-engines/#choosing-the-right-method) and [Memory estimation]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-methods-engines/#memory-estimation). + +When considering cluster node sizing, a general approach is to first establish an even distribution of the index across the cluster. However, there are other considerations. To help make these choices, you can refer to the OpenSearch managed service guidance in the [Sizing domains](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/sizing-domains.html) section. + +## Improving recall + +Recall depends on multiple factors, such as the number of vectors, dimensions, segments, and so on. Searching a large number of small segments and aggregating the results leads to better recall than searching a small number of large segments and aggregating the results. Larger native library indexes are more likely to lose recall if you're using smaller algorithm parameters. Choosing larger values for algorithm parameters should help solve this issue but sacrifices search latency and indexing time. It's important to understand your system's requirements for latency and accuracy and then choose the number of segments based on experimentation. + +The default parameters work for a broader set of use cases, but make sure to run your own experiments on your datasets and choose the appropriate values. For index-level settings, see [Index settings]({{site.url}}{{site.baseurl}}/vector-search/settings/#index-settings). + +## ANN compared to scoring script + +The standard k-NN query and custom scoring options perform differently. Run tests with a representative set of documents to see if the search results and latencies match your expectations. + +Custom scoring works best if the initial filter reduces the number of documents to no more than 20,000. Increasing the shard count can improve latency, but be sure to keep the shard size within the [recommended guidelines]({{site.url}}{{site.baseurl}}/intro/#primary-and-replica-shards). \ No newline at end of file diff --git a/_vector-search/remote-index-build.md b/_vector-search/remote-index-build.md new file mode 100644 index 00000000000..71c91205fc4 --- /dev/null +++ b/_vector-search/remote-index-build.md @@ -0,0 +1,58 @@ +--- +layout: default +title: Remote index build +nav_order: 72 +has_children: false +--- + +# Building vector indexes remotely using GPUs +Introduced 3.0 +{: .label .label-purple } + +OpenSearch supports building vector indexes using a GPU-accelerated remote index build service. Using GPUs dramatically reduces index build times and decreases costs. For benchmarking results, see [this blog post](https://opensearch.org/blog/GPU-Accelerated-Vector-Search-OpenSearch-New-Frontier/). 
+ +## Supported configurations + +The remote index build service supports [Faiss]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-methods-engines/#faiss-engine) indexes with the `hnsw` method and the default 32-bit floating-point (`FP32`) vectors. + +## Prerequisites + +Before configuring the remote index build settings, ensure you fulfill the following prerequisites. For more information about updating dynamic settings, see [Dynamic settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/#dynamic-settings). + +### Step 1: Enable the remote index build service + +Enable the remote index build service for both the cluster and the chosen index by configuring the following settings. + +| Setting | Static/Dynamic | Default | Description | +|:---------------------------------------|:---------------|:--------|:------------------------------------------------------| +| `knn.remote_index_build.enabled` | Dynamic | `false` | Enables remote vector index building for the cluster. | +| `index.knn.remote_index_build.enabled` | Dynamic | `true` | Enables remote index building for the index. Takes effect only if `knn.remote_index_build.enabled` is set to `true`. | + +The remote vector index builder for an index is enabled only when both the cluster-level `knn.remote_index_build.enabled` setting and the `index.knn.remote_index_build.enabled` index-level setting are set to `true`. +{: .note} + +### Step 2: Create and register the remote vector repository + +The remote vector repository acts as an intermediate object store between the OpenSearch cluster and the remote build service. The cluster uploads vectors and document IDs to the repository. The remote build service retrieves the data, builds the index externally, and uploads the completed result back to the repository. + +To create and register the repository, follow the steps in [Register repository]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/snapshots/snapshot-restore/#register-repository). Then set the `knn.remote_index_build.repository` dynamic setting to the name of the registered repository. + +The remote build service currently only supports Amazon Simple Storage Service (Amazon S3) repositories. +{: .note} + +### Step 3: Set up a remote vector index builder + +Configure the remote endpoint in the k-NN settings by setting `knn.remote_index_build.service.endpoint` to a running [remote vector index builder](https://github.com/opensearch-project/remote-vector-index-builder) instance. For instructions on setting up the remote service, see [the user guide](https://github.com/opensearch-project/remote-vector-index-builder/blob/main/USER_GUIDE.md). + +## Configuring remote index build settings + +The remote index build service supports several additional, optional settings. For information about configuring any remaining remote index build settings, see [Remote index build settings]({{site.url}}{{site.baseurl}}/vector-search/settings/#remote-index-build-settings). + +## Using the remote index build service + +Once the remote index build service is configured, any segment flush and merge operations that meet the following requirements will transparently use the GPU build path: + +- The index is using one of the [supported configurations](#supported-configurations). +- The segment size is greater than `index.knn.remote_index_build.size.min` and less than `knn.remote_index_build.size.max`. 
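+
+For reference, the following is a minimal configuration sketch that enables the remote index build path using the dynamic settings described in the preceding prerequisites. The repository name (`vector-repo`) and the service endpoint are placeholders for your own registered repository and running remote vector index builder instance. Because `index.knn.remote_index_build.enabled` defaults to `true`, no index-level change is required unless you previously disabled it:
+
+```json
+PUT /_cluster/settings
+{
+  "persistent": {
+    "knn.remote_index_build.enabled": true,
+    "knn.remote_index_build.repository": "vector-repo",
+    "knn.remote_index_build.service.endpoint": "http://<remote-builder-host>:<port>"
+  }
+}
+```
+{% include copy-curl.html %}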
+ +You can monitor remote index build tasks by calling the k-NN Stats API and reviewing the [remote index build statistics]({{site.url}}{{site.baseurl}}/vector-search/api/knn/#remote-index-build-stats). \ No newline at end of file diff --git a/_vector-search/searching-data.md b/_vector-search/searching-data.md new file mode 100644 index 00000000000..8a821500b01 --- /dev/null +++ b/_vector-search/searching-data.md @@ -0,0 +1,75 @@ +--- +layout: default +title: Searching data +nav_order: 35 +--- + +# Searching vector data + +OpenSearch supports various methods for searching vector data, tailored to how the vectors were created and indexed. This guide explains the query syntax and options for raw vector search and auto-generated embedding search. + +## Search type comparison + +The following table compares the search syntax and typical use cases for each vector search method. + +| Feature | Query type | Input format | Model required | Use case | +|----------------------------------|------------------|------------------|---------------------|----------------------------| +| **Raw vectors** | [`knn`]({{site.url}}{{site.baseurl}}/query-dsl/specialized/k-nn/) | Vector array | No | Raw vector search | +| **Auto-generated embeddings** | [`neural`]({{site.url}}{{site.baseurl}}/query-dsl/specialized/neural/) | Text or image data | Yes | [AI search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/) | + +## Searching raw vectors + +To search raw vectors, use the `knn` query type, provide the `vector` array as input, and specify the number of returned results `k`: + +```json +GET /my-raw-vector-index/_search +{ + "query": { + "knn": { + "my_vector": { + "vector": [0.1, 0.2, 0.3], + "k": 2 + } + } + } +} +``` +{% include copy-curl.html %} + +## Searching auto-generated embeddings + +OpenSearch supports [AI-powered search methods]({{site.url}}{{site.baseurl}}/vector-search/ai-search/), including semantic, hybrid, multimodal, and conversational search with retrieval-augmented generation (RAG). These methods automatically generate embeddings from query input. + +To run an AI-powered search, use the `neural` query type. Specify the `query_text` input, the model ID of the embedding model you [configured in the ingest pipeline]({{site.url}}{{site.baseurl}}/vector-search/creating-vector-index/#converting-data-to-embeddings-during-ingestion), and the number of returned results `k`. To exclude embeddings from being returned in search results, specify the embedding field in the `_source.excludes` parameter: + +```json +GET /my-ai-search-index/_search +{ + "_source": { + "excludes": [ + "output_embedding" + ] + }, + "query": { + "neural": { + "output_embedding": { + "query_text": "What is AI search?", + "model_id": "mBGzipQB2gmRjlv_dOoB", + "k": 2 + } + } + } +} +``` +{% include copy-curl.html %} + +## Working with sparse vectors + +OpenSearch also supports sparse vectors. For more information, see [Neural sparse search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/neural-sparse-search/). 
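+
+For reference, sparse vector search uses the `neural_sparse` query type. The following is an example sketch in which the index name, the sparse vector field name (`passage_embedding`), and the model ID are placeholders for your own sparse encoding setup:
+
+```json
+GET /my-sparse-index/_search
+{
+  "query": {
+    "neural_sparse": {
+      "passage_embedding": {
+        "query_text": "What is AI search?",
+        "model_id": "<sparse encoding model ID>"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}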
+ +## Next steps + +- [Getting started with semantic and hybrid search]({{site.url}}{{site.baseurl}}/vector-search/tutorials/neural-search-tutorial/) +- [Filtering data]({{site.url}}{{site.baseurl}}/vector-search/filter-search-knn/) +- [k-NN query]({{site.url}}{{site.baseurl}}/query-dsl/specialized/k-nn/) +- [Neural query]({{site.url}}{{site.baseurl}}/query-dsl/specialized/neural/) diff --git a/_vector-search/settings.md b/_vector-search/settings.md new file mode 100644 index 00000000000..0cca5e86d44 --- /dev/null +++ b/_vector-search/settings.md @@ -0,0 +1,100 @@ +--- +layout: default +title: Settings +nav_order: 90 +redirect_from: + - /search-plugins/knn/settings/ +--- + +# Vector search settings + +OpenSearch supports the following vector search settings. To learn more about static and dynamic settings, see [Configuring OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/). + +## Cluster settings + +The following table lists all available cluster-level vector search settings. For more information about cluster settings, see [Configuring OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/) and [Updating cluster settings using the API]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/#updating-cluster-settings-using-the-api). + +Setting | Static/Dynamic | Default | Description +:--- | :--- | :--- | :--- +`knn.algo_param.index_thread_qty` | Dynamic | `1` | The number of threads used for native library and Lucene library (for OpenSearch version 2.19 and later) index creation. Keeping this value low reduces the CPU impact of the k-NN plugin but also reduces indexing performance. +`knn.cache.item.expiry.enabled` | Dynamic | `false` | Whether to remove native library indexes from memory that have not been accessed in a specified period of time. +`knn.cache.item.expiry.minutes` | Dynamic | `3h` | If enabled, the amount of idle time before a native library index is removed from memory. +`knn.circuit_breaker.unset.percentage` | Dynamic | `75` | The native memory usage threshold for the circuit breaker. Memory usage must be lower than this percentage of `knn.memory.circuit_breaker.limit` in order for `knn.circuit_breaker.triggered` to remain `false`. +`knn.circuit_breaker.triggered` | Dynamic | `false` | `true` when memory usage exceeds the `knn.circuit_breaker.unset.percentage` value. +`knn.memory.circuit_breaker.limit` | Dynamic | `50%` | The native memory limit for native library indexes. At the default value, if a machine has 100 GB of memory and the JVM uses 32 GB, then the k-NN plugin uses 50% of the remaining 68 GB (34 GB). If memory usage exceeds this value, then the plugin removes the native library indexes used least recently. <br><br> To configure this limit at the node level, add `node.attr.knn_cb_tier: "<tier-name>"` in `opensearch.yml` and set `knn.memory.circuit_breaker.limit.<tier-name>` in the cluster settings. For example, define a node tier as `node.attr.knn_cb_tier: "integ"` and set `knn.memory.circuit_breaker.limit.integ: "80%"`. Nodes use their tier's circuit breaker limit if configured, defaulting to the cluster-wide setting if no node-specific value is set. +`knn.memory.circuit_breaker.enabled` | Dynamic | `true` | Whether to enable the k-NN memory circuit breaker.
+`knn.model.index.number_of_shards`| Dynamic | `1` | The number of shards to use for the model system index, which is the OpenSearch index that stores the models used for approximate nearest neighbor (ANN) search. +`knn.model.index.number_of_replicas`| Dynamic | `1` | The number of replica shards to use for the model system index. Generally, in a multi-node cluster, this value should be at least 1 in order to increase stability. +`knn.model.cache.size.limit` | Dynamic | `10%` | The model cache limit cannot exceed 25% of the JVM heap. +`knn.faiss.avx2.disabled` | Static | `false` | A static setting that specifies whether to disable the SIMD-based `libopensearchknn_faiss_avx2.so` library and load the non-optimized `libopensearchknn_faiss.so` library for the Faiss engine on machines with x64 architecture. For more information, see [Single Instruction Multiple Data (SIMD) optimization]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-methods-engines/#simd-optimization). +`knn.faiss.avx512_spr.disabled` | Static | `false` | A static setting that specifies whether to disable the SIMD-based `libopensearchknn_faiss_avx512_spr.so` library and load either the `libopensearchknn_faiss_avx512.so` , `libopensearchknn_faiss_avx2.so`, or the non-optimized `libopensearchknn_faiss.so` library for the Faiss engine on machines with x64 architecture. For more information, see [SIMD optimization for the Faiss engine]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-methods-engines/#simd-optimization). + +## Index settings + +The following table lists all available index-level k-NN settings. For information about updating these settings, see [Index-level index settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index-settings/#index-level-index-settings). + +Several parameters defined in the settings are currently in the deprecation process. Those parameters should be set in the mapping instead of in the index settings. Parameters set in the mapping will override the parameters set in the index settings. Setting the parameters in the mapping allows an index to have multiple `knn_vector` fields with different parameters. + +Setting | Static/Dynamic | Default | Description +:--- | :--- |:--------| :--- +`index.knn` | Static | `false` | Whether the index should build native library indexes for the `knn_vector` fields. If set to `false`, the `knn_vector` fields will be stored in doc values, but approximate k-NN search functionality will be disabled. +`index.knn.algo_param.ef_search` | Dynamic | `100` | `ef` (or `efSearch`) represents the size of the dynamic list for the nearest neighbors used during a search. Higher `ef` values lead to a more accurate but slower search. `ef` cannot be set to a value lower than the number of queried nearest neighbors, `k`. `ef` can take any value between `k` and the size of the dataset. +`index.knn.advanced.approximate_threshold` | Dynamic | `0` | The number of vectors that a segment must have before creating specialized data structures for ANN search. Set to `-1` to disable building vector data structures and to `0` to always build them. +`index.knn.advanced.filtered_exact_search_threshold`| Dynamic | None | The filtered ID threshold value used to switch to exact search during filtered ANN search. If the number of filtered IDs in a segment is lower than this setting's value, then exact search will be performed on the filtered IDs. 
+`index.knn.derived_source.enabled` | Static | `true` | Prevents vectors from being stored in `_source`, reducing disk usage for vector indexes. +| `index.knn.memory_optimized_search` | Dynamic | `false` | Enables memory-optimized search on an index. | + +An index created in OpenSearch version 2.11 or earlier will still use the previous `ef_construction` and `ef_search` values (`512`). +{: .note} + +## Remote index build settings + +The following settings control [remote vector index building]({{site.url}}{{site.baseurl}}/vector-search/remote-index-build/). + +### Cluster settings + +The following remote index build settings apply at the cluster level. + +| Setting | Static/Dynamic | Default | Description | +|:------------------------------------------|:---------------|:--------|:---------------------------------------------------------------------------------------------------------| +| `knn.remote_index_build.enabled` | Dynamic | `false` | Enables remote vector index building for the cluster. | +| `knn.remote_index_build.repository` | Dynamic | None | The repository to which the remote index builder should write. | +| `knn.remote_index_build.service.endpoint` | Dynamic | None | The endpoint URL of the remote build service. | + +#### Advanced cluster settings + +The following are advanced cluster settings. The default values for these settings are configured using extensive benchmarking. + +| Setting | Static/Dynamic | Default | Description | +|:----------------------------------------|:---------------|:--------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `knn.remote_index_build.poll.interval` | Dynamic | `5s` | How frequently the client should poll the remote build service for job status. | +| `knn.remote_index_build.client.timeout` | Dynamic | `60m` | The maximum amount of time to wait for remote build completion before falling back to a CPU-based build. | +| `knn.remote_index_build.size.max` | Dynamic | `0` | The maximum segment size for the remote index build service, based on the service implementation constraints. Must be greater than `0`. | + +### Index settings + +The following remote index build settings apply at the index level. + +| Setting | Static/Dynamic | Default | Description | +|:----------------------------------------------|:---------------|:--------|:----------------------------------------------------------| +| `index.knn.remote_index_build.enabled` | Dynamic | `false` | Enables remote index building for the index. | + +#### Advanced index settings + +The following index settings are advanced settings whose default values are set as a result of extensive benchmarking. + +| Setting | Static/Dynamic | Default | Description | +|:----------------------------------------|:---------------|:--------|:----------------------------------------------------------| +| `index.knn.remote_index_build.size.min` | Dynamic | `50mb` | The minimum size required to enable remote vector builds. 
| + +### Remote build authentication + +The remote build service username and password are secure settings that must be set in the [OpenSearch keystore]({{site.url}}{{site.baseurl}}/security/configuration/opensearch-keystore/) as follows: + +```bash +./bin/opensearch-keystore add knn.remote_index_build.service.username +./bin/opensearch-keystore add knn.remote_index_build.service.password +``` +{% include copy.html %} + +You can reload the secure settings without restarting the node by using the [Nodes Reload Secure Settings API]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/nodes-reload-secure/). diff --git a/_vector-search/specialized-operations/index.md b/_vector-search/specialized-operations/index.md new file mode 100644 index 00000000000..4a97327fd4c --- /dev/null +++ b/_vector-search/specialized-operations/index.md @@ -0,0 +1,22 @@ +--- +layout: default +title: Specialized vector search +nav_order: 50 +has_children: true +has_toc: false +redirect_from: + - /vector-search/specialized-operations/ +cards: + - heading: "Nested field vector search" + description: "Use vector search to search nested fields" + link: "/vector-search/specialized-operations/nested-search-knn/" + - heading: "Radial search" + description: "Search all points in a vector space that reside within a specified maximum distance or minimum score threshold from a query point" + link: "/vector-search/specialized-operations/radial-search-knn/" +--- + +# Specialized vector search + +OpenSearch supports the following specialized vector search applications. + +{% include cards.html cards=page.cards %} \ No newline at end of file diff --git a/_search-plugins/knn/nested-search-knn.md b/_vector-search/specialized-operations/nested-search-knn.md similarity index 60% rename from _search-plugins/knn/nested-search-knn.md rename to _vector-search/specialized-operations/nested-search-knn.md index bbba6c9c1e5..f703b70323d 100644 --- a/_search-plugins/knn/nested-search-knn.md +++ b/_vector-search/specialized-operations/nested-search-knn.md @@ -1,26 +1,28 @@ --- layout: default -title: k-NN search with nested fields -nav_order: 21 -parent: k-NN search +title: Nested field search +nav_order: 40 +parent: Specialized vector search has_children: false has_math: true +redirect_from: + - /search-plugins/knn/nested-search-knn/ --- -# k-NN search with nested fields +# Nested field search -Using [nested fields]({{site.url}}{{site.baseurl}}/field-types/nested/) in a k-nearest neighbors (k-NN) index, you can store multiple vectors in a single document. For example, if your document consists of various components, you can generate a vector value for each component and store each vector in a nested field. +Using [nested fields]({{site.url}}{{site.baseurl}}/field-types/nested/) in a vector index, you can store multiple vectors in a single document. For example, if your document consists of various components, you can generate a vector value for each component and store each vector in a nested field. -A k-NN document search operates at the field level. For a document with nested fields, OpenSearch examines only the vector nearest to the query vector to decide whether to include the document in the results. For example, consider an index containing documents `A` and `B`. Document `A` is represented by vectors `A1` and `A2`, and document `B` is represented by vector `B1`. Further, the similarity order for a query Q is `A1`, `A2`, `B1`.
If you search using query Q with a k value of 2, the search will return both documents `A` and `B` instead of only document `A`. +A vector search operates at the field level. For a document with nested fields, OpenSearch examines only the vector nearest to the query vector to decide whether to include the document in the results. For example, consider an index containing documents `A` and `B`. Document `A` is represented by vectors `A1` and `A2`, and document `B` is represented by vector `B1`. Further, the similarity order for a query Q is `A1`, `A2`, `B1`. If you search using query Q with a k value of 2, the search will return both documents `A` and `B` instead of only document `A`. Note that in the case of an approximate search, the results are approximations and not exact matches. -k-NN search with nested fields is supported by the HNSW algorithm for the Lucene and Faiss engines. +Vector search with nested fields is supported by the HNSW algorithm for the Lucene and Faiss engines. ## Indexing and searching nested fields -To use k-NN search with nested fields, you must create a k-NN index by setting `index.knn` to `true`. Create a nested field by setting its `type` to `nested` and specify one or more fields of the `knn_vector` data type within the nested field. In this example, the `knn_vector` field `my_vector` is nested inside the `nested_field` field: +To use vector search with nested fields, you must create a vector index by setting `index.knn` to `true`. Create a nested field by setting its `type` to `nested` and specify one or more fields of the `knn_vector` data type within the nested field. In this example, the `knn_vector` field `my_vector` is nested inside the `nested_field` field: ```json PUT my-knn-index-1 @@ -71,7 +73,7 @@ PUT _bulk?refresh=true ``` {% include copy-curl.html %} -Then run a k-NN search on the data by using the `knn` query type: +Then run a vector search on the data by using the `knn` query type: ```json GET my-knn-index-1/_search @@ -300,13 +302,188 @@ The response contains matching documents. For each matching document, the `inner } ``` -## k-NN search with filtering on nested fields +## Retrieving all nested hits -You can apply a filter to a k-NN search with nested fields. A filter can be applied to either a top-level field or a field inside a nested field. +By default, only the highest-scoring nested document is considered when you query nested fields. To retrieve the scores for all nested field documents within each parent document, set `expand_nested_docs` to `true` in your query. The parent document's score is calculated as the average of their scores. 
To use the highest score among the nested field documents as the parent document's score, set `score_mode` to `max`: + +```json +GET my-knn-index-1/_search +{ + "_source": false, + "query": { + "nested": { + "path": "nested_field", + "query": { + "knn": { + "nested_field.my_vector": { + "vector": [1,1,1], + "k": 2, + "expand_nested_docs": true + } + } + }, + "inner_hits": { + "_source": false, + "fields":["nested_field.color"] + }, + "score_mode": "max" + } + } +} +``` +{% include copy-curl.html %} + +The response contains all matching documents: + +```json +{ + "took": 13, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 1.0, + "hits": [ + { + "_index": "my-knn-index-1", + "_id": "1", + "_score": 1.0, + "inner_hits": { + "nested_field": { + "hits": { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": 1.0, + "hits": [ + { + "_index": "my-knn-index-1", + "_id": "1", + "_nested": { + "field": "nested_field", + "offset": 0 + }, + "_score": 1.0, + "fields": { + "nested_field.color": [ + "blue" + ] + } + }, + { + "_index": "my-knn-index-1", + "_id": "1", + "_nested": { + "field": "nested_field", + "offset": 1 + }, + "_score": 0.25, + "fields": { + "nested_field.color": [ + "blue" + ] + } + }, + { + "_index": "my-knn-index-1", + "_id": "1", + "_nested": { + "field": "nested_field", + "offset": 2 + }, + "_score": 0.07692308, + "fields": { + "nested_field.color": [ + "white" + ] + } + } + ] + } + } + } + }, + { + "_index": "my-knn-index-1", + "_id": "2", + "_score": 0.0040983604, + "inner_hits": { + "nested_field": { + "hits": { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": 0.0040983604, + "hits": [ + { + "_index": "my-knn-index-1", + "_id": "2", + "_nested": { + "field": "nested_field", + "offset": 0 + }, + "_score": 0.0040983604, + "fields": { + "nested_field.color": [ + "blue" + ] + } + }, + { + "_index": "my-knn-index-1", + "_id": "2", + "_nested": { + "field": "nested_field", + "offset": 1 + }, + "_score": 9.2250924E-4, + "fields": { + "nested_field.color": [ + "yellow" + ] + } + }, + { + "_index": "my-knn-index-1", + "_id": "2", + "_nested": { + "field": "nested_field", + "offset": 2 + }, + "_score": 3.9619653E-4, + "fields": { + "nested_field.color": [ + "white" + ] + } + } + ] + } + } + } + } + ] + } +} +``` + +## Vector search with filtering on nested fields + +You can apply a filter to a vector search with nested fields. A filter can be applied to either a top-level field or a field inside a nested field. The following example applies a filter to a top-level field. -First, create a k-NN index with a nested field: +First, create a vector index with a nested field: ```json PUT my-knn-index-1 @@ -355,7 +532,7 @@ PUT _bulk?refresh=true ``` {% include copy-curl.html %} -Then run a k-NN search on the data using the `knn` query type with a filter. The following query returns documents whose `parking` field is set to `true`: +Then run a vector search on the data using the `knn` query type with a filter. 
The following query returns documents whose `parking` field is set to `true`: ```json GET my-knn-index-1/_search diff --git a/_search-plugins/knn/radial-search-knn.md b/_vector-search/specialized-operations/radial-search-knn.md similarity index 84% rename from _search-plugins/knn/radial-search-knn.md rename to _vector-search/specialized-operations/radial-search-knn.md index e5449a0993a..6aecc446077 100644 --- a/_search-plugins/knn/radial-search-knn.md +++ b/_vector-search/specialized-operations/radial-search-knn.md @@ -1,36 +1,40 @@ --- layout: default title: Radial search -nav_order: 28 -parent: k-NN search +nav_order: 50 +parent: Specialized vector search has_children: false has_math: true +redirect_from: + - /search-plugins/knn/radial-search-knn/ --- # Radial search -Radial search enhances the k-NN plugin's capabilities beyond approximate top-`k` searches. With radial search, you can search all points within a vector space that reside within a specified maximum distance or minimum score threshold from a query point. This provides increased flexibility and utility in search operations. +Radial search enhances vector search capabilities beyond approximate top-k searches. With radial search, you can search all points within a vector space that reside within a specified maximum distance or minimum score threshold from a query point. This provides increased flexibility and utility in search operations. -## Parameter type +## Parameters -`max_distance` allows users to specify a physical distance within the vector space, identifying all points that are within this distance from the query point. This approach is particularly useful for applications requiring spatial proximity or absolute distance measurements. +Radial search supports the following parameters: -`min_score` enables the specification of a similarity score, facilitating the retrieval of points that meet or exceed this score in relation to the query point. This method is ideal in scenarios where relative similarity, based on a specific metric, is more critical than physical proximity. +- `max_distance`: Specifies a physical distance within the vector space, identifying all points that are within this distance from the query point. This approach is particularly useful for applications requiring spatial proximity or absolute distance measurements. -Only one query variable, either `k`, `max_distance`, or `min_score`, is required to be specified during radial search. For more information about the vector spaces, see [Spaces](#spaces). +- `min_score`: Specifies a similarity score, facilitating the retrieval of points that meet or exceed this score in relation to the query point. This method is ideal in scenarios where relative similarity, based on a specific metric, is more critical than physical proximity. + +You must specify exactly one query variable (`k`, `max_distance`, or `min_score`) during radial search. ## Supported cases -You can perform radial search with either the Lucene or Faiss engines. The following table summarizes radial search use cases by engine. +You can perform radial search with either the Lucene or Faiss engine. The following table summarizes radial search use cases by engine.
| Engine supported | Filter supported | Nested field supported | Search type | | :--- | :--- | :--- | :--- | -| Lucene | true | false | approximate | -| Faiss | true | true | approximate | +| Lucene | Yes | No | Approximate | +| Faiss | Yes | Yes | Approximate | ## Spaces -For supported spaces, see [Spaces]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#spaces). +For supported spaces, see [Spaces]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-spaces/). ## Examples @@ -38,7 +42,7 @@ The following examples can help you to get started with radial search. ### Prerequisites -To use a k-NN index with radial search, create a k-NN index by setting `index.knn` to `true`. Specify one or more fields of the `knn_vector` data type, as shown in the following example: +To use a vector index with radial search, create a vector index by setting `index.knn` to `true`. Specify one or more fields of the `knn_vector` data type, as shown in the following example: ```json PUT knn-index-test diff --git a/_vector-search/vector-search-techniques/approximate-knn.md b/_vector-search/vector-search-techniques/approximate-knn.md new file mode 100644 index 00000000000..09f7a31df9d --- /dev/null +++ b/_vector-search/vector-search-techniques/approximate-knn.md @@ -0,0 +1,261 @@ +--- +layout: default +title: Approximate k-NN search +nav_order: 15 +parent: Vector search techniques +has_children: false +has_math: true +redirect_from: + - /search-plugins/knn/approximate-knn/ +--- + +# Approximate k-NN search + +Standard k-nearest neighbors (k-NN) search methods compute similarity using a brute-force approach that measures the nearest distance between a query and a number of points, which produces exact results. This works well in many applications. However, in the case of extremely large datasets with high dimensionality, this creates a scaling problem that reduces the efficiency of the search. Approximate k-NN search methods can overcome this by employing tools that restructure indexes more efficiently and reduce the dimensionality of searchable vectors. Using this approach requires a sacrifice in accuracy but increases search processing speeds appreciably. + +The approximate k-NN search methods in OpenSearch use approximate nearest neighbor (ANN) algorithms from the [NMSLIB](https://github.com/nmslib/nmslib), [Faiss](https://github.com/facebookresearch/faiss), and [Lucene](https://lucene.apache.org/) libraries to power k-NN search. These search methods employ ANN to improve search latency for large datasets. Of the three search methods OpenSearch provides, this method offers the best search scalability for large datasets. This approach is the preferred method when a dataset reaches hundreds of thousands of vectors. + +For information about the algorithms OpenSearch supports, see [Methods and engines]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-methods-engines/). +{: .note} + +OpenSearch builds a native library index of the vectors for each `knn-vector` field/Lucene segment pair during indexing, which can be used to efficiently find the k-nearest neighbors to a query vector during search. To learn more about Lucene segments, see the [Apache Lucene documentation](https://lucene.apache.org/core/8_9_0/core/org/apache/lucene/codecs/lucene87/package-summary.html#package.description). These native library indexes are loaded into native memory during search and managed by a cache. 
To learn more about preloading native library indexes into memory, see [Warmup API]({{site.url}}{{site.baseurl}}/vector-search/api/knn#warmup-operation). Additionally, you can see which native library indexes are already loaded into memory using the [Stats API]({{site.url}}{{site.baseurl}}/vector-search/api/knn#stats). + +Because the native library indexes are constructed during indexing, it is not possible to apply a filter on an index and then use this search method. All filters are applied to the results produced by the ANN search. + +## Get started with approximate k-NN + +To use the approximate search functionality, you must first create a vector index with `index.knn` set to `true`. This setting tells OpenSearch to create native library indexes for the index. + +Next, you must add one or more fields of the `knn_vector` data type. The following example creates an index with two `knn_vector` fields using the `faiss` engine: + +```json +PUT my-knn-index-1 +{ + "settings": { + "index": { + "knn": true, + "knn.algo_param.ef_search": 100 + } + }, + "mappings": { + "properties": { + "my_vector1": { + "type": "knn_vector", + "dimension": 2, + "space_type": "l2", + "method": { + "name": "hnsw", + "engine": "faiss", + "parameters": { + "ef_construction": 128, + "m": 24 + } + } + }, + "my_vector2": { + "type": "knn_vector", + "dimension": 4, + "space_type": "innerproduct", + "method": { + "name": "hnsw", + "engine": "faiss", + "parameters": { + "ef_construction": 256, + "m": 48 + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +In the preceding example, both `knn_vector` fields are configured using method definitions. Additionally, `knn_vector` fields can be configured using models. For more information, see [k-NN vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/). + +The `knn_vector` data type supports a vector of floats that can have a dimension count of up to 16,000 for the NMSLIB, Faiss, and Lucene engines, as set by the `dimension` mapping parameter. + +In OpenSearch, codecs handle the storage and retrieval of indexes. OpenSearch uses a custom codec to write vector data to native library indexes so that the underlying k-NN search library can read it. 
+{: .tip } + +After you create the index, you can add some data to it: + +```json +POST _bulk +{ "index": { "_index": "my-knn-index-1", "_id": "1" } } +{ "my_vector1": [1.5, 2.5], "price": 12.2 } +{ "index": { "_index": "my-knn-index-1", "_id": "2" } } +{ "my_vector1": [2.5, 3.5], "price": 7.1 } +{ "index": { "_index": "my-knn-index-1", "_id": "3" } } +{ "my_vector1": [3.5, 4.5], "price": 12.9 } +{ "index": { "_index": "my-knn-index-1", "_id": "4" } } +{ "my_vector1": [5.5, 6.5], "price": 1.2 } +{ "index": { "_index": "my-knn-index-1", "_id": "5" } } +{ "my_vector1": [4.5, 5.5], "price": 3.7 } +{ "index": { "_index": "my-knn-index-1", "_id": "6" } } +{ "my_vector2": [1.5, 5.5, 4.5, 6.4], "price": 10.3 } +{ "index": { "_index": "my-knn-index-1", "_id": "7" } } +{ "my_vector2": [2.5, 3.5, 5.6, 6.7], "price": 5.5 } +{ "index": { "_index": "my-knn-index-1", "_id": "8" } } +{ "my_vector2": [4.5, 5.5, 6.7, 3.7], "price": 4.4 } +{ "index": { "_index": "my-knn-index-1", "_id": "9" } } +{ "my_vector2": [1.5, 5.5, 4.5, 6.4], "price": 8.9 } +``` +{% include copy-curl.html %} + +Then you can run an ANN search on the data using the `knn` query type: + +```json +GET my-knn-index-1/_search +{ + "size": 2, + "query": { + "knn": { + "my_vector2": { + "vector": [2, 3, 5, 6], + "k": 2 + } + } + } +} +``` +{% include copy-curl.html %} + +## The number of returned results + +In the preceding query, `k` represents the number of neighbors returned by the search of each graph. You must also include the `size` parameter, indicating the final number of results that you want the query to return. + +For the NMSLIB and Faiss engines, `k` represents the maximum number of documents returned for all segments of a shard. For the Lucene engine, `k` represents the number of documents returned for a shard. The maximum value of `k` is 10,000. + +For any engine, each shard returns `size` results to the coordinator node. Thus, the total number of results that the coordinator node receives is `size * number of shards`. After the coordinator node consolidates the results received from all nodes, the query returns the top `size` results. + +The following table provides examples of the number of results returned by various engines in several scenarios. For these examples, assume that the number of documents contained in the segments and shards is sufficient to return the number of results specified in the table. + +`size` | `k` | Number of primary shards | Number of segments per shard | Number of returned results, Faiss/NMSLIB | Number of returned results, Lucene +:--- | :--- | :--- | :--- | :--- | :--- +10 | 1 | 1 | 4 | 4 | 1 +10 | 10 | 1 | 4 | 10 | 10 +10 | 1 | 2 | 4 | 8 | 2 + +The number of results returned by Faiss/NMSLIB differs from the number of results returned by Lucene only when `k` is smaller than `size`. If `k` and `size` are equal, all engines return the same number of results. + +Starting in OpenSearch 2.14, you can use `k`, `min_score`, or `max_distance` for [radial search]({{site.url}}{{site.baseurl}}/search-plugins/knn/radial-search-knn/). + +## Building a vector index from a model + +For some of the algorithms that OpenSearch supports, the native library index needs to be trained before it can be used. It would be expensive to train every newly created segment, so, instead, OpenSearch features the concept of a *model* that initializes the native library index during segment creation. 
You can create a model by calling the [Train API]({{site.url}}{{site.baseurl}}/vector-search/api/knn#train-a-model) and passing in the source of the training data and the method definition of the model. Once training is complete, the model is serialized to a k-NN model system index. Then, during indexing, the model is pulled from that index to initialize the segments. + +To train a model, you first need an OpenSearch index containing training data. Training data can come from any `knn_vector` field that has a dimension matching the dimension of the model you want to create. Training data can be the same as the data you plan to index or come from a separate dataset. To create a training index, send the following request: + +```json +PUT /train-index +{ + "settings": { + "number_of_shards": 3, + "number_of_replicas": 0 + }, + "mappings": { + "properties": { + "train-field": { + "type": "knn_vector", + "dimension": 4 + } + } + } +} +``` +{% include copy-curl.html %} + +Notice that `index.knn` is not set in the index settings. This ensures that you do not create native library indexes for this index. + +You can now add some data to the index: + +```json +POST _bulk +{ "index": { "_index": "train-index", "_id": "1" } } +{ "train-field": [1.5, 5.5, 4.5, 6.4]} +{ "index": { "_index": "train-index", "_id": "2" } } +{ "train-field": [2.5, 3.5, 5.6, 6.7]} +{ "index": { "_index": "train-index", "_id": "3" } } +{ "train-field": [4.5, 5.5, 6.7, 3.7]} +{ "index": { "_index": "train-index", "_id": "4" } } +{ "train-field": [1.5, 5.5, 4.5, 6.4]} +``` +{% include copy-curl.html %} + +After completing indexing into the training index, you can call the Train API: + +```json +POST /_plugins/_knn/models/my-model/_train +{ + "training_index": "train-index", + "training_field": "train-field", + "dimension": 4, + "description": "My model description", + "method": { + "name": "ivf", + "engine": "faiss", + "parameters": { + "encoder": { + "name": "pq", + "parameters": { + "code_size": 2, + "m": 2 + } + } + } + } +} +``` +{% include copy-curl.html %} + +For more information about the method parameters, see [IVF training requirements]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-methods-engines/#ivf-training-requirements). + +The Train API returns as soon as the training job is started. To check the job status, use the Get Model API: + +```json +GET /_plugins/_knn/models/my-model?filter_path=state&pretty +{ + "state": "training" +} +``` +{% include copy-curl.html %} + +Once the model enters the `created` state, you can create an index that will use this model to initialize its native library indexes: + +```json +PUT /target-index +{ + "settings": { + "number_of_shards": 3, + "number_of_replicas": 1, + "index.knn": true + }, + "mappings": { + "properties": { + "target-field": { + "type": "knn_vector", + "model_id": "my-model" + } + } + } +} +``` +{% include copy-curl.html %} + +Lastly, you can add the documents you want to be searched to the index: + +```json +POST _bulk +{ "index": { "_index": "target-index", "_id": "1" } } +{ "target-field": [1.5, 5.5, 4.5, 6.4]} +{ "index": { "_index": "target-index", "_id": "2" } } +{ "target-field": [2.5, 3.5, 5.6, 6.7]} +{ "index": { "_index": "target-index", "_id": "3" } } +{ "target-field": [4.5, 5.5, 6.7, 3.7]} +{ "index": { "_index": "target-index", "_id": "4" } } +{ "target-field": [1.5, 5.5, 4.5, 6.4]} +``` +{% include copy-curl.html %} + +After data is ingested, it can be searched in the same way as any other `knn_vector` field. 
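For example, the following request searches `target-index` using the same `knn` query syntax shown earlier in this guide. The query vector is an arbitrary illustration and must match the model's dimension (`4` in this example):

```json
GET /target-index/_search
{
  "size": 2,
  "query": {
    "knn": {
      "target-field": {
        "vector": [2.5, 3.5, 5.6, 6.7],
        "k": 2
      }
    }
  }
}
```
{% include copy-curl.html %}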
diff --git a/_vector-search/vector-search-techniques/index.md b/_vector-search/vector-search-techniques/index.md new file mode 100644 index 00000000000..8a3f9500690 --- /dev/null +++ b/_vector-search/vector-search-techniques/index.md @@ -0,0 +1,38 @@ +--- +layout: default +title: Vector search techniques +nav_order: 15 +has_children: true +has_toc: false +redirect_from: + - /search-plugins/knn/ + - /search-plugins/knn/index/ + - /vector-search/vector-search-techniques/ +--- + +# Vector search techniques + +OpenSearch implements vector search as *k-nearest neighbors*, or *k-NN*, search. k-NN search finds the k neighbors closest to a query point across an index of vectors. To determine the neighbors, you can specify the space (the distance function) you want to use to measure the distance between points. + +OpenSearch supports three different methods for obtaining the k-nearest neighbors from an index of vectors: + +- [Approximate search]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/) (approximate k-NN, or ANN): Returns approximate nearest neighbors to the query vector. Usually, approximate search algorithms sacrifice indexing speed and search accuracy in exchange for performance benefits such as lower latency, smaller memory footprints, and more scalable search. For most use cases, approximate search is the best option. + +- Exact search: A brute-force, exact k-NN search of vector fields. OpenSearch supports the following types of exact search: + - [Exact search with a scoring script]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-score-script/): Using a scoring script, you can apply a filter to an index before executing the nearest neighbor search. + - [Painless extensions]({{site.url}}{{site.baseurl}}/search-plugins/knn/painless-functions/): Adds the distance functions as Painless extensions that you can use in more complex combinations. You can use this method to perform a brute-force, exact vector search of an index, which also supports pre-filtering. + + +In general, you should choose the ANN method for larger datasets because it scales significantly better. For smaller datasets, where you may want to apply a filter, you should choose the custom scoring approach. If you have a more complex use case in which you need to use a distance function as part of the scoring method, you should use the Painless scripting approach. + +## Approximate search + +OpenSearch supports multiple backend algorithms (_methods_) and libraries for implementing these algorithms (_engines_). It automatically selects the optimal configuration based on the chosen mode and available memory. For more information, see [Methods and engines]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-methods-engines/). + +## Using sparse vectors + +_Neural sparse search_ offers an efficient alternative to dense vector search by using sparse embedding models and inverted indexes, providing performance similar to BM25. Unlike dense vector methods that require significant memory and CPU resources, sparse search creates a list of token-weight pairs and stores them in a rank features index. This approach combines the efficiency of traditional search with the semantic understanding of neural networks. OpenSearch supports both automatic embedding generation through ingest pipelines and direct sparse vector ingestion. For more information, see [Neural sparse search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/neural-sparse-search/). 
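To illustrate direct sparse vector ingestion, the following is a minimal sketch in which token-weight pairs are stored in a rank features field. The index name, field names, and token weights are hypothetical:

```json
PUT /my-sparse-index
{
  "mappings": {
    "properties": {
      "passage_text": {
        "type": "text"
      },
      "passage_embedding": {
        "type": "rank_features"
      }
    }
  }
}
```
{% include copy-curl.html %}

```json
PUT /my-sparse-index/_doc/1
{
  "passage_text": "Hello world",
  "passage_embedding": {
    "hello": 2.5,
    "world": 1.2
  }
}
```
{% include copy-curl.html %}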
+ +## Combining multiple search techniques + +_Hybrid search_ enhances search relevance by combining multiple search techniques in OpenSearch. It integrates traditional keyword search with vector-based semantic search. Through a configurable search pipeline, hybrid search normalizes and combines scores from different search methods to provide unified, relevant results. This approach is particularly effective for complex queries where both semantic understanding and exact matching are important. The search pipeline can be further customized with post-filtering operations and aggregations to meet specific search requirements. For more information, see [Hybrid search]({{site.url}}{{site.baseurl}}/vector-search/ai-search/hybrid-search/). diff --git a/_search-plugins/knn/knn-score-script.md b/_vector-search/vector-search-techniques/knn-score-script.md similarity index 58% rename from _search-plugins/knn/knn-score-script.md rename to _vector-search/vector-search-techniques/knn-score-script.md index a184de2d3d4..da5b159baad 100644 --- a/_search-plugins/knn/knn-score-script.md +++ b/_vector-search/vector-search-techniques/knn-score-script.md @@ -1,23 +1,25 @@ --- layout: default -title: Exact k-NN with scoring script -nav_order: 10 -parent: k-NN search -has_children: false +title: Exact k-NN search with a scoring script +nav_order: 20 +parent: Vector search techniques +has_children: true has_math: true +redirect_from: + - /search-plugins/knn/knn-score-script/ --- -# Exact k-NN with scoring script +# Exact k-NN search with a scoring script -The k-NN plugin implements the OpenSearch score script plugin that you can use to find the exact k-nearest neighbors to a given query point. Using the k-NN score script, you can apply a filter on an index before executing the nearest neighbor search. This is useful for dynamic search cases where the index body may vary based on other conditions. +You can use exact k-nearest neighbors (k-NN) search with a scoring script to find the exact k-nearest neighbors to a given query point. Using the k-NN scoring script, you can apply a filter on an index before executing the nearest neighbor search. This is useful for dynamic search use cases, where the index body may vary based on other conditions. -Because the score script approach executes a brute force search, it doesn't scale as well as the [approximate approach]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn). In some cases, it might be better to think about refactoring your workflow or index structure to use the approximate approach instead of the score script approach. +Because the scoring script approach executes a brute force search, it doesn't scale as efficiently as the [approximate approach]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/). In some cases, it might be better to consider refactoring your workflow or index structure to use the approximate approach instead of the scoring script approach. -## Getting started with the score script for vectors +## Getting started with the scoring script for vectors -Similar to approximate nearest neighbor search, in order to use the score script on a body of vectors, you must first create an index with one or more `knn_vector` fields. +Similarly to approximate nearest neighbor (ANN) search, in order to use the scoring script on a body of vectors, you must first create an index with one or more `knn_vector` fields. 
-If you intend to just use the score script approach (and not the approximate approach) you can set `index.knn` to `false` and not set `index.knn.space_type`. You can choose the space type during search. See [spaces](#spaces) for the spaces the k-NN score script suppports. +If you intend to only use the scoring script approach (and not the approximate approach), you can set `index.knn` to `false` and not set `index.knn.space_type`. You can choose the space type during search. For the spaces that the k-NN scoring script supports, see [Spaces]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-spaces/). This example creates an index with two `knn_vector` fields: @@ -40,7 +42,7 @@ PUT my-knn-index-1 ``` {% include copy-curl.html %} -If you *only* want to use the score script, you can omit `"index.knn": true`. The benefit of this approach is faster indexing speed and lower memory usage, but you lose the ability to perform standard k-NN queries on the index. +If you want to *only* use the scoring script, you can omit `"index.knn": true`. This approach leads to faster indexing speed and lower memory usage, but you lose the ability to run standard k-NN queries on the index. {: .tip} After you create the index, you can add some data to it: @@ -68,7 +70,8 @@ POST _bulk ``` {% include copy-curl.html %} -Finally, you can execute an exact nearest neighbor search on the data using the `knn` script: +Finally, you can run an exact nearest neighbor search on the data using the `knn` script: + ```json GET my-knn-index-1/_search { @@ -102,11 +105,11 @@ All parameters are required. - `field` is the field that contains your vector data. - `query_value` is the point you want to find the nearest neighbors for. For the Euclidean and cosine similarity spaces, the value must be an array of floats that matches the dimension set in the field's mapping. For Hamming bit distance, this value can be either of type signed long or a base64-encoded string (for the long and binary field types, respectively). -- `space_type` corresponds to the distance function. See the [spaces section](#spaces). +- `space_type` corresponds to the distance function. For more information, see [Spaces]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-spaces/). -The [post filter example in the approximate approach]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn#using-approximate-k-nn-with-filters) shows a search that returns fewer than `k` results. If you want to avoid this situation, the score script method lets you essentially invert the order of events. In other words, you can filter down the set of documents over which to execute the k-nearest neighbor search. +The [post filter example in the approximate approach]({{site.url}}{{site.baseurl}}/vector-search/filter-search-knn/) shows a search that returns fewer than `k` results. If you want to avoid this, the scoring script method lets you essentially invert the order of events. In other words, you can filter the set of documents on which to execute the k-NN search. -This example shows a pre-filter approach to k-NN search with the score script approach. First, create the index: +This example shows a pre-filter approach to k-NN search with the scoring script approach. 
First, create the index: ```json PUT my-knn-index-2 @@ -177,8 +180,9 @@ GET my-knn-index-2/_search ``` {% include copy-curl.html %} -## Getting started with the score script for binary data -The k-NN score script also allows you to run k-NN search on your binary data with the Hamming distance space. +## Getting started with the scoring script for binary data + +The k-NN scoring script also allows you to run k-NN search on your binary data with the Hamming distance space. In order to use Hamming distance, the field of interest must have either a `binary` or `long` field type. If you're using `binary` type, the data must be a base64-encoded string. This example shows how to use the Hamming distance space with a `binary` field type: @@ -284,23 +288,3 @@ GET my-long-index/_search ``` {% include copy-curl.html %} -## Spaces - -A _space_ corresponds to the function used to measure the distance between two points in order to determine the k-nearest neighbors. From the k-NN perspective, a lower score equates to a closer and better result. This is the opposite of how OpenSearch scores results, where a higher score equates to a better result. The following table illustrates how OpenSearch converts spaces to scores. - -| Space type | Distance function ($$d$$ ) | OpenSearch score | -| :--- | :--- | :--- | -| `l1` | $$ d(\mathbf{x}, \mathbf{y}) = \sum_{i=1}^n \lvert x_i - y_i \rvert $$ | $$ score = {1 \over {1 + d} } $$ | -| `l2` | $$ d(\mathbf{x}, \mathbf{y}) = \sum_{i=1}^n (x_i - y_i)^2 $$ | $$ score = {1 \over 1 + d } $$ | -| `linf` | $$ d(\mathbf{x}, \mathbf{y}) = max(\lvert x_i - y_i \rvert) $$ | $$ score = {1 \over 1 + d } $$ | -| `cosinesimil` | $$ d(\mathbf{x}, \mathbf{y}) = 1 - cos { \theta } = 1 - {\mathbf{x} \cdot \mathbf{y} \over \lVert \mathbf{x}\rVert \cdot \lVert \mathbf{y}\rVert}$$$$ = 1 - {\sum_{i=1}^n x_i y_i \over \sqrt{\sum_{i=1}^n x_i^2} \cdot \sqrt{\sum_{i=1}^n y_i^2}}$$, <br> where $$\lVert \mathbf{x}\rVert$$ and $$\lVert \mathbf{y}\rVert$$ represent the norms of vectors $$\mathbf{x}$$ and $$\mathbf{y}$$, respectively. | $$ score = 2 - d $$ | -| `innerproduct` (supported for Lucene in OpenSearch version 2.13 and later) | $$ d(\mathbf{x}, \mathbf{y}) = - {\mathbf{x} \cdot \mathbf{y}} = - \sum_{i=1}^n x_i y_i $$ | $$ \text{If} d \ge 0, score = {1 \over 1 + d }$$ <br> $$\text{If} d < 0, score = −d + 1$$ | -| `hammingbit` (supported for binary and long vectors) <br><br>`hamming` (supported for binary vectors in OpenSearch version 2.16 and later) | $$ d(\mathbf{x}, \mathbf{y}) = \text{countSetBits}(\mathbf{x} \oplus \mathbf{y})$$ | $$ score = {1 \over 1 + d } $$ | - -Cosine similarity returns a number between -1 and 1, and because OpenSearch relevance scores can't be below 0, the k-NN plugin adds 1 to get the final score. - -With cosine similarity, it is not valid to pass a zero vector (`[0, 0, ... ]`) as input. This is because the magnitude of such a vector is 0, which raises a `divide by 0` exception in the corresponding formula. Requests containing the zero vector will be rejected, and a corresponding exception will be thrown. -{: .note } - -The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors). 
-{: .note} diff --git a/_vector-search/vector-search-techniques/painless-functions.md b/_vector-search/vector-search-techniques/painless-functions.md new file mode 100644 index 00000000000..4f106e378a4 --- /dev/null +++ b/_vector-search/vector-search-techniques/painless-functions.md @@ -0,0 +1,80 @@ +--- +layout: default +title: Painless extensions +nav_order: 25 +parent: Exact k-NN search with a scoring script +grand_parent: Vector search techniques +has_children: false +has_math: true +redirect_from: + - /search-plugins/knn/painless-functions/ +--- + +# Painless scripting extensions + +With Painless scripting extensions, you can use k-nearest neighbors (k-NN) distance functions directly in your Painless scripts to perform operations on `knn_vector` fields. Painless has a strict list of allowed functions and classes per context to ensure that its scripts are secure. OpenSearch adds Painless scripting extensions to a few of the distance functions used in [k-NN scoring script]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-score-script/), so you can use them to customize your k-NN workload. + +## Get started with k-NN Painless scripting functions + +To use k-NN Painless scripting functions, first create an index with `knn_vector` fields, as described in [Getting started with the scoring script for vectors]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-score-script#getting-started-with-the-scoring-script-for-vectors). Once you have created the index and ingested some data, you can use Painless extensions: + +```json +GET my-knn-index-2/_search +{ + "size": 2, + "query": { + "script_score": { + "query": { + "bool": { + "filter": { + "term": { + "color": "BLUE" + } + } + } + }, + "script": { + "source": "1.0 + cosineSimilarity(params.query_value, doc[params.field])", + "params": { + "field": "my_vector", + "query_value": [9.9, 9.9] + } + } + } + } +} +``` +{% include copy-curl.html %} + +`field` needs to map to a `knn_vector` field, and `query_value` must be a floating-point array with the same dimension as `field`. + +## Function types + +The following table describes the Painless functions OpenSearch provides. + +Function name | Function signature | Description +:--- | :--- +`l2Squared` | `float l2Squared (float[] queryVector, doc['vector field'])` | This function calculates the square of the L2 distance (Euclidean distance) between a given query vector and document vectors. A shorter distance indicates a more relevant document, so this example inverts the return value of the `l2Squared` function. If the document vector matches the query vector, the result is `0`, so this example also adds `1` to the distance to avoid divide-by-zero errors. +`l1Norm` | `float l1Norm (float[] queryVector, doc['vector field'])` | This function calculates the L1 norm distance (Manhattan distance) between a given query vector and document vectors. +`cosineSimilarity` | `float cosineSimilarity (float[] queryVector, doc['vector field'])` | Cosine similarity is an inner product of the query vector and document vector normalized to both have a length of `1`. If the magnitude of the query vector doesn't change throughout the query, you can pass the magnitude of the query vector to improve performance instead of repeatedly calculating the magnitude for every filtered document:<br /> `float cosineSimilarity (float[] queryVector, doc['vector field'], float normQueryVector)` <br />In general, the range of cosine similarity is [-1, 1]. 
However, in the case of information retrieval, the cosine similarity of two documents ranges from `0` to `1` because the `tf-idf` statistic can't be negative. Therefore, OpenSearch adds `1.0` in order to always yield a positive cosine similarity score. +`hamming` | `float hamming (float[] queryVector, doc['vector field'])` | This function calculates the Hamming distance between a given query vector and document vectors. The Hamming distance is the number of positions at which the corresponding elements are different. A shorter distance indicates a more relevant document, so this example inverts the return value of the Hamming distance. + +The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-memory-optimized#binary-vectors). +{: .note} + +## Constraints + +1. If a document's `knn_vector` field has different dimensions than the query, the function throws an `IllegalArgumentException`. + +2. If a vector field doesn't have a value, the function throws an `IllegalStateException`. + + You can avoid this by first checking whether a document contains a value in its field: + + ``` + "source": "doc[params.field].size() == 0 ? 0 : 1 / (1 + l2Squared(params.query_value, doc[params.field]))", + ``` + + Because scores can only be positive, this script ranks documents with vector fields higher than those without vector fields. + +When using cosine similarity, it is not valid to pass a zero vector (`[0, 0, ...]`) as input. This is because the magnitude of such a vector is 0, which raises a `divide by 0` exception in the corresponding formula. Requests containing the zero vector will be rejected, and a corresponding exception will be thrown. +{: .note } diff --git a/assets/examples/docker-compose.yml b/assets/examples/docker-compose.yml index bab29f90cac..ec19a53a2f5 100644 --- a/assets/examples/docker-compose.yml +++ b/assets/examples/docker-compose.yml @@ -1,4 +1,3 @@ -version: '3' services: opensearch-node1: # This is also the hostname of the container within the Docker network (i.e. https://opensearch-node1/) image: opensearchproject/opensearch:latest @@ -65,4 +64,3 @@ volumes: networks: opensearch-net: - \ No newline at end of file diff --git a/assets/js/copy-button.js b/assets/js/copy-button.js index cb784f07d70..9209c10d5aa 100644 --- a/assets/js/copy-button.js +++ b/assets/js/copy-button.js @@ -29,12 +29,8 @@ function createButton(textToCopy, buttonText, buttonAriaLabel, curl) { copyButton.innerText = buttonText; copyButton.ariaLabel = buttonAriaLabel; - if (curl) { - copyButton.setAttribute('data-text', addCurl(textToCopy)); - } - else { - copyButton.setAttribute('data-text', textToCopy); - } + copyButton.setAttribute('data-action', curl ? 'copy_as_curl' : 'copy_code'); + copyButton.setAttribute('data-text', curl ? 
addCurl(textToCopy) : textToCopy); return copyButton; } diff --git a/assets/js/listener.js b/assets/js/listener.js index 029e042419c..16bec008b2c 100644 --- a/assets/js/listener.js +++ b/assets/js/listener.js @@ -6,40 +6,59 @@ const commentTextArea = document.getElementById('comment'); const thankYouText = document.getElementById('thank-you'); const nav = document.getElementById('site-nav'); const versionPanel = document.getElementById('version-panel'); -document.addEventListener('DOMContentLoaded', updateTextArea); +const actionHandlers = { + submit_issue_click: () => gtag('event', 'submit_issue_click'), + edit_page_click: () => gtag('event', 'edit_page_click'), + forum_link_click: () => gtag('event', 'forum_link_click'), + enable_send_button: () => sendButton.disabled = false, + send_feedback: () => sendFeedback(), + switch_tab: (el) => switchTab({ target: el }, el.getAttribute('data-tab')), + copy_code: (el) => copyCode(el), + copy_as_curl: (el) => copyAsCurl(el) +}; + + +// Single click event listener for the entire document document.addEventListener('click', function(event) { const { target } = event; - if (target.matches('.feedback-issue')) { - gtag('event', 'submit_issue_click'); - } - else if (target.matches('.feedback-edit')) { - gtag('event', 'edit_page_click'); - } - else if (target.matches('.feedback-forum')) { - gtag('event', 'forum_link_click'); - } - else if (target.matches('.feedback-button')) { - sendButton.disabled = false; - } - else if (target.matches('.send-button')) { - sendFeedback(); - } - else if (target.matches('.copy-button')) { + + // Handle old-style buttons first + if (target.matches('.copy-button') && target.hasAttribute('data-text')) { window.navigator.clipboard.writeText(target.getAttribute('data-text')); + return; // Exit early to avoid multiple handlers } -}); -nav.addEventListener('scroll',(e)=>{ - if(nav.scrollTop > 0){ - versionPanel.classList.add("nav-shadow"); - }else{ - versionPanel.classList.remove("nav-shadow"); + // Handle new-style buttons and other clicks + const action = target.dataset.action; + if (action && actionHandlers[action]) { + actionHandlers[action](target); } - }); +}); + +// Event listeners +document.addEventListener('DOMContentLoaded', updateTextArea); commentTextArea.addEventListener('input', updateTextArea); +function debounce(fn, delay) { + let timeoutId; + return function(...args) { + clearTimeout(timeoutId); + timeoutId = setTimeout(() => fn.apply(this, args), delay); + }; +} + +function handleNavScroll() { + if (nav.scrollTop > 0) { + versionPanel.classList.add('nav-shadow'); + } else { + versionPanel.classList.remove('nav-shadow'); + } +} + +nav.addEventListener('scroll', debounce(handleNavScroll, 100)); + function updateTextArea() { const text = commentTextArea.value.trim(); @@ -52,6 +71,7 @@ function updateTextArea() { numCharsLabel.innerText = counter + " characters left"; } + function sendFeedback() { let helpful = 'none'; if (yesButton.checked) { @@ -95,3 +115,38 @@ function sendFeedback() { // disable the send button sendButton.disabled = true; } + +function switchTab(event, tabId) { + const container = event.target.closest('.code-tabs'); + + container.querySelectorAll('.tab.active, .tab-button.active').forEach(el => { + el.classList.remove('active'); + }); + + // Add active class to selected tab and button + const selectedTab = container.querySelector(`#${tabId}`); + selectedTab?.classList.add('active'); + event.target.classList.add('active'); +} + +function copyCode(button) { + const codeBlock = 
button.closest('.code-container').querySelector('pre'); + const code = codeBlock.textContent.trim(); + window.navigator.clipboard.writeText(code); +} + +function copyAsCurl(button) { + const codeBlock = button.closest('.code-container').querySelector('pre'); + const code = codeBlock.textContent.trim(); + + const lines = code.split('\n'); + const [method, path] = lines[0].trim().split(' '); + const body = lines.slice(1).join('\n'); + + const formattedPath = path.startsWith('/') ? path : '/' + path; + const curlCommand = body + ? `curl -X ${method} "localhost:9200${formattedPath}" -H "Content-Type: application/json" -d '\n${body}\n'` + : `curl -X ${method} "localhost:9200${formattedPath}"`; + + window.navigator.clipboard.writeText(curlCommand); +} \ No newline at end of file diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index 04dd007db96..97593417041 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -1,5 +1,3 @@ -version: "3" - services: doc_builder: image: ruby:3.2.4 diff --git a/images/Query-Insights/BothColDisplay.png b/images/Query-Insights/BothColDisplay.png new file mode 100644 index 00000000000..5cb87d41ad2 Binary files /dev/null and b/images/Query-Insights/BothColDisplay.png differ diff --git a/images/Query-Insights/Configuration.png b/images/Query-Insights/Configuration.png new file mode 100644 index 00000000000..5091cbe7b53 Binary files /dev/null and b/images/Query-Insights/Configuration.png differ diff --git a/images/Query-Insights/GroupQueryDetails.png b/images/Query-Insights/GroupQueryDetails.png new file mode 100644 index 00000000000..02fbbb6512a Binary files /dev/null and b/images/Query-Insights/GroupQueryDetails.png differ diff --git a/images/Query-Insights/IndividualQueryDetails.png b/images/Query-Insights/IndividualQueryDetails.png new file mode 100644 index 00000000000..22f0fde51cb Binary files /dev/null and b/images/Query-Insights/IndividualQueryDetails.png differ diff --git a/images/Query-Insights/OnlyGroupColDisplay.png b/images/Query-Insights/OnlyGroupColDisplay.png new file mode 100644 index 00000000000..1fd020fb5a5 Binary files /dev/null and b/images/Query-Insights/OnlyGroupColDisplay.png differ diff --git a/images/Query-Insights/OnlyQueryColDisplay.png b/images/Query-Insights/OnlyQueryColDisplay.png new file mode 100644 index 00000000000..12804f2fd13 Binary files /dev/null and b/images/Query-Insights/OnlyQueryColDisplay.png differ diff --git a/images/Query-Insights/Querieslist.png b/images/Query-Insights/Querieslist.png new file mode 100644 index 00000000000..edc1c507bb1 Binary files /dev/null and b/images/Query-Insights/Querieslist.png differ diff --git a/images/Query-Insights/QueryInsights.png b/images/Query-Insights/QueryInsights.png new file mode 100644 index 00000000000..06c26cb256a Binary files /dev/null and b/images/Query-Insights/QueryInsights.png differ diff --git a/images/anomaly-detection/add-feature-with-relative-rules.png b/images/anomaly-detection/add-feature-with-relative-rules.png new file mode 100644 index 00000000000..dcdcbaa2949 Binary files /dev/null and b/images/anomaly-detection/add-feature-with-relative-rules.png differ diff --git a/images/anomaly-detection/add-suppression-rules-absolute.png b/images/anomaly-detection/add-suppression-rules-absolute.png new file mode 100644 index 00000000000..da2e79fa853 Binary files /dev/null and b/images/anomaly-detection/add-suppression-rules-absolute.png differ diff --git a/images/benchmark/osb-actor-system.png b/images/benchmark/osb-actor-system.png new file mode 100644 
index 00000000000..28cf6ffefda Binary files /dev/null and b/images/benchmark/osb-actor-system.png differ diff --git a/images/dashboards-assistant/alert-insight-insight.png b/images/dashboards-assistant/alert-insight-insight.png new file mode 100644 index 00000000000..3d65276d424 Binary files /dev/null and b/images/dashboards-assistant/alert-insight-insight.png differ diff --git a/images/dashboards-assistant/alert-insight-start.png b/images/dashboards-assistant/alert-insight-start.png new file mode 100644 index 00000000000..b55b6296bce Binary files /dev/null and b/images/dashboards-assistant/alert-insight-start.png differ diff --git a/images/dashboards-assistant/alert-insight-summary.png b/images/dashboards-assistant/alert-insight-summary.png new file mode 100644 index 00000000000..1e98170cda8 Binary files /dev/null and b/images/dashboards-assistant/alert-insight-summary.png differ diff --git a/images/dashboards-assistant/data-summary.png b/images/dashboards-assistant/data-summary.png new file mode 100644 index 00000000000..dc2e4e22f00 Binary files /dev/null and b/images/dashboards-assistant/data-summary.png differ diff --git a/images/dashboards-assistant/info-icon.png b/images/dashboards-assistant/info-icon.png new file mode 100644 index 00000000000..29e6b7b97b9 Binary files /dev/null and b/images/dashboards-assistant/info-icon.png differ diff --git a/images/dashboards-assistant/sparkle-icon.png b/images/dashboards-assistant/sparkle-icon.png new file mode 100644 index 00000000000..04b6d2b876a Binary files /dev/null and b/images/dashboards-assistant/sparkle-icon.png differ diff --git a/images/dashboards-assistant/suggestAD-UI.png b/images/dashboards-assistant/suggestAD-UI.png new file mode 100644 index 00000000000..dd7e32d6e23 Binary files /dev/null and b/images/dashboards-assistant/suggestAD-UI.png differ diff --git a/images/dashboards-assistant/suggestAD-button.png b/images/dashboards-assistant/suggestAD-button.png new file mode 100644 index 00000000000..a87fe862fea Binary files /dev/null and b/images/dashboards-assistant/suggestAD-button.png differ diff --git a/images/dashboards-assistant/t2viz-ask-question.png b/images/dashboards-assistant/t2viz-ask-question.png new file mode 100644 index 00000000000..e5b7e86f64c Binary files /dev/null and b/images/dashboards-assistant/t2viz-ask-question.png differ diff --git a/images/dashboards-assistant/t2viz-edit-visual-response.png b/images/dashboards-assistant/t2viz-edit-visual-response.png new file mode 100644 index 00000000000..1fd35425a74 Binary files /dev/null and b/images/dashboards-assistant/t2viz-edit-visual-response.png differ diff --git a/images/dashboards-assistant/t2viz-edit-visual.png b/images/dashboards-assistant/t2viz-edit-visual.png new file mode 100644 index 00000000000..0c57dd58aaf Binary files /dev/null and b/images/dashboards-assistant/t2viz-edit-visual.png differ diff --git a/images/dashboards-assistant/t2viz-select-data-source.png b/images/dashboards-assistant/t2viz-select-data-source.png new file mode 100644 index 00000000000..172e136a5b3 Binary files /dev/null and b/images/dashboards-assistant/t2viz-select-data-source.png differ diff --git a/images/dashboards-assistant/t2viz-start.png b/images/dashboards-assistant/t2viz-start.png new file mode 100644 index 00000000000..f6d46a21e5b Binary files /dev/null and b/images/dashboards-assistant/t2viz-start.png differ diff --git a/images/dashboards-flow-framework/configure-prompt.png b/images/dashboards-flow-framework/configure-prompt.png new file mode 100644 index 
00000000000..00af057d575 Binary files /dev/null and b/images/dashboards-flow-framework/configure-prompt.png differ diff --git a/images/dashboards-flow-framework/details-page.png b/images/dashboards-flow-framework/details-page.png new file mode 100644 index 00000000000..b8c1b02c61b Binary files /dev/null and b/images/dashboards-flow-framework/details-page.png differ diff --git a/images/dashboards-flow-framework/import-data-modal.png b/images/dashboards-flow-framework/import-data-modal.png new file mode 100644 index 00000000000..3553c4010f0 Binary files /dev/null and b/images/dashboards-flow-framework/import-data-modal.png differ diff --git a/images/dashboards-flow-framework/ingest-data.png b/images/dashboards-flow-framework/ingest-data.png new file mode 100644 index 00000000000..8cdd36865d8 Binary files /dev/null and b/images/dashboards-flow-framework/ingest-data.png differ diff --git a/images/dashboards-flow-framework/ingest-test-flow.png b/images/dashboards-flow-framework/ingest-test-flow.png new file mode 100644 index 00000000000..4c2cfc3a986 Binary files /dev/null and b/images/dashboards-flow-framework/ingest-test-flow.png differ diff --git a/images/dashboards-flow-framework/new-workflow-page.png b/images/dashboards-flow-framework/new-workflow-page.png new file mode 100644 index 00000000000..e51c42ec7d8 Binary files /dev/null and b/images/dashboards-flow-framework/new-workflow-page.png differ diff --git a/images/dashboards-flow-framework/quick-configure-modal.png b/images/dashboards-flow-framework/quick-configure-modal.png new file mode 100644 index 00000000000..abb2572886c Binary files /dev/null and b/images/dashboards-flow-framework/quick-configure-modal.png differ diff --git a/images/dashboards-flow-framework/rewrite-query.png b/images/dashboards-flow-framework/rewrite-query.png new file mode 100644 index 00000000000..e36a8956080 Binary files /dev/null and b/images/dashboards-flow-framework/rewrite-query.png differ diff --git a/images/dashboards-flow-framework/search-test-flow.png b/images/dashboards-flow-framework/search-test-flow.png new file mode 100644 index 00000000000..7eaf1d109cd Binary files /dev/null and b/images/dashboards-flow-framework/search-test-flow.png differ diff --git a/images/dashboards-flow-framework/transform-data.png b/images/dashboards-flow-framework/transform-data.png new file mode 100644 index 00000000000..1197578b634 Binary files /dev/null and b/images/dashboards-flow-framework/transform-data.png differ diff --git a/images/dashboards-flow-framework/transform-query.png b/images/dashboards-flow-framework/transform-query.png new file mode 100644 index 00000000000..b759abc7295 Binary files /dev/null and b/images/dashboards-flow-framework/transform-query.png differ diff --git a/images/dashboards-flow-framework/transform-response.png b/images/dashboards-flow-framework/transform-response.png new file mode 100644 index 00000000000..82a37074049 Binary files /dev/null and b/images/dashboards-flow-framework/transform-response.png differ diff --git a/images/dashboards/dql-interface.png b/images/dashboards/dql-interface.png index 78ea1d66676..88a207c09f4 100644 Binary files a/images/dashboards/dql-interface.png and b/images/dashboards/dql-interface.png differ diff --git a/images/forecast/bound.png b/images/forecast/bound.png new file mode 100644 index 00000000000..5adc4408a58 Binary files /dev/null and b/images/forecast/bound.png differ diff --git a/images/forecast/forecast_from_1.png b/images/forecast/forecast_from_1.png new file mode 100644 index 
00000000000..d0cf705ec91 Binary files /dev/null and b/images/forecast/forecast_from_1.png differ diff --git a/images/forecast/forecast_from_2.png b/images/forecast/forecast_from_2.png new file mode 100644 index 00000000000..0fb336e2f86 Binary files /dev/null and b/images/forecast/forecast_from_2.png differ diff --git a/images/forecast/no_rcf_calibration.png b/images/forecast/no_rcf_calibration.png new file mode 100644 index 00000000000..da2d604ae27 Binary files /dev/null and b/images/forecast/no_rcf_calibration.png differ diff --git a/images/forecast/no_result.png b/images/forecast/no_result.png new file mode 100644 index 00000000000..a3d2604cb9b Binary files /dev/null and b/images/forecast/no_result.png differ diff --git a/images/forecast/overlay_3.png b/images/forecast/overlay_3.png new file mode 100644 index 00000000000..929b93e0f50 Binary files /dev/null and b/images/forecast/overlay_3.png differ diff --git a/images/forecast/state.png b/images/forecast/state.png new file mode 100644 index 00000000000..2722abbf924 Binary files /dev/null and b/images/forecast/state.png differ diff --git a/images/forecast/toggle_overlay_after.png b/images/forecast/toggle_overlay_after.png new file mode 100644 index 00000000000..129d0eb9e6d Binary files /dev/null and b/images/forecast/toggle_overlay_after.png differ diff --git a/images/forecast/toggle_overlay_before.png b/images/forecast/toggle_overlay_before.png new file mode 100644 index 00000000000..5f1cad0dbd8 Binary files /dev/null and b/images/forecast/toggle_overlay_before.png differ diff --git a/images/forecast/trend.png b/images/forecast/trend.png new file mode 100644 index 00000000000..7c03e52302c Binary files /dev/null and b/images/forecast/trend.png differ diff --git a/images/forecast/validation_loading.png b/images/forecast/validation_loading.png new file mode 100644 index 00000000000..2d98bc85b77 Binary files /dev/null and b/images/forecast/validation_loading.png differ diff --git a/images/migrations/migration-architecture-overview.svg b/images/migrations/migration-architecture-overview.svg new file mode 100644 index 00000000000..cf758653aa5 --- /dev/null +++ b/images/migrations/migration-architecture-overview.svg @@ -0,0 +1,2 @@ +<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"> +<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" width="5002px" height="4897px" viewBox="-0.5 -0.5 5002 4897"><defs><linearGradient x1="0%" y1="100%" x2="0%" y2="0%" id="mx-gradient-f34482-1-bc1356-1-s-0"><stop offset="0%" style="stop-color:#BC1356"/><stop offset="100%" style="stop-color:#F34482"/></linearGradient><linearGradient x1="0%" y1="100%" x2="0%" y2="0%" id="mx-gradient-f78e04-1-d05c17-1-s-0"><stop offset="0%" style="stop-color:#D05C17"/><stop offset="100%" style="stop-color:#F78E04"/></linearGradient><linearGradient x1="0%" y1="100%" x2="0%" y2="0%" id="mx-gradient-60a337-1-277116-1-s-0"><stop offset="0%" style="stop-color:#277116"/><stop offset="100%" style="stop-color:#60A337"/></linearGradient></defs><g><rect x="118" y="1" width="1104" height="934" fill="none" stroke="#000000" stroke-width="2" pointer-events="none"/><rect x="129" y="252.5" width="1069" height="462" fill="none" stroke="#d79b00" stroke-opacity="0.4" stroke-width="2" pointer-events="none"/><rect x="129" y="252.5" width="1069" height="462" fill="none" stroke="#d79b00" stroke-opacity="0.4" stroke-width="2" pointer-events="none"/><g transform="translate(662.5,469.5)"><switch><foreignObject 
style="overflow:visible;" pointer-events="none" width="1" height="27" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 1px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><b><br /><br /></b></div></div></foreignObject><text x="0" y="20" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial"><b><br><br></b></text></switch></g><path d="M 220 445 L 220 475 L 402 475 L 402 501.53" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><path d="M 398 495.76 L 402 503.76 L 406 495.76" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><rect x="150" y="304.5" width="140" height="140" fill="none" stroke="#82b366" stroke-opacity="0.4" stroke-width="2" stroke-dasharray="6 6" pointer-events="none"/><rect x="150" y="304.5" width="140" height="140" fill="none" stroke="#82b366" stroke-opacity="0.4" stroke-width="2" stroke-dasharray="6 6" pointer-events="none"/><g transform="translate(154.5,309.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="129" height="128" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 129px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><b>Source Cluster Snapshot<br /><br /><br /><br /><br /><br /><br /><br /></b></div></div></foreignObject><text x="65" y="70" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><rect x="496" y="73" width="341" height="140" fill="none" stroke="#3333ff" stroke-width="2" stroke-dasharray="6 6" pointer-events="none"/><g transform="translate(666.5,86.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="113" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 1px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><br /><br /><br /><br /><br /><br /><br /><br /></div></div></foreignObject><text x="0" y="63" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial"><br><br><br><br><br><br><br><br></text></switch></g><image x="641.5" y="106.5" width="64" height="64" 
xlink:href="data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHhtbG5zOnhsaW5rPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5L3hsaW5rIiB3aWR0aD0iNjRweCIgaGVpZ2h0PSI2NHB4IiB2aWV3Qm94PSIwIDAgNjQgNjQiIHZlcnNpb249IjEuMSI+JiN4YTsgICAgPHRpdGxlPkljb24tQXJjaGl0ZWN0dXJlLzQ4L0FyY2hfQW1hem9uLU9wZW5TZWFyY2gtU2VydmljZV80ODwvdGl0bGU+JiN4YTsgICAgPGcgaWQ9Ikljb24tQXJjaGl0ZWN0dXJlLzQ4L0FyY2hfQW1hem9uLU9wZW5TZWFyY2gtU2VydmljZV80OCIgc3Ryb2tlPSJub25lIiBzdHJva2Utd2lkdGg9IjEiIGZpbGw9Im5vbmUiIGZpbGwtcnVsZT0iZXZlbm9kZCI+JiN4YTsgICAgICAgIDxnIGlkPSJJY29uLUFyY2hpdGVjdHVyZS1CRy80OC9BbmFseXRpY3MiIGZpbGw9IiM4QzRGRkYiPiYjeGE7ICAgICAgICAgICAgPHJlY3QgaWQ9IlJlY3RhbmdsZSIgeD0iMCIgeT0iMCIgd2lkdGg9IjY0IiBoZWlnaHQ9IjY0Ii8+JiN4YTsgICAgICAgIDwvZz4mI3hhOyAgICAgICAgPHBhdGggZD0iTTM1LDQxIEMzMi4yNDMsNDEgMzAsMzguNzU3IDMwLDM2IEMzMCwzMy4yNDMgMzIuMjQzLDMxIDM1LDMxIEMzNy43NTcsMzEgNDAsMzMuMjQzIDQwLDM2IEM0MCwzOC43NTcgMzcuNzU3LDQxIDM1LDQxIEwzNSw0MSBaIE0zNSwyOSBDMzEuMTQxLDI5IDI4LDMyLjE0IDI4LDM2IEMyOCwzOS44NiAzMS4xNDEsNDMgMzUsNDMgQzM4Ljg1OSw0MyA0MiwzOS44NiA0MiwzNiBDNDIsMzIuMTQgMzguODU5LDI5IDM1LDI5IEwzNSwyOSBaIE00OS43MSw0OC45MTkgQzQ5LjI5MSw0OS4zODQgNDguNTY5LDQ5LjQyIDQ4LjEwNiw0OS4wMDQgTDQyLjU0Myw0My45ODUgQzQzLjA5Myw0My40NjYgNDMuNTkxLDQyLjg5NCA0NC4wMjQsNDIuMjcxIEw0OS42MjUsNDcuMzE2IEM1MC4wODksNDcuNzM2IDUwLjEyNyw0OC40NTcgNDkuNzEsNDguOTE5IEw0OS43MSw0OC45MTkgWiBNMjYsMzYgQzI2LDMxLjAzOCAzMC4wMzcsMjcgMzUsMjcgQzM5Ljk2MywyNyA0NCwzMS4wMzggNDQsMzYgQzQ0LDQwLjk2MiAzOS45NjMsNDUgMzUsNDUgQzMwLjAzNyw0NSAyNiw0MC45NjIgMjYsMzYgTDI2LDM2IFogTTQ1LjAzMyw0MC40ODggQzQ1LjY0OSwzOS4xMTYgNDYsMzcuNiA0NiwzNiBDNDYsMjkuOTM1IDQxLjA2NSwyNSAzNSwyNSBDMjguOTM1LDI1IDI0LDI5LjkzNSAyNCwzNiBDMjQsNDIuMDY1IDI4LjkzNSw0NyAzNSw0NyBDMzcuMTkxLDQ3IDM5LjIzLDQ2LjM0OSA0MC45NDcsNDUuMjQgTDQ2Ljc2OSw1MC40OTEgQzQ3LjM2Niw1MS4wMjcgNDguMTE2LDUxLjI5MiA0OC44NjMsNTEuMjkyIEM0OS43MjIsNTEuMjkyIDUwLjU3Nyw1MC45NDMgNTEuMTk2LDUwLjI1OSBDNTIuMzQ5LDQ4Ljk3NSA1Mi4yNDYsNDYuOTkgNTAuOTY1LDQ1LjgzMiBMNDUuMDMzLDQwLjQ4OCBaIE01MywxNyBMNTMsNDMgTDUxLDQzIEw1MSwxOCBMNDYsMTggTDQ2LDI2IEw0NCwyNiBMNDQsMTcgQzQ0LDE2LjQ0OCA0NC40NDcsMTYgNDUsMTYgTDUyLDE2IEM1Mi41NTMsMTYgNTMsMTYuNDQ4IDUzLDE3IEw1MywxNyBaIE00MCw1MCBMNDIsNTAgTDQyLDUzIEM0Miw1My41NTIgNDEuNTUzLDU0IDQxLDU0IEwzNCw1NCBDMzMuNDQ3LDU0IDMzLDUzLjU1MiAzMyw1MyBMMzMsNDkgTDM1LDQ5IEwzNSw1MiBMNDAsNTIgTDQwLDUwIFogTTM1LDIyIEwzMywyMiBMMzMsMTIgQzMzLDExLjQ0OCAzMy40NDcsMTEgMzQsMTEgTDQxLDExIEM0MS41NTMsMTEgNDIsMTEuNDQ4IDQyLDEyIEw0MiwyMyBMNDAsMjMgTDQwLDEzIEwzNSwxMyBMMzUsMjIgWiBNMjksNDggTDMxLDQ4IEwzMSw1MyBDMzEsNTMuNTUyIDMwLjU1Myw1NCAzMCw1NCBMMjMsNTQgQzIyLjQ0Nyw1NCAyMiw1My41NTIgMjIsNTMgTDIyLDQzIEwyNCw0MyBMMjQsNTIgTDI5LDUyIEwyOSw0OCBaIE0yNCwyOCBMMjIsMjggTDIyLDE5IEMyMiwxOC40NDggMjIuNDQ3LDE4IDIzLDE4IEwzMCwxOCBDMzAuNTUzLDE4IDMxLDE4LjQ0OCAzMSwxOSBMMzEsMjMgTDI5LDIzIEwyOSwyMCBMMjQsMjAgTDI0LDI4IFogTTEzLDUyIEwxOCw1MiBMMTgsMjYgTDEzLDI2IEwxMyw1MiBaIE0xOSwyNCBMMTIsMjQgQzExLjQ0NywyNCAxMSwyNC40NDggMTEsMjUgTDExLDUzIEMxMSw1My41NTIgMTEuNDQ3LDU0IDEyLDU0IEwxOSw1NCBDMTkuNTUzLDU0IDIwLDUzLjU1MiAyMCw1MyBMMjAsMjUgQzIwLDI0LjQ0OCAxOS41NTMsMjQgMTksMjQgTDE5LDI0IFoiIGlkPSJBbWF6b24tT3BlblNlYXJjaC1TZXJ2aWNlX0ljb25fNDhfU3F1aWQiIGZpbGw9IiNGRkZGRkYiLz4mI3hhOyAgICA8L2c+JiN4YTs8L3N2Zz4" preserveAspectRatio="none" pointer-events="none"/><g transform="translate(564.5,178.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="218" height="41" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: 
center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div><font>Amazon OpenSearch or</font></div><div><font>Elasticsearch/OpenSearch self-managed</font></div><div><font><br /></font></div></div></div></foreignObject><text x="109" y="27" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><g transform="translate(613.5,81.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="114" height="17" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 14px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font color="#000000" style="font-size: 16px;"><b>Source Cluster</b><br /></font></div></div></foreignObject><text x="57" y="16" fill="#ffffff" text-anchor="middle" font-size="14px" font-family="Arial">[Not supported by viewer]</text></switch></g><g transform="translate(532.5,59.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="17" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 14px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div style="font-size: 16px;"></div><b style="font-size: 16px;"><font color="#1a1a1a" style="font-size: 16px;"><br /></font></b></div></div></foreignObject><text x="0" y="16" fill="#ffffff" text-anchor="middle" font-size="14px" font-family="Arial"><div style="font-size: 16px;"></div><b style="font-size: 16px;"><font color="#1a1a1a" style="font-size: 16px;"><br></font></b></text></switch></g><g transform="translate(4736.5,4878.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="26" height="15" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 14px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;">Text</div></div></foreignObject><text x="13" y="15" fill="#ffffff" text-anchor="middle" font-size="14px" font-family="Arial">Text</text></switch></g><g transform="translate(4962.5,4879.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="26" height="15" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 14px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;">Text</div></div></foreignObject><text x="13" y="15" fill="#ffffff" text-anchor="middle" font-size="14px" font-family="Arial">Text</text></switch></g><g 
transform="translate(712.5,140.5)rotate(90,0,7.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="15" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; font-weight: bold; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font style="font-size: 14px;"><br /></font></div></div></foreignObject><text x="0" y="14" fill="#ffffff" text-anchor="middle" font-size="12px" font-family="Arial" font-weight="bold"><font style="font-size: 14px;"><br></font></text></switch></g><g transform="translate(847.5,195.5)rotate(90,0,7.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="15" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; font-weight: bold; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font style="font-size: 14px;"><br /></font></div></div></foreignObject><text x="0" y="14" fill="#ffffff" text-anchor="middle" font-size="12px" font-family="Arial" font-weight="bold"><font style="font-size: 14px;"><br></font></text></switch></g><g transform="translate(4736.5,4878.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="26" height="15" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 14px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;">Text</div></div></foreignObject><text x="13" y="15" fill="#ffffff" text-anchor="middle" font-size="14px" font-family="Arial">Text</text></switch></g><g transform="translate(846.5,162.5)rotate(90,0,7.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="15" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; font-weight: bold; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font style="font-size: 14px;"><br /></font></div></div></foreignObject><text x="0" y="14" fill="#ffffff" text-anchor="middle" font-size="12px" font-family="Arial" font-weight="bold"><font style="font-size: 14px;"><br></font></text></switch></g><g transform="translate(4962.5,4879.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="26" height="15" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 14px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: left;"><div 
xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;">Text</div></div></foreignObject><text x="13" y="15" fill="#ffffff" text-anchor="middle" font-size="14px" font-family="Arial">Text</text></switch></g><g transform="translate(144.5,258.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="180" height="17" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 14px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font color="#000000" style="font-size: 16px;"><b>Migration Infrastructure</b><br /></font></div></div></foreignObject><text x="90" y="16" fill="#ffffff" text-anchor="middle" font-size="14px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 108 116 L 403 116 L 403 336.53" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><path d="M 399 330.76 L 403 338.76 L 407 330.76" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><path d="M 108 88 L 492.53 88" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><path d="M 486.76 92 L 494.76 88 L 486.76 84" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><path d="M 44 104 L 44 102 L 45 102 L 45 101 L 8 101 L 8 922 L 670 922 L 670 904.47" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><path d="M 674 910.24 L 670 902.24 L 666 910.24" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><image x="43.5" y="71" width="64" height="64" 
xlink:href="data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHhtbG5zOnhsaW5rPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5L3hsaW5rIiB3aWR0aD0iNDBweCIgaGVpZ2h0PSI0MHB4IiB2aWV3Qm94PSIwIDAgNDAgNDAiIHZlcnNpb249IjEuMSI+JiN4YTsgICAgPHRpdGxlPkljb24tQXJjaGl0ZWN0dXJlLzMyL0FyY2hfRWxhc3RpYy1Mb2FkLUJhbGFuY2luZ18zMjwvdGl0bGU+JiN4YTsgICAgPGcgaWQ9Ikljb24tQXJjaGl0ZWN0dXJlLzMyL0FyY2hfRWxhc3RpYy1Mb2FkLUJhbGFuY2luZ18zMiIgc3Ryb2tlPSJub25lIiBzdHJva2Utd2lkdGg9IjEiIGZpbGw9Im5vbmUiIGZpbGwtcnVsZT0iZXZlbm9kZCI+JiN4YTsgICAgICAgIDxnIGlkPSJJY29uLUFyY2hpdGVjdHVyZS1CRy8zMi9OZXR3b3JraW5nLUNvbnRlbnQtRGVsaXZlcnkiIGZpbGw9IiM4QzRGRkYiPiYjeGE7ICAgICAgICAgICAgPHJlY3QgaWQ9IlJlY3RhbmdsZSIgeD0iMCIgeT0iMCIgd2lkdGg9IjQwIiBoZWlnaHQ9IjQwIi8+JiN4YTsgICAgICAgIDwvZz4mI3hhOyAgICAgICAgPHBhdGggZD0iTTE1LDI3IEMxMS4xNDEsMjcgOCwyMy44NiA4LDIwIEM4LDE2LjE0IDExLjE0MSwxMyAxNSwxMyBDMTguODU5LDEzIDIyLDE2LjE0IDIyLDIwIEMyMiwyMy44NiAxOC44NTksMjcgMTUsMjcgTTI5LDI4IEMzMC4xMDMsMjggMzEsMjguODk3IDMxLDMwIEMzMSwzMS4xMDMgMzAuMTAzLDMyIDI5LDMyIEMyNy44OTcsMzIgMjcsMzEuMTAzIDI3LDMwIEMyNywyOC44OTcgMjcuODk3LDI4IDI5LDI4IE0yOSw4IEMzMC4xMDMsOCAzMSw4Ljg5NyAzMSwxMCBDMzEsMTEuMTAzIDMwLjEwMywxMiAyOSwxMiBDMjcuODk3LDEyIDI3LDExLjEwMyAyNywxMCBDMjcsOC44OTcgMjcuODk3LDggMjksOCBNMzAsMTguNSBDMzEuMTAzLDE4LjUgMzIsMTkuMzk3IDMyLDIwLjUgQzMyLDIxLjYwMyAzMS4xMDMsMjIuNSAzMCwyMi41IEMyOC44OTcsMjIuNSAyOCwyMS42MDMgMjgsMjAuNSBDMjgsMTkuMzk3IDI4Ljg5NywxOC41IDMwLDE4LjUgTTIyLjkzMSwyMSBMMjcuMDUxLDIxIEMyNy4yOSwyMi40MTYgMjguNTE3LDIzLjUgMzAsMjMuNSBDMzEuNjU0LDIzLjUgMzMsMjIuMTU0IDMzLDIwLjUgQzMzLDE4Ljg0NiAzMS42NTQsMTcuNSAzMCwxNy41IEMyOC41MTcsMTcuNSAyNy4yOSwxOC41ODQgMjcuMDUxLDIwIEwyMywyMCBDMjMsMTguNTExIDIyLjU4NCwxNy4xMiAyMS44NzIsMTUuOTI1IEwyNi42OTksMTEuOTAyIEMyNy4yNSwxMi41NjcgMjguMDcxLDEzIDI5LDEzIEMzMC42NTQsMTMgMzIsMTEuNjU0IDMyLDEwIEMzMiw4LjM0NiAzMC42NTQsNyAyOSw3IEMyNy4zNDYsNyAyNiw4LjM0NiAyNiwxMCBDMjYsMTAuMzYxIDI2LjA3NCwxMC43MDIgMjYuMTkxLDExLjAyMiBMMjEuMzA2LDE1LjA5NCBDMTkuODQsMTMuMjE1IDE3LjU2MiwxMiAxNSwxMiBDMTAuNTg5LDEyIDcsMTUuNTg5IDcsMjAgQzcsMjQuNDExIDEwLjU4OSwyOCAxNSwyOCBDMTcuNTYyLDI4IDE5Ljg0LDI2Ljc4NSAyMS4zMDYsMjQuOTA2IEwyNi4xOTEsMjguOTc4IEMyNi4wNzQsMjkuMjk4IDI2LDI5LjYzOSAyNiwzMCBDMjYsMzEuNjU0IDI3LjM0NiwzMyAyOSwzMyBDMzAuNjU0LDMzIDMyLDMxLjY1NCAzMiwzMCBDMzIsMjguMzQ2IDMwLjY1NCwyNyAyOSwyNyBDMjguMDcxLDI3IDI3LjI1LDI3LjQzMyAyNi42OTksMjguMDk4IEwyMS44NzIsMjQuMDc1IEMyMi40MjIsMjMuMTUyIDIyLjc5MSwyMi4xMTMgMjIuOTMxLDIxIiBpZD0iRWxhc3RpYy1Mb2FkLUJhbGFuY2luZ19JY29uXzMyX1NxdWlkIiBmaWxsPSIjRkZGRkZGIi8+JiN4YTsgICAgPC9nPiYjeGE7PC9zdmc+" preserveAspectRatio="none" pointer-events="none"/><g transform="translate(42.5,143.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="66" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><i>Client Traffic</i></div></div></foreignObject><text x="33" y="12" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial"><i>Client Traffic</i></text></switch></g><ellipse cx="152.5" cy="85.5" rx="15" ry="15" fill="#000000" stroke="#000000" pointer-events="none"/><g transform="translate(147.5,75.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="10" height="20" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div 
xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 11px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><div style="font-size: 18px;"><font style="font-size: 18px;"><b><font color="#ffffff" style="font-size: 18px;">1</font></b></font></div></div></div></foreignObject><text x="5" y="16" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><ellipse cx="152.5" cy="120" rx="15" ry="15" fill="#000000" stroke="#000000" pointer-events="none"/><g transform="translate(147.5,109.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="10" height="20" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 11px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><div style="font-size: 18px;"><b><font color="#ffffff" style="font-size: 18px;">2</font></b></div></div></div></foreignObject><text x="5" y="16" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><image x="117.5" y="0.5" width="48" height="48" xlink:href="data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHhtbG5zOnhsaW5rPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5L3hsaW5rIiB3aWR0aD0iNDBweCIgaGVpZ2h0PSI0MHB4IiB2aWV3Qm94PSIwIDAgNDAgNDAiIHZlcnNpb249IjEuMSI+JiN4YTsgICAgPHRpdGxlPkljb24tQXJjaGl0ZWN0dXJlLUdyb3VwLzMyL0FXUy1DbG91ZF8zMjwvdGl0bGU+JiN4YTsgICAgPGcgaWQ9Ikljb24tQXJjaGl0ZWN0dXJlLUdyb3VwLzMyL0FXUy1DbG91ZF8zMiIgc3Ryb2tlPSJub25lIiBzdHJva2Utd2lkdGg9IjEiIGZpbGw9Im5vbmUiIGZpbGwtcnVsZT0iZXZlbm9kZCI+JiN4YTsgICAgICAgIDxyZWN0IGlkPSJSZWN0YW5nbGUiIGZpbGw9IiMyNDJGM0UiIHg9IjAiIHk9IjAiIHdpZHRoPSI0MCIgaGVpZ2h0PSI0MCIvPiYjeGE7ICAgICAgICA8cGF0aCBkPSJNMjguOTkzLDI3Ljk5NTI2ODkgTDExLjQ4NywyNy45NzYyNjg5IEM5LjEyMSwyNy45NzQyNjg5IDcuMTU0LDI2LjEyOTI2ODkgNy4wMTIsMjMuNzc1MjY4OSBDNy4wMDQsMjMuNjUyMjY4OSA3LDIzLjUyNjI2ODkgNywyMy4zOTcyNjg5IEM3LDIwLjI5OTI2ODkgOS4wOTEsMTkuMjg3MjY4OSAxMC4zMzcsMTguOTU5MjY4OSBDMTAuNTc3LDE4Ljg5NjI2ODkgMTAuNzM1LDE4LjY2NjI2ODkgMTAuNzA3LDE4LjQxOTI2ODkgQzEwLjY3NiwxOC4xNDkyNjg5IDEwLjY1OSwxNy44NzUyNjg5IDEwLjY1OSwxNy41OTYyNjg5IEMxMC42NTksMTUuMTMyMjY4OSAxMi4zMDgsMTIuNDgyMjY4OSAxNC40MTUsMTEuNTYzMjY4OSBDMTUuMzU5LDExLjE1MDI2ODkgMTYuMjMyLDEwLjk4NjI2ODkgMTcuMDIzLDEwLjk4NjI2ODkgQzE5LjI3NiwxMC45ODYyNjg5IDIwLjg2NywxMi4zMTUyNjg5IDIxLjU2MSwxMy4wMzMyNjg5IEMyMi4zMjksMTMuODI2MjY4OSAyMi45MjcsMTQuODM4MjY4OSAyMy4zNCwxNi4wNDIyNjg5IEMyMy40LDE2LjIxOTI2ODkgMjMuNTU1LDE2LjM0NzI2ODkgMjMuNzQsMTYuMzc0MjY4OSBDMjMuOTE4LDE2LjQwMTI2ODkgMjQuMTA5LDE2LjMyMzI2ODkgMjQuMjE5LDE2LjE3MTI2ODkgQzI0LjgwNywxNS4zNTAyNjg5IDI1Ljc2NiwxNC45ODIyNjg5IDI2LjY1OSwxNS4yMzkyNjg5IEMyNy43ODIsMTUuNTYwMjY4OSAyOC41MTYsMTYuNzI3MjY4OSAyOC42MiwxOC4zMDcyNjg5IEMyOC41NzgsMTguNTc2MjY4OSAyOC43NTksMTguODI5MjY4OSAyOS4wMjcsMTguODc3MjY4OSBDMzAuMjIyLDE5LjA4OTI2ODkgMzMsMTkuOTU3MjY4OSAzMywyMy40NDAyNjg5IEMzMywyNy41ODkyNjg5IDI5LjExNCwyNy45ODIyNjg5IDI4Ljk5MywyNy45OTUyNjg5IE0yOS41OTQsMTcuOTc0MjY4OSBDMjkuMzc5LDE2LjA2NzI2ODkgMjguNCwxNC42OTYyNjg5IDI2LjkzNCwxNC4yNzgyNjg5
IEMyNS44OTksMTMuOTgyMjY4OSAyNC44MTEsMTQuMjQ1MjY4OSAyMy45ODksMTQuOTUwMjY4OSBDMjMuNTUzLDEzLjkzNjI2ODkgMjIuOTc5LDEzLjA2MDI2ODkgMjIuMjgsMTIuMzM4MjY4OSBDMjAuMDIzLDEwLjAwNTI2ODkgMTYuOTMzLDkuMzcyMjY4ODkgMTQuMDE0LDEwLjY0NjI2ODkgQzExLjUzMSwxMS43MzAyNjg5IDkuNjU5LDE0LjcxODI2ODkgOS42NTksMTcuNTk2MjY4OSBDOS42NTksMTcuNzcwMjY4OSA5LjY2NSwxNy45NDMyNjg5IDkuNjc2LDE4LjExNDI2ODkgQzguMzE5LDE4LjU3MzI2ODkgNiwxOS44NzUyNjg5IDYsMjMuMzk3MjY4OSBDNiwyMy41NDkyNjg5IDYuMDA0LDIzLjY5NjI2ODkgNi4wMTQsMjMuODM4MjY4OSBDNi4xODgsMjYuNzE2MjY4OSA4LjU5MywyOC45NzMyNjg5IDExLjQ4NiwyOC45NzYyNjg5IEwyOS4wMzQsMjguOTkzMjY4OSBDMjkuMDg0LDI4Ljk4OTI2ODkgMzQsMjguNTE5MjY4OSAzNCwyMy40NDAyNjg5IEMzNCwxOS41MDIyNjg5IDMxLjAwMywxOC4zMTQyNjg5IDI5LjU5NCwxNy45NzQyNjg5IiBpZD0iRmlsbC0xIiBmaWxsPSIjRkZGRkZGIi8+JiN4YTsgICAgPC9nPiYjeGE7PC9zdmc+" preserveAspectRatio="none" pointer-events="none"/><g transform="translate(168.5,18.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="62" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div>AWS Cloud</div></div></div></foreignObject><text x="31" y="12" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial"><div>AWS Cloud</div></text></switch></g><path d="M 419 381 L 463 381 L 463 383 L 502.03 383" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><path d="M 496.26 387 L 504.26 383 L 496.26 379" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><image x="370.5" y="356" width="48" height="48" 
xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAADAAAAAwCAYAAABXAvmHAAAAAXNSR0IArs4c6QAAAERlWElmTU0AKgAAAAgAAYdpAAQAAAABAAAAGgAAAAAAA6ABAAMAAAABAAEAAKACAAQAAAABAAAAMKADAAQAAAABAAAAMAAAAADbN2wMAAAHZElEQVRoBe1aXWxURRQ+s9uWmgIPhEaof0QTQ1SUgIEHEbU8YMC2i6HoG7yYKBAJ/QEfREF50P4ZEsQXE/tkMCWyC4LwACLBBKIYf8KLEaQmtiYYEjDEQrs7ft/dmeHu3Xu3d21rIWHCdH7OOTPfmTn3zMxZRO6kyV0BNR7Ta9Gqq0kWiZaVCSUPaZE6jMs8G5lzDDCjMpDTcj6h5XDLQTmjRIF1bGlMCnSn9DIAasYgjYBBsOWkQaA/AIX7WtPqWDmCft7/pEBPSi8C8E4MtNQNpuUC1joDUN+hHACwgZGsDE6dInpoROrAX4cdqgNtocpJCuWDTlbkJOS2tGfUGV9frGpZCnyQ0nOyIh0A0szRtZZLSsnuhMj+loz6OdaMhqmnSc/LiazCGBsxRq3p3pdU0r45rS7GHSu2Ap0Nul4lpA8Dz0C+hol7AKBz6wH1d9zJwvjeb9TTsADtUKIF9Brkyzonze0H1fEw/mBfLAW6mvR6CO5CrkBOJ5OyfvPnajA42FjaHSv0rESlfIQxUsgjyJvaMmrPaGOOqkBnSnfDV3B16E92tqblrfHwHmHA6M26U/IOTPRN0rWSnva0ag3jtX0lFTAr/yGYh8C4rjWjPrOCtgTPF6ivtO1ySny4h/HhFsl2N+mXQOvFWNXIG0rtBMwvPNHmQaHZYOHDwRvJIgCmf9QC464IY+JCcU5D22WwhLESW3HyvI2Ws6DMoNm0pdW2Yq58D3YAiyWCVQodayxyXSn9rjGny/BOC8O8U+gOZPM+nt4mTZuPAjHR/WbuNOaZYTAVTVmkQGeTXgyu1cjXcsPy2kR9sEVIQjo4NzEQC/Jqg62As0gB2EEHOejntxxWfxZwT0KDGIiFU1tsfhgFCvBuA+JSCFziIeVnnMw6sRATsRmMDg4PJpfMxUx4PdiaKe+E5YfvBopRgU3HTjzt4Sx2Q2AHTuk1KN3lzyngHSL5W6VgW/bHHh3WBl4FQL+VIVM2KzFhJ3bAjhqA9VX7bToT6mkQfryzAedCmRczHv8XkftDMrq8FEbrh1lQNlbyMPHGC4wGqyfndiCnZAU+EtEJoduKneD/N0Qx+86IOVE85fQTG681xAq505R1O8CXlDdY/gDzqrfcH4PNYQVAtwMwZD4B6av4/IudsMpvwBSeLyXQ2ahPhNHhLI5gB98Lo4X2ERuAOqxgcgqg7inAl1SocHTnTgBJRpOxJkqeiaAvQX9sBYgNi8WUX2xUihSorihbAQ88rr7PeUPH/ANb/gqsJRUPDkVs/9zwekMVCPKX1ca9/UQ5AjC9ctgjef07QNN5mA9wlL9ESWxv1lVTr8sLeF5OJ4/ZUsEJuS6XkyH46kNRz0zzfFyZSEi1X84bJydXq4bl0OtfqutRcxtsJDszL1LAix6UUGDaDXkF5rLbArCTof0JbJ1urQt97bbfX8KG8ZqTNr8s5TweEIarZCPqfECFJmIDG1OxAiAMeJvK0EeppGUmb1VwBT8C7Q+ONQc3rGQJlKh1fYEKxGayC+U3WIRfHVnLE6jPh3+PlPV4DTZitbJuB6Ddea4g/j0J4qeWIarkoQK7327pXY16Ler0KqMnJR/jkdRrGbsb9dtQaL5tR5Z41NCHEqvluXmQIdzndWppssRbrfQCYgDF0KTFhgXPJ+8y1yR/oDUbWj0evA/heXcM2tdbfpYQ/j07LIsTFXIEDZqBS6RdT8gCdkzJyfdYuPsdkRWYYLJSXsyOyCm0CsOSSo5jh5b5+U0g7Cf0DbZm5J6iyxw7MMkBCsGTrCoQbtZ3BcGTTlDJKpzCAfCWVqllXrXIY0XgyQAZKN+AWiH4vHD99mc1RG8mi4kYLXhSnQl5DQRaWUKTjXR5rAfSEB/v5gF/IkDrL0HjmF/76Bf9svj2Mj7akJ/GOrEQE+vwZB5G1pkKFDBR4pP0JCA4V3j1Ehd78hKxGO92MhjJdl7IwgPSLbDf0xBoQbhvD9+k02tF5W54HJX4Fnq9mpa5VsaUM0vQeB+a66N77tTKY3UX+GiVtp8lQ47EwjqxsfSngh0gwYS496Faw1glP+57RQj/CnISo6z1ssgstBn+oz8fRq4J0pJZvGNH5C/yId3to09FexiA7Flwn4/G+9GVR2pJ18rES2vQty8s/F60A2AUhrjxRKxHNcVYZVuf2gYv8DTALiTdpZz0t+5Xp7oa9FMqKY+6flZAazmozrEKP18PY32AdZt0Vs4hAv1tR0ovB+KCwxMXvbNr+lQWu8LAVgoyDGw5k7ZjsHRu1N/JugmnH0W1Akwvh8VFgzLj2Tbx0b0YcwQP+eVR4fYiE7IgjMAmtrHVvRzQ0ia69AV3OdWmKPAkRipAIlzbHqDvRrUaSuzlltIuSZuIxLHzcwhXvhoz9XgYSkwWCwzu7gU/cDDcN95Ruwn7gcMqf1v/xGSVMGH3TrRXsw/++/b5kc8qwZJRYtgeg8BLXX/Iz6z2fX3L/MzqwJrKbftDd1AReg+G+xgxw2XL/18N7AHFF9SE/FeDIJY77f97Bf4Fv1vzKjbpTWYAAAAASUVORK5CYII" preserveAspectRatio="none" pointer-events="none"/><g transform="translate(321.5,412.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="146" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div>Application Load Balanacer</div></div></div></foreignObject><text x="73" y="12" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial"><div>Application Load 
Balancer</div></text></switch></g><rect x="931" y="759.5" width="257" height="134" fill="none" stroke="#000000" stroke-opacity="0.4" stroke-dasharray="3 3" pointer-events="none"/><rect x="931" y="759.5" width="257" height="134" fill="none" stroke="#000000" stroke-opacity="0.4" stroke-dasharray="3 3" pointer-events="none"/><g transform="translate(1059.5,791.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="70" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 1px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><b><br /><br /><br /></b><br /><br /></div></div></foreignObject><text x="0" y="41" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial"><b><br><br><br></b><br><br></text></switch></g><ellipse cx="949.78" cy="815.34" rx="15" ry="15" fill="#000000" stroke="#000000" pointer-events="none"/><g transform="translate(944.5,804.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="10" height="20" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 11px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><div style="font-size: 18px;"><b><font color="#ffffff" style="font-size: 18px;">5</font></b></div></div></div></foreignObject><text x="5" y="16" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><image x="1104.44" y="799.74" width="64" height="64"
xlink:href="data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHhtbG5zOnhsaW5rPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5L3hsaW5rIiB3aWR0aD0iNjRweCIgaGVpZ2h0PSI2NHB4IiB2aWV3Qm94PSIwIDAgNjQgNjQiIHZlcnNpb249IjEuMSI+JiN4YTsgICAgPHRpdGxlPkljb24tQXJjaGl0ZWN0dXJlLzQ4L0FyY2hfQW1hem9uLUVGU180ODwvdGl0bGU+JiN4YTsgICAgPGcgaWQ9Ikljb24tQXJjaGl0ZWN0dXJlLzQ4L0FyY2hfQW1hem9uLUVGU180OCIgc3Ryb2tlPSJub25lIiBzdHJva2Utd2lkdGg9IjEiIGZpbGw9Im5vbmUiIGZpbGwtcnVsZT0iZXZlbm9kZCI+JiN4YTsgICAgICAgIDxnIGlkPSJJY29uLUFyY2hpdGVjdHVyZS1CRy80OC9TdG9yYWdlIiBmaWxsPSIjN0FBMTE2Ij4mI3hhOyAgICAgICAgICAgIDxyZWN0IGlkPSJSZWN0YW5nbGUiIHg9IjAiIHk9IjAiIHdpZHRoPSI2NCIgaGVpZ2h0PSI2NCIvPiYjeGE7ICAgICAgICA8L2c+JiN4YTsgICAgICAgIDxnIGlkPSJJY29uLVNlcnZpY2UvNDgvQW1hem9uLUVGU180OCIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoOS44NTM1MDAsIDEwLjAwMDAwMCkiIGZpbGw9IiNGRkZGRkYiPiYjeGE7ICAgICAgICAgICAgPHBhdGggZD0iTTQzLjAwMjY1MiwzLjU1MjcxMzY4ZS0xNSBMMzYuMDIxMjE2MywzLjU1MjcxMzY4ZS0xNSBMMzYuMDIxMjE2MywyIEw0MC40NzgzNjQzLDIgTDM1LjE5OTQwMTYsNy4yOTMgTDM2LjYwOTY1MTYsOC43MDcgTDQyLjAwNTMwNDEsMy4yOTcgTDQyLjAwNTMwNDEsOCBMNDQsOCBMNDQsMSBDNDQsMC40NDggNDMuNTU0MTg1NSwzLjU1MjcxMzY4ZS0xNSA0My4wMDI2NTIsMy41NTI3MTM2OGUtMTUgTDQzLjAwMjY1MiwzLjU1MjcxMzY4ZS0xNSBaIE0xLjk5NDY5NTkyLDMuMjk3IEw3LjM5MDM0ODM5LDguNzA3IEw4LjgwMDU5ODQxLDcuMjkzIEwzLjUyMTYzNTY1LDIgTDcuOTc4NzgzNjksMiBMNy45Nzg3ODM2OSwzLjU1MjcxMzY4ZS0xNSBMMC45OTczNDc5NjEsMy41NTI3MTM2OGUtMTUgQzAuNDQ1ODE0NTM5LDMuNTUyNzEzNjhlLTE1IDAsMC40NDggMCwxIEwwLDggTDEuOTk0Njk1OTIsOCBMMS45OTQ2OTU5MiwzLjI5NyBaIE00Mi4wMDUzMDQxLDQwLjcwMyBMMzYuNjA5NjUxNiwzNS4yOTMgTDM1LjE5OTQwMTYsMzYuNzA3IEw0MC40NzgzNjQzLDQyIEwzNi4wMjEyMTYzLDQyIEwzNi4wMjEyMTYzLDQ0IEw0My4wMDI2NTIsNDQgQzQzLjU1NDE4NTUsNDQgNDQsNDMuNTUyIDQ0LDQzIEw0NCwzNiBMNDIuMDA1MzA0MSwzNiBMNDIuMDA1MzA0MSw0MC43MDMgWiBNMTAuOTg4Nzc5OCwxNi40NjUgQzEwLjk4ODc3OTgsMTMuNTIgMTMuMDQxMzIxOSwxMC4zNSAxNS42NTkzNjAzLDkuMjQ5IEMyMC4yMDYyNjk3LDcuMzQ0IDIzLjQyNzcwMzYsOS44NSAyNC42MDA1ODQ4LDExLjAxNyBDMjUuNTU5MDM2MiwxMS45NzEgMjYuMzA5MDQxOSwxMy4xOTMgMjYuODI3NjYyOCwxNC42NDkgQzI2Ljk1MTMzNCwxNC45OTUgMjcuMjU0NTI3NywxNS4yNDYgMjcuNjE2NTY1LDE1LjMwMSBDMjcuOTgwNTk3LDE1LjM1MyAyOC4zNDI2MzQ0LDE1LjIwNyAyOC41NjMwNDgzLDE0LjkxMyBDMjkuMTA1NjA1NSwxNC4xOSAyOS44OTQ1MDc4LDEzLjc3NiAzMC43MjkyODgsMTMuNzc2IEMzMS45ODU5NDY1LDEzLjc3NiAzMy41MDk4OTQxLDE0Ljc1MyAzMy42NzI0NjE5LDE3LjUwNyBDMzMuNjk4MzkyOSwxNy45NTggMzQuMDIxNTMzNiwxOC4zMzQgMzQuNDYyMzYxNCwxOC40MjcgQzM3LjcyMTY5NDYsMTkuMTE0IDM5LjM3NDMwMDIsMjAuOTE3IDM5LjM3NDMwMDIsMjMuNzg2IEMzOS4zNzQzMDAyLDI0LjUgMzkuMjg4NTI4MiwyNS4xNDggMzkuMTE4OTc5MSwyNS43MTIgTDQxLjAyOTg5NzgsMjYuMjg4IEM0MS4yNTUyOTg0LDI1LjUzNyA0MS4zNjg5OTYxLDI0LjY5NSA0MS4zNjg5OTYxLDIzLjc4NiBDNDEuMzY4OTk2MSwyMC4xNzUgMzkuMzI0NDMyOCwxNy42NiAzNS41ODczNjk5LDE2LjY0MiBDMzUuMTA0NjUzNSwxMy4zMDUgMzIuNzk4Nzg1LDExLjc3NiAzMC43MjkyODgsMTEuNzc2IEMyOS43OTE3ODA5LDExLjc3NiAyOC44OTMxNzA0LDEyLjA2NCAyOC4xMjUyMTI1LDEyLjU5NCBDMjcuNTYzNzA1NiwxMS40NDUgMjYuODUzNTkzOSwxMC40NDEgMjYuMDA1ODQ4MSw5LjU5NyBDMjIuOTY4OTIzNSw2LjU3OCAxOC44MTM5NzE5LDUuNzU4IDE0Ljg4OTQwNzcsNy40MDQgQzExLjUyODM0NTEsOC44MTcgOC45OTQwODM5MSwxMi43MTIgOC45OTQwODM5MSwxNi40NjUgQzguOTk0MDgzOTEsMTYuNTg5IDguOTk3MDc1OTYsMTYuNzE2IDkuMDAzMDYwMDQsMTYuODQ2IEM1Ljk5OTA0Nzk5LDE3LjkxIDQuMjM0NzM5NDQsMjAuNDM0IDQuMjM0NzM5NDQsMjMuNzQzIEM0LjIzNDczOTQ0LDIzLjkxMiA0LjI0MjcxODIzLDI0LjA4MSA0LjI1MTY5NDM2LDI0LjI0MyBDNC40MTIyNjczOCwyNy4xNjQgNi4zNTUxMDEyMSwyOS42ODggOS4yMDE1MzIyOSwzMC42NzQgTDkuODUyODAwNTEsMjguNzg0IEM3Ljc3NjMyMjA1LDI4LjA2NCA2LjM1OTA5MDYsMjYuMjM5IDYuMjQyNDAwODksMjQuMTM0IEM2LjIzNTQxOTQ1LDI0LjAwOCA2LjIyOTQzNTM3LDIzLjg3NSA2LjIyOTQzNTM3LDIzLjc0MyBDNi4yMjk0MzUzNywyMC4xMDEgOC43ODE2NDg4LDE4LjkyIDEwLjMwNDU5OTEsMTguNTQxIEMxMC43OTIzM
DIzLDE4LjQxOSAxMS4xMTM0NDgzLDE3Ljk1MiAxMS4wNTQ2MDQ4LDE3LjQ1MSBDMTEuMDE1NzA4MiwxNy4xMzEgMTAuOTg4Nzc5OCwxNi43OTggMTAuOTg4Nzc5OCwxNi40NjUgTDEwLjk4ODc3OTgsMTYuNDY1IFogTTMwLjk3NTYzMywyNC42IEMzMC45NzU2MzMsMjQuMjg4IDMwLjkwNjgxNiwyNC4xMTIgMzAuODQ2OTc1MSwyNC4wNTcgQzMwLjc5NjExMDMsMjQuMDA5IDMwLjY2ODQ0OTgsMjMuOTg2IDMwLjUwODg3NDEsMjMuOTk4IEwyMC40NjU1ODAyLDI0IEMxOS45MTYwNDE0LDI0IDE5LjQ2ODIzMjIsMjMuNTUyIDE5LjQ2ODIzMjIsMjMgQzE5LjQ2ODIzMjIsMjIuODI0IDE5LjQxMTM4MzQsMjIuMzU2IDE5LjM0MDU3MTcsMjIgTDE3LjA1NDY1MDEsMjIgQzE2Ljk1OTkwMjEsMjIuMzQ5IDE2Ljg2NjE1MTQsMjIuODMxIDE2Ljg1OTE2OTksMjMuMDA5IEMxNi44MzgyMjU2LDIzLjU0NiAxNi4zOTczOTc4LDI0IDE1Ljg2MTgyMiwyNCBMMTMuNTU5OTQyOSwyNCBDMTMuMzI3NTYwOCwyMy45ODcgMTMuMjAzODg5NywyNC4wMDggMTMuMTUyMDI3NiwyNC4wNTcgQzEzLjA5MzE4NCwyNC4xMTIgMTMuMDIzMzY5NywyNC4yODggMTMuMDIzMzY5NywyNC42IEwxMy4wMjMzNjk3LDM1IEwzMC45NzU2MzMsMzUgTDMwLjk3NTYzMywyNC42IFogTTMyLjk3MDMyODksMjcgTDMyLjk3MDMyODksMzEuNzY0IEwzNS4zNDYwMTE3LDI3IEwzMi45NzAzMjg5LDI3IFogTTMyLjg2NTYwNzQsMzYuNDQ3IEwzMi44NTY2MzEyLDM2LjQ0MyBDMzIuNjkyMDY4OCwzNi43NyAzMi4zNjM5NDEzLDM3IDMxLjk3Mjk4MDksMzcgTDEyLjAyNjAyMTcsMzcgQzExLjQ3NTQ4NTYsMzcgMTEuMDI4NjczOCwzNi41NTIgMTEuMDI4NjczOCwzNiBMMTEuMDI4NjczOCwyNC42IEMxMS4wMjg2NzM4LDIzLjUzMiAxMS40NDM1NzA1LDIyLjkyIDExLjc5MTY0NDksMjIuNTk1IEMxMi4xMTg3NzUxLDIyLjI5IDEyLjY5NjIzOTUsMjEuOTM5IDEzLjYyOTc1NzIsMjIuMDAyIEwxNC45ODExNjM3LDIyLjAwMSBDMTUuMTQ3NzIwOCwyMS4xMjggMTUuNTQzNjY4LDIwIDE2LjQ4MjE3MjQsMjAgTDE5Ljk3Mzg4NzYsMjAgQzIwLjAzODcxNTIsMjAgMjAuMTAxNTQ4MiwyMC4wMDYgMjAuMTY0MzgxMSwyMC4wMTkgQzIwLjk2MjI1OTQsMjAuMTc0IDIxLjI3MTQzNzMsMjEuMTgzIDIxLjM5MDEyMTcsMjIgTDMwLjQzOTA1OTgsMjIgQzMxLjMwODc0NzIsMjEuOTM0IDMxLjg4MjIyMjMsMjIuMjg5IDMyLjIwODM1NTEsMjIuNTk1IEMzMi41NTU0MzIxLDIyLjkyIDMyLjk3MDMyODksMjMuNTMyIDMyLjk3MDMyODksMjQuNiBMMzIuOTcwMzI4OSwyNSBMMzYuOTU5NzIwNywyNSBDMzcuMzA1ODAwNSwyNSAzNy42MjU5NDkyLDI1LjE4IDM3LjgwODQ2MzksMjUuNDc0IEMzNy45OTA5Nzg1LDI1Ljc2OSAzOC4wMDY5MzYxLDI2LjEzNyAzNy44NTIzNDcyLDI2LjQ0NyBMMzIuODY1NjA3NCwzNi40NDcgWiBNOC44MDA1OTg0MSwzNi43MDcgTDMuNTIxNjM1NjUsNDIgTDcuOTc4NzgzNjksNDIgTDcuOTc4NzgzNjksNDQgTDAuOTk3MzQ3OTYxLDQ0IEMwLjQ0NTgxNDUzOSw0NCAwLDQzLjU1MiAwLDQzIEwwLDM2IEwxLjk5NDY5NTkyLDM2IEwxLjk5NDY5NTkyLDQwLjcwMyBMNy4zOTAzNDgzOSwzNS4yOTMgTDguODAwNTk4NDEsMzYuNzA3IFoiIGlkPSJBbWF6b24tRUZTX0ljb25fNDhfU3F1aWQiLz4mI3hhOyAgICAgICAgPC9nPiYjeGE7ICAgIDwvZz4mI3hhOzwvc3ZnPg" preserveAspectRatio="none" pointer-events="none"/><g transform="translate(1100.5,871.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="71" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div><font>Amazon EFS</font></div></div></div></foreignObject><text x="36" y="12" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><g transform="translate(967.5,761.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="185" height="36" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 14px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" 
style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font color="#000000" style="font-size: 16px;"><b>Monitoring and Analysis<br /></b><br /></font></div></div></foreignObject><text x="93" y="25" fill="#ffffff" text-anchor="middle" font-size="14px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 972.5 802 L 1036.5 802 L 1036.5 866 L 972.5 866 Z" fill="url(#mx-gradient-f34482-1-bc1356-1-s-0)" stroke="#ffffff" stroke-miterlimit="10" pointer-events="none"/><path d="M 1018 840.69 C 1018 837.14 1015.09 834.25 1011.52 834.25 C 1007.94 834.25 1005.04 837.14 1005.04 840.69 C 1005.04 844.24 1007.94 847.13 1011.52 847.13 C 1015.09 847.13 1018 844.24 1018 840.69 M 1019.82 840.69 C 1019.82 845.24 1016.1 848.94 1011.52 848.94 C 1006.94 848.94 1003.21 845.24 1003.21 840.69 C 1003.21 836.14 1006.94 832.44 1011.52 832.44 C 1016.1 832.44 1019.82 836.14 1019.82 840.69 M 1027.49 853.13 L 1021.12 847.43 C 1020.61 848.15 1020.01 848.81 1019.35 849.39 L 1025.71 855.09 C 1026.25 855.57 1027.09 855.53 1027.58 854.99 C 1028.07 854.45 1028.03 853.62 1027.49 853.13 M 1011.52 850.57 C 1017 850.57 1021.47 846.14 1021.47 840.69 C 1021.47 835.24 1017 830.81 1011.52 830.81 C 1006.03 830.81 1001.57 835.24 1001.57 840.69 C 1001.57 846.14 1006.03 850.57 1011.52 850.57 M 1028.94 856.2 C 1028.32 856.89 1027.46 857.24 1026.59 857.24 C 1025.84 857.24 1025.09 856.97 1024.49 856.44 L 1017.88 850.52 C 1016.04 851.7 1013.86 852.39 1011.52 852.39 C 1005.02 852.39 999.74 847.14 999.74 840.69 C 999.74 834.24 1005.02 829 1011.52 829 C 1018.01 829 1023.29 834.24 1023.29 840.69 C 1023.29 842.54 1022.84 844.29 1022.07 845.84 L 1028.71 851.78 C 1030 852.94 1030.1 854.92 1028.94 856.2 M 987.33 824.91 C 987.33 825.38 987.35 825.85 987.41 826.3 C 987.44 826.56 987.36 826.82 987.19 827.01 C 987.05 827.17 986.87 827.27 986.66 827.31 C 984.42 827.88 980.73 829.62 980.73 834.85 C 980.73 838.8 982.92 840.98 984.76 842.11 C 985.38 842.5 986.13 842.71 986.91 842.72 L 997.92 842.73 L 997.92 844.55 L 986.9 844.54 C 985.77 844.52 984.7 844.22 983.79 843.65 C 981.97 842.53 978.9 839.89 978.9 834.85 C 978.9 828.78 983.08 826.54 985.53 825.75 C 985.51 825.47 985.5 825.19 985.5 824.91 C 985.5 819.95 988.89 814.81 993.37 812.95 C 998.62 810.76 1004.19 811.85 1008.25 815.84 C 1009.51 817.08 1010.55 818.59 1011.34 820.33 C 1012.41 819.45 1013.73 818.96 1015.11 818.96 C 1017.84 818.96 1020.91 821.02 1021.45 825.51 C 1024.01 826.1 1029.4 828.14 1029.4 834.92 C 1029.4 837.63 1028.55 839.86 1026.86 841.57 L 1025.56 840.3 C 1026.9 838.94 1027.58 837.14 1027.58 834.92 C 1027.58 828.99 1022.58 827.52 1020.43 827.16 C 1020.18 827.12 1019.97 826.98 1019.83 826.78 C 1019.69 826.58 1019.64 826.34 1019.68 826.11 C 1019.38 822.43 1017.18 820.78 1015.11 820.78 C 1013.81 820.78 1012.59 821.41 1011.76 822.51 C 1011.55 822.77 1011.23 822.91 1010.89 822.86 C 1010.56 822.81 1010.28 822.59 1010.17 822.27 C 1009.42 820.22 1008.34 818.49 1006.97 817.13 C 1003.44 813.67 998.63 812.73 994.08 814.62 C 990.29 816.19 987.33 820.71 987.33 824.91" fill="#ffffff" stroke="none" pointer-events="none"/><g transform="translate(948.5,873.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="112" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(35, 47, 62); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div 
xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div>Amazon CloudWatch</div></div></div></foreignObject><text x="56" y="12" fill="#232F3E" text-anchor="middle" font-size="12px" font-family="Arial"><div>Amazon CloudWatch</div></text></switch></g><rect x="135" y="489.47" width="1054" height="216" fill="none" stroke="#d79b00" stroke-opacity="0.4" stroke-width="2" stroke-dasharray="6 6" pointer-events="none"/><rect x="135" y="489.47" width="1054" height="216" fill="none" stroke="#d79b00" stroke-opacity="0.4" stroke-width="2" stroke-dasharray="6 6" pointer-events="none"/><g transform="translate(661.5,582.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="27" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 1px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><b><br /><br /></b></div></div></foreignObject><text x="0" y="20" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial"><b><br><br></b></text></switch></g><g transform="translate(534.5,467.5)rotate(90,0,7.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="15" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; font-weight: bold; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font style="font-size: 14px;"><br /></font></div></div></foreignObject><text x="0" y="14" fill="#ffffff" text-anchor="middle" font-size="12px" font-family="Arial" font-weight="bold"><font style="font-size: 14px;"><br></font></text></switch></g><g transform="translate(1129.5,470.5)rotate(90,0,7.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="15" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; font-weight: bold; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font style="font-size: 14px;"><br /></font></div></div></foreignObject><text x="0" y="14" fill="#ffffff" text-anchor="middle" font-size="12px" font-family="Arial" font-weight="bold"><font style="font-size: 14px;"><br></font></text></switch></g><path d="M 151 495.5 L 199 495.5 L 199 543.5 L 151 543.5 Z" fill="url(#mx-gradient-f78e04-1-d05c17-1-s-0)" stroke="#ffffff" stroke-miterlimit="10" pointer-events="none"/><path d="M 173.63 500.31 C 173.51 500.31 173.38 500.35 173.28 500.41 L 158.37 509.01 C 158.14 509.14 157.99 509.39 157.99 509.66 L 157.99 528.42 C 157.99 528.68 158.14 528.93 158.37 529.07 L 174.59 538.56 C 174.82 538.7 175.11 538.7 175.35 538.56 L 190.23 529.9 C 190.47 529.77 190.61 529.52 190.61 529.25 C 190.61 528.98 190.46 528.73 190.22 528.59 L 183.08 524.57 C 
182.85 524.43 182.56 524.44 182.33 524.57 L 175.05 528.84 L 166.56 523.95 L 166.56 514.17 L 174.03 509.89 C 174.26 509.76 174.41 509.51 174.41 509.24 L 174.41 501.06 C 174.41 500.86 174.32 500.66 174.18 500.52 C 174.03 500.38 173.83 500.3 173.63 500.31 Z M 176.48 500.34 C 176.27 500.33 176.08 500.41 175.93 500.55 C 175.78 500.69 175.7 500.89 175.7 501.09 L 175.7 509.28 C 175.7 509.55 175.85 509.79 176.08 509.93 L 183.42 514.2 L 183.42 522.76 C 183.42 523.03 183.56 523.28 183.8 523.41 L 190.87 527.52 C 191.1 527.66 191.39 527.66 191.63 527.52 C 191.86 527.39 192.01 527.14 192 526.87 L 192 509.68 C 192.01 509.41 191.86 509.16 191.63 509.03 L 176.84 500.44 C 176.73 500.37 176.6 500.34 176.48 500.34 Z M 172.9 502.37 L 172.9 508.8 L 165.43 513.08 C 165.19 513.21 165.05 513.46 165.05 513.73 L 165.05 524.38 C 165.05 524.65 165.19 524.9 165.42 525.04 L 174.67 530.37 C 174.91 530.5 175.2 530.5 175.43 530.36 L 182.72 526.09 L 188.34 529.26 L 174.97 537.04 L 159.5 527.98 L 159.5 510.09 Z M 177.21 502.4 L 190.5 510.11 L 190.5 525.56 L 184.93 522.33 L 184.93 513.77 C 184.93 513.5 184.78 513.25 184.55 513.11 L 177.21 508.84 Z" fill="#ffffff" stroke="none" pointer-events="none"/><g transform="translate(138.5,550.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="72" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(35, 47, 62); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div>Amazon ECS</div></div></div></foreignObject><text x="36" y="12" fill="#232F3E" text-anchor="middle" font-size="12px" font-family="Arial"><div>Amazon ECS</div></text></switch></g><rect x="592.5" y="506.5" width="179" height="180" fill="none" stroke="#d79b00" stroke-opacity="0.4" stroke-width="2" stroke-dasharray="6 6" pointer-events="none"/><rect x="592.5" y="506.5" width="179" height="180" fill="none" stroke="#d79b00" stroke-opacity="0.4" stroke-width="2" stroke-dasharray="6 6" pointer-events="none"/><g transform="translate(628.5,509.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="105" height="171" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 106px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><b style="">Migration Console<br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /></b></div></div></foreignObject><text x="53" y="92" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><rect x="1012" y="612" width="131" height="49" rx="1" ry="1" fill="#000000" stroke="#000000" transform="translate(2,3)" opacity="0.25"/><rect x="1012" y="612" width="131" height="49" rx="1" ry="1" fill="#ffffff" stroke="#dddddd" pointer-events="none"/><g transform="translate(1030.5,629.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="93" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div 
xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div style="text-align: right;">          Replayer N</div></div></div></foreignObject><text x="47" y="12" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 1019.5 626.76 L 1019.5 627.38 L 1019.5 647.24 L 1051.5 647.24 L 1051.5 626.76 Z M 1020.73 627.99 L 1050.27 627.99 L 1050.27 646.01 L 1020.73 646.01 Z M 1022.79 630.01 L 1022.79 643.99 L 1024.02 643.99 L 1024.02 630.01 Z M 1026.56 630.01 L 1026.56 643.99 L 1027.79 643.99 L 1027.79 630.01 Z M 1043.24 630.01 L 1043.24 643.99 L 1044.47 643.99 L 1044.47 630.01 Z M 1047.03 630.01 L 1047.03 643.99 L 1048.26 643.99 L 1048.26 630.01 Z M 1036.23 630.7 L 1036.23 631.92 L 1034.84 631.92 L 1034.84 630.71 L 1033.62 630.71 L 1033.62 631.92 L 1032.3 631.92 L 1032.3 630.71 L 1031.07 630.71 L 1031.07 631.92 L 1031.05 631.92 C 1030.88 631.92 1030.73 631.99 1030.61 632.1 C 1030.5 632.22 1030.43 632.38 1030.43 632.54 L 1030.43 632.56 L 1029.17 632.56 L 1029.17 633.79 L 1030.43 633.79 L 1030.43 635.13 L 1029.18 635.13 L 1029.18 636.36 L 1030.43 636.36 L 1030.43 637.72 L 1029.18 637.72 L 1029.18 638.95 L 1030.43 638.95 L 1030.43 640.25 L 1029.17 640.25 L 1029.17 641.48 L 1030.43 641.48 C 1030.44 641.82 1030.71 642.08 1031.05 642.08 L 1031.07 642.08 L 1031.07 643.38 L 1032.3 643.38 L 1032.3 642.08 L 1033.62 642.08 L 1033.62 643.36 L 1034.84 643.36 L 1034.84 642.08 L 1036.23 642.08 L 1036.23 643.36 L 1037.46 643.36 L 1037.46 642.08 L 1038.76 642.08 L 1038.76 643.38 L 1039.99 643.38 L 1039.99 642.08 L 1040 642.08 C 1040.34 642.08 1040.61 641.82 1040.62 641.48 L 1041.85 641.48 L 1041.85 640.25 L 1040.62 640.25 L 1040.62 638.95 L 1041.87 638.95 L 1041.87 637.72 L 1040.62 637.72 L 1040.62 636.36 L 1041.87 636.36 L 1041.87 635.13 L 1040.62 635.13 L 1040.62 633.79 L 1041.86 633.79 L 1041.86 632.56 L 1040.62 632.56 L 1040.62 632.54 C 1040.62 632.38 1040.55 632.22 1040.44 632.1 C 1040.32 631.99 1040.17 631.92 1040 631.92 L 1039.99 631.92 L 1039.99 630.72 L 1038.76 630.72 L 1038.76 631.92 L 1037.46 631.92 L 1037.46 630.7 Z M 1031.66 633.15 L 1039.39 633.15 L 1039.39 640.85 L 1031.66 640.85 Z" fill="#d05c17" stroke="none" pointer-events="none"/><rect x="948" y="572" width="131" height="49" rx="1" ry="1" fill="#000000" stroke="#000000" transform="translate(2,3)" opacity="0.25"/><rect x="948" y="572" width="131" height="49" rx="1" ry="1" fill="#ffffff" stroke="#dddddd" pointer-events="none"/><g transform="translate(969.5,589.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="88" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div style="text-align: right;">         Replayer 2</div></div></div></foreignObject><text x="44" y="12" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 955 585.76 L 955 586.38 L 955 606.24 L 987 
606.24 L 987 585.76 Z M 956.23 586.99 L 985.77 586.99 L 985.77 605.01 L 956.23 605.01 Z M 958.29 589.01 L 958.29 602.99 L 959.52 602.99 L 959.52 589.01 Z M 962.06 589.01 L 962.06 602.99 L 963.29 602.99 L 963.29 589.01 Z M 978.74 589.01 L 978.74 602.99 L 979.97 602.99 L 979.97 589.01 Z M 982.53 589.01 L 982.53 602.99 L 983.76 602.99 L 983.76 589.01 Z M 971.73 589.7 L 971.73 590.92 L 970.34 590.92 L 970.34 589.71 L 969.12 589.71 L 969.12 590.92 L 967.8 590.92 L 967.8 589.71 L 966.57 589.71 L 966.57 590.92 L 966.55 590.92 C 966.38 590.92 966.23 590.99 966.11 591.1 C 966 591.22 965.93 591.38 965.93 591.54 L 965.93 591.56 L 964.67 591.56 L 964.67 592.79 L 965.93 592.79 L 965.93 594.13 L 964.68 594.13 L 964.68 595.36 L 965.93 595.36 L 965.93 596.72 L 964.68 596.72 L 964.68 597.95 L 965.93 597.95 L 965.93 599.25 L 964.67 599.25 L 964.67 600.48 L 965.93 600.48 C 965.94 600.82 966.21 601.08 966.55 601.08 L 966.57 601.08 L 966.57 602.38 L 967.8 602.38 L 967.8 601.08 L 969.12 601.08 L 969.12 602.36 L 970.34 602.36 L 970.34 601.08 L 971.73 601.08 L 971.73 602.36 L 972.96 602.36 L 972.96 601.08 L 974.26 601.08 L 974.26 602.38 L 975.49 602.38 L 975.49 601.08 L 975.5 601.08 C 975.84 601.08 976.11 600.82 976.12 600.48 L 977.35 600.48 L 977.35 599.25 L 976.12 599.25 L 976.12 597.95 L 977.37 597.95 L 977.37 596.72 L 976.12 596.72 L 976.12 595.36 L 977.37 595.36 L 977.37 594.13 L 976.12 594.13 L 976.12 592.79 L 977.36 592.79 L 977.36 591.56 L 976.12 591.56 L 976.12 591.54 C 976.12 591.38 976.05 591.22 975.94 591.1 C 975.82 590.99 975.67 590.92 975.5 590.92 L 975.49 590.92 L 975.49 589.72 L 974.26 589.72 L 974.26 590.92 L 972.96 590.92 L 972.96 589.7 Z M 967.16 592.15 L 974.89 592.15 L 974.89 599.85 L 967.16 599.85 Z" fill="#d05c17" stroke="none" pointer-events="none"/><rect x="417.5" y="609" width="131" height="49" rx="1" ry="1" fill="#000000" stroke="#000000" transform="translate(2,3)" opacity="0.25"/><rect x="417.5" y="609" width="131" height="49" rx="1" ry="1" fill="#ffffff" stroke="#dddddd" pointer-events="none"/><g transform="translate(434.5,626.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="97" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div style="text-align: right;">          RFS Task N</div></div></div></foreignObject><text x="49" y="12" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 425 623.76 L 425 624.38 L 425 644.24 L 457 644.24 L 457 623.76 Z M 426.23 624.99 L 455.77 624.99 L 455.77 643.01 L 426.23 643.01 Z M 428.29 627.01 L 428.29 640.99 L 429.52 640.99 L 429.52 627.01 Z M 432.06 627.01 L 432.06 640.99 L 433.29 640.99 L 433.29 627.01 Z M 448.74 627.01 L 448.74 640.99 L 449.97 640.99 L 449.97 627.01 Z M 452.53 627.01 L 452.53 640.99 L 453.76 640.99 L 453.76 627.01 Z M 441.73 627.7 L 441.73 628.92 L 440.34 628.92 L 440.34 627.71 L 439.12 627.71 L 439.12 628.92 L 437.8 628.92 L 437.8 627.71 L 436.57 627.71 L 436.57 628.92 L 436.55 628.92 C 436.38 628.92 436.23 628.99 436.11 629.1 C 436 629.22 435.93 629.38 435.93 629.54 L 435.93 629.56 L 434.67 629.56 L 434.67 630.79 L 435.93 630.79 L 435.93 632.13 L 434.68 632.13 L 434.68 
633.36 L 435.93 633.36 L 435.93 634.72 L 434.68 634.72 L 434.68 635.95 L 435.93 635.95 L 435.93 637.25 L 434.67 637.25 L 434.67 638.48 L 435.93 638.48 C 435.94 638.82 436.21 639.08 436.55 639.08 L 436.57 639.08 L 436.57 640.38 L 437.8 640.38 L 437.8 639.08 L 439.12 639.08 L 439.12 640.36 L 440.34 640.36 L 440.34 639.08 L 441.73 639.08 L 441.73 640.36 L 442.96 640.36 L 442.96 639.08 L 444.26 639.08 L 444.26 640.38 L 445.49 640.38 L 445.49 639.08 L 445.5 639.08 C 445.84 639.08 446.11 638.82 446.12 638.48 L 447.35 638.48 L 447.35 637.25 L 446.12 637.25 L 446.12 635.95 L 447.37 635.95 L 447.37 634.72 L 446.12 634.72 L 446.12 633.36 L 447.37 633.36 L 447.37 632.13 L 446.12 632.13 L 446.12 630.79 L 447.36 630.79 L 447.36 629.56 L 446.12 629.56 L 446.12 629.54 C 446.12 629.38 446.05 629.22 445.94 629.1 C 445.82 628.99 445.67 628.92 445.5 628.92 L 445.49 628.92 L 445.49 627.72 L 444.26 627.72 L 444.26 628.92 L 442.96 628.92 L 442.96 627.7 Z M 437.16 630.15 L 444.89 630.15 L 444.89 637.85 L 437.16 637.85 Z" fill="#d05c17" stroke="none" pointer-events="none"/><rect x="353.5" y="569" width="131" height="49" rx="1" ry="1" fill="#000000" stroke="#000000" transform="translate(2,3)" opacity="0.25"/><rect x="353.5" y="569" width="131" height="49" rx="1" ry="1" fill="#ffffff" stroke="#dddddd" pointer-events="none"/><g transform="translate(372.5,586.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="92" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div style="text-align: right;">         RFS Task 2</div></div></div></foreignObject><text x="46" y="12" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><rect x="216" y="506" width="371" height="180" fill="none" stroke="#d79b00" stroke-opacity="0.4" stroke-width="2" stroke-dasharray="6 6" pointer-events="none"/><rect x="216" y="506" width="371" height="180" fill="none" stroke="#d79b00" stroke-opacity="0.4" stroke-width="2" stroke-dasharray="6 6" pointer-events="none"/><g transform="translate(315.5,509.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="171" height="171" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 172px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><font style="font-size: 12px;"><b><span>Reindex-from-Snapshot (RFS)</span><span><br /></span></b></font><b style=""><font style="font-size: 12px;"><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /></font><br /></b></div></div></foreignObject><text x="86" y="92" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 360.5 582.76 L 360.5 583.38 L 360.5 603.24 L 392.5 603.24 L 392.5 582.76 Z M 361.73 583.99 L 391.27 583.99 L 391.27 602.01 L 361.73 602.01 Z M 363.79 586.01 L 363.79 599.99 L 
365.02 599.99 L 365.02 586.01 Z M 367.56 586.01 L 367.56 599.99 L 368.79 599.99 L 368.79 586.01 Z M 384.24 586.01 L 384.24 599.99 L 385.47 599.99 L 385.47 586.01 Z M 388.03 586.01 L 388.03 599.99 L 389.26 599.99 L 389.26 586.01 Z M 377.23 586.7 L 377.23 587.92 L 375.84 587.92 L 375.84 586.71 L 374.62 586.71 L 374.62 587.92 L 373.3 587.92 L 373.3 586.71 L 372.07 586.71 L 372.07 587.92 L 372.05 587.92 C 371.88 587.92 371.73 587.99 371.61 588.1 C 371.5 588.22 371.43 588.38 371.43 588.54 L 371.43 588.56 L 370.17 588.56 L 370.17 589.79 L 371.43 589.79 L 371.43 591.13 L 370.18 591.13 L 370.18 592.36 L 371.43 592.36 L 371.43 593.72 L 370.18 593.72 L 370.18 594.95 L 371.43 594.95 L 371.43 596.25 L 370.17 596.25 L 370.17 597.48 L 371.43 597.48 C 371.44 597.82 371.71 598.08 372.05 598.08 L 372.07 598.08 L 372.07 599.38 L 373.3 599.38 L 373.3 598.08 L 374.62 598.08 L 374.62 599.36 L 375.84 599.36 L 375.84 598.08 L 377.23 598.08 L 377.23 599.36 L 378.46 599.36 L 378.46 598.08 L 379.76 598.08 L 379.76 599.38 L 380.99 599.38 L 380.99 598.08 L 381 598.08 C 381.34 598.08 381.61 597.82 381.62 597.48 L 382.85 597.48 L 382.85 596.25 L 381.62 596.25 L 381.62 594.95 L 382.87 594.95 L 382.87 593.72 L 381.62 593.72 L 381.62 592.36 L 382.87 592.36 L 382.87 591.13 L 381.62 591.13 L 381.62 589.79 L 382.86 589.79 L 382.86 588.56 L 381.62 588.56 L 381.62 588.54 C 381.62 588.38 381.55 588.22 381.44 588.1 C 381.32 587.99 381.17 587.92 381 587.92 L 380.99 587.92 L 380.99 586.72 L 379.76 586.72 L 379.76 587.92 L 378.46 587.92 L 378.46 586.7 Z M 372.66 589.15 L 380.39 589.15 L 380.39 596.85 L 372.66 596.85 Z" fill="#d05c17" stroke="none" pointer-events="none"/><rect x="780" y="506.5" width="371" height="180" fill="none" stroke="#d79b00" stroke-opacity="0.4" stroke-width="2" stroke-dasharray="6 6" pointer-events="none"/><rect x="780" y="506.5" width="371" height="180" fill="none" stroke="#d79b00" stroke-opacity="0.4" stroke-width="2" stroke-dasharray="6 6" pointer-events="none"/><g transform="translate(939.5,509.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="51" height="171" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 52px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><b style="">Replayer<br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /></b></div></div></foreignObject><text x="26" y="92" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 654.73 557.03 C 654.05 557.03 653.5 557.58 653.5 558.26 L 653.5 596.74 C 653.5 597.42 654.05 597.97 654.73 597.97 L 716.27 597.97 C 716.95 597.97 717.5 597.42 717.5 596.74 L 717.5 558.26 C 717.5 557.58 716.95 557.03 716.27 557.03 Z M 655.96 559.49 L 715.04 559.49 L 715.04 595.51 L 655.96 595.51 Z M 673.97 563.36 C 673.65 563.36 673.33 563.49 673.1 563.72 C 672.87 563.95 672.74 564.26 672.74 564.59 L 672.74 578.82 C 672.74 579.15 672.87 579.46 673.1 579.69 C 673.33 579.92 673.65 580.05 673.97 580.05 L 684.29 580.05 L 684.29 582.63 L 675.26 582.63 C 674.58 582.63 674.03 583.18 674.03 583.86 L 674.03 590.36 C 674.03 591.04 674.58 591.59 675.26 591.59 L 695.79 591.59 C 696.12 591.59 696.43 591.46 696.66 591.23 C 
696.89 591 697.02 590.69 697.02 590.36 L 697.02 583.86 C 697.02 583.53 696.89 583.22 696.66 582.99 C 696.43 582.76 696.12 582.63 695.79 582.63 L 686.75 582.63 L 686.75 580.05 L 697.1 580.05 C 697.43 580.05 697.74 579.92 697.97 579.69 C 698.2 579.46 698.33 579.15 698.33 578.82 L 698.33 564.59 C 698.33 563.91 697.78 563.36 697.1 563.36 Z M 660.01 563.48 L 660.01 591.48 L 662.47 591.48 L 662.47 563.48 Z M 667.56 563.48 L 667.56 591.48 L 670.02 591.48 L 670.02 563.48 Z M 700.97 563.48 L 700.97 591.48 L 703.43 591.48 L 703.43 563.48 Z M 708.57 563.48 L 708.57 591.48 L 711.03 591.48 L 711.03 563.48 Z M 675.2 565.82 L 695.87 565.82 L 695.87 577.59 L 675.2 577.59 Z M 676.5 585.09 L 694.56 585.09 L 694.56 589.13 L 676.5 589.13 Z M 689.45 586.44 L 689.45 587.67 L 691.9 587.67 L 691.9 586.44 Z" fill="#d05c17" stroke="none" pointer-events="none"/><g transform="translate(636.5,600.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="97" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(35, 47, 62); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div style="text-align: center;">Migration Console</div></div></div></foreignObject><text x="49" y="12" fill="#232F3E" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><rect x="276" y="527" width="131" height="49" rx="1" ry="1" fill="#000000" stroke="#000000" transform="translate(2,3)" opacity="0.25"/><rect x="276" y="527" width="131" height="49" rx="1" ry="1" fill="#ffffff" stroke="#dddddd" pointer-events="none"/><g transform="translate(290.5,530.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="102" height="41" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div style="text-align: right;">          </div><div style="text-align: right;">            RFS Task 1</div><div style="text-align: right;"><br /></div></div></div></foreignObject><text x="51" y="27" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 284.5 541.76 L 284.5 542.38 L 284.5 562.24 L 316.5 562.24 L 316.5 541.76 Z M 285.73 542.99 L 315.27 542.99 L 315.27 561.01 L 285.73 561.01 Z M 287.79 545.01 L 287.79 558.99 L 289.02 558.99 L 289.02 545.01 Z M 291.56 545.01 L 291.56 558.99 L 292.79 558.99 L 292.79 545.01 Z M 308.24 545.01 L 308.24 558.99 L 309.47 558.99 L 309.47 545.01 Z M 312.03 545.01 L 312.03 558.99 L 313.26 558.99 L 313.26 545.01 Z M 301.23 545.7 L 301.23 546.92 L 299.84 546.92 L 299.84 545.71 L 298.62 545.71 L 298.62 546.92 L 297.3 546.92 L 297.3 545.71 L 296.07 545.71 L 296.07 546.92 L 296.05 546.92 C 295.88 546.92 295.73 546.99 295.61 547.1 C 295.5 547.22 295.43 547.38 295.43 547.54 L 295.43 547.56 L 294.17 547.56 L 294.17 548.79 L 295.43 548.79 L 295.43 550.13 L 294.18 550.13 L 294.18 551.36 L 295.43 551.36 L 295.43 552.72 L 294.18 552.72 L 294.18 553.95 L 295.43 
553.95 L 295.43 555.25 L 294.17 555.25 L 294.17 556.48 L 295.43 556.48 C 295.44 556.82 295.71 557.08 296.05 557.08 L 296.07 557.08 L 296.07 558.38 L 297.3 558.38 L 297.3 557.08 L 298.62 557.08 L 298.62 558.36 L 299.84 558.36 L 299.84 557.08 L 301.23 557.08 L 301.23 558.36 L 302.46 558.36 L 302.46 557.08 L 303.76 557.08 L 303.76 558.38 L 304.99 558.38 L 304.99 557.08 L 305 557.08 C 305.34 557.08 305.61 556.82 305.62 556.48 L 306.85 556.48 L 306.85 555.25 L 305.62 555.25 L 305.62 553.95 L 306.87 553.95 L 306.87 552.72 L 305.62 552.72 L 305.62 551.36 L 306.87 551.36 L 306.87 550.13 L 305.62 550.13 L 305.62 548.79 L 306.86 548.79 L 306.86 547.56 L 305.62 547.56 L 305.62 547.54 C 305.62 547.38 305.55 547.22 305.44 547.1 C 305.32 546.99 305.17 546.92 305 546.92 L 304.99 546.92 L 304.99 545.72 L 303.76 545.72 L 303.76 546.92 L 302.46 546.92 L 302.46 545.7 Z M 296.66 548.15 L 304.39 548.15 L 304.39 555.85 L 296.66 555.85 Z" fill="#d05c17" stroke="none" pointer-events="none"/><path d="M 219.64 510 C 219.27 510 218.96 510.3 218.96 510.68 L 218.96 535.2 C 218.96 535.57 219.27 535.88 219.64 535.88 L 222.23 535.88 L 222.23 538.26 C 222.23 538.44 222.3 538.61 222.43 538.74 C 222.56 538.86 222.73 538.94 222.91 538.94 L 225.29 538.94 L 225.29 541.32 C 225.29 541.7 225.6 542 225.97 542 L 244.36 542 C 244.73 542 245.04 541.7 245.04 541.32 L 245.04 516.81 C 245.04 516.43 244.73 516.13 244.36 516.13 L 241.97 516.13 L 241.97 513.74 C 241.97 513.37 241.67 513.06 241.29 513.06 L 238.71 513.06 L 238.71 510.68 C 238.71 510.3 238.4 510 238.03 510 Z M 220.32 511.36 L 237.34 511.36 L 237.34 513.06 L 222.91 513.06 C 222.73 513.06 222.56 513.14 222.43 513.26 C 222.3 513.39 222.23 513.56 222.23 513.74 L 222.23 534.51 L 220.32 534.51 Z M 223.59 514.43 L 240.61 514.43 L 240.61 516.13 L 225.97 516.13 C 225.6 516.13 225.29 516.43 225.29 516.81 L 225.29 537.57 L 223.59 537.57 Z M 226.66 517.49 L 243.68 517.49 L 243.68 540.64 L 226.66 540.64 Z M 228.52 519.19 C 228.15 519.19 227.84 519.5 227.84 519.87 L 227.84 522.94 C 227.84 523.31 228.15 523.62 228.52 523.62 L 231.59 523.62 C 231.96 523.62 232.27 523.31 232.27 522.94 L 232.27 519.87 C 232.27 519.5 231.96 519.19 231.59 519.19 Z M 229.2 520.55 L 230.9 520.55 L 230.9 522.26 L 229.2 522.26 Z M 233.12 520.72 L 233.12 522.09 L 242.32 522.09 L 242.32 520.72 Z M 228.52 526.34 C 228.15 526.34 227.84 526.65 227.84 527.02 L 227.84 530.09 C 227.84 530.46 228.15 530.77 228.52 530.77 L 231.59 530.77 C 231.96 530.77 232.27 530.46 232.27 530.09 L 232.27 527.02 C 232.27 526.65 231.96 526.34 231.59 526.34 Z M 229.2 527.7 L 230.9 527.7 L 230.9 529.4 L 229.2 529.4 Z M 233.12 527.87 L 233.12 529.23 L 242.32 529.23 L 242.32 527.87 Z M 228.52 533.49 C 228.34 533.49 228.17 533.56 228.04 533.69 C 227.91 533.82 227.84 533.99 227.84 534.17 L 227.84 537.24 C 227.84 537.62 228.15 537.92 228.52 537.92 L 231.59 537.92 C 231.96 537.92 232.27 537.62 232.27 537.24 L 232.27 534.17 C 232.27 533.99 232.19 533.82 232.07 533.69 C 231.94 533.56 231.77 533.49 231.59 533.49 Z M 229.2 534.85 L 230.9 534.85 L 230.9 536.56 L 229.2 536.56 Z M 233.12 535.02 L 233.12 536.38 L 242.32 536.38 L 242.32 535.02 Z" fill="#d05c17" stroke="none" pointer-events="none"/><rect x="867.5" y="530.5" width="131" height="49" rx="1" ry="1" fill="#000000" stroke="#000000" transform="translate(2,3)" opacity="0.25"/><rect x="867.5" y="530.5" width="131" height="49" rx="1" ry="1" fill="#ffffff" stroke="#dddddd" pointer-events="none"/><g transform="translate(883.5,533.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" 
width="98" height="41" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div style="text-align: right;">            </div><div style="text-align: right;">            Replayer 1</div><div style="text-align: right;"><br /></div></div></div></foreignObject><text x="49" y="27" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 883.5 544.26 L 883.5 544.88 L 883.5 564.74 L 915.5 564.74 L 915.5 544.26 Z M 884.73 545.49 L 914.27 545.49 L 914.27 563.51 L 884.73 563.51 Z M 886.79 547.51 L 886.79 561.49 L 888.02 561.49 L 888.02 547.51 Z M 890.56 547.51 L 890.56 561.49 L 891.79 561.49 L 891.79 547.51 Z M 907.24 547.51 L 907.24 561.49 L 908.47 561.49 L 908.47 547.51 Z M 911.03 547.51 L 911.03 561.49 L 912.26 561.49 L 912.26 547.51 Z M 900.23 548.2 L 900.23 549.42 L 898.84 549.42 L 898.84 548.21 L 897.62 548.21 L 897.62 549.42 L 896.3 549.42 L 896.3 548.21 L 895.07 548.21 L 895.07 549.42 L 895.05 549.42 C 894.88 549.42 894.73 549.49 894.61 549.6 C 894.5 549.72 894.43 549.88 894.43 550.04 L 894.43 550.06 L 893.17 550.06 L 893.17 551.29 L 894.43 551.29 L 894.43 552.63 L 893.18 552.63 L 893.18 553.86 L 894.43 553.86 L 894.43 555.22 L 893.18 555.22 L 893.18 556.45 L 894.43 556.45 L 894.43 557.75 L 893.17 557.75 L 893.17 558.98 L 894.43 558.98 C 894.44 559.32 894.71 559.58 895.05 559.58 L 895.07 559.58 L 895.07 560.88 L 896.3 560.88 L 896.3 559.58 L 897.62 559.58 L 897.62 560.86 L 898.84 560.86 L 898.84 559.58 L 900.23 559.58 L 900.23 560.86 L 901.46 560.86 L 901.46 559.58 L 902.76 559.58 L 902.76 560.88 L 903.99 560.88 L 903.99 559.58 L 904 559.58 C 904.34 559.58 904.61 559.32 904.62 558.98 L 905.85 558.98 L 905.85 557.75 L 904.62 557.75 L 904.62 556.45 L 905.87 556.45 L 905.87 555.22 L 904.62 555.22 L 904.62 553.86 L 905.87 553.86 L 905.87 552.63 L 904.62 552.63 L 904.62 551.29 L 905.86 551.29 L 905.86 550.06 L 904.62 550.06 L 904.62 550.04 C 904.62 549.88 904.55 549.72 904.44 549.6 C 904.32 549.49 904.17 549.42 904 549.42 L 903.99 549.42 L 903.99 548.22 L 902.76 548.22 L 902.76 549.42 L 901.46 549.42 L 901.46 548.2 Z M 895.66 550.65 L 903.39 550.65 L 903.39 558.35 L 895.66 558.35 Z" fill="#d05c17" stroke="none" pointer-events="none"/><path d="M 791.64 511.5 C 791.27 511.5 790.96 511.8 790.96 512.18 L 790.96 536.7 C 790.96 537.07 791.27 537.38 791.64 537.38 L 794.23 537.38 L 794.23 539.76 C 794.23 540.13 794.53 540.44 794.91 540.44 L 797.29 540.44 L 797.29 542.82 C 797.29 543.2 797.6 543.5 797.97 543.5 L 816.36 543.5 C 816.73 543.5 817.04 543.2 817.04 542.82 L 817.04 518.31 C 817.04 517.93 816.73 517.63 816.36 517.63 L 813.97 517.63 L 813.97 515.24 C 813.97 514.87 813.67 514.56 813.29 514.56 L 810.71 514.56 L 810.71 512.18 C 810.71 511.8 810.4 511.5 810.03 511.5 Z M 792.32 512.86 L 809.34 512.86 L 809.34 514.56 L 794.91 514.56 C 794.53 514.56 794.23 514.87 794.23 515.24 L 794.23 536.01 L 792.32 536.01 Z M 795.59 515.93 L 812.61 515.93 L 812.61 517.63 L 797.97 517.63 C 797.6 517.63 797.29 517.93 797.29 518.31 L 797.29 539.07 L 795.59 539.07 Z M 798.66 518.99 L 815.68 518.99 L 815.68 542.14 L 798.66 542.14 Z M 800.52 520.69 C 800.15 520.69 799.84 521 799.84 521.37 L 799.84 524.44 C 799.84 
524.81 800.15 525.12 800.52 525.12 L 803.59 525.12 C 803.96 525.12 804.27 524.81 804.27 524.44 L 804.27 521.37 C 804.27 521 803.96 520.69 803.59 520.69 Z M 801.2 522.05 L 802.9 522.05 L 802.9 523.76 L 801.2 523.76 Z M 805.12 522.22 L 805.12 523.59 L 814.32 523.59 L 814.32 522.22 Z M 800.52 527.84 C 800.15 527.84 799.84 528.15 799.84 528.52 L 799.84 531.59 C 799.84 531.96 800.15 532.27 800.52 532.27 L 803.59 532.27 C 803.96 532.27 804.27 531.96 804.27 531.59 L 804.27 528.52 C 804.27 528.15 803.96 527.84 803.59 527.84 Z M 801.2 529.2 L 802.9 529.2 L 802.9 530.9 L 801.2 530.9 Z M 805.12 529.37 L 805.12 530.73 L 814.32 530.73 L 814.32 529.37 Z M 800.52 534.99 C 800.34 534.99 800.17 535.06 800.04 535.19 C 799.91 535.32 799.84 535.49 799.84 535.67 L 799.84 538.74 C 799.84 539.12 800.15 539.42 800.52 539.42 L 803.59 539.42 C 803.96 539.42 804.27 539.12 804.27 538.74 L 804.27 535.67 C 804.27 535.49 804.19 535.32 804.07 535.19 C 803.94 535.06 803.77 534.99 803.59 534.99 Z M 801.2 536.35 L 802.9 536.35 L 802.9 538.06 L 801.2 538.06 Z M 805.12 536.52 L 805.12 537.88 L 814.32 537.88 L 814.32 536.52 Z" fill="#d05c17" stroke="none" pointer-events="none"/><g transform="translate(4779.5,1919.5)rotate(90,0,7.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="15" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; font-weight: bold; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font style="font-size: 14px;"><br /></font></div></div></foreignObject><text x="0" y="14" fill="#ffffff" text-anchor="middle" font-size="12px" font-family="Arial" font-weight="bold"><font style="font-size: 14px;"><br></font></text></switch></g><rect x="738.5" y="398" width="131" height="49" rx="1" ry="1" fill="#000000" stroke="#000000" transform="translate(2,3)" opacity="0.25"/><rect x="738.5" y="398" width="131" height="49" rx="1" ry="1" fill="#ffffff" stroke="#dddddd" pointer-events="none"/><g transform="translate(739.5,416.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="129" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div style="text-align: right;">            Capture Proxy N</div></div></div></foreignObject><text x="65" y="12" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 747 408 L 775 408 L 775 436 L 747 436 Z" fill="url(#mx-gradient-f78e04-1-d05c17-1-s-0)" stroke="#ffffff" stroke-miterlimit="10" pointer-events="none"/><path d="M 760.42 410.8 C 760.01 410.8 759.66 411.15 759.66 411.56 L 759.66 413.92 L 760.54 413.92 L 760.54 411.67 L 771.32 411.67 L 771.32 422.45 L 769.07 422.45 L 769.07 423.33 L 771.44 423.33 C 771.85 423.33 772.2 422.98 772.2 422.57 L 772.2 411.56 C 772.2 411.15 771.85 410.8 771.44 410.8 Z M 756.08 414.82 L 756.08 416.19 L 755.97 416.19 C 755.55 416.19 755.21 416.54 755.21 416.95 L 755.21 417.08 L 
753.82 417.08 L 753.82 417.95 L 755.21 417.95 L 755.21 419.32 L 753.82 419.32 L 753.82 420.2 L 755.21 420.2 L 755.21 421.56 L 753.82 421.56 L 753.82 422.44 L 755.21 422.44 L 755.21 423.81 L 753.82 423.81 L 753.82 424.69 L 755.21 424.69 L 755.21 426.05 L 753.82 426.05 L 753.82 426.92 L 755.21 426.92 L 755.21 427.05 C 755.21 427.47 755.55 427.81 755.97 427.81 L 756.08 427.81 L 756.08 429.19 L 756.96 429.19 L 756.96 427.81 L 758.33 427.81 L 758.33 429.19 L 759.2 429.19 L 759.2 427.81 L 760.56 427.81 L 760.56 429.19 L 761.44 429.19 L 761.44 427.81 L 762.79 427.81 L 762.79 429.19 L 763.67 429.19 L 763.67 427.81 L 765.05 427.81 L 765.05 429.19 L 765.93 429.19 L 765.93 427.81 L 766.05 427.81 C 766.47 427.81 766.81 427.47 766.81 427.05 L 766.81 426.92 L 768.16 426.92 L 768.16 426.05 L 766.81 426.05 L 766.81 424.69 L 768.16 424.69 L 768.16 423.81 L 766.81 423.81 L 766.81 422.44 L 768.16 422.44 L 768.16 421.56 L 766.81 421.56 L 766.81 420.2 L 768.16 420.2 L 768.16 419.32 L 766.81 419.32 L 766.81 417.95 L 768.16 417.95 L 768.16 417.08 L 766.81 417.08 L 766.81 416.95 C 766.81 416.54 766.47 416.19 766.05 416.19 L 765.93 416.19 L 765.93 414.82 L 765.05 414.82 L 765.05 416.19 L 763.67 416.19 L 763.67 414.82 L 762.79 414.82 L 762.79 416.19 L 761.44 416.19 L 761.44 414.82 L 760.56 414.82 L 760.56 416.19 L 759.2 416.19 L 759.2 414.82 L 758.33 414.82 L 758.33 416.19 L 756.96 416.19 L 756.96 414.82 Z M 756.08 417.07 L 765.94 417.07 L 765.94 426.94 L 756.08 426.94 Z M 750.56 420.67 C 750.15 420.67 749.8 421.02 749.8 421.43 L 749.8 432.44 C 749.8 432.85 750.15 433.2 750.56 433.2 L 761.58 433.2 C 761.99 433.2 762.34 432.85 762.34 432.44 L 762.34 430.07 L 761.46 430.07 L 761.46 432.33 L 750.68 432.33 L 750.68 421.55 L 752.91 421.55 L 752.91 420.67 Z" fill="#ffffff" stroke="none" pointer-events="none"/><rect x="506.5" y="292.5" width="371" height="180" fill="none" stroke="#d79b00" stroke-opacity="0.4" stroke-dasharray="3 3" pointer-events="none"/><rect x="506.5" y="292.5" width="371" height="180" fill="none" stroke="#d79b00" stroke-opacity="0.4" stroke-dasharray="3 3" pointer-events="none"/><g transform="translate(597.5,296.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="188" height="171" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 189px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><b style="">Capture Proxy ALB Target Group<br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /></b></div></div></foreignObject><text x="94" y="92" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><rect x="669.5" y="363" width="131" height="49" rx="1" ry="1" fill="#000000" stroke="#000000" transform="translate(2,3)" opacity="0.25"/><rect x="669.5" y="363" width="131" height="49" rx="1" ry="1" fill="#ffffff" stroke="#dddddd" pointer-events="none"/><g transform="translate(671.5,381.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="127" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 
0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div style="text-align: right;">            Capture Proxy 2</div></div></div></foreignObject><text x="64" y="12" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 678 373 L 706 373 L 706 401 L 678 401 Z" fill="url(#mx-gradient-f78e04-1-d05c17-1-s-0)" stroke="#ffffff" stroke-miterlimit="10" pointer-events="none"/><path d="M 691.42 375.8 C 691.01 375.8 690.66 376.15 690.66 376.56 L 690.66 378.92 L 691.54 378.92 L 691.54 376.67 L 702.32 376.67 L 702.32 387.45 L 700.07 387.45 L 700.07 388.33 L 702.44 388.33 C 702.85 388.33 703.2 387.98 703.2 387.57 L 703.2 376.56 C 703.2 376.15 702.85 375.8 702.44 375.8 Z M 687.08 379.82 L 687.08 381.19 L 686.97 381.19 C 686.55 381.19 686.21 381.54 686.21 381.95 L 686.21 382.08 L 684.82 382.08 L 684.82 382.95 L 686.21 382.95 L 686.21 384.32 L 684.82 384.32 L 684.82 385.2 L 686.21 385.2 L 686.21 386.56 L 684.82 386.56 L 684.82 387.44 L 686.21 387.44 L 686.21 388.81 L 684.82 388.81 L 684.82 389.69 L 686.21 389.69 L 686.21 391.05 L 684.82 391.05 L 684.82 391.92 L 686.21 391.92 L 686.21 392.05 C 686.21 392.47 686.55 392.81 686.97 392.81 L 687.08 392.81 L 687.08 394.19 L 687.96 394.19 L 687.96 392.81 L 689.33 392.81 L 689.33 394.19 L 690.2 394.19 L 690.2 392.81 L 691.56 392.81 L 691.56 394.19 L 692.44 394.19 L 692.44 392.81 L 693.79 392.81 L 693.79 394.19 L 694.67 394.19 L 694.67 392.81 L 696.05 392.81 L 696.05 394.19 L 696.93 394.19 L 696.93 392.81 L 697.05 392.81 C 697.47 392.81 697.81 392.47 697.81 392.05 L 697.81 391.92 L 699.16 391.92 L 699.16 391.05 L 697.81 391.05 L 697.81 389.69 L 699.16 389.69 L 699.16 388.81 L 697.81 388.81 L 697.81 387.44 L 699.16 387.44 L 699.16 386.56 L 697.81 386.56 L 697.81 385.2 L 699.16 385.2 L 699.16 384.32 L 697.81 384.32 L 697.81 382.95 L 699.16 382.95 L 699.16 382.08 L 697.81 382.08 L 697.81 381.95 C 697.81 381.54 697.47 381.19 697.05 381.19 L 696.93 381.19 L 696.93 379.82 L 696.05 379.82 L 696.05 381.19 L 694.67 381.19 L 694.67 379.82 L 693.79 379.82 L 693.79 381.19 L 692.44 381.19 L 692.44 379.82 L 691.56 379.82 L 691.56 381.19 L 690.2 381.19 L 690.2 379.82 L 689.33 379.82 L 689.33 381.19 L 687.96 381.19 L 687.96 379.82 Z M 687.08 382.07 L 696.94 382.07 L 696.94 391.94 L 687.08 391.94 Z M 681.56 385.67 C 681.15 385.67 680.8 386.02 680.8 386.43 L 680.8 397.44 C 680.8 397.85 681.15 398.2 681.56 398.2 L 692.58 398.2 C 692.99 398.2 693.34 397.85 693.34 397.44 L 693.34 395.07 L 692.46 395.07 L 692.46 397.33 L 681.68 397.33 L 681.68 386.55 L 683.91 386.55 L 683.91 385.67 Z" fill="#ffffff" stroke="none" pointer-events="none"/><rect x="604" y="324.5" width="131" height="49" rx="1" ry="1" fill="#000000" stroke="#000000" transform="translate(2,3)" opacity="0.25"/><rect x="604" y="324.5" width="131" height="49" rx="1" ry="1" fill="#ffffff" stroke="#dddddd" pointer-events="none"/><g transform="translate(604.5,328.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="130" height="41" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" 
style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div style="text-align: right;">            </div><div style="text-align: right;">             Capture Proxy 1</div><div style="text-align: right;"><br /></div></div></div></foreignObject><text x="65" y="27" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 611.5 335 L 639.5 335 L 639.5 363 L 611.5 363 Z" fill="url(#mx-gradient-f78e04-1-d05c17-1-s-0)" stroke="#ffffff" stroke-miterlimit="10" pointer-events="none"/><path d="M 624.92 337.8 C 624.51 337.8 624.16 338.15 624.16 338.56 L 624.16 340.92 L 625.04 340.92 L 625.04 338.67 L 635.82 338.67 L 635.82 349.45 L 633.57 349.45 L 633.57 350.33 L 635.94 350.33 C 636.35 350.33 636.7 349.98 636.7 349.57 L 636.7 338.56 C 636.7 338.15 636.35 337.8 635.94 337.8 Z M 620.58 341.82 L 620.58 343.19 L 620.47 343.19 C 620.05 343.19 619.71 343.54 619.71 343.95 L 619.71 344.08 L 618.32 344.08 L 618.32 344.95 L 619.71 344.95 L 619.71 346.32 L 618.32 346.32 L 618.32 347.2 L 619.71 347.2 L 619.71 348.56 L 618.32 348.56 L 618.32 349.44 L 619.71 349.44 L 619.71 350.81 L 618.32 350.81 L 618.32 351.69 L 619.71 351.69 L 619.71 353.05 L 618.32 353.05 L 618.32 353.92 L 619.71 353.92 L 619.71 354.05 C 619.71 354.47 620.05 354.81 620.47 354.81 L 620.58 354.81 L 620.58 356.19 L 621.46 356.19 L 621.46 354.81 L 622.83 354.81 L 622.83 356.19 L 623.7 356.19 L 623.7 354.81 L 625.06 354.81 L 625.06 356.19 L 625.94 356.19 L 625.94 354.81 L 627.29 354.81 L 627.29 356.19 L 628.17 356.19 L 628.17 354.81 L 629.55 354.81 L 629.55 356.19 L 630.43 356.19 L 630.43 354.81 L 630.55 354.81 C 630.97 354.81 631.31 354.47 631.31 354.05 L 631.31 353.92 L 632.66 353.92 L 632.66 353.05 L 631.31 353.05 L 631.31 351.69 L 632.66 351.69 L 632.66 350.81 L 631.31 350.81 L 631.31 349.44 L 632.66 349.44 L 632.66 348.56 L 631.31 348.56 L 631.31 347.2 L 632.66 347.2 L 632.66 346.32 L 631.31 346.32 L 631.31 344.95 L 632.66 344.95 L 632.66 344.08 L 631.31 344.08 L 631.31 343.95 C 631.31 343.54 630.97 343.19 630.55 343.19 L 630.43 343.19 L 630.43 341.82 L 629.55 341.82 L 629.55 343.19 L 628.17 343.19 L 628.17 341.82 L 627.29 341.82 L 627.29 343.19 L 625.94 343.19 L 625.94 341.82 L 625.06 341.82 L 625.06 343.19 L 623.7 343.19 L 623.7 341.82 L 622.83 341.82 L 622.83 343.19 L 621.46 343.19 L 621.46 341.82 Z M 620.58 344.07 L 630.44 344.07 L 630.44 353.94 L 620.58 353.94 Z M 615.06 347.67 C 614.65 347.67 614.3 348.02 614.3 348.43 L 614.3 359.44 C 614.3 359.85 614.65 360.2 615.06 360.2 L 626.08 360.2 C 626.49 360.2 626.84 359.85 626.84 359.44 L 626.84 357.07 L 625.96 357.07 L 625.96 359.33 L 615.18 359.33 L 615.18 348.55 L 617.41 348.55 L 617.41 347.67 Z" fill="#ffffff" stroke="none" pointer-events="none"/><path d="M 526 300.5 L 574 300.5 L 574 348.5 L 526 348.5 Z" fill="url(#mx-gradient-f78e04-1-d05c17-1-s-0)" stroke="#ffffff" stroke-miterlimit="10" pointer-events="none"/><path d="M 548.63 305.31 C 548.51 305.31 548.38 305.35 548.28 305.41 L 533.37 314.01 C 533.14 314.14 532.99 314.39 532.99 314.66 L 532.99 333.42 C 532.99 333.68 533.14 333.93 533.37 334.07 L 549.59 343.56 C 549.82 343.7 550.11 343.7 550.35 343.56 L 565.23 334.9 C 565.47 334.77 565.61 334.52 565.61 334.25 C 565.61 333.98 565.46 333.73 565.22 333.59 L 558.08 329.57 C 557.85 329.43 557.56 329.44 557.33 329.57 L 550.05 333.84 L 541.56 328.95 L 541.56 319.17 L 549.03 314.89 C 549.26 314.76 549.41 314.51 549.41 314.24 L 549.41 306.06 C 549.41 305.86 549.32 305.66 549.18 305.52 C 549.03 305.38 548.83 
305.3 548.63 305.31 Z M 551.48 305.34 C 551.27 305.33 551.08 305.41 550.93 305.55 C 550.78 305.69 550.7 305.89 550.7 306.09 L 550.7 314.28 C 550.7 314.55 550.85 314.79 551.08 314.93 L 558.42 319.2 L 558.42 327.76 C 558.42 328.03 558.56 328.28 558.8 328.41 L 565.87 332.52 C 566.1 332.66 566.39 332.66 566.63 332.52 C 566.86 332.39 567.01 332.14 567 331.87 L 567 314.68 C 567.01 314.41 566.86 314.16 566.63 314.03 L 551.84 305.44 C 551.73 305.37 551.6 305.34 551.48 305.34 Z M 547.9 307.37 L 547.9 313.8 L 540.43 318.08 C 540.19 318.21 540.05 318.46 540.05 318.73 L 540.05 329.38 C 540.05 329.65 540.19 329.9 540.42 330.04 L 549.67 335.37 C 549.91 335.5 550.2 335.5 550.43 335.36 L 557.72 331.09 L 563.34 334.26 L 549.97 342.04 L 534.5 332.98 L 534.5 315.09 Z M 552.21 307.4 L 565.5 315.11 L 565.5 330.56 L 559.93 327.33 L 559.93 318.77 C 559.93 318.5 559.78 318.25 559.55 318.11 L 552.21 313.84 Z" fill="#ffffff" stroke="none" pointer-events="none"/><g transform="translate(513.5,356.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="72" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(35, 47, 62); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div>Amazon ECS</div></div></div></foreignObject><text x="36" y="12" fill="#232F3E" text-anchor="middle" font-size="12px" font-family="Arial"><div>Amazon ECS</div></text></switch></g><path d="M 1107 445 L 1107 476 L 966 476 L 966 502.03" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><path d="M 962 496.26 L 966 504.26 L 970 496.26" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><path d="M 193 351 L 241 351 L 241 399 L 193 399 Z" fill="url(#mx-gradient-60a337-1-277116-1-s-0)" stroke="#ffffff" stroke-miterlimit="10" pointer-events="none"/><path d="M 215.88 355.82 C 211.9 355.8 207.99 356.4 204.9 357.35 C 203.25 357.86 201.83 358.46 200.74 359.15 C 199.65 359.83 198.84 360.6 198.6 361.61 C 198.58 361.68 198.57 361.74 198.58 361.81 L 198.58 361.81 C 198.58 361.81 198.58 361.81 198.58 361.81 C 198.58 361.83 198.58 361.86 198.58 361.89 C 198.59 362.02 198.61 362.14 198.63 362.27 L 202.45 390.14 C 202.45 390.16 202.46 390.18 202.46 390.2 C 202.65 391.05 203.33 391.62 204.16 392.08 C 204.98 392.54 206.03 392.92 207.22 393.23 C 209.59 393.84 212.51 394.18 215.18 394.17 C 218.57 394.2 221.63 393.91 224.04 393.37 C 226.45 392.83 228.25 392.11 229.14 390.85 C 229.21 390.75 229.26 390.64 229.27 390.52 L 230.96 378.49 C 231.35 378.59 231.74 378.67 232.11 378.74 C 232.79 378.86 233.42 378.92 234.02 378.81 C 234.31 378.75 234.61 378.65 234.88 378.43 C 235.14 378.21 235.34 377.88 235.4 377.53 C 235.4 377.49 235.4 377.45 235.41 377.42 C 235.43 376.38 234.68 375.69 233.88 375.07 C 233.2 374.53 232.39 374.06 231.64 373.68 L 233.29 361.85 C 233.31 361.76 233.3 361.66 233.28 361.57 C 233.02 360.53 232.17 359.77 231.08 359.1 C 229.99 358.44 228.59 357.87 227.02 357.39 C 223.87 356.43 220.04 355.85 216.68 355.83 L 216.68 355.83 C 216.41 355.82 216.15 355.82 215.88 355.82 Z M 215.88 357.32 C 216.13 357.32 216.39 357.32 216.64 357.33 L 216.65 357.33 C 216.65 357.33 216.66 357.33 216.66 357.33 C 219.85 357.35 223.6 357.92 226.58 358.83 C 228.07 359.28 229.37 359.82 230.3 360.38 
C 231.16 360.91 231.63 361.45 231.77 361.83 C 231.64 362.26 231.25 362.73 230.53 363.2 C 229.69 363.73 228.46 364.23 226.99 364.63 C 224.05 365.44 220.13 365.88 216.29 365.88 C 216.29 365.88 216.28 365.88 216.28 365.88 C 212.73 365.95 208.55 365.55 205.31 364.75 C 203.69 364.35 202.31 363.85 201.38 363.29 C 200.6 362.82 200.21 362.38 200.1 361.98 L 200.09 361.89 C 200.21 361.52 200.66 360.97 201.54 360.41 C 202.46 359.83 203.78 359.26 205.34 358.78 C 208.26 357.89 212.06 357.3 215.88 357.32 Z M 231.42 364.4 L 229.71 376.58 C 229.31 376.46 228.92 376.34 228.56 376.24 C 225.37 375.11 220.68 373.11 217.62 371.6 C 217.62 371.59 217.62 371.58 217.62 371.57 C 217.62 370.63 216.84 369.85 215.9 369.85 C 214.96 369.85 214.18 370.63 214.18 371.57 C 214.18 372.52 214.96 373.3 215.9 373.3 C 216.29 373.3 216.66 373.16 216.95 372.94 C 220.08 374.48 224.79 376.49 228.07 377.65 C 228.08 377.66 228.1 377.66 228.11 377.66 C 228.52 377.79 229 377.94 229.5 378.08 L 227.83 390.06 C 227.34 390.66 225.89 391.42 223.71 391.91 C 221.45 392.42 218.49 392.71 215.18 392.68 C 215.17 392.68 215.17 392.68 215.17 392.68 C 212.64 392.69 209.8 392.35 207.59 391.78 C 206.48 391.49 205.54 391.14 204.89 390.78 C 204.24 390.41 203.96 390.05 203.92 389.88 L 200.45 364.48 C 200.5 364.51 200.56 364.55 200.61 364.58 C 201.75 365.26 203.24 365.79 204.95 366.21 C 208.37 367.05 212.62 367.45 216.3 367.38 C 220.25 367.38 224.25 366.94 227.39 366.08 C 228.95 365.65 230.3 365.12 231.34 364.45 C 231.37 364.44 231.39 364.41 231.42 364.4 Z M 215.9 371.35 C 216.04 371.35 216.13 371.44 216.13 371.57 C 216.13 371.71 216.04 371.8 215.9 371.8 C 215.77 371.8 215.68 371.71 215.68 371.57 C 215.68 371.44 215.77 371.35 215.9 371.35 Z M 231.41 375.26 C 231.96 375.55 232.51 375.9 232.96 376.25 C 233.57 376.73 233.8 377.17 233.84 377.31 C 233.81 377.32 233.81 377.33 233.74 377.34 C 233.47 377.39 232.96 377.37 232.37 377.27 C 232 377.2 231.59 377.1 231.17 377 Z" fill="#ffffff" stroke="none" pointer-events="none"/><g transform="translate(185.5,406.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="62" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(35, 47, 62); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;">Amazon S3</div></div></foreignObject><text x="31" y="12" fill="#232F3E" text-anchor="middle" font-size="12px" font-family="Arial">Amazon S3</text></switch></g><g transform="translate(343.5,460.5)rotate(90,0,6)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; font-weight: bold; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font><br /></font></div></div></foreignObject><text x="0" y="12" fill="#ffffff" text-anchor="middle" font-size="12px" font-family="Arial" font-weight="bold"><font><br></font></text></switch></g><path d="M 692 293 L 692 253 L 683 253 L 683 221.47" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" 
pointer-events="none"/><path d="M 687 227.24 L 683 219.24 L 679 227.24" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><ellipse cx="690.5" cy="261" rx="15" ry="15" fill="#000000" stroke="#000000" pointer-events="none"/><g transform="translate(685.5,250.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="10" height="20" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 11px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><div style="font-size: 18px;"><font style="font-size: 18px;"><b><font color="#ffffff" style="font-size: 18px;">2</font></b></font></div></div></div></foreignObject><text x="5" y="16" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 878 383 L 987 383 L 1032.03 383" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><path d="M 1026.26 387 L 1034.26 383 L 1026.26 379" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><ellipse cx="957" cy="382.5" rx="15" ry="15" fill="#000000" stroke="#000000" pointer-events="none"/><g transform="translate(951.5,370.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="10" height="23" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 11px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><div style="font-size: 21px;"><font size="1"><b><font color="#ffffff" style="font-size: 18px;">2</font></b></font></div></div></div></foreignObject><text x="5" y="18" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><ellipse cx="457" cy="378.5" rx="15" ry="15" fill="#000000" stroke="#000000" pointer-events="none"/><g transform="translate(451.5,368.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="10" height="20" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 11px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><div style="font-size: 18px;"><font style="font-size: 18px;"><b><font color="#ffffff" style="font-size: 18px;">2</font></b></font></div></div></div></foreignObject><text x="5" y="16" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><rect x="499.5" y="759.5" width="341" height="140" fill="none" stroke="#3333ff" stroke-width="2" stroke-dasharray="6 6" pointer-events="none"/><g 
transform="translate(669.5,772.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="113" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 1px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><br /><br /><br /><br /><br /><br /><br /><br /></div></div></foreignObject><text x="0" y="63" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial"><br><br><br><br><br><br><br><br></text></switch></g><image x="645" y="794" width="64" height="64" xlink:href="data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHhtbG5zOnhsaW5rPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5L3hsaW5rIiB3aWR0aD0iNjRweCIgaGVpZ2h0PSI2NHB4IiB2aWV3Qm94PSIwIDAgNjQgNjQiIHZlcnNpb249IjEuMSI+JiN4YTsgICAgPHRpdGxlPkljb24tQXJjaGl0ZWN0dXJlLzQ4L0FyY2hfQW1hem9uLU9wZW5TZWFyY2gtU2VydmljZV80ODwvdGl0bGU+JiN4YTsgICAgPGcgaWQ9Ikljb24tQXJjaGl0ZWN0dXJlLzQ4L0FyY2hfQW1hem9uLU9wZW5TZWFyY2gtU2VydmljZV80OCIgc3Ryb2tlPSJub25lIiBzdHJva2Utd2lkdGg9IjEiIGZpbGw9Im5vbmUiIGZpbGwtcnVsZT0iZXZlbm9kZCI+JiN4YTsgICAgICAgIDxnIGlkPSJJY29uLUFyY2hpdGVjdHVyZS1CRy80OC9BbmFseXRpY3MiIGZpbGw9IiM4QzRGRkYiPiYjeGE7ICAgICAgICAgICAgPHJlY3QgaWQ9IlJlY3RhbmdsZSIgeD0iMCIgeT0iMCIgd2lkdGg9IjY0IiBoZWlnaHQ9IjY0Ii8+JiN4YTsgICAgICAgIDwvZz4mI3hhOyAgICAgICAgPHBhdGggZD0iTTM1LDQxIEMzMi4yNDMsNDEgMzAsMzguNzU3IDMwLDM2IEMzMCwzMy4yNDMgMzIuMjQzLDMxIDM1LDMxIEMzNy43NTcsMzEgNDAsMzMuMjQzIDQwLDM2IEM0MCwzOC43NTcgMzcuNzU3LDQxIDM1LDQxIEwzNSw0MSBaIE0zNSwyOSBDMzEuMTQxLDI5IDI4LDMyLjE0IDI4LDM2IEMyOCwzOS44NiAzMS4xNDEsNDMgMzUsNDMgQzM4Ljg1OSw0MyA0MiwzOS44NiA0MiwzNiBDNDIsMzIuMTQgMzguODU5LDI5IDM1LDI5IEwzNSwyOSBaIE00OS43MSw0OC45MTkgQzQ5LjI5MSw0OS4zODQgNDguNTY5LDQ5LjQyIDQ4LjEwNiw0OS4wMDQgTDQyLjU0Myw0My45ODUgQzQzLjA5Myw0My40NjYgNDMuNTkxLDQyLjg5NCA0NC4wMjQsNDIuMjcxIEw0OS42MjUsNDcuMzE2IEM1MC4wODksNDcuNzM2IDUwLjEyNyw0OC40NTcgNDkuNzEsNDguOTE5IEw0OS43MSw0OC45MTkgWiBNMjYsMzYgQzI2LDMxLjAzOCAzMC4wMzcsMjcgMzUsMjcgQzM5Ljk2MywyNyA0NCwzMS4wMzggNDQsMzYgQzQ0LDQwLjk2MiAzOS45NjMsNDUgMzUsNDUgQzMwLjAzNyw0NSAyNiw0MC45NjIgMjYsMzYgTDI2LDM2IFogTTQ1LjAzMyw0MC40ODggQzQ1LjY0OSwzOS4xMTYgNDYsMzcuNiA0NiwzNiBDNDYsMjkuOTM1IDQxLjA2NSwyNSAzNSwyNSBDMjguOTM1LDI1IDI0LDI5LjkzNSAyNCwzNiBDMjQsNDIuMDY1IDI4LjkzNSw0NyAzNSw0NyBDMzcuMTkxLDQ3IDM5LjIzLDQ2LjM0OSA0MC45NDcsNDUuMjQgTDQ2Ljc2OSw1MC40OTEgQzQ3LjM2Niw1MS4wMjcgNDguMTE2LDUxLjI5MiA0OC44NjMsNTEuMjkyIEM0OS43MjIsNTEuMjkyIDUwLjU3Nyw1MC45NDMgNTEuMTk2LDUwLjI1OSBDNTIuMzQ5LDQ4Ljk3NSA1Mi4yNDYsNDYuOTkgNTAuOTY1LDQ1LjgzMiBMNDUuMDMzLDQwLjQ4OCBaIE01MywxNyBMNTMsNDMgTDUxLDQzIEw1MSwxOCBMNDYsMTggTDQ2LDI2IEw0NCwyNiBMNDQsMTcgQzQ0LDE2LjQ0OCA0NC40NDcsMTYgNDUsMTYgTDUyLDE2IEM1Mi41NTMsMTYgNTMsMTYuNDQ4IDUzLDE3IEw1MywxNyBaIE00MCw1MCBMNDIsNTAgTDQyLDUzIEM0Miw1My41NTIgNDEuNTUzLDU0IDQxLDU0IEwzNCw1NCBDMzMuNDQ3LDU0IDMzLDUzLjU1MiAzMyw1MyBMMzMsNDkgTDM1LDQ5IEwzNSw1MiBMNDAsNTIgTDQwLDUwIFogTTM1LDIyIEwzMywyMiBMMzMsMTIgQzMzLDExLjQ0OCAzMy40NDcsMTEgMzQsMTEgTDQxLDExIEM0MS41NTMsMTEgNDIsMTEuNDQ4IDQyLDEyIEw0MiwyMyBMNDAsMjMgTDQwLDEzIEwzNSwxMyBMMzUsMjIgWiBNMjksNDggTDMxLDQ4IEwzMSw1MyBDMzEsNTMuNTUyIDMwLjU1Myw1NCAzMCw1NCBMMjMsNTQgQzIyLjQ0Nyw1NCAyMiw1My41NTIgMjIsNTMgTDIyLDQzIEwyNCw0MyBMMjQsNTIgTDI5LDUyIEwyOSw0OCBaIE0yNCwyOCBMMjIsMjggTDIyLDE5IEMyMiwxOC40NDggMjIuNDQ3LDE4IDIzLDE4IEwzMCwxOCBDMzAuNTUzLDE4IDMxLDE4LjQ0OCAzMSwxOSBMMzEsMjMgTDI5LDIzIEwyOSwyMCBMMj
QsMjAgTDI0LDI4IFogTTEzLDUyIEwxOCw1MiBMMTgsMjYgTDEzLDI2IEwxMyw1MiBaIE0xOSwyNCBMMTIsMjQgQzExLjQ0NywyNCAxMSwyNC40NDggMTEsMjUgTDExLDUzIEMxMSw1My41NTIgMTEuNDQ3LDU0IDEyLDU0IEwxOSw1NCBDMTkuNTUzLDU0IDIwLDUzLjU1MiAyMCw1MyBMMjAsMjUgQzIwLDI0LjQ0OCAxOS41NTMsMjQgMTksMjQgTDE5LDI0IFoiIGlkPSJBbWF6b24tT3BlblNlYXJjaC1TZXJ2aWNlX0ljb25fNDhfU3F1aWQiIGZpbGw9IiNGRkZGRkYiLz4mI3hhOyAgICA8L2c+JiN4YTs8L3N2Zz4" preserveAspectRatio="none" pointer-events="none"/><g transform="translate(568.5,866.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="218" height="27" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div><font>Amazon OpenSearch or</font></div><div><font>Elasticsearch/OpenSearch self-managed</font></div></div></div></foreignObject><text x="109" y="20" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><g transform="translate(620.5,767.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="107" height="17" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 14px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font color="#000000" style="font-size: 16px;"><b>Target Cluster</b><br /></font></div></div></foreignObject><text x="54" y="16" fill="#ffffff" text-anchor="middle" font-size="14px" font-family="Arial">[Not supported by viewer]</text></switch></g><g transform="translate(527.5,747.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="17" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 14px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div style="font-size: 16px;"></div><b style="font-size: 16px;"><font color="#1a1a1a" style="font-size: 16px;"><br /></font></b></div></div></foreignObject><text x="0" y="16" fill="#ffffff" text-anchor="middle" font-size="14px" font-family="Arial"><div style="font-size: 16px;"></div><b style="font-size: 16px;"><font color="#1a1a1a" style="font-size: 16px;"><br></font></b></text></switch></g><g transform="translate(707.5,828.5)rotate(90,0,7.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="15" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; font-weight: bold; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font style="font-size: 14px;"><br 
/></font></div></div></foreignObject><text x="0" y="14" fill="#ffffff" text-anchor="middle" font-size="12px" font-family="Arial" font-weight="bold"><font style="font-size: 14px;"><br></font></text></switch></g><g transform="translate(841.5,883.5)rotate(90,0,7.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="15" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; font-weight: bold; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font style="font-size: 14px;"><br /></font></div></div></foreignObject><text x="0" y="14" fill="#ffffff" text-anchor="middle" font-size="12px" font-family="Arial" font-weight="bold"><font style="font-size: 14px;"><br></font></text></switch></g><g transform="translate(841.5,850.5)rotate(90,0,7.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="1" height="15" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(255, 255, 255); line-height: 1.2; vertical-align: top; white-space: nowrap; font-weight: bold; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><font style="font-size: 14px;"><br /></font></div></div></foreignObject><text x="0" y="14" fill="#ffffff" text-anchor="middle" font-size="12px" font-family="Arial" font-weight="bold"><font style="font-size: 14px;"><br></font></text></switch></g><ellipse cx="306" cy="472.5" rx="15" ry="15" fill="#000000" stroke="#000000" pointer-events="none"/><g transform="translate(300.5,462.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="10" height="20" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 11px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><div style="font-size: 18px;"><b><font color="#ffffff" style="font-size: 18px;">3</font></b></div></div></div></foreignObject><text x="5" y="16" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 966 686 L 966 723 L 755 723 L 755 755.53" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><path d="M 751 749.76 L 755 757.76 L 759 749.76" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><rect x="1036.5" y="304.5" width="140" height="140" fill="none" stroke="#3333ff" stroke-opacity="0.4" stroke-dasharray="3 3" pointer-events="none"/><rect x="1036.5" y="304.5" width="140" height="140" fill="none" stroke="#3333ff" stroke-opacity="0.4" stroke-dasharray="3 3" pointer-events="none"/><g transform="translate(1062.5,317.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="88" height="113" 
requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 89px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><b>Event Streamer<br /></b><br /><br /><br /><br /><br /><br /><br /></div></div></foreignObject><text x="44" y="63" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><image x="1082.22" y="349" width="48" height="48" xlink:href="data:image/svg+xml;base64,<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="64px" height="64px" viewBox="0 0 64 64" version="1.1">&#xa;    <title>Icon-Architecture/48/Arch_Amazon-Managed-Streaming-for-Apache-Kafka_48</title>&#xa;    <g id="Icon-Architecture/48/Arch_Amazon-Managed-Streaming-for-Apache-Kafka_48" stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">&#xa;        <g id="Icon-Architecture-BG/48/Analytics" fill="#8C4FFF">&#xa;            <rect id="Rectangle" x="0" y="0" width="64" height="64"/>&#xa;        </g>&#xa;        <path d="M29.3729149,29.6081196 L29.3729149,22.4904755 L30.9364574,22.4904755 L30.9364574,25.6533173 L33.0465481,22.4904755 L34.7771189,22.4904755 L32.3319834,25.8183091 L34.9787388,29.6081196 L33.1246264,29.6081196 L30.9364574,26.1682916 L30.9364574,29.6081196 L29.3729149,29.6081196 Z M53,39.3776311 L52.4761836,39.8366082 C50.291968,41.7705115 46.5975798,41.7695115 44.4094109,39.8326084 L43.3657314,38.9186541 C41.9306721,37.6487176 39.3244383,37.6477176 37.8933324,38.9146543 L36.8427345,39.8356082 C34.6604956,41.7695115 30.9621541,41.7685116 28.7759618,39.8326084 L27.7322823,38.9186541 C26.2962347,37.6477176 23.6909893,37.6497175 22.2608717,38.9146543 L21.2082972,39.8366082 C20.118166,40.8015599 18.6485151,41.2855357 17.1798525,41.2855357 C15.7092132,41.2855357 14.2375856,40.8015599 13.1425128,39.8326084 L12.6226497,39.3776311 L13.9153891,37.8647068 L14.4401938,38.3236838 C15.8742648,39.5926204 18.4795102,39.5936203 19.9116045,38.3276836 L20.9631906,37.4057297 C23.1454296,35.4738263 26.8427828,35.4738263 29.028975,37.4097295 L30.0736429,38.3236838 C31.5096905,39.5946203 34.1149359,39.5926204 35.5450535,38.3276836 L36.5966397,37.4067297 C38.7788786,35.4728264 42.4752435,35.4738263 44.6634124,37.4097295 L45.7061036,38.3236838 C47.1411629,39.5926204 49.7464083,39.5936203 51.1785026,38.3276836 L51.7072606,37.8647068 L53,39.3776311 Z M51.0846109,48.579171 L52.3773503,50.0910954 L51.8535339,50.5500725 C50.7624144,51.5160242 49.2927635,52 47.8241009,52 C46.3534616,52 44.8808456,51.5150242 43.7877495,50.5460727 L42.7430817,49.6331183 C41.307034,48.3611819 38.7017886,48.3631818 37.271671,49.6291185 L36.2200849,50.5500725 C34.0388342,52.4819759 30.3404927,52.4829759 28.1543005,50.5460727 L27.110621,49.6321184 C25.6755617,48.3621819 23.0693279,48.3621819 21.638222,49.6291185 L20.5866358,50.5500725 C18.4043969,52.4829759 14.708032,52.4829759 12.5198631,50.5460727 L12,50.0910954 L13.2927394,48.579171 L13.8175441,49.0381481 C15.2516151,50.3070846 17.8578488,50.3080846 19.2899431,49.0421479 L20.3405409,48.1211939 C22.5227799,46.1882906 26.2201331,46.1872906 28.4073137,48.1241938 L29.4509932,49.0381481 C30.8850641,50.3070846 33.4903095,50.3080846 34.9224038,49.0421479 L35.9749783,48.120194 
C38.1572172,46.1882906 41.8545704,46.1872906 44.0407627,48.1241938 L45.0844422,49.0381481 C46.5195015,50.3070846 49.1237586,50.3080846 50.5568412,49.0421479 L51.0846109,48.579171 Z M51.0846109,43.5654217 L52.3773503,45.0773461 L51.8535339,45.5363232 C50.7624144,46.5022749 49.2927635,46.9852507 47.8241009,46.9852507 C46.3534616,46.9852507 44.8808456,46.5012749 43.7877495,45.5323234 L42.7430817,44.619369 C41.307034,43.3484326 38.7017886,43.3494325 37.271671,44.6153692 L36.2200849,45.5363232 C34.0388342,47.4692265 30.3404927,47.4692265 28.1543005,45.5323234 L27.110621,44.6183691 C25.6755617,43.3484326 23.0693279,43.3464327 21.638222,44.6153692 L20.5866358,45.5363232 C18.4043969,47.4692265 14.708032,47.4692265 12.5198631,45.5323234 L12,45.0773461 L13.2927394,43.5654217 L13.8175441,44.0243988 C15.2516151,45.2943353 17.8578488,45.2943353 19.2899431,44.0273986 L20.3405409,43.1074446 C22.5237682,41.1725414 26.2221097,41.1745413 28.4073137,43.1104445 L29.4509932,44.0243988 C30.8850641,45.2933353 33.4903095,45.2943353 34.9224038,44.0273986 L35.9749783,43.1064447 C38.1572172,41.1735413 41.8545704,41.1725414 44.0407627,43.1104445 L45.0844422,44.0243988 C46.5195015,45.2943353 49.1237586,45.2943353 50.5568412,44.0273986 L51.0846109,43.5654217 Z M20.1458394,29.9991 C20.9631906,29.9991 21.6283386,30.6720664 21.6283386,31.499025 C21.6283386,32.3259837 20.9631906,32.9989501 20.1458394,32.9989501 C19.3284881,32.9989501 18.6633401,32.3259837 18.6633401,31.499025 C18.6633401,30.6720664 19.3284881,29.9991 20.1458394,29.9991 L20.1458394,29.9991 Z M32.0058336,13.9999 C32.8231848,13.9999 33.4883329,14.6728664 33.4883329,15.499825 C33.4883329,16.3267837 32.8231848,16.99975 32.0058336,16.99975 C31.1884823,16.99975 30.5233343,16.3267837 30.5233343,15.499825 C30.5233343,14.6728664 31.1884823,13.9999 32.0058336,13.9999 L32.0058336,13.9999 Z M45.3483271,31.499025 C45.3483271,32.3259837 44.6831791,32.9989501 43.8658278,32.9989501 C43.0484765,32.9989501 42.3833285,32.3259837 42.3833285,31.499025 C42.3833285,30.6720664 43.0484765,29.9991 43.8658278,29.9991 C44.6831791,29.9991 45.3483271,30.6720664 45.3483271,31.499025 L45.3483271,31.499025 Z M32.0058336,18.99965 C32.5434866,18.99965 33.0465481,18.8636568 33.5011812,18.6416679 L41.289244,29.1881406 C40.8504243,29.6871156 40.5549127,30.3080846 40.4570678,30.99905 L23.5545994,30.99905 C23.4567544,30.3080846 23.1612429,29.6871156 22.7224231,29.1881406 L30.510486,18.6416679 C30.9651191,18.8636568 31.4681805,18.99965 32.0058336,18.99965 L32.0058336,18.99965 Z M20.1458394,34.9988501 C21.5206104,34.9988501 22.7006798,34.1768912 23.2580995,32.9989501 L40.754556,32.9989501 C41.3109874,34.1768912 42.4910568,34.9988501 43.8658278,34.9988501 C45.7733102,34.9988501 47.3249928,33.4289286 47.3249928,31.499025 C47.3249928,29.5691215 45.7733102,27.9992 43.8658278,27.9992 C43.5564796,27.9992 43.2619564,28.0541973 42.9763282,28.1311934 L34.9649021,17.2837358 C35.276227,16.759762 35.4649986,16.1537923 35.4649986,15.499825 C35.4649986,13.5699215 33.913316,12 32.0058336,12 C30.0983512,12 28.5466686,13.5699215 28.5466686,15.499825 C28.5466686,16.1537923 28.7354402,16.759762 29.046765,17.2837358 L21.0353389,28.1311934 C20.7497107,28.0541973 20.4561759,27.9992 20.1458394,27.9992 C18.238357,27.9992 16.6866744,29.5691215 16.6866744,31.499025 C16.6866744,33.4289286 18.238357,34.9988501 20.1458394,34.9988501 L20.1458394,34.9988501 Z" id="Amazon-Managed-Streaming-for-Apache-Kafka-Icon_48_Squid" fill="#FFFFFF"/>&#xa;    </g>&#xa;</svg>" preserveAspectRatio="none" pointer-events="none"/><g 
transform="translate(1069.5,405.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="73" height="12" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; white-space: nowrap; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;"><div><font>Amazon MSK</font></div></div></div></foreignObject><text x="37" y="12" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><ellipse cx="882" cy="726" rx="15" ry="15" fill="#000000" stroke="#000000" pointer-events="none"/><g transform="translate(876.5,713.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="10" height="24" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 11px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><div style="font-size: 22px;"><font size="1"><b><font color="#ffffff" style="font-size: 18px;">4</font></b></font></div></div></div></foreignObject><text x="5" y="18" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><path d="M 402 686 L 402 723 L 585 723 L 585 755.53" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><path d="M 581 749.76 L 585 757.76 L 589 749.76" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="none"/><ellipse cx="480" cy="722.5" rx="15" ry="15" fill="#000000" stroke="#000000" pointer-events="none"/><g transform="translate(474.5,710.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="10" height="23" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 11px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><div style="font-size: 21px;"><font size="1"><b><font color="#ffffff" style="font-size: 18px;">3</font></b></font></div></div></div></foreignObject><text x="5" y="18" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><ellipse cx="29" cy="241" rx="15" ry="15" fill="#000000" stroke="#000000" pointer-events="none"/><g transform="translate(23.5,230.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="10" height="20" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 11px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" 
style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><div style="font-size: 18px;"><b><font color="#ffffff" style="font-size: 18px;">6</font></b></div></div></div></foreignObject><text x="5" y="16" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g><ellipse cx="1036.5" cy="472.5" rx="15" ry="15" fill="#000000" stroke="#000000" pointer-events="none"/><g transform="translate(1031.5,460.5)"><switch><foreignObject style="overflow:visible;" pointer-events="none" width="10" height="24" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; font-size: 12px; font-family: Arial; color: rgb(0, 0, 0); line-height: 1.2; vertical-align: top; width: 11px; white-space: nowrap; overflow-wrap: normal; text-align: center;"><div xmlns="http://www.w3.org/1999/xhtml" style="display:inline-block;text-align:inherit;text-decoration:inherit;white-space:normal;"><div style="font-size: 22px;"><font size="1"><b><font color="#ffffff" style="font-size: 18px;">4</font></b></font></div></div></div></foreignObject><text x="5" y="18" fill="#000000" text-anchor="middle" font-size="12px" font-family="Arial">[Not supported by viewer]</text></switch></g></g></svg> \ No newline at end of file diff --git a/images/migrations/migrations-architecture-overview.png b/images/migrations/migrations-architecture-overview.png new file mode 100644 index 00000000000..3002da3a871 Binary files /dev/null and b/images/migrations/migrations-architecture-overview.png differ diff --git a/images/nyc-token-graph.png b/images/nyc-token-graph.png new file mode 100644 index 00000000000..1f2a6296ce3 Binary files /dev/null and b/images/nyc-token-graph.png differ diff --git a/images/search-relevance-workbench/activate_frontend_plugin.png b/images/search-relevance-workbench/activate_frontend_plugin.png new file mode 100644 index 00000000000..059f8c6949a Binary files /dev/null and b/images/search-relevance-workbench/activate_frontend_plugin.png differ diff --git a/images/search-relevance-workbench/aggregate_metrics_comparison_experiment.png b/images/search-relevance-workbench/aggregate_metrics_comparison_experiment.png new file mode 100644 index 00000000000..058516b127a Binary files /dev/null and b/images/search-relevance-workbench/aggregate_metrics_comparison_experiment.png differ diff --git a/images/search-relevance-workbench/comparing-search-results-query-sets.png b/images/search-relevance-workbench/comparing-search-results-query-sets.png new file mode 100644 index 00000000000..9fea08dc492 Binary files /dev/null and b/images/search-relevance-workbench/comparing-search-results-query-sets.png differ diff --git a/images/search-relevance-workbench/comparing_search_results.png b/images/search-relevance-workbench/comparing_search_results.png new file mode 100644 index 00000000000..eb32e3b71ba Binary files /dev/null and b/images/search-relevance-workbench/comparing_search_results.png differ diff --git a/images/search-relevance-workbench/experiment_overview_hybrid_search_optimization.png b/images/search-relevance-workbench/experiment_overview_hybrid_search_optimization.png new file mode 100644 index 00000000000..6479f792fdd Binary files /dev/null and b/images/search-relevance-workbench/experiment_overview_hybrid_search_optimization.png differ diff --git a/images/search-relevance-workbench/experiment_table_overview.png 
b/images/search-relevance-workbench/experiment_table_overview.png new file mode 100644 index 00000000000..f617f6c625d Binary files /dev/null and b/images/search-relevance-workbench/experiment_table_overview.png differ diff --git a/images/search-relevance-workbench/hybrid_search_optimization_query_overview.png b/images/search-relevance-workbench/hybrid_search_optimization_query_overview.png new file mode 100644 index 00000000000..51cf0fb6cc6 Binary files /dev/null and b/images/search-relevance-workbench/hybrid_search_optimization_query_overview.png differ diff --git a/images/search-relevance-workbench/hybrid_search_optimization_variant_parameters.png b/images/search-relevance-workbench/hybrid_search_optimization_variant_parameters.png new file mode 100644 index 00000000000..d58d061d96b Binary files /dev/null and b/images/search-relevance-workbench/hybrid_search_optimization_variant_parameters.png differ diff --git a/images/search-relevance-workbench/query_set_comparison_experiment_definition.png b/images/search-relevance-workbench/query_set_comparison_experiment_definition.png new file mode 100644 index 00000000000..6c96a9a1f4d Binary files /dev/null and b/images/search-relevance-workbench/query_set_comparison_experiment_definition.png differ diff --git a/images/search-relevance-workbench/select_query_set_comparison.png b/images/search-relevance-workbench/select_query_set_comparison.png new file mode 100644 index 00000000000..51a01339b16 Binary files /dev/null and b/images/search-relevance-workbench/select_query_set_comparison.png differ diff --git a/images/serial-diff-agg-result.png b/images/serial-diff-agg-result.png new file mode 100644 index 00000000000..c6f73eb6d0e Binary files /dev/null and b/images/serial-diff-agg-result.png differ diff --git a/images/ssd-token-graph.png b/images/ssd-token-graph.png new file mode 100644 index 00000000000..52c6bafd2ed Binary files /dev/null and b/images/ssd-token-graph.png differ diff --git a/images/star-tree-index.png b/images/star-tree-index.png new file mode 100644 index 00000000000..81309e1195f Binary files /dev/null and b/images/star-tree-index.png differ diff --git a/images/ta-gantt-mini-map.png b/images/ta-gantt-mini-map.png new file mode 100644 index 00000000000..71a4e47360e Binary files /dev/null and b/images/ta-gantt-mini-map.png differ diff --git a/images/ta-hierarchial-view.png b/images/ta-hierarchial-view.png new file mode 100644 index 00000000000..8e33b9267c1 Binary files /dev/null and b/images/ta-hierarchial-view.png differ diff --git a/images/ta-index-settings.png b/images/ta-index-settings.png new file mode 100644 index 00000000000..979a8dcb8bd Binary files /dev/null and b/images/ta-index-settings.png differ diff --git a/images/ta-service-map-dependencies.png b/images/ta-service-map-dependencies.png new file mode 100644 index 00000000000..d1ca72a4f7a Binary files /dev/null and b/images/ta-service-map-dependencies.png differ diff --git a/images/ta-service-table-icons.png b/images/ta-service-table-icons.png new file mode 100644 index 00000000000..9bf24afcbf3 Binary files /dev/null and b/images/ta-service-table-icons.png differ diff --git a/images/ta-span-kind.png b/images/ta-span-kind.png new file mode 100644 index 00000000000..0d164719fc4 Binary files /dev/null and b/images/ta-span-kind.png differ diff --git a/images/ta-trace-logs-correlation.png b/images/ta-trace-logs-correlation.png new file mode 100644 index 00000000000..da7ffe66439 Binary files /dev/null and b/images/ta-trace-logs-correlation.png differ diff --git 
a/images/ta-traces-page.png b/images/ta-traces-page.png new file mode 100644 index 00000000000..ebe484e2e6b Binary files /dev/null and b/images/ta-traces-page.png differ diff --git a/images/vector-search-tutorials/mapping_iam_role_arn.png b/images/vector-search-tutorials/mapping_iam_role_arn.png new file mode 100644 index 00000000000..047752bbc5c Binary files /dev/null and b/images/vector-search-tutorials/mapping_iam_role_arn.png differ diff --git a/images/vector-search-tutorials/semantic_search_bedrock_integration_1.png b/images/vector-search-tutorials/semantic_search_bedrock_integration_1.png new file mode 100644 index 00000000000..9b7b5e06b7d Binary files /dev/null and b/images/vector-search-tutorials/semantic_search_bedrock_integration_1.png differ diff --git a/images/vector-search-tutorials/semantic_search_bedrock_integration_2.png b/images/vector-search-tutorials/semantic_search_bedrock_integration_2.png new file mode 100644 index 00000000000..090810a0808 Binary files /dev/null and b/images/vector-search-tutorials/semantic_search_bedrock_integration_2.png differ diff --git a/images/vector-search-tutorials/semantic_search_remote_model_Integration_1.png b/images/vector-search-tutorials/semantic_search_remote_model_Integration_1.png new file mode 100644 index 00000000000..5873e54f978 Binary files /dev/null and b/images/vector-search-tutorials/semantic_search_remote_model_Integration_1.png differ diff --git a/images/vector-search-tutorials/semantic_search_remote_model_Integration_2.png b/images/vector-search-tutorials/semantic_search_remote_model_Integration_2.png new file mode 100644 index 00000000000..dd7cda449ce Binary files /dev/null and b/images/vector-search-tutorials/semantic_search_remote_model_Integration_2.png differ diff --git a/images/vector-search-tutorials/semantic_search_remote_model_Integration_3.png b/images/vector-search-tutorials/semantic_search_remote_model_Integration_3.png new file mode 100644 index 00000000000..ce1c9d356e8 Binary files /dev/null and b/images/vector-search-tutorials/semantic_search_remote_model_Integration_3.png differ diff --git a/images/vector-search/auto-vector-ingest.png b/images/vector-search/auto-vector-ingest.png new file mode 100644 index 00000000000..07550a3a95e Binary files /dev/null and b/images/vector-search/auto-vector-ingest.png differ diff --git a/images/vector-search/auto-vector-search.png b/images/vector-search/auto-vector-search.png new file mode 100644 index 00000000000..e80f37b308f Binary files /dev/null and b/images/vector-search/auto-vector-search.png differ diff --git a/images/vector-search/embeddings.png b/images/vector-search/embeddings.png new file mode 100644 index 00000000000..d627de1d0cb Binary files /dev/null and b/images/vector-search/embeddings.png differ diff --git a/images/vector-search/raw-vector-ingest.png b/images/vector-search/raw-vector-ingest.png new file mode 100644 index 00000000000..a1c0951bcc9 Binary files /dev/null and b/images/vector-search/raw-vector-ingest.png differ diff --git a/images/vector-search/raw-vector-search.png b/images/vector-search/raw-vector-search.png new file mode 100644 index 00000000000..873eb2f012a Binary files /dev/null and b/images/vector-search/raw-vector-search.png differ diff --git a/images/vector-search/vector-similarity.jpg b/images/vector-search/vector-similarity.jpg new file mode 100644 index 00000000000..5dcd8a8e5b8 Binary files /dev/null and b/images/vector-search/vector-similarity.jpg differ diff --git a/index.md b/index.md index 6fac0021db9..ed4d943d9fd 100755 --- 
a/index.md +++ b/index.md @@ -9,4 +9,4 @@ permalink: / {% include banner.html %} -{% include cards.html %} \ No newline at end of file +{% include home_cards.html %} \ No newline at end of file diff --git a/release-notes/opensearch-documentation-release-notes-2.18.0.md b/release-notes/opensearch-documentation-release-notes-2.18.0.md new file mode 100644 index 00000000000..30147a37f02 --- /dev/null +++ b/release-notes/opensearch-documentation-release-notes-2.18.0.md @@ -0,0 +1,39 @@ +# OpenSearch Documentation Website 2.18.0 Release Notes + +The OpenSearch 2.18.0 documentation includes the following additions and updates. + +## New documentation for 2.18.0 + +- Update SQL/PPL multiple value field limitation [#8646](https://github.com/opensearch-project/documentation-website/pull/8646) +- Add new use cases to ML Inference Search Response Processor [#8639](https://github.com/opensearch-project/documentation-website/pull/8639) +- Documentation for query field name and datatype in query shape [#8631](https://github.com/opensearch-project/documentation-website/pull/8631) +- add document for Query Insights health_stats API [#8627](https://github.com/opensearch-project/documentation-website/pull/8627) +- Add new indexing parameter and update performance tuning instruction [#8623](https://github.com/opensearch-project/documentation-website/pull/8623) +- Update default engine from nmslib to faiss [#8620](https://github.com/opensearch-project/documentation-website/pull/8620) +- Update documentation for coordination settings and batch size [#8604](https://github.com/opensearch-project/documentation-website/pull/8604) +- Update JDK version for 2.x distributions [#8603](https://github.com/opensearch-project/documentation-website/pull/8603) +- Add documentation for star tree index feature [#8598](https://github.com/opensearch-project/documentation-website/pull/8598) +- Add URI paths for cluster stats filtering. 
[#8595](https://github.com/opensearch-project/documentation-website/pull/8595) +- Adding documentation for _list APIs [#8594](https://github.com/opensearch-project/documentation-website/pull/8594) +- Adds documentation about byField rerank processor [#8593](https://github.com/opensearch-project/documentation-website/pull/8593) +- Updating tiered caching settings [#8592](https://github.com/opensearch-project/documentation-website/pull/8592) +- Add documentation changes for cluster level dynamic limit settings to block cat/indices, _cat/shards and _cat/segments [#8590](https://github.com/opensearch-project/documentation-website/pull/8590) +- Add doc for dynamic threadpool settings [#8588](https://github.com/opensearch-project/documentation-website/pull/8588) +- Add value range for the search backpressure settings [#8555](https://github.com/opensearch-project/documentation-website/pull/8555) +- Add new rename_alias parameters for restore-snapshot [#8544](https://github.com/opensearch-project/documentation-website/pull/8544) +- Add SQL PIT reference [#8541](https://github.com/opensearch-project/documentation-website/pull/8541) +- Document cluster.default_number_of_replicas and update index.number_of_replicas [#8526](https://github.com/opensearch-project/documentation-website/pull/8526) +- Msearch template API returns status code in each search response [#8522](https://github.com/opensearch-project/documentation-website/pull/8522) +- document the new `analysis-phonenumber` plugin [#8469](https://github.com/opensearch-project/documentation-website/pull/8469) +- Adds documentation for providing search pipeline id in the search/msearch request [#8372](https://github.com/opensearch-project/documentation-website/pull/8372) +- Data Stream support for Audit- Log [#8356](https://github.com/opensearch-project/documentation-website/pull/8356) +- Update documentation to reflect k-NN FAISS AVX512 support [#8307](https://github.com/opensearch-project/documentation-website/pull/8307) +- [Feature]: add ignore missing to text chunking processor [#8266](https://github.com/opensearch-project/documentation-website/pull/8266) +- Add documentation for workload management [#8228](https://github.com/opensearch-project/documentation-website/pull/8228) + +## In progress documentation for 2.18.0 + +- [Workspace] Add documentation for workspace and ACL [#8643](https://github.com/opensearch-project/documentation-website/pull/8643) +- add wlm feature overview [#8632](https://github.com/opensearch-project/documentation-website/pull/8632) +- Add querygroup lifecycle api documentation [#8628](https://github.com/opensearch-project/documentation-website/pull/8628) +- [Workload Management] Querygroup Lifecyle API docs [#8249](https://github.com/opensearch-project/documentation-website/pull/8249) diff --git a/release-notes/opensearch-documentation-release-notes-2.19.0.md b/release-notes/opensearch-documentation-release-notes-2.19.0.md new file mode 100644 index 00000000000..f71087df847 --- /dev/null +++ b/release-notes/opensearch-documentation-release-notes-2.19.0.md @@ -0,0 +1,36 @@ +# OpenSearch Documentation Website 2.19.0 Release Notes + +The OpenSearch 2.19.0 documentation includes the following additions and updates. 
+ +## New documentation for 2.19.0 + +- Adds star-tree search changes related to new aggregations supported [#9163](https://github.com/opensearch-project/documentation-website/pull/9163) +- Adds QueryInsightsDashboard [#9157](https://github.com/opensearch-project/documentation-website/pull/9157) +- Add Convert Index to remote documentation [#9156](https://github.com/opensearch-project/documentation-website/pull/9156) +- Adds details for using index_thread_qty for Lucene library [#9152](https://github.com/opensearch-project/documentation-website/pull/9152) +- Update threshold for 2.19 [#9151](https://github.com/opensearch-project/documentation-website/pull/9151) +- Add avx512_spr documentation [#9148](https://github.com/opensearch-project/documentation-website/pull/9148) +- Add documentation for plugin as a service [#9144](https://github.com/opensearch-project/documentation-website/pull/9144) +- Add AD flatten result index feature [#9140](https://github.com/opensearch-project/documentation-website/pull/9140) +- Add documentation for `wait_for_completion_timeout` Parameter [#9138](https://github.com/opensearch-project/documentation-website/pull/9138) +- Add feature direction to AD docs [#9137](https://github.com/opensearch-project/documentation-website/pull/9137) +- Add verbose_pipeline section in using-search-pipeline [#9130](https://github.com/opensearch-project/documentation-website/pull/9130) +- Query Insights 2.19 documentation [#9120](https://github.com/opensearch-project/documentation-website/pull/9120) +- Add template query [#9119](https://github.com/opensearch-project/documentation-website/pull/9119) +- Add RRF documentation for hybrid search [#9117](https://github.com/opensearch-project/documentation-website/pull/9117) +- Add Pagination in hybrid query [#9109](https://github.com/opensearch-project/documentation-website/pull/9109) +- Mark nmslib references for vector search as deprecated [#9107](https://github.com/opensearch-project/documentation-website/pull/9107) +- Add in-place SSL certs hot reload documentation [#9103](https://github.com/opensearch-project/documentation-website/pull/9103) +- Add binary Lucene vector updates for 2.19 [#9102](https://github.com/opensearch-project/documentation-website/pull/9102) +- Add OpenSearch Flow OSD plugin [#9101](https://github.com/opensearch-project/documentation-website/pull/9101) +- Update hot reload documentation to show how DN validation can be skipped [#9079](https://github.com/opensearch-project/documentation-website/pull/9079) +- Update k-NN Cosine formula [#9078](https://github.com/opensearch-project/documentation-website/pull/9078) +- Update Ada Grad as the default optimiser. 
[#9061](https://github.com/opensearch-project/documentation-website/pull/9061) +- Add documentation about explain in hybrid query and hybrid_score_explanation processor [#9053](https://github.com/opensearch-project/documentation-website/pull/9053) +- Add Query Insights local index delete after documentation [#9052](https://github.com/opensearch-project/documentation-website/pull/9052) +- Add support for Bedrock Rerank API #9027 [#9029](https://github.com/opensearch-project/documentation-website/pull/9029) +- Add validation requirement for message fields [#9000](https://github.com/opensearch-project/documentation-website/pull/9000) +- Add documentation for workspace privacy [#8994](https://github.com/opensearch-project/documentation-website/pull/8994) +- Add documentation for pruning neural sparse vectors [#8984](https://github.com/opensearch-project/documentation-website/pull/8984) +- Add document the usage of update document API with ingest pipeline [#8874](https://github.com/opensearch-project/documentation-website/pull/8874) +- Update innerHits of nested k-NN fields [#8822](https://github.com/opensearch-project/documentation-website/pull/8822) diff --git a/release-notes/opensearch-documentation-release-notes-3.0.0.md b/release-notes/opensearch-documentation-release-notes-3.0.0.md new file mode 100644 index 00000000000..829d9c3a177 --- /dev/null +++ b/release-notes/opensearch-documentation-release-notes-3.0.0.md @@ -0,0 +1,38 @@ +# OpenSearch Documentation Website 3.0.0 Release Notes + +The OpenSearch 3.0.0 documentation includes the following additions and updates. + +## New documentation for 3.0.0 + +- [Workload Management] Rename query group to workload group [#9813](https://github.com/opensearch-project/documentation-website/pull/9813) +- keyword/numeric terms and range aggregation support [#9812](https://github.com/opensearch-project/documentation-website/pull/9812) +- Adding documentation for append only indices [#9809](https://github.com/opensearch-project/documentation-website/pull/9809) +- add web search tool documentation [#9807](https://github.com/opensearch-project/documentation-website/pull/9807) +- Add Warm node role for searchable snapshots [#9804](https://github.com/opensearch-project/documentation-website/pull/9804) +- Adding reader writer separation doc page [#9795](https://github.com/opensearch-project/documentation-website/pull/9795) +- Replace CatIndexTool with ListIndexTool [#9792](https://github.com/opensearch-project/documentation-website/pull/9792) +- Added documentation for settings limiting total shards and primary shards per node at both cluster and index level. 
[#9791](https://github.com/opensearch-project/documentation-website/pull/9791) +- Introduce new PPL commands, configuration and behavior changes [#9790](https://github.com/opensearch-project/documentation-website/pull/9790) +- remove experimental for batch_predict and deprecate the async batch i… [#9768](https://github.com/opensearch-project/documentation-website/pull/9768) +- Add new `_scale` API as part of OpenSearch Reader/Writer Separation [#9762](https://github.com/opensearch-project/documentation-website/pull/9762) +- Derived Source is Enabled By Default from 3.0.0 [#9761](https://github.com/opensearch-project/documentation-website/pull/9761) +- Add documentation about AD contextual launch feature [#9756](https://github.com/opensearch-project/documentation-website/pull/9756) +- [DOC] Node Level Circuit Breaker Config in K-NN [#9753](https://github.com/opensearch-project/documentation-website/pull/9753) +- Update cluster manager task throttling documentation to reflect enabled-by-default behavior (previously disabled) [#9749](https://github.com/opensearch-project/documentation-website/pull/9749) +- Add explain documentation for knn query [#9741](https://github.com/opensearch-project/documentation-website/pull/9741) +- Enable concurrent segment search by default for 3.0 release [#9739](https://github.com/opensearch-project/documentation-website/pull/9739) +- Add documentation for http.max_header_size [#9709](https://github.com/opensearch-project/documentation-website/pull/9709) +- Update the similarity algorithm details [#9675](https://github.com/opensearch-project/documentation-website/pull/9675) +- Update 3.0.0 to use new pgp key for deb and rpm [#9672](https://github.com/opensearch-project/documentation-website/pull/9672) +- Add live queries doc and update top queries doc [#9658](https://github.com/opensearch-project/documentation-website/pull/9658) +- Added index and cluster settings related to RW separation [#9634](https://github.com/opensearch-project/documentation-website/pull/9634) +- Add neural search stats API docs [#9624](https://github.com/opensearch-project/documentation-website/pull/9624) +- Add semantic highlighter related docs [#9603](https://github.com/opensearch-project/documentation-website/pull/9603) + +## Documentation for 3.0.0 experimental features + +- Add mcp server in opensearch documentation [#9800](https://github.com/opensearch-project/documentation-website/pull/9800) +- Add MCP Connectors documentation [#9757](https://github.com/opensearch-project/documentation-website/pull/9757) +- [DOC] ML-Commons Agent: Add documentation for PlanExecuteAndReflect Agent [#9743](https://github.com/opensearch-project/documentation-website/pull/9743) +- [Pull-based Ingestion] Add experimental pull-based ingestion page [#9659](https://github.com/opensearch-project/documentation-website/pull/9659) +- [GRPC] Add documentation for GRPC plugin, and Bulk+Search GRPC APIs [#9649](https://github.com/opensearch-project/documentation-website/pull/9649) diff --git a/release-notes/opensearch-documentation-release-notes-3.1.0.md b/release-notes/opensearch-documentation-release-notes-3.1.0.md new file mode 100644 index 00000000000..b55133af8bc --- /dev/null +++ b/release-notes/opensearch-documentation-release-notes-3.1.0.md @@ -0,0 +1,38 @@ +# OpenSearch Documentation Website 3.1.0 Release Notes + +The OpenSearch 3.1.0 documentation includes the following additions and updates. 
+ +## New documentation for 3.1.0 + +- MCP connector is GA in 3.1 [#10135](https://github.com/opensearch-project/documentation-website/pull/10135) +- [Star-tree] Support for nested aggs & Removing experimental flag. [#10132](https://github.com/opensearch-project/documentation-website/pull/10132) +- [MLCommons] Add details about memory in plan_execute_reflect agent [#10121](https://github.com/opensearch-project/documentation-website/pull/10121) +- [MLCommons] Add details about metrics integration [#10120](https://github.com/opensearch-project/documentation-website/pull/10120) +- Introduce memory optimized vector search (LuceneOnFaiss) in 3.1. [#10119](https://github.com/opensearch-project/documentation-website/pull/10119) +- Document custom SSE endpoint parameter for MCP connectors [#10118](https://github.com/opensearch-project/documentation-website/pull/10118) +- added ML agent update API [#10113](https://github.com/opensearch-project/documentation-website/pull/10113) +- Update rescore support [#10108](https://github.com/opensearch-project/documentation-website/pull/10108) +- Trace analytics update for 3.1 [#10107](https://github.com/opensearch-project/documentation-website/pull/10107) +- Update documentation around to show that it can be configured as a list to extract roles from nested JWT claims [#10103](https://github.com/opensearch-project/documentation-website/pull/10103) +- [Pull-based Ingestion] Add pull-based ingestion limitations [#10096](https://github.com/opensearch-project/documentation-website/pull/10096) +- Add new APIs documentation for MCP server feature [#10095](https://github.com/opensearch-project/documentation-website/pull/10095) +- [Doc] Live Queries dashboard [#10090](https://github.com/opensearch-project/documentation-website/pull/10090) +- Add additional_config information [#10085](https://github.com/opensearch-project/documentation-website/pull/10085) +- Update remote vector index build settings docs [#10084](https://github.com/opensearch-project/documentation-website/pull/10084) +- Update neural stats API for 3.1 [#10079](https://github.com/opensearch-project/documentation-website/pull/10079) +- Add documentation for semantic field. 
[#10078](https://github.com/opensearch-project/documentation-website/pull/10078) +- Add collapse in hybrid query documentation [#10073](https://github.com/opensearch-project/documentation-website/pull/10073) +- Add doc for neural sparse search with 2 new analyzers [#10062](https://github.com/opensearch-project/documentation-website/pull/10062) +- Update docs for remote vector index build GA [#10061](https://github.com/opensearch-project/documentation-website/pull/10061) +- Add SSE-KMS and bucket owner verification docs [#10060](https://github.com/opensearch-project/documentation-website/pull/10060) +- Update AI Search Flows documentation [#10055](https://github.com/opensearch-project/documentation-website/pull/10055) +- Introduce `fixed_char_length` algorithm in Text Chunking [#10043](https://github.com/opensearch-project/documentation-website/pull/10043) +- Add Query Insights excluded_indices section [#10011](https://github.com/opensearch-project/documentation-website/pull/10011) +- [Pull-based Ingestion] Update pull-based ingestion metrics and reset settings [#9993](https://github.com/opensearch-project/documentation-website/pull/9993) +- Custom weights in rrf processor [#9922](https://github.com/opensearch-project/documentation-website/pull/9922) +- Add back Rule Lifecycle API to main [#9875](https://github.com/opensearch-project/documentation-website/pull/9875) +- Update Query Insights grouping setting names [#9799](https://github.com/opensearch-project/documentation-website/pull/9799) + +## In progress documentation for 3.1.0 + +- Add forecasting section [#10133](https://github.com/opensearch-project/documentation-website/pull/10133) diff --git a/spec-insert/.gitignore b/spec-insert/.gitignore new file mode 100644 index 00000000000..a28b2120e4c --- /dev/null +++ b/spec-insert/.gitignore @@ -0,0 +1,5 @@ +opensearch-openapi.yaml +rspec_examples.txt +utilization_coverage.md +dry_run_report.md +dry-run/ diff --git a/spec-insert/.rspec b/spec-insert/.rspec new file mode 100644 index 00000000000..c99d2e7396e --- /dev/null +++ b/spec-insert/.rspec @@ -0,0 +1 @@ +--require spec_helper diff --git a/spec-insert/.rubocop.yml b/spec-insert/.rubocop.yml new file mode 100644 index 00000000000..51a8b5e7e04 --- /dev/null +++ b/spec-insert/.rubocop.yml @@ -0,0 +1,30 @@ +require: rubocop-rake +AllCops: + Include: + - 'lib/**/*.rb' + - 'Rakefile' + NewCops: enable + TargetRubyVersion: 3.3 + +Metrics/CyclomaticComplexity: + Enabled: false +Metrics/MethodLength: + Enabled: false +Metrics/ParameterLists: + Enabled: false +Metrics/AbcSize: + Enabled: false +Metrics/PerceivedComplexity: + Enabled: false + +Layout/EmptyLineAfterGuardClause: + Enabled: false + +Style/MultilineBlockChain: + Enabled: false +Style/SingleLineMethods: + Enabled: false + +Naming/FileName: + Exclude: + - 'lib/jekyll-spec-insert.rb' # For Jekyll to recognize the plugin diff --git a/spec-insert/README.md b/spec-insert/README.md new file mode 100644 index 00000000000..5e5cfbacbbf --- /dev/null +++ b/spec-insert/README.md @@ -0,0 +1,41 @@ +# Spec-insert plugin overview + +This plugin facilitates the insertion of OpenAPI spec components into Markdown files. It provides a set of snippets that you can use to insert various components, such as API paths, path parameters, and query parameters. This document provides an overview of the plugin's codebase. 
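+
+For example, a spec-insert block placed in a documentation Markdown file might look like the following sketch. The start and end comment markers are the ones this plugin looks for; the `api` and `component` values shown here are illustrative placeholders, and the exact argument syntax may differ:
+
+```markdown
+<!-- spec_insert_start
+api: cat.indices
+component: query_parameters
+-->
+<!-- spec_insert_end -->
+```
+
+When the plugin processes a file, the content between these markers is replaced with the rendered component.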
The codebase is divided into the following sections: + +- API parser: Parses the OpenAPI spec and extracts the relevant component information +- Component renderer: Renders the extracted component information into Markdown format +- Reports: Generates reports for this plugin + +## API parser + +The following section provides information about the components of the API parser. + +### SpecHash + +The [`SpecHash`](./lib/spec_hash.rb) class is responsible for ingesting the OpenAPI spec and dereferencing the `$ref` fields. Its `.load_file` method accepts the path to the OpenAPI spec file and sets up all API actions found in the spec. This is also where the `text_replacements` feature from the config file is managed. + +### API::Operation + +The [`API::Operation`](./lib/api/operation.rb) class represents an OpenAPI operation. Operations of the same group constitute an API. + +### API::Action + +The [`API::Action`](./lib/api/action.rb) class represents an API action, which comprises the URL path, query parameters, path parameters, request body, and response body. The components are represented by the following classes: + +- [`API::Parameter`](./lib/api/parameter.rb) +- [`API::Body`](./lib/api/body.rb) + +## Component renderer + +Components are rendered into Markdown using `mustache` [templates](./lib/renderers/templates). Each spec-insert component is represented by a renderer class: + +- [`BodyParameter`](./lib/renderers/body_parameters.rb): Renders the request and response body tables +- [`PathParameter`](./lib/renderers/path_parameters.rb): Renders the path parameters table +- [`QueryParameter`](./lib/renderers/query_parameters.rb): Renders the query parameters table +- [`Endpoint`](./lib/renderers/endpoints.rb): Renders a list of endpoints + +Each of these components is rendered within a [`SpecInsert`](./lib/renderers/spec_insert.rb) class, which wraps the rendered component within `<!-- spec_insert_start -->` and `<!-- spec_insert_end -->` comments. + +## Reports + +The [reports](./lib/reports) folder contains code that generates utilization and dry-run reports. You can trigger these reports through the [`Rakefile`](Rakefile). diff --git a/spec-insert/Rakefile b/spec-insert/Rakefile new file mode 100644 index 00000000000..6bcd414c6ea --- /dev/null +++ b/spec-insert/Rakefile @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. 
+ +# frozen_string_literal: true + +require 'rake' +require 'active_support/all' +require_relative 'lib/reports/utilization_coverage' +require_relative 'lib/reports/dry_run_report' +require_relative 'lib/utils' +require_relative 'lib/renderers/spec_insert' +require_relative 'lib/insert_arguments' + +desc 'Generate utilization coverage of Spec-Insert components' +task :generate_utilization_coverage do + Utils.load_spec + coverage = UtilizationCoverage.new.render + file = File.join(__dir__, 'utilization_coverage.md') + File.write(file, coverage) + puts "Utilization coverage written to #{file}" +end + +desc 'Generate all Spec-Insert components for all APIs and summarize the results' +task :generate_dry_run_report do + Utils.load_spec + report = DryRunReport.new.render + file = File.join(__dir__, 'dry_run_report.md') + File.write(file, report) + puts "Dry run report written to #{file}" +end + +desc 'Generate a specific component into the console' +task :dry_run_generate, [:api, :component] do |_, args| + Utils.load_spec + render = SpecInsert.new(InsertArguments.new(args)).render + output = "./dry-run/_#{args[:api]}_#{args[:component]}.md" + File.write(output, render) + + puts render + puts "\n\nThe above render has been written to #{output}" +end diff --git a/spec-insert/config.yml b/spec-insert/config.yml new file mode 100644 index 00000000000..11288e2353a --- /dev/null +++ b/spec-insert/config.yml @@ -0,0 +1,15 @@ +param_table: + parameter_column: + freeform_text: -- freeform field -- + default_column: + empty_text: N/A + required_column: + true_text: "**Required**" + false_text: "_Optional_" + +# Replace text in every description in the spec file +text_replacements: + - replace: "https://opensearch.org/docs/latest" + with: "{{site.url}}{{site.baseurl}}" + - replace: "\n" + with: " " \ No newline at end of file diff --git a/spec-insert/jekyll-spec-insert.gemspec b/spec-insert/jekyll-spec-insert.gemspec new file mode 100644 index 00000000000..d397f40af2c --- /dev/null +++ b/spec-insert/jekyll-spec-insert.gemspec @@ -0,0 +1,16 @@ +# frozen_string_literal: true + +Gem::Specification.new do |spec| + spec.name = 'jekyll-spec-insert' + spec.version = '0.1.0' + spec.authors = ['Theo Truong'] + spec.email = ['theo.nam.truong@gmail.com'] + + spec.summary = 'A Jekyll plugin for inserting OpenSearch OpenAPI specifications into Jekyll sites.' + + spec.files = Dir['lib/**/*.rb'] + spec.require_paths = ['lib'] + + spec.metadata['rubygems_mfa_required'] = 'true' + spec.required_ruby_version = '>= 3.1.0' +end diff --git a/spec-insert/lib/api/action.rb b/spec-insert/lib/api/action.rb new file mode 100644 index 00000000000..f0a8fdcf1df --- /dev/null +++ b/spec-insert/lib/api/action.rb @@ -0,0 +1,109 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. 
+ +# frozen_string_literal: true + +require_relative 'parameter' +require_relative 'body' +require_relative 'operation' + +module Api + # A collection of operations that comprise a single API Action + # AKA operation-group + class Action + SUCCESS_CODES = %w[200 201 202 203 204 205 206 207 208 226].freeze + + # @param [SpecHash] spec Parsed OpenAPI spec + def self.actions=(spec) + operations = spec.paths.flat_map do |url, ops| + ops.filter_map { |verb, op| Operation.new(op, url, verb) unless op['x-ignorable'] } + end + @actions = operations.group_by(&:group).values.map { |ops| Action.new(ops) } + end + + # @return [Array<Action>] API Actions + def self.all + raise 'Actions not set' unless @actions + @actions + end + + def self.by_full_name + @by_full_name ||= all.index_by(&:full_name).to_h + end + + def self.by_namespace + @by_namespace ||= all.group_by(&:namespace) + end + + # @return [Array<Api::Operation>] Operations in the action + attr_reader :operations + + # @param [Array<Api::Operation>] operations + def initialize(operations) + @operations = operations + @operation = operations.first || {} + @spec = @operation&.spec + end + + def query_parameters + @operations.map(&:spec).flat_map(&:parameters).filter { |param| !param['x-global'] && param.in == 'query' } + .group_by(&:name).values + .map { |params| Parameter.from_param_specs(params, @operations.size) } + end + + def path_parameters + @operations.map(&:spec).flat_map(&:parameters).filter { |param| param.in == 'path' } + .group_by(&:name).values + .map { |params| Parameter.from_param_specs(params, @operations.size) } + end + + # @return [Api::Body] Request body + def request_body + @request_body ||= begin + operation = @operations.find { |op| op.spec.requestBody.present? } + required = @operations.all? { |op| op.spec.requestBody&.required } + content = operation ? operation.spec.requestBody.content : nil + Body.new(content, required:) + end + end + + # @return [Api::Body] Response body + def response_body + @response_body ||= begin + spec = @operations.first.spec + code = SUCCESS_CODES.find { |c| spec.responses[c].present? } + Body.new(spec.responses[code].content, required: nil) + end + end + + # @return [String] Full name of the action (i.e. 
namespace.action) + def full_name; @operation.group; end + + # return [String] Name of the action + def name; @operation.action; end + + # @return [String] Namespace of the action + def namespace; @operation.namespace || ''; end + + # @return [Array<String>] Sorted unique HTTP verbs + def http_verbs; @operations.map(&:http_verb).uniq.sort; end + + # @return [Array<String>] Unique URLs + def urls; @operations.map(&:url).uniq; end + + # @return [String] Description of the action + def description; @spec.description; end + + # @return [Boolean] Whether the action is deprecated + def deprecated; @spec.deprecated; end + + # @return [String] Deprecation message + def deprecation_message; @spec['x-deprecation-message']; end + + # @return [String] API reference + def api_reference; @operation.external_docs.url; end + end +end diff --git a/spec-insert/lib/api/body.rb b/spec-insert/lib/api/body.rb new file mode 100644 index 00000000000..ec396ca6dd3 --- /dev/null +++ b/spec-insert/lib/api/body.rb @@ -0,0 +1,32 @@ +# frozen_string_literal: true + +require_relative 'parameter' +require_relative 'body_parameter' + +module Api + # Request or response body + class Body + # @param [Boolean] empty whether a schema is defined + attr_reader :empty + # @return [Boolean] + attr_reader :required + + # @param [SpecHash, nil] content + # @param [Boolean, nil] required + def initialize(content, required:) + @required = required + content ||= {} + @spec = content['application/json'] || content['application/x-ndjson'] + @empty = @spec&.schema.nil? + end + + # @return [Api::BodyParameterGroup] + def params_group + @params_group ||= BodyParameterGroup.new( + schema: @spec.schema, + description: @spec.description || @spec.schema.description, + ancestors: [] + ) + end + end +end diff --git a/spec-insert/lib/api/body_parameter.rb b/spec-insert/lib/api/body_parameter.rb new file mode 100644 index 00000000000..cf2fdf6786f --- /dev/null +++ b/spec-insert/lib/api/body_parameter.rb @@ -0,0 +1,106 @@ +# frozen_string_literal: true + +require_relative 'parameter' +require_relative '../config' + +module Api + # Represents a group of parameters of an object within a request or response body + class BodyParameterGroup + attr_reader :members, :ancestors, :description, :is_array, :is_nested, :schema + + # @param [SpecHash] schema schema of an object or an array of objects + # @param [Array<String>] ancestors + # @param [String] description + def initialize(schema:, ancestors:, description:) + @ancestors = ancestors + @description = description + @is_array = schema.items.present? + @schema = @is_array ? schema.items : schema + @schema = flatten_schema(@schema) + @members = parse_members(@schema) + @is_nested = @members.any? { |param| param.is_a?(BodyParameterGroup) } + members.each { |param| param.group = self } unless @is_nested + end + + # @return [Array<BodyParameterGroup>] The child groups of the group + def descendants(seen_schemas = Set.new([@schema])) + child_groups = @is_nested ? @members : @members.map(&:child_params_group).compact + child_groups.reject { |g| seen_schemas.include?(g.schema) }.flat_map do |group| + seen_schemas.add(group.schema) + [group] + group.descendants(seen_schemas) + end + end + + # @param [SpecHash] schema + # @return [Array<Api::BodyParameter>, Array<Api::BodyParameterGroup] members + def parse_members(schema) + union = schema.anyOf || schema.oneOf + if union.present? 
+ return union.map { |sch| BodyParameterGroup.new(schema: sch, ancestors: @ancestors, description:) } + end + properties = schema.properties || {} + parameters = properties.map do |name, prop| + BodyParameter.new(name:, schema: prop, required: schema.required&.include?(name)) + end.sort { |a, b| a.name <=> b.name } + return parameters unless schema.additionalProperties + additional_schema = schema.additionalProperties == true ? SpecHash.new({}) : schema.additionalProperties + free_form_name = CONFIG.param_table.parameter_column.freeform_text + parameters + [BodyParameter.new(name: free_form_name, schema: additional_schema)] + end + + # @param [SpecHash] schema + # @return [SpecHash] a schema with allOf flattened + def flatten_schema(schema) + return schema if schema.allOf.blank? + + schema = schema.allOf.each_with_object({ 'properties' => {}, 'required' => [] }) do |sch, h| + sch = flatten_schema(sch) + h['properties'].merge!(sch.properties || {}) + h['required'] += sch.required || [] + h['additionalProperties'] ||= sch.additionalProperties + end + + SpecHash.new(schema, fully_parsed: true) + end + end + + # TODO: Handle cyclic references + # Represents a body parameter of different levels of a request or response body + class BodyParameter < Parameter + attr_accessor :group + + # @param [String] name + # @param [SpecHash] schema + # @param [Boolean] required + def initialize(name:, schema:, required: false) + super(name:, + required:, + schema:, + description: schema.description, + default: schema['default'], + deprecated: schema.deprecated || schema['x-version-deprecated'].present?, + version_deprecated: schema['x-version-deprecated'], + deprecation_message: schema['x-deprecation-message']) + @include_object = @doc_type.include?('Object') + end + + # @return [BodyParameterGroup, nil] The parameters group of an object parameter + def child_params_group + return nil unless @include_object + return @child_params_group if defined?(@child_params_group) + @child_params_group ||= BodyParameterGroup.new( + schema: @schema, + ancestors: @group.ancestors + [@name], + description: @description + ) + end + + private + + # TODO: Turn this into a configurable setting + def parse_array(schema) + return 'Array' if schema.items == true || schema.items.nil? + "Array of #{parse_doc_type(schema.items).pluralize}" + end + end +end diff --git a/spec-insert/lib/api/operation.rb b/spec-insert/lib/api/operation.rb new file mode 100644 index 00000000000..571fecea075 --- /dev/null +++ b/spec-insert/lib/api/operation.rb @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. 
+ +# frozen_string_literal: true + +module Api + # An API Operation + class Operation + # @return [SpecHash] Operation Spec + attr_reader :spec + # @return [String] URL + attr_reader :url + # @return [String] HTTP Verb + attr_reader :http_verb + # @return [String] Operation Group + attr_reader :group + # @return [String] API Action + attr_reader :action + # @return [String] API Namespace + attr_reader :namespace + + # @param [SpecHash] spec Operation Spec + # @param [String] url + # @param [String] http_verb + def initialize(spec, url, http_verb) + @spec = spec + @url = url + @http_verb = http_verb.upcase + @group = spec['x-operation-group'] + @action, @namespace = @group.split('.').reverse + end + end +end diff --git a/spec-insert/lib/api/parameter.rb b/spec-insert/lib/api/parameter.rb new file mode 100644 index 00000000000..7dda808c70b --- /dev/null +++ b/spec-insert/lib/api/parameter.rb @@ -0,0 +1,93 @@ +# frozen_string_literal: true + +module Api + # Represents a parameter of an API action + # Acting as base class for URL parameters and Body parameters + class Parameter + # @return [String] The name of the parameter + attr_reader :name + # @return [String] The description of the parameter + attr_reader :description + # @return [Boolean] Whether the parameter is required + attr_reader :required + # @return [SpecHash] The JSON schema of the parameter + attr_reader :schema + # @return [String] Argument type in documentation + attr_reader :doc_type + # @return [String] The default value of the parameter + attr_reader :default + # @return [Boolean] Whether the parameter is deprecated + attr_reader :deprecated + # @return [String] The deprecation message + attr_reader :deprecation_message + # @return [String] The OpenSearch version when the parameter was deprecated + attr_reader :version_deprecated + + def initialize(name:, description:, required:, schema:, default:, deprecated:, deprecation_message:, + version_deprecated:) + @name = name + @description = description + @required = required + @schema = schema + @doc_type = parse_doc_type(schema) + @default = default + @deprecated = deprecated + @deprecation_message = deprecation_message + @version_deprecated = version_deprecated + end + + # @param [SpecHash] Full OpenAPI spec + def self.global=(spec) + @global = spec.components.parameters.filter { |_, p| p['x-global'] }.map { |_, p| from_param_specs([p], nil) } + end + + # @return [Array<Api::UrlParameter>] Global parameters + def self.global + raise 'Global parameters not set' unless @global + @global + end + + # @param [Array<SpecHash>] params List of parameters of the same name + # @param [Integer, nil] opts_count Number of operations involved + # @return [UrlParameter] Single parameter distilled from the list + def self.from_param_specs(params, opts_count) + param = params.first || SpecHash.new + schema = param.schema || SpecHash.new + required = opts_count.nil? ? param.required : params.filter(&:required).size == opts_count + Parameter.new(name: param.name, + description: param.description || schema.description, + required:, + schema:, + default: param['default'] || schema['default'], + deprecated: param.deprecated || schema.deprecated, + deprecation_message: param['x-deprecation-message'] || schema['x-deprecation-message'], + version_deprecated: param['x-version-deprecated'] || schema['x-version-deprecated']) + end + + private + + # @param [SpecHash, nil] schema + # @return [String] Documentation type + def parse_doc_type(schema) + return nil if schema.nil? 
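The `Api::Operation` class above derives the API namespace and action from the spec's `x-operation-group` extension. A minimal sketch of that split, using the `indices.put_settings` group that appears in the test fixtures later in this diff:

```ruby
# Mirrors Api::Operation#initialize: the group "namespace.action" is split on
# the dot and reversed, so a group without a namespace yields a nil namespace.
action, namespace = 'indices.put_settings'.split('.').reverse
action     # => "put_settings"
namespace  # => "indices"

action, namespace = 'search'.split('.').reverse
namespace  # => nil (Api::Action#namespace then falls back to '')
```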
+ return 'any' if schema == true + union = schema.anyOf || schema.oneOf + return union.map { |sch| parse_doc_type(sch) }.uniq.sort.join(' or ') if union.present? + return parse_doc_type(schema.allOf.first) if schema.allOf.present? + type = schema.type + return 'Integer' if type == 'integer' + return 'Float' if type == 'number' + return 'Boolean' if type == 'boolean' + return 'String' if type == 'string' + return parse_array(schema) if type == 'array' || schema.items.present? + return 'NULL' if type == 'null' + return 'Object' if type == 'object' || type.nil? + return type.map { |t| parse_doc_type(SpecHash.new({ 'type' => t })) }.uniq.sort.join(' or ') if type.is_a?(Array) + raise "Unhandled JSON Schema Type: #{type}" + end + + def parse_array(_schema) + 'List' + end + end +end diff --git a/spec-insert/lib/config.rb b/spec-insert/lib/config.rb new file mode 100644 index 00000000000..6812911c6a2 --- /dev/null +++ b/spec-insert/lib/config.rb @@ -0,0 +1,8 @@ +# frozen_string_literal: true + +require 'pathname' +require 'yaml' +require_relative 'dot_hash' + +CONFIG_PATH = File.expand_path('../config.yml', __dir__).freeze +CONFIG = DotHash.new(YAML.load_file(CONFIG_PATH)) diff --git a/spec-insert/lib/doc_processor.rb b/spec-insert/lib/doc_processor.rb new file mode 100644 index 00000000000..caae1ff5d0f --- /dev/null +++ b/spec-insert/lib/doc_processor.rb @@ -0,0 +1,67 @@ +# frozen_string_literal: true + +require 'pathname' +require_relative 'renderers/spec_insert' +require_relative 'spec_insert_error' +require_relative 'insert_arguments' + +# Processes a file, replacing spec-insert blocks with rendered content +class DocProcessor + START_MARKER = /<!-- spec_insert_start/ + END_MARKER = /<!-- spec_insert_end -->/ + + def initialize(file_path, logger:) + @file_path = Pathname(file_path) + @logger = logger + end + + # Processes the file, replacing spec-insert blocks with rendered content + # @param [Boolean] write_to_file Whether to write the changes back to the file + def process(write_to_file: true) + lines = File.readlines(@file_path) + original_content = lines.join + insertions = find_insertions(lines) + return if insertions.empty? + + insertions.reverse_each { |start, finish, insert| lines[start..finish] = insert.render } + rendered_content = lines.join + if write_to_file && rendered_content != original_content + File.write(@file_path, rendered_content) + relative_path = @file_path.relative_path_from(Pathname.new(Dir.pwd)) + @logger.info "Successfully updated #{relative_path}." + end + rendered_content + end + + # @return [Array<SpecInsert>] the spec inserts targeted by this processor + def spec_inserts + find_insertions(File.readlines(@file_path)).map(&:last) + end + + private + + # @return Array<[Integer, Integer, SpecInsert]> + def find_insertions(lines) + start_indices = lines.each_with_index + .filter { |line, _index| line.match?(START_MARKER) } + .map { |_line, index| index } + end_indices = start_indices.map do |index| + (index..lines.length - 1).find { |i| lines[i].match?(END_MARKER) } + end.compact + + validate_markers!(start_indices, end_indices) + + start_indices.zip(end_indices).map do |start, finish| + args = InsertArguments.from_marker(lines[start..finish]) + [start, finish, SpecInsert.new(args)] + end + end + + # @param [Array<Integer>] start_indices + # @param [Array<Integer>] end_indices + def validate_markers!(start_indices, end_indices) + return if start_indices.length == end_indices.length && + start_indices.zip(end_indices).flatten.each_cons(2).all? 
{ |a, b| a < b } + raise SpecInsertError, 'Mismatched "spec_insert_start" and "spec_insert_end" markers.' + end +end diff --git a/spec-insert/lib/dot_hash.rb b/spec-insert/lib/dot_hash.rb new file mode 100644 index 00000000000..ab736d020dc --- /dev/null +++ b/spec-insert/lib/dot_hash.rb @@ -0,0 +1,52 @@ +# frozen_string_literal: true + +# DotHash is a hash that allows access to its keys using dot notation +class DotHash + # @param [Hash] hash + def initialize(hash = {}, fully_parsed: false) + raise ArgumentError, "#{self.class} must be initialized with a Hash" unless hash.is_a?(Hash) + @hash = hash + @fully_parsed = fully_parsed + @parsed_keys = fully_parsed ? nil : Set.new + end + + def to_s + "<#{self.class}: #{@hash}>" + end + + def inspect + "<#{self.class}: #{@hash}>" + end + + def [](key) + retrieve(key) + end + + def respond_to_missing?(name, include_private = false) + @hash.key?(name.to_s) || {}.respond_to?(name) || super + end + + def method_missing(name, ...) + name = name.to_s + if {}.respond_to?(name) + warn "Accessing Hash attribute `#{name}` which is also a key of the #{self.class} instance" if @hash.key?(name) + return @hash.send(name, ...) + end + retrieve(name) + end + + private + + def parse(value) + return value.map { |v| parse(v) } if value.is_a?(Array) + return value if value.is_a?(DotHash) + return value unless value.is_a?(Hash) + DotHash.new(value) + end + + def retrieve(key) + return @hash[key] if @fully_parsed || @parsed_keys.include?(key) + @parsed_keys.add(key) + @hash[key] = parse(@hash[key]) + end +end diff --git a/spec-insert/lib/insert_arguments.rb b/spec-insert/lib/insert_arguments.rb new file mode 100644 index 00000000000..aa05cc01df1 --- /dev/null +++ b/spec-insert/lib/insert_arguments.rb @@ -0,0 +1,80 @@ +# frozen_string_literal: true + +require_relative 'utils' +require_relative 'spec_insert_error' + +# Doc Insert Arguments +class InsertArguments + attr_reader :raw + + # @param [Hash] args raw arguments read from the doc insert marker + def initialize(args) + @raw = args.to_h.with_indifferent_access + end + + # @param [Array<String>] lines the lines between "<!-- doc_insert_start" and "<!-- spec_insert_end -->" + # @return [InsertArguments] + def self.from_marker(lines) + end_index = lines.each_with_index.find { |line, _index| line.match?(/^\s*-->/) }&.last&.- 1 + args = lines[1..end_index].filter { |line| line.include?(':') }.to_h do |line| + key, value = line.split(':') + [key.strip, value.strip] + end + new(args) + end + + # @return [String] + def api + @raw['api'] + end + + # @return [String] + def component + @raw['component'].tap do |component| + raise SpecInsertError, 'Component not specified.' if component.nil? + raise SpecInsertError, "Invalid component: #{component}" unless component.in?(Utils::COMPONENTS) + end + end + + # @return [Array<String>] + def columns + parse_array(@raw['columns']) || [] + end + + # @return [Boolean] + def pretty + parse_boolean(@raw['pretty'], default: false) + end + + # @return [Boolean] + def include_global + parse_boolean(@raw['include_global'], default: false) + end + + # @return [Boolean] + def include_deprecated + parse_boolean(@raw['include_deprecated'], default: true) + end + + # @return [Boolean] + def omit_header + parse_boolean(@raw['omit_header'], default: false) + end + + private + + # @param [String] value comma-separated array + def parse_array(value) + return nil if value.nil? 
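The `DotHash` wrapper above is what makes configuration and spec values reachable through dot notation, for example `CONFIG.param_table.required_column.true_text` in the table renderer below. A minimal sketch with hypothetical data (not the real `config.yml`), assuming the snippet sits in `spec-insert/lib` next to `dot_hash.rb`:

```ruby
require_relative 'dot_hash'

config = DotHash.new(
  { 'param_table' => { 'required_column' => { 'true_text' => '**Required**' } } }
)

config.param_table.required_column.true_text  # => "**Required**"
config['param_table']                         # same lookup through [] access
```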
+ value.split(',').map(&:strip) + end + + # @param [String] value + # @param [Boolean] default value to return when nil + def parse_boolean(value, default:) + return default if value.nil? + return true if value.in?(%w[true True TRUE yes Yes YES 1]) + return false if value.in?(%w[false False FALSE no No NO 0]) + raise ArgumentError, "Invalid boolean value: #{value}" + end +end diff --git a/spec-insert/lib/jekyll-spec-insert.rb b/spec-insert/lib/jekyll-spec-insert.rb new file mode 100644 index 00000000000..f474fdd1235 --- /dev/null +++ b/spec-insert/lib/jekyll-spec-insert.rb @@ -0,0 +1,46 @@ +# frozen_string_literal: true + +require 'active_support/all' +require 'listen' +require 'yaml' +require_relative 'spec_hash' +require_relative 'doc_processor' +require_relative 'utils' + +# Jekyll plugin to insert document components generated from the spec into the Jekyll site +class JekyllSpecInsert < Jekyll::Command + # @param [Mercenary::Program] prog + def self.init_with_program(prog) + prog.command(:'spec-insert') do |c| + c.syntax 'spec-insert [options]' + c.option 'watch', '--watch', '-W', 'Watch for changes and rebuild' + c.option 'refresh-spec', '--refresh-spec', '-R', 'Redownload the OpenSearch API specification' + c.option 'fail-on-error', '--fail-on-error', '-F', 'Fail on error' + c.action do |_args, options| + Utils.load_spec(forced: options['refresh-spec'], logger: Jekyll.logger) + Utils.target_files.each { |file| process_file(file, fail_on_error: options['fail-on-error']) } + watch(fail_on_error: options['fail-on-error']) if options['watch'] + end + end + end + + def self.process_file(file, fail_on_error: false) + DocProcessor.new(file, logger: Jekyll.logger).process + rescue StandardError => e + raise e if fail_on_error + relative_path = Pathname(file).relative_path_from(Pathname.new(Dir.pwd)) + Jekyll.logger.error "Error processing #{relative_path}: #{e.message}" + end + + def self.watch(fail_on_error: false) + excluded_paths = Utils.config_exclude.map { |path| /\.#{path}$/ } + + Listen.to(Dir.pwd, only: /\.md$/, ignore: excluded_paths) do |modified, added, _removed| + (modified + added).each { |file| process_file(file, fail_on_error:) } + end.start + + trap('INT') { exit } + trap('TERM') { exit } + sleep + end +end diff --git a/spec-insert/lib/renderers/body_parameters.rb b/spec-insert/lib/renderers/body_parameters.rb new file mode 100644 index 00000000000..fbb678a45c7 --- /dev/null +++ b/spec-insert/lib/renderers/body_parameters.rb @@ -0,0 +1,68 @@ +# frozen_string_literal: true + +require_relative 'components/base_mustache_renderer' +require_relative 'components/parameter_table_renderer' + +# Renders request body parameters +class BodyParameters < BaseMustacheRenderer + self.template_file = "#{__dir__}/templates/body_parameters.mustache" + + def initialize(action, args, is_request:) + super(action, args) + @is_request = is_request + @body = is_request ? @action.request_body : @action.response_body + @empty = @body.empty + end + + def header + @header ||= "#{@is_request ? 'Request' : 'Response'} body fields" + end + + def description + name = "The #{@is_request ? 'request' : 'response'} body" + required = @body.required ? ' is __required__. It' : ' is optional. It' if @is_request + schema_desc = if @body.params_group.is_array + "#{name}#{required} is an __array of JSON objects__ (NDJSON). Each object has the following fields." + else + "#{name}#{required} is a JSON object with the following fields." 
+ end + [@body.params_group.description, schema_desc].compact.reject(&:empty?).join("\n\n") + end + + def required + @body.required + end + + def root_tables + render_tables(@body.params_group) + end + + def descendants + @body.params_group.descendants.map do |group| + { block_name: "#{@args.api}::#{@is_request ? 'request' : 'response'}_body", + summary: "#{header}: <code>#{group.ancestors.join('</code> > <code>')}</code>", + description: descendant_desc(group), + descendant_tables: render_tables(group) } + end + end + + private + + # @param [Api::BodyParameterGroup] group + # @return [Array<String>] + def render_tables(group) + return group.members.flat_map { |g| render_tables(g) } if group.is_nested + [ParameterTableRenderer.new(group.members, @args, is_body: true).render] + end + + # @param [Api::BodyParameterGroup] group + def descendant_desc(group) + schema_desc = + if group.is_array + "`#{group.ancestors.last}` is an __array of JSON objects__ (NDJSON). Each object has the following fields." + else + "`#{group.ancestors.last}` is a JSON object with the following fields." + end + [group.description, schema_desc].compact.reject(&:empty?).join("\n\n") + end +end diff --git a/spec-insert/lib/renderers/components/base_mustache_renderer.rb b/spec-insert/lib/renderers/components/base_mustache_renderer.rb new file mode 100644 index 00000000000..6129bfec146 --- /dev/null +++ b/spec-insert/lib/renderers/components/base_mustache_renderer.rb @@ -0,0 +1,29 @@ +# frozen_string_literal: true + +require 'mustache' + +# Base Mustache Renderer +class BaseMustacheRenderer < Mustache + self.template_path = "#{__dir__}/templates" + + # @param [Api::Action] + attr_reader :action + # @param [InsertArguments] + attr_reader :args + + # @param [Api::Action] action API Action + # @param [InsertArguments] args + def initialize(action, args) + super() + @action = action + @args = args + end + + def render + @empty ? nil : super + end + + def omit_header + @args.omit_header + end +end diff --git a/spec-insert/lib/renderers/components/parameter_table_renderer.rb b/spec-insert/lib/renderers/components/parameter_table_renderer.rb new file mode 100644 index 00000000000..4e6f1fa0b6d --- /dev/null +++ b/spec-insert/lib/renderers/components/parameter_table_renderer.rb @@ -0,0 +1,104 @@ +# frozen_string_literal: true + +require_relative 'table_renderer' +require_relative '../../config' + +# Renders a table of parameters of an API action +class ParameterTableRenderer + SHARED_COLUMNS = ['Description', 'Required', 'Data type', 'Default'].freeze + URL_PARAMS_COLUMNS = (['Parameter'] + SHARED_COLUMNS).freeze + BODY_PARAMS_COLUMNS = (['Property'] + SHARED_COLUMNS).freeze + + # @param [Array<Api::Parameter>] parameters + # @param [InsertArguments] args + def initialize(parameters, args, is_body: false) + @config = CONFIG.param_table + @parameters = filter_parameters(parameters, args) + @is_body = is_body + @pretty = args.pretty + @columns = determine_columns(args) + end + + # @return [String] + def render + columns = @columns.map { |col| TableRenderer::Column.new(col, col) } + rows = @parameters.map { |arg| row(arg) } + TableRenderer.new(columns, rows, pretty: @pretty).render_lines.join("\n") + end + + private + + # @param [InsertArguments] args + def determine_columns(args) + if args.columns.present? + invalid = args.columns - (@is_body ? BODY_PARAMS_COLUMNS : URL_PARAMS_COLUMNS) + raise ArgumentError, "Invalid column(s): #{invalid.join(', ')}." unless invalid.empty? 
+ return args.columns + end + + required = @parameters.any?(&:required) ? 'Required' : nil + default = @parameters.any? { |p| p.default.present? } ? 'Default' : nil + name = @is_body ? 'Property' : 'Parameter' + [name, required, 'Data type', 'Description', default].compact + end + + # @param [Array<Api::Parameter>] parameters + # @param [InsertArguments] args + def filter_parameters(parameters, args) + parameters = parameters.reject(&:deprecated) unless args.include_deprecated + parameters.sort_by { |arg| [arg.required ? 0 : 1, arg.deprecated ? 1 : 0, arg.name] } + end + + def row(param) + parameter = "`#{param.name}`#{' <br> _DEPRECATED_' if param.deprecated}" + { + 'Parameter' => parameter, + 'Property' => parameter, + 'Description' => description(param), + 'Required' => param.required ? @config.required_column.true_text : @config.required_column.false_text, + 'Data type' => param.doc_type, + 'Default' => param.default.nil? ? @config.default_column.empty_text : "`#{param.default}`" + } + end + + # @param [Api::Parameter] param + def description(param) + deprecation = deprecation(param) + required = param.required && @columns.exclude?('Required') ? '**(Required)**' : '' + description = param.description + default = param.default.nil? || @columns.include?('Default') ? '' : "_(Default: `#{param.default}`)_" + valid_values = valid_values(param) + + main_line = [deprecation, required, description, default].compact.map(&:strip).reject(&:empty?).join(' ') + [main_line, valid_values].reject(&:empty?).join(' <br> ') + end + + # @param [Api::Parameter] param + def valid_values(param) + enums = extract_enum_values(param.schema)&.compact + return '' unless enums.present? + if enums.none? { |enum| enum[:description].present? } + "Valid values are: #{enums.map { |enum| "`#{enum[:value]}`" }.join(', ').gsub(/, ([^,]+)$/, ', and \1')}." + else + "Valid values are: <br> #{enums.map { |enum| "- `#{enum[:value]}`: #{enum[:description]}" } + .join(' <br> ')}" + end + end + + # @param [SpecHash] schema + # @return [Hash] + def extract_enum_values(schema) + return schema.enum.map { |value| { value: } } if schema.enum.present? + if schema.const.present? + { value: schema.const, description: schema.description } + elsif schema.oneOf.present? + schema.oneOf.map { |sch| extract_enum_values(sch) }.flatten + end + end + + def deprecation(param) + message = ": #{param.deprecation_message}" if param.deprecation_message.present? + since = " since #{param.version_deprecated}" if param.version_deprecated.present? 
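The `valid_values` helper above turns plain enum values into the "Valid values are: ..." sentences seen in the fixture tables later in this diff. The serial comma is produced by a `gsub` on the joined list; a standalone sketch:

```ruby
values = %w[open closed none all]
list = values.map { |v| "`#{v}`" }.join(', ').gsub(/, ([^,]+)$/, ', and \1')
"Valid values are: #{list}."
# => "Valid values are: `open`, `closed`, `none`, and `all`."
```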
+ "_(Deprecated#{since}#{message})_" if param.deprecated + end +end diff --git a/spec-insert/lib/renderers/components/table_renderer.rb b/spec-insert/lib/renderers/components/table_renderer.rb new file mode 100644 index 00000000000..447e119cd3c --- /dev/null +++ b/spec-insert/lib/renderers/components/table_renderer.rb @@ -0,0 +1,58 @@ +# frozen_string_literal: true + +# TableRenderer renders a markdown table with the given columns and rows +class TableRenderer + # Column object for rendering markdown tables + class Column + attr_reader :title, :key + attr_accessor :width + + # @param [String] title display title + # @param [String | Symbol] key key to access in row hash + def initialize(title, key) + @title = title + @key = key + @width = 0 + end + end + + # @param [Array<Column>] columns + # @param [Array<Hash>] rows + # @param [Boolean] pretty whether to render a pretty table or a compact one + def initialize(columns, rows, pretty:) + @column = columns + @rows = rows + @pretty = pretty + end + + # @return [Array<String>] + def render_lines + calculate_column_widths if @pretty + ['', render_column, render_divider] + render_rows + [''] + end + + private + + def calculate_column_widths + @column.each do |column| + column.width = [@rows.map { |row| row[column.key].to_s.length }.max || 0, column.title.length].max + end + end + + def render_column + columns = @column.map { |column| column.title.ljust(column.width) }.join(' | ') + "| #{columns} |" + end + + def render_divider + dividers = @column.map { |column| ":#{'-' * [column.width + 1, 3].max}" } + @pretty ? "|#{dividers.join('|')}|" : "| #{dividers.join(' | ')} |" + end + + def render_rows + @rows.map do |row| + cells = @column.map { |column| row[column.key].to_s.ljust(column.width).gsub('|', '\|') }.join(' | ') + "| #{cells} |" + end + end +end diff --git a/spec-insert/lib/renderers/endpoints.rb b/spec-insert/lib/renderers/endpoints.rb new file mode 100644 index 00000000000..49b1155f2ab --- /dev/null +++ b/spec-insert/lib/renderers/endpoints.rb @@ -0,0 +1,15 @@ +# frozen_string_literal: true + +require_relative 'components/base_mustache_renderer' + +# Renders Endpoints +class Endpoints < BaseMustacheRenderer + self.template_file = "#{__dir__}/templates/endpoints.mustache" + + def operations + ljust = @action.operations.map { |op| op.http_verb.length }.max + @action.operations + .sort_by { |op| [op.url.length, op.http_verb] } + .map { |op| { verb: op.http_verb.ljust(ljust), path: op.url } } + end +end diff --git a/spec-insert/lib/renderers/path_parameters.rb b/spec-insert/lib/renderers/path_parameters.rb new file mode 100644 index 00000000000..0c89ec582c8 --- /dev/null +++ b/spec-insert/lib/renderers/path_parameters.rb @@ -0,0 +1,23 @@ +# frozen_string_literal: true + +require_relative 'components/base_mustache_renderer' +require_relative 'components/parameter_table_renderer' + +# Renders path parameters +class PathParameters < BaseMustacheRenderer + self.template_file = "#{__dir__}/templates/path_parameters.mustache" + + def table + ParameterTableRenderer.new(params, @args).render + end + + def optional + params.none?(&:required) + end + + private + + def params + @params ||= @action.path_parameters + end +end diff --git a/spec-insert/lib/renderers/query_parameters.rb b/spec-insert/lib/renderers/query_parameters.rb new file mode 100644 index 00000000000..2073cb321c1 --- /dev/null +++ b/spec-insert/lib/renderers/query_parameters.rb @@ -0,0 +1,26 @@ +# frozen_string_literal: true + +require_relative 'components/base_mustache_renderer' 
+require_relative 'components/parameter_table_renderer' + +# Renders query parameters +class QueryParameters < BaseMustacheRenderer + self.template_file = "#{__dir__}/templates/query_parameters.mustache" + + def table + ParameterTableRenderer.new(params, @args).render + end + + def optional + params.none?(&:required) + end + + private + + def params + return @params if defined?(@params) + @params = @action.query_parameters + @params += Api::Parameter.global if @args.include_global + @params + end +end diff --git a/spec-insert/lib/renderers/spec_insert.rb b/spec-insert/lib/renderers/spec_insert.rb new file mode 100644 index 00000000000..87289a9baf8 --- /dev/null +++ b/spec-insert/lib/renderers/spec_insert.rb @@ -0,0 +1,47 @@ +# frozen_string_literal: true + +require_relative 'components/base_mustache_renderer' +require_relative '../api/action' +require_relative '../spec_insert_error' +require_relative 'endpoints' +require_relative 'path_parameters' +require_relative 'query_parameters' +require_relative 'body_parameters' + +# Class to render spec insertions +class SpecInsert < BaseMustacheRenderer + self.template_file = "#{__dir__}/templates/spec_insert.mustache" + + # @param [InsertArguments] + def initialize(args) + action = Api::Action.by_full_name[args.api] + super(action, args) + raise SpecInsertError, '`api` argument not specified.' unless @args.api + raise SpecInsertError, "API Action '#{@args.api}' does not exist in the spec." unless @action + end + + def arguments + @args.raw.map { |key, value| { key:, value: } } + end + + def api; @args.api end + def component; @args.component end + + def content + raise SpecInsertError, '`component` argument not specified.' unless @args.component + case @args.component.to_sym + when :query_parameters + QueryParameters.new(@action, @args).render + when :path_parameters + PathParameters.new(@action, @args).render + when :endpoints + Endpoints.new(@action, @args).render + when :request_body_parameters + BodyParameters.new(@action, @args, is_request: true).render + when :response_body_parameters + BodyParameters.new(@action, @args, is_request: false).render + else + raise SpecInsertError, "Invalid component: #{@args.component}" + end + end +end diff --git a/spec-insert/lib/renderers/templates/body_parameters.mustache b/spec-insert/lib/renderers/templates/body_parameters.mustache new file mode 100644 index 00000000000..c8831edfb41 --- /dev/null +++ b/spec-insert/lib/renderers/templates/body_parameters.mustache @@ -0,0 +1,15 @@ +{{^omit_header}} +## {{{header}}} + +{{{description}}} +{{/omit_header}} +{{#root_tables}}{{{.}}}{{/root_tables}}{{#descendants}} +<details markdown="block" name="{{{block_name}}}"> + <summary> + {{{summary}}} + </summary> + {: .text-delta} + +{{{description}}} +{{#descendant_tables}}{{{.}}}{{/descendant_tables}} +</details>{{/descendants}} \ No newline at end of file diff --git a/spec-insert/lib/renderers/templates/endpoints.mustache b/spec-insert/lib/renderers/templates/endpoints.mustache new file mode 100644 index 00000000000..0541921aa85 --- /dev/null +++ b/spec-insert/lib/renderers/templates/endpoints.mustache @@ -0,0 +1,8 @@ +{{^omit_header}} +## Endpoints +{{/omit_header}} +```json +{{#operations}} +{{{verb}}} {{{path}}} +{{/operations}} +``` \ No newline at end of file diff --git a/spec-insert/lib/renderers/templates/path_parameters.mustache b/spec-insert/lib/renderers/templates/path_parameters.mustache new file mode 100644 index 00000000000..47c96cdeccd --- /dev/null +++ 
b/spec-insert/lib/renderers/templates/path_parameters.mustache @@ -0,0 +1,6 @@ +{{^omit_header}} +## Path parameters + +The following table lists the available path parameters.{{#optional}} All path parameters are optional.{{/optional}} +{{/omit_header}} +{{{table}}} \ No newline at end of file diff --git a/spec-insert/lib/renderers/templates/query_parameters.mustache b/spec-insert/lib/renderers/templates/query_parameters.mustache new file mode 100644 index 00000000000..4d7f57fe4e9 --- /dev/null +++ b/spec-insert/lib/renderers/templates/query_parameters.mustache @@ -0,0 +1,6 @@ +{{^omit_header}} +## Query parameters + +The following table lists the available query parameters.{{#optional}} All query parameters are optional.{{/optional}} +{{/omit_header}} +{{{table}}} \ No newline at end of file diff --git a/spec-insert/lib/renderers/templates/spec_insert.mustache b/spec-insert/lib/renderers/templates/spec_insert.mustache new file mode 100644 index 00000000000..f0ad64d0719 --- /dev/null +++ b/spec-insert/lib/renderers/templates/spec_insert.mustache @@ -0,0 +1,12 @@ +<!-- spec_insert_start +{{#arguments}} +{{{key}}}: {{{value}}} +{{/arguments}} +--> +{{#content}} +{{{.}}} +{{/content}} +{{^content}} +<!-- API {{{api}}} does NOT have a {{{component}}} component --> +{{/content}} +<!-- spec_insert_end --> diff --git a/spec-insert/lib/reports/dry_run.rb b/spec-insert/lib/reports/dry_run.rb new file mode 100644 index 00000000000..06e51c63d86 --- /dev/null +++ b/spec-insert/lib/reports/dry_run.rb @@ -0,0 +1,35 @@ +# frozen_string_literal: true + +require 'mustache' +require_relative '../utils' +require_relative '../renderers/spec_insert' +require_relative '../insert_arguments' + +# Generate a dry run for a specific API action +class DryRun < Mustache + self.template_path = "#{__dir__}/templates" + self.template_file = "#{__dir__}/templates/dry_run.mustache" + + # @param [Api::Action] action + # @param [Hash{String => String[]] errors_report + def initialize(action:, errors_report:) + super + @action = action + @errors_report = errors_report + puts "Generating dry run for #{action.full_name}" + end + + def api_name + @action.full_name + end + + def components + ::Utils::COMPONENTS.map do |id, name| + args = InsertArguments.new(api: @action.full_name, component: id) + { component: SpecInsert.new(args).render, error: false } + rescue StandardError, SystemStackError => e + @errors_report[id] << { api: @action.full_name, message: e.message } + { message: e.message, component: name, error: true } + end + end +end diff --git a/spec-insert/lib/reports/dry_run_report.rb b/spec-insert/lib/reports/dry_run_report.rb new file mode 100644 index 00000000000..ce165a04361 --- /dev/null +++ b/spec-insert/lib/reports/dry_run_report.rb @@ -0,0 +1,44 @@ +# frozen_string_literal: true + +require_relative '../utils' +require_relative '../api/action' +require_relative 'dry_run' + +# Generate a dry run report for all API actions +class DryRunReport < Mustache + self.template_path = "#{__dir__}/templates" + self.template_file = "#{__dir__}/templates/dry_run_report.mustache" + + OUTPUT_DIR = File.join(::Utils::SPEC_INSERT_DIR, 'dry-run') + + def initialize + super + @errors_report = Hash.new { |hash, key| hash[key] = [] } + generate_dry_runs + end + + def any_errors + @errors_report.keys.any? 
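The dry-run report above collects failures in a hash whose values default to empty arrays, so each component key can be appended to without prior initialization. A small sketch of that pattern with a made-up failure entry:

```ruby
errors_report = Hash.new { |hash, key| hash[key] = [] }

# Hypothetical entry, shaped like the ones DryRun#components records:
errors_report['request_body_parameters'] << { api: 'search', message: 'example error' }

errors_report.keys.any?    # => true
errors_report['endpoints'] # => [] (created on first access)
```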
+ end + + def errors + ::Utils::COMPONENTS.map do |comp_id, comp_name| + { component_name: comp_name, + component_id: comp_id, + apis: @errors_report[comp_id], + error_count: @errors_report[comp_id].count } + end + end + + private + + def generate_dry_runs + FileUtils.rm_rf(OUTPUT_DIR) + FileUtils.mkdir_p(OUTPUT_DIR) + Api::Action.all.each do |action| + dry_run = DryRun.new(action:, errors_report: @errors_report) + file = File.join(OUTPUT_DIR, "#{action.full_name}.md") + File.write(file, dry_run.render) + end + end +end diff --git a/spec-insert/lib/reports/templates/dry_run.mustache b/spec-insert/lib/reports/templates/dry_run.mustache new file mode 100644 index 00000000000..523e6ba8c24 --- /dev/null +++ b/spec-insert/lib/reports/templates/dry_run.mustache @@ -0,0 +1,20 @@ +# Rendered Components of API `{{{api_name}}}` + +{{#components}} +{{#error}} + +--- + +### <span style="color:rgba(255,0,0,0.5);"> Component `{{{component}}}` failed to render due to the following error: </span> + +``` +{{{message}}} +``` + +--- + +{{/error}} +{{^error}} +{{{component}}} +{{/error}} +{{/components}} diff --git a/spec-insert/lib/reports/templates/dry_run_report.mustache b/spec-insert/lib/reports/templates/dry_run_report.mustache new file mode 100644 index 00000000000..dfa7d32f60b --- /dev/null +++ b/spec-insert/lib/reports/templates/dry_run_report.mustache @@ -0,0 +1,24 @@ +# Dry Run Report for Spec Insert + +A file containing all rendered components for API has been created in the [dry run directory](./dry-run). +{{#any_errors}} +Below are the components that have errors in the dry run and the commands that will reproduce them. + +{{#errors}} + +## {{{component_name}}}: {{error_count}} errors +{{#apis}} +### {{{api}}} + +<div style="color:rgb(255,94,94); background-color: black; padding: 10px">{{{message}}}</div> + +```shell + bundle exec rake dry_run_generate[{{{api}}},{{{component_id}}}] +``` +{{/apis}} +{{/errors}} +{{/any_errors}} + +{{^any_errors}} +No errors occurred during the dry run :D +{{/any_errors}} diff --git a/spec-insert/lib/reports/templates/utilization_coverage.mustache b/spec-insert/lib/reports/templates/utilization_coverage.mustache new file mode 100644 index 00000000000..fa6641a3049 --- /dev/null +++ b/spec-insert/lib/reports/templates/utilization_coverage.mustache @@ -0,0 +1,19 @@ +## Spec Insert Utilization Coverage + +The following table is a summary of the utilization coverage of each Spec Insert component. The utilization coverage is calculated as the percentage of APIs in the spec that have had the component inserted. The coverage is further broken down by the namespace for each component. 
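The coverage figure described in the paragraph above is computed in `UtilizationCoverage#components`, introduced later in this diff, as a simple percentage. A sketch with made-up counts:

```ruby
utilization = 42   # hypothetical: APIs that already have this component inserted
total       = 120  # hypothetical: all APIs present in the spec
percent = (utilization.to_f / total * 100).round(2)  # => 35.0
```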
+{{#components}} +<blockquote> +<details name="component"> + <summary><b style="font-size: 120%">{{{component}}}: {{percent}}% </b>- {{utilization}} of {{total}} APIs covered</summary> +{{#namespaces}} +<details name="namespace" style="padding: 5px"> +<summary style="background-color: #222; color: #ddd; padding: 7px"><code>{{{namespace}}}</code> namespace: {{utilization}}/{{total}}</summary> + +{{#actions}} +- {{#utilized}}<span style="color: green;">✓</span>{{/utilized}}{{^utilized}}<span style="color: red;">✗</span>{{/utilized}} {{{name}}} +{{/actions}} +</details> +{{/namespaces}} +</details> +</blockquote> +{{/components}} diff --git a/spec-insert/lib/reports/utilization_coverage.rb b/spec-insert/lib/reports/utilization_coverage.rb new file mode 100644 index 00000000000..a35b5cee1c2 --- /dev/null +++ b/spec-insert/lib/reports/utilization_coverage.rb @@ -0,0 +1,47 @@ +# frozen_string_literal: true + +require 'mustache' +require 'logger' +require_relative '../utils' +require_relative '../doc_processor' +require_relative '../api/action' + +# Renders utilization coverage of Spec-Insert components to a markdown file +class UtilizationCoverage < Mustache + self.template_file = "#{__dir__}/templates/utilization_coverage.mustache" + + def components + total = Api::Action.all.count + ::Utils::COMPONENTS.map do |id, component| + utilization = utilized_components.values.flatten.count { |comp| comp == id } + percent = (utilization.to_f / total * 100).round(2) + { component:, utilization:, total:, percent:, namespaces: namespace_utilization(id) } + end + end + + private + + def namespace_utilization(component) + Api::Action.by_namespace.entries.sort_by(&:first).map do |namespace, actions| + namespace = '[root]' unless namespace.present? + actions = actions.map do |action| + { name: action.full_name, + utilized: utilized_components[action.full_name]&.include?(component) } + end.sort_by { |action| action[:name] } + total = actions.count + utilization = actions.count { |action| action[:utilized] } + percent = (utilization.to_f / total * 100).round(2) + { namespace:, utilization:, total:, percent:, actions: } + end + end + + # @return [Hash] where each is an API/action name and each value is an array of generated component for that API + def utilized_components + @utilized_components ||= begin + logger = Logger.new(IO::NULL) + spec_inserts = ::Utils.target_files.flat_map { |file| DocProcessor.new(file, logger:).spec_inserts } + Set.new(spec_inserts.map { |insert| [insert.args.api, insert.args.component] }) + .to_a.group_by(&:first).transform_values { |values| values.map(&:last) } + end + end +end diff --git a/spec-insert/lib/spec_hash.rb b/spec-insert/lib/spec_hash.rb new file mode 100644 index 00000000000..5a49fa6da77 --- /dev/null +++ b/spec-insert/lib/spec_hash.rb @@ -0,0 +1,46 @@ +# frozen_string_literal: true + +require_relative 'config' +require_relative 'dot_hash' +require_relative 'api/action' +require_relative 'api/parameter' + +# Spec class for parsing OpenAPI spec +# It's basically a wrapper around a Hash that allows for accessing hash values as object attributes +# and resolving of $refs +class SpecHash < DotHash + def self.load_file(file_path) + @root = YAML.load_file(file_path) + parsed = SpecHash.new(@root) + Api::Action.actions = parsed + Api::Parameter.global = parsed + end + + # @return [Hash] Root of the raw OpenAPI Spec used to resolve $refs + class << self; attr_accessor :root; end + + def description + text = @hash['description'] + return unless text.present? 
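The replacement loop that follows rewrites spec wording using `text_replacements` entries from `config.yml`. A standalone sketch with a hypothetical entry; the fixtures in this diff suggest one that swaps "master" for "cluster manager":

```ruby
text = 'Expand wildcards for master nodes only.'
replacements = [{ 'replace' => 'master', 'with' => 'cluster manager' }] # hypothetical config entry

replacements.each { |h| text.gsub!(h['replace'], h['with']) }
text # => "Expand wildcards for cluster manager nodes only."
```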
+ CONFIG.text_replacements.each { |h| text.gsub!(h['replace'], h['with']) } + text + end + + private + + def parse(value) + return value.map { |v| parse(v) } if value.is_a?(Array) + return value if value.is_a?(SpecHash.class) + return value unless value.is_a?(Hash) + ref = value.delete('$ref') + value.transform_values! { |v| parse(v) } + value.merge!(resolve(ref)) if ref + SpecHash.new(value) + end + + def resolve(ref) + parts = ref.split('/') + parts.shift + self.class.root.dig(*parts) + end +end diff --git a/spec-insert/lib/spec_insert_error.rb b/spec-insert/lib/spec_insert_error.rb new file mode 100644 index 00000000000..0ee5ccf159d --- /dev/null +++ b/spec-insert/lib/spec_insert_error.rb @@ -0,0 +1,4 @@ +# frozen_string_literal: true + +# Error unique to the SpecInsert process +class SpecInsertError < StandardError; end diff --git a/spec-insert/lib/utils.rb b/spec-insert/lib/utils.rb new file mode 100644 index 00000000000..22bd273db2b --- /dev/null +++ b/spec-insert/lib/utils.rb @@ -0,0 +1,58 @@ +# frozen_string_literal: true + +require 'yaml' +require_relative 'spec_hash' + +# Utility methods for the Spec-Insert +module Utils + REPO_ROOT = File.expand_path('../..', __dir__) + SPEC_INSERT_DIR = File.join(REPO_ROOT, 'spec-insert') + SPEC_FILE = File.join(SPEC_INSERT_DIR, 'opensearch-openapi.yaml') + COMPONENTS = { + 'endpoints' => 'Endpoints', + 'query_parameters' => 'Query Parameters', + 'path_parameters' => 'Path Parameters', + 'request_body_parameters' => 'Request Body Parameters', + 'response_body_parameters' => 'Response Body Parameters' + }.freeze + + # @return [Array<String>] list of markdown files to insert the spec components into + def self.target_files + excluded_paths = config_exclude.map { |path| File.join(REPO_ROOT, path) } + Dir.glob(File.join(REPO_ROOT, '**/*.md')).filter do |file| + excluded_paths.none? { |exc| file.start_with?(exc) } + end + end + + # @return [Array<String>] list of paths excluded by Jekyll + def self.config_exclude + YAML.load_file(File.join(REPO_ROOT, '_config.yml'))['exclude'] + end + + def self.load_spec(forced: false, logger: nil) + download_spec(forced:, logger:) + SpecHash.load_file(SPEC_FILE) + end + + def self.download_spec(forced: false, logger: nil) + return if !forced && File.exist?(SPEC_FILE) && (File.mtime(SPEC_FILE) > 1.day.ago) + logger&.info 'Downloading OpenSearch API specification...' 
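Once the specification has been downloaded, `Utils.load_spec` parses it and registers every action, and the renderers then look actions up by full name. A sketch of that flow, assuming the snippet lives at the repository root with the bundled gems (including `activesupport`) installed and network access to fetch the spec:

```ruby
require 'active_support/all'
require_relative 'spec-insert/lib/utils'

Utils.load_spec                               # downloads the spec if stale, then parses it
action = Api::Action.by_full_name['search']   # full name is "namespace.action", or just the action
action.http_verbs                             # e.g. ["GET", "POST"]
action.urls                                   # e.g. ["/_search", "/{index}/_search"]
```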
+ system 'curl -L -X GET ' \ + 'https://github.com/opensearch-project/opensearch-api-specification' \ + '/releases/download/main-latest/opensearch-openapi.yaml ' \ + "-o #{SPEC_FILE}" + end + + # @param [String] api + # @param [String] component + # @return [Array<string>] lines representing dummy marker used as input for SpecInsert + def self.dummy_marker(api, component) + [ + '<!-- doc_insert_start', + "api: #{api}", + "component: #{component}", + '-->', + '<!-- spec_insert_end -->' + ] + end +end diff --git a/spec-insert/spec/_fixtures/actual_output/.gitignore b/spec-insert/spec/_fixtures/actual_output/.gitignore new file mode 100644 index 00000000000..de056073aff --- /dev/null +++ b/spec-insert/spec/_fixtures/actual_output/.gitignore @@ -0,0 +1 @@ +**/*.md diff --git a/spec-insert/spec/_fixtures/expected_output/body_params_tables.md b/spec-insert/spec/_fixtures/expected_output/body_params_tables.md new file mode 100644 index 00000000000..c490941b25d --- /dev/null +++ b/spec-insert/spec/_fixtures/expected_output/body_params_tables.md @@ -0,0 +1,193 @@ +<!-- spec_insert_start +api: indices.put_settings +component: request_body_parameters +--> +## Request body fields + +The index settings to be updated. + +The request body is __required__. It is a JSON object with the following fields. + +| Property | Data type | Description | +| :--- | :--- | :--- | +| `lifecycle` | Object | | +| `mode` | String | | +| `routing` | Object | | +| `routing_path` | Array of Strings or String | | +| `soft_deletes` | Array of Objects | | +| `soft_deletes.retention_lease.period` | String | A duration. Units can be `nanos`, `micros`, `ms` (milliseconds), `s` (seconds), `m` (minutes), `h` (hours) and `d` (days). Also accepts "0" without a unit and "-1" to indicate an unspecified value. | + +<details markdown="block" name="indices.put_settings::request_body"> + <summary> + Request body fields: <code>lifecycle</code> + </summary> + {: .text-delta} + +`lifecycle` is a JSON object with the following fields. + +| Property | Required | Data type | Description | +| :--- | :--- | :--- | :--- | +| `name` | **Required** | String | | +| `indexing_complete` | _Optional_ | Boolean or String | Certain APIs may return values, including numbers such as epoch timestamps, as strings. This setting captures this behavior while keeping the semantics of the field type. Depending on the target language, code generators can keep the union or remove it and leniently parse strings to the target type. | +| `origination_date` | _Optional_ | Integer or String | Certain APIs may return values, including numbers such as epoch timestamps, as strings. This setting captures this behavior while keeping the semantics of the field type. Depending on the target language, code generators can keep the union or remove it and leniently parse strings to the target type. | +| `parse_origination_date` | _Optional_ | Boolean | Set to `true` to parse the origination date from the index name. This origination date is used to calculate the index age for its phase transitions. The index name must match the pattern `^.*-{date_format}-\\d+`, where the `date_format` is `yyyy.MM.dd` and the trailing digits are optional. An index that was rolled over would normally match the full format, for example `logs-2016.10.31-000002`). If the index name doesn't match the pattern, index creation fails. | +| `rollover_alias` | _Optional_ | String | The index alias to update when the index rolls over. Specify when using a policy that contains a rollover action. 
When the index rolls over, the alias is updated to reflect that the index is no longer the write index. For more information about rolling indexes, see Rollover. | +| `step` | _Optional_ | Object | | + +</details> +<details markdown="block" name="indices.put_settings::request_body"> + <summary> + Request body fields: <code>lifecycle</code> > <code>step</code> + </summary> + {: .text-delta} + +`step` is a JSON object with the following fields. + +| Property | Data type | Description | +| :--- | :--- | :--- | +| `wait_time_threshold` | String | A duration. Units can be `nanos`, `micros`, `ms` (milliseconds), `s` (seconds), `m` (minutes), `h` (hours) and `d` (days). Also accepts "0" without a unit and "-1" to indicate an unspecified value. | + +</details> +<details markdown="block" name="indices.put_settings::request_body"> + <summary> + Request body fields: <code>routing</code> + </summary> + {: .text-delta} + +`routing` is a JSON object with the following fields. + +| Property | Data type | Description | +| :--- | :--- | :--- | +| `allocation` | Object | | +| `rebalance` | Object | | + +</details> +<details markdown="block" name="indices.put_settings::request_body"> + <summary> + Request body fields: <code>routing</code> > <code>allocation</code> + </summary> + {: .text-delta} + +`allocation` is a JSON object with the following fields. + +| Property | Data type | Description | +| :--- | :--- | :--- | +| `disk` | Object | | +| `enable` | String | Valid values are: `all`, `new_primaries`, `none`, and `primaries`. | +| `include` | Object | | +| `initial_recovery` | Object | | +| `total_shards_per_node` | Integer or String | Certain APIs may return values, including numbers such as epoch timestamps, as strings. This setting captures this behavior while keeping the semantics of the field type. Depending on the target language, code generators can keep the union or remove it and leniently parse strings to the target type. | + +</details> +<details markdown="block" name="indices.put_settings::request_body"> + <summary> + Request body fields: <code>routing</code> > <code>allocation</code> > <code>disk</code> + </summary> + {: .text-delta} + +`disk` is a JSON object with the following fields. + +| Property | Data type | Description | +| :--- | :--- | :--- | +| `threshold_enabled` | Boolean or String | Certain APIs may return values, including numbers such as epoch timestamps, as strings. This setting captures this behavior while keeping the semantics of the field type. Depending on the target language, code generators can keep the union or remove it and leniently parse strings to the target type. | + +</details> +<details markdown="block" name="indices.put_settings::request_body"> + <summary> + Request body fields: <code>routing</code> > <code>allocation</code> > <code>include</code> + </summary> + {: .text-delta} + +`include` is a JSON object with the following fields. + +| Property | Data type | Description | +| :--- | :--- | :--- | +| `_id` | String | | +| `_tier_preference` | String | | + +</details> +<details markdown="block" name="indices.put_settings::request_body"> + <summary> + Request body fields: <code>routing</code> > <code>allocation</code> > <code>initial_recovery</code> + </summary> + {: .text-delta} + +`initial_recovery` is a JSON object with the following fields. 
+ +| Property | Data type | Description | +| :--- | :--- | :--- | +| `_id` | String | | + +</details> +<details markdown="block" name="indices.put_settings::request_body"> + <summary> + Request body fields: <code>routing</code> > <code>rebalance</code> + </summary> + {: .text-delta} + +`rebalance` is a JSON object with the following fields. + +| Property | Required | Data type | Description | +| :--- | :--- | :--- | :--- | +| `enable` | **Required** | String | Valid values are: `all`, `none`, `primaries`, and `replicas`. | + +</details> +<details markdown="block" name="indices.put_settings::request_body"> + <summary> + Request body fields: <code>soft_deletes</code> + </summary> + {: .text-delta} + +`soft_deletes` is an __array of JSON objects__ (NDJSON). Each object has the following fields. + +| Property | Data type | Description | +| :--- | :--- | :--- | +| `enabled` | Boolean or String | Certain APIs may return values, including numbers such as epoch timestamps, as strings. This setting captures this behavior while keeping the semantics of the field type. Depending on the target language, code generators can keep the union or remove it and leniently parse strings to the target type. | +| `retention` | Object | The retention settings for soft deletes. | +| `retention_lease` | Object | | + +</details> +<details markdown="block" name="indices.put_settings::request_body"> + <summary> + Request body fields: <code>soft_deletes</code> > <code>retention</code> + </summary> + {: .text-delta} + +The retention settings for soft deletes. + +`retention` is a JSON object with the following fields. + +| Property | Data type | Description | +| :--- | :--- | :--- | +| `operations` | Integer or String | | + +</details> +<details markdown="block" name="indices.put_settings::request_body"> + <summary> + Request body fields: <code>soft_deletes</code> > <code>retention_lease</code> + </summary> + {: .text-delta} + +`retention_lease` is a JSON object with the following fields. + +| Property | Required | Data type | Description | +| :--- | :--- | :--- | :--- | +| `period` | **Required** | String | A duration. Units can be `nanos`, `micros`, `ms` (milliseconds), `s` (seconds), `m` (minutes), `h` (hours) and `d` (days). Also accepts "0" without a unit and "-1" to indicate an unspecified value. | + +</details> +<!-- spec_insert_end --> + + +<!-- spec_insert_start +api: indices.put_settings +component: response_body_parameters +--> +## Response body fields + +The response body is a JSON object with the following fields. + +| Property | Required | Data type | Description | +| :--- | :--- | :--- | :--- | +| `acknowledged` | **Required** | Boolean | For a successful response, this value is always true. On failure, an exception is returned instead. 
| + +<!-- spec_insert_end --> diff --git a/spec-insert/spec/_fixtures/expected_output/endpoints.md b/spec-insert/spec/_fixtures/expected_output/endpoints.md new file mode 100644 index 00000000000..cffad018cb9 --- /dev/null +++ b/spec-insert/spec/_fixtures/expected_output/endpoints.md @@ -0,0 +1,13 @@ + +<!-- spec_insert_start +api: search +component: endpoints +--> +## Endpoints +```json +GET /_search +POST /_search +GET /{index}/_search +POST /{index}/_search +``` +<!-- spec_insert_end --> diff --git a/spec-insert/spec/_fixtures/expected_output/url_params_tables.md b/spec-insert/spec/_fixtures/expected_output/url_params_tables.md new file mode 100644 index 00000000000..d604e9a592b --- /dev/null +++ b/spec-insert/spec/_fixtures/expected_output/url_params_tables.md @@ -0,0 +1,74 @@ +Typical Path Parameters Example + +<!-- spec_insert_start +api: search +component: path_parameters +--> +## Path parameters + +The following table lists the available path parameters. All path parameters are optional. + +| Parameter | Data type | Description | +| :--- | :--- | :--- | +| `index` | List or String | Comma-separated list of data streams, indexes, and aliases to search. Supports wildcards (`*`). To search all data streams and indexes, omit this parameter or use `*` or `_all`. <br> Valid values are: `_all`, `_any`, and `_none`. | + +<!-- spec_insert_end --> + +Query Parameters Example with Global Parameters, Pretty Print, and Custom Columns + +<!-- spec_insert_start +api: search +component: query_parameters +include_global: true +pretty: true +columns: Data type, Parameter, Description, Required, Default +--> +## Query parameters + +The following table lists the available query parameters. + +| Data type | Parameter | Description | Required | Default | +|:---------------|:--------------------------|:-----------------------------------------------------------------------------------------------------------------------------------|:-------------|:--------| +| Boolean | `analyze_wildcard` | If true, wildcard and prefix queries are analyzed. This parameter can only be used when the q query string parameter is specified. | **Required** | `false` | +| String | `analyzer` | Analyzer to use for the query string. This parameter can only be used when the q query string parameter is specified. | _Optional_ | N/A | +| List or String | `expand_wildcards` | Comma-separated list of expand wildcard options. <br> Valid values are: `open`, `closed`, `none`, and `all`. | _Optional_ | N/A | +| Boolean | `pretty` | Whether to pretty format the returned JSON response. | _Optional_ | N/A | +| Boolean | `human` <br> _DEPRECATED_ | _(Deprecated since 3.0: Use the `format` parameter instead.)_ Whether to return human readable values for statistics. | _Optional_ | `true` | + +<!-- spec_insert_end --> + +Query Parameters Example with only Parameter and Description Columns + +<!-- spec_insert_start +api: search +component: query_parameters +columns: Parameter, Description +omit_header: true +--> + +| Parameter | Description | +| :--- | :--- | +| `analyze_wildcard` | **(Required)** If true, wildcard and prefix queries are analyzed. This parameter can only be used when the q query string parameter is specified. _(Default: `false`)_ | +| `analyzer` | Analyzer to use for the query string. This parameter can only be used when the q query string parameter is specified. | +| `expand_wildcards` | Comma-separated list of expand wildcard options. <br> Valid values are: `open`, `closed`, `none`, and `all`. 
| + +<!-- spec_insert_end --> + +Optional Params Text + +<!-- spec_insert_start +api: cat.health +component: query_parameters +include_global: true +--> +## Query parameters + +The following table lists the available query parameters. + +| Parameter | Required | Data type | Description | Default | +| :--- | :--- | :--- | :--- | :--- | +| `expand_wildcard` | **Required** | String | Whether to expand wildcard expression to concrete indices that are open, closed, or both. For more information, see [cat health API]({{site.url}}{{site.baseurl}}/api-reference/cat/health/). <br> Valid values are: <br> - `open`: Expand wildcards to open indices only. <br> - `closed`: Expand wildcards to closed indices only. <br> - `master`: Expand wildcards for cluster manager nodes only. | N/A | +| `pretty` | _Optional_ | Boolean | Whether to pretty format the returned JSON response. | N/A | +| `human` <br> _DEPRECATED_ | _Optional_ | Boolean | _(Deprecated since 3.0: Use the `format` parameter instead.)_ Whether to return human readable values for statistics. | `true` | + +<!-- spec_insert_end --> diff --git a/spec-insert/spec/_fixtures/input/body_params_tables.md b/spec-insert/spec/_fixtures/input/body_params_tables.md new file mode 100644 index 00000000000..b2aa818370b --- /dev/null +++ b/spec-insert/spec/_fixtures/input/body_params_tables.md @@ -0,0 +1,12 @@ +<!-- spec_insert_start +api: indices.put_settings +component: request_body_parameters +--> +<!-- spec_insert_end --> + + +<!-- spec_insert_start +api: indices.put_settings +component: response_body_parameters +--> +<!-- spec_insert_end --> \ No newline at end of file diff --git a/spec-insert/spec/_fixtures/input/endpoints.md b/spec-insert/spec/_fixtures/input/endpoints.md new file mode 100644 index 00000000000..5b304591030 --- /dev/null +++ b/spec-insert/spec/_fixtures/input/endpoints.md @@ -0,0 +1,6 @@ + +<!-- spec_insert_start +api: search +component: endpoints +--> +<!-- spec_insert_end --> diff --git a/spec-insert/spec/_fixtures/input/url_params_tables.md b/spec-insert/spec/_fixtures/input/url_params_tables.md new file mode 100644 index 00000000000..c10a9bb33f8 --- /dev/null +++ b/spec-insert/spec/_fixtures/input/url_params_tables.md @@ -0,0 +1,48 @@ +Typical Path Parameters Example + +<!-- spec_insert_start +api: search +component: path_parameters +--> +THIS + TEXT + SHOULD + BE + REPLACED +<!-- spec_insert_end --> + +Query Parameters Example with Global Parameters, Pretty Print, and Custom Columns + +<!-- spec_insert_start +api: search +component: query_parameters +include_global: true +pretty: true +columns: Data type, Parameter, Description, Required, Default +--> + THIS TEXT SHOULD BE REPLACED +<!-- spec_insert_end --> + +Query Parameters Example with only Parameter and Description Columns + +<!-- spec_insert_start +api: search +component: query_parameters +columns: Parameter, Description +omit_header: true +--> +THIS +TEXT +SHOULD +BE +REPLACED +<!-- spec_insert_end --> + +Optional Params Text + +<!-- spec_insert_start +api: cat.health +component: query_parameters +include_global: true +--> +<!-- spec_insert_end --> diff --git a/spec-insert/spec/_fixtures/opensearch_spec.yaml b/spec-insert/spec/_fixtures/opensearch_spec.yaml new file mode 100644 index 00000000000..d0cb3efec29 --- /dev/null +++ b/spec-insert/spec/_fixtures/opensearch_spec.yaml @@ -0,0 +1,456 @@ +openapi: 3.1.0 +info: + title: OpenSearch API Specification + version: 1.0.0 + x-api-version: 2.16.0 +paths: + /_cat/health: + get: + operationId: cat_health.0 + x-operation-group: 
cat.health + parameters: + - $ref: '#/components/parameters/_global___query.pretty' + - $ref: '#/components/parameters/_global___query.human' + - $ref: '#/components/parameters/cat.health___query.expand_wildcard' + /_search: + get: + operationId: search.0 + x-operation-group: search + x-version-added: '1.0' + description: Returns results matching a query. + externalDocs: + url: https://opensearch.org/docs/latest/api-reference/search/ + parameters: + - $ref: '#/components/parameters/search___query.analyze_wildcard' + - $ref: '#/components/parameters/search___query.analyzer' + post: + operationId: search.1 + x-operation-group: search + x-version-added: '1.0' + description: Returns results matching a query. + externalDocs: + url: https://opensearch.org/docs/latest/api-reference/search/ + parameters: + - $ref: '#/components/parameters/search___query.analyze_wildcard' + - $ref: '#/components/parameters/search___query.analyzer' + - $ref: '#/components/parameters/search___query.expand_wildcards' + /{index}/_search: + get: + operationId: search.2 + x-operation-group: search + x-version-added: '1.0' + description: Returns results matching a query. + externalDocs: + url: https://opensearch.org/docs/latest/api-reference/search/ + parameters: + - $ref: '#/components/parameters/search___path.index' + - $ref: '#/components/parameters/search___query.analyze_wildcard' + - $ref: '#/components/parameters/search___query.analyzer' + - $ref: '#/components/parameters/search___query.expand_wildcards' + post: + operationId: search.3 + x-operation-group: search + x-version-added: '1.0' + description: Returns results matching a query. + externalDocs: + url: https://opensearch.org/docs/latest/api-reference/search/ + parameters: + - $ref: '#/components/parameters/search___path.index' + - $ref: '#/components/parameters/search___query.analyze_wildcard' + - $ref: '#/components/parameters/search___query.analyzer' + - $ref: '#/components/parameters/search___query.expand_wildcards' + /_settings: + put: + operationId: indices.put_settings.0 + x-operation-group: indices.put_settings + x-version-added: '1.0' + description: Updates the index settings. + externalDocs: + url: https://opensearch.org/docs/latest/api-reference/index-apis/update-settings/ + requestBody: + $ref: '#/components/requestBodies/indices.put_settings' + responses: + '200': + $ref: '#/components/responses/indices.put_settings___200' + + /_bulk: + post: + operationId: bulk.0 + x-operation-group: bulk + x-version-added: '1.0' + description: Allows to perform multiple index/update/delete operations in a single request. + externalDocs: + url: https://opensearch.org/docs/latest/api-reference/document-apis/bulk/ + requestBody: + $ref: '#/components/requestBodies/bulk' +components: + + parameters: + + _global___query.pretty: + name: pretty + in: query + description: Whether to pretty format the returned JSON response. + schema: + type: boolean + default: false + x-global: true + + _global___query.human: + name: human + in: query + description: Whether to return human readable values for statistics. + schema: + type: boolean + default: true + x-global: true + deprecated: true + x-version-deprecated: '3.0' + x-deprecation-message: Use the `format` parameter instead. + + cat.health___query.expand_wildcard: + in: query + required: true + name: expand_wildcard + description: |- + Whether to expand wildcard expression to concrete indices that are open, closed, or both. 
+ For more information, see [cat health API](https://opensearch.org/docs/latest/api-reference/cat/health/). + schema: + oneOf: + - type: string + const: open + description: Expand wildcards to open indices only. + - type: string + const: closed + description: Expand wildcards to closed indices only. + - type: string + const: master + description: Expand wildcards for master nodes only. + + search___path.index: + in: path + name: index + description: |- + Comma-separated list of data streams, indexes, and aliases to search. + Supports wildcards (`*`). + To search all data streams and indexes, omit this parameter or use `*` or `_all`. + required: true + schema: + $ref: '#/components/schemas/_common___Indices' + style: simple + + search___query.analyze_wildcard: + in: query + name: analyze_wildcard + required: true + description: |- + If true, wildcard and prefix queries are analyzed. + This parameter can only be used when the q query string parameter is specified. + schema: + type: boolean + default: false + style: form + + search___query.analyzer: + in: query + name: analyzer + description: |- + Analyzer to use for the query string. + This parameter can only be used when the q query string parameter is specified. + schema: + type: string + style: form + + search___query.expand_wildcards: + in: query + name: expand_wildcards + description: |- + Comma-separated list of expand wildcard options. + schema: + oneOf: + - $ref: '#/components/schemas/_common___ExpandWildcardsCompact' + - type: array + items: + $ref: '#/components/schemas/_common___ExpandWildcardsCompact' + + requestBodies: + + indices.put_settings: + required: true + content: + application/json: + schema: + title: settings + $ref: '#/components/schemas/indices._common___IndexSettings' + + bulk: + content: + application/x-ndjson: + schema: + type: array + items: + anyOf: + - type: object + - type: object + properties: + index: + type: string + action: + type: string + enum: [index, create, delete, update] + data: + type: object + description: The operation definition and data (action-data pairs), separated by newlines + required: true + + responses: + + indices.put_settings___200: + content: + application/json: + schema: + $ref: '#/components/schemas/_common___AcknowledgedResponseBase' + schemas: + + _common___Indices: + oneOf: + - $ref: '#/components/schemas/_common___IndexName' + - $ref: '#/components/schemas/_common___SpecialIndices' + - type: array + items: + $ref: '#/components/schemas/_common___IndexName' + + _common___IndexName: + type: string + + _common___SpecialIndices: + oneOf: + - type: string + const: _all + - type: string + const: _any + - type: string + const: _none + + _common___ExpandWildcardsCompact: + type: string + enum: + - open + - closed + - none + - all + + _common___AcknowledgedResponseBase: + type: object + properties: + acknowledged: + description: For a successful response, this value is always true. On failure, an exception is returned instead. + type: boolean + required: + - acknowledged + + indices._common___IndexSettings: + type: object + description: The index settings to be updated. 
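The `oneOf`/`const` pattern used by `expand_wildcard` above is what the "Valid values are" list in the expected-output table is built from. As a rough sketch, again using plain YAML parsing rather than the plugin's own code, the list can be read off the fixture like this:

```ruby
# Rough sketch (plain YAML parsing, not the plugin's code): enumerate the valid
# values of `expand_wildcard` from the oneOf/const pairs in the fixture spec.
require 'yaml'

spec  = YAML.load_file('spec/_fixtures/opensearch_spec.yaml')
param = spec.dig('components', 'parameters', 'cat.health___query.expand_wildcard')

param.dig('schema', 'oneOf').each do |option|
  puts "- `#{option['const']}`: #{option['description']}"
end
# - `open`: Expand wildcards to open indices only.
# - `closed`: Expand wildcards to closed indices only.
# - `master`: Expand wildcards for master nodes only.
```

The expected output earlier in the diff shows the last entry as "cluster manager nodes only"; that wording comes from the `text_replacements` rules in `mock_config.yml` (added later in this diff), not from the spec fixture itself.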
+ properties: + mode: + type: string + routing_path: + $ref: '#/components/schemas/_common___StringOrStringArray' + soft_deletes: + items: + $ref: '#/components/schemas/indices._common___SoftDeletes' + soft_deletes.retention_lease.period: + $ref: '#/components/schemas/_common___Duration' + routing: + $ref: '#/components/schemas/indices._common___IndexRouting' + lifecycle: + $ref: '#/components/schemas/indices._common___IndexSettingsLifecycle' + + _common___StringOrStringArray: + oneOf: + - type: string + - type: array + items: + type: string + + indices._common___SoftDeletes: + type: object + properties: + enabled: + description: Indicates whether soft deletes are enabled on the index. + $ref: '#/components/schemas/_common___StringifiedBoolean' + retention: + $ref: '#/components/schemas/indices._common___SoftDeletesRetention' + retention_lease: + $ref: '#/components/schemas/indices._common___RetentionLease' + + _common___Duration: + description: |- + A duration. Units can be `nanos`, `micros`, `ms` (milliseconds), `s` (seconds), `m` (minutes), `h` (hours) and + `d` (days). Also accepts "0" without a unit and "-1" to indicate an unspecified value. + pattern: ^(?:(-1)|([0-9\.]+)(?:d|h|m|s|ms|micros|nanos))$ + type: string + + indices._common___IndexRouting: + type: object + properties: + allocation: + $ref: '#/components/schemas/indices._common___IndexRoutingAllocation' + rebalance: + $ref: '#/components/schemas/indices._common___IndexRoutingRebalance' + + _common___StringifiedEpochTimeUnitMillis: + description: |- + Certain APIs may return values, including numbers such as epoch timestamps, as strings. This setting captures + this behavior while keeping the semantics of the field type. + + Depending on the target language, code generators can keep the union or remove it and leniently parse + strings to the target type. + oneOf: + - $ref: '#/components/schemas/_common___EpochTimeUnitMillis' + - type: string + + _common___EpochTimeUnitMillis: + allOf: + - $ref: '#/components/schemas/_common___UnitMillis' + + _common___UnitMillis: + description: The time unit for milliseconds. + type: integer + format: int64 + + indices._common___IndexSettingsLifecycle: + type: object + properties: + name: + type: string + indexing_complete: + $ref: '#/components/schemas/_common___StringifiedBoolean' + origination_date: + description: |- + If specified, this is the timestamp used to calculate the index age for its phase transitions. Use this setting + if you create a new index that contains old data and want to use the original creation date to calculate the index + age. Specified as a Unix epoch value in milliseconds. + $ref: '#/components/schemas/_common___StringifiedEpochTimeUnitMillis' + parse_origination_date: + description: |- + Set to `true` to parse the origination date from the index name. This origination date is used to calculate the index age + for its phase transitions. The index name must match the pattern `^.*-{date_format}-\\d+`, where the `date_format` is + `yyyy.MM.dd` and the trailing digits are optional. An index that was rolled over would normally match the full format, + for example `logs-2016.10.31-000002`). If the index name doesn't match the pattern, index creation fails. + type: boolean + step: + $ref: '#/components/schemas/indices._common___IndexSettingsLifecycleStep' + rollover_alias: + description: |- + The index alias to update when the index rolls over. Specify when using a policy that contains a rollover action. 
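These schemas are what the `request_body_parameters` component has to resolve when it expands the `body_params_tables.md` fixture for `indices.put_settings`. A hand-rolled sketch of that resolution, following the local `$ref`s manually (the real logic lives in the spec-insert library), looks like this:

```ruby
# Hand-rolled sketch (not the spec-insert implementation): list the top-level
# request body fields for indices.put_settings by following local $refs.
require 'yaml'

spec = YAML.load_file('spec/_fixtures/opensearch_spec.yaml')

# Resolve a "#/components/..." reference within the same document.
resolve = ->(ref) { spec.dig(*ref.delete_prefix('#/').split('/')) }

body   = resolve.call(spec.dig('paths', '/_settings', 'put', 'requestBody', '$ref'))
schema = resolve.call(body.dig('content', 'application/json', 'schema', '$ref'))

puts schema['properties'].keys
# mode, routing_path, soft_deletes, soft_deletes.retention_lease.period,
# routing, lifecycle
```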
+ When the index rolls over, the alias is updated to reflect that the index is no longer the write index. For more + information about rolling indexes, see Rollover. + type: string + required: + - name + + _common___StringifiedLong: + oneOf: + - type: integer + format: int64 + - type: string + + _common___StringifiedBoolean: + description: |- + Certain APIs may return values, including numbers such as epoch timestamps, as strings. This setting captures + this behavior while keeping the semantics of the field type. + + Depending on the target language, code generators can keep the union or remove it and leniently parse + strings to the target type. + oneOf: + - type: boolean + - type: string + + _common___StringifiedInteger: + description: |- + Certain APIs may return values, including numbers such as epoch timestamps, as strings. This setting captures + this behavior while keeping the semantics of the field type. + + Depending on the target language, code generators can keep the union or remove it and leniently parse + strings to the target type. + oneOf: + - type: integer + - type: string + + indices._common___SoftDeletesRetention: + type: object + description: The retention settings for soft deletes. + properties: + operations: + $ref: '#/components/schemas/_common___StringifiedLong' + + indices._common___RetentionLease: + type: object + properties: + period: + $ref: '#/components/schemas/_common___Duration' + required: + - period + + indices._common___IndexRoutingAllocation: + type: object + properties: + enable: + $ref: '#/components/schemas/indices._common___IndexRoutingAllocationOptions' + include: + $ref: '#/components/schemas/indices._common___IndexRoutingAllocationInclude' + initial_recovery: + $ref: '#/components/schemas/indices._common___IndexRoutingAllocationInitialRecovery' + disk: + $ref: '#/components/schemas/indices._common___IndexRoutingAllocationDisk' + total_shards_per_node: + $ref: '#/components/schemas/_common___StringifiedInteger' + + indices._common___IndexRoutingAllocationDisk: + type: object + properties: + threshold_enabled: + $ref: '#/components/schemas/_common___StringifiedBoolean' + indices._common___IndexRoutingAllocationInclude: + type: object + properties: + _tier_preference: + type: string + _id: + type: string + indices._common___IndexRoutingAllocationInitialRecovery: + type: object + properties: + _id: + type: string + indices._common___IndexRoutingAllocationOptions: + type: string + enum: + - all + - new_primaries + - none + - primaries + + indices._common___IndexRoutingRebalance: + type: object + properties: + enable: + $ref: '#/components/schemas/indices._common___IndexRoutingRebalanceOptions' + required: + - enable + + indices._common___IndexRoutingRebalanceOptions: + type: string + enum: + - all + - none + - primaries + - replicas + + indices._common___IndexSettingsLifecycleStep: + type: object + properties: + wait_time_threshold: + $ref: '#/components/schemas/_common___Duration' \ No newline at end of file diff --git a/spec-insert/spec/doc_processor_spec.rb b/spec-insert/spec/doc_processor_spec.rb new file mode 100644 index 00000000000..4f2c46b9e7d --- /dev/null +++ b/spec-insert/spec/doc_processor_spec.rb @@ -0,0 +1,28 @@ +# frozen_string_literal: true + +require_relative 'spec_helper' +require_relative '../lib/doc_processor' +require_relative '../lib/spec_hash' + +describe DocProcessor do + SpecHash.load_file('spec/_fixtures/opensearch_spec.yaml') + + def test_file(file_name) + expected_output = 
File.read("#{__dir__}/_fixtures/expected_output/#{file_name}.md") + actual_output = described_class.new("#{__dir__}/_fixtures/input/#{file_name}.md", logger: Logger.new($stdout)).process(write_to_file: false) + File.write("./spec/_fixtures/actual_output/#{file_name}.md", actual_output) + expect(actual_output).to eq(expected_output) + end + + it 'inserts the endpoints correctly' do + test_file('endpoints') + end + + it 'inserts the url param tables correctly' do + test_file('url_params_tables') + end + + it 'inserts the body param tables correctly' do + test_file('body_params_tables') + end +end diff --git a/spec-insert/spec/mock_config.yml b/spec-insert/spec/mock_config.yml new file mode 100644 index 00000000000..96aaeab6e19 --- /dev/null +++ b/spec-insert/spec/mock_config.yml @@ -0,0 +1,16 @@ +param_table: + parameter_column: + freeform_text: -- freeform field -- + default_column: + empty_text: N/A + required_column: + true_text: "**Required**" + false_text: _Optional_ + +text_replacements: + - replace: "https://opensearch.org/docs/latest" + with: "{{site.url}}{{site.baseurl}}" + - replace: "\n" + with: " " + - replace: master node + with: cluster manager node \ No newline at end of file diff --git a/spec-insert/spec/spec_helper.rb b/spec-insert/spec/spec_helper.rb new file mode 100644 index 00000000000..9c85ec48c9e --- /dev/null +++ b/spec-insert/spec/spec_helper.rb @@ -0,0 +1,107 @@ +# This file was generated by the `rspec --init` command. Conventionally, all +# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`. +# The generated `.rspec` file contains `--require spec_helper` which will cause +# this file to always be loaded, without a need to explicitly require it in any +# files. +# +# Given that it is always loaded, you are encouraged to keep this file as +# light-weight as possible. Requiring heavyweight dependencies from this file +# will add to the boot time of your test suite on EVERY test run, even for an +# individual file that may not need all of that loaded. Instead, consider making +# a separate helper file that requires the additional dependencies and performs +# the additional setup, and require it from the spec files that actually need +# it. +# +# See https://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration +RSpec.configure do |config| + # rspec-expectations config goes here. You can use an alternate + # assertion/expectation library such as wrong or the stdlib/minitest + # assertions if you prefer. + config.expect_with :rspec do |expectations| + # This option will default to `true` in RSpec 4. It makes the `description` + # and `failure_message` of custom matchers include text for helper methods + # defined using `chain`, e.g.: + # be_bigger_than(2).and_smaller_than(4).description + # # => "be bigger than 2 and smaller than 4" + # ...rather than: + # # => "be bigger than 2" + expectations.include_chain_clauses_in_custom_matcher_descriptions = true + end + + # rspec-mocks config goes here. You can use an alternate test double + # library (such as bogus or mocha) by changing the `mock_with` option here. + config.mock_with :rspec do |mocks| + # Prevents you from mocking or stubbing a method that does not exist on + # a real object. This is generally recommended, and will default to + # `true` in RSpec 4. + mocks.verify_partial_doubles = true + end + + # This option will default to `:apply_to_host_groups` in RSpec 4 (and will + # have no way to turn it off -- the option exists only for backwards + # compatibility in RSpec 3). 
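The `text_replacements` section of `mock_config.yml` above accounts for the differences between the raw spec descriptions and the expected-output tables: documentation URLs become Liquid variables, newlines collapse to spaces, and "master node" becomes "cluster manager node". The sketch below applies those rules to one description string; whether the real processor applies them in exactly this way is an assumption, so treat it only as a demonstration of what the config format expresses.

```ruby
# Assumed behavior, for illustration: apply the text_replacements rules from
# mock_config.yml to a single description string, in order.
require 'yaml'

config = YAML.load_file('./spec/mock_config.yml')

description = "Whether to expand wildcard expression to concrete indices that are open, closed, or both.\n" \
              "For more information, see [cat health API](https://opensearch.org/docs/latest/api-reference/cat/health/)."

config['text_replacements'].each do |rule|
  description = description.gsub(rule['replace'], rule['with'])
end

puts description
# ... see [cat health API]({{site.url}}{{site.baseurl}}/api-reference/cat/health/).
```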
It causes shared context metadata to be + # inherited by the metadata hash of host groups and examples, rather than + # triggering implicit auto-inclusion in groups with matching metadata. + config.shared_context_metadata_behavior = :apply_to_host_groups + + # The settings below are suggested to provide a good initial experience + # with RSpec, but feel free to customize to your heart's content. + + # This allows you to limit a spec run to individual examples or groups + # you care about by tagging them with `:focus` metadata. When nothing + # is tagged with `:focus`, all examples get run. RSpec also provides + # aliases for `it`, `describe`, and `context` that include `:focus` + # metadata: `fit`, `fdescribe` and `fcontext`, respectively. + config.filter_run_when_matching :focus + + # Allows RSpec to persist some state between runs in order to support + # the `--only-failures` and `--next-failure` CLI options. We recommend + # you configure your source control system to ignore this file. + config.example_status_persistence_file_path = 'rspec_examples.txt' + + # Limits the available syntax to the non-monkey patched syntax that is + # recommended. For more details, see: + # https://rspec.info/features/3-12/rspec-core/configuration/zero-monkey-patching-mode/ + config.disable_monkey_patching! + + # This setting enables warnings. It's recommended, but in some cases may + # be too noisy due to issues in dependencies. + config.warnings = true + + # Many RSpec users commonly either run the entire suite or an individual + # file, and it's useful to allow more verbose expected_output when running an + # individual spec file. + if config.files_to_run.one? + # Use the documentation formatter for detailed expected_output, + # unless a formatter has already been configured + # (e.g. via a command-line flag). + config.default_formatter = 'doc' + end + + # Print the 10 slowest examples and example groups at the + # end of the spec run, to help surface which specs are running + # particularly slow. + config.profile_examples = 10 + + # Run specs in random order to surface order dependencies. If you find an + # order dependency and want to debug it, you can fix the order by providing + # the seed, which is printed after each run. + # --seed 1234 + config.order = :random + + # Seed global randomization in this process using the `--seed` CLI option. + # Setting this allows you to use `--seed` to deterministically reproduce + # test failures related to randomization by passing the same `--seed` value + # as the one that triggered the failure. + Kernel.srand config.seed + + config.expose_dsl_globally = true + + config.before(:each) do + mock_config = SpecHash.new(YAML.load_file('./spec/mock_config.yml')) # Replace with your desired mock value + stub_const('CONFIG', mock_config) + end +end + +require 'active_support/all' +require 'rspec' diff --git a/templates/API_TEMPLATE.md b/templates/API_TEMPLATE.md index 02c0f341d95..736eb74338a 100644 --- a/templates/API_TEMPLATE.md +++ b/templates/API_TEMPLATE.md @@ -11,7 +11,7 @@ Introduced 1.0 The Example API ... (descriptive sentence about what this API does). -## Path and HTTP methods +## Endpoints ```json POST /_example/endpoint/ @@ -46,7 +46,7 @@ The following table lists the available request body fields. ## Example request(s) -**TIP:** If multiple examples exist for the request, seperate those examples using an `h3` header underneath this section. 
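With `spec_helper.rb` stubbing `CONFIG` from `mock_config.yml` before each example, adding coverage for another component is mostly a matter of dropping an input file and an expected-output file into the fixtures and reusing the `test_file` helper from `doc_processor_spec.rb`. A hypothetical example (the fixture name below is made up for illustration):

```ruby
# Hypothetical addition to doc_processor_spec.rb. "my_new_component" is a
# made-up fixture name; it would need matching files under
# spec/_fixtures/input/ and spec/_fixtures/expected_output/.
it 'inserts my new component correctly' do
  test_file('my_new_component')
end
```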
+**TIP:** If multiple examples exist for the request, separate those examples using an `h3` header underneath this section. ### Request with an example object @@ -63,7 +63,7 @@ POST /_example/endpoint/ ``` {% include copy-curl.html %} -## Request without an example object +### Request without an example object The following example shows an API request without an example object: @@ -75,7 +75,7 @@ POST /_example/endpoint/ ## Example response -**TIP:** If multiple response examples exist for the request, seperate those examples using an `h3` header underneath this section, similar to the [Example requests](#example-requests). +**TIP:** If multiple examples exist for the request, separate those examples using an `h3` header under this section, similar to the [Example requests](#example-requests). The following example shows an API response: diff --git a/templates/EXPERIMENTAL_TEMPLATE.md b/templates/EXPERIMENTAL_TEMPLATE.md index 6aa06c58242..676f2842ae1 100644 --- a/templates/EXPERIMENTAL_TEMPLATE.md +++ b/templates/EXPERIMENTAL_TEMPLATE.md @@ -10,5 +10,5 @@ parent: This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://example.issue.link). {: .warning} -This is an experimental feature and is not recommended for use in a production environment. For updates on the progress the feature or if you want to leave feedback, join the discussion in the [OpenSearch forum](https://forum.opensearch.org/). -{: .warning} \ No newline at end of file +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, join the discussion on the [OpenSearch forum](https://forum.opensearch.org/). +{: .warning}
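For reference, the end-to-end flow that the new specs exercise looks roughly like the following. It is assembled from the calls that appear in `doc_processor_spec.rb` and `spec_helper.rb` and assumes it runs from the `spec-insert` directory; outside of RSpec you would also need to supply the `CONFIG` object that the helper stubs, so treat it as an outline rather than a standalone script.

```ruby
# Outline of the fixture round trip (paths assume the spec-insert directory;
# CONFIG is stubbed by spec_helper.rb in the real suite and would have to be
# provided here as well).
require 'logger'
require_relative 'lib/doc_processor'
require_relative 'lib/spec_hash'

SpecHash.load_file('spec/_fixtures/opensearch_spec.yaml')

actual = DocProcessor.new('spec/_fixtures/input/url_params_tables.md',
                          logger: Logger.new($stdout))
                     .process(write_to_file: false)

expected = File.read('spec/_fixtures/expected_output/url_params_tables.md')
puts(actual == expected ? 'fixtures match' : 'fixtures differ')
```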