JohnSnowLabs
diff --git a/‎.github/workflows/publish_docs.yaml
Lines changed: 65 additions & 48 deletions b/‎.github/workflows/publish_docs.yaml
Lines changed: 65 additions & 48 deletions
diff --git a/‎CHANGELOG
Lines changed: 40 additions & 0 deletions b/‎CHANGELOG
Lines changed: 40 additions & 0 deletions
diff --git a/‎README.md
Lines changed: 17 additions & 8 deletions b/‎README.md
Lines changed: 17 additions & 8 deletions
diff --git a/‎build.sbt
Lines changed: 3 additions & 2 deletions b/‎build.sbt
Lines changed: 3 additions & 2 deletions
diff --git a/‎conda/meta.yaml
Lines changed: 2 additions & 2 deletions b/‎conda/meta.yaml
Lines changed: 2 additions & 2 deletions
diff --git a/‎docs/_layouts/landing.html
Lines changed: 1 addition & 1 deletion b/‎docs/_layouts/landing.html
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/api/com/index.html
Lines changed: 4 additions & 4 deletions b/‎docs/api/com/index.html
Lines changed: 4 additions & 4 deletions
diff --git a/‎docs/api/com/johnsnowlabs/client/CloudClient.html
Lines changed: 4 additions & 4 deletions b/‎docs/api/com/johnsnowlabs/client/CloudClient.html
Lines changed: 4 additions & 4 deletions
diff --git a/‎docs/api/com/johnsnowlabs/client/CloudManager.html
Lines changed: 4 additions & 4 deletions b/‎docs/api/com/johnsnowlabs/client/CloudManager.html
Lines changed: 4 additions & 4 deletions
@@ -3,60 +3,77 @@ name: Publish APIs
 on:
   push:
     branches:
-      - '*release*'
-      - 'release/**'
+      - "*release*"
+      - "release/**"
   pull_request:
     branches:
-      - 'main'
-      - 'master'
-      - '*release*'
-      - 'release/**'
+      - "main"
+      - "master"
+      - "*release*"
+      - "release/**"
+
 env:
   GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
 jobs:
   build:
     if: "contains(toJSON(github.event.commits.*.message), '[run doc]')"
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
     steps:
-    - name: checkout repo
-      uses: actions/checkout@v2
-    - name: Set up JDK 8
-      uses: actions/setup-java@v1
-      with:
-        java-version: 1.8
-    - name: Install Python 3.7
-      uses: actions/setup-python@v2
-      with:
-        python-version: 3.7.7
-        architecture: x64
-    - name: Build Scala APIs
-      run: |
-        sbt doc
-    - name: Install PyPI dependencies
-      run: |
-        python -m pip install --upgrade pip
-        cd ./python/docs && pip install -r requirements_doc.txt
-    - name: Build Python APIs
-      run: |
-        cd ./python/docs
-        make html
-    - name: Commit changes
-      id: commit
-      run: |
-        git config --local user.email "[email protected]"
-        git config --local user.name "github-actions"
-        git add --all
-        if [-z "$(git status --porcelain)"]; then
-           echo "::set-output name=push::false"
-        else
-           git commit -m "Update Scala and Python APIs" -a
-           echo "::set-output name=push::true"
-        fi
-      shell: bash
-    - name: Push changes
-      if: steps.commit.outputs.push == 'true'
-      uses: ad-m/github-push-action@master
-      with:
-         github_token: ${{ secrets.GITHUB_TOKEN }}
-         branch: ${{ github.ref }}
+      # Fetch the repository contents for the doc builds.
+      - name: Checkout repo
+        uses: actions/checkout@v4
+
+      # JDK 8 for the sbt build. setup-java v2+ requires an explicit
+      # distribution; the version is quoted so YAML keeps it a string.
+      - name: Set up JDK 8
+        uses: actions/setup-java@v4
+        with:
+          distribution: "temurin"
+          java-version: "8"
+
+      # Python interpreter used to install and run the Sphinx doc toolchain.
+      - name: Install Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+          architecture: "x64"
+
+      - name: Install SBT  # registers the scala-sbt apt sources and a GPG key (saved as sbt.gpg), then installs sbt via apt-get
+        run: |
+          echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | sudo tee /etc/apt/sources.list.d/sbt.list
+          echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list
+          curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x99E82A75642AC823" | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/sbt.gpg > /dev/null
+          sudo apt-get update
+          sudo apt-get install -y sbt
+
+      - name: Build Scala APIs  # generates the Scala API docs with sbt's doc task
+        run: sbt doc
+
+      - name: Install PyPI dependencies  # upgrades pip, then installs python/docs/requirements_doc.txt
+        run: |
+          python -m pip install --upgrade pip
+          cd ./python/docs && pip install -r requirements_doc.txt
+
+      - name: Build Python APIs  # sphinx-apidoc writes its output to ./_api, then `make html` runs with SPHINXOPTS="-v"
+        run: |
+          cd ./python/docs
+          # Run with verbose output to debug any issues
+          SPHINX_APIDOC_OPTIONS=members,undoc-members,show-inheritance sphinx-apidoc -e -f -o ./_api ../sparknlp ../sparknlp/tests
+          make html SPHINXOPTS="-v"
+
+      # Stage everything in the working tree and commit only when something
+      # changed. The `push` step output (steps.commit.outputs.push) records
+      # whether a commit was created.
+      - name: Commit changes
+        id: commit
+        run: |
+          git config --local user.email "[email protected]"
+          git config --local user.name "github-actions"
+          git add --all
+          # Empty porcelain output means there is nothing to commit.
+          if [ -z "$(git status --porcelain)" ]; then
+            echo "push=false" >> "$GITHUB_OUTPUT"
+          else
+            git commit -m "Update Scala and Python APIs" -a
+            echo "push=true" >> "$GITHUB_OUTPUT"
+          fi
+        shell: bash
+
+      - name: Push changes  # runs only when the commit step set push=true
+        if: ${{ steps.commit.outputs.push == 'true' }}
+        uses: ad-m/github-push-action@master  # NOTE(review): @master is a moving ref; consider pinning to a release tag or commit SHA
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          branch: ${{ github.ref }}  # targets the ref that triggered the workflow
@@ -1,3 +1,43 @@
+=======
+6.0.0
+=======
+----------------
+New Features & Enhancements
+----------------
+* Introducing new large language models:
+  * OLMo model support (SPARKNLP-1006)
+  * Phi 3.5 Vision model support (SPARKNLP-1060)
+  * LLAVA model support (SPARKNLP-1033)
+  * CoHere model support (SPARKNLP-1032)
+  * Qwen2-VL model support (SPARKNLP-1077)
+  * Llama 3.2 Vision models (SPARKNLP-1078)
+  * Deepseek Janus model support (SPARKNLP-1088)
+  * Added LLAVA v1.5 7b quantized model
+  * Added StarCoder2 3b int8 model
+
+* New MultipleChoice Transformers:
+  * AlbertForMultipleChoice (SPARKNLP-1105)
+  * DistilBertForMultipleChoice (SPARKNLP-1106)
+  * RoBertaForMultipleChoice (SPARKNLP-1107)
+  * XlmRoBertaForMultipleChoice (SPARKNLP-1108)
+
+* New file format support:
+  * Excel files reader (SPARKNLP-1102)
+  * PowerPoint files reader (SPARKNLP-1103)
+  * PDF reader (SPARKNLP-1098)
+  * Text reader (SPARKNLP-1113)
+
+* Other improvements:
+  * AutoGGUFVisionModel for vision model support (SPARKNLP-1079)
+  * Added Extractor to SparkNLP (SPARKNLP-1109)
+  * Updated Python and Scala model names
+  * Improved error handling for AutoGGUF models
+
+----------------
+Bug Fixes
+----------------
+* Fixed typo in MXBAI notebook
+
 ========
 5.5.3
 ========
 
@@ -19,7 +19,7 @@
 
 Spark NLP is a state-of-the-art Natural Language Processing library built on top of Apache Spark. It provides **simple**, **performant** & **accurate** NLP annotations for machine learning pipelines that **scale** easily in a distributed environment.
 
-Spark NLP comes with **83000+** pretrained **pipelines** and **models** in more than **200+** languages.
+Spark NLP comes with **100000+** pretrained **pipelines** and **models** in **200+** languages.
 It also offers tasks such as **Tokenization**, **Word Segmentation**, **Part-of-Speech Tagging**, Word and Sentence **Embeddings**, **Named Entity Recognition**, **Dependency Parsing**, **Spell Checking**, **Text Classification**, **Sentiment Analysis**, **Token Classification**, **Machine Translation** (+180 languages), **Summarization**, **Question Answering**, **Table Question Answering**, **Text Generation**, **Image Classification**, **Image to Text (captioning)**, **Automatic Speech Recognition**, **Zero-Shot Learning**, and many more [NLP tasks](#features).
 
 **Spark NLP** is the only open-source NLP library in **production** that offers state-of-the-art transformers such as **BERT**, **CamemBERT**, **ALBERT**, **ELECTRA**, **XLNet**, **DistilBERT**, **RoBERTa**, **DeBERTa**, **XLM-RoBERTa**, **Longformer**, **ELMO**, **Universal Sentence Encoder**, **Llama-2**, **M2M100**, **BART**, **Instructor**, **E5**, **Google T5**, **MarianMT**, **OpenAI GPT2**, **Vision Transformers (ViT)**, **OpenAI Whisper**, **Llama**, **Mistral**, **Phi**, **Qwen2**, and many more not only to **Python** and **R**, but also to **JVM** ecosystem (**Java**, **Scala**, and **Kotlin**) at **scale** by extending **Apache Spark** natively.
@@ -63,7 +63,7 @@ $ java -version
 $ conda create -n sparknlp python=3.7 -y
 $ conda activate sparknlp
 # spark-nlp by default is based on pyspark 3.x
-$ pip install spark-nlp==5.5.3 pyspark==3.3.1
+$ pip install spark-nlp==6.0.0 pyspark==3.3.1
 ```
 
 In Python console or Jupyter `Python3` kernel:
@@ -129,10 +129,11 @@ For a quick example of using pipelines and models take a look at our official [d
 
 ### Apache Spark Support
 
-Spark NLP *5.5.3* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x
+Spark NLP *6.0.0* has been built on top of Apache Spark 3.4 and fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x.
 
 | Spark NLP | Apache Spark 3.5.x | Apache Spark 3.4.x | Apache Spark 3.3.x | Apache Spark 3.2.x | Apache Spark 3.1.x | Apache Spark 3.0.x | Apache Spark 2.4.x | Apache Spark 2.3.x |
 |-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
+| 6.0.x     | YES                | YES                | YES                | YES                | YES                | YES                | NO                 | NO                 |
 | 5.5.x     | YES                | YES                | YES                | YES                | YES                | YES                | NO                 | NO                 |
 | 5.4.x     | YES                | YES                | YES                | YES                | YES                | YES                | NO                 | NO                 |
 | 5.3.x     | YES                | YES                | YES                | YES                | YES                | YES                | NO                 | NO                 |
@@ -146,6 +147,7 @@ Find out more about `Spark NLP` versions from our [release notes](https://github
 
 | Spark NLP | Python 3.6 | Python 3.7 | Python 3.8 | Python 3.9 | Python 3.10| Scala 2.11 | Scala 2.12 |
 |-----------|------------|------------|------------|------------|------------|------------|------------|
+| 6.0.x     | NO         | YES        | YES        | YES        | YES        | NO         | YES        |
 | 5.5.x     | NO         | YES        | YES        | YES        | YES        | NO         | YES        |
 | 5.4.x     | NO         | YES        | YES        | YES        | YES        | NO         | YES        |
 | 5.3.x     | NO         | YES        | YES        | YES        | YES        | NO         | YES        |
@@ -157,7 +159,7 @@ Find out more about 4.x `SparkNLP` versions in our official [documentation](http
 
 ### Databricks Support
 
-Spark NLP 5.5.3 has been tested and is compatible with the following runtimes:
+Spark NLP 6.0.0 has been tested and is compatible with the following runtimes:
 
 | **CPU**            | **GPU**            |
 |--------------------|--------------------|
@@ -174,7 +176,7 @@ We are compatible with older runtimes. For a full list check databricks support
 
 ### EMR Support
 
-Spark NLP 5.5.3 has been tested and is compatible with the following EMR releases:
+Spark NLP 6.0.0 has been tested and is compatible with the following EMR releases:
 
 | **EMR Release**    |
 |--------------------|
@@ -184,6 +186,13 @@ Spark NLP 5.5.3 has been tested and is compatible with the following EMR release
 | emr-7.0.0          |
 | emr-7.1.0          |
 | emr-7.2.0          |
+| emr-7.3.0          |
+| emr-7.4.0          |
+| emr-7.5.0          |
+| emr-7.6.0          |
+| emr-7.7.0          |
+| emr-7.8.0          |
+
 
 We are compatible with older EMR releases. For a full list check EMR support in our official [documentation](https://sparknlp.org/docs/en/install#emr-support)
 
@@ -205,7 +214,7 @@ deployed to Maven central. To add any of our packages as a dependency in your ap
 from our official documentation.
 
 If you are interested, there is a simple SBT project for Spark NLP to guide you on how to use it in your
-projects [Spark NLP SBT S5.5.3r](https://github.com/maziyarpanahi/spark-nlp-starter)
+projects [Spark NLP SBT Starter](https://github.com/maziyarpanahi/spark-nlp-starter)
 
 ### Python
 
@@ -250,7 +259,7 @@ In Spark NLP we can define S3 locations to:
 
 Please check [these instructions](https://sparknlp.org/docs/en/install#s3-integration) from our official documentation.
 
-## Document5.5.3
+## Documentation
 
 ### Examples
 
@@ -283,7 +292,7 @@ the Spark NLP library:
     keywords = {Spark, Natural language processing, Deep learning, Tensorflow, Cluster},
     abstract = {Spark NLP is a Natural Language Processing (NLP) library built on top of Apache Spark ML. It provides simple, performant & accurate NLP annotations for machine learning pipelines that can scale easily in a distributed environment. Spark NLP comes with 1100+ pretrained pipelines and models in more than 192+ languages. It supports nearly all the NLP tasks and modules that can be used seamlessly in a cluster. Downloaded more than 2.7 million times and experiencing 9x growth since January 2020, Spark NLP is used by 54% of healthcare organizations as the world’s most widely used NLP library in the enterprise.}
     }
-}5.5.3
+}
 ```
 
 ## Community support
 
@@ -6,7 +6,7 @@ name := getPackageName(is_silicon, is_gpu, is_aarch64)
 
 organization := "com.johnsnowlabs.nlp"
 
-version := "5.5.3"
+version := "6.0.0"
 
 (ThisBuild / scalaVersion) := scalaVer
 
@@ -163,7 +163,8 @@ lazy val utilDependencies = Seq(
   poiDocx
     exclude ("org.apache.logging.log4j", "log4j-api"),
   scratchpad
-    exclude ("org.apache.logging.log4j", "log4j-api")
+    exclude ("org.apache.logging.log4j", "log4j-api"),
+  pdfBox
 )
 
 lazy val typedDependencyParserDependencies = Seq(junit)
 
@@ -1,13 +1,13 @@
 {% set name = "spark-nlp" %}
-{% set version = "5.5.3" %}
+{% set version = "6.0.0" %}
 
 package:
   name: {{ name|lower }}
   version: {{ version }}
 
 source:
   url: https://pypi.io/packages/source/{{ name[0] }}/{{ name }}/spark-nlp-{{ version }}.tar.gz
-  sha256: b620487092256d02bf8d277374c564cd22384d437c97a4bb5b3b0f1fdfc696e8
+  sha256: 58f4f530105d5c5522fc37ce4d3b63af1e2463b43e000cf69838e0854b468365
 
 build:
   noarch: python
 
@@ -201,7 +201,7 @@ <h3 class="grey h3_title">{{ _section.title }}</h3>
                   <div class="highlight-box">
     {% highlight bash %}
     # Using PyPI
-    $ pip install spark-nlp==5.5.3
+    $ pip install spark-nlp==6.0.0
 
     # Using Anaconda/Conda
     $ conda install -c johnsnowlabs spark-nlp
 
@@ -3,9 +3,9 @@
         <head>
           <meta http-equiv="X-UA-Compatible" content="IE=edge" />
           <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no" />
-          <title>Spark NLP 5.5.3 ScalaDoc  - com</title>
-          <meta name="description" content="Spark NLP 5.5.3 ScalaDoc - com" />
-          <meta name="keywords" content="Spark NLP 5.5.3 ScalaDoc com" />
+          <title>Spark NLP 6.0.0 ScalaDoc  - com</title>
+          <meta name="description" content="Spark NLP 6.0.0 ScalaDoc - com" />
+          <meta name="keywords" content="Spark NLP 6.0.0 ScalaDoc com" />
           <meta http-equiv="content-type" content="text/html; charset=UTF-8" />
 
 
@@ -28,7 +28,7 @@
         </head>
         <body>
       <div id="search">
-        <span id="doc-title">Spark NLP 5.5.3 ScalaDoc<span id="doc-version"></span></span>
+        <span id="doc-title">Spark NLP 6.0.0 ScalaDoc<span id="doc-version"></span></span>
         <span class="close-results"><span class="left">&lt;</span> Back</span>
         <div id="textfilter">
           <span class="input">
 
@@ -3,9 +3,9 @@
         <head>
           <meta http-equiv="X-UA-Compatible" content="IE=edge" />
           <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no" />
-          <title>Spark NLP 5.5.3 ScalaDoc  - com.johnsnowlabs.client.CloudClient</title>
-          <meta name="description" content="Spark NLP 5.5.3 ScalaDoc - com.johnsnowlabs.client.CloudClient" />
-          <meta name="keywords" content="Spark NLP 5.5.3 ScalaDoc com.johnsnowlabs.client.CloudClient" />
+          <title>Spark NLP 6.0.0 ScalaDoc  - com.johnsnowlabs.client.CloudClient</title>
+          <meta name="description" content="Spark NLP 6.0.0 ScalaDoc - com.johnsnowlabs.client.CloudClient" />
+          <meta name="keywords" content="Spark NLP 6.0.0 ScalaDoc com.johnsnowlabs.client.CloudClient" />
           <meta http-equiv="content-type" content="text/html; charset=UTF-8" />
 
 
@@ -28,7 +28,7 @@
         </head>
         <body>
       <div id="search">
-        <span id="doc-title">Spark NLP 5.5.3 ScalaDoc<span id="doc-version"></span></span>
+        <span id="doc-title">Spark NLP 6.0.0 ScalaDoc<span id="doc-version"></span></span>
         <span class="close-results"><span class="left">&lt;</span> Back</span>
         <div id="textfilter">
           <span class="input">
 
@@ -3,9 +3,9 @@
         <head>
           <meta http-equiv="X-UA-Compatible" content="IE=edge" />
           <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no" />
-          <title>Spark NLP 5.5.3 ScalaDoc  - com.johnsnowlabs.client.CloudManager</title>
-          <meta name="description" content="Spark NLP 5.5.3 ScalaDoc - com.johnsnowlabs.client.CloudManager" />
-          <meta name="keywords" content="Spark NLP 5.5.3 ScalaDoc com.johnsnowlabs.client.CloudManager" />
+          <title>Spark NLP 6.0.0 ScalaDoc  - com.johnsnowlabs.client.CloudManager</title>
+          <meta name="description" content="Spark NLP 6.0.0 ScalaDoc - com.johnsnowlabs.client.CloudManager" />
+          <meta name="keywords" content="Spark NLP 6.0.0 ScalaDoc com.johnsnowlabs.client.CloudManager" />
           <meta http-equiv="content-type" content="text/html; charset=UTF-8" />
 
 
@@ -28,7 +28,7 @@
         </head>
         <body>
       <div id="search">
-        <span id="doc-title">Spark NLP 5.5.3 ScalaDoc<span id="doc-version"></span></span>
+        <span id="doc-title">Spark NLP 6.0.0 ScalaDoc<span id="doc-version"></span></span>
         <span class="close-results"><span class="left">&lt;</span> Back</span>
         <div id="textfilter">
           <span class="input">