Skip to content

Commit b69f38b

Browse files
authored
Unescape query from EMR spark submit parameter (#306)
* Unescape query from EMR spark submit parameter
* scalafmtAll

Signed-off-by: Sean Kao <[email protected]>
1 parent 77d0078 commit b69f38b

File tree

4 files changed

+24
-2
lines changed

4 files changed

+24
-2
lines changed

spark-sql-application/src/main/scala/org/apache/spark/sql/FlintJob.scala

+1-1
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ object FlintJob extends Logging with FlintJobExecutor {
5656
conf.set(FlintSparkConf.JOB_TYPE.key, jobType)
5757

5858
val dataSource = conf.get("spark.flint.datasource.name", "")
59-
val query = queryOption.getOrElse(conf.get(FlintSparkConf.QUERY.key, ""))
59+
val query = queryOption.getOrElse(unescapeQuery(conf.get(FlintSparkConf.QUERY.key, "")))
6060
if (query.isEmpty) {
6161
throw new IllegalArgumentException(s"Query undefined for the ${jobType} job.")
6262
}

spark-sql-application/src/main/scala/org/apache/spark/sql/FlintJobExecutor.scala

+10
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import scala.concurrent.{ExecutionContext, Future, TimeoutException}
1111
import scala.concurrent.duration.{Duration, MINUTES}
1212

1313
import com.amazonaws.services.s3.model.AmazonS3Exception
14+
import org.apache.commons.text.StringEscapeUtils.unescapeJava
1415
import org.opensearch.flint.core.{FlintClient, IRestHighLevelClient}
1516
import org.opensearch.flint.core.metadata.FlintMetadata
1617
import org.opensearch.flint.core.metrics.MetricConstants
@@ -361,6 +362,14 @@ trait FlintJobExecutor {
361362
}
362363
}
363364

365+
/**
366+
* Unescape the query string which is escaped for EMR spark submit parameter parsing. Ref:
367+
* https://github.com/opensearch-project/sql/pull/2587
368+
*/
369+
def unescapeQuery(query: String): String = {
370+
unescapeJava(query)
371+
}
372+
364373
def executeQuery(
365374
spark: SparkSession,
366375
query: String,
@@ -371,6 +380,7 @@ trait FlintJobExecutor {
371380
val startTime = System.currentTimeMillis()
372381
// we have to set job group in the same thread that started the query according to spark doc
373382
spark.sparkContext.setJobGroup(queryId, "Job group for " + queryId, interruptOnCancel = true)
383+
logInfo(s"Executing query: $query")
374384
val result: DataFrame = spark.sql(query)
375385
// Get Data
376386
getFormattedData(

spark-sql-application/src/main/scala/org/apache/spark/sql/FlintREPL.scala

+1-1
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,7 @@ object FlintREPL extends Logging with FlintJobExecutor {
251251
if (defaultQuery.isEmpty) {
252252
throw new IllegalArgumentException("Query undefined for the streaming job.")
253253
}
254-
defaultQuery
254+
unescapeQuery(defaultQuery)
255255
} else ""
256256
}
257257
}

spark-sql-application/src/test/scala/org/apache/spark/sql/FlintREPLTest.scala

+12
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,18 @@ class FlintREPLTest
9595
query shouldBe "SELECT * FROM table"
9696
}
9797

98+
test(
99+
"getQuery should return unescaped default query for streaming job if queryOption is None") {
100+
val queryOption = None
101+
val jobType = "streaming"
102+
val conf = new SparkConf().set(
103+
FlintSparkConf.QUERY.key,
104+
"SELECT \\\"1\\\" UNION SELECT '\\\"1\\\"' UNION SELECT \\\"\\\\\\\"1\\\\\\\"\\\"")
105+
106+
val query = FlintREPL.getQuery(queryOption, jobType, conf)
107+
query shouldBe "SELECT \"1\" UNION SELECT '\"1\"' UNION SELECT \"\\\"1\\\"\""
108+
}
109+
98110
test(
99111
"getQuery should throw IllegalArgumentException if queryOption is None and default query is not defined for streaming job") {
100112
val queryOption = None

0 commit comments

Comments (0)