@@ -8,12 +8,17 @@ package org.apache.spark.sql

 import java.util.concurrent.atomic.AtomicInteger

+import scala.concurrent.{ExecutionContext, ExecutionContextExecutor}
+
+import org.opensearch.flint.common.model.FlintStatement
+import org.opensearch.flint.common.scheduler.model.LangType
 import org.opensearch.flint.core.logging.CustomLogging
 import org.opensearch.flint.core.metrics.MetricConstants
 import org.opensearch.flint.core.metrics.MetricsUtil.registerGauge

 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.flint.config.FlintSparkConf
+import org.apache.spark.util.ThreadUtils

 /**
  * Spark SQL Application entrypoint
@@ -26,52 +31,70 @@ import org.apache.spark.sql.flint.config.FlintSparkConf
  * write sql query result to given opensearch index
  */
 object FlintJob extends Logging with FlintJobExecutor {
+  private val streamingRunningCount = new AtomicInteger(0)
+  private val statementRunningCount = new AtomicInteger(0)
+
   def main(args: Array[String]): Unit = {
     val (queryOption, resultIndexOption) = parseArgs(args)

     val conf = createSparkConf()
-    val jobType = conf.get("spark.flint.job.type", FlintJobType.BATCH)
-    CustomLogging.logInfo(s"""Job type is: ${jobType}""")
-    conf.set(FlintSparkConf.JOB_TYPE.key, jobType)
-
-    val dataSource = conf.get("spark.flint.datasource.name", "")
-    val query = queryOption.getOrElse(unescapeQuery(conf.get(FlintSparkConf.QUERY.key, "")))
-    if (query.isEmpty) {
-      logAndThrow(s"Query undefined for the ${jobType} job.")
-    }
-    val queryId = conf.get(FlintSparkConf.QUERY_ID.key, "")
-
-    if (resultIndexOption.isEmpty) {
-      logAndThrow("resultIndex is not set")
-    }
-    // https://github.com/opensearch-project/opensearch-spark/issues/138
-    /*
-     * To execute queries such as `CREATE SKIPPING INDEX ON my_glue1.default.http_logs_plain (`@timestamp` VALUE_SET) WITH (auto_refresh = true)`,
-     * it's necessary to set `spark.sql.defaultCatalog=my_glue1`. This is because AWS Glue uses a single database (default) and table (http_logs_plain),
-     * and we need to configure Spark to recognize `my_glue1` as a reference to AWS Glue's database and table.
-     * By doing this, we effectively map `my_glue1` to AWS Glue, allowing Spark to resolve the database and table names correctly.
-     * Without this setup, Spark would not recognize names in the format `my_glue1.default`.
-     */
-    conf.set("spark.sql.defaultCatalog", dataSource)
-    configDYNMaxExecutors(conf, jobType)
-
+    val sparkSession = createSparkSession(conf)
     val applicationId =
       environmentProvider.getEnvVar("SERVERLESS_EMR_VIRTUAL_CLUSTER_ID", "unknown")
     val jobId = environmentProvider.getEnvVar("SERVERLESS_EMR_JOB_ID", "unknown")
+    val isWarmpoolEnabled = conf.get(FlintSparkConf.WARMPOOL_ENABLED.key, "false").toBoolean
+    logInfo(s"isWarmpoolEnabled: ${isWarmpoolEnabled}")
+
+    if (!isWarmpoolEnabled) {
+      val jobType = sparkSession.conf.get("spark.flint.job.type", FlintJobType.BATCH)
+      CustomLogging.logInfo(s"""Job type is: ${jobType}""")
+      sparkSession.conf.set(FlintSparkConf.JOB_TYPE.key, jobType)
+
+      val dataSource = conf.get("spark.flint.datasource.name", "")
+      val query = queryOption.getOrElse(unescapeQuery(conf.get(FlintSparkConf.QUERY.key, "")))
+      if (query.isEmpty) {
+        logAndThrow(s"Query undefined for the ${jobType} job.")
+      }
+      val queryId = conf.get(FlintSparkConf.QUERY_ID.key, "")

-    val streamingRunningCount = new AtomicInteger(0)
-    val jobOperator =
-      JobOperator(
+      if (resultIndexOption.isEmpty) {
+        logAndThrow("resultIndex is not set")
+      }
+
+      configDYNMaxExecutors(conf, jobType)
+      val flintStatement =
+        new FlintStatement(
+          "running",
+          query,
+          "",
+          queryId,
+          LangType.SQL,
+          currentTimeProvider.currentEpochMillis(),
+          Option.empty,
+          Map.empty)
+
+      val jobOperator = createJobOperator(
+        sparkSession,
         applicationId,
         jobId,
-        createSparkSession(conf),
-        query,
-        queryId,
+        flintStatement,
         dataSource,
         resultIndexOption.get,
         jobType,
-        streamingRunningCount)
-    registerGauge(MetricConstants.STREAMING_RUNNING_METRIC, streamingRunningCount)
-    jobOperator.start()
+        streamingRunningCount,
+        statementRunningCount)
+      registerGauge(MetricConstants.STREAMING_RUNNING_METRIC, streamingRunningCount)
+      jobOperator.start()
+    } else {
+      // Fetch and execute queries in warm pool mode
+      val warmpoolJob =
+        WarmpoolJob(
+          applicationId,
+          jobId,
+          sparkSession,
+          streamingRunningCount,
+          statementRunningCount)
+      warmpoolJob.start()
+    }
   }
 }
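
Review note: the `FlintStatement` added in the non-warmpool branch takes eight bare positional arguments, which is easy to misread at the call site. Below is an annotated sketch of that same construction; the per-argument labels are inferred from how the values are produced in this diff, not taken from the `org.opensearch.flint.common.model.FlintStatement` definition, so treat them as assumptions to verify.

```scala
// Annotated view of the FlintStatement built above (labels are inferred,
// hypothetical, and should be checked against the FlintStatement class).
val flintStatement =
  new FlintStatement(
    "running",                                // initial statement state
    query,                                    // SQL text to execute
    "",                                       // statement id (unused in this path)
    queryId,                                  // from FlintSparkConf.QUERY_ID
    LangType.SQL,                             // statement language
    currentTimeProvider.currentEpochMillis(), // submit time, epoch millis
    Option.empty,                             // no error yet
    Map.empty)                                // no extra context
```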
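For reviewers tracing the new control flow: `main` now builds the SparkSession once, then branches on the warm pool flag, with both running-query counters promoted to object-level fields so either path can report them. A minimal, self-contained sketch of that dispatch follows; `OneShotRunner` and `WarmpoolRunner` are hypothetical stand-ins for `createJobOperator(...)` and `WarmpoolJob(...)`, and the environment-variable toggle stands in for `conf.get(FlintSparkConf.WARMPOOL_ENABLED.key, "false")`.

```scala
import java.util.concurrent.atomic.AtomicInteger

// Sketch of FlintJob.main's dispatch after this change (stand-in types only).
object FlintDispatchSketch {
  trait JobRunner { def start(): Unit }

  // Non-warmpool path: the query is fixed at submission time.
  final case class OneShotRunner(query: String, running: AtomicInteger) extends JobRunner {
    def start(): Unit = {
      running.incrementAndGet()
      println(s"executing single query: $query")
    }
  }

  // Warmpool path: no query is passed in; statements are fetched at runtime.
  final case class WarmpoolRunner(running: AtomicInteger) extends JobRunner {
    def start(): Unit = println("fetching and executing queries until shutdown")
  }

  // Shared counter, mirroring the object-level fields introduced in the diff.
  private val statementRunningCount = new AtomicInteger(0)

  def main(args: Array[String]): Unit = {
    // Stand-in for the FlintSparkConf.WARMPOOL_ENABLED lookup.
    val isWarmpoolEnabled = sys.env.getOrElse("WARMPOOL_ENABLED", "false").toBoolean
    val runner: JobRunner =
      if (!isWarmpoolEnabled) OneShotRunner(args.headOption.getOrElse(""), statementRunningCount)
      else WarmpoolRunner(statementRunningCount)
    runner.start()
  }
}
```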