
Commit 7504822

CaptainSame and Sameer Sharma authored
MLCOMPUTE-548 | enabled DRA by default for Jupyter Spark sessions (#100)
* MLCOMPUTE-548 | enabled DRA by default for Jupyter Spark sessions, moved default enable logic from paasta spark-run

Co-authored-by: Sameer Sharma <[email protected]>
1 parent d1fb4a3 commit 7504822

3 files changed (+72, -11 lines)
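With this change, Dynamic Resource Allocation (DRA) is on by default and a job must opt out explicitly. A minimal sketch of the opt-out path, assuming the user-supplied spark args end up in the spark_opts dict that get_dra_configs receives:

    # Hedged sketch: an explicit 'false' short-circuits get_dra_configs,
    # so the job keeps its static allocation untouched.
    spark_opts = {'spark.dynamicAllocation.enabled': 'false'}
    assert get_dra_configs(spark_opts) == spark_opts  # returned unchanged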

service_configuration_lib/spark_config.py

Lines changed: 52 additions & 9 deletions
@@ -268,13 +268,22 @@ def _append_sql_partitions_conf(spark_opts: Dict[str, str]) -> Dict[str, str]:
 
 
 def get_dra_configs(spark_opts: Dict[str, str]) -> Dict[str, str]:
+    # don't enable DRA if it is explicitly disabled
     if (
-        'spark.dynamicAllocation.enabled' not in spark_opts or
+        'spark.dynamicAllocation.enabled' in spark_opts and
         str(spark_opts['spark.dynamicAllocation.enabled']) != 'true'
     ):
         return spark_opts
 
+    log.warning(
+        'Spark Dynamic Resource Allocation (DRA) enabled for this batch. More info: y/spark-dra. '
+        'If your job is performing worse because of DRA, consider disabling DRA. To disable, '
+        'please provide spark.dynamicAllocation.enabled=false in your spark args\n',
+    )
+
+    spark_app_name = spark_opts.get('spark.app.name', '')
     # set defaults if not provided already
+    _append_spark_config(spark_opts, 'spark.dynamicAllocation.enabled', 'true')
     _append_spark_config(spark_opts, 'spark.dynamicAllocation.shuffleTracking.enabled', 'true')
     _append_spark_config(
         spark_opts, 'spark.dynamicAllocation.executorAllocationRatio',
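_append_spark_config is not shown in this diff; a plausible sketch of its contract (an assumption, not the repo's actual code) is that it only fills in keys the caller has not set, which is what makes the "set defaults if not provided already" block safe:

    # Hypothetical helper shape: write the default only when the key is absent.
    def _append_spark_config(spark_opts, key, value):
        spark_opts.setdefault(key, value)
        return spark_opts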
@@ -288,10 +297,11 @@ def get_dra_configs(spark_opts: Dict[str, str]) -> Dict[str, str]:
             f'\nSetting spark.dynamicAllocation.cachedExecutorIdleTimeout as {DEFAULT_DRA_CACHED_EXECUTOR_IDLE_TIMEOUT}. '
             f'Executor with cached data block will be released if it has been idle for this duration. '
             f'If you wish to change the value of cachedExecutorIdleTimeout, please provide the exact value of '
-            f'spark.dynamicAllocation.cachedExecutorIdleTimeout in --spark-args. If your job is performing bad because '
+            f'spark.dynamicAllocation.cachedExecutorIdleTimeout in your spark args. If your job is performing bad because '
             f'the cached data was lost, please consider increasing this value.\n',
         )
 
+    min_ratio_executors = None
     if 'spark.dynamicAllocation.minExecutors' not in spark_opts:
         # the ratio of total executors to be used as minExecutors
         min_executor_ratio = spark_opts.get('spark.yelp.dra.minExecutorRatio', DEFAULT_DRA_MIN_EXECUTOR_RATIO)
@@ -308,19 +318,34 @@ def get_dra_configs(spark_opts: Dict[str, str]) -> Dict[str, str]:
             float(min_executor_ratio)),
         )
 
+        min_ratio_executors = min_executors
+
+        warn_msg = '\nSetting spark.dynamicAllocation.minExecutors as'
+
+        # set minExecutors equal to 0 for Jupyter Spark sessions
+        # TODO: add regex to better match Jupyterhub Spark session app name
+        if 'jupyterhub' in spark_app_name:
+            min_executors = 0
+            warn_msg = (
+                f'Looks like you are launching Spark session from a Jupyter notebook. '
+                f'{warn_msg} {min_executors} to save spark costs when any spark action is not running'
+            )
+        else:
+            warn_msg = f'{warn_msg} {min_executors}'
+
         spark_opts['spark.dynamicAllocation.minExecutors'] = str(min_executors)
         log.warning(
-            f'\nSetting spark.dynamicAllocation.minExecutors as {min_executors}. If you wish to '
-            f'change the value of minimum executors, please provide the exact value of '
-            f'spark.dynamicAllocation.minExecutors in --spark-args\n',
+            f'\n{warn_msg}. If you wish to change the value of minimum executors, please provide '
+            f'the exact value of spark.dynamicAllocation.minExecutors in your spark args\n',
         )
 
-    if 'spark.yelp.dra.minExecutorRatio' not in spark_opts:
+    # TODO: add regex to better match Jupyterhub Spark session app name
+    if 'jupyterhub' not in spark_app_name and 'spark.yelp.dra.minExecutorRatio' not in spark_opts:
         log.debug(
             f'\nspark.yelp.dra.minExecutorRatio not provided. This specifies the ratio of total executors '
             f'to be used as minimum executors for Dynamic Resource Allocation. More info: y/spark-dra. Using '
             f'default ratio: {DEFAULT_DRA_MIN_EXECUTOR_RATIO}. If you wish to change this value, please provide '
-            f'the desired spark.yelp.dra.minExecutorRatio in --spark-args\n',
+            f'the desired spark.yelp.dra.minExecutorRatio in your spark args\n',
         )
 
     if 'spark.dynamicAllocation.maxExecutors' not in spark_opts:
@@ -334,7 +359,24 @@ def get_dra_configs(spark_opts: Dict[str, str]) -> Dict[str, str]:
         log.warning(
             f'\nSetting spark.dynamicAllocation.maxExecutors as {max_executors}. If you wish to '
             f'change the value of maximum executors, please provide the exact value of '
-            f'spark.dynamicAllocation.maxExecutors in --spark-args\n',
+            f'spark.dynamicAllocation.maxExecutors in your spark args\n',
+        )
+
+    # TODO: add regex to better match Jupyterhub Spark session app name
+    if 'jupyterhub' in spark_app_name and 'spark.dynamicAllocation.initialExecutors' not in spark_opts:
+        if min_ratio_executors is not None:
+            # set initialExecutors default equal to minimum executors calculated above using
+            # 'spark.yelp.dra.minExecutorRatio' and DEFAULT_DRA_MIN_EXECUTOR_RATIO for Jupyter Spark sessions
+            initial_executors = min_ratio_executors
+        else:
+            # otherwise set initial executors equal to minimum executors
+            initial_executors = int(spark_opts['spark.dynamicAllocation.minExecutors'])
+
+        spark_opts['spark.dynamicAllocation.initialExecutors'] = str(initial_executors)
+        log.warning(
+            f'\nSetting spark.dynamicAllocation.initialExecutors as {initial_executors}. If you wish to '
+            f'change the value of initial executors, please provide the exact value of '
+            f'spark.dynamicAllocation.initialExecutors in your spark args\n',
         )
 
     spark_opts['spark.executor.instances'] = spark_opts['spark.dynamicAllocation.minExecutors']
@@ -896,7 +938,8 @@ def get_spark_conf(
         raise UnsupportedClusterManagerException(cluster_manager)
 
     # configure dynamic resource allocation configs
-    spark_conf = get_dra_configs(spark_conf)
+    if cluster_manager != 'mesos':
+        spark_conf = get_dra_configs(spark_conf)
 
     # configure spark_event_log
     spark_conf = _append_event_log_conf(spark_conf, *aws_creds)
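Taken together: a Jupyter session now gets minExecutors pinned to 0 (so an idle notebook releases its executors) plus an explicit initialExecutors, non-Jupyter jobs keep the ratio-based minimum, and Mesos jobs skip DRA setup entirely. A hedged usage sketch; the app name, instance count, and expected values below are illustrative assumptions based only on this diff:

    from service_configuration_lib import spark_config

    opts = {
        'spark.app.name': 'jupyterhub_example_user_notebook',  # hypothetical name
        'spark.executor.instances': '8',
    }
    opts = spark_config.get_dra_configs(opts)
    # Expected per this diff:
    #   spark.dynamicAllocation.enabled          -> 'true' (default now applied here)
    #   spark.dynamicAllocation.minExecutors     -> '0' (Jupyter special case)
    #   spark.dynamicAllocation.initialExecutors -> the ratio-based minimum computed
    #     before the Jupyter override, e.g. int(8 * 0.25) == 2 if the default
    #     ratio is 0.25 (assumption)
    #   spark.executor.instances                 -> copied from minExecutors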

setup.py

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@
 
 setup(
     name='service-configuration-lib',
-    version='2.13.5',
+    version='2.13.6',
     provides=['service_configuration_lib'],
     description='Start, stop, and inspect Yelp SOA services',
     url='https://github.com/Yelp/service_configuration_lib',

tests/spark_config_test.py

Lines changed: 19 additions & 1 deletion
@@ -548,7 +548,7 @@ def test_adjust_spark_requested_resources_error(
                 'spark.executor.instances': '606',
             },
             {
-                'spark.executor.instances': '606',
+                'spark.executor.instances': '151',  # enabled by default, 606/4
             },
         ),
     ],
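A quick check of the updated expectation, assuming DEFAULT_DRA_MIN_EXECUTOR_RATIO is 0.25 (consistent with the "606/4" comment above):

    int(606 * 0.25)  # == 151: the ratio-based minExecutors, copied into spark.executor.instances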
@@ -790,6 +790,20 @@ def mock_adjust_spark_requested_resources_kubernetes(self):
         with MockConfigFunction('_adjust_spark_requested_resources', return_value) as m:
             yield m
 
+    @pytest.fixture
+    def mock_get_dra_configs(self):
+        return_value = {
+            'spark.dynamicAllocation.enabled': 'true',
+            'spark.dynamicAllocation.maxExecutors': '2',
+            'spark.dynamicAllocation.shuffleTracking.enabled': 'true',
+            'spark.dynamicAllocation.executorAllocationRatio': '0.8',
+            'spark.executor.instances': '2',
+            'spark.dynamicAllocation.minExecutors': '0',
+            'spark.dynamicAllocation.cachedExecutorIdleTimeout': '900s',
+        }
+        with MockConfigFunction('get_dra_configs', return_value) as m:
+            yield m
+
     @pytest.fixture
     def mock_secret(self, tmpdir, monkeypatch):
         secret = 'secret'
@@ -1114,6 +1128,7 @@ def test_leaders_get_spark_conf_kubernetes(
         mock_append_aws_credentials_conf,
         mock_append_sql_partitions_conf,
         mock_adjust_spark_requested_resources_kubernetes,
+        mock_get_dra_configs,
         mock_time,
         assert_ui_port,
         assert_app_name,
@@ -1150,6 +1165,7 @@ def test_leaders_get_spark_conf_kubernetes(
             assert_kubernetes_conf(output) +
             list(other_spark_opts.keys()) +
             list(mock_adjust_spark_requested_resources_kubernetes.return_value.keys()) +
+            list(mock_get_dra_configs.return_value.keys()) +
             list(mock_append_event_log_conf.return_value.keys()) +
             list(mock_append_aws_credentials_conf.return_value.keys()) +
             list(mock_append_sql_partitions_conf.return_value.keys()),
@@ -1201,6 +1217,7 @@ def test_local_spark(
         mock_append_aws_credentials_conf,
         mock_append_sql_partitions_conf,
         mock_adjust_spark_requested_resources_kubernetes,
+        mock_get_dra_configs,
         mock_time,
         assert_ui_port,
         assert_app_name,
@@ -1229,6 +1246,7 @@ def test_local_spark(
             assert_local_conf(output) +
             list(mock_append_event_log_conf.return_value.keys()) +
             list(mock_adjust_spark_requested_resources_kubernetes.return_value.keys()) +
+            list(mock_get_dra_configs.return_value.keys()) +
             list(mock_append_aws_credentials_conf.return_value.keys()) +
             list(mock_append_sql_partitions_conf.return_value.keys()),
         )
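MockConfigFunction is a helper defined elsewhere in tests/spark_config_test.py and is not shown in this diff. A hypothetical sketch of the contract the new mock_get_dra_configs fixture relies on (the patch target and merge behavior are assumptions inferred from how the tests union the mocked keys into the expected output):

    from unittest import mock

    class MockConfigFunction:
        # Hypothetical: patch a spark_config function so it merges a
        # canned dict into whatever opts dict it is handed.
        def __init__(self, name, return_value):
            self.return_value = return_value
            self._patcher = mock.patch(
                f'service_configuration_lib.spark_config.{name}',
                side_effect=lambda opts, *a, **kw: {**opts, **return_value},
            )

        def __enter__(self):
            self._patcher.start()
            return self

        def __exit__(self, *exc_info):
            self._patcher.stop()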
