|
6 | 6 | import logging |
7 | 7 | import math |
8 | 8 | import os |
| 9 | +import random |
9 | 10 | import re |
| 11 | +import string |
10 | 12 | import time |
11 | 13 | from typing import Any |
12 | 14 | from typing import Dict |
|
38 | 40 |
|
39 | 41 | NON_CONFIGURABLE_SPARK_OPTS = { |
40 | 42 | 'spark.master', |
41 | | - 'spark.app.id', |
42 | 43 | 'spark.ui.port', |
43 | 44 | 'spark.mesos.principal', |
44 | 45 | 'spark.mesos.secret', |
@@ -1029,6 +1030,7 @@ def get_spark_conf( |
1029 | 1030 | aws_region: Optional[str] = None, |
1030 | 1031 | service_account_name: Optional[str] = None, |
1031 | 1032 | force_spark_resource_configs: bool = True, |
|      | 1033 | +    spark_app_id: str = '',  # to be removed once we verify no applications explicitly set the app id |
1032 | 1034 | ) -> Dict[str, str]: |
1033 | 1035 | """Build spark config dict to run with spark on paasta |
1034 | 1036 |
|
@@ -1057,6 +1059,7 @@ def get_spark_conf( |
1057 | 1059 | If not provided, it uses cert files at {K8S_AUTH_FOLDER} to authenticate. |
1058 | 1060 | :param force_spark_resource_configs: skip the resource/instances recalculation. |
1059 | 1061 | This is strongly not recommended. |
|      | 1062 | +    :param spark_app_id: explicitly set spark.app.id (not recommended; auto-generated when empty) |
1060 | 1063 | :returns: spark opts in a dict. |
1061 | 1064 | """ |
1062 | 1065 | # Mesos deprecation |
@@ -1095,11 +1098,19 @@ def get_spark_conf( |
1095 | 1098 | # in all places for metric systems: |
1096 | 1099 |  # - since in the Prometheus metrics endpoint those will be converted to '_' |
1097 | 1100 | # - while the 'spark-app-selector' executor pod label will keep the original app id |
1098 | | - if is_jupyter: |
1099 | | - raw_app_id = app_name |
| 1101 | + if len(spark_app_id) == 0: |
| 1102 | + if is_jupyter: |
| 1103 | + raw_app_id = app_name |
| 1104 | + else: |
| 1105 | + random_postfix = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(4)) |
| 1106 | + raw_app_id = f'{paasta_service}__{paasta_instance}__{random_postfix}' |
| 1107 | + app_id = re.sub(r'[\.,-]', '_', _get_k8s_resource_name_limit_size_with_hash(raw_app_id)) |
1100 | 1108 | else: |
1101 | | - raw_app_id = f'{paasta_service}__{paasta_instance}__{int(time.time()) % 10000}' |
1102 | | - app_id = re.sub(r'[\.,-]', '_', _get_k8s_resource_name_limit_size_with_hash(raw_app_id)) |
| 1109 | + log.warning( |
| 1110 | + 'We do not recommend users to set spark.app.id, as it could diminish the clarity of identification ' |
| 1111 | + 'and can potentially cause the monitoring dashboard to not work properly.', |
| 1112 | + ) |
| 1113 | + app_id = spark_app_id |
1103 | 1114 |
|
1104 | 1115 | spark_conf.update({ |
1105 | 1116 | 'spark.app.name': app_name, |
|
0 commit comments