Commit 0c627cb

Merge pull request #60 from Yelp/u/srojas/SEC-13906-automatically-configure-temp-creds-provider-if-session-token-is-found
Automatically Configure TemporaryAWSCredentialsProvider if AWS Session Token Is Included in Session Credentials
2 parents 15ec932 + 47a08fc commit 0c627cb

File tree: 3 files changed (+39 / -4 lines)

service_configuration_lib/spark_config.py

Lines changed: 11 additions & 3 deletions
@@ -19,6 +19,7 @@
 from boto3 import Session

 AWS_CREDENTIALS_DIR = '/etc/boto_cfg/'
+AWS_TEMP_CREDENTIALS_PROVIDER = 'org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider'
 GPU_POOLS_YAML_FILE_PATH = '/nail/srv/configs/gpu_pools.yaml'
 DEFAULT_PAASTA_VOLUME_PATH = '/etc/paasta/volumes.json'
 DEFAULT_SPARK_MESOS_SECRET_FILE = '/nail/etc/paasta_spark_secret'

@@ -46,7 +47,7 @@
     'spark.executorEnv.SPARK_EXECUTOR_DIRS',
     'spark.hadoop.fs.s3a.access.key',
     'spark.hadoop.fs.s3a.secret.key',
-    'spark.hadoop.fs.s3a.session.token'
+    'spark.hadoop.fs.s3a.session.token',
     'spark.kubernetes.pyspark.pythonVersion',
     'spark.kubernetes.container.image',
     'spark.kubernetes.namespace',

@@ -504,10 +505,11 @@ def get_spark_conf(
     :param paasta_service: the service name of the job
     :param paasta_instance: the instance name of the job
     :param docker_img: the docker image used to launch container for spark executor.
-    :param aws_creds: the aws creds to be used for this spark job.
+    :param aws_creds: the aws creds to be used for this spark job. If a key triplet is passed,
+        we configure a different credentials provider to support this workflow.
     :param extra_volumes: extra files to mount on the spark executors
     :param extra_docker_params: extra docker parameters to launch the spark executor
-        cotnainer. This is only being used when `cluster_manager` is set to `mesos`
+        container. This is only being used when `cluster_manager` is set to `mesos`
     :param with_secret: whether the output spark config should include mesos secrets.
         This is only being used when `cluster_manager` is set to `mesos`
     :param needs_docker_cfg: whether we should add docker.cfg file for accessing

@@ -544,6 +546,12 @@ def get_spark_conf(

     spark_conf = {**(spark_opts_from_env or {}), **_filter_user_spark_opts(user_spark_opts)}

+    # We automatically update the credentials provider if the session token is included.
+    # By default the SimpleAWSCredentials provider is used, which is incompatible with
+    # temporary credentials. More details in SEC-13906.
+    if aws_creds[2] is not None:
+        spark_conf['spark.hadoop.fs.s3a.aws.credentials.provider'] = AWS_TEMP_CREDENTIALS_PROVIDER
+
     spark_conf.update({
         'spark.app.name': app_name,
         'spark.ui.port': str(ui_port),
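For context on how the new branch gets exercised, here is a hypothetical caller sketch (not part of this repo): it resolves temporary credentials from a boto3 session and passes them as the aws_creds triplet, after which get_spark_conf sets the TemporaryAWSCredentialsProvider automatically. The keyword arguments mirror those used in the new test; all concrete values are placeholders.

from boto3 import Session

from service_configuration_lib import spark_config

# Resolve AWS credentials via boto3; for an assumed role or SSO session the
# token is non-None, so get_spark_conf() switches the s3a credentials
# provider to TemporaryAWSCredentialsProvider automatically.
creds = Session().get_credentials()
aws_creds = (creds.access_key, creds.secret_key, creds.token)

spark_conf = spark_config.get_spark_conf(
    cluster_manager='kubernetes',
    spark_app_base_name='example-app',              # placeholder values below
    user_spark_opts={'spark.driver.memory': '2g'},
    paasta_cluster='example-cluster',
    paasta_pool='example-pool',
    paasta_service='example-service',
    paasta_instance='example-instance',
    docker_img='example-registry/spark:latest',
    extra_volumes=[],
    aws_creds=aws_creds,
)

print(spark_conf.get('spark.hadoop.fs.s3a.aws.credentials.provider'))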

setup.py

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@

 setup(
     name='service-configuration-lib',
-    version='2.5.2',
+    version='2.5.3',
     provides=['service_configuration_lib'],
     description='Start, stop, and inspect Yelp SOA services',
     url='https://github.com/Yelp/service_configuration_lib',

tests/spark_config_test.py

Lines changed: 27 additions & 0 deletions
@@ -149,6 +149,7 @@ class TestGetSparkConf:
     spark_app_base_name = 'test_app_base_name'
     extra_volumes = [{'hostPath': '/tmp', 'containerPath': '/tmp', 'mode': 'RO'}]
     default_mesos_leader = 'mesos://some-url.yelp.com:5050'
+    aws_provider_key = 'spark.hadoop.fs.s3a.aws.credentials.provider'

     @pytest.fixture
     def mock_paasta_volumes(self, monkeypatch, tmpdir):

@@ -632,6 +633,32 @@ def verify(output):

         return verify

+    def test_get_spark_conf_aws_session(self):
+        other_spark_opts = {'spark.driver.memory': '2g', 'spark.executor.memoryOverhead': '1024'}
+        not_allowed_opts = {'spark.executorEnv.PAASTA_SERVICE': 'random-service'}
+        user_spark_opts = {
+            **({}),
+            **not_allowed_opts,
+            **other_spark_opts,
+        }
+
+        aws_creds = ('key', 'secret', 'token')
+
+        output = spark_config.get_spark_conf(
+            cluster_manager='kubernetes',
+            spark_app_base_name=self.spark_app_base_name,
+            user_spark_opts=user_spark_opts,
+            paasta_cluster=self.cluster,
+            paasta_pool=self.pool,
+            paasta_service=self.service,
+            paasta_instance=self.instance,
+            docker_img=self.docker_image,
+            extra_volumes=self.extra_volumes,
+            aws_creds=aws_creds,
+        )
+        assert self.aws_provider_key in output.keys()
+        assert 'org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider' == output[self.aws_provider_key]
+
     def test_get_spark_conf_mesos(
         self,
         user_spark_opts,
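A possible companion test, not part of this commit, would cover the opposite branch: when the triplet has no session token, the provider key should be left untouched. This assumes no other code path in get_spark_conf sets spark.hadoop.fs.s3a.aws.credentials.provider by default.

    # Hypothetical companion test (not in this commit): without a session
    # token, get_spark_conf() should leave the credentials provider alone,
    # assuming no other code path sets it by default.
    def test_get_spark_conf_aws_no_session_token(self):
        aws_creds = ('key', 'secret', None)

        output = spark_config.get_spark_conf(
            cluster_manager='kubernetes',
            spark_app_base_name=self.spark_app_base_name,
            user_spark_opts={},
            paasta_cluster=self.cluster,
            paasta_pool=self.pool,
            paasta_service=self.service,
            paasta_instance=self.instance,
            docker_img=self.docker_image,
            extra_volumes=self.extra_volumes,
            aws_creds=aws_creds,
        )
        assert self.aws_provider_key not in output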
