From d7269315654cb44d632cbb860d082ed1103ea95d Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Wed, 28 Aug 2024 15:23:21 +0200 Subject: [PATCH] feat: add nvidia flare demo chore: fix linting issues feat: add namespace variable and remove environment variable feat: change namespace as variable fix: reviews chore(deps): bump super-linter/super-linter from 7.0.0 to 7.1.0 (#477) Bumps [super-linter/super-linter](https://github.com/super-linter/super-linter) from 7.0.0 to 7.1.0. - [Release notes](https://github.com/super-linter/super-linter/releases) - [Changelog](https://github.com/super-linter/super-linter/blob/main/CHANGELOG.md) - [Commits](https://github.com/super-linter/super-linter/compare/v7.0.0...v7.1.0) --- updated-dependencies: - dependency-name: super-linter/super-linter dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> fix!: use an external git repository (#475) - Don't enable the Cloud Source Repositories API. - Don't provision a Cloud Source Repository. - Get the Config Sync Git repository with an input variable. - Let users specify the credentials type to authenticate Config Sync with their repositories. Fix #449 chore: move manifests to workload-pkg folder feat: create nvflare terraform module chore: gitignore terraform stuff (#480) feat: allow config sync to access source repository for synchronization (#479) fix: minor changes to comply with pr review Co-authored-by: Laurent Grangeau feat: Removing the private DNS zone for CSR feat: enabling HTTP load balancing for ASM feat: add nvflare demo deployment chore: manually build the example container image (#481) Add instructions to build the distributed TensorFlow Federated example in the example README, instead of having Terraform orchestrate the build and push process. 
This simplifies the root Terraform module, by removing something that's needed only for an example, and it's also unlikely for users to have a build pipeline orchestrated this way. feat: add nvflare demo deployment feat: add nvflare demo feat: refactor to have same extension everywhere feat: add templating of manifest resources feat: add nvflare example fix: lint errors fix: lint errors fix: lint errors --- .devcontainer/devcontainer.json | 12 ++- .gitignore | 24 +++++ README.md | 15 ++- config/lint/.checkov.baseline | 85 +++++++++-------- config/lint/.jscpd.json | 7 +- config/lint/super-linter.env | 2 + .../distributed-fl-simulation-k8s/README.md | 30 ++++++ .../federated-learning/tff/nvflare/README.md | 91 +++++++++++++++---- .../nvflare/templates/kustomization.yaml.tpl | 16 ++++ .../nvflare/templates/pv-workspace.yaml.tpl | 20 ++++ .../base-deployment/base-deployment.yaml | 69 ++++++++++++++ .../base-deployment/kustomization.yaml | 3 + .../base-service/base-service.yaml | 19 ++++ .../base-service/kustomization.yaml | 3 + .../base-storage/kustomization.yaml | 4 + .../base-storage/pv-workspace.yaml | 20 ++++ .../base-storage/pvc-workspace.yaml | 15 +++ .../workload-pkg/client1/kustomization.yaml | 16 ++++ .../workload-pkg/client2/kustomization.yaml | 16 ++++ .../workload-pkg/server1/kustomization.yaml | 17 ++++ terraform/acm.tf | 10 +- terraform/dns.tf | 32 ------- terraform/gke.tf | 2 +- terraform/iam.tf | 38 -------- terraform/main.tf | 32 +++---- terraform/network.tf | 54 ++++++++--- .../terraform => terraform/nvflare}/iam.tf | 6 +- .../terraform => terraform/nvflare}/main.tf | 0 .../nvflare}/outputs.tf | 3 +- .../nvflare}/variables.tf | 7 +- .../nvflare}/version.tf | 0 terraform/outputs.tf | 20 ++++ .../scripts/build-push-container-image.sh | 35 ------- terraform/services.tf | 1 - terraform/source-repository.tf | 18 ---- terraform/tenant-configuration.tf | 40 ++------ terraform/variables.tf | 33 ++++++- 37 files changed, 549 insertions(+), 266 deletions(-) 
create mode 100644 examples/federated-learning/tff/nvflare/templates/kustomization.yaml.tpl create mode 100644 examples/federated-learning/tff/nvflare/templates/pv-workspace.yaml.tpl create mode 100644 examples/federated-learning/tff/nvflare/workload-pkg/base-deployment/base-deployment.yaml create mode 100644 examples/federated-learning/tff/nvflare/workload-pkg/base-deployment/kustomization.yaml create mode 100644 examples/federated-learning/tff/nvflare/workload-pkg/base-service/base-service.yaml create mode 100644 examples/federated-learning/tff/nvflare/workload-pkg/base-service/kustomization.yaml create mode 100644 examples/federated-learning/tff/nvflare/workload-pkg/base-storage/kustomization.yaml create mode 100644 examples/federated-learning/tff/nvflare/workload-pkg/base-storage/pv-workspace.yaml create mode 100644 examples/federated-learning/tff/nvflare/workload-pkg/base-storage/pvc-workspace.yaml create mode 100644 examples/federated-learning/tff/nvflare/workload-pkg/client1/kustomization.yaml create mode 100644 examples/federated-learning/tff/nvflare/workload-pkg/client2/kustomization.yaml create mode 100644 examples/federated-learning/tff/nvflare/workload-pkg/server1/kustomization.yaml rename {examples/federated-learning/tff/nvflare/terraform => terraform/nvflare}/iam.tf (70%) rename {examples/federated-learning/tff/nvflare/terraform => terraform/nvflare}/main.tf (100%) rename {examples/federated-learning/tff/nvflare/terraform => terraform/nvflare}/outputs.tf (87%) rename {examples/federated-learning/tff/nvflare/terraform => terraform/nvflare}/variables.tf (87%) rename {examples/federated-learning/tff/nvflare/terraform => terraform/nvflare}/version.tf (100%) delete mode 100755 terraform/scripts/build-push-container-image.sh delete mode 100644 terraform/source-repository.tf diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 113a00df..22e59838 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json 
@@ -1,12 +1,20 @@ { "name": "Python 3", - "image": "ghcr.io/super-linter/super-linter:v7.0.0", + "image": "ghcr.io/super-linter/super-linter:v7.1.0", "customizations": { "vscode": { "settings": { "editor.defaultFormatter": "esbenp.prettier-vscode", + "editor.formatOnSave": true, + "editor.formatOnSaveMode": "file", "editor.wordWrap": "off", - "prettier.resolveGlobalModules": true + "prettier.resolveGlobalModules": true, + "[terraform]": { + "editor.defaultFormatter": "hashicorp.terraform" + }, + "[terraform-vars]": { + "editor.defaultFormatter": "hashicorp.terraform" + } }, "extensions": [ "DavidAnson.vscode-markdownlint", diff --git a/.gitignore b/.gitignore index caa56f97..4baa5e81 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,27 @@ super-linter.log # GitHub Actions leftovers github_conf + +# Terraform gitgnore +# Ref: https://github.com/github/gitignore/blob/main/Terraform.gitignore + +# Local .terraform directories +**/.terraform/* + +# .tfstate files +*.tfstate +*.tfstate.* + +# Crash log files +crash.log +crash.*.log + +# Ignore override files as they are usually used to override resources locally and so +# are not checked in +override.tf +override.tf.json +*_override.tf +*_override.tf.json + +# Ignore transient lock info files created by terraform apply +.terraform.tfstate.lock.info diff --git a/README.md b/README.md index 8240268e..e9713f20 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ To deploy this blueprint you need: - The `serviceusage.googleapis.com` must be enabled on the project. For more information about enabling APIs, see [Enabling and disabling services](https://cloud.google.com/service-usage/docs/enable-disable) +- A Git repository to store the environment configuration. You create the infastructure using Terraform. 
The blueprint uses a local [Terraform backend](https://www.terraform.io/docs/language/settings/backends/configuration.html), but we recommend to configure a [remote backend](https://www.terraform.io/language/settings/backends/configuration#backend-types) @@ -127,10 +128,16 @@ Users and teams managing tenant apps should not have permissions to change clust 1. Initialize the following Terraform variables: ```hcl - project_id = # Google Cloud project ID where to provision resources with the blueprint. - acm_repository_path = # Path on the host running Terraform to store the GKE descriptors to configure the cluster + project_id = # Google Cloud project ID where to provision resources with the blueprint. + acm_repository_path = # Path on the host running Terraform to store environment configuration + acm_repository_url = # URL of the repository to store environment configuration + acm_secret_type = # Secret type to authenticate with the Config Sync Git repository + acm_source_repository_fqdns = # FQDNs of source repository for Config Sync to allow in the Network Firewall Policy ``` + For more information about setting `acm_secret_type`, see + [Grant access to Git](https://cloud.google.com/kubernetes-engine/enterprise/config-sync/docs/how-to/installing-config-sync#git-creds-secret). + If you don't provide all the necessary inputs, Terraform will exit with an error, and will provide information about the missing inputs. For example, you can create a Terraform variables initialization file and set inputs there. @@ -145,6 +152,9 @@ Users and teams managing tenant apps should not have permissions to change clust The provisioning process may take about 15 minutes to complete. +1. [Grant the Config Sync agent access to the Git repository](https://cloud.google.com/kubernetes-engine/enterprise/config-sync/docs/how-to/installing-config-sync#git-creds-secret) + where the environment configuration will be stored. + 1. 
Wait for the GKE cluster to be reported as ready in the [GKE Kuberentes clusters dashboard](https://cloud.google.com/kubernetes-engine/docs/concepts/dashboards#kubernetes_clusters). ### Next steps @@ -154,6 +164,7 @@ To familiarize with the environment that you provisioned, you can also deploy the following examples in the GKE cluster: - [Distributed TensorFlow Federated training](./examples/federated-learning/tff/distributed-fl-simulation-k8s/README.md) +- [Nvflare training](./examples/federated-learning/tff/nvflare/README.md) Federated learning is typically split into Cross-silo and Cross-device federated learning. Cross-silo federated computation is where the participating members are organizations or companies, and the number of members is usually small (e.g., within a hundred). diff --git a/config/lint/.checkov.baseline b/config/lint/.checkov.baseline index 1622301c..8e32d0ea 100644 --- a/config/lint/.checkov.baseline +++ b/config/lint/.checkov.baseline @@ -76,10 +76,10 @@ ] }, { - "file": "/examples/federated-learning/tff/nvflare/terraform/iam.tf", + "file": "/terraform/nvflare/iam.tf", "findings": [ { - "resource": "storage_bucket_iam_bindings", + "resource": "module.nvflare.storage_bucket_iam_bindings", "check_ids": [ "CKV_TF_1" ] @@ -87,10 +87,10 @@ ] }, { - "file": "/examples/federated-learning/tff/nvflare/terraform/main.tf", + "file": "/terraform/nvflare/main.tf", "findings": [ { - "resource": "buckets", + "resource": "module.nvflare.buckets", "check_ids": [ "CKV_TF_1" ] @@ -129,7 +129,7 @@ "file": "/examples/federated-learning/tff/nvflare/base-server/base-server-deployment.yml", "findings": [ { - "resource": "Deployment.nvflare.nvflare-server1", + "resource": "Deployment.default.nvflare-server1", "check_ids": [ "CKV_K8S_43", "CKV_K8S_14", @@ -148,7 +148,7 @@ "file": "/examples/federated-learning/tff/nvflare/base-server/kustomization.yaml", "findings": [ { - "resource": "base:Deployment.nvflare.nvflare-server1", + "resource": 
"base:Deployment.default.nvflare-server1", "check_ids": [ "CKV_K8S_21", "CKV_K8S_22", @@ -162,7 +162,7 @@ ] }, { - "resource": "base:Pod.nvflare.nvflare-server1.run-nvflare-server1", + "resource": "base:Pod.default.nvflare-server1.run-nvflare-server1", "check_ids": [ "CKV2_K8S_6" ] @@ -173,7 +173,7 @@ "file": "/examples/federated-learning/tff/nvflare/base-deployment/base-deployment.yaml", "findings": [ { - "resource": "Deployment.nvflare.nvflare", + "resource": "Deployment.default.nvflare", "check_ids": [ "CKV_K8S_43", "CKV_K8S_14", @@ -192,7 +192,7 @@ "file": "/examples/federated-learning/tff/nvflare/base-deployment/kustomization.yaml", "findings": [ { - "resource": "base:Deployment.nvflare.nvflare", + "resource": "base:Deployment.default.nvflare", "check_ids": [ "CKV_K8S_31", "CKV_K8S_14", @@ -223,7 +223,7 @@ ] }, { - "resource": "base:Pod.default.nvflare.run-nvflare", + "resource": "base:Pod.default.default.run-nvflare", "check_ids": [ "CKV2_K8S_6" ] @@ -252,7 +252,7 @@ "file": "/github/workspace/examples/federated-learning/tff/nvflare/kustomization.yaml", "findings": [ { - "resource": "Deployment.nvflare.nvflare", + "resource": "Deployment.default.nvflare", "check_ids": [ "CKV_K8S_14", "CKV_K8S_38", @@ -271,7 +271,7 @@ "file": "/examples/federated-learning/tff/nvflare/base-client/base-client.yaml", "findings": [ { - "resource": "Deployment.nvflare.nvflare-client", + "resource": "Deployment.default.nvflare-client", "check_ids": [ "CKV_K8S_43", "CKV_K8S_14", @@ -290,7 +290,7 @@ "file": "/examples/federated-learning/tff/nvflare/base-client/kustomization.yaml", "findings": [ { - "resource": "base:Deployment.nvflare.nvflare-client", + "resource": "base:Deployment.default.nvflare-client", "check_ids": [ "CKV_K8S_21", "CKV_K8S_22", @@ -304,7 +304,7 @@ ] }, { - "resource": "base:Pod.nvflare.nvflare-client.run-nvflare-client", + "resource": "base:Pod.default.nvflare-client.run-nvflare-client", "check_ids": [ "CKV2_K8S_6" ] @@ -315,7 +315,7 @@ "file": 
"/examples/federated-learning/tff/nvflare/base-server/base-server-service.yaml", "findings": [ { - "resource": "Service.nvflare.nvflare-server1", + "resource": "Service.default.nvflare-server1", "check_ids": [ "CKV_K8S_21" ] @@ -326,7 +326,7 @@ "file": "/examples/federated-learning/tff/nvflare/base-server/kustomization.yaml", "findings": [ { - "resource": "base:Service.nvflare.nvflare-server1", + "resource": "base:Service.default.nvflare-server1", "check_ids": [ "CKV_K8S_21" ] @@ -337,7 +337,7 @@ "file": "/examples/federated-learning/tff/nvflare/server1/kustomization.yaml", "findings": [ { - "resource": "base:Deployment.nvflare.nvflare-client1", + "resource": "base:Deployment.default.nvflare-client1", "check_ids": [ "CKV_K8S_21", "CKV_K8S_22", @@ -351,7 +351,7 @@ ] }, { - "resource": "overlay:server1:Deployment.nvflare.nvflare-server1", + "resource": "overlay:server1:Deployment.default.nvflare-server1", "check_ids": [ "CKV_K8S_21", "CKV_K8S_22", @@ -365,7 +365,7 @@ ] }, { - "resource": "overlay:server1:Service.nvflare.nvflare-server1", + "resource": "overlay:server1:Service.default.nvflare-server1", "check_ids": [ "CKV_K8S_21", "CKV_K8S_22", @@ -384,7 +384,7 @@ "file": "/examples/federated-learning/tff/nvflare/client1/kustomization.yaml", "findings": [ { - "resource": "base:Deployment.nvflare.nvflare-client1", + "resource": "base:Deployment.default.nvflare-client1", "check_ids": [ "CKV_K8S_21", "CKV_K8S_22", @@ -398,7 +398,7 @@ ] }, { - "resource": "overlay:client1:Deployment.nvflare.nvflare-client1", + "resource": "overlay:client1:Deployment.default.nvflare-client1", "check_ids": [ "CKV_K8S_21", "CKV_K8S_22", @@ -412,7 +412,7 @@ ] }, { - "resource": "overlay:client1:ConfigMap.nvflare.client-configmap1-tdtbhk5bt6", + "resource": "overlay:client1:ConfigMap.default.client-configmap1-tdtbhk5bt6", "check_ids": [ "CKV_K8S_21" ] @@ -423,7 +423,7 @@ "file": "/examples/federated-learning/tff/nvflare/client2/kustomization.yaml", "findings": [ { - "resource": 
"base:Deployment.nvflare.nvflare-client2", + "resource": "base:Deployment.default.nvflare-client2", "check_ids": [ "CKV_K8S_21", "CKV_K8S_22", @@ -437,7 +437,7 @@ ] }, { - "resource": "overlay:client2:Deployment.nvflare.nvflare-client2", + "resource": "overlay:client2:Deployment.default.nvflare-client2", "check_ids": [ "CKV_K8S_21", "CKV_K8S_22", @@ -451,7 +451,7 @@ ] }, { - "resource": "overlay:client2:ConfigMap.nvflare.client-configmap2-8cgh9t556m", + "resource": "overlay:client2:ConfigMap.default.client-configmap2-8cgh9t556m", "check_ids": [ "CKV_K8S_21" ] @@ -462,7 +462,7 @@ "file": "/examples/federated-learning/tff/nvflare/kustomization.yaml", "findings": [ { - "resource": "overlay:nvflare:Deployment.nvflare.nvflare-server1", + "resource": "overlay:nvflare:Deployment.default.nvflare-server1", "check_ids": [ "CKV_K8S_21", "CKV_K8S_22", @@ -476,19 +476,19 @@ ] }, { - "resource": "overlay:nvflare:Pod.nvflare.nvflare-server1.run-nvflare-server1", + "resource": "overlay:nvflare:Pod.default.nvflare-server1.run-nvflare-server1", "check_ids": [ "CKV2_K8S_6" ] }, { - "resource": "overlay:server1:Pod.nvflare.nvflare-server1.run-nvflare-server1", + "resource": "overlay:server1:Pod.default.nvflare-server1.run-nvflare-server1", "check_ids": [ "CKV2_K8S_6" ] }, { - "resource": "overlay:nvflare:Deployment.nvflare.nvflare-client1", + "resource": "overlay:nvflare:Deployment.default.nvflare-client1", "check_ids": [ "CKV_K8S_21", "CKV_K8S_22", @@ -502,19 +502,19 @@ ] }, { - "resource": "overlay:nvflare:Pod.nvflare.nvflare-client1.run-nvflare-client1", + "resource": "overlay:nvflare:Pod.default.nvflare-client1.run-nvflare-client1", "check_ids": [ "CKV2_K8S_6" ] }, { - "resource": "overlay:client1:Pod.nvflare.nvflare-client1.run-nvflare-client1", + "resource": "overlay:client1:Pod.default.nvflare-client1.run-nvflare-client1", "check_ids": [ "CKV2_K8S_6" ] }, { - "resource": "overlay:nvflare:Deployment.nvflare.nvflare-client2", + "resource": 
"overlay:nvflare:Deployment.default.nvflare-client2", "check_ids": [ "CKV_K8S_21", "CKV_K8S_22", @@ -528,31 +528,31 @@ ] }, { - "resource": "overlay:nvflare:Pod.nvflare.nvflare-client2.run-nvflare-client2", + "resource": "overlay:nvflare:Pod.default.nvflare-client2.run-nvflare-client2", "check_ids": [ "CKV2_K8S_6" ] }, { - "resource": "overlay:client2:Pod.nvflare.nvflare-client2.run-nvflare-client2", + "resource": "overlay:client2:Pod.default.nvflare-client2.run-nvflare-client2", "check_ids": [ "CKV2_K8S_6" ] }, { - "resource": "overlay:nvflare:Service.nvflare.nvflare-server1", + "resource": "overlay:nvflare:Service.default.nvflare-server1", "check_ids": [ "CKV_K8S_21" ] }, { - "resource": "overlay:nvflare:ConfigMap.nvflare.client-configmap1-tdtbhk5bt6", + "resource": "overlay:nvflare:ConfigMap.default.client-configmap1-tdtbhk5bt6", "check_ids": [ "CKV_K8S_21" ] }, { - "resource": "overlay:nvflare:ConfigMap.nvflare.client-configmap2-8cgh9t556m", + "resource": "overlay:nvflare:ConfigMap.default.client-configmap2-8cgh9t556m", "check_ids": [ "CKV_K8S_21" ] @@ -656,6 +656,17 @@ } ] }, + { + "file": "/terraform/network.tf", + "findings": [ + { + "resource": "fedlearn-fw-policies", + "check_ids": [ + "CKV_TF_1" + ] + } + ] + }, { "file": "/terraform/cross-device/iam.tf", "findings": [ diff --git a/config/lint/.jscpd.json b/config/lint/.jscpd.json index 1e638fb4..27279964 100644 --- a/config/lint/.jscpd.json +++ b/config/lint/.jscpd.json @@ -1,6 +1,11 @@ { "threshold": 0, "reporters": ["consoleFull"], - "ignore": ["**/configsync/**", "**/examples/**", "**/tenants/**"], + "ignore": [ + "**/.terraform", + "**/configsync/**", + "**/examples/**", + "**/tenants/**" + ], "absolute": true } diff --git a/config/lint/super-linter.env b/config/lint/super-linter.env index 13beafac..b3b297ec 100644 --- a/config/lint/super-linter.env +++ b/config/lint/super-linter.env @@ -1,5 +1,7 @@ FILTER_REGEX_EXCLUDE=.*/examples/federated-learning/tff/.*/*.py LINTER_RULES_PATH=config/lint 
+SAVE_SUPER_LINTER_OUTPUT=true +SAVE_SUPER_LINTER_SUMMARY=true VALIDATE_ALL_CODEBASE=true VALIDATE_JAVASCRIPT_STANDARD=false VALIDATE_KUBERNETES_KUBECONFORM=false diff --git a/examples/federated-learning/tff/distributed-fl-simulation-k8s/README.md b/examples/federated-learning/tff/distributed-fl-simulation-k8s/README.md index 22f46de3..cb5e5f92 100644 --- a/examples/federated-learning/tff/distributed-fl-simulation-k8s/README.md +++ b/examples/federated-learning/tff/distributed-fl-simulation-k8s/README.md @@ -92,6 +92,7 @@ You can run this example in different runtime environments: ``` 1. Run `terraform apply`. +1. [Build the example container image, and push it to the container image registry](#build-the-example-container-image-and-push-it-to-the-container-image-registry). 1. Wait for GKE to report the coordinator and the workers as `Ready` in the [GKE Workloads dashboard](https://cloud.google.com/kubernetes-engine/docs/concepts/dashboards#workloads). @@ -131,6 +132,7 @@ You can run this example in different runtime environments: ``` 1. Run `terraform apply`. +1. [Build the example container image, and push it to the container image registry](#build-the-example-container-image-and-push-it-to-the-container-image-registry). 1. Open the [GKE Workloads Dashboard](https://cloud.google.com/kubernetes-engine/docs/concepts/dashboards#workloads) and wait for the workers Deployments and Services to be ready. 1. From Cloud Shell, change the working directory to the `terraform` directory that you used to provision @@ -156,10 +158,38 @@ You can run this example in different runtime environments: that exposes the second worker workloads. 1. Run `terraform apply`. +1. [Build the example container image, and push it to the container image registry](#build-the-example-container-image-and-push-it-to-the-container-image-registry). 1. 
Wait for GKE to report the coordinator and the workers as `Ready` in the [GKE Workloads dashboard](https://cloud.google.com/kubernetes-engine/docs/concepts/dashboards#workloads) in their respective GKE clusters. +### Build the example container image, and push it to the container image registry + +1. Build the example container image locally on your host: + + ```sh + DISTRIBUTED_TFF_EXAMPLE_CONTAINER_IMAGE_REPOSITORY_HOSTNAME="$(terraform output -raw container_image_repository_fully_qualified_hostname)" + DISTRIBUTED_TFF_EXAMPLE_CONTAINER_IMAGE_LOCALIZED_ID="${DISTRIBUTED_TFF_EXAMPLE_CONTAINER_IMAGE_REPOSITORY_HOSTNAME}/$(terraform output -raw container_image_repository_name)/tff-runtime:0.0.1" + + docker build \ + --file "examples/federated-learning/tff/distributed-fl-simulation-k8s/Dockerfile" \ + --tag "${DISTRIBUTED_TFF_EXAMPLE_CONTAINER_IMAGE_LOCALIZED_ID}" \ + "${DISTRIBUTED_TFF_EXAMPLE_CONTAINER_IMAGE_BUILD_CONTEXT_PATH}" + ``` + +1. Authenticate Docker with the Artifact Registry repository: + + ```sh + gcloud auth configure-docker \ + "${DISTRIBUTED_TFF_EXAMPLE_CONTAINER_IMAGE_REPOSITORY_HOSTNAME}" + ``` + +1. 
Push the container image to the Artifact Registry repository: + + ```sh + docker image push "${DISTRIBUTED_TFF_EXAMPLE_CONTAINER_IMAGE_LOCALIZED_ID}" + ``` + ## Expected output After deploying the workers and the coordinator, you can inspect the logs that diff --git a/examples/federated-learning/tff/nvflare/README.md b/examples/federated-learning/tff/nvflare/README.md index 89ab39e6..de428e65 100644 --- a/examples/federated-learning/tff/nvflare/README.md +++ b/examples/federated-learning/tff/nvflare/README.md @@ -26,29 +26,75 @@ As shown in the preceding diagram, the blueprint helps you to create and configu - Two pods that are the clients that will be connected to the server in the nvidia-client1 and nvidia-client2 namespaces respectively - One pod that is the the server that will aggregate all the results from the computation in the nvflare-infra namespace -## 1. Create custom image +## Deploy the blueprint -For this demo to work, you have to create a custom image with TensorFlow and Nvflare installed. To create the image and push it on Artifact Registry, go to the `docker-image` and build the image: +This example builds on top of the infrastructure that the +[blueprint provides](../../../../README.md), and follows the best practices the +blueprint establishes. + +To deploy the Nvflare demo described in this document, you need to deploy the [Federated learning blueprint](../../../../README.md#deploy-the-blueprint) first. Then, you can deploy the Nvflare demo described in this document. + +### 1. Create custom image + +For this demo to work, you have to create a custom image with TensorFlow and Nvflare installed. From the main `terraform` folder: + +1. 
Build the example container image locally on your host: + + ```bash + NVFLARE_EXAMPLE_CONTAINER_IMAGE_REPOSITORY_HOSTNAME="$(terraform output -raw container_image_repository_fully_qualified_hostname)" + export NVFLARE_EXAMPLE_CONTAINER_IMAGE_LOCALIZED_ID="${NVFLARE_EXAMPLE_CONTAINER_IMAGE_REPOSITORY_HOSTNAME}/$(terraform output -raw container_image_repository_name)/nvflare-tensorflow" + export NVFLARE_EXAMPLE_CONTAINER_IMAGE_TAG="0.0.1" + NVFLARE_EXAMPLE_CONTAINER_IMAGE_BUILD_CONTEXT_PATH="../examples/federated-learning/tff/nvflare/container-image" + NVFLARE_EXAMPLE_CONTAINER_IMAGE_LOCALIZED_ID_WITH_TAG=${NVFLARE_EXAMPLE_CONTAINER_IMAGE_LOCALIZED_ID}:${NVFLARE_EXAMPLE_CONTAINER_IMAGE_TAG} + + docker build \ + --file "${NVFLARE_EXAMPLE_CONTAINER_IMAGE_BUILD_CONTEXT_PATH}/Dockerfile" \ + --tag "${NVFLARE_EXAMPLE_CONTAINER_IMAGE_LOCALIZED_ID_WITH_TAG}" \ + ${NVFLARE_EXAMPLE_CONTAINER_IMAGE_BUILD_CONTEXT_PATH} + ``` + +1. Authenticate Docker with the Artifact Registry repository: + + ```bash + gcloud auth configure-docker \ + "${NVFLARE_EXAMPLE_CONTAINER_IMAGE_REPOSITORY_HOSTNAME}" + ``` + +1. Push the container image to the Artifact Registry repository: + + ```bash + docker image push "${NVFLARE_EXAMPLE_CONTAINER_IMAGE_LOCALIZED_ID_WITH_TAG}" + ``` + +Once the image has been built, render the `kustomization.yaml.tpl` template to set the new name and new tag of the image: ```bash -cd docker-image -export REGION=$(gcloud config get compute/region) -export PROJET_ID=$(gcloud config get core/project) -export REPOSITORY=my-repository -gcloud builds submit --tag ${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/nvflare-tensorflow:1.0.0 +envsubst < ../examples/federated-learning/tff/nvflare/templates/kustomization.yaml.tpl > ../examples/federated-learning/tff/nvflare/kustomization.yaml ``` -Once the image is built, modify the `kustomization.yaml` file to add the new name and new tag of the image. - -## 2. Create infrastructure to store the models +### 2. 
Create infrastructure to store the models -All the models generated will be stored in a Cloud storage bucket mounted by each pod. To create the Cloud storage and give the right permissions, from the `terraform` folder, run : +All the models generated will be stored in a Cloud storage bucket mounted by each pod. To create the Cloud storage and give the right permissions, from the `terraform` folder, add the flag `nvflare = true` to your tfvars file and run : ```bash terraform apply ``` -## 3. Create Nvflare folder structure +Once the bucket has been generated, render the `pv-workspace.yaml.tpl` template to set the bucket name: + +```bash +export NVFLARE_WORKSPACE_BUCKET_NAME="\"$(terraform output -raw nvflare_workspace_bucket_name)\"" +envsubst < ../examples/federated-learning/tff/nvflare/templates/pv-workspace.yaml.tpl > ../examples/federated-learning/tff/nvflare/workload-pkg/base-storage/pv-workspace.yaml +``` + +Regenerate the `kustomization.yaml` file with all the necessary information: + +```bash +export NVFLARE_EXAMPLE_WORKLOADS_KUBERNETES_NAMESPACE="$(terraform output -raw nvflare_workloads_kubernetes_namespace)" +envsubst < ../examples/federated-learning/tff/nvflare/templates/kustomization.yaml.tpl > ../examples/federated-learning/tff/nvflare/kustomization.yaml +``` + +### 3. Create Nvflare folder structure Now that the persistent volume is created, you can start creating the folder structure and upload it to Cloud Storage. First install nvflare on your workstation: @@ -77,7 +123,7 @@ The different folders generated represent the infrastructure you will deploy on - `site-1` and `site-2` are the clients that will be connected to the server - `admin@nvidia.com` is the administration client to start and list jobs -## 4. Clone the repository +### 4. 
Clone the repository You will need to clone the Nvflare repository that contains the job you will run on the reference architecture: @@ -97,21 +143,26 @@ cp -R ${HOME}/NVFlare/examples/hello-world/hello-tf2 . Now, copy the whole workspace folder in Cloud Storage. The pods will have access to the infrastructure to run jobs: ```bash -gcloud storage -m cp -r ${HOME}/workspace gs://nvflare-storage +gcloud storage cp -r ${HOME}/workspace gs://${NVFLARE_WORKSPACE_BUCKET_NAME//\"/} # strips the literal quotes embedded in the variable ``` -## 5. Deploy the infrastructure +### 5. Deploy the infrastructure -Everything is now setup to be able to submit the job. Deploy the infrastructure: +Everything is now set up to be able to submit the job. Deploy the infrastructure via ConfigSync, where `ACM_REPOSITORY_PATH` is the local path of your ConfigSync repository checkout: ```bash -kubectl apply -k . +mkdir "${ACM_REPOSITORY_PATH}/configsync/nvflare" +cp -a ../examples/federated-learning/tff/nvflare/workload-pkg/. "${ACM_REPOSITORY_PATH}/configsync/nvflare" +cp ../examples/federated-learning/tff/nvflare/kustomization.yaml "${ACM_REPOSITORY_PATH}/configsync" ``` -Both clients and servers rely on kustomize to be deployed. If you want to add more client, just copy/paste the `client1` folder, modify values accordingly and add the new folder in the `kustomization.yaml` file. Do the same with the `server1` folder. Then, redeploy the whole infrastructure: +Then, commit everything in the ConfigSync repository to trigger reconciliation. From the ConfigSync repository: ```bash -kubectl apply -k . +git add nvflare/ +git add kustomization.yaml +git commit -m "feat: add nvflare demo" +git push ``` You should end up with the following running pods: @@ -123,7 +174,7 @@ nvflare-client2-895b65d8f-p4fs9 1/1 Running 0 16h nvflare-server1-66c44ddb47-dhtqz 1/1 Running 0 16h ``` -## 6. Submit the job +### 6. Submit the job Everything is now ready to submit and run the job. Go to the `admin@nvidia.com` folder and connect to the infrastructure. 
When prompted, the username is `admin@nvidia.com`: diff --git a/examples/federated-learning/tff/nvflare/templates/kustomization.yaml.tpl b/examples/federated-learning/tff/nvflare/templates/kustomization.yaml.tpl new file mode 100644 index 00000000..8e806fa7 --- /dev/null +++ b/examples/federated-learning/tff/nvflare/templates/kustomization.yaml.tpl @@ -0,0 +1,16 @@ +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - nvflare/base-storage + - nvflare/server1 + - nvflare/client1 + - nvflare/client2 + +namespace: ${NVFLARE_EXAMPLE_WORKLOADS_KUBERNETES_NAMESPACE} + +images: + - name: nvflare-tensorflow + newName: ${NVFLARE_EXAMPLE_CONTAINER_IMAGE_LOCALIZED_ID} + newTag: ${NVFLARE_EXAMPLE_CONTAINER_IMAGE_TAG} diff --git a/examples/federated-learning/tff/nvflare/templates/pv-workspace.yaml.tpl b/examples/federated-learning/tff/nvflare/templates/pv-workspace.yaml.tpl new file mode 100644 index 00000000..73c06938 --- /dev/null +++ b/examples/federated-learning/tff/nvflare/templates/pv-workspace.yaml.tpl @@ -0,0 +1,20 @@ +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: nvflare-pv + labels: + app: nvflare +spec: + accessModes: + - ReadWriteMany + capacity: + storage: 5Gi + storageClassName: nvflare-storage-class + mountOptions: + - implicit-dirs + csi: + driver: gcsfuse.csi.storage.gke.io + volumeHandle: ${NVFLARE_WORKSPACE_BUCKET_NAME} + volumeAttributes: + gcsfuseLoggingSeverity: warning diff --git a/examples/federated-learning/tff/nvflare/workload-pkg/base-deployment/base-deployment.yaml b/examples/federated-learning/tff/nvflare/workload-pkg/base-deployment/base-deployment.yaml new file mode 100644 index 00000000..3cc2e00d --- /dev/null +++ b/examples/federated-learning/tff/nvflare/workload-pkg/base-deployment/base-deployment.yaml @@ -0,0 +1,69 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: nvflare + labels: + run: nvflare +spec: + replicas: 1 + selector: + matchLabels: + run: nvflare + template: + 
metadata: + labels: + run: nvflare + annotations: + gke-gcsfuse/volumes: "true" + spec: + containers: + - name: nvflare + image: nvflare-tensorflow + imagePullPolicy: Always + securityContext: + runAsNonRoot: true + runAsUser: 1000 + runAsGroup: 1000 + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + command: + - /usr/local/bin/python3 + args: + - -u + - -m + - nvflare.private.fed.app.client.client_train + - -m + - /workspace/nvfl/workspace/example_project/prod_00/$(SITE) + - -s + - fed_client.json + - --set + - secure_train=true + - uid=$(SITE) + - config_folder=config + - org=nvidia + env: + - name: SITE + valueFrom: + configMapKeyRef: + key: site + name: configmap + resources: + requests: + cpu: "1" + memory: "1Gi" + limits: + cpu: "1" + memory: "1Gi" + volumeMounts: + - name: nvfl + mountPath: /workspace/nvfl/ + serviceAccountName: ksa + volumes: + - name: nvfl + persistentVolumeClaim: + claimName: nvflare-pv-claim + securityContext: + fsGroup: 1000 diff --git a/examples/federated-learning/tff/nvflare/workload-pkg/base-deployment/kustomization.yaml b/examples/federated-learning/tff/nvflare/workload-pkg/base-deployment/kustomization.yaml new file mode 100644 index 00000000..6d03ffc2 --- /dev/null +++ b/examples/federated-learning/tff/nvflare/workload-pkg/base-deployment/kustomization.yaml @@ -0,0 +1,3 @@ +--- +resources: + - base-deployment.yaml diff --git a/examples/federated-learning/tff/nvflare/workload-pkg/base-service/base-service.yaml b/examples/federated-learning/tff/nvflare/workload-pkg/base-service/base-service.yaml new file mode 100644 index 00000000..1f8775f8 --- /dev/null +++ b/examples/federated-learning/tff/nvflare/workload-pkg/base-service/base-service.yaml @@ -0,0 +1,19 @@ +--- +apiVersion: v1 +kind: Service +metadata: + name: nvflare + labels: + run: nvflare +spec: + ports: + - port: 8002 + protocol: TCP + targetPort: 8002 + name: flport + - port: 8003 + protocol: TCP + targetPort: 8003 + name: adminport + selector: + run: 
nvflare diff --git a/examples/federated-learning/tff/nvflare/workload-pkg/base-service/kustomization.yaml b/examples/federated-learning/tff/nvflare/workload-pkg/base-service/kustomization.yaml new file mode 100644 index 00000000..4f4cd6d9 --- /dev/null +++ b/examples/federated-learning/tff/nvflare/workload-pkg/base-service/kustomization.yaml @@ -0,0 +1,3 @@ +--- +resources: + - base-service.yaml diff --git a/examples/federated-learning/tff/nvflare/workload-pkg/base-storage/kustomization.yaml b/examples/federated-learning/tff/nvflare/workload-pkg/base-storage/kustomization.yaml new file mode 100644 index 00000000..8ee4cc39 --- /dev/null +++ b/examples/federated-learning/tff/nvflare/workload-pkg/base-storage/kustomization.yaml @@ -0,0 +1,4 @@ +--- +resources: + - pv-workspace.yaml + - pvc-workspace.yaml diff --git a/examples/federated-learning/tff/nvflare/workload-pkg/base-storage/pv-workspace.yaml b/examples/federated-learning/tff/nvflare/workload-pkg/base-storage/pv-workspace.yaml new file mode 100644 index 00000000..eb1a0444 --- /dev/null +++ b/examples/federated-learning/tff/nvflare/workload-pkg/base-storage/pv-workspace.yaml @@ -0,0 +1,20 @@ +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: nvflare-pv + labels: + app: nvflare +spec: + accessModes: + - ReadWriteMany + capacity: + storage: 5Gi + storageClassName: nvflare-storage-class + mountOptions: + - implicit-dirs + csi: + driver: gcsfuse.csi.storage.gke.io + volumeHandle: "fcp-nvflare-storage-f26e" + volumeAttributes: + gcsfuseLoggingSeverity: warning diff --git a/examples/federated-learning/tff/nvflare/workload-pkg/base-storage/pvc-workspace.yaml b/examples/federated-learning/tff/nvflare/workload-pkg/base-storage/pvc-workspace.yaml new file mode 100644 index 00000000..b6e723a4 --- /dev/null +++ b/examples/federated-learning/tff/nvflare/workload-pkg/base-storage/pvc-workspace.yaml @@ -0,0 +1,15 @@ +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: nvflare-pv-claim +
labels: + app: nvflare +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 5Gi + volumeName: nvflare-pv + storageClassName: nvflare-storage-class diff --git a/examples/federated-learning/tff/nvflare/workload-pkg/client1/kustomization.yaml b/examples/federated-learning/tff/nvflare/workload-pkg/client1/kustomization.yaml new file mode 100644 index 00000000..b3e8aa7c --- /dev/null +++ b/examples/federated-learning/tff/nvflare/workload-pkg/client1/kustomization.yaml @@ -0,0 +1,15 @@ +--- +resources: + - ../base-deployment + +labels: + - pairs: + run: nvflare-client1 + includeSelectors: true + +nameSuffix: -client1 + +configMapGenerator: + - name: configmap + literals: + - site=site-1 diff --git a/examples/federated-learning/tff/nvflare/workload-pkg/client2/kustomization.yaml b/examples/federated-learning/tff/nvflare/workload-pkg/client2/kustomization.yaml new file mode 100644 index 00000000..3a6412bb --- /dev/null +++ b/examples/federated-learning/tff/nvflare/workload-pkg/client2/kustomization.yaml @@ -0,0 +1,15 @@ +--- +resources: + - ../base-deployment + +labels: + - pairs: + run: nvflare-client2 + includeSelectors: true + +nameSuffix: -client2 + +configMapGenerator: + - name: configmap + literals: + - site=site-2 diff --git a/examples/federated-learning/tff/nvflare/workload-pkg/server1/kustomization.yaml b/examples/federated-learning/tff/nvflare/workload-pkg/server1/kustomization.yaml new file mode 100644 index 00000000..6f25e4b9 --- /dev/null +++ b/examples/federated-learning/tff/nvflare/workload-pkg/server1/kustomization.yaml @@ -0,0 +1,16 @@ +--- +resources: + - ../base-deployment + - ../base-service + +labels: + - pairs: + run: nvflare-server1 + includeSelectors: true + +nameSuffix: -server1 + +configMapGenerator: + - name: configmap + literals: + - site=server1 diff --git a/terraform/acm.tf b/terraform/acm.tf index 0bef7a06..e5de0df1 100644 --- a/terraform/acm.tf +++
b/terraform/acm.tf @@ -30,12 +30,12 @@ resource "google_gke_hub_feature_membership" "acm_feature_member" { version = var.acm_version config_sync { git { - gcp_service_account_email = local.source_repository_service_account_email - sync_repo = google_sourcerepo_repository.configsync-repository.url - sync_branch = var.acm_branch - policy_dir = var.acm_dir - secret_type = "gcpserviceaccount" + sync_repo = var.acm_repository_url + sync_branch = var.acm_branch + policy_dir = var.acm_dir + secret_type = var.acm_secret_type } + prevent_drift = true source_format = "unstructured" } diff --git a/terraform/dns.tf b/terraform/dns.tf index abcdb8a7..6391a525 100644 --- a/terraform/dns.tf +++ b/terraform/dns.tf @@ -115,38 +115,6 @@ module "cloud-dns-private-artifact-registry" { ] } -module "source-repositories-private-artifact-registry" { - source = "terraform-google-modules/cloud-dns/google" - version = "5.2.0" - - description = "Private DNS zone for Cloud Source Repositories" - domain = "source.developers.google.com." - name = "private-cloud-source-repositories" - project_id = data.google_project.project.project_id - type = "private" - - private_visibility_config_networks = [ - module.fedlearn-vpc.network_id - ] - - recordsets = [ - { - name = "*" - type = "CNAME" - ttl = 300 - records = [ - "source.developers.google.com.", - ] - }, - { - name = "" - type = "A" - ttl = 300 - records = local.private_google_access_ips - }, - ] -} - module "distributed-tff-example-dns" { count = local.distributed_tff_example_is_there_a_coordinator && local.distributed_tff_example_are_workers_outside_the_coordinator_mesh ? 
1 : 0 diff --git a/terraform/gke.tf b/terraform/gke.tf index 910c643c..d4a4b861 100644 --- a/terraform/gke.tf +++ b/terraform/gke.tf @@ -27,7 +27,7 @@ module "gke" { enable_shielded_nodes = true grant_registry_access = true gcs_fuse_csi_driver = true - http_load_balancing = false + http_load_balancing = true ip_range_pods = "pods" ip_range_services = "services" master_global_access_enabled = true diff --git a/terraform/iam.tf b/terraform/iam.tf index 2a9a5b64..5efaef2d 100644 --- a/terraform/iam.tf +++ b/terraform/iam.tf @@ -47,19 +47,6 @@ module "project-iam-bindings" { ] } -# There's no Terraform module for Cloud Source Repositories bindings, so we -# configure it directly -resource "google_sourcerepo_repository_iam_binding" "binding" { - project = google_sourcerepo_repository.configsync-repository.project - repository = google_sourcerepo_repository.configsync-repository.name - - role = "roles/viewer" - - members = [ - local.source_repository_service_account_iam_email, - ] -} - module "fl-workload-identity" { for_each = local.tenants source = "terraform-google-modules/kubernetes-engine/google//modules/workload-identity" @@ -85,28 +72,3 @@ module "fl-workload-identity" { module.service_accounts ] } - -module "cloud-source-repositories-workload-identity" { - source = "terraform-google-modules/kubernetes-engine/google//modules/workload-identity" - version = "27.0.0" - project_id = data.google_project.project.project_id - - annotate_k8s_sa = false - k8s_sa_name = "root-reconciler" - location = module.gke.location - name = local.source_repository_service_account_id - namespace = "config-management-system" - use_existing_gcp_sa = true - use_existing_k8s_sa = true - - # The workload identity pool must exist before binding - module_depends_on = [ - module.gke - ] - - depends_on = [ - # Wait for the service accounts to be ready before trying to load data about them - # Ref: https://github.com/terraform-google-modules/terraform-google-kubernetes-engine/issues/1059 - 
module.service_accounts - ] -} diff --git a/terraform/main.tf b/terraform/main.tf index 3ce4121f..192596a1 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -36,6 +36,8 @@ locals { distributed_tff_example_worker_1_hostname = var.distributed_tff_example_configuration != null && contains(keys(var.distributed_tff_example_configuration), name) ? var.distributed_tff_example_configuration[name].worker_1_hostname : "" distributed_tff_example_worker_2_hostname = var.distributed_tff_example_configuration != null && contains(keys(var.distributed_tff_example_configuration), name) ? var.distributed_tff_example_configuration[name].worker_2_hostname : "" distributed_tff_example_worker_emnist_partition_file_name = var.distributed_tff_example_configuration != null && contains(keys(var.distributed_tff_example_configuration), name) ? var.distributed_tff_example_configuration[name].emnist_partition_file_name : "" + + nvflare_example_deploy = var.nvflare } } @@ -77,14 +79,8 @@ locals { list_sa_names = concat( [for tenant in local.tenants : tenant.tenant_nodepool_sa_name], [for tenant in local.tenants : tenant.tenant_apps_sa_name], - [local.source_repository_service_account_name] ) - source_repository_service_account_id = module.service_accounts.service_accounts_map[local.source_repository_service_account_name].account_id - source_repository_service_account_name = "fl-source-repository" - source_repository_service_account_email = module.service_accounts.service_accounts_map[local.source_repository_service_account_name].email - source_repository_service_account_iam_email = "serviceAccount:${local.source_repository_service_account_email}" - acm_config_sync_tenant_configuration_package_source_directory_path = abspath("${path.module}/../tenant-config-pkg") acm_config_sync_destination_directory_path = "${var.acm_repository_path}/${var.acm_dir}" @@ -104,15 +100,16 @@ locals { distributed_tff_example_package_source_fileset = [for f in
fileset(local.distributed_tff_example_package_source_directory_path, "**") : "${local.distributed_tff_example_package_source_directory_path}/${f}"] distributed_tff_example_package_source_content_hash = sha512(join("", [for f in local.distributed_tff_example_package_source_fileset : filesha512(f)])) + nvflare_example_source_directory_path = abspath("${path.module}/../examples/federated-learning/tff/nvflare") + nvflare_example_package_source_directory_path = "${local.nvflare_example_source_directory_path}/workload-pkg" + nvflare_example_package_source_fileset = [for f in fileset(local.nvflare_example_package_source_directory_path, "**") : "${local.nvflare_example_package_source_directory_path}/${f}"] + nvflare_example_package_source_content_hash = sha512(join("", [for f in local.nvflare_example_package_source_fileset : filesha512(f)])) + distributed_tff_example_mesh_wide_source_directory_path = "${local.distributed_tff_example_source_directory_path}/mesh-wide" distributed_tff_example_mesh_wide_source_fileset = [for f in fileset(local.distributed_tff_example_mesh_wide_source_directory_path, "**") : "${local.distributed_tff_example_mesh_wide_source_directory_path}/${f}"] distributed_tff_example_mesh_wide_source_content_hash = sha512(join("", [for f in local.distributed_tff_example_mesh_wide_source_fileset : filesha512(f)])) distributed_tff_example_mesh_wide_destination_directory_path = "${local.acm_config_sync_destination_directory_path}/example-tff-image-classification-mesh-wide" - distributed_tff_example_container_image_source_directory_path = "${local.distributed_tff_example_source_directory_path}/container-image" - distributed_tff_example_container_image_source_fileset = [for f in fileset(local.distributed_tff_example_container_image_source_directory_path, "**") : "${local.distributed_tff_example_container_image_source_directory_path}/${f}"] - distributed_tff_example_container_image_source_descriptors_content_hash = sha512(join("", [for f in 
local.distributed_tff_example_container_image_source_fileset : filesha512(f)])) - acm_config_sync_commit_configuration_script_path = abspath("${path.module}/scripts/commit-repository-changes.sh") delete_fileset_script_path = abspath("${path.module}/scripts/delete-fileset.sh") @@ -139,13 +136,6 @@ locals { copy_distributed_tff_example_mesh_wide_content_script_path = abspath("${path.module}/scripts/copy-tff-example-mesh-wide-content.sh") delete_distributed_tff_example_mesh_wide_content_script_path = local.delete_fileset_script_path - build_push_distributed_tff_example_container_image_script_path = abspath("${path.module}/scripts/build-push-container-image.sh") - - ditributed_tff_example_container_image_repository_hostname = "${google_artifact_registry_repository.container_image_repository.location}-docker.pkg.dev" - distributed_tff_example_container_image_repository_id = "${local.ditributed_tff_example_container_image_repository_hostname}/${google_artifact_registry_repository.container_image_repository.project}/${google_artifact_registry_repository.container_image_repository.repository_id}" - distributed_tff_example_localized_untagged_container_image_id = "${local.distributed_tff_example_container_image_repository_id}/tff-runtime" - distributed_tff_example_localized_container_image_id = "${local.distributed_tff_example_localized_untagged_container_image_id}:${local.distributed_tff_example_container_image_source_descriptors_content_hash}" - # Temporary placeholder tenant_developer_example_account = "someuser@example.com" } @@ -169,3 +159,11 @@ module "cross_device" { spanner_processing_units = var.spanner_processing_units list_apps_sa_iam_emails = local.list_apps_sa_iam_emails[var.cross_device_workloads_kubernetes_namespace] } + +module "nvflare" { + count = var.nvflare ? 
1 : 0 + source = "./nvflare" + project_id = data.google_project.project.project_id + region = var.region + list_apps_sa_iam_emails = local.list_apps_sa_iam_emails[var.nvflare_workloads_kubernetes_namespace] +} diff --git a/terraform/network.tf b/terraform/network.tf index 61ec3b1a..ed386092 100644 --- a/terraform/network.tf +++ b/terraform/network.tf @@ -29,20 +29,6 @@ module "fedlearn-vpc" { routing_mode = "GLOBAL" firewall_rules = [ - { - description = "Default deny egress from node pools" - direction = "EGRESS" - name = "node-pools-deny-egress" - priority = 65535 - ranges = ["0.0.0.0/0"] - target_service_accounts = local.list_nodepool_sa_emails - - deny = [ - { - protocol = "all" - } - ] - }, { description = "Allow egress from node pools to cluster nodes, pods and services" direction = "EGRESS" @@ -133,6 +119,46 @@ module "fedlearn-vpc" { ] } +module "fedlearn-fw-policies" { + source = "terraform-google-modules/network/google//modules/network-firewall-policy" + version = "9.0.0" + + project_id = data.google_project.project.project_id + policy_name = "network-firewall-policies-federated-learning" + target_vpcs = [module.fedlearn-vpc.network_id] + + rules = [ + { + priority = 1000 + direction = "EGRESS" + action = "allow" + rule_name = "node-pools-allow-egress-configsync-source-repository" + description = "Allow egress from node pools to Config Sync source repository" + target_service_accounts = local.list_nodepool_sa_emails + match = { + dest_fqdns = var.acm_source_repository_fqdns # Allow FQDN for Config Sync source repository + layer4_configs = [ + { + ip_protocol = "tcp" + ports = ["22", "443"] # Allow both SSH and HTTPS access + } + ] + } + }, + { + priority = 65535 + direction = "EGRESS" + action = "deny" + rule_name = "node-pools-deny-egress" + description = "Default deny egress from node pools" # Required to add the deny rule in the network firewall policies as they are evaluated after the classical ones + target_service_accounts = local.list_nodepool_sa_emails +
match = { + dest_ip_ranges = ["0.0.0.0/0"] + } + } + ] +} + resource "google_compute_address" "nat_ip" { name = "nat-manual-ip" region = module.fedlearn-vpc.subnets[local.fedlearn_subnet_key].region diff --git a/examples/federated-learning/tff/nvflare/terraform/iam.tf b/terraform/nvflare/iam.tf similarity index 70% rename from examples/federated-learning/tff/nvflare/terraform/iam.tf rename to terraform/nvflare/iam.tf index d090792a..8b67ff8e 100644 --- a/examples/federated-learning/tff/nvflare/terraform/iam.tf +++ b/terraform/nvflare/iam.tf @@ -15,11 +15,9 @@ module "storage_bucket_iam_bindings" { source = "terraform-google-modules/iam/google//modules/storage_buckets_iam" version = "7.7.1" - storage_buckets = [var.workspace_bucket_name] + storage_buckets = [module.buckets.bucket.name] bindings = { - "roles/storage.objectUser" = [ - "principal://iam.googleapis.com/projects/${data.google_project.project.number}/locations/global/workloadIdentityPools/${data.google_project.project.project_id}.svc.id.goog/subject/ns/${var.nvflare_namespace}/sa/default", - ] + "roles/storage.objectUser" = var.list_apps_sa_iam_emails } } diff --git a/examples/federated-learning/tff/nvflare/terraform/main.tf b/terraform/nvflare/main.tf similarity index 100% rename from examples/federated-learning/tff/nvflare/terraform/main.tf rename to terraform/nvflare/main.tf diff --git a/examples/federated-learning/tff/nvflare/terraform/outputs.tf b/terraform/nvflare/outputs.tf similarity index 87% rename from examples/federated-learning/tff/nvflare/terraform/outputs.tf rename to terraform/nvflare/outputs.tf index f0269e51..9ffc74d8 100644 --- a/examples/federated-learning/tff/nvflare/terraform/outputs.tf +++ b/terraform/nvflare/outputs.tf @@ -13,5 +13,6 @@ # limitations under the License. 
output "workspace_bucket_name" { - value = module.buckets.bucket.name + description = "Full name of the workspace bucket" + value = module.buckets.name } diff --git a/examples/federated-learning/tff/nvflare/terraform/variables.tf b/terraform/nvflare/variables.tf similarity index 87% rename from examples/federated-learning/tff/nvflare/terraform/variables.tf rename to terraform/nvflare/variables.tf index 991ddaf5..09ac3fb5 100644 --- a/examples/federated-learning/tff/nvflare/terraform/variables.tf +++ b/terraform/nvflare/variables.tf @@ -28,8 +28,7 @@ variable "workspace_bucket_name" { default = "nvflare-storage" } -variable "nvflare_namespace" { - description = "The namespace where Nvflare will be deployed." - type = string - default = "nvflare" +variable "list_apps_sa_iam_emails" { + description = "List of SA to add roles to when deploying nvflare workload." + type = list(string) } diff --git a/examples/federated-learning/tff/nvflare/terraform/version.tf b/terraform/nvflare/version.tf similarity index 100% rename from examples/federated-learning/tff/nvflare/terraform/version.tf rename to terraform/nvflare/version.tf diff --git a/terraform/outputs.tf b/terraform/outputs.tf index 7ba50f93..f76fa8e9 100644 --- a/terraform/outputs.tf +++ b/terraform/outputs.tf @@ -11,3 +11,23 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +locals { + container_image_repository_fully_qualified_hostname = "${google_artifact_registry_repository.container_image_repository.location}-docker.pkg.dev" + container_image_repository_name = "${google_artifact_registry_repository.container_image_repository.project}/${google_artifact_registry_repository.container_image_repository.repository_id}" +} + +output "container_image_repository_fully_qualified_hostname" { + description = "Fully qualified name of the container image repository." 
+ value = local.container_image_repository_fully_qualified_hostname +} + +output "container_image_repository_name" { + description = "Container image repository name." + value = local.container_image_repository_name +} + +output "nvflare_workspace_bucket_name" { + description = "Nvflare bucket name" + value = var.nvflare ? module.nvflare[0].workspace_bucket_name : null +} diff --git a/terraform/scripts/build-push-container-image.sh b/terraform/scripts/build-push-container-image.sh deleted file mode 100755 index eb69079f..00000000 --- a/terraform/scripts/build-push-container-image.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env sh - -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -o nounset -set -o errexit - -DISTRIBUTED_TFF_EXAMPLE_CONTAINER_IMAGE_BUILD_CONTEXT_PATH="${1}" -DISTRIBUTED_TFF_EXAMPLE_CONTAINER_IMAGE_REPOSITORY_HOSTNAME="${2}" -CONTAINER_IMAGE_LOCALIZED_ID="${3}" - -echo "Build the ${CONTAINER_IMAGE_LOCALIZED_ID} container image.
Context: ${DISTRIBUTED_TFF_EXAMPLE_CONTAINER_IMAGE_BUILD_CONTEXT_PATH}" -docker build \ - --file "${DISTRIBUTED_TFF_EXAMPLE_CONTAINER_IMAGE_BUILD_CONTEXT_PATH}/Dockerfile" \ - --tag "${CONTAINER_IMAGE_LOCALIZED_ID}" \ - "${DISTRIBUTED_TFF_EXAMPLE_CONTAINER_IMAGE_BUILD_CONTEXT_PATH}" - -echo "Authenticating Docker against ${DISTRIBUTED_TFF_EXAMPLE_CONTAINER_IMAGE_REPOSITORY_HOSTNAME}" -gcloud auth configure-docker \ - "${DISTRIBUTED_TFF_EXAMPLE_CONTAINER_IMAGE_REPOSITORY_HOSTNAME}" - -echo "Pushing the ${CONTAINER_IMAGE_LOCALIZED_ID} container image" -docker image push "${CONTAINER_IMAGE_LOCALIZED_ID}" diff --git a/terraform/services.tf b/terraform/services.tf index 09b63864..ab680acd 100644 --- a/terraform/services.tf +++ b/terraform/services.tf @@ -50,7 +50,6 @@ module "project-services" { "meshconfig.googleapis.com", "meshtelemetry.googleapis.com", "monitoring.googleapis.com", - "sourcerepo.googleapis.com", "spanner.googleapis.com", "stackdriver.googleapis.com" ] diff --git a/terraform/source-repository.tf b/terraform/source-repository.tf deleted file mode 100644 index 4bf07ea7..00000000 --- a/terraform/source-repository.tf +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -resource "google_sourcerepo_repository" "configsync-repository" { - name = "fl-configsync" - project = data.google_project.project.project_id -} diff --git a/terraform/tenant-configuration.tf b/terraform/tenant-configuration.tf index 244ee337..8aba568e 100644 --- a/terraform/tenant-configuration.tf +++ b/terraform/tenant-configuration.tf @@ -19,7 +19,7 @@ resource "null_resource" "init_acm_repository" { create_command = <<-EOT "${local.init_local_acm_repository_script_path}" \ "${var.acm_repository_path}" \ - "${google_sourcerepo_repository.configsync-repository.url}" \ + "${var.acm_repository_url}" \ "${var.acm_branch}" EOT @@ -83,7 +83,8 @@ resource "null_resource" "tenant_configuration" { "${var.distributed_tff_example_coordinator_namespace}" \ "${!each.value.distributed_tff_example_is_coordinator && var.distributed_tff_example_deploy_ingress_gateway}" \ "${local.distributed_tff_example_are_workers_outside_the_coordinator_mesh}" \ - "${each.value.distributed_tff_example_deploy ? local.distributed_tff_example_localized_container_image_id : "${local.distributed_tff_example_localized_untagged_container_image_id}:latest"}" + "${local.container_image_repository_fully_qualified_hostname}/${local.container_image_repository_name}/tff-runtime:0.0.1" \
+ "${local.container_image_repository_fully_qualified_hostname}/${local.container_image_repository_name}/nvflare-tensorflow:0.0.1" EOT create_script_hash = md5(file(local.generate_and_copy_acm_tenant_content_script_path)) destroy_command = <<-EOT @@ -92,10 +93,12 @@ resource "null_resource" "tenant_configuration" { EOT destroy_script_hash = md5(file(local.delete_acm_tenant_content_script_path)) - source_contents_hash = local.acm_config_sync_tenant_configuration_package_source_content_hash - distributed_tff_example_package_source_contents_hash = each.value.distributed_tff_example_deploy ? local.distributed_tff_example_package_source_content_hash : "" - distributed_tff_example_container_image_id = each.value.distributed_tff_example_deploy ? local.distributed_tff_example_localized_container_image_id : "" - distributed_tff_example_container_image_source_contents_hash = each.value.distributed_tff_example_deploy ? local.distributed_tff_example_container_image_source_descriptors_content_hash : "" + source_contents_hash = local.acm_config_sync_tenant_configuration_package_source_content_hash + distributed_tff_example_package_source_contents_hash = each.value.distributed_tff_example_deploy ? local.distributed_tff_example_package_source_content_hash : "" + distributed_tff_example_container_image_id = each.value.distributed_tff_example_deploy ? "${local.container_image_repository_fully_qualified_hostname}/${local.container_image_repository_name}/tff-runtime:0.0.1" : "" + + nvflare_example_package_source_contents_hash = each.value.nvflare_example_deploy ? local.nvflare_example_package_source_content_hash : "" + nvflare_example_container_image_id = each.value.nvflare_example_deploy ? "${local.container_image_repository_fully_qualified_hostname}/${local.container_image_repository_name}/nvflare-tensorflow:0.0.1" : "" # Always run this.
We check if something needs to be done in the creation script timestamp = timestamp() @@ -116,31 +119,6 @@ resource "null_resource" "tenant_configuration" { ] } -resource "null_resource" "build_push_distributed_tff_example_container_image" { - count = local.deploy_distributed_tff_example_any_tenant ? 1 : 0 - - triggers = { - create_command = <<-EOT - "${local.build_push_distributed_tff_example_container_image_script_path}" \ - "${local.distributed_tff_example_container_image_source_directory_path}" \ - "${local.ditributed_tff_example_container_image_repository_hostname}" \ - "${local.distributed_tff_example_localized_container_image_id}" - EOT - create_script_hash = md5(file(local.build_push_distributed_tff_example_container_image_script_path)) - - source_contents_hash = local.distributed_tff_example_container_image_source_descriptors_content_hash - container_image_id = local.distributed_tff_example_localized_container_image_id - - # Always run this. We check if something needs to be done in the creation script - timestamp = timestamp() - } - - provisioner "local-exec" { - when = create - command = self.triggers.create_command - } -} - resource "null_resource" "copy_mesh_wide_distributed_tff_example_content" { count = local.deploy_distributed_tff_example_any_tenant ? 
1 : 0 diff --git a/terraform/variables.tf b/terraform/variables.tf index 7c78a75f..cacce4f5 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -114,23 +114,38 @@ variable "enable_confidential_nodes" { } variable "acm_version" { - description = "Anthos Config Management version" + description = "Config Management version" default = "" type = string } variable "acm_branch" { default = "main" - description = "The Git branch Anthos Config Management will sync to" + description = "The Git branch in the repository that Config Sync will sync with" type = string } variable "acm_dir" { default = "configsync" - description = "The directory in the repository that Anthos Config Management will sync to" + description = "The directory in the repository that Config Sync will sync with" type = string } +variable "acm_repository_url" { + description = "The URL of the repository that Config Sync will sync with" + type = string +} + +variable "acm_secret_type" { + description = "Secret type to authenticate with the Config Sync Git repository. Ref: https://cloud.google.com/kubernetes-engine/enterprise/config-sync/docs/how-to/installing-config-sync#git-creds-secret" + type = string +} + +variable "acm_source_repository_fqdns" { + description = "FQDNs of source repository for Config Sync to allow in the Network Firewall Policy" + type = list(string) +} + # We can't validate if this directory exists because the fileexists function # doesn't support directories (yet?) # Ref: https://github.com/hashicorp/terraform/issues/33394 @@ -186,6 +201,12 @@ variable "cross_device" { default = false } +variable "nvflare" { + description = "Enable nvflare infrastructure deployment" + type = bool + default = false +} + variable "spanner_instance_config" { description = "Multi region config value for the Spanner Instance. Example: 'nam10' for North America." 
type = string @@ -203,3 +224,9 @@ variable "cross_device_workloads_kubernetes_namespace" { type = string default = "main" } + +variable "nvflare_workloads_kubernetes_namespace" { + description = "Namespace of SA where the nvflare workload will be deployed" + type = string + default = "fltenant1" +}