Skip to content

Commit

Permalink
feat(tpu): add tpu queued resources network (#9605)
Browse files Browse the repository at this point in the history
* Added tpu_queued_resources_network sample

* Changed CODEOWNERS

* Update CreateQueuedResourceWithNetwork.java

---------

Co-authored-by: Eric Schmidt <[email protected]>
  • Loading branch information
TetyanaYahodska and telpirion authored Oct 29, 2024
1 parent 19dcf09 commit 002b4ae
Show file tree
Hide file tree
Showing 7 changed files with 539 additions and 0 deletions.
1 change: 1 addition & 0 deletions .github/CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
/security-command-center @GoogleCloudPlatform/java-samples-reviewers @yoshi-approver @GoogleCloudPlatform/cloud-samples-reviewers @GoogleCloudPlatform/dee-infra @GoogleCloudPlatform/gcp-security-command-center
/servicedirectory @GoogleCloudPlatform/java-samples-reviewers @yoshi-approver @GoogleCloudPlatform/cloud-samples-reviewers @GoogleCloudPlatform/dee-infra
/webrisk @GoogleCloudPlatform/java-samples-reviewers @yoshi-approver @GoogleCloudPlatform/cloud-samples-reviewers @GoogleCloudPlatform/dee-infra
/tpu @GoogleCloudPlatform/java-samples-reviewers @yoshi-approver @GoogleCloudPlatform/cloud-samples-reviewers @GoogleCloudPlatform/dee-infra

# DEE Platform Ops (DEEPO)
/errorreporting @GoogleCloudPlatform/java-samples-reviewers @yoshi-approver @GoogleCloudPlatform/cloud-samples-reviewers
Expand Down
101 changes: 101 additions & 0 deletions tpu/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Copyright 2024 Google LLC
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns="http://maven.apache.org/POM/4.0.0"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!--
Maven coordinates of the TPU samples module.
NOTE(review): the artifactId does not mention "tpu" and looks carried over from
another sample module; confirm whether it should be renamed.
-->
<groupId>com.example.tpu</groupId>
<artifactId>gce-diregapic-samples</artifactId>
<version>1.0-SNAPSHOT</version>

<!--
The parent pom defines common style checks and testing strategies for our samples.
Removing or replacing it should not affect the execution of the samples in any way.
-->
<parent>
<artifactId>shared-configuration</artifactId>
<groupId>com.google.cloud.samples</groupId>
<version>1.2.0</version>
</parent>

<!-- Compile the samples with Java 11 as both source and target level. -->
<properties>
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
</properties>

<dependencies>
<!-- Cloud TPU client library used by the samples; its version is pinned explicitly here. -->
<dependency>
<groupId>com.google.cloud</groupId>
<artifactId>google-cloud-tpu</artifactId>
<version>2.52.0</version>
</dependency>

<!--
No version is declared for this dependency; presumably it is resolved through the
libraries-bom import in the dependencyManagement section below (confirm).
-->
<dependency>
<groupId>com.google.api</groupId>
<artifactId>gax</artifactId>
</dependency>

<!-- Test dependencies -->
<dependency>
<artifactId>google-cloud-storage</artifactId>
<groupId>com.google.cloud</groupId>
<scope>test</scope>
</dependency>

<dependency>
<artifactId>truth</artifactId>
<groupId>com.google.truth</groupId>
<scope>test</scope>
<version>1.4.0</version>
</dependency>
<dependency>
<artifactId>junit</artifactId>
<groupId>junit</groupId>
<scope>test</scope>
<version>4.13.2</version>
</dependency>

<!--
JUnit Jupiter dependencies to run BeforeEach and AfterEach methods
(in tandem with mvn surefire) before every test.
Without these, mvn surefire skips these methods and leads to concurrency
issues.
-->
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-engine</artifactId>
<version>5.10.2</version>
<scope>test</scope>
</dependency>
<!-- Mocking library, available to tests only. -->
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<version>5.13.0</version>
<scope>test</scope>
</dependency>
</dependencies>

<!-- Imports the Google Cloud libraries BOM (type pom, scope import) for dependency management. -->
<dependencyManagement>
<dependencies>
<dependency>
<artifactId>libraries-bom</artifactId>
<groupId>com.google.cloud</groupId>
<scope>import</scope>
<type>pom</type>
<version>26.40.0</version>
</dependency>
</dependencies>
</dependencyManagement>

</project>
139 changes: 139 additions & 0 deletions tpu/src/main/java/tpu/CreateQueuedResourceWithNetwork.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
/*
* Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package tpu;

//[START tpu_queued_resources_network]
import com.google.api.gax.retrying.RetrySettings;
import com.google.cloud.tpu.v2alpha1.CreateQueuedResourceRequest;
import com.google.cloud.tpu.v2alpha1.NetworkConfig;
import com.google.cloud.tpu.v2alpha1.Node;
import com.google.cloud.tpu.v2alpha1.QueuedResource;
import com.google.cloud.tpu.v2alpha1.TpuClient;
import com.google.cloud.tpu.v2alpha1.TpuSettings;
import java.io.IOException;
import java.util.concurrent.ExecutionException;
import org.threeten.bp.Duration;

/**
 * Sample that creates a Cloud TPU Queued Resource whose node is attached to a specific
 * network and subnetwork.
 */
public class CreateQueuedResourceWithNetwork {
  public static void main(String[] args)
      throws IOException, ExecutionException, InterruptedException {
    // TODO(developer): Replace these variables before running the sample.
    // Project ID or project number of the Google Cloud project you want to create a node.
    String projectId = "YOUR_PROJECT_ID";
    // The zone in which to create the TPU.
    // For more information about supported TPU types for specific zones,
    // see https://cloud.google.com/tpu/docs/regions-zones
    String zone = "europe-west4-a";
    // The name for your TPU.
    String nodeName = "YOUR_TPU_NAME";
    // The accelerator type that specifies the version and size of the Cloud TPU you want to create.
    // For more information about supported accelerator types for each TPU version,
    // see https://cloud.google.com/tpu/docs/system-architecture-tpu-vm#versions.
    String tpuType = "v2-8";
    // Software version that specifies the version of the TPU runtime to install.
    // For more information see https://cloud.google.com/tpu/docs/runtimes
    String tpuSoftwareVersion = "tpu-vm-tf-2.14.1";
    // The name for your Queued Resource.
    String queuedResourceId = "QUEUED_RESOURCE_ID";
    // The name of the network you want the node to connect to.
    // The network should be assigned to your project.
    String networkName = "YOUR_COMPUTE_TPU_NETWORK";

    createQueuedResourceWithNetwork(projectId, zone, queuedResourceId, nodeName,
        tpuType, tpuSoftwareVersion, networkName);
  }

  /**
   * Creates a Queued Resource with network configuration.
   *
   * <p>The node is attached to the network {@code networkName} and to a subnetwork of the
   * same name located in the region derived from {@code zone}.
   *
   * @param projectId project ID or project number of the Google Cloud project
   * @param zone zone in which to create the TPU, for example {@code europe-west4-a}
   * @param queuedResourceId name for the Queued Resource
   * @param nodeName name for the TPU node
   * @param tpuType accelerator type of the TPU, for example {@code v2-8}
   * @param tpuSoftwareVersion TPU runtime version to install
   * @param networkName name of the network (and of the subnetwork) to connect the node to
   * @return the Queued Resource returned by the create operation
   * @throws IOException if the TPU client cannot be created
   * @throws ExecutionException if the create operation fails
   * @throws InterruptedException if the thread is interrupted while waiting for the operation
   * @throws IllegalArgumentException if no region can be derived from {@code zone}
   */
  public static QueuedResource createQueuedResourceWithNetwork(
      String projectId, String zone, String queuedResourceId, String nodeName,
      String tpuType, String tpuSoftwareVersion, String networkName)
      throws IOException, ExecutionException, InterruptedException {
    // Derive the resource paths first so that a malformed zone is rejected
    // before a client is opened.
    String parent = String.format("projects/%s/locations/%s", projectId, zone);
    String region = regionFromZone(zone);

    // With these settings the client library handles the Operation's polling mechanism
    // and prevents CancellationException error
    TpuSettings.Builder clientSettings =
        TpuSettings.newBuilder();
    clientSettings
        .createQueuedResourceSettings()
        .setRetrySettings(
            RetrySettings.newBuilder()
                .setInitialRetryDelay(Duration.ofMillis(5000L))
                .setRetryDelayMultiplier(2.0)
                .setInitialRpcTimeout(Duration.ZERO)
                .setRpcTimeoutMultiplier(1.0)
                .setMaxRetryDelay(Duration.ofMillis(45000L))
                .setTotalTimeout(Duration.ofHours(24L))
                .build());
    // Initialize client that will be used to send requests. This client only needs to be created
    // once, and can be reused for multiple requests.
    try (TpuClient tpuClient = TpuClient.create(clientSettings.build())) {
      // Specify the network and subnetwork that you want to connect your TPU to.
      // The subnetwork is looked up under the same name as the network.
      NetworkConfig networkConfig =
          NetworkConfig.newBuilder()
              .setEnableExternalIps(true)
              .setNetwork(String.format("projects/%s/global/networks/%s", projectId, networkName))
              .setSubnetwork(
                  String.format(
                      "projects/%s/regions/%s/subnetworks/%s", projectId, region, networkName))
              .build();

      // Create a node
      Node node =
          Node.newBuilder()
              .setName(nodeName)
              .setAcceleratorType(tpuType)
              .setRuntimeVersion(tpuSoftwareVersion)
              .setNetworkConfig(networkConfig)
              .setQueuedResource(
                  String.format(
                      "projects/%s/locations/%s/queuedResources/%s",
                      projectId, zone, queuedResourceId))
              .build();

      // Create queued resource
      QueuedResource queuedResource =
          QueuedResource.newBuilder()
              .setName(queuedResourceId)
              .setTpu(
                  QueuedResource.Tpu.newBuilder()
                      .addNodeSpec(
                          QueuedResource.Tpu.NodeSpec.newBuilder()
                              .setParent(parent)
                              .setNode(node)
                              .setNodeId(nodeName)
                              .build())
                      .build())
              .build();

      CreateQueuedResourceRequest request =
          CreateQueuedResourceRequest.newBuilder()
              .setParent(parent)
              .setQueuedResource(queuedResource)
              .setQueuedResourceId(queuedResourceId)
              .build();

      QueuedResource response = tpuClient.createQueuedResourceAsync(request).get();
      // You can wait until TPU Node is READY,
      // and check its status using getTpuVm() from "tpu_vm_get" sample.
      System.out.println("Queued Resource created: " + queuedResourceId);
      return response;
    }
  }

  // Returns the region part of a zone name: everything before the last '-'
  // (for example "europe-west4" for "europe-west4-a"), regardless of suffix length.
  private static String regionFromZone(String zone) {
    int suffixStart = zone.lastIndexOf('-');
    if (suffixStart <= 0) {
      throw new IllegalArgumentException(
          "Zone must have the form <region>-<suffix>, for example europe-west4-a, but was: "
              + zone);
    }
    return zone.substring(0, suffixStart);
  }
}
//[END tpu_queued_resources_network]
78 changes: 78 additions & 0 deletions tpu/src/main/java/tpu/DeleteForceQueuedResource.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/*
* Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package tpu;

//[START tpu_queued_resources_delete_force]

import com.google.api.gax.retrying.RetrySettings;
import com.google.api.gax.rpc.UnknownException;
import com.google.cloud.tpu.v2alpha1.DeleteQueuedResourceRequest;
import com.google.cloud.tpu.v2alpha1.TpuClient;
import com.google.cloud.tpu.v2alpha1.TpuSettings;
import java.io.IOException;
import java.util.concurrent.ExecutionException;
import org.threeten.bp.Duration;

/** Sample that force-deletes a Cloud TPU Queued Resource. */
public class DeleteForceQueuedResource {
  public static void main(String[] args) {
    // TODO(developer): Replace these variables before running the sample.
    // Project ID or project number of the Google Cloud project.
    String projectId = "YOUR_PROJECT_ID";
    // The zone in which the TPU was created.
    String zone = "europe-west4-a";
    // The name for your Queued Resource.
    String queuedResourceId = "QUEUED_RESOURCE_ID";

    deleteForceQueuedResource(projectId, zone, queuedResourceId);
  }

  /**
   * Deletes a Queued Resource asynchronously with --force flag.
   *
   * <p>Failures are not propagated: the exception message is printed to standard output and
   * the method returns normally. The confirmation line is printed only when the delete
   * operation completed without an exception.
   *
   * @param projectId project ID or project number of the Google Cloud project
   * @param zone zone in which the TPU was created
   * @param queuedResourceId name of the Queued Resource to delete
   */
  public static void deleteForceQueuedResource(
      String projectId, String zone, String queuedResourceId) {
    String name = String.format("projects/%s/locations/%s/queuedResources/%s",
        projectId, zone, queuedResourceId);
    // With these settings the client library handles the Operation's polling mechanism
    // and prevents CancellationException error
    TpuSettings.Builder clientSettings =
        TpuSettings.newBuilder();
    clientSettings
        .deleteQueuedResourceSettings()
        .setRetrySettings(
            RetrySettings.newBuilder()
                .setInitialRetryDelay(Duration.ofMillis(5000L))
                .setRetryDelayMultiplier(2.0)
                .setInitialRpcTimeout(Duration.ZERO)
                .setRpcTimeoutMultiplier(1.0)
                .setMaxRetryDelay(Duration.ofMillis(45000L))
                .setTotalTimeout(Duration.ofHours(24L))
                .build());

    // Initialize client that will be used to send requests. This client only needs to be created
    // once, and can be reused for multiple requests.
    try (TpuClient tpuClient = TpuClient.create(clientSettings.build())) {
      DeleteQueuedResourceRequest request =
          DeleteQueuedResourceRequest.newBuilder().setName(name).setForce(true).build();

      tpuClient.deleteQueuedResourceAsync(request).get();

      // Report success only once the operation has actually completed.
      System.out.printf("Deleted Queued Resource: %s\n", name);
    } catch (InterruptedException e) {
      // Restore the interrupt flag so callers can still observe the interruption.
      Thread.currentThread().interrupt();
      System.out.println(e.getMessage());
    } catch (UnknownException | ExecutionException | IOException e) {
      // NOTE(review): UnknownException is caught explicitly here; if the service can raise it
      // for a delete that actually succeeded, confirm whether the confirmation should print.
      System.out.println(e.getMessage());
    }
  }
}
//[END tpu_queued_resources_delete_force]
54 changes: 54 additions & 0 deletions tpu/src/main/java/tpu/GetQueuedResource.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
* Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package tpu;

//[START tpu_queued_resources_get]

import com.google.cloud.tpu.v2alpha1.GetQueuedResourceRequest;
import com.google.cloud.tpu.v2alpha1.QueuedResource;
import com.google.cloud.tpu.v2alpha1.TpuClient;
import java.io.IOException;

/** Sample that looks up a single Cloud TPU Queued Resource by name. */
public class GetQueuedResource {
  public static void main(String[] args) throws IOException {
    // TODO(developer): Replace these variables before running the sample.
    // Project ID or project number of the Google Cloud project.
    String projectId = "YOUR_PROJECT_ID";
    // The zone in which the TPU was created.
    String zone = "europe-west4-a";
    // The name for your Queued Resource.
    String queuedResourceId = "QUEUED_RESOURCE_ID";

    getQueuedResource(projectId, zone, queuedResourceId);
  }

  /**
   * Get a Queued Resource.
   *
   * @param projectId project ID or project number of the Google Cloud project
   * @param zone zone in which the TPU was created
   * @param queuedResourceId name of the Queued Resource to fetch
   * @return the Queued Resource returned by the TPU client
   * @throws IOException if the TPU client cannot be created
   */
  public static QueuedResource getQueuedResource(
      String projectId, String zone, String queuedResourceId) throws IOException {
    // Describe the lookup up front; it does not depend on the client.
    GetQueuedResourceRequest lookup =
        GetQueuedResourceRequest.newBuilder()
            .setName(
                String.format(
                    "projects/%s/locations/%s/queuedResources/%s",
                    projectId, zone, queuedResourceId))
            .build();

    // Initialize client that will be used to send requests. This client only needs to be created
    // once, and can be reused for multiple requests.
    try (TpuClient client = TpuClient.create()) {
      QueuedResource found = client.getQueuedResource(lookup);
      return found;
    }
  }
}
//[END tpu_queued_resources_get]
Loading

0 comments on commit 002b4ae

Please sign in to comment.