Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(tpu): add tpu vm create topology sample. #9611

Merged
merged 29 commits into from
Nov 20, 2024
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
b951997
Changed package, added information to CODEOWNERS
TetyanaYahodska Oct 15, 2024
9ee20d7
Added information to CODEOWNERS
TetyanaYahodska Oct 15, 2024
f0b8314
Added timeout
TetyanaYahodska Oct 16, 2024
055d61e
Merge branch 'main' into tpu-vm-crud-operations
TetyanaYahodska Oct 16, 2024
d3e1dee
Fixed parameters for test
TetyanaYahodska Oct 16, 2024
2253b54
Fixed DeleteTpuVm and naming
TetyanaYahodska Oct 17, 2024
d29a6b5
Added comment, created Util class
TetyanaYahodska Oct 18, 2024
d832b31
Merge branch 'main' into tpu-vm-crud-operations
TetyanaYahodska Oct 23, 2024
6956852
Fixed naming
TetyanaYahodska Oct 23, 2024
478beaa
Fixed whitespace
TetyanaYahodska Oct 23, 2024
f6b76cc
Merge branch 'main' into tpu-vm-crud-operations
TetyanaYahodska Oct 29, 2024
ec13f4d
Split PR into smaller, deleted redundant code
TetyanaYahodska Oct 29, 2024
d8e2887
Implemented tpu_vm_create_topology sample, created test
TetyanaYahodska Oct 29, 2024
15bceb0
Merged changes from main
TetyanaYahodska Oct 30, 2024
8fc3716
Changed zone
TetyanaYahodska Oct 30, 2024
435dcd5
Merge branch 'main' into tpu_vm_create_topology
TetyanaYahodska Oct 31, 2024
d0ca1da
Fixed empty lines and tests, deleted cleanup method
TetyanaYahodska Oct 31, 2024
69e107c
Fixed tests
TetyanaYahodska Oct 31, 2024
41ce693
Merged changes from main
TetyanaYahodska Nov 7, 2024
d81e48a
Fixed test
TetyanaYahodska Nov 18, 2024
f4885e1
Merged changes from main
TetyanaYahodska Nov 18, 2024
1193bfe
Fixed imports
TetyanaYahodska Nov 18, 2024
8ed463c
Increased timeout to 10 sec
TetyanaYahodska Nov 18, 2024
ae51af1
Fixed tests
TetyanaYahodska Nov 18, 2024
c57cdc2
Fixed tests
TetyanaYahodska Nov 18, 2024
df64c89
Merge branch 'main' into tpu_vm_create_topology
TetyanaYahodska Nov 20, 2024
1833a5c
Deleted settings
TetyanaYahodska Nov 20, 2024
d611789
Made ByteArrayOutputStream bout as local variable
TetyanaYahodska Nov 20, 2024
bb84b24
Changed timeout to 10 sec
TetyanaYahodska Nov 20, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 108 additions & 0 deletions tpu/src/main/java/tpu/CreateTpuWithTopologyFlag.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
/*
* Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package tpu;

//[START tpu_vm_create_topology]
import com.google.api.gax.longrunning.OperationTimedPollAlgorithm;
import com.google.api.gax.retrying.RetrySettings;
import com.google.cloud.tpu.v2.AcceleratorConfig;
import com.google.cloud.tpu.v2.AcceleratorConfig.Type;
import com.google.cloud.tpu.v2.CreateNodeRequest;
import com.google.cloud.tpu.v2.Node;
import com.google.cloud.tpu.v2.TpuClient;
import com.google.cloud.tpu.v2.TpuSettings;
import java.io.IOException;
import java.util.concurrent.ExecutionException;
import org.threeten.bp.Duration;

/**
 * Sample showing how to create a Cloud TPU VM whose accelerator is described by a TPU version
 * plus a physical topology.
 */
public class CreateTpuWithTopologyFlag {

  public static void main(String[] args)
      throws IOException, ExecutionException, InterruptedException {
    // TODO(developer): Replace these variables before running the sample.
    // Project ID or project number of the Google Cloud project you want to create a node.
    String projectId = "YOUR_PROJECT_ID";
    // The zone in which to create the TPU.
    // For more information about supported TPU types for specific zones,
    // see https://cloud.google.com/tpu/docs/regions-zones
    String zone = "europe-west4-a";
    // The name for your TPU.
    String nodeName = "YOUR_TPU_NAME";
    // The version of the Cloud TPU you want to create.
    // Available options: TYPE_UNSPECIFIED = 0, V2 = 2, V3 = 4, V4 = 7
    Type tpuVersion = AcceleratorConfig.Type.V2;
    // Software version that specifies the version of the TPU runtime to install.
    // For more information, see https://cloud.google.com/tpu/docs/runtimes
    String tpuSoftwareVersion = "tpu-vm-tf-2.17.0-pod-pjrt";
    // The physical topology of your TPU slice.
    // For more information about topology for each TPU version,
    // see https://cloud.google.com/tpu/docs/system-architecture-tpu-vm#versions.
    String topology = "2x2";

    createTpuWithTopologyFlag(projectId, zone, nodeName, tpuVersion, tpuSoftwareVersion, topology);
  }

  /**
   * Creates a TPU VM with the specified name, zone, version and topology.
   *
   * @param projectId project ID or project number used to build the parent resource path
   * @param zone zone used to build the parent resource path
   * @param nodeName value used both as the node ID of the request and as the node's name
   * @param tpuVersion accelerator type placed in the node's accelerator config
   * @param tpuSoftwareVersion runtime version set on the node
   * @param topology topology string placed in the node's accelerator config
   * @return the {@link Node} produced by the completed create operation
   * @throws IOException if the TPU client cannot be created
   * @throws ExecutionException if the create operation completes exceptionally
   * @throws InterruptedException if the thread is interrupted while waiting for the operation
   */
  public static Node createTpuWithTopologyFlag(String projectId, String zone, String nodeName,
      Type tpuVersion, String tpuSoftwareVersion, String topology)
      throws IOException, ExecutionException, InterruptedException {
    // With these settings the client library handles the Operation's polling mechanism
    // and prevents a CancellationException error.
    TpuSettings.Builder clientSettings =
        TpuSettings.newBuilder();
    clientSettings
        .createNodeOperationSettings()
        .setPollingAlgorithm(
            OperationTimedPollAlgorithm.create(
                RetrySettings.newBuilder()
                    .setInitialRetryDelay(Duration.ofMillis(5000L))
                    .setRetryDelayMultiplier(1.5)
                    .setMaxRetryDelay(Duration.ofMillis(45000L))
                    .setInitialRpcTimeout(Duration.ZERO)
                    .setRpcTimeoutMultiplier(1.0)
                    .setMaxRpcTimeout(Duration.ZERO)
                    .setTotalTimeout(Duration.ofHours(24L))
                    .build()));

    // Initialize client that will be used to send requests. This client only needs to be created
    // once, and can be reused for multiple requests.
    try (TpuClient tpuClient = TpuClient.create(clientSettings.build())) {
      String parent = String.format("projects/%s/locations/%s", projectId, zone);

      // Build the accelerator config directly from its own builder instead of going through
      // a throwaway Node builder.
      AcceleratorConfig acceleratorConfig =
          AcceleratorConfig.newBuilder()
              .setType(tpuVersion)
              .setTopology(topology)
              .build();

      Node tpuVm =
          Node.newBuilder()
              .setName(nodeName)
              .setAcceleratorConfig(acceleratorConfig)
              .setRuntimeVersion(tpuSoftwareVersion)
              .build();

      CreateNodeRequest request =
          CreateNodeRequest.newBuilder()
              .setParent(parent)
              .setNodeId(nodeName)
              .setNode(tpuVm)
              .build();

      // Blocks until the long-running create operation finishes.
      return tpuClient.createNodeAsync(request).get();
    }
  }
}
//[END tpu_vm_create_topology]
1 change: 0 additions & 1 deletion tpu/src/main/java/tpu/GetQueuedResource.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
package tpu;

//[START tpu_queued_resources_get]

import com.google.cloud.tpu.v2alpha1.GetQueuedResourceRequest;
import com.google.cloud.tpu.v2alpha1.QueuedResource;
import com.google.cloud.tpu.v2alpha1.TpuClient;
Expand Down
70 changes: 0 additions & 70 deletions tpu/src/test/java/tpu/CreateTpuIT.java

This file was deleted.

21 changes: 10 additions & 11 deletions tpu/src/test/java/tpu/QueuedResourceIT.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import org.junit.Before;
import org.junit.Test;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.Timeout;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
Expand All @@ -52,10 +52,10 @@ public class QueuedResourceIT {
private static final String TPU_SOFTWARE_VERSION = "tpu-vm-tf-2.14.1";
private static final String QUEUED_RESOURCE_NAME = "queued-resource";
private static final String NETWORK_NAME = "default";
private ByteArrayOutputStream bout;
private static ByteArrayOutputStream bout;

@Before
public void setUp() {
@BeforeAll
public static void setUp() {
bout = new ByteArrayOutputStream();
System.setOut(new PrintStream(bout));
}
Expand All @@ -75,8 +75,8 @@ public void testCreateQueuedResourceWithSpecifiedNetwork() throws Exception {

QueuedResource returnedQueuedResource =
CreateQueuedResourceWithNetwork.createQueuedResourceWithNetwork(
PROJECT_ID, ZONE, QUEUED_RESOURCE_NAME, NODE_NAME,
TPU_TYPE, TPU_SOFTWARE_VERSION, NETWORK_NAME);
PROJECT_ID, ZONE, QUEUED_RESOURCE_NAME, NODE_NAME,
TPU_TYPE, TPU_SOFTWARE_VERSION, NETWORK_NAME);

verify(mockTpuClient, times(1))
.createQueuedResourceAsync(any(CreateQueuedResourceRequest.class));
Expand All @@ -89,7 +89,6 @@ public void testCreateQueuedResourceWithSpecifiedNetwork() throws Exception {
public void testGetQueuedResource() throws IOException {
try (MockedStatic<TpuClient> mockedTpuClient = mockStatic(TpuClient.class)) {
TpuClient mockClient = mock(TpuClient.class);
GetQueuedResource mockGetQueuedResource = mock(GetQueuedResource.class);
QueuedResource mockQueuedResource = mock(QueuedResource.class);

mockedTpuClient.when(TpuClient::create).thenReturn(mockClient);
Expand All @@ -99,14 +98,14 @@ public void testGetQueuedResource() throws IOException {
QueuedResource returnedQueuedResource =
GetQueuedResource.getQueuedResource(PROJECT_ID, ZONE, NODE_NAME);

verify(mockGetQueuedResource, times(1))
.getQueuedResource(PROJECT_ID, ZONE, NODE_NAME);
verify(mockClient, times(1))
.getQueuedResource(any(GetQueuedResourceRequest.class));
assertEquals(returnedQueuedResource, mockQueuedResource);
}
}

@Test
public void testDeleteTpuVm() {
public void testDeleteForceQueuedResource() {
try (MockedStatic<TpuClient> mockedTpuClient = mockStatic(TpuClient.class)) {
TpuClient mockTpuClient = mock(TpuClient.class);
OperationFuture mockFuture = mock(OperationFuture.class);
Expand Down
60 changes: 57 additions & 3 deletions tpu/src/test/java/tpu/TpuVmIT.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package tpu;

import static com.google.common.truth.Truth.assertThat;
import static org.junit.Assert.assertEquals;
import static org.mockito.Mockito.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.mockStatic;
Expand All @@ -25,6 +26,8 @@
import static org.mockito.Mockito.when;

import com.google.api.gax.longrunning.OperationFuture;
import com.google.cloud.tpu.v2.AcceleratorConfig;
import com.google.cloud.tpu.v2.CreateNodeRequest;
import com.google.cloud.tpu.v2.DeleteNodeRequest;
import com.google.cloud.tpu.v2.GetNodeRequest;
import com.google.cloud.tpu.v2.Node;
Expand All @@ -47,6 +50,10 @@ public class TpuVmIT {
private static final String PROJECT_ID = "project-id";
private static final String ZONE = "asia-east1-c";
private static final String NODE_NAME = "test-tpu";
private static final String TPU_TYPE = "v2-8";
private static final AcceleratorConfig.Type ACCELERATOR_TYPE = AcceleratorConfig.Type.V2;
private static final String TPU_SOFTWARE_VERSION = "tpu-vm-tf-2.14.1";
private static final String TOPOLOGY = "2x2";
private static ByteArrayOutputStream bout;

@BeforeAll
Expand All @@ -55,20 +62,43 @@ public static void setUp() {
System.setOut(new PrintStream(bout));
}

// Checks that CreateTpuVm.createTpuVm sends exactly one createNodeAsync request through the
// (statically mocked) TpuClient, waits on the returned future once, and returns its result.
@Test
public void testCreateTpuVm() throws Exception {
  try (MockedStatic<TpuClient> mockedTpuClient = mockStatic(TpuClient.class)) {
    Node mockNode = mock(Node.class);
    TpuClient mockTpuClient = mock(TpuClient.class);
    OperationFuture mockFuture = mock(OperationFuture.class);

    // Stubs must be in place before the sample is invoked.
    mockedTpuClient.when(() -> TpuClient.create(any(TpuSettings.class)))
        .thenReturn(mockTpuClient);
    when(mockTpuClient.createNodeAsync(any(CreateNodeRequest.class)))
        .thenReturn(mockFuture);
    when(mockFuture.get()).thenReturn(mockNode);

    Node returnedNode = CreateTpuVm.createTpuVm(
        PROJECT_ID, ZONE, NODE_NAME,
        TPU_TYPE, TPU_SOFTWARE_VERSION);

    verify(mockTpuClient, times(1))
        .createNodeAsync(any(CreateNodeRequest.class));
    verify(mockFuture, times(1)).get();
    // JUnit's assertEquals takes (expected, actual); keeping that order makes failure
    // messages read correctly.
    assertEquals(mockNode, returnedNode);
  }
}

@Test
public void testGetTpuVm() throws IOException {
try (MockedStatic<TpuClient> mockedTpuClient = mockStatic(TpuClient.class)) {
Node mockNode = mock(Node.class);
TpuClient mockClient = mock(TpuClient.class);
GetTpuVm mockGetTpuVm = mock(GetTpuVm.class);

mockedTpuClient.when(TpuClient::create).thenReturn(mockClient);
when(mockClient.getNode(any(GetNodeRequest.class))).thenReturn(mockNode);

Node returnedNode = GetTpuVm.getTpuVm(PROJECT_ID, ZONE, NODE_NAME);

verify(mockGetTpuVm, times(1))
.getTpuVm(PROJECT_ID, ZONE, NODE_NAME);
verify(mockClient, times(1))
.getNode(any(GetNodeRequest.class));
assertThat(returnedNode).isEqualTo(mockNode);
}
}
Expand All @@ -91,4 +121,28 @@ public void testDeleteTpuVm() throws IOException, ExecutionException, Interrupte
verify(mockTpuClient, times(1)).deleteNodeAsync(any(DeleteNodeRequest.class));
}
}

// Checks that CreateTpuWithTopologyFlag.createTpuWithTopologyFlag sends exactly one
// createNodeAsync request through the (statically mocked) TpuClient, waits on the returned
// future once, and returns its result.
@Test
public void testCreateTpuVmWithTopologyFlag()
    throws IOException, ExecutionException, InterruptedException {
  try (MockedStatic<TpuClient> mockedTpuClient = mockStatic(TpuClient.class)) {
    Node mockNode = mock(Node.class);
    TpuClient mockTpuClient = mock(TpuClient.class);
    OperationFuture mockFuture = mock(OperationFuture.class);

    // Stubs must be in place before the sample is invoked.
    mockedTpuClient.when(() -> TpuClient.create(any(TpuSettings.class)))
        .thenReturn(mockTpuClient);
    when(mockTpuClient.createNodeAsync(any(CreateNodeRequest.class)))
        .thenReturn(mockFuture);
    when(mockFuture.get()).thenReturn(mockNode);

    Node returnedNode = CreateTpuWithTopologyFlag.createTpuWithTopologyFlag(
        PROJECT_ID, ZONE, NODE_NAME, ACCELERATOR_TYPE,
        TPU_SOFTWARE_VERSION, TOPOLOGY);

    verify(mockTpuClient, times(1))
        .createNodeAsync(any(CreateNodeRequest.class));
    verify(mockFuture, times(1)).get();
    // JUnit's assertEquals takes (expected, actual); keeping that order makes failure
    // messages read correctly.
    assertEquals(mockNode, returnedNode);
  }
}
}