Skip to content

Commit 44ef03f

Browse files
committed
feat: add speech generation support with a new AudioContent constructor, speechConfigs in interaction parameters, and a dedicated test.
1 parent 9f8e8c2 commit 44ef03f

File tree

4 files changed

+142
-10
lines changed

4 files changed

+142
-10
lines changed

src/main/java/io/github/glaforge/gemini/interactions/model/Config.java

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,17 @@ public record SpeechConfig(
9090
String voice,
9191
String language,
9292
String speaker
93-
) {}
93+
) {
94+
/**
95+
* Creates a new SpeechConfig with voice and language.
96+
*
97+
* @param voice The voice to use.
98+
* @param language The language of the speech.
99+
*/
100+
public SpeechConfig(String voice, String language) {
101+
this(voice, language, null);
102+
}
103+
}
94104

95105
/**
96106
* Configuration for image generation.

src/main/java/io/github/glaforge/gemini/interactions/model/Content.java

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,17 @@ record AudioContent(
187187
byte[] data,
188188
String uri,
189189
@JsonProperty("mime_type") String mimeType
190-
) implements Content {}
190+
) implements Content {
191+
/**
192+
* Creates a new AudioContent with default type "audio".
193+
*
194+
* @param data The audio data bytes (base64-encoded only in the JSON representation).
195+
* @param mimeType The MIME type of the audio.
196+
*/
197+
public AudioContent(byte[] data, String mimeType) {
198+
this("audio", data, null, mimeType);
199+
}
200+
}
191201

192202
/**
193203
* Content containing a document.

src/main/java/io/github/glaforge/gemini/interactions/model/InteractionParams.java

Lines changed: 42 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ public Builder() {}
9696
private Object responseFormat;
9797
private String responseMimeType;
9898
private String previousInteractionId;
99+
private List<Config.SpeechConfig> speechConfigs;
99100
private Config.ImageConfig imageConfig;
100101

101102
/**
@@ -268,16 +269,32 @@ public Builder responseFormat(Schema schema) {
268269
*/
269270
public Builder imageConfig(Config.ImageConfig imageConfig) { this.imageConfig = imageConfig; return this; }
270271

272+
/**
273+
* Sets the speech config.
274+
*
275+
* @param speechConfig The speech configuration.
276+
* @return This builder.
277+
*/
278+
public Builder speechConfig(Config.SpeechConfig speechConfig) {
    // List.of rejects null elements with a NullPointerException, so a null
    // argument is stored as "no speech configs" instead of being wrapped.
    this.speechConfigs = speechConfig == null ? null : List.of(speechConfig);
    return this;
}
279+
280+
/**
281+
* Sets the speech configs.
282+
*
283+
* @param speechConfigs The speech configurations.
284+
* @return This builder.
285+
*/
286+
public Builder speechConfigs(List<Config.SpeechConfig> speechConfigs) {
    // The caller's list reference is stored as given; no copy is made.
    this.speechConfigs = speechConfigs;
    return this;
}
287+
271288
/**
272289
* Builds the CreateModelInteractionParams.
273290
*
274291
* @return The CreateModelInteractionParams parameters.
275292
*/
276293
public ModelInteractionParams build() {
277294
Config.GenerationConfig finalConfig = generationConfig;
278-
if (imageConfig != null) {
295+
if (imageConfig != null || speechConfigs != null) {
279296
if (finalConfig == null) {
280-
finalConfig = new Config.GenerationConfig(null, null, null, null, null, null, null, null, null, imageConfig);
297+
finalConfig = new Config.GenerationConfig(null, null, null, null, null, null, null, null, speechConfigs, imageConfig);
281298
} else {
282299
finalConfig = new Config.GenerationConfig(
283300
finalConfig.temperature(),
@@ -288,8 +305,8 @@ public ModelInteractionParams build() {
288305
finalConfig.thinkingLevel(),
289306
finalConfig.thinkingSummaries(),
290307
finalConfig.maxOutputTokens(),
291-
finalConfig.speechConfig(),
292-
imageConfig
308+
speechConfigs != null ? speechConfigs : finalConfig.speechConfig(),
309+
imageConfig != null ? imageConfig : finalConfig.imageConfig()
293310
);
294311
}
295312
}
@@ -354,6 +371,7 @@ public Builder() {}
354371
private Object responseFormat;
355372
private String responseMimeType;
356373
private String previousInteractionId;
374+
private List<Config.SpeechConfig> speechConfigs;
357375
private Config.ImageConfig imageConfig;
358376

359377
/**
@@ -540,16 +558,32 @@ public Builder responseFormat(String responseFormat) {
540558
*/
541559
public Builder imageConfig(Config.ImageConfig imageConfig) { this.imageConfig = imageConfig; return this; }
542560

561+
/**
562+
* Sets the speech config.
563+
*
564+
* @param speechConfig The speech configuration.
565+
* @return This builder.
566+
*/
567+
public Builder speechConfig(Config.SpeechConfig speechConfig) {
    // List.of rejects null elements with a NullPointerException, so a null
    // argument is stored as "no speech configs" instead of being wrapped.
    this.speechConfigs = speechConfig == null ? null : List.of(speechConfig);
    return this;
}
568+
569+
/**
570+
* Sets the speech configs.
571+
*
572+
* @param speechConfigs The speech configurations.
573+
* @return This builder.
574+
*/
575+
public Builder speechConfigs(List<Config.SpeechConfig> speechConfigs) {
    // The caller's list reference is stored as given; no copy is made.
    this.speechConfigs = speechConfigs;
    return this;
}
576+
543577
/**
544578
* Builds the CreateAgentInteractionParams.
545579
*
546580
* @return The CreateAgentInteractionParams parameters.
547581
*/
548582
public AgentInteractionParams build() {
549583
Config.GenerationConfig finalConfig = generationConfig;
550-
if (imageConfig != null) {
584+
if (imageConfig != null || speechConfigs != null) {
551585
if (finalConfig == null) {
552-
finalConfig = new Config.GenerationConfig(null, null, null, null, null, null, null, null, null, imageConfig);
586+
finalConfig = new Config.GenerationConfig(null, null, null, null, null, null, null, null, speechConfigs, imageConfig);
553587
} else {
554588
finalConfig = new Config.GenerationConfig(
555589
finalConfig.temperature(),
@@ -560,8 +594,8 @@ public AgentInteractionParams build() {
560594
finalConfig.thinkingLevel(),
561595
finalConfig.thinkingSummaries(),
562596
finalConfig.maxOutputTokens(),
563-
finalConfig.speechConfig(),
564-
imageConfig
597+
speechConfigs != null ? speechConfigs : finalConfig.speechConfig(),
598+
imageConfig != null ? imageConfig : finalConfig.imageConfig()
565599
);
566600
}
567601
}
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
/*
2+
* Copyright 2025 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* https://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package io.github.glaforge.gemini.interactions;
18+
19+
import io.github.glaforge.gemini.interactions.model.Config.SpeechConfig;
20+
import io.github.glaforge.gemini.interactions.model.Content.AudioContent;
21+
import io.github.glaforge.gemini.interactions.model.Interaction;
22+
import io.github.glaforge.gemini.interactions.model.InteractionParams.ModelInteractionParams;
23+
import org.junit.jupiter.api.Test;
24+
import org.junit.jupiter.api.condition.EnabledIfEnvironmentVariable;
25+
26+
import java.io.IOException;
27+
import java.nio.file.Files;
28+
import java.nio.file.Path;
29+
import java.nio.file.Paths;
30+
31+
import static org.junit.jupiter.api.Assertions.*;
32+
33+
@EnabledIfEnvironmentVariable(named = "GEMINI_API_KEY", matches = ".+")
34+
public class SpeechGenerationTest {
35+
36+
@Test
37+
public void testSpeechGeneration() {
38+
GeminiInteractionsClient client = GeminiInteractionsClient.builder()
39+
.apiKey(System.getenv("GEMINI_API_KEY"))
40+
.build();
41+
42+
ModelInteractionParams request = ModelInteractionParams.builder()
43+
.model("gemini-2.5-flash-preview-tts")
44+
.input("Say the following: WOOHOO This is so much fun!")
45+
.responseModalities(Interaction.Modality.AUDIO)
46+
.speechConfig(new SpeechConfig("kore", "en-us"))
47+
.build();
48+
49+
Interaction interaction = client.create(request);
50+
51+
assertNotNull(interaction);
52+
assertNotNull(interaction.outputs());
53+
54+
boolean hasAudio = interaction.outputs().stream()
55+
.anyMatch(output -> output instanceof AudioContent);
56+
57+
assertTrue(hasAudio, "Response should contain audio content");
58+
59+
interaction.outputs().stream()
60+
.filter(output -> output instanceof AudioContent)
61+
.map(output -> (AudioContent) output)
62+
.forEach(audio -> {
63+
assertNotNull(audio.data());
64+
assertTrue(audio.data().length > 0);
65+
assertEquals("audio", audio.type());
66+
System.out.println("Received audio data of length: " + audio.data().length);
67+
68+
try {
69+
Path targetPath = Paths.get("target", "generated-audio.raw");
70+
Files.createDirectories(targetPath.getParent());
71+
Files.write(targetPath, audio.data());
72+
System.out.println("Saved audio to: " + targetPath.toAbsolutePath());
73+
} catch (IOException e) {
74+
fail("Failed to save audio file: " + e.getMessage());
75+
}
76+
});
77+
}
78+
}

0 commit comments

Comments
 (0)