crmne · grgr · Oct 25, 2025 · Oct 25, 2025 · Oct 29, 2025 · Oct 22, 2025
diff --git a/README.md b/README.md
@@ -74,6 +74,11 @@ RubyLLM.embed "Ruby is elegant and expressive"
 RubyLLM.transcribe "meeting.wav"
 ```
 
+```ruby
+# Text to speech
+RubyLLM.tts "Hello, welcome to RubyLLM!"
+```
+
 ```ruby
 # Moderate content for safety
 RubyLLM.moderate "Check if this text is safe"

diff --git a/docs/_core_features/text-to-speech.md b/docs/_core_features/text-to-speech.md
@@ -0,0 +1,96 @@
+---
+layout: default
+title: Text to Speech
+nav_order: 7
+description: Convert text to speech
+redirect_from:
+  - /guides/text-to-speech
+  - /guides/tts
+---
+
+# {{ page.title }}
+{: .d-inline-block .no_toc }
+
+v1.9.0+
+{: .label .label-green }
+
+{{ page.description }}
+{: .fs-6 .fw-300 }
+
+## Table of contents
+{: .no_toc .text-delta }
+
+1. TOC
+{:toc}
+
+---
+
+After reading this guide, you will know:
+
+*   How to generate speech from text.
+*   How to save audio files.
+*   How to select different voices.
+*   How to access raw audio data.
+*   Specifics of language support.
+
+## Basic Text to Speech
+
+Generate audio with the global `RubyLLM.tts` method:
+
+```ruby
+audio = RubyLLM.tts("Hello, welcome to RubyLLM!")
+
+```
+
+## Save Audio File
+You can save the generated audio to a file.
+If you are using OpenAI, the audio will be saved as an MP3 file.
+
+```ruby
+audio = RubyLLM.tts("This is a text to speech example.", provider: :openai, model: "gpt-4o-mini-tts")
+audio.save("example.mp3")
+```
+
+If you are using Gemini, the audio will be saved as a raw PCM file.
+
+```ruby
+audio = RubyLLM.tts("This is a text to speech example.", provider: :gemini, model: "gemini-2.5-flash-preview-tts")
+audio.save("example.pcm")
+```
+
+You can convert it to MP3 using ffmpeg:
+
+```bash
+ffmpeg -f s16le -ar 24000 -ac 1 -i example.pcm example.mp3
+```
+
+### Select Voice
+You can specify different voices. Supported voices for OpenAI
+are alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, shimmer, and verse.
+
+For Gemini, see the [list of Gemini voices](https://ai.google.dev/gemini-api/docs/speech-generation#voices).
+
+```ruby
+# Using a specific voice
+voice = "ash"
+audio = RubyLLM.tts("Hello, this is #{voice}'s voice.", voice: voice)
+```
+
+### Access Audio Data
+You can access the raw audio data:
+
+```ruby
+audio = RubyLLM.tts("Accessing raw audio data.")
+audio.data # => binary audio data (MP3 for OpenAI, PCM for Gemini)
+```
+
+### Language Support
+OpenAI and Gemini both detect the language automatically from the text you provide.
+Gemini previously let you specify the language manually; `RubyLLM.tts` has no language option.
+
+## Next Steps
+
+*   [Chatting with AI Models]({% link _core_features/chat.md %}): Learn about conversational AI.
+*   [Image Generation]({% link _core_features/image-generation.md %}): Generate images from text.
+*   [Error Handling]({% link _advanced/error-handling.md %}): Master handling API errors.
+
diff --git a/docs/index.md b/docs/index.md
@@ -138,6 +138,11 @@ RubyLLM.embed "Ruby is elegant and expressive"
 RubyLLM.transcribe "meeting.wav"
 ```
 
+```ruby
+# Text to speech
+RubyLLM.tts "Hello, welcome to RubyLLM!"
+```
+
 ```ruby
 # Moderate content for safety
 RubyLLM.moderate "Check if this text is safe"

diff --git a/lib/ruby_llm.rb b/lib/ruby_llm.rb
@@ -59,6 +59,10 @@ def paint(...)
       Image.paint(...)
     end
 
+    def tts(...) # Text to speech; forwards all arguments unchanged to Speech.tts
+      Speech.tts(...)
+    end
+
     def transcribe(...)
       Transcription.transcribe(...)
     end

diff --git a/lib/ruby_llm/configuration.rb b/lib/ruby_llm/configuration.rb
@@ -29,6 +29,7 @@ class Configuration
                   :default_embedding_model,
                   :default_moderation_model,
                   :default_image_model,
+                  :default_audio_model,
                   :default_transcription_model,
                   # Model registry
                   :model_registry_file,
@@ -60,6 +61,7 @@ def initialize
       @default_embedding_model = 'text-embedding-3-small'
       @default_moderation_model = 'omni-moderation-latest'
       @default_image_model = 'gpt-image-1'
+      @default_audio_model = 'gpt-4o-mini-tts'
       @default_transcription_model = 'whisper-1'
 
       @model_registry_file = File.expand_path('models.json', __dir__)

diff --git a/lib/ruby_llm/provider.rb b/lib/ruby_llm/provider.rb
@@ -82,6 +82,12 @@ def paint(prompt, model:, size:)
       parse_image_response(response, model:)
     end
 
+    def tts(input, model:, voice:) # POSTs the rendered speech payload to speech_url and parses the response
+      payload = render_speech_payload(input, model:, voice:) # must precede speech_url (Gemini reads @model)
+      response = @connection.post speech_url, payload
+      parse_speech_response(response, model:)
+    end
+
     def transcribe(audio_file, model:, language:, **options)
       file_part = build_audio_file_part(audio_file)
       payload = render_transcription_payload(file_part, model:, language:, **options)

diff --git a/lib/ruby_llm/providers/gemini.rb b/lib/ruby_llm/providers/gemini.rb
@@ -7,6 +7,7 @@ class Gemini < Provider
       include Gemini::Chat
       include Gemini::Embeddings
       include Gemini::Images
+      include Gemini::Speech
       include Gemini::Models
       include Gemini::Transcription
       include Gemini::Streaming

diff --git a/lib/ruby_llm/providers/gemini/speech.rb b/lib/ruby_llm/providers/gemini/speech.rb
@@ -0,0 +1,47 @@
+# frozen_string_literal: true
+
+module RubyLLM
+  module Providers
+    class Gemini
+      # Text-to-speech support for Gemini: builds the generateContent request and decodes the returned audio
+      module Speech
+        module_function
+
+        def speech_url # endpoint path; interpolates @model, which render_speech_payload assigns
+          "models/#{@model}:generateContent"
+        end
+
+        def render_speech_payload(input, model:, voice:)
+          @model = model # kept so speech_url can put the model id into the request path
+          {
+            contents: [{
+              role: 'user',
+              parts: [{ text: input }]
+            }],
+            generationConfig: {
+              responseModalities: ['AUDIO'],
+              speechConfig: {
+                voiceConfig: {
+                  prebuiltVoiceConfig: {
+                    voiceName: voice
+                  }
+                }
+              }
+            },
+            model: model
+          }
+        end
+
+        def parse_speech_response(response, model:)
+          base64_audio = response.body['candidates'][0]['content']['parts'][0]['inlineData']['data']
+          pcm_data = Base64.decode64(base64_audio) # Base64 payload of the first candidate's first part, decoded
+
+          RubyLLM::Speech.new(
+            model: model,
+            data: pcm_data
+          )
+        end
+      end
+    end
+  end
+end
diff --git a/lib/ruby_llm/providers/openai.rb b/lib/ruby_llm/providers/openai.rb
@@ -11,6 +11,7 @@ class OpenAI < Provider
       include OpenAI::Streaming
       include OpenAI::Tools
       include OpenAI::Images
+      include OpenAI::Speech
       include OpenAI::Media
       include OpenAI::Transcription
 

diff --git a/lib/ruby_llm/providers/openai/speech.rb b/lib/ruby_llm/providers/openai/speech.rb
@@ -0,0 +1,32 @@
+# frozen_string_literal: true
+
+module RubyLLM
+  module Providers
+    class OpenAI
+      # Text-to-speech support for OpenAI: request path, request payload and response wrapping
+      module Speech
+        module_function
+
+        def speech_url
+          'audio/speech' # path handed to the provider connection's post call
+        end
+
+        def render_speech_payload(input, model:, voice:)
+          {
+            model: model,
+            input: input,
+            voice: voice
+          }
+        end
+
+        def parse_speech_response(response, model:)
+          data = response.body # body is used as the audio data as-is, with no decoding step
+          RubyLLM::Speech.new(
+            model: model,
+            data: data
+          )
+        end
+      end
+    end
+  end
+end
diff --git a/lib/ruby_llm/speech.rb b/lib/ruby_llm/speech.rb
@@ -0,0 +1,32 @@
+# frozen_string_literal: true
+
+module RubyLLM
+  # Represents speech audio generated from text by an AI model.
+  class Speech
+    attr_reader :model, :data
+
+    def initialize(data:, model: nil)
+      @model = model
+      @data = data
+    end
+
+    def save(path)
+      File.binwrite(File.expand_path(path), data) # audio data is written in binary mode
+      path # the caller's original path is returned, not the expanded one
+    end
+
+    def self.tts(input, # rubocop:disable Metrics/ParameterLists
+                 model: nil,
+                 provider: nil,
+                 assume_model_exists: false,
+                 voice: 'alloy',
+                 context: nil)
+      config = context&.config || RubyLLM.config # a context's config wins over the global one
+      model ||= config.default_audio_model
+      model, provider_instance = Models.resolve(model, provider: provider, assume_exists: assume_model_exists,
+                                                       config: config)
+
+      provider_instance.tts(input, model: model.id, voice:) # result is whatever the provider's tts returns
+    end
+  end
+end
diff --git a/..._basic_functionality_gemini_gemini-2_5-flash-preview-tts_can_generate_audio_from_text.yml b/..._basic_functionality_gemini_gemini-2_5-flash-preview-tts_can_generate_audio_from_text.yml
diff --git a/...settes/speech_basic_functionality_openai_gpt-4o-mini-tts_can_generate_audio_from_text.yml b/...settes/speech_basic_functionality_openai_gpt-4o-mini-tts_can_generate_audio_from_text.yml
diff --git a/spec/ruby_llm/speech_spec.rb b/spec/ruby_llm/speech_spec.rb
@@ -0,0 +1,54 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+require 'tempfile'
+
+def save_and_verify_audio(audio)
+  # Reserve a temp path with an .mp3 suffix; audio.save writes to it below
+  temp_file = Tempfile.new(['audio', '.mp3'])
+  temp_path = temp_file.path
+  temp_file.close
+
+  begin
+    saved_path = audio.save(temp_path)
+    expect(saved_path).to eq(temp_path)
+    expect(File.exist?(temp_path)).to be true
+
+    file_size = File.size(temp_path)
+    expect(file_size).to be > 1000 # Any real audio should be larger than 1KB
+  ensure
+    # Remove the file even when an expectation above fails
+    File.delete(temp_path)
+  end
+end
+
+RSpec.describe RubyLLM::Speech do
+  include_context 'with configured RubyLLM'
+
+  describe 'basic functionality' do
+    SPEECH_MODELS.each do |config| # generates one example per provider/model entry
+      provider = config[:provider]
+      model = config[:model]
+
+      it "#{provider}/#{model} can generate audio from text" do
+        voice = provider == :gemini ? 'Sadachbia' : 'alloy' # Gemini uses its own voice names
+        audio = RubyLLM.tts(
+          'Hello, welcome!',
+          model: model,
+          provider: provider,
+          voice: voice
+        )
+
+        expect(audio.model).to eq(model)
+
+        save_and_verify_audio audio
+      end
+    end
+
+    it 'validates model existence' do
+      expect do
+        RubyLLM.tts('Hello, welcome!', model: 'invalid-audio-model') # assume_model_exists left at its default
+      end.to raise_error(RubyLLM::ModelNotFoundError)
+    end
+  end
+end
diff --git a/spec/support/models_to_test.rb b/spec/support/models_to_test.rb
@@ -57,3 +57,8 @@
   { provider: :gemini, model: 'gemini-2.5-flash' },
   { provider: :vertexai, model: 'gemini-2.5-flash' }
 ].freeze
+
+SPEECH_MODELS = [ # provider/model pairs the speech spec iterates over
+  { provider: :openai, model: 'gpt-4o-mini-tts' },
+  { provider: :gemini, model: 'gemini-2.5-flash-preview-tts' }
+].freeze