mini-rag-voice-assistant/app_openai.py at main · Scicrop/mini-rag-voice-assistant · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
import pyaudio
import wave
from faster_whisper import WhisperModel
import ollama
import numpy as np
import time
import os
import argparse
from piper import PiperVoice
import torch
import onnxruntime as ort
import led_matrix as mb
from rag_openai import get_response

# Optimized audio settings
# Number of frames read from the input stream per call; also used as frames_per_buffer
# and as the playback read size.
CHUNK = 512
# Sample format handed to PyAudio; is_speech decodes the captured buffers as int16.
FORMAT = pyaudio.paInt16
# Channel count used when opening the input stream and when writing the WAV header.
CHANNELS = 1
# Sample rate used for capture and for the WAV header.
RATE = 16000
# File the recording is written to, transcribed from, and removed after each cycle.
WAVE_OUTPUT_FILENAME = "input.wav"
# Mean absolute sample value above which a chunk is treated as speech.
SILENCE_THRESHOLD = 400
# Seconds of continuous non-speech that end a recording once it has started.
SILENCE_DURATION = 1.5
# Upper bound, in seconds of captured audio, for a single recording.
MAX_DURATION = 10


class VoiceAssistant:
    """Voice question/answer loop: record, transcribe, query the backend, speak.

    Each cycle records one utterance from the configured input device into
    ``WAVE_OUTPUT_FILENAME``, transcribes it with Faster-Whisper, sends the
    text to ``get_response`` and plays the answer synthesized by Piper.
    """

    # Seconds to wait before retrying when the input device could not be
    # opened, so a missing/busy device does not turn ``run`` into a busy loop.
    RETRY_DELAY = 1.0

    def __init__(self, device_index):
        """Create the PyAudio handle and load the speech models.

        Args:
            device_index: PyAudio input device index used for recording.
        """
        self.device_index = device_index
        self.audio = pyaudio.PyAudio()

        # Faster-Whisper runs on the CPU
        self.whisper_device = "cpu"
        print(f"Dispositivo do Faster-Whisper: {self.whisper_device}")

        # Piper: report CUDA when onnxruntime exposes the CUDA provider.
        # NOTE(review): this value is only printed; PiperVoice.load below is
        # called without any device argument, so this code does not itself
        # select the GPU for Piper -- confirm against the Piper API.
        self.piper_device = "cuda" if "CUDAExecutionProvider" in ort.get_available_providers() else "cpu"
        print(f"Dispositivo esperado do Piper: {self.piper_device}")

        # Ollama device (informational only; nothing below consumes it)
        self.ollama_device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Dispositivo esperado do Ollama: {self.ollama_device}")

        # Load Faster-Whisper on the CPU
        self.whisper_model = WhisperModel("tiny", device=self.whisper_device, compute_type="int8")

        # Load the Piper voice
        self.piper_voice = PiperVoice.load("voice.onnx", config_path="voice.onnx.json")

    @staticmethod
    def _close_stream(stream):
        """Stop and close a PyAudio stream, closing it even if stopping fails."""
        try:
            stream.stop_stream()
        finally:
            stream.close()

    def is_speech(self, data, threshold=None):
        """Tell whether a raw int16 audio buffer is loud enough to be speech.

        Args:
            data: Bytes-like buffer of 16-bit signed samples.
            threshold: Mean absolute amplitude above which the buffer counts
                as speech. ``None`` (the default) uses the module-level
                ``SILENCE_THRESHOLD``, looked up at call time.

        Returns:
            bool: True when the mean absolute amplitude exceeds ``threshold``;
            False for an empty buffer.
        """
        if threshold is None:
            threshold = SILENCE_THRESHOLD
        samples = np.frombuffer(data, dtype=np.int16)
        if samples.size == 0:
            return False
        # Widen before abs(): in int16, abs(-32768) wraps back to -32768, which
        # would make a clipped (very loud) chunk look like silence.
        level = np.abs(samples.astype(np.int32)).mean()
        return bool(level > threshold)

    def record_audio(self):
        """Wait for speech, record it and save it to ``WAVE_OUTPUT_FILENAME``.

        Recording starts with the first chunk classified as speech and stops
        after ``SILENCE_DURATION`` seconds of non-speech or once more than
        ``MAX_DURATION`` seconds of audio were captured.

        Returns:
            bool: True when the WAV file was written; False when the input
            device could not be opened or the file could not be saved.
        """
        mb.set_behavior("pulse_green")
        print("Aguardando você falar...")
        try:
            stream = self.audio.open(format=FORMAT, channels=CHANNELS,
                                     rate=RATE, input=True,
                                     frames_per_buffer=CHUNK,
                                     input_device_index=self.device_index)
        except OSError as e:
            print(f"Erro ao abrir dispositivo {self.device_index}: {e}")
            return False

        frames = []
        recording = False
        silence_start = None

        try:
            while True:
                data = stream.read(CHUNK, exception_on_overflow=False)
                speaking = self.is_speech(data)
                if not recording:
                    if speaking:
                        print("Detectei voz! Gravando...")
                        recording = True
                        frames.append(data)
                    continue

                frames.append(data)
                if speaking:
                    silence_start = None
                elif silence_start is None:
                    # Monotonic clock: immune to wall-clock adjustments.
                    silence_start = time.monotonic()
                elif time.monotonic() - silence_start > SILENCE_DURATION:
                    print("Silêncio detectado. Parando gravação...")
                    break
                if len(frames) * CHUNK / RATE > MAX_DURATION:
                    print("Tempo máximo atingido. Parando gravação...")
                    break
        finally:
            # Also runs on Ctrl+C or a read error, so the device is released.
            self._close_stream(stream)

        try:
            with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as wf:
                wf.setnchannels(CHANNELS)
                wf.setsampwidth(self.audio.get_sample_size(FORMAT))
                wf.setframerate(RATE)
                wf.writeframes(b''.join(frames))
            print("Gravação concluída!")
            return True
        except Exception as e:
            print(f"Erro ao salvar gravação: {e}")
            return False

    def speech_to_text(self):
        """Transcribe ``WAVE_OUTPUT_FILENAME`` with the loaded Whisper model.

        Returns:
            str | None: The transcript with surrounding whitespace removed
            (an empty string when nothing was recognized), or None when the
            file is missing or transcription raised.
        """
        print(f"Dispositivo do Faster-Whisper: {self.whisper_device}")
        if not os.path.exists(WAVE_OUTPUT_FILENAME):
            print(f"Arquivo de áudio {WAVE_OUTPUT_FILENAME} não encontrado.")
            return None
        try:
            start_time = time.time()
            segments, info = self.whisper_model.transcribe(
                WAVE_OUTPUT_FILENAME,
                language="pt",
                initial_prompt="Este é um áudio em português brasileiro."
            )
            # Strip each segment so a whitespace-only transcript ends up empty
            # (falsy) instead of being forwarded as a question.
            pieces = (segment.text.strip() for segment in segments)
            text = " ".join(piece for piece in pieces if piece)
            end_time = time.time()
            print(f"Tempo de transcrição: {end_time - start_time:.2f} segundos")
            return text
        except Exception as e:
            print(f"Erro na transcrição: {e}")
            return None

    def ask_ollama(self, question):
        """Send the question to ``get_response`` and return its answer.

        Args:
            question: Transcribed user question.

        Returns:
            The value returned by ``get_response``, or None when it raised.
        """
        mb.set_behavior("thinking")
        print("Consultando Ollama (espera-se GPU)...")
        start_time = time.time()
        try:
            response = get_response(question)
            end_time = time.time()
            print(f"Tempo de resposta do Ollama: {end_time - start_time:.2f} segundos")
            return response
        except Exception as e:
            print(f"Erro ao consultar o modelo: {e}")
            return None

    def text_to_speech(self, text):
        """Synthesize ``text`` with Piper into ``response.wav`` and play it.

        Errors during synthesis or playback are printed, not raised.

        Args:
            text: Text to be spoken.
        """
        mb.set_behavior("pulse_blue")
        print(f"Dispositivo do Piper: {self.piper_device}")
        audio_file = "response.wav"
        try:
            start_time = time.time()
            with wave.open(audio_file, "wb") as wav_file:
                self.piper_voice.synthesize(text, wav_file)
            end_time = time.time()
            print(f"Tempo de síntese Piper: {end_time - start_time:.2f} segundos")
            with wave.open(audio_file, 'rb') as wf:
                stream = self.audio.open(format=self.audio.get_format_from_width(wf.getsampwidth()),
                                         channels=wf.getnchannels(),
                                         rate=wf.getframerate(),
                                         output=True)
                try:
                    # readframes returns b"" at end of file, ending the loop.
                    for data in iter(lambda: wf.readframes(CHUNK), b""):
                        stream.write(data)
                finally:
                    # Release the output stream even if playback fails midway.
                    self._close_stream(stream)
        except Exception as e:
            print(f"Erro na síntese ou reprodução de voz: {e}")

    def run(self):
        """Run the record -> transcribe -> answer -> speak loop until interrupted."""
        while True:
            if not self.record_audio():
                # Back off briefly instead of retrying a failing device at full speed.
                time.sleep(self.RETRY_DELAY)
                continue
            try:
                question = self.speech_to_text()
                if question:
                    print(f"Pergunta reconhecida: {question}")
                    answer = self.ask_ollama(question)
                    if answer:
                        print(f"Resposta do: {answer}")
                        self.text_to_speech(answer)
            finally:
                # Drop the temporary recording on every exit path of the cycle.
                if os.path.exists(WAVE_OUTPUT_FILENAME):
                    os.remove(WAVE_OUTPUT_FILENAME)
            print("Pronto para a próxima pergunta! (Ctrl+C para sair)")

    def cleanup(self):
        """Terminate the PyAudio instance created in ``__init__``."""
        self.audio.terminate()


def parse_arguments():
    """Read the assistant's command-line options from ``sys.argv``.

    Returns:
        argparse.Namespace: carries ``device_index`` (int), the audio device
        index, which is 5 unless ``--device-index`` is supplied.
    """
    cli = argparse.ArgumentParser(
        description="Assistente de voz otimizado para Jetson Orin Nano.",
    )
    cli.add_argument(
        "--device-index",
        default=5,
        type=int,
        help="Índice do dispositivo de áudio (padrão: 5)",
    )
    namespace = cli.parse_args()
    return namespace


if __name__ == "__main__":
    # Script entry point: start the led_matrix animation, parse the CLI and
    # run the assistant loop until it is interrupted.
    mb.start_animation()
    args = parse_arguments()
    assistant = VoiceAssistant(args.device_index)
    try:
        assistant.run()
    except KeyboardInterrupt:
        print("Programa encerrado pelo usuário.")
    finally:
        # Terminate PyAudio on every exit path, not only on Ctrl+C, so an
        # unexpected error in the loop does not leave the audio backend open.
        assistant.cleanup()