-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp_openai.py
More file actions
195 lines (172 loc) · 7.11 KB
/
app_openai.py
File metadata and controls
195 lines (172 loc) · 7.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
import pyaudio
import wave
from faster_whisper import WhisperModel
import ollama
import numpy as np
import time
import os
import argparse
from piper import PiperVoice
import torch
import onnxruntime as ort
import led_matrix as mb
from rag_openai import get_response
# Optimized audio settings
# Number of frames requested per stream read / playback write.
CHUNK = 512
# Sample format passed to PyAudio (paInt16).
FORMAT = pyaudio.paInt16
# Number of input channels opened on the capture device.
CHANNELS = 1
# Capture sample rate in Hz.
RATE = 16000
# File the recording is written to and later transcribed from.
WAVE_OUTPUT_FILENAME = "input.wav"
# Mean absolute sample amplitude above which a chunk counts as speech.
SILENCE_THRESHOLD = 400
# Seconds of continuous non-speech that end a recording.
SILENCE_DURATION = 1.5
# Upper bound, in seconds of captured audio, for a single recording.
MAX_DURATION = 10
class VoiceAssistant:
    """Voice question/answer loop: record, transcribe, query a model, speak.

    Audio is captured and played back through PyAudio, transcribed with
    Faster-Whisper, answered through ``get_response`` (imported from
    ``rag_openai``) and synthesized with Piper. LED feedback is driven via
    the ``led_matrix`` module (``mb.set_behavior``).
    """

    def __init__(self, device_index):
        """Create the PyAudio handle and load the speech models.

        Args:
            device_index: PyAudio input device index used for recording.
        """
        self.device_index = device_index
        self.audio = pyaudio.PyAudio()

        # Faster-Whisper runs on the CPU.
        self.whisper_device = "cpu"
        print(f"Dispositivo do Faster-Whisper: {self.whisper_device}")

        # Piper: report "cuda" when onnxruntime exposes the CUDA provider.
        # NOTE(review): this value is only printed; it is not passed to
        # PiperVoice.load below -- confirm whether GPU synthesis was intended
        # (e.g. via a use_cuda argument).
        self.piper_device = "cuda" if "CUDAExecutionProvider" in ort.get_available_providers() else "cpu"
        print(f"Dispositivo esperado do Piper: {self.piper_device}")

        # Ollama (assumed to use the GPU if built with CUDA); indicative only.
        self.ollama_device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Dispositivo esperado do Ollama: {self.ollama_device}")

        # Load Faster-Whisper on the CPU.
        self.whisper_model = WhisperModel("tiny", device=self.whisper_device, compute_type="int8")

        # Load the Piper voice model and its JSON config.
        self.piper_voice = PiperVoice.load("voice.onnx", config_path="voice.onnx.json")

    def is_speech(self, data, threshold=SILENCE_THRESHOLD):
        """Return True when the chunk's mean absolute amplitude exceeds threshold.

        Args:
            data: Raw bytes of 16-bit samples, as returned by ``stream.read``.
            threshold: Mean absolute amplitude above which the chunk is speech.

        Returns:
            bool: True for speech, False for silence or an empty buffer.
        """
        samples = np.frombuffer(data, dtype=np.int16)
        if samples.size == 0:
            # mean() of an empty array is NaN (with a warning); treat as silence.
            return False
        # Widen before abs(): abs(-32768) overflows in int16 and stays
        # negative, which would drag the mean down on clipped/loud audio.
        level = np.abs(samples.astype(np.int32)).mean()
        return bool(level > threshold)

    def record_audio(self):
        """Wait for speech, record until silence or the time limit, save a WAV.

        Returns:
            bool: True when the recording was written to WAVE_OUTPUT_FILENAME,
            False when the input device could not be opened or saving failed.
        """
        mb.set_behavior("pulse_green")
        print("Aguardando você falar...")
        try:
            stream = self.audio.open(format=FORMAT, channels=CHANNELS,
                                     rate=RATE, input=True,
                                     frames_per_buffer=CHUNK,
                                     input_device_index=self.device_index)
        except OSError as e:
            print(f"Erro ao abrir dispositivo {self.device_index}: {e}")
            return False

        frames = []
        recording = False
        silence_start = None
        try:
            while True:
                data = stream.read(CHUNK, exception_on_overflow=False)
                if not recording and self.is_speech(data):
                    print("Detectei voz! Gravando...")
                    recording = True
                    frames.append(data)
                    silence_start = None
                elif recording:
                    frames.append(data)
                    if not self.is_speech(data):
                        if silence_start is None:
                            # First silent chunk: start timing the pause.
                            silence_start = time.time()
                        elif time.time() - silence_start > SILENCE_DURATION:
                            print("Silêncio detectado. Parando gravação...")
                            break
                    else:
                        # Speech resumed; discard the pending silence timer.
                        silence_start = None
                # Duration is derived from captured audio, not wall-clock time.
                if len(frames) * CHUNK / RATE > MAX_DURATION:
                    print("Tempo máximo atingido. Parando gravação...")
                    break
        finally:
            # Always release the input stream, even if read() raises.
            stream.stop_stream()
            stream.close()

        try:
            with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as wf:
                wf.setnchannels(CHANNELS)
                wf.setsampwidth(self.audio.get_sample_size(FORMAT))
                wf.setframerate(RATE)
                wf.writeframes(b''.join(frames))
            print("Gravação concluída!")
            return True
        except Exception as e:
            print(f"Erro ao salvar gravação: {e}")
            return False

    def speech_to_text(self):
        """Transcribe WAVE_OUTPUT_FILENAME with Faster-Whisper (Portuguese).

        Returns:
            str | None: The transcription with surrounding whitespace removed
            (possibly empty), or None if the file is missing or transcription
            fails.
        """
        print(f"Dispositivo do Faster-Whisper: {self.whisper_device}")
        if not os.path.exists(WAVE_OUTPUT_FILENAME):
            print(f"Arquivo de áudio {WAVE_OUTPUT_FILENAME} não encontrado.")
            return None
        try:
            start_time = time.time()
            segments, info = self.whisper_model.transcribe(
                WAVE_OUTPUT_FILENAME,
                language="pt",
                initial_prompt="Este é um áudio em português brasileiro."
            )
            # Strip so a whitespace-only result is falsy and is not sent on
            # to the model as a "question".
            text = " ".join(segment.text.strip() for segment in segments).strip()
            end_time = time.time()
            print(f"Tempo de transcrição: {end_time - start_time:.2f} segundos")
            return text
        except Exception as e:
            print(f"Erro na transcrição: {e}")
            return None

    def ask_ollama(self, question):
        """Send the question to ``get_response`` and return its answer.

        Despite the method name and log messages, the call goes through
        ``get_response`` imported from ``rag_openai``.

        Args:
            question: Transcribed user question.

        Returns:
            The value returned by ``get_response``, or None if it raised.
        """
        mb.set_behavior("thinking")
        print("Consultando Ollama (espera-se GPU)...")
        start_time = time.time()
        try:
            response = get_response(question)
            end_time = time.time()
            print(f"Tempo de resposta do Ollama: {end_time - start_time:.2f} segundos")
            return response
        except Exception as e:
            print(f"Erro ao consultar o modelo: {e}")
            return None

    def text_to_speech(self, text):
        """Synthesize text with Piper into response.wav and play it back.

        Errors during synthesis or playback are printed, not raised.

        Args:
            text: Text to speak.
        """
        mb.set_behavior("pulse_blue")
        print(f"Dispositivo do Piper: {self.piper_device}")
        audio_file = "response.wav"
        try:
            start_time = time.time()
            with wave.open(audio_file, "wb") as wav_file:
                self.piper_voice.synthesize(text, wav_file)
            end_time = time.time()
            print(f"Tempo de síntese Piper: {end_time - start_time:.2f} segundos")

            with wave.open(audio_file, 'rb') as wf:
                stream = self.audio.open(format=self.audio.get_format_from_width(wf.getsampwidth()),
                                         channels=wf.getnchannels(),
                                         rate=wf.getframerate(),
                                         output=True)
                try:
                    data = wf.readframes(CHUNK)
                    while data:
                        stream.write(data)
                        data = wf.readframes(CHUNK)
                finally:
                    # Release the output stream even if write() fails midway.
                    stream.stop_stream()
                    stream.close()
        except Exception as e:
            print(f"Erro na síntese ou reprodução de voz: {e}")

    def run(self):
        """Loop forever: record, transcribe, ask, speak, then clean up the WAV."""
        while True:
            if not self.record_audio():
                # Back off briefly so a persistently failing device does not
                # turn this loop into a busy spin of error messages.
                time.sleep(1)
                continue
            question = self.speech_to_text()
            if question:
                print(f"Pergunta reconhecida: {question}")
                answer = self.ask_ollama(question)
                if answer:
                    print(f"Resposta do: {answer}")
                    self.text_to_speech(answer)
            if os.path.exists(WAVE_OUTPUT_FILENAME):
                os.remove(WAVE_OUTPUT_FILENAME)
            print("Pronto para a próxima pergunta! (Ctrl+C para sair)")

    def cleanup(self):
        """Terminate the PyAudio instance created in __init__."""
        self.audio.terminate()
def parse_arguments(argv=None):
    """Parse the assistant's command-line options.

    Args:
        argv: Optional list of argument strings to parse. When None (the
            default) argparse reads the process arguments, which matches the
            previous zero-argument behavior.

    Returns:
        argparse.Namespace with ``device_index`` (int, default 5).
    """
    parser = argparse.ArgumentParser(description="Assistente de voz otimizado para Jetson Orin Nano.")
    parser.add_argument(
        "--device-index",
        type=int,
        default=5,
        help="Índice do dispositivo de áudio (padrão: 5)",
    )
    return parser.parse_args(argv)
if __name__ == "__main__":
    # Script entry point: start the led_matrix animation, build the assistant
    # for the requested input device and run its loop until interrupted.
    mb.start_animation()
    args = parse_arguments()
    assistant = VoiceAssistant(args.device_index)
    try:
        assistant.run()
    except KeyboardInterrupt:
        print("Programa encerrado pelo usuário.")
    finally:
        # Release PyAudio on every exit path, not only on Ctrl+C.
        assistant.cleanup()