diff --git a/TypeaheadAI.xcodeproj/project.pbxproj b/TypeaheadAI.xcodeproj/project.pbxproj
index 0e7d5a0..9622980 100644
--- a/TypeaheadAI.xcodeproj/project.pbxproj
+++ b/TypeaheadAI.xcodeproj/project.pbxproj
@@ -17,6 +17,8 @@
         2B473E872AA85B9A0042913D /* IncognitoModeView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2B473E862AA85B9A0042913D /* IncognitoModeView.swift */; };
         2B473E892AA85BDD0042913D /* LlamaModelManager.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2B473E882AA85BDD0042913D /* LlamaModelManager.swift */; };
         2B473E8C2AA860380042913D /* MenuBarExtraAccess in Frameworks */ = {isa = PBXBuildFile; productRef = 2B473E8B2AA860380042913D /* MenuBarExtraAccess */; };
+        2B5C2C832AB79EB800072D71 /* SpecialRecordActor.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2B5C2C822AB79EB800072D71 /* SpecialRecordActor.swift */; };
+        2B5C2C852AB7A0D100072D71 /* TranscriptionManager.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2B5C2C842AB7A0D100072D71 /* TranscriptionManager.swift */; };
         2B8B952B2A9C528B00FB9EA9 /* ScriptManager.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2B8B952A2A9C528B00FB9EA9 /* ScriptManager.swift */; };
         2B92BDB92AA3A2DD00E65CFA /* CustomModalWindow.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2B92BDB82AA3A2DD00E65CFA /* CustomModalWindow.swift */; };
         2B92BDBB2AA3D10800E65CFA /* ModalManager.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2B92BDBA2AA3D10800E65CFA /* ModalManager.swift */; };
@@ -81,6 +83,8 @@
         2B3FAC222AAAF44D00B2D405 /* LlamaWrapper.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaWrapper.swift; sourceTree = "<group>"; };
         2B473E862AA85B9A0042913D /* IncognitoModeView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = IncognitoModeView.swift; sourceTree = "<group>"; };
         2B473E882AA85BDD0042913D /* LlamaModelManager.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaModelManager.swift; sourceTree = "<group>"; };
+        2B5C2C822AB79EB800072D71 /* SpecialRecordActor.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SpecialRecordActor.swift; sourceTree = "<group>"; };
+        2B5C2C842AB7A0D100072D71 /* TranscriptionManager.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TranscriptionManager.swift; sourceTree = "<group>"; };
         2B8B952A2A9C528B00FB9EA9 /* ScriptManager.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ScriptManager.swift; sourceTree = "<group>"; };
         2B92BDB82AA3A2DD00E65CFA /* CustomModalWindow.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CustomModalWindow.swift; sourceTree = "<group>"; };
         2B92BDBA2AA3D10800E65CFA /* ModalManager.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ModalManager.swift; sourceTree = "<group>"; };
@@ -155,6 +159,7 @@
                 2B2745092AB01CF400F37D3E /* SpecialSaveActor.swift */,
                 2B27450F2AB03A3D00F37D3E /* CanSimulateCopy.swift */,
                 2BF929862AB16C4C00FC105B /* ResponseParsingTask.swift */,
+                2B5C2C822AB79EB800072D71 /* SpecialRecordActor.swift */,
             );
             path = Actors;
             sourceTree = "<group>";
@@ -224,6 +229,7 @@
                 2B473E882AA85BDD0042913D /* LlamaModelManager.swift */,
                 2B27450D2AB0380C00F37D3E /* AppContextManager.swift */,
                 2BF929812AB13F7900FC105B /* MarkdownAttributedStringParser.swift */,
+                2B5C2C842AB7A0D100072D71 /* TranscriptionManager.swift */,
             );
             path = TypeaheadAI;
             sourceTree = "<group>";
@@ -455,10 +461,12 @@
                 2BCF843A2A9DE6DA00359841 /* GeneralSettingsView.swift in Sources */,
                 2B33D87D2AAC3330001193A2 /* ProfileView.swift in Sources */,
                 2BA7F0B52A9ABCD7003D38BA /* PromptManager.swift in Sources */,
+                2B5C2C852AB7A0D100072D71 /* TranscriptionManager.swift in Sources */,
                 2BF929872AB16C4C00FC105B /* ResponseParsingTask.swift in Sources */,
                 2B3FAC232AAAF44D00B2D405 /* LlamaWrapper.swift in Sources */,
                 2B3FAC212AAAF22500B2D405 /* LlamaWrapper.cpp in Sources */,
                 2BA3C2372AADAD9A00537F95 /* SpecialCopyActor.swift in Sources */,
+                2B5C2C832AB79EB800072D71 /* SpecialRecordActor.swift in Sources */,
                 2B27450E2AB0380C00F37D3E /* AppContextManager.swift in Sources */,
                 2B27450A2AB01CF400F37D3E /* SpecialSaveActor.swift in Sources */,
                 2BA7F0792A9ABBA8003D38BA /* TypeaheadAIApp.swift in Sources */,
@@ -653,6 +661,8 @@
                 INFOPLIST_KEY_LSUIElement = YES;
                 INFOPLIST_KEY_NSAppleEventsUsageDescription = "We need to control Google Chrome to get the URL of the active tab.";
                 INFOPLIST_KEY_NSHumanReadableCopyright = "";
+                INFOPLIST_KEY_NSMicrophoneUsageDescription = "We need access to the microphone to allow for audio interface";
+                INFOPLIST_KEY_NSSpeechRecognitionUsageDescription = "We need access to the speech recognition toolkit so that the user can interface with the LLM through voice input";
                 LD_RUNPATH_SEARCH_PATHS = (
                     "$(inherited)",
                     "@executable_path/../Frameworks",
@@ -693,6 +703,8 @@
                 INFOPLIST_KEY_LSUIElement = YES;
                 INFOPLIST_KEY_NSAppleEventsUsageDescription = "We need to control Google Chrome to get the URL of the active tab.";
                 INFOPLIST_KEY_NSHumanReadableCopyright = "";
+                INFOPLIST_KEY_NSMicrophoneUsageDescription = "We need access to the microphone to allow for audio interface";
+                INFOPLIST_KEY_NSSpeechRecognitionUsageDescription = "We need access to the speech recognition toolkit so that the user can interface with the LLM through voice input";
                 LD_RUNPATH_SEARCH_PATHS = (
                     "$(inherited)",
                     "@executable_path/../Frameworks",
diff --git a/TypeaheadAI.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved b/TypeaheadAI.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved
index 19ecd80..314c9fe 100644
--- a/TypeaheadAI.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved
+++ b/TypeaheadAI.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved
@@ -1,5 +1,14 @@
 {
   "pins" : [
+    {
+      "identity" : "dswaveformimage",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/dmrschmidt/DSWaveformImage",
+      "state" : {
+        "revision" : "e670b8da140c39036f171b9754ed25126b8d414b",
+        "version" : "13.0.2"
+      }
+    },
     {
       "identity" : "highlighterswift",
       "kind" : "remoteSourceControl",
diff --git a/TypeaheadAI.xcodeproj/xcuserdata/jeffhara.xcuserdatad/xcschemes/xcschememanagement.plist b/TypeaheadAI.xcodeproj/xcuserdata/jeffhara.xcuserdatad/xcschemes/xcschememanagement.plist
index d5ea02b..009bde8 100644
--- a/TypeaheadAI.xcodeproj/xcuserdata/jeffhara.xcuserdatad/xcschemes/xcschememanagement.plist
+++ b/TypeaheadAI.xcodeproj/xcuserdata/jeffhara.xcuserdatad/xcschemes/xcschememanagement.plist
@@ -7,7 +7,7 @@
         <key>TypeaheadAI.xcscheme_^#shared#^_</key>
         <dict>
             <key>orderHint</key>
-            <integer>0</integer>
+            <integer>1</integer>
         </dict>
     </dict>
 </dict>
diff --git a/TypeaheadAI/Actors/SpecialCutActor.swift b/TypeaheadAI/Actors/SpecialCutActor.swift
index 3b591cb..874700c 100644
--- a/TypeaheadAI/Actors/SpecialCutActor.swift
+++ b/TypeaheadAI/Actors/SpecialCutActor.swift
@@ -56,6 +56,7 @@ class ClipboardMonitor {
     }
 
     func stopMonitoring() {
+        logger.debug("stop monitoring")
         timer?.invalidate()
         timer = nil
     }
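Note: the two INFOPLIST_KEY usage strings added above are the copy macOS displays the first time the app asks for speech-recognition and microphone access. The patch itself requests speech authorization inside TranscriptionManager (further down); the sketch here is only an illustration, with a hypothetical helper name, of how both prompts are typically triggered together.

```swift
// Illustrative sketch, not part of this patch. `requestVoicePermissions` is a
// hypothetical helper; SFSpeechRecognizer.requestAuthorization and
// AVCaptureDevice.requestAccess are the standard calls that surface the
// usage strings added above.
import AVFoundation
import Speech

func requestVoicePermissions(completion: @escaping (Bool) -> Void) {
    // Shows the NSSpeechRecognitionUsageDescription prompt.
    SFSpeechRecognizer.requestAuthorization { status in
        guard status == .authorized else { return completion(false) }
        // Shows the NSMicrophoneUsageDescription prompt.
        AVCaptureDevice.requestAccess(for: .audio) { granted in
            completion(granted)
        }
    }
}
```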
diff --git a/TypeaheadAI/Actors/SpecialRecordActor.swift b/TypeaheadAI/Actors/SpecialRecordActor.swift
new file mode 100644
index 0000000..74662ff
--- /dev/null
+++ b/TypeaheadAI/Actors/SpecialRecordActor.swift
@@ -0,0 +1,27 @@
+//
+//  SpecialRecordActor.swift
+//  TypeaheadAI
+//
+//  Created by Jeff Hara on 9/17/23.
+//
+
+import Foundation
+import AVFoundation
+import Speech
+import os.log
+
+actor SpecialRecordActor {
+    private let audioEngine = AVAudioEngine()
+    private var speechRecognizer: SFSpeechRecognizer?
+    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
+    private var recognitionTask: SFSpeechRecognitionTask?
+
+    private let logger = Logger(
+        subsystem: "ai.typeahead.TypeaheadAI",
+        category: "SpecialRecordActor"
+    )
+
+    func specialRecord() {
+
+    }
+}
diff --git a/TypeaheadAI/ModalManager.swift b/TypeaheadAI/ModalManager.swift
index babdf3c..b4aa986 100644
--- a/TypeaheadAI/ModalManager.swift
+++ b/TypeaheadAI/ModalManager.swift
@@ -56,6 +56,7 @@ class ModalManager: ObservableObject {
 
     // TODO: Inject?
     var clientManager: ClientManager? = nil
+    var transcriptionManager: TranscriptionManager? = TranscriptionManager()
 
     var toastWindow: CustomModalWindow?
 
@@ -377,6 +378,15 @@
         }
     }
 
+    @MainActor
+    func cancelRecordingTask() {
+        transcriptionManager?.stopRecording()
+    }
+
+    func startRecording(completion: @escaping (String) -> Void) {
+        transcriptionManager?.startRecording(completion: completion)
+    }
+
     @objc func windowDidMove(_ notification: Notification) {
         if let movedWindow = notification.object as? NSWindow {
             let origin = movedWindow.frame.origin
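Note: ModalManager now exposes a start/stop pair that simply forwards to TranscriptionManager. A minimal sketch of a call site follows (hypothetical; the real caller is the record button added to ModalView further down). The completion is invoked repeatedly as the transcript is refined, not once at the end.

```swift
// Hypothetical call site for the new ModalManager API; assumes it runs on the
// main actor, as the SwiftUI button action in ModalView does.
@MainActor
func toggleRecording(using modalManager: ModalManager, isRecording: inout Bool) {
    if isRecording {
        modalManager.cancelRecordingTask()
    } else {
        modalManager.startRecording { transcript in
            // Called for every partial recognition result.
            print("transcript so far: \(transcript)")
        }
    }
    isRecording.toggle()
}
```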
diff --git a/TypeaheadAI/TranscriptionManager.swift b/TypeaheadAI/TranscriptionManager.swift
new file mode 100644
index 0000000..3fb269d
--- /dev/null
+++ b/TypeaheadAI/TranscriptionManager.swift
@@ -0,0 +1,140 @@
+//
+//  TranscriptionManager.swift
+//  TypeaheadAI
+//
+//  Created by Jeff Hara on 9/17/23.
+//
+
+import Foundation
+import AVFoundation
+import Speech
+import os.log
+
+enum TranscriptionManagerError: Error {
+    case notAuthorized
+    case illegalState
+
+    var localizedDescription: String {
+        switch self {
+        case .notAuthorized:
+            return "The user has not authorized speech recognition"
+        case .illegalState:
+            return "Speech recognizer is not available"
+        }
+    }
+}
+
+class TranscriptionManager {
+    private let logger = Logger(
+        subsystem: "ai.typeahead.TypeaheadAI",
+        category: "TranscriptionManager"
+    )
+
+    private var audioEngine: AVAudioEngine?
+    private var speechRecognizer: SFSpeechRecognizer?
+    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
+    private var recognitionTask: SFSpeechRecognitionTask?
+
+    func startRecording(completion: @escaping (String) -> Void) {
+        self.recognitionTask?.cancel()
+        self.recognitionTask = nil
+
+        requestSpeechAuthorization() { [weak self] result in
+            switch result {
+            case .success():
+                do {
+                    try self?.setupRecording(completion: completion)
+                } catch {
+                    self?.logger.error("\(error.localizedDescription)")
+                }
+            case .failure(let error):
+                self?.logger.error("Authorization failed with error: \(error.localizedDescription)")
+            }
+        }
+    }
+
+    private func setupRecording(completion: @escaping (String) -> Void) throws {
+        recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
+
+        audioEngine = AVAudioEngine()
+        speechRecognizer = SFSpeechRecognizer()
+
+        guard let inputNode = audioEngine?.inputNode else {
+            self.logger.error("AudioEngine is not initialized")
+            throw TranscriptionManagerError.illegalState
+        }
+
+        inputNode.reset()
+        inputNode.removeTap(onBus: 0)
+
+        // Get the system recording format
+        let hardwareFormat = inputNode.inputFormat(forBus: 0)
+        let recordingFormat = AVAudioFormat(commonFormat: .pcmFormatFloat32, sampleRate: hardwareFormat.sampleRate, channels: 1, interleaved: false)
+
+        inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { [weak self] (buffer, _) in
+            self?.recognitionRequest?.append(buffer)
+        }
+
+        audioEngine?.prepare()
+        do {
+            try audioEngine?.start()
+        } catch {
+            logger.error("Audio engine failed to start: \(error.localizedDescription)")
+            throw TranscriptionManagerError.illegalState
+        }
+
+        DispatchQueue.main.async {
+            self.recognitionTask = self.speechRecognizer?.recognitionTask(with: self.recognitionRequest!) { [weak self] (result, error) in
+                guard let self = self else { return }
+
+                if let error = error as NSError?, error.domain == "kAFAssistantErrorDomain" && error.code == 216 {
+                    self.logger.info("Recognition task was cancelled")
+                } else if let error = error {
+                    self.logger.error("Recognition task failed with error: \(error.localizedDescription)")
+                } else if let transcription = result?.bestTranscription {
+                    self.logger.info("Recognized text: \(transcription.formattedString)")
+                    completion(transcription.formattedString)
+                } else {
+                    self.logger.info("No recognition result available")
+                }
+            }
+        }
+
+        logger.info("Successfully started recording")
+    }
+
+    /// When using Bluetooth headphones, the output audio quality drops if the microphone is enabled.
+    /// That can't be helped, but make sure to test with BT headphones that when the recording stops
+    /// the audio quality returns to normal. Could be a sign that something wasn't cleaned up properly.
+    func stopRecording() {
+        recognitionTask?.cancel()
+        recognitionTask = nil
+        audioEngine?.stop()
+        audioEngine?.inputNode.reset()
+        audioEngine?.inputNode.removeTap(onBus: 0)
+        audioEngine = nil
+        speechRecognizer = nil
+        recognitionRequest?.endAudio()
+        recognitionRequest = nil
+    }
+
+    private func requestSpeechAuthorization(completion: @escaping (Result<Void, TranscriptionManagerError>) -> Void) {
+        SFSpeechRecognizer.requestAuthorization { [weak self] authStatus in
+            OperationQueue.main.addOperation {
+                switch authStatus {
+                case .authorized:
+                    self?.speechRecognizer = SFSpeechRecognizer()
+                    if self?.speechRecognizer?.isAvailable ?? false {
+                        completion(.success(()))
+                    } else {
+                        self?.logger.error("\(TranscriptionManagerError.illegalState.localizedDescription)")
+                        completion(.failure(.illegalState))
+                    }
+                default:
+                    self?.logger.error("\(TranscriptionManagerError.notAuthorized.localizedDescription)")
+                    completion(.failure(.notAuthorized))
+                }
+            }
+        }
+    }
+}
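Note: because the recognition task above reports partial results, the completion handler fires with a growing `bestTranscription` for every hypothesis. If only a single final string were wanted, the request could opt out of partial results and gate on `isFinal`; a sketch of that variant (not what this patch does):

```swift
// Variant sketch (not in this patch): deliver one final transcript instead of
// a stream of partial ones. Uses the same properties as setupRecording above.
recognitionRequest?.shouldReportPartialResults = false

recognitionTask = speechRecognizer?.recognitionTask(with: recognitionRequest!) { result, error in
    if let result = result, result.isFinal {
        completion(result.bestTranscription.formattedString)
    }
}
```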
diff --git a/TypeaheadAI/TypeaheadAI.entitlements b/TypeaheadAI/TypeaheadAI.entitlements
index 1c8756a..dcb4a49 100644
--- a/TypeaheadAI/TypeaheadAI.entitlements
+++ b/TypeaheadAI/TypeaheadAI.entitlements
@@ -10,6 +10,8 @@
     <key>com.apple.security.automation.apple-events</key>
     <true/>
+    <key>com.apple.security.device.audio-input</key>
+    <true/>
     <key>com.apple.security.files.user-selected.read-write</key>
     <true/>
     <key>com.apple.security.network.client</key>
     <true/>
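Note: com.apple.security.device.audio-input is what lets the sandboxed app capture audio at all; the user can still decline the microphone prompt. A small sketch (hypothetical helper, not part of this patch) for checking the current microphone authorization before enabling the record button:

```swift
// Hypothetical helper: report whether microphone capture is currently allowed.
import AVFoundation

func microphoneAccessGranted() -> Bool {
    return AVCaptureDevice.authorizationStatus(for: .audio) == .authorized
}
```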
diff --git a/TypeaheadAI/Views/Modal/ModalView.swift b/TypeaheadAI/Views/Modal/ModalView.swift
index a6a030c..1d2245a 100644
--- a/TypeaheadAI/Views/Modal/ModalView.swift
+++ b/TypeaheadAI/Views/Modal/ModalView.swift
@@ -7,6 +7,7 @@
 
 import SwiftUI
 import Markdown
+import AudioToolbox
 
 struct MessageView: View {
     let message: Message
@@ -86,6 +87,7 @@ struct ModalView: View {
     @State private var text: String = ""
     @FocusState private var isTextFieldFocused: Bool
     @State private var isReplyLocked: Bool = false
+    @State private var isRecording: Bool = false
 
     @Namespace var bottomID
 
@@ -108,33 +110,66 @@ struct ModalView: View {
             }
             .frame(maxWidth: .infinity, maxHeight: .infinity)
 
-            TextField(modalManager.onboardingMode ? "Replies are turned off right now." : "Ask a follow-up question...", text: $text, axis: .vertical)
-                .textFieldStyle(.plain)
-                .lineLimit(8)
-                .focused($isTextFieldFocused)
-                .padding(.vertical, 5)
-                .padding(.horizontal, 10)
-                .background(RoundedRectangle(cornerRadius: 15)
-                    .fill(.secondary.opacity(0.1))
-                )
-                .onSubmit {
-                    if !text.isEmpty {
-                        modalManager.addUserMessage(text, incognito: incognito)
-                        text = ""
+            ZStack {
+                TextField(modalManager.onboardingMode ? "Replies are turned off right now." : "Ask a follow-up question...", text: $text, axis: .vertical)
+                    .textFieldStyle(.plain)
+                    .lineLimit(8)
+                    .focused($isTextFieldFocused)
+                    .padding(.vertical, 5)
+                    .padding(.horizontal, 10)
+                    .background(RoundedRectangle(cornerRadius: 15)
+                        .fill(.secondary.opacity(0.1))
+                    )
+                    .onSubmit {
+                        if !text.isEmpty {
+                            modalManager.addUserMessage(text, incognito: incognito)
+                            text = ""
+                        }
                     }
-                }
-                .onChange(of: modalManager.triggerFocus) { newValue in
-                    if newValue {
+                    .onChange(of: modalManager.triggerFocus) { newValue in
+                        if newValue {
+                            isTextFieldFocused = true
+                            modalManager.triggerFocus = false
+                        }
+                    }
+                    .padding(.horizontal, 10)
+                    .padding(.vertical, 15)
+                    .onAppear {
                         isTextFieldFocused = true
-                        modalManager.triggerFocus = false
                     }
+
+                HStack {
+                    Spacer()
+                    Button(action: {
+                        if isRecording {
+                            // end_record.caf
+                            AudioServicesPlaySystemSound(SystemSoundID(1114))
+                            self.modalManager.cancelRecordingTask()
+                        } else {
+                            // begin_record.caf
+                            AudioServicesPlaySystemSound(SystemSoundID(1113))
+                            self.modalManager.startRecording { result in
+                                print(result)
+                                text = result
+                            }
+                        }
+                        isRecording.toggle()
+                    }, label: {
+                        if !isRecording {
+                            Image(systemName: "waveform")
+                                .imageScale(.large)
+                                .foregroundColor(.secondary)
+                        } else {
+                            Image(systemName: "stop.circle")
+                                .imageScale(.large)
+                                .foregroundColor(.red)
+                        }
+                    })
+                    .buttonStyle(.borderless)
                 }
-                .padding(.horizontal, 10)
-                .padding(.vertical, 15)
-                .onAppear {
-                    isTextFieldFocused = true
-                }
-                .disabled(modalManager.onboardingMode)
+                .padding(.horizontal, 15)
+            }
+            .disabled(modalManager.onboardingMode)
         }
         .font(.system(size: fontSize))
         .foregroundColor(Color.primary)
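Note: in the record button above, `isRecording` only flips when the button is pressed, and each partial transcript overwrites `text`. If the toggle were reused elsewhere, it could be factored into its own view; a sketch of that refactor (hypothetical, not part of this patch):

```swift
// Hypothetical extraction of the record toggle added in ModalView.swift,
// keeping the start/stop system sounds and the icon swap in one place.
import SwiftUI
import AudioToolbox

struct RecordToggleButton: View {
    @Binding var isRecording: Bool
    let onStart: () -> Void
    let onStop: () -> Void

    var body: some View {
        Button(action: {
            if isRecording {
                AudioServicesPlaySystemSound(SystemSoundID(1114))  // end_record.caf
                onStop()
            } else {
                AudioServicesPlaySystemSound(SystemSoundID(1113))  // begin_record.caf
                onStart()
            }
            isRecording.toggle()
        }, label: {
            Image(systemName: isRecording ? "stop.circle" : "waveform")
                .imageScale(.large)
                .foregroundColor(isRecording ? .red : .secondary)
        })
        .buttonStyle(.borderless)
    }
}
```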