
Commit 0f4227d

examples : add whisper.swiftui demo app (ggml-org#308)

* Add SwiftUI demo project.
* Add -DGGML_USE_ACCELERATE

1 parent 4c1fe0c commit 0f4227d

File tree

18 files changed: +1023 −0 lines changed


examples/whisper.swiftui/README.md

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
A sample SwiftUI app using [whisper.cpp](https://github.com/ggerganov/whisper.cpp/) to do voice-to-text transcriptions.
See also: [whisper.objc](https://github.com/ggerganov/whisper.cpp/tree/master/examples/whisper.objc).

To use:

1. Select a model from the [whisper.cpp repository](https://github.com/ggerganov/whisper.cpp/tree/master/models).[^1]
2. Add the model to "whisper.swiftui.demo/Resources/models" via Xcode.
3. Select a sample audio file (for example, [jfk.wav](https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav)).
4. Add the sample to "whisper.swiftui.demo/Resources/samples" via Xcode.
5. Select the "Release" build configuration under "Run", then deploy and run on your device.

[^1]: I recommend the tiny, base, or small models for running on an iOS device.
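
Note that the demo looks these resources up by exact name: `WhisperState` (shown later in this diff) resolves both files from the app bundle, so use the file names below or adjust the code accordingly.

```swift
// From WhisperState later in this commit: the model and sample are
// resolved by name from the bundle's "models" and "samples" subdirectories.
let modelUrl = Bundle.main.url(forResource: "ggml-tiny.en", withExtension: "bin", subdirectory: "models")
let sampleUrl = Bundle.main.url(forResource: "jfk", withExtension: "wav", subdirectory: "samples")
```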
examples/whisper.swiftui/whisper.cpp.swift/LibWhisper.swift

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
import Foundation

enum WhisperError: Error {
    case couldNotInitializeContext
}

// Meet Whisper C++ constraint: Don't access from more than one thread at a time.
actor WhisperContext {
    private var context: OpaquePointer

    init(context: OpaquePointer) {
        self.context = context
    }

    deinit {
        whisper_free(context)
    }

    func fullTranscribe(samples: [Float]) {
        // Leave 2 processors free (i.e. the high-efficiency cores).
        let maxThreads = max(1, min(8, cpuCount() - 2))
        print("Selecting \(maxThreads) threads")
        var params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY)
        "en".withCString { en in
            // Adapted from whisper.objc
            params.print_realtime = true
            params.print_progress = false
            params.print_timestamps = true
            params.print_special = false
            params.translate = false
            params.language = en
            params.n_threads = Int32(maxThreads)
            params.offset_ms = 0
            params.no_context = true
            params.single_segment = false

            whisper_reset_timings(context)
            print("About to run whisper_full")
            samples.withUnsafeBufferPointer { samples in
                if (whisper_full(context, params, samples.baseAddress, Int32(samples.count)) != 0) {
                    print("Failed to run the model")
                } else {
                    whisper_print_timings(context)
                }
            }
        }
    }

    func getTranscription() -> String {
        var transcription = ""
        for i in 0..<whisper_full_n_segments(context) {
            transcription += String.init(cString: whisper_full_get_segment_text(context, i))
        }
        return transcription
    }

    static func createContext(path: String) throws -> WhisperContext {
        let context = whisper_init(path)
        if let context {
            return WhisperContext(context: context)
        } else {
            print("Couldn't load model at \(path)")
            throw WhisperError.couldNotInitializeContext
        }
    }
}

fileprivate func cpuCount() -> Int {
    ProcessInfo.processInfo.processorCount
}
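
For reference, a minimal sketch of driving this actor from an async context, using only the names defined above. The model path is a placeholder; the demo resolves the real one from its bundle.

```swift
// Hypothetical driver for the WhisperContext actor above.
// "models/ggml-tiny.en.bin" is a placeholder path, not part of this commit.
func transcribe(samples: [Float]) async throws -> String {
    let context = try WhisperContext.createContext(path: "models/ggml-tiny.en.bin")
    // Actor isolation serializes these calls, honoring whisper.cpp's
    // one-thread-at-a-time constraint noted above.
    await context.fullTranscribe(samples: samples)
    return await context.getTranscription()
}
```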
examples/whisper.swiftui/whisper.cpp.swift/WhisperCppDemo-Bridging-Header.h

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
//
// Use this file to import your target's public headers that you would like to expose to Swift.
//
#import "whisper.h"
examples/whisper.swiftui/whisper.swiftui.demo/Models/WhisperState.swift

Lines changed: 162 additions & 0 deletions
@@ -0,0 +1,162 @@
import Foundation
import SwiftUI
import AVFoundation

@MainActor
class WhisperState: NSObject, ObservableObject, AVAudioRecorderDelegate {
    @Published var isModelLoaded = false
    @Published var messageLog = ""
    @Published var canTranscribe = false
    @Published var isRecording = false

    private var whisperContext: WhisperContext?
    private let recorder = Recorder()
    private var recordedFile: URL? = nil
    private var audioPlayer: AVAudioPlayer?

    private var modelUrl: URL? {
        Bundle.main.url(forResource: "ggml-tiny.en", withExtension: "bin", subdirectory: "models")
    }

    private var sampleUrl: URL? {
        Bundle.main.url(forResource: "jfk", withExtension: "wav", subdirectory: "samples")
    }

    private enum LoadError: Error {
        case couldNotLocateModel
    }

    override init() {
        super.init()
        do {
            try loadModel()
            canTranscribe = true
        } catch {
            print(error.localizedDescription)
            messageLog += "\(error.localizedDescription)\n"
        }
    }

    private func loadModel() throws {
        messageLog += "Loading model...\n"
        if let modelUrl {
            whisperContext = try WhisperContext.createContext(path: modelUrl.path())
            messageLog += "Loaded model \(modelUrl.lastPathComponent)\n"
        } else {
            messageLog += "Could not locate model\n"
        }
    }

    func transcribeSample() async {
        if let sampleUrl {
            await transcribeAudio(sampleUrl)
        } else {
            messageLog += "Could not locate sample\n"
        }
    }

    private func transcribeAudio(_ url: URL) async {
        if (!canTranscribe) {
            return
        }
        guard let whisperContext else {
            return
        }

        do {
            canTranscribe = false
            messageLog += "Reading wave samples...\n"
            let data = try readAudioSamples(url)
            messageLog += "Transcribing data...\n"
            await whisperContext.fullTranscribe(samples: data)
            let text = await whisperContext.getTranscription()
            messageLog += "Done: \(text)\n"
        } catch {
            print(error.localizedDescription)
            messageLog += "\(error.localizedDescription)\n"
        }

        canTranscribe = true
    }

    private func readAudioSamples(_ url: URL) throws -> [Float] {
        stopPlayback()
        try startPlayback(url)
        return try decodeWaveFile(url)
    }

    func toggleRecord() async {
        if isRecording {
            await recorder.stopRecording()
            isRecording = false
            if let recordedFile {
                await transcribeAudio(recordedFile)
            }
        } else {
            requestRecordPermission { granted in
                if granted {
                    Task {
                        do {
                            self.stopPlayback()
                            let file = try FileManager.default.url(for: .documentDirectory, in: .userDomainMask, appropriateFor: nil, create: true)
                                .appending(path: "output.wav")
                            try await self.recorder.startRecording(toOutputFile: file, delegate: self)
                            self.isRecording = true
                            self.recordedFile = file
                        } catch {
                            print(error.localizedDescription)
                            self.messageLog += "\(error.localizedDescription)\n"
                            self.isRecording = false
                        }
                    }
                }
            }
        }
    }

    private func requestRecordPermission(response: @escaping (Bool) -> Void) {
#if os(macOS)
        response(true)
#else
        AVAudioSession.sharedInstance().requestRecordPermission { granted in
            response(granted)
        }
#endif
    }

    private func startPlayback(_ url: URL) throws {
        audioPlayer = try AVAudioPlayer(contentsOf: url)
        audioPlayer?.play()
    }

    private func stopPlayback() {
        audioPlayer?.stop()
        audioPlayer = nil
    }

    // MARK: AVAudioRecorderDelegate

    nonisolated func audioRecorderEncodeErrorDidOccur(_ recorder: AVAudioRecorder, error: Error?) {
        if let error {
            Task {
                await handleRecError(error)
            }
        }
    }

    private func handleRecError(_ error: Error) {
        print(error.localizedDescription)
        messageLog += "\(error.localizedDescription)\n"
        isRecording = false
    }

    nonisolated func audioRecorderDidFinishRecording(_ recorder: AVAudioRecorder, successfully flag: Bool) {
        Task {
            await onDidFinishRecording()
        }
    }

    private func onDidFinishRecording() {
        isRecording = false
    }
}
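
`Recorder` (a small wrapper around AVAudioRecorder) and `decodeWaveFile` are defined in files of this commit that are not reproduced above. A minimal sketch of the decoder, assuming the 16-bit little-endian mono PCM WAV that whisper.cpp expects, with a plain 44-byte header; a robust version would parse the RIFF chunks instead of hard-coding the offset:

```swift
import Foundation

// Sketch of a decodeWaveFile helper compatible with the call above.
// Assumes 16-bit little-endian mono PCM preceded by a 44-byte RIFF header.
func decodeWaveFile(_ url: URL) throws -> [Float] {
    let data = try Data(contentsOf: url)
    return stride(from: 44, to: data.count - 1, by: 2).map { offset in
        // Assemble each 16-bit sample and scale it into [-1, 1].
        let sample = Int16(bitPattern: UInt16(data[offset]) | (UInt16(data[offset + 1]) << 8))
        return max(-1.0, min(Float(sample) / 32767.0, 1.0))
    }
}
```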
examples/whisper.swiftui/whisper.swiftui.demo/Supporting files/Assets.xcassets/AccentColor.colorset/Contents.json

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
{
  "colors" : [
    {
      "idiom" : "universal"
    }
  ],
  "info" : {
    "author" : "xcode",
    "version" : 1
  }
}
examples/whisper.swiftui/whisper.swiftui.demo/Supporting files/Assets.xcassets/AppIcon.appiconset/Contents.json

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
{
  "images" : [
    {
      "idiom" : "universal",
      "platform" : "ios",
      "size" : "1024x1024"
    },
    {
      "idiom" : "mac",
      "scale" : "1x",
      "size" : "16x16"
    },
    {
      "idiom" : "mac",
      "scale" : "2x",
      "size" : "16x16"
    },
    {
      "idiom" : "mac",
      "scale" : "1x",
      "size" : "32x32"
    },
    {
      "idiom" : "mac",
      "scale" : "2x",
      "size" : "32x32"
    },
    {
      "idiom" : "mac",
      "scale" : "1x",
      "size" : "128x128"
    },
    {
      "idiom" : "mac",
      "scale" : "2x",
      "size" : "128x128"
    },
    {
      "idiom" : "mac",
      "scale" : "1x",
      "size" : "256x256"
    },
    {
      "idiom" : "mac",
      "scale" : "2x",
      "size" : "256x256"
    },
    {
      "idiom" : "mac",
      "scale" : "1x",
      "size" : "512x512"
    },
    {
      "idiom" : "mac",
      "scale" : "2x",
      "size" : "512x512"
    }
  ],
  "info" : {
    "author" : "xcode",
    "version" : 1
  }
}
examples/whisper.swiftui/whisper.swiftui.demo/Supporting files/Assets.xcassets/Contents.json

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
{
  "info" : {
    "author" : "xcode",
    "version" : 1
  }
}
examples/whisper.swiftui/whisper.swiftui.demo/Supporting files/Preview Content/Preview Assets.xcassets/Contents.json

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
{
  "info" : {
    "author" : "xcode",
    "version" : 1
  }
}
examples/whisper.swiftui/whisper.swiftui.demo/Supporting files/WhisperCppDemo.entitlements

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
	<key>com.apple.security.app-sandbox</key>
	<true/>
	<key>com.apple.security.device.audio-input</key>
	<true/>
	<key>com.apple.security.files.user-selected.read-only</key>
	<true/>
</dict>
</plist>
examples/whisper.swiftui/whisper.swiftui.demo/UI/ContentView.swift

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
import SwiftUI
import AVFoundation

struct ContentView: View {
    @StateObject var whisperState = WhisperState()

    var body: some View {
        NavigationStack {
            VStack {
                HStack {
                    Button("Transcribe", action: {
                        Task {
                            await whisperState.transcribeSample()
                        }
                    })
                    .buttonStyle(.bordered)
                    .disabled(!whisperState.canTranscribe)

                    Button(whisperState.isRecording ? "Stop recording" : "Start recording", action: {
                        Task {
                            await whisperState.toggleRecord()
                        }
                    })
                    .buttonStyle(.bordered)
                    .disabled(!whisperState.canTranscribe)
                }

                ScrollView {
                    Text(verbatim: whisperState.messageLog)
                        .frame(maxWidth: .infinity, alignment: .leading)
                }
            }
            .navigationTitle("Whisper SwiftUI Demo")
            .padding()
        }
    }
}

struct ContentView_Previews: PreviewProvider {
    static var previews: some View {
        ContentView()
    }
}
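
The remaining files in the commit (the Xcode project, the `Recorder` helper, and the app entry point) are not reproduced above. A minimal entry point consistent with this ContentView would look like the following; the struct name is assumed:

```swift
import SwiftUI

// Assumed entry point; the actual file in the commit is not shown above.
@main
struct WhisperCppDemoApp: App {
    var body: some Scene {
        WindowGroup {
            ContentView()
        }
    }
}
```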
