Skip to content

Commit 00d3e72

Browse files
committed
feat(asr): Add Cohere Transcribe INT8 model support
Add HuggingFace integration for Cohere Transcribe CoreML models with INT8 quantization support.

Changes:
- Add CohereTranscribe model names enum with encoder, decoder, and vocab
- Add Cohere repository definitions (FP16 and INT8 variants)
- Update CohereAsrModels to use the stateful decoder from HuggingFace
- Support automatic download from FluidInference/cohere-transcribe-03-2026-coreml

Model details:
- 35-second window architecture (3500 frames → 438 encoder outputs)
- INT8 W8A16 quantization (~2.0 GB vs ~4.2 GB FP16)
- 14-language support with token primer system
- Quality: 16.44% WER on LibriSpeech test-clean (INT8)
1 parent 32f8f17 commit 00d3e72

2 files changed

Lines changed: 45 additions & 3 deletions

File tree

Sources/FluidAudio/ASR/Cohere/CohereAsrModels.swift

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,14 +53,14 @@ public struct CohereAsrModels: Sendable {
5353

5454
// Load encoder
5555
let encoder = try await loadModel(
56-
named: "cohere_encoder",
56+
named: ModelNames.CohereTranscribe.encoder,
5757
from: directory,
5858
configuration: modelConfig
5959
)
6060

61-
// Load decoder
61+
// Load decoder (stateful - uses CoreML state API)
6262
let decoder = try await loadModel(
63-
named: "cohere_decoder_cached",
63+
named: ModelNames.CohereTranscribe.decoderStateful,
6464
from: directory,
6565
configuration: modelConfig
6666
)

Sources/FluidAudio/ModelNames.swift

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ public enum Repo: String, CaseIterable {
2424
case qwen3AsrInt8 = "FluidInference/qwen3-asr-0.6b-coreml/int8"
2525
case multilingualG2p = "FluidInference/charsiu-g2p-byt5-coreml"
2626
case parakeetTdtCtc110m = "FluidInference/parakeet-tdt-ctc-110m-coreml"
27+
case cohereTranscribeCoreml = "FluidInference/cohere-transcribe-03-2026-coreml/f16"
28+
case cohereTranscribeCoremlInt8 = "FluidInference/cohere-transcribe-03-2026-coreml/q8"
2729

2830
/// Repository slug (without owner)
2931
public var name: String {
@@ -72,6 +74,10 @@ public enum Repo: String, CaseIterable {
7274
return "charsiu-g2p-byt5-coreml"
7375
case .parakeetTdtCtc110m:
7476
return "parakeet-tdt-ctc-110m-coreml"
77+
case .cohereTranscribeCoreml:
78+
return "cohere-transcribe-03-2026-coreml/f16"
79+
case .cohereTranscribeCoremlInt8:
80+
return "cohere-transcribe-03-2026-coreml/q8"
7581
}
7682
}
7783

@@ -94,6 +100,8 @@ public enum Repo: String, CaseIterable {
94100
return "FluidInference/qwen3-asr-0.6b-coreml"
95101
case .parakeetTdtCtc110m:
96102
return "FluidInference/parakeet-tdt-ctc-110m-coreml"
103+
case .cohereTranscribeCoreml, .cohereTranscribeCoremlInt8:
104+
return "FluidInference/cohere-transcribe-03-2026-coreml"
97105
default:
98106
return "FluidInference/\(name)"
99107
}
@@ -116,6 +124,10 @@ public enum Repo: String, CaseIterable {
116124
return "nemotron_coreml_1120ms"
117125
case .nemotronStreaming560:
118126
return "nemotron_coreml_560ms"
127+
case .cohereTranscribeCoreml:
128+
return "f16"
129+
case .cohereTranscribeCoremlInt8:
130+
return "q8"
119131
default:
120132
return nil
121133
}
@@ -150,6 +162,10 @@ public enum Repo: String, CaseIterable {
150162
return "parakeet-tdt-ja"
151163
case .parakeetTdtCtc110m:
152164
return "parakeet-tdt-ctc-110m"
165+
case .cohereTranscribeCoreml:
166+
return "cohere-transcribe/f16"
167+
case .cohereTranscribeCoremlInt8:
168+
return "cohere-transcribe/q8"
153169
default:
154170
return name.replacingOccurrences(of: "-coreml", with: "")
155171
}
@@ -585,6 +601,30 @@ public enum ModelNames {
585601
]
586602
}
587603

604+
/// Model and file names for the Cohere Transcribe CoreML models.
605+
/// Encoder-decoder ASR with 14-language support (35-second window architecture)
606+
public enum CohereTranscribe {
607+
/// Base model names, without a file extension (extensions are appended below).
public static let encoder = "cohere_encoder"
608+
public static let decoderStateful = "cohere_decoder_stateful"
609+
/// Vocabulary file name; unlike the model names, this already carries its extension.
public static let vocab = "vocab.json"
610+
611+
/// `.mlpackage` file names derived from the base model names.
public static let encoderFile = encoder + ".mlpackage"
612+
public static let decoderStatefulFile = decoderStateful + ".mlpackage"
613+
614+
/// `.mlmodelc` file names for the same models.
/// For compatibility - models can be .mlmodelc or .mlpackage
/// NOTE(review): these names are not part of `requiredModels` below; confirm
/// where the compiled variants are actually consulted.
615+
public static let encoderCompiledFile = encoder + ".mlmodelc"
616+
public static let decoderStatefulCompiledFile = decoderStateful + ".mlmodelc"
617+
618+
/// Alias of `decoderStatefulFile`.
/// NOTE(review): originally described as "used by CohereAsrModels"; the loading
/// code visible in this change references `decoderStateful` instead - confirm
/// this alias still has a caller.
619+
public static let decoderFile = decoderStatefulFile
620+
621+
/// File names that make up the required set for this model: the encoder and
/// stateful decoder `.mlpackage` names plus the vocabulary file.
public static let requiredModels: Set<String> = [
622+
encoderFile,
623+
decoderStatefulFile,
624+
vocab,
625+
]
626+
}
627+
588628
/// G2P (grapheme-to-phoneme) model names
589629
public enum G2P {
590630
public static let encoder = "G2PEncoder"
@@ -701,6 +741,8 @@ public enum ModelNames {
701741
return ModelNames.Qwen3ASR.requiredModelsFull
702742
case .multilingualG2p:
703743
return ModelNames.MultilingualG2P.requiredModels
744+
case .cohereTranscribeCoreml, .cohereTranscribeCoremlInt8:
745+
return ModelNames.CohereTranscribe.requiredModels
704746
}
705747
}
706748
}

0 commit comments

Comments
 (0)