Files
herolib/lib/clients/openai/audio/audio.v
2025-08-06 08:50:32 +02:00

231 lines
3.9 KiB
V

module audio
import json
import freeflowuniverse.herolib.core.httpconnection
import os
import net.http
import freeflowuniverse.herolib.clients.openai { OpenAI }
// OpenAIAlias is a module-local alias of the imported OpenAI client type.
// It is the receiver type of the audio methods declared in this file
// (create_transcription, create_tranlation, create_speech).
type OpenAIAlias = OpenAI
// Voice lists the voices that can be selected in CreateSpeechArgs.
// voice_str converts a variant to the lowercase name placed in the
// speech request.
pub enum Voice {
alloy
ash
coral
echo
fable
onyx
nova
sage
shimmer
}
// voice_str maps a Voice variant to the string used for the `voice`
// field of a speech request. The match is exhaustive, so adding a
// variant to Voice without extending this table is a compile error.
fn voice_str(x Voice) string {
	name := match x {
		.alloy { 'alloy' }
		.ash { 'ash' }
		.coral { 'coral' }
		.echo { 'echo' }
		.fable { 'fable' }
		.onyx { 'onyx' }
		.nova { 'nova' }
		.sage { 'sage' }
		.shimmer { 'shimmer' }
	}
	return name
}
// AudioFormat lists the output formats that can be selected in
// CreateSpeechArgs.response_format. audio_format_str converts a variant
// to the lowercase name placed in the speech request.
pub enum AudioFormat {
mp3
opus
aac
flac
wav
pcm
}
// audio_format_str maps an AudioFormat variant to the string used for the
// `response_format` field of a speech request.
fn audio_format_str(x AudioFormat) string {
	name := match x {
		.mp3 { 'mp3' }
		.opus { 'opus' }
		.aac { 'aac' }
		.flac { 'flac' }
		.wav { 'wav' }
		.pcm { 'pcm' }
	}
	return name
}
// AudioRespType selects the response format requested for a
// transcription or translation (AudioArgs.response_format).
// audio_resp_type_str converts a variant to the value sent in the
// `response_format` form field.
pub enum AudioRespType {
json
text
srt
verbose_json
vtt
}
// audio_model is the model name sent in the `model` form field of every
// transcription and translation request built by create_audio_request.
const audio_model = 'whisper-1'
// audio_mime_types maps a supported audio file extension (including the
// leading dot) to the content type attached to the uploaded file part.
// An extension that is not a key of this map is rejected by
// create_audio_request.
// `.mpga` is MPEG audio, so it shares `audio/mpeg` with `.mp3` and `.mpeg`.
const audio_mime_types = {
	'.mp3':  'audio/mpeg'
	'.mp4':  'audio/mp4'
	'.mpeg': 'audio/mpeg'
	'.mpga': 'audio/mpeg'
	'.m4a':  'audio/mp4'
	'.wav':  'audio/vnd.wav'
	'.webm': 'application/octet-stream'
}
// audio_resp_type_str maps an AudioRespType variant to the string sent in
// the `response_format` form field of a transcription/translation request.
fn audio_resp_type_str(i AudioRespType) string {
	name := match i {
		.json { 'json' }
		.text { 'text' }
		.srt { 'srt' }
		.verbose_json { 'verbose_json' }
		.vtt { 'vtt' }
	}
	return name
}
// AudioArgs holds the arguments for create_transcription and
// create_tranlation.
@[params]
pub struct AudioArgs {
pub mut:
// path of the audio file to upload; its extension must be a key of audio_mime_types
filepath string
// sent as the `prompt` form field
prompt string
// format requested from the server; defaults to the first variant (.json)
response_format AudioRespType
// sent as the `temperature` form field, rendered with .str()
// NOTE(review): declared as int, so fractional values cannot be passed — confirm against the API's expected range
temperature int
// sent as the `language` form field
language string
}
// AudioResponse is the result of a transcription or translation request.
// create_audio_request fills it by JSON-decoding the response body.
pub struct AudioResponse {
pub mut:
// decoded from the `text` field of the response body
text string
}
// create_transcription uploads the audio file named in `args` to the
// 'audio/transcriptions' endpoint and returns the decoded response.
// Supported audio formats are mp3, mp4, mpeg, mpga, m4a, wav, or webm.
// Any error from the underlying request is propagated to the caller.
pub fn (mut f OpenAIAlias) create_transcription(args AudioArgs) !AudioResponse {
	transcription := f.create_audio_request(args, 'audio/transcriptions')!
	return transcription
}
// create_translation uploads the audio file named in `args` to the
// 'audio/translations' endpoint, which translates the audio to English,
// and returns the decoded response.
// Supported audio formats are mp3, mp4, mpeg, mpga, m4a, wav, or webm.
pub fn (mut f OpenAIAlias) create_translation(args AudioArgs) !AudioResponse {
	return f.create_audio_request(args, 'audio/translations')
}

// create_tranlation is the original, misspelled name of create_translation.
// It is kept so existing callers keep compiling; new code should call
// create_translation instead.
pub fn (mut f OpenAIAlias) create_tranlation(args AudioArgs) !AudioResponse {
	return f.create_translation(args)
}
// create_audio_request uploads the audio file at `args.filepath` as a
// multipart form to `endpoint` and returns the server's answer.
//
// The file's extension (matched case-insensitively) must be a key of
// audio_mime_types, otherwise an error is returned. `prompt` and `language`
// are optional and are only sent when they are non-empty.
//
// Errors: reading the file fails, the extension is unsupported, the
// connection or request fails, the server answers with a status other than
// 200, or a JSON response cannot be decoded.
fn (mut f OpenAIAlias) create_audio_request(args AudioArgs, endpoint string) !AudioResponse {
	file_content := os.read_file(args.filepath)!
	// lower-case the extension so 'clip.MP3' is accepted like 'clip.mp3'
	ext := os.file_ext(args.filepath).to_lower()
	file_mime_type := audio_mime_types[ext] or { return error('file extenion not supported') }
	file_data := http.FileData{
		filename:     os.base(args.filepath)
		content_type: file_mime_type
		data:         file_content
	}
	mut fields := {
		'model':           audio_model
		'response_format': audio_resp_type_str(args.response_format)
		'temperature':     args.temperature.str()
	}
	// optional fields: leave them out entirely rather than sending empty values
	if args.prompt != '' {
		fields['prompt'] = args.prompt
	}
	if args.language != '' {
		fields['language'] = args.language
	}
	form := http.PostMultipartFormConfig{
		files: {
			'file': [file_data]
		}
		form:  fields
	}
	req := httpconnection.Request{
		prefix: endpoint
	}
	mut conn := f.connection()!
	r := conn.post_multi_part(req, form)!
	if r.status_code != 200 {
		return error('got error from server: ${r.body}')
	}
	// text, srt and vtt are returned as plain text rather than JSON, so the
	// body itself is the result and must not go through json.decode
	if args.response_format in [.text, .srt, .vtt] {
		return AudioResponse{
			text: r.body
		}
	}
	return json.decode(AudioResponse, r.body)!
}
// CreateSpeechArgs holds the arguments for create_speech.
@[params]
pub struct CreateSpeechArgs {
pub:
	// model used for synthesis; 'tts-1' is the model id as spelled by the
	// OpenAI API (with a hyphen, not an underscore)
	model string = 'tts-1'
	// text to convert to speech
	input string @[required]
	// voice used for synthesis
	voice Voice = .alloy
	// audio format of the generated output
	response_format AudioFormat = .mp3
	// value of the `speed` field of the request
	speed f32 = 1.0
	// path of the file the response is written to
	output_path string @[required]
}
// CreateSpeechRequest is the payload that create_speech JSON-encodes and
// posts to 'audio/speech'. `voice` and `response_format` carry the string
// forms produced by voice_str and audio_format_str.
pub struct CreateSpeechRequest {
pub:
model string
input string
voice string
response_format string
speed f32
}
// create_speech posts a speech request built from `args` to 'audio/speech'
// and writes the response to `args.output_path`.
//
// The output file is opened (mode 'w+') before the request is sent, so an
// unusable path is reported before any request is made.
//
// Errors: the output file cannot be opened, the connection or request
// fails, or writing the response fails.
pub fn (mut f OpenAIAlias) create_speech(args CreateSpeechArgs) ! {
	mut output_file := os.open_file(args.output_path, 'w+')!
	// release the file handle when the function returns; without this the
	// handle stays open after every call
	defer {
		output_file.close()
	}
	req := CreateSpeechRequest{
		model:           args.model
		input:           args.input
		voice:           voice_str(args.voice)
		response_format: audio_format_str(args.response_format)
		speed:           args.speed
	}
	data := json.encode(req)
	mut conn := f.connection()!
	r := conn.post_json_str(prefix: 'audio/speech', data: data)!
	output_file.write(r.bytes())!
}