;============================================================
; iron_speech.hsp - unified speech-recognition API with switchable backends
;
; A thin wrapper that exposes several speech-recognition backends behind
; one set of commands.
;
; Backends:
;   SPEECH_BACKEND_CLOUD   -> cloud Whisper-style API through iron_ai
;                             (OpenAI / Groq / Cloudflare, ...)
;   SPEECH_BACKEND_WHISPER -> hspwhisper.dll (whisper.cpp, offline)
;   SPEECH_BACKEND_SAPI    -> hspsapi.dll (Windows SAPI 5)
;   SPEECH_BACKEND_WINRT   -> hspwinrtspeech.dll (WinRT, microphone only)
;   SPEECH_BACKEND_VOSK    -> hspvosk.dll (Kaldi / Vosk, offline)
;
; Capability matrix as implemented in this file:
;   iron_speech_transcribe       : CLOUD, WHISPER, VOSK, SAPI
;   iron_speech_listen_once      : WINRT
;   iron_speech_listen_start_mic : VOSK
;   Any other combination returns SPEECH_ERR_NOT_IMPL.
;
; Dependencies:
;   - iron_ai.hsp (CLOUD backend)
;   - hspwhisper.dll / hspvosk.dll / hspsapi.dll / hspwinrtspeech.dll
;   - hspmfcam.dll (microphone capture for continuous recognition)
;
; API (every command reports its status through the system variable stat):
;   iron_speech_set_backend  backend_id
;   iron_speech_set_endpoint "url"      ; CLOUD only
;   iron_speech_set_key      "..."      ; CLOUD only
;   iron_speech_set_model    "..."      ; CLOUD: model name (whisper-1, ...)
;                                       ; WHISPER / VOSK: model path
;   iron_speech_set_language "ja"       ; language hint (WHISPER, SAPI, WINRT)
;
;   iron_speech_open                        ; initialise the selected backend
;   iron_speech_transcribe "audio.wav", text ; transcribe a file
;   iron_speech_listen_once text            ; one phrase from the microphone
;   iron_speech_close                       ; release every resource
;
; Example (Groq Whisper):
;   #include "iron_speech.hsp"
;
;   iron_speech_set_backend  SPEECH_BACKEND_CLOUD
;   iron_speech_set_endpoint "https://api.groq.com/openai/v1"
;   iron_speech_set_key      "gsk_..."
;   iron_speech_set_model    "whisper-large-v3"
;   iron_speech_open
;   iron_speech_transcribe "voice.wav", text
;   mes "result: " + text
;   iron_speech_close
;============================================================

#ifndef __iron_speech_hsp__
#define __iron_speech_hsp__

#include "iron_ai.hsp"

#module iron_speech

;------------------------------------------------------------
; hspwhisper.dll (offline whisper.cpp backend)
; The original author notes that this script can be included even where
; the DLL is absent and that open then simply yields -1.
; NOTE(review): confirm how the HSP runtime reacts to a missing #uselib DLL.
;------------------------------------------------------------
#uselib "hspwhisper.dll"
#cfunc _hwhisper_open           "whisper_open"           str
#cfunc _hwhisper_transcribe_wav "whisper_transcribe_wav" int, str, str, var, int
#func  _hwhisper_close          "whisper_close"          int

;------------------------------------------------------------
; hspvosk.dll (offline Vosk backend)
;------------------------------------------------------------
#uselib "hspvosk.dll"
#cfunc _hvosk_open           "hvosk_open"           str, int
#cfunc _hvosk_transcribe_wav "hvosk_transcribe_wav" int, str, var, int
#cfunc _hvosk_accept_pcm_s16 "hvosk_accept_pcm_s16" int, var, int
#func  _hvosk_partial_result "hvosk_partial_result" int, var, int
#func  _hvosk_final_result   "hvosk_final_result"   int, var, int
#func  _hvosk_reset          "hvosk_reset"          int
#func  _hvosk_close          "hvosk_close"          int

;------------------------------------------------------------
; hspsapi.dll (Windows SAPI 5 backend)
;------------------------------------------------------------
#uselib "hspsapi.dll"
#cfunc _hsapi_open           "sapi_open"           str
#cfunc _hsapi_transcribe_wav "sapi_transcribe_wav" int, str, var, int
#func  _hsapi_close          "sapi_close"          int

;------------------------------------------------------------
; hspwinrtspeech.dll (Windows.Media.SpeechRecognition, microphone only)
;------------------------------------------------------------
#uselib "hspwinrtspeech.dll"
#cfunc _hwinrt_speech_open        "winrt_speech_open"        str
#cfunc _hwinrt_speech_listen_once "winrt_speech_listen_once" int, var, int
#func  _hwinrt_speech_close       "winrt_speech_close"       int

;------------------------------------------------------------
; hspmfcam.dll microphone entry points, re-declared here so this module
; works without iron_camera_mf. Declared ahead of iron_speech_close
; because that command releases the microphone as well.
;------------------------------------------------------------
#uselib "hspmfcam.dll"
#cfunc _isp_mfcam_audio_open  "mfcam_audio_open"      int, int, int, int
#func  _isp_mfcam_audio_close "mfcam_audio_close"     int
#cfunc _isp_mfcam_audio_avail "mfcam_audio_pcm_avail" int
#func  _isp_mfcam_audio_read  "mfcam_audio_read_pcm"  int, var, int

;------------------------------------------------------------
; Backend ID constants
;------------------------------------------------------------
#define global SPEECH_BACKEND_CLOUD   1
#define global SPEECH_BACKEND_WHISPER 2
#define global SPEECH_BACKEND_SAPI    3
#define global SPEECH_BACKEND_WINRT   4
#define global SPEECH_BACKEND_VOSK    5

;------------------------------------------------------------
; Status constants (delivered through stat)
;------------------------------------------------------------
#define global SPEECH_OK            0
#define global SPEECH_ERR_BACKEND  -1
#define global SPEECH_ERR_NOT_IMPL -2
#define global SPEECH_ERR_NOT_OPEN -3
#define global SPEECH_ERR_HTTP     -4

;------------------------------------------------------------
; State
; The module variables are initialised after #global at the end of this
; file.
;------------------------------------------------------------

;------------------------------------------------------------
; Configuration setters
; Each one only stores its argument; the value takes effect on the next
; iron_speech_open (or directly in the call that reads it).
;------------------------------------------------------------

; Select the backend (one of the SPEECH_BACKEND_* constants).
#deffunc iron_speech_set_backend int backend_id
	_isp_backend = backend_id
	return

; Base URL handed to iron_ai by iron_speech_open (CLOUD).
#deffunc iron_speech_set_endpoint str url
	_isp_endpoint = url
	return

; API key handed to iron_ai by iron_speech_open (CLOUD).
#deffunc iron_speech_set_key str key
	_isp_key = key
	return

; Model name (CLOUD) or model path (WHISPER / VOSK).
#deffunc iron_speech_set_model str model
	_isp_model = model
	return

; Language hint; passed to the WHISPER, SAPI and WINRT backends.
#deffunc iron_speech_set_language str lang
	_isp_language = lang
	return

;------------------------------------------------------------
; iron_speech_open
;   Initialises the currently selected backend.
;   stat: SPEECH_OK on success, SPEECH_ERR_BACKEND when the backend
;         reports a negative handle or the backend id is unknown.
;   A handle left over from an earlier open of the same backend is
;   closed first so that calling open twice does not leak it.
;------------------------------------------------------------
#deffunc iron_speech_open
	if _isp_backend = SPEECH_BACKEND_CLOUD {
		; Forward the stored settings to iron_ai.
		iron_ai_set_endpoint _isp_endpoint
		iron_ai_set_key _isp_key
		iron_ai_set_model _isp_model
		_isp_opened = 1
		return SPEECH_OK
	}
	if _isp_backend = SPEECH_BACKEND_WHISPER {
		; _isp_model holds the path of the whisper.cpp model file (.bin).
		if _isp_whisper_handle >= 0 : _hwhisper_close _isp_whisper_handle
		_isp_whisper_handle = _hwhisper_open(_isp_model)
		if _isp_whisper_handle < 0 : return SPEECH_ERR_BACKEND
		_isp_opened = 1
		return SPEECH_OK
	}
	if _isp_backend = SPEECH_BACKEND_SAPI {
		; _isp_language is a BCP 47 code such as "en-US"
		; (per the original note, empty selects the system default).
		if _isp_sapi_handle >= 0 : _hsapi_close _isp_sapi_handle
		_isp_sapi_handle = _hsapi_open(_isp_language)
		if _isp_sapi_handle < 0 : return SPEECH_ERR_BACKEND
		_isp_opened = 1
		return SPEECH_OK
	}
	if _isp_backend = SPEECH_BACKEND_WINRT {
		; Microphone-only backend; _isp_language is e.g. "en-US".
		if _isp_winrt_handle >= 0 : _hwinrt_speech_close _isp_winrt_handle
		_isp_winrt_handle = _hwinrt_speech_open(_isp_language)
		if _isp_winrt_handle < 0 : return SPEECH_ERR_BACKEND
		_isp_opened = 1
		return SPEECH_OK
	}
	if _isp_backend = SPEECH_BACKEND_VOSK {
		; _isp_model is the Vosk model directory
		; (e.g. "vosk-model-small-ja-0.22").
		if _isp_vosk_handle >= 0 : _hvosk_close _isp_vosk_handle
		_isp_vosk_handle = _hvosk_open(_isp_model, _isp_vosk_sr)
		if _isp_vosk_handle < 0 : return SPEECH_ERR_BACKEND
		_isp_opened = 1
		return SPEECH_OK
	}
	return SPEECH_ERR_BACKEND

;------------------------------------------------------------
; iron_speech_transcribe "file.wav", text
;   Transcribes an audio file into out_text (cleared on entry).
;   stat: SPEECH_OK, SPEECH_ERR_NOT_OPEN, SPEECH_ERR_HTTP (CLOUD, when
;         iron_ai does not report 200), SPEECH_ERR_BACKEND (offline
;         backend failure) or SPEECH_ERR_NOT_IMPL (WINRT / unknown).
;
;   The DLL transcribe entry points are #cfunc, so they are called in
;   function form and their result is inspected.
;   NOTE(review): a negative result is treated as failure, mirroring the
;   convention of the *_open calls - confirm against the DLL sources.
;------------------------------------------------------------
#deffunc iron_speech_transcribe str audio_path, var out_text, local rc
	sdim out_text, 4096
	out_text = ""
	if _isp_opened = 0 : return SPEECH_ERR_NOT_OPEN

	if _isp_backend = SPEECH_BACKEND_CLOUD {
		iron_ai_transcribe audio_path, out_text, _isp_model
		if stat ! 200 : return SPEECH_ERR_HTTP
		return SPEECH_OK
	}
	if _isp_backend = SPEECH_BACKEND_WHISPER {
		if _isp_whisper_handle < 0 : return SPEECH_ERR_NOT_OPEN
		; 64 KiB output buffer; the DLL receives capacity - 1.
		sdim out_text, 65536
		rc = _hwhisper_transcribe_wav(_isp_whisper_handle, audio_path, _isp_language, out_text, 65535)
		if rc < 0 : return SPEECH_ERR_BACKEND
		return SPEECH_OK
	}
	if _isp_backend = SPEECH_BACKEND_VOSK {
		if _isp_vosk_handle < 0 : return SPEECH_ERR_NOT_OPEN
		sdim out_text, 65536
		rc = _hvosk_transcribe_wav(_isp_vosk_handle, audio_path, out_text, 65535)
		if rc < 0 : return SPEECH_ERR_BACKEND
		return SPEECH_OK
	}
	if _isp_backend = SPEECH_BACKEND_SAPI {
		if _isp_sapi_handle < 0 : return SPEECH_ERR_NOT_OPEN
		sdim out_text, 65536
		rc = _hsapi_transcribe_wav(_isp_sapi_handle, audio_path, out_text, 65535)
		if rc < 0 : return SPEECH_ERR_BACKEND
		return SPEECH_OK
	}
	return SPEECH_ERR_NOT_IMPL

;------------------------------------------------------------
; iron_speech_close
;   Releases every open backend handle and the microphone, then marks
;   the module as closed. Safe to call when nothing is open.
;   The microphone is closed here too: leaving it open after the Vosk
;   handle is gone would let iron_speech_listen_poll feed an invalid
;   handle to the DLL.
;   stat: always SPEECH_OK.
;------------------------------------------------------------
#deffunc iron_speech_close
	if _isp_mic_handle >= 0 {
		_isp_mfcam_audio_close _isp_mic_handle
		_isp_mic_handle = -1
	}
	if _isp_whisper_handle >= 0 {
		_hwhisper_close _isp_whisper_handle
		_isp_whisper_handle = -1
	}
	if _isp_vosk_handle >= 0 {
		_hvosk_close _isp_vosk_handle
		_isp_vosk_handle = -1
	}
	if _isp_sapi_handle >= 0 {
		_hsapi_close _isp_sapi_handle
		_isp_sapi_handle = -1
	}
	if _isp_winrt_handle >= 0 {
		_hwinrt_speech_close _isp_winrt_handle
		_isp_winrt_handle = -1
	}
	_isp_opened = 0
	return SPEECH_OK

;------------------------------------------------------------
; iron_speech_listen_once out_text
;   Captures a single phrase from the live microphone (WINRT backend
;   only). Unlike file transcription, this waits for one utterance.
;   stat: SPEECH_OK, SPEECH_ERR_NOT_OPEN, SPEECH_ERR_BACKEND or
;         SPEECH_ERR_NOT_IMPL for any other backend.
;   NOTE(review): a negative DLL result is treated as failure - confirm.
;------------------------------------------------------------
#deffunc iron_speech_listen_once var out_text, local rc
	sdim out_text, 4096
	out_text = ""
	if _isp_opened = 0 : return SPEECH_ERR_NOT_OPEN
	if _isp_backend ! SPEECH_BACKEND_WINRT : return SPEECH_ERR_NOT_IMPL
	if _isp_winrt_handle < 0 : return SPEECH_ERR_NOT_OPEN

	rc = _hwinrt_speech_listen_once(_isp_winrt_handle, out_text, 4095)
	if rc < 0 : return SPEECH_ERR_BACKEND
	return SPEECH_OK

;------------------------------------------------------------
; iron_speech_backend_name()
;   Returns a display name for the currently selected backend,
;   or "(unknown)" when the backend id is not recognised.
;------------------------------------------------------------
#defcfunc iron_speech_backend_name
	if _isp_backend = SPEECH_BACKEND_CLOUD   : return "cloud"
	if _isp_backend = SPEECH_BACKEND_WHISPER : return "whisper.cpp (offline)"
	if _isp_backend = SPEECH_BACKEND_SAPI    : return "Windows SAPI 5"
	if _isp_backend = SPEECH_BACKEND_WINRT   : return "WinRT Speech (mic-only)"
	if _isp_backend = SPEECH_BACKEND_VOSK    : return "Vosk (offline)"
	return "(unknown)"

;============================================================
; Continuous recognition API (A4)
;
;   iron_speech_listen_start_mic dev_idx
;       Starts real-time recognition from a microphone (VOSK backend
;       only). The microphone is opened through hspmfcam.dll
;       (mfcam_audio_open).
;
;   iron_speech_listen_poll partial_var, final_var
;       partial_var : text recognised so far for the current phrase
;       final_var   : final text, non-empty only when a phrase completes
;
;   iron_speech_listen_stop
;       Stops recognition and closes the microphone.
;
; Example:
;   iron_speech_set_backend SPEECH_BACKEND_VOSK
;   iron_speech_set_model "vosk-model-small-ja-0.22"
;   iron_speech_open
;   iron_speech_listen_start_mic 0
;   *listen_loop
;       iron_speech_listen_poll partial, final
;       if strlen(partial) > 0 : mes "partial: " + partial
;       if strlen(final) > 0   : mes "final: " + final
;       await 50
;       stick k, 128
;       if (k & 128) = 0 : goto *listen_loop
;   iron_speech_listen_stop
;============================================================

;------------------------------------------------------------
; iron_speech_listen_start_mic dev_idx
;   Opens microphone dev_idx as mono 16-bit PCM at the same sample rate
;   the Vosk recogniser was opened with (_isp_vosk_sr).
;   A microphone left open by an earlier call is closed first.
;   stat: SPEECH_OK, SPEECH_ERR_NOT_OPEN, SPEECH_ERR_NOT_IMPL (backend is
;         not VOSK) or SPEECH_ERR_BACKEND (microphone open failed).
;------------------------------------------------------------
#deffunc iron_speech_listen_start_mic int dev_idx
	if _isp_opened = 0 : return SPEECH_ERR_NOT_OPEN
	if _isp_backend ! SPEECH_BACKEND_VOSK : return SPEECH_ERR_NOT_IMPL
	if _isp_vosk_handle < 0 : return SPEECH_ERR_NOT_OPEN

	if _isp_mic_handle >= 0 {
		_isp_mfcam_audio_close _isp_mic_handle
		_isp_mic_handle = -1
	}
	; Arguments: device index, sample rate, channel count, bits per sample.
	_isp_mic_handle = _isp_mfcam_audio_open(dev_idx, _isp_vosk_sr, 1, 16)
	if _isp_mic_handle < 0 : return SPEECH_ERR_BACKEND
	return SPEECH_OK

;------------------------------------------------------------
; iron_speech_listen_poll out_partial, out_final
;   Moves pending microphone audio into Vosk and fetches the results.
;   Both outputs are cleared on entry; out_final is filled only when
;   hvosk_accept_pcm_s16 returns 1.
;   stat: SPEECH_OK, or SPEECH_ERR_NOT_OPEN when the microphone or the
;         Vosk recogniser is not open.
;------------------------------------------------------------
#deffunc iron_speech_listen_poll var out_partial, var out_final, local _avail, local _read, local _flag
	sdim out_partial, 4096
	sdim out_final, 4096
	out_partial = ""
	out_final = ""
	if _isp_mic_handle < 0 : return SPEECH_ERR_NOT_OPEN
	if _isp_vosk_handle < 0 : return SPEECH_ERR_NOT_OPEN

	; 1. Drain the microphone ring buffer, at most 16000 bytes per poll
	;    (0.5 s of 16 kHz mono s16 audio).
	_avail = _isp_mfcam_audio_avail(_isp_mic_handle)
	if _avail >= 2 {
		if _avail > 16000 : _avail = 16000
		sdim _isp_pcm_buf, _avail + 16
		_isp_mfcam_audio_read _isp_mic_handle, _isp_pcm_buf, _avail
		_read = stat	; the #func result (used as the byte count) arrives in stat

		; 2. Feed Vosk only when at least one whole s16 sample was read;
		;    a failed or empty read must not yield a bogus sample count.
		if _read >= 2 {
			_flag = _hvosk_accept_pcm_s16(_isp_vosk_handle, _isp_pcm_buf, _read / 2)
			if _flag = 1 {
				; A phrase was finalised.
				_hvosk_final_result _isp_vosk_handle, out_final, 4095
			}
		}
	}

	; The partial result is fetched on every poll.
	_hvosk_partial_result _isp_vosk_handle, out_partial, 4095
	return SPEECH_OK

;------------------------------------------------------------
; iron_speech_listen_stop
;   Closes the microphone (if open) and resets the Vosk recogniser
;   (if open). The recogniser itself stays open for further use.
;   stat: always SPEECH_OK.
;------------------------------------------------------------
#deffunc iron_speech_listen_stop
	if _isp_mic_handle >= 0 {
		_isp_mfcam_audio_close _isp_mic_handle
		_isp_mic_handle = -1
	}
	if _isp_vosk_handle >= 0 : _hvosk_reset _isp_vosk_handle
	return SPEECH_OK

#global

;------------------------------------------------------------
; Module state defaults. These statements run when execution passes
; through the include, so include this file before using the API.
; Handles use -1 for "not open".
;------------------------------------------------------------
_isp_backend@iron_speech = 1	; SPEECH_BACKEND_CLOUD
sdim _isp_endpoint@iron_speech, 1024
sdim _isp_key@iron_speech, 1024
sdim _isp_model@iron_speech, 256
sdim _isp_language@iron_speech, 32
_isp_endpoint@iron_speech = "https://api.openai.com/v1"
_isp_key@iron_speech = ""
_isp_model@iron_speech = "whisper-1"
_isp_language@iron_speech = ""
_isp_opened@iron_speech = 0
_isp_whisper_handle@iron_speech = -1
_isp_vosk_handle@iron_speech = -1
_isp_vosk_sr@iron_speech = 16000	; sample rate shared by Vosk and the microphone
_isp_sapi_handle@iron_speech = -1
_isp_winrt_handle@iron_speech = -1
_isp_mic_handle@iron_speech = -1
sdim _isp_pcm_buf@iron_speech, 32768

#endif