这年头说实在的，有了 pair coding（AI 结对编程）之后，手写这些代码都成了笑话。今天就算记一个交互类程序的常识吧。简言之，交互类程序里一定要用多线程，这是一个基本思路。以下代码由可可爱爱、花了钱的 Opus 4.7 编写：它是一个 Flask 本地小服务端，负责把语音请求转发到 Google APIs 的 Speech-to-Text 云服务。
"""Bridge HTTP service between OVOS and Google Cloud Speech-to-Text.

Emulates the ovos-stt-plugin-server endpoint and forwards the audio to the
Google Cloud Speech-to-Text REST API using an API key (AIza...).

OVOS side:
    POST /stt   multipart/form-data
        audio = <wav file>
        lang  = en-US (optional)
    -> 200 text/plain "<transcription>"

Google side:
    POST https://speech.googleapis.com/v1/speech:recognize
    X-Goog-Api-Key: <API_KEY>
"""
import base64
import io
import logging
import os
import time
import wave

import requests
from flask import Flask, request
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

API_KEY = os.environ.get("STT_PW", "").strip()
DEFAULT_LANG = os.environ.get("STT_LANG", "en-US")
GOOGLE_URL = "https://speech.googleapis.com/v1/speech:recognize"

app = Flask(__name__)
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = app.logger

# Reuse the TCP/TLS connection to Google across requests and allow a single
# automatic retry: on connection failures (connect=1) and on the listed 5xx
# statuses. read=0 disables retries after a read error, so a request that was
# already sent is not replayed because of a slow or broken response.
_session = requests.Session()
_session.mount(
    "https://",
    HTTPAdapter(
        max_retries=Retry(
            total=1,
            connect=1,
            read=0,
            backoff_factor=0.5,
            status_forcelist=(500, 502, 503, 504),
            allowed_methods=frozenset(["POST"]),
            respect_retry_after_header=True,
        )
    ),
)


def _wav_to_pcm(wav_bytes: bytes):
    """Return ``(raw_pcm_bytes, sample_rate, channels)`` for *wav_bytes*.

    If the input cannot be parsed as a WAV file, or its sample width is not
    16 bit, the bytes are passed through unchanged and assumed to be raw
    16-bit mono 16 kHz PCM (what OVOS records by default).
    """
    try:
        with wave.open(io.BytesIO(wav_bytes), "rb") as w:
            sw = w.getsampwidth()
            if sw != 2:
                # LINEAR16 is the only encoding sent upstream.
                raise ValueError(f"unsupported sample width {sw}")
            return w.readframes(w.getnframes()), w.getframerate(), w.getnchannels()
    except (wave.Error, EOFError, ValueError) as e:
        log.warning("Not a valid WAV (%s) — treating as raw 16k mono PCM", e)
        return wav_bytes, 16000, 1


def _extract_transcript(data: dict) -> str:
    """Concatenate the top alternative of every result in a recognize reply.

    Each entry of ``results`` covers a consecutive portion of the audio, so
    all of them are needed to reproduce the whole utterance. The pieces are
    joined without an added separator so that whatever spacing the service
    put into the transcripts is kept as-is.
    """
    pieces = []
    for res in data.get("results") or []:
        alts = res.get("alternatives") or []
        if alts:
            pieces.append(alts[0].get("transcript") or "")
    return "".join(pieces).strip()


@app.route("/stt", methods=["POST"])
def stt():
    """Transcribe the uploaded audio through Google Speech-to-Text.

    The audio is read from the multipart field ``audio`` or, when that field
    is absent, from the raw request body. The language is taken from the
    ``lang`` form field, then the ``lang`` query argument, then
    ``DEFAULT_LANG``.

    Returns:
        200 with the transcript as ``text/plain`` (possibly empty);
        400 when no audio was supplied;
        500 when the server has no API key configured;
        502 when the upstream request fails, returns a non-200 status, or
        returns a body that is not a JSON object.
    """
    if not API_KEY:
        return "STT_PW not set on server", 500

    if "audio" in request.files:
        audio_bytes = request.files["audio"].read()
    else:
        audio_bytes = request.get_data() or b""
    if not audio_bytes:
        return "no audio", 400

    lang = request.form.get("lang") or request.args.get("lang") or DEFAULT_LANG
    pcm, sr, ch = _wav_to_pcm(audio_bytes)
    if not pcm:
        # A well-formed WAV header with zero frames: nothing to recognize.
        return "no audio", 400

    payload = {
        "config": {
            "encoding": "LINEAR16",
            "sampleRateHertz": sr,
            "audioChannelCount": ch,
            "languageCode": lang,
            "enableAutomaticPunctuation": True,
        },
        "audio": {"content": base64.b64encode(pcm).decode("ascii")},
    }

    t0 = time.monotonic()
    log.info("recv lang=%s sr=%s ch=%s bytes=%d", lang, sr, ch, len(pcm))
    try:
        r = _session.post(
            GOOGLE_URL,
            # The key travels in a header rather than in the query string:
            # requests' exception messages embed the request URL, and those
            # messages are both logged and returned to the caller below.
            headers={"X-Goog-Api-Key": API_KEY},
            json=payload,
            # (connect, read) — fail fast so OVOS gets an error within its
            # 30s window.
            timeout=(5, 15),
        )
    except requests.RequestException as e:
        log.error("Google request failed after %.1fs: %s", time.monotonic() - t0, e)
        return f"google request failed: {e}", 502

    if r.status_code != 200:
        log.error("Google error %s: %s", r.status_code, r.text[:500])
        return f"google error {r.status_code}: {r.text}", 502

    try:
        data = r.json()
    except ValueError:
        data = None
    if not isinstance(data, dict):
        # A 200 whose body is not a JSON object is an upstream failure, not
        # an internal server error.
        log.error("Google returned an unparseable body: %s", r.text[:500])
        return "google returned an unparseable response", 502

    transcript = _extract_transcript(data)
    log.info("done %.2fs -> %r", time.monotonic() - t0, transcript)
    return transcript, 200, {"Content-Type": "text/plain; charset=utf-8"}
@app.route("/", methods=["GET"])
def index():
    """Liveness probe: answer every GET on the root path with ``ok``."""
    return "ok", 200


def _run_local_server():
    """Start the Flask server on the loopback interface.

    The listening port is read from the ``PORT`` environment variable and
    falls back to 9090. ``threaded=True`` is passed so that requests are
    handled in separate threads.
    """
    listen_port = int(os.environ.get("PORT", "9090"))
    app.run(host="127.0.0.1", port=listen_port, threaded=True)


if __name__ == "__main__":
    _run_local_server()