import os
import spaces

REPO_TYPE = "hf"

from huggingface_hub import snapshot_download

# Local directories that hold the downloaded model snapshots.
MODEL_CACHE_DIR = "./models"
FUN_ASR_NANO_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "Fun-ASR-Nano")
SENSE_VOICE_SMALL_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "SenseVoiceSmall")
VAD_MODEL_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "fsmn-vad")

os.makedirs(MODEL_CACHE_DIR, exist_ok=True)


def download_model_if_not_exists(repo_id, local_path, model_name):
    """Download a model snapshot into ``local_path`` unless it is already there.

    Args:
        repo_id: Repository id passed to ``snapshot_download``.
        local_path: Target directory for the snapshot.
        model_name: Human-readable name used only in the progress messages.

    A path that exists is treated as "already downloaded", except when it is
    an empty directory: that holds no model files, so skipping the download
    would only postpone the failure to model-loading time.
    """
    is_empty_dir = os.path.isdir(local_path) and not os.listdir(local_path)
    if os.path.exists(local_path) and not is_empty_dir:
        print(f"{model_name} found locally, skipping download.")
        return

    print(f"Downloading {model_name} to {local_path} ...")
    # ONNX exports are excluded from the snapshot via ignore_patterns.
    snapshot_download(repo_id=repo_id, local_dir=local_path, ignore_patterns=["*.onnx"])
    print(f"{model_name} downloaded.")


# Fetch all required models at import time, before the heavier imports below.
download_model_if_not_exists("FunAudioLLM/Fun-ASR-Nano-2512", FUN_ASR_NANO_LOCAL_PATH, "Fun-ASR-Nano")
download_model_if_not_exists("FunAudioLLM/SenseVoiceSmall", SENSE_VOICE_SMALL_LOCAL_PATH, "SenseVoiceSmall")
download_model_if_not_exists("funasr/fsmn-vad", VAD_MODEL_LOCAL_PATH, "VAD Model")

import gradio as gr
import time
import tempfile
import numpy as np
import torch
import torchaudio
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Cache of already constructed models, keyed by pipeline type.
loaded_models = {}


def get_model(pipeline_type):
    """Return the model for ``pipeline_type``, constructing it on first use.

    Args:
        pipeline_type: ``"fun-asr-nano"`` or ``"sensevoice"``.

    Returns:
        The cached or newly built ``AutoModel``, or ``None`` when
        ``pipeline_type`` is not one of the two supported values.
    """
    if pipeline_type in loaded_models:
        return loaded_models[pipeline_type]

    # Keyword arguments that both pipelines pass to AutoModel unchanged.
    shared_kwargs = {
        "vad_model": VAD_MODEL_LOCAL_PATH,
        # NOTE(review): 30000 is presumably milliseconds — confirm against FunASR.
        "vad_kwargs": {"max_single_segment_time": 30000},
        "device": device,
        "disable_update": True,
        "hub": "hf",
    }

    if pipeline_type == "fun-asr-nano":
        model = AutoModel(
            model=FUN_ASR_NANO_LOCAL_PATH,
            trust_remote_code=True,
            remote_code="./Fun-ASR/model.py",
            **shared_kwargs,
        )
    elif pipeline_type == "sensevoice":
        model = AutoModel(
            model=SENSE_VOICE_SMALL_LOCAL_PATH,
            trust_remote_code=False,
            **shared_kwargs,
        )
    else:
        return None

    loaded_models[pipeline_type] = model
    return model


# @spaces.GPU(duration=60)
# disabled for CPU-only space def transcribe(audio_input, pipeline_type, language): if audio_input is None: return "Please upload an audio file or record via microphone.", "" model = get_model(pipeline_type) if model is None: return "Model loading failed.", "" # Handle gradio audio input if isinstance(audio_input, tuple): sr, audio_data = audio_input audio_data = audio_data.astype(np.float32) / np.iinfo(np.int16).max if len(audio_data.shape) > 1: audio_data = audio_data.mean(-1) if sr != 16000: resampler = torchaudio.transforms.Resample(sr, 16000) audio_data = resampler(torch.from_numpy(audio_data).float().unsqueeze(0))[0].numpy() # Save to temp file tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) import soundfile as sf sf.write(tmp.name, audio_data, 16000) audio_path = tmp.name else: audio_path = audio_input start_time = time.time() try: if pipeline_type == "fun-asr-nano": res = model.generate(input=[audio_path], use_itn=True, batch_size=1) else: res = model.generate( input=audio_path, cache={}, language=language or "auto", use_itn=True, batch_size_s=60, merge_vad=True, ) elapsed = time.time() - start_time text = rich_transcription_postprocess(res[0]["text"]) metrics = f"⏱️ {elapsed:.2f}s" if os.path.exists(audio_path): import librosa duration = librosa.get_duration(filename=audio_path) rtf = elapsed / duration if duration > 0 else 0 metrics = f"⏱️ {elapsed:.2f}s | Audio: {duration:.1f}s | RTF: {rtf:.4f}" return text, metrics except Exception as e: return f"Error: {str(e)}", "" finally: if isinstance(audio_input, tuple) and os.path.exists(audio_path): os.unlink(audio_path) description_html = """
LLM-Powered Speech Recognition — 31 Languages, Dialects & Accents
End-to-end ASR trained on tens of millions of hours of data. Supports Chinese (+ dialects), English, Japanese, Korean, French, German, Spanish, and 24 more languages.
⭐ GitHub (Fun-ASR) · 🛠️ FunASR Toolkit · 🎙️ SenseVoice · 🤗 Model Card
| Model | Languages | Architecture | Best For |
|---|---|---|---|
| Fun-ASR-Nano ⭐ | 31 | LLM-based | Multi-language, dialects, highest accuracy |
| SenseVoice | 5 | CTC (non-AR) | Speed + Emotion + Audio events |