import os
import spaces

REPO_TYPE = "hf"

from huggingface_hub import snapshot_download

# Local directory layout for the model snapshots used by this app.
MODEL_CACHE_DIR = "./models"
FUN_ASR_NANO_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "Fun-ASR-Nano")
SENSE_VOICE_SMALL_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "SenseVoiceSmall")
VAD_MODEL_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "fsmn-vad")

os.makedirs(MODEL_CACHE_DIR, exist_ok=True)


def download_model_if_not_exists(repo_id, local_path, model_name):
    """Download a Hub snapshot into ``local_path`` unless that path exists.

    Args:
        repo_id: Repository id passed to ``snapshot_download``.
        local_path: Destination directory; its mere existence skips the
            download (the contents are not inspected).
        model_name: Human-readable name used only in the progress messages.
    """
    if os.path.exists(local_path):
        print(f"{model_name} found locally, skipping download.")
        return

    print(f"Downloading {model_name} to {local_path} ...")
    # ONNX exports are excluded from the snapshot.
    snapshot_download(
        repo_id=repo_id,
        local_dir=local_path,
        ignore_patterns=["*.onnx"],
    )
    print(f"{model_name} downloaded.")


# Fetch all three snapshots at import time, before the heavier imports below.
download_model_if_not_exists(
    "FunAudioLLM/Fun-ASR-Nano-2512",
    FUN_ASR_NANO_LOCAL_PATH,
    "Fun-ASR-Nano",
)
download_model_if_not_exists(
    "FunAudioLLM/SenseVoiceSmall",
    SENSE_VOICE_SMALL_LOCAL_PATH,
    "SenseVoiceSmall",
)
download_model_if_not_exists(
    "funasr/fsmn-vad",
    VAD_MODEL_LOCAL_PATH,
    "VAD Model",
)

import gradio as gr
import time
import tempfile
import numpy as np
import torch
import torchaudio
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess

# Run on the first CUDA device when one is available, otherwise on CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Cache of already-constructed models, keyed by pipeline type.
loaded_models = {}


def get_model(pipeline_type):
    """Return the model for ``pipeline_type``, constructing it on first use.

    Args:
        pipeline_type: ``"fun-asr-nano"`` or ``"sensevoice"``.

    Returns:
        The cached or newly built ``AutoModel``; ``None`` for any other
        pipeline type (nothing is cached in that case).
    """
    if pipeline_type not in loaded_models:
        if pipeline_type == "fun-asr-nano":
            specific_kwargs = dict(
                model=FUN_ASR_NANO_LOCAL_PATH,
                trust_remote_code=True,
                remote_code="./Fun-ASR/model.py",
            )
        elif pipeline_type == "sensevoice":
            specific_kwargs = dict(
                model=SENSE_VOICE_SMALL_LOCAL_PATH,
                trust_remote_code=False,
            )
        else:
            return None

        # Settings that both pipelines share: same VAD front-end, device
        # and hub configuration.
        shared_kwargs = dict(
            vad_model=VAD_MODEL_LOCAL_PATH,
            vad_kwargs={"max_single_segment_time": 30000},
            device=device,
            disable_update=True,
            hub="hf",
        )
        loaded_models[pipeline_type] = AutoModel(**specific_kwargs, **shared_kwargs)

    return loaded_models[pipeline_type]


# @spaces.GPU(duration=60)
# disabled for CPU-only space
def _write_temp_wav(sample_rate, samples):
    """Write raw samples to a temporary 16 kHz mono WAV file and return its path.

    Integer samples are scaled by the maximum of their own dtype (so int16
    input behaves exactly as before); floating-point samples are left
    unscaled. Multi-dimensional input is averaged over its last axis, and
    audio not already at 16 kHz is resampled. The caller owns the returned
    file and must delete it.
    """
    import soundfile as sf

    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.integer):
        scale = np.iinfo(samples.dtype).max
        samples = samples.astype(np.float32) / scale
    else:
        samples = samples.astype(np.float32)

    if samples.ndim > 1:
        samples = samples.mean(-1)

    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(sample_rate, 16000)
        samples = resampler(torch.from_numpy(samples).float().unsqueeze(0))[0].numpy()

    # Only the unique name is needed; close the handle right away so no file
    # descriptor is leaked while soundfile writes to the path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    try:
        sf.write(wav_path, samples, 16000)
    except Exception:
        # Do not leave an orphaned temp file behind when the write fails.
        os.unlink(wav_path)
        raise
    return wav_path


def _format_metrics(elapsed, audio_path):
    """Build the performance line; audio duration and RTF are best-effort."""
    basic = f"⏱️ {elapsed:.2f}s"
    if not os.path.exists(audio_path):
        return basic

    try:
        import librosa

        try:
            duration = librosa.get_duration(path=audio_path)
        except TypeError:
            # Older librosa releases only know the ``filename`` keyword.
            duration = librosa.get_duration(filename=audio_path)
    except Exception:
        # The duration is optional extra information: failing to compute it
        # must not discard an otherwise successful transcription.
        return basic

    rtf = elapsed / duration if duration > 0 else 0
    return f"⏱️ {elapsed:.2f}s | Audio: {duration:.1f}s | RTF: {rtf:.4f}"


def transcribe(audio_input, pipeline_type, language):
    """Transcribe an audio clip with the selected pipeline.

    Args:
        audio_input: Path to an audio file, a ``(sample_rate, samples)``
            tuple of raw audio, or ``None`` when nothing was provided.
        pipeline_type: Model key understood by ``get_model``
            (``"fun-asr-nano"``; anything else takes the SenseVoice-style
            generate call).
        language: Language code for the non-"fun-asr-nano" branch; a falsy
            value falls back to ``"auto"``.

    Returns:
        A ``(text, metrics)`` tuple of strings. On failure ``text`` holds a
        user-facing message and ``metrics`` is empty.
    """
    if audio_input is None:
        return "Please upload an audio file or record via microphone.", ""

    model = get_model(pipeline_type)
    if model is None:
        return "Model loading failed.", ""

    is_raw_audio = isinstance(audio_input, tuple)
    audio_path = None
    try:
        if is_raw_audio:
            # Raw samples are materialised as a temp WAV file that is removed
            # again in the ``finally`` clause below.
            sample_rate, samples = audio_input
            audio_path = _write_temp_wav(sample_rate, samples)
        else:
            audio_path = audio_input

        start_time = time.time()
        if pipeline_type == "fun-asr-nano":
            res = model.generate(input=[audio_path], use_itn=True, batch_size=1)
        else:
            res = model.generate(
                input=audio_path,
                cache={},
                language=language or "auto",
                use_itn=True,
                batch_size_s=60,
                merge_vad=True,
            )
        elapsed = time.time() - start_time

        text = rich_transcription_postprocess(res[0]["text"])
        return text, _format_metrics(elapsed, audio_path)
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error: {str(e)}", ""
    finally:
        if is_raw_audio and audio_path is not None and os.path.exists(audio_path):
            os.unlink(audio_path)


description_html = """

🚀 Fun-ASR-Nano

LLM-Powered Speech Recognition — 31 Languages, Dialects & Accents

End-to-end ASR trained on tens of millions of hours of data. Supports Chinese (+ dialects), English, Japanese, Korean, French, German, Spanish, and 24 more languages.

⭐ GitHub (Fun-ASR) · 🛠️ FunASR Toolkit · 🎙️ SenseVoice · 🤗 Model Card

"""

comparison_html = """
Model Languages Architecture Best For
Fun-ASR-Nano 31 LLM-based Multi-language, dialects, highest accuracy
SenseVoice 5 CTC (non-AR) Speed + Emotion + Audio events
"""


def launch():
    """Build the Gradio Blocks interface and start serving it."""
    with gr.Blocks(theme=gr.themes.Soft(), title="Fun-ASR-Nano - 31 Language ASR") as demo:
        gr.HTML(description_html)
        gr.HTML(comparison_html)

        with gr.Row():
            # Left column: audio input, model/language selection, action button.
            with gr.Column(scale=1):
                audio_input = gr.Audio(
                    label="Upload audio or record via microphone",
                    sources=["upload", "microphone"],
                    type="filepath",
                )
                with gr.Row():
                    pipeline_type = gr.Dropdown(
                        choices=["fun-asr-nano", "sensevoice"],
                        value="fun-asr-nano",
                        label="Model",
                    )
                    language = gr.Dropdown(
                        choices=["auto", "zh", "en", "yue", "ja", "ko"],
                        value="auto",
                        label="Language (SenseVoice only)",
                        interactive=True,
                    )
                btn = gr.Button("🎯 Transcribe", variant="primary", size="lg")

            # Right column: transcription text and timing information.
            with gr.Column(scale=1):
                output_text = gr.Textbox(
                    label="Transcription Result",
                    lines=10,
                    show_copy_button=True,
                )
                metrics_text = gr.Textbox(label="Performance", lines=1)

        btn.click(
            transcribe,
            inputs=[audio_input, pipeline_type, language],
            outputs=[output_text, metrics_text],
        )

        # Headings and list items each need their own line to render as Markdown.
        gr.Markdown("""
        ### Supported Languages (Fun-ASR-Nano)
        Chinese (Mandarin, Cantonese, Sichuan, Shanghai, Minnan, Wenzhou, Hakka, Gan, and more), English, Japanese, Korean, French, German, Spanish, Italian, Portuguese, Russian, Arabic, Hindi, Thai, Vietnamese, Indonesian, Malay, Turkish, Polish, Dutch, Swedish, Hebrew, Greek, Czech, Romanian, Hungarian, Finnish, Danish, Norwegian, Ukrainian.

        ### Tips
        - **Fun-ASR-Nano**: Best for multi-language & Chinese dialects. Outputs punctuation natively.
        - **SenseVoice**: Ultra-fast (7x faster than Whisper-small), also detects emotions & audio events.
        - For long audio (>5min), consider using [FunASR](https://github.com/modelscope/FunASR) locally with GPU.
        """)

    demo.launch()


if __name__ == "__main__":
    launch()