import os
import spaces

# Hub identifier ("hf"). NOTE(review): nothing in the visible file reads this
# constant -- get_model passes the literal "hf" as `hub=` -- confirm whether an
# external importer relies on it before removing.
REPO_TYPE = "hf"

from huggingface_hub import snapshot_download

# Root folder for the model snapshots; the path is relative, so it resolves
# against the process's current working directory.
MODEL_CACHE_DIR = "./models"
# One sub-folder per model, used both as the download target and as the
# `model=` / `vad_model=` argument when the models are constructed.
FUN_ASR_NANO_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "Fun-ASR-Nano")
SENSE_VOICE_SMALL_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "SenseVoiceSmall")
VAD_MODEL_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "fsmn-vad")

# Create the cache root at import time; exist_ok=True makes reruns harmless.
os.makedirs(MODEL_CACHE_DIR, exist_ok=True)


def download_model_if_not_exists(repo_id, local_path, model_name):
    """Fetch a Hub snapshot into ``local_path`` unless that path already exists.

    Args:
        repo_id: Repository id handed to ``snapshot_download``.
        local_path: Target directory; its mere existence counts as "already
            downloaded" (the contents are not inspected).
        model_name: Human-readable name used only in the progress messages.

    Returns:
        None. Progress is reported on stdout.
    """
    # Guard clause: anything already at local_path short-circuits the download.
    if os.path.exists(local_path):
        print(f"{model_name} found locally, skipping download.")
        return

    print(f"Downloading {model_name} to {local_path} ...")
    snapshot_download(
        repo_id=repo_id,
        local_dir=local_path,
        ignore_patterns=["*.onnx"],  # files matching *.onnx are not fetched
    )
    print(f"{model_name} downloaded.")


# Fetch the three snapshots at import time, ahead of the gradio/torch/funasr
# imports that follow. Each call is skipped when its target path already exists.
download_model_if_not_exists("FunAudioLLM/Fun-ASR-Nano-2512", FUN_ASR_NANO_LOCAL_PATH, "Fun-ASR-Nano")
download_model_if_not_exists("FunAudioLLM/SenseVoiceSmall", SENSE_VOICE_SMALL_LOCAL_PATH, "SenseVoiceSmall")
download_model_if_not_exists("funasr/fsmn-vad", VAD_MODEL_LOCAL_PATH, "VAD Model")

import gradio as gr
import time
import tempfile
import numpy as np
import torch
import torchaudio
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess

# Use the first CUDA device when torch reports one, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Cache of constructed models keyed by pipeline type; populated by get_model.
loaded_models = {}


def get_model(pipeline_type):
    """Return the FunASR model for ``pipeline_type``, building it on first use.

    Constructed models are stored in the module-level ``loaded_models`` dict
    and handed back unchanged on later calls with the same pipeline type.

    Args:
        pipeline_type: ``"fun-asr-nano"`` or ``"sensevoice"``.

    Returns:
        The cached or freshly constructed ``AutoModel``, or ``None`` when
        ``pipeline_type`` is neither of the two recognised names.
    """
    if pipeline_type not in loaded_models:
        # Only the checkpoint and the remote-code settings differ per pipeline.
        if pipeline_type == "fun-asr-nano":
            specific = dict(
                model=FUN_ASR_NANO_LOCAL_PATH,
                trust_remote_code=True,
                remote_code="./Fun-ASR/model.py",
            )
        elif pipeline_type == "sensevoice":
            specific = dict(
                model=SENSE_VOICE_SMALL_LOCAL_PATH,
                trust_remote_code=False,
            )
        else:
            return None

        # VAD front-end, device and hub settings are shared by both pipelines.
        loaded_models[pipeline_type] = AutoModel(
            **specific,
            vad_model=VAD_MODEL_LOCAL_PATH,
            vad_kwargs={"max_single_segment_time": 30000},
            device=device,
            disable_update=True,
            hub="hf",
        )

    return loaded_models[pipeline_type]


def _write_temp_wav(sample_rate, samples):
    """Write raw ``(sample_rate, samples)`` audio to a 16 kHz mono temp WAV.

    Args:
        sample_rate: Sampling rate of ``samples`` in Hz.
        samples: NumPy array of audio samples; when it has more than one
            dimension the last axis is averaged away (presumably channels --
            confirm against the Gradio version in use).

    Returns:
        Path of the newly created ``.wav`` file. The caller owns the file and
        must delete it.

    Raises:
        Whatever ``soundfile.write`` raises; the temp file is removed first.
    """
    import soundfile as sf

    if np.issubdtype(samples.dtype, np.integer):
        # Scale integer PCM by its own dtype's range (32767 for int16) instead
        # of always assuming int16.
        samples = samples.astype(np.float32) / np.iinfo(samples.dtype).max
    else:
        # Float input is assumed to be normalised already; dividing it by the
        # int16 range would make it nearly silent.
        samples = samples.astype(np.float32)

    if samples.ndim > 1:
        samples = samples.mean(-1)
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(sample_rate, 16000)
        samples = resampler(torch.from_numpy(samples).float().unsqueeze(0))[0].numpy()

    # Only reserve a unique name here; closing the handle right away avoids
    # leaking a descriptor and writing to a file that is still open.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    try:
        sf.write(wav_path, samples, 16000)
    except Exception:
        os.unlink(wav_path)  # do not leave an orphaned temp file behind
        raise
    return wav_path


def _format_metrics(audio_path, elapsed):
    """Build the performance line, adding duration and RTF when available."""
    metrics = f"⏱️ {elapsed:.2f}s"
    if not os.path.exists(audio_path):
        return metrics

    # The duration is a nice-to-have: if it cannot be measured, fall back to
    # the time-only line rather than discarding a successful transcription.
    try:
        import librosa

        try:
            # `path=` is the current keyword (librosa >= 0.10).
            duration = librosa.get_duration(path=audio_path)
        except TypeError:
            # Older librosa releases only know the `filename=` keyword.
            duration = librosa.get_duration(filename=audio_path)
    except Exception:
        return metrics

    rtf = elapsed / duration if duration > 0 else 0
    return f"⏱️ {elapsed:.2f}s | Audio: {duration:.1f}s | RTF: {rtf:.4f}"


# @spaces.GPU(duration=60)  # disabled for CPU-only space
def transcribe(audio_input, pipeline_type, language):
    """Transcribe one audio input with the selected pipeline.

    Args:
        audio_input: ``None``, a file path, or a ``(sample_rate, samples)``
            tuple. Tuples are converted to a temporary 16 kHz mono WAV that is
            deleted before returning.
        pipeline_type: ``"fun-asr-nano"`` or anything else ``get_model``
            accepts; every other accepted value takes the SenseVoice-style
            ``generate`` call.
        language: Language code for the non-"fun-asr-nano" branch; a falsy
            value is replaced by ``"auto"``.

    Returns:
        ``(text, metrics)``. On a missing input, an unknown pipeline, or an
        exception during recognition, ``text`` carries the message and
        ``metrics`` is ``""``.
    """
    if audio_input is None:
        return "Please upload an audio file or record via microphone.", ""

    model = get_model(pipeline_type)
    if model is None:
        return "Model loading failed.", ""

    # Raw (sample_rate, samples) tuples are materialised as a temp WAV file.
    is_temp_file = isinstance(audio_input, tuple)
    if is_temp_file:
        audio_path = _write_temp_wav(*audio_input)
    else:
        audio_path = audio_input

    start_time = time.time()

    try:
        if pipeline_type == "fun-asr-nano":
            res = model.generate(input=[audio_path], use_itn=True, batch_size=1)
        else:
            res = model.generate(
                input=audio_path, cache={}, language=language or "auto",
                use_itn=True, batch_size_s=60, merge_vad=True,
            )

        elapsed = time.time() - start_time
        text = rich_transcription_postprocess(res[0]["text"])
        return text, _format_metrics(audio_path, elapsed)

    except Exception as e:
        # UI boundary: show the failure in the result box instead of raising.
        return f"Error: {e}", ""
    finally:
        # Only files created here are removed; user-supplied paths are kept.
        if is_temp_file and os.path.exists(audio_path):
            os.unlink(audio_path)


# Header banner (title, tagline and project links); rendered verbatim through
# gr.HTML at the top of the page built by launch().
description_html = """
<div style="text-align: center; max-width: 850px; margin: 0 auto;">
    <h1 style="font-size: 2.2em; margin-bottom: 0.1em;">🚀 Fun-ASR-Nano</h1>
    <p style="font-size: 1.3em; color: #444; margin-bottom: 0.3em;">LLM-Powered Speech Recognition — 31 Languages, Dialects & Accents</p>
    <p style="font-size: 1em; color: #666;">
        End-to-end ASR trained on <strong>tens of millions of hours</strong> of data.
        Supports Chinese (+ dialects), English, Japanese, Korean, French, German, Spanish, and 24 more languages.
    </p>
    <p style="font-size: 0.9em; margin-top: 0.8em;">
        <a href="https://github.com/FunAudioLLM/Fun-ASR" target="_blank">⭐ GitHub (Fun-ASR)</a> ·
        <a href="https://github.com/modelscope/FunASR" target="_blank">🛠️ FunASR Toolkit</a> ·
        <a href="https://github.com/FunAudioLLM/SenseVoice" target="_blank">🎙️ SenseVoice</a> ·
        <a href="https://huggingface.co/FunAudioLLM/Fun-ASR-Nano-2512" target="_blank">🤗 Model Card</a>
    </p>
</div>
"""

# Two-row comparison table (Fun-ASR-Nano vs. SenseVoice); rendered verbatim
# through gr.HTML directly below the header banner in launch().
comparison_html = """
<div style="background: linear-gradient(135deg, #f0f9ff 0%, #f5f3ff 100%); border-radius: 10px; padding: 16px; margin: 10px 0;">
    <table style="width: 100%; border-collapse: collapse; font-size: 0.9em;">
        <tr style="border-bottom: 2px solid #ddd;">
            <th style="padding: 8px; text-align: left;">Model</th>
            <th style="padding: 8px; text-align: center;">Languages</th>
            <th style="padding: 8px; text-align: center;">Architecture</th>
            <th style="padding: 8px; text-align: center;">Best For</th>
        </tr>
        <tr style="background: #e8f4fd;">
            <td style="padding: 8px;"><strong>Fun-ASR-Nano</strong> ⭐</td>
            <td style="padding: 8px; text-align: center;">31</td>
            <td style="padding: 8px; text-align: center;">LLM-based</td>
            <td style="padding: 8px; text-align: center;">Multi-language, dialects, highest accuracy</td>
        </tr>
        <tr>
            <td style="padding: 8px;">SenseVoice</td>
            <td style="padding: 8px; text-align: center;">5</td>
            <td style="padding: 8px; text-align: center;">CTC (non-AR)</td>
            <td style="padding: 8px; text-align: center;">Speed + Emotion + Audio events</td>
        </tr>
    </table>
</div>
"""


def launch():
    """Assemble the Gradio page and start serving it.

    Layout, top to bottom: header banner, model comparison table, a two-column
    row (audio input, model/language pickers and the button on the left;
    transcription and performance boxes on the right), then the help text.
    The button is wired to ``transcribe``. Returns nothing.
    """
    model_choices = ["fun-asr-nano", "sensevoice"]
    language_choices = ["auto", "zh", "en", "yue", "ja", "ko"]
    help_markdown = """
### Supported Languages (Fun-ASR-Nano)
Chinese (Mandarin, Cantonese, Sichuan, Shanghai, Minnan, Wenzhou, Hakka, Gan, and more),
English, Japanese, Korean, French, German, Spanish, Italian, Portuguese, Russian, Arabic, Hindi,
Thai, Vietnamese, Indonesian, Malay, Turkish, Polish, Dutch, Swedish, Hebrew, Greek, Czech, Romanian, Hungarian, Finnish, Danish, Norwegian, Ukrainian.

### Tips
- **Fun-ASR-Nano**: Best for multi-language & Chinese dialects. Outputs punctuation natively.
- **SenseVoice**: Ultra-fast (7x faster than Whisper-small), also detects emotions & audio events.
- For long audio (>5min), consider using [FunASR](https://github.com/modelscope/FunASR) locally with GPU.
        """

    with gr.Blocks(theme=gr.themes.Soft(), title="Fun-ASR-Nano - 31 Language ASR") as app:
        gr.HTML(description_html)
        gr.HTML(comparison_html)

        with gr.Row():
            # Left column: everything the user provides.
            with gr.Column(scale=1):
                audio_box = gr.Audio(
                    label="Upload audio or record via microphone",
                    sources=["upload", "microphone"],
                    type="filepath",
                )
                with gr.Row():
                    model_picker = gr.Dropdown(
                        choices=model_choices,
                        value="fun-asr-nano",
                        label="Model",
                    )
                    language_picker = gr.Dropdown(
                        choices=language_choices,
                        value="auto",
                        label="Language (SenseVoice only)",
                        interactive=True,
                    )
                run_button = gr.Button("🎯 Transcribe", variant="primary", size="lg")

            # Right column: what transcribe() hands back.
            with gr.Column(scale=1):
                result_box = gr.Textbox(
                    label="Transcription Result",
                    lines=10,
                    show_copy_button=True,
                )
                metrics_box = gr.Textbox(label="Performance", lines=1)

        run_button.click(
            transcribe,
            inputs=[audio_box, model_picker, language_picker],
            outputs=[result_box, metrics_box],
        )

        gr.Markdown(help_markdown)

    app.launch()


# Start the web UI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    launch()