import spaces import gradio as gr import torch import time from transformers import AutoModelForCausalLM, AutoTokenizer from peft import PeftModel # ============================================================ # CONFIG # ============================================================ MODEL_NAME = "rikunarita/Qwen3-4B-Thinking-2507-Genius-Coder" LORA_MODEL_NAME = "rahul7star/Qwen3-4B-Thinking-2509-Genius-Coder-AI" MAX_INPUT_TOKENS = 4096 MAX_NEW_TOKENS = 4096 DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # ============================================================ # LOAD TOKENIZER # ============================================================ print("Loading tokenizer...") tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) # ============================================================ # LOAD BASE MODEL # ============================================================ print("Loading base model...") base_model = AutoModelForCausalLM.from_pretrained( MODEL_NAME, torch_dtype="auto", device_map="auto", ) base_model.eval() print("Base model loaded successfully") # ============================================================ # OPTIONAL LORA LOAD # ============================================================ model = base_model lora_loaded = False try: print("Attempting to load LoRA adapter...") model = PeftModel.from_pretrained( base_model, LORA_MODEL_NAME, torch_dtype="auto", ) model.eval() lora_loaded = True print("✅ LoRA loaded successfully") except Exception as e: print("⚠️ LoRA not loaded:", e) model = base_model lora_loaded = False # ============================================================ # SYSTEM PROMPT # ============================================================ SYSTEM_PROMPT = """You are a professional AI Coding Assistant. Your responses must be: - Clear and concise - Well-structured with headings and bullet points - Technically accurate - Written in a formal, professional tone - Focused on best practices and production-quality code """ # ============================================================ # GENERATION FUNCTION # ============================================================ @spaces.GPU() def generate_answer(question, max_tokens, use_lora): print("\n================ GENERATE ANSWER START ================") if not question or not question.strip(): return "Please enter a valid question." try: start_time = time.time() active_model = model if (use_lora and lora_loaded) else base_model messages = [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": question.strip()}, ] prompt = tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True, ) inputs = tokenizer( prompt, return_tensors="pt", truncation=True, max_length=MAX_INPUT_TOKENS, ).to(DEVICE) input_token_count = inputs.input_ids.shape[-1] print(f"Input tokens: {input_token_count}") max_tokens = min(int(max_tokens), MAX_NEW_TOKENS) print(f"Final max_new_tokens: {max_tokens}") print("🚀 Starting generation...") with torch.no_grad(): output = active_model.generate( **inputs, max_new_tokens=max_tokens, do_sample=False, repetition_penalty=1.05, use_cache=True, pad_token_id=tokenizer.eos_token_id, eos_token_id=tokenizer.eos_token_id, ) print("✅ Generation finished") generated_tokens = output[0][input_token_count:] response = tokenizer.decode( generated_tokens, skip_special_tokens=True, ) print(response) print(f"Generated tokens: {generated_tokens.shape[-1]}") print(f"⏱ Total time: {time.time() - start_time:.2f} sec") print("================ GENERATE ANSWER END ==================\n") return response.strip() or "No output generated." except Exception as e: import traceback traceback.print_exc() if torch.cuda.is_available(): torch.cuda.empty_cache() return f"Error occurred: {str(e)}" # ============================================================ # UI # ============================================================ with gr.Blocks() as demo: gr.Markdown( """ # 🤖 Professional Coding Assistant **Qwen3-4B + Optional LoRA** - ⚡ Stable GPU inference - 🧠 Deterministic responses - 💻 Production-quality code """ ) question = gr.Textbox( label="Your Question", placeholder="Explain Quick Sort with complexity and a Python example", value="write a python code using pytorch for a simple neural network demo", lines=4, ) answer = gr.Markdown(label="AI Response", elem_id="answer_box") max_tokens = gr.Slider( 64, 4096, value=1024, step=32, label="Max New Tokens" ) use_lora = gr.Checkbox( value=lora_loaded, label="Enable LoRA Adapter" ) with gr.Row(): submit = gr.Button("Generate Answer", variant="primary") copy_btn = gr.Button("📋 Copy Response") clear = gr.Button("Clear") submit.click( fn=generate_answer, inputs=[question, max_tokens, use_lora], outputs=answer, ) clear.click( fn=lambda: ("", ""), outputs=[question, answer], ) copy_btn.click( fn=None, js=""" () => { const el = document.querySelector('#answer_box'); navigator.clipboard.writeText(el.innerText); } """, ) demo.launch( theme=gr.themes.Soft(), css=""" .gradio-container { max-width: 900px !important; margin: auto; } textarea { font-size: 14px !important; } """, )