Spaces:

cong182
/

firstAI

Sleeping

App Files Files Community

ndc8 committed on Aug 11

Commit

4b4e9ed

1 Parent(s): 4f67c26

Refactor backend service to support Gemma 3n model and update requirements; remove obsolete test script and add new dependency tests

Browse files

Files changed (4) hide show

backend_service.py +115 -48
requirements.txt +9 -1
test_app_structure.py +0 -39
test_deps.py +37 -0

backend_service.py CHANGED Viewed

@@ -7,8 +7,8 @@ import httpx
 # Hugging Face Spaces: Only transformers backend is supported (no vLLM, no llama-cpp/gguf)
 """
-FastAPI Backend AI Service using Gemma-3n-E4B-it-GGUF
-Provides OpenAI-compatible chat completion endpoints powered by unsloth/gemma-3n-E4B-it-GGUF
 """
 import warnings
@@ -45,6 +45,8 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 # Transformers imports (now fallback for non-GGUF models)
 from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, AutoConfig  # type: ignore
 from transformers import BitsAndBytesConfig  # type: ignore
 import torch
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -88,7 +90,7 @@ class ChatMessage(BaseModel):
         return v
 class ChatCompletionRequest(BaseModel):
-    model: str = Field(default_factory=lambda: os.environ.get("AI_MODEL", "unsloth/gemma-3n-E4B-it-GGUF"), description="The model to use for completion")
     messages: List[ChatMessage] = Field(..., description="List of messages in the conversation")
     max_tokens: Optional[int] = Field(default=512, ge=1, le=2048, description="Maximum tokens to generate")
     temperature: Optional[float] = Field(default=0.7, ge=0.0, le=2.0, description="Sampling temperature")
@@ -137,11 +139,11 @@ class CompletionRequest(BaseModel):
 # Model can be configured via environment variable - defaults to Gemma 3n (transformers format)
-current_model = os.environ.get("AI_MODEL", "unsloth/gemma-3n-E4B-it-GGUF")
 vision_model = os.environ.get("VISION_MODEL", "Salesforce/blip-image-captioning-base")
 # Transformers model support
-tokenizer = None
 model = None
 image_text_pipeline = None  # type: ignore
@@ -190,39 +192,58 @@ def has_images(messages: List[ChatMessage]) -> bool:
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     """Application lifespan manager for startup and shutdown events"""
-    global tokenizer, model, image_text_pipeline, current_model
     logger.info("🚀 Starting AI Backend Service (Hugging Face Spaces mode)...")
     try:
         logger.info(f"📥 Loading model with transformers: {current_model}")
-        tokenizer = AutoTokenizer.from_pretrained(current_model)
-        # Hugging Face Spaces: Remove device_map and torch_dtype for CPU compatibility
-        model = AutoModelForCausalLM.from_pretrained(
-            current_model,
-            low_cpu_mem_usage=True,
-            trust_remote_code=True,
-        )
-        logger.info(f"✅ Successfully loaded model and tokenizer: {current_model}")
-        # Load image pipeline for multimodal support
-        try:
-            logger.info(f"🖼️ Initializing image captioning pipeline with model: {vision_model}")
-            image_text_pipeline = pipeline("image-to-text", model=vision_model)
-            logger.info("✅ Image captioning pipeline loaded successfully")
-        except Exception as e:
-            logger.warning(f"⚠️ Could not load image captioning pipeline: {e}")
             image_text_pipeline = None
     except Exception as e:
         logger.error(f"❌ Failed to initialize model: {e}")
         raise RuntimeError(f"Service initialization failed: {e}")
     yield
     logger.info("🔄 Shutting down AI Backend Service...")
-    tokenizer = None
     model = None
     image_text_pipeline = None
 # Initialize FastAPI app
 app = FastAPI(
-    title="AI Backend Service - Mistral Nemo",
-    description="OpenAI-compatible chat completion API powered by unsloth/Mistral-Nemo-Instruct-2407",
     version="1.0.0",
     lifespan=lifespan
 )
@@ -239,7 +260,7 @@ app.add_middleware(
 def ensure_model_ready():
     """Check if transformers model is loaded and ready"""
-    if tokenizer is None or model is None:
         raise HTTPException(status_code=503, detail="Service not ready - no model initialized (transformers)")
 def convert_messages_to_prompt(messages: List[ChatMessage]) -> str:
@@ -367,29 +388,75 @@ def convert_messages_to_gemma_prompt(messages: List[ChatMessage]) -> str:
 def generate_response_transformers(messages: List[ChatMessage], max_tokens: int = 512, temperature: float = 0.7, top_p: float = 0.95) -> str:
     """Generate response using transformers model with chat template."""
     try:
-        # Convert messages to HuggingFace format for chat template
-        chat_messages = []
-        for m in messages:
-            content_str = m.content if isinstance(m.content, str) else extract_text_and_images(m.content)[0]
-            chat_messages.append({"role": m.role, "content": content_str})
-        # Apply chat template and tokenize for Hugging Face Spaces CPU
-        inputs = tokenizer.apply_chat_template(
-            chat_messages,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt",
-        )
-        # Pass input_ids and attention_mask directly (no .to(model.device))
-        outputs = model.generate(
-            input_ids=inputs["input_ids"],
-            attention_mask=inputs.get("attention_mask"),
-            max_new_tokens=max_tokens
-        )
-        # Decode only the newly generated tokens (exclude input)
-        generated_text = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
-        return generated_text.strip()
     except Exception as e:
         logger.error(f"Transformers generation failed: {e}")

 # Hugging Face Spaces: Only transformers backend is supported (no vLLM, no llama-cpp/gguf)
 """
+FastAPI Backend AI Service using Gemma-3n-E4B-it
+Provides OpenAI-compatible chat completion endpoints powered by google/gemma-3n-E4B-it
 """
 import warnings
 # Transformers imports (now fallback for non-GGUF models)
 from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, AutoConfig  # type: ignore
 from transformers import BitsAndBytesConfig  # type: ignore
+# Gemma 3n specific imports
+from transformers import Gemma3nForConditionalGeneration, AutoProcessor  # type: ignore
 import torch
 # Configure logging
 logging.basicConfig(level=logging.INFO)
         return v
 class ChatCompletionRequest(BaseModel):
+    model: str = Field(default_factory=lambda: os.environ.get("AI_MODEL", "google/gemma-3n-E4B-it"), description="The model to use for completion")
     messages: List[ChatMessage] = Field(..., description="List of messages in the conversation")
     max_tokens: Optional[int] = Field(default=512, ge=1, le=2048, description="Maximum tokens to generate")
     temperature: Optional[float] = Field(default=0.7, ge=0.0, le=2.0, description="Sampling temperature")
 # Model can be configured via environment variable - defaults to Gemma 3n (transformers format)
+current_model = os.environ.get("AI_MODEL", "google/gemma-3n-E4B-it")
 vision_model = os.environ.get("VISION_MODEL", "Salesforce/blip-image-captioning-base")
 # Transformers model support
+processor = None  # For Gemma 3n we use AutoProcessor instead of just tokenizer
 model = None
 image_text_pipeline = None  # type: ignore
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     """Application lifespan manager for startup and shutdown events"""
+    global processor, model, image_text_pipeline, current_model
     logger.info("🚀 Starting AI Backend Service (Hugging Face Spaces mode)...")
     try:
         logger.info(f"📥 Loading model with transformers: {current_model}")
+        # For Gemma 3n models, use the specific classes
+        if "gemma-3n" in current_model.lower():
+            processor = AutoProcessor.from_pretrained(current_model)
+            model = Gemma3nForConditionalGeneration.from_pretrained(
+                current_model,
+                low_cpu_mem_usage=True,
+                trust_remote_code=True,
+                torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
+            ).eval()
+        else:
+            # Fallback for other models
+            processor = AutoTokenizer.from_pretrained(current_model)
+            model = AutoModelForCausalLM.from_pretrained(
+                current_model,
+                low_cpu_mem_usage=True,
+                trust_remote_code=True,
+            )
+        logger.info(f"✅ Successfully loaded model and processor: {current_model}")
+        # Gemma 3n is multimodal, so we don't need a separate image pipeline
+        if "gemma-3n" not in current_model.lower():
+            # Load image pipeline for multimodal support (only for non-Gemma-3n models)
+            try:
+                logger.info(f"🖼️ Initializing image captioning pipeline with model: {vision_model}")
+                image_text_pipeline = pipeline("image-to-text", model=vision_model)
+                logger.info("✅ Image captioning pipeline loaded successfully")
+            except Exception as e:
+                logger.warning(f"⚠️ Could not load image captioning pipeline: {e}")
+                image_text_pipeline = None
+        else:
+            logger.info("✅ Gemma 3n has built-in multimodal support")
             image_text_pipeline = None
     except Exception as e:
         logger.error(f"❌ Failed to initialize model: {e}")
         raise RuntimeError(f"Service initialization failed: {e}")
     yield
     logger.info("🔄 Shutting down AI Backend Service...")
+    processor = None
     model = None
     image_text_pipeline = None
 # Initialize FastAPI app
 app = FastAPI(
+    title="AI Backend Service - Gemma 3n",
+    description="OpenAI-compatible chat completion API powered by google/gemma-3n-E4B-it",
     version="1.0.0",
     lifespan=lifespan
 )
 def ensure_model_ready():
     """Check if transformers model is loaded and ready"""
+    if processor is None or model is None:
         raise HTTPException(status_code=503, detail="Service not ready - no model initialized (transformers)")
 def convert_messages_to_prompt(messages: List[ChatMessage]) -> str:
 def generate_response_transformers(messages: List[ChatMessage], max_tokens: int = 512, temperature: float = 0.7, top_p: float = 0.95) -> str:
     """Generate response using transformers model with chat template."""
     try:
+        # Check if we're using Gemma 3n
+        if "gemma-3n" in current_model.lower():
+            # Gemma 3n specific handling
+            # Convert messages to HuggingFace format for chat template
+            chat_messages = []
+            for m in messages:
+                # Gemma 3n supports multimodal, but for now we'll handle text only
+                if isinstance(m.content, str):
+                    content = [{"type": "text", "text": m.content}]
+                else:
+                    # Extract text content for now (image support can be added later)
+                    text_content, _ = extract_text_and_images(m.content)
+                    content = [{"type": "text", "text": text_content}]
+                chat_messages.append({"role": m.role, "content": content})
+            # Apply chat template using processor
+            inputs = processor.apply_chat_template(
+                chat_messages,
+                add_generation_prompt=True,
+                tokenize=True,
+                return_dict=True,
+                return_tensors="pt",
+            )
+            # Generate with Gemma 3n
+            input_len = inputs["input_ids"].shape[-1]
+            with torch.inference_mode():
+                generation = model.generate(
+                    **inputs,
+                    max_new_tokens=max_tokens,
+                    temperature=temperature,
+                    top_p=top_p,
+                    do_sample=temperature > 0,
+                )
+                generation = generation[0][input_len:]
+            # Decode the response
+            generated_text = processor.decode(generation, skip_special_tokens=True)
+            return generated_text.strip()
+        else:
+            # Fallback for other models
+            # Convert messages to HuggingFace format for chat template
+            chat_messages = []
+            for m in messages:
+                content_str = m.content if isinstance(m.content, str) else extract_text_and_images(m.content)[0]
+                chat_messages.append({"role": m.role, "content": content_str})
+            # Apply chat template and tokenize
+            inputs = processor.apply_chat_template(
+                chat_messages,
+                add_generation_prompt=True,
+                tokenize=True,
+                return_dict=True,
+                return_tensors="pt",
+            )
+            # Generate response
+            outputs = model.generate(
+                input_ids=inputs["input_ids"],
+                attention_mask=inputs.get("attention_mask"),
+                max_new_tokens=max_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                do_sample=temperature > 0,
+            )
+            # Decode only the newly generated tokens (exclude input)
+            generated_text = processor.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
+            return generated_text.strip()
     except Exception as e:
         logger.error(f"Transformers generation failed: {e}")

requirements.txt CHANGED Viewed

@@ -3,11 +3,19 @@
 # Hugging Face Spaces requirements (transformers backend only)
 fastapi
 uvicorn
-transformers
 torch
 python-dotenv
 httpx
 requests
 Pillow
 # Optional: gradio for demo UI
 # gradio

 # Hugging Face Spaces requirements (transformers backend only)
 fastapi
 uvicorn
+transformers>=4.53.0
 torch
 python-dotenv
 httpx
 requests
 Pillow
+# Required dependencies for Gemma models
+protobuf
+tiktoken
+sentencepiece>=0.2.0
+tokenizers
+regex
 # Optional: gradio for demo UI
 # gradio

test_app_structure.py DELETED Viewed

@@ -1,39 +0,0 @@
-#!/usr/bin/env python3
-"""
-Test script to verify the FastAPI app can be imported and started
-"""
-import sys
-import os
-# Add current directory to path
-sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-try:
-    # Test imports
-    print("Testing imports...")
-    from backend_service import app
-    print("✅ Successfully imported FastAPI app from backend_service")
-    # Test app type
-    from fastapi import FastAPI
-    if isinstance(app, FastAPI):
-        print("✅ App is a valid FastAPI instance")
-    else:
-        print("❌ App is not a FastAPI instance")
-        sys.exit(1)
-    # Test app attributes
-    print(f"✅ App title: {app.title}")
-    print(f"✅ App version: {app.version}")
-    print("\n🎉 All tests passed! The app is ready for Hugging Face Spaces")
-except ImportError as e:
-    print(f"❌ Import error: {e}")
-    print("This is expected if you don't have all dependencies installed locally.")
-    print("The Hugging Face Space will install them from requirements.txt")
-except Exception as e:
-    print(f"❌ Unexpected error: {e}")
-    sys.exit(1)

test_deps.py ADDED Viewed

	@@ -0,0 +1,37 @@

+#!/usr/bin/env python3
+"""
+Test script to verify the transformers dependencies are working
+"""
+def test_imports():
+    """Test that all required transformers imports work"""
+    try:
+        print("Testing transformers imports...")
+        from transformers import AutoProcessor, Gemma3nForConditionalGeneration
+        print("✅ Gemma3nForConditionalGeneration import successful")
+        from transformers import AutoTokenizer, AutoModelForCausalLM
+        print("✅ Standard transformers imports successful")
+        import torch
+        print("✅ PyTorch import successful")
+        import sentencepiece
+        print("✅ SentencePiece import successful")
+        import tiktoken
+        print("✅ TikToken import successful")
+        import protobuf
+        print("✅ Protobuf import successful")
+        print("\n🎉 All imports successful! Ready for Hugging Face Spaces deployment")
+        return True
+    except ImportError as e:
+        print(f"❌ Import error: {e}")
+        return False
+if __name__ == "__main__":
+    test_imports()