Spaces:

imperiusrex
/

ClassroomCam

Sleeping

App Files Community

imperiusrex committed 12 days ago

Commit

a26cb4b

verified ·

1 Parent(s): 1c679de

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -165

app.py CHANGED Viewed

@@ -6,16 +6,8 @@ import torch
 import spaces
 from ultralytics import YOLO
 from tqdm import tqdm
-from PIL import Image
-import logging
-import time
-from transformers import MobileViTFeatureExtractor, MobileViTForImageClassification
-from sentence_transformers import util
-import gc
-# Set up logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
 # Fix for Ultralytics config write error in Hugging Face environment
 os.environ["YOLO_CONFIG_DIR"] = "/tmp"
@@ -23,37 +15,19 @@ os.environ["YOLO_CONFIG_DIR"] = "/tmp"
 # Use GPU if available
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# Static knowledge base with prompts and explanations
-KNOWLEDGE_BASE = [
-    {
-        "prompt": "A physics equation on a whiteboard",
-        "explanation": "The board likely contains a physics equation, such as E = mc², which is Einstein's mass-energy equivalence formula. It states that energy (E) equals mass (m) times the speed of light (c) squared, a key concept in relativity."
-    },
-    {
-        "prompt": "A geometry diagram on a whiteboard",
-        "explanation": "The board shows a geometry diagram, possibly related to the Pythagorean theorem (a² + b² = c²), which applies to right-angled triangles to calculate side lengths."
-    },
-    {
-        "prompt": "A chemistry formula on a whiteboard",
-        "explanation": "The board displays a chemistry formula, such as a chemical equation or molecular structure, used to describe reactions or compounds."
-    },
-    {
-        "prompt": "A biology diagram on a whiteboard",
-        "explanation": "The board shows a biology diagram, such as a cell structure or photosynthesis process, illustrating biological concepts."
-    }
-]
 @spaces.GPU
 def process_video(video_path):
-    # Load YOLO models
-    try:
-        extract_model = YOLO("best.pt").to(device)
-        detect_model = YOLO("yolov8n.pt").to(device)
-    except Exception as e:
-        logger.error(f"Failed to load YOLO models: {str(e)}")
-        raise RuntimeError(f"Failed to load YOLO models: {str(e)}")
-    os.makedirs("frames", exist_ok=True)
     # Step 1: Extract board-only frames
     cap = cv2.VideoCapture(video_path)
@@ -66,7 +40,7 @@ def process_video(video_path):
         labels = [extract_model.names[int(c)] for c in results[0].boxes.cls.cpu().numpy()]
         if "board" in labels and "person" not in labels:
             frames.append(frame)
-            cv2.imwrite(f"frames/frame_{idx:04d}.jpg", frame)
         idx += 1
     cap.release()
     if not frames:
@@ -100,7 +74,7 @@ def process_video(video_path):
     # Step 3: Median-fuse
     stack = np.stack(aligned, axis=0).astype(np.float32)
     median_board = np.median(stack, axis=0).astype(np.uint8)
-    cv2.imwrite("clean_board.jpg", median_board)
     # Step 4: Mask persons & selective fuse
     sum_img = np.zeros_like(aligned[0], dtype=np.float32)
@@ -119,132 +93,33 @@ def process_video(video_path):
     count[count == 0] = 1
     selective = (sum_img / count[:, :, None]).astype(np.uint8)
-    cv2.imwrite("fused_board_selective.jpg", selective)
     # Step 5: Sharpen
     blur = cv2.GaussianBlur(selective, (5, 5), 0)
     sharp = cv2.addWeighted(selective, 1.5, blur, -0.5, 0)
-    cv2.imwrite("sharpened_board_color.jpg", sharp)
-    # Free YOLO models to save memory
-    extract_model = None
-    detect_model = None
-    gc.collect()
-    if device == "cuda":
-        torch.cuda.empty_cache()
-    return sharp
-def generate_related_content(image, retries=3):
-    # Load MobileViT model
-    model = None
-    feature_extractor = None
-    try:
-        model = MobileViTForImageClassification.from_pretrained(
-            "apple/mobilevit-xxs",
-            torch_dtype=torch.bfloat16,
-            low_cpu_mem_usage=True
-        ).to(device)
-        feature_extractor = MobileViTFeatureExtractor.from_pretrained("apple/mobilevit-xxs")
-        logger.info("Successfully loaded MobileViT model and feature extractor")
-    except Exception as e:
-        logger.error(f"Failed to load MobileViT model: {str(e)}")
-        return (
-            "Error: Failed to load MobileViT model due to insufficient memory. "
-            "Consider upgrading to a paid Space with GPU.\n\n"
-            "For further reading:\n"
-            "- Khan Academy: https://www.khanacademy.org\n"
-            "- Wikipedia: https://en.wikipedia.org/wiki/Education\n"
-            "- MIT OpenCourseWare: https://ocw.mit.edu"
-        )
-    # Convert OpenCV image to PIL
-    try:
-        image_pil = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
-    except Exception as e:
-        logger.error(f"Image conversion failed: {str(e)}")
-        return f"Error converting image: {str(e)}"
-    # Load sentence transformer for prompt embeddings
-    try:
-        from sentence_transformers import SentenceTransformer
-        text_encoder = SentenceTransformer("all-MiniLM-L6-v2")
-        logger.info("Successfully loaded sentence transformer")
-    except Exception as e:
-        logger.error(f"Failed to load sentence transformer: {str(e)}")
-        return (
-            "Error: Failed to load text encoder for prompts.\n\n"
-            "For further reading:\n"
-            "- Khan Academy: https://www.khanacademy.org\n"
-            "- Wikipedia: https://en.wikipedia.org/wiki/Education\n"
-            "- MIT OpenCourseWare: https://ocw.mit.edu"
-        )
-    # Process image and prompts
-    for attempt in range(retries):
-        try:
-            # Prepare image inputs
-            inputs = feature_extractor(images=image_pil, return_tensors="pt").to(device)
-            # Get image features
-            with torch.no_grad():
-                outputs = model(**inputs, output_hidden_states=True)
-                # Use the last hidden state as features (approximating CLIP-like embeddings)
-                image_features = outputs.hidden_states[-1].mean(dim=1)  # Average pooling
-            # Encode prompts
-            prompts = [entry["prompt"] for entry in KNOWLEDGE_BASE]
-            text_features = text_encoder.encode(prompts, convert_to_tensor=True, device=device)
-            # Compute cosine similarities
-            similarities = util.cos_sim(image_features, text_features)[0]
-            best_match_idx = similarities.argmax()
-            best_score = similarities[best_match_idx].item()
-            # Threshold for confidence
-            if best_score < 0.2:
-                logger.warning("No confident match found for image content")
-                explanation = "The board content could not be confidently identified."
-                matched_prompt = "Unknown content"
-            else:
-                matched_prompt = prompts[best_match_idx]
-                explanation = next(entry["explanation"] for entry in KNOWLEDGE_BASE if entry["prompt"] == matched_prompt)
-                logger.info(f"Matched prompt: {matched_prompt} (score: {best_score:.2f})")
-            references = (
-                "For further reading:\n"
-                "- Khan Academy: https://www.khanacademy.org\n"
-                "- Wikipedia: https://en.wikipedia.org/wiki/Education\n"
-                "- MIT OpenCourseWare: https://ocw.mit.edu"
-            )
-            return f"Content: {matched_prompt}\n\nExplanation: {explanation}\n\n{references}"
-        except Exception as e:
-            error_msg = f"MobileViT processing attempt {attempt + 1} failed: {str(e)}"
-            logger.error(error_msg)
-            if attempt == retries - 1:
-                return f"Error generating content with MobileViT: {error_msg}\n\n{references}"
-            time.sleep(2 ** attempt)
-        finally:
-            # Free model to save memory
-            model = None
-            feature_extractor = None
-            gc.collect()
-            if device == "cuda":
-                torch.cuda.empty_cache()
-def process_and_generate(video_path):
-    try:
-        # Process video to get sharpened image
-        sharpened_image = process_video(video_path)
-        # Generate related content
-        generated_content = generate_related_content(sharpened_image)
-        return sharpened_image, generated_content
-    except Exception as e:
-        logger.error(f"Processing failed: {str(e)}")
-        return None, f"Error processing video: {str(e)}"
 demo = gr.Interface(
-    fn=process_and_generate,
     inputs=[
         gr.File(
             label="Upload Classroom Video (.mp4)",
@@ -255,19 +130,18 @@ demo = gr.Interface(
     ],
     outputs=[
         gr.Image(label="Sharpened Final Board"),
-        gr.Textbox(label="Generated Content and Explanation")
     ],
-    title="📹 Classroom Board Cleaner & Content Generator",
     description=(
         "Upload your classroom video (.mp4). \n"
-        "Automatic extraction, alignment, masking, fusion & sharpening. \n"
-        "Generates a summary and detailed explanation of the board content using MobileViT-XXS."
     )
 )
 if __name__ == "__main__":
     if device == "cuda":
-        logger.info(f"Using GPU: {torch.cuda.get_device_name(0)}")
     else:
-        logger.info("Using CPU (GPU not available or not assigned)")
     demo.launch()

 import spaces
 from ultralytics import YOLO
 from tqdm import tqdm
+import easyocr
+from transformers import pipeline
 # Fix for Ultralytics config write error in Hugging Face environment
 os.environ["YOLO_CONFIG_DIR"] = "/tmp"
 # Use GPU if available
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# Load models onto the appropriate device
+extract_model = YOLO("best.pt").to(device)
+detect_model = YOLO("yolov8n.pt").to(device)
+# Initialize EasyOCR reader (English language, GPU if available)
+reader = easyocr.Reader(['en'], gpu=(device == "cuda"))
+# Initialize text generation model (distilgpt2 for lightweight performance)
+generator = pipeline("text-generation", model="distilgpt2", device=0 if device == "cuda" else -1)
 @spaces.GPU
 def process_video(video_path):
+    os.makedirs("/tmp/frames", exist_ok=True)
     # Step 1: Extract board-only frames
     cap = cv2.VideoCapture(video_path)
         labels = [extract_model.names[int(c)] for c in results[0].boxes.cls.cpu().numpy()]
         if "board" in labels and "person" not in labels:
             frames.append(frame)
+            cv2.imwrite(f"/tmp/frames/frame_{idx:04d}.jpg", frame)
         idx += 1
     cap.release()
     if not frames:
     # Step 3: Median-fuse
     stack = np.stack(aligned, axis=0).astype(np.float32)
     median_board = np.median(stack, axis=0).astype(np.uint8)
+    cv2.imwrite("/tmp/clean_board.jpg", median_board)
     # Step 4: Mask persons & selective fuse
     sum_img = np.zeros_like(aligned[0], dtype=np.float32)
     count[count == 0] = 1
     selective = (sum_img / count[:, :, None]).astype(np.uint8)
+    cv2.imwrite("/tmp/fused_board_selective.jpg", selective)
     # Step 5: Sharpen
     blur = cv2.GaussianBlur(selective, (5, 5), 0)
     sharp = cv2.addWeighted(selective, 1.5, blur, -0.5, 0)
+    output_image = "/tmp/sharpened_board_color.jpg"
+    cv2.imwrite(output_image, sharp)
+    # Step 6: Detect text using EasyOCR (not displayed)
+    results = reader.readtext(output_image)
+    detected_text = " ".join([result[1] for result in results]).strip()
+    if not detected_text:
+        return output_image, "No text detected on the board."
+    # Step 7: Generate explanation using distilgpt2
+    prompt = (
+        f"You are an expert teacher. The following content was detected on a classroom board: '{detected_text}'. "
+        "Provide a detailed explanation of the content, including definitions, examples, or step-by-step solutions if applicable. "
+        "If the content is an equation, solve it or explain its significance. If it's a concept, provide context and examples."
+    )
+    explanation = generator(prompt, max_length=200, num_return_sequences=1, truncation=True)[0]['generated_text']
+    return output_image, explanation
+# Update Gradio interface
 demo = gr.Interface(
+    fn=process_video,
     inputs=[
         gr.File(
             label="Upload Classroom Video (.mp4)",
     ],
     outputs=[
         gr.Image(label="Sharpened Final Board"),
+        gr.Textbox(label="Explanation of Board Content")
     ],
+    title="📹 Classroom Board Cleaner & Content Explainer",
     description=(
         "Upload your classroom video (.mp4). \n"
+        "Automatic board extraction, sharpening, and explanation of detected content."
     )
 )
 if __name__ == "__main__":
     if device == "cuda":
+        print(f"[INFO] ✅ Using GPU: {torch.cuda.get_device_name(0)}")
     else:
+        print("[INFO] ⚠️ Using CPU (GPU not available or not assigned)")
     demo.launch()