Skip to content

Commit a91368d

Browse files
committed
feat: add Moondream VLM backend with k8s deployment manifests
- Add Moondream2 backend (detect, describe, query) using HuggingFace Transformers
- Add /describe and /query API endpoints
- Add k8s manifests: deployment, service, PVC, namespace
- PersistentVolumeClaim for HuggingFace model cache (3Gi)
- Startup probe with 5-min tolerance for cold model download
- Memory bumped to 1Gi request / 4Gi limit for VLM inference
- Update Dockerfile with optional INSTALL_MOONDREAM build arg
- Update CI workflow to build with moondream dependencies
- Accept return_image param on /describe and /query for lightNVR compat
1 parent 2be8ea5 commit a91368d

14 files changed

Lines changed: 549 additions & 22 deletions

File tree

.github/workflows/docker-publish.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,5 @@ jobs:
4545
push: true
4646
tags: ${{ steps.meta.outputs.tags }}
4747
labels: ${{ steps.meta.outputs.labels }}
48+
build-args: |
49+
INSTALL_MOONDREAM=1

Dockerfile

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,16 @@ RUN python -m pip install --upgrade pip \
3636
ARG INSTALL_TENSORFLOW=0
3737
RUN if [ "$INSTALL_TENSORFLOW" = "1" ]; then python -m pip install tensorflow; fi
3838

39+
# Optional: install moondream (torch + transformers) for VLM backend
40+
# Usage: docker build --build-arg INSTALL_MOONDREAM=1 ...
41+
ARG INSTALL_MOONDREAM=0
42+
RUN if [ "$INSTALL_MOONDREAM" = "1" ]; then \
43+
python -m pip install \
44+
"transformers>=4.51.1,<5.0" \
45+
"torch>=2.7.0" \
46+
"accelerate>=1.10.0"; \
47+
fi
48+
3949
COPY . .
4050

4151
# Optionally bake the TFLite model (only useful if tensorflow is installed)

Pipfile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ onnxruntime = "==1.23.2"
1717
opencv-python = "*"
1818
scipy = "*"
1919
shapely = "*"
20+
transformers = ">=4.51.1,<5.0"
21+
torch = ">=2.7.0"
22+
accelerate = ">=1.10.0"
2023

2124
[dev-packages]
2225
ultralytics = "*"

api/v1/endpoints/detection.py

Lines changed: 133 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import time
77
import logging
88

9-
from models.detection import DetectionResponse, DetectionResult, ImageResponse
9+
from models.detection import DetectionResponse, DetectionResult, ImageResponse, DescribeResponse, QueryResponse
1010
from models.zone import ZoneConfiguration
1111
from backends.factory import get_backend, BACKEND_REGISTRY
1212
from utils.image import validate_image, preprocess_image, image_to_bytes
@@ -108,6 +108,11 @@ async def detect_objects(
108108
for det in detections:
109109
class_counts[det.label] = class_counts.get(det.label, 0) + 1
110110
logger.info(f"Detected objects: {dict(class_counts)}")
111+
for det in detections:
112+
b = det.bounding_box
113+
logger.info(f" → {det.label} ({det.confidence:.2f}): [{b.x_min:.3f},{b.y_min:.3f},{b.x_max:.3f},{b.y_max:.3f}]")
114+
else:
115+
logger.info("No objects detected")
111116
except Exception as e:
112117
logger.error(f"Detection failed: {str(e)}", exc_info=True)
113118
raise HTTPException(status_code=500, detail=f"Detection error: {str(e)}")
@@ -183,6 +188,133 @@ async def detect_objects(
183188
return response
184189

185190

191+
@router.post("/describe", response_model=DescribeResponse)
async def describe_image(
    file: UploadFile = File(...),
    backend: str = Query("moondream", description="Backend to use for description"),
    length: str = Query("normal", description="Caption length: 'short', 'normal', or 'long'"),
    return_image: bool = Query(False, description="Ignored for describe endpoint (accepted for compatibility)"),
):
    """
    Generate a natural language description of an uploaded image.

    - **file**: Image file to describe
    - **backend**: Backend to use (must support description, e.g. moondream)
    - **length**: Caption length - 'short', 'normal', or 'long'
    """
    logger.info(f"Describe request: backend={backend}, length={length}, filename={file.filename}")

    # Fail fast on an unsupported length value rather than passing it through
    # to the backend, where a bad value would surface as an opaque 500.
    if length not in ("short", "normal", "long"):
        raise HTTPException(
            status_code=400,
            detail=f"Invalid length '{length}'. Must be one of: short, normal, long",
        )

    if backend not in settings.AVAILABLE_BACKENDS:
        raise HTTPException(
            status_code=400,
            detail=f"Backend '{backend}' not available. Available backends: {settings.AVAILABLE_BACKENDS}"
        )

    try:
        detector = get_backend(backend)
    except Exception as e:
        logger.error(f"Failed to initialize backend {backend}: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Backend initialization error: {str(e)}")

    # Read and validate image
    try:
        contents = await file.read()
        image = Image.open(io.BytesIO(contents))
        validate_image(image, file.filename)
        processed_image = preprocess_image(image)
    except Exception as e:
        logger.error(f"Image validation/processing failed: {str(e)}")
        raise HTTPException(status_code=400, detail=f"Invalid image: {str(e)}")

    start_time = time.time()
    try:
        description = detector.describe(processed_image, length=length)
        process_time = time.time() - start_time
        logger.info(f"Describe completed in {process_time*1000:.1f}ms")
        logger.info(f"Description result: {description}")
    except NotImplementedError:
        # Backends that don't implement describe() raise NotImplementedError;
        # surface that as a client error, not a server fault.
        raise HTTPException(
            status_code=400,
            detail=f"Backend '{backend}' does not support image description"
        )
    except Exception as e:
        logger.error(f"Describe failed: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Description error: {str(e)}")

    return DescribeResponse(
        backend=backend,
        filename=file.filename,
        description=description,
        process_time_ms=int(process_time * 1000),
        image_width=image.width,
        image_height=image.height,
    )
252+
253+
254+
@router.post("/query", response_model=QueryResponse)
async def query_image(
    file: UploadFile = File(...),
    question: str = Query(..., description="Question to ask about the image"),
    backend: str = Query("moondream", description="Backend to use for visual Q&A"),
    return_image: bool = Query(False, description="Ignored for query endpoint (accepted for compatibility)"),
):
    """
    Ask a natural language question about an uploaded image.

    - **file**: Image file to query
    - **question**: Natural language question about the image
    - **backend**: Backend to use (must support visual Q&A, e.g. moondream)
    """
    logger.info(f"Query request: backend={backend}, question={question!r}, filename={file.filename}")

    # Unknown backend name is a client error.
    if backend not in settings.AVAILABLE_BACKENDS:
        raise HTTPException(
            status_code=400,
            detail=f"Backend '{backend}' not available. Available backends: {settings.AVAILABLE_BACKENDS}"
        )

    try:
        vlm = get_backend(backend)
    except Exception as e:
        logger.error(f"Failed to initialize backend {backend}: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Backend initialization error: {str(e)}")

    # Decode and sanity-check the upload before spending time on inference.
    try:
        raw = await file.read()
        image = Image.open(io.BytesIO(raw))
        validate_image(image, file.filename)
        prepared = preprocess_image(image)
    except Exception as e:
        logger.error(f"Image validation/processing failed: {str(e)}")
        raise HTTPException(status_code=400, detail=f"Invalid image: {str(e)}")

    t0 = time.time()
    try:
        answer = vlm.query(prepared, question=question)
        elapsed = time.time() - t0
        logger.info(f"Query completed in {elapsed*1000:.1f}ms")
        logger.info(f"Query answer: {answer}")
    except NotImplementedError:
        # Backend exists but has no visual Q&A support — client error.
        raise HTTPException(
            status_code=400,
            detail=f"Backend '{backend}' does not support visual Q&A"
        )
    except Exception as e:
        logger.error(f"Query failed: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Query error: {str(e)}")

    return QueryResponse(
        backend=backend,
        filename=file.filename,
        question=question,
        answer=answer,
        process_time_ms=int(elapsed * 1000),
        image_width=image.width,
        image_height=image.height,
    )
316+
317+
186318
@router.get("/backends", response_model=Dict[str, Any])
187319
async def list_backends():
188320
"""

backends/base.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,44 @@ def get_model_info(self) -> Dict[str, Any]:
4141
def draw_detections(self, image: Image.Image, detections: List[DetectionResult]) -> Image.Image:
    """
    Draw bounding boxes and labels on the image.

    Args:
        image: PIL Image to draw on
        detections: List of DetectionResult objects

    Returns:
        PIL Image with bounding boxes and labels drawn

    NOTE(review): this base implementation is an empty placeholder (`pass`
    returns None despite the declared return type) — presumably concrete
    backends override it; confirm before relying on the base behavior.
    """
    pass
53+
54+
def describe(self, image: Image.Image, length: str = "normal") -> str:
    """
    Produce a natural-language caption for the given image.

    Optional capability hook: backends with captioning support (e.g. a VLM
    backend) override this; the base class always raises.

    Args:
        image: PIL Image to describe
        length: Caption length hint, e.g. 'short' or 'normal'

    Returns:
        Natural language description string

    Raises:
        NotImplementedError: If this backend does not support image description
    """
    raise NotImplementedError(f"{type(self).__name__} does not support image description")
69+
70+
def query(self, image: Image.Image, question: str) -> str:
    """
    Answer a free-form question about the given image.

    Optional capability hook: backends with visual Q&A support override
    this; the base class always raises.

    Args:
        image: PIL Image to query
        question: Natural language question about the image

    Returns:
        Answer string

    Raises:
        NotImplementedError: If this backend does not support visual Q&A
    """
    raise NotImplementedError(f"{type(self).__name__} does not support visual Q&A")

backends/factory.py

Lines changed: 37 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,20 @@
2828
except ImportError:
2929
EDGETPU_AVAILABLE = False
3030

31+
try:
32+
from backends.moondream.backend import MoondreamBackend
33+
MOONDREAM_AVAILABLE = True
34+
except ImportError:
35+
MOONDREAM_AVAILABLE = False
36+
3137

3238
# Registry of available backends
3339
BACKEND_REGISTRY: Dict[str, Type[DetectionBackend]] = {}
3440

41+
# Instance cache — avoids reloading heavy models (e.g. Moondream 3.85 GB) on
42+
# every request.
43+
_backend_instances: Dict[str, DetectionBackend] = {}
44+
3545
if TFLITE_AVAILABLE:
3646
BACKEND_REGISTRY["tflite"] = TFLiteBackend
3747

@@ -44,42 +54,48 @@
4454
if EDGETPU_AVAILABLE:
4555
BACKEND_REGISTRY["edgetpu"] = EdgeTPUBackend
4656

57+
if MOONDREAM_AVAILABLE:
58+
BACKEND_REGISTRY["moondream"] = MoondreamBackend
59+
4760

4861
def get_backend(backend_name: str) -> DetectionBackend:
4962
"""
50-
Get an instance of the specified detection backend.
51-
63+
Get a (cached) instance of the specified detection backend.
64+
5265
Args:
5366
backend_name: Name of the backend to instantiate
54-
67+
5568
Returns:
5669
Instance of the requested backend
57-
70+
5871
Raises:
5972
ValueError: If the backend is not available
6073
"""
74+
if backend_name in _backend_instances:
75+
return _backend_instances[backend_name]
76+
6177
if backend_name not in BACKEND_REGISTRY:
6278
raise ValueError(f"Backend '{backend_name}' not found. Available backends: {list(BACKEND_REGISTRY.keys())}")
63-
79+
6480
backend_class = BACKEND_REGISTRY[backend_name]
65-
81+
6682
# Initialize backend with appropriate settings
6783
if backend_name == "tflite":
68-
return backend_class(
84+
instance = backend_class(
6985
model_path=settings.TFLITE_MODEL_PATH,
7086
labels_path=settings.TFLITE_LABELS_PATH,
7187
confidence_threshold=settings.TFLITE_CONFIDENCE_THRESHOLD
7288
)
7389
elif backend_name == "onnx":
74-
return backend_class(
90+
instance = backend_class(
7591
model_path=settings.ONNX_MODEL_PATH,
7692
labels_path=settings.ONNX_LABELS_PATH,
7793
confidence_threshold=settings.ONNX_CONFIDENCE_THRESHOLD,
7894
iou_threshold=settings.ONNX_IOU_THRESHOLD,
7995
model_type=settings.ONNX_MODEL_TYPE
8096
)
8197
elif backend_name == "opencv":
82-
return backend_class(
98+
instance = backend_class(
8399
model_path=settings.OPENCV_MODEL_PATH,
84100
config_path=settings.OPENCV_CONFIG_PATH,
85101
labels_path=settings.OPENCV_LABELS_PATH,
@@ -89,17 +105,26 @@ def get_backend(backend_name: str) -> DetectionBackend:
89105
input_size=settings.OPENCV_INPUT_SIZE
90106
)
91107
elif backend_name == "edgetpu":
92-
return backend_class(
108+
instance = backend_class(
93109
model_path=settings.EDGETPU_MODEL_PATH,
94110
labels_path=settings.EDGETPU_LABELS_PATH,
95111
confidence_threshold=settings.EDGETPU_CONFIDENCE_THRESHOLD,
96112
device=settings.EDGETPU_DEVICE,
97113
model_type=settings.EDGETPU_MODEL_TYPE,
98114
iou_threshold=settings.EDGETPU_IOU_THRESHOLD
99115
)
116+
elif backend_name == "moondream":
117+
instance = backend_class(
118+
model_name=settings.MOONDREAM_MODEL_NAME,
119+
revision=settings.MOONDREAM_REVISION,
120+
device=settings.MOONDREAM_DEVICE,
121+
default_detect_classes=settings.MOONDREAM_DEFAULT_DETECT_CLASSES,
122+
)
123+
else:
124+
instance = backend_class()
100125

101-
# Default initialization for other backends
102-
return backend_class()
126+
_backend_instances[backend_name] = instance
127+
return instance
103128

104129

105130
def register_backend(name: str, backend_class: Type[DetectionBackend]) -> None:

backends/moondream/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Moondream VLM backend
2+

0 commit comments

Comments
 (0)