Skip to content

Commit a91368d

Browse files
committed
feat: add Moondream VLM backend with k8s deployment manifests
- Add Moondream2 backend (detect, describe, query) using HuggingFace Transformers
- Add /describe and /query API endpoints
- Add k8s manifests: deployment, service, PVC, namespace
- PersistentVolumeClaim for HuggingFace model cache (3Gi)
- Startup probe with 5-min tolerance for cold model download
- Memory bumped to 1Gi request / 4Gi limit for VLM inference
- Update Dockerfile with optional INSTALL_MOONDREAM build arg
- Update CI workflow to build with moondream dependencies
- Accept return_image param on /describe and /query for lightNVR compat
1 parent 2be8ea5 commit a91368d

14 files changed

Lines changed: 549 additions & 22 deletions

File tree

.github/workflows/docker-publish.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,5 @@ jobs:
4545
push: true
4646
tags: ${{ steps.meta.outputs.tags }}
4747
labels: ${{ steps.meta.outputs.labels }}
48+
build-args: |
49+
INSTALL_MOONDREAM=1

Dockerfile

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,16 @@ RUN python -m pip install --upgrade pip \
3636
ARG INSTALL_TENSORFLOW=0
3737
RUN if [ "$INSTALL_TENSORFLOW" = "1" ]; then python -m pip install tensorflow; fi
3838

39+
# Optional: install moondream (torch + transformers) for VLM backend
40+
# Usage: docker build --build-arg INSTALL_MOONDREAM=1 ...
41+
ARG INSTALL_MOONDREAM=0
42+
RUN if [ "$INSTALL_MOONDREAM" = "1" ]; then \
43+
python -m pip install \
44+
"transformers>=4.51.1,<5.0" \
45+
"torch>=2.7.0" \
46+
"accelerate>=1.10.0"; \
47+
fi
48+
3949
COPY . .
4050

4151
# Optionally bake the TFLite model (only useful if tensorflow is installed)

Pipfile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ onnxruntime = "==1.23.2"
1717
opencv-python = "*"
1818
scipy = "*"
1919
shapely = "*"
20+
transformers = ">=4.51.1,<5.0"
21+
torch = ">=2.7.0"
22+
accelerate = ">=1.10.0"
2023

2124
[dev-packages]
2225
ultralytics = "*"

api/v1/endpoints/detection.py

Lines changed: 133 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import time
77
import logging
88

9-
from models.detection import DetectionResponse, DetectionResult, ImageResponse
9+
from models.detection import DetectionResponse, DetectionResult, ImageResponse, DescribeResponse, QueryResponse
1010
from models.zone import ZoneConfiguration
1111
from backends.factory import get_backend, BACKEND_REGISTRY
1212
from utils.image import validate_image, preprocess_image, image_to_bytes
@@ -108,6 +108,11 @@ async def detect_objects(
108108
for det in detections:
109109
class_counts[det.label] = class_counts.get(det.label, 0) + 1
110110
logger.info(f"Detected objects: {dict(class_counts)}")
111+
for det in detections:
112+
b = det.bounding_box
113+
logger.info(f" → {det.label} ({det.confidence:.2f}): [{b.x_min:.3f},{b.y_min:.3f},{b.x_max:.3f},{b.y_max:.3f}]")
114+
else:
115+
logger.info("No objects detected")
111116
except Exception as e:
112117
logger.error(f"Detection failed: {str(e)}", exc_info=True)
113118
raise HTTPException(status_code=500, detail=f"Detection error: {str(e)}")
@@ -183,6 +188,133 @@ async def detect_objects(
183188
return response
184189

185190

191+
@router.post("/describe", response_model=DescribeResponse)
async def describe_image(
    file: UploadFile = File(...),
    backend: str = Query("moondream", description="Backend to use for description"),
    length: str = Query("normal", description="Caption length: 'short', 'normal', or 'long'"),
    return_image: bool = Query(False, description="Ignored for describe endpoint (accepted for compatibility)"),
):
    """
    Generate a natural language description of an uploaded image.

    - **file**: Image file to describe
    - **backend**: Backend to use (must support description, e.g. moondream)
    - **length**: Caption length - 'short', 'normal', or 'long'
    """
    logger.info(f"Describe request: backend={backend}, length={length}, filename={file.filename}")

    # Fail fast on an unsupported length value rather than passing it through
    # to the backend, where a bad value would surface as an opaque 500.
    if length not in ("short", "normal", "long"):
        raise HTTPException(
            status_code=400,
            detail=f"Invalid length '{length}'. Must be one of: short, normal, long",
        )

    if backend not in settings.AVAILABLE_BACKENDS:
        raise HTTPException(
            status_code=400,
            detail=f"Backend '{backend}' not available. Available backends: {settings.AVAILABLE_BACKENDS}"
        )

    try:
        detector = get_backend(backend)
    except Exception as e:
        logger.error(f"Failed to initialize backend {backend}: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Backend initialization error: {str(e)}")

    # Read and validate image
    try:
        contents = await file.read()
        image = Image.open(io.BytesIO(contents))
        validate_image(image, file.filename)
        processed_image = preprocess_image(image)
    except Exception as e:
        logger.error(f"Image validation/processing failed: {str(e)}")
        raise HTTPException(status_code=400, detail=f"Invalid image: {str(e)}")

    start_time = time.time()
    try:
        description = detector.describe(processed_image, length=length)
        process_time = time.time() - start_time
        logger.info(f"Describe completed in {process_time*1000:.1f}ms")
        logger.info(f"Description result: {description}")
    except NotImplementedError:
        # Backends that don't implement describe() raise NotImplementedError;
        # surface that as a client error, not a server fault.
        raise HTTPException(
            status_code=400,
            detail=f"Backend '{backend}' does not support image description"
        )
    except Exception as e:
        logger.error(f"Describe failed: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Description error: {str(e)}")

    return DescribeResponse(
        backend=backend,
        filename=file.filename,
        description=description,
        process_time_ms=int(process_time * 1000),
        image_width=image.width,
        image_height=image.height,
    )
252+
253+
254+
@router.post("/query", response_model=QueryResponse)
async def query_image(
    file: UploadFile = File(...),
    question: str = Query(..., description="Question to ask about the image"),
    backend: str = Query("moondream", description="Backend to use for visual Q&A"),
    return_image: bool = Query(False, description="Ignored for query endpoint (accepted for compatibility)"),
):
    """
    Ask a natural language question about an uploaded image.

    - **file**: Image file to query
    - **question**: Natural language question about the image
    - **backend**: Backend to use (must support visual Q&A, e.g. moondream)
    """
    logger.info(f"Query request: backend={backend}, question={question!r}, filename={file.filename}")

    # Unknown backend name is a client error.
    if backend not in settings.AVAILABLE_BACKENDS:
        raise HTTPException(
            status_code=400,
            detail=f"Backend '{backend}' not available. Available backends: {settings.AVAILABLE_BACKENDS}"
        )

    try:
        vlm = get_backend(backend)
    except Exception as e:
        logger.error(f"Failed to initialize backend {backend}: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Backend initialization error: {str(e)}")

    # Decode and sanity-check the upload before spending time on inference.
    try:
        raw = await file.read()
        image = Image.open(io.BytesIO(raw))
        validate_image(image, file.filename)
        prepared = preprocess_image(image)
    except Exception as e:
        logger.error(f"Image validation/processing failed: {str(e)}")
        raise HTTPException(status_code=400, detail=f"Invalid image: {str(e)}")

    t0 = time.time()
    try:
        answer = vlm.query(prepared, question=question)
        elapsed = time.time() - t0
        logger.info(f"Query completed in {elapsed*1000:.1f}ms")
        logger.info(f"Query answer: {answer}")
    except NotImplementedError:
        # Backend exists but has no visual Q&A support — client error.
        raise HTTPException(
            status_code=400,
            detail=f"Backend '{backend}' does not support visual Q&A"
        )
    except Exception as e:
        logger.error(f"Query failed: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Query error: {str(e)}")

    return QueryResponse(
        backend=backend,
        filename=file.filename,
        question=question,
        answer=answer,
        process_time_ms=int(elapsed * 1000),
        image_width=image.width,
        image_height=image.height,
    )
316+
317+
186318
@router.get("/backends", response_model=Dict[str, Any])
187319
async def list_backends():
188320
"""

backends/base.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,44 @@ def get_model_info(self) -> Dict[str, Any]:
4141
def draw_detections(self, image: Image.Image, detections: List[DetectionResult]) -> Image.Image:
    """
    Draw bounding boxes and labels on the image.

    Args:
        image: PIL Image to draw on
        detections: List of DetectionResult objects

    Returns:
        PIL Image with bounding boxes and labels drawn

    NOTE(review): this base implementation is an empty placeholder (`pass`
    returns None despite the declared return type) — presumably concrete
    backends override it; confirm before relying on the base behavior.
    """
    pass
53+
54+
def describe(self, image: Image.Image, length: str = "normal") -> str:
    """
    Produce a natural-language caption for the given image.

    Optional capability hook: backends with captioning support (e.g. a VLM
    backend) override this; the base class always raises.

    Args:
        image: PIL Image to describe
        length: Caption length hint, e.g. 'short' or 'normal'

    Returns:
        Natural language description string

    Raises:
        NotImplementedError: If this backend does not support image description
    """
    raise NotImplementedError(f"{type(self).__name__} does not support image description")
69+
70+
def query(self, image: Image.Image, question: str) -> str:
    """
    Answer a free-form question about the given image.

    Optional capability hook: backends with visual Q&A support override
    this; the base class always raises.

    Args:
        image: PIL Image to query
        question: Natural language question about the image

    Returns:
        Answer string

    Raises:
        NotImplementedError: If this backend does not support visual Q&A
    """
    raise NotImplementedError(f"{type(self).__name__} does not support visual Q&A")

backends/factory.py

Lines changed: 37 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,20 @@
2828
except ImportError:
2929
EDGETPU_AVAILABLE = False
3030

31+
try:
32+
from backends.moondream.backend import MoondreamBackend
33+
MOONDREAM_AVAILABLE = True
34+
except ImportError:
35+
MOONDREAM_AVAILABLE = False
36+
3137

3238
# Registry of available backends
3339
BACKEND_REGISTRY: Dict[str, Type[DetectionBackend]] = {}
3440

41+
# Instance cache — avoids reloading heavy models (e.g. Moondream 3.85 GB) on
42+
# every request.
43+
_backend_instances: Dict[str, DetectionBackend] = {}
44+
3545
if TFLITE_AVAILABLE:
3646
BACKEND_REGISTRY["tflite"] = TFLiteBackend
3747

@@ -44,42 +54,48 @@
4454
if EDGETPU_AVAILABLE:
4555
BACKEND_REGISTRY["edgetpu"] = EdgeTPUBackend
4656

57+
if MOONDREAM_AVAILABLE:
58+
BACKEND_REGISTRY["moondream"] = MoondreamBackend
59+
4760

4861
def get_backend(backend_name: str) -> DetectionBackend:
4962
"""
50-
Get an instance of the specified detection backend.
51-
63+
Get a (cached) instance of the specified detection backend.
64+
5265
Args:
5366
backend_name: Name of the backend to instantiate
54-
67+
5568
Returns:
5669
Instance of the requested backend
57-
70+
5871
Raises:
5972
ValueError: If the backend is not available
6073
"""
74+
if backend_name in _backend_instances:
75+
return _backend_instances[backend_name]
76+
6177
if backend_name not in BACKEND_REGISTRY:
6278
raise ValueError(f"Backend '{backend_name}' not found. Available backends: {list(BACKEND_REGISTRY.keys())}")
63-
79+
6480
backend_class = BACKEND_REGISTRY[backend_name]
65-
81+
6682
# Initialize backend with appropriate settings
6783
if backend_name == "tflite":
68-
return backend_class(
84+
instance = backend_class(
6985
model_path=settings.TFLITE_MODEL_PATH,
7086
labels_path=settings.TFLITE_LABELS_PATH,
7187
confidence_threshold=settings.TFLITE_CONFIDENCE_THRESHOLD
7288
)
7389
elif backend_name == "onnx":
74-
return backend_class(
90+
instance = backend_class(
7591
model_path=settings.ONNX_MODEL_PATH,
7692
labels_path=settings.ONNX_LABELS_PATH,
7793
confidence_threshold=settings.ONNX_CONFIDENCE_THRESHOLD,
7894
iou_threshold=settings.ONNX_IOU_THRESHOLD,
7995
model_type=settings.ONNX_MODEL_TYPE
8096
)
8197
elif backend_name == "opencv":
82-
return backend_class(
98+
instance = backend_class(
8399
model_path=settings.OPENCV_MODEL_PATH,
84100
config_path=settings.OPENCV_CONFIG_PATH,
85101
labels_path=settings.OPENCV_LABELS_PATH,
@@ -89,17 +105,26 @@ def get_backend(backend_name: str) -> DetectionBackend:
89105
input_size=settings.OPENCV_INPUT_SIZE
90106
)
91107
elif backend_name == "edgetpu":
92-
return backend_class(
108+
instance = backend_class(
93109
model_path=settings.EDGETPU_MODEL_PATH,
94110
labels_path=settings.EDGETPU_LABELS_PATH,
95111
confidence_threshold=settings.EDGETPU_CONFIDENCE_THRESHOLD,
96112
device=settings.EDGETPU_DEVICE,
97113
model_type=settings.EDGETPU_MODEL_TYPE,
98114
iou_threshold=settings.EDGETPU_IOU_THRESHOLD
99115
)
116+
elif backend_name == "moondream":
117+
instance = backend_class(
118+
model_name=settings.MOONDREAM_MODEL_NAME,
119+
revision=settings.MOONDREAM_REVISION,
120+
device=settings.MOONDREAM_DEVICE,
121+
default_detect_classes=settings.MOONDREAM_DEFAULT_DETECT_CLASSES,
122+
)
123+
else:
124+
instance = backend_class()
100125

101-
# Default initialization for other backends
102-
return backend_class()
126+
_backend_instances[backend_name] = instance
127+
return instance
103128

104129

105130
def register_backend(name: str, backend_class: Type[DetectionBackend]) -> None:

backends/moondream/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Moondream VLM backend
2+

0 commit comments

Comments
 (0)