Commit b1a6aae

cli reintroduction
1 parent 1dc1012

7 files changed

Lines changed: 1235 additions & 326 deletions

File tree

app/lib/endpoints/chat_endpoints.py

Lines changed: 55 additions & 52 deletions
@@ -2,21 +2,40 @@
 import httpx
 import asyncio
 import logging
-from .process_management import get_server_processes, get_server_configs
-from pydantic import BaseModel
+import os
+from pydantic import BaseModel, Field
 from typing import List
 
+# --- KEPT server-related imports ---
+from .process_management import (
+    get_server_processes,
+    get_server_configs,
+)
+
+
 logger = logging.getLogger(__name__)
 
+
 class ChatRequest(BaseModel):
     message: str
     port: int = 8081
-    threads: int = 1
-    ctx_size: int = 2048
-    n_predict: int = 256
-    temperature: float = 0.8
+    threads: int = Field(default_factory=lambda: os.cpu_count() or 1, gt=0)
+    ctx_size: int = Field(default=2048, gt=0)
+    n_predict: int = Field(default=256, gt=0)
+    temperature: float = Field(default=0.8, gt=0.0, le=2.0)
+
+
+class MultiChatRequestItem(ChatRequest):
+    pass
+
+
+class MultiChatRequest(BaseModel):
+    requests: List[MultiChatRequestItem]
+
 
-async def chat_with_bitnet(chat: ChatRequest):
+# --- Endpoint logic functions ---
+
+async def handle_chat_with_bitnet_server(chat: ChatRequest):
     host = "127.0.0.1"
     key = (host, chat.port)
     proc_entry = get_server_processes().get(key)
@@ -27,52 +46,36 @@ async def chat_with_bitnet(chat: ChatRequest):
     server_url = f"http://{host}:{chat.port}/completion"
     payload = {
         "prompt": chat.message,
-        "threads": chat.threads,
-        "ctx_size": chat.ctx_size,
         "n_predict": chat.n_predict,
-        "temperature": chat.temperature
+        "temperature": chat.temperature,
     }
-    async def _chat():
+
+    try:
         async with httpx.AsyncClient() as client:
-            try:
-                logger.info(f"Forwarding chat message to BitNet server on port {chat.port}.")
-                response = await client.post(server_url, json=payload, timeout=300.0)
-                response.raise_for_status()
-                return response.json()
-            except httpx.ReadTimeout:
-                logger.error(f"ReadTimeout when communicating with BitNet server on port {chat.port}.")
-                raise HTTPException(status_code=504, detail=f"Request to BitNet server on port {chat.port} timed out.")
-            except httpx.ConnectError:
-                logger.error(f"ConnectError when communicating with BitNet server on port {chat.port}.")
-                raise HTTPException(status_code=503, detail=f"Could not connect to BitNet server on port {chat.port}.")
-            except httpx.HTTPStatusError as e:
-                logger.error(f"HTTPStatusError from BitNet server on port {chat.port}: {e.response.status_code} - {e.response.text}", exc_info=True)
-                raise HTTPException(status_code=e.response.status_code, detail=f"BitNet server error: {e.response.text}")
-            except Exception as e:
-                logger.error(f"Unexpected error during chat with BitNet server on port {chat.port}: {str(e)}", exc_info=True)
-                error_detail = f"An unexpected error occurred while communicating with BitNet server on port {chat.port}: {str(e)}"
-                raise HTTPException(status_code=500, detail=error_detail)
-    return await _chat()
+            response = await client.post(server_url, json=payload, timeout=60.0)
+            response.raise_for_status()
+            response_data = response.json()
+            # Ensure the key "content" exists before accessing it
+            return {"response": response_data.get("content", ""), "port": chat.port}
+    except httpx.RequestError as e:
+        logger.error(f"HTTP request error to server {host}:{chat.port}: {e}")
+        raise HTTPException(status_code=503, detail=f"Error communicating with BitNet server on port {chat.port}: {e}")
+    except httpx.HTTPStatusError as e:
+        logger.error(f"HTTP status error from server {host}:{chat.port}: {e.response.status_code} - {e.response.text}")
+        raise HTTPException(status_code=e.response.status_code, detail=f"Error from BitNet server on port {chat.port}: {e.response.text}")
+    except Exception as e:
+        logger.error(f"Unexpected error during chat with server {host}:{chat.port}: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=f"An unexpected error occurred: {str(e)}")
 
-class MultiChatRequest(BaseModel):
-    requests: List[ChatRequest]
-
-async def multichat_with_bitnet(multichat: MultiChatRequest):
-    logger.info(f"Multichat request received for {len(multichat.requests)} chats.")
-    async def run_chat(chat_req: ChatRequest):
-        chat_fn = chat_with_bitnet(chat_req)
-        return await chat_fn
-    results = await asyncio.gather(*(run_chat(req) for req in multichat.requests), return_exceptions=True)
-    formatted = []
-    for i, res in enumerate(results):
-        if isinstance(res, Exception):
-            if isinstance(res, HTTPException):
-                formatted.append({"error": res.detail})
-            else:
-                formatted.append({"error": str(res)})
-        elif isinstance(res, dict) and "content" in res:
-            formatted.append(res["content"])
-        else:
-            formatted.append(res)
-    logger.info("Multichat processing completed.")
-    return {"results": formatted}
+
+async def handle_multichat_with_bitnet_server(data: MultiChatRequest):
+    async def single_chat_wrapper(chat_request: MultiChatRequestItem):
+        try:
+            return await handle_chat_with_bitnet_server(chat_request)
+        except HTTPException as e:
+            return {"port": chat_request.port, "error": e.detail, "status_code": e.status_code}
+        except Exception as e:
+            return {"port": chat_request.port, "error": str(e), "status_code": 500}
+
+    results = await asyncio.gather(*[single_chat_wrapper(req) for req in data.requests])
+    return {"results": results}
Lines changed: 202 additions & 0 deletions
@@ -0,0 +1,202 @@
+import os
+import logging
+import asyncio
+import shlex
+from fastapi import HTTPException, status
+from typing import Dict, Any, List
+
+from ..models import (
+    LlamaCliInitRequest,
+    LlamaCliChatRequest,
+    BatchLlamaCliInitRequest,
+    BatchLlamaCliRemoveRequest,
+    BatchLlamaCliChatRequest
+)
+
+# Import the process management functions for persistent sessions
+from .process_management import (
+    start_cli_chat_process,
+    send_to_cli_chat_session,
+    terminate_cli_chat_session,
+    cli_chat_sessions  # Direct access for status checks
+)
+
+logger = logging.getLogger(__name__)
+
+# This model path is used for all CLI sessions.
+STATIC_MODEL_PATH = "models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf"
+
+async def initialize_llama_cli_session(request: LlamaCliInitRequest) -> Dict[str, Any]:
+    """
+    Starts a persistent llama-cli process in conversational mode.
+    The cli_alias from the request is used as the unique session_id.
+    """
+    session_id = request.cli_alias
+    if session_id in cli_chat_sessions:
+        raise HTTPException(
+            status_code=status.HTTP_409_CONFLICT,
+            detail=f"A CLI chat session with alias '{session_id}' is already running."
+        )
+
+    try:
+        # Start the persistent process
+        session_data = await start_cli_chat_process(
+            session_id=session_id,
+            model_path=STATIC_MODEL_PATH,
+            threads=request.threads,
+            ctx_size=request.ctx_size,
+            n_predict=request.n_predict,
+            temperature=request.temperature,
+            repeat_penalty=request.repeat_penalty,
+            top_k=request.top_k,
+            top_p=request.top_p,
+            system_prompt=request.system_prompt,
+        )
+        logger.info(f"Successfully started persistent llama-cli session '{session_id}' (PID: {session_data['pid']}).")
+        return {
+            "cli_alias": session_id,
+            "status": "running",
+            "pid": session_data["pid"],
+            "message": "CLI process started successfully in conversational mode."
+        }
+    except FileNotFoundError:
+        logger.error(f"Failed to start CLI session '{session_id}': llama-cli executable not found.")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail="Llama-cli executable not found. Please ensure it's in your PATH or the LLAMA_CLI_PATH environment variable is set correctly."
+        )
+    except RuntimeError as e:
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=str(e)
+        )
+    except Exception as e:
+        logger.error(f"Failed to start persistent CLI session '{session_id}': {str(e)}", exc_info=True)
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"An unexpected error occurred while starting the CLI process: {str(e)}"
+        )
+
+async def chat_with_llama_cli_session(chat_request: LlamaCliChatRequest) -> Dict[str, Any]:
+    """
+    Sends a prompt to a running persistent llama-cli session.
+    """
+    session_id = chat_request.cli_alias
+    prompt = chat_request.prompt
+
+    try:
+        response_text = await send_to_cli_chat_session(session_id, prompt)
+        return {
+            "cli_alias": session_id,
+            "prompt": prompt,
+            "response": response_text
+        }
+    except LookupError as e:
+        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=str(e))
+    except (IOError, TimeoutError) as e:
+        raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=str(e))
+    except Exception as e:
+        logger.error(f"Unexpected error during chat with session '{session_id}': {str(e)}", exc_info=True)
+        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"An unexpected error occurred: {str(e)}")
+
+async def shutdown_llama_cli_session(cli_alias: str) -> Dict[str, str]:
+    """
+    Terminates a persistent llama-cli process.
+    """
+    try:
+        message = await terminate_cli_chat_session(cli_alias)
+        logger.info(f"Termination command for session '{cli_alias}' processed. Result: {message}")
+        return {"cli_alias": cli_alias, "status": "terminated", "message": message}
+    except Exception as e:
+        logger.error(f"Failed to terminate CLI session '{cli_alias}': {str(e)}", exc_info=True)
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"An unexpected error occurred during termination: {str(e)}"
+        )
+
+async def get_llama_cli_session_status(cli_alias: str) -> Dict[str, Any]:
+    """
+    Retrieves the status of a specific persistent llama-cli session.
+    """
+    session_info = cli_chat_sessions.get(cli_alias)
+    if not session_info:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail=f"No active CLI chat session found with alias '{cli_alias}'."
+        )
+
+    process = session_info.get("process")
+    session_status = "stopped"  # local name avoids shadowing the imported fastapi `status` used above
+    if process and process.returncode is None:
+        session_status = "running"
+
+    # Return a safe subset of the session data
+    return {
+        "cli_alias": cli_alias,
+        "status": session_status,
+        "pid": session_info.get("pid"),
+        "model_path": session_info.get("model_path"),
+        "start_time": session_info.get("start_time"),
+        "last_interaction_time": session_info.get("last_interaction_time"),
+        "command": " ".join(session_info.get("command", []))
+    }
+
+# --- Batch Operations ---
+async def handle_initialize_batch_llama_cli_configs(batch_request: BatchLlamaCliInitRequest) -> List[Dict[str, Any]]:
+    """
+    Processes a batch request to start multiple persistent llama-cli sessions.
+    """
+    aliases = [req.cli_alias for req in batch_request.requests]
+    if len(aliases) != len(set(aliases)):
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="Duplicate cli_alias values found in the batch request."
+        )
+
+    async def process_request(req: LlamaCliInitRequest):
+        try:
+            result = await initialize_llama_cli_session(req)
+            return {"cli_alias": req.cli_alias, "status": "success", "data": result}
+        except HTTPException as e:
+            return {"cli_alias": req.cli_alias, "status": "error", "detail": e.detail, "status_code": e.status_code}
+        except Exception as e:
+            logger.error(f"Unexpected error processing batch init for alias {req.cli_alias}: {str(e)}", exc_info=True)
+            return {"cli_alias": req.cli_alias, "status": "error", "detail": "An unexpected server error occurred.", "status_code": 500}
+
+    results = await asyncio.gather(*(process_request(req) for req in batch_request.requests))
+    return results
+
+async def handle_remove_batch_llama_cli_configs(batch_request: BatchLlamaCliRemoveRequest) -> List[Dict[str, Any]]:
+    """
+    Processes a batch request to terminate multiple persistent llama-cli sessions.
+    """
+    async def process_request(alias: str):
+        try:
+            result = await shutdown_llama_cli_session(alias)
+            return {"cli_alias": alias, "status": "success", "data": result}
+        except HTTPException as e:
+            return {"cli_alias": alias, "status": "error", "detail": e.detail, "status_code": e.status_code}
+        except Exception as e:
+            logger.error(f"Unexpected error processing batch removal for alias {alias}: {str(e)}", exc_info=True)
+            return {"cli_alias": alias, "status": "error", "detail": "An unexpected server error occurred.", "status_code": 500}
+
+    results = await asyncio.gather(*(process_request(alias) for alias in batch_request.aliases))
+    return results
+
+async def handle_batch_chat_with_llama_cli(batch_request: BatchLlamaCliChatRequest) -> List[Dict[str, Any]]:
+    """
+    Processes a batch of chat requests with their respective llama-cli sessions.
+    """
+    async def process_request(req: LlamaCliChatRequest):
+        try:
+            # Reuse the single chat handler logic
+            result = await chat_with_llama_cli_session(req)
+            return {"cli_alias": req.cli_alias, "status": "success", "data": result}
+        except HTTPException as e:
+            return {"cli_alias": req.cli_alias, "status": "error", "detail": e.detail, "status_code": e.status_code}
+        except Exception as e:
+            logger.error(f"Unexpected error processing batch chat for alias {req.cli_alias}: {str(e)}", exc_info=True)
+            return {"cli_alias": req.cli_alias, "status": "error", "detail": "An unexpected server error occurred.", "status_code": 500}
+
+    results = await asyncio.gather(*(process_request(req) for req in batch_request.requests))
+    return results
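Taken together, the batch handlers give a full session lifecycle: initialize, chat, terminate. The sketch below (an editor's illustration, not part of the commit) drives that lifecycle by calling the handlers directly rather than through HTTP routes, which live in files not shown here. The module path app.lib.endpoints.cli_endpoints is an assumption, since this diff does not show the new file's name, and constructing LlamaCliInitRequest with only cli_alias assumes the sampling fields (threads, ctx_size, temperature, and so on) have defaults in app/lib/models.

    # Editor's sketch of the batch lifecycle against the handlers above.
    # Assumed (not shown in this diff): the new module's path, and defaults
    # on LlamaCliInitRequest's sampling fields.
    import asyncio

    from app.lib.models import (  # mirrors the relative ..models import above
        LlamaCliInitRequest,
        LlamaCliChatRequest,
        BatchLlamaCliInitRequest,
        BatchLlamaCliChatRequest,
        BatchLlamaCliRemoveRequest,
    )
    from app.lib.endpoints.cli_endpoints import (  # assumed file name
        handle_initialize_batch_llama_cli_configs,
        handle_batch_chat_with_llama_cli,
        handle_remove_batch_llama_cli_configs,
    )

    async def demo() -> None:
        # Start two persistent llama-cli sessions in one call. A duplicate
        # alias in the batch is rejected with HTTP 400 before anything starts.
        init_results = await handle_initialize_batch_llama_cli_configs(
            BatchLlamaCliInitRequest(requests=[
                LlamaCliInitRequest(cli_alias="session-a"),
                LlamaCliInitRequest(cli_alias="session-b"),
            ])
        )
        print(init_results)

        # Fan a prompt out to each session. Per-item failures come back as
        # {"status": "error", ...} entries instead of aborting the batch.
        chat_results = await handle_batch_chat_with_llama_cli(
            BatchLlamaCliChatRequest(requests=[
                LlamaCliChatRequest(cli_alias="session-a", prompt="Hello!"),
                LlamaCliChatRequest(cli_alias="session-b", prompt="Hi there!"),
            ])
        )
        print(chat_results)

        # Terminate both sessions.
        await handle_remove_batch_llama_cli_configs(
            BatchLlamaCliRemoveRequest(aliases=["session-a", "session-b"])
        )

    if __name__ == "__main__":
        asyncio.run(demo())

Note how each batch helper wraps the corresponding single-session function and converts HTTPException into a per-item result dict, so one bad alias degrades only its own entry in the returned list.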
