"""
LM Studio / Ollama inference client for Bible-Companion.
Async HTTP client for OpenAI-compatible chat completions API.
"""

import logging
from typing import List, Dict, Optional
import httpx

logger = logging.getLogger(__name__)


class LMStudioClient:
    """
    Async client for LM Studio or Ollama inference.

    Both support the OpenAI-compatible /v1/chat/completions API. A new
    ``httpx.AsyncClient`` is opened for every request, so instances hold no
    open connections between calls.
    """

    def __init__(
        self,
        base_url: str = "http://192.168.1.169:1234/v1",
        model: str = "llamafile/lfm2.5-1.2b",
        timeout: float = 60.0,
    ):
        """
        Initialize LM Studio client.

        Args:
            base_url: Base URL for inference endpoint (e.g., http://192.168.1.169:1234/v1).
                A trailing slash is stripped.
            model: Model identifier (e.g., llamafile/lfm2.5-1.2b)
            timeout: Timeout in seconds applied to inference requests
        """
        self.base_url = base_url.rstrip('/')
        self.model = model
        self.chat_url = f"{self.base_url}/chat/completions"
        # Kept alongside the httpx object so error messages report the real limit.
        self._timeout_seconds = timeout
        self.timeout = httpx.Timeout(timeout)

    @staticmethod
    def _extract_content(result: Dict) -> str:
        """
        Pull the assistant text out of a chat-completions response body.

        Args:
            result: Decoded JSON body of the response

        Returns:
            The first choice's message content with surrounding whitespace removed

        Raises:
            ValueError: If the body lacks choices[0].message.content or the
                content is not a string (e.g. null)
        """
        try:
            content = result['choices'][0]['message']['content']
        except (KeyError, IndexError, TypeError) as e:
            raise ValueError(
                f"Unexpected response format (no choices[0].message.content): {result!r}"
            ) from e

        if not isinstance(content, str):
            raise ValueError(
                f"Response contained no text content (got {type(content).__name__})"
            )
        return content.strip()

    async def generate(
        self,
        messages: List[Dict[str, str]],
        temperature: float = 0.7,
        max_tokens: int = 512,
        top_p: float = 0.9,
        stream: bool = False
    ) -> str:
        """
        Generate a response using the OpenAI-compatible chat completions API.

        Args:
            messages: List of message dicts with 'role' and 'content' keys
            temperature: Sampling temperature (0.0-2.0)
            max_tokens: Maximum tokens in response
            top_p: Nucleus sampling parameter
            stream: Whether to stream response (not yet implemented; must be False)

        Returns:
            Generated text response

        Raises:
            NotImplementedError: If stream is True
            Exception: If inference fails (timeout, HTTP error, or a response
                that cannot be parsed); the original error is chained as the cause
        """
        if stream:
            # The response is parsed as a single JSON document below, which a
            # streamed reply can never satisfy -- fail before spending an
            # inference call on it.
            raise NotImplementedError(
                "Streaming responses are not implemented; call generate() with stream=False"
            )

        payload = {
            "model": self.model,
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
            "top_p": top_p,
            "stream": stream
        }

        try:
            async with httpx.AsyncClient(timeout=self.timeout) as client:
                logger.info("📤 POST %s with model: %s", self.chat_url, self.model)
                response = await client.post(self.chat_url, json=payload)
                response.raise_for_status()

                generated_text = self._extract_content(response.json())

                logger.info("✅ Inference successful (%d chars)", len(generated_text))
                return generated_text

        except httpx.TimeoutException as e:
            logger.error("⏱️ Inference timeout: %s", e)
            raise Exception(
                f"Inference timeout after {self._timeout_seconds:g} seconds: {e}"
            ) from e
        except httpx.HTTPError as e:
            logger.error("❌ HTTP error: %s", e)
            raise Exception(f"Inference HTTP error: {e}") from e
        except Exception as e:
            # Boundary handler: callers only ever see a plain Exception with
            # an "Inference failed" prefix, whatever went wrong underneath.
            logger.error("❌ Inference error: %s", e)
            raise Exception(f"Inference failed: {e}") from e

    async def health_check(self) -> bool:
        """
        Check if inference endpoint is reachable.

        Issues a GET to ``{base_url}/models`` with a 5 second timeout.

        Returns:
            True if the endpoint answers with HTTP 200, False on any other
            status or on any error (which is logged as a warning)
        """
        try:
            async with httpx.AsyncClient(timeout=httpx.Timeout(5.0)) as client:
                response = await client.get(f"{self.base_url}/models")
                return response.status_code == 200
        except Exception as e:
            # Deliberately best-effort: a failed probe means "unhealthy", not a crash.
            logger.warning("⚠️ Health check failed: %s", e)
            return False


# Synchronous wrapper around the async client, for non-async callers
class LMStudioClientSync(LMStudioClient):
    """
    Synchronous wrapper for LMStudioClient if needed in non-async contexts.
    (Typically use LMStudioClient with FastAPI's async support)
    """

    def generate_sync(
        self,
        messages: List[Dict[str, str]],
        temperature: float = 0.7,
        max_tokens: int = 512,
        top_p: float = 0.9
    ) -> str:
        """
        Synchronous version of generate (uses asyncio.run internally).

        Args:
            messages: List of message dicts with 'role' and 'content' keys
            temperature: Sampling temperature
            max_tokens: Maximum tokens in response
            top_p: Nucleus sampling parameter

        Returns:
            Generated text response

        Raises:
            RuntimeError: If called from a thread that already has a running
                event loop (await generate() there instead)
            Exception: If inference fails
        """
        import asyncio

        # Check for a running loop *before* creating the coroutine: asyncio.run
        # would refuse anyway, but only after the coroutine object exists,
        # leaving a "coroutine was never awaited" warning behind.
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            pass  # No loop running in this thread -- safe to start one.
        else:
            raise RuntimeError(
                "generate_sync() cannot be called from a running event loop; "
                "await generate() instead"
            )

        return asyncio.run(
            self.generate(
                messages,
                temperature=temperature,
                max_tokens=max_tokens,
                top_p=top_p,
            )
        )