"""
LM Studio / Ollama inference client for Bible-Companion.
Async HTTP client for OpenAI-compatible chat completions API.
"""

import logging
from typing import List, Dict, Optional
import httpx

logger = logging.getLogger(__name__)


class LMStudioClient:
    """
    Async client for LM Studio or Ollama inference.

    LM Studio: OpenAI-compatible /v1/chat/completions API
    Ollama: Native /api/chat API (on port 11434)

    The backend is chosen once, at construction time, by looking for the
    substring "11434" in the base URL.
    """

    def __init__(self, base_url: str = "http://192.168.1.169:1234/v1", model: str = "llamafile/lfm2.5-1.2b"):
        """
        Initialize LM Studio or Ollama client.

        Args:
            base_url: Base URL for inference endpoint
                - LM Studio: http://192.168.1.169:1234/v1
                - Ollama: http://localhost:11434 (without /v1)
            model: Model identifier (e.g., llama2, lfm2.5)
        """
        # Trailing slashes are removed so endpoint paths can be appended safely.
        self.base_url = base_url.rstrip('/')
        self.model = model

        # Detect if this is Ollama (port 11434) or LM Studio
        self.is_ollama = '11434' in self.base_url

        if self.is_ollama:
            self.chat_url = f"{self.base_url}/api/chat"
        else:
            self.chat_url = f"{self.base_url}/chat/completions"

        logger.info(f"✅ LMStudioClient initialized: base_url={self.base_url}, is_ollama={self.is_ollama}, chat_url={self.chat_url}, model={self.model}")
        self.timeout = httpx.Timeout(120.0)

    def _build_payload(
        self,
        messages: List[Dict[str, str]],
        temperature: float,
        max_tokens: int,
        top_p: float
    ) -> Dict[str, object]:
        """Build the JSON request body for the configured backend."""
        if self.is_ollama:
            # Ollama's native API reads sampling parameters from "options";
            # top-level keys such as "temperature" are ignored. The token
            # limit is called "num_predict" there.
            return {
                "model": self.model,
                "messages": messages,
                "options": {
                    "temperature": temperature,
                    "top_p": top_p,
                    "num_predict": max_tokens,
                },
                "stream": False,
            }

        # LM Studio OpenAI-compatible format
        return {
            "model": self.model,
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
            "top_p": top_p,
            "stream": False,
        }

    def _extract_text(self, result: dict) -> str:
        """Pull the generated text out of a decoded JSON response."""
        if self.is_ollama:
            # Ollama response format; a missing message yields an empty string.
            content = (result.get('message') or {}).get('content')
        else:
            # OpenAI-compatible format
            try:
                content = result['choices'][0]['message']['content']
            except (KeyError, IndexError, TypeError) as err:
                raise ValueError(f"Unexpected response format: {err!r}") from err

        # content may be null (e.g. a tool-call reply); treat it as empty text.
        return (content or '').strip()

    async def generate(
        self,
        messages: List[Dict[str, str]],
        temperature: float = 0.7,
        max_tokens: int = 512,
        top_p: float = 0.9,
        stream: bool = False
    ) -> str:
        """
        Generate response from model using appropriate API.

        Args:
            messages: List of message dicts with 'role' and 'content' keys
            temperature: Sampling temperature (0.0-2.0)
            max_tokens: Maximum tokens in response
            top_p: Nucleus sampling parameter
            stream: Accepted for compatibility. Streaming is not implemented,
                so a non-streamed response is always requested and the full
                text is returned.

        Returns:
            Generated text response, stripped of surrounding whitespace

        Raises:
            Exception: If inference fails (timeout, HTTP error, or an
                unparseable response). The original error is chained as
                ``__cause__``.
        """
        if stream:
            # The response is parsed as a single JSON document below, which
            # cannot work on a streamed (SSE / NDJSON) body.
            logger.warning("stream=True requested but streaming is not implemented; using a non-streamed request")

        payload = self._build_payload(messages, temperature, max_tokens, top_p)

        try:
            async with httpx.AsyncClient(timeout=self.timeout) as client:
                logger.info(f"???? POST {self.chat_url} with model: {self.model}")
                response = await client.post(self.chat_url, json=payload)
                response.raise_for_status()

                generated_text = self._extract_text(response.json())

                logger.info(f"??? Inference successful ({len(generated_text)} chars)")
                return generated_text

        except httpx.TimeoutException as e:
            logger.error(f"?????? Inference timeout: {e}")
            raise Exception(f"Inference timeout after 120 seconds: {e}") from e
        except httpx.HTTPError as e:
            logger.error(f"??? HTTP error: {e}")
            raise Exception(f"Inference HTTP error: {e}") from e
        except Exception as e:
            logger.error(f"??? Inference error: {e}")
            raise Exception(f"Inference failed: {e}") from e

    async def health_check(self) -> bool:
        """
        Check if inference endpoint is reachable.

        Probes the model-listing endpoint of the configured backend:
        ``/api/tags`` for Ollama, ``/models`` for the OpenAI-compatible API.

        Returns:
            True if the endpoint answers with HTTP 200, False otherwise
            (including on any connection error or timeout)
        """
        if self.is_ollama:
            # Ollama's native API has no /models route.
            probe_url = f"{self.base_url}/api/tags"
        else:
            probe_url = f"{self.base_url}/models"

        try:
            async with httpx.AsyncClient(timeout=httpx.Timeout(5.0)) as client:
                response = await client.get(probe_url)
                return response.status_code == 200
        except Exception as e:
            # Best-effort probe: any failure just means "not healthy".
            logger.warning(f"?????? Health check failed: {e}")
            return False


# Synchronous wrapper around the async client, for non-async contexts
class LMStudioClientSync(LMStudioClient):
    """
    Synchronous wrapper for LMStudioClient if needed in non-async contexts.
    (Typically use LMStudioClient with FastAPI's async support)
    """

    def generate_sync(
        self,
        messages: List[Dict[str, str]],
        temperature: float = 0.7,
        max_tokens: int = 512,
        top_p: float = 0.9
    ) -> str:
        """
        Synchronous version of generate (uses asyncio.run internally).

        Args:
            messages: List of message dicts with 'role' and 'content' keys
            temperature: Sampling temperature
            max_tokens: Maximum tokens in response
            top_p: Nucleus sampling parameter

        Returns:
            Generated text response

        Raises:
            RuntimeError: If called from a thread that already has a running
                event loop; await ``generate()`` directly in that case.
            Exception: If inference fails (propagated from ``generate``)
        """
        import asyncio

        # asyncio.run() refuses to nest inside a running loop. Checking first
        # avoids creating a coroutine that would never be awaited.
        try:
            asyncio.get_running_loop()
            loop_running = True
        except RuntimeError:
            loop_running = False

        if loop_running:
            raise RuntimeError(
                "generate_sync() cannot be called from a running event loop; "
                "await generate() instead"
            )

        return asyncio.run(
            self.generate(
                messages,
                temperature=temperature,
                max_tokens=max_tokens,
                top_p=top_p,
            )
        )