"""
ZhipuAI LLM Client for B2B Shopping AI Assistant
"""
import asyncio
import concurrent.futures
import re
from dataclasses import dataclass
from typing import Any, Callable, Optional

from zhipuai import ZhipuAI

from config import settings
from utils.logger import get_logger
from utils.response_cache import get_response_cache

logger = get_logger(__name__)

# Token counters copied from the SDK response into LLMResponse.usage.
_USAGE_KEYS = ("prompt_tokens", "completion_tokens", "total_tokens")

# Queries containing any of these are answered without reasoning mode.
# ASCII entries are matched as whole words, so inflected forms that should
# still count ("thanks", "goodbye") are listed explicitly.
_SIMPLE_KEYWORDS = (
    "你好", "hi", "hello", "嗨",
    "谢谢", "thank", "thanks", "感谢",
    "再见", "bye", "goodbye", "拜拜",
    "退货政策", "营业时间", "联系方式",
    "发货", "配送", "物流",
)

# Queries containing any of these (and no simple keyword) use reasoning mode.
_COMPLEX_KEYWORDS = (
    "为什么", "how", "why", "如何",
    "推荐", "recommend", "recommends", "recommended",
    "recommendation", "recommendations", "建议",
    "比较", "compare", "compared", "comparison", "区别",
    "怎么样", "如何选择",
)


def _compile_keywords(keywords: tuple[str, ...]) -> "re.Pattern[str]":
    """Build one regex that matches any keyword in lower-cased text.

    ASCII keywords must appear as whole words (not embedded in a longer run
    of ASCII letters/digits), so "hi" no longer matches "this" or "shipping"
    and "how" no longer matches "show". Non-ASCII (CJK) keywords have no word
    delimiters and are matched as plain substrings.
    """
    ascii_words = [re.escape(k) for k in keywords if k.isascii()]
    substrings = [re.escape(k) for k in keywords if not k.isascii()]
    alternatives = []
    if ascii_words:
        alternatives.append(
            r"(?<![a-z0-9])(?:" + "|".join(ascii_words) + r")(?![a-z0-9])"
        )
    alternatives.extend(substrings)
    return re.compile("|".join(alternatives))


_SIMPLE_QUERY_RE = _compile_keywords(_SIMPLE_KEYWORDS)
_COMPLEX_QUERY_RE = _compile_keywords(_COMPLEX_KEYWORDS)


def _extract_usage(response: Any) -> dict[str, int]:
    """Return the token usage of an SDK response as a plain dict.

    Missing or ``None`` usage (or individual counters) is reported as 0
    instead of raising AttributeError.
    """
    usage = getattr(response, "usage", None)
    return {key: getattr(usage, key, 0) or 0 for key in _USAGE_KEYS}


@dataclass
class Message:
    """Chat message structure"""
    role: str  # "system", "user", "assistant"
    content: str


@dataclass
class LLMResponse:
    """LLM response structure"""
    content: str
    finish_reason: str
    usage: dict[str, int]


class ZhipuLLMClient:
    """ZhipuAI LLM Client wrapper"""

    DEFAULT_TIMEOUT = 60  # seconds (increased from 30 for better reliability)

    def __init__(
        self,
        api_key: Optional[str] = None,
        model: Optional[str] = None,
        timeout: Optional[int] = None,
        enable_reasoning: Optional[bool] = None
    ):
        """Create the client, falling back to ``settings`` for unset options.

        Args:
            api_key: ZhipuAI API key; defaults to ``settings.zhipu_api_key``.
            model: Model name; defaults to ``settings.zhipu_model``.
            timeout: Per-request timeout in seconds; a falsy value selects
                ``DEFAULT_TIMEOUT``.
            enable_reasoning: Whether reasoning ("thinking") mode may be
                used; defaults to ``settings.enable_reasoning_mode``.
        """
        self.api_key = api_key or settings.zhipu_api_key
        self.model = model or settings.zhipu_model
        self.timeout = timeout or self.DEFAULT_TIMEOUT
        self.enable_reasoning = (
            enable_reasoning
            if enable_reasoning is not None
            else settings.enable_reasoning_mode
        )
        self._client = ZhipuAI(api_key=self.api_key)
        logger.info(
            "ZhipuAI client initialized",
            model=self.model,
            timeout=self.timeout,
            reasoning_mode=self.enable_reasoning
        )

    def _should_use_reasoning(self, messages: list[dict[str, str]]) -> bool:
        """Determine if reasoning mode should be used based on query complexity

        Only the most recent user message is inspected. Simple-query keywords
        take precedence over complex-query keywords; a message matching
        neither defaults to no reasoning.

        Args:
            messages: List of message dictionaries

        Returns:
            True if reasoning mode should be used
        """
        if not self.enable_reasoning:
            return False

        if not settings.reasoning_mode_for_complex:
            # If smart mode is disabled, use the global setting
            return self.enable_reasoning

        # Smart mode: analyze the last user message
        last_message = next(
            (
                msg.get("content", "")
                for msg in reversed(messages)
                if msg.get("role") == "user"
            ),
            "",
        ) or ""  # tolerate an explicit None content
        last_message_lower = last_message.lower()

        # Simple queries that don't need reasoning
        if _SIMPLE_QUERY_RE.search(last_message_lower):
            logger.debug("Simple query detected, disabling reasoning", query=last_message[:50])
            return False

        # Complex queries that benefit from reasoning
        if _COMPLEX_QUERY_RE.search(last_message_lower):
            logger.debug("Complex query detected, enabling reasoning", query=last_message[:50])
            return True

        # Default: disable reasoning for speed
        return False

    async def _run_blocking(self, func: Callable[[], Any]) -> Any:
        """Run a blocking SDK call in a worker thread, bounded by ``self.timeout``.

        The event loop stays free while the request is in flight. On timeout
        the caller is released immediately; the worker thread cannot be
        interrupted and is left to finish in the background.

        Raises:
            TimeoutError: If ``func`` does not finish within ``self.timeout``
                seconds.
        """
        loop = asyncio.get_running_loop()
        executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
        try:
            return await asyncio.wait_for(
                loop.run_in_executor(executor, func),
                timeout=self.timeout,
            )
        except asyncio.TimeoutError as exc:
            raise TimeoutError(
                f"Request timed out after {self.timeout} seconds"
            ) from exc
        finally:
            # wait=False: never block on a request that is still running
            # (waiting here is what would defeat the timeout).
            executor.shutdown(wait=False)

    async def chat(
        self,
        messages: list[Message],
        temperature: float = 0.7,
        max_tokens: int = 2048,
        top_p: float = 0.9,
        use_cache: bool = True,
        enable_reasoning: Optional[bool] = None,
        **kwargs: Any
    ) -> LLMResponse:
        """Send chat completion request with caching support

        Args:
            messages: Conversation so far.
            temperature: Sampling temperature.
            max_tokens: Maximum number of tokens to generate.
            top_p: Nucleus-sampling parameter.
            use_cache: Look up / store the response in the response cache.
            enable_reasoning: Force thinking mode on or off; ``None`` lets
                ``_should_use_reasoning`` decide.
            **kwargs: Extra request parameters; these override the defaults
                built here.

        Returns:
            The model response. A cache hit has ``finish_reason="cache_hit"``
            and zeroed usage.

        Raises:
            TimeoutError: If the request exceeds ``self.timeout`` seconds.
            Exception: Any error raised by the SDK is logged and re-raised.
        """
        formatted_messages = [
            {"role": msg.role, "content": msg.content}
            for msg in messages
        ]

        # Try cache first
        # NOTE(review): the cache is keyed on model, messages and temperature
        # only; max_tokens, top_p, thinking mode and **kwargs are not part of
        # the key as called here - confirm that is intended.
        if use_cache:
            try:
                cache = get_response_cache()
                cached_response = await cache.get(
                    model=self.model,
                    messages=formatted_messages,
                    temperature=temperature
                )
                if cached_response is not None:
                    logger.info(
                        "Returning cached response",
                        model=self.model,
                        response_length=len(cached_response)
                    )
                    return LLMResponse(
                        content=cached_response,
                        finish_reason="cache_hit",
                        usage={"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
                    )
            except Exception as e:
                # Cache is best-effort: fall through to a live request.
                logger.warning("Cache check failed", error=str(e))

        logger.info(
            "Sending chat request",
            model=self.model,
            message_count=len(messages),
            temperature=temperature
        )

        # Determine if reasoning mode should be used
        use_reasoning = (
            enable_reasoning
            if enable_reasoning is not None
            else self._should_use_reasoning(formatted_messages)
        )
        if use_reasoning:
            logger.info("Reasoning mode enabled for this request")

        def _make_request():
            request_params = {
                "model": self.model,
                "messages": formatted_messages,
                "temperature": temperature,
                "max_tokens": max_tokens,
                "top_p": top_p,
            }

            # Add thinking mode control
            # Format: {"thinking": {"type": "disabled"}} or {"type": "enabled"}
            if use_reasoning:
                request_params["thinking"] = {"type": "enabled"}
                logger.info("Thinking mode: enabled", request_params={"thinking": {"type": "enabled"}})
            else:
                request_params["thinking"] = {"type": "disabled"}
                logger.info("Thinking mode: disabled", request_params={"thinking": {"type": "disabled"}})

            request_params.update(kwargs)
            return self._client.chat.completions.create(**request_params)

        try:
            response = await self._run_blocking(_make_request)

            choice = response.choices[0]
            content = choice.message.content
            usage = _extract_usage(response)

            logger.info(
                "Chat response received",
                finish_reason=choice.finish_reason,
                content_length=len(content) if content else 0,
                usage=usage
            )

            if not content:
                logger.warning("LLM returned empty content")

            # Cache the response
            if use_cache and content:
                try:
                    cache = get_response_cache()
                    await cache.set(
                        model=self.model,
                        messages=formatted_messages,
                        response=content,
                        temperature=temperature
                    )
                except Exception as e:
                    # Cache is best-effort: a failed write must not fail the call.
                    logger.warning("Failed to cache response", error=str(e))

            return LLMResponse(
                content=content or "",
                finish_reason=choice.finish_reason,
                usage=usage
            )
        except TimeoutError:
            logger.error("Chat request timed out", timeout=self.timeout)
            raise
        except Exception as e:
            logger.error("Chat request failed", error=str(e))
            raise

    async def chat_with_tools(
        self,
        messages: list[Message],
        tools: list[dict[str, Any]],
        temperature: float = 0.7,
        **kwargs: Any
    ) -> tuple[LLMResponse, None]:
        """Send chat completion request with tool calling

        Args:
            messages: Conversation so far.
            tools: Tool definitions passed through to the API.
            temperature: Sampling temperature.
            **kwargs: Extra request parameters passed through to the API.

        Returns:
            ``(response, None)``.

        Raises:
            TimeoutError: If the request exceeds ``self.timeout`` seconds.
            Exception: Any error raised by the SDK is logged and re-raised.
        """
        formatted_messages = [
            {"role": msg.role, "content": msg.content}
            for msg in messages
        ]

        logger.info(
            "Sending chat request with tools",
            model=self.model,
            tool_count=len(tools)
        )

        def _make_request():
            return self._client.chat.completions.create(
                model=self.model,
                messages=formatted_messages,
                tools=tools,
                temperature=temperature,
                **kwargs
            )

        try:
            # Run the blocking SDK call off the event loop, with the same
            # timeout that chat() applies.
            response = await self._run_blocking(_make_request)

            choice = response.choices[0]
            content = choice.message.content or ""

            # NOTE(review): any tool calls on choice.message are not returned;
            # the second tuple element is always None - confirm whether
            # callers are expected to receive them.
            return LLMResponse(
                content=content,
                finish_reason=choice.finish_reason,
                usage=_extract_usage(response)
            ), None
        except Exception as e:
            logger.error("Chat with tools request failed", error=str(e))
            raise


llm_client: Optional[ZhipuLLMClient] = None


def get_llm_client() -> ZhipuLLMClient:
    """Get or create global LLM client instance"""
    global llm_client
    if llm_client is None:
        llm_client = ZhipuLLMClient()
    return llm_client