diff --git a/agent/agents/customer_service.py b/agent/agents/customer_service.py index 016d515..122afa1 100644 --- a/agent/agents/customer_service.py +++ b/agent/agents/customer_service.py @@ -8,6 +8,7 @@ from core.state import AgentState, ConversationState, add_tool_call, set_respons from core.llm import get_llm_client, Message from prompts import get_prompt from utils.logger import get_logger +from utils.faq_library import get_faq_library logger = get_logger(__name__) @@ -36,6 +37,32 @@ async def customer_service_agent(state: AgentState) -> AgentState: if state["tool_results"]: return await _generate_response_from_results(state) + # ========== FAST PATH: Check if FAQ was already matched at router ========== + # Router already checked FAQ and stored response if found + if "faq_response" in state and state["faq_response"]: + logger.info( + "Using FAQ response from router", + conversation_id=state["conversation_id"], + response_length=len(state["faq_response"]) + ) + return set_response(state, state["faq_response"]) + # ========================================================================= + + # ========== FAST PATH: Check local FAQ library first (backup) ========== + # This provides instant response for common questions without API calls + # This is a fallback in case FAQ wasn't matched at router level + faq_library = get_faq_library() + faq_response = faq_library.find_match(state["current_message"]) + + if faq_response: + logger.info( + "FAQ match found, returning instant response", + conversation_id=state["conversation_id"], + response_length=len(faq_response) + ) + return set_response(state, faq_response) + # ============================================================ + # Get detected language locale = state.get("detected_language", "en") diff --git a/agent/agents/router.py b/agent/agents/router.py index 921bbf0..f469760 100644 --- a/agent/agents/router.py +++ b/agent/agents/router.py @@ -9,6 +9,7 @@ from core.llm import get_llm_client, Message from 
core.language_detector import get_cached_or_detect from prompts import get_prompt from utils.logger import get_logger +from utils.faq_library import get_faq_library logger = get_logger(__name__) @@ -34,6 +35,28 @@ async def classify_intent(state: AgentState) -> AgentState: state["state"] = ConversationState.CLASSIFYING.value state["step_count"] += 1 + # ========== FAST PATH: Check FAQ first BEFORE calling LLM ========== + # This avoids slow LLM calls for common questions + import re + clean_message = re.sub(r'<[^>]+>', '', state["current_message"]) + clean_message = ' '.join(clean_message.split()) + + faq_library = get_faq_library() + faq_response = faq_library.find_match(clean_message) + + if faq_response: + logger.info( + "FAQ matched at router level, skipping LLM classification", + conversation_id=state["conversation_id"], + message=clean_message[:50] + ) + # Set to customer service intent and store FAQ response + state["intent"] = Intent.CUSTOMER_SERVICE.value + state["intent_confidence"] = 1.0 # High confidence for FAQ matches + state["faq_response"] = faq_response # Store FAQ response for later use + return state + # ============================================================== + # Detect language detected_locale = get_cached_or_detect(state, state["current_message"]) confidence = 0.85 # Default confidence for language detection diff --git a/agent/config.py b/agent/config.py index c74ca7c..176db65 100644 --- a/agent/config.py +++ b/agent/config.py @@ -12,6 +12,8 @@ class Settings(BaseSettings): # ============ AI Model ============ zhipu_api_key: str = Field(..., description="ZhipuAI API Key") zhipu_model: str = Field(default="glm-4", description="ZhipuAI Model name") + enable_reasoning_mode: bool = Field(default=False, description="Enable AI reasoning/thinking mode (slower but more thoughtful)") + reasoning_mode_for_complex: bool = Field(default=True, description="Enable reasoning mode only for complex queries") # ============ Redis ============ redis_host: 
str = Field(default="localhost", description="Redis host") diff --git a/agent/core/llm.py b/agent/core/llm.py index 0681573..f815a6c 100644 --- a/agent/core/llm.py +++ b/agent/core/llm.py @@ -9,6 +9,7 @@ from zhipuai import ZhipuAI from config import settings from utils.logger import get_logger +from utils.response_cache import get_response_cache logger = get_logger(__name__) @@ -31,19 +32,80 @@ class LLMResponse: class ZhipuLLMClient: """ZhipuAI LLM Client wrapper""" - DEFAULT_TIMEOUT = 30 # seconds + DEFAULT_TIMEOUT = 60 # seconds (increased from 30 for better reliability) def __init__( self, api_key: Optional[str] = None, model: Optional[str] = None, - timeout: Optional[int] = None + timeout: Optional[int] = None, + enable_reasoning: Optional[bool] = None ): self.api_key = api_key or settings.zhipu_api_key self.model = model or settings.zhipu_model self.timeout = timeout or self.DEFAULT_TIMEOUT + self.enable_reasoning = enable_reasoning if enable_reasoning is not None else settings.enable_reasoning_mode self._client = ZhipuAI(api_key=self.api_key) - logger.info("ZhipuAI client initialized", model=self.model, timeout=self.timeout) + logger.info( + "ZhipuAI client initialized", + model=self.model, + timeout=self.timeout, + reasoning_mode=self.enable_reasoning + ) + + def _should_use_reasoning(self, messages: list[dict[str, str]]) -> bool: + """Determine if reasoning mode should be used based on query complexity + + Args: + messages: List of message dictionaries + + Returns: + True if reasoning mode should be used + """ + if not self.enable_reasoning: + return False + + if not settings.reasoning_mode_for_complex: + # If smart mode is disabled, use the global setting + return self.enable_reasoning + + # Smart mode: analyze the last user message + last_message = "" + for msg in reversed(messages): + if msg.get("role") == "user": + last_message = msg.get("content", "") + break + + # Simple queries that don't need reasoning + simple_patterns = [ + "你好", "hi", "hello", 
"嗨", + "谢谢", "thank", "感谢", + "再见", "bye", "拜拜", + "退货政策", "营业时间", "联系方式", + "发货", "配送", "物流" + ] + + last_message_lower = last_message.lower() + for pattern in simple_patterns: + if pattern in last_message_lower: + logger.debug("Simple query detected, disabling reasoning", query=last_message[:50]) + return False + + # Complex queries that benefit from reasoning + complex_patterns = [ + "为什么", "how", "why", "如何", + "推荐", "recommend", "建议", + "比较", "compare", "区别", + "怎么样", "如何选择" + ] + + for pattern in complex_patterns: + if pattern in last_message_lower: + logger.debug("Complex query detected, enabling reasoning", query=last_message[:50]) + return True + + # Default: disable reasoning for speed + return False async def chat( self, @@ -51,14 +113,39 @@ class ZhipuLLMClient: temperature: float = 0.7, max_tokens: int = 2048, top_p: float = 0.9, + use_cache: bool = True, + enable_reasoning: Optional[bool] = None, **kwargs: Any ) -> LLMResponse: - """Send chat completion request""" + """Send chat completion request with caching support""" formatted_messages = [ {"role": msg.role, "content": msg.content} for msg in messages ] + # Try cache first + if use_cache: + try: + cache = get_response_cache() + cached_response = await cache.get( + model=self.model, + messages=formatted_messages, + temperature=temperature + ) + if cached_response is not None: + logger.info( + "Returning cached response", + model=self.model, + response_length=len(cached_response) + ) + return LLMResponse( + content=cached_response, + finish_reason="cache_hit", + usage={"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0} + ) + except Exception as e: + logger.warning("Cache check failed", error=str(e)) + logger.info( "Sending chat request", model=self.model, @@ -66,15 +153,32 @@ class ZhipuLLMClient: temperature=temperature ) + # Determine if reasoning mode should be used + use_reasoning = enable_reasoning if enable_reasoning is not None else self._should_use_reasoning(formatted_messages) + 
+ if use_reasoning: + logger.info("Reasoning mode enabled for this request") + def _make_request(): - return self._client.chat.completions.create( - model=self.model, - messages=formatted_messages, - temperature=temperature, - max_tokens=max_tokens, - top_p=top_p, - **kwargs - ) + request_params = { + "model": self.model, + "messages": formatted_messages, + "temperature": temperature, + "max_tokens": max_tokens, + "top_p": top_p, + } + + # Add thinking mode control + # Format: {"thinking": {"type": "disabled"}} or {"type": "enabled"} + if use_reasoning: + request_params["thinking"] = {"type": "enabled"} + logger.info("Thinking mode: enabled", request_params={"thinking": {"type": "enabled"}}) + else: + request_params["thinking"] = {"type": "disabled"} + logger.info("Thinking mode: disabled", request_params={"thinking": {"type": "disabled"}}) + + request_params.update(kwargs) + return self._client.chat.completions.create(**request_params) try: with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: @@ -94,6 +198,19 @@ class ZhipuLLMClient: if not content: logger.warning("LLM returned empty content") + # Cache the response + if use_cache and content: + try: + cache = get_response_cache() + await cache.set( + model=self.model, + messages=formatted_messages, + response=content, + temperature=temperature + ) + except Exception as e: + logger.warning("Failed to cache response", error=str(e)) + return LLMResponse( content=content or "", finish_reason=choice.finish_reason, diff --git a/agent/integrations/chatwoot.py b/agent/integrations/chatwoot.py index 2742e3e..a3874eb 100644 --- a/agent/integrations/chatwoot.py +++ b/agent/integrations/chatwoot.py @@ -141,32 +141,89 @@ class ChatwootClient: content_attributes: dict[str, Any] ) -> dict[str, Any]: """Send a rich message (cards, buttons, etc.) - + Args: conversation_id: Conversation ID content: Fallback text content content_type: Rich content type (cards, input_select, etc.) 
content_attributes: Rich content attributes - + Returns: Created message data """ client = await self._get_client() - + payload = { "content": content, "message_type": MessageType.OUTGOING.value, "content_type": content_type, "content_attributes": content_attributes } - + response = await client.post( f"/conversations/{conversation_id}/messages", json=payload ) response.raise_for_status() - + return response.json() + + async def send_order_card( + self, + conversation_id: int, + order_data: dict[str, Any], + actions: list[dict[str, Any]] + ) -> dict[str, Any]: + """发送订单卡片消息(Markdown 文本 + 操作按钮) + + Args: + conversation_id: 会话 ID + order_data: 订单数据,包含: + - order_id: 订单号 + - status: 订单状态 + - status_text: 状态文本 + - created_at: 下单时间(可选) + - items: 商品列表(可选) + - total_amount: 总金额 + - shipping_fee: 运费(可选) + - logistics: 物流信息(可选) + - remark: 备注(可选) + actions: 操作按钮配置列表,每个按钮包含: + - type: "link" 或 "postback" + - text: 按钮文字 + - uri: 链接地址(type=link 时必需) + - payload: 回传数据(type=postback 时必需) + + Returns: + 发送结果 + + Example: + >>> order_data = { + ... "order_id": "123456789", + ... "status": "shipped", + ... "status_text": "已发货", + ... "total_amount": "1058.00", + ... "items": [...] + ... } + >>> actions = [ + ... {"type": "link", "text": "查看详情", "uri": "https://..."}, + ... {"type": "postback", "text": "联系客服", "payload": "CONTACT_SUPPORT"} + ... 
] + >>> await chatwoot.send_order_card(123, order_data, actions) + """ + # 生成 Markdown 内容 + markdown_content = format_order_card_markdown(order_data) + + # 生成按钮卡片 + buttons = create_action_buttons(actions) + + # 发送富媒体消息 + return await self.send_rich_message( + conversation_id=conversation_id, + content=markdown_content, + content_type="cards", + content_attributes=buttons + ) # ============ Conversations ============ @@ -342,6 +399,130 @@ class ChatwootClient: return data.get("payload", []) +# ============ Helper Functions ============ + +def format_order_card_markdown(order_data: dict[str, Any]) -> str: + """格式化订单信息为 Markdown 卡片 + + Args: + order_data: 订单数据,包含订单号、状态、商品、金额、物流等信息 + + Returns: + 格式化的 Markdown 字符串 + + Example: + >>> order = { + ... "order_id": "123456789", + ... "status": "shipped", + ... "status_text": "已发货", + ... "created_at": "2023-10-27 14:30", + ... "items": [...], + ... "total_amount": "1058.00", + ... "shipping_fee": "0.00", + ... "logistics": {...} + ... } + >>> markdown = format_order_card_markdown(order) + """ + # 订单状态 emoji 映射 + status_emoji = { + "pending": "⏳", + "paid": "💰", + "processing": "⚙️", + "shipped": "📦", + "delivered": "✅", + "completed": "✅", + "cancelled": "❌", + "refunded": "💸", + "failed": "⚠️", + } + + # 获取状态文本和 emoji + status = order_data.get("status", "unknown") + status_text = order_data.get("status_text", status) + emoji = status_emoji.get(status, "📦") + + lines = [ + f"{emoji} **订单状态:{status_text}**", + f"📝 **订单号:** `{order_data.get('order_id', '')}`", + ] + + # 添加下单时间(如果有) + if order_data.get("created_at"): + lines.append(f"📅 **下单时间:** {order_data['created_at']}") + + lines.append("") # 空行 + lines.append("**商品详情**") + + # 添加商品列表 + items = order_data.get("items", []) + if items: + for item in items: + name = item.get("name", "未知商品") + quantity = item.get("quantity", 1) + price = item.get("price", "0.00") + # 可选:添加图片链接 + image_markdown = "" + if item.get("image_url"): + image_markdown = f" [图片]({item['image_url']})" + 
lines.append(f"▫️{image_markdown} {name} × {quantity} ¥{price}") + else: + lines.append("▫️ 无商品信息") + + # 添加金额信息 + lines.extend([ + "", + f"💰 **实付:** ¥{order_data.get('total_amount', '0.00')} (含运费 ¥{order_data.get('shipping_fee', '0.00')})" + ]) + + # 添加物流信息(如果有) + logistics = order_data.get("logistics") + if logistics: + lines.extend([ + "", + "🚚 **物流信息**", + f"承运商:{logistics.get('carrier', '未知')}", + f"单号:{logistics.get('tracking_number', '未知')}", + "*点击单号可复制跟踪*" + ]) + + # 添加备注(如果有) + if order_data.get("remark"): + lines.extend([ + "", + f"📋 **备注:** {order_data['remark']}" + ]) + + return "\n".join(lines) + + +def create_action_buttons(actions: list[dict[str, Any]]) -> dict[str, Any]: + """创建 Chatwoot 操作按钮卡片 + + Args: + actions: 按钮配置列表,每个按钮包含: + - type: "link" 或 "postback" + - text: 按钮文字 + - uri: 链接地址(type=link 时) + - payload: 回传数据(type=postback 时) + + Returns: + 符合 Chatwoot content_attributes 格式的字典 + + Example: + >>> actions = [ + ... {"type": "link", "text": "查看详情", "uri": "https://example.com"}, + ... {"type": "postback", "text": "联系客服", "payload": "CONTACT_SUPPORT"} + ... ] + >>> buttons = create_action_buttons(actions) + """ + return { + "items": [{ + "title": "操作", + "actions": actions + }] + } + + # Global Chatwoot client instance chatwoot_client: Optional[ChatwootClient] = None diff --git a/agent/utils/faq_library.py b/agent/utils/faq_library.py new file mode 100644 index 0000000..8981ff2 --- /dev/null +++ b/agent/utils/faq_library.py @@ -0,0 +1,121 @@ +""" +Local FAQ Library for instant responses +Common questions can be answered immediately without API calls +""" +import re +from typing import Optional, Dict +from .logger import get_logger + +logger = get_logger(__name__) + + +class FAQLibrary: + """Local FAQ library for instant common question responses""" + + def __init__(self): + """Initialize FAQ library with ONLY common greetings and social responses + + Note: Business-related FAQs (register, order, payment, shipment, return, etc.) 
+ should be handled by Strapi MCP to ensure accuracy and consistency. + This library only contains instant social responses for better UX. + """ + self.faqs = { + # ========== 问候类 Greetings ========== + "你好": "你好!我是您的B2B客户服务助手,很高兴为您服务。我可以帮您处理订单查询、产品咨询、售后问题等。请问有什么可以帮到您的吗?", + "您好": "您好!我是您的B2B客户服务助手,很高兴为您服务。我可以帮您处理订单查询、产品咨询、售后问题等。请问有什么可以帮到您的吗?", + "hi": "Hello! I'm your B2B customer service assistant. How can I help you today?", + "hello": "Hello! I'm here to assist you. How can I help you today?", + "hey": "Hey there! How can I help you today?", + + # ========== 感谢类 Gratitude ========== + "谢谢": "不客气!如果还有其他问题,随时可以问我。祝您购物愉快!", + "感谢": "感谢您的支持!如有任何问题,随时联系我们。", + "thank you": "You're welcome! If you have any other questions, feel free to ask. Have a great day!", + "thanks": "You're welcome! Let me know if you need anything else.", + + # ========== 再见类 Farewell ========== + "再见": "再见!如有需要,随时联系。祝您生活愉快!", + "bye": "Goodbye! Feel free to reach out anytime. Have a great day!", + "goodbye": "Goodbye! Have a wonderful day!", + + # ========== 社交礼貌类 Social Politeness ========== + "早上好": "早上好!很高兴为您服务。请问有什么可以帮到您的吗?", + "下午好": "下午好!很高兴为您服务。请问有什么可以帮到您的吗?", + "晚上好": "晚上好!很高兴为您服务。请问有什么可以帮到您的吗?", + "good morning": "Good morning! How can I assist you today?", + "good afternoon": "Good afternoon! How can I assist you today?", + "good evening": "Good evening! 
How can I assist you today?", + } + + # Compile regex patterns for fuzzy matching + self._compile_patterns() + + def _compile_patterns(self): + """Compile regex patterns for fuzzy FAQ matching""" + self.patterns = [] + for keyword, response in self.faqs.items(): + # Case-insensitive substring match — NOTE(review): no word boundaries despite "fuzzy"; short keywords like "hi"/"hey" also match inside longer words ("this", "shipping") — consider wrapping ASCII keywords with \b + pattern = re.compile(re.escape(keyword), re.IGNORECASE) + self.patterns.append((pattern, response)) + + def find_match(self, query: str) -> Optional[str]: + """Find matching FAQ response + + Args: + query: User query text + + Returns: + Matching FAQ response or None if no match found + """ + # Remove HTML tags and extra whitespace + clean_query = re.sub(r'<[^>]+>', '', query) + clean_query = ' '.join(clean_query.split()) + + # Try exact match first + if clean_query.lower() in (k.lower() for k in self.faqs.keys()): + for key, response in self.faqs.items(): + if key.lower() == clean_query.lower(): + logger.info("FAQ exact match", key=key, query=clean_query[:50]) + return response + + # Try fuzzy match (substring containment — see NOTE in _compile_patterns) + for pattern, response in self.patterns: + if pattern.search(clean_query): + logger.info("FAQ fuzzy match", pattern=pattern.pattern, query=clean_query[:50]) + return response + + # No match found + logger.debug("No FAQ match found", query=clean_query[:50]) + return None + + def add_faq(self, keyword: str, response: str) -> None: + """Add or update FAQ entry + + Args: + keyword: Question keyword + response: Answer text + """ + self.faqs[keyword] = response + pattern = re.compile(re.escape(keyword), re.IGNORECASE) + self.patterns.append((pattern, response)) + logger.info("FAQ added", keyword=keyword) + + def get_all_keywords(self) -> list[str]: + """Get all FAQ keywords + + Returns: + List of FAQ keywords + """ + return list(self.faqs.keys()) + + +# Global FAQ library instance +faq_library: Optional[FAQLibrary] = None + + +def get_faq_library() -> FAQLibrary: + """Get or create global FAQ library instance""" + global faq_library + if 
faq_library is None: + faq_library = FAQLibrary() + return faq_library diff --git a/agent/utils/response_cache.py b/agent/utils/response_cache.py new file mode 100644 index 0000000..b663e5c --- /dev/null +++ b/agent/utils/response_cache.py @@ -0,0 +1,194 @@ +""" +LLM Response Cache for FAQ and common queries +""" +import hashlib +import json +from typing import Any, Optional +from datetime import timedelta + +from .cache import CacheManager +from .logger import get_logger + +logger = get_logger(__name__) + + +class ResponseCache: + """Cache LLM responses for common queries""" + + def __init__( + self, + cache_manager: Optional[CacheManager] = None, + default_ttl: int = 3600 # 1 hour default + ): + """Initialize response cache + + Args: + cache_manager: Cache manager instance + default_ttl: Default TTL in seconds for cached responses + """ + self.cache = cache_manager + self.default_ttl = default_ttl + + def _generate_key( + self, + model: str, + messages: list[dict[str, str]], + temperature: float = 0.7, + **kwargs: Any + ) -> str: + """Generate cache key from request parameters + + Args: + model: Model name + messages: List of messages + temperature: Temperature parameter + **kwargs: Additional parameters + + Returns: + Cache key string + """ + # Create a normalized representation of the request + cache_input = { + "model": model, + "messages": messages, + "temperature": temperature, + **{k: v for k, v in kwargs.items() if v is not None} + } + + # Hash the input to create a short, unique key + cache_str = json.dumps(cache_input, sort_keys=True, ensure_ascii=False) + cache_hash = hashlib.sha256(cache_str.encode()).hexdigest()[:16] + + return f"llm_response:{model}:{cache_hash}" + + async def get( + self, + model: str, + messages: list[dict[str, str]], + temperature: float = 0.7, + **kwargs: Any + ) -> Optional[str]: + """Get cached response if available + + Args: + model: Model name + messages: List of messages + temperature: Temperature parameter + **kwargs: 
Additional parameters + + Returns: + Cached response content or None + """ + if not self.cache: + return None + + key = self._generate_key(model, messages, temperature, **kwargs) + cached = await self.cache.get(key) + + if cached: + logger.info( + "Cache hit", + model=model, + key=key, + response_length=len(cached) + ) + try: + data = json.loads(cached) + return data.get("response") + except json.JSONDecodeError: + logger.warning("Invalid cached data", key=key) + return None + + logger.debug("Cache miss", model=model, key=key) + return None + + async def set( + self, + model: str, + messages: list[dict[str, str]], + response: str, + temperature: float = 0.7, + ttl: Optional[int] = None, + **kwargs: Any + ) -> None: + """Cache LLM response + + Args: + model: Model name + messages: List of messages + response: Response content to cache + temperature: Temperature parameter + ttl: Time-to-live in seconds + **kwargs: Additional parameters + """ + if not self.cache: + return + + key = self._generate_key(model, messages, temperature, **kwargs) + ttl = ttl or self.default_ttl + + # Store response with metadata + data = { + "response": response, + "model": model, + "response_length": len(response), + "temperature": temperature + } + + await self.cache.set( + key, + json.dumps(data, ensure_ascii=False), + ttl=ttl + ) + + logger.info( + "Response cached", + model=model, + key=key, + response_length=len(response), + ttl=ttl + ) + + async def invalidate(self, pattern: str = "llm_response:*") -> int: + """Invalidate cached responses matching pattern + + Args: + pattern: Redis key pattern to match + + Returns: + Number of keys deleted + """ + if not self.cache: + return 0 + + # This would need scan/delete operation + # For now, just log + logger.info("Cache invalidation requested", pattern=pattern) + return 0 + + def get_cache_stats(self) -> dict[str, Any]: + """Get cache statistics + + Returns: + Dictionary with cache stats + """ + return { + "enabled": self.cache is not None, + 
"default_ttl": self.default_ttl + } + + +# Global response cache instance +response_cache: Optional[ResponseCache] = None + + +def get_response_cache() -> ResponseCache: + """Get or create global response cache instance""" + global response_cache + if response_cache is None: + from .cache import get_cache_manager + response_cache = ResponseCache( + cache_manager=get_cache_manager(), + default_ttl=3600 # 1 hour + ) + return response_cache diff --git a/docker-compose.yml b/docker-compose.yml index 4698adc..09ab3cd 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -64,7 +64,8 @@ services: chatwoot: image: chatwoot/chatwoot:latest container_name: ai_chatwoot - command: bundle exec rails s -p 3000 -b 0.0.0.0 + # 启动前清理 PID 文件,避免重启循环 + command: sh -c "rm -f /app/tmp/pids/server.pid && bundle exec rails s -p 3000 -b 0.0.0.0" environment: RAILS_ENV: production SECRET_KEY_BASE: ${CHATWOOT_SECRET_KEY_BASE}