# assistant/agent/core/language_detector.py

"""
Language Detection Module
Automatically detects user message language and maps to Strapi-supported locales.
"""
from typing import Optional
from langdetect import detect, LangDetectException
from utils.logger import get_logger
logger = get_logger(__name__)
# Locale codes supported by Strapi.
SUPPORTED_LOCALES = ["en", "nl", "de", "es", "fr", "it", "tr"]

# Detected languages without a Strapi locale of their own; each one is
# mapped to English.
# NOTE(review): langdetect appears to report Chinese as "zh-cn"/"zh-tw", so
# the "zh" entry may never be hit directly -- confirm against the detector.
_ENGLISH_FALLBACK_LANGUAGES = (
    "af",  # Afrikaans
    "no",  # Norwegian
    "sv",  # Swedish
    "da",  # Danish
    "pl",  # Polish
    "pt",  # Portuguese
    "ru",  # Russian
    "zh",  # Chinese
    "ja",  # Japanese
    "ko",  # Korean
    "ar",  # Arabic
    "hi",  # Hindi
)

# Detected language code -> Strapi locale. Supported locales map to
# themselves; the fallback languages above map to "en".
LOCALE_MAP = {
    **{code: code for code in SUPPORTED_LOCALES},
    **dict.fromkeys(_ENGLISH_FALLBACK_LANGUAGES, "en"),
}

# Minimum confidence threshold
MIN_CONFIDENCE = 0.7

# Minimum message length (in characters) for reliable detection
MIN_LENGTH = 10
def detect_language(text: str) -> tuple[str, float]:
    """Detect the language of ``text`` and map it to a Strapi locale.

    Args:
        text: Input text to detect the language from. ``None`` or any other
            non-string value is treated like a too-short message.

    Returns:
        Tuple of (locale_code, confidence_score)
        locale_code: Strapi locale (en, nl, de, etc.); "en" when detection
            is skipped or fails.
        confidence_score: Probability langdetect reports for the best
            candidate language (0-1); 0.0 when detection is skipped or fails.
    """
    # detect_langs is the langdetect entry point that exposes per-language
    # probabilities; detect() only returns the winning language code.
    # Imported locally so this function does not rely on a file-level change.
    from langdetect import detect_langs

    # Guard against None / non-string input so callers get the documented
    # fallback instead of an AttributeError from .strip().
    if not isinstance(text, str):
        logger.debug("Message too short for reliable detection", length=0)
        return "en", 0.0

    # Check minimum length (whitespace-only padding does not count)
    if len(text.strip()) < MIN_LENGTH:
        logger.debug("Message too short for reliable detection", length=len(text))
        return "en", 0.0

    # Keep the try body limited to the call that can raise.
    try:
        candidates = detect_langs(text)
    except LangDetectException as e:
        logger.warning("Language detection failed", error=str(e))
        return "en", 0.0

    if not candidates:
        # No candidate language was returned; treat it as a failed detection.
        logger.warning("Language detection failed", error="no language candidates")
        return "en", 0.0

    # Pick the most probable candidate explicitly rather than relying on the
    # ordering of the returned list.
    best = max(candidates, key=lambda candidate: candidate.prob)
    logger.debug("Language detected", language=best.lang, text_length=len(text))

    # Map to Strapi locale and report the real detection probability so that
    # MIN_CONFIDENCE comparisons made by callers are meaningful.
    return map_to_locale(best.lang), float(best.prob)
def map_to_locale(lang_code: str) -> str:
    """Translate a detected language code into a Strapi locale.

    Args:
        lang_code: ISO 639-1 language code (e.g., "en", "nl", "de")

    Returns:
        ``lang_code`` itself when it is a supported locale; otherwise the
        LOCALE_MAP entry for it, or "en" when there is no entry.
    """
    if lang_code not in SUPPORTED_LOCALES:
        mapped = LOCALE_MAP.get(lang_code, "en")

        # Record every case where an unsupported language ends up on the
        # English default.
        fell_back_to_default = mapped != lang_code and mapped == "en"
        if fell_back_to_default:
            logger.info(
                "Unsupported language mapped to default",
                detected_language=lang_code,
                mapped_locale=mapped,
            )
        return mapped

    # Supported locales pass through unchanged.
    return lang_code
def get_cached_or_detect(state, text: str) -> str:
    """Get language from cache or detect from text

    Priority:
        1. Use state["detected_language"] if set
        2. Use state["context"]["language"] if set
        3. Detect from text

    This function only reads ``state``; it does not store the detected
    locale back into it.

    Args:
        state: Agent state (dict-like; accessed via ``.get``)
        text: Input text to detect language from

    Returns:
        Locale code. Falls back to "en" when detection confidence is above
        zero but below MIN_CONFIDENCE.
    """
    # Check state first
    cached_language = state.get("detected_language")
    if cached_language:
        logger.debug("Using cached language from state", language=cached_language)
        return cached_language

    # Check context cache. ``or {}`` also covers a "context" key that is
    # present but set to None, which would otherwise raise AttributeError.
    context_language = (state.get("context") or {}).get("language")
    if context_language:
        logger.debug("Using cached language from context", language=context_language)
        return context_language

    # Detect from text
    locale, confidence = detect_language(text)

    # A confidence of exactly 0 means detect_language already returned its
    # own fallback, so only the in-between range needs handling here.
    if 0 < confidence < MIN_CONFIDENCE:
        logger.warning(
            "Low detection confidence, using default",
            locale=locale,
            confidence=confidence,
        )
        # Actually use the default, as the log message states; previously
        # the low-confidence locale was returned regardless.
        return "en"

    return locale
def is_supported_locale(locale: str) -> bool:
    """Report whether ``locale`` is one of the supported locales.

    Args:
        locale: Locale code to check

    Returns:
        True if ``locale`` appears in SUPPORTED_LOCALES, False otherwise
    """
    supported = locale in SUPPORTED_LOCALES
    return supported