151 lines
4.1 KiB
Python
151 lines
4.1 KiB
Python
"""
Language Detection Module

Automatically detects user message language and maps to Strapi-supported locales.
"""
|
||
|
|
from typing import Optional

from langdetect import LangDetectException, detect, detect_langs

from utils.logger import get_logger
|
||
|
|
|
||
|
|
# Module-level logger, created by the project's logging helper with this module's name.
logger = get_logger(__name__)
|
||
|
|
|
||
|
|
# Locales available in Strapi; anything else has to be mapped onto one of these.
SUPPORTED_LOCALES = ["en", "nl", "de", "es", "fr", "it", "tr"]

# Lookup table from a detected language code to the locale that should be used.
LOCALE_MAP = {
    # Every supported locale maps to itself.
    **{code: code for code in SUPPORTED_LOCALES},
    # Languages without a locale of their own fall back to English.
    **dict.fromkeys(
        (
            "af",  # Afrikaans
            "no",  # Norwegian
            "sv",  # Swedish
            "da",  # Danish
            "pl",  # Polish
            "pt",  # Portuguese
            "ru",  # Russian
            "zh",  # Chinese
            "ja",  # Japanese
            "ko",  # Korean
            "ar",  # Arabic
            "hi",  # Hindi
        ),
        "en",
    ),
}

# Detections scoring below this confidence are treated as unreliable.
MIN_CONFIDENCE = 0.7

# Messages shorter than this many characters are not run through detection.
MIN_LENGTH = 10
|
||
|
|
|
||
|
|
|
||
|
|
def detect_language(text: str) -> tuple[str, float]:
    """Detect the language of ``text`` and map it to a Strapi locale.

    Args:
        text: Input text to detect language from. ``None``, empty text, or
            text shorter than ``MIN_LENGTH`` characters (after stripping
            whitespace) is not analysed.

    Returns:
        Tuple of (locale_code, confidence_score)
        locale_code: Strapi locale (en, nl, de, etc.); "en" when detection
            is skipped or fails
        confidence_score: Probability langdetect assigned to the detected
            language (0-1), 0.0 if detection was skipped or failed. The
            score describes the detected language, which may differ from the
            returned locale when an unsupported language is mapped to "en".
    """
    # Guard against None/empty input as well as messages that are too short.
    if not text or len(text.strip()) < MIN_LENGTH:
        logger.debug("Message too short for reliable detection", length=len(text or ""))
        return "en", 0.0

    try:
        # detect_langs (unlike detect) exposes a probability per candidate,
        # so the returned confidence is real rather than a fixed placeholder.
        candidates = detect_langs(text)
    except LangDetectException as e:
        logger.warning("Language detection failed", error=str(e))
        return "en", 0.0

    if not candidates:
        # No candidate passed langdetect's own probability cut-off.
        logger.warning("Language detection failed", error="no language candidates")
        return "en", 0.0

    # Pick the most probable candidate explicitly instead of relying on order.
    best = max(candidates, key=lambda candidate: candidate.prob)
    logger.debug("Language detected", language=best.lang, text_length=len(text))

    # Map to Strapi locale
    return map_to_locale(best.lang), float(best.prob)
|
||
|
|
|
||
|
|
|
||
|
|
def map_to_locale(lang_code: str) -> str:
    """Map a detected language code to a Strapi locale.

    Matching is case-insensitive and treats "_" like "-". When the full code
    is unknown, its primary language subtag is tried as well (e.g. "zh-cn"
    -> "zh", "nl_NL" -> "nl"), so region-qualified codes still resolve.

    Args:
        lang_code: Language code, ideally ISO 639-1 (e.g., "en", "nl", "de").
            A non-string value is treated as an unknown language.

    Returns:
        Strapi locale code, or "en" as default if not supported
    """
    raw_code = lang_code if isinstance(lang_code, str) else ""
    normalized = raw_code.strip().lower().replace("_", "-")
    primary = normalized.split("-", 1)[0]

    # Try the full code first, then its primary subtag. For each candidate a
    # directly supported locale wins over a LOCALE_MAP entry.
    for candidate in (normalized, primary):
        if candidate in SUPPORTED_LOCALES:
            return candidate
        if candidate in LOCALE_MAP:
            locale = LOCALE_MAP[candidate]
            break
    else:
        locale = "en"

    if locale != normalized and locale == "en":
        logger.info(
            "Unsupported language mapped to default",
            detected_language=lang_code,
            mapped_locale=locale,
        )

    return locale
|
||
|
|
|
||
|
|
|
||
|
|
def get_cached_or_detect(state, text: str) -> str:
    """Get language from cache or detect from text

    Priority:
    1. Use state["detected_language"] if set
    2. Use state["context"]["language"] if set
    3. Detect from text, falling back to "en" when the detection confidence
       is above 0 but below MIN_CONFIDENCE

    Args:
        state: Agent state; read through ``.get`` as a mapping. This function
            does not write the detected locale back into it.
        text: Input text to detect language from

    Returns:
        Locale code taken from the state, or derived from ``text``
    """
    # Check state first
    cached_language = state.get("detected_language")
    if cached_language:
        logger.debug("Using cached language from state", language=cached_language)
        return cached_language

    # Check context cache; "context" may be missing or explicitly None.
    context = state.get("context") or {}
    context_language = context.get("language")
    if context_language:
        logger.debug("Using cached language from context", language=context_language)
        return context_language

    # Detect from text
    locale, confidence = detect_language(text)

    # Confidence 0 is skipped here: the condition requires a positive score.
    if 0 < confidence < MIN_CONFIDENCE:
        logger.warning(
            "Low detection confidence, using default",
            locale=locale,
            confidence=confidence,
        )
        # Actually use the default, as the log message states.
        return "en"

    return locale
|
||
|
|
|
||
|
|
|
||
|
|
def is_supported_locale(locale: str) -> bool:
    """Report whether ``locale`` is one of the Strapi-supported locales.

    Args:
        locale: Locale code to look up; compared exactly as given

    Returns:
        True when the code appears in ``SUPPORTED_LOCALES``, False otherwise
    """
    supported = locale in SUPPORTED_LOCALES
    return supported
|