"""Adapter for fetching and cleaning article wikitext from Russian Wikipedia."""

import asyncio
import re
from typing import NamedTuple
from urllib.parse import unquote, urlparse

import mwclient
import structlog
from mwclient.errors import InvalidPageTitle, LoginError

from ..models import AppConfig
from .base import BaseAdapter, CircuitBreaker, RateLimiter, with_retry

logger = structlog.get_logger()

# Pages whose stripped text is shorter than this are treated as missing.
_MIN_CONTENT_LENGTH = 100

# Markup removed by ``RuWikiAdapter._clean_wikitext``, applied in order.
# NOTE: the non-greedy ``.*?`` stops at the first closing delimiter, so
# templates or file links that contain nested ``{{...}}`` / ``[[...]]`` are
# only partially removed.
_WIKITEXT_NOISE_PATTERNS: tuple[re.Pattern[str], ...] = (
    re.compile(r"\{\{[Нн]авигация.*?\}\}", re.DOTALL),
    re.compile(r"\{\{[Кк]арточка.*?\}\}", re.DOTALL),
    re.compile(r"\{\{[Дд]исамбиг.*?\}\}", re.DOTALL),
    re.compile(r"\[\[[Кк]атегория:.*?\]\]"),
    re.compile(r"\[\[[Фф]айл:.*?\]\]", re.DOTALL),
    re.compile(r"\[\[[Ii]mage:.*?\]\]", re.DOTALL),
    # HTML comments. The original pattern here was an empty string (a no-op).
    re.compile(r"<!--.*?-->", re.DOTALL),
)

# Runs of blank (or whitespace-only) lines, collapsed to one blank line.
_BLANK_LINES_PATTERN = re.compile(r"\n\s*\n")


class WikiPageNotFoundError(Exception):
    """Raised when a page is missing, has an invalid title, or is too short."""


class WikiPageRedirectError(Exception):
    """Raised when the requested page is a redirect to another page."""


class WikiPageInfo(NamedTuple):
    """Title and wikitext of a fetched page, plus redirect metadata."""

    title: str
    content: str
    is_redirect: bool = False
    redirect_target: str | None = None


class RuWikiAdapter(BaseAdapter):
    """Fetches pages from ru.wikipedia.org through a lazily created mwclient site.

    Page fetches go through a rate limiter, a circuit breaker and a retry
    wrapper, all configured from the supplied ``AppConfig``.
    """

    def __init__(self, config: AppConfig) -> None:
        """Store the configuration and build the limiter and circuit breaker.

        No connection is opened here; the client is created on first use.
        """
        super().__init__("ruwiki_adapter")
        self.config = config
        self.rate_limiter = RateLimiter(
            max_concurrent=config.max_concurrent_wiki,
            name="ruwiki_limiter",
        )
        self.circuit_breaker = CircuitBreaker(
            failure_threshold=config.circuit_failure_threshold,
            recovery_timeout=config.circuit_recovery_timeout,
            name="ruwiki_circuit",
        )
        self._client: mwclient.Site | None = None

    async def _get_client(self) -> mwclient.Site:
        """Return the cached client, creating it in a worker thread if needed."""
        if self._client is None:
            self._client = await asyncio.to_thread(self._create_client)
        return self._client

    def _create_client(self) -> mwclient.Site:
        """Create the mwclient site and verify it with a ``siteinfo`` query.

        Raises:
            LoginError, ConnectionError: logged and re-raised unchanged.
        """
        # NOTE(review): this is the built-in ConnectionError; confirm that the
        # HTTP layer used by mwclient raises a subclass of it, otherwise
        # connection failures bypass this handler (they still propagate).
        try:
            site = mwclient.Site("ru.wikipedia.org")
            site.api("query", meta="siteinfo")
            self.logger.info("Соединение с RuWiki установлено")
            return site
        except (LoginError, ConnectionError) as e:
            self.logger.error("Ошибка подключения к RuWiki", error=str(e))
            raise

    @staticmethod
    def extract_title_from_url(url: str) -> str:
        """Extract the page title from a ``.../wiki/<title>`` Wikipedia URL.

        The title is percent-decoded and underscores become spaces. Titles
        that contain ``/`` (for example ``AC/DC``) are kept whole.

        Args:
            url: Absolute URL of a Wikipedia article.

        Returns:
            The human-readable page title.

        Raises:
            ValueError: If the host is not ``wikipedia.org`` or one of its
                subdomains, the path is not ``/wiki/<title>``, or the title
                is empty.
        """
        parsed = urlparse(url)

        # Compare the hostname itself: a substring test on netloc would also
        # accept hosts such as "wikipedia.org.example.com".
        host = (parsed.hostname or "").lower()
        if host != "wikipedia.org" and not host.endswith(".wikipedia.org"):
            raise ValueError(f"Не является URL википедии: {url}")

        # maxsplit=2 keeps any further "/" inside the title segment.
        path_parts = parsed.path.split("/", 2)
        if len(path_parts) < 3 or path_parts[1] != "wiki":
            raise ValueError(f"Неверный формат URL: {url}")

        # A trailing slash is dropped, as it was before titles with "/" were
        # supported.
        title = unquote(path_parts[2].rstrip("/")).replace("_", " ")
        if not title.strip():
            raise ValueError(f"Неверный формат URL: {url}")
        return title

    async def _fetch_page_content(self, title: str) -> WikiPageInfo:
        """Fetch the raw wikitext of ``title`` in a worker thread.

        Raises:
            WikiPageNotFoundError: If the page does not exist, the title is
                invalid, or the text is empty or shorter than
                ``_MIN_CONTENT_LENGTH`` characters after stripping.
            WikiPageRedirectError: If the page is a redirect with a
                resolvable target.
        """
        client = await self._get_client()

        def _sync_fetch() -> WikiPageInfo:
            try:
                page = client.pages[title]

                if not page.exists:
                    raise WikiPageNotFoundError(f"Страница '{title}' не найдена")

                if page.redirect:
                    redirect_target = page.redirects_to()
                    # A redirect without a resolvable target falls through and
                    # is handled like a normal page.
                    if redirect_target:
                        redirect_title = redirect_target.name
                        self.logger.info(
                            "Страница является редиректом",
                            original=title,
                            target=redirect_title,
                        )
                        raise WikiPageRedirectError(
                            f"Страница '{title}' перенаправляет на '{redirect_title}'"
                        )

                content = page.text()
                if not content or len(content.strip()) < _MIN_CONTENT_LENGTH:
                    raise WikiPageNotFoundError(
                        f"Страница '{title}' слишком короткая или пустая"
                    )

                return WikiPageInfo(
                    title=title,
                    content=content,
                    is_redirect=False,
                )
            except InvalidPageTitle as e:
                raise WikiPageNotFoundError(f"Неверное название страницы: {e}") from e

        return await asyncio.to_thread(_sync_fetch)

    def _clean_wikitext(self, text: str) -> str:
        """Strip selected templates, links and HTML comments from wikitext.

        Removes navigation, infobox ("Карточка") and disambiguation
        templates, category links, file/image links and HTML comments, then
        collapses runs of blank lines and trims surrounding whitespace.
        """
        for pattern in _WIKITEXT_NOISE_PATTERNS:
            text = pattern.sub("", text)
        text = _BLANK_LINES_PATTERN.sub("\n\n", text)
        return text.strip()

    async def fetch_page(self, url: str) -> WikiPageInfo:
        """Fetch the page behind ``url`` with rate limiting and retries.

        Raises:
            ValueError: If ``url`` is not a valid Wikipedia article URL.
            WikiPageNotFoundError, WikiPageRedirectError: Raised by
                ``_fetch_page_content``; how the retry wrapper and circuit
                breaker treat them depends on their implementation.
        """
        title = self.extract_title_from_url(url)

        # NOTE(review): only the built-in ConnectionError and TimeoutError are
        # retried; confirm these match what the HTTP layer actually raises.
        async with self.rate_limiter:
            return await self.circuit_breaker.call(
                lambda: with_retry(
                    lambda: self._fetch_page_content(title),
                    max_attempts=self.config.max_retries,
                    min_wait=self.config.retry_delay,
                    max_wait=self.config.retry_delay * 4,
                    retry_exceptions=(ConnectionError, TimeoutError),
                    name=f"fetch_page_{title}",
                )
            )

    async def fetch_page_cleaned(self, url: str) -> WikiPageInfo:
        """Fetch the page behind ``url`` and return it with cleaned wikitext."""
        page_info = await self.fetch_page(url)
        cleaned_content = self._clean_wikitext(page_info.content)
        return WikiPageInfo(
            title=page_info.title,
            content=cleaned_content,
            is_redirect=page_info.is_redirect,
            redirect_target=page_info.redirect_target,
        )

    async def health_check(self) -> bool:
        """Return True if a general ``siteinfo`` query succeeds, else False.

        Any exception is logged and reported as ``False`` instead of being
        raised.
        """
        try:
            client = await self._get_client()
            await asyncio.to_thread(
                lambda: client.api("query", meta="siteinfo", siprop="general")
            )
            return True
        except Exception as e:
            self.logger.error("Health check failed", error=str(e))
            return False