import asyncio
|
||
import re
|
||
from typing import NamedTuple
|
||
from urllib.parse import unquote, urlparse
|
||
|
||
import mwclient
|
||
import structlog
|
||
from mwclient.errors import InvalidPageTitle, LoginError
|
||
|
||
from ..models import AppConfig
|
||
from .base import BaseAdapter, CircuitBreaker, RateLimiter, with_retry
# Module-level structured logger; note the adapter methods below log via
# self.logger (presumably provided by BaseAdapter), not this instance.
logger = structlog.get_logger()
class WikiPageNotFoundError(Exception):
    """Raised when a requested wiki page is missing, invalid, or empty."""
class WikiPageRedirectError(Exception):
    """Raised when a requested wiki page turns out to be a redirect."""
class WikiPageInfo(NamedTuple):
|
||
title: str
|
||
content: str
|
||
is_redirect: bool = False
|
||
redirect_target: str | None = None
class RuWikiAdapter(BaseAdapter):
    """Adapter for fetching and cleaning articles from Russian Wikipedia.

    Wraps the synchronous ``mwclient`` library behind ``asyncio.to_thread``
    and guards every outbound call with the shared rate limiter, circuit
    breaker, and retry helpers from ``.base``.
    """

    def __init__(self, config: AppConfig) -> None:
        """Initialize the adapter with limits taken from *config*."""
        super().__init__("ruwiki_adapter")
        self.config = config

        # Bounds the number of simultaneous MediaWiki requests.
        self.rate_limiter = RateLimiter(
            max_concurrent=config.max_concurrent_wiki,
            name="ruwiki_limiter",
        )
        # Stops hammering the wiki after repeated failures.
        self.circuit_breaker = CircuitBreaker(
            failure_threshold=config.circuit_failure_threshold,
            recovery_timeout=config.circuit_recovery_timeout,
            name="ruwiki_circuit",
        )

        # Lazily created on first use; see _get_client().
        self._client: mwclient.Site | None = None

    async def _get_client(self) -> mwclient.Site:
        """Return the shared ``mwclient.Site``, creating it on first call.

        NOTE(review): two coroutines racing through the ``None`` check may
        each create a client; the loser's instance is discarded. Harmless
        today, but worth an ``asyncio.Lock`` if creation ever gains state.
        """
        if self._client is None:
            # mwclient is blocking, so construction runs off the event loop.
            self._client = await asyncio.to_thread(self._create_client)
        return self._client

    def _create_client(self) -> mwclient.Site:
        """Create a connected ``mwclient.Site`` and verify connectivity.

        Raises:
            LoginError, ConnectionError: logged, then re-raised.
        """
        try:
            site = mwclient.Site("ru.wikipedia.org")
            # Cheap round-trip to fail fast on connectivity problems.
            site.api("query", meta="siteinfo")
            self.logger.info("Соединение с RuWiki установлено")
            return site
        except (LoginError, ConnectionError) as e:
            self.logger.error("Ошибка подключения к RuWiki", error=str(e))
            raise

    @staticmethod
    def extract_title_from_url(url: str) -> str:
        """Extract the article title from a Wikipedia URL.

        Percent-decodes the title and converts underscores to spaces.
        Subpage titles that themselves contain ``/`` are preserved
        (``/wiki/Foo/Bar`` -> ``Foo/Bar``); an empty title is rejected.

        Raises:
            ValueError: if *url* is not a ``*.wikipedia.org/wiki/...`` link.
        """
        parsed = urlparse(url)
        if "wikipedia.org" not in parsed.netloc:
            raise ValueError(f"Не является URL википедии: {url}")

        # Split at most twice so titles containing "/" stay intact;
        # a plain split("/") truncated "Foo/Bar" down to "Foo".
        path_parts = parsed.path.split("/", 2)
        if len(path_parts) < 3 or path_parts[1] != "wiki" or not path_parts[2]:
            raise ValueError(f"Неверный формат URL: {url}")

        title = unquote(path_parts[2])
        return title.replace("_", " ")

    async def _fetch_page_content(self, title: str) -> WikiPageInfo:
        """Fetch raw wikitext for *title*, rejecting bad pages.

        Raises:
            WikiPageNotFoundError: page is absent, has an invalid title,
                or its stripped content is shorter than 100 characters.
            WikiPageRedirectError: page is a redirect.
        """
        client = await self._get_client()

        def _sync_fetch() -> WikiPageInfo:
            # Runs in a worker thread: every mwclient call below blocks.
            try:
                page = client.pages[title]

                if not page.exists:
                    raise WikiPageNotFoundError(f"Страница '{title}' не найдена")

                if page.redirect:
                    redirect_target = page.redirects_to()
                    if redirect_target:
                        redirect_title = redirect_target.name
                        self.logger.info(
                            "Страница является редиректом",
                            original=title,
                            target=redirect_title,
                        )
                        raise WikiPageRedirectError(
                            f"Страница '{title}' перенаправляет на '{redirect_title}'"
                        )

                content = page.text()
                # Pages under 100 characters are treated as empty stubs.
                if not content or len(content.strip()) < 100:
                    raise WikiPageNotFoundError(f"Страница '{title}' слишком короткая или пустая")

                return WikiPageInfo(
                    title=title,
                    content=content,
                    is_redirect=False,
                )

            except InvalidPageTitle as e:
                raise WikiPageNotFoundError(f"Неверное название страницы: {e}") from e

        return await asyncio.to_thread(_sync_fetch)

    def _clean_wikitext(self, text: str) -> str:
        """Strip common non-prose wikitext markup and collapse blank lines."""
        # Navigation / infobox / disambiguation templates.
        text = re.sub(r"\{\{[Нн]авигация.*?\}\}", "", text, flags=re.DOTALL)
        text = re.sub(r"\{\{[Кк]арточка.*?\}\}", "", text, flags=re.DOTALL)
        text = re.sub(r"\{\{[Дд]исамбиг.*?\}\}", "", text, flags=re.DOTALL)

        # Category links.
        text = re.sub(r"\[\[[Кк]атегория:.*?\]\]", "", text)

        # File / image embeds.
        text = re.sub(r"\[\[[Фф]айл:.*?\]\]", "", text, flags=re.DOTALL)
        text = re.sub(r"\[\[[Ii]mage:.*?\]\]", "", text, flags=re.DOTALL)

        # HTML comments.
        text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)

        # Collapse runs of blank lines into a single blank line.
        text = re.sub(r"\n\s*\n", "\n\n", text)

        return text.strip()

    async def fetch_page(self, url: str) -> WikiPageInfo:
        """Fetch the page behind *url* with rate limiting, circuit breaking,
        and retries (on ConnectionError/TimeoutError) applied.

        Raises:
            ValueError: *url* is not a valid Wikipedia article URL.
            WikiPageNotFoundError, WikiPageRedirectError: see
                :meth:`_fetch_page_content`.
        """
        title = self.extract_title_from_url(url)

        async with self.rate_limiter:
            return await self.circuit_breaker.call(
                lambda: with_retry(
                    lambda: self._fetch_page_content(title),
                    max_attempts=self.config.max_retries,
                    min_wait=self.config.retry_delay,
                    max_wait=self.config.retry_delay * 4,
                    retry_exceptions=(ConnectionError, TimeoutError),
                    name=f"fetch_page_{title}",
                )
            )

    async def fetch_page_cleaned(self, url: str) -> WikiPageInfo:
        """Fetch *url* and return it with :meth:`_clean_wikitext` applied."""
        page_info = await self.fetch_page(url)
        cleaned_content = self._clean_wikitext(page_info.content)

        return WikiPageInfo(
            title=page_info.title,
            content=cleaned_content,
            is_redirect=page_info.is_redirect,
            redirect_target=page_info.redirect_target,
        )

    async def health_check(self) -> bool:
        """Return True when a trivial siteinfo query succeeds, else False.

        Boundary method: every failure is logged and reported as unhealthy
        rather than raised.
        """
        try:
            client = await self._get_client()
            await asyncio.to_thread(lambda: client.api("query", meta="siteinfo", siprop="general"))
            return True
        except Exception as e:
            self.logger.error("Health check failed", error=str(e))
            return False