ruwiki-test/src/adapters/ruwiki.py

169 lines
5.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import asyncio
import re
from typing import NamedTuple
from urllib.parse import unquote, urlparse
import mwclient
import structlog
from mwclient.errors import InvalidPageTitle, LoginError
from ..models import AppConfig
from .base import BaseAdapter, CircuitBreaker, RateLimiter, with_retry
logger = structlog.get_logger()
class WikiPageNotFoundError(Exception):
    """Raised when the requested wiki page does not exist or has no usable content."""
class WikiPageRedirectError(Exception):
    """Raised when the requested wiki page is a redirect rather than an article."""
class WikiPageInfo(NamedTuple):
title: str
content: str
is_redirect: bool = False
redirect_target: str | None = None
class RuWikiAdapter(BaseAdapter):
    """Adapter for fetching and cleaning article pages from ru.wikipedia.org.

    Wraps the blocking ``mwclient`` library: all network calls run in worker
    threads via ``asyncio.to_thread`` and are guarded by a rate limiter and a
    circuit breaker supplied by the base-adapter toolkit.
    """

    def __init__(self, config: AppConfig) -> None:
        """Build the adapter from application config (no I/O happens here).

        Args:
            config: Application settings providing concurrency limits,
                circuit-breaker thresholds, and retry parameters.
        """
        super().__init__("ruwiki_adapter")
        self.config = config
        self.rate_limiter = RateLimiter(
            max_concurrent=config.max_concurrent_wiki,
            name="ruwiki_limiter",
        )
        self.circuit_breaker = CircuitBreaker(
            failure_threshold=config.circuit_failure_threshold,
            recovery_timeout=config.circuit_recovery_timeout,
            name="ruwiki_circuit",
        )
        # Lazily created on first use; guarded by _client_lock so two
        # concurrent first requests cannot create two clients.
        self._client: mwclient.Site | None = None
        self._client_lock = asyncio.Lock()

    async def _get_client(self) -> mwclient.Site:
        """Return the shared ``mwclient.Site``, creating it on first use."""
        if self._client is None:
            async with self._client_lock:
                # Double-check under the lock: another coroutine may have
                # finished creating the client while we were waiting.
                if self._client is None:
                    self._client = await asyncio.to_thread(self._create_client)
        return self._client

    def _create_client(self) -> mwclient.Site:
        """Create and verify a connection to RuWiki (blocking; run in a thread).

        Raises:
            LoginError: if authentication with the wiki fails.
            ConnectionError: if the wiki is unreachable.
        """
        try:
            site = mwclient.Site("ru.wikipedia.org")
            # Cheap metadata request to verify connectivity up front rather
            # than failing later on the first real page fetch.
            site.api("query", meta="siteinfo")
            self.logger.info("Соединение с RuWiki установлено")
            return site
        except (LoginError, ConnectionError) as e:
            self.logger.error("Ошибка подключения к RuWiki", error=str(e))
            raise

    @staticmethod
    def extract_title_from_url(url: str) -> str:
        """Extract the article title from a Wikipedia URL.

        Args:
            url: Full article URL, e.g. ``https://ru.wikipedia.org/wiki/Python``.

        Returns:
            Percent-decoded title with underscores replaced by spaces.

        Raises:
            ValueError: if the URL is not a ``wikipedia.org`` article URL
                or contains no title after ``/wiki/``.
        """
        parsed = urlparse(url)
        if "wikipedia.org" not in parsed.netloc:
            raise ValueError(f"Не является URL википедии: {url}")
        path_parts = parsed.path.split("/")
        if len(path_parts) < 3 or path_parts[1] != "wiki":
            raise ValueError(f"Неверный формат URL: {url}")
        # Join everything after "/wiki/" so subpage titles containing "/"
        # (e.g. "Foo/Bar") are not truncated at the first slash.
        title = unquote("/".join(path_parts[2:]))
        title = title.replace("_", " ")
        if not title:
            # URLs like ".../wiki/" carry no title at all.
            raise ValueError(f"Неверный формат URL: {url}")
        return title

    async def _fetch_page_content(self, title: str) -> WikiPageInfo:
        """Fetch the raw wikitext of *title* from RuWiki.

        Raises:
            WikiPageNotFoundError: page is missing, has an invalid title,
                or its content is shorter than 100 characters.
            WikiPageRedirectError: page is a redirect to another article.
        """
        client = await self._get_client()

        def _sync_fetch() -> WikiPageInfo:
            try:
                page = client.pages[title]
                if not page.exists:
                    raise WikiPageNotFoundError(f"Страница '{title}' не найдена")
                if page.redirect:
                    redirect_target = page.redirects_to()
                    if redirect_target:
                        redirect_title = redirect_target.name
                        self.logger.info(
                            "Страница является редиректом",
                            original=title,
                            target=redirect_title,
                        )
                        raise WikiPageRedirectError(
                            f"Страница '{title}' перенаправляет на '{redirect_title}'"
                        )
                content = page.text()
                # Treat very short pages (< 100 chars) as unusable stubs.
                if not content or len(content.strip()) < 100:
                    raise WikiPageNotFoundError(f"Страница '{title}' слишком короткая или пустая")
                return WikiPageInfo(
                    title=title,
                    content=content,
                    is_redirect=False,
                )
            except InvalidPageTitle as e:
                raise WikiPageNotFoundError(f"Неверное название страницы: {e}") from e

        return await asyncio.to_thread(_sync_fetch)

    def _clean_wikitext(self, text: str) -> str:
        """Strip navigation/infobox templates, category/file links, and
        HTML comments from wikitext, then collapse blank-line runs."""
        # Navigation, infobox ("карточка") and disambiguation templates.
        text = re.sub(r"\{\{[Нн]авигация.*?\}\}", "", text, flags=re.DOTALL)
        text = re.sub(r"\{\{[Кк]арточка.*?\}\}", "", text, flags=re.DOTALL)
        text = re.sub(r"\{\{[Дд]исамбиг.*?\}\}", "", text, flags=re.DOTALL)
        # Category, file and image links (Russian and English prefixes).
        text = re.sub(r"\[\[[Кк]атегория:.*?\]\]", "", text)
        text = re.sub(r"\[\[[Фф]айл:.*?\]\]", "", text, flags=re.DOTALL)
        text = re.sub(r"\[\[[Ii]mage:.*?\]\]", "", text, flags=re.DOTALL)
        # HTML comments.
        text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
        # Collapse runs of blank lines into a single blank line.
        text = re.sub(r"\n\s*\n", "\n\n", text)
        return text.strip()

    async def fetch_page(self, url: str) -> WikiPageInfo:
        """Fetch the raw page behind *url*, applying rate limiting, the
        circuit breaker, and retries on transient network errors.

        Raises:
            ValueError: malformed URL.
            WikiPageNotFoundError / WikiPageRedirectError: see
                :meth:`_fetch_page_content`.
        """
        title = self.extract_title_from_url(url)
        async with self.rate_limiter:
            return await self.circuit_breaker.call(
                lambda: with_retry(
                    lambda: self._fetch_page_content(title),
                    max_attempts=self.config.max_retries,
                    min_wait=self.config.retry_delay,
                    max_wait=self.config.retry_delay * 4,
                    retry_exceptions=(ConnectionError, TimeoutError),
                    name=f"fetch_page_{title}",
                )
            )

    async def fetch_page_cleaned(self, url: str) -> WikiPageInfo:
        """Fetch the page behind *url* and return it with cleaned wikitext."""
        page_info = await self.fetch_page(url)
        cleaned_content = self._clean_wikitext(page_info.content)
        return WikiPageInfo(
            title=page_info.title,
            content=cleaned_content,
            is_redirect=page_info.is_redirect,
            redirect_target=page_info.redirect_target,
        )

    async def health_check(self) -> bool:
        """Return True when RuWiki answers a lightweight siteinfo query.

        Broad exception handling is intentional: this is a boundary probe
        and must never raise — failures are logged and reported as False.
        """
        try:
            client = await self._get_client()
            await asyncio.to_thread(lambda: client.api("query", meta="siteinfo", siprop="general"))
            return True
        except Exception as e:
            self.logger.error("Health check failed", error=str(e))
            return False