ruwiki-test/src/sources.py

122 lines
4.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from __future__ import annotations
import asyncio
from pathlib import Path
from typing import AsyncGenerator
from urllib.parse import urlparse
import structlog
from src.models import SimplifyCommand
from src.models.constants import ARTICLE_NAME_INDEX, MIN_WIKI_PATH_PARTS, WIKI_PATH_INDEX
class FileSource:
    """Reads Wikipedia article URLs from a text file and yields simplify commands."""

    def __init__(self, file_path: str) -> None:
        """Store the file path and bind a structured logger tagged with this source."""
        self.file_path = Path(file_path)
        self.logger = structlog.get_logger().bind(source="file", path=str(self.file_path))

    async def read_urls(
        self, *, force_reprocess: bool = False
    ) -> AsyncGenerator[SimplifyCommand, None]:
        """Yield a SimplifyCommand for every valid, unique Wikipedia URL in the file.

        Blank lines and lines starting with '#' are skipped. Invalid URLs are
        logged and counted; duplicate URLs are yielded only once.

        Args:
            force_reprocess: Forwarded verbatim into each SimplifyCommand.

        Raises:
            FileNotFoundError: If the configured file does not exist.
        """
        if not self.file_path.exists():
            msg = f"Файл с URL не найден: {self.file_path}"
            raise FileNotFoundError(msg)

        self.logger.info("Начинаем чтение URL из файла")

        # File I/O is blocking; run it in a worker thread to keep the event loop free.
        content = await asyncio.to_thread(self._read_file_sync)

        seen_urls: set[str] = set()
        valid_count = 0
        invalid_count = 0

        for line_num, original_line in enumerate(content.splitlines(), 1):
            line = original_line.strip()

            # Skip blank lines and comment lines.
            if not line or line.startswith("#"):
                continue

            if not self._is_valid_wikipedia_url(line):
                self.logger.warning("Невалидный URL", line_number=line_num, url=line)
                invalid_count += 1
                continue

            if line in seen_urls:
                self.logger.debug("Дубликат URL пропущен", line_number=line_num, url=line)
                continue

            seen_urls.add(line)
            valid_count += 1
            yield SimplifyCommand(url=line, force_reprocess=force_reprocess)

        self.logger.info(
            "Завершено чтение URL",
            valid_count=valid_count,
            invalid_count=invalid_count,
            total_unique=len(seen_urls),
        )

    def _read_file_sync(self) -> str:
        """Blocking read of the whole file as UTF-8 text (run via asyncio.to_thread)."""
        return self.file_path.read_text(encoding="utf-8")

    def _is_valid_wikipedia_url(self, url: str) -> bool:
        """Return True if *url* looks like a ruwiki article URL.

        Checks, in order: http(s) scheme, 'ruwiki' present in the host,
        a '/wiki/<article>' path shape, and that the article name is not
        empty or a main-page alias.
        """
        try:
            # FIX: the per-step validation traces below were logged at INFO,
            # producing several log entries per input line and flooding the
            # log in production. Demoted to DEBUG, consistent with the
            # duplicate-URL DEBUG message in read_urls(). Rejection reasons
            # and the final verdict are kept, just at trace level.
            self.logger.debug("Начинаем проверку URL", raw_url=url)
            parsed = urlparse(url)
            self.logger.debug(
                "Разобранный URL", scheme=parsed.scheme, netloc=parsed.netloc, path=parsed.path
            )

            if parsed.scheme not in ("http", "https"):
                self.logger.debug(
                    "Отклонено: неподдерживаемая схема", scheme=parsed.scheme, url=url
                )
                return False

            if "ruwiki" not in parsed.netloc:
                self.logger.debug(
                    "Отклонено: домен не содержит 'ruwiki'", netloc=parsed.netloc, url=url
                )
                return False

            path_parts = parsed.path.split("/")
            self.logger.debug("Части пути", path_parts=path_parts)

            if len(path_parts) < MIN_WIKI_PATH_PARTS:
                self.logger.debug(
                    "Отклонено: слишком мало сегментов в пути", parts=path_parts, url=url
                )
                return False

            if path_parts[WIKI_PATH_INDEX] != "wiki":
                self.logger.debug(
                    "Отклонено: неверный сегмент пути",
                    expected="wiki",
                    actual=path_parts[WIKI_PATH_INDEX],
                    url=url,
                )
                return False

            article_name = path_parts[ARTICLE_NAME_INDEX]
            self.logger.debug("Извлечено имя статьи", article_name=article_name, url=url)

            if not article_name or article_name in ("Main_Page", "Заглавная_страница"):
                self.logger.debug(
                    "Отклонено: некорректное имя статьи", article_name=article_name, url=url
                )
                return False

            self.logger.debug("URL прошёл все проверки", url=url)
            return True
        except Exception as e:  # deliberately broad: any parse failure means "invalid URL"
            # FIX: a swallowed exception was logged at INFO; WARNING makes the
            # failure visible without changing the best-effort False result.
            self.logger.warning("Ошибка при проверке URL", error=str(e), url=url)
            return False

    async def count_urls(self) -> int:
        """Count valid unique URLs by draining read_urls() (re-reads the file)."""
        count = 0
        async for _ in self.read_urls():
            count += 1
        return count