First commit
This commit is contained in:
commit
c41a2907b8
|
@ -0,0 +1,114 @@
|
||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
*.py,cover
|
||||||
|
.pytest_cache/
|
||||||
|
|
||||||
|
# Environments
|
||||||
|
.env
|
||||||
|
.venv
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
|
||||||
|
# IDE files
|
||||||
|
.vscode/
|
||||||
|
.idea/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
*~
|
||||||
|
|
||||||
|
# OS files
|
||||||
|
.DS_Store
|
||||||
|
.DS_Store?
|
||||||
|
._*
|
||||||
|
.Spotlight-V100
|
||||||
|
.Trashes
|
||||||
|
ehthumbs.db
|
||||||
|
Thumbs.db
|
||||||
|
|
||||||
|
# Project specific - RuWiki SchoolNotes
|
||||||
|
# Database files
|
||||||
|
data/
|
||||||
|
*.db
|
||||||
|
*.sqlite
|
||||||
|
*.sqlite3
|
||||||
|
*.db-journal
|
||||||
|
*.sqlite-journal
|
||||||
|
*.sqlite3-journal
|
||||||
|
|
||||||
|
# Configuration files with secrets
|
||||||
|
.env.local
|
||||||
|
.env.production
|
||||||
|
config.local.toml
|
||||||
|
secrets.json
|
||||||
|
|
||||||
|
# Logs and monitoring
|
||||||
|
logs/
|
||||||
|
*.log
|
||||||
|
*.log.*
|
||||||
|
monitoring/
|
||||||
|
metrics/
|
||||||
|
|
||||||
|
# Temporary processing files
|
||||||
|
temp/
|
||||||
|
tmp/
|
||||||
|
cache/
|
||||||
|
.cache/
|
||||||
|
processing_temp/
|
||||||
|
|
||||||
|
# Input files with sensitive URLs (keep example only)
|
||||||
|
input_production.txt
|
||||||
|
input_large.txt
|
||||||
|
urls_private.txt
|
||||||
|
|
||||||
|
# Test artifacts specific to our project
|
||||||
|
test_outputs/
|
||||||
|
test_db/
|
||||||
|
mock_data/
|
||||||
|
test_logs/
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.
|
||||||
|
# ruff
|
||||||
|
.ruff_cache/
|
||||||
|
|
||||||
|
# Performance profiling
|
||||||
|
*.prof
|
||||||
|
profile_results/
|
||||||
|
|
||||||
|
# API response caches
|
||||||
|
api_cache/
|
||||||
|
wikipedia_cache/
|
||||||
|
openai_cache/
|
|
@ -0,0 +1,20 @@
|
||||||
|
OPENAI_API_KEY=your_openai_api_key_here
|
||||||
|
OPENAI_MODEL=gpt-4o-mini
|
||||||
|
OPENAI_TEMPERATURE=0.0
|
||||||
|
|
||||||
|
DB_PATH=./data/wiki.db
|
||||||
|
|
||||||
|
MAX_CONCURRENT_LLM=5
|
||||||
|
OPENAI_RPM=200
|
||||||
|
MAX_CONCURRENT_WIKI=10
|
||||||
|
|
||||||
|
LOG_LEVEL=INFO
|
||||||
|
LOG_FORMAT=json
|
||||||
|
|
||||||
|
CHUNK_SIZE=2000
|
||||||
|
CHUNK_OVERLAP=200
|
||||||
|
MAX_RETRIES=3
|
||||||
|
RETRY_DELAY=1.0
|
||||||
|
|
||||||
|
CIRCUIT_FAILURE_THRESHOLD=5
|
||||||
|
CIRCUIT_RECOVERY_TIMEOUT=60
|
|
@ -0,0 +1,3 @@
|
||||||
|
https://ru.ruwiki.ru/wiki/Изотопы
|
||||||
|
https://ru.ruwiki.ru/wiki/Вещественное_число
|
||||||
|
https://ru.ruwiki.ru/wiki/Митоз
|
|
@ -0,0 +1,83 @@
|
||||||
|
[tool.poetry]
|
||||||
|
name = "ruwiki-schoolnotes"
|
||||||
|
version = "1.0.0"
|
||||||
|
description = "Конвейер для упрощения статей RuWiki с помощью LLM"
|
||||||
|
authors = ["Leon K. <leonkl32@gmail.com>"]
|
||||||
|
readme = "README.md"
|
||||||
|
packages = [{include = "src"}]
|
||||||
|
|
||||||
|
[tool.poetry.dependencies]
|
||||||
|
python = "^3.10"
|
||||||
|
anyio = "^4.2.0"
|
||||||
|
aiohttp = "^3.9.0"
|
||||||
|
aiosqlite = "^0.19.0"
|
||||||
|
sqlmodel = "^0.0.14"
|
||||||
|
openai = "^1.13.0"
|
||||||
|
tiktoken = "^0.5.2"
|
||||||
|
mwclient = "^0.10.1"
|
||||||
|
pydantic = "^2.5.0"
|
||||||
|
pydantic-settings = "^2.1.0"
|
||||||
|
structlog = "^23.2.0"
|
||||||
|
tenacity = "^8.2.3"
|
||||||
|
click = "^8.1.7"
|
||||||
|
|
||||||
|
[tool.poetry.group.dev.dependencies]
|
||||||
|
black = "^23.12.0"
|
||||||
|
ruff = "^0.1.8"
|
||||||
|
mypy = "^1.8.0"
|
||||||
|
pytest = "^7.4.3"
|
||||||
|
pytest-asyncio = "^0.21.1"
|
||||||
|
pytest-cov = "^4.1.0"
|
||||||
|
pytest-vcr = "^1.0.2"
|
||||||
|
bandit = "^1.7.5"
|
||||||
|
pip-audit = "^2.6.2"
|
||||||
|
|
||||||
|
[tool.poetry.scripts]
|
||||||
|
schoolnotes = "src.cli:main"
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = ["poetry-core"]
|
||||||
|
build-backend = "poetry.core.masonry.api"
|
||||||
|
|
||||||
|
[tool.black]
|
||||||
|
line-length = 100
|
||||||
|
target-version = ['py310']
|
||||||
|
|
||||||
|
[tool.ruff]
|
||||||
|
target-version = "py310"
|
||||||
|
line-length = 100
|
||||||
|
select = ["ALL"]
|
||||||
|
ignore = [
|
||||||
|
"D100", "D101", "D102", "D103", "D104", "D105", "D106", "D107", # missing docstrings
|
||||||
|
"ANN101", "ANN102", # missing type annotation for self/cls
|
||||||
|
"COM812", "ISC001", # incompatible with black
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.ruff.per-file-ignores]
|
||||||
|
"tests/*" = ["S101", "PLR2004", "ANN"] # allow assert, magic values, no annotations
|
||||||
|
|
||||||
|
[tool.mypy]
|
||||||
|
python_version = "3.10"
|
||||||
|
strict = true
|
||||||
|
warn_return_any = true
|
||||||
|
warn_unused_configs = true
|
||||||
|
disallow_untyped_defs = true
|
||||||
|
disallow_incomplete_defs = true
|
||||||
|
check_untyped_defs = true
|
||||||
|
disallow_untyped_decorators = true
|
||||||
|
|
||||||
|
[tool.pytest.ini_options]
|
||||||
|
asyncio_mode = "auto"
|
||||||
|
testpaths = ["tests"]
|
||||||
|
python_files = ["test_*.py"]
|
||||||
|
python_classes = ["Test*"]
|
||||||
|
python_functions = ["test_*"]
|
||||||
|
addopts = "--cov=src --cov-report=html --cov-report=term-missing --cov-fail-under=80"
|
||||||
|
|
||||||
|
[tool.coverage.run]
|
||||||
|
source = ["src"]
|
||||||
|
omit = ["tests/*"]
|
||||||
|
|
||||||
|
[tool.bandit]
|
||||||
|
exclude_dirs = ["tests"]
|
||||||
|
skips = ["B101"]
|
|
@ -0,0 +1,12 @@
|
||||||
|
-r requirements.txt
|
||||||
|
|
||||||
|
black>=25.1.0
|
||||||
|
ruff>=0.12.0
|
||||||
|
|
||||||
|
mypy>=1.16.0
|
||||||
|
|
||||||
|
pytest>=8.4.0
|
||||||
|
pytest-asyncio>=1.0.0
|
||||||
|
pytest-cov>=6.2.0
|
||||||
|
|
||||||
|
bandit>=1.8.0
|
|
@ -0,0 +1,18 @@
|
||||||
|
anyio>=4.2.0,<5.0.0
|
||||||
|
aiohttp>=3.9.0,<4.0.0
|
||||||
|
|
||||||
|
aiosqlite>=0.19.0,<0.20.0
|
||||||
|
sqlmodel>=0.0.14,<0.0.15
|
||||||
|
|
||||||
|
openai>=1.13.0,<2.0.0
|
||||||
|
tiktoken>=0.5.2,<0.6.0
|
||||||
|
|
||||||
|
mwclient>=0.10.1,<0.11.0
|
||||||
|
|
||||||
|
pydantic>=2.5.0,<3.0.0
|
||||||
|
pydantic-settings>=2.1.0,<3.0.0
|
||||||
|
|
||||||
|
structlog>=23.2.0,<24.0.0
|
||||||
|
tenacity>=8.2.3,<9.0.0
|
||||||
|
|
||||||
|
click>=8.1.7,<9.0.0
|
|
@ -0,0 +1 @@
|
||||||
|
__version__ = "1.0.0"
|
|
@ -0,0 +1,18 @@
|
||||||
|
from .base import BaseAdapter, CircuitBreaker, CircuitBreakerError, RateLimiter
|
||||||
|
from .llm import LLMError, LLMProviderAdapter, LLMRateLimitError, LLMTokenLimitError
|
||||||
|
from .ruwiki import RuWikiAdapter, WikiPageInfo, WikiPageNotFoundError, WikiPageRedirectError
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"BaseAdapter",
|
||||||
|
"CircuitBreaker",
|
||||||
|
"CircuitBreakerError",
|
||||||
|
"LLMError",
|
||||||
|
"LLMProviderAdapter",
|
||||||
|
"LLMRateLimitError",
|
||||||
|
"LLMTokenLimitError",
|
||||||
|
"RateLimiter",
|
||||||
|
"RuWikiAdapter",
|
||||||
|
"WikiPageInfo",
|
||||||
|
"WikiPageNotFoundError",
|
||||||
|
"WikiPageRedirectError",
|
||||||
|
]
|
|
@ -0,0 +1,128 @@
|
||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from collections.abc import Awaitable, Callable
|
||||||
|
from typing import Any, TypeVar
|
||||||
|
|
||||||
|
import structlog
|
||||||
|
from tenacity import (
|
||||||
|
AsyncRetrying,
|
||||||
|
before_sleep_log,
|
||||||
|
retry_if_exception_type,
|
||||||
|
stop_after_attempt,
|
||||||
|
wait_exponential,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = structlog.get_logger()
|
||||||
|
|
||||||
|
T = TypeVar("T")
|
||||||
|
|
||||||
|
|
||||||
|
class CircuitBreakerError(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class CircuitBreaker:
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
failure_threshold: int = 5,
|
||||||
|
recovery_timeout: int = 60,
|
||||||
|
name: str = "circuit_breaker",
|
||||||
|
) -> None:
|
||||||
|
self.failure_threshold = failure_threshold
|
||||||
|
self.recovery_timeout = recovery_timeout
|
||||||
|
self.name = name
|
||||||
|
|
||||||
|
self._failure_count = 0
|
||||||
|
self._last_failure_time: float | None = None
|
||||||
|
self._state: str = "closed"
|
||||||
|
|
||||||
|
async def call(self, func: Callable[[], Awaitable[T]]) -> T:
|
||||||
|
if self._state == "open":
|
||||||
|
if self._should_attempt_reset():
|
||||||
|
self._state = "half_open"
|
||||||
|
logger.info("Circuit breaker перешёл в half_open", name=self.name)
|
||||||
|
else:
|
||||||
|
raise CircuitBreakerError(f"Circuit breaker {self.name} открыт")
|
||||||
|
|
||||||
|
try:
|
||||||
|
result = await func()
|
||||||
|
self._on_success()
|
||||||
|
return result
|
||||||
|
except Exception as e:
|
||||||
|
self._on_failure()
|
||||||
|
raise e
|
||||||
|
|
||||||
|
def _should_attempt_reset(self) -> bool:
|
||||||
|
if self._last_failure_time is None:
|
||||||
|
return True
|
||||||
|
return time.time() - self._last_failure_time >= self.recovery_timeout
|
||||||
|
|
||||||
|
def _on_success(self) -> None:
|
||||||
|
if self._state == "half_open":
|
||||||
|
self._state = "closed"
|
||||||
|
logger.info("Circuit breaker восстановлен", name=self.name)
|
||||||
|
self._failure_count = 0
|
||||||
|
|
||||||
|
def _on_failure(self) -> None:
|
||||||
|
self._failure_count += 1
|
||||||
|
self._last_failure_time = time.time()
|
||||||
|
|
||||||
|
if self._failure_count >= self.failure_threshold:
|
||||||
|
self._state = "open"
|
||||||
|
logger.warning(
|
||||||
|
"Circuit breaker открыт из-за превышения порога ошибок",
|
||||||
|
name=self.name,
|
||||||
|
failure_count=self._failure_count,
|
||||||
|
threshold=self.failure_threshold,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class RateLimiter:
|
||||||
|
|
||||||
|
def __init__(self, max_concurrent: int, name: str = "rate_limiter") -> None:
|
||||||
|
self.semaphore = asyncio.Semaphore(max_concurrent)
|
||||||
|
self.name = name
|
||||||
|
self.max_concurrent = max_concurrent
|
||||||
|
|
||||||
|
async def __aenter__(self) -> None:
|
||||||
|
await self.semaphore.acquire()
|
||||||
|
|
||||||
|
async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
|
||||||
|
self.semaphore.release()
|
||||||
|
|
||||||
|
|
||||||
|
async def with_retry(
|
||||||
|
func: Callable[[], Awaitable[T]],
|
||||||
|
max_attempts: int = 3,
|
||||||
|
min_wait: float = 1.0,
|
||||||
|
max_wait: float = 10.0,
|
||||||
|
retry_exceptions: tuple[type[Exception], ...] = (Exception,),
|
||||||
|
name: str = "retry_operation",
|
||||||
|
) -> T:
|
||||||
|
async for attempt in AsyncRetrying(
|
||||||
|
stop=stop_after_attempt(max_attempts),
|
||||||
|
wait=wait_exponential(multiplier=1, min=min_wait, max=max_wait),
|
||||||
|
retry=retry_if_exception_type(retry_exceptions),
|
||||||
|
before_sleep=before_sleep_log(logger, "warning"),
|
||||||
|
reraise=True,
|
||||||
|
):
|
||||||
|
with attempt:
|
||||||
|
logger.debug(
|
||||||
|
"Попытка выполнения операции",
|
||||||
|
operation=name,
|
||||||
|
attempt_number=attempt.retry_state.attempt_number,
|
||||||
|
)
|
||||||
|
return await func()
|
||||||
|
|
||||||
|
|
||||||
|
class BaseAdapter(ABC):
|
||||||
|
|
||||||
|
def __init__(self, name: str) -> None:
|
||||||
|
self.name = name
|
||||||
|
self.logger = structlog.get_logger().bind(adapter=name)
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
async def health_check(self) -> bool:
|
||||||
|
pass
|
|
@ -0,0 +1,196 @@
|
||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
|
||||||
|
import openai
|
||||||
|
import structlog
|
||||||
|
import tiktoken
|
||||||
|
from openai import AsyncOpenAI
|
||||||
|
from openai.types.chat import ChatCompletion
|
||||||
|
|
||||||
|
from ..models import AppConfig
|
||||||
|
from .base import BaseAdapter, CircuitBreaker, RateLimiter, with_retry
|
||||||
|
|
||||||
|
logger = structlog.get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class LLMError(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class LLMTokenLimitError(LLMError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class LLMRateLimitError(LLMError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class LLMProviderAdapter(BaseAdapter):
|
||||||
|
|
||||||
|
def __init__(self, config: AppConfig) -> None:
|
||||||
|
super().__init__("llm_adapter")
|
||||||
|
self.config = config
|
||||||
|
|
||||||
|
self.client = AsyncOpenAI(api_key=config.openai_api_key)
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.tokenizer = tiktoken.encoding_for_model(config.openai_model)
|
||||||
|
except KeyError:
|
||||||
|
self.tokenizer = tiktoken.get_encoding("cl100k_base")
|
||||||
|
|
||||||
|
self.rate_limiter = RateLimiter(
|
||||||
|
max_concurrent=config.max_concurrent_llm,
|
||||||
|
name="llm_limiter",
|
||||||
|
)
|
||||||
|
self.circuit_breaker = CircuitBreaker(
|
||||||
|
failure_threshold=config.circuit_failure_threshold,
|
||||||
|
recovery_timeout=config.circuit_recovery_timeout,
|
||||||
|
name="llm_circuit",
|
||||||
|
)
|
||||||
|
|
||||||
|
self.request_times: list[float] = []
|
||||||
|
self.rpm_lock = asyncio.Lock()
|
||||||
|
|
||||||
|
def count_tokens(self, text: str) -> int:
|
||||||
|
try:
|
||||||
|
return len(self.tokenizer.encode(text))
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning("Ошибка подсчёта токенов", error=str(e))
|
||||||
|
return len(text) // 4
|
||||||
|
|
||||||
|
async def _check_rpm_limit(self) -> None:
|
||||||
|
async with self.rpm_lock:
|
||||||
|
current_time = time.time()
|
||||||
|
self.request_times = [
|
||||||
|
req_time for req_time in self.request_times if current_time - req_time < 60
|
||||||
|
]
|
||||||
|
|
||||||
|
if len(self.request_times) >= self.config.openai_rpm:
|
||||||
|
oldest_request = min(self.request_times)
|
||||||
|
wait_time = 60 - (current_time - oldest_request)
|
||||||
|
if wait_time > 0:
|
||||||
|
self.logger.info(
|
||||||
|
"Ожидание из-за RPM лимита",
|
||||||
|
wait_seconds=wait_time,
|
||||||
|
current_rpm=len(self.request_times),
|
||||||
|
)
|
||||||
|
await asyncio.sleep(wait_time)
|
||||||
|
|
||||||
|
self.request_times.append(current_time)
|
||||||
|
|
||||||
|
async def _make_completion_request(
|
||||||
|
self,
|
||||||
|
messages: list[dict[str, str]],
|
||||||
|
) -> ChatCompletion:
|
||||||
|
try:
|
||||||
|
response = await self.client.chat.completions.create(
|
||||||
|
model=self.config.openai_model,
|
||||||
|
messages=messages,
|
||||||
|
temperature=self.config.openai_temperature,
|
||||||
|
max_tokens=1500,
|
||||||
|
)
|
||||||
|
return response
|
||||||
|
except openai.RateLimitError as e:
|
||||||
|
raise LLMRateLimitError(f"Rate limit exceeded: {e}") from e
|
||||||
|
except openai.APIError as e:
|
||||||
|
raise LLMError(f"OpenAI API error: {e}") from e
|
||||||
|
|
||||||
|
async def simplify_text(
|
||||||
|
self,
|
||||||
|
title: str,
|
||||||
|
wiki_text: str,
|
||||||
|
prompt_template: str,
|
||||||
|
) -> tuple[str, int, int]:
|
||||||
|
input_tokens = self.count_tokens(wiki_text)
|
||||||
|
if input_tokens > 6000:
|
||||||
|
raise LLMTokenLimitError(f"Текст слишком длинный: {input_tokens} токенов (лимит 6000)")
|
||||||
|
|
||||||
|
try:
|
||||||
|
prompt_text = prompt_template.format(
|
||||||
|
title=title,
|
||||||
|
wiki_source_text=wiki_text,
|
||||||
|
)
|
||||||
|
except KeyError as e:
|
||||||
|
raise LLMError(f"Ошибка в шаблоне промпта: отсутствует ключ {e}") from e
|
||||||
|
|
||||||
|
messages = self._parse_prompt_template(prompt_text)
|
||||||
|
|
||||||
|
total_input_tokens = sum(self.count_tokens(msg["content"]) for msg in messages)
|
||||||
|
|
||||||
|
async with self.rate_limiter:
|
||||||
|
await self._check_rpm_limit()
|
||||||
|
|
||||||
|
response = await self.circuit_breaker.call(
|
||||||
|
lambda: with_retry(
|
||||||
|
lambda: self._make_completion_request(messages),
|
||||||
|
max_attempts=self.config.max_retries,
|
||||||
|
min_wait=self.config.retry_delay,
|
||||||
|
max_wait=self.config.retry_delay * 4,
|
||||||
|
retry_exceptions=(LLMRateLimitError, ConnectionError, TimeoutError),
|
||||||
|
name=f"simplify_{title}",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
if not response.choices:
|
||||||
|
raise LLMError("Пустой ответ от OpenAI")
|
||||||
|
|
||||||
|
simplified_text = response.choices[0].message.content
|
||||||
|
if not simplified_text:
|
||||||
|
raise LLMError("OpenAI вернул пустой текст")
|
||||||
|
|
||||||
|
simplified_text = simplified_text.replace("###END###", "").strip()
|
||||||
|
|
||||||
|
output_tokens = self.count_tokens(simplified_text)
|
||||||
|
|
||||||
|
if output_tokens > 1200:
|
||||||
|
self.logger.warning(
|
||||||
|
"Упрощённый текст превышает лимит",
|
||||||
|
output_tokens=output_tokens,
|
||||||
|
title=title,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.logger.info(
|
||||||
|
"Текст успешно упрощён",
|
||||||
|
title=title,
|
||||||
|
input_tokens=total_input_tokens,
|
||||||
|
output_tokens=output_tokens,
|
||||||
|
)
|
||||||
|
|
||||||
|
return simplified_text, total_input_tokens, output_tokens
|
||||||
|
|
||||||
|
def _parse_prompt_template(self, prompt_text: str) -> list[dict[str, str]]:
|
||||||
|
messages: list[dict[str, str]] = []
|
||||||
|
|
||||||
|
parts = prompt_text.split("### role:")
|
||||||
|
|
||||||
|
for part in parts[1:]:
|
||||||
|
lines = part.strip().split("\n", 1)
|
||||||
|
if len(lines) < 2:
|
||||||
|
continue
|
||||||
|
|
||||||
|
role = lines[0].strip()
|
||||||
|
content = lines[1].strip()
|
||||||
|
|
||||||
|
if role in ("system", "user", "assistant"):
|
||||||
|
messages.append({"role": role, "content": content})
|
||||||
|
|
||||||
|
if not messages:
|
||||||
|
messages = [{"role": "user", "content": prompt_text}]
|
||||||
|
|
||||||
|
return messages
|
||||||
|
|
||||||
|
async def health_check(self) -> bool:
|
||||||
|
try:
|
||||||
|
test_messages = [{"role": "user", "content": "Ответь 'OK' если всё работает."}]
|
||||||
|
|
||||||
|
response = await self.client.chat.completions.create(
|
||||||
|
model=self.config.openai_model,
|
||||||
|
messages=test_messages,
|
||||||
|
temperature=0,
|
||||||
|
max_tokens=10,
|
||||||
|
)
|
||||||
|
|
||||||
|
return bool(response.choices and response.choices[0].message.content)
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error("LLM health check failed", error=str(e))
|
||||||
|
return False
|
|
@ -0,0 +1,168 @@
|
||||||
|
import asyncio
|
||||||
|
import re
|
||||||
|
from typing import NamedTuple
|
||||||
|
from urllib.parse import unquote, urlparse
|
||||||
|
|
||||||
|
import mwclient
|
||||||
|
import structlog
|
||||||
|
from mwclient.errors import InvalidPageTitle, LoginError
|
||||||
|
|
||||||
|
from ..models import AppConfig
|
||||||
|
from .base import BaseAdapter, CircuitBreaker, RateLimiter, with_retry
|
||||||
|
|
||||||
|
logger = structlog.get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class WikiPageNotFoundError(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class WikiPageRedirectError(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class WikiPageInfo(NamedTuple):
|
||||||
|
title: str
|
||||||
|
content: str
|
||||||
|
is_redirect: bool = False
|
||||||
|
redirect_target: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class RuWikiAdapter(BaseAdapter):
|
||||||
|
|
||||||
|
def __init__(self, config: AppConfig) -> None:
|
||||||
|
super().__init__("ruwiki_adapter")
|
||||||
|
self.config = config
|
||||||
|
|
||||||
|
self.rate_limiter = RateLimiter(
|
||||||
|
max_concurrent=config.max_concurrent_wiki,
|
||||||
|
name="ruwiki_limiter",
|
||||||
|
)
|
||||||
|
self.circuit_breaker = CircuitBreaker(
|
||||||
|
failure_threshold=config.circuit_failure_threshold,
|
||||||
|
recovery_timeout=config.circuit_recovery_timeout,
|
||||||
|
name="ruwiki_circuit",
|
||||||
|
)
|
||||||
|
|
||||||
|
self._client: mwclient.Site | None = None
|
||||||
|
|
||||||
|
async def _get_client(self) -> mwclient.Site:
|
||||||
|
if self._client is None:
|
||||||
|
self._client = await asyncio.to_thread(
|
||||||
|
self._create_client,
|
||||||
|
)
|
||||||
|
return self._client
|
||||||
|
|
||||||
|
def _create_client(self) -> mwclient.Site:
|
||||||
|
try:
|
||||||
|
site = mwclient.Site("ru.wikipedia.org")
|
||||||
|
site.api("query", meta="siteinfo")
|
||||||
|
self.logger.info("Соединение с RuWiki установлено")
|
||||||
|
return site
|
||||||
|
except (LoginError, ConnectionError) as e:
|
||||||
|
self.logger.error("Ошибка подключения к RuWiki", error=str(e))
|
||||||
|
raise
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def extract_title_from_url(url: str) -> str:
|
||||||
|
parsed = urlparse(url)
|
||||||
|
if "wikipedia.org" not in parsed.netloc:
|
||||||
|
raise ValueError(f"Не является URL википедии: {url}")
|
||||||
|
|
||||||
|
path_parts = parsed.path.split("/")
|
||||||
|
if len(path_parts) < 3 or path_parts[1] != "wiki":
|
||||||
|
raise ValueError(f"Неверный формат URL: {url}")
|
||||||
|
|
||||||
|
title = unquote(path_parts[2])
|
||||||
|
title = title.replace("_", " ")
|
||||||
|
|
||||||
|
return title
|
||||||
|
|
||||||
|
async def _fetch_page_content(self, title: str) -> WikiPageInfo:
|
||||||
|
client = await self._get_client()
|
||||||
|
|
||||||
|
def _sync_fetch() -> WikiPageInfo:
|
||||||
|
try:
|
||||||
|
page = client.pages[title]
|
||||||
|
|
||||||
|
if not page.exists:
|
||||||
|
raise WikiPageNotFoundError(f"Страница '{title}' не найдена")
|
||||||
|
|
||||||
|
if page.redirect:
|
||||||
|
redirect_target = page.redirects_to()
|
||||||
|
if redirect_target:
|
||||||
|
redirect_title = redirect_target.name
|
||||||
|
self.logger.info(
|
||||||
|
"Страница является редиректом",
|
||||||
|
original=title,
|
||||||
|
target=redirect_title,
|
||||||
|
)
|
||||||
|
raise WikiPageRedirectError(
|
||||||
|
f"Страница '{title}' перенаправляет на '{redirect_title}'"
|
||||||
|
)
|
||||||
|
|
||||||
|
content = page.text()
|
||||||
|
if not content or len(content.strip()) < 100:
|
||||||
|
raise WikiPageNotFoundError(f"Страница '{title}' слишком короткая или пустая")
|
||||||
|
|
||||||
|
return WikiPageInfo(
|
||||||
|
title=title,
|
||||||
|
content=content,
|
||||||
|
is_redirect=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
except InvalidPageTitle as e:
|
||||||
|
raise WikiPageNotFoundError(f"Неверное название страницы: {e}") from e
|
||||||
|
|
||||||
|
return await asyncio.to_thread(_sync_fetch)
|
||||||
|
|
||||||
|
def _clean_wikitext(self, text: str) -> str:
|
||||||
|
text = re.sub(r"\{\{[Нн]авигация.*?\}\}", "", text, flags=re.DOTALL)
|
||||||
|
text = re.sub(r"\{\{[Кк]арточка.*?\}\}", "", text, flags=re.DOTALL)
|
||||||
|
text = re.sub(r"\{\{[Дд]исамбиг.*?\}\}", "", text, flags=re.DOTALL)
|
||||||
|
|
||||||
|
text = re.sub(r"\[\[[Кк]атегория:.*?\]\]", "", text)
|
||||||
|
|
||||||
|
text = re.sub(r"\[\[[Фф]айл:.*?\]\]", "", text, flags=re.DOTALL)
|
||||||
|
text = re.sub(r"\[\[[Ii]mage:.*?\]\]", "", text, flags=re.DOTALL)
|
||||||
|
|
||||||
|
text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
|
||||||
|
|
||||||
|
text = re.sub(r"\n\s*\n", "\n\n", text)
|
||||||
|
|
||||||
|
return text.strip()
|
||||||
|
|
||||||
|
async def fetch_page(self, url: str) -> WikiPageInfo:
|
||||||
|
title = self.extract_title_from_url(url)
|
||||||
|
|
||||||
|
async with self.rate_limiter:
|
||||||
|
return await self.circuit_breaker.call(
|
||||||
|
lambda: with_retry(
|
||||||
|
lambda: self._fetch_page_content(title),
|
||||||
|
max_attempts=self.config.max_retries,
|
||||||
|
min_wait=self.config.retry_delay,
|
||||||
|
max_wait=self.config.retry_delay * 4,
|
||||||
|
retry_exceptions=(ConnectionError, TimeoutError),
|
||||||
|
name=f"fetch_page_{title}",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
async def fetch_page_cleaned(self, url: str) -> WikiPageInfo:
|
||||||
|
page_info = await self.fetch_page(url)
|
||||||
|
cleaned_content = self._clean_wikitext(page_info.content)
|
||||||
|
|
||||||
|
return WikiPageInfo(
|
||||||
|
title=page_info.title,
|
||||||
|
content=cleaned_content,
|
||||||
|
is_redirect=page_info.is_redirect,
|
||||||
|
redirect_target=page_info.redirect_target,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def health_check(self) -> bool:
|
||||||
|
try:
|
||||||
|
client = await self._get_client()
|
||||||
|
await asyncio.to_thread(lambda: client.api("query", meta="siteinfo", siprop="general"))
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error("Health check failed", error=str(e))
|
||||||
|
return False
|
|
@ -0,0 +1,278 @@
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import click
|
||||||
|
import structlog
|
||||||
|
|
||||||
|
from .dependency_injection import get_container
|
||||||
|
from .models import AppConfig
|
||||||
|
|
||||||
|
|
||||||
|
def setup_logging(log_level: str, log_format: str) -> None:
|
||||||
|
processors = [
|
||||||
|
structlog.stdlib.filter_by_level,
|
||||||
|
structlog.stdlib.add_logger_name,
|
||||||
|
structlog.stdlib.add_log_level,
|
||||||
|
structlog.stdlib.PositionalArgumentsFormatter(),
|
||||||
|
structlog.processors.TimeStamper(fmt="iso"),
|
||||||
|
structlog.processors.StackInfoRenderer(),
|
||||||
|
structlog.processors.format_exc_info,
|
||||||
|
]
|
||||||
|
|
||||||
|
if log_format == "json":
|
||||||
|
processors.append(structlog.processors.JSONRenderer())
|
||||||
|
else:
|
||||||
|
processors.append(structlog.dev.ConsoleRenderer())
|
||||||
|
|
||||||
|
structlog.configure(
|
||||||
|
processors=processors,
|
||||||
|
wrapper_class=structlog.stdlib.BoundLogger,
|
||||||
|
logger_factory=structlog.stdlib.LoggerFactory(),
|
||||||
|
cache_logger_on_first_use=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logging.basicConfig(level=getattr(logging, log_level.upper()))
|
||||||
|
|
||||||
|
|
||||||
|
@click.group()
|
||||||
|
@click.option(
|
||||||
|
"--config-file",
|
||||||
|
type=click.Path(exists=True),
|
||||||
|
help="Путь к файлу конфигурации .env",
|
||||||
|
)
|
||||||
|
@click.option("--log-level", default="INFO", help="Уровень логирования")
|
||||||
|
@click.option(
|
||||||
|
"--log-format", default="text", type=click.Choice(["json", "text"]), help="Формат логов"
|
||||||
|
)
|
||||||
|
@click.pass_context
|
||||||
|
def main(ctx: click.Context, config_file: str | None, log_level: str, log_format: str) -> None:
|
||||||
|
setup_logging(log_level, log_format)
|
||||||
|
|
||||||
|
if config_file:
|
||||||
|
config = AppConfig(_env_file=config_file)
|
||||||
|
else:
|
||||||
|
config = AppConfig()
|
||||||
|
|
||||||
|
ctx.ensure_object(dict)
|
||||||
|
ctx.obj["config"] = config
|
||||||
|
|
||||||
|
|
||||||
|
@main.command()
|
||||||
|
@click.argument("input_file", type=click.Path(exists=True))
|
||||||
|
@click.option(
|
||||||
|
"--force",
|
||||||
|
is_flag=True,
|
||||||
|
help="Принудительно обработать даже уже обработанные статьи",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"--max-articles",
|
||||||
|
type=int,
|
||||||
|
help="Максимальное количество статей для обработки",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"--max-workers",
|
||||||
|
type=int,
|
||||||
|
help="Максимальное количество worker корутин",
|
||||||
|
)
|
||||||
|
@click.pass_context
|
||||||
|
def process(
|
||||||
|
ctx: click.Context,
|
||||||
|
input_file: str,
|
||||||
|
force: bool,
|
||||||
|
max_articles: int | None,
|
||||||
|
max_workers: int | None,
|
||||||
|
) -> None:
|
||||||
|
config: AppConfig = ctx.obj["config"]
|
||||||
|
|
||||||
|
async def _run() -> None:
|
||||||
|
container = get_container(config)
|
||||||
|
|
||||||
|
try:
|
||||||
|
await container.initialize()
|
||||||
|
|
||||||
|
runner = container.create_runner(max_workers)
|
||||||
|
|
||||||
|
click.echo(f"Запуск обработки статей из файла: {input_file}")
|
||||||
|
click.echo(f"Принудительная обработка: {force}")
|
||||||
|
click.echo(f"Максимум статей: {max_articles or 'без ограничений'}")
|
||||||
|
click.echo(f"Workers: {runner.max_workers}")
|
||||||
|
click.echo()
|
||||||
|
|
||||||
|
stats = await runner.run_from_file(
|
||||||
|
input_file=input_file,
|
||||||
|
force_reprocess=force,
|
||||||
|
max_articles=max_articles,
|
||||||
|
)
|
||||||
|
|
||||||
|
click.echo("\n" + "=" * 50)
|
||||||
|
click.echo("РЕЗУЛЬТАТЫ ОБРАБОТКИ")
|
||||||
|
click.echo("=" * 50)
|
||||||
|
click.echo(f"Всего обработано: {stats.total_processed}")
|
||||||
|
click.echo(f"Успешно: {stats.successful}")
|
||||||
|
click.echo(f"Ошибок: {stats.failed}")
|
||||||
|
click.echo(f"Пропущено: {stats.skipped}")
|
||||||
|
click.echo(f"Процент успеха: {stats.success_rate:.1f}%")
|
||||||
|
|
||||||
|
if stats.successful > 0:
|
||||||
|
click.echo(f"Среднее время обработки: {stats.average_processing_time:.2f}с")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
click.echo(f"Ошибка: {e}", err=True)
|
||||||
|
sys.exit(1)
|
||||||
|
finally:
|
||||||
|
await container.cleanup()
|
||||||
|
|
||||||
|
asyncio.run(_run())
|
||||||
|
|
||||||
|
|
||||||
|
@main.command()
|
||||||
|
@click.pass_context
|
||||||
|
def health(ctx: click.Context) -> None:
|
||||||
|
config: AppConfig = ctx.obj["config"]
|
||||||
|
|
||||||
|
async def _check() -> None:
|
||||||
|
container = get_container(config)
|
||||||
|
|
||||||
|
try:
|
||||||
|
await container.initialize()
|
||||||
|
|
||||||
|
click.echo("Проверка работоспособности системы...")
|
||||||
|
checks = await container.health_check()
|
||||||
|
|
||||||
|
click.echo("\nРезультаты проверки:")
|
||||||
|
all_ok = True
|
||||||
|
|
||||||
|
for component, status in checks.items():
|
||||||
|
status_str = "✓ OK" if status else "✗ FAILED"
|
||||||
|
click.echo(f" {component}: {status_str}")
|
||||||
|
if not status:
|
||||||
|
all_ok = False
|
||||||
|
|
||||||
|
if all_ok:
|
||||||
|
click.echo("\nВсе компоненты работают нормально")
|
||||||
|
else:
|
||||||
|
click.echo("\nОбнаружены проблемы с компонентами")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
click.echo(f"Ошибка при проверке: {e}", err=True)
|
||||||
|
sys.exit(1)
|
||||||
|
finally:
|
||||||
|
await container.cleanup()
|
||||||
|
|
||||||
|
asyncio.run(_check())
|
||||||
|
|
||||||
|
|
||||||
|
@main.command()
|
||||||
|
@click.argument("input_file", type=click.Path(exists=True))
|
||||||
|
@click.pass_context
|
||||||
|
def stats(ctx: click.Context, input_file: str) -> None:
|
||||||
|
from .sources import FileSource
|
||||||
|
|
||||||
|
async def _stats() -> None:
|
||||||
|
try:
|
||||||
|
source = FileSource(input_file)
|
||||||
|
total_urls = await source.count_urls()
|
||||||
|
|
||||||
|
click.echo(f"Файл: {input_file}")
|
||||||
|
click.echo(f"Валидных URL: {total_urls}")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
click.echo(f"Ошибка: {e}", err=True)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
asyncio.run(_stats())
|
||||||
|
|
||||||
|
|
||||||
|
@main.command()
|
||||||
|
@click.option(
|
||||||
|
"--limit",
|
||||||
|
type=int,
|
||||||
|
default=10,
|
||||||
|
help="Количество статей для вывода",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"--status",
|
||||||
|
type=click.Choice(["pending", "processing", "completed", "failed"]),
|
||||||
|
help="Фильтр по статусу",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"--format",
|
||||||
|
"output_format",
|
||||||
|
type=click.Choice(["table", "json"]),
|
||||||
|
default="table",
|
||||||
|
help="Формат вывода",
|
||||||
|
)
|
||||||
|
@click.pass_context
|
||||||
|
def list_articles(
|
||||||
|
ctx: click.Context,
|
||||||
|
limit: int,
|
||||||
|
status: str | None,
|
||||||
|
output_format: str,
|
||||||
|
) -> None:
|
||||||
|
config: AppConfig = ctx.obj["config"]
|
||||||
|
|
||||||
|
async def _list() -> None:
|
||||||
|
container = get_container(config)
|
||||||
|
|
||||||
|
try:
|
||||||
|
await container.initialize()
|
||||||
|
repository = container.get_repository()
|
||||||
|
|
||||||
|
if status:
|
||||||
|
from .models import ProcessingStatus
|
||||||
|
|
||||||
|
status_enum = ProcessingStatus(status)
|
||||||
|
articles = await repository.get_articles_by_status(status_enum, limit)
|
||||||
|
else:
|
||||||
|
articles = await repository.get_all_articles(limit)
|
||||||
|
|
||||||
|
if output_format == "json":
|
||||||
|
data = [
|
||||||
|
{
|
||||||
|
"id": article.id,
|
||||||
|
"url": article.url,
|
||||||
|
"title": article.title,
|
||||||
|
"status": article.status.value,
|
||||||
|
"created_at": article.created_at.isoformat(),
|
||||||
|
"token_count_raw": article.token_count_raw,
|
||||||
|
"token_count_simplified": article.token_count_simplified,
|
||||||
|
}
|
||||||
|
for article in articles
|
||||||
|
]
|
||||||
|
click.echo(json.dumps(data, ensure_ascii=False, indent=2))
|
||||||
|
else:
|
||||||
|
if not articles:
|
||||||
|
click.echo("Статьи не найдены")
|
||||||
|
return
|
||||||
|
|
||||||
|
click.echo(f"{'ID':<5} {'Статус':<12} {'Название':<50} {'Токены (исх/упр)':<15}")
|
||||||
|
click.echo("-" * 87)
|
||||||
|
|
||||||
|
for article in articles:
|
||||||
|
tokens_info = ""
|
||||||
|
if article.token_count_raw and article.token_count_simplified:
|
||||||
|
tokens_info = f"{article.token_count_raw}/{article.token_count_simplified}"
|
||||||
|
elif article.token_count_raw:
|
||||||
|
tokens_info = f"{article.token_count_raw}/-"
|
||||||
|
|
||||||
|
title = article.title[:47] + "..." if len(article.title) > 50 else article.title
|
||||||
|
|
||||||
|
click.echo(
|
||||||
|
f"{article.id:<5} {article.status.value:<12} {title:<50} {tokens_info:<15}"
|
||||||
|
)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
click.echo(f"Ошибка: {e}", err=True)
|
||||||
|
sys.exit(1)
|
||||||
|
finally:
|
||||||
|
await container.cleanup()
|
||||||
|
|
||||||
|
asyncio.run(_list())
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
|
@ -0,0 +1,149 @@
|
||||||
|
from functools import lru_cache
|
||||||
|
|
||||||
|
import structlog
|
||||||
|
|
||||||
|
from .adapters import LLMProviderAdapter, RuWikiAdapter
|
||||||
|
from .models import AppConfig
|
||||||
|
from .runner import AsyncRunner
|
||||||
|
from .services import ArticleRepository, AsyncWriteQueue, DatabaseService, SimplifyService
|
||||||
|
|
||||||
|
logger = structlog.get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class DependencyContainer:
|
||||||
|
def __init__(self, config: AppConfig) -> None:
|
||||||
|
self.config = config
|
||||||
|
self._database_service: DatabaseService | None = None
|
||||||
|
self._repository: ArticleRepository | None = None
|
||||||
|
self._write_queue: AsyncWriteQueue | None = None
|
||||||
|
self._ruwiki_adapter: RuWikiAdapter | None = None
|
||||||
|
self._llm_adapter: LLMProviderAdapter | None = None
|
||||||
|
self._simplify_service: SimplifyService | None = None
|
||||||
|
self._initialized = False
|
||||||
|
|
||||||
|
async def initialize(self) -> None:
|
||||||
|
if self._initialized:
|
||||||
|
return
|
||||||
|
|
||||||
|
logger.info("Инициализация системы...")
|
||||||
|
|
||||||
|
db_service = self.get_database_service()
|
||||||
|
await db_service.initialize_database()
|
||||||
|
|
||||||
|
write_queue = self.get_write_queue()
|
||||||
|
await write_queue.start()
|
||||||
|
|
||||||
|
self._initialized = True
|
||||||
|
logger.info("Система инициализирована")
|
||||||
|
|
||||||
|
async def cleanup(self) -> None:
|
||||||
|
if not self._initialized:
|
||||||
|
return
|
||||||
|
|
||||||
|
logger.info("Очистка ресурсов...")
|
||||||
|
|
||||||
|
if self._write_queue:
|
||||||
|
await self._write_queue.stop()
|
||||||
|
|
||||||
|
if self._database_service:
|
||||||
|
self._database_service.close()
|
||||||
|
|
||||||
|
self._initialized = False
|
||||||
|
logger.info("Ресурсы очищены")
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
|
||||||
|
def get_database_service(self) -> DatabaseService:
|
||||||
|
if self._database_service is None:
|
||||||
|
self._database_service = DatabaseService(self.config)
|
||||||
|
return self._database_service
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
|
||||||
|
def get_repository(self) -> ArticleRepository:
|
||||||
|
if self._repository is None:
|
||||||
|
db_service = self.get_database_service()
|
||||||
|
self._repository = ArticleRepository(db_service)
|
||||||
|
return self._repository
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
|
||||||
|
def get_write_queue(self) -> AsyncWriteQueue:
|
||||||
|
if self._write_queue is None:
|
||||||
|
repository = self.get_repository()
|
||||||
|
self._write_queue = AsyncWriteQueue(repository, max_batch_size=10)
|
||||||
|
return self._write_queue
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
|
||||||
|
def get_ruwiki_adapter(self) -> RuWikiAdapter:
|
||||||
|
if self._ruwiki_adapter is None:
|
||||||
|
self._ruwiki_adapter = RuWikiAdapter(self.config)
|
||||||
|
return self._ruwiki_adapter
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
|
||||||
|
def get_llm_adapter(self) -> LLMProviderAdapter:
|
||||||
|
if self._llm_adapter is None:
|
||||||
|
self._llm_adapter = LLMProviderAdapter(self.config)
|
||||||
|
return self._llm_adapter
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
|
||||||
|
def get_simplify_service(self) -> SimplifyService:
|
||||||
|
if self._simplify_service is None:
|
||||||
|
self._simplify_service = SimplifyService(
|
||||||
|
config=self.config,
|
||||||
|
ruwiki_adapter=self.get_ruwiki_adapter(),
|
||||||
|
llm_adapter=self.get_llm_adapter(),
|
||||||
|
repository=self.get_repository(),
|
||||||
|
write_queue=self.get_write_queue(),
|
||||||
|
)
|
||||||
|
return self._simplify_service
|
||||||
|
|
||||||
|
def create_runner(self, max_workers: int | None = None) -> AsyncRunner:
|
||||||
|
if max_workers is None:
|
||||||
|
max_workers = min(
|
||||||
|
self.config.max_concurrent_llm,
|
||||||
|
self.config.max_concurrent_wiki,
|
||||||
|
10,
|
||||||
|
)
|
||||||
|
|
||||||
|
return AsyncRunner(
|
||||||
|
config=self.config,
|
||||||
|
simplify_service=self.get_simplify_service(),
|
||||||
|
max_workers=max_workers,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def health_check(self) -> dict[str, bool]:
|
||||||
|
checks = {}
|
||||||
|
|
||||||
|
try:
|
||||||
|
db_service = self.get_database_service()
|
||||||
|
checks["database"] = await db_service.health_check()
|
||||||
|
except Exception:
|
||||||
|
checks["database"] = False
|
||||||
|
|
||||||
|
try:
|
||||||
|
write_queue = self.get_write_queue()
|
||||||
|
checks["write_queue"] = (
|
||||||
|
write_queue._worker_task is not None and not write_queue._worker_task.done()
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
checks["write_queue"] = False
|
||||||
|
|
||||||
|
try:
|
||||||
|
ruwiki = self.get_ruwiki_adapter()
|
||||||
|
checks["ruwiki"] = await ruwiki.health_check()
|
||||||
|
except Exception:
|
||||||
|
checks["ruwiki"] = False
|
||||||
|
|
||||||
|
try:
|
||||||
|
llm = self.get_llm_adapter()
|
||||||
|
checks["llm"] = await llm.health_check()
|
||||||
|
except Exception:
|
||||||
|
checks["llm"] = False
|
||||||
|
|
||||||
|
return checks
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
|
||||||
|
def get_container(config: AppConfig | None = None) -> DependencyContainer:
|
||||||
|
if config is None:
|
||||||
|
config = AppConfig()
|
||||||
|
|
||||||
|
return DependencyContainer(config)
|
|
@ -0,0 +1,15 @@
|
||||||
|
from .article import Article, ArticleCreate, ArticleRead, ProcessingStatus
|
||||||
|
from .commands import ProcessingResult, ProcessingStats, SimplifyCommand
|
||||||
|
from .config import AppConfig
|
||||||
|
from .constants import *
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"AppConfig",
|
||||||
|
"Article",
|
||||||
|
"ArticleCreate",
|
||||||
|
"ArticleRead",
|
||||||
|
"ProcessingResult",
|
||||||
|
"ProcessingStats",
|
||||||
|
"ProcessingStatus",
|
||||||
|
"SimplifyCommand",
|
||||||
|
]
|
|
@ -0,0 +1,81 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
from sqlmodel import Field, SQLModel
|
||||||
|
|
||||||
|
|
||||||
|
class ProcessingStatus(str, Enum):
|
||||||
|
PENDING = "pending"
|
||||||
|
PROCESSING = "processing"
|
||||||
|
COMPLETED = "completed"
|
||||||
|
FAILED = "failed"
|
||||||
|
|
||||||
|
|
||||||
|
class Article(SQLModel, table=True):
|
||||||
|
|
||||||
|
__tablename__ = "articles"
|
||||||
|
|
||||||
|
id: int | None = Field(default=None, primary_key=True)
|
||||||
|
url: str = Field(index=True, unique=True, max_length=500)
|
||||||
|
title: str = Field(max_length=300)
|
||||||
|
raw_text: str = Field(description="Исходный wiki-текст")
|
||||||
|
simplified_text: str | None = Field(
|
||||||
|
default=None,
|
||||||
|
description="Упрощённый текст для школьников",
|
||||||
|
)
|
||||||
|
status: ProcessingStatus = Field(default=ProcessingStatus.PENDING)
|
||||||
|
error_message: str | None = Field(default=None, max_length=1000)
|
||||||
|
token_count_raw: int | None = Field(
|
||||||
|
default=None, description="Количество токенов в исходном тексте"
|
||||||
|
)
|
||||||
|
token_count_simplified: int | None = Field(
|
||||||
|
default=None,
|
||||||
|
description="Количество токенов в упрощённом тексте",
|
||||||
|
)
|
||||||
|
processing_time_seconds: float | None = Field(default=None)
|
||||||
|
created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
|
||||||
|
updated_at: datetime | None = Field(default=None)
|
||||||
|
|
||||||
|
def mark_processing(self) -> None:
|
||||||
|
self.status = ProcessingStatus.PROCESSING
|
||||||
|
self.updated_at = datetime.now(timezone.utc)
|
||||||
|
|
||||||
|
def mark_completed(
|
||||||
|
self,
|
||||||
|
simplified_text: str,
|
||||||
|
token_count_raw: int,
|
||||||
|
token_count_simplified: int,
|
||||||
|
processing_time: float,
|
||||||
|
) -> None:
|
||||||
|
self.simplified_text = simplified_text
|
||||||
|
self.token_count_raw = token_count_raw
|
||||||
|
self.token_count_simplified = token_count_simplified
|
||||||
|
self.processing_time_seconds = processing_time
|
||||||
|
self.status = ProcessingStatus.COMPLETED
|
||||||
|
self.error_message = None
|
||||||
|
self.updated_at = datetime.now(timezone.utc)
|
||||||
|
|
||||||
|
def mark_failed(self, error_message: str) -> None:
|
||||||
|
self.status = ProcessingStatus.FAILED
|
||||||
|
self.error_message = error_message[:1000]
|
||||||
|
self.updated_at = datetime.now(timezone.utc)
|
||||||
|
|
||||||
|
|
||||||
|
class ArticleCreate(SQLModel):
|
||||||
|
url: str
|
||||||
|
title: str
|
||||||
|
raw_text: str
|
||||||
|
|
||||||
|
|
||||||
|
class ArticleRead(SQLModel):
|
||||||
|
id: int
|
||||||
|
url: str
|
||||||
|
title: str
|
||||||
|
raw_text: str
|
||||||
|
simplified_text: str | None
|
||||||
|
status: ProcessingStatus
|
||||||
|
token_count_raw: int | None
|
||||||
|
token_count_simplified: int | None
|
||||||
|
created_at: datetime
|
|
@ -0,0 +1,89 @@
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class SimplifyCommand:
|
||||||
|
url: str
|
||||||
|
force_reprocess: bool = False
|
||||||
|
|
||||||
|
def __str__(self) -> str:
|
||||||
|
return f"SimplifyCommand(url='{self.url}', force={self.force_reprocess})"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ProcessingResult:
|
||||||
|
|
||||||
|
url: str
|
||||||
|
success: bool
|
||||||
|
title: str | None = None
|
||||||
|
raw_text: str | None = None
|
||||||
|
simplified_text: str | None = None
|
||||||
|
token_count_raw: int | None = None
|
||||||
|
token_count_simplified: int | None = None
|
||||||
|
processing_time_seconds: float | None = None
|
||||||
|
error_message: str | None = None
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def success_result(
|
||||||
|
cls,
|
||||||
|
url: str,
|
||||||
|
title: str,
|
||||||
|
raw_text: str,
|
||||||
|
simplified_text: str,
|
||||||
|
token_count_raw: int,
|
||||||
|
token_count_simplified: int,
|
||||||
|
processing_time_seconds: float,
|
||||||
|
) -> "ProcessingResult":
|
||||||
|
return cls(
|
||||||
|
url=url,
|
||||||
|
success=True,
|
||||||
|
title=title,
|
||||||
|
raw_text=raw_text,
|
||||||
|
simplified_text=simplified_text,
|
||||||
|
token_count_raw=token_count_raw,
|
||||||
|
token_count_simplified=token_count_simplified,
|
||||||
|
processing_time_seconds=processing_time_seconds,
|
||||||
|
)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def failure_result(cls, url: str, error_message: str) -> "ProcessingResult":
|
||||||
|
return cls(
|
||||||
|
url=url,
|
||||||
|
success=False,
|
||||||
|
error_message=error_message,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ProcessingStats:
|
||||||
|
|
||||||
|
total_processed: int = 0
|
||||||
|
successful: int = 0
|
||||||
|
failed: int = 0
|
||||||
|
skipped: int = 0
|
||||||
|
total_processing_time: float = 0.0
|
||||||
|
|
||||||
|
@property
|
||||||
|
def success_rate(self) -> float:
|
||||||
|
if self.total_processed == 0:
|
||||||
|
return 0.0
|
||||||
|
return (self.successful / self.total_processed) * 100.0
|
||||||
|
|
||||||
|
@property
|
||||||
|
def average_processing_time(self) -> float:
|
||||||
|
if self.successful == 0:
|
||||||
|
return 0.0
|
||||||
|
return self.total_processing_time / self.successful
|
||||||
|
|
||||||
|
def add_result(self, result: ProcessingResult) -> None:
|
||||||
|
self.total_processed += 1
|
||||||
|
|
||||||
|
if result.success:
|
||||||
|
self.successful += 1
|
||||||
|
if result.processing_time_seconds:
|
||||||
|
self.total_processing_time += result.processing_time_seconds
|
||||||
|
else:
|
||||||
|
self.failed += 1
|
||||||
|
|
||||||
|
def add_skipped(self) -> None:
|
||||||
|
self.skipped += 1
|
|
@ -0,0 +1,73 @@
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Literal
|
||||||
|
|
||||||
|
from pydantic import Field, field_validator
|
||||||
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||||
|
|
||||||
|
|
||||||
|
class AppConfig(BaseSettings):
|
||||||
|
|
||||||
|
model_config = SettingsConfigDict(
|
||||||
|
env_file=".env",
|
||||||
|
env_file_encoding="utf-8",
|
||||||
|
case_sensitive=False,
|
||||||
|
extra="ignore",
|
||||||
|
)
|
||||||
|
|
||||||
|
openai_api_key: str = Field(description="API ключ OpenAI")
|
||||||
|
openai_model: str = Field(default="gpt-4o-mini", description="Модель OpenAI для упрощения")
|
||||||
|
openai_temperature: float = Field(
|
||||||
|
default=0.0, ge=0.0, le=2.0, description="Температура для LLM"
|
||||||
|
)
|
||||||
|
|
||||||
|
db_path: str = Field(default="./data/wiki.db", description="Путь к файлу SQLite")
|
||||||
|
|
||||||
|
max_concurrent_llm: int = Field(
|
||||||
|
default=5, ge=1, le=50, description="Максимум одновременных LLM запросов"
|
||||||
|
)
|
||||||
|
openai_rpm: int = Field(default=200, ge=1, description="Лимит запросов в минуту для OpenAI")
|
||||||
|
max_concurrent_wiki: int = Field(
|
||||||
|
default=10, ge=1, le=100, description="Максимум одновременных wiki запросов"
|
||||||
|
)
|
||||||
|
|
||||||
|
log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR"] = Field(default="INFO")
|
||||||
|
log_format: Literal["json", "text"] = Field(default="json")
|
||||||
|
|
||||||
|
chunk_size: int = Field(default=2000, ge=500, le=8000, description="Размер чанка для текста")
|
||||||
|
chunk_overlap: int = Field(default=200, ge=0, le=1000, description="Перекрытие между чанками")
|
||||||
|
max_retries: int = Field(default=3, ge=1, le=10, description="Максимум попыток повтора")
|
||||||
|
retry_delay: float = Field(
|
||||||
|
default=1.0, ge=0.1, le=60.0, description="Задержка между попытками (сек)"
|
||||||
|
)
|
||||||
|
|
||||||
|
circuit_failure_threshold: int = Field(
|
||||||
|
default=5, ge=1, le=20, description="Порог отказов для circuit breaker"
|
||||||
|
)
|
||||||
|
circuit_recovery_timeout: int = Field(
|
||||||
|
default=60,
|
||||||
|
ge=10,
|
||||||
|
le=600,
|
||||||
|
description="Время восстановления circuit breaker (сек)",
|
||||||
|
)
|
||||||
|
|
||||||
|
prompt_template_path: str = Field(
|
||||||
|
default="src/prompt.txt", description="Путь к файлу с prompt-шаблоном"
|
||||||
|
)
|
||||||
|
input_file_path: str = Field(
|
||||||
|
default="input.txt", description="Путь к файлу с URL для обработки"
|
||||||
|
)
|
||||||
|
|
||||||
|
@field_validator("db_path")
|
||||||
|
@classmethod
|
||||||
|
def validate_db_path(cls, v: str) -> str:
|
||||||
|
db_path = Path(v)
|
||||||
|
db_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
return str(db_path)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def db_url(self) -> str:
|
||||||
|
return f"sqlite+aiosqlite:///{self.db_path}"
|
||||||
|
|
||||||
|
@property
|
||||||
|
def sync_db_url(self) -> str:
|
||||||
|
return f"sqlite:///{self.db_path}"
|
|
@ -0,0 +1,28 @@
|
||||||
|
### role: system
|
||||||
|
Ты — опытный редактор Рувики и педагог-методист. Твоя задача — адаптировать научные статьи для школьного образования.
|
||||||
|
|
||||||
|
ПРАВИЛА УПРОЩЕНИЯ:
|
||||||
|
1. Сократи текст до ≤ 1000 токенов, сохранив ключевую информацию
|
||||||
|
2. Замени сложные термины на простые аналоги с объяснениями
|
||||||
|
3. Убери избыточные детали, оставь только суть
|
||||||
|
4. Сохрани корректную wiki-разметку (== заголовки ==, '''жирный''', ''курсив'', [[ссылки]])
|
||||||
|
5. Структурируй материал логично: определение → основные свойства → примеры
|
||||||
|
6. Добавь простые примеры для лучшего понимания
|
||||||
|
7. Убери технические подробности, не нужные школьникам
|
||||||
|
|
||||||
|
ЦЕЛЬ: Сделать статью понятной для учеников 8-11 классов, сохранив научную точность.
|
||||||
|
|
||||||
|
ФОРМАТ ОТВЕТА:
|
||||||
|
- Начни сразу с упрощённого wiki-текста
|
||||||
|
- Используй простые предложения
|
||||||
|
- Избегай сложных конструкций
|
||||||
|
- Заверши ответ маркером ###END###
|
||||||
|
|
||||||
|
### role: user
|
||||||
|
Статья: {title}
|
||||||
|
|
||||||
|
<WikiSource>
|
||||||
|
{wiki_source_text}
|
||||||
|
</WikiSource>
|
||||||
|
|
||||||
|
Задание: сократи и упрости текст, следуя инструкциям system-сообщения.
|
|
@ -0,0 +1,214 @@
|
||||||
|
import asyncio
|
||||||
|
import signal
|
||||||
|
import time
|
||||||
|
|
||||||
|
import structlog
|
||||||
|
|
||||||
|
from .models import AppConfig, ProcessingStats, SimplifyCommand
|
||||||
|
from .services import SimplifyService
|
||||||
|
from .sources import FileSource
|
||||||
|
|
||||||
|
logger = structlog.get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class AsyncRunner:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
config: AppConfig,
|
||||||
|
simplify_service: SimplifyService,
|
||||||
|
max_workers: int = 10,
|
||||||
|
) -> None:
|
||||||
|
self.config = config
|
||||||
|
self.simplify_service = simplify_service
|
||||||
|
self.max_workers = max_workers
|
||||||
|
|
||||||
|
self._task_queue: asyncio.Queue[SimplifyCommand] = asyncio.Queue()
|
||||||
|
self._workers: list[asyncio.Task[None]] = []
|
||||||
|
self._shutdown_event = asyncio.Event()
|
||||||
|
|
||||||
|
self.stats = ProcessingStats()
|
||||||
|
self._start_time: float | None = None
|
||||||
|
|
||||||
|
self.logger = structlog.get_logger().bind(service="runner")
|
||||||
|
|
||||||
|
async def run_from_file(
|
||||||
|
self,
|
||||||
|
input_file: str,
|
||||||
|
force_reprocess: bool = False,
|
||||||
|
max_articles: int | None = None,
|
||||||
|
) -> ProcessingStats:
|
||||||
|
self.logger.info(
|
||||||
|
"Запуск обработки статей из файла",
|
||||||
|
input_file=input_file,
|
||||||
|
force_reprocess=force_reprocess,
|
||||||
|
max_workers=self.max_workers,
|
||||||
|
max_articles=max_articles,
|
||||||
|
)
|
||||||
|
|
||||||
|
self._setup_signal_handlers()
|
||||||
|
|
||||||
|
try:
|
||||||
|
source = FileSource(input_file)
|
||||||
|
await self._load_tasks_from_source(source, force_reprocess, max_articles)
|
||||||
|
|
||||||
|
await self._run_processing()
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error("Ошибка при выполнении runner", error=str(e))
|
||||||
|
raise
|
||||||
|
finally:
|
||||||
|
await self._cleanup()
|
||||||
|
|
||||||
|
return self.stats
|
||||||
|
|
||||||
|
async def _load_tasks_from_source(
|
||||||
|
self,
|
||||||
|
source: FileSource,
|
||||||
|
force_reprocess: bool,
|
||||||
|
max_articles: int | None,
|
||||||
|
) -> None:
|
||||||
|
loaded_count = 0
|
||||||
|
|
||||||
|
async for command in source.read_urls(force_reprocess):
|
||||||
|
if max_articles and loaded_count >= max_articles:
|
||||||
|
break
|
||||||
|
|
||||||
|
await self._task_queue.put(command)
|
||||||
|
loaded_count += 1
|
||||||
|
|
||||||
|
self.logger.info("Задачи загружены в очередь", count=loaded_count)
|
||||||
|
|
||||||
|
async def _run_processing(self) -> None:
|
||||||
|
self._start_time = time.time()
|
||||||
|
|
||||||
|
self.logger.info("Запуск worker корутин", count=self.max_workers)
|
||||||
|
|
||||||
|
for i in range(self.max_workers):
|
||||||
|
worker = asyncio.create_task(self._worker_loop(worker_id=i))
|
||||||
|
self._workers.append(worker)
|
||||||
|
|
||||||
|
await self._task_queue.join()
|
||||||
|
|
||||||
|
self._shutdown_event.set()
|
||||||
|
|
||||||
|
if self._workers:
|
||||||
|
await asyncio.gather(*self._workers, return_exceptions=True)
|
||||||
|
|
||||||
|
async def _worker_loop(self, worker_id: int) -> None:
|
||||||
|
worker_logger = self.logger.bind(worker_id=worker_id)
|
||||||
|
worker_logger.info("Worker запущен")
|
||||||
|
|
||||||
|
processed_count = 0
|
||||||
|
|
||||||
|
while not self._shutdown_event.is_set():
|
||||||
|
try:
|
||||||
|
try:
|
||||||
|
command = await asyncio.wait_for(
|
||||||
|
self._task_queue.get(),
|
||||||
|
timeout=1.0,
|
||||||
|
)
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
result = await self.simplify_service.process_command(command)
|
||||||
|
|
||||||
|
self.stats.add_result(result)
|
||||||
|
processed_count += 1
|
||||||
|
|
||||||
|
if result.success:
|
||||||
|
worker_logger.info(
|
||||||
|
"Статья обработана успешно",
|
||||||
|
url=command.url,
|
||||||
|
title=result.title,
|
||||||
|
tokens_in=result.token_count_raw,
|
||||||
|
tokens_out=result.token_count_simplified,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
worker_logger.warning(
|
||||||
|
"Ошибка при обработке статьи",
|
||||||
|
url=command.url,
|
||||||
|
error=result.error_message,
|
||||||
|
)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
worker_logger.error(
|
||||||
|
"Неожиданная ошибка в worker",
|
||||||
|
url=command.url,
|
||||||
|
error=str(e),
|
||||||
|
)
|
||||||
|
|
||||||
|
from .models import ProcessingResult
|
||||||
|
|
||||||
|
error_result = ProcessingResult.failure_result(
|
||||||
|
command.url,
|
||||||
|
f"Неожиданная ошибка: {e!s}",
|
||||||
|
)
|
||||||
|
self.stats.add_result(error_result)
|
||||||
|
|
||||||
|
finally:
|
||||||
|
self._task_queue.task_done()
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
worker_logger.error("Критическая ошибка в worker loop", error=str(e))
|
||||||
|
break
|
||||||
|
|
||||||
|
worker_logger.info("Worker завершён", processed_articles=processed_count)
|
||||||
|
|
||||||
|
def _setup_signal_handlers(self) -> None:
|
||||||
|
def signal_handler(signum: int, frame: None) -> None:
|
||||||
|
signal_name = signal.Signals(signum).name
|
||||||
|
self.logger.info(f"Получен сигнал {signal_name}, начинаем graceful shutdown")
|
||||||
|
self._shutdown_event.set()
|
||||||
|
|
||||||
|
try:
|
||||||
|
signal.signal(signal.SIGINT, signal_handler)
|
||||||
|
signal.signal(signal.SIGTERM, signal_handler)
|
||||||
|
except ValueError:
|
||||||
|
self.logger.warning("Не удалось настроить обработчики сигналов")
|
||||||
|
|
||||||
|
async def _cleanup(self) -> None:
|
||||||
|
self.logger.info("Начинаем очистку ресурсов")
|
||||||
|
|
||||||
|
for worker in self._workers:
|
||||||
|
if not worker.done():
|
||||||
|
worker.cancel()
|
||||||
|
|
||||||
|
if self._workers:
|
||||||
|
results = await asyncio.gather(*self._workers, return_exceptions=True)
|
||||||
|
cancelled_count = sum(1 for r in results if isinstance(r, asyncio.CancelledError))
|
||||||
|
if cancelled_count > 0:
|
||||||
|
self.logger.info("Workers отменены", count=cancelled_count)
|
||||||
|
|
||||||
|
self._workers.clear()
|
||||||
|
|
||||||
|
def get_progress_info(self) -> dict[str, any]:
|
||||||
|
elapsed_time = time.time() - self._start_time if self._start_time else 0
|
||||||
|
|
||||||
|
articles_per_minute = 0
|
||||||
|
if elapsed_time > 0:
|
||||||
|
articles_per_minute = (self.stats.successful * 60) / elapsed_time
|
||||||
|
|
||||||
|
return {
|
||||||
|
"total_processed": self.stats.total_processed,
|
||||||
|
"successful": self.stats.successful,
|
||||||
|
"failed": self.stats.failed,
|
||||||
|
"success_rate": self.stats.success_rate,
|
||||||
|
"elapsed_time": elapsed_time,
|
||||||
|
"articles_per_minute": articles_per_minute,
|
||||||
|
"queue_size": self._task_queue.qsize(),
|
||||||
|
"active_workers": len([w for w in self._workers if not w.done()]),
|
||||||
|
}
|
||||||
|
|
||||||
|
async def health_check(self) -> dict[str, any]:
|
||||||
|
checks = await self.simplify_service.health_check()
|
||||||
|
|
||||||
|
checks.update(
|
||||||
|
{
|
||||||
|
"runner_active": bool(self._workers and not self._shutdown_event.is_set()),
|
||||||
|
"queue_size": self._task_queue.qsize(),
|
||||||
|
"workers_count": len(self._workers),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
return checks
|
|
@ -0,0 +1,14 @@
|
||||||
|
from .database import DatabaseService
|
||||||
|
from .repository import ArticleRepository
|
||||||
|
from .simplify_service import SimplifyService
|
||||||
|
from .text_splitter import RecursiveCharacterTextSplitter
|
||||||
|
from .write_queue import AsyncWriteQueue, WriteOperation
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"ArticleRepository",
|
||||||
|
"AsyncWriteQueue",
|
||||||
|
"DatabaseService",
|
||||||
|
"RecursiveCharacterTextSplitter",
|
||||||
|
"SimplifyService",
|
||||||
|
"WriteOperation",
|
||||||
|
]
|
|
@ -0,0 +1,65 @@
|
||||||
|
"""Сервис для управления базой данных."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import aiosqlite
|
||||||
|
import structlog
|
||||||
|
from sqlmodel import SQLModel, create_engine
|
||||||
|
|
||||||
|
from ..models import AppConfig
|
||||||
|
|
||||||
|
logger = structlog.get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class DatabaseService:
|
||||||
|
def __init__(self, config: AppConfig) -> None:
|
||||||
|
self.config = config
|
||||||
|
self.logger = structlog.get_logger().bind(service="database")
|
||||||
|
|
||||||
|
self._sync_engine = create_engine(
|
||||||
|
config.sync_db_url,
|
||||||
|
echo=False,
|
||||||
|
connect_args={"check_same_thread": False},
|
||||||
|
)
|
||||||
|
|
||||||
|
async def initialize_database(self) -> None:
|
||||||
|
db_path = Path(self.config.db_path)
|
||||||
|
db_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
self.logger.info("Создание схемы базы данных", db_path=self.config.db_path)
|
||||||
|
SQLModel.metadata.create_all(self._sync_engine)
|
||||||
|
|
||||||
|
await self._configure_sqlite()
|
||||||
|
|
||||||
|
self.logger.info("База данных инициализирована", db_path=self.config.db_path)
|
||||||
|
|
||||||
|
async def _configure_sqlite(self) -> None:
|
||||||
|
async with aiosqlite.connect(self.config.db_path) as conn:
|
||||||
|
await conn.execute("PRAGMA journal_mode=WAL")
|
||||||
|
|
||||||
|
await conn.execute("PRAGMA cache_size=10000")
|
||||||
|
|
||||||
|
await conn.execute("PRAGMA synchronous=NORMAL")
|
||||||
|
|
||||||
|
await conn.execute("PRAGMA busy_timeout=30000")
|
||||||
|
|
||||||
|
await conn.commit()
|
||||||
|
self.logger.info("SQLite настроен для оптимальной производительности")
|
||||||
|
|
||||||
|
async def get_connection(self) -> aiosqlite.Connection:
|
||||||
|
return await aiosqlite.connect(
|
||||||
|
self.config.db_path,
|
||||||
|
timeout=30.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def health_check(self) -> bool:
|
||||||
|
try:
|
||||||
|
async with self.get_connection() as conn:
|
||||||
|
await conn.execute("SELECT 1")
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error("Database health check failed", error=str(e))
|
||||||
|
return False
|
||||||
|
|
||||||
|
def close(self) -> None:
|
||||||
|
self._sync_engine.dispose()
|
|
@ -0,0 +1,188 @@
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import aiosqlite
|
||||||
|
import structlog
|
||||||
|
|
||||||
|
from ..models import Article, ArticleCreate, ProcessingStatus
|
||||||
|
from .database import DatabaseService
|
||||||
|
|
||||||
|
logger = structlog.get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class ArticleRepository:
|
||||||
|
|
||||||
|
def __init__(self, db_service: DatabaseService) -> None:
|
||||||
|
self.db_service = db_service
|
||||||
|
self.logger = structlog.get_logger().bind(repository="article")
|
||||||
|
|
||||||
|
async def create_article(self, article_data: ArticleCreate) -> Article:
|
||||||
|
existing = await self.get_by_url(article_data.url)
|
||||||
|
if existing:
|
||||||
|
raise ValueError(f"Статья с URL {article_data.url} уже существует")
|
||||||
|
|
||||||
|
article = Article(
|
||||||
|
url=article_data.url,
|
||||||
|
title=article_data.title,
|
||||||
|
raw_text=article_data.raw_text,
|
||||||
|
status=ProcessingStatus.PENDING,
|
||||||
|
)
|
||||||
|
|
||||||
|
async with self.db_service.get_connection() as conn:
|
||||||
|
cursor = await conn.execute(
|
||||||
|
"""
|
||||||
|
INSERT INTO articles (url, title, raw_text, status, created_at)
|
||||||
|
VALUES (?, ?, ?, ?, ?)
|
||||||
|
""",
|
||||||
|
(
|
||||||
|
article.url,
|
||||||
|
article.title,
|
||||||
|
article.raw_text,
|
||||||
|
article.status.value,
|
||||||
|
article.created_at,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
await conn.commit()
|
||||||
|
|
||||||
|
article.id = cursor.lastrowid
|
||||||
|
|
||||||
|
self.logger.info("Статья создана", article_id=article.id, url=article.url)
|
||||||
|
return article
|
||||||
|
|
||||||
|
async def get_by_id(self, article_id: int) -> Article | None:
|
||||||
|
async with self.db_service.get_connection() as conn:
|
||||||
|
cursor = await conn.execute(
|
||||||
|
"SELECT * FROM articles WHERE id = ?",
|
||||||
|
(article_id,),
|
||||||
|
)
|
||||||
|
row = await cursor.fetchone()
|
||||||
|
|
||||||
|
if not row:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return self._row_to_article(row)
|
||||||
|
|
||||||
|
async def get_by_url(self, url: str) -> Article | None:
|
||||||
|
async with self.db_service.get_connection() as conn:
|
||||||
|
cursor = await conn.execute(
|
||||||
|
"SELECT * FROM articles WHERE url = ?",
|
||||||
|
(url,),
|
||||||
|
)
|
||||||
|
row = await cursor.fetchone()
|
||||||
|
|
||||||
|
if not row:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return self._row_to_article(row)
|
||||||
|
|
||||||
|
async def update_article(self, article: Article) -> Article:
|
||||||
|
if not article.id:
|
||||||
|
raise ValueError("ID статьи не может быть None для обновления")
|
||||||
|
|
||||||
|
async with self.db_service.get_connection() as conn:
|
||||||
|
cursor = await conn.execute(
|
||||||
|
"""
|
||||||
|
UPDATE articles SET
|
||||||
|
title = ?,
|
||||||
|
raw_text = ?,
|
||||||
|
simplified_text = ?,
|
||||||
|
status = ?,
|
||||||
|
error_message = ?,
|
||||||
|
token_count_raw = ?,
|
||||||
|
token_count_simplified = ?,
|
||||||
|
processing_time_seconds = ?,
|
||||||
|
updated_at = ?
|
||||||
|
WHERE id = ?
|
||||||
|
""",
|
||||||
|
(
|
||||||
|
article.title,
|
||||||
|
article.raw_text,
|
||||||
|
article.simplified_text,
|
||||||
|
article.status.value,
|
||||||
|
article.error_message,
|
||||||
|
article.token_count_raw,
|
||||||
|
article.token_count_simplified,
|
||||||
|
article.processing_time_seconds,
|
||||||
|
article.updated_at,
|
||||||
|
article.id,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
await conn.commit()
|
||||||
|
|
||||||
|
if cursor.rowcount == 0:
|
||||||
|
raise ValueError(f"Статья с ID {article.id} не найдена")
|
||||||
|
|
||||||
|
self.logger.info("Статья обновлена", article_id=article.id, status=article.status)
|
||||||
|
return article
|
||||||
|
|
||||||
|
async def get_articles_by_status(
|
||||||
|
self, status: ProcessingStatus, limit: int | None = None
|
||||||
|
) -> list[Article]:
|
||||||
|
query = "SELECT * FROM articles WHERE status = ?"
|
||||||
|
params: tuple[Any, ...] = (status.value,)
|
||||||
|
|
||||||
|
if limit:
|
||||||
|
query += " LIMIT ?"
|
||||||
|
params = params + (limit,)
|
||||||
|
|
||||||
|
async with self.db_service.get_connection() as conn:
|
||||||
|
cursor = await conn.execute(query, params)
|
||||||
|
rows = await cursor.fetchall()
|
||||||
|
|
||||||
|
return [self._row_to_article(row) for row in rows]
|
||||||
|
|
||||||
|
async def get_pending_articles(self, limit: int | None = None) -> list[Article]:
|
||||||
|
return await self.get_articles_by_status(ProcessingStatus.PENDING, limit)
|
||||||
|
|
||||||
|
async def count_by_status(self, status: ProcessingStatus) -> int:
|
||||||
|
async with self.db_service.get_connection() as conn:
|
||||||
|
cursor = await conn.execute(
|
||||||
|
"SELECT COUNT(*) FROM articles WHERE status = ?",
|
||||||
|
(status.value,),
|
||||||
|
)
|
||||||
|
result = await cursor.fetchone()
|
||||||
|
|
||||||
|
return result[0] if result else 0
|
||||||
|
|
||||||
|
async def get_all_articles(self, limit: int | None = None, offset: int = 0) -> list[Article]:
|
||||||
|
query = "SELECT * FROM articles ORDER BY created_at DESC"
|
||||||
|
params: tuple[Any, ...] = ()
|
||||||
|
|
||||||
|
if limit:
|
||||||
|
query += " LIMIT ? OFFSET ?"
|
||||||
|
params = (limit, offset)
|
||||||
|
|
||||||
|
async with self.db_service.get_connection() as conn:
|
||||||
|
cursor = await conn.execute(query, params)
|
||||||
|
rows = await cursor.fetchall()
|
||||||
|
|
||||||
|
return [self._row_to_article(row) for row in rows]
|
||||||
|
|
||||||
|
async def delete_article(self, article_id: int) -> bool:
|
||||||
|
async with self.db_service.get_connection() as conn:
|
||||||
|
cursor = await conn.execute(
|
||||||
|
"DELETE FROM articles WHERE id = ?",
|
||||||
|
(article_id,),
|
||||||
|
)
|
||||||
|
await conn.commit()
|
||||||
|
|
||||||
|
deleted = cursor.rowcount > 0
|
||||||
|
if deleted:
|
||||||
|
self.logger.info("Статья удалена", article_id=article_id)
|
||||||
|
|
||||||
|
return deleted
|
||||||
|
|
||||||
|
def _row_to_article(self, row: aiosqlite.Row) -> Article:
|
||||||
|
return Article(
|
||||||
|
id=row["id"],
|
||||||
|
url=row["url"],
|
||||||
|
title=row["title"],
|
||||||
|
raw_text=row["raw_text"],
|
||||||
|
simplified_text=row["simplified_text"],
|
||||||
|
status=ProcessingStatus(row["status"]),
|
||||||
|
error_message=row["error_message"],
|
||||||
|
token_count_raw=row["token_count_raw"],
|
||||||
|
token_count_simplified=row["token_count_simplified"],
|
||||||
|
processing_time_seconds=row["processing_time_seconds"],
|
||||||
|
created_at=row["created_at"],
|
||||||
|
updated_at=row["updated_at"],
|
||||||
|
)
|
|
@ -0,0 +1,272 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import structlog
|
||||||
|
|
||||||
|
from src.adapters.llm import LLMProviderAdapter, LLMTokenLimitError
|
||||||
|
from src.adapters.ruwiki import RuWikiAdapter
|
||||||
|
from src.models import AppConfig, ArticleCreate, ProcessingResult, SimplifyCommand
|
||||||
|
from src.models.constants import LLM_MAX_INPUT_TOKENS, MAX_TOKEN_LIMIT_WITH_BUFFER
|
||||||
|
from src.services.repository import ArticleRepository
|
||||||
|
from src.services.text_splitter import RecursiveCharacterTextSplitter
|
||||||
|
from src.services.write_queue import AsyncWriteQueue
|
||||||
|
|
||||||
|
|
||||||
|
class SimplifyService:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
config: AppConfig,
|
||||||
|
ruwiki_adapter: RuWikiAdapter,
|
||||||
|
llm_adapter: LLMProviderAdapter,
|
||||||
|
repository: ArticleRepository,
|
||||||
|
write_queue: AsyncWriteQueue,
|
||||||
|
) -> None:
|
||||||
|
self.config = config
|
||||||
|
self.ruwiki_adapter = ruwiki_adapter
|
||||||
|
self.llm_adapter = llm_adapter
|
||||||
|
self.repository = repository
|
||||||
|
self.write_queue = write_queue
|
||||||
|
|
||||||
|
self.text_splitter = RecursiveCharacterTextSplitter(
|
||||||
|
chunk_size=config.chunk_size,
|
||||||
|
chunk_overlap=config.chunk_overlap,
|
||||||
|
length_function=self.llm_adapter.count_tokens,
|
||||||
|
)
|
||||||
|
|
||||||
|
self._prompt_template: str | None = None
|
||||||
|
self.logger = structlog.get_logger().bind(service="simplify")
|
||||||
|
|
||||||
|
async def get_prompt_template(self) -> str:
|
||||||
|
if self._prompt_template is None:
|
||||||
|
prompt_path = Path(self.config.prompt_template_path)
|
||||||
|
if not prompt_path.exists():
|
||||||
|
msg = f"Prompt template не найден: {prompt_path}"
|
||||||
|
raise FileNotFoundError(msg)
|
||||||
|
|
||||||
|
self._prompt_template = prompt_path.read_text(encoding="utf-8")
|
||||||
|
|
||||||
|
return self._prompt_template
|
||||||
|
|
||||||
|
async def process_command(self, command: SimplifyCommand) -> ProcessingResult:
|
||||||
|
start_time = time.time()
|
||||||
|
self.logger.info("Начало обработки статьи", url=command.url)
|
||||||
|
|
||||||
|
try:
|
||||||
|
return await self._process_command_impl(command, start_time)
|
||||||
|
except Exception as e:
|
||||||
|
return await self._handle_processing_error(command, e, start_time)
|
||||||
|
|
||||||
|
async def _process_command_impl(
|
||||||
|
self, command: SimplifyCommand, start_time: float
|
||||||
|
) -> ProcessingResult:
|
||||||
|
if not command.force_reprocess:
|
||||||
|
existing_result = await self._check_existing_article(command.url)
|
||||||
|
if existing_result:
|
||||||
|
return existing_result
|
||||||
|
|
||||||
|
page_info = await self.ruwiki_adapter.fetch_page_cleaned(command.url)
|
||||||
|
article = await self._create_or_update_article(command, page_info)
|
||||||
|
|
||||||
|
article.mark_processing()
|
||||||
|
await self.repository.update_article(article)
|
||||||
|
|
||||||
|
simplified_text, input_tokens, output_tokens = await self._simplify_article_text(
|
||||||
|
title=page_info.title,
|
||||||
|
raw_text=page_info.content,
|
||||||
|
)
|
||||||
|
|
||||||
|
processing_time = time.time() - start_time
|
||||||
|
result = ProcessingResult.success_result(
|
||||||
|
url=command.url,
|
||||||
|
title=page_info.title,
|
||||||
|
raw_text=page_info.content,
|
||||||
|
simplified_text=simplified_text,
|
||||||
|
token_count_raw=input_tokens,
|
||||||
|
token_count_simplified=output_tokens,
|
||||||
|
processing_time_seconds=processing_time,
|
||||||
|
)
|
||||||
|
|
||||||
|
await self.write_queue.update_from_result(result)
|
||||||
|
|
||||||
|
self.logger.info(
|
||||||
|
"Статья успешно обработана",
|
||||||
|
url=command.url,
|
||||||
|
title=page_info.title,
|
||||||
|
processing_time=processing_time,
|
||||||
|
input_tokens=input_tokens,
|
||||||
|
output_tokens=output_tokens,
|
||||||
|
)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
async def _check_existing_article(self, url: str) -> ProcessingResult | None:
|
||||||
|
existing_article = await self.repository.get_by_url(url)
|
||||||
|
if existing_article and existing_article.simplified_text:
|
||||||
|
self.logger.info("Статья уже обработана, пропускаем", url=url)
|
||||||
|
return ProcessingResult.success_result(
|
||||||
|
url=url,
|
||||||
|
title=existing_article.title,
|
||||||
|
raw_text=existing_article.raw_text,
|
||||||
|
simplified_text=existing_article.simplified_text,
|
||||||
|
token_count_raw=existing_article.token_count_raw or 0,
|
||||||
|
token_count_simplified=existing_article.token_count_simplified or 0,
|
||||||
|
processing_time_seconds=existing_article.processing_time_seconds or 0,
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def _create_or_update_article(self, command, page_info):
|
||||||
|
article_data = ArticleCreate(
|
||||||
|
url=command.url,
|
||||||
|
title=page_info.title,
|
||||||
|
raw_text=page_info.content,
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
return await self.repository.create_article(article_data)
|
||||||
|
except ValueError:
|
||||||
|
article = await self.repository.get_by_url(command.url)
|
||||||
|
if not article:
|
||||||
|
msg = f"Не удалось найти статью после создания: {command.url}"
|
||||||
|
raise ValueError(msg) from None
|
||||||
|
|
||||||
|
if command.force_reprocess:
|
||||||
|
article.title = page_info.title
|
||||||
|
article.raw_text = page_info.content
|
||||||
|
article.mark_processing()
|
||||||
|
await self.repository.update_article(article)
|
||||||
|
|
||||||
|
return article
|
||||||
|
|
||||||
|
async def _handle_processing_error(
|
||||||
|
self, command: SimplifyCommand, error: Exception, start_time: float
|
||||||
|
) -> ProcessingResult:
|
||||||
|
processing_time = time.time() - start_time
|
||||||
|
error_message = f"{type(error).__name__}: {error!s}"
|
||||||
|
|
||||||
|
self.logger.exception(
|
||||||
|
"Ошибка при обработке статьи",
|
||||||
|
url=command.url,
|
||||||
|
processing_time=processing_time,
|
||||||
|
)
|
||||||
|
|
||||||
|
error_result = ProcessingResult.failure_result(command.url, error_message)
|
||||||
|
try:
|
||||||
|
await self.write_queue.update_from_result(error_result)
|
||||||
|
except Exception:
|
||||||
|
self.logger.exception("Ошибка записи результата с ошибкой")
|
||||||
|
|
||||||
|
return error_result
|
||||||
|
|
||||||
|
async def _simplify_article_text(self, title: str, raw_text: str) -> tuple[str, int, int]:
|
||||||
|
prompt_template = await self.get_prompt_template()
|
||||||
|
text_tokens = self.llm_adapter.count_tokens(raw_text)
|
||||||
|
|
||||||
|
if text_tokens <= self.config.chunk_size:
|
||||||
|
return await self.llm_adapter.simplify_text(title, raw_text, prompt_template)
|
||||||
|
|
||||||
|
return await self._process_long_text(title, raw_text, prompt_template)
|
||||||
|
|
||||||
|
async def _process_long_text(
|
||||||
|
self, title: str, raw_text: str, prompt_template: str
|
||||||
|
) -> tuple[str, int, int]:
|
||||||
|
self.logger.info(
|
||||||
|
"Разбиение длинного текста на части",
|
||||||
|
title=title,
|
||||||
|
total_tokens=self.llm_adapter.count_tokens(raw_text),
|
||||||
|
chunk_size=self.config.chunk_size,
|
||||||
|
)
|
||||||
|
|
||||||
|
chunks = self.text_splitter.split_text(raw_text)
|
||||||
|
simplified_chunks = []
|
||||||
|
total_input_tokens = 0
|
||||||
|
total_output_tokens = 0
|
||||||
|
|
||||||
|
for i, chunk in enumerate(chunks):
|
||||||
|
self.logger.debug(
|
||||||
|
"Обработка части текста",
|
||||||
|
title=title,
|
||||||
|
chunk_index=i + 1,
|
||||||
|
total_chunks=len(chunks),
|
||||||
|
chunk_tokens=self.llm_adapter.count_tokens(chunk),
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
simplified_chunk, input_tokens, output_tokens = (
|
||||||
|
await self.llm_adapter.simplify_text(
|
||||||
|
title=f"{title} (часть {i+1}/{len(chunks)})",
|
||||||
|
wiki_text=chunk,
|
||||||
|
prompt_template=prompt_template,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
simplified_chunks.append(simplified_chunk)
|
||||||
|
total_input_tokens += input_tokens
|
||||||
|
total_output_tokens += output_tokens
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(
|
||||||
|
"Ошибка при обработке части текста",
|
||||||
|
title=title,
|
||||||
|
chunk_index=i + 1,
|
||||||
|
error=str(e),
|
||||||
|
)
|
||||||
|
|
||||||
|
if not simplified_chunks:
|
||||||
|
msg = "Не удалось обработать ни одной части текста"
|
||||||
|
raise LLMTokenLimitError(msg)
|
||||||
|
|
||||||
|
combined_text = self._combine_simplified_chunks(simplified_chunks)
|
||||||
|
return self._ensure_token_limit(combined_text, total_input_tokens, total_output_tokens)
|
||||||
|
|
||||||
|
def _ensure_token_limit(
|
||||||
|
self, combined_text: str, total_input_tokens: int, total_output_tokens: int
|
||||||
|
) -> tuple[str, int, int]:
|
||||||
|
final_tokens = self.llm_adapter.count_tokens(combined_text)
|
||||||
|
if final_tokens > MAX_TOKEN_LIMIT_WITH_BUFFER:
|
||||||
|
self.logger.warning(
|
||||||
|
"Объединённый текст превышает лимит, обрезаем",
|
||||||
|
final_tokens=final_tokens,
|
||||||
|
)
|
||||||
|
combined_text = self._truncate_to_token_limit(combined_text, 1000)
|
||||||
|
total_output_tokens = self.llm_adapter.count_tokens(combined_text)
|
||||||
|
|
||||||
|
return combined_text, total_input_tokens, total_output_tokens
|
||||||
|
|
||||||
|
def _combine_simplified_chunks(self, chunks: list[str]) -> str:
|
||||||
|
combined = "\n\n".join(chunk.strip() for chunk in chunks if chunk.strip())
|
||||||
|
return "\n".join(line for line in combined.split("\n") if line.strip())
|
||||||
|
|
||||||
|
def _truncate_to_token_limit(self, text: str, token_limit: int) -> str:
|
||||||
|
current_tokens = self.llm_adapter.count_tokens(text)
|
||||||
|
if current_tokens <= token_limit:
|
||||||
|
return text
|
||||||
|
|
||||||
|
sentences = text.split(". ")
|
||||||
|
truncated = ""
|
||||||
|
|
||||||
|
for sentence in sentences:
|
||||||
|
test_text = truncated + sentence + ". "
|
||||||
|
if self.llm_adapter.count_tokens(test_text) > token_limit:
|
||||||
|
break
|
||||||
|
truncated = test_text
|
||||||
|
|
||||||
|
return truncated.strip()
|
||||||
|
|
||||||
|
async def health_check(self) -> dict[str, bool]:
|
||||||
|
checks = {}
|
||||||
|
|
||||||
|
checks["ruwiki"] = await self._safe_health_check(self.ruwiki_adapter.health_check)
|
||||||
|
checks["llm"] = await self._safe_health_check(self.llm_adapter.health_check)
|
||||||
|
checks["prompt_template"] = await self._safe_health_check(self.get_prompt_template)
|
||||||
|
|
||||||
|
return checks
|
||||||
|
|
||||||
|
async def _safe_health_check(self, check_func) -> bool:
|
||||||
|
try:
|
||||||
|
await check_func()
|
||||||
|
return True
|
||||||
|
except Exception:
|
||||||
|
return False
|
|
@ -0,0 +1,163 @@
|
||||||
|
import re
|
||||||
|
from collections.abc import Callable
|
||||||
|
|
||||||
|
import structlog
|
||||||
|
|
||||||
|
logger = structlog.get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class RecursiveCharacterTextSplitter:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
chunk_size: int = 2000,
|
||||||
|
chunk_overlap: int = 200,
|
||||||
|
length_function: Callable[[str], int] | None = None,
|
||||||
|
separators: list[str] | None = None,
|
||||||
|
) -> None:
|
||||||
|
self.chunk_size = chunk_size
|
||||||
|
self.chunk_overlap = chunk_overlap
|
||||||
|
self.length_function = length_function or len
|
||||||
|
|
||||||
|
self.separators = separators or [
|
||||||
|
"\n\n",
|
||||||
|
"\n",
|
||||||
|
". ",
|
||||||
|
"! ",
|
||||||
|
"? ",
|
||||||
|
"; ",
|
||||||
|
", ",
|
||||||
|
" ",
|
||||||
|
"",
|
||||||
|
]
|
||||||
|
|
||||||
|
self.logger = structlog.get_logger().bind(service="text_splitter")
|
||||||
|
|
||||||
|
def split_text(self, text: str) -> list[str]:
|
||||||
|
if not text.strip():
|
||||||
|
return []
|
||||||
|
|
||||||
|
if self.length_function(text) <= self.chunk_size:
|
||||||
|
return [text]
|
||||||
|
|
||||||
|
chunks = self._split_text_recursive(text, self.separators)
|
||||||
|
|
||||||
|
merged_chunks = self._merge_splits(chunks)
|
||||||
|
|
||||||
|
self.logger.debug(
|
||||||
|
"Текст разбит на части",
|
||||||
|
original_length=self.length_function(text),
|
||||||
|
chunks_count=len(merged_chunks),
|
||||||
|
avg_chunk_size=(
|
||||||
|
sum(self.length_function(chunk) for chunk in merged_chunks) / len(merged_chunks)
|
||||||
|
if merged_chunks
|
||||||
|
else 0
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
return merged_chunks
|
||||||
|
|
||||||
|
def _split_text_recursive(self, text: str, separators: list[str]) -> list[str]:
|
||||||
|
final_chunks = []
|
||||||
|
separator = separators[-1]
|
||||||
|
new_separators = []
|
||||||
|
|
||||||
|
for i, sep in enumerate(separators):
|
||||||
|
if sep == "":
|
||||||
|
separator = sep
|
||||||
|
break
|
||||||
|
if re.search(re.escape(sep), text):
|
||||||
|
separator = sep
|
||||||
|
new_separators = separators[i + 1 :]
|
||||||
|
break
|
||||||
|
|
||||||
|
splits = self._split_by_separator(text, separator)
|
||||||
|
|
||||||
|
good_splits = []
|
||||||
|
for split in splits:
|
||||||
|
if self.length_function(split) < self.chunk_size:
|
||||||
|
good_splits.append(split)
|
||||||
|
else:
|
||||||
|
if good_splits:
|
||||||
|
merged_text = self._merge_splits(good_splits)
|
||||||
|
final_chunks.extend(merged_text)
|
||||||
|
good_splits = []
|
||||||
|
|
||||||
|
if not new_separators:
|
||||||
|
final_chunks.extend(self._split_by_length(split))
|
||||||
|
else:
|
||||||
|
other_info = self._split_text_recursive(split, new_separators)
|
||||||
|
final_chunks.extend(other_info)
|
||||||
|
|
||||||
|
if good_splits:
|
||||||
|
merged_text = self._merge_splits(good_splits)
|
||||||
|
final_chunks.extend(merged_text)
|
||||||
|
|
||||||
|
return final_chunks
|
||||||
|
|
||||||
|
def _split_by_separator(self, text: str, separator: str) -> list[str]:
|
||||||
|
if separator == "":
|
||||||
|
return list(text)
|
||||||
|
|
||||||
|
return text.split(separator)
|
||||||
|
|
||||||
|
def _split_by_length(self, text: str) -> list[str]:
|
||||||
|
chunks = []
|
||||||
|
start = 0
|
||||||
|
|
||||||
|
while start < len(text):
|
||||||
|
end = start + self.chunk_size
|
||||||
|
|
||||||
|
if end < len(text):
|
||||||
|
for offset in range(min(100, self.chunk_size // 4)):
|
||||||
|
if end - offset > start and text[end - offset] in " \n\t.,;!?":
|
||||||
|
end = end - offset + 1
|
||||||
|
break
|
||||||
|
|
||||||
|
chunk = text[start:end].strip()
|
||||||
|
if chunk:
|
||||||
|
chunks.append(chunk)
|
||||||
|
|
||||||
|
start = max(start + 1, end - self.chunk_overlap)
|
||||||
|
|
||||||
|
return chunks
|
||||||
|
|
||||||
|
def _merge_splits(self, splits: list[str]) -> list[str]:
|
||||||
|
if not splits:
|
||||||
|
return []
|
||||||
|
|
||||||
|
merged_chunks = []
|
||||||
|
current_chunk = ""
|
||||||
|
|
||||||
|
for split in splits:
|
||||||
|
test_chunk = current_chunk
|
||||||
|
if current_chunk and not current_chunk.endswith(("\n", " ")):
|
||||||
|
if not split.startswith(("\n", " ")):
|
||||||
|
test_chunk += " "
|
||||||
|
test_chunk += split
|
||||||
|
|
||||||
|
if self.length_function(test_chunk) <= self.chunk_size:
|
||||||
|
current_chunk = test_chunk
|
||||||
|
else:
|
||||||
|
if current_chunk.strip():
|
||||||
|
merged_chunks.append(current_chunk.strip())
|
||||||
|
|
||||||
|
current_chunk = split
|
||||||
|
|
||||||
|
if current_chunk.strip():
|
||||||
|
merged_chunks.append(current_chunk.strip())
|
||||||
|
|
||||||
|
return merged_chunks
|
||||||
|
|
||||||
|
def create_chunks_with_metadata(self, text: str, title: str = "") -> list[dict[str, str]]:
|
||||||
|
chunks = self.split_text(text)
|
||||||
|
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
"text": chunk,
|
||||||
|
"title": title,
|
||||||
|
"chunk_index": i,
|
||||||
|
"total_chunks": len(chunks),
|
||||||
|
"chunk_size": self.length_function(chunk),
|
||||||
|
}
|
||||||
|
for i, chunk in enumerate(chunks)
|
||||||
|
]
|
|
@ -0,0 +1,182 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
import structlog
|
||||||
|
|
||||||
|
from src.models import Article, ProcessingResult
|
||||||
|
from src.models.constants import WRITE_QUEUE_BATCH_SIZE
|
||||||
|
from src.services.repository import ArticleRepository
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class WriteOperation:
|
||||||
|
operation_type: str
|
||||||
|
article: Article | None = None
|
||||||
|
result: ProcessingResult | None = None
|
||||||
|
future: asyncio.Future[Article] | None = field(default=None, init=False)
|
||||||
|
|
||||||
|
|
||||||
|
class AsyncWriteQueue:
|
||||||
|
def __init__(
|
||||||
|
self, repository: ArticleRepository, max_batch_size: int = WRITE_QUEUE_BATCH_SIZE
|
||||||
|
) -> None:
|
||||||
|
self.repository = repository
|
||||||
|
self.max_batch_size = max_batch_size
|
||||||
|
self.logger = structlog.get_logger().bind(service="write_queue")
|
||||||
|
|
||||||
|
self._queue: asyncio.Queue[WriteOperation] = asyncio.Queue()
|
||||||
|
self._worker_task: asyncio.Task[None] | None = None
|
||||||
|
self._shutdown_event = asyncio.Event()
|
||||||
|
|
||||||
|
self._total_operations = 0
|
||||||
|
self._failed_operations = 0
|
||||||
|
|
||||||
|
async def start(self) -> None:
|
||||||
|
if self._worker_task is not None:
|
||||||
|
msg = "Write queue уже запущена"
|
||||||
|
raise RuntimeError(msg)
|
||||||
|
|
||||||
|
self._worker_task = asyncio.create_task(self._worker_loop())
|
||||||
|
self.logger.info("Write queue запущена")
|
||||||
|
|
||||||
|
async def stop(self, timeout: float = 10.0) -> None:
|
||||||
|
if self._worker_task is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
self.logger.info("Остановка write queue")
|
||||||
|
self._shutdown_event.set()
|
||||||
|
|
||||||
|
try:
|
||||||
|
await asyncio.wait_for(self._worker_task, timeout=timeout)
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
self.logger.warning("Таймаут остановки write queue, принудительная отмена")
|
||||||
|
self._worker_task.cancel()
|
||||||
|
|
||||||
|
self.logger.info("Write queue остановлена")
|
||||||
|
|
||||||
|
async def update_article(self, article: Article) -> None:
|
||||||
|
operation = WriteOperation(
|
||||||
|
operation_type="update",
|
||||||
|
article=article,
|
||||||
|
)
|
||||||
|
await self._queue.put(operation)
|
||||||
|
|
||||||
|
async def update_from_result(self, result: ProcessingResult) -> Article:
|
||||||
|
future: asyncio.Future[Article] = asyncio.Future()
|
||||||
|
|
||||||
|
operation = WriteOperation(
|
||||||
|
operation_type="update_from_result",
|
||||||
|
result=result,
|
||||||
|
)
|
||||||
|
operation.future = future
|
||||||
|
|
||||||
|
await self._queue.put(operation)
|
||||||
|
return await future
|
||||||
|
|
||||||
|
async def _worker_loop(self) -> None:
|
||||||
|
batch: list[WriteOperation] = []
|
||||||
|
|
||||||
|
while not self._shutdown_event.is_set():
|
||||||
|
batch = await self._collect_batch(batch)
|
||||||
|
if batch and (len(batch) >= self.max_batch_size or self._shutdown_event.is_set()):
|
||||||
|
await self._process_batch(batch)
|
||||||
|
batch.clear()
|
||||||
|
|
||||||
|
if batch:
|
||||||
|
await self._process_batch(batch)
|
||||||
|
|
||||||
|
async def _collect_batch(self, batch: list[WriteOperation]) -> list[WriteOperation]:
|
||||||
|
try:
|
||||||
|
timeout = 0.1 if batch else 1.0
|
||||||
|
operation = await asyncio.wait_for(self._queue.get(), timeout=timeout)
|
||||||
|
batch.append(operation)
|
||||||
|
return batch
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
return batch
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.exception("Ошибка в worker loop")
|
||||||
|
self._handle_batch_error(batch, e)
|
||||||
|
return []
|
||||||
|
|
||||||
|
def _handle_batch_error(self, batch: list[WriteOperation], error: Exception) -> None:
|
||||||
|
for op in batch:
|
||||||
|
if op.future and not op.future.done():
|
||||||
|
op.future.set_exception(error)
|
||||||
|
|
||||||
|
async def _process_batch(self, batch: list[WriteOperation]) -> None:
|
||||||
|
if not batch:
|
||||||
|
return
|
||||||
|
|
||||||
|
self.logger.debug("Обработка батча операций", batch_size=len(batch))
|
||||||
|
|
||||||
|
for operation in batch:
|
||||||
|
await self._process_operation_safely(operation)
|
||||||
|
|
||||||
|
async def _process_operation_safely(self, operation: WriteOperation) -> None:
|
||||||
|
try:
|
||||||
|
await self._process_single_operation(operation)
|
||||||
|
self._total_operations += 1
|
||||||
|
|
||||||
|
if operation.future and not operation.future.done():
|
||||||
|
if operation.operation_type == "update_from_result" and operation.result:
|
||||||
|
article = await self.repository.get_by_url(operation.result.url)
|
||||||
|
operation.future.set_result(article)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self._failed_operations += 1
|
||||||
|
self.logger.exception(
|
||||||
|
"Ошибка при обработке операции",
|
||||||
|
operation_type=operation.operation_type,
|
||||||
|
)
|
||||||
|
|
||||||
|
if operation.future and not operation.future.done():
|
||||||
|
operation.future.set_exception(e)
|
||||||
|
|
||||||
|
async def _process_single_operation(self, operation: WriteOperation) -> None:
|
||||||
|
if operation.operation_type == "update" and operation.article:
|
||||||
|
await self.repository.update_article(operation.article)
|
||||||
|
elif operation.operation_type == "update_from_result" and operation.result:
|
||||||
|
await self._update_article_from_result(operation.result)
|
||||||
|
else:
|
||||||
|
msg = f"Неизвестный тип операции: {operation.operation_type}"
|
||||||
|
raise ValueError(msg)
|
||||||
|
|
||||||
|
async def _update_article_from_result(self, result: ProcessingResult) -> Article:
|
||||||
|
article = await self.repository.get_by_url(result.url)
|
||||||
|
if not article:
|
||||||
|
msg = f"Статья с URL {result.url} не найдена"
|
||||||
|
raise ValueError(msg)
|
||||||
|
|
||||||
|
if result.success:
|
||||||
|
if not (result.title and result.raw_text and result.simplified_text):
|
||||||
|
msg = "Неполные данные в успешном результате"
|
||||||
|
raise ValueError(msg)
|
||||||
|
|
||||||
|
article.mark_completed(
|
||||||
|
simplified_text=result.simplified_text,
|
||||||
|
token_count_raw=result.token_count_raw or 0,
|
||||||
|
token_count_simplified=result.token_count_simplified or 0,
|
||||||
|
processing_time=result.processing_time_seconds or 0,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
article.mark_failed(result.error_message or "Неизвестная ошибка")
|
||||||
|
|
||||||
|
return await self.repository.update_article(article)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def queue_size(self) -> int:
|
||||||
|
return self._queue.qsize()
|
||||||
|
|
||||||
|
@property
|
||||||
|
def stats(self) -> dict[str, int]:
|
||||||
|
return {
|
||||||
|
"total_operations": self._total_operations,
|
||||||
|
"failed_operations": self._failed_operations,
|
||||||
|
"success_rate": (
|
||||||
|
(self._total_operations - self._failed_operations) / self._total_operations * 100
|
||||||
|
if self._total_operations > 0
|
||||||
|
else 0
|
||||||
|
),
|
||||||
|
}
|
|
@ -0,0 +1,88 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import AsyncGenerator
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
import structlog
|
||||||
|
|
||||||
|
from src.models import SimplifyCommand
|
||||||
|
from src.models.constants import ARTICLE_NAME_INDEX, MIN_WIKI_PATH_PARTS, WIKI_PATH_INDEX
|
||||||
|
|
||||||
|
|
||||||
|
class FileSource:
|
||||||
|
def __init__(self, file_path: str) -> None:
|
||||||
|
self.file_path = Path(file_path)
|
||||||
|
self.logger = structlog.get_logger().bind(source="file", path=str(self.file_path))
|
||||||
|
|
||||||
|
async def read_urls(
|
||||||
|
self, *, force_reprocess: bool = False
|
||||||
|
) -> AsyncGenerator[SimplifyCommand, None]:
|
||||||
|
if not self.file_path.exists():
|
||||||
|
msg = f"Файл с URL не найден: {self.file_path}"
|
||||||
|
raise FileNotFoundError(msg)
|
||||||
|
|
||||||
|
self.logger.info("Начинаем чтение URL из файла")
|
||||||
|
|
||||||
|
content = await asyncio.to_thread(self._read_file_sync)
|
||||||
|
|
||||||
|
seen_urls = set()
|
||||||
|
valid_count = 0
|
||||||
|
invalid_count = 0
|
||||||
|
|
||||||
|
for line_num, original_line in enumerate(content.splitlines(), 1):
|
||||||
|
line = original_line.strip()
|
||||||
|
|
||||||
|
if not line or line.startswith("#"):
|
||||||
|
continue
|
||||||
|
|
||||||
|
if not self._is_valid_wikipedia_url(line):
|
||||||
|
self.logger.warning("Невалидный URL", line_number=line_num, url=line)
|
||||||
|
invalid_count += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
if line in seen_urls:
|
||||||
|
self.logger.debug("Дубликат URL пропущен", line_number=line_num, url=line)
|
||||||
|
continue
|
||||||
|
|
||||||
|
seen_urls.add(line)
|
||||||
|
valid_count += 1
|
||||||
|
|
||||||
|
yield SimplifyCommand(url=line, force_reprocess=force_reprocess)
|
||||||
|
|
||||||
|
self.logger.info(
|
||||||
|
"Завершено чтение URL",
|
||||||
|
valid_count=valid_count,
|
||||||
|
invalid_count=invalid_count,
|
||||||
|
total_unique=len(seen_urls),
|
||||||
|
)
|
||||||
|
|
||||||
|
def _read_file_sync(self) -> str:
|
||||||
|
return self.file_path.read_text(encoding="utf-8")
|
||||||
|
|
||||||
|
def _is_valid_wikipedia_url(self, url: str) -> bool:
|
||||||
|
try:
|
||||||
|
parsed = urlparse(url)
|
||||||
|
|
||||||
|
if parsed.scheme not in ("http", "https"):
|
||||||
|
return False
|
||||||
|
|
||||||
|
if "wikipedia.org" not in parsed.netloc:
|
||||||
|
return False
|
||||||
|
|
||||||
|
path_parts = parsed.path.split("/")
|
||||||
|
if len(path_parts) < MIN_WIKI_PATH_PARTS or path_parts[WIKI_PATH_INDEX] != "wiki":
|
||||||
|
return False
|
||||||
|
|
||||||
|
article_name = path_parts[ARTICLE_NAME_INDEX]
|
||||||
|
return bool(article_name and article_name not in ("Main_Page", "Заглавная_страница"))
|
||||||
|
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
|
||||||
|
async def count_urls(self) -> int:
|
||||||
|
count = 0
|
||||||
|
async for _ in self.read_urls():
|
||||||
|
count += 1
|
||||||
|
return count
|
|
@ -0,0 +1,178 @@
|
||||||
|
import asyncio
|
||||||
|
import tempfile
|
||||||
|
from collections.abc import Generator
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import AsyncMock, MagicMock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from openai.types.chat import ChatCompletion, ChatCompletionMessage
|
||||||
|
from openai.types.chat.chat_completion import Choice
|
||||||
|
|
||||||
|
from src.models import AppConfig, Article, ArticleCreate, ProcessingStatus
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def event_loop() -> Generator[asyncio.AbstractEventLoop, None, None]:
|
||||||
|
"""Создать event loop для всей сессии тестов."""
|
||||||
|
loop = asyncio.new_event_loop()
|
||||||
|
yield loop
|
||||||
|
loop.close()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def test_config() -> AppConfig:
|
||||||
|
"""Тестовая конфигурация."""
|
||||||
|
with tempfile.TemporaryDirectory() as temp_dir:
|
||||||
|
db_path = Path(temp_dir) / "test.db"
|
||||||
|
return AppConfig(
|
||||||
|
openai_api_key="test_key",
|
||||||
|
openai_model="gpt-4o-mini",
|
||||||
|
db_path=str(db_path),
|
||||||
|
max_concurrent_llm=2,
|
||||||
|
openai_rpm=10,
|
||||||
|
max_concurrent_wiki=5,
|
||||||
|
prompt_template_path="src/prompt.txt",
|
||||||
|
log_level="DEBUG",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def sample_wiki_urls() -> list[str]:
|
||||||
|
"""Список тестовых URL википедии."""
|
||||||
|
return [
|
||||||
|
"https://ru.wikipedia.org/wiki/Тест",
|
||||||
|
"https://ru.wikipedia.org/wiki/Пример",
|
||||||
|
"https://ru.wikipedia.org/wiki/Образец",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def invalid_urls() -> list[str]:
|
||||||
|
"""Список невалидных URL."""
|
||||||
|
return [
|
||||||
|
"https://example.com/invalid",
|
||||||
|
"https://en.wikipedia.org/wiki/English",
|
||||||
|
"not_a_url",
|
||||||
|
"",
|
||||||
|
"https://ru.wikipedia.org/wiki/",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def sample_wikitext() -> str:
|
||||||
|
return """'''Тест''' — это проверка чего-либо.
|
||||||
|
|
||||||
|
== Определение ==
|
||||||
|
Тест может проводиться для различных целей:
|
||||||
|
* Проверка знаний
|
||||||
|
* Проверка работоспособности
|
||||||
|
* Проверка качества
|
||||||
|
|
||||||
|
== История ==
|
||||||
|
Тесты использовались с древних времён.
|
||||||
|
|
||||||
|
{{навигация|тема=Тестирование}}
|
||||||
|
|
||||||
|
[[Категория:Тестирование]]"""
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def simplified_text() -> str:
|
||||||
|
return """'''Тест''' — это проверка чего-либо для школьников.
|
||||||
|
|
||||||
|
== Что такое тест ==
|
||||||
|
Тест помогает проверить:
|
||||||
|
* Знания учеников
|
||||||
|
* Как работают устройства
|
||||||
|
* Качество продуктов
|
||||||
|
|
||||||
|
== Когда появились тесты ==
|
||||||
|
Люди проверяли друг друга очень давно.
|
||||||
|
|
||||||
|
###END###"""
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def sample_article_data() -> ArticleCreate:
|
||||||
|
return ArticleCreate(
|
||||||
|
url="https://ru.wikipedia.org/wiki/Тест",
|
||||||
|
title="Тест",
|
||||||
|
raw_text="Тестовый wiki-текст",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def sample_article(sample_article_data: ArticleCreate) -> Article:
|
||||||
|
return Article(
|
||||||
|
id=1,
|
||||||
|
url=sample_article_data.url,
|
||||||
|
title=sample_article_data.title,
|
||||||
|
raw_text=sample_article_data.raw_text,
|
||||||
|
status=ProcessingStatus.PENDING,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def completed_article(sample_article: Article, simplified_text: str) -> Article:
|
||||||
|
article = sample_article.model_copy()
|
||||||
|
article.mark_completed(
|
||||||
|
simplified_text=simplified_text,
|
||||||
|
token_count_raw=100,
|
||||||
|
token_count_simplified=50,
|
||||||
|
processing_time=2.5,
|
||||||
|
)
|
||||||
|
return article
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_openai_response() -> ChatCompletion:
|
||||||
|
return ChatCompletion(
|
||||||
|
id="test_completion",
|
||||||
|
object="chat.completion",
|
||||||
|
created=1234567890,
|
||||||
|
model="gpt-4o-mini",
|
||||||
|
choices=[
|
||||||
|
Choice(
|
||||||
|
index=0,
|
||||||
|
message=ChatCompletionMessage(
|
||||||
|
role="assistant",
|
||||||
|
content="Упрощённый текст для школьников.\n\n###END###",
|
||||||
|
),
|
||||||
|
finish_reason="stop",
|
||||||
|
)
|
||||||
|
],
|
||||||
|
usage=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def temp_input_file(sample_wiki_urls: list[str]) -> Generator[str, None, None]:
|
||||||
|
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
|
||||||
|
for url in sample_wiki_urls:
|
||||||
|
f.write(f"{url}\n")
|
||||||
|
f.write("# Комментарий\n")
|
||||||
|
f.write("\n")
|
||||||
|
f.write("https://ru.wikipedia.org/wiki/Дубликат\n")
|
||||||
|
f.write("https://ru.wikipedia.org/wiki/Дубликат\n")
|
||||||
|
temp_path = f.name
|
||||||
|
|
||||||
|
yield temp_path
|
||||||
|
|
||||||
|
Path(temp_path).unlink(missing_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
async def mock_wiki_client() -> AsyncMock:
|
||||||
|
mock_client = AsyncMock()
|
||||||
|
mock_page = MagicMock()
|
||||||
|
mock_page.exists = True
|
||||||
|
mock_page.redirect = False
|
||||||
|
mock_page.text.return_value = "Тестовый wiki-текст"
|
||||||
|
mock_client.pages = {"Тест": mock_page}
|
||||||
|
return mock_client
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
async def mock_openai_client() -> AsyncMock:
|
||||||
|
mock_client = AsyncMock()
|
||||||
|
return mock_client
|
|
@ -0,0 +1,278 @@
|
||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
from unittest.mock import AsyncMock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from openai import APIError, RateLimitError
|
||||||
|
|
||||||
|
from src.adapters import (
|
||||||
|
CircuitBreaker,
|
||||||
|
CircuitBreakerError,
|
||||||
|
LLMProviderAdapter,
|
||||||
|
LLMRateLimitError,
|
||||||
|
LLMTokenLimitError,
|
||||||
|
RateLimiter,
|
||||||
|
RuWikiAdapter,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestCircuitBreaker:
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_successful_call(self):
|
||||||
|
cb = CircuitBreaker(failure_threshold=3, recovery_timeout=1)
|
||||||
|
|
||||||
|
async def test_func():
|
||||||
|
return "success"
|
||||||
|
|
||||||
|
result = await cb.call(test_func)
|
||||||
|
assert result == "success"
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_failure_accumulation(self):
|
||||||
|
cb = CircuitBreaker(failure_threshold=2, recovery_timeout=1)
|
||||||
|
|
||||||
|
async def failing_func():
|
||||||
|
raise ValueError("Test error")
|
||||||
|
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
await cb.call(failing_func)
|
||||||
|
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
await cb.call(failing_func)
|
||||||
|
|
||||||
|
with pytest.raises(CircuitBreakerError):
|
||||||
|
await cb.call(failing_func)
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_recovery(self):
|
||||||
|
cb = CircuitBreaker(failure_threshold=1, recovery_timeout=0.1)
|
||||||
|
|
||||||
|
async def failing_func():
|
||||||
|
raise ValueError("Test error")
|
||||||
|
|
||||||
|
async def success_func():
|
||||||
|
return "recovered"
|
||||||
|
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
await cb.call(failing_func)
|
||||||
|
|
||||||
|
with pytest.raises(CircuitBreakerError):
|
||||||
|
await cb.call(failing_func)
|
||||||
|
|
||||||
|
await asyncio.sleep(0.2)
|
||||||
|
|
||||||
|
result = await cb.call(success_func)
|
||||||
|
assert result == "recovered"
|
||||||
|
|
||||||
|
|
||||||
|
class TestRateLimiter:
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_concurrency_limit(self):
|
||||||
|
limiter = RateLimiter(max_concurrent=2)
|
||||||
|
results = []
|
||||||
|
|
||||||
|
async def test_task(task_id: int):
|
||||||
|
async with limiter:
|
||||||
|
results.append(f"start_{task_id}")
|
||||||
|
await asyncio.sleep(0.1)
|
||||||
|
results.append(f"end_{task_id}")
|
||||||
|
|
||||||
|
tasks = [test_task(i) for i in range(3)]
|
||||||
|
await asyncio.gather(*tasks)
|
||||||
|
|
||||||
|
start_count = 0
|
||||||
|
max_concurrent = 0
|
||||||
|
|
||||||
|
for result in results:
|
||||||
|
if result.startswith("start_"):
|
||||||
|
start_count += 1
|
||||||
|
max_concurrent = max(max_concurrent, start_count)
|
||||||
|
elif result.startswith("end_"):
|
||||||
|
start_count -= 1
|
||||||
|
|
||||||
|
assert max_concurrent <= 2
|
||||||
|
|
||||||
|
|
||||||
|
class TestRuWikiAdapter:
|
||||||
|
|
||||||
|
def test_extract_title_from_url(self):
|
||||||
|
adapter = RuWikiAdapter
|
||||||
|
|
||||||
|
title = adapter.extract_title_from_url("https://ru.wikipedia.org/wiki/Тест")
|
||||||
|
assert title == "Тест"
|
||||||
|
|
||||||
|
title = adapter.extract_title_from_url("https://ru.wikipedia.org/wiki/Тест_статья")
|
||||||
|
assert title == "Тест статья"
|
||||||
|
|
||||||
|
title = adapter.extract_title_from_url(
|
||||||
|
"https://ru.wikipedia.org/wiki/%D0%A2%D0%B5%D1%81%D1%82"
|
||||||
|
)
|
||||||
|
assert title == "Тест"
|
||||||
|
|
||||||
|
def test_extract_title_invalid_url(self):
|
||||||
|
adapter = RuWikiAdapter
|
||||||
|
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
adapter.extract_title_from_url("https://example.com/invalid")
|
||||||
|
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
adapter.extract_title_from_url("https://ru.wikipedia.org/invalid")
|
||||||
|
|
||||||
|
def test_clean_wikitext(self, test_config, sample_wikitext):
|
||||||
|
"""Тест очистки wiki-текста."""
|
||||||
|
adapter = RuWikiAdapter(test_config)
|
||||||
|
|
||||||
|
cleaned = adapter._clean_wikitext(sample_wikitext)
|
||||||
|
|
||||||
|
assert "{{навигация" not in cleaned
|
||||||
|
assert "[[Категория:" not in cleaned
|
||||||
|
|
||||||
|
assert "'''Тест'''" in cleaned
|
||||||
|
assert "== Определение ==" in cleaned
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_health_check_success(self, test_config):
|
||||||
|
adapter = RuWikiAdapter(test_config)
|
||||||
|
|
||||||
|
with patch.object(adapter, "_get_client") as mock_get_client:
|
||||||
|
mock_client = AsyncMock()
|
||||||
|
mock_get_client.return_value = mock_client
|
||||||
|
|
||||||
|
with patch("asyncio.to_thread") as mock_to_thread:
|
||||||
|
mock_to_thread.return_value = {"query": {"general": {}}}
|
||||||
|
|
||||||
|
result = await adapter.health_check()
|
||||||
|
assert result is True
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_health_check_failure(self, test_config):
|
||||||
|
adapter = RuWikiAdapter(test_config)
|
||||||
|
|
||||||
|
with patch.object(adapter, "_get_client") as mock_get_client:
|
||||||
|
mock_get_client.side_effect = ConnectionError("Network error")
|
||||||
|
|
||||||
|
result = await adapter.health_check()
|
||||||
|
assert result is False
|
||||||
|
|
||||||
|
|
||||||
|
class TestLLMProviderAdapter:
|
||||||
|
|
||||||
|
def test_count_tokens(self, test_config):
|
||||||
|
"""Тест подсчёта токенов."""
|
||||||
|
adapter = LLMProviderAdapter(test_config)
|
||||||
|
|
||||||
|
count = adapter.count_tokens("Hello world")
|
||||||
|
assert count > 0
|
||||||
|
|
||||||
|
count = adapter.count_tokens("")
|
||||||
|
assert count == 0
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_rpm_limiting(self, test_config):
|
||||||
|
test_config.openai_rpm = 2
|
||||||
|
adapter = LLMProviderAdapter(test_config)
|
||||||
|
|
||||||
|
current_time = time.time()
|
||||||
|
adapter.request_times = [current_time - 10, current_time - 5]
|
||||||
|
|
||||||
|
start_time = time.time()
|
||||||
|
await adapter._check_rpm_limit()
|
||||||
|
elapsed = time.time() - start_time
|
||||||
|
|
||||||
|
assert elapsed > 0.01
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_simplify_text_token_limit_error(self, test_config):
|
||||||
|
adapter = LLMProviderAdapter(test_config)
|
||||||
|
|
||||||
|
long_text = "word " * 2000
|
||||||
|
|
||||||
|
with pytest.raises(LLMTokenLimitError):
|
||||||
|
await adapter.simplify_text("Test", long_text, "template")
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_simplify_text_success(self, test_config, mock_openai_response):
|
||||||
|
adapter = LLMProviderAdapter(test_config)
|
||||||
|
|
||||||
|
with patch.object(adapter.client.chat.completions, "create") as mock_create:
|
||||||
|
mock_create.return_value = mock_openai_response
|
||||||
|
|
||||||
|
with patch.object(adapter, "_check_rpm_limit"):
|
||||||
|
result = await adapter.simplify_text(
|
||||||
|
title="Тест",
|
||||||
|
wiki_text="Тестовый текст",
|
||||||
|
prompt_template="### role: user\n{wiki_source_text}",
|
||||||
|
)
|
||||||
|
|
||||||
|
simplified_text, input_tokens, output_tokens = result
|
||||||
|
|
||||||
|
assert "Упрощённый текст для школьников" in simplified_text
|
||||||
|
assert "###END###" not in simplified_text
|
||||||
|
assert input_tokens > 0
|
||||||
|
assert output_tokens > 0
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_simplify_text_openai_error(self, test_config):
|
||||||
|
adapter = LLMProviderAdapter(test_config)
|
||||||
|
|
||||||
|
with patch.object(adapter.client.chat.completions, "create") as mock_create:
|
||||||
|
mock_create.side_effect = RateLimitError(
|
||||||
|
"Rate limit exceeded", response=None, body=None
|
||||||
|
)
|
||||||
|
|
||||||
|
with patch.object(adapter, "_check_rpm_limit"):
|
||||||
|
with pytest.raises(LLMRateLimitError):
|
||||||
|
await adapter.simplify_text(
|
||||||
|
title="Тест",
|
||||||
|
wiki_text="Тестовый текст",
|
||||||
|
prompt_template="### role: user\n{wiki_source_text}",
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_parse_prompt_template(self, test_config):
|
||||||
|
adapter = LLMProviderAdapter(test_config)
|
||||||
|
|
||||||
|
template = """### role: system
|
||||||
|
Ты помощник.
|
||||||
|
|
||||||
|
### role: user
|
||||||
|
Задание: {task}"""
|
||||||
|
|
||||||
|
messages = adapter._parse_prompt_template(template)
|
||||||
|
|
||||||
|
assert len(messages) == 2
|
||||||
|
assert messages[0]["role"] == "system"
|
||||||
|
assert messages[0]["content"] == "Ты помощник."
|
||||||
|
assert messages[1]["role"] == "user"
|
||||||
|
assert messages[1]["content"] == "Задание: {task}"
|
||||||
|
|
||||||
|
def test_parse_prompt_template_fallback(self, test_config):
|
||||||
|
adapter = LLMProviderAdapter(test_config)
|
||||||
|
|
||||||
|
template = "Обычный текст без ролей"
|
||||||
|
messages = adapter._parse_prompt_template(template)
|
||||||
|
|
||||||
|
assert len(messages) == 1
|
||||||
|
assert messages[0]["role"] == "user"
|
||||||
|
assert messages[0]["content"] == template
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_health_check_success(self, test_config, mock_openai_response):
|
||||||
|
adapter = LLMProviderAdapter(test_config)
|
||||||
|
|
||||||
|
with patch.object(adapter.client.chat.completions, "create") as mock_create:
|
||||||
|
mock_create.return_value = mock_openai_response
|
||||||
|
|
||||||
|
result = await adapter.health_check()
|
||||||
|
assert result is True
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_health_check_failure(self, test_config):
|
||||||
|
adapter = LLMProviderAdapter(test_config)
|
||||||
|
|
||||||
|
with patch.object(adapter.client.chat.completions, "create") as mock_create:
|
||||||
|
mock_create.side_effect = APIError("API Error", response=None, body=None)
|
||||||
|
|
||||||
|
result = await adapter.health_check()
|
||||||
|
assert result is False
|
|
@ -0,0 +1,311 @@
|
||||||
|
import asyncio
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import AsyncMock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from src.dependency_injection import DependencyContainer
|
||||||
|
from src.models import ProcessingStatus
|
||||||
|
from src.sources import FileSource
|
||||||
|
|
||||||
|
|
||||||
|
class TestFileSourceIntegration:
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_read_urls_from_file(self, temp_input_file):
|
||||||
|
source = FileSource(temp_input_file)
|
||||||
|
|
||||||
|
commands = []
|
||||||
|
async for command in source.read_urls():
|
||||||
|
commands.append(command)
|
||||||
|
|
||||||
|
assert len(commands) >= 3
|
||||||
|
|
||||||
|
for command in commands:
|
||||||
|
assert command.url.startswith("https://ru.wikipedia.org/wiki/")
|
||||||
|
assert command.force_reprocess is False
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_count_urls(self, temp_input_file):
|
||||||
|
source = FileSource(temp_input_file)
|
||||||
|
|
||||||
|
count = await source.count_urls()
|
||||||
|
assert count >= 3
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_file_not_found(self):
|
||||||
|
source = FileSource("nonexistent.txt")
|
||||||
|
|
||||||
|
with pytest.raises(FileNotFoundError):
|
||||||
|
async for _ in source.read_urls():
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class TestDatabaseIntegration:
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_full_article_lifecycle(self, test_config, sample_article_data):
|
||||||
|
container = DependencyContainer(test_config)
|
||||||
|
|
||||||
|
try:
|
||||||
|
await container.initialize()
|
||||||
|
|
||||||
|
repository = container.get_repository()
|
||||||
|
|
||||||
|
article = await repository.create_article(sample_article_data)
|
||||||
|
assert article.id is not None
|
||||||
|
assert article.status == ProcessingStatus.PENDING
|
||||||
|
|
||||||
|
found_article = await repository.get_by_url(sample_article_data.url)
|
||||||
|
assert found_article is not None
|
||||||
|
assert found_article.id == article.id
|
||||||
|
|
||||||
|
article.mark_processing()
|
||||||
|
updated_article = await repository.update_article(article)
|
||||||
|
assert updated_article.status == ProcessingStatus.PROCESSING
|
||||||
|
|
||||||
|
article.mark_completed(
|
||||||
|
simplified_text="Упрощённый текст",
|
||||||
|
token_count_raw=100,
|
||||||
|
token_count_simplified=50,
|
||||||
|
processing_time=2.5,
|
||||||
|
)
|
||||||
|
final_article = await repository.update_article(article)
|
||||||
|
assert final_article.status == ProcessingStatus.COMPLETED
|
||||||
|
assert final_article.simplified_text == "Упрощённый текст"
|
||||||
|
|
||||||
|
completed_count = await repository.count_by_status(ProcessingStatus.COMPLETED)
|
||||||
|
assert completed_count == 1
|
||||||
|
|
||||||
|
finally:
|
||||||
|
await container.cleanup()
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_write_queue_integration(self, test_config, sample_article_data):
|
||||||
|
container = DependencyContainer(test_config)
|
||||||
|
|
||||||
|
try:
|
||||||
|
await container.initialize()
|
||||||
|
|
||||||
|
repository = container.get_repository()
|
||||||
|
write_queue = container.get_write_queue()
|
||||||
|
|
||||||
|
article = await repository.create_article(sample_article_data)
|
||||||
|
|
||||||
|
from src.models import ProcessingResult
|
||||||
|
|
||||||
|
result = ProcessingResult.success_result(
|
||||||
|
url=article.url,
|
||||||
|
title=article.title,
|
||||||
|
raw_text=article.raw_text,
|
||||||
|
simplified_text="Упрощённый текст",
|
||||||
|
token_count_raw=100,
|
||||||
|
token_count_simplified=50,
|
||||||
|
processing_time_seconds=2.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
updated_article = await write_queue.update_from_result(result)
|
||||||
|
|
||||||
|
assert updated_article.status == ProcessingStatus.COMPLETED
|
||||||
|
assert updated_article.simplified_text == "Упрощённый текст"
|
||||||
|
|
||||||
|
finally:
|
||||||
|
await container.cleanup()
|
||||||
|
|
||||||
|
|
||||||
|
class TestSystemIntegration:
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_dependency_container_initialization(self, test_config):
|
||||||
|
container = DependencyContainer(test_config)
|
||||||
|
|
||||||
|
try:
|
||||||
|
await container.initialize()
|
||||||
|
|
||||||
|
db_service = container.get_database_service()
|
||||||
|
repository = container.get_repository()
|
||||||
|
write_queue = container.get_write_queue()
|
||||||
|
ruwiki_adapter = container.get_ruwiki_adapter()
|
||||||
|
llm_adapter = container.get_llm_adapter()
|
||||||
|
simplify_service = container.get_simplify_service()
|
||||||
|
|
||||||
|
assert db_service is not None
|
||||||
|
assert repository is not None
|
||||||
|
assert write_queue is not None
|
||||||
|
assert ruwiki_adapter is not None
|
||||||
|
assert llm_adapter is not None
|
||||||
|
assert simplify_service is not None
|
||||||
|
|
||||||
|
checks = await container.health_check()
|
||||||
|
assert "database" in checks
|
||||||
|
assert "write_queue" in checks
|
||||||
|
|
||||||
|
finally:
|
||||||
|
await container.cleanup()
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_runner_with_mocked_adapters(self, test_config, temp_input_file):
|
||||||
|
container = DependencyContainer(test_config)
|
||||||
|
|
||||||
|
try:
|
||||||
|
await container.initialize()
|
||||||
|
|
||||||
|
with (
|
||||||
|
patch.object(container, "get_ruwiki_adapter") as mock_ruwiki,
|
||||||
|
patch.object(container, "get_llm_adapter") as mock_llm,
|
||||||
|
):
|
||||||
|
|
||||||
|
mock_ruwiki_instance = AsyncMock()
|
||||||
|
mock_ruwiki.return_value = mock_ruwiki_instance
|
||||||
|
|
||||||
|
from src.adapters.ruwiki import WikiPageInfo
|
||||||
|
|
||||||
|
mock_ruwiki_instance.fetch_page_cleaned.return_value = WikiPageInfo(
|
||||||
|
title="Тест",
|
||||||
|
content="Тестовый контент",
|
||||||
|
)
|
||||||
|
|
||||||
|
mock_llm_instance = AsyncMock()
|
||||||
|
mock_llm.return_value = mock_llm_instance
|
||||||
|
mock_llm_instance.simplify_text.return_value = ("Упрощённый текст", 100, 50)
|
||||||
|
mock_llm_instance.count_tokens.return_value = 100
|
||||||
|
|
||||||
|
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
|
||||||
|
f.write("### role: user\n{wiki_source_text}")
|
||||||
|
test_config.prompt_template_path = f.name
|
||||||
|
|
||||||
|
try:
|
||||||
|
runner = container.create_runner(max_workers=2)
|
||||||
|
|
||||||
|
stats = await runner.run_from_file(
|
||||||
|
input_file=temp_input_file,
|
||||||
|
max_articles=2,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert stats.total_processed >= 1
|
||||||
|
assert stats.successful >= 0
|
||||||
|
|
||||||
|
finally:
|
||||||
|
Path(test_config.prompt_template_path).unlink(missing_ok=True)
|
||||||
|
|
||||||
|
finally:
|
||||||
|
await container.cleanup()
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_error_handling_in_runner(self, test_config, temp_input_file):
|
||||||
|
container = DependencyContainer(test_config)
|
||||||
|
|
||||||
|
try:
|
||||||
|
await container.initialize()
|
||||||
|
|
||||||
|
with patch.object(container, "get_ruwiki_adapter") as mock_ruwiki:
|
||||||
|
mock_ruwiki_instance = AsyncMock()
|
||||||
|
mock_ruwiki.return_value = mock_ruwiki_instance
|
||||||
|
|
||||||
|
from src.adapters import WikiPageNotFoundError
|
||||||
|
|
||||||
|
mock_ruwiki_instance.fetch_page_cleaned.side_effect = WikiPageNotFoundError(
|
||||||
|
"Страница не найдена"
|
||||||
|
)
|
||||||
|
|
||||||
|
runner = container.create_runner(max_workers=1)
|
||||||
|
|
||||||
|
stats = await runner.run_from_file(
|
||||||
|
input_file=temp_input_file,
|
||||||
|
max_articles=1,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert stats.total_processed >= 1
|
||||||
|
assert stats.failed >= 1
|
||||||
|
assert stats.success_rate < 100.0
|
||||||
|
|
||||||
|
finally:
|
||||||
|
await container.cleanup()
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_concurrent_processing(self, test_config, temp_input_file):
|
||||||
|
container = DependencyContainer(test_config)
|
||||||
|
|
||||||
|
try:
|
||||||
|
await container.initialize()
|
||||||
|
|
||||||
|
with (
|
||||||
|
patch.object(container, "get_ruwiki_adapter") as mock_ruwiki,
|
||||||
|
patch.object(container, "get_llm_adapter") as mock_llm,
|
||||||
|
):
|
||||||
|
|
||||||
|
async def delayed_fetch(*args, **kwargs):
|
||||||
|
await asyncio.sleep(0.1)
|
||||||
|
from src.adapters.ruwiki import WikiPageInfo
|
||||||
|
|
||||||
|
return WikiPageInfo(title="Тест", content="Контент")
|
||||||
|
|
||||||
|
async def delayed_simplify(*args, **kwargs):
|
||||||
|
await asyncio.sleep(0.1)
|
||||||
|
return ("Упрощённый", 100, 50)
|
||||||
|
|
||||||
|
mock_ruwiki_instance = AsyncMock()
|
||||||
|
mock_ruwiki.return_value = mock_ruwiki_instance
|
||||||
|
mock_ruwiki_instance.fetch_page_cleaned.side_effect = delayed_fetch
|
||||||
|
|
||||||
|
mock_llm_instance = AsyncMock()
|
||||||
|
mock_llm.return_value = mock_llm_instance
|
||||||
|
mock_llm_instance.simplify_text.side_effect = delayed_simplify
|
||||||
|
mock_llm_instance.count_tokens.return_value = 100
|
||||||
|
|
||||||
|
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
|
||||||
|
f.write("### role: user\n{wiki_source_text}")
|
||||||
|
test_config.prompt_template_path = f.name
|
||||||
|
|
||||||
|
try:
|
||||||
|
import time
|
||||||
|
|
||||||
|
start_time = time.time()
|
||||||
|
|
||||||
|
runner = container.create_runner(max_workers=3)
|
||||||
|
stats = await runner.run_from_file(
|
||||||
|
input_file=temp_input_file,
|
||||||
|
max_articles=3,
|
||||||
|
)
|
||||||
|
|
||||||
|
elapsed_time = time.time() - start_time
|
||||||
|
|
||||||
|
assert elapsed_time < 1.0
|
||||||
|
assert stats.total_processed >= 1
|
||||||
|
|
||||||
|
finally:
|
||||||
|
Path(test_config.prompt_template_path).unlink(missing_ok=True)
|
||||||
|
|
||||||
|
finally:
|
||||||
|
await container.cleanup()
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_health_check_integration(self, test_config):
|
||||||
|
container = DependencyContainer(test_config)
|
||||||
|
|
||||||
|
try:
|
||||||
|
await container.initialize()
|
||||||
|
|
||||||
|
with (
|
||||||
|
patch.object(container, "get_ruwiki_adapter") as mock_ruwiki,
|
||||||
|
patch.object(container, "get_llm_adapter") as mock_llm,
|
||||||
|
):
|
||||||
|
|
||||||
|
mock_ruwiki_instance = AsyncMock()
|
||||||
|
mock_ruwiki.return_value = mock_ruwiki_instance
|
||||||
|
mock_ruwiki_instance.health_check.return_value = True
|
||||||
|
|
||||||
|
mock_llm_instance = AsyncMock()
|
||||||
|
mock_llm.return_value = mock_llm_instance
|
||||||
|
mock_llm_instance.health_check.return_value = True
|
||||||
|
|
||||||
|
checks = await container.health_check()
|
||||||
|
|
||||||
|
assert checks["database"] is True
|
||||||
|
assert checks["write_queue"] is True
|
||||||
|
assert checks["ruwiki"] is True
|
||||||
|
assert checks["llm"] is True
|
||||||
|
|
||||||
|
finally:
|
||||||
|
await container.cleanup()
|
|
@ -0,0 +1,263 @@
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from src.models import (
|
||||||
|
AppConfig,
|
||||||
|
Article,
|
||||||
|
ProcessingResult,
|
||||||
|
ProcessingStats,
|
||||||
|
ProcessingStatus,
|
||||||
|
SimplifyCommand,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestAppConfig:
|
||||||
|
|
||||||
|
def test_default_values(self):
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
AppConfig()
|
||||||
|
|
||||||
|
def test_valid_config(self):
|
||||||
|
config = AppConfig(
|
||||||
|
openai_api_key="test_key",
|
||||||
|
db_path="./test.db",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert config.openai_api_key == "test_key"
|
||||||
|
assert config.openai_model == "gpt-4o-mini"
|
||||||
|
assert config.openai_temperature == 0.0
|
||||||
|
assert config.max_concurrent_llm == 5
|
||||||
|
assert config.openai_rpm == 200
|
||||||
|
|
||||||
|
def test_db_url_generation(self):
|
||||||
|
config = AppConfig(
|
||||||
|
openai_api_key="test_key",
|
||||||
|
db_path="./test.db",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert config.db_url == "sqlite+aiosqlite:///test.db"
|
||||||
|
assert config.sync_db_url == "sqlite:///test.db"
|
||||||
|
|
||||||
|
def test_validation_constraints(self):
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
AppConfig(
|
||||||
|
openai_api_key="test_key",
|
||||||
|
openai_temperature=3.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
AppConfig(
|
||||||
|
openai_api_key="test_key",
|
||||||
|
max_concurrent_llm=100,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestArticle:
|
||||||
|
|
||||||
|
def test_article_creation(self, sample_article_data):
|
||||||
|
article = Article(
|
||||||
|
url=sample_article_data.url,
|
||||||
|
title=sample_article_data.title,
|
||||||
|
raw_text=sample_article_data.raw_text,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert article.url == sample_article_data.url
|
||||||
|
assert article.title == sample_article_data.title
|
||||||
|
assert article.status == ProcessingStatus.PENDING
|
||||||
|
assert article.simplified_text is None
|
||||||
|
assert isinstance(article.created_at, datetime)
|
||||||
|
|
||||||
|
def test_mark_processing(self, sample_article):
|
||||||
|
article = sample_article
|
||||||
|
original_updated = article.updated_at
|
||||||
|
|
||||||
|
article.mark_processing()
|
||||||
|
|
||||||
|
assert article.status == ProcessingStatus.PROCESSING
|
||||||
|
assert article.updated_at != original_updated
|
||||||
|
|
||||||
|
def test_mark_completed(self, sample_article):
|
||||||
|
article = sample_article
|
||||||
|
simplified_text = "Упрощённый текст"
|
||||||
|
|
||||||
|
article.mark_completed(
|
||||||
|
simplified_text=simplified_text,
|
||||||
|
token_count_raw=100,
|
||||||
|
token_count_simplified=50,
|
||||||
|
processing_time=2.5,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert article.status == ProcessingStatus.COMPLETED
|
||||||
|
assert article.simplified_text == simplified_text
|
||||||
|
assert article.token_count_raw == 100
|
||||||
|
assert article.token_count_simplified == 50
|
||||||
|
assert article.processing_time_seconds == 2.5
|
||||||
|
assert article.error_message is None
|
||||||
|
assert article.updated_at is not None
|
||||||
|
|
||||||
|
def test_mark_failed(self, sample_article):
|
||||||
|
article = sample_article
|
||||||
|
error_message = "Тестовая ошибка"
|
||||||
|
|
||||||
|
article.mark_failed(error_message)
|
||||||
|
|
||||||
|
assert article.status == ProcessingStatus.FAILED
|
||||||
|
assert article.error_message == error_message
|
||||||
|
assert article.updated_at is not None
|
||||||
|
|
||||||
|
def test_mark_failed_long_error(self, sample_article):
|
||||||
|
article = sample_article
|
||||||
|
long_error = "x" * 1500
|
||||||
|
|
||||||
|
article.mark_failed(long_error)
|
||||||
|
|
||||||
|
assert len(article.error_message) == 1000
|
||||||
|
assert article.error_message == "x" * 1000
|
||||||
|
|
||||||
|
|
||||||
|
class TestSimplifyCommand:
|
||||||
|
|
||||||
|
def test_command_creation(self):
|
||||||
|
url = "https://ru.wikipedia.org/wiki/Тест"
|
||||||
|
command = SimplifyCommand(url=url)
|
||||||
|
|
||||||
|
assert command.url == url
|
||||||
|
assert command.force_reprocess is False
|
||||||
|
|
||||||
|
def test_command_with_force(self):
|
||||||
|
url = "https://ru.wikipedia.org/wiki/Тест"
|
||||||
|
command = SimplifyCommand(url=url, force_reprocess=True)
|
||||||
|
|
||||||
|
assert command.url == url
|
||||||
|
assert command.force_reprocess is True
|
||||||
|
|
||||||
|
def test_command_string_representation(self):
|
||||||
|
url = "https://ru.wikipedia.org/wiki/Тест"
|
||||||
|
command = SimplifyCommand(url=url, force_reprocess=True)
|
||||||
|
|
||||||
|
expected = f"SimplifyCommand(url='{url}', force=True)"
|
||||||
|
assert str(command) == expected
|
||||||
|
|
||||||
|
|
||||||
|
class TestProcessingResult:
|
||||||
|
|
||||||
|
def test_success_result_creation(self):
|
||||||
|
result = ProcessingResult.success_result(
|
||||||
|
url="https://ru.wikipedia.org/wiki/Тест",
|
||||||
|
title="Тест",
|
||||||
|
raw_text="Исходный текст",
|
||||||
|
simplified_text="Упрощённый текст",
|
||||||
|
token_count_raw=100,
|
||||||
|
token_count_simplified=50,
|
||||||
|
processing_time_seconds=2.5,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert result.success is True
|
||||||
|
assert result.url == "https://ru.wikipedia.org/wiki/Тест"
|
||||||
|
assert result.title == "Тест"
|
||||||
|
assert result.raw_text == "Исходный текст"
|
||||||
|
assert result.simplified_text == "Упрощённый текст"
|
||||||
|
assert result.token_count_raw == 100
|
||||||
|
assert result.token_count_simplified == 50
|
||||||
|
assert result.processing_time_seconds == 2.5
|
||||||
|
assert result.error_message is None
|
||||||
|
|
||||||
|
def test_failure_result_creation(self):
|
||||||
|
result = ProcessingResult.failure_result(
|
||||||
|
url="https://ru.wikipedia.org/wiki/Тест",
|
||||||
|
error_message="Тестовая ошибка",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert result.success is False
|
||||||
|
assert result.url == "https://ru.wikipedia.org/wiki/Тест"
|
||||||
|
assert result.error_message == "Тестовая ошибка"
|
||||||
|
assert result.title is None
|
||||||
|
assert result.raw_text is None
|
||||||
|
assert result.simplified_text is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestProcessingStats:
|
||||||
|
|
||||||
|
def test_initial_stats(self):
|
||||||
|
stats = ProcessingStats()
|
||||||
|
|
||||||
|
assert stats.total_processed == 0
|
||||||
|
assert stats.successful == 0
|
||||||
|
assert stats.failed == 0
|
||||||
|
assert stats.skipped == 0
|
||||||
|
assert stats.success_rate == 0.0
|
||||||
|
assert stats.average_processing_time == 0.0
|
||||||
|
|
||||||
|
def test_add_successful_result(self):
|
||||||
|
stats = ProcessingStats()
|
||||||
|
result = ProcessingResult.success_result(
|
||||||
|
url="test",
|
||||||
|
title="Test",
|
||||||
|
raw_text="text",
|
||||||
|
simplified_text="simple",
|
||||||
|
token_count_raw=100,
|
||||||
|
token_count_simplified=50,
|
||||||
|
processing_time_seconds=2.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
stats.add_result(result)
|
||||||
|
|
||||||
|
assert stats.total_processed == 1
|
||||||
|
assert stats.successful == 1
|
||||||
|
assert stats.failed == 0
|
||||||
|
assert stats.success_rate == 100.0
|
||||||
|
assert stats.average_processing_time == 2.0
|
||||||
|
|
||||||
|
def test_add_failed_result(self):
|
||||||
|
stats = ProcessingStats()
|
||||||
|
result = ProcessingResult.failure_result("test", "error")
|
||||||
|
|
||||||
|
stats.add_result(result)
|
||||||
|
|
||||||
|
assert stats.total_processed == 1
|
||||||
|
assert stats.successful == 0
|
||||||
|
assert stats.failed == 1
|
||||||
|
assert stats.success_rate == 0.0
|
||||||
|
|
||||||
|
def test_mixed_results(self):
|
||||||
|
stats = ProcessingStats()
|
||||||
|
|
||||||
|
success_result = ProcessingResult.success_result(
|
||||||
|
url="test1",
|
||||||
|
title="Test1",
|
||||||
|
raw_text="text",
|
||||||
|
simplified_text="simple",
|
||||||
|
token_count_raw=100,
|
||||||
|
token_count_simplified=50,
|
||||||
|
processing_time_seconds=3.0,
|
||||||
|
)
|
||||||
|
stats.add_result(success_result)
|
||||||
|
|
||||||
|
failure_result = ProcessingResult.failure_result("test2", "error")
|
||||||
|
stats.add_result(failure_result)
|
||||||
|
|
||||||
|
success_result2 = ProcessingResult.success_result(
|
||||||
|
url="test3",
|
||||||
|
title="Test3",
|
||||||
|
raw_text="text",
|
||||||
|
simplified_text="simple",
|
||||||
|
token_count_raw=100,
|
||||||
|
token_count_simplified=50,
|
||||||
|
processing_time_seconds=1.0,
|
||||||
|
)
|
||||||
|
stats.add_result(success_result2)
|
||||||
|
|
||||||
|
assert stats.total_processed == 3
|
||||||
|
assert stats.successful == 2
|
||||||
|
assert stats.failed == 1
|
||||||
|
assert stats.success_rate == pytest.approx(66.67, rel=1e-2)
|
||||||
|
assert stats.average_processing_time == 2.0
|
||||||
|
|
||||||
|
def test_add_skipped(self):
|
||||||
|
stats = ProcessingStats()
|
||||||
|
|
||||||
|
stats.add_skipped()
|
||||||
|
stats.add_skipped()
|
||||||
|
|
||||||
|
assert stats.skipped == 2
|
|
@ -0,0 +1,370 @@
|
||||||
|
"""Тесты для сервисов."""
|
||||||
|
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import AsyncMock, MagicMock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from src.adapters import LLMProviderAdapter, RuWikiAdapter
|
||||||
|
from src.adapters.ruwiki import WikiPageInfo
|
||||||
|
from src.models import ProcessingResult, SimplifyCommand
|
||||||
|
from src.services import (
|
||||||
|
AsyncWriteQueue,
|
||||||
|
DatabaseService,
|
||||||
|
RecursiveCharacterTextSplitter,
|
||||||
|
SimplifyService,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestRecursiveCharacterTextSplitter:
|
||||||
|
def test_split_short_text(self):
|
||||||
|
splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
|
||||||
|
|
||||||
|
short_text = "Это короткий текст."
|
||||||
|
chunks = splitter.split_text(short_text)
|
||||||
|
|
||||||
|
assert len(chunks) == 1
|
||||||
|
assert chunks[0] == short_text
|
||||||
|
|
||||||
|
def test_split_long_text(self):
|
||||||
|
splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
|
||||||
|
|
||||||
|
long_text = "Это очень длинный текст. " * 10
|
||||||
|
chunks = splitter.split_text(long_text)
|
||||||
|
|
||||||
|
assert len(chunks) > 1
|
||||||
|
|
||||||
|
for chunk in chunks:
|
||||||
|
assert len(chunk) <= 60
|
||||||
|
|
||||||
|
def test_split_by_paragraphs(self):
|
||||||
|
splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)
|
||||||
|
|
||||||
|
text = "Первый абзац.\n\nВторой абзац.\n\nТретий абзац."
|
||||||
|
chunks = splitter.split_text(text)
|
||||||
|
|
||||||
|
assert len(chunks) >= 2
|
||||||
|
|
||||||
|
def test_split_empty_text(self):
|
||||||
|
splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
|
||||||
|
|
||||||
|
chunks = splitter.split_text("")
|
||||||
|
assert chunks == []
|
||||||
|
|
||||||
|
def test_custom_length_function(self):
|
||||||
|
def word_count(text: str) -> int:
|
||||||
|
return len(text.split())
|
||||||
|
|
||||||
|
splitter = RecursiveCharacterTextSplitter(
|
||||||
|
chunk_size=5,
|
||||||
|
chunk_overlap=2,
|
||||||
|
length_function=word_count,
|
||||||
|
)
|
||||||
|
|
||||||
|
text = "Один два три четыре пять шесть семь восемь девять десять"
|
||||||
|
chunks = splitter.split_text(text)
|
||||||
|
|
||||||
|
assert len(chunks) > 1
|
||||||
|
|
||||||
|
for chunk in chunks:
|
||||||
|
word_count_in_chunk = len(chunk.split())
|
||||||
|
assert word_count_in_chunk <= 7
|
||||||
|
|
||||||
|
def test_create_chunks_with_metadata(self):
|
||||||
|
splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
|
||||||
|
|
||||||
|
text = "Это тестовый текст. " * 10
|
||||||
|
title = "Тестовая статья"
|
||||||
|
|
||||||
|
chunks_with_metadata = splitter.create_chunks_with_metadata(text, title)
|
||||||
|
|
||||||
|
assert len(chunks_with_metadata) > 1
|
||||||
|
|
||||||
|
for i, chunk_data in enumerate(chunks_with_metadata):
|
||||||
|
assert "text" in chunk_data
|
||||||
|
assert chunk_data["title"] == title
|
||||||
|
assert chunk_data["chunk_index"] == i
|
||||||
|
assert chunk_data["total_chunks"] == len(chunks_with_metadata)
|
||||||
|
assert "chunk_size" in chunk_data
|
||||||
|
|
||||||
|
|
||||||
|
class TestDatabaseService:
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_initialize_database(self, test_config):
|
||||||
|
db_service = DatabaseService(test_config)
|
||||||
|
|
||||||
|
await db_service.initialize_database()
|
||||||
|
|
||||||
|
assert Path(test_config.db_path).exists()
|
||||||
|
|
||||||
|
assert await db_service.health_check() is True
|
||||||
|
|
||||||
|
db_service.close()
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_get_connection(self, test_config):
|
||||||
|
db_service = DatabaseService(test_config)
|
||||||
|
await db_service.initialize_database()
|
||||||
|
|
||||||
|
async with db_service.get_connection() as conn:
|
||||||
|
cursor = await conn.execute("SELECT 1")
|
||||||
|
result = await cursor.fetchone()
|
||||||
|
assert result[0] == 1
|
||||||
|
|
||||||
|
db_service.close()
|
||||||
|
|
||||||
|
|
||||||
|
class TestAsyncWriteQueue:
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_start_stop(self):
|
||||||
|
mock_repository = AsyncMock()
|
||||||
|
queue = AsyncWriteQueue(mock_repository, max_batch_size=5)
|
||||||
|
|
||||||
|
await queue.start()
|
||||||
|
assert queue._worker_task is not None
|
||||||
|
|
||||||
|
await queue.stop(timeout=1.0)
|
||||||
|
assert queue._worker_task.done()
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_update_from_result_success(self, sample_article, simplified_text):
|
||||||
|
mock_repository = AsyncMock()
|
||||||
|
mock_repository.get_by_url.return_value = sample_article
|
||||||
|
mock_repository.update_article.return_value = sample_article
|
||||||
|
|
||||||
|
queue = AsyncWriteQueue(mock_repository, max_batch_size=1)
|
||||||
|
await queue.start()
|
||||||
|
|
||||||
|
try:
|
||||||
|
result = ProcessingResult.success_result(
|
||||||
|
url=sample_article.url,
|
||||||
|
title=sample_article.title,
|
||||||
|
raw_text=sample_article.raw_text,
|
||||||
|
simplified_text=simplified_text,
|
||||||
|
token_count_raw=100,
|
||||||
|
token_count_simplified=50,
|
||||||
|
processing_time_seconds=2.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
updated_article = await queue.update_from_result(result)
|
||||||
|
|
||||||
|
assert updated_article.simplified_text == simplified_text
|
||||||
|
mock_repository.get_by_url.assert_called_once_with(sample_article.url)
|
||||||
|
mock_repository.update_article.assert_called_once()
|
||||||
|
|
||||||
|
finally:
|
||||||
|
await queue.stop(timeout=1.0)
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_update_from_result_failure(self, sample_article):
|
||||||
|
mock_repository = AsyncMock()
|
||||||
|
mock_repository.get_by_url.return_value = sample_article
|
||||||
|
mock_repository.update_article.return_value = sample_article
|
||||||
|
|
||||||
|
queue = AsyncWriteQueue(mock_repository, max_batch_size=1)
|
||||||
|
await queue.start()
|
||||||
|
|
||||||
|
try:
|
||||||
|
result = ProcessingResult.failure_result(
|
||||||
|
url=sample_article.url,
|
||||||
|
error_message="Тестовая ошибка",
|
||||||
|
)
|
||||||
|
|
||||||
|
updated_article = await queue.update_from_result(result)
|
||||||
|
|
||||||
|
assert updated_article.error_message == "Тестовая ошибка"
|
||||||
|
mock_repository.update_article.assert_called_once()
|
||||||
|
|
||||||
|
finally:
|
||||||
|
await queue.stop(timeout=1.0)
|
||||||
|
|
||||||
|
def test_stats(self):
|
||||||
|
mock_repository = AsyncMock()
|
||||||
|
queue = AsyncWriteQueue(mock_repository)
|
||||||
|
|
||||||
|
stats = queue.stats
|
||||||
|
|
||||||
|
assert "total_operations" in stats
|
||||||
|
assert "failed_operations" in stats
|
||||||
|
assert "queue_size" in stats
|
||||||
|
assert stats["total_operations"] == 0
|
||||||
|
|
||||||
|
|
||||||
|
class TestSimplifyService:
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_adapters_and_queue(self, test_config):
|
||||||
|
mock_ruwiki = AsyncMock(spec=RuWikiAdapter)
|
||||||
|
mock_llm = AsyncMock(spec=LLMProviderAdapter)
|
||||||
|
mock_repository = AsyncMock()
|
||||||
|
mock_write_queue = AsyncMock()
|
||||||
|
|
||||||
|
return mock_ruwiki, mock_llm, mock_repository, mock_write_queue
|
||||||
|
|
||||||
|
def test_service_initialization(self, test_config, mock_adapters_and_queue):
|
||||||
|
mock_ruwiki, mock_llm, mock_repository, mock_write_queue = mock_adapters_and_queue
|
||||||
|
|
||||||
|
service = SimplifyService(
|
||||||
|
config=test_config,
|
||||||
|
ruwiki_adapter=mock_ruwiki,
|
||||||
|
llm_adapter=mock_llm,
|
||||||
|
repository=mock_repository,
|
||||||
|
write_queue=mock_write_queue,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert service.config == test_config
|
||||||
|
assert service.ruwiki_adapter == mock_ruwiki
|
||||||
|
assert service.llm_adapter == mock_llm
|
||||||
|
assert isinstance(service.text_splitter, RecursiveCharacterTextSplitter)
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_get_prompt_template(self, test_config, mock_adapters_and_queue):
|
||||||
|
mock_ruwiki, mock_llm, mock_repository, mock_write_queue = mock_adapters_and_queue
|
||||||
|
|
||||||
|
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
|
||||||
|
f.write("### role: system\nТы помощник")
|
||||||
|
temp_prompt_path = f.name
|
||||||
|
|
||||||
|
test_config.prompt_template_path = temp_prompt_path
|
||||||
|
|
||||||
|
service = SimplifyService(
|
||||||
|
config=test_config,
|
||||||
|
ruwiki_adapter=mock_ruwiki,
|
||||||
|
llm_adapter=mock_llm,
|
||||||
|
repository=mock_repository,
|
||||||
|
write_queue=mock_write_queue,
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
template = await service.get_prompt_template()
|
||||||
|
assert "### role: system" in template
|
||||||
|
assert "Ты помощник" in template
|
||||||
|
|
||||||
|
template2 = await service.get_prompt_template()
|
||||||
|
assert template == template2
|
||||||
|
|
||||||
|
finally:
|
||||||
|
Path(temp_prompt_path).unlink(missing_ok=True)
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_get_prompt_template_not_found(self, test_config, mock_adapters_and_queue):
|
||||||
|
mock_ruwiki, mock_llm, mock_repository, mock_write_queue = mock_adapters_and_queue
|
||||||
|
|
||||||
|
test_config.prompt_template_path = "nonexistent.txt"
|
||||||
|
|
||||||
|
service = SimplifyService(
|
||||||
|
config=test_config,
|
||||||
|
ruwiki_adapter=mock_ruwiki,
|
||||||
|
llm_adapter=mock_llm,
|
||||||
|
repository=mock_repository,
|
||||||
|
write_queue=mock_write_queue,
|
||||||
|
)
|
||||||
|
|
||||||
|
with pytest.raises(FileNotFoundError):
|
||||||
|
await service.get_prompt_template()
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_process_command_success(
|
||||||
|
self, test_config, mock_adapters_and_queue, sample_wikitext, simplified_text
|
||||||
|
):
|
||||||
|
mock_ruwiki, mock_llm, mock_repository, mock_write_queue = mock_adapters_and_queue
|
||||||
|
|
||||||
|
wiki_page_info = WikiPageInfo(
|
||||||
|
title="Тест",
|
||||||
|
content=sample_wikitext,
|
||||||
|
)
|
||||||
|
mock_ruwiki.fetch_page_cleaned.return_value = wiki_page_info
|
||||||
|
mock_llm.simplify_text.return_value = (simplified_text, 100, 50)
|
||||||
|
mock_llm.count_tokens.return_value = 100
|
||||||
|
|
||||||
|
mock_repository.get_by_url.return_value = None
|
||||||
|
mock_repository.create_article.return_value = MagicMock(id=1)
|
||||||
|
mock_repository.update_article.return_value = MagicMock()
|
||||||
|
|
||||||
|
mock_write_queue.update_from_result.return_value = MagicMock()
|
||||||
|
|
||||||
|
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
|
||||||
|
f.write("### role: user\n{wiki_source_text}")
|
||||||
|
test_config.prompt_template_path = f.name
|
||||||
|
|
||||||
|
service = SimplifyService(
|
||||||
|
config=test_config,
|
||||||
|
ruwiki_adapter=mock_ruwiki,
|
||||||
|
llm_adapter=mock_llm,
|
||||||
|
repository=mock_repository,
|
||||||
|
write_queue=mock_write_queue,
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
command = SimplifyCommand(url="https://ru.wikipedia.org/wiki/Тест")
|
||||||
|
result = await service.process_command(command)
|
||||||
|
|
||||||
|
assert result.success is True
|
||||||
|
assert result.title == "Тест"
|
||||||
|
assert result.simplified_text == simplified_text
|
||||||
|
assert result.token_count_raw == 100
|
||||||
|
assert result.token_count_simplified == 50
|
||||||
|
|
||||||
|
mock_ruwiki.fetch_page_cleaned.assert_called_once()
|
||||||
|
mock_llm.simplify_text.assert_called_once()
|
||||||
|
mock_write_queue.update_from_result.assert_called_once()
|
||||||
|
|
||||||
|
finally:
|
||||||
|
Path(test_config.prompt_template_path).unlink(missing_ok=True)
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_process_command_skip_existing(
|
||||||
|
self, test_config, mock_adapters_and_queue, completed_article
|
||||||
|
):
|
||||||
|
mock_ruwiki, mock_llm, mock_repository, mock_write_queue = mock_adapters_and_queue
|
||||||
|
|
||||||
|
mock_repository.get_by_url.return_value = completed_article
|
||||||
|
|
||||||
|
service = SimplifyService(
|
||||||
|
config=test_config,
|
||||||
|
ruwiki_adapter=mock_ruwiki,
|
||||||
|
llm_adapter=mock_llm,
|
||||||
|
repository=mock_repository,
|
||||||
|
write_queue=mock_write_queue,
|
||||||
|
)
|
||||||
|
|
||||||
|
command = SimplifyCommand(url=completed_article.url, force_reprocess=False)
|
||||||
|
result = await service.process_command(command)
|
||||||
|
|
||||||
|
assert result.success is True
|
||||||
|
assert result.title == completed_article.title
|
||||||
|
|
||||||
|
mock_ruwiki.fetch_page_cleaned.assert_not_called()
|
||||||
|
mock_llm.simplify_text.assert_not_called()
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_health_check(self, test_config, mock_adapters_and_queue):
|
||||||
|
mock_ruwiki, mock_llm, mock_repository, mock_write_queue = mock_adapters_and_queue
|
||||||
|
|
||||||
|
mock_ruwiki.health_check.return_value = True
|
||||||
|
mock_llm.health_check.return_value = True
|
||||||
|
|
||||||
|
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
|
||||||
|
f.write("test prompt")
|
||||||
|
test_config.prompt_template_path = f.name
|
||||||
|
|
||||||
|
service = SimplifyService(
|
||||||
|
config=test_config,
|
||||||
|
ruwiki_adapter=mock_ruwiki,
|
||||||
|
llm_adapter=mock_llm,
|
||||||
|
repository=mock_repository,
|
||||||
|
write_queue=mock_write_queue,
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
checks = await service.health_check()
|
||||||
|
|
||||||
|
assert checks["ruwiki"] is True
|
||||||
|
assert checks["llm"] is True
|
||||||
|
assert checks["prompt_template"] is True
|
||||||
|
|
||||||
|
finally:
|
||||||
|
Path(test_config.prompt_template_path).unlink(missing_ok=True)
|
Loading…
Reference in New Issue