feat: add manager

itqop 2025-11-05 02:04:56 +03:00
parent 18cbbe00d3
commit ad12343784
5 changed files with 273 additions and 168 deletions


@@ -1,3 +1,6 @@
+# src/dataloader/api/__init__.py
+from __future__ import annotations
+
 from collections.abc import AsyncGenerator
 import contextlib
 import typing as tp
@@ -8,31 +11,35 @@ from .metric_router import router as metric_router
 from .middleware import log_requests
 from .os_router import router as service_router
 from .v1 import router as v1_router
+from dataloader.context import APP_CTX
+from dataloader.workers.manager import build_manager_from_env, WorkerManager
+
+_manager: WorkerManager | None = None
 
 
 @contextlib.asynccontextmanager
 async def lifespan(app: tp.Any) -> AsyncGenerator[None, None]:
-    from dataloader.context import APP_CTX
+    """
+    Application lifespan: initialize the context and start/stop the worker manager.
+    """
+    global _manager
     await APP_CTX.on_startup()
-    yield
-    await APP_CTX.on_shutdown()
+    _manager = build_manager_from_env()
+    await _manager.start()
+    try:
+        yield
+    finally:
+        if _manager is not None:
+            await _manager.stop()
+            _manager = None
+        await APP_CTX.on_shutdown()
 
 
 app_main = FastAPI(title="Data Gateway", lifespan=lifespan)
 app_main.middleware("http")(log_requests)
-app_main.include_router(
-    service_router, tags=["Openshift dataloader routes"]
-)
-app_main.include_router(
-    metric_router, tags=["Like/dislike metric dataloader routes"]
-)
-app_main.include_router(
-    v1_router, prefix="/api/v1", tags=["dataloader"]
-)
+app_main.include_router(service_router, tags=["Openshift dataloader routes"])
+app_main.include_router(metric_router, tags=["Like/dislike metric dataloader routes"])
+app_main.include_router(v1_router, prefix="/api/v1", tags=["dataloader"])
 
-__all__ = [
-    "app_main",
-]
+__all__ = ["app_main"]
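
The lifespan above starts the worker manager only after APP_CTX.on_startup() succeeds and stops it before on_shutdown(). A minimal smoke check of that ordering, assuming the app's startup dependencies (context, database) are reachable in the test environment:

# Sketch: FastAPI's TestClient runs the lifespan on context entry/exit,
# so startup and the finally-branch teardown both execute.
from fastapi.testclient import TestClient

from dataloader.api import app_main


def test_lifespan_starts_and_stops_cleanly() -> None:
    with TestClient(app_main):
        pass  # completing without raising exercises startup and teardown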


@@ -1,71 +1,110 @@
-"""Shared PG worker: claim/lease/heartbeat/retry."""
+# src/dataloader/workers/base.py
+from __future__ import annotations
+
 import asyncio
-from typing import Optional
-from uuid import UUID
-from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy import text
-from ..config import Settings, PGSettings
-from ..storage.db import Database
-from .pipelines.registry import PipelineRegistry
+from contextlib import AsyncExitStack
+from dataclasses import dataclass
+from datetime import datetime, timedelta, timezone
+from typing import AsyncIterator, Callable, Optional
+
+from dataloader.context import APP_CTX
+from dataloader.storage.db import get_sessionmaker
+from dataloader.storage.repositories import QueueRepository
+from dataloader.workers.pipelines.registry import resolve as resolve_pipeline
 
 
-class BaseWorker:
-    """Base worker for processing jobs from the queue."""
-
-    def __init__(
-        self,
-        queue: str,
-        settings: Settings,
-        shutdown_event: asyncio.Event,
-    ):
-        self.queue = queue
-        self.settings = settings
-        self.shutdown_event = shutdown_event
-        self.db: Optional[Database] = None
-        self.current_job_id: Optional[UUID] = None
-        self.current_lock_key: Optional[str] = None
-
-    async def run(self):
-        """Main worker loop."""
-        # TODO: init DB, connect via LISTEN/NOTIFY
-        # TODO: claim -> heartbeat -> execute -> finish loop
-        raise NotImplementedError
-
-    async def claim_job(self, session: AsyncSession) -> Optional[dict]:
-        """
-        Claim a single job from the queue.
-        Returns a dict with job_id, task, args, lock_key, or None.
-        """
-        # TODO: implement SELECT ... FOR UPDATE SKIP LOCKED
-        # TODO: check the advisory lock on lock_key
-        # TODO: back off if the advisory lock is held
-        raise NotImplementedError
-
-    async def heartbeat(self, session: AsyncSession, job_id: UUID):
-        """Update the job's heartbeat."""
-        # TODO: UPDATE heartbeat_at and lease_expires_at
-        raise NotImplementedError
-
-    async def finish_job(
-        self,
-        session: AsyncSession,
-        job_id: UUID,
-        success: bool,
-        error: Optional[str] = None,
-    ):
-        """Finish a job (success, or failure with retry)."""
-        # TODO: UPDATE status depending on success and attempt
-        # TODO: release the advisory lock
-        raise NotImplementedError
-
-    async def check_cancel_requested(
-        self,
-        session: AsyncSession,
-        job_id: UUID,
-    ) -> bool:
-        """Check the cancel_requested flag."""
-        # TODO: SELECT cancel_requested
-        raise NotImplementedError
+
+@dataclass(frozen=True)
+class WorkerConfig:
+    """
+    Worker configuration.
+    """
+    queue: str
+    heartbeat_sec: int
+    claim_backoff_sec: int
+
+
+class PGWorker:
+    """
+    Base asynchronous Postgres queue worker.
+    """
+
+    def __init__(self, cfg: WorkerConfig, stop_event: asyncio.Event) -> None:
+        self._cfg = cfg
+        self._stop = stop_event
+        self._log = APP_CTX.get_logger()
+        self._sm = get_sessionmaker()
+
+    async def run(self) -> None:
+        """
+        Main loop: wait -> claim -> execute -> finish.
+        """
+        self._log.info(f"worker.start queue={self._cfg.queue}")
+        while not self._stop.is_set():
+            claimed = await self._claim_and_execute_once()
+            if not claimed:
+                await self._listen_or_sleep(self._cfg.claim_backoff_sec)
+        self._log.info(f"worker.stop queue={self._cfg.queue}")
+
+    async def _listen_or_sleep(self, timeout_sec: int) -> None:
+        """
+        Wait for new jobs, up to a timeout.
+        """
+        try:
+            await asyncio.wait_for(self._stop.wait(), timeout=timeout_sec)
+        except asyncio.TimeoutError:
+            return
+
+    async def _claim_and_execute_once(self) -> bool:
+        """
+        Make a single attempt to claim and process a job.
+        """
+        async with AsyncExitStack() as stack:
+            s = await stack.enter_async_context(self._sm())
+            repo = QueueRepository(s)
+            row = await repo.claim_one(self._cfg.queue)
+            if not row:
+                await s.commit()
+                return False
+            job_id = row["job_id"]
+            ttl = int(row["lease_ttl_sec"])
+            task = row["task"]
+            args = row["args"]
+            try:
+                await self._execute_with_heartbeat(job_id, ttl, self._pipeline(task, args))
+                await repo.finish_ok(job_id)
+                return True
+            except asyncio.CancelledError:
+                await repo.finish_fail_or_retry(job_id, "cancelled")
+                raise
+            except Exception as e:
+                await repo.finish_fail_or_retry(job_id, str(e))
+                return True
+
+    async def _execute_with_heartbeat(self, job_id: str, ttl: int, it: AsyncIterator[None]) -> None:
+        """
+        Run the pipeline with heartbeat support.
+        """
+        next_hb = datetime.now(timezone.utc) + timedelta(seconds=self._cfg.heartbeat_sec)
+        async for _ in it:
+            if datetime.now(timezone.utc) >= next_hb:
+                async with self._sm() as s_hb:
+                    await QueueRepository(s_hb).heartbeat(job_id, ttl)
+                next_hb = datetime.now(timezone.utc) + timedelta(seconds=self._cfg.heartbeat_sec)
+            if self._stop.is_set():
+                raise asyncio.CancelledError()
+
+    async def _pipeline(self, task: str, args: dict) -> AsyncIterator[None]:
+        """
+        Invoke the registered pipeline for the task name.
+        """
+        fn: Callable[[dict], object] = resolve_pipeline(task)
+        res = fn(args)
+        if hasattr(res, "__aiter__"):
+            async for _ in res:  # type: ignore[func-returns-value]
+                yield
+        elif asyncio.iscoroutine(res):
+            await res  # type: ignore[arg-type]
+            yield
+        else:
+            yield
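
_pipeline() accepts three handler shapes: an async generator (each yield is a heartbeat/cancellation checkpoint), a coroutine, or a plain callable executed inline. A sketch of all three, with hypothetical task names:

# Sketch: handler shapes PGWorker._pipeline() can drive.
# The task names ("chunked", "one_shot", "inline") are illustrative.
import asyncio
from typing import AsyncIterator

from dataloader.workers.pipelines.registry import register


@register("chunked")
async def chunked(args: dict) -> AsyncIterator[None]:
    for _ in range(int(args.get("steps", 3))):
        await asyncio.sleep(0)  # one unit of work
        yield                   # worker may heartbeat here and observe stop


@register("one_shot")
async def one_shot(args: dict) -> None:
    await asyncio.sleep(0)  # plain coroutine: awaited once, single yield


@register("inline")
def inline(args: dict) -> None:
    pass  # sync callable: runs inline, single yield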


@@ -1,53 +1,106 @@
-"""Create worker asyncio Tasks from the config."""
+# src/dataloader/workers/manager.py
+from __future__ import annotations
+
 import asyncio
-import json
-from typing import List, Dict, Any
-from ..config import Settings, PGSettings
-from .base import BaseWorker
+import contextlib
+from dataclasses import dataclass
+from typing import Any
+
+from dataloader.context import APP_CTX
+from dataloader.config import APP_CONFIG
+from dataloader.storage.db import get_sessionmaker
+from dataloader.storage.repositories import QueueRepository
+from dataloader.workers.base import PGWorker, WorkerConfig
+
+
+@dataclass(frozen=True)
+class WorkerSpec:
+    """
+    Configuration of a worker set for one queue.
+    """
+    queue: str
+    concurrency: int
 
 
 class WorkerManager:
-    """Worker manager: creates and controls workers from the config."""
-
-    def __init__(self, settings: Settings):
-        self.settings = settings
-        self.workers: List[asyncio.Task] = []
-        self.worker_configs: List[Dict[str, Any]] = []
-        self._shutdown_event = asyncio.Event()
-
-    def _parse_workers_config(self) -> List[Dict[str, Any]]:
-        """Parse WORKERS_JSON from the config."""
-        if not self.settings.workers_json:
-            return []
-        try:
-            return json.loads(self.settings.workers_json)
-        except json.JSONDecodeError:
-            # TODO: log the error
-            return []
-
-    async def start(self):
-        """Start all workers."""
-        self.worker_configs = self._parse_workers_config()
-        for config in self.worker_configs:
-            queue = config.get("queue")
-            concurrency = config.get("concurrency", 1)
-            for _ in range(concurrency):
-                worker = BaseWorker(
-                    queue=queue,
-                    settings=self.settings,
-                    shutdown_event=self._shutdown_event,
-                )
-                task = asyncio.create_task(worker.run())
-                self.workers.append(task)
-
-    async def shutdown(self, timeout: float = 30.0):
-        """Graceful worker shutdown."""
-        self._shutdown_event.set()
-        # Wait for all workers to finish, with a timeout
-        await asyncio.wait_for(
-            asyncio.gather(*self.workers, return_exceptions=True),
-            timeout=timeout,
-        )
+    """
+    Manages the lifecycle of the asynchronous workers.
+    """
+
+    def __init__(self, specs: list[WorkerSpec]) -> None:
+        self._log = APP_CTX.get_logger()
+        self._specs = specs
+        self._stop = asyncio.Event()
+        self._tasks: list[asyncio.Task] = []
+        self._reaper_task: asyncio.Task | None = None
+
+    async def start(self) -> None:
+        """
+        Start the workers and the background reaper task.
+        """
+        hb = int(APP_CONFIG.dl.dl_heartbeat_sec)
+        backoff = int(APP_CONFIG.dl.dl_claim_backoff_sec)
+        for spec in self._specs:
+            for i in range(max(1, spec.concurrency)):
+                cfg = WorkerConfig(queue=spec.queue, heartbeat_sec=hb, claim_backoff_sec=backoff)
+                t = asyncio.create_task(PGWorker(cfg, self._stop).run(), name=f"worker:{spec.queue}:{i}")
+                self._tasks.append(t)
+        self._reaper_task = asyncio.create_task(self._reaper_loop(), name="reaper")
+        self._log.info(
+            "worker_manager.started",
+            extra={"specs": [spec.__dict__ for spec in self._specs], "total_tasks": len(self._tasks)},
+        )
+
+    async def stop(self) -> None:
+        """
+        Stop the workers and the reaper.
+        """
+        self._stop.set()
+        for t in self._tasks:
+            t.cancel()
+        await asyncio.gather(*self._tasks, return_exceptions=True)
+        self._tasks.clear()
+        if self._reaper_task:
+            self._reaper_task.cancel()
+            with contextlib.suppress(Exception):
+                await self._reaper_task
+            self._reaper_task = None
+        self._log.info("worker_manager.stopped")
+
+    async def _reaper_loop(self) -> None:
+        """
+        Background loop that returns lost jobs to the queue.
+        """
+        period = int(APP_CONFIG.dl.dl_reaper_period_sec)
+        sm = get_sessionmaker()
+        while not self._stop.is_set():
+            try:
+                async with sm() as s:
+                    repo = QueueRepository(s)
+                    ids = await repo.requeue_lost()
+                    if ids:
+                        APP_CTX.get_logger().info("reaper.requeued", extra={"count": len(ids)})
+            except Exception as e:
+                APP_CTX.get_logger().error("reaper.error", extra={"error": str(e)})
+            try:
+                await asyncio.wait_for(self._stop.wait(), timeout=period)
+            except asyncio.TimeoutError:
+                continue
+
+
+def build_manager_from_env() -> WorkerManager:
+    """
+    Build a WorkerManager from WORKERS_JSON.
+    """
+    specs: list[WorkerSpec] = []
+    for item in APP_CONFIG.dl.parsed_workers():
+        q = str(item.get("queue", "")).strip()
+        c = int(item.get("concurrency", 1))
+        if q:
+            specs.append(WorkerSpec(queue=q, concurrency=max(1, c)))
+    return WorkerManager(specs)
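
build_manager_from_env() implies WORKERS_JSON is a JSON array of objects with queue and concurrency keys (the actual parsing lives in APP_CONFIG.dl.parsed_workers(), which is not part of this diff). An equivalent manual construction, with illustrative queue names:

# Sketch: what build_manager_from_env() produces for
# WORKERS_JSON='[{"queue": "default", "concurrency": 2}, {"queue": "reports", "concurrency": 1}]'
from dataloader.workers.manager import WorkerManager, WorkerSpec

manager = WorkerManager([
    WorkerSpec(queue="default", concurrency=2),
    WorkerSpec(queue="reports", concurrency=1),
])
# In an async context: await manager.start() ... await manager.stop()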


@@ -0,0 +1,20 @@
+# src/dataloader/workers/pipelines/noop.py
+from __future__ import annotations
+
+import asyncio
+from typing import AsyncIterator
+
+from .registry import register
+
+
+@register("noop")
+async def noop(args: dict) -> AsyncIterator[None]:
+    """
+    Reference pipeline with no side effects; simulates three steps.
+    """
+    await asyncio.sleep(float(args.get("sleep1", 2)))
+    yield
+    await asyncio.sleep(float(args.get("sleep2", 2)))
+    yield
+    await asyncio.sleep(float(args.get("sleep3", 2)))
+    yield
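
Because registration happens at import time via the @register decorator, the pipeline can also be driven directly through the registry, bypassing the queue; a quick check with the sleeps zeroed out:

# Sketch: resolve the registered "noop" pipeline and iterate its steps.
import asyncio

import dataloader.workers.pipelines.noop  # noqa: F401  (import registers "noop")
from dataloader.workers.pipelines.registry import resolve


async def main() -> None:
    async for _ in resolve("noop")({"sleep1": 0, "sleep2": 0, "sleep3": 0}):
        print("step complete")  # printed three times, once per yield


asyncio.run(main())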


@@ -1,47 +1,33 @@
-"""Registry of handlers by task."""
-from typing import Dict, Callable, Any, Awaitable, Optional
-from abc import ABC, abstractmethod
-
-
-class Pipeline(ABC):
-    """Base class for a task-processing pipeline."""
-
-    @abstractmethod
-    async def execute(self, args: Dict[str, Any]) -> Dict[str, Any]:
-        """
-        Execute the task.
-        Returns a result (for example, progress).
-        """
-        pass
-
-
-class PipelineRegistry:
-    """Registry of task handlers keyed by task type."""
-
-    def __init__(self):
-        self._pipelines: Dict[str, type[Pipeline]] = {}
-
-    def register(self, task: str, pipeline_class: type[Pipeline]):
-        """Register a handler for a task type."""
-        self._pipelines[task] = pipeline_class
-
-    def get_pipeline(self, task: str) -> Optional[type[Pipeline]]:
-        """Get the handler class for a task type."""
-        return self._pipelines.get(task)
-
-    async def execute(
-        self,
-        task: str,
-        args: Dict[str, Any],
-    ) -> Dict[str, Any]:
-        """
-        Execute a task via its registered handler.
-        """
-        pipeline_class = self.get_pipeline(task)
-        if pipeline_class is None:
-            raise ValueError(f"Pipeline for task '{task}' not found")
-        pipeline = pipeline_class()
-        return await pipeline.execute(args)
+# src/dataloader/workers/pipelines/registry.py
+from __future__ import annotations
+
+from typing import Any, Callable, Dict, Iterable
+
+_Registry: Dict[str, Callable[[dict[str, Any]], Any]] = {}
+
+
+def register(task: str) -> Callable[[Callable[[dict[str, Any]], Any]], Callable[[dict[str, Any]], Any]]:
+    """
+    Register a pipeline handler under a task name.
+    """
+    def _wrap(fn: Callable[[dict[str, Any]], Any]) -> Callable[[dict[str, Any]], Any]:
+        _Registry[task] = fn
+        return fn
+    return _wrap
+
+
+def resolve(task: str) -> Callable[[dict[str, Any]], Any]:
+    """
+    Return the pipeline handler for a task name.
+    """
+    try:
+        return _Registry[task]
+    except KeyError:
+        raise KeyError(f"pipeline not found: {task}")
+
+
+def tasks() -> Iterable[str]:
+    """
+    Return the registered task names.
+    """
+    return _Registry.keys()
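
A minimal round-trip through the module-level registry (the "echo" handler is illustrative):

# Sketch: register a handler, enumerate tasks, resolve it back.
from dataloader.workers.pipelines import registry


@registry.register("echo")
async def echo(args: dict) -> None:
    print(args)


assert "echo" in set(registry.tasks())
assert registry.resolve("echo") is echo  # register() returns fn unchanged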