feat: add manager
commit ad12343784
parent 18cbbe00d3
src/dataloader/api/__init__.py
```diff
@@ -1,3 +1,6 @@
+# src/dataloader/api/__init__.py
+from __future__ import annotations
+
 from collections.abc import AsyncGenerator
 import contextlib
 import typing as tp
@@ -8,31 +11,35 @@ from .metric_router import router as metric_router
 from .middleware import log_requests
 from .os_router import router as service_router
 from .v1 import router as v1_router
+from dataloader.context import APP_CTX
+from dataloader.workers.manager import build_manager_from_env, WorkerManager
+
+_manager: WorkerManager | None = None
 
 
 @contextlib.asynccontextmanager
 async def lifespan(app: tp.Any) -> AsyncGenerator[None, None]:
-    from dataloader.context import APP_CTX
+    """
+    Application lifecycle: context initialization and worker-manager start/stop.
+    """
+    global _manager
     await APP_CTX.on_startup()
-    yield
-    await APP_CTX.on_shutdown()
+    _manager = build_manager_from_env()
+    await _manager.start()
+    try:
+        yield
+    finally:
+        if _manager is not None:
+            await _manager.stop()
+            _manager = None
+        await APP_CTX.on_shutdown()
 
 
 app_main = FastAPI(title="Data Gateway", lifespan=lifespan)
 app_main.middleware("http")(log_requests)
-app_main.include_router(
-    service_router, tags=["Openshift dataloader routes"]
-)
-app_main.include_router(
-    metric_router, tags=["Like/dislike metric dataloader routes"]
-)
-app_main.include_router(
-    v1_router, prefix="/api/v1", tags=["dataloader"]
-)
+app_main.include_router(service_router, tags=["Openshift dataloader routes"])
+app_main.include_router(metric_router, tags=["Like/dislike metric dataloader routes"])
+app_main.include_router(v1_router, prefix="/api/v1", tags=["dataloader"])
 
-__all__ = [
-    "app_main",
-]
+__all__ = ["app_main"]
```
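The new lifespan brings the manager up only after `APP_CTX.on_startup()` and tears it down in a `finally:` block, so `on_shutdown()` still runs even if a pipeline or worker fails mid-flight. A minimal sketch of that ordering with a stub manager (toy `StubManager`; not the project's real `APP_CTX` or `WorkerManager`):

```python
import asyncio
import contextlib
from collections.abc import AsyncGenerator

class StubManager:
    async def start(self) -> None:
        print("manager started")
    async def stop(self) -> None:
        print("manager stopped")

@contextlib.asynccontextmanager
async def lifespan(app: object) -> AsyncGenerator[None, None]:
    manager = StubManager()
    await manager.start()
    try:
        yield                 # the application serves requests here
    finally:
        await manager.stop()  # guaranteed even if serving raises

async def main() -> None:
    async with lifespan(None):
        print("serving")

asyncio.run(main())  # manager started / serving / manager stopped
```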
src/dataloader/workers/base.py
```diff
@@ -1,71 +1,110 @@
-"""Generic PG worker: claim/lease/heartbeat/retry."""
+# src/dataloader/workers/base.py
+from __future__ import annotations
 
 import asyncio
-from typing import Optional
-from uuid import UUID
-from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy import text
+from contextlib import AsyncExitStack
+from dataclasses import dataclass
+from datetime import datetime, timedelta, timezone
+from typing import AsyncIterator, Callable, Optional
 
-from ..config import Settings, PGSettings
-from ..storage.db import Database
-from .pipelines.registry import PipelineRegistry
+from dataloader.context import APP_CTX
+from dataloader.storage.db import get_sessionmaker
+from dataloader.storage.repositories import QueueRepository
+from dataloader.workers.pipelines.registry import resolve as resolve_pipeline
 
 
-class BaseWorker:
-    """Base worker that processes jobs from the queue."""
+@dataclass(frozen=True)
+class WorkerConfig:
+    """
+    Worker configuration.
+    """
+    queue: str
+    heartbeat_sec: int
+    claim_backoff_sec: int
 
-    def __init__(
-        self,
-        queue: str,
-        settings: Settings,
-        shutdown_event: asyncio.Event,
-    ):
-        self.queue = queue
-        self.settings = settings
-        self.shutdown_event = shutdown_event
-        self.db: Optional[Database] = None
-        self.current_job_id: Optional[UUID] = None
-        self.current_lock_key: Optional[str] = None
 
-    async def run(self):
-        """Main worker loop."""
-        # TODO: initialize the DB, connect to LISTEN/NOTIFY
-        # TODO: loop claim -> heartbeat -> execute -> finish
-        raise NotImplementedError
+class PGWorker:
+    """
+    Base asynchronous Postgres queue worker.
+    """
+    def __init__(self, cfg: WorkerConfig, stop_event: asyncio.Event) -> None:
+        self._cfg = cfg
+        self._stop = stop_event
+        self._log = APP_CTX.get_logger()
+        self._sm = get_sessionmaker()
 
-    async def claim_job(self, session: AsyncSession) -> Optional[dict]:
-        """
-        Claims a single job from the queue.
-        Returns a dict with job_id, task, args, lock_key, or None.
-        """
-        # TODO: implement SELECT ... FOR UPDATE SKIP LOCKED
-        # TODO: check the advisory lock on lock_key
-        # TODO: back off if the advisory lock is busy
-        raise NotImplementedError
+    async def run(self) -> None:
+        """
+        Main loop: wait -> claim -> execute -> finish.
+        """
+        self._log.info(f"worker.start queue={self._cfg.queue}")
+        while not self._stop.is_set():
+            claimed = await self._claim_and_execute_once()
+            if not claimed:
+                await self._listen_or_sleep(self._cfg.claim_backoff_sec)
+        self._log.info(f"worker.stop queue={self._cfg.queue}")
 
-    async def heartbeat(self, session: AsyncSession, job_id: UUID):
-        """Updates the heartbeat for a job."""
-        # TODO: UPDATE heartbeat_at and lease_expires_at
-        raise NotImplementedError
+    async def _listen_or_sleep(self, timeout_sec: int) -> None:
+        """
+        Waits for new jobs to appear, with a timeout.
+        """
+        try:
+            await asyncio.wait_for(self._stop.wait(), timeout=timeout_sec)
+        except asyncio.TimeoutError:
+            return
 
-    async def finish_job(
-        self,
-        session: AsyncSession,
-        job_id: UUID,
-        success: bool,
-        error: Optional[str] = None,
-    ):
-        """Finishes a job (success, or failure with retry)."""
-        # TODO: UPDATE status depending on success and attempt
-        # TODO: release the advisory lock
-        raise NotImplementedError
+    async def _claim_and_execute_once(self) -> bool:
+        """
+        Performs a single attempt to claim a job and process it.
+        """
+        async with AsyncExitStack() as stack:
+            s = await stack.enter_async_context(self._sm())
+            repo = QueueRepository(s)
+            row = await repo.claim_one(self._cfg.queue)
+            if not row:
+                await s.commit()
+                return False
 
-    async def check_cancel_requested(
-        self,
-        session: AsyncSession,
-        job_id: UUID,
-    ) -> bool:
-        """Checks the cancel_requested flag."""
-        # TODO: SELECT cancel_requested
-        raise NotImplementedError
+            job_id = row["job_id"]
+            ttl = int(row["lease_ttl_sec"])
+            task = row["task"]
+            args = row["args"]
+
+            try:
+                await self._execute_with_heartbeat(job_id, ttl, self._pipeline(task, args))
+                await repo.finish_ok(job_id)
+                return True
+            except asyncio.CancelledError:
+                await repo.finish_fail_or_retry(job_id, "cancelled")
+                raise
+            except Exception as e:
+                await repo.finish_fail_or_retry(job_id, str(e))
+                return True
+
+    async def _execute_with_heartbeat(self, job_id: str, ttl: int, it: AsyncIterator[None]) -> None:
+        """
+        Runs the pipeline while maintaining the heartbeat.
+        """
+        next_hb = datetime.now(timezone.utc) + timedelta(seconds=self._cfg.heartbeat_sec)
+        async for _ in it:
+            if datetime.now(timezone.utc) >= next_hb:
+                async with self._sm() as s_hb:
+                    await QueueRepository(s_hb).heartbeat(job_id, ttl)
+                next_hb = datetime.now(timezone.utc) + timedelta(seconds=self._cfg.heartbeat_sec)
+            if self._stop.is_set():
+                raise asyncio.CancelledError()
+
+    async def _pipeline(self, task: str, args: dict) -> AsyncIterator[None]:
+        """
+        Invokes the registered pipeline by task name.
+        """
+        fn: Callable[[dict], object] = resolve_pipeline(task)
+        res = fn(args)
+        if hasattr(res, "__aiter__"):
+            async for _ in res:  # type: ignore[func-returns-value]
+                yield
+        elif asyncio.iscoroutine(res):
+            await res  # type: ignore[arg-type]
+            yield
+        else:
+            yield
```
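`PGWorker._pipeline` normalizes three handler shapes — async generator, plain coroutine, sync callable — into one async iterator, so each `yield` becomes a heartbeat window for `_execute_with_heartbeat`. A standalone sketch of that normalization (toy names, independent of the project modules):

```python
import asyncio
from typing import Any, AsyncIterator, Callable

async def as_steps(fn: Callable[[dict], Any], args: dict) -> AsyncIterator[None]:
    res = fn(args)
    if hasattr(res, "__aiter__"):     # async-generator pipeline: one step per yield
        async for _ in res:
            yield
    elif asyncio.iscoroutine(res):    # plain coroutine: a single step
        await res
        yield
    else:                             # synchronous callable: a single step
        yield

async def demo() -> None:
    async def gen(args: dict):
        yield
        yield
    async for _ in as_steps(gen, {}):
        print("step done -> heartbeat window")

asyncio.run(demo())  # prints twice: one heartbeat opportunity per yield
```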
src/dataloader/workers/manager.py
```diff
@@ -1,53 +1,106 @@
-"""Creates worker asyncio Tasks from the config."""
+# src/dataloader/workers/manager.py
+from __future__ import annotations
 
 import asyncio
-import json
-from typing import List, Dict, Any
-from ..config import Settings, PGSettings
-from .base import BaseWorker
+import contextlib
+from dataclasses import dataclass
+from typing import Any
+
+from dataloader.context import APP_CTX
+from dataloader.config import APP_CONFIG
+from dataloader.storage.db import get_sessionmaker
+from dataloader.storage.repositories import QueueRepository
+from dataloader.workers.base import PGWorker, WorkerConfig
+
+
+@dataclass(frozen=True)
+class WorkerSpec:
+    """
+    Configuration of the worker set for a queue.
+    """
+    queue: str
+    concurrency: int
 
 
 class WorkerManager:
-    """Worker manager: creates and supervises workers according to the config."""
+    """
+    Manages the lifecycle of asynchronous workers.
+    """
+    def __init__(self, specs: list[WorkerSpec]) -> None:
+        self._log = APP_CTX.get_logger()
+        self._specs = specs
+        self._stop = asyncio.Event()
+        self._tasks: list[asyncio.Task] = []
+        self._reaper_task: asyncio.Task | None = None
 
-    def __init__(self, settings: Settings):
-        self.settings = settings
-        self.workers: List[asyncio.Task] = []
-        self.worker_configs: List[Dict[str, Any]] = []
-        self._shutdown_event = asyncio.Event()
+    async def start(self) -> None:
+        """
+        Starts the workers and the background reaper task.
+        """
+        hb = int(APP_CONFIG.dl.dl_heartbeat_sec)
+        backoff = int(APP_CONFIG.dl.dl_claim_backoff_sec)
 
-    def _parse_workers_config(self) -> List[Dict[str, Any]]:
-        """Parses WORKERS_JSON from the config."""
-        if not self.settings.workers_json:
-            return []
-        try:
-            return json.loads(self.settings.workers_json)
-        except json.JSONDecodeError:
-            # TODO: log the error
-            return []
+        for spec in self._specs:
+            for i in range(max(1, spec.concurrency)):
+                cfg = WorkerConfig(queue=spec.queue, heartbeat_sec=hb, claim_backoff_sec=backoff)
+                t = asyncio.create_task(PGWorker(cfg, self._stop).run(), name=f"worker:{spec.queue}:{i}")
+                self._tasks.append(t)
 
-    async def start(self):
-        """Starts all workers."""
-        self.worker_configs = self._parse_workers_config()
-        for config in self.worker_configs:
-            queue = config.get("queue")
-            concurrency = config.get("concurrency", 1)
-
-            for _ in range(concurrency):
-                worker = BaseWorker(
-                    queue=queue,
-                    settings=self.settings,
-                    shutdown_event=self._shutdown_event,
-                )
-                task = asyncio.create_task(worker.run())
-                self.workers.append(task)
+        self._reaper_task = asyncio.create_task(self._reaper_loop(), name="reaper")
 
-    async def shutdown(self, timeout: float = 30.0):
-        """Graceful worker shutdown."""
-        self._shutdown_event.set()
-        # Wait for all workers to finish, with a timeout
-        await asyncio.wait_for(
-            asyncio.gather(*self.workers, return_exceptions=True),
-            timeout=timeout,
-        )
+        self._log.info(
+            "worker_manager.started",
+            extra={"specs": [spec.__dict__ for spec in self._specs], "total_tasks": len(self._tasks)},
+        )
+
+    async def stop(self) -> None:
+        """
+        Stops the workers and the reaper.
+        """
+        self._stop.set()
+
+        for t in self._tasks:
+            t.cancel()
+        await asyncio.gather(*self._tasks, return_exceptions=True)
+        self._tasks.clear()
+
+        if self._reaper_task:
+            self._reaper_task.cancel()
+            with contextlib.suppress(Exception):
+                await self._reaper_task
+            self._reaper_task = None
+
+        self._log.info("worker_manager.stopped")
+
+    async def _reaper_loop(self) -> None:
+        """
+        Background loop that returns lost jobs to the queue.
+        """
+        period = int(APP_CONFIG.dl.dl_reaper_period_sec)
+        sm = get_sessionmaker()
+        while not self._stop.is_set():
+            try:
+                async with sm() as s:
+                    repo = QueueRepository(s)
+                    ids = await repo.requeue_lost()
+                    if ids:
+                        APP_CTX.get_logger().info("reaper.requeued", extra={"count": len(ids)})
+            except Exception as e:
+                APP_CTX.get_logger().error("reaper.error", extra={"error": str(e)})
+            try:
+                await asyncio.wait_for(self._stop.wait(), timeout=period)
+            except asyncio.TimeoutError:
+                continue
+
+
+def build_manager_from_env() -> WorkerManager:
+    """
+    Builds a WorkerManager from WORKERS_JSON.
+    """
+    specs: list[WorkerSpec] = []
+    for item in APP_CONFIG.dl.parsed_workers():
+        q = str(item.get("queue", "")).strip()
+        c = int(item.get("concurrency", 1))
+        if q:
+            specs.append(WorkerSpec(queue=q, concurrency=max(1, c)))
+    return WorkerManager(specs)
```
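`build_manager_from_env` expects `WORKERS_JSON` to be a list of `{queue, concurrency}` objects, the same shape the old `_parse_workers_config` consumed with `json.loads`; blank queue names are dropped and concurrency is clamped to at least 1. A self-contained sketch of that parsing (hypothetical queue name; the real `APP_CONFIG.dl.parsed_workers()` plumbing is assumed, not shown in this diff):

```python
import json

workers_json = '[{"queue": "load.demo", "concurrency": 2}, {"queue": "", "concurrency": 0}]'

specs: list[tuple[str, int]] = []
for item in json.loads(workers_json):
    q = str(item.get("queue", "")).strip()
    c = int(item.get("concurrency", 1))
    if q:                             # blank queue names are skipped
        specs.append((q, max(1, c)))  # concurrency clamped to >= 1

print(specs)  # [('load.demo', 2)] -- the empty-queue entry is dropped
```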
src/dataloader/workers/pipelines/noop.py
```diff
@@ -0,0 +1,20 @@
+# src/dataloader/workers/pipelines/noop.py
+from __future__ import annotations
+
+import asyncio
+from typing import AsyncIterator
+
+from .registry import register
+
+
+@register("noop")
+async def noop(args: dict) -> AsyncIterator[None]:
+    """
+    Reference pipeline with no side effects; simulates 3 steps.
+    """
+    await asyncio.sleep(float(args.get("sleep1", 2)))
+    yield
+    await asyncio.sleep(float(args.get("sleep2", 2)))
+    yield
+    await asyncio.sleep(float(args.get("sleep3", 2)))
+    yield
```
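The `noop` pipeline can also be driven directly through the registry, which is a convenient way to see how one `yield` maps to one worker step. A usage sketch, assuming the `dataloader` package is importable as laid out in this diff:

```python
import asyncio

from dataloader.workers.pipelines import noop  # noqa: F401  (import registers "noop")
from dataloader.workers.pipelines.registry import resolve

async def main() -> None:
    fn = resolve("noop")
    step = 0
    # Short sleeps so the demo finishes quickly; defaults are 2s per step.
    async for _ in fn({"sleep1": 0.1, "sleep2": 0.1, "sleep3": 0.1}):
        step += 1
        print(f"completed step {step}")  # a worker would heartbeat around here

asyncio.run(main())
```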
src/dataloader/workers/pipelines/registry.py
```diff
@@ -1,47 +1,33 @@
-"""Registry of handlers keyed by task."""
+# src/dataloader/workers/pipelines/registry.py
+from __future__ import annotations
 
-from typing import Dict, Callable, Any, Awaitable, Optional
-from abc import ABC, abstractmethod
+from typing import Any, Callable, Dict, Iterable
+
+_Registry: Dict[str, Callable[[dict[str, Any]], Any]] = {}
 
 
-class Pipeline(ABC):
-    """Base class for a task-processing pipeline."""
-
-    @abstractmethod
-    async def execute(self, args: Dict[str, Any]) -> Dict[str, Any]:
-        """
-        Executes the task.
-        Returns a result (e.g., progress).
-        """
-        pass
+def register(task: str) -> Callable[[Callable[[dict[str, Any]], Any]], Callable[[dict[str, Any]], Any]]:
+    """
+    Registers a pipeline handler under a task name.
+    """
+    def _wrap(fn: Callable[[dict[str, Any]], Any]) -> Callable[[dict[str, Any]], Any]:
+        _Registry[task] = fn
+        return fn
+    return _wrap
 
 
-class PipelineRegistry:
-    """Registry of task handlers keyed by task type."""
-
-    def __init__(self):
-        self._pipelines: Dict[str, type[Pipeline]] = {}
-
-    def register(self, task: str, pipeline_class: type[Pipeline]):
-        """Registers a handler for a task type."""
-        self._pipelines[task] = pipeline_class
-
-    def get_pipeline(self, task: str) -> Optional[type[Pipeline]]:
-        """Returns the handler class for a task type."""
-        return self._pipelines.get(task)
-
-    async def execute(
-        self,
-        task: str,
-        args: Dict[str, Any],
-    ) -> Dict[str, Any]:
-        """
-        Executes a task via its registered handler.
-        """
-        pipeline_class = self.get_pipeline(task)
-        if pipeline_class is None:
-            raise ValueError(f"Pipeline for task '{task}' not found")
-
-        pipeline = pipeline_class()
-        return await pipeline.execute(args)
+def resolve(task: str) -> Callable[[dict[str, Any]], Any]:
+    """
+    Returns the pipeline handler for a task name.
+    """
+    try:
+        return _Registry[task]
+    except KeyError:
+        raise KeyError(f"pipeline not found: {task}")
+
+
+def tasks() -> Iterable[str]:
+    """
+    Returns the registered task names.
+    """
+    return _Registry.keys()
```
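With the class-based `PipelineRegistry` gone, adding a pipeline is just decorating a callable; plain sync callables also work, because `PGWorker._pipeline` treats a non-awaitable result as a single step. A usage sketch with a hypothetical `demo.sync` task, assuming the same import path as in this diff:

```python
from dataloader.workers.pipelines.registry import register, resolve, tasks

@register("demo.sync")
def demo(args: dict) -> None:
    # Synchronous handler: the worker runs it as a single one-yield step.
    print(f"processing {args}")

print(sorted(tasks()))          # includes "demo.sync" (and "noop" once imported)
resolve("demo.sync")({"x": 1})  # direct invocation, outside any worker
```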