Add more info logs

This commit is contained in:
itqop 2025-07-12 10:36:16 +03:00
parent 7a1e35ec00
commit c5306bb56e
5 changed files with 53 additions and 14 deletions

View File

@@ -55,7 +55,7 @@ class RuWikiAdapter(BaseAdapter):
def _create_client(self) -> mwclient.Site:
try:
site = mwclient.Site("ru.wikipedia.org")
site = mwclient.Site("ru.ruwiki.ru")
site.api("query", meta="siteinfo")
self.logger.info("Соединение с RuWiki установлено")
return site
@@ -66,7 +66,7 @@ class RuWikiAdapter(BaseAdapter):
@staticmethod
def extract_title_from_url(url: str) -> str:
parsed = urlparse(url)
if "wikipedia.org" not in parsed.netloc:
if "ruwiki.ru" not in parsed.netloc:
raise ValueError(f"Не является URL википедии: {url}")
path_parts = parsed.path.split("/")

View File

@@ -1,6 +1,6 @@
LLM_MAX_INPUT_TOKENS = 4096
MAX_TOKEN_LIMIT_WITH_BUFFER = 3800
ARTICLE_NAME_INDEX = 0
LLM_MAX_INPUT_TOKENS = 120000
MAX_TOKEN_LIMIT_WITH_BUFFER = 16000
ARTICLE_NAME_INDEX = -1
MIN_WIKI_PATH_PARTS = 2
WIKI_PATH_INDEX = 1
WRITE_QUEUE_BATCH_SIZE = 10

View File

@@ -69,7 +69,7 @@ class AsyncRunner:
) -> None:
loaded_count = 0
async for command in source.read_urls(force_reprocess):
async for command in source.read_urls(force_reprocess=force_reprocess):
if max_articles and loaded_count >= max_articles:
break

View File

@@ -79,6 +79,16 @@ class SimplifyService:
raw_text=page_info.content,
)
self.logger.info("Упрощение завершено",
url=command.url,
simplified_length=len(simplified_text),
input_tokens=input_tokens,
output_tokens=output_tokens)
if not simplified_text.strip():
self.logger.error("Получен пустой simplified_text!", url=command.url)
raise ValueError("Упрощение привело к пустому результату")
processing_time = time.time() - start_time
result = ProcessingResult.success_result(
url=command.url,
@@ -90,7 +100,9 @@ class SimplifyService:
processing_time_seconds=processing_time,
)
self.logger.info("Отправляем результат в write_queue...", url=command.url)
await self.write_queue.update_from_result(result)
self.logger.info("Результат успешно записан в write_queue", url=command.url)
self.logger.info(
"Статья успешно обработана",
@@ -163,8 +175,8 @@ class SimplifyService:
async def _simplify_article_text(self, title: str, raw_text: str) -> tuple[str, int, int]:
prompt_template = await self.get_prompt_template()
text_tokens = self.llm_adapter.count_tokens(raw_text)
if text_tokens <= self.config.chunk_size:
max_input_size = LLM_MAX_INPUT_TOKENS - len(prompt_template) - 1000
if text_tokens <= max_input_size:
return await self.llm_adapter.simplify_text(title, raw_text, prompt_template)
return await self._process_long_text(title, raw_text, prompt_template)
@@ -230,7 +242,7 @@ class SimplifyService:
"Объединённый текст превышает лимит, обрезаем",
final_tokens=final_tokens,
)
combined_text = self._truncate_to_token_limit(combined_text, 1000)
combined_text = self._truncate_to_token_limit(combined_text, MAX_TOKEN_LIMIT_WITH_BUFFER)
total_output_tokens = self.llm_adapter.count_tokens(combined_text)
return combined_text, total_input_tokens, total_output_tokens

View File

@@ -63,24 +63,51 @@ class FileSource:
def _is_valid_wikipedia_url(self, url: str) -> bool:
try:
self.logger.info("Начинаем проверку URL", raw_url=url)
parsed = urlparse(url)
self.logger.info("Разобранный URL", scheme=parsed.scheme, netloc=parsed.netloc, path=parsed.path)
if parsed.scheme not in ("http", "https"):
self.logger.info("Отклонено: неподдерживаемая схема", scheme=parsed.scheme, url=url)
return False
if "wikipedia.org" not in parsed.netloc:
if "ruwiki" not in parsed.netloc:
self.logger.info("Отклонено: домен не содержит 'ruwiki'", netloc=parsed.netloc, url=url)
return False
path_parts = parsed.path.split("/")
if len(path_parts) < MIN_WIKI_PATH_PARTS or path_parts[WIKI_PATH_INDEX] != "wiki":
self.logger.info("Части пути", path_parts=path_parts)
if len(path_parts) < MIN_WIKI_PATH_PARTS:
self.logger.info(
"Отклонено: слишком мало сегментов в пути", parts=path_parts, url=url
)
return False
if path_parts[WIKI_PATH_INDEX] != "wiki":
self.logger.info(
"Отклонено: неверный сегмент пути", expected="wiki", actual=path_parts[WIKI_PATH_INDEX], url=url
)
return False
article_name = path_parts[ARTICLE_NAME_INDEX]
return bool(article_name and article_name not in ("Main_Page", "Заглавная_страница"))
self.logger.info("Извлечено имя статьи", article_name=article_name, url=url)
except Exception:
if not article_name or article_name in ("Main_Page", "Заглавная_страница"):
self.logger.info(
"Отклонено: некорректное имя статьи", article_name=article_name, url=url
)
return False
self.logger.info("URL прошёл все проверки", url=url)
return True
except Exception as e:
self.logger.info("Ошибка при проверке URL", error=str(e), url=url)
return False
async def count_urls(self) -> int:
count = 0
async for _ in self.read_urls():