From c5306bb56e3e6aaada9b9302159e19e4d0df11ab Mon Sep 17 00:00:00 2001 From: itqop Date: Sat, 12 Jul 2025 10:36:16 +0300 Subject: [PATCH] Add more info logs --- src/adapters/ruwiki.py | 4 ++-- src/models/constants.py | 8 ++++---- src/runner.py | 2 +- src/services/simplify_service.py | 18 +++++++++++++--- src/sources.py | 35 ++++++++++++++++++++++++++++---- 5 files changed, 53 insertions(+), 14 deletions(-) diff --git a/src/adapters/ruwiki.py b/src/adapters/ruwiki.py index 8bc71d4..143edbb 100644 --- a/src/adapters/ruwiki.py +++ b/src/adapters/ruwiki.py @@ -55,7 +55,7 @@ class RuWikiAdapter(BaseAdapter): def _create_client(self) -> mwclient.Site: try: - site = mwclient.Site("ru.wikipedia.org") + site = mwclient.Site("ru.ruwiki.ru") site.api("query", meta="siteinfo") self.logger.info("Соединение с RuWiki установлено") return site @@ -66,7 +66,7 @@ class RuWikiAdapter(BaseAdapter): @staticmethod def extract_title_from_url(url: str) -> str: parsed = urlparse(url) - if "wikipedia.org" not in parsed.netloc: + if "ruwiki.ru" not in parsed.netloc: raise ValueError(f"Не является URL википедии: {url}") path_parts = parsed.path.split("/") diff --git a/src/models/constants.py b/src/models/constants.py index 2022d74..a744481 100644 --- a/src/models/constants.py +++ b/src/models/constants.py @@ -1,6 +1,6 @@ -LLM_MAX_INPUT_TOKENS = 4096 -MAX_TOKEN_LIMIT_WITH_BUFFER = 3800 -ARTICLE_NAME_INDEX = 0 +LLM_MAX_INPUT_TOKENS = 120000 +MAX_TOKEN_LIMIT_WITH_BUFFER = 16000 +ARTICLE_NAME_INDEX = -1 MIN_WIKI_PATH_PARTS = 2 -WIKI_PATH_INDEX = 1 +WIKI_PATH_INDEX = 1 WRITE_QUEUE_BATCH_SIZE = 10 diff --git a/src/runner.py b/src/runner.py index e4a5e38..0e59795 100644 --- a/src/runner.py +++ b/src/runner.py @@ -69,7 +69,7 @@ class AsyncRunner: ) -> None: loaded_count = 0 - async for command in source.read_urls(force_reprocess): + async for command in source.read_urls(force_reprocess=force_reprocess): if max_articles and loaded_count >= max_articles: break diff --git a/src/services/simplify_service.py b/src/services/simplify_service.py index 4c64b8d..922a4d3 100644 --- a/src/services/simplify_service.py +++ b/src/services/simplify_service.py @@ -79,6 +79,16 @@ class SimplifyService: raw_text=page_info.content, ) + self.logger.info("Упрощение завершено", + url=command.url, + simplified_length=len(simplified_text), + input_tokens=input_tokens, + output_tokens=output_tokens) + + if not simplified_text.strip(): + self.logger.error("Получен пустой simplified_text!", url=command.url) + raise ValueError("Упрощение привело к пустому результату") + processing_time = time.time() - start_time result = ProcessingResult.success_result( url=command.url, @@ -90,7 +100,9 @@ class SimplifyService: processing_time_seconds=processing_time, ) + self.logger.info("Отправляем результат в write_queue...", url=command.url) await self.write_queue.update_from_result(result) + self.logger.info("Результат успешно записан в write_queue", url=command.url) self.logger.info( "Статья успешно обработана", @@ -163,8 +175,8 @@ class SimplifyService: async def _simplify_article_text(self, title: str, raw_text: str) -> tuple[str, int, int]: prompt_template = await self.get_prompt_template() text_tokens = self.llm_adapter.count_tokens(raw_text) - - if text_tokens <= self.config.chunk_size: + max_input_size = LLM_MAX_INPUT_TOKENS - len(prompt_template) - 1000 + if text_tokens <= max_input_size: return await self.llm_adapter.simplify_text(title, raw_text, prompt_template) return await self._process_long_text(title, raw_text, prompt_template) @@ -230,7 +242,7 @@ class SimplifyService: "Объединённый текст превышает лимит, обрезаем", final_tokens=final_tokens, ) - combined_text = self._truncate_to_token_limit(combined_text, 1000) + combined_text = self._truncate_to_token_limit(combined_text, MAX_TOKEN_LIMIT_WITH_BUFFER) total_output_tokens = self.llm_adapter.count_tokens(combined_text) return combined_text, total_input_tokens, total_output_tokens diff --git a/src/sources.py b/src/sources.py index 9e15bd2..e2d1f1f 100644 --- a/src/sources.py +++ b/src/sources.py @@ -63,24 +63,51 @@ class FileSource: def _is_valid_wikipedia_url(self, url: str) -> bool: try: + self.logger.info("Начинаем проверку URL", raw_url=url) + parsed = urlparse(url) + self.logger.info("Разобранный URL", scheme=parsed.scheme, netloc=parsed.netloc, path=parsed.path) if parsed.scheme not in ("http", "https"): + self.logger.info("Отклонено: неподдерживаемая схема", scheme=parsed.scheme, url=url) return False - if "wikipedia.org" not in parsed.netloc: + if "ruwiki" not in parsed.netloc: + self.logger.info("Отклонено: домен не содержит 'ruwiki'", netloc=parsed.netloc, url=url) return False path_parts = parsed.path.split("/") - if len(path_parts) < MIN_WIKI_PATH_PARTS or path_parts[WIKI_PATH_INDEX] != "wiki": + self.logger.info("Части пути", path_parts=path_parts) + + if len(path_parts) < MIN_WIKI_PATH_PARTS: + self.logger.info( + "Отклонено: слишком мало сегментов в пути", parts=path_parts, url=url + ) + return False + + if path_parts[WIKI_PATH_INDEX] != "wiki": + self.logger.info( + "Отклонено: неверный сегмент пути", expected="wiki", actual=path_parts[WIKI_PATH_INDEX], url=url + ) return False article_name = path_parts[ARTICLE_NAME_INDEX] - return bool(article_name and article_name not in ("Main_Page", "Заглавная_страница")) + self.logger.info("Извлечено имя статьи", article_name=article_name, url=url) - except Exception: + if not article_name or article_name in ("Main_Page", "Заглавная_страница"): + self.logger.info( + "Отклонено: некорректное имя статьи", article_name=article_name, url=url + ) + return False + + self.logger.info("URL прошёл все проверки", url=url) + return True + + except Exception as e: + self.logger.info("Ошибка при проверке URL", error=str(e), url=url) return False + async def count_urls(self) -> int: count = 0 async for _ in self.read_urls():