Add more info logs
This commit is contained in:
parent
7a1e35ec00
commit
c5306bb56e
|
@ -55,7 +55,7 @@ class RuWikiAdapter(BaseAdapter):
|
||||||
|
|
||||||
def _create_client(self) -> mwclient.Site:
|
def _create_client(self) -> mwclient.Site:
|
||||||
try:
|
try:
|
||||||
site = mwclient.Site("ru.wikipedia.org")
|
site = mwclient.Site("ru.ruwiki.ru")
|
||||||
site.api("query", meta="siteinfo")
|
site.api("query", meta="siteinfo")
|
||||||
self.logger.info("Соединение с RuWiki установлено")
|
self.logger.info("Соединение с RuWiki установлено")
|
||||||
return site
|
return site
|
||||||
|
@ -66,7 +66,7 @@ class RuWikiAdapter(BaseAdapter):
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def extract_title_from_url(url: str) -> str:
|
def extract_title_from_url(url: str) -> str:
|
||||||
parsed = urlparse(url)
|
parsed = urlparse(url)
|
||||||
if "wikipedia.org" not in parsed.netloc:
|
if "ruwiki.ru" not in parsed.netloc:
|
||||||
raise ValueError(f"Не является URL википедии: {url}")
|
raise ValueError(f"Не является URL википедии: {url}")
|
||||||
|
|
||||||
path_parts = parsed.path.split("/")
|
path_parts = parsed.path.split("/")
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
LLM_MAX_INPUT_TOKENS = 4096
|
LLM_MAX_INPUT_TOKENS = 120000
|
||||||
MAX_TOKEN_LIMIT_WITH_BUFFER = 3800
|
MAX_TOKEN_LIMIT_WITH_BUFFER = 16000
|
||||||
ARTICLE_NAME_INDEX = 0
|
ARTICLE_NAME_INDEX = -1
|
||||||
MIN_WIKI_PATH_PARTS = 2
|
MIN_WIKI_PATH_PARTS = 2
|
||||||
WIKI_PATH_INDEX = 1
|
WIKI_PATH_INDEX = 1
|
||||||
WRITE_QUEUE_BATCH_SIZE = 10
|
WRITE_QUEUE_BATCH_SIZE = 10
|
||||||
|
|
|
@ -69,7 +69,7 @@ class AsyncRunner:
|
||||||
) -> None:
|
) -> None:
|
||||||
loaded_count = 0
|
loaded_count = 0
|
||||||
|
|
||||||
async for command in source.read_urls(force_reprocess):
|
async for command in source.read_urls(force_reprocess=force_reprocess):
|
||||||
if max_articles and loaded_count >= max_articles:
|
if max_articles and loaded_count >= max_articles:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
|
@ -79,6 +79,16 @@ class SimplifyService:
|
||||||
raw_text=page_info.content,
|
raw_text=page_info.content,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self.logger.info("Упрощение завершено",
|
||||||
|
url=command.url,
|
||||||
|
simplified_length=len(simplified_text),
|
||||||
|
input_tokens=input_tokens,
|
||||||
|
output_tokens=output_tokens)
|
||||||
|
|
||||||
|
if not simplified_text.strip():
|
||||||
|
self.logger.error("Получен пустой simplified_text!", url=command.url)
|
||||||
|
raise ValueError("Упрощение привело к пустому результату")
|
||||||
|
|
||||||
processing_time = time.time() - start_time
|
processing_time = time.time() - start_time
|
||||||
result = ProcessingResult.success_result(
|
result = ProcessingResult.success_result(
|
||||||
url=command.url,
|
url=command.url,
|
||||||
|
@ -90,7 +100,9 @@ class SimplifyService:
|
||||||
processing_time_seconds=processing_time,
|
processing_time_seconds=processing_time,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self.logger.info("Отправляем результат в write_queue...", url=command.url)
|
||||||
await self.write_queue.update_from_result(result)
|
await self.write_queue.update_from_result(result)
|
||||||
|
self.logger.info("Результат успешно записан в write_queue", url=command.url)
|
||||||
|
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
"Статья успешно обработана",
|
"Статья успешно обработана",
|
||||||
|
@ -163,8 +175,8 @@ class SimplifyService:
|
||||||
async def _simplify_article_text(self, title: str, raw_text: str) -> tuple[str, int, int]:
|
async def _simplify_article_text(self, title: str, raw_text: str) -> tuple[str, int, int]:
|
||||||
prompt_template = await self.get_prompt_template()
|
prompt_template = await self.get_prompt_template()
|
||||||
text_tokens = self.llm_adapter.count_tokens(raw_text)
|
text_tokens = self.llm_adapter.count_tokens(raw_text)
|
||||||
|
max_input_size = LLM_MAX_INPUT_TOKENS - len(prompt_template) - 1000
|
||||||
if text_tokens <= self.config.chunk_size:
|
if text_tokens <= max_input_size:
|
||||||
return await self.llm_adapter.simplify_text(title, raw_text, prompt_template)
|
return await self.llm_adapter.simplify_text(title, raw_text, prompt_template)
|
||||||
|
|
||||||
return await self._process_long_text(title, raw_text, prompt_template)
|
return await self._process_long_text(title, raw_text, prompt_template)
|
||||||
|
@ -230,7 +242,7 @@ class SimplifyService:
|
||||||
"Объединённый текст превышает лимит, обрезаем",
|
"Объединённый текст превышает лимит, обрезаем",
|
||||||
final_tokens=final_tokens,
|
final_tokens=final_tokens,
|
||||||
)
|
)
|
||||||
combined_text = self._truncate_to_token_limit(combined_text, 1000)
|
combined_text = self._truncate_to_token_limit(combined_text, MAX_TOKEN_LIMIT_WITH_BUFFER)
|
||||||
total_output_tokens = self.llm_adapter.count_tokens(combined_text)
|
total_output_tokens = self.llm_adapter.count_tokens(combined_text)
|
||||||
|
|
||||||
return combined_text, total_input_tokens, total_output_tokens
|
return combined_text, total_input_tokens, total_output_tokens
|
||||||
|
|
|
@ -63,24 +63,51 @@ class FileSource:
|
||||||
|
|
||||||
def _is_valid_wikipedia_url(self, url: str) -> bool:
|
def _is_valid_wikipedia_url(self, url: str) -> bool:
|
||||||
try:
|
try:
|
||||||
|
self.logger.info("Начинаем проверку URL", raw_url=url)
|
||||||
|
|
||||||
parsed = urlparse(url)
|
parsed = urlparse(url)
|
||||||
|
self.logger.info("Разобранный URL", scheme=parsed.scheme, netloc=parsed.netloc, path=parsed.path)
|
||||||
|
|
||||||
if parsed.scheme not in ("http", "https"):
|
if parsed.scheme not in ("http", "https"):
|
||||||
|
self.logger.info("Отклонено: неподдерживаемая схема", scheme=parsed.scheme, url=url)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if "wikipedia.org" not in parsed.netloc:
|
if "ruwiki" not in parsed.netloc:
|
||||||
|
self.logger.info("Отклонено: домен не содержит 'ruwiki'", netloc=parsed.netloc, url=url)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
path_parts = parsed.path.split("/")
|
path_parts = parsed.path.split("/")
|
||||||
if len(path_parts) < MIN_WIKI_PATH_PARTS or path_parts[WIKI_PATH_INDEX] != "wiki":
|
self.logger.info("Части пути", path_parts=path_parts)
|
||||||
|
|
||||||
|
if len(path_parts) < MIN_WIKI_PATH_PARTS:
|
||||||
|
self.logger.info(
|
||||||
|
"Отклонено: слишком мало сегментов в пути", parts=path_parts, url=url
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
|
||||||
|
if path_parts[WIKI_PATH_INDEX] != "wiki":
|
||||||
|
self.logger.info(
|
||||||
|
"Отклонено: неверный сегмент пути", expected="wiki", actual=path_parts[WIKI_PATH_INDEX], url=url
|
||||||
|
)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
article_name = path_parts[ARTICLE_NAME_INDEX]
|
article_name = path_parts[ARTICLE_NAME_INDEX]
|
||||||
return bool(article_name and article_name not in ("Main_Page", "Заглавная_страница"))
|
self.logger.info("Извлечено имя статьи", article_name=article_name, url=url)
|
||||||
|
|
||||||
except Exception:
|
if not article_name or article_name in ("Main_Page", "Заглавная_страница"):
|
||||||
|
self.logger.info(
|
||||||
|
"Отклонено: некорректное имя статьи", article_name=article_name, url=url
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
|
||||||
|
self.logger.info("URL прошёл все проверки", url=url)
|
||||||
|
return True
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.info("Ошибка при проверке URL", error=str(e), url=url)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
async def count_urls(self) -> int:
|
async def count_urls(self) -> int:
|
||||||
count = 0
|
count = 0
|
||||||
async for _ in self.read_urls():
|
async for _ in self.read_urls():
|
||||||
|
|
Loading…
Reference in New Issue