Add more info logs
This commit is contained in:
parent
7a1e35ec00
commit
c5306bb56e
|
@ -55,7 +55,7 @@ class RuWikiAdapter(BaseAdapter):
|
|||
|
||||
def _create_client(self) -> mwclient.Site:
|
||||
try:
|
||||
site = mwclient.Site("ru.wikipedia.org")
|
||||
site = mwclient.Site("ru.ruwiki.ru")
|
||||
site.api("query", meta="siteinfo")
|
||||
self.logger.info("Соединение с RuWiki установлено")
|
||||
return site
|
||||
|
@ -66,7 +66,7 @@ class RuWikiAdapter(BaseAdapter):
|
|||
@staticmethod
|
||||
def extract_title_from_url(url: str) -> str:
|
||||
parsed = urlparse(url)
|
||||
if "wikipedia.org" not in parsed.netloc:
|
||||
if "ruwiki.ru" not in parsed.netloc:
|
||||
raise ValueError(f"Не является URL википедии: {url}")
|
||||
|
||||
path_parts = parsed.path.split("/")
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
LLM_MAX_INPUT_TOKENS = 4096
|
||||
MAX_TOKEN_LIMIT_WITH_BUFFER = 3800
|
||||
ARTICLE_NAME_INDEX = 0
|
||||
LLM_MAX_INPUT_TOKENS = 120000
|
||||
MAX_TOKEN_LIMIT_WITH_BUFFER = 16000
|
||||
ARTICLE_NAME_INDEX = -1
|
||||
MIN_WIKI_PATH_PARTS = 2
|
||||
WIKI_PATH_INDEX = 1
|
||||
WRITE_QUEUE_BATCH_SIZE = 10
|
||||
|
|
|
@ -69,7 +69,7 @@ class AsyncRunner:
|
|||
) -> None:
|
||||
loaded_count = 0
|
||||
|
||||
async for command in source.read_urls(force_reprocess):
|
||||
async for command in source.read_urls(force_reprocess=force_reprocess):
|
||||
if max_articles and loaded_count >= max_articles:
|
||||
break
|
||||
|
||||
|
|
|
@ -79,6 +79,16 @@ class SimplifyService:
|
|||
raw_text=page_info.content,
|
||||
)
|
||||
|
||||
self.logger.info("Упрощение завершено",
|
||||
url=command.url,
|
||||
simplified_length=len(simplified_text),
|
||||
input_tokens=input_tokens,
|
||||
output_tokens=output_tokens)
|
||||
|
||||
if not simplified_text.strip():
|
||||
self.logger.error("Получен пустой simplified_text!", url=command.url)
|
||||
raise ValueError("Упрощение привело к пустому результату")
|
||||
|
||||
processing_time = time.time() - start_time
|
||||
result = ProcessingResult.success_result(
|
||||
url=command.url,
|
||||
|
@ -90,7 +100,9 @@ class SimplifyService:
|
|||
processing_time_seconds=processing_time,
|
||||
)
|
||||
|
||||
self.logger.info("Отправляем результат в write_queue...", url=command.url)
|
||||
await self.write_queue.update_from_result(result)
|
||||
self.logger.info("Результат успешно записан в write_queue", url=command.url)
|
||||
|
||||
self.logger.info(
|
||||
"Статья успешно обработана",
|
||||
|
@ -163,8 +175,8 @@ class SimplifyService:
|
|||
async def _simplify_article_text(self, title: str, raw_text: str) -> tuple[str, int, int]:
|
||||
prompt_template = await self.get_prompt_template()
|
||||
text_tokens = self.llm_adapter.count_tokens(raw_text)
|
||||
|
||||
if text_tokens <= self.config.chunk_size:
|
||||
max_input_size = LLM_MAX_INPUT_TOKENS - len(prompt_template) - 1000
|
||||
if text_tokens <= max_input_size:
|
||||
return await self.llm_adapter.simplify_text(title, raw_text, prompt_template)
|
||||
|
||||
return await self._process_long_text(title, raw_text, prompt_template)
|
||||
|
@ -230,7 +242,7 @@ class SimplifyService:
|
|||
"Объединённый текст превышает лимит, обрезаем",
|
||||
final_tokens=final_tokens,
|
||||
)
|
||||
combined_text = self._truncate_to_token_limit(combined_text, 1000)
|
||||
combined_text = self._truncate_to_token_limit(combined_text, MAX_TOKEN_LIMIT_WITH_BUFFER)
|
||||
total_output_tokens = self.llm_adapter.count_tokens(combined_text)
|
||||
|
||||
return combined_text, total_input_tokens, total_output_tokens
|
||||
|
|
|
@ -63,24 +63,51 @@ class FileSource:
|
|||
|
||||
def _is_valid_wikipedia_url(self, url: str) -> bool:
|
||||
try:
|
||||
self.logger.info("Начинаем проверку URL", raw_url=url)
|
||||
|
||||
parsed = urlparse(url)
|
||||
self.logger.info("Разобранный URL", scheme=parsed.scheme, netloc=parsed.netloc, path=parsed.path)
|
||||
|
||||
if parsed.scheme not in ("http", "https"):
|
||||
self.logger.info("Отклонено: неподдерживаемая схема", scheme=parsed.scheme, url=url)
|
||||
return False
|
||||
|
||||
if "wikipedia.org" not in parsed.netloc:
|
||||
if "ruwiki" not in parsed.netloc:
|
||||
self.logger.info("Отклонено: домен не содержит 'ruwiki'", netloc=parsed.netloc, url=url)
|
||||
return False
|
||||
|
||||
path_parts = parsed.path.split("/")
|
||||
if len(path_parts) < MIN_WIKI_PATH_PARTS or path_parts[WIKI_PATH_INDEX] != "wiki":
|
||||
self.logger.info("Части пути", path_parts=path_parts)
|
||||
|
||||
if len(path_parts) < MIN_WIKI_PATH_PARTS:
|
||||
self.logger.info(
|
||||
"Отклонено: слишком мало сегментов в пути", parts=path_parts, url=url
|
||||
)
|
||||
return False
|
||||
|
||||
if path_parts[WIKI_PATH_INDEX] != "wiki":
|
||||
self.logger.info(
|
||||
"Отклонено: неверный сегмент пути", expected="wiki", actual=path_parts[WIKI_PATH_INDEX], url=url
|
||||
)
|
||||
return False
|
||||
|
||||
article_name = path_parts[ARTICLE_NAME_INDEX]
|
||||
return bool(article_name and article_name not in ("Main_Page", "Заглавная_страница"))
|
||||
self.logger.info("Извлечено имя статьи", article_name=article_name, url=url)
|
||||
|
||||
except Exception:
|
||||
if not article_name or article_name in ("Main_Page", "Заглавная_страница"):
|
||||
self.logger.info(
|
||||
"Отклонено: некорректное имя статьи", article_name=article_name, url=url
|
||||
)
|
||||
return False
|
||||
|
||||
self.logger.info("URL прошёл все проверки", url=url)
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.logger.info("Ошибка при проверке URL", error=str(e), url=url)
|
||||
return False
|
||||
|
||||
|
||||
async def count_urls(self) -> int:
|
||||
count = 0
|
||||
async for _ in self.read_urls():
|
||||
|
|
Loading…
Reference in New Issue