"""Load Markdown case studies into ``Document`` objects with heuristic metadata."""

import datetime
import logging
import os
import re
from dataclasses import dataclass
from typing import Any, Dict, List

try:
    import markdown
except ImportError:
    # Only MarkdownLoader() construction needs the package; Document and
    # create_platform_overview() stay usable without it.
    markdown = None

logger = logging.getLogger(__name__)


@dataclass
class Document:
    """A loaded document: cleaned text plus descriptive metadata."""

    content: str
    metadata: Dict[str, Any]
    doc_id: str
    source: str


# Keyword -> industry label. Keywords are matched case-insensitively against
# the file name and the content; most of them are word stems, so they are
# matched as substrings (see _WHOLE_WORD_KEYWORDS for the exceptions).
_INDUSTRY_KEYWORDS = {
    "маркетинг": "marketing_agency",
    "агентство": "marketing_agency",
    "реклам": "marketing_agency",
    "блогер": "marketing_agency",
    "mediar": "marketing_agency",
    "büro": "marketing_agency",
    "логист": "logistics",
    "достав": "logistics",
    "склад": "logistics",
    "грузчик": "logistics",
    "разраб": "software",
    "програм": "software",
    "progkids": "software",
    "it": "software",
    "диджитал": "software",
    "строит": "construction",
    "недвиж": "construction",
    "этажи": "construction",
    "рознич": "retail",
    "торгов": "retail",
    "консалт": "consulting",
    "экобренд": "manufacturing",
    "wonder": "manufacturing",
    "производ": "manufacturing",
    "колл-центр": "call_center",
    "звонки": "call_center",
}

# Keyword -> role label, matched against the content only.
_ROLE_KEYWORDS = {
    "технический директор": "tech",
    "техн": "tech",
    "cto": "tech",
    "операционный директор": "ops",
    "директор": "ceo",
    "руководи": "ceo",
    "основатель": "ceo",
    "фин": "finance",
    "бухгалт": "finance",
    "cfo": "finance",
    "операц": "ops",
    "coo": "ops",
    "hr": "hr",
    "кадр": "hr",
    "маркет": "marketing",
    "продаж": "sales",
    "менеджер": "other",
}

# Short Latin acronyms must match as whole words: as plain substrings they
# fire on unrelated text ("it" in "website", "coo" in "cookie", "cto" in
# "director", "hr" in "chrome").
_WHOLE_WORD_KEYWORDS = frozenset({"it", "cto", "cfo", "coo", "hr"})

# (regex, metric key). Keys starting with "improvement_" have two capture
# groups and produce "<key>_before" / "<key>_after" entries.
_TIME_PATTERNS = [
    (r"(\d+)\s*минут[ауы]?", "processing_minutes"),
    (r"(\d+)\s*час[ауов]?", "processing_hours"),
    (r"(\d+)\s*дн[ейяах]", "processing_days"),
    (
        # "дн(?:ей|я)" rather than "дн[ейя]": a one-character class followed
        # by \s+ can never match the genitive plural "дней".
        r"с\s+(\d+)\s*дн(?:ей|я)\s+до\s+~?\s*(\d+)\s*минут",
        "improvement_days_to_minutes",
    ),
    (
        # Same reasoning for "часов" vs. the one-character class "[ауов]".
        r"с\s+(\d+)\s*час(?:ов|а|у)?\s+до\s+~?\s*(\d+)\s*минут",
        "improvement_hours_to_minutes",
    ),
    (r"(\d+)\s*секунд", "processing_seconds"),
]

# (regex, metric key) with a single capture group. When two patterns share a
# key, the later pattern's value overwrites the earlier one.
_SINGLE_VALUE_PATTERNS = [
    (r"(\d+)%\s*снижени", "error_reduction_pct"),
    (r"снижение[^0-9]*(\d+)%", "error_reduction_pct"),
    (r"(\d+)%\s*документ", "document_collection_pct"),
    (r"(\d+)%\s*точност", "accuracy_pct"),
    (r"увеличи[лв]\w*\s+в\s+(\d+)\s*раз", "growth_multiplier"),
    (r"(\d+)\s*блогер", "bloggers_count"),
    (r"(\d+)\s*исполнител", "contractors_count"),
    (r"(\d+)\s*сотрудник", "employees_count"),
    # Explicit alternation so that both "более" and "больше" match.
    (r"бол(?:ее|ьше)\s+(\d+)", "more_than_count"),
    (r"свыше\s+(\d+)", "over_count"),
]

_COMMENT_COUNTER_RE = re.compile(
    r"\d+\s+\d+\s+\[Комментировать\]\(\)\s+\d{2}\.\d{2}\.\d{2}"
)

_NOISE_PATTERNS = [
    re.compile(source, re.MULTILINE | re.IGNORECASE)
    for source in (
        r"Автор и редактор журнала Консоль",
        r"Автор\s+\[.*?\]\(\)",
        r"Поделиться",
        r"Ваше мнение\?",
        r"Отлично\s+Хорошо\s+Нормально\s+Плохо\s+Ужасно",
        r"Сайт использует файлы cookie.*?Принять",
        r"\[Политика конфиденциальности\]\(\)",
        r"\[Пользовательское соглашение\]\(\)",
        r"hello@konsol\.pro",
        r"\+7 \(\d{3}\) \d{3}-\d{2}-\d{2}",
        r"125047.*?дом \d+",
        r"\[Разработка - SKDO\]\(\)",
        r"\[Подключиться к Консоли\]\(\)",
        r"\[Кейсы наших клиентов\]\(\)",
        r"\[Делимся экспертизой\]\(\)",
        r"^\s*\d+\s*$",
        r"^\s*\[\d+\]\(\)\s*\d{2}\.\d{2}\.\d{2}\s*$",
    )
]

# A "### [title]()" heading and everything up to the next "###" or the end.
_RELATED_ARTICLES_RE = re.compile(r"###\s+\[.*?\]\(\).*?(?=###|\Z)", re.DOTALL)


class MarkdownLoader:
    """Read ``.md`` files and turn them into ``Document`` objects."""

    def __init__(self):
        """Create the Markdown parser.

        Raises:
            ImportError: if the ``markdown`` package is not installed.
        """
        if markdown is None:
            raise ImportError(
                "The 'markdown' package is required to create a MarkdownLoader"
            )
        self.md = markdown.Markdown(extensions=["meta", "toc"])

    def load_file(self, file_path: str) -> Document:
        """Load one Markdown file.

        Args:
            file_path: Path of the file; it is read as UTF-8.

        Returns:
            A ``Document`` whose content is the cleaned text and whose
            metadata combines the heuristics of ``_extract_metadata`` with
            ``title``, ``doc_type``, ``source`` and ``file_size``.
        """
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        metadata = self._extract_metadata(content, file_path)
        metadata.update(
            {
                "title": self._extract_title(content),
                # Any "case" in the path (directory or file name) counts.
                "doc_type": "case" if "case" in file_path.lower() else "info",
                "source": file_path,
                # Length of the decoded text in characters.
                "file_size": len(content),
            }
        )

        return Document(
            content=self._clean_content(content),
            metadata=metadata,
            doc_id=self._generate_doc_id(file_path),
            source=file_path,
        )

    def load_directory(self, dir_path: str) -> List[Document]:
        """Load every ``*.md`` file found under ``dir_path`` (recursively).

        Files that fail to load are logged and skipped, so one bad file does
        not abort the whole import. Directories and files are visited in
        sorted order to make the result order reproducible.
        """
        documents = []
        for root, dirs, files in os.walk(dir_path):
            dirs.sort()  # in-place sort steers os.walk's traversal order
            for file_name in sorted(files):
                if not file_name.endswith(".md"):
                    continue
                file_path = os.path.join(root, file_name)
                try:
                    documents.append(self.load_file(file_path))
                except Exception as e:  # best-effort boundary: skip the file
                    logger.warning("Error loading %s: %s", file_path, e)
        return documents

    def _extract_title(self, content: str) -> str:
        """Return the first non-link ``# `` heading, or ``"Untitled"``."""
        for line in content.strip().split("\n"):
            line = line.strip()
            if not line.startswith("# "):
                continue
            title = line[2:].strip()
            # Headings that are just a link ("# [..]()") are not used as titles.
            if title and not title.startswith("["):
                return title
        return "Untitled"

    def _generate_doc_id(self, file_path: str) -> str:
        """Derive an id from the file name: no extension, lowercase, ``_`` separators."""
        filename = os.path.basename(file_path)
        name_without_ext = os.path.splitext(filename)[0]
        return name_without_ext.replace(" ", "_").replace("-", "_").lower()

    @staticmethod
    def _contains_keyword(keyword: str, text: str) -> bool:
        """Tell whether ``keyword`` occurs in the (already lowercased) ``text``.

        Acronyms listed in ``_WHOLE_WORD_KEYWORDS`` must not touch a letter or
        digit on either side; every other keyword is a plain substring test.
        """
        if keyword in _WHOLE_WORD_KEYWORDS:
            pattern = rf"(?<![^\W_]){re.escape(keyword)}(?![^\W_])"
            return re.search(pattern, text) is not None
        return keyword in text

    @classmethod
    def _match_labels(cls, mapping: Dict[str, str], *texts: str) -> List[str]:
        """Collect the distinct labels whose keyword occurs in any of ``texts``.

        Labels keep the order in which ``mapping`` first produces them;
        ``["other"]`` is returned when nothing matches.
        """
        labels: List[str] = []
        for keyword, label in mapping.items():
            if label in labels:
                continue
            if any(cls._contains_keyword(keyword, text) for text in texts):
                labels.append(label)
        return labels or ["other"]

    def _extract_metadata(self, content: str, file_path: str) -> Dict[str, Any]:
        """Build heuristic metadata for a document.

        Returns a dict with ``industry`` and ``roles_relevant`` (lists of
        labels), ``metrics`` (only when at least one metric was found),
        ``language`` and the ``created_at`` / ``updated_at`` timestamps.
        """
        filename = os.path.basename(file_path).lower()
        content_lower = content.lower()

        metadata: Dict[str, Any] = {
            "industry": self._match_labels(
                _INDUSTRY_KEYWORDS, content_lower, filename
            ),
            "roles_relevant": self._match_labels(_ROLE_KEYWORDS, content_lower),
        }

        metrics = self._extract_metrics(content)
        if metrics:
            metadata["metrics"] = metrics

        metadata["language"] = "ru"

        # One timestamp for both fields so they are equal on creation.
        now = datetime.datetime.now().isoformat()
        metadata["created_at"] = now
        metadata["updated_at"] = now
        return metadata

    def _extract_metrics(self, content: str) -> Dict[str, Any]:
        """Pull numeric metrics out of Russian text.

        For every pattern only the first occurrence in ``content`` is used.
        Returns a dict of integer values; it is empty when nothing matched.
        """
        metrics: Dict[str, Any] = {}

        for pattern, key in _TIME_PATTERNS:
            match = re.search(pattern, content, re.IGNORECASE)
            if match is None:
                continue
            if key.startswith("improvement_"):
                metrics[f"{key}_before"] = int(match.group(1))
                metrics[f"{key}_after"] = int(match.group(2))
            else:
                metrics[key] = int(match.group(1))

        for pattern, key in _SINGLE_VALUE_PATTERNS:
            match = re.search(pattern, content, re.IGNORECASE)
            if match is not None:
                metrics[key] = int(match.group(1))

        return metrics

    def _clean_content(self, content: str) -> str:
        """Strip site boilerplate and Markdown markup from ``content``.

        The order of the passes matters: boilerplate is recognised by its raw
        Markdown form (``[text]()`` links, ``###`` headings), so it has to be
        removed *before* links and heading markers are stripped.

        Returns the remaining lines (trimmed, longer than 10 characters or
        starting with a bullet) joined by newlines.
        """
        # Pass 1: boilerplate that is identified by raw Markdown syntax.
        content = _COMMENT_COUNTER_RE.sub("", content)
        for pattern in _NOISE_PATTERNS:
            content = pattern.sub("", content)
        content = _RELATED_ARTICLES_RE.sub("", content)

        # Lines consisting only of empty-target links are navigation residue.
        content = "\n".join(
            line
            for line in content.split("\n")
            if not (
                line.strip().startswith("[") and line.strip().endswith("]()")
            )
        )

        # Pass 2: Markdown markup. Bullets are converted before emphasis is
        # stripped so a leading "* " is not paired with a later "*".
        content = re.sub(r"^\s*#+\s*", "", content, flags=re.MULTILINE)
        content = re.sub(r"^\s*[-*+]\s+", "• ", content, flags=re.MULTILINE)
        content = re.sub(r"\*\*(.*?)\*\*", r"\1", content)
        content = re.sub(r"\*(.*?)\*", r"\1", content)
        content = re.sub(r"\[([^\]]+)\]\([^\)]*\)", r"\1", content)
        content = re.sub(r"\n{3,}", "\n\n", content)

        # Pass 3: keep only meaningful lines.
        kept_lines = []
        for line in content.split("\n"):
            line = line.strip()
            if not line or re.match(r"^\d+\s*$", line):
                continue
            if len(line) > 10 or line.startswith("•"):
                kept_lines.append(line)

        return "\n".join(kept_lines).strip()


def create_platform_overview() -> Document:
    """Return the built-in overview document of the platform.

    The text and its metadata are hard-coded; nothing is read from disk.
    """
    content = """
Консоль.Про – платформа автоматизации работы с самозанятыми, ИП и физлицами.

Основные возможности:
• Подключение нового исполнителя за ~15 минут
• Выплаты в течение минут вместо часов
• Сбор 100% закрывающих документов
• Снижение ошибок до 95%
• Управление сотнями исполнителей одним сотрудником
• API интеграции для автоматизации процессов
• Автоматический сбор чеков и документов
• Снижение времени онбординга с 2 дней до ~20 минут

Платформа решает ключевые задачи бизнеса:
• Быстрое масштабирование команды исполнителей
• Автоматизация документооборота и выплат
• Снижение операционных затрат
• Обеспечение налогового соответствия
• Упрощение работы с подрядчиками

Внедрение платформы занимает около 1 дня.
"""

    metadata = {
        "title": "Платформа Консоль.Про - Обзор",
        "doc_type": "platform_overview",
        "source": "internal",
        "industry": ["generic"],
        "roles_relevant": ["tech", "finance", "ops", "ceo"],
        "metrics": {
            "onboarding_minutes": 15,
            "onboarding_days_before": 2,
            "onboarding_minutes_after": 20,
            "error_reduction_pct": 95,
            "document_collection_pct": 100,
            "implementation_days": 1,
        },
        "language": "ru",
        "created_at": "2024-01-01T00:00:00",
        "updated_at": "2024-01-01T00:00:00",
    }

    return Document(
        content=content.strip(),
        metadata=metadata,
        doc_id="platform_overview",
        source="platform_overview.md",
    )