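"""Markdown document loader.

Reads local .md files into Document objects: extracts a title and a stable
doc_id, tags each document with industry and role labels via Russian keyword
stems, pulls simple numeric metrics out of the text, and strips the site
boilerplate left over in saved pages. Also builds a hand-written overview
document for the Консоль.Про platform.
"""
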
import os
import re
from typing import List, Dict, Any
from dataclasses import dataclass

import markdown


@dataclass
class Document:
    """A loaded document plus the metadata used for filtering and retrieval."""

    content: str
    metadata: Dict[str, Any]
    doc_id: str
    source: str


class MarkdownLoader:
    """Loads local markdown files into Document objects with lightweight metadata."""

    def __init__(self):
        # Parser with the "meta" and "toc" extensions; currently not referenced by
        # the methods below, which work on the raw text directly.
        self.md = markdown.Markdown(extensions=["meta", "toc"])

    def load_file(self, file_path: str) -> Document:
        """Read a single markdown file and wrap it in a Document."""
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        title = self._extract_title(content)
        doc_id = self._generate_doc_id(file_path)

        metadata = self._extract_metadata(content, file_path)
        metadata.update(
            {
                "title": title,
                "doc_type": "case" if "case" in file_path.lower() else "info",
                "source": file_path,
                "file_size": len(content),
            }
        )

        clean_content = self._clean_content(content)

        return Document(
            content=clean_content, metadata=metadata, doc_id=doc_id, source=file_path
        )

    def load_directory(self, dir_path: str) -> List[Document]:
        """Recursively load every .md file under dir_path, skipping files that fail."""
        documents = []

        for root, dirs, files in os.walk(dir_path):
            for file in files:
                if file.endswith(".md"):
                    file_path = os.path.join(root, file)
                    try:
                        doc = self.load_file(file_path)
                        documents.append(doc)
                    except Exception as e:
                        print(f"Error loading {file_path}: {e}")

        return documents

    def _extract_title(self, content: str) -> str:
        """Return the first level-1 heading ("# ..."), or "Untitled" if none is found."""
        lines = content.strip().split("\n")
        for line in lines:
            line = line.strip()
            if line.startswith("# "):
                title = line[2:].strip()
                if title and not title.startswith("["):
                    return title

        return "Untitled"

    def _generate_doc_id(self, file_path: str) -> str:
        filename = os.path.basename(file_path)
        name_without_ext = os.path.splitext(filename)[0]
        return name_without_ext.replace(" ", "_").replace("-", "_").lower()

    def _extract_metadata(self, content: str, file_path: str) -> Dict[str, Any]:
        metadata = {}

        filename = os.path.basename(file_path).lower()
        content_lower = content.lower()

        # Russian keyword stems (plus a few client/brand names) mapped to industry tags.
        industry_mapping = {
            "маркетинг": "marketing_agency",
            "агентство": "marketing_agency",
            "реклам": "marketing_agency",
            "блогер": "marketing_agency",
            "mediar": "marketing_agency",
            "büro": "marketing_agency",
            "логист": "logistics",
            "достав": "logistics",
            "склад": "logistics",
            "грузчик": "logistics",
            "разраб": "software",
            "програм": "software",
            "progkids": "software",
            "it": "software",
            "диджитал": "software",
            "строит": "construction",
            "недвиж": "construction",
            "этажи": "construction",
            "рознич": "retail",
            "торгов": "retail",
            "консалт": "consulting",
            "экобренд": "manufacturing",
            "wonder": "manufacturing",
            "производ": "manufacturing",
            "колл-центр": "call_center",
            "звонки": "call_center",
        }

        industries = []
        for keyword, industry in industry_mapping.items():
            if keyword in content_lower or keyword in filename:
                if industry not in industries:
                    industries.append(industry)

        if not industries:
            industries = ["other"]

        metadata["industry"] = industries

        # Russian keyword stems mapped to the audience roles a document is relevant for.
        roles_mapping = {
            "технический директор": "tech",
            "техн": "tech",
            "cto": "tech",
            "операционный директор": "ops",
            "директор": "ceo",
            "руководи": "ceo",
            "основатель": "ceo",
            "фин": "finance",
            "бухгалт": "finance",
            "cfo": "finance",
            "операц": "ops",
            "coo": "ops",
            "hr": "hr",
            "кадр": "hr",
            "маркет": "marketing",
            "продаж": "sales",
            "менеджер": "other",
        }

        roles = []
        for keyword, role in roles_mapping.items():
            if keyword in content_lower:
                if role not in roles:
                    roles.append(role)

        if not roles:
            roles = ["other"]

        metadata["roles_relevant"] = roles

        metrics = self._extract_metrics(content)
        if metrics:
            metadata["metrics"] = metrics

        metadata["language"] = "ru"

        import datetime

        metadata["created_at"] = datetime.datetime.now().isoformat()
        metadata["updated_at"] = datetime.datetime.now().isoformat()

        return metadata

    def _extract_metrics(self, content: str) -> Dict[str, Any]:
        metrics = {}

        # Durations mentioned in the text (Russian word stems for minutes/hours/days/
        # seconds), including "from X days/hours to Y minutes" improvement phrasings.
        time_patterns = [
            (r"(\d+)\s*минут[ауы]?", "processing_minutes"),
            (r"(\d+)\s*час[ауов]?", "processing_hours"),
            (r"(\d+)\s*дн[ейяах]", "processing_days"),
            (
                r"с\s+(\d+)\s*дн[ейя]\s+до\s+(\d+)\s*минут",
                "improvement_days_to_minutes",
            ),
            (
                r"с\s+(\d+)\s*час[ауов]?\s+до\s+(\d+)\s*минут",
                "improvement_hours_to_minutes",
            ),
            (r"(\d+)\s*секунд", "processing_seconds"),
        ]

        for pattern, key in time_patterns:
            matches = re.findall(pattern, content, re.IGNORECASE)
            if matches:
                try:
                    if key.startswith("improvement_"):
                        # Two-group "improvement" patterns yield (before, after) tuples.
                        if len(matches[0]) == 2:
                            metrics[f"{key}_before"] = int(matches[0][0])
                            metrics[f"{key}_after"] = int(matches[0][1])
                        else:
                            metrics[key] = int(matches[0])
                    else:
                        metrics[key] = int(matches[0])
                except (ValueError, IndexError):
                    pass

        # Percentages and multipliers (error reduction, document collection, accuracy, growth).
        percentage_patterns = [
            (r"(\d+)%\s*снижени", "error_reduction_pct"),
            (r"снижение[^0-9]*(\d+)%", "error_reduction_pct"),
            (r"(\d+)%\s*документ", "document_collection_pct"),
            (r"(\d+)%\s*точност", "accuracy_pct"),
            (r"увеличи[лв]\w*\s+в\s+(\d+)\s*раз", "growth_multiplier"),
        ]

        for pattern, key in percentage_patterns:
            matches = re.findall(pattern, content, re.IGNORECASE)
            if matches:
                try:
                    metrics[key] = int(matches[0])
                except ValueError:
                    pass

        # Headcounts and other volumes ("N bloggers", "N contractors", "more than N", ...).
        volume_patterns = [
            (r"(\d+)\s*блогер", "bloggers_count"),
            (r"(\d+)\s*исполнител", "contractors_count"),
            (r"(\d+)\s*сотрудник", "employees_count"),
            (r"бол[ье]+\s+(\d+)", "more_than_count"),
            (r"свыше\s+(\d+)", "over_count"),
        ]

        for pattern, key in volume_patterns:
            matches = re.findall(pattern, content, re.IGNORECASE)
            if matches:
                try:
                    metrics[key] = int(matches[0])
                except ValueError:
                    pass

        return metrics

    def _clean_content(self, content: str) -> str:
        """Strip markdown syntax and site boilerplate, leaving plain readable text."""
        # Heading markers, bold/italics, and links -> plain text.
        content = re.sub(r"^\s*#+\s*", "", content, flags=re.MULTILINE)

        content = re.sub(r"\*\*(.*?)\*\*", r"\1", content)
        content = re.sub(r"\*(.*?)\*", r"\1", content)

        content = re.sub(r"\[([^\]]+)\]\([^\)]*\)", r"\1", content)

        # List markers -> bullet characters.
        content = re.sub(r"^\s*[-*+]\s+", "• ", content, flags=re.MULTILINE)

        # Comment/date widgets left over from saved web pages.
        content = re.sub(
            r"\d+\s+\d+\s+\[Комментировать\]\(\)\s+\d{2}\.\d{2}\.\d{2}", "", content
        )

        # Site chrome and footer fragments (author lines, share/feedback widgets,
        # cookie notice, contact details, navigation links, bare counters/dates).
        noise_patterns = [
            r"Автор и редактор журнала Консоль",
            r"Автор\s+\[.*?\]\(\)",
            r"Поделиться",
            r"Ваше мнение\?",
            r"Отлично\s+Хорошо\s+Нормально\s+Плохо\s+Ужасно",
            r"Сайт использует файлы cookie.*?Принять",
            r"\[Политика конфиденциальности\]\(\)",
            r"\[Пользовательское соглашение\]\(\)",
            r"hello@konsol\.pro",
            r"\+7 \(\d{3}\) \d{3}-\d{2}-\d{2}",
            r"125047.*?дом \d+",
            r"\[Разработка - SKDO\]\(\)",
            r"\[Подключиться к Консоли\]\(\)",
            r"\[Кейсы наших клиентов\]\(\)",
            r"\[Делимся экспертизой\]\(\)",
            r"^\s*\d+\s*$",
            r"^\s*\[\d+\]\(\)\s*\d{2}\.\d{2}\.\d{2}\s*$",
        ]

        for pattern in noise_patterns:
            content = re.sub(pattern, "", content, flags=re.MULTILINE | re.IGNORECASE)

        # "Related articles" blocks. Note: heading markers and links are already
        # stripped above, so this pattern rarely matches at this point.
        related_articles_pattern = r"###\s+\[.*?\]\(\).*?(?=###|\Z)"
        content = re.sub(related_articles_pattern, "", content, flags=re.DOTALL)

        content = re.sub(r"\n{3,}", "\n\n", content)

        # Drop leftover link-only lines, bare numbers, and very short fragments.
        lines = content.split("\n")
        filtered_lines = []
        for line in lines:
            line = line.strip()
            if line and not (line.startswith("[") and line.endswith("]()")):
                if not re.match(r"^\d+\s*$", line):
                    if len(line) > 10 or line.startswith("•"):
                        filtered_lines.append(line)

        content = "\n".join(filtered_lines)

        return content.strip()


def create_platform_overview() -> Document:
    """Build a hand-written overview Document for the Консоль.Про platform (Russian text)."""
    content = """
Консоль.Про – платформа автоматизации работы с самозанятыми, ИП и физлицами.

Основные возможности:
• Подключение нового исполнителя за ~15 минут
• Выплаты в течение минут вместо часов
• Сбор 100% закрывающих документов
• Снижение ошибок до 95%
• Управление сотнями исполнителей одним сотрудником
• API интеграции для автоматизации процессов
• Автоматический сбор чеков и документов
• Снижение времени онбординга с 2 дней до ~20 минут

Платформа решает ключевые задачи бизнеса:
• Быстрое масштабирование команды исполнителей
• Автоматизация документооборота и выплат
• Снижение операционных затрат
• Обеспечение налогового соответствия
• Упрощение работы с подрядчиками

Внедрение платформы занимает около 1 дня.
"""

    metadata = {
        "title": "Платформа Консоль.Про - Обзор",
        "doc_type": "platform_overview",
        "source": "internal",
        "industry": ["generic"],
        "roles_relevant": ["tech", "finance", "ops", "ceo"],
        "metrics": {
            "onboarding_minutes": 15,
            "onboarding_days_before": 2,
            "onboarding_minutes_after": 20,
            "error_reduction_pct": 95,
            "document_collection_pct": 100,
            "implementation_days": 1,
        },
        "language": "ru",
        "created_at": "2024-01-01T00:00:00",
        "updated_at": "2024-01-01T00:00:00",
    }

    return Document(
        content=content.strip(),
        metadata=metadata,
        doc_id="platform_overview",
        source="platform_overview.md",
    )
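

# Minimal usage sketch (illustrative, not part of the original module). The
# directory path "data/docs" below is an assumption — point it at wherever the
# markdown sources actually live.
if __name__ == "__main__":
    loader = MarkdownLoader()
    docs = loader.load_directory("data/docs")
    docs.append(create_platform_overview())
    for doc in docs:
        print(doc.doc_id, doc.metadata.get("industry"), doc.metadata.get("roles_relevant"))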