# ai-email-assistant/src/ingest/loader.py


import os
import re
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Dict, List

import markdown


@dataclass
class Document:
    """A loaded document: cleaned text plus extracted metadata."""

    content: str
    metadata: Dict[str, Any]
    doc_id: str
    source: str


class MarkdownLoader:
    """Loads Markdown case-study files and extracts metadata for indexing."""

    def __init__(self):
        self.md = markdown.Markdown(extensions=["meta", "toc"])
    def load_file(self, file_path: str) -> Document:
        """Load a single Markdown file and return a cleaned Document."""
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        title = self._extract_title(content)
        doc_id = self._generate_doc_id(file_path)

        metadata = self._extract_metadata(content, file_path)
        metadata.update(
            {
                "title": title,
                "doc_type": "case" if "case" in file_path.lower() else "info",
                "source": file_path,
                "file_size": len(content),
            }
        )

        clean_content = self._clean_content(content)

        return Document(
            content=clean_content, metadata=metadata, doc_id=doc_id, source=file_path
        )
    def load_directory(self, dir_path: str) -> List[Document]:
        """Recursively load every .md file under dir_path."""
        documents = []
        for root, _dirs, files in os.walk(dir_path):
            for file in files:
                if file.endswith(".md"):
                    file_path = os.path.join(root, file)
                    try:
                        doc = self.load_file(file_path)
                        documents.append(doc)
                    except Exception as e:
                        print(f"Error loading {file_path}: {e}")
        return documents
    def _extract_title(self, content: str) -> str:
        lines = content.strip().split("\n")
        for line in lines:
            line = line.strip()
            if line.startswith("# "):
                title = line[2:].strip()
                if title and not title.startswith("["):
                    return title
        return "Untitled"
    def _generate_doc_id(self, file_path: str) -> str:
        filename = os.path.basename(file_path)
        name_without_ext = os.path.splitext(filename)[0]
        return name_without_ext.replace(" ", "_").replace("-", "_").lower()
    def _extract_metadata(self, content: str, file_path: str) -> Dict[str, Any]:
        """Infer industry, relevant roles, metrics and timestamps from the raw text."""
        metadata = {}
        filename = os.path.basename(file_path).lower()
        content_lower = content.lower()

        # Russian keyword stems mapped to industry tags (e.g. "маркетинг" -> marketing,
        # "логист" -> logistics, "строит" -> construction, "производ" -> manufacturing).
        industry_mapping = {
            "маркетинг": "marketing_agency",
            "агентство": "marketing_agency",
            "реклам": "marketing_agency",
            "блогер": "marketing_agency",
            "mediar": "marketing_agency",
            "büro": "marketing_agency",
            "логист": "logistics",
            "достав": "logistics",
            "склад": "logistics",
            "грузчик": "logistics",
            "разраб": "software",
            "програм": "software",
            "progkids": "software",
            "it": "software",
            "диджитал": "software",
            "строит": "construction",
            "недвиж": "construction",
            "этажи": "construction",
            "рознич": "retail",
            "торгов": "retail",
            "консалт": "consulting",
            "экобренд": "manufacturing",
            "wonder": "manufacturing",
            "производ": "manufacturing",
            "колл-центр": "call_center",
            "звонки": "call_center",
        }

        industries = []
        for keyword, industry in industry_mapping.items():
            if keyword in content_lower or keyword in filename:
                if industry not in industries:
                    industries.append(industry)
        if not industries:
            industries = ["other"]
        metadata["industry"] = industries

        # Russian keyword stems mapped to the reader roles the document is relevant for
        # (e.g. "технический директор" -> tech, "бухгалт" -> finance, "кадр" -> hr).
        roles_mapping = {
            "технический директор": "tech",
            "техн": "tech",
            "cto": "tech",
            "операционный директор": "ops",
            "директор": "ceo",
            "руководи": "ceo",
            "основатель": "ceo",
            "фин": "finance",
            "бухгалт": "finance",
            "cfo": "finance",
            "операц": "ops",
            "coo": "ops",
            "hr": "hr",
            "кадр": "hr",
            "маркет": "marketing",
            "продаж": "sales",
            "менеджер": "other",
        }

        roles = []
        for keyword, role in roles_mapping.items():
            if keyword in content_lower:
                if role not in roles:
                    roles.append(role)
        if not roles:
            roles = ["other"]
        metadata["roles_relevant"] = roles

        metrics = self._extract_metrics(content)
        if metrics:
            metadata["metrics"] = metrics

        metadata["language"] = "ru"
        metadata["created_at"] = datetime.now().isoformat()
        metadata["updated_at"] = datetime.now().isoformat()
        return metadata
    def _extract_metrics(self, content: str) -> Dict[str, Any]:
        """Pull numeric time, percentage and volume figures out of the Russian text."""
        metrics = {}

        # Time figures, e.g. "15 минут" (minutes), "2 часа" (hours), "3 дня" (days),
        # plus "from X days/hours to Y minutes" improvement phrases.
        time_patterns = [
            (r"(\d+)\s*минут[ауы]?", "processing_minutes"),
            (r"(\d+)\s*час[ауов]?", "processing_hours"),
            (r"(\d+)\s*дн[ейяах]", "processing_days"),
            (
                r"с\s+(\d+)\s*дн(?:ей|я)\s+до\s+(\d+)\s*минут",
                "improvement_days_to_minutes",
            ),
            (
                r"с\s+(\d+)\s*час(?:ов|а)?\s+до\s+(\d+)\s*минут",
                "improvement_hours_to_minutes",
            ),
            (r"(\d+)\s*секунд", "processing_seconds"),
        ]
        for pattern, key in time_patterns:
            matches = re.findall(pattern, content, re.IGNORECASE)
            if matches:
                try:
                    if key.startswith("improvement_"):
                        if len(matches[0]) == 2:
                            metrics[f"{key}_before"] = int(matches[0][0])
                            metrics[f"{key}_after"] = int(matches[0][1])
                        else:
                            metrics[key] = int(matches[0])
                    else:
                        metrics[key] = int(matches[0])
                except (ValueError, IndexError):
                    pass
        # Percentages and multipliers, e.g. "95% снижение" (reduction),
        # "100% документов" (documents), "увеличили в 3 раза" (3x growth).
        percentage_patterns = [
            (r"(\d+)%\s*снижени", "error_reduction_pct"),
            (r"снижение[^0-9]*(\d+)%", "error_reduction_pct"),
            (r"(\d+)%\s*документ", "document_collection_pct"),
            (r"(\d+)%\s*точност", "accuracy_pct"),
            (r"увеличи[лв]\w*\s+в\s+(\d+)\s*раз", "growth_multiplier"),
        ]
        for pattern, key in percentage_patterns:
            matches = re.findall(pattern, content, re.IGNORECASE)
            if matches:
                try:
                    metrics[key] = int(matches[0])
                except ValueError:
                    pass
        # Volume figures, e.g. "200 блогеров" (bloggers), "50 исполнителей" (contractors),
        # "более 1000" / "свыше 1000" (more than / over N).
        volume_patterns = [
            (r"(\d+)\s*блогер", "bloggers_count"),
            (r"(\d+)\s*исполнител", "contractors_count"),
            (r"(\d+)\s*сотрудник", "employees_count"),
            (r"бол(?:ее|ьше)\s+(\d+)", "more_than_count"),
            (r"свыше\s+(\d+)", "over_count"),
        ]
        for pattern, key in volume_patterns:
            matches = re.findall(pattern, content, re.IGNORECASE)
            if matches:
                try:
                    metrics[key] = int(matches[0])
                except ValueError:
                    pass

        return metrics
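
    # Rough illustration (hypothetical sentence, not from the project's data) of what
    # _extract_metrics would return, given the patterns above, for the phrase
    # "с 2 дней до 20 минут" ("from 2 days to 20 minutes"):
    #   {"processing_minutes": 20, "processing_days": 2,
    #    "improvement_days_to_minutes_before": 2, "improvement_days_to_minutes_after": 20}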
    def _clean_content(self, content: str) -> str:
        """Strip Markdown syntax and scraped site boilerplate, leaving plain prose."""
        # Drop Markdown headings, bold/italic markers, links and list bullets.
        content = re.sub(r"^\s*#+\s*", "", content, flags=re.MULTILINE)
        content = re.sub(r"\*\*(.*?)\*\*", r"\1", content)
        content = re.sub(r"\*(.*?)\*", r"\1", content)
        content = re.sub(r"\[([^\]]+)\]\([^\)]*\)", r"\1", content)
        content = re.sub(r"^\s*[-*+]\s+", "", content, flags=re.MULTILINE)
        content = re.sub(
            r"\d+\s+\d+\s+\[Комментировать\]\(\)\s+\d{2}\.\d{2}\.\d{2}", "", content
        )

        # Site-specific noise scraped along with the articles (share widgets, cookie
        # banners, contact details, footer links).
        noise_patterns = [
            r"Автор и редактор журнала Консоль",
            r"Автор\s+\[.*?\]\(\)",
            r"Поделиться",
            r"Ваше мнение\?",
            r"Отлично\s+Хорошо\s+Нормально\s+Плохо\s+Ужасно",
            r"Сайт использует файлы cookie.*?Принять",
            r"\[Политика конфиденциальности\]\(\)",
            r"\[Пользовательское соглашение\]\(\)",
            r"hello@konsol\.pro",
            r"\+7 \(\d{3}\) \d{3}-\d{2}-\d{2}",
            r"125047.*?дом \d+",
            r"\[Разработка - SKDO\]\(\)",
            r"\[Подключиться к Консоли\]\(\)",
            r"\[Кейсы наших клиентов\]\(\)",
            r"\[Делимся экспертизой\]\(\)",
            r"^\s*\d+\s*$",
            r"^\s*\[\d+\]\(\)\s*\d{2}\.\d{2}\.\d{2}\s*$",
        ]
        for pattern in noise_patterns:
            content = re.sub(pattern, "", content, flags=re.MULTILINE | re.IGNORECASE)

        # Drop "related articles" blocks and collapse excess blank lines.
        related_articles_pattern = r"###\s+\[.*?\]\(\).*?(?=###|\Z)"
        content = re.sub(related_articles_pattern, "", content, flags=re.DOTALL)
        content = re.sub(r"\n{3,}", "\n\n", content)
        # Keep only substantive lines: drop link-only lines and bare numbers,
        # and keep short lines only when they are bullet items ("•").
        lines = content.split("\n")
        filtered_lines = []
        for line in lines:
            line = line.strip()
            if line and not (line.startswith("[") and line.endswith("]()")):
                if not re.match(r"^\d+\s*$", line):
                    if len(line) > 10 or line.startswith("•"):
                        filtered_lines.append(line)

        content = "\n".join(filtered_lines)
        return content.strip()
def create_platform_overview() -> Document:
    """Build a hand-written Russian overview document for the Консоль.Про platform."""
    content = """
Консоль.Про платформа автоматизации работы с самозанятыми, ИП и физлицами.
Основные возможности:
• Подключение нового исполнителя за ~15 минут
• Выплаты в течение минут вместо часов
• Сбор 100% закрывающих документов
• Снижение ошибок до 95%
• Управление сотнями исполнителей одним сотрудником
• API интеграции для автоматизации процессов
• Автоматический сбор чеков и документов
• Снижение времени онбординга с 2 дней до ~20 минут
Платформа решает ключевые задачи бизнеса:
• Быстрое масштабирование команды исполнителей
• Автоматизация документооборота и выплат
• Снижение операционных затрат
• Обеспечение налогового соответствия
• Упрощение работы с подрядчиками
Внедрение платформы занимает около 1 дня.
"""
    metadata = {
        "title": "Платформа Консоль.Про - Обзор",
        "doc_type": "platform_overview",
        "source": "internal",
        "industry": ["generic"],
        "roles_relevant": ["tech", "finance", "ops", "ceo"],
        "metrics": {
            "onboarding_minutes": 15,
            "onboarding_days_before": 2,
            "onboarding_minutes_after": 20,
            "error_reduction_pct": 95,
            "document_collection_pct": 100,
            "implementation_days": 1,
        },
        "language": "ru",
        "created_at": "2024-01-01T00:00:00",
        "updated_at": "2024-01-01T00:00:00",
    }
    return Document(
        content=content.strip(),
        metadata=metadata,
        doc_id="platform_overview",
        source="platform_overview.md",
    )
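

# Minimal usage sketch, not part of the original module; the "data/cases" directory is
# a hypothetical location for the scraped Markdown case studies. It shows how the loader
# is meant to be combined with the synthetic platform overview before indexing.
if __name__ == "__main__":
    loader = MarkdownLoader()
    docs = loader.load_directory("data/cases")  # hypothetical path with *.md files
    docs.append(create_platform_overview())
    for doc in docs:
        print(doc.doc_id, doc.metadata.get("industry"), len(doc.content))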