This repo IS Atlas (总助 Claw / 老板视角项目执行雷达). The earlier
two-profile framing (Atlas + Vega placeholder) was a misread — Vega is
the agent persona answering Multica issues, not the product. Vega has
no relationship to assistant-claw the product.
Changes:
- Move atlas/* to top-level (git mv preserves history)
- Remove empty Vega placeholders prompts/.gitkeep, tools/.gitkeep
- Delete atlas/ wrapper directory (now empty)
- Update path references in INTEGRATION-hermes.md, scripts/mirror-...sh,
docs/decisions/0001-mirror-nuwa-skill.md
- Rewrite README.md as Atlas-only, remove dual-profile language
After this commit:
- Top-level OpenClaw 8 files (IDENTITY/SOUL/USER/AGENTS/TOOLS/MEMORY/
BOOTSTRAP/HEARTBEAT + CLAUDE symlink + zh-CN mirrors)
- skills/{6 sub-skills + DESCRIPTION + README}
- mcp-tools/{spec + Python implementation}
- state-schemas/{project, person, customer + README}
- autopilots/{5 atlas-*.yaml}
- client-deck/, docs/decisions/, scripts/
The ~/.hermes/skills/atlas/ destination convention is preserved (atlas as
a skill namespace on the operator's machine, distinct from the source path).
151 lines · 4.9 KiB · Python
"""Stage 2: Decode.
|
|
|
|
MIME parsing → plain text. Handles charset detection, multipart, HTML→text.
|
|
Output is the raw cleanable text — Stage 3 (dequote) strips the conversation
|
|
history afterwards.
|
|
"""
|
|
|
|
from __future__ import annotations

import email
import email.header
import email.utils
import re
from dataclasses import dataclass, field
from email.message import Message
from typing import Iterable

import chardet
import html2text
from readability import Document

# Shared html2text converter used for HTML→text fallback in _extract_bodies:
# links are kept, image/emphasis markup is dropped, and wrapping is disabled
# so downstream stages see the original line structure.
_HTML2TEXT = html2text.HTML2Text()
_HTML2TEXT.ignore_images = True
_HTML2TEXT.ignore_emphasis = True
_HTML2TEXT.ignore_links = False
_HTML2TEXT.body_width = 0 # don't re-wrap
@dataclass
class DecodedMessage:
    """Structured result of decoding one raw MIME email message."""

    # Message-ID header with surrounding "<>" stripped ("" when absent).
    msg_id: str
    # Subject with RFC 2047 encoded-words decoded to a plain string.
    subject: str
    from_addr: tuple[str, str]  # (name, email)
    to_addrs: list[tuple[str, str]]  # (name, lowercased email) pairs from To
    cc_addrs: list[tuple[str, str]]  # (name, lowercased email) pairs from Cc
    in_reply_to: str | None  # In-Reply-To id, "<>" stripped; None when absent
    references: list[str]  # References header ids, "<>" stripped
    body_text: str  # full text (may include quoted history)
    body_html: str | None  # joined text/html parts; None when no HTML part
    # One dict per attachment: filename / content_type / size_bytes.
    attachments_meta: list[dict] = field(default_factory=list)
    # Non-fatal issues hit while decoding (skipped parts, fallbacks taken).
    decode_warnings: list[str] = field(default_factory=list)
def _decode_bytes(data: bytes, declared_charset: str | None) -> str:
    """Best-effort decode of *data*, trying charsets in confidence order.

    Order: the MIME-declared charset, UTF-8, chardet's sniffed guess,
    GB18030 (common Chinese fallback), then Latin-1 (accepts any byte
    sequence). Each charset is tried at most once — the original
    membership test was case-sensitive, so e.g. a declared "UTF-8" made
    utf-8 get attempted twice.

    Returns the first successful decode; the final errors="replace" pass
    is reached only if every candidate name is an unknown codec.
    """
    candidates: list[str] = []
    seen: set[str] = set()
    sniffed = chardet.detect(data).get("encoding")
    for enc in (declared_charset, "utf-8", sniffed, "gb18030", "latin-1"):
        if enc and enc.lower() not in seen:
            seen.add(enc.lower())
            candidates.append(enc)
    for enc in candidates:
        try:
            return data.decode(enc)
        except (UnicodeDecodeError, LookupError):
            continue
    # Latin-1 never raises UnicodeDecodeError, so only LookupErrors on every
    # candidate (bogus codec names) can land here.
    return data.decode("utf-8", errors="replace")
def _addr_pair(addr_str: str) -> tuple[str, str]:
|
|
name, email_addr = email.utils.parseaddr(addr_str or "")
|
|
return (name.strip(), email_addr.strip().lower())
|
|
|
|
|
|
def _addr_list(addr_str: str) -> list[tuple[str, str]]:
|
|
if not addr_str:
|
|
return []
|
|
pairs = email.utils.getaddresses([addr_str])
|
|
return [(n.strip(), e.strip().lower()) for (n, e) in pairs if e]
|
|
|
|
|
|
def _walk_parts(msg: Message) -> Iterable[Message]:
|
|
if msg.is_multipart():
|
|
for part in msg.walk():
|
|
if not part.is_multipart():
|
|
yield part
|
|
else:
|
|
yield msg
|
|
|
|
|
|
def _extract_bodies(msg: Message) -> tuple[str, str | None, list[dict], list[str]]:
    """Return (text, html, attachments_meta, warnings) for *msg*.

    text             -- joined text/plain parts; when none exist but HTML
                        does, the HTML is reduced to text (readability →
                        html2text)
    html             -- joined text/html parts, or None
    attachments_meta -- filename / content_type / size_bytes per non-body part
    warnings         -- non-fatal notes (skipped calendar parts, readability
                        failures)
    """
    text_parts: list[str] = []
    html_parts: list[str] = []
    attachments: list[dict] = []
    warnings: list[str] = []

    for part in _walk_parts(msg):
        ctype = part.get_content_type()
        disp = (part.get("Content-Disposition") or "").lower()
        payload = part.get_payload(decode=True)
        if payload is None:
            # Container parts / undecodable payloads carry no body bytes.
            continue
        if "attachment" in disp:
            attachments.append(
                {
                    "filename": part.get_filename(),
                    "content_type": ctype,
                    "size_bytes": len(payload),
                }
            )
            continue
        if ctype == "text/plain":
            text_parts.append(_decode_bytes(payload, part.get_content_charset()))
        elif ctype == "text/html":
            html_parts.append(_decode_bytes(payload, part.get_content_charset()))
        elif ctype == "text/calendar":
            warnings.append("text/calendar part skipped")
        else:
            # Fix: inline non-text parts (e.g. embedded images without an
            # "attachment" disposition) previously vanished with no trace;
            # record their metadata so callers can see they were present.
            attachments.append(
                {
                    "filename": part.get_filename(),
                    "content_type": ctype,
                    "size_bytes": len(payload),
                }
            )

    text_body = "\n\n".join(text_parts).strip()
    html_body = "\n\n".join(html_parts).strip() if html_parts else None

    if not text_body and html_body:
        # readability extracts the main article; html2text flattens it.
        try:
            summary_html = Document(html_body).summary()
            text_body = _HTML2TEXT.handle(summary_html).strip()
        except Exception as exc:  # deliberate best-effort: note it and fall back
            warnings.append(f"readability failed: {exc}; falling back to html2text")
            text_body = _HTML2TEXT.handle(html_body).strip()

    return text_body, html_body, attachments, warnings
def decode_mime(raw_mime: bytes) -> DecodedMessage:
    """Parse raw RFC 5322 bytes into a DecodedMessage.

    Decodes the Subject (RFC 2047 encoded-words), normalizes address
    headers to (name, lowercased-email) pairs, strips angle brackets from
    message ids, and extracts text/HTML bodies plus attachment metadata.
    """
    msg = email.message_from_bytes(raw_mime)
    text, html, attachments, warnings = _extract_bodies(msg)

    # Message.get() is case-insensitive, so one lookup already covers the
    # "Message-ID" / "Message-Id" spellings the original queried separately.
    msg_id = (msg.get("Message-ID") or "").strip("<> ")

    subject_raw = msg.get("Subject", "")
    subject = str(email.header.make_header(email.header.decode_header(subject_raw or "")))

    refs_raw = msg.get("References", "") or ""
    references = [r.strip("<> ") for r in refs_raw.split() if r.strip()]
    in_reply_to = (msg.get("In-Reply-To") or "").strip("<> ") or None

    return DecodedMessage(
        msg_id=msg_id,
        subject=subject,
        from_addr=_addr_pair(msg.get("From", "")),
        to_addrs=_addr_list(msg.get("To", "")),
        cc_addrs=_addr_list(msg.get("Cc", "")),
        in_reply_to=in_reply_to,
        references=references,
        body_text=text,
        body_html=html,
        attachments_meta=attachments,
        decode_warnings=warnings,
    )