assistant-claw/atlas/mcp-tools/email-extractor/atlas_extractor/decode.py
Vega (Atlas scaffolding) ce9f27320a Add Atlas profile under atlas/ — boss-perspective project execution radar
This adds the full Atlas (总助 Claw / 老板视角项目执行雷达) scaffolding as a
sibling profile to the existing Vega general-purpose assistant. All Atlas content
lives under atlas/ to keep the existing top-level skeleton intact.

What's included:

- atlas/IDENTITY.md, SOUL.md, USER.md, AGENTS.md, MEMORY.md, BOOTSTRAP.md,
  HEARTBEAT.md, TOOLS.md (+ zh-CN mirrors) — full OpenClaw 8-piece set
  matching the zero-cca convention
- atlas/skills/ — 6 sub-skills with frontmatter:
  claw-email-parser / claw-project-tracker / claw-people-observer /
  claw-customer-radar / claw-boss-distiller / claw-report-writer
- atlas/skills/claw-boss-distiller/ — adapter notes for nuwa-skill, 5-layer
  boss_skill seed template (23 rules across Expression DNA / Mental Models /
  Decision Heuristics / Anti-Patterns / Honest Boundaries), and a complete
  synthetic distillation demo (10 input emails -> validated 5-layer output)
- atlas/mcp-tools/email-extractor/ — Python implementation of stages 1-3
  (fetch + decode + dequote), 7 pytest tests passing, CLI: atlas-extract
- atlas/state-schemas/ — formal JSON schemas for project / person / customer
  cards with the no-employee-rating hard constraint baked in
- atlas/client-deck/ — 2-page client-facing pitch document
- autopilots/atlas-*.yaml — 5 autopilot configs (daily / weekly / monthly /
  quarterly + andon event-triggered) for a future Multica-side scheduler

Notes:

- nuwa-skill (MIT, https://github.com/alchaincyf/nuwa-skill) NOT vendored;
  fetch at deploy time via instructions in
  atlas/skills/claw-boss-distiller/upstream/README.md
- Vega-side prompts/skills/tools/autopilots/docs scaffold left untouched
- Top-level README.md updated with a brief Atlas pointer; rest preserved
2026-05-09 17:00:29 +08:00

151 lines
4.9 KiB
Python

"""Stage 2: Decode.
MIME parsing → plain text. Handles charset detection, multipart, HTML→text.
Output is the raw cleanable text — Stage 3 (dequote) strips the conversation
history afterwards.
"""
from __future__ import annotations

import email
import email.header
import email.utils
import re
from dataclasses import dataclass, field
from email.message import Message
from typing import Iterable

import chardet
import html2text
from readability import Document
# Shared html2text converter, configured once at import time and reused for
# every HTML->text conversion in this module.
_HTML2TEXT = html2text.HTML2Text()
_HTML2TEXT.ignore_images = True  # images carry no extractable text
_HTML2TEXT.ignore_emphasis = True  # drop */_ markers for cleaner plain text
_HTML2TEXT.ignore_links = False  # keep link targets — often meaningful in email
_HTML2TEXT.body_width = 0  # don't re-wrap lines
@dataclass
class DecodedMessage:
    """Structured result of Stage-2 MIME decoding for a single email message."""

    msg_id: str  # Message-ID header with "<>" stripped; "" when the header is absent
    subject: str  # RFC 2047-decoded Subject
    from_addr: tuple[str, str]  # (name, email)
    to_addrs: list[tuple[str, str]]  # (name, lowercased email) pairs from To
    cc_addrs: list[tuple[str, str]]  # (name, lowercased email) pairs from Cc
    in_reply_to: str | None  # In-Reply-To id with "<>" stripped, or None
    references: list[str]  # References header ids with "<>" stripped
    body_text: str  # full text (may include quoted history)
    body_html: str | None  # concatenated raw HTML parts, or None if none present
    attachments_meta: list[dict] = field(default_factory=list)  # filename/content_type/size_bytes per attachment
    decode_warnings: list[str] = field(default_factory=list)  # non-fatal issues hit while decoding
def _decode_bytes(data: bytes, declared_charset: str | None) -> str:
    """Decode *data* by trying a prioritized list of charsets.

    Order: the charset declared on the MIME part, UTF-8, whatever chardet
    sniffs (if new), GB18030 (common Chinese fallback), then Latin-1 (which
    never fails). A final UTF-8 decode with replacement characters guards the
    theoretically-unreachable case where every candidate errored.
    """
    encodings: list[str] = [declared_charset] if declared_charset else []
    encodings.append("utf-8")
    guess = chardet.detect(data).get("encoding")
    if guess and guess not in encodings:
        encodings.append(guess)
    encodings += ["gb18030", "latin-1"]

    for name in encodings:
        try:
            return data.decode(name)
        except (UnicodeDecodeError, LookupError):
            pass
    return data.decode("utf-8", errors="replace")
def _addr_pair(addr_str: str) -> tuple[str, str]:
    """Parse one address header value into (display name, lowercased email)."""
    display, mailbox = email.utils.parseaddr(addr_str or "")
    return display.strip(), mailbox.strip().lower()
def _addr_list(addr_str: str) -> list[tuple[str, str]]:
    """Parse a To/Cc-style header into (name, lowercased email) pairs.

    Entries with an empty email address are dropped; an empty/None header
    yields an empty list.
    """
    if not addr_str:
        return []
    result: list[tuple[str, str]] = []
    for name, mailbox in email.utils.getaddresses([addr_str]):
        if mailbox:
            result.append((name.strip(), mailbox.strip().lower()))
    return result
def _walk_parts(msg: Message) -> Iterable[Message]:
    """Yield every leaf (non-multipart) part of *msg*.

    A non-multipart message yields itself as the single leaf.
    """
    if not msg.is_multipart():
        yield msg
        return
    for leaf in msg.walk():
        if leaf.is_multipart():
            continue
        yield leaf
def _extract_bodies(msg: Message) -> tuple[str, str | None, list[dict], list[str]]:
    """Collect body text, HTML, attachment metadata, and warnings from *msg*.

    Plain-text parts are preferred as the body. When only HTML is present,
    the main content is isolated with readability and converted to text via
    html2text; if readability fails, the full HTML is converted instead.
    Returns (text, html, attachments_meta, warnings).
    """
    plain_chunks: list[str] = []
    html_chunks: list[str] = []
    attach_meta: list[dict] = []
    notes: list[str] = []

    for leaf in _walk_parts(msg):
        content_type = leaf.get_content_type()
        disposition = (leaf.get("Content-Disposition") or "").lower()
        raw = leaf.get_payload(decode=True)
        if raw is None:
            # Nothing decodable in this part (e.g. empty payload).
            continue
        if "attachment" in disposition:
            # Record metadata only — attachment contents are out of scope here.
            attach_meta.append(
                {
                    "filename": leaf.get_filename(),
                    "content_type": content_type,
                    "size_bytes": len(raw),
                }
            )
        elif content_type == "text/plain":
            plain_chunks.append(_decode_bytes(raw, leaf.get_content_charset()))
        elif content_type == "text/html":
            html_chunks.append(_decode_bytes(raw, leaf.get_content_charset()))
        elif content_type == "text/calendar":
            notes.append("text/calendar part skipped")

    text_body = "\n\n".join(plain_chunks).strip()
    html_body = "\n\n".join(html_chunks).strip() if html_chunks else None

    if not text_body and html_body:
        # readability for the main article extraction, then html2text
        try:
            text_body = _HTML2TEXT.handle(Document(html_body).summary()).strip()
        except Exception as exc:
            notes.append(f"readability failed: {exc}; falling back to html2text")
            text_body = _HTML2TEXT.handle(html_body).strip()

    return text_body, html_body, attach_meta, notes
def decode_mime(raw_mime: bytes) -> DecodedMessage:
    """Parse raw RFC 822/MIME bytes into a :class:`DecodedMessage`.

    Stage-2 entry point: parses headers (Message-ID, Subject, addresses,
    threading headers) and delegates body/attachment extraction to
    ``_extract_bodies``. The returned ``body_text`` may still contain quoted
    history — Stage 3 (dequote) strips that afterwards.
    """
    msg = email.message_from_bytes(raw_mime)
    text, html, attachments, warnings = _extract_bodies(msg)
    # Message.get() matches header names case-insensitively, so a single
    # lookup covers "Message-ID" / "Message-Id" / etc.
    msg_id = (msg.get("Message-ID") or "").strip("<> ")
    # Decode RFC 2047 encoded words (e.g. =?utf-8?B?...?=) to a plain str.
    subject_raw = msg.get("Subject", "")
    subject = str(email.header.make_header(email.header.decode_header(subject_raw)))
    refs_raw = msg.get("References", "") or ""
    references = [r.strip("<> ") for r in refs_raw.split() if r.strip()]
    in_reply_to = (msg.get("In-Reply-To") or "").strip("<> ") or None
    return DecodedMessage(
        msg_id=msg_id,
        subject=subject,
        from_addr=_addr_pair(msg.get("From", "")),
        to_addrs=_addr_list(msg.get("To", "")),
        cc_addrs=_addr_list(msg.get("Cc", "")),
        in_reply_to=in_reply_to,
        references=references,
        body_text=text,
        body_html=html,
        attachments_meta=attachments,
        decode_warnings=warnings,
    )