assistant-claw/atlas/mcp-tools/email-extractor/atlas_extractor/dequote.py
Vega (Atlas scaffolding) ce9f27320a Add Atlas profile under atlas/ — boss-perspective project execution radar
This adds the full Atlas (总助 Claw / 老板视角项目执行雷达) scaffolding as a
sibling profile to the existing Vega general-purpose assistant. All Atlas content
lives under atlas/ to keep the existing top-level skeleton intact.

What's included:

- atlas/IDENTITY.md, SOUL.md, USER.md, AGENTS.md, MEMORY.md, BOOTSTRAP.md,
  HEARTBEAT.md, TOOLS.md (+ zh-CN mirrors) — full OpenClaw 8-piece set
  matching the zero-cca convention
- atlas/skills/ — 6 sub-skills with frontmatter:
  claw-email-parser / claw-project-tracker / claw-people-observer /
  claw-customer-radar / claw-boss-distiller / claw-report-writer
- atlas/skills/claw-boss-distiller/ — adapter notes for nuwa-skill, 5-layer
  boss_skill seed template (23 rules across Expression DNA / Mental Models /
  Decision Heuristics / Anti-Patterns / Honest Boundaries), and a complete
  synthetic distillation demo (10 input emails -> validated 5-layer output)
- atlas/mcp-tools/email-extractor/ — Python implementation of stages 1-3
  (fetch + decode + dequote), 7 pytest tests passing, CLI: atlas-extract
- atlas/state-schemas/ — formal JSON schemas for project / person / customer
  cards with the no-employee-rating hard constraint baked in
- atlas/client-deck/ — 2-page client-facing pitch document
- autopilots/atlas-*.yaml — 5 autopilot configs (daily / weekly / monthly /
  quarterly + andon event-triggered) for a future Multica-side scheduler

Notes:

- nuwa-skill (MIT, https://github.com/alchaincyf/nuwa-skill) NOT vendored;
  fetch at deploy time via instructions in
  atlas/skills/claw-boss-distiller/upstream/README.md
- Vega-side prompts/skills/tools/autopilots/docs scaffold left untouched
- Top-level README.md updated with a brief Atlas pointer; rest preserved
2026-05-09 17:00:29 +08:00

165 lines
5.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Stage 3: Dequote.
Strip quoted-reply chains, signature blocks, and disclaimer footers.
This is the unglamorous-but-critical step — without it, every email
looks like every other email and downstream clustering is destroyed.
Strategy stack (apply in order, keep all matches conservative):
1. Marker patterns (English + Chinese reply/forward markers)
2. Outlook-style block headers
3. RFC quoted lines (`> ...`)
4. Signature separator (`-- \n`)
5. Disclaimer footer regex
6. Trailing-block heuristic (phone/title patterns)
Result: only the new content the sender wrote in this message.
"""
from __future__ import annotations
import re
from dataclasses import dataclass
# --- Marker patterns ---------------------------------------------------------
_MARKERS = [
# English: "On Mon, Apr 22, 2024 at 9:14 AM Wang <wang@us.com> wrote:"
re.compile(r"^On\s.+?wrote:\s*$", re.MULTILINE),
# English forwards
re.compile(r"^[-]+\s*Forwarded message\s*[-]+\s*$", re.MULTILINE | re.IGNORECASE),
re.compile(r"^[-]+\s*Original Message\s*[-]+\s*$", re.MULTILINE | re.IGNORECASE),
# Chinese: "王 于 2026年4月22日 下午2:30 写道:" / variants
re.compile(r"^.*?于\s*\d{4}年.+?写道[:]\s*$", re.MULTILINE),
re.compile(r"^.+?写道[:]\s*$", re.MULTILINE),
# Chinese forward markers
re.compile(r"^[-]+\s*转发(的)?邮件\s*[-]+\s*$", re.MULTILINE),
re.compile(r"^[-]+\s*原始邮件\s*[-]+\s*$", re.MULTILINE),
# Outlook block (From: / Sent: / To: / Subject: stack)
re.compile(
r"^From:.+?\n(Sent|发送时间):.+?\n(To|收件人):.+?\n(Subject|主题):.+?$",
re.MULTILINE | re.DOTALL,
),
re.compile(
r"^发件人[:].+?\n发送时间[:].+?\n收件人[:].+?\n主题[:].+?$",
re.MULTILINE | re.DOTALL,
),
]
_QUOTE_LINE = re.compile(r"^\s*>+\s?", re.MULTILINE)
_SIGNATURE_SEP = re.compile(r"^--\s*$", re.MULTILINE)
_DISCLAIMER_PATTERNS = [
re.compile(r"本邮件(及其附件)?(包含|含有)?(保密|机密).*", re.IGNORECASE | re.DOTALL),
re.compile(r"This\s+e?-?mail.*confidential.*", re.IGNORECASE | re.DOTALL),
re.compile(r"DISCLAIMER:.*", re.IGNORECASE | re.DOTALL),
]
@dataclass
class DequoteResult:
    """Outcome of one dequote pass over a message body."""

    # Body text that remains after all stripping, whitespace-trimmed.
    text_clean: str
    # Labels of the strategies that fired, in the order they were applied.
    strategies_used: list[str]
    # Length of the original text minus the length of `text_clean`.
    chars_stripped: int
def _strip_at_first_marker(text: str, used: list[str]) -> str:
    """Truncate *text* at the earliest reply/forward marker.

    Every pattern in ``_MARKERS`` is searched once. The match that starts
    earliest wins; on a tie the pattern listed first wins. When a marker is
    found, ``marker:<first 40 chars of the pattern>`` is appended to *used*
    and the text before the marker is returned with trailing whitespace
    removed. Without any match, *text* is returned unchanged.
    """
    hits = []
    for pattern in _MARKERS:
        found = pattern.search(text)
        if found is not None:
            hits.append((found.start(), pattern.pattern[:40]))

    if not hits:
        return text

    # min() returns the first minimal element, preserving list-order ties.
    cut_at, label = min(hits, key=lambda hit: hit[0])
    used.append(f"marker:{label}")
    return text[:cut_at].rstrip()
def _strip_quoted_lines(text: str, used: list[str]) -> str:
    """Remove every RFC-style quoted line (leading `>`) from *text*.

    When no quoted line exists the input is returned untouched. Otherwise
    ``"rfc_quoted_lines"`` is appended to *used*, all quoted lines are
    dropped, and the remaining lines are re-joined with newlines and trimmed
    of surrounding whitespace.
    """
    if _QUOTE_LINE.search(text) is None:
        return text

    used.append("rfc_quoted_lines")
    kept = [line for line in text.splitlines() if _QUOTE_LINE.match(line) is None]
    # The final strip() also discards blank lines stranded at either end once
    # the quoted lines next to them are gone.
    return "\n".join(kept).strip()
def _strip_signature(text: str, used: list[str]) -> str:
    """Cut *text* at the first `--` signature separator line.

    On a match, ``"signature_sep_dashdash"`` is appended to *used* and the
    text before the separator is returned with trailing whitespace removed.
    Without a separator, *text* is returned unchanged.
    """
    separator = _SIGNATURE_SEP.search(text)
    if separator is None:
        return text

    used.append("signature_sep_dashdash")
    body = text[: separator.start()]
    return body.rstrip()
def _strip_trailing_block_heuristic(text: str, used: list[str]) -> str:
"""If the last 3-8 lines look like a contact block, drop them.
Heuristic: trailing block of short lines that contains a phone number
pattern, an email, or a generic title word like 'CEO/总监/经理/董事长'.
"""
lines = text.splitlines()
if len(lines) < 6:
return text
tail = [ln for ln in lines[-8:] if ln.strip()]
if len(tail) < 2 or len(tail) > 8:
return text
joined = "\n".join(tail)
has_signal = (
re.search(r"\+?\d[\d\s\-()]{6,}", joined)
or re.search(r"[\w.+-]+@[\w-]+\.[\w.-]+", joined)
or re.search(r"(CEO|CTO|CFO|总监|经理|董事长|总裁|主管|VP|Director)", joined)
)
if not has_signal:
return text
avg_len = sum(len(t) for t in tail) / len(tail)
if avg_len > 60: # too long, probably real content
return text
used.append("trailing_block_heuristic")
cut_idx = len(lines)
for i in range(len(lines) - 1, -1, -1):
if lines[i].strip() == "" and i < len(lines) - 1:
continue
if lines[i] in tail:
cut_idx = i
else:
break
return "\n".join(lines[:cut_idx]).rstrip()
def _strip_disclaimer(text: str, used: list[str]) -> str:
    """Cut *text* at each disclaimer opener found, one pattern after another.

    The patterns in ``_DISCLAIMER_PATTERNS`` are tried in order against the
    progressively shortened text. Each hit appends
    ``disclaimer:<first 30 chars of the pattern>`` to *used* and truncates the
    text at the start of the match, removing trailing whitespace.
    """
    for pattern in _DISCLAIMER_PATTERNS:
        hit = pattern.search(text)
        if hit is None:
            continue
        used.append(f"disclaimer:{pattern.pattern[:30]}")
        text = text[: hit.start()].rstrip()
    return text
def dequote(text: str) -> DequoteResult:
    """Apply every stripping strategy in sequence and report the outcome.

    Order: reply/forward markers, quoted lines, signature separator,
    disclaimer footer, trailing contact-block heuristic. Empty input yields an
    empty result with no strategies and zero characters stripped.
    """
    if not text:
        return DequoteResult(text_clean="", strategies_used=[], chars_stripped=0)

    used: list[str] = []
    pipeline = (
        _strip_at_first_marker,
        _strip_quoted_lines,
        _strip_signature,
        _strip_disclaimer,
        _strip_trailing_block_heuristic,
    )
    remaining = text
    for step in pipeline:
        remaining = step(remaining, used)

    cleaned = remaining.strip()
    return DequoteResult(
        text_clean=cleaned,
        strategies_used=used,
        chars_stripped=len(text) - len(cleaned),
    )