assistant-claw/mcp-tools/email-extractor/atlas_extractor/dequote.py
Atlas refactor bd0be97630 Refactor: drop Vega framing, promote Atlas to repo root
This repo IS Atlas (总助 Claw / 老板视角项目执行雷达). The earlier
two-profile framing (Atlas + Vega placeholder) was a misread — Vega is
the agent persona answering Multica issues, not the product. Vega has
no relationship to assistant-claw the product.

Changes:
- Move atlas/* to top-level (git mv preserves history)
- Remove empty Vega placeholders prompts/.gitkeep, tools/.gitkeep
- Delete atlas/ wrapper directory (now empty)
- Update path references in INTEGRATION-hermes.md, scripts/mirror-...sh,
  docs/decisions/0001-mirror-nuwa-skill.md
- Rewrite README.md as Atlas-only, remove dual-profile language

After this commit:
- Top-level OpenClaw 8 files (IDENTITY/SOUL/USER/AGENTS/TOOLS/MEMORY/
  BOOTSTRAP/HEARTBEAT + CLAUDE symlink + zh-CN mirrors)
- skills/{6 sub-skills + DESCRIPTION + README}
- mcp-tools/{spec + Python implementation}
- state-schemas/{project, person, customer + README}
- autopilots/{5 atlas-*.yaml}
- client-deck/, docs/decisions/, scripts/

The ~/.hermes/skills/atlas/ destination convention preserved (atlas as
a skill namespace on the operator's machine, distinct from source path).
2026-05-09 17:54:18 +08:00

165 lines
5.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Stage 3: Dequote.
Strip quoted-reply chains, signature blocks, and disclaimer footers.
This is the unglamorous-but-critical step — without it, every email
looks like every other email and downstream clustering is destroyed.
Strategy stack (apply in order, keep all matches conservative):
1. Marker patterns (English + Chinese reply/forward markers)
2. Outlook-style block headers
3. RFC quoted lines (`> ...`)
4. Signature separator (`-- \n`)
5. Disclaimer footer regex
6. Trailing-block heuristic (phone/title patterns)
Result: only the new content the sender wrote in this message.
"""
from __future__ import annotations
import re
from dataclasses import dataclass
# --- Marker patterns ---------------------------------------------------------
# Patterns that mark the start of quoted/forwarded material.  Each one is
# searched against the whole body; the caller cuts at the earliest match.
#
# Chinese mail clients normally emit the full-width colon (U+FF1A) after
# "写道" and after header labels, so the Chinese patterns accept both the ASCII
# colon and U+FF1A.  The code point is written as an escape so that it survives
# tools that normalise or hide "ambiguous" Unicode characters.
_MARKERS = [
    # English: "On Mon, Apr 22, 2024 at 9:14 AM Wang <wang@us.com> wrote:"
    re.compile(r"^On\s.+?wrote:\s*$", re.MULTILINE),
    # English forwards
    re.compile(r"^[-]+\s*Forwarded message\s*[-]+\s*$", re.MULTILINE | re.IGNORECASE),
    re.compile(r"^[-]+\s*Original Message\s*[-]+\s*$", re.MULTILINE | re.IGNORECASE),
    # Chinese: "王 于 2026年4月22日 下午2:30 写道:" / variants
    re.compile(r"^.*?于\s*\d{4}年.+?写道[:\uff1a]\s*$", re.MULTILINE),
    re.compile(r"^.+?写道[:\uff1a]\s*$", re.MULTILINE),
    # Chinese forward markers
    re.compile(r"^[-]+\s*转发(的)?邮件\s*[-]+\s*$", re.MULTILINE),
    re.compile(r"^[-]+\s*原始邮件\s*[-]+\s*$", re.MULTILINE),
    # Outlook block (From: / Sent: / To: / Subject: stack)
    re.compile(
        r"^From:.+?\n(Sent|发送时间):.+?\n(To|收件人):.+?\n(Subject|主题):.+?$",
        re.MULTILINE | re.DOTALL,
    ),
    # Outlook block with Chinese labels (sender / sent time / recipient / subject)
    re.compile(
        r"^发件人[:\uff1a].+?\n发送时间[:\uff1a].+?\n收件人[:\uff1a].+?\n主题[:\uff1a].+?$",
        re.MULTILINE | re.DOTALL,
    ),
]
# Leading part of a quoted line: optional indentation, one or more `>`,
# then at most one whitespace character.
_QUOTE_LINE = re.compile(r"^\s*>+\s?", flags=re.MULTILINE)

# A line that holds only `--`, optionally followed by whitespace.
_SIGNATURE_SEP = re.compile(r"^--\s*$", flags=re.MULTILINE)
# Footer patterns.  Every pattern ends in `.*` under DOTALL, so a match runs to
# the end of the text and the caller cuts at the match start.
_DISCLAIMER_PATTERNS = [
    # Chinese confidentiality notice ("this email (and its attachments) contains
    # confidential ...").
    re.compile(r"本邮件(及其附件)?(包含|含有)?(保密|机密).*", re.IGNORECASE | re.DOTALL),
    # English notice.  The gap between "This email" and "confidential" is bounded
    # (and lazy) so that an ordinary body sentence starting with "This email ..."
    # is not taken as the start of the footer merely because the word
    # "confidential" appears somewhere much later in the message.  The gap may
    # still span line breaks, since notices are often hard-wrapped.
    re.compile(r"This\s+e?-?mail.{0,300}?confidential.*", re.IGNORECASE | re.DOTALL),
    re.compile(r"DISCLAIMER:.*", re.IGNORECASE | re.DOTALL),
]
@dataclass
class DequoteResult:
    """Outcome of one ``dequote`` run.

    Attributes:
        text_clean: The message text that remains after stripping.
        strategies_used: Labels appended by the stripping stages that fired.
        chars_stripped: Input length minus the length of ``text_clean``.
    """

    text_clean: str
    strategies_used: list[str]
    chars_stripped: int
def _strip_at_first_marker(text: str, used: list[str]) -> str:
    """Truncate *text* at the earliest reply/forward marker, if there is one.

    Each pattern in ``_MARKERS`` is searched once against the whole text and
    the match with the smallest start offset wins; on a tie the pattern listed
    first is kept.  The text before that offset is returned with trailing
    whitespace removed, and ``marker:<first 40 chars of the pattern>`` is
    appended to *used*.  When nothing matches, *text* is returned unchanged
    and *used* is left alone.
    """
    searched = ((pattern, pattern.search(text)) for pattern in _MARKERS)
    hits = [(found.start(), pattern) for pattern, found in searched if found]
    if not hits:
        return text

    # min() returns the first minimal item, which preserves list-order
    # precedence between patterns that match at the same offset.
    cut_at, winner = min(hits, key=lambda hit: hit[0])
    used.append(f"marker:{winner.pattern[:40]}")
    return text[:cut_at].rstrip()
def _strip_quoted_lines(text: str, used: list[str]) -> str:
    """Remove `>`-quoted lines, including the quoted/blank run at the bottom.

    If no line matches ``_QUOTE_LINE`` the text is returned unchanged.
    Otherwise ``rfc_quoted_lines`` is appended to *used*, the trailing run of
    quoted or blank lines is discarded, every remaining quoted line is
    filtered out, and the result is re-joined with ``\\n`` and stripped.
    """
    if _QUOTE_LINE.search(text) is None:
        return text
    used.append("rfc_quoted_lines")

    rows = text.splitlines()
    # Move `end` up from the bottom past lines that are blank or quoted.
    end = len(rows)
    while end > 0 and (not rows[end - 1].strip() or _QUOTE_LINE.match(rows[end - 1])):
        end -= 1

    kept = [row for row in rows[:end] if _QUOTE_LINE.match(row) is None]
    return "\n".join(kept).strip()
def _strip_signature(text: str, used: list[str]) -> str:
    """Cut *text* at the first signature-separator line.

    When ``_SIGNATURE_SEP`` matches, ``signature_sep_dashdash`` is appended to
    *used* and the text before the separator is returned with trailing
    whitespace removed.  Otherwise *text* is returned unchanged.
    """
    separator = _SIGNATURE_SEP.search(text)
    if separator is None:
        return text
    used.append("signature_sep_dashdash")
    return text[: separator.start()].rstrip()
def _strip_trailing_block_heuristic(text: str, used: list[str]) -> str:
    """Drop a trailing contact block that has no explicit separator.

    The candidate block is the non-blank lines among the last 8 lines of
    *text* (trailing blank lines are ignored first).  It is removed only when
    all of the following hold:

    * the text has at least 6 lines and the block has at least 2;
    * the block contains a phone-number-like run, an email address, or a
      title word such as 'CEO/总监/经理/董事长';
    * the block's lines average at most 60 characters;
    * some non-blank text remains above the block.

    The last condition keeps short messages (8 lines or fewer) from being
    wiped out entirely: for those, the window covers the whole message, so
    the text is returned unchanged instead.

    Args:
        text: Message body to inspect.
        used: List of strategy labels; ``trailing_block_heuristic`` is
            appended only when something is actually removed.

    Returns:
        The text above the block with trailing whitespace removed, or *text*
        unchanged when the heuristic does not fire.
    """
    lines = text.splitlines()
    # Trailing blank lines would otherwise eat into the 8-line window and stop
    # the block from being located at all.
    while lines and not lines[-1].strip():
        lines.pop()
    if len(lines) < 6:
        return text

    window_start = max(len(lines) - 8, 0)
    tail = [ln for ln in lines[window_start:] if ln.strip()]
    if len(tail) < 2:
        return text

    joined = "\n".join(tail)
    has_signal = (
        re.search(r"\+?\d[\d\s\-()]{6,}", joined)
        or re.search(r"[\w.+-]+@[\w-]+\.[\w.-]+", joined)
        or re.search(r"(CEO|CTO|CFO|总监|经理|董事长|总裁|主管|VP|Director)", joined)
    )
    if not has_signal:
        return text

    avg_len = sum(len(ln) for ln in tail) / len(tail)
    if avg_len > 60:  # too long, probably real content
        return text

    # Locate the block by position, not by comparing line text, so a line
    # above the window that merely repeats a block line is never swallowed.
    cut_idx = next(i for i in range(window_start, len(lines)) if lines[i].strip())

    # Never delete the whole message: keeping a signature is a smaller error
    # than returning an empty body for a short email.
    if not any(ln.strip() for ln in lines[:cut_idx]):
        return text

    # NOTE(review): the window is purely positional, so short content lines
    # that fall inside the last 8 lines are dropped together with the contact
    # details.  Consider bounding the block by paragraph instead.
    used.append("trailing_block_heuristic")
    return "\n".join(lines[:cut_idx]).rstrip()
def _strip_disclaimer(text: str, used: list[str]) -> str:
    """Cut *text* at each disclaimer pattern that matches.

    The patterns in ``_DISCLAIMER_PATTERNS`` are tried in order, each against
    the text left by the previous cut.  For every match,
    ``disclaimer:<first 30 chars of the pattern>`` is appended to *used* and
    the text is truncated at the match start with trailing whitespace removed.
    """
    remaining = text
    for pattern in _DISCLAIMER_PATTERNS:
        hit = pattern.search(remaining)
        if hit is None:
            continue
        used.append(f"disclaimer:{pattern.pattern[:30]}")
        remaining = remaining[: hit.start()].rstrip()
    return remaining
def dequote(text: str) -> DequoteResult:
    """Run every stripping stage over *text* and report what was removed.

    Stages run in this order: reply/forward markers, quoted lines, signature
    separator, disclaimer footer, trailing-block heuristic.  Each stage
    receives the previous stage's output and the shared list of strategy
    labels.  Falsy input yields an empty result with nothing recorded.
    """
    if not text:
        return DequoteResult(text_clean="", strategies_used=[], chars_stripped=0)

    applied: list[str] = []
    stages = (
        _strip_at_first_marker,
        _strip_quoted_lines,
        _strip_signature,
        _strip_disclaimer,
        _strip_trailing_block_heuristic,
    )
    current = text
    for stage in stages:
        current = stage(current, applied)

    cleaned = current.strip()
    return DequoteResult(
        text_clean=cleaned,
        strategies_used=applied,
        chars_stripped=len(text) - len(cleaned),
    )