assistant-claw/mcp-tools/email-extractor/tests/test_dequote.py
Atlas refactor bd0be97630 Refactor: drop Vega framing, promote Atlas to repo root
This repo IS Atlas (总助 Claw / 老板视角项目执行雷达). The earlier
two-profile framing (Atlas + Vega placeholder) was a misread — Vega is
the agent persona answering Multica issues, not the product. Vega has
no relationship to assistant-claw the product.

Changes:
- Move atlas/* to top-level (git mv preserves history)
- Remove empty Vega placeholders prompts/.gitkeep, tools/.gitkeep
- Delete atlas/ wrapper directory (now empty)
- Update path references in INTEGRATION-hermes.md, scripts/mirror-...sh,
  docs/decisions/0001-mirror-nuwa-skill.md
- Rewrite README.md as Atlas-only, remove dual-profile language

After this commit:
- Top-level: the 8 OpenClaw files (IDENTITY/SOUL/USER/AGENTS/TOOLS/MEMORY/
  BOOTSTRAP/HEARTBEAT), plus the CLAUDE symlink and the zh-CN mirrors
- skills/{6 sub-skills + DESCRIPTION + README}
- mcp-tools/{spec + Python implementation}
- state-schemas/{project, person, customer + README}
- autopilots/{5 atlas-*.yaml}
- client-deck/, docs/decisions/, scripts/

The ~/.hermes/skills/atlas/ destination convention is preserved (atlas is
a skill namespace on the operator's machine, distinct from the source path).
2026-05-09 17:54:18 +08:00

82 lines
2.6 KiB
Python

"""Tests for Stage 3 dequoting logic.
Run: pytest -q
"""
from pathlib import Path
from atlas_extractor.decode import decode_mime
from atlas_extractor.dequote import dequote
from atlas_extractor.pipeline import stage123
from atlas_extractor.fetch import FetchedRaw
from datetime import datetime, timezone
# Sample e-mail thread shared by the tests in this module; located in the
# "fixtures" directory that sits next to this test file.
FIXTURE = Path(__file__).parent.joinpath("fixtures", "sample_thread.eml")
def _load_fixture() -> bytes:
    """Return the raw bytes of the sample thread fixture file."""
    with FIXTURE.open("rb") as handle:
        return handle.read()
def test_decode_basic():
    """Decoding the fixture yields its message id, sender and full body."""
    message = decode_mime(_load_fixture())

    assert message.msg_id == "demo-001@us-saas.cn"
    assert message.from_addr[1] == "wang@us-saas.cn"

    body = message.body_text
    assert "张三" in body
    # Decoding alone does not dequote: the quoted history is still in the body.
    assert "On Mon, Apr 16" in body
def test_dequote_strips_english_marker():
    """The English "On Mon, Apr 16" quote line is removed via a marker strategy."""
    body = decode_mime(_load_fixture()).body_text
    cleaned = dequote(body)

    assert "On Mon, Apr 16" not in cleaned.text_clean
    # At least one reported strategy name must mention "marker".
    assert any("marker" in strategy for strategy in cleaned.strategies_used)
def test_dequote_strips_signature_separator():
    """The signature block after the "-- " separator is removed."""
    body = decode_mime(_load_fixture()).body_text
    cleaned = dequote(body)

    # The signature block ("-- \n王\nCEO, US-SaaS...") must not survive.
    text = cleaned.text_clean
    assert "+86-138" not in text
    assert "CEO, US-SaaS" not in text
    # At least one reported strategy name must mention "signature".
    assert any("signature" in strategy for strategy in cleaned.strategies_used)
def test_dequote_strips_disclaimer():
    """The confidentiality disclaimer text is absent from the cleaned output."""
    body = decode_mime(_load_fixture()).body_text
    cleaned = dequote(body)

    # Only the outcome is checked, not the strategy name: the disclaimer may
    # be removed by a "disclaimer" strategy, or swallowed by the signature
    # stripper when it sits inside the signature block. Either is acceptable.
    assert "保密信息" not in cleaned.text_clean
def test_dequote_keeps_real_content():
    """Dequoting must leave the author's own message text intact."""
    body = decode_mime(_load_fixture()).body_text
    text = dequote(body).text_clean

    assert "PRJ-001" in text
    assert "我上次问已经过去 6 天了" in text
    assert "不要等我再问第四次" in text
def test_dequote_chars_stripped_meaningful():
    """The reported number of stripped characters exceeds a trivial amount."""
    body = decode_mime(_load_fixture()).body_text
    cleaned = dequote(body)

    removed = cleaned.chars_stripped
    assert removed > 50, "Expected non-trivial cleanup"
def test_pipeline_e2e_via_fetched_raw():
    """Run the fixture through ``stage123`` wrapped in a ``FetchedRaw`` record.

    The cleaned text must keep the real content ("PRJ-001") and contain
    neither the disclaimer nor the English quote marker.
    """
    received_at = datetime.now(timezone.utc)
    payload = _load_fixture()
    fetched = FetchedRaw(
        account="test",
        folder="local",
        uid="1",
        internal_date=received_at,
        raw_mime=payload,
    )

    text = stage123(fetched).dequoted.text_clean

    assert "PRJ-001" in text
    assert "保密信息" not in text
    assert "On Mon" not in text