This adds the full Atlas (总助 Claw / 老板视角项目执行雷达) scaffolding as a sibling profile to the existing Vega general-purpose assistant. All Atlas content lives under atlas/ to keep the existing top-level skeleton intact. What's included: - atlas/IDENTITY.md, SOUL.md, USER.md, AGENTS.md, MEMORY.md, BOOTSTRAP.md, HEARTBEAT.md, TOOLS.md (+ zh-CN mirrors) — full OpenClaw 8-piece set matching the zero-cca convention - atlas/skills/ — 6 sub-skills with frontmatter: claw-email-parser / claw-project-tracker / claw-people-observer / claw-customer-radar / claw-boss-distiller / claw-report-writer - atlas/skills/claw-boss-distiller/ — adapter notes for nuwa-skill, 5-layer boss_skill seed template (23 rules across Expression DNA / Mental Models / Decision Heuristics / Anti-Patterns / Honest Boundaries), and a complete synthetic distillation demo (10 input emails -> validated 5-layer output) - atlas/mcp-tools/email-extractor/ — Python implementation of stages 1-3 (fetch + decode + dequote), 7 pytest tests passing, CLI: atlas-extract - atlas/state-schemas/ — formal JSON schemas for project / person / customer cards with the no-employee-rating hard constraint baked in - atlas/client-deck/ — 2-page client-facing pitch document - autopilots/atlas-*.yaml — 5 autopilot configs (daily / weekly / monthly / quarterly + andon event-triggered) for a future Multica-side scheduler Notes: - nuwa-skill (MIT, https://github.com/alchaincyf/nuwa-skill) NOT vendored; fetch at deploy time via instructions in atlas/skills/claw-boss-distiller/upstream/README.md - Vega-side prompts/skills/tools/autopilots/docs scaffold left untouched - Top-level README.md updated with a brief Atlas pointer; rest preserved
82 lines
2.6 KiB
Python
82 lines
2.6 KiB
Python
"""Tests for Stage 3 dequoting logic.
|
|
|
|
Run: pytest -q
|
|
"""
|
|
|
|
from pathlib import Path
|
|
|
|
from atlas_extractor.decode import decode_mime
|
|
from atlas_extractor.dequote import dequote
|
|
from atlas_extractor.pipeline import stage123
|
|
from atlas_extractor.fetch import FetchedRaw
|
|
|
|
from datetime import datetime, timezone
|
|
|
|
# Path to the sample .eml fixture, resolved relative to this test file's
# directory so the tests do not depend on the current working directory.
FIXTURE = Path(__file__).parent / "fixtures" / "sample_thread.eml"
|
|
|
|
|
|
def _load_fixture() -> bytes:
    """Return the raw bytes of the sample email stored at ``FIXTURE``."""
    with FIXTURE.open("rb") as handle:
        return handle.read()
|
|
|
|
|
|
def test_decode_basic():
    """Decoding the fixture yields the expected headers and the full body."""
    message = decode_mime(_load_fixture())

    assert message.msg_id == "demo-001@us-saas.cn"

    sender_address = message.from_addr[1]
    assert sender_address == "wang@us-saas.cn"

    body = message.body_text
    assert "张三" in body
    # No dequoting has happened yet, so the quoted history is still present.
    assert "On Mon, Apr 16" in body
|
|
|
|
|
|
def test_dequote_strips_english_marker():
    """The English "On Mon, Apr 16" quote header is removed by a marker strategy."""
    body = decode_mime(_load_fixture()).body_text
    outcome = dequote(body)

    assert "On Mon, Apr 16" not in outcome.text_clean

    # At least one reported strategy name must mention "marker".
    marker_reported = any("marker" in name for name in outcome.strategies_used)
    assert marker_reported
|
|
|
|
|
|
def test_dequote_strips_signature_separator():
    """The signature block after the "-- " separator is removed."""
    body = decode_mime(_load_fixture()).body_text
    outcome = dequote(body)

    # Nothing from the signature block that follows the "-- " separator
    # may survive in the cleaned text.
    assert "+86-138" not in outcome.text_clean
    assert "CEO, US-SaaS" not in outcome.text_clean

    # At least one reported strategy name must mention "signature".
    signature_reported = any(
        "signature" in name for name in outcome.strategies_used
    )
    assert signature_reported
|
|
|
|
|
|
def test_dequote_strips_disclaimer():
    """The confidentiality disclaimer text is absent from the cleaned body."""
    body = decode_mime(_load_fixture()).body_text
    cleaned = dequote(body).text_clean

    # Only the outcome is checked: the text may be removed by a dedicated
    # "disclaimer" strategy, or swallowed by the signature stripper when the
    # disclaimer sits inside the signature block. Either is acceptable.
    assert "保密信息" not in cleaned
|
|
|
|
|
|
def test_dequote_keeps_real_content():
    """Dequoting leaves the author's own message text in the cleaned body."""
    body = decode_mime(_load_fixture()).body_text
    outcome = dequote(body)

    # Phrases written by the sender that must survive cleanup.
    must_survive = (
        "PRJ-001",
        "我上次问已经过去 6 天了",
        "不要等我再问第四次",
    )
    for phrase in must_survive:
        assert phrase in outcome.text_clean
|
|
|
|
|
|
def test_dequote_chars_stripped_meaningful():
    """Dequoting the fixture reports more than 50 stripped characters."""
    body = decode_mime(_load_fixture()).body_text
    stripped_count = dequote(body).chars_stripped

    assert stripped_count > 50, "Expected non-trivial cleanup"
|
|
|
|
|
|
def test_pipeline_e2e_via_fetched_raw():
    """Run ``stage123`` on a ``FetchedRaw`` built from the fixture and check the cleaned text."""
    # Wrap the fixture bytes in a FetchedRaw record with placeholder
    # account/folder/uid values.
    record_fields = {
        "account": "test",
        "folder": "local",
        "uid": "1",
        "internal_date": datetime.now(timezone.utc),
        "raw_mime": _load_fixture(),
    }
    fetched = FetchedRaw(**record_fields)

    processed = stage123(fetched)

    # Real content is kept; disclaimer and quoted-history header are gone.
    assert "PRJ-001" in processed.dequoted.text_clean
    assert "保密信息" not in processed.dequoted.text_clean
    assert "On Mon" not in processed.dequoted.text_clean
|