"""Tests for Stage 3 dequoting logic. Run: pytest -q """ from pathlib import Path from atlas_extractor.decode import decode_mime from atlas_extractor.dequote import dequote from atlas_extractor.pipeline import stage123 from atlas_extractor.fetch import FetchedRaw from datetime import datetime, timezone FIXTURE = Path(__file__).parent / "fixtures" / "sample_thread.eml" def _load_fixture() -> bytes: return FIXTURE.read_bytes() def test_decode_basic(): decoded = decode_mime(_load_fixture()) assert decoded.msg_id == "demo-001@us-saas.cn" assert decoded.from_addr[1] == "wang@us-saas.cn" assert "张三" in decoded.body_text # The full body still contains the quoted history at this stage assert "On Mon, Apr 16" in decoded.body_text def test_dequote_strips_english_marker(): decoded = decode_mime(_load_fixture()) result = dequote(decoded.body_text) assert "On Mon, Apr 16" not in result.text_clean assert any("marker" in s for s in result.strategies_used) def test_dequote_strips_signature_separator(): decoded = decode_mime(_load_fixture()) result = dequote(decoded.body_text) # signature block "-- \n王\nCEO, US-SaaS..." should be gone assert "+86-138" not in result.text_clean assert "CEO, US-SaaS" not in result.text_clean assert any("signature" in s for s in result.strategies_used) def test_dequote_strips_disclaimer(): decoded = decode_mime(_load_fixture()) result = dequote(decoded.body_text) # disclaimer text must be gone — strategy may be "disclaimer" OR # may be subsumed by signature stripper if disclaimer sits inside the # signature block (which is fine — outcome is what matters). assert "保密信息" not in result.text_clean def test_dequote_keeps_real_content(): decoded = decode_mime(_load_fixture()) result = dequote(decoded.body_text) assert "PRJ-001" in result.text_clean assert "我上次问已经过去 6 天了" in result.text_clean assert "不要等我再问第四次" in result.text_clean def test_dequote_chars_stripped_meaningful(): decoded = decode_mime(_load_fixture()) result = dequote(decoded.body_text) assert result.chars_stripped > 50, "Expected non-trivial cleanup" def test_pipeline_e2e_via_fetched_raw(): raw = FetchedRaw( account="test", folder="local", uid="1", internal_date=datetime.now(timezone.utc), raw_mime=_load_fixture(), ) out = stage123(raw) assert "PRJ-001" in out.dequoted.text_clean assert "保密信息" not in out.dequoted.text_clean assert "On Mon" not in out.dequoted.text_clean