"""Command-line entry point. Three modes: atlas-extract imap --host imap.gmail.com --user X --password Y --state-dir ./state atlas-extract eml --input ./fixtures/sample.eml --state-dir ./state atlas-extract dir --input-dir ./test-emails --state-dir ./state Suitable for V0 dev + the demo flow. Production wraps this in an MCP server. """ from __future__ import annotations import json import os from datetime import datetime, timedelta, timezone from pathlib import Path from typing import Iterator import click from .fetch import FetchedRaw, fetch_imap from .pipeline import run_on_raws @click.group() def main() -> None: """Atlas / 总助 Claw — email extractor V0 (Stages 1-3).""" @main.command("imap") @click.option("--host", required=True) @click.option("--port", default=993, type=int) @click.option("--user", "username", required=True, envvar="ATLAS_IMAP_USER") @click.option("--password", required=True, envvar="ATLAS_IMAP_PASSWORD") @click.option("--folder", "folders", multiple=True, default=["INBOX", "Sent"]) @click.option("--state-dir", required=True, type=click.Path(path_type=Path)) @click.option( "--since-days", default=365, type=int, help="On cold start, only pull messages newer than N days.", ) @click.option("--max-per-run", default=5000, type=int) def imap_cmd(host, port, username, password, folders, state_dir, since_days, max_per_run) -> None: """Pull from a real IMAP account, run stages 1-3, write JSON.""" since = datetime.now(timezone.utc) - timedelta(days=since_days) state_dir = state_dir.resolve() state_dir.mkdir(parents=True, exist_ok=True) raws = fetch_imap( host=host, port=port, username=username, password=password, folders=list(folders), state_dir=state_dir, since=since, max_per_run=max_per_run, ) summary = run_on_raws(raws, state_dir) click.echo(json.dumps(summary, ensure_ascii=False, indent=2)) @main.command("eml") @click.option("--input", "eml_path", required=True, type=click.Path(exists=True, path_type=Path)) @click.option("--state-dir", required=True, type=click.Path(path_type=Path)) def eml_cmd(eml_path: Path, state_dir: Path) -> None: """Single .eml file → run stages 1-3.""" state_dir = state_dir.resolve() raws = _eml_iter([eml_path]) summary = run_on_raws(raws, state_dir) click.echo(json.dumps(summary, ensure_ascii=False, indent=2)) @main.command("dir") @click.option("--input-dir", required=True, type=click.Path(exists=True, file_okay=False, path_type=Path)) @click.option("--state-dir", required=True, type=click.Path(path_type=Path)) def dir_cmd(input_dir: Path, state_dir: Path) -> None: """Directory of .eml/.txt files → run stages 1-3.""" state_dir = state_dir.resolve() paths = sorted([p for p in input_dir.rglob("*") if p.is_file() and p.suffix.lower() in {".eml", ".txt"}]) raws = _eml_iter(paths) summary = run_on_raws(raws, state_dir) click.echo(json.dumps(summary, ensure_ascii=False, indent=2)) def _eml_iter(paths: list[Path]) -> Iterator[FetchedRaw]: for i, p in enumerate(paths, start=1): raw_bytes = p.read_bytes() # If it's a .txt without proper MIME headers, wrap minimally so decode doesn't choke if p.suffix.lower() == ".txt" and not raw_bytes.lstrip().startswith(b"From:"): raw_bytes = ( b"From: unknown@local\r\nTo: unknown@local\r\nSubject: " + p.stem.encode("utf-8", errors="replace") + b"\r\nMessage-ID: \r\n\r\n" + raw_bytes ) yield FetchedRaw( account=os.environ.get("ATLAS_LOCAL_ACCOUNT", "local"), folder="local", uid=str(i), internal_date=datetime.fromtimestamp(p.stat().st_mtime, tz=timezone.utc), raw_mime=raw_bytes, ) if __name__ == "__main__": main()