Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "attune-author"
version = "0.5.1"
version = "0.6.0"
description = "Documentation authoring and maintenance for the attune ecosystem — generate, maintain, and validate help content with AI assistance."
readme = {file = "README.md", content-type = "text/markdown"}
requires-python = ">=3.10"
Expand Down
36 changes: 36 additions & 0 deletions src/attune_author/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,27 @@ def _build_parser() -> argparse.ArgumentParser:
help="Report stale features without regenerating.",
)

p_cache = sub.add_parser(
"cache",
help="Manage the on-disk polish cache",
description=(
"Inspect and clear the on-disk LLM polish cache used by the "
"generator. Entries are pruned automatically by mtime (default "
"TTL 30 days, configurable via ATTUNE_AUTHOR_POLISH_CACHE_TTL_SECONDS); "
"this command exposes a manual nuke."
),
)
cache_sub = p_cache.add_subparsers(dest="cache_command", help="Cache subcommands")
cache_sub.add_parser(
"clear",
help="Delete every cached polish entry",
description=(
"Remove all entries from the polish cache directory. Useful "
"after a prompt change in attune-author itself, or to reclaim "
"disk space without waiting for the TTL sweep."
),
)

p_docs = sub.add_parser(
"docs",
help="Generate docs from source (requires [ai])",
Expand Down Expand Up @@ -220,6 +241,7 @@ def _dispatch(args: argparse.Namespace, parser: argparse.ArgumentParser) -> int:
"generate": _cmd_generate,
"regenerate": _cmd_regenerate,
"docs": _cmd_docs,
"cache": _cmd_cache,
}
handler = handlers.get(args.command)
if handler is None:
Expand Down Expand Up @@ -403,6 +425,20 @@ def _cmd_regenerate(args: argparse.Namespace) -> int:
return 0


def _cmd_cache(args: argparse.Namespace) -> int:
    """Handle the ``cache`` command and its subcommands."""
    # Imported lazily so plain CLI startup never pays the polish-module cost.
    from attune_author.polish import _cache_dir, clear_cache

    # Guard clause: anything other than the sole "clear" subcommand
    # (including no subcommand at all) gets a usage hint and a failure code.
    if args.cache_command != "clear":
        print("Usage: attune-author cache clear", file=sys.stderr)
        return 1

    removed = clear_cache()
    print(f"Cleared {removed} entries from {_cache_dir()}")
    return 0


def _cmd_docs(args: argparse.Namespace) -> int:
"""Handle the docs command."""
if not args.target:
Expand Down
80 changes: 57 additions & 23 deletions src/attune_author/doc_gen/_anthropic.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,17 @@
import logging
import os
import re
import time
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from anthropic import Anthropic

logger = logging.getLogger(__name__)

_MAX_RETRIES = 3
_RETRY_BASE_DELAY = 1.0 # seconds; doubles each attempt

#: Source-content character budgets per doc-gen stage. Tuned so
#: the outline and review stages see enough code for accuracy
#: without dominating the prompt context, while the write stage
Expand Down Expand Up @@ -55,6 +59,19 @@ def _redact(text: str) -> str:
return _KEY_PATTERN.sub(_REDACTED, text)


def _is_retryable(exc: Exception) -> bool:
"""Return True for transient Anthropic errors that are safe to retry."""
try:
from anthropic import APIConnectionError, APIStatusError
except ImportError:
return False
if isinstance(exc, APIConnectionError):
return True
if isinstance(exc, APIStatusError):
return exc.status_code in (429, 529)
return False


def get_client(api_key: str | None = None) -> Anthropic:
"""Instantiate an Anthropic client.

Expand Down Expand Up @@ -85,11 +102,11 @@ def call_anthropic(
model: str,
max_tokens: int,
) -> str:
"""Make a single-turn ``messages.create`` call.
"""Make a single-turn ``messages.create`` call with retry/backoff.

Wraps the SDK call so every caller shares identical error
handling, message shape, and response unwrapping. Any
exception raised by the SDK is re-raised as
Retries up to ``_MAX_RETRIES`` times on transient errors (rate
limits and overload responses). Non-transient SDK errors fail
immediately. All exceptions are re-raised as
:class:`AnthropicCallError` with a redacted message and an
empty ``__cause__`` chain to guarantee credential material
cannot leak through ``str(exc.__cause__)``.
Expand All @@ -106,23 +123,40 @@ def call_anthropic(
string if the response carried no content.

Raises:
AnthropicCallError: On any SDK or transport failure.
AnthropicCallError: On any SDK or transport failure after
retries are exhausted.
"""
try:
response = client.messages.create(
model=model,
max_tokens=max_tokens,
system=system,
messages=[{"role": "user", "content": user_message}],
)
except Exception as exc: # noqa: BLE001
# INTENTIONAL: every SDK exception type funnels through
# one redaction pass so credential material can't leak
# into logs, error surfaces, or upstream exception
# chains. `from None` strips __cause__ so callers
# inspecting the chain only ever see the redacted form.
raise AnthropicCallError(_redact(str(exc))) from None

if response.content:
return response.content[0].text
return ""
last_exc: Exception | None = None
for attempt in range(_MAX_RETRIES + 1):
if attempt:
delay = _RETRY_BASE_DELAY * (2 ** (attempt - 1))
logger.warning(
"Anthropic call failed (attempt %d/%d), retrying in %.1fs: %s",
attempt,
_MAX_RETRIES,
delay,
_redact(str(last_exc)),
)
time.sleep(delay)
try:
response = client.messages.create(
model=model,
max_tokens=max_tokens,
system=system,
messages=[{"role": "user", "content": user_message}],
)
if response.content:
return response.content[0].text
return ""
except Exception as exc: # noqa: BLE001
# INTENTIONAL: every SDK exception type funnels through
# one redaction pass so credential material can't leak
# into logs, error surfaces, or upstream exception
# chains. `from None` strips __cause__ so callers
# inspecting the chain only ever see the redacted form.
if _is_retryable(exc):
last_exc = exc
continue
raise AnthropicCallError(_redact(str(exc))) from None

raise AnthropicCallError(_redact(str(last_exc))) from None
69 changes: 60 additions & 9 deletions src/attune_author/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import ast
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
Expand All @@ -25,6 +26,55 @@

logger = logging.getLogger(__name__)

#: Cap on concurrent LLM calls during the parallel polish phase.
#: Sized to comfortably fit under Anthropic's per-minute rate
#: limits while still saturating the LLM-bound wall time of a
#: typical ``regenerate --all-kinds`` run.
_POLISH_MAX_WORKERS = 4


def _parallel_polish(
pending: list[tuple[str, str, Path]],
feature: object,
source_info: object,
use_rag: bool,
) -> dict[str, tuple[str, Path]]:
"""Polish a batch of rendered templates concurrently.

Args:
pending: List of (depth, rendered_content, out_path) tuples.
feature: Feature being documented (read-only, thread-safe).
source_info: Extracted source info (read-only, thread-safe).
use_rag: Whether to use RAG grounding during polish.

Returns:
Mapping of depth -> (polished_content, out_path). Raises
the first exception encountered (propagated from the future).
"""

def _task(depth: str, content: str, out_path: Path) -> tuple[str, str, Path]:
polished = _maybe_polish(
content,
feature, # type: ignore[arg-type]
source_info, # type: ignore[arg-type]
template_type=depth,
use_rag=use_rag,
)
return depth, polished, out_path

results: dict[str, tuple[str, Path]] = {}
workers = min(len(pending), _POLISH_MAX_WORKERS)
with ThreadPoolExecutor(max_workers=workers) as executor:
futures = {
executor.submit(_task, depth, content, out_path): depth
for depth, content, out_path in pending
}
for future in as_completed(futures):
depth, polished, out_path = future.result()
results[depth] = (polished, out_path)
return results


#: Core progressive-depth template kinds. These form the
#: progressive disclosure path that attune-help renders:
#: concept → task → reference. They are generated by
Expand Down Expand Up @@ -234,6 +284,9 @@ def generate_feature_templates(
", ".join(feature.doc_paths[1:]),
)

# Phase 1: render all templates (fast Jinja2, sequential).
# Determines which depths are active and builds the rendered skeleton.
pending: list[tuple[str, str, Path]] = []
for depth in target_depths:
if depth not in _ALL_TEMPLATE_NAMES:
logger.warning("Unknown template kind '%s', skipping", depth)
Expand Down Expand Up @@ -278,17 +331,15 @@ def generate_feature_templates(
source_hash=source_hash,
source_info=source_info,
)
pending.append((depth, content, out_path))

# LLM polish pass — improves writing quality
content = _maybe_polish(
content,
feature,
source_info,
template_type=depth,
use_rag=use_rag,
)
# Phase 2: LLM polish — run all depths concurrently.
polished = _parallel_polish(pending, feature, source_info, use_rag)

out_path.write_text(content, encoding="utf-8")
# Phase 3: write results in original depth order.
for depth, content, out_path in pending:
final_content, _ = polished[depth]
out_path.write_text(final_content, encoding="utf-8")
result.templates.append(
GeneratedTemplate(
feature=feature.name,
Expand Down
Loading
Loading