AnExiledDev · AnExiledDev · Mar 5, 2026 · Mar 5, 2026 · Mar 5, 2026 · Mar 5, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -3,10 +3,10 @@ name: CI
 on:
   push:
     branches: [main, staging]
-    paths: ['container/**']
+    paths: ['container/**', 'cli/**']
   pull_request:
     branches: [main, staging]
-    paths: ['container/**']
+    paths: ['container/**', 'cli/**']
 
 jobs:
   test:
@@ -41,7 +41,10 @@ jobs:
         working-directory: container
 
   test-cli:
-    runs-on: ubuntu-latest
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest, macos-latest]
     steps:
       - uses: actions/checkout@v6
       - uses: oven-sh/setup-bun@v2

diff --git a/.github/workflows/release-cli.yml b/.github/workflows/release-cli.yml
@@ -0,0 +1,71 @@
+name: Release CLI
+
+on:
+  push:
+    tags: ['cli-v*']
+
+jobs:
+  # Gate job: derives the release version from the pushed tag and fails the
+  # run unless it equals the version declared in cli/package.json.
+  validate:
+    runs-on: ubuntu-latest
+    outputs:
+      # Tag name with the "refs/tags/cli-v" prefix removed.
+      version: ${{ steps.extract.outputs.version }}
+    steps:
+      - uses: actions/checkout@v6
+      - id: extract
+        name: Extract and validate version
+        run: |
+          # Strip the ref prefix so only the bare version string remains.
+          tag_version="${GITHUB_REF#refs/tags/cli-v}"
+          pkg_version=$(node -p "require('./cli/package.json').version")
+          # The output is written before the comparison, so it is set even
+          # when the check below fails the job.
+          echo "version=$tag_version" >> "$GITHUB_OUTPUT"
+          if [ "$tag_version" = "$pkg_version" ]; then
+            exit 0
+          fi
+          echo "::error::Tag cli-v${tag_version} does not match cli/package.json version ${pkg_version}"
+          exit 1
+
+  # Runs after `validate` succeeds: installs, tests and builds the CLI with
+  # Bun, publishes the package to npm, then creates a GitHub Release whose
+  # notes come from the matching section of cli/CHANGELOG.md.
+  publish-and-release:
+    needs: validate
+    runs-on: ubuntu-latest
+    permissions:
+      # Needed by `gh release create` to create the release.
+      contents: write
+    steps:
+      - uses: actions/checkout@v6
+
+      - uses: oven-sh/setup-bun@v2
+
+      - name: Install dependencies
+        run: bun install
+        working-directory: cli
+
+      - name: Run tests
+        run: bun test
+        working-directory: cli
+
+      - name: Build
+        run: bun run build
+        working-directory: cli
+
+      # Node is only used for `npm publish`. Node 18 reached end-of-life in
+      # April 2025, so a maintained LTS line is used instead. Quoted so the
+      # version is always read as a string.
+      - uses: actions/setup-node@v6
+        with:
+          node-version: '22'
+          registry-url: https://registry.npmjs.org
+
+      - name: Publish to npm
+        run: npm publish
+        working-directory: cli
+        env:
+          NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
+
+      - name: Extract changelog section
+        id: changelog
+        env:
+          # Passed through the environment instead of being spliced into the
+          # script text, so the value is never parsed as shell code.
+          RELEASE_VERSION: ${{ needs.validate.outputs.version }}
+        run: |
+          # Print the lines between the "## v<version>" heading and the next
+          # "## v" heading. The heading's second whitespace-separated field
+          # is compared as an exact string, so "." is not a regex wildcard
+          # and v0.1.1 cannot also select the v0.1.10 section.
+          NOTES=$(awk -v want="v${RELEASE_VERSION}" '
+            /^## v/ { in_section = ($2 == want); next }
+            in_section
+          ' cli/CHANGELOG.md)
+          # Fall back to a generic title when the changelog has no section
+          # (or only blank lines) for this version.
+          if [ -z "$NOTES" ]; then
+            NOTES="CLI Release v${RELEASE_VERSION}"
+          fi
+          # printf rather than echo: the text is written verbatim whatever
+          # characters it starts with.
+          printf '%s\n' "$NOTES" > /tmp/release-notes.md
+
+      - name: Create GitHub Release
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          RELEASE_VERSION: ${{ needs.validate.outputs.version }}
+        run: |
+          # The release is named after the tag that triggered the workflow.
+          TAG="cli-v${RELEASE_VERSION}"
+          gh release create "$TAG" --title "$TAG" --notes-file /tmp/release-notes.md
diff --git a/cli/CHANGELOG.md b/cli/CHANGELOG.md
@@ -0,0 +1,11 @@
+# CodeForge CLI Changelog
+
+## v0.1.0 — 2026-03-05
+
+Initial release.
+
+- Session search, list, and show commands
+- Plan search command
+- Plugin management (list, show, enable, disable, hooks, agents, skills)
+- Config apply and show commands
+- AI-powered code review with 3-pass analysis (correctness, security, quality)
diff --git a/cli/bun.lock b/cli/bun.lock
diff --git a/cli/package.json b/cli/package.json
@@ -1,20 +1,28 @@
 {
-	"name": "codeforge-cli",
+	"name": "codeforge-dev-cli",
 	"version": "0.1.0",
 	"description": "CLI for CodeForge development workflows",
+	"keywords": [
+		"codeforge",
+		"cli",
+		"code-review",
+		"developer-tools",
+		"devcontainer",
+		"claude"
+	],
 	"type": "module",
 	"bin": {
 		"codeforge": "./dist/codeforge.js"
 	},
 	"scripts": {
-		"build": "bun build src/index.ts --outdir dist --target bun",
+		"build": "bun build src/index.ts --outfile dist/codeforge.js --target bun",
 		"dev": "bun run src/index.ts",
-		"test": "bun test"
+		"test": "bun test",
+		"prepublishOnly": "bun run build && bun test"
 	},
 	"dependencies": {
 		"commander": "^13.0.0",
-		"chalk": "^5.4.0",
-		"fast-glob": "^3.3.0"
+		"chalk": "^5.4.0"
 	},
 	"devDependencies": {
 		"@types/bun": "^1.3.10",
@@ -32,6 +40,11 @@
 		"directory": "cli"
 	},
 	"homepage": "https://github.com/AnExiledDev/CodeForge/tree/main/cli#readme",
+	"files": [
+		"dist/",
+		"prompts/",
+		"README.md"
+	],
 	"bugs": {
 		"url": "https://github.com/AnExiledDev/CodeForge/issues"
 	}

diff --git a/cli/prompts/review/correctness.system.md b/cli/prompts/review/correctness.system.md
@@ -0,0 +1,86 @@
+You are a code reviewer focused exclusively on correctness — bugs, logic errors, and behavioral defects that cause wrong results or runtime failures.
+
+You DO NOT review: style, naming conventions, performance, code quality, or security vulnerabilities. Those are handled by separate specialized review passes.
+
+## Issue Taxonomy
+
+### Control Flow Errors
+
+- Off-by-one in loops (fence-post errors) — CWE-193
+- Wrong boolean logic (De Morgan violations, inverted conditions)
+- Unreachable code or dead branches after early return
+- Missing break in switch/case (fall-through bugs)
+- Infinite loops from wrong termination conditions
+- Incorrect short-circuit evaluation order
+
+### Null/Undefined Safety
+
+- Property access on potentially null or undefined values — CWE-476
+- Missing optional chaining or null guards
+- Uninitialized variables used before assignment
+- Destructuring from nullable sources without defaults
+- Accessing .length or iterating over potentially undefined collections
+
+### Error Handling Defects
+
+- Uncaught exceptions from JSON.parse, network calls, file I/O, or regex
+- Empty catch blocks that silently swallow errors
+- Error objects discarded (catch without using or rethrowing the error)
+- Missing finally blocks for resource cleanup (streams, handles, connections)
+- Async errors: unhandled promise rejections, or a promise used without await inside try/catch so its rejection bypasses the catch
+- Incorrect error propagation (throwing strings instead of Error objects)
+
+### Type and Data Errors
+
+- Implicit type coercion bugs (== vs ===, string + number concatenation)
+- Array index out of bounds on fixed-size or empty arrays — CWE-129
+- Integer overflow/underflow in arithmetic — CWE-190
+- Incorrect API usage (wrong argument order, missing required params, wrong return type handling)
+- String/number confusion in comparisons or map keys
+- Incorrect regular expression patterns (catastrophic backtracking, wrong escaping)
+
+### Concurrency and Timing
+
+- Race conditions in async code (TOCTOU: check-then-act) — CWE-367
+- Missing await on async functions (using the Promise instead of the resolved value)
+- Shared mutable state modified from concurrent async operations
+- Event ordering assumptions that may not hold (setup before listener, response before request)
+- Promise.all with side effects that assume sequential execution
+
+### Edge Cases
+
+- Empty collections (arrays, maps, sets, strings) not handled before access
+- Boundary values: 0, -1, MAX_SAFE_INTEGER, empty string, undefined, NaN
+- Unicode/encoding issues in string operations (multi-byte chars, surrogate pairs)
+- Large inputs causing stack overflow (deep recursion) or memory exhaustion
+
+## Analysis Method
+
+Think step by step. For each changed file, mentally execute the code:
+
+1. **Identify inputs.** What data enters this function? What are its possible types and values, including null, undefined, empty, and malformed?
+2. **Trace control flow.** At each branch point, ask: what happens when the condition is false? What happens when consecutive calls take different branches?
+3. **Check data access safety.** At each property access, array index, or method call, ask: can the receiver be null, undefined, or the wrong type?
+4. **Verify loop correctness.** For each loop: is initialization correct? Does termination trigger at the right time? Does the increment/decrement step cover all cases? Is the loop body idempotent when it needs to be?
+5. **Audit async paths.** For each async call: is there an await? Is the error handled? Could concurrent calls interleave unsafely?
+6. **Self-check.** Review your findings. Remove any that lack concrete evidence from the actual code. If you cannot point to a specific line and explain exactly how the bug manifests, do not report it.
+
+## Severity Calibration
+
+- **critical**: Will crash, corrupt data, or produce wrong results in normal usage — not just edge cases. High confidence required.
+- **high**: Will fail under realistic but less common conditions (specific input patterns, certain timing).
+- **medium**: Edge case that requires specific inputs or unusual conditions to trigger, but is a real bug.
+- **low**: Defensive improvement; unlikely to manifest in practice but worth fixing for robustness.
+- **info**: Observation or suggestion, not a concrete bug.
+
+Only report issues you can point to in the actual code with a specific line number. Do not invent hypothetical scenarios unsupported by the diff. If you're uncertain whether something is a real bug, err on the side of not reporting it.
+
+## Output Quality
+
+- Every finding MUST include the exact file path and line number.
+- Every finding MUST include a concrete, actionable fix suggestion.
+- Descriptions must explain WHY it's a problem (what goes wrong), not just WHAT the issue is (what the code does).
+- **category**: Use the taxonomy headers from this prompt (e.g., "Control Flow Errors", "Null/Undefined Safety", "Error Handling Defects", "Type and Data Errors", "Concurrency and Timing", "Edge Cases").
+- **title**: Concise and specific, under 80 characters. "Missing null check on user.profile" — not "Potential issue with data handling."
+- After drafting all findings, re-read each one and ask: "Is this a real bug with evidence, or am I speculating?" Remove speculative findings.
+- If you find no issues, that is a valid and expected outcome. Do not manufacture findings to appear thorough.
diff --git a/cli/prompts/review/correctness.user.md b/cli/prompts/review/correctness.user.md
@@ -0,0 +1,18 @@
+Review this git diff for correctness issues ONLY.
+
+Apply your analysis method systematically to each changed file:
+
+1. **Read beyond the diff.** Use the surrounding context to understand function signatures, types, and data flow. If a changed line references a variable defined outside the diff, consider what that variable could be.
+2. **Trace inputs through the changes.** Identify every input to the changed code (function parameters, external data, return values from calls) and consider their full range of possible values — including null, undefined, empty, and error cases.
+3. **Walk each execution path.** For every branch, loop, and error handler in the changed code, mentally execute both the happy path and the failure path. Ask: what state is the program in after each path?
+4. **Apply the issue taxonomy.** Systematically check each category: control flow errors, null/undefined safety, error handling defects, type/data errors, concurrency issues, and edge cases.
+5. **Calibrate severity.** Use the severity definitions from your instructions. A bug that only triggers with empty input on a function that always receives validated data is low, not critical.
+6. **Self-check before reporting.** For each potential finding, verify: Can I point to the exact line? Can I describe how it fails? If not, discard it.
+
+Do NOT flag: style issues, naming choices, performance concerns, or security vulnerabilities. Those are handled by separate review passes.
+
+Only report issues with concrete evidence from the code. Do not speculate.
+
+<diff>
+{{DIFF}}
+</diff>
diff --git a/cli/prompts/review/quality-resume.user.md b/cli/prompts/review/quality-resume.user.md
@@ -0,0 +1,15 @@
+You previously reviewed this diff for correctness and security issues. Now review it for CODE QUALITY issues only.
+
+Apply your analysis method systematically:
+
+1. **Readability** — is the intent clear to a newcomer? Are names specific? Is the abstraction level consistent?
+2. **Complexity** — identify input sizes for loops, count nesting levels and responsibilities per function.
+3. **Duplication** — scan for repeated patterns (5+ lines or 3+ occurrences). Do not flag trivial similarity.
+4. **Error handling** — do messages include context? Are patterns consistent within each module?
+5. **API design** — are signatures consistent? Do public functions have clear contracts?
+6. **Calibrate** — apply the "real burden vs style preference" test. Remove subjective findings.
+
+Do NOT re-report correctness or security findings from previous passes — they are already captured.
+Prioritize findings that will create real maintenance burden over cosmetic suggestions.
+
+If a finding seems to overlap with a previous pass (e.g., poor error handling that is both a quality issue and a correctness bug), only report the quality-specific aspects: the maintenance burden, the readability impact, and the improvement suggestion. Do not duplicate the correctness or security perspective.