Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 43 additions & 1 deletion ROADMAP.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,49 @@ Core engine, CLI, and all major subsystems are stable. Summary of shipped featur

## In Progress

_(Currently empty)_
### Tauri Desktop App — Mission Control for AI Agents

**Priority: Urgent** — The desktop app is the primary interface for users to manage, monitor, and assist AI browsing agents.

**Phase 1 — Semantic Tree Viewer + CAPTCHA Handoff (current)**
- [ ] Semantic tree viewer panel — render ARIA role tree with interactive nodes in Tauri dashboard
- [ ] Per-instance controls — URL bar, navigate, agent status (idle/running/waiting-challenge)
- [ ] CAPTCHA handoff — when the agent hits a challenge, pop up an OS webview (WKWebView/WebKitGTK/WebView2) for the user to solve it, then sync the resulting cookies back to the headless browser via CDP `Network.setCookie`
- [ ] Cookie bridge — `tokio-tungstenite` WebSocket client to inject cookies into headless CDP server
- [ ] Agent action log — real-time log of agent actions (navigate, click, type, wait) streamed from CDP events
- [ ] Cross-platform — dashboard is pure HTML/CSS (no OS webview dependency for primary view); CAPTCHA popup uses OS webview only when needed

**Phase 2 — Multi-Agent Dashboard**
- [ ] Multiple concurrent agent instances — spawn/manage N agents in one window
- [ ] Agent status grid — see all agents at a glance with status indicators (running, idle, stuck, CAPTCHA)
- [ ] Live agent action streaming — watch each agent's actions in real-time via CDP event bus
- [ ] Take-over button — pause agent, let user manually interact, then resume agent
- [ ] Agent conversation panel — show the LLM conversation alongside browser actions

**Phase 3 — Rendered View (Optional)**
- [ ] Rendered page tab — OS webview shows actual page pixels (WKWebView on macOS, WebKitGTK on Linux, WebView2 on Windows)
- [ ] Split view — semantic tree on left, rendered pixels on right
- [ ] Screenshot capture — use pardus-core screenshot feature (chromiumoxide) for pixel-perfect captures

**Architecture:**
```
┌─ Mission Control ──────────────────────────────────────┐
│ ┌─ Agents ─────┐ ┌─ Semantic Tree ───────────────────┐ │
│ │ ● Agent 1    │ │ [Document]                        │ │
│ │   Shopping   │ │ ├── [Nav] "Menu"                  │ │
│ │   Running    │ │ ├── [Main]                        │ │
│ │              │ │ │   ├── [H1] "Welcome"            │ │
│ │ ● Agent 2    │ │ │   ├── [TextBox #3] "Email"      │ │
│ │   Research   │ │ │   └── [Button #4] "Submit"      │ │
│ │   ⚠ CAPTCHA  │ │ └── [Footer]                      │ │
│ └──────────────┘ └───────────────────────────────────┘ │
│ ┌─ Action Log ───────────────────────────────────────┐ │
│ │ 12:03:01 Navigate → shop.example.com               │ │
│ │ 12:03:02 Click [#5] "Add to Cart"                  │ │
│ │ 12:03:03 ⚠ CAPTCHA detected — Cloudflare           │ │
│ └────────────────────────────────────────────────────┘ │
└────────────────────────────────────────────────────────┘
```

---

Expand Down
150 changes: 109 additions & 41 deletions ai-agent/pardus-browser/src/agent/Agent.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { LLMClient, LLMConfig, Message, getSystemPrompt } from '../llm/index.js';
import { LLMClient, LLMConfig, Message, getSystemPrompt, compactMessages, truncateToolResult, ContextConfig } from '../llm/index.js';
import { ToolExecutor } from '../tools/executor.js';
import { BrowserManager } from '../core/index.js';
import { BrowserToolName } from '../tools/definitions.js';
Expand All @@ -13,13 +13,15 @@ interface AgentOptions {
maxRounds?: number;
/** Tool execution configuration */
toolConfig?: {
/** Enable parallel execution where safe (default: false) */
/** Enable parallel execution where safe (default: true) */
parallel?: boolean;
/** Continue on tool failure (default: true) */
continueOnError?: boolean;
/** Default retry configuration for all tools */
defaultRetryConfig?: ToolExecutionConfig;
};
/** Context window management configuration */
contextConfig?: Partial<ContextConfig>;
}

/**
Expand All @@ -36,17 +38,25 @@ export class Agent {
private browserManager: BrowserManager;
private isRunning = false;
private toolConfig: AgentOptions['toolConfig'];
private contextConfig: ContextConfig;

constructor(browserManager: BrowserManager, options: AgentOptions) {
this.browserManager = browserManager;
this.llm = new LLMClient(options.llmConfig);
this.toolExecutor = new ToolExecutor(browserManager);
this.maxRounds = options.maxRounds ?? 50;
this.toolConfig = {
parallel: false,
parallel: true,
continueOnError: true,
...options.toolConfig,
};
this.contextConfig = {
maxTokens: 100_000,
keepRecentMessages: 10,
maxToolResultChars: 6000,
charsPerToken: 4,
...options.contextConfig,
};

// Initialize with system prompt
this.messages.push({
Expand Down Expand Up @@ -128,22 +138,21 @@ export class Agent {
return errorMessage;
}

// Add all tool results to conversation
// Add all tool results to conversation — toolCallId flows from the LLM response
for (const result of toolResults) {
// Find the original tool call ID
const toolCall = response.toolCalls.find(t =>
t.name === result.name &&
JSON.stringify(t.arguments) === JSON.stringify(result.args)
);

const content = result.success
? truncateToolResult(result.content || '', this.contextConfig.maxToolResultChars)
: `Error: ${result.error || 'Unknown error'}\n\nPartial result: ${result.content || 'none'}`;

this.messages.push({
role: 'tool',
tool_call_id: toolCall?.id || 'unknown',
content: result.success
? (result.content || '')
: `Error: ${result.error || 'Unknown error'}\n\nPartial result: ${result.content || 'none'}`,
tool_call_id: result.toolCallId || 'unknown',
content,
});
}

// Compact conversation history if approaching context limit
this.messages = compactMessages(this.messages, this.contextConfig);
}

if (rounds >= this.maxRounds) {
Expand All @@ -169,7 +178,7 @@ export class Agent {
if (!this.toolConfig?.parallel) {
// Sequential execution
const results: ToolExecutionResult[] = [];

for (const call of toolCalls) {
console.log(`[Tool] ${call.name}: ${JSON.stringify(call.arguments)}`);

Expand All @@ -180,6 +189,7 @@ export class Agent {
);

results.push({
toolCallId: call.id,
name: call.name,
args: call.arguments,
success: result.success,
Expand All @@ -196,12 +206,12 @@ export class Agent {
console.log(`[Tool Error] ${result.error}`);
}
}

return results;
} else {
// Parallel execution with grouping
// Convert to format expected by executeTools
const tools = toolCalls.map(call => ({
toolCallId: call.id,
name: call.name as BrowserToolName,
args: call.arguments,
config: this.toolConfig?.defaultRetryConfig,
Expand All @@ -220,42 +230,100 @@ export class Agent {
console.log(`[Tool Error] ${result.error}`);
}
}

return parallelResult.results;
}
}

/**
* Stream a response for interactive CLI
*
* Note: Tool calls still happen after the stream completes
* Stream a response for interactive CLI with full tool call support.
*
* Yields text chunks as they arrive. Tool calls are buffered and
* executed after the stream completes, then the loop continues
* (same as chat() but with streamed text output).
*/
async *streamChat(userMessage: string): AsyncGenerator<string, string, unknown> {
// For streaming, we currently don't support mid-stream tool calls
// The LLM will respond with text, then we check for tool calls
// This is a simplified version - full implementation would parse tool calls from stream
if (this.isRunning) {
throw new Error('Agent is already processing a message');
}

this.messages.push({
role: 'user',
content: userMessage,
});
this.isRunning = true;

// For simplicity in streaming mode, we don't use tools
// Full implementation would parse tool calls from stream
const stream = this.llm.streamChat(this.messages);
let fullResponse = '';
try {
this.messages.push({ role: 'user', content: userMessage });

for await (const chunk of stream) {
fullResponse += chunk;
yield chunk;
}
let rounds = 0;

this.messages.push({
role: 'assistant',
content: fullResponse,
});
while (rounds < this.maxRounds) {
rounds++;

const result = await this.llm.streamChat(this.messages);

// Yield any text chunks
for (const chunk of result.textChunks) {
yield chunk;
}

return fullResponse;
// No tool calls — done
if (!result.toolCalls || result.toolCalls.length === 0) {
this.messages.push({
role: 'assistant',
content: result.content ?? '',
});
return result.content ?? '';
}

// Add assistant message with tool calls
this.messages.push({
role: 'assistant',
content: result.content ?? '',
tool_calls: result.toolCalls.map(call => ({
id: call.id,
type: 'function' as const,
function: {
name: call.name,
arguments: JSON.stringify(call.arguments),
},
})),
});

// Execute tool calls
const toolResults = await this.executeToolCalls(result.toolCalls);

const hasFailures = toolResults.some(r => !r.success);
if (hasFailures && !this.toolConfig?.continueOnError) {
const errorMessage = 'Tool execution failed. Aborting conversation.';
this.messages.push({ role: 'assistant', content: errorMessage });
yield `\n\n${errorMessage}`;
return errorMessage;
}

// Add tool results
for (const res of toolResults) {
const content = res.success
? truncateToolResult(res.content || '', this.contextConfig.maxToolResultChars)
: `Error: ${res.error || 'Unknown error'}\n\nPartial result: ${res.content || 'none'}`;

this.messages.push({
role: 'tool',
tool_call_id: res.toolCallId || 'unknown',
content,
});
}

// Compact context
this.messages = compactMessages(this.messages, this.contextConfig);

// The loop continues — the next iteration will stream the LLM's
// response to the tool results (which may include more tool calls).
}

const limitMsg = 'Maximum number of tool call rounds reached.';
yield `\n\n${limitMsg}`;
return limitMsg;
} finally {
this.isRunning = false;
}
}

/**
Expand Down
Loading
Loading