# pathfinder.example.yaml — CopilotKit/pathfinder (main branch)
# Pathfinder — agentic knowledge server for AI agents
# Copy this file to pathfinder.yaml and customize for your project.
# Full documentation: https://pathfinder.copilotkit.dev

server:
  # Server identity — placeholder values; replace with your project's own.
  name: my-project-docs
  version: "1.0.0"

  # ── Session limits (optional; uncomment a key to override its default) ──
  # max_sessions: 1000                # Global max concurrent sessions across all IPs (default: 1000)
  #                                   # When exceeded, returns HTTP 503 with JSON
  #                                   # {error, reason, totalSessions, maxSessions, retryAfterSeconds, contact}
  #                                   # and a Retry-After header.
  # max_sessions_per_ip: 20           # Max concurrent MCP sessions per IP (default: 20)
  #                                   # When exceeded, returns HTTP 429 with JSON
  #                                   # {error, reason, limit, currentCount, retryAfterSeconds, contact}
  #                                   # and a Retry-After header.
  # session_ttl_minutes: 30           # Idle session timeout for active sessions (default: 30)
  #                                   # "Active" = has invoked at least one tool (search, bash, etc.)
  # session_unused_ttl_minutes: 15    # Idle session timeout for unused sessions (default: 15)
  #                                   # "Unused" = connected but never invoked a tool.
  #                                   # Set to match Railway's 15-min SSE hard limit.
  # allowlist:                        # IPs / CIDRs that bypass max_sessions_per_ip. Empty by default.
  #   - "160.79.106.35"               # Example: Anthropic Assistant crawler
  #   - "10.0.0.0/8"                  # Example: internal health-probe CIDR
  # NOTE: behind a reverse proxy (Railway, Fly, etc.), allowlist entries only match
  # when trust_proxy is enabled (true, a hop count, or a trusted-proxy CIDR list —
  # see "Accepted values" below). With trust_proxy: false the server sees the
  # proxy IP for every request.
  #
  # ⚠️  SECURITY WARNING — trust_proxy ⚠️
  # When true, the server honors X-Forwarded-For and populates req.ip from
  # the leftmost entry. Enabling trust_proxy is REQUIRED when the server runs
  # behind a reverse proxy (Railway, Fly, Nginx, etc.) that sets X-Forwarded-For.
  #
  # Only use `true` when the proxy discards any client-supplied
  # X-Forwarded-For AND sets its own trusted value. If the proxy passes
  # through client-supplied X-Forwarded-For, attackers can send
  # `X-Forwarded-For: 160.79.106.35` to be seen as an allowlisted IP and
  # BYPASS the rate limiter entirely.
  #
  # When false (the default), X-Forwarded-For is ignored and the server
  # uses the TCP socket's peer address. Leave this false for any server
  # exposed directly to the public internet.
  #
  # Hosted note — Railway, Fly, Render, Heroku, Cloud Run, AWS App
  # Runner, Vercel, and similar PaaS edges ALWAYS sit in front of your
  # container and terminate TLS, so the socket peer the server sees is
  # the platform proxy — never the real client. On these platforms you
  # MUST set trust_proxy (to true, a hop count, or a trusted-proxy CIDR
  # allowlist matching your platform's docs) or every rate-limit bucket
  # and CIDR allowlist entry evaluates against the proxy IP instead of
  # the real caller.
  #
  # Accepted values:
  #   false             — ignore X-Forwarded-For (hardened default)
  #   true              — trust every hop (only safe when the platform
  #                       strips client-supplied X-Forwarded-For)
  #   <integer>         — trust N hops (e.g. 1 for single-proxy Railway)
  #   [<cidr>, ...]     — allowlist of trusted proxy CIDRs
  trust_proxy: false

sources:
  # Each entry is one content source; tools refer to it by `name`.
  - name: docs
    type: markdown
    repo: https://github.com/your-org/your-repo.git
    path: docs/
    file_patterns: ["**/*.mdx", "**/*.md"]
    # Chunk sizing in tokens (presumably target chunk size and the overlap
    # between adjacent chunks — confirm in the Pathfinder docs).
    chunk:
      target_tokens: 600
      overlap_tokens: 50

  # Optional sources. To enable one, uncomment its lines by stripping the
  # leading "# " so the entry lines up with `- name: docs` above.

  # ── Notion source (requires NOTION_TOKEN) ──
  # - name: wiki
  #   type: notion
  #   root_pages: ["your-page-id"]
  #   databases: ["your-db-id"]
  #   max_depth: 5
  #   include_properties: true
  #   chunk:
  #     target_tokens: 600
  #     overlap_tokens: 50

  # ── Document source (requires pdf-parse and/or mammoth) ──
  # - name: specs
  #   type: document
  #   path: docs/specs/
  #   file_patterns: ["**/*.pdf", "**/*.docx"]
  #   chunk:
  #     target_tokens: 600
  #     overlap_tokens: 50

  # ── Slack source (requires SLACK_BOT_TOKEN + OPENAI_API_KEY) ──
  # - name: community
  #   type: slack
  #   channels: ["C0123456789"]
  #   min_thread_replies: 2
  #   chunk:
  #     target_tokens: 600

tools:
  # Semantic search (RAG) — requires embedding config below
  - name: search-docs
    type: search
    description: "Search documentation for relevant information."
    # Name of an entry under `sources:` (singular key here; the bash tool
    # below takes a `sources` list instead).
    source: docs
    default_limit: 5
    max_limit: 20
    result_format: docs
    # search_mode: vector   # 'vector' (default) | 'keyword' | 'hybrid'

  # Filesystem exploration — no database or API keys needed
  - name: explore-docs
    type: bash
    description: "Explore documentation files using bash commands (find, grep, cat, ls, head)."
    sources: [docs]
    bash:
      session_state: true
      grep_strategy: hybrid  # memory | vector | hybrid
      virtual_files: true

  # ── Knowledge tool (for FAQ/Q&A sources) ──
  # This example references a source named `community`; in this file that name
  # is only used by the commented-out Slack source, so enable that source too.
  # To enable, strip the leading "# " so the entry lines up with the tools above.
  # - name: get-faq
  #   type: knowledge
  #   description: "Browse and search community Q&A"
  #   sources: [community]

# Embedding model configuration. Search tools need it; leave the whole
# section out when running in bash-only mode.
embedding:
  provider: "openai"
  model: "text-embedding-3-small"
  dimensions: 1536

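# Alternative embedding providers. To use one, comment out the `embedding:`
# block above first — only one `embedding:` key may be active, because
# duplicate top-level keys are invalid YAML.
#
# ── Ollama (local, requires ollama running with model pulled) ──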
# embedding:
#   provider: ollama
#   model: nomic-embed-text
#   dimensions: 768
#   base_url: http://localhost:11434

# ── Local transformers.js (zero external deps, CPU-only) ──
# embedding:
#   provider: local
#   model: Xenova/all-MiniLM-L6-v2
#   dimensions: 384

indexing:
  # Index refresh settings.
  # NOTE(review): the per-key notes below are inferred from the key names only —
  # confirm against the Pathfinder documentation before relying on them.
  # Presumably turns scheduled re-indexing on or off.
  auto_reindex: true
  # Presumably the hour of day, in UTC, at which the scheduled re-index runs.
  reindex_hour_utc: 3
  # Presumably the age in hours after which an index is treated as stale.
  stale_threshold_hours: 24

# ── Analytics (query logging and dashboard) ──
# analytics:
#   enabled: true
#   log_queries: true
#   retention_days: 90