-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathauthenticated_endpoint_crawler.py
More file actions
executable file
·301 lines (239 loc) · 11.7 KB
/
authenticated_endpoint_crawler.py
File metadata and controls
executable file
·301 lines (239 loc) · 11.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
#!/usr/bin/env python3
"""
Authenticated Endpoint Discovery & Nuclei Integration
Extracts real API endpoints from authenticated sessions and JS, feeds them to stricter Nuclei templates.
Usage:
python authenticated_endpoint_crawler.py --url https://app.example.com --username user@example.com --password pass
"""
import argparse
import json
import logging
import re
import sys
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Set
from urllib.parse import urljoin, urlparse

import requests
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class AuthenticatedEndpointCrawler:
    """Crawls an authenticated web application and collects API endpoints.

    Workflow: authenticate() establishes a session, discover_endpoints()
    scrapes endpoints out of page HTML, inline <script> blocks and linked
    .js files via regex heuristics, and save_results() exports them in
    Nuclei-friendly formats.
    """

    def __init__(self, base_url: str, username: str, password: str,
                 cookies: Optional[Dict] = None):
        """Set up the HTTP session.

        Args:
            base_url: Root URL of the target application; trailing '/' is stripped.
            username: Credential placed in the login payload.
            password: Credential placed in the login payload.
            cookies: Optional pre-existing session cookies (session-based auth).
        """
        self.base_url = base_url.rstrip('/')
        self.username = username
        self.password = password
        self.session = requests.Session()
        self.session.cookies.update(cookies or {})
        # Common headers to mimic browser
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        self.api_endpoints: Set[str] = set()  # every endpoint discovered so far
        self.authenticated = False

    def authenticate(self, login_endpoint: str = "/login", username_field: str = "email",
                     password_field: str = "password") -> bool:
        """Authenticate to the application.

        Posts a JSON credential payload; status 200/201/302 is treated as
        success. NOTE(review): a 302 can also be a failed login redirecting
        back to the form — confirm against the target application.

        Returns:
            True when authentication is considered successful.
        """
        try:
            login_url = urljoin(self.base_url, login_endpoint)
            payload = {
                username_field: self.username,
                password_field: self.password
            }
            logger.info(f"[*] Attempting authentication at {login_url}")
            resp = self.session.post(login_url, json=payload, timeout=10)
            if resp.status_code in [200, 201, 302]:
                logger.info("[+] Authentication successful")
                self.authenticated = True
                return True
            else:
                logger.warning(f"[-] Authentication failed (status {resp.status_code})")
                return False
        except Exception as e:
            # Best-effort tool: network/SSL errors are reported, not raised.
            logger.error(f"[-] Authentication error: {e}")
            return False

    def extract_endpoints_from_js(self, js_content: str) -> Set[str]:
        """Extract API endpoint paths/URLs from JavaScript source.

        Applies a set of regex heuristics; only matches beginning with '/'
        or 'http' are kept, which filters out non-URL values picked up by
        the generic key patterns.
        """
        endpoints = set()
        # Pattern 1: fetch/axios calls; the rest catch common config keys
        # (url/endpoint/path), bare /api/ literals and jQuery-style
        # .get()/.post() calls.
        fetch_patterns = [
            r"(?:fetch|axios\.(?:get|post|put|delete|patch))\(['\"`]([^'\"`;]+)['\"`]",
            r"url:\s*['\"]([^'\"]+)['\"]",
            r"endpoint:\s*['\"]([^'\"]+)['\"]",
            r"path:\s*['\"]([^'\"]+)['\"]",
            r"/api/[a-zA-Z0-9/_-]+",
            r"\.get\(['\"]([^'\"]+)['\"]",
            r"\.post\(['\"]([^'\"]+)['\"]",
        ]
        for pattern in fetch_patterns:
            matches = re.findall(pattern, js_content, re.IGNORECASE)
            endpoints.update(m for m in matches if m.startswith('/') or m.startswith('http'))
        return endpoints

    def fetch_and_analyze_page(self, path: str = "") -> Set[str]:
        """Fetch a page and extract endpoints from its HTML, inline and linked JS."""
        endpoints = set()
        try:
            url = urljoin(self.base_url, path)
            logger.info(f"[*] Fetching {url}")
            resp = self.session.get(url, timeout=10)
            if resp.status_code == 200:
                # Extract from page source
                extracted = self.extract_endpoints_from_js(resp.text)
                endpoints.update(extracted)
                # Look for script tags with inline JS.
                # BUG FIX: the original pattern ([^<]+) stopped at the first
                # '<', so any inline script containing one (e.g. "if (a < b)")
                # was skipped entirely; non-greedy .*? with DOTALL captures
                # the full (possibly multi-line) script body.
                script_contents = re.findall(r'<script[^>]*>(.*?)</script>', resp.text,
                                             re.IGNORECASE | re.DOTALL)
                for script in script_contents:
                    extracted = self.extract_endpoints_from_js(script)
                    endpoints.update(extracted)
                # Look for src attributes pointing to .js files
                js_files = re.findall(r'<script[^>]+src=["\']([^"\']+\.js)["\']', resp.text, re.IGNORECASE)
                for js_file in js_files:
                    full_url = urljoin(self.base_url, js_file)
                    logger.info(f"[*] Found JS file: {js_file}")
                    endpoints.update(self.fetch_js_file(full_url))
        except Exception as e:
            logger.warning(f"[-] Error fetching {path}: {e}")
        return endpoints

    def fetch_js_file(self, js_url: str) -> Set[str]:
        """Fetch a JS file and extract endpoints from its contents."""
        endpoints = set()
        try:
            resp = self.session.get(js_url, timeout=10)
            if resp.status_code == 200:
                extracted = self.extract_endpoints_from_js(resp.text)
                endpoints.update(extracted)
                logger.info(f"[+] Extracted {len(extracted)} endpoints from {js_url}")
        except Exception as e:
            logger.warning(f"[-] Error fetching JS file {js_url}: {e}")
        return endpoints

    def discover_endpoints(self, crawl_paths: Optional[List[str]] = None) -> Set[str]:
        """Discover API endpoints by crawling authenticated pages.

        All discovered endpoints accumulate in self.api_endpoints; the return
        value is filtered down to those containing '/api/'.
        """
        if not self.authenticated:
            logger.error("[-] Must authenticate before crawling")
            return set()
        if crawl_paths is None:
            # Default paths to crawl for healthcare app like Doctolib
            crawl_paths = [
                "/mon-compte",
                "/dashboard",
                "/api",
                "/app",
                "/patient",
                "/doctor",
                "/appointments",
                "/profile",
            ]
        logger.info(f"[*] Discovering endpoints from {len(crawl_paths)} paths")
        for path in crawl_paths:
            discovered = self.fetch_and_analyze_page(path)
            self.api_endpoints.update(discovered)
            logger.info(f"[+] Path '{path}': found {len(discovered)} endpoints")
        # Filter to API-only endpoints
        api_only = {ep for ep in self.api_endpoints if '/api/' in ep.lower()}
        logger.info(f"[+] Total API endpoints discovered: {len(api_only)}")
        return api_only

    def generate_nuclei_requests(self, endpoints: Set[str]) -> str:
        """Build a JSON list of Nuclei-style GET request objects for *endpoints*.

        NOTE(review): the bearer token is read from a cookie named 'token';
        verify the target actually stores its auth token under that name.
        """
        nuclei_requests = []
        for endpoint in sorted(endpoints):
            # Clean endpoint (remove query params, fragments)
            clean_endpoint = re.sub(r'[?#].*', '', endpoint)
            nuclei_requests.append({
                "url": urljoin(self.base_url, clean_endpoint),
                "method": "GET",
                "headers": {
                    "Authorization": f"Bearer {self.session.cookies.get('token', '')}"
                }
            })
        return json.dumps(nuclei_requests, indent=2)

    def save_results(self, output_dir: str = "nuclei_input"):
        """Save discovered endpoints to files for Nuclei.

        Writes three timestamped files into *output_dir*: a plain URL list,
        a JSON request file, and a discovery summary.

        Returns:
            Tuple of (endpoints_file, nuclei_file, summary_file) path strings.
        """
        output_path = Path(output_dir)
        output_path.mkdir(exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        # Save endpoints list
        endpoints_file = output_path / f"endpoints_{timestamp}.txt"
        with open(endpoints_file, 'w') as f:
            for ep in sorted(self.api_endpoints):
                f.write(f"{urljoin(self.base_url, ep)}\n")
        logger.info(f"[+] Saved endpoints to {endpoints_file}")
        # Save Nuclei request format
        nuclei_file = output_path / f"nuclei_targets_{timestamp}.json"
        with open(nuclei_file, 'w') as f:
            f.write(self.generate_nuclei_requests(self.api_endpoints))
        logger.info(f"[+] Saved Nuclei targets to {nuclei_file}")
        # Save summary
        summary_file = output_path / f"discovery_summary_{timestamp}.json"
        summary = {
            "timestamp": timestamp,
            "base_url": self.base_url,
            "total_endpoints": len(self.api_endpoints),
            "api_endpoints": sorted([ep for ep in self.api_endpoints if '/api/' in ep.lower()]),
            "other_endpoints": sorted([ep for ep in self.api_endpoints if '/api/' not in ep.lower()])
        }
        with open(summary_file, 'w') as f:
            json.dump(summary, f, indent=2)
        logger.info(f"[+] Saved summary to {summary_file}")
        return str(endpoints_file), str(nuclei_file), str(summary_file)
def run_nuclei_scan(nuclei_targets_file: str, template_dir: str = ".",
                    output_dir: Optional[str] = None):
    """Run Nuclei against the discovered targets.

    Args:
        nuclei_targets_file: Path to a file with one target URL per line (-l).
        template_dir: Directory containing the netbear-*.yaml templates.
        output_dir: Where results are written; auto-timestamped when None.

    Returns:
        The output directory on success, None on failure.
    """
    import subprocess
    if output_dir is None:
        output_dir = f"nuclei_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    Path(output_dir).mkdir(exist_ok=True)
    # BUG FIX: the original passed "-o" twice ("-o results.txt ... -json -o
    # results.json"); the second flag silently overrode the first, so
    # results.txt was never written. Use "-o" for the text report and
    # "-json-export" for the JSON report.
    cmd = [
        "nuclei",
        "-l", nuclei_targets_file,
        "-t", f"{template_dir}/netbear-*.yaml",
        "-o", f"{output_dir}/results.txt",
        "-json-export", f"{output_dir}/results.json",
        "-silent",
    ]
    logger.info(f"[*] Running Nuclei: {' '.join(cmd)}")
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
        if result.returncode != 0:
            # Surface scanner errors instead of silently discarding them.
            logger.warning(f"[-] Nuclei exited with code {result.returncode}: {result.stderr.strip()}")
        logger.info(f"[+] Nuclei scan completed")
        logger.info(f"[*] Results saved to {output_dir}/")
        return output_dir
    except FileNotFoundError:
        logger.error("[-] Nuclei scan failed: 'nuclei' binary not found in PATH")
        return None
    except subprocess.TimeoutExpired:
        logger.error("[-] Nuclei scan failed: timed out after 300s")
        return None
    except Exception as e:
        logger.error(f"[-] Nuclei scan failed: {e}")
        return None
def main():
    """Command-line entry point: authenticate, crawl, persist, optionally scan."""
    arg_parser = argparse.ArgumentParser(
        description="Authenticated endpoint discovery and Nuclei integration"
    )
    arg_parser.add_argument("--url", required=True, help="Base URL of the application")
    arg_parser.add_argument("--username", required=True, help="Username for authentication")
    arg_parser.add_argument("--password", required=True, help="Password for authentication")
    arg_parser.add_argument("--login-endpoint", default="/login", help="Login endpoint path")
    arg_parser.add_argument("--username-field", default="email", help="Username field name in login form")
    arg_parser.add_argument("--password-field", default="password", help="Password field name in login form")
    arg_parser.add_argument("--output-dir", default="nuclei_input", help="Output directory for results")
    arg_parser.add_argument("--run-nuclei", action="store_true", help="Run Nuclei scan on discovered endpoints")
    arg_parser.add_argument("--template-dir", default=".", help="Directory containing Nuclei templates")
    arg_parser.add_argument("--cookies", help="JSON string with authentication cookies (for session-based auth)")
    opts = arg_parser.parse_args()

    # Build the crawler, decoding optional session cookies from the CLI.
    session_cookies = json.loads(opts.cookies) if opts.cookies else None
    crawler = AuthenticatedEndpointCrawler(opts.url, opts.username, opts.password, session_cookies)

    # Authentication is a hard requirement; bail out early on failure.
    if not crawler.authenticate(opts.login_endpoint, opts.username_field, opts.password_field):
        logger.error("[-] Failed to authenticate. Exiting.")
        sys.exit(1)

    logger.info("[*] Starting endpoint discovery...")
    crawler.discover_endpoints()
    if not crawler.api_endpoints:
        logger.warning("[-] No endpoints discovered")
        sys.exit(1)
    logger.info(f"[+] Discovered {len(crawler.api_endpoints)} endpoints")

    # Persist all three artifacts (URL list, Nuclei targets, summary).
    endpoints_file, nuclei_file, summary_file = crawler.save_results(opts.output_dir)

    # Optionally hand the plain URL list straight to Nuclei.
    if opts.run_nuclei:
        scan_dir = run_nuclei_scan(endpoints_file, opts.template_dir,
                                   f"nuclei_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}")
        if scan_dir:
            logger.info(f"[+] Nuclei results in {scan_dir}")
    logger.info("[+] Endpoint discovery complete!")
if __name__ == "__main__":
main()