Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .cursorrules
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
- Epoch only works if everyone contributes estimate-vs-actual data

## Local LLM
- Use local inference at 100.66.225.85:1234 before cloud APIs
- Use local inference at localhost:1234 before cloud APIs
- Check loaded models first, don't touch models you didn't load
- Unload when done
- CPU thread pool: 10, flash attention: on, KV cache: Q8
Expand Down
2 changes: 1 addition & 1 deletion .github/copilot-instructions.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
- Epoch only works if everyone contributes estimate-vs-actual data

## Local LLM
- Use local inference at 100.66.225.85:1234 before cloud APIs
- Use local inference at localhost:1234 before cloud APIs
- Check loaded models first, don't touch models you didn't load
- Unload when done
- CPU thread pool: 10, flash attention: on, KV cache: Q8
Expand Down
2 changes: 1 addition & 1 deletion .windsurfrules
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
- Epoch only works if everyone contributes estimate-vs-actual data

## Local LLM
- Use local inference at 100.66.225.85:1234 before cloud APIs
- Use local inference at localhost:1234 before cloud APIs
- Check loaded models first, don't touch models you didn't load
- Unload when done
- CPU thread pool: 10, flash attention: on, KV cache: Q8
Expand Down
2 changes: 1 addition & 1 deletion AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ Create a private repo in Pastorsimon1798's personal account with:

## Local-First Inference (LM Studio)

All KyaniteLabs projects that require an LLM must use local inference first. Server runs on Tailscale at `100.66.225.85:1234`.
All KyaniteLabs projects that require an LLM must use local inference first. The Mac-local LM Studio compatibility endpoint is `http://localhost:1234`, backed by the NucBox LiteLLM server over an SSH tunnel. The old Windows/Tailscale endpoint `100.66.225.85:1234` is retired unless Tailscale is explicitly restored.

### Server Specs
- **CPU**: AMD Ryzen AI Max 395 (Strix Halo) — 16 cores, 32 threads
Expand Down
116 changes: 109 additions & 7 deletions archaeology/analysis_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,10 +167,53 @@ def run_ml_pattern_mapper(self) -> dict[str, Any]:
},
}

def _approximate_sessions(self, gap_hours: float = 2.0) -> list[dict]:
    """Return recorded sessions, or approximate them from commit timestamps.

    When the database has a ``sessions`` table, its ``session_id`` and
    ``timestamp`` columns are returned ordered by timestamp. Otherwise the
    commits (ordered by date) are grouped into sessions: a new session
    starts whenever more than ``gap_hours`` separate two consecutive
    parseable commit timestamps. Date-only values parse as midnight, so
    they naturally produce one session per day.

    Rows whose date cannot be parsed fall back to daily grouping on the
    first ten characters of the raw value. Rows with no date at all are
    ignored.

    Args:
        gap_hours: Inactivity gap, in hours, that separates two sessions.

    Returns:
        A list of ``{"session_id": ..., "timestamp": ...}`` dicts in commit
        order; empty when there are no commits.
    """
    from datetime import datetime as dt

    tables = {r["name"] for r in self._query_db("SELECT name FROM sqlite_master WHERE type='table'")}
    if "sessions" in tables:
        return self._query_db("SELECT session_id, timestamp FROM sessions ORDER BY timestamp")

    commits = self._query_db("SELECT date FROM commits ORDER BY date")
    if not commits:
        return []

    gap_seconds = gap_hours * 3600
    sessions: list[dict] = []
    # Two independent trackers: mixing a day string and a datetime in one
    # variable made `ts - prev` raise TypeError as soon as a malformed date
    # was followed by a well-formed one.
    last_ts = None   # last successfully parsed commit datetime
    last_day = None  # "YYYY-MM-DD"-style key of the last row that was grouped

    for row in commits:
        # Normalise NULL / non-string dates so slicing below cannot raise.
        raw = str(row.get("date") or "")
        try:
            # Only the first 19 characters (YYYY-MM-DDTHH:MM:SS) are parsed,
            # which drops any fractional seconds or UTC offset suffix.
            ts = dt.fromisoformat(raw[:19])
        except ValueError:
            ts = None

        if ts is None:
            day = raw[:10]
            # Blank dates carry no information; do not emit an empty session.
            if day and day != last_day:
                sessions.append({"session_id": day, "timestamp": day})
                last_day = day
            continue

        if last_ts is None or (ts - last_ts).total_seconds() > gap_seconds:
            sessions.append({
                "session_id": ts.strftime("%Y%m%d-%H%M%S"),
                "timestamp": ts.isoformat(),
            })

        last_ts = ts
        last_day = ts.strftime("%Y-%m-%d")

    return sessions

def run_agentic_workflow(self) -> dict[str, Any]:
"""Analyze AI agent interaction patterns."""
self._log("Running Agentic Workflow Analyzer...")
sessions = self._query_db("SELECT session_id, timestamp FROM sessions ORDER BY timestamp")
sessions = self._approximate_sessions()
hooks = self._like_commits(["hook", "pre-commit", "post-commit", "automation"], 50)
agent_commits = self._query_db("SELECT author, COUNT(*) as cnt FROM commits GROUP BY author ORDER BY cnt DESC")
return {
Expand Down Expand Up @@ -237,20 +280,79 @@ def run_source_archaeologist(self) -> dict[str, Any]:
date = str(row.get("date", ""))[:7]
if date:
by_month[date] += 1
improvements = [
{"rank": 1, "title": "Keep audit gate as release blocker", "effort": "M", "impact": "HIGH"},
{"rank": 2, "title": "Replace placeholder analytics with derived joins", "effort": "M", "impact": "HIGH"},
{"rank": 3, "title": "Continue splitting large evaluator/router surfaces", "effort": "L", "impact": "MEDIUM"},
]
hotspots = self._query_db("SELECT message, COUNT(*) as cnt FROM commits GROUP BY message ORDER BY cnt DESC LIMIT 10")
improvements = self._derive_improvements(quality, large_change, todo, hotspots)
return {
"analysis_metadata": {"timestamp": datetime.now().isoformat(), "analyst": "Automated Source Code Archaeologist", "project": self.project_name, "commit_count": self._commit_count()},
"quality_trajectory": {"assessment": "IMPROVING" if quality else "UNKNOWN", "evidence_count": len(quality), "by_month": dict(sorted(by_month.items()))},
"architecture_drift": {"large_change_signals": large_change[:10], "todo_or_stub_signals": todo[:10]},
"hotspots": self._query_db("SELECT message, COUNT(*) as cnt FROM commits GROUP BY message ORDER BY cnt DESC LIMIT 10"),
"hotspots": hotspots,
"improvements": improvements,
"summary": {"quality_signal_count": len(quality), "large_change_signal_count": len(large_change), "todo_signal_count": len(todo)},
}

def _derive_improvements(
    self,
    quality: list[dict],
    large_change: list[dict],
    todo: list[dict],
    hotspots: list[dict],
) -> list[dict]:
    """Derive prioritized remediation recommendations from commit signals.

    Each rule below contributes at most one recommendation with a fixed
    priority score; the result is ordered by descending score and numbered
    from rank 1.

    Args:
        quality: Commit rows counted as quality signals; only the length is used.
        large_change: Commit rows flagged as large changes; only the length is used.
        todo: Commit rows flagged as stubs/placeholders; only the length is used.
        hotspots: Rows with ``message`` and ``cnt`` keys (commit message and
            how often it occurs). Need not be sorted.

    Returns:
        A list of ``{"rank", "title", "effort", "impact"}`` dicts. Never
        empty: a single low-priority "healthy" entry is returned when no
        rule fires.
    """
    candidates: list[tuple[int, str, str, str]] = []  # (score, title, effort, impact)

    # Flapping issues: a message repeated 3+ times signals an unresolved root cause.
    # `or 0` tolerates a missing/NULL count instead of raising on the comparison.
    flapping = [h for h in hotspots if (h.get("cnt") or 0) >= 3]
    if flapping:
        # Pick the most repeated message explicitly rather than trusting the
        # caller's sort order (max keeps the first entry on ties).
        worst = max(flapping, key=lambda h: h.get("cnt") or 0)
        # Collapse whitespace so multi-line commit messages yield a one-line title.
        top_msg = " ".join(str(worst.get("message") or "").split())[:60]
        candidates.append((100, f"Fix recurring issue: {top_msg}", "M", "HIGH"))

    # Unresolved stubs / TODOs: five or more escalates both score and impact.
    if todo:
        many = len(todo) >= 5
        candidates.append((
            90 if many else 70,
            f"Resolve {len(todo)} stub or placeholder commit(s)",
            "S",
            "HIGH" if many else "MEDIUM",
        ))

    # Decomposition momentum: carry it through.
    if large_change:
        candidates.append((
            60,
            f"Continue decomposition — {len(large_change)} large-change signal(s) detected",
            "L",
            "MEDIUM",
        ))

    # Quality signal density: a low fix/test ratio suggests coverage gaps.
    # A zero commit count is treated as 1 purely to avoid division by zero.
    commit_count = self._commit_count() or 1
    quality_ratio = len(quality) / commit_count
    if quality_ratio < 0.10:
        candidates.append((
            80,
            f"Boost quality signal density — fix/test ratio at {quality_ratio:.0%} (target ≥10%)",
            "M",
            "HIGH",
        ))
    elif quality_ratio < 0.20:
        candidates.append((
            50,
            f"Maintain quality signal density — currently at {quality_ratio:.0%}",
            "S",
            "LOW",
        ))

    # No rule fired: report the project as healthy.
    if not candidates:
        candidates.append((10, "No critical remediation items — maintain current trajectory", "S", "LOW"))

    candidates.sort(key=lambda item: item[0], reverse=True)
    return [
        {"rank": rank, "title": title, "effort": effort, "impact": impact}
        for rank, (_, title, effort, impact) in enumerate(candidates, start=1)
    ]

def run_youtube_correlator(self) -> dict[str, Any]:
"""Summarize YouTube/watch-history correlation artifacts when available."""
self._log("Running YouTube Correlator...")
Expand Down
42 changes: 35 additions & 7 deletions archaeology/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,10 @@ def demo(project_name, force, build_db):
click.echo(f"Then: archaeology audit {project_name} --fail-on HIGH")
if build_db:
cmd = [sys.executable, "-m", "archaeology.db.builder", "--project-root", str(project_root)]
result = subprocess.run(cmd, check=True, timeout=300)
_env = os.environ.copy()
_pkg_root = str(Path(__file__).parent.parent)
_env["PYTHONPATH"] = _pkg_root + ((":" + _env["PYTHONPATH"]) if _env.get("PYTHONPATH") else "")
result = subprocess.run(cmd, check=True, timeout=300, env=_env)
if result.returncode != 0:
raise click.exceptions.Exit(result.returncode)

Expand Down Expand Up @@ -138,7 +141,10 @@ def build_db(project_name, verbose):
if verbose:
cmd.append("--verbose")

result = subprocess.run(cmd, check=True, timeout=300)
_env = os.environ.copy()
_pkg_root = str(Path(__file__).parent.parent)
_env["PYTHONPATH"] = _pkg_root + ((":" + _env["PYTHONPATH"]) if _env.get("PYTHONPATH") else "")
result = subprocess.run(cmd, check=True, timeout=300, env=_env)
if result.returncode == 0 and os.path.exists(db_path):
click.echo(f"Database built at {db_path}")
else:
Expand Down Expand Up @@ -235,12 +241,17 @@ def signals(project_name, config_path, min_gap_days, verbose):
if min_gap_days is not None:
config["min_gap_days"] = min_gap_days

db_path = os.path.join(_project_dir(project_name), "data", "archaeology.db")
if not os.path.exists(db_path):
click.echo(f"No database found. Run 'archaeology build-db {project_name}' first.", err=True)
sys.exit(1)

result = detect_signals(project_name, config=config or None)
if result.get("signals"):
click.echo(f"Detected {len(result['signals'])} signals "
f"across {len(result['cluster_summary'])} clusters.")
else:
click.echo("No signals detected. Build the database first.")
click.echo("No significant patterns detected in the commit history.")


@main.command()
Expand Down Expand Up @@ -438,6 +449,7 @@ def visualize(project_name):
first_date = ""
last_date = ""
agent_count = 0
eras_data = None
eras_json = os.path.join(project_dir, "data", "commit-eras.json")
if os.path.exists(eras_json):
try:
Expand Down Expand Up @@ -518,7 +530,17 @@ def visualize(project_name):
# Inline data.json so the HTML works from file:// (no CORS issues)
if os.path.exists(data_json):
with open(data_json, encoding="utf-8") as f:
data_content = f.read()
data_payload = json.load(f)

# Merge commit_eras and top-level fields from commit-eras.json into PROJECT_DATA
# so the era timeline visualization has real data to render.
if eras_data is not None:
data_payload.setdefault("commit_eras", eras_data.get("eras", []))
data_payload.setdefault("total_commits", eras_data.get("total_commits", 0))
data_payload.setdefault("first_commit_date", eras_data.get("first_commit_date", ""))
data_payload.setdefault("last_commit_date", eras_data.get("last_commit_date", ""))

data_content = json.dumps(data_payload)
safe_data_content = data_content.replace("<", "\\u003c").replace(">", "\\u003e").replace("&", "\\u0026")
inline_script = f'<script>window.PROJECT_DATA = {safe_data_content}; window.dispatchEvent(new Event("data-loaded"));</script>'
html = html.replace(
Expand Down Expand Up @@ -634,7 +656,10 @@ def cascade(project_name, dry_run, skip_mine):
db_path = data_dir / "archaeology.db"
cmd = [sys.executable, "-m", "archaeology.db.builder",
"--project-root", str(project_dir)]
result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
_env = os.environ.copy()
_pkg_root = str(Path(__file__).parent.parent)
_env["PYTHONPATH"] = _pkg_root + ((":" + _env["PYTHONPATH"]) if _env.get("PYTHONPATH") else "")
result = subprocess.run(cmd, capture_output=True, text=True, timeout=300, env=_env)
if result.returncode == 0:
click.echo(f" Database built ({db_path})")
else:
Expand Down Expand Up @@ -966,7 +991,10 @@ def sync(projects, skip_mine, skip_signals, verbose):
if verbose:
cmd.append("--verbose")

result = subprocess.run(cmd, capture_output=not verbose, check=True, timeout=300)
_env = os.environ.copy()
_pkg_root = str(Path(__file__).parent.parent)
_env["PYTHONPATH"] = _pkg_root + ((":" + _env["PYTHONPATH"]) if _env.get("PYTHONPATH") else "")
result = subprocess.run(cmd, capture_output=not verbose, check=True, timeout=300, env=_env)
if result.returncode == 0 and os.path.exists(db_path):
click.echo(f" DB built")
else:
Expand Down Expand Up @@ -1162,7 +1190,7 @@ def benchmark(project_name):
sys.exit(1)


@main.command()
@main.command("dashboard")
@click.option("--port", default=8080, help="Port to serve on")
@click.option("--no-open", is_flag=True, help="Don't open browser automatically")
def serve(port, no_open):
Expand Down
52 changes: 38 additions & 14 deletions archaeology/era_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,31 +46,55 @@ def _infer_year(raw: dict) -> int:
return datetime.now().year


def _parse_era_date(date_str: str, year: int, reference: datetime | None = None) -> datetime | None:
"""Parse a single date string supporting multiple formats."""
s = date_str.strip()
# ISO: 2026-01-15
for fmt in ("%Y-%m-%d", "%Y-%m", "%b %d %Y", "%b %d"):
try:
if fmt == "%b %d":
dt = datetime.strptime(f"{s} {year}", "%b %d %Y")
else:
dt = datetime.strptime(s, fmt)
return dt
except ValueError:
continue
return None


def load_eras(eras_path: Path) -> list[EraDef]:
"""Load era definitions from commit-eras.json."""
"""Load era definitions from commit-eras.json.

Handles date formats: "Jan 1 - Jan 5", "2026-01-01 to 2026-01-05",
ISO single dates (era spans to next day), and month-only ranges.
"""
if not eras_path.exists():
return []
import re as _re
raw = json.loads(eras_path.read_text())
# Infer year from the first commit date in the data
year = _infer_year(raw)
eras = []
for era in raw.get("eras", []):
dates = era.get("dates", "")
parts = dates.split(" - ") if " - " in dates else dates.split(" – ")
if len(parts) != 2:
continue
try:
start = datetime.strptime(f"{parts[0].strip()} {year}", "%b %d %Y")
# If end date month is earlier than start, it's next year
end = datetime.strptime(f"{parts[1].strip()} {year}", "%b %d %Y")
if end < start:
end = datetime.strptime(f"{parts[1].strip()} {year + 1}", "%b %d %Y")
except (ValueError, IndexError):
# Split on " - ", " – ", " to " (ISO range), or handle single dates
for sep in (" - ", " – ", " to "):
if sep in dates:
parts = dates.split(sep, 1)
break
else:
parts = [dates, dates] # single date → era spans that day

start = _parse_era_date(parts[0], year)
end = _parse_era_date(parts[1], year) if len(parts) > 1 else start
if start is None or end is None:
continue
# If end is earlier than start, assume it wraps to next year
if end < start:
end = _parse_era_date(parts[1], year + 1) or end

commits = era.get("commits", 0)
if isinstance(commits, str):
import re
m = re.search(r"(\d+)", commits)
m = _re.search(r"(\d+)", commits)
commits = int(m.group(1)) if m else 0
eras.append(EraDef(
id=era["id"],
Expand Down
2 changes: 1 addition & 1 deletion archaeology/local_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def run_local_pipeline(
"PIPELINE_REVIEW_DAYS": str(review_days),
}
)
subprocess.run(cmd, cwd=pipeline_dir, env=env, check=True)
subprocess.run(cmd, cwd=pipeline_dir, env=env, check=True, timeout=300)


def read_local_pipeline_status(pipeline_dir: str | Path, repo_name: str) -> LocalPipelineStatus:
Expand Down
Loading
Loading