-
Notifications
You must be signed in to change notification settings - Fork 19
perf: optimize pagerank using sparse vectorization and np.add.at #40
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -1,5 +1,6 @@ | ||||||||||||||||||||
| from __future__ import annotations | ||||||||||||||||||||
|
|
||||||||||||||||||||
| import numpy as np | ||||||||||||||||||||
| from random import Random | ||||||||||||||||||||
| from typing import Any | ||||||||||||||||||||
|
|
||||||||||||||||||||
|
|
@@ -21,26 +22,44 @@ def generate(node_count: int, seed: int) -> dict[str, list[str]]: | |||||||||||||||||||
|
|
||||||||||||||||||||
| def solve(graph: dict[str, list[str]], iterations: int = 16, damping: float = 0.85) -> dict[str, Any]: | ||||||||||||||||||||
| nodes = sorted(graph) | ||||||||||||||||||||
| if not nodes: | ||||||||||||||||||||
| n = len(nodes) | ||||||||||||||||||||
| if n == 0: | ||||||||||||||||||||
| return {"node_count": 0, "top_node": "", "top_score": 0.0, "checksum": 0.0} | ||||||||||||||||||||
|
|
||||||||||||||||||||
| rank = {node: 1.0 / len(nodes) for node in nodes} | ||||||||||||||||||||
| outgoing = {node: graph[node] if graph[node] else nodes for node in nodes} | ||||||||||||||||||||
| base = (1.0 - damping) / len(nodes) | ||||||||||||||||||||
| node_to_idx = {node: i for i, node in enumerate(nodes)} | ||||||||||||||||||||
|
|
||||||||||||||||||||
| src_list = [] | ||||||||||||||||||||
| dst_list = [] | ||||||||||||||||||||
| out_degrees = np.zeros(n) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| for i, node in enumerate(nodes): | ||||||||||||||||||||
| neighbors = graph[node] | ||||||||||||||||||||
| out_degrees[i] = len(neighbors) | ||||||||||||||||||||
| for neighbor in neighbors: | ||||||||||||||||||||
| src_list.append(i) | ||||||||||||||||||||
| dst_list.append(node_to_idx[neighbor]) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| src_indices = np.array(src_list) | ||||||||||||||||||||
| dst_indices = np.array(dst_list) | ||||||||||||||||||||
|
coderabbitai[bot] marked this conversation as resolved.
Comment on lines
+42
to
+43
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: #!/bin/bash
python -m pip install --quiet numpy
python - <<'PY'
import numpy as np
bad = np.array([])
good = np.array([], dtype=np.intp)
ranks = np.array([1.0])
for label, idx in [("bad", bad), ("good", good)]:
print(label, idx.dtype)
try:
print(ranks[idx])
except Exception as exc:
print(type(exc).__name__, exc)
PY
Repository: iiitl/chuck Length of output: 151 🏁 Script executed: head -60 chuck/tasks/graph_analytics/task.py | tail -30 Repository: iiitl/chuck Length of output: 921 🏁 Script executed: head -70 chuck/tasks/graph_analytics/task.py | cat -n Repository: iiitl/chuck Length of output: 2466 Force an integer dtype for empty edge lists. If a caller passes a graph whose adjacency lists are all empty, these arrays become empty `float64` arrays (NumPy's default dtype for an empty list), and using them as indices in `ranks[src_indices]` raises `IndexError`. 🩹 Proposed fix
- src_indices = np.array(src_list)
- dst_indices = np.array(dst_list)
+ src_indices = np.array(src_list, dtype=np.intp)
+ dst_indices = np.array(dst_list, dtype=np.intp)🤖 Prompt for AI Agents |
||||||||||||||||||||
|
|
||||||||||||||||||||
| ranks = np.full(n, 1.0 / n) | ||||||||||||||||||||
| teleport_base = (1.0 - damping) / n | ||||||||||||||||||||
|
|
||||||||||||||||||||
| for _ in range(iterations): | ||||||||||||||||||||
| new_rank = {node: base for node in nodes} | ||||||||||||||||||||
| for node in nodes: | ||||||||||||||||||||
| share = rank[node] / len(outgoing[node]) | ||||||||||||||||||||
| for target in outgoing[node]: | ||||||||||||||||||||
| new_rank[target] += damping * share | ||||||||||||||||||||
| rank = new_rank | ||||||||||||||||||||
| top_node = max(nodes, key=lambda node: (rank[node], node)) | ||||||||||||||||||||
| checksum = sum((index + 1) * rank[node] for index, node in enumerate(nodes)) | ||||||||||||||||||||
| new_ranks = np.full(n, teleport_base) | ||||||||||||||||||||
| contributions = (ranks[src_indices] / out_degrees[src_indices]) * damping | ||||||||||||||||||||
| np.add.at(new_ranks, dst_indices, contributions) | ||||||||||||||||||||
| ranks = new_ranks | ||||||||||||||||||||
|
|
||||||||||||||||||||
| top_idx = np.argmax(ranks) | ||||||||||||||||||||
| top_node = nodes[top_idx] | ||||||||||||||||||||
| checksum = np.sum(np.arange(1, n + 1) * ranks) | ||||||||||||||||||||
|
Comment on lines
+54
to
+56
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: #!/bin/bash
# Search for original PageRank top-node selection logic in git history
git log -p --all -S 'max(nodes' -- '*.py' | head -80Repository: iiitl/chuck Length of output: 37 🏁 Script executed: # First, check the current state of the file around lines 54-56
wc -l chuck/tasks/graph_analytics/task.pyRepository: iiitl/chuck Length of output: 93 🏁 Script executed: # Show the context around lines 54-56
sed -n '45,65p' chuck/tasks/graph_analytics/task.pyRepository: iiitl/chuck Length of output: 650 🏁 Script executed: # Check git history for this file to find the original implementation
git log --oneline chuck/tasks/graph_analytics/task.py | head -20Repository: iiitl/chuck Length of output: 127 🏁 Script executed: # Look for the original code in git diff
git log -p chuck/tasks/graph_analytics/task.py | head -200Repository: iiitl/chuck Length of output: 37 🏁 Script executed: # Show the specific commit that changed the PageRank implementation
git show 180e97bRepository: iiitl/chuck Length of output: 50370 🏁 Script executed: # Check if there are any earlier commits or if this is the initial commit
git log --all -- chuck/tasks/graph_analytics/task.pyRepository: iiitl/chuck Length of output: 434 🏁 Script executed: # Look for the original code before the optimization commit
git show 180e97b^:chuck/tasks/graph_analytics/task.py 2>/dev/null || echo "No previous version found"Repository: iiitl/chuck Length of output: 80 🏁 Script executed: # Check the previous commit to see if there was an original implementation
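git show 3bb6d58:chuck/tasks/graph_analytics/task.py 2>/dev/null | head -100 Repository: iiitl/chuck Length of output: 1708 Tie-breaking behavior differs from original implementation. The original `max(nodes, key=lambda node: (rank[node], node))` breaks ties on rank by choosing the lexicographically largest node name, whereas `np.argmax(ranks)` returns the first maximal index, which in the sorted `nodes` list is the lexicographically smallest name. Though regression tests pass (suggesting ties may not occur in practice), this is a correctness discrepancy. To align with the original behavior:
- top_idx = np.argmax(ranks)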
- top_node = nodes[top_idx]
+ max_rank = np.max(ranks)
+ # Among nodes with max rank, pick lexicographically largest (matches original behavior)
+ top_indices = np.where(ranks == max_rank)[0]
+ top_idx = top_indices[-1] # Last index among ties = largest node name
+ top_node = nodes[top_idx]📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||||||||||||
|
|
||||||||||||||||||||
| return { | ||||||||||||||||||||
| "node_count": len(nodes), | ||||||||||||||||||||
| "node_count": n, | ||||||||||||||||||||
| "top_node": top_node, | ||||||||||||||||||||
| "top_score": round6(rank[top_node]), | ||||||||||||||||||||
| "checksum": round6(checksum), | ||||||||||||||||||||
| "top_score": round6(ranks[top_idx]), | ||||||||||||||||||||
| "checksum": round6(float(checksum)), | ||||||||||||||||||||
| } | ||||||||||||||||||||
|
|
||||||||||||||||||||
|
|
||||||||||||||||||||
|
|
||||||||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| numpy |
Uh oh!
There was an error while loading. Please reload this page.