Skip to content

Commit e136a42

Browse files
committed
scaled culler for multiple classes
1 parent 321fe17 commit e136a42

11 files changed

Lines changed: 493 additions & 0 deletions

File tree

README.md

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,69 @@ Alternatively, to run the script immediately:
7676

7777
This will trigger the cronjob to spawn a job manually.
7878

79+
80+
### scaled-nb-culler
81+
1. Ensure you are logged in to your OpenShift account via the CLI and you have access to ope-rhods-testing namespace.
82+
Then run:
83+
```
84+
oc project ope-rhods-testing
85+
```
86+
87+
2. Ensure the environment variables are correctly set in `cronjobs/scaled-nb-culler/cronjob`: <br>
88+
89+
Create a json dict for each class, the key must be the group name of the class
90+
The first value should be "cutoff", the maximum allowed notebook runtime for the class, in seconds
91+
The next value should be "ns" which is the namespace in which the class is running, or if there are multiple namespaces, the prefix of the namespace that should be matched
92+
The last value should be "multiple-ns": set it to `true` if the class runs in multiple namespaces, or `false` if it runs in a single namespace (both unquoted, as JSON booleans)
93+
94+
For example:
95+
```
96+
value: |
97+
{
98+
"cs391": {
99+
"cutoff": 43200,
100+
"ns": "bu-cs391-pmpp",
101+
"multiple-ns": true
102+
},
103+
"ds100": {
104+
"cutoff": 7200,
105+
"ns": "rhods-notebooks",
106+
"multiple-ns": false
107+
},
108+
"cs210": {
109+
"cutoff": 43200,
110+
"ns": "rhods-notebooks",
111+
"multiple-ns": false
112+
},
113+
"dsp562": {
114+
"cutoff": 10800,
115+
"ns": "rhods-notebooks",
116+
"multiple-ns": false
117+
}
118+
}
119+
```
120+
3. Ensure that the namespace value in `kustomization.yaml` is correct.
121+
122+
4. From cronjobs/scaled-nb-culler/ directory run:
123+
```
124+
oc apply -k . --as system:admin
125+
```
126+
127+
This will deploy all the necessary resources for the cronjob to run on the specified schedule.
128+
129+
Alternatively, to run the script immediately:
130+
131+
1. Ensure you followed the steps above
132+
2. Verify the cronjob `scaled-culler` exists
133+
```
134+
oc get cronjob scaled-culler
135+
```
136+
137+
3. Run:
138+
```
139+
kubectl create -n rhods-notebooks job --from=cronjob/scaled-culler scaled-culler
140+
```
141+
79142
### multiple-ns-group-sync
80143
This cronjob runs once every hour at the top of the hour, adding all users with the edit rolebinding in the specified namespaces to the specified group. This cronjob differs from the original `group-sync` cronjob by syncing with multiple namespaces rather than just one namespace.
81144
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
FROM quay.io/operate-first/opf-toolbox:v0.8.0

# Create the destination directory
WORKDIR /scripts

# Install pip first
RUN dnf install -y python3-pip

# Install requirements first to maximize caching
COPY requirements.txt ./requirements.txt
RUN pip3 install -r requirements.txt

# Copy the notebook-culler scripts (entrypoint nb-culler.py plus its helpers)
COPY nb-culler.py helpers.py ./
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
from datetime import datetime, timezone
2+
import openshift_client as oc
3+
from typing import Optional
4+
import logging
5+
import json
6+
import sys
7+
8+
LOG = logging.getLogger(__name__)
9+
10+
11+
def build_user_cutoff_map(single_ns: dict[str, dict]) -> dict[str, dict]:
    """Build a user -> {"class", "cutoff"} lookup across all single-namespace classes.

    When a user belongs to more than one class group, the higher (more
    lenient) cutoff wins and a warning is logged.
    """
    lookup: dict[str, dict] = {}

    for class_name, config in single_ns.items():
        cutoff = int(config["cutoff"])
        members = get_group_users(class_name)

        LOG.info("Loaded group %s (%d users, cutoff=%ss)", class_name, len(members), cutoff)

        for raw in members:
            user = str(raw).strip()
            if not user:
                continue

            prior = lookup.get(user)
            if prior is not None:
                if cutoff <= prior["cutoff"]:
                    # Already tracked with an equal or more lenient cutoff.
                    continue
                LOG.warning(
                    "User %s in multiple groups (%s, %s). Using more lenient cutoff %ss from %s.",
                    user, prior["class"], class_name, cutoff, class_name,
                )
            lookup[user] = {"class": class_name, "cutoff": cutoff}

    LOG.info("Built user_to_info map with %d total users", len(lookup))
    return lookup
41+
42+
43+
def parse_rfc3339(ts: str) -> datetime:
    """Parse an RFC 3339 timestamp, mapping a trailing 'Z' to an explicit UTC offset."""
    normalized = ts.replace("Z", "+00:00")
    return datetime.fromisoformat(normalized)
45+
46+
47+
def as_bool(v) -> bool:
    """Normalize booleans, truthy strings, and any other value to a bool."""
    if isinstance(v, bool):
        return v
    if not isinstance(v, str):
        return bool(v)
    # String config values: accept a small set of case-insensitive truthy spellings.
    return v.strip().lower() in {"true", "1", "yes", "y"}
54+
55+
56+
def get_notebook_username(nb: dict) -> Optional[str]:
    """Return the notebook owner's username from known annotations, or None."""
    metadata = nb.get("metadata") or {}
    annotations = metadata.get("annotations") or {}

    # The first annotation key found wins.
    for key in ("opendatahub.io/username", "notebooks.opendatahub.io/username"):
        value = annotations.get(key)
        if value:
            return str(value).strip()
    return None
65+
66+
67+
def get_group_users(group_name: str) -> set[str]:
    """Fetch the member usernames of an OpenShift group; exit the job on failure."""
    try:
        raw = oc.invoke("get", ["group", group_name, "-o", "json"]).out()
        group = json.loads(raw)
        members = group.get("users")
        if not members:
            LOG.warning("Group %s is empty. This could lead to notebooks being deleted.", group_name)
        return set(members or [])
    except Exception as e:
        # A failed lookup means we cannot tell who is protected, so abort.
        LOG.error("Failed to get users for group %s: %s", group_name, e)
        sys.exit(1)
78+
79+
80+
def get_running_notebooks(namespace: Optional[str] = None) -> list[dict]:
    """
    List running notebooks via a jsonpath query.

    When a namespace is given, results carry 'name' and 'startedAt'; with no
    namespace all namespaces are queried and each result also carries 'namespace'.
    """
    running_filter = '{range .items[?(@.status.containerState.running)]}'
    if namespace:
        scope = ["-n", namespace]
        jsonpath = (
            running_filter
            + '{.metadata.name}{"\\t"}{.status.containerState.running.startedAt}{"\\n"}{end}'
        )
    else:
        scope = ["-A"]
        jsonpath = (
            running_filter
            + '{.metadata.namespace}{"\\t"}{.metadata.name}{"\\t"}'
            + '{.status.containerState.running.startedAt}{"\\n"}{end}'
        )

    output = oc.invoke("get", ["notebooks"] + scope + ["-o", f"jsonpath={jsonpath}"]).out()

    results: list[dict] = []
    for raw_line in output.strip().splitlines():
        fields = raw_line.strip().split("\t")
        # Lines with an unexpected field count are silently dropped.
        if namespace and len(fields) == 2:
            results.append({"name": fields[0], "startedAt": fields[1]})
        elif not namespace and len(fields) == 3:
            results.append({"namespace": fields[0], "name": fields[1], "startedAt": fields[2]})
    return results
109+
110+
111+
def get_notebook_username_map(namespace: str) -> dict[str, str]:
    """
    Fetch every notebook in a namespace and map notebook name -> owner
    username, skipping notebooks without a username annotation.
    """
    raw = oc.invoke("get", ["notebooks", "-n", namespace, "-o", "json"]).out()
    mapping: dict[str, str] = {}
    for item in json.loads(raw).get("items", []):
        username = get_notebook_username(item)
        if username:
            mapping[item["metadata"]["name"]] = username
    return mapping
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
import os
2+
import sys
3+
import openshift_client as oc
4+
import logging
5+
import json
6+
from datetime import datetime, timezone
7+
from helpers import (
8+
parse_rfc3339,
9+
as_bool,
10+
build_user_cutoff_map,
11+
get_running_notebooks,
12+
get_notebook_username_map,
13+
)
14+
15+
LOG = logging.getLogger(__name__)
16+
17+
18+
def get_class_ns(culler: dict) -> tuple[dict, dict]:
    """
    Split the culler config into (multi-namespace, single-namespace) class maps.

    Multi-namespace entries keep the "ns" value as a namespace prefix;
    single-namespace entries keep it as the literal namespace.
    """
    multi_ns: dict[str, dict] = {}
    single_ns: dict[str, dict] = {}

    for class_name, config in culler.items():
        entry = {"cutoff": int(config["cutoff"])}
        if as_bool(config.get("multiple-ns", False)):
            entry["prefix"] = config["ns"]
            multi_ns[class_name] = entry
        else:
            entry["namespace"] = config["ns"]
            single_ns[class_name] = entry

    return multi_ns, single_ns
42+
43+
44+
def stop_notebook(nb_name: str, started_at: str, namespace: str, cutoff_seconds: int) -> bool:
    """Record a stop annotation on a notebook that has run past its cutoff.

    Returns True when the notebook was patched, False when it is still
    within its allowed runtime.
    """
    age_seconds = int((datetime.now(timezone.utc) - parse_rfc3339(started_at)).total_seconds())

    if age_seconds <= cutoff_seconds:
        LOG.info(
            "Notebook %s/%s within cutoff (age=%ss < cutoff=%ss)",
            namespace, nb_name, age_seconds, cutoff_seconds
        )
        return False

    # Write the current UTC time into the kubeflow-resource-stopped
    # annotation via a merge patch.
    stop_patch = {
        "metadata": {
            "annotations": {
                "kubeflow-resource-stopped": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
            }
        }
    }
    oc.invoke("patch", ["notebook", nb_name, "-n", namespace, "--type=merge", "-p", json.dumps(stop_patch)])
    LOG.info("Patched notebook %s/%s (age=%ss > cutoff=%ss)", namespace, nb_name, age_seconds, cutoff_seconds)
    return True
65+
66+
67+
def process_single(single_ns: dict[str, dict]) -> None:
    """Cull notebooks for all classes that run in one shared namespace.

    For each running notebook in the namespace: notebooks owned by a user in
    a configured class group are stopped once past that class's cutoff;
    notebooks owned by a user in NO configured group are DELETED along with
    their PVC.
    """
    if not single_ns:
        LOG.info("No single namespace classes configured")
        return

    # NOTE(review): only the FIRST entry's namespace is used — this assumes
    # every single-namespace class shares the same namespace. Confirm the
    # config guarantees this; entries with a different namespace would be
    # silently applied to the first one.
    namespace = next(iter(single_ns.values()))["namespace"]

    LOG.info("Processing shared single namespace %s for %d class(es)", namespace, len(single_ns))

    try:
        user_to_info = build_user_cutoff_map(single_ns)

        with oc.project(namespace):
            running = get_running_notebooks(namespace)
            username_map = get_notebook_username_map(namespace)

            for nb in running:
                nb_name = nb["name"]

                username = username_map.get(nb_name)
                if not username:
                    LOG.warning("Notebook %s missing username annotation, skipping", nb_name)
                    continue

                info = user_to_info.get(username)
                if not info:
                    # Destructive path: the user is in none of the class
                    # groups, so both the notebook and its PVC are removed.
                    # NOTE(review): if a class group comes back empty (see
                    # get_group_users' warning) its students fall into this
                    # branch — confirm that is the intended behavior.
                    try:
                        # PVC name is derived from the notebook name by
                        # swapping the jupyter-nb- prefix for jupyterhub-nb-
                        # and appending -pvc.
                        pvc = f"jupyterhub-nb-{nb_name.removeprefix('jupyter-nb-')}-pvc"
                        oc.invoke("delete", ["notebook", nb_name, "-n", namespace])
                        oc.invoke("delete", ["pvc", pvc, "-n", namespace])

                        LOG.info("Deleted notebook %s and pvc %s in namespace %s", nb_name, pvc, namespace)
                    except Exception as e:
                        # Best-effort delete: log and continue with the next notebook.
                        LOG.error("Failed deleting notebook %s or pvc in namespace %s: %s", nb_name,namespace, e)
                    continue

                # User is in a class group: stop (not delete) if past cutoff.
                stop_notebook(nb_name, nb["startedAt"], namespace, info["cutoff"])

    except Exception as e:
        LOG.error("Error processing shared single namespace %s: %s", namespace, e)
107+
108+
109+
def process_multi(multi_ns: dict[str, dict]) -> None:
    """Cull notebooks for classes whose namespaces share a configured prefix.

    A notebook's namespace matches a class when it equals the prefix or
    starts with "<prefix>-"; when several classes match, the most lenient
    (largest) cutoff applies.
    """
    prefix_cutoffs: list[tuple[str, int]] = [
        (str(cfg["prefix"]).strip(), int(cfg["cutoff"]))
        for cfg in multi_ns.values()
        if str(cfg["prefix"]).strip()
    ]

    if not prefix_cutoffs:
        LOG.info("No multi-namespace classes configured")
        return

    # Longest prefixes first; the result is the max over all matches either way.
    prefix_cutoffs.sort(key=lambda pc: len(pc[0]), reverse=True)

    try:
        all_running = get_running_notebooks()
    except Exception as e:
        LOG.error("Failed to list running notebooks across all namespaces: %s", e)
        return

    matched = 0
    for nb in all_running:
        ns = nb["namespace"]

        applicable = [
            cutoff
            for prefix, cutoff in prefix_cutoffs
            if ns == prefix or ns.startswith(prefix + "-")
        ]
        if not applicable:
            continue

        matched += 1
        stop_notebook(nb["name"], nb["startedAt"], ns, max(applicable))

    LOG.info("Matched %d running notebooks in multi namespaces", matched)
145+
146+
147+
if __name__ == '__main__':
    logging.basicConfig(level='INFO')

    # Read the variable with .get() so a missing CULLER_DICT produces the
    # intended error message instead of an unhandled KeyError, and report
    # malformed JSON explicitly instead of crashing with a traceback.
    raw_config = os.environ.get("CULLER_DICT")
    if not raw_config:
        LOG.error('CULLER_DICT environment variable is required.')
        sys.exit(1)

    try:
        culler_dict = json.loads(raw_config)
    except json.JSONDecodeError as e:
        LOG.error('CULLER_DICT is not valid JSON: %s', e)
        sys.exit(1)

    # An empty dict ({}) means nothing to cull — treat it as a config error.
    if not culler_dict:
        LOG.error('CULLER_DICT environment variable is required.')
        sys.exit(1)

    multi_ns, single_ns = get_class_ns(culler_dict)

    process_single(single_ns)
    process_multi(multi_ns)
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
openshift-client

0 commit comments

Comments
 (0)