Skip to content

Commit bb8ee5b

Browse files
authored
Merge pull request #511 from microsoft/v1.1.0-hotfix
V1.1.0
2 parents 1b97834 + 9daf206 commit bb8ee5b

61 files changed

Lines changed: 3935 additions & 1894 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

src/ClusterBootstrap/deploy.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2831,8 +2831,8 @@ def start_one_kube_service(fname):
28312831
pass
28322832

28332833
if fname == "./deploy/services/jobmanager/jobmanager.yaml":
2834-
# recreate the configmap init-user-script
2835-
run_kubectl( ["create configmap init-user-script --from-file=../Jobs_Templete/init_user.sh -o yaml --dry-run | ./deploy/bin/kubectl apply -f -"] )
2834+
# recreate the configmap dlws-scripts
2835+
run_kubectl( ["create configmap dlws-scripts --from-file=../Jobs_Templete/ -o yaml --dry-run | ./deploy/bin/kubectl apply -f -"] )
28362836

28372837
run_kubectl( ["create", "-f", fname ] )
28382838

src/ClusterBootstrap/params.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,19 @@
2323
"job-exporter": { "port": 9102 },
2424
"node-exporter": { "port": 9100 },
2525
"watchdog": { "port": 9101 },
26-
"grafana": { "port": 3000 },
26+
"grafana": { "port": 3000, "prometheus-ip": "localhost" },
2727
"alert-manager": {
2828
"port": 9093,
2929
"configured": False,
3030
"alert_users": False,
3131
# To deploy with alert-manager, set
3232
# "configured" to True and fill in appropriate values for:
3333
# smtp_url, smtp_from, smtp_auth_username, smtp_auth_password and receiver
34+
"reaper": {
35+
"dry-run": True,
36+
"port": "9500",
37+
"restful-url": "http://localhost:5000",
38+
}
3439
},
3540

3641
"mysql_port": "3306",

src/ClusterBootstrap/services/jobmanager/jobmanager.yaml

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ spec:
1313
labels:
1414
jobmanager-node: pod
1515
app: jobmanager
16+
annotations:
17+
prometheus.io/scrape: "true"
18+
prometheus.io/path: "/metrics"
1619
spec:
1720
{% if cnf["dnsPolicy"] %}
1821
dnsPolicy: {{cnf["dnsPolicy"]}}
@@ -39,7 +42,40 @@ spec:
3942
- mountPath: {{cnf["storage-mount-path"]}}/jobfiles
4043
name: dlwsdatajobfiles
4144
- mountPath: /var/log/dlworkspace
42-
name: log
45+
name: log
46+
ports:
47+
- containerPort: 9200
48+
hostPort: 9200
49+
name: job-mgr
50+
protocol: TCP
51+
- containerPort: 9201
52+
hostPort: 9201
53+
name: user-mgr
54+
protocol: TCP
55+
- containerPort: 9202
56+
hostPort: 9202
57+
name: node-mgr
58+
protocol: TCP
59+
- containerPort: 9203
60+
hostPort: 9203
61+
name: joblog-mgr
62+
protocol: TCP
63+
- containerPort: 9204
64+
hostPort: 9204
65+
name: cmd-mgr
66+
protocol: TCP
67+
- containerPort: 9205
68+
hostPort: 9205
69+
name: endpoint-mgr
70+
protocol: TCP
71+
readinessProbe:
72+
failureThreshold: 3
73+
initialDelaySeconds: 3
74+
periodSeconds: 30
75+
successThreshold: 1
76+
tcpSocket:
77+
port: 9200
78+
timeoutSeconds: 10
4379
volumes:
4480
- name: certs
4581
hostPath:

src/ClusterBootstrap/services/monitor/alert-manager.yaml

Lines changed: 74 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ spec:
2424
hostNetwork: true
2525
containers:
2626
- name: alert-manager
27-
image: prom/alertmanager:v0.15.1
27+
image: prom/alertmanager:v0.18.0
2828
args:
2929
- '--config.file=/etc/alertmanager/config.yml'
3030
- '--storage.path=/alertmanager'
@@ -40,6 +40,23 @@ spec:
4040
mountPath: /alertmanager
4141
- name: templates-volume
4242
mountPath: /etc/alertmanager/template
43+
{% if cnf["alert-manager"]["reaper"] %}
44+
- name: reaper
45+
image: {{cnf["worker-dockerregistry"]}}{{cnf["dockerprefix"]}}reaper:{{cnf["dockertag"]}}
46+
command:
47+
- 'python'
48+
- '/reaper/main.py'
49+
- '--port'
50+
- '{{ cnf["alert-manager"]["reaper"]["port"] }}'
51+
- '--restful_url'
52+
- '{{ cnf["alert-manager"]["reaper"]["restful-url"] }}'
53+
{% if cnf["alert-manager"]["reaper"]["dry-run"] %}
54+
- '--dry_run'
55+
{% endif %}
56+
ports:
57+
- name: alert-manager
58+
containerPort: {{ cnf["alert-manager"]["reaper"]["port"] }}
59+
{% endif %}
4360
volumes:
4461
- name: config-volume
4562
configMap:
@@ -80,22 +97,38 @@ data:
8097
receiver: alert-email
8198
group_wait: 30s
8299
group_interval: 5m
83-
group_by: [alertname]
100+
group_by: [alertname, cluster]
84101
routes:
85-
- receiver: task_user
102+
- receiver: idle_gpu_receiver
86103
repeat_interval: 4h
87104
group_by: [alertname, user_email, cluster]
88105
match_re:
89-
type: user_alert
106+
type: idle_gpu
90107
alertname: "zero-gpu-usage"
108+
- receiver: job_state_change_receiver
109+
group_by: [alertname, user_email, cluster, subject]
110+
match_re:
111+
type: user_alert
112+
alertname: "job-state-changed"
113+
- receiver: reaper
114+
group_by: [alertname, user_email, job_name]
115+
group_wait: 0s
116+
match_re:
117+
type: reaper
118+
- receiver: kill_idle_job_email
119+
group_by: [alertname, user_email, cluster]
120+
group_wait: 0s
121+
match_re:
122+
type: kill_idle_job_email
123+
alertname: "kill-idle-jobs-email"
91124
receivers:
92125
- name: "alert-email"
93126
email_configs:
94127
- to: {{ alert_info["receiver"] }}
95128
html: '{{ "{{" }} template "email.html" . {{ "}}" }}'
96129
headers:
97130
subject: '{{ "{{" }} .GroupLabels.cluster {{ "}}" }}: {{ "{{" }} template "__subject" . {{ "}}" }}'
98-
- name: "task_user"
131+
- name: "idle_gpu_receiver"
99132
email_configs:
100133
{% if cnf["alert-manager"]["alert_users"] %}
101134
- to: '{{ "{{" }} .GroupLabels.user_email {{ "}}" }},{{ alert_info["receiver"] }}'
@@ -109,4 +142,40 @@ data:
109142
CC: '{{ alert_info["receiver"] }}'
110143
{% endif %}
111144
subject: '{{ "{{" }} .GroupLabels.cluster {{ "}}" }}: {{ "{{" }} template "__subject" . {{ "}}" }}'
145+
- name: "job_state_change_receiver"
146+
email_configs:
147+
{% if cnf["alert-manager"]["alert_users"] %}
148+
- to: '{{ "{{" }} .GroupLabels.user_email {{ "}}" }},{{ alert_info["receiver"] }}'
149+
{% else %}
150+
- to: '{{ alert_info["receiver"] }}'
151+
{% endif %}
152+
html: '{{ "{{" }} template "job_state.html" . {{ "}}" }}'
153+
headers:
154+
{% if cnf["alert-manager"]["alert_users"] %}
155+
To: '{{ "{{" }} .GroupLabels.user_email {{ "}}" }}'
156+
CC: '{{ alert_info["receiver"] }}'
157+
{% endif %}
158+
subject: '{{ "{{" }} .GroupLabels.cluster {{ "}}" }}: {{ "{{" }} template "__subject" . {{ "}}" }}'
159+
- name: "reaper"
160+
{% if cnf["alert-manager"]["reaper"] %}
161+
webhook_configs:
162+
- send_resolved: False
163+
url: 'http://localhost:{{ cnf["alert-manager"]["reaper"]["port"] }}/kill'
164+
http_config:
165+
bearer_token: 'shinigami'
166+
- name: "kill_idle_job_email"
167+
email_configs:
168+
{% if cnf["alert-manager"]["alert_users"] %}
169+
- to: '{{ "{{" }} .GroupLabels.user_email {{ "}}" }},{{ alert_info["receiver"] }}'
170+
{% else %}
171+
- to: '{{ alert_info["receiver"] }}'
172+
{% endif %}
173+
html: '{{ "{{" }} template "kill_idle.html" . {{ "}}" }}'
174+
headers:
175+
{% if cnf["alert-manager"]["alert_users"] %}
176+
To: '{{ "{{" }} .GroupLabels.user_email {{ "}}" }}'
177+
CC: '{{ alert_info["receiver"] }}'
178+
{% endif %}
179+
subject: '{{ "{{" }} .GroupLabels.cluster {{ "}}" }}: {{ "{{" }} template "__subject" . {{ "}}" }}'
180+
{% endif %}
112181
{% endif %}
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
{{ define "job_state.html" }}
2+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
3+
<!--
4+
Style and HTML derived from https://github.com/mailgun/transactional-email-templates
5+
6+
7+
The MIT License (MIT)
8+
9+
Copyright (c) 2014 Mailgun
10+
11+
Permission is hereby granted, free of charge, to any person obtaining a copy
12+
of this software and associated documentation files (the "Software"), to deal
13+
in the Software without restriction, including without limitation the rights
14+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15+
copies of the Software, and to permit persons to whom the Software is
16+
furnished to do so, subject to the following conditions:
17+
18+
The above copyright notice and this permission notice shall be included in all
19+
copies or substantial portions of the Software.
20+
21+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27+
SOFTWARE.
28+
-->
29+
<html xmlns="http://www.w3.org/1999/xhtml" xmlns="http://www.w3.org/1999/xhtml" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
30+
<head>
31+
<meta name="viewport" content="width=device-width"/>
32+
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
33+
<title>{{ template "__subject" . }}</title>
34+
35+
</head>
36+
37+
<body itemscope="" itemtype="http://schema.org/EmailMessage" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; -webkit-font-smoothing: antialiased; -webkit-text-size-adjust: none; height: 100%; line-height: 1.6em; width: 100% !important; background-color: #f6f6f6; margin: 0; padding: 0;" bgcolor="#f6f6f6">
38+
39+
<table style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; width: 100%; background-color: #f6f6f6; margin: 0;" bgcolor="#f6f6f6">
40+
<tr>
41+
<td valign="top"></td>
42+
<td width="600" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; display: block !important; max-width: 600px !important; clear: both !important; width: 100% !important; margin: 0 auto; padding: 0;" valign="top">
43+
<div style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; max-width: 600px; display: block; margin: 0 auto; padding: 0;">
44+
<table width="100%" cellpadding="0" cellspacing="0" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; border-radius: 3px; background-color: #fff; margin: 0; border: 1px solid #e9e9e9;" bgcolor="#fff">
45+
<tr>
46+
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 10px;" valign="top">
47+
<table width="100%" cellpadding="0" cellspacing="0">
48+
{{ range .Alerts.Firing }}
49+
<tr>
50+
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 0 0 20px;" valign="top">
51+
Your job
52+
<a href="http://dltshub.redmond.corp.microsoft.com/Home/JobDetail/?cluster={{.Labels.cluster}}&jobId={{.Labels.job_name}}">
53+
<strong>{{.Labels.job_name}}</strong>
54+
</a> from cluster '{{.Labels.cluster}}' has changed to state of {{.Labels.job_state}}.
55+
</td>
56+
</tr>
57+
{{ end }}
58+
</table>
59+
</td>
60+
</tr>
61+
</table>
62+
63+
</div>
64+
</td>
65+
<td valign="top"></td>
66+
</tr>
67+
</table>
68+
69+
</body>
70+
</html>
71+
{{ end }}
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
{{ define "kill_idle.html" }}
2+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
3+
<!--
4+
Style and HTML derived from https://github.com/mailgun/transactional-email-templates
5+
6+
7+
The MIT License (MIT)
8+
9+
Copyright (c) 2014 Mailgun
10+
11+
Permission is hereby granted, free of charge, to any person obtaining a copy
12+
of this software and associated documentation files (the "Software"), to deal
13+
in the Software without restriction, including without limitation the rights
14+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15+
copies of the Software, and to permit persons to whom the Software is
16+
furnished to do so, subject to the following conditions:
17+
18+
The above copyright notice and this permission notice shall be included in all
19+
copies or substantial portions of the Software.
20+
21+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27+
SOFTWARE.
28+
-->
29+
<html xmlns="http://www.w3.org/1999/xhtml" xmlns="http://www.w3.org/1999/xhtml" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
30+
<head>
31+
<meta name="viewport" content="width=device-width"/>
32+
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
33+
<title>{{ template "__subject" . }}</title>
34+
35+
</head>
36+
37+
<body itemscope="" itemtype="http://schema.org/EmailMessage" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; -webkit-font-smoothing: antialiased; -webkit-text-size-adjust: none; height: 100%; line-height: 1.6em; width: 100% !important; background-color: #f6f6f6; margin: 0; padding: 0;" bgcolor="#f6f6f6">
38+
39+
<table style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; width: 100%; background-color: #f6f6f6; margin: 0;" bgcolor="#f6f6f6">
40+
<tr>
41+
<td valign="top"></td>
42+
<td width="600" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; display: block !important; max-width: 600px !important; clear: both !important; width: 100% !important; margin: 0 auto; padding: 0;" valign="top">
43+
<div style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; max-width: 600px; display: block; margin: 0 auto; padding: 0;">
44+
<table width="100%" cellpadding="0" cellspacing="0" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; border-radius: 3px; background-color: #fff; margin: 0; border: 1px solid #e9e9e9;" bgcolor="#fff">
45+
<tr>
46+
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 10px;" valign="top">
47+
<table width="100%" cellpadding="0" cellspacing="0">
48+
{{ range .Alerts.Firing }}
49+
<tr>
50+
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 0 0 20px;" valign="top">
51+
Your job
52+
<a href="http://dltshub.redmond.corp.microsoft.com/Home/JobDetail/?cluster={{.Labels.cluster}}&jobId={{.Labels.job_name}}">
53+
<strong>{{.Labels.job_name}}</strong>
54+
</a> from cluster '{{.Labels.cluster}}' VC '{{.Labels.vc_name}}' was killed because it has been idle for too long.
55+
</td>
56+
</tr>
57+
{{ end }}
58+
</table>
59+
</td>
60+
</tr>
61+
</table>
62+
63+
</div>
64+
</td>
65+
<td valign="top"></td>
66+
</tr>
67+
</table>
68+
69+
</body>
70+
</html>
71+
{{ end }}

src/ClusterBootstrap/services/monitor/alerting/jobs.rules

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,14 @@ groups:
55
expr: avg(task_gpu_percent) by (user_email, job_name, vc_name) == 0
66
for: 4h
77
labels:
8-
type: user_alert
8+
type: idle_gpu
9+
- alert: kill-idle-jobs-email
10+
expr: avg(task_gpu_percent) by (user_email, job_name, vc_name) == 0
11+
for: 8h
12+
labels:
13+
type: kill_idle_job_email
14+
- alert: kill-idle-jobs
15+
expr: avg(task_gpu_percent) by (user_email, job_name, vc_name) == 0
16+
for: 8h
17+
labels:
18+
type: reaper

0 commit comments

Comments
 (0)