2424 hostNetwork : true
2525 containers :
2626 - name : alert-manager
27- image : prom/alertmanager:v0.15.1
27+ image : prom/alertmanager:v0.18.0
2828 args :
2929 - ' --config.file=/etc/alertmanager/config.yml'
3030 - ' --storage.path=/alertmanager'
4040 mountPath : /alertmanager
4141 - name : templates-volume
4242 mountPath : /etc/alertmanager/template
43+ {% if cnf["alert-manager"]["reaper"] %}
# Optional "reaper" sidecar container: this whole entry is rendered only when
# cnf["alert-manager"]["reaper"] is set.
44+ - name : reaper
# Image reference is assembled from the worker registry, the docker prefix and
# the docker tag taken from cnf.
45+ image : {{cnf["worker-dockerregistry"]}}{{cnf["dockerprefix"]}}reaper:{{cnf["dockertag"]}}
# Starts /reaper/main.py with the configured listen port and restful URL.
46+ command :
47+ - ' python'
48+ - ' /reaper/main.py'
49+ - ' --port'
50+ - ' {{ cnf["alert-manager"]["reaper"]["port"] }}'
51+ - ' --restful_url'
52+ - ' {{ cnf["alert-manager"]["reaper"]["restful-url"] }}'
# --dry_run is appended only when the "dry-run" option is truthy.
53+ {% if cnf["alert-manager"]["reaper"]["dry-run"] %}
54+ - ' --dry_run'
55+ {% endif %}
56+ ports :
# NOTE(review): this is the reaper's port but it is named "alert-manager".
# Kubernetes documents named ports as unique within a pod - confirm this does
# not clash with a port name on the alert-manager container.
57+ - name : alert-manager
# Same port value that is passed to --port above.
58+ containerPort : {{ cnf["alert-manager"]["reaper"]["port"] }}
59+ {% endif %}
4360 volumes :
4461 - name : config-volume
4562 configMap :
@@ -80,22 +97,38 @@ data:
8097 receiver : alert-email
8198 group_wait : 30s
8299 group_interval : 5m
83- group_by : [alertname]
100+ group_by : [alertname, cluster ]
84101 routes :
85- - receiver : task_user
102+ - receiver : idle_gpu_receiver
86103 repeat_interval : 4h
87104 group_by : [alertname, user_email, cluster]
88105 match_re :
89- type : user_alert
106+ type : idle_gpu
90107 alertname : " zero-gpu-usage"
# Job state-change notifications: alerts carrying type=user_alert and the
# job-state-changed alertname. `subject` is part of the grouping key in
# addition to alertname/user_email/cluster.
108+ - receiver : job_state_change_receiver
109+ group_by : [alertname, user_email, cluster, subject]
110+ match_re :
111+ type : user_alert
112+ alertname : " job-state-changed"
# Reaper route: selected on the `type` label alone and grouped per
# alertname/user_email/job_name. group_wait 0s replaces the 30s set on the
# parent route, so there is no initial batching delay for these alerts.
113+ - receiver : reaper
114+ group_by : [alertname, user_email, job_name]
115+ group_wait : 0s
116+ match_re :
117+ type : reaper
# E-mail notification for the kill-idle-jobs alert, also sent without the
# initial group_wait delay. The receiver named here must be defined under
# `receivers`.
118+ - receiver : kill_idle_job_email
119+ group_by : [alertname, user_email, cluster]
120+ group_wait : 0s
121+ match_re :
122+ type : kill_idle_job_email
123+ alertname : " kill-idle-jobs-email"
91124 receivers :
92125 - name : " alert-email"
93126 email_configs :
94127 - to : {{ alert_info["receiver"] }}
95128 html : ' {{ "{{" }} template "email.html" . {{ "}}" }}'
96129 headers :
97130 subject : ' {{ "{{" }} .GroupLabels.cluster {{ "}}" }}: {{ "{{" }} template "__subject" . {{ "}}" }}'
98- - name : " task_user "
131+ - name : " idle_gpu_receiver "
99132 email_configs :
100133 {% if cnf["alert-manager"]["alert_users"] %}
101134 - to : ' {{ "{{" }} .GroupLabels.user_email {{ "}}" }},{{ alert_info["receiver"] }}'
@@ -109,4 +142,40 @@ data:
109142 CC : ' {{ alert_info["receiver"] }}'
110143 {% endif %}
111144 subject : ' {{ "{{" }} .GroupLabels.cluster {{ "}}" }}: {{ "{{" }} template "__subject" . {{ "}}" }}'
# E-mail receiver used by the job_state_change_receiver route.
# The {{ "{{" }} ... {{ "}}" }} pairs make Jinja emit literal double braces, so
# the enclosed expressions stay in the rendered config as Alertmanager
# templates instead of being evaluated by Jinja.
145+ - name : " job_state_change_receiver"
146+ email_configs :
# With alert_users enabled the mail is delivered to the group's user_email
# label plus the alert_info["receiver"] address; otherwise only to the latter.
147+ {% if cnf["alert-manager"]["alert_users"] %}
148+ - to : ' {{ "{{" }} .GroupLabels.user_email {{ "}}" }},{{ alert_info["receiver"] }}'
149+ {% else %}
150+ - to : ' {{ alert_info["receiver"] }}'
151+ {% endif %}
# Message body comes from the "job_state.html" template.
152+ html : ' {{ "{{" }} template "job_state.html" . {{ "}}" }}'
153+ headers :
# With alert_users enabled, the visible headers list the user under To and
# the alert_info["receiver"] address under CC.
154+ {% if cnf["alert-manager"]["alert_users"] %}
155+ To : ' {{ "{{" }} .GroupLabels.user_email {{ "}}" }}'
156+ CC : ' {{ alert_info["receiver"] }}'
157+ {% endif %}
# Subject line is the cluster group label followed by the "__subject" template.
158+ subject : ' {{ "{{" }} .GroupLabels.cluster {{ "}}" }}: {{ "{{" }} template "__subject" . {{ "}}" }}'
# Webhook receiver for the reaper. The receiver entry itself is always emitted
# so the `reaper` route resolves; its webhook is rendered only when
# cnf["alert-manager"]["reaper"] is configured.
159+ - name : " reaper"
160+ {% if cnf["alert-manager"]["reaper"] %}
161+ webhook_configs :
# Resolved notifications are not forwarded to the webhook.
162+ - send_resolved : false
# Posts to the /kill endpoint on localhost at the configured reaper port.
163+ url : ' http://localhost:{{ cnf["alert-manager"]["reaper"]["port"] }}/kill'
164+ http_config :
# NOTE(review): static bearer token committed in the template; presumably it
# has to match what the reaper service checks - consider sourcing it from cnf.
165+ bearer_token : ' shinigami'
# The reaper conditional must end here. The kill_idle_job_email route is
# defined unconditionally, so its receiver below has to exist even when the
# reaper is disabled; otherwise the rendered config references an undefined
# receiver and Alertmanager refuses to load it.
166+ {% endif %}
# E-mail receiver used by the kill_idle_job_email route.
167+ - name : " kill_idle_job_email"
168+ email_configs :
# With alert_users enabled the mail is delivered to the group's user_email
# label plus the alert_info["receiver"] address; otherwise only to the latter.
169+ {% if cnf["alert-manager"]["alert_users"] %}
170+ - to : ' {{ "{{" }} .GroupLabels.user_email {{ "}}" }},{{ alert_info["receiver"] }}'
171+ {% else %}
172+ - to : ' {{ alert_info["receiver"] }}'
173+ {% endif %}
# Message body comes from the "kill_idle.html" template.
174+ html : ' {{ "{{" }} template "kill_idle.html" . {{ "}}" }}'
175+ headers :
# With alert_users enabled, the visible headers list the user under To and
# the alert_info["receiver"] address under CC.
176+ {% if cnf["alert-manager"]["alert_users"] %}
177+ To : ' {{ "{{" }} .GroupLabels.user_email {{ "}}" }}'
178+ CC : ' {{ alert_info["receiver"] }}'
179+ {% endif %}
180+ subject : ' {{ "{{" }} .GroupLabels.cluster {{ "}}" }}: {{ "{{" }} template "__subject" . {{ "}}" }}'
112181{% endif %}
0 commit comments