From 4fe8e35c0349de62e5d47385816f1a09258c16da Mon Sep 17 00:00:00 2001 From: Mats Rynge Date: Tue, 19 May 2026 14:37:39 -0700 Subject: [PATCH] OSPool: refactoring cm/ccb, enable live reconfig --- opensciencegrid/ospool-ccb/10-htcondor.conf | 8 + opensciencegrid/ospool-ccb/Dockerfile | 53 +++++++ opensciencegrid/ospool-ccb/README.md | 4 + .../ospool-ccb/condor_master_wrapper | 6 + opensciencegrid/ospool-ccb/healthy.sh | 25 +++ .../ospool-ccb/opt/ospool/retain-logs | 19 +++ .../ospool-ccb/opt/ospool/update-prios | 87 +++++++++++ .../ospool-ccb/ospool-ccb-config.sh | 133 ++++++++++++++++ opensciencegrid/ospool-ccb/ospool-ccb.cron | 9 ++ opensciencegrid/ospool-ccb/supervisord.conf | 22 +++ opensciencegrid/ospool-cm/Dockerfile | 5 +- opensciencegrid/ospool-cm/README.md | 7 +- opensciencegrid/ospool-cm/ospool-cm-config.sh | 145 ++++++++++++++++++ opensciencegrid/ospool-cm/ospool.cron | 3 + 14 files changed, 520 insertions(+), 6 deletions(-) create mode 100644 opensciencegrid/ospool-ccb/10-htcondor.conf create mode 100644 opensciencegrid/ospool-ccb/Dockerfile create mode 100644 opensciencegrid/ospool-ccb/README.md create mode 100755 opensciencegrid/ospool-ccb/condor_master_wrapper create mode 100755 opensciencegrid/ospool-ccb/healthy.sh create mode 100755 opensciencegrid/ospool-ccb/opt/ospool/retain-logs create mode 100755 opensciencegrid/ospool-ccb/opt/ospool/update-prios create mode 100755 opensciencegrid/ospool-ccb/ospool-ccb-config.sh create mode 100644 opensciencegrid/ospool-ccb/ospool-ccb.cron create mode 100644 opensciencegrid/ospool-ccb/supervisord.conf create mode 100755 opensciencegrid/ospool-cm/ospool-cm-config.sh diff --git a/opensciencegrid/ospool-ccb/10-htcondor.conf b/opensciencegrid/ospool-ccb/10-htcondor.conf new file mode 100644 index 00000000..0416611e --- /dev/null +++ b/opensciencegrid/ospool-ccb/10-htcondor.conf @@ -0,0 +1,8 @@ +[program:condor_master] +command=/usr/sbin/condor_master_wrapper +autorestart=True +startsecs=60 +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +redirect_stderr=true + diff --git a/opensciencegrid/ospool-ccb/Dockerfile b/opensciencegrid/ospool-ccb/Dockerfile new file mode 100644 index 00000000..301cb5fe --- /dev/null +++ b/opensciencegrid/ospool-ccb/Dockerfile @@ -0,0 +1,53 @@ +# Specify the opensciencegrid/software-base image tag +ARG BASE_OSG_SERIES=25 +ARG BASE_YUM_REPO=release +ARG BASE_OS=el9 + +FROM opensciencegrid/software-base:$BASE_OSG_SERIES-$BASE_OS-$BASE_YUM_REPO + +# has to be redefined for use in the RUN stages +ARG BASE_YUM_REPO + +RUN yum -y distro-sync && \ + yum -y install \ + epel-release && \ + yum -y install \ + bc \ + git \ + lsof \ + rrdtool \ + python3-pip \ + vim \ + wget \ + && \ + yum clean all + +# Pull HTCondor from the proper repo. For "release" we need to use +# osg-upcoming-testing to meet the patch tuesday requirements. +RUN if [[ $BASE_YUM_REPO = release ]]; then \ + yum -y --enablerepo=osg-upcoming-testing install condor; \ + else \ + yum -y install condor; \ + fi + +# basic config is a collector, so we can test +RUN echo "DAEMON_LIST = MASTER, COLLECTOR" >/etc/condor/config.d/05-ospool-base.config && \ + echo "USE_SHARED_PORT = TRUE" >>/etc/condor/config.d/05-ospool-base.config + +COPY condor_master_wrapper /usr/sbin/ +RUN chmod 755 /usr/sbin/condor_master_wrapper + +# Override the software-base supervisord.conf to throw away supervisord logs +COPY supervisord.conf /etc/supervisord.conf + +COPY 10-htcondor.conf /etc/supervisord.d/ + +COPY ospool-ccb-config.sh /etc/osg/image-init.d/60-ospool-ccb-config.sh +RUN chmod 755 /etc/osg/image-init.d/60-ospool-ccb-config.sh + +COPY ospool-ccb.cron /etc/cron.d/ospool-ccb + +ADD opt/ospool /opt/ospool + +ADD healthy.sh /healthy.sh + diff --git a/opensciencegrid/ospool-ccb/README.md b/opensciencegrid/ospool-ccb/README.md new file mode 100644 index 00000000..c6c7fef4 --- /dev/null +++ b/opensciencegrid/ospool-ccb/README.md @@ -0,0 +1,4 @@ +# ospool-ccb + +Container image for the OSPool ccb instances. + diff --git a/opensciencegrid/ospool-ccb/condor_master_wrapper b/opensciencegrid/ospool-ccb/condor_master_wrapper new file mode 100755 index 00000000..34867f5f --- /dev/null +++ b/opensciencegrid/ospool-ccb/condor_master_wrapper @@ -0,0 +1,6 @@ +#!/bin/bash + +tail -F `condor_config_val LOG`/MasterLog 2>/dev/null & + +exec /usr/sbin/condor_master -f + diff --git a/opensciencegrid/ospool-ccb/healthy.sh b/opensciencegrid/ospool-ccb/healthy.sh new file mode 100755 index 00000000..697f3e3d --- /dev/null +++ b/opensciencegrid/ospool-ccb/healthy.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +failures=$(supervisorctl status | grep -Ev 'container_cleanup|RUNNING') +if [ -n "$failures" ]; then + failures=$(echo $failures | sed -E 's/ +/ /' | xargs) + echo "supervisord non-RUNNING service: $failures" >&2 + exit 2 +fi + +container_start_time=$(stat -c %Z /proc/1) # ctime, epoch time + +procs_z=$(ps axo pid,stat | awk '$2 ~ /^Z/ { print $1 }' | wc -l) +if [ "$procs_z" -gt 3 ]; then + echo "Found $procs_z zombie (Z) processes" >&2 + exit 4 +fi + +procs_d=$(ps axo pid,stat | awk '$2 ~ /^D/ { print $1 }' | wc -l) +if [ "$procs_d" -gt 15 ]; then + echo "Found $procs_d uninterruptible (D) processes" >&2 + exit 5 +fi + +exit 0 + diff --git a/opensciencegrid/ospool-ccb/opt/ospool/retain-logs b/opensciencegrid/ospool-ccb/opt/ospool/retain-logs new file mode 100755 index 00000000..56f7ba0b --- /dev/null +++ b/opensciencegrid/ospool-ccb/opt/ospool/retain-logs @@ -0,0 +1,19 @@ +#!/bin/bash + +# For performance reasons, the pods are set up to log to local +# disk. This script is run from cron, and keeps copies of the +# logs for a certain number of days. + +TARGET_DIR=save-$(date +'%Y%m%d') + +mkdir -p /state/htcondor/logs +cd /state/htcondor/logs || exit 1 + +mkdir -p $TARGET_DIR +rsync -a /var/log/condor/. $TARGET_DIR/. + +# only keep the last N set of saved logs +for OLD in $(ls -d -t save-*| tail -n +7); do + rm -rf $OLD +done + diff --git a/opensciencegrid/ospool-ccb/opt/ospool/update-prios b/opensciencegrid/ospool-ccb/opt/ospool/update-prios new file mode 100755 index 00000000..0271c0c0 --- /dev/null +++ b/opensciencegrid/ospool-ccb/opt/ospool/update-prios @@ -0,0 +1,87 @@ +#!/usr/bin/python3 + +# This script is designed to adjust user priorities in the OSPool based on +# the current state of user workloads. The HTCondor priority settings are +# modified according to attributes such as the number of held jobs and the +# ratio of goodput to badput. + +import os +import re +import sys + +import htcondor2 as htcondor +import classad2 as classad +from htcondor2 import AdTypes + + +MIN_PRIORITY_FACTOR = 500 +MAX_PRIORITY_FACTOR = 20000 +HELD_JOB_PENALTY_MULTIPLIER = 5 + + +def ad_int(ad, key, default=0): + if key not in ad: + return default + try: + if int(ad[key]) == ad[key]: + return int(ad[key]) + except: + return default + return default + + +def step(n): + # round n to a nice 100 + return int(round(n / 100) * 100) + + +def main(): + col = htcondor.Collector() + + # find the right negotiator + neg_ad = None + for ad in col.query(AdTypes.Negotiator): + # the main negotiator of the pool starts with cm-1. or cm-2. + if re.search("^cm-[12]\.", ad["Name"]): + neg_ad = ad + if not neg_ad: + print("Unable to find the main negotiator") + sys.exit(1) + neg = htcondor.Negotiator(neg_ad) + print(f"Updating negotiator {neg_ad['Name']}") + + # get the current prio ads so we can determine if we need + # an update or note + current_prios = neg.getPriorities() + + for ad in col.query(AdTypes.Submitter): + + # current factor + current_factor = 1000 + for prio_ad in current_prios: + if prio_ad["Name"] == ad["Name"]: + current_factor = round(prio_ad["PriorityFactor"]) + + factor = 1000 + + # held jobs + held = ad_int(ad, "HeldJobs", 0) * HELD_JOB_PENALTY_MULTIPLIER + factor += held + + # make sure we finish jobs on ap2X before migration + if ".uc.osg-htc.org" in ad["Name"]: + factor -= 1000 + + # upper/lower limits on the adjustments + factor = min(factor, MAX_PRIORITY_FACTOR) + factor = max(factor, MIN_PRIORITY_FACTOR) + factor = step(factor) + + if factor != current_factor: + print(f" {ad['Name']} {current_factor} -> {factor}") + neg.setFactor(ad["Name"], factor) + + +if __name__ == "__main__": + main() + diff --git a/opensciencegrid/ospool-ccb/ospool-ccb-config.sh b/opensciencegrid/ospool-ccb/ospool-ccb-config.sh new file mode 100755 index 00000000..564e7c2e --- /dev/null +++ b/opensciencegrid/ospool-ccb/ospool-ccb-config.sh @@ -0,0 +1,133 @@ +#!/bin/bash + +# this script is invoked by the OSG container init setup + + +function kill_container() +{ + echo "Restarting container: $1" 2>&1 + sleep 180 + exit 1 +} + +# create an env file so we can have the cron jobs run with the +# same environment as the init.d run +if [ -e /etc/ospool.env ]; then + . /etc/ospool.env +else + cat >/etc/ospool.env </etc/condor/config.d/10-ospool-ccb.config </etc/condor/certs/condor_mapfile + echo "Writing out new /etc/condor/config.d/95_flocking_ospoolgit.config ..." + ./fe-admin --target-env $OSPOOL_ENVIRONMENT --htcondor-config >/etc/condor/config.d/95_flocking_ospoolgit.config + + # this will fail during initial configuration, but work once the pool is up + /usr/sbin/condor_reconfig || true + +fi + diff --git a/opensciencegrid/ospool-ccb/ospool-ccb.cron b/opensciencegrid/ospool-ccb/ospool-ccb.cron new file mode 100644 index 00000000..3ed445d1 --- /dev/null +++ b/opensciencegrid/ospool-ccb/ospool-ccb.cron @@ -0,0 +1,9 @@ + +# configuration changes + */5 * * * * root /bin/bash -c 'sleep $(( RANDOM \% 240 )); /etc/osg/image-init.d/60-ospool-ccb-config.sh' >/var/log/ospool-ccb-config.log 2>&1 + + +# logs + 0 * * * * root /opt/ospool/retain-logs >/dev/null 2>&1 + + diff --git a/opensciencegrid/ospool-ccb/supervisord.conf b/opensciencegrid/ospool-ccb/supervisord.conf new file mode 100644 index 00000000..966130f3 --- /dev/null +++ b/opensciencegrid/ospool-ccb/supervisord.conf @@ -0,0 +1,22 @@ +[supervisord] +nodaemon=true +logfile=/dev/null +logfile_maxbytes=0 + +[unix_http_server] +file=/tmp/supervisor.sock ; (the path to the socket file) + +[rpcinterface:supervisor] +supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface + +[supervisorctl] +serverurl=unix:///tmp/supervisor.sock ; use a unix:// URL for a unix socket +loglevel=debug + +[include] +files=/etc/supervisord.d/*.conf + +[program:crond] +command=/usr/sbin/crond -n +autorestart=true + diff --git a/opensciencegrid/ospool-cm/Dockerfile b/opensciencegrid/ospool-cm/Dockerfile index 8cf38a8b..e490826e 100644 --- a/opensciencegrid/ospool-cm/Dockerfile +++ b/opensciencegrid/ospool-cm/Dockerfile @@ -1,5 +1,5 @@ # Specify the opensciencegrid/software-base image tag -ARG BASE_OSG_SERIES=23 +ARG BASE_OSG_SERIES=25 ARG BASE_YUM_REPO=release ARG BASE_OS=el9 @@ -47,6 +47,9 @@ COPY supervisord.conf /etc/supervisord.conf COPY 10-htcondor.conf /etc/supervisord.d/ COPY 30-prometheus.conf /etc/supervisord.d/ +COPY ospool-cm-config.sh /etc/osg/image-init.d/60-ospool-cm-config.sh +RUN chmod 755 /etc/osg/image-init.d/60-ospool-cm-config.sh + COPY ospool.cron /etc/cron.d/ospool ADD opt/ospool /opt/ospool diff --git a/opensciencegrid/ospool-cm/README.md b/opensciencegrid/ospool-cm/README.md index aaa33b4d..fe98ac7d 100644 --- a/opensciencegrid/ospool-cm/README.md +++ b/opensciencegrid/ospool-cm/README.md @@ -1,7 +1,4 @@ -# docker-ospool-cm +# ospool-cm -Container image for the OSPool central managers and ccb instances. -This is currently based on the OSG 3.6 release, but we want to -keep a separate image here as the OSPool might need more newer -HTCondor release to test/enable new features. +Container image for the OSPool central managers. diff --git a/opensciencegrid/ospool-cm/ospool-cm-config.sh b/opensciencegrid/ospool-cm/ospool-cm-config.sh new file mode 100755 index 00000000..06dc8db2 --- /dev/null +++ b/opensciencegrid/ospool-cm/ospool-cm-config.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +# This script is invoked by the OSG container init setup, and from cron + +function kill_container() +{ + echo "Restarting container: $1" 2>&1 + sleep 180 + exit 1 +} + +# create an env file so we can have the cron jobs run with the +# same environment as the init.d run +if [ -e /etc/ospool.env ]; then + . /etc/ospool.env +else + cat >/etc/ospool.env </etc/condor/config.d/10-ospool-cm.config </etc/condor/certs/condor_mapfile + echo "Writing out new /etc/condor/config.d/95_flocking_ospoolgit.config ..." + ./fe-admin --target-env $OSPOOL_ENVIRONMENT --htcondor-config >/etc/condor/config.d/95_flocking_ospoolgit.config + + # need a spool directory for our extra negotiator + mkdir -p $(condor_config_val SPOOL)/negotiator_allocated + chown condor: $(condor_config_val SPOOL)/negotiator_allocated + + # set up the ospool-overview web directory + mkdir -p /shared/web/overview + cp -r /opt/ospool-overview/* /shared/web/overview/ + + # this will fail during initial configuration, but work once the pool is up + /usr/sbin/condor_reconfig || true + +fi + diff --git a/opensciencegrid/ospool-cm/ospool.cron b/opensciencegrid/ospool-cm/ospool.cron index 480274c2..7d883413 100644 --- a/opensciencegrid/ospool-cm/ospool.cron +++ b/opensciencegrid/ospool-cm/ospool.cron @@ -1,4 +1,7 @@ +# configuration changes + */5 * * * * root /bin/bash -c 'sleep $(( RANDOM \% 240 )); /etc/osg/image-init.d/60-ospool-cm-config.sh' >/var/log/ospool-cm-config.log 2>&1 + # logs 0 * * * * root /opt/ospool/retain-logs >/dev/null 2>&1