Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions opensciencegrid/ospool-ccb/10-htcondor.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
; supervisord program definition that runs HTCondor's master through the
; wrapper script at /usr/sbin/condor_master_wrapper.
[program:condor_master]
command=/usr/sbin/condor_master_wrapper
; restart the program whenever it exits
autorestart=True
; seconds the program has to stay up after launch to count as started
startsecs=60
; send the program's output to the container's stdout; a max size of 0
; turns off supervisord's log rotation for this stream
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
; fold stderr into the stdout stream above
redirect_stderr=true

53 changes: 53 additions & 0 deletions opensciencegrid/ospool-ccb/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Specify the opensciencegrid/software-base image tag
ARG BASE_OSG_SERIES=25
ARG BASE_YUM_REPO=release
ARG BASE_OS=el9

FROM opensciencegrid/software-base:$BASE_OSG_SERIES-$BASE_OS-$BASE_YUM_REPO

# has to be redefined for use in the RUN stages
ARG BASE_YUM_REPO

# Bring the base image up to date, then add EPEL and the extra tools
RUN yum -y distro-sync && \
    yum -y install \
        epel-release && \
    yum -y install \
        bc \
        git \
        lsof \
        rrdtool \
        python3-pip \
        vim \
        wget \
    && \
    yum clean all

# Pull HTCondor from the proper repo. For "release" we need to use
# osg-upcoming-testing to meet the patch tuesday requirements.
# The test uses POSIX [ ] because shell-form RUN executes under /bin/sh,
# and the yum cache is cleaned in the same layer so it is not baked into it.
RUN if [ "$BASE_YUM_REPO" = release ]; then \
        yum -y --enablerepo=osg-upcoming-testing install condor; \
    else \
        yum -y install condor; \
    fi && \
    yum clean all

# basic config is a collector, so we can test
RUN echo "DAEMON_LIST = MASTER, COLLECTOR" >/etc/condor/config.d/05-ospool-base.config && \
    echo "USE_SHARED_PORT = TRUE" >>/etc/condor/config.d/05-ospool-base.config

COPY condor_master_wrapper /usr/sbin/
RUN chmod 755 /usr/sbin/condor_master_wrapper

# Override the software-base supervisord.conf to throw away supervisord logs
COPY supervisord.conf /etc/supervisord.conf

COPY 10-htcondor.conf /etc/supervisord.d/

COPY ospool-ccb-config.sh /etc/osg/image-init.d/60-ospool-ccb-config.sh
RUN chmod 755 /etc/osg/image-init.d/60-ospool-ccb-config.sh

COPY ospool-ccb.cron /etc/cron.d/ospool-ccb

# COPY rather than ADD: these are plain local files/directories, so ADD's
# archive-extraction and URL handling are not wanted here.
COPY opt/ospool /opt/ospool

COPY healthy.sh /healthy.sh

4 changes: 4 additions & 0 deletions opensciencegrid/ospool-ccb/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# ospool-ccb

Container image for the OSPool CCB (Condor Connection Broker) instances.

6 changes: 6 additions & 0 deletions opensciencegrid/ospool-ccb/condor_master_wrapper
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash

# Wrapper that supervisord launches in place of condor_master: it streams
# the MasterLog to this process's stdout in the background and then
# replaces itself with condor_master running in the foreground.

# tail -F follows the file by name and keeps retrying, so it is fine that
# the log may not exist yet; tail's own error output is discarded.
tail -F "$(condor_config_val LOG)/MasterLog" 2>/dev/null &

exec /usr/sbin/condor_master -f

25 changes: 25 additions & 0 deletions opensciencegrid/ospool-ccb/healthy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash

# Container health check.
#
# Exit codes:
#   0 - healthy
#   2 - supervisord reports a program (other than container_cleanup)
#       that is not RUNNING
#   4 - more than 3 zombie (Z) processes
#   5 - more than 15 uninterruptible (D) processes

#######################################
# Count processes whose ps state starts with the given letter.
# Arguments: $1 - state letter, e.g. Z or D
# Outputs:   the count on stdout (0 when there are none)
#######################################
count_procs_in_state() {
  # "stat=" prints the state column without a header line
  ps axo stat= | awk -v state="$1" 'index($1, state) == 1 { n++ } END { print n + 0 }'
}

failures=$(supervisorctl status | grep -Ev 'container_cleanup|RUNNING')
if [[ -n "$failures" ]]; then
  # Collapse the multi-line status output into one space-separated line
  # without letting the shell glob-expand any of its words.
  read -r -d '' -a fields <<<"$failures"
  failures="${fields[*]}"
  echo "supervisord non-RUNNING service: $failures" >&2
  exit 2
fi

procs_z=$(count_procs_in_state Z)
if [[ "$procs_z" -gt 3 ]]; then
  echo "Found $procs_z zombie (Z) processes" >&2
  exit 4
fi

procs_d=$(count_procs_in_state D)
if [[ "$procs_d" -gt 15 ]]; then
  echo "Found $procs_d uninterruptible (D) processes" >&2
  exit 5
fi

exit 0

19 changes: 19 additions & 0 deletions opensciencegrid/ospool-ccb/opt/ospool/retain-logs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash

# For performance reasons, the pods are set up to log to local
# disk. This script is run from cron, and keeps copies of the
# logs for a certain number of days.

# number of save-YYYYMMDD directories to retain
readonly KEEP=6

TARGET_DIR="save-$(date +'%Y%m%d')"

mkdir -p /state/htcondor/logs
cd /state/htcondor/logs || exit 1

# A failure here is deliberately not fatal: the pruning below still runs
# and may be what frees up the space needed for the next attempt.
mkdir -p "$TARGET_DIR"
rsync -a /var/log/condor/. "$TARGET_DIR/."

# Only keep the newest $KEEP sets of saved logs. The glob expands in
# name order, which for save-YYYYMMDD is oldest first; this avoids
# parsing ls output and does not depend on directory mtimes, which
# rsync -a overwrites with the source directory's mtime.
shopt -s nullglob
saved=(save-*)
excess=$(( ${#saved[@]} - KEEP ))
if (( excess > 0 )); then
  for OLD in "${saved[@]:0:excess}"; do
    rm -rf -- "$OLD"
  done
fi

87 changes: 87 additions & 0 deletions opensciencegrid/ospool-ccb/opt/ospool/update-prios
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#!/usr/bin/python3

# This script is designed to adjust user priorities in the OSPool based on
# the current state of user workloads. The HTCondor priority settings are
# modified according to attributes such as the number of held jobs and the
# ratio of goodput to badput.

import os
import re
import sys

import htcondor2 as htcondor
import classad2 as classad
from htcondor2 import AdTypes


# Lower and upper bounds that main() clamps each computed priority
# factor to before rounding it.
MIN_PRIORITY_FACTOR = 500
MAX_PRIORITY_FACTOR = 20000
# Amount added to a submitter's priority factor for every held job.
HELD_JOB_PENALTY_MULTIPLIER = 5


def ad_int(ad, key, default=0):
    """Return ad[key] as an int, or `default` when that is not possible.

    Args:
        ad: mapping-like ad supporting `in` and `[]` lookups.
        key: attribute name to read.
        default: value returned when the attribute is missing, cannot be
            converted, or is not integral (e.g. 2.5 or a string).

    Returns:
        The attribute as an int when it compares equal to its int
        conversion, otherwise `default`.
    """
    if key not in ad:
        return default
    try:
        value = ad[key]
        as_int = int(value)
        if as_int == value:
            return as_int
    except Exception:
        # Best effort: any lookup/conversion problem yields the default.
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit still propagate.
        return default
    return default


def step(n):
    """Round n to the nearest multiple of 100 and return it as an int.

    Uses Python's built-in round(), so an exact half (e.g. 1050) goes to
    the even hundred (1000) rather than always rounding up.
    """
    hundreds = round(n / 100)
    return int(hundreds * 100)


def main():
    """Recompute and apply the priority factor of every submitter.

    Looks up the pool's main negotiator in the collector, reads its
    current priorities, derives a new factor for each submitter ad and
    calls setFactor() only when the new factor differs from the current
    one. Exits with status 1 when no main negotiator is found.
    """
    col = htcondor.Collector()

    # find the right negotiator
    neg_ad = None
    for ad in col.query(AdTypes.Negotiator):
        # the main negotiator of the pool starts with cm-1. or cm-2.
        # (raw string: "\." is an invalid escape in a normal literal)
        if re.search(r"^cm-[12]\.", ad["Name"]):
            neg_ad = ad
    if not neg_ad:
        print("Unable to find the main negotiator")
        sys.exit(1)
    neg = htcondor.Negotiator(neg_ad)
    print(f"Updating negotiator {neg_ad['Name']}")

    # get the current prio ads so we can determine if we need
    # an update or not; index them by name once instead of rescanning
    # the whole list for every submitter (a later ad with the same
    # name wins, as before)
    prio_by_name = {}
    for prio_ad in neg.getPriorities():
        if "Name" in prio_ad:
            prio_by_name[prio_ad["Name"]] = prio_ad

    for ad in col.query(AdTypes.Submitter):
        name = ad["Name"]

        # current factor (1000 when the negotiator has no entry yet)
        current_factor = 1000
        if name in prio_by_name:
            current_factor = round(prio_by_name[name]["PriorityFactor"])

        factor = 1000

        # held jobs
        held = ad_int(ad, "HeldJobs", 0) * HELD_JOB_PENALTY_MULTIPLIER
        factor += held

        # make sure we finish jobs on ap2X before migration
        if ".uc.osg-htc.org" in name:
            factor -= 1000

        # upper/lower limits on the adjustments
        factor = min(factor, MAX_PRIORITY_FACTOR)
        factor = max(factor, MIN_PRIORITY_FACTOR)
        factor = step(factor)

        if factor != current_factor:
            print(f" {name} {current_factor} -> {factor}")
            neg.setFactor(name, factor)


if __name__ == "__main__":
    main()

133 changes: 133 additions & 0 deletions opensciencegrid/ospool-ccb/ospool-ccb-config.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
#!/bin/bash

# this script is invoked by the OSG container init setup


#######################################
# Report a fatal problem and terminate the script.
# Arguments: $1 - reason, appended to the log message
# Outputs:   the message on stderr
# Returns:   never returns; exits with status 1 after a 180 second pause
#######################################
kill_container() {
  # diagnostics belong on stderr (the former "2>&1" on echo was a no-op)
  echo "Restarting container: $1" >&2
  # NOTE(review): presumably the pause is there to avoid a tight
  # restart loop - confirm before changing the delay
  sleep 180
  exit 1
}

# create an env file so we can have the cron jobs run with the
# same environment as the init.d run
if [[ -e /etc/ospool.env ]]; then
  # shellcheck source=/dev/null
  . /etc/ospool.env
else
  # %q shell-escapes the values, so the file stays safe to source even
  # when a value is empty or contains whitespace or shell metacharacters
  {
    printf 'export OSPOOL_ENVIRONMENT=%q\n' "$OSPOOL_ENVIRONMENT"
    printf 'export OSPOOL_CCB_HOSTNAME=%q\n' "$OSPOOL_CCB_HOSTNAME"
  } >/etc/ospool.env
fi

# git repo - fetch / check for updates
# Sets NEEDS_UPDATE=0 only when an existing checkout is known to be in
# sync with its upstream; a fresh clone always needs the config applied.
NEEDS_UPDATE=1
cd /opt || kill_container "Unable to cd to /opt"
if [[ ! -e osg-flock ]]; then
  git clone https://github.com/opensciencegrid/osg-flock.git \
    || kill_container "Unable to pull Git repo"
else
  # only continue with the config if changes are found
  cd osg-flock || kill_container "Unable to cd to /opt/osg-flock"
  if ! git fetch --quiet; then
    # the comparison below then runs against the last fetched state
    echo "Warning: git fetch failed; comparing against stale upstream refs" >&2
  fi
  # Count commits that exist on the upstream branch but NOT on local branch
  CHANGES_COUNT=$(git rev-list --count 'HEAD..@{u}')
  # string comparison: an empty count (rev-list failed) means "update"
  if [[ "$CHANGES_COUNT" == 0 ]]; then
    echo "The osg-flock checkout is in sync with git repo. Nothing to do."
    NEEDS_UPDATE=0
  fi
  git pull \
    || echo "Warning: git pull failed; continuing with the existing checkout" >&2
fi

# Apply the configuration: runs only when the git section above decided
# that an update is needed.
if [[ "$NEEDS_UPDATE" -eq 1 ]]; then

  # fix ownership/permissions on mounted directories
  chown -R condor:condor /var/log/condor
  chown -R condor:condor /var/lib/condor/spool

  # most config comes from the shared github repo below, but here is
  # what is specific to this (ccb) pod
  cat >/etc/condor/config.d/10-ospool-ccb.config <<EOF

DAEMON_LIST = MASTER, SHARED_PORT, COLLECTOR

# FULL_HOSTNAME seems to be causing issues with HTCondor 23
#CONDOR_HOST = \$(FULL_HOSTNAME)
CONDOR_HOST = 127.0.0.1

HOST_ALIAS = $OSPOOL_CCB_HOSTNAME
TCP_FORWARDING_HOST = $OSPOOL_CCB_HOSTNAME

UPDATE_COLLECTOR_WITH_TCP = True

USE_SHARED_PORT = True
SHARED_PORT_MAX_WORKERS = 1000
SHARED_PORT_PORT = 9618

# Setup 10 child collectors
use feature:ChildCollector(1)
use feature:ChildCollector(2)
use feature:ChildCollector(3)
use feature:ChildCollector(4)
use feature:ChildCollector(5)
use feature:ChildCollector(6)
use feature:ChildCollector(7)
use feature:ChildCollector(8)
use feature:ChildCollector(9)
use feature:ChildCollector(10)

# no forwarding here - these are only used for CCB
TOP_COLLECTOR_HOST =

# limit logging
COLLECTOR1.MAX_COLLECTOR_LOG = \$(MAX_DEFAULT_LOG)
COLLECTOR2.MAX_COLLECTOR_LOG = \$(MAX_DEFAULT_LOG)
COLLECTOR3.MAX_COLLECTOR_LOG = \$(MAX_DEFAULT_LOG)
COLLECTOR4.MAX_COLLECTOR_LOG = \$(MAX_DEFAULT_LOG)
COLLECTOR5.MAX_COLLECTOR_LOG = \$(MAX_DEFAULT_LOG)
COLLECTOR6.MAX_COLLECTOR_LOG = \$(MAX_DEFAULT_LOG)
COLLECTOR7.MAX_COLLECTOR_LOG = \$(MAX_DEFAULT_LOG)
COLLECTOR8.MAX_COLLECTOR_LOG = \$(MAX_DEFAULT_LOG)
COLLECTOR9.MAX_COLLECTOR_LOG = \$(MAX_DEFAULT_LOG)
COLLECTOR10.MAX_COLLECTOR_LOG = \$(MAX_DEFAULT_LOG)

EOF

  echo "Installing HTCondor credentials..."
  # Loop over absolute paths rather than "cd" + bare "*": if a credential
  # directory is missing, nothing from an unrelated working directory can
  # end up installed, and an empty directory is simply skipped.
  for FILE in /etc/ospool-creds/idkeys.d/*; do
    [[ -f "$FILE" ]] || continue
    install -o root -g root -m 0600 "$FILE" "/etc/condor/passwords.d/${FILE##*/}"
  done
  for FILE in /etc/ospool-creds/idtokens.d/*; do
    [[ -f "$FILE" ]] || continue
    install -o root -g root -m 0600 "$FILE" "/etc/condor/tokens.d/${FILE##*/}"
  done
  # the gwms frontend generates tokens with kid=FRONTEND - for now make
  # sure we have a copy of our flock.opensciencegrid.org password in the
  # correct location
  install -o root -g root -m 0600 \
    /etc/condor/passwords.d/flock.opensciencegrid.org \
    /etc/condor/passwords.d/FRONTEND
  # SSL auth - the main hostcert comes from k8s certmanager
  install -o root -g root -m 0644 \
    /etc/ospool-creds/tls.d/tls.crt /etc/pki/tls/certs/localhost.crt
  install -o root -g root -m 0600 \
    /etc/ospool-creds/tls.d/tls.key /etc/pki/tls/private/localhost.key

  # condor config
  rm -f /etc/condor/config.d/*ospoolgit*
  cp /opt/osg-flock/ospool.osg-htc.org/"$OSPOOL_ENVIRONMENT"/htcondor-config.d/* /etc/condor/config.d/
  rm -f /etc/condor/config.d/90_high_availability.config
  rm -f /etc/condor/config.d/95_negotiator_osgflockgit.config

  # Stop here if the checkout is not where we expect it: running the
  # redirects below without fe-admin would truncate the existing files.
  cd /opt/osg-flock/ospool.osg-htc.org \
    || kill_container "Unable to cd to /opt/osg-flock/ospool.osg-htc.org"
  echo "Writing out new /etc/condor/certs/condor_mapfile ..."
  mkdir -p /etc/condor/certs
  ./fe-admin --target-env "$OSPOOL_ENVIRONMENT" --htcondor-mapfile >/etc/condor/certs/condor_mapfile
  echo "Writing out new /etc/condor/config.d/95_flocking_ospoolgit.config ..."
  ./fe-admin --target-env "$OSPOOL_ENVIRONMENT" --htcondor-config >/etc/condor/config.d/95_flocking_ospoolgit.config

  # this will fail during initial configuration, but work once the pool is up
  /usr/sbin/condor_reconfig || true

fi

9 changes: 9 additions & 0 deletions opensciencegrid/ospool-ccb/ospool-ccb.cron
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@

# configuration changes
# Every 5 minutes, as root: sleep a random 0-239 seconds, then re-run the
# image-init config script. Its stdout and stderr overwrite
# /var/log/ospool-ccb-config.log on each run. The "%" is written as "\%"
# because cron gives an unescaped "%" a special meaning in the command field.
*/5 * * * * root /bin/bash -c 'sleep $(( RANDOM \% 240 )); /etc/osg/image-init.d/60-ospool-ccb-config.sh' >/var/log/ospool-ccb-config.log 2>&1


# logs
# At minute 0 of every hour, as root: snapshot the HTCondor logs with
# retain-logs; all of its output is discarded.
0 * * * * root /opt/ospool/retain-logs >/dev/null 2>&1


22 changes: 22 additions & 0 deletions opensciencegrid/ospool-ccb/supervisord.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
; Main supervisord configuration for the image.
[supervisord]
; stay in the foreground instead of daemonizing
nodaemon=true
; discard supervisord's own activity log; a max size of 0 disables rotation
logfile=/dev/null
logfile_maxbytes=0

[unix_http_server]
file=/tmp/supervisor.sock ; (the path to the socket file)

[rpcinterface:supervisor]
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface

[supervisorctl]
serverurl=unix:///tmp/supervisor.sock ; use a unix:// URL for a unix socket
; NOTE(review): loglevel is documented as a [supervisord] option; confirm
; whether it has any effect in the [supervisorctl] section
loglevel=debug

; pull in the per-program definitions shipped in /etc/supervisord.d
[include]
files=/etc/supervisord.d/*.conf

; run cron under supervisord (-n: crond does not daemonize) and restart it
; whenever it exits
[program:crond]
command=/usr/sbin/crond -n
autorestart=true

5 changes: 4 additions & 1 deletion opensciencegrid/ospool-cm/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Specify the opensciencegrid/software-base image tag
ARG BASE_OSG_SERIES=23
ARG BASE_OSG_SERIES=25
ARG BASE_YUM_REPO=release
ARG BASE_OS=el9

Expand Down Expand Up @@ -47,6 +47,9 @@ COPY supervisord.conf /etc/supervisord.conf
COPY 10-htcondor.conf /etc/supervisord.d/
COPY 30-prometheus.conf /etc/supervisord.d/

COPY ospool-cm-config.sh /etc/osg/image-init.d/60-ospool-cm-config.sh
RUN chmod 755 /etc/osg/image-init.d/60-ospool-cm-config.sh

COPY ospool.cron /etc/cron.d/ospool

ADD opt/ospool /opt/ospool
Expand Down
Loading
Loading