Skip to content

Commit 77dd0ac

Browse files
committed
check for file existance
1 parent 2377272 commit 77dd0ac

4 files changed

Lines changed: 158 additions & 2 deletions

File tree

Dockerfile.dev

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# Ultimately to be consistent with the Red Hat Open AI (RHOAI) environment
2+
# (and attendant HW) that is available to us in the NERC MOC we base our
3+
# images on the RHOAI provided images. The src for these are maintained
4+
# here:
5+
# https://github.com/red-hat-data-services/notebooks
6+
# the images are published here:
7+
# https://quay.io/organization/modh
8+
#
9+
10+
# The image we have chosen is
11+
# quay.io/modh/cuda-notebooks:cuda-jupyter-minimal-ubi9-python-3.11-20250808
12+
13+
# NERC maintains a derived base image that adds various packages we need for
14+
# our development environment see distro packages below
15+
# src: https://github.com/nerc-images/csw-base-cuda-minimal
16+
# img: quay.io/nerc-images/csw-base-cuda-minimal:latest
17+
FROM quay.io/nerc-images/csw-base-cuda-minimal:latest
18+
19+
USER root
20+
21+
# CUDA
22+
# the following repo is already available from the base so we don't re add but here
23+
# for documentation purposes
24+
# as per https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#network-repo-installation-for-fedora
25+
# we do this so that additional repositories are available even non-NVIDIA ones
26+
27+
# add nvidia nsight-systems cli repo; the rhel major version and arch in the repo url are computed at build time (an "aarch" arch prefix is rewritten to "arm") : https://docs.nvidia.com/nsight-systems/InstallationGuide/index.html
28+
RUN rpm --import https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub && \
29+
dnf config-manager --add-repo "https://developer.download.nvidia.com/devtools/repos/rhel$(source /etc/os-release; echo ${VERSION_ID%%.*})/$(rpm --eval '%{_arch}' | sed s/aarch/arm/)/"
30+
31+
# install the nsight-systems cli (cmake for the nvidia samples already comes from the base image); dnf metadata and cache are removed in the same layer
32+
RUN dnf install -y \
33+
nsight-systems-cli && \
34+
dnf clean all \
35+
&& rm -rf /var/cache/dnf /var/cache/yum
36+
37+
# DISTRO PACKAGES
38+
# nerc minimal has all the base distro packages we need, see
39+
# https://github.com/nerc-images/csw-base-cuda-minimal/blob/main/Containerfile
40+
# cmake, emacs, bc, jq, texlive, man-pages
41+
42+
# PYTHON PACKAGES
43+
# python stuff for ope compatibility (--no-cache-dir keeps pip's download cache out of the image layer)
44+
RUN pip install --no-cache-dir \
45+
nbstripout \
46+
jupyter-book \
47+
ghp-import \
48+
jupytext \
49+
jupyter_nbextensions_configurator \
50+
jupyter_contrib_nbextensions \
51+
jupyterlab-spellchecker \
52+
nbconvert \
53+
pyppeteer \
54+
jupyterlab-myst \
55+
nbgitpuller \
56+
jupyterlab_rise \
57+
pyright \
58+
python-lsp-server
59+
60+
# bash kernel to make it easier to write bash-based Jupyter notebooks (installed without keeping pip's cache, then registered as a kernel)
61+
RUN pip install --no-cache-dir bash_kernel && python -m bash_kernel.install
62+
63+
# as per the nbstripout readme we set up nbstripout system-wide so that it is always used for all repos (including those of the jovyan user)
64+
RUN nbstripout --install --system
65+
66+
# OPE TOOLS: clone the OPEFFORT tools repo into /home/ope, run its install.sh, then run fix-permissions on that directory
67+
RUN mkdir /home/ope && \
68+
cd /home/ope && \
69+
git clone https://github.com/OPEFFORT/tools.git . && \
70+
./install.sh && \
71+
fix-permissions /home/ope
72+
73+
# CUDA SUDA WUDA TOOLS: clone batchtools under /home/csw and symlink every entry matching b* into /usr/local/bin (paths are quoted so odd file names cannot split or glob)
74+
RUN mkdir /home/csw && \
75+
cd /home/csw && \
76+
git clone https://github.com/jappavoo/batchtools.git && \
77+
for f in /home/csw/batchtools/b*; do \
78+
ln -s "$f" "/usr/local/bin/$(basename "$f")"; \
79+
done
80+
81+
# copy the build context to /tools and install it from there in editable mode (no pip cache kept in the layer)
COPY . /tools
82+
RUN pwd && ls -la && ls -la /tools
83+
WORKDIR /tools
84+
RUN pip install --no-cache-dir -e .
85+
86+
# Cats and dogs python packages (installed without keeping pip's download cache in the layer)
87+
RUN pip install --no-cache-dir \
88+
numpy \
89+
matplotlib \
90+
pandas
91+
92+
USER 1001

batchtools/br.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from .helpers import oc_delete
1818
from .helpers import fmt
1919
from .file_setup import prepare_context
20+
from pathlib import Path
2021
from .prom_metrics import (
2122
PROMETHEUS_INSTANCE,
2223
IN_PROGRESS,
@@ -229,8 +230,27 @@ def run(args: argparse.Namespace):
229230
sys.exit(f"Error occurred while creating job: {e}")
230231

231232
if args.job_delete and args.wait:
232-
subprocess.run(["cat", f"jobs/{job_name}/{job_name}.log"], check=True)
233-
print(f"RUNDIR: jobs/{job_name}")
233+
job_dir = Path("jobs") / job_name
234+
log_file = job_dir / f"{job_name}.log"
235+
236+
if result_phase == "succeeded":
237+
# Wait for the log file to appear (rsync may take a moment)
238+
max_wait = 10 # seconds
239+
wait_interval = 0.5
240+
elapsed = 0
241+
242+
while not log_file.exists() and elapsed < max_wait:
243+
time.sleep(wait_interval)
244+
elapsed += wait_interval
245+
246+
if log_file.exists():
247+
subprocess.run(["cat", f"jobs/{job_name}/{job_name}.log"], check=True)
248+
print(f"RUNDIR: jobs/{job_name}")
249+
else:
250+
print(f"Warning: Log file not found after {max_wait}s. Check jobs/{job_name}/")
251+
else:
252+
print("Something went wrong with running your job. Check over your code and please try again.")
253+
234254
oc_delete("job", job_name)
235255
else:
236256
print(

batchtools/prom_metrics.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,3 +158,4 @@ def push_registry_text(grouping_key: dict[str, str] | None = None) -> None:
158158
except Exception as e:
159159
pass
160160
# print(f"PROM: failed to push metrics to pushgateway {PUSHGATEWAY_ADDR}: {e}")
161+

push.sh

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
#!/usr/bin/env bash
2+
set -euo pipefail  # stop on command errors, unset variables and failed pipeline stages
3+
4+
export ORG="${ORG:-memalhot}"  # ORG, IMAGE_NAME and TAG keep any value already set in the environment
5+
export IMAGE_NAME="${IMAGE_NAME:-csw-dev-test}"
6+
export TAG="${TAG:-latest}"
7+
8+
IMAGE="quay.io/${ORG}/${IMAGE_NAME}:${TAG}"  # full image reference used by every subcommand below
9+
10+
# Jupyter defaults: ports published by the "run" subcommand as HOST_PORT:CONTAINER_PORT
11+
CONTAINER_PORT="${CONTAINER_PORT:-8888}"
12+
HOST_PORT="${HOST_PORT:-8888}"
13+
14+
cmd="${1:-}"  # subcommand; empty when the script is called without arguments
15+
16+
case "$cmd" in
17+
build|"")  # no argument behaves like "build"
18+
docker build -f Dockerfile.dev -t "$IMAGE" .
19+
;;
20+
push)
21+
docker push "$IMAGE"
22+
;;
23+
buildpush)  # build, then push the freshly built image
24+
docker build -f Dockerfile.dev -t "$IMAGE" .
25+
docker push "$IMAGE"
26+
;;
27+
run)  # interactive container, removed on exit, with the port mapping above
28+
docker run --rm -it \
29+
-p "${HOST_PORT}:${CONTAINER_PORT}" \
30+
"$IMAGE"
31+
;;
32+
run-hostnet)  # same as run, but on the host network instead of a port mapping
33+
docker run --rm -it --network host "$IMAGE"
34+
;;
35+
*)  # unknown subcommand: print usage and exit with status 2
36+
echo "Usage: $0 {build|push|buildpush|run|run-hostnet}"
37+
echo "Env: ORG IMAGE_NAME TAG HOST_PORT CONTAINER_PORT"
38+
exit 2
39+
;;
40+
esac
41+
42+
43+
#docker run --rm -it -p 8888:8888 a2969a46eebd

0 commit comments

Comments
 (0)