From 02e65d7c2eb922a0417f82721cbfb29de9f12a99 Mon Sep 17 00:00:00 2001 From: Jose Santos Date: Wed, 25 Feb 2026 14:16:37 -0600 Subject: [PATCH 1/6] Add GitHub Actions runner scripts and documentation - Introduced multiple scripts for managing GitHub Actions runners within an Apptainer container, including build, run, and cleanup functionalities. - Added comprehensive documentation (README.md, AGENTS.md, skills.md) detailing setup, usage, and environment configuration. - Included example environment files for both runner and container configurations to guide users in setting up their environments securely and effectively. This update establishes a robust framework for running GitHub Actions in HPC environments, enhancing usability and maintainability. --- .github/scripts/github-runner-files/AGENTS.md | 24 ++ .github/scripts/github-runner-files/README.md | 214 ++++++++++++++++++ .../build-github-coding-agent-runner.sh | 92 ++++++++ .../cleanup-old-runners.sh | 83 +++++++ .../scripts/github-runner-files/env.example | 28 +++ .../run-github-coding-agent-runner.sh | 198 ++++++++++++++++ .../runner-container.env.example | 55 +++++ .github/scripts/github-runner-files/skills.md | 31 +++ .github/scripts/github-runner-files/start.sh | 169 ++++++++++++++ 9 files changed, 894 insertions(+) create mode 100644 .github/scripts/github-runner-files/AGENTS.md create mode 100644 .github/scripts/github-runner-files/README.md create mode 100755 .github/scripts/github-runner-files/build-github-coding-agent-runner.sh create mode 100755 .github/scripts/github-runner-files/cleanup-old-runners.sh create mode 100644 .github/scripts/github-runner-files/env.example create mode 100755 .github/scripts/github-runner-files/run-github-coding-agent-runner.sh create mode 100644 .github/scripts/github-runner-files/runner-container.env.example create mode 100644 .github/scripts/github-runner-files/skills.md create mode 100755 .github/scripts/github-runner-files/start.sh diff --git 
a/.github/scripts/github-runner-files/AGENTS.md b/.github/scripts/github-runner-files/AGENTS.md new file mode 100644 index 000000000..2c91c0a4f --- /dev/null +++ b/.github/scripts/github-runner-files/AGENTS.md @@ -0,0 +1,24 @@ +# Agent instructions + +## Workflow + +Flow: **run-github-coding-agent-runner.sh** → **container** → **start.sh** → **Actions listener**. + +- **Standalone:** run `./run-github-coding-agent-runner.sh` with required flags (`--github-token`, `--github-repository`, `--script-dir`, `--runner-base`). No env needed. +- **SLURM:** set `GITHUB_TOKEN` and `GITHUB_REPOSITORY`, then `sbatch run-github-coding-agent-runner.sh`. When the script runs under SLURM with no arguments, it uses env and SLURM defaults (`SLURM_SUBMIT_DIR`, `WORK`) for script-dir and runner-base. start.sh installs/configures the runner in `RUNNER_HOME` if needed and starts the Actions listener; workflow jobs run in the container. + +## Conventions + +When editing scripts or config in this project: + +1. **Never add sensitive data to scripts or committed files.** + Do not hardcode tokens, passwords, API keys, or other secrets. Use environment variables or a secure mechanism outside the repo (e.g. `export GITHUB_TOKEN` before running). + +2. **Never use host-specific absolute paths.** + Do not add paths like `/work1/amd/josantos/...` or other machine-specific directories. Prefer: + - Paths relative to the script (e.g. `SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"` then `cd "${SCRIPT_DIR}"`). + - Environment variables (e.g. `$WORK`, `$HOME`) when a base directory is needed. + - Relative paths from the project or script location. + +3. **Never edit the container definition file (e.g. `iris.def`) unless explicitly asked.** + Prefer changing scripts (e.g. `start.sh`, `run-github-coding-agent-runner.sh`) to install, configure, or run things at runtime. Only modify `.def` files when the user explicitly requests it. 
diff --git a/.github/scripts/github-runner-files/README.md b/.github/scripts/github-runner-files/README.md new file mode 100644 index 000000000..8850fe52d --- /dev/null +++ b/.github/scripts/github-runner-files/README.md @@ -0,0 +1,214 @@ +# Iris + GitHub Actions Self-Hosted Runner (Apptainer) + +This setup runs a GitHub Actions self-hosted runner in an Apptainer container with the Iris framework (ROCm, Triton) and the `copilot` label, for HPC environments where Docker is not available. + +## Prerequisites + +- Apptainer/Singularity installed +- GitHub Personal Access Token with `repo` scope +- Access to the repository where you want to register the runner +- SLURM (for job scheduling) +- Optional: ROCm/AMD GPU partition for GPU workflows + +## Quick Start + +### 1. Create GitHub Personal Access Token + +1. Go to https://github.com/settings/tokens/new +2. Name: e.g. `GitHub Actions Runner` +3. Scopes: Select `repo` (Full control of private repositories) +4. Click "Generate token" and save it securely + +### 2. Prepare token and paths + +You will pass the GitHub token and repository as flags (see step 4). Do not commit tokens. + +### 3. Build the Container + +From this directory: + +```bash +sbatch build-github-coding-agent-runner.sh +``` + +This builds `github-copilot-coding-agent-runner.sif` from `iris.def` by default. To use another definition file: `./build-github-coding-agent-runner.sh --def=my.def` or set `DEF_FILE=my.def` before `sbatch`. The job uses partition `mi3001x` and may take a while. See **skills.md** for full build instructions. + +### 4. Run the Runner + +After the build completes, from the repo directory (where `run-github-coding-agent-runner.sh` and the `.sif` live). 
You can run in two ways: + +**Option A — Standalone with flags (required when not using SLURM):** + +```bash +./run-github-coding-agent-runner.sh \ + --github-token='YOUR_GITHUB_TOKEN' \ + --github-repository='owner/repo' \ + --script-dir="$(pwd)" \ + --runner-base="$(pwd)/runner-data" +``` + +**Option B — Via SLURM with environment variables (when `sbatch run-github-coding-agent-runner.sh` is used, the script uses env and SLURM defaults for any value not passed as a flag):** + +```bash +export GITHUB_TOKEN='YOUR_GITHUB_TOKEN' +export GITHUB_REPOSITORY='owner/repo' +sbatch run-github-coding-agent-runner.sh +``` + +With Option B, `SCRIPT_DIR` defaults to `SLURM_SUBMIT_DIR` (or the script’s directory), and `RUNNER_BASE` defaults to `$WORK/github-runner-data` if `WORK` is set, otherwise `$SCRIPT_DIR/github-runner-data`. You can override with `export SCRIPT_DIR=... RUNNER_BASE=...` if needed. + +Copy-paste and replace: +- `YOUR_GITHUB_TOKEN` — your GitHub Personal Access Token +- `owner/repo` — your repository (e.g. `Jose/Iris`) +- `runner-data` (Option A) — directory for runner state and work (created if missing); use any path you prefer. + +Optional flags (Option A) or env vars (Option B) (examples): + +```bash + --cluster-name='vultr-k8' \ # or export CLUSTER_NAME=... + --runner-labels='copilot,rocm' \ + --use-overlay=1 +``` + +### 5. Verify Runner Registration + +1. Go to your repository on GitHub +2. Navigate to: Settings → Actions → Runners +3. You should see your runner listed with the `copilot` label + +## Using the Runner in Workflows + +In your `.github/workflows/*.yml` files, use the runner via the `copilot` label (or whatever you passed to `--runner-labels`). Ensure the workflow’s `runs-on` matches: e.g. `runs-on: copilot` or `runs-on: [self-hosted, copilot]`. If a workflow uses a different label (e.g. `apptainer`), either register the runner with that label too or change the workflow to `copilot`. 
+ +```yaml +name: Example Workflow +on: [push] + +jobs: + build: + runs-on: copilot + steps: + - uses: actions/checkout@v4 + - name: Run a test + run: echo "Running on Iris + copilot runner in HPC!" +``` + +## Workflow + +End-to-end flow when you run the runner via SLURM: + +1. **One-time setup** + Create a GitHub PAT with `repo` scope. From this directory, run `sbatch build-github-coding-agent-runner.sh` to build `github-copilot-coding-agent-runner.sif` from `iris.def` (Iris + ROCm; the runner is not in the image). + +2. **Run the runner** + Either pass required flags to `run-github-coding-agent-runner.sh` (standalone) or set `GITHUB_TOKEN` and `GITHUB_REPOSITORY` and run `sbatch run-github-coding-agent-runner.sh` (SLURM-only env fallback; see step 4). The script runs Apptainer with overlay and bind mounts and executes `/bin/bash -c "/runner-scripts/start.sh"`. So: **run-github-coding-agent-runner.sh** → **container** → **start.sh**. + +3. **Inside the container: start.sh** + It receives `GITHUB_TOKEN`, `GITHUB_REPOSITORY`, `RUNNER_HOME`, `RUNNER_NAME`, `RUNNER_LABELS`, and `RUNNER_WORKDIR` from the run script (via `--env`). It checks required vars, sets defaults for any unset, and uses `RUNNER_HOME` (e.g. `/runner-home`). If the runner is not installed in `RUNNER_HOME`, it installs it (from `/opt/actions-runner` or by download). It fetches a registration token from GitHub, runs `config.sh`, then starts the Actions runner listener (`./run.sh`). The runner listens for jobs; when a workflow uses the `copilot` (or your) label, GitHub sends a job and the runner runs the steps in the container. + +4. **End-to-end** + You run **run-github-coding-agent-runner.sh** with `--github-token`, `--github-repository`, `--script-dir`, and `--runner-base` (and optionally `--sif`). **run-github-coding-agent-runner.sh** starts the container, binds the script dir and runner dirs, passes env to the container, and runs **start.sh**. 
**start.sh** installs/configures the runner if needed and starts the listener. So: **run-github-coding-agent-runner.sh** → **container** → **start.sh** (install/configure + listener) → **runner runs workflow jobs**. + +## Management Commands + +```bash +# Build container +sbatch build-github-coding-agent-runner.sh + +# Run standalone (required flags) +./run-github-coding-agent-runner.sh --github-token='...' --github-repository='owner/repo' --script-dir="$(pwd)" --runner-base="$(pwd)/runner-data" + +# Run via SLURM with env (set GITHUB_TOKEN and GITHUB_REPOSITORY; SCRIPT_DIR/RUNNER_BASE default from SLURM) +export GITHUB_TOKEN=... GITHUB_REPOSITORY=owner/repo +sbatch run-github-coding-agent-runner.sh + +# Check SLURM job status +squeue -u $USER + +# View SLURM job logs +tail -f github-coding-agent-runner-*.out + +# Cancel SLURM job (replace JOB_ID with the ID shown by squeue) +scancel JOB_ID +``` + +## Customization + +### Runner Name and Labels + +Defaults are set in `run-github-coding-agent-runner.sh` (e.g. runner name: `repo-runner-cluster-YYYYMMDD-HHMMSS`; default label: `copilot`). Override with flags: + +```bash +./run-github-coding-agent-runner.sh ... --runner-name='my-runner' --runner-labels='copilot,slurm,apptainer,hpc,iris,rocm,mi300x' +``` + +### SLURM Parameters + +Edit `run-github-coding-agent-runner.sh` SBATCH directives as needed: + +- `#SBATCH --time=8:00:00` +- `#SBATCH -p mi3008x` # partition +- `#SBATCH --nodes=1` + +GPU access is enabled via `--rocm` in the container run. + +### Kubernetes / no overlay + +Overlays are not used in Kubernetes (default `USE_OVERLAY=0` in pods). The script uses **bind mounts only** for writable space: + +- **RUNNER_HOME** (runner config) and **RUNNER_WORKDIR** (job work) are bind-mounted from the host/pod. +- Optional: set **RUNNER_TMP** to a writable directory (e.g. a pod `emptyDir` mounted in the container) and the script will bind it to `/tmp` inside the container so tools (e.g. Triton cache) can write there. 
+ +Example in a pod spec: mount an `emptyDir` at `/runner-tmp` and set `RUNNER_TMP=/runner-tmp` in the container env so `/tmp` is writable without an overlay. + +## Troubleshooting + +### Runner not appearing in GitHub + +1. Check logs: `tail -f github-coding-agent-runner-*.out` and `github-coding-agent-runner-*.err` +2. Verify the token (`--github-token` or `GITHUB_TOKEN`) has `repo` scope +3. Verify `--github-repository` format is `owner/repo` +4. Check token has not expired + +### Build failures + +- Build runs on partition `mi3001x` with fakeroot. See **skills.md** for details. +- Cache and temp dirs are under the project directory (`.apptainer-cache`, `.apptainer-tmp`). Ensure enough disk space. + +### Container not found when running + +If the container image is missing (default: `script-dir/github-copilot-coding-agent-runner.sif`), `run-github-coding-agent-runner.sh` will print a message. Run the build and wait for it to complete, or pass `--sif=/path/to/image.sif`. + +### Runner offline + +```bash +squeue -u $USER +tail -50 github-coding-agent-runner-*.err +scancel JOB_ID +# Resubmit: either same flags (standalone) or same env then sbatch run-github-coding-agent-runner.sh +``` + +## Security + +- **Tokens**: Never commit tokens. Use `--github-token=TOKEN` when running standalone, or set `GITHUB_TOKEN` when using `sbatch run-github-coding-agent-runner.sh`; do not put secrets in committed files. +- **Paths**: Do not hardcode host-specific paths in scripts. See **AGENTS.md** for project conventions. +- **Container**: Apptainer runs as your user; the container is read-only with a per-job writable overlay. 
+
+## File Structure
+
+```
+github-runner/
+├── iris.def                                 # Apptainer definition (Iris + ROCm)
+├── build-github-coding-agent-runner.sh      # SLURM build job (--def=FILE for definition file)
+├── run-github-coding-agent-runner.sh        # Run job (flags or sbatch + env)
+├── start.sh                                 # Runner startup (inside container; also used as K8s entrypoint)
+├── runner-container.env.example             # Example env file for container (start.sh sources it)
+├── AGENTS.md                                # Agent instructions (no secrets, relative paths)
+├── skills.md                                # Build instructions
+├── README.md                                # This file
+└── github-copilot-coding-agent-runner.sif   # Built image (after build)
+```
+
+## License
+
+MIT License.
diff --git a/.github/scripts/github-runner-files/build-github-coding-agent-runner.sh b/.github/scripts/github-runner-files/build-github-coding-agent-runner.sh
new file mode 100755
index 000000000..25dd97f23
--- /dev/null
+++ b/.github/scripts/github-runner-files/build-github-coding-agent-runner.sh
@@ -0,0 +1,92 @@
+#!/bin/bash
+
+# SLURM job script to build GitHub Coding Agent Runner container
+
+#SBATCH --job-name=build-github-coding-agent-runner
+#SBATCH --output=build-github-coding-agent-runner-%j.out
+#SBATCH --error=build-github-coding-agent-runner-%j.err
+#SBATCH --time=2:00:00
+#SBATCH --nodes=1
+#SBATCH -p mi3001x
+
+set -e
+
+# Parse flags (definition file, optional output). "--opt VALUE" forms abort with a message if VALUE is missing.
+# Usage: ./build-github-coding-agent-runner.sh [--def=FILE] [--output=SIF]
+#    or: sbatch build-github-coding-agent-runner.sh (uses DEF_FILE / OUTPUT_SIF env or the defaults below)
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    --def=*) DEF_FILE="${1#*=}"; shift ;;
+    --def) DEF_FILE="${2:?--def requires a value}"; shift 2 ;;
+    --definition=*) DEF_FILE="${1#*=}"; shift ;;
+    --definition) DEF_FILE="${2:?--definition requires a value}"; shift 2 ;;
+    -d) DEF_FILE="${2:?-d requires a value}"; shift 2 ;;
+    --output=*) OUTPUT_SIF="${1#*=}"; shift ;;
+    --output) OUTPUT_SIF="${2:?--output requires a value}"; shift 2 ;;
+    -o) OUTPUT_SIF="${2:?-o requires a value}"; shift 2 ;;
+    -h|--help)
+      echo "Usage: $0 [OPTIONS]"
+      echo "Options:"
+      echo "  --def=FILE, --definition=FILE, -d FILE   Apptainer definition file (default: iris.def)"
+      echo "  --output=FILE, -o FILE                   Output .sif file (default: github-copilot-coding-agent-runner.sif)"
+      exit 0
+      ;;
+    *) break ;;
+  esac
+done
+
+# Defaults: a flag wins, then the DEF_FILE / OUTPUT_SIF environment variables, then these built-in names
+DEF_FILE="${DEF_FILE:-iris.def}"
+OUTPUT_SIF="${OUTPUT_SIF:-github-copilot-coding-agent-runner.sif}"
+
+echo "=========================================="
+echo "GitHub Coding Agent Runner Container Build"
+echo "=========================================="
+echo "Job ID: $SLURM_JOB_ID"
+echo "Node: $SLURM_NODELIST"
+echo "Start: $(date)"
+echo "=========================================="
+
+# Build in the sbatch submit directory under SLURM, otherwise in this script's own directory
+if [ -n "${SLURM_SUBMIT_DIR}" ]; then
+  BUILD_DIR="${SLURM_SUBMIT_DIR}"
+else
+  BUILD_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+fi
+cd "${BUILD_DIR}"
+echo "Build directory: ${BUILD_DIR}"
+
+# Relative definition/output paths are taken relative to the build directory
+[[ "$DEF_FILE" = /* ]] || DEF_FILE="${BUILD_DIR}/${DEF_FILE}"
+[[ "$OUTPUT_SIF" = /* ]] || OUTPUT_SIF="${BUILD_DIR}/${OUTPUT_SIF}"
+
+if [ ! -f "$DEF_FILE" ]; then
+  echo "Error: definition file not found: $DEF_FILE" >&2
+  exit 1
+fi
+
+# Temp and cache under build dir (avoids /tmp filling up)
+export APPTAINER_TMPDIR="${BUILD_DIR}/.apptainer-tmp"
+export APPTAINER_CACHEDIR="${BUILD_DIR}/.apptainer-cache"
+mkdir -p "$APPTAINER_TMPDIR" "$APPTAINER_CACHEDIR"
+
+echo ""
+echo "=========================================="
+echo "Building container image..."
+echo "Definition file: $DEF_FILE"
+echo "Output file: $OUTPUT_SIF"
+echo "=========================================="
+
+apptainer build --force --fakeroot "$OUTPUT_SIF" "$DEF_FILE"
+
+# Clean build temp to free space (cache is kept for faster rebuilds; remove .apptainer-cache to reclaim that too).
+rm -rf "$APPTAINER_TMPDIR"
+echo "Cleaned temporary directory: $APPTAINER_TMPDIR"
+
+echo ""
+echo "=========================================="
+echo "Build completed"
+echo "=========================================="
+
+echo ""
+echo "Finished: $(date)"
diff --git a/.github/scripts/github-runner-files/cleanup-old-runners.sh b/.github/scripts/github-runner-files/cleanup-old-runners.sh
new file mode 100755
index 000000000..ba079e122
--- /dev/null
+++ b/.github/scripts/github-runner-files/cleanup-old-runners.sh
@@ -0,0 +1,98 @@
+#!/bin/bash
+
+# Cleanup script for old GitHub runner configurations and overlays.
+#
+# Removes per-job runner directories (.github-runner-JOBID) and overlay images
+# (overlays/overlay-JOBID.img) under RUNNER_BASE once squeue no longer lists the job.
+# RUNNER_BASE may be set in the environment; otherwise it is
+# $WORK/github-runner-data, or github-runner-data next to this script.
+#
+# NOTE(review): run-github-coding-agent-runner.sh writes to RUNNER_BASE/.github-runner
+# and RUNNER_BASE/overlay (no job ID), which the globs below do not match - confirm
+# which layout this script is meant to clean.
+
+set -e
+
+# No host-specific fallback path: use this script's directory when WORK is unset.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+WORK_DIR="${WORK:-${SCRIPT_DIR}}"
+RUNNER_BASE="${RUNNER_BASE:-${WORK_DIR}/github-runner-data}"
+OVERLAY_DIR="${RUNNER_BASE}/overlays"
+
+# Without squeue every job would look finished and live runner data would be removed.
+if ! command -v squeue >/dev/null 2>&1; then
+  echo "Error: squeue not found; cannot tell which jobs are still running" >&2
+  exit 1
+fi
+
+echo "=========================================="
+echo "GitHub Runner Cleanup Script"
+echo "=========================================="
+echo "Cleaning up directories in: ${RUNNER_BASE}"
+echo ""
+
+# is_job_running JOB_ID
+# Returns 0 while squeue still lists JOB_ID. Also returns 0 when squeue fails for any
+# reason other than an unknown job ID, so a transient scheduler error never leads to
+# deleting a live runner's data. Returns 1 otherwise.
+is_job_running() {
+  local job_id=$1
+  local listing
+  if listing=$(squeue -h -j "$job_id" -o '%i' 2>&1); then
+    [ -n "$listing" ]
+  else
+    [[ "$listing" != *"Invalid job id"* ]]
+  fi
+}
+
+# Cleanup old runner config directories
+echo "Cleaning up old runner configurations..."
+for runner_dir in "${RUNNER_BASE}"/.github-runner-*; do
+  # An unmatched glob stays literal; the -d test skips it
+  [ -d "$runner_dir" ] || continue
+  job_id="${runner_dir##*/.github-runner-}"
+  if [[ ! "$job_id" =~ ^[0-9]+$ ]]; then
+    echo "  Skipping $runner_dir (not a job-specific directory)"
+  elif is_job_running "$job_id"; then
+    echo "  Skipping $runner_dir (job $job_id is still running)"
+  else
+    echo "  Removing $runner_dir (job $job_id is not running)"
+    rm -rf -- "$runner_dir"
+  fi
+done
+
+# Cleanup old overlay images
+echo "Cleaning up old overlay images..."
+for overlay_file in "${OVERLAY_DIR}"/overlay-*.img; do
+  [ -f "$overlay_file" ] || continue
+  job_id="${overlay_file##*/overlay-}"
+  job_id="${job_id%.img}"
+  if [[ ! "$job_id" =~ ^[0-9]+$ ]]; then
+    echo "  Skipping $overlay_file (not a job-specific overlay)"
+  elif is_job_running "$job_id"; then
+    echo "  Skipping $overlay_file (job $job_id is still running)"
+  else
+    size=$(du -h "$overlay_file" | cut -f1)
+    echo "  Removing $overlay_file (job $job_id is not running, size: $size)"
+    rm -f -- "$overlay_file"
+  fi
+done
+
+echo "=========================================="
+echo "Cleanup complete!"
+echo "=========================================="
+
+# Show remaining files (-d lists the directories themselves, not their contents)
+echo "Remaining runner configurations:"
+ls -ldh "${RUNNER_BASE}"/.github-runner-* 2>/dev/null || echo "  None"
+
+echo ""
+echo "Remaining overlay images:"
+ls -lh "${OVERLAY_DIR}"/overlay-*.img 2>/dev/null || echo "  None"
+
+# Show disk usage
+echo ""
+echo "Disk usage:"
+echo "  Runner data directory: $(du -sh "${RUNNER_BASE}" 2>/dev/null | cut -f1)"
+echo "  Overlays directory: $(du -sh "${OVERLAY_DIR}" 2>/dev/null | cut -f1)"
+
diff --git a/.github/scripts/github-runner-files/env.example b/.github/scripts/github-runner-files/env.example
new file mode 100644
index 000000000..76de21881
--- /dev/null
+++ b/.github/scripts/github-runner-files/env.example
@@ -0,0 +1,28 @@
+# GitHub Actions Runner Environment Configuration
+#
+# SECURITY WARNING: This file contains sensitive information!
+#
+# Recommended setup:
+# 1. Copy this file: cp env.example ~/.github-runner-env
+# 2. Edit with your values: nano ~/.github-runner-env
+# 3. Protect the file: chmod 600 ~/.github-runner-env
+# 4.
Source before running: source ~/.github-runner-env +# +# For SLURM jobs, source it before `sbatch run-github-coding-agent-runner.sh`; the +# script reads these variables from the environment (it does not source this file itself) + +# REQUIRED: GitHub Personal Access Token with 'repo' scope +# Create at: https://github.com/settings/tokens/new +export GITHUB_TOKEN='' + +# REQUIRED: Repository in format owner/repo +export GITHUB_REPOSITORY='' + +# OPTIONAL: Customize runner name (run script default: repo-runner-cluster-YYYYMMDD-HHMMSS) +export RUNNER_NAME='' + +# OPTIONAL: Comma-separated labels (defaults to 'copilot') +export RUNNER_LABELS='copilot' + +# OPTIONAL: Work directory (read by start.sh only; the run script always uses RUNNER_BASE/_work) +export RUNNER_WORKDIR="" diff --git a/.github/scripts/github-runner-files/run-github-coding-agent-runner.sh b/.github/scripts/github-runner-files/run-github-coding-agent-runner.sh new file mode 100755 index 000000000..291b4d39c --- /dev/null +++ b/.github/scripts/github-runner-files/run-github-coding-agent-runner.sh @@ -0,0 +1,198 @@ +#!/bin/bash + +# SLURM job script to run GitHub Coding Agent Runner (Iris + Apptainer) + +#SBATCH --job-name=github-coding-agent-runner +#SBATCH --output=github-coding-agent-runner-%j.out +#SBATCH --error=github-coding-agent-runner-%j.err +#SBATCH --time=8:00:00 +#SBATCH --nodes=1 +#SBATCH -p mi3008x # MI300X partition + +# Adjust the above SLURM parameters as needed for your system +# +# Two ways to run: +# 1) Standalone with flags (required): +# ./run-github-coding-agent-runner.sh --github-token='...' --github-repository='owner/repo' --script-dir="$(pwd)" --runner-base="$(pwd)/runner-data" +# 2) Via sbatch with env (SLURM-only fallback): set GITHUB_TOKEN, GITHUB_REPOSITORY; SCRIPT_DIR/RUNNER_BASE default from SLURM_SUBMIT_DIR and WORK +# export GITHUB_TOKEN=... GITHUB_REPOSITORY=owner/repo +# sbatch run-github-coding-agent-runner.sh + +set -e + +# Parse input flags first. When running under SLURM with no args, env and SLURM defaults are used for any unset value. 
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    --github-token=*) GITHUB_TOKEN="${1#*=}"; shift ;;
+    --github-token) GITHUB_TOKEN="${2:?--github-token requires a value}"; shift 2 ;;  # :? aborts with a message instead of a silent 'shift 2' failure
+    --github-repository=*) GITHUB_REPOSITORY="${1#*=}"; shift ;;
+    --github-repository) GITHUB_REPOSITORY="${2:?--github-repository requires a value}"; shift 2 ;;
+    --runner-name=*) RUNNER_NAME="${1#*=}"; shift ;;
+    --runner-name) RUNNER_NAME="${2:?--runner-name requires a value}"; shift 2 ;;
+    --cluster-name=*) CLUSTER_NAME="${1#*=}"; shift ;;
+    --cluster-name) CLUSTER_NAME="${2:?--cluster-name requires a value}"; shift 2 ;;
+    --runner-labels=*) RUNNER_LABELS="${1#*=}"; shift ;;
+    --runner-labels) RUNNER_LABELS="${2:?--runner-labels requires a value}"; shift 2 ;;
+    --script-dir=*) SCRIPT_DIR="${1#*=}"; shift ;;
+    --script-dir) SCRIPT_DIR="${2:?--script-dir requires a value}"; shift 2 ;;
+    --runner-base=*) RUNNER_BASE="${1#*=}"; shift ;;
+    --runner-base) RUNNER_BASE="${2:?--runner-base requires a value}"; shift 2 ;;
+    --sif=*) SIF_PATH="${1#*=}"; shift ;;
+    --sif) SIF_PATH="${2:?--sif requires a value}"; shift 2 ;;
+    --runner-tmp=*) RUNNER_TMP="${1#*=}"; shift ;;
+    --runner-tmp) RUNNER_TMP="${2:?--runner-tmp requires a value}"; shift 2 ;;
+    --use-overlay=*) USE_OVERLAY="${1#*=}"; shift ;;
+    --use-overlay) USE_OVERLAY="${2:?--use-overlay requires a value}"; shift 2 ;;
+    -h|--help)
+      echo "Usage: $0 [OPTIONS]"
+      echo "Options (--option=value or --option value):"
+      echo "  --github-token=TOKEN            GitHub token (required)"
+      echo "  --github-repository=OWNER/REPO  e.g. Jose/Iris (required)"
+      echo "  --script-dir=DIR                Directory with container and scripts (required)"
+      echo "  --runner-base=DIR               Runner data base directory (required)"
+      echo "  --sif=PATH                      Path to .sif container (default: script-dir/github-copilot-coding-agent-runner.sif)"
+      echo "  --runner-name=NAME              Runner name (default: repo-runner-cluster-YYYYMMDD-HHMMSS)"
+      echo "  --cluster-name=NAME             Cluster name for default runner name (default: hostname)"
+      echo "  --runner-labels=LABELS          Comma-separated labels (default: copilot)"
+      echo "  --runner-tmp=DIR                Bind DIR to /tmp in container (e.g. Triton cache)"
+      echo "  --use-overlay=0|1               Use overlay (1) or bind mounts only (0)"
+      exit 0
+      ;;
+    *) echo "Warning: ignoring unrecognized arguments: $*" >&2; break ;;  # parsing stops at the first unknown argument
+  esac
+done
+
+# SLURM-only env fallback: when running under sbatch with no args, use env and SLURM defaults
+if [ -n "${SLURM_JOB_ID}" ]; then
+  if [ -z "$SCRIPT_DIR" ]; then
+    SCRIPT_DIR="${SLURM_SUBMIT_DIR:-}"
+    [ -n "$SCRIPT_DIR" ] || SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+  fi
+  if [ -z "$RUNNER_BASE" ]; then
+    RUNNER_BASE="${WORK:-${SCRIPT_DIR}}/github-runner-data"
+  fi
+  # Overlay is on by default under SLURM. The other optional settings
+  # (SIF_PATH, RUNNER_NAME, RUNNER_LABELS, RUNNER_TMP) are taken from the
+  # environment unchanged; SIF_PATH, RUNNER_NAME and RUNNER_LABELS get their
+  # defaults further down, and an empty RUNNER_TMP means "no /tmp bind".
+  USE_OVERLAY="${USE_OVERLAY:-1}"
+fi
+
+# Required: pass as flags when standalone, or set env when using sbatch
+[ -n "$GITHUB_TOKEN" ] || { echo "Error: pass --github-token=TOKEN or set GITHUB_TOKEN (when using sbatch)" >&2; exit 1; }
+[ -n "$GITHUB_REPOSITORY" ] || { echo "Error: pass --github-repository=owner/repo or set GITHUB_REPOSITORY (when using sbatch)" >&2; exit 1; }
+[ -n "$SCRIPT_DIR" ] || { echo "Error: pass --script-dir=DIR or set SCRIPT_DIR (when using sbatch)" >&2; exit 1; }
+[ -d "$SCRIPT_DIR" ] || { echo "Error: SCRIPT_DIR must be an existing directory" >&2; exit 1; }
+[ -n "$RUNNER_BASE" ] || { echo "Error: pass --runner-base=DIR or set RUNNER_BASE (when using sbatch)" >&2; exit 1; }
+
+# SIF path: default under script-dir if not passed; relative paths under script-dir
+SIF_PATH="${SIF_PATH:-${SCRIPT_DIR}/github-copilot-coding-agent-runner.sif}"
+[[ "$SIF_PATH" = /* ]] || SIF_PATH="${SCRIPT_DIR}/${SIF_PATH}"
+
+# Subdirectories of runner base only (no env or separate flags)
+RUNNER_WORKDIR="${RUNNER_BASE}/_work"
+OVERLAY_DIR="${RUNNER_BASE}/overlay"
+
+# Default runner name: repo-runner-clustername-YYYYMMDD-HHMMSS (e.g. iris-runner-vultr-k8-20260214-025830)
+if [ -z "$RUNNER_NAME" ]; then
+  REPO_NAME="${GITHUB_REPOSITORY##*/}"
+  REPO_NAME="$(echo "$REPO_NAME" | tr '[:upper:]' '[:lower:]')"
+  [ -z "$CLUSTER_NAME" ] && CLUSTER_NAME="$(hostname 2>/dev/null || echo local)"
+  RUNNER_NAME="${REPO_NAME}-runner-${CLUSTER_NAME}-$(date +%Y%m%d-%H%M%S)"  # one date call so date and time cannot straddle a second boundary
+fi
+RUNNER_LABELS="${RUNNER_LABELS:-copilot}"
+mkdir -p "${RUNNER_WORKDIR}"
+[ -n "${USE_OVERLAY}" ] && [ "${USE_OVERLAY}" != "0" ] && mkdir -p "${OVERLAY_DIR}"
+
+echo "=========================================="
+echo "GitHub Coding Agent Runner - SLURM Job"
+echo "Job ID: $SLURM_JOB_ID"
+echo "Node: $SLURM_NODELIST"
+echo "=========================================="
+echo "Repository: $GITHUB_REPOSITORY"
+echo "Runner Name: $RUNNER_NAME"
+echo "Labels: $RUNNER_LABELS"
+echo "Script/container directory: $SCRIPT_DIR"
+echo "Runner base: $RUNNER_BASE"
+echo "Container SIF: $SIF_PATH"
+echo "Overlay directory: $OVERLAY_DIR"
+echo "Work directory: $RUNNER_WORKDIR"
+echo "TMP bind: ${RUNNER_TMP:-}"
+echo "Overlay: ${USE_OVERLAY:-0} (use USE_OVERLAY=1 to enable in non-SLURM)"
+echo "=========================================="
+
+# Change to the directory containing the container
+cd "${SCRIPT_DIR}"
+
+# Container must exist (build first with: sbatch build-github-coding-agent-runner.sh)
+if [ ! -f "$SIF_PATH" ]; then
+  echo "Error: container not found: $SIF_PATH" >&2
+  echo "Build first: cd ${SCRIPT_DIR} && sbatch build-github-coding-agent-runner.sh" >&2
+  exit 1
+fi
+
+# Writable runner install dir (start.sh installs runner here if missing)
+RUNNER_HOME_HOST="${RUNNER_BASE}/.github-runner"
+mkdir -p "${RUNNER_HOME_HOST}"
+# When running as root (e.g.
in a K8s pod), chown so start.sh can re-exec as nobody and still write
+if [ "$(id -u)" = "0" ]; then
+  chown -R 65534:65534 "${RUNNER_HOME_HOST}" "${RUNNER_WORKDIR}" 2>/dev/null || true
+fi
+
+# Show GPU info
+echo "GPU Information:"
+rocm-smi --showproductname || echo "Warning: Could not get GPU info"
+echo "=========================================="
+
+# Run github-copilot-coding-agent-runner.sif: mount start.sh and writable dirs.
+# RUNNER_HOME=/runner-home so start.sh installs/runs the runner there (no HOME override).
+#
+# Options (overlay not available in Kubernetes):
+# - USE_OVERLAY=1 (SLURM): use --overlay for a writable layer (needs overlayfs).
+# - USE_OVERLAY=0 (default in K8s/pods): no overlay; only bind mounts. Writable paths:
+#   RUNNER_HOME_HOST (runner config), RUNNER_WORKDIR (job work), and optionally
+#   RUNNER_TMP (bind to /tmp) if set. RUNNER_TMP_BIND is an array so "--bind" and its value stay separate words.
+RUNNER_TMP_BIND=()
+if [ -n "${RUNNER_TMP:-}" ] && [ -d "${RUNNER_TMP}" ]; then
+  RUNNER_TMP_BIND=(--bind "${RUNNER_TMP}:/tmp:rw")
+fi
+
+if [ -n "${USE_OVERLAY}" ] && [ "${USE_OVERLAY}" != "0" ] && [ -d "${OVERLAY_DIR}" ]; then
+  apptainer exec \
+    --no-home \
+    --overlay "${OVERLAY_DIR}" \
+    --bind "${SCRIPT_DIR}:/runner-scripts:ro" \
+    --bind "${RUNNER_HOME_HOST}:/runner-home:rw" \
+    --bind "${RUNNER_WORKDIR}:${RUNNER_WORKDIR}" \
+    --env "RUNNER_HOME=/runner-home" \
+    --env "GITHUB_TOKEN=${GITHUB_TOKEN}" \
+    --env "GITHUB_REPOSITORY=${GITHUB_REPOSITORY}" \
+    --env "RUNNER_NAME=${RUNNER_NAME}" \
+    --env "RUNNER_LABELS=${RUNNER_LABELS}" \
+    --env "RUNNER_WORKDIR=${RUNNER_WORKDIR}" \
+    --rocm \
+    "$SIF_PATH" \
+    /bin/bash -c "/runner-scripts/start.sh"
+else
+  # No overlay (Kubernetes or USE_OVERLAY=0): bind mounts only
+  # Optional: set RUNNER_TMP to a writable dir (e.g. pod emptyDir) to bind /tmp for Triton/cache; expands to nothing when unset
+  apptainer exec \
+    --no-home \
+    --bind "${SCRIPT_DIR}:/runner-scripts:ro" \
+    --bind "${RUNNER_HOME_HOST}:/runner-home:rw" \
+    --bind "${RUNNER_WORKDIR}:${RUNNER_WORKDIR}" \
+    "${RUNNER_TMP_BIND[@]}" \
+    --env "RUNNER_HOME=/runner-home" \
+    --env "GITHUB_TOKEN=${GITHUB_TOKEN}" \
+    --env "GITHUB_REPOSITORY=${GITHUB_REPOSITORY}" \
+    --env "RUNNER_NAME=${RUNNER_NAME}" \
+    --env "RUNNER_LABELS=${RUNNER_LABELS}" \
+    --env "RUNNER_WORKDIR=${RUNNER_WORKDIR}" \
+    --rocm \
+    "$SIF_PATH" \
+    /bin/bash -c "/runner-scripts/start.sh"
+fi
+
+echo "=========================================="
+echo "GitHub Coding Agent Runner stopped"
+echo "=========================================="
diff --git a/.github/scripts/github-runner-files/runner-container.env.example b/.github/scripts/github-runner-files/runner-container.env.example
new file mode 100644
index 000000000..169f44b15
--- /dev/null
+++ b/.github/scripts/github-runner-files/runner-container.env.example
@@ -0,0 +1,55 @@
+# Environment for the GitHub Actions runner inside the container.
+# start.sh sources this (from RUNNER_ENV_FILE, or runner-container.env in
+# RUNNER_HOME or next to start.sh). Use it to set paths and options for
+# your specific host, device, and container—no hardcoded paths in start.sh.
+#
+# Usage:
+#   cp runner-container.env.example runner-container.env
+#   # Edit runner-container.env for this host/container, then run the runner.
+#   # Or: export RUNNER_ENV_FILE=/path/to/your.env
+#
+# Variables set here are inherited by workflow steps (actions run in the
+# same env). RUNNER_WORKDIR and RUNNER_HOME are already set by start.sh
+# before this file is sourced, so you can use them below.
+ +# ----------------------------------------------------------------------------- +# Runner behavior +# ----------------------------------------------------------------------------- + +# Allow runner to run as root (often needed for GPU access in containers) +export RUNNER_ALLOW_RUNASROOT=1 + +# Triton kernel cache (must be writable). Default in start.sh is RUNNER_WORKDIR/.triton_cache +# export TRITON_CACHE_DIR="${RUNNER_WORKDIR}/.triton_cache" + +# Git config when running as root (start.sh sets a default; override if needed) +# export GIT_CONFIG_GLOBAL="${RUNNER_WORKDIR}/.gitconfig" + +# ----------------------------------------------------------------------------- +# Paths: set for YOUR container/host. Examples below—uncomment and adjust. +# ----------------------------------------------------------------------------- + +# --- Example A: Iris-style image (ROCm + venv + Triton under /opt) --- +# export ROCM_PATH="/opt/rocm" +# export PATH="/opt/rocm/bin:/opt/conda/envs/py_3.10/bin:${PATH}" +# export LD_LIBRARY_PATH="/opt/rocm/lib:${LD_LIBRARY_PATH:-}" +# export PYTHONPATH="/opt/venv/lib/python3.13/site-packages:/opt/triton/python:${PYTHONPATH:-}" + +# --- Example B: Conda-only (no ROCm) --- +# export PATH="/opt/conda/bin:${PATH}" +# export PYTHONPATH="/opt/conda/lib/python3.10/site-packages:${PYTHONPATH:-}" + +# --- Example C: System Python + ROCm in /usr --- +# export ROCM_PATH="/usr" +# export PATH="/usr/lib/rocm/bin:${PATH}" +# export LD_LIBRARY_PATH="/usr/lib/rocm/lib:${LD_LIBRARY_PATH:-}" + +# --- Example D: Custom locations (set your own) --- +# export ROCM_PATH="${ROCM_PATH:-/path/to/rocm}" +# export PATH="/path/to/python/bin:/path/to/rocm/bin:${PATH}" +# export LD_LIBRARY_PATH="/path/to/rocm/lib:${LD_LIBRARY_PATH:-}" +# export PYTHONPATH="/path/to/site-packages:${PYTHONPATH:-}" + +# If your image already sets PATH/PYTHONPATH/ROCM_PATH in its Dockerfile or +# definition file, you can leave this file empty or only set runner options +# 
(RUNNER_ALLOW_RUNASROOT, TRITON_CACHE_DIR, GIT_CONFIG_GLOBAL). diff --git a/.github/scripts/github-runner-files/skills.md b/.github/scripts/github-runner-files/skills.md new file mode 100644 index 000000000..bc9f078f7 --- /dev/null +++ b/.github/scripts/github-runner-files/skills.md @@ -0,0 +1,31 @@ +# Build instructions + +## Container build (SLURM) + +From the `github-runner` directory: + +```bash +sbatch build-github-coding-agent-runner.sh +``` + +- **Partition:** `mi3001x` +- **Time limit:** 2 hours +- **Input:** definition file, default `iris.def` (override with `--def=FILE` or env `DEF_FILE`) +- **Output:** default `github-copilot-coding-agent-runner.sif` (override with `--output=FILE` or env `OUTPUT_SIF`) + +The job uses `SLURM_SUBMIT_DIR` when set, so submit from the repo directory (e.g. `cd /path/to/github-runner && sbatch build-github-coding-agent-runner.sh`) so the build runs in the right place. + +Temp and cache are under the build directory (`.apptainer-tmp`, `.apptainer-cache`) to avoid filling `/tmp`. The temp dir is removed after a successful build; the cache is kept for faster rebuilds. To reclaim space, remove `.apptainer-cache` as well. + +## After build + +**Option 1 — Run via SLURM with env (SLURM-only fallback):** set `GITHUB_TOKEN` and `GITHUB_REPOSITORY`, then submit the script. The script uses `SLURM_SUBMIT_DIR` and `WORK` (when set) for script-dir and runner-base. + +```bash +export GITHUB_TOKEN=... GITHUB_REPOSITORY=owner/repo +sbatch run-github-coding-agent-runner.sh +``` + +**Option 2 — Run standalone with flags:** pass all required options on the command line (see README). + +See **README.md** for full setup and usage. 
diff --git a/.github/scripts/github-runner-files/start.sh b/.github/scripts/github-runner-files/start.sh
new file mode 100755
index 000000000..f7045fd2d
--- /dev/null
+++ b/.github/scripts/github-runner-files/start.sh
@@ -0,0 +1,169 @@
+#!/bin/bash
+
+# GitHub Actions Runner startup script for Apptainer (SLURM, standalone, or Kubernetes)
+#
+# Usage: env only. Required: GITHUB_TOKEN, GITHUB_REPOSITORY, RUNNER_HOME
+# Optional: RUNNER_NAME, RUNNER_LABELS, RUNNER_WORKDIR, RUNNER_ENV_FILE, etc.
+# See runner-container.env.example and README for details.
+
+set -e
+
+# Required: set when launching the runner (e.g. by run-github-coding-agent-runner.sh or pod spec)
+[ -n "$GITHUB_TOKEN" ] || { echo "Error: GITHUB_TOKEN is required"; exit 1; }
+[ -n "$GITHUB_REPOSITORY" ] || { echo "Error: GITHUB_REPOSITORY is required (owner/repo)"; exit 1; }
+[ -n "${RUNNER_HOME:-}" ] || { echo "Error: RUNNER_HOME is required"; exit 1; }
+
+# Default values (set early so env file can use RUNNER_WORKDIR / RUNNER_HOME)
+RUNNER_NAME="${RUNNER_NAME:-$(hostname)-$(date +%s)}"
+RUNNER_LABELS="${RUNNER_LABELS:-copilot}"
+RUNNER_WORKDIR="${RUNNER_WORKDIR:-$(dirname "${RUNNER_HOME}")/_work}"
+
+# Source the first env file found: RUNNER_ENV_FILE, then RUNNER_HOME, then next to this script
+if [ -n "${RUNNER_ENV_FILE:-}" ] && [ -f "${RUNNER_ENV_FILE}" ]; then
+    echo "Sourcing env file: ${RUNNER_ENV_FILE}"
+    set -a
+    # shellcheck source=/dev/null
+    . "${RUNNER_ENV_FILE}"
+    set +a
+elif [ -f "${RUNNER_HOME}/runner-container.env" ]; then
+    echo "Sourcing env file: ${RUNNER_HOME}/runner-container.env"
+    set -a
+    # shellcheck source=/dev/null
+    . "${RUNNER_HOME}/runner-container.env"
+    set +a
+else
+    RUNNER_SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" 2>/dev/null && pwd)"
+    if [ -n "${RUNNER_SCRIPT_DIR:-}" ] && [ -f "${RUNNER_SCRIPT_DIR}/runner-container.env" ]; then
+        echo "Sourcing env file: ${RUNNER_SCRIPT_DIR}/runner-container.env"
+        set -a
+        # shellcheck source=/dev/null
+        . "${RUNNER_SCRIPT_DIR}/runner-container.env"
+        set +a
+    fi
+fi
+
+# Runner-only defaults (use RUNNER_WORKDIR; no host-specific paths here).
+# PATH, PYTHONPATH, ROCM_PATH, LD_LIBRARY_PATH, etc. come from the container
+# image or from runner-container.env (see runner-container.env.example).
+# Copy and edit that file per host/container so workflows see the right tools.
+export RUNNER_ALLOW_RUNASROOT="${RUNNER_ALLOW_RUNASROOT:-1}"
+export TRITON_CACHE_DIR="${TRITON_CACHE_DIR:-${RUNNER_WORKDIR}/.triton_cache}"
+
+mkdir -p "${RUNNER_HOME}"
+
+echo "=========================================="
+echo "GitHub Actions Runner - Apptainer Edition"
+echo "=========================================="
+echo "Repository: $GITHUB_REPOSITORY"
+echo "Runner Name: $RUNNER_NAME"
+echo "Labels: $RUNNER_LABELS"
+echo "Work Directory: $RUNNER_WORKDIR"
+echo "Runner Home: $RUNNER_HOME"
+echo "=========================================="
+
+# Install runner binaries if missing (curl -f: fail on HTTP errors instead of untarring an error page)
+if [ ! -f "${RUNNER_HOME}/run.sh" ]; then
+    echo "Setting up runner in ${RUNNER_HOME}..."
+    if [ -d /opt/actions-runner ] && [ -f /opt/actions-runner/run.sh ]; then
+        cp -r /opt/actions-runner/* "${RUNNER_HOME}/"
+        chmod +x "${RUNNER_HOME}"/*.sh
+    else
+        RUNNER_VERSION="${RUNNER_VERSION:-2.313.0}"
+        echo "Downloading Actions runner v${RUNNER_VERSION}..."
+        (cd "${RUNNER_HOME}" && curl -fsSL -o runner.tgz \
+            "https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz" \
+            && tar xzf runner.tgz && rm -f runner.tgz)
+        chmod +x "${RUNNER_HOME}"/*.sh 2>/dev/null || true
+    fi
+fi
+
+# Change to writable runner directory
+cd "${RUNNER_HOME}"
+
+# Create work directory if it doesn't exist
+mkdir -p "$RUNNER_WORKDIR"
+
+# Get registration token; curl/jq failures leave it empty so the guidance below is printed instead of a silent set -e exit
+echo "Getting registration token..."
+REGISTRATION_RESPONSE=$(curl -sS -X POST \
+    -H "Authorization: token $GITHUB_TOKEN" \
+    -H "Accept: application/vnd.github.v3+json" \
+    "https://api.github.com/repos/$GITHUB_REPOSITORY/actions/runners/registration-token") || REGISTRATION_RESPONSE=""
+if command -v jq >/dev/null 2>&1; then
+    REGISTRATION_TOKEN=$(echo "$REGISTRATION_RESPONSE" | jq -r .token) || REGISTRATION_TOKEN=""
+else
+    REGISTRATION_TOKEN=$(echo "$REGISTRATION_RESPONSE" | grep -o '"token"[[:space:]]*:[[:space:]]*"[^"]*"' | head -1 | cut -d'"' -f4)
+fi
+
+if [ "$REGISTRATION_TOKEN" == "null" ] || [ -z "$REGISTRATION_TOKEN" ]; then
+    echo "Error: Failed to get registration token."
+    echo "Please check:"
+    echo " 1. GITHUB_TOKEN has 'repo' scope"
+    echo " 2. Token has not expired"
+    echo " 3. GITHUB_REPOSITORY format is correct (owner/repo)"
+    exit 1
+fi
+
+echo "Registration token obtained successfully"
+
+# Check if already configured (cleanup any previous config)
+if [ -f ".runner" ]; then
+    echo "Found existing runner configuration, removing..."
+    ./config.sh remove --token "$REGISTRATION_TOKEN" || true
+fi
+
+# Configure the runner
+echo "Configuring runner..."
+./config.sh \
+    --url "https://github.com/$GITHUB_REPOSITORY" \
+    --token "$REGISTRATION_TOKEN" \
+    --name "$RUNNER_NAME" \
+    --labels "$RUNNER_LABELS" \
+    --work "$RUNNER_WORKDIR" \
+    --unattended \
+    --replace
+
+# Cleanup function
+cleanup() {
+    # Only run removal once; skip if config already removed
+    if [ ! -f "${RUNNER_HOME}/.runner" ]; then
+        echo "Runner config already removed or not configured. Skipping cleanup."
+        return 0
+    fi
+
+    echo ""
+    echo "Shutting down... Removing runner from GitHub..."
+    # A failed API call or unparsable reply leaves REMOVE_TOKEN empty and falls through to the warning below
+    REMOVE_RESPONSE=$(curl -sS -X POST \
+        -H "Authorization: token $GITHUB_TOKEN" \
+        -H "Accept: application/vnd.github.v3+json" \
+        "https://api.github.com/repos/$GITHUB_REPOSITORY/actions/runners/remove-token") || REMOVE_RESPONSE=""
+    if command -v jq >/dev/null 2>&1; then
+        REMOVE_TOKEN=$(echo "$REMOVE_RESPONSE" | jq -r .token) || REMOVE_TOKEN=""
+    else
+        REMOVE_TOKEN=$(echo "$REMOVE_RESPONSE" | grep -o '"token"[[:space:]]*:[[:space:]]*"[^"]*"' | head -1 | cut -d'"' -f4)
+    fi
+
+    if [ "$REMOVE_TOKEN" != "null" ] && [ -n "$REMOVE_TOKEN" ]; then
+        ./config.sh remove --token "$REMOVE_TOKEN"
+        echo "Runner removed successfully"
+    else
+        echo "Warning: Could not remove runner automatically"
+    fi
+}
+
+# Set trap to cleanup on exit
+trap cleanup EXIT INT TERM
+
+# Fix git safe directory issues (common when running as root in containers)
+# Point git config to a writable location (can be overridden by env file)
+export GIT_CONFIG_GLOBAL="${GIT_CONFIG_GLOBAL:-${RUNNER_WORKDIR}/.gitconfig}"
+mkdir -p "$(dirname "$GIT_CONFIG_GLOBAL")"
+git config --global --add safe.directory '*'
+
+# Start the runner
+echo "Starting GitHub Actions Runner..."
+echo "Press Ctrl+C to stop" +echo "==========================================" +command -v rocminfo >/dev/null 2>&1 && rocminfo || true +./run.sh From aa5a3a137abddc239be8d01eb970d49ae6b3e29b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 25 Feb 2026 20:18:35 +0000 Subject: [PATCH 2/6] Apply Ruff auto-fixes --- examples/common/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/common/utils.py b/examples/common/utils.py index 0e6ea9482..f9ebba8d7 100644 --- a/examples/common/utils.py +++ b/examples/common/utils.py @@ -86,7 +86,7 @@ def reset(self): self.comm_end_timestamp.fill_(self.min_ts) def to_json(self, filename, gpu_freq): - cycles_to_us = lambda cycles: (cycles / gpu_freq) + cycles_to_us = lambda cycles: cycles / gpu_freq gemm_begin_us = cycles_to_us(self.mm_begin_timestamp.cpu().numpy()) gemm_end_us = cycles_to_us(self.mm_end_timestamp.cpu().numpy()) From 73e2829c11783b50a7f0d9786bb92b2a33b5fbf4 Mon Sep 17 00:00:00 2001 From: Jose Santos Date: Wed, 25 Feb 2026 14:45:57 -0600 Subject: [PATCH 3/6] Add GitHub Actions workflow for Copilot setup - Introduced a new workflow file to automate the setup of a Python virtual environment for Copilot. - The workflow includes steps for checking out the repository, creating and activating a virtual environment, installing dependencies, and verifying ROCm and GPU visibility. - This addition enhances the CI/CD process by streamlining the environment setup for Copilot integration. 
--- .github/workflows/copilot-setup-steps.yml | 35 +++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 .github/workflows/copilot-setup-steps.yml diff --git a/.github/workflows/copilot-setup-steps.yml b/.github/workflows/copilot-setup-steps.yml new file mode 100644 index 000000000..363fdbef3 --- /dev/null +++ b/.github/workflows/copilot-setup-steps.yml @@ -0,0 +1,35 @@ +name: Copilot Setup Steps + +on: + workflow_dispatch: + +jobs: + copilot-setup-steps: + runs-on: [self-hosted, copilot] + + permissions: + contents: read + + timeout-minutes: 59 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Create task venv for Copilot + run: | + python3 -m venv $GITHUB_WORKSPACE/.venv + source $GITHUB_WORKSPACE/.venv/bin/activate + python -m pip install --upgrade pip + python -m pip install -e . + + - name: Make venv default for subsequent steps + run: | + echo "$GITHUB_WORKSPACE/.venv/bin" >> $GITHUB_PATH + + - name: Verify ROCm and GPU visibility + run: | + echo "=== rocminfo ===" + rocminfo | head -50 || true + echo "=== rocm-smi ===" + rocm-smi || true From 418a23feaecd04f0c6953d7428bff43e5782d3a7 Mon Sep 17 00:00:00 2001 From: Jose Santos Date: Thu, 26 Feb 2026 11:21:57 -0600 Subject: [PATCH 4/6] ci: trigger copilot setup workflow on PR @copilot comments add issue_comment trigger with created, edited gate job to PR comments containing @copilot keep manual workflow_dispatch target self-hosted runner labels: copilot, apptainer, iris --- .github/workflows/copilot-setup-steps.yml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/copilot-setup-steps.yml b/.github/workflows/copilot-setup-steps.yml index 363fdbef3..e4eed5f11 100644 --- a/.github/workflows/copilot-setup-steps.yml +++ b/.github/workflows/copilot-setup-steps.yml @@ -2,20 +2,26 @@ name: Copilot Setup Steps on: workflow_dispatch: + issue_comment: + types: [created, edited] jobs: copilot-setup-steps: - runs-on: 
[self-hosted, copilot] + if: >- + github.event_name == 'workflow_dispatch' || + (github.event.issue.pull_request && contains(github.event.comment.body, '@copilot')) + runs-on: [self-hosted, copilot, apptainer, iris] permissions: contents: read + pull-requests: read timeout-minutes: 59 steps: - name: Checkout repository uses: actions/checkout@v4 - + - name: Create task venv for Copilot run: | python3 -m venv $GITHUB_WORKSPACE/.venv From 4d39f2f472fb5bbf149600e0ef55c99fa1850c40 Mon Sep 17 00:00:00 2001 From: Jose Santos Date: Tue, 3 Mar 2026 12:46:03 -0600 Subject: [PATCH 5/6] Update Copilot setup workflow to use system site packages for virtual environment --- .github/workflows/copilot-setup-steps.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/copilot-setup-steps.yml b/.github/workflows/copilot-setup-steps.yml index e4eed5f11..b3d1b86db 100644 --- a/.github/workflows/copilot-setup-steps.yml +++ b/.github/workflows/copilot-setup-steps.yml @@ -24,7 +24,7 @@ jobs: - name: Create task venv for Copilot run: | - python3 -m venv $GITHUB_WORKSPACE/.venv + python3 -m venv --system-site-packages $GITHUB_WORKSPACE/.venv source $GITHUB_WORKSPACE/.venv/bin/activate python -m pip install --upgrade pip python -m pip install -e . From 4b55cb77c672444cc047fe7a4e93cd1879283b97 Mon Sep 17 00:00:00 2001 From: Jose Santos Date: Mon, 16 Mar 2026 14:02:14 -0400 Subject: [PATCH 6/6] Enhance GitHub runner scripts with additional directory creation and cleanup processes - Updated `run-github-coding-agent-runner.sh` to create necessary subdirectories within the runner work directory. - Modified `start.sh` to ensure writable HOME and TMPDIR directories are set up for job steps. - Added cleanup functionality to terminate stale MCP processes during runner cleanup. - Introduced new documentation files outlining build and run instructions, as well as workflow conventions for the GitHub Actions runner. 
--- .../.cursor/rules/github-runner-build-run.mdc | 19 ++++++++++++++ .../rules/github-runner-conventions.mdc | 25 +++++++++++++++++++ .../run-github-coding-agent-runner.sh | 2 +- .github/scripts/github-runner-files/start.sh | 10 ++++++++ 4 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 .github/scripts/github-runner-files/.cursor/rules/github-runner-build-run.mdc create mode 100644 .github/scripts/github-runner-files/.cursor/rules/github-runner-conventions.mdc diff --git a/.github/scripts/github-runner-files/.cursor/rules/github-runner-build-run.mdc b/.github/scripts/github-runner-files/.cursor/rules/github-runner-build-run.mdc new file mode 100644 index 000000000..69a4cee8c --- /dev/null +++ b/.github/scripts/github-runner-files/.cursor/rules/github-runner-build-run.mdc @@ -0,0 +1,19 @@ +--- +description: Build and run instructions (from skills.md) +globs: **/build-github-coding-agent-runner.sh,**/run-github-coding-agent-runner.sh,**/start.sh,**/iris.def +alwaysApply: false +--- + +# Build & Run + +## Build (SLURM) + +From the `github-runner` directory: `sbatch build-github-coding-agent-runner.sh`. + +- Partition: `mi3001x`, time limit: 2 hours. +- Input: `--def=FILE` (default `iris.def`). Output: `--output=FILE` (default `github-copilot-coding-agent-runner.sif`). Same directory. +- Submit from repo dir so `SLURM_SUBMIT_DIR` is correct. Temp/cache under build dir (`.apptainer-tmp`, `.apptainer-cache`); temp removed after success, cache kept for rebuilds. + +## Run + +After build: (1) Standalone: `./run-github-coding-agent-runner.sh --github-token=... --github-repository=... --script-dir="$(pwd)" --runner-base="$(pwd)/runner-data"`. (2) SLURM: set `GITHUB_TOKEN` and `GITHUB_REPOSITORY`, then `sbatch run-github-coding-agent-runner.sh` (script uses env and SLURM defaults). See README.md for full setup. 
diff --git a/.github/scripts/github-runner-files/.cursor/rules/github-runner-conventions.mdc b/.github/scripts/github-runner-files/.cursor/rules/github-runner-conventions.mdc new file mode 100644 index 000000000..860cd02b0 --- /dev/null +++ b/.github/scripts/github-runner-files/.cursor/rules/github-runner-conventions.mdc @@ -0,0 +1,25 @@ +--- +description: Workflow and conventions for the GitHub Actions runner (from AGENTS.md) +alwaysApply: true +--- + +# GitHub Runner – Workflow and Conventions + +## Workflow + +Flow: run-github-coding-agent-runner.sh → container → start.sh → Actions listener. Two run modes: (1) Standalone: `./run-github-coding-agent-runner.sh` with required flags (--github-token, --github-repository, --script-dir, --runner-base). (2) SLURM: set GITHUB_TOKEN and GITHUB_REPOSITORY, then `sbatch run-github-coding-agent-runner.sh`; when under SLURM with no args, the script uses env and SLURM defaults. start.sh installs/configures the runner in RUNNER_HOME and starts the Actions listener. + +## Conventions + +1. **No sensitive data** – Do not hardcode tokens, passwords, or API keys. Use environment variables (e.g. export GITHUB_TOKEN before running). +2. **No host-specific paths** – Do not add paths like /work1/amd/josantos/... Prefer SCRIPT_DIR with dirname BASH_SOURCE, or GITHUB_WORKSPACE, RUNNER_WORKDIR, RUNNER_BASE, WORK, or relative paths. +3. **Do not edit iris.def unless the user explicitly asks.** Prefer changing start.sh or run-github-coding-agent-runner.sh for runtime behavior. +4. **Use known writable directories** – Prefer GITHUB_WORKSPACE, RUNNER_WORKDIR, RUNNER_BASE for installs and cache. Avoid $HOME, ~, /tmp. 
+ +## Directory layout (when running in runner) + +| Variable | Use for | +|----------|---------| +| GITHUB_WORKSPACE | Repo checkout; installs, cache, venv | +| RUNNER_WORKDIR | Parent of owner/repo; job work | +| RUNNER_BASE | Runner data root; overlay, .github-runner | diff --git a/.github/scripts/github-runner-files/run-github-coding-agent-runner.sh b/.github/scripts/github-runner-files/run-github-coding-agent-runner.sh index 291b4d39c..ad6c0728f 100755 --- a/.github/scripts/github-runner-files/run-github-coding-agent-runner.sh +++ b/.github/scripts/github-runner-files/run-github-coding-agent-runner.sh @@ -101,7 +101,7 @@ if [ -z "$RUNNER_NAME" ]; then RUNNER_NAME="${REPO_NAME}-runner-${CLUSTER_NAME}-$(date +%Y%m%d)-$(date +%H%M%S)" fi RUNNER_LABELS="${RUNNER_LABELS:-copilot}" -mkdir -p "${RUNNER_WORKDIR}" +mkdir -p "${RUNNER_WORKDIR}" "${RUNNER_WORKDIR}/.home" "${RUNNER_WORKDIR}/.pip-cache" "${RUNNER_WORKDIR}/.tmp" "${RUNNER_WORKDIR}/.cache" [ -n "${USE_OVERLAY}" ] && [ "${USE_OVERLAY}" != "0" ] && mkdir -p "${OVERLAY_DIR}" echo "==========================================" diff --git a/.github/scripts/github-runner-files/start.sh b/.github/scripts/github-runner-files/start.sh index f7045fd2d..adc1aea36 100755 --- a/.github/scripts/github-runner-files/start.sh +++ b/.github/scripts/github-runner-files/start.sh @@ -49,6 +49,11 @@ fi export RUNNER_ALLOW_RUNASROOT="${RUNNER_ALLOW_RUNASROOT:-1}" export TRITON_CACHE_DIR="${TRITON_CACHE_DIR:-${RUNNER_WORKDIR}/.triton_cache}" +# Writable HOME/TMPDIR for job steps (run-github-coding-agent-runner.sh may already create dirs on host) +mkdir -p "${RUNNER_WORKDIR}/.home" "${RUNNER_WORKDIR}/.tmp" +export HOME="${RUNNER_WORKDIR}/.home" +export TMPDIR="${RUNNER_WORKDIR}/.tmp" + mkdir -p "${RUNNER_HOME}" echo "==========================================" @@ -125,6 +130,11 @@ echo "Configuring runner..." 
# Cleanup function cleanup() { + # Kill any stale MCP processes left over from cancelled jobs + pkill -f "mcp/dist/index.js" 2>/dev/null || true + pkill -f "mcp-server-playwright" 2>/dev/null || true + pkill -f "playwright-mcp" 2>/dev/null || true + # Only run removal once; skip if config already removed if [ ! -f "${RUNNER_HOME}/.runner" ]; then echo "Runner config already removed or not configured. Skipping cleanup."