Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
15 commits
Select commit Hold shift + click to select a range
d866870
chore: exclude data, plans, and SLURM log files from version control
mohamedelabbas1996 Mar 8, 2026
e3e4239
config: add GBIF credentials env vars and fiftyone optional dependency
mohamedelabbas1996 Mar 8, 2026
3bde89c
chore: update lockfiles after adding fiftyone optional dependency
mohamedelabbas1996 Mar 8, 2026
b6b87e3
chore: standardize SLURM headers and venv activation in existing job …
mohamedelabbas1996 Mar 8, 2026
6cf3004
feat: include verbatimScientificName in DwCA data loading
mohamedelabbas1996 Mar 8, 2026
5ea3c7c
feat: allow clean-dataset to run without prior verification step
mohamedelabbas1996 Mar 8, 2026
ca59db2
feat: add download-gbif module and CLI command for GBIF DwCA downloads
mohamedelabbas1996 Mar 8, 2026
927e923
feat: add fetch-and-pack module for quota-safe chunked webdataset cre…
mohamedelabbas1996 Mar 8, 2026
94e849e
feat: add FiftyOne webdataset visualization module
mohamedelabbas1996 Mar 8, 2026
dbd2ee3
feat: add Vermont species pipeline SLURM scripts
mohamedelabbas1996 Mar 8, 2026
d567949
feat: add GBIF download and DwCA inspection job scripts
mohamedelabbas1996 Mar 8, 2026
44ff278
feat: add FiftyOne visualization job scripts
mohamedelabbas1996 Mar 8, 2026
9e35b9a
feat: add fetch-and-pack chunked pipeline and test scripts
mohamedelabbas1996 Mar 8, 2026
6ebcd6f
chore: add archived and legacy pipeline scripts
mohamedelabbas1996 Mar 8, 2026
dd146e1
feat: add webdataset inspection script with class distribution stats
mohamedelabbas1996 Mar 8, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
WANDB_API_KEY=
OUTPUT_LOGS_DIR=
TEST_IMAGES_DIR=

GBIF_USER=
GBIF_PWD=
GBIF_EMAIL=
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -171,3 +171,7 @@ cython_debug/

# Weights and Biases files
wandb/

# Project data and plans (not for version control)
data/
plans/
4,460 changes: 2,961 additions & 1,499 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ dependencies = [
]

[project.optional-dependencies]
viz = [
"fiftyone",
]
research = [
"awscli>=1.33.44",
"awscli-plugin-endpoint>=0.4",
Expand Down
61 changes: 61 additions & 0 deletions scripts/gbif/job_download_ne_america.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/bin/bash
#SBATCH --job-name=download_ne_america
#SBATCH --account=def-drolnick
#SBATCH --time=6:00:00
#SBATCH --cpus-per-task=4
#SBATCH --mem=8G
#SBATCH --output=/project/6068129/melabbas/data/ne-america-eccv2024/slurm-%j.out
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=hack1996man@gmail.com

# Resumable download of ne-america webdataset shards from object storage.
# Builds an s5cmd run-file listing only the shards missing locally, then
# fetches them in parallel, and finally reports per-split shard counts.

set -euo pipefail

S5CMD="$HOME/bin/s5cmd"
ENDPOINT="https://object-arbutus.cloud.computecanada.ca"
DEST=/project/6068129/melabbas/data/ne-america-eccv2024/wbds/

mkdir -p "$DEST"

echo "Starting download at $(date)"
echo "Destination: $DEST"

# Build a run-file of only the missing shards (resumable)
RUNFILE=$(mktemp /tmp/s5cmd_runfile_XXXXXX.txt)
# Remove the run-file on every exit path, not only after a clean finish.
trap 'rm -f "$RUNFILE"' EXIT

echo "Building run-file of missing files..."
S3_PREFIX="s3://ami-dataset-eccv2024/ami_gbif/fine-grained_classification/wbds"

# IFS= read -r: keep object names intact (no backslash mangling, SC2162).
"$S5CMD" --profile ami --endpoint-url "$ENDPOINT" \
  ls "${S3_PREFIX}/ne-america_*" \
  | awk '{print $NF}' \
  | while IFS= read -r fname; do
      if [ ! -f "$DEST/$fname" ]; then
        echo "cp ${S3_PREFIX}/${fname} $DEST/${fname}"
      fi
    done > "$RUNFILE"

N_MISSING=$(wc -l < "$RUNFILE")
echo "Files to download: $N_MISSING"

if [ "$N_MISSING" -gt 0 ]; then
  "$S5CMD" --profile ami --endpoint-url "$ENDPOINT" \
    --numworkers 32 \
    run "$RUNFILE"
else
  echo "All files already present, nothing to download."
fi

echo "Download complete at $(date)"

# Count shards with nullglob globs instead of `ls ... | wc -l`: under this
# script's `set -o pipefail` the old form aborted the job whenever a split
# had zero shards (ls exits non-zero inside the command substitution), and
# parsing ls output is fragile in general (SC2012).
shopt -s nullglob
train_shards=("$DEST"ne-america_train450-*.tar)
val_shards=("$DEST"ne-america_val450-*.tar)
test_shards=("$DEST"ne-america_test450-*.tar)
shopt -u nullglob

N_TRAIN=${#train_shards[@]}
N_VAL=${#val_shards[@]}
N_TEST=${#test_shards[@]}
TOTAL_GB=$(du -sh "$DEST" | cut -f1)

echo "train=$N_TRAIN val=$N_VAL test=$N_TEST total=$TOTAL_GB"

~/bin/notify "ne-america download done" \
  "train=${N_TRAIN}/1583, val=${N_VAL}/187, test=${N_TEST}/375, size=${TOTAL_GB} — $(date)"
27 changes: 27 additions & 0 deletions scripts/gbif/job_download_quebec_vermont.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash
#SBATCH --account=def-drolnick
#SBATCH --job-name=download_quebec_vermont
#SBATCH --cpus-per-task=4
#SBATCH --mem=8G
#SBATCH --time=1:00:00
#SBATCH --output=/project/6068129/melabbas/ami-ml/scripts/download_quebec_vermont_%j.out
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=hack1996man@gmail.com

# Sync the quebec_vermont dataset from the ami-fine-tuning bucket to project
# storage. `--if-size-differ` makes re-runs skip files already fully copied.

# Fail fast: without this, a failed s5cmd copy still printed "completed"
# and fired the success notification.
set -euo pipefail

DEST="/project/6068129/melabbas/data/quebec_vermont"
ENDPOINT="https://object-arbutus.cloud.computecanada.ca"
SRC="s3://ami-fine-tuning/quebec_vermont"

export AWS_PROFILE=ami
export AWS_REGION=us-east-1

mkdir -p "$DEST"

echo "Starting download at $(date)"

s5cmd --endpoint-url "$ENDPOINT" cp --if-size-differ \
"${SRC}/*" "$DEST/"

echo "Download completed at $(date)"

~/bin/notify "download_quebec_vermont: done" "Dataset downloaded to $DEST"
18 changes: 18 additions & 0 deletions scripts/gbif/job_download_vermont_species.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash
#SBATCH --job-name=download_vermont_species
#SBATCH --cpus-per-task=4
#SBATCH --mem=8G
#SBATCH --time=4:00:00
#SBATCH --output=download_vermont_species_%j.out
#SBATCH --account=def-drolnick

# Download a prepared GBIF occurrence export (DwCA zip) for Vermont species.
# `curl -C -` resumes a partial download if the job is re-submitted.

# Fail fast so a broken download is not reported as done.
set -euo pipefail

BASE_DIR="/home/melabbas/projects/def-drolnick/melabbas/ami-ml"
OUT="$BASE_DIR/data/vermont_species.zip"

# curl -o does not create missing directories; ensure the target dir exists.
mkdir -p "$(dirname "$OUT")"

echo "Starting download at $(date)"
curl -L -C - --progress-bar \
"https://api.gbif.org/v1/occurrence/download/request/0030333-260208012135463.zip" \
-o "$OUT"

echo "Done at $(date)"
echo "File size: $(du -sh "$OUT")"
64 changes: 64 additions & 0 deletions scripts/gbif/job_dwca_inspect_northamerica.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/bin/bash
#SBATCH --account=def-drolnick
#SBATCH --job-name=dwca_inspect_northamerica
#SBATCH --cpus-per-task=2
#SBATCH --mem=32G
#SBATCH --time=3:00:00
#SBATCH --output=/project/6068129/melabbas/ami-ml/scripts/dwca_inspect_northamerica_%j.out
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=hack1996man@gmail.com

# Run a three-section dwca-tools inspection (file summary, taxa with image
# counts, taxa grouped by verbatimScientificName) over the North America
# butterflies DwCA and tee the combined output into a report file.

# Fail fast: previously a failed cd/source/dwca-tools step still produced
# a "done" notification and an incomplete report.
set -euo pipefail

DWCA="/home/melabbas/projects/def-drolnick/melabbas/ami-ml/data/northamerica_gbif_download_v1/gbif_inat_northamerica_butterflies_106spp_2081177rec_20260227.zip"
DWCA_TOOLS="/home/melabbas/projects/def-drolnick/melabbas/dwca-tools"
REPORT="/home/melabbas/projects/def-drolnick/melabbas/ami-ml/data/northamerica_gbif_download_v1/dwca_inspection_northamerica_butterflies_106spp_2081177rec_image_counts_20260227.txt"

cd "$DWCA_TOOLS"
source .venv/bin/activate

{
echo "############################################################"
echo "# DwCA Inspection Report"
# Quote the substitution argument (SC2086); the path is spaceless today,
# but unquoted expansions are how these scripts break later.
echo "# Archive : $(basename "$DWCA")"
echo "# Generated: $(date)"
echo "############################################################"
echo ""

echo "============================================================"
echo "=== SECTION 1: summarize files"
echo "============================================================"
echo ""
dwca-tools summarize files "$DWCA"
echo ""
echo "Completed at $(date)"
echo ""

echo "============================================================"
echo "=== SECTION 2: summarize taxa --image-counts"
echo "============================================================"
echo ""
dwca-tools summarize taxa "$DWCA" \
--image-counts
echo ""
echo "Completed at $(date)"
echo ""

echo "============================================================"
echo "=== SECTION 3: summarize taxa --group-by verbatimScientificName --show-mismatched-names --image-counts"
echo "============================================================"
echo ""
dwca-tools summarize taxa "$DWCA" \
--group-by verbatimScientificName \
--show-mismatched-names \
--image-counts
echo ""
echo "Completed at $(date)"
echo ""

echo "############################################################"
echo "# End of report"
echo "############################################################"
} | tee "$REPORT"

echo ""
echo "Report saved to: $REPORT"
notify "dwca inspect northamerica done" "3-section inspection report (with image counts) saved to $(basename "$REPORT")"
25 changes: 25 additions & 0 deletions scripts/gbif/job_dwca_summarize_files.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash
#SBATCH --account=def-drolnick
#SBATCH --job-name=dwca_summarize_files
#SBATCH --cpus-per-task=1
#SBATCH --mem=8G
#SBATCH --time=0:30:00
#SBATCH --output=/project/6068129/melabbas/ami-ml/scripts/dwca_summarize_files_%j.out
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=hack1996man@gmail.com

# Summarize the files contained in the Vermont butterflies DwCA archive.

# Fail fast: abort if cd, venv activation, or dwca-tools itself fails,
# instead of continuing and printing a misleading "Completed" line.
set -euo pipefail

DWCA="/home/melabbas/projects/def-drolnick/melabbas/ami-ml/data/vermont_gbif_download_v2/gbif_inat_vermont_butterflies_106spp_2479479rec_20260225.zip"
DWCA_TOOLS="/home/melabbas/projects/def-drolnick/melabbas/dwca-tools"

cd "$DWCA_TOOLS"
source .venv/bin/activate

echo "=== dwca-tools summarize files ==="
echo "Archive: $DWCA"
echo "Started at $(date)"
echo ""

dwca-tools summarize files "$DWCA"

echo ""
echo "Completed at $(date)"
25 changes: 25 additions & 0 deletions scripts/gbif/job_dwca_summarize_taxa.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash
#SBATCH --account=def-drolnick
#SBATCH --job-name=dwca_summarize_taxa
#SBATCH --cpus-per-task=2
#SBATCH --mem=16G
#SBATCH --time=3:00:00
#SBATCH --output=/project/6068129/melabbas/ami-ml/scripts/dwca_summarize_taxa_%j.out
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=hack1996man@gmail.com

# Summarize taxa in the Vermont butterflies DwCA archive.

# Fail fast: abort if cd, venv activation, or dwca-tools itself fails,
# instead of continuing and printing a misleading "Completed" line.
set -euo pipefail

DWCA="/home/melabbas/projects/def-drolnick/melabbas/ami-ml/data/vermont_gbif_download_v2/gbif_inat_vermont_butterflies_106spp_2479479rec_20260225.zip"
DWCA_TOOLS="/home/melabbas/projects/def-drolnick/melabbas/dwca-tools"

cd "$DWCA_TOOLS"
source .venv/bin/activate

echo "=== dwca-tools summarize taxa ==="
echo "Archive: $DWCA"
echo "Started at $(date)"
echo ""

dwca-tools summarize taxa "$DWCA"

echo ""
echo "Completed at $(date)"
27 changes: 27 additions & 0 deletions scripts/gbif/job_dwca_summarize_taxa_verbatim.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash
#SBATCH --account=def-drolnick
#SBATCH --job-name=dwca_summarize_verbatim
#SBATCH --cpus-per-task=2
#SBATCH --mem=16G
#SBATCH --time=3:00:00
#SBATCH --output=/project/6068129/melabbas/ami-ml/scripts/dwca_summarize_verbatim_%j.out
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=hack1996man@gmail.com

# Summarize taxa in the Vermont butterflies DwCA, grouped by the verbatim
# scientific name, flagging names that mismatch the interpreted taxonomy.

# Fail fast: abort if cd, venv activation, or dwca-tools itself fails,
# instead of continuing and printing a misleading "Completed" line.
set -euo pipefail

DWCA="/home/melabbas/projects/def-drolnick/melabbas/ami-ml/data/vermont_gbif_download_v2/gbif_inat_vermont_butterflies_106spp_2479479rec_20260225.zip"
DWCA_TOOLS="/home/melabbas/projects/def-drolnick/melabbas/dwca-tools"

cd "$DWCA_TOOLS"
source .venv/bin/activate

echo "=== dwca-tools summarize taxa --group-by verbatimScientificName --show-mismatched-names ==="
echo "Archive: $DWCA"
echo "Started at $(date)"
echo ""

dwca-tools summarize taxa "$DWCA" \
--group-by verbatimScientificName \
--show-mismatched-names

echo ""
echo "Completed at $(date)"
35 changes: 35 additions & 0 deletions scripts/gbif/job_gbif_download.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/bash
#SBATCH --account=def-drolnick
#SBATCH --job-name=gbif_download
#SBATCH --cpus-per-task=2
#SBATCH --mem=8G
#SBATCH --time=4:00:00
#SBATCH --output=gbif_download_%j.out
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=hack1996man@gmail.com

# Request and download a GBIF DwCA export for the Vermont species list,
# with optional country / bbox / dataset-key filters.

# Fail fast so a failed download is not silently reported as done.
set -euo pipefail

BASE_DIR="/home/melabbas/projects/def-drolnick/melabbas/ami-ml"
NAMES_FILE="$BASE_DIR/data/vermont_species_list_v2.txt"
OUTPUT_DIR="$BASE_DIR/data/vermont_gbif_download_v2"
COUNTRY=""       # e.g. "US" — leave empty to omit
BBOX=""          # e.g. "42.7,-73.5,45.0,-71.5" — leave empty to omit
DATASET_KEY="50c9509d-22c7-4a22-a47d-8c48425ef4a7"

cd "$BASE_DIR"
source .venv/bin/activate
if [[ -f .env ]]; then set -a; source .env; set +a; fi   # loads GBIF_USER/PWD/EMAIL
mkdir -p "$OUTPUT_DIR"

# Build optional flags as an array instead of an unquoted string (SC2086):
# string-splitting breaks on any value containing spaces or glob characters.
# `if` rather than `[[ ]] && ...` so a false test does not trip `set -e`.
EXTRA_FLAGS=()
if [[ -n "$COUNTRY" ]]; then EXTRA_FLAGS+=(--country "$COUNTRY"); fi
if [[ -n "$BBOX" ]]; then EXTRA_FLAGS+=(--bbox "$BBOX"); fi
if [[ -n "$DATASET_KEY" ]]; then EXTRA_FLAGS+=(--dataset-key "$DATASET_KEY"); fi

# "${EXTRA_FLAGS[@]}" expands to zero words when empty (bash >= 4.4 under set -u).
ami-dataset download-gbif \
--names-file "$NAMES_FILE" \
--output-dir "$OUTPUT_DIR" \
--poll-interval 30 \
--max-wait 14400 \
"${EXTRA_FLAGS[@]}"

echo "Done at $(date)"; ls -lh "$OUTPUT_DIR"
31 changes: 31 additions & 0 deletions scripts/gbif/job_gbif_download_northamerica.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/bin/bash
#SBATCH --account=def-drolnick
#SBATCH --job-name=gbif_download_northamerica
#SBATCH --cpus-per-task=2
#SBATCH --mem=8G
#SBATCH --time=12:00:00
#SBATCH --output=gbif_download_northamerica_%j.out
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=hack1996man@gmail.com

# Request and download a GBIF DwCA export for the 106 Vermont species,
# restricted to US + CA occurrences from the iNaturalist dataset.

# Fail fast: without this, a failed download still fired the success notify.
set -euo pipefail

BASE_DIR="/home/melabbas/projects/def-drolnick/melabbas/ami-ml"
NAMES_FILE="$BASE_DIR/data/vermont_species_list_v2.txt"
OUTPUT_DIR="$BASE_DIR/data/northamerica_gbif_download_v1"
DATASET_KEY="50c9509d-22c7-4a22-a47d-8c48425ef4a7"

cd "$BASE_DIR"
source .venv/bin/activate
if [[ -f .env ]]; then set -a; source .env; set +a; fi   # loads GBIF_USER/PWD/EMAIL
mkdir -p "$OUTPUT_DIR"

ami-dataset download-gbif \
--names-file "$NAMES_FILE" \
--output-dir "$OUTPUT_DIR" \
--dataset-key "$DATASET_KEY" \
--country US \
--country CA \
--poll-interval 60 \
--max-wait 21600

echo "Done at $(date)"; ls -lh "$OUTPUT_DIR"
notify "gbif northamerica download done" "DwCA for 106 Vermont spp (US+CA, iNat) saved to $OUTPUT_DIR"
36 changes: 15 additions & 21 deletions scripts/job_clean_dataset.sh
Original file line number Diff line number Diff line change
@@ -1,31 +1,25 @@
#!/bin/bash
#SBATCH --account=def-drolnick
#SBATCH --job-name=clean_dataset
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=2
#SBATCH --mem=16G
#SBATCH --time=3:00:00
#SBATCH --partition=long-cpu # Ask for long-cpu job
#SBATCH --cpus-per-task=2 # Ask for 2 CPUs
#SBATCH --mem=300G # Ask for 300 GB of RAM
#SBATCH --output=clean_dataset_%j.out

# 1. Load the required modules
module load miniconda/3
BASE_DIR="/home/melabbas/projects/def-drolnick/melabbas/ami-ml"

# 2. Load your environment
conda activate ami-ml
cd "$BASE_DIR"
source .venv/bin/activate

# 3. Load the environment variables outside of python script
set -o allexport
source .env
set +o allexport
if [[ -f .env ]]; then
set -a
source .env
set +a
fi

# Keep track of time
SECONDS=0

# 4. Launch your script
ami-dataset clean-dataset \
--dwca-file $DWCA_FILE \
--verified-data-csv $VERIFICATION_RESULTS \
--life-stage-predictions $LIFESTAGE_RESULTS
--dwca-file $DWCA_FILE \
--verified-data-csv $VERIFICATION_RESULTS \
--life-stage-predictions $LIFESTAGE_RESULTS

# Print time taken to execute the script
echo "Time taken to clean the dataset: $SECONDS seconds"
echo "Clean completed at $(date)"
Loading
Loading