Add a script that makes it easy to score submissions.

aahladc · aahladc · commit b4734a4a0166 · 2026-03-19T13:22:22.000-07:00
The script is meant to be used only in the slurm cluster. It forces a specific directory structure, and checks for it right in the beginning. If the dir structure is not as expected, it throws an error and explains the structure it expects.

It also includes a dry run flag which runs the job for 10 steps, and includes a command on how to use it at the top of the file.
Also update the readme file to explain this script.
diff --git a/scoring/utils/slurm/README.md b/scoring/utils/slurm/README.md
@@ -48,6 +48,29 @@ LOGS_BUCKET="algoperf-runs-internal"
 sbatch run_jobs.sh
 ```
 
+## Convenient bash script to launch SLURM jobs
+
+The run_submissions.sh script does all the steps above for you. It is intended to be used on a slurm login node. It however does expect a very specific directory structure. You need to be in the $HOME dir with the algorithmic-efficiency and submissions_algorithms git repos in the home dir.
+
+```
+$USER$@$USER$:~/$ tree -L 1
+.
+├── algorithmic-efficiency
+└── submissions_algorithms
+```
+
+And you run the script with a command like so:
+
+```
+./algorithmic-efficiency/scoring/utils/slurm/run_submission.sh \
+  --submission_path submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2
+  --dry_run false
+```
+
+The submission path points to the dir where the submission exists (in the submissions git repo). `dry_run` is set to true by default (which limits max global steps to 10) to prevent accidental commands from wasting resources. Explicitly set it to false for full runs.
+
+The script will figure out the rest and run them for you (creating the config, saving it to a path with a reasonable name, and running the sbatch script with the right flags).
+
 # Set up new SLURM cluster
 
 If you are setting up a new cluster, we recommend using the [HPC toolkit to set up a SLURM cluster](https://cloud.google.com/cluster-toolkit/docs/quickstarts/slurm-cluster).
diff --git a/scoring/utils/slurm/run_submission.sh b/scoring/utils/slurm/run_submission.sh
@@ -0,0 +1,204 @@
+#!/bin/bash
+
+# Usage:
+# ./algorithmic-efficiency/scoring/utils/slurm/run_submission.sh \
+#   --submission_path submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2
+#
+# Note: --dry_run is true by default (sets MAX_GLOBAL_STEPS=10).
+# To perform a full run, explicitly set --dry_run false.
+
+set -e
+set -x
+
+# --- Global Variables ---
+SUBMISSION_PATH=""
+DRY_RUN=true
+MAX_GLOBAL_STEPS=10
+SUBMISSION_NAME=""
+RULESET=""
+FRAMEWORK=""
+ARRAY_RANGE=""
+
+# --- Helper Functions ---
+
+install_yq() {
+  if ! command -v yq &> /dev/null; then
+    echo "yq not found. Attempting to install locally to $HOME/.local/bin..."
+    mkdir -p "$HOME/.local/bin"
+    local OS=$(uname | tr '[:upper:]' '[:lower:]')
+    local ARCH=$(uname -m)
+    case "$ARCH" in
+      x86_64) ARCH="amd64" ;;
+      aarch64) ARCH="arm64" ;;
+    esac
+    
+    local YQ_URL="https://github.com/mikefarah/yq/releases/latest/download/yq_${OS}_${ARCH}"
+    if command -v curl &> /dev/null; then
+      curl -L "$YQ_URL" -o "$HOME/.local/bin/yq"
+    elif command -v wget &> /dev/null; then
+      wget "$YQ_URL" -O "$HOME/.local/bin/yq"
+    else
+      echo "Error: Neither curl nor wget found. Please install yq manually: https://github.com/mikefarah/yq"
+      exit 1
+    fi
+    chmod +x "$HOME/.local/bin/yq"
+    export PATH="$HOME/.local/bin:$PATH"
+    echo "yq installed successfully to $HOME/.local/bin"
+  fi
+}
+
+check_command() {
+  if ! command -v "$1" &> /dev/null; then
+    echo "Error: $1 could not be found. Please install it."
+    exit 1
+  fi
+}
+
+verify_environment() {
+  if [[ "$PWD" != "$HOME" ]]; then
+    echo "Error: This script must be run from your home directory ($HOME)."
+    echo "Expected directory structure:"
+    echo "  $HOME/"
+    echo "  ├── algorithmic-efficiency/"
+    echo "  └── submissions_algorithms/"
+    exit 1
+  fi
+
+  if [[ ! -d "algorithmic-efficiency" || ! -d "submissions_algorithms" ]]; then
+    echo "Error: Required repositories not found in the current directory."
+    echo "Please ensure both 'algorithmic-efficiency' and 'submissions_algorithms' are present in $HOME."
+    exit 1
+  fi
+
+  install_yq
+  check_command "jq"
+}
+
+parse_flags() {
+  while [[ $# -gt 0 ]]; do
+    case $1 in
+      --submission_path)
+        SUBMISSION_PATH="$2"
+        shift 2
+        ;;
+      --dry_run)
+        DRY_RUN="$2"
+        shift 2
+        ;;
+      *)
+        echo "Unknown option $1"
+        exit 1
+        ;;
+    esac
+  done
+
+  if [ -z "$SUBMISSION_PATH" ]; then
+    echo "Error: --submission_path is required."
+    exit 1
+  fi
+
+  if [ "$DRY_RUN" = false ]; then
+    MAX_GLOBAL_STEPS=""
+  fi
+}
+
+extract_submission_info() {
+  SUBMISSION_NAME=$(basename "$SUBMISSION_PATH")
+  local info_file="$SUBMISSION_PATH/submission_info.yml"
+
+  if [ ! -f "$info_file" ]; then
+    echo "Error: $info_file not found."
+    exit 1
+  fi
+
+  local raw_ruleset=$(yq eval '.ruleset' "$info_file" | tr '[:upper:]' '[:lower:]')
+  FRAMEWORK=$(yq eval '.framework' "$info_file" | tr '[:upper:]' '[:lower:]')
+
+  # Parse ruleset by checking for substrings "self" or "external"
+  if [[ "$raw_ruleset" == *"self"* ]]; then
+    RULESET="self"
+  elif [[ "$raw_ruleset" == *"external"* ]]; then
+    RULESET="external"
+  else
+    echo "Error: Expected 'ruleset' in $info_file to contain 'self' or 'external' (got '$raw_ruleset')."
+    exit 1
+  fi
+
+  # Verify framework
+  if [[ "$FRAMEWORK" != "jax" && "$FRAMEWORK" != "pytorch" ]]; then
+    echo "Error: 'framework' in $info_file must be either 'jax' or 'pytorch' (got '$FRAMEWORK')."
+    exit 1
+  fi
+
+  echo "Submission Name: $SUBMISSION_NAME"
+  echo "Ruleset: $RULESET"
+  echo "Framework: $FRAMEWORK"
+  echo "Dry Run: $DRY_RUN"
+  echo "Max Global Steps: $MAX_GLOBAL_STEPS"
+}
+
+generate_config() {
+  local exp_prefix="submissions_a100_dry_run"
+  if [ "$DRY_RUN" = false ]; then
+    exp_prefix="submissions_a100"
+  fi
+
+  docker run \
+    --rm \
+    -v "$(pwd)":/algorithmic-efficiency \
+    -w /algorithmic-efficiency \
+    --entrypoint python \
+    "europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/algoperf_${FRAMEWORK}_main:latest" \
+    algorithmic-efficiency/scoring/utils/slurm/make_job_config.py \
+    --framework="$FRAMEWORK" \
+    --tuning_ruleset="$RULESET" \
+    --submission_path="$SUBMISSION_PATH/submission.py" \
+    --experiment_dir="${exp_prefix}/$SUBMISSION_NAME"
+
+  mv config.json "$SUBMISSION_NAME.json"
+}
+
+prepare_sbatch_array() {
+  local num_jobs=$(jq 'length' "$SUBMISSION_NAME.json")
+  if [[ "$num_jobs" -eq 0 ]]; then
+    echo "Error: No jobs found in $SUBMISSION_NAME.json."
+    exit 1
+  fi
+
+  ARRAY_RANGE="0-$((num_jobs - 1))"
+  echo "Number of jobs: $num_jobs"
+  echo "Sbatch array range: $ARRAY_RANGE"
+
+  mkdir -p "experiments/tests/$SUBMISSION_NAME"
+}
+
+run_sbatch() {
+  local sbatch_cmd=(
+    sbatch
+    --array="$ARRAY_RANGE"
+    --output="experiments/tests/$SUBMISSION_NAME/job_%A_%a.out"
+    --error="experiments/tests/$SUBMISSION_NAME/job_%A_%a.err"
+    "algorithmic-efficiency/scoring/utils/slurm/run_jobs.sh"
+    --config_file "$(pwd)/$SUBMISSION_NAME.json"
+    --image "europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/algoperf_${FRAMEWORK}_main:latest"
+  )
+
+  if [ -n "$MAX_GLOBAL_STEPS" ]; then
+    sbatch_cmd+=(--max_global_steps "$MAX_GLOBAL_STEPS")
+  fi
+
+  "${sbatch_cmd[@]}"
+}
+
+# --- Main ---
+
+main() {
+  verify_environment
+  parse_flags "$@"
+  extract_submission_info
+  generate_config
+  prepare_sbatch_array
+  run_sbatch
+}
+
+main "$@"