Skip to content

[TRITON] Sage V2 quantization on Unified Attention #7023

[TRITON] Sage V2 quantization on Unified Attention

[TRITON] Sage V2 quantization on Unified Attention #7023

Workflow file for this run

# Aiter CI workflow: triggers, concurrency policy and shared environment.
name: Aiter Test

on:
  # Every push to main.
  push:
    branches:
      - main
  # PRs targeting `main`; documentation-only changes do not trigger a run.
  pull_request:
    types:
      - opened
      - synchronize
      - reopened
      - ready_for_review
    branches:
      - main
    paths-ignore:
      - '**/*.md'
      - 'docs/**'
      - 'LICENSE'
      - '.gitignore'
  # Manual runs from the Actions UI / API.
  workflow_dispatch:
  # Nightly run.
  schedule:
    # 22:00 UTC == 6:00 AM Beijing Time (UTC+8)
    - cron: '0 22 * * *'

# One active run per workflow + ref. A newer run cancels an older one,
# except on main where every run is allowed to finish.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

env:
  # Base image the pre-built CI image is derived from; also used directly
  # as the test image for fork PRs.
  DOCKER_IMAGE: 'rocm/pytorch:latest'
  # Passed as GPU_ARCHS when pre-building kernels.
  GPU_ARCH_LIST: 'gfx942;gfx950'
  # PR head repository when the event is a pull request, otherwise upstream.
  GITHUB_REPO_URL: ${{ github.event.pull_request.head.repo.clone_url || 'https://github.com/ROCm/aiter.git' }}
  # PR head commit, else the pushed head commit, else the workflow SHA.
  GITHUB_COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.event.head_commit.id || github.sha }}
  # Default test selection; the `standard` job overrides it per shard.
  AITER_TEST: 'op_tests'

jobs:
# Gate job that every other job depends on (directly or transitively).
# Runs for non-PR events and for PRs that are not drafts.
check-signal:
  if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }}
  runs-on: ubuntu-latest
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    # Delegates to a repository script; the token and commit SHA are handed
    # to it through the environment.
    - name: Download and check signal artifact
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        GITHUB_SHA: ${{ github.sha }}
      run: ./.github/scripts/check_signal.sh
# Builds the pre-built Aiter CI image for the commit under test, pushes it
# as rocm/aiter-ci:pre-build-<sha>, and (on main, non-scheduled runs only)
# extracts the wheel and publishes it as an artifact and to S3.
#
# Every image-building step is skipped for fork PRs; the test jobs fall back
# to the base image for those.
build_aiter_image:
  if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }}
  runs-on: build-only-aiter
  needs: check-signal
  permissions:
    # id-token is presumably required for the role assumption performed by
    # aws-actions/configure-aws-credentials below -- confirm.
    id-token: write
    contents: read
  steps:
    - name: Checkout code
      if: ${{ !github.event.pull_request.head.repo.fork }}
      uses: actions/checkout@v4

    # - name: Prepare docker config
    #   run: |
    #     export DOCKER_CONFIG="$HOME/.docker"
    #     mkdir -p "$DOCKER_CONFIG" || true
    #     cp /docker-config/config.json "$DOCKER_CONFIG/config.json"
    #     echo "DOCKER_CONFIG=$DOCKER_CONFIG" >> "$GITHUB_ENV"

    # The heredoc delimiter is unquoted, so the shell removes each
    # backslash-newline pair: the long `RUN rm -rf aiter ...` chain is
    # written to Dockerfile.mod as a single line.
    # Scheduled (nightly) builds track the CK `develop` branch; all other
    # builds use the pinned submodule commit.
    - name: Generate Dockerfile
      if: ${{ !github.event.pull_request.head.repo.fork }}
      run: |
        cat <<EOF > Dockerfile.mod
        FROM ${{ env.DOCKER_IMAGE }}
        RUN echo "=== Aiter version BEFORE uninstall ===" && pip show amd-aiter || true
        RUN pip uninstall -y aiter
        RUN pip install --upgrade pandas zmq einops numpy==1.26.2
        RUN pip install --upgrade "pybind11>=3.0.1"
        RUN pip install --upgrade "ninja>=1.11.1"
        RUN pip install tabulate
        RUN pip list
        RUN rm -rf aiter \
            && git clone ${{ env.GITHUB_REPO_URL }} aiter \
            && cd aiter \
            && git checkout ${{ env.GITHUB_COMMIT_SHA }} \
            && if [ "${{ github.event_name }}" = "schedule" ]; then \
                echo "It's nightly build, syncing latest CK..."; \
                git submodule set-branch --branch develop 3rdparty/composable_kernel; \
                git submodule sync && \
                git submodule update --init --recursive --remote --jobs 4; \
            else \
                echo "Using pinned CK commit..."; \
                git submodule sync && \
                git submodule update --init --recursive --depth 1 --jobs 4; \
            fi \
            && pip install -r requirements.txt \
            && echo "Prebuilding kernels with GPU_ARCHS: ${{ env.GPU_ARCH_LIST }} and PREBUILD_KERNELS: 1" \
            && PREBUILD_KERNELS=1 GPU_ARCHS="${{ env.GPU_ARCH_LIST }}" python setup.py bdist_wheel \
            && pip install dist/*.whl \
            && echo "Prebuilding kernels completed"
        RUN echo "=== Aiter version AFTER installation ===" && pip show amd-aiter || true
        EOF

    - name: Show Dockerfile
      if: ${{ !github.event.pull_request.head.repo.fork }}
      run: cat Dockerfile.mod

    - name: Build Docker image
      if: ${{ !github.event.pull_request.head.repo.fork }}
      run: |
        IMAGE_TAG=rocm/aiter-ci:pre-build-${{ env.GITHUB_COMMIT_SHA }}
        docker build --network=host --no-cache -t "$IMAGE_TAG" -f Dockerfile.mod .

    # Advisory only: a low kernel count emits a workflow warning but does not
    # fail the job.
    - name: Verify prebuilt kernels
      if: ${{ !github.event.pull_request.head.repo.fork }}
      run: |
        IMAGE_TAG=rocm/aiter-ci:pre-build-${{ env.GITHUB_COMMIT_SHA }}
        echo "=== Prebuilt kernel validation ==="
        KERNEL_COUNT=$(docker run --rm "$IMAGE_TAG" find /aiter/aiter/jit -name "*.so" | wc -l)
        echo "Prebuilt kernel .so files: $KERNEL_COUNT"
        docker run --rm "$IMAGE_TAG" find /aiter/aiter/jit -name "*.so" | sort
        if [ "$KERNEL_COUNT" -lt 10 ]; then
          echo "::warning::Prebuild may have failed: expected at least 10 kernel .so files, found $KERNEL_COUNT. This can cause JIT compilation and OOM at runtime."
        else
          echo "Prebuild validation passed: $KERNEL_COUNT kernels compiled"
        fi

    # The registry password is passed through the environment and piped to
    # `--password-stdin` so it never appears on the docker command line and
    # is not interpolated into the script text.
    - name: Push Docker image
      if: ${{ !github.event.pull_request.head.repo.fork }}
      env:
        DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
      run: |
        IMAGE_TAG=rocm/aiter-ci:pre-build-${{ env.GITHUB_COMMIT_SHA }}
        printf '%s' "$DOCKER_PASSWORD" | docker login -u rocmshared --password-stdin
        docker push "$IMAGE_TAG"

    - name: Success message
      if: ${{ !github.event.pull_request.head.repo.fork }}
      run: |
        echo "Successfully prepared image: rocm/aiter-ci:pre-build-${{ env.GITHUB_COMMIT_SHA }}"

    # --- Wheel publishing: main branch only, never on the nightly schedule ---

    # Copies the wheel built inside the image out to ./dist on the runner.
    - name: Extract wheel from image
      if: ${{ github.ref == 'refs/heads/main' && github.event_name != 'schedule' }}
      run: |
        set -ex
        IMAGE_TAG=rocm/aiter-ci:pre-build-${{ env.GITHUB_COMMIT_SHA }}
        mkdir -p dist
        docker run --rm \
          -v "${{ github.workspace }}/dist:/dist" \
          "$IMAGE_TAG" \
          bash -c "cp /aiter/dist/*.whl /dist/"
        echo "Extracted wheels:"
        ls -lh dist/*.whl

    - name: Upload wheel as artifact
      if: ${{ github.ref == 'refs/heads/main' && github.event_name != 'schedule' }}
      uses: actions/upload-artifact@v4
      with:
        name: aiter-whl-main-${{ github.run_id }}
        path: dist/*.whl

    - name: Configure AWS credentials
      if: ${{ github.ref == 'refs/heads/main' && github.event_name != 'schedule' }}
      uses: aws-actions/configure-aws-credentials@v4
      with:
        aws-region: us-east-1
        role-to-assume: arn:aws:iam::661452401056:role/framework-aiter-nightlies

    # Installs the AWS CLI only when the runner does not already provide it.
    - name: Install AWS CLI
      if: ${{ github.ref == 'refs/heads/main' && github.event_name != 'schedule' }}
      run: |
        if ! command -v aws &> /dev/null; then
          curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
          unzip -q awscliv2.zip
          sudo ./aws/install
          rm -rf awscliv2.zip aws
        fi

    - name: Upload wheels to S3
      if: ${{ github.ref == 'refs/heads/main' && github.event_name != 'schedule' }}
      run: |
        for WHL in dist/*.whl; do
          WHL_NAME=$(basename "${WHL}")
          echo "Uploading ${WHL_NAME} to S3..."
          aws s3 cp "${WHL}" "s3://framework-whls-nightlies/whl-staging/gfx942-gfx950/${WHL_NAME}"
        done
        echo "Wheels uploaded to S3 staging"
# Splits the Aiter test suite into 5 shard lists and publishes them as the
# `aiter_shards` artifact, which the `standard` job downloads.
split_aiter_tests:
  if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }}
  runs-on: ubuntu-latest
  needs: [check-signal, build_aiter_image]
  outputs:
    # Job outputs are strings; quoted so the value is not typed as an integer.
    # Must stay in sync with `--shards` below and the `standard` matrix.
    shard_count: '5'
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    # The uploaded glob below expects the script to leave
    # aiter_shard_<idx>.list files in the working directory.
    - name: Split Aiter Tests (5 shards)
      run: ./.github/scripts/split_tests.sh --shards 5 --test-type aiter

    - name: Upload test shard lists as artifact
      uses: actions/upload-artifact@v4
      with:
        name: aiter_shards
        path: aiter_shard_*.list
# Single-GPU operator tests: 5 shards on each of two runner pools (MI355 and
# MI325). Each matrix entry starts a long-lived `aiter_test` container with
# the workspace mounted at /workspace, runs its shard inside it, and always
# removes the container afterwards.
standard:
  if: >-
    (!github.event.pull_request || github.event.pull_request.draft == false) &&
    github.event.action != 'labeled'
  name: Standard Tests (1 GPU)
  needs: [build_aiter_image, split_aiter_tests]
  strategy:
    # Let every shard finish even if another one fails.
    fail-fast: false
    matrix:
      include:
        - runner: linux-aiter-mi355-1
          label: MI355
          shard_total: 5
          shard_idx: 0
        - runner: linux-aiter-mi355-1
          label: MI355
          shard_total: 5
          shard_idx: 1
        - runner: linux-aiter-mi355-1
          label: MI355
          shard_total: 5
          shard_idx: 2
        - runner: linux-aiter-mi355-1
          label: MI355
          shard_total: 5
          shard_idx: 3
        - runner: linux-aiter-mi355-1
          label: MI355
          shard_total: 5
          shard_idx: 4
        - runner: aiter-1gpu-runner
          label: MI325
          shard_total: 5
          shard_idx: 0
        - runner: aiter-1gpu-runner
          label: MI325
          shard_total: 5
          shard_idx: 1
        - runner: aiter-1gpu-runner
          label: MI325
          shard_total: 5
          shard_idx: 2
        - runner: aiter-1gpu-runner
          label: MI325
          shard_total: 5
          shard_idx: 3
        - runner: aiter-1gpu-runner
          label: MI325
          shard_total: 5
          shard_idx: 4
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ github.event.pull_request.head.sha || github.sha }}

    # Best effort (`|| true`): a failed login does not fail the job.
    # The password is piped to `--password-stdin` so it never appears on the
    # docker command line.
    - name: Docker login
      env:
        DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
      run: printf '%s' "$DOCKER_PASSWORD" | docker login -u rocmshared --password-stdin || true

    - name: Download test shard lists
      uses: actions/download-artifact@v4
      with:
        name: aiter_shards

    - name: List test shard files
      run: |
        ls -l aiter_shard_*.list

    # Overrides the workflow-level AITER_TEST for the remaining steps.
    # Values appended to GITHUB_ENV only become visible in *later* steps, so
    # the shard list is echoed from a local variable instead of $AITER_TEST
    # (which would still hold the workflow default here). The delimiter form
    # keeps the env file valid even if the list spans several lines.
    - name: Export test file list for this shard as env
      id: set_shard_files
      run: |
        SHARD_TESTS="$(cat "aiter_shard_${{ matrix.shard_idx }}.list")"
        {
          echo "AITER_TEST<<AITER_TEST_EOF"
          printf '%s\n' "$SHARD_TESTS"
          echo "AITER_TEST_EOF"
        } >> "$GITHUB_ENV"
        echo "$SHARD_TESTS"

    # Fork PRs have no pre-built image, so they start from the base image.
    # DEVICE_FLAG is expanded unquoted on purpose: it holds an option plus
    # its argument and must be word-split.
    - name: Run the container
      run: |
        set -ex
        echo "Starting container: aiter_test"
        if [ -f "/etc/podinfo/gha-render-devices" ]; then
          DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
        else
          DEVICE_FLAG="--device /dev/dri"
        fi
        if [ "${{ github.event.pull_request.head.repo.fork }}" = "true" ]; then
          IMAGE_TAG=${{ env.DOCKER_IMAGE }}
        else
          IMAGE_TAG=rocm/aiter-ci:pre-build-${{ env.GITHUB_COMMIT_SHA }}
        fi
        docker run -dt \
          --device=/dev/kfd $DEVICE_FLAG \
          --shm-size=16G \
          --network=host \
          --group-add $(getent group render | cut -d: -f3) \
          --group-add $(getent group video | cut -d: -f3) \
          -e AITER_TEST="${AITER_TEST}" \
          -v "${{ github.workspace }}:/workspace" \
          -w /workspace \
          --name aiter_test \
          "$IMAGE_TAG"

    # Fork PRs build Aiter inside the container from the mounted workspace.
    - name: Setup Aiter for fork PR
      if: ${{ github.event.pull_request.head.repo.fork }}
      run: |
        set -ex
        git submodule sync && git submodule update --init --recursive --depth 1 --jobs 4
        echo "Setting up Aiter for fork PR..."
        docker exec \
          -w /workspace \
          aiter_test \
          bash -c "BUILD_TRITON=0 ./.github/scripts/build_aiter_triton.sh"

    # Mirrors the submodule policy used when the image was built:
    # nightly runs follow CK `develop`, everything else uses the pinned commit.
    - name: Sync CK submodule
      if: ${{ !github.event.pull_request.head.repo.fork }}
      run: |
        set -ex
        if [ "${{ github.event_name }}" = "schedule" ]; then
          echo "Nightly build: syncing latest CK from develop branch..."
          git submodule set-branch --branch develop 3rdparty/composable_kernel
          git submodule sync
          git submodule update --init --recursive --remote --jobs 4
        else
          echo "Using pinned CK commit..."
          git submodule sync
          git submodule update --init --recursive --depth 1 --jobs 4
        fi

    - name: Show Aiter version
      run: |
        set -ex
        docker exec \
          -w /workspace \
          aiter_test \
          bash -c "pip show amd-aiter || true"

    # Fork PRs additionally set MAX_JOBS=64; otherwise both branches run the
    # same test script with the shard coordinates from the matrix.
    - name: Tests
      timeout-minutes: 90
      run: |
        set -ex
        if [ "${{ github.event.pull_request.head.repo.fork }}" = "true" ]; then
          docker exec \
            -w /workspace \
            aiter_test \
            bash -c "MAX_JOBS=64 SHARD_TOTAL=${{ matrix.shard_total }} SHARD_IDX=${{ matrix.shard_idx }} ./.github/scripts/aiter_test.sh"
        else
          docker exec \
            -w /workspace \
            aiter_test \
            bash -c "SHARD_TOTAL=${{ matrix.shard_total }} SHARD_IDX=${{ matrix.shard_idx }} ./.github/scripts/aiter_test.sh"
        fi

    - name: Collect test logs
      if: always()
      run: |
        echo "Collecting test logs..."
        echo "Aiter Operator Tests Summary:" >> $GITHUB_STEP_SUMMARY
        python3 ./.github/scripts/collect_logs.py latest_test.log >> $GITHUB_STEP_SUMMARY

    # Uploaded only on success: `standard-test-finish` treats the presence of
    # this artifact as the pass signal for the shard.
    - name: Upload test logs
      uses: actions/upload-artifact@v4
      if: success()
      with:
        name: standard-test-log-${{ matrix.runner }}-shard-${{ matrix.shard_idx }}
        path: latest_test.log

    - name: Cleanup container
      if: always()
      run: |
        docker rm -f aiter_test || true
# Aggregates the `standard` matrix: downloads every per-shard log artifact
# and fails if the log for any (runner, shard) pair is missing.
#
# NOTE(review): this `if` contains no status-check function, so GitHub
# implicitly requires the `needs` jobs to succeed; when a `standard` shard
# fails this job appears to be skipped rather than run and fail -- confirm
# that is the intended gating behaviour.
standard-test-finish:
  name: Standard Test Results
  if: >-
    !github.event.pull_request.draft &&
    github.event.action != 'labeled'
  needs:
    - standard
  runs-on: ubuntu-latest
  steps:
    # Each matching artifact is expected in its own directory named after
    # the artifact (see the paths checked below).
    - name: Download all test logs
      uses: actions/download-artifact@v4
      with:
        pattern: standard-test-log-*-shard-*
        path: .

    - name: List test logs
      run: |
        ls -l standard-test-log-*

    # The shard indices and runner names below must match the `standard`
    # matrix. On the first missing log for a shard the remaining runners of
    # that shard are not checked.
    - name: Check Standard Test Results
      run: |
        set -ex
        echo "Checking Standard Test Results..."
        all_passed=true
        for shard in 0 1 2 3 4; do
          for runner in linux-aiter-mi355-1 aiter-1gpu-runner; do
            if [ -f standard-test-log-${runner}-shard-${shard}/latest_test.log ]; then
              continue
            fi
            echo "Test report for ${runner} shard ${shard} not found."
            all_passed=false
            break
          done
        done
        if [ "$all_passed" != true ]; then
          echo "Test failures or errors detected."
          exit 1
        fi
        echo "All tests passed."
# 8-GPU tests, run only for refs/heads/main, once per runner pool (MI355 and
# MI325). Uses the same long-lived `aiter_test` container pattern as the
# single-GPU job and always cleans up the container and ROCm processes.
#
# NOTE(review): because this job only runs on refs/heads/main, the fork-PR
# branches below (base-image fallback, "Setup Aiter for fork PR") appear to
# be unreachable -- confirm before relying on them.
multi-gpu:
  name: Multi-GPU Tests (8 GPU)
  if: github.ref == 'refs/heads/main'
  needs: build_aiter_image
  strategy:
    # Let the other runner pool finish even if one fails.
    fail-fast: false
    matrix:
      include:
        - runner: linux-aiter-mi355-8
          label: MI355
        - runner: aiter-8gpu-runner
          label: MI325
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    # Best effort (`|| true`): a failed login does not fail the job.
    # The password is piped to `--password-stdin` so it never appears on the
    # docker command line.
    - name: Docker login
      env:
        DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
      run: printf '%s' "$DOCKER_PASSWORD" | docker login -u rocmshared --password-stdin || true

    # DEVICE_FLAG is expanded unquoted on purpose: it holds an option plus
    # its argument and must be word-split.
    - name: Run the container
      run: |
        set -ex
        echo "Starting container: aiter_test"
        if [ -f "/etc/podinfo/gha-render-devices" ]; then
          DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
        else
          DEVICE_FLAG="--device /dev/dri"
        fi
        if [ "${{ github.event.pull_request.head.repo.fork }}" = "true" ]; then
          IMAGE_TAG=${{ env.DOCKER_IMAGE }}
        else
          IMAGE_TAG=rocm/aiter-ci:pre-build-${{ env.GITHUB_COMMIT_SHA }}
        fi
        docker run -dt \
          --device=/dev/kfd $DEVICE_FLAG \
          --shm-size=16G \
          --network=host \
          --group-add $(getent group render | cut -d: -f3) \
          --group-add $(getent group video | cut -d: -f3) \
          -v "${{ github.workspace }}:/workspace" \
          -w /workspace \
          --name aiter_test \
          "$IMAGE_TAG"

    - name: Setup Aiter for fork PR
      if: ${{ github.event.pull_request.head.repo.fork }}
      run: |
        set -ex
        echo "Setting up Aiter for fork PR..."
        docker exec \
          -w /workspace \
          aiter_test \
          bash -c "BUILD_TRITON=0 ./.github/scripts/build_aiter_triton.sh"

    - name: Show Aiter version
      run: |
        set -ex
        docker exec \
          -w /workspace \
          aiter_test \
          bash -c "pip show amd-aiter || true"

    # MULTIGPU=TRUE is passed into the container for the shared test script.
    - name: Tests
      timeout-minutes: 60
      run: |
        set -ex
        docker exec \
          -e MULTIGPU=TRUE \
          -w /workspace \
          aiter_test \
          bash -c "./.github/scripts/aiter_test.sh"

    # Unlike the single-GPU job, logs are uploaded regardless of the outcome.
    - name: Upload test logs
      uses: actions/upload-artifact@v4
      if: always()
      with:
        name: multigpu-test-${{ matrix.runner }}
        path: latest_test.log

    - name: Cleanup container
      if: always()
      run: |
        docker rm -f aiter_test || true

    - name: Clean up Rocm processes
      if: always()
      run: |
        ./.github/scripts/clean_up_rocm.sh