test(telemetry/e2e): make TestTelemetryE2E deterministic + deflake retry tests under merge-queue load #362
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: E2E Tests and Code Coverage

# Trigger rationale:
#   * `pull_request` is the fast-feedback loop for the PR author.
#   * `merge_group` re-runs the identical suite on the merge queue's
#     transient branch (current main plus the queued PR diff, freshly
#     merged). This is the run that actually guards main.
#   * `workflow_dispatch` allows manual runs.
# `push:main` is deliberately absent: by the time it would fire the merge
# has already happened, so a failing coverage check could no longer block.
on:
  - pull_request
  - merge_group
  - workflow_dispatch

permissions:
  contents: read
  id-token: write
# Concurrency policy, by triggering event:
#   pull_request      One group per ref, with cancel-in-progress. A
#                     force-push or quick follow-up commit cancels the
#                     earlier run rather than racing it against shared
#                     warehouse state (Delta tables, UC Volume files,
#                     telemetry endpoints, etc.).
#   merge_group       One fixed, global group, so queue entries run
#                     strictly one at a time. The warehouse cannot
#                     tolerate two queue entries hammering the telemetry /
#                     retry paths in parallel: flaky retry-test failures
#                     (extra `/telemetry-ext` retries inflating
#                     mock.call_count) have been observed under that
#                     load. Serialising costs queue throughput (~17 min
#                     per entry) but keeps the signal trustworthy.
#                     cancel-in-progress is off, so every entry gets a
#                     complete run.
#   workflow_dispatch Shares the merge_group group; manual runs are rare
#                     enough that queueing behind the merge queue is fine.
concurrency:
  # Anything that is not a pull_request falls into the single serial group.
  group: >-
    ${{ github.event_name != 'pull_request' && 'e2e-mq-serial'
    || format('e2e-pr-{0}', github.ref) }}
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
jobs:
  # Single job: installs the project, runs the unit and e2e suites under
  # coverage, then enforces an 85% coverage floor unless the PR description
  # carries a SKIP_COVERAGE_CHECK marker.
  test-with-coverage:
    runs-on:
      group: databricks-protected-runner-group
      labels: linux-ubuntu-latest
    environment: azure-prod
    # Connection settings exported to every step of this job.
    env:
      DATABRICKS_SERVER_HOSTNAME: ${{ secrets.DATABRICKS_HOST }}
      DATABRICKS_HTTP_PATH: ${{ secrets.TEST_PECO_WAREHOUSE_HTTP_PATH }}
      DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
      DATABRICKS_CATALOG: peco
      DATABRICKS_USER: ${{ secrets.TEST_PECO_SP_ID }}
    steps:
      - name: Check out repository
        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4
        with:
          fetch-depth: 0

      - name: Install system dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y libkrb5-dev

      - name: Setup Poetry
        uses: ./.github/actions/setup-poetry
        with:
          python-version: "3.10"
          install-args: "--all-extras"

      # Produces coverage.xml (via --cov-report=xml), which the coverage
      # check step below reads.
      - name: Run all tests with coverage
        continue-on-error: false
        run: |
          poetry run pytest tests/unit tests/e2e \
            -n 4 \
            --dist=loadgroup \
            --cov=src \
            --cov-report=xml \
            --cov-report=term \
            -v

      # Looks for a `SKIP_COVERAGE_CHECK = <reason>` line in the PR
      # description and exposes `override` (true/false) and `reason` as
      # step outputs.
      - name: Check for coverage override
        id: override
        env:
          # PR_BODY is empty on `merge_group` (no pull_request payload).
          # That's intentional — coverage overrides are an author-time
          # escape hatch, not a queue-time bypass, so the queue run
          # always enforces the threshold regardless of the PR's
          # SKIP_COVERAGE_CHECK marker.
          PR_BODY: ${{ github.event.pull_request.body }}
        run: |
          # The PR body is untrusted, free-form text, so:
          #   - printf (not echo) keeps a body such as "-n" from being
          #     interpreted as an echo option;
          #   - tr strips carriage returns so CRLF bodies do not leave a
          #     trailing \r on the reason;
          #   - grep -m1 keeps only the first marker line, so the value
          #     written to GITHUB_OUTPUT is always a single line and extra
          #     marker lines cannot add or corrupt output entries.
          OVERRIDE_COMMENT=$(printf '%s\n' "$PR_BODY" | tr -d '\r' | grep -m1 -E "SKIP_COVERAGE_CHECK\s*=" || true)
          if [ -n "$OVERRIDE_COMMENT" ]; then
            echo "override=true" >> "$GITHUB_OUTPUT"
            REASON=$(printf '%s\n' "$OVERRIDE_COMMENT" | sed -E 's/.*SKIP_COVERAGE_CHECK\s*=\s*(.+)/\1/')
            echo "reason=$REASON" >> "$GITHUB_OUTPUT"
            echo "Coverage override found in PR description: $REASON"
          else
            echo "override=false" >> "$GITHUB_OUTPUT"
            echo "No coverage override found"
          fi

      # Fails the job when line coverage (lines-covered / lines-valid from
      # coverage.xml) is below 85%. Skipped when an override was found.
      - name: Check coverage percentage
        if: steps.override.outputs.override == 'false'
        run: |
          COVERAGE_FILE="coverage.xml"
          if [ ! -f "$COVERAGE_FILE" ]; then
            echo "ERROR: Coverage file not found at $COVERAGE_FILE"
            exit 1
          fi
          if ! command -v xmllint &> /dev/null; then
            sudo apt-get update && sudo apt-get install -y libxml2-utils
          fi
          COVERED=$(xmllint --xpath "string(//coverage/@lines-covered)" "$COVERAGE_FILE")
          TOTAL=$(xmllint --xpath "string(//coverage/@lines-valid)" "$COVERAGE_FILE")
          # Reject missing or non-numeric attributes and a zero total up
          # front, instead of letting Python die with a SyntaxError or
          # ZeroDivisionError.
          if ! [[ "$COVERED" =~ ^[0-9]+$ && "$TOTAL" =~ ^[1-9][0-9]*$ ]]; then
            echo "ERROR: Could not read line counts from $COVERAGE_FILE (lines-covered='$COVERED', lines-valid='$TOTAL')"
            exit 1
          fi
          # Values are passed as arguments rather than spliced into the
          # Python source.
          PERCENTAGE=$(python3 -c "import sys; print(round((int(sys.argv[1]) / int(sys.argv[2])) * 100, 2))" "$COVERED" "$TOTAL")
          echo "Line Coverage: $PERCENTAGE%"
          echo "Required Coverage: 85%"
          # Test the command directly in `if`: run scripts execute under
          # `bash -e`, so a bare failing command followed by a `$?` check
          # would abort the step before the error message could be printed.
          if python3 -c "import sys; sys.exit(0 if float(sys.argv[1]) >= 85 else 1)" "$PERCENTAGE"; then
            echo "SUCCESS: Coverage is $PERCENTAGE%, which meets the required 85%"
          else
            echo "ERROR: Coverage is $PERCENTAGE%, which is less than the required 85%"
            exit 1
          fi

      # Logs whether the threshold was enforced or bypassed for this run.
      - name: Coverage enforcement summary
        env:
          OVERRIDE: ${{ steps.override.outputs.override }}
          REASON: ${{ steps.override.outputs.reason }}
        run: |
          if [ "$OVERRIDE" == "true" ]; then
            echo "Coverage checks bypassed: $REASON"
            echo "Please ensure this override is justified and temporary"
          else
            echo "Coverage checks enforced - minimum 85% required"
          fi