Skip to content

Commit c3ed7db

Browse files
authored
[XPU] [CI] Fix xpu ci bug (#7014)
* fix xpu ci bug
* Remove unnecessary blank line in conftest.py
* Update upload-artifact action to version 6
* Update _xpu_8cards_case_test.yml
* fix ci bug
* Change exit code on test failure to 1
* fix ci bug
* fix ci bug
* fix ci bug
* fix ci bug
* Update conftest.py
1 parent a31d4bf commit c3ed7db

7 files changed

Lines changed: 77 additions & 4 deletions

File tree

.github/workflows/_xpu_4cards_case_test.yml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,13 +193,29 @@ jobs:
193193
echo "============================开始运行pytest测试============================"
194194
export PYTHONPATH=/workspace/FastDeploy/
195195
export PYTHONPATH=$(pwd)/tests/xpu_ci:$PYTHONPATH
196+
mkdir -p case_logs
197+
set +e
196198
python -m pytest -v -s --tb=short tests/xpu_ci/4cards_cases/
197199
exit_code=$?
200+
set -e
201+
202+
# 修改case_logs权限,确保Docker外部的runner用户可以读取并上传
203+
chmod -R a+rX case_logs/ 2>/dev/null || true
198204
199205
if [ $exit_code -eq 0 ]; then
200206
echo "============================4卡cases测试通过!============================"
207+
exit $exit_code
201208
else
202209
echo "============================4卡cases测试失败,请检查日志!============================"
203210
exit $exit_code
204211
fi
205212
'
213+
214+
- name: Upload case logs
215+
if: always()
216+
uses: actions/upload-artifact@v6
217+
with:
218+
name: xpu-4cards-case-logs
219+
path: FastDeploy/case_logs/
220+
retention-days: 7
221+
if-no-files-found: ignore

.github/workflows/_xpu_8cards_case_test.yml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,8 +182,14 @@ jobs:
182182
echo "============================开始运行pytest测试============================"
183183
export PYTHONPATH=/workspace/FastDeploy/
184184
export PYTHONPATH=$(pwd)/tests/xpu_ci:$PYTHONPATH
185+
mkdir -p case_logs
186+
set +e
185187
python -m pytest -v -s --tb=short tests/xpu_ci/8cards_cases/
186188
exit_code=$?
189+
set -e
190+
191+
# 修改case_logs权限,确保Docker外部的runner用户可以读取并上传
192+
chmod -R a+rX case_logs/ 2>/dev/null || true
187193
188194
if [ $exit_code -eq 0 ]; then
189195
echo "============================8卡cases测试通过!============================"
@@ -192,3 +198,12 @@ jobs:
192198
exit $exit_code
193199
fi
194200
'
201+
202+
- name: Upload case logs
203+
if: always()
204+
uses: actions/upload-artifact@v6
205+
with:
206+
name: xpu-8cards-case-logs
207+
path: FastDeploy/case_logs/
208+
retention-days: 7
209+
if-no-files-found: ignore

tests/xpu_ci/8cards_cases/test_pd_21b_ep4tp1.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ def print_pd_logs_on_failure():
109109
log_dirs = ["log_router", "log_prefill", "log_decode"]
110110

111111
for log_dir in log_dirs:
112-
nohup_path = os.path.join(log_dir, "log_0/worklog.0")
112+
nohup_path = os.path.join(log_dir, "log_0/workerlog.0")
113113
if os.path.exists(nohup_path):
114114
print(f"\n========== {nohup_path} ==========")
115115
with open(nohup_path, "r") as f:

tests/xpu_ci/8cards_cases/test_pd_21b_ep4tp4.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ def print_pd_logs_on_failure():
109109
log_dirs = ["log_router", "log_prefill", "log_decode"]
110110

111111
for log_dir in log_dirs:
112-
nohup_path = os.path.join(log_dir, "log_0/worklog.0")
112+
nohup_path = os.path.join(log_dir, "log_0/workerlog.0")
113113
if os.path.exists(nohup_path):
114114
print(f"\n========== {nohup_path} ==========")
115115
with open(nohup_path, "r") as f:

tests/xpu_ci/8cards_cases/test_pd_21b_ep4tp4_cudagraph.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ def print_pd_logs_on_failure():
109109
log_dirs = ["log_router", "log_prefill", "log_decode"]
110110

111111
for log_dir in log_dirs:
112-
nohup_path = os.path.join(log_dir, "log_0/worklog.0")
112+
nohup_path = os.path.join(log_dir, "log_0/workerlog.0")
113113
if os.path.exists(nohup_path):
114114
print(f"\n========== {nohup_path} ==========")
115115
with open(nohup_path, "r") as f:

tests/xpu_ci/8cards_cases/test_pd_p_tp4ep4_d_tp1ep4.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ def print_pd_logs_on_failure():
110110
log_dirs = ["log_router", "log_prefill", "log_decode"]
111111

112112
for log_dir in log_dirs:
113-
nohup_path = os.path.join(log_dir, "log_0/worklog.0")
113+
nohup_path = os.path.join(log_dir, "log_0/workerlog.0")
114114
if os.path.exists(nohup_path):
115115
print(f"\n========== {nohup_path} ==========")
116116
with open(nohup_path, "r") as f:

tests/xpu_ci/conftest.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
4. 环境配置 - 设置XPU相关环境变量
2424
"""
2525

26+
import glob
2627
import json
2728
import os
2829
import shutil
@@ -31,6 +32,8 @@
3132

3233
import pytest
3334

35+
CASE_LOGS_DIR = os.path.join(os.getcwd(), "case_logs")
36+
3437

3538
def get_xpu_id():
3639
"""获取XPU_ID环境变量"""
@@ -457,3 +460,42 @@ def setup_logprobs_zmq_env():
457460
os.environ[key] = value
458461
print(f"设置环境变量: {key}={value}")
459462
return original_values
463+
464+
465+
# ============ 日志归档 pytest hook ============
466+
467+
468+
def _archive_case_logs(test_name):
    """
    Copy the logs found in the current working directory into
    ``CASE_LOGS_DIR/{test_name}/``.

    Every entry whose name matches ``log*`` is archived: directories are
    copied recursively (merging into an already existing archive) and regular
    files are copied together with their metadata. ``server.log`` does not
    match that pattern, so it is copied explicitly.

    Args:
        test_name: Name of the sub-directory created under ``CASE_LOGS_DIR``.
    """
    dest_dir = os.path.join(CASE_LOGS_DIR, test_name)
    os.makedirs(dest_dir, exist_ok=True)

    # Archive every log* directory or file in the current working directory.
    for entry in glob.glob("log*"):
        target = os.path.join(dest_dir, entry)
        if os.path.isdir(entry):
            shutil.copytree(entry, target, dirs_exist_ok=True)
        elif os.path.isfile(entry):
            shutil.copy2(entry, target)

    # server.log is a key log but is never matched by the log* glob above.
    # Copy it on every call (overwriting a previous copy) so that it stays in
    # sync with the log* directories, which are also refreshed on every call.
    if os.path.isfile("server.log"):
        shutil.copy2("server.log", os.path.join(dest_dir, "server.log"))
486+
487+
488+
@pytest.hookimpl(hookwrapper=True, trylast=True)
def pytest_runtest_makereport(item, call):
    """
    Archive the case logs after the report of a test's ``call`` phase is built.

    The hook wraps report creation; reports of other phases are ignored.
    Archiving is best effort: a failure is printed but never propagated, so
    it cannot change the outcome of the test itself.

    Args:
        item: The test item being reported; its ``fspath`` provides the test
            file name used as the archive directory name.
        call: Call information supplied by pytest (unused here).
    """
    outcome = yield
    report = outcome.get_result()

    if report.when != "call":
        return

    # The test file name without its .py suffix names the archive directory.
    test_file = os.path.basename(item.fspath)
    test_name = os.path.splitext(test_file)[0]
    try:
        _archive_case_logs(test_name)
    except Exception as exc:
        # Keep archiving best effort, but leave a trace in the CI output so a
        # missing artifact can be diagnosed.
        print(f"[case_logs] failed to archive logs for {test_name}: {exc!r}")

0 commit comments

Comments
 (0)