Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 31 additions & 4 deletions agent_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,33 @@ def run_agent_eval(

results: List[Tuple[int, Dict[str, Any], str]] = []
tasks = list(range(len(dataset)))
tasks_to_run = tasks
if reuse:
tasks_to_run = []
for idx in tasks:
if do_eval:
eval_cached = store.load_eval(idx)
if eval_cached is not None:
cached_score = eval_cached.get("score", eval_cached)
cached_final = eval_cached.get("final_answer", "")
if not cached_final:
traj = store.load_traj(idx)
if traj is not None:
cached_final = traj.get("final_answer", "")
results.append((idx, cached_score, cached_final))
continue
tasks_to_run.append(idx)
continue

if do_infer:
traj = store.load_traj(idx)
if traj and traj.get("success"):
results.append((idx, {}, traj.get("final_answer", "")))
else:
tasks_to_run.append(idx)
else:
tasks_to_run.append(idx)

if nproc > 1:
with ThreadPoolExecutor(max_workers=nproc) as executor:
futures = [
Expand All @@ -128,15 +155,15 @@ def run_agent_eval(
do_infer,
do_eval,
)
for idx in tasks
for idx in tasks_to_run
]
with tqdm(total=len(tasks), desc="Agent Eval", unit="sample") as pbar:
with tqdm(total=len(tasks_to_run), desc="Agent Eval", unit="sample") as pbar:
for fut in as_completed(futures):
results.append(fut.result())
pbar.update(1)
else:
with tqdm(total=len(tasks), desc="Agent Eval", unit="sample") as pbar:
for idx in tasks:
with tqdm(total=len(tasks_to_run), desc="Agent Eval", unit="sample") as pbar:
for idx in tasks_to_run:
results.append(
_run_one_sample(
idx, agent, dataset, store, judge_kwargs, reuse, do_infer, do_eval
Expand Down
2 changes: 1 addition & 1 deletion scieval/agents/smolagents.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def __init__(
):
super().__init__(name=self.name, model_version=model_version)
self.api_key = api_key or os.environ.get("OPENAI_API_KEY", "")
self.api_base = api_base or os.environ.get("OPENAI_BASE_URL", "")
self.api_base = api_base or os.environ.get("OPENAI_API_BASE", "")
self.model_version = model_version or os.environ.get("MODEL_ID", "o3")

def run(self, sample: EvalSample) -> EvalResult:
Expand Down
Loading