Default to not checking for duplicates (#1431)

PGijsbers · web-flow · commit 0909980b65fb · 2025-06-20T13:38:08.000+02:00
diff --git a/openml/config.py b/openml/config.py
@@ -150,7 +150,7 @@ def _resolve_default_cache_dir() -> Path:
     "apikey": "",
     "server": "https://www.openml.org/api/v1/xml",
     "cachedir": _resolve_default_cache_dir(),
-    "avoid_duplicate_runs": True,
+    "avoid_duplicate_runs": False,
     "retry_policy": "human",
     "connection_n_retries": 5,
     "show_progress": False,
diff --git a/openml/runs/functions.py b/openml/runs/functions.py
@@ -59,7 +59,7 @@
 def run_model_on_task(  # noqa: PLR0913
     model: Any,
     task: int | str | OpenMLTask,
-    avoid_duplicate_runs: bool = True,  # noqa: FBT001, FBT002
+    avoid_duplicate_runs: bool | None = None,
     flow_tags: list[str] | None = None,
     seed: int | None = None,
     add_local_measures: bool = True,  # noqa: FBT001, FBT002
@@ -77,9 +77,10 @@ def run_model_on_task(  # noqa: PLR0913
     task : OpenMLTask or int or str
         Task to perform or Task id.
         This may be a model instead if the first argument is an OpenMLTask.
-    avoid_duplicate_runs : bool, optional (default=True)
+    avoid_duplicate_runs : bool, optional (default=None)
         If True, the run will throw an error if the setup/task combination is already present on
         the server. This feature requires an internet connection.
+        If None, the openml configuration value is used (False unless configured otherwise).
     flow_tags : List[str], optional (default=None)
         A list of tags that the flow should have at creation.
     seed: int, optional (default=None)
@@ -104,6 +105,8 @@ def run_model_on_task(  # noqa: PLR0913
     flow : OpenMLFlow (optional, only if `return_flow` is True).
         Flow generated from the model.
     """
+    if avoid_duplicate_runs is None:
+        avoid_duplicate_runs = openml.config.avoid_duplicate_runs
     if avoid_duplicate_runs and not config.apikey:
         warnings.warn(
             "avoid_duplicate_runs is set to True, but no API key is set. "
@@ -175,7 +178,7 @@ def get_task_and_type_conversion(_task: int | str | OpenMLTask) -> OpenMLTask:
 def run_flow_on_task(  # noqa: C901, PLR0912, PLR0915, PLR0913
     flow: OpenMLFlow,
     task: OpenMLTask,
-    avoid_duplicate_runs: bool = True,  # noqa: FBT002, FBT001
+    avoid_duplicate_runs: bool | None = None,
     flow_tags: list[str] | None = None,
     seed: int | None = None,
     add_local_measures: bool = True,  # noqa: FBT001, FBT002
@@ -195,9 +198,10 @@ def run_flow_on_task(  # noqa: C901, PLR0912, PLR0915, PLR0913
         all supervised estimators of scikit learn follow this definition of a model.
     task : OpenMLTask
         Task to perform. This may be an OpenMLFlow instead if the first argument is an OpenMLTask.
-    avoid_duplicate_runs : bool, optional (default=True)
+    avoid_duplicate_runs : bool, optional (default=None)
         If True, the run will throw an error if the setup/task combination is already present on
         the server. This feature requires an internet connection.
+        If None, the openml configuration value is used (False unless configured otherwise).
     flow_tags : List[str], optional (default=None)
         A list of tags that the flow should have at creation.
     seed: int, optional (default=None)
@@ -221,6 +225,9 @@ def run_flow_on_task(  # noqa: C901, PLR0912, PLR0915, PLR0913
     if flow_tags is not None and not isinstance(flow_tags, list):
         raise ValueError("flow_tags should be a list")
 
+    if avoid_duplicate_runs is None:
+        avoid_duplicate_runs = openml.config.avoid_duplicate_runs
+
     # TODO: At some point in the future do not allow for arguments in old order (changed 6-2018).
     # Flexibility currently still allowed due to code-snippet in OpenML100 paper (3-2019).
     if isinstance(flow, OpenMLTask) and isinstance(task, OpenMLFlow):
diff --git a/openml/testing.py b/openml/testing.py
@@ -101,7 +101,6 @@ def setUp(self, n_levels: int = 1, tmpdir_suffix: str = "") -> None:
         self.cached = True
         openml.config.apikey = TestBase.apikey
         self.production_server = "https://www.openml.org/api/v1/xml"
-        openml.config.avoid_duplicate_runs = False
         openml.config.set_root_cache_directory(str(self.workdir))
 
         # Increase the number of retries to avoid spurious server failures
diff --git a/tests/test_openml/test_config.py b/tests/test_openml/test_config.py
@@ -175,13 +175,14 @@ def test_configuration_file_not_overwritten_on_load():
 
 def test_configuration_loads_booleans(tmp_path):
     config_file_content = "avoid_duplicate_runs=true\nshow_progress=false"
-    with (tmp_path / "config").open("w") as config_file:
+    tmp_file = tmp_path / "config"
+    with tmp_file.open("w") as config_file:
         config_file.write(config_file_content)
-        read_config = openml.config._parse_config(tmp_path)
+    read_config = openml.config._parse_config(tmp_file)
 
     # Explicit test to avoid truthy/falsy modes of other types
-    assert True == read_config["avoid_duplicate_runs"]
-    assert False == read_config["show_progress"]
+    assert read_config["avoid_duplicate_runs"] is True
+    assert read_config["show_progress"] is False
 
 
 def test_openml_cache_dir_env_var(tmp_path: Path) -> None:
diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py
@@ -130,7 +130,6 @@ def test_to_from_filesystem_vanilla(self):
             model=model,
             task=task,
             add_local_measures=False,
-            avoid_duplicate_runs=False,
             upload_flow=True,
         )
 
@@ -174,7 +173,6 @@ def test_to_from_filesystem_search(self):
             model=model,
             task=task,
             add_local_measures=False,
-            avoid_duplicate_runs=False,
         )
 
         cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128)))
@@ -311,7 +309,6 @@ def test_publish_with_local_loaded_flow(self):
                 flow=flow,
                 task=task,
                 add_local_measures=False,
-                avoid_duplicate_runs=False,
                 upload_flow=False,
             )
 
@@ -351,7 +348,6 @@ def test_offline_and_online_run_identical(self):
                 flow=flow,
                 task=task,
                 add_local_measures=False,
-                avoid_duplicate_runs=False,
                 upload_flow=False,
             )
 
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
@@ -181,14 +181,12 @@ def _rerun_model_and_compare_predictions(self, run_id, model_prime, seed, create
             run_prime = openml.runs.run_model_on_task(
                 model=model_prime,
                 task=task,
-                avoid_duplicate_runs=False,
                 seed=seed,
             )
         else:
             run_prime = openml.runs.run_model_on_task(
                 model=model_prime,
                 task=run.task_id,
-                avoid_duplicate_runs=False,
                 seed=seed,
             )
 
@@ -278,7 +276,6 @@ def _remove_random_state(flow):
             flow=flow,
             task=task,
             seed=seed,
-            avoid_duplicate_runs=openml.config.avoid_duplicate_runs,
         )
         run_ = run.publish()
         TestBase._mark_entity_for_removal("run", run.run_id)
@@ -414,7 +411,6 @@ def test_run_regression_on_classif_task(self):
             openml.runs.run_model_on_task(
                 model=clf,
                 task=task,
-                avoid_duplicate_runs=False,
             )
 
     @pytest.mark.sklearn()
@@ -969,7 +965,6 @@ def test_initialize_cv_from_run(self):
         run = openml.runs.run_model_on_task(
             model=randomsearch,
             task=task,
-            avoid_duplicate_runs=False,
             seed=1,
         )
         run_ = run.publish()
@@ -1026,7 +1021,6 @@ def test_local_run_swapped_parameter_order_model(self):
         run = openml.runs.run_model_on_task(
             task,
             clf,
-            avoid_duplicate_runs=False,
             upload_flow=False,
         )
 
@@ -1055,7 +1049,6 @@ def test_local_run_swapped_parameter_order_flow(self):
         run = openml.runs.run_flow_on_task(
             task,
             flow,
-            avoid_duplicate_runs=False,
             upload_flow=False,
         )
 
@@ -1083,7 +1076,6 @@ def test_local_run_metric_score(self):
         run = openml.runs.run_model_on_task(
             model=clf,
             task=task,
-            avoid_duplicate_runs=False,
             upload_flow=False,
         )
 
@@ -1142,7 +1134,6 @@ def test_initialize_model_from_run(self):
         run = openml.runs.run_model_on_task(
             model=clf,
             task=task,
-            avoid_duplicate_runs=False,
         )
         run_ = run.publish()
         TestBase._mark_entity_for_removal("run", run_.run_id)
@@ -1251,7 +1242,6 @@ def test_run_with_illegal_flow_id_after_load(self):
         run = openml.runs.run_flow_on_task(
             task=task,
             flow=flow,
-            avoid_duplicate_runs=False,
             upload_flow=False,
         )
 
@@ -1316,7 +1306,6 @@ def test_run_with_illegal_flow_id_1_after_load(self):
         run = openml.runs.run_flow_on_task(
             task=task,
             flow=flow_new,
-            avoid_duplicate_runs=False,
             upload_flow=False,
         )
 
@@ -1664,7 +1653,6 @@ def test_run_flow_on_task_downloaded_flow(self):
         run = openml.runs.run_flow_on_task(
             flow=downloaded_flow,
             task=task,
-            avoid_duplicate_runs=False,
             upload_flow=False,
         )
 
@@ -1913,7 +1901,7 @@ def test_delete_run(self):
         task = openml.tasks.get_task(32)  # diabetes; crossvalidation
 
         run = openml.runs.run_model_on_task(
-            model=clf, task=task, seed=rs, avoid_duplicate_runs=False
+            model=clf, task=task, seed=rs,
         )
         run.publish()