diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 7fa3450ca..7d5d48ac0 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -101,6 +101,23 @@ jobs:
         echo "BEFORE=$git_status" >> $GITHUB_ENV
         echo "Repository status before tests: $git_status"
 
+    - name: Clone Services  # shallow checkout of the openml/services repo into ./services
+      if: matrix.os == 'ubuntu-latest'  # same condition as the Docker start-up step that uses this checkout
+      run: |  # NOTE(review): no ref is given, so this tracks the default branch head; consider pinning
+        git clone --depth 1 https://github.com/openml/services.git
+
+    - name: Start Docker Services  # starts the compose stack, waits for health, then smoke-tests the gateway
+      if: matrix.os == 'ubuntu-latest'
+      working-directory: ./services  # the checkout created by the "Clone Services" step
+      run: |  # every command below must be able to fail the step; avoid pipes that hide exit codes
+        docker compose --profile rest-api --profile minio up -d
+
+        echo "Waiting for PHP API to boot..."
+        timeout 60s bash -c 'until [ "$(docker inspect -f {{.State.Health.Status}} openml-php-rest-api)" == "healthy" ]; do sleep 5; done'
+
+        echo "Final Verification: Gateway Connectivity..."
+        curl -sSfL -o /tmp/openml_gateway_check.xml http://localhost:8000/api/v1/xml/data/1 && head -n 15 /tmp/openml_gateway_check.xml  # no pipe: curl's exit status decides the step
+
     - name: Show installed dependencies
       run: python -m pip list
 
@@ -108,15 +125,16 @@ jobs:
       if: matrix.os == 'ubuntu-latest'
       env:
         OPENML_TEST_SERVER_ADMIN_KEY: ${{ secrets.OPENML_TEST_SERVER_ADMIN_KEY }}
+        OPENML_USE_LOCAL_SERVICES: "true"
       run: |
         if [ "${{ matrix.code-cov }}" = "true" ]; then
           codecov="--cov=openml --long --cov-report=xml"
         fi
 
         if [ "${{ matrix.sklearn-only }}" = "true" ]; then
-          marks="sklearn and not production_server and not test_server"
+          marks="sklearn and not production_server"
         else
-          marks="not production_server and not test_server"
+          marks="not production_server"
         fi
 
         pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks"
@@ -125,15 +143,16 @@ jobs:
       if: matrix.os == 'ubuntu-latest'
       env:
         OPENML_TEST_SERVER_ADMIN_KEY: ${{ secrets.OPENML_TEST_SERVER_ADMIN_KEY }}
+        OPENML_USE_LOCAL_SERVICES: "true"
       run: |
         if [ "${{ matrix.code-cov }}" = "true" ]; then
           codecov="--cov=openml --long --cov-report=xml"
         fi
 
         if [ "${{ matrix.sklearn-only }}" = "true" ]; then
-          marks="sklearn and production_server and not test_server"
+          marks="sklearn and production_server"
         else
-          marks="production_server and not test_server"
+          marks="production_server"
         fi
 
         pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks"
@@ -145,6 +164,20 @@ jobs:
       run: |  # we need a separate step because of the bash-specific if-statement in the previous one.
         pytest -n 4 --durations=20 --dist load -sv --reruns 5 --reruns-delay 1 -m "not test_server"
 
+    - name: Upload coverage  # sends coverage.xml to Codecov via codecov-action
+      if: matrix.code-cov && always()  # always(): still evaluated when an earlier step has failed
+      uses: codecov/codecov-action@v4
+      with:
+        files: coverage.xml  # presumably the report written by the pytest runs that pass --cov-report=xml
+        token: ${{ secrets.CODECOV_TOKEN }}
+        fail_ci_if_error: true  # an upload error fails the job instead of being ignored
+        verbose: true
+
+    - name: Cleanup Docker setup  # deletes the ./services checkout created by "Clone Services"
+      if: matrix.os == 'ubuntu-latest' && always()  # always(): runs even after failed steps
+      run: |  # NOTE(review): containers are not stopped here; presumably fine on an ephemeral runner, confirm
+        sudo rm -rf services  # presumably sudo because the containers leave root-owned files; TODO confirm
+
     - name: Check for files left behind by test
       if: matrix.os != 'windows-latest' && always()
       run: |
@@ -157,15 +190,6 @@ jobs:
             exit 1
         fi
 
-    - name: Upload coverage
-      if: matrix.code-cov && always()
-      uses: codecov/codecov-action@v4
-      with:
-        files: coverage.xml
-        token: ${{ secrets.CODECOV_TOKEN }}
-        fail_ci_if_error: true
-        verbose: true
-
   dummy_windows_py_sk024:
     name: (windows-latest, Py, sk0.24.*, sk-only:false)
     runs-on: ubuntu-latest
diff --git a/tests/conftest.py b/tests/conftest.py
index 2a7a6dcc7..d1bc23d4f 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -272,6 +272,8 @@ def as_robot() -> Iterator[None]:
 
 @pytest.fixture(autouse=True)
 def with_server(request):
+    if os.getenv("OPENML_USE_LOCAL_SERVICES") == "true":
+        openml.config.TEST_SERVER_URL = "http://localhost:8000"
     if "production_server" in request.keywords:
         openml.config.server = "https://www.openml.org/api/v1/xml"
         openml.config.apikey = None
@@ -306,4 +308,4 @@ def workdir(tmp_path):
     original_cwd = Path.cwd()
     os.chdir(tmp_path)
     yield tmp_path
-    os.chdir(original_cwd)
+    os.chdir(original_cwd)
\ No newline at end of file
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
index 151a9ac23..10517a3e1 100644
--- a/tests/test_datasets/test_dataset_functions.py
+++ b/tests/test_datasets/test_dataset_functions.py
@@ -530,6 +530,10 @@ def test_deletion_of_cache_dir_faulty_download(self, patch):
         datasets_cache_dir = os.path.join(openml.config.get_cache_directory(), "datasets")
         assert len(os.listdir(datasets_cache_dir)) == 0
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.test_server()
     def test_publish_dataset(self):
         arff_file_path = self.static_cache_dir / "org" / "openml" / "test" / "datasets" / "2" / "dataset.arff"
@@ -566,6 +570,10 @@ def test__retrieve_class_labels(self):
         labels = custom_ds.retrieve_class_labels(target_name=custom_ds.features[31].name)
         assert labels == ["COIL", "SHEET"]
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.test_server()
     def test_upload_dataset_with_url(self):
         dataset = OpenMLDataset(
@@ -689,6 +697,10 @@ def test_attributes_arff_from_df_unknown_dtype(self):
             with pytest.raises(ValueError, match=err_msg):
                 attributes_arff_from_df(df)
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.test_server()
     def test_create_dataset_numpy(self):
         data = np.array([[1, 2, 3], [1.2, 2.5, 3.8], [2, 5, 8], [0, 1, 0]]).T
@@ -723,6 +735,10 @@ def test_create_dataset_numpy(self):
         ), "Uploaded arff does not match original one"
         assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset"
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.test_server()
     def test_create_dataset_list(self):
         data = [
@@ -778,6 +794,10 @@ def test_create_dataset_list(self):
         ), "Uploaded ARFF does not match original one"
         assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset"
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.test_server()
     def test_create_dataset_sparse(self):
         # test the scipy.sparse.coo_matrix
@@ -926,6 +946,10 @@ def test_get_online_dataset_format(self):
             dataset_id
         ), "The format of the ARFF files is different"
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.test_server()
     def test_create_dataset_pandas(self):
         data = [
@@ -1151,6 +1175,10 @@ def test_ignore_attributes_dataset(self):
                 paper_url=paper_url,
             )
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.test_server()
     def test_publish_fetch_ignore_attribute(self):
         """Test to upload and retrieve dataset and check ignore_attributes"""
@@ -1270,6 +1298,10 @@ def test_create_dataset_row_id_attribute_error(self):
                 paper_url=paper_url,
             )
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.test_server()
     def test_create_dataset_row_id_attribute_inference(self):
         # meta-information
@@ -1438,6 +1470,10 @@ def test_data_edit_non_critical_field(self):
         edited_dataset = openml.datasets.get_dataset(did)
         assert edited_dataset.description == desc
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.test_server()
     def test_data_edit_critical_field(self):
         # Case 2
@@ -1490,6 +1526,10 @@ def test_data_edit_requires_valid_dataset(self):
             description="xor operation dataset",
         )
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.test_server()
     def test_data_edit_cannot_edit_critical_field_if_dataset_has_task(self):
         # Need to own a dataset to be able to edit meta-data
@@ -2008,4 +2048,4 @@ def test_get_dataset_parquet(requests_mock, test_files_directory):
     assert dataset._parquet_url is not None
     assert dataset.parquet_file is not None
     assert os.path.isfile(dataset.parquet_file)
-    assert dataset.data_file is None  # is alias for arff path
+    assert dataset.data_file is None  # is alias for arff path
\ No newline at end of file
diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py
index b942c0ab9..6f0de0a43 100644
--- a/tests/test_flows/test_flow.py
+++ b/tests/test_flows/test_flow.py
@@ -5,6 +5,7 @@
 import copy
 import hashlib
 import re
+import os
 import time
 from packaging.version import Version
 from unittest import mock
@@ -33,7 +34,6 @@
 from openml.testing import SimpleImputer, TestBase
 
 
-
 class TestFlow(TestBase):
     _multiprocess_can_split_ = True
 
@@ -180,6 +180,10 @@ def test_to_xml_from_xml(self):
         openml.flows.functions.assert_flows_equal(new_flow, flow)
         assert new_flow is not flow
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_publish_flow(self):
@@ -222,6 +226,10 @@ def test_publish_existing_flow(self, flow_exists_mock):
             f"collected from {__file__.split('/')[-1]}: {flow.flow_id}",
         )
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_publish_flow_with_similar_components(self):
@@ -273,6 +281,10 @@ def test_publish_flow_with_similar_components(self):
         TestBase._mark_entity_for_removal("flow", flow3.flow_id, flow3.name)
         TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow3.flow_id}")
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_semi_legal_flow(self):
@@ -383,6 +395,10 @@ def get_sentinel():
         flow_id = openml.flows.flow_exists(name, version)
         assert not flow_id
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_existing_flow_exists(self):
@@ -424,6 +440,10 @@ def test_existing_flow_exists(self):
             )
             assert downloaded_flow_id == flow.flow_id
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_sklearn_to_upload_to_flow(self):
diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py
index ce0d5e782..035fabe4a 100644
--- a/tests/test_flows/test_flow_functions.py
+++ b/tests/test_flows/test_flow_functions.py
@@ -12,6 +12,7 @@
 from unittest import mock
 from unittest.mock import patch
 
+import os
 import pandas as pd
 import pytest
 import requests
@@ -309,6 +310,10 @@ def test_get_flow1(self):
         flow = openml.flows.get_flow(1)
         assert flow.external_version is None
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_get_flow_reinstantiate_model(self):
@@ -392,6 +397,10 @@ def test_get_flow_reinstantiate_flow_not_strict_pre_023(self):
         assert flow.flow_id is None
         assert "sklearn==0.19.1" not in flow.dependencies
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_get_flow_id(self):
diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py
index 3f30f38ba..28d94d43a 100644
--- a/tests/test_openml/test_api_calls.py
+++ b/tests/test_openml/test_api_calls.py
@@ -7,6 +7,7 @@
 
 import minio
 import pytest
+import os
 
 import openml
 from openml.config import ConfigurationForExamples
@@ -20,6 +21,10 @@ def test_too_long_uri(self):
         with pytest.raises(openml.exceptions.OpenMLServerError, match="URI too long!"):
             openml.datasets.list_datasets(data_id=list(range(10000)))
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @unittest.mock.patch("time.sleep")
     @unittest.mock.patch("requests.Session")
     @pytest.mark.test_server()
diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py
index 17349fca8..92db1817e 100644
--- a/tests/test_runs/test_run.py
+++ b/tests/test_runs/test_run.py
@@ -118,6 +118,10 @@ def _check_array(array, type_):
         else:
             assert run_prime_trace_content is None
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_to_from_filesystem_vanilla(self):
@@ -153,6 +157,10 @@ def test_to_from_filesystem_vanilla(self):
             f"collected from {__file__.split('/')[-1]}: {run_prime.run_id}",
         )
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.flaky()
     @pytest.mark.test_server()
@@ -189,6 +197,10 @@ def test_to_from_filesystem_search(self):
             f"collected from {__file__.split('/')[-1]}: {run_prime.run_id}",
         )
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_to_from_filesystem_no_model(self):
@@ -295,6 +307,10 @@ def assert_run_prediction_data(task, run, model):
             assert_method(y_pred, saved_y_pred)
             assert_method(y_test, saved_y_test)
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_publish_with_local_loaded_flow(self):
@@ -339,6 +355,10 @@ def test_publish_with_local_loaded_flow(self):
             assert openml.flows.flow_exists(flow.name, flow.external_version)
             openml.runs.get_run(loaded_run.run_id)
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_offline_and_online_run_identical(self):
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index 9bc8d74fa..19cc1badf 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -397,6 +397,10 @@ def _check_sample_evaluations(
                                 assert evaluation > 0
                             assert evaluation < max_time_allowed
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_run_regression_on_classif_task(self):
@@ -414,6 +418,10 @@ def test_run_regression_on_classif_task(self):
                 task=task,
             )
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_check_erronous_sklearn_flow_fails(self):
@@ -627,6 +635,10 @@ def _run_and_upload_regression(
             sentinel=sentinel,
         )
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_run_and_upload_logistic_regression(self):
@@ -636,6 +648,10 @@ def test_run_and_upload_logistic_regression(self):
         n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"]
         self._run_and_upload_classification(lr, task_id, n_missing_vals, n_test_obs, "62501")
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_run_and_upload_linear_regression(self):
@@ -667,6 +683,10 @@ def test_run_and_upload_linear_regression(self):
         n_test_obs = self.TEST_SERVER_TASK_REGRESSION["n_test_obs"]
         self._run_and_upload_regression(lr, task_id, n_missing_vals, n_test_obs, "62501")
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_run_and_upload_pipeline_dummy_pipeline(self):
@@ -681,6 +701,10 @@ def test_run_and_upload_pipeline_dummy_pipeline(self):
         n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"]
         self._run_and_upload_classification(pipeline1, task_id, n_missing_vals, n_test_obs, "62501")
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @unittest.skipIf(
         Version(sklearn.__version__) < Version("0.20"),
@@ -798,6 +822,10 @@ def test_run_and_upload_knn_pipeline(self, warnings_mock):
                 call_count += 1
         assert call_count == 3
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_run_and_upload_gridsearch(self):
@@ -821,6 +849,10 @@ def test_run_and_upload_gridsearch(self):
         )
         assert len(run.trace.trace_iterations) == 9
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_run_and_upload_randomsearch(self):
@@ -854,6 +886,10 @@ def test_run_and_upload_randomsearch(self):
         trace = openml.runs.get_run_trace(run.run_id)
         assert len(trace.trace_iterations) == 5
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_run_and_upload_maskedarrays(self):
@@ -882,6 +918,10 @@ def test_run_and_upload_maskedarrays(self):
 
     ##########################################################################
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_learning_curve_task_1(self):
@@ -907,6 +947,10 @@ def test_learning_curve_task_1(self):
         )
         self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples)
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_learning_curve_task_2(self):
@@ -944,6 +988,10 @@ def test_learning_curve_task_2(self):
         )
         self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples)
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @unittest.skipIf(
         Version(sklearn.__version__) < Version("0.21"),
@@ -1023,6 +1071,10 @@ def _test_local_evaluations(self, run):
                 assert alt_scores[idx] >= 0
                 assert alt_scores[idx] <= 1
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_local_run_swapped_parameter_order_model(self):
@@ -1039,6 +1091,10 @@ def test_local_run_swapped_parameter_order_model(self):
 
         self._test_local_evaluations(run)
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @unittest.skipIf(
         Version(sklearn.__version__) < Version("0.20"),
@@ -1068,6 +1124,10 @@ def test_local_run_swapped_parameter_order_flow(self):
 
         self._test_local_evaluations(run)
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @unittest.skipIf(
         Version(sklearn.__version__) < Version("0.20"),
@@ -1106,6 +1166,10 @@ def test_online_run_metric_score(self):
 
         self._test_local_evaluations(run)
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @unittest.skipIf(
         Version(sklearn.__version__) < Version("0.20"),
@@ -1168,6 +1232,10 @@ def test_initialize_model_from_run(self):
         assert flowS.components["Imputer"].parameters["strategy"] == '"most_frequent"'
         assert flowS.components["VarianceThreshold"].parameters["threshold"] == "0.05"
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @unittest.skipIf(
         Version(sklearn.__version__) < Version("0.20"),
@@ -1228,6 +1296,10 @@ def test__run_exists(self):
             run_ids = run_exists(task.task_id, setup_exists)
             assert run_ids, (run_ids, clf)
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_run_with_illegal_flow_id(self):
@@ -1248,6 +1320,10 @@ def test_run_with_illegal_flow_id(self):
                 avoid_duplicate_runs=True,
             )
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_run_with_illegal_flow_id_after_load(self):
@@ -1306,6 +1382,10 @@ def test_run_with_illegal_flow_id_1(self):
                 avoid_duplicate_runs=True,
             )
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_run_with_illegal_flow_id_1_after_load(self):
@@ -1345,6 +1425,10 @@ def test_run_with_illegal_flow_id_1_after_load(self):
             loaded_run.publish,
         )
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @unittest.skipIf(
         Version(sklearn.__version__) < Version("0.20"),
@@ -1575,6 +1659,10 @@ def test_get_runs_list_by_tag(self):
         runs = openml.runs.list_runs(tag="curves", size=2)
         assert len(runs) >= 1
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @unittest.skipIf(
         Version(sklearn.__version__) < Version("0.20"),
@@ -1612,6 +1700,10 @@ def test_run_on_dataset_with_missing_labels_dataframe(self):
             # repeat, fold, row_id, 6 confidences, prediction and correct label
             assert len(row) == 12
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @unittest.skipIf(
         Version(sklearn.__version__) < Version("0.20"),
@@ -1666,6 +1758,10 @@ def test_get_uncached_run(self):
         with pytest.raises(openml.exceptions.OpenMLCacheException):
             openml.runs.functions._get_cached_run(10)
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_run_flow_on_task_downloaded_flow(self):
@@ -1767,7 +1863,10 @@ def test_format_prediction_task_regression(self):
         self.assertListEqual(res, [0] * 5)
 
 
-
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @unittest.skipIf(
         Version(sklearn.__version__) < Version("0.20"),
         reason="SimpleImputer doesn't handle mixed type DataFrame as input",
@@ -1865,6 +1964,10 @@ def test_delete_unknown_run(mock_delete, test_files_directory, test_api_key):
     assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
 
 
+@pytest.mark.skipif(
+    os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+    reason="Pending resolution of #1657",
+)
 @pytest.mark.sklearn()
 @unittest.skipIf(
     Version(sklearn.__version__) < Version("0.21"),
@@ -1947,6 +2050,10 @@ def test__run_task_get_arffcontent_2(parallel_mock):
     )
 
 
+@pytest.mark.skipif(
+    os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+    reason="Pending resolution of #1657",
+)
 @pytest.mark.sklearn()
 @unittest.skipIf(
     Version(sklearn.__version__) < Version("0.21"),
@@ -2016,6 +2123,7 @@ def test_joblib_backends(parallel_mock, n_jobs, backend, call_count):
         n_jobs=n_jobs,
     )
     from openml_sklearn import SklearnExtension
+
     extension = SklearnExtension()
     with parallel_backend(backend, n_jobs=n_jobs):
         res = openml.runs.functions._run_task_get_arffcontent(
@@ -2032,4 +2140,4 @@ def test_joblib_backends(parallel_mock, n_jobs, backend, call_count):
     # *_time_millis_* not recorded when n_jobs = -1
     assert len(res[2]["predictive_accuracy"][0]) == 10
     assert len(res[3]["predictive_accuracy"][0]) == 10
-    assert parallel_mock.call_count == call_count
+    assert parallel_mock.call_count == call_count
\ No newline at end of file
diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py
index 0df3a0b3b..3892b9104 100644
--- a/tests/test_setups/test_setup_functions.py
+++ b/tests/test_setups/test_setup_functions.py
@@ -4,7 +4,7 @@
 import hashlib
 import time
 import unittest.mock
-
+import os
 import pandas as pd
 import pytest
 import sklearn.base
@@ -34,6 +34,10 @@ def setUp(self):
         self.extension = SklearnExtension()
         super().setUp()
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_nonexisting_setup_exists(self):
@@ -82,6 +86,10 @@ def _existing_setup_exists(self, classif):
         setup_id = openml.setups.setup_exists(flow)
         assert setup_id == run.setup_id
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_existing_setup_exists_1(self):
@@ -98,12 +106,20 @@ def side_effect(self):
             nb = sklearn.naive_bayes.GaussianNB()
             self._existing_setup_exists(nb)
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_exisiting_setup_exists_2(self):
         # Check a flow with one hyperparameter
         self._existing_setup_exists(sklearn.naive_bayes.GaussianNB())
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.sklearn()
     @pytest.mark.test_server()
     def test_existing_setup_exists_3(self):
diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py
index df3c0a3b6..2ed61ec0f 100644
--- a/tests/test_tasks/test_task_functions.py
+++ b/tests/test_tasks/test_task_functions.py
@@ -167,6 +167,10 @@ def test_get_task(self):
             os.path.join(openml.config.get_cache_directory(), "datasets", "1", "dataset_1.pq")
         )
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.test_server()
     def test_get_task_lazy(self):
         task = openml.tasks.get_task(2, download_data=False)  # anneal; crossvalidation
@@ -224,6 +228,10 @@ def test_get_task_different_types(self):
         # Issue 538, get_task failing with clustering task.
         openml.tasks.functions.get_task(126033)
 
+    @pytest.mark.skipif(
+        os.getenv("OPENML_USE_LOCAL_SERVICES") == "true",
+        reason="Pending resolution of #1657",
+    )
     @pytest.mark.test_server()
     def test_download_split(self):
         task = openml.tasks.get_task(1)  # anneal; crossvalidation