Skip to content

Commit 3fd90a7

Browse files
authored
Merge pull request #210 from seddonym/joblib-multiprocessing
Use joblib for more robust parallel import scanning
2 parents 6b2dd86 + e2ac3c9 commit 3fd90a7

4 files changed

Lines changed: 41 additions & 30 deletions

File tree

CHANGELOG.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ Changelog
55
Unreleased
66
----------
77

8+
* Use joblib instead of multiprocessing for CPU parallelism. Fixes https://github.com/seddonym/grimp/issues/208.
9+
810
3.8 (2025-04-11)
911
----------------
1012

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ authors = [
1616
]
1717
requires-python = ">=3.9"
1818
dependencies = [
19+
"joblib>=1.3.0",
1920
"typing-extensions>=3.10.0.0",
2021
]
2122
classifiers = [

src/grimp/application/usecases.py

Lines changed: 15 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,10 @@
33
"""
44

55
from typing import Dict, Sequence, Set, Type, Union, cast, Iterable, Collection
6-
import multiprocessing
76
import math
87

8+
import joblib # type: ignore
9+
910
from ..application.ports import caching
1011
from ..application.ports.filesystem import AbstractFileSystem
1112
from ..application.ports.graph import ImportGraph
@@ -228,19 +229,19 @@ def _create_chunks(module_files: Collection[ModuleFile]) -> tuple[tuple[ModuleFi
228229
module_files_tuple = tuple(module_files)
229230

230231
number_of_module_files = len(module_files_tuple)
231-
n_chunks = _decide_number_of_of_processes(number_of_module_files)
232+
n_chunks = _decide_number_of_processes(number_of_module_files)
232233
chunk_size = math.ceil(number_of_module_files / n_chunks)
233234

234235
return tuple(
235236
module_files_tuple[i * chunk_size : (i + 1) * chunk_size] for i in range(n_chunks)
236237
)
237238

238239

239-
def _decide_number_of_of_processes(number_of_module_files: int) -> int:
240+
def _decide_number_of_processes(number_of_module_files: int) -> int:
240241
if number_of_module_files < MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING:
241-
# Don't incur the overhead of multiprocessing.
242+
# Don't incur the overhead of multiple processes.
242243
return 1
243-
return min(multiprocessing.cpu_count(), number_of_module_files)
244+
return min(joblib.cpu_count(), number_of_module_files)
244245

245246

246247
def _scan_chunks(
@@ -257,20 +258,15 @@ def _scan_chunks(
257258
)
258259

259260
number_of_processes = len(chunks)
260-
if number_of_processes == 1:
261-
# No need to spawn a process if there's only one chunk.
262-
[chunk] = chunks
263-
return _scan_chunk(import_scanner, exclude_type_checking_imports, chunk)
264-
else:
265-
with multiprocessing.Pool(number_of_processes) as pool:
266-
imports_by_module_file: Dict[ModuleFile, Set[DirectImport]] = {}
267-
import_scanning_jobs = pool.starmap(
268-
_scan_chunk,
269-
[(import_scanner, exclude_type_checking_imports, chunk) for chunk in chunks],
270-
)
271-
for chunk_imports_by_module_file in import_scanning_jobs:
272-
imports_by_module_file.update(chunk_imports_by_module_file)
273-
return imports_by_module_file
261+
import_scanning_jobs = joblib.Parallel(n_jobs=number_of_processes)(
262+
joblib.delayed(_scan_chunk)(import_scanner, exclude_type_checking_imports, chunk)
263+
for chunk in chunks
264+
)
265+
266+
imports_by_module_file = {}
267+
for chunk_imports_by_module_file in import_scanning_jobs:
268+
imports_by_module_file.update(chunk_imports_by_module_file)
269+
return imports_by_module_file
274270

275271

276272
def _scan_chunk(

tox.ini

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,20 @@ envlist =
44
check,
55
docs,
66
{py39,py310,py311,py312,py313},
7+
py313-joblib-earliest,
8+
9+
[base]
10+
deps =
11+
pytest==7.4.4
12+
pyyaml==6.0.1
13+
pytest-cov==5.0.0
14+
pytest-benchmark==4.0.0
15+
# External packages to attempt to build the graph from.
16+
Django==4.2.17 # N.B. Django 5 doesn't support Python 3.9.
17+
flask==3.0.3
18+
requests==2.32.3
19+
sqlalchemy==2.0.35
20+
google-cloud-audit-log==0.3.0
721

822
[testenv]
923
basepython =
@@ -12,6 +26,7 @@ basepython =
1226
py311: {env:TOXPYTHON:python3.11}
1327
py312: {env:TOXPYTHON:python3.12}
1428
py313: {env:TOXPYTHON:python3.13}
29+
py313-joblib-earliest: {env:TOXPYTHON:python3.13}
1530
{clean,check,docs,report}: {env:TOXPYTHON:python3}
1631
setenv =
1732
PYTHONPATH={toxinidir}/tests
@@ -20,19 +35,16 @@ passenv =
2035
*
2136
usedevelop = false
2237
deps =
23-
pytest==7.4.4
24-
pyyaml==6.0.1
25-
pytest-cov==5.0.0
26-
pytest-benchmark==4.0.0
27-
# External packages to attempt to build the graph from.
28-
Django==4.2.17 # N.B. Django 5 doesn't support Python 3.9.
29-
flask==3.0.3
30-
requests==2.32.3
31-
sqlalchemy==2.0.35
32-
google-cloud-audit-log==0.3.0
38+
{[base]deps}
39+
joblib==1.4.2
3340
commands =
3441
{posargs:pytest --cov --cov-report=term-missing --benchmark-skip -vv tests}
3542

43+
[testenv:py313-joblib-earliest]
44+
deps =
45+
{[base]deps}
46+
joblib==1.3.0
47+
3648
[testenv:check]
3749
basepython = py313
3850
deps =
@@ -107,4 +119,4 @@ python =
107119
3.10: py310, report
108120
3.11: py311, report
109121
3.12: py312, report
110-
3.13: py313, report, check, docs
122+
3.13: py313, py313-joblib-earliest, report, check, docs

0 commit comments

Comments
 (0)