Skip to content

Commit 4fdf254

Browse files
authored
Merge pull request #207 from seddonym/small-codebase-build-optimization
Don't parallelize scanning for small codebases
2 parents 30e36bc + 059a5d9 commit 4fdf254

2 files changed

Lines changed: 87 additions & 22 deletions

File tree

src/grimp/application/usecases.py

Lines changed: 58 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,15 @@
1515
from ..domain.valueobjects import DirectImport, Module
1616
from .config import settings
1717

18-
N_CPUS = multiprocessing.cpu_count()
19-
2018

2119
class NotSupplied:
2220
pass
2321

2422

23+
# This is an arbitrary number, but setting it too low slows down our functional tests considerably.
24+
MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING = 50
25+
26+
2527
def build_graph(
2628
package_name,
2729
*additional_package_names,
@@ -209,32 +211,66 @@ def _scan_imports(
209211
include_external_packages: bool,
210212
exclude_type_checking_imports: bool,
211213
) -> Dict[ModuleFile, Set[DirectImport]]:
212-
import_scanner: AbstractImportScanner = settings.IMPORT_SCANNER_CLASS(
213-
file_system=file_system,
214-
found_packages=found_packages,
215-
include_external_packages=include_external_packages,
214+
chunks = _create_chunks(module_files)
215+
return _scan_chunks(
216+
chunks,
217+
file_system,
218+
found_packages,
219+
include_external_packages,
220+
exclude_type_checking_imports,
216221
)
217222

218-
imports_by_module_file: Dict[ModuleFile, Set[DirectImport]] = {}
219223

220-
n_chunks = min(N_CPUS, len(module_files))
221-
chunks = _create_chunks(list(module_files), n_chunks=n_chunks)
222-
with multiprocessing.Pool(n_chunks) as pool:
223-
import_scanning_jobs = pool.starmap(
224-
_scan_chunk,
225-
[(import_scanner, exclude_type_checking_imports, chunk) for chunk in chunks],
226-
)
227-
for chunk_imports_by_module_file in import_scanning_jobs:
228-
imports_by_module_file.update(chunk_imports_by_module_file)
224+
def _create_chunks(module_files: Collection[ModuleFile]) -> tuple[tuple[ModuleFile, ...], ...]:
225+
"""
226+
Split the module files into chunks, each to be worked on by a separate OS process.
227+
"""
228+
module_files_tuple = tuple(module_files)
229229

230-
return imports_by_module_file
230+
number_of_module_files = len(module_files_tuple)
231+
n_chunks = _decide_number_of_of_processes(number_of_module_files)
232+
chunk_size = math.ceil(number_of_module_files / n_chunks)
233+
234+
return tuple(
235+
module_files_tuple[i * chunk_size : (i + 1) * chunk_size] for i in range(n_chunks)
236+
)
231237

232238

233-
def _create_chunks(
234-
module_files: Sequence[ModuleFile], *, n_chunks: int
235-
) -> Iterable[Iterable[ModuleFile]]:
236-
chunk_size = math.ceil(len(module_files) / n_chunks)
237-
return [module_files[i * chunk_size : (i + 1) * chunk_size] for i in range(n_chunks)]
239+
def _decide_number_of_of_processes(number_of_module_files: int) -> int:
240+
if number_of_module_files < MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING:
241+
# Don't incur the overhead of multiprocessing.
242+
return 1
243+
return min(multiprocessing.cpu_count(), number_of_module_files)
244+
245+
246+
def _scan_chunks(
247+
chunks: Collection[Collection[ModuleFile]],
248+
file_system: AbstractFileSystem,
249+
found_packages: Set[FoundPackage],
250+
include_external_packages: bool,
251+
exclude_type_checking_imports: bool,
252+
) -> Dict[ModuleFile, Set[DirectImport]]:
253+
import_scanner: AbstractImportScanner = settings.IMPORT_SCANNER_CLASS(
254+
file_system=file_system,
255+
found_packages=found_packages,
256+
include_external_packages=include_external_packages,
257+
)
258+
259+
number_of_processes = len(chunks)
260+
if number_of_processes == 1:
261+
# No need to spawn a process if there's only one chunk.
262+
[chunk] = chunks
263+
return _scan_chunk(import_scanner, exclude_type_checking_imports, chunk)
264+
else:
265+
with multiprocessing.Pool(number_of_processes) as pool:
266+
imports_by_module_file: Dict[ModuleFile, Set[DirectImport]] = {}
267+
import_scanning_jobs = pool.starmap(
268+
_scan_chunk,
269+
[(import_scanner, exclude_type_checking_imports, chunk) for chunk in chunks],
270+
)
271+
for chunk_imports_by_module_file in import_scanning_jobs:
272+
imports_by_module_file.update(chunk_imports_by_module_file)
273+
return imports_by_module_file
238274

239275

240276
def _scan_chunk(

tests/functional/test_build_and_use_graph.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
from grimp import build_graph
22
from typing import Set, Tuple, Optional
33
import pytest
4+
from unittest.mock import patch
5+
from grimp.application import usecases
46

57
"""
68
For ease of reference, these are the imports of all the files:
@@ -53,6 +55,33 @@ def test_modules():
5355
}
5456

5557

58+
@patch.object(usecases, "MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING", 0)
59+
def test_modules_multiprocessing():
60+
"""
61+
This test runs relatively slowly, but it's important we cover the multiprocessing code.
62+
"""
63+
graph = build_graph("testpackage", cache_dir=None)
64+
65+
assert graph.modules == {
66+
"testpackage",
67+
"testpackage.one",
68+
"testpackage.one.alpha",
69+
"testpackage.one.beta",
70+
"testpackage.one.gamma",
71+
"testpackage.one.delta",
72+
"testpackage.one.delta.blue",
73+
"testpackage.two",
74+
"testpackage.two.alpha",
75+
"testpackage.two.beta",
76+
"testpackage.two.gamma",
77+
"testpackage.utils",
78+
"testpackage.three",
79+
"testpackage.three.beta",
80+
"testpackage.three.gamma",
81+
"testpackage.three.alpha",
82+
}
83+
84+
5685
def test_add_module():
5786
graph = build_graph("testpackage", cache_dir=None)
5887
number_of_modules = len(graph.modules)

0 commit comments

Comments
 (0)