Review note: the PSS/RSS explanation used to be a bare string statement under `if sys.platform == 'linux':`, so it
was not the docstring of anything. It is now a `#` comment above the platform switch, and each `_get_used_memory`
variant carries its own docstring. Added-line counts are unchanged, so all hunk headers stay valid; the post-image
blob hash on the first `index` line predates this edit.

diff --git a/src/crawlee/_utils/system.py b/src/crawlee/_utils/system.py
index 25fe021f20..c7d3de578d 100644
--- a/src/crawlee/_utils/system.py
+++ b/src/crawlee/_utils/system.py
@@ -1,10 +1,11 @@
 from __future__ import annotations
 
 import os
+import sys
 from contextlib import suppress
 from datetime import datetime, timezone
 from logging import getLogger
-from typing import Annotated, Any
+from typing import Annotated
 
 import psutil
 from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator
@@ -13,6 +14,24 @@
 
 logger = getLogger(__name__)
 
+# Select the most suitable available used memory metric once, at import time, based on the platform.
+#
+# `Proportional Set Size (PSS)` is the amount of own memory and memory shared with other processes, accounted in a
+# way that the shared amount is divided evenly between the processes that share it. Available on Linux. Suitable for
+# avoiding overestimation by counting the same shared memory used by children processes multiple times.
+# `Resident Set Size (RSS)` is the non-swapped physical memory a process has used; it includes shared memory. It
+# should be available everywhere.
+if sys.platform == 'linux':
+
+    def _get_used_memory(process: psutil.Process) -> int:
+        """Return the Proportional Set Size (PSS) of the process in bytes."""
+        return int(process.memory_full_info().pss)
+else:
+
+    def _get_used_memory(process: psutil.Process) -> int:
+        """Return the Resident Set Size (RSS) of the process in bytes."""
+        return int(process.memory_info().rss)
+
 
 class CpuInfo(BaseModel):
     """Information about the CPU usage."""
@@ -88,14 +107,14 @@ def get_memory_info() -> MemoryInfo:
     current_process = psutil.Process(os.getpid())
 
     # Retrieve estimated memory usage of the current process.
-    current_size_bytes = int(_get_used_memory(current_process.memory_full_info()))
+    current_size_bytes = _get_used_memory(current_process)
 
     # Sum memory usage by all children processes, try to exclude shared memory from the sum if allowed by OS.
     for child in current_process.children(recursive=True):
         # Ignore any NoSuchProcess exception that might occur if a child process ends before we retrieve
         # its memory usage.
         with suppress(psutil.NoSuchProcess):
-            current_size_bytes += _get_used_memory(child.memory_full_info())
+            current_size_bytes += _get_used_memory(child)
 
     vm = psutil.virtual_memory()
 
@@ -104,20 +123,3 @@ def get_memory_info() -> MemoryInfo:
         current_size=ByteSize(current_size_bytes),
         system_wide_used_size=ByteSize(vm.total - vm.available),
     )
-
-
-def _get_used_memory(memory_full_info: Any) -> int:
-    """Get the most suitable available used memory metric.
-
-    `Proportional Set Size (PSS)`, is the amount of own memory and memory shared with other processes, accounted in a
-    way that the shared amount is divided evenly between the processes that share it. Available on Linux. Suitable for
-    avoiding overestimation by counting the same shared memory used by children processes multiple times.
-
-    `Resident Set Size (RSS)` is the non-swapped physical memory a process has used; it includes shared memory. It
-    should be available everywhere.
-    """
-    try:
-        # Linux
-        return int(memory_full_info.pss)
-    except AttributeError:
-        return int(memory_full_info.rss)
diff --git a/tests/unit/_utils/test_system.py b/tests/unit/_utils/test_system.py
index f14172a105..4e147d9c80 100644
--- a/tests/unit/_utils/test_system.py
+++ b/tests/unit/_utils/test_system.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-import os
+import sys
 from multiprocessing import Barrier, Process, Value, synchronize
 from multiprocessing.shared_memory import SharedMemory
 from typing import TYPE_CHECKING
@@ -26,7 +26,7 @@ def test_get_cpu_info_returns_valid_values() -> None:
     assert 0 <= cpu_info.used_ratio <= 1
 
 
-@pytest.mark.skipif(os.name == 'nt', reason='Improved estimation not available on Windows')
+@pytest.mark.skipif(sys.platform != 'linux', reason='Improved estimation available only on Linux')
 def test_memory_estimation_does_not_overestimate_due_to_shared_memory() -> None:
     """Test that memory usage estimation is not overestimating memory usage by counting shared memory multiple times.
 