8 changes: 4 additions & 4 deletions src/scope/server/app.py

```diff
@@ -14,7 +14,6 @@
 from datetime import datetime
 from functools import wraps
 from importlib.metadata import version
-from logging.handlers import RotatingFileHandler
 from pathlib import Path
 from typing import TYPE_CHECKING
 
@@ -69,6 +68,7 @@
 from .logs_config import (
     LOG_FORMAT,
     FalConnectionFilter,
+    ResilientRotatingFileHandler,
     cleanup_old_logs,
     ensure_logs_dir,
     get_current_log_file,
@@ -157,12 +157,12 @@ def _configure_logging():
     for handler in root_logger.handlers:
         handler.addFilter(_fal_filter)
         if isinstance(handler, logging.StreamHandler) and not isinstance(
-            handler, RotatingFileHandler
+            handler, ResilientRotatingFileHandler
         ):
             handler.setLevel(logging.INFO)
 
-    # Add rotating file handler
-    file_handler = RotatingFileHandler(
+    # Add rotating file handler (resilient: auto-recreates if /tmp cleanup deletes the file)
+    file_handler = ResilientRotatingFileHandler(
         log_file,
         maxBytes=5 * 1024 * 1024,  # 5 MB per file
         backupCount=5,  # Keep 5 backup files
```
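One subtlety in the `isinstance` guard above: `RotatingFileHandler` is itself a `logging.StreamHandler` subclass (via `FileHandler`), so without the `not isinstance(...)` exclusion the INFO cap meant for console handlers would also be applied to the file handler. A minimal check, runnable as-is:

```python
import logging
from logging.handlers import RotatingFileHandler

# FileHandler subclasses StreamHandler, and RotatingFileHandler subclasses
# FileHandler, so a bare StreamHandler isinstance check matches file handlers too.
assert issubclass(RotatingFileHandler, logging.StreamHandler)
```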
2 changes: 1 addition & 1 deletion src/scope/server/cloud_connection.py

```diff
@@ -151,7 +151,7 @@ async def connect(
         self._session = aiohttp.ClientSession()
         try:
             self.ws = await asyncio.wait_for(
-                self._session.ws_connect(ws_url),
+                self._session.ws_connect(ws_url, heartbeat=30.0),
                 timeout=30.0,
             )
         except TimeoutError:
```
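The new `heartbeat=30.0` makes aiohttp send a WebSocket ping every 30 seconds and close the connection if the pong does not come back, so a half-open link surfaces as a closed socket instead of hanging silently. A minimal sketch of the same pattern (the URL and surrounding structure are placeholders, not the project's real connect flow):

```python
import asyncio

import aiohttp


async def connect_once(ws_url: str) -> None:
    async with aiohttp.ClientSession() as session:
        # heartbeat=30.0: ping every 30 s, close if no pong arrives;
        # wait_for bounds only the initial handshake, as in the diff above.
        ws = await asyncio.wait_for(
            session.ws_connect(ws_url, heartbeat=30.0),
            timeout=30.0,
        )
        async for msg in ws:
            if msg.type == aiohttp.WSMsgType.TEXT:
                print(msg.data)


# asyncio.run(connect_once("wss://example.invalid/ws"))  # placeholder URL
```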
60 changes: 60 additions & 0 deletions src/scope/server/logs_config.py

```diff
@@ -11,6 +11,7 @@
 import os
 import threading
 from datetime import datetime, timedelta
+from logging.handlers import RotatingFileHandler
 from pathlib import Path
 
 logger = logging.getLogger(__name__)
@@ -161,3 +162,62 @@ def cleanup_old_logs(max_age_days: int = 1) -> None:
         logger.info(
             f"Cleaned up {deleted_count} old log file(s) older than {max_age_days} day(s)"
         )
+
+
+class ResilientRotatingFileHandler(RotatingFileHandler):
+    """A RotatingFileHandler that recovers gracefully when the log file or its
+    parent directory is deleted mid-session (e.g. by OS /tmp cleanup on fal.ai
+    workers).
+
+    Standard behaviour: if ``shouldRollover()`` or ``emit()`` encounters a
+    ``FileNotFoundError`` the Python logging framework catches it and writes a
+    noisy ``--- Logging error ---`` traceback to *stderr* for every subsequent
+    log call.
+
+    This subclass intercepts ``FileNotFoundError`` at both sites, recreates the
+    log directory and reopens the stream, then retries the operation once. All
+    other exceptions are left to the default ``handleError`` path.
+    """
+
+    def _reopen_stream(self) -> None:
+        """Close the current stream (if any) and reopen the log file.
+
+        Recreates the parent directory first so the open cannot fail with
+        ``FileNotFoundError`` again.
+        """
+        if self.stream:
+            try:
+                self.stream.close()
+            except Exception:
+                pass
+            self.stream = None  # type: ignore[assignment]
+
+        Path(self.baseFilename).parent.mkdir(parents=True, exist_ok=True)
+        self.stream = self._open()
+
+    def shouldRollover(self, record: logging.LogRecord) -> int:
+        """Override to recover if the log file has been deleted."""
+        try:
+            return super().shouldRollover(record)
+        except FileNotFoundError:
+            # Log file (or its directory) was deleted; reopen before deciding.
+            try:
+                self._reopen_stream()
+            except Exception:
+                return 0  # Can't recover — skip rollover check.
+            try:
+                return super().shouldRollover(record)
+            except Exception:
+                return 0
+
+    def emit(self, record: logging.LogRecord) -> None:
+        """Override to recover if the log file has been deleted mid-session."""
+        try:
+            super().emit(record)
+        except FileNotFoundError:
+            # Directory or file was deleted; recreate and retry once.
+            try:
+                self._reopen_stream()
+                super().emit(record)
+            except Exception:
+                self.handleError(record)
```
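To see the recovery behaviour end to end, here is a small sketch that mirrors what the tests below do (the path is illustrative):

```python
import logging
import shutil
from pathlib import Path

from scope.server.logs_config import ResilientRotatingFileHandler

log_file = Path("/tmp/scope-demo-logs/app.log")  # illustrative path
log_file.parent.mkdir(parents=True, exist_ok=True)

handler = ResilientRotatingFileHandler(str(log_file), maxBytes=1024, backupCount=2)
logger = logging.getLogger("demo")
logger.propagate = False  # keep the demo output in the file only
logger.addHandler(handler)

logger.warning("before cleanup")  # written normally

# Simulate OS /tmp cleanup: close the stream (as a rollover would), then
# delete the whole log directory out from under the handler.
handler.stream.close()
handler.stream = None
shutil.rmtree(log_file.parent)

logger.warning("after cleanup")  # no "--- Logging error ---" noise on stderr
assert log_file.exists()  # directory and file were recreated
```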
143 changes: 143 additions & 0 deletions tests/test_logs_config.py

```diff
@@ -2,6 +2,7 @@
 
 import logging
 import os
+import shutil
 from datetime import datetime, timedelta
 from pathlib import Path
 from unittest.mock import patch
@@ -11,6 +12,7 @@
 
 from scope.server.logs_config import (
     LOGS_DIR_ENV_VAR,
+    ResilientRotatingFileHandler,
     cleanup_old_logs,
     get_most_recent_log_file,
 )
@@ -310,3 +312,144 @@ def test_mixed_old_and_new_files(self, temp_logs_dir):
         # New files should be kept
         assert new1.exists()
         assert new2.exists()
+
+
+class TestResilientRotatingFileHandler:
+    """Tests for ResilientRotatingFileHandler — the mid-session recovery handler."""
+
+    @pytest.fixture
+    def log_file(self, tmp_path):
+        """Return a path for a log file inside a temp directory."""
+        return tmp_path / "logs" / "test.log"
+
+    @pytest.fixture
+    def handler(self, log_file):
+        """Create a ResilientRotatingFileHandler and ensure it is closed after the test."""
+        log_file.parent.mkdir(parents=True, exist_ok=True)
+        h = ResilientRotatingFileHandler(str(log_file), maxBytes=1024, backupCount=2)
+        h.setFormatter(logging.Formatter("%(message)s"))
+        yield h
+        h.close()
+
+    # ------------------------------------------------------------------
+    # Normal operation
+    # ------------------------------------------------------------------
+
+    def test_emits_record_normally(self, handler, log_file):
+        """Normal emit should write to the log file."""
+        record = logging.LogRecord("test", logging.INFO, "", 0, "hello world", (), None)
+        handler.emit(record)
+        handler.flush()
+
+        assert "hello world" in log_file.read_text()
+
+    # ------------------------------------------------------------------
+    # Recovery after directory deletion (simulates fal.ai /tmp cleanup)
+    # ------------------------------------------------------------------
+
+    def test_emit_recovers_after_directory_deleted(self, handler, log_file):
+        """emit() should recover if the log directory is deleted after stream closes.
+
+        On Linux, unlinking a file while it is open keeps the FD valid — the
+        FileNotFoundError only appears when the stream is closed (e.g. after a
+        rollover) and Python then tries to reopen it. Simulate that by closing
+        the stream manually before deleting the directory.
+        """
+        # Write one record normally
+        record = logging.LogRecord("test", logging.INFO, "", 0, "first", (), None)
+        handler.emit(record)
+        handler.flush()
+
+        # Close the stream (as doRollover() would) then delete the directory
+        handler.stream.close()
+        handler.stream = None  # type: ignore[assignment]
+        shutil.rmtree(log_file.parent)
+        assert not log_file.parent.exists()
+
+        # emit() must not raise and must recreate the directory + file
+        record2 = logging.LogRecord("test", logging.INFO, "", 0, "second", (), None)
+        handler.emit(record2)
+        handler.flush()
+
+        assert log_file.exists()
+        assert "second" in log_file.read_text()
+
+    def test_emit_recovers_after_stream_closed_and_file_deleted(self, handler, log_file):
+        """emit() should recover if the stream is closed and the file is gone."""
+        record = logging.LogRecord("test", logging.INFO, "", 0, "first", (), None)
+        handler.emit(record)
+        handler.flush()
+
+        # Close the stream then delete the file (directory still present)
+        handler.stream.close()
+        handler.stream = None  # type: ignore[assignment]
+        log_file.unlink()
+        assert not log_file.exists()
+
+        record2 = logging.LogRecord("test", logging.INFO, "", 0, "second", (), None)
+        handler.emit(record2)
+        handler.flush()
+
+        assert log_file.exists()
+        assert "second" in log_file.read_text()
+
+    # ------------------------------------------------------------------
+    # shouldRollover recovery
+    # ------------------------------------------------------------------
+
+    def test_should_rollover_recovers_after_stream_closed_and_dir_deleted(
+        self, handler, log_file
+    ):
+        """shouldRollover() should recover without raising if the log dir is gone.
+
+        Mirrors the real failure path: stream closed by doRollover(), then
+        the directory vanishes before the next open attempt.
+        """
+        record = logging.LogRecord("test", logging.INFO, "", 0, "x" * 512, (), None)
+        handler.emit(record)
+        handler.flush()
+
+        # Close the stream then delete the directory
+        handler.stream.close()
+        handler.stream = None  # type: ignore[assignment]
+        shutil.rmtree(log_file.parent)
+
+        # shouldRollover must not raise
+        record2 = logging.LogRecord("test", logging.INFO, "", 0, "y", (), None)
+        result = handler.shouldRollover(record2)
+
+        # Result is an int (0 or 1 — either is acceptable after recovery)
+        assert isinstance(result, int)
+        # Directory should have been recreated by the recovery path
+        assert log_file.parent.exists()
+
+    # ------------------------------------------------------------------
+    # Irreversible errors still call handleError
+    # ------------------------------------------------------------------
+
+    def test_emit_calls_handle_error_on_unrecoverable_failure(self, handler, log_file):
+        """emit() should fall back to handleError if recovery itself fails."""
+        handle_error_called = []
+
+        def fake_handle_error(record):
+            handle_error_called.append(record)
+
+        handler.handleError = fake_handle_error
+
+        # Force emit to always raise FileNotFoundError (even after reopen)
+        original_open = handler._open
+
+        def always_fail():
+            raise FileNotFoundError("Cannot create file")
+
+        handler._open = always_fail
+
+        # Close the existing stream to force a reopen attempt
+        handler.stream.close()
+        handler.stream = None  # type: ignore[assignment]
+
+        record = logging.LogRecord("test", logging.INFO, "", 0, "msg", (), None)
+        handler.emit(record)  # Must not propagate an exception
+
+        assert len(handle_error_called) == 1
+        handler._open = original_open  # restore
```
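The new class can be exercised on its own with a standard pytest selection, e.g. `pytest tests/test_logs_config.py -k TestResilientRotatingFileHandler`.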