Skip to content

Commit 321b500

Browse files
bpiwowarclaude
andcommitted
feat: ProcessWebService, service log viewing, and keep services alive
- Add ProcessWebService for subprocess-based services with atexit cleanup - Add log viewing to global services tab (l key binding) - Show non-STOPPED services (not just RUNNING) in global services - Fix OR logic for log file existence checks in both service widgets - Add stdout/stderr/log_directory delegation to SSHLocalService and MockService - Stop SSH syncs on service failure in SSHLocalService - Set local log directory for SSHMockService services - Don't stop services on experiment exit (cleaned up on process exit) - Update demo submodule with ProcessWebService TensorBoard Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 6c3ca0b commit 321b500

7 files changed

Lines changed: 125 additions & 36 deletions

File tree

src/experimaestro/scheduler/experiment.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1015,11 +1015,6 @@ def __exit__(self, exc_type, exc_value, traceback):
10151015
if self._register_signals:
10161016
SIGNAL_HANDLER.remove(self)
10171017

1018-
# Stop services
1019-
for service in self.services.values():
1020-
logger.info("Closing service %s", service.description())
1021-
service.stop()
1022-
10231018
# Set end time for BaseExperiment interface
10241019
self._ended_at = datetime.now()
10251020

src/experimaestro/scheduler/remote/client.py

Lines changed: 38 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,18 @@ def state(self):
126126
def url(self) -> Optional[str]:
127127
return getattr(self._inner, "url", None)
128128

129+
@property
130+
def log_directory(self) -> Optional[Path]:
131+
return self._inner.log_directory
132+
133+
@property
134+
def stdout(self) -> Optional[Path]:
135+
return self._inner.stdout
136+
137+
@property
138+
def stderr(self) -> Optional[Path]:
139+
return self._inner.stderr
140+
129141
def description(self) -> str:
130142
return self._inner.description()
131143

@@ -163,8 +175,20 @@ def get_url(self) -> str:
163175
# Notify initial status
164176
self._notify_status_change()
165177

166-
# Start the inner service
167-
return self._inner.get_url()
178+
# Start the inner service; stop syncs if it fails
179+
try:
180+
url = self._inner.get_url()
181+
except Exception:
182+
self._stop_syncs()
183+
raise
184+
185+
# Check if service ended up in error state (e.g. process exited)
186+
from experimaestro.scheduler.services import ServiceState
187+
188+
if self._inner.state == ServiceState.ERROR:
189+
self._stop_syncs()
190+
191+
return url
168192

169193
@property
170194
def sync_status(self) -> Optional[str]:
@@ -189,16 +213,19 @@ def error(self) -> Optional[str]:
189213
"""Return error message if service failed to start."""
190214
return self._inner.error
191215

216+
def _stop_syncs(self) -> None:
217+
"""Stop all adaptive synchronizers."""
218+
for sync in self._synchronizers:
219+
sync.stop()
220+
self._synchronizers.clear()
221+
192222
def stop(self) -> None:
193223
"""Stop the inner service and stop all syncs."""
194224
# Stop the inner service first
195225
if hasattr(self._inner, "stop"):
196226
self._inner.stop()
197227

198-
# Stop all synchronizers
199-
for sync in self._synchronizers:
200-
sync.stop()
201-
self._synchronizers.clear()
228+
self._stop_syncs()
202229

203230

204231
class SSHMockService(MockService):
@@ -306,6 +333,11 @@ def path_translator(remote_path: str) -> Path:
306333
)
307334
inner_service.id = self.id
308335

336+
# Set local log directory for service output
337+
local_cache = self._state_provider.local_cache_dir
338+
if local_cache:
339+
inner_service.log_directory = local_cache / "services" / self.id
340+
309341
# Extract remote paths for sync management
310342
remote_paths = self._extract_paths()
311343

src/experimaestro/scheduler/services.py

Lines changed: 33 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import abc
2+
import atexit
23
from enum import Enum
34
import logging
45
import subprocess
@@ -57,6 +58,7 @@ def __init__(self):
5758
self._listeners: Set[ServiceListener] = set()
5859
self._listeners_lock = threading.Lock()
5960
self._error = None
61+
self.log_directory: Optional[Path] = None
6062

6163
def set_experiment(self, xp: "Experiment") -> None:
6264
"""Called when the service is added to an experiment.
@@ -68,10 +70,9 @@ def set_experiment(self, xp: "Experiment") -> None:
6870
xp: The experiment this service is being added to.
6971
"""
7072
self._experiment = xp
71-
72-
# Create log directory for this service
73-
if self.log_directory:
74-
self.log_directory.mkdir(parents=True, exist_ok=True)
73+
self.log_directory = (
74+
self._experiment.workspace.scheduler_services_path / self.id
75+
)
7576

7677
@property
7778
def experiment_id(self) -> str:
@@ -87,13 +88,6 @@ def run_id(self) -> str:
8788
return self._experiment.run_id or ""
8889
return ""
8990

90-
@property
91-
def log_directory(self) -> Optional[Path]:
92-
"""Return the directory for service logs (None if not attached to experiment)"""
93-
if self._experiment is None:
94-
return None
95-
return self._experiment.workspace.scheduler_services_path / self.id
96-
9791
@property
9892
def stdout(self) -> Optional[Path]:
9993
"""Return path to stdout log file"""
@@ -261,6 +255,10 @@ def setup_logging(
261255
if not self.stdout or not self.stderr:
262256
return None, None
263257

258+
# Ensure log directory exists
259+
if self.log_directory:
260+
self.log_directory.mkdir(parents=True, exist_ok=True)
261+
264262
# Get logger for this service
265263
service_logger = logging.getLogger(f"xpm.service.{self.id}")
266264
service_logger.setLevel(logging.INFO)
@@ -622,18 +620,27 @@ def _start_process(self):
622620
"""Start the service as a subprocess"""
623621
# Build command to run service
624622
cmd = self._build_command()
623+
logger.info(
624+
"Starting service %s (log_directory=%s): %s",
625+
self.id,
626+
self.log_directory,
627+
" ".join(cmd),
628+
)
625629

626-
# Redirect stdout/stderr to log files
630+
# Ensure log directory exists and redirect stdout/stderr to log files
631+
if self.log_directory:
632+
self.log_directory.mkdir(parents=True, exist_ok=True)
627633
stdout_file = open(self.stdout, "w") if self.stdout else subprocess.DEVNULL
628634
stderr_file = open(self.stderr, "w") if self.stderr else subprocess.DEVNULL
629635

630-
# Start process
636+
# Start process (ensure it gets killed when the parent exits)
631637
self.process = subprocess.Popen(
632638
cmd,
633639
stdout=stdout_file,
634640
stderr=stderr_file,
635641
cwd=str(self.log_directory) if self.log_directory else None,
636642
)
643+
atexit.register(self._kill_process)
637644

638645
# Monitor process in background thread
639646
monitor_thread = threading.Thread(
@@ -656,8 +663,9 @@ def _monitor_process(self):
656663
self.process.wait()
657664

658665
except Exception as e:
659-
logger.exception(f"Service {self.id} monitoring failed: {e}")
660-
self.state = ServiceState.ERROR
666+
logger.error("Service %s failed: %s", self.id, e)
667+
self.set_error(str(e))
668+
661669
finally:
662670
if running_event and not running_event.is_set():
663671
running_event.set()
@@ -701,6 +709,16 @@ def stop(self, timeout: float = 2.0):
701709
running_event.wait()
702710

703711
# Terminate process
712+
self._kill_process(timeout=timeout)
713+
atexit.unregister(self._kill_process)
714+
715+
with self._start_lock:
716+
self.url = None
717+
self._running_event = None
718+
self.state = ServiceState.STOPPED
719+
720+
def _kill_process(self, timeout: float = 2.0):
721+
"""Kill the subprocess if it is still running."""
704722
if self.process and self.process.poll() is None:
705723
logger.info(f"Terminating service {self.id} (PID {self.process.pid})")
706724
self.process.terminate()
@@ -710,8 +728,3 @@ def stop(self, timeout: float = 2.0):
710728
logger.warning(f"Service {self.id} did not terminate, killing")
711729
self.process.kill()
712730
self.process.wait()
713-
714-
with self._start_lock:
715-
self.url = None
716-
self._running_event = None
717-
self.state = ServiceState.STOPPED

src/experimaestro/scheduler/state_provider.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1776,6 +1776,24 @@ def description(self) -> str:
17761776
"""Return service description"""
17771777
return self._description
17781778

1779+
@property
1780+
def log_directory(self) -> Optional[Path]:
1781+
if self._live_service is not None:
1782+
return self._live_service.log_directory
1783+
return None
1784+
1785+
@property
1786+
def stdout(self) -> Optional[Path]:
1787+
if self._live_service is not None:
1788+
return self._live_service.stdout
1789+
return None
1790+
1791+
@property
1792+
def stderr(self) -> Optional[Path]:
1793+
if self._live_service is not None:
1794+
return self._live_service.stderr
1795+
return None
1796+
17791797
@property
17801798
def sync_status(self) -> Optional[str]:
17811799
"""Return sync status for display.

src/experimaestro/tui/widgets/global_services.py

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ class GlobalServiceSyncs(Vertical):
2323

2424
BINDINGS = [
2525
Binding("ctrl+k", "stop_service", "Stop Service"),
26+
Binding("l", "view_logs", "View Logs", priority=True),
2627
]
2728

2829
def __init__(self, state_provider: StateProvider) -> None:
@@ -70,11 +71,11 @@ def _load_services(self) -> None:
7071
running_services = [
7172
s
7273
for s in all_services
73-
if hasattr(s, "state") and s.state == ServiceState.RUNNING
74+
if hasattr(s, "state") and s.state != ServiceState.STOPPED
7475
]
7576
self.log.info(
7677
f"GlobalServiceSyncs._load_services: got {len(running_services)} "
77-
f"running services (out of {len(all_services)} total)"
78+
f"active services (out of {len(all_services)} total)"
7879
)
7980
except Exception as e:
8081
logger.warning(f"Failed to load services: {e}")
@@ -228,3 +229,31 @@ def action_stop_service(self) -> None:
228229
self.refresh_services()
229230
except Exception as e:
230231
self.notify(f"Failed to stop service: {e}", severity="error")
232+
233+
def action_view_logs(self) -> None:
234+
"""View service logs"""
235+
service = self._get_selected_service()
236+
if not service:
237+
return
238+
239+
# Convert to live service to get log paths
240+
live_service = service.to_service()
241+
242+
if not live_service.stdout and not live_service.stderr:
243+
self.notify("Service logs not available", severity="warning")
244+
return
245+
246+
stdout_exists = live_service.stdout and live_service.stdout.exists()
247+
stderr_exists = live_service.stderr and live_service.stderr.exists()
248+
if not stdout_exists and not stderr_exists:
249+
self.notify("No log files found", severity="warning")
250+
return
251+
252+
from experimaestro.tui.log_viewer import create_service_log_viewer
253+
254+
sync_func = None
255+
if self.state_provider.is_remote:
256+
sync_func = self.state_provider.sync_path
257+
258+
viewer = create_service_log_viewer(live_service, sync_func)
259+
self.app.push_screen(viewer)

src/experimaestro/tui/widgets/services.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -284,11 +284,13 @@ def action_view_logs(self) -> None:
284284
# Convert to live service to get log paths
285285
live_service = service.to_service()
286286

287-
if not live_service.stdout or not live_service.stderr:
287+
if not live_service.stdout and not live_service.stderr:
288288
self.notify("Service logs not available", severity="warning")
289289
return
290290

291-
if not live_service.stdout.exists() and not live_service.stderr.exists():
291+
stdout_exists = live_service.stdout and live_service.stdout.exists()
292+
stderr_exists = live_service.stderr and live_service.stderr.exists()
293+
if not stdout_exists and not stderr_exists:
292294
self.notify("No log files found", severity="warning")
293295
return
294296

0 commit comments

Comments
 (0)