Skip to content

Commit 5f77683

Browse files
committed
feat(cli): add lock-management surface (unlock subcommand, --lock-timeout, --break-lock)
Operators now have first-class, JSON-aware recovery for the `<.socket>/apply.lock` advisory lock used by mutating subcommands. - `socket-patch unlock` — probe lock state. Exits 0 when free, 1 when held. `--release` deletes a free leftover lock file; refuses when held (use --break-lock on the mutating subcommand for that scenario). - `--lock-timeout=<secs>` — on apply/rollback/repair/remove, waits up to N seconds for the lock to free before reporting `lock_held`. Plumbs through to the existing `acquire(dir, Duration)` API. - `--break-lock` — on apply/rollback/repair/remove, removes the lock file before acquisition. Records a `lock_broken` warning event in the JSON envelope (and `warnings[]` for rollback's ad-hoc shape) so the action is auditable. Stderr also notes it in human mode. Replaces the prior "rm <.socket>/apply.lock and retry" stderr hint with a pointer at the new tools. Adds unit + integration tests covering each new flag and subcommand against held / free / leftover-file scenarios. Assisted-by: Claude Code:claude-opus-4-7
1 parent 3c4857b commit 5f77683

13 files changed

Lines changed: 772 additions & 27 deletions

File tree

crates/socket-patch-cli/src/args.rs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,26 @@ pub struct GlobalArgs {
146146
)]
147147
pub yes: bool,
148148

149+
/// Seconds to wait for `<.socket>/apply.lock` before giving up.
150+
/// Default (`None`) and `0` both mean a single non-blocking try
151+
/// — failing immediately if another process holds the lock. A
152+
/// positive value retries with a 100 ms backoff until the lock
153+
/// frees or the budget elapses. Only meaningful for the mutating
154+
/// subcommands (`apply`, `rollback`, `repair`, `remove`); other
155+
/// commands accept it silently.
156+
#[arg(long = "lock-timeout", env = "SOCKET_LOCK_TIMEOUT")]
157+
pub lock_timeout: Option<u64>,
158+
159+
/// Force-remove `<.socket>/apply.lock` before attempting
160+
/// acquisition. Use when you are certain no other socket-patch
161+
/// process is running (e.g. a previous run crashed in a way that
162+
/// stripped the OS lock but left the file). Emits a
163+
/// `lock_broken` warning event in the JSON envelope so the
164+
/// action is auditable. Only meaningful for mutating
165+
/// subcommands; other commands accept it silently.
166+
#[arg(long = "break-lock", env = "SOCKET_BREAK_LOCK", default_value_t = false)]
167+
pub break_lock: bool,
168+
149169
/// Emit verbose debug logs to stderr.
150170
#[arg(long = "debug", env = "SOCKET_DEBUG", default_value_t = false)]
151171
pub debug: bool,
@@ -235,6 +255,8 @@ impl Default for GlobalArgs {
235255
silent: false,
236256
dry_run: false,
237257
yes: false,
258+
lock_timeout: None,
259+
break_lock: false,
238260
debug: false,
239261
no_telemetry: false,
240262
}

crates/socket-patch-cli/src/commands/apply.rs

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,12 @@ use socket_patch_core::patch::apply::{
1212
apply_package_patch, verify_file_patch, ApplyResult, PatchSources, VerifyStatus,
1313
};
1414

15-
use crate::commands::lock_cli::acquire_or_emit;
15+
use crate::commands::lock_cli::{acquire_or_emit, lock_broken_event};
1616
use socket_patch_core::utils::purl::strip_purl_qualifiers;
1717
use socket_patch_core::utils::telemetry::{track_patch_applied, track_patch_apply_failed};
1818
use std::collections::{HashMap, HashSet};
1919
use std::path::{Path, PathBuf};
20+
use std::time::Duration;
2021
use tempfile::TempDir;
2122

2223
use crate::args::{apply_env_toggles, GlobalArgs};
@@ -167,16 +168,20 @@ pub async fn run(args: ApplyArgs) -> i32 {
167168
// `.socket/` directory. The guard releases on function return; see
168169
// `socket_patch_core::patch::apply_lock`.
169170
let socket_dir = manifest_path.parent().unwrap_or(Path::new("."));
170-
let _lock = match acquire_or_emit(
171+
let acquired = match acquire_or_emit(
171172
socket_dir,
172173
Command::Apply,
173174
args.common.json,
174175
args.common.silent,
175176
args.common.dry_run,
177+
Duration::from_secs(args.common.lock_timeout.unwrap_or(0)),
178+
args.common.break_lock,
176179
) {
177-
Ok(guard) => guard,
180+
Ok(acquired) => acquired,
178181
Err(code) => return code,
179182
};
183+
let _lock = acquired.guard;
184+
let lock_was_broken = acquired.broke_lock;
180185

181186
// Package-manager layout detection. yarn-berry PnP keeps packages
182187
// inside `.yarn/cache/*.zip` and resolves them via `.pnp.cjs` —
@@ -237,6 +242,9 @@ pub async fn run(args: ApplyArgs) -> i32 {
237242
if args.common.json {
238243
let mut env = Envelope::new(Command::Apply);
239244
env.dry_run = args.common.dry_run;
245+
if lock_was_broken {
246+
env.record(lock_broken_event(socket_dir));
247+
}
240248
for result in &results {
241249
env.record(result_to_event(result, args.common.dry_run));
242250
// Sidecar records live on the envelope, not on

crates/socket-patch-cli/src/commands/lock_cli.rs

Lines changed: 238 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,31 @@ use std::time::Duration;
1616

1717
use socket_patch_core::patch::apply_lock::{acquire, LockError, LockGuard};
1818

19-
use crate::json_envelope::{Command, Envelope, EnvelopeError};
19+
use crate::json_envelope::{
20+
Command, Envelope, EnvelopeError, PatchAction, PatchEvent,
21+
};
22+
23+
/// Stable `errorCode` tag emitted as a `Skipped` warning event when
24+
/// `--break-lock` actually deletes a pre-existing lock file. Exposed
25+
/// for downstream consumers and integration tests that pattern-match
26+
/// on it.
27+
pub const LOCK_BROKEN_CODE: &str = "lock_broken";
28+
29+
/// Outcome of a successful lock acquisition. Callers attach a
30+
/// `lock_broken` event to their own envelope when [`broke_lock`] is
31+
/// true, so the audit trail follows the same conventions as the
32+
/// rest of the command's output.
33+
///
34+
/// [`broke_lock`]: LockAcquired::broke_lock
35+
#[derive(Debug)]
36+
pub struct LockAcquired {
37+
pub guard: LockGuard,
38+
/// True iff `--break-lock` was set AND the helper actually
39+
/// removed a pre-existing `apply.lock` file before acquiring.
40+
/// False when the file didn't exist (nothing to break) — the
41+
/// flag was a no-op in that case so no warning is warranted.
42+
pub broke_lock: bool,
43+
}
2044

2145
/// Try to acquire `<socket_dir>/apply.lock` and return the guard, or
2246
/// emit a failure envelope and a non-zero exit code.
@@ -26,23 +50,74 @@ use crate::json_envelope::{Command, Envelope, EnvelopeError};
2650
/// than a generic "lock failed". `dry_run` is plumbed through to the
2751
/// envelope's `dry_run` field for the (rare) case where lock
2852
/// contention happens during a dry-run apply.
53+
///
54+
/// `timeout = Duration::ZERO` keeps the historical non-blocking
55+
/// try-once shape. Positive values wait with a 100 ms backoff —
56+
/// see `socket_patch_core::patch::apply_lock::acquire`.
57+
///
58+
/// `break_lock = true` deletes `<socket_dir>/apply.lock` before the
59+
/// acquire attempt. The motivating case is a crashed prior run that
60+
/// left the file but no OS lock. When the file exists and is
61+
/// successfully removed the return value's `broke_lock` is true and
62+
/// the caller should attach a `lock_broken` warning event to their
63+
/// envelope.
2964
pub fn acquire_or_emit(
3065
socket_dir: &Path,
3166
command: Command,
3267
json: bool,
3368
silent: bool,
3469
dry_run: bool,
35-
) -> Result<LockGuard, i32> {
36-
match acquire(socket_dir, Duration::ZERO) {
37-
Ok(guard) => Ok(guard),
70+
timeout: Duration,
71+
break_lock: bool,
72+
) -> Result<LockAcquired, i32> {
73+
let mut broke_lock = false;
74+
if break_lock {
75+
let path = socket_dir.join("apply.lock");
76+
match std::fs::remove_file(&path) {
77+
Ok(()) => {
78+
broke_lock = true;
79+
if !silent && !json {
80+
eprintln!(
81+
"Warning: --break-lock removed {} before acquisition.",
82+
path.display()
83+
);
84+
}
85+
}
86+
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
87+
// No file to break — silently proceed to the normal
88+
// acquire path. Documented as a no-op so scripts can
89+
// pass --break-lock unconditionally on retry.
90+
}
91+
Err(source) => {
92+
let msg = format!(
93+
"failed to remove lock file at {}: {}",
94+
path.display(),
95+
source
96+
);
97+
emit(command, json, silent, dry_run, "lock_break_failed", &msg, None);
98+
return Err(1);
99+
}
100+
}
101+
}
102+
103+
match acquire(socket_dir, timeout) {
104+
Ok(guard) => Ok(LockAcquired { guard, broke_lock }),
38105
Err(LockError::Held) => {
106+
let msg = if timeout > Duration::ZERO {
107+
format!(
108+
"another socket-patch process is operating in this directory (waited {}s)",
109+
timeout.as_secs()
110+
)
111+
} else {
112+
"another socket-patch process is operating in this directory".to_string()
113+
};
39114
emit(
40115
command,
41116
json,
42117
silent,
43118
dry_run,
44119
"lock_held",
45-
"another socket-patch process is operating in this directory",
120+
&msg,
46121
Some(socket_dir),
47122
);
48123
Err(1)
@@ -55,6 +130,27 @@ pub fn acquire_or_emit(
55130
}
56131
}
57132

133+
/// Build the warning event that callers attach to their envelope
134+
/// when [`LockAcquired::broke_lock`] is true. Artifact-level (no
135+
/// PURL) since the action targets the `.socket/` directory itself,
136+
/// not a specific package.
137+
pub fn lock_broken_event(socket_dir: &Path) -> PatchEvent {
138+
PatchEvent::artifact(PatchAction::Skipped).with_reason(
139+
LOCK_BROKEN_CODE,
140+
format!(
141+
"--break-lock removed {}/apply.lock before acquisition",
142+
socket_dir.display()
143+
),
144+
)
145+
}
146+
147+
/// Convenience: record the `lock_broken` warning event on an
148+
/// envelope. Mirrors the inline pattern at each call site so we
149+
/// don't drift on the action / errorCode pair.
150+
pub fn record_lock_broken(env: &mut Envelope, socket_dir: &Path) {
151+
env.record(lock_broken_event(socket_dir));
152+
}
153+
58154
fn emit(
59155
command: Command,
60156
json: bool,
@@ -71,10 +167,9 @@ fn emit(
71167
println!("{}", env.to_pretty_json());
72168
} else if !silent {
73169
eprintln!("Error: {message}.");
74-
if let Some(dir) = hint_dir {
170+
if hint_dir.is_some() {
75171
eprintln!(
76-
" If you are sure no other process is running, remove {}/apply.lock and retry.",
77-
dir.display()
172+
" Run `socket-patch unlock` to inspect, or rerun with --break-lock if you're sure no holder exists."
78173
);
79174
}
80175
}
@@ -87,17 +182,43 @@ mod tests {
87182
#[test]
88183
fn acquire_or_emit_succeeds_on_fresh_dir() {
89184
let dir = tempfile::tempdir().unwrap();
90-
let guard = acquire_or_emit(dir.path(), Command::Apply, false, true, false).unwrap();
91-
drop(guard);
185+
let acquired = acquire_or_emit(
186+
dir.path(),
187+
Command::Apply,
188+
false,
189+
true,
190+
false,
191+
Duration::ZERO,
192+
false,
193+
)
194+
.unwrap();
195+
assert!(!acquired.broke_lock);
196+
drop(acquired.guard);
92197
}
93198

94199
#[test]
95200
fn acquire_or_emit_returns_one_on_contention() {
96201
let dir = tempfile::tempdir().unwrap();
97-
let _first =
98-
acquire_or_emit(dir.path(), Command::Apply, false, true, false).unwrap();
99-
let code =
100-
acquire_or_emit(dir.path(), Command::Apply, false, true, false).unwrap_err();
202+
let _first = acquire_or_emit(
203+
dir.path(),
204+
Command::Apply,
205+
false,
206+
true,
207+
false,
208+
Duration::ZERO,
209+
false,
210+
)
211+
.unwrap();
212+
let code = acquire_or_emit(
213+
dir.path(),
214+
Command::Apply,
215+
false,
216+
true,
217+
false,
218+
Duration::ZERO,
219+
false,
220+
)
221+
.unwrap_err();
101222
assert_eq!(code, 1);
102223
}
103224

@@ -110,8 +231,111 @@ mod tests {
110231
false,
111232
true,
112233
false,
234+
Duration::ZERO,
235+
false,
236+
)
237+
.unwrap_err();
238+
assert_eq!(code, 1);
239+
}
240+
241+
/// Positive timeout waits then errors `lock_held` — confirms the
242+
/// budget is plumbed through to `acquire`. Mirrors the
243+
/// `apply_lock::tests::timeout_held` shape so a regression in
244+
/// either layer surfaces here.
245+
#[test]
246+
fn acquire_or_emit_honors_lock_timeout() {
247+
let dir = tempfile::tempdir().unwrap();
248+
let _first = acquire_or_emit(
249+
dir.path(),
250+
Command::Apply,
251+
false,
252+
true,
253+
false,
254+
Duration::ZERO,
255+
false,
256+
)
257+
.unwrap();
258+
let start = std::time::Instant::now();
259+
let code = acquire_or_emit(
260+
dir.path(),
261+
Command::Apply,
262+
false,
263+
true,
264+
false,
265+
Duration::from_millis(250),
266+
false,
113267
)
114268
.unwrap_err();
269+
let elapsed = start.elapsed();
115270
assert_eq!(code, 1);
271+
assert!(
272+
elapsed >= Duration::from_millis(200),
273+
"expected at least 200ms wait, got {:?}",
274+
elapsed
275+
);
276+
}
277+
278+
/// `break_lock=true` against a pre-existing lock file with no
279+
/// holder removes the file and acquires fresh. `broke_lock` flag
280+
/// surfaces so callers can attach the warning event.
281+
#[test]
282+
fn acquire_or_emit_break_lock_removes_and_acquires() {
283+
let dir = tempfile::tempdir().unwrap();
284+
// Pre-stage a lock file with no holder — simulates the
285+
// post-crash leftover scenario.
286+
std::fs::write(dir.path().join("apply.lock"), b"").unwrap();
287+
288+
let acquired = acquire_or_emit(
289+
dir.path(),
290+
Command::Apply,
291+
false,
292+
true,
293+
false,
294+
Duration::ZERO,
295+
true,
296+
)
297+
.unwrap();
298+
assert!(
299+
acquired.broke_lock,
300+
"broke_lock should be true when a lock file existed and was removed"
301+
);
302+
// Lock file has been re-created by `acquire` and we hold it.
303+
assert!(dir.path().join("apply.lock").is_file());
304+
}
305+
306+
/// `break_lock=true` on a clean directory (no lock file) is a
307+
/// no-op for the warning surface — `broke_lock` stays false so
308+
/// callers don't emit a spurious event.
309+
#[test]
310+
fn acquire_or_emit_break_lock_is_noop_when_no_file() {
311+
let dir = tempfile::tempdir().unwrap();
312+
let acquired = acquire_or_emit(
313+
dir.path(),
314+
Command::Apply,
315+
false,
316+
true,
317+
false,
318+
Duration::ZERO,
319+
true,
320+
)
321+
.unwrap();
322+
assert!(
323+
!acquired.broke_lock,
324+
"broke_lock should be false when there was nothing to remove"
325+
);
326+
}
327+
328+
#[test]
329+
fn lock_broken_event_uses_documented_code() {
330+
let dir = tempfile::tempdir().unwrap();
331+
let event = lock_broken_event(dir.path());
332+
let v: serde_json::Value =
333+
serde_json::from_str(&serde_json::to_string(&event).unwrap()).unwrap();
334+
assert_eq!(v["action"], "skipped");
335+
assert_eq!(v["errorCode"], LOCK_BROKEN_CODE);
336+
assert!(
337+
v.as_object().unwrap().get("purl").is_none(),
338+
"lock_broken is an artifact-level event — no purl"
339+
);
116340
}
117341
}

0 commit comments

Comments
 (0)