From aefc47a460368b9fbc9ede361f1f6abfa1a72d1d Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Wed, 18 Mar 2026 20:40:40 +0000 Subject: [PATCH] Use separate in-memory types Serializing and deserializing raw JSON to the Crucible Agent's data file creates the potential for a versioning problem that would be hidden from the normal validation performed to make sure that our API surface doesn't change. Consider the following: - a normal update occurs that changes the Crucible Agent's version - that newer version's serialized data file is not compatible with the older version - some Bad Situation occurs and an operator decides to MUPdate back to the previous version In this scenario the older Crucible Agent will not be able to deserialize what the newer Agent serialized, adding to the existing Bad Situation. In a future when Nexus pays attention to Crucible health and there are Agents that cannot start, this would result in potentially stuck Upstairs and that may cascade to interrupt other operations while Nexus waits for repairs or reconciliations. This commit separates the in-memory types used in the DataFile struct from the types committed to disk (and adds a comment that warns against changing said type), and further distinguishes between Region states and RunningSnapshot states. This work is a prerequisite to getting the Agent to work on multiple read-only region clones concurrently as that work adds a new State, and internally discussing the Bad Situation scenario referenced earlier in this message lead to the inspiration to use separate in-memory types. Additionally, fix a bug where the `key_pem` field stored on disk would be output in the error message that would result from requesting a region that matches an existing one except for that field. Luckily no production system uses Crucible's support for X509 yet and this endpoint is not exposed to users. --- agent-types/src/region.rs | 113 +------ agent-types/src/snapshot.rs | 81 +---- agent/src/datafile.rs | 574 ++++++++++++++++++++++++++++++++---- agent/src/main.rs | 54 ++-- agent/src/resource.rs | 7 +- agent/src/server.rs | 20 +- 6 files changed, 580 insertions(+), 269 deletions(-) diff --git a/agent-types/src/region.rs b/agent-types/src/region.rs index f9040e5af..a9cec8e03 100644 --- a/agent-types/src/region.rs +++ b/agent-types/src/region.rs @@ -1,14 +1,10 @@ // Copyright 2025 Oxide Computer Company use std::net::SocketAddr; -use std::path::Path; -use crucible_smf::scf_type_t::*; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -use crate::smf::SmfProperty; - #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Serialize, Deserialize, JsonSchema, Debug, PartialEq, Clone)] #[serde(rename_all = "lowercase")] @@ -24,10 +20,12 @@ pub enum State { fn source_default() -> Option { None } + // If not provided, select false as the default for read only. fn read_only_default() -> bool { false } + #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Serialize, Deserialize, JsonSchema, Debug, PartialEq, Clone)] pub struct Region { @@ -59,65 +57,6 @@ pub struct Region { pub read_only: bool, } -impl Region { - /** - * Given a root directory, return a list of SMF properties to ensure for - * the corresponding running instance. - */ - pub fn get_smf_properties(&self, dir: &Path) -> Vec> { - let mut results = vec![ - SmfProperty { - name: "directory", - typ: SCF_TYPE_ASTRING, - val: dir.to_str().unwrap().to_string(), - }, - SmfProperty { - name: "port", - typ: SCF_TYPE_COUNT, - val: self.port_number.to_string(), - }, - ]; - - if self.cert_pem.is_some() { - let mut path = dir.to_path_buf(); - path.push("cert.pem"); - let path = path.into_os_string().into_string().unwrap(); - - results.push(SmfProperty { - name: "cert_pem_path", - typ: SCF_TYPE_ASTRING, - val: path, - }); - } - - if self.key_pem.is_some() { - let mut path = dir.to_path_buf(); - path.push("key.pem"); - let path = path.into_os_string().into_string().unwrap(); - - results.push(SmfProperty { - name: "key_pem_path", - typ: SCF_TYPE_ASTRING, - val: path, - }); - } - - if self.root_pem.is_some() { - let mut path = dir.to_path_buf(); - path.push("root.pem"); - let path = path.into_os_string().into_string().unwrap(); - - results.push(SmfProperty { - name: "root_pem_path", - typ: SCF_TYPE_ASTRING, - val: path, - }); - } - - results - } -} - #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Serialize, Deserialize, JsonSchema, Debug, PartialEq, Clone)] pub struct CreateRegion { @@ -138,54 +77,6 @@ pub struct CreateRegion { pub source: Option, } -impl CreateRegion { - pub fn mismatch(&self, r: &Region) -> Option { - if self.block_size != r.block_size { - Some(format!( - "block size {} instead of requested {}", - self.block_size, r.block_size - )) - } else if self.extent_size != r.extent_size { - Some(format!( - "extent size {} instead of requested {}", - self.extent_size, r.extent_size - )) - } else if self.extent_count != r.extent_count { - Some(format!( - "extent count {} instead of requested {}", - self.extent_count, r.extent_count - )) - } else if self.encrypted != r.encrypted { - Some(format!( - "encrypted {} instead of requested {}", - self.encrypted, r.encrypted - )) - } else if self.cert_pem != r.cert_pem { - Some(format!( - "cert_pem {:?} instead of requested {:?}", - self.cert_pem, r.cert_pem - )) - } else if self.key_pem != r.key_pem { - Some(format!( - "key_pem {:?} instead of requested {:?}", - self.key_pem, r.key_pem - )) - } else if self.root_pem != r.root_pem { - Some(format!( - "root_pem {:?} instead of requested {:?}", - self.root_pem, r.root_pem - )) - } else if self.source != r.source { - Some(format!( - "source {:?} instead of requested {:?}", - self.source, r.source - )) - } else { - None - } - } -} - #[allow(clippy::derive_partial_eq_without_eq)] #[derive( Serialize, diff --git a/agent-types/src/snapshot.rs b/agent-types/src/snapshot.rs index fa6c24b5a..4547f4495 100644 --- a/agent-types/src/snapshot.rs +++ b/agent-types/src/snapshot.rs @@ -1,16 +1,10 @@ // Copyright 2025 Oxide Computer Company -use std::path::Path; - use chrono::{DateTime, Utc}; -use crucible_smf::scf_type_t::*; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -use crate::{ - region::{RegionId, State}, - smf::SmfProperty, -}; +use crate::region::{RegionId, State}; #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Serialize, Deserialize, JsonSchema, Debug, PartialEq, Clone)] @@ -28,79 +22,6 @@ pub struct RunningSnapshot { pub state: State, } -impl RunningSnapshot { - /** - * Given a root directory, return a list of SMF properties to ensure for - * the corresponding running instance. - */ - pub fn get_smf_properties(&self, dir: &Path) -> Vec> { - let mut results = vec![ - SmfProperty { - name: "directory", - typ: SCF_TYPE_ASTRING, - val: dir.to_str().unwrap().to_string(), - }, - SmfProperty { - name: "port", - typ: SCF_TYPE_COUNT, - val: self.port_number.to_string(), - }, - SmfProperty { - name: "mode", - typ: SCF_TYPE_ASTRING, - val: "ro".to_string(), - }, - ]; - - // Test for X509 files in snapshot - note this means that running - // snapshots will use the X509 information in the snapshot, not a new - // set. - { - let mut path = dir.to_path_buf(); - path.push("cert.pem"); - let path = path.into_os_string().into_string().unwrap(); - - if Path::new(&path).exists() { - results.push(SmfProperty { - name: "cert_pem_path", - typ: SCF_TYPE_ASTRING, - val: path, - }); - } - } - - { - let mut path = dir.to_path_buf(); - path.push("key.pem"); - let path = path.into_os_string().into_string().unwrap(); - - if Path::new(&path).exists() { - results.push(SmfProperty { - name: "key_pem_path", - typ: SCF_TYPE_ASTRING, - val: path, - }); - } - } - - { - let mut path = dir.to_path_buf(); - path.push("root.pem"); - let path = path.into_os_string().into_string().unwrap(); - - if Path::new(&path).exists() { - results.push(SmfProperty { - name: "root_pem_path", - typ: SCF_TYPE_ASTRING, - val: path, - }); - } - } - - results - } -} - pub struct CreateRunningSnapshotRequest { pub id: RegionId, pub name: String, diff --git a/agent/src/datafile.rs b/agent/src/datafile.rs index d3bbb8d12..56bc86855 100644 --- a/agent/src/datafile.rs +++ b/agent/src/datafile.rs @@ -1,8 +1,14 @@ // Copyright 2021 Oxide Computer Company use anyhow::{Result, anyhow, bail}; -use crucible_agent_types::{region::*, snapshot::*}; +use crucible_agent_types::region::CreateRegion; +use crucible_agent_types::region::RegionId; +use crucible_agent_types::snapshot::CreateRunningSnapshotRequest; +use crucible_agent_types::snapshot::DeleteRunningSnapshotRequest; +use crucible_agent_types::snapshot::DeleteSnapshotRequest; +use crucible_agent_types::snapshot::Snapshot; use crucible_common::write_json; +use crucible_smf::scf_type_t::*; use serde::{Deserialize, Serialize}; use slog::{Logger, crit, error, info}; use std::collections::BTreeMap; @@ -14,6 +20,7 @@ use std::sync::{Arc, Condvar, Mutex, MutexGuard}; use crate::ZFSDataset; use crate::resource::Resource; use crate::snapshot_interface::SnapshotInterface; +use crucible_agent_types::smf::SmfProperty; pub struct DataFile { log: Logger, @@ -27,13 +34,461 @@ pub struct DataFile { snapshot_interface: Arc, } -#[derive(Serialize, Deserialize, Default)] +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Debug, PartialEq, Clone)] +pub enum RegionState { + Requested, + Created, + Tombstoned, + Destroyed, + Failed, +} + +impl From for crucible_agent_types::region::State { + fn from(s: RegionState) -> crucible_agent_types::region::State { + match s { + RegionState::Requested => { + crucible_agent_types::region::State::Requested + } + RegionState::Created => { + crucible_agent_types::region::State::Created + } + RegionState::Tombstoned => { + crucible_agent_types::region::State::Tombstoned + } + RegionState::Destroyed => { + crucible_agent_types::region::State::Destroyed + } + RegionState::Failed => crucible_agent_types::region::State::Failed, + } + } +} + +impl From for RegionState { + fn from(s: crucible_agent_types::region::State) -> RegionState { + match s { + crucible_agent_types::region::State::Requested => { + RegionState::Requested + } + crucible_agent_types::region::State::Created => { + RegionState::Created + } + crucible_agent_types::region::State::Tombstoned => { + RegionState::Tombstoned + } + crucible_agent_types::region::State::Destroyed => { + RegionState::Destroyed + } + crucible_agent_types::region::State::Failed => RegionState::Failed, + } + } +} + +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Debug, PartialEq, Clone)] +pub struct Region { + pub id: RegionId, + pub state: RegionState, + + // Creation parameters + pub block_size: u64, + pub extent_size: u64, + pub extent_count: u32, + pub encrypted: bool, + + // Run-time parameters + pub port_number: u16, + pub cert_pem: Option, + pub key_pem: Option, + pub root_pem: Option, + + // If this region was created as part of a clone. + pub source: Option, + + // If this region is read only + pub read_only: bool, +} + +impl From for crucible_agent_types::region::Region { + fn from(region: Region) -> crucible_agent_types::region::Region { + crucible_agent_types::region::Region { + id: region.id, + state: region.state.into(), + + block_size: region.block_size, + extent_size: region.extent_size, + extent_count: region.extent_count, + encrypted: region.encrypted, + + port_number: region.port_number, + cert_pem: region.cert_pem, + key_pem: region.key_pem, + root_pem: region.root_pem, + + source: region.source, + + read_only: region.read_only, + } + } +} + +impl From for Region { + fn from(region: crucible_agent_types::region::Region) -> Region { + Region { + id: region.id, + state: region.state.into(), + + block_size: region.block_size, + extent_size: region.extent_size, + extent_count: region.extent_count, + encrypted: region.encrypted, + + port_number: region.port_number, + cert_pem: region.cert_pem, + key_pem: region.key_pem, + root_pem: region.root_pem, + + source: region.source, + + read_only: region.read_only, + } + } +} + +impl Region { + /** + * Given a root directory, return a list of SMF properties to ensure for + * the corresponding running instance. + */ + pub fn get_smf_properties(&self, dir: &Path) -> Vec> { + let mut results = vec![ + SmfProperty { + name: "directory", + typ: SCF_TYPE_ASTRING, + val: dir.to_str().unwrap().to_string(), + }, + SmfProperty { + name: "port", + typ: SCF_TYPE_COUNT, + val: self.port_number.to_string(), + }, + ]; + + if self.cert_pem.is_some() { + let mut path = dir.to_path_buf(); + path.push("cert.pem"); + let path = path.into_os_string().into_string().unwrap(); + + results.push(SmfProperty { + name: "cert_pem_path", + typ: SCF_TYPE_ASTRING, + val: path, + }); + } + + if self.key_pem.is_some() { + let mut path = dir.to_path_buf(); + path.push("key.pem"); + let path = path.into_os_string().into_string().unwrap(); + + results.push(SmfProperty { + name: "key_pem_path", + typ: SCF_TYPE_ASTRING, + val: path, + }); + } + + if self.root_pem.is_some() { + let mut path = dir.to_path_buf(); + path.push("root.pem"); + let path = path.into_os_string().into_string().unwrap(); + + results.push(SmfProperty { + name: "root_pem_path", + typ: SCF_TYPE_ASTRING, + val: path, + }); + } + + results + } +} + +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Debug, PartialEq, Clone)] +pub enum RunningSnapshotState { + Requested, + Created, + Tombstoned, + Destroyed, + Failed, +} + +impl From for crucible_agent_types::region::State { + fn from(s: RunningSnapshotState) -> crucible_agent_types::region::State { + match s { + RunningSnapshotState::Requested => { + crucible_agent_types::region::State::Requested + } + RunningSnapshotState::Created => { + crucible_agent_types::region::State::Created + } + RunningSnapshotState::Tombstoned => { + crucible_agent_types::region::State::Tombstoned + } + RunningSnapshotState::Destroyed => { + crucible_agent_types::region::State::Destroyed + } + RunningSnapshotState::Failed => { + crucible_agent_types::region::State::Failed + } + } + } +} + +impl From for RunningSnapshotState { + fn from(s: crucible_agent_types::region::State) -> RunningSnapshotState { + match s { + crucible_agent_types::region::State::Requested => { + RunningSnapshotState::Requested + } + crucible_agent_types::region::State::Created => { + RunningSnapshotState::Created + } + crucible_agent_types::region::State::Tombstoned => { + RunningSnapshotState::Tombstoned + } + crucible_agent_types::region::State::Destroyed => { + RunningSnapshotState::Destroyed + } + crucible_agent_types::region::State::Failed => { + RunningSnapshotState::Failed + } + } + } +} + +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Debug, PartialEq, Clone)] +pub struct RunningSnapshot { + pub id: RegionId, + pub name: String, + pub port_number: u16, + pub state: RunningSnapshotState, +} + +impl From for crucible_agent_types::snapshot::RunningSnapshot { + fn from( + running_snapshot: RunningSnapshot, + ) -> crucible_agent_types::snapshot::RunningSnapshot { + Self { + id: running_snapshot.id, + name: running_snapshot.name, + port_number: running_snapshot.port_number, + state: running_snapshot.state.into(), + } + } +} + +impl From for RunningSnapshot { + fn from( + running_snapshot: crucible_agent_types::snapshot::RunningSnapshot, + ) -> Self { + Self { + id: running_snapshot.id, + name: running_snapshot.name, + port_number: running_snapshot.port_number, + state: running_snapshot.state.into(), + } + } +} + +impl RunningSnapshot { + /** + * Given a root directory, return a list of SMF properties to ensure for + * the corresponding running instance. + */ + pub fn get_smf_properties(&self, dir: &Path) -> Vec> { + let mut results = vec![ + SmfProperty { + name: "directory", + typ: SCF_TYPE_ASTRING, + val: dir.to_str().unwrap().to_string(), + }, + SmfProperty { + name: "port", + typ: SCF_TYPE_COUNT, + val: self.port_number.to_string(), + }, + SmfProperty { + name: "mode", + typ: SCF_TYPE_ASTRING, + val: "ro".to_string(), + }, + ]; + + // Test for X509 files in snapshot - note this means that running + // snapshots will use the X509 information in the snapshot, not a new + // set. + { + let mut path = dir.to_path_buf(); + path.push("cert.pem"); + let path = path.into_os_string().into_string().unwrap(); + + if Path::new(&path).exists() { + results.push(SmfProperty { + name: "cert_pem_path", + typ: SCF_TYPE_ASTRING, + val: path, + }); + } + } + + { + let mut path = dir.to_path_buf(); + path.push("key.pem"); + let path = path.into_os_string().into_string().unwrap(); + + if Path::new(&path).exists() { + results.push(SmfProperty { + name: "key_pem_path", + typ: SCF_TYPE_ASTRING, + val: path, + }); + } + } + + { + let mut path = dir.to_path_buf(); + path.push("root.pem"); + let path = path.into_os_string().into_string().unwrap(); + + if Path::new(&path).exists() { + results.push(SmfProperty { + name: "root_pem_path", + typ: SCF_TYPE_ASTRING, + val: path, + }); + } + } + + results + } +} + +/// A separate in-memory-only version of the deserialized data file. +#[derive(Clone)] struct Inner { regions: BTreeMap, + // indexed by region id and snapshot name running_snapshots: BTreeMap>, } +/// Serialized on-disk data file for the Crucible Agent. Be careful changing +/// this: an older version of the Agent will not be able to deserialize the file +/// anymore, or may interpret the fields or states differently if semantics +/// change. The types used in this function are from `crucible_agent_types`, +/// meaning the API versioning tooling will flag type changes (but not semantic +/// changes!). +#[derive(Serialize, Deserialize, Default)] +struct OnDiskDataFile { + regions: BTreeMap, + + // indexed by region id and snapshot name + running_snapshots: BTreeMap< + RegionId, + BTreeMap, + >, +} + +impl From for Inner { + fn from(on_disk: OnDiskDataFile) -> Inner { + Inner { + regions: on_disk + .regions + .into_iter() + .map(|(k, v)| (k, v.into())) + .collect(), + + running_snapshots: on_disk + .running_snapshots + .into_iter() + .map(|(k, v)| { + (k, v.into_iter().map(|(kk, vv)| (kk, vv.into())).collect()) + }) + .collect(), + } + } +} + +impl From for OnDiskDataFile { + fn from(inner: Inner) -> OnDiskDataFile { + OnDiskDataFile { + regions: inner + .regions + .into_iter() + .map(|(k, v)| (k, v.into())) + .collect(), + + running_snapshots: inner + .running_snapshots + .into_iter() + .map(|(k, v)| { + (k, v.into_iter().map(|(kk, vv)| (kk, vv.into())).collect()) + }) + .collect(), + } + } +} + +fn request_mismatch( + request: &CreateRegion, + r: &crate::datafile::Region, +) -> Option { + if request.block_size != r.block_size { + Some(format!( + "block size {} instead of requested {}", + request.block_size, r.block_size + )) + } else if request.extent_size != r.extent_size { + Some(format!( + "extent size {} instead of requested {}", + request.extent_size, r.extent_size + )) + } else if request.extent_count != r.extent_count { + Some(format!( + "extent count {} instead of requested {}", + request.extent_count, r.extent_count + )) + } else if request.encrypted != r.encrypted { + Some(format!( + "encrypted {} instead of requested {}", + request.encrypted, r.encrypted + )) + } else if request.cert_pem != r.cert_pem { + Some(format!( + "cert_pem {:?} instead of requested {:?}", + request.cert_pem, r.cert_pem + )) + } else if request.key_pem != r.key_pem { + // Do not output key_pem, leaking what is stored on disk! + Some(String::from("key_pem incorrect")) + } else if request.root_pem != r.root_pem { + Some(format!( + "root_pem {:?} instead of requested {:?}", + request.root_pem, r.root_pem + )) + } else if request.source != r.source { + Some(format!( + "source {:?} instead of requested {:?}", + request.source, r.source + )) + } else { + None + } +} + impl DataFile { pub fn new( log: Logger, @@ -50,13 +505,14 @@ impl DataFile { /* * Open data file, load contents. */ - let inner = match crucible_common::read_json_maybe(&conf_path) { - Ok(Some(inner)) => inner, - Ok(None) => Inner::default(), - Err(e) => { - bail!("failed to load data file {:?}: {:?}", conf_path, e); - } - }; + let inner: OnDiskDataFile = + match crucible_common::read_json_maybe(&conf_path) { + Ok(Some(inner)) => inner, + Ok(None) => OnDiskDataFile::default(), + Err(e) => { + bail!("failed to load data file {:?}: {:?}", conf_path, e); + } + }; Ok(DataFile { log, @@ -66,7 +522,7 @@ impl DataFile { port_min, port_max, bell: Condvar::new(), - inner: Mutex::new(inner), + inner: Mutex::new(inner.into()), snapshot_interface, }) } @@ -99,8 +555,9 @@ impl DataFile { * Store the database into the JSON file. */ fn store(&self, inner: MutexGuard) { + let on_disk_datafile: OnDiskDataFile = (*inner).clone().into(); loop { - match write_json(&self.conf_path, &*inner, true) { + match write_json(&self.conf_path, &on_disk_datafile, true) { Ok(()) => return, Err(e) => { /* @@ -130,7 +587,7 @@ impl DataFile { * for now, as they may still prevent use of * their assigned port number. */ - if region.state == State::Destroyed { + if region.state == RegionState::Destroyed { continue; } @@ -148,7 +605,8 @@ impl DataFile { inner.running_snapshots.values() { for running_snapshot in running_snapshot_regions.values() { - if running_snapshot.state == State::Destroyed { + if running_snapshot.state == RunningSnapshotState::Destroyed + { continue; } @@ -187,7 +645,7 @@ impl DataFile { * Look for a region with this ID. */ if let Some(r) = inner.regions.get(&create.id) { - if let Some(mis) = create.mismatch(r) { + if let Some(mis) = request_mismatch(&create, r) { bail!( "requested region {} already exists, with {}", create.id.0, @@ -211,7 +669,7 @@ impl DataFile { let r = Region { id: create.id.clone(), - state: State::Requested, + state: RegionState::Requested, block_size: create.block_size, extent_size: create.extent_size, @@ -300,7 +758,7 @@ impl DataFile { id: request.id.clone(), name: request.name.clone(), port_number, - state: State::Requested, + state: RunningSnapshotState::Requested, }; info!( @@ -377,7 +835,8 @@ impl DataFile { // snapshot state could be anything. match existing.state { - State::Tombstoned | State::Destroyed => { + RunningSnapshotState::Tombstoned + | RunningSnapshotState::Destroyed => { /* * Either: * - Destroy already scheduled. @@ -385,7 +844,8 @@ impl DataFile { */ } - State::Requested | State::Created => { + RunningSnapshotState::Requested + | RunningSnapshotState::Created => { info!( self.log, "removing running snapshot {}-{}", @@ -393,7 +853,7 @@ impl DataFile { request.name ); - existing.state = State::Tombstoned; + existing.state = RunningSnapshotState::Tombstoned; /* * Wake the worker thread to remove the snapshot we've @@ -404,7 +864,7 @@ impl DataFile { self.store(inner); } - State::Failed => { + RunningSnapshotState::Failed => { /* * For now, this terminal state will preserve evidence * for investigation. @@ -437,7 +897,9 @@ impl DataFile { && let Some(running_snapshot) = running_snapshots.get(&request.name) { match running_snapshot.state { - State::Requested | State::Created | State::Tombstoned => { + RunningSnapshotState::Requested + | RunningSnapshotState::Created + | RunningSnapshotState::Tombstoned => { bail!( "read-only downstairs running for region {} snapshot {}", request.id.0, @@ -445,11 +907,11 @@ impl DataFile { ); } - State::Destroyed => { + RunningSnapshotState::Destroyed => { // ok to delete } - State::Failed => { + RunningSnapshotState::Failed => { // Something has set the running snapshot to state // failed, so we can't delete this snapshot. bail!( @@ -476,24 +938,25 @@ impl DataFile { // Did the region exist in the past, and was it already deleted? if let Some(region) = inner.regions.get(&request.id) { match region.state { - State::Tombstoned | State::Destroyed => { + RegionState::Tombstoned | RegionState::Destroyed => { // If so, any snapshots must have been deleted // before the agent would allow the region to be // deleted. return Ok(()); } - State::Requested | State::Created => { + RegionState::Requested | RegionState::Created => { // This is a bug: according to the agent's datafile, // the region exists, but according to zfs list, it // does not bail!( - "Agent thinks region {} exists but zfs list does not! {e}", + "Agent thinks region {} exists but zfs list \ + does not! {e}", request.id.0 ); } - State::Failed => { + RegionState::Failed => { // Something has set the region to state failed, so // we can't delete this snapshot. bail!( @@ -505,7 +968,8 @@ impl DataFile { } else { // In here, the region never existed! bail!( - "Inside region {} snapshot {} delete, region never existed! {e}", + "Inside region {} snapshot {} delete, region never \ + existed! {e}", request.id.0, request.name ); @@ -527,7 +991,7 @@ impl DataFile { let mut inner = self.inner.lock().unwrap(); let r = inner.regions.get_mut(id).unwrap(); - let nstate = State::Failed; + let nstate = RegionState::Failed; if r.state == nstate { return; } @@ -554,7 +1018,7 @@ impl DataFile { .get_mut(snapshot_name) .unwrap(); - let nstate = State::Failed; + let nstate = RunningSnapshotState::Failed; if rs.state == nstate { return; } @@ -579,10 +1043,10 @@ impl DataFile { let mut inner = self.inner.lock().unwrap(); let r = inner.regions.get_mut(id).unwrap(); - let nstate = State::Created; + let nstate = RegionState::Created; match &r.state { - State::Requested => (), - State::Tombstoned => { + RegionState::Requested => (), + RegionState::Tombstoned => { /* * Nexus requested that we destroy this region before we * finished provisioning it. @@ -619,12 +1083,12 @@ impl DataFile { .get_mut(snapshot_name) .unwrap(); - let nstate = State::Created; + let nstate = RunningSnapshotState::Created; match &rs.state { - State::Requested => (), + RunningSnapshotState::Requested => (), - State::Tombstoned => { + RunningSnapshotState::Tombstoned => { /* * Something else set this to Tombstoned between when the SMF * was applied and before the state in the datafile changed! @@ -681,10 +1145,10 @@ impl DataFile { let mut inner = self.inner.lock().unwrap(); let r = inner.regions.get_mut(id).unwrap(); - let nstate = State::Destroyed; + let nstate = RegionState::Destroyed; match &r.state { - State::Requested => (), - State::Tombstoned => (), + RegionState::Requested => (), + RegionState::Tombstoned => (), x => bail!("region to destroy in weird state {:?}", x), } @@ -717,7 +1181,7 @@ impl DataFile { .get_mut(snapshot_name) .unwrap(); - let nstate = State::Destroyed; + let nstate = RunningSnapshotState::Destroyed; info!( self.log, @@ -745,14 +1209,16 @@ impl DataFile { .ok_or_else(|| anyhow!("region {} does not exist", id.0))?; match r.state { - State::Tombstoned | State::Destroyed => { + RegionState::Tombstoned | RegionState::Destroyed => { /* * Either: * - Destroy already scheduled. * - Already destroyed; no more work to do. */ } - State::Requested | State::Created | State::Failed => { + RegionState::Requested + | RegionState::Created + | RegionState::Failed => { /* * Schedule the destruction of this region. */ @@ -761,9 +1227,9 @@ impl DataFile { "region {} state: {:?} -> {:?}", r.id.0, r.state, - State::Tombstoned + RegionState::Tombstoned ); - r.state = State::Tombstoned; + r.state = RegionState::Tombstoned; self.bell.notify_all(); self.store(inner); } @@ -777,7 +1243,11 @@ impl DataFile { * particular state. If there are no resources in the provided state, * wait on the condition variable. */ - pub fn first_in_states(&self, states: &[State]) -> Resource { + pub fn first_in_states( + &self, + region_states: &[RegionState], + running_snapshot_states: &[RunningSnapshotState], + ) -> Resource { let mut inner = self.inner.lock().unwrap(); loop { @@ -788,13 +1258,15 @@ impl DataFile { * allows us to focus on destroying tombstoned * regions ahead of creating new regions. */ - for s in states { + for s in region_states { for r in inner.regions.values() { if &r.state == s { return Resource::Region(r.clone()); } } + } + for s in running_snapshot_states { for (rid, r) in &inner.running_snapshots { for (name, rs) in r { if &rs.state == s { @@ -832,17 +1304,17 @@ impl DataFile { let region = region.unwrap(); match region.state { - State::Requested - | State::Destroyed - | State::Tombstoned - | State::Failed => { + RegionState::Requested + | RegionState::Destroyed + | RegionState::Tombstoned + | RegionState::Failed => { // Either the region hasn't been created yet, or it has been // destroyed or marked to be destroyed (both of which require // that no snapshots exist). Return an empty list. return Ok(vec![]); } - State::Created => { + RegionState::Created => { // proceed to next section } } diff --git a/agent/src/main.rs b/agent/src/main.rs index 666a081d2..5130ac76b 100644 --- a/agent/src/main.rs +++ b/agent/src/main.rs @@ -43,7 +43,9 @@ mod server; mod smf_interface; mod snapshot_interface; -use crucible_agent_types::region::{self, State}; +use crate::datafile::Region; +use crate::datafile::RegionState; +use crate::datafile::RunningSnapshotState; use smf_interface::*; use crate::resource::Resource; @@ -384,7 +386,7 @@ where */ let expected_downstairs_instances = regions .iter() - .filter(|r| r.state == State::Created) + .filter(|r| r.state == RegionState::Created) .map(|r| format!("{}-{}", downstairs_prefix, r.id.0)) .collect::>(); @@ -392,7 +394,7 @@ where .iter() .flat_map(|(_, n)| { n.iter() - .filter(|(_, rs)| rs.state == State::Created) + .filter(|(_, rs)| rs.state == RunningSnapshotState::Created) .map(|(_, rs)| { format!("{}-{}-{}", snapshot_prefix, rs.id.0, rs.name) }) @@ -474,7 +476,7 @@ where for r in regions.iter() { // If the region is in state Created, then the dataset exists, so start // a downstairs that points to it. - if r.state != State::Created { + if r.state != RegionState::Created { continue; } @@ -642,7 +644,10 @@ where // that we have to take action on both Requested and Created for // running snapshots. This is in contrast to Region, which has // different actions for Requested and Created. - if !matches!(snapshot.state, State::Requested | State::Created) { + if !matches!( + snapshot.state, + RunningSnapshotState::Requested | RunningSnapshotState::Created + ) { continue; } @@ -835,7 +840,13 @@ fn worker( * which wraps either a Region or RegionSnapshot that has changed. * Otherwise, first_in_states will wait on the condvar. */ - let work = df.first_in_states(&[State::Tombstoned, State::Requested]); + let work = df.first_in_states( + &[RegionState::Tombstoned, RegionState::Requested], + &[ + RunningSnapshotState::Tombstoned, + RunningSnapshotState::Requested, + ], + ); match work { Resource::Region(r) => { @@ -847,7 +858,7 @@ fn worker( * then we finish up destroying the region. */ match &r.state { - State::Requested => 'requested: { + RegionState::Requested => 'requested: { /* * Compute the actual size required for a full region, * then add our metadata overhead to that. @@ -946,7 +957,7 @@ fn worker( } } - State::Tombstoned => 'tombstoned: { + RegionState::Tombstoned => 'tombstoned: { info!(log, "applying SMF actions before removal..."); let result = apply_smf( &log, @@ -990,6 +1001,7 @@ fn worker( df.fail(&r.id); } } + _ => { error!( log, @@ -1040,11 +1052,11 @@ fn worker( // `apply_smf` returned Ok, so the desired state transition // succeeded: update the datafile. let res = match &rs.state { - State::Requested => { + RunningSnapshotState::Requested => { df.created_rs(®ion_id, &snapshot_name) } - State::Tombstoned => { + RunningSnapshotState::Tombstoned => { df.destroyed_rs(®ion_id, &snapshot_name) } @@ -1077,7 +1089,7 @@ fn worker( fn worker_region_create( log: &Logger, prog: &Path, - region: ®ion::Region, + region: &Region, dir: &Path, ) -> Result<()> { let log = log.new(o!("region" => region.id.0.to_string())); @@ -1171,7 +1183,7 @@ fn worker_region_create( fn worker_region_destroy( log: &Logger, - region: ®ion::Region, + region: &Region, region_dataset: ZFSDataset, ) -> Result<()> { let log = log.new(o!("region" => region.id.0.to_string())); @@ -1855,7 +1867,7 @@ mod test { running_snapshots.get(®ion_id).unwrap(); let running_snapshot = region_running_snapshots.get(&snapshot_name).unwrap(); - assert_eq!(running_snapshot.state, State::Requested); + assert_eq!(running_snapshot.state, RunningSnapshotState::Requested); } // Say a snapshot_delete saga runs now. Before the worker can call @@ -1875,7 +1887,10 @@ mod test { running_snapshots.get(®ion_id).unwrap(); let running_snapshot = region_running_snapshots.get(&snapshot_name).unwrap(); - assert_eq!(running_snapshot.state, State::Tombstoned); + assert_eq!( + running_snapshot.state, + RunningSnapshotState::Tombstoned + ); } // worker finishes up with calling `created_rs` @@ -1890,7 +1905,10 @@ mod test { running_snapshots.get(®ion_id).unwrap(); let running_snapshot = region_running_snapshots.get(&snapshot_name).unwrap(); - assert_eq!(running_snapshot.state, State::Tombstoned); + assert_eq!( + running_snapshot.state, + RunningSnapshotState::Tombstoned + ); } // `delete_running_snapshot_request` will notify the worker to run, so @@ -1928,7 +1946,7 @@ mod test { // dataset exists. { let region = harness.df.get(®ion_id).unwrap(); - assert_eq!(region.state, State::Requested); + assert_eq!(region.state, RegionState::Requested); } // If Nexus then requests to destroy the region, @@ -1937,7 +1955,7 @@ mod test { // the State will be Tombstoned { let region = harness.df.get(®ion_id).unwrap(); - assert_eq!(region.state, State::Tombstoned); + assert_eq!(region.state, RegionState::Tombstoned); } // Worker will create the child dataset, then call `df.created`: @@ -1946,7 +1964,7 @@ mod test { // It should still be Tombstoned { let region = harness.df.get(®ion_id).unwrap(); - assert_eq!(region.state, State::Tombstoned); + assert_eq!(region.state, RegionState::Tombstoned); } // `apply_smf` will be called twice diff --git a/agent/src/resource.rs b/agent/src/resource.rs index 898735311..0f03c72f3 100644 --- a/agent/src/resource.rs +++ b/agent/src/resource.rs @@ -1,9 +1,8 @@ // Copyright 2025 Oxide Computer Company -use crucible_agent_types::{ - region::{Region, RegionId}, - snapshot::RunningSnapshot, -}; +use crate::datafile::Region; +use crate::datafile::RunningSnapshot; +use crucible_agent_types::region::RegionId; // The different types of resources the worker thread monitors for changes. This // wraps the object that has been added, or changed somehow. diff --git a/agent/src/server.rs b/agent/src/server.rs index 9984b03b0..35740e327 100644 --- a/agent/src/server.rs +++ b/agent/src/server.rs @@ -22,7 +22,14 @@ impl CrucibleAgentApi for CrucibleAgentImpl { async fn region_list( rqctx: RequestContext, ) -> SResult>, HttpError> { - Ok(HttpResponseOk(rqctx.context().regions())) + let regions = rqctx + .context() + .regions() + .into_iter() + .map(|r| r.into()) + .collect(); + + Ok(HttpResponseOk(regions)) } async fn region_create( @@ -32,7 +39,7 @@ impl CrucibleAgentApi for CrucibleAgentImpl { let create = body.into_inner(); match rqctx.context().create_region_request(create) { - Ok(r) => Ok(HttpResponseOk(r)), + Ok(r) => Ok(HttpResponseOk(r.into())), Err(e) => Err(HttpError::for_internal_error(format!( "region create failure: {:?}", e @@ -47,7 +54,7 @@ impl CrucibleAgentApi for CrucibleAgentImpl { let p = path.into_inner(); match rc.context().get(&p.id) { - Some(r) => Ok(HttpResponseOk(r)), + Some(r) => Ok(HttpResponseOk(r.into())), None => Err(HttpError::for_not_found( None, format!("region {:?} not found", p.id), @@ -116,7 +123,10 @@ impl CrucibleAgentApi for CrucibleAgentImpl { Ok(HttpResponseOk(GetSnapshotResponse { snapshots, - running_snapshots, + running_snapshots: running_snapshots + .into_iter() + .map(|(k, v)| (k, v.into())) + .collect(), })) } @@ -226,7 +236,7 @@ impl CrucibleAgentApi for CrucibleAgentImpl { }; match rc.context().create_running_snapshot_request(create) { - Ok(r) => Ok(HttpResponseOk(r)), + Ok(r) => Ok(HttpResponseOk(r.into())), Err(e) => Err(HttpError::for_internal_error(format!( "running snapshot create failure: {:?}", e