Skip to content

[WIP] Nested Virtualization: KVM_GET_NESTED_STATE and KVM_SET_NESTED_STATE #322

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions kvm-bindings/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ extern crate serde;
#[cfg(feature = "serde")]
extern crate zerocopy;

extern crate core;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we need this? :o

Copy link
Author

@phip1611 phip1611 Jun 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because kvm-bindings still uses Rust edition 2015; needed to access core::mem.

Strictly speaking, the crate doesn't specify any Rust edition and falls back to 2015.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

whew. I'll submit a PR to get us onto edition 2024 tomorrow; there's no reason for us to be on 2015 except that no one ever bothered to update it, I guess (we don't really have an MSRV policy around here either)


#[cfg(feature = "serde")]
#[macro_use]
mod serialize;
Expand Down
13 changes: 13 additions & 0 deletions kvm-bindings/src/x86_64/bindings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2022,6 +2022,10 @@ impl Default for kvm_vmx_nested_state_data {
}
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
#[cfg_attr(
feature = "serde",
derive(zerocopy::IntoBytes, zerocopy::Immutable, zerocopy::FromBytes)
)]
pub struct kvm_vmx_nested_state_hdr {
pub vmxon_pa: __u64,
pub vmcs12_pa: __u64,
Expand All @@ -2032,6 +2036,10 @@ pub struct kvm_vmx_nested_state_hdr {
}
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
#[cfg_attr(
feature = "serde",
derive(zerocopy::IntoBytes, zerocopy::Immutable, zerocopy::FromBytes)
)]
pub struct kvm_vmx_nested_state_hdr__bindgen_ty_1 {
pub flags: __u16,
}
Expand Down Expand Up @@ -2088,6 +2096,10 @@ impl Default for kvm_svm_nested_state_data {
}
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
#[cfg_attr(
feature = "serde",
derive(zerocopy::IntoBytes, zerocopy::Immutable, zerocopy::FromBytes)
)]
pub struct kvm_svm_nested_state_hdr {
pub vmcb_pa: __u64,
}
Expand All @@ -2110,6 +2122,7 @@ pub struct kvm_nested_state {
}
#[repr(C)]
#[derive(Copy, Clone)]
#[cfg_attr(feature = "serde", derive(zerocopy::Immutable, zerocopy::FromBytes))]
pub union kvm_nested_state__bindgen_ty_1 {
pub vmx: kvm_vmx_nested_state_hdr,
pub svm: kvm_svm_nested_state_hdr,
Expand Down
2 changes: 2 additions & 0 deletions kvm-bindings/src/x86_64/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ pub mod bindings;
#[cfg(feature = "fam-wrappers")]
pub mod fam_wrappers;

pub mod nested;

#[cfg(feature = "serde")]
mod serialize;

Expand Down
116 changes: 116 additions & 0 deletions kvm-bindings/src/x86_64/nested.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
//! Higher-level abstractions compared to the raw KVM bindings for working with
//! nested state.
//!
//! Getting and setting the nested KVM state is helpful if nested virtualization
//! is used and the state needs to be serialized, e.g., for live-migration or
//! state save/resume. The main export is [`KvmNestedState`].

use core::mem;
use KVM_STATE_NESTED_SVM_VMCB_SIZE;
use {kvm_nested_state__bindgen_ty_1, KVM_STATE_NESTED_VMX_VMCS_SIZE};

/// Fixed-size payload area of the nested state, viewed either through the VMX
/// or through the SVM layout.
///
/// Being a union, this type is as large as its bigger member, which is the
/// VMX data. On SVM this means that one page of the payload stays unused.
#[repr(C)]
#[derive(Copy, Clone)]
#[cfg_attr(feature = "serde", derive(zerocopy::Immutable, zerocopy::FromBytes))]
pub union kvm_nested_state__data {
    /// Payload interpreted with the Intel VMX layout.
    pub vmx: kvm_vmx_nested_state_data,
    /// Payload interpreted with the AMD SVM layout.
    pub svm: kvm_svm_nested_state_data,
}

impl Default for kvm_nested_state__data {
    /// Produces a payload in which every byte is zero.
    fn default() -> Self {
        // SAFETY: Both union members are structs made up solely of `u8`
        // arrays, for which the all-zero bit pattern is a valid value.
        let zeroed: Self = unsafe { mem::zeroed() };
        zeroed
    }
}

/// Raw VMX payload of the nested state: two consecutive byte buffers of
/// `KVM_STATE_NESTED_VMX_VMCS_SIZE` bytes each.
#[repr(C)]
#[derive(Copy, Clone)]
#[cfg_attr(
    feature = "serde",
    derive(zerocopy::IntoBytes, zerocopy::Immutable, zerocopy::FromBytes)
)]
pub struct kvm_vmx_nested_state_data {
    /// Bytes of the `vmcs12` area.
    pub vmcs12: [u8; KVM_STATE_NESTED_VMX_VMCS_SIZE as usize],
    /// Bytes of the `shadow_vmcs12` area.
    pub shadow_vmcs12: [u8; KVM_STATE_NESTED_VMX_VMCS_SIZE as usize],
}

/// Raw SVM payload of the nested state: a single byte buffer of
/// `KVM_STATE_NESTED_SVM_VMCB_SIZE` bytes.
#[repr(C)]
#[derive(Copy, Clone)]
#[cfg_attr(
    feature = "serde",
    derive(zerocopy::IntoBytes, zerocopy::Immutable, zerocopy::FromBytes)
)]
pub struct kvm_svm_nested_state_data {
    /// Bytes of the `vmcb12` area.
    pub vmcb12: [u8; KVM_STATE_NESTED_SVM_VMCB_SIZE as usize],
}

/// A stack-allocated buffer for nested KVM state including the mandatory
/// header with meta-information.
///
/// KVM uses a dynamically sized buffer structure (with a header reporting the
/// size of the buffer/state) making it cumbersome to work with. This helper
/// type makes working with `get_nested_state()` and `set_nested_state()`
/// significantly more convenient at the cost of a slightly higher memory
/// footprint in some cases.
///
/// # Type Size
///
/// On Intel VMX, the actual state requires `128 + 8192 == 8320` bytes, on
/// AMD SVM, the actual state requires `128 + 4096 == 4224` bytes. This type
/// doesn't make a differentiation and always reserves the larger VMX size. By
/// sacrificing one extra page (4096 bytes) on SVM, this type is more
/// convenient to use.
#[derive(Clone, Copy)]
#[cfg_attr(
feature = "serde",
derive(zerocopy::IntoBytes, zerocopy::Immutable, zerocopy::FromBytes)
)]
#[repr(C)]
pub struct KvmNestedState {
pub flags: u16,
pub format: u16,
pub size: u32,
pub hdr: kvm_nested_state__bindgen_ty_1,
pub data: kvm_nested_state__data,
// Prevent constructor bypass in public API.
_priv: core::marker::PhantomData<()>,
Comment on lines +78 to +79
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can use #[non_exhaustive] on the struct for this?

}

impl KvmNestedState {
/// Creates a new empty buffer, ready for nested state to be stored in by KVM.
///
/// The `size` property will report the size of the buffer to KVM.
pub fn empty() -> Self {
let mut this: KvmNestedState = unsafe { mem::zeroed() };
// This way, KVM knows the size of the buffer to store state into.
this.size = size_of::<Self>() as u32;
this
}
}

impl Default for KvmNestedState {
fn default() -> Self {
Self::empty()
}
}

#[cfg(test)]
mod tests {
    use super::*;

    use crate::kvm_nested_state as kvm_nested_state_raw_binding;

    /// Pins the layout relationship between the helper type and the raw
    /// bindgen header type.
    #[test]
    fn test_layout() {
        let raw_alignment = align_of::<kvm_nested_state_raw_binding>();
        let helper_alignment = align_of::<KvmNestedState>();
        let helper_size = size_of::<KvmNestedState>();

        // Both types must agree on their alignment.
        assert_eq!(raw_alignment, helper_alignment);
        // The helper carries the payload on top of the raw header.
        assert!(size_of::<KvmNestedState>() > size_of::<kvm_nested_state_raw_binding>());
        // If this value ever changes, the overall types and API should be
        // re-evaluated.
        assert_eq!(helper_size, 8320);
    }
}
32 changes: 31 additions & 1 deletion kvm-bindings/src/x86_64/serialize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ use bindings::{
kvm_xcr, kvm_xcrs, kvm_xsave,
};
use fam_wrappers::kvm_xsave2;
use kvm_nested_state__bindgen_ty_1;
use nested::{kvm_nested_state__data, KvmNestedState};
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use zerocopy::{transmute, FromBytes, FromZeros, Immutable, IntoBytes};

Expand All @@ -35,7 +37,8 @@ serde_impls!(
kvm_xsave2,
kvm_irqchip,
kvm_irq_routing,
kvm_irq_routing_entry
kvm_irq_routing_entry,
KvmNestedState
);

// SAFETY: zerocopy's derives explicitly disallow deriving for unions where
Expand Down Expand Up @@ -122,10 +125,35 @@ unsafe impl IntoBytes for kvm_irq_routing_entry__bindgen_ty_1 {
}
}

// SAFETY: zerocopy's derives explicitly disallow deriving for unions where
// the fields have different sizes, due to the smaller fields having padding.
// Miri however does not complain about these implementations (e.g. about
// reading the "padding" for one union field as valid data for a bigger one)
//
// NOTE(review): this presumably relies on values of this union always being
// fully initialized (e.g. created zeroed) before they are serialized; confirm
// against all construction sites.
unsafe impl IntoBytes for kvm_nested_state__bindgen_ty_1 {
// Marker method of the trait; implemented by hand here because the derive
// cannot be used for this union. It intentionally has an empty body.
fn only_derive_is_allowed_to_implement_this_trait()
where
Self: Sized,
{
}
}

// SAFETY: zerocopy's derives explicitly disallow deriving for unions where
// the fields have different sizes, due to the smaller fields having padding.
// Miri however does not complain about these implementations (e.g. about
// reading the "padding" for one union field as valid data for a bigger one)
//
// NOTE(review): this presumably relies on values of this union always being
// fully initialized (e.g. created zeroed) before they are serialized; confirm
// against all construction sites.
unsafe impl IntoBytes for kvm_nested_state__data {
// Marker method of the trait; implemented by hand here because the derive
// cannot be used for this union. It intentionally has an empty body.
fn only_derive_is_allowed_to_implement_this_trait()
where
Self: Sized,
{
}
}

#[cfg(test)]
mod tests {
use super::*;
use bindings::*;
use nested;

fn is_serde<T: Serialize + for<'de> Deserialize<'de> + Default>() {
let config = bincode::config::standard();
Expand Down Expand Up @@ -182,6 +210,7 @@ mod tests {
is_serde::<kvm_mp_state>();
is_serde::<kvm_irq_routing>();
is_serde::<kvm_irq_routing_entry>();
is_serde::<nested::KvmNestedState>();
}

fn is_serde_json<T: Serialize + for<'de> Deserialize<'de> + Default>() {
Expand Down Expand Up @@ -216,5 +245,6 @@ mod tests {
is_serde_json::<kvm_mp_state>();
is_serde_json::<kvm_irq_routing>();
is_serde_json::<kvm_irq_routing_entry>();
is_serde_json::<KvmNestedState>();
}
}
2 changes: 2 additions & 0 deletions kvm-ioctls/src/cap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -165,4 +165,6 @@ pub enum Cap {
UserMemory2 = KVM_CAP_USER_MEMORY2,
GuestMemfd = KVM_CAP_GUEST_MEMFD,
MemoryAttributes = KVM_CAP_MEMORY_ATTRIBUTES,
#[cfg(target_arch = "x86_64")]
NestedState = KVM_CAP_NESTED_STATE,
}
119 changes: 119 additions & 0 deletions kvm-ioctls/src/ioctls/vcpu.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the THIRD-PARTY file.

// Part of public API
#[cfg(target_arch = "x86_64")]
pub use kvm_bindings::nested::KvmNestedState;

use kvm_bindings::*;
use libc::EINVAL;
use std::fs::File;
Expand Down Expand Up @@ -1983,6 +1987,94 @@ impl VcpuFd {
}
}

/// Returns the nested guest state using the `KVM_GET_NESTED_STATE` ioctl.
///
/// This only works when `KVM_CAP_NESTED_STATE` is available.
///
/// # Arguments
///
/// - `buffer`: The buffer to be filled with the new nested state.
///
/// # Return Value
/// If this returns `None`, KVM doesn't have nested state. Otherwise, the
/// actual length of the state is returned.
///
/// # Example
///
/// ```rust
/// # use kvm_ioctls::{Kvm, Cap, KvmNestedState};
/// let kvm = Kvm::new().unwrap();
/// let vm = kvm.create_vm().unwrap();
/// let vcpu = vm.create_vcpu(0).unwrap();
/// let mut state_buffer = KvmNestedState::empty();
/// if kvm.check_extension(Cap::NestedState) {
/// vcpu.get_nested_state(&mut state_buffer).unwrap();
/// // Next, serialize the actual state into a file or so.
/// }
/// ```
///
/// [`Kvm::check_extension_int`]: kvm_ioctls::Kvm::check_extension_int
#[cfg(target_arch = "x86_64")]
pub fn get_nested_state(
&self,
buffer: &mut KvmNestedState,
) -> Result<Option<usize /* actual length of state */>> {
// Even an empty struct (`Default::default()`) should report the correct size.
assert_ne!(buffer.size, 0, "buffer should not report a size of zero");

// SAFETY: Safe because we call this with a Vcpu fd and we trust the kernel.
let ret = unsafe { ioctl_with_mut_ref(self, KVM_GET_NESTED_STATE(), buffer) };
match ret {
0 => {
let size = buffer.size as usize;
if size == size_of::<kvm_nested_state /* just the empty header */>() {
Ok(None)
} else {
Ok(Some(size))
}
}
_ => Err(errno::Error::last()),
}
}

/// Sets the nested guest state using the `KVM_SET_NESTED_STATE` ioctl.
///
/// This only works when `KVM_CAP_NESTED_STATE` is available.
///
/// # Arguments
///
/// - `state`: The new state to be put into KVM. The header must report the
/// `size` of the state properly. The state must be retrieved first using
/// [`Self::get_nested_state`].
///
/// # Example
///
/// ```rust
/// # use kvm_ioctls::{Kvm, Cap, KvmNestedState};
/// let kvm = Kvm::new().unwrap();
/// let vm = kvm.create_vm().unwrap();
/// let vcpu = vm.create_vcpu(0).unwrap();
/// if kvm.check_extension(Cap::NestedState) {
/// let mut state_buffer = KvmNestedState::empty();
/// vcpu.get_nested_state(&mut state_buffer).unwrap();
/// // Rename the variable to better reflect the role.
/// let old_state = state_buffer;
///
/// // now assume we transfer the state to a new location
/// // and load it back into kvm:
/// vcpu.set_nested_state(&old_state).unwrap();
/// }
/// ```
#[cfg(target_arch = "x86_64")]
pub fn set_nested_state(&self, state: &KvmNestedState) -> Result<()> {
// SAFETY: Safe because we call this with a Vcpu fd and we trust the kernel.
let ret = unsafe { ioctl_with_ref(self, KVM_SET_NESTED_STATE(), state) };
match ret {
0 => Ok(()),
_ => Err(errno::Error::last()),
}
}

/// Queues an NMI on the thread's vcpu. Only usable if `KVM_CAP_USER_NMI`
/// is available.
///
Expand Down Expand Up @@ -3609,4 +3701,31 @@ mod tests {
assert_eq!(addr, ADDR);
assert_eq!(data, (DATA as u16).to_le_bytes());
}

/// Round-trips the (empty) nested state of a fresh vCPU through
/// `get_nested_state` and `set_nested_state`.
#[test]
#[cfg(target_arch = "x86_64")]
fn test_get_and_set_nested_state() {
    let kvm = Kvm::new().unwrap();

    // Both ioctls only work when KVM_CAP_NESTED_STATE is available; without
    // the capability there is nothing to test on this host.
    let reported_size = kvm.check_extension_int(Cap::NestedState);
    if reported_size <= 0 {
        return;
    }

    let vm = kvm.create_vm().unwrap();
    let vcpu = vm.create_vcpu(0).unwrap();

    // Ensure that KVM also during runtime never wants more memory than we have pre-allocated
    // by the helper type. KVM is expected to report:
    // - 128+4096==4224 on SVM
    // - 128+8192==8320 on VMX
    let kvm_nested_state_size = reported_size as usize;
    assert!(kvm_nested_state_size <= size_of::<KvmNestedState>());

    let mut state_buffer = KvmNestedState::default();
    // Ensure that header shows full buffer length.
    assert_eq!(state_buffer.size as usize, size_of::<KvmNestedState>());

    let state_len = vcpu.get_nested_state(&mut state_buffer).unwrap();
    let old_state = state_buffer;

    // There is no nested guest in this test, so there is no payload.
    assert_eq!(state_len, None);
    assert_eq!(state_buffer.size as usize, size_of::<kvm_nested_state>());

    vcpu.set_nested_state(&old_state).unwrap();
}
}
Loading