Skip to content

Commit

Permalink
perf(virtio)!: use KVM_IOEVENTFD for queue notify
Browse files Browse the repository at this point in the history
KVM_IOEVENTFD avoids VM exits of VCPU threads from kernel space
to user space on queue notifications.

Furthermore, we use a non-zero notify_off_multiplier [1] in virtio device
configs. By just looking at the MMIO address, we are able to tell
which queue is sending the notification. The value written to the
MMIO address is not needed, so instruction decoding in KVM
is avoided.

Test setup:

- Host CPU: AMD Ryzen 9 5950X
- VM: memory size 1G, 1 VCPU

virtio-net throughput measured by iperf3:

- VM -> host
  - without KVM_IOEVENTFD: 30.6 Gbits/sec
  - with KVM_IOEVENTFD: 33.5 Gbits/sec

- Host -> VM
  - without KVM_IOEVENTFD: 19.5 Gbits/sec
  - with KVM_IOEVENTFD: 25.4 Gbits/sec

[1] Virtio Spec 1.2, Sec 4.1.4.4.

Signed-off-by: Changyuan Lyu <[email protected]>
  • Loading branch information
Lencerf committed May 26, 2024
1 parent b6a5b01 commit 020c5d3
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 25 deletions.
2 changes: 0 additions & 2 deletions alioth/src/mem/mem.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,6 @@ pub enum Error {
LockPoisoned,
#[error("cannot allocate")]
CanotAllocate,
#[error("cannot register MMIO notifier: {0}")]
Notifier(#[source] Box<dyn std::error::Error + Send + Sync + 'static>),
#[error("{0}")]
Hv(#[from] hv::Error),
#[error("cannot handle action: {0:x?}")]
Expand Down
33 changes: 28 additions & 5 deletions alioth/src/virtio/dev/dev.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,18 @@
// limitations under the License.

use std::fmt::Debug;
use std::os::fd::AsRawFd;
use std::sync::atomic::{AtomicU16, AtomicU64, AtomicU8};
use std::sync::mpsc::{self, Receiver, Sender};
use std::sync::Arc;
use std::thread::JoinHandle;

use bitfield::bitfield;
use mio::event::Event;
use mio::{Events, Poll, Registry, Token, Waker};
use mio::unix::SourceFd;
use mio::{Events, Interest, Poll, Registry, Token, Waker};

use crate::hv::{IoeventFd, IoeventFdRegistry};
use crate::mem::emulated::Mmio;
use crate::mem::mapped::RamBus;
use crate::mem::MemRegion;
Expand Down Expand Up @@ -114,25 +117,28 @@ where
}

#[derive(Debug)]
pub struct VirtioDevice<D, S>
pub struct VirtioDevice<D, S, E>
where
D: Virtio,
S: IrqSender,
E: IoeventFd,
{
pub name: Arc<String>,
pub device_config: Arc<D::Config>,
pub reg: Arc<Register>,
pub queue_regs: Arc<Vec<Queue>>,
pub ioeventfds: Arc<Vec<E>>,
pub shared_mem_regions: Option<Arc<MemRegion>>,
pub waker: Arc<Waker>,
pub event_tx: Sender<WakeEvent<S>>,
worker_handle: Option<JoinHandle<()>>,
}

impl<D, S> VirtioDevice<D, S>
impl<D, S, E> VirtioDevice<D, S, E>
where
D: Virtio,
S: IrqSender,
E: IoeventFd,
{
fn shutdown(&mut self) -> Result<(), Box<dyn std::error::Error>> {
let Some(handle) = self.worker_handle.take() else {
Expand All @@ -146,7 +152,10 @@ where
Ok(())
}

pub fn new(name: Arc<String>, dev: D, memory: Arc<RamBus>) -> Result<Self> {
pub fn new<R>(name: Arc<String>, dev: D, memory: Arc<RamBus>, registry: &R) -> Result<Self>
where
R: IoeventFdRegistry<IoeventFd = E>,
{
let poll = Poll::new()?;
let device_config = dev.config();
let reg = Arc::new(Register {
Expand All @@ -159,6 +168,18 @@ where
..Default::default()
});
let queue_regs = Arc::new(queue_regs.collect::<Vec<_>>());
let ioeventfds = Arc::new(
(0..num_queues)
.map(|_| registry.create())
.collect::<Result<Vec<_>, _>>()?,
);
for (index, fd) in ioeventfds.iter().enumerate() {
poll.registry().register(
&mut SourceFd(&fd.as_fd().as_raw_fd()),
Token(TOKEN_IS_QUEUE as usize | index),
Interest::READABLE,
)?;
}
let token = TOKEN_IS_QUEUE | TOKEN_WORKER_EVENT;
let waker = Waker::new(poll.registry(), Token(token as usize))?;
let shared_mem_regions = dev.shared_mem_regions();
Expand Down Expand Up @@ -186,6 +207,7 @@ where
name,
reg,
queue_regs,
ioeventfds,
worker_handle: Some(handle),
event_tx,
waker: Arc::new(waker),
Expand All @@ -196,10 +218,11 @@ where
}
}

impl<D, S> Drop for VirtioDevice<D, S>
impl<D, S, E> Drop for VirtioDevice<D, S, E>
where
D: Virtio,
S: IrqSender,
E: IoeventFd,
{
fn drop(&mut self) {
if let Err(e) = self.shutdown() {
Expand Down
75 changes: 63 additions & 12 deletions alioth/src/virtio/pci.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ use mio::Waker;
use parking_lot::{Mutex, RwLock};
use zerocopy::{AsBytes, FromBytes, FromZeroes};

use crate::hv::MsiSender;
use crate::hv::{IoeventFd, IoeventFdRegistry, MsiSender};
use crate::mem::emulated::Mmio;
use crate::mem::{MemRange, MemRegion, MemRegionEntry};
use crate::mem::{MemRange, MemRegion, MemRegionCallback, MemRegionEntry};
use crate::pci::cap::{
MsixCap, MsixCapMmio, MsixCapOffset, MsixMsgCtrl, MsixTableEntry, MsixTableMmio, PciCap,
PciCapHdr, PciCapId, PciCapList,
Expand Down Expand Up @@ -422,10 +422,15 @@ where
VirtioCommonCfg::LAYOUT_QUEUE_RESET => {
todo!()
}
(VirtioPciRegister::OFFSET_QUEUE_NOTIFY, _) => {
let event = WakeEvent::Notify {
q_index: val as u16,
};
(offset, _)
if offset >= VirtioPciRegister::OFFSET_QUEUE_NOTIFY
&& offset
< VirtioPciRegister::OFFSET_QUEUE_NOTIFY
+ size_of::<u32>() * self.queues.len() =>
{
let q_index = (offset - VirtioPciRegister::OFFSET_QUEUE_NOTIFY) as u16 / 4;
log::warn!("{}: notifying queue-{q_index} by vm exit!", self.name);
let event = WakeEvent::Notify { q_index };
self.wake_up_dev(event)
}
_ => {
Expand All @@ -440,6 +445,38 @@ where
}
}

/// Memory-region callback that registers the device's queue ioeventfds with
/// an [`IoeventFdRegistry`] when the region is mapped and deregisters them
/// when it is unmapped.
#[derive(Debug)]
struct IoeventFdCallback<R>
where
R: IoeventFdRegistry,
{
/// Registry through which the ioeventfds are registered and deregistered.
registry: R,
/// The ioeventfds to (de)register; the position of an entry in the vector
/// is used as its queue index.
ioeventfds: Arc<Vec<R::IoeventFd>>,
}

/// Keeps the queue ioeventfds registered at the addresses that correspond to
/// the current mapping of the region this callback is attached to.
impl<R> MemRegionCallback for IoeventFdCallback<R>
where
    R: IoeventFdRegistry,
{
    /// Registers each queue's ioeventfd at that queue's notify address.
    ///
    /// `addr` is the address the region has been mapped at. Queue `i` is
    /// registered at
    /// `addr + (12 << 10) + OFFSET_QUEUE_NOTIFY + i * size_of::<u32>()`.
    /// Registration stops at the first failure and that error is returned;
    /// ioeventfds registered before the failure are left registered.
    fn mapped(&self, addr: usize) -> mem::Result<()> {
        // Each queue owns one `u32`-sized notify slot.
        let stride = size_of::<u32>();
        self.ioeventfds
            .iter()
            .enumerate()
            .try_for_each(|(q_index, fd)| -> mem::Result<()> {
                // NOTE(review): `12 << 10` is presumably the offset of the
                // register block inside the BAR — confirm against the BAR
                // layout built in `VirtioPciDevice::new`.
                let notify_addr =
                    addr + (12 << 10) + VirtioPciRegister::OFFSET_QUEUE_NOTIFY + q_index * stride;
                self.registry.register(fd, notify_addr, 0, None)?;
                log::info!("q-{q_index} ioeventfd registered at {notify_addr:x}",);
                Ok(())
            })
    }

    /// Deregisters every ioeventfd from the registry, returning the first
    /// error encountered (later ioeventfds are then left untouched).
    fn unmapped(&self) -> mem::Result<()> {
        self.ioeventfds.iter().try_for_each(|fd| -> mem::Result<()> {
            self.registry.deregister(fd)?;
            log::info!("ioeventfd {fd:?} de-registered");
            Ok(())
        })
    }
}

const VIRTIO_VENDOR_ID: u16 = 0x1af4;
const VIRTIO_DEVICE_ID_BASE: u16 = 0x1040;

Expand Down Expand Up @@ -514,22 +551,31 @@ impl PciCap for VirtioPciNotifyCap {
}

#[derive(Debug)]
pub struct VirtioPciDevice<D, M>
pub struct VirtioPciDevice<D, M, E>
where
D: Virtio,
M: MsiSender,
E: IoeventFd,
{
pub dev: VirtioDevice<D, PciIrqSender<M>>,
pub dev: VirtioDevice<D, PciIrqSender<M>, E>,
pub config: Arc<EmulatedConfig>,
pub registers: Arc<VirtioPciRegisterMmio<M>>,
}

impl<D, M> VirtioPciDevice<D, M>
impl<D, M, E> VirtioPciDevice<D, M, E>
where
M: MsiSender,
D: Virtio,
E: IoeventFd,
{
pub fn new(dev: VirtioDevice<D, PciIrqSender<M>>, msi_sender: M) -> Result<Self> {
pub fn new<R>(
dev: VirtioDevice<D, PciIrqSender<M>, E>,
msi_sender: M,
ioeventfd_reg: R,
) -> Result<Self>
where
R: IoeventFdRegistry<IoeventFd = E>,
{
let (class, subclass) = get_class(D::device_id());
let mut header = DeviceHeader {
common: CommonHeader {
Expand Down Expand Up @@ -608,7 +654,7 @@ where
length: (size_of::<u32>() * num_queues) as u32,
..Default::default()
},
multiplier: 0, // TODO use 4 for KVM_IOEVENTFD
multiplier: size_of::<u32>() as u32,
};
let cap_device_config = VirtioPciCap {
header: PciCapHdr {
Expand Down Expand Up @@ -701,6 +747,10 @@ where
bar0.ranges
.push(MemRange::Span((12 << 10) - msix_table_size));
bar0.ranges.push(MemRange::Emulated(registers.clone()));
bar0.callbacks.lock().push(Box::new(IoeventFdCallback {
registry: ioeventfd_reg,
ioeventfds: dev.ioeventfds.clone(),
}));
if device_config.size() > 0 {
bar0.ranges.push(MemRange::Emulated(device_config))
}
Expand Down Expand Up @@ -734,10 +784,11 @@ where
}
}

impl<D, M> Pci for VirtioPciDevice<D, M>
impl<D, M, E> Pci for VirtioPciDevice<D, M, E>
where
M: MsiSender,
D: Virtio,
E: IoeventFd,
{
fn config(&self) -> Arc<dyn PciConfig> {
self.config.clone()
Expand Down
4 changes: 2 additions & 2 deletions alioth/src/virtio/virtio.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use std::fmt::Debug;
use bitflags::bitflags;
use thiserror::Error;

use crate::mem;
use crate::{hv, mem};

#[path = "dev/dev.rs"]
pub mod dev;
Expand All @@ -28,7 +28,7 @@ pub mod queue;
#[derive(Debug, Error)]
pub enum Error {
#[error("hypervisor: {0}")]
Hv(#[source] Box<dyn std::error::Error + Send + Sync + 'static>),
Hv(#[from] hv::Error),

#[error("IO: {0}")]
Io(#[from] std::io::Error),
Expand Down
19 changes: 15 additions & 4 deletions alioth/src/vm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ use crate::board::{self, ArchBoard, Board, BoardConfig, STATE_CREATED, STATE_RUN
use crate::device::fw_cfg::{FwCfg, FwCfgItemParam, PORT_SELECTOR};
use crate::device::pvpanic::PvPanic;
use crate::device::serial::Serial;
use crate::hv::{self, Hypervisor, Vm, VmConfig};
use crate::hv::{self, Hypervisor, IoeventFdRegistry, Vm, VmConfig};
use crate::loader::{self, Payload};
use crate::mem::Memory;
use crate::pci::bus::PciBus;
Expand Down Expand Up @@ -152,16 +152,27 @@ where
&mut self,
name: String,
param: P,
) -> Result<Arc<VirtioPciDevice<D, <<H as Hypervisor>::Vm as Vm>::MsiSender>>, Error>
) -> Result<
Arc<
VirtioPciDevice<
D,
<<H as Hypervisor>::Vm as Vm>::MsiSender,
<<<H as Hypervisor>::Vm as Vm>::IoeventFdRegistry as IoeventFdRegistry>::IoeventFd,
>,
>,
Error,
>
where
P: DevParam<Device = D>,
D: Virtio,
{
let name = Arc::new(name);
let dev = param.build(name.clone())?;
let virtio_dev = VirtioDevice::new(name.clone(), dev, self.board.memory.ram_bus().clone())?;
let registry = self.board.vm.create_ioeventfd_registry()?;
let virtio_dev =
VirtioDevice::new(name.clone(), dev, self.board.memory.ram_bus(), &registry)?;
let msi_sender = self.board.vm.create_msi_sender()?;
let dev = VirtioPciDevice::new(virtio_dev, msi_sender)?;
let dev = VirtioPciDevice::new(virtio_dev, msi_sender, registry)?;
let dev = Arc::new(dev);
let pci_dev = PciDevice::new(name.clone(), dev.clone());
self.add_pci_dev(pci_dev)?;
Expand Down

0 comments on commit 020c5d3

Please sign in to comment.