// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2018 The Hafnium Authors.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include <clocksource/arm_arch_timer.h>
#include <linux/atomic.h>
#include <linux/cpuhotplug.h>
#include <linux/hrtimer.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/net.h>
#include <linux/of.h>
#include <linux/platform_device.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <hf/call.h>
#include <hf/spci.h>
#include <hf/transport.h>
#include "uapi/hf/socket.h"
#define HYPERVISOR_TIMER_NAME "el2_timer"
#define CONFIG_HAFNIUM_MAX_VMS 16
#define CONFIG_HAFNIUM_MAX_VCPUS 32
#define FIRST_SECONDARY_VM_ID (HF_VM_ID_OFFSET + 1)
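/*
* Only secondary VMs are tracked by this driver; the primary VM is the
* one the driver itself runs in. Secondary IDs start immediately after
* the ID reserved for the primary, so hf_vms[0] corresponds to
* FIRST_SECONDARY_VM_ID.
*/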
struct hf_vcpu {
struct hf_vm *vm;
spci_vcpu_index_t vcpu_index;
/* Kernel thread that calls into Hafnium to run this vCPU. */
struct task_struct *task;
/* Set by hf_vcpu_wake_up() so that a pending hf_vcpu_sleep() aborts. */
atomic_t abort_sleep;
/* Non-zero while the vCPU is blocked waiting for a message. */
atomic_t waiting_for_message;
/* Fires to wake the thread when a requested sleep duration elapses. */
struct hrtimer timer;
};
struct hf_vm {
spci_vm_id_t id;
spci_vcpu_count_t vcpu_count;
struct hf_vcpu *vcpu;
};
struct hf_sock {
/* This needs to be the first field. */
struct sock sk;
/*
* The following fields are immutable after the socket transitions to
* SS_CONNECTED state.
*/
uint64_t local_port;
uint64_t remote_port;
struct hf_vm *peer_vm;
};
static struct proto hf_sock_proto = {
.name = "hafnium",
.owner = THIS_MODULE,
.obj_size = sizeof(struct hf_sock),
};
static struct hf_vm *hf_vms;
static spci_vm_count_t hf_vm_count;
static struct page *hf_send_page;
static struct page *hf_recv_page;
static atomic64_t hf_next_port = ATOMIC64_INIT(0);
static DEFINE_SPINLOCK(hf_send_lock);
static DEFINE_HASHTABLE(hf_local_port_hash, 7);
static DEFINE_SPINLOCK(hf_local_port_hash_lock);
static int hf_irq;
static enum cpuhp_state hf_cpuhp_state;
static spci_vm_id_t current_vm_id;
/**
* Retrieves a VM from its ID, returning NULL if the VM doesn't exist.
*/
static struct hf_vm *hf_vm_from_id(spci_vm_id_t vm_id)
{
if (vm_id < FIRST_SECONDARY_VM_ID ||
vm_id >= FIRST_SECONDARY_VM_ID + hf_vm_count)
return NULL;
return &hf_vms[vm_id - FIRST_SECONDARY_VM_ID];
}
/**
* Wakes up the kernel thread responsible for running the given vcpu.
*
* Returns 0 if the thread was already running, 1 otherwise.
*/
static int hf_vcpu_wake_up(struct hf_vcpu *vcpu)
{
/* Set a flag indicating that the thread should not go to sleep. */
atomic_set(&vcpu->abort_sleep, 1);
/* Set the thread to running state. */
return wake_up_process(vcpu->task);
}
/**
* Puts the current thread to sleep. The current thread must be responsible for
* running the given vcpu.
*
* Going to sleep will fail if hf_vcpu_wake_up() or kthread_stop() was called on
* this vcpu/thread since the last time it [re]started running.
*/
static void hf_vcpu_sleep(struct hf_vcpu *vcpu)
{
int abort;
set_current_state(TASK_INTERRUPTIBLE);
/* Check the sleep-abort flag after making thread interruptible. */
abort = atomic_read(&vcpu->abort_sleep);
if (!abort && !kthread_should_stop())
schedule();
/* Set state back to running on the way out. */
set_current_state(TASK_RUNNING);
}
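/*
* The ordering above is what makes the sleep race-free: the thread marks
* itself TASK_INTERRUPTIBLE before reading abort_sleep, which pairs with
* hf_vcpu_wake_up() setting the flag before calling wake_up_process().
* Either the flag is seen here and schedule() is skipped, or the wake-up
* arrives after set_current_state() and puts the thread back to
* TASK_RUNNING, so schedule() returns promptly; the wake-up is never lost.
*/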
/**
* Wakes up the thread associated with the vcpu that owns the given timer. This
* is called when the timer the thread is waiting on expires.
*/
static enum hrtimer_restart hf_vcpu_timer_expired(struct hrtimer *timer)
{
struct hf_vcpu *vcpu = container_of(timer, struct hf_vcpu, timer);
/* TODO: Inject interrupt. */
hf_vcpu_wake_up(vcpu);
return HRTIMER_NORESTART;
}
/**
* This function is called when Hafnium requests that the primary VM wake up a
* vCPU that belongs to a secondary VM.
*
* It wakes up the thread if it's sleeping, or kicks it if it's already running.
*/
static void hf_handle_wake_up_request(spci_vm_id_t vm_id,
spci_vcpu_index_t vcpu)
{
struct hf_vm *vm = hf_vm_from_id(vm_id);
if (!vm) {
pr_warn("Request to wake up non-existent VM id: %u\n", vm_id);
return;
}
if (vcpu >= vm->vcpu_count) {
pr_warn("Request to wake up non-existent vCPU: %u.%u\n",
vm_id, vcpu);
return;
}
if (hf_vcpu_wake_up(&vm->vcpu[vcpu]) == 0) {
/*
* The task was already running (presumably on a different
* physical CPU); interrupt it. This gives Hafnium a chance to
* inject any new interrupts.
*/
kick_process(vm->vcpu[vcpu].task);
}
}
/**
* Injects an interrupt into a vCPU of the VM and ensures the vCPU will run to
* handle the interrupt.
*/
static void hf_interrupt_vm(spci_vm_id_t vm_id, uint64_t int_id)
{
struct hf_vm *vm = hf_vm_from_id(vm_id);
spci_vcpu_index_t vcpu;
int64_t ret;
if (!vm) {
pr_warn("Request to wake up non-existent VM id: %u\n", vm_id);
return;
}
/*
* TODO: For now we're picking the first vcpu to interrupt, but
* we want to be smarter.
*/
vcpu = 0;
ret = hf_interrupt_inject(vm_id, vcpu, int_id);
if (ret == -1) {
pr_warn("Failed to inject interrupt %lld to vCPU %d of VM %d",
int_id, vcpu, vm_id);
return;
}
if (ret != 1) {
/* We don't need to wake up the vcpu. */
return;
}
hf_handle_wake_up_request(vm_id, vcpu);
}
/**
* Notify all waiters on the given VM.
*/
static void hf_notify_waiters(spci_vm_id_t vm_id)
{
/*
* Read into a wide signed type first: assigning straight into a
* 16-bit spci_vm_id_t would truncate the -1 "no more waiters"
* sentinel and the loop would never terminate.
*/
int64_t ret;
while ((ret = hf_mailbox_waiter_get(vm_id)) != -1) {
spci_vm_id_t waiter_vm_id = (spci_vm_id_t)ret;
if (waiter_vm_id == HF_PRIMARY_VM_ID) {
/*
* TODO: Use this information when implementing per-vm
* queues.
*/
} else {
hf_interrupt_vm(waiter_vm_id,
HF_MAILBOX_WRITABLE_INTID);
}
}
}
/**
* Delivers a message to a VM.
*/
static void hf_deliver_message(spci_vm_id_t vm_id)
{
struct hf_vm *vm = hf_vm_from_id(vm_id);
spci_vcpu_index_t i;
if (!vm) {
pr_warn("Tried to deliver message to non-existent VM id: %u\n",
vm_id);
return;
}
/* Try to wake a vCPU that is waiting for a message. */
for (i = 0; i < vm->vcpu_count; i++) {
if (atomic_read(&vm->vcpu[i].waiting_for_message)) {
hf_handle_wake_up_request(vm->id,
vm->vcpu[i].vcpu_index);
return;
}
}
/* None were waiting for a message so interrupt one. */
hf_interrupt_vm(vm->id, HF_MAILBOX_READABLE_INTID);
}
/**
* Handles a message delivered to this VM by validating that it's well-formed
* and then queueing it for delivery to the appropriate socket.
*/
static void hf_handle_message(struct hf_vm *sender, size_t len,
const void *message)
{
struct hf_sock *hsock;
const struct hf_msg_hdr *hdr = (const struct hf_msg_hdr *)message;
struct sk_buff *skb;
int err;
/* Ignore messages that are too small to hold a header. */
if (len < sizeof(struct hf_msg_hdr))
return;
len -= sizeof(struct hf_msg_hdr);
/* Walk the bucket of sockets hashed to this destination port. */
rcu_read_lock();
hash_for_each_possible_rcu(hf_local_port_hash, hsock, sk.sk_node,
hdr->dst_port) {
if (hsock->peer_vm == sender &&
hsock->remote_port == hdr->src_port) {
sock_hold(&hsock->sk);
break;
}
}
rcu_read_unlock();
/* Nothing to do if we couldn't find the target. */
if (!hsock)
return;
/*
* TODO: From this point on, there are two failure paths: when we
* create the skb below, and when we enqueue it to the socket. What
* should we do if they fail? Ideally we would have some form of flow
* control to prevent message loss, but how to do it efficiently?
*
* One option is to have a pre-allocated message that indicates to the
* sender that a message was dropped. This way we guarantee that the
* sender will be aware of loss and should back-off.
*/
/* Create the skb. */
skb = alloc_skb(len, GFP_KERNEL);
if (!skb)
goto exit;
memcpy(skb_put(skb, len), hdr + 1, len);
/*
* Add the skb to the receive queue of the target socket. On success it
* calls sk->sk_data_ready, which is currently set to sock_def_readable,
* which wakes up any waiters.
*/
err = sock_queue_rcv_skb(&hsock->sk, skb);
if (err)
kfree_skb(skb);
exit:
sock_put(&hsock->sk);
/*
* Release our end of the RX buffer; a SPCI_RX_RELEASE_32 return means
* other VMs were blocked on the buffer and must be notified.
*/
if (spci_rx_release().func == SPCI_RX_RELEASE_32)
hf_notify_waiters(HF_PRIMARY_VM_ID);
}
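/*
* For reference, the message framing consumed above (struct hf_msg_hdr,
* defined in hf/transport.h): a header with src_port and dst_port,
* immediately followed by the payload. A well-formed message of total
* length len therefore looks like:
*
* [ struct hf_msg_hdr | (len - sizeof(struct hf_msg_hdr)) payload bytes ]
*
* which is why the payload copy above starts at (hdr + 1).
*/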
/**
* This is the main loop of each vcpu.
*/
static int hf_vcpu_thread(void *data)
{
struct hf_vcpu *vcpu = data;
struct spci_value ret;
hrtimer_init(&vcpu->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
vcpu->timer.function = &hf_vcpu_timer_expired;
while (!kthread_should_stop()) {
spci_vcpu_index_t i;
/*
* We're about to run the vcpu, so we can reset the abort-sleep
* flag.
*/
atomic_set(&vcpu->abort_sleep, 0);
/* Call into Hafnium to run vcpu. */
ret = spci_run(vcpu->vm->id, vcpu->vcpu_index);
switch (ret.func) {
/* Preempted. */
case SPCI_INTERRUPT_32:
if (need_resched())
schedule();
break;
/* Yield. */
case SPCI_YIELD_32:
if (!kthread_should_stop())
schedule();
break;
/* WFI. */
case HF_SPCI_RUN_WAIT_FOR_INTERRUPT:
if (ret.arg2 != SPCI_SLEEP_INDEFINITE) {
hrtimer_start(&vcpu->timer, ret.arg2,
HRTIMER_MODE_REL);
}
hf_vcpu_sleep(vcpu);
hrtimer_cancel(&vcpu->timer);
break;
/* Waiting for a message. */
case SPCI_MSG_WAIT_32:
atomic_set(&vcpu->waiting_for_message, 1);
if (ret.arg2 != SPCI_SLEEP_INDEFINITE) {
hrtimer_start(&vcpu->timer, ret.arg2,
HRTIMER_MODE_REL);
}
hf_vcpu_sleep(vcpu);
hrtimer_cancel(&vcpu->timer);
atomic_set(&vcpu->waiting_for_message, 0);
break;
/* Wake up another vcpu. */
case HF_SPCI_RUN_WAKE_UP:
hf_handle_wake_up_request(spci_vm_id(ret),
spci_vcpu_index(ret));
break;
/* Response available. */
case SPCI_MSG_SEND_32:
if (spci_msg_send_receiver(ret) == HF_PRIMARY_VM_ID) {
hf_handle_message(vcpu->vm,
spci_msg_send_size(ret),
page_address(hf_recv_page));
} else {
hf_deliver_message(spci_msg_send_receiver(ret));
}
break;
/* Notify all waiters. */
case SPCI_RX_RELEASE_32:
hf_notify_waiters(vcpu->vm->id);
break;
/* Abort was triggered. */
case SPCI_ERROR_32:
pr_warn("SPCI error %d running VM %d vCPU %d", ret.arg2,
vcpu->vm->id, vcpu->vcpu_index);
switch (ret.arg2) {
case SPCI_ABORTED:
for (i = 0; i < vcpu->vm->vcpu_count; i++) {
if (i == vcpu->vcpu_index)
continue;
hf_handle_wake_up_request(vcpu->vm->id,
i);
}
hf_vcpu_sleep(vcpu);
break;
}
break;
}
}
return 0;
}
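/*
* In effect the loop above is the primary VM's scheduler for secondary
* vCPUs: each spci_run() returns a reason code (preempted, yielded,
* waiting for an interrupt or message, wake another vCPU, message sent,
* RX buffer released, or error) and the thread either keeps running,
* sleeps with an optional timeout, or forwards work to other threads.
*/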
/**
* Converts a pointer to a struct sock into a pointer to a struct hf_sock. It
* relies on the fact that the first field of hf_sock is a sock.
*/
static struct hf_sock *hsock_from_sk(struct sock *sk)
{
return (struct hf_sock *)sk;
}
/**
* This is called when the last reference to the outer socket is released. For
* example, if it's a user-space socket, when the last file descriptor pointing
* to this socket is closed.
*
* It begins cleaning up resources, though some can only be cleaned up after all
* references to the underlying socket are released, which is handled by
* hf_sock_destruct().
*/
static int hf_sock_release(struct socket *sock)
{
struct sock *sk = sock->sk;
struct hf_sock *hsock = hsock_from_sk(sk);
unsigned long flags;
if (!sk)
return 0;
/* Shutdown for both send and receive. */
lock_sock(sk);
sk->sk_shutdown |= RCV_SHUTDOWN | SEND_SHUTDOWN;
sk->sk_state_change(sk);
release_sock(sk);
/* Remove from the hash table, so lookups from now on won't find it. */
spin_lock_irqsave(&hf_local_port_hash_lock, flags);
hash_del_rcu(&hsock->sk.sk_node);
spin_unlock_irqrestore(&hf_local_port_hash_lock, flags);
/*
* TODO: When we implement a tx queue, we need to clear it here so that
* sk_wmem_alloc will not prevent sk from being freed (sk_free).
*/
/*
* Wait for in-flight lookups to finish. We need to do this here because
* in-flight lookups rely on the reference to the socket we're about to
* release.
*/
synchronize_rcu();
sock_put(sk);
sock->sk = NULL;
return 0;
}
/**
* This is called when there are no more references to the socket. It frees all
* resources that haven't been freed during release.
*/
static void hf_sock_destruct(struct sock *sk)
{
/*
* Clear the receive queue now that the handler cannot add any more
* skbs to it.
*/
skb_queue_purge(&sk->sk_receive_queue);
}
/**
* Connects the Hafnium socket to the provided VM and port. After the socket is
* connected, it can be used to exchange datagrams with the specified peer.
*/
static int hf_sock_connect(struct socket *sock, struct sockaddr *saddr, int len,
int connect_flags)
{
struct sock *sk = sock->sk;
struct hf_sock *hsock = hsock_from_sk(sk);
struct hf_vm *vm;
struct hf_sockaddr *addr;
int err;
unsigned long flags;
/* Basic address validation. */
if (len < sizeof(struct hf_sockaddr) || saddr->sa_family != AF_HF)
return -EINVAL;
addr = (struct hf_sockaddr *)saddr;
vm = hf_vm_from_id(addr->vm_id);
if (!vm)
return -ENETUNREACH;
/*
* TODO: Once we implement access control in Hafnium, check that the
* caller is allowed to contact the specified VM. Return -ECONNREFUSED
* if access is denied.
*/
/* Take lock to make sure state doesn't change as we connect. */
lock_sock(sk);
/* Only unconnected sockets are allowed to become connected. */
if (sock->state != SS_UNCONNECTED) {
err = -EISCONN;
goto exit;
}
hsock->local_port = atomic64_inc_return(&hf_next_port);
hsock->remote_port = addr->port;
hsock->peer_vm = vm;
sock->state = SS_CONNECTED;
/* Add socket to hash table now that it's fully initialised. */
spin_lock_irqsave(&hf_local_port_hash_lock, flags);
hash_add_rcu(hf_local_port_hash, &sk->sk_node, hsock->local_port);
spin_unlock_irqrestore(&hf_local_port_hash_lock, flags);
err = 0;
exit:
release_sock(sk);
return err;
}
/**
* Sends the given skb to the appropriate VM by calling Hafnium. It will also
* trigger the wake up of a recipient VM.
*
* Takes ownership of the skb on success.
*/
static int hf_send_skb(struct sk_buff *skb)
{
unsigned long flags;
struct spci_value ret;
struct hf_sock *hsock = hsock_from_sk(skb->sk);
struct hf_vm *vm = hsock->peer_vm;
void *message = page_address(hf_send_page);
/*
* Call Hafnium under the send lock so that we serialize the use of the
* global send buffer.
*/
spin_lock_irqsave(&hf_send_lock, flags);
memcpy(message, skb->data, skb->len);
ret = spci_msg_send(current_vm_id, vm->id, skb->len, 0);
spin_unlock_irqrestore(&hf_send_lock, flags);
if (ret.func == SPCI_ERROR_32) {
switch (ret.arg2) {
case SPCI_INVALID_PARAMETERS:
return -ENXIO;
case SPCI_NOT_SUPPORTED:
return -EIO;
case SPCI_DENIED:
case SPCI_BUSY:
default:
return -EAGAIN;
}
}
/* Ensure the VM will run to pick up the message. */
hf_deliver_message(vm->id);
kfree_skb(skb);
return 0;
}
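/*
* Note that the send path serialises on hf_send_lock because there is a
* single hypervisor-shared send page for the whole VM, so the copy into
* the page and the spci_msg_send() call must happen as one unit. A busy
* or refusing receiver (SPCI_BUSY/SPCI_DENIED) surfaces to user space as
* -EAGAIN, so callers may retry.
*/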
/**
* Determines if the given socket is in the connected state. It acquires and
* releases the socket lock.
*/
static bool hf_sock_is_connected(struct socket *sock)
{
bool ret;
lock_sock(sock->sk);
ret = sock->state == SS_CONNECTED;
release_sock(sock->sk);
return ret;
}
/**
* Sends a message to the VM & port the socket is connected to. All variants
* of write/send/sendto/sendmsg eventually call this function.
*/
static int hf_sock_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
struct sock *sk = sock->sk;
struct sk_buff *skb;
int err;
struct hf_msg_hdr *hdr;
struct hf_sock *hsock = hsock_from_sk(sk);
size_t payload_max_len = HF_MAILBOX_SIZE - sizeof(struct hf_msg_hdr);
/* Check length. */
if (len > payload_max_len)
return -EMSGSIZE;
/* We don't allow the destination address to be specified. */
if (m->msg_namelen > 0)
return -EISCONN;
/* We don't support out of band messages. */
if (m->msg_flags & MSG_OOB)
return -EOPNOTSUPP;
/*
* Ensure that the socket is connected. We don't need to hold the socket
* lock (acquired and released by hf_sock_is_connected) for the
* remainder of the function because the fields we care about are
* immutable once the state is SS_CONNECTED.
*/
if (!hf_sock_is_connected(sock))
return -ENOTCONN;
/*
* Allocate an skb for this write. If there isn't enough room in the
* socket's send buffer (sk_wmem_alloc >= sk_sndbuf), this will block
* (if it's a blocking call). On success, it increments sk_wmem_alloc
* and sets up the skb such that sk_wmem_alloc gets decremented when
* the skb is freed (sock_wfree gets called).
*/
skb = sock_alloc_send_skb(sk, len + sizeof(struct hf_msg_hdr),
m->msg_flags & MSG_DONTWAIT, &err);
if (!skb)
return err;
/* Reserve room for the header and initialise it. */
skb_reserve(skb, sizeof(struct hf_msg_hdr));
hdr = skb_push(skb, sizeof(struct hf_msg_hdr));
hdr->src_port = hsock->local_port;
hdr->dst_port = hsock->remote_port;
/* Allocate area for the contents, then copy into skb. */
if (!copy_from_iter_full(skb_put(skb, len), len, &m->msg_iter)) {
err = -EFAULT;
goto err_cleanup;
}
/*
* TODO: We currently do this inline, but when we have support for
* readiness notification from Hafnium, we must add this to a per-VM tx
* queue that can make progress when the VM becomes writable. This will
* fix send buffering and poll readiness notification.
*/
err = hf_send_skb(skb);
if (err)
goto err_cleanup;
return 0;
err_cleanup:
kfree_skb(skb);
return err;
}
/**
* Receives a message originating from the VM and port the socket is connected
* to.
* All variants of read/recv/recvfrom/recvmsg eventually call this function.
*/
static int hf_sock_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
int flags)
{
struct sock *sk = sock->sk;
struct sk_buff *skb;
int err;
size_t copy_len;
if (!hf_sock_is_connected(sock))
return -ENOTCONN;
/* Grab the next skb from the receive queue. */
skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
if (!skb)
return err;
/* Make sure we don't copy more than what fits in the output buffer. */
copy_len = skb->len;
if (copy_len > len) {
copy_len = len;
m->msg_flags |= MSG_TRUNC;
}
/* Make sure we don't overflow the return value type. */
if (copy_len > INT_MAX) {
copy_len = INT_MAX;
m->msg_flags |= MSG_TRUNC;
}
/* Copy skb to output iterator, then free it. */
err = skb_copy_datagram_msg(skb, 0, m, copy_len);
skb_free_datagram(sk, skb);
if (err)
return err;
return copy_len;
}
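/*
* As with other datagram sockets, truncation is not recoverable: if a
* 100-byte message is read into a 10-byte buffer, the call returns 10
* with MSG_TRUNC set in msg_flags and the remaining 90 bytes are
* discarded when the skb is freed above.
*/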
/**
* This function is called when a Hafnium socket is created. It initialises all
* state such that the caller will be able to connect the socket and then send
* and receive messages through it.
*/
static int hf_sock_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
static const struct proto_ops ops = {
.family = PF_HF,
.owner = THIS_MODULE,
.release = hf_sock_release,
.bind = sock_no_bind,
.connect = hf_sock_connect,
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.ioctl = sock_no_ioctl,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = sock_no_setsockopt,
.getsockopt = sock_no_getsockopt,
.sendmsg = hf_sock_sendmsg,
.recvmsg = hf_sock_recvmsg,
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
.poll = datagram_poll,
};
struct sock *sk;
if (sock->type != SOCK_DGRAM)
return -ESOCKTNOSUPPORT;
if (protocol != 0)
return -EPROTONOSUPPORT;
/*
* For now we only allow callers with sys admin capability to create
* Hafnium sockets.
*/
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
/* Allocate and initialise socket. */
sk = sk_alloc(net, PF_HF, GFP_KERNEL, &hf_sock_proto, kern);
if (!sk)
return -ENOMEM;
sock_init_data(sock, sk);
sk->sk_destruct = hf_sock_destruct;
sock->ops = &ops;
sock->state = SS_UNCONNECTED;
return 0;
}
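/*
* A minimal user-space sketch (hypothetical: AF_HF/PF_HF and the vm_id
* and port fields of struct hf_sockaddr are used as in hf_sock_connect()
* above; other names here are assumed):
*
* int fd = socket(PF_HF, SOCK_DGRAM, 0);
* struct hf_sockaddr addr = {
* .family = AF_HF, // name of the family field is assumed
* .vm_id = 1, // first secondary VM, assuming HF_VM_ID_OFFSET == 0
* .port = 10,
* };
* connect(fd, (struct sockaddr *)&addr, sizeof(addr));
* send(fd, buf, len, 0); // datagram to the connected VM/port
* recv(fd, buf, sizeof(buf), 0);
*/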
/**
* Frees all resources, including threads, associated with the Hafnium driver.
*/
static void hf_free_resources(void)
{
uint16_t i;
spci_vcpu_index_t j;
/*
* First stop all worker threads. We need to do this before freeing
* resources because workers may reference each other, so it is only
* safe to free resources after they have all stopped.
*/
for (i = 0; i < hf_vm_count; i++) {
struct hf_vm *vm = &hf_vms[i];
for (j = 0; j < vm->vcpu_count; j++)
kthread_stop(vm->vcpu[j].task);
}
/* Free resources. */
for (i = 0; i < hf_vm_count; i++) {
struct hf_vm *vm = &hf_vms[i];
for (j = 0; j < vm->vcpu_count; j++)
put_task_struct(vm->vcpu[j].task);
kfree(vm->vcpu);
}
kfree(hf_vms);
}
/**
* Handles the hypervisor timer interrupt.
*/
static irqreturn_t hf_nop_irq_handler(int irq, void *dev)
{
/*
* No need to do anything, the interrupt only exists to return to the
* primary vCPU so that the virtual timer will be restored and fire as
* normal.
*/
return IRQ_HANDLED;
}
/**
* Enables the hypervisor timer interrupt on a CPU, when it starts or after the
* driver is first loaded.
*/
static int hf_starting_cpu(unsigned int cpu)
{
if (hf_irq != 0) {
/* Enable the interrupt, and set it to be edge-triggered. */
enable_percpu_irq(hf_irq, IRQ_TYPE_EDGE_RISING);
}
return 0;
}
/**
* Disables the hypervisor timer interrupt on a CPU when it is powered down.
*/
static int hf_dying_cpu(unsigned int cpu)
{
if (hf_irq != 0) {
/* Disable the interrupt while the CPU is asleep. */
disable_percpu_irq(hf_irq);
}
return 0;
}
/**
* Registers for the hypervisor timer interrupt.
*/
static int hf_int_driver_probe(struct platform_device *pdev)
{
int irq;
int ret;
/*
* Register a handler for the hypervisor timer IRQ, as it is needed for
* Hafnium to emulate the virtual timer for Linux while a secondary vCPU
* is running.
*/
irq = platform_get_irq(pdev, ARCH_TIMER_HYP_PPI);
if (irq < 0) {
pr_err("Error getting hypervisor timer IRQ: %d\n", irq);
return irq;
}
hf_irq = irq;
ret = request_percpu_irq(irq, hf_nop_irq_handler, HYPERVISOR_TIMER_NAME,
pdev);
if (ret != 0) {
pr_err("Error registering hypervisor timer IRQ %d: %d\n",
irq, ret);
return ret;
}
pr_info("Hafnium registered for IRQ %d\n", irq);
ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
"hafnium/hypervisor_timer:starting",
hf_starting_cpu, hf_dying_cpu);
if (ret < 0) {
pr_err("Error enabling timer on all CPUs: %d\n", ret);
free_percpu_irq(irq, pdev);
return ret;
}
hf_cpuhp_state = ret;
return 0;
}
/**
* Unregisters for the hypervisor timer interrupt.
*/
static int hf_int_driver_remove(struct platform_device *pdev)
{
/*
* This will cause hf_dying_cpu to be called on each CPU, which will
* disable the IRQs.
*/
cpuhp_remove_state(hf_cpuhp_state);
free_percpu_irq(hf_irq, pdev);
return 0;
}
static const struct of_device_id hf_int_driver_id[] = {
{.compatible = "arm,armv7-timer"},
{.compatible = "arm,armv8-timer"},
{}
};
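/*
* The table above matches the same device-tree nodes as the architected
* timer driver: binding to that node is how hf_int_driver_probe() can
* look up the timer's per-CPU hypervisor PPI via
* platform_get_irq(pdev, ARCH_TIMER_HYP_PPI).
*/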
static struct platform_driver hf_int_driver = {
.driver = {
.name = HYPERVISOR_TIMER_NAME,
.owner = THIS_MODULE,
.of_match_table = of_match_ptr(hf_int_driver_id),
},
.probe = hf_int_driver_probe,
.remove = hf_int_driver_remove,
};
/**
* Initializes the Hafnium driver by creating a thread for each vCPU of each
* virtual machine.
*/
static int __init hf_init(void)
{
static const struct net_proto_family proto_family = {
.family = PF_HF,
.create = hf_sock_create,
.owner = THIS_MODULE,
};
int64_t ret;
struct spci_value spci_ret;
spci_vm_id_t i;
spci_vcpu_index_t j;
spci_vm_count_t secondary_vm_count;
uint32_t total_vcpu_count;
/* Allocate a page for send and receive buffers. */
hf_send_page = alloc_page(GFP_KERNEL);
if (!hf_send_page) {
pr_err("Unable to allocate send buffer\n");
return -ENOMEM;
}
hf_recv_page = alloc_page(GFP_KERNEL);
if (!hf_recv_page) {
__free_page(hf_send_page);
pr_err("Unable to allocate receive buffer\n");
return -ENOMEM;
}
/*
* Configure both addresses. Once configured, we cannot free these pages
* because the hypervisor will use them, even if the module is
* unloaded.
*/
spci_ret = spci_rxtx_map(page_to_phys(hf_send_page),
page_to_phys(hf_recv_page));
if (spci_ret.func != SPCI_SUCCESS_32) {
__free_page(hf_send_page);
__free_page(hf_recv_page);
pr_err("Unable to configure VM\n");
if (spci_ret.func == SPCI_ERROR_32)
pr_err("SPCI error code %d\n", (int)spci_ret.arg2);
else
pr_err("Unexpected SPCI function %#x\n",
(uint32_t)spci_ret.func);
return -EIO;
}
/* Get the number of secondary VMs. */
secondary_vm_count = hf_vm_get_count() - 1;
/* Confirm the maximum number of VMs looks sane. */
BUILD_BUG_ON(CONFIG_HAFNIUM_MAX_VMS < 1);
BUILD_BUG_ON(CONFIG_HAFNIUM_MAX_VMS > U16_MAX);
/* Validate the number of VMs. There must be at least the primary. */
if (secondary_vm_count > CONFIG_HAFNIUM_MAX_VMS - 1) {
pr_err("Number of VMs is out of range: %d\n",
secondary_vm_count);
return -EDQUOT;
}
/* Only track the secondary VMs. */
hf_vms = kmalloc_array(secondary_vm_count, sizeof(struct hf_vm),
GFP_KERNEL);
if (!hf_vms)
return -ENOMEM;
/* Cache the VM id for later usage. */
current_vm_id = hf_vm_get_id();
/* Initialize each VM. */
total_vcpu_count = 0;
for (i = 0; i < secondary_vm_count; i++) {
struct hf_vm *vm = &hf_vms[i];
/* Signed so a negative error return from the hypercall is visible. */
int64_t vcpu_count;
/* Adjust the ID as only the secondaries are tracked. */
vm->id = i + FIRST_SECONDARY_VM_ID;
vcpu_count = hf_vcpu_get_count(vm->id);
if (vcpu_count < 0) {
pr_err("HF_VCPU_GET_COUNT failed for vm=%u: %lld\n",
vm->id, vcpu_count);
ret = -EIO;
goto fail_with_cleanup;
}
/* Avoid overflowing the vcpu count. */
if (vcpu_count > (U32_MAX - total_vcpu_count)) {
pr_err("Too many vcpus: %u\n", total_vcpu_count);
ret = -EDQUOT;
goto fail_with_cleanup;
}
/* Confirm the maximum number of VCPUs looks sane. */
BUILD_BUG_ON(CONFIG_HAFNIUM_MAX_VCPUS < 1);
BUILD_BUG_ON(CONFIG_HAFNIUM_MAX_VCPUS > U16_MAX);
/* Enforce the limit on vcpus. */
total_vcpu_count += vcpu_count;
if (total_vcpu_count > CONFIG_HAFNIUM_MAX_VCPUS) {
pr_err("Too many vcpus: %u\n", total_vcpu_count);
ret = -EDQUOT;
goto fail_with_cleanup;
}
vm->vcpu_count = vcpu_count;
vm->vcpu = kmalloc_array(vm->vcpu_count, sizeof(struct hf_vcpu),
GFP_KERNEL);
if (!vm->vcpu) {
ret = -ENOMEM;
goto fail_with_cleanup;
}
/* Update the number of initialized VMs. */
hf_vm_count = i + 1;
/* Create a kernel thread for each vcpu. */
for (j = 0; j < vm->vcpu_count; j++) {
struct hf_vcpu *vcpu = &vm->vcpu[j];
vcpu->task =
kthread_create(hf_vcpu_thread, vcpu,
"vcpu_thread_%u_%u", vm->id, j);
if (IS_ERR(vcpu->task)) {
pr_err("Error creating task (vm=%u,vcpu=%u): %ld\n",
vm->id, j, PTR_ERR(vcpu->task));
vm->vcpu_count = j;
ret = PTR_ERR(vcpu->task);
goto fail_with_cleanup;
}
get_task_struct(vcpu->task);
vcpu->vm = vm;
vcpu->vcpu_index = j;
atomic_set(&vcpu->abort_sleep, 0);
atomic_set(&vcpu->waiting_for_message, 0);
}
}
/* Register protocol and socket family. */
ret = proto_register(&hf_sock_proto, 0);
if (ret) {
pr_err("Unable to register protocol: %lld\n", ret);
goto fail_with_cleanup;
}
ret = sock_register(&proto_family);
if (ret) {
pr_err("Unable to register Hafnium's socket family: %lld\n",
ret);
goto fail_unregister_proto;
}
/*
* Register as a driver for the timer device, so we can register a
* handler for the hypervisor timer IRQ.
*/
ret = platform_driver_register(&hf_int_driver);
if (ret != 0) {
pr_err("Error registering timer driver %lld\n", ret);
goto fail_unregister_socket;
}
/*
* Start running threads now that all is initialized.
*
* Any failures from this point on must also unregister the driver with
* platform_driver_unregister().
*/
for (i = 0; i < hf_vm_count; i++) {
struct hf_vm *vm = &hf_vms[i];
for (j = 0; j < vm->vcpu_count; j++)
wake_up_process(vm->vcpu[j].task);
}
/* Dump vm/vcpu count info. */
pr_info("Hafnium successfully loaded with %u VMs:\n", hf_vm_count);
for (i = 0; i < hf_vm_count; i++) {
struct hf_vm *vm = &hf_vms[i];
pr_info("\tVM %u: %u vCPUS\n", vm->id, vm->vcpu_count);
}
return 0;
fail_unregister_socket:
sock_unregister(PF_HF);
fail_unregister_proto:
proto_unregister(&hf_sock_proto);
fail_with_cleanup:
hf_free_resources();
return ret;
}
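/*
* Initialisation order above: map the shared send/receive pages, count
* and validate the secondary VMs, create (but do not yet start) one
* kernel thread per vCPU, register the socket protocol and family, bind
* to the timer device, and only then wake the vCPU threads. The failure
* labels unwind in reverse order of registration.
*/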
/**
* Frees up all resources used by the Hafnium driver in preparation for
* unloading it.
*/
static void __exit hf_exit(void)
{
pr_info("Preparing to unload Hafnium\n");
sock_unregister(PF_HF);
proto_unregister(&hf_sock_proto);
hf_free_resources();
platform_driver_unregister(&hf_int_driver);
pr_info("Hafnium ready to unload\n");
}
MODULE_LICENSE("GPL v2");
module_init(hf_init);
module_exit(hf_exit);