Add communication channel multiplexing.

The concept of ports is introduced to allow VMs to differentiate
communication channels within the transport offered by Hafnium.

The implementation is exposed to user space as datagram sockets. Only
processes with the CAP_SYS_ADMIN capability are currently allowed to
create them.
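
For illustration, a minimal user-space client could look like the
sketch below. The vm_id and port values are hypothetical, the address
family value mirrors the AF_ECONET aliasing in main.c, and the process
needs CAP_SYS_ADMIN to create the socket:

  #include <stdint.h>
  #include <sys/socket.h>
  #include <unistd.h>

  /* Mirrors the kernel definitions; AF_HF is aliased to AF_ECONET. */
  #define AF_HF 19

  struct sockaddr_hf {
          sa_family_t family;
          uint32_t vm_id;
          uint64_t port;
  };

  int main(void)
  {
          struct sockaddr_hf addr = {
                  .family = AF_HF,
                  .vm_id = 1,  /* hypothetical secondary VM */
                  .port = 10,  /* hypothetical remote port */
          };
          char buf[64];
          ssize_t n;
          int fd = socket(AF_HF, SOCK_DGRAM, 0);

          if (fd < 0)
                  return 1;
          if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
                  return 1;

          /* Datagrams now flow to and from (VM 1, port 10). */
          if (write(fd, "ping", 4) < 0)
                  return 1;
          n = read(fd, buf, sizeof(buf));
          close(fd);
          return n < 0;
  }

Note that there is no bind(): the local port is assigned automatically
when the socket is connected.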

Change-Id: I3ad3b6c297d1b23878b858ade00dbfeb06cc93a1
diff --git a/main.c b/main.c
index 26c5cab..11bbc2e 100644
--- a/main.c
+++ b/main.c
@@ -16,6 +16,7 @@
  */
 
 #include <linux/hrtimer.h>
+#include <linux/atomic.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/kthread.h>
@@ -23,14 +24,21 @@
 #include <linux/module.h>
 #include <linux/sched/task.h>
 #include <linux/slab.h>
+#include <linux/net.h>
+#include <net/sock.h>
 
 #include <hf/call.h>
 
+/* TODO: Reusing AF_ECONET for now as it's otherwise unused. */
+#define AF_HF AF_ECONET
+#define PF_HF AF_HF
+
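+/* Interrupt ID injected into a VM to notify it of a pending message. */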
+#define MESSAGE_INT_ID 1
+
 #define CONFIG_HAFNIUM_MAX_VMS   16
 #define CONFIG_HAFNIUM_MAX_VCPUS 32
 
 struct hf_vcpu {
-	spinlock_t lock;
 	struct hf_vm *vm;
 	uint32_t vcpu_index;
 	struct task_struct *task;
@@ -44,10 +52,44 @@
 	struct hf_vcpu *vcpu;
 };
 
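+/*
+ * Header prepended to every datagram so that the receiving end can
+ * demultiplex it: dst_port selects the local socket and src_port is matched
+ * against that socket's connected peer.
+ */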
+struct hf_msg_hdr {
+	uint64_t src_port;
+	uint64_t dst_port;
+};
+
+struct hf_sock {
+	/* This needs to be the first field. */
+	struct sock sk;
+
+	/*
+	 * The following fields are immutable after the socket transitions to
+	 * SS_CONNECTED state.
+	 */
+	uint64_t local_port;
+	uint64_t remote_port;
+	struct hf_vm *peer_vm;
+};
+
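+/* Address of a socket endpoint: a (VM id, port) pair. */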
+struct sockaddr_hf {
+	sa_family_t family;
+	uint32_t vm_id;
+	uint64_t port;
+};
+
+static struct proto hf_sock_proto = {
+	.name = "hafnium",
+	.owner = THIS_MODULE,
+	.obj_size = sizeof(struct hf_sock),
+};
+
 static struct hf_vm *hf_vms;
 static uint32_t hf_vm_count;
-static struct page *hf_send_page = NULL;
-static struct page *hf_recv_page = NULL;
+static struct page *hf_send_page;
+static struct page *hf_recv_page;
+static atomic64_t hf_next_port = ATOMIC64_INIT(0);
+static DEFINE_SPINLOCK(hf_send_lock);
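+/* Maps local ports to sockets: RCU for lookups, spinlock for updates. */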
+static DEFINE_HASHTABLE(hf_local_port_hash, 7);
+static DEFINE_SPINLOCK(hf_local_port_hash_lock);
 
 /**
  * Wakes up the kernel thread responsible for running the given vcpu.
@@ -98,6 +140,118 @@
 }
 
 /**
+ * Handles a message delivered to this VM by validating that it's well-formed
+ * and then queueing it for delivery to the appropriate socket.
+ */
+static void hf_handle_message(struct hf_vm *sender, const void *ptr, size_t len)
+{
+	struct hf_sock *hsock;
+	const struct hf_msg_hdr *hdr = ptr;
+	struct sk_buff *skb;
+	int err;
+
+	/* Ignore messages that are too small to hold a header. */
+	if (len < sizeof(struct hf_msg_hdr))
+		return;
+
+	len -= sizeof(struct hf_msg_hdr);
+
+	/* Walk the sockets hashed to this bucket, looking for a match. */
+	rcu_read_lock();
+	hash_for_each_possible_rcu(hf_local_port_hash, hsock, sk.sk_node,
+				   hdr->dst_port) {
+		if (hsock->peer_vm == sender &&
+		    hsock->remote_port == hdr->src_port) {
+			sock_hold(&hsock->sk);
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	/* Nothing to do if we couldn't find the target. */
+	if (!hsock)
+		return;
+
+	/*
+	 * TODO: From this point on, there are two failure paths: when we
+	 * create the skb below, and when we enqueue it to the socket. What
+	 * should we do if they fail? Ideally we would have some form of flow
+	 * control to prevent message loss, but how to do it efficiently?
+	 *
+	 * One option is to have a pre-allocated message that indicates to the
+	 * sender that a message was dropped. This way we guarantee that the
+	 * sender will be aware of loss and should back off.
+	 */
+	/* Create the skb. */
+	skb = alloc_skb(len, GFP_KERNEL);
+	if (!skb)
+		goto exit;
+
+	memcpy(skb_put(skb, len), hdr + 1, len);
+
+	/*
+	 * Add the skb to the receive queue of the target socket. On success it
+	 * calls sk->sk_data_ready, which is currently set to sock_def_readable,
+	 * which wakes up any waiters.
+	 */
+	err = sock_queue_rcv_skb(&hsock->sk, skb);
+	if (err)
+		kfree_skb(skb);
+
+exit:
+	sock_put(&hsock->sk);
+}
+
+/**
+ * This function is called when Hafnium requests that the primary VM wake up a
+ * vCPU that belongs to a secondary VM.
+ *
+ * It wakes up the thread if it's sleeping, or kicks it if it's already running.
+ *
+ * If the vCPU is HF_INVALID_VCPU, it instead injects a MESSAGE_INT_ID
+ * interrupt into a vCPU belonging to the specified VM.
+ */
+static void hf_handle_wake_up_request(uint32_t vm_id, uint16_t vcpu)
+{
+	struct hf_vm *vm;
+
+	if (vm_id > hf_vm_count) {
+		pr_warn("Request to wake up non-existent VM id: %u\n", vm_id);
+		return;
+	}
+
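+	/* hf_vms holds only secondary VMs, indexed by vm_id - 1. */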
+	vm = &hf_vms[vm_id - 1];
+	if (vcpu >= vm->vcpu_count) {
+		int64_t ret;
+
+		if (vcpu != HF_INVALID_VCPU) {
+			pr_warn("Request to wake up non-existent vCPU: %u.%u\n",
+				vm_id, vcpu);
+			return;
+		}
+
+		/*
+		 * TODO: For now we're picking the first vcpu to interrupt, but
+		 * we want to be smarter.
+		 */
+		vcpu = 0;
+		ret = hf_inject_interrupt(vm_id, vcpu, MESSAGE_INT_ID);
+		if (ret != 1) {
+			/* We don't need to wake up the vcpu. */
+			return;
+		}
+	}
+
+	if (hf_vcpu_wake_up(&vm->vcpu[vcpu]) == 0) {
+		/*
+		 * The task was already running (presumably on a different
+		 * physical CPU); interrupt it. This gives Hafnium a chance to
+		 * inject any new interrupts.
+		 */
+		kick_process(vm->vcpu[vcpu].task);
+	}
+}
+
+/**
  * This is the main loop of each vcpu.
  */
 static int hf_vcpu_thread(void *data)
@@ -130,43 +284,20 @@
 
 		/* Wake up another vcpu. */
 		case HF_VCPU_RUN_WAKE_UP:
-			{
-				struct hf_vm *vm;
-				if (ret.wake_up.vm_id > hf_vm_count)
-					break;
-				vm = &hf_vms[ret.wake_up.vm_id - 1];
-				if (ret.wake_up.vcpu < vm->vcpu_count) {
-					if (hf_vcpu_wake_up(&vm->vcpu[ret.wake_up.vcpu]) == 0) {
-						/*
-						 * The task was already running (presumably on a
-						 * different physical CPU); interrupt it. This gives
-						 * Hafnium a chance to inject any new interrupts.
-						 */
-						kick_process(vm->vcpu[ret.wake_up.vcpu].task);
-					}
-				} else if (ret.wake_up.vcpu == HF_INVALID_VCPU) {
-					/* TODO: pick one to interrupt. */
-					pr_warning("No vcpu to wake.");
-				}
-			}
+			hf_handle_wake_up_request(ret.wake_up.vm_id,
+						  ret.wake_up.vcpu);
 			break;
 
 		/* Response available. */
 		case HF_VCPU_RUN_MESSAGE:
-			{
-				uint32_t i;
-				const char *buf = page_address(hf_recv_page);
-				pr_info("Received response from vm %u (%u bytes): ",
-					vcpu->vm->id, ret.message.size);
-				for (i = 0; i < ret.message.size; i++)
-					printk(KERN_CONT "%c", buf[i]);
-				printk(KERN_CONT "\n");
-				hf_mailbox_clear();
-			}
+			hf_handle_message(vcpu->vm, page_address(hf_recv_page),
+					  ret.message.size);
+			hf_mailbox_clear();
 			break;
 
 		case HF_VCPU_RUN_SLEEP:
-			hrtimer_start(&vcpu->timer, ret.sleep.ns, HRTIMER_MODE_REL);
+			hrtimer_start(&vcpu->timer, ret.sleep.ns,
+				      HRTIMER_MODE_REL);
 			hf_vcpu_sleep(vcpu);
 			hrtimer_cancel(&vcpu->timer);
 			break;
@@ -177,6 +308,350 @@
 }
 
 /**
+ * Converts a pointer to a struct sock into a pointer to a struct hf_sock. It
+ * relies on the fact that the first field of hf_sock is a sock.
+ */
+static struct hf_sock *hsock_from_sk(struct sock *sk)
+{
+	return (struct hf_sock *)sk;
+}
+
+/**
+ * This is called when the last reference to the outer socket is released. For
+ * example, if it's a user-space socket, when the last file descriptor pointing
+ * to this socket is closed.
+ *
+ * It begins cleaning up resources, though some can only be cleaned up after all
+ * references to the underlying socket are released, which is handled by
+ * hf_sock_destruct().
+ */
+static int hf_sock_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct hf_sock *hsock = hsock_from_sk(sk);
+	unsigned long flags;
+
+	if (!sk)
+		return 0;
+
+	/* Shutdown for both send and receive. */
+	lock_sock(sk);
+	sk->sk_shutdown |= RCV_SHUTDOWN | SEND_SHUTDOWN;
+	sk->sk_state_change(sk);
+	release_sock(sk);
+
+	/* Remove from the hash table, so lookups from now on won't find it. */
+	spin_lock_irqsave(&hf_local_port_hash_lock, flags);
+	hash_del_rcu(&hsock->sk.sk_node);
+	spin_unlock_irqrestore(&hf_local_port_hash_lock, flags);
+
+	/*
+	 * TODO: When we implement a tx queue, we need to clear it here so that
+	 * sk_wmem_alloc will not prevent sk from being freed (sk_free).
+	 */
+
+	/*
+	 * Wait for in-flight lookups to finish. We need to do this here because
+	 * in-flight lookups rely on the reference to the socket we're about to
+	 * release.
+	 */
+	synchronize_rcu();
+	sock_put(sk);
+	sock->sk = NULL;
+
+	return 0;
+}
+
+/**
+ * This is called when there are no more references to the socket. It frees all
+ * resources that haven't been freed during release.
+ */
+static void hf_sock_destruct(struct sock *sk)
+{
+	/*
+	 * Clear the receive queue now that the handler cannot add any more
+	 * skbs to it.
+	 */
+	skb_queue_purge(&sk->sk_receive_queue);
+}
+
+/**
+ * Connects the Hafnium socket to the provided VM and port. After the socket is
+ * connected, it can be used to exchange datagrams with the specified peer.
+ */
+static int hf_sock_connect(struct socket *sock, struct sockaddr *saddr,
+			   int len, int connect_flags)
+{
+	struct sock *sk = sock->sk;
+	struct hf_sock *hsock = hsock_from_sk(sk);
+	struct hf_vm *vm;
+	struct sockaddr_hf *addr;
+	int err;
+	unsigned long flags;
+
+	/* Basic address validation. */
+	if (len < sizeof(struct sockaddr_hf) || saddr->sa_family != AF_HF)
+		return -EINVAL;
+
+	addr = (struct sockaddr_hf *)saddr;
+	if (addr->vm_id > hf_vm_count)
+		return -ENETUNREACH;
+
+	vm = &hf_vms[addr->vm_id - 1];
+
+	/*
+	 * TODO: Once we implement access control in Hafnium, check that the
+	 * caller is allowed to contact the specified VM. Return -ECONNREFUSED
+	 * if access is denied.
+	 */
+
+	/* Take lock to make sure state doesn't change as we connect. */
+	lock_sock(sk);
+
+	/* Only unconnected sockets are allowed to become connected. */
+	if (sock->state != SS_UNCONNECTED) {
+		err = -EISCONN;
+		goto exit;
+	}
+
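+	/*
+	 * Allocate a fresh local port from a global monotonic counter; ports
+	 * are not reused for the lifetime of the module.
+	 */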
+	hsock->local_port = atomic64_inc_return(&hf_next_port);
+	hsock->remote_port = addr->port;
+	hsock->peer_vm = vm;
+
+	sock->state = SS_CONNECTED;
+
+	/* Add socket to hash table now that it's fully initialised. */
+	spin_lock_irqsave(&hf_local_port_hash_lock, flags);
+	hash_add_rcu(hf_local_port_hash, &sk->sk_node, hsock->local_port);
+	spin_unlock_irqrestore(&hf_local_port_hash_lock, flags);
+
+	err = 0;
+exit:
+	release_sock(sk);
+	return err;
+}
+
+/**
+ * Sends the given skb to the appropriate VM by calling Hafnium. It also
+ * wakes up a vCPU of the recipient VM to handle the new message.
+ *
+ * Takes ownership of the skb on success.
+ */
+static int hf_send_skb(struct sk_buff *skb)
+{
+	unsigned long flags;
+	int64_t ret;
+	struct hf_sock *hsock = hsock_from_sk(skb->sk);
+	struct hf_vm *vm = hsock->peer_vm;
+
+	/*
+	 * Call Hafnium under the send lock so that we serialize the use of the
+	 * global send buffer.
+	 */
+	spin_lock_irqsave(&hf_send_lock, flags);
+	memcpy(page_address(hf_send_page), skb->data, skb->len);
+	ret = hf_mailbox_send(vm->id, skb->len);
+	spin_unlock_irqrestore(&hf_send_lock, flags);
+
+	if (ret < 0)
+		return -EAGAIN;
+
+	/* Wake some vcpu up to handle the new message. */
+	hf_handle_wake_up_request(vm->id, ret);
+
+	kfree_skb(skb);
+
+	return 0;
+}
+
+/**
+ * Determines if the given socket is in the connected state. It acquires and
+ * releases the socket lock.
+ */
+static bool hf_sock_is_connected(struct socket *sock)
+{
+	bool ret;
+
+	lock_sock(sock->sk);
+	ret = sock->state == SS_CONNECTED;
+	release_sock(sock->sk);
+
+	return ret;
+}
+
+/**
+ * Sends a message to the VM & port the socket is connected to. All variants
+ * of write/send/sendto/sendmsg eventually call this function.
+ */
+static int hf_sock_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct sk_buff *skb;
+	int err;
+	struct hf_msg_hdr *hdr;
+	struct hf_sock *hsock = hsock_from_sk(sk);
+
+	/* Check length. */
+	if (len > HF_MAILBOX_SIZE - sizeof(struct hf_msg_hdr))
+		return -EMSGSIZE;
+
+	/* We don't allow the destination address to be specified. */
+	if (m->msg_namelen > 0)
+		return -EISCONN;
+
+	/* We don't support out of band messages. */
+	if (m->msg_flags & MSG_OOB)
+		return -EOPNOTSUPP;
+
+	/*
+	 * Ensure that the socket is connected. We don't need to hold the socket
+	 * lock (acquired and released by hf_sock_is_connected) for the
+	 * remainder of the function because the fields we care about are
+	 * immutable once the state is SS_CONNECTED.
+	 */
+	if (!hf_sock_is_connected(sock))
+		return -ENOTCONN;
+
+	/*
+	 * Allocate an skb for this write. If there isn't enough room in the
+	 * socket's send buffer (sk_wmem_alloc >= sk_sndbuf), this will block
+	 * (if it's a blocking call). On success, it increments sk_wmem_alloc
+	 * and sets up the skb such that sk_wmem_alloc gets decremented when
+	 * the skb is freed (sock_wfree gets called).
+	 */
+	skb = sock_alloc_send_skb(sk, len + sizeof(struct hf_msg_hdr),
+				  m->msg_flags & MSG_DONTWAIT, &err);
+	if (!skb)
+		return err;
+
+	/* Reserve room for the header and initialise it. */
+	skb_reserve(skb, sizeof(struct hf_msg_hdr));
+	hdr = skb_push(skb, sizeof(struct hf_msg_hdr));
+	hdr->src_port = hsock->local_port;
+	hdr->dst_port = hsock->remote_port;
+
+	/* Allocate area for the contents, then copy into skb. */
+	if (!copy_from_iter_full(skb_put(skb, len), len, &m->msg_iter)) {
+		err = -EFAULT;
+		goto err_cleanup;
+	}
+
+	/*
+	 * TODO: We currently do this inline, but when we have support for
+	 * readiness notification from Hafnium, we must add this to a per-VM tx
+	 * queue that can make progress when the VM becomes writable. This will
+	 * fix send buffering and poll readiness notification.
+	 */
+	err = hf_send_skb(skb);
+	if (err)
+		goto err_cleanup;
+
+	return 0;
+
+err_cleanup:
+	kfree_skb(skb);
+	return err;
+}
+
+/**
+ * Receives a message originating from the VM & port the socket is connected
+ * All variants of read/recv/recvfrom/recvmsg eventually call this function.
+ */
+static int hf_sock_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
+			   int flags)
+{
+	struct sock *sk = sock->sk;
+	struct sk_buff *skb;
+	int err;
+	size_t copy_len;
+
+	if (!hf_sock_is_connected(sock))
+		return -ENOTCONN;
+
+	/* Grab the next skb from the receive queue. */
+	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
+	if (!skb)
+		return err;
+
+	/* Make sure we don't copy more than what fits in the output buffer. */
+	copy_len = skb->len;
+	if (copy_len > len) {
+		copy_len = len;
+		m->msg_flags |= MSG_TRUNC;
+	}
+
+	/* Make sure we don't overflow the return value type. */
+	if (copy_len > INT_MAX) {
+		copy_len = INT_MAX;
+		m->msg_flags |= MSG_TRUNC;
+	}
+
+	/* Copy skb to output iterator, then free it. */
+	err = skb_copy_datagram_msg(skb, 0, m, copy_len);
+	skb_free_datagram(sk, skb);
+	if (err)
+		return err;
+
+	return copy_len;
+}
+
+/**
+ * This function is called when a Hafnium socket is created. It initialises all
+ * state such that the caller will be able to connect the socket and then send
+ * and receive messages through it.
+ */
+static int hf_sock_create(struct net *net, struct socket *sock, int protocol,
+			  int kern)
+{
+	static const struct proto_ops ops = {
+		.family = PF_HF,
+		.owner = THIS_MODULE,
+		.release = hf_sock_release,
+		.bind = sock_no_bind,
+		.connect = hf_sock_connect,
+		.socketpair = sock_no_socketpair,
+		.accept = sock_no_accept,
+		.ioctl = sock_no_ioctl,
+		.listen = sock_no_listen,
+		.shutdown = sock_no_shutdown,
+		.setsockopt = sock_no_setsockopt,
+		.getsockopt = sock_no_getsockopt,
+		.sendmsg = hf_sock_sendmsg,
+		.recvmsg = hf_sock_recvmsg,
+		.mmap = sock_no_mmap,
+		.sendpage = sock_no_sendpage,
+		.poll = datagram_poll,
+	};
+	struct sock *sk;
+
+	if (sock->type != SOCK_DGRAM)
+		return -ESOCKTNOSUPPORT;
+
+	if (protocol != 0)
+		return -EPROTONOSUPPORT;
+
+	/*
+	 * For now we only allow callers with sys admin capability to create
+	 * Hafnium sockets.
+	 */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* Allocate and initialise socket. */
+	sk = sk_alloc(net, PF_HF, GFP_KERNEL, &hf_sock_proto, kern);
+	if (!sk)
+		return -ENOMEM;
+
+	sock_init_data(sock, sk);
+
+	sk->sk_destruct = hf_sock_destruct;
+	sock->ops = &ops;
+	sock->state = SS_UNCONNECTED;
+
+	return 0;
+}
+
+/**
  * Frees all resources, including threads, associated with the Hafnium driver.
  */
 static void hf_free_resources(void)
@@ -190,6 +665,7 @@
 	 */
 	for (i = 0; i < hf_vm_count; i++) {
 		struct hf_vm *vm = &hf_vms[i];
+
 		for (j = 0; j < vm->vcpu_count; j++)
 			kthread_stop(vm->vcpu[j].task);
 	}
@@ -197,6 +673,7 @@
 	/* Free resources. */
 	for (i = 0; i < hf_vm_count; i++) {
 		struct hf_vm *vm = &hf_vms[i];
+
 		for (j = 0; j < vm->vcpu_count; j++)
 			put_task_struct(vm->vcpu[j].task);
 		kfree(vm->vcpu);
@@ -205,68 +682,17 @@
 	kfree(hf_vms);
 }
 
-static ssize_t hf_send_store(struct kobject *kobj, struct kobj_attribute *attr,
-			     const char *buf, size_t count)
-{
-	int64_t ret;
-	struct hf_vm *vm;
-
-	count = min_t(size_t, count, HF_MAILBOX_SIZE);
-
-	/* Copy data to send buffer. */
-	memcpy(page_address(hf_send_page), buf, count);
-
-	vm = &hf_vms[0];
-	ret = hf_mailbox_send(vm->id, count);
-	if (ret < 0)
-		return -EAGAIN;
-
-	if (ret == HF_INVALID_VCPU) {
-		/*
-		 * TODO: We need to interrupt some vcpu because none are waiting
-		 * for data.
-		 */
-		pr_warning("No vcpu to receive message.");
-		return -ENOSYS;
-	}
-
-	if (ret >= vm->vcpu_count)
-		return -EINVAL;
-
-	/* Wake up the vcpu that is going to process the data. */
-	hf_vcpu_wake_up(&vm->vcpu[ret]);
-
-	return count;
-}
-
-static struct kobject *hf_sysfs_obj = NULL;
-static struct kobj_attribute send_attr =
-	__ATTR(send, 0200, NULL, hf_send_store);
-
-/**
- * Initializes the Hafnium driver's sysfs interface.
- */
-static void __init hf_init_sysfs(void)
-{
-	int ret;
-
-	/* Create the sysfs interface to interrupt vcpus. */
-	hf_sysfs_obj = kobject_create_and_add("hafnium", kernel_kobj);
-	if (!hf_sysfs_obj) {
-		pr_err("Unable to create sysfs object");
-	} else {
-		ret = sysfs_create_file(hf_sysfs_obj, &send_attr.attr);
-		if (ret)
-			pr_err("Unable to create 'send' sysfs file");
-	}
-}
-
 /**
  * Initializes the Hafnium driver by creating a thread for each vCPU of each
  * virtual machine.
  */
 static int __init hf_init(void)
 {
+	static const struct net_proto_family proto_family = {
+		.family = PF_HF,
+		.create = hf_sock_create,
+		.owner = THIS_MODULE,
+	};
 	int64_t ret;
 	uint32_t i, j;
 	uint32_t total_vm_count;
@@ -296,8 +722,10 @@
 	if (ret) {
 		__free_page(hf_send_page);
 		__free_page(hf_recv_page);
-		/* TODO: We may want to grab this information from hypervisor
-		 * and go from there. */
+		/*
+		 * TODO: We may want to grab this information from hypervisor
+		 * and go from there.
+		 */
 		pr_err("Unable to configure VM\n");
 		return -EIO;
 	}
@@ -321,7 +749,8 @@
 
 	/* Only track the secondary VMs. */
 	total_vm_count = ret - 1;
-	hf_vms = kmalloc(sizeof(struct hf_vm) * total_vm_count, GFP_KERNEL);
+	hf_vms = kmalloc_array(total_vm_count, sizeof(struct hf_vm),
+			       GFP_KERNEL);
 	if (!hf_vms)
 		return -ENOMEM;
 
@@ -335,8 +764,8 @@
 
 		ret = hf_vcpu_get_count(vm->id);
 		if (ret < 0) {
-			pr_err("HF_VCPU_GET_COUNT failed for vm=%u: %lld", vm->id,
-			       ret);
+			pr_err("HF_VCPU_GET_COUNT failed for vm=%u: %lld",
+			       vm->id, ret);
 			ret = -EIO;
 			goto fail_with_cleanup;
 		}
@@ -361,8 +790,8 @@
 		}
 
 		vm->vcpu_count = ret;
-		vm->vcpu = kmalloc(sizeof(struct hf_vcpu) * vm->vcpu_count,
-				   GFP_KERNEL);
+		vm->vcpu = kmalloc_array(vm->vcpu_count, sizeof(struct hf_vcpu),
+					 GFP_KERNEL);
 		if (!vm->vcpu) {
 			pr_err("No memory for %u vcpus for vm %u",
 			       vm->vcpu_count, vm->id);
@@ -380,22 +809,40 @@
 						    "vcpu_thread_%u_%u",
 						    vm->id, j);
 			if (IS_ERR(vcpu->task)) {
-				pr_err("Error creating task (vm=%u,vcpu=%u)"
-				       ": %ld\n", vm->id, j, PTR_ERR(vcpu->task));
+				pr_err("Error creating task (vm=%u,vcpu=%u): %ld\n",
+				       vm->id, j, PTR_ERR(vcpu->task));
 				vm->vcpu_count = j;
 				ret = PTR_ERR(vcpu->task);
 				goto fail_with_cleanup;
 			}
 
 			get_task_struct(vcpu->task);
-			spin_lock_init(&vcpu->lock);
 			vcpu->vm = vm;
 			vcpu->vcpu_index = j;
 			atomic_set(&vcpu->abort_sleep, 0);
 		}
 	}
 
-	/* Start running threads now that all is initialized. */
+	/* Register protocol and socket family. */
+	ret = proto_register(&hf_sock_proto, 0);
+	if (ret) {
+		pr_err("Unable to register protocol: %lld\n", ret);
+		goto fail_with_cleanup;
+	}
+
+	ret = sock_register(&proto_family);
+	if (ret) {
+		pr_err("Unable to register Hafnium's socket family: %lld\n",
+		       ret);
+		goto fail_unregister_proto;
+	}
+
+	/*
+	 * Start running threads now that all is initialized.
+	 *
+	 * Any failures from this point on must also unregister the socket
+	 * family with a call to sock_unregister().
+	 */
 	for (i = 0; i < hf_vm_count; i++) {
 		struct hf_vm *vm = &hf_vms[i];
 		for (j = 0; j < vm->vcpu_count; j++)
@@ -406,13 +853,14 @@
 	pr_info("Hafnium successfully loaded with %u VMs:\n", hf_vm_count);
 	for (i = 0; i < hf_vm_count; i++) {
 		struct hf_vm *vm = &hf_vms[i];
+
 		pr_info("\tVM %u: %u vCPUS\n", vm->id, vm->vcpu_count);
 	}
 
-	hf_init_sysfs();
-
 	return 0;
 
+fail_unregister_proto:
+	proto_unregister(&hf_sock_proto);
 fail_with_cleanup:
 	hf_free_resources();
 	return ret;
@@ -424,15 +872,14 @@
  */
 static void __exit hf_exit(void)
 {
-	if (hf_sysfs_obj)
-		kobject_put(hf_sysfs_obj);
-
 	pr_info("Preparing to unload Hafnium\n");
+	sock_unregister(PF_HF);
+	proto_unregister(&hf_sock_proto);
 	hf_free_resources();
 	pr_info("Hafnium ready to unload\n");
 }
 
-MODULE_LICENSE("GPL");
+MODULE_LICENSE("GPL v2");
 
 module_init(hf_init);
 module_exit(hf_exit);