Page MenuHomeFreeBSD

D34811.diff
No OneTemporary

D34811.diff

diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -268,6 +268,12 @@
int vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta);
int vm_restore_time(struct vmctx *ctx);
+/*
+ * Live migration
+ */
+int vm_get_dirty_page_list(struct vmctx *ctx, char *page_list, size_t num,
+ bool is_all_dirty);
+
/*
* Deprecated interfaces, do not use them in new code.
*/
diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -1734,6 +1734,28 @@
return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy));
}
+/*
+ * Fetch the guest's dirty-page map through the VM_GET_DIRTY_PAGE_LIST ioctl.
+ * One byte per guest page is written into 'page_list' (non-zero == dirty).
+ * 'num' is the number of entries in 'page_list'; when 'is_all_dirty' is set
+ * the kernel is asked to report every page as dirty (used for the first,
+ * full-copy migration round).  Returns the ioctl(2) result (0 on success,
+ * -1 with errno set on failure).
+ */
+int
+vm_get_dirty_page_list(struct vmctx *ctx, char *page_list, size_t num_pages, bool is_all_dirty)
+{
+	struct vm_get_dirty_page_list list;
+
+	bzero(&list, sizeof(struct vm_get_dirty_page_list));
+	list.page_list = (uint8_t *)page_list;
+	list.num_pages = num_pages;
+	list.is_all_dirty = is_all_dirty;
+
+	/* Lowmem starts at guest-physical 0; -1 marks an absent highmem segment. */
+	list.lowmem_start = 0;
+	list.lowmem_end = ctx->lowmem;
+	/* Highmem, when present, is mapped starting at the 4 GB boundary. */
+	list.highmem_start = ctx->highmem != 0 ? 4 * GB : -1;
+	list.highmem_end = ctx->highmem != 0 ? 4 * GB + ctx->highmem : -1;
+
+	/* Best-effort prefault of the guest memory; return value intentionally ignored. */
+	madvise(ctx->baseaddr, ctx->lowmem, MADV_WILLNEED);
+	if (ctx->highmem != 0)
+		madvise(ctx->baseaddr + 4 * GB, ctx->highmem, MADV_WILLNEED);
+
+	return (ioctl(ctx->fd, VM_GET_DIRTY_PAGE_LIST, &list));
+}
+
int
vm_set_topology(struct vmctx *ctx,
uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus)
@@ -1788,7 +1810,7 @@
VM_SET_INTINFO, VM_GET_INTINFO,
VM_RTC_WRITE, VM_RTC_READ, VM_RTC_SETTIME, VM_RTC_GETTIME,
VM_RESTART_INSTRUCTION, VM_SET_TOPOLOGY, VM_GET_TOPOLOGY,
- VM_SNAPSHOT_REQ, VM_RESTORE_TIME
+ VM_SNAPSHOT_REQ, VM_RESTORE_TIME, VM_GET_DIRTY_PAGE_LIST
};
int
diff --git a/usr.sbin/bhyve/migration.c b/usr.sbin/bhyve/migration.c
--- a/usr.sbin/bhyve/migration.c
+++ b/usr.sbin/bhyve/migration.c
@@ -30,6 +30,9 @@
#include <stdlib.h>
#include <string.h>
#include <vmmapi.h>
+#ifdef BHYVE_DEBUG
+#include <time.h>
+#endif
#include "migration.h"
#include "pci_emul.h"
@@ -742,6 +745,287 @@
return (error);
}
+#define MIGRATION_ROUNDS 4
+
+/*
+ * Transfer (send or receive, per 'migration_req') the dirty pages of one
+ * guest memory segment over 'socket'.  'page_list' holds one byte per page
+ * (non-zero == dirty) for the 'nr_pages' pages starting at 'baseaddr'.
+ * Consecutive dirty pages are coalesced into a single transfer to reduce
+ * the number of socket operations.  Returns 0 on success or the first
+ * non-zero transfer error.
+ */
+static int
+migrate_segment_pages(int socket, char *baseaddr, char *page_list,
+    size_t nr_pages, enum migration_transfer_req migration_req)
+{
+	size_t start_dirty_page;
+	int rc;
+
+	for (size_t i = 0; i < nr_pages; i++) {
+		if (page_list[i] == 0)
+			continue;
+
+		start_dirty_page = i;
+
+		/* Extend the run while pages stay dirty (or until the segment ends). */
+		for (; i < nr_pages; i++) {
+			if (page_list[i] == 0)
+				break;
+		}
+
+		/* Transfer all continuous dirty pages into the vm's memory */
+		rc = migration_transfer_data(socket, baseaddr + start_dirty_page * PAGE_SIZE,
+		    (i - start_dirty_page) * PAGE_SIZE, migration_req);
+		if (rc != 0)
+			return (rc);
+	}
+
+	return (0);
+}
+
+/*
+ * Exchange one round of guest memory with the remote host.  First the
+ * page-state map ('page_list', 'page_list_size' bytes) is transferred so
+ * both sides agree on which pages follow; then the dirty pages of the
+ * lowmem segment and, if present, the highmem segment are transferred.
+ * 'migration_req' selects send vs. receive; the same code path drives
+ * both directions.  Returns 0 on success or an errno-style value.
+ */
+static int
+migrate_pages(struct vmctx *ctx, int socket, char *page_list,
+    size_t page_list_size, enum migration_transfer_req migration_req)
+{
+	size_t lowmem_limit_page, lowmem, highmem;
+	int rc;
+	char *baseaddr;
+
+	if ((migration_req != MIGRATION_SEND_REQ) && (migration_req != MIGRATION_RECV_REQ)) {
+		EPRINTF("wrong migration transfer req");
+		return (EINVAL);
+	}
+
+	/*
+	 * Transfer the state of the pages (dirty/not dirty) from the source
+	 * host to the destination host. The pages that are dirty will be
+	 * transferred in the next steps.
+	 */
+	rc = migration_transfer_data(socket, page_list, page_list_size, migration_req);
+	if (rc != 0) {
+		EPRINTF("Could not transfer page list");
+		return (rc);
+	}
+
+	vm_get_guestmem_from_ctx(ctx, &baseaddr, &lowmem, &highmem);
+
+	/* Lowmem segment */
+	rc = migrate_segment_pages(socket, baseaddr,
+	    page_list, lowmem / PAGE_SIZE, migration_req);
+	if (rc != 0) {
+		EPRINTF("Could not migrate the lowmem segment pages");
+		return (rc);
+	}
+
+	/* Highmem segment */
+	if (highmem == 0)
+		return (0);
+
+	/*
+	 * The highmem portion of the page list starts at the index of the
+	 * lowmem limit, mirroring the guest-physical layout of the map.
+	 */
+	lowmem_limit_page = vm_get_lowmem_limit(ctx) / PAGE_SIZE;
+
+	/* Highmem guest memory itself is mapped at the 4 GB boundary. */
+	rc = migrate_segment_pages(socket, baseaddr + 4 * GB,
+	    page_list + lowmem_limit_page, highmem / PAGE_SIZE, migration_req);
+	if (rc != 0) {
+		EPRINTF("Could not migrate the highmem segment pages");
+		return (rc);
+	}
+
+	return (0);
+}
+
+/*
+ * Source-side iterative (pre-copy) live migration of guest memory.
+ *
+ * Sends the round count, then runs MIGRATION_ROUNDS + 1 rounds: round 0
+ * copies all memory, the following rounds copy only pages dirtied since
+ * the previous round, and the final round pauses the vCPUs and devices
+ * before copying the remaining dirty pages.  Kernel and device state are
+ * sent afterwards; on a confirmed completion the local VM is destroyed
+ * and the process exits.  Returns an errno-style value on failure (the
+ * guest is resumed first).
+ */
+static int
+live_migrate_send(struct vmctx *ctx, int socket)
+{
+	int error, i, rc;
+	uint8_t rounds;
+	size_t migration_completed;
+	size_t pages;
+	char *page_list_indexes;
+
+#ifdef BHYVE_DEBUG
+	struct timespec start, now;
+	uint64_t time_diff_ms;
+#endif
+
+	error = 0;
+	page_list_indexes = NULL;
+	/*
+	 * Initialize 'pages' so the cleanup path never reads an
+	 * indeterminate value when we fail before computing it.
+	 */
+	pages = 0;
+	rounds = MIGRATION_ROUNDS;
+
+	/* Send the number of memory rounds to destination */
+	error = migration_transfer_data(socket, &rounds, sizeof(rounds), MIGRATION_SEND_REQ);
+	if (error != 0) {
+		EPRINTF("Could not transfer the number of rounds");
+		goto done;
+	}
+
+	pages = (vm_get_lowmem_size(ctx) + vm_get_highmem_size(ctx)) / PAGE_SIZE;
+
+	/*
+	 * One status byte per guest page.  POSIX requires MAP_PRIVATE (or
+	 * MAP_SHARED) together with MAP_ANON, so pass it explicitly.
+	 */
+	page_list_indexes = mmap(NULL, pages * sizeof(char), PROT_READ | PROT_WRITE,
+	    MAP_ANON | MAP_PRIVATE, -1, 0);
+	if (page_list_indexes == MAP_FAILED) {
+		perror("Page list indexes could not be allocated");
+		error = errno;
+		goto done;
+	}
+
+	/* Wire the buffer so the dirty-page scan cannot fault on it. */
+	if (mlock(page_list_indexes, pages * sizeof(char)) == -1) {
+		perror("Page list indexes could not be locked");
+		error = errno;
+		goto done;
+	}
+
+	for (i = 0; i <= MIGRATION_ROUNDS; i++) {
+		if (i == MIGRATION_ROUNDS) {
+			/* Last round: stop the guest so no new pages are dirtied. */
+			vm_vcpu_pause(ctx);
+
+			rc = vm_pause_user_devs();
+			if (rc != 0) {
+				EPRINTF("Could not pause devices");
+				error = rc;
+				goto unlock_vm_and_exit;
+			}
+		}
+
+		DPRINTF("Live migration round %d - Start", i);
+#ifdef BHYVE_DEBUG
+		clock_gettime(CLOCK_MONOTONIC, &start);
+#endif
+		memset(page_list_indexes, 0, pages);
+
+		/*
+		 * Search the dirty pages and populate page_list_index.
+		 * Round 0 requests every page (full copy).
+		 */
+		rc = vm_get_dirty_page_list(ctx, page_list_indexes, pages, i == 0);
+
+		if (rc != 0) {
+			EPRINTF("Couldn't search for the dirty pages");
+			error = errno;
+			if (i == MIGRATION_ROUNDS)
+				goto unlock_vm_and_exit;
+			else
+				goto done;
+		}
+
+		DPRINTF("Live migration round %d: Finished searching the dirty pages", i);
+
+		error = migrate_pages(ctx, socket, page_list_indexes,
+		    pages, MIGRATION_SEND_REQ);
+		if (error != 0) {
+			EPRINTF("Couldn't send dirty pages to dest");
+			if (i == MIGRATION_ROUNDS)
+				goto unlock_vm_and_exit;
+			else
+				goto done;
+		}
+
+#ifdef BHYVE_DEBUG
+		clock_gettime(CLOCK_MONOTONIC, &now);
+		time_diff_ms = (now.tv_sec - start.tv_sec) * 1000 +
+		    (now.tv_nsec - start.tv_nsec) / 1000000;
+		DPRINTF("Live migration round %d - Done - %lu ms", i, time_diff_ms);
+#endif
+	}
+
+	/* Send kern data */
+	error = migrate_kern_data(ctx, socket, MIGRATION_SEND_REQ);
+	if (error != 0) {
+		EPRINTF("Could not send kern data to destination");
+		goto unlock_vm_and_exit;
+	}
+
+	/* Send PCI data */
+	error = migrate_devs(socket, MIGRATION_SEND_REQ);
+	if (error != 0) {
+		EPRINTF("Could not send pci devs to destination");
+		goto unlock_vm_and_exit;
+	}
+
+	/* Wait for migration completed */
+	error = migration_transfer_data(socket, &migration_completed,
+	    sizeof(migration_completed), MIGRATION_RECV_REQ);
+	if ((error != 0) || (migration_completed != MIGRATION_SPECS_OK)) {
+		EPRINTF("Could not recv migration completed remote or received error");
+		if (error == 0)
+			error = EINVAL;
+		goto unlock_vm_and_exit;
+	}
+
+#ifdef BHYVE_DEBUG
+	/* 'start' was taken at the top of the final (paused) round: downtime. */
+	clock_gettime(CLOCK_MONOTONIC, &now);
+	time_diff_ms = (now.tv_sec - start.tv_sec) * 1000 + (now.tv_nsec - start.tv_nsec) / 1000000;
+	DPRINTF("Live migration downtime - %lu ms", time_diff_ms);
+#endif
+
+	/* The guest now runs on the destination; tear down the source copy. */
+	vm_destroy(ctx);
+	exit(0);
+
+unlock_vm_and_exit:
+	rc = vm_resume_user_devs();
+	if (rc != 0)
+		EPRINTF("Could not resume devices");
+	vm_vcpu_resume(ctx);
+
+done:
+	/*
+	 * page_list_indexes is NULL when mmap() was never reached and
+	 * MAP_FAILED when it failed; only unmap a successful mapping.
+	 */
+	if (page_list_indexes != NULL && page_list_indexes != MAP_FAILED) {
+		munmap(page_list_indexes, pages * sizeof(char));
+	}
+	return (error);
+}
+
+/*
+ * Destination-side counterpart of live_migrate_send(): receives the round
+ * count, then for each round receives the page-state map followed by the
+ * dirty pages and writes them straight into guest memory.  The vCPUs are
+ * not running yet, so no pause/resume locking is needed here.  Returns 0
+ * on success or an errno-style value.
+ */
+static int
+live_migrate_recv(struct vmctx *ctx, int socket)
+{
+	int error, index;
+	uint8_t rounds;
+	size_t lowmem_size, highmem_size, pages;
+	char *baseaddr, *page_list_indexes;
+
+	page_list_indexes = NULL;
+
+	/* NOTE(review): 'rounds' is remote-supplied and bounds the loop below. */
+	error = migration_transfer_data(socket, &rounds, sizeof(rounds), MIGRATION_RECV_REQ);
+	if (error != 0) {
+		EPRINTF("Could not recv the number of rounds from remote");
+		goto done;
+	}
+
+	/* Compute memory_size and pages*/
+	vm_get_guestmem_from_ctx(ctx, &baseaddr, &lowmem_size, &highmem_size);
+
+	pages = (lowmem_size + highmem_size) / PAGE_SIZE;
+
+	/* Best-effort prefault of guest memory before the bulk copy. */
+	madvise(baseaddr, lowmem_size, MADV_WILLNEED);
+	if (highmem_size != 0)
+		madvise(baseaddr + 4 * GB, highmem_size, MADV_WILLNEED);
+
+	/*
+	 * alloc page_list_indexes; no initialization needed: each round's
+	 * migrate_pages() receives the map contents before reading them.
+	 */
+	page_list_indexes = malloc(pages * sizeof(char));
+	if (page_list_indexes == NULL) {
+		perror("Page list indexes could not be allocated");
+		error = ENOMEM;
+		goto done;
+	}
+
+	/* The following iteration contains the preliminary round in which the
+	 * entire memory is migrated to the destination. Then, for
+	 * MIGRATION_ROUNDS - 1 rounds, only the dirtied pages will be migrated.
+	 * In the final round, the rest of the pages are migrated.
+	 * Since the vcpus are not started, we don't need to lock them, so we
+	 * can do the memory migration pretty straight-forward.
+	 */
+	DPRINTF("Live migration start");
+	for (index = 0; index <= rounds; index ++) {
+		DPRINTF("Live migration round %d: Start", index);
+		error = migrate_pages(ctx, socket, page_list_indexes, pages, MIGRATION_RECV_REQ);
+		if (error != 0) {
+			EPRINTF("Couldn't recv dirty pages from source");
+			goto done;
+		}
+	}
+	DPRINTF("Live migration done");
+
+	/* Drop the WILLNEED hint now that the bulk copy is finished. */
+	madvise(baseaddr, lowmem_size, MADV_NORMAL);
+	if (highmem_size!= 0)
+		madvise(baseaddr + 4 * GB, highmem_size, MADV_NORMAL);
+
+	error = 0;
+done:
+	if (page_list_indexes != NULL) {
+		free(page_list_indexes);
+	}
+	return (error);
+}
+
static inline int
migrate_connections(struct migrate_req req, int *socket_fd,
enum migration_transfer_req type)
@@ -874,8 +1158,7 @@
}
if (is_live) {
- EPRINTF("Live migration not implemented");
- rc = EOPNOTSUPP;
+ rc = live_migrate_send(ctx, s);
if (rc != 0)
EPRINTF("Could not live migrate the guest's memory");
error = rc;
@@ -964,8 +1247,7 @@
* way in which the memory is migrated.
*/
if (is_live) {
- EPRINTF("Live migration not implemented");
- rc = EOPNOTSUPP;
+ rc = live_migrate_recv(ctx, s);
if (rc != 0) {
EPRINTF("Could not live migrate the guest's memory");
goto done;

File Metadata

Mime Type
text/plain
Expires
Thu, Sep 26, 5:14 AM (21 h, 44 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
12812043
Default Alt Text
D34811.diff (10 KB)

Event Timeline