Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F96491526
D34811.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
10 KB
Referenced Files
None
Subscribers
None
D34811.diff
View Options
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -268,6 +268,12 @@
int vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta);
int vm_restore_time(struct vmctx *ctx);
+/*
+ * Live migration
+ */
+int vm_get_dirty_page_list(struct vmctx *ctx, char *page_list, size_t num,
+ bool is_all_dirty);
+
/*
* Deprecated interfaces, do not use them in new code.
*/
diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -1734,6 +1734,28 @@
return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy));
}
/*
 * Retrieve the guest's dirty-page map from the kernel.
 *
 * page_list points to a caller-provided array of num_pages bytes, one per
 * guest page; the VM_GET_DIRTY_PAGE_LIST ioctl fills it in (non-zero entry
 * means the page must be transferred).  When is_all_dirty is true the
 * caller wants every page reported dirty — used for the first migration
 * round, which copies all of guest memory.
 *
 * Returns the ioctl(2) result: 0 on success, -1 with errno set on failure.
 */
int
vm_get_dirty_page_list(struct vmctx *ctx, char *page_list, size_t num_pages, bool is_all_dirty)
{
	struct vm_get_dirty_page_list list;

	bzero(&list, sizeof(struct vm_get_dirty_page_list));
	list.page_list = (uint8_t *)page_list;
	list.num_pages = num_pages;
	list.is_all_dirty = is_all_dirty;

	/* The lowmem segment starts at guest-physical address 0. */
	list.lowmem_start = 0;
	list.lowmem_end = ctx->lowmem;
	/* Highmem (if present) starts at 4 GB; -1 marks an absent segment. */
	list.highmem_start = ctx->highmem != 0 ? 4 * GB : -1;
	list.highmem_end = ctx->highmem != 0 ? 4 * GB + ctx->highmem : -1;

	/* Hint the VM system to page in guest memory before the kernel scan. */
	madvise(ctx->baseaddr, ctx->lowmem, MADV_WILLNEED);
	if (ctx->highmem != 0)
		madvise(ctx->baseaddr + 4 * GB, ctx->highmem, MADV_WILLNEED);

	return (ioctl(ctx->fd, VM_GET_DIRTY_PAGE_LIST, &list));
}
+
int
vm_set_topology(struct vmctx *ctx,
uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus)
@@ -1788,7 +1810,7 @@
VM_SET_INTINFO, VM_GET_INTINFO,
VM_RTC_WRITE, VM_RTC_READ, VM_RTC_SETTIME, VM_RTC_GETTIME,
VM_RESTART_INSTRUCTION, VM_SET_TOPOLOGY, VM_GET_TOPOLOGY,
- VM_SNAPSHOT_REQ, VM_RESTORE_TIME
+ VM_SNAPSHOT_REQ, VM_RESTORE_TIME, VM_GET_DIRTY_PAGE_LIST
};
int
diff --git a/usr.sbin/bhyve/migration.c b/usr.sbin/bhyve/migration.c
--- a/usr.sbin/bhyve/migration.c
+++ b/usr.sbin/bhyve/migration.c
@@ -30,6 +30,9 @@
#include <stdlib.h>
#include <string.h>
#include <vmmapi.h>
+#ifdef BHYVE_DEBUG
+#include <time.h>
+#endif
#include "migration.h"
#include "pci_emul.h"
@@ -742,6 +745,287 @@
return (error);
}
+#define MIGRATION_ROUNDS 4
+
+static int
+migrate_segment_pages(int socket, char *baseaddr, char *page_list,
+ size_t nr_pages, enum migration_transfer_req migration_req)
+{
+ size_t start_dirty_page;
+ int rc;
+
+ for (size_t i = 0; i < nr_pages; i++) {
+ if (page_list[i] == 0)
+ continue;
+
+ start_dirty_page = i;
+
+ for (; i < nr_pages; i++) {
+ if (page_list[i] == 0)
+ break;
+ }
+
+ /* Transfer all continous dirty pages into the vm's memory */
+ rc = migration_transfer_data(socket, baseaddr + start_dirty_page * PAGE_SIZE,
+ (i - start_dirty_page) * PAGE_SIZE, migration_req);
+ if (rc != 0)
+ return (rc);
+ }
+
+ return (0);
+}
+
/*
 * Transfer one round of guest memory between hosts.  First the page-status
 * array itself is exchanged (so both sides agree on which pages follow),
 * then the dirty pages of the lowmem segment, then those of the highmem
 * segment.  migration_req selects the direction (send or receive); the
 * same ordering on both sides keeps the stream in sync.
 *
 * Returns 0 on success or a non-zero error code.
 */
static int
migrate_pages(struct vmctx *ctx, int socket, char *page_list,
    size_t page_list_size, enum migration_transfer_req migration_req)
{
	size_t lowmem_limit_page, lowmem, highmem;
	int rc;
	char *baseaddr;

	if ((migration_req != MIGRATION_SEND_REQ) && (migration_req != MIGRATION_RECV_REQ)) {
		EPRINTF("wrong migration transfer req");
		return (EINVAL);
	}

	/*
	 * Transfer the state of the pages (dirty/not dirty) from the source
	 * host to the destination host. The pages that are dirty will be
	 * transferred in the next steps.
	 */
	rc = migration_transfer_data(socket, page_list, page_list_size, migration_req);
	if (rc != 0) {
		EPRINTF("Could not transfer page list");
		return (rc);
	}

	vm_get_guestmem_from_ctx(ctx, &baseaddr, &lowmem, &highmem);

	/* Lowmem segment */
	rc = migrate_segment_pages(socket, baseaddr,
	    page_list, lowmem / PAGE_SIZE, migration_req);
	if (rc != 0) {
		EPRINTF("Could not migrate the lowmem segment pages");
		return (rc);
	}

	/* Highmem segment */
	if (highmem == 0)
		return (0);

	/*
	 * NOTE(review): highmem entries are assumed to live in page_list at
	 * offset lowmem_limit / PAGE_SIZE, matching the kernel's layout for
	 * VM_GET_DIRTY_PAGE_LIST — confirm against the kernel side, since
	 * page_list_size is (lowmem + highmem) / PAGE_SIZE.
	 */
	lowmem_limit_page = vm_get_lowmem_limit(ctx) / PAGE_SIZE;

	rc = migrate_segment_pages(socket, baseaddr + 4 * GB,
	    page_list + lowmem_limit_page, highmem / PAGE_SIZE, migration_req);
	if (rc != 0) {
		EPRINTF("Could not migrate the highmem segment pages");
		return (rc);
	}

	return (0);
}
+
+static int
+live_migrate_send(struct vmctx *ctx, int socket)
+{
+ int error, i, rc;
+ uint8_t rounds;
+ size_t migration_completed;
+ size_t pages;
+ char *page_list_indexes;
+
+#ifdef BHYVE_DEBUG
+ struct timespec start, now;
+ uint64_t time_diff_ms;
+#endif
+
+ error = 0;
+ page_list_indexes = NULL;
+ rounds = MIGRATION_ROUNDS;
+
+ /* Send the number of memory rounds to destination */
+ error = migration_transfer_data(socket, &rounds, sizeof(rounds), MIGRATION_SEND_REQ);
+ if (error != 0) {
+ EPRINTF("Could not transfer the number of rounds");
+ goto done;
+ }
+
+ pages = (vm_get_lowmem_size(ctx) + vm_get_highmem_size(ctx)) / PAGE_SIZE;
+
+ /* alloc page_list_indexes */
+ page_list_indexes = mmap(NULL, pages * sizeof(char), PROT_READ | PROT_WRITE, MAP_ANON, -1, 0);
+ if (page_list_indexes == MAP_FAILED) {
+ perror("Page list indexes could not be allocated");
+ error = errno;
+ goto done;
+ }
+
+ if (mlock(page_list_indexes, pages * sizeof(char)) == -1) {
+ perror("Page list indexes could not be locked");
+ error = errno;
+ goto done;
+ }
+
+ for (i = 0; i <= MIGRATION_ROUNDS; i++) {
+ if (i == MIGRATION_ROUNDS) {
+ /* Last round */
+ vm_vcpu_pause(ctx);
+
+ rc = vm_pause_user_devs();
+ if (rc != 0) {
+ EPRINTF("Could not pause devices");
+ error = rc;
+ goto unlock_vm_and_exit;
+ }
+ }
+
+ DPRINTF("Live migration round %d - Start", i);
+#ifdef BHYVE_DEBUG
+ clock_gettime(CLOCK_MONOTONIC, &start);
+#endif
+ memset(page_list_indexes, 0, pages);
+
+ /* Search the dirty pages and populate page_list_index */
+ rc = vm_get_dirty_page_list(ctx, page_list_indexes, pages, i == 0);
+
+ if (rc != 0) {
+ EPRINTF("Couldn't search for the dirty pages");
+ error = errno;
+ if (i == MIGRATION_ROUNDS)
+ goto unlock_vm_and_exit;
+ else
+ goto done;
+ }
+
+ DPRINTF("Live migration round %d: Finished searching the dirty pages", i);
+
+ error = migrate_pages(ctx, socket, page_list_indexes,
+ pages, MIGRATION_SEND_REQ);
+ if (error != 0) {
+ EPRINTF("Couldn't send dirty pages to dest");
+ if (i == MIGRATION_ROUNDS)
+ goto unlock_vm_and_exit;
+ else
+ goto done;
+ }
+
+#ifdef BHYVE_DEBUG
+ clock_gettime(CLOCK_MONOTONIC, &now);
+ time_diff_ms = (now.tv_sec - start.tv_sec) * 1000 +
+ (now.tv_nsec - start.tv_nsec) / 1000000;
+ DPRINTF("Live migration round %d - Done - %lu ms", i, time_diff_ms);
+#endif
+ }
+
+ /* Send kern data */
+ error = migrate_kern_data(ctx, socket, MIGRATION_SEND_REQ);
+ if (error != 0) {
+ EPRINTF("Could not send kern data to destination");
+ goto unlock_vm_and_exit;
+ }
+
+ /* Send PCI data */
+ error = migrate_devs(socket, MIGRATION_SEND_REQ);
+ if (error != 0) {
+ EPRINTF("Could not send pci devs to destination");
+ goto unlock_vm_and_exit;
+ }
+
+ /* Wait for migration completed */
+ error = migration_transfer_data(socket, &migration_completed,
+ sizeof(migration_completed), MIGRATION_RECV_REQ);
+ if ((error != 0) || (migration_completed != MIGRATION_SPECS_OK)) {
+ EPRINTF("Could not recv migration completed remote or received error");
+ if (error == 0)
+ error = EINVAL;
+ goto unlock_vm_and_exit;
+ }
+
+#ifdef BHYVE_DEBUG
+ clock_gettime(CLOCK_MONOTONIC, &now);
+ time_diff_ms = (now.tv_sec - start.tv_sec) * 1000 + (now.tv_nsec - start.tv_nsec) / 1000000;
+ DPRINTF("Live migration downtime - %lu ms", time_diff_ms);
+#endif
+
+ vm_destroy(ctx);
+ exit(0);
+
+unlock_vm_and_exit:
+ rc = vm_resume_user_devs();
+ if (rc != 0)
+ EPRINTF("Could not resume devices");
+ vm_vcpu_resume(ctx);
+
+done:
+ if (page_list_indexes != MAP_FAILED) {
+ munmap(page_list_indexes, pages * sizeof(char));
+ }
+ return (error);
+}
+
+static int
+live_migrate_recv(struct vmctx *ctx, int socket)
+{
+ int error, index;
+ uint8_t rounds;
+ size_t lowmem_size, highmem_size, pages;
+ char *baseaddr, *page_list_indexes;
+
+ page_list_indexes = NULL;
+
+ error = migration_transfer_data(socket, &rounds, sizeof(rounds), MIGRATION_RECV_REQ);
+ if (error != 0) {
+ EPRINTF("Could not recv the number of rounds from remote");
+ goto done;
+ }
+
+ /* Compute memory_size and pages*/
+ vm_get_guestmem_from_ctx(ctx, &baseaddr, &lowmem_size, &highmem_size);
+
+ pages = (lowmem_size + highmem_size) / PAGE_SIZE;
+
+ madvise(baseaddr, lowmem_size, MADV_WILLNEED);
+ if (highmem_size != 0)
+ madvise(baseaddr + 4 * GB, highmem_size, MADV_WILLNEED);
+
+ /* alloc page_list_indexes */
+ page_list_indexes = malloc(pages * sizeof(char));
+ if (page_list_indexes == NULL) {
+ perror("Page list indexes could not be allocated");
+ error = ENOMEM;
+ goto done;
+ }
+
+ /* The following iteration contains the preliminary round in which the
+ * entire memory is migrated to the destination. Then, for
+ * MIGRATION_ROUNDS - 1 rounds, only the dirtied pages will be migrated.
+ * In the final round, the rest of the pages are migrated.
+ * Since the vcpus are not started, we don't need to lock them, so we
+ * can do the memory migration pretty straight-forward.
+ */
+ DPRINTF("Live migration start");
+ for (index = 0; index <= rounds; index ++) {
+ DPRINTF("Live migration round %d: Start", index);
+ error = migrate_pages(ctx, socket, page_list_indexes, pages, MIGRATION_RECV_REQ);
+ if (error != 0) {
+ EPRINTF("Couldn't recv dirty pages from source");
+ goto done;
+ }
+ }
+ DPRINTF("Live migration done");
+
+ madvise(baseaddr, lowmem_size, MADV_NORMAL);
+ if (highmem_size!= 0)
+ madvise(baseaddr + 4 * GB, highmem_size, MADV_NORMAL);
+
+ error = 0;
+done:
+ if (page_list_indexes != NULL) {
+ free(page_list_indexes);
+ }
+ return (error);
+}
+
static inline int
migrate_connections(struct migrate_req req, int *socket_fd,
enum migration_transfer_req type)
@@ -874,8 +1158,7 @@
}
if (is_live) {
- EPRINTF("Live migration not implemented");
- rc = EOPNOTSUPP;
+ rc = live_migrate_send(ctx, s);
if (rc != 0)
EPRINTF("Could not live migrate the guest's memory");
error = rc;
@@ -964,8 +1247,7 @@
* way in which the memory is migrated.
*/
if (is_live) {
- EPRINTF("Live migration not implemented");
- rc = EOPNOTSUPP;
+ rc = live_migrate_recv(ctx, s);
if (rc != 0) {
EPRINTF("Could not live migrate the guest's memory");
goto done;
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Sep 26, 5:14 AM (21 h, 44 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
12812043
Default Alt Text
D34811.diff (10 KB)
Attached To
Mode
D34811: Live Migration feature for bhyve [Part 2]
Attached
Detach File
Event Timeline
Log In to Comment