Commit 051833c8 authored by Anael Beutot's avatar Anael Beutot
Browse files

Safe live migration

parent 348afef5
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -32,3 +32,7 @@ class DRBDError(CCNodeError):

class ConsoleAlreadyOpened(CCNodeError):
    """Node error signalling that a console is already opened.

    NOTE(review): meaning inferred from the class name only; the code that
    raises this exception is not visible here -- confirm against callers.
    """


class VMMigrationError(CCNodeError):
    """Node error raised when a VM live migration fails or times out."""
+106 −23
Original line number Diff line number Diff line
import logging
import os
import signal
import socket
import time
import weakref
from StringIO import StringIO
from itertools import chain, imap
@@ -16,7 +19,9 @@ from cloudcontrol.node.hypervisor.lib import (
    EventLoop as VirEventLoop,
)
from cloudcontrol.node.hypervisor.domains import VirtualMachine
from cloudcontrol.node.exc import UndefinedDomain, PoolStorageError, DRBDError
from cloudcontrol.node.exc import (
    UndefinedDomain, PoolStorageError, DRBDError, VMMigrationError,
)
from cloudcontrol.node.hypervisor.jobs import (
    ImportVolume, ExportVolume, TCPTunnel, DRBD,
)
@@ -290,7 +295,8 @@ class Handler(HostHandler):
            logger.error(msg)
            raise UndefinedDomain(msg)

    def vm_migrate_tunneled(self, name, tun_res, migtun_res, unsafe=False):
    def vm_migrate_tunneled(self, name, tun_res, migtun_res, unsafe=False,
                            timeout=60.):
        """Live migrate VM through TCP tunnel.

        :param name: VM name to migrate
@@ -298,6 +304,8 @@ class Handler(HostHandler):
        :param migtun_res: result of tunnel setup handler
        :param unsafe: for Libvirt >= 0.9.11, see
            http://libvirt.org/html/libvirt-libvirt.html#virDomainMigrateFlags
        :param float timeout: timeout for libvirt migration (prevents libvirt
            from trying to acquire domain lock forever)
        """
        logger.debug('VM live migrate %s', name)

@@ -326,13 +334,40 @@ class Handler(HostHandler):
            logger.exception('Cannot connect to remote libvirt for live'
                             ' migrating vm %s', name)
            raise


        # We open a new connection to libvirt and fork because the libvirt
        # Python binding, while performing an operation, sometimes doesn't
        # seem to release CPython's GIL, so all other node operations are
        # blocked.
        # The only solution we have found so far is to use a dedicated
        # libvirt connection for the migration and to fork: the migration
        # itself is handled by the child, while other threads in the parent
        # can still be scheduled.

        # create a new libvirt connection dedicated to migration
        try:
            new_con = libvirt.open('qemu:///system')
            domain = new_con.lookupByUUIDString(vm.uuid)
        except libvirt.libvirtError:
            logger.exception('Cannot connect to libvirt')
            raise

        try:
            pid = os.fork()
        except OSError:
            logger.error('Cannot fork before running live migration')
            raise

        if pid == 0:
            # child
            # FIXME we should close all unused fds
            try:
                if unsafe:
                    # VIR_MIGRATE_UNSAFE is not defined for libvirt < 0.9.11
                    append_flags = getattr(libvirt, 'VIR_MIGRATE_UNSAFE', 0)
                else:
                    append_flags = 0
            vm.lv_dom.migrate(
                domain.migrate(
                    dest_virt_con,
                    libvirt.VIR_MIGRATE_LIVE | libvirt.VIR_MIGRATE_PEER2PEER |
                    libvirt.VIR_MIGRATE_TUNNELLED |
@@ -344,13 +379,61 @@ class Handler(HostHandler):
                    0,
                )
            except libvirt.libvirtError:
            # FIXME maybe we should catch some weird crap libvirt bad exception
            logger.exception('Libvirt error while live migrating vm %s', name)
            raise
                os._exit(1)
            except:
                # whatever the matter is we MUST NOT return to libev or sjRPC
                os._exit(2)
            else:
                os._exit(0)
            finally:
                dest_virt_con.close()
        else:
            # watch for migration status every second
            started_migration = time.time()
            while True:
                # poll interval: sleep one second between status checks
                time.sleep(1.)

                # waitpid with no delay
                try:
                    rpid, status = os.waitpid(pid, os.WNOHANG)
                except OSError as exc:
                    logger.error('Error while waiting for child to terminate: %s',
                                 os.strerror(exc.errno))
                    raise

                # convert status to return status
                status >>= 8
                if rpid == status == 0:
                    if time.time() - started_migration < timeout:
                        continue

                    # waitpid returned (0, 0) immediately, so the child is
                    # still migrating after the timeout expired: we need to
                    # kill it with SIGKILL (SIGTERM would have no effect)
                    os.kill(pid, signal.SIGKILL)

        logger.debug('Sucessfuly live migrated vm %s', name)
                    try:
                        rpid, status = os.waitpid(pid, 0)
                    except OSError as exc:
                        logger.error('Error while waiting for child after killing'
                                     ' it: %s', os.strerror(exc.errno))
                        raise

                    assert rpid == pid, 'PID returned by waitpid is not valid'

                    logger.error('Migration timeout for vm %s', name)
                    raise VMMigrationError('Timeout')
                else:
                    if status != 0:
                        # error
                        logger.error('Libvirt error while live migrating vm %s',
                                     name)
                        logger.debug('Exit status %s', status)
                        raise VMMigrationError('Migration failed')
                    else:
                        logger.info('Sucessfuly live migrated vm %s', name)
                        break

    @threadless
    @pass_connection