Loading cloudcontrol/node/exc.py +4 −0 Original line number Diff line number Diff line Loading @@ -32,3 +32,7 @@ class DRBDError(CCNodeError): class ConsoleAlreadyOpened(CCNodeError): pass class VMMigrationError(CCNodeError): pass cloudcontrol/node/hypervisor/__init__.py +106 −23 Original line number Diff line number Diff line import logging import os import signal import socket import time import weakref from StringIO import StringIO from itertools import chain, imap Loading @@ -16,7 +19,9 @@ from cloudcontrol.node.hypervisor.lib import ( EventLoop as VirEventLoop, ) from cloudcontrol.node.hypervisor.domains import VirtualMachine from cloudcontrol.node.exc import UndefinedDomain, PoolStorageError, DRBDError from cloudcontrol.node.exc import ( UndefinedDomain, PoolStorageError, DRBDError, VMMigrationError, ) from cloudcontrol.node.hypervisor.jobs import ( ImportVolume, ExportVolume, TCPTunnel, DRBD, ) Loading Loading @@ -290,7 +295,8 @@ class Handler(HostHandler): logger.error(msg) raise UndefinedDomain(msg) def vm_migrate_tunneled(self, name, tun_res, migtun_res, unsafe=False): def vm_migrate_tunneled(self, name, tun_res, migtun_res, unsafe=False, timeout=60.): """Live migrate VM through TCP tunnel. 
:param name: VM name to migrate Loading @@ -298,6 +304,8 @@ class Handler(HostHandler): :param migtun_res: result of tunnel setup handler :param unsafe: for Libvirt >= 0.9.11, see http://libvirt.org/html/libvirt-libvirt.html#virDomainMigrateFlags :param float timeout: timeout for libvirt migration (prevents libvirt from trying to acquire domain lock forever) """ logger.debug('VM live migrate %s', name) Loading Loading @@ -326,13 +334,40 @@ class Handler(HostHandler): logger.exception('Cannot connect to remote libvirt for live' ' migrating vm %s', name) raise # we open a new connection to libvirt and fork because sometimes libvirt # python binding, while doing an operation, # doesn't seem to release CPython's GIL, therefore all node # operations are blocked # the only solution we have found right now is to use a dedicated # libvirt connection for the migration and fork, the migration operation # in itself is handled by the child while other threads can be scheduled # create a new libvirt connection dedicated to migration try: new_con = libvirt.open('qemu:///system') domain = new_con.lookupByUUIDString(vm.uuid) except libvirt.libvirtError: logger.exception('Cannot connect to libvirt') raise try: pid = os.fork() except OSError: logger.error('Cannot fork before running live migration') raise if pid == 0: # child # FIXME we should close all unused fds try: if unsafe: # VIR_MIGRATE_UNSAFE is not defined for libvirt < 0.9.11 append_flags = getattr(libvirt, 'VIR_MIGRATE_UNSAFE', 0) else: append_flags = 0 vm.lv_dom.migrate( domain.migrate( dest_virt_con, libvirt.VIR_MIGRATE_LIVE | libvirt.VIR_MIGRATE_PEER2PEER | libvirt.VIR_MIGRATE_TUNNELLED | Loading @@ -344,13 +379,61 @@ class Handler(HostHandler): 0, ) except libvirt.libvirtError: # FIXME maybe we should catch some weird crap libvirt bad exception logger.exception('Libvirt error while live migrating vm %s', name) raise os._exit(1) except: # whatever the matter is we MUST NOT return to libev or sjRPC os._exit(2) else: 
os._exit(0) finally: dest_virt_con.close() else: # watch for migration status every second started_migration = time.time() while True: # wait timeout time.sleep(1.) # waitpid with no delay try: rpid, status = os.waitpid(pid, os.WNOHANG) except OSError as exc: logger.error('Error while waiting for child to terminate: %s', os.strerror(exc.errno)) raise # convert status to return status status >>= 8 if rpid == status == 0: if time.time() - started_migration < timeout: continue # waitpid returned immediately, thus migration still running # after timeout fired, we need to kill the child (term would # have no effect) os.kill(pid, signal.SIGKILL) logger.debug('Sucessfuly live migrated vm %s', name) try: rpid, status = os.waitpid(pid, 0) except OSError as exc: logger.error('Error while waiting for child after killing' ' it: %s', os.strerror(exc.errno)) raise assert rpid == pid, 'PID returned by waitpid is not valid' logger.error('Migration timeout for vm %s', name) raise VMMigrationError('Timeout') else: if status != 0: # error logger.error('Libvirt error while live migrating vm %s', name) logger.debug('Exit status %s', status) raise VMMigrationError('Migration failed') else: logger.info('Sucessfuly live migrated vm %s', name) break @threadless @pass_connection Loading Loading
cloudcontrol/node/exc.py +4 −0 Original line number Diff line number Diff line Loading @@ -32,3 +32,7 @@ class DRBDError(CCNodeError): class ConsoleAlreadyOpened(CCNodeError): pass class VMMigrationError(CCNodeError): pass
cloudcontrol/node/hypervisor/__init__.py +106 −23 Original line number Diff line number Diff line import logging import os import signal import socket import time import weakref from StringIO import StringIO from itertools import chain, imap Loading @@ -16,7 +19,9 @@ from cloudcontrol.node.hypervisor.lib import ( EventLoop as VirEventLoop, ) from cloudcontrol.node.hypervisor.domains import VirtualMachine from cloudcontrol.node.exc import UndefinedDomain, PoolStorageError, DRBDError from cloudcontrol.node.exc import ( UndefinedDomain, PoolStorageError, DRBDError, VMMigrationError, ) from cloudcontrol.node.hypervisor.jobs import ( ImportVolume, ExportVolume, TCPTunnel, DRBD, ) Loading Loading @@ -290,7 +295,8 @@ class Handler(HostHandler): logger.error(msg) raise UndefinedDomain(msg) def vm_migrate_tunneled(self, name, tun_res, migtun_res, unsafe=False): def vm_migrate_tunneled(self, name, tun_res, migtun_res, unsafe=False, timeout=60.): """Live migrate VM through TCP tunnel. :param name: VM name to migrate Loading @@ -298,6 +304,8 @@ class Handler(HostHandler): :param migtun_res: result of tunnel setup handler :param unsafe: for Libvirt >= 0.9.11, see http://libvirt.org/html/libvirt-libvirt.html#virDomainMigrateFlags :param float timeout: timeout for libvirt migration (prevents libvirt from trying to acquire domain lock forever) """ logger.debug('VM live migrate %s', name) Loading Loading @@ -326,13 +334,40 @@ class Handler(HostHandler): logger.exception('Cannot connect to remote libvirt for live' ' migrating vm %s', name) raise # we open a new connection to libvirt and fork because sometimes libvirt # python binding, while doing an operation, # doesn't seem to release CPython's GIL, therefore all node # operations are blocked # the only solution we have found right now is to use a dedicated # libvirt connection for the migration and fork, the migration operation # in itself is handled by the child while other threads can be scheduled # create a new libvirt 
connection dedicated to migration try: new_con = libvirt.open('qemu:///system') domain = new_con.lookupByUUIDString(vm.uuid) except libvirt.libvirtError: logger.exception('Cannot connect to libvirt') raise try: pid = os.fork() except OSError: logger.error('Cannot fork before running live migration') raise if pid == 0: # child # FIXME we should close all unused fds try: if unsafe: # VIR_MIGRATE_UNSAFE is not defined for libvirt < 0.9.11 append_flags = getattr(libvirt, 'VIR_MIGRATE_UNSAFE', 0) else: append_flags = 0 vm.lv_dom.migrate( domain.migrate( dest_virt_con, libvirt.VIR_MIGRATE_LIVE | libvirt.VIR_MIGRATE_PEER2PEER | libvirt.VIR_MIGRATE_TUNNELLED | Loading @@ -344,13 +379,61 @@ class Handler(HostHandler): 0, ) except libvirt.libvirtError: # FIXME maybe we should catch some weird crap libvirt bad exception logger.exception('Libvirt error while live migrating vm %s', name) raise os._exit(1) except: # whatever the matter is we MUST NOT return to libev or sjRPC os._exit(2) else: os._exit(0) finally: dest_virt_con.close() else: # watch for migration status every second started_migration = time.time() while True: # wait timeout time.sleep(1.) 
# waitpid with no delay try: rpid, status = os.waitpid(pid, os.WNOHANG) except OSError as exc: logger.error('Error while waiting for child to terminate: %s', os.strerror(exc.errno)) raise # convert status to return status status >>= 8 if rpid == status == 0: if time.time() - started_migration < timeout: continue # waitpid returned immediately, thus migration still running # after timeout fired, we need to kill the child (term would # have no effect) os.kill(pid, signal.SIGKILL) logger.debug('Sucessfuly live migrated vm %s', name) try: rpid, status = os.waitpid(pid, 0) except OSError as exc: logger.error('Error while waiting for child after killing' ' it: %s', os.strerror(exc.errno)) raise assert rpid == pid, 'PID returned by waitpid is not valid' logger.error('Migration timeout for vm %s', name) raise VMMigrationError('Timeout') else: if status != 0: # error logger.error('Libvirt error while live migrating vm %s', name) logger.debug('Exit status %s', status) raise VMMigrationError('Migration failed') else: logger.info('Sucessfuly live migrated vm %s', name) break @threadless @pass_connection Loading