Skip to content
Snippets Groups Projects
Commit 68fb8f45 authored by Thibault VINCENT's avatar Thibault VINCENT
Browse files

add: job for drbd live volume copy

parent de63350a
No related branches found
No related tags found
No related merge requests found
......@@ -7,8 +7,7 @@ from threading import Lock, RLock, Thread, Event
from logging import debug
from utils import RWLock
from drbd import DRBD
from errors import (JobManagerError, JobError, XferJobError, ReceiveFileJobError,
SendFileJobError, LVMError, DRBDError)
from lvm import LVM
from errors import (JobManagerError, JobError, XferJobError,
ReceiveFileJobError, SendFileJobError, DrbdCopyJobError, LVMError,
DRBDError, TCPTunnelJobError)
......@@ -356,7 +355,7 @@ class XferJob(BaseJob):
# tcp port range used for transferts
TCP_PORT_MIN = 42000
TCP_PORT_MAX = 42999
TCP_PORT_MAX = 43999
def __init__(self, manager, path):
'''
......@@ -375,6 +374,7 @@ class XferJob(BaseJob):
return self._checksum
#TODO rewrite with the _cancel() and _cleanup() methods
class SendFileJob(XferJob):
'''
'''
......@@ -382,7 +382,7 @@ class SendFileJob(XferJob):
'''
'''
self._host = remote_host
self._port = int(remote_port)
self._local_port = int(remote_port)
super(SendFileJob, self).__init__(manager, path)
def _validate(self):
......@@ -391,8 +391,8 @@ class SendFileJob(XferJob):
return ( super(SendFileJob, self)._validate()
and os.path.isfile(self._path)
and os.access(self._path, os.F_OK | os.R_OK )
and self._port >= self.TCP_PORT_MIN
and self._port <= self.TCP_PORT_MAX
and self._local_port >= self.TCP_PORT_MIN
and self._local_port <= self.TCP_PORT_MAX
and isinstance(self._host, str))
def _prepare(self):
......@@ -409,14 +409,14 @@ class SendFileJob(XferJob):
try:
# try to connect
self._log.append('connecting to `%s`:`%i`' % (self._host,
self._port))
self._local_port))
self._sock.settimeout(10)
retry = 0
connected = False
while retry < 6 and not connected and not self._cancelled:
retry += 1
try:
self._sock.connect((self._host, self._port))
self._sock.connect((self._host, self._local_port))
except socket.timeout:
pass
except socket.error:
......@@ -466,17 +466,18 @@ class SendFileJob(XferJob):
pass
#TODO rewrite with the _cancel() and _cleanup() methods
class ReceiveFileJob(XferJob):
'''
'''
_ports_used = []
_ports_used_local = []
_ports_used_mutex = RWLock()
def __init__(self, manager, path, create=False):
'''
'''
self._create = create
self._port = None
self._local_port = None
super(ReceiveFileJob, self).__init__(manager, path)
def _validate(self):
......@@ -504,14 +505,14 @@ class ReceiveFileJob(XferJob):
# find a port to bind to
try:
with self._ports_used_mutex.write:
if port not in self._ports_used:
if port not in self._ports_used_local:
self._sock.bind(('', port))
self._port = port
self._ports_used.append(port)
self._local_port = port
self._ports_used_local.append(port)
break
except socket.error:
pass
if not self._port:
if not self._local_port:
msg = 'no port to listen to'
self._log.append(msg)
raise ReceiveFileJobError(msg)
......@@ -525,7 +526,7 @@ class ReceiveFileJob(XferJob):
self._sock.listen(1)
self._sock.settimeout(1)
# wait for connection
self._log.append('waiting for connection on port `%i`' % self._port)
self._log.append('waiting for connection on port `%i`' % self._local_port)
retry = 0
conn = None
while retry < 60 and not conn and not self._cancelled:
......@@ -570,8 +571,8 @@ class ReceiveFileJob(XferJob):
except:
pass
with self._ports_used_mutex.write:
if self._port in self._ports_used:
self._ports_used.remove(port)
if self._local_port in self._ports_used_local:
self._ports_used_local.remove(self._local_port)
try:
fd.close()
except:
......@@ -580,7 +581,283 @@ class ReceiveFileJob(XferJob):
def get_port(self):
'''
'''
return int(self._port)
return int(self._local_port)
class DrbdCopyJob(BaseJob):
'''
'''
def __init__(self, manager, lvm, drbd_pool, source_vg, source_lv):
'''
'''
self._mutex_step = Lock()
self._lvm = lvm
self._pool = drbd_pool
self._source_vgname = source_vg
self._source_lvname = source_lv
super(DrbdCopyJob, self).__init__(manager)
self._drbd = self._pool.new_device()
# checkpoint events
self._event_connected_ready = Event()
self._event_connected = Event()
self._event_rolechosen_ready = Event()
self._event_rolechosen = Event()
self._event_sleep_ready = Event()
self._event_sleep = Event()
def _cancel(self):
'''
'''
# release some pending calls
self._event_sleep_ready.set()
self._event_sleep.set()
def _cleanup(self):
'''
'''
# destroy drbd device
try:
self._log.append('destroying the drbd device')
self._drbd.destroy()
except Exception as err:
self._log.append('failed to destroy the drbd device')
else:
self._log.append('successfully destroyed the drbd device')
# we wait for any potential job step to finish
# should be quick after the drbd destruction (kills all waits)
with self._mutex_step:
# destroy drbd meta
try:
self._log.append('removing the drbd meta volume')
self._lvm.get_vg(self._source_vgname).remove(self._meta_lvname)
except Exception as err:
self._log.append('failed to remove the drbd meta volume')
else:
self._log.append('successfully removed the drbd meta volume')
# remove source LV copy
try:
self._log.append('removing the DM device copy')
self._lvm.dm_remove(self._copy_dm)
except Exception as err:
self._log.append('failed to remove the DM device copy')
else:
self._log.append('successfully removed DM device copy')
def _prepare(self):
'''
'''
self._lvm.reload()
# get source LV handle
try:
lv = self._lvm.get_lv(self._source_vgname, self._source_lvname)
except:
msg = 'failed to fetch LV `%s/%s`' % (self._source_vgname,
self._source_lvname)
self._log.append(msg)
raise DrbdCopyJobError(msg)
# get source LV infos
self._source_path = lv.path
self._source_size = lv.size
self._source_dm = lv.dm_name()
self._source_table = lv.dm_table()
self._drbd_table = None
# we got here, so all went fine
# just a safety
return lv.size > 0
def _job(self):
'''
'''
# checkpoint
if self._cancelled:
return
self._lvm.reload()
with self._mutex_step:
# create drbd meta
try:
self._log.append('creating a drbd meta volume')
# now let's create a meta device for DRBD in the same VG
# MS = C/32768 + 4MiB && MS >= 128MiB
# http://www.drbd.org/users-guide/ch-internals.html
self._meta_lvname = '%s.drbdmeta' % self._source_lvname
self._meta_path = '/dev/%s/%s' % (self._source_vgname,
self._meta_lvname)
self._meta_size = int(max(self._source_size/32768 + 4 * 2**20,
128 * 2**20))
self._lvm.get_vg(self._source_vgname).create(self._meta_lvname,
self._meta_size)
self._lvm.reload()
self._meta_dm = self._lvm.get_lv(self._source_vgname,
self._meta_lvname).dm_name()
except Exception as err:
self._log.append('failed to create a drbd meta volume')
raise err
else:
self._log.append('successfully created the drbd meta volume'
' `%s`' % self._meta_path)
# checkpoint
if self._cancelled:
return
with self._mutex_step:
# create a copy of the source LV
try:
self._log.append('creating a device copy of `%s`' %
self._source_path)
self._copy_dm = '%s.%s.copy' % (self._source_vgname,
self._source_lvname)
self._copy_path = '/dev/mapper/%s' % self._copy_dm
self._lvm.dm_create(self._copy_dm, self._source_table)
except Exception as err:
self._log.append('failed to create a device copy')
raise err
else:
self._log.append('successfully copied device as `%s`' %
self._copy_path)
# checkpoint
if self._cancelled:
return
with self._mutex_step:
# setup the drbd device
try:
self._log.append('configuring the drbd device `%s`' %
self._drbd.get_path())
self._drbd.setup(self._copy_path, self._meta_path)
self._drbd_table = '0 %d linear %s 0' % (self._source_size / 512
, self._source_path)
except Exception as err:
self._log.append('failed to configure the drbd device')
raise err
else:
self._log.append('successfully configured the drbd device')
# blocking checkpoint
# wait for call to connect()
if self._cancelled:
return
self._event_connected_ready.set()
self._event_connected.wait()
if self._cancelled:
return
with self._mutex_step:
# wait for a connection with the remote host
try:
self._log.append('waiting for node connection')
self._drbd.wait_connection()
except Exception as err:
self._log.append('failed to establish a connection')
raise err
else:
self._log.append('connected with a remote node')
# blocking checkpoint
# wait for call to role()
if self._cancelled:
return
self._event_rolechosen_ready.set()
self._event_rolechosen.wait()
if self._cancelled:
return
# wait for the sync completion
try:
self._log.append('waiting for initial device sync')
self._drbd.wait_sync()
except Exception as err:
self._log.append('failed to sync the DRBD')
raise err
else:
self._log.append('DRBD is sync\'ed !')
# sleep until the DRBD is shutdown
self._log.append('job entering sleep')
self._event_sleep_ready.set()
self._event_sleep.wait()
self._log.append('job leaving sleep')
def drbd_get_port(self):
'''
'''
return self._drbd.get_port()
def drbd_connect(self, remote_addr, remote_port):
'''
'''
if not self._event_connected.is_set():
if not self._event_connected_ready.is_set():
self._log.append('can\'t connect right now, waiting for '
'initialization to complete...')
self._event_connected_ready.wait()
with self._mutex_step:
# set endpoint
self._drbd.connect(remote_addr, remote_port)
# unlock any connection wait
self._event_connected.set()
else:
raise DrbdCopyJobError('already connected')
def drbd_role(self, primary=None):
'''
'''
if self._event_connected.is_set():
self._event_rolechosen_ready.wait()
if primary is True or primary is False:
# change node role
if primary:
self._log.append('switching to primary mode')
self._drbd.primary()
else:
self._log.append('switching to secondary mode')
self._drbd.secondary()
# unlock any role change wait
self._event_rolechosen.set()
else:
raise DrbdCopyJobError('cannot change role, not connected yet')
def drbd_waitsync(self):
'''
'''
self._log.append('waiting for sync')
# wait for initial sync and setup to be completed
self._event_sleep_ready.wait()
# wait for DRBD sync
with self._mutex_step:
self._drbd.wait_sync()
def drbd_takeover(self, state):
'''
'''
# FIXME comment that
# FIXME check events
with self._mutex_step:
#
if state is True:
if self._drbd_table is not None:
table = self._drbd_table
else:
raise DrbdCopyJobError('no DRBD table available yet')
else:
table = self._source_table
#
self._lvm.dm_set_table(self._source_dm, table)
def drbd_stop(self):
'''
'''
self._event_sleep_ready.wait(5) # to fix thread sync issue
if self._event_sleep_ready.is_set():
with self._mutex_step:
# unlock job termination
self._event_sleep.set()
else:
raise DrbdCopyJobError('can\'t stop now')
class TCPTunnelJob(BaseJob):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment