# This file is part of CloudControl.
#
# CloudControl is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# CloudControl is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with CloudControl.  If not, see <http://www.gnu.org/licenses/>.


import errno
import logging
import os
import resource
import signal
import subprocess
import sys
import traceback
from threading import Thread, Event
from StringIO import StringIO

from cloudcontrol.node.exc import JobError
from cloudcontrol.node.utils import num_to_sig, close_fds


logger = logging.getLogger(__name__)


class JobManager(object):
    """Keep track of every job and hand out job ids."""

    def __init__(self, main_loop):
        """
        :param main_loop: :class:`MainLoop` instance
        """
        def counter():
            i = 0
            while True:
                yield i
                i += 1

        #: generator yielding successive job ids (0, 1, 2, ...)
        self.job_id = counter()
        self.main = main_loop
        #: keep an index of all jobs
        self.jobs = {}

    def job_start(self):
        """Placeholder, does nothing."""
        pass

    def job_stop(self):
        """Placeholder, does nothing."""
        pass

    def notify(self, job):
        """Called when a job is done."""
        # by now only remove the job
        self.remove(job.id)

    def cancel(self, job_id):
        """Cancel a job.

        :param job_id: id of the job to stop
        :raises KeyError: if no job is registered under `job_id`
        """
        self.jobs[job_id].stop()

    def remove(self, job_id):
        """Remove a job from the index.

        :param job_id: id of the job to forget
        :return: the removed job, or None (after logging an error) when the
            id is unknown
        """
        try:
            return self.jobs.pop(job_id)
        except KeyError:
            logger.error('Job %s does not exist', job_id)

    def create(self, job_constructor, *args, **kwargs):
        """Create a new job and populate job id.

        :param job_constructor: callable invoked as
            ``job_constructor(self, *args, **kwargs)``, the result must
            expose an `id` attribute
        :return: the newly created job (already registered in the index)
        """
        job = job_constructor(self, *args, **kwargs)
        self.jobs[job.id] = job
        return job

    def get(self, job_id):
        """Return the job registered under `job_id`.

        :raises KeyError: if the id is unknown
        """
        return self.jobs[job_id]

    def start(self):
        """Placeholder, does nothing."""
        pass

    def stop(self):
        """Ask every registered job to stop (best effort)."""
        logger.debug('Stopping all currently running jobs')
        # Iterate over a snapshot: a job that terminates meanwhile removes
        # itself from self.jobs (see notify), which would otherwise break
        # the iteration.
        for job in list(self.jobs.values()):
            try:
                job.stop()
            except Exception:
                # still best effort, but leave a trace of what went wrong
                logger.exception('Error while stopping job %s', job.id)


class BaseThreadedJob(Thread):
    """Job running in a background thread.

    Handles job notification to the job manager.
    """

    def __init__(self, job_manager):
        """
        :param job_manager: :class:`JobManager` instance
        """
        Thread.__init__(self)
        #: report progress in %
        self.progress = 0.
        self.job_manager = job_manager
        #: job id
        self.id = job_manager.job_id.next()
        self.running = False

    def pre_job(self):
        """Job preparation that is called when doing start, it can raise
        exceptions to report error to the caller.

        In the latter case, the job also removes itself from the job list.
        """
        pass

    def run_job(self):
        """Override this method to define what your job does."""
        raise NotImplementedError

    def run(self):
        """Thread entry point: execute :py:meth:`run_job`.

        Exceptions never leave the thread, they are only logged at debug
        level, `run_job` being the place where errors should be reported.
        """
        try:
            self.run_job()
        except Exception:
            logger.debug('Unhandled exception in job %s', self.id,
                         exc_info=True)
        finally:
            self.running = False

    def notify(self):
        """Tell the job manager that this job is done."""
        self.job_manager.notify(self)

    def start(self):
        """Start job in a background thread."""
        # first we run pre_job as it could raise an exception
        try:
            self.pre_job()
        except Exception:
            # in case of error we must remove the job from the manager
            self.notify()
            raise

        # then we start the watcher when it's safe
        self.running = True
        Thread.start(self)
        # thread will signal when it's done using async

    def wait(self):
        """For jobs running in a background, this method MUST be called in
        order to remove the job from the list in the end.
        """
        self.join()
        self.notify()

    def start_current(self):
        """Start job in current thread."""
        try:
            self.pre_job()
            self.running = True
            self.run_job()
            # we could log exceptions here but rather do it inside the
            # run_job method
        finally:
            self.notify()

    def stop(self):
        """Clear the `running` flag.

        Nothing is interrupted here, `run_job` implementations are expected
        to check the flag themselves.
        """
        self.running = False


class ForkedJob(object):
    """Job that executes in a fork.

    When inherit, you must define open_fds property that list file
    descriptors that must be kept in the child and closed in the parent.

    .. warning::
        logging should not be used in the child as this would cause a
        deadlock to occur, see http://bugs.python.org/issue6721
    """

    def __init__(self, job_manager):
        """
        :param job_manager: :class:`JobManager` instance
        """
        self.job_manager = job_manager
        #: job id
        self.id = job_manager.job_id.next()
        self.running = False
        # event for other thread to wait for job termination
        self.job_done = Event()
        self.fork_pid = None
        # internal event used to wait for forked process termination
        self.fork_die = Event()
        # libev child watcher
        self.fork_watcher = None
        # return status of forked process
        self.return_status = None

    def create_fork_watcher(self):
        """Register and start a child watcher for the forked process on the
        main event loop, :py:meth:`child_cb` being its callback.
        """
        # NOTE(review): second argument is presumably the watcher "trace"
        # flag (only report termination) -- confirm against the loop API
        self.fork_watcher = self.job_manager.main.evloop.child(
            self.fork_pid, False, self.child_cb)
        self.fork_watcher.start()

    def child_cb(self, watcher, revents):
        """Child watcher callback: record the status reported by the watcher
        and wake up :py:meth:`wait`.
        """
        self.return_status = watcher.rstatus
        self.fork_die.set()
        watcher.stop()
        self.fork_watcher = None

    def pre_job(self):
        """This method represents any preparation job that must be done in
        the parent before the fork.
        """
        pass

    def run_job(self):
        """This represent the work that will be done in the forked child.

        This method MUST be redefined in subclasses.
        """
        pass

    def after_fork(self):
        """This method will be called just after fork in the child.

        It does nothing by default and it can be redefined in subclasses.
        """
        pass

    def fatal(self, fmt, *args, **kwargs):
        """Write error message in stderr and exit.

        Never returns: the process is always terminated with `os._exit`.
        Exit status 42 is used when the arguments themselves are invalid or
        when stderr cannot be written.

        :param str fmt: format string
        :param args: arguments for format string
        :param kwargs: can contain ('status', :int:) -> exit status of
            process
        """
        raw_status = kwargs.get('status', 1)
        try:
            status = int(raw_status)
        except (ValueError, TypeError):
            # report the raw value: `status` is not bound when int() fails
            sys.stderr.write('Bad status argument %s' % (raw_status,))
            os._exit(42)

        try:
            fmt = fmt % args
        except (ValueError, TypeError):
            sys.stderr.write('Bad formatting for string: %s' % fmt)
            os._exit(42)

        try:
            sys.stderr.write(fmt)
        except IOError:
            os._exit(42)

        os._exit(status)

    def fatal_exc(self, fmt, *args, **kwargs):
        """Write error message and traceback and exit.

        Must be called while an exception is being handled, the current
        traceback is appended to the message.

        :param str fmt: format string
        :param args: arguments for format string
        :param kwargs: can contain ('status', :int:) -> exit status of
            process
        """
        tb = StringIO()
        tb.write('\n')
        traceback.print_exc(file=tb)
        tb.write('\n')
        fmt += '%s'
        args = args + (tb.getvalue(),)
        self.fatal(fmt, *args, status=kwargs.get('status', 1))

    def reset_signal_mask(self):
        """Install the signal handlers of the child: exit with status 1 on
        SIGTERM, ignore SIGUSR1 and SIGINT.
        """
        signal.signal(signal.SIGTERM, lambda *args: os._exit(1))
        signal.signal(signal.SIGUSR1, signal.SIG_IGN)
        signal.signal(signal.SIGINT, signal.SIG_IGN)

    def run(self):
        """This method performs all the hard work by doing the actual fork.

        It catches all possible exceptions in the child, as this would
        prevents the latter from going back in the stack and doing nasty
        things with libev loop or sjRPC.
        Thus you do not need to capture all exceptions in your code,
        furthermore, if you need to exit from the child, you'd better use
        :func:`os._exit` function.

        :raises OSError: in the parent, when the fork fails or when a file
            descriptor of `open_fds` cannot be closed (EBADF excepted)
        """
        try:
            self.fork_pid = os.fork()
        except OSError as exc:
            logger.error('Cannot fork (job %s): %s', self.id, exc.strerror)
            raise

        self.running = True
        if self.fork_pid == 0:
            # child
            # just hope to not receive any signals in between since there
            # is no way to block signals in python :(
            # Bare excepts are deliberate below: nothing, not even
            # SystemExit, may unwind into the stack inherited from the
            # parent.
            try:
                self.reset_signal_mask()
                close_fds(exclude_fds=self.open_fds,
                          debug=self.job_manager.main.config.debug)
                self.after_fork()
            except:
                traceback.print_exc()
                os._exit(1)

            try:
                self.run_job()
            except:
                sys.stderr.write('Error during job %s\n' % self.id)
                traceback.print_exc()
                os._exit(1)
            else:
                sys.stderr.write('Job execution went well %s\n' % self.id)
                os._exit(0)
        else:
            self.create_fork_watcher()
            # close child fds
            for fd in self.open_fds:
                try:
                    os.close(fd)
                except OSError as exc:
                    if exc.errno == errno.EBADF:
                        # FIXME this is weird but it seems to happen
                        # sometimes
                        logger.debug('Error while closing fd %s in parent,'
                                     ' EBADF (job %s)', fd, self.id)
                        continue
                    logger.error('Error while closing fds in parent: %s',
                                 exc.strerror)
                    raise

    def start(self):
        """This will start the job by executing :py:meth:`pre_job` method
        and :py:meth:`run`.

        `pre_job` is executed in the calling thread while `run` is handed
        over to the main loop through `call_in_main_thread`.
        """
        self.pre_job()
        self.job_manager.main.call_in_main_thread(self.run)

    def stop(self):
        """This would be called to stop the job.

        The child is sent SIGKILL, nothing is done when there is no child
        to kill (not forked yet, already dead or already waited for).

        :raises OSError: if the signal cannot be sent for another reason
            than the process being already gone
        """
        if self.fork_pid is None or self.fork_die.is_set():
            return
        try:
            os.kill(self.fork_pid, signal.SIGKILL)
        except OSError as exc:
            if exc.errno == errno.ESRCH:
                logger.debug('Child already killed')
                return
            logger.error('Cannot kill child for IO job: %s', exc.strerror)
            raise

    def notify(self):
        """Tell the job manager that this job is done."""
        self.job_manager.notify(self)

    def wait(self):
        """This will wait for the fork to end and raise exception depending
        on child return status.

        A child terminated by SIGKILL (what :py:meth:`stop` sends) is only
        logged. Any other abnormal termination, non-zero exit code or other
        signal, raises.

        .. warning::
            This method MUST be called.

        :raises JobError: if the child did not terminate normally
        """
        if self.fork_pid is None:
            return
        try:
            self.fork_die.wait()
            # wait(2) status layout: exit code in the high byte, terminating
            # signal in the low 7 bits (bit 7 is the core dump flag). The
            # exit code is 0 when the child is terminated by a signal, hence
            # both parts must be checked independently.
            status = self.return_status
            exit_code = status >> 8
            term_signal = status & 0x7f
            if term_signal == signal.SIGKILL:
                logger.error('Job was killed')
            elif exit_code != 0 or term_signal != 0:
                raise JobError('Exception during job, returned %s, signal'
                               ' %s' % (exit_code, num_to_sig(term_signal)))
        finally:
            self.fork_pid = None
            self.running = False
            self.job_done.set()
            self.notify()

    def join(self):
        """This provides an API similar to :meth:`threading.Thread.join`,
        you can wait for the job termination from multiple points in your
        program but one of these and only one MUST be `wait` method.
        """
        self.job_done.wait()


class BaseIOJob(ForkedJob):
    """Fork job that set ionice on the child."""

    #: level of io nice that will be set (see :manpage:`ionice(1)`)
    IO_NICE = 7

    def after_fork(self):
        """Lower the IO priority of the child with the `ionice` command.

        Failures are reported on stderr and are not fatal for the job.
        """
        try:
            subprocess.check_call(['ionice', '-n%d' % self.IO_NICE,
                                   '-p%d' % os.getpid()], close_fds=True)
        except subprocess.CalledProcessError as exc:
            sys.stderr.write('Cannot set ionice, return code %s\n'
                             % exc.returncode)
        except OSError as exc:
            # ionice could not be executed at all (e.g. not installed)
            sys.stderr.write('Cannot set ionice, %s\n' % exc.strerror)