# jobs.py (12.13 KiB) -- authored by Anael Beutot
import errno
import itertools
import logging
import os
import resource
import signal
import subprocess
import sys
import traceback
from StringIO import StringIO
from threading import Thread, Event

from cloudcontrol.node.exc import JobError
from cloudcontrol.node.utils import num_to_sig

logger = logging.getLogger(__name__)
class JobManager(object):
    """Registry of jobs.

    Hands out job ids through :attr:`job_id` and keeps an index of every job
    created through :meth:`create` until it is removed.
    """

    def __init__(self, main_loop):
        """
        :param main_loop: :class:`MainLoop` instance
        """
        #: iterator producing a new job id (0, 1, 2, ...) each time it is
        #: advanced
        self.job_id = itertools.count()
        self.main = main_loop
        #: keep an index of all jobs
        self.jobs = {}

    def job_start(self):
        """Placeholder hook, does nothing."""
        pass

    def job_stop(self):
        """Placeholder hook, does nothing."""
        pass

    def notify(self, job):
        """Called when a job is done."""
        # by now only remove the job
        self.remove(job.id)

    def cancel(self, job_id):
        """Cancel a job by calling its ``stop`` method.

        :param job_id: id of the job to cancel
        :raises KeyError: if no job is registered under `job_id`
        """
        self.jobs[job_id].stop()

    def remove(self, job_id):
        """Drop a job from the index.

        :param job_id: id of the job to remove
        :return: the removed job, or None (after logging an error) when the
            id is unknown
        """
        try:
            return self.jobs.pop(job_id)
        except KeyError:
            logger.error('Job %s does not exist', job_id)
            return None

    def create(self, job_constructor, *args, **kwargs):
        """Create a new job and populate job id.

        :param job_constructor: callable invoked as
            ``job_constructor(self, *args, **kwargs)``; the object it returns
            must expose an ``id`` attribute
        :return: the newly registered job
        """
        job = job_constructor(self, *args, **kwargs)
        self.jobs[job.id] = job
        return job

    def get(self, job_id):
        """Return the job registered under `job_id`.

        :raises KeyError: if the id is unknown
        """
        return self.jobs[job_id]

    def start(self):
        """Placeholder hook, does nothing."""
        pass

    def stop(self):
        """Ask every registered job to stop.

        A job that fails to stop is logged and does not prevent the remaining
        jobs from being stopped.
        """
        logger.debug('Stopping all currently running jobs')
        # iterate over a snapshot so that a job removing itself from the index
        # while being stopped cannot invalidate the iteration
        for job_id, job in list(self.jobs.items()):
            try:
                job.stop()
            except Exception:
                logger.exception('Error while stopping job %s', job_id)
class BaseThreadedJob(Thread):
    """Job running in a background thread.

    Handles job notification to the job manager.
    """

    def __init__(self, job_manager):
        """
        :param job_manager: object providing a ``job_id`` iterator and a
            ``notify(job)`` method
        """
        Thread.__init__(self)
        #: report progress in %
        self.progress = 0.
        self.job_manager = job_manager
        #: job id
        self.id = next(job_manager.job_id)
        #: True from the moment the job is started until it finishes or is
        #: asked to stop
        self.running = False

    def pre_job(self):
        """Job preparation that is called when doing start, it can raise
        exceptions to report an error to the caller. In the latter case, the
        job also removes itself from the job list.
        """
        pass

    def run_job(self):
        """Override this method to define what your job does."""
        raise NotImplementedError

    def run(self):
        """Thread entry point: execute :meth:`run_job`.

        Exceptions raised by the job do not leave the thread; they are logged
        here as a safety net. The `running` flag is cleared in every case.
        """
        try:
            self.run_job()
        except Exception:
            logger.exception('Unhandled error in job %s', self.id)
        finally:
            self.running = False

    def notify(self):
        """Tell the job manager that this job is done."""
        self.job_manager.notify(self)

    def start(self):
        """Start job in a background thread.

        :raises Exception: whatever :meth:`pre_job` raised; the manager is
            notified before the exception is propagated
        """
        # first we run pre_job as it could raise an exception
        try:
            self.pre_job()
        except Exception:
            # in case of error we must remove the job from the manager
            self.notify()
            raise
        # then we start the thread when it's safe
        self.running = True
        Thread.start(self)

    def wait(self):
        """For jobs running in the background, this method MUST be called in
        order to remove the job from the list in the end.
        """
        self.join()
        self.notify()

    def start_current(self):
        """Start job in current thread.

        Exceptions from :meth:`pre_job` or :meth:`run_job` are propagated to
        the caller; the manager is notified in every case.
        """
        try:
            self.pre_job()
            self.running = True
            self.run_job()
            # we could log exceptions here but rather do it inside the run_job
            # method
        finally:
            # mirror run(): the job is no longer running once run_job returns
            # or raises
            self.running = False
            self.notify()

    def stop(self):
        """Clear the `running` flag.

        This does not interrupt the thread by itself.
        """
        self.running = False
class ForkedJob(object):
    """Job that executes in a fork.

    When inheriting, you must define an `open_fds` property that lists the
    file descriptors that must be kept in the child and closed in the parent.

    .. warning::
        logging should not be used in the child as this would cause a deadlock
        to occur, see http://bugs.python.org/issue6721
    """

    def __init__(self, job_manager):
        """
        :param job_manager: :class:`JobManager` instance
        """
        self.job_manager = job_manager
        #: job id
        self.id = next(job_manager.job_id)
        #: True from the fork until the child has been reaped by `wait`
        self.running = False
        # event for other thread to wait for job termination
        self.job_done = Event()
        #: pid of the child, None when there is no child to wait for
        self.fork_pid = None

    def pre_job(self):
        """This method represents any preparation job that must be done in the
        parent before the fork.
        """
        pass

    def run_job(self):
        """This represents the work that will be done in the forked child.

        This method MUST be redefined in subclasses.
        """
        pass

    def after_fork(self):
        """This method will be called just after fork in the child.

        It does nothing by default and it can be redefined in subclasses.
        """
        pass

    def fatal(self, fmt, *args, **kwargs):
        r"""Write error message in stderr and exit.

        The process exits with status 42 when the status argument or the
        format string is invalid, or when stderr cannot be written.

        :param str fmt: format string
        :param \*args: arguments for format string
        :param \*\*kwargs: can contain ('status', :int:) -> exit status of
            process (defaults to 1)
        """
        raw_status = kwargs.get('status', 1)
        try:
            status = int(raw_status)
        except (ValueError, TypeError):
            # report the value that was passed: `status` is not bound here
            sys.stderr.write('Bad status argument %s' % (raw_status,))
            os._exit(42)

        try:
            fmt = fmt % args
        except (ValueError, TypeError):
            sys.stderr.write('Bad formatting for string: %s' % fmt)
            os._exit(42)

        try:
            sys.stderr.write(fmt)
        except IOError:
            os._exit(42)

        os._exit(status)

    def fatal_exc(self, fmt, *args, **kwargs):
        r"""Write error message and traceback and exit.

        Must be called while an exception is being handled, the traceback of
        that exception is appended to the message.

        :param str fmt: format string
        :param \*args: arguments for format string
        :param \*\*kwargs: can contain ('status', :int:) -> exit status of
            process (defaults to 1)
        """
        tb = StringIO()
        tb.write('\n')
        traceback.print_exc(file=tb)
        tb.write('\n')
        # the traceback is passed as one extra format argument
        fmt += '%s'
        args = args + (tb.getvalue(),)
        self.fatal(fmt, *args, status=kwargs.get('status', 1))

    def close_fds(self):
        """Close all unneeded fds in the child.

        If global debug variable in configuration is set to True, then it will
        prevent standard input/output from being closed thus allowing logging
        on stderr. Otherwise the three standard streams are reopened on
        ``os.devnull``.
        """
        debug = self.job_manager.main.config.debug
        # get max fd
        limit = resource.getrlimit(resource.RLIMIT_NOFILE)[1]
        max_fd = 2048 if limit == resource.RLIM_INFINITY else limit

        # work on a copy so that `open_fds` itself is never modified
        keep_fds = set(self.open_fds)
        if debug:
            keep_fds.update((0, 1, 2))

        for fd in xrange(max_fd, -1, -1):
            if fd in keep_fds:
                continue
            try:
                os.close(fd)
            except OSError as exc:
                if exc.errno != errno.EBADF:
                    raise
                # wasn't open

        if not debug:
            # fds 0, 1 and 2 were just closed, so these three opens get them
            # back in order; output streams must be opened for writing or any
            # later write on them would fail
            sys.stdin = open(os.devnull)
            sys.stdout = open(os.devnull, 'w')
            sys.stderr = open(os.devnull, 'w')
            assert sys.stdin.fileno() == 0
            assert sys.stdout.fileno() == 1
            assert sys.stderr.fileno() == 2

    def reset_signal_mask(self):
        """Install the child's signal handlers.

        SIGTERM makes the process exit with status 1, SIGUSR1 and SIGINT are
        ignored.
        """
        signal.signal(signal.SIGTERM, lambda *args: os._exit(1))
        signal.signal(signal.SIGUSR1, signal.SIG_IGN)
        signal.signal(signal.SIGINT, signal.SIG_IGN)

    def _run_child(self):
        """Child side of the fork, never returns.

        Exits with status 0 when the job ran to completion and with status 1
        when any step (signal setup, fd cleanup, :meth:`after_fork` or
        :meth:`run_job`) raised.
        """
        status = 1
        try:
            self.reset_signal_mask()
            self.close_fds()
            self.after_fork()
            self.run_job()
            status = 0
        except BaseException:
            # deliberately broad: nothing may travel back up the stack that
            # was inherited from the parent
            pass
        finally:
            try:
                if status == 0:
                    sys.stderr.write('Job execution went well %s\n' % self.id)
                else:
                    sys.stderr.write('Error during job %s\n' % self.id)
            finally:
                # exit even if stderr is not writable
                os._exit(status)

    def _close_parent_fds(self):
        """Parent side of the fork: close the fds handed over to the child."""
        for fd in self.open_fds:
            try:
                os.close(fd)
            except OSError as exc:
                if exc.errno == errno.EBADF:
                    # FIXME this is weird but it seems to happen sometimes
                    logger.debug('Error while closing fd %s in parent,'
                                 ' EBADF (job %s)', fd, self.id)
                    continue
                logger.error('Error while closing fds in parent: %s',
                             exc.strerror)
                raise

    def run(self):
        """This method performs all the hard work by doing the actual fork.

        It catches all possible exceptions in the child, as this prevents the
        latter from going back in the stack and doing nasty things with libev
        loop or sjRPC.

        Thus you do not need to capture all exceptions in your code,
        furthermore, if you need to exit from the child, you'd better use
        `os._exit <http://docs.python.org/library/os.html#os._exit>`_ function.

        :raises OSError: in the parent, when the fork fails or when a fd
            listed in `open_fds` cannot be closed
        """
        try:
            self.fork_pid = os.fork()
        except OSError as exc:
            logger.error('Cannot fork (job %s): %s', self.id, exc.strerror)
            raise

        self.running = True
        if self.fork_pid == 0:
            # child
            self._run_child()
        else:
            # close child fds
            self._close_parent_fds()

    def start(self):
        """This will start the job by executing :py:meth:`pre_job` method and
        :py:meth:`run`.
        """
        self.pre_job()
        self.run()

    def stop(self):
        """This would be called to stop the job.

        It will kill the child but it will not call `os.waitpid
        <http://docs.python.org/library/os.html#os.waitpid>`_ to get return
        status from zombie process. :py:meth:`wait` MUST be called anyway.

        Does nothing when there is no child (job not started yet, or already
        waited for).

        :raises OSError: when the child cannot be killed for a reason other
            than it being already gone
        """
        if self.fork_pid is None:
            return

        try:
            os.kill(self.fork_pid, signal.SIGKILL)
        except OSError as exc:
            if exc.errno == errno.ESRCH:
                logger.debug('Child already killed')
                return

            logger.error('Cannot kill child for IO job: %s', exc.strerror)
            raise

    def notify(self):
        """Tell the job manager that this job is done."""
        self.job_manager.notify(self)

    def wait(self):
        """This will wait for the fork to end and raise exception depending on
        child return status.

        A child terminated by SIGKILL (see :py:meth:`stop`) is only logged.

        .. warning::
            This method MUST be called.

        :raises JobError: when the child exited with a non zero status or was
            terminated by a signal other than SIGKILL
        :raises OSError: when waiting for the child fails
        """
        if self.fork_pid is None:
            return

        try:
            while True:
                try:
                    pid, return_status = os.waitpid(self.fork_pid, 0)
                except OSError as exc:
                    if exc.errno == errno.EINTR:
                        continue

                    logger.error('Error while waiting for child to terminate:'
                                 ' %s (job %s)', exc.strerror, self.id)
                    raise
                else:
                    break

            assert pid == self.fork_pid

            # a wait status carries either an exit code or a terminating
            # signal, decode each with the dedicated helpers
            if os.WIFSIGNALED(return_status):
                exit_code, term_sig = 0, os.WTERMSIG(return_status)
            else:
                exit_code, term_sig = os.WEXITSTATUS(return_status), 0

            if term_sig == signal.SIGKILL:
                logger.error('Job was killed')
            elif exit_code != 0 or term_sig != 0:
                raise JobError('Exception during job, returned %s, signal'
                               ' %s' % (exit_code, num_to_sig(term_sig)))
        finally:
            self.fork_pid = None
            self.running = False
            self.job_done.set()
            self.notify()

    def join(self):
        """This provides an API similar to `threading.Thread.join
        <http://docs.python.org/library/threading.html#threading.Thread.join>`_
        , you can wait for the job termination from multiple points in your
        program but one of these and only one MUST be `wait` method.
        """
        self.job_done.wait()
class BaseIOJob(ForkedJob):
    """Fork job that sets ionice on the child."""

    #: level of io nice that will be set (see :manpage:`ionice(1)`)
    IO_NICE = 7

    def after_fork(self):
        """Apply :attr:`IO_NICE` to the current process with ``ionice``.

        This is best effort: a failure is reported on stderr and the job goes
        on with its default io priority.
        """
        command = ['ionice', '-n%d' % self.IO_NICE, '-p%d' % os.getpid()]
        try:
            subprocess.check_call(command, close_fds=True)
        except subprocess.CalledProcessError as exc:
            sys.stderr.write('Cannot set ionice, return code %s\n' % exc.returncode)
        except OSError as exc:
            # the ionice command itself could not be executed (e.g. it is not
            # installed)
            sys.stderr.write('Cannot run ionice: %s\n' % exc.strerror)