Files
catalyst/zipline/core/monitor.py
T

655 lines
19 KiB
Python

import os
import zmq
import sys
import time
import itertools
import logbook
from setproctitle import setproctitle
from signal import SIGHUP, SIGINT
from collections import OrderedDict, Counter
from zipline.protocol import (
CONTROL_PROTOCOL,
CONTROL_FRAME,
CONTROL_UNFRAME,
CONTROL_STATES,
INVALID_CONTROL_FRAME
)
from zipline.utils.protocol_utils import ndict
# Unpack the control states into module-level names; CONTROL_STATES
# must yield exactly these four values when iterated.
INIT, SOURCES_READY, RUNNING, TERMINATE = CONTROL_STATES

# Every (old, new) state pair the Monitor.state setter accepts.  Any
# pair not listed here is rejected with a RuntimeError.  -1 is the
# value a Monitor starts with before its first transition.
CONTROLLER_TRANSITIONS = frozenset((
    # normal progression
    (-1, INIT),
    (INIT, SOURCES_READY),
    (SOURCES_READY, RUNNING),
    (RUNNING, TERMINATE),
    # pseudo failure modes: terminating before RUNNING was reached
    (INIT, TERMINATE),
    (SOURCES_READY, TERMINATE),
))
class UnknownChatter(Exception):
    """
    Raised when a component reports in under a name the monitor
    does not expect to hear from.

    :param name: the identity the unexpected component used; kept
                 on the instance as ``named``.
    """

    def __init__(self, name):
        # Pass the name up to Exception so ``args`` is populated.
        # Without this, pickling the exception and rebuilding it
        # fails because the constructor is re-invoked with no
        # arguments.
        Exception.__init__(self, name)
        self.named = name

    def __str__(self):
        # Wrap in a one-tuple so a tuple-valued name is formatted as
        # a single value instead of being unpacked by ``%``.
        return """Component calling itself "%s" talking on unexpected channel""" % (self.named,)
# Module-wide logger; every message from this file goes through it.
log = logbook.Logger('Monitor')

# The scalars determining the timing of the monitor behavior for
# the system.
#
# NOTE: the key names are looked up by attribute elsewhere, so the
# spelling of PRESTART_HEARBEATS is kept exactly as it is.
PARAMETERS = ndict({
    # time Monitor will wait for a heartbeat, in seconds
    'GENERATIONAL_PERIOD': 20,
    # time Component will wait for GO and for a heartbeat before
    # timing out.
    'MAX_COMPONENT_WAIT': 25,
    # The values below are not read anywhere in the visible part of
    # this module; presumably they are consumed by the components --
    # TODO confirm against their users.
    'ALLOWED_SKIPPED_HEARTBEATS': 10,
    'ALLOWED_INVALID_HEARTBEATS': 3,
    'PRESTART_HEARBEATS': 3,
    'SOURCES_START_HEARTBEATS': 3,
    'SYSTEM_TIMEOUT': 50,
})
class Monitor(object):
"""
An N to M messaging system for inter component communication.

The monitor publishes heartbeat, go, kill and shutdown frames on
a PUB socket and collects the components' replies on a ROUTER
socket.

:param pub_socket: Address the monitor binds its PUB socket to.
:func:`message_listener` connects a SUB socket to this address.
:param route_socket: Address the monitor binds its ROUTER socket
to, used to listen for status updates from the individual
components.  :func:`message_sender` connects a DEALER socket to
this address.
:param exception_socket: Address the monitor connects a PUSH
socket to; exception data reported by a component is sent there.
:param send_sighup: When true, :func:`signal_hangup` and
:func:`signal_interrupt` actually send their signals; when false
both only log that the signal was skipped.
"""
# Turn on debug for verbose logging of the system.
# Read by beat() before it logs components that are still missing.
debug = True
# Seconds the heartbeat loop waits for responses before it gives up
# on the current heartbeat.
period = PARAMETERS.GENERATIONAL_PERIOD
def __init__(self, pub_socket, route_socket, exception_socket,
             send_sighup=False):
    """
    Record the socket addresses and reset all bookkeeping.

    No ZeroMQ objects are created here; that happens in
    ``init_zmq`` and ``_poll``.

    :param pub_socket: address for the publishing socket.
    :param route_socket: address for the router socket.
    :param exception_socket: address exception data is pushed to.
    :param send_sighup: whether ``signal_hangup`` and
                        ``signal_interrupt`` really send signals.
    """
    # -- endpoint addresses --
    self.pub_socket = pub_socket
    self.route_socket = route_socket
    self.exception_socket = exception_socket

    # -- ZeroMQ handles, filled in by init_zmq() --
    self.zmq = None
    self.context = None
    self.zmq_poller = None
    # sockets created on behalf of this monitor
    self.associated = []

    # -- lifecycle flags --
    self.nosignals = False
    self.running = False
    self.alive = False
    self.freeform = False
    # -1 means "no transition has happened yet"
    self._state = -1

    # -- heartbeat bookkeeping --
    self.tracked = set()
    self.finished = set()
    self.responses = set()
    self.missed_beats = Counter()
    # timestamp carried by the most recent heartbeat
    self.ctime = 0
    self.tic = time.time()

    # start with an empty topology
    self.topology = set()

    self.send_sighup = send_sighup
    if send_sighup:
        log.info("Request to send sighup/sigint")
def init_zmq(self):
    """
    Attach the zmq module to this instance and create a fresh
    context.

    ``zmq_poller`` receives the ``Poller`` class itself, not an
    instance of it.
    """
    backend = zmq
    self.zmq = backend
    self.context = backend.Context()
    self.zmq_poller = backend.Poller
def add_to_topology(self, component_id):
    """
    Register a component, together with its "FORK-" prefixed twin,
    in the (still mutable) topology.

    :param component_id: name of the component; concatenated with
                         the string "FORK-".
    """
    fork_id = "FORK-" + component_id
    self.topology.update((component_id, fork_id))
def freeze_topology(self):
    """
    Turn an incrementally built topology into a frozen one.

    Does nothing when the topology is already a ``frozenset``;
    otherwise hands the current topology to ``manage``.
    """
    already_frozen = isinstance(self.topology, frozenset)
    if not already_frozen:
        # Components were added one at a time; time to freeze.
        self.manage(self.topology)
def manage(self, topology):
    """
    Give the controller the set of components to manage and mark
    the monitor as alive.

    :param topology: an iterable of component names, or the string
                     'freeform' to heartbeat with anything that
                     shows up (the stored topology is then empty).
    """
    if topology == 'freeform':
        members, freeform = (), True
    else:
        members, freeform = topology, False

    self.freeform = freeform
    self.topology = frozenset(members)
    self.alive = True
@property
def state(self):
    """The current control state (-1 before the first transition)."""
    return self._state

@state.setter
def state(self, new):
    """
    Move to ``new`` if (current, new) is a permitted transition.

    :raises RuntimeError: when the pair is not listed in
                          ``CONTROLLER_TRANSITIONS``.
    """
    old = self._state
    if (old, new) not in CONTROLLER_TRANSITIONS:
        raise RuntimeError("Invalid State Transition : %s -> %s" %(old, new))
    self._state = new
    log.info("State Transition : %s -> %s" % (old, self._state))
def run(self):
    """
    Freeze the topology, set up ZeroMQ, register the exit hook and
    run the heartbeat loop.

    :returns: whatever ``_poll`` returns, or ``None`` when the loop
              is interrupted by a ``KeyboardInterrupt``.
    """
    # Imported locally: atexit is not among this module's top-level
    # imports.
    import atexit

    self.freeze_topology()
    self.running = True
    self.init_zmq()
    setproctitle('Monitor')
    self.state = CONTROL_STATES.INIT

    # TODO: keep the exit function? the corresponding override on
    # clean exit is commented out currently.
    #
    # Interpreter SIDE EFFECT
    # -----------------------
    # The last breath of the interpreter will assume that we've
    # failed unless we specify otherwise.
    #
    # Registered through atexit rather than by assigning
    # sys.exitfunc: that attribute is ignored by Python 3, and on
    # Python 2 assigning it replaces every handler other code
    # registered through atexit.
    log.info('registering exit function')
    atexit.register(self.signal_interrupt)

    try:
        return self._poll()  # use a python loop
    except KeyboardInterrupt:
        log.info('Shutdown event loop')
def log_status(self):
    """
    Snapshot of the tracked components at every period.

    Currently a no-op: the log line that listed ``self.tracked`` is
    disabled.
    """
    return None
def replay_errors(self):
    """
    Replay the errors in the order they were reported to the
    controller.

    The previous body called ``.keys()`` on this very method and
    therefore always raised ``AttributeError``.  Errors are now read
    from an ``errors`` attribute on the instance.

    NOTE(review): nothing in the visible part of this class fills
    ``self.errors`` yet; until something does, the result is an
    empty list.

    :returns: the sorted keys of ``self.errors`` (or its sorted
              items when it is not a mapping); ``[]`` when the
              attribute is missing or empty.
    """
    recorded = getattr(self, 'errors', None)
    if not recorded:
        return []
    return sorted(recorded)
# -------------
# Publications
# -------------
def send_go(self):
    """
    Publish a GO control frame with an empty payload.

    Unlike the other ``send_*`` methods this one does not check
    ``self.running`` first.
    """
    frame = CONTROL_FRAME(CONTROL_PROTOCOL.GO, '')
    self.pub.send(frame)
def send_heart(self):
    """
    Publish a HEARTBEAT control frame carrying ``str(self.ctime)``.

    Silently does nothing while the monitor is not running.
    """
    if self.running:
        beat_frame = CONTROL_FRAME(CONTROL_PROTOCOL.HEARTBEAT, str(self.ctime))
        self.pub.send(beat_frame)
def send_hardkill(self):
    """
    Publish a KILL control frame with an empty payload.

    Silently does nothing while the monitor is not running.
    """
    if self.running:
        frame = CONTROL_FRAME(CONTROL_PROTOCOL.KILL, '')
        self.pub.send(frame)
def send_softkill(self):
    """
    Publish a SHUTDOWN control frame with an empty payload.

    Silently does nothing while the monitor is not running.
    """
    if self.running:
        frame = CONTROL_FRAME(CONTROL_PROTOCOL.SHUTDOWN, '')
        self.pub.send(frame)
# -----------
# Event Loops
# -----------
def _poll(self):
    """
    Bind the monitor's sockets and run the heartbeat loop until the
    monitor is no longer alive.

    Each pass of the outer loop is one heartbeat generation: publish
    a heartbeat, collect replies from the ROUTER socket until the
    generation ends, let ``beat`` reconcile who answered, then decide
    whether to send GO, shut down, or go round again.

    Requires ``init_zmq`` to have run (``self.context`` and
    ``self.zmq`` are used) and a non-empty topology.

    :returns: ``None`` once the loop exits.
    """
    assert self.route_socket
    assert self.pub_socket
    # Message text kept exactly as it was, stray leading quote included.
    assert self.topology, (
        "\"Must define topology to monitor, call setup_controller() on\n"
        "your Zipline. "
    )

    # -- Publish --
    # =============
    self.pub = self.context.socket(self.zmq.PUB)
    self.pub.bind(self.pub_socket)
    self.pub.setsockopt(zmq.LINGER, 0)

    # -- Router --
    # =============
    self.router = self.context.socket(self.zmq.ROUTER)
    self.router.bind(self.route_socket)
    self.router.setsockopt(zmq.LINGER, 0)

    # -- Exception Out --
    # ===================
    self.ex_out = self.context.socket(self.zmq.PUSH)
    self.ex_out.connect(self.exception_socket)

    poller = self.zmq.Poller()
    poller.register(self.router, self.zmq.POLLIN)

    self.associated += [self.pub, self.router]

    # TODO: actually do this
    self.state = CONTROL_STATES.SOURCES_READY
    self.state = CONTROL_STATES.RUNNING

    # Milliseconds to block in each poll.  A zero timeout returns
    # immediately, which made the collection loop below spin at full
    # speed for the whole generation; a short blocking wait keeps the
    # exit checks responsive without burning a core.
    poll_timeout_ms = 10

    # Frames of the multipart message currently being received.
    frames = []

    # ===================
    # Heartbeat Iteration
    # ===================
    while True:
        self.log_status()

        # Reset the responses for this cycle
        self.responses = set()

        # broadcast the heartbeat packet
        self.ctime = time.time()
        self.send_heart()

        # ===============
        # Heartbeat Cycle
        # ===============
        # True until the first component is tracked or finished.
        initializing = not self.tracked and not self.finished

        # Wait for the responses
        while self.alive:
            socks = dict(poller.poll(poll_timeout_ms))
            tic = time.time()

            if socks.get(self.router) == self.zmq.POLLIN:
                rawmessage = self.router.recv()
                # empty frames are not kept
                if rawmessage:
                    frames.append(rawmessage)

                if not self.router.getsockopt(self.zmq.RCVMORE):
                    # Last frame of the message.  Detach the frames
                    # *before* handling them: if the message is
                    # rejected, its frames must not linger and be
                    # glued onto the front of the next message.
                    message, frames = frames, []
                    try:
                        self.handle_recv(message)
                    except INVALID_CONTROL_FRAME:
                        # Format the frames into the text ourselves;
                        # passed as a separate argument they were
                        # never rendered because the message had no
                        # placeholder.
                        log.error('Invalid frame: %r' % (message,))

            # We break out of this loop if the time between
            # sending and receiving the heartbeat is more
            # than our poll period.
            if tic - self.ctime > self.period:
                log.info("heartbeat loop timedout: %s" % (tic - self.ctime))
                log.info(repr(self.responses))
                break

            # if this is the first time heartbeating, break
            # out early if we get everything tracked; no need
            # to hold out for the full heartbeat.
            if initializing and not self.freeform:
                if len(self.responses) == len(self.topology):
                    log.info("breaking out of initial heartbeat")
                    break

            # Break out if the entire topology told us it's DONE
            if len(self.finished) == len(self.topology):
                break

        # ================
        # Heartbeat Stats
        # ================
        # beat() returns nothing; it updates the tracked set.
        self.beat()

        # ================
        # Topology Status
        # ================
        # Has the entire topology told us it's DONE
        done = len(self.finished) == len(self.topology)
        # Has the entire topology shown up to the party
        complete = len(self.tracked) == len(self.topology)

        if complete:
            self.send_go()

        log.info('Heartbeat (%s, %s)' % (done, complete))

        # ================
        # Exit Strategies
        # ================
        # Will also fall out of loop when done, if using
        # non-freeform topology
        if done:
            log.info('Entire topology exited cleanly')
            self.shutdown()
            # The exit hook installed by run() is left in place; the
            # no-op override that used to sit here is disabled.
            # Send SIGHUP to burrito
            self.signal_hangup()

        if not self.alive:
            log.info('Breaking out of Monitor Loop')
            break
def signal_hangup(self):
    """
    A clean exit: inform the burrito ( and arbiter ) that the
    topology exited cleanly by sending SIGHUP to the parent process.

    Only logs a warning when ``send_sighup`` is off.
    """
    if self.send_sighup:
        parent_pid = os.getppid()
        log.warning("Sending SIGHUP")
        os.kill(parent_pid, SIGHUP)
    else:
        log.warning("Skipping SIGHUP")
def signal_interrupt(self):
    """
    Send a SIGINT for the error mode in which the monitor's
    interpreter exits.  If the monitor dies the system is
    considered a failure.

    Only logs a warning when ``send_sighup`` is off.

    NOTE(review): the signal goes to this process (``os.getpid()``),
    whereas ``signal_hangup`` targets the parent (``os.getppid()``).
    Confirm which target is intended before changing it; this
    method is also installed as the interpreter exit hook.
    """
    if self.send_sighup:
        target_pid = os.getpid()
        log.warning("Sending SIGINT")
        os.kill(target_pid, SIGINT)
    else:
        log.warning("Skipping SIGINT")
def beat(self):
    """
    The tracking logic of the system.  It's the "stethoscope" that
    inspects the heartbeats of one generation and infers the state
    of the system from the responses.

    Sets involved (``&`` is intersection, ``-`` is difference):

    * answered  - tracked components that responded this round.
    * silent    - tracked, unfinished components that did not
                  respond; each is reported to ``timed_out``.
    * newcomers - responders that are neither tracked nor finished;
                  each is passed to ``new``.
    """
    answered = self.tracked & self.responses
    # Both sets are computed before any handler runs, because new()
    # adds to self.tracked.
    silent = self.tracked - answered - self.finished
    newcomers = self.responses - answered - self.finished

    for component in newcomers:
        self.new(component)

    for component in silent:
        self.timed_out(component)

    # Expected components that have not shown up yet.
    missing = self.topology - self.tracked - self.finished
    if self.debug:
        for component in missing:
            log.info('Missing component %r' % component)

    # Tracked components that are not part of the topology.
    uninvited = [name for name in self.tracked if name not in self.topology]
    for component in uninvited:
        log.info('Uninvited component %r' % component)
# --------------
# Init Handlers
# --------------
def new_universal(self):
    """
    Default init handler run by ``new`` for a component that has
    just been accepted.  Intentionally does nothing.
    """
    return None
# The various "states of being" that a component can inform us
# of.
def new(self, component):
    """
    Start tracking a component that answered for the first time.

    Ignored once the monitor is terminating or when the component
    has already finished.

    :raises UnknownChatter: when the component is not an unfinished
                            member of the topology and the monitor
                            is not in freeform mode.
    """
    terminating = self.state is CONTROL_STATES.TERMINATE
    if terminating or component in self.finished:
        return

    log.info('Now Tracking "%s" ' % component)

    expected = component in (self.topology - self.finished)
    if not (expected or self.freeform):
        # Some sort of socket collision has occurred, this is
        # a very bad failure mode.
        raise UnknownChatter(component)

    # No per-component init handlers are registered, so the
    # universal one is what always runs.
    self.new_universal()
    self.tracked.add(component)
# ------------------
# Epic Fail Handling
# ------------------
def timed_out(self, component):
    """
    React to a tracked component that missed a heartbeat.

    A time out is treated as a severe failure: unless the monitor
    is already terminating, a warning is logged and ``kill``
    conducts a rapid shutdown.
    """
    if self.state is CONTROL_STATES.TERMINATE:
        return

    still_expected = component in (self.topology - self.finished)
    if not (still_expected or self.freeform):
        return

    log.warning('Component "%s" missed heartbeat' % component)
    self.kill()
# -------------------
# Completion Handling
# -------------------
def done(self, component):
    """
    Record that a component has finished its work: it joins the
    finished set and is no longer tracked.
    """
    # set.discard: no error if the component was never tracked.
    self.finished.add(component)
    self.tracked.discard(component)

    log.info('Component "%s" finished.' % component)
# --------------
# Error Handling
# --------------
def exception(self, component, exception_data):
    """
    Handle a component that reported an exception.

    The exception data is pushed out on the exception socket, then
    the whole system is brought down with ``kill``: an exception in
    one component is treated as a hard failure.

    :param component: identity of the failing component.
    :param exception_data: payload forwarded unchanged to listeners.
    """
    notice = ('Component in exception state: %s. Shutting down system and sending exception data to listeners.'
              % component)
    log.error(notice)

    # Send the exception message out to listeners.
    self.ex_out.send(exception_data)

    # Rapid shutdown.
    self.kill()
# -----------------
# Protocol Handling
# -----------------
def handle_recv(self, msg):
    """
    Decode one multipart message received on the ROUTER socket and
    act on it.

    Separates the proper frames from anything else that might be
    coming over the wire.

    :param msg: list of frames; ``msg[0]`` is the identity of the
                sending socket and ``msg[1]`` is the control frame
                handed to ``CONTROL_UNFRAME``.
    :raises: whatever ``CONTROL_UNFRAME`` raises for a frame it
             cannot decode (presumably ``INVALID_CONTROL_FRAME``,
             which the polling loop catches -- verify).
    """
    # An identity with no payload cannot be unframed; report it
    # instead of failing with an IndexError.
    if len(msg) < 2:
        log.error('Incomplete control message: %r' % (msg,))
        return

    identity = msg[0]  # identity of the socket
    frame_id, status = CONTROL_UNFRAME(msg[1])

    # The frame id was decoded from the wire, so it is compared by
    # value: identity comparison only holds if the decoder happens
    # to return the very same object as the protocol constant.

    # I'm alive, condemned to be a free process in the cold
    # cold dark absurd Zipline universe.
    if frame_id == CONTROL_PROTOCOL.READY:
        self.responses.add(identity)
        return

    # The heartbeat love song between a component and the
    # controller
    if frame_id == CONTROL_PROTOCOL.OK:
        if status == str(self.ctime):
            # Go to your bosom; knock there, and ask your heart what
            # it doth know...
            self.responses.add(identity)
            return

        # A status that is not a number is line noise, not a reason
        # to take the monitor down.
        try:
            stamp = float(status)
        except (TypeError, ValueError):
            stamp = None

        if stamp is None:
            log.error("Weird heartbeat packet happened: %s" % msg)
        elif stamp < self.ctime:
            # False face must hide what the false heart doth know.
            log.warning('Delayed heartbeat received: %s' % msg)
        elif stamp > self.ctime:
            # Pre-emptive heartbeat from the component
            self.responses.add(identity)
        else:
            # Otherwise it's something weird and we don't know
            # what to do so just say so, probably line noise
            # from ZeroMQ
            # What's in a name? that which we call a rose...
            log.error("Weird heartbeat packet happened: %s" % msg)
        return

    # A component is telling us it failed, and how
    if frame_id == CONTROL_PROTOCOL.EXCEPTION:
        # status should be a msgpack emitted from
        # EXCEPTION_FRAME
        try:
            self.exception(identity, status)
        except Exception:
            # if an exception occurs when we try to handle
            # the exception, signal the parent that we need
            # to go down.  KeyboardInterrupt / SystemExit are no
            # longer swallowed here and reach the caller.
            # TODO: should we attempt to call self.exception?
            log.exception("Unexpected exception sending exception data")
            self.kill()
        return

    # A component is telling us it's done with work and won't
    # be talking to us anymore
    if frame_id == CONTROL_PROTOCOL.DONE:
        self.done(identity)
        return
# -------------------
# Hooks for Endpoints
# -------------------
# These are all connects so no complex allocation logic is
# needed. Dealers and Subscribers can all come and go as a
# function of time without impacting flow of the whole
# system.
def message_sender(self, identity, context = None):
    """
    Spin off a socket used for sending messages to this
    controller.

    :param identity: value set as the socket's ``zmq.IDENTITY``.
    :param context: ZeroMQ context to create the socket in; when
                    omitted the shared ``zmq.Context.instance()``
                    is used.
    :returns: a DEALER socket connected to ``route_socket``; it is
              also appended to ``self.associated``.
    """
    if not context:
        # Use the module directly: ``self.zmq`` is still None until
        # init_zmq() has run, so going through it failed for any
        # caller that asked for a sender before the monitor started.
        context = zmq.Context.instance()

    sender = context.socket(zmq.DEALER)
    sender.setsockopt(zmq.IDENTITY, identity)
    sender.connect(self.route_socket)

    self.associated.append(sender)
    return sender
def message_listener(self, context = None):
    """
    Spin off a socket used for receiving messages from this
    controller.

    :param context: ZeroMQ context to create the socket in; when
                    omitted the shared ``zmq.Context.instance()``
                    is used.
    :returns: a SUB socket connected to ``pub_socket`` and
              subscribed to everything; it is also appended to
              ``self.associated``.
    """
    if not context:
        # Use the module directly: ``self.zmq`` is still None until
        # init_zmq() has run.
        context = zmq.Context.instance()

    listener = context.socket(zmq.SUB)
    listener.connect(self.pub_socket)
    # Empty prefix = receive every message.  Written as a bytes
    # literal so it is the same value on Python 2 and is also
    # accepted on Python 3, where a text string is rejected.
    listener.setsockopt(zmq.SUBSCRIBE, b'')

    self.associated.append(listener)
    return listener
def kill(self):
    """
    Aggressively exit the whole zipline.

    Publishes a KILL frame, moves to the TERMINATE state, marks the
    monitor as no longer alive, waits three seconds and then calls
    ``signal_interrupt``.  A second call while already terminating
    is a no-op.
    """
    if self.state is not CONTROL_STATES.TERMINATE:
        log.info('Hard Shutdown')
        self.send_hardkill()

        self.state = CONTROL_STATES.TERMINATE
        self.alive = False

        # send burrito an interrupt, instructing it to kill all
        # child processes associated with this zipline.
        time.sleep(3)
        self.signal_interrupt()
def shutdown(self):
    """
    Gracefully stop the zipline.

    Publishes a SHUTDOWN frame, moves to the TERMINATE state and
    marks the monitor as no longer alive.  A second call while
    already terminating is a no-op.
    """
    if self.state is not CONTROL_STATES.TERMINATE:
        log.info('Soft Shutdown')
        self.send_softkill()

        self.state = CONTROL_STATES.TERMINATE
        self.alive = False