mirror of
https://github.com/wassname/catalyst.git
synced 2026-06-28 13:17:43 +08:00
655 lines
19 KiB
Python
655 lines
19 KiB
Python
import os
|
|
import zmq
|
|
import sys
|
|
import time
|
|
import itertools
|
|
import logbook
|
|
from setproctitle import setproctitle
|
|
from signal import SIGHUP, SIGINT
|
|
|
|
from collections import OrderedDict, Counter
|
|
|
|
from zipline.protocol import (
|
|
CONTROL_PROTOCOL,
|
|
CONTROL_FRAME,
|
|
CONTROL_UNFRAME,
|
|
CONTROL_STATES,
|
|
INVALID_CONTROL_FRAME
|
|
)
|
|
|
|
from zipline.utils.protocol_utils import ndict
|
|
|
|
# Unpack the four controller states, in the order CONTROL_STATES yields them.
INIT, SOURCES_READY, RUNNING, TERMINATE = CONTROL_STATES

# Every (old, new) pair accepted by the Monitor.state setter. -1 is the
# state a Monitor is constructed with, before INIT has been entered.
CONTROLLER_TRANSITIONS = frozenset((
    # normal life cycle
    (-1, INIT),
    (INIT, SOURCES_READY),
    (SOURCES_READY, RUNNING),
    (RUNNING, TERMINATE),
    # pseudo failure modes
    (INIT, TERMINATE),
    (SOURCES_READY, TERMINATE),
))
|
|
|
|
class UnknownChatter(Exception):
    """
    Raised when a component talks on a channel it was not expected on.

    :param name: the identity the offending component reported for itself.
        It is kept on ``self.named`` and forwarded to ``Exception`` so that
        ``args`` is populated.
    """

    def __init__(self, name):
        # Forward to Exception so that ``args`` (and therefore repr and
        # pickling) carry the component name instead of being empty.
        super(UnknownChatter, self).__init__(name)
        self.named = name

    def __str__(self):
        # Wrap in a 1-tuple so a tuple-valued name is formatted as a single
        # value rather than being unpacked by the % operator.
        return """Component calling itself "%s" talking on unexpected channel""" % (self.named,)
|
|
|
|
|
|
log = logbook.Logger('Monitor')

# Timing and tolerance scalars that govern the monitor behavior for the
# whole system.
PARAMETERS = ndict({
    # time Monitor will wait for a heartbeat, in seconds
    'GENERATIONAL_PERIOD': 20,
    # time Component will wait for GO and for a heartbeat before
    # timing out.
    'MAX_COMPONENT_WAIT': 25,
    'ALLOWED_SKIPPED_HEARTBEATS': 10,
    'ALLOWED_INVALID_HEARTBEATS': 3,
    'PRESTART_HEARBEATS': 3,
    'SOURCES_START_HEARTBEATS': 3,
    'SYSTEM_TIMEOUT': 50,
})
|
|
|
|
class Monitor(object):
    """
    A N to M messaging system for inter component communication.

    :param pub_socket: Address the monitor binds its PUB socket to in
        order to broadcast control frames. It is also the address
        :func:`message_listener` connects subscribers to.

    :param route_socket: Address the monitor binds its ROUTER socket to
        in order to listen for status updates from the individual
        components. It is also the address :func:`message_sender`
        connects dealers to.

    :param exception_socket: Address the monitor connects a PUSH socket
        to in order to forward exception data reported by a component.

    :param send_sighup: When false (the default) ``signal_hangup`` and
        ``signal_interrupt`` only log and send no signal.

    """

    # Turn on debug for verbose logging of the system.
    debug = True
    # Length of one heartbeat cycle; the polling loop compares it against
    # elapsed ``time.time()`` seconds.
    period = PARAMETERS.GENERATIONAL_PERIOD
|
|
|
|
def __init__(self, pub_socket, route_socket, exception_socket,
             send_sighup=False):
    """
    Record the socket addresses and reset all bookkeeping.

    No ZeroMQ objects are created here; ``context``, ``zmq`` and
    ``zmq_poller`` stay ``None`` until ``init_zmq`` is called.
    """
    # -- endpoints --
    self.pub_socket = pub_socket
    self.route_socket = route_socket
    self.exception_socket = exception_socket

    # -- ZeroMQ handles, filled in later --
    self.context = None
    self.zmq = None
    self.zmq_poller = None
    self.associated = []

    # -- life cycle flags --
    self.nosignals = False
    self.running = False
    self.alive = False
    self.freeform = False
    self._state = -1

    # -- heartbeat bookkeeping --
    self.ctime = 0
    self.tic = time.time()
    self.tracked = set()
    self.finished = set()
    self.responses = set()
    self.missed_beats = Counter()

    # start with an empty topology
    self.topology = set()

    self.send_sighup = send_sighup
    if send_sighup:
        log.info("Request to send sighup/sigint")
|
|
|
|
|
|
def init_zmq(self):
    """
    Attach the zmq module, a fresh ``Context`` and the ``Poller`` class
    (not an instance) to this monitor.
    """
    zmq_module = zmq
    self.zmq = zmq_module
    self.context = zmq_module.Context()
    self.zmq_poller = zmq_module.Poller
|
|
|
|
def add_to_topology(self, component_id):
    """
    Add ``component_id`` and its ``"FORK-"``-prefixed counterpart to the
    (still mutable) topology.
    """
    fork_id = "FORK-" + component_id
    self.topology.update((component_id, fork_id))
|
|
|
|
def freeze_topology(self):
    """
    Hand the incrementally built topology to ``manage`` so it becomes
    immutable. Does nothing when the topology is already a frozenset.
    """
    already_frozen = isinstance(self.topology, frozenset)
    if not already_frozen:
        # we've been incrementally adding components; time to freeze.
        self.manage(self.topology)
|
|
|
|
def manage(self, topology):
    """
    Give the controller the set of components to manage and mark the
    monitor as alive.

    :param topology: an iterable of component ids, or the string
        ``'freeform'``. A freeform topology is where we heartbeat with
        anything that shows up; it is stored as an empty frozenset.
    """
    freeform = bool(topology == 'freeform')
    self.freeform = freeform
    self.topology = frozenset([]) if freeform else frozenset(topology)
    self.alive = True
|
|
|
|
@property
def state(self):
    """Current controller state, as stored in ``self._state``."""
    return self._state

@state.setter
def state(self, new):
    """
    Move to state ``new`` if ``(current, new)`` is a member of
    CONTROLLER_TRANSITIONS and log the transition.

    :raises RuntimeError: when the transition is not allowed; the
        stored state is left untouched in that case.
    """
    old = self._state

    if (old, new) not in CONTROLLER_TRANSITIONS:
        raise RuntimeError("Invalid State Transition : %s -> %s" %(old, new))

    self._state = new
    log.info("State Transition : %s -> %s" % (old, self._state))
|
|
|
|
def run(self):
    """
    Freeze the topology, initialise ZeroMQ, enter the INIT state and run
    the heartbeat loop.

    Blocks until ``_poll`` returns. A ``KeyboardInterrupt`` raised inside
    the loop is logged and swallowed.

    :returns: whatever ``_poll`` returns, or ``None`` after a
        ``KeyboardInterrupt``.
    """
    # Function-scope import keeps this block self-contained.
    import atexit

    self.freeze_topology()
    self.running = True
    self.init_zmq()
    setproctitle('Monitor')

    self.state = CONTROL_STATES.INIT

    # TODO: keep the exit hook? the corresponding override on clean
    # exit is commented out currently.
    #
    # Interpreter SIDE EFFECT
    # -----------------------
    # The last breath of the interpreter will assume that we've
    # failed unless we specify otherwise.
    #
    # ``atexit.register`` is used instead of assigning ``sys.exitfunc``:
    # that attribute has been deprecated since Python 2.4, is ignored by
    # Python 3, and assigning it replaces every handler other modules
    # registered through ``atexit``.
    log.info('registering exit function')
    atexit.register(self.signal_interrupt)

    try:
        return self._poll()  # use a python loop
    except KeyboardInterrupt:
        log.info('Shutdown event loop')
|
|
|
|
def log_status(self):
    """
    Snapshot of the tracked components at every period.

    Currently a no-op: the logging of ``self.tracked`` is disabled.
    """
    return None
|
|
|
|
def replay_errors(self):
    """
    Replay the errors in the order they were reported to the
    controller.

    Errors are read from an optional ``error_log`` mapping on the
    instance. Looking the keys up on ``self.replay_errors`` (this very
    method) can never work, because a bound method has no ``keys()``.

    NOTE(review): nothing visible in this class populates ``error_log``;
    confirm the intended storage attribute with the owner.

    :returns: the sorted keys of ``self.error_log``, or an empty list
        when no errors have been recorded.
    """
    recorded = getattr(self, 'error_log', None)
    if not recorded:
        return []
    return sorted(recorded)
|
|
|
|
# -------------
|
|
# Publications
|
|
# -------------
|
|
|
|
def send_go(self):
    """Publish a GO control frame with an empty payload on ``self.pub``."""
    frame = CONTROL_FRAME(CONTROL_PROTOCOL.GO, '')
    self.pub.send(frame)
|
|
|
|
def send_heart(self):
    """
    Publish a HEARTBEAT control frame carrying ``str(self.ctime)``.
    Does nothing unless the monitor is running.
    """
    if self.running:
        frame = CONTROL_FRAME(CONTROL_PROTOCOL.HEARTBEAT, str(self.ctime))
        self.pub.send(frame)
|
|
|
|
def send_hardkill(self):
    """
    Publish a KILL control frame with an empty payload.
    Does nothing unless the monitor is running.
    """
    if self.running:
        frame = CONTROL_FRAME(CONTROL_PROTOCOL.KILL, '')
        self.pub.send(frame)
|
|
|
|
def send_softkill(self):
    """
    Publish a SHUTDOWN control frame with an empty payload.
    Does nothing unless the monitor is running.
    """
    if self.running:
        frame = CONTROL_FRAME(CONTROL_PROTOCOL.SHUTDOWN, '')
        self.pub.send(frame)
|
|
|
|
# -----------
|
|
# Event Loops
|
|
# -----------
|
|
|
|
def _poll(self):
    """
    Create the control sockets and run the heartbeat loop until the
    monitor is no longer alive.

    Each outer iteration is one heartbeat generation: broadcast a
    heartbeat, collect responses from the ROUTER socket until the period
    elapses (or an early-exit condition holds), then let ``beat`` judge
    the responses and decide whether to send GO, shut down, or continue.

    :raises AssertionError: when a socket address or the topology is
        missing.
    """
    assert self.route_socket
    assert self.pub_socket

    # NOTE(review): manage('freeform') stores an empty topology, so this
    # assertion also rejects freeform mode -- confirm that is intended.
    assert self.topology, (
        "Must define topology to monitor, call setup_controller() on "
        "your Zipline."
    )

    # -- Publish --
    # =============
    self.pub = self.context.socket(self.zmq.PUB)
    self.pub.bind(self.pub_socket)
    self.pub.setsockopt(zmq.LINGER, 0)

    # -- Router --
    # =============
    self.router = self.context.socket(self.zmq.ROUTER)
    self.router.bind(self.route_socket)
    self.router.setsockopt(zmq.LINGER, 0)

    # -- Exception Out --
    # ===================
    self.ex_out = self.context.socket(self.zmq.PUSH)
    self.ex_out.connect(self.exception_socket)

    poller = self.zmq.Poller()
    poller.register(self.router, self.zmq.POLLIN)

    self.associated += [self.pub, self.router]

    # TODO: actually do this
    self.state = CONTROL_STATES.SOURCES_READY
    self.state = CONTROL_STATES.RUNNING

    # Parts of the multipart message currently being assembled.
    frames = []

    # ===================
    # Heartbeat Iteration
    # ===================

    while True:
        self.log_status()

        # Reset the responses for this cycle
        self.responses = set()

        # broadcast the heartbeat packet
        self.ctime = time.time()
        self.send_heart()

        # ===============
        # Heartbeat Cycle
        # ===============

        initializing = not self.tracked and not self.finished

        # Wait for the responses. The poll is non-blocking, so this loop
        # spins until one of the exit conditions below holds.
        while self.alive:

            socks = dict(poller.poll(0))
            tic = time.time()

            if socks.get(self.router) == self.zmq.POLLIN:
                rawmessage = self.router.recv()

                if rawmessage:
                    frames.append(rawmessage)

                    if not self.router.getsockopt(self.zmq.RCVMORE):
                        # Last part of the message: detach it from the
                        # accumulator *before* handling it, so that an
                        # invalid frame cannot leak into the next message.
                        message, frames = frames, []
                        try:
                            self.handle_recv(message)
                        except INVALID_CONTROL_FRAME:
                            log.error('Invalid frame: %r' % (message,))

            # We break out of this loop if the time between
            # sending and receiving the heartbeat is more
            # than our poll period.
            if tic - self.ctime > self.period:
                log.info("heartbeat loop timedout: %s" % (tic - self.ctime))
                log.info(repr(self.responses))
                break

            # if this is the first time heartbeating, break
            # out early if we get everything tracked; no need
            # to hold out for the full heartbeat.
            if initializing and not self.freeform:
                if len(self.responses) == len(self.topology):
                    log.info("breaking out of initial heartbeat")
                    break

            # Break out if the entire topology told us it is DONE
            if len(self.finished) == len(self.topology):
                break

        # ================
        # Heartbeat Stats
        # ================

        # beat() works purely through side effects; it returns nothing.
        self.beat()

        # ================
        # Topology Status
        # ================

        # Has the entire topology told us it is DONE
        done = len(self.finished) == len(self.topology)

        # Has the entire topology shown up to the party
        complete = len(self.tracked) == len(self.topology)

        if complete:
            self.send_go()

        log.info('Heartbeat (%s, %s)' % (done, complete))

        # ================
        # Exit Strategies
        # ================

        # Will also fall out of loop when done, if using
        # non-freeform topology
        if done:
            log.info('Entire topology exited cleanly')
            self.shutdown()

            # Noop exit func
            #sys.exitfunc = lambda: None

            # Send SIGHUP to burrito
            self.signal_hangup()

        if not self.alive:
            log.info('Breaking out of Monitor Loop')
            break
|
|
|
|
def signal_hangup(self):
    """
    A clean exit: inform the burrito ( and arbiter ) that we're good
    by sending SIGHUP to the parent process. Only logs a warning when
    ``send_sighup`` is false.
    """
    if self.send_sighup:
        parent_pid = os.getppid()
        log.warning("Sending SIGHUP")
        os.kill(parent_pid, SIGHUP)
    else:
        log.warning("Skipping SIGHUP")
|
|
|
|
def signal_interrupt(self):
    """
    Send a SIGINT in the error mode that the monitor's interpreter
    exits. If the monitor dies the system is considered a failure.
    Only logs a warning when ``send_sighup`` is false.
    """
    if self.send_sighup:
        # NOTE(review): this targets the monitor's *own* pid
        # (os.getpid), whereas signal_hangup targets the parent via
        # os.getppid -- confirm which process is meant to be signalled.
        target_pid = os.getpid()
        log.warning("Sending SIGINT")
        os.kill(target_pid, SIGINT)
    else:
        log.warning("Skipping SIGINT")
|
|
|
|
def beat(self):
    """
    The tracking logic of the system: the "stethoscope" that inspects
    the heartbeats of one generation and infers the state of the system
    from the responses. Works through side effects and returns nothing.

    The responses are partitioned with set algebra:

    * good - tracked components that answered this generation.
    * bad  - tracked components that did not answer and are not
      finished; each is passed to ``timed_out``.
    * new  - responders that are neither tracked nor finished; each is
      passed to ``new``.
    """
    answered = self.responses
    good = self.tracked & answered
    bad = self.tracked - good - self.finished
    fresh = answered - good - self.finished

    for component in fresh:
        self.new(component)

    for component in bad:
        self.timed_out(component)

    # Expected components that are neither tracked nor finished.
    if self.debug:
        for component in self.topology - self.tracked - self.finished:
            log.info('Missing component %r' % component)

    # Tracked components that the topology does not list.
    uninvited = [c for c in self.tracked if c not in self.topology]
    for component in uninvited:
        log.info('Uninvited component %r' % component)
|
|
|
|
# --------------
|
|
# Init Handlers
|
|
# --------------
|
|
|
|
def new_universal(self):
    """Default init handler invoked by ``new``; intentionally a no-op."""
    return None
|
|
|
|
# The various "states of being" that a component can inform us
# of
|
|
def new(self, component):
    """
    Start tracking a component that answered for the first time.

    Ignored once the controller is in the TERMINATE state or when the
    component has already finished.

    :raises UnknownChatter: when the component is not part of the
        topology and the monitor is not in freeform mode.
    """
    if self.state is CONTROL_STATES.TERMINATE or component in self.finished:
        return

    log.info('Now Tracking "%s" ' % component)

    # The component is known not to be finished at this point, so plain
    # topology membership is all that is left to check.
    expected = component in self.topology
    if not (expected or self.freeform):
        # Some sort of socket collision has occurred, this is
        # a very bad failure mode.
        raise UnknownChatter(component)

    # No per-component init handlers are registered, so every component
    # gets the universal one.
    self.new_universal()
    self.tracked.add(component)
|
|
|
|
# ------------------
|
|
# Epic Fail Handling
|
|
# ------------------
|
|
|
|
def timed_out(self, component):
    """
    React to a tracked component that missed a heartbeat.

    We treat a time out as a severe failure and conduct a rapid
    shutdown via ``kill``. Ignored once the controller is in the
    TERMINATE state, and for components that are neither in the
    unfinished topology nor covered by freeform mode.
    """
    if self.state is CONTROL_STATES.TERMINATE:
        return

    still_expected = component in (self.topology - self.finished)
    if not (still_expected or self.freeform):
        return

    log.warning('Component "%s" missed heartbeat' % component)
    self.kill()
|
|
|
|
|
|
# -------------------
|
|
# Completion Handling
|
|
# -------------------
|
|
|
|
def done(self, component):
    """Move ``component`` from the tracked set to the finished set."""
    self.tracked.discard(component)
    self.finished.add(component)
    log.info('Component "%s" finished.' % component)
|
|
|
|
# --------------
|
|
# Error Handling
|
|
# --------------
|
|
def exception(self, component, exception_data):
    """
    Handle a component reporting that it failed.

    Logs the failure, pushes ``exception_data`` unchanged to the
    exception socket, then conducts a rapid shutdown: an exception in
    one component is treated as a hard failure.
    """
    report = 'Component in exception state: %s. Shutting down system and sending exception data to listeners.' % component
    log.error(report)

    # Send the exception message out to listeners.
    self.ex_out.send(exception_data)

    self.kill()
|
|
|
|
# -----------------
|
|
# Protocol Handling
|
|
# -----------------
|
|
|
|
def handle_recv(self, msg):
    """
    Check for proper framing at the transport layer and dispatch on the
    control event. Separates the proper frames from anything else that
    might be coming over the wire.

    :param msg: list of message parts; ``msg[0]`` is the identity of the
        sending socket and ``msg[1]`` the control frame.
    :raises: whatever ``CONTROL_UNFRAME`` raises for a frame it cannot
        decode.
    """
    if len(msg) < 2:
        # An identity without a control frame cannot be unframed.
        log.error("Truncated control message: %r" % (msg,))
        return

    identity = msg[0]  # identity of the socket
    # Compare by equality below: the event comes off the wire, so
    # identity (``is``) comparison would only work by accident.
    event, status = CONTROL_UNFRAME(msg[1])

    # I'm alive, condemned to be a free process in the cold
    # cold dark absurd Zipline universe.
    if event == CONTROL_PROTOCOL.READY:
        self.responses.add(identity)
        return

    # The heartbeat love song between a component and the
    # controller
    if event == CONTROL_PROTOCOL.OK:
        if status == str(self.ctime):
            # Exact echo of the heartbeat we just sent.
            self.responses.add(identity)
            return

        try:
            echoed = float(status)
        except (TypeError, ValueError):
            # Not a timestamp at all; handled as a weird packet below.
            echoed = None

        if echoed is not None and echoed < self.ctime:
            # Answer to an earlier generation's heartbeat.
            log.warning('Delayed heartbeat received: %s' % (msg,))
        elif echoed is not None and echoed > self.ctime:
            # Pre-emptive heartbeat from the component
            self.responses.add(identity)
        else:
            # Otherwise it is something weird and we don't know
            # what to do so just say so, probably line noise
            # from ZeroMQ
            log.error("Weird heartbeat packet happened: %s" % (msg,))
        return

    # A component is telling us it failed, and how
    if event == CONTROL_PROTOCOL.EXCEPTION:
        # status should be a msgpack emitted from
        # EXCEPTION_FRAME
        try:
            self.exception(identity, status)
        except Exception:
            # if an exception occurs when we try to handle
            # the exception, signal the parent that we need
            # to go down. KeyboardInterrupt / SystemExit are
            # deliberately left to propagate.
            # TODO: should we attempt to call self.exception?
            log.exception("Unexpected exception sending exception data")
            self.kill()
        return

    # A component is telling us it is done with work and won't
    # be talking to us anymore
    if event == CONTROL_PROTOCOL.DONE:
        self.done(identity)
        return

    # Any other event type is ignored.
|
|
|
|
# -------------------
|
|
# Hooks for Endpoints
|
|
# -------------------
|
|
|
|
# These are all connects so no complex allocation logic is
|
|
# needed. Dealers and Subscribers can all come and go as a
|
|
# function of time without impacting flow of the whole
|
|
# system.
|
|
|
|
def message_sender(self, identity, context = None):
    """
    Spin off a socket used for sending messages to this
    controller.

    :param identity: value set as the socket's ``zmq.IDENTITY``.
    :param context: zmq context to create the socket in; defaults to
        the process-wide ``zmq.Context.instance()``.
    :returns: a DEALER socket connected to ``self.route_socket``. It is
        also appended to ``self.associated``.
    """
    if not context:
        # Use the module directly: ``self.zmq`` is None until init_zmq()
        # has run, and this hook may be called before then.
        context = zmq.Context.instance()

    s = context.socket(zmq.DEALER)
    s.setsockopt(zmq.IDENTITY, identity)
    s.connect(self.route_socket)

    self.associated.append(s)
    return s
|
|
|
|
def message_listener(self, context = None):
    """
    Spin off a socket used for receiving messages from this
    controller.

    :param context: zmq context to create the socket in; defaults to
        the process-wide ``zmq.Context.instance()``.
    :returns: a SUB socket connected to ``self.pub_socket`` and
        subscribed to every message. It is also appended to
        ``self.associated``.
    """
    if not context:
        # Use the module directly: ``self.zmq`` is None until init_zmq()
        # has run, and this hook may be called before then.
        context = zmq.Context.instance()

    s = context.socket(zmq.SUB)
    s.connect(self.pub_socket)
    # Empty prefix subscribes to everything; a bytes literal is accepted
    # on both Python 2 and Python 3.
    s.setsockopt(zmq.SUBSCRIBE, b'')

    self.associated.append(s)
    return s
|
|
|
|
def kill(self):
    """
    Aggressively exit the whole zipline.

    Publishes a KILL frame, moves to the TERMINATE state, marks the
    monitor as not alive, waits three seconds and then calls
    ``signal_interrupt``. A no-op when already in TERMINATE.
    """
    if self.state is not CONTROL_STATES.TERMINATE:
        log.info('Hard Shutdown')
        self.send_hardkill()
        self.state = CONTROL_STATES.TERMINATE
        self.alive = False
        # send burrito an interrupt, instructing it to kill all
        # child processes associated with this zipline.
        time.sleep(3)
        self.signal_interrupt()
|
|
|
|
def shutdown(self):
    """
    Gracefully stop the system: publish a SHUTDOWN frame, move to the
    TERMINATE state and mark the monitor as not alive. A no-op when
    already in TERMINATE.
    """
    if self.state is not CONTROL_STATES.TERMINATE:
        log.info('Soft Shutdown')
        self.send_softkill()
        self.state = CONTROL_STATES.TERMINATE
        self.alive = False
|