Source code for nemo_nowcast.manager

# Copyright 2016-2021 Doug Latornell, 43ravens

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#    http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""NEMO_Nowcast manager.
"""
import argparse
import importlib
import logging
import logging.config
import os
import pprint
import signal
import time

import attr
import requests
import sentry_sdk
import yaml
import zmq
import zmq.log.handlers

from nemo_nowcast import CommandLineInterface, Config, Message


[docs]def main(): """ Setup and run the nowcast system manager. See :command:`python -m nemo_nowcast.manager --help` for details of the command-line interface. """ mgr = NowcastManager() mgr.setup() mgr.run()
[docs]@attr.s class NowcastManager: """Construct a :py:class:`nemo_nowcast.manager.NowcastManager` instance. """ #: The name of the manager instance. #: Used in the nowcast messaging system and for logging. name = attr.ib(default="manager") #: :py:class:`nemo_nowcast.config.Config` object that holds #: the nowcast system configuration that is loaded from the configuration #: file in the :py:meth:`~nemo_nowcast.manager.NowcastManager.setup` method. config = attr.ib(default=attr.Factory(Config)) #: Logger for the manager. #: Configured from the :kbd:`logging` section of the configuration file #: in the :py:meth:`~nemo_nowcast.manager.NowcastManager.setup` method . logger = attr.ib(default=None) #: Nowcast system checklist: :py:class:`dict` containing the present #: state of the nowcast system. checklist = attr.ib(default=attr.Factory(dict)) #: :py:class:`argparse.Namespace` instance containing the arguments #: and option flags and values parsed from the command-line when the #: :py:meth:`~nemo_nowcast.manager.NowcastManager.setup` method is called. _parsed_args = attr.ib(default=None) #: The :kbd:`message registry` section of #: :py:attr:`~nemo_nowcast.manager.config`. _msg_registry = attr.ib(default=None) #: Worker race condition management data structure: :py:class:`dict` #: containing collections of workers that must all finish before #: another collection of workers are launched _race_condition_mgmt = attr.ib(default=attr.Factory(dict)) #: Name of the Python module that contains functions to calculate #: lists of workers to launch after previous workers end their work. #: Set from the :kbd:`message registry` section of #: :py:attr:`~nemo_nowcast.manager.config` in the #: py:meth:`~nemo_nowcast.manager.NowcastManager.setup` method. _next_workers_module = attr.ib(default=None) #: :py:class:`zmq.Context` instance that provides the basis for the #: nowcast messaging system. _context = attr.ib(default=attr.Factory(zmq.Context)) #: :py:class:`zmq.Context.socket` instance that is connected to the #: message broker to enable nowcast system messages to be exchanged #: with worker processes. #: Created when the #: py:meth:`~nemo_nowcast.manager.NowcastManager.run` method is called. _socket = attr.ib(default=None)
[docs] def setup(self): """Set up the nowcast system manager process including: * Building the command-line parser, and parsing the command-line used to launch the manager * Reading and parsing the configuration file given on the command-line * Configuring the logging system as specified in the configuration file * Logging the manager's PID, and the file path/name that was used to configure it. * Importing the :py:mod:`next_workers` module specified in the configuration file. The set-up is repeated if the manager process receives a HUP signal so that the configuration can be re-loaded without having to stop and re-start the manager. """ self._parsed_args = self._cli() self.config.load(self._parsed_args.config_file) self._msg_registry = self.config["message registry"] msg = self._configure_logging() self.logger.info(f"running in process {os.getpid()}") self.logger.info(f"read config from {self.config.file}") self.logger.info(msg) try: self._next_workers_module = importlib.import_module( self._msg_registry["next workers module"] ) except ImportError: self.logger.critical( f"could not find next workers module: {self._msg_registry['next workers module']}", exc_info=True, ) raise self.logger.info( f"next workers module loaded from {self._msg_registry['next workers module']}" )
def _cli(self, args=None): """Configure command-line argument parser and return parsed arguments object. """ cli = CommandLineInterface( self.name, package="nemo_nowcast", description=__doc__ ) cli.build_parser(add_help=False) parser = argparse.ArgumentParser( prog=cli.parser.prog, description=cli.parser.description, parents=[cli.parser], ) parser.add_argument( "--ignore-checklist", action="store_true", help=""" Don't load the serialized checklist left by a previously running manager instance. """, ) return parser.parse_args(args) def _configure_logging(self): """Configure the manager's logging system interface. """ # Initialize exception logging to Sentry with client DSN URL from SENTRY_DSN envvar; # does nothing if SENTRY_DSN does not exist, is empty, or is not recognized by Sentry sentry_sdk.init() self.logger = logging.getLogger(self.name) if "publisher" in self.config["logging"]: # Publish log messages to distributed logging aggregator logging_config = self.config["logging"]["publisher"] logging_config["handlers"]["zmq_pub"]["context"] = self._context port = self.config["zmq"]["ports"]["logging"][self.name] addr = f"tcp://*:{port}" logging_config["handlers"]["zmq_pub"]["interface_or_socket"] = addr logging.config.dictConfig(logging_config) for handler in self.logger.root.handlers: if isinstance(handler, zmq.log.handlers.PUBHandler): handler.root_topic = self.name handler.formatters = { logging.DEBUG: logging.Formatter("%(message)s\n"), logging.INFO: logging.Formatter("%(message)s\n"), logging.WARNING: logging.Formatter("%(message)s\n"), logging.ERROR: logging.Formatter("%(message)s\n"), logging.CRITICAL: logging.Formatter("%(message)s\n"), } # Not sure why, but we need a brief pause before we start logging # messages time.sleep(1) msg = f"publishing logging messages to {addr}" else: # Write log messages to local file system # # Replace logging RotatingFileHandlers with WatchedFileHandlers so # that we notice when log files are rotated and switch to writing # to the new ones logging_config = self.config["logging"] logging_handlers = logging_config["handlers"] rotating_handler = "logging.handlers.RotatingFileHandler" watched_handler = "logging.handlers.WatchedFileHandler" for handler in logging_handlers: if logging_handlers[handler]["class"] == rotating_handler: logging_handlers[handler]["class"] = watched_handler del logging_handlers[handler]["backupCount"] logging.config.dictConfig(logging_config) msg = "writing logging messages to local file system" return msg
[docs] def run(self): """Run the nowcast system manager: * Create the :py:class:`zmq.Context.socket` for communication with the worker processes and connect it to the message broker. * Install signal handlers for hangup, interrupt, and kill signals. * Launch the manager's message processing loop """ self._socket = self._context.socket(zmq.REP) zmq_host = self.config["zmq"]["host"] zmq_port = self.config["zmq"]["ports"]["manager"] self._socket.connect(f"tcp://{zmq_host}:{zmq_port}") self.logger.info(f"connected to {zmq_host} port {zmq_port}") self._install_signal_handlers(zmq_host, zmq_port) if not self._parsed_args.ignore_checklist: self._load_checklist() self._process_messages()
def _install_signal_handlers(self, zmq_host, zmq_port): """Set up hangup, interrupt, and kill signal handlers. """ def sighup_handler(signal, frame): self.logger.info("hangup signal (SIGHUP) received; reloading configuration") self._socket.disconnect(f"tcp://{zmq_host}:{zmq_port}") self.setup() self.run() signal.signal(signal.SIGHUP, sighup_handler) def sigint_handler(signal, frame): self.logger.info( "interrupt signal (SIGINT or Ctrl-C) received; shutting down" ) self._socket.close() raise SystemExit signal.signal(signal.SIGINT, sigint_handler) def sigterm_handler(signal, frame): self.logger.info("termination signal (SIGTERM) received; shutting down") self._socket.close() raise SystemExit signal.signal(signal.SIGTERM, sigterm_handler) def _load_checklist(self): """Load the serialized checklist left on disk by a previously running manager instance. """ checklist_file = self.config["checklist file"] try: with open(checklist_file, "rt") as f: self.checklist = yaml.safe_load(f) self.logger.info(f"checklist read from {checklist_file}") self.logger.info(f"checklist:\n{pprint.pformat(self.checklist)}") except FileNotFoundError: self.logger.warning("checklist load failed:", exc_info=True) self.logger.warning("running with empty checklist") def _process_messages(self): """Process messages from workers. """ while True: self.logger.debug("listening...") try: self._try_messages() except zmq.ZMQError as e: # Fatal ZeroMQ problem self.logger.critical("ZMQError:", exc_info=e) self.logger.critical("shutting down") break except SystemExit: # Termination by signal break except Exception as e: self.logger.critical("unhandled exception:", exc_info=e) self.logger.critical("shutting down") def _try_messages(self): """Try to process messages. Extracted from the :kbd:`try:` block in :py:meth:`_process_messages` so that it can be tested outside of the :kbd:`while True:` loop. """ message = self._socket.recv_string() reply, next_workers = self._message_handler(message) self._socket.send_string(reply) for worker in next_workers: worker.launch(self.config, self.name) def _message_handler(self, message): """Handle message from worker. """ msg = Message.deserialize(message) if msg.source not in self._msg_registry["workers"]: reply = self._handle_unregistered_worker_msg(msg) return reply, [] if msg.type not in self._msg_registry["workers"][msg.source]: reply = self._handle_unregistered_msg_type(msg) return reply, [] self._log_received_msg(msg) if msg.type == "clear checklist": reply = self._clear_checklist() return reply, [] if msg.type == "need": reply = self._handle_need_msg(msg) return reply, [] reply, next_workers = self._handle_continue_msg(msg) return reply, next_workers def _handle_unregistered_worker_msg(self, msg): """Emit error message to log about a message received from a worker that is not included in the message registry. """ self.logger.error( f"message received from unregistered worker: {msg.source}", extra={"worker_msg": msg}, ) reply = Message(self.name, "unregistered worker").serialize() return reply def _handle_unregistered_msg_type(self, msg): """Emit error message to log about a message type received from a worker that is not included in the message registry. """ self.logger.error( f"unregistered message type received from {msg.source} worker: {msg.type}", extra={"worker_msg": msg}, ) reply = Message(self.name, "unregistered message type").serialize() return reply def _log_received_msg(self, msg): """Emit debug message about message received from worker. """ msg_words = self._msg_registry["workers"][msg.source][msg.type] self.logger.debug( f"received message from {msg.source}: ({msg.type}) {msg_words}", extra={"worker_msg": msg}, ) def _handle_need_msg(self, msg): """Handle request for checklist section message from worker. """ reply = Message( self.name, "ack", payload=self.checklist[msg.payload] ).serialize() return reply def _handle_continue_msg(self, msg): """Handle success, failure, or crash message from worker by generating list of subsequent workers to launch. """ if msg.payload is not None: self._update_checklist(msg) self._slack_notification(msg) importlib.reload(self._next_workers_module) worker = msg.source try: after_func = getattr(self._next_workers_module, f"after_{worker}") except AttributeError: self.logger.critical( f"could not find after_{worker} in {self._msg_registry['next workers module']} module", exc_info=True, ) reply = Message(self.name, "no after_worker function").serialize() return reply, [] next_workers = after_func(msg, self.config, self.checklist) if len(next_workers) > 1 and isinstance(next_workers[-1], set): next_workers, self._race_condition_mgmt["must finish"] = next_workers self._race_condition_mgmt["then launch"] = [] self.logger.debug( f"race condition management activated: {self._race_condition_mgmt}" ) try: self._race_condition_mgmt["must finish"].remove(worker) self._race_condition_mgmt["then launch"].extend(next_workers) next_workers.clear() self.logger.debug( f"{worker} finished and race condition management updated: {self._race_condition_mgmt}" ) except (KeyError, ValueError): # No race condition management in effect, or worker not in "must finish" list pass try: if not self._race_condition_mgmt["must finish"]: next_workers = self._race_condition_mgmt["then launch"] self._race_condition_mgmt = {} self.logger.debug( f"race condition management ended; next workers released: {next_workers}" ) except KeyError: # No race condition management in effect pass reply = Message(self.name, "ack").serialize() return reply, next_workers def _update_checklist(self, msg): """Update the checklist value at worker's key with the items passed from the worker. If key is not present in the checklist, add it with the worker items as its value. Write the checklist to disk as a YAML file so that it can be inspected and/or recovered if the manager instance is restarted. """ try: key = self._msg_registry["workers"][msg.source]["checklist key"] except KeyError: raise KeyError(f"checklist key not found for {msg.source} worker") try: self.checklist[key].update(msg.payload) except (KeyError, AttributeError): self.checklist[key] = msg.payload self.logger.info( f"checklist updated with [{key}] items from {msg.source} worker", extra={"worker_msg": msg}, ) self._write_checklist_to_disk() def _write_checklist_to_disk(self): """Write the checklist to disk as a YAML file so that it can be inspected and/or recovered if the manager instance is restarted. """ with open(self.config["checklist file"], "wt") as f: yaml.dump(self.checklist, f) def _slack_notification(self, msg): try: slack_notifications = self.config["slack notifications"] except KeyError: # No slack notification section in config, and that's okay! return slack_msg = {"text": f"{msg.source}: {msg.type}"} try: slack_msg["text"] = "\n".join( (slack_msg["text"], f"Log: {slack_notifications['website log url']}") ) except KeyError: # Not everyone publishes their logs and/or checklist to the web... pass try: slack_msg["text"] = "\n".join( ( slack_msg["text"], f"Checklist: {slack_notifications['website checklist url']}", ) ) except KeyError: # Not everyone publishes their logs and/or checklist to the web... pass for key, workers in slack_notifications.items(): if not key.startswith("SLACK"): continue slack_url_envvar = key try: slack_url = os.environ[slack_url_envvar] except KeyError: # No value found in environment self.logger.debug( f"slack notification environment variable not found: {slack_url_envvar}" ) continue if msg.source in workers: requests.post(slack_url, json=slack_msg) def _clear_checklist(self): """Write the checklist to a log file, then clear it. This method is called in response to a "clear checklist" message from the :py:mod:`nemo_nowcast.workers.clear_checklist` worker. That worker is typically run once per nowcast cycle (e.g. daily) at the end of processing, just before rotating the log files via the :py:mod:`nemo_nowcast.workers.rotate_logs` worker. """ checklist_logger = logging.getLogger("checklist") if checklist_logger.handlers: self.logger.info("writing checklist to log file") for handler in checklist_logger.handlers: checklist_logger.log( handler.level, f"checklist:\n{pprint.pformat(self.checklist)}" ) handler.close() self.checklist.clear() self._write_checklist_to_disk() self.logger.info("checklist cleared") reply = Message(self.name, "checklist cleared").serialize() return reply
if __name__ == "__main__": main() # pragma: no cover