meow_base/core/base_conductor.py


"""
This file contains the base MEOW conductor definition. All conductors should
inherit from this class.

Author(s): David Marchant
"""
import shutil
import subprocess
import os

from datetime import datetime
from threading import Event, Thread
from time import sleep
from typing import Any, Tuple, Dict, Union


from meow_base.core.meow import valid_job
from meow_base.core.vars import VALID_CONDUCTOR_NAME_CHARS, VALID_CHANNELS, \
    JOB_STATUS, JOB_START_TIME, META_FILE, STATUS_RUNNING, STATUS_DONE , \
    BACKUP_JOB_ERROR_FILE, JOB_END_TIME, STATUS_FAILED, JOB_ERROR, \
    get_drt_imp_msg
from meow_base.functionality.file_io import write_file, \
    threadsafe_read_status, threadsafe_update_status
from meow_base.functionality.validation import check_implementation, \
    valid_string, valid_existing_dir_path, valid_natural, valid_dir_path
from meow_base.functionality.naming import generate_conductor_id


class BaseConductor:
    """Base class for all MEOW conductors. A conductor repeatedly prompts the
    runner for a job directory over its 'to_runner_job' channel and executes
    whatever job it is given. This class cannot be instantiated directly, and
    any child class must provide its own 'valid_execute_criteria'."""
    # An identifier for a conductor within the runner. Can be manually set in
    # the constructor, or autogenerated if no name provided.
    name:str
    # A channel for sending messages to the runner job queue. Note that this
    # will be overridden by a MeowRunner, if a conductor instance is passed to
    # it, and so does not need to be initialised within the conductor itself,
    # unless the conductor is running independently of a runner.
    to_runner_job: VALID_CHANNELS
    # Directory where queued jobs are initially written to. Note that this
    # will be overridden by a MeowRunner, if a conductor instance is passed to
    # it, and so does not need to be initialised within the conductor itself.
    job_queue_dir:str
    # Directory where completed jobs are finally written to. Note that this
    # will be overridden by a MeowRunner, if a conductor instance is passed to
    # it, and so does not need to be initialised within the conductor itself.
    job_output_dir:str
    # A count, for how long a conductor will wait if told that there are no
    # jobs in the runner, before polling again. Default is 5 seconds.
    pause_time: int
    def __init__(self, name:str="", pause_time:int=5)->None:
        """BaseConductor Constructor. This will check that any class inheriting
        from it implements its validation functions. If no 'name' is provided
        then one is generated. Both 'name' and 'pause_time' are validated
        before they are stored on the instance."""
        check_implementation(type(self).valid_execute_criteria, BaseConductor)
        if not name:
            name = generate_conductor_id()
        self._is_valid_name(name)
        self.name = name
        self._is_valid_pause_time(pause_time)
        self.pause_time = pause_time

    def __new__(cls, *args, **kwargs):
        """A check that this base class is not instantiated itself, only
        inherited from. Raises a TypeError if BaseConductor is constructed
        directly."""
        if cls is BaseConductor:
            msg = get_drt_imp_msg(BaseConductor)
            raise TypeError(msg)
        return object.__new__(cls)

    def _is_valid_name(self, name:str)->None:
        """Validation check for 'name' variable from main constructor. Is
        automatically called during initialisation. This does not need to be
        overridden by child classes."""
        valid_string(name, VALID_CONDUCTOR_NAME_CHARS)

    def _is_valid_pause_time(self, pause_time:int)->None:
        """Validation check for 'pause_time' variable from main constructor. Is
        automatically called during initialisation. This does not need to be
        overridden by child classes."""
        # The hint identifies the offending variable in any validation error,
        # so it must name this class rather than the handler it was copied
        # from.
        valid_natural(pause_time, hint="BaseConductor.pause_time")

    def prompt_runner_for_job(self)->Union[Dict[str,Any],Any]:
        """Function to ask the runner for a job. A prompt is sent down the
        'to_runner_job' channel, and the channel is then polled for up to
        'pause_time' seconds. Returns whatever the runner replied with, or
        None if no reply arrived in that time."""
        self.to_runner_job.send(1)

        if self.to_runner_job.poll(self.pause_time):
            return self.to_runner_job.recv()
        return None

    def start(self)->None:
        """Function to start the conductor as an ongoing thread, as defined by
        the main_loop function. Together, these will execute any code in an
        implemented conductor's execute function sequentially, but concurrently
        to any other conductors running or other runner operations. This is
        intended as a naive multiprocessing implementation, and any more in
        depth parallelisation of execution must be implemented by a user by
        overriding this function, and the stop function."""
        self._stop_event = Event()
        self._handle_thread = Thread(
            target=self.main_loop,
            args=(self._stop_event,),
            daemon=True,
            name="conductor_thread"
        )
        self._handle_thread.start()

    def stop(self)->None:
        """Function to stop the conductor as an ongoing thread. May be
        overridden by any child class. This function should also be overridden
        if the start function has been. Blocks until the thread created by
        start has finished."""
        self._stop_event.set()
        self._handle_thread.join()

    def main_loop(self, stop_event)->None:
        """Function defining an ongoing thread, as started by the start
        function and stopped by the stop function. Loops until 'stop_event' is
        set, each time prompting the runner for a job. Replies that are not an
        existing directory are skipped after a pause of 'pause_time' seconds,
        and any other reply is passed to the execute function."""

        while not stop_event.is_set():
            reply = self.prompt_runner_for_job()

            # If we have received 'None' then we have already timed out so
            # skip this loop and start again
            if reply is None:
                continue

            try:
                valid_existing_dir_path(reply)
            except Exception:
                # Were not given a job dir, so pause before trying again.
                # Waiting on the stop event rather than sleeping means a call
                # to stop is not held up for the length of the pause.
                stop_event.wait(self.pause_time)
                # There is nothing to execute, so do not fall through to the
                # execute call below with a reply that is not a job dir.
                continue

            try:
                self.execute(reply)
            except Exception:
                # TODO some error reporting here
                pass

    def valid_execute_criteria(self, job:Dict[str,Any])->Tuple[bool,str]:
        """Function to determine given a job definition, if this conductor can
        process it or not. Must be implemented by any child class."""
        pass

    def run_job(self, job_dir:str)->None:
        """Function to actually execute a job. This will read job
        definitions from its meta file, update the meta file and attempt to
        execute. Some unspecific feedback will be given on execution failure,
        but depending on what it is it may be up to the job itself to provide
        more detailed feedback. If you simply wish to alter the conditions
        under which the job is executed, please instead look at the execute
        function.

        The job script is taken from the 'tmp script command' entry of the job
        definition, relative to 'job_dir'. A zero return code marks the job as
        done, anything else marks it as failed. In all cases 'job_dir' is
        finally moved into 'job_output_dir', keeping its directory name."""
        valid_dir_path(job_dir, must_exist=True)

        # Test our job parameters. Even if its gibberish, we still move to
        # output
        abort = False
        try:
            meta_file = os.path.join(job_dir, META_FILE)
            job = threadsafe_read_status(meta_file)
            valid_job(job)

            # update the status file with running status
            threadsafe_update_status(
                {
                    JOB_STATUS: STATUS_RUNNING,
                    JOB_START_TIME: datetime.now()
                },
                meta_file
            )

        except Exception as e:
            # If something has gone wrong at this stage then its bad, so we
            # need to make our own error file
            error_file = os.path.join(job_dir, BACKUP_JOB_ERROR_FILE)
            write_file(f"Recieved incorrectly setup job.\n\n{e}", error_file)
            abort = True

        # execute the job. This is skipped if the job definition could not be
        # read and validated above, as 'job' cannot then be relied on.
        if not abort:
            try:
                result = subprocess.call(
                    os.path.join(job_dir, job["tmp script command"]),
                    cwd="."
                )

                if result == 0:
                    # Update the status file with the finalised status
                    threadsafe_update_status(
                        {
                            JOB_STATUS: STATUS_DONE,
                            JOB_END_TIME: datetime.now()
                        },
                        meta_file
                    )

                else:
                    # Update the status file with the error status. Don't
                    # overwrite any more specific error messages already
                    # created
                    threadsafe_update_status(
                        {
                            JOB_STATUS: STATUS_FAILED,
                            JOB_END_TIME: datetime.now(),
                            JOB_ERROR: "Job execution returned non-zero."
                        },
                        meta_file
                    )

            except Exception as e:
                # Update the status file with the error status. Don't overwrite
                # any more specific error messages already created
                threadsafe_update_status(
                    {
                        JOB_STATUS: STATUS_FAILED,
                        JOB_END_TIME: datetime.now(),
                        JOB_ERROR: f"Job execution failed. {e}"
                    },
                    meta_file
                )

        # Move the contents of the execution directory to the final output
        # directory.
        job_output_dir = \
            os.path.join(self.job_output_dir, os.path.basename(job_dir))
        shutil.move(job_dir, job_output_dir)

    def execute(self, job_dir:str)->None:
        """Function to run job execution. By default this will simply call the
        run_job function, to execute the job locally. However, this function
        may be overridden to execute the job in some other manner, such as on
        another resource. Note that the job itself should be executed using the
        run_job func in order to maintain expected logging etc."""
        self.run_job(job_dir)