Files
meow_base/core/base_conductor.py
2023-04-22 21:48:33 +02:00

230 lines
9.4 KiB
Python

"""
This file contains the base MEOW conductor definition. This should be inherited
from for all conductor instances.
Author(s): David Marchant
"""
import shutil
import subprocess
import os
from datetime import datetime
from threading import Event, Thread
from time import sleep
from typing import Any, Tuple, Dict, Union
from meow_base.core.meow import valid_job
from meow_base.core.vars import VALID_CONDUCTOR_NAME_CHARS, VALID_CHANNELS, \
JOB_STATUS, JOB_START_TIME, META_FILE, STATUS_RUNNING, STATUS_DONE , \
BACKUP_JOB_ERROR_FILE, JOB_END_TIME, STATUS_FAILED, JOB_ERROR, \
get_drt_imp_msg
from meow_base.functionality.file_io import write_file, \
threadsafe_read_status, threadsafe_update_status
from meow_base.functionality.validation import check_implementation, \
valid_string, valid_existing_dir_path, valid_natural, valid_dir_path
from meow_base.functionality.naming import generate_conductor_id
class BaseConductor:
    """Base class for all MEOW conductors. It cannot be instantiated directly
    and must be inherited from. A conductor repeatedly asks the runner for a
    job directory over the 'to_runner_job' channel and, when given one,
    executes the job it contains."""

    # An identifier for a conductor within the runner. Can be manually set in
    # the constructor, or autogenerated if no name provided.
    name:str
    # A channel for sending messages to the runner job queue. Note that this
    # will be overridden by a MeowRunner, if a conductor instance is passed to
    # it, and so does not need to be initialised within the conductor itself,
    # unless the conductor is running independently of a runner.
    to_runner_job: VALID_CHANNELS
    # Directory where queued jobs are initially written to. Note that this
    # will be overridden by a MeowRunner, if a conductor instance is passed to
    # it, and so does not need to be initialised within the conductor itself.
    job_queue_dir:str
    # Directory where completed jobs are finally written to. Note that this
    # will be overridden by a MeowRunner, if a conductor instance is passed to
    # it, and so does not need to be initialised within the conductor itself.
    job_output_dir:str
    # A count, for how long a conductor will wait if told that there are no
    # jobs in the runner, before polling again. Default is 5 seconds.
    pause_time: int

    def __init__(self, name:str="", pause_time:int=5)->None:
        """BaseConductor Constructor. This will check that any class inheriting
        from it implements its validation functions. If no name is given then
        one is generated. Both 'name' and 'pause_time' are validated before
        being stored, so any validation error is raised from here."""
        check_implementation(type(self).valid_execute_criteria, BaseConductor)
        if not name:
            name = generate_conductor_id()
        self._is_valid_name(name)
        self.name = name
        self._is_valid_pause_time(pause_time)
        self.pause_time = pause_time

    def __new__(cls, *args, **kwargs):
        """A check that this base class is not instantiated itself, only
        inherited from. Raises a TypeError if BaseConductor is instantiated
        directly."""
        if cls is BaseConductor:
            msg = get_drt_imp_msg(BaseConductor)
            raise TypeError(msg)
        return object.__new__(cls)

    def _is_valid_name(self, name:str)->None:
        """Validation check for 'name' variable from main constructor. Is
        automatically called during initialisation. This does not need to be
        overridden by child classes."""
        valid_string(name, VALID_CONDUCTOR_NAME_CHARS)

    def _is_valid_pause_time(self, pause_time:int)->None:
        """Validation check for 'pause_time' variable from main constructor. Is
        automatically called during initialisation. This does not need to be
        overridden by child classes."""
        valid_natural(pause_time, hint="BaseConductor.pause_time")

    def prompt_runner_for_job(self)->Union[Dict[str,Any],Any]:
        """Function to ask the runner for a job. A request is sent down the
        'to_runner_job' channel, and the channel is then polled for up to
        'pause_time' seconds. Returns whatever the runner replied with, or
        None if no reply arrived within that time."""
        self.to_runner_job.send(1)

        if self.to_runner_job.poll(self.pause_time):
            return self.to_runner_job.recv()
        return None

    def start(self)->None:
        """Function to start the conductor as an ongoing thread, as defined by
        the main_loop function. Together, these will execute any code in a
        implemented conductors execute function sequentially, but concurrently
        to any other conductors running or other runner operations. This is
        intended as a naive multiprocessing implementation, and any more in
        depth parallelisation of execution must be implemented by a user by
        overriding this function, and the stop function."""
        self._stop_event = Event()
        self._handle_thread = Thread(
            target=self.main_loop,
            args=(self._stop_event,),
            daemon=True,
            name="conductor_thread"
        )
        self._handle_thread.start()

    def stop(self)->None:
        """Function to stop the conductor as an ongoing thread. May be
        overridden by any child class. This function should also be overridden
        if the start function has been. Blocks until the thread started by
        start has finished."""
        self._stop_event.set()
        self._handle_thread.join()

    def main_loop(self, stop_event)->None:
        """Function defining an ongoing thread, as started by the start
        function and stopped by the stop function. Loops until 'stop_event' is
        set, prompting the runner for a job on each pass and executing any job
        directory it is given."""
        while not stop_event.is_set():
            reply = self.prompt_runner_for_job()

            # If we have received 'None' then we have already timed out so
            # skip this loop and start again
            if reply is None:
                continue

            try:
                valid_existing_dir_path(reply)
            except Exception:
                # Were not given a job dir, so sleep before trying again.
                # There is nothing to execute so do not fall through.
                sleep(self.pause_time)
                continue

            try:
                self.execute(reply)
            except Exception:
                # Deliberately swallowed so that one failing job does not
                # kill the conductor thread.
                # TODO some error reporting here
                pass

    def valid_execute_criteria(self, job:Dict[str,Any])->Tuple[bool,str]:
        """Function to determine given an job definition, if this conductor
        can process it or not. Must be implemented by any child class."""
        pass

    def run_job(self, job_dir:str)->None:
        """Function to actually execute a job. This will read job
        definitions from its meta file, update the meta file and attempt to
        execute. Some unspecific feedback will be given on execution failure,
        but depending on what it is it may be up to the job itself to provide
        more detailed feedback. If you simply wish to alter the conditions
        under which the job is executed, please instead look at the execute
        function.

        Whatever the outcome of the execution, the job directory is finally
        moved into 'job_output_dir'."""
        valid_dir_path(job_dir, must_exist=True)

        # Test our job parameters. Even if its gibberish, we still move to
        # output
        abort = False
        try:
            meta_file = os.path.join(job_dir, META_FILE)
            job = threadsafe_read_status(meta_file)
            valid_job(job)

            # update the status file with running status
            threadsafe_update_status(
                {
                    JOB_STATUS: STATUS_RUNNING,
                    JOB_START_TIME: datetime.now()
                },
                meta_file
            )

        except Exception as e:
            # If something has gone wrong at this stage then its bad, so we
            # need to make our own error file
            error_file = os.path.join(job_dir, BACKUP_JOB_ERROR_FILE)
            write_file(f"Recieved incorrectly setup job.\n\n{e}", error_file)
            abort = True

        # execute the job
        if not abort:
            try:
                # The command is given as a single string with no shell, so it
                # is treated as the path of the program to run.
                result = subprocess.call(
                    os.path.join(job_dir, job["tmp script command"]),
                    cwd="."
                )

                if result == 0:
                    # Update the status file with the finalised status
                    threadsafe_update_status(
                        {
                            JOB_STATUS: STATUS_DONE,
                            JOB_END_TIME: datetime.now()
                        },
                        meta_file
                    )
                else:
                    # Update the status file with the error status.
                    # NOTE(review): the intent is not to overwrite any more
                    # specific error message already created; whether that
                    # holds depends on threadsafe_update_status - confirm.
                    threadsafe_update_status(
                        {
                            JOB_STATUS: STATUS_FAILED,
                            JOB_END_TIME: datetime.now(),
                            JOB_ERROR: "Job execution returned non-zero."
                        },
                        meta_file
                    )

            except Exception as e:
                # Update the status file with the error status.
                # NOTE(review): as above, preserving an existing more specific
                # error message relies on threadsafe_update_status - confirm.
                threadsafe_update_status(
                    {
                        JOB_STATUS: STATUS_FAILED,
                        JOB_END_TIME: datetime.now(),
                        JOB_ERROR: f"Job execution failed. {e}"
                    },
                    meta_file
                )

        # Move the contents of the execution directory to the final output
        # directory.
        job_output_dir = \
            os.path.join(self.job_output_dir, os.path.basename(job_dir))
        shutil.move(job_dir, job_output_dir)

    def execute(self, job_dir:str)->None:
        """Function to run job execution. By default this will simply call the
        run_job function, to execute the job locally. However, this function
        may be overridden to execute the job in some other manner, such as on
        another resource. Note that the job itself should be executed using the
        run_job func in order to maintain expected logging etc."""
        self.run_job(job_dir)