230 lines
9.4 KiB
Python
230 lines
9.4 KiB
Python
|
|
"""
|
|
This file contains the base MEOW conductor defintion. This should be inherited
|
|
from for all conductor instances.
|
|
|
|
Author(s): David Marchant
|
|
"""
|
|
import shutil
|
|
import subprocess
|
|
import os
|
|
|
|
from datetime import datetime
|
|
from threading import Event, Thread
|
|
from time import sleep
|
|
from typing import Any, Tuple, Dict, Union
|
|
|
|
|
|
from meow_base.core.meow import valid_job
|
|
from meow_base.core.vars import VALID_CONDUCTOR_NAME_CHARS, VALID_CHANNELS, \
|
|
JOB_STATUS, JOB_START_TIME, META_FILE, STATUS_RUNNING, STATUS_DONE , \
|
|
BACKUP_JOB_ERROR_FILE, JOB_END_TIME, STATUS_FAILED, JOB_ERROR, \
|
|
get_drt_imp_msg
|
|
from meow_base.functionality.file_io import write_file, \
|
|
threadsafe_read_status, threadsafe_update_status
|
|
from meow_base.functionality.validation import check_implementation, \
|
|
valid_string, valid_existing_dir_path, valid_natural, valid_dir_path
|
|
from meow_base.functionality.naming import generate_conductor_id
|
|
|
|
|
|
class BaseConductor:
    # An identifier for a conductor within the runner. Can be manually set in
    # the constructor, or autogenerated if no name provided.
    name:str
    # A channel for sending messages to the runner job queue. Note that this
    # will be overridden by a MeowRunner, if a conductor instance is passed to
    # it, and so does not need to be initialised within the conductor itself,
    # unless the conductor is running independently of a runner.
    to_runner_job: VALID_CHANNELS
    # Directory where queued jobs are initially written to. Note that this
    # will be overridden by a MeowRunner, if a conductor instance is passed to
    # it, and so does not need to be initialised within the conductor itself.
    job_queue_dir:str
    # Directory where completed jobs are finally written to. Note that this
    # will be overridden by a MeowRunner, if a conductor instance is passed to
    # it, and so does not need to be initialised within the conductor itself.
    job_output_dir:str
    # A count, for how long a conductor will wait if told that there are no
    # jobs in the runner, before polling again. Default is 5 seconds.
    pause_time: int

    def __init__(self, name:str="", pause_time:int=5)->None:
        """BaseConductor Constructor. This will check that any class inheriting
        from it implements its validation functions.

        If 'name' is empty, one is generated with generate_conductor_id. Both
        'name' and 'pause_time' are validated before being stored on the
        instance."""
        check_implementation(type(self).valid_execute_criteria, BaseConductor)
        if not name:
            name = generate_conductor_id()
        self._is_valid_name(name)
        self.name = name
        self._is_valid_pause_time(pause_time)
        self.pause_time = pause_time

    def __new__(cls, *args, **kwargs):
        """A check that this base class is not instantiated itself, only
        inherited from. Raises a TypeError if BaseConductor is instantiated
        directly."""
        if cls is BaseConductor:
            msg = get_drt_imp_msg(BaseConductor)
            raise TypeError(msg)
        return object.__new__(cls)

    def _is_valid_name(self, name:str)->None:
        """Validation check for 'name' variable from main constructor. Is
        automatically called during initialisation. This does not need to be
        overridden by child classes."""
        valid_string(name, VALID_CONDUCTOR_NAME_CHARS)

    def _is_valid_pause_time(self, pause_time:int)->None:
        """Validation check for 'pause_time' variable from main constructor. Is
        automatically called during initialisation. This does not need to be
        overridden by child classes."""
        # The hint names this class, so that any validation error message
        # points at the conductor rather than at a handler.
        valid_natural(pause_time, hint="BaseConductor.pause_time")

    def prompt_runner_for_job(self)->Union[Dict[str,Any],Any]:
        """Function to ask the runner for a job. Sends the value 1 over the
        'to_runner_job' channel, then waits up to 'pause_time' seconds for a
        reply. Returns whatever was received, or None if nothing arrived
        within that time."""
        self.to_runner_job.send(1)

        if self.to_runner_job.poll(self.pause_time):
            return self.to_runner_job.recv()
        return None

    def start(self)->None:
        """Function to start the conductor as an ongoing thread, as defined by
        the main_loop function. Together, these will execute any code in a
        implemented conductors execute function sequentially, but concurrently
        to any other conductors running or other runner operations. This is
        intended as a naive multiprocessing implementation, and any more in
        depth parallelisation of execution must be implemented by a user by
        overriding this function, and the stop function."""
        self._stop_event = Event()
        self._handle_thread = Thread(
            target=self.main_loop,
            args=(self._stop_event,),
            daemon=True,
            name="conductor_thread"
        )
        self._handle_thread.start()

    def stop(self)->None:
        """Function to stop the conductor as an ongoing thread. May be
        overridden by any child class. This function should also be overridden
        if the start function has been. Note that this relies on the stop
        event and thread created by the start function."""
        self._stop_event.set()
        self._handle_thread.join()

    def main_loop(self, stop_event)->None:
        """Function defining an ongoing thread, as started by the start
        function and stopped by the stop function. Repeatedly prompts the
        runner for a job and executes each valid job directory received, until
        'stop_event' is set."""

        while not stop_event.is_set():
            reply = self.prompt_runner_for_job()

            # If we have received 'None' then we have already timed out so skip
            # this loop and start again
            if reply is None:
                continue

            try:
                valid_existing_dir_path(reply)
            except Exception:
                # Were not given a job dir, so sleep before trying again. We
                # must not fall through to execute, as the reply is not
                # something that can be run.
                sleep(self.pause_time)
                continue

            try:
                self.execute(reply)
            except Exception:
                # Deliberately best-effort, so that one failing job does not
                # kill the conductor thread.
                # TODO some error reporting here
                pass

    def valid_execute_criteria(self, job:Dict[str,Any])->Tuple[bool,str]:
        """Function to determine given an job definition, if this conductor can
        process it or not. Must be implemented by any child process."""
        pass

    def run_job(self, job_dir:str)->None:
        """Function to actually execute a job. This will read job
        definitions from its meta file, update the meta file and attempt to
        execute. Some unspecific feedback will be given on execution failure,
        but depending on what it is it may be up to the job itself to provide
        more detailed feedback. If you simply wish to alter the conditions
        under which the job is executed, please instead look at the execute
        function.

        Regardless of the outcome, the job directory is moved into
        'job_output_dir' once processing has finished."""
        valid_dir_path(job_dir, must_exist=True)

        # Test our job parameters. Even if its gibberish, we still move to
        # output
        abort = False
        try:
            meta_file = os.path.join(job_dir, META_FILE)
            job = threadsafe_read_status(meta_file)
            valid_job(job)

            # update the status file with running status
            threadsafe_update_status(
                {
                    JOB_STATUS: STATUS_RUNNING,
                    JOB_START_TIME: datetime.now()
                },
                meta_file
            )

        except Exception as e:
            # If something has gone wrong at this stage then its bad, so we
            # need to make our own error file
            error_file = os.path.join(job_dir, BACKUP_JOB_ERROR_FILE)
            write_file(f"Recieved incorrectly setup job.\n\n{e}", error_file)
            abort = True

        # execute the job
        if not abort:
            try:
                # The command is resolved relative to the job directory, but
                # is run with the current working directory of this process.
                result = subprocess.call(
                    os.path.join(job_dir, job["tmp script command"]),
                    cwd="."
                )

                if result == 0:
                    # Update the status file with the finalised status
                    threadsafe_update_status(
                        {
                            JOB_STATUS: STATUS_DONE,
                            JOB_END_TIME: datetime.now()
                        },
                        meta_file
                    )

                else:
                    # Update the status file with the error status. Don't
                    # overwrite any more specific error messages already
                    # created
                    threadsafe_update_status(
                        {
                            JOB_STATUS: STATUS_FAILED,
                            JOB_END_TIME: datetime.now(),
                            JOB_ERROR: "Job execution returned non-zero."
                        },
                        meta_file
                    )

            except Exception as e:
                # Update the status file with the error status. Don't overwrite
                # any more specific error messages already created
                threadsafe_update_status(
                    {
                        JOB_STATUS: STATUS_FAILED,
                        JOB_END_TIME: datetime.now(),
                        JOB_ERROR: f"Job execution failed. {e}"
                    },
                    meta_file
                )

        # Move the contents of the execution directory to the final output
        # directory.
        job_output_dir = \
            os.path.join(self.job_output_dir, os.path.basename(job_dir))
        shutil.move(job_dir, job_output_dir)

    def execute(self, job_dir:str)->None:
        """Function to run job execution. By default this will simply call the
        run_job function, to execute the job locally. However, this function
        may be overridden to execute the job in some other manner, such as on
        another resource. Note that the job itself should be executed using the
        run_job func in order to maintain expected logging etc."""
        self.run_job(job_dir)