I was working on some other configuration that changed how the remote user was mapped to a system user, and now I am unable to replicate the error that I was seeing before. Previously, the system user on the OOD node had a different username than the user that the slurm commands were being run as on the slurm cluster (I am using ssh to run the slurm commands). My guess is that one of the bin_override scripts was not handling this correctly. However, I've now matched usernames on both systems and things seem to be working.
Now, the job status gets properly updated when the job is cancelled, and it allows me to delete the jobs from the job composer. Thanks for taking the time to help me out!
Although the problem is resolved now, here is the output from my squeue:
[tbpetersen@comet-ln3 ~]$ squeue -j 33172502
slurm_load_jobs error: Invalid job id specified
[tbpetersen@comet-ln3 ~]$ squeue -j 88888888888
squeue: error: Invalid job id: 88888888888
where 33172502 was a recently cancelled job and 88888888888 is an invalid job id
Here is the bin_override for squeue:
#!/usr/bin/python
from getpass import getuser
from select import select
from sh import ssh, ErrorReturnCode # pip3 install sh
import os
import re
import sys
import syslog
# Host passed to ssh as the destination on which the real squeue is run.
SUBMISSION_NODE = 'comet.sdsc.edu'
# Username taken from the environment (raises KeyError if USER is unset).
# NOTE(review): not referenced anywhere else in this script.
USER = os.environ['USER']
def run_remote_bin(remote_bin_path, *argv):
    """Run an executable on the submission node over SSH and return its output.

    Args:
        remote_bin_path: Path of the executable on the remote host (the real
            squeue when called from this script).
        *argv: Arguments for the remote executable. Each item is either one
            argument or a list/tuple of arguments (flattened one level).

    Returns:
        The remote command's merged stdout/stderr, decoded as UTF-8.

    Raises:
        SystemExit: If the remote command fails; its output is logged and
            printed first, and the remote exit code is propagated.
    """
    import shlex  # function-scope import keeps this block self-contained

    # Callers may hand over a whole argument list as a single positional.
    remote_args = []
    for arg in argv:
        if isinstance(arg, (list, tuple)):
            remote_args.extend(arg)
        else:
            remote_args.append(arg)

    # ssh joins the command words with spaces and the remote shell re-parses
    # the result, so each word is quoted to survive that second parse intact
    # (spaces, '|', ';', '$', globs, empty strings).
    quoted_args = [shlex.quote(str(arg)) for arg in remote_args]

    try:
        result = ssh(
            SUBMISSION_NODE,
            '-q',
            '-oBatchMode=yes',  # never hang waiting for a password prompt
            remote_bin_path,    # the real executable on the remote
            *quoted_args,       # its (shell-quoted) arguments
            _err_to_out=True    # merge stderr into stdout
        )
    except ErrorReturnCode as e:
        output = e.stdout.decode('utf-8')
        syslog.syslog(syslog.LOG_INFO, output)
        print(output)
        sys.exit(e.exit_code)

    output = result.stdout.decode('utf-8')
    syslog.syslog(syslog.LOG_INFO, output)
    return output
def filter_args(args):
    """Return a new list holding *args* without any '--noconvert' entries."""
    return [arg for arg in args if arg != '--noconvert']
def main():
    """Forward the command-line arguments to the remote squeue and print its output."""
    # '--noconvert' is stripped by filter_args before the call is forwarded.
    forwarded_args = filter_args(sys.argv[1:])
    listing = run_remote_bin('/bin/squeue', forwarded_args)
    print(listing)


if __name__ == '__main__':
    main()
and for scancel:
#!/usr/bin/python
from getpass import getuser
from select import select
from sh import ssh, ErrorReturnCode # pip3 install sh
import os
import re
import sys
import syslog
# Host passed to ssh as the destination on which the real scancel is run.
SUBMISSION_NODE = 'comet.sdsc.edu'
# Username taken from the environment (raises KeyError if USER is unset).
# NOTE(review): not referenced anywhere else in this script.
USER = os.environ['USER']
def run_remote_bin(remote_bin_path, *argv):
    """Run an executable on the submission node over SSH and return its output.

    Args:
        remote_bin_path: Path of the executable on the remote host (the real
            scancel when called from this script).
        *argv: Arguments for the remote executable. Each item is either one
            argument or a list/tuple of arguments (flattened one level).

    Returns:
        The remote command's merged stdout/stderr, decoded as UTF-8.

    Raises:
        SystemExit: If the remote command fails; its output is logged and
            printed first, and the remote exit code is propagated.
    """
    import shlex  # function-scope import keeps this block self-contained

    # Callers may hand over a whole argument list as a single positional.
    remote_args = []
    for arg in argv:
        if isinstance(arg, (list, tuple)):
            remote_args.extend(arg)
        else:
            remote_args.append(arg)

    # ssh joins the command words with spaces and the remote shell re-parses
    # the result, so each word is quoted to survive that second parse intact
    # (spaces, '|', ';', '$', globs, empty strings).
    quoted_args = [shlex.quote(str(arg)) for arg in remote_args]

    # NOTE(review): '-q' is not passed here, so any SSH warnings end up in the
    # merged output that is logged and printed -- confirm this is intended.
    try:
        result = ssh(
            SUBMISSION_NODE,
            '-oBatchMode=yes',  # never hang waiting for a password prompt
            remote_bin_path,    # the real executable on the remote
            *quoted_args,       # its (shell-quoted) arguments
            _err_to_out=True    # merge stderr into stdout
        )
    except ErrorReturnCode as e:
        output = e.stdout.decode('utf-8')
        syslog.syslog(syslog.LOG_INFO, output)
        print(output)
        sys.exit(e.exit_code)

    output = result.stdout.decode('utf-8')
    syslog.syslog(syslog.LOG_INFO, output)
    return output
def main():
    """Forward the command-line arguments to the remote scancel and print its output."""
    forwarded_args = sys.argv[1:]
    response = run_remote_bin('/bin/scancel', forwarded_args)
    print(response)


if __name__ == '__main__':
    main()
Thanks!