Slurm administration notes: management commands, service restart/upgrade procedure, PAM configuration, and common issues. (Work in progress — continue updating.)

# Management

scontrol update nodename=nodeXX state=resume
scontrol update jobid=jobid jobname=newname
scontrol update jobid=jobid partition=exclusive
scontrol show partition (partition_name)
scontrol show node (node_name)
scontrol show job jobid
scontrol hold/release/requeue jobid/joblist # use this command when jobs are waiting in queue
scontrol requeuehold/release jobid # use this command when jobs are running
sacctmgr list assoc/account (user=username)
sacct -u username -S 2022-07-01 -E now --format=jobid,partition,jobname,user,nnodes,nodelist,start,end,elapsed,alloccpus,state

Reference1

# Restart

# on master node

$ systemctl restart slurmctld.service
$ systemctl restart slurmdbd.service
$ systemctl restart slurmd.service

# on slave node

$ systemctl restart slurmd.service

# Upgrade

# on master node

$ systemctl stop slurmdbd.service
$ systemctl stop slurmctld.service
$ systemctl stop slurmd.service

# on slave node

$ systemctl stop slurmd.service

# on master node, make backup of database and old config files

$ mysqldump -p --databases slurm_acct_db > /root/backup.sql
$ mkdir old_slurm
$ cp -r /etc/slurm/* /root/old_slurm

# all nodes upgrade

# conda deactivate, clean environment
# echo $LD_LIBRARY_PATH [*]
# unset LD_LIBRARY_PATH [*]
# export PATH=/usr/lib64/qt-3.3/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/usr/local/bin:/root/bin [*]
# yum --disablerepo='*' localinstall libaec-1.0.4-1.el7.x86_64.rpm libaec-devel-1.0.4-1.el7.x86_64.rpm hdf5-devel-1.8.12-12.el7.x86_64.rpm hdf5-1.8.12-12.el7.x86_64.rpm 
$ rpmbuild -ta slurm-*.tar.bz2
$ rpm -Uvh *.rpm # for downgrade, use "rpm -Uvh --oldpackage *.rpm"
$ yum --disablerepo='*' list slurm
# "[*]" maybe necessary

# Pam

  1. Config1
$ vim /etc/slurm/slurm.conf
#TaskPlugin=task/affinity
TaskPlugin=task/cgroup
PrologFlags=contain
  1. Config2
$ vim /etc/pam.d/sshd
account    sufficient    pam_listfile.so item=user sense=allow onerr=fail file=/etc/ssh/slurm_allowed_users
-account    required      pam_slurm_adopt.so
  1. Config3
$ vim /etc/pam.d/password-auth
#account    sufficient    pam_localuser.so
#-session   optional      pam_systemd.so

# scp from master node to slave nodes

$ scp /etc/slurm/slurm.conf nodeXX:/etc/slurm/
$ scp /etc/pam.d/sshd nodeXX:/etc/pam.d/
$ scp /etc/pam.d/password-auth nodeXX:/etc/pam.d/password-auth
  1. Restart
$ systemctl restart sshd.service

Reference2
Reference3


# Issues

⚠️ Error

Failed to start MUNGE authentication service.

🔧 Solution

  1. Check time sync status
$ systemctl stop ntpd.service
$ ntpd -gq
$ systemctl start ntpd.service
  1. Check munge folder
# check if /var/run/munge is missing
$ mkdir /var/run/munge
$ chown -R munge:munge /var/run/munge/
$ systemctl restart munge.service
$ systemctl restart slurmd.service