# Management
| scontrol update nodename=nodeXX state=resume |
| scontrol update jobid=jobid jobname=newname |
| scontrol update jobid=jobid partition=exclusive |
| |
| scontrol show partition (partition_name) |
| scontrol show node (node_name) |
| scontrol show job jobid |
| |
| scontrol hold/release/requeue jobid/joblist |
| scontrol requeuehold/release jobid |
| |
| sacctmgr list assoc/account (user=username) |
| sacct -u username -S 2022-07-01 -E now --format=jobid,partition,jobname,user,nnodes,nodelist,start,end,elapsed,alloccpus,state |
Reference1
# Restart
# on master node
| $ systemctl restart slurmctld.service |
| $ systemctl restart slurmdbd.service |
| $ systemctl restart slurmd.service |
# on slave node
| $ systemctl restart slurmd.service |
# Upgrade
# on master node
| $ systemctl stop slurmdbd.service |
| $ systemctl stop slurmctld.service |
| $ systemctl stop slurmd.service |
# on slave node
| $ systemctl stop slurmd.service |
# on master node, make backup of database and old config files
| $ mysqldump -p --databases slurm_acct_db > /root/backup.sql |
| |
| $ mkdir /root/old_slurm |
| $ cp -r /etc/slurm/* /root/old_slurm |
# on all nodes, upgrade
| |
| |
| |
| |
| |
| $ rpmbuild -ta slurm-*.tar.bz2 |
| $ rpm --upgrade *.rpm |
| $ yum --disablerepo='*' list slurm |
| |
| |
# PAM
- Config1
| $ vim /etc/slurm/slurm.conf |
| |
| |
| TaskPlugin=task/cgroup |
| PrologFlags=contain |
- Config2
| $ vim /etc/pam.d/sshd |
| |
| account sufficient pam_listfile.so item=user sense=allow onerr=fail file=/etc/ssh/slurm_allowed_users |
| -account required pam_slurm_adopt.so |
- Config3
| $ vim /etc/pam.d/password-auth |
| |
| |
| |
# scp from master node to slave nodes
| $ scp /etc/slurm/slurm.conf nodeXX:/etc/slurm/ |
| $ scp /etc/pam.d/sshd nodeXX:/etc/pam.d/ |
| $ scp /etc/pam.d/password-auth nodeXX:/etc/pam.d/password-auth |
- Restart
| $ systemctl restart sshd.service |
Reference2
Reference3
# Issues
⚠️ Error
| Failed to start MUNGE authentication service. |
🔧 Solution
- Check time sync status
| $ systemctl stop ntpd.service |
| $ ntpd -gq |
| $ systemctl start ntpd.service |
- Check munge folder
| |
| $ mkdir /var/run/munge |
| $ chown -R munge:munge /var/run/munge/ |
| $ systemctl restart munge.service |
| $ systemctl restart slurmd.service |