Cfengine: Glasgow Worker Nodes
From GridPP Wiki
This is a cfengine excerpt from Glasgow showing how we manage our worker nodes with cfengine. Note that it's probably a good idea to split your cfengine file into sections once it gets big, but this is presented as a single cfagent.conf file for simplicity.
Note that worker nodes are members of the following classes: worker, grid, torque, autofs, scientific_sl_3 and, of course, the built-in "any" class.
##########################
#
# cfagent.conf for UKI-SCOTGRID-GLASGOW
#
# $Id: cfagent.conf 201 2006-11-22 12:43:00Z root $
#
##########################
groups:
# Base host classes derived from hostname ranges:
#   worker  = node001..node140 (batch worker nodes)
#   gridsvr = svr016..svr023   (grid service hosts)
#   disksvr = disk032..disk041 (disk/storage servers)
worker = ( HostRange(node,1-140) )
gridsvr = ( HostRange(svr,016-023) )
disksvr = ( HostRange(disk,032-041) )
# Nicer names for grid servers
ce = ( svr016 )
dpm = ( svr018 )
mon = ( svr019 )
ui = ( svr020 )
sitebdii = ( svr021 )
## Compound groups
# Batch system nodes
torque = ( ce worker )
# Nodes which look at the autofs system
autofs = ( worker ce ui )
# All grid nodes
grid = ( worker gridsvr disksvr )
control:
any::
# The actionsequence fixes the order in which cfengine runs its sections.
# Note: directories runs first so that destinations exist before 'copy',
# and 'copy' runs before 'shellcommands' so that copied configs can
# trigger (via define= classes) the service restarts below.
actionsequence = (
directories
files
links
editfiles
packages
copy
shellcommands
tidy
)
domain = ( beowulf.cluster )
# Locations on the cfengine master holding file skeletons and helper scripts.
skel = ( /var/cfengine/inputs/skel )
scripts = ( /var/cfengine/inputs/scripts )
syslog = ( on )
# Use checksums (not just timestamps) when deciding whether copies changed.
ChecksumUpdates = ( on )
# Package management: query with rpm, install via yum.
DefaultPkgMgr = ( rpm )
RPMcommand = ( /bin/rpm )
RPMInstallCommand = ( "/usr/bin/yum -y install %s" )
torque::
# Torque/PBS server address and the list of batch queues (colon-separated).
torquesvr = ( 10.141.255.16 )
torquequeues = ( dteam:atlas:alice:cms:lhcb:biom:pheno:zeus:sixt:ilc:babar:dzero:ngs:ops:glpnp:glpppt:glee:glbio )
# It would be nicer if this could be defined more dynamically...
scientific_sl_3::
# JDK version installed under /usr/java; also referenced by the
# 'directories' section and should be used by the java symlink in 'links'.
java = ( j2sdk1.4.2_12 )
!gridsvr::
# cfengine will run once an hour, so splay the cluster across 50 minutes
# to ensure the load on the master server is not too high
splaytime = ( 50 )
gridsvr::
# Grid servers are few, so a short splay is enough.
splaytime = ( 5 )
directories:
grid::
# We need to create the locations for files to be copied into - copy runs before shellcommands
# (and 'directories' runs first of all in the actionsequence).
/opt/glite/yaim mode=0700 owner=root group=root
/opt/glite/yaim/etc mode=0755 owner=root group=root
/opt/glite/yaim/functions/local mode=0755 owner=root group=root
/etc/grid-security mode=0755 owner=root group=root
scientific_sl_3::
# JDK install location; $(java) is defined in 'control' for scientific_sl_3.
/usr/java/$(java) mode=0755 owner=root group=root
torque::
# Batch-node directories: PBS mom config area plus job/home storage mounts.
/var/spool/pbs/mom_priv mode=0755 owner=root group=root
/gridstorage mode=0755 owner=root group=root
/home mode=0755 owner=root group=root
links:
grid.scientific_sl_3::
# In YAIM we give java location as /usr/java/current, and link here
# (It would be much better if grid stuff just used /etc/java.conf or JAVA_HOME, *sigh*)
# Use the $(java) variable (defined in 'control' for scientific_sl_3) so the
# link target stays consistent with the /usr/java/$(java) directory created
# in 'directories' — a JDK upgrade then needs only one edit, not two.
/usr/java/current -> /usr/java/$(java)
torque::
# Stable path for VO experiment software; the real area lives on the
# shared /grid filesystem.
/gridstorage/exptsw -> /grid/exp_soft
packages:
any::
# Ganglia monitoring daemon everywhere; 'newgmon' is defined when it is
# (re)installed, triggering the gmond restart in 'shellcommands'.
ganglia-gmond action=install elsedefine=newgmon
grid::
# Pin the CA certificate bundle to a known release.
lcg-CA action=install version=1.10
# N.B. note that runyaim happens when the yaim package is first installed
glite-yaim action=install elsedefine=runyaim
worker::
# Worker node meta package
glite-WN action=install
torque|ui::
# Packages requested by VOs
gcc action=install
gcc-ssa action=install
gcc-g77 action=install
gcc-g77-ssa action=install
zsh action=install
zlib-devel action=install
compat-libstdc++ action=install
tidy:
any::
# Make sure this is > max wallclock for the batch system!
# Removes anything under /tmp untouched for 12 time units (days per
# cfengine's default age semantics — TODO confirm against local config),
# recursing to unlimited depth.
/tmp pattern=* age=12 recurse=inf
copy:
# Distribute config files from $(skel) on the master. type=sum compares
# checksums rather than timestamps; define= classes raised here trigger
# the matching service restarts in 'shellcommands'.
# Master server is exempt from default files
any.!svr031::
# Root's environment
$(skel)/common/root/.bash_profile mode=0644 dest=/root/.bash_profile type=sum
$(skel)/common/root/.bashrc mode=0644 dest=/root/.bashrc type=sum
$(skel)/common/root/.ssh/authorized_keys mode=0644 dest=/root/.ssh/authorized_keys type=sum
# Security for servers and ssh
$(skel)/common/etc/ssh/ssh_known_hosts mode=644 dest=/etc/ssh/ssh_known_hosts type=sum
$(skel)/common/etc/ssh/ssh_config mode=644 dest=/etc/ssh/ssh_config define=newssh type=sum
$(skel)/common/etc/ssh/sshd_config mode=600 dest=/etc/ssh/sshd_config define=newssh type=sum
# Time, time, time!
$(skel)/common/etc/ntp.conf mode=644 dest=/etc/ntp.conf define=newntp type=sum
$(skel)/common/etc/ntp/step-tickers mode=644 dest=/etc/ntp/step-tickers define=newntp type=sum
# Environment for interactive shells (and jobs)
$(skel)/common/etc/profile.d/proxy.csh mode=644 dest=/etc/profile.d/proxy.csh type=sum
$(skel)/common/etc/profile.d/proxy.sh mode=644 dest=/etc/profile.d/proxy.sh type=sum
$(skel)/common/etc/profile.d/tmpdir.csh mode=644 dest=/etc/profile.d/tmpdir.csh type=sum
$(skel)/common/etc/profile.d/tmpdir.sh mode=644 dest=/etc/profile.d/tmpdir.sh type=sum
# Post boot signaling script
# This is an important part of Glasgow's auto install - it signals to the master server when the first boot
# after kickstart has happened.
$(skel)/common/etc/rc.d/rc.local mode=644 dest=/etc/rc.d/rc.local type=sum
grid::
# GridPP VOMS + YAIM setup for workers
$(skel)/grid/etc/grid-security/vomsdir/voms.gridpp.ac.uk mode=0644 dest=/etc/grid-security/vomsdir/voms.gridpp.ac.uk type=sum
$(skel)/yaim/site-info.def mode=600 dest=/opt/glite/yaim/etc/site-info.def type=sum
$(skel)/yaim/groups.conf mode=644 dest=/opt/glite/yaim/etc/groups.conf type=sum
$(skel)/yaim/users.conf mode=644 dest=/opt/glite/yaim/etc/users.conf type=sum
# We don't let YAIM do users - so override to a blank function
$(skel)/yaim/local/config_users mode=644 dest=/opt/glite/yaim/functions/local/config_users type=sum
torque::
$(skel)/torque/shosts.equiv mode=644 dest=/etc/ssh/shosts.equiv type=sum
torque|ui|dpm|disksvr::
# On torque hosts (and the ui) distribute the shadow and password files to avoid problems with account locking, etc.
# DPM and disk servers need this for gridftp
# 'localpoolaccounts' then runs the pool-account script in 'shellcommands'.
$(skel)/torque/passwd mode=644 dest=/etc/passwd define=localpoolaccounts type=sum
$(skel)/torque/shadow mode=400 dest=/etc/shadow type=sum
$(skel)/torque/group mode=644 dest=/etc/group type=sum
autofs::
# Automounter maps; any change restarts autofs via 'newautofs'.
$(skel)/autofs/auto.cluster mode=0644 dest=/etc/auto.cluster define=newautofs type=sum
$(skel)/autofs/auto.grid mode=0644 dest=/etc/auto.grid define=newautofs type=sum
$(skel)/autofs/auto.master mode=0644 dest=/etc/auto.master define=newautofs type=sum
gridsvr::
# Ganglia config differs between grid servers and workers.
$(skel)/gridsvr/etc/gmond.conf mode=0644 dest=/etc/gmond.conf define=newgmon type=sum
worker::
$(skel)/worker/etc/gmond.conf mode=0644 dest=/etc/gmond.conf define=newgmon type=sum
# Worker nodes need to route directly to grid and disk servers even when their public IPs are given
$(skel)/worker/etc/sysconfig/network-scripts/route-eth1 mode=0644 dest=/etc/sysconfig/network-scripts/route-eth1 define=needroute type=sum
shellcommands:
# Service restarts and one-shot actions, keyed on classes defined by the
# 'packages' and 'copy' sections above when something actually changed.
newgmon::
"/sbin/service gmond restart" umask=022
newssh::
"/sbin/service sshd restart" umask=022
newntp::
"/sbin/service ntpd restart" umask=022
newautofs::
"/sbin/service autofs restart" umask=022
# NOTE(review): newtorque/newmaui/newhttp are not defined anywhere in this
# excerpt — presumably raised in another part of the full config; verify.
newtorque::
"/sbin/service pbs_server restart" umask=022
newmaui::
"/sbin/service maui restart" umask=022
newhttp::
"/sbin/service httpd restart" umask=022
localpoolaccounts.!ui::
# Rebuild local pool accounts after a new /etc/passwd was copied in
# (skipped on the UI host).
"/var/cfengine/inputs/scripts/local_pool_accounts /etc/passwd" umask=022
worker.needroute::
# Static routes to the grid/disk server public subnets via the cluster NIC.
"/sbin/ip route add 130.209.239.16/28 dev eth1" umask=022
"/sbin/ip route add 130.209.239.32/28 dev eth1" umask=022
worker.runyaim::
# Only define startmom if this looks ok, otherwise withdraw from the batch system
"/opt/glite/yaim/scripts/configure_node /opt/glite/yaim/etc/site-info.def WN" umask=022 define=startmom elsedefine=stopmom
worker.startmom::
"/sbin/chkconfig pbs_mom on" umask=022
"/sbin/service pbs_mom restart" umask=022
worker.stopmom::
"/sbin/chkconfig pbs_mom off" umask=022
"/sbin/service pbs_mom stop" umask=022