[cam-users] CAM automatic resubmission
Jim Rosinski
rosinski@cgd.ucar.edu
Tue, 11 Mar 2003 12:56:10 -0700 (MST)
On Tue, 11 Mar 2003, eric maloney wrote:
> I'm interested in setting up some scripts to automatically resubmit my
> CAM2.0.1 jobs (atmosphere-only) on blackforest and/or bluesky. Has anyone
> developed such scripts? Such code existed in the release version of
> CCM3.6.6.
An example script I use is attached. The job resubmits itself upon
successful completion only if it is a restart run (nsrest == 1). The parts
relevant to the issue you raise can be found by searching on the string
"submit" (without the quotes). Note that this job will resubmit itself
indefinitely--the only way to break the cycle is with "llcancel" once you've
integrated long enough. Fancier things could be done, e.g. a countdown loop
that only resubmits so many times.
The multiple job steps at the top are useful for occasions when the mass
store is down and the job dies due to timeout.
Hope this helps.
Jim Rosinski
---------------------------------cut here---------------------------------
#! /usr/bin/csh -fvx
## Batch job script: builds the model, runs it, archives the source and the
## run log, and resubmits itself after a successful restart run.
# Set Loadleveler options
#-----------------------------------------------------------------------
## IBM
##------------
## Step 1: the primary job step (16 tasks on 8 nodes, nodes not shared).
#@ job_name = test
#@ step_name = step1
#@ class = csl_rg8
#@ shell = /usr/local/bin/tcsh
#@ job_type = parallel
#@ network.MPI = csss,not_shared,us
#@ output = test.out
#@ error = test.err
#@ total_tasks = 16
#@ node = 8
#@ node_usage = not_shared
#@ queue
## Rerun the job if step1 runs out of time
## Step 2 repeats the settings of step 1; its dependency keyword makes it
## run only when step1 ends with a nonzero return code.
## NOTE(review): both steps name the same output and error files
## (test.out / test.err) -- confirm that step2 replacing step1's output
## is acceptable.
#@ step_name = step2
#@ class = csl_rg8
#@ shell = /usr/local/bin/tcsh
#@ job_type = parallel
#@ network.MPI = csss,not_shared,us
#@ output = test.out
#@ error = test.err
#@ total_tasks = 16
#@ node = 8
#@ node_usage = not_shared
#@ dependency = ( step1 != 0 )
#@ queue
## ---- Case configuration ----------------------------------------------
## Case name: used in the work directory, namelist, log and archive names.
set caseid       = test

## Directory layout.
##   modelsrc     - root of the model source tree
##   submitdir    - directory holding this script (used when resubmitting)
##   submitscript - name of this script inside submitdir
##   builddir     - where the model is compiled
##   rundir       - where the model executes (same location as builddir here)
##   outdir       - where a copy of the compressed run log is placed
set modelsrc     = /fs/cgd/data0/$LOGNAME/camdevlatest
set submitdir    = /fs/cgd/home0/$LOGNAME/control_runs
set submitscript = runit.test
set builddir     = /ptmp/$LOGNAME/$caseid
set rundir       = /ptmp/$LOGNAME/$caseid
set outdir       = /fs/cgd/data0/$LOGNAME

## Run length handed to the namelist; the archive step also compresses the
## log when this is below -100. Other values used previously: -30, -175, -1.
## NOTE(review): negative values presumably mean days -- confirm against the
## model documentation.
set nelapse      = -1450

## Run type written to the namelist: 0 selects an initial run, any other
## value is treated as a restart by this script; only 1 triggers resubmission.
set nsrest       = 1

## Build tool name and NetCDF library location.
set gnumake      = gnumake
setenv LIB_NETCDF /usr/local/lib32/r4i4

## Input dataset file names (staged into the run directory before the run).
set absems_data  = abs_ems_factors_fastvx.052001.nc
set bndtvo       = noaao3.1990.21999.nc
set bndtvs       = sst_HadOIBl_bc_64x128_clim_c020411.monthlyoi.nc
set fsurdat      = surface-data.128x064.topo.nc
## Create the run and build directories when missing; give up if either
## one cannot be made.
foreach workdir ( $rundir $builddir )
    mkdir -p $workdir
    if ( $status != 0 ) then
        echo "cannot create $workdir"
        exit 1
    endif
end
## Everything up to the model run happens in the build directory
cd $builddir || exit 1
## Write the source root into the Rootdir file for ESMF
echo $modelsrc/cam1 >! Rootdir
## Filepath holds the source search path for a standalone CCM
## (i.e. uncoupled) run with Eulerian dynamics.
set srcdir = $modelsrc/cam1/models
cat >! Filepath << EOF || exit 1
$srcdir/atm/cam/src/dynamics/eul
$srcdir/atm/cam/src/advection/slt
$srcdir/atm/cam/src/control
$srcdir/atm/cam/src/physics/cam1
$srcdir/csm_share
$srcdir/utils/timing
$srcdir/atmlnd_share
$srcdir/atm/cam/src/ocnsice/dom
$srcdir/ice/csim4
$srcdir/lnd/clm2/src/main
$srcdir/lnd/clm2/src/biogeophys
$srcdir/lnd/clm2/src/biogeochem
$srcdir/lnd/clm2/src/mksrfdata
$srcdir/lnd/clm2/src/ecosysdyn
$srcdir/lnd/clm2/src/riverroute
$srcdir/atm/cam/src/utils
EOF
## Generate misc.h, params.h, and preproc.h for the standalone, T42,
## 26-level model. A header that already exists in the build directory is
## left alone, so hand-edited copies survive reruns.
##
## Additional source code mods should probably be placed here too: when
## gmake runs it searches the directory it is started from (the build
## directory) for source files *before* the directories listed in Filepath.
if ( ! -e misc.h ) then
    cat >! misc.h << EOF
#ifndef MISC_SET
#define MISC_SET
#define AIX
#define SPMD
#define SHELL_MSS
#endif
EOF
    if ( $status != 0 ) then
        echo "cannot create misc.h"
        exit 1
    endif
endif
if ( ! -e params.h ) then
    cat >! params.h << EOF
#ifndef PARAMS_SET
#define PARAMS_SET
#define PCNST 1
#define PNATS 1
#define PLEV 26
#define PLEVR 26
#define PLON 128
#define PLAT 64
#define PTRM 42
#define PTRN 42
#define PTRK 42
#define PCOLS 16
#endif
EOF
    if ( $status != 0 ) then
        echo "cannot create params.h"
        exit 1
    endif
endif
if ( ! -e preproc.h ) then
    cat >! preproc.h << EOF
#ifndef PREPROC_SET
#define PREPROC_SET
#define SHELL_MSS
#define COUP_CAM
#define SPMD
#define LSMLON 128
#define LSMLAT 64
#endif
EOF
    if ( $status != 0 ) then
        echo "cannot create preproc.h"
        exit 1
    endif
endif
## Build the model
## The executable name, its destination directory and the SPMD switch are
## exported before make is started (presumably read by the Makefile --
## confirm against the Makefile itself).
setenv EXENAME ccm3bin
setenv MODEL_EXEDIR $rundir
setenv SPMD TRUE
## Committed Makefile won't work on IBM: need O2 not O3
## For better or worse, I moded the Makefile in the tagged directory
## Stop if the Makefile cannot be copied; otherwise make would run with a
## stale Makefile or none at all.
cp $srcdir/atm/cam/bld/Makefile . || exit 1
## Use the make program chosen in the configuration section ($gnumake)
## rather than a hard-coded name, so that setting actually takes effect.
## All make output goes to out.gmake.<pid>; a failed build ends the job.
env DEBUG=FALSE NEWBUILD=TRUE timex $gnumake -j16 >&! out.gmake.$$ || exit 1
## Build a namelist and run the model
## Guard the directory change the same way the other cd calls in this
## script are guarded, so the namelist is never written somewhere else.
cd $rundir || exit 1
## The here-document is expanded by the shell, so the settings from the
## configuration section are substituted into the namelist text.
cat >! namelist << EOF || exit 1
&camexp
absems_data= '$absems_data'
bndtvo = '$bndtvo'
bndtvs = '$bndtvs'
caseid = '$caseid'
ctitle = '$caseid fractional with SST dataset from Jim Hurrell'
kmxhdc = 5
ncdata = '/JET/csm/domsstevappbl04/atm/init/domsstevappbl04.cam2.i.0026-01-01-00000.nc'
nelapse = $nelapse
mfilt(2) = 365
nsrest = $nsrest
iyear_ad = 1950
ice_conschk_frq = -1
/
&clmexp
fpftcon = '/fs/cgd/csm/inputdata/lnd/clm2/pftdata/pft-physiology-vegdyn-cleanup-ratio'
fsurdat = '$fsurdat'
finidat = '/$LOGNAME/csm/domsstevappbl04/lnd/init/domsstevappbl04.clm2.i.0026-01-01-00000.newclm.nc'
mksrf_fglacier= '/fs/cgd/csm/inputdata/lnd/clm2/rawdata/mksrf_glacier.nc'
mksrf_flai = '/fs/cgd/csm/inputdata/lnd/clm2/rawdata/mksrf_lai.nc'
mksrf_flanwat = '/fs/cgd/csm/inputdata/lnd/clm2/rawdata/mksrf_lanwat.nc'
mksrf_fsoicol = '/fs/cgd/csm/inputdata/lnd/clm2/rawdata/mksrf_soicol_clm2.nc'
mksrf_fsoitex = '/fs/cgd/csm/inputdata/lnd/clm2/rawdata/mksrf_soitex.10level.nc'
mksrf_furban = '/fs/cgd/csm/inputdata/lnd/clm2/rawdata/mksrf_urban.nc'
mksrf_fvegtyp = '/fs/cgd/csm/inputdata/lnd/clm2/rawdata/mksrf_pft.nc'
/
EOF
## Stage each input dataset into the run directory unless a copy is
## already there. A failed copy is not fatal at this point.
if ( ! -e $absems_data ) then
    cp /fs/cgd/csm/inputdata/atm/cam1/rad/$absems_data .
endif
if ( ! -e $bndtvo ) then
    cp /fs/cgd/csm/inputdata/atm/cam1/ozone/$bndtvo .
endif
if ( ! -e $bndtvs ) then
    cp /fs/cgd/data0/$LOGNAME/datasets/$bndtvs .
endif
if ( ! -e $fsurdat ) then
    cp /fs/cgd/csm/inputdata/lnd/clm2/srfdata/cam/$fsurdat .
endif
## Runtime environment for the executable
setenv XLSMPOPTS "stack=86000000"
setenv MP_SHARED_MEMORY yes
limit stacksize unlimited

## Pick the log file name and thread count by run type. Initial runs
## (nsrest == 0) are assumed to be interactive and get 16 threads; all
## other runs get 4 threads and a log name that carries the shell's pid.
if ( $nsrest == 0 ) then
    set outfile = out.${caseid}.init
    setenv OMP_NUM_THREADS 16
else
    set outfile = out.${caseid}.rest.$$
    setenv OMP_NUM_THREADS 4
endif

## Launch the model; modelret records whether it finished successfully and
## is consulted later when deciding on resubmission.
set modelret = good
poe ./ccm3bin < namelist >&! $outfile || set modelret = bad
## Tar up the source code for the case and write to the mass store
cd $builddir || exit 1
set curdate = `date +%y-%m-%d.%H:%M`
set tarname = $caseid.$curdate.tar
## The archive holds the local sources and Makefile, the namelist and run
## log from the run directory, and the model source tree.
tar cf $tarname *.[Fch] *.F90 Makefile -C $rundir namelist $outfile -C $srcdir .
## Upper-case login name, used as the top-level mass store directory
set mslogname = `echo $LOGNAME | tr '[a-z]' '[A-Z]'`
## In csh the trailing & applies to the whole "mswrite; rm" list, so the
## tar file is removed in the background after the write has finished.
mswrite -w passwd -t 1000 $tarname /$mslogname/csm/$caseid/atm/$tarname; \rm $tarname &
## Archive the run log. Long runs (nelapse below -100) get a compressed log.
## The log lives in the run directory, so it is always addressed through
## $rundir rather than relative to the current (build) directory.
## savedlog tracks the name the log has after the optional gzip, so the
## mass store write and the local copy both refer to a file that exists.
## -f lets gzip replace a compressed log left behind by an earlier run.
if ( $nelapse < -100 ) then
    gzip -f $rundir/$outfile
    set savedlog = ${outfile}.gz
else
    set savedlog = $outfile
endif
mswrite -w passwd -t 1000 $rundir/$savedlog /$mslogname/csm/$caseid/$savedlog &
## Keep a copy of the log (compressed or not) in the output directory
cp $rundir/$savedlog $outdir
## Resubmit
## Only a restart run (nsrest == 1) whose model step succeeded chains the
## next job; in every other case the script simply ends here.
if ( $modelret != good || $nsrest != 1 ) exit 0
## NOTE(review): this cd is not followed by an explicit failure check.
cd $submitdir
llsubmit $submitscript
exit 0