[Dart-dev] [3482] DART/trunk/models/MITgcm_ocean/shell_scripts: Updated to implicitly run with a nodefile if not submitted through

nancy at ucar.edu
Fri Aug 1 09:56:55 MDT 2008


Modified: DART/trunk/models/MITgcm_ocean/shell_scripts/job.simple.csh
===================================================================
--- DART/trunk/models/MITgcm_ocean/shell_scripts/job.simple.csh	2008-07-31 22:01:48 UTC (rev 3481)
+++ DART/trunk/models/MITgcm_ocean/shell_scripts/job.simple.csh	2008-08-01 15:56:54 UTC (rev 3482)
@@ -14,31 +14,24 @@
 #-----------------------------------------------------------------------------
 # job.simple.csh ... Top level script to run a single assimilation experiment.
 #
-#  Unlike the more complex job.csh, this script only processes a single 
-#  observation file.  Still fairly complex; requires a raft of
-#  data files and most of them are in hardcoded locations.
+# Unlike the more complex job.csh, this script only processes a single 
+# observation file.  Still fairly complex; requires a raft of
+# data files and most of them are in hardcoded locations.
 #
-# You need to know which of several batch systems you are using.  The most
-# common one is LSF.   PBS is also common.  (POE is another but is
-# not supported directly by this script.  It is not recommended that you have a
-# parallel cluster without a batch system (it schedules which nodes are assigned
-# to which processes) but it is possible to run that way -- you have to do
-# more work to get the information about which nodes are involved to the 
-# parallel tasks -- but anyway, there is a section below that uses ssh and no
-# batch.
+# This script is designed to be run from the command line (as a single thread)
+# and should only take a few seconds to a minute to complete, depending on
+# the filesystem performance and data file size.
 #
-# How to submit this job:
-#  1. Look at the #BSUB or #PBS sections below and adjust any of the parameters
-#     on your cluster.  Queue names are very system specific; some systems 
-#     require wall-clock limits; some require an explicit charge code.
-#  2. Submit this script to the queue:
-#        LSF:   bsub < job.simple.csh
-#        PBS:   qsub job.simple.csh
-#       NONE:   job.simple.csh
+# The script moves the necessary files to the current directory - in DART
+# nomenclature, this will be called CENTRALDIR. 
+# After everything is confirmed to have been assembled, it is possible
+# to edit the data, data.cal, and input.nml files for the specifics of
+# the experiment, as well as to finalize the configuration of a 'nodelist' file.
 #
-# The script moves the necessary files to the current directory and then
-# starts 'filter' as a parallel job on all nodes; each of these tasks will 
-# call some a separate model_advance.csh when necessary.
+# Once the 'table is set', all that remains is to start/submit the 
+# 'runme_filter' script. That script will spawn 'filter' as a 
+# parallel job on the appropriate nodes; each of these tasks will 
+# call a separate advance_model.csh when necessary.
 #
 # The central directory is where the scripts reside and where script and 
 # program I/O are expected to happen.
@@ -129,6 +122,13 @@
 mkdir -p ${experiment}/{MIT,DART}
 
 #-----------------------------------------------------------------------------
+# Early exit just to check that everything moved OK, and the table is set.
+# Gives us a chance to edit the local input.nml, data.cal, etc. if needed.
+#-----------------------------------------------------------------------------
+
+exit
+
+#-----------------------------------------------------------------------------
 # Runs filter which integrates the results of model advances  (async=4).
 #-----------------------------------------------------------------------------
 
@@ -152,6 +152,7 @@
 ls -l
 
 ${MOVE} *.data *.meta         ${experiment}/MIT
+${MOVE} data data.cal         ${experiment}/MIT
 ${MOVE} STD*                  ${experiment}/MIT
 
 ${MOVE} filter_restart*            ${experiment}/DART
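
The comments added to job.simple.csh above refer to a 'nodelist' file that can
be set up while the script is paused at the early 'exit'. As a point of
reference, here is a minimal sketch of assembling such a file, using
hypothetical host names (real names are site-specific; this mirrors the
commented examples in runme_filter below):

   # assemble a plain-text nodelist in CENTRALDIR, one entry per MPI task
   # (hypothetical hosts; listing a host twice runs two tasks on it)
   echo "node1" >! nodelist
   echo "node1" >> nodelist
   echo "node2" >> nodelist
   echo "node2" >> nodelist

Listing one entry per task keeps the number of lines equal to the number of
MPI tasks, which is how the updated runme_filter derives NUM_PROCS
(cat nodelist | wc -l).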

Modified: DART/trunk/models/MITgcm_ocean/shell_scripts/runme_filter
===================================================================
--- DART/trunk/models/MITgcm_ocean/shell_scripts/runme_filter	2008-07-31 22:01:48 UTC (rev 3481)
+++ DART/trunk/models/MITgcm_ocean/shell_scripts/runme_filter	2008-08-01 15:56:54 UTC (rev 3482)
@@ -1,14 +1,12 @@
-#!/bin/tcsh 
+#!/bin/tcsh
 #
 # Data Assimilation Research Testbed -- DART
-# Copyright 2004-2006, Data Assimilation Research Section, 
+# Copyright 2004-2006, Data Assimilation Research Section,
 # University Corporation for Atmospheric Research
 # Licensed under the GPL -- www.gpl.org/licenses/gpl.html
 #
 # $Id: runme_filter 2799 2007-04-04 23:17:51Z nancy $
 #
-# start at a generic run script for the mpi version.  this should probably
-# end up in the shell scripts directory - but it is here for now.  nsc.
 #=============================================================================
 # This block of directives constitutes the preamble for the LSF queuing system
 # LSF is used on the IBM   Linux cluster 'lightning'
@@ -28,29 +26,29 @@
 ##=============================================================================
 #BSUB -J filter
 #BSUB -o filter.%J.log
-#BSUB -q dedicated
+#BSUB -q regular
 #BSUB -n 20
 #BXXX -P 868500xx
-#BSUB -W 2:00
+#BSUB -W 12:00
 #BSUB -N -u ${USER}@ucar.edu
 #
 ##=============================================================================
 ## This block of directives constitutes the preamble for the PBS queuing system
-## PBS is used on the CGD   Linux cluster 'bangkok'
-## PBS is used on the CGD   Linux cluster 'calgary'
-## 
+## PBS is used on the CGD Linux cluster 'bangkok'
+## PBS is used on the CGD Linux cluster 'calgary'
+##
 ## the normal way to submit to the queue is:    qsub runme_filter
-## 
+##
 ## an explanation of the most common directives follows:
 ## -N     Job name
 ## -r n   Declare job non-rerunable
 ## -e <arg>  filename for standard error
-## -o <arg>  filename for standard out 
+## -o <arg>  filename for standard out
 ## -q <arg>   Queue name (small, medium, long, verylong)
-## -l nodes=xx:ppn=2   requests BOTH processors on the node. On both bangkok 
+## -l nodes=xx:ppn=2   requests BOTH processors on the node. On both bangkok
 ##                     and calgary, there is no way to 'share' the processors
 ##                     on the node with another job, so you might as well use
-##                     them both.  (ppn == Processors Per Node)
+##                     them both. (ppn == Processors Per Node)
 ##=============================================================================
 #PBS -N filter
 #PBS -r n
@@ -60,10 +58,10 @@
 #PBS -l nodes=10:ppn=2
 
 # if async=2, e.g. you are going to run './mitgcmuv', single process
-# (or possibly 'mpirun -np 1 ./mitgcmuv'), so each processor advances 
+# (or possibly 'mpirun -np 1 ./mitgcmuv'), so each processor advances
 # one ensemble independently of the others, leave this as false.
 #
-# if async=4, e.g.  all the processors advance each mitgcmuv in turn with
+# if async=4, e.g. all the processors advance each mitgcmuv in turn with
 # mpirun -np 64 mitgcmuv (or whatever) for as many ensembles as you have,
 # set this to "true"
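
A note on the setting these comments describe: the assignment is presumably
made just below this comment block (it is not part of the diff context). A
minimal sketch of the two cases, assuming the variable keeps the name
parallel_model that the rest of the script tests:

   # async=2: each ensemble member is advanced by a single-process mitgcmuv,
   # so leave this false and let each filter task do its own advances.
   set parallel_model = "false"

   # async=4: every model advance is itself an MPI job (mpirun -np N mitgcmuv),
   # coordinated through the named pipes handled further down; set to true.
   #set parallel_model = "true"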
 
@@ -83,228 +81,226 @@
 
 if ($?LS_SUBCWD) then
 
-   # LSF has a list of processors already in a variable (LSB_HOSTS)
-   #  alias submit 'bsub < \!*'
+    # LSF has a list of processors already in a variable (LSB_HOSTS)
+    # alias submit 'bsub < \!*'
+    echo "LSF - using mpirun.lsf for execution"
 
-   # each filter task advances the ensembles, each running on 1 proc.
-   if ( "$parallel_model" == "false" ) then
-   
-     mpirun.lsf ./filter
-   
-   else
-   
-   # filter runs in parallel until time to do a model advance,
-   # and then this script starts up the mitgcmuv jobs, each one
-   # running in parallel.  then it runs wakeup_filter to wake
-   # up filter so it can continue.
-   
-     \rm  -f model_to_filter.lock filter_to_model.lock
-     mkfifo  model_to_filter.lock filter_to_model.lock
+    # each filter task advances the ensembles, each running on 1 proc.
+    if ( "$parallel_model" == "false" ) then
 
-     set filterhome = ~/.filter$$
-     if ( ! -e $filterhome) mkdir $filterhome
-   
-     # this starts filter but also returns control back to
-     # this script immediately.
-      
-     (setenv HOME $filterhome; mpirun.lsf ./filter)  &
-       
-     while ( -e filter_to_model.lock )
-       
-       set todo=`( echo $< ) < filter_to_model.lock`
-       echo "todo received, value = ${todo}"
-       
-       if ( "${todo}" == "finished" ) then
-         echo "main script: filter done."
-         wait
-         break                                
+       mpirun.lsf ./filter
 
-       else if ( "${todo}" == "advance" ) then
-         
-         # the second number below must match the number
-         # of ensembles.  and in input.nml, the advance model
-         # command must have -np N with N equal to the number
-         # of processors this job is using.
-   
-         echo "calling model advance now:"
-         ./advance_model.csh 0 ${num_ens} filter_control00000
-         
-         echo "restarting filter."
-         mpirun.lsf ./wakeup_filter
-       
-       else
-         
-         echo "main script: unexpected value received."
-         break
-       
-       endif
-      
-     end
-      
-     echo "filter finished, removing pipes."
-     rm -f model_to_filter.lock filter_to_model.lock
-   
-     if ( -d $filterhome) rmdir $filterhome
-   endif
-   
+    else
 
+    # filter runs in parallel until time to do a model advance,
+    # and then this script starts up the mitgcmuv jobs, each one
+    # running in parallel. then it runs wakeup_filter to wake
+    # up filter so it can continue.
+
+      \rm -f model_to_filter.lock filter_to_model.lock
+      mkfifo model_to_filter.lock filter_to_model.lock
+
+      set filterhome = ~/.filter$$
+      if ( ! -e $filterhome) mkdir $filterhome
+
+      # this starts filter but also returns control back to
+      # this script immediately.
+
+      (setenv HOME $filterhome; mpirun.lsf ./filter) &
+
+      while ( -e filter_to_model.lock )
+
+        set todo=`( echo $< ) < filter_to_model.lock`
+        echo "todo received, value = ${todo}"
+
+        if ( "${todo}" == "finished" ) then
+          echo "main script: filter done."
+          wait
+          break
+
+        else if ( "${todo}" == "advance" ) then
+
+          # the second number below must match the number
+          # of ensembles. Also, in input.nml, the advance model
+          # command must have -np N with N equal to the number
+          # of processors this job is using.
+
+          echo "calling model advance now:"
+          ./advance_model.csh 0 ${num_ens} filter_control00000
+
+          echo "restarting filter."
+          mpirun.lsf ./wakeup_filter
+
+        else
+
+          echo "main script: unexpected value received."
+          break
+
+        endif
+
+      end
+
+      echo "filter finished, removing pipes."
+      \rm -f model_to_filter.lock filter_to_model.lock
+
+      if ( -d $filterhome) rmdir $filterhome
+    endif
+
+
 else if ($?PBS_O_WORKDIR) then
 
-   # PBS has a list of processors in a file whose name is (PBS_NODEFILE)
-   #  alias submit 'qsub \!*'
+    # PBS has a list of processors in a file whose name is (PBS_NODEFILE)
+    # alias submit 'qsub \!*'
+    echo "PBS - using mpirun for execution"
 
-   # each filter task advances the ensembles, each running on 1 proc.
-   if ( "$parallel_model" == "false" ) then
-   
-     mpirun ./filter
-   
-   else
-   
-   # filter runs in parallel until time to do a model advance,
-   # and then this script starts up the mitgcmuv jobs, each one
-   # running in parallel.  then it runs wakeup_filter to wake
-   # up filter so it can continue.
-   
-     rm  -f model_to_filter.lock filter_to_model.lock
-     mkfifo model_to_filter.lock filter_to_model.lock
+    # each filter task advances the ensembles, each running on 1 proc.
+    if ( "$parallel_model" == "false" ) then
 
-     set filterhome = ~/.filter
-     if ( ! -e $filterhome) mkdir $filterhome
-   
-     # this starts filter but also returns control back to
-     # this script immediately.
-      
-     (setenv HOME $filterhome; mpirun ./filter)  &
-       
-     while ( -e filter_to_model.lock )
-       
-       set todo=`( echo $< ) < filter_to_model.lock`
-       echo "todo received, value = ${todo}"
-       
-       if ( "${todo}" == "finished" ) then
-         echo "main script: filter done."
-         wait
-         break                                
+      mpirun ./filter
 
-       else if ( "${todo}" == "advance" ) then
-         
-         # the second number below must match the number
-         # of ensembles.  and in input.nml, the advance model
-         # command must have -np N with N equal to the number
-         # of processors this job is using.
-   
-         echo "calling model advance now:"
-         ./advance_model.csh 0 ${num_ens} filter_control00000
-         
-         echo "restarting filter."
-         mpirun  ./wakeup_filter
-       
-       else
-         
-         echo "main script: unexpected value received."
-         break
-       
-       endif
-      
-     end
-      
-     echo "filter finished, removing pipes."
-     rm -f model_to_filter.lock filter_to_model.lock
-   
-     if ( -d $filterhome) rmdir $filterhome
-   endif
-   
-else if ($?MYNODEFILE) then
+    else
 
-   # If you have a linux cluster with no queuing software, use this
-   # section.  The list of computational nodes is given to the mpirun
-   # command and it assigns them as they appear in the file.  In some
-   # cases it seems to be necessary to wrap the command in a small
-   # script that changes to the current directory before running.
+    # filter runs in parallel until time to do a model advance,
+    # and then this script starts up the mitgcmuv jobs, each one
+    # running in parallel. then it runs wakeup_filter to wake
+    # up filter so it can continue.
 
-   echo "running with no queueing system"
+      \rm -f model_to_filter.lock filter_to_model.lock
+      mkfifo model_to_filter.lock filter_to_model.lock
 
-   # before running this script, do this once.  the syntax is
-   # node name : how many tasks you can run on it
-   #setenv MYNODEFILE  ~/nodelist
-   #echo "node7:2" >! $MYNODEFILE
-   #echo "node5:2" >> $MYNODEFILE
-   #echo "node3:2" >> $MYNODEFILE
-   #echo "node1:2" >> $MYNODEFILE
+      set filterhome = ~/.filter
+      if ( ! -e $filterhome) mkdir $filterhome
 
-   setenv NUM_PROCS 8
-   echo "running with $NUM_PROCS nodes specified from $MYNODEFILE"
+      # this starts filter but also returns control back to
+      # this script immediately.
 
-   # each filter task advances the ensembles, each running on 1 proc.
-   if ( "$parallel_model" == "false" ) then
+      (setenv HOME $filterhome; mpirun ./filter) &
 
-      mpirun -np $NUM_PROCS -nolocal -machinefile $MYNODEFILE ./filter
+      while ( -e filter_to_model.lock )
 
-   else
+        set todo=`( echo $< ) < filter_to_model.lock`
+        echo "todo received, value = ${todo}"
 
-   # filter runs in parallel until time to do a model advance,
-   # and then this script starts up the mitgcmuv jobs, each one
-   # running in parallel.  then it runs wakeup_filter to wake
-   # up filter so it can continue.
-   
-     rm  -f model_to_filter.lock filter_to_model.lock
-     mkfifo model_to_filter.lock filter_to_model.lock
+        if ( "${todo}" == "finished" ) then
+          echo "main script: filter done."
+          wait
+          break
 
-     set filterhome = ~/.filter
-     if ( ! -e $filterhome) mkdir $filterhome
-   
-     # this starts filter but also returns control back to
-     # this script immediately.
-      
-     (setenv HOME $filterhome; \
-      mpirun -np $NUM_PROCS -nolocal -machinefile $MYNODEFILE ./filter ) &
-       
-     while ( -e filter_to_model.lock )
-       
-       set todo=`( echo $< ) < filter_to_model.lock`
-       echo "todo received, value = ${todo}"
-       
-       if ( "${todo}" == "finished" ) then
-         echo "main script: filter done."
-         wait
-         break                                
+        else if ( "${todo}" == "advance" ) then
 
-       else if ( "${todo}" == "advance" ) then
-         
-         # the second number below must match the number
-         # of ensembles.  and in input.nml, the advance model
-         # command must have -np N with N equal to the number
-         # of processors this job is using.
-   
-         echo "calling model advance now:"
-         ./advance_model.csh 0 ${num_ens} filter_control00000
-         
-         echo "restarting filter."
-         mpirun -np $NUM_PROCS -nolocal -machinefile $MYNODEFILE ./wakeup_filter
-       
-       else
-         
-         echo "main script: unexpected value received."
-         break
-       
-       endif
-      
-     end
-      
-     echo "filter finished, removing pipes."
-     rm -f model_to_filter.lock filter_to_model.lock
-   
-     if ( -d $filterhome) rmdir $filterhome
-   endif
-   
+          # the second number below must match the number
+          # of ensembles. Also, in input.nml, the advance model
+          # command must have -np N with N equal to the number
+          # of processors this job is using.
+
+          echo "calling model advance now:"
+          ./advance_model.csh 0 ${num_ens} filter_control00000
+
+          echo "restarting filter."
+          mpirun ./wakeup_filter
+
+        else
+
+          echo "main script: unexpected value received."
+          break
+
+        endif
+
+      end
+
+      echo "filter finished, removing pipes."
+      \rm -f model_to_filter.lock filter_to_model.lock
+
+      if ( -d $filterhome) rmdir $filterhome
+    endif
+
 else
 
-   # interactive - assume you are using 'lam-mpi' and that you have
-   # already run 'lamboot' once to start the lam server, or that you
-   # are running with a machine that has mpich installed.
+    # If you have a linux cluster with no queuing software, use this
+    # section. The list of computational nodes is given to the mpirun
+    # command and it assigns them as they appear in the file. In some
+    # cases it seems to be necessary to wrap the command in a small
+    # script that changes to the current directory before running.
 
-   echo "running interactively"
-   mpirun -np 4 ./filter
+    echo "running with no queueing system"
 
+    # before running this script, do this once. the syntax is
+    # node name : how many tasks you can run on it
+    #setenv MYNODEFILE ~/nodelist
+    #echo "node7:2" >! $MYNODEFILE
+    #echo "node5:2" >> $MYNODEFILE
+    #echo "node3:2" >> $MYNODEFILE
+    #echo "node1:2" >> $MYNODEFILE
+
+    # Ibrahim ... check value of MPIRUN and MYNODEFILE 
+
+    setenv NUM_PROCS `cat nodelist | wc -l`
+    set MYNODEFILE = nodelist
+    set MPIRUN = /opt/mpich/myrinet/pgi/bin/mpirun
+
+    echo "running with $NUM_PROCS nodes specified from $MYNODEFILE"
+
+    # each filter task advances the ensembles, each running on 1 proc.
+    if ( "$parallel_model" == "false" ) then
+
+       $MPIRUN -np $NUM_PROCS -nolocal -machinefile $MYNODEFILE ./filter
+
+    else
+
+    # filter runs in parallel until time to do a model advance,
+    # and then this script starts up the mitgcmuv jobs, each one
+    # running in parallel. then it runs wakeup_filter to wake
+    # up filter so it can continue.
+
+      \rm -f model_to_filter.lock filter_to_model.lock
+      mkfifo model_to_filter.lock filter_to_model.lock
+
+      set filterhome = ~/.filter$$
+      if ( ! -e $filterhome) mkdir $filterhome
+
+      # this starts filter but also returns control back to
+      # this script immediately.
+
+      (setenv HOME $filterhome; \
+       $MPIRUN -np $NUM_PROCS -nolocal -machinefile $MYNODEFILE ./filter) &
+
+      while ( -e filter_to_model.lock )
+
+        set todo=`( echo $< ) < filter_to_model.lock`
+        echo "todo received, value = ${todo}"
+
+        if ( "${todo}" == "finished" ) then
+          echo "main script: filter done."
+          wait
+          break
+
+        else if ( "${todo}" == "advance" ) then
+
+          # the second number below must match the number
+          # of ensembles. Also, in input.nml, the advance model
+          # command must have -np N with N equal to the number
+          # of processors this job is using.
+
+          echo "calling model advance now:"
+          ./advance_model.csh 0 ${num_ens} filter_control00000
+
+          echo "restarting filter."
+          $MPIRUN -np $NUM_PROCS -nolocal -machinefile $MYNODEFILE ./wakeup_filter
+
+        else
+
+          echo "main script: unexpected value received."
+          break
+
+        endif
+
+      end
+
+      echo "filter finished, removing pipes."
+      \rm -f model_to_filter.lock filter_to_model.lock
+
+      if ( -d $filterhome) rmdir $filterhome
+    endif
+
 endif
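
The main functional change in runme_filter is the final branch: when neither
LS_SUBCWD nor PBS_O_WORKDIR is set, the script no longer expects a MYNODEFILE
environment variable but instead reads a local file named 'nodelist' and uses
a hardwired mpirun path (site-specific, and likely to need adjustment). A
minimal usage sketch, assuming the executables and a nodelist are already
staged in CENTRALDIR (the directory path is hypothetical):

   # run without a batch system; nodelist was assembled as shown earlier
   cd /path/to/CENTRALDIR
   wc -l nodelist        # this line count becomes NUM_PROCS
   ./runme_filter

Since NUM_PROCS is simply the number of lines in nodelist, the -np value
handed to mpirun assumes one task per line.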
 

Modified: DART/trunk/models/MITgcm_ocean/shell_scripts/runmodel_1x
===================================================================
--- DART/trunk/models/MITgcm_ocean/shell_scripts/runmodel_1x	2008-07-31 22:01:48 UTC (rev 3481)
+++ DART/trunk/models/MITgcm_ocean/shell_scripts/runmodel_1x	2008-08-01 15:56:54 UTC (rev 3482)
@@ -20,9 +20,9 @@
 # -q queue    cheapest == [standby, economy, (regular,debug), premium] == $$$$
 # -n number of processors  (really)
 ##=============================================================================
-#BSUB -J testrun
-#BSUB -o testrun.%J.log
-#BSUB -q dedicated
+#BSUB -J fortnight
+#BSUB -o fortnight.%J.log
+#BSUB -q regular
 #BSUB -N -u ${USER}@ucar.edu
 #BSUB -n 20
 #BSUB -W 12:00
@@ -66,7 +66,6 @@
    setenv JOBNAME     $LSB_OUTPUTFILE:ar
    setenv CENTRALDIR  $LS_SUBCWD
    setenv TMPDIR      /ptmp/${user}
-endif
    setenv SRCDIR      /fs/image/home/${user}/SVN/DART/models/MITgcm_ocean
 
    set advance_command = 'mpirun.lsf ./mitgcmuv_20p'

