[Dart-dev] [3363] DART/trunk/models/MITgcm_ocean/shell_scripts/runme_filter: copied wrf runme_filter to use as a starting point for async=4

nancy at ucar.edu
Tue May 20 16:07:55 MDT 2008


Added: DART/trunk/models/MITgcm_ocean/shell_scripts/runme_filter
===================================================================
--- DART/trunk/models/MITgcm_ocean/shell_scripts/runme_filter	                        (rev 0)
+++ DART/trunk/models/MITgcm_ocean/shell_scripts/runme_filter	2008-05-20 22:07:55 UTC (rev 3363)
@@ -0,0 +1,312 @@
+#!/bin/tcsh 
+#
+# Data Assimilation Research Testbed -- DART
+# Copyright 2004-2006, Data Assimilation Research Section, 
+# University Corporation for Atmospheric Research
+# Licensed under the GPL -- www.gpl.org/licenses/gpl.html
+#
+# $Id: runme_filter 2799 2007-04-04 23:17:51Z nancy $
+#
+# a start at a generic run script for the mpi version.  this should probably
+# end up in the shell scripts directory - but it is here for now.  nsc.
+#=============================================================================
+# This block of directives constitutes the preamble for the LSF queuing system
+# LSF is used on the IBM   Linux cluster 'lightning'
+# LSF is used on the IMAGe Linux cluster 'coral'
+# LSF is used on the IBM   'bluevista'
+# The queues on lightning and bluevista are supposed to be similar.
+#
+# the normal way to submit to the queue is:    bsub < runme_filter
+#
+# an explanation of the most common directives follows:
+# -J Job name (master script job.csh presumes filter_server.xxxx.log)
+# -o STDOUT filename
+# -e STDERR filename
+# -P      account
+# -q queue    cheapest == [standby, economy, (regular,debug), premium] == $$$$
+# -n number of processors  (really)
+##=============================================================================
+#BSUB -J filter
+#BSUB -o filter.%J.log
+#BSUB -q regular
+#BSUB -n 16
+##BSUB -P nnnnnnnn
+#BSUB -W 1:00
+#
+#
+##=============================================================================
+## This block of directives constitutes the preamble for the PBS queuing system
+## PBS is used on the CGD   Linux cluster 'bangkok'
+## PBS is used on the CGD   Linux cluster 'calgary'
+## 
+## the normal way to submit to the queue is:    qsub runme_filter
+## 
+## an explanation of the most common directives follows:
+## -N     Job name
+## -r n   Declare job non-rerunable
+## -e <arg>  filename for standard error
+## -o <arg>  filename for standard out 
+## -q <arg>   Queue name (small, medium, long, verylong)
+## -l nodes=xx:ppn=2   requests BOTH processors on the node. On both bangkok 
+##                     and calgary, there is no way to 'share' the processors
+##                     on the node with another job, so you might as well use
+##                     them both.  (ppn == Processors Per Node)
+##=============================================================================
+#PBS -N filter
+#PBS -r n
+#PBS -e filter.err
+#PBS -o filter.log
+#PBS -q medium
+#PBS -l nodes=16:ppn=2
+
+
+# if async=2, each model advance runs as a single process (e.g. './wrf.exe',
+# or possibly 'mpirun -np 1 ./wrf.exe'), so each filter task advances one
+# ensemble member independently of the others.  leave this set to "false".
+#
+# if async=4, all the processors advance each wrf.exe in turn with
+# mpirun -np 64 wrf.exe (or whatever), once per ensemble member.
+# set this to "true".
+
+# if async=4, also check that the call to advance_model.csh
+# has the right number of ensemble members below; it must match
+# the input.nml number.
+
+set parallel_model = "false"
+
+set num_ens = 16
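+
+# optional sanity check (a sketch, not part of the original script): pull
+# ens_size out of input.nml and compare it with num_ens above.  this assumes
+# the namelist line looks like "ens_size = 16," -- adjust the sed pattern if
+# your input.nml is formatted differently.  uncomment to use.
+#set nml_ens = `grep -i 'ens_size' input.nml | head -1 | sed -e 's/.*=[ ]*//' -e 's/[, ].*//'`
+#if ( "$nml_ens" != "$num_ens" ) then
+#   echo "WARNING: num_ens ($num_ens) does not match ens_size in input.nml ($nml_ens)"
+#endif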
+
+# A common strategy at the start of these scripts is to check for the
+# existence of variables that the different queuing systems set.  That
+# tells us which queuing system we are running under, and lets us set
+# 'queue-independent' variables for the remainder of the script.
+
+if ($?LS_SUBCWD) then
+
+   # LSF has a list of processors already in a variable (LSB_HOSTS)
+   #  alias submit 'bsub < \!*'
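+   #
+   # for reference, a small sketch (not used below): LSB_HOSTS holds one
+   # host name per allocated processor, so the processor count is simply
+   # the number of words in it.  uncomment to use.
+   #set lsfhosts = ( $LSB_HOSTS )
+   #echo "LSF allocated $#lsfhosts processors: $lsfhosts"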
+
+   # each filter task advances the ensembles, each running on 1 proc.
+   if ( "$parallel_model" == "false" ) then
+   
+     mpirun.lsf ./filter
+   
+   else
+   
+   # filter runs in parallel until time to do a model advance,
+   # and then this script starts up the wrf.exe jobs, each one
+   # running in parallel.  then it runs wakeup_filter to wake
+   # up filter so it can continue.
+   
+     rm -f model_to_filter.lock filter_to_model.lock
+     mkfifo model_to_filter.lock filter_to_model.lock
+
+     set filterhome = ~/.filter
+     if ( ! -e $filterhome) mkdir $filterhome
+   
+     # this starts filter but also returns control back to
+     # this script immediately.
+      
+     (setenv HOME $filterhome; mpirun.lsf ./filter)  &
+       
+     while ( -e filter_to_model.lock )
+       
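+       # note on the idiom below: in csh, $< reads one line from standard
+       # input.  redirecting the fifo into the subshell makes that read
+       # block until filter writes a command into filter_to_model.lock,
+       # and the backquotes capture the line into $todo.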
+       set todo=`( echo $< ) < filter_to_model.lock`
+       echo todo received, value = ${todo}
+       
+       if ( "${todo}" == "finished" ) then
+         echo main script: filter done.
+         wait
+         break                                
+
+       else if ( "${todo}" == "advance" ) then
+         
+         # the second number below must match the number
+         # of ensembles.  and in input.nml, the advance model
+         # command must have -np N with N equal to the number
+         # of processors this job is using.
+   
+         echo calling model advance now:                
+         ./advance_model.csh 0 ${num_ens} filter_control00000  true
+         
+         echo restarting filter.     
+         mpirun.lsf  ./wakeup_filter
+       
+       else
+         
+         echo main script: unexpected value received.
+         break
+       
+       endif
+      
+     end
+      
+     echo filter finished, removing pipes.
+     rm -f model_to_filter.lock filter_to_model.lock
+   
+     if ( -d $filterhome) rmdir $filterhome
+   endif
+   
+
+else if ($?PBS_O_WORKDIR) then
+
+   # PBS has a list of processors in a file whose name is (PBS_NODEFILE)
+   #  alias submit 'qsub \!*'
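+   #
+   # for reference, a small sketch (not used below): the node file has one
+   # line per allocated processor, so the processor count can be found with
+   # a line count.  uncomment to use.
+   #set NPROCS = `wc -l < $PBS_NODEFILE`
+   #echo "PBS allocated $NPROCS processors"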
+
+   # each filter task advances the ensembles, each running on 1 proc.
+   if ( "$parallel_model" == "false" ) then
+   
+     mpirun ./filter
+   
+   else
+   
+   # filter runs in parallel until time to do a model advance,
+   # and then this script starts up the wrf.exe jobs, each one
+   # running in parallel.  then it runs wakeup_filter to wake
+   # up filter so it can continue.
+   
+     rm -f model_to_filter.lock filter_to_model.lock
+     mkfifo model_to_filter.lock filter_to_model.lock
+
+     set filterhome = ~/.filter
+     if ( ! -e $filterhome) mkdir $filterhome
+   
+     # this starts filter but also returns control back to
+     # this script immediately.
+      
+     (setenv HOME $filterhome; mpirun ./filter)  &
+       
+     while ( -e filter_to_model.lock )
+       
+       set todo=`( echo $< ) < filter_to_model.lock`
+       echo todo received, value = ${todo}
+       
+       if ( "${todo}" == "finished" ) then
+         echo main script: filter done.
+         wait
+         break                                
+
+       else if ( "${todo}" == "advance" ) then
+         
+         # the second number below must match the number
+         # of ensembles.  and in input.nml, the advance model
+         # command must have -np N with N equal to the number
+         # of processors this job is using.
+   
+         echo calling model advance now:                
+         ./advance_model.csh 0 ${num_ens} filter_control00000  true
+         
+         echo restarting filter.     
+         mpirun  ./wakeup_filter
+       
+       else
+         
+         echo main script: unexpected value received.
+         break
+       
+       endif
+      
+     end
+      
+     echo filter finished, removing pipes.
+     rm -f model_to_filter.lock filter_to_model.lock
+   
+     if ( -d $filterhome) rmdir $filterhome
+   endif
+   
+else if ($?OCOTILLO_MPINODES) then
+
+   # If you have a linux cluster with no queuing software, use this
+   # section.  The list of computational nodes is given to the mpirun
+   # command and it assigns them as they appear in the file.  In some
+   # cases it seems to be necessary to wrap the command in a small
+   # script that changes to the current directory before running.
+   # (ocotillo is a local ncar cluster, and also a type of desert tree)
+
+   echo "running on ocotillo"
+
+   # before running this script, do this once.  the syntax is
+   # node name : how many tasks you can run on it
+   #setenv OCOTILLO_MPINODES  ~/nodelist
+   #echo "node7:2" >! $OCOTILLO_MPINODES
+   #echo "node5:2" >> $OCOTILLO_MPINODES
+   #echo "node3:2" >> $OCOTILLO_MPINODES
+   #echo "node1:2" >> $OCOTILLO_MPINODES
+
+   setenv NUM_PROCS 8
+   echo "running with $NUM_PROCS nodes specified from $OCOTILLO_MPINODES"
+
+   # each filter task advances the ensembles, each running on 1 proc.
+   if ( "$parallel_model" == "false" ) then
+
+      mpirun -np $NUM_PROCS -nolocal -machinefile $OCOTILLO_MPINODES ./filter
+
+   else
+
+   # filter runs in parallel until time to do a model advance,
+   # and then this script starts up the wrf.exe jobs, each one
+   # running in parallel.  then it runs wakeup_filter to wake
+   # up filter so it can continue.
+   
+     rm -f model_to_filter.lock filter_to_model.lock
+     mkfifo model_to_filter.lock filter_to_model.lock
+
+     set filterhome = ~/.filter
+     if ( ! -e $filterhome) mkdir $filterhome
+   
+     # this starts filter but also returns control back to
+     # this script immediately.
+      
+     (setenv HOME $filterhome; \
+      mpirun -np $NUM_PROCS -nolocal -machinefile $OCOTILLO_MPINODES ./filter ) &
+       
+     while ( -e filter_to_model.lock )
+       
+       set todo=`( echo $< ) < filter_to_model.lock`
+       echo todo received, value = ${todo}
+       
+       if ( "${todo}" == "finished" ) then
+         echo main script: filter done.
+         wait
+         break                                
+
+       else if ( "${todo}" == "advance" ) then
+         
+         # the second number below must match the number
+         # of ensembles.  and in input.nml, the advance model
+         # command must have -np N with N equal to the number
+         # of processors this job is using.
+   
+         echo calling model advance now:                
+         ./advance_model.csh 0 ${num_ens} filter_control00000  true
+         
+         echo restarting filter.     
+         mpirun -np $NUM_PROCS -nolocal -machinefile $OCOTILLO_MPINODES ./wakeup_filter
+       
+       else
+         
+         echo main script: unexpected value received.
+         break
+       
+       endif
+      
+     end
+      
+     echo filter finished, removing pipes.
+     rm -f model_to_filter.lock filter_to_model.lock
+   
+     if ( -d $filterhome) rmdir $filterhome
+   endif
+   
+else
+
+   # interactive - assume you are using 'lam-mpi' and that you have
+   # already run 'lamboot' once to start the lam server, or that you
+   # are running with a machine that has mpich installed.
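+   #
+   # for example (a sketch; 'lamhosts' is a hypothetical file with one host
+   # name per line), the lam daemons could be started once beforehand with:
+   #   lamboot -v ~/lamhosts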
+
+   echo "running interactively"
+   mpirun -np 4 ./filter
+
+endif
+


Property changes on: DART/trunk/models/MITgcm_ocean/shell_scripts/runme_filter
___________________________________________________________________
Name: svn:executable
   + *

