[Dart-dev] [7768] DART/trunk/models/tiegcm/shell_scripts: These scripts are identical to r7761 on the tiegcm branch.

nancy at ucar.edu
Wed Mar 25 17:28:03 MDT 2015


Revision: 7768
Author:   thoar
Date:     2015-03-25 17:28:03 -0600 (Wed, 25 Mar 2015)
Log Message:
-----------
These scripts are identical to r7761 on the tiegcm branch.
There is a subversion problem that will not let me merge the branch
back onto the trunk, so the log files for the revisions on the branch
are only available on the branch.

In summary - these scripts enforce assumptions about filenames and
namelist settings through the use of 'sed' on input.nml and tiegcm.nml.
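
The technique is sed's 'change' command: any line matching a pattern is
replaced with a known-good setting, and a fresh namelist is written from
a pristine copy each time. A minimal sketch in csh, using two of the
settings actually enforced in run_filter.csh below:

    sed -e "/ async /c\ async = 2" \
        -e "/ adv_ens_command /c\ adv_ens_command = './advance_model.csh'" \
        input.nml.original >! input.nml || exit 2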

Modified Paths:
--------------
    DART/trunk/models/tiegcm/shell_scripts/run_filter.csh
    DART/trunk/models/tiegcm/shell_scripts/run_filter_async4.csh
    DART/trunk/models/tiegcm/shell_scripts/run_perfect_model_obs.csh

-------------- next part --------------
Modified: DART/trunk/models/tiegcm/shell_scripts/run_filter.csh
===================================================================
--- DART/trunk/models/tiegcm/shell_scripts/run_filter.csh	2015-03-25 23:23:34 UTC (rev 7767)
+++ DART/trunk/models/tiegcm/shell_scripts/run_filter.csh	2015-03-25 23:28:03 UTC (rev 7768)
@@ -25,19 +25,31 @@
 # Once the 'table is set', all that remains is to start/submit the 
 # 'runme_filter' script. That script will spawn 'filter' as a 
 # parallel job on the appropriate nodes; each of these tasks will 
-# call a separate model_advance.csh when necessary.
+# call a separate advance_model.csh when necessary.
 #
 # The central directory is where the scripts reside and where script and 
 # program I/O are expected to happen.
+#
+# PLEASE READ THE FOLLOWING: 
+#    Setting the number of tasks and choosing the right ptile requires work.
+# The number of tasks (-n) can be as big as the ensemble size for
+# a single-threaded tiegcm (i.e. async == 2) so that all ensemble members can
+# run simultaneously. The setting of ptile specifies the number of tasks on each
+# node, which usually depends on the model resolution and the resulting memory
+# use of each ensemble member. Think of ptile as the number of ensemble members
+# you can run on one node without running out of the shared memory on that node.
+#    If you specify more tasks than ensemble members, there are tasks that have
+# nothing to do during the model advance. If the model advance step takes longer
+# than the MPI timeout on your machine, you may need to disable the MPI timeout.
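+#    For example, the directives below request 80 tasks with ptile=16,
+# so LSF will allocate 5 nodes (80/16) and place 16 tasks on each node.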
 #-----------------------------------------------------------------------------
 #
-#BSUB -J filter
-#BSUB -o filter.%J.log
+#BSUB -J tiegcm_filter
+#BSUB -o tiegcm_filter.%J.log
 #BSUB -P P3507xxxx
-#BSUB -q economy
-#BSUB -n 64
-#BSUB -R "span[ptile=64]"
-#BSUB -W 3:00
+#BSUB -q regular
+#BSUB -n 80
+#BSUB -R "span[ptile=16]"
+#BSUB -W 1:00
 #BSUB -N -u ${USER}@ucar.edu
 
 #----------------------------------------------------------------------
@@ -53,7 +65,12 @@
    setenv JOBID       $LSB_JOBID
    setenv MYQUEUE     $LSB_QUEUE
    setenv MYHOST      $LSB_SUB_HOST
+   setenv MPI_RUN_CMD mpirun.lsf
 
+   # MP_DEBUG_NOTIMEOUT may alleviate MPI timeouts that can occur under
+   # certain task geometries. It is NOT a good idea to use it in general. 
+   # setenv MP_DEBUG_NOTIMEOUT yes
+
 else
 
    #-------------------------------------------------------------------
@@ -61,10 +78,11 @@
    #-------------------------------------------------------------------
 
    setenv ORIGINALDIR `pwd`
-   setenv JOBNAME     tiegcm
+   setenv JOBNAME     tiegcm_filter
    setenv JOBID       $$
    setenv MYQUEUE     Interactive
-   setenv MYHOST      $host
+   setenv MYHOST      $HOST
+   setenv MPI_RUN_CMD ''
 
 endif
 
@@ -84,12 +102,11 @@
 # Make a unique, (empty, clean) temporary directory.
 #----------------------------------------------------------------------
 
-setenv TMPDIR /glade/scratch/${user}/DART/${JOBNAME}/job_${JOBID}
+setenv CENTRALDIR /glade/scratch/${user}/DART/${JOBNAME}/job_${JOBID}
 
-mkdir -p ${TMPDIR}
-cd ${TMPDIR}
+mkdir -p ${CENTRALDIR}
+cd ${CENTRALDIR}
 
-set CENTRALDIR = `pwd`
 set myname = $0          # this is the name of this script
 
 # some systems don't like the -v option to any of the following 
@@ -100,16 +117,19 @@
       setenv REMOVE 'rm -rf'
       setenv   COPY 'cp -p'
       setenv   MOVE 'mv -f'
+      setenv   LINK 'ln -s'
       breaksw
    case AIX:
       setenv REMOVE 'rm -rf'
       setenv   COPY 'cp -p'
       setenv   MOVE 'mv -f'
+      setenv   LINK 'ln -s'
       breaksw
    default:
       setenv REMOVE 'rm -rvf'
       setenv   COPY 'cp -v'
       setenv   MOVE 'mv -fv'
+      setenv   LINK 'ln -s'
       breaksw
 endsw
 
@@ -117,118 +137,176 @@
 
 #-----------------------------------------------------------------------------
 # Set variables containing various directory names where we will GET things
+# DARTDIR      The location of the DART tiegcm model directory
+# TIEGCMDIR    The location of the TIEGCM executable
+# ENSEMBLEDIR  The location of the initial ensemble of TIEGCM files
+# EXPERIMENT   The (safe) location for the results of this run.
 #-----------------------------------------------------------------------------
 
-set    DARTDIR = /glade/u/home/tmatsuo/DART/models/tiegcm
-set  TIEGCMDIR = /glade/u/home/tmatsuo/DART/models/tiegcm/tiegcm_files
-set EXPERIMENT = /glade/scratch/tmatsuo/2002_03_28_tiegcm
+set     DARTDIR = /glade/u/home/${USER}/DART/tiegcm/models/tiegcm
+set   TIEGCMDIR = /glade/u/home/${USER}/tiegcm/src
+set ENSEMBLEDIR = /glade/p/work/alexc/startup_files/initial
+set  EXPERIMENT = /glade/p/work/${USER}/${JOBNAME}
 
 #-----------------------------------------------------------------------------
 # Get the DART executables, scripts, and input files
+# Get the tiegcm executable, control files, and data files.
+# The tiegcm initial conditions are in the next block.
 #-----------------------------------------------------------------------------
 
-# executables
+${COPY} ${DARTDIR}/work/filter                       . || exit 1
+${COPY} ${DARTDIR}/work/dart_to_model                . || exit 1
+${COPY} ${DARTDIR}/work/model_to_dart                . || exit 1
+${COPY} ${DARTDIR}/work/input.nml   input.nml.original || exit 1
+${COPY} ${DARTDIR}/shell_scripts/advance_model.csh   . || exit 1
+${COPY} ${EXPERIMENT}/observation/obs_seq.out        . || exit 1
 
- ${COPY} ${DARTDIR}/work/filter                     .
- ${COPY} ${DARTDIR}/work/dart_to_model              .
- ${COPY} ${DARTDIR}/work/model_to_dart              .
+${COPY}  ${TIEGCMDIR}/tiegcm-nompi              tiegcm || exit 1
 
-# shell scripts
- ${COPY} ${DARTDIR}/shell_scripts/advance_model.csh .
-
-# data files
- ${COPY} ${EXPERIMENT}/initial/obs_seq.out          .
- ${COPY} ${DARTDIR}/work/input.nml                  .
-
 #-----------------------------------------------------------------------------
-# Get the tiegcm executable, control files, and data files.
-#-----------------------------------------------------------------------------
-
- ${COPY} ${TIEGCMDIR}/tiegcm-nompi                  tiegcm
-#${COPY} ${TIEGCMDIR}/tiegcm                        .
-
-#-----------------------------------------------------------------------------
-# Get the tiegcm input state ... for this experiment, we generated the ensemble by: 
-#
-# ${COPY} ${TIEGCMDIR}/tiegcm_s.nc                   .
-# ${COPY} ${TIEGCMDIR}/tiegcm_restart_p.nc           .
-# ./model_to_dart || exit 1
-# mv temp_ud filter_ics
-#
-# REQUIREMENT for the case where we have an initial ensemble:
-# input.nml:filter_nml:start_from_restart = .TRUE.
-# input.nml:filter_nml:restart_in_file    = 'filter_ics'
-#-----------------------------------------------------------------------------
 # Put all of the DART initial conditions files and all of the TIEGCM files
 # in the CENTRALDIR - preserving the ensemble member ID for each filename.
 # The advance_model.csh script will copy the appropriate files for each 
 # ensemble member into the model advance directory.
 # These files may be linked to CENTRALDIR since they get copied to the
 # model advance directory. 
+#
+# REQUIREMENTS for input.nml:
+# model_nml            : tiegcm_restart_file_name   = 'tiegcm_restart_p.nc'
+# model_nml            : tiegcm_secondary_file_name = 'tiegcm_s.nc'
+# model_nml            : tiegcm_namelist_file_name  = 'tiegcm.nml'
+# model_to_dart_nml    : file_out                   = 'dart_ics'
 #-----------------------------------------------------------------------------
+# ensemble_manager_nml : single_restart_file_in     = .false.
+# filter_nml           : async                      = 2
+# filter_nml           : adv_ens_command            = './advance_model.csh'
+# filter_nml           : start_from_restart         = .TRUE.
+# filter_nml           : restart_in_file_name       = 'filter_ics'
+# filter_nml           : restart_out_file_name      = 'filter_restart'
+#-----------------------------------------------------------------------------
+# dart_to_model_nml    : file_in                    = 'dart_restart'
+# dart_to_model_nml    : file_namelist_out          = 'namelist_update'
 
-set ENSEMBLESTRING = `/usr/local/bin/grep -A 42 filter_nml input.nml | grep ens_size`
+sed -e "/ tiegcm_restart_file_name /c\ tiegcm_restart_file_name = 'tiegcm_restart_p.nc'" \
+    -e "/ tiegcm_secondary_file_name /c\ tiegcm_secondary_file_name = 'tiegcm_s.nc'" \
+    -e "/ tiegcm_namelist_file_name /c\ tiegcm_namelist_file_name = 'tiegcm.nml'" \
+    -e "/ file_out /c\ file_out = 'dart_ics'" \
+    -e "/ single_restart_file_in /c\ single_restart_file_in = .FALSE." \
+    -e "/ async /c\ async = 2" \
+    -e "/ adv_ens_command /c\ adv_ens_command = './advance_model.csh'" \
+    -e "/ start_from_restart /c\ start_from_restart = .TRUE." \
+    -e "/ restart_in_file_name /c\ restart_in_file_name = 'filter_ics'" \
+    -e "/ restart_out_file_name /c\ restart_out_file_name = 'filter_restart'" \
+    -e "/ file_in /c\ file_in = 'dart_restart'" \
+    -e "/ file_namelist_out /c\ file_namelist_out = 'namelist_update'" \
+    input.nml.original >! input.nml  || exit 2
+
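+# The grep returns the entire 'ens_size' line from input.nml
+# (e.g. "ens_size = 20," - the value is illustrative), so the third
+# whitespace-separated token, minus the trailing comma, is the ensemble size.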
+set ENSEMBLESTRING = `grep -A 42 filter_nml input.nml | grep ens_size`
 set NUM_ENS = `echo $ENSEMBLESTRING[3] | sed -e "s#,##"`
 
-@ i = 1
-while ( $i <= $NUM_ENS )
+@ instance = 1
+while ( $instance <= $NUM_ENS )
 
-  set darticname  = `printf "filter_ics.%04d"          $i`
-  set tiesecond   = `printf "tiegcm_s.nc.%04d"         $i`
-  set tierestart  = `printf "tiegcm_restart_p.nc.%04d" $i`
-  set tieinp      = `printf "tiegcm.nml.%04d"          $i`
+  set darticname  = `printf "filter_ics.%04d"          $instance`
+  set tiesecond   = `printf "tiegcm_s.nc.%04d"         $instance`
+  set tierestart  = `printf "tiegcm_restart_p.nc.%04d" $instance`
+  set tieinp      = `printf "tiegcm.nml.%04d"          $instance`
 
-  ln -sf ${EXPERIMENT}/initial/$darticname .
-  ln -sf ${EXPERIMENT}/initial/$tiesecond  .
-  ln -sf ${EXPERIMENT}/initial/$tierestart .
-  ln -sf ${EXPERIMENT}/initial/$tieinp     tiegcm.nml.original 
+  ${COPY} ${ENSEMBLEDIR}/$tiesecond  .                   || exit 2
+  ${COPY} ${ENSEMBLEDIR}/$tierestart .                   || exit 2
+  ${COPY} ${ENSEMBLEDIR}/$tieinp     tiegcm.nml.original || exit 2
 
-  sed -e 's/;.*//' -e '/^$/ d' tiegcm.nml.original >! $tieinp
+  # Ensure that the tiegcm.nml for all the ensemble members is identical
+  # in all the ways that matter. This will result in a minimum of changes
+  # in the advance_model.csh script. This script REQUIRES that there is a  
+  # SINGLE tiegcm_restart_p.nc. Just keep appending all the timesteps to
+  # the same file. If you need to subset the large file, use the NCO
+  # operators. For example:   ncks -d time,20,30 tiegcm_restart_p.nc bob.nc
+  # If you need more than 300 timesteps in the file, increase it here.
+  
+  sed -e 's/;.*//' -e '/^$/ d' \
+      -e "/ MXHIST_PRIM /c\ MXHIST_PRIM = 300" \
+      -e "/ MXHIST_SECH /c\ MXHIST_SECH = 300" \
+      -e "/ SOURCE /c\ SOURCE = 'tiegcm_restart_p.nc'" \
+      -e "/ OUTPUT /c\ OUTPUT = 'tiegcm_restart_p.nc'" \
+      -e "/ SECOUT /c\ SECOUT = 'tiegcm_s.nc'"         \
+      tiegcm.nml.original >! $tieinp  || exit 2
 
-  @ i += 1
+  # If an existing ensemble of filter_ics.#### exists, use it.
+  # If not, generate one. Be aware - even if they exist, they may
+  # not have the same variable set as your current input.nml
+  # If that is the case, you will have to generate your own set anyway.
+  # If you get an error from aread_state_restart(), this is likely the case.
+
+  if (  -e  ${ENSEMBLEDIR}/initial/$darticname.GENERATE ) then
+     ${REMOVE} $darticname
+     ${LINK} ${ENSEMBLEDIR}/initial/$darticname . || exit 2
+  else
+     # We must convert a tiegcm_restart_p.nc file to a dart_ics file
+     # for each ensemble member, so we temporarily create links to
+     # the static filenames expected by model_to_dart.
+
+     ${REMOVE} tiegcm_restart_p.nc tiegcm_s.nc tiegcm.nml 
+
+     ${LINK} $tierestart tiegcm_restart_p.nc   || exit 2
+     ${LINK} $tiesecond  tiegcm_s.nc           || exit 2
+     ${LINK} $tieinp     tiegcm.nml            || exit 2
+
+     ./model_to_dart || exit 2
+
+     if (-e dart_ics ) then
+        ${MOVE} dart_ics $darticname
+     else
+        echo "ERROR: File conversion from $tierestart to $darticname failed."
+        echo "ERROR: File conversion from $tierestart to $darticname failed."
+        echo "ERROR: File conversion from $tierestart to $darticname failed."
+        exit 2
+     endif
+  endif
+
+  @ instance++
 end
 
 #-----------------------------------------------------------------------------
 # Run filter ... 
 #-----------------------------------------------------------------------------
 
-ln -sf tiegcm_restart_p.nc.0001 tiegcm_restart_p.nc
-ln -sf tiegcm_s.nc.0001         tiegcm_s.nc
-ln -sf tiegcm.nml.0001          tiegcm.nml
+${REMOVE} tiegcm_restart_p.nc tiegcm_s.nc tiegcm.nml 
 
-mpirun.lsf ./filter || exit 2
+${LINK} tiegcm_restart_p.nc.0001 tiegcm_restart_p.nc   || exit 3
+${LINK} tiegcm_s.nc.0001         tiegcm_s.nc           || exit 3
+${LINK} tiegcm.nml.0001          tiegcm.nml            || exit 3
 
-echo "${JOBNAME} ($JOBID) finished at "`date`
+${MPI_RUN_CMD} ./filter || exit 3
 
 #-----------------------------------------------------------------------------
-# Move the output to storage after filter completes.
-# At this point, all the DART restart,diagnostic files are in the CENTRALDIR
-# and need to be moved to the 'experiment permanent' directory.
+# At this point, all the restart,diagnostic files are in the run/CENTRALDIR.
+# You may want to move them to someplace more 'permanent'.
 #
 # TJH: At this point, the output files have pretty 'generic' names.
-# The files should be archived with the assimilation date in their name.
+# The files could be archived with the assimilation date in their name.
 #-----------------------------------------------------------------------------
 
-exit
+# ${COPY} tiegcm.nml                 ${EXPERIMENT}/tiegcm
+# ${MOVE} tiegcm_s.nc*               ${EXPERIMENT}/tiegcm
+# ${MOVE} tiegcm_restart_p.nc*       ${EXPERIMENT}/tiegcm
+# ${MOVE} tiegcm_out_*               ${EXPERIMENT}/tiegcm
 
-${MOVE} tiegcm_s.nc*               ${experiment}/tiegcm
-${MOVE} tiegcm_restart_p.nc*       ${experiment}/tiegcm
-${MOVE} tiegcm_out_*               ${experiment}/tiegcm
+# ${MOVE} Posterior_Diag.nc          ${EXPERIMENT}/DART
+# ${MOVE} Prior_Diag.nc              ${EXPERIMENT}/DART
+# ${MOVE} obs_seq.final              ${EXPERIMENT}/DART
+# ${MOVE} dart_log.out               ${EXPERIMENT}/DART
 
-${MOVE} filter_restart*            ${experiment}/DART
-${MOVE} assim_model_state_ud[1-9]* ${experiment}/DART
-${MOVE} assim_model_state_ic[1-9]* ${experiment}/DART
-${MOVE} Posterior_Diag.nc          ${experiment}/DART
-${MOVE} Prior_Diag.nc              ${experiment}/DART
-${MOVE} obs_seq.final              ${experiment}/DART
-${MOVE} dart_log.out               ${experiment}/DART
-
 # Good style dictates that you save the scripts so you can see what worked.
 
-${COPY} input.nml                  ${experiment}/DART
-${COPY} *.csh                      ${experiment}/DART
-${COPY} $myname                    ${experiment}/DART
+# ${COPY} input.nml                  ${EXPERIMENT}/DART
+# ${COPY} *.csh                      ${EXPERIMENT}/DART
+# ${COPY} $myname                    ${EXPERIMENT}/DART
 
+echo "${JOBNAME} ($JOBID) finished at "`date`
+echo "These are the files in the run directory at completion:"
+ls -lrt
+
 exit 0
 
 # <next few lines under version control, do not edit>

Modified: DART/trunk/models/tiegcm/shell_scripts/run_filter_async4.csh
===================================================================
--- DART/trunk/models/tiegcm/shell_scripts/run_filter_async4.csh	2015-03-25 23:23:34 UTC (rev 7767)
+++ DART/trunk/models/tiegcm/shell_scripts/run_filter_async4.csh	2015-03-25 23:28:03 UTC (rev 7768)
@@ -6,23 +6,23 @@
 #
 # DART $Id$
 #
-#=============================================================================
-# This block of directives constitutes the preamble for the LSF queuing system
-# LSF is used on the IBM   Linux cluster 'lightning'
-# LSF is used on the IMAGe Linux cluster 'coral'
-# LSF is used on the IBM   'bluevista'
-# The queues on lightning and bluevista are supposed to be similar.
-#
-# the normal way to submit to the queue is:    bsub < run_filter.csh
-#
-# an explanation of the most common directives follows:
-# -J Job name (master script job.csh presumes filter_server.xxxx.log)
-# -o STDOUT filename
-# -e STDERR filename
-# -P      account
-# -q queue    cheapest == [standby, economy, (regular,debug), premium] == $$$$
-# -n number of processors  (really)
-##=============================================================================
+##------------------------------------------------------------------------------
+## This block of directives constitutes the preamble for the LSF queuing system
+## LSF is used on the IBM   Linux cluster 'lightning'
+## LSF is used on the IMAGe Linux cluster 'coral'
+## LSF is used on the IBM   'bluevista'
+## The queues on lightning and bluevista are supposed to be similar.
+##
+## the normal way to submit to the queue is:    bsub < run_filter.csh
+##
+## an explanation of the most common directives follows:
+## -J Job name (master script job.csh presumes filter_server.xxxx.log)
+## -o STDOUT filename
+## -e STDERR filename
+## -P      account
+## -q queue    cheapest == [standby, economy, (regular,debug), premium] == $$$$
+## -n number of processors  (really)
+##
 #BSUB -J filter
 #BSUB -o filter.%J.log
 #BSUB -P P3507xxxx
@@ -31,7 +31,7 @@
 #BSUB -W 0:30
 #BSUB -N -u ${USER}@ucar.edu
 #
-##=============================================================================
+##------------------------------------------------------------------------------
 ## This block of directives constitutes the preamble for the PBS queuing system
 ## PBS is used on the CGD Linux cluster 'bangkok'
 ## PBS is used on the CGD Linux cluster 'calgary'
@@ -48,7 +48,7 @@
 ##                     and calgary, there is no way to 'share' the processors
 ##                     on the node with another job, so you might as well use
 ##                     them both. (ppn == Processors Per Node)
-##=============================================================================
+##
 #PBS -N filter
 #PBS -r n
 #PBS -e filter.err
@@ -56,20 +56,8 @@
 #PBS -q dedicated
 #PBS -l nodes=10:ppn=2
 
-# if async=2, e.g. you are going to run './modelxxx', single process
-# (or possibly 'mpirun -np 1 ./modelxxx'), so each processor advances
-# one ensemble independently of the others, leave this as false.
-#
-# if async=4, e.g. all the processors advance each modelxxx in turn with
-# mpirun -np 64 modelxxx (or whatever) for as many ensembles as you have,
-# set this to "true"
+#===============================================================================
 
-# if async=4, also check that the call to advance_model.csh
-# has the right number of ensemble members below; it must match
-# the input.nml number.
-
-set parallel_model = "true"
-
 # Determine the number of ensemble members from input.nml,
 # it may exist in more than one place.
 # Parse out the filter_nml string and see which 
@@ -84,6 +72,17 @@
 set ENSEMBLESTRING = `grep -A 42 filter_nml input.nml | grep ens_size`
 set NUM_ENS = `echo $ENSEMBLESTRING[3] | sed -e "s#,##"`
 
+# FIXME ... read the async value from input.nml and set parallel_model accordingly.
+# if async=2, each filter task runs './modelxxx' as a single process
+# (or possibly 'mpirun -np 1 ./modelxxx'), so each processor advances
+# one ensemble member independently of the others; leave this as false.
+#
+# if async=4, all the processors advance each modelxxx in turn with
+# mpirun -np 64 modelxxx (or whatever) for as many ensembles as you have;
+# set this to "true"
+
+set parallel_model = "true"
+
 # A common strategy for the beginning is to check for the existence of
 # some variables that get set by the different queuing mechanisms.
 # This way, we know which queuing mechanism we are working with,
@@ -95,136 +94,15 @@
     # LSF has a list of processors already in a variable (LSB_HOSTS)
     # alias submit 'bsub < \!*'
     echo "LSF - using mpirun.lsf for execution"
+    setenv MPICMD mpirun.lsf
 
-    # each filter task advances the ensembles, each running on 1 proc.
-    if ( "$parallel_model" == "false" ) then
-
-       mpirun.lsf ./filter
-
-    else
-
-    # filter runs in parallel until time to do a model advance,
-    # and then this script starts up the modelxxx jobs, each one
-    # running in parallel. then it runs wakeup_filter to wake
-    # up filter so it can continue.
-
-      \rm -f model_to_filter.lock filter_to_model.lock
-      mkfifo model_to_filter.lock filter_to_model.lock
-
-      set filterhome = ~/.filter$$
-      if ( ! -e $filterhome) mkdir $filterhome
-
-      # this starts filter but also returns control back to
-      # this script immediately.
-
-      (setenv HOME $filterhome; mpirun.lsf ./filter) &
-
-      while ( -e filter_to_model.lock )
-
-        set todo=`cat < filter_to_model.lock`
-        echo "todo received, value = ${todo}"
-
-        if ( "${todo}" == "finished" ) then
-          echo "main script: filter done."
-          wait
-          break
-
-        else if ( "${todo}" == "advance" ) then
-
-          # the second number below must match the number
-          # of ensembles. Also, in input.nml, the advance model
-          # command must have -np N with N equal to the number
-          # of processors this job is using.
-
-          echo "calling model advance now:"
-          ./advance_model.csh 0 ${NUM_ENS} filter_control00000 || exit 9
-
-          echo "restarting filter."
-          mpirun.lsf ./wakeup_filter
-
-        else
-
-          echo "main script: unexpected value received."
-          break
-
-        endif
-
-      end
-
-      echo "filter finished, removing pipes."
-      \rm -f model_to_filter.lock filter_to_model.lock
-
-      if ( -d $filterhome) rmdir $filterhome
-    endif
-
-
 else if ($?PBS_O_WORKDIR) then
 
     # PBS has a list of processors in a file whose name is (PBS_NODEFILE)
     # alias submit 'qsub \!*'
     echo "PBS - using mpirun for execution"
+    setenv MPICMD mpirun
 
-    # each filter task advances the ensembles, each running on 1 proc.
-    if ( "$parallel_model" == "false" ) then
-
-      mpirun ./filter
-
-    else
-
-    # filter runs in parallel until time to do a model advance,
-    # and then this script starts up the modelxxx jobs, each one
-    # running in parallel. then it runs wakeup_filter to wake
-    # up filter so it can continue.
-
-      \rm -f model_to_filter.lock filter_to_model.lock
-      mkfifo model_to_filter.lock filter_to_model.lock
-
-      set filterhome = ~/.filter
-      if ( ! -e $filterhome) mkdir $filterhome
-
-      # this starts filter but also returns control back to
-      # this script immediately.
-
-      (setenv HOME $filterhome; mpirun ./filter) &
-
-      while ( -e filter_to_model.lock )
-
-        set todo=`cat < filter_to_model.lock`
-        echo "todo received, value = ${todo}"
-
-        if ( "${todo}" == "finished" ) then
-          echo "main script: filter done."
-          wait
-          break
-
-        else if ( "${todo}" == "advance" ) then
-
-          # the second number below must match the number
-          # of ensembles. Also, in input.nml, the advance model
-          # command must have -np N with N equal to the number
-          # of processors this job is using.
-
-          echo "calling model advance now:"
-          ./advance_model.csh 0 ${NUM_ENS} filter_control00000 || exit 9
-
-          echo "restarting filter."
-          mpirun ./wakeup_filter
-
-        else
-
-          echo "main script: unexpected value received."
-          break
-
-        endif
-
-      end
-
-      echo "filter finished, removing pipes."
-      \rm -f model_to_filter.lock filter_to_model.lock
-
-      if ( -d $filterhome) rmdir $filterhome
-    endif
-
 else
 
     # If you have a linux cluster with no queuing software, use this
@@ -254,59 +132,69 @@
 
     echo "MPICMD = ${MPICMD}"
 
-    # filter runs in parallel until time to do a model advance,
-    # and then this script starts up the modelxxx jobs, each one
-    # running in parallel. then it runs wakeup_filter to wake
-    # up filter so it can continue.
+endif
 
-    \rm -f model_to_filter.lock filter_to_model.lock
-    mkfifo model_to_filter.lock filter_to_model.lock
+#-------------------------------------------------------------------------------
+# Everything below this separator should not need to be modified if everything
+# above the separator is set correctly.
+#-------------------------------------------------------------------------------
 
-    set filterhome = ~/.filter$$
-    if ( ! -e $filterhome) mkdir $filterhome
+if ( "$parallel_model" == "false" ) then
 
-    # this starts filter but also returns control back to
-    # this script immediately.
+   # each filter task advances the ensembles, each running on 1 proc.
 
-    (setenv HOME $filterhome; ${MPICMD} ./filter) &
+   ${MPICMD} ./filter
 
-    while ( -e filter_to_model.lock )
+else
 
-        set todo=`cat < filter_to_model.lock`
-        echo "todo received, value = ${todo}"
+   # filter runs in parallel until time to do a model advance,
+   # and then this script starts up the modelxxx jobs, each one
+   # running in parallel. then it runs wakeup_filter to wake
+   # up filter so it can continue. The communication happens through
+   # 'named pipes' created by the mkfifo command.
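+   # The protocol: filter writes 'advance' or 'finished' into
+   # filter_to_model.lock; this script reads that word, advances the
+   # ensemble when asked, then runs wakeup_filter so filter can resume.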
 
-        if ( "${todo}" == "finished" ) then
-          echo "main script: filter done."
-          wait
-          break
+   \rm -f model_to_filter.lock filter_to_model.lock
+   mkfifo model_to_filter.lock filter_to_model.lock
 
-        else if ( "${todo}" == "advance" ) then
+   set filterhome = ~/.filter$$
+   if ( ! -e $filterhome) mkdir $filterhome
 
-          # the second number below must match the number
-          # of ensembles. Also, in input.nml, the advance model
-          # command must have -np N with N equal to the number
-          # of processors this job is using.
+   # start filter and immediately return control back to this script
 
-          echo "calling model advance now:"
-          ./advance_model.csh 0 ${NUM_ENS} filter_control00000 || exit 9
+   (setenv HOME $filterhome; ${MPICMD} ./filter) &
 
-          echo "restarting filter."
-          ${MPICMD} ./wakeup_filter
+   while ( -e filter_to_model.lock )
 
-        else
+      set todo=`cat < filter_to_model.lock`
+      echo "todo received, value = ${todo}"
 
-          echo "main script: unexpected value received."
-          break
+      if ( "${todo}" == "finished" ) then
+         echo "main script: filter done."
+         wait
+         break
 
-        endif
+      else if ( "${todo}" == "advance" ) then
 
-    end
+         # FIXME : in input.nml, the advance model command must
+         # have -np N with N equal to the number of processors this job is using.
 
-    echo "filter finished, removing pipes."
-    \rm -f model_to_filter.lock filter_to_model.lock
+         echo "calling model advance now:"
+         ./advance_model.csh 0 ${NUM_ENS} filter_control00000 || exit 9
 
-    if ( -d $filterhome) rmdir $filterhome
+         echo "restarting filter."
+         ${MPICMD} ./wakeup_filter
 
+      else
+         echo "main script: unexpected value received."
+         break
+      endif
+   end
+
+   echo "filter finished, removing pipes."
+   \rm -f model_to_filter.lock filter_to_model.lock
+
+   if ( -d $filterhome) \rmdir $filterhome
+
 endif
 
 exit 0

Modified: DART/trunk/models/tiegcm/shell_scripts/run_perfect_model_obs.csh
===================================================================
--- DART/trunk/models/tiegcm/shell_scripts/run_perfect_model_obs.csh	2015-03-25 23:23:34 UTC (rev 7767)
+++ DART/trunk/models/tiegcm/shell_scripts/run_perfect_model_obs.csh	2015-03-25 23:28:03 UTC (rev 7768)
@@ -25,14 +25,14 @@
 # Once the 'table is set', all that remains is to start/submit the 
 # 'runme_filter' script. That script will spawn 'filter' as a 
 # parallel job on the appropriate nodes; each of these tasks will 
-# call a separate model_advance.csh when necessary.
+# call a separate advance_model.csh when necessary.
 #
 # The central directory is where the scripts reside and where script and 
 # program I/O are expected to happen.
 #-----------------------------------------------------------------------------
 #
-#BSUB -J perfect
-#BSUB -o perfect.%J.log
+#BSUB -J tiegcm_perfect
+#BSUB -o tiegcm_perfect.%J.log
 #BSUB -P P3507xxxx
 #BSUB -q economy
 #BSUB -n 1
@@ -60,10 +60,10 @@
    #-------------------------------------------------------------------
 
    setenv ORIGINALDIR `pwd`
-   setenv JOBNAME     tiegcm
+   setenv JOBNAME     tiegcm_perfect
    setenv JOBID       $$
    setenv MYQUEUE     Interactive
-   setenv MYHOST      $host
+   setenv MYHOST      $HOST
 
 endif
 
@@ -83,12 +83,11 @@
 # Make a unique, (empty, clean) temporary directory.
 #----------------------------------------------------------------------
 
-setenv TMPDIR /glade/scratch/${user}/DART/${JOBNAME}/job_${JOBID}
+setenv CENTRALDIR /glade/scratch/${user}/DART/${JOBNAME}/job_${JOBID}
 
-mkdir -p ${TMPDIR}
-cd ${TMPDIR}
+mkdir -p ${CENTRALDIR}
+cd ${CENTRALDIR}
 
-set CENTRALDIR = `pwd`
 set myname = $0          # this is the name of this script
 
 # some systems don't like the -v option to any of the following 
@@ -99,16 +98,19 @@
       setenv REMOVE 'rm -rf'
       setenv   COPY 'cp -p'
       setenv   MOVE 'mv -f'
+      setenv   LINK 'ln -s'
       breaksw
    case AIX:
       setenv REMOVE 'rm -rf'
       setenv   COPY 'cp -p'
       setenv   MOVE 'mv -f'
+      setenv   LINK 'ln -s'
       breaksw
    default:
       setenv REMOVE 'rm -rvf'
       setenv   COPY 'cp -v'
       setenv   MOVE 'mv -fv'
+      setenv   LINK 'ln -s'
       breaksw
 endsw
 
@@ -116,99 +118,131 @@
 
 #-----------------------------------------------------------------------------
 # Set variables containing various directory names where we will GET things
+# DARTDIR      The location of the DART tiegcm model directory
+# TIEGCMDIR    The location of the TIEGCM executable
+# ENSEMBLEDIR  The location of the initial ensemble of TIEGCM files
+# EXPERIMENT   The (safe) location for the results of this run.
 #-----------------------------------------------------------------------------
 
-set    DARTDIR = /glade/u/home/tmatsuo/DART/models/tiegcm
-set  TIEGCMDIR = /glade/u/home/tmatsuo/DART/models/tiegcm/tiegcm_files
-set EXPERIMENT = /glade/scratch/tmatsuo/2002_03_28_tiegcm
+set     DARTDIR = /glade/u/home/${USER}/DART/tiegcm/models/tiegcm
+set   TIEGCMDIR = /glade/u/home/${USER}/tiegcm/src
+set ENSEMBLEDIR = /glade/p/work/alexc/startup_files/initial
+set  EXPERIMENT = /glade/p/work/${USER}/${JOBNAME}
 
 #-----------------------------------------------------------------------------
 # Get the DART executables, scripts, and input files
+# Get the tiegcm executable, control files, and data files.
+# The tiegcm initial conditions are in the next block.
 #-----------------------------------------------------------------------------
 
-# executables
+${COPY} ${DARTDIR}/work/perfect_model_obs            . || exit 1
+${COPY} ${DARTDIR}/work/dart_to_model                . || exit 1
+${COPY} ${DARTDIR}/work/model_to_dart                . || exit 1
+${COPY} ${DARTDIR}/work/input.nml   input.nml.original || exit 1
+${COPY} ${DARTDIR}/shell_scripts/advance_model.csh   . || exit 1
+${COPY} ${DARTDIR}/work/obs_seq.in                   . || exit 1
 
- ${COPY} ${DARTDIR}/work/perfect_model_obs          .
- ${COPY} ${DARTDIR}/work/dart_to_model              .
- ${COPY} ${DARTDIR}/work/model_to_dart              .
+${COPY} ${TIEGCMDIR}/tiegcm-nompi             tiegcm || exit 1
 
-# shell scripts
- ${COPY} ${DARTDIR}/shell_scripts/advance_model.csh .
+${COPY} ${ENSEMBLEDIR}/tiegcm_restart_p.nc           . || exit 1
+${COPY} ${ENSEMBLEDIR}/tiegcm_s.nc                   . || exit 1
+${COPY} ${ENSEMBLEDIR}/tiegcm.nml  tiegcm.nml.original || exit 1
 
-# data files
- ${COPY} ${EXPERIMENT}/initial/obs_seq.in           .
- ${COPY} ${DARTDIR}/work/input.nml                  .
-
 #-----------------------------------------------------------------------------
-# Get the tiegcm executable, control files, and data files.
+# Remove all the comments that follow the (;) symbol from the tiegcm.nml
+# namelist file. That is non-standard syntax for Fortran namelists.
+#
+# Ensure that the tiegcm.nml for all the ensemble members is identical
+# in all the ways that matter. This will result in a minimum of changes
+# in the advance_model.csh script. This script REQUIRES that there is a  
+# SINGLE tiegcm_restart_p.nc. Just keep appending all the timesteps to
+# the same file. If you need to subset the large file, use the NCO
+# operators. For example:   ncks -d time,20,30 tiegcm_restart_p.nc bob.nc
+# If you need more than 300 timesteps in the file, increase it here.
 #-----------------------------------------------------------------------------
 
- ${COPY} ${TIEGCMDIR}/tiegcm-nompi                  tiegcm
-#${COPY} ${TIEGCMDIR}/tiegcm                        .
+sed -e 's/;.*//' -e '/^$/ d' \
+    -e "/ MXHIST_PRIM /c\ MXHIST_PRIM = 300" \
+    -e "/ MXHIST_SECH /c\ MXHIST_SECH = 300" \
+    -e "/ SOURCE /c\ SOURCE = 'tiegcm_restart_p.nc'" \
+    -e "/ OUTPUT /c\ OUTPUT = 'tiegcm_restart_p.nc'" \
+    -e "/ SECOUT /c\ SECOUT = 'tiegcm_s.nc'"         \
+    tiegcm.nml.original >! tiegcm.nml  || exit 2
 
- ${COPY} ${EXPERIMENT}/initial/tiegcm_restart_p.nc  .
- ${COPY} ${EXPERIMENT}/initial/tiegcm_s.nc          .
- ${COPY} ${EXPERIMENT}/initial/tiegcm.nml           tiegcm.nml.original
-
 #-----------------------------------------------------------------------------
-# Remove all the comments that follow (;) symbol from tiegcm.nml namelist file
+# Convert a TIEGCM file 'tiegcm_restart_p.nc' to a DART ics file 'dart_ics'
+# There are some requirements for this script and advance_model.csh.
+# The requirements for this script are enforced here; the requirements for
+# advance_model.csh are enforced there.
+# 
+# REQUIREMENTS for input.nml:
+# model_nml            : tiegcm_restart_file_name   = 'tiegcm_restart_p.nc'
+# model_nml            : tiegcm_secondary_file_name = 'tiegcm_s.nc'
+# model_nml            : tiegcm_namelist_file_name  = 'tiegcm.nml'
+# model_to_dart_nml    : file_out                   = 'dart_ics'
 #-----------------------------------------------------------------------------
+# perfect_model_obs_nml: async                      = 2
+# perfect_model_obs_nml: adv_ens_command            = 'advance_model.csh'
+# perfect_model_obs_nml: start_from_restart         = .TRUE.
+# perfect_model_obs_nml: restart_in_file_name       = 'dart_ics'
+#-----------------------------------------------------------------------------
+# dart_to_model_nml    : file_in                    = 'dart_restart'
+# dart_to_model_nml    : file_namelist_out          = 'namelist_update'
 
-grep -v "^;" tiegcm.nml.original >! tiegcm.nml
+sed -e "/ tiegcm_restart_file_name /c\ tiegcm_restart_file_name = 'tiegcm_restart_p.nc'" \
+    -e "/ tiegcm_secondary_file_name /c\ tiegcm_secondary_file_name = 'tiegcm_s.nc'" \
+    -e "/ tiegcm_namelist_file_name /c\ tiegcm_namelist_file_name = 'tiegcm.nml'" \
+    -e "/ file_out /c\ file_out = 'dart_ics'" \
+    -e "/ async /c\ async = 2" \
+    -e "/ adv_ens_command /c\ adv_ens_command = './advance_model.csh'" \
+    -e "/ start_from_restart /c\ start_from_restart = .TRUE." \
+    -e "/ restart_in_file_name /c\ restart_in_file_name = 'dart_ics'" \
+    -e "/ file_in /c\ file_in = 'dart_restart'" \
+    -e "/ file_namelist_out /c\ file_namelist_out = 'namelist_update'" \
+    input.nml.original >! input.nml  || exit -3
 
-#-----------------------------------------------------------------------------
-# Check that everything moved OK, and the table is set.
-# Convert a TIEGCM file 'tiegcm_restart.nc' to a DART ics file 'perfect_ics'
-# 'model_to_dart' has a hardwired output filename of 'temp_ud' ...
-#-----------------------------------------------------------------------------
+./model_to_dart || exit 2
 
-./model_to_dart || exit 1
-mv temp_ud perfect_ics
-
 #-----------------------------------------------------------------------------
 # Run perfect_model_obs ... harvest the observations to populate obs_seq.out
 # model_mod expects a generic name // advance_model.csh expects a filename
 # with the ensemble member ID tacked on - must provide both.
 #-----------------------------------------------------------------------------
 
-ln -sf tiegcm_restart_p.nc tiegcm_restart_p.nc.0001
-ln -sf tiegcm_s.nc         tiegcm_s.nc.0001
-ln -sf tiegcm.nml          tiegcm.nml.0001
+${REMOVE} tiegcm_restart_p.nc.0001 tiegcm_s.nc.0001 tiegcm.nml.0001 
 
-./perfect_model_obs || exit 2
+${LINK} tiegcm_restart_p.nc tiegcm_restart_p.nc.0001 || exit 3
+${LINK} tiegcm_s.nc         tiegcm_s.nc.0001         || exit 3
+${LINK} tiegcm.nml          tiegcm.nml.0001          || exit 3
 
-echo "${JOBNAME} ($JOBID) finished at "`date`
 
+./perfect_model_obs || exit 3
+
 #-----------------------------------------------------------------------------
-# Move the output to storage after filter completes.
-# At this point, all the restart,diagnostic files are in the CENTRALDIR
-# and need to be moved to the 'experiment permanent' directory.
-# We have had problems with some, but not all, files being moved
-# correctly, so we are adding bulletproofing to check to ensure the filesystem
-# has completed writing the files, etc. Sometimes we get here before
-# all the files have finished being written.
+# At this point, all the restart,diagnostic files are in the run/CENTRALDIR.
+# You may want to move them to someplace more 'permanent'.
+#
+# TJH: At this point, the output files have pretty 'generic' names.
+# The files could be archived with the assimilation date in their name.
 #-----------------------------------------------------------------------------
 
-echo "Listing contents of CENTRALDIR before archiving"
-ls -l
+# ${MOVE} tiegcm_s.nc.0001           ${EXPERIMENT}/perfect/tiegcm_s.nc
+# ${MOVE} tiegcm_restart_p.nc.0001   ${EXPERIMENT}/perfect/tiegcm_restart_p.nc
+# ${MOVE} tiegcm.nml                 ${EXPERIMENT}/perfect
+# ${MOVE} obs_seq.out                ${EXPERIMENT}/perfect
+# ${MOVE} True_State.nc              ${EXPERIMENT}/perfect
 
-exit
+# ${MOVE} tiegcm_out_1               ${EXPERIMENT}/perfect/tiegcm_out
+# ${MOVE} dart_log.out               ${EXPERIMENT}/perfect
+# ${MOVE} dart_log.nml               ${EXPERIMENT}/perfect
+# Good style dictates that you save the scripts so you can see what worked.
 
-${MOVE} tiegcm_s.nc.0001           ${EXPERIMENT}/perfect/tiegcm_s.nc
-${MOVE} tiegcm_restart_p.nc.0001   ${EXPERIMENT}/perfect/tiegcm_restart_p.nc
-${MOVE} tiegcm.nml                 ${EXPERIMENT}/perfect
-${MOVE} obs_seq.out                ${EXPERIMENT}/perfect
-${MOVE} True_State.nc              ${EXPERIMENT}/perfect
-${MOVE} perfect_restart            ${EXPERIMENT}/perfect
+# ${COPY} input.nml                  ${EXPERIMENT}/DART
+# ${COPY} *.csh                      ${EXPERIMENT}/DART
+# ${COPY} $myname                    ${EXPERIMENT}/DART
 
-${MOVE} tiegcm_out_1               ${EXPERIMENT}/perfect/tiegcm_out
-${MOVE} dart_log.out               ${EXPERIMENT}/perfect
-${MOVE} dart_log.nml               ${EXPERIMENT}/perfect
-# Good style dictates that you save the scripts so you can see what worked.
-${COPY} input.nml                  ${experiment}/DART
-${COPY} *.csh                      ${experiment}/DART
-${COPY} $myname                    ${experiment}/DART
-
+echo "${JOBNAME} ($JOBID) finished at "`date`
+echo "These are the files in the run directory at completion:"
 ls -lrt
 
 exit 0

