[Dart-dev] [4345] DART/trunk/models/POP: The README has notes for how to configure/ start/continue an experiment

nancy at ucar.edu nancy at ucar.edu
Thu Apr 8 15:30:16 MDT 2010


Revision: 4345
Author:   thoar
Date:     2010-04-08 15:30:16 -0600 (Thu, 08 Apr 2010)
Log Message:
-----------
The README has notes for how to configure/start/continue an experiment
using the paired CAM DATM fields for POP.

The assimilate.csh script preserves different node configurations we
had tried for running DART on a different number of nodes than POP
(even though CCSM is controlling both) - and this version records
the initial attempt at distributing the pop_to_dart and dart_to_pop 
portions. Collectively, these were taking about 10 minutes for a
48member ensemble. Hopefully, using all the CPUs on a node will reduce
that ...

Modified Paths:
--------------
    DART/trunk/models/POP/README
    DART/trunk/models/POP/shell_scripts/assimilate.csh

-------------- next part --------------
Modified: DART/trunk/models/POP/README
===================================================================
--- DART/trunk/models/POP/README	2010-04-08 21:26:39 UTC (rev 4344)
+++ DART/trunk/models/POP/README	2010-04-08 21:30:16 UTC (rev 4345)
@@ -62,3 +62,194 @@
 The init_ts_file entry is completely ignored. A pointer file with the name
 "pop_pointer.restart" contains the name of the restart file.
 
+
+#----------------------------------------------------------------------
+# Fri Mar 19 12:56:05 MDT 2010 -- Running CCSM/POP on bluefire
+#----------------------------------------------------------------------
+
+The initial POP ensemble are binary restarts - DART only works with 
+netCDF restarts, so we cannot assimilate this first advance/job.
+Depending on what we want to compare to, there may be an additional
+'one-off' advance day where there is no assimilation. (Jan 3rd)
+But the THIRD execution is always an assimilation ... resulting in an 
+assimilated Jan 4th, 1998.
+
+#
+# create the case name for the experiment. Steve's convention is
+#
+
+"c"	is ocean only
+"cam23"	explains the forcing
+".2"	was some experiment number
+
+set CASENAME = c.cam48.timing1
+set TEMPLATECASE = /gpfs/proj2/fis/cgd/oce/yeager/home/ccsm_runs/ccsm4.0_IE/c.cam48.pio
+set TEMPLATECASE = /blhome/thoar/CCSM_POP/c.cam48.inf
+
+mkdir ~thoar/CCSM_POP/${CASENAME}
+
+# Copy an existing experiment (CASE) that worked 
+# and clean out all the bits that don't relate
+
+cp -r ${TEMPLATECASE}/* ${CASENAME}
+
+cd ${CASENAME}
+./configure -cleanall
+
+rm -rf poe* logs/* timing/* MachinesHist/*
+
+# replace (almost) all instances of "yeager"  and the old CASE, case_name
+# (I have been using steve's env_run.xml:CCSMROOT)
+# ./env_case.xml:<entry id="CCSMROOT"   value="/blhome/yeager/ccsm4_0_beta21_iepio"  />
+# These are the entries I change:
+
+./CaseDocs/drv_in:  username      = 'yeager '
+./CaseDocs/drv_in:  case_name     = 'c.cam48.pio '
+
+./env_case.xml:<entry id="CCSMUSER" value="yeager"  />
+./env_case.xml:<entry id="CASE"     value="c.cam48.pio"  />
+./env_case.xml:<entry id="CASEROOT" value="/fis01/cgd/oce/yeager/home/ccsm_runs/ccsm4.0_IE/c.cam48.pio"  />
+
+# For an initial run ... 
+# the CCSM restart files are for 1998 Jan 1 00Z. 
+# With env_run.xml:STOP_N==3 the first restart is available at the 
+# end of Jan 3rd. The first POP advance is from binary restarts - 
+# DART only works with netCDF restarts, so we cannot assimilate
+# this first advance/job.
+#
+# hand-edit the env_run.xml file so it has the following:
+<entry id="CONTINUE_RUN"    value="FALSE"  />    
+<entry id="POST_DATA_ASSIM" value="FALSE"  />    
+<entry id="RESUBMIT"        value="0"  />    
+<entry id="STOP_N"          value="3"  />    
+
+# After all the xml files have been configured - populate Buildconf with 
+# valid namelists and run scripts and streams. There has to be a stream
+# file for each ensemble member (hence make_cplhist_streams.csh)
+
+./configure -case
+
+cd Buildconf
+
+cp ${TEMPLATECASE}/Buildconf/*ninst*.txt .
+cp ${TEMPLATECASE}/Buildconf/make_cplhist_streams.csh .
+./make_cplhist_streams.csh
+
+# This was my sanity check ... xxdiff lives on the DASG cluster 
+# this is a non-sequiter of sorts ...
+# every non-comment difference was required
+
+foreach FILE ( *buildnml* )
+      xxdiff $FILE ${TEMPLATECASE}/Buildconf/$FILE
+end
+
+# build the stuff
+
+cd ..
+./${CASENAME}.bluefire.build
+
+# modify the run script to include the assimilate.csh script after
+# CSM EXECUTION HAS FINISHED and before
+# FOR POSTPROCESSING
+
+hand edit $CASE.bluefire.run   to include assimilate.csh
+hand edit assimilate.csh to reference YOUR DART instance
+hand edit the DART input.nml to reflect your assimilation experiment
+
+#
+# STAGE the restart files for the N oceans
+#
+# Steve's repository of ocean restart files is :
+# /gpfs/proj2/ccsm/ocn/DART/gx1v6_restarts/c.b12.001
+# This copies the first 23 pointer files (adjacent years, btw) 
+# into the execution directory. The restart files are still 
+# in the restarts directory
+cd /ptmp/thoar/${CASENAME}/run
+
+# ln -s /gpfs/proj2/ccsm/ocn/DART/gx1v6_restarts/c.b12.001 restarts
+
+# From Steve - Wed Apr  7 12:19:03 MDT 2010
+#
+# We've been using successive january's from an old hindcast (c.b12.001)
+# to start up the DART runs:
+# /ccsm/ocn/DART/gx1v6_restarts/c.b12.001/
+#
+# The first 48 of these restart files (years 0002 to 0049) have the 
+# following ensemble average and rms:
+# /ccsm/ocn/DART/gx1v6_restarts/c.b12.001/ensavg.nc
+# /ccsm/ocn/DART/gx1v6_restarts/c.b12.001/ensrms.nc
+#
+# I'm putting things in a new directory which will have assorted restarts 
+# from assorted recent CCSM4 hindcasts which differ in their sea ice 
+# treatment and salinity restoring strength:
+# /ccsm/ocn/DART/gx1v6_restarts/core2/
+# I'm only putting in restarts at least 10 years apart.  The avg & rms
+# of the 48-member ensemble sitting there now is
+# /ccsm/ocn/DART/gx1v6_restarts/core2/ensavg.nc
+# /ccsm/ocn/DART/gx1v6_restarts/core2/ensrms.nc
+#
+# The spread is bigger in the latter and I think we should use something like this in our next production run.
+
+ln -s /gpfs/proj2/ccsm/ocn/DART/gx1v6_restarts/core2 restarts
+
+# this was for the 23-member ensemble
+cp restarts/rpointer.ocn.?.* .
+cp restarts/rpointer.ocn.1?.* .
+cp restarts/rpointer.ocn.2[0-3].* .
+
+# this was for the 48-member ensemble
+cp restarts/rpointer.ocn.?.* .
+cp restarts/rpointer.ocn.[123]?.* .
+cp restarts/rpointer.ocn.4[0-8].* .
+
+# submit the job for the first N days
+
+cd ~thoar/CCSM_POP/${CASENAME}
+bsub < ${CASENAME}.bluefire.run
+
+#----------------------------------------------------------------------
+# For resubmissions
+#----------------------------------------------------------------------
+
+If CONTINUE_RUN == TRUE ... it will look  for the .rpointer files to
+know what dates, etc.	
+
+Modify the env_run.xml bits as follows:
+
+< <entry id="CONTINUE_RUN"    value="FALSE"  />    
+> <entry id="CONTINUE_RUN"    value="TRUE"  />    
+
+< <entry id="POST_DATA_ASSIM" value="FALSE"  />    
+> <entry id="POST_DATA_ASSIM" value="TRUE"  />    
+
+< <entry id="RESUBMIT"        value="0"  />    
+> <entry id="RESUBMIT"        value="10"  />    
+
+< <entry id="STOP_N"          value="2"  />    
+> <entry id="STOP_N"          value="1"  />  
+
+Change the input.nml to reflect the 'start from restart' selections.
+
+
+#----------------------------------------------------------------------
+# To remove all traces of a failed experiment and restart 
+# Steve Yeager: Tue Mar 23 11:41:26 MDT 2010:
+#----------------------------------------------------------------------
+
+To rerun, I would do the following:
+
+1) clean up everything and start from scratch:
+	rm -rf /ptmp/thoar/${CASENAME}/*
+	rm -rf /ptmp/thoar/archive/${CASENAME}
+	msrm -R -wpwd THOAR /THOAR/csm/${CASENAME}
+
+2) rebuild
+	./${CASENAME}.bluefire.build
+
+3) pre-position restarts again (see notes)
+
+4) change something(?)
+
+5) resubmit:
+	bsub < ${CASENAME}.bluefire.run
+

Modified: DART/trunk/models/POP/shell_scripts/assimilate.csh
===================================================================
--- DART/trunk/models/POP/shell_scripts/assimilate.csh	2010-04-08 21:26:39 UTC (rev 4344)
+++ DART/trunk/models/POP/shell_scripts/assimilate.csh	2010-04-08 21:30:16 UTC (rev 4345)
@@ -61,7 +61,9 @@
 
 foreach FILE ( input.nml filter pop_to_dart dart_to_pop )
 
-   if ( -e    ../${FILE} ) then
+   if ( -e ${CASEROOT}/${FILE} ) then
+      ${COPY}   ${CASEROOT}/${FILE} .
+   else if ( -e    ../${FILE} ) then
       ${COPY} ../${FILE} .
    else if ( -e ${DARTDIR}/${FILE} ) then
       ${COPY}   ${DARTDIR}/${FILE} .
@@ -73,7 +75,7 @@
 end
 
 #-------------------------------------------------------------------------
-# INFLATION COPY BLOCK
+# DART INFLATION BLOCK
 # This file is only relevant if 'inflation' is turned on -
 # i.e. if inf_flavor(1) /= 0 - AND we are in a 'restart' mode.
 #
@@ -131,17 +133,33 @@
 set member = 1
 while ( $member <= $ensemble_size )
 
-   set DART_IC_FILE = `printf filter_ics.%04d $member`
-   set OCN_RESTART_FILENAME = `head -1 ../rpointer.ocn.$member.restart`
-   ${LINK} ../$OCN_RESTART_FILENAME pop.r.nc
-   ${LINK} ../pop2_in.$member       pop_in
+   # Each member will do its job in its own directory.
+   # That way, we can do N of them simultaneously -
+   # they all read their OWN 'input.nml' ... the output
+   # filenames must inserted into the appropriate input.nml
 
-   ./pop_to_dart || exit 1
+   set MYTEMPDIR = member_${member}
+   mkdir -p $MYTEMPDIR
+   cd $MYTEMPDIR
 
-   ${MOVE} dart.ud $DART_IC_FILE
+   set OCN_RESTART_FILENAME = `head -1 ../../rpointer.ocn.$member.restart`
+   ${LINK} ../../$OCN_RESTART_FILENAME pop.r.nc
+   ${LINK} ../../pop2_in.$member       pop_in
+
+   # the slash in the filename screws up 'sed' ... unless
+   set DART_IC_FILE = `printf ..\\/filter_ics.%04d $member`
+
+   sed -e "s/dart.ud/${DART_IC_FILE}/" < ../input.nml >! input.nml
+
+   ../pop_to_dart &
+
+   cd ..
+
    @ member++
 end
 
+wait
+
 #-------------------------------------------------------------------------
 # Block 2: Actually run the assimilation.
 # Will result in a set of files : 'filter_restart.xxxx'
@@ -175,12 +193,34 @@
 setenv LSF_BINDIR /contrib/lsf/tgmpatch
 setenv PATH ${LSF_BINDIR}:${PATH}
 setenv ORG_TASK_GEOMETRY "${LSB_PJL_TASK_GEOMETRY}"
-setenv NANCY_GEOMETRY_126_6NODES \
-	"{(0,29,30,31,1,32,33,34,2,35,36,37,3,38,39,40,4,41,42,43,5)(44,45,46,6,47,48,49,7,50,51,52,8,53,54,55,9,56,57,58,10,59)(60,61,11,62,63,64,12,65,66,67,13,68,69,70,14,71,72,73,15,74,75)(76,16,77,78,79,17,80,81,82,18,83,84,85,19,86,87,88,20,89,90,91)(21,92,93,94,22,95,96,97,23,98,99,100,24,101,102,103,25,104,105,106,26)(107,108,109,27,110,111,112,28,113,114,115,116,117,118,119,120,121,122,123,124,125)}"
+
+# layout 1: rr by node
+setenv NANCY_GEOMETRY_126_2NODES_RR \
+	"{(0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98,100,102,104,106,108,110,112,114,116,118,120,122,124)(1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63,65,67,69,71,73,75,77,79,81,83,85,87,89,91,93,95,97,99,101,103,105,107,109,111,113,115,117,119,121,123,125)}";
+	
+# layout 2: flat
+setenv NANCY_GEOMETRY_126_2NODES_FL \
+	"{(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62)(63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125)}";
+
+# layout 3: rr sub block by stride
+setenv NANCY_GEOMETRY_126_2NODES_STR \
+	"{(0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62)(1,65,3,67,5,69,7,71,9,73,11,75,13,77,15,79,17,81,19,83,21,85,23,87,25,89,27,91,29,93,31,95,33,97,35,99,37,101,39,103,41,105,43,107,45,109,47,111,49,113,51,115,53,117,55,119,57,121,59,123,61,125,63)}";
+
 setenv NANCY_GEOMETRY_126_3NODES \
 	"{(0,29,30,31,1,32,33,34,2,35,36,37,3,38,39,40,4,41,42,43,5,44,45,46,6,47,48,49,7,50,51,52,8,53,54,55,9,56,57,58,10,59)(60,61,11,62,63,64,12,65,66,67,13,68,69,70,14,71,72,73,15,74,75,76,16,77,78,79,17,80,81,82,18,83,84,85,19,86,87,88,20,89,90,91)(21,92,93,94,22,95,96,97,23,98,99,100,24,101,102,103,25,104,105,106,26,107,108,109,27,110,111,112,28,113,114,115,116,117,118,119,120,121,122,123,124,125)}"
-setenv LSB_PJL_TASK_GEOMETRY "${NANCY_GEOMETRY_126_6NODES}"
 
+setenv NANCY_GEOMETRY_126_6NODES \
+	"{(0,29,30,31,1,32,33,34,2,35,36,37,3,38,39,40,4,41,42,43,5)(44,45,46,6,47,48,49,7,50,51,52,8,53,54,55,9,56,57,58,10,59)(60,61,11,62,63,64,12,65,66,67,13,68,69,70,14,71,72,73,15,74,75)(76,16,77,78,79,17,80,81,82,18,83,84,85,19,86,87,88,20,89,90,91)(21,92,93,94,22,95,96,97,23,98,99,100,24,101,102,103,25,104,105,106,26)(107,108,109,27,110,111,112,28,113,114,115,116,117,118,119,120,121,122,123,124,125)}"
+
+setenv NANCY_GEOMETRY_126_7NODES \
+         "{(0,7,14,21,28,35,42,49,56,63,70,77,84,91,98,105,112,119)(1,8,15,22,29,36,43,50,57,64,71,78,85,92,99,106,113,120)(2,9,16,23,30,37,44,51,58,65,72,79,86,93,100,107,114,121)(3,10,17,24,31,38,45,52,59,66,73,80,87,94,101,108,115,122)(4,11,18,25,32,39,46,53,60,67,74,81,88,95,102,109,116,123)(5,12,19,26,33,40,47,54,61,68,75,82,89,96,103,110,117,124)(6,13,20,27,34,41,48,55,62,69,76,83,90,97,104,111,118,125)}"
+
+# layout: flat
+setenv NANCY_GEOMETRY_54_1NODE \
+	"{(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53)}";
+
+setenv LSB_PJL_TASK_GEOMETRY "${NANCY_GEOMETRY_54_1NODE}"
+
 which mpirun.lsf
 
 mpirun.lsf ./filter || exit 2
@@ -215,7 +255,7 @@
 setenv LSB_PJL_TASK_GEOMETRY "${ORG_TASK_GEOMETRY}"
 
 #-------------------------------------------------------------------------
-# Block 3: Update the POP restart files ... sequentially (sigh) ...
+# Block 3: Update the POP restart files ... simultaneously ...
 #
 # DART namelist settings required:
 # &filter_nml:           restart_out_file_name  = 'filter_restart'
@@ -227,18 +267,33 @@
 set member = 1
 while ( $member <= $ensemble_size )
 
+   # Each member will do its job in its own directory.
+   # That way, we can do N of them simultaneously -
+   # they all read their OWN 'input.nml' ... the output
+   # filenames must inserted into the appropriate input.nml
+
+   set MYTEMPDIR = member_${member}
+   mkdir -p $MYTEMPDIR
+   cd $MYTEMPDIR
+
    set DART_RESTART_FILE = `printf filter_restart.%04d $member`
-   set OCN_RESTART_FILENAME = `head -1 ../rpointer.ocn.$member.restart`
-   ${LINK} ../$OCN_RESTART_FILENAME pop.r.nc
-   ${LINK} ../pop2_in.$member       pop_in
+   ${LINK} ../$DART_RESTART_FILE dart.ic
 
-   ${LINK} $DART_RESTART_FILE dart.ic
+   set OCN_RESTART_FILENAME = `head -1 ../../rpointer.ocn.$member.restart`
+   ${LINK} ../../$OCN_RESTART_FILENAME pop.r.nc
+   ${LINK} ../../pop2_in.$member       pop_in
 
-   ./dart_to_pop || exit 3
+   cp -f ../input.nml .
 
+   ../dart_to_pop &
+
+   cd ..
+
    @ member++
 end
 
+wait
+
 #-------------------------------------------------------------------------
 # Cleanup
 #-------------------------------------------------------------------------


More information about the Dart-dev mailing list