[Dart-dev] [4777] DART/trunk/mpi_utilities: Mostly added debugging messages to help diagnose problems with

nancy at ucar.edu nancy at ucar.edu
Fri Mar 4 13:26:17 MST 2011


Revision: 4777
Author:   nancy
Date:     2011-03-04 13:26:16 -0700 (Fri, 04 Mar 2011)
Log Message:
-----------
Mostly added debugging messages to help diagnose problems with
the model advances in async 2 and 4.  Also added some code that
made it easier to run on the cray system here; and has the code
needed to synchronize through a regular file instead of a pipe.
This last part has not been heavily tested and so for now i've
left it marked in the docs as not working yet (it needs a slightly
different script, which i'll check in an example of in the L96 dir).

Modified Paths:
--------------
    DART/trunk/mpi_utilities/mpi_utilities_mod.f90
    DART/trunk/mpi_utilities/mpi_utilities_mod.html
    DART/trunk/mpi_utilities/mpi_utilities_mod.nml
    DART/trunk/mpi_utilities/null_mpi_utilities_mod.f90

-------------- next part --------------
Modified: DART/trunk/mpi_utilities/mpi_utilities_mod.f90
===================================================================
--- DART/trunk/mpi_utilities/mpi_utilities_mod.f90	2011-03-04 17:07:03 UTC (rev 4776)
+++ DART/trunk/mpi_utilities/mpi_utilities_mod.f90	2011-03-04 20:26:16 UTC (rev 4777)
@@ -250,14 +250,16 @@
 ! per task, if this is set to true.  however, for debugging if you need
 ! messages from tasks which aren't 0, this will elicit them.  error messages
 ! from any task will print regardless of this setting.
-logical :: all_tasks_print      = .false.   ! by default only messages from 0 print
+logical :: all_tasks_print      = .false.   ! by default only msgs from 0 print
 
 ! NAMELIST: change the following from .false. to .true. to enable
 ! the reading of this namelist.  This is the only place you need
 ! to make this change.
 logical :: use_namelist = .false.
 
-namelist /mpi_utilities_nml/ reverse_task_layout, verbose, all_tasks_print
+namelist /mpi_utilities_nml/ reverse_task_layout, all_tasks_print, &
+                             verbose, async2_verbose, async4_verbose, &
+                             shell_name, separate_node_sync, create_local_comm
 
 contains
 
@@ -576,10 +578,11 @@
 
 !-----------------------------------------------------------------------------
 
-subroutine send_to(dest_id, srcarray, time)
+subroutine send_to(dest_id, srcarray, time, label)
  integer, intent(in) :: dest_id
  real(r8), intent(in) :: srcarray(:)
  type(time_type), intent(in), optional :: time
+ character(len=*), intent(in), optional :: label
 
 ! Send the srcarray to the destination id.
 ! If time is specified, it is also sent in a separate communications call.  
@@ -605,7 +608,11 @@
    call error_handler(E_ERR,'send_to', errstring, source, revision, revdate)
 endif
 
-if (verbose) write(*,*) "PE", myrank, ": send_to itemsize ", size(srcarray), " dest ", dest_id
+if (present(label)) then
+   write(*,*) trim(label)//" PE", myrank, ": send_to itemsize ", size(srcarray), " dest ", dest_id
+else if (verbose) then
+   write(*,*) "PE", myrank, ": send_to itemsize ", size(srcarray), " dest ", dest_id
+endif
 
 ! use my task id as the tag; unused at this point.
 tag = myrank
@@ -647,10 +654,11 @@
 
 !-----------------------------------------------------------------------------
 
-subroutine receive_from(src_id, destarray, time)
+subroutine receive_from(src_id, destarray, time, label)
  integer, intent(in) :: src_id
  real(r8), intent(inout) :: destarray(:)
  type(time_type), intent(out), optional :: time
+ character(len=*), intent(in), optional :: label
 
 ! Receive data into the destination array from the src task.
 ! If time is specified, it is received in a separate communications call.  
@@ -677,7 +685,11 @@
    call error_handler(E_ERR,'receive_from', errstring, source, revision, revdate)
 endif
 
-if (verbose) write(*,*) "PE", myrank, ": receive_from itemsize ", size(destarray), " src ", src_id
+if (present(label)) then
+   write(*,*) trim(label)//" PE", myrank, ": receive_from itemsize ", size(destarray), " src ", src_id
+else if (verbose) then
+   write(*,*) "PE", myrank, ": receive_from itemsize ", size(destarray), " src ", src_id
+endif
 
 ! send_to uses its own id as the tag.
 tag = src_id
@@ -1097,7 +1109,7 @@
 ! the MPI job which spreads out on all the PEs for this job
 ! and writes into the file from the correct PE.
 
-character(len = 32) :: fifo_name, filter_to_model, model_to_filter
+character(len = 32) :: fifo_name, filter_to_model, model_to_filter, non_pipe
 integer :: rc
 
 if ( .not. module_initialized ) then
@@ -1109,6 +1121,7 @@
 ! mpi wrapper code, callable from programs other than filter.)
 filter_to_model = 'filter_to_model.lock'
 model_to_filter = 'model_to_filter.lock'
+non_pipe = 'filter_to_model.file'
 
 ! the i5.5 format below will not handle task counts larger than this.
 if (total_tasks > 99999) then
@@ -1116,14 +1129,39 @@
    call error_handler(E_ERR,'block_task', errstring, source, revision, revdate)
 endif
 
-if (verbose) write(*,*) 'putting to sleep task id ', myrank
+! make it so we only have to test 1 or 2 things here instead of 3
+! when deciding whether to print status messages.
+if (verbose) async4_verbose = .TRUE.
 
-if (myrank == head_task) then
-   if (print4status .or. verbose) write(*,*) 'MPI job telling script to advance model'
+if ((myrank == head_task) .and. separate_node_sync) then
+
+   if (async4_verbose) then
+      write(*,*)  'checking master task host'
+      rc = system('echo master task running on host `hostname`'//' '//char(0))
+      if (rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
+   endif
+
+   if (async4_verbose .or. print4status) write(*,*) 'MPI job telling script to advance model'
+   rc = system('echo advance > '//trim(non_pipe)//' '//char(0))
+   if (async4_verbose .and. rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
+
+endif
+
+if ((myrank == head_task) .and. .not. separate_node_sync) then
+
+   if (async4_verbose) then
+      write(*,*)  'checking master task host'
+      rc = system('echo master task running on host `hostname`'//' '//char(0))
+      if (rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
+   endif
+
+   if (async4_verbose .or. print4status) write(*,*) 'MPI job telling script to advance model'
    rc = system('echo advance > '//trim(filter_to_model)//' '//char(0))
+   if (async4_verbose .and. rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
 
-   if (verbose) write(*,*) 'MPI job now waiting to read from lock file'
-   rc = system('cat < '//trim(model_to_filter)//' '//char(0))
+   if (async4_verbose) write(*,*) 'MPI job now waiting to read from lock file'
+   rc = system('cat < '//trim(model_to_filter)//'> /dev/null '//char(0))
+   if (async4_verbose .and. rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
 
 else
 
@@ -1132,27 +1170,49 @@
    ! FIXME: this should be 'task_lock', since it's generic code beyond filter.
    write(fifo_name, '(a, i5.5)') "filter_lock", myrank
    
-   if (verbose) write(*,*) 'removing any previous lock file: '//trim(fifo_name)
+   if (async4_verbose) then
+      write(*,*)  'checking slave task host'
+      rc = system('echo '//trim(fifo_name)//' accessed from host `hostname`'//' '//char(0))
+      if (rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
+   endif
+
+   if (async4_verbose) write(*,*) 'removing any previous lock file: '//trim(fifo_name)
    rc = system('rm -f '//trim(fifo_name)//' '//char(0))
+   if (async4_verbose .and. rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
 
-   if (verbose) write(*,*) 'made fifo, named: '//trim(fifo_name)
+   if (async4_verbose) write(*,*) 'made fifo, named: '//trim(fifo_name)
    rc = system('mkfifo '//trim(fifo_name)//' '//char(0))
+   if (async4_verbose .and. rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
 
-   if (verbose) write(*,*) 'ready to read from lock file: '//trim(fifo_name)
-   rc = system('cat < '//trim(fifo_name)//' '//char(0))
+   if (async4_verbose) write(*,*) 'ready to read from lock file: '//trim(fifo_name)
+   rc = system('cat < '//trim(fifo_name)//'> /dev/null '//char(0))
+   if (async4_verbose .and. rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
 
-   if (verbose) write(*,*) 'got response, removing lock file: '//trim(fifo_name)
+   if (async4_verbose) write(*,*) 'got response, removing lock file: '//trim(fifo_name)
    rc = system('rm -f '//trim(fifo_name)//' '//char(0))
+   if (async4_verbose .and. rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
 
 endif
 
+! make sure all tasks get here before any get to proceed further.
+! this hides a multitude of sins.  it could also cause
+! tasks to hang forever, but right now it also makes some
+! systems able to run async 4.  maybe this should be namelist
+! selectable.  something named 'horrible_mpi_hack = .true.'
+! if tasks are hanging out here instead of reading the lock files,
+! they are burning cpu cycles and competing with the model advances.
+! but, it works, as opposed to not working.
+call task_sync()
+
 end subroutine block_task
 
 !-----------------------------------------------------------------------------
 subroutine restart_task()
 
+! companion to block_task.  must be called by a different executable
+! and it writes into the named pipes to restart the waiting task.
 
-character(len = 32) :: fifo_name, filter_to_model, model_to_filter
+character(len = 32) :: fifo_name, model_to_filter
 integer :: rc
 
 if ( .not. module_initialized ) then
@@ -1161,7 +1221,6 @@
 endif
 
 ! FIXME: ditto previous comment about using the string 'filter' here.
-filter_to_model = 'filter_to_model.lock'
 model_to_filter = 'model_to_filter.lock'
 
 ! the i5.5 format below will not handle task counts larger than this.
@@ -1170,24 +1229,38 @@
    call error_handler(E_ERR,'block_task', errstring, source, revision, revdate)
 endif
 
+! make it so we only have to test 1 or 2 things here instead of 3
+! when deciding whether to print status messages.
+if (verbose) async4_verbose = .TRUE.
+
 ! process 0 (or N-1) is handled differently in the code.
-if (myrank == head_task) then
+if ((myrank == head_task) .and. .not. separate_node_sync) then
 
-   if (print4status .or. verbose) write(*,*) 'script telling MPI job ok to restart'
+   if (async4_verbose) then
+      rc = system('echo master task running on host `hostname`'//' '//char(0))
+      if (rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
+   endif
+
+   if (async4_verbose .or. print4status) write(*,*) 'script telling MPI job ok to restart'
    rc = system('echo restart > '//trim(model_to_filter)//' '//char(0))
+   if (async4_verbose .and. rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
 
 else
 
-   if (verbose) write(*,*) 'waking up task id ', myrank
+   if (async4_verbose) write(*,*) 'waking up task id ', myrank
 
    ! FIXME: this should be 'task_lock', since it's generic code beyond filter.
    write(fifo_name,"(a,i5.5)") "filter_lock", myrank
 
-   if (verbose) write(*,*) 'ready to write to lock file: '//trim(fifo_name)
+   if (async4_verbose) then
+      rc = system('echo '//trim(fifo_name)//' accessed from host `hostname`'//' '//char(0))
+      if (rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
+   endif
+
+   if (async4_verbose) write(*,*) 'ready to write to lock file: '//trim(fifo_name)
    rc = system('echo restart > '//trim(fifo_name)//' '//char(0))
+   if (async4_verbose .and. rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
 
-   if (verbose) write(*,*) 'response was read from lock file: '//trim(fifo_name)
-   
 endif
 
 end subroutine restart_task
@@ -1196,7 +1269,7 @@
 subroutine finished_task(async)
  integer, intent(in) :: async
 
-character(len = 32) :: fifo_name, filter_to_model, model_to_filter
+character(len = 32) :: fifo_name, filter_to_model, non_pipe
 integer :: rc
 
 if ( .not. module_initialized ) then
@@ -1209,14 +1282,18 @@
 
 ! FIXME: ditto previous comment about using the string 'filter' here.
 filter_to_model = 'filter_to_model.lock'
-model_to_filter = 'model_to_filter.lock'
+non_pipe = 'filter_to_model.file'
 
 
 ! only process 0 (or N-1) needs to do anything.
 if (myrank == head_task) then
 
    if (print4status .or. verbose) write(*,*) 'MPI task telling script we are done'
-   rc = system('echo finished > '//trim(filter_to_model)//' '//char(0))
+   if (separate_node_sync) then
+      rc = system('echo finished > '//trim(non_pipe)//' '//char(0))
+   else
+      rc = system('echo finished > '//trim(filter_to_model)//' '//char(0))
+   endif
 
    
 endif
@@ -1263,7 +1340,7 @@
 endif
 
 write(fname, "(a,i4.4)") trim(pipename)//".", myrank
-print *, "fname now = ", trim(fname)
+!print *, "fname now = ", trim(fname)
 
 ! check to see if the pipe already exists; if so, we've got the unit number
 ! (directly into the output value) and we're done.  otherwise, make it and
@@ -1396,6 +1473,8 @@
 integer :: i, errcode, dummy(1)
 integer :: status(MPI_STATUS_SIZE)
 
+   if (verbose) async2_verbose = .true.
+
    ! default to everyone running concurrently, but if set and not true,
    ! serialize the calls to system() so they do not step on each other.
    if (present(serialize)) then
@@ -1404,15 +1483,16 @@
       all_at_once = .TRUE.
    endif
 
-   if (verbose) write(*,*) "system string is: ", trim(execute_string)
+   if (async2_verbose) write(*,*) "PE", myrank, ": system string is: ", trim(execute_string)
    shell_execute = -1
 
    ! this is the normal (default) case
    if (all_at_once) then
 
       ! all tasks call system at the same time
-      shell_execute = system(trim(execute_string)//' '//char(0))
-      if (verbose) write(*,*) "execution returns, rc = ", shell_execute
+      !shell_execute = system(trim(execute_string)//' '//char(0))
+      shell_execute = system(trim(shell_name)//' '//trim(execute_string)//' '//char(0))
+      if (async2_verbose) write(*,*) "PE", myrank, ": execution returns, rc = ", shell_execute
 
       return
    endif
@@ -1426,8 +1506,8 @@
    if (myrank == 0) then
 
       ! my turn to execute
-      shell_execute = system(trim(execute_string)//' '//char(0))
-      if (verbose) write(*,*) "PE", myrank, ": execution returns, rc = ", shell_execute
+      shell_execute = system(trim(shell_name)//' '//trim(execute_string)//' '//char(0))
+      if (async2_verbose) write(*,*) "PE", myrank, ": execution returns, rc = ", shell_execute
 
       if (total_tasks > 1) then
          ! tell next task it can continue
@@ -1451,8 +1531,8 @@
       endif
 
       ! my turn to execute
-      shell_execute = system(trim(execute_string)//' '//char(0))
-      if (verbose) write(*,*) "PE", myrank, ": execution returns, rc = ", shell_execute
+      shell_execute = system(trim(shell_name)//' '//trim(execute_string)//' '//char(0))
+      if (async2_verbose) write(*,*) "PE", myrank, ": execution returns, rc = ", shell_execute
 
       ! and now tell (me+1) to go
       call MPI_Send(dummy, 1, MPI_INTEGER, myrank+1, myrank+1, my_local_comm, errcode)
@@ -1473,8 +1553,8 @@
       endif
 
       ! my turn to execute
-      shell_execute = system(trim(execute_string)//' '//char(0))
-      if (verbose) write(*,*) "PE", myrank, ": execution returns, rc = ", shell_execute
+      shell_execute = system(trim(shell_name)//' '//trim(execute_string)//' '//char(0))
+      if (async2_verbose) write(*,*) "PE", myrank, ": execution returns, rc = ", shell_execute
 
    endif
        

Modified: DART/trunk/mpi_utilities/mpi_utilities_mod.html
===================================================================
--- DART/trunk/mpi_utilities/mpi_utilities_mod.html	2011-03-04 17:07:03 UTC (rev 4776)
+++ DART/trunk/mpi_utilities/mpi_utilities_mod.html	2011-03-04 20:26:16 UTC (rev 4777)
@@ -144,24 +144,28 @@
 
 <TR><!--contents--><TD valign=top>reverse_task_layout</TD>
     <!--  type  --><TD valign=top>logical</TD>
-    <!--descript--><TD>If running async=4 mode (parallel filter
-       and parallel model advance) with the OpenMPI library,
-       this must be .TRUE. .  The synchronizing
-       mechanism between the scripting and the filter executable will
-       fail otherwise.  Default: .FALSE.</TD></TR>
+    <!--descript--><TD>The synchronizing mechanism between the job script
+       and the parallel filter in async=4 mode relies on the script and
+       task 0 running on the same node (in the same memory space if the
+       nodes have multiple processors).  Some MPI implementations (OpenMPI
+       being the most commonly used one) lay the tasks out so that the last
+       task is on the same node as the script.  If the async 4 model advance
+       never starts but there are no error messages, try setting this to 
+       .TRUE. before running.  See also the 'async4_verbose' flag below.
+                       Default: .FALSE.</TD></TR>
 
 <TR><!--contents--><TD valign=top>all_tasks_print</TD>
     <!--  type  --><TD valign=top>logical</TD>
     <!--descript--><TD>In the parallel filter, informational messages only 
        print from task 0 to avoid N copies of the same messages.  Error
        messages and warnings print no matter which task they occur in.  
-       If this variable is set to true, messages will print 
+       If this variable is set to true, even messages will print 
        from all tasks.  Default: .FALSE. </TD></TR>
 
 <TR><!--contents--><TD valign=top>verbose</TD>
     <!--  type  --><TD valign=top>logical</TD>
     <!--descript--><TD>USE WITH CAUTION!  This flag enables debugging
-       print messages for every MPI call - sends, receives, syncs - and
+       print messages for every MPI call - sends, receives, barriers - and
        is very, very verbose.  In most cases the size of the output file
        will exceed the filesystem limits or will cause the executable to
        run so slowly that it will not be useful.  However in small
@@ -186,8 +190,8 @@
     <!--descript--><TD>If running on compute nodes which do not have the
         expected default shell for async=2 or async=4 mode, specify the
         full pathname of the shell to execute the script.   Not normally
-        needed on most systems we run on  (Cray execute nodes being an
-        exception.)  Default: '' </TD></TR>
+        needed on most systems we run on. (However, at least one type of Cray
+        system has this need.)  Default: '' </TD></TR>
 
 <TR><!--contents--><TD valign=top>separate_node_sync</TD>
     <!--  type  --><TD valign=top>logical</TD>

Modified: DART/trunk/mpi_utilities/mpi_utilities_mod.nml
===================================================================
--- DART/trunk/mpi_utilities/mpi_utilities_mod.nml	2011-03-04 17:07:03 UTC (rev 4776)
+++ DART/trunk/mpi_utilities/mpi_utilities_mod.nml	2011-03-04 20:26:16 UTC (rev 4777)
@@ -1,3 +1,11 @@
 &mpi_utilities_nml
+   reverse_task_layout = .false., 
+   all_tasks_print     = .false., 
+   verbose             = .false., 
+   async2_verbose      = .false., 
+   async4_verbose      = .false., 
+   shell_name          = '',
+   separate_node_sync  = .false.,
+   create_local_comm   = .true.,
    /
 

Modified: DART/trunk/mpi_utilities/null_mpi_utilities_mod.f90
===================================================================
--- DART/trunk/mpi_utilities/null_mpi_utilities_mod.f90	2011-03-04 17:07:03 UTC (rev 4776)
+++ DART/trunk/mpi_utilities/null_mpi_utilities_mod.f90	2011-03-04 20:26:16 UTC (rev 4777)
@@ -331,10 +331,11 @@
 
 !-----------------------------------------------------------------------------
 
-subroutine send_to(dest_id, srcarray, time)
+subroutine send_to(dest_id, srcarray, time, label)
  integer, intent(in) :: dest_id
  real(r8), intent(in) :: srcarray(:)
  type(time_type), intent(in), optional :: time
+ character(len=*), intent(in), optional :: label
 
 ! Send the srcarray to the destination id.
 ! If time is specified, it is also sent in a separate communications call.  
@@ -361,10 +362,11 @@
 
 !-----------------------------------------------------------------------------
 
-subroutine receive_from(src_id, destarray, time)
+subroutine receive_from(src_id, destarray, time, label)
  integer, intent(in) :: src_id
  real(r8), intent(out) :: destarray(:)
  type(time_type), intent(out), optional :: time
+ character(len=*), intent(in), optional :: label
 
 ! Receive data into the destination array from the src task.
 ! If time is specified, it is received in a separate communications call.  


More information about the Dart-dev mailing list