[Dart-dev] [4777] DART/trunk/mpi_utilities: Mostly added debugging messages to help diagnose problems with
nancy at ucar.edu
nancy at ucar.edu
Fri Mar 4 13:26:17 MST 2011
Revision: 4777
Author: nancy
Date: 2011-03-04 13:26:16 -0700 (Fri, 04 Mar 2011)
Log Message:
-----------
Mostly added debugging messages to help diagnose problems with
the model advances in async 2 and 4. Also added some code that
made it easier to run on the cray system here; and has the code
needed to synchronize through a regular file instead of a pipe.
This last part has not been heavily tested and so for now i've
left it marked in the docs as not working yet (it needs a slightly
different script, which i'll check in an example of in the L96 dir).
Modified Paths:
--------------
DART/trunk/mpi_utilities/mpi_utilities_mod.f90
DART/trunk/mpi_utilities/mpi_utilities_mod.html
DART/trunk/mpi_utilities/mpi_utilities_mod.nml
DART/trunk/mpi_utilities/null_mpi_utilities_mod.f90
-------------- next part --------------
Modified: DART/trunk/mpi_utilities/mpi_utilities_mod.f90
===================================================================
--- DART/trunk/mpi_utilities/mpi_utilities_mod.f90 2011-03-04 17:07:03 UTC (rev 4776)
+++ DART/trunk/mpi_utilities/mpi_utilities_mod.f90 2011-03-04 20:26:16 UTC (rev 4777)
@@ -250,14 +250,16 @@
! per task, if this is set to true. however, for debugging if you need
! messages from tasks which aren't 0, this will elicit them. error messages
! from any task will print regardless of this setting.
-logical :: all_tasks_print = .false. ! by default only messages from 0 print
+logical :: all_tasks_print = .false. ! by default only msgs from 0 print
! NAMELIST: change the following from .false. to .true. to enable
! the reading of this namelist. This is the only place you need
! to make this change.
logical :: use_namelist = .false.
-namelist /mpi_utilities_nml/ reverse_task_layout, verbose, all_tasks_print
+namelist /mpi_utilities_nml/ reverse_task_layout, all_tasks_print, &
+ verbose, async2_verbose, async4_verbose, &
+ shell_name, separate_node_sync, create_local_comm
contains
@@ -576,10 +578,11 @@
!-----------------------------------------------------------------------------
-subroutine send_to(dest_id, srcarray, time)
+subroutine send_to(dest_id, srcarray, time, label)
integer, intent(in) :: dest_id
real(r8), intent(in) :: srcarray(:)
type(time_type), intent(in), optional :: time
+ character(len=*), intent(in), optional :: label
! Send the srcarray to the destination id.
! If time is specified, it is also sent in a separate communications call.
@@ -605,7 +608,11 @@
call error_handler(E_ERR,'send_to', errstring, source, revision, revdate)
endif
-if (verbose) write(*,*) "PE", myrank, ": send_to itemsize ", size(srcarray), " dest ", dest_id
+if (present(label)) then
+ write(*,*) trim(label)//" PE", myrank, ": send_to itemsize ", size(srcarray), " dest ", dest_id
+else if (verbose) then
+ write(*,*) "PE", myrank, ": send_to itemsize ", size(srcarray), " dest ", dest_id
+endif
! use my task id as the tag; unused at this point.
tag = myrank
@@ -647,10 +654,11 @@
!-----------------------------------------------------------------------------
-subroutine receive_from(src_id, destarray, time)
+subroutine receive_from(src_id, destarray, time, label)
integer, intent(in) :: src_id
real(r8), intent(inout) :: destarray(:)
type(time_type), intent(out), optional :: time
+ character(len=*), intent(in), optional :: label
! Receive data into the destination array from the src task.
! If time is specified, it is received in a separate communications call.
@@ -677,7 +685,11 @@
call error_handler(E_ERR,'receive_from', errstring, source, revision, revdate)
endif
-if (verbose) write(*,*) "PE", myrank, ": receive_from itemsize ", size(destarray), " src ", src_id
+if (present(label)) then
+ write(*,*) trim(label)//" PE", myrank, ": receive_from itemsize ", size(destarray), " src ", src_id
+else if (verbose) then
+ write(*,*) "PE", myrank, ": receive_from itemsize ", size(destarray), " src ", src_id
+endif
! send_to uses its own id as the tag.
tag = src_id
@@ -1097,7 +1109,7 @@
! the MPI job which spreads out on all the PEs for this job
! and writes into the file from the correct PE.
-character(len = 32) :: fifo_name, filter_to_model, model_to_filter
+character(len = 32) :: fifo_name, filter_to_model, model_to_filter, non_pipe
integer :: rc
if ( .not. module_initialized ) then
@@ -1109,6 +1121,7 @@
! mpi wrapper code, callable from programs other than filter.)
filter_to_model = 'filter_to_model.lock'
model_to_filter = 'model_to_filter.lock'
+non_pipe = 'filter_to_model.file'
! the i5.5 format below will not handle task counts larger than this.
if (total_tasks > 99999) then
@@ -1116,14 +1129,39 @@
call error_handler(E_ERR,'block_task', errstring, source, revision, revdate)
endif
-if (verbose) write(*,*) 'putting to sleep task id ', myrank
+! make it so we only have to test 1 or 2 things here instead of 3
+! when deciding whether to print status messages.
+if (verbose) async4_verbose = .TRUE.
-if (myrank == head_task) then
- if (print4status .or. verbose) write(*,*) 'MPI job telling script to advance model'
+if ((myrank == head_task) .and. separate_node_sync) then
+
+ if (async4_verbose) then
+ write(*,*) 'checking master task host'
+ rc = system('echo master task running on host `hostname`'//' '//char(0))
+ if (rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
+ endif
+
+ if (async4_verbose .or. print4status) write(*,*) 'MPI job telling script to advance model'
+ rc = system('echo advance > '//trim(non_pipe)//' '//char(0))
+ if (async4_verbose .and. rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
+
+endif
+
+if ((myrank == head_task) .and. .not. separate_node_sync) then
+
+ if (async4_verbose) then
+ write(*,*) 'checking master task host'
+ rc = system('echo master task running on host `hostname`'//' '//char(0))
+ if (rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
+ endif
+
+ if (async4_verbose .or. print4status) write(*,*) 'MPI job telling script to advance model'
rc = system('echo advance > '//trim(filter_to_model)//' '//char(0))
+ if (async4_verbose .and. rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
- if (verbose) write(*,*) 'MPI job now waiting to read from lock file'
- rc = system('cat < '//trim(model_to_filter)//' '//char(0))
+ if (async4_verbose) write(*,*) 'MPI job now waiting to read from lock file'
+ rc = system('cat < '//trim(model_to_filter)//'> /dev/null '//char(0))
+ if (async4_verbose .and. rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
else
@@ -1132,27 +1170,49 @@
! FIXME: this should be 'task_lock', since it's generic code beyond filter.
write(fifo_name, '(a, i5.5)') "filter_lock", myrank
- if (verbose) write(*,*) 'removing any previous lock file: '//trim(fifo_name)
+ if (async4_verbose) then
+ write(*,*) 'checking slave task host'
+ rc = system('echo '//trim(fifo_name)//' accessed from host `hostname`'//' '//char(0))
+ if (rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
+ endif
+
+ if (async4_verbose) write(*,*) 'removing any previous lock file: '//trim(fifo_name)
rc = system('rm -f '//trim(fifo_name)//' '//char(0))
+ if (async4_verbose .and. rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
- if (verbose) write(*,*) 'made fifo, named: '//trim(fifo_name)
+ if (async4_verbose) write(*,*) 'made fifo, named: '//trim(fifo_name)
rc = system('mkfifo '//trim(fifo_name)//' '//char(0))
+ if (async4_verbose .and. rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
- if (verbose) write(*,*) 'ready to read from lock file: '//trim(fifo_name)
- rc = system('cat < '//trim(fifo_name)//' '//char(0))
+ if (async4_verbose) write(*,*) 'ready to read from lock file: '//trim(fifo_name)
+ rc = system('cat < '//trim(fifo_name)//'> /dev/null '//char(0))
+ if (async4_verbose .and. rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
- if (verbose) write(*,*) 'got response, removing lock file: '//trim(fifo_name)
+ if (async4_verbose) write(*,*) 'got response, removing lock file: '//trim(fifo_name)
rc = system('rm -f '//trim(fifo_name)//' '//char(0))
+ if (async4_verbose .and. rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
endif
+! make sure all tasks get here before any get to proceed further.
+! this hides a multitude of sins. it could also cause
+! tasks to hang forever, but right now it also makes some
+! systems able to run async 4. maybe this should be namelist
+! selectable. something named 'horrible_mpi_hack = .true.'
+! if tasks are hanging out here instead of reading the lock files,
+! they are burning cpu cycles and competing with the model advances.
+! but, it works, as opposed to not working.
+call task_sync()
+
end subroutine block_task
!-----------------------------------------------------------------------------
subroutine restart_task()
+! companion to block_task. must be called by a different executable
+! and it writes into the named pipes to restart the waiting task.
-character(len = 32) :: fifo_name, filter_to_model, model_to_filter
+character(len = 32) :: fifo_name, model_to_filter
integer :: rc
if ( .not. module_initialized ) then
@@ -1161,7 +1221,6 @@
endif
! FIXME: ditto previous comment about using the string 'filter' here.
-filter_to_model = 'filter_to_model.lock'
model_to_filter = 'model_to_filter.lock'
! the i5.5 format below will not handle task counts larger than this.
@@ -1170,24 +1229,38 @@
call error_handler(E_ERR,'block_task', errstring, source, revision, revdate)
endif
+! make it so we only have to test 1 or 2 things here instead of 3
+! when deciding whether to print status messages.
+if (verbose) async4_verbose = .TRUE.
+
! process 0 (or N-1) is handled differently in the code.
-if (myrank == head_task) then
+if ((myrank == head_task) .and. .not. separate_node_sync) then
- if (print4status .or. verbose) write(*,*) 'script telling MPI job ok to restart'
+ if (async4_verbose) then
+ rc = system('echo master task running on host `hostname`'//' '//char(0))
+ if (rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
+ endif
+
+ if (async4_verbose .or. print4status) write(*,*) 'script telling MPI job ok to restart'
rc = system('echo restart > '//trim(model_to_filter)//' '//char(0))
+ if (async4_verbose .and. rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
else
- if (verbose) write(*,*) 'waking up task id ', myrank
+ if (async4_verbose) write(*,*) 'waking up task id ', myrank
! FIXME: this should be 'task_lock', since it's generic code beyond filter.
write(fifo_name,"(a,i5.5)") "filter_lock", myrank
- if (verbose) write(*,*) 'ready to write to lock file: '//trim(fifo_name)
+ if (async4_verbose) then
+ rc = system('echo '//trim(fifo_name)//' accessed from host `hostname`'//' '//char(0))
+ if (rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
+ endif
+
+ if (async4_verbose) write(*,*) 'ready to write to lock file: '//trim(fifo_name)
rc = system('echo restart > '//trim(fifo_name)//' '//char(0))
+ if (async4_verbose .and. rc /= 0) write(*, *) 'system command returned nonzero rc, ', rc
- if (verbose) write(*,*) 'response was read from lock file: '//trim(fifo_name)
-
endif
end subroutine restart_task
@@ -1196,7 +1269,7 @@
subroutine finished_task(async)
integer, intent(in) :: async
-character(len = 32) :: fifo_name, filter_to_model, model_to_filter
+character(len = 32) :: fifo_name, filter_to_model, non_pipe
integer :: rc
if ( .not. module_initialized ) then
@@ -1209,14 +1282,18 @@
! FIXME: ditto previous comment about using the string 'filter' here.
filter_to_model = 'filter_to_model.lock'
-model_to_filter = 'model_to_filter.lock'
+non_pipe = 'filter_to_model.file'
! only process 0 (or N-1) needs to do anything.
if (myrank == head_task) then
if (print4status .or. verbose) write(*,*) 'MPI task telling script we are done'
- rc = system('echo finished > '//trim(filter_to_model)//' '//char(0))
+ if (separate_node_sync) then
+ rc = system('echo finished > '//trim(non_pipe)//' '//char(0))
+ else
+ rc = system('echo finished > '//trim(filter_to_model)//' '//char(0))
+ endif
endif
@@ -1263,7 +1340,7 @@
endif
write(fname, "(a,i4.4)") trim(pipename)//".", myrank
-print *, "fname now = ", trim(fname)
+!print *, "fname now = ", trim(fname)
! check to see if the pipe already exists; if so, we've got the unit number
! (directly into the output value) and we're done. otherwise, make it and
@@ -1396,6 +1473,8 @@
integer :: i, errcode, dummy(1)
integer :: status(MPI_STATUS_SIZE)
+ if (verbose) async2_verbose = .true.
+
! default to everyone running concurrently, but if set and not true,
! serialize the calls to system() so they do not step on each other.
if (present(serialize)) then
@@ -1404,15 +1483,16 @@
all_at_once = .TRUE.
endif
- if (verbose) write(*,*) "system string is: ", trim(execute_string)
+ if (async2_verbose) write(*,*) "PE", myrank, ": system string is: ", trim(execute_string)
shell_execute = -1
! this is the normal (default) case
if (all_at_once) then
! all tasks call system at the same time
- shell_execute = system(trim(execute_string)//' '//char(0))
- if (verbose) write(*,*) "execution returns, rc = ", shell_execute
+ !shell_execute = system(trim(execute_string)//' '//char(0))
+ shell_execute = system(trim(shell_name)//' '//trim(execute_string)//' '//char(0))
+ if (async2_verbose) write(*,*) "PE", myrank, ": execution returns, rc = ", shell_execute
return
endif
@@ -1426,8 +1506,8 @@
if (myrank == 0) then
! my turn to execute
- shell_execute = system(trim(execute_string)//' '//char(0))
- if (verbose) write(*,*) "PE", myrank, ": execution returns, rc = ", shell_execute
+ shell_execute = system(trim(shell_name)//' '//trim(execute_string)//' '//char(0))
+ if (async2_verbose) write(*,*) "PE", myrank, ": execution returns, rc = ", shell_execute
if (total_tasks > 1) then
! tell next task it can continue
@@ -1451,8 +1531,8 @@
endif
! my turn to execute
- shell_execute = system(trim(execute_string)//' '//char(0))
- if (verbose) write(*,*) "PE", myrank, ": execution returns, rc = ", shell_execute
+ shell_execute = system(trim(shell_name)//' '//trim(execute_string)//' '//char(0))
+ if (async2_verbose) write(*,*) "PE", myrank, ": execution returns, rc = ", shell_execute
! and now tell (me+1) to go
call MPI_Send(dummy, 1, MPI_INTEGER, myrank+1, myrank+1, my_local_comm, errcode)
@@ -1473,8 +1553,8 @@
endif
! my turn to execute
- shell_execute = system(trim(execute_string)//' '//char(0))
- if (verbose) write(*,*) "PE", myrank, ": execution returns, rc = ", shell_execute
+ shell_execute = system(trim(shell_name)//' '//trim(execute_string)//' '//char(0))
+ if (async2_verbose) write(*,*) "PE", myrank, ": execution returns, rc = ", shell_execute
endif
Modified: DART/trunk/mpi_utilities/mpi_utilities_mod.html
===================================================================
--- DART/trunk/mpi_utilities/mpi_utilities_mod.html 2011-03-04 17:07:03 UTC (rev 4776)
+++ DART/trunk/mpi_utilities/mpi_utilities_mod.html 2011-03-04 20:26:16 UTC (rev 4777)
@@ -144,24 +144,28 @@
<TR><!--contents--><TD valign=top>reverse_task_layout</TD>
<!-- type --><TD valign=top>logical</TD>
- <!--descript--><TD>If running async=4 mode (parallel filter
- and parallel model advance) with the OpenMPI library,
- this must be .TRUE. . The synchronizing
- mechanism between the scripting and the filter executable will
- fail otherwise. Default: .FALSE.</TD></TR>
+ <!--descript--><TD>The synchronizing mechanism between the job script
+ and the parallel filter in async=4 mode relies on the script and
+ task 0 running on the same node (in the same memory space if the
+ nodes have multiple processors). Some MPI implementations (OpenMPI
+ being the most commonly used one) lay the tasks out so that the last
+ task is on the same node as the script. If the async 4 model advance
+ never starts but there are no error messages, try setting this to
+ .TRUE. before running. See also the 'async4_verbose' flag below.
+ Default: .FALSE.</TD></TR>
<TR><!--contents--><TD valign=top>all_tasks_print</TD>
<!-- type --><TD valign=top>logical</TD>
<!--descript--><TD>In the parallel filter, informational messages only
print from task 0 to avoid N copies of the same messages. Error
messages and warnings print no matter which task they occur in.
- If this variable is set to true, messages will print
+ If this variable is set to true, even messages will print
from all tasks. Default: .FALSE. </TD></TR>
<TR><!--contents--><TD valign=top>verbose</TD>
<!-- type --><TD valign=top>logical</TD>
<!--descript--><TD>USE WITH CAUTION! This flag enables debugging
- print messages for every MPI call - sends, receives, syncs - and
+ print messages for every MPI call - sends, receives, barriers - and
is very, very verbose. In most cases the size of the output file
will exceed the filesystem limits or will cause the executable to
run so slowly that it will not be useful. However in small
@@ -186,8 +190,8 @@
<!--descript--><TD>If running on compute nodes which do not have the
expected default shell for async=2 or async=4 mode, specify the
full pathname of the shell to execute the script. Not normally
- needed on most systems we run on (Cray execute nodes being an
- exception.) Default: '' </TD></TR>
+ needed on most systems we run on. (However, at least one type of Cray
+ system has this need.) Default: '' </TD></TR>
<TR><!--contents--><TD valign=top>separate_node_sync</TD>
<!-- type --><TD valign=top>logical</TD>
Modified: DART/trunk/mpi_utilities/mpi_utilities_mod.nml
===================================================================
--- DART/trunk/mpi_utilities/mpi_utilities_mod.nml 2011-03-04 17:07:03 UTC (rev 4776)
+++ DART/trunk/mpi_utilities/mpi_utilities_mod.nml 2011-03-04 20:26:16 UTC (rev 4777)
@@ -1,3 +1,11 @@
&mpi_utilities_nml
+ reverse_task_layout = .false.,
+ all_tasks_print = .false.,
+ verbose = .false.,
+ async2_verbose = .false.,
+ async4_verbose = .false.,
+ shell_name = '',
+ separate_node_sync = .false.,
+ create_local_comm = .true.,
/
Modified: DART/trunk/mpi_utilities/null_mpi_utilities_mod.f90
===================================================================
--- DART/trunk/mpi_utilities/null_mpi_utilities_mod.f90 2011-03-04 17:07:03 UTC (rev 4776)
+++ DART/trunk/mpi_utilities/null_mpi_utilities_mod.f90 2011-03-04 20:26:16 UTC (rev 4777)
@@ -331,10 +331,11 @@
!-----------------------------------------------------------------------------
-subroutine send_to(dest_id, srcarray, time)
+subroutine send_to(dest_id, srcarray, time, label)
integer, intent(in) :: dest_id
real(r8), intent(in) :: srcarray(:)
type(time_type), intent(in), optional :: time
+ character(len=*), intent(in), optional :: label
! Send the srcarray to the destination id.
! If time is specified, it is also sent in a separate communications call.
@@ -361,10 +362,11 @@
!-----------------------------------------------------------------------------
-subroutine receive_from(src_id, destarray, time)
+subroutine receive_from(src_id, destarray, time, label)
integer, intent(in) :: src_id
real(r8), intent(out) :: destarray(:)
type(time_type), intent(out), optional :: time
+ character(len=*), intent(in), optional :: label
! Receive data into the destination array from the src task.
! If time is specified, it is received in a separate communications call.
More information about the Dart-dev
mailing list