[Dart-dev] [4546] DART/trunk/mpi_utilities/mpi_utilities_mod.f90: Fix the abort code to use a valid communicator when trying to abort and
nancy at ucar.edu
nancy at ucar.edu
Mon Nov 1 14:26:36 MDT 2010
Revision: 4546
Author: nancy
Date: 2010-11-01 14:26:36 -0600 (Mon, 01 Nov 2010)
Log Message:
-----------
Fix the abort code to use a valid communicator when trying to abort and
kill any other running mpi tasks.
Modified Paths:
--------------
DART/trunk/mpi_utilities/mpi_utilities_mod.f90
-------------- next part --------------
Modified: DART/trunk/mpi_utilities/mpi_utilities_mod.f90
===================================================================
--- DART/trunk/mpi_utilities/mpi_utilities_mod.f90 2010-10-27 22:58:22 UTC (rev 4545)
+++ DART/trunk/mpi_utilities/mpi_utilities_mod.f90 2010-11-01 20:26:36 UTC (rev 4546)
@@ -73,6 +73,9 @@
! call this routine before it can compute and return
! the value.
!
+! # get_dart_mpi_comm() This code creates a private communicator for use by
+! the routines in this file. This function returns it.
+!
! Lower level utility routines which interact with the utilities_mod.f90
! code to open a named pipe per MPI task, read and write from them, and
! close and/or remove them.
@@ -204,17 +207,17 @@
! ---- private data for mpi_utilities ----
-integer :: myrank ! my mpi number
-integer :: total_tasks ! total mpi tasks/procs
-integer :: my_local_comm ! duplicate communicator private to this file
-integer :: datasize ! which MPI type corresponds to our r8 definition
+integer :: myrank = -1 ! my mpi number
+integer :: total_tasks = -1 ! total mpi tasks/procs
+integer :: my_local_comm = 0 ! duplicate communicator private to this file
+integer :: datasize = 0 ! which MPI type corresponds to our r8 definition
public :: initialize_mpi_utilities, finalize_mpi_utilities, &
task_count, my_task_id, block_task, restart_task, &
task_sync, array_broadcast, send_to, receive_from, iam_task0, &
broadcast_send, broadcast_recv, shell_execute, sleep_seconds, &
- sum_across_tasks, mpi_was_init
+ sum_across_tasks, get_dart_mpi_comm
! version controlled file description for error handling, do not edit
character(len=128), parameter :: &
@@ -225,18 +228,23 @@
logical :: module_initialized = .false.
character(len = 129) :: saved_progname = ''
-integer :: head_task ! default 0, but N-1 if reverse_task_layout true
+character(len = 129) :: shell_name = '' ! if needed, add ksh, tcsh, bash, etc
+integer :: head_task = 0 ! def 0, but N-1 if reverse_task_layout true
logical :: print4status = .true. ! minimal messages for async4 handshake
character(len = 129) :: errstring
! this turns on trace messages for most MPI communications
-logical :: verbose = .false. ! very very verbose, use with care
+logical :: verbose = .false. ! very very very verbose, use with care
+logical :: async2_verbose = .false. ! messages only for system() in async2
+logical :: async4_verbose = .false. ! messages only for block/restart async4
! if your batch system does the task layout backwards, set this to true
! so the last task will communicate with the script in async 4 mode.
! as of now, mpich and mvapich do it forward, openmpi does it backwards.
logical :: reverse_task_layout = .false. ! task 0 on head node; task N-1 if .true.
+logical :: separate_node_sync = .false. ! true if tasks & script do not share nodes
+logical :: create_local_comm = .true. ! make a private communicator
! for large numbers of MPI tasks, you will get replicated messages, one
! per task, if this is set to true. however, for debugging if you need
@@ -303,6 +311,11 @@
call exit(-99)
endif
+! give this a temporary initial value, in case we call the abort code.
+! later down, we will dup the world comm and use a private comm for
+! our communication.
+my_local_comm = MPI_COMM_WORLD
+
! pass the arguments through so the utilities can log the program name
! only PE0 gets to output, whenever possible.
if (myrank == 0) then
@@ -347,14 +360,13 @@
! duplicate the world communicator to isolate us from any other user
! calls to MPI. All subsequent mpi calls here will use the local communicator
! and not the global world comm.
-call MPI_Comm_dup(MPI_COMM_WORLD, my_local_comm, errcode)
-! option here: do not duplicate communicator, use world. comment in the next
-! two lines and comment out the previous one to try this.
-!!my_local_comm = MPI_COMM_WORLD
-!!errcode = MPI_SUCCESS
-if (errcode /= MPI_SUCCESS) then
- write(errstring, '(a,i8)') 'MPI_Comm_dup returned error code ', errcode
- call error_handler(E_ERR,'initialize_mpi_utilities', errstring, source, revision, revdate)
+if (create_local_comm) then
+ call MPI_Comm_dup(MPI_COMM_WORLD, my_local_comm, errcode)
+ if (errcode /= MPI_SUCCESS) then
+ write(errstring, '(a,i8)') 'MPI_Comm_dup returned error code ', errcode
+ call error_handler(E_ERR,'initialize_mpi_utilities', errstring, &
+ source, revision, revdate)
+ endif
endif
! find out who we are (0 to N-1).
@@ -1488,13 +1500,13 @@
end subroutine sleep_seconds
!-----------------------------------------------------------------------------
-function mpi_was_init()
- logical :: mpi_was_init
+function get_dart_mpi_comm()
+ integer :: get_dart_mpi_comm
-! return the init status for the non-module subroutine below.
- mpi_was_init = module_initialized
+! return our private communicator (or world, if no private created)
+ get_dart_mpi_comm = my_local_comm
-end function mpi_was_init
+end function get_dart_mpi_comm
!-----------------------------------------------------------------------------
!-----------------------------------------------------------------------------
@@ -1509,7 +1521,7 @@
!-----------------------------------------------------------------------------
subroutine exit_all(exit_code)
- use mpi_utilities_mod, only : mpi_was_init
+ use mpi_utilities_mod, only : get_dart_mpi_comm
integer, intent(in) :: exit_code
@@ -1519,14 +1531,12 @@
integer :: ierror
-! if we seem to have gone through the init code, call abort on our
-! private communicator. otherwise call abort on the world comm.
-if ( mpi_was_init() ) then
- call MPI_Abort(my_local_comm, exit_code, ierror)
-else
- call MPI_Abort(MPI_COMM_WORLD, exit_code, ierror)
-endif
+! if we made a local communicator, call abort on it.
+! otherwise call abort on the world comm.
+!print *, 'calling abort on comm ', get_dart_mpi_comm()
+call MPI_Abort(get_dart_mpi_comm(), exit_code, ierror)
+
! execution should never get here
end subroutine exit_all
More information about the Dart-dev
mailing list