[Wrf-users] Fatal error in MPI_Wait-running a big domain

Preeti preeti at csa.iisc.ernet.in
Wed Aug 19 22:43:44 MDT 2009


I am running a fairly large domain on 56 processors, simulating a 5-day run;
the namelist.input file is included at the bottom of this mail.
With only 6 hours of simulation time remaining, wrf.exe dies with the
following messages after running for nearly 8 hours:

*rank 59 in job 1  garl-fire15.local_39996   caused collective abort of all
  exit status of rank 59: killed by signal 9
rank 58 in job 1  garl-fire15.local_39996   caused collective abort of all
  exit status of rank 58: killed by signal 9
rank 3 in job 1  garl-fire15.local_39996   caused collective abort of all
  exit status of rank 3: killed by signal 9
rank 42 in job 1  garl-fire15.local_39996   caused collective abort of all
  exit status of rank 42: return code 1
rank 40 in job 1  garl-fire15.local_39996   caused collective abort of all
  exit status of rank 40: return code 1
rank 45 in job 1  garl-fire15.local_39996   caused collective abort of all
  exit status of rank 45: return code 1
rank 44 in job 1  garl-fire15.local_39996   caused collective abort of all
  exit status of rank 44: killed by signal 9
rank 21 in job 1  garl-fire15.local_39996   caused collective abort of all
  exit status of rank 21: return code 1


In the log file rsl.error.0021, the following errors are reported:

*Fatal error in MPI_Wait: Other MPI error, error stack:
MPI_Wait(156).............................: MPI_Wait(request=0x6816130,
status0x7fbfff1e10) failed
MPIDI_CH3i_Progress_wait(215).............: an error occurred while handling
an event returned by MPIDU_Sock_Wait()
MPIDU_Socki_handle_read(637)..............: connection failure
(set=0,sock=15,errno=104:Connection reset by peer)[cli_21]: aborting job:
Fatal error in MPI_Wait: Other MPI error, error stack:
MPI_Wait(156).............................: MPI_Wait(request=0x6816130,
status0x7fbfff1e10) failed
MPIDI_CH3i_Progress_wait(215).............: an error occurred while handling
an event returned by MPIDU_Sock_Wait()
MPIDU_Socki_handle_read(637)..............: connection failure
(set=0,sock=15,errno=104:Connection reset by peer)*

Any clue what could be going wrong?
I am using mpich2-1.0.8

Thanks in advance


 run_days                            = 5,
 run_hours                           = 0,
 run_minutes                         = 0,
 run_seconds                         = 0,
 start_year                          = 2009, 2009, 2000,
 start_month                         = 05,   05,   01,
 start_day                           = 22,   22,   24,
 start_hour                          = 00,   00,   12,
 start_minute                        = 00,   00,   00,
 start_second                        = 00,   00,   00,
 end_year                            = 2009, 2009, 2000,
 end_month                           = 05,   05,   01,
 end_day                             = 27,   27,   25,
 end_hour                            = 00,   00,   12,
 end_minute                          = 00,   00,   00,
 end_second                          = 00,   00,   00,
 interval_seconds                    = 21600
 input_from_file                     = .true.,.false.,.true.,
 history_interval                    = 90,   30,   60,
 frames_per_outfile                  = 10,   10, 1000,
 restart                             = .false.,
 restart_interval                    = 9000,
 io_form_history                     = 2
 io_form_restart                     = 2
 io_form_input                       = 2
 io_form_boundary                    = 2
 debug_level                         = 0
 nocolons                 = .false.
 auxinput1_inname                    ="met_em.d<domain>.<date>"

 time_step                           = 90,
 time_step_fract_num                 = 0,
 time_step_fract_den                 = 1,
 max_dom                             = 2,
 s_we                                = 1,     1,     1,
 e_we                                = 538,   151,   94,
 s_sn                                = 1,     1,     1,
 e_sn                                = 366,   193,   91,
 s_vert                              = 1,     1,     1,
 e_vert                              = 28,    28,    28,
 num_metgrid_levels                  = 27
 dx                                  = 15000, 5000,  3333.33,
 dy                                  = 15000, 5000,  3333.33,
 grid_id                             = 1,     2,     3,
 parent_id                           = 0,     1,     2,
 i_parent_start                      = 1,     299,   30,
 j_parent_start                      = 1,     151,   30,
 parent_grid_ratio                   = 1,     3,     3,
 parent_time_step_ratio              = 1,     3,     3,
 feedback                            = 1,
 smooth_option                       = 0,
 corral_dist                         = 2,

 mp_physics                          = 3,     3,     3,
 ra_lw_physics                       = 1,     1,     1,
 ra_sw_physics                       = 1,     1,     1,
 radt                                = 30,    30,    30,
 sf_sfclay_physics                   = 1,     1,     1,
 sf_surface_physics                  = 2,     2,     2,
 bl_pbl_physics                      = 1,     1,     1,
 bldt                                = 0,     0,     0,
 cu_physics                          = 1,     1,     0,
 cudt                                = 5,     5,     5,
 isfflx                              = 1,
 ifsnow                              = 0,
 icloud                              = 1,
 surface_input_source                = 1,
 num_soil_layers                     = 4,
 ucmcall                             = 0,
 maxiens                             = 1,
 maxens                              = 3,
 maxens2                             = 3,
 maxens3                             = 16,
 ensdim                              = 144,


 w_damping                           = 0,
 diff_opt                            = 1,
 km_opt                              = 4,
 diff_6th_opt                        = 0,
 diff_6th_factor                     = 0.12,
 base_temp                           = 290.
 damp_opt                            = 0,
 zdamp                               = 5000.,  5000.,  5000.,
 dampcoef                            = 0.2,    0.2,    0.2
 khdif                               = 0,      0,      0,
 kvdif                               = 0,      0,      0,
 non_hydrostatic                     = .true., .true., .true.,
 pd_moist                            = .true., .true., .true.,
 pd_scalar                           = .true., .true., .true.,

 spec_bdy_width                      = 5,
 spec_zone                           = 1,
 relax_zone                          = 4,
 specified                           = .true., .false.,.false.,
 nested                              = .false., .true., .true.,


 nio_tasks_per_group = 0,
 nio_groups = 1,
