<p><b>duda</b> 2013-04-19 13:43:31 -0600 (Fri, 19 Apr 2013)</p><p>Merging changes from atmos_physics branch to trunk.<br>
<br>
<br>
M    src/core_atmos_physics/mpas_atmphys_driver_sfclayer.F<br>
M    src/core_atmos_physics/physics_wrf/module_bl_ysu.F<br>
M    src/core_atmos_physics/mpas_atmphys_vars.F<br>
M    src/core_atmos_physics/mpas_atmphys_interface_nhyd.F<br>
M    src/core_atmos_physics/mpas_atmphys_driver.F<br>
M    src/core_nhyd_atmos/mpas_atm_time_integration.F<br>
M    src/core_nhyd_atmos/Registry.xml<br>
M    src/core_nhyd_atmos/mpas_atm_mpas_core.F<br>
</p><hr noshade><pre><font color="gray">Modified: trunk/mpas/src/core_atmos_physics/mpas_atmphys_driver.F
===================================================================
--- trunk/mpas/src/core_atmos_physics/mpas_atmphys_driver.F        2013-04-19 18:44:55 UTC (rev 2780)
+++ trunk/mpas/src/core_atmos_physics/mpas_atmphys_driver.F        2013-04-19 19:43:31 UTC (rev 2781)
@@ -60,7 +60,8 @@
 
     !physics prep step:
 #ifdef non_hydrostatic_core
-    call MPAS_to_physics(block%mesh,block%state%time_levs(1)%state,block%diag)
+    call MPAS_to_physics(block%mesh,block%state%time_levs(1)%state,block%diag, &amp;
+                         block%diag_physics)
 #elif hydrostatic_core
     call MPAS_to_physics(block%state%time_levs(1)%state,block%diag)
 #endif

Modified: trunk/mpas/src/core_atmos_physics/mpas_atmphys_driver_sfclayer.F
===================================================================
--- trunk/mpas/src/core_atmos_physics/mpas_atmphys_driver_sfclayer.F        2013-04-19 18:44:55 UTC (rev 2780)
+++ trunk/mpas/src/core_atmos_physics/mpas_atmphys_driver_sfclayer.F        2013-04-19 19:43:31 UTC (rev 2781)
@@ -137,31 +137,31 @@
     tsk_p(i,j)    = sfc_input    % skintemp % array(i)
     xland_p(i,j)  = sfc_input    % xland    % array(i)       
     !inout variables:
+    br_p(i,j)     = diag_physics % br     % array(i)
+    cpm_p(i,j)    = diag_physics % cpm    % array(i)
+    chs_p(i,j)    = diag_physics % chs    % array(i)
+    chs2_p(i,j)   = diag_physics % chs2   % array(i)
+    cqs2_p(i,j)   = diag_physics % cqs2   % array(i)
+    fh_p(i,j)     = diag_physics % fh     % array(i)
+    fm_p(i,j)     = diag_physics % fm     % array(i)
+    flhc_p(i,j)   = diag_physics % flhc   % array(i)
+    flqc_p(i,j)   = diag_physics % flqc   % array(i)
+    gz1oz0_p(i,j) = diag_physics % gz1oz0 % array(i)
     hfx_p(i,j)    = diag_physics % hfx    % array(i)
     qfx_p(i,j)    = diag_physics % qfx    % array(i)
+    qgh_p(i,j)    = diag_physics % qgh    % array(i)
     qsfc_p(i,j)   = diag_physics % qsfc   % array(i) 
+    lh_p(i,j)     = diag_physics % lh     % array(i)
     mol_p(i,j)    = diag_physics % mol    % array(i) 
+    psim_p(i,j)   = diag_physics % psim   % array(i)
+    psih_p(i,j)   = diag_physics % psih   % array(i)
     regime_p(i,j) = diag_physics % regime % array(i)
+    rmol_p(i,j)   = diag_physics % rmol   % array(i)
     ust_p(i,j)    = diag_physics % ust    % array(i)
+    wspd_p(i,j)   = diag_physics % wspd   % array(i)
     znt_p(i,j)    = diag_physics % znt    % array(i) 
     zol_p(i,j)    = diag_physics % zol    % array(i) 
     !output variables:
-    br_p(i,j)     = 0._RKIND
-    cpm_p(i,j)    = cp
-    chs_p(i,j)    = 0._RKIND
-    chs2_p(i,j)   = 0._RKIND
-    cqs2_p(i,j)   = 0._RKIND
-    flhc_p(i,j)   = 0._RKIND
-    flqc_p(i,j)   = 0._RKIND
-    fh_p(i,j)     = 0._RKIND
-    fm_p(i,j)     = 0._RKIND
-    gz1oz0_p(i,j) = 0._RKIND
-    lh_p(i,j)     = 0._RKIND
-    psim_p(i,j)   = 0._RKIND
-    psih_p(i,j)   = 0._RKIND
-    qgh_p(i,j)    = 0._RKIND
-    rmol_p(i,j)   = 0._RKIND
-    wspd_p(i,j)   = 0._RKIND
     q2_p(i,j)     = 0._RKIND
     t2m_p(i,j)    = 0._RKIND
     th2m_p(i,j)   = 0._RKIND
@@ -188,6 +188,8 @@
     diag_physics % chs    % array(i) = chs_p(i,j)
     diag_physics % chs2   % array(i) = chs2_p(i,j)
     diag_physics % cqs2   % array(i) = cqs2_p(i,j)
+    diag_physics % fh     % array(i) = fh_p(i,j)
+    diag_physics % fm     % array(i) = fm_p(i,j)
     diag_physics % flhc   % array(i) = flhc_p(i,j)
     diag_physics % flqc   % array(i) = flqc_p(i,j)
     diag_physics % gz1oz0 % array(i) = gz1oz0_p(i,j)

Modified: trunk/mpas/src/core_atmos_physics/mpas_atmphys_interface_nhyd.F
===================================================================
--- trunk/mpas/src/core_atmos_physics/mpas_atmphys_interface_nhyd.F        2013-04-19 18:44:55 UTC (rev 2780)
+++ trunk/mpas/src/core_atmos_physics/mpas_atmphys_interface_nhyd.F        2013-04-19 19:43:31 UTC (rev 2781)
@@ -111,7 +111,7 @@
  end subroutine deallocate_forall_physics
 
 !=============================================================================================
- subroutine MPAS_to_physics(mesh,state,diag)
+ subroutine MPAS_to_physics(mesh,state,diag,diag_physics)
 !=============================================================================================
 
 !input variables:
@@ -119,6 +119,9 @@
  type(state_type),intent(in):: state
  type(diag_type) ,intent(in):: diag
 
+!inout variables:
+ type(diag_physics_type),intent(inout):: diag_physics
+
 !local variables:
  integer:: i,k,j
  real(kind=RKIND):: z0,z1,z2,w1,w2
@@ -322,6 +325,13 @@
  enddo
  enddo
 
+!save the model-top pressure:
+ do j = jts,jte
+ do i = its,ite
+    diag_physics % plrad % array(i) = pres2_p(i,kte+1,j) 
+ enddo
+ enddo
+
 !formats: 
  201 format(3i8,10(1x,e15.8))
  202 format(2i6,10(1x,e15.8))

Modified: trunk/mpas/src/core_atmos_physics/mpas_atmphys_vars.F
===================================================================
--- trunk/mpas/src/core_atmos_physics/mpas_atmphys_vars.F        2013-04-19 18:44:55 UTC (rev 2780)
+++ trunk/mpas/src/core_atmos_physics/mpas_atmphys_vars.F        2013-04-19 19:43:31 UTC (rev 2781)
@@ -345,7 +345,12 @@
 !... variables and arrays related to parameterization of long-wave radiation:
 !=============================================================================================
 
+ integer,dimension(:,:),allocatable:: &amp;
+    nlrad_p            !number of layers added above the model top                         [-]
  real(kind=RKIND),dimension(:,:),allocatable:: &amp;
+    plrad_p            !pressure at model_top                                             [Pa]
+
+ real(kind=RKIND),dimension(:,:),allocatable:: &amp;
     glw_p,            &amp;!net longwave flux at surface                                   [W m-2]
     lwcf_p,           &amp;!longwave cloud forcing at top-of-atmosphere                    [W m-2]
     lwdnb_p,          &amp;!all-sky downwelling longwave flux at bottom-of-atmosphere      [J m-2]

Modified: trunk/mpas/src/core_atmos_physics/physics_wrf/module_bl_ysu.F
===================================================================
--- trunk/mpas/src/core_atmos_physics/physics_wrf/module_bl_ysu.F        2013-04-19 18:44:55 UTC (rev 2780)
+++ trunk/mpas/src/core_atmos_physics/physics_wrf/module_bl_ysu.F        2013-04-19 19:43:31 UTC (rev 2781)
@@ -380,6 +380,9 @@
 !               ==&gt; consider thermal z0 when differs from mechanical z0
 !              a bug fix in wscale computation in stable bl, sukanta basu, jun 2012
 !               ==&gt; wscale becomes small with height, and less mixing in stable bl
+!               ==&gt; ri = max(ri,rimin). limits the richardson number to -100 in
+!               unstable layers, following Hong et al. 2006.
+!               Laura D. Fowler (2013-04-18).
 !
 !     references:
 !
@@ -1002,6 +1005,7 @@
          rlamdz = min(dza(i,k+1),rlamdz)
          rl2 = (zk*rlamdz/(rlamdz+zk))**2
          dk = rl2*sqrt(ss)
+         ri = max(ri,rimin)
          if(ri.lt.0.)then
 ! unstable regime
            sri = sqrt(-ri)

Modified: trunk/mpas/src/core_nhyd_atmos/Registry.xml
===================================================================
--- trunk/mpas/src/core_nhyd_atmos/Registry.xml        2013-04-19 18:44:55 UTC (rev 2780)
+++ trunk/mpas/src/core_nhyd_atmos/Registry.xml        2013-04-19 19:43:31 UTC (rev 2781)
@@ -49,6 +49,7 @@
                 &lt;nml_option name=&quot;config_h_theta_eddy_visc4&quot;         type=&quot;real&quot;          default_value=&quot;0.0&quot;/&gt;
                 &lt;nml_option name=&quot;config_v_theta_eddy_visc2&quot;         type=&quot;real&quot;          default_value=&quot;0.0&quot;/&gt;
                 &lt;nml_option name=&quot;config_visc4_2dsmag&quot;               type=&quot;real&quot;          default_value=&quot;0.0&quot;/&gt;
+                &lt;nml_option name=&quot;config_del4u_div_factor&quot;           type=&quot;real&quot;          default_value=&quot;1.0&quot;/&gt;
                 &lt;nml_option name=&quot;config_number_of_sub_steps&quot;        type=&quot;integer&quot;       default_value=&quot;4&quot;/&gt;
                 &lt;nml_option name=&quot;config_w_adv_order&quot;                type=&quot;integer&quot;       default_value=&quot;3&quot;/&gt;
                 &lt;nml_option name=&quot;config_theta_adv_order&quot;            type=&quot;integer&quot;       default_value=&quot;3&quot;/&gt;
@@ -560,12 +561,15 @@
                 &lt;!--  zol       :z/L height over Monin-Obukhov length                                               [-] --&gt;
                 &lt;!--  znt       :time-varying roughness length                                                      [m] --&gt;
                 &lt;!--  wspd      :wind speed                                                                       [m/s] --&gt;
+                &lt;!--  fh        :integrated function for heat                                                       [-] --&gt;
+                &lt;!--  fm        :integrated function for momentum                                                   [-] --&gt;
                 &lt;!--  DIAGNOSTICS:                                                                                      --&gt;
                 &lt;!--  q2        :specific humidity at 2m                                                        [kg/kg] --&gt;
                 &lt;!--  u10       :u at 10 m                                                                        [m/s] --&gt;
                 &lt;!--  v10       :v at 10 m                                                                        [m/s] --&gt;
                 &lt;!--  t2m       :temperature at 2m                                                                  [K] --&gt;
                 &lt;!--  th2m      :potential temperature at 2m                                                        [K] --&gt;
+
                 &lt;var name=&quot;hfx&quot;           type=&quot;real&quot;     dimensions=&quot;nCells Time&quot;                 streams=&quot;ro&quot;/&gt;
                 &lt;var name=&quot;mavail&quot;        type=&quot;real&quot;     dimensions=&quot;nCells Time&quot;                 streams=&quot;ro&quot;/&gt;
                 &lt;var name=&quot;mol&quot;           type=&quot;real&quot;     dimensions=&quot;nCells Time&quot;                 streams=&quot;ro&quot;/&gt;
@@ -595,6 +599,8 @@
                 &lt;var name=&quot;regime&quot;        type=&quot;real&quot;     dimensions=&quot;nCells Time&quot;                 streams=&quot;ro&quot;/&gt;
                 &lt;var name=&quot;rmol&quot;          type=&quot;real&quot;     dimensions=&quot;nCells Time&quot;                 streams=&quot;ro&quot;/&gt;
                 &lt;var name=&quot;wspd&quot;          type=&quot;real&quot;     dimensions=&quot;nCells Time&quot;                 streams=&quot;ro&quot;/&gt;
+                &lt;var name=&quot;fh&quot;            type=&quot;real&quot;     dimensions=&quot;nCells Time&quot;                 streams=&quot;ro&quot;/&gt;
+                &lt;var name=&quot;fm&quot;            type=&quot;real&quot;     dimensions=&quot;nCells Time&quot;                 streams=&quot;ro&quot;/&gt;
 
                 &lt;!-- DIAGNOSTICS: --&gt;
                 &lt;var name=&quot;u10&quot;           type=&quot;real&quot;     dimensions=&quot;nCells Time&quot;                 streams=&quot;ro&quot;/&gt;
@@ -703,6 +709,8 @@
                 &lt;!--        in contrast,lwdnb is an optional ouput argument to the subroutine rrtmg_lwrad depending on  --&gt;
                 &lt;!--        the presence of lwupt (or not).                                                             --&gt;
 
+                &lt;!--  nlrad     :number of layers added above the model-top in the RRTMG lw radiation code          [-] --&gt;
+                &lt;!--  plrad     :pressure at model-top                                                             [Pa] --&gt;
                 &lt;!--  glw       :all-sky downwelling longwave flux at bottom-of-atmosphere                      [W m-2] --&gt;
                 &lt;!--  lwcf      :longwave cloud forcing at top-of-atmosphere                                    [W m-2] --&gt;
                 &lt;!--  lwdnb     :all-sky downwelling longwave flux at bottom-of-atmosphere                      [W m-2] --&gt;
@@ -736,6 +744,9 @@
                 &lt;!--  i_aclwupt : counter related to how often lwupt is begin reset relative to its bucket value    (-) --&gt;
                 &lt;!--  i_aclwuptc: counter related to how often lwuptc is begin reset relative to its bucket value   (-) --&gt;
 
+                &lt;var name=&quot;nlrad&quot;         type=&quot;integer&quot;  dimensions=&quot;nCells Time&quot;                 streams=&quot;o&quot; /&gt;
+                &lt;var name=&quot;plrad&quot;         type=&quot;real&quot;     dimensions=&quot;nCells Time&quot;                 streams=&quot;o&quot; /&gt;
+
                 &lt;var name=&quot;i_aclwdnb&quot;     type=&quot;integer&quot;  dimensions=&quot;nCells Time&quot;                 streams=&quot;ro&quot;/&gt;
                 &lt;var name=&quot;i_aclwdnbc&quot;    type=&quot;integer&quot;  dimensions=&quot;nCells Time&quot;                 streams=&quot;ro&quot;/&gt;
                 &lt;var name=&quot;i_aclwdnt&quot;     type=&quot;integer&quot;  dimensions=&quot;nCells Time&quot;                 streams=&quot;ro&quot;/&gt;

Modified: trunk/mpas/src/core_nhyd_atmos/mpas_atm_mpas_core.F
===================================================================
--- trunk/mpas/src/core_nhyd_atmos/mpas_atm_mpas_core.F        2013-04-19 18:44:55 UTC (rev 2780)
+++ trunk/mpas/src/core_nhyd_atmos/mpas_atm_mpas_core.F        2013-04-19 19:43:31 UTC (rev 2781)
@@ -80,9 +80,9 @@
          !     
          ! We need to decide which time slice to read from the surface file - read the most recent time slice that falls before or on the start time
          !
-         sfc_update_obj % time = MPAS_seekStream(sfc_update_obj % io_stream, trim(config_start_time), MPAS_STREAM_LATEST_BEFORE, timeStamp, ierr)
+         sfc_update_obj % time = MPAS_seekStream(sfc_update_obj % io_stream, trim(startTimeStamp), MPAS_STREAM_LATEST_BEFORE, timeStamp, ierr)
          if (ierr == MPAS_IO_ERR) then
-            write(0,*) 'Error: surface update file '//trim(sfc_update_obj % filename)//' did not contain any times at or before '//trim(config_start_time)
+            write(0,*) 'Error: surface update file '//trim(sfc_update_obj % filename)//' did not contain any times at or before '//trim(startTimeStamp)
             call mpas_dmpar_abort(domain % dminfo)
          end if
 
@@ -105,7 +105,14 @@
       type (MPAS_TimeInterval_type) :: runDuration, timeStep, alarmTimeStep
       integer :: ierr
 
-      call mpas_set_time(curr_time=startTime, dateTimeString=config_start_time, ierr=ierr)
+      if(trim(config_start_time) == 'file') then
+         open(22,file='restart_timestamp',form='formatted',status='old')
+         read(22,*) startTimeStamp
+         close(22)
+      else
+        startTimeStamp = config_start_time
+      end if
+      call mpas_set_time(curr_time=startTime, dateTimeString=startTimeStamp, ierr=ierr)
       call mpas_set_timeInterval(timeStep, dt=dt, ierr=ierr)
 
       if (trim(config_run_duration) /= &quot;none&quot;) then

Modified: trunk/mpas/src/core_nhyd_atmos/mpas_atm_time_integration.F
===================================================================
--- trunk/mpas/src/core_nhyd_atmos/mpas_atm_time_integration.F        2013-04-19 18:44:55 UTC (rev 2780)
+++ trunk/mpas/src/core_nhyd_atmos/mpas_atm_time_integration.F        2013-04-19 19:43:31 UTC (rev 2781)
@@ -68,12 +68,13 @@
    ! Advance model state forward in time by the specified time step using 
    !   time-split RK3 scheme
    !
-   ! Hydrostatic (primitive eqns.) solver
+   ! Nonhydrostatic atmospheric solver
    !
    ! Input: domain - current model state in time level 1 (e.g., time_levs(1)state%h(:,:)) 
-   !                 plus grid meta-data
+   !                 plus grid meta-data and some diagnostics of state.
    ! Output: domain - upon exit, time level 2 (e.g., time_levs(2)%state%h(:,:)) contains 
-   !                  model state advanced forward in time by dt seconds
+   !                  model state advanced forward in time by dt seconds,
+   !                  and some diagnostics in diag 
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
 
       implicit none
@@ -118,14 +119,6 @@
 
       if(debug) write(0,*) ' copy step in rk solver '
 
-! WCS-parallel: it appears we have chosen to update all edges of nCellsSolve (to cut down on communications on acoustic steps).
-! Do our communications patterns and loops (specifically in compute_dyn_tend) reflect this?  Or do they assume we are only updating 
-! the so-called owned edges?
-
-
-
-! WCS-parallel: first three and rtheta_p arise from scalar transport and microphysics update (OK).  Others come from where?
-
 ! theta_m
       call mpas_dmpar_exch_halo_field(domain % blocklist % state % time_levs(1) % state % theta_m)
  
@@ -141,72 +134,71 @@
 
       block =&gt; domain % blocklist
       do while (associated(block))
-         ! We are setting values in the halo here, so no communications are needed.
-         ! Alternatively, we could just set owned cells and edge values and communicate after this block loop.
          call atm_rk_integration_setup( block % state % time_levs(2) % state, block % state % time_levs(1) % state, block % diag )
          block =&gt; block % next
       end do
 
       !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
-      ! BEGIN RK loop 
+      ! BEGIN Runge-Kutta loop 
       !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
 
       do rk_step = 1, 3  ! Runge-Kutta loop
 
          if(debug) write(0,*) ' rk substep ', rk_step
 
-        block =&gt; domain % blocklist
-        do while (associated(block))
-           ! The coefficients are set for owned cells (cqw) and for all edges of owned cells, 
-           ! thus no communications should be needed after this call.  
-           ! We could consider combining this and the next block loop.
-           call atm_compute_moist_coefficients( block % state % time_levs(2) % state, block % diag, block % mesh )
-           block =&gt; block % next
-        end do
+         block =&gt; domain % blocklist
+         do while (associated(block))
+            ! The coefficients are set for owned cells (cqw) and for all edges of owned cells, 
+            call atm_compute_moist_coefficients( block % state % time_levs(2) % state, block % diag, block % mesh )
+            block =&gt; block % next
+         end do
 
 
-        if (debug) write(0,*) ' compute_dyn_tend '
-        block =&gt; domain % blocklist
-        do while (associated(block))
-           call atm_compute_dyn_tend( block % tend, block % state % time_levs(2) % state, block % diag, block % mesh, rk_step, dt )
-           block =&gt; block % next
-        end do
-        if (debug) write(0,*) ' finished compute_dyn_tend '
+         if (debug) write(0,*) ' compute_dyn_tend '
+         block =&gt; domain % blocklist
+         do while (associated(block))
+            call atm_compute_dyn_tend( block % tend, block % state % time_levs(2) % state, block % diag, block % mesh, rk_step, dt )
+            block =&gt; block % next
+         end do
+         if (debug) write(0,*) ' finished compute_dyn_tend '
 
-
 #ifdef DO_PHYSICS
-        if (debug) write(0,*) ' add physics tendencies '
-        block =&gt; domain % blocklist
-        do while (associated(block))
-           call physics_addtend( block % mesh, &amp;
-                        block % state % time_levs(1) % state, &amp;
-                        block % diag, &amp;
-                        block % tend, &amp;
-                        block % tend_physics, &amp;
-                        block % state % time_levs(2) % state % rho_zz % array(:,:), &amp;
-                        block % diag % rho_edge % array(:,:), &amp; 
-                        rk_step )
-           block =&gt; block % next
-        end do
-        if (debug) write(0,*) ' finished add physics tendencies '
+         if (debug) write(0,*) ' add physics tendencies '
+         block =&gt; domain % blocklist
+         do while (associated(block))
+            call physics_addtend( block % mesh, &amp;
+                         block % state % time_levs(1) % state, &amp;
+                         block % diag, &amp;
+                         block % tend, &amp;
+                         block % tend_physics, &amp;
+                         block % state % time_levs(2) % state % rho_zz % array(:,:), &amp;
+                         block % diag % rho_edge % array(:,:), &amp; 
+                         rk_step )
+            block =&gt; block % next
+         end do
+         if (debug) write(0,*) ' finished add physics tendencies '
 #endif
 
-!***********************************
-!  we will need to communicate the momentum tendencies here - we want tendencies for all edges of owned cells
-!  because we are solving for all edges of owned cells
-!***********************************
+         !***********************************
+         !  need tendencies at all edges of owned cells -
+         !  we are solving for all edges of owned cells to minimize communications
+         !  during the acoustic substeps
+         !***********************************
 
 ! tend_u
          call mpas_dmpar_exch_halo_field(domain % blocklist % tend % u, (/ 1 /))
 
          block =&gt; domain % blocklist
             do while (associated(block))
-!               call atm_set_smlstep_pert_variables( block % state % time_levs(1) % state, block % state % time_levs(2) % state,  &amp;
                call atm_set_smlstep_pert_variables( block % tend, block % diag, block % mesh )
                call atm_compute_vert_imp_coefs( block % state % time_levs(2) % state, block % mesh, block % diag, rk_sub_timestep(rk_step) )
             block =&gt; block % next
          end do
 
+         !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+         ! begin acoustic steps loop
+         !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
          do small_step = 1, number_sub_steps(rk_step)
 
             if(debug) write(0,*) ' acoustic step ',small_step
@@ -220,42 +212,24 @@
 
             if(debug) write(0,*) ' acoustic step complete '
   
-            !  will need communications here for rtheta_pp
+! rtheta_pp
+! This is the only communications needed during the acoustic steps because we solve for u on all edges of owned cells
 
-!  WCS-parallel: is this a candidate for a smaller stencil?  we need only communicate cells that share edges with owned cells.
-
-! rtheta_pp
             call mpas_dmpar_exch_halo_field(domain % blocklist % diag % rtheta_pp, (/ 1 /))
  
-         end do  ! end of small stimestep loop
+         end do  ! end of acoustic steps loop
 
-         !  will need communications here for rho_pp
-
-! WCS-parallel: is communication of rw_p and rho_pp because of limiter (pd or mono scheme?),
-!  or is it needed for the large-step variable recovery (to get decoupled variables)?
-!  seems like only rho_pp needed...
-!
-!  or, do we need ru and u in the halo for diagnostics that are computed later in compute_solve_diagnostics?
-!
-!  rho_pp might be candidate for smaller stencil (same stencil as rtheta_pp above).
-
-! MGD seems necessary
-! rw_p
          !CR: SMALLER STENCIL?: call mpas_dmpar_exch_halo_field(block % diag % rw_p, (/ 1 /))
          call mpas_dmpar_exch_halo_field(domain % blocklist % diag % rw_p)
 
-! MGD seems necessary
-! ru_p
          !CR: SMALLER STENCIL?: call mpas_dmpar_exch_halo_field(block % diag % ru_p, (/ 2 /))
          call mpas_dmpar_exch_halo_field(domain % blocklist % diag % ru_p)
 
-! rho_pp
          call mpas_dmpar_exch_halo_field(domain % blocklist % diag % rho_pp)
 
          ! the second layer of halo cells must be exchanged before calling atm_recover_large_step_variables
          call mpas_dmpar_exch_halo_field(domain % blocklist % diag % rtheta_pp, (/ 2 /))
 
-
          block =&gt; domain % blocklist
          do while (associated(block))
             call atm_recover_large_step_variables( block % state % time_levs(2) % state,                 &amp;
@@ -264,12 +238,12 @@
             block =&gt; block % next
          end do
 
-!  ************  advection of moist variables here...
-
 ! u
          !CR: SMALLER STENCIL?: call mpas_dmpar_exch_halo_field(block % state % time_levs(2) % state % u, (/ 3 /))
          call mpas_dmpar_exch_halo_field(domain % blocklist % state % time_levs(2) % state % u)
 
+         ! scalar advection: RK3 scheme of Skamarock and Gassmann (2011). 
+         ! PD or monotonicity constraints applied only on the final Runge-Kutta substep.
 
          if (config_scalar_advection) then
 
@@ -278,7 +252,7 @@
             !
             ! Note: The advance_scalars_mono routine can be used without limiting, and thus, encompasses 
             !       the functionality of the advance_scalars routine; however, it is noticeably slower, 
-            !       so we keep the advance_scalars routine as well
+            !       so we use the advance_scalars routine for the first two RK substeps.
             !
             if (rk_step &lt; 3 .or. (.not. config_monotonic .and. .not. config_positive_definite)) then
                call atm_advance_scalars( block % tend, &amp;
@@ -295,30 +269,12 @@
             block =&gt; block % next
          end do
 
-! For now, we do scalar halo updates later on...
-!         block =&gt; domain % blocklist
-!         do while (associated(block))
-!            call mpas_dmpar_exch_halo_field3d_real(domain % dminfo, block % tend % scalars % array(:,:,:), &amp;
-!                                             block % tend % num_scalars, block % mesh % nVertLevels, block % mesh % nCells, &amp;
-!                                             block % parinfo % cellsToSend, block % parinfo % cellsToRecv)
-!            call mpas_dmpar_exch_halo_field3d_real(domain % dminfo, block % state % time_levs(2) % state % scalars % array(:,:,:), &amp;
-!                                             block % state % time_levs(2) % state % num_scalars, block % mesh % nVertLevels, block % mesh % nCells, &amp;
-!                                             block % parinfo % cellsToSend, block % parinfo % cellsToRecv)
-!            block =&gt; block % next
-!         end do
-
          else
  
             write(0,*) ' no scalar advection '
 
          end if
 
-! WCS-parallel: seems like we already use u and w (and other state variables) as if they were already correctly set in the halo,
-! but they are not (at least to the outer edges - see communications below? or are those communications redundent?).  
-! Perhaps we should communicate u, w, theta_m, rho_zz, etc after recover_large_step_variables),
-! cover it with the scalar communications, and then compute solve_diagnostics.  I do not think we need to communicate the stuff we compute
-! in compute_solve_diagnostics if we compute it out in the halo (and I think we do - the halos should be large enough).
-
          block =&gt; domain % blocklist
          do while (associated(block))
             call atm_compute_solve_diagnostics( dt, block % state % time_levs(2) % state, block % diag, block % mesh )
@@ -327,11 +283,6 @@
 
          if(debug) write(0,*) ' diagnostics complete '
 
-
-      ! need communications here to fill out u, w, theta_m, p, and pp, scalars, etc  
-      ! so that they are available for next RK step or the first rk substep of the next timestep
-
-!MGD seems necessary
 ! w
          call mpas_dmpar_exch_halo_field(domain % blocklist % state % time_levs(2) % state % w)
 
@@ -341,30 +292,29 @@
 ! rho_edge
          call mpas_dmpar_exch_halo_field(domain % blocklist % diag % rho_edge)
 
-!  ****  this will always be needed - perhaps we can cover this with compute_solve_diagnostics
-
 ! scalars
-         if(rk_step &lt; 3) then
+         if (rk_step &lt; 3) then
             call mpas_dmpar_exch_halo_field(domain % blocklist % state % time_levs(2) % state % scalars)
          end if
 
-       end do ! rk_step loop
+      end do ! rk_step loop
 
 !...  compute full velocity vectors at cell centers:
       block =&gt; domain % blocklist
-        do while (associated(block))
-           call mpas_reconstruct(block % mesh, block % state % time_levs(2) % state % u % array, &amp;
-                                 block % diag % uReconstructX % array,                           &amp;
-                                 block % diag % uReconstructY % array,                           &amp;
-                                 block % diag % uReconstructZ % array,                           &amp;
-                                 block % diag % uReconstructZonal % array,                       &amp;
-                                 block % diag % uReconstructMeridional % array                   &amp;
-                                )
-           block =&gt; block % next
-        end do
+      do while (associated(block))
+         call mpas_reconstruct(block % mesh, block % state % time_levs(2) % state % u % array, &amp;
+                               block % diag % uReconstructX % array,                           &amp;
+                               block % diag % uReconstructY % array,                           &amp;
+                               block % diag % uReconstructZ % array,                           &amp;
+                               block % diag % uReconstructZonal % array,                       &amp;
+                               block % diag % uReconstructMeridional % array                   &amp;
+                              )
+         block =&gt; block % next
+      end do
 
 !... call to parameterizations of cloud microphysics. calculation of the tendency of water vapor to horizontal and
 !... vertical advection needed for the Tiedtke parameterization of convection.
+
 #ifdef DO_PHYSICS
       block =&gt; domain % blocklist
       do while(associated(block))
@@ -372,21 +322,21 @@
          !NOTE: The calculation of the tendency due to horizontal and vertical advection for the water vapor mixing ratio
          !requires that the subroutine atm_advance_scalars_mono was called on the third Runge Kutta step, so that a halo
          !update for the scalars at time_levs(1) is applied. A halo update for the scalars at time_levs(2) is done above. 
-         if(config_monotonic) then
+         if (config_monotonic) then
             block % tend_physics % rqvdynten % array(:,:) = &amp;
                  ( block % state % time_levs(2) % state % scalars % array(block % state % time_levs(2) % state % index_qv,:,:)   &amp;
                  - block % state % time_levs(1) % state % scalars % array(block % state % time_levs(1) % state % index_qv,:,:) ) &amp;
                  / config_dt
          else
             block % tend_physics % rqvdynten % array(:,:) = 0._RKIND
-         endif
+         end if
 
          !simply set to zero negative mixing ratios of different water species (for now):
          where ( block % state % time_levs(2) % state % scalars % array(:,:,:) .lt. 0.) &amp;
             block % state % time_levs(2) % state % scalars % array(:,:,:) = 0.
 
          !call microphysics schemes:
-         if(config_microp_scheme .ne. 'off') &amp;
+         if (config_microp_scheme .ne. 'off') &amp;
             call microphysics_driver ( block % state % time_levs(2) % state, block % diag, block % diag_physics, &amp;
                                        block % tend, block % mesh, itimestep )
 
@@ -394,87 +344,84 @@
       end do
 #endif
 
+      102 format(' global min, max scalar',i4,2(1x,e17.10))
+      write(0,*)
+      block =&gt; domain % blocklist
+      do while (associated(block))
+         scalar_min = 0.
+         scalar_max = 0.
+         do iCell = 1, block % mesh % nCellsSolve
+         do k = 1, block % mesh % nVertLevels
+            scalar_min = min(scalar_min, block % state % time_levs(2) % state % w % array(k,iCell))
+            scalar_max = max(scalar_max, block % state % time_levs(2) % state % w % array(k,iCell))
+         end do
+         end do
+         call mpas_dmpar_min_real(domain%dminfo, scalar_min, global_scalar_min)
+         call mpas_dmpar_max_real(domain%dminfo, scalar_max, global_scalar_max)
+         write(0,*) 'global min, max w ',global_scalar_min, global_scalar_max
 
-!      if(debug) then
-!        101 format(' local  min, max scalar',i4,2(1x,e17.10))
-        102 format(' global min, max scalar',i4,2(1x,e17.10))
-        write(0,*)
-        block =&gt; domain % blocklist
-          do while (associated(block))
-             scalar_min = 0.
-             scalar_max = 0.
-             do iCell = 1, block % mesh % nCellsSolve
-             do k = 1, block % mesh % nVertLevels
-               scalar_min = min(scalar_min, block % state % time_levs(2) % state % w % array(k,iCell))
-               scalar_max = max(scalar_max, block % state % time_levs(2) % state % w % array(k,iCell))
-             enddo
-             enddo
-             call mpas_dmpar_min_real(domain%dminfo, scalar_min, global_scalar_min)
-             call mpas_dmpar_max_real(domain%dminfo, scalar_max, global_scalar_max)
-!             write(0,*) 'local  min, max w ',scalar_min, scalar_max
-             write(0,*) 'global min, max w ',global_scalar_min, global_scalar_max
+         scalar_min = 0.
+         scalar_max = 0.
+         do iEdge = 1, block % mesh % nEdgesSolve
+         do k = 1, block % mesh % nVertLevels
+            scalar_min = min(scalar_min, block % state % time_levs(2) % state % u % array(k,iEdge))
+            scalar_max = max(scalar_max, block % state % time_levs(2) % state % u % array(k,iEdge))
+         end do
+         end do
+         call mpas_dmpar_min_real(domain%dminfo, scalar_min, global_scalar_min)
+         call mpas_dmpar_max_real(domain%dminfo, scalar_max, global_scalar_max)
+         write(0,*) 'global min, max u ',global_scalar_min, global_scalar_max
 
-             scalar_min = 0.
-             scalar_max = 0.
-             do iEdge = 1, block % mesh % nEdgesSolve
-             do k = 1, block % mesh % nVertLevels
-               scalar_min = min(scalar_min, block % state % time_levs(2) % state % u % array(k,iEdge))
-               scalar_max = max(scalar_max, block % state % time_levs(2) % state % u % array(k,iEdge))
-             enddo
-             enddo
-             call mpas_dmpar_min_real(domain%dminfo, scalar_min, global_scalar_min)
-             call mpas_dmpar_max_real(domain%dminfo, scalar_max, global_scalar_max)
-!             write(0,*) 'local  min, max u ',scalar_min, scalar_max
-             write(0,*) 'global min, max u ',global_scalar_min, global_scalar_max
+         do iScalar = 1, block % state % time_levs(2) % state % num_scalars
+            scalar_min = 0.
+            scalar_max = 0.
+            do iCell = 1, block % mesh % nCellsSolve
+            do k = 1, block % mesh % nVertLevels
+               scalar_min = min(scalar_min, block % state % time_levs(2) % state % scalars % array(iScalar,k,iCell))
+               scalar_max = max(scalar_max, block % state % time_levs(2) % state % scalars % array(iScalar,k,iCell))
+            end do
+            end do
+            call mpas_dmpar_min_real(domain%dminfo, scalar_min, global_scalar_min)
+            call mpas_dmpar_max_real(domain%dminfo, scalar_max, global_scalar_max)
+            write(0,102) iScalar,global_scalar_min,global_scalar_max
+         end do
 
-             do iScalar = 1, block % state % time_levs(2) % state % num_scalars
-                scalar_min = 0.
-                scalar_max = 0.
-                do iCell = 1, block % mesh % nCellsSolve
-                do k = 1, block % mesh % nVertLevels
-                  scalar_min = min(scalar_min, block % state % time_levs(2) % state % scalars % array(iScalar,k,iCell))
-                  scalar_max = max(scalar_max, block % state % time_levs(2) % state % scalars % array(iScalar,k,iCell))
-                enddo
-                enddo
-                call mpas_dmpar_min_real(domain%dminfo, scalar_min, global_scalar_min)
-                call mpas_dmpar_max_real(domain%dminfo, scalar_max, global_scalar_max)
-!                write(0,101) iScalar,scalar_min,scalar_max
-                write(0,102) iScalar,global_scalar_min,global_scalar_max
-             end do
-             block =&gt; block % next
+         block =&gt; block % next
+      end do
 
-          end do
-!      end if
-
-
    end subroutine atm_srk3
 
 !---
 
    subroutine atm_rk_integration_setup( s_old, s_new, diag )
 
-     implicit none
-     type (state_type) :: s_new, s_old
-     type (diag_type) :: diag
+      implicit none
 
-     diag % ru_save % array = diag % ru % array
-     diag % rw_save % array = diag % rw % array
-     diag % rtheta_p_save % array = diag % rtheta_p % array
-     diag % rho_p_save % array = diag % rho_p % array
+      type (state_type) :: s_new, s_old
+      type (diag_type) :: diag
 
-     s_old % u % array = s_new % u % array
-     s_old % w % array = s_new % w % array
-     s_old % theta_m % array = s_new % theta_m % array
-     s_old % rho_zz % array = s_new % rho_zz % array
-     s_old % scalars % array = s_new % scalars % array
+      diag % ru_save % array       = diag % ru % array
+      diag % rw_save % array       = diag % rw % array
+      diag % rtheta_p_save % array = diag % rtheta_p % array
+      diag % rho_p_save % array    = diag % rho_p % array
 
+      s_old % u % array       = s_new % u % array
+      s_old % w % array       = s_new % w % array
+      s_old % theta_m % array = s_new % theta_m % array
+      s_old % rho_zz % array  = s_new % rho_zz % array
+      s_old % scalars % array = s_new % scalars % array
+
    end subroutine atm_rk_integration_setup
 
 !-----
 
    subroutine atm_compute_moist_coefficients( state, diag, grid )
 
+      ! the moist coefficients cqu and cqw serve to transform the inverse dry density (1/rho_d) 
+      ! into the inverse full (moist) density (1/rho_m).
+
       implicit none
+
       type (state_type) :: state
       type (diag_type) :: diag
       type (mesh_type) :: grid
@@ -482,35 +429,38 @@
       integer :: iEdge, iCell, k, cell1, cell2, iq
       integer :: nCells, nEdges, nVertLevels, nCellsSolve
       real (kind=RKIND) :: qtot
+      integer, dimension(:,:), pointer :: cellsOnEdge
 
       nCells      = grid % nCells
       nEdges      = grid % nEdges
       nVertLevels = grid % nVertLevels
       nCellsSolve = grid % nCellsSolve
 
-        do iCell = 1, nCellsSolve
-          do k = 2, nVertLevels
+      cellsOnEdge =&gt; grid % cellsOnEdge % array
+
+      do iCell = 1, nCellsSolve
+         do k = 2, nVertLevels
             qtot = 0.
             do iq = state % moist_start, state % moist_end
-              qtot = qtot + 0.5 * (state % scalars % array (iq, k, iCell) + state % scalars % array (iq, k-1, iCell))
+               qtot = qtot + 0.5 * (state % scalars % array (iq, k, iCell) + state % scalars % array (iq, k-1, iCell))
             end do
             diag % cqw % array(k,iCell) = 1./(1.+qtot)
-          end do
-        end do
+         end do
+      end do
 
-        do iEdge = 1, nEdges
-          cell1 = grid % cellsOnEdge % array(1,iEdge)
-          cell2 = grid % cellsOnEdge % array(2,iEdge)
-          if (cell1 &lt;= nCellsSolve .or. cell2 &lt;= nCellsSolve) then
+      do iEdge = 1, nEdges
+         cell1 = cellsOnEdge(1,iEdge)
+         cell2 = cellsOnEdge(2,iEdge)
+         if (cell1 &lt;= nCellsSolve .or. cell2 &lt;= nCellsSolve) then
             do k = 1, nVertLevels
-              qtot = 0.
-              do iq = state % moist_start, state % moist_end
-                 qtot = qtot + 0.5 * ( state % scalars % array (iq, k, cell1) + state % scalars % array (iq, k, cell2) )
-              end do
-              diag % cqu % array(k,iEdge) = 1./( 1. + qtot)
+               qtot = 0.
+               do iq = state % moist_start, state % moist_end
+                  qtot = qtot + 0.5 * ( state % scalars % array (iq, k, cell1) + state % scalars % array (iq, k, cell2) )
+               end do
+               diag % cqu % array(k,iEdge) = 1./( 1. + qtot)
             end do
-          end if
-        end do
+         end if
+      end do
 
    end subroutine atm_compute_moist_coefficients
 
@@ -528,13 +478,14 @@
 
       implicit none
 
-      type (state_type), intent(in) :: s
-      type (mesh_type), intent(in) :: grid
+      type (state_type), intent(in)   :: s
+      type (mesh_type), intent(in)    :: grid
       type (diag_type), intent(inout) :: diag
-      real (kind=RKIND), intent(in) :: dts
+      real (kind=RKIND), intent(in)   :: dts
 
-      integer :: i, k, iq
 
+      integer :: iCell, k, iq
+
       integer :: nCells, nVertLevels, nCellsSolve
       real (kind=RKIND), dimension(:,:), pointer :: zz, cqw, p, t, rb, rtb, pb, rt
       real (kind=RKIND), dimension(:,:), pointer :: cofwr, cofwz, coftz, cofwt, a_tri, alpha_tri, gamma_tri
@@ -548,8 +499,6 @@
       nCells      = grid % nCells
       nCellsSolve = grid % nCellsSolve
       nVertLevels = grid % nVertLevels
-!      epssm = grid % epssm  !  this should come in through the namelist  ******************
-!      epssm = 0.1
       epssm = config_epssm
 
       rdzu =&gt; grid % rdzu % array
@@ -584,53 +533,53 @@
          cofrz(k) = dtseps*rdzw(k)
       end do
 
-      do i = 1, nCellsSolve  !  we only need to do cells we are solving for, not halo cells
+      do iCell = 1, nCellsSolve  !  we only need to do cells we are solving for, not halo cells
 
-        do k=2,nVertLevels
-          cofwr(k,i) =.5*dtseps*gravity*(fzm(k)*zz(k,i)+fzp(k)*zz(k-1,i))
-        end do
-        coftz(1,i) = 0.0
-        do k=2,nVertLevels
-           cofwz(k,i) = dtseps*c2*(fzm(k)*zz(k,i)+fzp(k)*zz(k-1,i))  &amp;
-                *rdzu(k)*cqw(k,i)*(fzm(k)*p (k,i)+fzp(k)*p (k-1,i))
-           coftz(k,i) = dtseps*   (fzm(k)*t (k,i)+fzp(k)*t (k-1,i))
-        end do
-        coftz(nVertLevels+1,i) = 0.0
-        do k=1,nVertLevels
+         do k=2,nVertLevels
+            cofwr(k,iCell) =.5*dtseps*gravity*(fzm(k)*zz(k,iCell)+fzp(k)*zz(k-1,iCell))
+         end do
+         coftz(1,iCell) = 0.0
+         do k=2,nVertLevels
+            cofwz(k,iCell) = dtseps*c2*(fzm(k)*zz(k,iCell)+fzp(k)*zz(k-1,iCell))  &amp;
+                 *rdzu(k)*cqw(k,iCell)*(fzm(k)*p (k,iCell)+fzp(k)*p (k-1,iCell))
+            coftz(k,iCell) = dtseps*   (fzm(k)*t (k,iCell)+fzp(k)*t (k-1,iCell))
+         end do
+         coftz(nVertLevels+1,iCell) = 0.0
+         do k=1,nVertLevels
 
-          qtot = 0.
-          do iq = s % moist_start, s % moist_end
-            qtot = qtot + s % scalars % array (iq, k, i)
-          end do
+            qtot = 0.
+            do iq = s % moist_start, s % moist_end
+               qtot = qtot + s % scalars % array (iq, k, iCell)
+            end do
 
-          cofwt(k,i) = .5*dtseps*rcv*zz(k,i)*gravity*rb(k,i)/(1.+qtot)  &amp;
-                              *p(k,i)/((rtb(k,i)+rt(k,i))*pb(k,i))
-        end do
+            cofwt(k,iCell) = .5*dtseps*rcv*zz(k,iCell)*gravity*rb(k,iCell)/(1.+qtot)  &amp;
+                                *p(k,iCell)/((rtb(k,iCell)+rt(k,iCell))*pb(k,iCell))
+         end do
 
-        a_tri(1,i) = 0.  ! note, this value is never used
-        b_tri(1) = 1.    ! note, this value is never used
-        c_tri(1) = 0.    ! note, this value is never used
-        gamma_tri(1,i) = 0.
-        alpha_tri(1,i) = 0.  ! note, this value is never used
+         a_tri(1,iCell) = 0.  ! note, this value is never used
+         b_tri(1) = 1.    ! note, this value is never used
+         c_tri(1) = 0.    ! note, this value is never used
+         gamma_tri(1,iCell) = 0.
+         alpha_tri(1,iCell) = 0.  ! note, this value is never used
 
-        do k=2,nVertLevels
-          a_tri(k,i) = -cofwz(k  ,i)* coftz(k-1,i)*rdzw(k-1)*zz(k-1,i)   &amp;
-                       +cofwr(k  ,i)* cofrz(k-1  )                       &amp;
-                       -cofwt(k-1,i)* coftz(k-1,i)*rdzw(k-1)
-          b_tri(k) = 1.                                                  &amp;
-                       +cofwz(k  ,i)*(coftz(k  ,i)*rdzw(k  )*zz(k  ,i)   &amp;
-                                    +coftz(k  ,i)*rdzw(k-1)*zz(k-1,i))   &amp;
-                       -coftz(k  ,i)*(cofwt(k  ,i)*rdzw(k  )             &amp;
-                                     -cofwt(k-1,i)*rdzw(k-1))            &amp;
-                       +cofwr(k,  i)*(cofrz(k    )-cofrz(k-1))
-          c_tri(k) =   -cofwz(k  ,i)* coftz(k+1,i)*rdzw(k  )*zz(k  ,i)   &amp;
-                       -cofwr(k  ,i)* cofrz(k    )                       &amp;
-                       +cofwt(k  ,i)* coftz(k+1,i)*rdzw(k  )
-        end do
-        do k=2,nVertLevels
-          alpha_tri(k,i) = 1./(b_tri(k)-a_tri(k,i)*gamma_tri(k-1,i))
-          gamma_tri(k,i) = c_tri(k)*alpha_tri(k,i)
-        end do
+         do k=2,nVertLevels
+            a_tri(k,iCell) = -cofwz(k  ,iCell)* coftz(k-1,iCell)*rdzw(k-1)*zz(k-1,iCell)   &amp;
+                         +cofwr(k  ,iCell)* cofrz(k-1  )                       &amp;
+                         -cofwt(k-1,iCell)* coftz(k-1,iCell)*rdzw(k-1)
+            b_tri(k) = 1.                                                  &amp;
+                         +cofwz(k  ,iCell)*(coftz(k  ,iCell)*rdzw(k  )*zz(k  ,iCell)   &amp;
+                                      +coftz(k  ,iCell)*rdzw(k-1)*zz(k-1,iCell))   &amp;
+                         -coftz(k  ,iCell)*(cofwt(k  ,iCell)*rdzw(k  )             &amp;
+                                       -cofwt(k-1,iCell)*rdzw(k-1))            &amp;
+                         +cofwr(k,  iCell)*(cofrz(k    )-cofrz(k-1))
+            c_tri(k) =   -cofwz(k  ,iCell)* coftz(k+1,iCell)*rdzw(k  )*zz(k  ,iCell)   &amp;
+                         -cofwr(k  ,iCell)* cofrz(k    )                       &amp;
+                         +cofwt(k  ,iCell)* coftz(k+1,iCell)*rdzw(k  )
+         end do
+         do k=2,nVertLevels
+            alpha_tri(k,iCell) = 1./(b_tri(k)-a_tri(k,iCell)*gamma_tri(k-1,iCell))
+            gamma_tri(k,iCell) = c_tri(k)*alpha_tri(k,iCell)
+         end do
 
       end do ! loop over cells
 
@@ -640,48 +589,67 @@
 
    subroutine atm_set_smlstep_pert_variables( tend, diag, grid )
 
+      ! following Klemp et al MWR 2007, we use preturbation variables
+      ! in the acoustic-step integration.  This routine computes those 
+      ! perturbation variables.  state variables are reconstituted after 
+      ! the acousstic steps in subroutine atm_recover_large_step_variables
+
+
       implicit none
+
       type (tend_type) :: tend
       type (diag_type) :: diag
       type (mesh_type) :: grid
-      !SHP-w
+
       integer :: iCell, iEdge, k, cell1, cell2, coef_3rd_order
+      integer :: nCellsSolve, nCells, nVertLevels, nEdges
       integer, dimension(:,:), pointer :: cellsOnEdge
       real (kind=RKIND), dimension(:), pointer :: fzm, fzp, dvEdge, areaCell
       real (kind=RKIND) :: flux
-      !SHP-w
+
       coef_3rd_order = config_coef_3rd_order
-      if(config_theta_adv_order /=3) coef_3rd_order = 0
+      if (config_theta_adv_order /=3) coef_3rd_order = 0
 
-      !SHP-w
+      nCellsSolve = grid % nCellsSolve
+      nCells      = grid % nCells
+      nEdges      = grid % nEdges
+      nVertLevels = grid % nVertLevels
+
       fzm =&gt; grid % fzm % array
       fzp =&gt; grid % fzp % array
       dvEdge =&gt; grid % dvEdge % array
       areaCell =&gt; grid % areaCell % array
       cellsOnEdge =&gt; grid % cellsOnEdge % array
 
+      ! set the acoustic step perturbation variables by subtracting the RK timestep variables
+      ! from their at the previous RK substep.
+
       diag % rho_pp % array = diag % rho_p_save % array - diag % rho_p % array
-
       diag % ru_p % array = diag % ru_save % array - diag % ru % array
       diag % rtheta_pp % array = diag % rtheta_p_save % array - diag % rtheta_p % array
       diag % rtheta_pp_old % array = diag % rtheta_pp % array
       diag % rw_p % array = diag % rw_save % array - diag % rw % array
 
-      do iCell = 1, grid % nCellsSolve
-      do k = 2, grid % nVertLevels
-        tend % w % array(k,iCell) = ( fzm(k) * grid % zz % array(k  ,iCell) +   &amp;
-                                      fzp(k) * grid % zz % array(k-1,iCell)   ) &amp;
-                                     * tend % w % array(k,iCell)
+      ! we solve for omega instead of w (see Klemp et al MWR 2007),
+      ! so here we change the w_p tendency to an omega_p tendency
+
+      do iCell = 1, nCellsSolve
+      do k = 2, nVertLevels
+         tend % w % array(k,iCell) = ( fzm(k) * grid % zz % array(k  ,iCell) +   &amp;
+                                       fzp(k) * grid % zz % array(k-1,iCell)   ) &amp;
+                                      * tend % w % array(k,iCell)
       end do
       end do
 
-      do iEdge = 1,grid % nEdges
+      ! here we need to compute the omega tendency in a manner consistent with our diagnosis of omega.
+      ! this requires us to use the same flux divergence as is used in the theta eqn - see Klemp et al MWR 2003.
 
+      do iEdge = 1, nEdges
+
          cell1 = cellsOnEdge(1,iEdge)
          cell2 = cellsOnEdge(2,iEdge)
 
-         !SHP-w
-         do k = 2, grid%nVertLevels
+         do k = 2, nVertLevels
             flux = fzm(k) * tend % u % array(k,iEdge) + fzp(k) * tend % u % array(k-1,iEdge)
             tend % w % array(k,cell2) = tend % w % array(k,cell2)   &amp;
                      + (grid % zb % array(k,2,iEdge) + coef_3rd_order*sign(1.0_RKIND,tend % u % array(k,iEdge))*grid %zb3 % array(k,2,iEdge))*flux   &amp;
@@ -693,6 +661,8 @@
 
       end do
 
+      !  ruAvg and wwAvg will store the mass fluxes averaged over the acoustic steps for the subsequent scalar transport.
+
       diag % ruAvg % array = 0.
       diag % wwAvg % array = 0.
 
@@ -702,6 +672,12 @@
 
    subroutine atm_advance_acoustic_step( s, diag, tend, grid, dts )
 
+      !  This subroutine performs the entire acoustic step update, following Klemp et al MWR 2007,
+      !  using forward-backward vertically implicit integration.  
+      !  The gravity-waves are included in the acoustic-step integration.
+      !  The input state variables that are updated are ru_p, rw_p (note that this is (rho*omega)_p here),
+      !  rtheta_p, and rho_pp.  The time averaged mass flux is accumulated in ruAvg and wwAvg
+
       implicit none
 
       type (state_type) :: s
@@ -710,15 +686,17 @@
       type (mesh_type) :: grid
       real (kind=RKIND), intent(in) :: dts
 
-      real (kind=RKIND), dimension(:,:), pointer :: rho_zz, theta_m, ru_p, rw_p, rtheta_pp,    &amp;
-                                                    rtheta_pp_old, zz, exner, cqu, ruAvg, &amp;
-                                                    wwAvg, rho_pp, cofwt, coftz, zx,      &amp;
-                                                    a_tri, alpha_tri, gamma_tri, dss,     &amp;
-                                                    tend_ru, tend_rho, tend_rt, tend_rw,  &amp;
+
+      real (kind=RKIND), dimension(:,:), pointer :: rho_zz, theta_m, ru_p, rw_p, rtheta_pp,  &amp;
+                                                    rtheta_pp_old, zz, exner, cqu, ruAvg,    &amp;
+                                                    wwAvg, rho_pp, cofwt, coftz, zx,         &amp;
+                                                    a_tri, alpha_tri, gamma_tri, dss,        &amp;
+                                                    tend_ru, tend_rho, tend_rt, tend_rw,     &amp;
                                                     zgrid, cofwr, cofwz, w, h_divergence
       real (kind=RKIND), dimension(:), pointer :: fzm, fzp, rdzw, dcEdge, AreaCell, cofrz, dvEdge
 
       real (kind=RKIND), dimension(:,:), pointer :: cpr, cpl, pzp, pzm
+      integer, dimension(:,:), pointer :: cellsOnEdge
 
       real (kind=RKIND) :: smdiv, c2, rcv
       real (kind=RKIND), dimension( grid % nVertLevels ) :: du
@@ -734,11 +712,11 @@
       integer :: nEdges, nCells, nCellsSolve, nVertLevels
 
       logical, parameter :: debug = .false.
-!      logical, parameter :: debug = .true.
       logical, parameter :: debug1 = .false.
       logical :: newpx
 
 !--
+      cellsOnEdge =&gt; grid % cellsOnEdge % array
 
       rho_zz =&gt; s % rho_zz % array
       theta_m =&gt; s % theta_m % array
@@ -782,8 +760,6 @@
       dvEdge =&gt; grid % dvEdge % array
       AreaCell =&gt; grid % AreaCell % array
 
-!  might these be pointers instead? **************************
-
       nEdges = grid % nEdges
       nCells = grid % nCells
       nCellsSolve = grid % nCellsSolve
@@ -797,6 +773,8 @@
       cpl         =&gt; grid % cpl % array
       newpx = config_newpx
 
+      ! epssm is the offcentering coefficient for the vertically implicit integration.
+      ! smdiv is the 3D divergence-damping coefficient.
       epssm = config_epssm
       smdiv = config_smdiv
 
@@ -807,18 +785,27 @@
       ts = 0.
       rs = 0.
 
-      ! acoustic step divergence damping - forward weight rtheta_pp
+      ! acoustic step divergence damping - forward weight rtheta_pp - see Klemp et al MWR 2007
       rtheta_pp_old = rtheta_pp + smdiv*(rtheta_pp - rtheta_pp_old)
 
-      if(debug) write(0,*) ' updating ru_p '
+      if (debug) write(0,*) ' updating ru_p '
 
+      ! forward-backward acoustic step integration.
+      ! begin by updating the horizontal velocity u, 
+      ! and accumulating the contribution from the updated u to the other tendencies.
+
+      ! we are looping over all edges, but only computing on edges of owned cells. This will include updates of
+      ! all owned edges plus some edges that are owned by other blocks.  We perform these redundant computations
+      ! so that we do not have to communicate updates of u to update the cell variables (rho, w, and theta). 
+
       do iEdge = 1, nEdges
  
-         cell1 = grid % cellsOnEdge % array (1,iEdge)
-         cell2 = grid % cellsOnEdge % array (2,iEdge)
-         ! update edge for block-owned cells
-         if (cell1 &lt;= grid % nCellsSolve .or. cell2 &lt;= grid % nCellsSolve ) then
+         cell1 = cellsOnEdge(1,iEdge)
+         cell2 = cellsOnEdge(2,iEdge)
 
+         ! update edges for block-owned cells
+         if (cell1 &lt;= nCellsSolve .or. cell2 &lt;= nCellsSolve ) then
+
             if (newpx) then
 
                k = 1
@@ -853,13 +840,6 @@
             else
 
                k = 1
-!               dpzx(k) = .5*zx(k,iEdge)*(cf1*(zz(k  ,cell2)*rtheta_pp_old(k  ,cell2)    &amp;
-!                                             +zz(k  ,cell1)*rtheta_pp_old(k  ,cell1))   &amp;
-!                                        +cf2*(zz(k+1,cell2)*rtheta_pp_old(k+1,cell2)    &amp;
-!                                             +zz(k+1,cell1)*rtheta_pp_old(k+1,cell1))   &amp;
-!                                        +cf3*(zz(k+2,cell2)*rtheta_pp_old(k+2,cell2)    &amp;
-!                                             +zz(k+2,cell1)*rtheta_pp_old(k+2,cell1)))
-
                dpzx(k) = .25*(zx(k,iEdge)+zx(k+1,iEdge))                               &amp;
                          *(pzm(k,cell2)*(zz(k+1,cell2)*rtheta_pp_old(k+1,cell2)        &amp;
                                         -zz(k  ,cell2)*rtheta_pp_old(k  ,cell2))       &amp;
@@ -870,11 +850,7 @@
                           +pzp(k,cell1)*(zz(k+2,cell1)*rtheta_pp_old(k+2,cell1)        &amp;
                                         -zz(k  ,cell1)*rtheta_pp_old(k  ,cell1)))
 
-               do k=2,grid % nVertLevels-1
-!                  dpzx(k)=.5*zx(k,iEdge)*(fzm(k)*(zz(k  ,cell2)*rtheta_pp_old(k  ,cell2)   &amp;
-!                                                 +zz(k  ,cell1)*rtheta_pp_old(k  ,cell1))  &amp;
-!                                         +fzp(k)*(zz(k-1,cell2)*rtheta_pp_old(k-1,cell2)   &amp;
-!                                                 +zz(k-1,cell1)*rtheta_pp_old(k-1,cell1)))
+               do k=2,nVertLevels-1
                   dpzx(k) = .25*(zx(k,iEdge)+zx(k+1,iEdge))                                   &amp;
                                    *(pzp(k,cell2)*(zz(k+1,cell2)*rtheta_pp_old(k+1,cell2)     &amp;
                                                   -zz(k  ,cell2)*rtheta_pp_old(k  ,cell2))    &amp;
@@ -886,38 +862,30 @@
                                                   -zz(k-1,cell1)*rtheta_pp_old(k-1,cell1)))
                end do
 
-               k=grid % nVertLevels
+               k = nVertLevels
                dpzx(k) = .25*(zx(k,iEdge)+zx(k+1,iEdge))                                   &amp;
                                 *(pzm(k,cell2)*(zz(k  ,cell2)*rtheta_pp_old(k  ,cell2)     &amp;
                                                -zz(k-1,cell2)*rtheta_pp_old(k-1,cell2))    &amp;
                                  +pzm(k,cell1)*(zz(k  ,cell1)*rtheta_pp_old(k  ,cell1)     &amp;
                                                -zz(k-1,cell1)*rtheta_pp_old(k-1,cell1)))
 
-!               dpzx(nVertLevels + 1) = 0.
-
                do k=1,nVertLevels
-!                  pgrad =  (rtheta_pp_old(k,cell2)-rtheta_pp_old(k,cell1))/dcEdge(iEdge)  &amp;
-!                               - rdzw(k)*(dpzx(k+1)-dpzx(k))
                   pgrad =     ((rtheta_pp_old(k,cell2)*zz(k,cell2)                    &amp;
                                -rtheta_pp_old(k,cell1)*zz(k,cell1))/dcEdge(iEdge)     &amp;
                             -dpzx(k))/(.5*(zz(k,cell2)+zz(k,cell1)))
                   pgrad = 0.5*c2*(exner(k,cell1)+exner(k,cell2))*pgrad
                   du(k) = dts*(tend_ru(k,iEdge) - cqu(k,iEdge) * pgrad) 
-!                          + (0.05/6.)*dcEdge(iEdge)*(h_divergence(k,cell2)-h_divergence(k,cell1))
                end do
             end if
 
             do k=1,nVertLevels
+
+               ! full update of ru_p
+
                ru_p(k,iEdge) = ru_p(k,iEdge) + du(k)
 
-               if(debug) then
-                 if(iEdge == 3750) then
-                   write(0,*) ' k, pgrad, tend_ru ',k,pgrad,tend_ru(k,3750)
-                 end if
-               end if
+               ! add horizontal fluxes using updated ru_p into density update, rtheta update and w update
 
-!  need to add horizontal fluxes into density update, rtheta update and w update
-
                flux = dts*dvEdge(iEdge)*ru_p(k,iEdge)
                rs(k,cell1) = rs(k,cell1)-flux/AreaCell(cell1)
                rs(k,cell2) = rs(k,cell2)+flux/AreaCell(cell2)
@@ -925,67 +893,82 @@
                flux = flux*0.5*(theta_m(k,cell2)+theta_m(k,cell1))
                ts(k,cell1) = ts(k,cell1)-flux/AreaCell(cell1)
                ts(k,cell2) = ts(k,cell2)+flux/AreaCell(cell2)
-   
+
+               ! accumulate ru_p for use later in scalar transport
+
                ruAvg(k,iEdge) = ruAvg(k,iEdge) + ru_p(k,iEdge)
 
             end do
 
-        end if ! end test for block-owned cells
+         end if ! end test for block-owned cells
 
       end do ! end loop over edges
 
       ! saving rtheta_pp before update for use in divergence damping in next acoustic step
+
       rtheta_pp_old(:,:) = rtheta_pp(:,:)
 
+      ! vertically implicit acoustic and gravity wave integration.
+      ! this follows Klemp et al MWR 2007, with the addition of an implicit Rayleigh damping of w
+      ! serves as a gravity-wave absorbing layer, from Klemp et al 2008.
+
       do iCell = 1, nCellsSolve
 
-        do k=1, nVertLevels
-          rs(k,iCell) = rho_pp(k,iCell) + dts*tend_rho(k,iCell) + rs(k,iCell)      &amp;
-                          - cofrz(k)*resm*(rw_p(k+1,iCell)-rw_p(k,iCell))
-          ts(k,iCell) = rtheta_pp(k,iCell) + dts*tend_rt(k,iCell) + ts(k,iCell)    &amp;
-                             - resm*rdzw(k)*(coftz(k+1,iCell)*rw_p(k+1,iCell)      &amp;
-                             -coftz(k,iCell)*rw_p(k,iCell))
-        enddo
+         do k=1, nVertLevels
+            rs(k,iCell) = rho_pp(k,iCell) + dts*tend_rho(k,iCell) + rs(k,iCell)      &amp;
+                            - cofrz(k)*resm*(rw_p(k+1,iCell)-rw_p(k,iCell))
+            ts(k,iCell) = rtheta_pp(k,iCell) + dts*tend_rt(k,iCell) + ts(k,iCell)    &amp;
+                               - resm*rdzw(k)*(coftz(k+1,iCell)*rw_p(k+1,iCell)      &amp;
+                               -coftz(k,iCell)*rw_p(k,iCell))
+         end do
 
-        do k=2, nVertLevels
+         do k=2, nVertLevels
 
-          wwavg(k,iCell) = wwavg(k,iCell) + 0.5*(1.-epssm)*rw_p(k,iCell)
+            wwavg(k,iCell) = wwavg(k,iCell) + 0.5*(1.-epssm)*rw_p(k,iCell)
 
-          rw_p(k,iCell) = rw_p(k,iCell) +  dts*tend_rw(k,iCell)                       &amp;
-                     - cofwz(k,iCell)*((zz(k  ,iCell)*ts (k  ,iCell)                  &amp;
-                                   -zz(k-1,iCell)*ts (k-1,iCell))                     &amp;
-                             +resm*(zz(k  ,iCell)*rtheta_pp(k  ,iCell)                &amp;
-                                   -zz(k-1,iCell)*rtheta_pp(k-1,iCell)))              &amp;
-                     - cofwr(k,iCell)*((rs (k,iCell)+rs (k-1,iCell))                  &amp;
-                             +resm*(rho_pp(k,iCell)+rho_pp(k-1,iCell)))               &amp;
-                     + cofwt(k  ,iCell)*(ts (k  ,iCell)+resm*rtheta_pp(k  ,iCell))    &amp;
-                     + cofwt(k-1,iCell)*(ts (k-1,iCell)+resm*rtheta_pp(k-1,iCell))
-        enddo
+            rw_p(k,iCell) = rw_p(k,iCell) +  dts*tend_rw(k,iCell)                       &amp;
+                       - cofwz(k,iCell)*((zz(k  ,iCell)*ts (k  ,iCell)                  &amp;
+                                     -zz(k-1,iCell)*ts (k-1,iCell))                     &amp;
+                               +resm*(zz(k  ,iCell)*rtheta_pp(k  ,iCell)                &amp;
+                                     -zz(k-1,iCell)*rtheta_pp(k-1,iCell)))              &amp;
+                       - cofwr(k,iCell)*((rs (k,iCell)+rs (k-1,iCell))                  &amp;
+                               +resm*(rho_pp(k,iCell)+rho_pp(k-1,iCell)))               &amp;
+                       + cofwt(k  ,iCell)*(ts (k  ,iCell)+resm*rtheta_pp(k  ,iCell))    &amp;
+                       + cofwt(k-1,iCell)*(ts (k-1,iCell)+resm*rtheta_pp(k-1,iCell))
+         end do
 
-        do k=2,nVertLevels
-          rw_p(k,iCell) = (rw_p(k,iCell)-a_tri(k,iCell)*rw_p(k-1,iCell))*alpha_tri(k,iCell)
-        end do
+         ! tridiagonal solve sweeping up and then down the column
 
-        do k=nVertLevels,1,-1
-          rw_p(k,iCell) = rw_p(k,iCell) - gamma_tri(k,iCell)*rw_p(k+1,iCell)     
-        end do
+         do k=2,nVertLevels
+            rw_p(k,iCell) = (rw_p(k,iCell)-a_tri(k,iCell)*rw_p(k-1,iCell))*alpha_tri(k,iCell)
+         end do
 
-        do k=2,nVertLevels
-           rw_p(k,iCell) = (rw_p(k,iCell)-dts*dss(k,iCell)*               &amp;
-                       (fzm(k)*zz (k,iCell)+fzp(k)*zz (k-1,iCell))        &amp;
-                       *(fzm(k)*rho_zz(k,iCell)+fzp(k)*rho_zz(k-1,iCell))       &amp;
-                                *w(k,iCell)    )/(1.+dts*dss(k,iCell))
+         do k=nVertLevels,1,-1
+            rw_p(k,iCell) = rw_p(k,iCell) - gamma_tri(k,iCell)*rw_p(k+1,iCell)     
+         end do
 
-           wwAvg(k,iCell) = wwAvg(k,iCell) + 0.5*(1.+epssm)*rw_p(k,iCell)
+         ! the implicit Rayleigh damping on w (gravity-wave absorbing) 
 
-        end do
+         do k=2,nVertLevels
+            rw_p(k,iCell) = (rw_p(k,iCell)-dts*dss(k,iCell)*               &amp;
+                        (fzm(k)*zz (k,iCell)+fzp(k)*zz (k-1,iCell))        &amp;
+                        *(fzm(k)*rho_zz(k,iCell)+fzp(k)*rho_zz(k-1,iCell))       &amp;
+                                 *w(k,iCell)    )/(1.+dts*dss(k,iCell))

+            ! accumulate (rho*omega)' for use later in scalar transport
 
-        do k=1,nVertLevels
-          rho_pp(k,iCell) = rs(k,iCell) - cofrz(k) *(rw_p(k+1,iCell)-rw_p(k  ,iCell))
-          rtheta_pp(k,iCell) = ts(k,iCell) - rdzw(k)*(coftz(k+1,iCell)*rw_p(k+1,iCell)  &amp;
-                             -coftz(k  ,iCell)*rw_p(k  ,iCell))
-        end do
+            wwAvg(k,iCell) = wwAvg(k,iCell) + 0.5*(1.+epssm)*rw_p(k,iCell)

+         end do
 
+         ! update rho_pp and theta_pp given updated rw_p
+
+         do k=1,nVertLevels
+            rho_pp(k,iCell) = rs(k,iCell) - cofrz(k) *(rw_p(k+1,iCell)-rw_p(k  ,iCell))
+            rtheta_pp(k,iCell) = ts(k,iCell) - rdzw(k)*(coftz(k+1,iCell)*rw_p(k+1,iCell)  &amp;
+                               -coftz(k  ,iCell)*rw_p(k  ,iCell))
+         end do
+
       end do !  end of loop over cells
 
    end subroutine atm_advance_acoustic_step
@@ -994,7 +977,13 @@
 
    subroutine atm_recover_large_step_variables( s, diag, tend, grid, dt, ns, rk_step )
 
+      ! reconstitute state variables from acoustic-step perturbation variables 
+      ! after the acoustic steps.  The perturbation variables were originally set in
+      ! subroutine atm_set_smlstep_pert_variables prior to their acoustic-steps update.
+      ! we are also computing a few other state-derived variables here.
+
       implicit none
+
       type (state_type) :: s
       type (diag_type) :: diag
       type (tend_type) :: tend
@@ -1002,12 +991,13 @@
       integer, intent(in) :: ns, rk_step
       real (kind=RKIND), intent(in) :: dt
 
+
       real (kind=RKIND), dimension(:,:), pointer :: wwAvg, rw_save, w, rw, rw_p, rtheta_p, rtheta_pp,   &amp;
                                                     rtheta_p_save, rt_diabatic_tend, rho_p, rho_p_save, &amp;
                                                     rho_pp, rho_zz, rho_base, ruAvg, ru_save, ru_p, u, ru, &amp;
                                                     exner, exner_base, rtheta_base, pressure_p,         &amp;
                                                     zz, theta_m, pressure_b, qvapor
-      real (kind=RKIND), dimension(:), pointer :: fzm, fzp, dvEdge, AreaCell
+      real (kind=RKIND), dimension(:), pointer :: fzm, fzp, dvEdge, areaCell
       real (kind=RKIND), dimension(:,:,:), pointer :: zb, zb3 
       integer, dimension(:,:), pointer :: cellsOnEdge
 
@@ -1015,86 +1005,79 @@
       integer :: nVertLevels, nCells, nCellsSolve, nEdges, nEdgesSolve
       real (kind=RKIND) :: rcv, p0, cf1, cf2, cf3, flux, coef_3rd_order
 
-!      logical, parameter :: debug=.true.
       logical, parameter :: debug=.false.
 
-!---
-       wwAvg =&gt; diag % wwAvg % array
-       rw_save =&gt; diag % rw_save % array
-       rw =&gt; diag % rw % array
-       rw_p =&gt; diag % rw_p % array
-       w =&gt; s % w % array
 
-       rtheta_p =&gt; diag % rtheta_p % array
-       rtheta_p_save =&gt; diag % rtheta_p_save % array
-       rtheta_pp =&gt; diag % rtheta_pp % array
-       rtheta_base =&gt; diag % rtheta_base % array
-       rt_diabatic_tend =&gt; tend % rt_diabatic_tend % array
-       theta_m =&gt; s % theta_m % array
-       qvapor =&gt; s % scalars % array(s%index_qv,:,:)
+      wwAvg =&gt; diag % wwAvg % array
+      rw_save =&gt; diag % rw_save % array
+      rw =&gt; diag % rw % array
+      rw_p =&gt; diag % rw_p % array
+      w =&gt; s % w % array
 
-       rho_zz =&gt; s % rho_zz % array
-       rho_p =&gt; diag % rho_p % array
-       rho_p_save =&gt; diag % rho_p_save % array
-       rho_pp =&gt; diag % rho_pp % array
-       rho_base =&gt; diag % rho_base % array
+      rtheta_p =&gt; diag % rtheta_p % array
+      rtheta_p_save =&gt; diag % rtheta_p_save % array
+      rtheta_pp =&gt; diag % rtheta_pp % array
+      rtheta_base =&gt; diag % rtheta_base % array
+      rt_diabatic_tend =&gt; tend % rt_diabatic_tend % array
+      theta_m =&gt; s % theta_m % array
+      qvapor =&gt; s % scalars % array(s%index_qv,:,:)
 
-       ruAvg =&gt; diag % ruAvg % array
-       ru_save =&gt; diag % ru_save % array
-       ru_p =&gt; diag % ru_p % array
-       ru =&gt; diag % ru % array
-       u =&gt; s % u % array
+      rho_zz =&gt; s % rho_zz % array
+      rho_p =&gt; diag % rho_p % array
+      rho_p_save =&gt; diag % rho_p_save % array
+      rho_pp =&gt; diag % rho_pp % array
+      rho_base =&gt; diag % rho_base % array
 
-       exner =&gt; diag % exner % array
-       exner_base =&gt; diag % exner_base % array
+      ruAvg =&gt; diag % ruAvg % array
+      ru_save =&gt; diag % ru_save % array
+      ru_p =&gt; diag % ru_p % array
+      ru =&gt; diag % ru % array
+      u =&gt; s % u % array
 
-       pressure_p =&gt; diag % pressure_p % array
-       pressure_b =&gt; diag % pressure_base % array
+      exner =&gt; diag % exner % array
+      exner_base =&gt; diag % exner_base % array
 
-       zz =&gt; grid % zz % array
-       zb =&gt; grid % zb % array
-       zb3 =&gt; grid % zb3 % array
-       fzm =&gt; grid % fzm % array
-       fzp =&gt; grid % fzp % array
-       dvEdge =&gt; grid % dvEdge % array
-       AreaCell =&gt; grid % AreaCell % array
-       CellsOnEdge =&gt; grid % CellsOnEdge % array
+      pressure_p =&gt; diag % pressure_p % array
+      pressure_b =&gt; diag % pressure_base % array
 
-       nVertLevels = grid % nVertLevels
-       nCells = grid % nCells
-       nCellsSolve = grid % nCellsSolve
-       nEdges = grid % nEdges
-       nEdgesSolve = grid % nEdgesSolve
+      zz =&gt; grid % zz % array
+      zb =&gt; grid % zb % array
+      zb3 =&gt; grid % zb3 % array
+      fzm =&gt; grid % fzm % array
+      fzp =&gt; grid % fzp % array
+      dvEdge =&gt; grid % dvEdge % array
+      areaCell =&gt; grid % areaCell % array
+      cellsOnEdge =&gt; grid % cellsOnEdge % array
 
-       rcv = rgas/(cp-rgas)
-       p0 = 1.e+05  ! this should come from somewhere else...
+      nVertLevels = grid % nVertLevels
+      nCells = grid % nCells
+      nCellsSolve = grid % nCellsSolve
+      nEdges = grid % nEdges
+      nEdgesSolve = grid % nEdgesSolve
 
-       cf1 = grid % cf1 % scalar
-       cf2 = grid % cf2 % scalar
-       cf3 = grid % cf3 % scalar
-       coef_3rd_order = config_coef_3rd_order
-       if(config_theta_adv_order /=3) coef_3rd_order = 0
+      rcv = rgas/(cp-rgas)
+      p0 = 1.e+05  ! this should come from somewhere else...
 
+      cf1 = grid % cf1 % scalar
+      cf2 = grid % cf2 % scalar
+      cf3 = grid % cf3 % scalar
+      coef_3rd_order = config_coef_3rd_order
+      if (config_theta_adv_order /=3) coef_3rd_order = 0
+
       ! compute new density everywhere so we can compute u from ru.
       ! we will also need it to compute theta_m below
 
       do iCell = 1, nCells
 
-        do k = 1, nVertLevels
+         do k = 1, nVertLevels
 
-          rho_p(k,iCell) = rho_p(k,iCell) + rho_pp(k,iCell)
+            rho_p(k,iCell) = rho_p(k,iCell) + rho_pp(k,iCell)
 
-          rho_zz(k,iCell) = rho_p(k,iCell) + rho_base(k,iCell)
-        end do
+            rho_zz(k,iCell) = rho_p(k,iCell) + rho_base(k,iCell)
+         end do
 
-      !  recover owned-cell values in block
-
-!        if( iCell &lt;= nCellsSolve ) then    ! If using this test, more halo exchanges will be needed
-!  WCS-parallel: OK
-
-
-          w(1,iCell) = 0.
-          do k = 2, nVertLevels
+         w(1,iCell) = 0.
+         do k = 2, nVertLevels
             wwAvg(k,iCell) = rw(k,iCell) + (wwAvg(k,iCell) / float(ns))
 
             rw(k,iCell) = rw(k,iCell) + rw_p(k,iCell)
@@ -1103,27 +1086,27 @@
           ! pick up part of diagnosed w from omega
             w(k,iCell) = rw(k,iCell)/( (fzm(k)*zz (k,iCell)+fzp(k)*zz (k-1,iCell))   &amp;
                                       *(fzm(k)*rho_zz(k,iCell)+fzp(k)*rho_zz(k-1,iCell)) )
-          end do
-          w(nVertLevels+1,iCell) = 0.
+         end do
+         w(nVertLevels+1,iCell) = 0.
 
-          if(rk_step == 3) then
+         if (rk_step == 3) then
             do k = 1, nVertLevels
                rtheta_p(k,iCell) = rtheta_p(k,iCell) + rtheta_pp(k,iCell) &amp;
                                  - dt * rho_zz(k,iCell) * rt_diabatic_tend(k,iCell)
             end do
-          else
+         else
             do k = 1, nVertLevels
                rtheta_p(k,iCell) = rtheta_p(k,iCell) + rtheta_pp(k,iCell)
             end do
-          end if
+         end if
 
-          do k = 1, nVertLevels
+         do k = 1, nVertLevels
             theta_m(k,iCell) = (rtheta_p(k,iCell) + rtheta_base(k,iCell))/rho_zz(k,iCell)
             exner(k,iCell) = (zz(k,iCell)*(rgas/p0)*(rtheta_p(k,iCell)+rtheta_base(k,iCell)))**rcv
-             ! pressure below is perturbation pressure
+            ! pressure_p is perturbation pressure
             pressure_p(k,iCell) = zz(k,iCell) * rgas * (exner(k,iCell)*rtheta_p(k,iCell)+rtheta_base(k,iCell)  &amp;
                                                           * (exner(k,iCell)-exner_base(k,iCell)))
-          end do
+         end do
 
       end do
 
@@ -1133,45 +1116,35 @@
 
       do iEdge = 1, nEdges
 
-        cell1 = CellsOnEdge(1,iEdge)
-        cell2 = CellsOnEdge(2,iEdge)
+         cell1 = cellsOnEdge(1,iEdge)
+         cell2 = cellsOnEdge(2,iEdge)
 
-!        if( cell1 &lt;= nCellsSolve .or. cell2 &lt;= nCellsSolve ) then   ! If using this test, more halo exchanges will be needed,
-                                                                     ! though we can avoid division by zero, e.g., by rho_zz(:,cell2)
-          do k = 1, nVertLevels
-
-
-!  WCS-parallel: we could pick this up in the last acoustic step (ruAvg) because we solve for all edges of owned cells
-
+         do k = 1, nVertLevels
             ruAvg(k,iEdge) = ru(k,iEdge) + (ruAvg(k,iEdge) / float(ns))
-
             ru(k,iEdge) = ru(k,iEdge) + ru_p(k,iEdge)
-
             u(k,iEdge) = 2.*ru(k,iEdge)/(rho_zz(k,cell1)+rho_zz(k,cell2))
-          enddo
+         end do
 
+         !  finish recovering w from (rho*omega)_p.  as when we formed (rho*omega)_p from u and w, we need
+         !  to use the same flux-divergence operator as is used for the horizontal theta transport
+         !  (See Klemp et al 2003).
 
-! WCS-parallel: we likely only need this for owned cells
+         flux = cf1*ru(1,iEdge) + cf2*ru(2,iEdge) + cf3*ru(3,iEdge)
+         w(1,cell2) = w(1,cell2) - (zb(1,2,iEdge) + sign(1.0_RKIND,flux)*coef_3rd_order*zb3(1,2,iEdge))  &amp;
+                                *flux/(cf1*rho_zz(1,cell2)+cf2*rho_zz(2,cell2)+cf3*rho_zz(3,cell2))
+         w(1,cell1) = w(1,cell1) + (zb(1,1,iEdge) + sign(1.0_RKIND,flux)*coef_3rd_order*zb3(1,1,iEdge))  &amp;
+                                *flux/(cf1*rho_zz(1,cell1)+cf2*rho_zz(2,cell1)+cf3*rho_zz(3,cell1))
 
-          !SHP-mtn
-          flux = cf1*ru(1,iEdge) + cf2*ru(2,iEdge) + cf3*ru(3,iEdge)
-          w(1,cell2) = w(1,cell2) - (zb(1,2,iEdge) + sign(1.0_RKIND,flux)*coef_3rd_order*zb3(1,2,iEdge))  &amp;
-                                 *flux/(cf1*rho_zz(1,cell2)+cf2*rho_zz(2,cell2)+cf3*rho_zz(3,cell2))
-          w(1,cell1) = w(1,cell1) + (zb(1,1,iEdge) + sign(1.0_RKIND,flux)*coef_3rd_order*zb3(1,1,iEdge))  &amp;
-                                 *flux/(cf1*rho_zz(1,cell1)+cf2*rho_zz(2,cell1)+cf3*rho_zz(3,cell1))
-
-          do k = 2, nVertLevels
+         do k = 2, nVertLevels
             flux = (fzm(k)*ru(k,iEdge)+fzp(k)*ru(k-1,iEdge))
             w(k,cell2) = w(k,cell2) - (zb(k,2,iEdge)+sign(1.0_RKIND,flux)*coef_3rd_order*zb3(k,2,iEdge)) &amp;
                                  *flux/(fzm(k)*rho_zz(k,cell2)+fzp(k)*rho_zz(k-1,cell2))
             w(k,cell1) = w(k,cell1) + (zb(k,1,iEdge)+sign(1.0_RKIND,flux)*coef_3rd_order*zb3(k,1,iEdge)) &amp;
                                  *flux/(fzm(k)*rho_zz(k,cell1)+fzp(k)*rho_zz(k-1,cell1))
-          enddo
+         end do
 
-!        end if
+      end do
 
-      enddo
-
    end subroutine atm_recover_large_step_variables
 
 !---------------------------------------------------------------------------------------
@@ -1179,10 +1152,21 @@
    subroutine atm_advance_scalars( tend, s_old, s_new, diag, grid, dt)
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
    !
-   ! Input: s - current model state
+   ! Integrate scalar equations - explicit transport plus other tendencies
+   !
+   ! Input: s - current model state, 
+   !            including tendencies from sources other than resolved transport.
    !        grid - grid metadata
    !
-   ! Output: tend - computed scalar tendencies
+   ! input scalars in state are uncoupled (i.e. not mulitplied by density)
+   ! 
+   ! Output: updated uncoupled scalars (scalars in s_new).
+   ! Note: scalar tendencies are also modified by this routine.
+   !
+   ! This routine DOES NOT apply any positive definite or monotonic renormalizations.
+   !
+   ! The transport scheme is from Skamarock and Gassmann MWR 2011.
+   !
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
 
       implicit none
@@ -1209,7 +1193,7 @@
       real (kind=RKIND), dimension( s_old % num_scalars, grid % nVertLevels ) :: flux_arr
 
       real (kind=RKIND), dimension( s_old % num_scalars, grid % nVertLevels + 1 ) :: wdtn
-      integer :: nVertLevels
+      integer :: nCellsSolve, nEdges, nVertLevels
 
       real (kind=RKIND), dimension(:), pointer :: fnm, fnp, rdnw, meshScalingDel2, meshScalingDel4
       real (kind=RKIND) :: coef_3rd_order
@@ -1255,6 +1239,8 @@
       adv_coefs =&gt; grid % adv_coefs % array
       adv_coefs_3rd =&gt; grid % adv_coefs_3rd % array
 
+      nCellsSolve = grid % nCellsSolve
+      nEdges      = grid % nEdges
       nVertLevels = grid % nVertLevels
 
       h_theta_eddy_visc2 = config_h_theta_eddy_visc2
@@ -1269,127 +1255,132 @@
 #endif
 
       !
-      ! Runge Kutta integration, so we compute fluxes from scalar_new values, update starts form scalar_old
+      ! Runge Kutta integration, so we compute fluxes from scalar_new values, update starts from scalar_old
       !
-      !
       !  horizontal flux divergence, accumulate in scalar_tend
 
-         do iEdge=1,grid%nEdges
-            cell1 = cellsOnEdge(1,iEdge)
-            cell2 = cellsOnEdge(2,iEdge)
-            if (cell1 &lt;= grid%nCellsSolve .or. cell2 &lt;= grid%nCellsSolve) then  ! only for owned cells
+      do iEdge=1,nEdges
+         cell1 = cellsOnEdge(1,iEdge)
+         cell2 = cellsOnEdge(2,iEdge)
+         if (cell1 &lt;= nCellsSolve .or. cell2 &lt;= nCellsSolve) then  ! only for owned cells
   
-               flux_arr(:,:) = 0.
-               do i=1,nAdvCellsForEdge(iEdge)
-                 iCell = advCellsForEdge(i,iEdge)
-                 do k=1,grid % nVertLevels
-                   scalar_weight = adv_coefs(i,iEdge) + coef_3rd_order*sign(1.0_RKIND,uhAvg(k,iEdge))*adv_coefs_3rd(i,iEdge)
-                   do iScalar=1,s_old % num_scalars
+            !  flux_arr stores the value of the scalar at the edge.
+            !  a better name perhaps would be scalarEdge
+
+            flux_arr(:,:) = 0.
+            do i=1,nAdvCellsForEdge(iEdge)
+               iCell = advCellsForEdge(i,iEdge)
+               do k=1,nVertLevels
+                  scalar_weight = adv_coefs(i,iEdge) + coef_3rd_order*sign(1.0_RKIND,uhAvg(k,iEdge))*adv_coefs_3rd(i,iEdge)
+                  do iScalar=1,s_old % num_scalars
                      flux_arr(iScalar,k) = flux_arr(iScalar,k) + scalar_weight* scalar_new(iScalar,k,iCell)
-                   end do
-                 end do
+                  end do
                end do
+            end do
 
-               do k=1,grid % nVertLevels
-               do iScalar=1,s_old % num_scalars
-                     scalar_tend(iScalar,k,cell1) = scalar_tend(iScalar,k,cell1) &amp;
-                            - uhAvg(k,iEdge)*flux_arr(iScalar,k)/areaCell(cell1)
-                     scalar_tend(iScalar,k,cell2) = scalar_tend(iScalar,k,cell2) &amp;
-                            + uhAvg(k,iEdge)*flux_arr(iScalar,k)/areaCell(cell2)
-               end do
-               end do
+            ! here we add the horizontal flux divergence into the scalar tendency.
+            ! note that the scalar tendency is modified.
 
-            end if
-          end do
+            do k=1,nVertLevels
+            do iScalar=1,s_old % num_scalars
+                  scalar_tend(iScalar,k,cell1) = scalar_tend(iScalar,k,cell1) &amp;
+                         - uhAvg(k,iEdge)*flux_arr(iScalar,k)/areaCell(cell1)
+                  scalar_tend(iScalar,k,cell2) = scalar_tend(iScalar,k,cell2) &amp;
+                         + uhAvg(k,iEdge)*flux_arr(iScalar,k)/areaCell(cell2)
+            end do
+            end do
 
+         end if
+      end do
+
       !
-      !  vertical flux divergence
+      !  vertical flux divergence and update of the scalars
       !
 
       ! zero fluxes at top and bottom
 
-         wdtn(:,1) = 0.
-         wdtn(:,grid % nVertLevels+1) = 0.
+      wdtn(:,1) = 0.
+      wdtn(:,nVertLevels+1) = 0.
 
-         if (config_scalar_vadv_order == 2) then
+      if (config_scalar_vadv_order == 2) then
 
-           do iCell=1,grid % nCellsSolve
-             do k = 2, nVertLevels
-                do iScalar=1,s_old % num_scalars
+         do iCell=1,nCellsSolve
+            do k = 2, nVertLevels
+               do iScalar=1,s_old % num_scalars
                   wdtn(iScalar,k) = wwAvg(k,iCell)*(fnm(k)*scalar_new(iScalar,k,iCell)+fnp(k)*scalar_new(iScalar,k-1,iCell))
-                end do
-             end do
-             do k=1,grid % nVertLevels   ! Could be nVertLevelsSolve?
-                do iScalar=1,s_old % num_scalars
+               end do
+            end do
+            do k=1,nVertLevels
+               do iScalar=1,s_old % num_scalars
                   scalar_new(iScalar,k,iCell) = (   scalar_old(iScalar,k,iCell)*h_old(k,iCell) &amp;
                         + dt*( scalar_tend(iScalar,k,iCell) -rdnw(k)*(wdtn(iScalar,k+1)-wdtn(iScalar,k)) ) )/h_new(k,iCell)
-                end do
-             end do
-           end do
+               end do
+            end do
+         end do
 
-         else if ( config_scalar_vadv_order == 3 ) then
+      else if ( config_scalar_vadv_order == 3 ) then
 
-           do iCell=1,grid % nCellsSolve
+         do iCell=1,nCellsSolve
 
-             k = 2
-             do iScalar=1,s_old % num_scalars
+            k = 2
+            do iScalar=1,s_old % num_scalars
                wdtn(iScalar,k) = wwAvg(k,iCell)*(fnm(k)*scalar_new(iScalar,k,iCell)+fnp(k)*scalar_new(iScalar,k-1,iCell))
-             enddo
+            end do
              
-             do k=3,nVertLevels-1
+            do k=3,nVertLevels-1
                do iScalar=1,s_old % num_scalars
-                 wdtn(iScalar,k) = flux3( scalar_new(iScalar,k-2,iCell),scalar_new(iScalar,k-1,iCell),  &amp;
-                                          scalar_new(iScalar,k  ,iCell),scalar_new(iScalar,k+1,iCell),  &amp;
-                                          wwAvg(k,iCell), coef_3rd_order )
+                  wdtn(iScalar,k) = flux3( scalar_new(iScalar,k-2,iCell),scalar_new(iScalar,k-1,iCell),  &amp;
+                                           scalar_new(iScalar,k  ,iCell),scalar_new(iScalar,k+1,iCell),  &amp;
+                                           wwAvg(k,iCell), coef_3rd_order )
                end do
-             end do
-             k = nVertLevels
-             do iScalar=1,s_old % num_scalars
+            end do
+            k = nVertLevels
+            do iScalar=1,s_old % num_scalars
                wdtn(iScalar,k) = wwAvg(k,iCell)*(fnm(k)*scalar_new(iScalar,k,iCell)+fnp(k)*scalar_new(iScalar,k-1,iCell))
-             enddo
+            end do
 
-             do k=1,grid % nVertLevels   ! Could be nVertLevelsSolve?
-                do iScalar=1,s_old % num_scalars
+            do k=1,nVertLevels
+               do iScalar=1,s_old % num_scalars
                   scalar_new(iScalar,k,iCell) = (   scalar_old(iScalar,k,iCell)*h_old(k,iCell) &amp;
                         + dt*( scalar_tend(iScalar,k,iCell) -rdnw(k)*(wdtn(iScalar,k+1)-wdtn(iScalar,k)) ) )/h_new(k,iCell)
-                end do
-             end do
+               end do
+            end do
 
-           end do
+         end do
 
-         else if ( config_scalar_vadv_order == 4 ) then
+      else if ( config_scalar_vadv_order == 4 ) then
 
-           do iCell=1,grid % nCellsSolve
+         do iCell=1,nCellsSolve
 
-             k = 2
-             do iScalar=1,s_old % num_scalars
+            k = 2
+            do iScalar=1,s_old % num_scalars
                wdtn(iScalar,k) = wwAvg(k,iCell)*(fnm(k)*scalar_new(iScalar,k,iCell)+fnp(k)*scalar_new(iScalar,k-1,iCell))
-             enddo
-             do k=3,nVertLevels-1
+            end do
+            do k=3,nVertLevels-1
                do iScalar=1,s_old % num_scalars
-                 wdtn(iScalar,k) = flux4( scalar_new(iScalar,k-2,iCell),scalar_new(iScalar,k-1,iCell),  &amp;
-                                          scalar_new(iScalar,k  ,iCell),scalar_new(iScalar,k+1,iCell), wwAvg(k,iCell) )
+                  wdtn(iScalar,k) = flux4( scalar_new(iScalar,k-2,iCell),scalar_new(iScalar,k-1,iCell),  &amp;
+                                           scalar_new(iScalar,k  ,iCell),scalar_new(iScalar,k+1,iCell), wwAvg(k,iCell) )
                end do
-             end do
-             k = nVertLevels
-             do iScalar=1,s_old % num_scalars
+            end do
+            k = nVertLevels
+            do iScalar=1,s_old % num_scalars
                wdtn(iScalar,k) = wwAvg(k,iCell)*(fnm(k)*scalar_new(iScalar,k,iCell)+fnp(k)*scalar_new(iScalar,k-1,iCell))
-             enddo
+            end do
 
-             do k=1,grid % nVertLevels   ! Could be nVertLevelsSolve?
-                do iScalar=1,s_old % num_scalars
+            do k=1,nVertLevels
+               do iScalar=1,s_old % num_scalars
                   scalar_new(iScalar,k,iCell) = (   scalar_old(iScalar,k,iCell)*h_old(k,iCell) &amp;
                         + dt*( scalar_tend(iScalar,k,iCell) -rdnw(k)*(wdtn(iScalar,k+1)-wdtn(iScalar,k)) ) )/h_new(k,iCell)
-                end do
-             end do
+               end do
+            end do
 
-           end do
+         end do
                                                                                         
-         else 
+      else 
 
-           write(0,*) ' bad value for config_scalar_vadv_order - ', config_scalar_vadv_order
+         write(0,*) ' bad value for config_scalar_vadv_order - ', config_scalar_vadv_order
 
-         end if
+      end if
 
    end subroutine atm_advance_scalars
 
@@ -1398,9 +1389,25 @@
    subroutine atm_advance_scalars_mono(tend, s_old, s_new, diag, grid, dt)
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
    !
-   ! Input: s - current model state
+   ! Integrate scalar equations - transport plus other tendencies
+   !
+   ! Input: s - current model state, 
+   !            including tendencies from sources other than resolved transport.
    !        grid - grid metadata
    !
+   ! input scalars in state are uncoupled (i.e. not mulitplied by density)
+   ! 
+   ! Output: updated uncoupled scalars (scalars in s_new).
+   ! Note: scalar tendencies are also modified by this routine.
+   !
+   ! This routine DOES apply positive definite or monotonic renormalizations.
+   !
+   ! The transport scheme is from Skamarock and Gassmann MWR 2011.
+   !
+   ! The positive-definite or monotonic renormalization is from Zalesak JCP 1979
+   !   as used in the RK3 scheme as described in Wang et al MWR 2009
+   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
+
       implicit none
 
       type (tend_type),intent(in)     :: tend
@@ -1436,9 +1443,10 @@
       real (kind=RKIND), dimension( grid % nVertLevels, grid % nEdges ) :: flux_arr
       real (kind=RKIND), dimension( grid % nVertLevels + 1, grid % nCells ) :: wdtn
 
-      integer :: nVertLevels, num_scalars, icellmax, kmax
+      integer :: nCells, nCellsSolve, nEdges, nVertLevels, num_scalars, icellmax, kmax
 
       real (kind=RKIND), dimension(:), pointer :: fnm, fnp, rdnw, meshScalingDel2, meshScalingDel4
+      integer, dimension(:), pointer :: nEdgesOnCell
       real (kind=RKIND) :: coef_3rd_order
 
       real (kind=RKIND) :: h_theta_eddy_visc2, v_theta_eddy_visc2
@@ -1480,11 +1488,15 @@
       meshScalingDel2 =&gt; grid % meshScalingDel2 % array
       meshScalingDel4 =&gt; grid % meshScalingDel4 % array
 
+      nEdgesOnCell =&gt; grid % nEdgesOnCell % array
       nAdvCellsForEdge =&gt; grid % nAdvCellsForEdge % array
       advCellsForEdge =&gt; grid % advCellsForEdge % array
       adv_coefs =&gt; grid % adv_coefs % array
       adv_coefs_3rd =&gt; grid % adv_coefs_3rd % array
 
+      nCells      = grid % nCells
+      nCellsSolve = grid % nCellsSolve
+      nEdges      = grid % nEdges
       nVertLevels = grid % nVertLevels
 
       h_theta_eddy_visc2 = config_h_theta_eddy_visc2
@@ -1498,13 +1510,13 @@
       scalar_tend = 0.  !  testing purposes - we have no sources or sinks
 #endif
 
-      !
-      ! Update scalars for physics (i.e., what is in scalar_tend)
-      !   we should probably move this to another routine called before mono advection ****
-      !
+      !  for positive-definite or monotonic option, we first update scalars using the tendency from sources other than
+      !  the resolved transport (these should constitute a positive definite update).  
+      !  Note, however, that we enforce positive-definiteness in this update.
+      !  The transport will maintain this positive definite solution and optionally, shape preservation (monotonicity).
 
-      do iCell = 1,grid%nCellsSolve
-      do k = 1, grid%nVertLevels
+      do iCell = 1, nCellsSolve
+      do k = 1, nVertLevels
       do iScalar = 1,s_old%num_scalars
          s_old % scalars % array(iScalar,k,iCell) = s_old % scalars % array(iScalar,k,iCell)+dt*scalar_tend(iScalar,k,iCell) / h_old(k,iCell)
          scalar_tend(iScalar,k,iCell) = 0.
@@ -1525,34 +1537,34 @@
       num_scalars = 1
 
       do iScalar = 1, s_old % num_scalars
-        write(0,*) ' mono transport for scalar ',iScalar
+         write(0,*) ' mono transport for scalar ',iScalar
 
-        do iCell = 1, grid%nCells
-        do k=1, grid%nVertLevels
-          scalar_old(k,iCell) = s_old % scalars % array(iScalar,k,iCell)
-          scalar_new(k,iCell) = s_new % scalars % array(iScalar,k,iCell)
-        end do
-        end do
+         do iCell = 1, nCells
+         do k = 1, nVertLevels
+            scalar_old(k,iCell) = s_old % scalars % array(iScalar,k,iCell)
+            scalar_new(k,iCell) = s_new % scalars % array(iScalar,k,iCell)
+         end do
+         end do
 
-         if(debug_print) then
+         if (debug_print) then
             scmin = scalar_old(1,1)
             scmax = scalar_old(1,1)
-            do iCell = 1, grid%nCells
-            do k=1, grid%nVertLevels
+            do iCell = 1, nCells
+            do k=1, nVertLevels
                scmin = min(scmin,scalar_old(k,iCell))
                scmax = max(scmax,scalar_old(k,iCell))
-            enddo
-            enddo
+            end do
+            end do
             write(0,*) ' scmin, scmin old in ',scmin,scmax
 
             scmin = scalar_new(1,1)
             scmax = scalar_new(1,1)
-            do iCell = 1, grid%nCells
-            do k=1, grid%nVertLevels
+            do iCell = 1, nCells
+            do k=1, nVertLevels
                scmin = min(scmin,scalar_new(k,iCell))
                scmax = max(scmax,scalar_new(k,iCell))
-            enddo
-            enddo
+            end do
+            end do
             write(0,*) ' scmin, scmin new in ',scmin,scmax
          end if
 
@@ -1562,42 +1574,42 @@
       !
 
  
-         do iCell=1,grid % nCellsSolve
+         do iCell=1,nCellsSolve
 
-          ! zero flux at top and bottom
-           wdtn(1,iCell) = 0.
-           wdtn(grid % nVertLevels+1,iCell) = 0.
+            ! zero flux at top and bottom
+            wdtn(1,iCell) = 0.
+            wdtn(grid % nVertLevels+1,iCell) = 0.
 
-           k = 1
-           s_max(k,iCell) = max(scalar_old(1,iCell),scalar_old(2,iCell))
-           s_min(k,iCell) = min(scalar_old(1,iCell),scalar_old(2,iCell))
+            k = 1
+            s_max(k,iCell) = max(scalar_old(1,iCell),scalar_old(2,iCell))
+            s_min(k,iCell) = min(scalar_old(1,iCell),scalar_old(2,iCell))
 
-           k = 2
-           wdtn(k,iCell) = wwAvg(k,iCell)*(fnm(k)*scalar_new(k,iCell)+fnp(k)*scalar_new(k-1,iCell))
-           s_max(k,iCell) = max(scalar_old(k-1,iCell),scalar_old(k,iCell),scalar_old(k+1,iCell))
-           s_min(k,iCell) = min(scalar_old(k-1,iCell),scalar_old(k,iCell),scalar_old(k+1,iCell))
+            k = 2
+            wdtn(k,iCell) = wwAvg(k,iCell)*(fnm(k)*scalar_new(k,iCell)+fnp(k)*scalar_new(k-1,iCell))
+            s_max(k,iCell) = max(scalar_old(k-1,iCell),scalar_old(k,iCell),scalar_old(k+1,iCell))
+            s_min(k,iCell) = min(scalar_old(k-1,iCell),scalar_old(k,iCell),scalar_old(k+1,iCell))
              
-           do k=3,nVertLevels-1
-              wdtn(k,iCell) = flux3( scalar_new(k-2,iCell),scalar_new(k-1,iCell),  &amp;
-                                     scalar_new(k  ,iCell),scalar_new(k+1,iCell),  &amp;
-                                     wwAvg(k,iCell), coef_3rd_order )
-              s_max(k,iCell) = max(scalar_old(k-1,iCell),scalar_old(k,iCell),scalar_old(k+1,iCell))
-              s_min(k,iCell) = min(scalar_old(k-1,iCell),scalar_old(k,iCell),scalar_old(k+1,iCell))
-           end do
+            do k=3,nVertLevels-1
+               wdtn(k,iCell) = flux3( scalar_new(k-2,iCell),scalar_new(k-1,iCell),  &amp;
+                                      scalar_new(k  ,iCell),scalar_new(k+1,iCell),  &amp;
+                                      wwAvg(k,iCell), coef_3rd_order )
+               s_max(k,iCell) = max(scalar_old(k-1,iCell),scalar_old(k,iCell),scalar_old(k+1,iCell))
+               s_min(k,iCell) = min(scalar_old(k-1,iCell),scalar_old(k,iCell),scalar_old(k+1,iCell))
+            end do
  
-           k = nVertLevels
-           wdtn(k,iCell) = wwAvg(k,iCell)*(fnm(k)*scalar_new(k,iCell)+fnp(k)*scalar_new(k-1,iCell))
-           s_max(k,iCell) = max(scalar_old(k,iCell),scalar_old(k-1,iCell))
-           s_min(k,iCell) = min(scalar_old(k,iCell),scalar_old(k-1,iCell))
+            k = nVertLevels
+            wdtn(k,iCell) = wwAvg(k,iCell)*(fnm(k)*scalar_new(k,iCell)+fnp(k)*scalar_new(k-1,iCell))
+            s_max(k,iCell) = max(scalar_old(k,iCell),scalar_old(k-1,iCell))
+            s_min(k,iCell) = min(scalar_old(k,iCell),scalar_old(k-1,iCell))
 
       ! pull s_min and s_max from the (horizontal) surrounding cells
 
-           do i=1, grid % nEdgesOnCell % array(iCell)
-             do k=1, grid % nVertLevels
-               s_max(k,iCell) = max(s_max(k,iCell),scalar_old(k, grid % CellsOnCell % array(i,iCell)))
-               s_min(k,iCell) = min(s_min(k,iCell),scalar_old(k, grid % CellsOnCell % array(i,iCell)))
-             end do
-           end do
+            do i=1, nEdgesOnCell(iCell)
+               do k=1, grid % nVertLevels
+                  s_max(k,iCell) = max(s_max(k,iCell),scalar_old(k, grid % CellsOnCell % array(i,iCell)))
+                  s_min(k,iCell) = min(s_min(k,iCell),scalar_old(k, grid % CellsOnCell % array(i,iCell)))
+               end do
+            end do
 
          end do
 
@@ -1605,77 +1617,77 @@
       !  horizontal flux divergence
 
          flux_arr(:,:) = 0.
-         do iEdge=1,grid%nEdges
+         do iEdge=1,nEdges
 
             cell1 = cellsOnEdge(1,iEdge)
             cell2 = cellsOnEdge(2,iEdge)
 
-            if (cell1 &lt;= grid%nCellsSolve .or. cell2 &lt;= grid%nCellsSolve) then  ! only for owned cells
+            if (cell1 &lt;= nCellsSolve .or. cell2 &lt;= nCellsSolve) then  ! only for owned cells
   
                do i=1,nAdvCellsForEdge(iEdge)
-                 iCell = advCellsForEdge(i,iEdge)
-                 do k=1,grid % nVertLevels
-                   scalar_weight = uhAvg(k,iEdge)*(adv_coefs(i,iEdge) + coef_3rd_order*sign(1.0_RKIND,uhAvg(k,iEdge))*adv_coefs_3rd(i,iEdge))
-                   flux_arr(k,iEdge) = flux_arr(k,iEdge) + scalar_weight* scalar_new(k,iCell)
-                 end do
+                  iCell = advCellsForEdge(i,iEdge)
+                  do k=1,nVertLevels
+                     scalar_weight = uhAvg(k,iEdge)*(adv_coefs(i,iEdge) + coef_3rd_order*sign(1.0_RKIND,uhAvg(k,iEdge))*adv_coefs_3rd(i,iEdge))
+                     flux_arr(k,iEdge) = flux_arr(k,iEdge) + scalar_weight* scalar_new(k,iCell)
+                  end do
                end do
 
             end if
 
-          end do
+         end do
 
 !  vertical flux divergence for upwind update, we will put upwind update into scalar_new, and put factor of dt in fluxes
 
-          do iCell = 1, grid % nCellsSolve
+         do iCell = 1, nCellsSolve
 
             k = 1
             scalar_new(k,iCell) = scalar_old(k,iCell)*h_old(k,iCell)
 
             do k = 2, nVertLevels
-              scalar_new(k,iCell) = scalar_old(k,iCell)*h_old(k,iCell)
-              flux_upwind = dt*(max(0.0_RKIND,wwAvg(k,iCell))*scalar_old(k-1,iCell) + min(0.0_RKIND,wwAvg(k,iCell))*scalar_old(k,iCell))
-              scalar_new(k-1,iCell) = scalar_new(k-1,iCell) - flux_upwind*rdnw(k-1)
-              scalar_new(k  ,iCell) = scalar_new(k  ,iCell) + flux_upwind*rdnw(k)
-              wdtn(k,iCell) = dt*wdtn(k,iCell) - flux_upwind
+               scalar_new(k,iCell) = scalar_old(k,iCell)*h_old(k,iCell)
+               flux_upwind = dt*(max(0.0_RKIND,wwAvg(k,iCell))*scalar_old(k-1,iCell) + min(0.0_RKIND,wwAvg(k,iCell))*scalar_old(k,iCell))
+               scalar_new(k-1,iCell) = scalar_new(k-1,iCell) - flux_upwind*rdnw(k-1)
+               scalar_new(k  ,iCell) = scalar_new(k  ,iCell) + flux_upwind*rdnw(k)
+               wdtn(k,iCell) = dt*wdtn(k,iCell) - flux_upwind
             end do
 
 ! scale_arr(SCALE_IN,:,:) and scale_arr(SCALE_OUT:,:) are used here to store the incoming and outgoing perturbation flux 
 ! contributions to the update:  first the vertical flux component, then the horizontal
 
             do k=1,nVertLevels
-              scale_arr(SCALE_IN, k,iCell) = - rdnw(k)*(min(0.0_RKIND,wdtn(k+1,iCell))-max(0.0_RKIND,wdtn(k,iCell)))
-              scale_arr(SCALE_OUT,k,iCell) = - rdnw(k)*(max(0.0_RKIND,wdtn(k+1,iCell))-min(0.0_RKIND,wdtn(k,iCell)))
+               scale_arr(SCALE_IN, k,iCell) = - rdnw(k)*(min(0.0_RKIND,wdtn(k+1,iCell))-max(0.0_RKIND,wdtn(k,iCell)))
+               scale_arr(SCALE_OUT,k,iCell) = - rdnw(k)*(max(0.0_RKIND,wdtn(k+1,iCell))-min(0.0_RKIND,wdtn(k,iCell)))
             end do
 
-          end do
+         end do
 
 !  horizontal flux divergence for upwind update
 
          !  upwind flux computation
 
-         do iEdge=1,grid%nEdges
-           cell1 = cellsOnEdge(1,iEdge)
-           cell2 = cellsOnEdge(2,iEdge)
-             if (cell1 &lt;= grid%nCellsSolve .or. cell2 &lt;= grid%nCellsSolve) then  ! only for owned cells
-               do k=1,grid % nVertLevels
-                 flux_upwind = grid % dvEdge % array(iEdge) * dt *   &amp;
-                        (max(0.0_RKIND,uhAvg(k,iEdge))*scalar_old(k,cell1) + min(0.0_RKIND,uhAvg(k,iEdge))*scalar_old(k,cell2))
-                 flux_arr(k,iEdge) = dt*flux_arr(k,iEdge) - flux_upwind
-                 scalar_new(k,cell1) = scalar_new(k,cell1) - flux_upwind / areaCell(cell1)
-                 scalar_new(k,cell2) = scalar_new(k,cell2) + flux_upwind / areaCell(cell2)
-
-                 scale_arr(SCALE_OUT,k,cell1) = scale_arr(SCALE_OUT,k,cell1) - max(0.0_RKIND,flux_arr(k,iEdge)) / areaCell(cell1)
-                 scale_arr(SCALE_IN, k,cell1) = scale_arr(SCALE_IN, k,cell1) - min(0.0_RKIND,flux_arr(k,iEdge)) / areaCell(cell1)
-                 scale_arr(SCALE_OUT,k,cell2) = scale_arr(SCALE_OUT,k,cell2) + min(0.0_RKIND,flux_arr(k,iEdge)) / areaCell(cell2)
-                 scale_arr(SCALE_IN, k,cell2) = scale_arr(SCALE_IN, k,cell2) + max(0.0_RKIND,flux_arr(k,iEdge)) / areaCell(cell2)
-
+         do iEdge=1,nEdges
+            cell1 = cellsOnEdge(1,iEdge)
+            cell2 = cellsOnEdge(2,iEdge)
+            if (cell1 &lt;= nCellsSolve .or. cell2 &lt;= nCellsSolve) then  ! only for owned cells
+               do k=1, nVertLevels
+                  flux_upwind = grid % dvEdge % array(iEdge) * dt *   &amp;
+                         (max(0.0_RKIND,uhAvg(k,iEdge))*scalar_old(k,cell1) + min(0.0_RKIND,uhAvg(k,iEdge))*scalar_old(k,cell2))
+                  flux_arr(k,iEdge) = dt*flux_arr(k,iEdge) - flux_upwind
+                  scalar_new(k,cell1) = scalar_new(k,cell1) - flux_upwind / areaCell(cell1)
+                  scalar_new(k,cell2) = scalar_new(k,cell2) + flux_upwind / areaCell(cell2)

+                  scale_arr(SCALE_OUT,k,cell1) = scale_arr(SCALE_OUT,k,cell1) - max(0.0_RKIND,flux_arr(k,iEdge)) / areaCell(cell1)
+                  scale_arr(SCALE_IN, k,cell1) = scale_arr(SCALE_IN, k,cell1) - min(0.0_RKIND,flux_arr(k,iEdge)) / areaCell(cell1)
+                  scale_arr(SCALE_OUT,k,cell2) = scale_arr(SCALE_OUT,k,cell2) + min(0.0_RKIND,flux_arr(k,iEdge)) / areaCell(cell2)
+                  scale_arr(SCALE_IN, k,cell2) = scale_arr(SCALE_IN, k,cell2) + max(0.0_RKIND,flux_arr(k,iEdge)) / areaCell(cell2)

                end do
-             end if
-          end do
+            end if
+         end do
 
 !  next, the limiter
 
-          do iCell = 1, grid % nCellsSolve
+         do iCell = 1, nCellsSolve
             do k = 1, nVertLevels
                s_min_update = (scalar_new(k,iCell)+scale_arr(SCALE_OUT,k,iCell))/h_new(k,iCell)
                s_max_update = (scalar_new(k,iCell)+scale_arr(SCALE_IN,k,iCell))/h_new(k,iCell)
@@ -1688,72 +1700,70 @@
                scale_arr(SCALE_OUT,k,iCell) = min( 1.0_RKIND, max( 0.0_RKIND, scale_factor) )
 
             end do
-          end do
+         end do
 !
-!  communicate scale factors here
+!  communicate scale factors here.
+!  communicate only first halo row in these next two exchanges
 !
-!
-!  WCS_halo_opt_2 - communicate only first halo row in these next two exchanges
-!
-      tempField =&gt; tempFieldTarget
+         tempField =&gt; tempFieldTarget
 
-      tempField % block =&gt; block
-      tempField % dimSizes(1) = 2
-      tempField % dimSizes(2) = grid % nVertLevels
-      tempField % dimSizes(3) = grid % nCells
-      tempField % sendList =&gt; block % parinfo % cellsToSend
-      tempField % recvList =&gt; block % parinfo % cellsToRecv
-      tempField % copyList =&gt; block % parinfo % cellsToCopy
-      tempField % prev =&gt; null()
-      tempField % next =&gt; null()
+         tempField % block =&gt; block
+         tempField % dimSizes(1) = 2
+         tempField % dimSizes(2) = grid % nVertLevels
+         tempField % dimSizes(3) = grid % nCells
+         tempField % sendList =&gt; block % parinfo % cellsToSend
+         tempField % recvList =&gt; block % parinfo % cellsToRecv
+         tempField % copyList =&gt; block % parinfo % cellsToCopy
+         tempField % prev =&gt; null()
+         tempField % next =&gt; null()
 
-      tempField % array =&gt; scale_arr
-      call mpas_dmpar_exch_halo_field(tempField, (/ 1 /))
+         tempField % array =&gt; scale_arr
+         call mpas_dmpar_exch_halo_field(tempField, (/ 1 /))
 
 !
 !  rescale the fluxes
 !
-            do iEdge = 1, grid % nEdges
-               cell1 = grid % cellsOnEdge % array(1,iEdge)
-               cell2 = grid % cellsOnEdge % array(2,iEdge)
-               if (cell1 &lt;= grid%nCellsSolve .or. cell2 &lt;= grid%nCellsSolve) then
-                  do k = 1, nVertLevels
-                     flux = flux_arr(k,iEdge)
-                     flux = max(0.0_RKIND,flux) * min(scale_arr(SCALE_OUT,k,cell1), scale_arr(SCALE_IN, k,cell2)) &amp;
-                          + min(0.0_RKIND,flux) * min(scale_arr(SCALE_IN, k,cell1), scale_arr(SCALE_OUT,k,cell2))
-                     flux_arr(k,iEdge) = flux
-                  end do
-               end if
-            end do
+         do iEdge = 1, nEdges
+            cell1 = cellsOnEdge(1,iEdge)
+            cell2 = cellsOnEdge(2,iEdge)
+            if (cell1 &lt;= nCellsSolve .or. cell2 &lt;= nCellsSolve) then
+               do k = 1, nVertLevels
+                  flux = flux_arr(k,iEdge)
+                  flux = max(0.0_RKIND,flux) * min(scale_arr(SCALE_OUT,k,cell1), scale_arr(SCALE_IN, k,cell2)) &amp;
+                       + min(0.0_RKIND,flux) * min(scale_arr(SCALE_IN, k,cell1), scale_arr(SCALE_OUT,k,cell2))
+                  flux_arr(k,iEdge) = flux
+               end do
+            end if
+         end do
  
        ! rescale the vertical flux
  
-            do iCell=1,grid % nCells
-               do k = 2, nVertLevels
-                  flux =  wdtn(k,iCell)
-                  flux = max(0.0_RKIND,flux) * min(scale_arr(SCALE_OUT,k-1,iCell), scale_arr(SCALE_IN,k  ,iCell)) &amp;
-                       + min(0.0_RKIND,flux) * min(scale_arr(SCALE_OUT,k  ,iCell), scale_arr(SCALE_IN,k-1,iCell))
-                  wdtn(k,iCell) = flux
-               end do
+         do iCell=1,nCells
+            do k = 2, nVertLevels
+               flux =  wdtn(k,iCell)
+               flux = max(0.0_RKIND,flux) * min(scale_arr(SCALE_OUT,k-1,iCell), scale_arr(SCALE_IN,k  ,iCell)) &amp;
+                    + min(0.0_RKIND,flux) * min(scale_arr(SCALE_OUT,k  ,iCell), scale_arr(SCALE_IN,k-1,iCell))
+               wdtn(k,iCell) = flux
             end do
+         end do
 !
 !  do the scalar update now that we have the fluxes
 !
-         do iEdge=1,grid%nEdges
+         do iEdge=1,nEdges
             cell1 = cellsOnEdge(1,iEdge)
             cell2 = cellsOnEdge(2,iEdge)
-            if (cell1 &lt;= grid%nCellsSolve .or. cell2 &lt;= grid%nCellsSolve) then  ! only for owned cells
-               do k=1,grid % nVertLevels
+            if (cell1 &lt;= nCellsSolve .or. cell2 &lt;= nCellsSolve) then  ! only for owned cells
+               do k=1,nVertLevels
                   scalar_new(k,cell1) = scalar_new(k,cell1) - flux_arr(k,iEdge)/areaCell(cell1)
                   scalar_new(k,cell2) = scalar_new(k,cell2) + flux_arr(k,iEdge)/areaCell(cell2)
                end do
             end if
          end do
 
-         do iCell=1,grid % nCellsSolve
-            do k=1,grid % nVertLevels
-              scalar_new(k,iCell) = (   scalar_new(k,iCell)  &amp;
-                  + (-rdnw(k)*(wdtn(k+1,iCell)-wdtn(k,iCell)) ) )/h_new(k,iCell)
+         do iCell=1,nCellsSolve
+            do k=1,nVertLevels
+               scalar_new(k,iCell) = (   scalar_new(k,iCell)  &amp;
+                   + (-rdnw(k)*(wdtn(k+1,iCell)-wdtn(k,iCell)) ) )/h_new(k,iCell)
             end do
          end do
 
@@ -1761,25 +1771,28 @@
 
             scmin = scalar_new(1,1)
             scmax = scalar_new(1,1)
-            do iCell = 1, grid%nCellsSolve
-            do k=1, grid%nVertLevels
+            do iCell = 1, nCellsSolve
+            do k=1, nVertLevels
                scmax = max(scmax,scalar_new(k,iCell))
                scmin = min(scmin,scalar_new(k,iCell))
-               if(s_max(k,iCell) &lt; scalar_new(k,iCell)) then
+               if (s_max(k,iCell) &lt; scalar_new(k,iCell)) then
                   write(32,*) ' over - k,iCell,s_min,s_max,scalar_new ',k,iCell,s_min(k,iCell),s_max(k,iCell),scalar_new(k,iCell)
                end if
-               if(s_min(k,iCell) &gt; scalar_new(k,iCell)) then
+               if (s_min(k,iCell) &gt; scalar_new(k,iCell)) then
                   write(32,*) ' under - k,iCell,s_min,s_max,scalar_new ',k,iCell,s_min(k,iCell),s_max(k,iCell),scalar_new(k,iCell)
                end if
-            enddo
-            enddo
+            end do
+            end do
             write(0,*) ' scmin, scmax new out ',scmin,scmax
             write(0,*) ' icell_min, k_min ',icellmax, kmax
 
          end if
 
-         do iCell = 1, grid%nCells
-         do k=1, grid%nVertLevels
+         ! the update should be positive definite. but roundoff can sometimes leave small negative values
+         ! hence the enforcement of PD in the copy back to the model state.
+
+         do iCell = 1, nCells
+         do k=1, nVertLevels
             s_new % scalars % array(iScalar,k,iCell) = max(0.0_RKIND,scalar_new(k,iCell))
          end do
          end do
@@ -1796,10 +1809,12 @@
    !
    ! Input: s - current model state
    !        grid - grid metadata
+   !        diag - some grid diagnostics
    !
-   ! Output: tend - computed diagnostics (parallel velocities, v; mass fluxes, rv; 
-   !                circulation; vorticity; and kinetic energy, ke) and the 
-   !                tendencies for height (h) and u (u)
+   ! Output: tend - tendencies: tend_u, tend_w, tend_theta and tend_rho
+   !                these are all coupled-variable tendencies.
+   !         various other quantities in diag: Smagorinsky eddy viscosity
+   !                
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
 
       implicit none
@@ -1811,12 +1826,13 @@
       integer, intent(in) :: rk_step
       real (kind=RKIND), intent(in) :: dt
 
+
       logical, parameter :: rk_diffusion = .false.
 
       integer :: iEdge, iCell, iVertex, k, cell1, cell2, vertex1, vertex2, eoe, i, j, iq
       real (kind=RKIND) :: flux, workpv
 
-      integer :: nCells, nEdges, nVertices, nVertLevels, nCellsSolve
+      integer :: nCells, nEdges, nVertices, nVertLevels, nCellsSolve, nEdgesSolve
       real (kind=RKIND) :: h_mom_eddy_visc2,   v_mom_eddy_visc2,   h_mom_eddy_visc4
       real (kind=RKIND) :: h_theta_eddy_visc2, v_theta_eddy_visc2, h_theta_eddy_visc4
       real (kind=RKIND) :: u_diffusion
@@ -1855,10 +1871,8 @@
       real (kind=RKIND) :: cf1, cf2, cf3, pr, pl
       real (kind=RKIND) :: prandtl_inv
 
-!      logical, parameter :: debug = .true.
       logical, parameter :: debug = .false.
 
-      !SHP-curvature
       logical, parameter :: curvature = .true.
       real (kind=RKIND) :: r_earth
       real (kind=RKIND), dimension(:,:), pointer :: ur_cell, vr_cell
@@ -1884,7 +1898,6 @@
 
 !-----------
 
-      !SHP-curvature
       r_earth = grid % sphere_radius
       ur_cell =&gt; diag % uReconstructZonal % array
       vr_cell =&gt; diag % uReconstructMeridional % array
@@ -1967,6 +1980,7 @@
       nVertLevels = grid % nVertLevels
       nVertices   = grid % nVertices
       nCellsSolve = grid % nCellsSolve
+      nEdgesSolve = grid % nEdgesSolve
 
       h_mom_eddy_visc2 = config_h_mom_eddy_visc2
 !     h_mom_eddy_visc4 = config_h_mom_eddy_visc4
@@ -1975,6 +1989,7 @@
 !     h_theta_eddy_visc4 = config_h_theta_eddy_visc4
       v_theta_eddy_visc2 = config_v_theta_eddy_visc2
 
+      nEdgesOnCell =&gt; grid % nEdgesOnCell % array
       nAdvCellsForEdge =&gt; grid % nAdvCellsForEdge % array
       advCellsForEdge =&gt; grid % advCellsForEdge % array
       adv_coefs =&gt; grid % adv_coefs % array
@@ -1982,19 +1997,19 @@
 
       prandtl_inv = 1.0_RKIND/prandtl
 
-      !
-      ! Compute u (normal) velocity tendency for each edge (cell face)
-      !
-
       write(0,*) ' rk_step in compute_dyn_tend ',rk_step
 
 
       delsq_horiz_mixing = .false.
       if (config_horiz_mixing == &quot;2d_smagorinsky&quot; .and. (rk_step == 1 .or. rk_diffusion)) then
+
+         ! Smagorinsky eddy viscosity, based on horizontal deformation (in this case on model coordinate surfaces).
+         ! The integration coefficients were precomputed and stored in defc_a and defc_b
+
          do iCell = 1, nCells
             d_diag(:) = 0.
             d_off_diag(:) = 0.
-            do iEdge = 1, grid % nEdgesOnCell % array (iCell)
+            do iEdge = 1, nEdgesOnCell(iCell)
                do k=1, nVertLevels
                   d_diag(k)     = d_diag(k)     + defc_a(iEdge,iCell)*u(k,EdgesOnCell(iEdge,iCell))  &amp;
                                                 - defc_b(iEdge,iCell)*v(k,EdgesOnCell(iEdge,iCell))
@@ -2003,6 +2018,8 @@
                end do
             end do
             do k=1, nVertLevels
+               ! here is the Smagorinsky formulation, 
+               ! followed by imposition of an upper bound on the eddy viscosity
                kdiff(k,iCell) = (c_s * config_len_disp)**2 * sqrt(d_diag(k)**2 + d_off_diag(k)**2)
                kdiff(k,iCell) = min(kdiff(k,iCell),(0.01*config_len_disp**2)/dt)
             end do
@@ -2027,157 +2044,162 @@
       cf2 = grid % cf2 % scalar
       cf3 = grid % cf3 % scalar
 
-      !  tendency for density
-      !  divergence_ru may calculated in the diagnostic subroutine - it is temporary
+      ! tendency for density.
+      ! accumulate total water here for later use in w tendency calculation.
+
       allocate(divergence_ru(nVertLevels, nCells+1))
       allocate(qtot(nVertLevels, nCells+1))
 
       divergence_ru(:,:) = 0.0
       h_divergence(:,:) = 0.
-      do iEdge=1,grid % nEdges
+
+      ! accumulate horizontal mass-flux
+
+      do iEdge=1,nEdges
          cell1 = cellsOnEdge(1,iEdge)
          cell2 = cellsOnEdge(2,iEdge)
          do k=1,nVertLevels
-           flux = ru(k,iEdge)*dvEdge(iEdge)
-           divergence_ru(k,cell1) = divergence_ru(k,cell1) + flux
-           divergence_ru(k,cell2) = divergence_ru(k,cell2) - flux
+            flux = ru(k,iEdge)*dvEdge(iEdge)
+            divergence_ru(k,cell1) = divergence_ru(k,cell1) + flux
+            divergence_ru(k,cell2) = divergence_ru(k,cell2) - flux
          end do
       end do
 
       qtot(:,:)=0.
+
+      ! compute horiontal mass-flux divergence, add vertical mass flux divergence to complete tend_rho
+
       do iCell = 1,nCells
-        r = 1.0 / areaCell(iCell)
-        do k = 1,nVertLevels
-           divergence_ru(k,iCell) = divergence_ru(k,iCell) * r
-           h_divergence(k,iCell) = divergence_ru(k,iCell)
-           tend_rho(k,iCell) = -divergence_ru(k,iCell)-rdzw(k)*(rw(k+1,iCell)-rw(k,iCell))
+         r = 1.0 / areaCell(iCell)
+         do k = 1,nVertLevels
+            divergence_ru(k,iCell) = divergence_ru(k,iCell) * r
+            h_divergence(k,iCell) = divergence_ru(k,iCell)
+            tend_rho(k,iCell) = -divergence_ru(k,iCell)-rdzw(k)*(rw(k+1,iCell)-rw(k,iCell))
 
-           do iq = s % moist_start, s % moist_end
-              qtot(k,iCell) = qtot(k,iCell) + s % scalars % array (iq, k, iCell)
-           end do
+            do iq = s % moist_start, s % moist_end
+               qtot(k,iCell) = qtot(k,iCell) + s % scalars % array (iq, k, iCell)
+            end do
 
-        end do
+         end do
       end do    
 
-!****       u dyn tend
+      !
+      ! Compute u (normal) velocity tendency for each edge (cell face)
+      !
 
-         do iEdge=1,grid % nEdgesSolve
-            cell1 = cellsOnEdge(1,iEdge)
-            cell2 = cellsOnEdge(2,iEdge)
+      do iEdge=1,nEdgesSolve
 
-           if(newpx)  then
+         cell1 = cellsOnEdge(1,iEdge)
+         cell2 = cellsOnEdge(2,iEdge)
 
-               k = 1
-               pr  = cpr(k,iEdge)*pp(k,cell2)+cpr(k+1,iEdge)*pp(k+1,cell2)+cpr(k+2,iEdge)*pp(k+2,cell2)
-               pl  = cpl(k,iEdge)*pp(k,cell1)+cpl(k+1,iEdge)*pp(k+1,cell1)+cpl(k+2,iEdge)*pp(k+2,cell1)
-               tend_u(k,iEdge) =  - cqu(k,iEdge)*2./(zz(k,cell1)+zz(k,cell2))*(pr-pl)/dcEdge(iEdge)
+         ! horizontal pressure gradient 
 
-               do k=2,nVertLevels
+         if (newpx)  then
 
-                  kr = min(nVertLevels,k+ nint(.5-sign(0.5_RKIND,zx(k,iEdge)+zx(k+1,iEdge))))
-                  kl = min(nVertLevels,2*k+1-kr)
+            k = 1
+            pr  = cpr(k,iEdge)*pp(k,cell2)+cpr(k+1,iEdge)*pp(k+1,cell2)+cpr(k+2,iEdge)*pp(k+2,cell2)
+            pl  = cpl(k,iEdge)*pp(k,cell1)+cpl(k+1,iEdge)*pp(k+1,cell1)+cpl(k+2,iEdge)*pp(k+2,cell1)
+            tend_u(k,iEdge) =  - cqu(k,iEdge)*2./(zz(k,cell1)+zz(k,cell2))*(pr-pl)/dcEdge(iEdge)
 
-                  pr = pp(k,cell2)+.5*(zgrid(k   ,cell1)+zgrid(k +1,cell1)-zgrid(k ,cell2)-zgrid(k +1,cell2))   &amp;
-                                     /(zgrid(kr+1,cell2)-zgrid(kr-1,cell2))*( pp(kr,cell2)-pp   (kr-1,cell2))
-                  pl = pp(k,cell1)+.5*(zgrid(k   ,cell2)+zgrid(k +1,cell2)-zgrid(k ,cell1)-zgrid(k +1,cell1))   &amp;
-                                     /(zgrid(kl+1,cell1)-zgrid(kl-1,cell1))*( pp(kl,cell1)-pp   (kl-1,cell1))
-                  tend_u(k,iEdge) =  - cqu(k,iEdge)*2./(zz(k,cell1)+zz(k,cell2))*(pr-pl)/dcEdge(iEdge)
+            do k=2,nVertLevels
 
-               end do
+               kr = min(nVertLevels,k+ nint(.5-sign(0.5_RKIND,zx(k,iEdge)+zx(k+1,iEdge))))
+               kl = min(nVertLevels,2*k+1-kr)
 
-            else
-               k = 1
-!!              dpzx(k) = .5*zx(k,iEdge)*(cf1*(pp(k  ,cell2)+pp(k  ,cell1))   &amp;
-!!                                       +cf2*(pp(k+1,cell2)+pp(k+1,cell1))   &amp;
-!!                                       +cf3*(pp(k+2,cell2)+pp(k+2,cell1)))
+               pr = pp(k,cell2)+.5*(zgrid(k   ,cell1)+zgrid(k +1,cell1)-zgrid(k ,cell2)-zgrid(k +1,cell2))   &amp;
+                                  /(zgrid(kr+1,cell2)-zgrid(kr-1,cell2))*( pp(kr,cell2)-pp   (kr-1,cell2))
+               pl = pp(k,cell1)+.5*(zgrid(k   ,cell2)+zgrid(k +1,cell2)-zgrid(k ,cell1)-zgrid(k +1,cell1))   &amp;
+                                  /(zgrid(kl+1,cell1)-zgrid(kl-1,cell1))*( pp(kl,cell1)-pp   (kl-1,cell1))
+               tend_u(k,iEdge) =  - cqu(k,iEdge)*2./(zz(k,cell1)+zz(k,cell2))*(pr-pl)/dcEdge(iEdge)
 
-               dpzx(k) = .25*(zx(k,iEdge)+zx(k+1,iEdge))                  &amp;
-                            *(pzm(k,cell2)*(pp(k+1,cell2)-pp(k,cell2))    &amp;
-                             +pzm(k,cell1)*(pp(k+1,cell1)-pp(k,cell1))    &amp;
-                             +pzp(k,cell2)*(pp(k+2,cell2)-pp(k,cell2))    &amp;
-                             +pzp(k,cell1)*(pp(k+2,cell1)-pp(k,cell1))) 
-  
-               do k = 2, nVertLevels-1
+            end do
 
-!!              dpzx(k) = .5*zx(k,iEdge)*(fzm(k)*(pp(k  ,cell2)+pp(k  ,cell1))  &amp;
-!!                                   +fzp(k)*(pp(k-1,cell2)+pp(k-1,cell1)))
+         else
+            k = 1
 
-                  dpzx(k) = .25*(zx(k,iEdge)+zx(k+1,iEdge))                  &amp;
-                             *(pzp(k,cell2)*(pp(k+1,cell2)-pp(k  ,cell2))    &amp;
-                              +pzm(k,cell2)*(pp(k  ,cell2)-pp(k-1,cell2))    &amp;
-                              +pzp(k,cell1)*(pp(k+1,cell1)-pp(k  ,cell1))    &amp;
-                              +pzm(k,cell1)*(pp(k  ,cell1)-pp(k-1,cell1)))   
+            dpzx(k) = .25*(zx(k,iEdge)+zx(k+1,iEdge))                  &amp;
+                         *(pzm(k,cell2)*(pp(k+1,cell2)-pp(k,cell2))    &amp;
+                          +pzm(k,cell1)*(pp(k+1,cell1)-pp(k,cell1))    &amp;
+                          +pzp(k,cell2)*(pp(k+2,cell2)-pp(k,cell2))    &amp;
+                          +pzp(k,cell1)*(pp(k+2,cell1)-pp(k,cell1))) 
+  
+            do k = 2, nVertLevels-1
 
-               end do
-
-               k = nVertLevels
                dpzx(k) = .25*(zx(k,iEdge)+zx(k+1,iEdge))                  &amp;
-                          *(pzm(k,cell2)*(pp(k  ,cell2)-pp(k-1,cell2))    &amp;
+                          *(pzp(k,cell2)*(pp(k+1,cell2)-pp(k  ,cell2))    &amp;
+                           +pzm(k,cell2)*(pp(k  ,cell2)-pp(k-1,cell2))    &amp;
+                           +pzp(k,cell1)*(pp(k+1,cell1)-pp(k  ,cell1))    &amp;
                            +pzm(k,cell1)*(pp(k  ,cell1)-pp(k-1,cell1)))   
 
-!!               dpzx(nVertLevels+1) = 0.
+            end do
 
-               do k=1,nVertLevels
+            k = nVertLevels
+            dpzx(k) = .25*(zx(k,iEdge)+zx(k+1,iEdge))                  &amp;
+                       *(pzm(k,cell2)*(pp(k  ,cell2)-pp(k-1,cell2))    &amp;
+                        +pzm(k,cell1)*(pp(k  ,cell1)-pp(k-1,cell1)))   
 
-!!                  tend_u(k,iEdge) =  - cqu(k,iEdge)*( (pp(k,cell2)/zz(k,cell2) - pp(k,cell1)/zz(k,cell1))  &amp;
-!!                                                   /  dcEdge(iEdge) - rdzw(k)*(dpzx(k+1)-dpzx(k)) )
+            do k=1,nVertLevels
 
-                  tend_u(k,iEdge) =  - cqu(k,iEdge)*((pp(k,cell2)-pp(k,cell1))/dcEdge(iEdge)   &amp;
-                                          - dpzx(k) ) / (.5*(zz(k,cell2)+zz(k,cell1)))
-               end do
+               tend_u(k,iEdge) =  - cqu(k,iEdge)*((pp(k,cell2)-pp(k,cell1))/dcEdge(iEdge)   &amp;
+                                       - dpzx(k) ) / (.5*(zz(k,cell2)+zz(k,cell1)))
+            end do
 
-            end if
+         end if
 
-            wduz(1) = 0.
-            k = 2
-            wduz(k) =  0.5*( rw(k,cell1)+rw(k,cell2) )*(fzm(k)*u(k,iEdge)+fzp(k)*u(k-1,iEdge))  
-            do k=3,nVertLevels-1
-               wduz(k) = flux3( u(k-2,iEdge),u(k-1,iEdge),u(k,iEdge),u(k+1,iEdge),0.5*(rw(k,cell1)+rw(k,cell2)), 1.0_RKIND )
-            end do
-            k = nVertLevels
-            wduz(k) =  0.5*( rw(k,cell1)+rw(k,cell2) )*(fzm(k)*u(k,iEdge)+fzp(k)*u(k-1,iEdge))  
+         ! vertical transport of u
 
-            wduz(nVertLevels+1) = 0.
+         wduz(1) = 0.
+         k = 2
+         wduz(k) =  0.5*( rw(k,cell1)+rw(k,cell2) )*(fzm(k)*u(k,iEdge)+fzp(k)*u(k-1,iEdge))  
+         do k=3,nVertLevels-1
+            wduz(k) = flux3( u(k-2,iEdge),u(k-1,iEdge),u(k,iEdge),u(k+1,iEdge),0.5*(rw(k,cell1)+rw(k,cell2)), 1.0_RKIND )
+         end do
+         k = nVertLevels
+         wduz(k) =  0.5*( rw(k,cell1)+rw(k,cell2) )*(fzm(k)*u(k,iEdge)+fzp(k)*u(k-1,iEdge))  
 
+         wduz(nVertLevels+1) = 0.
+
+         do k=1,nVertLevels
+            tend_u(k,iEdge) = tend_u(k,iEdge) - rdzw(k)*(wduz(k+1)-wduz(k)) 
+         end do
+
+         ! Next, nonlinear Coriolis term (q) following Ringler et al JCP 2009
+
+         q(:) = 0.0
+         do j = 1,nEdgesOnEdge(iEdge)
+            eoe = edgesOnEdge(j,iEdge)
             do k=1,nVertLevels
-!               tend_u(k,iEdge) =  - cqu(k,iEdge)*( (pp(k,cell2)/zz(k,cell2) - pp(k,cell1)/zz(k,cell1))  &amp;
-!                                                /  dcEdge(iEdge) - rdzw(k)*(dpzx(k+1)-dpzx(k)) )
-               tend_u(k,iEdge) = tend_u(k,iEdge) - rdzw(k)*(wduz(k+1)-wduz(k)) 
+               workpv = 0.5 * (pv_edge(k,iEdge) + pv_edge(k,eoe))
+               q(k) = q(k) + weightsOnEdge(j,iEdge) * u(k,eoe) * workpv * rho_edge(k,eoe)
             end do
+         end do
 
-               q(:) = 0.0
-               do j = 1,nEdgesOnEdge(iEdge)
-                 eoe = edgesOnEdge(j,iEdge)
-                 do k=1,nVertLevels
-                   workpv = 0.5 * (pv_edge(k,iEdge) + pv_edge(k,eoe))
-                   q(k) = q(k) + weightsOnEdge(j,iEdge) * u(k,eoe) * workpv * rho_edge(k,eoe)
-                 end do
-               end do
+         do k=1,nVertLevels
 
-            do k=1,nVertLevels
-               tend_u(k,iEdge) = tend_u(k,iEdge) + rho_edge(k,iEdge)* (q(k) - (ke(k,cell2) - ke(k,cell1))       &amp;
-                                                                    / dcEdge(iEdge))                            &amp;
-                                                - u(k,iEdge)*0.5*(divergence_ru(k,cell1)+divergence_ru(k,cell2)) 
-               !SHP-curvature
-               if (curvature) then
+            ! horizontal ke gradient and vorticity terms in the vector invariant formulation
+            ! of the horizontal momentum equation
+            tend_u(k,iEdge) = tend_u(k,iEdge) + rho_edge(k,iEdge)* (q(k) - (ke(k,cell2) - ke(k,cell1))       &amp;
+                                                                 / dcEdge(iEdge))                            &amp;
+                                             - u(k,iEdge)*0.5*(divergence_ru(k,cell1)+divergence_ru(k,cell2)) 
+            if (curvature) then
+
+               ! curvature terms for the sphere
+
                tend_u(k,iEdge) = tend_u(k,iEdge) &amp;
                                 - 2.*omega*cos(grid % angleEdge % array(iEdge))*cos(grid % latEdge % array(iEdge))  &amp;
                                   *rho_edge(k,iEdge)*.25*(w(k,cell1)+w(k+1,cell1)+w(k,cell2)+w(k+1,cell2))          &amp; 
                                 - u(k,iEdge)*.25*(w(k+1,cell1)+w(k,cell1)+w(k,cell2)+w(k+1,cell2))                  &amp;
                                   *rho_edge(k,iEdge)/r_earth
-               !old-err.
-               !tend_u(k,iEdge) = tend_u(k,iEdge) &amp;
-               !                 - 2.*omega_e*cos(grid % angleEdge % array(iEdge))*cos(grid % latEdge % array(iEdge))  &amp;
-               !                   *.25*(rw(k,cell1)+rw(k+1,cell1)+rw(k,cell2)+rw(k+1,cell2))                          &amp;
-               !                 - u(k,iEdge)*.25*(rw(k+1,cell1)+rw(k,cell1)+rw(k,cell2)+rw(k+1,cell2))/r_earth
-               end if
-            end do
+            end if
          end do
+      end do
 
       deallocate(divergence_ru)
 
       !
       !  horizontal mixing for u
+      !  mixing terms are integrated using forward-Euler, so this tendency is only computed in the
+      !  first Runge-Kutta substep and saved for use in later RK substeps 2 and 3.
       !
 
       if (rk_step == 1 .or. rk_diffusion) then
@@ -2186,60 +2208,63 @@
 
       if (delsq_horiz_mixing) then
 
-        if ((h_mom_eddy_visc2 &gt; 0.0) .and. (config_horiz_mixing == &quot;2d_fixed&quot;)) then
-           do iEdge=1,grid % nEdgesSolve
-              cell1 = cellsOnEdge(1,iEdge)
-              cell2 = cellsOnEdge(2,iEdge)
-              vertex1 = verticesOnEdge(1,iEdge)
-              vertex2 = verticesOnEdge(2,iEdge)
+         if ((h_mom_eddy_visc2 &gt; 0.0) .and. (config_horiz_mixing == &quot;2d_fixed&quot;)) then
+            do iEdge=1, nEdgesSolve
+               cell1 = cellsOnEdge(1,iEdge)
+               cell2 = cellsOnEdge(2,iEdge)
+               vertex1 = verticesOnEdge(1,iEdge)
+               vertex2 = verticesOnEdge(2,iEdge)
+   
+               do k=1,nVertLevels
   
-              do k=1,nVertLevels
+                  !
+                  ! Compute diffusion, computed as </font>
<font color="black">abla divergence - k \times </font>
<font color="red">abla vorticity
+                  !                    only valid for h_mom_eddy_visc2 == constant
+                  !
+                  ! Note that we impose a lower bound on the edge length used in the derivative of the vorticity;
+                  ! this is done to avoid an overly stringent stability constraint for small edge lengths that can
+                  ! occur on some variable-resolution meshes.
+                  !
+                  u_diffusion =   ( divergence(k,cell2)  - divergence(k,cell1) ) / dcEdge(iEdge)  &amp;
+                                 -( vorticity(k,vertex2) - vorticity(k,vertex1) ) / max(dvEdge(iEdge),0.25*dcEdge(iEdge))
+                  u_diffusion = rho_edge(k,iEdge)*h_mom_eddy_visc2 * u_diffusion
+                  u_diffusion = u_diffusion * meshScalingDel2(iEdge)
   
-                 !
-                 ! Compute diffusion, computed as </font>
<font color="black">abla divergence - k \times </font>
<font color="red">abla vorticity
-                 !                    only valid for h_mom_eddy_visc2 == constant
-                 !
-                 u_diffusion =   ( divergence(k,cell2)  - divergence(k,cell1) ) / dcEdge(iEdge)  &amp;
-                                -( vorticity(k,vertex2) - vorticity(k,vertex1) ) / max(dvEdge(iEdge),0.25*dcEdge(iEdge))
-                 u_diffusion = rho_edge(k,iEdge)*h_mom_eddy_visc2 * u_diffusion
-                 u_diffusion = u_diffusion * meshScalingDel2(iEdge)
-  
-!                 tend_u(k,iEdge) = tend_u(k,iEdge) + u_diffusion
-                 tend_u_euler(k,iEdge) = tend_u_euler(k,iEdge) + u_diffusion
-              end do
-           end do
+                  tend_u_euler(k,iEdge) = tend_u_euler(k,iEdge) + u_diffusion
+               end do
+            end do
 
-        else if ( config_horiz_mixing == &quot;2d_smagorinsky&quot;) then
+         else if ( config_horiz_mixing == &quot;2d_smagorinsky&quot;) then
 
-           do iEdge=1,grid % nEdgesSolve
-              cell1 = cellsOnEdge(1,iEdge)
-              cell2 = cellsOnEdge(2,iEdge)
-              vertex1 = verticesOnEdge(1,iEdge)
-              vertex2 = verticesOnEdge(2,iEdge)
+            do iEdge=1, nEdgesSolve
+               cell1 = cellsOnEdge(1,iEdge)
+               cell2 = cellsOnEdge(2,iEdge)
+               vertex1 = verticesOnEdge(1,iEdge)
+               vertex2 = verticesOnEdge(2,iEdge)

+               do k=1,nVertLevels
+                  !
+                  ! Compute diffusion, computed as </font>
<font color="black">abla divergence - k \times </font>
<font color="red">abla vorticity
+                  !                    only valid for h_mom_eddy_visc2 == constant
+                  !
+                  u_diffusion =   ( divergence(k,cell2)  - divergence(k,cell1) ) / dcEdge(iEdge)  &amp;
+                                 -( vorticity(k,vertex2) - vorticity(k,vertex1) ) / max(dvEdge(iEdge),0.25*dcEdge(iEdge))
+                  u_diffusion = rho_edge(k,iEdge)* 0.5*(kdiff(k,cell1)+kdiff(k,cell2)) * u_diffusion
+                  u_diffusion = u_diffusion * meshScalingDel2(iEdge)

+                  tend_u_euler(k,iEdge) = tend_u_euler(k,iEdge) + u_diffusion
+               end do
+            end do
+         end if
 
-              do k=1,nVertLevels
-                 !
-                 ! Compute diffusion, computed as </font>
<font color="black">abla divergence - k \times </font>
<font color="gray">abla vorticity
-                 !                    only valid for h_mom_eddy_visc2 == constant
-                 !
-                 u_diffusion =   ( divergence(k,cell2)  - divergence(k,cell1) ) / dcEdge(iEdge)  &amp;
-                                -( vorticity(k,vertex2) - vorticity(k,vertex1) ) / max(dvEdge(iEdge),0.25*dcEdge(iEdge))
-                 u_diffusion = rho_edge(k,iEdge)* 0.5*(kdiff(k,cell1)+kdiff(k,cell2)) * u_diffusion
-                 u_diffusion = u_diffusion * meshScalingDel2(iEdge)
-
-!                 tend_u(k,iEdge) = tend_u(k,iEdge) + u_diffusion
-                 tend_u_euler(k,iEdge) = tend_u_euler(k,iEdge) + u_diffusion
-              end do
-           end do
-        end if
-
       end if ! delsq_horiz_mixing for u
 
-!ldf (2012-10-10): Modified loop below to allow hyper-diffusion when 2d_smagorinsky is set to true.
-!     if ((h_mom_eddy_visc4 &gt; 0.0) .and. (config_horiz_mixing == &quot;2d_fixed&quot;)) then
       if ((h_mom_eddy_visc4 &gt; 0.0 .and. config_horiz_mixing == &quot;2d_fixed&quot;) .or. &amp;
           (h_mom_eddy_visc4 &gt; 0.0 .and. config_horiz_mixing == &quot;2d_smagorinsky&quot;)) then
 
+         ! del^4 horizontal filter.  We compute this as del^2 ( del^2 (u) ).
+         ! First, storage to hold the result from the first del^2 computation.
+
          allocate(delsq_divergence(nVertLevels, nCells+1))
          allocate(delsq_u(nVertLevels, nEdges+1))
          allocate(delsq_circulation(nVertLevels, nVertices+1))
@@ -2247,7 +2272,7 @@
 
          delsq_u(:,:) = 0.0
 
-         do iEdge=1,grid % nEdges
+         do iEdge=1, nEdges
             cell1 = cellsOnEdge(1,iEdge)
             cell2 = cellsOnEdge(2,iEdge)
             vertex1 = verticesOnEdge(1,iEdge)
@@ -2268,10 +2293,10 @@
 
          delsq_circulation(:,:) = 0.0
          do iEdge=1,nEdges
-               do k=1,nVertLevels
-                  delsq_circulation(k,verticesOnEdge(1,iEdge)) = delsq_circulation(k,verticesOnEdge(1,iEdge)) - dcEdge(iEdge) * delsq_u(k,iEdge)
-                  delsq_circulation(k,verticesOnEdge(2,iEdge)) = delsq_circulation(k,verticesOnEdge(2,iEdge)) + dcEdge(iEdge) * delsq_u(k,iEdge)
-               end do
+            do k=1,nVertLevels
+               delsq_circulation(k,verticesOnEdge(1,iEdge)) = delsq_circulation(k,verticesOnEdge(1,iEdge)) - dcEdge(iEdge) * delsq_u(k,iEdge)
+               delsq_circulation(k,verticesOnEdge(2,iEdge)) = delsq_circulation(k,verticesOnEdge(2,iEdge)) + dcEdge(iEdge) * delsq_u(k,iEdge)
+            end do
          end do
          do iVertex=1,nVertices
             r = 1.0 / areaTriangle(iVertex)
@@ -2284,10 +2309,10 @@
          do iEdge=1,nEdges
             cell1 = cellsOnEdge(1,iEdge)
             cell2 = cellsOnEdge(2,iEdge)
-               do k=1,nVertLevels
-                 delsq_divergence(k,cell1) = delsq_divergence(k,cell1) + delsq_u(k,iEdge)*dvEdge(iEdge)
-                 delsq_divergence(k,cell2) = delsq_divergence(k,cell2) - delsq_u(k,iEdge)*dvEdge(iEdge)
-               end do
+            do k=1,nVertLevels
+               delsq_divergence(k,cell1) = delsq_divergence(k,cell1) + delsq_u(k,iEdge)*dvEdge(iEdge)
+               delsq_divergence(k,cell2) = delsq_divergence(k,cell2) - delsq_u(k,iEdge)*dvEdge(iEdge)
+            end do
          end do
          do iCell = 1,nCells
             r = 1.0 / areaCell(iCell)
@@ -2296,7 +2321,7 @@
             end do
          end do
 
-         do iEdge=1,grid % nEdgesSolve
+         do iEdge=1,nEdgesSolve
             cell1 = cellsOnEdge(1,iEdge)
             cell2 = cellsOnEdge(2,iEdge)
             vertex1 = verticesOnEdge(1,iEdge)
@@ -2308,11 +2333,14 @@
                ! Compute diffusion, computed as </font>
<font color="black">abla divergence - k \times </font>
<font color="gray">abla vorticity
                !                    only valid for h_mom_eddy_visc4 == constant
                !
-               u_diffusion =  rho_edge(k,iEdge) * (  ( delsq_divergence(k,cell2)  - delsq_divergence(k,cell1) ) / dcEdge(iEdge)  &amp;
+               ! Here, we scale the diffusion on the divergence part a factor of config_del4u_div_factor 
+               !    relative to the rotational part.  The stability constraint on the divergence component is much less
+               !    stringent than the rotational part, and this flexibility may be useful.
+               !
+               u_diffusion =  rho_edge(k,iEdge) * ( config_del4u_div_factor * ( delsq_divergence(k,cell2)  - delsq_divergence(k,cell1) ) / dcEdge(iEdge)  &amp;
                            -( delsq_vorticity(k,vertex2) - delsq_vorticity(k,vertex1) ) / max(dvEdge(iEdge), 0.25*dcEdge(iEdge)) &amp;
                                                   )
 
-!               tend_u(k,iEdge) = tend_u(k,iEdge) - h_mom_eddy_visc4 * u_diffusion
                u_diffusion = u_diffusion * meshScalingDel4(iEdge)
                tend_u_euler(k,iEdge) = tend_u_euler(k,iEdge) - h_mom_eddy_visc4 * u_diffusion
             end do
@@ -2326,71 +2354,65 @@
       end if
 
       !
-      !  vertical mixing for u - 2nd order 
+      !  vertical mixing for u - 2nd order filter in physical (z) space
       !
       if ( v_mom_eddy_visc2 &gt; 0.0 ) then
 
          if (config_mix_full) then
 
-         do iEdge=1,grid % nEdgesSolve
+            do iEdge=1,nEdgesSolve
 
-            cell1 = cellsOnEdge(1,iEdge)
-            cell2 = cellsOnEdge(2,iEdge)
+               cell1 = cellsOnEdge(1,iEdge)
+               cell2 = cellsOnEdge(2,iEdge)
 
-            do k=2,nVertLevels-1
+               do k=2,nVertLevels-1
 
-               z1 = 0.5*(zgrid(k-1,cell1)+zgrid(k-1,cell2))
-               z2 = 0.5*(zgrid(k  ,cell1)+zgrid(k  ,cell2))
-               z3 = 0.5*(zgrid(k+1,cell1)+zgrid(k+1,cell2))
-               z4 = 0.5*(zgrid(k+2,cell1)+zgrid(k+2,cell2))
+                  z1 = 0.5*(zgrid(k-1,cell1)+zgrid(k-1,cell2))
+                  z2 = 0.5*(zgrid(k  ,cell1)+zgrid(k  ,cell2))
+                  z3 = 0.5*(zgrid(k+1,cell1)+zgrid(k+1,cell2))
+                  z4 = 0.5*(zgrid(k+2,cell1)+zgrid(k+2,cell2))
 
-               zm = 0.5*(z1+z2)
-               z0 = 0.5*(z2+z3)
-               zp = 0.5*(z3+z4)
+                  zm = 0.5*(z1+z2)
+                  z0 = 0.5*(z2+z3)
+                  zp = 0.5*(z3+z4)
 
-!               tend_u(k,iEdge) = tend_u(k,iEdge) + rho_edge(k,iEdge) * v_mom_eddy_visc2*(  &amp;
-!                                  (u(k+1,iEdge)-u(k  ,iEdge))/(zp-z0)                      &amp;
-!                                 -(u(k  ,iEdge)-u(k-1,iEdge))/(z0-zm) )/(0.5*(zp-zm))
-               tend_u_euler(k,iEdge) = tend_u_euler(k,iEdge) + rho_edge(k,iEdge) * v_mom_eddy_visc2*(  &amp;
-                                  (u(k+1,iEdge)-u(k  ,iEdge))/(zp-z0)                      &amp;
-                                 -(u(k  ,iEdge)-u(k-1,iEdge))/(z0-zm) )/(0.5*(zp-zm))
+                  tend_u_euler(k,iEdge) = tend_u_euler(k,iEdge) + rho_edge(k,iEdge) * v_mom_eddy_visc2*(  &amp;
+                                     (u(k+1,iEdge)-u(k  ,iEdge))/(zp-z0)                      &amp;
+                                    -(u(k  ,iEdge)-u(k-1,iEdge))/(z0-zm) )/(0.5*(zp-zm))
+               end do
             end do
-         end do
 
          else  ! idealized cases where we mix on the perturbation from the initial 1-D state
 
-         do iEdge=1,grid % nEdgesSolve
+            do iEdge=1,nEdgesSolve
 
-            cell1 = cellsOnEdge(1,iEdge)
-            cell2 = cellsOnEdge(2,iEdge)
+               cell1 = cellsOnEdge(1,iEdge)
+               cell2 = cellsOnEdge(2,iEdge)
 
-            do k=1,nVertLevels
+               do k=1,nVertLevels
 #ifdef ROTATED_GRID
-              u_mix(k) = u(k,iEdge) - grid % u_init % array(k) * sin( grid % angleEdge % array(iEdge) )
+                  u_mix(k) = u(k,iEdge) - grid % u_init % array(k) * sin( grid % angleEdge % array(iEdge) )
 #else
-              u_mix(k) = u(k,iEdge) - grid % u_init % array(k) * cos( grid % angleEdge % array(iEdge) )
+                  u_mix(k) = u(k,iEdge) - grid % u_init % array(k) * cos( grid % angleEdge % array(iEdge) )
 #endif
-            end do
+               end do
 
-            do k=2,nVertLevels-1
+               do k=2,nVertLevels-1
 
-               z1 = 0.5*(zgrid(k-1,cell1)+zgrid(k-1,cell2))
-               z2 = 0.5*(zgrid(k  ,cell1)+zgrid(k  ,cell2))
-               z3 = 0.5*(zgrid(k+1,cell1)+zgrid(k+1,cell2))
-               z4 = 0.5*(zgrid(k+2,cell1)+zgrid(k+2,cell2))
+                  z1 = 0.5*(zgrid(k-1,cell1)+zgrid(k-1,cell2))
+                  z2 = 0.5*(zgrid(k  ,cell1)+zgrid(k  ,cell2))
+                  z3 = 0.5*(zgrid(k+1,cell1)+zgrid(k+1,cell2))
+                  z4 = 0.5*(zgrid(k+2,cell1)+zgrid(k+2,cell2))
 
-               zm = 0.5*(z1+z2)
-               z0 = 0.5*(z2+z3)
-               zp = 0.5*(z3+z4)
+                  zm = 0.5*(z1+z2)
+                  z0 = 0.5*(z2+z3)
+                  zp = 0.5*(z3+z4)
 
-               tend_u_euler(k,iEdge) = tend_u_euler(k,iEdge) + rho_edge(k,iEdge) * v_mom_eddy_visc2*(  &amp;
-                                  (u_mix(k+1)-u_mix(k  ))/(zp-z0)                      &amp;
-                                 -(u_mix(k  )-u_mix(k-1))/(z0-zm) )/(0.5*(zp-zm))
-!               tend_u(k,iEdge) = tend_u(k,iEdge) + rho_edge(k,iEdge) * v_mom_eddy_visc2*(  &amp;
-!                                  (u_mix(k+1)-u_mix(k  ))/(zp-z0)                      &amp;
-!                                 -(u_mix(k  )-u_mix(k-1))/(z0-zm) )/(0.5*(zp-zm))
+                  tend_u_euler(k,iEdge) = tend_u_euler(k,iEdge) + rho_edge(k,iEdge) * v_mom_eddy_visc2*(  &amp;
+                                     (u_mix(k+1)-u_mix(k  ))/(zp-z0)                      &amp;
+                                    -(u_mix(k  )-u_mix(k-1))/(z0-zm) )/(0.5*(zp-zm))
+               end do
             end do
-         end do
 
          end if
 
@@ -2400,7 +2422,7 @@
 
 !  add in mixing for u
 
-      do iEdge=1,grid % nEdgesSolve
+      do iEdge=1,nEdgesSolve
          do k=1,nVertLevels
             tend_u(k,iEdge) = tend_u(k,iEdge) + tend_u_euler(k,iEdge)
          end do
@@ -2420,7 +2442,7 @@
             cell1 = cellsOnEdge(1,iEdge)
             cell2 = cellsOnEdge(2,iEdge)
             if (cell1 &lt;= nCellsSolve .or. cell2 &lt;= nCellsSolve) then
-               do k=2,grid % nVertLevels
+               do k=2,nVertLevels
                   flux = dvEdge(iEdge) * (fzm(k)*ru(k,iEdge) + fzp(k)*ru(k-1,iEdge) ) &amp;
                                         *(w(k,cell1) + w(k,cell2))*0.5 
                   tend_w(k,cell1) = tend_w(k,cell1) - flux
@@ -2436,53 +2458,27 @@
             cell2 = cellsOnEdge(2,iEdge)
             if (cell1 &lt;= nCellsSolve .or. cell2 &lt;= nCellsSolve) then
 
-               do k=2,grid % nVertLevels
-                 ru_edge_w(k) = fzm(k)*ru(k,iEdge) + fzp(k)*ru(k-1,iEdge)
+               do k=2,nVertLevels
+                  ru_edge_w(k) = fzm(k)*ru(k,iEdge) + fzp(k)*ru(k-1,iEdge)
                end do
 
                flux_arr(:) = 0.
+
+               ! flux_arr stores the value of w at the cell edge used in the horizontal transport
+
                do i=1,nAdvCellsForEdge(iEdge)
-                 iCell = advCellsForEdge(i,iEdge)
-                 do k=2,grid % nVertLevels
-                   scalar_weight = adv_coefs(i,iEdge) + coef_3rd_order*sign(1.0_RKIND,ru_edge_w(k))*adv_coefs_3rd(i,iEdge)
-                   flux_arr(k) = flux_arr(k) + scalar_weight* w(k,iCell)
-                 end do
+                  iCell = advCellsForEdge(i,iEdge)
+                  do k=2,nVertLevels
+                     scalar_weight = adv_coefs(i,iEdge) + coef_3rd_order*sign(1.0_RKIND,ru_edge_w(k))*adv_coefs_3rd(i,iEdge)
+                     flux_arr(k) = flux_arr(k) + scalar_weight* w(k,iCell)
+                  end do
                end do
 
-               do k=1,grid % nVertLevels
+               do k=1,nVertLevels
                   tend_w(k,cell1) = tend_w(k,cell1) - ru_edge_w(k)*flux_arr(k)
                   tend_w(k,cell2) = tend_w(k,cell2) + ru_edge_w(k)*flux_arr(k)
                end do
 
-!               do k=2,grid % nVertLevels
-!
-!                  d2fdx2_cell1 = deriv_two(1,1,iEdge) * w(k,cell1)
-!                  d2fdx2_cell2 = deriv_two(1,2,iEdge) * w(k,cell2)
-!                  do i=1, grid % nEdgesOnCell % array (cell1)
-!                     if ( grid % CellsOnCell % array (i,cell1) &lt;= grid%nCells) &amp;
-!                     d2fdx2_cell1 = d2fdx2_cell1 + deriv_two(i+1,1,iEdge) * w(k,grid % CellsOnCell % array (i,cell1))
-!                  end do
-!                  do i=1, grid % nEdgesOnCell % array (cell2)
-!                     if ( grid % CellsOnCell % array (i,cell2) &lt;= grid%nCells) &amp;
-!                     d2fdx2_cell2 = d2fdx2_cell2 + deriv_two(i+1,2,iEdge) * w(k,grid % CellsOnCell % array (i,cell2))
-!                  end do
-!
-!  3rd order stencil
-!                  if( u(k,iEdge)+u(k-1,iEdge) &gt; 0) then
-!                     flux = dvEdge(iEdge) * (fzm(k)*ru(k,iEdge) + fzp(k)*ru(k-1,iEdge))*(  &amp;
-!                                             0.5*(w(k,cell1) + w(k,cell2))                 &amp;
-!                                            -(dcEdge(iEdge) **2) * (d2fdx2_cell1) / 6. )
-!                  else
-!                     flux = dvEdge(iEdge) * (fzm(k)*ru(k,iEdge) + fzp(k)*ru(k-1,iEdge))*(  &amp;
-!                                             0.5*(w(k,cell1) + w(k,cell2))                 &amp;
-!                                            -(dcEdge(iEdge) **2) * (d2fdx2_cell2) / 6. )
-!                  end if
-!
-!                  tend_w(k,cell1) = tend_w(k,cell1) - flux
-!                  tend_w(k,cell2) = tend_w(k,cell2) + flux
-!
-!               end do
-
             end if
          end do
 
@@ -2493,16 +2489,16 @@
             cell2 = cellsOnEdge(2,iEdge)
             if (cell1 &lt;= nCellsSolve .or. cell2 &lt;= nCellsSolve) then
 
-               do k=2,grid % nVertLevels
+               do k=2,nVertLevels
 
                   d2fdx2_cell1 = deriv_two(1,1,iEdge) * w(k,cell1)
                   d2fdx2_cell2 = deriv_two(1,2,iEdge) * w(k,cell2)
-                  do i=1, grid % nEdgesOnCell % array (cell1)
-                     if ( grid % CellsOnCell % array (i,cell1) &lt;= grid%nCells) &amp;
+                  do i=1, nEdgesOnCell(cell1)
+                     if ( grid % CellsOnCell % array (i,cell1) &lt;= nCells) &amp;
                      d2fdx2_cell1 = d2fdx2_cell1 + deriv_two(i+1,1,iEdge) * w(k,grid % CellsOnCell % array (i,cell1))
                   end do
-                  do i=1, grid % nEdgesOnCell % array (cell2)
-                     if ( grid % CellsOnCell % array (i,cell2) &lt;= grid%nCells) &amp;
+                  do i=1, nEdgesOnCell(cell2)
+                     if ( grid % CellsOnCell % array (i,cell2) &lt;= nCells) &amp;
                      d2fdx2_cell2 = d2fdx2_cell2 + deriv_two(i+1,2,iEdge) * w(k,grid % CellsOnCell % array (i,cell2))
                   end do
 
@@ -2519,10 +2515,9 @@
          end do
       end if
 
-      !SHP-curvature
       if (curvature) then
 
-         do iCell = 1, grid % nCellsSolve
+         do iCell = 1, nCellsSolve
             do k=2,nVertLevels
                tend_w(k,iCell) = tend_w(k,iCell) + (rho_zz(k,iCell)*fzm(k)+rho_zz(k-1,iCell)*fzp(k))*          &amp;
                                          ( (fzm(k)*ur_cell(k,iCell)+fzp(k)*ur_cell(k-1,iCell))**2.             &amp;
@@ -2531,12 +2526,6 @@
                                           *(fzm(k)*ur_cell(k,iCell)+fzp(k)*ur_cell(k-1,iCell))                 &amp;
                                           *(rho_zz(k,iCell)*fzm(k)+rho_zz(k-1,iCell)*fzp(k))
 
-               !old_err.
-               !tend_w(k,iCell) = tend_w(k,iCell) &amp;
-               !                 + rho_zz(k,iCell)*( (fzm(k)*ur_cell(k,iCell)+fzp(k)*ur_cell(k-1,iCell))**2.          &amp;
-               !                                 +(fzm(k)*vr_cell(k,iCell)+fzp(k)*vr_cell(k-1,iCell))**2. )/r_earth   &amp;
-               !                 + 2.*omega_e*cos(grid % latCell % array(iCell))*rho_zz(k,iCell)      &amp;
-               !                     *(fzm(k)*ur_cell(k,iCell)+fzp(k)*ur_cell(k-1,iCell))
             end do
          end do
 
@@ -2547,111 +2536,106 @@
       !  but here we can also code in hyperdiffusion if we wish (2nd order at present)
       !
 
-      !  Note: we are using quite a bit of the theta code here - could be combined later???
-
       if (rk_step == 1 .or. rk_diffusion) then
 
-      tend_w_euler = 0.
+         tend_w_euler = 0.
 
-      if (delsq_horiz_mixing) then
+         if (delsq_horiz_mixing) then
 
-        if ((h_mom_eddy_visc2 &gt; 0.0) .and. (config_horiz_mixing == &quot;2d_fixed&quot;)) then
+            if ((h_mom_eddy_visc2 &gt; 0.0) .and. (config_horiz_mixing == &quot;2d_fixed&quot;)) then
 
-          do iEdge=1,grid % nEdges
-            cell1 = grid % cellsOnEdge % array(1,iEdge)
-            cell2 = grid % cellsOnEdge % array(2,iEdge)
-            if (cell1 &lt;= nCellsSolve .or. cell2 &lt;= nCellsSolve) then
+               do iEdge=1,nEdges
+                  cell1 = cellsOnEdge(1,iEdge)
+                  cell2 = cellsOnEdge(2,iEdge)
+                  if (cell1 &lt;= nCellsSolve .or. cell2 &lt;= nCellsSolve) then
 
-               do k=2,grid % nVertLevels
-                  theta_turb_flux = h_mom_eddy_visc2*(w(k,cell2) - w(k,cell1))/dcEdge(iEdge)
-                  theta_turb_flux = theta_turb_flux * meshScalingDel2(iEdge)
-                  flux = 0.5*dvEdge (iEdge) * (rho_edge(k,iEdge)+rho_edge(k-1,iEdge)) * theta_turb_flux
-!                  tend_w(k,cell1) = tend_w(k,cell1) + flux
-!                  tend_w(k,cell2) = tend_w(k,cell2) - flux
-                  tend_w_euler(k,cell1) = tend_w_euler(k,cell1) + flux/areaCell(cell1)
-                  tend_w_euler(k,cell2) = tend_w_euler(k,cell2) - flux/areaCell(cell2)
+                     ! horizontal flux divergence of the gradient (i.e. del^2)
+                     ! note, for w, even though we use theta_* local scratch variables
+                     do k=2,nVertLevels
+                        theta_turb_flux = h_mom_eddy_visc2*(w(k,cell2) - w(k,cell1))/dcEdge(iEdge)
+                        theta_turb_flux = theta_turb_flux * meshScalingDel2(iEdge)
+                        flux = 0.5*dvEdge (iEdge) * (rho_edge(k,iEdge)+rho_edge(k-1,iEdge)) * theta_turb_flux
+                        tend_w_euler(k,cell1) = tend_w_euler(k,cell1) + flux/areaCell(cell1)
+                        tend_w_euler(k,cell2) = tend_w_euler(k,cell2) - flux/areaCell(cell2)
+                     end do
+
+                  end if
                end do
 
-             end if
-           end do
-        else if (config_horiz_mixing == &quot;2d_smagorinsky&quot;) then
+            else if (config_horiz_mixing == &quot;2d_smagorinsky&quot;) then
 
-          do iEdge=1,grid % nEdges
-            cell1 = grid % cellsOnEdge % array(1,iEdge)
-            cell2 = grid % cellsOnEdge % array(2,iEdge)
-            if (cell1 &lt;= nCellsSolve .or. cell2 &lt;= nCellsSolve) then
+               do iEdge=1,nEdges
+                  cell1 = cellsOnEdge(1,iEdge)
+                  cell2 = cellsOnEdge(2,iEdge)
+                  if (cell1 &lt;= nCellsSolve .or. cell2 &lt;= nCellsSolve) then
+   
+                     do k=2,nVertLevels
+                        theta_turb_flux = 0.25*(kdiff(k,cell1)+kdiff(k,cell2)+kdiff(k-1,cell1)+kdiff(k,cell2))  &amp;
+                                              *(w(k,cell2) - w(k,cell1))/dcEdge(iEdge)
+                        theta_turb_flux = theta_turb_flux * meshScalingDel2(iEdge)
+                        flux = 0.5*dvEdge (iEdge) * (rho_edge(k,iEdge)+rho_edge(k-1,iEdge)) * theta_turb_flux
+                        tend_w_euler(k,cell1) = tend_w_euler(k,cell1) + flux/areaCell(cell1)
+                        tend_w_euler(k,cell2) = tend_w_euler(k,cell2) - flux/areaCell(cell2)
+                     end do
 
-               do k=2,grid % nVertLevels
-!                  theta_turb_flux = h_mom_eddy_visc2*(w(k,cell2) - w(k,cell1))/dcEdge(iEdge)
-                  theta_turb_flux = 0.25*(kdiff(k,cell1)+kdiff(k,cell2)+kdiff(k-1,cell1)+kdiff(k,cell2))  &amp;
-                                        *(w(k,cell2) - w(k,cell1))/dcEdge(iEdge)
-                  theta_turb_flux = theta_turb_flux * meshScalingDel2(iEdge)
-                  flux = 0.5*dvEdge (iEdge) * (rho_edge(k,iEdge)+rho_edge(k-1,iEdge)) * theta_turb_flux
-!                  tend_w(k,cell1) = tend_w(k,cell1) + flux
-!                  tend_w(k,cell2) = tend_w(k,cell2) - flux
-                  tend_w_euler(k,cell1) = tend_w_euler(k,cell1) + flux/areaCell(cell1)
-                  tend_w_euler(k,cell2) = tend_w_euler(k,cell2) - flux/areaCell(cell2)
+                  end if
                end do
+            end if
+         end if   ! delsq_horiz_mixing
 
-             end if
-           end do
-        end if

-      end if
+         if ((h_mom_eddy_visc4 &gt; 0.0 .and. config_horiz_mixing == &quot;2d_fixed&quot;) .or. &amp;
+             (h_mom_eddy_visc4 &gt; 0.0 .and. config_horiz_mixing == &quot;2d_smagorinsky&quot;)) then
 
-!ldf (2010-10-10):
-!     if ( (h_mom_eddy_visc4 &gt; 0.0) .and. (config_horiz_mixing == &quot;2d_fixed&quot;)) then
-      if ((h_mom_eddy_visc4 &gt; 0.0 .and. config_horiz_mixing == &quot;2d_fixed&quot;) .or. &amp;
-          (h_mom_eddy_visc4 &gt; 0.0 .and. config_horiz_mixing == &quot;2d_smagorinsky&quot;)) then
 
-         allocate(delsq_theta(nVertLevels, nCells+1))
+            ! del^4 horizontal filter.  We compute this as del^2 ( del^2 (u) ).
+            !
+            ! First, storage to hold the result from the first del^2 computation.
+            !  we copied code from the theta mixing, hence the theta* names.
 
-         delsq_theta(:,:) = 0.
+            allocate(delsq_theta(nVertLevels, nCells+1))
 
-         do iEdge=1,grid % nEdges
-            cell1 = grid % cellsOnEdge % array(1,iEdge)
-            cell2 = grid % cellsOnEdge % array(2,iEdge)
-            do k=2,grid % nVertLevels
-               delsq_theta(k,cell1) = delsq_theta(k,cell1) + dvEdge(iEdge)*0.5*(rho_edge(k,iEdge)+rho_edge(k-1,iEdge))*(w(k,cell2) - w(k,cell1))/dcEdge(iEdge)
-               delsq_theta(k,cell2) = delsq_theta(k,cell2) - dvEdge(iEdge)*0.5*(rho_edge(k,iEdge)+rho_edge(k-1,iEdge))*(w(k,cell2) - w(k,cell1))/dcEdge(iEdge)
+            delsq_theta(:,:) = 0.
+
+            do iEdge=1,nEdges
+               cell1 = cellsOnEdge(1,iEdge)
+               cell2 = cellsOnEdge(2,iEdge)
+               do k=2,nVertLevels
+                  delsq_theta(k,cell1) = delsq_theta(k,cell1) + dvEdge(iEdge)*0.5*(rho_edge(k,iEdge)+rho_edge(k-1,iEdge))*(w(k,cell2) - w(k,cell1))/dcEdge(iEdge)
+                  delsq_theta(k,cell2) = delsq_theta(k,cell2) - dvEdge(iEdge)*0.5*(rho_edge(k,iEdge)+rho_edge(k-1,iEdge))*(w(k,cell2) - w(k,cell1))/dcEdge(iEdge)
+               end do
             end do
-         end do
 
-         do iCell = 1, nCells
-            r = 1.0 / areaCell(iCell)
-            do k=2,nVertLevels
-               delsq_theta(k,iCell) = delsq_theta(k,iCell) * r
+            do iCell = 1, nCells
+               r = 1.0 / areaCell(iCell)
+               do k=2,nVertLevels
+                  delsq_theta(k,iCell) = delsq_theta(k,iCell) * r
+               end do
             end do
-         end do
 
-         do iEdge=1,grid % nEdges
-            cell1 = grid % cellsOnEdge % array(1,iEdge)
-            cell2 = grid % cellsOnEdge % array(2,iEdge)
-            if (cell1 &lt;= nCellsSolve .or. cell2 &lt;= nCellsSolve) then
+            do iEdge=1,nEdges
+               cell1 = cellsOnEdge(1,iEdge)
+               cell2 = cellsOnEdge(2,iEdge)
+               if (cell1 &lt;= nCellsSolve .or. cell2 &lt;= nCellsSolve) then
 
-               do k=2,grid % nVertLevels
-                  theta_turb_flux = h_mom_eddy_visc4*(delsq_theta(k,cell2) - delsq_theta(k,cell1))/dcEdge(iEdge)
-                  theta_turb_flux = theta_turb_flux * meshScalingDel4(iEdge)
-                  flux = dvEdge (iEdge) * theta_turb_flux
+                  do k=2,nVertLevels
+                     theta_turb_flux = h_mom_eddy_visc4*(delsq_theta(k,cell2) - delsq_theta(k,cell1))/dcEdge(iEdge)
+                     theta_turb_flux = theta_turb_flux * meshScalingDel4(iEdge)
+                     flux = dvEdge (iEdge) * theta_turb_flux
+                     tend_w_euler(k,cell1) = tend_w_euler(k,cell1) - flux/areaCell(cell1)
+                     tend_w_euler(k,cell2) = tend_w_euler(k,cell2) + flux/areaCell(cell2)
+                  end do
 
-!                  tend_w(k,cell1) = tend_w(k,cell1) - flux
-!                  tend_w(k,cell2) = tend_w(k,cell2) + flux
-                  tend_w_euler(k,cell1) = tend_w_euler(k,cell1) - flux/areaCell(cell1)
-                  tend_w_euler(k,cell2) = tend_w_euler(k,cell2) + flux/areaCell(cell2)
-               end do
+               end if
+            end do
 
-            end if
-         end do
+            deallocate(delsq_theta)
 
-         deallocate(delsq_theta)
+         end if
 
-      end if
-
       end if ! horizontal mixing for w computed in first rk_step
 
       !
       !  vertical advection, pressure gradient and buoyancy for w
-      !  Note: we are also dividing through by the cell area after the horizontal flux divergence
       !
 
       do iCell = 1, nCells
@@ -2687,10 +2671,11 @@
 
          wdwz(nVertLevels+1) = 0.
 
+      !  Note: next we are also dividing through by the cell area after the horizontal flux divergence
+
          do k=2,nVertLevels
 
             tend_w(k,iCell) = tend_w(k,iCell)/areaCell(iCell) -rdzu(k)*(wdwz(k+1)-wdwz(k))    &amp;
-!SHP-buoy
                                   - cqw(k,iCell)*( rdzu(k)*(pp(k,iCell)-pp(k-1,iCell))        &amp;
                                   + gravity*  &amp;
                                    ( fzm(k)*(rb(k,iCell)*(qtot(k,iCell)) +         &amp;
@@ -2706,24 +2691,24 @@
 
       if (rk_step == 1 .or. rk_diffusion) then
 
-      if ( v_mom_eddy_visc2 &gt; 0.0 ) then
+         if ( v_mom_eddy_visc2 &gt; 0.0 ) then
 
-         do iCell = 1, grid % nCellsSolve
+            do iCell = 1, nCellsSolve
             do k=2,nVertLevels
                tend_w_euler(k,iCell) = tend_w_euler(k,iCell) + v_mom_eddy_visc2*0.5*(rho_zz(k,iCell)+rho_zz(k-1,iCell))*(  &amp;
                                         (w(k+1,iCell)-w(k  ,iCell))*rdzw(k)                              &amp;
                                        -(w(k  ,iCell)-w(k-1,iCell))*rdzw(k-1) )*rdzu(k)
             end do
-         end do
+            end do
 
-      end if
+         end if
 
       end if ! mixing term computed first rk_step
 
 
 ! add in mixing terms for w
 
-      do iCell = 1, grid % nCellsSolve
+      do iCell = 1, nCellsSolve
          do k=2,nVertLevels
             tend_w(k,iCell) = tend_w(k,iCell) + tend_w_euler(k,iCell)
          end do
@@ -2745,7 +2730,7 @@
             cell1 = cellsOnEdge(1,iEdge)
             cell2 = cellsOnEdge(2,iEdge)
             if (cell1 &lt;= nCellsSolve .or. cell2 &lt;= nCellsSolve) then
-               do k=1,grid % nVertLevels
+               do k=1,nVertLevels
                   flux = dvEdge(iEdge) *  ru(k,iEdge) * ( 0.5*(theta_m(k,cell1) + theta_m(k,cell2)) )
                   tend_theta(k,cell1) = tend_theta(k,cell1) - flux
                   tend_theta(k,cell2) = tend_theta(k,cell2) + flux
@@ -2762,51 +2747,18 @@
 
                flux_arr(:) = 0.
                do i=1,nAdvCellsForEdge(iEdge)
-                 iCell = advCellsForEdge(i,iEdge)
-                 do k=1,grid % nVertLevels
-                   scalar_weight = adv_coefs(i,iEdge) + coef_3rd_order*sign(1.0_RKIND,ru(k,iEdge))*adv_coefs_3rd(i,iEdge)
-                   flux_arr(k) = flux_arr(k) + scalar_weight* theta_m(k,iCell)
-                 end do
+                  iCell = advCellsForEdge(i,iEdge)
+                  do k=1,nVertLevels
+                     scalar_weight = adv_coefs(i,iEdge) + coef_3rd_order*sign(1.0_RKIND,ru(k,iEdge))*adv_coefs_3rd(i,iEdge)
+                     flux_arr(k) = flux_arr(k) + scalar_weight* theta_m(k,iCell)
+                  end do
                end do
 
-               do k=1,grid % nVertLevels
+               do k=1,nVertLevels
                   tend_theta(k,cell1) = tend_theta(k,cell1) - ru(k,iEdge)*flux_arr(k)
                   tend_theta(k,cell2) = tend_theta(k,cell2) + ru(k,iEdge)*flux_arr(k)
                end do
 
-
-!               do k=1,grid % nVertLevels
-!
-!                  d2fdx2_cell1 = deriv_two(1,1,iEdge) * theta_m(k,cell1)
-!                  d2fdx2_cell2 = deriv_two(1,2,iEdge) * theta_m(k,cell2)
-!                  do i=1, grid % nEdgesOnCell % array (cell1)
-!                     if ( grid % CellsOnCell % array (i,cell1) &lt;= grid%nCells) &amp;
-!                     d2fdx2_cell1 = d2fdx2_cell1 + deriv_two(i+1,1,iEdge) * theta_m(k,grid % CellsOnCell % array (i,cell1))
-!                  end do
-!                  do i=1, grid % nEdgesOnCell % array (cell2)
-!                     if ( grid % CellsOnCell % array (i,cell2) &lt;= grid%nCells) &amp;
-!                     d2fdx2_cell2 = d2fdx2_cell2 + deriv_two(i+1,2,iEdge) * theta_m(k,grid % CellsOnCell % array (i,cell2))
-!                  end do
-!
-!  3rd order stencil
-!
-!                  if( u(k,iEdge) &gt; 0) then
-!                        flux = dvEdge(iEdge) * ru(k,iEdge) * (          &amp;
-!                                               0.5*(theta_m(k,cell1) + theta_m(k,cell2))      &amp;
-!                                                -(dcEdge(iEdge) **2) * (d2fdx2_cell1 + d2fdx2_cell2) / 12.          &amp;
-!                                                -(dcEdge(iEdge) **2) * coef_3rd_order*(d2fdx2_cell1 - d2fdx2_cell2) / 12. )
-!                     else
-!                        flux = dvEdge(iEdge) *  ru(k,iEdge) * (          &amp;
-!                                               0.5*(theta_m(k,cell1) + theta_m(k,cell2))      &amp;
-!                                                -(dcEdge(iEdge) **2) * (d2fdx2_cell1 + d2fdx2_cell2) / 12.          &amp;
-!                                                +(dcEdge(iEdge) **2) * coef_3rd_order*(d2fdx2_cell1 - d2fdx2_cell2) / 12. )
-!                  end if
-!
-!                  tend_theta(k,cell1) = tend_theta(k,cell1) - flux
-!                  tend_theta(k,cell2) = tend_theta(k,cell2) + flux
-!
-!               end do
-
             end if
          end do
 
@@ -2815,19 +2767,19 @@
          do iEdge=1,nEdges
             cell1 = cellsOnEdge(1,iEdge)
             cell2 = cellsOnEdge(2,iEdge)
-            if (cell1 &lt;= grid%nCells .and. cell2 &lt;= grid%nCells) then
+            if (cell1 &lt;= nCells .and. cell2 &lt;= nCells) then
 
-               do k=1,grid % nVertLevels
+               do k=1,nVertLevels
 
                   d2fdx2_cell1 = deriv_two(1,1,iEdge) * theta_m(k,cell1)
                   d2fdx2_cell2 = deriv_two(1,2,iEdge) * theta_m(k,cell2)
-                  do i=1, grid % nEdgesOnCell % array (cell1)
+                  do i=1, nEdgesOnCell(cell1)
                      if ( grid % CellsOnCell % array (i,cell1) &lt;= grid%nCells) &amp;
-                     d2fdx2_cell1 = d2fdx2_cell1 + deriv_two(i+1,1,iEdge) * theta_m(k,grid % CellsOnCell % array (i,cell1))
+                        d2fdx2_cell1 = d2fdx2_cell1 + deriv_two(i+1,1,iEdge) * theta_m(k,grid % CellsOnCell % array (i,cell1))
                   end do
-                  do i=1, grid % nEdgesOnCell % array (cell2)
+                  do i=1, nEdgesOnCell(cell2)
                      if ( grid % CellsOnCell % array (i,cell2) &lt;= grid%nCells) &amp;
-                     d2fdx2_cell2 = d2fdx2_cell2 + deriv_two(i+1,2,iEdge) * theta_m(k,grid % CellsOnCell % array (i,cell2))
+                        d2fdx2_cell2 = d2fdx2_cell2 + deriv_two(i+1,2,iEdge) * theta_m(k,grid % CellsOnCell % array (i,cell2))
                   end do
 
                   flux = dvEdge(iEdge) *  ru(k,iEdge) * (                                               &amp;
@@ -2855,17 +2807,15 @@
       if (delsq_horiz_mixing) then
          if ( (h_theta_eddy_visc2 &gt; 0.0) .and. (config_horiz_mixing == &quot;2d_fixed&quot;) ) then
 
-            do iEdge=1,grid % nEdges
-               cell1 = grid % cellsOnEdge % array(1,iEdge)
-               cell2 = grid % cellsOnEdge % array(2,iEdge)
+            do iEdge=1,nEdges
+               cell1 = cellsOnEdge(1,iEdge)
+               cell2 = cellsOnEdge(2,iEdge)
                if (cell1 &lt;= nCellsSolve .or. cell2 &lt;= nCellsSolve) then
   
-                  do k=1,grid % nVertLevels
+                  do k=1,nVertLevels
                      theta_turb_flux = h_theta_eddy_visc2*prandtl_inv*(theta_m(k,cell2) - theta_m(k,cell1))/dcEdge(iEdge)
                      theta_turb_flux = theta_turb_flux * meshScalingDel2(iEdge)
                      flux = dvEdge (iEdge) * rho_edge(k,iEdge) * theta_turb_flux
-!                     tend_theta(k,cell1) = tend_theta(k,cell1) + flux
-!                     tend_theta(k,cell2) = tend_theta(k,cell2) - flux
                      tend_theta_euler(k,cell1) = tend_theta_euler(k,cell1) + flux/areaCell(cell1)
                      tend_theta_euler(k,cell2) = tend_theta_euler(k,cell2) - flux/areaCell(cell2)
                   end do
@@ -2875,18 +2825,16 @@
 
          else if (  ( config_horiz_mixing == &quot;2d_smagorinsky&quot;) ) then
 
-            do iEdge=1,grid % nEdges
-               cell1 = grid % cellsOnEdge % array(1,iEdge)
-               cell2 = grid % cellsOnEdge % array(2,iEdge)
+            do iEdge=1,nEdges
+               cell1 = cellsOnEdge(1,iEdge)
+               cell2 = cellsOnEdge(2,iEdge)
                if (cell1 &lt;= nCellsSolve .or. cell2 &lt;= nCellsSolve) then
  
-                  do k=1,grid % nVertLevels
+                  do k=1,nVertLevels
                      theta_turb_flux = 0.5*(kdiff(k,cell1)+kdiff(k,cell2))*prandtl_inv  &amp;
                                            *(theta_m(k,cell2) - theta_m(k,cell1))/dcEdge(iEdge)
                      theta_turb_flux = theta_turb_flux * meshScalingDel2(iEdge)
                      flux = dvEdge (iEdge) * rho_edge(k,iEdge) * theta_turb_flux
-!                     tend_theta(k,cell1) = tend_theta(k,cell1) + flux
-!                     tend_theta(k,cell2) = tend_theta(k,cell2) - flux
                      tend_theta_euler(k,cell1) = tend_theta_euler(k,cell1) + flux/areaCell(cell1)
                      tend_theta_euler(k,cell2) = tend_theta_euler(k,cell2) - flux/areaCell(cell2)
                   end do
@@ -2897,8 +2845,6 @@
 
       end if
 
-!ldf (2010-10-10):
-!     if ( (h_theta_eddy_visc4 &gt; 0.0) .and. (config_horiz_mixing == &quot;2d_fixed&quot;) ) then
       if ((h_theta_eddy_visc4 &gt; 0.0 .and. config_horiz_mixing == &quot;2d_fixed&quot;) .or. &amp;
           (h_theta_eddy_visc4 &gt; 0.0 .and. config_horiz_mixing == &quot;2d_smagorinsky&quot;)) then
 
@@ -2906,10 +2852,10 @@
 
          delsq_theta(:,:) = 0.
 
-         do iEdge=1,grid % nEdges
-            cell1 = grid % cellsOnEdge % array(1,iEdge)
-            cell2 = grid % cellsOnEdge % array(2,iEdge)
-            do k=1,grid % nVertLevels
+         do iEdge=1,nEdges
+            cell1 = cellsOnEdge(1,iEdge)
+            cell2 = cellsOnEdge(2,iEdge)
+            do k=1,nVertLevels
                delsq_theta(k,cell1) = delsq_theta(k,cell1) + dvEdge(iEdge)*rho_edge(k,iEdge)*(theta_m(k,cell2) - theta_m(k,cell1))/dcEdge(iEdge)
                delsq_theta(k,cell2) = delsq_theta(k,cell2) - dvEdge(iEdge)*rho_edge(k,iEdge)*(theta_m(k,cell2) - theta_m(k,cell1))/dcEdge(iEdge)
             end do
@@ -2922,18 +2868,15 @@
             end do
          end do
 
-         do iEdge=1,grid % nEdges
-            cell1 = grid % cellsOnEdge % array(1,iEdge)
-            cell2 = grid % cellsOnEdge % array(2,iEdge)
+         do iEdge=1,nEdges
+            cell1 = cellsOnEdge(1,iEdge)
+            cell2 = cellsOnEdge(2,iEdge)
             if (cell1 &lt;= nCellsSolve .or. cell2 &lt;= nCellsSolve) then
 
-               do k=1,grid % nVertLevels
+               do k=1,nVertLevels
                   theta_turb_flux = h_theta_eddy_visc4*prandtl_inv*(delsq_theta(k,cell2) - delsq_theta(k,cell1))/dcEdge(iEdge)
                   theta_turb_flux = theta_turb_flux * meshScalingDel4(iEdge)
                   flux = dvEdge (iEdge) * theta_turb_flux
-
-!                  tend_theta(k,cell1) = tend_theta(k,cell1) - flux
-!                  tend_theta(k,cell2) = tend_theta(k,cell2) + flux
                   tend_theta_euler(k,cell1) = tend_theta_euler(k,cell1) - flux/areaCell(cell1)
                   tend_theta_euler(k,cell2) = tend_theta_euler(k,cell2) + flux/areaCell(cell2)
                end do
@@ -3001,47 +2944,41 @@
 
          if (config_mix_full) then
 
-         do iCell = 1, grid % nCellsSolve
-            do k=2,nVertLevels-1
-               z1 = zgrid(k-1,iCell)
-               z2 = zgrid(k  ,iCell)
-               z3 = zgrid(k+1,iCell)
-               z4 = zgrid(k+2,iCell)
+            do iCell = 1, nCellsSolve
+               do k=2,nVertLevels-1
+                  z1 = zgrid(k-1,iCell)
+                  z2 = zgrid(k  ,iCell)
+                  z3 = zgrid(k+1,iCell)
+                  z4 = zgrid(k+2,iCell)
 
-               zm = 0.5*(z1+z2)
-               z0 = 0.5*(z2+z3)
-               zp = 0.5*(z3+z4)
+                  zm = 0.5*(z1+z2)
+                  z0 = 0.5*(z2+z3)
+                  zp = 0.5*(z3+z4)
 
-!               tend_theta(k,iCell) = tend_theta(k,iCell) + v_theta_eddy_visc2*prandtl_inv*rho_zz(k,iCell)*(&amp;
-!                                        (theta_m(k+1,iCell)-theta_m(k  ,iCell))/(zp-z0)                 &amp;
-!                                       -(theta_m(k  ,iCell)-theta_m(k-1,iCell))/(z0-zm) )/(0.5*(zp-zm))
-               tend_theta_euler(k,iCell) = tend_theta_euler(k,iCell) + v_theta_eddy_visc2*prandtl_inv*rho_zz(k,iCell)*(&amp;
-                                        (theta_m(k+1,iCell)-theta_m(k  ,iCell))/(zp-z0)                 &amp;
-                                       -(theta_m(k  ,iCell)-theta_m(k-1,iCell))/(z0-zm) )/(0.5*(zp-zm))
+                  tend_theta_euler(k,iCell) = tend_theta_euler(k,iCell) + v_theta_eddy_visc2*prandtl_inv*rho_zz(k,iCell)*(&amp;
+                                           (theta_m(k+1,iCell)-theta_m(k  ,iCell))/(zp-z0)                 &amp;
+                                          -(theta_m(k  ,iCell)-theta_m(k-1,iCell))/(z0-zm) )/(0.5*(zp-zm))
+               end do
             end do
-         end do
 
          else  ! idealized cases where we mix on the perturbation from the initial 1-D state
 
-         do iCell = 1, grid % nCellsSolve
-            do k=2,nVertLevels-1
-               z1 = zgrid(k-1,iCell)
-               z2 = zgrid(k  ,iCell)
-               z3 = zgrid(k+1,iCell)
-               z4 = zgrid(k+2,iCell)
+            do iCell = 1, nCellsSolve
+               do k=2,nVertLevels-1
+                  z1 = zgrid(k-1,iCell)
+                  z2 = zgrid(k  ,iCell)
+                  z3 = zgrid(k+1,iCell)
+                  z4 = zgrid(k+2,iCell)
 
-               zm = 0.5*(z1+z2)
-               z0 = 0.5*(z2+z3)
-               zp = 0.5*(z3+z4)
+                  zm = 0.5*(z1+z2)
+                  z0 = 0.5*(z2+z3)
+                  zp = 0.5*(z3+z4)
 
-!               tend_theta(k,iCell) = tend_theta(k,iCell) + v_theta_eddy_visc2*prandtl_inv*rho_zz(k,iCell)*(&amp;
-!                                        ((theta_m(k+1,iCell)-t_init(k+1,iCell))-(theta_m(k  ,iCell)-t_init(k,iCell)))/(zp-z0)      &amp;
-!                                       -((theta_m(k  ,iCell)-t_init(k,iCell))-(theta_m(k-1,iCell)-t_init(k-1,iCell)))/(z0-zm) )/(0.5*(zp-zm))
-               tend_theta_euler(k,iCell) = tend_theta_euler(k,iCell) + v_theta_eddy_visc2*prandtl_inv*rho_zz(k,iCell)*(&amp;
-                                        ((theta_m(k+1,iCell)-t_init(k+1,iCell))-(theta_m(k  ,iCell)-t_init(k,iCell)))/(zp-z0)      &amp;
-                                       -((theta_m(k  ,iCell)-t_init(k,iCell))-(theta_m(k-1,iCell)-t_init(k-1,iCell)))/(z0-zm) )/(0.5*(zp-zm))
+                  tend_theta_euler(k,iCell) = tend_theta_euler(k,iCell) + v_theta_eddy_visc2*prandtl_inv*rho_zz(k,iCell)*(&amp;
+                                           ((theta_m(k+1,iCell)-t_init(k+1,iCell))-(theta_m(k  ,iCell)-t_init(k,iCell)))/(zp-z0)      &amp;
+                                          -((theta_m(k  ,iCell)-t_init(k,iCell))-(theta_m(k-1,iCell)-t_init(k-1,iCell)))/(z0-zm) )/(0.5*(zp-zm))
+               end do
             end do
-         end do
 
          end if
 
@@ -3049,7 +2986,7 @@
 
       end if ! compute theta mixing on first rk_step
 
-      do iCell = 1, grid % nCellsSolve
+      do iCell = 1, nCellsSolve
          do k=1,nVertLevels
             tend_theta(k,iCell) = tend_theta(k,iCell) + tend_theta_euler(k,iCell)
          end do
@@ -3063,9 +3000,9 @@
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
    ! Compute diagnostic fields used in the tendency computations
    !
-   ! Input: grid - grid metadata
+   ! Input: state (s), grid - grid metadata
    !
-   ! Output: s - computed diagnostics
+   ! Output: diag - computed diagnostics
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
 
       implicit none
@@ -3079,7 +3016,7 @@
       integer :: iEdge, iCell, iVertex, k, cell1, cell2, eoe, i
       real (kind=RKIND) :: h_vertex, r
 
-      integer :: nCells, nEdges, nVertices, nVertLevels
+      integer :: nCells, nEdges, nVertices, nVertLevels, vertexDegree
       real (kind=RKIND), dimension(:), pointer :: fVertex, fEdge, dvEdge, dcEdge, areaCell, areaTriangle
       real (kind=RKIND), dimension(:,:), pointer :: vh, weightsOnEdge, kiteAreasOnVertex, h_edge, h, u, v, &amp;
                                                     circulation, vorticity, ke, pv_edge, pv_vertex, pv_cell, gradPVn, gradPVt, &amp;
@@ -3087,7 +3024,6 @@
       integer, dimension(:,:), pointer :: cellsOnEdge, cellsOnVertex, verticesOnEdge, edgesOnCell, edgesOnEdge, edgesOnVertex
       integer, dimension(:), pointer :: nEdgesOnCell, nEdgesOnEdge
 
-      !WCS-instability
       logical, parameter :: hollingsworth=.true.
       real (kind=RKIND), allocatable, dimension(:,:) :: ke_vertex
       real (kind=RKIND)  :: ke_fact
@@ -3125,10 +3061,11 @@
       fVertex           =&gt; grid % fVertex % array
       fEdge             =&gt; grid % fEdge % array
                   
-      nCells      = grid % nCells
-      nEdges      = grid % nEdges
-      nVertices   = grid % nVertices
-      nVertLevels = grid % nVertLevels
+      nCells       = grid % nCells
+      nEdges       = grid % nEdges
+      nVertices    = grid % nVertices
+      nVertLevels  = grid % nVertLevels
+      vertexDegree = grid % vertexDegree
 
       !
       ! Compute height on cell edges at velocity locations
@@ -3180,7 +3117,7 @@
 
 
       !
-      ! Compute kinetic energy in each cell
+      ! Compute kinetic energy in each cell (Ringler et al JCP 2009)
       !
       ke(:,:) = 0.0
       do iCell=1,nCells
@@ -3195,52 +3132,49 @@
          end do
       end do
 
-      !WCS-instability
-      ! Compute ke at cell vertices - AG's new KE construction, part 1
-      ! *** approximation here because we don't have inner triangle areas
-      !
       if (hollingsworth) then
-      allocate (ke_vertex(nVertLevels,nVertices))
-      do iVertex=1,nVertices
-         do k=1,nVertLevels
-!            ke_vertex(k,iVertex) = (  subTriangleAreasOnVertex(1,iVertex)*u(k,EdgesOnVertex(1,iVertex))**2.0  &amp;
-!                                    + subTriangleAreasOnVertex(2,iVertex)*u(k,EdgesOnVertex(2,iVertex))**2.0  &amp;
-!                                    + subTriangleAreasOnVertex(3,iVertex)*u(k,EdgesOnVertex(3,iVertex))**2.0  &amp;
-!                                                 ) / AreaTriangle(iVertex)
 
-            ke_vertex(k,iVertex) = (  dcEdge(EdgesOnVertex(1,iVertex))*dvEdge(EdgesOnVertex(1,iVertex))*u(k,EdgesOnVertex(1,iVertex))**2.0  &amp;
-                                     +dcEdge(EdgesOnVertex(2,iVertex))*dvEdge(EdgesOnVertex(2,iVertex))*u(k,EdgesOnVertex(2,iVertex))**2.0  &amp;
-                                     +dcEdge(EdgesOnVertex(3,iVertex))*dvEdge(EdgesOnVertex(3,iVertex))*u(k,EdgesOnVertex(3,iVertex))**2.0  &amp;
-                                                 ) * 0.25 / AreaTriangle(iVertex)
+         ! Compute ke at cell vertices - AG's new KE construction, part 1
+         ! *** approximation here because we don't have inner triangle areas
+         !
 
+         allocate (ke_vertex(nVertLevels,nVertices))
+         do iVertex=1,nVertices
+            do k=1,nVertLevels
+
+               ke_vertex(k,iVertex) = (  dcEdge(EdgesOnVertex(1,iVertex))*dvEdge(EdgesOnVertex(1,iVertex))*u(k,EdgesOnVertex(1,iVertex))**2.0  &amp;
+                                        +dcEdge(EdgesOnVertex(2,iVertex))*dvEdge(EdgesOnVertex(2,iVertex))*u(k,EdgesOnVertex(2,iVertex))**2.0  &amp;
+                                        +dcEdge(EdgesOnVertex(3,iVertex))*dvEdge(EdgesOnVertex(3,iVertex))*u(k,EdgesOnVertex(3,iVertex))**2.0  &amp;
+                                                    ) * 0.25 / AreaTriangle(iVertex)
+
+            end do
          end do
-      end do
 
-      ! adjust ke at cell vertices - AG's new KE construction, part 2
-      !
+         ! adjust ke at cell vertices - AG's new KE construction, part 2
+         !
 
-      ke_fact = 1.0 - .375
+         ke_fact = 1.0 - .375
 
-      do iCell=1,nCells
-         do k=1,nVertLevels
-            ke(k,iCell) = ke_fact*ke(k,iCell)
+         do iCell=1,nCells
+            do k=1,nVertLevels
+               ke(k,iCell) = ke_fact*ke(k,iCell)
+            end do
          end do
-      end do
 
-      do iVertex = 1, nVertices
-       do i=1,grid % vertexDegree
-          iCell = cellsOnVertex(i,iVertex)
-          do k = 1,nVertLevels
-             ke(k,iCell) = ke(k,iCell) + (1.-ke_fact)*kiteAreasOnVertex(i, iVertex) * ke_vertex(k, iVertex) / areaCell(iCell)
-          end do
-       end do
-      end do
-      deallocate (ke_vertex)
+         do iVertex = 1, nVertices
+            do i=1,vertexDegree
+               iCell = cellsOnVertex(i,iVertex)
+               do k = 1,nVertLevels
+                  ke(k,iCell) = ke(k,iCell) + (1.-ke_fact)*kiteAreasOnVertex(i, iVertex) * ke_vertex(k, iVertex) / areaCell(iCell)
+               end do
+            end do
+         end do
+         deallocate (ke_vertex)
+
       end if
-      !END of WCS-instability
 
       !
-      ! Compute v (tangential) velocities
+      ! Compute v (tangential) velocities following Thuburn et al JCP 2009
       !
       v(:,:) = 0.0
       do iEdge = 1,nEdges
@@ -3260,7 +3194,7 @@
       do iVertex = 1,nVertices
          do k=1,nVertLevels
             h_vertex = 0.0
-            do i=1,grid % vertexDegree
+            do i=1,vertexDegree
                h_vertex = h_vertex + h(k,cellsOnVertex(i,iVertex)) * kiteAreasOnVertex(i,iVertex)
             end do
             h_vertex = h_vertex / areaTriangle(iVertex)
@@ -3276,7 +3210,7 @@
       !
       pv_edge(:,:) = 0.0
       do iVertex = 1,nVertices
-         do i=1,grid % vertexDegree
+         do i=1,vertexDegree
             iEdge = edgesOnVertex(i,iVertex)
             do k=1,nVertLevels
                pv_edge(k,iEdge) =  pv_edge(k,iEdge)  + 0.5 * pv_vertex(k,iVertex)
@@ -3290,7 +3224,7 @@
       !
       pv_cell(:,:) = 0.0
       do iVertex = 1, nVertices
-         do i=1,grid % vertexDegree
+         do i=1,vertexDegree
             iCell = cellsOnVertex(i,iVertex)
             do k = 1,nVertLevels
                pv_cell(k,iCell) = pv_cell(k,iCell) + kiteAreasOnVertex(i, iVertex) * pv_vertex(k, iVertex) / areaCell(iCell)
@@ -3301,44 +3235,44 @@
 
       if (config_apvm_upwinding &gt; 0.0) then
 
-      !
-      ! Modify PV edge with upstream bias. 
-      !
-      ! Compute gradient of PV in the tangent direction
-      !   ( this computes gradPVt at all edges bounding real cells )
-      !
-      do iEdge = 1,nEdges
-         do k = 1,nVertLevels
-            gradPVt(k,iEdge) = (pv_vertex(k,verticesOnEdge(2,iEdge)) - pv_vertex(k,verticesOnEdge(1,iEdge))) / &amp;
-                               dvEdge(iEdge)
+         !
+         ! Modify PV edge with upstream bias. 
+         !
+         ! Compute gradient of PV in the tangent direction
+         !   ( this computes gradPVt at all edges bounding real cells )
+         !
+         do iEdge = 1,nEdges
+            do k = 1,nVertLevels
+               gradPVt(k,iEdge) = (pv_vertex(k,verticesOnEdge(2,iEdge)) - pv_vertex(k,verticesOnEdge(1,iEdge))) / &amp;
+                                  dvEdge(iEdge)
+            end do
          end do
-      end do
 
-      !
-      ! Compute gradient of PV in normal direction
-      !   (tdr: 2009-10-02: this is not correct because the pv_cell in the halo is not correct)
-      !
-      gradPVn(:,:) = 0.0
-      do iEdge = 1,nEdges
-         do k = 1,nVertLevels
-            gradPVn(k,iEdge) = (pv_cell(k,cellsOnEdge(2,iEdge)) - pv_cell(k,cellsOnEdge(1,iEdge))) / &amp;
-                                 dcEdge(iEdge)
+         !
+         ! Compute gradient of PV in normal direction
+         !   (tdr: 2009-10-02: this is not correct because the pv_cell in the halo is not correct)
+         !
+         gradPVn(:,:) = 0.0
+         do iEdge = 1,nEdges
+            do k = 1,nVertLevels
+               gradPVn(k,iEdge) = (pv_cell(k,cellsOnEdge(2,iEdge)) - pv_cell(k,cellsOnEdge(1,iEdge))) / &amp;
+                                    dcEdge(iEdge)
+            end do
          end do
-      end do
 
-      do iEdge = 1,nEdges
-         do k = 1,nVertLevels
-            pv_edge(k,iEdge) = pv_edge(k,iEdge) - config_apvm_upwinding * v(k,iEdge) * dt * gradPVt(k,iEdge)
+         do iEdge = 1,nEdges
+            do k = 1,nVertLevels
+               pv_edge(k,iEdge) = pv_edge(k,iEdge) - config_apvm_upwinding * v(k,iEdge) * dt * gradPVt(k,iEdge)
+            end do
          end do
-      end do
 
-      ! Modify PV edge with upstream bias.
-      !
-      do iEdge = 1,nEdges
-         do k = 1,nVertLevels
-            pv_edge(k,iEdge) = pv_edge(k,iEdge) - config_apvm_upwinding * u(k,iEdge) *dt * gradPVn(k,iEdge)
+         ! Modify PV edge with upstream bias.
+         !
+         do iEdge = 1,nEdges
+            do k = 1,nVertLevels
+               pv_edge(k,iEdge) = pv_edge(k,iEdge) - config_apvm_upwinding * u(k,iEdge) *dt * gradPVn(k,iEdge)
+            end do
          end do
-      end do
 
       end if  ! apvm upwinding
 
@@ -3355,64 +3289,56 @@
       type (diag_type), intent(inout) :: diag
       type (mesh_type), intent(inout) :: grid
 
-      !SHP-w
       integer :: k,iCell,iEdge,iCell1,iCell2, cell1, cell2, coef_3rd_order
+      integer :: nCells, nEdges, nVertLevels
       real (kind=RKIND) :: p0, rcv, flux
+      integer, dimension(:,:), pointer :: cellsOnEdge
 
-      !SHP-w
+      nCells      = grid % nCells
+      nEdges      = grid % nEdges
+      nVertLevels = grid % nVertLevels
+
+      cellsOnEdge =&gt; grid % cellsOnEdge % array
+
       coef_3rd_order = config_coef_3rd_order
       if(config_theta_adv_order /=3) coef_3rd_order = 0
 
       rcv = rgas / (cp-rgas)
       p0 = 1.e5  ! this should come from somewhere else...
 
-      do iCell=1,grid%nCells
-         do k=1,grid%nVertLevels
+      do iCell=1,nCells
+         do k=1,nVertLevels
             state % theta_m % array(k,iCell) = diag % theta % array(k,iCell) * (1._RKIND + rvord * state % scalars % array(state % index_qv,k,iCell))
             state % rho_zz % array(k,iCell) = diag % rho % array(k,iCell) / grid % zz % array(k,iCell)
          end do
       end do
 
-      do iEdge = 1, grid % nEdges
-         iCell1 = grid % cellsOnEdge % array(1,iEdge)
-         iCell2 = grid % cellsOnEdge % array(2,iEdge)
-         do k=1,grid % nVertLevels
+      do iEdge = 1, nEdges
+         iCell1 = cellsOnEdge(1,iEdge)
+         iCell2 = cellsOnEdge(2,iEdge)
+         do k=1,nVertLevels
             diag % ru % array(k,iEdge) = 0.5 * state % u % array(k,iEdge) * (state % rho_zz % array(k,iCell1) + state % rho_zz % array(k,iCell2))
          end do
       end do
 
-!      ! Compute w from rho_zz and rw
-!      do iCell=1,grid%nCells
-!         diag % rw % array(1,iCell) = 0.
-!         diag % rw % array(grid%nVertLevels+1,iCell) = 0.
-!         do k=2,grid%nVertLevels
-!            diag % rw % array(k,iCell) = state % w % array(k,iCell)     &amp;
-!                          * (grid % fzp % array(k) * state % rho_zz % array(k-1,iCell) + grid % fzm % array(k) * state % rho_zz % array(k,iCell))
-!         end do
-!      end do
-
-
-! WCS bug fix 20110916
-
       ! Compute rw (i.e. rho_zz * omega) from rho_zz, w, and ru.
       ! We are reversing the procedure we use in subroutine atm_recover_large_step_variables.
       ! first, the piece that depends on w.
-      do iCell=1,grid%nCells
+      do iCell=1,nCells
          diag % rw % array(1,iCell) = 0.
          diag % rw % array(grid%nVertLevels+1,iCell) = 0.
-         do k=2,grid%nVertLevels
+         do k=2,nVertLevels
             diag % rw % array(k,iCell) = state % w % array(k,iCell)     &amp;
                           * (grid % fzp % array(k) * state % rho_zz % array(k-1,iCell) + grid % fzm % array(k) * state % rho_zz % array(k,iCell)) &amp;
                           * (grid % fzp % array(k) * grid % zz % array(k-1,iCell) + grid % fzm % array(k) * grid % zz % array(k,iCell))
          end do
       end do
   
-      !SHP-w
       ! next, the piece that depends on ru
-      do iEdge=1,grid%nEdges
-        cell1 = grid % CellsOnEdge % array(1,iEdge)
-        cell2 = grid % CellsOnEdge % array(2,iEdge)
-          do k = 2, grid % nVertLevels
+      do iEdge=1,nEdges
+         cell1 = cellsOnEdge(1,iEdge)
+         cell2 = cellsOnEdge(2,iEdge)
+         do k = 2, nVertLevels
             flux = (grid % fzm % array(k) * diag % ru % array(k,iEdge)+grid % fzp % array(k) * diag % ru % array(k-1,iEdge))
             diag % rw % array(k,cell2) = diag % rw % array(k,cell2)   &amp;
                           + (grid % zb % array(k,2,iEdge) + coef_3rd_order * sign(1.0_RKIND,flux) * grid % zb3 % array(k,2,iEdge))*flux   &amp;
@@ -3420,38 +3346,36 @@
             diag % rw % array(k,cell1) = diag % rw % array(k,cell1)   &amp;
                           - (grid % zb % array(k,1,iEdge) + coef_3rd_order * sign(1.0_RKIND,flux) * grid % zb3 % array(k,1,iEdge))*flux   &amp;
                           * (grid % fzp % array(k) * grid % zz % array(k-1,cell1) + grid % fzm % array(k) * grid % zz % array(k,cell1))
-          end do
+         end do
       end do
 
-!  end WCS bug fix
-
-      do iCell = 1, grid % nCells
-         do k=1,grid % nVertLevels
+      do iCell = 1, nCells
+         do k=1,nVertLevels
             diag % rho_p % array(k,iCell) = state % rho_zz % array(k,iCell) - diag % rho_base % array(k,iCell)
          end do
       end do
 
-      do iCell = 1, grid % nCells
-         do k=1,grid % nVertLevels
+      do iCell = 1, nCells
+         do k=1,nVertLevels
             diag % rtheta_base % array(k,iCell) = diag % theta_base % array(k,iCell) * diag % rho_base % array(k,iCell)
          end do
       end do
 
-      do iCell = 1, grid % nCells
-         do k=1,grid % nVertLevels
+      do iCell = 1, nCells
+         do k=1,nVertLevels
             diag % rtheta_p % array(k,iCell) = state % theta_m % array(k,iCell) * diag % rho_p % array(k,iCell)  &amp;
                                              + diag % rho_base % array(k,iCell)   * (state % theta_m % array(k,iCell) - diag % theta_base % array(k,iCell))
          end do
       end do
 
-      do iCell=1,grid % nCells
-         do k=1,grid % nVertLevels
+      do iCell=1,nCells
+         do k=1,nVertLevels
             diag % exner % array(k,iCell) = (grid % zz % array(k,iCell) * (rgas/p0) * (diag % rtheta_p % array(k,iCell) + diag % rtheta_base % array(k,iCell)))**rcv
          end do
       end do
 
-      do iCell=1,grid % nCells
-         do k=1,grid % nVertLevels
+      do iCell=1,nCells
+         do k=1,nVertLevels
             diag % pressure_p % array(k,iCell) = grid % zz % array(k,iCell) * rgas &amp;
                                                * (  diag % exner % array(k,iCell) * diag % rtheta_p % array(k,iCell) &amp;
                                                   + diag % rtheta_base % array(k,iCell) * (diag % exner % array(k,iCell) - diag % exner_base % array(k,iCell)) &amp;

</font>
</pre>