<p><b>dwj07@fsu.edu</b> 2011-09-02 09:06:40 -0600 (Fri, 02 Sep 2011)</p><p><br>

        Some small edits of the design doc, as well as a pdf version for easy reading.<br>

</p><hr noshade><pre><font color="gray">Added: branches/ocean_projects/performance/PerformanceBranchDesign.pdf

===================================================================

(Binary files differ)

Property changes on: branches/ocean_projects/performance/PerformanceBranchDesign.pdf

___________________________________________________________________

Added: svn:mime-type

   + application/octet-stream

Modified: branches/ocean_projects/performance/PerformanceBranchDesign.tex

===================================================================

--- branches/ocean_projects/performance/PerformanceBranchDesign.tex        2011-09-01 22:02:02 UTC (rev 973)

+++ branches/ocean_projects/performance/PerformanceBranchDesign.tex        2011-09-02 15:06:40 UTC (rev 974)

@@ -102,44 +102,46 @@

 \begin{verbatimtab}

 do iEdge=1,nEdges

-        cell1 = cellsOnEdge(1,iEdge)

-        cell2 = cellsOnEdge(2,iEdge)

+  cell1 = cellsOnEdge(1,iEdge)

+  cell2 = cellsOnEdge(2,iEdge)

-        do k=1,maxLevelEdgeTop(iEdge)

+  do k=1,maxLevelEdgeTop(iEdge)

-                d2fdx2_cell1 = 0.0

-                d2fdx2_cell2 = 0.0

+    d2fdx2_cell1 = 0.0

+    d2fdx2_cell2 = 0.0

-                do iTracer=1,num_tracers

+    do iTracer=1,num_tracers

-                        !-- if not a boundary cell

-                        if(boundaryCell(k,cell1).eq.0.and.boundaryCell(k,cell2).eq.0) then

+      !-- if not a boundary cell

+      if(boundaryCell(k,cell1).eq.0.and.boundaryCell(k,cell2).eq.0) then

-                                d2fdx2_cell1 = deriv_two(1,1,iEdge) * tracers(iTracer,k,cell1)

-                                d2fdx2_cell2 = deriv_two(1,2,iEdge) * tracers(iTracer,k,cell2)

+        d2fdx2_cell1 = deriv_two(1,1,iEdge) * tracers(iTracer,k,cell1)

+        d2fdx2_cell2 = deriv_two(1,2,iEdge) * tracers(iTracer,k,cell2)

-                                !-- all edges of cell 1

-                                do i=1, grid % nEdgesOnCell % array (cell1)

-                                        d2fdx2_cell1 = d2fdx2_cell1 + &amp;

-                                        deriv_two(i+1,1,iEdge) * tracers(iTracer,k,grid % CellsOnCell % array (i,cell1))

-                                end do

+        !-- all edges of cell 1

+        do i=1, grid % nEdgesOnCell % array (cell1)

+          d2fdx2_cell1 = d2fdx2_cell1 &amp;

+                      + deriv_two(i+1,1,iEdge) &amp;

+                      * tracers(iTracer,k,grid % CellsOnCell % array (i,cell1))

+        end do

-                                !-- all edges of cell 2

-                                do i=1, grid % nEdgesOnCell % array (cell2)

-                                        d2fdx2_cell2 = d2fdx2_cell2 + &amp;

-                                        deriv_two(i+1,2,iEdge) * tracers(iTracer,k,grid % CellsOnCell % array (i,cell2))

-                                end do

-                        endif

+        !-- all edges of cell 2

+        do i=1, grid % nEdgesOnCell % array (cell2)

+          d2fdx2_cell2 = d2fdx2_cell2 &amp; 

+                      + deriv_two(i+1,2,iEdge) &amp;

+                      * tracers(iTracer,k,grid % CellsOnCell % array (i,cell2))

+        end do

+      endif

-                        flux = dvEdge(iEdge) *  u(k,iEdge) * h_edge(k,iEdge) * (          &amp;

-                           0.5*(tracers(iTracer,k,cell1) + tracers(iTracer,k,cell2))      &amp;

-                           -(dcEdge(iEdge) **2) * (d2fdx2_cell1 + d2fdx2_cell2) / 12. )

+      flux = dvEdge(iEdge) *  u(k,iEdge) * h_edge(k,iEdge) * (          &amp;

+         0.5*(tracers(iTracer,k,cell1) + tracers(iTracer,k,cell2))      &amp;

+         -(dcEdge(iEdge) **2) * (d2fdx2_cell1 + d2fdx2_cell2) / 12. )

-                        !-- update tendency

-                        tend_tr(iTracer,k,cell1) = tend_tr(iTracer,k,cell1) - flux/areaCell(cell1)

-                        tend_tr(iTracer,k,cell2) = tend_tr(iTracer,k,cell2) + flux/areaCell(cell2)

-                enddo

-        end do

+      !-- update tendency

+      tend_tr(iTracer,k,cell1) = tend_tr(iTracer,k,cell1) - flux/areaCell(cell1)

+      tend_tr(iTracer,k,cell2) = tend_tr(iTracer,k,cell2) + flux/areaCell(cell2)

+    enddo

+  end do

 end do

 \end{verbatimtab}

@@ -147,50 +149,54 @@

 \begin{verbatimtab}

 do iEdge=1,nEdges

-        cell1 = cellsOnEdge(1,iEdge)

-        cell2 = cellsOnEdge(2,iEdge)

+  cell1 = cellsOnEdge(1,iEdge)

+  cell2 = cellsOnEdge(2,iEdge)

-        do k=1,maxLevelEdgeTop(iEdge)

+  do k=1,maxLevelEdgeTop(iEdge)

-                d2fdx2_cell1 = 0.0

-                d2fdx2_cell2 = 0.0

-                boundaryMask1 = (.not. boundaryCell(k,cell1) ) * 1.d0

-                boundaryMask2 = (.not. boundaryCell(k,cell2) ) * 1.d0

+    d2fdx2_cell1 = 0.0

+    d2fdx2_cell2 = 0.0

+    boundaryMask1 = (.not. boundaryCell(k,cell1) ) * 1.d0

+    boundaryMask2 = (.not. boundaryCell(k,cell2) ) * 1.d0

-                do iTracer=1,num_tracers

-                        d2fdx2_cell1 = deriv_two(1,1,iEdge) * tracers(iTracer,k,cell1) * boundaryMask1

-                        d2fdx2_cell2 = deriv_two(1,2,iEdge) * tracers(iTracer,k,cell2) * boundaryMask2

+    do iTracer=1,num_tracers

+      d2fdx2_cell1 = deriv_two(1,1,iEdge) &amp;

+                  * tracers(iTracer,k,cell1)      &amp;

+                  * boundaryMask1

+      d2fdx2_cell2 = deriv_two(1,2,iEdge) &amp;

+                  * tracers(iTracer,k,cell2)      &amp;

+                  * boundaryMask2

-                        !-- all edges of cell 1

-                        do i=1, grid % nEdgesOnCell % array (cell1)

-                                d2fdx2_cell1 = d2fdx2_cell1     &amp;

-                                       + deriv_two(i+1,1,iEdge) &amp;

-                                           * tracers(iTracer,k,grid % CellsOnCell % array (i,cell1)) &amp;

-                                           * boundaryMask1

-                        end do

+      !-- all edges of cell 1

+      do i=1, grid % nEdgesOnCell % array (cell1)

+        d2fdx2_cell1 = d2fdx2_cell1     &amp;

+               + deriv_two(i+1,1,iEdge) &amp;

+             * tracers(iTracer,k,grid % CellsOnCell % array (i,cell1)) &amp;

+             * boundaryMask1

+      end do

-                        !-- all edges of cell 2

-                        do i=1, grid % nEdgesOnCell % array (cell2)

-                                d2fdx2_cell2 = d2fdx2_cell2     &amp;

-                                       + deriv_two(i+1,2,iEdge) &amp;

-                                           * tracers(iTracer,k,grid % CellsOnCell % array (i,cell2)) &amp;

-                                           * boundaryMask2

-                        end do

+      !-- all edges of cell 2

+      do i=1, grid % nEdgesOnCell % array (cell2)

+        d2fdx2_cell2 = d2fdx2_cell2     &amp;

+               + deriv_two(i+1,2,iEdge) &amp;

+             * tracers(iTracer,k,grid % CellsOnCell % array (i,cell2)) &amp;

+             * boundaryMask2

+      end do

-                        flux = dvEdge(iEdge) *  u(k,iEdge) * h_edge(k,iEdge) * (          &amp;

-                           0.5*(tracers(iTracer,k,cell1) + tracers(iTracer,k,cell2))      &amp;

-                           -(dcEdge(iEdge) **2) * (d2fdx2_cell1 + d2fdx2_cell2) / 12. )

+      flux = dvEdge(iEdge) *  u(k,iEdge) * h_edge(k,iEdge) * (          &amp;

+         0.5*(tracers(iTracer,k,cell1) + tracers(iTracer,k,cell2))      &amp;

+         -(dcEdge(iEdge) **2) * (d2fdx2_cell1 + d2fdx2_cell2) / 12. )

-                        !-- update tendency

-                        tend_tr(iTracer,k,cell1) = tend_tr(iTracer,k,cell1) - flux/areaCell(cell1)

-                        tend_tr(iTracer,k,cell2) = tend_tr(iTracer,k,cell2) + flux/areaCell(cell2)

-                enddo

-        end do

+      !-- update tendency

+      tend_tr(iTracer,k,cell1) = tend_tr(iTracer,k,cell1) - flux/areaCell(cell1)

+      tend_tr(iTracer,k,cell2) = tend_tr(iTracer,k,cell2) + flux/areaCell(cell2)

+    enddo

+  end do

 end do

 \end{verbatimtab}

-Which replaces the branch with a 6 multiplies and two logicial nots, and allows the loop to be vectorized easier. Other performance enhancements are implemented as seen fit.

+Which replaces the branch with six multiplies and two logical nots, and allows the loop to be vectorized more easily. Other performance enhancements are to be implemented as seen fit.

 \section{Parallelism}

 Date last modified: 2011/09/01 \\

@@ -200,24 +206,24 @@

 The third level of parallelism will take the most work. To begin, a suitable method of parallelization for accelerated architectures needs to be identified. In the event CUDA or OpenCL are chosen to perform a set of tasks on GPUs, some major modifications will need to be made to make algorithms suitable for programming in this fashion, at least if portable code is still a major goal. \\

-In order to maintain portable code and use CUDA or OpenCL it is likely that some algorithms, or modules, will need to be ported to having a Fortran interface on top of C code, to allow multiple compilers, as CUDA currently is only supported in Fortan using the PGI compilers, and OpenCL is not supported in Fortran at all. However it remains unclear if other avenues of parallelism on this level are available.

+In order to maintain portable code and use CUDA or OpenCL, it is likely that some algorithms, or modules, will need to be ported to a Fortran interface on top of C code to allow the use of multiple compilers, as CUDA is currently only supported in Fortran using the PGI compilers, and OpenCL is not supported in Fortran at all. However, it remains unclear if other avenues of parallelism on this level are available.

 \section{Modularity}

 Date last modified: 2011/09/01 \\

 Contributors: (Doug Jacobsen, Phil Jones) \\

-In order to aid the parallel development of code, and increase overall performance the modularity of MPAS-Ocean will be increased. Idenfied components within currently existing portions of code can be extracted and put into their own modules, as seen fit.

+In order to aid the parallel development of MPAS-Ocean by multiple developers, and to increase its overall performance, the modularity of MPAS-Ocean will be increased. Identified components within currently existing portions of code can be extracted and put into their own modules, as seen fit.

 As an example, within module\_time\_integration.F the horizontal mixing of tracers and momentum has a $\nabla^2$ and $\nabla^4$ option hard coded. This would be removed, and put in its own module named something like module\_OcnHmixMom.F. This module would include possible parameterizations, and handle the selection of each. In the described case two sub-modules would be created each named something like module\_OcnHmixMomDel2.F and module\_OcnHmixMomDel4.F, representing each of the two options. Each of these would contain the portion of code relevant to computing the tendencies for the momentum equation related to the parameterization.

-This modular programming fashion allows several things. First, array masking can be applied by passing in pointers to arrays that will be modifed, which allows the compiler to optimize computations involved these more effectively. Second, it allows parameterizations to be explored and implemented in an easier fashion that is currently available in MPAS-Ocean. Third, it should reduce the overall memory footprint of the MPAS-Ocean at any point in time, as anywhere in the code less arrays will be used. And finally, it increases the ability for encapsulation within MPAS-Ocean.

+This modular programming fashion allows several things. First, array masking can be applied by passing in pointers to arrays which may or may not be modified, which allows the compiler to optimize the computations involving these more effectively. Second, it allows parameterizations to be explored and implemented in an easier fashion than is currently available in MPAS-Ocean. Third, it should reduce the overall memory footprint of MPAS-Ocean at any point in time, as anywhere in the code fewer arrays will be used. And finally, it increases the ability for encapsulation within MPAS-Ocean.

 Modularity can be seen in figure \ref{fig:modules}.

 \begin{figure}

-        \centering

-        \includegraphics[scale=0.5]{NewArchitecture.eps}

+        \includegraphics[scale=0.35]{NewArchitecture.eps}

         \label{fig:modules}

+        \caption{Example of proposed modularity of MPAS-Ocean.}

 \end{figure}

 \section{Encapsulation}

@@ -235,6 +241,8 @@

 Testing of each performance enhancement will be explored by using a basic set of timers and test cases. Unit tests will be developed that can then be set as part of a suite of tests useful for further development. Some larger tests will also be used to verify the model's accuracy, and that no detrimental changes were made in each step.

+A useful effort would be to develop a ``test'' suite that can be run on a branch or trunk to validate the output of the current model. Ideally this would be a script that could be submitted to an HPC queue, and would run several test cases and examine the results to determine if the model is correct or not. Of course the test cases and criteria for a ``correct'' model need to be determined.

+

 %-----------------------------------------------------------------------

 \end{document}

</font>

</pre>