halo_common_gpu.F90

!=======================================================================
! This is part of the 2DECOMP&FFT library
! 
! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
! decomposition. It also implements a highly scalable distributed
! three-dimensional Fast Fourier Transform (FFT).
!
! Copyright (C) 2009-2012 Ning Li, the Numerical Algorithms Group (NAG)
!
!=======================================================================

    s1 = size(in,1)
    s2 = size(in,2)
    s3 = size(in,3)

    ! Calculate the starting index and ending index of output
    xs = 1
    xe = s1
    ys = 1 + level
    ye = s2 - level
    zs = 1 + level
    ze = s3  - level

    s2 = s2 - 2*level
    s3 = s3 - 2*level

    ! If needed, define MPI derived data type to pack halo data,
    ! then call MPI send/receive to exchange halo data

       ! *** north/south *** 
 
     tag_s = coord(1)
      if (coord(1)==dims(1)-1 .AND. periodic_y) then
         tag_n = 0
      else
         tag_n = coord(1) + 1
      end if
       icount = s3
       ilength = level * s1
       ijump = s1*(s2+2*level)

     if(tag_n.ne.tag_s) then
#ifdef USE_CUDA
       istat = cudaMemcpy2D( work1_r(               1), ilength, in(xs,ys          ,zs), ijump, ilength, icount )
       istat = cudaMemcpy2D( work1_r(ilength*icount+1), ilength, in(xs,ye-(level-1),zs), ijump, ilength, icount )

       ! receive from south
       call MPI_IRECV(work2_r(               1), ilength*icount, data_type, &
            neighbour(1,4), tag_s, DECOMP_2D_COMM_CART_X, &
            requests(1), ierror)
       ! receive from north
       call MPI_IRECV(work2_r(ilength*icount+1), ilength*icount, data_type, &
            neighbour(1,3), tag_n, DECOMP_2D_COMM_CART_X, &
            requests(2), ierror)

       istat = cudaDeviceSynchronize()

       ! send to south
       call MPI_ISSEND(work1_r(               1), ilength*icount, data_type, &
            neighbour(1,4), tag_s, DECOMP_2D_COMM_CART_X, &
            requests(3), ierror) 
       ! send to north 
       call MPI_ISSEND(work1_r(ilength*icount+1), ilength*icount, data_type, &
            neighbour(1,3), tag_n, DECOMP_2D_COMM_CART_X, &
            requests(4), ierror) 

       call MPI_WAITALL(4, requests, status, ierror)

       istat = cudaMemcpy2D( in(xs,ys-level,zs), ijump, work2_r(               1), ilength, ilength, icount )
       istat = cudaMemcpy2D( in(xs,ye+1    ,zs), ijump, work2_r(ilength*icount+1), ilength, ilength, icount )

#else

       call MPI_TYPE_VECTOR(icount, ilength, ijump, &
            data_type, halo12, ierror)
       call MPI_TYPE_COMMIT(halo12, ierror)
       ! receive from south
       call MPI_IRECV(in(xs,ys-level,zs), 1, halo12, &
            neighbour(1,4), tag_s, DECOMP_2D_COMM_CART_X, &
            requests(1), ierror)
       ! receive from north
       call MPI_IRECV(in(xs,ye+1,zs), 1, halo12, &
            neighbour(1,3), tag_n, DECOMP_2D_COMM_CART_X, &
            requests(2), ierror)
       ! send to south
       call MPI_ISSEND(in(xs,ys,zs), 1, halo12, &
            neighbour(1,4), tag_s, DECOMP_2D_COMM_CART_X, &
            requests(3), ierror)
       ! send to north
       call MPI_ISSEND(in(xs,ye-(level-1),zs), 1, halo12, &
            neighbour(1,3), tag_n, DECOMP_2D_COMM_CART_X, &
            requests(4), ierror)
       call MPI_WAITALL(4, requests, status, ierror)
       call MPI_TYPE_FREE(halo12, ierror)
#endif
      else
!EP   MEMCPY when already in local memory to avoid indirect indexing
!EP   Performance difference unchecked
      if (level .ne. 1)  print *,"This is not supported"


     !$cuf kernel do(2) <<<*,*>>> 
      do j=lbound(in,3),ubound(in,3) 
       do i=lbound(in,1),ubound(in,1) 
       in(i,ye+1,j) = in(i,ys,j)
       in(i,ys-1,j) = in(i,ye,j)
      end do
     end do
     endif


       ! *** top/bottom ***
      tag_b = coord(2)
       if (coord(2)==dims(2)-1 .AND. periodic_z) then
          tag_t = 0
       else
         tag_t = coord(2) + 1
       end if

      ys = 1 
      s2 = s2 + 2*level

      icount = (s1 * s2) * level

     if(tag_t.ne.tag_b) then
#ifdef USE_CUDA
       istat = cudaMemcpy( work1_r(1       ), in(xs,ys,zs          ), icount )
       istat = cudaMemcpy( work1_r(1+icount), in(xs,ys,ze-(level-1)), icount ) 
       ! receive from bottom
       call MPI_IRECV(work2_r(1), icount, data_type, &
            neighbour(1,6), tag_b, DECOMP_2D_COMM_CART_X, &
            requests(1), ierror)
       ! receive from top
       call MPI_IRECV(work2_r(1+icount), icount, data_type, &
            neighbour(1,5), tag_t, DECOMP_2D_COMM_CART_X, &
            requests(2), ierror)

       istat = cudaDeviceSynchronize()

       ! send to bottom
       call MPI_ISSEND(work1_r(1), icount, data_type, &
            neighbour(1,6), tag_b, DECOMP_2D_COMM_CART_X, &
            requests(3), ierror)
       ! send to top
       call MPI_ISSEND(work1_r(1+icount), icount, data_type, &
            neighbour(1,5), tag_t, DECOMP_2D_COMM_CART_X, &
            requests(4), ierror)
       call MPI_WAITALL(4, requests, status, ierror)

       istat = cudaMemcpy( in(xs,ys,zs-level), work2_r(1       ), icount )
       istat = cudaMemcpy( in(xs,ys,ze+1    ), work2_r(1+icount), icount )
#else
       ! receive from bottom
       call MPI_IRECV(in(xs,ys,zs-level), icount, data_type, &
            neighbour(1,6), tag_b, DECOMP_2D_COMM_CART_X, &
            requests(1), ierror)
       ! receive from top
       call MPI_IRECV(in(xs,ys,ze+1), icount, data_type, &
            neighbour(1,5), tag_t, DECOMP_2D_COMM_CART_X, &
            requests(2), ierror)
       ! send to bottom
       call MPI_ISSEND(in(xs,ys,zs), icount, data_type, &
            neighbour(1,6), tag_b, DECOMP_2D_COMM_CART_X, &
            requests(3), ierror)
       ! send to top
       call MPI_ISSEND(in(xs,ys,ze-(level-1)), icount, data_type, &
            neighbour(1,5), tag_t, DECOMP_2D_COMM_CART_X, &
            requests(4), ierror)
       call MPI_WAITALL(4, requests, status, ierror)
#endif
     else
      if (level .ne. 1)  print *,"This is not supported"

     !$cuf kernel do(2) <<<*,*>>> 
      do j=lbound(in,2),ubound(in,2) 
       do i=lbound(in,1),ubound(in,1) 
       in(i,j,ze+1) = in(i,j,zs)
       in(i,j,zs-1) = in(i,j,ze)
      end do
     end do
     endif

#ifdef HALO_DEBUG       
          if(nrank.eq.0) write(*,*) 'HALO COMPARISON'
          ys = 1 + level
          ye = size(in,2) - level
          zs = 1 + level
          ze = size(in,3)  - level


          cksum = 0.0d0
          cksum2 = 0.0d0
          cksum3 = 0.0d0
          cksum4 = 0.0d0
          do i=zs,ze
            do k=xs,xe
              cksum = cksum + in(k,ys,i)
              cksum2 = cksum2 + in(k,ye,i)
              cksum3 = cksum3 + in(k,ys-1,i)
              cksum4 = cksum4 + in(k,ye+1,i)
            end do
          end do
          write(*,*) "CKSUM YS",nrank,ys,cksum
          write(*,*) "CKSUM YE",nrank,ye,cksum2
          write(*,*) "CKSUM YS-1",nrank,ys-1,cksum3
          write(*,*) "CKSUM YE+1",nrank,ye+1,cksum4

           cksum = 0.0d0
           cksum2 = 0.0d0
           cksum3 = 0.0d0
           cksum4 = 0.0d0
           do i=zs,ze
             do k=xs,xe
               cksum = cksum + in(k,ys+1,i)
               cksum2 = cksum2 + in(k,ye-1,i)
               cksum3 = cksum3 + in(k,ys-2,i)
               cksum4 = cksum4 + in(k,ye+2,i)
             end do
           end do
           write(*,*) "CKSUM YS+1",nrank,ys+1,cksum
           write(*,*) "CKSUM YE-1",nrank,ye-1,cksum2
           write(*,*) "CKSUM YS-2",nrank,ys-2,cksum3
           write(*,*) "CKSUM YE+2",nrank,ye+2,cksum4

          cksum = 0.0d0
          cksum2 = 0.0d0
          cksum3 = 0.0d0
          cksum4 = 0.0d0
          do j=ys,ye
            do k=xs,xe
              cksum = cksum + in(k,j,zs)
              cksum2 = cksum2 + in(k,j,ze)
              cksum3 = cksum3 + in(k,j,zs-1)
              cksum4 = cksum4 + in(k,j,ze+1)
            end do
          end do
          write(*,*) "CKSUM ZS",nrank,zs,cksum
          write(*,*) "CKSUM ZE",nrank,ze,cksum2
          write(*,*) "CKSUM ZS-1",nrank,zs-1,cksum3
          write(*,*) "CKSUM ZE+1",nrank,ze+1,cksum4

          cksum = 0.0d0
          cksum2 = 0.0d0
          cksum3 = 0.0d0
          cksum4 = 0.0d0
          do j=ys,ye
            do k=xs,xe
              cksum = cksum + in(k,j,zs+1)
              cksum2 = cksum2 + in(k,j,ze-1)
              cksum3 = cksum3 + in(k,j,zs-2)
              cksum4 = cksum4 + in(k,j,ze+2)
            end do
          end do
          write(*,*) "CKSUM ZS+1",nrank,zs+1,cksum
          write(*,*) "CKSUM ZE-1",nrank,ze-1,cksum2
          write(*,*) "CKSUM ZS-2",nrank,zs-2,cksum3
          write(*,*) "CKSUM ZE+2",nrank,ze+2,cksum4
#endif