hsmt_nonsph_GPU.F90 18.4 KB
Newer Older
1
MODULE m_hsmt_nonsph_GPU
2
#define CPP_BLOCKSIZE 4096
3 4 5 6 7 8 9 10 11
  !      USE m_juDFT
  !$     USE omp_lib

  !TODO:
  !  Check what can be done in l_noco=.true. case in terms of use of zgemm or aa_block
  !  Check what happens in case of CPP_INVERSION -> real matrix a

  IMPLICIT NONE
CONTAINS
12
  SUBROUTINE hsmt_nonsph_GPU(DIMENSION,atoms,sym,SUB_COMM, n_size,n_rank,input,isp,nintsp,&
13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
       hlpmsize,noco,l_socfirst, lapw, cell,tlmplm, fj,gj,gk,vk,oneD,l_real,aa_r,aa_c)

#include"cpp_double.h"
    USE m_constants, ONLY : tpi_const
    USE m_ylm
    USE m_hsmt_spinor
    USE m_hsmt_hlptomat
    USE m_types
#if defined(_OPENACC)
!        USE nvtx
        USE cublas
#endif
    IMPLICIT NONE
    TYPE(t_dimension),INTENT(IN):: DIMENSION
    TYPE(t_oneD),INTENT(IN)     :: oneD
    TYPE(t_input),INTENT(IN)    :: input
    TYPE(t_noco),INTENT(IN)     :: noco
    TYPE(t_sym),INTENT(IN)      :: sym
    TYPE(t_cell),INTENT(IN)     :: cell
    TYPE(t_atoms),INTENT(IN)    :: atoms
    TYPE(t_lapw),INTENT(INOUT)  :: lapw !lapw%nv_tot is updated

    !     ..
    !     .. Scalar Arguments ..
    INTEGER, INTENT (IN) :: nintsp,isp
    INTEGER, INTENT (IN) :: SUB_COMM,n_size,n_rank 
    INTEGER, INTENT (IN) :: hlpmsize
    LOGICAL, INTENT (IN) :: l_socfirst
    !     ..
    !     .. Array Arguments ..
    TYPE(t_tlmplm),INTENT(IN)::tlmplm
    REAL, INTENT(IN)     :: fj(:,0:,:,:),gj(:,0:,:,:)
    REAL,INTENT(IN)      :: gk(:,:,:),vk(:,:,:)
    !-odim
    !+odim
    LOGICAL, INTENT(IN)     :: l_real
    REAL,   ALLOCATABLE, INTENT (INOUT) :: aa_r(:)!(matsize)
    COMPLEX,ALLOCATABLE, INTENT (INOUT) :: aa_c(:)
    COMPLEX,PARAMETER :: one=CMPLX(1.0,0.0),zero=CMPLX(0.0,0.0)

    !     ..
    !     .. Local Scalars ..
    INTEGER :: i,iii,ii,ij,im,in,k,ki,kj,l,ll1,lm,lmp,lp,jd,m
    INTEGER :: mp,n,na,nn,np,kjmax,iintsp,jintsp
57
    INTEGER :: kiStart, kiiSTart, kc
58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
    INTEGER :: nc ,kii,spin2,ab_dim,lnonsphd,bsize,bsize2,kb
    REAL    :: th,invsfct
    COMPLEX :: term,chi11,chi21,chi22,chihlp


    !     ..
    !     .. Local Arrays ..
    COMPLEX,ALLOCATABLE :: aa_block(:,:)
    COMPLEX,ALLOCATABLE :: dtd(:,:),dtu(:,:),utd(:,:),utu(:,:)
    REAL   :: bmrot(3,3),gkrot(DIMENSION%nvd,3),vmult(3),v(3)
    COMPLEX:: ylm( (atoms%lmaxd+1)**2 ),chi(2,2)
    !     ..
#if defined(_OPENACC)
        COMPLEX,PINNED,ALLOCATABLE :: a(:,:,:),b(:,:,:)
#else
        COMPLEX, ALLOCATABLE :: a(:,:,:),b(:,:,:)
#endif
    COMPLEX, ALLOCATABLE :: ax(:,:),bx(:,:)
    COMPLEX, ALLOCATABLE :: c_ph(:,:)
    COMPLEX,ALLOCATABLE :: aahlp(:),aa_tmphlp(:)
    INTEGER :: cublas_stream,istat


81
<<<<<<< HEAD
82

83
    !call nvtxStartRange("hsmt_nonsph",1)    
84 85 86 87 88
=======
#if defined(_OPENACC)
!    call nvtxStartRange("hsmt_nonsph",1)
#endif
>>>>>>> hsmt_simple
89 90 91 92 93

    lnonsphd=MAXVAL(atoms%lnonsph)*(MAXVAL(atoms%lnonsph)+2)
    ALLOCATE(dtd(0:lnonsphd,0:lnonsphd),utd(0:lnonsphd,0:lnonsphd),dtu(0:lnonsphd,0:lnonsphd),utu(0:lnonsphd,0:lnonsphd))
    !Decide how to distribute the work

94 95 96 97 98
    IF ( noco%l_noco .AND. (.NOT. noco%l_ss) ) THEN
       ALLOCATE ( aahlp(hlpmsize),aa_tmphlp(hlpmsize) )
    ELSE
       ALLOCATE ( aahlp(1),aa_tmphlp(1) )
    END IF
99 100 101 102 103 104 105 106 107 108

    ALLOCATE(aa_block(CPP_BLOCKSIZE,MAXVAL(lapw%nv)))

    ab_dim=1
    IF (noco%l_ss) ab_dim=2
    ALLOCATE(a(DIMENSION%nvd,0:DIMENSION%lmd,ab_dim),b(DIMENSION%nvd,0:DIMENSION%lmd,ab_dim))
    ALLOCATE(ax(DIMENSION%nvd,0:DIMENSION%lmd),bx(DIMENSION%nvd,0:DIMENSION%lmd))
    ALLOCATE(c_ph(DIMENSION%nvd,ab_dim))


109 110
    !$acc data copy(aa_r,aa_c) copyin(tlmplm, tlmplm%tdd, tlmplm%tdu, tlmplm%tud,tlmplm%tuu, tlmplm%ind, atoms,atoms%lnonsph,lapw,lapw%nv,noco ) create(utu,utd,dtu,dtd,ax,bx,a,b,aa_block,aahlp)
#if defined(_OPENACC)
111
!    call nvtxStartRange("hsmt_nonsph_outer_data",2)
112
#endif
113 114 115 116 117 118 119 120

    ntyploop: DO n=1,atoms%ntype
       IF (noco%l_noco) THEN
          IF (.NOT.noco%l_ss) aahlp=CMPLX(0.0,0.0)
          IF (.NOT.noco%l_ss) aa_tmphlp=CMPLX(0.0,0.0)
          CALL hsmt_spinor(isp,n, noco,input, chi, chi11, chi21, chi22)
       ENDIF
       DO nn = 1,atoms%neq(n)
121 122
          a=0.0
          b=0.0
123 124 125 126 127 128 129 130
          na = SUM(atoms%neq(:n-1))+nn
          IF (atoms%lnonsph(n)<0) CYCLE ntyploop
          IF ((atoms%invsat(na)==0) .OR. (atoms%invsat(na)==1)) THEN
             IF (atoms%invsat(na)==0) invsfct = 1
             IF (atoms%invsat(na)==1) invsfct = 2
             np = sym%invtab(atoms%ngopr(na))
             IF (oneD%odi%d1) np = oneD%ods%ngopr(na)
             !Using double buffering create_ab could be overlapped with following GPU work
131 132 133
#if defined(_OPENACC)
!             call nvtxStartRange("create_ab",6)
#endif
134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
             !--->       loop over interstitial spins
             DO iintsp = 1,nintsp
                IF (noco%l_constr.OR.l_socfirst) THEN
                   spin2=isp
                ELSE
                   spin2=iintsp
                ENDIF
                !--->          set up phase factors
                DO k = 1,lapw%nv(iintsp)
                   th= DOT_PRODUCT((/lapw%k1(k,iintsp),lapw%k2(k,iintsp),lapw%k3(k,iintsp)/)+(iintsp-1.5)*noco%qss,atoms%taual(:,na))
                   c_ph(k,iintsp) = CMPLX(COS(tpi_const*th),-SIN(tpi_const*th))
                END DO

                IF (np==1) THEN
                   gkrot( 1:lapw%nv(iintsp),:) = gk( 1:lapw%nv(iintsp),:,iintsp)
                ELSE
                   IF (oneD%odi%d1) THEN
                      bmrot=MATMUL(oneD%ods%mrot(:,:,np),cell%bmat)
                   ELSE
                      bmrot=MATMUL(1.*sym%mrot(:,:,np),cell%bmat)
                   END IF
                   DO k = 1,lapw%nv(iintsp)
                      !-->  apply the rotation that brings this atom into the
                      !-->  representative (this is the definition of ngopr(na)
                      !-->  and transform to cartesian coordinates
                      v(:) = vk(k,:,iintsp)
                      gkrot(k,:) = MATMUL(TRANSPOSE(bmrot),v)
                   END DO
                END IF
                DO k = 1,lapw%nv(iintsp)
                   !-->    generate spherical harmonics
                   vmult(:) =  gkrot(k,:)
                   CALL ylm4(atoms%lnonsph(n),vmult,ylm)
                   !-->  synthesize the complex conjugates of a and b
                   DO l = 0,atoms%lnonsph(n)
                      ll1 = l* (l+1)
                      DO m = -l,l
                         term = c_ph(k,iintsp)*ylm(ll1+m+1)
                         a(k,ll1+m,iintsp) = fj(k,l,n,spin2)*term
                         b(k,ll1+m,iintsp) = gj(k,l,n,spin2)*term
                      END DO
                   END DO
                ENDDO !k-loop
                !--->       end loop over interstitial spin
             ENDDO
             !$acc update device(a,b)
180 181 182
#if defined(_OPENACC)
!             call nvtxEndRange
#endif
183 184 185 186

             !--->       loops over the interstitial spin
             DO iintsp = 1,nintsp

187 188 189
#if defined(_OPENACC)
!                call nvtxStartRange("hsmt_nonsph_DO_jintsp",3)
#endif
190 191 192 193 194 195 196 197
                DO jintsp = 1,iintsp

                   jd = 1 ; IF (noco%l_noco) jd = isp
                   !--->       loop over l',m'
                   !$acc kernels
                   utu=0.0;utd=0.0;dtu=0.0;dtd=0.0
!!$OMP PARALLEL DO DEFAULT(NONE) PRIVATE(lp,mp,lmp,l,m,lm,in,utu,dtu,utd,dtd,im,k) &
!!$OMP SHARED(tlmplm,invsfct,lnonsph,nv,jintsp,jd,n)
198
                   !$acc loop collapse(2)
199
                   DO lmp=0,atoms%lnonsph(n)*(atoms%lnonsph(n)+2)
200 201 202
       !               lp=FLOOR(SQRT(1.0*lmp))
       !               mp=lmp-lp*(lp+1)
       !               IF (lp>atoms%lnonsph(n).OR.ABS(mp)>lp) STOP "BUG"
203
                      !--->             loop over l,m
204 205 206 207
                      DO lm = 0,atoms%lnonsph(n)*(atoms%lnonsph(n)+2)
                   !   DO l = 0,atoms%lnonsph(n)
                   !      DO m = -l,l
                   !         lm = l* (l+1) + m
208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224
                            in = tlmplm%ind(lmp,lm,n,jd)
                            IF (in/=-9999) THEN
                               IF (in>=0) THEN
                                  utu(lm,lmp) =CONJG(tlmplm%tuu(in,n,jd))*invsfct
                                  dtu(lm,lmp) =CONJG(tlmplm%tdu(in,n,jd))*invsfct
                                  utd(lm,lmp) =CONJG(tlmplm%tud(in,n,jd))*invsfct
                                  dtd(lm,lmp) =CONJG(tlmplm%tdd(in,n,jd))*invsfct
                               ELSE
                                  im = -in
                                  utu(lm,lmp) =tlmplm%tuu(im,n,jd)*invsfct
                                  dtu(lm,lmp) =tlmplm%tud(im,n,jd)*invsfct
                                  utd(lm,lmp) =tlmplm%tdu(im,n,jd)*invsfct
                                  dtd(lm,lmp) =tlmplm%tdd(im,n,jd)*invsfct
                               END IF
                               !--->    update ax, bx

                            END IF
225
                    !     END DO
226 227
                      END DO
                   ENDDO
228
                   !$acc end loop
229
                   !$acc end kernels
230
                   
231 232 233 234 235
!!$OMP END PARALLEL DO
                   lmp=atoms%lnonsph(n)*(atoms%lnonsph(n)+2)
                   !ax(:nv(jintsp),0:lmp)=(matmul(a(:nv(jintsp),0:lmp,jintsp),utu(0:lmp,0:lmp))+matmul(b(:nv(jintsp),0:lmp,jintsp),utd(0:lmp,0:lmp)))
                   !bx(:nv(jintsp),0:lmp)=(matmul(a(:nv(jintsp),0:lmp,jintsp),dtu(0:lmp,0:lmp))+matmul(b(:nv(jintsp),0:lmp,jintsp),dtd(0:lmp,0:lmp)))

236 237 238
#if defined(_OPENACC)
!                   call nvtxStartRange("hsmt_nonsph_zgemm_1",4)
#endif
239 240 241 242 243 244 245
                   !$acc host_data use_device(a,b,utu,utd,dtu,dtd,ax,bx )
                   CALL zgemm("N","N",lapw%nv(jintsp),lmp+1,lmp+1,one,a(1,0,jintsp),SIZE(a,1),utu(0,0),SIZE(utu,1),zero,ax,SIZE(ax,1))
                   CALL zgemm("N","N",lapw%nv(jintsp),lmp+1,lmp+1,one,b(1,0,jintsp),SIZE(a,1),utd(0,0),SIZE(utu,1),one,ax,SIZE(ax,1))

                   CALL zgemm("N","N",lapw%nv(jintsp),lmp+1,lmp+1,one,a(1,0,jintsp),SIZE(a,1),dtu(0,0),SIZE(utu,1),zero,bx,SIZE(ax,1))
                   CALL zgemm("N","N",lapw%nv(jintsp),lmp+1,lmp+1,one,b(1,0,jintsp),SIZE(a,1),dtd(0,0),SIZE(utu,1),one,bx,SIZE(ax,1))
                   !$acc end host_data
246 247 248
#if defined(_OPENACC)
!                   call nvtxEndRange
#endif
249 250 251 252 253 254 255 256 257 258 259


                   !
                   !--->             update hamiltonian and overlap matrices
                   nc = 0
                   IF ( noco%l_noco .AND. (n_size>1) ) THEN
                      lapw%nv_tot = lapw%nv(1) + lapw%nv(2)
                   ELSE
                      lapw%nv_tot = lapw%nv(iintsp)
                   ENDIF
                   kii=n_rank
260 261 262
#if defined(_OPENACC)
!                   call nvtxStartRange("hsmt_nonsph_while_kii",5)
#endif
263 264 265 266 267 268 269 270 271 272 273 274 275 276 277
                   DO WHILE(kii<lapw%nv_tot)
                      !DO kii =  n_rank, nv_tot-1, n_size
                      ki = MOD(kii,lapw%nv(iintsp)) + 1
                      bsize=MIN(SIZE(aa_block,1),(lapw%nv(iintsp)-ki)/n_size+1) !Either use maximal blocksize or number of rows left to calculate
                      IF (bsize<1) EXIT !nothing more to do here
                      bsize2=bsize*n_size
                      bsize2=min(bsize2,lapw%nv(iintsp)-ki+1)
                      !aa_block(:bsize,:ki+bsize2-1)=matmul(a(ki:ki+bsize2-1:n_size,0:lmp,iintsp),conjg(transpose(ax(:ki+bsize2-1,0:lmp))))+ &
                      !                              matmul(b(ki:ki+bsize2-1:n_size,0:lmp,iintsp),conjg(transpose(bx(:ki+bsize2-1,0:lmp))))
                      IF (n_size==1) THEN !Make this a special case to avoid copy-in of a array
                         !$acc host_data use_device( a,b,ax,bx,aa_block )
                         call zgemm("N","C",bsize,ki+bsize2-1,lmp+1,one,a(ki,0,iintsp),SIZE(a,1),ax(1,0),SIZE(ax,1),zero,aa_block,SIZE(aa_block,1))
                         call zgemm("N","C",bsize,ki+bsize2-1,lmp+1,one,b(ki,0,iintsp),SIZE(a,1),bx(1,0),SIZE(ax,1),one ,aa_block,SIZE(aa_block,1))
                         !$acc end host_data
                      ELSE
278
                         stop "Not implemented"
279
                         !$acc host_data use_device( a,b,ax,bx,aa_block )
280 281 282 283
                         !CALL zgemm("N","C",bsize,ki+bsize2-1,lmp+1,one,a(ki:ki+bsize2-1:n_size,0:lmp,iintsp), &
                         !           SIZE(a(ki:ki+bsize2-1:n_size,0:lmp,iintsp),1),ax(1,0),SIZE(ax,1),zero,aa_block,SIZE(aa_block,1))
                         !CALL zgemm("N","C",bsize,ki+bsize2-1,lmp+1,one,b(ki:ki+bsize2-1:n_size,0:lmp,iintsp), &
                         !           SIZE(a(ki:ki+bsize2-1:n_size,0:lmp,iintsp),1),bx(1,0),SIZE(ax,1),one,aa_block,SIZE(aa_block,1))
284 285 286
                         !$acc end host_data
                      ENDIF
                      IF ( noco%l_noco .AND. (.NOT. noco%l_ss) ) THEN
287
                          !$acc kernels
288 289 290 291 292
                          !$acc loop independent
                          DO kb=1,bsize
                            nc = 1+kii/n_size
                            ii = nc*(nc-1)/2*n_size-(nc-1)*(n_size-n_rank-1)
                            IF ( (n_size==1).OR.(kii+1<=lapw%nv(1)) ) THEN    !
293 294 295
                                stop "not implemented"
                               !comments below must be reactivated
                               !aahlp(ii+1:ii+ki) = aahlp(ii+1:ii+ki)+MATMUL(CONJG(ax(:ki,:lmp)),a(ki,:,iintsp))+MATMUL(CONJG(bx(:ki,:lmp)),b(ki,:lmp,iintsp))
296
                            ELSE                    ! components for <2||2> block unused
297
                               !aa_tmphlp(:ki) = MATMUL(CONJG(ax(:ki,:lmp)),a(ki,:lmp,iintsp))+MATMUL(CONJG(bx(:ki,:DIMENSION%lmd)),b(ki,:lmp,iintsp))
298 299 300 301 302 303 304 305 306 307 308 309
                               !--->                   spin-down spin-down part
                               ij = ii + lapw%nv(1)
                               aa_c(ij+1:ij+ki)=aa_c(ij+1:ij+ki)+chi22*aa_tmphlp(:ki)
                               !--->                   spin-down spin-up part, lower triangle
                               ij =  ii
                               aa_c(ij+1:ij+ki)=aa_c(ij+1:ij+ki)+chi21*aa_tmphlp(:ki)
                            ENDIF
                            !-||
                            ki=ki+n_size
                            kii=kii+n_size
                          ENDDO
                          !$acc end loop
310
                          !$acc end kernels
311
                       ELSEIF ( noco%l_noco .AND. noco%l_ss ) THEN
312
                          !$acc kernels
313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331
                          !$acc loop independent
                          DO kb=1,bsize
                            IF ( iintsp==1 .AND. jintsp==1 ) THEN
                               !--->                      spin-up spin-up part
                               kjmax = ki
                               chihlp = chi11
                               ii = (ki-1)*(ki)/2
                            ELSEIF ( iintsp==2 .AND. jintsp==2 ) THEN
                               !--->                      spin-down spin-down part
                               kjmax = ki
                               chihlp = chi22
                               ii = (lapw%nv(1)+atoms%nlotot+ki-1)*(lapw%nv(1)+atoms%nlotot+ki)/2+&
                                    lapw%nv(1)+atoms%nlotot
                            ELSE
                               !--->                      spin-down spin-up part
                               kjmax = lapw%nv(1)
                               chihlp = chi21
                               ii = (lapw%nv(1)+atoms%nlotot+ki-1)*(lapw%nv(1)+atoms%nlotot+ki)/2
                            ENDIF
332 333 334 335
                            STOP "NOT implemented"
                            !next line must be reactivated
                            !aa_c(ii+1:ii+kjmax) = aa_c(ii+1:ii+kjmax) + chihlp*&
                            !     (MATMUL(CONJG(ax(:kjmax,:lmp)),a(ki,:,iintsp))+MATMUL(CONJG(bx(:kjmax,:lmp)),b(ki,:lmp,iintsp)))
336 337 338 339
                            ki=ki+n_size
                            kii=kii+n_size
                          ENDDO
                          !$acc end loop
340
                          !$acc end kernels
341
                       ELSE
342 343 344 345
                          kiStart = ki
                          kiiSTart = kii
                          !$acc kernels
                          !$acc loop independent worker
346
                          DO kb=1,bsize
347 348
                            ki = kiStart + (kb-1)*n_size
                            kii = kiiStart + (kb-1)*n_size
349 350 351
                            nc = 1+kii/n_size
                            ii = nc*(nc-1)/2*n_size- (nc-1)*(n_size-n_rank-1)
                            if (l_real) THEN
352 353 354 355 356
                       !        aa_r(ii+1:ii+ki) = aa_r(ii+1:ii+ki) + aa_block(kb,:ki)
                               !$acc loop independent vector
                               DO kc = 1, ki
                                  aa_r(ii+kc) = aa_r(ii+kc) + aa_block(kb,kc)
                               END DO
357
                            ELSE
358 359 360 361 362
                       !        aa_c(ii+1:ii+ki) = aa_c(ii+1:ii+ki) + aa_block(kb,:ki)
                               !$acc loop independent vector
                               DO kc = 1, ki
                                  aa_c(ii+kc) = aa_c(ii+kc) + aa_block(kb,kc)
                               END DO
363 364 365 366 367
                            endif
                            !print*,ii,ki,kb
                            !                           IF (.not.apw(l)) THEN
                            !aa(ii+1:ii+ki) = aa(ii+1:ii+ki) + b(ki,lmp,iintsp)*bx(:ki)
                            !                           ENDIF
368 369
!                            ki=ki+n_size
!                            kii=kii+n_size
370 371
                          ENDDO
                          !$acc end loop
372 373 374
                          !$acc end kernels
                          ki = kiStart+bsize*n_size
                          kii = kiiSTart+bsize*n_size
375 376 377
                       ENDIF
                      !--->             end loop over ki
                   END DO
378 379 380
#if defined(_OPENACC)
!                   call nvtxEndRange
#endif
381 382
                   !--->       end loops over interstitial spin
                ENDDO
383 384 385
#if defined(_OPENACC)
!                call nvtxEndRange
#endif
386 387 388 389 390 391 392 393 394 395 396 397
             ENDDO
          ENDIF              ! atoms%invsat(na) = 0 or 1
          !--->    end loop over equivalent atoms
       END DO
       IF ( noco%l_noco .AND. (.NOT. noco%l_ss) )THEN
           !$acc update host(aahlp,aa_c)
           CALL hsmt_hlptomat(atoms%nlotot,lapw%nv,sub_comm,chi11,chi21,chi22,aahlp,aa_c)
           !$acc update device(aa_c)
       ENDIF
       !---> end loop over atom types (ntype)
    ENDDO ntyploop

398 399 400
#if defined(_OPENACC)
!    call nvtxEndRange
#endif
401
    !$acc end data
402 403 404
#if defined(_OPENACC)
!    call nvtxEndRange
#endif
405 406 407


    RETURN
408
  END SUBROUTINE hsmt_nonsph_GPU
409 410


411
END MODULE m_hsmt_nonsph_GPU