hsmt_ab.F90 9.13 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11
!--------------------------------------------------------------------------------
! Copyright (c) 2016 Peter Grünberg Institut, Forschungszentrum Jülich, Germany
! This file is part of FLEUR and available as free software under the conditions
! of the MIT license as expressed in the LICENSE file in more detail.
!--------------------------------------------------------------------------------
MODULE m_hsmt_ab
  use m_juDFT
  implicit none

  INTERFACE hsmt_ab
    module procedure hsmt_ab_cpu
12
#ifdef CPP_GPU
13 14 15 16 17 18 19
    module procedure hsmt_ab_gpu
#endif
  END INTERFACE


CONTAINS

20
#ifdef CPP_GPU
21

22
  ATTRIBUTES(global) SUBROUTINE synth_ab(loop_size,n,lmax,ab_size,gkrot_dev,fj,gj,c_ph,ab)
23
    USE m_ylm
24
    INTEGER, VALUE, INTENT(IN) :: loop_size, n, lmax, ab_size
Uliana Alekseeva's avatar
Uliana Alekseeva committed
25
    REAL,   DEVICE, INTENT(IN) :: gkrot_dev(:,:),fj(:,:),gj(:,:)
Uliana Alekseeva's avatar
Uliana Alekseeva committed
26
    COMPLEX,DEVICE, INTENT(IN) :: c_ph(:)
27 28
    COMPLEX,DEVICE, INTENT (OUT) :: ab(:,:)
    COMPLEX,ALLOCATABLE :: ylm(:)
29 30
    INTEGER :: k,l,ll1,m,i
    INTEGER :: loop_start, loop_end
31 32 33 34 35 36 37 38 39 40 41 42 43 44

    ALLOCATE(ylm((lmax+1)**2))

    k = (blockidx%x-1)*blockdim%x + threadidx%x
    loop_start = (k-1) * loop_size + 1
    loop_end = loop_start + loop_size - 1
    if (loop_end > n ) loop_end = n

    DO i = loop_start,loop_end
       !-->    generate spherical harmonics
       CALL ylm4_dev(lmax,gkrot_dev(:,i),ylm(:))
       DO l = 0,lmax
          ll1 = l* (l+1)
          DO m = -l,l               
Uliana Alekseeva's avatar
Uliana Alekseeva committed
45 46
             ab(i,ll1+m+1)         = CONJG(fj(i,l+1)*c_ph(i)*ylm(ll1+m+1)) 
             ab(i,ll1+m+1+ab_size) = CONJG(gj(i,l+1)*c_ph(i)*ylm(ll1+m+1)) 
47 48 49 50 51 52 53
          END DO
       END DO
    ENDDO 

    DEALLOCATE(ylm)
  END SUBROUTINE synth_ab

54 55

  SUBROUTINE hsmt_ab_gpu(sym,atoms,noco,ispin,iintsp,n,na,cell,lapw,fj,gj,ab,ab_size,l_nonsph,abclo,alo1,blo1,clo1)
56
!Calculate overlap matrix, GPU version
57 58 59
    USE m_constants, ONLY : fpi_const,tpi_const
    USE m_types
    USE m_ylm
60
    USE cudafor
Uliana Alekseeva's avatar
Uliana Alekseeva committed
61
    USE nvtx
62 63 64 65 66 67 68 69 70 71 72 73 74
    IMPLICIT NONE
    TYPE(t_sym),INTENT(IN)      :: sym
    TYPE(t_cell),INTENT(IN)     :: cell
    TYPE(t_atoms),INTENT(IN)    :: atoms
    TYPE(t_lapw),INTENT(IN)     :: lapw
    TYPE(t_noco),INTENT(IN)     :: noco
    !     ..
    !     .. Scalar Arguments ..
    INTEGER, INTENT (IN) :: ispin,n,na,iintsp
    LOGICAL,INTENT(IN)   :: l_nonsph
    INTEGER,INTENT(OUT)  :: ab_size
    !     ..
    !     .. Array Arguments ..
75
    REAL, DEVICE, INTENT(IN)       :: fj(:,:,:),gj(:,:,:)
76 77 78 79 80 81
    COMPLEX,DEVICE, INTENT (OUT) :: ab(:,:)
    !Optional arguments if abc coef for LOs are needed
    COMPLEX, INTENT(INOUT),OPTIONAL:: abclo(:,-atoms%llod:,:,:)
    REAL,INTENT(IN),OPTIONAL:: alo1(:),blo1(:),clo1(:)
    
    INTEGER:: np,k,l,ll1,m,lmax,nkvec,lo,lm,invsfct
Uliana Alekseeva's avatar
Uliana Alekseeva committed
82 83 84 85 86 87 88
    REAL   :: th,v(3),bmrot(3,3),vmult(3)
    COMPLEX,ALLOCATABLE :: ylm(:,:)
    COMPLEX,ALLOCATABLE :: c_ph(:,:)
    REAL,   ALLOCATABLE :: gkrot(:,:)
    COMPLEX:: term

    COMPLEX,ALLOCATABLE,DEVICE :: c_ph_dev(:,:)
89
    REAL,   ALLOCATABLE,DEVICE :: gkrot_dev(:,:)
90
    INTEGER :: grid, block, loop_size
Uliana Alekseeva's avatar
Uliana Alekseeva committed
91
    INTEGER :: istat
Uliana Alekseeva's avatar
Uliana Alekseeva committed
92
 
Uliana Alekseeva's avatar
Uliana Alekseeva committed
93
    call nvtxStartRange("hsmt_ab",3)    
94 95
    lmax=MERGE(atoms%lnonsph(n),atoms%lmax(n),l_nonsph)

Uliana Alekseeva's avatar
Uliana Alekseeva committed
96
    ALLOCATE(c_ph_dev(lapw%nv(1),MERGE(2,1,noco%l_ss)))
97
    ALLOCATE(gkrot_dev(3,lapw%nv(1)))
98

99
    ALLOCATE(ylm((lmax+1)**2,lapw%nv(1)))
100 101 102 103 104 105 106
    ALLOCATE(c_ph(lapw%nv(1),MERGE(2,1,noco%l_ss)))
    ALLOCATE(gkrot(3,lapw%nv(1)))

    
    ab_size=lmax*(lmax+2)+1
    ab=0.0
    
107
    np = sym%invtab(sym%ngopr(na))
108 109
    !--->          set up phase factors
    CALL lapw%phase_factors(iintsp,atoms%taual(:,na),noco%qss,c_ph(:,iintsp))
Uliana Alekseeva's avatar
Uliana Alekseeva committed
110 111
    c_ph_dev=c_ph   
 
112 113 114 115 116 117 118 119 120 121 122 123 124
    IF (np==1) THEN
       gkrot(:, 1:lapw%nv(iintsp)) = lapw%gk(:, 1:lapw%nv(iintsp),iintsp)
    ELSE
       bmrot=MATMUL(1.*sym%mrot(:,:,np),cell%bmat)
       DO k = 1,lapw%nv(iintsp)
          !-->  apply the rotation that brings this atom into the
          !-->  representative (this is the definition of ngopr(na)
          !-->  and transform to cartesian coordinates
          v(:) = lapw%vk(:,k,iintsp)
          gkrot(:,k) = MATMUL(TRANSPOSE(bmrot),v)
       END DO
    END IF

125 126 127 128
    gkrot_dev = gkrot 


    !-->  synthesize the complex conjugates of a and b
129 130 131 132 133
    grid = 30    ! number of blocks in the grid
    block = 32   ! number of threads in a block
    loop_size = max(lapw%nv(1)/(grid*block),1)   !number of iterations performed by each thread
    if (loop_size * grid*block < lapw%nv(1)) loop_size = loop_size + 1
    CALL synth_ab<<<grid,block>>>(loop_size,lapw%nv(1),lmax,ab_size,gkrot_dev,&
Uliana Alekseeva's avatar
Uliana Alekseeva committed
134
                                  fj(:,:,iintsp),gj(:,:,iintsp),c_ph_dev(:,iintsp),ab)
135 136

    IF (PRESENT(abclo)) THEN
137 138 139
       print*, "Ooooops, TODO in hsmt_ab"
       !DO k = 1,lapw%nv(1)
       !   !determine also the abc coeffs for LOs
140
       !   invsfct=MERGE(1,2,sym%invsat(na).EQ.0)
141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156
       !   term = fpi_const/SQRT(cell%omtil)* ((atoms%rmt(n)**2)/2)*c_ph(k,iintsp)
       !   DO lo = 1,atoms%nlo(n)
       !      l = atoms%llo(lo,n)
       !      DO nkvec=1,invsfct*(2*l+1)
       !         IF (lapw%kvec(nkvec,lo,na)==k) THEN !This k-vector is used in LO
       !            ll1 = l*(l+1) + 1
       !            DO m = -l,l
       !               lm = ll1 + m
       !               abclo(1,m,nkvec,lo) = term*ylm(k,lm)*alo1(lo)
       !               abclo(2,m,nkvec,lo) = term*ylm(k,lm)*blo1(lo)
       !               abclo(3,m,nkvec,lo) = term*ylm(k,lm)*clo1(lo)
       !            END DO
       !         END IF
       !      ENDDO
       !   ENDDO
       !ENDDO
157 158
    ENDIF
       
159
    ab_size=ab_size*2
160
    
161 162 163
    DEALLOCATE(c_ph_dev)
    DEALLOCATE(gkrot_dev)

Uliana Alekseeva's avatar
Uliana Alekseeva committed
164 165
    istat = cudaDeviceSynchronize() 
    call nvtxEndRange
166 167 168 169
  END SUBROUTINE hsmt_ab_gpu
#endif

  SUBROUTINE hsmt_ab_cpu(sym,atoms,noco,ispin,iintsp,n,na,cell,lapw,fj,gj,ab,ab_size,l_nonsph,abclo,alo1,blo1,clo1)
170
!Calculate overlap matrix, CPU vesion
171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193
    USE m_constants, ONLY : fpi_const,tpi_const
    USE m_types
    USE m_ylm
    IMPLICIT NONE
    TYPE(t_sym),INTENT(IN)      :: sym
    TYPE(t_cell),INTENT(IN)     :: cell
    TYPE(t_atoms),INTENT(IN)    :: atoms
    TYPE(t_lapw),INTENT(IN)     :: lapw
    TYPE(t_noco),INTENT(IN)     :: noco
    !     ..
    !     .. Scalar Arguments ..
    INTEGER, INTENT (IN) :: ispin,n,na,iintsp
    LOGICAL,INTENT(IN)   :: l_nonsph
    INTEGER,INTENT(OUT)  :: ab_size
    !     ..
    !     .. Array Arguments ..
    REAL,INTENT(IN)       :: fj(:,0:,:),gj(:,0:,:)
    COMPLEX, INTENT (OUT) :: ab(:,:)
    !Optional arguments if abc coef for LOs are needed
    COMPLEX, INTENT(INOUT),OPTIONAL:: abclo(:,-atoms%llod:,:,:)
    REAL,INTENT(IN),OPTIONAL:: alo1(:),blo1(:),clo1(:)
    
    INTEGER:: np,k,l,ll1,m,lmax,nkvec,lo,lm,invsfct
Uliana Alekseeva's avatar
Uliana Alekseeva committed
194 195
    COMPLEX:: term
    REAL   :: th,v(3),bmrot(3,3),vmult(3)
196
    COMPLEX :: ylm((atoms%lmaxd+1)**2)
Uliana Alekseeva's avatar
Uliana Alekseeva committed
197 198
    COMPLEX,ALLOCATABLE:: c_ph(:,:)
    REAL,ALLOCATABLE   :: gkrot(:,:)
199 200
    LOGICAL :: l_apw
   
201
    ALLOCATE(c_ph(maxval(lapw%nv),MERGE(2,1,noco%l_ss.or.noco%l_mtNocoPot)))
202
    ALLOCATE(gkrot(3,maxval(lapw%nv)))
203 204 205 206 207 208 209

    lmax=MERGE(atoms%lnonsph(n),atoms%lmax(n),l_nonsph)
    
    ab_size=lmax*(lmax+2)+1
    l_apw=ALL(gj==0.0)
    ab=0.0
    
210
    np = sym%invtab(sym%ngopr(na))
211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226
    !--->          set up phase factors
    CALL lapw%phase_factors(iintsp,atoms%taual(:,na),noco%qss,c_ph(:,iintsp))
    
    IF (np==1) THEN
       gkrot(:, 1:lapw%nv(iintsp)) = lapw%gk(:, 1:lapw%nv(iintsp),iintsp)
    ELSE
       bmrot=MATMUL(1.*sym%mrot(:,:,np),cell%bmat)
       DO k = 1,lapw%nv(iintsp)
          !-->  apply the rotation that brings this atom into the
          !-->  representative (this is the definition of ngopr(na)
          !-->  and transform to cartesian coordinates
          v(:) = lapw%vk(:,k,iintsp)
          gkrot(:,k) = MATMUL(TRANSPOSE(bmrot),v)
       END DO
    END IF
    !$OMP PARALLEL DO DEFAULT(none) &
227
    !$OMP& SHARED(lapw,gkrot,lmax,c_ph,iintsp,ab,fj,gj,abclo,cell,atoms,sym) &
228 229
    !$OMP& SHARED(alo1,blo1,clo1,ab_size,na,n) &
    !$OMP& PRIVATE(k,vmult,ylm,l,ll1,m,lm,term,invsfct,lo,nkvec)
230
    DO k = 1,lapw%nv(iintsp)
231 232 233 234 235 236 237 238 239 240 241 242 243 244
       !-->    generate spherical harmonics
       vmult(:) =  gkrot(:,k)
       CALL ylm4(lmax,vmult,ylm)
       !-->  synthesize the complex conjugates of a and b
       DO l = 0,lmax
          ll1 = l* (l+1)
          DO m = -l,l               
             term = c_ph(k,iintsp)*ylm(ll1+m+1)
             ab(k,ll1+m+1)         = fj(k,l,iintsp)*term
             ab(k,ll1+m+1+ab_size) = gj(k,l,iintsp)*term
          END DO
       END DO
       IF (PRESENT(abclo)) THEN
          !determine also the abc coeffs for LOs
245
          invsfct=MERGE(1,2,sym%invsat(na).EQ.0)
246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268
          term = fpi_const/SQRT(cell%omtil)* ((atoms%rmt(n)**2)/2)*c_ph(k,iintsp)
          DO lo = 1,atoms%nlo(n)
             l = atoms%llo(lo,n)
             DO nkvec=1,invsfct*(2*l+1)
                IF (lapw%kvec(nkvec,lo,na)==k) THEN !This k-vector is used in LO
                   ll1 = l*(l+1) + 1
                   DO m = -l,l
                      lm = ll1 + m
                      abclo(1,m,nkvec,lo) = term*ylm(lm)*alo1(lo)
                      abclo(2,m,nkvec,lo) = term*ylm(lm)*blo1(lo)
                      abclo(3,m,nkvec,lo) = term*ylm(lm)*clo1(lo)
                   END DO
                END IF
             ENDDO
          ENDDO
       ENDIF
       
    ENDDO !k-loop
    !$OMP END PARALLEL DO
    IF (.NOT.l_apw) ab_size=ab_size*2
    
  END SUBROUTINE hsmt_ab_cpu
END MODULE m_hsmt_ab