Commit 0e88e072 authored by Uliana Alekseeva's avatar Uliana Alekseeva

hsmt_ab ported onto GPU (partly)

parent 60d5b177
......@@ -6,7 +6,7 @@ eigen/eigen.F90
eigen/hlomat.F90
eigen/hs_int.F90
eigen/hsmt_fjgj.F90
eigen/hsmt_ab.f90
eigen/hsmt_ab.F90
eigen/hsmt_sph.F90
eigen/hsmt_nonsph.F90
eigen/hsmt_spinor.F90
......@@ -34,6 +34,7 @@ eigen/vacfun.f90
eigen/vec_for_lo.f90
eigen/eigen_redist_matrix.f90
)
if (FLEUR_USE_GPU)
set(fleur_F90 ${fleur_F90} eigen/hsmt_nonsph_GPU.F90)
endif()
#if (FLEUR_USE_GPU)
# set(fleur_F90 ${fleur_F90}
#eigen/hsmt_nonsph_GPU.F90)
#endif()
......@@ -6,8 +6,151 @@
MODULE m_hsmt_ab
use m_juDFT
implicit none
INTERFACE hsmt_ab
module procedure hsmt_ab_cpu
#ifdef _CUDA
module procedure hsmt_ab_gpu
#endif
END INTERFACE
CONTAINS
SUBROUTINE hsmt_ab(sym,atoms,noco,ispin,iintsp,n,na,cell,lapw,fj,gj,ab,ab_size,l_nonsph,abclo,alo1,blo1,clo1)
#ifdef _CUDA
!private
!ATTRIBUTE(globali) &
SUBROUTINE build_ab(iintsp,n,lmax,ab_size,ylm,c_ph,fj,gj,ab)
integer, intent(in) :: iintsp,n,lmax,ab_size
complex, intent(in) :: ylm(:,:), c_ph(:,:)
real, device,intent(in) :: fj(:,:,:),gj(:,:,:)
complex,device,intent(out) :: ab(:,:)
integer :: k,l, ll1, m
complex :: term, term2
! k = (blockidx%x-1)*blockdim%x + threadidx%x
! if (k<n) then
DO k = 1,n
DO l = 0,lmax
ll1 = l* (l+1)
DO m = -l,l
term = c_ph(k,iintsp)*ylm(k,ll1+m+1)
term2 = fj(k,l+1,iintsp)*term
ab(k,ll1+m+1) = CONJG(term2) !fj(k,l+1,iintsp)*term
term2 = gj(k,l+1,iintsp)*term
ab(k,ll1+m+1+ab_size) = CONJG(term2) !gj(k,l+1,iintsp)*term
END DO
END DO
ENDDO !k-loop
END SUBROUTINE build_ab
SUBROUTINE hsmt_ab_gpu(sym,atoms,noco,ispin,iintsp,n,na,cell,lapw,fj,gj,ab,ab_size,l_nonsph,abclo,alo1,blo1,clo1)
!Calculate overlap matrix
USE m_constants, ONLY : fpi_const,tpi_const
USE m_types
USE m_ylm
USE m_apws
IMPLICIT NONE
TYPE(t_sym),INTENT(IN) :: sym
TYPE(t_cell),INTENT(IN) :: cell
TYPE(t_atoms),INTENT(IN) :: atoms
TYPE(t_lapw),INTENT(IN) :: lapw
TYPE(t_noco),INTENT(IN) :: noco
! ..
! .. Scalar Arguments ..
INTEGER, INTENT (IN) :: ispin,n,na,iintsp
LOGICAL,INTENT(IN) :: l_nonsph
INTEGER,INTENT(OUT) :: ab_size
! ..
! .. Array Arguments ..
REAL,INTENT(IN) :: fj(:,:,:),gj(:,:,:)
COMPLEX,DEVICE, INTENT (OUT) :: ab(:,:)
!Optional arguments if abc coef for LOs are needed
COMPLEX, INTENT(INOUT),OPTIONAL:: abclo(:,-atoms%llod:,:,:)
REAL,INTENT(IN),OPTIONAL:: alo1(:),blo1(:),clo1(:)
INTEGER:: np,k,l,ll1,m,lmax,nkvec,lo,lm,invsfct
complex:: term,term2
real :: th,v(3),bmrot(3,3),vmult(3)
COMPLEX,allocatable :: ylm(:,:)
complex,allocatable:: c_ph(:,:)
real,allocatable :: gkrot(:,:)
LOGICAL :: l_apw
REAL, ALLOCATABLE,DEVICE :: fj_dev(:,:,:), gj_dev(:,:,:)
ALLOCATE(fj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
ALLOCATE(gj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
fj_dev(:,:,:)= fj(:,:,:)
gj_dev(:,:,:)= gj(:,:,:)
ALLOCATE(ylm(lapw%nv(1),(atoms%lmaxd+1)**2))
ALLOCATE(c_ph(lapw%nv(1),MERGE(2,1,noco%l_ss)))
ALLOCATE(gkrot(3,lapw%nv(1)))
lmax=MERGE(atoms%lnonsph(n),atoms%lmax(n),l_nonsph)
ab_size=lmax*(lmax+2)+1
l_apw=ALL(gj==0.0)
ab=0.0
np = sym%invtab(atoms%ngopr(na))
!---> set up phase factors
CALL lapw%phase_factors(iintsp,atoms%taual(:,na),noco%qss,c_ph(:,iintsp))
IF (np==1) THEN
gkrot(:, 1:lapw%nv(iintsp)) = lapw%gk(:, 1:lapw%nv(iintsp),iintsp)
ELSE
bmrot=MATMUL(1.*sym%mrot(:,:,np),cell%bmat)
DO k = 1,lapw%nv(iintsp)
!--> apply the rotation that brings this atom into the
!--> representative (this is the definition of ngopr(na)
!--> and transform to cartesian coordinates
v(:) = lapw%vk(:,k,iintsp)
gkrot(:,k) = MATMUL(TRANSPOSE(bmrot),v)
END DO
END IF
!--> generate spherical harmonics
DO k = 1,lapw%nv(1)
vmult(:) = gkrot(:,k)
CALL ylm4(lmax,vmult,ylm(k,:))
ENDDO
!--> synthesize the complex conjugates of a and b
CALL build_ab(iintsp,lapw%nv(1),lmax,ab_size,ylm,c_ph,fj_dev,gj_dev,ab)
IF (PRESENT(abclo)) THEN
DO k = 1,lapw%nv(1)
!determine also the abc coeffs for LOs
invsfct=MERGE(1,2,atoms%invsat(na).EQ.0)
term = fpi_const/SQRT(cell%omtil)* ((atoms%rmt(n)**2)/2)*c_ph(k,iintsp)
DO lo = 1,atoms%nlo(n)
l = atoms%llo(lo,n)
DO nkvec=1,invsfct*(2*l+1)
IF (lapw%kvec(nkvec,lo,na)==k) THEN !This k-vector is used in LO
ll1 = l*(l+1) + 1
DO m = -l,l
lm = ll1 + m
abclo(1,m,nkvec,lo) = term*ylm(k,lm)*alo1(lo)
abclo(2,m,nkvec,lo) = term*ylm(k,lm)*blo1(lo)
abclo(3,m,nkvec,lo) = term*ylm(k,lm)*clo1(lo)
END DO
END IF
ENDDO
ENDDO
ENDDO
ENDIF
IF (.NOT.l_apw) ab_size=ab_size*2
END SUBROUTINE hsmt_ab_gpu
#endif
SUBROUTINE hsmt_ab_cpu(sym,atoms,noco,ispin,iintsp,n,na,cell,lapw,fj,gj,ab,ab_size,l_nonsph,abclo,alo1,blo1,clo1)
!Calculate overlap matrix
USE m_constants, ONLY : fpi_const,tpi_const
USE m_types
......@@ -39,9 +182,9 @@ CONTAINS
complex,allocatable:: c_ph(:,:)
real,allocatable :: gkrot(:,:)
LOGICAL :: l_apw
ALLOCATE(c_ph(maxval(lapw%nv),MERGE(2,1,noco%l_ss)))
ALLOCATE(gkrot(3,MAXVAL(lapw%nv)))
ALLOCATE(c_ph(lapw%nv(1),MERGE(2,1,noco%l_ss)))
ALLOCATE(gkrot(3,lapw%nv(1)))
lmax=MERGE(atoms%lnonsph(n),atoms%lmax(n),l_nonsph)
......@@ -106,5 +249,5 @@ CONTAINS
!$OMP END PARALLEL DO
IF (.NOT.l_apw) ab_size=ab_size*2
END SUBROUTINE hsmt_ab
END SUBROUTINE hsmt_ab_cpu
END MODULE m_hsmt_ab
......@@ -72,6 +72,7 @@ CONTAINS
#ifdef _CUDA
COMPLEX,ALLOCATABLE,DEVICE :: c_dev(:,:), ab1_dev(:,:), ab_dev(:,:)
COMPLEX,ALLOCATABLE,DEVICE :: h_loc_dev(:,:)
!REAL, ALLOCATABLE,DEVICE :: fj_dev(:,:,:), gj_dev(:,:,:)
integer :: i, j, istat
call nvtxStartRange("hsmt_nonsph",1)
print*, "running CUDA version"
......@@ -83,6 +84,10 @@ CONTAINS
ALLOCATE(ab1_dev(size(ab1,1),size(ab1,2)))
ALLOCATE(ab_dev(size(ab,1),size(ab,2)))
h_loc_dev(1:,1:) = CONJG(td%h_loc(0:,0:,n,isp)) !WORKAROUND, var_dev=CONJG(var_dev) does not work (pgi18.4)
!ALLOCATE(fj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
!ALLOCATE(gj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
!fj_dev(1:,1:,1:)= fj(1:,0:,1:)
!gj_dev(1:,1:,1:)= gj(1:,0:,1:)
!note that basically all matrices in the GPU version are conjugates of their
!cpu counterparts
#endif
......@@ -106,15 +111,15 @@ CONTAINS
IF ((atoms%invsat(na)==0) .OR. (atoms%invsat(na)==1)) THEN
rchi=MERGE(REAL(chi),REAL(chi)*2,(atoms%invsat(na)==0))
!#ifdef _CUDA
!CALL hsmt_ab(sym,atoms,noco,isp,jintsp,n,na,cell,lapw,fj,gj,ab_dev,ab_size,.TRUE.)
#ifdef _CUDA
CALL hsmt_ab(sym,atoms,noco,isp,jintsp,n,na,cell,lapw,fj,gj,ab_dev,ab_size,.TRUE.)
! istat = cudaDeviceSynchronize()
!#else
#else
CALL hsmt_ab(sym,atoms,noco,isp,jintsp,n,na,cell,lapw,fj,gj,ab,ab_size,.TRUE.)
!#endif
#endif
!Calculate Hamiltonian
#ifdef _CUDA
ab_dev = CONJG(ab)
!ab_dev = CONJG(ab)
CALL zgemm("N","N",lapw%nv(jintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab_dev,SIZE(ab_dev,1),h_loc_dev,SIZE(h_loc_dev,1),CMPLX(0.,0.),ab1_dev,SIZE(ab1_dev,1))
#else
CALL zgemm("N","N",lapw%nv(jintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab,SIZE(ab,1),td%h_loc(0:,0:,n,isp),SIZE(td%h_loc,1),CMPLX(0.,0.),ab1,SIZE(ab1,1))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment