Commit e288839d authored by Matthias Redies

Merge branch 'develop' into MetaGGA

parents d7380394 cb805565
@@ -8,6 +8,8 @@ if (CLI_FLEUR_USE_GPU)
     message("Using cuda8")
   elseif(${CLI_FLEUR_USE_GPU} MATCHES "cuda9")
     set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=cuda9.0,cc60 -Mcuda=rdc -Mcudalib=cublas")
+  elseif(${CLI_FLEUR_USE_GPU} MATCHES "cuda9.1")
+    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=cuda9.1,cc60 -Mcuda=rdc -Mcudalib=cublas")
   elseif(${CLI_FLEUR_USE_GPU} MATCHES "nvtx")
     set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=cuda9.0,cc60 -Mcuda=rdc -Mcudalib=cublas -lnvToolsExt ")
   elseif(${CLI_FLEUR_USE_GPU} MATCHES "emu")
@@ -21,7 +23,7 @@ if (CLI_FLEUR_USE_GPU)
       LINK_LIBRARIES "-lcusolver"
     )
   if (FLEUR_USE_CUSOLVER)
-    set(${FLEUR_LIBRARIES} "${FLEUR_LIBRARIES};-lcusolver")
+    set(FLEUR_LIBRARIES "${FLEUR_LIBRARIES};-lcusolver")
     set(FLEUR_MPI_DEFINITIONS ${FLEUR_MPI_DEFINITIONS} "CPP_CUSOLVER")
     set(FLEUR_DEFINITIONS ${FLEUR_DEFINITIONS} "CPP_CUSOLVER")
   endif()
...
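Two independent changes in this hunk: a new cuda9.1 build option, and a fix for a CMake bug in the cusolver branch, where set(${FLEUR_LIBRARIES} ...) dereferenced the variable and therefore stored the library list under whatever name FLEUR_LIBRARIES currently held, instead of under FLEUR_LIBRARIES itself. One caveat worth flagging: MATCHES applies an unanchored regular expression, so the earlier "cuda9" branch also matches the value cuda9.1, and the new branch is only reached if the tests are reordered or anchored. Assuming the option is passed through to CMake unchanged (it may instead be set by FLEUR's configure script), it would be selected with something like cmake -DCLI_FLEUR_USE_GPU=cuda9.1 <fleur-source-dir>.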
@@ -12,92 +12,64 @@ MODULE m_magma
 ! using the MAGMA library for multiple GPUs
 !**********************************************************
 CONTAINS
-  SUBROUTINE magma_diag(nsize,eig,ne,a_r,b_r,z_r,a_c,b_c,z_c)
+  SUBROUTINE magma_diag(hmat,smat,ne,eig,zmat)
 #ifdef CPP_MAGMA
     use magma
 #endif
-#include"cpp_double.h"
+    use m_types
     IMPLICIT NONE
     ! ... Arguments ...
-    INTEGER, INTENT (IN)                          :: nsize
-    REAL,    INTENT(OUT)                          :: eig(:)
-    INTEGER, INTENT(INOUT)                        :: ne
-    REAL, OPTIONAL,ALLOCATABLE, INTENT (INOUT)    :: a_r(:),b_r(:)
-    REAL, OPTIONAL,ALLOCATABLE, INTENT (INOUT)    :: z_r(:,:)
-    COMPLEX, OPTIONAL,ALLOCATABLE, INTENT (INOUT) :: a_c(:),b_c(:)
-    COMPLEX, OPTIONAL,ALLOCATABLE, INTENT (INOUT) :: z_c(:,:)
+    TYPE(t_mat),INTENT(INOUT)            :: hmat,smat
+    INTEGER,INTENT(INOUT)                :: ne
+    CLASS(t_mat),ALLOCATABLE,INTENT(OUT) :: zmat
+    REAL,INTENT(OUT)                     :: eig(:)
 #ifdef CPP_MAGMA
     ! ... Local Variables ..
-    INTEGER iind,ind1,ind2,info,lwork,liwork,lrwork,err,i,mout(1)
-    REAL eigTemp(nsize)
-    LOGICAL:: initialized=.false.
+    INTEGER :: lwork,liwork,lrwork,err,i,mout(1)
+    REAL    :: eigTemp(hmat%matsize1)
+    LOGICAL :: initialized=.false.
     REAL,    ALLOCATABLE :: rwork(:)
     INTEGER, ALLOCATABLE :: iwork(:)
-    REAL,    ALLOCATABLE :: largea_r(:,:),largeb_r(:,:)
-    COMPLEX, ALLOCATABLE :: largea_c(:,:),largeb_c(:,:)
     COMPLEX, ALLOCATABLE :: work(:)
-    LOGICAL :: l_real
-    l_real=present(a_r)
-    print *,"MAGMA start"
     IF (.NOT.initialized) THEN
-       initialized=.true.
-       call magmaf_init()
-       print *,"MAGMA init"
+       initialized=.TRUE.
+       CALL magmaf_init()
     ENDIF
-    !**********************************
-    !expand from packed to full storage
-    !**********************************
-    !hamiltonian
-    if (l_real) THEN
-       call packed_to_full(nsize,a_r,largea_r)
-       call packed_to_full(nsize,b_r,largeb_r)
-       !deallocate(a_r,b_r)
+    IF (hmat%l_real) THEN
+       CALL juDFT_error("REAL diagonalization not implemented in magma.F90")
     ELSE
-       call packed_to_full(nsize,a_c,largea_c)
-       call packed_to_full(nsize,b_c,largeb_c)
-       !deallocate(a_c,b_c)
-    Endif
-    if (l_real) call juDFT_error("REAL diagonalization not implemented in magma.F90")
-    !Query the workspace size
-    allocate(work(1),rwork(1),iwork(1))
-    print *,"Magma workspace query"
-    call flush()
-    call magmaf_zhegvdx(1,'v','i','l',nsize,largea_c,nsize,largeb_c,nsize,&
-         0.0,0.0,1,ne,mout,eigTemp,work,-1,rwork,-1,iwork,-1,err)
-    lwork=work(1)
-    lrwork=rwork(1)
-    liwork=iwork(1)
-    print*,"MAGMA:",lwork,lrwork,liwork
-    deallocate(work,rwork,iwork)
-    allocate(work(lwork),rwork(lrwork),iwork(liwork))
-    if (err/=0) call juDFT_error("Failed to allocate workspaces",calledby="magma.F90")
-    !Now the diagonalization
-    print *,"Magma diagonalization"
-    print *,nsize,shape(largea_c),shape(eigTemp),ne
-    call magmaf_zhegvdx(1,'v','i','l',nsize,largea_c,nsize,largeb_c,nsize,&
-         0.0,0.0,1,ne,mout,eigTemp,work,lwork,rwork,lrwork,iwork,liwork,err)
-    print*,"MAGMA info:",err
-    if (err/=0) call juDFT_error("Magma failed to diagonalize Hamiltonian")
-    print *,"MAGMA mout:",mout
+       !Query the workspace size
+       ALLOCATE(work(1),rwork(1),iwork(1))
+       CALL magmaf_zhegvdx(1,'v','i','l',hmat%matsize1,hmat%data_c,SIZE(hmat%data_c,1),smat%data_c,SIZE(smat%data_c,1),&
+            0.0,0.0,1,ne,mout,eigTemp,work,-1,rwork,-1,iwork,-1,err)
+       IF (err/=0) CALL juDFT_error("Failed to query workspaces",calledby="magma.F90")
+       lwork=work(1)
+       lrwork=rwork(1)
+       liwork=iwork(1)
+       DEALLOCATE(work,rwork,iwork)
+       ALLOCATE(work(lwork),rwork(lrwork),iwork(liwork))
+       !Now the diagonalization
+       CALL magmaf_zhegvdx(1,'v','i','l',hmat%matsize1,hmat%data_c,SIZE(hmat%data_c,1),smat%data_c,SIZE(smat%data_c,1),&
+            0.0,0.0,1,ne,mout,eigTemp,work,lwork,rwork,lrwork,iwork,liwork,err)
+       IF (err/=0) CALL juDFT_error("Magma failed to diagonalize Hamiltonian")
+    ENDIF
+    ALLOCATE(t_mat::zmat)
+    CALL zmat%alloc(hmat%l_real,hmat%matsize1,ne)
     DO i = 1, ne
        eig(i) = eigTemp(i)
-       z_c(:nsize,i)=largea_c(:nsize,i)
+       IF (hmat%l_real) THEN
+          zmat%data_r(:,i)=hmat%data_r(:hmat%matsize1,i)
+       ELSE
+          zmat%data_c(:,i)=hmat%data_c(:hmat%matsize1,i)
+       ENDIF
     END DO
-    !call judft_error("Eigenvectors are not calculated in MAGMA")
 #endif
  END SUBROUTINE magma_diag
 END MODULE m_magma
...
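The rewritten magma_diag works directly on the t_mat objects (no more packed-to-full copies), drops the debug prints, and moves the error check to directly after the workspace query; the old code only tested err after the subsequent ALLOCATE, with a misleading message. It keeps the usual LAPACK-style two-call protocol: a first call with lwork = -1 does no work and only reports the optimal workspace sizes in work(1), rwork(1) and iwork(1); the second call performs the diagonalization. A minimal self-contained sketch of the same protocol, using plain LAPACK zhegvd instead of magmaf_zhegvdx (an assumption: any LAPACK at link time; the MAGMA wrapper follows the same lwork = -1 convention):

    PROGRAM workspace_query_demo
      IMPLICIT NONE
      INTEGER, PARAMETER :: dp = KIND(1.d0), n = 4
      COMPLEX(dp) :: a(n,n), b(n,n)
      REAL(dp)    :: w(n)
      COMPLEX(dp), ALLOCATABLE :: work(:)
      REAL(dp),    ALLOCATABLE :: rwork(:)
      INTEGER,     ALLOCATABLE :: iwork(:)
      INTEGER :: lwork, lrwork, liwork, info, i
      ! trivial hermitian-definite pair: a = diag(1..n), b = identity
      a = (0._dp,0._dp); b = (0._dp,0._dp)
      DO i = 1, n
         a(i,i) = i; b(i,i) = 1
      END DO
      ! first call, lwork=-1: only query the optimal workspace sizes
      ALLOCATE(work(1), rwork(1), iwork(1))
      CALL zhegvd(1,'V','L',n,a,n,b,n,w,work,-1,rwork,-1,iwork,-1,info)
      lwork = INT(work(1)); lrwork = INT(rwork(1)); liwork = iwork(1)
      DEALLOCATE(work, rwork, iwork)
      ! second call with properly sized workspaces does the diagonalization
      ALLOCATE(work(lwork), rwork(lrwork), iwork(liwork))
      CALL zhegvd(1,'V','L',n,a,n,b,n,w,work,lwork,rwork,lrwork,iwork,liwork,info)
      IF (info /= 0) STOP 'zhegvd failed'
      PRINT *, 'eigenvalues:', w   ! 1.0 2.0 3.0 4.0
    END PROGRAM workspace_query_demo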
@@ -38,22 +38,22 @@ CONTAINS
     !up-up component (or only component in collinear case)
     IF (SIZE(mat)==1) THEN
        CALL mat_final%move(mat(1,1))
-       CALL mat(1,1)%free()
+       !CALL mat(1,1)%free()
        RETURN
     ENDIF
     CALL mat_final%copy(mat(1,1),1,1)
-    CALL mat(1,1)%free()
+    !CALL mat(1,1)%free()
     !down-down component
     CALL mat_final%copy(mat(2,2),lapw%nv(1)+atoms%nlotot+1,lapw%nv(1)+atoms%nlotot+1)
-    CALL mat(2,2)%free()
+    !CALL mat(2,2)%free()
     !Now collect off-diagonal parts
     CALL mat(1,2)%add_transpose(mat(2,1))
     CALL mat_final%copy(mat(1,2),1,lapw%nv(1)+atoms%nlotot+1)
-    CALL mat(1,2)%free()
-    CALL mat(2,1)%free()
+    !CALL mat(1,2)%free()
+    !CALL mat(2,1)%free()
  END SUBROUTINE eigen_redist_matrix
 END MODULE m_eigen_redist_matrix
...
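For orientation: eigen_redist_matrix assembles the four spin blocks of the noco matrices into one large matrix, with the up-up block at the origin, the down-down block offset by lapw%nv(1)+atoms%nlotot in both dimensions, and the off-diagonal block in the upper-right corner (add_transpose presumably folds the conjugate transpose of the (2,1) block into the (1,2) block). The hunk itself only disables the free() calls, so the per-spin blocks now stay allocated after redistribution. A toy sketch of the layout, with plain arrays standing in for t_mat and all names and sizes hypothetical:

    PROGRAM spin_block_layout
      IMPLICIT NONE
      INTEGER, PARAMETER :: nv = 3           ! basis size per spin (hypothetical)
      COMPLEX :: blk(nv,nv,2,2), big(2*nv,2*nv)
      ! fill the four spin blocks with recognizable values
      blk(:,:,1,1) = (1.0,0.0); blk(:,:,2,2) = (2.0,0.0)
      blk(:,:,1,2) = (0.0,1.0); blk(:,:,2,1) = (0.0,-1.0)
      big = (0.0,0.0)
      ! up-up block at (1,1), down-down at offset nv+1 (standing in for
      ! lapw%nv(1)+atoms%nlotot+1), off-diagonal block in the upper-right corner
      big(1:nv,1:nv)           = blk(:,:,1,1)
      big(nv+1:2*nv,nv+1:2*nv) = blk(:,:,2,2)
      big(1:nv,nv+1:2*nv)      = blk(:,:,1,2) + TRANSPOSE(CONJG(blk(:,:,2,1)))
      PRINT *, 'corner elements:', big(1,1), big(nv+1,nv+1), big(1,nv+1)
    END PROGRAM spin_block_layout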
@@ -43,7 +43,11 @@ CONTAINS
     INTEGER, INTENT (IN)      :: ispin
     !locals
+#ifdef CPP_GPU
+    REAL, ALLOCATABLE,MANAGED :: fj(:,:,:,:),gj(:,:,:,:)
+#else
     REAL, ALLOCATABLE         :: fj(:,:,:,:),gj(:,:,:,:)
+#endif
     INTEGER :: iintsp,jintsp,n
     COMPLEX :: chi(2,2),chi_one
@@ -67,7 +71,7 @@ CONTAINS
     IF (.NOT.noco%l_noco) THEN
        !This is for collinear calculations: the (1,1) element of the matrices is all
        !that is needed and allocated
-       CALL hsmt_sph(n,atoms,mpi,ispin,input,noco,cell,1,1,chi_one,lapw,enpara%el0,&
+       CALL hsmt_sph(n,atoms,mpi,ispin,input,noco,1,1,chi_one,lapw,enpara%el0,&
            td%e_shift(n,ispin),usdus,fj(:,0:,ispin,:),gj(:,0:,ispin,:),smat(1,1),hmat(1,1))
        CALL hsmt_nonsph(n,mpi,sym,atoms,ispin,1,1,chi_one,noco,cell,lapw,td,&
            fj(:,0:,ispin,:),gj(:,0:,ispin,:),hmat(1,1))
@@ -79,7 +83,7 @@ CONTAINS
        !stored in tmp-variables. Then these are distributed (rotated) into the 2x2
        !global spin-matrices.
        CALL hmat_tmp%clear();CALL smat_tmp%clear()
-       CALL hsmt_sph(n,atoms,mpi,ispin,input,noco,cell,1,1,chi_one,lapw,enpara%el0,td%e_shift(n,ispin),&
+       CALL hsmt_sph(n,atoms,mpi,ispin,input,noco,1,1,chi_one,lapw,enpara%el0,td%e_shift(n,ispin),&
            usdus,fj(:,0:,ispin,:),gj(:,0:,ispin,:),smat_tmp,hmat_tmp)
        CALL hsmt_nonsph(n,mpi,sym,atoms,ispin,1,1,chi_one,noco,cell,lapw,td,&
            fj(:,0:,ispin,:),gj(:,0:,ispin,:),hmat_tmp)
@@ -99,7 +103,7 @@ CONTAINS
        CALL hsmt_spinor(ispin,n,noco,chi)
        DO iintsp=1,2
           DO jintsp=1,2
-             CALL hsmt_sph(n,atoms,mpi,ispin,input,noco,cell,iintsp,jintsp,chi(iintsp,jintsp),&
+             CALL hsmt_sph(n,atoms,mpi,ispin,input,noco,iintsp,jintsp,chi(iintsp,jintsp),&
                  lapw,enpara%el0,td%e_shift(n,ispin),usdus,fj(:,0:,ispin,:),gj(:,0:,ispin,:),&
                  smat(iintsp,jintsp),hmat(iintsp,jintsp))
              CALL hsmt_nonsph(n,mpi,sym,atoms,ispin,iintsp,jintsp,chi(iintsp,jintsp),noco,cell,&
...
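Besides dropping the cell argument from the hsmt_sph calls (matching the interface change in hsmt_sph below), the key change is that fj and gj become MANAGED in GPU builds: unified-memory allocations that host code and device kernels can both address, with the CUDA runtime migrating pages on demand, so the explicit fj_dev/gj_dev staging copies disappear (see the matching cleanup in hsmt_nonsph further down). A minimal sketch of the pattern, assuming the PGI/NVIDIA CUDA Fortran compiler; double_it and managed_demo are hypothetical names:

    MODULE demo_kernel
      USE cudafor
    CONTAINS
      ATTRIBUTES(GLOBAL) SUBROUTINE double_it(x,n)
        REAL :: x(*)
        INTEGER, VALUE :: n
        INTEGER :: i
        i = (blockIdx%x-1)*blockDim%x + threadIdx%x
        IF (i <= n) x(i) = 2.0*x(i)
      END SUBROUTINE double_it
    END MODULE demo_kernel

    PROGRAM managed_demo
      USE cudafor
      USE demo_kernel
      IMPLICIT NONE
      REAL, ALLOCATABLE, MANAGED :: fj(:)
      INTEGER :: istat
      ALLOCATE(fj(1024))
      fj = 1.0                             ! host write, no explicit copy
      CALL double_it<<<4,256>>>(fj,1024)   ! device kernel on the same array
      istat = cudaDeviceSynchronize()      ! finish before the host reads again
      PRINT *, fj(1), fj(1024)             ! prints 2.0 2.0
    END PROGRAM managed_demo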
@@ -62,7 +62,7 @@ CONTAINS
     USE m_ylm
     USE m_apws
     USE cudafor
-    ! USE nvtx
+    USE nvtx
     IMPLICIT NONE
     TYPE(t_sym),INTENT(IN)   :: sym
     TYPE(t_cell),INTENT(IN)  :: cell
@@ -92,8 +92,9 @@ CONTAINS
     COMPLEX,ALLOCATABLE,DEVICE :: c_ph_dev(:,:)
     REAL,   ALLOCATABLE,DEVICE :: gkrot_dev(:,:)
     INTEGER :: grid, block
-    !INTEGER :: istat
+    INTEGER :: istat
+    call nvtxStartRange("hsmt_ab",3)
     lmax=MERGE(atoms%lnonsph(n),atoms%lmax(n),l_nonsph)
     ALLOCATE(c_ph_dev(lapw%nv(1),MERGE(2,1,noco%l_ss)))
@@ -129,17 +130,12 @@ CONTAINS
     !--> synthesize the complex conjugates of a and b
-    !call nvtxStartRange("hsmt_synthAB",5)
-    !istat = cudaDeviceSynchronize()
     ! pretty ugly solution
     block = 256
     grid = lapw%nv(1)/(block*4) + 1
     CALL synth_ab<<<grid,block>>>(grid,block,lapw%nv(1),lmax,ab_size,gkrot_dev,&
          fj(:,:,iintsp),gj(:,:,iintsp),c_ph_dev(:,iintsp),ab)
-    !istat = cudaDeviceSynchronize()
-    !call nvtxEndRange
     IF (PRESENT(abclo)) THEN
        print*, "Ooooops, TODO in hsmt_ab"
@@ -169,6 +165,8 @@ CONTAINS
     DEALLOCATE(c_ph_dev)
     DEALLOCATE(gkrot_dev)
+    istat = cudaDeviceSynchronize()
+    call nvtxEndRange
  END SUBROUTINE hsmt_ab_gpu
 #endif
...
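Two things happen here: the NVTX instrumentation becomes real (USE nvtx is uncommented and a single range now spans the whole routine instead of the commented-out ranges around the kernel), and a cudaDeviceSynchronize is placed before nvtxEndRange, so that the asynchronous kernel launch is actually contained in the measured range. The launch configuration, block = 256 threads and grid = lapw%nv(1)/(block*4) + 1 blocks, gives each thread roughly four basis functions. A sketch of that arithmetic with a hypothetical grid-stride kernel (nvfortran/PGI assumed; stride_demo and double_elems are made-up names):

    MODULE stride_demo
      USE cudafor
    CONTAINS
      ATTRIBUTES(GLOBAL) SUBROUTINE double_elems(x,n)
        REAL :: x(*)
        INTEGER, VALUE :: n
        INTEGER :: i
        ! each thread visits ~4 elements under the launch used below
        DO i = (blockIdx%x-1)*blockDim%x + threadIdx%x, n, gridDim%x*blockDim%x
           x(i) = 2.0*x(i)
        END DO
      END SUBROUTINE double_elems
    END MODULE stride_demo

    PROGRAM launch_demo
      USE cudafor
      USE stride_demo
      IMPLICIT NONE
      INTEGER, PARAMETER :: nv = 5000
      REAL, DEVICE :: x_d(nv)
      REAL :: x(nv)
      INTEGER :: block, grid, istat
      x = 1.0;  x_d = x
      block = 256
      grid  = nv/(block*4) + 1      ! same rounding-up arithmetic as hsmt_ab
      CALL double_elems<<<grid,block>>>(x_d,nv)
      istat = cudaDeviceSynchronize()
      x = x_d
      PRINT *, x(1), x(nv)          ! 2.0 2.0
    END PROGRAM launch_demo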
@@ -17,7 +17,7 @@ CONTAINS
     DO iintsp=1,2
        DO jintsp=1,2
-          mat(jintsp,iintsp)%data_c(:,:)=chi(jintsp,iintsp)*mat_tmp%data_c(:,:)+mat(jintsp,iintsp)%data_c(:,:)
+          mat(jintsp,iintsp)%data_c(:,:)=chi(iintsp,jintsp)*mat_tmp%data_c(:,:)+mat(jintsp,iintsp)%data_c(:,:)
        ENDDO
     ENDDO
  END SUBROUTINE hsmt_distspins
...
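The one-line fix transposes the spinor weight: when the per-atom contribution mat_tmp is distributed over the four spin blocks, block (jintsp,iintsp) is now scaled by chi(iintsp,jintsp) rather than chi(jintsp,iintsp). A toy illustration of the corrected accumulation (all values hypothetical):

    PROGRAM distspins_toy
      IMPLICIT NONE
      COMPLEX :: chi(2,2), tmp, blocks(2,2)
      INTEGER :: i, j
      chi = RESHAPE([(1.0,0.0),(0.0,1.0),(0.0,-1.0),(1.0,0.0)],[2,2])
      tmp = (1.0,0.0)
      blocks = (0.0,0.0)
      DO i = 1, 2        ! iintsp
         DO j = 1, 2     ! jintsp
            blocks(j,i) = blocks(j,i) + chi(i,j)*tmp
         END DO
      END DO
      PRINT *, blocks    ! block (j,i) carries the transposed weight chi(i,j)
    END PROGRAM distspins_toy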
@@ -29,39 +29,22 @@ CONTAINS
     INTEGER, INTENT (IN) :: n,isp,iintsp,jintsp
     COMPLEX,INTENT(IN)   :: chi
     ! .. Array Arguments ..
-    REAL,INTENT(IN) :: fj(:,0:,:),gj(:,0:,:)
+#if defined CPP_GPU
+    REAL,MANAGED,INTENT(IN) :: fj(:,:,:),gj(:,:,:)
+#else
+    REAL,INTENT(IN) :: fj(:,0:,:),gj(:,0:,:)
+#endif
     CLASS(t_mat),INTENT(INOUT) ::hmat
 #if defined CPP_GPU
-    REAL,   ALLOCATABLE,DEVICE :: fj_dev(:,:,:), gj_dev(:,:,:)
     COMPLEX,ALLOCATABLE,DEVICE :: h_loc_dev(:,:)
-    COMPLEX,ALLOCATABLE,DEVICE :: c_dev(:,:)
 #endif
     CALL timestart("non-spherical setup")
     IF (mpi%n_size==1) THEN
 #if defined CPP_GPU
-       ALLOCATE(fj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
-       ALLOCATE(gj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
-       fj_dev(1:,1:,1:)= fj(1:,0:,1:)
-       gj_dev(1:,1:,1:)= gj(1:,0:,1:)
        ALLOCATE(h_loc_dev(size(td%h_loc,1),size(td%h_loc,2)))
        h_loc_dev(1:,1:) = CONJG(td%h_loc(0:,0:,n,isp))
-       IF (hmat%l_real) THEN
-          IF (ANY(SHAPE(hmat%data_c)/=SHAPE(hmat%data_r))) THEN
-             DEALLOCATE(hmat%data_c)
-             ALLOCATE(hmat%data_c(SIZE(hmat%data_r,1),SIZE(hmat%data_r,2)))
-          ENDIF
-          hmat%data_c=0.0
-       ENDIF
-       ALLOCATE(c_dev(SIZE(hmat%data_c,1),SIZE(hmat%data_c,2)))
-       c_dev = hmat%data_c
-       CALL priv_noMPI(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,h_loc_dev,fj_dev,gj_dev,c_dev)
-       hmat%data_c = c_dev
-       IF (hmat%l_real) THEN
-          hmat%data_r=hmat%data_r+REAL(hmat%data_c)
-       ENDIF
+       CALL priv_noMPI(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,h_loc_dev,fj,gj,hmat)
 #else
        CALL priv_noMPI(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,td,fj,gj,hmat)
 #endif
@@ -72,7 +55,7 @@ CONTAINS
  END SUBROUTINE hsmt_nonsph
 #if defined CPP_GPU
-  SUBROUTINE priv_noMPI_gpu(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,h_loc_dev,fj_dev,gj_dev,c_dev)
+  SUBROUTINE priv_noMPI_gpu(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,h_loc_dev,fj_dev,gj_dev,hmat)
    !Calculate overlap matrix, GPU version
    !note that basically all matrices in the GPU version are conjugates of their cpu counterparts
    USE m_hsmt_ab
@@ -101,8 +84,7 @@ CONTAINS
    ! ..
    ! .. Array Arguments ..
    REAL, INTENT(IN), DEVICE :: fj_dev(:,:,:), gj_dev(:,:,:)
-   COMPLEX,INTENT(INOUT),DEVICE :: c_dev(:,:)
+   CLASS(t_mat),INTENT(INOUT) ::hmat
    INTEGER:: nn,na,ab_size,l,ll,m
    real :: rchi
@@ -114,6 +96,14 @@ CONTAINS
    ALLOCATE(ab_dev(MAXVAL(lapw%nv),2*atoms%lmaxd*(atoms%lmaxd+2)+2))
    IF (iintsp.NE.jintsp) ALLOCATE(ab2_dev(lapw%nv(iintsp),2*atoms%lmaxd*(atoms%lmaxd+2)+2))
+   IF (hmat%l_real) THEN
+      IF (ANY(SHAPE(hmat%data_c)/=SHAPE(hmat%data_r))) THEN
+         DEALLOCATE(hmat%data_c)
+         ALLOCATE(hmat%data_c(SIZE(hmat%data_r,1),SIZE(hmat%data_r,2)))
+      ENDIF
+      hmat%data_c=0.0
+   ENDIF
    DO nn = 1,atoms%neq(n)
       na = SUM(atoms%neq(:n-1))+nn
      IF ((atoms%invsat(na)==0) .OR. (atoms%invsat(na)==1)) THEN
@@ -124,10 +114,9 @@ CONTAINS
         !Calculate Hamiltonian
         CALL zgemm("N","N",lapw%nv(jintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab_dev,SIZE(ab_dev,1),&
             h_loc_dev,SIZE(h_loc_dev,1),CMPLX(0.,0.),ab1_dev,SIZE(ab1_dev,1))
-        !ab1=MATMUL(ab(:lapw%nv(iintsp),:ab_size),td%h_loc(:ab_size,:ab_size,n,isp))
        IF (iintsp==jintsp) THEN
           call nvtxStartRange("zherk",3)
-          CALL ZHERK("U","N",lapw%nv(iintsp),ab_size,Rchi,ab1_dev,SIZE(ab1_dev,1),1.0,c_dev,SIZE(c_dev,1))
+          CALL ZHERK("U","N",lapw%nv(iintsp),ab_size,Rchi,ab1_dev,SIZE(ab1_dev,1),1.0,hmat%data_c,SIZE(hmat%data_c,1))
           istat = cudaDeviceSynchronize()
           call nvtxEndRange()
        ELSE !here the l_ss off-diagonal part starts
@@ -144,11 +133,14 @@ CONTAINS
           enddo
        enddo
        CALL zgemm("N","T",lapw%nv(iintsp),lapw%nv(jintsp),ab_size,chi,ab2_dev,SIZE(ab2_dev,1),&
-            ab1_dev,SIZE(ab1_dev,1),CMPLX(1.0,0.0),c_dev,SIZE(c_dev,1))
+            ab1_dev,SIZE(ab1_dev,1),CMPLX(1.0,0.0),hmat%data_c,SIZE(hmat%data_c,1))
        ENDIF
     ENDIF
  END DO
+  IF (hmat%l_real) THEN
+     hmat%data_r=hmat%data_r+REAL(hmat%data_c)
+  ENDIF
  call nvtxEndRange
 END SUBROUTINE priv_noMPI_gpu
 #endif
...
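This refactoring removes the intermediate c_dev device buffer: priv_noMPI_gpu now receives the t_mat itself, writes the ZHERK/zgemm results straight into hmat%data_c, and for the l_real case folds the real part into hmat%data_r once, after the atom loop (the complex scratch setup moved into the subroutine for the same reason). ZHERK is the Hermitian rank-k update C := alpha*A*A**H + beta*C, touching one triangle only. A small self-contained sketch, with single-precision cherk standing in for the ZHERK call above (reference BLAS assumed; all values hypothetical):

    PROGRAM herk_demo
      IMPLICIT NONE
      INTEGER, PARAMETER :: n = 3, k = 2
      COMPLEX :: a(n,k), c(n,n)
      INTEGER :: i
      a = (0.0,0.0)
      DO i = 1, MIN(n,k)
         a(i,i) = CMPLX(i,1)       ! some non-trivial entries
      END DO
      c = (0.0,0.0)
      ! upper triangle of c accumulates 1.0*a*a**H + 0.0*c
      CALL cherk('U','N',n,k,1.0,a,n,0.0,c,n)
      PRINT *, 'diagonal of A*A**H:', (REAL(c(i,i)), i=1,n)
    END PROGRAM herk_demo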
@@ -7,15 +7,450 @@
 MODULE m_hsmt_sph
    USE m_juDFT
    IMPLICIT NONE
+   INTERFACE hsmt_sph
+      module procedure hsmt_sph_cpu
+#ifdef CPP_GPU
+      module procedure hsmt_sph_gpu
+   END INTERFACE
+   INTERFACE HsmtSphGpuKernel_noApw
+      module procedure HsmtSphGpuKernel_noApw_cmplx, HsmtSphGpuKernel_noApw_real
+   END INTERFACE
+   INTERFACE HsmtSphGpuKernel_Apw
+      module procedure HsmtSphGpuKernel_Apw_cmplx, HsmtSphGpuKernel_Apw_real
+#endif
+   END INTERFACE
 CONTAINS
-   SUBROUTINE hsmt_sph(n,atoms,mpi,isp,input,noco,cell,iintsp,jintsp,chi,lapw,el,e_shift,usdus,fj,gj,smat,hmat)
+#ifdef CPP_GPU
+   SUBROUTINE HsmtSphGpuKernel_Apw_real(iintsp,jintsp,nv,lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,&
+        lnonsph,chi,qssbti,qssbtj,gvec,gk,fleg1,fleg2,fl2p1,fl2p1bt,fj,gj,taual,ddn,el,e_shift,&
+        smat_data,hmat_data,&
+        uds,dus,us,duds,rmt)
+      INTEGER,INTENT(IN) :: iintsp,jintsp,nv(2),lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,lnonsph
+      COMPLEX,INTENT(IN) :: chi
+      REAL,INTENT(IN)    :: qssbti(3),qssbtj(3)
+      INTEGER,INTENT(IN) :: gvec(:,:,:)
+      REAL,INTENT(IN)    :: gk(:,:,:)
+      REAL,INTENT(IN)    :: fleg1(0:lmaxd),fleg2(0:lmaxd),fl2p1(0:lmaxd)
+      REAL,INTENT(IN)    :: fl2p1bt(0:lmaxd)
+      REAL,MANAGED,INTENT(IN) :: fj(:,0:,:),gj(:,0:,:)
+      REAL,INTENT(IN)    :: taual(:,:)
+      REAL,INTENT(IN)    :: ddn(0:lmaxd)
+      REAL,INTENT(IN)    :: el(0:lmaxd)
+      REAL,INTENT(IN)    :: e_shift
+      REAL,INTENT(INOUT) :: smat_data(:,:),hmat_data(:,:)
+      !+APW
+      REAL,INTENT(IN),OPTIONAL :: uds(0:lmaxd),dus(0:lmaxd),us(0:lmaxd),duds(0:lmaxd)
+      REAL,INTENT(IN),OPTIONAL :: rmt
+      !-APW
+      REAL, PARAMETER :: tpi_const=2.*3.1415926535897932
+      REAL, ALLOCATABLE :: plegend(:,:)
+      COMPLEX, ALLOCATABLE :: cph(:)
+      REAL tnn(3), elall,fct,fjkiln,gjkiln,ddnln,ski(3)
+      REAL apw_lo1,apw_lo2,apw1,w1
+      INTEGER kii,ki,kj,l,nn
+      ALLOCATE(cph(MAXVAL(nv)))
+      ALLOCATE(plegend(MAXVAL(nv),0:lmaxd))
+      plegend=0.0
+      plegend(:,0)=1.0
+      DO ki = ki_start,ki_end,ki_step
+         kii=(ki-1)/ki_step+1
+         ski = gvec(:,ki,jintsp) + qssbti
+         !---> legendre polynomials
+         DO kj = 1,ki
+            plegend(kj,1) = DOT_PRODUCT(gk(:,kj,iintsp),gk(:,ki,jintsp))
+         END DO
+         DO l = 1,lmax - 1
+            plegend(:ki,l+1) = fleg1(l)*plegend(:ki,1)*plegend(:ki,l) - fleg2(l)*plegend(:ki,l-1)
+         END DO
+         !---> set up phase factors
+         cph = 0.0
+         DO nn = nn_start,nn_end
+            tnn = tpi_const*taual(:,nn)
+            DO kj = 1,ki
+               cph(kj) = cph(kj) +&
+                    CMPLX(COS(DOT_PRODUCT(ski-gvec(:,kj,iintsp)-qssbtj,tnn)),&
+                          SIN(DOT_PRODUCT(gvec(:,kj,iintsp)+qssbtj-ski,tnn)))
+               ! IF (iintsp.NE.jintsp) cph(kj)=CONJG(cph(kj))
+            END DO
+         END DO
+         !---> update overlap and l-diagonal hamiltonian matrix
+         DO l = 0,lmax
+            fjkiln = fj(ki,l,jintsp)
+            gjkiln = gj(ki,l,jintsp)
+            !+APW
+            IF (PRESENT(uds)) THEN
+               w1 = 0.5 * ( uds(l)*dus(l) + us(l)*duds(l) )
+               apw_lo1 = fl2p1(l) * 0.5 * rmt**2 * ( gjkiln * w1 +&
+                    fjkiln * us(l) * dus(l) )
+               apw_lo2 = fl2p1(l) * 0.5 * rmt**2 * ( fjkiln * w1 +&
+                    gjkiln * uds(l) * duds(l) )
+            ENDIF
+            !-APW
+            ddnln = ddn(l)
+            elall = el(l)
+            IF (l<=lnonsph) elall=elall-e_shift!(isp)
+            DO kj = 1,ki
+               fct = plegend(kj,l)*fl2p1(l)*&
+                    ( fjkiln*fj(kj,l,iintsp) + gjkiln*gj(kj,l,iintsp)*ddnln )
+               smat_data(kj,kii)=smat_data(kj,kii)+REAL(cph(kj))*fct
+               hmat_data(kj,kii)=hmat_data(kj,kii) + REAL(cph(kj)) * &
+                    ( fct * elall + plegend(kj,l) * fl2p1bt(l) *&
+                    ( fjkiln*gj(kj,l,iintsp) + gjkiln*fj(kj,l,iintsp) ) )
+               !+APW
+               IF (PRESENT(uds)) THEN
+                  apw1 = REAL(cph(kj)) * plegend(kj,l) * &
+                       ( apw_lo1 * fj(kj,l,iintsp) + apw_lo2 * gj(kj,l,iintsp) )
+                  hmat_data(kj,kii)=hmat_data(kj,kii) + apw1
+               ENDIF
+               !-APW
+            ENDDO
+            !---> end loop over l
+         ENDDO
+         !---> end loop over ki
+      ENDDO
+      DEALLOCATE(plegend)
+      DEALLOCATE(cph)
+   END SUBROUTINE HsmtSphGpuKernel_Apw_real
+   SUBROUTINE HsmtSphGpuKernel_Apw_cmplx(iintsp,jintsp,nv,lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,&
+        lnonsph,chi,qssbti,qssbtj,gvec,gk,fleg1,fleg2,fl2p1,fl2p1bt,fj,gj,taual,ddn,el,e_shift,&
+        smat_data,hmat_data,&
+        uds,dus,us,duds,rmt)
+      INTEGER,INTENT(IN) :: iintsp,jintsp,nv(2),lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,lnonsph
+      COMPLEX,INTENT(IN) :: chi
+      REAL,INTENT(IN)    :: qssbti(3),qssbtj(3)
+      INTEGER,INTENT(IN) :: gvec(:,:,:)
+      REAL,INTENT(IN)    :: gk(:,:,:)
+      REAL,INTENT(IN)    :: fleg1(0:lmaxd),fleg2(0:lmaxd),fl2p1(0:lmaxd)
+      REAL,INTENT(IN)    :: fl2p1bt(0:lmaxd)
+      REAL,MANAGED,INTENT(IN) :: fj(:,0:,:),gj(:,0:,:)
+      REAL,INTENT(IN)    :: taual(:,:)
+      REAL,INTENT(IN)    :: ddn(0:lmaxd)
+      REAL,INTENT(IN)    :: el(0:lmaxd)
+      REAL,INTENT(IN)    :: e_shift
+      COMPLEX,INTENT(INOUT) :: smat_data(:,:),hmat_data(:,:)
+      !+APW
+      REAL,INTENT(IN),OPTIONAL :: uds(0:lmaxd),dus(0:lmaxd),us(0:lmaxd),duds(0:lmaxd)
+      REAL,INTENT(IN),OPTIONAL :: rmt
+      !-APW
+      REAL, PARAMETER :: tpi_const=2.*3.1415926535897932
+      REAL, ALLOCATABLE :: plegend(:,:)
+      COMPLEX, ALLOCATABLE :: cph(:)
+      REAL apw_lo1,apw_lo2,w1
+      COMPLEX capw1
+      REAL tnn(3), elall,fct,fjkiln,gjkiln,ddnln,ski(3)
+      INTEGER kii,ki,kj,l,nn
+      ALLOCATE(cph(MAXVAL(nv)))
+      ALLOCATE(plegend(MAXVAL(nv),0:lmaxd))
+      plegend=0.0
+      plegend(:,0)=1.0
+      DO ki = ki_start,ki_end,ki_step
+         kii=(ki-1)/ki_step+1
+         ski = gvec(:,ki,jintsp) + qssbti
+         !---> legendre polynomials
+         DO kj = 1,ki
+            plegend(kj,1) = DOT_PRODUCT(gk(:,kj,iintsp),gk(:,ki,jintsp))
+         END DO
+         DO l = 1,lmax - 1
+            plegend(:ki,l+1) = fleg1(l)*plegend(:ki,1)*plegend(:ki,l) - fleg2(l)*plegend(:ki,l-1)
+         END DO
+         !---> set up phase factors
+         cph = 0.0