Commit e288839d authored by Matthias Redies's avatar Matthias Redies

Merge branch 'develop' into MetaGGA

parents d7380394 cb805565
......@@ -8,6 +8,8 @@ if (CLI_FLEUR_USE_GPU)
message("Using cuda8")
# check the more specific "cuda9.1" first: MATCHES "cuda9" would also match it
elseif(${CLI_FLEUR_USE_GPU} MATCHES "cuda9.1")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=cuda9.1,cc60 -Mcuda=rdc -Mcudalib=cublas")
elseif(${CLI_FLEUR_USE_GPU} MATCHES "cuda9")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=cuda9.0,cc60 -Mcuda=rdc -Mcudalib=cublas")
elseif(${CLI_FLEUR_USE_GPU} MATCHES "nvtx")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=cuda9.0,cc60 -Mcuda=rdc -Mcudalib=cublas -lnvToolsExt ")
elseif(${CLI_FLEUR_USE_GPU} MATCHES "emu")
......@@ -21,7 +23,7 @@ if (CLI_FLEUR_USE_GPU)
LINK_LIBRARIES "-lcusolver"
)
if (FLEUR_USE_CUSOLVER)
set(${FLEUR_LIBRARIES} "${FLEUR_LIBRARIES};-lcusolver")
set(FLEUR_LIBRARIES "${FLEUR_LIBRARIES};-lcusolver")
set(FLEUR_MPI_DEFINITIONS ${FLEUR_MPI_DEFINITIONS} "CPP_CUSOLVER")
set(FLEUR_DEFINITIONS ${FLEUR_DEFINITIONS} "CPP_CUSOLVER")
endif()
......
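For reference, a check like FLEUR_USE_CUSOLVER above typically tries to compile and link a tiny Fortran program against -lcusolver. A minimal sketch of such a probe (my assumption, not the repository's actual test source), binding cusolverDnCreate by hand via ISO_C_BINDING:

! Hypothetical link test: succeeds only if -lcusolver resolves cusolverDnCreate.
PROGRAM test_cusolver
   USE iso_c_binding
   IMPLICIT NONE
   INTERFACE
      INTEGER(c_int) FUNCTION cusolverDnCreate(handle) BIND(C,name="cusolverDnCreate")
         IMPORT :: c_int, c_ptr
         TYPE(c_ptr) :: handle   ! cusolverDnHandle_t*, passed by reference
      END FUNCTION cusolverDnCreate
   END INTERFACE
   TYPE(c_ptr)    :: handle
   INTEGER(c_int) :: stat
   stat = cusolverDnCreate(handle)
   PRINT *, 'cusolver linked, status ', stat
END PROGRAM test_cusolver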
......@@ -12,92 +12,64 @@ MODULE m_magma
! using the MAGMA library for multiple GPUs
!**********************************************************
CONTAINS
SUBROUTINE magma_diag(nsize,eig,ne,a_r,b_r,z_r,a_c,b_c,z_c)
SUBROUTINE magma_diag(hmat,smat,ne,eig,zmat)
#ifdef CPP_MAGMA
use magma
#endif
#include"cpp_double.h"
use m_types
IMPLICIT NONE
! ... Arguments ...
INTEGER, INTENT (IN) :: nsize
TYPE(t_mat),INTENT(INOUT) :: hmat,smat
INTEGER,INTENT(INOUT) :: ne
CLASS(t_mat),ALLOCATABLE,INTENT(OUT) :: zmat
REAL,INTENT(OUT) :: eig(:)
REAL, INTENT(OUT) :: eig(:)
INTEGER, INTENT(INOUT) :: ne
REAL, OPTIONAL,ALLOCATABLE, INTENT (INOUT) :: a_r(:),b_r(:)
REAL, OPTIONAL,ALLOCATABLE, INTENT (INOUT) :: z_r(:,:)
COMPLEX, OPTIONAL,ALLOCATABLE, INTENT (INOUT) :: a_c(:),b_c(:)
COMPLEX, OPTIONAL,ALLOCATABLE, INTENT (INOUT) :: z_c(:,:)
#ifdef CPP_MAGMA
! ... Local Variables ..
INTEGER iind,ind1,ind2,info,lwork,liwork,lrwork,err,i,mout(1)
REAL eigTemp(nsize)
LOGICAL:: initialized=.false.
INTEGER :: lwork,liwork,lrwork,err,mout(1)
REAL :: eigTemp(hmat%matsize1)
LOGICAL :: initialized=.false.
REAL, ALLOCATABLE :: rwork(:)
INTEGER, ALLOCATABLE :: iwork(:)
REAL, ALLOCATABLE :: largea_r(:,:),largeb_r(:,:)
COMPLEX, ALLOCATABLE :: largea_c(:,:),largeb_c(:,:)
COMPLEX,ALLOCATABLE :: work(:)
COMPLEX, ALLOCATABLE :: work(:)
LOGICAL :: l_real
l_real=present(a_r)
print *,"MAGMA start"
IF (.NOT.initialized) THEN
initialized=.true.
call magmaf_init()
print *,"MAGMA init"
initialized=.TRUE.
CALL magmaf_init()
ENDIF
!**********************************
!expand from packed to full storage
!**********************************
!hamiltonian
if (l_real) THEN
call packed_to_full(nsize,a_r,largea_r)
call packed_to_full(nsize,b_r,largeb_r)
!deallocate(a_r,b_r)
IF (hmat%l_real) THEN
CALL juDFT_error("REAL diagonalization not implemented in magma.F90")
ELSE
call packed_to_full(nsize,a_c,largea_c)
call packed_to_full(nsize,b_c,largeb_c)
!deallocate(a_c,b_c)
Endif
if (l_real) call juDFT_error("REAL diagonalization not implemented in magma.F90")
!Query the workspace size
allocate(work(1),rwork(1),iwork(1))
print *,"Magma workspace query"
call flush()
call magmaf_zhegvdx(1,'v','i','l',nsize,largea_c,nsize,largeb_c,nsize,&
0.0,0.0,1,ne,mout,eigTemp,work,-1,rwork,-1,iwork,-1,err)
lwork=work(1)
lrwork=rwork(1)
liwork=iwork(1)
print*,"MAGMA:",lwork,lrwork,liwork
deallocate(work,rwork,iwork)
allocate(work(lwork),rwork(lrwork),iwork(liwork))
if (err/=0) call juDFT_error("Failed to allocate workspaces",calledby="magma.F90")
!Now the diagonalization
print *,"Magma diagonalization"
print *,nsize,shape(largea_c),shape(eigTemp),ne
call magmaf_zhegvdx(1,'v','i','l',nsize,largea_c,nsize,largeb_c,nsize,&
0.0,0.0,1,ne,mout,eigTemp,work,lwork,rwork,lrwork,iwork,liwork,err)
print*,"MAGMA info:",err
if (err/=0) call juDFT_error("Magma failed to diagonalize Hamiltonian")
print *,"MAGMA mout:",mout
!Query the workspace size
ALLOCATE(work(1),rwork(1),iwork(1))
CALL magmaf_zhegvdx(1,'v','i','l',hmat%matsize1,hmat%data_c,SIZE(hmat%data_c,1),smat%data_c,SIZE(smat%data_c,1),&
0.0,0.0,1,ne,mout,eigTemp,work,-1,rwork,-1,iwork,-1,err)
IF (err/=0) CALL juDFT_error("Failed to query workspaces",calledby="magma.F90")
lwork=work(1)
lrwork=rwork(1)
liwork=iwork(1)
DEALLOCATE(work,rwork,iwork)
ALLOCATE(work(lwork),rwork(lrwork),iwork(liwork))
!Now the diagonalization
CALL magmaf_zhegvdx(1,'v','i','l',hmat%matsize1,hmat%data_c,SIZE(hmat%data_c,1),smat%data_c,SIZE(smat%data_c,1),&
0.0,0.0,1,ne,mout,eigTemp,work,lwork,rwork,lrwork,iwork,liwork,err)
IF (err/=0) CALL juDFT_error("Magma failed to diagonalize Hamiltonian")
ENDIF
ALLOCATE(t_mat::zmat)
CALL zmat%alloc(hmat%l_real,hmat%matsize1,ne)
DO i = 1, ne
eig(i) = eigTemp(i)
z_c(:nsize,i)=largea_c(:nsize,i)
IF (hmat%l_real) THEN
zmat%data_r(:,i)=hmat%data_r(:hmat%matsize1,i)
ELSE
zmat%data_c(:,i)=hmat%data_c(:hmat%matsize1,i)
ENDIF
END DO
!call judft_error("Eigenvectors are not calculated in MAGMA")
#endif
END SUBROUTINE magma_diag
END MODULE m_magma
......
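The magmaf_zhegvdx calls follow the standard LAPACK two-phase workspace convention: call once with lwork = lrwork = liwork = -1 so the routine reports its optimal workspace sizes, allocate, then call again to solve. A minimal sketch of the same pattern, using plain LAPACK's ZHEGVD as a stand-in for the MAGMA routine:

! Sketch: generalized Hermitian eigenproblem A z = w B z with workspace query.
SUBROUTINE diag_sketch(n, a, b, w)
   IMPLICIT NONE
   INTEGER, INTENT(IN)    :: n
   COMPLEX, INTENT(INOUT) :: a(n,n), b(n,n)   ! on exit, a holds the eigenvectors
   REAL,    INTENT(OUT)   :: w(n)
   COMPLEX, ALLOCATABLE :: work(:)
   REAL,    ALLOCATABLE :: rwork(:)
   INTEGER, ALLOCATABLE :: iwork(:)
   INTEGER :: lwork, lrwork, liwork, info
   ALLOCATE(work(1), rwork(1), iwork(1))
   ! Phase 1: lwork=-1 etc.; optimal sizes come back in work(1), rwork(1), iwork(1)
   CALL zhegvd(1,'V','L',n,a,n,b,n,w,work,-1,rwork,-1,iwork,-1,info)
   lwork = INT(work(1)); lrwork = INT(rwork(1)); liwork = iwork(1)
   DEALLOCATE(work, rwork, iwork)
   ALLOCATE(work(lwork), rwork(lrwork), iwork(liwork))
   ! Phase 2: the actual diagonalization
   CALL zhegvd(1,'V','L',n,a,n,b,n,w,work,lwork,rwork,lrwork,iwork,liwork,info)
   IF (info /= 0) STOP "zhegvd failed"
END SUBROUTINE diag_sketch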
......@@ -38,22 +38,22 @@ CONTAINS
!up-up component (or only component in collinear case)
IF (SIZE(mat)==1) THEN
CALL mat_final%move(mat(1,1))
CALL mat(1,1)%free()
!CALL mat(1,1)%free()
RETURN
ENDIF
CALL mat_final%copy(mat(1,1),1,1)
CALL mat(1,1)%free()
!CALL mat(1,1)%free()
!down-down component
CALL mat_final%copy(mat(2,2),lapw%nv(1)+atoms%nlotot+1,lapw%nv(1)+atoms%nlotot+1)
CALL mat(2,2)%free()
!CALL mat(2,2)%free()
!Now collect off-diagonal parts
CALL mat(1,2)%add_transpose(mat(2,1))
CALL mat_final%copy(mat(1,2),1,lapw%nv(1)+atoms%nlotot+1)
CALL mat(1,2)%free()
CALL mat(2,1)%free()
!CALL mat(1,2)%free()
!CALL mat(2,1)%free()
END SUBROUTINE eigen_redist_matrix
END MODULE m_eigen_redist_matrix
......
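For orientation: eigen_redist_matrix places the four spin blocks, each of size nb = lapw%nv(1)+atoms%nlotot, into one matrix of dimension 2*nb, and add_transpose folds the (2,1) block into the (1,2) one so only the upper block row needs to be copied. A sketch on plain arrays (hypothetical names, and assuming add_transpose adds the conjugate transpose; the t_mat methods hide these details):

! Sketch: assemble a 2x2-spin-block matrix from its components.
SUBROUTINE assemble_sketch(nb, m11, m22, m12, m21, mfull)
   IMPLICIT NONE
   INTEGER, INTENT(IN)  :: nb
   COMPLEX, INTENT(IN)  :: m11(nb,nb), m22(nb,nb), m12(nb,nb), m21(nb,nb)
   COMPLEX, INTENT(OUT) :: mfull(2*nb,2*nb)
   mfull(1:nb,      1:nb)      = m11                        ! up-up
   mfull(nb+1:2*nb, nb+1:2*nb) = m22                        ! down-down
   ! add_transpose-like step: fold the (2,1) block into (1,2), so only the
   ! upper block of the Hermitian matrix has to be stored explicitly
   mfull(1:nb,      nb+1:2*nb) = m12 + CONJG(TRANSPOSE(m21))
END SUBROUTINE assemble_sketch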
......@@ -43,7 +43,11 @@ CONTAINS
INTEGER, INTENT (IN) :: ispin
!locals
#ifdef CPP_GPU
REAL, ALLOCATABLE,MANAGED :: fj(:,:,:,:),gj(:,:,:,:)
#else
REAL, ALLOCATABLE :: fj(:,:,:,:),gj(:,:,:,:)
#endif
INTEGER :: iintsp,jintsp,n
COMPLEX :: chi(2,2),chi_one
......@@ -67,7 +71,7 @@ CONTAINS
IF (.NOT.noco%l_noco) THEN
!This is for collinear calculations: the (1,1) element of the matrices is all
!that is needed and allocated
CALL hsmt_sph(n,atoms,mpi,ispin,input,noco,cell,1,1,chi_one,lapw,enpara%el0,&
CALL hsmt_sph(n,atoms,mpi,ispin,input,noco,1,1,chi_one,lapw,enpara%el0,&
td%e_shift(n,ispin),usdus,fj(:,0:,ispin,:),gj(:,0:,ispin,:),smat(1,1),hmat(1,1))
CALL hsmt_nonsph(n,mpi,sym,atoms,ispin,1,1,chi_one,noco,cell,lapw,td,&
fj(:,0:,ispin,:),gj(:,0:,ispin,:),hmat(1,1))
......@@ -79,7 +83,7 @@ CONTAINS
!stored in tmp-variables. Then these are distributed (rotated) into the 2x2
!global spin-matrices.
CALL hmat_tmp%clear();CALL smat_tmp%clear()
CALL hsmt_sph(n,atoms,mpi,ispin,input,noco,cell,1,1,chi_one,lapw,enpara%el0,td%e_shift(n,ispin),&
CALL hsmt_sph(n,atoms,mpi,ispin,input,noco,1,1,chi_one,lapw,enpara%el0,td%e_shift(n,ispin),&
usdus,fj(:,0:,ispin,:),gj(:,0:,ispin,:),smat_tmp,hmat_tmp)
CALL hsmt_nonsph(n,mpi,sym,atoms,ispin,1,1,chi_one,noco,cell,lapw,td,&
fj(:,0:,ispin,:),gj(:,0:,ispin,:),hmat_tmp)
......@@ -99,7 +103,7 @@ CONTAINS
CALL hsmt_spinor(ispin,n,noco,chi)
DO iintsp=1,2
DO jintsp=1,2
CALL hsmt_sph(n,atoms,mpi,ispin,input,noco,cell,iintsp,jintsp,chi(iintsp,jintsp),&
CALL hsmt_sph(n,atoms,mpi,ispin,input,noco,iintsp,jintsp,chi(iintsp,jintsp),&
lapw,enpara%el0,td%e_shift(n,ispin),usdus,fj(:,0:,ispin,:),gj(:,0:,ispin,:),&
smat(iintsp,jintsp),hmat(iintsp,jintsp))
CALL hsmt_nonsph(n,mpi,sym,atoms,ispin,iintsp,jintsp,chi(iintsp,jintsp),noco,cell,&
......
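The new MANAGED attribute on fj/gj is CUDA Fortran's unified-memory qualifier: a single allocatable that is addressable from host code and device kernels alike, with page migration handled by the runtime instead of explicit copies. A minimal sketch, assuming the PGI/NVIDIA CUDA Fortran compiler:

! Sketch: declaring and using a unified-memory (managed) array.
SUBROUTINE managed_sketch(nvd, lmaxd)
   USE cudafor
   IMPLICIT NONE
   INTEGER, INTENT(IN) :: nvd, lmaxd
   REAL, ALLOCATABLE, MANAGED :: fj(:,:)   ! visible on host and device
   INTEGER :: istat
   ALLOCATE(fj(nvd, 0:lmaxd))
   fj = 0.0                        ! touched on the host...
   ! ...and readable from any kernel launched afterwards, no cudaMemcpy needed;
   ! synchronize before host reuse so device writes are guaranteed visible.
   istat = cudaDeviceSynchronize()
   DEALLOCATE(fj)
END SUBROUTINE managed_sketch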
......@@ -62,7 +62,7 @@ CONTAINS
USE m_ylm
USE m_apws
USE cudafor
! USE nvtx
USE nvtx
IMPLICIT NONE
TYPE(t_sym),INTENT(IN) :: sym
TYPE(t_cell),INTENT(IN) :: cell
......@@ -92,8 +92,9 @@ CONTAINS
COMPLEX,ALLOCATABLE,DEVICE :: c_ph_dev(:,:)
REAL, ALLOCATABLE,DEVICE :: gkrot_dev(:,:)
INTEGER :: grid, block
!INTEGER :: istat
INTEGER :: istat
call nvtxStartRange("hsmt_ab",3)
lmax=MERGE(atoms%lnonsph(n),atoms%lmax(n),l_nonsph)
ALLOCATE(c_ph_dev(lapw%nv(1),MERGE(2,1,noco%l_ss)))
......@@ -129,17 +130,12 @@ CONTAINS
!--> synthesize the complex conjugates of a and b
!call nvtxStartRange("hsmt_synthAB",5)
!istat = cudaDeviceSynchronize()
! pretty ugly solution
block = 256
grid = lapw%nv(1)/(block*4) + 1
CALL synth_ab<<<grid,block>>>(grid,block,lapw%nv(1),lmax,ab_size,gkrot_dev,&
fj(:,:,iintsp),gj(:,:,iintsp),c_ph_dev(:,iintsp),ab)
!istat = cudaDeviceSynchronize()
!call nvtxEndRange
IF (PRESENT(abclo)) THEN
print*, "Ooooops, TODO in hsmt_ab"
......@@ -169,6 +165,8 @@ CONTAINS
DEALLOCATE(c_ph_dev)
DEALLOCATE(gkrot_dev)
istat = cudaDeviceSynchronize()
call nvtxEndRange
END SUBROUTINE hsmt_ab_gpu
#endif
......
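The nvtxStartRange/nvtxEndRange pair re-enabled here brackets hsmt_ab_gpu as a named, colored range in the NVIDIA profiler timeline, and the launch configuration gives each thread roughly four basis functions. A sketch of the pattern, assuming the same nvtx wrapper module that hsmt_ab USEs:

! Sketch: profiling a kernel launch with an NVTX range.
SUBROUTINE ab_profile_sketch(nv)
   USE cudafor
   USE nvtx                            ! wrapper module, as USEd above
   IMPLICIT NONE
   INTEGER, INTENT(IN) :: nv
   INTEGER :: grid, block, istat
   CALL nvtxStartRange("hsmt_ab", 3)   ! range name + color index
   block = 256                         ! threads per block
   grid  = nv/(block*4) + 1            ! ~4 basis functions per thread
   ! CALL synth_ab<<<grid,block>>>(...) ! launch as in the code above
   istat = cudaDeviceSynchronize()     ! ensure the range covers the kernel
   CALL nvtxEndRange()
END SUBROUTINE ab_profile_sketch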
......@@ -17,7 +17,7 @@ CONTAINS
DO iintsp=1,2
DO jintsp=1,2
mat(jintsp,iintsp)%data_c(:,:)=chi(jintsp,iintsp)*mat_tmp%data_c(:,:)+mat(jintsp,iintsp)%data_c(:,:)
mat(jintsp,iintsp)%data_c(:,:)=chi(iintsp,jintsp)*mat_tmp%data_c(:,:)+mat(jintsp,iintsp)%data_c(:,:)
ENDDO
ENDDO
END SUBROUTINE hsmt_distspins
......
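The one-line fix above swaps the spinor indices: the spin-diagonal local matrix mat_tmp enters global block (jintsp,iintsp) with weight chi(iintsp,jintsp), not chi(jintsp,iintsp). A sketch of the corrected accumulation on plain arrays (hypothetical names):

! Sketch: rotate a spin-diagonal local contribution into the 2x2 global blocks.
SUBROUTINE distspins_sketch(n, chi, mat_tmp, mat)
   IMPLICIT NONE
   INTEGER, INTENT(IN)    :: n
   COMPLEX, INTENT(IN)    :: chi(2,2), mat_tmp(n,n)
   COMPLEX, INTENT(INOUT) :: mat(n,n,2,2)   ! last two indices: jintsp,iintsp
   INTEGER :: iintsp, jintsp
   DO iintsp = 1, 2
      DO jintsp = 1, 2
         ! note the transposed index order on chi relative to the block index
         mat(:,:,jintsp,iintsp) = chi(iintsp,jintsp)*mat_tmp + mat(:,:,jintsp,iintsp)
      END DO
   END DO
END SUBROUTINE distspins_sketch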
......@@ -29,39 +29,22 @@ CONTAINS
INTEGER, INTENT (IN) :: n,isp,iintsp,jintsp
COMPLEX,INTENT(IN) :: chi
! .. Array Arguments ..
REAL,INTENT(IN) :: fj(:,0:,:),gj(:,0:,:)
#if defined CPP_GPU
REAL,MANAGED,INTENT(IN) :: fj(:,:,:),gj(:,:,:)
#else
REAL,INTENT(IN) :: fj(:,0:,:),gj(:,0:,:)
#endif
CLASS(t_mat),INTENT(INOUT) ::hmat
#if defined CPP_GPU
REAL, ALLOCATABLE,DEVICE :: fj_dev(:,:,:), gj_dev(:,:,:)
COMPLEX,ALLOCATABLE,DEVICE :: h_loc_dev(:,:)
COMPLEX,ALLOCATABLE,DEVICE :: c_dev(:,:)
#endif
CALL timestart("non-spherical setup")
IF (mpi%n_size==1) THEN
#if defined CPP_GPU
ALLOCATE(fj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
ALLOCATE(gj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
fj_dev(1:,1:,1:)= fj(1:,0:,1:)
gj_dev(1:,1:,1:)= gj(1:,0:,1:)
ALLOCATE(h_loc_dev(size(td%h_loc,1),size(td%h_loc,2)))
h_loc_dev(1:,1:) = CONJG(td%h_loc(0:,0:,n,isp))
IF (hmat%l_real) THEN
IF (ANY(SHAPE(hmat%data_c)/=SHAPE(hmat%data_r))) THEN
DEALLOCATE(hmat%data_c)
ALLOCATE(hmat%data_c(SIZE(hmat%data_r,1),SIZE(hmat%data_r,2)))
ENDIF
hmat%data_c=0.0
ENDIF
ALLOCATE(c_dev(SIZE(hmat%data_c,1),SIZE(hmat%data_c,2)))
c_dev = hmat%data_c
CALL priv_noMPI(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,h_loc_dev,fj_dev,gj_dev,c_dev)
hmat%data_c = c_dev
IF (hmat%l_real) THEN
hmat%data_r=hmat%data_r+REAL(hmat%data_c)
ENDIF
CALL priv_noMPI(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,h_loc_dev,fj,gj,hmat)
#else
CALL priv_noMPI(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,td,fj,gj,hmat)
#endif
......@@ -72,7 +55,7 @@ CONTAINS
END SUBROUTINE hsmt_nonsph
#if defined CPP_GPU
SUBROUTINE priv_noMPI_gpu(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,h_loc_dev,fj_dev,gj_dev,c_dev)
SUBROUTINE priv_noMPI_gpu(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,h_loc_dev,fj_dev,gj_dev,hmat)
!Calculate overlap matrix, GPU version
!note that basically all matrices in the GPU version are conjugates of their cpu counterparts
USE m_hsmt_ab
......@@ -101,8 +84,7 @@ CONTAINS
! ..
! .. Array Arguments ..
REAL, INTENT(IN), DEVICE :: fj_dev(:,:,:), gj_dev(:,:,:)
COMPLEX,INTENT(INOUT),DEVICE :: c_dev(:,:)
CLASS(t_mat),INTENT(INOUT) ::hmat
INTEGER:: nn,na,ab_size,l,ll,m
real :: rchi
......@@ -114,6 +96,14 @@ CONTAINS
ALLOCATE(ab_dev(MAXVAL(lapw%nv),2*atoms%lmaxd*(atoms%lmaxd+2)+2))
IF (iintsp.NE.jintsp) ALLOCATE(ab2_dev(lapw%nv(iintsp),2*atoms%lmaxd*(atoms%lmaxd+2)+2))
IF (hmat%l_real) THEN
IF (ANY(SHAPE(hmat%data_c)/=SHAPE(hmat%data_r))) THEN
DEALLOCATE(hmat%data_c)
ALLOCATE(hmat%data_c(SIZE(hmat%data_r,1),SIZE(hmat%data_r,2)))
ENDIF
hmat%data_c=0.0
ENDIF
DO nn = 1,atoms%neq(n)
na = SUM(atoms%neq(:n-1))+nn
IF ((atoms%invsat(na)==0) .OR. (atoms%invsat(na)==1)) THEN
......@@ -124,10 +114,9 @@ CONTAINS
!Calculate Hamiltonian
CALL zgemm("N","N",lapw%nv(jintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab_dev,SIZE(ab_dev,1),&
h_loc_dev,SIZE(h_loc_dev,1),CMPLX(0.,0.),ab1_dev,SIZE(ab1_dev,1))
!ab1=MATMUL(ab(:lapw%nv(iintsp),:ab_size),td%h_loc(:ab_size,:ab_size,n,isp))
IF (iintsp==jintsp) THEN
call nvtxStartRange("zherk",3)
CALL ZHERK("U","N",lapw%nv(iintsp),ab_size,Rchi,ab1_dev,SIZE(ab1_dev,1),1.0,c_dev,SIZE(c_dev,1))
CALL ZHERK("U","N",lapw%nv(iintsp),ab_size,Rchi,ab1_dev,SIZE(ab1_dev,1),1.0,hmat%data_c,SIZE(hmat%data_c,1))
istat = cudaDeviceSynchronize()
call nvtxEndRange()
ELSE !here the l_ss off-diagonal part starts
......@@ -144,11 +133,14 @@ CONTAINS
enddo
enddo
CALL zgemm("N","T",lapw%nv(iintsp),lapw%nv(jintsp),ab_size,chi,ab2_dev,SIZE(ab2_dev,1),&
ab1_dev,SIZE(ab1_dev,1),CMPLX(1.0,0.0),c_dev,SIZE(c_dev,1))
ab1_dev,SIZE(ab1_dev,1),CMPLX(1.0,0.0),hmat%data_c,SIZE(hmat%data_c,1))
ENDIF
ENDIF
END DO
IF (hmat%l_real) THEN
hmat%data_r=hmat%data_r+REAL(hmat%data_c)
ENDIF
call nvtxEndRange
END SUBROUTINE priv_noMPI_gpu
#endif
......
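Both branches of hsmt_nonsph reduce the nonspherical setup to dense BLAS: a Hermitian rank-k update H <- H + Re(chi)*ab1*ab1^H (ZHERK) for the spin-diagonal blocks and a general H <- H + chi*ab2*ab1^T (ZGEMM) for the off-diagonal ones, which are exactly the calls now writing into hmat%data_c. A CPU-side sketch of the same two updates:

! Sketch: nonspherical Hamiltonian update via BLAS, mirroring the GPU branch.
SUBROUTINE nonsph_blas_sketch(nv, ab_size, rchi, chi, ab1, ab2, h)
   IMPLICIT NONE
   INTEGER, INTENT(IN)    :: nv, ab_size
   REAL,    INTENT(IN)    :: rchi            ! Re(chi): spin-diagonal weight
   COMPLEX, INTENT(IN)    :: chi             ! full weight: off-diagonal case
   COMPLEX, INTENT(IN)    :: ab1(nv,ab_size), ab2(nv,ab_size)
   COMPLEX, INTENT(INOUT) :: h(nv,nv)
   ! spin-diagonal block: H := H + rchi * ab1 * ab1^H (upper triangle only)
   CALL zherk('U','N',nv,ab_size,rchi,ab1,nv,1.0,h,nv)
   ! spin-off-diagonal block: H := H + chi * ab2 * ab1^T
   CALL zgemm('N','T',nv,nv,ab_size,chi,ab2,nv,ab1,nv,CMPLX(1.0,0.0),h,nv)
END SUBROUTINE nonsph_blas_sketch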
......@@ -7,15 +7,450 @@
MODULE m_hsmt_sph
USE m_juDFT
IMPLICIT NONE
INTERFACE hsmt_sph
module procedure hsmt_sph_cpu
#ifdef CPP_GPU
module procedure hsmt_sph_gpu
END INTERFACE
INTERFACE HsmtSphGpuKernel_noApw
module procedure HsmtSphGpuKernel_noApw_cmplx, HsmtSphGpuKernel_noApw_real
END INTERFACE
INTERFACE HsmtSphGpuKernel_Apw
module procedure HsmtSphGpuKernel_Apw_cmplx, HsmtSphGpuKernel_Apw_real
#endif
END INTERFACE
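The INTERFACE blocks above rely on generic resolution: hsmt_sph dispatches between the CPU and GPU implementations, and each GPU kernel name dispatches between a real and a complex matrix version by argument type. A minimal sketch of the pattern:

! Sketch: generic dispatch via module procedures, as in the interfaces above.
MODULE m_dispatch_sketch
   IMPLICIT NONE
   INTERFACE kernel
      MODULE PROCEDURE kernel_real, kernel_cmplx   ! resolved by argument type
   END INTERFACE
CONTAINS
   SUBROUTINE kernel_real(mat)
      REAL, INTENT(INOUT) :: mat(:,:)
      mat = 2.0*mat
   END SUBROUTINE kernel_real
   SUBROUTINE kernel_cmplx(mat)
      COMPLEX, INTENT(INOUT) :: mat(:,:)
      mat = 2.0*mat
   END SUBROUTINE kernel_cmplx
END MODULE m_dispatch_sketch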
CONTAINS
SUBROUTINE hsmt_sph(n,atoms,mpi,isp,input,noco,cell,iintsp,jintsp,chi,lapw,el,e_shift,usdus,fj,gj,smat,hmat)
#ifdef CPP_GPU
SUBROUTINE HsmtSphGpuKernel_Apw_real(iintsp,jintsp,nv,lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,&
lnonsph,chi,qssbti,qssbtj,gvec,gk,fleg1,fleg2,fl2p1,fl2p1bt,fj,gj,taual,ddn,el,e_shift,&
smat_data,hmat_data,&
uds,dus,us,duds,rmt)
INTEGER,INTENT(IN) :: iintsp,jintsp,nv(2),lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,lnonsph
COMPLEX, INTENT(IN) :: chi
REAL,INTENT(IN) :: qssbti(3),qssbtj(3)
INTEGER,INTENT(IN) :: gvec(:,:,:)
REAL,INTENT(IN) :: gk(:,:,:)
REAL,INTENT(IN) :: fleg1(0:lmaxd),fleg2(0:lmaxd),fl2p1(0:lmaxd)
REAL,INTENT(IN) :: fl2p1bt(0:lmaxd)
REAL,MANAGED,INTENT(IN) :: fj(:,0:,:),gj(:,0:,:)
REAL,INTENT(IN) :: taual(:,:)
REAL,INTENT(IN) :: ddn(0:lmaxd)
REAL,INTENT(IN) :: el(0:lmaxd)
REAL,INTENT(IN) :: e_shift
REAL,INTENT(INOUT) :: smat_data(:,:),hmat_data(:,:)
!+APW
REAL,INTENT(IN),OPTIONAL :: uds(0:lmaxd),dus(0:lmaxd),us(0:lmaxd),duds(0:lmaxd)
REAL,INTENT(IN),OPTIONAL :: rmt
!-APW
REAL, PARAMETER :: tpi_const=2.*3.1415926535897932
REAL, ALLOCATABLE :: plegend(:,:)
COMPLEX, ALLOCATABLE :: cph(:)
REAL tnn(3), elall,fct,fjkiln,gjkiln,ddnln,ski(3)
REAL apw_lo1,apw_lo2,apw1,w1
INTEGER kii,ki,kj,l,nn
ALLOCATE(cph(MAXVAL(nv)))
ALLOCATE(plegend(MAXVAL(nv),0:lmaxd))
plegend=0.0
plegend(:,0)=1.0
DO ki = ki_start,ki_end,ki_step
kii=(ki-1)/ki_step+1
ski = gvec(:,ki,jintsp) + qssbti
!---> legendre polynomials
DO kj = 1,ki
plegend(kj,1) = DOT_PRODUCT(gk(:,kj,iintsp),gk(:,ki,jintsp))
END DO
DO l = 1,lmax - 1
plegend(:ki,l+1) = fleg1(l)*plegend(:ki,1)*plegend(:ki,l) - fleg2(l)*plegend(:ki,l-1)
END DO
!---> set up phase factors
cph = 0.0
DO nn = nn_start,nn_end
tnn = tpi_const*taual(:,nn)
DO kj = 1,ki
cph(kj) = cph(kj) +&
CMPLX(COS(DOT_PRODUCT(ski-gvec(:,kj,iintsp)-qssbtj,tnn)),&
SIN(DOT_PRODUCT(gvec(:,kj,iintsp)+qssbtj-ski,tnn)))
! IF (iintsp.NE.jintsp) cph(kj)=CONJG(cph(kj))
END DO
END DO
!---> update overlap and l-diagonal hamiltonian matrix
DO l = 0,lmax
fjkiln = fj(ki,l,jintsp)
gjkiln = gj(ki,l,jintsp)
ddnln = ddn(l)
elall = el(l)
IF (l<=lnonsph) elall=elall-e_shift!(isp)
!+APW
!fjkiln/gjkiln are used here, so they must be set before this block
IF (PRESENT(uds)) THEN
w1 = 0.5 * ( uds(l)*dus(l) + us(l)*duds(l) )
apw_lo1 = fl2p1(l) * 0.5 * rmt**2 * ( gjkiln * w1 +&
fjkiln * us(l) * dus(l) )
apw_lo2 = fl2p1(l) * 0.5 * rmt**2 * ( fjkiln * w1 +&
gjkiln * uds(l) * duds(l) )
ENDIF
!-APW
DO kj = 1,ki
fct = plegend(kj,l)*fl2p1(l)*&
( fjkiln*fj(kj,l,iintsp) + gjkiln*gj(kj,l,iintsp)*ddnln )
smat_data(kj,kii)=smat_data(kj,kii)+REAL(cph(kj))*fct
hmat_data(kj,kii)=hmat_data(kj,kii) + REAL(cph(kj)) * &
( fct * elall + plegend(kj,l) * fl2p1bt(l) *&
( fjkiln*gj(kj,l,iintsp) + gjkiln*fj(kj,l,iintsp) ) )
!+APW
IF (PRESENT(uds)) THEN
apw1 = REAL(cph(kj)) * plegend(kj,l) * &
( apw_lo1 * fj(kj,l,iintsp) + apw_lo2 * gj(kj,l,iintsp) )
hmat_data(kj,kii)=hmat_data(kj,kii) + apw1
ENDIF
!-APW
ENDDO
!---> end loop over l
ENDDO
!---> end loop over ki
ENDDO
DEALLOCATE(plegend)
DEALLOCATE(cph)
END SUBROUTINE HsmtSphGpuKernel_Apw_real
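The plegend update in these kernels is Bonnet's recurrence, (l+1)*P_{l+1}(x) = (2l+1)*x*P_l(x) - l*P_{l-1}(x), with fleg1(l) = (2l+1)/(l+1) and fleg2(l) = l/(l+1) precomputed and x the cosine of the angle between the two G-vectors. A standalone sketch:

! Sketch: Legendre polynomials P_0..P_lmax at x via Bonnet's recurrence,
! with the same coefficients the kernels precompute as fleg1/fleg2.
SUBROUTINE legendre_sketch(lmax, x, p)
   IMPLICIT NONE
   INTEGER, INTENT(IN)  :: lmax
   REAL,    INTENT(IN)  :: x          ! cos of the angle between two G-vectors
   REAL,    INTENT(OUT) :: p(0:lmax)
   REAL    :: fleg1, fleg2
   INTEGER :: l
   p(0) = 1.0
   IF (lmax >= 1) p(1) = x
   DO l = 1, lmax-1
      fleg1 = REAL(2*l+1)/REAL(l+1)
      fleg2 = REAL(l)/REAL(l+1)
      p(l+1) = fleg1*x*p(l) - fleg2*p(l-1)   ! (l+1)P_{l+1}=(2l+1)xP_l-lP_{l-1}
   END DO
END SUBROUTINE legendre_sketch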
SUBROUTINE HsmtSphGpuKernel_Apw_cmplx(iintsp,jintsp,nv,lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,&
lnonsph,chi,qssbti,qssbtj,gvec,gk,fleg1,fleg2,fl2p1,fl2p1bt,fj,gj,taual,ddn,el,e_shift,&
smat_data,hmat_data,&
uds,dus,us,duds,rmt)
INTEGER,INTENT(IN) :: iintsp,jintsp,nv(2),lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,lnonsph
COMPLEX, INTENT(IN) :: chi
REAL,INTENT(IN) :: qssbti(3),qssbtj(3)
INTEGER,INTENT(IN) :: gvec(:,:,:)
REAL,INTENT(IN) :: gk(:,:,:)
REAL,INTENT(IN) :: fleg1(0:lmaxd),fleg2(0:lmaxd),fl2p1(0:lmaxd)
REAL,INTENT(IN) :: fl2p1bt(0:lmaxd)
REAL,MANAGED,INTENT(IN) :: fj(:,0:,:),gj(:,0:,:)
REAL,INTENT(IN) :: taual(:,:)
REAL,INTENT(IN) :: ddn(0:lmaxd)
REAL,INTENT(IN) :: el(0:lmaxd)
REAL,INTENT(IN) :: e_shift
COMPLEX,INTENT(INOUT) :: smat_data(:,:),hmat_data(:,:)
!+APW
REAL,INTENT(IN),OPTIONAL :: uds(0:lmaxd),dus(0:lmaxd),us(0:lmaxd),duds(0:lmaxd)
REAL,INTENT(IN),OPTIONAL :: rmt
!-APW
REAL, PARAMETER :: tpi_const=2.*3.1415926535897932
REAL, ALLOCATABLE :: plegend(:,:)
COMPLEX, ALLOCATABLE :: cph(:)
REAL apw_lo1,apw_lo2,w1
COMPLEX capw1
REAL tnn(3), elall,fjkiln,gjkiln,ddnln,ski(3)
COMPLEX fct !fct carries the complex spinor weight chi, so it must not be REAL
INTEGER kii,ki,kj,l,nn
ALLOCATE(cph(MAXVAL(nv)))
ALLOCATE(plegend(MAXVAL(nv),0:lmaxd))
plegend=0.0
plegend(:,0)=1.0
DO ki = ki_start,ki_end,ki_step
kii=(ki-1)/ki_step+1
ski = gvec(:,ki,jintsp) + qssbti
!---> legendre polynomials
DO kj = 1,ki
plegend(kj,1) = DOT_PRODUCT(gk(:,kj,iintsp),gk(:,ki,jintsp))
END DO
DO l = 1,lmax - 1
plegend(:ki,l+1) = fleg1(l)*plegend(:ki,1)*plegend(:ki,l) - fleg2(l)*plegend(:ki,l-1)
END DO
!---> set up phase factors
cph = 0.0
DO nn = nn_start,nn_end
tnn = tpi_const*taual(:,nn)
DO kj = 1,ki
cph(kj) = cph(kj) +&
CMPLX(COS(DOT_PRODUCT(ski-gvec(:,kj,iintsp)-qssbtj,tnn)),&
SIN(DOT_PRODUCT(gvec(:,kj,iintsp)+qssbtj-ski,tnn)))
! IF (iintsp.NE.jintsp) cph(kj)=CONJG(cph(kj))
END DO
END DO
!---> update overlap and l-diagonal hamiltonian matrix
DO l = 0,lmax
fjkiln = fj(ki,l,jintsp)
gjkiln = gj(ki,l,jintsp)
ddnln = ddn(l)
elall = el(l)
IF (l<=lnonsph) elall=elall-e_shift!(isp)
!+APW
!fjkiln/gjkiln are used here, so they must be set before this block
IF (PRESENT(uds)) THEN
w1 = 0.5 * ( uds(l)*dus(l) + us(l)*duds(l) )
apw_lo1 = fl2p1(l) * 0.5 * rmt**2 * ( gjkiln * w1 +&
fjkiln * us(l) * dus(l) )
apw_lo2 = fl2p1(l) * 0.5 * rmt**2 * ( fjkiln * w1 +&
gjkiln * uds(l) * duds(l) )
ENDIF
!-APW
DO kj = 1,MIN(ki,nv(iintsp))
fct = chi*plegend(kj,l)*fl2p1(l)*&
( fjkiln*fj(kj,l,iintsp) + gjkiln*gj(kj,l,iintsp)*ddnln )
smat_data(kj,kii)=smat_data(kj,kii) + cph(kj)*fct
hmat_data(kj,kii)=hmat_data(kj,kii) + cph(kj) * ( fct*elall &
+ chi*plegend(kj,l)*fl2p1bt(l) * ( fjkiln*gj(kj,l,iintsp) + gjkiln*fj(kj,l,iintsp) ) )
!+APW
IF (PRESENT(uds)) THEN
capw1 = cph(kj)*plegend(kj,l)&
* ( apw_lo1 * fj(kj,l,iintsp) + apw_lo2 * gj(kj,l,iintsp) )
hmat_data(kj,kii)=hmat_data(kj,kii) + capw1
ENDIF
!-APW
END DO
!---> end loop over l
ENDDO
!---> end loop over ki
ENDDO
DEALLOCATE(plegend)
DEALLOCATE(cph)
END SUBROUTINE HsmtSphGpuKernel_Apw_cmplx
SUBROUTINE HsmtSphGpuKernel_noApw_cmplx(iintsp,jintsp,nv,lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,&
lnonsph,chi,qssbti,qssbtj,gvec,gk,fleg1,fleg2,fl2p1,fl2p1bt,fj,gj,taual,ddn,el,e_shift,&
smat_data,hmat_data)
INTEGER,INTENT(IN) :: iintsp,jintsp,nv(2),lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,lnonsph
COMPLEX, INTENT(IN) :: chi
REAL,INTENT(IN) :: qssbti(3),qssbtj(3)
INTEGER,INTENT(IN) :: gvec(:,:,:)
REAL,INTENT(IN) :: gk(:,:,:)
REAL,INTENT(IN) :: fleg1(0:lmaxd),fleg2(0:lmaxd),fl2p1(0:lmaxd)
REAL,INTENT(IN) :: fl2p1bt(0:lmaxd)
REAL,MANAGED,INTENT(IN) :: fj(:,0:,:),gj(:,0:,:)
REAL,INTENT(IN) :: taual(:,:)
REAL,INTENT(IN) :: ddn(0:lmaxd)
REAL,INTENT(IN) :: el(0:lmaxd)
REAL,INTENT(IN) :: e_shift
COMPLEX,INTENT(INOUT) :: smat_data(:,:),hmat_data(:,:)
REAL, PARAMETER :: tpi_const=2.*3.1415926535897932
REAL, ALLOCATABLE :: plegend(:,:)
COMPLEX, ALLOCATABLE :: cph(:)
REAL tnn(3), elall,fjkiln,gjkiln,ddnln,ski(3)
COMPLEX fct !fct carries the complex spinor weight chi, so it must not be REAL
INTEGER kii,ki,kj,l,nn
ALLOCATE(cph(MAXVAL(nv)))
ALLOCATE(plegend(MAXVAL(nv),0:lmaxd))
plegend=0.0
plegend(:,0)=1.0
DO ki = ki_start,ki_end,ki_step
kii=(ki-1)/ki_step+1
ski = gvec(:,ki,jintsp) + qssbti
!---> legendre polynomials
DO kj = 1,ki
plegend(kj,1) = DOT_PRODUCT(gk(:,kj,iintsp),gk(:,ki,jintsp))
END DO
DO l = 1,lmax - 1
plegend(:ki,l+1) = fleg1(l)*plegend(:ki,1)*plegend(:ki,l) - fleg2(l)*plegend(:ki,l-1)
END DO
!---> set up phase factors
cph = 0.0
DO nn = nn_start,nn_end
tnn = tpi_const*taual(:,nn)
DO kj = 1,ki
cph(kj) = cph(kj) +&
CMPLX(COS(DOT_PRODUCT(ski-gvec(:,kj,iintsp)-qssbtj,tnn)),&
SIN(DOT_PRODUCT(gvec(:,kj,iintsp)+qssbtj-ski,tnn)))
! IF (iintsp.NE.jintsp) cph(kj)=CONJG(cph(kj))
END DO
END DO
!---> update overlap and l-diagonal hamiltonian matrix
DO l = 0,lmax
fjkiln = fj(ki,l,jintsp)
gjkiln = gj(ki,l,jintsp)
ddnln = ddn(l)
elall = el(l)
IF (l<=lnonsph) elall=elall-e_shift!(isp)
DO kj = 1,MIN(ki,nv(iintsp))
fct = chi*plegend(kj,l)*fl2p1(l)*&
( fjkiln*fj(kj,l,iintsp) + gjkiln*gj(kj,l,iintsp)*ddnln )
smat_data(kj,kii)=smat_data(kj,kii) + cph(kj)*fct
hmat_data(kj,kii)=hmat_data(kj,kii) + cph(kj) * ( fct*elall &
+ chi*plegend(kj,l)*fl2p1bt(l) * ( fjkiln*gj(kj,l,iintsp) + gjkiln*fj(kj,l,iintsp) ) )
END DO
!---> end loop over l
ENDDO
!---> end loop over ki
ENDDO
DEALLOCATE(plegend)
DEALLOCATE(cph)
END SUBROUTINE HsmtSphGpuKernel_noApw_cmplx
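Finally, the cph setup in each kernel accumulates the structure factor of the equivalent atoms, cph(kj) = sum_nn exp(-i*(k_i - k_j).tau_nn), written above as separate COS/SIN arguments. A compact sketch of the same sum:

! Sketch: phase factors (structure factor over equivalent atoms) as built in
! the kernels above; ski/skj are the two Bloch vectors, taual the atomic
! positions in internal coordinates.
SUBROUTINE phase_sketch(natoms, taual, ski, skj, cph)
   IMPLICIT NONE
   INTEGER, INTENT(IN)  :: natoms
   REAL,    INTENT(IN)  :: taual(3,natoms), ski(3), skj(3)
   COMPLEX, INTENT(OUT) :: cph
   REAL, PARAMETER :: tpi = 2.*3.1415926535897932
   REAL    :: arg
   INTEGER :: nn
   cph = 0.0
   DO nn = 1, natoms
      arg = tpi*DOT_PRODUCT(ski - skj, taual(:,nn))
      cph = cph + CMPLX(COS(arg), -SIN(arg))   ! exp(-i*(ski-skj).tau_nn)
   END DO
END SUBROUTINE phase_sketch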