Commit 4756cbba authored by Matthias Redies's avatar Matthias Redies

Merge branch 'develop' into MetaGGA

parents 13e4beb9 d4cd3fdb
......@@ -12,35 +12,28 @@ MODULE m_hsmt_sph
module procedure hsmt_sph_cpu
#ifdef CPP_GPU
module procedure hsmt_sph_gpu
END INTERFACE
INTERFACE HsmtSphGpuKernel_noApw
module procedure HsmtSphGpuKernel_noApw_cmplx, HsmtSphGpuKernel_noApw_real
END INTERFACE
INTERFACE HsmtSphGpuKernel_Apw
module procedure HsmtSphGpuKernel_Apw_cmplx, HsmtSphGpuKernel_Apw_real
#endif
END INTERFACE
CONTAINS
#ifdef CPP_GPU
SUBROUTINE HsmtSphGpuKernel_Apw_real(iintsp,jintsp,nv,lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,&
lnonsph,chi,qssbti,qssbtj,gvec,gk,fleg1,fleg2,fl2p1,fl2p1bt,fj,gj,taual,ddn,el,e_shift,&
ATTRIBUTES(global) SUBROUTINE HsmtSphGpuKernel_real(grid,block,iintsp,jintsp,nv,lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,&
lnonsph,qssbti,qssbtj,gvec,gk,fleg1,fleg2,fl2p1,fl2p1bt,fj,gj,taual,ddn,el,e_shift,&
smat_data,hmat_data,&
uds,dus,us,duds,rmt)
INTEGER,INTENT(IN) :: iintsp,jintsp,nv(2),lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,lnonsph
COMPLEX, INTENT(IN) :: chi
INTEGER, VALUE, INTENT(IN) :: grid, block
INTEGER,VALUE,INTENT(IN) :: iintsp,jintsp,lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,lnonsph
REAL,INTENT(IN) :: qssbti(3),qssbtj(3)
INTEGER,INTENT(IN) :: gvec(:,:,:)
INTEGER,INTENT(IN) :: gvec(:,:,:),nv(2)
REAL,INTENT(IN) :: gk(:,:,:)
REAL,INTENT(IN) :: fleg1(0:lmaxd),fleg2(0:lmaxd),fl2p1(0:lmaxd)
REAL,INTENT(IN) :: fl2p1bt(0:lmaxd)
REAL,MANAGED,INTENT(IN) :: fj(:,0:,:),gj(:,0:,:)
REAL,INTENT(IN) :: fj(:,0:,:),gj(:,0:,:)
REAL,INTENT(IN) :: taual(:,:)
REAL,INTENT(IN) :: ddn(0:lmaxd)
REAL,INTENT(IN) :: el(0:lmaxd)
REAL,INTENT(IN) :: e_shift
REAL,VALUE,INTENT(IN) :: e_shift
REAL,INTENT(INOUT) :: smat_data(:,:),hmat_data(:,:)
!+APW
REAL,INTENT(IN),OPTIONAL :: uds(0:lmaxd),dus(0:lmaxd),us(0:lmaxd),duds(0:lmaxd)
......@@ -50,16 +43,28 @@ CONTAINS
REAL, PARAMETER :: tpi_const=2.*3.1415926535897932
REAL, ALLOCATABLE :: plegend(:,:)
COMPLEX, ALLOCATABLE :: cph(:)
REAL tnn(3), elall,fct,fjkiln,gjkiln,ddnln,ski(3)
REAL tnn(3), elall,fct,fct2,fjkiln,gjkiln,ddnln,ski(3)
REAL apw_lo1,apw_lo2,apw1,w1
INTEGER kii,ki,kj,l,nn
INTEGER kii,ki,kj,l,nn,k
INTEGER :: loop_start, loop_end, i, loop_size
ALLOCATE(cph(MAXVAL(nv)))
ALLOCATE(plegend(MAXVAL(nv),0:lmaxd))
plegend=0.0
plegend(:,0)=1.0
DO ki = ki_start,ki_end,ki_step
k = (blockidx%x-1)*blockdim%x + threadidx%x
!TODO!!!
!for seq, i.e. ki_start = 1, ki_step = 1
loop_size = max(ki_end/(grid*block),1)
if (loop_size * grid*block < ki_end) loop_size = loop_size + 1
loop_start = (k-1) * loop_size + 1
loop_end = loop_start + loop_size - 1
if (loop_end > ki_end ) loop_end = ki_end
DO ki = loop_start,loop_end,ki_step
!DO ki = ki_start,ki_end,ki_step
kii=(ki-1)/ki_step+1
ski = gvec(:,ki,jintsp) + qssbti
!---> legendre polynomials
......@@ -100,10 +105,10 @@ CONTAINS
DO kj = 1,ki
fct = plegend(kj,l)*fl2p1(l)*&
( fjkiln*fj(kj,l,iintsp) + gjkiln*gj(kj,l,iintsp)*ddnln )
fct2 = plegend(kj,l)*fl2p1bt(l) * ( fjkiln*gj(kj,l,iintsp) + gjkiln*fj(kj,l,iintsp) )
smat_data(kj,kii)=smat_data(kj,kii)+REAL(cph(kj))*fct
hmat_data(kj,kii)=hmat_data(kj,kii) + REAL(cph(kj)) * &
( fct * elall + plegend(kj,l) * fl2p1bt(l) *&
( fjkiln*gj(kj,l,iintsp) + gjkiln*fj(kj,l,iintsp) ) )
hmat_data(kj,kii)=hmat_data(kj,kii) + REAL(cph(kj)) * ( fct * elall + fct2)
!+APW
IF (PRESENT(uds)) THEN
apw1 = REAL(cph(kj)) * plegend(kj,l) * &
......@@ -118,24 +123,25 @@ CONTAINS
ENDDO
DEALLOCATE(plegend)
DEALLOCATE(cph)
END SUBROUTINE HsmtSphGpuKernel_Apw_real
END SUBROUTINE HsmtSphGpuKernel_real
SUBROUTINE HsmtSphGpuKernel_Apw_cmplx(iintsp,jintsp,nv,lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,&
ATTRIBUTES(global) SUBROUTINE HsmtSphGpuKernel_cmplx(grid,block,iintsp,jintsp,nv,lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,&
lnonsph,chi,qssbti,qssbtj,gvec,gk,fleg1,fleg2,fl2p1,fl2p1bt,fj,gj,taual,ddn,el,e_shift,&
smat_data,hmat_data,&
uds,dus,us,duds,rmt)
INTEGER,INTENT(IN) :: iintsp,jintsp,nv(2),lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,lnonsph
COMPLEX, INTENT(IN) :: chi
INTEGER, VALUE, INTENT(IN) :: grid, block
INTEGER, VALUE, INTENT(IN) :: iintsp,jintsp,lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,lnonsph
COMPLEX, VALUE, INTENT(IN) :: chi
REAL,INTENT(IN) :: qssbti(3),qssbtj(3)
INTEGER,INTENT(IN) :: gvec(:,:,:)
INTEGER,INTENT(IN) :: gvec(:,:,:),nv(2)
REAL,INTENT(IN) :: gk(:,:,:)
REAL,INTENT(IN) :: fleg1(0:lmaxd),fleg2(0:lmaxd),fl2p1(0:lmaxd)
REAL,INTENT(IN) :: fl2p1bt(0:lmaxd)
REAL,MANAGED,INTENT(IN) :: fj(:,0:,:),gj(:,0:,:)
REAL,INTENT(IN) :: fj(:,0:,:),gj(:,0:,:)
REAL,INTENT(IN) :: taual(:,:)
REAL,INTENT(IN) :: ddn(0:lmaxd)
REAL,INTENT(IN) :: el(0:lmaxd)
REAL,INTENT(IN) :: e_shift
REAL, VALUE,INTENT(IN) :: e_shift
COMPLEX,INTENT(INOUT) :: smat_data(:,:),hmat_data(:,:)
!+APW
REAL,INTENT(IN),OPTIONAL :: uds(0:lmaxd),dus(0:lmaxd),us(0:lmaxd),duds(0:lmaxd)
......@@ -144,18 +150,32 @@ CONTAINS
REAL, PARAMETER :: tpi_const=2.*3.1415926535897932
REAL, ALLOCATABLE :: plegend(:,:)
REAL, ALLOCATABLE :: VecHelpS(:),VecHelpH(:)
COMPLEX, ALLOCATABLE :: cph(:)
REAL apw_lo1,apw_lo2,w1
COMPLEX capw1
REAL tnn(3), elall,fct,fjkiln,gjkiln,ddnln,ski(3)
INTEGER kii,ki,kj,l,nn
REAL tnn(3), elall,fct,fct2,fjkiln,gjkiln,ddnln,ski(3)
INTEGER kii,ki,kj,l,nn,kj_end,k
INTEGER :: loop_start, loop_end, i, loop_size
ALLOCATE(cph(MAXVAL(nv)))
ALLOCATE(plegend(MAXVAL(nv),0:lmaxd))
plegend=0.0
plegend(:,0)=1.0
DO ki = ki_start,ki_end,ki_step
ALLOCATE(VecHelpS(MAXVAL(nv)),VecHelpH(MAXVAL(nv)))
k = (blockidx%x-1)*blockdim%x + threadidx%x
!TODO!!!
!for seq, i.e. ki_start = 1, ki_step = 1
loop_size = max(ki_end/(grid*block),1)
if (loop_size * grid*block < ki_end) loop_size = loop_size + 1
loop_start = (k-1) * loop_size + 1
loop_end = loop_start + loop_size - 1
if (loop_end > ki_end ) loop_end = ki_end
DO ki = loop_start,loop_end,ki_step
!DO ki = ki_start,ki_end,ki_step
kii=(ki-1)/ki_step+1
ski = gvec(:,ki,jintsp) + qssbti
!---> legendre polynomials
......@@ -178,6 +198,9 @@ CONTAINS
END DO
!---> update overlap and l-diagonal hamiltonian matrix
kj_end = MIN(ki,nv(iintsp))
VecHelpS = 0.d0
VecHelpH = 0.d0
DO l = 0,lmax
!+APW
IF (PRESENT(uds)) THEN
......@@ -193,13 +216,13 @@ CONTAINS
ddnln = ddn(l)
elall = el(l)
IF (l<=lnonsph) elall=elall-e_shift!(isp)
DO kj = 1,MIN(ki,nv(iintsp))
fct = chi*plegend(kj,l)*fl2p1(l)*&
( fjkiln*fj(kj,l,iintsp) + gjkiln*gj(kj,l,iintsp)*ddnln )
smat_data(kj,kii)=smat_data(kj,kii) + cph(kj)*fct
hmat_data(kj,kii)=hmat_data(kj,kii) + cph(kj) * ( fct*elall &
+ chi*plegend(kj,l)*fl2p1bt(l) * ( fjkiln*gj(kj,l,iintsp) + gjkiln*fj(kj,l,iintsp) ) )
DO kj = 1,kj_end
fct = plegend(kj,l)*fl2p1(l)*&
( fjkiln*fj(kj,l,iintsp) + gjkiln*gj(kj,l,iintsp)*ddnln )
fct2 = plegend(kj,l)*fl2p1bt(l) * ( fjkiln*gj(kj,l,iintsp) + gjkiln*fj(kj,l,iintsp) )
VecHelpS(kj) = VecHelpS(kj) + fct
VecHelpH(kj) = VecHelpH(kj) + fct*elall + fct2
!+APW
IF (PRESENT(uds)) THEN
capw1 = cph(kj)*plegend(kj,l)&
......@@ -210,156 +233,15 @@ CONTAINS
END DO
!---> end loop over l
ENDDO
smat_data(:kj_end,kii)=smat_data(:kj_end,kii) + chi*cph(:kj_end) * VecHelpS(:kj_end)
hmat_data(:kj_end,kii)=hmat_data(:kj_end,kii) + chi*cph(:kj_end) * VecHelpH(:kj_end)
!---> end loop over ki
ENDDO
DEALLOCATE(plegend)
DEALLOCATE(cph)
END SUBROUTINE HsmtSphGpuKernel_Apw_cmplx
SUBROUTINE HsmtSphGpuKernel_noApw_cmplx(iintsp,jintsp,nv,lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,&
lnonsph,chi,qssbti,qssbtj,gvec,gk,fleg1,fleg2,fl2p1,fl2p1bt,fj,gj,taual,ddn,el,e_shift,&
smat_data,hmat_data)
INTEGER,INTENT(IN) :: iintsp,jintsp,nv(2),lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,lnonsph
COMPLEX, INTENT(IN) :: chi
REAL,INTENT(IN) :: qssbti(3),qssbtj(3)
INTEGER,INTENT(IN) :: gvec(:,:,:)
REAL,INTENT(IN) :: gk(:,:,:)
REAL,INTENT(IN) :: fleg1(0:lmaxd),fleg2(0:lmaxd),fl2p1(0:lmaxd)
REAL,INTENT(IN) :: fl2p1bt(0:lmaxd)
REAL,MANAGED,INTENT(IN) :: fj(:,0:,:),gj(:,0:,:)
REAL,INTENT(IN) :: taual(:,:)
REAL,INTENT(IN) :: ddn(0:lmaxd)
REAL,INTENT(IN) :: el(0:lmaxd)
REAL,INTENT(IN) :: e_shift
COMPLEX,INTENT(INOUT) :: smat_data(:,:),hmat_data(:,:)
REAL, PARAMETER :: tpi_const=2.*3.1415926535897932
REAL, ALLOCATABLE :: plegend(:,:)
COMPLEX, ALLOCATABLE :: cph(:)
REAL tnn(3), elall,fct,fjkiln,gjkiln,ddnln,ski(3)
INTEGER kii,ki,kj,l,nn
ALLOCATE(cph(MAXVAL(nv)))
ALLOCATE(plegend(MAXVAL(nv),0:lmaxd))
plegend=0.0
plegend(:,0)=1.0
DO ki = ki_start,ki_end,ki_step
kii=(ki-1)/ki_step+1
ski = gvec(:,ki,jintsp) + qssbti
!---> legendre polynomials
DO kj = 1,ki
plegend(kj,1) = DOT_PRODUCT(gk(:,kj,iintsp),gk(:,ki,jintsp))
END DO
DO l = 1,lmax - 1
plegend(:ki,l+1) = fleg1(l)*plegend(:ki,1)*plegend(:ki,l) - fleg2(l)*plegend(:ki,l-1)
END DO
!---> set up phase factors
cph = 0.0
DO nn = nn_start,nn_end
tnn = tpi_const*taual(:,nn)
DO kj = 1,ki
cph(kj) = cph(kj) +&
CMPLX(COS(DOT_PRODUCT(ski-gvec(:,kj,iintsp)-qssbtj,tnn)),&
SIN(DOT_PRODUCT(gvec(:,kj,iintsp)+qssbtj-ski,tnn)))
! IF (iintsp.NE.jintsp) cph(kj)=CONJG(cph(kj))
END DO
END DO
!---> update overlap and l-diagonal hamiltonian matrix
DO l = 0,lmax
fjkiln = fj(ki,l,jintsp)
gjkiln = gj(ki,l,jintsp)
ddnln = ddn(l)
elall = el(l)
IF (l<=lnonsph) elall=elall-e_shift!(isp)
DO kj = 1,MIN(ki,nv(iintsp))
fct = chi*plegend(kj,l)*fl2p1(l)*&
( fjkiln*fj(kj,l,iintsp) + gjkiln*gj(kj,l,iintsp)*ddnln )
smat_data(kj,kii)=smat_data(kj,kii) + cph(kj)*fct
hmat_data(kj,kii)=hmat_data(kj,kii) + cph(kj) * ( fct*elall &
+ chi*plegend(kj,l)*fl2p1bt(l) * ( fjkiln*gj(kj,l,iintsp) + gjkiln*fj(kj,l,iintsp) ) )
END DO
!---> end loop over l
ENDDO
!---> end loop over ki
ENDDO
DEALLOCATE(plegend)
DEALLOCATE(cph)
END SUBROUTINE HsmtSphGpuKernel_noApw_cmplx
SUBROUTINE HsmtSphGpuKernel_noApw_real(iintsp,jintsp,nv,lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,&
lnonsph,chi,qssbti,qssbtj,gvec,gk,fleg1,fleg2,fl2p1,fl2p1bt,fj,gj,taual,ddn,el,e_shift,smat_data,hmat_data)
INTEGER,INTENT(IN) :: iintsp,jintsp,nv(2),lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,lnonsph
COMPLEX, INTENT(IN) :: chi
REAL,INTENT(IN) :: qssbti(3),qssbtj(3)
INTEGER,INTENT(IN) :: gvec(:,:,:)
REAL,INTENT(IN) :: gk(:,:,:)
REAL,INTENT(IN) :: fleg1(0:lmaxd),fleg2(0:lmaxd),fl2p1(0:lmaxd)
REAL,INTENT(IN) :: fl2p1bt(0:lmaxd)
REAL,MANAGED,INTENT(IN) :: fj(:,0:,:),gj(:,0:,:)
REAL,INTENT(IN) :: taual(:,:)
REAL,INTENT(IN) :: ddn(0:lmaxd)
REAL,INTENT(IN) :: el(0:lmaxd)
REAL,INTENT(IN) :: e_shift
REAL,INTENT(INOUT) :: smat_data(:,:),hmat_data(:,:)
DEALLOCATE(VecHelpS,VecHelpH)
END SUBROUTINE HsmtSphGpuKernel_cmplx
REAL, PARAMETER :: tpi_const=2.*3.1415926535897932
REAL, ALLOCATABLE :: plegend(:,:)
COMPLEX, ALLOCATABLE :: cph(:)
REAL tnn(3), elall,fct,fjkiln,gjkiln,ddnln,ski(3)
INTEGER kii,ki,kj,l,nn
ALLOCATE(cph(MAXVAL(nv)))
ALLOCATE(plegend(MAXVAL(nv),0:lmaxd))
plegend=0.0
plegend(:,0)=1.0
DO ki = ki_start,ki_end,ki_step
kii=(ki-1)/ki_step+1
ski = gvec(:,ki,jintsp) + qssbti
!---> legendre polynomials
DO kj = 1,ki
plegend(kj,1) = DOT_PRODUCT(gk(:,kj,iintsp),gk(:,ki,jintsp))
END DO
DO l = 1,lmax - 1
plegend(:ki,l+1) = fleg1(l)*plegend(:ki,1)*plegend(:ki,l) - fleg2(l)*plegend(:ki,l-1)
END DO
!---> set up phase factors
cph = 0.0
DO nn = nn_start,nn_end
tnn = tpi_const*taual(:,nn)
DO kj = 1,ki
cph(kj) = cph(kj) +&
CMPLX(COS(DOT_PRODUCT(ski-gvec(:,kj,iintsp)-qssbtj,tnn)),&
SIN(DOT_PRODUCT(gvec(:,kj,iintsp)+qssbtj-ski,tnn)))
! IF (iintsp.NE.jintsp) cph(kj)=CONJG(cph(kj))
END DO
END DO
!---> update overlap and l-diagonal hamiltonian matrix
DO l = 0,lmax
fjkiln = fj(ki,l,jintsp)
gjkiln = gj(ki,l,jintsp)
ddnln = ddn(l)
elall = el(l)
IF (l<=lnonsph) elall=elall-e_shift!(isp)
DO kj = 1,ki
fct = plegend(kj,l)*fl2p1(l)*&
( fjkiln*fj(kj,l,iintsp) + gjkiln*gj(kj,l,iintsp)*ddnln )
smat_data(kj,kii)=smat_data(kj,kii)+REAL(cph(kj))*fct
hmat_data(kj,kii)=hmat_data(kj,kii) + REAL(cph(kj)) * &
( fct * elall + plegend(kj,l) * fl2p1bt(l) *&
( fjkiln*gj(kj,l,iintsp) + gjkiln*fj(kj,l,iintsp) ) )
ENDDO
!---> end loop over l
ENDDO
!---> end loop over ki
ENDDO
DEALLOCATE(plegend)
DEALLOCATE(cph)
END SUBROUTINE HsmtSphGpuKernel_noApw_real
SUBROUTINE hsmt_sph_gpu(n,atoms,mpi,isp,input,noco,iintsp,jintsp,chi,lapw,el,e_shift,usdus,fj,gj,smat,hmat)
USE m_constants, ONLY : fpi_const,tpi_const
......@@ -379,22 +261,20 @@ CONTAINS
COMPLEX, INTENT(IN) :: chi
! ..
! .. Array Arguments ..
REAL, INTENT (IN) :: el(0:atoms%lmaxd,atoms%ntype,input%jspins)
REAL,MANAGED,INTENT (IN) :: el(0:atoms%lmaxd,atoms%ntype,input%jspins)
REAL, INTENT (IN) :: e_shift!(atoms%ntype,input%jspins)
#if defined CPP_GPU
REAL,MANAGED,INTENT(IN) :: fj(:,0:,:),gj(:,0:,:)
#else
REAL, INTENT (IN) :: fj(:,0:,:),gj(:,0:,:)
#endif
! ..
! .. Local Scalars ..
INTEGER l
INTEGER :: grid, block
! ..
! .. Local Arrays ..
REAL fleg1(0:atoms%lmaxd),fleg2(0:atoms%lmaxd),fl2p1(0:atoms%lmaxd)
REAL fl2p1bt(0:atoms%lmaxd)
REAL qssbti(3),qssbtj(3)
REAL,MANAGED :: fleg1(0:atoms%lmaxd),fleg2(0:atoms%lmaxd),fl2p1(0:atoms%lmaxd)
REAL,MANAGED :: fl2p1bt(0:atoms%lmaxd)
REAL,MANAGED :: qssbti(3),qssbtj(3)
INTEGER, DEVICE :: nv_dev(2)
call nvtxStartRange("hsmt_sph",2)
CALL timestart("spherical setup")
......@@ -408,16 +288,22 @@ CONTAINS
qssbti=MERGE(- noco%qss/2,+ noco%qss/2,jintsp.EQ.1)
qssbtj=MERGE(- noco%qss/2,+ noco%qss/2,iintsp.EQ.1)
! pretty ugly solution
nv_dev = lapw%nv
block = 256
grid = lapw%nv(jintsp)/(block*4) + 1
IF (input%l_useapw) THEN
!TODO!!!!
! APW case is not testet
IF (smat%l_real) THEN
CALL HsmtSphGpuKernel_Apw(iintsp,jintsp,lapw%nv,atoms%lmaxd,atoms%lmax(n),mpi%n_rank+1,&
CALL HsmtSphGpuKernel_real<<<grid,block>>>(grid,block,iintsp,jintsp,nv_dev,atoms%lmaxd,atoms%lmax(n),mpi%n_rank+1,&
lapw%nv(jintsp), mpi%n_size,SUM(atoms%neq(:n-1))+1,SUM(atoms%neq(:n)),atoms%lnonsph(n),&
chi,qssbti,qssbtj,lapw%gvec,lapw%gk,fleg1,fleg2,fl2p1,fl2p1bt,fj,gj,atoms%taual,&
qssbti,qssbtj,lapw%gvec,lapw%gk,fleg1,fleg2,fl2p1,fl2p1bt,fj,gj,atoms%taual,&
usdus%ddn(:,n,isp),el(:,n,isp),e_shift,&
smat%data_r,hmat%data_r,&
usdus%uds(:,n,isp),usdus%dus(:,n,isp),usdus%us(:,n,isp),usdus%duds(:,n,isp),atoms%rmt(n))
ELSE
CALL HsmtSphGpuKernel_Apw(iintsp,jintsp,lapw%nv,atoms%lmaxd,atoms%lmax(n),mpi%n_rank+1,&
CALL HsmtSphGpuKernel_cmplx<<<grid,block>>>(grid,block,iintsp,jintsp,nv_dev,atoms%lmaxd,atoms%lmax(n),mpi%n_rank+1,&
lapw%nv(jintsp), mpi%n_size,SUM(atoms%neq(:n-1))+1,SUM(atoms%neq(:n)),atoms%lnonsph(n),&
chi,qssbti,qssbtj,lapw%gvec,lapw%gk,fleg1,fleg2,fl2p1,fl2p1bt,fj,gj,atoms%taual,&
usdus%ddn(:,n,isp),el(:,n,isp),e_shift,&
......@@ -426,12 +312,12 @@ CONTAINS
ENDIF
ELSE
IF (smat%l_real) THEN
CALL HsmtSphGpuKernel_Apw(iintsp,jintsp,lapw%nv,atoms%lmaxd,atoms%lmax(n),mpi%n_rank+1,&
CALL HsmtSphGpuKernel_real<<<grid,block>>>(grid,block,iintsp,jintsp,nv_dev,atoms%lmaxd,atoms%lmax(n),mpi%n_rank+1,&
lapw%nv(jintsp), mpi%n_size,SUM(atoms%neq(:n-1))+1,SUM(atoms%neq(:n)),atoms%lnonsph(n),&
chi,qssbti,qssbtj,lapw%gvec,lapw%gk,fleg1,fleg2,fl2p1,fl2p1bt,fj,gj,atoms%taual,&
qssbti,qssbtj,lapw%gvec,lapw%gk,fleg1,fleg2,fl2p1,fl2p1bt,fj,gj,atoms%taual,&
usdus%ddn(:,n,isp),el(:,n,isp),e_shift,smat%data_r,hmat%data_r)
ELSE
CALL HsmtSphGpuKernel_Apw(iintsp,jintsp,lapw%nv,atoms%lmaxd,atoms%lmax(n),mpi%n_rank+1,&
CALL HsmtSphGpuKernel_cmplx<<<grid,block>>>(grid,block,iintsp,jintsp,nv_dev,atoms%lmaxd,atoms%lmax(n),mpi%n_rank+1,&
lapw%nv(jintsp), mpi%n_size,SUM(atoms%neq(:n-1))+1,SUM(atoms%neq(:n)),atoms%lnonsph(n),&
chi,qssbti,qssbtj,lapw%gvec,lapw%gk,fleg1,fleg2,fl2p1,fl2p1bt,fj,gj,atoms%taual,&
usdus%ddn(:,n,isp),el(:,n,isp),e_shift,smat%data_c,hmat%data_c)
......@@ -466,11 +352,11 @@ CONTAINS
REAL, INTENT (IN) :: fj(:,0:,:),gj(:,0:,:)
! ..
! .. Local Scalars ..
REAL tnn(3), elall,fct,fjkiln,gjkiln,ddnln,ski(3)
REAL tnn(3), elall,fct,fct2,fjkiln,gjkiln,ddnln,ski(3)
REAL apw_lo1,apw_lo2,apw1,w1
COMPLEX capw1
INTEGER kii,ki,kj,l,nn
INTEGER kii,ki,kj,l,nn,kj_end
! ..
! .. Local Arrays ..
......@@ -478,6 +364,7 @@ CONTAINS
REAL fl2p1bt(0:atoms%lmaxd)
REAL qssbti(3),qssbtj(3)
REAL, ALLOCATABLE :: plegend(:,:)
REAL, ALLOCATABLE :: VecHelpS(:),VecHelpH(:)
COMPLEX, ALLOCATABLE :: cph(:)
LOGICAL apw(0:atoms%lmaxd)
......@@ -490,12 +377,13 @@ CONTAINS
fl2p1bt(l) = fl2p1(l)*0.5
END DO
!$OMP PARALLEL DEFAULT(SHARED)&
!$OMP PRIVATE(kii,ki,ski,kj,plegend,l)&
!$OMP PRIVATE(kii,ki,ski,kj,plegend,l,kj_end)&
!$OMP PRIVATE(cph,nn,tnn,fjkiln,gjkiln)&
!$OMP PRIVATE(w1,apw_lo1,apw_lo2,ddnln,elall,fct,apw1)&
!$OMP PRIVATE(capw1)
!$OMP PRIVATE(capw1,VecHelpS,VecHelpH)
ALLOCATE(cph(MAXVAL(lapw%nv)))
ALLOCATE(plegend(MAXVAL(lapw%nv),0:atoms%lmaxd))
ALLOCATE(VecHelpS(MAXVAL(lapw%nv)),VecHelpH(MAXVAL(lapw%nv)))
plegend=0.0
plegend(:,0)=1.0
qssbti=MERGE(- noco%qss/2,+ noco%qss/2,jintsp.EQ.1)
......@@ -524,6 +412,9 @@ CONTAINS
END DO
!---> update overlap and l-diagonal hamiltonian matrix
kj_end = MIN(ki,lapw%nv(iintsp))
VecHelpS = 0.d0
VecHelpH = 0.d0
DO l = 0,atoms%lmax(n)
fjkiln = fj(ki,l,jintsp)
gjkiln = gj(ki,l,jintsp)
......@@ -544,10 +435,10 @@ CONTAINS
DO kj = 1,ki
fct = plegend(kj,l)*fl2p1(l)*&
( fjkiln*fj(kj,l,iintsp) + gjkiln*gj(kj,l,iintsp)*ddnln )
fct2 = plegend(kj,l)*fl2p1bt(l) * ( fjkiln*gj(kj,l,iintsp) + gjkiln*fj(kj,l,iintsp) )
smat%data_r(kj,kii)=smat%data_r(kj,kii)+REAL(cph(kj))*fct
hmat%data_r(kj,kii)=hmat%data_r(kj,kii) + REAL(cph(kj)) * &
( fct * elall + plegend(kj,l) * fl2p1bt(l) *&
( fjkiln*gj(kj,l,iintsp) + gjkiln*fj(kj,l,iintsp) ) )
hmat%data_r(kj,kii)=hmat%data_r(kj,kii) + REAL(cph(kj)) * ( fct * elall + fct2)
!+APW
IF (input%l_useapw) THEN
apw1 = REAL(cph(kj)) * plegend(kj,l) * &
......@@ -557,13 +448,13 @@ CONTAINS
!-APW
ENDDO
ELSE
DO kj = 1,MIN(ki,lapw%nv(iintsp))
fct = chi*plegend(kj,l)*fl2p1(l)*&
DO kj = 1,kj_end
fct = plegend(kj,l)*fl2p1(l)*&
( fjkiln*fj(kj,l,iintsp) + gjkiln*gj(kj,l,iintsp)*ddnln )
smat%data_c(kj,kii)=smat%data_c(kj,kii) + cph(kj)*fct
hmat%data_c(kj,kii)=hmat%data_c(kj,kii) + cph(kj) * ( fct*elall &
+ chi*plegend(kj,l)*fl2p1bt(l) * ( fjkiln*gj(kj,l,iintsp) + gjkiln*fj(kj,l,iintsp) ) )
fct2 = plegend(kj,l)*fl2p1bt(l) * ( fjkiln*gj(kj,l,iintsp) + gjkiln*fj(kj,l,iintsp) )
VecHelpS(kj) = VecHelpS(kj) + fct
VecHelpH(kj) = VecHelpH(kj) + fct*elall + fct2
IF (input%l_useapw) THEN
capw1 = cph(kj)*plegend(kj,l)&
......@@ -575,11 +466,16 @@ CONTAINS
ENDIF
!---> end loop over l
ENDDO
IF (.not.smat%l_real) THEN
smat%data_c(:kj_end,kii)=smat%data_c(:kj_end,kii) + chi*cph(:kj_end) * VecHelpS(:kj_end)
hmat%data_c(:kj_end,kii)=hmat%data_c(:kj_end,kii) + chi*cph(:kj_end) * VecHelpH(:kj_end)
ENDIF
!---> end loop over ki
ENDDO
!$OMP END DO
DEALLOCATE(plegend)
DEALLOCATE(cph)
DEALLOCATE(VecHelpS,VecHelpH)
!$OMP END PARALLEL
CALL timestop("spherical setup")
......
......@@ -8,17 +8,26 @@ MODULE m_types_lapw
USE m_judft
PRIVATE
TYPE t_lapw
INTEGER :: nv(2),num_local_cols(2)
INTEGER :: nv(2)
INTEGER :: num_local_cols(2)
INTEGER :: nv_tot
INTEGER :: nmat
INTEGER :: nlotot
INTEGER,ALLOCATABLE:: k1(:,:)
INTEGER,ALLOCATABLE:: k2(:,:)
INTEGER,ALLOCATABLE:: k3(:,:)
#ifdef CPP_GPU
INTEGER,ALLOCATABLE,MANAGED:: gvec(:,:,:) !replaces k1,k2,k3
#else
INTEGER,ALLOCATABLE:: gvec(:,:,:) !replaces k1,k2,k3
#endif
INTEGER,ALLOCATABLE:: kp(:,:)
REAL,ALLOCATABLE::rk(:,:)
#ifdef CPP_GPU
REAL,ALLOCATABLE,MANAGED::gk(:,:,:)
#else
REAL,ALLOCATABLE::gk(:,:,:)
#endif
REAL,ALLOCATABLE::vk(:,:,:)
INTEGER,ALLOCATABLE::matind(:,:)
INTEGER,ALLOCATABLE::index_lo(:,:)
......
......@@ -138,7 +138,11 @@ MODULE m_types_setup
!Calaculate forces for this atom?
LOGICAL,ALLOCATABLE :: l_geo(:)
!MT-Radius (ntype)
#ifdef CPP_GPU
REAL,ALLOCATABLE,MANAGED::rmt(:)
#else
REAL,ALLOCATABLE::rmt(:)
#endif
!log increment(ntype)
REAL,ALLOCATABLE::dx(:)
!vol of MT(ntype)
......@@ -152,7 +156,11 @@ MODULE m_types_setup
!pos of atom (absol) (3,nat)
REAL,ALLOCATABLE::pos(:,:)
!pos of atom (relat)(3,nat)
#ifdef CPP_GPU
REAL,ALLOCATABLE,MANAGED::taual(:,:)
#else
REAL,ALLOCATABLE::taual(:,:)
#endif
!labels
CHARACTER(LEN=20), ALLOCATABLE :: label(:)
CHARACTER(len=20), ALLOCATABLE :: speciesName(:)
......
......@@ -6,11 +6,19 @@
MODULE m_types_usdus
TYPE t_usdus
#ifdef CPP_GPU
REAL,ALLOCATABLE,DIMENSION(:,:,:),MANAGED :: us
REAL,ALLOCATABLE,DIMENSION(:,:,:),MANAGED :: dus
REAL,ALLOCATABLE,DIMENSION(:,:,:),MANAGED :: uds
REAL,ALLOCATABLE,DIMENSION(:,:,:),MANAGED :: duds !(0:lmaxd,ntype,jspd)
REAL,ALLOCATABLE,DIMENSION(:,:,:),MANAGED :: ddn !(0:lmaxd,ntype,jspd)
#else
REAL,ALLOCATABLE,DIMENSION(:,:,:) :: us
REAL,ALLOCATABLE,DIMENSION(:,:,:) :: dus
REAL,ALLOCATABLE,DIMENSION(:,:,:) :: uds
REAL,ALLOCATABLE,DIMENSION(:,:,:) :: duds !(0:lmaxd,ntype,jspd)
REAL,ALLOCATABLE,DIMENSION(:,:,:) :: ddn !(0:lmaxd,ntype,jspd)
#endif
REAL,ALLOCATABLE,DIMENSION(:,:,:) :: ulos
REAL,ALLOCATABLE,DIMENSION(:,:,:) :: dulos
REAL,ALLOCATABLE,DIMENSION(:,:,:) :: uulon
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment