Commit c019e52f authored by Uliana Alekseeva

bugfix in hsmt_sph_gpu

parent cff660e8
@@ -18,11 +18,12 @@ MODULE m_hsmt_sph
CONTAINS
#ifdef CPP_GPU
ATTRIBUTES(global) SUBROUTINE HsmtSphGpuKernel_real(grid,block,iintsp,jintsp,nv,lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,&
ATTRIBUTES(global)&
SUBROUTINE HsmtSphGpuKernel_real(loop_size,iintsp,jintsp,nv,lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,&
lnonsph,qssbti,qssbtj,gvec,gk,fleg1,fleg2,fl2p1,fl2p1bt,fj,gj,taual,ddn,el,e_shift,&
smat_data,hmat_data,&
uds,dus,us,duds,rmt)
INTEGER, VALUE, INTENT(IN) :: grid, block
INTEGER, VALUE, INTENT(IN) :: loop_size
INTEGER,VALUE,INTENT(IN) :: iintsp,jintsp,lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,lnonsph
REAL,INTENT(IN) :: qssbti(3),qssbtj(3)
INTEGER,INTENT(IN) :: gvec(:,:,:),nv(2)
@@ -41,95 +42,86 @@ CONTAINS
!-APW
REAL, PARAMETER :: tpi_const=2.*3.1415926535897932
REAL, ALLOCATABLE :: plegend(:,:)
COMPLEX, ALLOCATABLE :: cph(:)
REAL, ALLOCATABLE :: plegend(:)
REAL cph
REAL tnn(3), elall,fct,fct2,fjkiln,gjkiln,ddnln,ski(3)
REAL apw_lo1,apw_lo2,apw1,w1
INTEGER kii,ki,kj,l,nn,k
INTEGER :: loop_start, loop_end, i, loop_size
INTEGER :: loop_start, loop_end, i
ALLOCATE(cph(MAXVAL(nv)))
ALLOCATE(plegend(MAXVAL(nv),0:lmaxd))
ALLOCATE(plegend(0:lmaxd))
plegend=0.0
plegend(:,0)=1.0
plegend(0)=1.0
k = (blockidx%x-1)*blockdim%x + threadidx%x
!TODO!!!
!for seq, i.e. ki_start = 1, ki_step = 1
loop_size = max(ki_end/(grid*block),1)
if (loop_size * grid*block < ki_end) loop_size = loop_size + 1
loop_start = (k-1) * loop_size + 1
loop_end = loop_start + loop_size - 1
if (loop_end > ki_end ) loop_end = ki_end
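! Each thread (global index k) thus owns the contiguous chunk
! ki = (k-1)*loop_size+1 .. MIN(k*loop_size, ki_end); loop_size is now
! fixed once on the host instead of being recomputed from grid and block
! inside every thread. As the TODO above notes, this assumes ki_start = 1
! and ki_step = 1.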
DO ki = loop_start,loop_end,ki_step
!DO ki = ki_start,ki_end,ki_step
kii=(ki-1)/ki_step+1
ski = gvec(:,ki,jintsp) + qssbti
!---> legendre polynomials
DO kj = 1,ki
plegend(kj,1) = DOT_PRODUCT(gk(:,kj,iintsp),gk(:,ki,jintsp))
END DO
DO l = 1,lmax - 1
plegend(:ki,l+1) = fleg1(l)*plegend(:ki,1)*plegend(:ki,l) - fleg2(l)*plegend(:ki,l-1)
END DO
!---> set up phase factors
cph = 0.0
DO nn = nn_start,nn_end
tnn = tpi_const*taual(:,nn)
DO kj = 1,ki
cph(kj) = cph(kj) +&
CMPLX(COS(DOT_PRODUCT(ski-gvec(:,kj,iintsp)-qssbtj,tnn)),&
SIN(DOT_PRODUCT(gvec(:,kj,iintsp)+qssbtj-ski,tnn)))
! IF (iintsp.NE.jintsp) cph(kj)=CONJG(cph(kj))
kii=(ki-1)/ki_step+1
ski = gvec(:,ki,jintsp) + qssbti
!---> set up phase factors
cph = 0.0
DO nn = nn_start,nn_end
tnn = tpi_const*taual(:,nn)
cph = cph + COS(DOT_PRODUCT(ski-gvec(:,kj,iintsp)-qssbtj,tnn))
! IF (iintsp.NE.jintsp) cph(kj)=CONJG(cph(kj))
ENDDO
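! For this (ki,kj) pair, cph accumulates the cosine part of the phase
! factor: the sum over the equivalent atoms nn_start..nn_end of
! exp( i*2*pi*(G(ki)+qssbti-G(kj)-qssbtj)·taual(:,nn) ). Only the real
! part is kept here because smat_data/hmat_data are real.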
!---> legendre polynomials
plegend(1) = DOT_PRODUCT(gk(:,kj,iintsp),gk(:,ki,jintsp))
DO l = 1,lmax - 1
plegend(l+1) = fleg1(l)*plegend(1)*plegend(l) - fleg2(l)*plegend(l-1)
END DO
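! Bonnet's recurrence for the Legendre polynomials in
! x = gk(kj)·gk(ki):  P(l+1) = ( (2l+1)*x*P(l) - l*P(l-1) ) / (l+1),
! i.e. fleg1(l) = (2l+1)/(l+1) (set up in the host routine below) and,
! presumably, fleg2(l) = l/(l+1).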
END DO
!---> update overlap and l-diagonal hamiltonian matrix
DO l = 0,lmax
!+APW
IF (PRESENT(uds)) THEN
w1 = 0.5 * ( uds(l)*dus(l) + us(l)*duds(l) )
apw_lo1 = fl2p1(l) * 0.5 * rmt**2 * ( gjkiln * w1 +&
DO l = 0,lmax
fjkiln = fj(ki,l,jintsp)
gjkiln = gj(ki,l,jintsp)
!+APW
IF (PRESENT(uds)) THEN
w1 = 0.5 * ( uds(l)*dus(l) + us(l)*duds(l) )
apw_lo1 = fl2p1(l) * 0.5 * rmt**2 * ( gjkiln * w1 +&
fjkiln * us(l) * dus(l) )
apw_lo2 = fl2p1(l) * 0.5 * rmt**2 * ( fjkiln * w1 +&
apw_lo2 = fl2p1(l) * 0.5 * rmt**2 * ( fjkiln * w1 +&
gjkiln * uds(l) * duds(l) )
ENDIF
!-APW
fjkiln = fj(ki,l,jintsp)
gjkiln = gj(ki,l,jintsp)
ddnln = ddn(l)
elall = el(l)
IF (l<=lnonsph) elall=elall-e_shift!(isp)
DO kj = 1,ki
fct = plegend(kj,l)*fl2p1(l)*&
ENDIF
!-APW
ddnln = ddn(l)
elall = el(l)
IF (l<=lnonsph) elall=elall-e_shift!(isp)
!DO kj = 1,ki
fct = plegend(l)*fl2p1(l)*&
( fjkiln*fj(kj,l,iintsp) + gjkiln*gj(kj,l,iintsp)*ddnln )
fct2 = plegend(kj,l)*fl2p1bt(l) * ( fjkiln*gj(kj,l,iintsp) + gjkiln*fj(kj,l,iintsp) )
fct2 = plegend(l)*fl2p1bt(l) * ( fjkiln*gj(kj,l,iintsp) + gjkiln*fj(kj,l,iintsp) )
smat_data(kj,kii)=smat_data(kj,kii)+REAL(cph(kj))*fct
hmat_data(kj,kii)=hmat_data(kj,kii) + REAL(cph(kj)) * ( fct * elall + fct2)
!+APW
IF (PRESENT(uds)) THEN
apw1 = REAL(cph(kj)) * plegend(kj,l) * &
smat_data(kj,kii)=smat_data(kj,kii)+cph*fct
hmat_data(kj,kii)=hmat_data(kj,kii) + cph * ( fct * elall + fct2)
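! fct carries the overlap contribution of this l channel and
! fct*elall + fct2 the matching l-diagonal Hamiltonian term, both
! weighted by the phase factor cph.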
!+APW
IF (PRESENT(uds)) THEN
apw1 = cph * plegend(l) * &
( apw_lo1 * fj(kj,l,iintsp) + apw_lo2 * gj(kj,l,iintsp) )
hmat_data(kj,kii)=hmat_data(kj,kii) + apw1
ENDIF
!-APW
ENDIF
!-APW
!ENDDO
!---> end loop over l
ENDDO
!---> end loop over l
ENDDO
!---> end loop over ki
ENDDO
DEALLOCATE(plegend)
DEALLOCATE(cph)
END SUBROUTINE HsmtSphGpuKernel_real
ATTRIBUTES(global) SUBROUTINE HsmtSphGpuKernel_cmplx(grid,block,iintsp,jintsp,nv,lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,&
ATTRIBUTES(global)&
SUBROUTINE HsmtSphGpuKernel_cmplx(loop_size,iintsp,jintsp,nv,lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,&
lnonsph,chi,qssbti,qssbtj,gvec,gk,fleg1,fleg2,fl2p1,fl2p1bt,fj,gj,taual,ddn,el,e_shift,&
smat_data,hmat_data,&
uds,dus,us,duds,rmt)
INTEGER, VALUE, INTENT(IN) :: grid, block
INTEGER, VALUE, INTENT(IN) :: loop_size
INTEGER, VALUE, INTENT(IN) :: iintsp,jintsp,lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,lnonsph
COMPLEX, VALUE, INTENT(IN) :: chi
REAL,INTENT(IN) :: qssbti(3),qssbtj(3)
@@ -149,97 +141,81 @@ CONTAINS
!-APW
REAL, PARAMETER :: tpi_const=2.*3.1415926535897932
REAL, ALLOCATABLE :: plegend(:,:)
REAL, ALLOCATABLE :: VecHelpS(:),VecHelpH(:)
COMPLEX, ALLOCATABLE :: cph(:)
REAL, ALLOCATABLE :: plegend(:)
COMPLEX :: cph
REAL apw_lo1,apw_lo2,w1
COMPLEX capw1
REAL tnn(3), elall,fct,fct2,fjkiln,gjkiln,ddnln,ski(3)
INTEGER kii,ki,kj,l,nn,kj_end,k
INTEGER :: loop_start, loop_end, i, loop_size
INTEGER kii,ki,kj,kjj,l,nn,kj_end,k
INTEGER :: loop_start, loop_end, i
ALLOCATE(cph(MAXVAL(nv)))
ALLOCATE(plegend(MAXVAL(nv),0:lmaxd))
ALLOCATE(plegend(0:lmaxd))
plegend=0.0
plegend(:,0)=1.0
ALLOCATE(VecHelpS(MAXVAL(nv)),VecHelpH(MAXVAL(nv)))
plegend(0)=1.0
k = (blockidx%x-1)*blockdim%x + threadidx%x
!TODO!!!
!for seq, i.e. ki_start = 1, ki_step = 1
loop_size = max(ki_end/(grid*block),1)
if (loop_size * grid*block < ki_end) loop_size = loop_size + 1
loop_start = (k-1) * loop_size + 1
loop_end = loop_start + loop_size - 1
if (loop_end > ki_end ) loop_end = ki_end
DO ki = loop_start,loop_end,ki_step
!DO ki = ki_start,ki_end,ki_step
kii=(ki-1)/ki_step+1
ski = gvec(:,ki,jintsp) + qssbti
!---> legendre polynomials
DO kj = 1,ki
plegend(kj,1) = DOT_PRODUCT(gk(:,kj,iintsp),gk(:,ki,jintsp))
END DO
DO l = 1,lmax - 1
plegend(:ki,l+1) = fleg1(l)*plegend(:ki,1)*plegend(:ki,l) - fleg2(l)*plegend(:ki,l-1)
END DO
!---> set up phase factors
cph = 0.0
DO nn = nn_start,nn_end
tnn = tpi_const*taual(:,nn)
DO kj = 1,ki
cph(kj) = cph(kj) +&
kj_end = MIN(ki,nv(iintsp))
DO kj = 1,kj_end
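! Capping kj at nv(iintsp) presumably guards the case iintsp /= jintsp,
! where the two interstitial spins can carry different basis-set sizes nv.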
kii=(ki-1)/ki_step+1
ski = gvec(:,ki,jintsp) + qssbti
!---> set up phase factors
cph = 0.0
DO nn = nn_start,nn_end
tnn = tpi_const*taual(:,nn)
cph = cph +&
CMPLX(COS(DOT_PRODUCT(ski-gvec(:,kj,iintsp)-qssbtj,tnn)),&
SIN(DOT_PRODUCT(gvec(:,kj,iintsp)+qssbtj-ski,tnn)))
! IF (iintsp.NE.jintsp) cph(kj)=CONJG(cph(kj))
! IF (iintsp.NE.jintsp) cph(kj)=CONJG(cph(kj))
ENDDO
!---> legendre polynomials
plegend(1) = DOT_PRODUCT(gk(:,kj,iintsp),gk(:,ki,jintsp))
DO l = 1,lmax - 1
plegend(l+1) = fleg1(l)*plegend(1)*plegend(l) - fleg2(l)*plegend(l-1)
END DO
END DO
!---> update overlap and l-diagonal hamiltonian matrix
kj_end = MIN(ki,nv(iintsp))
VecHelpS = 0.d0
VecHelpH = 0.d0
DO l = 0,lmax
!+APW
IF (PRESENT(uds)) THEN
w1 = 0.5 * ( uds(l)*dus(l) + us(l)*duds(l) )
apw_lo1 = fl2p1(l) * 0.5 * rmt**2 * ( gjkiln * w1 +&
fjkiln * us(l) * dus(l) )
apw_lo2 = fl2p1(l) * 0.5 * rmt**2 * ( fjkiln * w1 +&
gjkiln * uds(l) * duds(l) )
ENDIF
!-APW
fjkiln = fj(ki,l,jintsp)
gjkiln = gj(ki,l,jintsp)
ddnln = ddn(l)
elall = el(l)
IF (l<=lnonsph) elall=elall-e_shift!(isp)
DO kj = 1,kj_end
fct = plegend(kj,l)*fl2p1(l)*&
DO l = 0,lmax
fjkiln = fj(ki,l,jintsp)
gjkiln = gj(ki,l,jintsp)
!+APW
IF (PRESENT(uds)) THEN
w1 = 0.5 * ( uds(l)*dus(l) + us(l)*duds(l) )
apw_lo1 = fl2p1(l) * 0.5 * rmt**2 * ( gjkiln * w1 +&
fjkiln * us(l) * dus(l) )
apw_lo2 = fl2p1(l) * 0.5 * rmt**2 * ( fjkiln * w1 +&
gjkiln * uds(l) * duds(l) )
ENDIF
!-APW
ddnln = ddn(l)
elall = el(l)
IF (l<=lnonsph) elall=elall-e_shift!(isp)
fct = plegend(l)*fl2p1(l)*&
( fjkiln*fj(kj,l,iintsp) + gjkiln*gj(kj,l,iintsp)*ddnln )
fct2 = plegend(kj,l)*fl2p1bt(l) * ( fjkiln*gj(kj,l,iintsp) + gjkiln*fj(kj,l,iintsp) )
fct2 = plegend(l)*fl2p1bt(l) * ( fjkiln*gj(kj,l,iintsp) + gjkiln*fj(kj,l,iintsp) )
VecHelpS(kj) = VecHelpS(kj) + fct
VecHelpH(kj) = VecHelpH(kj) + fct*elall + fct2
!+APW
IF (PRESENT(uds)) THEN
capw1 = cph(kj)*plegend(kj,l)&
smat_data(kj,kii)=smat_data(kj,kii) + chi * cph * fct
hmat_data(kj,kii)=hmat_data(kj,kii) + chi * cph * ( fct * elall + fct2)
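! Same overlap/Hamiltonian update as in the real kernel, additionally
! weighted by the complex prefactor chi and a now-complex phase cph.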
!+APW
IF (PRESENT(uds)) THEN
capw1 = cph*plegend(l)&
* ( apw_lo1 * fj(kj,l,iintsp) + apw_lo2 * gj(kj,l,iintsp) )
hmat_data(kj,kii)=hmat_data(kj,kii) + capw1
ENDIF
!-APW
END DO
!---> end loop over l
ENDIF
!-APW
!---> end loop over l
ENDDO
ENDDO
smat_data(:kj_end,kii)=smat_data(:kj_end,kii) + chi*cph(:kj_end) * VecHelpS(:kj_end)
hmat_data(:kj_end,kii)=hmat_data(:kj_end,kii) + chi*cph(:kj_end) * VecHelpH(:kj_end)
!---> end loop over ki
ENDDO
DEALLOCATE(plegend)
DEALLOCATE(cph)
DEALLOCATE(VecHelpS,VecHelpH)
END SUBROUTINE HsmtSphGpuKernel_cmplx
@@ -267,7 +243,7 @@ CONTAINS
! ..
! .. Local Scalars ..
INTEGER l
INTEGER :: grid, block
INTEGER :: grid, block, loop_size
! ..
! .. Local Arrays ..
@@ -278,6 +254,7 @@ CONTAINS
call nvtxStartRange("hsmt_sph",2)
CALL timestart("spherical setup")
print*, "HsmtSph_GPU"
DO l = 0,atoms%lmaxd
fleg1(l) = REAL(l+l+1)/REAL(l+1)
@@ -290,20 +267,25 @@ CONTAINS
! pretty ugly solution
nv_dev = lapw%nv
block = 256
grid = lapw%nv(jintsp)/(block*4) + 1
loop_size = 1
block = 32 ! number of threads in a block
grid = ceiling(real(lapw%nv(jintsp))/(loop_SIZE*block))
!loop_size = max(lapw%nv(jintsp)/(grid*block),1) !number of iterations performed by each thread
!if (loop_size * grid*block < lapw%nv(jintsp)) loop_size = loop_size + 1
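! With grid = CEILING(nv/(loop_size*block)) we get
! grid*block*loop_size >= lapw%nv(jintsp), so every ki is covered.
! E.g. (assumed numbers) nv = 1000, loop_size = 1, block = 32 gives
! grid = 32, i.e. 1024 threads, of which the last 24 find
! loop_start > ki_end and simply do nothing.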
IF (input%l_useapw) THEN
!TODO!!!!
! APW case is not tested
IF (smat%l_real) THEN
CALL HsmtSphGpuKernel_real<<<grid,block>>>(grid,block,iintsp,jintsp,nv_dev,atoms%lmaxd,atoms%lmax(n),mpi%n_rank+1,&
CALL HsmtSphGpuKernel_real<<<grid,block>>>(loop_size,iintsp,jintsp,nv_dev,&
atoms%lmaxd,atoms%lmax(n),mpi%n_rank+1,&
lapw%nv(jintsp), mpi%n_size,SUM(atoms%neq(:n-1))+1,SUM(atoms%neq(:n)),atoms%lnonsph(n),&
qssbti,qssbtj,lapw%gvec,lapw%gk,fleg1,fleg2,fl2p1,fl2p1bt,fj,gj,atoms%taual,&
usdus%ddn(:,n,isp),el(:,n,isp),e_shift,&
smat%data_r,hmat%data_r,&
usdus%uds(:,n,isp),usdus%dus(:,n,isp),usdus%us(:,n,isp),usdus%duds(:,n,isp),atoms%rmt(n))
ELSE
CALL HsmtSphGpuKernel_cmplx<<<grid,block>>>(grid,block,iintsp,jintsp,nv_dev,atoms%lmaxd,atoms%lmax(n),mpi%n_rank+1,&
CALL HsmtSphGpuKernel_cmplx<<<grid,block>>>(loop_size,iintsp,jintsp,nv_dev,&
atoms%lmaxd,atoms%lmax(n),mpi%n_rank+1,&
lapw%nv(jintsp), mpi%n_size,SUM(atoms%neq(:n-1))+1,SUM(atoms%neq(:n)),atoms%lnonsph(n),&
chi,qssbti,qssbtj,lapw%gvec,lapw%gk,fleg1,fleg2,fl2p1,fl2p1bt,fj,gj,atoms%taual,&
usdus%ddn(:,n,isp),el(:,n,isp),e_shift,&
@@ -312,12 +294,14 @@ CONTAINS
ENDIF
ELSE
IF (smat%l_real) THEN
CALL HsmtSphGpuKernel_real<<<grid,block>>>(grid,block,iintsp,jintsp,nv_dev,atoms%lmaxd,atoms%lmax(n),mpi%n_rank+1,&
CALL HsmtSphGpuKernel_real<<<grid,block>>>(loop_size,iintsp,jintsp,nv_dev,&
atoms%lmaxd,atoms%lmax(n),mpi%n_rank+1,&
lapw%nv(jintsp), mpi%n_size,SUM(atoms%neq(:n-1))+1,SUM(atoms%neq(:n)),atoms%lnonsph(n),&
qssbti,qssbtj,lapw%gvec,lapw%gk,fleg1,fleg2,fl2p1,fl2p1bt,fj,gj,atoms%taual,&
usdus%ddn(:,n,isp),el(:,n,isp),e_shift,smat%data_r,hmat%data_r)
ELSE
CALL HsmtSphGpuKernel_cmplx<<<grid,block>>>(grid,block,iintsp,jintsp,nv_dev,atoms%lmaxd,atoms%lmax(n),mpi%n_rank+1,&
CALL HsmtSphGpuKernel_cmplx<<<grid,block>>>(loop_size,iintsp,jintsp,nv_dev,&
atoms%lmaxd,atoms%lmax(n),mpi%n_rank+1,&
lapw%nv(jintsp), mpi%n_size,SUM(atoms%neq(:n-1))+1,SUM(atoms%neq(:n)),atoms%lnonsph(n),&
chi,qssbti,qssbtj,lapw%gvec,lapw%gk,fleg1,fleg2,fl2p1,fl2p1bt,fj,gj,atoms%taual,&
usdus%ddn(:,n,isp),el(:,n,isp),e_shift,smat%data_c,hmat%data_c)