Commit 10a7d4db authored by Stefan Rost

Merge branch 'develop' of iffgit.fz-juelich.de:fleur/fleur into develop

parents 9ed3bc96 f72baf97
......@@ -49,24 +49,24 @@ CONTAINS
! ..
! .. Allocatable Arrays ..
REAL, ALLOCATABLE :: f(:,:,:,:),g(:,:,:,:)
COMPLEX, ALLOCATABLE :: rho21(:,:,:)
COMPLEX :: rho21
!
CALL timestart("cdnmt")
IF (noco%l_mperp) THEN
IF (denCoeffsOffdiag%l_fmpl) THEN
ALLOCATE ( rho21(atoms%jmtd,0:sphhar%nlhd,atoms%ntype) )
rho21(:,:,:) = cmplx(0.0,0.0)
!ALLOCATE ( rho21(atoms%jmtd,0:sphhar%nlhd,atoms%ntype) )
rho(:,:,:,3:4) = CMPLX(0.0,0.0)
ENDIF
ENDIF
!$OMP PARALLEL DEFAULT(none) &
!$OMP SHARED(usdus,rho,moments,rho21,qmtl) &
!$OMP SHARED(usdus,rho,moments,qmtl) &
!$OMP SHARED(atoms,jsp_start,jsp_end,enpara,vr,denCoeffs,sphhar)&
!$OMP SHARED(orb,noco,denCoeffsOffdiag,jspd)&
!$OMP PRIVATE(itype,na,ispin,l,f,g,nodeu,noded,wronk,i,j,s,qmtllo,qmtt,nd,lh,lp,llp,cs)
!$OMP PRIVATE(itype,na,ispin,l,rho21,f,g,nodeu,noded,wronk,i,j,s,qmtllo,qmtt,nd,lh,lp,llp,cs)
IF (noco%l_mperp) THEN
ALLOCATE ( f(atoms%jmtd,2,0:atoms%lmaxd,jspd),g(atoms%jmtd,2,0:atoms%lmaxd,jspd) )
ELSE
......@@ -187,7 +187,10 @@ CONTAINS
+ denCoeffsOffdiag%ud21(l,itype)*( f(j,1,l,2)*g(j,1,l,1) +f(j,2,l,2)*g(j,2,l,1) )&
+ denCoeffsOffdiag%du21(l,itype)*( g(j,1,l,2)*f(j,1,l,1) +g(j,2,l,2)*f(j,2,l,1) )&
+ denCoeffsOffdiag%dd21(l,itype)*( g(j,1,l,2)*g(j,1,l,1) +g(j,2,l,2)*g(j,2,l,1) )
rho21(j,0,itype) = rho21(j,0,itype)+ conjg(cs)/(atoms%neq(itype)*sfp_const)
!rho21(j,0,itype) = rho21(j,0,itype)+ conjg(cs)/(atoms%neq(itype)*sfp_const)
rho21=CONJG(cs)/(atoms%neq(itype)*sfp_const)
rho(j,0,itype,3)=rho(j,0,itype,3)+REAL(rho21)
rho(j,0,itype,4)=rho(j,0,itype,4)+imag(rho21)
ENDDO
ENDDO
......@@ -203,7 +206,10 @@ CONTAINS
+ f(j,2,lp,2)*g(j,2,l,1) )+ denCoeffsOffdiag%dunmt21(llp,lh,itype)*(g(j,1,lp,2)*f(j,1,l,1)&
+ g(j,2,lp,2)*f(j,2,l,1) )+ denCoeffsOffdiag%ddnmt21(llp,lh,itype)*(g(j,1,lp,2)*g(j,1,l,1)&
+ g(j,2,lp,2)*g(j,2,l,1) )
rho21(j,lh,itype)= rho21(j,lh,itype)+ conjg(cs)/atoms%neq(itype)
!rho21(j,lh,itype)= rho21(j,lh,itype)+ CONJG(cs)/atoms%neq(itype)
rho21=CONJG(cs)/atoms%neq(itype)
rho(j,lh,itype,3)=rho(j,lh,itype,3)+REAL(rho21)
rho(j,lh,itype,4)=rho(j,lh,itype,4)+imag(rho21)
ENDDO
ENDDO
ENDDO
......@@ -241,15 +247,6 @@ CONTAINS
CALL timestop("cdnmt")
!---> for testing: to plot the offdiag. part of the density matrix it
!---> is written to the file rhomt21. This file can read in pldngen.
IF (denCoeffsOffdiag%l_fmpl) THEN
OPEN (26,file='rhomt21',form='unformatted',status='unknown')
WRITE (26) rho21
CLOSE (26)
DEALLOCATE ( rho21 )
ENDIF
!---> end of test output
END SUBROUTINE cdnmt
END MODULE m_cdnmt
......@@ -12,14 +12,17 @@
export FC=$MPIFC
export CC=$MPICC
fi
#ELPA
if [ $ELPA_MODULES ]
then
CLI_ELPA_OPENMP=1
FLEUR_LIBDIR="$FLEUR_LIBDIR $ELPA_LIB"
FLEUR_INCLUDEDIR="$FLEUR_INCLUDEDIR $ELPA_MODULES"
fi
elif module list 2>&1 | grep -q pgi
then
echo "found PGI compiler"
fi
fi
#ELPA
if [ $ELPA_MODULES ]
then
CLI_ELPA_OPENMP=1
FLEUR_LIBDIR="$FLEUR_LIBDIR $ELPA_LIB"
FLEUR_INCLUDEDIR="$FLEUR_INCLUDEDIR $ELPA_MODULES"
fi
......@@ -4,9 +4,12 @@ LINK_LIBRARIES ${FLEUR_LIBRARIES})
if (NOT FLEUR_USE_ELPA_ONENODE)
if (DEFINED CLI_ELPA_OPENMP)
message ("FLEUR_USE_GPU: ${FLEUR_USES_GPU}")
if (FLEUR_USE_GPU)
set(TEST_LIBRARIES "-lelpa_onenode;${FLEUR_LIBRARIES}")
else()
set(TEST_LIBRARIES "-lelpa_onenode_openmp;${FLEUR_LIBRARIES}")
#else()
# set(TEST_LIBRARIES "-lelpa;${FLEUR_LIBRARIES}")
endif()
endif()
try_compile(FLEUR_USE_ELPA_ONENODE ${CMAKE_BINARY_DIR} ${CMAKE_SOURCE_DIR}/cmake/tests/test_ELPA.f90
LINK_LIBRARIES ${TEST_LIBRARIES})
......
......@@ -11,9 +11,11 @@ if (CLI_FLEUR_USE_GPU)
elseif(${CLI_FLEUR_USE_GPU} MATCHES "cuda9.1")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=cuda9.1,cc60 -Mcuda=rdc -Mcudalib=cublas")
elseif(${CLI_FLEUR_USE_GPU} MATCHES "nvtx")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=cuda9.1,cc60 -Mcuda=rdc -Mcudalib=cublas -lnvToolsExt ")
#set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=cuda9.1,cc60,ptxinfo,lineinfo -Mcuda=rdc -Mcudalib=cublas -lnvToolsExt ")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=cuda9.1,cc60 -Mcuda=rdc -Mcudalib=cublas -lnvToolsExt ")
elseif(${CLI_FLEUR_USE_GPU} MATCHES "emu")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=emu -Mcudalib=cublas -Minfo=accel ")
#set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=emu -Mcudalib=cublas -Minfo=accel ")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=emu -Mcudalib=cublas ")
endif()
set(FLEUR_MPI_DEFINITIONS ${FLEUR_MPI_DEFINITIONS} "CPP_GPU" "CPP_MANAGED=,MANAGED")
set(FLEUR_DEFINITIONS ${FLEUR_DEFINITIONS} "CPP_GPU" "CPP_MANAGED=,MANAGED")
......
......@@ -45,6 +45,7 @@ CONTAINS
INTEGER :: kernel
CLASS(elpa_t),pointer :: elpa_obj
print*, "ELPA 20180525 started"
err = elpa_init(20180525)
elpa_obj => elpa_allocate()
......@@ -60,6 +61,9 @@ CONTAINS
CALL elpa_obj%set("local_ncols", hmat%matsize2, err)
CALL elpa_obj%set("nblk",hmat%matsize1, err)
CALL elpa_obj%set("blacs_context", -1, err)
#ifdef CPP_GPU
CALL elpa_obj%set("gpu",1,err)
#endif
err = elpa_obj%setup()
CALL hmat%add_transpose(hmat)
......
set(fleur_F77 ${fleur_F77}
)
set(fleur_F90 ${fleur_F90}
eigen/hsmt_mtNocoPot_offdiag.F90
eigen/eigen.F90
eigen/hlomat.F90
eigen/hs_int.F90
......
......@@ -70,15 +70,15 @@ CONTAINS
!$OMP MASTER
IF ((atoms%invsat(na).EQ.0) .OR. (atoms%invsat(na).EQ.1)) THEN
IF ((atoms%invsat(na) == 0) .OR. (atoms%invsat(na) == 1)) THEN
!---> if this atom is the first of two atoms related by inversion,
!---> the contributions to the overlap matrix of both atoms are added
!---> at once. where it is made use of the fact, that the sum of
!---> these contributions is twice the real part of the contribution
!---> of each atom. note, that in this case there are twice as many
!---> (2*(2*l+1)) k-vectors (compare abccoflo and comments there).
IF (atoms%invsat(na).EQ.0) invsfct = 1
IF (atoms%invsat(na).EQ.1) invsfct = 2
IF (atoms%invsat(na) == 0) invsfct = 1
IF (atoms%invsat(na) == 1) invsfct = 2
!
DO lo = 1,atoms%nlo(ntyp)
......@@ -127,7 +127,7 @@ CONTAINS
!+t3e
DO nkvec = 1,invsfct* (2*l+1)
locol= lapw%nv(jintsp)+lapw%index_lo(lo,na)+nkvec !this is the column of the matrix
IF (MOD(locol-1,mpi%n_size).EQ.mpi%n_rank) THEN
IF (MOD(locol-1,mpi%n_size) == mpi%n_rank) THEN
locol=(locol-1)/mpi%n_size+1 !this is the column in local storage
!-t3e
IF (hmat%l_real) THEN
......@@ -174,7 +174,7 @@ CONTAINS
!---> local orbitals at the same atom and with itself
DO nkvec = 1,invsfct* (2*l+1)
locol = lapw%nv(jintsp)+lapw%index_lo(lo,na)+nkvec !this is the column of the matrix
IF (MOD(locol-1,mpi%n_size).EQ.mpi%n_rank) THEN
IF (MOD(locol-1,mpi%n_size) == mpi%n_rank) THEN
locol=(locol-1)/mpi%n_size+1 !this is the column in local storage
!-t3e
!---> calculate the hamiltonian matrix elements with other
......
!--------------------------------------------------------------------------------
! Copyright (c) 2016 Peter Grünberg Institut, Forschungszentrum Jülich, Germany
! This file is part of FLEUR and available as free software under the conditions
! of the MIT license as expressed in the LICENSE file in more detail.
!--------------------------------------------------------------------------------
MODULE m_hsmt
USE m_juDFT
IMPLICIT NONE
......@@ -25,6 +30,7 @@ CONTAINS
USE m_hsmt_fjgj
USE m_hsmt_spinor
USE m_hsmt_soc_offdiag
USE m_hsmt_mtNocoPot_offdiag
USE m_hsmt_offdiag
IMPLICIT NONE
TYPE(t_mpi),INTENT(IN) :: mpi
......@@ -84,15 +90,18 @@ CONTAINS
!global spin-matrices.
CALL hmat_tmp%clear();CALL smat_tmp%clear()
CALL hsmt_sph(n,atoms,mpi,ispin,input,noco,1,1,chi_one,lapw,enpara%el0,td%e_shift(n,ispin),&
usdus,fj(:,0:,ispin,:),gj(:,0:,ispin,:),smat_tmp,hmat_tmp)
usdus,fj(:,0:,ispin,:),gj(:,0:,ispin,:),smat_tmp,hmat_tmp)
CALL hsmt_nonsph(n,mpi,sym,atoms,ispin,1,1,chi_one,noco,cell,lapw,td,&
fj(:,0:,ispin,:),gj(:,0:,ispin,:),hmat_tmp)
fj(:,0:,ispin,:),gj(:,0:,ispin,:),hmat_tmp)
CALL hsmt_lo(input,atoms,sym,cell,mpi,noco,lapw,usdus,td,fj(:,0:,ispin,:),gj(:,0:,ispin,:),&
n,chi_one,ispin,iintsp,jintsp,hmat_tmp,smat_tmp)
n,chi_one,ispin,iintsp,jintsp,hmat_tmp,smat_tmp)
CALL hsmt_spinor(ispin,n,noco,chi)
CALL hsmt_distspins(chi,smat_tmp,smat)
CALL hsmt_distspins(chi,hmat_tmp,hmat)
!Add off-diagonal contributions to Hamiltonian if needed
IF (ispin==1.AND.noco%l_mtNocoPot) THEN
CALL hsmt_mtNocoPot_offdiag(n,mpi,sym,atoms,noco,cell,lapw,td,fj,gj,hmat_tmp,hmat)
ENDIF
IF (ispin==1.and.noco%l_soc) &
CALL hsmt_soc_offdiag(n,atoms,mpi,noco,lapw,usdus,td,fj(:,0:,:,iintsp),gj(:,0:,:,iintsp),hmat)
IF (noco%l_constr) &
......
!--------------------------------------------------------------------------------
! Copyright (c) 2016 Peter Grünberg Institut, Forschungszentrum Jülich, Germany
! This file is part of FLEUR and available as free software under the conditions
! of the MIT license as expressed in the LICENSE file in more detail.
!--------------------------------------------------------------------------------
MODULE m_hsmt_mtNocoPot_offdiag
!Sets up the contribution of the local spin-off-diagonal (non-collinear)
!muffin-tin potential to the spin blocks of the Hamiltonian matrix.
USE m_juDFT
IMPLICIT NONE
CONTAINS
SUBROUTINE hsmt_mtNocoPot_offdiag(n,mpi,sym,atoms,noco,cell,lapw,td,fj,gj,hmat_tmp,hmat)
!Calculate the contribution from the local-spin-offdiagonal potential
!The following idea is used:
!Calculate the matrix by using non-spherical algorithm. This is done only once, since
!this sets up both the local spin-up-down and the spin-down-up part (it calculates the
!full matrix). So both can be updated from this matrix. But since the off-diagonal
!local potential is real we have to call the routine twice and use the chi_one factor
!to get the imaginary contribution
!
!Arguments:
!  n        : atom-type index, forwarded to the matrix-setup helpers
!  mpi,sym,atoms,noco,cell,lapw,td : FLEUR data types passed through unchanged
!  fj,gj    : radial matching coefficients; note the calls below pair spin
!             index 1 of fj with spin index 2 of gj to build the off-diagonal block
!  hmat_tmp : scratch matrix; cleared and overwritten twice in this routine
!  hmat     : 2x2 array of spin-block Hamiltonian matrices that is accumulated into
USE m_types
USE m_hsmt_nonsph
USE m_hsmt_distspins
USE m_hsmt_spinor
IMPLICIT NONE
TYPE(t_mpi),INTENT(IN) :: mpi
TYPE(t_sym),INTENT(IN) :: sym
TYPE(t_noco),INTENT(IN) :: noco
TYPE(t_cell),INTENT(IN) :: cell
TYPE(t_atoms),INTENT(IN) :: atoms
TYPE(t_lapw),INTENT(IN) :: lapw
TYPE(t_tlmplm),INTENT(IN) :: td
REAL,INTENT(IN) :: fj(:,0:,:,:),gj(:,0:,:,:)
! .. Scalar Arguments ..
INTEGER, INTENT (IN) :: n
!chi_one and chi(2,2) are local work variables (no INTENT): the scalar
!prefactor for hsmt_nonsph and the 2x2 spinor returned by hsmt_spinor.
COMPLEX :: chi_one,chi(2,2)
CLASS(t_mat),INTENT(INOUT) :: hmat(:,:),hmat_tmp
!--- pass 1: real part of the off-diagonal potential (prefactor 1) ---
chi_one=1.0
CALL hmat_tmp%clear()
!The spin1,2 matrix is calculated(real part of potential)
CALL hsmt_nonsph(n,mpi,sym,atoms,3,1,1,chi_one,noco,cell,lapw,td,&
fj(:,0:,1,:),gj(:,0:,2,:),hmat_tmp)
CALL hsmt_spinor(3,n,noco,chi) !spinor for off-diagonal part
CALL hsmt_distspins(chi,hmat_tmp,hmat)
!Hermitian conjugate of the (1,2) block; distributed with spinor index 4
!- presumably this yields the (2,1) spin block (indices 3/4 label the two
!off-diagonal components elsewhere in FLEUR) - TODO confirm.
CALL hmat_tmp%TRANSPOSE()
hmat_tmp%data_c=CONJG(hmat_tmp%data_c)
CALL hsmt_spinor(4,n,noco,chi) !spinor for off-diagonal part
CALL hsmt_distspins(chi,hmat_tmp,hmat)
!--- pass 2: imaginary part, obtained by rerunning with prefactor i ---
CALL hmat_tmp%clear()
!The spin1,2 matrix is calculated(imag part of potential)
chi_one=CMPLX(0.,1.)
CALL hsmt_nonsph(n,mpi,sym,atoms,4,1,1,chi_one,noco,cell,lapw,td,&
fj(:,0:,1,:),gj(:,0:,2,:),hmat_tmp)
CALL hsmt_spinor(3,n,noco,chi)
CALL hsmt_distspins(chi,hmat_tmp,hmat)
CALL hmat_tmp%TRANSPOSE()
hmat_tmp%data_c=CONJG(hmat_tmp%data_c)
CALL hsmt_spinor(4,n,noco,chi)
CALL hsmt_distspins(chi,hmat_tmp,hmat)
END SUBROUTINE hsmt_mtNocoPot_offdiag
END MODULE m_hsmt_mtNocoPot_offdiag
......@@ -197,7 +197,13 @@ CONTAINS
td%h_loc(0:,0:,n,isp),SIZE(td%h_loc,1),CMPLX(0.,0.),ab1,SIZE(ab1,1))
!ab1=MATMUL(ab(:lapw%nv(iintsp),:ab_size),td%h_loc(:ab_size,:ab_size,n,isp))
IF (iintsp==jintsp) THEN
CALL ZHERK("U","N",lapw%nv(iintsp),ab_size,Rchi,CONJG(ab1),SIZE(ab1,1),1.0,hmat%data_c,SIZE(hmat%data_c,1))
IF (isp<3) THEN
CALL ZHERK("U","N",lapw%nv(iintsp),ab_size,Rchi,CONJG(ab1),SIZE(ab1,1),1.0,hmat%data_c,SIZE(hmat%data_c,1))
ELSE !This is the case of a local off-diagonal contribution.
!It is not Hermitian, so we need to USE zgemm CALL
CALL zgemm("N","T",lapw%nv(iintsp),lapw%nv(jintsp),ab_size,chi,CONJG(ab),SIZE(ab,1),&
ab1,SIZE(ab1,1),CMPLX(1.0,0.0),hmat%data_c,SIZE(hmat%data_c,1))
ENDIF
ELSE !here the l_ss off-diagonal part starts
!Second set of ab is needed
CALL hsmt_ab(sym,atoms,noco,isp,iintsp,n,na,cell,lapw,fj,gj,ab,ab_size,.TRUE.)
......
......@@ -5,466 +5,465 @@
!--------------------------------------------------------------------------------
MODULE m_hsmt_sph
USE m_juDFT
IMPLICIT NONE
USE m_juDFT
IMPLICIT NONE
INTERFACE hsmt_sph
module procedure hsmt_sph_cpu
INTERFACE hsmt_sph
module procedure hsmt_sph_cpu
#ifdef CPP_GPU
module procedure hsmt_sph_gpu
module procedure hsmt_sph_gpu
#endif
END INTERFACE
END INTERFACE
CONTAINS
#ifdef CPP_GPU
ATTRIBUTES(global)&
SUBROUTINE HsmtSphGpuKernel_real(loop_size,iintsp,jintsp,nv,lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,&
lnonsph,qssbti,qssbtj,gvec,gk,fleg1,fleg2,fl2p1,fl2p1bt,fj,gj,taual,ddn,el,e_shift,&
smat_data,hmat_data,&
uds,dus,us,duds,rmt)
INTEGER, VALUE, INTENT(IN) :: loop_size
INTEGER,VALUE,INTENT(IN) :: iintsp,jintsp,lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,lnonsph
REAL,INTENT(IN) :: qssbti(3),qssbtj(3)
INTEGER,INTENT(IN) :: gvec(:,:,:),nv(2)
REAL,INTENT(IN) :: gk(:,:,:)
REAL,INTENT(IN) :: fleg1(0:lmaxd),fleg2(0:lmaxd),fl2p1(0:lmaxd)
REAL,INTENT(IN) :: fl2p1bt(0:lmaxd)
REAL,INTENT(IN) :: fj(:,0:,:),gj(:,0:,:)
REAL,INTENT(IN) :: taual(:,:)
REAL,INTENT(IN) :: ddn(0:lmaxd)
REAL,INTENT(IN) :: el(0:lmaxd)
REAL,VALUE,INTENT(IN) :: e_shift
REAL,INTENT(INOUT) :: smat_data(:,:),hmat_data(:,:)
!+APW
REAL,INTENT(IN),OPTIONAL :: uds(0:lmaxd),dus(0:lmaxd),us(0:lmaxd),duds(0:lmaxd)
REAL,INTENT(IN),OPTIONAL :: rmt
!-APW
REAL, PARAMETER :: tpi_const=2.*3.1415926535897932
REAL, ALLOCATABLE :: plegend(:)
REAL cph
REAL tnn(3), elall,fct,fct2,fjkiln,gjkiln,ddnln,ski(3)
REAL apw_lo1,apw_lo2,apw1,w1
INTEGER kii,ki,kj,l,nn,k
INTEGER :: loop_start, loop_end, i
ALLOCATE(plegend(0:lmaxd))
plegend=0.0
plegend(0)=1.0
k = (blockidx%x-1)*blockdim%x + threadidx%x
!TODO!!!
!for seq, i.e. ki_start = 1, ki_step = 1
loop_start = (k-1) * loop_size + 1
loop_end = loop_start + loop_size - 1
if (loop_end > ki_end ) loop_end = ki_end
DO ki = loop_start,loop_end,ki_step
!DO ki = ki_start,ki_end,ki_step
DO kj = 1,ki
kii=(ki-1)/ki_step+1
ski = gvec(:,ki,jintsp) + qssbti
!---> set up phase factors
cph = 0.0
DO nn = nn_start,nn_end
tnn = tpi_const*taual(:,nn)
cph = cph + COS(DOT_PRODUCT(ski-gvec(:,kj,iintsp)-qssbtj,tnn))
! IF (iintsp.NE.jintsp) cph(kj)=CONJG(cph(kj))
ENDDO
!---> legendre polynomials
plegend(1) = DOT_PRODUCT(gk(:,kj,iintsp),gk(:,ki,jintsp))
DO l = 1,lmax - 1
plegend(l+1) = fleg1(l)*plegend(1)*plegend(l) - fleg2(l)*plegend(l-1)
END DO
DO l = 0,lmax
fjkiln = fj(ki,l,jintsp)
gjkiln = gj(ki,l,jintsp)
!+APW
IF (PRESENT(uds)) THEN
ATTRIBUTES(global)&
SUBROUTINE HsmtSphGpuKernel_real(loop_size,iintsp,jintsp,nv,lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,&
lnonsph,qssbti,qssbtj,gvec,gk,fleg1,fleg2,fl2p1,fl2p1bt,fj,gj,taual,ddn,el,e_shift,&
smat_data,hmat_data,&
uds,dus,us,duds,rmt)
INTEGER, VALUE, INTENT(IN) :: loop_size
INTEGER,VALUE,INTENT(IN) :: iintsp,jintsp,lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,lnonsph
REAL,INTENT(IN) :: qssbti(3),qssbtj(3)
INTEGER,INTENT(IN) :: gvec(:,:,:),nv(2)
REAL,INTENT(IN) :: gk(:,:,:)
REAL,INTENT(IN) :: fleg1(0:lmaxd),fleg2(0:lmaxd),fl2p1(0:lmaxd)
REAL,INTENT(IN) :: fl2p1bt(0:lmaxd)
REAL,INTENT(IN) :: fj(:,0:,:),gj(:,0:,:)
REAL,INTENT(IN) :: taual(:,:)
REAL,INTENT(IN) :: ddn(0:lmaxd)
REAL,INTENT(IN) :: el(0:lmaxd)
REAL,VALUE,INTENT(IN) :: e_shift
REAL,INTENT(INOUT) :: smat_data(:,:),hmat_data(:,:)
!+APW
REAL,INTENT(IN),OPTIONAL :: uds(0:lmaxd),dus(0:lmaxd),us(0:lmaxd),duds(0:lmaxd)
REAL,INTENT(IN),OPTIONAL :: rmt
!-APW
REAL, PARAMETER :: tpi_const=2.*3.1415926535897932
REAL, ALLOCATABLE :: plegend(:)
REAL cph
REAL tnn(3), elall,fct,fct2,fjkiln,gjkiln,ddnln,ski(3)
REAL apw_lo1,apw_lo2,apw1,w1
INTEGER kii,ki,kj,l,nn,k
INTEGER :: loop_start, loop_end, i
ALLOCATE(plegend(0:lmaxd))
plegend=0.0
plegend(0)=1.0
k = (blockidx%x-1)*blockdim%x + threadidx%x
!TODO!!!
!for seq, i.e. ki_start = 1, ki_step = 1
loop_start = (k-1) * loop_size + 1
loop_end = loop_start + loop_size - 1
if (loop_end > ki_end ) loop_end = ki_end
DO ki = loop_start,loop_end,ki_step
!DO ki = ki_start,ki_end,ki_step
DO kj = 1,ki
kii=(ki-1)/ki_step+1
ski = gvec(:,ki,jintsp) + qssbti
!---> set up phase factors
cph = 0.0
DO nn = nn_start,nn_end
tnn = tpi_const*taual(:,nn)
cph = cph + COS(DOT_PRODUCT(ski-gvec(:,kj,iintsp)-qssbtj,tnn))
! IF (iintsp.NE.jintsp) cph(kj)=CONJG(cph(kj))
ENDDO
!---> legendre polynomials
plegend(1) = DOT_PRODUCT(gk(:,kj,iintsp),gk(:,ki,jintsp))
DO l = 1,lmax - 1
plegend(l+1) = fleg1(l)*plegend(1)*plegend(l) - fleg2(l)*plegend(l-1)
END DO
DO l = 0,lmax
fjkiln = fj(ki,l,jintsp)
gjkiln = gj(ki,l,jintsp)
!+APW
IF (PRESENT(uds)) THEN
w1 = 0.5 * ( uds(l)*dus(l) + us(l)*duds(l) )
apw_lo1 = fl2p1(l) * 0.5 * rmt**2 * ( gjkiln * w1 +&
fjkiln * us(l) * dus(l) )
fjkiln * us(l) * dus(l) )
apw_lo2 = fl2p1(l) * 0.5 * rmt**2 * ( fjkiln * w1 +&
gjkiln * uds(l) * duds(l) )
ENDIF
!-APW
ddnln = ddn(l)
elall = el(l)
IF (l<=lnonsph) elall=elall-e_shift!(isp)
!DO kj = 1,ki
fct = plegend(l)*fl2p1(l)*&
( fjkiln*fj(kj,l,iintsp) + gjkiln*gj(kj,l,iintsp)*ddnln )
fct2 = plegend(l)*fl2p1bt(l) * ( fjkiln*gj(kj,l,iintsp) + gjkiln*fj(kj,l,iintsp) )
smat_data(kj,kii)=smat_data(kj,kii)+cph*fct
hmat_data(kj,kii)=hmat_data(kj,kii) + cph * ( fct * elall + fct2)
!+APW
IF (PRESENT(uds)) THEN
apw1 = cph * plegend(l) * &
( apw_lo1 * fj(kj,l,iintsp) + apw_lo2 * gj(kj,l,iintsp) )
hmat_data(kj,kii)=hmat_data(kj,kii) + apw1
ENDIF
!-APW
!ENDDO
!---> end loop over l
ENDDO
ENDDO
!---> end loop over ki
ENDDO
DEALLOCATE(plegend)
END SUBROUTINE HsmtSphGpuKernel_real
ATTRIBUTES(global)&
SUBROUTINE HsmtSphGpuKernel_cmplx(loop_size,iintsp,jintsp,nv,lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,&
lnonsph,chi,qssbti,qssbtj,gvec,gk,fleg1,fleg2,fl2p1,fl2p1bt,fj,gj,taual,ddn,el,e_shift,&
smat_data,hmat_data,&
uds,dus,us,duds,rmt)
!CUDA-Fortran device kernel: each thread accumulates the spherical
!muffin-tin contributions into a slice of rows of the complex overlap
!(smat_data) and Hamiltonian (hmat_data) matrices.
!  loop_size   : number of ki rows handled per thread
!  iintsp/jintsp : spin indices selecting the gvec/gk/fj/gj columns
!  chi         : complex prefactor multiplied into every matrix update
!  qssbti/qssbtj : q-vector shifts added to / subtracted from the G-vectors
!  fj,gj       : radial matching coefficients per basis function and l
!  taual       : atom positions entering the structure-phase sum over nn
!  ddn,el,e_shift : norms and energy parameters for the H matrix elements
!  uds,dus,us,duds,rmt : OPTIONAL; when uds is PRESENT an extra APW
!                        correction term is added to hmat_data
!NOTE(review): ki_start is unused (see commented DO below) - the thread
!partitioning assumes ki_start=1, ki_step=1; flagged by the TODO.
INTEGER, VALUE, INTENT(IN) :: loop_size
INTEGER, VALUE, INTENT(IN) :: iintsp,jintsp,lmaxd,lmax,ki_start,ki_end,ki_step,nn_start,nn_end,lnonsph
COMPLEX, VALUE, INTENT(IN) :: chi
REAL,INTENT(IN) :: qssbti(3),qssbtj(3)
INTEGER,INTENT(IN) :: gvec(:,:,:),nv(2)
REAL,INTENT(IN) :: gk(:,:,:)
REAL,INTENT(IN) :: fleg1(0:lmaxd),fleg2(0:lmaxd),fl2p1(0:lmaxd)
REAL,INTENT(IN) :: fl2p1bt(0:lmaxd)
REAL,INTENT(IN) :: fj(:,0:,:),gj(:,0:,:)
REAL,INTENT(IN) :: taual(:,:)
REAL,INTENT(IN) :: ddn(0:lmaxd)
REAL,INTENT(IN) :: el(0:lmaxd)
REAL, VALUE,INTENT(IN) :: e_shift
COMPLEX,INTENT(INOUT) :: smat_data(:,:),hmat_data(:,:)
!+APW
REAL,INTENT(IN),OPTIONAL :: uds(0:lmaxd),dus(0:lmaxd),us(0:lmaxd),duds(0:lmaxd)
REAL,INTENT(IN),OPTIONAL :: rmt
!-APW
REAL, PARAMETER :: tpi_const=2.*3.1415926535897932
REAL, ALLOCATABLE :: plegend(:)
COMPLEX :: cph
REAL apw_lo1,apw_lo2,w1
COMPLEX capw1
REAL tnn(3), elall,fct,fct2,fjkiln,gjkiln,ddnln,ski(3)
INTEGER kii,ki,kj,kjj,l,nn,kj_end,k
INTEGER :: loop_start, loop_end, i
ALLOCATE(plegend(0:lmaxd))
plegend=0.0
plegend(0)=1.0
!global 1-based thread index over the launched grid
k = (blockidx%x-1)*blockdim%x + threadidx%x
!TODO!!!
!for seq, i.e. ki_start = 1, ki_step = 1
!each thread works on the contiguous row range [loop_start,loop_end]
loop_start = (k-1) * loop_size + 1
loop_end = loop_start + loop_size - 1
if (loop_end > ki_end ) loop_end = ki_end
DO ki = loop_start,loop_end,ki_step
!DO ki = ki_start,ki_end,ki_step
!only the lower triangle up to nv(iintsp) columns is filled
kj_end = MIN(ki,nv(iintsp))
DO kj = 1,kj_end
kii=(ki-1)/ki_step+1
ski = gvec(:,ki,jintsp) + qssbti
!---> set up phase factors
cph = 0.0
DO nn = nn_start,nn_end
tnn = tpi_const*taual(:,nn)
cph = cph +&
CMPLX(COS(DOT_PRODUCT(ski-gvec(:,kj,iintsp)-qssbtj,tnn)),&
SIN(DOT_PRODUCT(gvec(:,kj,iintsp)+qssbtj-ski,tnn)))
! IF (iintsp.NE.jintsp) cph(kj)=CONJG(cph(kj))
ENDDO
!---> legendre polynomials
!Bonnet recursion seeded with P0=1 and P1=cos(angle between the two k-vectors)
plegend(1) = DOT_PRODUCT(gk(:,kj,iintsp),gk(:,ki,jintsp))
DO l = 1,lmax - 1
plegend(l+1) = fleg1(l)*plegend(1)*plegend(l) - fleg2(l)*plegend(l-1)
END DO
DO l = 0,lmax
fjkiln = fj(ki,l,jintsp)
gjkiln = gj(ki,l,jintsp)
!+APW
IF (PRESENT(uds)) THEN
w1 = 0.5 * ( uds(l)*dus(l) + us(l)*duds(l) )
apw_lo1 = fl2p1(l) * 0.5 * rmt**2 * ( gjkiln * w1 +&
fjkiln * us(l) * dus(l) )
apw_lo2 = fl2p1(l) * 0.5 * rmt**2 * ( fjkiln * w1 +&
gjkiln * uds(l) * duds(l) )
ENDIF
!-APW
ddnln = ddn(l)
elall = el(l)
IF (l<=lnonsph) elall=elall-e_shift!(isp)
!fct enters both S and H; fct2 is the extra kinetic cross term in H
fct = plegend(l)*fl2p1(l)*&
( fjkiln*fj(kj,l,iintsp) + gjkiln*gj(kj,l,iintsp)*ddnln )
fct2 = plegend(l)*fl2p1bt(l) * ( fjkiln*gj(kj,l,iintsp) + gjkiln*fj(kj,l,iintsp) )
smat_data(kj,kii)=smat_data(kj,kii) + chi * cph * fct
hmat_data(kj,kii)=hmat_data(kj,kii) + chi * cph * ( fct * elall + fct2)
!+APW
IF (PRESENT(uds)) THEN
!NOTE(review): unlike the S/H updates above, capw1 is added without
!the chi factor - confirm this asymmetry is intended.
capw1 = cph*plegend(l)&
* ( apw_lo1 * fj(kj,l,iintsp) + apw_lo2 * gj(kj,l,iintsp) )
hmat_data(kj,kii)=hmat_data(kj,kii) + capw1
ENDIF
!-APW
!---> end loop over l
ENDDO
ENDDO
!---> end loop over ki
ENDDO
DEALLOCATE(plegend)
END SUBROUTINE HsmtSphGpuKernel_cmplx
SUBROUTINE hsmt_sph_gpu(n,atoms,mpi,isp,input,noco,iintsp,jintsp,chi,lapw,el,e_shift,usdus,fj,gj,smat,hmat)
USE m_constants, ONLY : fpi_const,tpi_const
USE m_types
USE nvtx
IMPLICIT NONE
TYPE(t_input),INTENT(IN) :: input
TYPE(t_mpi),INTENT(IN) :: mpi
TYPE(t_noco),INTENT(IN) :: noco
TYPE(t_atoms),INTENT(IN) :: atoms
TYPE(t_lapw),INTENT(IN) :: lapw
TYPE(t_usdus),INTENT(IN) :: usdus
CLASS(t_mat),INTENT(INOUT) :: smat,hmat
! ..
! .. Scalar Arguments ..
INTEGER, INTENT (IN) :: n,isp,iintsp,jintsp
COMPLEX, INTENT(IN) :: chi
! ..
! .. Array Arguments ..
REAL,MANAGED,INTENT (IN) :: el(0:atoms%lmaxd,atoms%ntype,input%jspins)
REAL, INTENT (IN) :: e_shift!(atoms%ntype,input%jspins)
REAL,MANAGED,INTENT(IN) :: fj(:,0:,:),gj(:,0:,:)
! ..
! .. Local Scalars ..
INTEGER l
INTEGER :: grid, block, loop_size
! ..
! .. Local Arrays ..
REAL,MANAGED :: fleg1(0:atoms%lmaxd),fleg2(0:atoms%lmaxd),fl2p1(0:atoms%lmaxd)
REAL,MANAGED :: fl2p1bt(0:atoms%lmaxd)
REAL,MANAGED :: qssbti(3),qssbtj(3)
INTEGER, DEVICE :: nv_dev(2)
call nvtxStartRange("hsmt_sph",2)
CALL timestart("spherical setup")
print*, "HsmtSph_GPU"
DO l = 0,atoms%lmaxd
fleg1(l) = REAL(l+l+1)/REAL(l+1)
fleg2(l) = REAL(l)/REAL(l+1)
fl2p1(l) = REAL(l+l+1)/fpi_const
fl2p1bt(l) = fl2p1(l)*0.5
END DO
qssbti=MERGE(- noco%qss/2,+ noco%qss/2,jintsp.EQ.1)
qssbtj=MERGE(- noco%qss/2,+ noco%qss/2,iintsp.EQ.1)
! pretty ugly solution
nv_dev = lapw%nv
loop_size = 1
block = 32 ! number of threads in a block