Commit d22a6489 authored by Daniel Wortmann

Merge branch 'develop' of iffgit.fz-juelich.de:fleur/fleur into develop

Conflicts:
	eigen/hsmt_ab.F90
parents 1d790398 1b2cda7e
@@ -93,8 +93,8 @@ CONTAINS
! Collect the four noco parts into a single matrix
! In collinear case only a copy is done
! In the parallel case also a redistribution happens
ALLOCATE(smat_final,source=smat(1,1))
ALLOCATE(hmat_final,source=smat(1,1))
ALLOCATE(smat_final,mold=smat(1,1))
ALLOCATE(hmat_final,mold=smat(1,1))
CALL eigen_redist_matrix(mpi,lapw,atoms,smat,smat_final)
CALL eigen_redist_matrix(mpi,lapw,atoms,hmat,hmat_final,smat_final)
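The change from source= to mold= above avoids an unneeded copy: ALLOCATE with source= gives the new object the dynamic type of the prototype and copies its value (including allocatable components), whereas mold= only adopts the dynamic type and shape and leaves the contents undefined. A minimal sketch of the distinction, using hypothetical local names but the same t_mat objects as above:

CLASS(t_mat),ALLOCATABLE :: mat_copy,mat_blank
ALLOCATE(mat_copy,source=smat(1,1)) !same dynamic type, value copied from smat(1,1)
ALLOCATE(mat_blank,mold=smat(1,1))  !same dynamic type, contents left undefined

mold= is sufficient here because eigen_redist_matrix fills smat_final and hmat_final anyway.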
......
@@ -19,8 +19,44 @@ CONTAINS
#ifdef _CUDA
ATTRIBUTES(global) SUBROUTINE synth_ab(grid,block,n,lmax,iintsp,ab_size,gkrot_dev,fj,gj,c_ph,ab)
USE m_ylm
INTEGER, VALUE, INTENT(IN) :: grid, block, n, lmax, iintsp,ab_size
REAL, DEVICE, INTENT(IN) :: gkrot_dev(:,:),fj(:,:,:),gj(:,:,:)
COMPLEX,DEVICE, INTENT(IN) :: c_ph(:,:)
COMPLEX,DEVICE, INTENT (OUT) :: ab(:,:)
COMPLEX,ALLOCATABLE :: ylm(:)
INTEGER :: k,l,ll1,m
INTEGER :: loop_start, loop_end, i, loop_size
ALLOCATE(ylm((lmax+1)**2))
k = (blockidx%x-1)*blockdim%x + threadidx%x
loop_size = max(n/(grid*block),1)
if (loop_size * grid*block < n) loop_size = loop_size + 1
loop_start = (k-1) * loop_size + 1
loop_end = loop_start + loop_size - 1
if (loop_end > n ) loop_end = n
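!each thread k thus works on a contiguous chunk of loop_size = ceiling(n/(grid*block))
!basis functions; threads whose chunk starts beyond n get loop_end < loop_start and
!skip the loop below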
DO i = loop_start,loop_end
!--> generate spherical harmonics
CALL ylm4_dev(lmax,gkrot_dev(:,i),ylm(:))
DO l = 0,lmax
ll1 = l* (l+1)
DO m = -l,l
ab(i,ll1+m+1) = CONJG(fj(i,l+1,iintsp)*c_ph(i,iintsp)*ylm(ll1+m+1))
ab(i,ll1+m+1+ab_size) = CONJG(gj(i,l+1,iintsp)*c_ph(i,iintsp)*ylm(ll1+m+1))
END DO
END DO
ENDDO
DEALLOCATE(ylm)
END SUBROUTINE synth_ab
SUBROUTINE hsmt_ab_gpu(sym,atoms,noco,ispin,iintsp,n,na,cell,lapw,fj,gj,ab,ab_size,l_nonsph,abclo,alo1,blo1,clo1)
!Calculate overlap matrix
!Calculate overlap matrix, GPU version
USE m_constants, ONLY : fpi_const,tpi_const
USE m_types
USE m_ylm
@@ -53,18 +89,13 @@ CONTAINS
REAL, ALLOCATABLE :: gkrot(:,:)
COMPLEX:: term
REAL, ALLOCATABLE,DEVICE :: fj_dev(:,:,:), gj_dev(:,:,:)
COMPLEX,ALLOCATABLE,DEVICE :: c_ph_dev(:,:)
COMPLEX,ALLOCATABLE,DEVICE :: ylm_dev(:,:)
REAL, ALLOCATABLE,DEVICE :: gkrot_dev(:,:)
INTEGER :: istat
INTEGER :: istat, grid, block
! call nvtxStartRange("hsmt_ab",2)
lmax=MERGE(atoms%lnonsph(n),atoms%lmax(n),l_nonsph)
ALLOCATE(c_ph_dev(lapw%nv(1),MERGE(2,1,noco%l_ss)))
ALLOCATE(ylm_dev((lmax+1)**2,lapw%nv(1)))
ALLOCATE(gkrot_dev(3,lapw%nv(1)))
ALLOCATE(ylm((lmax+1)**2,lapw%nv(1)))
@@ -97,31 +128,14 @@ CONTAINS
!--> synthesize the complex conjugates of a and b
!call nvtxStartRange("hsmt_synthAB",5)
istat = cudaDeviceSynchronize()
!!$cuf kernel do <<<*,256>>>
!DO k = 1,lapw%nv(1)
! !--> generate spherical harmonics
! CALL ylm4_dev(lmax,gkrot_dev(:,k),ylm_dev(:,k))
!ENDDO
DO k = 1,lapw%nv(1)
call ylm4(lmax,gkrot(:,k),ylm(:,k))
ENDDO
ylm_dev = ylm
! pretty ugly solution
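!launch configuration: with block=256 threads per block and grid = nv/(4*block)+1 blocks,
!each thread of synth_ab gets a chunk of roughly four basis functions (the loop_size
!computed inside the kernel)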
block = 256
grid = lapw%nv(1)/(block*4) + 1
CALL synth_ab<<<grid,block>>>(grid,block,lapw%nv(1),lmax,iintsp,ab_size,gkrot_dev,fj,gj,c_ph_dev,ab)
!call nvtxStartRange("hsmt_cuf",5)
!$cuf kernel do <<<*,256>>>
DO k = 1,lapw%nv(1)
!--> generate spherical harmonics
!CALL ylm4_dev(lmax,gkrot_dev(:,k),ylm_dev(:,k))
DO l = 0,lmax
ll1 = l* (l+1)
DO m = -l,l
ab(k,ll1+m+1) = CONJG(fj(k,l+1,iintsp)*c_ph_dev(k,iintsp)*ylm_dev(ll1+m+1,k))
ab(k,ll1+m+1+ab_size) = CONJG(gj(k,l+1,iintsp)*c_ph_dev(k,iintsp)*ylm_dev(ll1+m+1,k))
END DO
END DO
ENDDO !k-loop
istat = cudaDeviceSynchronize()
!call nvtxEndRange
@@ -150,12 +164,14 @@ CONTAINS
ab_size=ab_size*2
!call nvtxEndRange
DEALLOCATE(c_ph_dev)
DEALLOCATE(gkrot_dev)
END SUBROUTINE hsmt_ab_gpu
#endif
SUBROUTINE hsmt_ab_cpu(sym,atoms,noco,ispin,iintsp,n,na,cell,lapw,fj,gj,ab,ab_size,l_nonsph,abclo,alo1,blo1,clo1)
!Calculate overlap matrix
!Calculate overlap matrix, CPU version
USE m_constants, ONLY : fpi_const,tpi_const
USE m_types
USE m_ylm
......
@@ -27,27 +27,31 @@ CONTAINS
CLASS(t_mat),INTENT(INOUT) ::hmat
CALL timestart("non-spherical setup")
IF (mpi%n_size==1) THEN
#if defined (_CUDA)
CALL priv_noMPI_gpu(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,td,fj,gj,hmat)
#else
CALL priv_noMPI(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,td,fj,gj,hmat)
#endif
ELSE
CALL priv_MPI(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,td,fj,gj,hmat)
ENDIF
CALL timestop("non-spherical setup")
END SUBROUTINE hsmt_nonsph
SUBROUTINE priv_noMPI(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,td,fj,gj,hmat)
#if defined (_CUDA)
SUBROUTINE priv_noMPI_gpu(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,td,fj,gj,hmat)
!Calculate overlap matrix
USE m_hsmt_ab
USE m_constants, ONLY : fpi_const,tpi_const
USE m_types
USE m_ylm
#if defined (_CUDA)
! cublas: required to use generic BLAS interface
! cudafor: required to use CUDA runtime API routines (e.g.
! cudaDeviceSynchronize)
USE cublas
! cudafor: required to use CUDA runtime API routines
! nvtx: profiling
USE cublas
USE cudafor
USE nvtx
#endif
IMPLICIT NONE
TYPE(t_mpi),INTENT(IN) :: mpi
TYPE(t_sym),INTENT(IN) :: sym
@@ -67,21 +71,16 @@ CONTAINS
INTEGER:: nn,na,ab_size,l,ll,m
COMPLEX,ALLOCATABLE:: ab(:,:),ab1(:,:),ab2(:,:)
real :: rchi
#ifdef _CUDA
COMPLEX,ALLOCATABLE,DEVICE :: c_dev(:,:), ab1_dev(:,:), ab_dev(:,:), ab2_dev(:,:)
COMPLEX,ALLOCATABLE,DEVICE :: h_loc_dev(:,:)
REAL, ALLOCATABLE,DEVICE :: fj_dev(:,:,:), gj_dev(:,:,:)
integer :: i, j, istat
call nvtxStartRange("hsmt_nonsph",1)
#endif
ALLOCATE(ab(MAXVAL(lapw%nv),2*atoms%lmaxd*(atoms%lmaxd+2)+2),ab1(lapw%nv(jintsp),2*atoms%lmaxd*(atoms%lmaxd+2)+2))
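!the second dimension 2*atoms%lmaxd*(atoms%lmaxd+2)+2 = 2*(lmaxd+1)**2 holds the
!(lmaxd+1)**2 a-coefficients followed by the (lmaxd+1)**2 b-coefficients
!(hence the +ab_size offset used when ab is filled)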
#ifdef _CUDA
ALLOCATE(h_loc_dev(size(td%h_loc,1),size(td%h_loc,2)))
ALLOCATE(ab1_dev(size(ab1,1),size(ab1,2)))
ALLOCATE(ab_dev(size(ab,1),size(ab,2)))
ALLOCATE(ab1_dev(lapw%nv(jintsp),2*atoms%lmaxd*(atoms%lmaxd+2)+2))
ALLOCATE(ab_dev(MAXVAL(lapw%nv),2*atoms%lmaxd*(atoms%lmaxd+2)+2))
h_loc_dev(1:,1:) = CONJG(td%h_loc(0:,0:,n,isp)) !WORKAROUND, var_dev=CONJG(var_dev) does not work
ALLOCATE(fj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
ALLOCATE(gj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
@@ -89,12 +88,8 @@ CONTAINS
gj_dev(1:,1:,1:)= gj(1:,0:,1:)
!note that basically all matrices in the GPU version are conjugates of their
!cpu counterparts
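!(concretely: ab_dev=CONJG(ab) and h_loc_dev=CONJG(h_loc), so the zgemm below already
! yields ab1_dev=CONJG(ab1), matching the CONJG(ab1) the CPU branch passes to ZHERK
! explicitly; only the off-diagonal zgemm has to conjugate ab1_dev back, see the
! CUF kernel further down)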
#endif
IF (iintsp.NE.jintsp) ALLOCATE(ab2(lapw%nv(iintsp),2*atoms%lmaxd*(atoms%lmaxd+2)+2))
#ifdef _CUDA
IF (iintsp.NE.jintsp) ALLOCATE(ab2_dev(lapw%nv(iintsp),2*atoms%lmaxd*(atoms%lmaxd+2)+2))
#endif
IF (hmat%l_real) THEN
IF (ANY(SHAPE(hmat%data_c)/=SHAPE(hmat%data_r))) THEN
@@ -103,73 +98,119 @@ CONTAINS
ENDIF
hmat%data_c=0.0
ENDIF
#ifdef _CUDA
ALLOCATE(c_dev(SIZE(hmat%data_c,1),SIZE(hmat%data_c,2)))
c_dev = hmat%data_c
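!c_dev is a device-resident copy of the Hamiltonian block: the ZHERK/ZGEMM updates in the
!atom loop below accumulate into it and the result is copied back to hmat%data_c afterwards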
#endif
DO nn = 1,atoms%neq(n)
na = SUM(atoms%neq(:n-1))+nn
IF ((atoms%invsat(na)==0) .OR. (atoms%invsat(na)==1)) THEN
rchi=MERGE(REAL(chi),REAL(chi)*2,(atoms%invsat(na)==0))
#ifdef _CUDA
CALL hsmt_ab(sym,atoms,noco,isp,jintsp,n,na,cell,lapw,fj_dev,gj_dev,ab_dev,ab_size,.TRUE.)
! istat = cudaDeviceSynchronize()
#else
CALL hsmt_ab(sym,atoms,noco,isp,jintsp,n,na,cell,lapw,fj,gj,ab,ab_size,.TRUE.)
#endif
!Calculate Hamiltonian
#ifdef _CUDA
CALL zgemm("N","N",lapw%nv(jintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab_dev,SIZE(ab_dev,1),h_loc_dev,SIZE(h_loc_dev,1),CMPLX(0.,0.),ab1_dev,SIZE(ab1_dev,1))
#else
CALL zgemm("N","N",lapw%nv(jintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab,SIZE(ab,1),td%h_loc(0:,0:,n,isp),SIZE(td%h_loc,1),CMPLX(0.,0.),ab1,SIZE(ab1,1))
#endif
CALL zgemm("N","N",lapw%nv(jintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab_dev,SIZE(ab_dev,1),&
h_loc_dev,SIZE(h_loc_dev,1),CMPLX(0.,0.),ab1_dev,SIZE(ab1_dev,1))
!ab1=MATMUL(ab(:lapw%nv(iintsp),:ab_size),td%h_loc(:ab_size,:ab_size,n,isp))
IF (iintsp==jintsp) THEN
#ifdef _CUDA
call nvtxStartRange("zherk",3)
!ab1_dev=CONJG(ab1)
call nvtxStartRange("zherk",3)
CALL ZHERK("U","N",lapw%nv(iintsp),ab_size,Rchi,ab1_dev,SIZE(ab1_dev,1),1.0,c_dev,SIZE(c_dev,1))
istat = cudaDeviceSynchronize()
call nvtxEndRange()
#else
CALL ZHERK("U","N",lapw%nv(iintsp),ab_size,Rchi,CONJG(ab1),SIZE(ab1,1),1.0,hmat%data_c,SIZE(hmat%data_c,1))
#endif
call nvtxEndRange()
ELSE !here the l_ss off-diagonal part starts
!Second set of ab is needed
#ifdef _CUDA
CALL hsmt_ab(sym,atoms,noco,isp,iintsp,n,na,cell,lapw,fj_dev,gj_dev,ab_dev,ab_size,.TRUE.)
#else
CALL hsmt_ab(sym,atoms,noco,isp,iintsp,n,na,cell,lapw,fj,gj,ab,ab_size,.TRUE.)
#endif
#ifdef _CUDA
CALL zgemm("N","N",lapw%nv(iintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab_dev,SIZE(ab_dev,1),h_loc_dev,SIZE(td%h_loc,1),CMPLX(0.,0.),ab2_dev,SIZE(ab2_dev,1))
#else
CALL zgemm("N","N",lapw%nv(iintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab,SIZE(ab,1),td%h_loc(0:,0:,n,isp),SIZE(td%h_loc,1),CMPLX(0.,0.),ab2,SIZE(ab2,1))
#endif
CALL zgemm("N","N",lapw%nv(iintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab_dev,SIZE(ab_dev,1),&
h_loc_dev,SIZE(td%h_loc,1),CMPLX(0.,0.),ab2_dev,SIZE(ab2_dev,1))
!Multiply for Hamiltonian
#ifdef _CUDA
ab1 = ab1_dev
ab1_dev=CONJG(ab1)
CALL zgemm("N","T",lapw%nv(iintsp),lapw%nv(jintsp),ab_size,chi,ab2_dev,SIZE(ab2_dev,1),ab1_dev,SIZE(ab1_dev,1),CMPLX(1.0,0.0),c_dev,SIZE(c_dev,1))
#else
CALL zgemm("N","T",lapw%nv(iintsp),lapw%nv(jintsp),ab_size,chi,conjg(ab2),SIZE(ab2,1),ab1,SIZE(ab1,1),CMPLX(1.0,0.0),hmat%data_c,SIZE(hmat%data_c,1))
#endif
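!conjugate ab1_dev back in place; an explicit elementwise CUF kernel is used because the
!array form ab1_dev=CONJG(ab1_dev) does not work on device arrays (cf. the WORKAROUND above)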
!$cuf kernel do<<<*,256>>>
do i = 1,size(ab1_dev,2)
do j = 1,size(ab1_dev,1)
ab1_dev(j,i) = conjg(ab1_dev(j,i))
enddo
enddo
CALL zgemm("N","T",lapw%nv(iintsp),lapw%nv(jintsp),ab_size,chi,ab2_dev,SIZE(ab2_dev,1),&
ab1_dev,SIZE(ab1_dev,1),CMPLX(1.0,0.0),c_dev,SIZE(c_dev,1))
ENDIF
ENDIF
END DO
#ifdef _CUDA
hmat%data_c = c_dev
#endif
IF (hmat%l_real) THEN
hmat%data_r=hmat%data_r+REAL(hmat%data_c)
ENDIF
#ifdef _CUDA
call nvtxEndRange
END SUBROUTINE priv_noMPI_gpu
#endif
SUBROUTINE priv_noMPI(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,td,fj,gj,hmat)
!Calculate overlap matrix
USE m_hsmt_ab
USE m_constants, ONLY : fpi_const,tpi_const
USE m_types
USE m_ylm
IMPLICIT NONE
TYPE(t_mpi),INTENT(IN) :: mpi
TYPE(t_sym),INTENT(IN) :: sym
TYPE(t_noco),INTENT(IN) :: noco
TYPE(t_cell),INTENT(IN) :: cell
TYPE(t_atoms),INTENT(IN) :: atoms
TYPE(t_lapw),INTENT(IN) :: lapw
TYPE(t_tlmplm),INTENT(IN) :: td
! ..
! .. Scalar Arguments ..
INTEGER, INTENT (IN) :: n,isp,iintsp,jintsp
COMPLEX,INTENT(in) :: chi
! ..
! .. Array Arguments ..
REAL,INTENT(IN) :: fj(:,0:,:),gj(:,0:,:)
CLASS(t_mat),INTENT(INOUT)::hmat
INTEGER:: nn,na,ab_size,l,ll,m
COMPLEX,ALLOCATABLE:: ab(:,:),ab1(:,:),ab2(:,:)
real :: rchi
ALLOCATE(ab(MAXVAL(lapw%nv),2*atoms%lmaxd*(atoms%lmaxd+2)+2),ab1(lapw%nv(jintsp),2*atoms%lmaxd*(atoms%lmaxd+2)+2))
IF (iintsp.NE.jintsp) ALLOCATE(ab2(lapw%nv(iintsp),2*atoms%lmaxd*(atoms%lmaxd+2)+2))
IF (hmat%l_real) THEN
IF (ANY(SHAPE(hmat%data_c)/=SHAPE(hmat%data_r))) THEN
DEALLOCATE(hmat%data_c)
ALLOCATE(hmat%data_c(SIZE(hmat%data_r,1),SIZE(hmat%data_r,2)))
ENDIF
hmat%data_c=0.0
ENDIF
DO nn = 1,atoms%neq(n)
na = SUM(atoms%neq(:n-1))+nn
IF ((atoms%invsat(na)==0) .OR. (atoms%invsat(na)==1)) THEN
rchi=MERGE(REAL(chi),REAL(chi)*2,(atoms%invsat(na)==0))
CALL hsmt_ab(sym,atoms,noco,isp,jintsp,n,na,cell,lapw,fj,gj,ab,ab_size,.TRUE.)
!Calculate Hamiltonian
CALL zgemm("N","N",lapw%nv(jintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab,SIZE(ab,1),td%h_loc(0:,0:,n,isp),SIZE(td%h_loc,1),CMPLX(0.,0.),ab1,SIZE(ab1,1))
!ab1=MATMUL(ab(:lapw%nv(iintsp),:ab_size),td%h_loc(:ab_size,:ab_size,n,isp))
IF (iintsp==jintsp) THEN
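!spin-diagonal block: ZHERK adds rchi*CONJG(ab1)*TRANSPOSE(ab1) to the upper triangle of hmat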
CALL ZHERK("U","N",lapw%nv(iintsp),ab_size,Rchi,CONJG(ab1),SIZE(ab1,1),1.0,hmat%data_c,SIZE(hmat%data_c,1))
ELSE !here the l_ss off-diagonal part starts
!Second set of ab is needed
CALL hsmt_ab(sym,atoms,noco,isp,iintsp,n,na,cell,lapw,fj,gj,ab,ab_size,.TRUE.)
CALL zgemm("N","N",lapw%nv(iintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab,SIZE(ab,1),td%h_loc(0:,0:,n,isp),SIZE(td%h_loc,1),CMPLX(0.,0.),ab2,SIZE(ab2,1))
!Multiply for Hamiltonian
CALL zgemm("N","T",lapw%nv(iintsp),lapw%nv(jintsp),ab_size,chi,conjg(ab2),SIZE(ab2,1),ab1,SIZE(ab1,1),CMPLX(1.0,0.0),hmat%data_c,SIZE(hmat%data_c,1))
ENDIF
ENDIF
END DO
IF (hmat%l_real) THEN
hmat%data_r=hmat%data_r+REAL(hmat%data_c)
ENDIF
END SUBROUTINE priv_noMPI
......
@@ -339,7 +339,7 @@ CONTAINS
type(t_blacsdata),INTENT(OUT)::blacsdata
INTEGER,INTENT(OUT):: local_size1,local_size2
#ifdef CPP_MPI
#ifdef CPP_SCALAPACK
INCLUDE 'mpif.h'
INTEGER :: myrowssca,mycolssca
INTEGER :: iamblacs,npblacs,np,myid,mycol,myrow
......