Commit e288839d authored by Matthias Redies

Merge branch 'develop' into MetaGGA

parents d7380394 cb805565
@@ -8,6 +8,8 @@ if (CLI_FLEUR_USE_GPU)
     message("Using cuda8")
   elseif(${CLI_FLEUR_USE_GPU} MATCHES "cuda9")
     set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=cuda9.0,cc60 -Mcuda=rdc -Mcudalib=cublas")
+  elseif(${CLI_FLEUR_USE_GPU} MATCHES "cuda9.1")
+    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=cuda9.1,cc60 -Mcuda=rdc -Mcudalib=cublas")
   elseif(${CLI_FLEUR_USE_GPU} MATCHES "nvtx")
     set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=cuda9.0,cc60 -Mcuda=rdc -Mcudalib=cublas -lnvToolsExt ")
   elseif(${CLI_FLEUR_USE_GPU} MATCHES "emu")
@@ -21,7 +23,7 @@ if (CLI_FLEUR_USE_GPU)
            LINK_LIBRARIES "-lcusolver"
            )
   if (FLEUR_USE_CUSOLVER)
-     set(${FLEUR_LIBRARIES} "${FLEUR_LIBRARIES};-lcusolver")
+     set(FLEUR_LIBRARIES "${FLEUR_LIBRARIES};-lcusolver")
      set(FLEUR_MPI_DEFINITIONS ${FLEUR_MPI_DEFINITIONS} "CPP_CUSOLVER")
      set(FLEUR_DEFINITIONS ${FLEUR_DEFINITIONS} "CPP_CUSOLVER")
   endif()
...
@@ -12,92 +12,64 @@ MODULE m_magma
 ! using the MAGMA library for multiple GPUs
 !**********************************************************
 CONTAINS
-  SUBROUTINE magma_diag(nsize,eig,ne,a_r,b_r,z_r,a_c,b_c,z_c)
+  SUBROUTINE magma_diag(hmat,smat,ne,eig,zmat)
 #ifdef CPP_MAGMA
     use magma
 #endif
-#include"cpp_double.h"
+    use m_types
     IMPLICIT NONE
     ! ... Arguments ...
-    INTEGER, INTENT (IN)                          :: nsize
-    REAL, INTENT(OUT)                             :: eig(:)
-    INTEGER, INTENT(INOUT)                        :: ne
-    REAL, OPTIONAL,ALLOCATABLE, INTENT (INOUT)    :: a_r(:),b_r(:)
-    REAL, OPTIONAL,ALLOCATABLE, INTENT (INOUT)    :: z_r(:,:)
-    COMPLEX, OPTIONAL,ALLOCATABLE, INTENT (INOUT) :: a_c(:),b_c(:)
-    COMPLEX, OPTIONAL,ALLOCATABLE, INTENT (INOUT) :: z_c(:,:)
+    TYPE(t_mat),INTENT(INOUT)            :: hmat,smat
+    INTEGER,INTENT(INOUT)                :: ne
+    CLASS(t_mat),ALLOCATABLE,INTENT(OUT) :: zmat
+    REAL,INTENT(OUT)                     :: eig(:)
 #ifdef CPP_MAGMA
     ! ... Local Variables ..
-    INTEGER iind,ind1,ind2,info,lwork,liwork,lrwork,err,i,mout(1)
-    REAL eigTemp(nsize)
-    LOGICAL:: initialized=.false.
+    INTEGER              :: lwork,liwork,lrwork,err,mout(1)
+    REAL                 :: eigTemp(hmat%matsize1)
+    LOGICAL              :: initialized=.false.
     REAL,    ALLOCATABLE :: rwork(:)
     INTEGER, ALLOCATABLE :: iwork(:)
-    REAL,    ALLOCATABLE :: largea_r(:,:),largeb_r(:,:)
-    COMPLEX, ALLOCATABLE :: largea_c(:,:),largeb_c(:,:)
-    COMPLEX, ALLOCATABLE :: work(:)
-    LOGICAL :: l_real
+    COMPLEX, ALLOCATABLE :: work(:)
-    l_real=present(a_r)
-    print *,"MAGMA start"
     IF (.NOT.initialized) THEN
-       initialized=.true.
-       call magmaf_init()
-       print *,"MAGMA init"
+       initialized=.TRUE.
+       CALL magmaf_init()
     ENDIF
-    !**********************************
-    !expand from packed to full storage
-    !**********************************
-    !hamiltonian
-    if (l_real) THEN
-       call packed_to_full(nsize,a_r,largea_r)
-       call packed_to_full(nsize,b_r,largeb_r)
-       !deallocate(a_r,b_r)
-    ELSE
-       call packed_to_full(nsize,a_c,largea_c)
-       call packed_to_full(nsize,b_c,largeb_c)
-       !deallocate(a_c,b_c)
-    Endif
-    if (l_real) call juDFT_error("REAL diagonalization not implemented in magma.F90")
-    !Query the workspace size
-    allocate(work(1),rwork(1),iwork(1))
-    print *,"Magma workspace query"
-    call flush()
-    call magmaf_zhegvdx(1,'v','i','l',nsize,largea_c,nsize,largeb_c,nsize,&
-         0.0,0.0,1,ne,mout,eigTemp,work,-1,rwork,-1,iwork,-1,err)
-    lwork=work(1)
-    lrwork=rwork(1)
-    liwork=iwork(1)
-    print*,"MAGMA:",lwork,lrwork,liwork
-    deallocate(work,rwork,iwork)
-    allocate(work(lwork),rwork(lrwork),iwork(liwork))
-    if (err/=0) call juDFT_error("Failed to allocate workspaces",calledby="magma.F90")
-    !Now the diagonalization
-    print *,"Magma diagonalization"
-    print *,nsize,shape(largea_c),shape(eigTemp),ne
-    call magmaf_zhegvdx(1,'v','i','l',nsize,largea_c,nsize,largeb_c,nsize,&
-         0.0,0.0,1,ne,mout,eigTemp,work,lwork,rwork,lrwork,iwork,liwork,err)
-    print*,"MAGMA info:",err
-    if (err/=0) call juDFT_error("Magma failed to diagonalize Hamiltonian")
-    print *,"MAGMA mout:",mout
+    IF (hmat%l_real) THEN
+       CALL juDFT_error("REAL diagonalization not implemented in magma.F90")
+    ELSE
+       !Query the workspace size
+       ALLOCATE(work(1),rwork(1),iwork(1))
+       CALL magmaf_zhegvdx(1,'v','i','l',hmat%matsize1,hmat%data_c,SIZE(hmat%data_c,1),smat%data_c,SIZE(smat%data_c,1),&
+            0.0,0.0,1,ne,mout,eigTemp,work,-1,rwork,-1,iwork,-1,err)
+       IF (err/=0) CALL juDFT_error("Failed to query workspaces",calledby="magma.F90")
+       lwork=work(1)
+       lrwork=rwork(1)
+       liwork=iwork(1)
+       DEALLOCATE(work,rwork,iwork)
+       ALLOCATE(work(lwork),rwork(lrwork),iwork(liwork))
+       !Now the diagonalization
+       CALL magmaf_zhegvdx(1,'v','i','l',hmat%matsize1,hmat%data_c,SIZE(hmat%data_c,1),smat%data_c,SIZE(smat%data_c,1),&
+            0.0,0.0,1,ne,mout,eigTemp,work,lwork,rwork,lrwork,iwork,liwork,err)
+       IF (err/=0) CALL juDFT_error("Magma failed to diagonalize Hamiltonian")
+    ENDIF
+    ALLOCATE(t_mat::zmat)
+    CALL zmat%alloc(hmat%l_real,hmat%matsize1,ne)
     DO i = 1, ne
        eig(i) = eigTemp(i)
-       z_c(:nsize,i)=largea_c(:nsize,i)
+       IF (hmat%l_real) THEN
+          zmat%data_r(:,i)=hmat%data_r(:nsize,i)
+       ELSE
+          zmat%data_c(:,i)=hmat%data_c(:nsize,i)
+       ENDIF
     END DO
-    !call judft_error("Eigenvectors are not calculated in MAGMA")
 #endif
   END SUBROUTINE magma_diag
 END MODULE m_magma
...
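Note on the new MAGMA call above: magmaf_zhegvdx follows the usual two-pass workspace-query convention of the LAPACK divide-and-conquer drivers, i.e. a first call with lwork = lrwork = liwork = -1 only returns the optimal workspace sizes, the work arrays are then reallocated, and a second call performs the diagonalization. Below is a minimal sketch of the same idiom with the plain LAPACK driver ZHEGVD, so it compiles without MAGMA or CPP_MAGMA; the toy matrices and the program name are invented for illustration, and unlike magmaf_zhegvdx this variant computes all eigenvalues rather than an index range.

    PROGRAM workspace_query_sketch
      IMPLICIT NONE
      INTEGER, PARAMETER :: dp = KIND(1.0d0), n = 4
      COMPLEX(dp) :: a(n,n), b(n,n)
      REAL(dp)    :: w(n)
      COMPLEX(dp), ALLOCATABLE :: work(:)
      REAL(dp),    ALLOCATABLE :: rwork(:)
      INTEGER,     ALLOCATABLE :: iwork(:)
      INTEGER :: lwork, lrwork, liwork, info, i, j

      ! Toy generalized problem A z = lambda B z: A hermitian, B = identity
      DO j = 1, n
         DO i = 1, n
            a(i,j) = CMPLX(1.0_dp/(i+j), 0.0_dp, dp)
         END DO
         a(j,j) = a(j,j) + n
         b(:,j) = (0.0_dp, 0.0_dp)
         b(j,j) = (1.0_dp, 0.0_dp)
      END DO

      ! Pass 1: workspace query, the -1 sizes only fill work(1), rwork(1), iwork(1)
      ALLOCATE(work(1), rwork(1), iwork(1))
      CALL zhegvd(1, 'V', 'L', n, a, n, b, n, w, work, -1, rwork, -1, iwork, -1, info)
      lwork  = INT(work(1))
      lrwork = INT(rwork(1))
      liwork = iwork(1)
      DEALLOCATE(work, rwork, iwork)
      ALLOCATE(work(lwork), rwork(lrwork), iwork(liwork))

      ! Pass 2: the actual diagonalization (link against LAPACK, e.g. -llapack)
      CALL zhegvd(1, 'V', 'L', n, a, n, b, n, w, work, lwork, rwork, lrwork, iwork, liwork, info)
      IF (info /= 0) STOP 'zhegvd failed'
      PRINT *, 'lowest eigenvalue:', w(1)
    END PROGRAM workspace_query_sketch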
@@ -38,22 +38,22 @@ CONTAINS
    !up-up component (or only component in collinear case)
    IF (SIZE(mat)==1) THEN
       CALL mat_final%move(mat(1,1))
-      CALL mat(1,1)%free()
+      !CALL mat(1,1)%free()
       RETURN
    ENDIF
    CALL mat_final%copy(mat(1,1),1,1)
-   CALL mat(1,1)%free()
+   !CALL mat(1,1)%free()
    !down-down component
    CALL mat_final%copy(mat(2,2),lapw%nv(1)+atoms%nlotot+1,lapw%nv(1)+atoms%nlotot+1)
-   CALL mat(2,2)%free()
+   !CALL mat(2,2)%free()
    !Now collect off-diagonal parts
    CALL mat(1,2)%add_transpose(mat(2,1))
    CALL mat_final%copy(mat(1,2),1,lapw%nv(1)+atoms%nlotot+1)
-   CALL mat(1,2)%free()
-   CALL mat(2,1)%free()
+   !CALL mat(1,2)%free()
+   !CALL mat(2,1)%free()
  END SUBROUTINE eigen_redist_matrix
END MODULE m_eigen_redist_matrix
...
@@ -43,7 +43,11 @@ CONTAINS
    INTEGER, INTENT (IN) :: ispin
    !locals
+#ifdef CPP_GPU
+   REAL, ALLOCATABLE,MANAGED :: fj(:,:,:,:),gj(:,:,:,:)
+#else
    REAL, ALLOCATABLE :: fj(:,:,:,:),gj(:,:,:,:)
+#endif
    INTEGER :: iintsp,jintsp,n
    COMPLEX :: chi(2,2),chi_one
@@ -67,7 +71,7 @@ CONTAINS
    IF (.NOT.noco%l_noco) THEN
       !This is for collinear calculations: the (1,1) element of the matrices is all
       !that is needed and allocated
-      CALL hsmt_sph(n,atoms,mpi,ispin,input,noco,cell,1,1,chi_one,lapw,enpara%el0,&
+      CALL hsmt_sph(n,atoms,mpi,ispin,input,noco,1,1,chi_one,lapw,enpara%el0,&
           td%e_shift(n,ispin),usdus,fj(:,0:,ispin,:),gj(:,0:,ispin,:),smat(1,1),hmat(1,1))
       CALL hsmt_nonsph(n,mpi,sym,atoms,ispin,1,1,chi_one,noco,cell,lapw,td,&
           fj(:,0:,ispin,:),gj(:,0:,ispin,:),hmat(1,1))
@@ -79,7 +83,7 @@ CONTAINS
       !stored in tmp-variables. Then these are distributed (rotated) into the 2x2
       !global spin-matrices.
       CALL hmat_tmp%clear();CALL smat_tmp%clear()
-      CALL hsmt_sph(n,atoms,mpi,ispin,input,noco,cell,1,1,chi_one,lapw,enpara%el0,td%e_shift(n,ispin),&
+      CALL hsmt_sph(n,atoms,mpi,ispin,input,noco,1,1,chi_one,lapw,enpara%el0,td%e_shift(n,ispin),&
           usdus,fj(:,0:,ispin,:),gj(:,0:,ispin,:),smat_tmp,hmat_tmp)
       CALL hsmt_nonsph(n,mpi,sym,atoms,ispin,1,1,chi_one,noco,cell,lapw,td,&
           fj(:,0:,ispin,:),gj(:,0:,ispin,:),hmat_tmp)
@@ -99,7 +103,7 @@ CONTAINS
       CALL hsmt_spinor(ispin,n,noco,chi)
       DO iintsp=1,2
          DO jintsp=1,2
-            CALL hsmt_sph(n,atoms,mpi,ispin,input,noco,cell,iintsp,jintsp,chi(iintsp,jintsp),&
+            CALL hsmt_sph(n,atoms,mpi,ispin,input,noco,iintsp,jintsp,chi(iintsp,jintsp),&
                 lapw,enpara%el0,td%e_shift(n,ispin),usdus,fj(:,0:,ispin,:),gj(:,0:,ispin,:),&
                 smat(iintsp,jintsp),hmat(iintsp,jintsp))
            CALL hsmt_nonsph(n,mpi,sym,atoms,ispin,iintsp,jintsp,chi(iintsp,jintsp),noco,cell,&
...
@@ -62,7 +62,7 @@ CONTAINS
    USE m_ylm
    USE m_apws
    USE cudafor
-   ! USE nvtx
+   USE nvtx
    IMPLICIT NONE
    TYPE(t_sym),INTENT(IN)   :: sym
    TYPE(t_cell),INTENT(IN)  :: cell
@@ -92,8 +92,9 @@ CONTAINS
    COMPLEX,ALLOCATABLE,DEVICE :: c_ph_dev(:,:)
    REAL,   ALLOCATABLE,DEVICE :: gkrot_dev(:,:)
    INTEGER :: grid, block
-   !INTEGER :: istat
+   INTEGER :: istat
+   call nvtxStartRange("hsmt_ab",3)
    lmax=MERGE(atoms%lnonsph(n),atoms%lmax(n),l_nonsph)
    ALLOCATE(c_ph_dev(lapw%nv(1),MERGE(2,1,noco%l_ss)))
@@ -129,17 +130,12 @@ CONTAINS
    !--> synthesize the complex conjugates of a and b
-   !call nvtxStartRange("hsmt_synthAB",5)
-   !istat = cudaDeviceSynchronize()
    ! pretty ugly solution
    block = 256
    grid = lapw%nv(1)/(block*4) + 1
    CALL synth_ab<<<grid,block>>>(grid,block,lapw%nv(1),lmax,ab_size,gkrot_dev,&
                                  fj(:,:,iintsp),gj(:,:,iintsp),c_ph_dev(:,iintsp),ab)
-   !istat = cudaDeviceSynchronize()
-   !call nvtxEndRange
    IF (PRESENT(abclo)) THEN
       print*, "Ooooops, TODO in hsmt_ab"
@@ -169,6 +165,8 @@ CONTAINS
    DEALLOCATE(c_ph_dev)
    DEALLOCATE(gkrot_dev)
+   istat = cudaDeviceSynchronize()
+   call nvtxEndRange
  END SUBROUTINE hsmt_ab_gpu
#endif
...
@@ -17,7 +17,7 @@ CONTAINS
    DO iintsp=1,2
       DO jintsp=1,2
-         mat(jintsp,iintsp)%data_c(:,:)=chi(jintsp,iintsp)*mat_tmp%data_c(:,:)+mat(jintsp,iintsp)%data_c(:,:)
+         mat(jintsp,iintsp)%data_c(:,:)=chi(iintsp,jintsp)*mat_tmp%data_c(:,:)+mat(jintsp,iintsp)%data_c(:,:)
       ENDDO
    ENDDO
  END SUBROUTINE hsmt_distspins
...
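For clarity on the one-character fix above: the local block mat_tmp is accumulated into the global spin block mat(jintsp,iintsp) with the spinor weight chi(iintsp,jintsp), i.e. with the index order of chi transposed relative to the target block. A self-contained toy version of that accumulation pattern, with all names and values invented for illustration:

    PROGRAM distspins_sketch
      IMPLICIT NONE
      COMPLEX :: chi(2,2), mat_tmp(3,3), mat(3,3,2,2)
      INTEGER :: iintsp, jintsp

      chi     = RESHAPE([(1.0,0.0), (0.0,-0.5), (0.0,0.5), (1.0,0.0)], [2,2])
      mat_tmp = (1.0,0.0)
      mat     = (0.0,0.0)

      DO iintsp = 1, 2
         DO jintsp = 1, 2
            ! the (jintsp,iintsp) block is weighted with chi(iintsp,jintsp)
            mat(:,:,jintsp,iintsp) = chi(iintsp,jintsp)*mat_tmp + mat(:,:,jintsp,iintsp)
         END DO
      END DO

      PRINT *, 'weight that ended up in block (2,1):', mat(1,1,2,1)
    END PROGRAM distspins_sketch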
@@ -29,39 +29,22 @@ CONTAINS
    INTEGER, INTENT (IN) :: n,isp,iintsp,jintsp
    COMPLEX,INTENT(IN)   :: chi
    ! .. Array Arguments ..
-   REAL,INTENT(IN)         :: fj(:,0:,:),gj(:,0:,:)
+#if defined CPP_GPU
+   REAL,MANAGED,INTENT(IN) :: fj(:,:,:),gj(:,:,:)
+#else
+   REAL,INTENT(IN)         :: fj(:,0:,:),gj(:,0:,:)
+#endif
    CLASS(t_mat),INTENT(INOUT) ::hmat
#if defined CPP_GPU
-   REAL,    ALLOCATABLE,DEVICE :: fj_dev(:,:,:), gj_dev(:,:,:)
    COMPLEX, ALLOCATABLE,DEVICE :: h_loc_dev(:,:)
-   COMPLEX, ALLOCATABLE,DEVICE :: c_dev(:,:)
#endif
    CALL timestart("non-spherical setup")
    IF (mpi%n_size==1) THEN
#if defined CPP_GPU
-      ALLOCATE(fj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
-      ALLOCATE(gj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
-      fj_dev(1:,1:,1:)= fj(1:,0:,1:)
-      gj_dev(1:,1:,1:)= gj(1:,0:,1:)
       ALLOCATE(h_loc_dev(size(td%h_loc,1),size(td%h_loc,2)))
       h_loc_dev(1:,1:) = CONJG(td%h_loc(0:,0:,n,isp))
-      IF (hmat%l_real) THEN
-         IF (ANY(SHAPE(hmat%data_c)/=SHAPE(hmat%data_r))) THEN
-            DEALLOCATE(hmat%data_c)
-            ALLOCATE(hmat%data_c(SIZE(hmat%data_r,1),SIZE(hmat%data_r,2)))
-         ENDIF
-         hmat%data_c=0.0
-      ENDIF
-      ALLOCATE(c_dev(SIZE(hmat%data_c,1),SIZE(hmat%data_c,2)))
-      c_dev = hmat%data_c
-      CALL priv_noMPI(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,h_loc_dev,fj_dev,gj_dev,c_dev)
-      hmat%data_c = c_dev
-      IF (hmat%l_real) THEN
-         hmat%data_r=hmat%data_r+REAL(hmat%data_c)
-      ENDIF
+      CALL priv_noMPI(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,h_loc_dev,fj,gj,hmat)
#else
       CALL priv_noMPI(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,td,fj,gj,hmat)
#endif
@@ -72,7 +55,7 @@ CONTAINS
  END SUBROUTINE hsmt_nonsph
#if defined CPP_GPU
-  SUBROUTINE priv_noMPI_gpu(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,h_loc_dev,fj_dev,gj_dev,c_dev)
+  SUBROUTINE priv_noMPI_gpu(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,h_loc_dev,fj_dev,gj_dev,hmat)
    !Calculate overlap matrix, GPU version
    !note that basically all matrices in the GPU version are conjugates of their cpu counterparts
    USE m_hsmt_ab
@@ -101,8 +84,7 @@ CONTAINS
    ! ..
    ! .. Array Arguments ..
    REAL, INTENT(IN), DEVICE :: fj_dev(:,:,:), gj_dev(:,:,:)
-   COMPLEX,INTENT(INOUT),DEVICE :: c_dev(:,:)
+   CLASS(t_mat),INTENT(INOUT)   ::hmat
    INTEGER:: nn,na,ab_size,l,ll,m
    real :: rchi
@@ -114,6 +96,14 @@ CONTAINS
    ALLOCATE(ab_dev(MAXVAL(lapw%nv),2*atoms%lmaxd*(atoms%lmaxd+2)+2))
    IF (iintsp.NE.jintsp) ALLOCATE(ab2_dev(lapw%nv(iintsp),2*atoms%lmaxd*(atoms%lmaxd+2)+2))
+   IF (hmat%l_real) THEN
+      IF (ANY(SHAPE(hmat%data_c)/=SHAPE(hmat%data_r))) THEN
+         DEALLOCATE(hmat%data_c)
+         ALLOCATE(hmat%data_c(SIZE(hmat%data_r,1),SIZE(hmat%data_r,2)))
+      ENDIF
+      hmat%data_c=0.0
+   ENDIF
    DO nn = 1,atoms%neq(n)
       na = SUM(atoms%neq(:n-1))+nn
       IF ((atoms%invsat(na)==0) .OR. (atoms%invsat(na)==1)) THEN
@@ -124,10 +114,9 @@ CONTAINS
          !Calculate Hamiltonian
          CALL zgemm("N","N",lapw%nv(jintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab_dev,SIZE(ab_dev,1),&
                     h_loc_dev,SIZE(h_loc_dev,1),CMPLX(0.,0.),ab1_dev,SIZE(ab1_dev,1))
-         !ab1=MATMUL(ab(:lapw%nv(iintsp),:ab_size),td%h_loc(:ab_size,:ab_size,n,isp))
         IF (iintsp==jintsp) THEN
            call nvtxStartRange("zherk",3)
-           CALL ZHERK("U","N",lapw%nv(iintsp),ab_size,Rchi,ab1_dev,SIZE(ab1_dev,1),1.0,c_dev,SIZE(c_dev,1))
+           CALL ZHERK("U","N",lapw%nv(iintsp),ab_size,Rchi,ab1_dev,SIZE(ab1_dev,1),1.0,hmat%data_c,SIZE(hmat%data_c,1))
            istat = cudaDeviceSynchronize()
            call nvtxEndRange()
         ELSE !here the l_ss off-diagonal part starts
@@ -144,11 +133,14 @@ CONTAINS
            enddo
         enddo
         CALL zgemm("N","T",lapw%nv(iintsp),lapw%nv(jintsp),ab_size,chi,ab2_dev,SIZE(ab2_dev,1),&
-                   ab1_dev,SIZE(ab1_dev,1),CMPLX(1.0,0.0),c_dev,SIZE(c_dev,1))
+                   ab1_dev,SIZE(ab1_dev,1),CMPLX(1.0,0.0),hmat%data_c,SIZE(hmat%data_c,1))
         ENDIF
      ENDIF
   END DO
+  IF (hmat%l_real) THEN
+     hmat%data_r=hmat%data_r+REAL(hmat%data_c)
+  ENDIF
  call nvtxEndRange
 END SUBROUTINE priv_noMPI_gpu
#endif
...
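On the ZHERK call above: for the spin-diagonal block the GPU path accumulates the Hermitian product rchi * ab1 * conjg(transpose(ab1)) directly into hmat%data_c instead of going through the former c_dev copy. Below is a CPU-only sketch of this BLAS rank-k update pattern; the array names, sizes and values are invented for illustration (link against BLAS, e.g. -lblas):

    PROGRAM zherk_sketch
      IMPLICIT NONE
      INTEGER, PARAMETER :: dp = KIND(1.0d0), nv = 5, ab_size = 3
      COMPLEX(dp) :: ab1(nv, ab_size)   ! stands in for ab1_dev
      COMPLEX(dp) :: h(nv, nv)          ! stands in for hmat%data_c
      REAL(dp)    :: rchi
      INTEGER     :: i, j

      ! Arbitrary rectangular factor
      DO j = 1, ab_size
         DO i = 1, nv
            ab1(i,j) = CMPLX(i, j, dp)
         END DO
      END DO
      h    = (0.0_dp, 0.0_dp)
      rchi = 1.0_dp

      ! Upper triangle of h := rchi * ab1 * conjg(transpose(ab1)) + 1.0 * h
      CALL zherk('U', 'N', nv, ab_size, rchi, ab1, SIZE(ab1,1), 1.0_dp, h, SIZE(h,1))

      PRINT *, 'h(1,1) =', h(1,1)
    END PROGRAM zherk_sketch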
@@ -1733,7 +1733,7 @@ MODULE m_cdnpot_io_hdf
             dimsInt(:4)=(/2,nmzxy,od_nq2-1,nvac/)
             CALL h5dopen_f(groupID, 'cdomvxy', cdomvxySetID, hdfError)
-            CALL io_write_complex3(cdomvxySetID,(/-1,1,1,1/),dimsInt(:4),den%vacxy(:,:,:,3))
+            CALL io_write_complex3(cdomvxySetID,(/-1,1,1,1/),dimsInt(:4),den%vacxy(:,:,:nvac,3))
             CALL h5dclose_f(cdomvxySetID, hdfError)
          END IF
       END IF
@@ -1818,7 +1818,7 @@ MODULE m_cdnpot_io_hdf
             CALL h5screate_simple_f(4,dims(:4),cdomvxySpaceID,hdfError)
             CALL h5dcreate_f(groupID, "cdomvxy", H5T_NATIVE_DOUBLE, cdomvxySpaceID, cdomvxySetID, hdfError)
             CALL h5sclose_f(cdomvxySpaceID,hdfError)
-            CALL io_write_complex3(cdomvxySetID,(/-1,1,1,1/),dimsInt(:4),den%vacxy(:,:,:,3))
+            CALL io_write_complex3(cdomvxySetID,(/-1,1,1,1/),dimsInt(:4),den%vacxy(:,:,:nvac,3))
             CALL h5dclose_f(cdomvxySetID, hdfError)
          END IF
       END IF
@@ -1922,7 +1922,7 @@ MODULE m_cdnpot_io_hdf
             CALL h5screate_simple_f(4,dims(:4),cdomvxySpaceID,hdfError)
             CALL h5dcreate_f(groupID, "cdomvxy", H5T_NATIVE_DOUBLE, cdomvxySpaceID, cdomvxySetID, hdfError)
             CALL h5sclose_f(cdomvxySpaceID,hdfError)
-            CALL io_write_complex3(cdomvxySetID,(/-1,1,1,1/),dimsInt(:4),den%vacxy(:,:,:,3))
+            CALL io_write_complex3(cdomvxySetID,(/-1,1,1,1/),dimsInt(:4),den%vacxy(:,:,:nvac,3))
             CALL h5dclose_f(cdomvxySetID, hdfError)
          END IF
       END IF
...
@@ -75,6 +75,7 @@ contains
    type(t_potden)             :: resDen, vYukawa
    integer                    :: ierr(2)
    !External functions
+   real                       :: CPP_BLAS_sdot
    external                   :: CPP_BLAS_sdot
@@ -230,7 +231,7 @@ contains
    ! KERKER PRECONDITIONER
    if( input%preconditioning_param /= 0 ) then
-      call resDen%init( stars, atoms, sphhar, vacuum, input%jspins, noco%l_noco, 1001 )
+      call resDen%init( stars, atoms, sphhar, vacuum, input%jspins, noco%l_noco, POTDEN_TYPE_DEN )
       call vYukawa%init( stars, atoms, sphhar, vacuum, input%jspins, noco%l_noco, 4 )
       MPI0_b: if( mpi%irank == 0 ) then
          call resDen%subPotDen( outDen, inDen )
@@ -258,6 +259,8 @@ contains
                  * vYukawa%mt(1:atoms%jri(n),lh,n,1) * atoms%rmsh(1:atoms%jri(n),n) ** 2
          end do
       end do
+      resDen%vacz  = resDen%vacz  - input%preconditioning_param ** 2 / fpi_const * vYukawa%vacz
+      resDen%vacxy = resDen%vacxy - input%preconditioning_param ** 2 / fpi_const * vYukawa%vacxy
       if( input%jspins == 2 ) call resDen%ChargeAndMagnetisationToSpins()
       ! fix the preconditioned density
       call outDen%addPotDen( resDen, inDen )
...
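The two added lines extend the Kerker preconditioning of the density residual to the vacuum region, applying the same update as the muffin-tin loop shown in the context above: residual := residual - preconditioning_param**2/(4*pi) * V_Yukawa. A tiny, self-contained sketch of that elementwise update; all names and values here are invented stand-ins for the quantities in the diff:

    PROGRAM kerker_vac_sketch
      IMPLICIT NONE
      REAL, PARAMETER :: fpi = 12.5663706          ! 4*pi, stands in for fpi_const
      REAL :: lambda                               ! stands in for input%preconditioning_param
      REAL :: res(10), v_yuk(10)                   ! stand in for resDen%vacz and vYukawa%vacz

      lambda = 0.8
      res    = 1.0
      v_yuk  = 0.5

      ! Same elementwise update as the added vacz/vacxy lines
      res = res - lambda**2 / fpi * v_yuk

      PRINT *, res(1)
    END PROGRAM kerker_vac_sketch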
@@ -8,9 +8,9 @@ jt::testrun("$executable ",$workdir);
#now test output
$result=jt::test_grepexists("$workdir/out","it= 9 is completed");
-$result+=jt::test_grepnumber("$workdir/out","new fermi energy",".*: *([^ ]*)",0.263,0.001);
+$result+=jt::test_grepnumber("$workdir/out","new fermi energy",".*: *([^ ]*)",0.262,0.001);
-$result+=jt::test_grepnumber("$workdir/out","total energy=",".*= *([^ ]*)",-3191.9707,0.001);
+$result+=jt::test_grepnumber("$workdir/out","total energy=",".*= *([^ ]*)",-3191.938,0.001);
-$result+=jt::test_grepnumber("$workdir/out","distance of charge densities for it= *9",": *([^ ]*)",1.439,0.01);
+$result+=jt::test_grepnumber("$workdir/out","distance of charge densities for it= *9",": *([^ ]*)",0.25,0.01);
-$result+=jt::test_grepnumber("$workdir/out","mm 1",".*mm 1 *([^ ]*)",1.75,0.03);
+$result+=jt::test_grepnumber("$workdir/out","mm 1",".*mm 1 *([^ ]*)",1.72,0.03);
jt::stageresult($workdir,$result,"1");
MODULE m_types_gpumat
  USE m_judft
  USE m_types_mat
+#ifdef CPP_GPU
+  USE cudafor
+#endif
  IMPLICIT NONE
  !<Some routines are overwritten for GPU handling
@@ -102,6 +105,7 @@ CONTAINS
          CALL judft_error("Inconsistency between data types in m_mat")
       END IF
    END SELECT
+   i=cudaDeviceSynchronize()
 END SELECT
END SUBROUTINE t_gpumat_add_transpose
@@ -159,6 +163,8 @@ CONTAINS
          ENDDO
       ENDIF
    END SELECT
+   i=cudaDeviceSynchronize()
 END IF
end SUBROUTINE t_gpumat_transpose
@@ -250,6 +256,7 @@ CONTAINS
          ENDDO
       ENDDO
    END IF
+   i=cudaDeviceSynchronize()
 CLASS default
    no_gpu=.TRUE.
 END SELECT
@@ -294,6 +301,7 @@ CONTAINS
          ENDDO
       ENDDO
    ENDIF
+   n=cudaDeviceSynchronize()
 END SELECT
END SUBROUTINE t_gpumat_clear
#endif
...
@@ -82,23 +82,18 @@ MODULE m_types_mat
    IMPLICIT NONE
    CLASS(t_mat),INTENT(INOUT) :: mat
    CLASS(t_mat),INTENT(IN)    :: templ
-   SELECT TYPE(templ)
-   TYPE is(t_mat)
-      mat%l_real=templ%l_real
+   mat%l_real=templ%l_real
+   mat%matsize1=templ%matsize1