Commit ed66f4b1 authored by Matthias Redies

Merge branch 'develop' into MetaGGA

parents a7f8e4f5 dfb6226e
#Check if we can compile with GPU
if (${CLI_FLEUR_USE_GPU})
if (CLI_FLEUR_USE_GPU)
#No check is done
set(FLEUR_USE_GPU TRUE)
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -ta=tesla:cuda8.0,cc60 -Mcuda:kepler+ -Minfo=accel -acc ")
message("GPU:${CLI_FLEUR_USE_GPU}")
if (${CLI_FLEUR_USE_GPU} MATCHES "cuda8")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -ta=tesla:cuda8.0,cc60 -Mcuda:kepler+ -Minfo=accel -Mcudalib=cublas -acc ")
message("Using cuda8")
elseif(${CLI_FLEUR_USE_GPU} MATCHES "cuda9")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=cuda9.0,cc60 -Mcuda=rdc -Mcudalib=cublas -lnvToolsExt ")
endif()
set(FLEUR_MPI_DEFINITIONS ${FLEUR_MPI_DEFINITIONS} "CPP_GPU")
set(FLEUR_DEFINITIONS ${FLEUR_DEFINITIONS} "CPP_GPU")
else()
set(FLEUR_USE_GPU FALSE)
endif()
\ No newline at end of file
endif()
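
The CPP_GPU definition added above is purely a compile-time switch handed to the Fortran preprocessor. A minimal sketch of how such a flag selects a code path (illustrative only, not part of this commit):

! Illustration only: CPP_GPU is defined by CMake when CLI_FLEUR_USE_GPU is set.
! Build with e.g. -DCPP_GPU and preprocessing enabled to take the first branch.
PROGRAM gpu_switch_demo
  IMPLICIT NONE
#ifdef CPP_GPU
  WRITE (*,*) 'compiled with the GPU code path (CPP_GPU defined)'
#else
  WRITE (*,*) 'compiled with the portable CPU fallback'
#endif
END PROGRAM gpu_switch_demo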
@@ -337,32 +337,32 @@ CONTAINS
include 'mpif.h'
CALL CPU_TIME(t1)
CALL MPI_COMM_RANK(hmat%mpi_com,myid,info)
CALL MPI_COMM_SIZE(hmat%mpi_com,np,info)
smat%blacs_desc=hmat%blacs_desc
CALL MPI_COMM_RANK(hmat%blacsdata%mpi_com,myid,info)
CALL MPI_COMM_SIZE(hmat%blacsdata%mpi_com,np,info)
smat%blacsdata%blacs_desc=hmat%blacsdata%blacs_desc
call smat%generate_full_matrix()
call hmat%generate_full_matrix()
call smat%generate_full_matrix()
call hmat%generate_full_matrix()
!Transform to standard problem using SCALAPACK
IF (hmat%l_real) THEN
CALL pdpotrf('U',smat%global_size1,smat%data_r,1,1,smat%blacs_desc,info)
CALL pdpotrf('U',smat%global_size1,smat%data_r,1,1,smat%blacsdata%blacs_desc,info)
IF (info.NE.0) THEN
WRITE (*,*) 'Error in pdpotrf: info =',info
CALL juDFT_error("Diagonalization failed",calledby="chase_diag")
ENDIF
CALL pdsygst(1,'U',smat%global_size1,hmat%data_r,1,1,hmat%blacs_desc,smat%data_r,1,1,smat%blacs_desc,scale,info)
CALL pdsygst(1,'U',smat%global_size1,hmat%data_r,1,1,hmat%blacsdata%blacs_desc,smat%data_r,1,1,smat%blacsdata%blacs_desc,scale,info)
IF (ABS(scale-1)>1E-10) call judft_error("Scale parameter not implemented in chase_diag")
IF (info.NE.0) THEN
WRITE (6,*) 'Error in pdsygst: info =',info
CALL juDFT_error("Diagonalization failed",calledby="chase_diag")
ENDIF
ELSE
CALL pzpotrf('U',smat%global_size1,smat%data_c,1,1,smat%blacs_desc,info)
CALL pzpotrf('U',smat%global_size1,smat%data_c,1,1,smat%blacsdata%blacs_desc,info)
IF (info.NE.0) THEN
WRITE (*,*) 'Error in pzpotrf: info =',info
CALL juDFT_error("Diagonalization failed",calledby="chase_diag")
ENDIF
CALL pzhegst(1,'U',smat%global_size1,hmat%data_c,1,1,smat%blacs_desc,smat%data_c,1,1,smat%blacs_desc,scale,info)
CALL pzhegst(1,'U',smat%global_size1,hmat%data_c,1,1,smat%blacsdata%blacs_desc,smat%data_c,1,1,smat%blacsdata%blacs_desc,scale,info)
IF (ABS(scale-1)>1E-10) call judft_error("Scale parameter not implemented in chase_diag")
IF (info.NE.0) THEN
WRITE (6,*) 'Error in pzhegst: info =',info
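
As background to the pdpotrf/pdsygst (and pzpotrf/pzhegst) calls above: they are the ScaLAPACK counterparts of the serial LAPACK reduction of the generalized problem H z = e S z to standard form via a Cholesky factor of S (pdsygst additionally returns the scale factor checked above). A minimal serial sketch, illustrative only and not FLEUR code:

! Serial LAPACK analogue of the parallel reduction above (illustration only).
PROGRAM reduce_generalized
  IMPLICIT NONE
  INTEGER, PARAMETER :: n = 3
  REAL(8) :: h(n,n), s(n,n), w(n), work(3*n)
  INTEGER :: info
  h = RESHAPE([2d0,1d0,0d0, 1d0,2d0,1d0, 0d0,1d0,2d0], [n,n])
  s = 0d0; s(1,1) = 1d0; s(2,2) = 1d0; s(3,3) = 1d0   ! overlap = identity here
  CALL dpotrf('U', n, s, n, info)                     ! S = U**T * U            (cf. pdpotrf)
  IF (info /= 0) STOP 'dpotrf failed'
  CALL dsygst(1, 'U', n, h, n, s, n, info)            ! H <- inv(U**T) H inv(U) (cf. pdsygst)
  IF (info /= 0) STOP 'dsygst failed'
  CALL dsyev('V', 'U', n, h, n, w, work, 3*n, info)   ! solve the standard problem
  WRITE (*,*) 'eigenvalues:', w
END PROGRAM reduce_generalized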
@@ -414,11 +414,11 @@ CONTAINS
! --> recover the generalized eigenvectors z by solving z' = l^t * z
IF (smat%l_real) THEN
CALL pdtrtrs('U','N','N',hmat%global_size1,hmat%global_size1,smat%data_r,1,1,smat%blacs_desc,&
hmat%data_r,1,1,smat%blacs_desc,info)
CALL pdtrtrs('U','N','N',hmat%global_size1,hmat%global_size1,smat%data_r,1,1,smat%blacsdata%blacs_desc,&
hmat%data_r,1,1,smat%blacsdata%blacs_desc,info)
ELSE
CALL pztrtrs('U','N','N',hmat%global_size1,hmat%global_size1,smat%data_c,1,1,smat%blacs_desc,&
hmat%data_c,1,1,smat%blacs_desc,info)
CALL pztrtrs('U','N','N',hmat%global_size1,hmat%global_size1,smat%data_c,1,1,smat%blacsdata%blacs_desc,&
hmat%data_c,1,1,smat%blacsdata%blacs_desc,info)
END IF
IF (info.NE.0) THEN
WRITE (6,*) 'Error in p?trtrs: info =',info
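
For reference, the algebra behind the reduction and the back-transformation above (upper Cholesky factor, itype 1, as in the calls), in standard notation rather than FLEUR code:

S = U^{H} U, \qquad H z = \varepsilon S z \;\Longrightarrow\; \bigl(U^{-H} H U^{-1}\bigr)\,(U z) = \varepsilon\,(U z).

The standard problem is solved for y = U z, and the generalized eigenvectors are then recovered from the triangular system U z = y, which is exactly the p?trtrs('U','N','N',...) solve above.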
@@ -429,7 +429,7 @@ CONTAINS
! having all eigenvectors corresponding to his eigenvalues as above
!
ALLOCATE(t_mpimat::zmat)
CALL zmat%init(hmat%l_real,hmat%global_size1,hmat%global_size1,hmat%mpi_com,.FALSE.)
CALL zmat%init(hmat%l_real,hmat%global_size1,hmat%global_size1,hmat%blacsdata%mpi_com,.FALSE.)
CALL zmat%copy(hmat,1,1)
!Distribute eigenvalues over PEs
@@ -470,43 +470,43 @@ CONTAINS
EXTERNAL blacs_pinfo, blacs_gridinit
INTEGER,EXTERNAL::numroc,indxl2g
mat%mpi_com=hmat%mpi_com
mat%blacsdata%mpi_com=hmat%blacsdata%mpi_com
mat%global_size1=hmat%global_size1
mat%global_size2=hmat%global_size1
mat%l_real=hmat%l_real
!Determine rank and no of processors
CALL MPI_COMM_RANK(hmat%mpi_com,myid,ierr)
CALL MPI_COMM_SIZE(hmat%mpi_com,np,ierr)
CALL MPI_COMM_RANK(hmat%blacsdata%mpi_com,myid,ierr)
CALL MPI_COMM_SIZE(hmat%blacsdata%mpi_com,np,ierr)
!Init ChASE
IF (mat%l_real) THEN
CALL mpi_dchase_init(hmat%mpi_com,mat%global_size1, nev, nex, rowoff,coloff,rowlen,collen,&
mat%nprow,mat%npcol,myrow,mycol)
CALL mpi_dchase_init(hmat%blacsdata%mpi_com,mat%global_size1, nev, nex, rowoff,coloff,rowlen,collen,&
mat%blacsdata%nprow,mat%blacsdata%npcol,myrow,mycol)
ELSE
CALL mpi_zchase_init(hmat%mpi_com,mat%global_size1, nev, nex, rowoff,coloff,rowlen,collen,&
mat%nprow,mat%npcol,myrow,mycol)
CALL mpi_zchase_init(hmat%blacsdata%mpi_com,mat%global_size1, nev, nex, rowoff,coloff,rowlen,collen,&
mat%blacsdata%nprow,mat%blacsdata%npcol,myrow,mycol)
ENDIF
!Determine block-sizes
CALL MPI_ALLREDUCE(rowlen,nbr,1,MPI_INTEGER,MPI_MAX,mat%mpi_com,ierr)
CALL MPI_ALLREDUCE(collen,nbc,1,MPI_INTEGER,MPI_MAX,mat%mpi_com,ierr)
CALL MPI_ALLREDUCE(rowlen,nbr,1,MPI_INTEGER,MPI_MAX,mat%blacsdata%mpi_com,ierr)
CALL MPI_ALLREDUCE(collen,nbc,1,MPI_INTEGER,MPI_MAX,mat%blacsdata%mpi_com,ierr)
ALLOCATE(iusermap(mat%nprow,mat%npcol))
ALLOCATE(iusermap(mat%blacsdata%nprow,mat%blacsdata%npcol))
iusermap=-2
!Get BLACS ranks for all MPI ranks
CALL BLACS_PINFO(iusermap(myrow+1,mycol+1),npblacs) ! iamblacs = local process rank (e.g. myid)
CALL MPI_ALLREDUCE(MPI_IN_PLACE, iusermap, np,MPI_INTEGER,MPI_MAX,mat%mpi_com,ierr)
CALL MPI_ALLREDUCE(MPI_IN_PLACE, iusermap, np,MPI_INTEGER,MPI_MAX,mat%blacsdata%mpi_com,ierr)
!Get the Blacs default context
CALL BLACS_GET(0,0,mat%blacs_ctext)
CALL BLACS_GET(0,0,mat%blacsdata%blacs_desc(2))
! Create the Grid
CALL BLACS_GRIDMAP(mat%blacs_ctext,iusermap,mat%nprow,mat%nprow,mat%npcol)
CALL BLACS_GRIDMAP(mat%blacsdata%blacs_desc(2),iusermap,mat%blacsdata%nprow,mat%blacsdata%nprow,mat%blacsdata%npcol)
!Now create the matrix
mat%matsize1=numroc(mat%global_size1,nbr,myrow,0,mat%nprow)
mat%matsize2=numroc(mat%global_size1,nbc,mycol,0,mat%npcol)
mat%matsize1=numroc(mat%global_size1,nbr,myrow,0,mat%blacsdata%nprow)
mat%matsize2=numroc(mat%global_size1,nbc,mycol,0,mat%blacsdata%npcol)
IF (mat%l_real) THEN
ALLOCATE(mat%data_r(mat%matsize1,mat%matsize2))
ELSE
@@ -520,7 +520,7 @@ CONTAINS
ENDIF
!Create blacs descriptor for chase matrix
CALL descinit(mat%blacs_desc,mat%global_size1,mat%global_size2,nbr,nbc,0,0,mat%blacs_ctext,mat%matsize1,ierr)
CALL descinit(mat%blacsdata%blacs_desc,mat%global_size1,mat%global_size2,nbr,nbc,0,0,mat%blacsdata%blacs_desc(2),mat%matsize1,ierr)
IF (ierr /=0 ) CALL judft_error('Creation of BLACS descriptor failed')
!Copy data from hmat
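
For readers unfamiliar with the ScaLAPACK bookkeeping used here: numroc returns how many rows/columns of a block-cyclically distributed matrix the calling process owns, and descinit fills the 9-element array descriptor. A self-contained sketch of the same pattern (illustrative only, using the simpler blacs_gridinit instead of the BLACS_GRIDMAP path taken above):

! Illustration only (not FLEUR code): local sizes and descriptor for an
! n x n matrix distributed block-cyclically on a 2 x 2 process grid.
PROGRAM desc_demo
  IMPLICIT NONE
  INTEGER, PARAMETER :: n = 1000, nb = 64
  INTEGER :: ictxt, nprow, npcol, myrow, mycol, ml, nl, desc(9), info
  INTEGER, EXTERNAL :: numroc
  nprow = 2; npcol = 2
  CALL blacs_get(0, 0, ictxt)                    ! default system context
  CALL blacs_gridinit(ictxt, 'R', nprow, npcol)  ! row-major process grid
  CALL blacs_gridinfo(ictxt, nprow, npcol, myrow, mycol)
  ml = numroc(n, nb, myrow, 0, nprow)            ! local number of rows
  nl = numroc(n, nb, mycol, 0, npcol)            ! local number of columns
  CALL descinit(desc, n, n, nb, nb, 0, 0, ictxt, MAX(1, ml), info)
  IF (info /= 0) STOP 'descinit failed'
  WRITE (*,*) 'grid position', myrow, mycol, 'owns a', ml, 'x', nl, 'block'
  CALL blacs_gridexit(ictxt)
  CALL blacs_exit(0)
END PROGRAM desc_demo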
@@ -47,7 +47,7 @@ CONTAINS
INCLUDE 'mpif.h'
!... Local variables
!
INTEGER :: num,num2,myrow,mycol
INTEGER :: num,num2
INTEGER :: nb, myid, np
INTEGER :: n_col, n_row
LOGICAL :: ok
@@ -77,18 +77,16 @@ CONTAINS
CALL MPI_BARRIER(hmat%blacsdata%mpi_com,err)
CALL MPI_COMM_RANK(hmat%blacsdata%mpi_com,myid,err)
CALL MPI_COMM_SIZE(hmat%blacsdata%mpi_com,np,err)
myrow = myid/hmat%blacsdata%npcol
mycol = myid -(myid/hmat%blacsdata%npcol)*hmat%blacsdata%npcol
!Create communicators for ELPA
#if defined (CPP_ELPA_201705003)
mpi_comm_rows = -1
mpi_comm_cols = -1
#elif defined (CPP_ELPA_201605004) || defined (CPP_ELPA_201605003)||defined(CPP_ELPA_NEW)
err=get_elpa_row_col_comms(hmat%blacsdata%mpi_com, myrow, mycol,mpi_comm_rows, mpi_comm_cols)
err=get_elpa_row_col_comms(hmat%blacsdata%mpi_com, hmat%blacsdata%myrow, hmat%blacsdata%mycol,mpi_comm_rows, mpi_comm_cols)
#else
CALL get_elpa_row_col_comms(hmat%blacsdata%mpi_com, myrow, mycol,mpi_comm_rows, mpi_comm_cols)
CALL get_elpa_row_col_comms(hmat%blacsdata%mpi_com, hmat%blacsdata%myrow, hmat%blacsdata%mycol,mpi_comm_rows, mpi_comm_cols)
#endif
!print *,"creating ELPA comms -- done"
@@ -119,8 +117,8 @@ CONTAINS
CALL elpa_obj%set("local_ncols", hmat%matsize2, err)
CALL elpa_obj%set("nblk", nb, err)
CALL elpa_obj%set("mpi_comm_parent", hmat%blacsdata%mpi_com, err)
CALL elpa_obj%set("process_row", myrow, err)
CALL elpa_obj%set("process_col", mycol, err)
CALL elpa_obj%set("process_row", hmat%blacsdata%myrow, err)
CALL elpa_obj%set("process_col", hmat%blacsdata%mycol, err)
#ifdef CPP_ELPA2
CALL elpa_obj%set("solver", ELPA_SOLVER_2STAGE)
#else
@@ -188,8 +186,8 @@ CONTAINS
DO i=1,hmat%matsize2
! Get global column corresponding to i and number of local rows up to
! and including the diagonal, these are unchanged in H
n_col = indxl2g(i, nb, mycol, 0, hmat%blacsdata%npcol)
n_row = numroc (n_col, nb, myrow, 0, hmat%blacsdata%nprow)
n_col = indxl2g(i, nb, hmat%blacsdata%mycol, 0, hmat%blacsdata%npcol)
n_row = numroc (n_col, nb, hmat%blacsdata%myrow, 0, hmat%blacsdata%nprow)
IF (hmat%l_real) THEN
hmat%data_r(n_row+1:hmat%matsize1,i) = 0.d0
ELSE
@@ -210,8 +208,8 @@ CONTAINS
DO i=1,hmat%matsize2
! Get global column corresponding to i and number of local rows up to
! and including the diagonal, these are unchanged in H
n_col = indxl2g(i, nb, mycol, 0, hmat%blacsdata%npcol)
n_row = numroc (n_col, nb, myrow, 0, hmat%blacsdata%nprow)
n_col = indxl2g(i, nb, hmat%blacsdata%mycol, 0, hmat%blacsdata%npcol)
n_row = numroc (n_col, nb, hmat%blacsdata%myrow, 0, hmat%blacsdata%nprow)
IF (hmat%l_real) THEN
hmat%data_r(n_row+1:hmat%matsize1,i) = ev_dist%data_r(n_row+1:ev_dist%matsize1,i)
ELSE
@@ -317,8 +315,8 @@ CONTAINS
DO i=1,hmat%matsize2
! Get global column corresponding to i and number of local rows up to
! and including the diagonal, these are unchanged in A
n_col = indxl2g(i, nb, mycol, 0, hmat%blacsdata%npcol)
n_row = numroc (n_col, nb, myrow, 0, hmat%blacsdata%nprow)
n_col = indxl2g(i, nb, hmat%blacsdata%mycol, 0, hmat%blacsdata%npcol)
n_row = numroc (n_col, nb, hmat%blacsdata%myrow, 0, hmat%blacsdata%nprow)
IF (hmat%l_real) THEN
hmat%data_r(n_row+1:hmat%matsize1,i) = ev_dist%data_r(n_row+1:ev_dist%matsize1,i)
ELSE
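
The elpa_obj%set calls above belong to the object-style API introduced with ELPA 2017. A rough sketch of the usual call sequence, based on the public ELPA documentation; the exact interface may differ between ELPA releases, so treat this as an assumption rather than the FLEUR implementation:

! Rough sketch of the ELPA >= 2017 object API (not FLEUR code).
SUBROUTINE elpa_sketch(na, nev, ml, nl, nb, comm, myrow, mycol, a, ev, z)
  USE elpa
  IMPLICIT NONE
  INTEGER, INTENT(IN)    :: na, nev, ml, nl, nb, comm, myrow, mycol
  REAL(8), INTENT(INOUT) :: a(ml, nl)            ! local block of the matrix
  REAL(8), INTENT(OUT)   :: ev(na), z(ml, nl)    ! eigenvalues / local eigenvectors
  CLASS(elpa_t), POINTER :: e
  INTEGER :: err
  IF (elpa_init(20170403) /= ELPA_OK) STOP 'unsupported ELPA API version'
  e => elpa_allocate()
  CALL e%set("na", na, err)                      ! global matrix size
  CALL e%set("nev", nev, err)                    ! number of eigenpairs wanted
  CALL e%set("local_nrows", ml, err)             ! cf. hmat%matsize1 above
  CALL e%set("local_ncols", nl, err)             ! cf. hmat%matsize2 above
  CALL e%set("nblk", nb, err)
  CALL e%set("mpi_comm_parent", comm, err)
  CALL e%set("process_row", myrow, err)
  CALL e%set("process_col", mycol, err)
  err = e%setup()
  CALL e%eigenvectors(a, ev, z, err)
  CALL elpa_deallocate(e)
  CALL elpa_uninit()
END SUBROUTINE elpa_sketch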
@@ -25,6 +25,8 @@ CONTAINS
USE m_types
USE m_ylm
USE m_apws
USE cudafor
USE nvtx
IMPLICIT NONE
TYPE(t_sym),INTENT(IN) :: sym
TYPE(t_cell),INTENT(IN) :: cell
@@ -38,7 +40,7 @@ CONTAINS
INTEGER,INTENT(OUT) :: ab_size
! ..
! .. Array Arguments ..
REAL, INTENT(IN) :: fj(:,:,:),gj(:,:,:)
REAL, DEVICE, INTENT(IN) :: fj(:,:,:),gj(:,:,:)
COMPLEX,DEVICE, INTENT (OUT) :: ab(:,:)
!Optional arguments if abc coef for LOs are needed
COMPLEX, INTENT(INOUT),OPTIONAL:: abclo(:,-atoms%llod:,:,:)
@@ -49,29 +51,28 @@ CONTAINS
COMPLEX,ALLOCATABLE :: ylm(:,:)
COMPLEX,ALLOCATABLE :: c_ph(:,:)
REAL, ALLOCATABLE :: gkrot(:,:)
LOGICAL :: l_apw
COMPLEX:: term
REAL, ALLOCATABLE,DEVICE :: fj_dev(:,:,:), gj_dev(:,:,:)
COMPLEX,ALLOCATABLE,DEVICE :: c_ph_dev(:,:)
COMPLEX,ALLOCATABLE,DEVICE :: ylm_dev(:,:)
REAL, ALLOCATABLE,DEVICE :: gkrot_dev(:,:)
INTEGER :: istat
ALLOCATE(fj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
ALLOCATE(gj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
call nvtxStartRange("hsmt_ab",2)
lmax=MERGE(atoms%lnonsph(n),atoms%lmax(n),l_nonsph)
ALLOCATE(c_ph_dev(lapw%nv(1),MERGE(2,1,noco%l_ss)))
ALLOCATE(ylm_dev(lapw%nv(1),(atoms%lmaxd+1)**2))
fj_dev(:,:,:)= fj(:,:,:)
gj_dev(:,:,:)= gj(:,:,:)
ALLOCATE(ylm_dev((lmax+1)**2,lapw%nv(1)))
ALLOCATE(gkrot_dev(3,lapw%nv(1)))
ALLOCATE(ylm(lapw%nv(1),(atoms%lmaxd+1)**2))
ALLOCATE(ylm((lmax+1)**2,lapw%nv(1)))
ALLOCATE(c_ph(lapw%nv(1),MERGE(2,1,noco%l_ss)))
ALLOCATE(gkrot(3,lapw%nv(1)))
lmax=MERGE(atoms%lnonsph(n),atoms%lmax(n),l_nonsph)
ab_size=lmax*(lmax+2)+1
l_apw=ALL(gj==0.0)
ab=0.0
np = sym%invtab(atoms%ngopr(na))
@@ -92,50 +93,64 @@ CONTAINS
END DO
END IF
!--> generate spherical harmonics
gkrot_dev = gkrot
!--> synthesize the complex conjugates of a and b
!!$cuf kernel do <<<*,256>>>
!DO k = 1,lapw%nv(1)
! !--> generate spherical harmonics
! CALL ylm4_dev(lmax,gkrot_dev(:,k),ylm_dev(:,k))
!ENDDO
DO k = 1,lapw%nv(1)
vmult(:) = gkrot(:,k)
CALL ylm4(lmax,vmult,ylm(k,:))
call ylm4(lmax,gkrot(:,k),ylm(:,k))
ENDDO
ylm_dev=ylm
ylm_dev = ylm
!--> synthesize the complex conjugates of a and b
call nvtxStartRange("hsmt_cuf",5)
!$cuf kernel do <<<*,256>>>
DO k = 1,lapw%nv(1)
!--> generate spherical harmonics
!CALL ylm4_dev(lmax,gkrot_dev(:,k),ylm_dev(:,k))
DO l = 0,lmax
ll1 = l* (l+1)
DO m = -l,l
ab(k,ll1+m+1) = CONJG(fj_dev(k,l+1,iintsp)*c_ph_dev(k,iintsp)*ylm_dev(k,ll1+m+1))
ab(k,ll1+m+1+ab_size) = CONJG(gj_dev(k,l+1,iintsp)*c_ph_dev(k,iintsp)*ylm_dev(k,ll1+m+1))
ab(k,ll1+m+1) = CONJG(fj(k,l+1,iintsp)*c_ph_dev(k,iintsp)*ylm_dev(ll1+m+1,k))
ab(k,ll1+m+1+ab_size) = CONJG(gj(k,l+1,iintsp)*c_ph_dev(k,iintsp)*ylm_dev(ll1+m+1,k))
END DO
END DO
ENDDO !k-loop
istat = cudaDeviceSynchronize()
call nvtxEndRange
IF (PRESENT(abclo)) THEN
DO k = 1,lapw%nv(1)
!determine also the abc coeffs for LOs
invsfct=MERGE(1,2,atoms%invsat(na).EQ.0)
term = fpi_const/SQRT(cell%omtil)* ((atoms%rmt(n)**2)/2)*c_ph(k,iintsp)
DO lo = 1,atoms%nlo(n)
l = atoms%llo(lo,n)
DO nkvec=1,invsfct*(2*l+1)
IF (lapw%kvec(nkvec,lo,na)==k) THEN !This k-vector is used in LO
ll1 = l*(l+1) + 1
DO m = -l,l
lm = ll1 + m
abclo(1,m,nkvec,lo) = term*ylm(k,lm)*alo1(lo)
abclo(2,m,nkvec,lo) = term*ylm(k,lm)*blo1(lo)
abclo(3,m,nkvec,lo) = term*ylm(k,lm)*clo1(lo)
END DO
END IF
ENDDO
ENDDO
ENDDO
print*, "Ooooops, TODO in hsmt_ab"
!DO k = 1,lapw%nv(1)
! !determine also the abc coeffs for LOs
! invsfct=MERGE(1,2,atoms%invsat(na).EQ.0)
! term = fpi_const/SQRT(cell%omtil)* ((atoms%rmt(n)**2)/2)*c_ph(k,iintsp)
! DO lo = 1,atoms%nlo(n)
! l = atoms%llo(lo,n)
! DO nkvec=1,invsfct*(2*l+1)
! IF (lapw%kvec(nkvec,lo,na)==k) THEN !This k-vector is used in LO
! ll1 = l*(l+1) + 1
! DO m = -l,l
! lm = ll1 + m
! abclo(1,m,nkvec,lo) = term*ylm(k,lm)*alo1(lo)
! abclo(2,m,nkvec,lo) = term*ylm(k,lm)*blo1(lo)
! abclo(3,m,nkvec,lo) = term*ylm(k,lm)*clo1(lo)
! END DO
! END IF
! ENDDO
! ENDDO
!ENDDO
ENDIF
IF (.NOT.l_apw) ab_size=ab_size*2
ab_size=ab_size*2
call nvtxEndRange
END SUBROUTINE hsmt_ab_gpu
#endif
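
The !$cuf kernel do directive used in hsmt_ab_gpu is the PGI/NVIDIA CUDA Fortran mechanism that turns a host loop over device-resident arrays into a GPU kernel. A self-contained toy example of the same mechanism (illustrative only, not part of this commit):

! Toy CUF kernel loop on a device array, with the explicit synchronization
! and implicit device-to-host copy used in the routine above.
PROGRAM cuf_demo
  USE cudafor
  IMPLICIT NONE
  INTEGER, PARAMETER :: n = 1024
  REAL, ALLOCATABLE, DEVICE :: x_dev(:)
  REAL :: x(n)
  INTEGER :: k, istat
  ALLOCATE(x_dev(n))
  !$cuf kernel do <<<*,256>>>
  DO k = 1, n
     x_dev(k) = SQRT(REAL(k))        ! runs on the GPU
  END DO
  istat = cudaDeviceSynchronize()    ! wait for the kernel to finish
  x = x_dev                          ! implicit device-to-host copy
  WRITE (*,*) x(1), x(n)
  DEALLOCATE(x_dev)
END PROGRAM cuf_demo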
@@ -70,29 +70,32 @@ CONTAINS
COMPLEX,ALLOCATABLE:: ab(:,:),ab1(:,:),ab2(:,:)
real :: rchi
#ifdef _CUDA
COMPLEX,ALLOCATABLE,DEVICE :: c_dev(:,:), ab1_dev(:,:), ab_dev(:,:)
COMPLEX,ALLOCATABLE,DEVICE :: c_dev(:,:), ab1_dev(:,:), ab_dev(:,:), ab2_dev(:,:)
COMPLEX,ALLOCATABLE,DEVICE :: h_loc_dev(:,:)
!REAL, ALLOCATABLE,DEVICE :: fj_dev(:,:,:), gj_dev(:,:,:)
REAL, ALLOCATABLE,DEVICE :: fj_dev(:,:,:), gj_dev(:,:,:)
integer :: i, j, istat
call nvtxStartRange("hsmt_nonsph",1)
print*, "running CUDA version"
#endif
print *, "nonsph start"
ALLOCATE(ab(MAXVAL(lapw%nv),2*atoms%lmaxd*(atoms%lmaxd+2)+2),ab1(lapw%nv(jintsp),2*atoms%lmaxd*(atoms%lmaxd+2)+2))
#ifdef _CUDA
ALLOCATE(h_loc_dev(size(td%h_loc,1),size(td%h_loc,2)))
ALLOCATE(ab1_dev(size(ab1,1),size(ab1,2)))
ALLOCATE(ab_dev(size(ab,1),size(ab,2)))
h_loc_dev(1:,1:) = CONJG(td%h_loc(0:,0:,n,isp)) !WORKAROUND, var_dev=CONJG(var_dev) does not work (pgi18.4)
!ALLOCATE(fj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
!ALLOCATE(gj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
!fj_dev(1:,1:,1:)= fj(1:,0:,1:)
!gj_dev(1:,1:,1:)= gj(1:,0:,1:)
h_loc_dev(1:,1:) = CONJG(td%h_loc(0:,0:,n,isp)) !WORKAROUND, var_dev=CONJG(var_dev) does not work
ALLOCATE(fj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
ALLOCATE(gj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
fj_dev(1:,1:,1:)= fj(1:,0:,1:)
gj_dev(1:,1:,1:)= gj(1:,0:,1:)
!note that basically all matrices in the GPU version are conjugates of their
!cpu counterparts
#endif
IF (iintsp.NE.jintsp) ALLOCATE(ab2(lapw%nv(iintsp),2*atoms%lmaxd*(atoms%lmaxd+2)+2))
#ifdef _CUDA
IF (iintsp.NE.jintsp) ALLOCATE(ab2_dev(lapw%nv(iintsp),2*atoms%lmaxd*(atoms%lmaxd+2)+2))
#endif
IF (hmat%l_real) THEN
IF (ANY(SHAPE(hmat%data_c)/=SHAPE(hmat%data_r))) THEN
@@ -110,16 +113,15 @@ CONTAINS
na = SUM(atoms%neq(:n-1))+nn
IF ((atoms%invsat(na)==0) .OR. (atoms%invsat(na)==1)) THEN
rchi=MERGE(REAL(chi),REAL(chi)*2,(atoms%invsat(na)==0))
#ifdef _CUDA
CALL hsmt_ab(sym,atoms,noco,isp,jintsp,n,na,cell,lapw,fj,gj,ab_dev,ab_size,.TRUE.)
! istat = cudaDeviceSynchronize()
CALL hsmt_ab(sym,atoms,noco,isp,jintsp,n,na,cell,lapw,fj_dev,gj_dev,ab_dev,ab_size,.TRUE.)
! istat = cudaDeviceSynchronize()
#else
CALL hsmt_ab(sym,atoms,noco,isp,jintsp,n,na,cell,lapw,fj,gj,ab,ab_size,.TRUE.)
#endif
!Calculate Hamiltonian
#ifdef _CUDA
!ab_dev = CONJG(ab)
CALL zgemm("N","N",lapw%nv(jintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab_dev,SIZE(ab_dev,1),h_loc_dev,SIZE(h_loc_dev,1),CMPLX(0.,0.),ab1_dev,SIZE(ab1_dev,1))
#else
CALL zgemm("N","N",lapw%nv(jintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab,SIZE(ab,1),td%h_loc(0:,0:,n,isp),SIZE(td%h_loc,1),CMPLX(0.,0.),ab1,SIZE(ab1,1))
@@ -127,7 +129,8 @@ CONTAINS
!ab1=MATMUL(ab(:lapw%nv(iintsp),:ab_size),td%h_loc(:ab_size,:ab_size,n,isp))
IF (iintsp==jintsp) THEN
#ifdef _CUDA
call nvtxStartRange("zherk",3)
call nvtxStartRange("zherk",3)
!ab1_dev=CONJG(ab1)
CALL ZHERK("U","N",lapw%nv(iintsp),ab_size,Rchi,ab1_dev,SIZE(ab1_dev,1),1.0,c_dev,SIZE(c_dev,1))
istat = cudaDeviceSynchronize()
call nvtxEndRange()
@@ -136,10 +139,24 @@ CONTAINS
#endif
ELSE !here the l_ss off-diagonal part starts
!Second set of ab is needed
#ifdef _CUDA
CALL hsmt_ab(sym,atoms,noco,isp,iintsp,n,na,cell,lapw,fj_dev,gj_dev,ab_dev,ab_size,.TRUE.)
#else
CALL hsmt_ab(sym,atoms,noco,isp,iintsp,n,na,cell,lapw,fj,gj,ab,ab_size,.TRUE.)
CALL zgemm("N","N",lapw%nv(iintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab,SIZE(ab,1),td%h_loc(:,:,n,isp),SIZE(td%h_loc,1),CMPLX(0.,0.),ab2,SIZE(ab2,1))
#endif
#ifdef _CUDA
CALL zgemm("N","N",lapw%nv(iintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab_dev,SIZE(ab_dev,1),h_loc_dev,SIZE(td%h_loc,1),CMPLX(0.,0.),ab2_dev,SIZE(ab2_dev,1))
#else
CALL zgemm("N","N",lapw%nv(iintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab,SIZE(ab,1),td%h_loc(0:,0:,n,isp),SIZE(td%h_loc,1),CMPLX(0.,0.),ab2,SIZE(ab2,1))
#endif
!Multiply for Hamiltonian
#ifdef _CUDA
ab1 = ab1_dev
ab1_dev=CONJG(ab1)
CALL zgemm("N","T",lapw%nv(iintsp),lapw%nv(jintsp),ab_size,chi,ab2_dev,SIZE(ab2_dev,1),ab1_dev,SIZE(ab1_dev,1),CMPLX(1.0,0.0),c_dev,SIZE(c_dev,1))
#else
CALL zgemm("N","T",lapw%nv(iintsp),lapw%nv(jintsp),ab_size,chi,conjg(ab2),SIZE(ab2,1),ab1,SIZE(ab1,1),CMPLX(1.0,0.0),hmat%data_c,SIZE(hmat%data_c,1))
#endif
ENDIF
ENDIF
END DO
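
The ZHERK call above accumulates the spin-diagonal (iintsp==jintsp) contribution as a Hermitian rank-k update, C := rchi*A*A**H + C, touching only the upper triangle. A tiny host-side illustration of the same BLAS pattern (plain ZHERK, not the cuBLAS device path used in this commit):

! Host-side illustration of the rank-k update pattern (not FLEUR code).
PROGRAM zherk_demo
  IMPLICIT NONE
  INTEGER, PARAMETER :: dp = KIND(0.d0), nv = 4, ab_size = 2
  COMPLEX(dp) :: ab1(nv, ab_size), c(nv, nv)
  ab1 = CMPLX(1.0_dp, 0.5_dp, dp)
  c   = (0.0_dp, 0.0_dp)
  ! c := 1.0 * ab1 * ab1**H + 0.0 * c, upper triangle only
  CALL zherk('U', 'N', nv, ab_size, 1.0_dp, ab1, nv, 0.0_dp, c, nv)
  WRITE (*,*) 'c(1,1) =', c(1,1)     ! only the upper triangle of c is defined
END PROGRAM zherk_demo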
@@ -28,6 +28,7 @@ MODULE m_types_mpimat
!> 7,8: row/colum of grid for first row/colum of matrix
!> 9: leading dimension of local matrix
INTEGER:: npcol,nprow !> the number of columns/rows in the processor grid
INTEGER:: mycol,myrow
END TYPE t_blacsdata
@@ -91,7 +92,7 @@ CONTAINS
SUBROUTINE generate_full_matrix(mat)
CLASS(t_mpimat),INTENT(INOUT) ::mat
INTEGER :: i,j,i_glob,j_glob,npcol,nprow,myid,err,myrow,mycol,np
INTEGER :: i,j,i_glob,j_glob,myid,err,np
COMPLEX,ALLOCATABLE:: tmp_c(:,:)
REAL,ALLOCATABLE :: tmp_r(:,:)
#ifdef CPP_SCALAPACK
@@ -100,16 +101,14 @@ CONTAINS
CALL MPI_COMM_RANK(mat%blacsdata%mpi_com,myid,err)
CALL MPI_COMM_SIZE(mat%blacsdata%mpi_com,np,err)
CALL blacs_gridinfo(mat%blacsdata%blacs_desc(2),nprow,npcol,myrow,mycol)
!Set lower part of matrix to zero
DO i=1,mat%matsize1
DO j=1,mat%matsize2
! Get global column corresponding to i and number of local rows up to
! and including the diagonal, these are unchanged in A
i_glob = indxl2g(i, mat%blacsdata%blacs_desc(5), myrow, 0, nprow)
j_glob = indxl2g(j, mat%blacsdata%blacs_desc(6), mycol, 0, npcol)
i_glob = indxl2g(i, mat%blacsdata%blacs_desc(5), mat%blacsdata%myrow, 0, mat%blacsdata%nprow)
j_glob = indxl2g(j, mat%blacsdata%blacs_desc(6), mat%blacsdata%mycol, 0, mat%blacsdata%npcol)
IF (i_glob>j_glob) THEN
IF (mat%l_real) THEN
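
indxl2g, used above, maps a local array index back to the corresponding global row/column index of the block-cyclic distribution (the inverse bookkeeping to numroc). A small stand-alone illustration with concrete numbers (not FLEUR code):

! indxl2g is pure index arithmetic; no BLACS grid is needed to call it.
PROGRAM indxl2g_demo
  IMPLICIT NONE
  INTEGER, EXTERNAL :: indxl2g
  INTEGER :: i
  ! block size 2, this process is row 1 of a 2-row grid, source row 0
  DO i = 1, 4
     WRITE (*,*) 'local row', i, '-> global row', indxl2g(i, 2, 1, 0, 2)
  END DO
  ! prints global rows 3, 4, 7, 8: process row 1 owns every second block
END PROGRAM indxl2g_demo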
@@ -299,9 +298,7 @@ CONTAINS
mat%blacsdata%no_use=1
mat%blacsdata%mpi_com=mpi_subcom
CALL priv_create_blacsgrid(mat%blacsdata%mpi_com,l_2d,matsize1,matsize2,nbx,nby,&
mat%blacsdata%blacs_desc,&
mat%matsize1,mat%matsize2,&
mat%blacsdata%npcol,mat%blacsdata%nprow)
mat%blacsdata,mat%matsize1,mat%matsize2)
CALL mat%alloc(l_real) !Attention,sizes determined in call to priv_create_blacsgrid
!check if this matrix is actually distributed over MPI_COMM_SELF
IF (mpi_subcom==MPI_COMM_SELF) THEN
@@ -333,21 +330,20 @@ CONTAINS
END SUBROUTINE mpimat_init_template
SUBROUTINE priv_create_blacsgrid(mpi_subcom,l_2d,m1,m2,nbc,nbr,sc_desc,local_size1,local_size2,nprow,npcol)
SUBROUTINE priv_create_blacsgrid(mpi_subcom,l_2d,m1,m2,nbc,nbr,blacsdata,local_size1,local_size2)
IMPLICIT NONE
INTEGER,INTENT(IN) :: mpi_subcom
INTEGER,INTENT(IN) :: m1,m2
INTEGER,INTENT(INOUT)::nbc,nbr
LOGICAL,INTENT(IN) :: l_2d
INTEGER,INTENT(OUT):: sc_desc(:)
type(t_blacsdata),INTENT(OUT)::blacsdata
INTEGER,INTENT(OUT):: local_size1,local_size2
INTEGER,INTENT(OUT):: npcol,nprow
#ifdef CPP_MPI
INCLUDE 'mpif.h'
INTEGER :: myrowssca,mycolssca,myrow,mycol
INTEGER :: iamblacs,npblacs,np,myid
INTEGER :: nprow2,npcol2,myrowblacs,mycolblacs
INTEGER :: myrowssca,mycolssca
INTEGER :: iamblacs,npblacs,np,myid,mycol,myrow
INTEGER :: nprow2,npcol2
INTEGER :: k,i,j,ictextblacs
INTEGER :: ierr
@@ -366,34 +362,34 @@ CONTAINS
IF (l_2d) THEN
distloop: DO j=INT(SQRT(REAL(np))),1,-1
IF ( (np/j) * j == np) THEN
npcol = np/j
nprow = j
blacsdata%npcol = np/j
blacsdata%nprow = j
EXIT distloop
ENDIF
ENDDO distloop
ELSE
nbc=1
nbr=MAX(m1,m2)
npcol=np
nprow=1
blacsdata%npcol=np
blacsdata%nprow=1
ENDIF
ALLOCATE(iblacsnums(np),ihelp(np),iusermap(nprow,npcol))
ALLOCATE(iblacsnums(np),ihelp(np),iusermap(blacsdata%nprow,blacsdata%npcol))
! An nprow*npcol processor grid will be created
! An blacsdata%nprow*blacsdata%npcol processor grid will be created
! Row and column index myrow, mycol of this processor in the grid
! and distribution of A and B in ScaLAPACK
! The local processor will get myrowssca rows and mycolssca columns
! of A and B
!
myrow = myid/npcol ! my row number in the BLACS nprow*npcol grid
mycol = myid -(myid/npcol)*npcol ! my column number in the BLACS nprow*npcol grid
myrow = myid/blacsdata%npcol ! my row number in the BLACS blacsdata%nprow*blacsdata%npcol grid
mycol = myid -(myid/blacsdata%npcol)*blacsdata%npcol ! my column number in the BLACS blacsdata%nprow*blacsdata%npcol grid
!
! Now allocate Asca to put the elements of Achi or receivebuffer to
!
myrowssca=(m1-1)/(nbr*nprow)*nbr+ MIN(MAX(m1-(m1-1)/(nbr*nprow)*nbr*nprow-nbr*myrow,0),nbr)
myrowssca=(m1-1)/(nbr*blacsdata%nprow)*nbr+ MIN(MAX(m1-(m1-1)/(nbr*blacsdata%nprow)*nbr*blacsdata%nprow-nbr*myrow,0),nbr)
! Number of rows the local process gets in ScaLAPACK distribution
mycolssca=(m2-1)/(nbc*npcol)*nbc+ MIN(MAX(m2-(m2-1)/(nbc*npcol)*nbc*npcol-nbc*mycol,0),nbc)
mycolssca=(m2-1)/(nbc*blacsdata%npcol)*nbc+ MIN(MAX(m2-(m2-1)/(nbc*blacsdata%npcol)*nbc*blacsdata%npcol-nbc*mycol,0),nbc)
!Get BLACS ranks for all MPI ranks
@@ -407,8 +403,8 @@ CONTAINS
! iblacsnums(i) is the BLACS-process number of MPI-process i-1
k = 1
DO i = 1, nprow
DO j = 1, npcol
DO i = 1, blacsdata%nprow
DO j = 1, blacsdata%npcol
iusermap(i,j) = iblacsnums(k)
k = k + 1
ENDDO
@@ -416,23 +412,23 @@ CONTAINS
!Get the Blacs default context
CALL BLACS_GET(0,0,ictextblacs)
! Create the Grid
CALL BLACS_GRIDMAP(ictextblacs,iusermap,nprow,nprow,npcol)
CALL BLACS_GRIDMAP(ictextblacs,iusermap,size(iusermap,1),blacsdata%nprow,blacsdata%npcol)
! Now control, whether the BLACS grid is the one we wanted
CALL BLACS_GRIDINFO(ictextblacs, nprow2,npcol2,myrowblacs,mycolblacs)
IF (nprow2 /= nprow) THEN
CALL BLACS_GRIDINFO(ictextblacs, nprow2,npcol2,blacsdata%myrow,blacsdata%mycol)
IF (nprow2 /= blacsdata%nprow) THEN
WRITE(6,*) 'Wrong number of rows in BLACS grid'
WRITE(6,*) 'nprow=',nprow,' nprow2=',nprow2
WRITE(6,*) 'nprow=',blacsdata%nprow,' nprow2=',nprow2
call judft_error('Wrong number of rows in BLACS grid')
ENDIF
IF (npcol2 /= npcol) THEN
IF (npcol2 /= blacsdata%npcol) THEN
WRITE(6,*) 'Wrong number of columns in BLACS grid'
WRITE(6,*) 'npcol=',npcol,' npcol2=',npcol2
WRITE(6,*) 'npcol=',blacsdata%npcol,' npcol2=',npcol2
call judft_error('Wrong number of columns in BLACS grid')
ENDIF
!Create the descriptors
CALL descinit(sc_desc,m1,m2,nbr,nbc,0,0,ictextblacs,myrowssca,ierr)
CALL descinit(blacsdata%blacs_desc,m1,m2,nbr,nbc,0,0,ictextblacs,myrowssca,ierr)
IF (ierr /=0 ) call judft_error('Creation of BLACS descriptor failed')
local_size1=myrowssca
local_size2=mycolssca
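
BLACS_GRIDMAP, used above in place of BLACS_GRIDINIT, builds the process grid from an explicit map of BLACS process numbers, which is what ties the grid to an arbitrary MPI subcommunicator. A minimal sketch of the call pattern (illustrative only; in the routine above iusermap is first gathered over the matrix communicator):

! Sketch of creating a BLACS grid from an explicit process map (not FLEUR code).
SUBROUTINE gridmap_sketch(iusermap, nprow, npcol, ictxt)
  IMPLICIT NONE
  INTEGER, INTENT(IN)  :: nprow, npcol
  INTEGER, INTENT(IN)  :: iusermap(nprow, npcol)  ! BLACS process numbers
  INTEGER, INTENT(OUT) :: ictxt
  INTEGER :: nprow2, npcol2, myrow, mycol
  CALL blacs_get(0, 0, ictxt)                     ! default system context
  ! third argument is the leading dimension of iusermap
  CALL blacs_gridmap(ictxt, iusermap, SIZE(iusermap, 1), nprow, npcol)
  CALL blacs_gridinfo(ictxt, nprow2, npcol2, myrow, mycol)
  IF (nprow2 /= nprow .OR. npcol2 /= npcol) STOP 'unexpected BLACS grid'
END SUBROUTINE gridmap_sketch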