diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index dd9c95e531f3d119f6facad5d62fd0a8604b14a0..57cd6e1a6730f333573af5ab7487f6e69650e623 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -33,7 +33,7 @@ test-gfortran: artifacts: when: on_failure paths: - - build/Testing/fails + - build/Testing/failed - build/Testing/test.oldlogs # only: # - schedules @@ -100,7 +100,7 @@ build-intel: - build.intel script: - set +e && source compilervars.sh intel64 && set -e ; ulimit -s unlimited - - cd /builds/fleur/fleur; FC=mpiifort FLEUR_LIBRARIES="-lmkl_scalapack_lp64;-lmkl_blacs_intelmpi_lp64" ./configure.sh -l intel AUTO ; cd build.intel; make + - cd /builds/fleur/fleur; FC=mpiifort FLEUR_LIBRARIES="-lmkl_scalapack_lp64;-lmkl_blacs_intelmpi_lp64" ./configure.sh -t -l intel AUTO ; cd build.intel; make only: - schedules - triggers @@ -119,7 +119,7 @@ test-intel: artifacts: when: on_failure paths: - - build/Testing/fails + - build/Testing/failed - build/Testing/test.oldlogs only: - schedules diff --git a/eigen/hsmt_nonsph.F90 b/eigen/hsmt_nonsph.F90 index aa3dac6bae250bc1e584d43dc031fbe5e3b3050f..3745b4c45fc0284e003def952e746991bbae29d0 100644 --- a/eigen/hsmt_nonsph.F90 +++ b/eigen/hsmt_nonsph.F90 @@ -40,6 +40,14 @@ CONTAINS USE m_constants, ONLY : fpi_const,tpi_const USE m_types USE m_ylm +#if defined (_CUDA) + ! cublas: required to use generic BLAS interface + ! cudafor: required to use CUDA runtime API routines (e.g. + ! cudaDeviceSynchronize) + USE cublas + USE cudafor + USE nvtx +#endif IMPLICIT NONE TYPE(t_mpi),INTENT(IN) :: mpi TYPE(t_sym),INTENT(IN) :: sym @@ -61,10 +69,25 @@ CONTAINS INTEGER:: nn,na,ab_size,l,ll,m COMPLEX,ALLOCATABLE:: ab(:,:),ab1(:,:),ab2(:,:) real :: rchi +#ifdef _CUDA + COMPLEX,ALLOCATABLE,DEVICE :: c_dev(:,:), ab1_dev(:,:), ab_dev(:,:) + COMPLEX,ALLOCATABLE,DEVICE :: h_loc_dev(:,:) + integer :: i, j, istat + call nvtxStartRange("hsmt_nonsph",1) + print*, "running CUDA version" +#endif - ALLOCATE(ab(MAXVAL(lapw%nv),2*atoms%lmaxd*(atoms%lmaxd+2)+2),ab1(lapw%nv(jintsp),2*atoms%lmaxd*(atoms%lmaxd+2)+2)) + ALLOCATE(ab(MAXVAL(lapw%nv),2*atoms%lmaxd*(atoms%lmaxd+2)+2),ab1(lapw%nv(iintsp),2*atoms%lmaxd*(atoms%lmaxd+2)+2)) +#ifdef _CUDA + ALLOCATE(h_loc_dev(size(td%h_loc,1),size(td%h_loc,2))) + ALLOCATE(ab1_dev(size(ab1,1),size(ab1,2))) + ALLOCATE(ab_dev(size(ab,1),size(ab,2))) + h_loc_dev(1:,1:) = CONJG(td%h_loc(0:,0:,n,isp)) !WORKAROUND, var_dev=CONJG(var_dev) does not work (pgi18.4) + !note that basically all matrices in the GPU version are conjugates of their + !cpu counterparts +#endif - IF (iintsp.NE.jintsp) ALLOCATE(ab2(lapw%nv(iintsp),2*atoms%lmaxd*(atoms%lmaxd+2)+2)) + IF (iintsp.NE.jintsp) ALLOCATE(ab2(lapw%nv(jintsp),2*atoms%lmaxd*(atoms%lmaxd+2)+2)) IF (hmat%l_real) THEN IF (ANY(SHAPE(hmat%data_c)/=SHAPE(hmat%data_r))) THEN @@ -73,33 +96,60 @@ CONTAINS ENDIF hmat%data_c=0.0 ENDIF +#ifdef _CUDA + ALLOCATE(c_dev(SIZE(hmat%data_c,1),SIZE(hmat%data_c,2))) + c_dev = hmat%data_c +#endif DO nn = 1,atoms%neq(n) na = SUM(atoms%neq(:n-1))+nn IF ((atoms%invsat(na)==0) .OR. (atoms%invsat(na)==1)) THEN rchi=MERGE(REAL(chi),REAL(chi)*2,(atoms%invsat(na)==0)) - CALL hsmt_ab(sym,atoms,noco,isp,jintsp,n,na,cell,lapw,fj,gj,ab,ab_size,.TRUE.) +!#ifdef _CUDA + !CALL hsmt_ab(sym,atoms,noco,isp,iintsp,n,na,cell,lapw,fj,gj,ab_dev,ab_size,.TRUE.) + ! istat = cudaDeviceSynchronize() +!#else + CALL hsmt_ab(sym,atoms,noco,isp,iintsp,n,na,cell,lapw,fj,gj,ab,ab_size,.TRUE.) +!#endif !Calculate Hamiltonian - CALL zgemm("N","N",lapw%nv(jintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab,SIZE(ab,1),td%h_loc(0:,0:,n,isp),SIZE(td%h_loc,1),CMPLX(0.,0.),ab1,SIZE(ab1,1)) +#ifdef _CUDA + ab_dev = CONJG(ab) + CALL zgemm("N","N",lapw%nv(iintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab_dev,SIZE(ab_dev,1),h_loc_dev,SIZE(h_loc_dev,1),CMPLX(0.,0.),ab1_dev,SIZE(ab1_dev,1)) +#else + CALL zgemm("N","N",lapw%nv(iintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab,SIZE(ab,1),td%h_loc(0:,0:,n,isp),SIZE(td%h_loc,1),CMPLX(0.,0.),ab1,SIZE(ab1,1)) +#endif !ab1=MATMUL(ab(:lapw%nv(iintsp),:ab_size),td%h_loc(:ab_size,:ab_size,n,isp)) IF (iintsp==jintsp) THEN +#ifdef _CUDA + call nvtxStartRange("zherk",3) + CALL ZHERK("U","N",lapw%nv(iintsp),ab_size,Rchi,ab1_dev,SIZE(ab1_dev,1),1.0,c_dev,SIZE(c_dev,1)) + istat = cudaDeviceSynchronize() + call nvtxEndRange() +#else CALL ZHERK("U","N",lapw%nv(iintsp),ab_size,Rchi,CONJG(ab1),SIZE(ab1,1),1.0,hmat%data_c,SIZE(hmat%data_c,1)) +#endif ELSE !here the l_ss off-diagonal part starts !Second set of ab is needed - CALL hsmt_ab(sym,atoms,noco,isp,iintsp,n,na,cell,lapw,fj,gj,ab,ab_size,.TRUE.) - CALL zgemm("N","N",lapw%nv(iintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab,SIZE(ab,1),td%h_loc(:,:,n,isp),SIZE(td%h_loc,1),CMPLX(0.,0.),ab2,SIZE(ab2,1)) + CALL hsmt_ab(sym,atoms,noco,isp,jintsp,n,na,cell,lapw,fj,gj,ab,ab_size,.TRUE.) + CALL zgemm("N","N",lapw%nv(jintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab,SIZE(ab,1),td%h_loc(:,:,n,isp),SIZE(td%h_loc,1),CMPLX(0.,0.),ab2,SIZE(ab2,1)) !Multiply for Hamiltonian - CALL zgemm("N","T",lapw%nv(iintsp),lapw%nv(jintsp),ab_size,chi,conjg(ab2),SIZE(ab2,1),ab1,SIZE(ab1,1),CMPLX(1.0,0.0),hmat%data_c,SIZE(hmat%data_c,1)) + CALL zgemm("N","T",lapw%nv(jintsp),lapw%nv(iintsp),ab_size,chi,conjg(ab2),SIZE(ab2,1),ab1,SIZE(ab1,1),CMPLX(1.0,0.0),hmat%data_c,SIZE(hmat%data_c,1)) ENDIF ENDIF END DO +#ifdef _CUDA + hmat%data_c = c_dev +#endif IF (hmat%l_real) THEN hmat%data_r=hmat%data_r+REAL(hmat%data_c) ENDIF - - END SUBROUTINE priv_noMPI + +#ifdef _CUDA + call nvtxEndRange +#endif + END SUBROUTINE priv_noMPI SUBROUTINE priv_MPI(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,td,fj,gj,hmat) @@ -130,7 +180,7 @@ CONTAINS COMPLEX,ALLOCATABLE:: ab(:,:),ab1(:,:),ab_select(:,:) real :: rchi - ALLOCATE(ab(MAXVAL(lapw%nv),2*atoms%lnonsph(n)*(atoms%lnonsph(n)+2)+2),ab1(lapw%nv(jintsp),2*atoms%lnonsph(n)*(atoms%lnonsph(n)+2)+2),ab_select(lapw%num_local_cols(jintsp),2*atoms%lnonsph(n)*(atoms%lnonsph(n)+2)+2)) + ALLOCATE(ab(MAXVAL(lapw%nv),2*atoms%lnonsph(n)*(atoms%lnonsph(n)+2)+2),ab1(lapw%nv(iintsp),2*atoms%lnonsph(n)*(atoms%lnonsph(n)+2)+2),ab_select(lapw%num_local_cols(jintsp),2*atoms%lnonsph(n)*(atoms%lnonsph(n)+2)+2)) !IF (iintsp.NE.jintsp) ALLOCATE(ab_select1(lapw%num_local_cols(jintsp),2*atoms%lnonsph(n)*(atoms%lnonsph(n)+2)+2)) @@ -147,20 +197,20 @@ CONTAINS IF ((atoms%invsat(na)==0) .OR. (atoms%invsat(na)==1)) THEN rchi=MERGE(REAL(chi),REAL(chi)*2,(atoms%invsat(na)==0)) - CALL hsmt_ab(sym,atoms,noco,isp,jintsp,n,na,cell,lapw,fj,gj,ab,ab_size,.TRUE.) + CALL hsmt_ab(sym,atoms,noco,isp,iintsp,n,na,cell,lapw,fj,gj,ab,ab_size,.TRUE.) !Calculate Hamiltonian - CALL zgemm("N","N",lapw%nv(jintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab,SIZE(ab,1),td%h_loc(0:,0:,n,isp),SIZE(td%h_loc,1),CMPLX(0.,0.),ab1,SIZE(ab1,1)) + CALL zgemm("N","N",lapw%nv(iintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab,SIZE(ab,1),td%h_loc(0:,0:,n,isp),SIZE(td%h_loc,1),CMPLX(0.,0.),ab1,SIZE(ab1,1)) !Cut out of ab1 only the needed elements here - ab_select=ab1(mpi%n_rank+1:lapw%nv(jintsp):mpi%n_size,:) + ab_select=ab1(mpi%n_rank+1:lapw%nv(iintsp):mpi%n_size,:) IF (iintsp==jintsp) THEN CALL zgemm("N","T",lapw%nv(iintsp),lapw%num_local_cols(iintsp),ab_size,CMPLX(rchi,0.0),CONJG(ab1),SIZE(ab1,1),ab_select,lapw%num_local_cols(iintsp),CMPLX(1.,0.0),hmat%data_c,SIZE(hmat%data_c,1)) ELSE !Second set of ab is needed - CALL hsmt_ab(sym,atoms,noco,isp,iintsp,n,na,cell,lapw,fj,gj,ab,ab_size,.TRUE.) + CALL hsmt_ab(sym,atoms,noco,isp,jintsp,n,na,cell,lapw,fj,gj,ab,ab_size,.TRUE.) CALL zgemm("N","N",lapw%nv(iintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab,SIZE(ab,1),td%h_loc(:,:,n,isp),SIZE(td%h_loc,1),CMPLX(0.,0.),ab1,SIZE(ab1,1)) !Multiply for Hamiltonian - CALL zgemm("N","T",lapw%nv(iintsp),lapw%num_local_cols(jintsp),ab_size,chi,conjg(ab1),SIZE(ab1,1),ab_select,lapw%num_local_cols(jintsp),CMPLX(1.,0.0),hmat%data_c,SIZE(hmat%data_c,1)) + CALL zgemm("N","t",lapw%nv(iintsp),lapw%num_local_cols(iintsp),ab_size,chi,conjg(ab1),SIZE(ab1,1),ab_select,lapw%num_local_cols(iintsp),CMPLX(1.,0.0),hmat%data_c,SIZE(hmat%data_c,1)) ENDIF ENDIF END DO diff --git a/global/CMakeLists.txt b/global/CMakeLists.txt index da0720fc7fabe323147760d80c346f06d59ee71b..8f58c04afa589cce1689de7dcdc196fbbc73cbcb 100644 --- a/global/CMakeLists.txt +++ b/global/CMakeLists.txt @@ -31,4 +31,5 @@ global/qfix.f90 global/radflo.F90 global/utility.F90 global/find_enpara.f90 +global/nvtx.F90 ) diff --git a/global/nvtx.F90 b/global/nvtx.F90 new file mode 100644 index 0000000000000000000000000000000000000000..3fc476043c7d1bb99cb8f8808c49757584e53839 --- /dev/null +++ b/global/nvtx.F90 @@ -0,0 +1,64 @@ +module nvtx +! See https://devblogs.nvidia.com/parallelforall/customize-cuda-fortran-profiling-nvtx/ +use iso_c_binding +implicit none + +integer,private :: col(7) = [ Z'0000ff00', Z'000000ff', Z'00ffff00', Z'00ff00ff', Z'0000ffff', Z'00ff0000', Z'00ffffff'] +character(len=256),private :: tempName + +type, bind(C):: nvtxEventAttributes + integer(C_INT16_T):: version=1 + integer(C_INT16_T):: size=48 ! + integer(C_INT):: category=0 + integer(C_INT):: colorType=1 ! NVTX_COLOR_ARGB = 1 + integer(C_INT):: color + integer(C_INT):: payloadType=0 ! NVTX_PAYLOAD_UNKNOWN = 0 + integer(C_INT):: reserved0 + integer(C_INT64_T):: payload ! union uint,int,double + integer(C_INT):: messageType=1 ! NVTX_MESSAGE_TYPE_ASCII = 1 + type(C_PTR):: message ! ascii char +end type + +interface nvtxRangePush + ! push range with custom label and standard color + subroutine nvtxRangePushA(name) bind(C, name='nvtxRangePushA') + use iso_c_binding + character(kind=C_CHAR,len=*) :: name + end subroutine + + ! push range with custom label and custom color + subroutine nvtxRangePushEx(event) bind(C, name='nvtxRangePushEx') + use iso_c_binding + import:: nvtxEventAttributes + type(nvtxEventAttributes):: event + end subroutine +end interface + +interface nvtxRangePop + subroutine nvtxRangePop() bind(C, name='nvtxRangePop') + end subroutine +end interface + +contains + +subroutine nvtxStartRange(name,id) + character(kind=c_char,len=*) :: name + integer, optional:: id + type(nvtxEventAttributes):: event + + tempName=trim(name)//c_null_char + + if ( .not. present(id)) then + call nvtxRangePush(tempName) + else + event%color=col(mod(id,7)+1) + event%message=c_loc(tempName) + call nvtxRangePushEx(event) + end if +end subroutine + +subroutine nvtxEndRange + call nvtxRangePop +end subroutine + +end module nvtx