Commit b599e1c4 authored by Miriam Hinzen

Solved merge conflict

parents 3fb39116 3e68dc4c
......@@ -29,9 +29,11 @@ test-gfortran:
paths:
- build
script:
- ulimit -s unlimited ;export juDFT_MPI="mpirun -n 2 --allow-run-as-root ";cd /builds/fleur/fleur/build;ctest
- ulimit -s unlimited ;export juDFT_MPI="mpirun -n 2 --allow-run-as-root ";export OMP_NUM_THREADS=4;cd /builds/fleur/fleur/build;ctest
artifacts:
when: on_failure
paths:
- build/Testing/failed
- build/Testing/test.oldlogs
# only:
# - schedules
......@@ -98,7 +100,7 @@ build-intel:
- build.intel
script:
- set +e && source compilervars.sh intel64 && set -e ; ulimit -s unlimited
- cd /builds/fleur/fleur; FC=mpiifort FLEUR_LIBRARIES="-lmkl_scalapack_lp64;-lmkl_blacs_intelmpi_lp64" ./configure.sh -l intel AUTO ; cd build.intel; make
- cd /builds/fleur/fleur; FC=mpiifort FLEUR_LIBRARIES="-lmkl_scalapack_lp64;-lmkl_blacs_intelmpi_lp64" ./configure.sh -t -l intel AUTO ; cd build.intel; make
only:
- schedules
- triggers
......@@ -114,6 +116,11 @@ test-intel:
script:
- set +e && source compilervars.sh intel64 && set -e; ulimit -s unlimited
- cd /builds/fleur/fleur/build.intel;ctest
artifacts:
when: on_failure
paths:
- build/Testing/failed
- build/Testing/test.oldlogs
only:
- schedules
- web
......
......@@ -51,12 +51,12 @@ inpgen/lapw_input.f inpgen/struct_input.f inpgen/write_struct.f
io/calculator.f global/ss_sym.f global/soc_sym.f math/inv3.f io/rw_symfile.f
global/sort.f kpoints/kptgen_hybrid.f kpoints/od_kptsgen.f kpoints/bravais.f kpoints/divi.f kpoints/brzone.f
kpoints/kptmop.f kpoints/kpttet.f init/bandstr1.F kpoints/ordstar.f kpoints/fulstar.f kpoints/kprep.f
kpoints/tetcon.f kpoints/kvecon.f init/boxdim.f math/ylm4.f global/radsra.f math/intgr.F global/differ.f math/inwint.f
kpoints/tetcon.f kpoints/kvecon.f init/boxdim.f global/radsra.f math/intgr.F global/differ.f math/inwint.f
math/outint.f xc-pot/gaunt.f math/grule.f
)
set(inpgen_F90 ${inpgen_F90} global/constants.f90 io/xsf_io.f90
eigen/vec_for_lo.f90 eigen/orthoglo.F90 juDFT/usage_data.F90
eigen/vec_for_lo.f90 eigen/orthoglo.F90 juDFT/usage_data.F90 math/ylm4.f90
global/enpara.f90 global/chkmt.f90 inpgen/inpgen.f90 inpgen/set_inp.f90 inpgen/inpgen_help.f90 io/rw_inp.f90 juDFT/juDFT.F90 global/find_enpara.f90
inpgen/closure.f90 inpgen/inpgen_arguments.F90
juDFT/info.F90 juDFT/stop.F90 juDFT/args.F90 juDFT/time.F90 juDFT/init.F90 juDFT/sysinfo.F90 io/w_inpXML.f90 kpoints/julia.f90 global/utility.F90
......
......@@ -11,14 +11,20 @@ if (${CMAKE_Fortran_COMPILER_ID} MATCHES "Intel")
else()
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -mkl -r8 -qopenmp -assume byterecl")
endif()
set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -xHost -O2")
set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -xHost -O2 -g")
set(CMAKE_Fortran_FLAGS_DEBUG "${CMAKE_Fortran_FLAGS_DEBUG} -C -traceback -O0 -g -ftrapuv -check uninit -check pointers -CB ")
elseif(${CMAKE_Fortran_COMPILER_ID} MATCHES "PGI")
message("PGI Fortran detected")
set(CMAKE_SHARED_LIBRARY_LINK_Fortran_FLAGS "") #fix problem in cmake
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -mp -Mr8 -Mr8intrinsics -Mcuda:kepler+ -ta:tesla:cuda7.5 -DUSE_STREAMS -DNUM_STREAMS=${N_STREAMS} -Minfo=accel -acc")
set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -O1") #"-fast -O3": problems with OpenMP, pgf90/18.4
set(CMAKE_Fortran_FLAGS_DEBUG "${CMAKE_Fortran_FLAGS_DEBUG} -C -traceback -O0 -g -Mchkstk -Mchkptr")
#CPU
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -mp -Mr8 -Mr8intrinsics")
#GPU
#set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -mp -Mr8 -Mr8intrinsics -Mcuda=cuda9.0,cc60 -Mcudalib=cublas")
#set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -mp -Mr8 -Mr8intrinsics -Mcuda:kepler+ -ta:tesla:cuda7.5 -DUSE_STREAMS -DNUM_STREAMS=${N_STREAMS} -Minfo=accel -acc")
#set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -mp -Mr8 -Mr8intrinsics -Mcuda:cuda9.0,cc70 -DUSE_STREAMS -DNUM_STREAMS=${N_STREAMS} -Minfo=accel -acc")
#set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -fast -O3")
set(CMAKE_Fortran_FLAGS_RELEASE "-O1 ") # to prevent cmake from putting -fast which auses problems with PGI18.4
set(CMAKE_Fortran_FLAGS_DEBUG "${CMAKE_Fortran_FLAGS_DEBUG} -C -traceback -O0 -g -Mchkstk -Mchkptr -Ktrap=fp")
elseif(${CMAKE_Fortran_COMPILER_ID} MATCHES "XL")
message("IBM/BG Fortran detected")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qsmp=omp -qnosave -qarch=qp -qtune=qp -qrealsize=8 -qfixed -qsuppress=1520-022 -qessl")
......
......@@ -2,13 +2,16 @@
if module list 2>&1 | grep -q intel
then
if ! module list 2>&1| grep -q intelmpi
if module list 2>&1| grep -q openmpi
then
echo "Please use intelmpi, e.g. do a module switch openmpi intelmpi"
exit
fi
export FC=$MPIFC
export CC=$MPICC
if module list 2>&1| grep -q intelmpi
then
export FC=$MPIFC
export CC=$MPICC
fi
#ELPA
if [ $ELPA_MODULES ]
then
......
......@@ -6,7 +6,7 @@ eigen/eigen.F90
eigen/hlomat.F90
eigen/hs_int.F90
eigen/hsmt_fjgj.F90
eigen/hsmt_ab.f90
eigen/hsmt_ab.F90
eigen/hsmt_sph.F90
eigen/hsmt_nonsph.F90
eigen/hsmt_spinor.F90
......@@ -34,6 +34,7 @@ eigen/vacfun.f90
eigen/vec_for_lo.f90
eigen/eigen_redist_matrix.f90
)
if (FLEUR_USE_GPU)
set(fleur_F90 ${fleur_F90} eigen/hsmt_nonsph_GPU.F90)
endif()
#if (FLEUR_USE_GPU)
# set(fleur_F90 ${fleur_F90}
#eigen/hsmt_nonsph_GPU.F90)
#endif()
......@@ -96,7 +96,8 @@ CONTAINS
CLASS(t_mat), ALLOCATABLE :: smat_unfold !used for unfolding bandstructure
! Variables for HF or hybrid functional calculation
INTEGER :: comm(kpts%nkpt),irank2(kpts%nkpt),isize2(kpts%nkpt)
INTEGER :: comm(kpts%nkpt),irank2(kpts%nkpt),isize2(kpts%nkpt), dealloc_stat
character(len=300) :: errmsg
call ud%init(atoms,DIMENSION%jspd)
ALLOCATE (eig(DIMENSION%neigd),bkpt(3))
......@@ -162,7 +163,12 @@ CONTAINS
l_wu=.FALSE.
ne_all=DIMENSION%neigd
if (allocated(zmat)) deallocate(zmat)
if (allocated(zmat)) then
deallocate(zmat, stat=dealloc_stat, errmsg=errmsg)
if(dealloc_stat /= 0) call juDFT_error("deallocate failed for zmat",&
hint=errmsg, calledby="eigen.F90")
endif
!Try to symmetrize matrix
CALL symmetrize_matrix(mpi,noco,kpts,nk,hmat,smat)
......@@ -184,7 +190,10 @@ CONTAINS
END IF
CALL eigen_diag(mpi,hmat,smat,nk,jsp,iter,ne_all,eig,zMat)
DEALLOCATE(hmat,smat)
CALL smat%free()
DEALLOCATE(hmat,smat, stat=dealloc_stat, errmsg=errmsg)
if(dealloc_stat /= 0) call juDFT_error("deallocate failed for hmat or smat",&
hint=errmsg, calledby="eigen.F90")
! Output results
CALL timestart("EV output")
......@@ -210,7 +219,9 @@ CONTAINS
IF (banddos%unfoldband) THEN
CALL calculate_plot_w_n(banddos,cell,kpts,smat_unfold,zMat,lapw,nk,jsp,eig,results,input,atoms)
DEALLOCATE(smat_unfold)
DEALLOCATE(smat_unfold, stat=dealloc_stat, errmsg=errmsg)
if(dealloc_stat /= 0) call juDFT_error("deallocate failed for smat_unfold",&
hint=errmsg, calledby="eigen.F90")
END IF
END DO k_loop
......
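The deallocation checks added above rely on standard Fortran semantics: ERRMSG= is only defined when STAT= returns a nonzero value, which is why the status is tested before the message is used. Below is a minimal stand-alone sketch of the same idiom; it is not part of the FLEUR sources, and a plain STOP stands in for juDFT_error.

PROGRAM dealloc_check
  IMPLICIT NONE
  REAL, ALLOCATABLE  :: zmat(:,:)
  INTEGER            :: dealloc_stat
  CHARACTER(len=300) :: errmsg

  ALLOCATE(zmat(100,100))
  IF (ALLOCATED(zmat)) THEN
     ! errmsg is only meaningful when dealloc_stat comes back nonzero
     DEALLOCATE(zmat, stat=dealloc_stat, errmsg=errmsg)
     IF (dealloc_stat /= 0) THEN
        WRITE(*,*) "deallocate failed for zmat: ", TRIM(errmsg)
        STOP 1
     END IF
  END IF
END PROGRAM dealloc_check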
......@@ -61,7 +61,7 @@ CONTAINS
DO i=1,nspins
DO j=1,nspins
CALL smat(i,j)%init(l_real,lapw%nv(i)+atoms%nlotot,lapw%nv(j)+atoms%nlotot,mpi%sub_comm,.false.)
CALL hmat(i,j)%init(l_real,lapw%nv(i)+atoms%nlotot,lapw%nv(j)+atoms%nlotot,mpi%sub_comm,.false.)
CALL hmat(i,j)%init(smat(i,j))
ENDDO
ENDDO
......@@ -94,7 +94,7 @@ CONTAINS
! In collinear case only a copy is done
! In the parallel case also a redistribution happens
CALL eigen_redist_matrix(mpi,lapw,atoms,smat,smat_final)
CALL eigen_redist_matrix(mpi,lapw,atoms,hmat,hmat_final)
CALL eigen_redist_matrix(mpi,lapw,atoms,hmat,hmat_final,smat_final)
END SUBROUTINE eigen_hssetup
END MODULE m_eigen_hssetup
......
......@@ -14,7 +14,7 @@ CONTAINS
!! In the non-collinear case, the 2x2 array of matrices is combined into the final matrix. Again a redistribution will happen in the parallel case
SUBROUTINE eigen_redist_matrix(mpi,lapw,atoms,mat,mat_final)
SUBROUTINE eigen_redist_matrix(mpi,lapw,atoms,mat,mat_final,mat_final_templ)
USE m_types
USE m_types_mpimat
IMPLICIT NONE
......@@ -23,33 +23,37 @@ CONTAINS
TYPE(t_atoms),INTENT(IN) :: atoms
CLASS(t_mat),INTENT(INOUT):: mat(:,:)
CLASS(t_mat),INTENT(INOUT):: mat_final
CLASS(t_mat),INTENT(IN),OPTIONAL :: mat_final_templ
INTEGER:: m
!determine final matrix size and allocate the final matrix
m=lapw%nv(1)+atoms%nlotot
IF (SIZE(mat)>1) m=m+lapw%nv(2)+atoms%nlotot
CALL mat_final%init(mat(1,1)%l_real,m,m,mpi%sub_comm,.TRUE.) !here the .true. creates a block-cyclic scalapack distribution
IF (.NOT.PRESENT(mat_final_templ)) THEN
CALL mat_final%init(mat(1,1)%l_real,m,m,mpi%sub_comm,.TRUE.) !here the .true. creates a block-cyclic scalapack distribution
ELSE
CALL mat_final%init(mat_final_templ)
ENDIF
!up-up component (or only component in collinear case)
IF (SIZE(mat)==1) THEN
CALL mat_final%move(mat(1,1))
CALL mat(1,1)%free()
IF (.NOT.PRESENT(mat_final_templ)) CALL mat(1,1)%free()
RETURN
ENDIF
CALL mat_final%copy(mat(1,1),1,1)
CALL mat(1,1)%free()
IF (.NOT.PRESENT(mat_final_templ)) CALL mat(1,1)%free()
!down-down component
CALL mat_final%copy(mat(2,2),lapw%nv(1)+atoms%nlotot+1,lapw%nv(1)+atoms%nlotot+1)
CALL mat(2,2)%free()
IF (.NOT.PRESENT(mat_final_templ)) CALL mat(2,2)%free()
!Now collect off-diagonal parts
CALL mat(1,2)%add_transpose(mat(2,1))
CALL mat_final%copy(mat(1,2),1,lapw%nv(1)+atoms%nlotot+1)
CALL mat(1,2)%free()
CALL mat(2,1)%free()
IF (.NOT.PRESENT(mat_final_templ)) CALL mat(1,2)%free()
IF (.NOT.PRESENT(mat_final_templ)) CALL mat(2,1)%free()
END SUBROUTINE eigen_redist_matrix
END MODULE m_eigen_redist_matrix
......
......@@ -6,8 +6,140 @@
MODULE m_hsmt_ab
use m_juDFT
implicit none
INTERFACE hsmt_ab
module procedure hsmt_ab_cpu
#ifdef _CUDA
module procedure hsmt_ab_gpu
#endif
END INTERFACE
CONTAINS
SUBROUTINE hsmt_ab(sym,atoms,noco,ispin,iintsp,n,na,cell,lapw,fj,gj,ab,ab_size,l_nonsph,abclo,alo1,blo1,clo1)
#ifdef _CUDA
SUBROUTINE hsmt_ab_gpu(sym,atoms,noco,ispin,iintsp,n,na,cell,lapw,fj,gj,ab,ab_size,l_nonsph,abclo,alo1,blo1,clo1)
!Calculate overlap matrix
USE m_constants, ONLY : fpi_const,tpi_const
USE m_types
USE m_ylm
USE m_apws
IMPLICIT NONE
TYPE(t_sym),INTENT(IN) :: sym
TYPE(t_cell),INTENT(IN) :: cell
TYPE(t_atoms),INTENT(IN) :: atoms
TYPE(t_lapw),INTENT(IN) :: lapw
TYPE(t_noco),INTENT(IN) :: noco
! ..
! .. Scalar Arguments ..
INTEGER, INTENT (IN) :: ispin,n,na,iintsp
LOGICAL,INTENT(IN) :: l_nonsph
INTEGER,INTENT(OUT) :: ab_size
! ..
! .. Array Arguments ..
REAL, INTENT(IN) :: fj(:,:,:),gj(:,:,:)
COMPLEX,DEVICE, INTENT (OUT) :: ab(:,:)
!Optional arguments if abc coef for LOs are needed
COMPLEX, INTENT(INOUT),OPTIONAL:: abclo(:,-atoms%llod:,:,:)
REAL,INTENT(IN),OPTIONAL:: alo1(:),blo1(:),clo1(:)
INTEGER:: np,k,l,ll1,m,lmax,nkvec,lo,lm,invsfct
REAL :: th,v(3),bmrot(3,3),vmult(3)
COMPLEX,ALLOCATABLE :: ylm(:,:)
COMPLEX,ALLOCATABLE :: c_ph(:,:)
REAL, ALLOCATABLE :: gkrot(:,:)
LOGICAL :: l_apw
COMPLEX:: term
REAL, ALLOCATABLE,DEVICE :: fj_dev(:,:,:), gj_dev(:,:,:)
COMPLEX,ALLOCATABLE,DEVICE :: c_ph_dev(:,:)
COMPLEX,ALLOCATABLE,DEVICE :: ylm_dev(:,:)
ALLOCATE(fj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
ALLOCATE(gj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
ALLOCATE(c_ph_dev(lapw%nv(1),MERGE(2,1,noco%l_ss)))
ALLOCATE(ylm_dev(lapw%nv(1),(atoms%lmaxd+1)**2))
fj_dev(:,:,:)= fj(:,:,:)
gj_dev(:,:,:)= gj(:,:,:)
ALLOCATE(ylm(lapw%nv(1),(atoms%lmaxd+1)**2))
ALLOCATE(c_ph(lapw%nv(1),MERGE(2,1,noco%l_ss)))
ALLOCATE(gkrot(3,lapw%nv(1)))
lmax=MERGE(atoms%lnonsph(n),atoms%lmax(n),l_nonsph)
ab_size=lmax*(lmax+2)+1
l_apw=ALL(gj==0.0)
ab=0.0
np = sym%invtab(atoms%ngopr(na))
!---> set up phase factors
CALL lapw%phase_factors(iintsp,atoms%taual(:,na),noco%qss,c_ph(:,iintsp))
c_ph_dev=c_ph
IF (np==1) THEN
gkrot(:, 1:lapw%nv(iintsp)) = lapw%gk(:, 1:lapw%nv(iintsp),iintsp)
ELSE
bmrot=MATMUL(1.*sym%mrot(:,:,np),cell%bmat)
DO k = 1,lapw%nv(iintsp)
!--> apply the rotation that brings this atom into the
!--> representative (this is the definition of ngopr(na)
!--> and transform to cartesian coordinates
v(:) = lapw%vk(:,k,iintsp)
gkrot(:,k) = MATMUL(TRANSPOSE(bmrot),v)
END DO
END IF
!--> generate spherical harmonics
DO k = 1,lapw%nv(1)
vmult(:) = gkrot(:,k)
CALL ylm4(lmax,vmult,ylm(k,:))
ENDDO
ylm_dev=ylm
!--> synthesize the complex conjugates of a and b
!$cuf kernel do <<<*,256>>>
DO k = 1,lapw%nv(1)
DO l = 0,lmax
ll1 = l* (l+1)
DO m = -l,l
ab(k,ll1+m+1) = CONJG(fj_dev(k,l+1,iintsp)*c_ph_dev(k,iintsp)*ylm_dev(k,ll1+m+1))
ab(k,ll1+m+1+ab_size) = CONJG(gj_dev(k,l+1,iintsp)*c_ph_dev(k,iintsp)*ylm_dev(k,ll1+m+1))
END DO
END DO
ENDDO !k-loop
IF (PRESENT(abclo)) THEN
DO k = 1,lapw%nv(1)
!determine also the abc coeffs for LOs
invsfct=MERGE(1,2,atoms%invsat(na).EQ.0)
term = fpi_const/SQRT(cell%omtil)* ((atoms%rmt(n)**2)/2)*c_ph(k,iintsp)
DO lo = 1,atoms%nlo(n)
l = atoms%llo(lo,n)
DO nkvec=1,invsfct*(2*l+1)
IF (lapw%kvec(nkvec,lo,na)==k) THEN !This k-vector is used in LO
ll1 = l*(l+1) + 1
DO m = -l,l
lm = ll1 + m
abclo(1,m,nkvec,lo) = term*ylm(k,lm)*alo1(lo)
abclo(2,m,nkvec,lo) = term*ylm(k,lm)*blo1(lo)
abclo(3,m,nkvec,lo) = term*ylm(k,lm)*clo1(lo)
END DO
END IF
ENDDO
ENDDO
ENDDO
ENDIF
IF (.NOT.l_apw) ab_size=ab_size*2
END SUBROUTINE hsmt_ab_gpu
#endif
SUBROUTINE hsmt_ab_cpu(sym,atoms,noco,ispin,iintsp,n,na,cell,lapw,fj,gj,ab,ab_size,l_nonsph,abclo,alo1,blo1,clo1)
!Calculate overlap matrix
USE m_constants, ONLY : fpi_const,tpi_const
USE m_types
......@@ -33,15 +165,15 @@ CONTAINS
REAL,INTENT(IN),OPTIONAL:: alo1(:),blo1(:),clo1(:)
INTEGER:: np,k,l,ll1,m,lmax,nkvec,lo,lm,invsfct
complex:: term
real :: th,v(3),bmrot(3,3),vmult(3)
COMPLEX:: term
REAL :: th,v(3),bmrot(3,3),vmult(3)
COMPLEX :: ylm((atoms%lmaxd+1)**2)
complex,allocatable:: c_ph(:,:)
real,allocatable :: gkrot(:,:)
COMPLEX,ALLOCATABLE:: c_ph(:,:)
REAL,ALLOCATABLE :: gkrot(:,:)
LOGICAL :: l_apw
ALLOCATE(c_ph(maxval(lapw%nv),MERGE(2,1,noco%l_ss)))
ALLOCATE(gkrot(3,MAXVAL(lapw%nv)))
ALLOCATE(c_ph(lapw%nv(1),MERGE(2,1,noco%l_ss)))
ALLOCATE(gkrot(3,lapw%nv(1)))
lmax=MERGE(atoms%lnonsph(n),atoms%lmax(n),l_nonsph)
......@@ -106,5 +238,5 @@ CONTAINS
!$OMP END PARALLEL DO
IF (.NOT.l_apw) ab_size=ab_size*2
END SUBROUTINE hsmt_ab
END SUBROUTINE hsmt_ab_cpu
END MODULE m_hsmt_ab
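The interface block introduced at the top of this module is what lets the call sites stay unchanged: in a CUDA build the GPU specific is compiled in, and generic resolution picks it whenever the output array carries the DEVICE attribute. A reduced sketch of that dispatch pattern follows; the names are illustrative, not from FLEUR, and it relies on the PGI/NVIDIA CUDA Fortran extension (compile with -Mcuda when _CUDA is defined), which resolves generics on the DEVICE attribute.

MODULE m_fill_sketch
  IMPLICIT NONE
  INTERFACE fill
     MODULE PROCEDURE fill_cpu
#ifdef _CUDA
     MODULE PROCEDURE fill_gpu
#endif
  END INTERFACE
CONTAINS
  SUBROUTINE fill_cpu(ab)
     COMPLEX, INTENT(OUT) :: ab(:)          ! host array -> this specific is selected
     ab = (1.0,0.0)
  END SUBROUTINE fill_cpu
#ifdef _CUDA
  SUBROUTINE fill_gpu(ab)
     COMPLEX, DEVICE, INTENT(OUT) :: ab(:)  ! device array -> this specific is selected
     ab = (2.0,0.0)
  END SUBROUTINE fill_gpu
#endif
END MODULE m_fill_sketch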
......@@ -40,6 +40,14 @@ CONTAINS
USE m_constants, ONLY : fpi_const,tpi_const
USE m_types
USE m_ylm
#if defined (_CUDA)
! cublas: required to use generic BLAS interface
! cudafor: required to use CUDA runtime API routines (e.g.
! cudaDeviceSynchronize)
USE cublas
USE cudafor
USE nvtx
#endif
IMPLICIT NONE
TYPE(t_mpi),INTENT(IN) :: mpi
TYPE(t_sym),INTENT(IN) :: sym
......@@ -61,8 +69,28 @@ CONTAINS
INTEGER:: nn,na,ab_size,l,ll,m
COMPLEX,ALLOCATABLE:: ab(:,:),ab1(:,:),ab2(:,:)
real :: rchi
#ifdef _CUDA
COMPLEX,ALLOCATABLE,DEVICE :: c_dev(:,:), ab1_dev(:,:), ab_dev(:,:)
COMPLEX,ALLOCATABLE,DEVICE :: h_loc_dev(:,:)
!REAL, ALLOCATABLE,DEVICE :: fj_dev(:,:,:), gj_dev(:,:,:)
integer :: i, j, istat
call nvtxStartRange("hsmt_nonsph",1)
print*, "running CUDA version"
#endif
ALLOCATE(ab(MAXVAL(lapw%nv),2*atoms%lmaxd*(atoms%lmaxd+2)+2),ab1(lapw%nv(jintsp),2*atoms%lmaxd*(atoms%lmaxd+2)+2))
#ifdef _CUDA
ALLOCATE(h_loc_dev(size(td%h_loc,1),size(td%h_loc,2)))
ALLOCATE(ab1_dev(size(ab1,1),size(ab1,2)))
ALLOCATE(ab_dev(size(ab,1),size(ab,2)))
h_loc_dev(1:,1:) = CONJG(td%h_loc(0:,0:,n,isp)) !WORKAROUND, var_dev=CONJG(var_dev) does not work (pgi18.4)
!ALLOCATE(fj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
!ALLOCATE(gj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
!fj_dev(1:,1:,1:)= fj(1:,0:,1:)
!gj_dev(1:,1:,1:)= gj(1:,0:,1:)
!note that basically all matrices in the GPU version are conjugates of their
!cpu counterparts
#endif
IF (iintsp.NE.jintsp) ALLOCATE(ab2(lapw%nv(iintsp),2*atoms%lmaxd*(atoms%lmaxd+2)+2))
......@@ -73,18 +101,39 @@ CONTAINS
ENDIF
hmat%data_c=0.0
ENDIF
#ifdef _CUDA
ALLOCATE(c_dev(SIZE(hmat%data_c,1),SIZE(hmat%data_c,2)))
c_dev = hmat%data_c
#endif
DO nn = 1,atoms%neq(n)
na = SUM(atoms%neq(:n-1))+nn
IF ((atoms%invsat(na)==0) .OR. (atoms%invsat(na)==1)) THEN
rchi=MERGE(REAL(chi),REAL(chi)*2,(atoms%invsat(na)==0))
#ifdef _CUDA
CALL hsmt_ab(sym,atoms,noco,isp,jintsp,n,na,cell,lapw,fj,gj,ab_dev,ab_size,.TRUE.)
! istat = cudaDeviceSynchronize()
#else
CALL hsmt_ab(sym,atoms,noco,isp,jintsp,n,na,cell,lapw,fj,gj,ab,ab_size,.TRUE.)
#endif
!Calculate Hamiltonian
#ifdef _CUDA
!ab_dev = CONJG(ab)
CALL zgemm("N","N",lapw%nv(jintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab_dev,SIZE(ab_dev,1),h_loc_dev,SIZE(h_loc_dev,1),CMPLX(0.,0.),ab1_dev,SIZE(ab1_dev,1))
#else
CALL zgemm("N","N",lapw%nv(jintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab,SIZE(ab,1),td%h_loc(0:,0:,n,isp),SIZE(td%h_loc,1),CMPLX(0.,0.),ab1,SIZE(ab1,1))
#endif
!ab1=MATMUL(ab(:lapw%nv(iintsp),:ab_size),td%h_loc(:ab_size,:ab_size,n,isp))
IF (iintsp==jintsp) THEN
#ifdef _CUDA
call nvtxStartRange("zherk",3)
CALL ZHERK("U","N",lapw%nv(iintsp),ab_size,Rchi,ab1_dev,SIZE(ab1_dev,1),1.0,c_dev,SIZE(c_dev,1))
istat = cudaDeviceSynchronize()
call nvtxEndRange()
#else
CALL ZHERK("U","N",lapw%nv(iintsp),ab_size,Rchi,CONJG(ab1),SIZE(ab1,1),1.0,hmat%data_c,SIZE(hmat%data_c,1))
#endif
ELSE !here the l_ss off-diagonal part starts
!Second set of ab is needed
CALL hsmt_ab(sym,atoms,noco,isp,iintsp,n,na,cell,lapw,fj,gj,ab,ab_size,.TRUE.)
......@@ -94,12 +143,18 @@ CONTAINS
ENDIF
ENDIF
END DO
#ifdef _CUDA
hmat%data_c = c_dev
#endif
IF (hmat%l_real) THEN
hmat%data_r=hmat%data_r+REAL(hmat%data_c)
ENDIF
END SUBROUTINE priv_noMPI
#ifdef _CUDA
call nvtxEndRange
#endif
END SUBROUTINE priv_noMPI
SUBROUTINE priv_MPI(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,td,fj,gj,hmat)
......@@ -160,7 +215,7 @@ CONTAINS
CALL hsmt_ab(sym,atoms,noco,isp,iintsp,n,na,cell,lapw,fj,gj,ab,ab_size,.TRUE.)
CALL zgemm("N","N",lapw%nv(iintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab,SIZE(ab,1),td%h_loc(:,:,n,isp),SIZE(td%h_loc,1),CMPLX(0.,0.),ab1,SIZE(ab1,1))
!Multiply for Hamiltonian
CALL zgemm("N","T",lapw%nv(iintsp),lapw%num_local_cols(jintsp),ab_size,chi,conjg(ab1),SIZE(ab1,1),ab_select,lapw%num_local_cols(jintsp),CMPLX(1.,0.0),hmat%data_c,SIZE(hmat%data_c,1))
CALL zgemm("N","t",lapw%nv(iintsp),lapw%num_local_cols(jintsp),ab_size,chi,conjg(ab1),SIZE(ab1,1),ab_select,lapw%num_local_cols(jintsp),CMPLX(1.,0.0),hmat%data_c,SIZE(hmat%data_c,1))
ENDIF
ENDIF
END DO
......
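For orientation, the GPU path above follows the usual CUDA Fortran workflow: allocate DEVICE copies, transfer data by assignment, let the cublas module's overloads run ZGEMM/ZHERK on the device, and copy the result back. The following is a much reduced, self-contained sketch of that flow, not taken from FLEUR; sizes and names are illustrative, explicit double-precision kinds replace FLEUR's global -r8 flag, and it needs pgfortran/nvfortran with -Mcuda and cublas.

PROGRAM herk_on_gpu
  USE cublas                       ! device-array overloads of the standard BLAS names
  IMPLICIT NONE
  INTEGER, PARAMETER :: dp = KIND(1.0d0)
  INTEGER, PARAMETER :: n = 64, k = 32
  COMPLEX(dp)        :: a(n,k), c(n,n)
  COMPLEX(dp), ALLOCATABLE, DEVICE :: a_dev(:,:), c_dev(:,:)

  a = (1.0_dp,0.0_dp);  c = (0.0_dp,0.0_dp)
  ALLOCATE(a_dev(n,k), c_dev(n,n))
  a_dev = a                        ! host -> device transfer by assignment
  c_dev = c
  ! rank-k update c := alpha*a*a^H + beta*c, executed on the GPU
  CALL ZHERK("U","N",n,k,1.0_dp,a_dev,n,0.0_dp,c_dev,n)
  c = c_dev                        ! device -> host copy of the result
  WRITE(*,*) 'c(1,1) =', c(1,1)
END PROGRAM herk_on_gpu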
set(fleur_F77 ${fleur_F77}
)
set(fleur_F90 ${fleur_F90}
eigen_soc/abclocdn_soc.F90
eigen_soc/abcof_soc.F90
eigen_soc/alineso.F90
eigen_soc/anglso.f90
......
!--------------------------------------------------------------------------------
! Copyright (c) 2016 Peter Grünberg Institut, Forschungszentrum Jülich, Germany
! This file is part of FLEUR and available as free software under the conditions
! of the MIT license as expressed in the LICENSE file in more detail.
!--------------------------------------------------------------------------------
MODULE m_abclocdn_soc
USE m_juDFT
!*********************************************************************
! Calculates the (upper case) A, B and C coefficients for the local
! orbitals. The difference to abccoflo is that a summation over the
! Gs is performed. The A, B and C coeff. are set up for each eigen-
! state.
! Philipp Kurz 99/04
!*********************************************************************
!*************** ABBREVIATIONS ***************************************
! nkvec : stores the number of G-vectors that have been found and
! accepted during the construction of the local orbitals.
! kvec : k-vector used in hssphn to attach the local orbital 'lo'
! of atom 'na' to it.
!*********************************************************************
CONTAINS
SUBROUTINE abclocdn_soc(atoms,sym,noco,lapw,cell,ccchi,iintsp,phase,ylm,&
ntyp,na,na_l,k,nkvec,lo,ne,alo1,blo1,clo1,acof,bcof,ccof,zMat,l_force,fgp,force)
USE m_types
USE m_constants
IMPLICIT NONE
TYPE(t_noco), INTENT(IN) :: noco
TYPE(t_sym), INTENT(IN) :: sym
TYPE(t_atoms), INTENT(IN) :: atoms
TYPE(t_lapw), INTENT(IN) :: lapw
TYPE(t_cell), INTENT(IN) :: cell
TYPE(t_mat), INTENT(IN) :: zMat
TYPE(t_force), OPTIONAL, INTENT(INOUT) :: force
! .. Scalar Arguments ..
INTEGER, INTENT (IN) :: iintsp
INTEGER, INTENT (IN) :: k,na,na_l,ne,ntyp,nkvec,lo
COMPLEX, INTENT (IN) :: phase
LOGICAL, INTENT (IN) :: l_force
! .. Array Arguments ..
REAL, INTENT (IN) :: alo1(:),blo1(:),clo1(:)
COMPLEX, INTENT (IN) :: ylm( (atoms%lmaxd+1)**2 )
COMPLEX, INTENT (IN) :: ccchi(2)
COMPLEX, INTENT (INOUT) :: acof(:,0:,:)!(nobd,0:dimension%lmd,atoms%nat_l)
COMPLEX, INTENT (INOUT) :: bcof(:,0:,:)!(nobd,0:dimension%lmd,atoms%nat_l)
COMPLEX, INTENT (INOUT) :: ccof(-atoms%llod:,:,:,:)!(-atoms%llod:atoms%llod,nobd,atoms%nlod,atoms%nat_l)
REAL, OPTIONAL, INTENT (IN) :: fgp(3)
! .. Local Scalars ..
COMPLEX ctmp,term1
INTEGER i,j,l,ll1,lm,nbasf,m,na2,lmp
! ..
! ..
term1 = 2 * tpi_const/SQRT(cell%omtil) * ((atoms%rmt(ntyp)**2)/2) * phase
!
!---> the whole program is in hartree units, therefore 1/wronskian is
!---> (rmt**2)/2. the factor i**l, which usually appears in the a, b
!---> and c coefficients, is included in the t-matrices. thus, it does
!---> not show up in the formula above.
IF ((atoms%invsat(na)==0).OR.(atoms%invsat(na)==1)) THEN
na2=na
ELSE
na2 = sym%invsatnr(na)
ENDIF
nbasf=lapw%nv(iintsp)+lapw%index_lo(lo,na2)+nkvec
l = atoms%llo(lo,ntyp)
ll1 = l* (l+1)
DO i = 1,ne
DO m = -l,l
lm = ll1 + m
!+gu_con
IF ((atoms%invsat(na)==0).OR.(atoms%invsat(na)==1)) THEN
IF (zMat%l_real) THEN
ctmp = zMat%data_r(nbasf,i)*term1*CONJG(ylm(ll1+m+1))
ELSE
ctmp = zMat%data_c(nbasf,i)*term1*CONJG(ylm(ll1+m+1))
ENDIF
acof(i,lm,na_l) = acof(i,lm,na_l) + ctmp*alo1(lo)
bcof(i,lm,na_l) = bcof(i,lm,na_l) + ctmp*blo1(lo)
ccof(m,i,lo,na_l) = ccof(m,i,lo,na_l) + ctmp*clo1(lo)
ELSE
ctmp = zMat%data_c(nbasf,i)*CONJG(term1)*ylm(ll1+m+1)*(-1)**(l-m)
lmp = ll1 - m
acof(i,lmp,na_l) = acof(i,lmp,na_l) +ctmp*alo1(lo)
bcof(i,lmp,na_l) = bcof(i,lmp,na_l) +ctmp*blo1(lo)
ccof(-m,i,lo,na_l) = ccof(-m,i,lo,na_l) +ctmp*clo1(lo)
ENDIF
END DO
END DO
END SUBROUTINE abclocdn_soc
END MODULE m_abclocdn_soc
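Written out, the prefactor assembled in term1 above is the same one that hsmt_ab builds into its variable term (this only restates the comment in the routine; all names are from the sources):

   term1 = 2*tpi_const/SQRT(omtil) * (rmt**2)/2 * phase
         = fpi_const/SQRT(omtil) * (rmt**2)/2 * phase

i.e. 4*pi over the square root of the unit-cell volume times R_MT**2/2, where 1/wronskian reduces to rmt**2/2 in Hartree units and the usual i**l factor is carried by the t-matrices instead.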
......@@ -155,9 +155,9 @@ CONTAINS
!
! set up A and B coefficients
!
ALLOCATE ( ahelp(atoms%lmaxd*(atoms%lmaxd+2),nat_l,DIMENSION%neigd,DIMENSION%jspd) )
ALLOCATE ( bhelp(atoms%lmaxd*(atoms%lmaxd+2),nat_l,DIMENSION%neigd,DIMENSION%jspd) )
ALLOCATE ( chelp(-atoms%llod :atoms%llod, DIMENSION%neigd,atoms%nlod,nat_l,DIMENSION%jspd) )
ALLOCATE ( ahelp(atoms%lmaxd*(atoms%lmaxd+2),nat_l,DIMENSION%neigd,input%jspins) )
ALLOCATE ( bhelp(atoms%lmaxd*(atoms%lmaxd+2),nat_l,DIMENSION%neigd,input%jspins) )
ALLOCATE ( chelp(-atoms%llod :atoms%llod, DIMENSION%neigd,atoms%nlod,nat_l,input%jspins) )
CALL timestart("alineso SOC: -help")
write(*,*) nat_start,nat_stop,nat_l
CALL hsohelp(&
......@@ -177,9 +177,10 @@ CONTAINS
CALL MPI_BARRIER(mpi%MPI_COMM,ierr)
#endif
ALLOCATE ( hsomtx(DIMENSION%neigd,DIMENSION%neigd,2,2) )
CALL hsoham(atoms,noco,input,nsz,chelp,rsoc,ahelp,bhelp,&
CALL hsoham(atoms,noco,input,nsz,dimension%neigd,chelp,rsoc,ahelp,bhelp,&
nat_start,nat_stop,mpi%n_rank,mpi%n_size,mpi%SUB_COMM,&
hsomtx)
write(*,*) 'after hsoham'
DEALLOCATE ( ahelp,bhelp,chelp )
CALL timestop("alineso SOC: -ham")
IF (mpi%n_rank==0) THEN
......
......@@ -130,7 +130,7 @@ CONTAINS
n_stride = 1
#endif
n_end = kpts%nkpt
write(*,'(4i12)') mpi%irank, mpi%n_groups, n_stride, mpi%n_start
!write(*,'(4i12)') mpi%irank, mpi%n_groups, n_stride, mpi%n_start
!
!---> start loop k-pts
!
......
......@@ -6,7 +6,7 @@ MODULE m_hsoham
!
CONTAINS
SUBROUTINE hsoham(&
atoms,noco,input,nsz,chelp,rsoc,ahelp,bhelp,&
atoms,noco,input,nsz,neigd,chelp,rsoc,ahelp,bhelp,&
nat_start,nat_stop,n_rank,n_size,SUB_COMM,&
hsomtx)
......@@ -25,20 +25,20 @@ CONTAINS
! ..
! .. Scalar Arguments ..
! ..
INTEGER, INTENT (IN) :: nat_start,nat_stop,n_rank,n_size,SUB_COMM
INTEGER, INTENT (IN) :: nat_start,nat_stop,n_rank,n_size,SUB_COMM,neigd
! .. Array Arguments ..
INTEGER, INTENT (IN) :: nsz(:)!(dimension%jspd)