Commit cc1a4deb authored by Matthias Redies's avatar Matthias Redies

merge develop

parents 6525ed7f 01d3cc4f
......@@ -51,12 +51,12 @@ inpgen/lapw_input.f inpgen/struct_input.f inpgen/write_struct.f
io/calculator.f global/ss_sym.f global/soc_sym.f math/inv3.f io/rw_symfile.f
global/sort.f kpoints/kptgen_hybrid.f kpoints/od_kptsgen.f kpoints/bravais.f kpoints/divi.f kpoints/brzone.f
kpoints/kptmop.f kpoints/kpttet.f init/bandstr1.F kpoints/ordstar.f kpoints/fulstar.f kpoints/kprep.f
kpoints/tetcon.f kpoints/kvecon.f init/boxdim.f math/ylm4.f global/radsra.f math/intgr.F global/differ.f math/inwint.f
kpoints/tetcon.f kpoints/kvecon.f init/boxdim.f global/radsra.f math/intgr.F global/differ.f math/inwint.f
math/outint.f xc-pot/gaunt.f math/grule.f
)
set(inpgen_F90 ${inpgen_F90} global/constants.f90 io/xsf_io.f90
eigen/vec_for_lo.f90 eigen/orthoglo.F90 juDFT/usage_data.F90
eigen/vec_for_lo.f90 eigen/orthoglo.F90 juDFT/usage_data.F90 math/ylm4.f90
global/enpara.f90 global/chkmt.f90 inpgen/inpgen.f90 inpgen/set_inp.f90 inpgen/inpgen_help.f90 io/rw_inp.f90 juDFT/juDFT.F90 global/find_enpara.f90
inpgen/closure.f90 inpgen/inpgen_arguments.F90
juDFT/info.F90 juDFT/stop.F90 juDFT/args.F90 juDFT/time.F90 juDFT/init.F90 juDFT/sysinfo.F90 io/w_inpXML.f90 kpoints/julia.f90 global/utility.F90
......
......@@ -6,7 +6,7 @@ eigen/eigen.F90
eigen/hlomat.F90
eigen/hs_int.F90
eigen/hsmt_fjgj.F90
eigen/hsmt_ab.f90
eigen/hsmt_ab.F90
eigen/hsmt_sph.F90
eigen/hsmt_nonsph.F90
eigen/hsmt_spinor.F90
......@@ -34,6 +34,7 @@ eigen/vacfun.f90
eigen/vec_for_lo.f90
eigen/eigen_redist_matrix.f90
)
if (FLEUR_USE_GPU)
set(fleur_F90 ${fleur_F90} eigen/hsmt_nonsph_GPU.F90)
endif()
#if (FLEUR_USE_GPU)
# set(fleur_F90 ${fleur_F90}
#eigen/hsmt_nonsph_GPU.F90)
#endif()
......@@ -6,8 +6,140 @@
MODULE m_hsmt_ab
use m_juDFT
implicit none
INTERFACE hsmt_ab
module procedure hsmt_ab_cpu
#ifdef _CUDA
module procedure hsmt_ab_gpu
#endif
END INTERFACE
CONTAINS
SUBROUTINE hsmt_ab(sym,atoms,noco,ispin,iintsp,n,na,cell,lapw,fj,gj,ab,ab_size,l_nonsph,abclo,alo1,blo1,clo1)
#ifdef _CUDA
SUBROUTINE hsmt_ab_gpu(sym,atoms,noco,ispin,iintsp,n,na,cell,lapw,fj,gj,ab,ab_size,l_nonsph,abclo,alo1,blo1,clo1)
!Calculate overlap matrix
USE m_constants, ONLY : fpi_const,tpi_const
USE m_types
USE m_ylm
USE m_apws
IMPLICIT NONE
TYPE(t_sym),INTENT(IN) :: sym
TYPE(t_cell),INTENT(IN) :: cell
TYPE(t_atoms),INTENT(IN) :: atoms
TYPE(t_lapw),INTENT(IN) :: lapw
TYPE(t_noco),INTENT(IN) :: noco
! ..
! .. Scalar Arguments ..
INTEGER, INTENT (IN) :: ispin,n,na,iintsp
LOGICAL,INTENT(IN) :: l_nonsph
INTEGER,INTENT(OUT) :: ab_size
! ..
! .. Array Arguments ..
REAL, INTENT(IN) :: fj(:,:,:),gj(:,:,:)
COMPLEX,DEVICE, INTENT (OUT) :: ab(:,:)
!Optional arguments if abc coef for LOs are needed
COMPLEX, INTENT(INOUT),OPTIONAL:: abclo(:,-atoms%llod:,:,:)
REAL,INTENT(IN),OPTIONAL:: alo1(:),blo1(:),clo1(:)
INTEGER:: np,k,l,ll1,m,lmax,nkvec,lo,lm,invsfct
REAL :: th,v(3),bmrot(3,3),vmult(3)
COMPLEX,ALLOCATABLE :: ylm(:,:)
COMPLEX,ALLOCATABLE :: c_ph(:,:)
REAL, ALLOCATABLE :: gkrot(:,:)
LOGICAL :: l_apw
COMPLEX:: term
REAL, ALLOCATABLE,DEVICE :: fj_dev(:,:,:), gj_dev(:,:,:)
COMPLEX,ALLOCATABLE,DEVICE :: c_ph_dev(:,:)
COMPLEX,ALLOCATABLE,DEVICE :: ylm_dev(:,:)
ALLOCATE(fj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
ALLOCATE(gj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
ALLOCATE(c_ph_dev(lapw%nv(1),MERGE(2,1,noco%l_ss)))
ALLOCATE(ylm_dev(lapw%nv(1),(atoms%lmaxd+1)**2))
fj_dev(:,:,:)= fj(:,:,:)
gj_dev(:,:,:)= gj(:,:,:)
ALLOCATE(ylm(lapw%nv(1),(atoms%lmaxd+1)**2))
ALLOCATE(c_ph(lapw%nv(1),MERGE(2,1,noco%l_ss)))
ALLOCATE(gkrot(3,lapw%nv(1)))
lmax=MERGE(atoms%lnonsph(n),atoms%lmax(n),l_nonsph)
ab_size=lmax*(lmax+2)+1
l_apw=ALL(gj==0.0)
ab=0.0
np = sym%invtab(atoms%ngopr(na))
!---> set up phase factors
CALL lapw%phase_factors(iintsp,atoms%taual(:,na),noco%qss,c_ph(:,iintsp))
c_ph_dev=c_ph
IF (np==1) THEN
gkrot(:, 1:lapw%nv(iintsp)) = lapw%gk(:, 1:lapw%nv(iintsp),iintsp)
ELSE
bmrot=MATMUL(1.*sym%mrot(:,:,np),cell%bmat)
DO k = 1,lapw%nv(iintsp)
!--> apply the rotation that brings this atom into the
!--> representative (this is the definition of ngopr(na)
!--> and transform to cartesian coordinates
v(:) = lapw%vk(:,k,iintsp)
gkrot(:,k) = MATMUL(TRANSPOSE(bmrot),v)
END DO
END IF
!--> generate spherical harmonics
DO k = 1,lapw%nv(1)
vmult(:) = gkrot(:,k)
CALL ylm4(lmax,vmult,ylm(k,:))
ENDDO
ylm_dev=ylm
!--> synthesize the complex conjugates of a and b
!$cuf kernel do <<<*,256>>>
DO k = 1,lapw%nv(1)
DO l = 0,lmax
ll1 = l* (l+1)
DO m = -l,l
ab(k,ll1+m+1) = CONJG(fj_dev(k,l+1,iintsp)*c_ph_dev(k,iintsp)*ylm_dev(k,ll1+m+1))
ab(k,ll1+m+1+ab_size) = CONJG(gj_dev(k,l+1,iintsp)*c_ph_dev(k,iintsp)*ylm_dev(k,ll1+m+1))
END DO
END DO
ENDDO !k-loop
IF (PRESENT(abclo)) THEN
DO k = 1,lapw%nv(1)
!determine also the abc coeffs for LOs
invsfct=MERGE(1,2,atoms%invsat(na).EQ.0)
term = fpi_const/SQRT(cell%omtil)* ((atoms%rmt(n)**2)/2)*c_ph(k,iintsp)
DO lo = 1,atoms%nlo(n)
l = atoms%llo(lo,n)
DO nkvec=1,invsfct*(2*l+1)
IF (lapw%kvec(nkvec,lo,na)==k) THEN !This k-vector is used in LO
ll1 = l*(l+1) + 1
DO m = -l,l
lm = ll1 + m
abclo(1,m,nkvec,lo) = term*ylm(k,lm)*alo1(lo)
abclo(2,m,nkvec,lo) = term*ylm(k,lm)*blo1(lo)
abclo(3,m,nkvec,lo) = term*ylm(k,lm)*clo1(lo)
END DO
END IF
ENDDO
ENDDO
ENDDO
ENDIF
IF (.NOT.l_apw) ab_size=ab_size*2
END SUBROUTINE hsmt_ab_gpu
#endif
SUBROUTINE hsmt_ab_cpu(sym,atoms,noco,ispin,iintsp,n,na,cell,lapw,fj,gj,ab,ab_size,l_nonsph,abclo,alo1,blo1,clo1)
!Calculate overlap matrix
USE m_constants, ONLY : fpi_const,tpi_const
USE m_types
......@@ -33,15 +165,15 @@ CONTAINS
REAL,INTENT(IN),OPTIONAL:: alo1(:),blo1(:),clo1(:)
INTEGER:: np,k,l,ll1,m,lmax,nkvec,lo,lm,invsfct
complex:: term
real :: th,v(3),bmrot(3,3),vmult(3)
COMPLEX:: term
REAL :: th,v(3),bmrot(3,3),vmult(3)
COMPLEX :: ylm((atoms%lmaxd+1)**2)
complex,allocatable:: c_ph(:,:)
real,allocatable :: gkrot(:,:)
COMPLEX,ALLOCATABLE:: c_ph(:,:)
REAL,ALLOCATABLE :: gkrot(:,:)
LOGICAL :: l_apw
ALLOCATE(c_ph(maxval(lapw%nv),MERGE(2,1,noco%l_ss)))
ALLOCATE(gkrot(3,MAXVAL(lapw%nv)))
ALLOCATE(c_ph(lapw%nv(1),MERGE(2,1,noco%l_ss)))
ALLOCATE(gkrot(3,lapw%nv(1)))
lmax=MERGE(atoms%lnonsph(n),atoms%lmax(n),l_nonsph)
......@@ -106,5 +238,5 @@ CONTAINS
!$OMP END PARALLEL DO
IF (.NOT.l_apw) ab_size=ab_size*2
END SUBROUTINE hsmt_ab
END SUBROUTINE hsmt_ab_cpu
END MODULE m_hsmt_ab
......@@ -72,6 +72,7 @@ CONTAINS
#ifdef _CUDA
COMPLEX,ALLOCATABLE,DEVICE :: c_dev(:,:), ab1_dev(:,:), ab_dev(:,:)
COMPLEX,ALLOCATABLE,DEVICE :: h_loc_dev(:,:)
!REAL, ALLOCATABLE,DEVICE :: fj_dev(:,:,:), gj_dev(:,:,:)
integer :: i, j, istat
call nvtxStartRange("hsmt_nonsph",1)
print*, "running CUDA version"
......@@ -83,6 +84,10 @@ CONTAINS
ALLOCATE(ab1_dev(size(ab1,1),size(ab1,2)))
ALLOCATE(ab_dev(size(ab,1),size(ab,2)))
h_loc_dev(1:,1:) = CONJG(td%h_loc(0:,0:,n,isp)) !WORKAROUND, var_dev=CONJG(var_dev) does not work (pgi18.4)
!ALLOCATE(fj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
!ALLOCATE(gj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
!fj_dev(1:,1:,1:)= fj(1:,0:,1:)
!gj_dev(1:,1:,1:)= gj(1:,0:,1:)
!note that basically all matrices in the GPU version are conjugates of their
!cpu counterparts
#endif
......@@ -106,15 +111,15 @@ CONTAINS
IF ((atoms%invsat(na)==0) .OR. (atoms%invsat(na)==1)) THEN
rchi=MERGE(REAL(chi),REAL(chi)*2,(atoms%invsat(na)==0))
!#ifdef _CUDA
!CALL hsmt_ab(sym,atoms,noco,isp,jintsp,n,na,cell,lapw,fj,gj,ab_dev,ab_size,.TRUE.)
#ifdef _CUDA
CALL hsmt_ab(sym,atoms,noco,isp,jintsp,n,na,cell,lapw,fj,gj,ab_dev,ab_size,.TRUE.)
! istat = cudaDeviceSynchronize()
!#else
#else
CALL hsmt_ab(sym,atoms,noco,isp,jintsp,n,na,cell,lapw,fj,gj,ab,ab_size,.TRUE.)
!#endif
#endif
!Calculate Hamiltonian
#ifdef _CUDA
ab_dev = CONJG(ab)
!ab_dev = CONJG(ab)
CALL zgemm("N","N",lapw%nv(jintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab_dev,SIZE(ab_dev,1),h_loc_dev,SIZE(h_loc_dev,1),CMPLX(0.,0.),ab1_dev,SIZE(ab1_dev,1))
#else
CALL zgemm("N","N",lapw%nv(jintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab,SIZE(ab,1),td%h_loc(0:,0:,n,isp),SIZE(td%h_loc,1),CMPLX(0.,0.),ab1,SIZE(ab1,1))
......
set(fleur_F77 ${fleur_F77}
)
set(fleur_F90 ${fleur_F90}
eigen_soc/abclocdn_soc.F90
eigen_soc/abcof_soc.F90
eigen_soc/alineso.F90
eigen_soc/anglso.f90
......
!--------------------------------------------------------------------------------
! Copyright (c) 2016 Peter Grünberg Institut, Forschungszentrum Jülich, Germany
! This file is part of FLEUR and available as free software under the conditions
! of the MIT license as expressed in the LICENSE file in more detail.
!--------------------------------------------------------------------------------
MODULE m_abclocdn_soc
USE m_juDFT
!*********************************************************************
! Calculates the (upper case) A, B and C coefficients for the local
! orbitals. The difference to abccoflo is, that a summation over the
! Gs ist performed. The A, B and C coeff. are set up for each eigen-
! state.
! Philipp Kurz 99/04
!*********************************************************************
!*************** ABBREVIATIONS ***************************************
! nkvec : stores the number of G-vectors that have been found and
! accepted during the construction of the local orbitals.
! kvec : k-vector used in hssphn to attach the local orbital 'lo'
! of atom 'na' to it.
!*********************************************************************
CONTAINS
SUBROUTINE abclocdn_soc(atoms,sym,noco,lapw,cell,ccchi,iintsp,phase,ylm,&
ntyp,na,na_l,k,nkvec,lo,ne,alo1,blo1,clo1,acof,bcof,ccof,zMat,l_force,fgp,force)
USE m_types
USE m_constants
IMPLICIT NONE
TYPE(t_noco), INTENT(IN) :: noco
TYPE(t_sym), INTENT(IN) :: sym
TYPE(t_atoms), INTENT(IN) :: atoms
TYPE(t_lapw), INTENT(IN) :: lapw
TYPE(t_cell), INTENT(IN) :: cell
TYPE(t_mat), INTENT(IN) :: zMat
TYPE(t_force), OPTIONAL, INTENT(INOUT) :: force
! .. Scalar Arguments ..
INTEGER, INTENT (IN) :: iintsp
INTEGER, INTENT (IN) :: k,na,na_l,ne,ntyp,nkvec,lo
COMPLEX, INTENT (IN) :: phase
LOGICAL, INTENT (IN) :: l_force
! .. Array Arguments ..
REAL, INTENT (IN) :: alo1(:),blo1(:),clo1(:)
COMPLEX, INTENT (IN) :: ylm( (atoms%lmaxd+1)**2 )
COMPLEX, INTENT (IN) :: ccchi(2)
COMPLEX, INTENT (INOUT) :: acof(:,0:,:)!(nobd,0:dimension%lmd,atoms%nat_l)
COMPLEX, INTENT (INOUT) :: bcof(:,0:,:)!(nobd,0:dimension%lmd,atoms%nat_l)
COMPLEX, INTENT (INOUT) :: ccof(-atoms%llod:,:,:,:)!(-atoms%llod:atoms%llod,nobd,atoms%nlod,atoms%nat_l)
REAL, OPTIONAL, INTENT (IN) :: fgp(3)
! .. Local Scalars ..
COMPLEX ctmp,term1
INTEGER i,j,l,ll1,lm,nbasf,m,na2,lmp
! ..
! ..
term1 = 2 * tpi_const/SQRT(cell%omtil) * ((atoms%rmt(ntyp)**2)/2) * phase
!
!---> the whole program is in hartree units, therefore 1/wronskian is
!---> (rmt**2)/2. the factor i**l, which usually appears in the a, b
!---> and c coefficients, is included in the t-matrices. thus, it does
!---> not show up in the formula above.
IF ((atoms%invsat(na)==0).OR.(atoms%invsat(na)==1)) THEN
na2=na
ELSE
na2 = sym%invsatnr(na)
ENDIF
nbasf=lapw%nv(iintsp)+lapw%index_lo(lo,na2)+nkvec
l = atoms%llo(lo,ntyp)
ll1 = l* (l+1)
DO i = 1,ne
DO m = -l,l
lm = ll1 + m
!+gu_con
IF ((atoms%invsat(na)==0).OR.(atoms%invsat(na)==1)) THEN
IF (zMat%l_real) THEN
ctmp = zMat%data_r(nbasf,i)*term1*CONJG(ylm(ll1+m+1))
ELSE
ctmp = zMat%data_c(nbasf,i)*term1*CONJG(ylm(ll1+m+1))
ENDIF
acof(i,lm,na_l) = acof(i,lm,na_l) + ctmp*alo1(lo)
bcof(i,lm,na_l) = bcof(i,lm,na_l) + ctmp*blo1(lo)
ccof(m,i,lo,na_l) = ccof(m,i,lo,na_l) + ctmp*clo1(lo)
ELSE
ctmp = zMat%data_c(nbasf,i)*CONJG(term1)*ylm(ll1+m+1)*(-1)**(l-m)
lmp = ll1 - m
acof(i,lmp,na_l) = acof(i,lmp,na_l) +ctmp*alo1(lo)
bcof(i,lmp,na_l) = bcof(i,lmp,na_l) +ctmp*blo1(lo)
ccof(-m,i,lo,na_l) = ccof(-m,i,lo,na_l) +ctmp*clo1(lo)
ENDIF
END DO
END DO
END SUBROUTINE abclocdn_soc
END MODULE m_abclocdn_soc
This diff is collapsed.
......@@ -51,8 +51,6 @@ CONTAINS
!+odim
! ..
! .. Locals ..
TYPE(t_atoms) :: atoms_local
TYPE(t_noco) :: noco_local
TYPE(t_mat) :: zMat_local
INTEGER ispin ,l,n ,na,ie,lm,ll1,nv1(DIMENSION%jspd),m,lmd
INTEGER, ALLOCATABLE :: g1(:,:),g2(:,:),g3(:,:)
......@@ -60,12 +58,7 @@ CONTAINS
!
! turn off the non-collinear part of abcof
!
noco_local=noco
noco_local%l_ss = .FALSE.
lmd = atoms%lmaxd*(atoms%lmaxd+2)
noco_local%qss(:) = 0.0
atoms_local=atoms
atoms_local%ngopr(:) = 1 ! use unrotated coeffs...
!
! some praparations to match array sizes
!
......@@ -89,8 +82,8 @@ CONTAINS
zMat_local%matsize2 = DIMENSION%neigd
ALLOCATE(zMat_local%data_c(zmat(1)%matsize1,DIMENSION%neigd))
zMat_local%data_c(:,:) = zso(:,1:DIMENSION%neigd,ispin)
CALL abcof_soc(input,atoms_local,sym,cell,lapw,nsz(ispin),&
usdus, noco_local,ispin,oneD,nat_start,nat_stop,nat_l,&
CALL abcof_soc(input,atoms,sym,cell,lapw,nsz(ispin),&
usdus, noco,ispin,oneD,nat_start,nat_stop,nat_l,&
acof,bcof,chelp(-atoms%llod:,:,:,:,ispin),zMat_local)
DEALLOCATE(zMat_local%data_c)
!
......@@ -109,15 +102,14 @@ CONTAINS
ENDDO
ENDDO
ENDDO
chelp(:,:,:,:,ispin) = (chelp(:,:,:,:,ispin))
ELSE
zMat_local%l_real = zmat(1)%l_real
zMat_local%matsize1 = zmat(1)%matsize1
zMat_local%matsize2 = DIMENSION%neigd
ALLOCATE(zMat_local%data_c(zmat(1)%matsize1,DIMENSION%neigd))
zMat_local%data_c(:,:) = zmat(ispin)%data_c(:,:)
CALL abcof_soc(input,atoms_local,sym,cell,lapw,nsz(ispin),&
usdus, noco_local,ispin,oneD,nat_start,nat_stop,nat_l,&
CALL abcof_soc(input,atoms,sym,cell,lapw,nsz(ispin),&
usdus,noco,ispin,oneD,nat_start,nat_stop,nat_l,&
acof,bcof,chelp(-atoms%llod:,:,:,:,ispin),zMat_local)
DEALLOCATE(zMat_local%data_c)
!
......
......@@ -21,7 +21,7 @@ math/rfft.F
math/sphbes.f
math/sphpts.f
math/util.F
math/ylm4.f
math/ylm4.f90
math/difcub.f
)
set(fleur_F90 ${fleur_F90}
......
......@@ -6,9 +6,7 @@
INTEGER, SAVE :: lmaxd = -1 ! initial value
public ylm4,ylmnorm_init
CONTAINS
SUBROUTINE ylm4(
> lmax,v,
< ylm)
SUBROUTINE ylm4(lmax,v,ylm)
!************************************************************
! generate the spherical harmonics for the vector v
! using a stable upward recursion in l. (see notes
......@@ -121,8 +119,8 @@
lmaxd = lmax
fpi = 4.0*pimach()
!$ if (omp_in_parallel()) call juDFT_error(
!$ + "ylmnorm should not called in parallel",calledby="ylm4.f")
!$ if (omp_in_parallel()) call juDFT_error( &
!$ "ylmnorm should not called in parallel",calledby="ylm4.f")
DO l=0,lmax
lm0 = l*(l+1) + 1
cd=1.0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment