Commit 9722f9b6 authored by Daniel Wortmann's avatar Daniel Wortmann

Merge branch 'develop' of ssh://fleur-git/fleur into develop

parents c6a8dc8f ec38e10e
......@@ -12,10 +12,10 @@ project(FLEUR LANGUAGES C Fortran)
set(FLEUR_LIBRARIES ${FLEUR_LIBRARIES} $ENV{FLEUR_LIBRARIES})
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} ${FLEUR_Fortran_FLAGS} $ENV{CMAKE_Fortran_FLAGS}")
if (DEFINED ENV{FLEUR_NO_SERIAL})
set(FLEUR_USE_SERIAL false)
if (DEFINED ENV{FLEUR_USE_SERIAL})
set(FLEUR_USE_SERIAL ENV{FLEUR_USE_SERIAL})
else()
set(FLEUR_USE_SERIAL true)
set(FLEUR_USE_SERIAL TRUE)
endif()
......
set(FLEUR_DEFINITIONS ${FLEUR_DEFINITIONS} "CPP_DOUBLE")
set(FLEUR_MPI_DEFINITIONS ${FLEUR_MPI_DEFINITIONS} "CPP_DOUBLE")
include("cmake/tests/test_doxygen.cmake")
include("cmake/compilerflags.cmake")
include("cmake/tests/test_XML.cmake")
include("cmake/tests/test_LAPACK.cmake")
include("cmake/tests/test_MPI.cmake")
include("cmake/tests/test_HDF5.cmake")
include("cmake/tests/test_MAGMA.cmake")
include("cmake/tests/test_GPU.cmake")
if (FLEUR_USE_MPI)
include("cmake/tests/test_SCALAPACK.cmake")
include("cmake/tests/test_ELPA.cmake")
......
message("************Summary***************")
message("Compiler: ${CMAKE_Fortran_COMPILER}")
message("Compiler ID:${CMAKE_Fortran_COMPILER_ID}")
message("Flags: ${CMAKE_Fortran_FLAGS}")
message("Libraries:${FLEUR_LIBRARIES}")
message("\nThese Libraries are required:")
message("XML Library found:${FLEUR_USE_XML}")
message("LAPACK Library found:${FLEUR_USE_LAPACK}")
message("\nThese Libraries are optional:")
message("HDF5 Library found:${FLEUR_USE_HDF5}")
message("MPI Library found:${FLEUR_USE_MPI}")
if(NOT WIN32)
string(ASCII 27 Esc)
set(CReset "${Esc}[m")
set(Bold "${Esc}[1m")
set(Red "${Esc}[31m")
set(Green "${Esc}[32m")
set(BRed "${Esc}[1;31m")
set(BGreen "${Esc}[1;32m")
endif()
message("${BRed}\n\n************************Summary***************************${CReset}")
message("${Green}Compiler : ${CReset}${CMAKE_Fortran_COMPILER}")
message("${Green}Compiler ID: ${CReset}${CMAKE_Fortran_COMPILER_ID}")
message("${Green}Flags : ${CReset}${CMAKE_Fortran_FLAGS}")
message("${Green}Libraries : ${CReset}${FLEUR_LIBRARIES}")
message("\n${Red}These Libraries are required:${CReset}")
message("${Green} XML Library found :${CReset} ${FLEUR_USE_XML}")
message("${Green} LAPACK Library found:${CReset} ${FLEUR_USE_LAPACK}")
message("${Red}These Libraries are optional:${CReset}")
message("${Green} HDF5 Library found :${CReset} ${FLEUR_USE_HDF5}")
message("${Green} MAGMA Library found:${CReset} ${FLEUR_USE_MAGMA}")
message("${Green} MPI Library found :${CReset} ${FLEUR_USE_MPI}")
if (FLEUR_USE_MPI)
message("SCALAPACK Library found:${FLEUR_USE_SCALAPACK}")
message("ELPA Library found:${FLEUR_USE_ELPA}")
message("${Green} SCALAPACK Library found:${CReset}${FLEUR_USE_SCALAPACK}")
message("${Green} ELPA Library found :${CReset}${FLEUR_USE_ELPA}")
else()
message("${Green} SCALAPACK Library found:${CReset} ---")
message("${Green} ELPA Library found :${CReset} ---")
endif()
message("${Green} Compile GPU version: ${CReset} ${FLEUR_USE_GPU}")
message("\n${Green}Compile serial version : ${CReset}${FLEUR_USE_SERIAL}")
message("${Green}Compile parallel version: ${CReset}${FLEUR_USE_MPI}")
message("\n${Green}Git describe : ${CReset}${git_describe}")
message("${Green}Git hash : ${CReset}${git_hash}")
message("${Green}Doxygen found : ${CReset}${FLEUR_USE_DOXYGEN}")
message("${BRed}************************-------***************************${CReset}")
#some checks
if (NOT FLEUR_USE_XML)
message(FATAL_ERROR "${BRed}No libxml2 Library found. This is required.${CReset}")
endif()
if (NOT FLEUR_USE_LAPACK)
message(FATAL_ERROR "${BRed}No LAPACK Library found. This is required.${CReset}")
endif()
message("Git describe:${git_describe}")
message("Git hash:${git_hash}")
message("************-------***************")
if (NOT (FLEUR_USE_SERIAL OR FLEUR_USE_MPI))
message(FATAL_ERROR "You should either compile a parallel or serial version (or both)")
endif()
......@@ -13,6 +13,7 @@ elseif(${CMAKE_Fortran_COMPILER_ID} MATCHES "PGI")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -mp -Mr8 -Mr8intrinsics -Mcuda:kepler+ -ta:tesla:cuda7.5 -DUSE_STREAMS -DNUM_STREAMS=${N_STREAMS} -Minfo=accel -acc")
set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -fast -O3")
set(CMAKE_Fortran_FLAGS_DEBUG "${CMAKE_Fortran_FLAGS_DEBUG} -C -traceback -O0 -g -Mchkstk -Mchkptr")
set(FLEUR_LIBRARIES ${FLEUR_LIBRARIES} "-lstdc++;-L$ENV{MKLROOT}/lib/intel64;-lmkl_scalapack_lp64;-lmkl_intel_lp64;-lmkl_pgi_thread;-lmkl_core;-lmkl_blacs_intelmpi_lp64")
elseif(${CMAKE_Fortran_COMPILER_ID} MATCHES "XL")
message("IBM/BG Fortran detected")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qsmp=omp -qnosave -qarch=qp -qtune=qp -qrealsize=8 -qfixed -qsuppress=1520-022 -qessl")
......
ml purge
ml PGI MVAPICH2 CMake libxml2/.2.9.4
ml PGI MVAPICH2 CMake libxml2/.2.9.4 imkl
#Check if we can compile with GPU
if ($ENV{FLEUR_USE_GPU})
#No check is done
set(FLEUR_USE_GPU TRUE)
set(FLEUR_MPI_DEFINITIONS ${FLEUR_MPI_DEFINITIONS} "CPP_GPU")
set(FLEUR_DEFINITIONS ${FLEUR_DEFINITIONS} "CPP_GPU")
else()
set(FLEUR_USE_GPU FALSE)
endif()
\ No newline at end of file
......@@ -54,6 +54,20 @@ if ( FLEUR_USE_HDF5)
endif()
message("HDF5 Library found:${FLEUR_USE_HDF5}")
if (DEFINED ENV{FLEUR_USE_HDF5})
if (ENV{FLEUR_USE_HDF5})
if (NOT FLEUR_USE_HDF5)
message(FATAL_ERROR "You asked for HDF5 but cmake couldn't find it. Please set HDF5_ROOT and or give additional compiler/linker flags")
endif()
else()
if (FLEUR_USE_HDF5)
message("HDF5 library found, but you explicitely asked not to use it")
set(FLEUR_USE_HDF5 FALSE)
endif()
endif()
endif()
if (FLEUR_USE_HDF5)
message("Parallel HDF5 Library found:${FLEUR_USE_HDF5}")
set(FLEUR_DEFINITIONS ${FLEUR_DEFINITIONS} "CPP_HDF")
......
#First check if we can compile with ELPA
try_compile(FLEUR_USE_MAGMA ${CMAKE_BINARY_DIR} ${CMAKE_SOURCE_DIR}/cmake/tests/test_MAGMA.f90
LINK_LIBRARIES ${FLEUR_LIBRARIES}
)
message("MAGMA Library found:${FLEUR_USE_MAGMA}")
if (FLEUR_USE_MAGMA)
set(FLEUR_DEFINITIONS ${FLEUR_DEFINITIONS} "CPP_MAGMA")
endif()
program test
use magma
IMPLICIT NONE
integer :: mout,err,iwork(5)
REAL :: eigtemp(5),rwork(5)
COMPLEX :: a(5,5),b(5,5),work(5)
call magmaf_zhegvdx_2stage_m(1,1,MagmaVec,MagmaRangeI,MagmaLower,5,a,5,b,5,&
0.0,0.0,1,2,mout,eigTemp,work,5,rwork,5,iwork,5,err)
end
......@@ -5,6 +5,19 @@ LINK_LIBRARIES ${FLEUR_LIBRARIES}
message("MPI Library found:${FLEUR_USE_MPI}")
if (DEFINED ENV{FLEUR_USE_MPI})
if (ENV{FLEUR_USE_MPI})
if (NOT FLEUR_USE_MPI)
message(FATAL_ERROR "You asked for MPI but cmake couldn't find it. Please check your Fortran compiler settings")
endif()
else()
if (FLEUR_USE_MPI)
message("MPI library found, but you explicitely asked not to use it")
set(FLEUR_USE_MPI FALSE)
endif()
endif()
endif()
if (FLEUR_USE_MPI)
set(FLEUR_MPI_DEFINITIONS ${FLEUR_MPI_DEFINITIONS} "CPP_MPI")
endif()
\ No newline at end of file
find_package(Doxygen)
set(FLEUR_USE_DOXYGEN ${DOXYGEN_FOUND})
......@@ -194,7 +194,12 @@ CONTAINS
#endif
#ifdef CPP_MAGMA
CASE (diag_magma)
CALL magma_diag(lapw%nmat,eig,ne_found,hamOvlp%a_r,hamOvlp%b_r,zMat%z_r,hamOvlp%a_c,hamOvlp%b_c,zMat%z_c)
if (l_real) THEN
call juDFT_error("REAL diagonalization not implemented in magma.F90")
else
print *,"Start magma_diag"
CALL magma_diag(lapw%nmat,eig,ne_found,a_c=hamOvlp%a_c,b_c=hamOvlp%b_c,z_c=zMat%z_c)
endif
#endif
CASE (diag_lapack2)
if (noco%l_ss) call juDFT_error("zsymsecloc not tested with noco%l_ss")
......
......@@ -13,6 +13,7 @@ MODULE m_magma
!**********************************************************
CONTAINS
SUBROUTINE magma_diag(nsize,eig,ne,a_r,b_r,z_r,a_c,b_c,z_c)
use m_packed_to_full
#ifdef CPP_MAGMA
use magma
#endif
......@@ -62,17 +63,19 @@ CONTAINS
if (l_real) THEN
call packed_to_full(nsize,a_r,largea_r)
call packed_to_full(nsize,b_r,largeb_r)
deallocate(a_r,b_r)
!deallocate(a_r,b_r)
ELSE
call packed_to_full(nsize,a_c,largea_c)
call packed_to_full(nsize,b_c,largeb_c)
deallocate(a_c,b_c)
!deallocate(a_c,b_c)
Endif
if (l_real) call juDFT_error("REAL diagonalization not implemented in magma.F90")
!Query the workspace size
allocate(work(1),rwork(1),iwork(1))
print *,"Magma workspace query"
call flush()
call magmaf_zhegvdx_2stage_m(NGPU_CONST,1,MagmaVec,MagmaRangeI,MagmaLower,nsize,largea_c,nsize,largeb_c,nsize,&
0.0,0.0,1,ne,mout,eigTemp,work,-1,rwork,-1,iwork,-1,err)
lwork=work(1)
......@@ -83,6 +86,8 @@ CONTAINS
allocate(work(lwork),rwork(lrwork),iwork(liwork))
if (err/=0) call juDFT_error("Failed to allocate workspaces",calledby="magma.F90")
!Now the diagonalization
print *,"Magma diagonalization"
print *,nsize,shape(largea_c),shape(eigTemp),ne
call magmaf_zhegvdx_2stage_m(NGPU_CONST,1,MagmaVec,MagmaRangeI,MagmaLower,nsize,largea_c,nsize,largeb_c,nsize,&
0.0,0.0,1,ne,mout,eigTemp,work,lwork,rwork,lrwork,iwork,liwork,err)
print*,"MAGMA info:",err
......@@ -91,8 +96,9 @@ CONTAINS
DO i = 1, ne
eig(i) = eigTemp(i)
z_c(:nsize,i)=largea_c(:nsize,i)
END DO
call judft_error("Eigenvectors are not calculated in MAGMA")
!call judft_error("Eigenvectors are not calculated in MAGMA")
#endif
END SUBROUTINE magma_diag
END MODULE m_magma
......
# add a target to generate API documentation with Doxygen
find_package(Doxygen)
if(DOXYGEN_FOUND)
if(FLEUR_USE_DOXYGEN)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/docs/Doxyfile.in ${CMAKE_CURRENT_BINARY_DIR}/docs/Doxyfile @ONLY)
add_custom_target(doc
${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/docs/Doxyfile
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/docs
COMMENT "Generating API documentation with Doxygen" VERBATIM
)
endif(DOXYGEN_FOUND)
endif()
......@@ -475,18 +475,18 @@
OPEN (18,FILE='bands'//spin12(jspin))
ntb = minval(nevk(:))
kx(1) = 0.0
vkr(:,1)=matmul(cell%bmat,vk(:,1))
vkr(:,1)=matmul(vk(:,1),cell%bmat)
DO k = 2, kpts%nkpt
vkr(:,k)=matmul(cell%bmat,vk(:,k))
dk = ( vkr(1,k) - vkr(1,k-1) )**2 + ( vkr(2,k) - vkr(2,k-1) )**2 +&
( vkr(3,k) - vkr(3,k-1) )**2
kx(k) = kx(k-1) + sqrt(dk)
vkr(:,k)=matmul(vk(:,k),cell%bmat)
dk = (vkr(1,k)-vkr(1,k-1))**2 + (vkr(2,k)-vkr(2,k-1) )**2 + &
(vkr(3,k)-vkr(3,k-1))**2
kx(k) = kx(k-1) + sqrt(dk)
ENDDO
DO i = 1, ntb
DO k = 1, kpts%nkpt
write(18,'(2f15.9)') kx(k),ev(i,k)
ENDDO
DO k = 1, kpts%nkpt
write(18,'(2f15.9)') kx(k),ev(i,k)
ENDDO
ENDDO
CLOSE (18)
ENDIF
......
......@@ -30,3 +30,6 @@ eigen/tlo.f90
eigen/vacfun.f90
eigen/vec_for_lo.f90
)
if (FLEUR_USE_GPU)
set(fleur_F90 ${fleur_F90} eigen/hsmt_nonsph_GPU.F90)
endif()
......@@ -66,6 +66,9 @@ CONTAINS
#include"cpp_double.h"
USE m_hsmt_socinit
USE m_hsmt_nonsph
#ifdef CPP_GPU
USE m_hsmt_nonsph_gpu
#endif
USE m_hsmt_sph
USE m_hsmt_extra
USE m_types
......@@ -233,10 +236,18 @@ CONTAINS
kveclo,l_real,hamOvlp%a_r,hamOvlp%b_r,hamOvlp%a_c,hamOvlp%b_c) !out/in
CALL timestop("hsmt extra")
CALL timestart("hsmt non-spherical")
#ifndef CPP_GPU
CALL hsmt_nonsph(DIMENSION,atoms,sym,SUB_COMM,n_size,n_rank,input,isp,nintsp,&
hlpmsize,noco,l_socfirst,lapw,cell,tlmplm,fj,gj,gk,vk,oneD,l_real,hamOvlp%a_r,hamOvlp%a_c)
CALL timestop("hsmt non-spherical")
#else
CALL timestart("hsmt non-spherical-GPU")
CALL hsmt_nonsph_gpu(DIMENSION,atoms,sym,SUB_COMM,n_size,n_rank,input,isp,nintsp,&
hlpmsize,noco,l_socfirst,lapw,cell,tlmplm,fj,gj,gk,vk,oneD,l_real,hamOvlp%a_r,hamOvlp%a_c)
CALL timestop("hsmt non-spherical-GPU")
#endif
ENDIF
ENDDO
#if 1==2
......
MODULE m_hsmt_nonsph_GPU
#define CPP_BLOCKSIZE 64
! USE m_juDFT
!$ USE omp_lib
!TODO:
! Check what can be done in l_noco=.true. case in terms of use of zgemm or aa_block
! Check what happens in case of CPP_INVERSION -> real matrix a
IMPLICIT NONE
CONTAINS
SUBROUTINE hsmt_nonsph_GPU(DIMENSION,atoms,sym,SUB_COMM, n_size,n_rank,input,isp,nintsp,&
hlpmsize,noco,l_socfirst, lapw, cell,tlmplm, fj,gj,gk,vk,oneD,l_real,aa_r,aa_c)
#include"cpp_double.h"
USE m_constants, ONLY : tpi_const
USE m_ylm
USE m_hsmt_spinor
USE m_hsmt_hlptomat
USE m_types
#if defined(_OPENACC)
! USE nvtx
USE cublas
#endif
IMPLICIT NONE
TYPE(t_dimension),INTENT(IN):: DIMENSION
TYPE(t_oneD),INTENT(IN) :: oneD
TYPE(t_input),INTENT(IN) :: input
TYPE(t_noco),INTENT(IN) :: noco
TYPE(t_sym),INTENT(IN) :: sym
TYPE(t_cell),INTENT(IN) :: cell
TYPE(t_atoms),INTENT(IN) :: atoms
TYPE(t_lapw),INTENT(INOUT) :: lapw !lapw%nv_tot is updated
! ..
! .. Scalar Arguments ..
INTEGER, INTENT (IN) :: nintsp,isp
INTEGER, INTENT (IN) :: SUB_COMM,n_size,n_rank
INTEGER, INTENT (IN) :: hlpmsize
LOGICAL, INTENT (IN) :: l_socfirst
! ..
! .. Array Arguments ..
TYPE(t_tlmplm),INTENT(IN)::tlmplm
REAL, INTENT(IN) :: fj(:,0:,:,:),gj(:,0:,:,:)
REAL,INTENT(IN) :: gk(:,:,:),vk(:,:,:)
!-odim
!+odim
LOGICAL, INTENT(IN) :: l_real
REAL, ALLOCATABLE, INTENT (INOUT) :: aa_r(:)!(matsize)
COMPLEX,ALLOCATABLE, INTENT (INOUT) :: aa_c(:)
COMPLEX,PARAMETER :: one=CMPLX(1.0,0.0),zero=CMPLX(0.0,0.0)
! ..
! .. Local Scalars ..
INTEGER :: i,iii,ii,ij,im,in,k,ki,kj,l,ll1,lm,lmp,lp,jd,m
INTEGER :: mp,n,na,nn,np,kjmax,iintsp,jintsp
INTEGER :: nc ,kii,spin2,ab_dim,lnonsphd,bsize,bsize2,kb
REAL :: th,invsfct
COMPLEX :: term,chi11,chi21,chi22,chihlp
! ..
! .. Local Arrays ..
COMPLEX,ALLOCATABLE :: aa_block(:,:)
COMPLEX,ALLOCATABLE :: dtd(:,:),dtu(:,:),utd(:,:),utu(:,:)
REAL :: bmrot(3,3),gkrot(DIMENSION%nvd,3),vmult(3),v(3)
COMPLEX:: ylm( (atoms%lmaxd+1)**2 ),chi(2,2)
! ..
#if defined(_OPENACC)
COMPLEX,PINNED,ALLOCATABLE :: a(:,:,:),b(:,:,:)
#else
COMPLEX, ALLOCATABLE :: a(:,:,:),b(:,:,:)
#endif
COMPLEX, ALLOCATABLE :: ax(:,:),bx(:,:)
COMPLEX, ALLOCATABLE :: c_ph(:,:)
COMPLEX,ALLOCATABLE :: aahlp(:),aa_tmphlp(:)
INTEGER :: cublas_stream,istat
call nvtxStartRange("hsmt_nonsph",1)
lnonsphd=MAXVAL(atoms%lnonsph)*(MAXVAL(atoms%lnonsph)+2)
ALLOCATE(dtd(0:lnonsphd,0:lnonsphd),utd(0:lnonsphd,0:lnonsphd),dtu(0:lnonsphd,0:lnonsphd),utu(0:lnonsphd,0:lnonsphd))
!Decide how to distribute the work
IF ( noco%l_noco .AND. (.NOT. noco%l_ss) ) ALLOCATE ( aahlp(hlpmsize),aa_tmphlp(hlpmsize) )
ALLOCATE(aa_block(CPP_BLOCKSIZE,MAXVAL(lapw%nv)))
ab_dim=1
IF (noco%l_ss) ab_dim=2
ALLOCATE(a(DIMENSION%nvd,0:DIMENSION%lmd,ab_dim),b(DIMENSION%nvd,0:DIMENSION%lmd,ab_dim))
ALLOCATE(ax(DIMENSION%nvd,0:DIMENSION%lmd),bx(DIMENSION%nvd,0:DIMENSION%lmd))
ALLOCATE(c_ph(DIMENSION%nvd,ab_dim))
!$acc data copy(aa_c,aa_r) copyin(tlmplm, tlmplm%tdd, tlmplm%tdu, tlmplm%tud,tlmplm%tuu, tlmplm%ind ) create(utu,utd,dtu,dtd,ax,bx,a,b,aa_block,aahlp)
! call nvtxStartRange("hsmt_nonsph_outer_data",2)
ntyploop: DO n=1,atoms%ntype
IF (noco%l_noco) THEN
IF (.NOT.noco%l_ss) aahlp=CMPLX(0.0,0.0)
IF (.NOT.noco%l_ss) aa_tmphlp=CMPLX(0.0,0.0)
CALL hsmt_spinor(isp,n, noco,input, chi, chi11, chi21, chi22)
ENDIF
DO nn = 1,atoms%neq(n)
a=0.0
b=0.0
na = SUM(atoms%neq(:n-1))+nn
IF (atoms%lnonsph(n)<0) CYCLE ntyploop
IF ((atoms%invsat(na)==0) .OR. (atoms%invsat(na)==1)) THEN
IF (atoms%invsat(na)==0) invsfct = 1
IF (atoms%invsat(na)==1) invsfct = 2
np = sym%invtab(atoms%ngopr(na))
IF (oneD%odi%d1) np = oneD%ods%ngopr(na)
!Using double buffering create_ab could be overlapped with following GPU work
! call nvtxStartRange("create_ab",6)
!---> loop over interstitial spins
DO iintsp = 1,nintsp
IF (noco%l_constr.OR.l_socfirst) THEN
spin2=isp
ELSE
spin2=iintsp
ENDIF
!---> set up phase factors
DO k = 1,lapw%nv(iintsp)
th= DOT_PRODUCT((/lapw%k1(k,iintsp),lapw%k2(k,iintsp),lapw%k3(k,iintsp)/)+(iintsp-1.5)*noco%qss,atoms%taual(:,na))
c_ph(k,iintsp) = CMPLX(COS(tpi_const*th),-SIN(tpi_const*th))
END DO
IF (np==1) THEN
gkrot( 1:lapw%nv(iintsp),:) = gk( 1:lapw%nv(iintsp),:,iintsp)
ELSE
IF (oneD%odi%d1) THEN
bmrot=MATMUL(oneD%ods%mrot(:,:,np),cell%bmat)
ELSE
bmrot=MATMUL(1.*sym%mrot(:,:,np),cell%bmat)
END IF
DO k = 1,lapw%nv(iintsp)
!--> apply the rotation that brings this atom into the
!--> representative (this is the definition of ngopr(na)
!--> and transform to cartesian coordinates
v(:) = vk(k,:,iintsp)
gkrot(k,:) = MATMUL(TRANSPOSE(bmrot),v)
END DO
END IF
DO k = 1,lapw%nv(iintsp)
!--> generate spherical harmonics
vmult(:) = gkrot(k,:)
CALL ylm4(atoms%lnonsph(n),vmult,ylm)
!--> synthesize the complex conjugates of a and b
DO l = 0,atoms%lnonsph(n)
ll1 = l* (l+1)
DO m = -l,l
term = c_ph(k,iintsp)*ylm(ll1+m+1)
a(k,ll1+m,iintsp) = fj(k,l,n,spin2)*term
b(k,ll1+m,iintsp) = gj(k,l,n,spin2)*term
END DO
END DO
ENDDO !k-loop
!---> end loop over interstitial spin
ENDDO
!$acc update device(a,b)
! call nvtxEndRange
!---> loops over the interstitial spin
DO iintsp = 1,nintsp
! call nvtxStartRange("hsmt_nonsph_DO_jintsp",3)
DO jintsp = 1,iintsp
jd = 1 ; IF (noco%l_noco) jd = isp
!---> loop over l',m'
!$acc kernels
utu=0.0;utd=0.0;dtu=0.0;dtd=0.0
!!$OMP PARALLEL DO DEFAULT(NONE) PRIVATE(lp,mp,lmp,l,m,lm,in,utu,dtu,utd,dtd,im,k) &
!!$OMP SHARED(tlmplm,invsfct,lnonsph,nv,jintsp,jd,n)
DO lmp=0,atoms%lnonsph(n)*(atoms%lnonsph(n)+2)
lp=FLOOR(SQRT(1.0*lmp))
mp=lmp-lp*(lp+1)
IF (lp>atoms%lnonsph(n).OR.ABS(mp)>lp) STOP "BUG"
!---> loop over l,m
DO l = 0,atoms%lnonsph(n)
DO m = -l,l
lm = l* (l+1) + m
in = tlmplm%ind(lmp,lm,n,jd)
IF (in/=-9999) THEN
IF (in>=0) THEN
utu(lm,lmp) =CONJG(tlmplm%tuu(in,n,jd))*invsfct
dtu(lm,lmp) =CONJG(tlmplm%tdu(in,n,jd))*invsfct
utd(lm,lmp) =CONJG(tlmplm%tud(in,n,jd))*invsfct
dtd(lm,lmp) =CONJG(tlmplm%tdd(in,n,jd))*invsfct
ELSE
im = -in
utu(lm,lmp) =tlmplm%tuu(im,n,jd)*invsfct
dtu(lm,lmp) =tlmplm%tud(im,n,jd)*invsfct
utd(lm,lmp) =tlmplm%tdu(im,n,jd)*invsfct
dtd(lm,lmp) =tlmplm%tdd(im,n,jd)*invsfct
END IF
!---> update ax, bx
END IF
END DO
END DO
ENDDO
!$acc end kernels
!!$OMP END PARALLEL DO
lmp=atoms%lnonsph(n)*(atoms%lnonsph(n)+2)
!ax(:nv(jintsp),0:lmp)=(matmul(a(:nv(jintsp),0:lmp,jintsp),utu(0:lmp,0:lmp))+matmul(b(:nv(jintsp),0:lmp,jintsp),utd(0:lmp,0:lmp)))
!bx(:nv(jintsp),0:lmp)=(matmul(a(:nv(jintsp),0:lmp,jintsp),dtu(0:lmp,0:lmp))+matmul(b(:nv(jintsp),0:lmp,jintsp),dtd(0:lmp,0:lmp)))
! call nvtxStartRange("hsmt_nonsph_zgemm_1",4)
!$acc host_data use_device(a,b,utu,utd,dtu,dtd,ax,bx )
CALL zgemm("N","N",lapw%nv(jintsp),lmp+1,lmp+1,one,a(1,0,jintsp),SIZE(a,1),utu(0,0),SIZE(utu,1),zero,ax,SIZE(ax,1))
CALL zgemm("N","N",lapw%nv(jintsp),lmp+1,lmp+1,one,b(1,0,jintsp),SIZE(a,1),utd(0,0),SIZE(utu,1),one,ax,SIZE(ax,1))
CALL zgemm("N","N",lapw%nv(jintsp),lmp+1,lmp+1,one,a(1,0,jintsp),SIZE(a,1),dtu(0,0),SIZE(utu,1),zero,bx,SIZE(ax,1))
CALL zgemm("N","N",lapw%nv(jintsp),lmp+1,lmp+1,one,b(1,0,jintsp),SIZE(a,1),dtd(0,0),SIZE(utu,1),one,bx,SIZE(ax,1))
!$acc end host_data
! call nvtxEndRange
!
!---> update hamiltonian and overlap matrices
nc = 0
IF ( noco%l_noco .AND. (n_size>1) ) THEN
lapw%nv_tot = lapw%nv(1) + lapw%nv(2)
ELSE
lapw%nv_tot = lapw%nv(iintsp)
ENDIF
kii=n_rank
! call nvtxStartRange("hsmt_nonsph_while_kii",5)
DO WHILE(kii<lapw%nv_tot)
!DO kii = n_rank, nv_tot-1, n_size
ki = MOD(kii,lapw%nv(iintsp)) + 1
bsize=MIN(SIZE(aa_block,1),(lapw%nv(iintsp)-ki)/n_size+1) !Either use maximal blocksize or number of rows left to calculate
IF (bsize<1) EXIT !nothing more to do here
bsize2=bsize*n_size
bsize2=min(bsize2,lapw%nv(iintsp)-ki+1)
!aa_block(:bsize,:ki+bsize2-1)=matmul(a(ki:ki+bsize2-1:n_size,0:lmp,iintsp),conjg(transpose(ax(:ki+bsize2-1,0:lmp))))+ &
! matmul(b(ki:ki+bsize2-1:n_size,0:lmp,iintsp),conjg(transpose(bx(:ki+bsize2-1,0:lmp))))
IF (n_size==1) THEN !Make this a special case to avoid copy-in of a array
!$acc host_data use_device( a,b,ax,bx,aa_block )
call zgemm("N","C",bsize,ki+bsize2-1,lmp+1,one,a(ki,0,iintsp),SIZE(a,1),ax(1,0),SIZE(ax,1),zero,aa_block,SIZE(aa_block,1))
call zgemm("N","C",bsize,ki+bsize2-1,lmp+1,one,b(ki,0,iintsp),SIZE(a,1),bx(1,0),SIZE(ax,1),one ,aa_block,SIZE(aa_block,1))
!$acc end host_data
ELSE
stop "Not implemented"
!$acc host_data use_device( a,b,ax,bx,aa_block )
!CALL zgemm("N","C",bsize,ki+bsize2-1,lmp+1,one,a(ki:ki+bsize2-1:n_size,0:lmp,iintsp), &
! SIZE(a(ki:ki+bsize2-1:n_size,0:lmp,iintsp),1),ax(1,0),SIZE(ax,1),zero,aa_block,SIZE(aa_block,1))
!CALL zgemm("N","C",bsize,ki+bsize2-1,lmp+1,one,b(ki:ki+bsize2-1:n_size,0:lmp,iintsp), &
! SIZE(a(ki:ki+bsize2-1:n_size,0:lmp,iintsp),1),bx(1,0),SIZE(ax,1),one,aa_block,SIZE(aa_block,1))
!$acc end host_data
ENDIF
!$acc kernels
IF ( noco%l_noco .AND. (.NOT. noco%l_ss) ) THEN
!$acc loop independent