Commit 902547d4 authored by S.Rost

abcof output without rotation is working merge

parents bbcc63ec 7f6f6328
......@@ -60,7 +60,7 @@ kpoints/tetcon.f kpoints/kvecon.f init/boxdim.f global/radsra.f math/intgr.F glo
set(inpgen_F90 ${inpgen_F90} global/constants.f90 io/xsf_io.f90
eigen/orthoglo.F90 juDFT/usage_data.F90 math/ylm4.F90
global/enpara.f90 global/chkmt.f90 inpgen/inpgen.f90 inpgen/set_inp.f90 inpgen/inpgen_help.f90 io/rw_inp.f90 juDFT/juDFT.F90 global/find_enpara.f90
global/chkmt.f90 inpgen/inpgen.f90 inpgen/set_inp.f90 inpgen/inpgen_help.f90 io/rw_inp.f90 juDFT/juDFT.F90 global/find_enpara.f90
inpgen/closure.f90 inpgen/inpgen_arguments.F90
juDFT/info.F90 juDFT/stop.F90 juDFT/args.F90 juDFT/time.F90 juDFT/init.F90 juDFT/sysinfo.F90 io/w_inpXML.f90 kpoints/julia.f90 global/utility.F90
init/compile_descr.F90 kpoints/kpoints.f90 io/xmlOutput.F90 kpoints/brzone2.f90 cdn/slab_dim.f90 cdn/slabgeom.f90 dos/nstm3.f90 cdn/int_21.f90
......
......@@ -37,7 +37,7 @@ else()
endif()
message("${Green} Compile GPU version : ${CReset} ${FLEUR_USE_GPU}")
if (FLEUR_USE_GPU)
message("${Green} CuSolver Library found : ${CReset} ${FLEUR_USE_CUSOLVER}")
message("${Green} CuSolver Library found : ${CReset} ${FLEUR_USE_CUSOLVER}")
endif()
message("\n")
message("${Green}Compile serial version : ${CReset} ${FLEUR_USE_SERIAL}")
......
......@@ -11,14 +11,14 @@ if (CLI_FLEUR_USE_GPU)
elseif(${CLI_FLEUR_USE_GPU} MATCHES "cuda9.1")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=cuda9.1,cc60 -Mcuda=rdc -Mcudalib=cublas")
elseif(${CLI_FLEUR_USE_GPU} MATCHES "nvtx")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=cuda9.0,cc60 -Mcuda=rdc -Mcudalib=cublas -lnvToolsExt ")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=cuda9.1,cc60 -Mcuda=rdc -Mcudalib=cublas -lnvToolsExt ")
elseif(${CLI_FLEUR_USE_GPU} MATCHES "emu")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=emu -Mcudalib=cublas -Minfo=accel ")
endif()
set(FLEUR_MPI_DEFINITIONS ${FLEUR_MPI_DEFINITIONS} "CPP_GPU" "CPP_MANAGED=,MANAGED")
set(FLEUR_DEFINITIONS ${FLEUR_DEFINITIONS} "CPP_GPU" "CPP_MANAGED=,MANAGED")
#Now check for cusolverDN library
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Mcuda")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Mcuda -ta=tesla,cuda9.1 ")
try_compile(FLEUR_USE_CUSOLVER ${CMAKE_BINARY_DIR} ${CMAKE_SOURCE_DIR}/cmake/tests/test_cusolver.c
LINK_LIBRARIES "-lcusolver"
)
......
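The try_compile probe above only needs a translation unit that links against -lcusolver, so FLEUR_USE_CUSOLVER reflects whether the library is actually usable with the current compiler flags. A minimal sketch of such a probe (an illustrative assumption, not the actual contents of cmake/tests/test_cusolver.c):

/* Link test: succeeds only if the cuSOLVER headers and -lcusolver are usable. */
#include <cusolverDn.h>

int main(void) {
    cusolverDnHandle_t handle;
    cusolverStatus_t status = cusolverDnCreate(&handle);
    if (status == CUSOLVER_STATUS_SUCCESS) cusolverDnDestroy(handle);
    return (status == CUSOLVER_STATUS_SUCCESS) ? 0 : 1;
}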
......@@ -56,19 +56,21 @@ void cusolver_complex(cuDoubleComplex *H,cuDoubleComplex *S,int n,int ne,double
assert(CUSOLVER_STATUS_SUCCESS == status);
/* default value of tolerance is machine zero */
status = cusolverDnXsyevjSetTolerance(&syevj_params,tol);
status = cusolverDnXsyevjSetTolerance(syevj_params,tol);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* default value of max. sweeps is 100 */
status = cusolverDnXsyevjSetMaxSweeps(&syevj_params,max_sweeps);
status = cusolverDnXsyevjSetMaxSweeps(syevj_params,max_sweeps);
assert(CUSOLVER_STATUS_SUCCESS == status);
printf("Allocate data \n");
/* step 3: copy A to device */
cudaStat2 = cudaMalloc ((void**)&d_W, sizeof(cuDoubleComplex) * n);
cudaStat3 = cudaMalloc ((void**)&d_info, sizeof(int));
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
printf("query working space \n");
/* step 4: query working space of sygvj */
status = cusolverDnZhegvj_bufferSize(cusolverH,itype,jobz,uplo,n,H,n,S,n,d_W,&lwork,syevj_params);
assert(CUSOLVER_STATUS_SUCCESS == status);
......@@ -76,6 +78,7 @@ void cusolver_complex(cuDoubleComplex *H,cuDoubleComplex *S,int n,int ne,double
cudaStat1 = cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex)*lwork);
assert(cudaSuccess == cudaStat1);
printf("compute eigen-pair \n");
/* step 5: compute eigen-pair */
status = cusolverDnZhegvj(cusolverH,itype,jobz,uplo,n,H,n,S,n,d_W,d_work,lwork,d_info,syevj_params);
cudaStat1 = cudaDeviceSynchronize();
......@@ -163,26 +166,26 @@ void cusolver_real(double *H,double *S,int n,int ne,double tol,int max_sweeps,do
assert(CUSOLVER_STATUS_SUCCESS == status);
/* default value of tolerance is machine zero */
status = cusolverDnXsyevjSetTolerance(&syevj_params,tol);
status = cusolverDnXsyevjSetTolerance(syevj_params,tol);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* default value of max. sweeps is 100 */
status = cusolverDnXsyevjSetMaxSweeps(&syevj_params,max_sweeps);
status = cusolverDnXsyevjSetMaxSweeps(syevj_params,max_sweeps);
assert(CUSOLVER_STATUS_SUCCESS == status);
printf("Allocate data \n");
/* step 3: copy A to device */
cudaStat2 = cudaMalloc ((void**)&d_W, sizeof(double) * n);
cudaStat3 = cudaMalloc ((void**)&d_info, sizeof(int));
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
printf("query working space \n");
/* step 4: query working space of sygvj */
status = cusolverDnDsygvj_bufferSize(cusolverH,itype,jobz,uplo,n,H,n,S,n,d_W,&lwork,syevj_params);
assert(CUSOLVER_STATUS_SUCCESS == status);
cudaStat1 = cudaMalloc((void**)&d_work, sizeof(double)*lwork);
assert(cudaSuccess == cudaStat1);
printf("compute eigen-pair \n");
/* step 5: compute eigen-pair */
status = cusolverDnDsygvj(cusolverH,itype,jobz,uplo,n,H,n,S,n,d_W,d_work,lwork,d_info,syevj_params);
cudaStat1 = cudaDeviceSynchronize();
......@@ -219,9 +222,9 @@ void cusolver_real(double *H,double *S,int n,int ne,double tol,int max_sweeps,do
if (d_info ) cudaFree(d_info);
if (d_work ) cudaFree(d_work);
if (cusolverH ) cusolverDnDestroy(cusolverH);
if (stream ) cudaStreamDestroy(stream);
if (syevj_params) cusolverDnDestroySyevjInfo(syevj_params);
if (stream ) cudaStreamDestroy(stream);
if (cusolverH ) cusolverDnDestroy(cusolverH);
// cudaDeviceReset();
......
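The substantive fix in this file is that cusolverDnXsyevjSetTolerance and cusolverDnXsyevjSetMaxSweeps now receive syevj_params by value: syevjInfo_t is an opaque handle, so only cusolverDnCreateSyevjInfo takes its address. A minimal sketch of the corrected setup (the helper name setup_syevj is illustrative, not part of the source):

/* Jacobi eigensolver parameter setup with the handle passed by value. */
#include <assert.h>
#include <cusolverDn.h>

static syevjInfo_t setup_syevj(double tol, int max_sweeps) {
    syevjInfo_t params = NULL;
    cusolverStatus_t status = cusolverDnCreateSyevjInfo(&params); /* creation takes &params */
    assert(CUSOLVER_STATUS_SUCCESS == status);
    status = cusolverDnXsyevjSetTolerance(params, tol);           /* handle by value, not &params */
    assert(CUSOLVER_STATUS_SUCCESS == status);
    status = cusolverDnXsyevjSetMaxSweeps(params, max_sweeps);
    assert(CUSOLVER_STATUS_SUCCESS == status);
    return params;
}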
......@@ -19,22 +19,19 @@ CONTAINS
#ifdef CPP_GPU
ATTRIBUTES(global) SUBROUTINE synth_ab(grid,block,n,lmax,ab_size,gkrot_dev,fj,gj,c_ph,ab)
ATTRIBUTES(global) SUBROUTINE synth_ab(loop_size,n,lmax,ab_size,gkrot_dev,fj,gj,c_ph,ab)
USE m_ylm
INTEGER, VALUE, INTENT(IN) :: grid, block, n, lmax, ab_size
INTEGER, VALUE, INTENT(IN) :: loop_size, n, lmax, ab_size
REAL, DEVICE, INTENT(IN) :: gkrot_dev(:,:),fj(:,:),gj(:,:)
COMPLEX,DEVICE, INTENT(IN) :: c_ph(:)
COMPLEX,DEVICE, INTENT (OUT) :: ab(:,:)
COMPLEX,ALLOCATABLE :: ylm(:)
INTEGER :: k,l,ll1,m
INTEGER :: loop_start, loop_end, i, loop_size
INTEGER :: k,l,ll1,m,i
INTEGER :: loop_start, loop_end
ALLOCATE(ylm((lmax+1)**2))
k = (blockidx%x-1)*blockdim%x + threadidx%x
loop_size = max(n/(grid*block),1)
if (loop_size * grid*block < n) loop_size = loop_size + 1
loop_start = (k-1) * loop_size + 1
loop_end = loop_start + loop_size - 1
if (loop_end > n ) loop_end = n
......@@ -90,7 +87,7 @@ CONTAINS
COMPLEX,ALLOCATABLE,DEVICE :: c_ph_dev(:,:)
REAL, ALLOCATABLE,DEVICE :: gkrot_dev(:,:)
INTEGER :: grid, block
INTEGER :: grid, block, loop_size
INTEGER :: istat
call nvtxStartRange("hsmt_ab",3)
......@@ -129,13 +126,13 @@ CONTAINS
!--> synthesize the complex conjugates of a and b
! pretty ugly solution
block = 256
grid = lapw%nv(1)/(block*4) + 1
CALL synth_ab<<<grid,block>>>(grid,block,lapw%nv(1),lmax,ab_size,gkrot_dev,&
grid = 30 ! number of blocks in the grid
block = 32 ! number of threads in a block
loop_size = max(lapw%nv(1)/(grid*block),1) !number of iterations performed by each thread
if (loop_size * grid*block < lapw%nv(1)) loop_size = loop_size + 1
CALL synth_ab<<<grid,block>>>(loop_size,lapw%nv(1),lmax,ab_size,gkrot_dev,&
fj(:,:,iintsp),gj(:,:,iintsp),c_ph_dev(:,iintsp),ab)
IF (PRESENT(abclo)) THEN
print*, "Ooooops, TODO in hsmt_ab"
!DO k = 1,lapw%nv(1)
......
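The launch configuration for synth_ab changes from a data-dependent grid to a fixed grid of 30 blocks with 32 threads each; every thread now processes a contiguous chunk of loop_size = ceil(nv/(grid*block)) basis functions, and loop_size is computed once on the host and passed to the kernel instead of being recomputed per thread. In CUDA C the same partitioning looks roughly like this (kernel and variable names are illustrative, not part of the Fortran source):

/* Each of grid*block threads handles loop_size consecutive indices of 1..n. */
__global__ void chunked_kernel(int loop_size, int n)
{
    int k = blockIdx.x * blockDim.x + threadIdx.x + 1;   /* 1-based, as in the Fortran */
    int loop_start = (k - 1) * loop_size + 1;
    int loop_end   = loop_start + loop_size - 1;
    if (loop_end > n) loop_end = n;
    for (int i = loop_start; i <= loop_end; ++i) {
        /* ... work on basis function i ... */
    }
}

/* host side, mirroring the Fortran:
     grid = 30; block = 32;
     loop_size = max(n / (grid * block), 1);
     if (loop_size * grid * block < n) loop_size += 1;
     chunked_kernel<<<grid, block>>>(loop_size, n);       */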
MODULE m_hsmt_nonsph_GPU
#define CPP_BLOCKSIZE 4096
! USE m_juDFT
!$ USE omp_lib
!TODO:
! Check what can be done in l_noco=.true. case in terms of use of zgemm or aa_block
! Check what happens in case of CPP_INVERSION -> real matrix a
IMPLICIT NONE
CONTAINS
SUBROUTINE hsmt_nonsph_GPU(DIMENSION,atoms,sym,SUB_COMM, n_size,n_rank,input,isp,nintsp,&
hlpmsize,noco,l_socfirst, lapw, cell,tlmplm, fj,gj,gk,vk,oneD,l_real,aa_r,aa_c)
#include"cpp_double.h"
USE m_constants, ONLY : tpi_const
USE m_ylm
USE m_hsmt_spinor
USE m_hsmt_hlptomat
USE m_types
#if defined(_OPENACC)
! USE nvtx
USE cublas
#endif
IMPLICIT NONE
TYPE(t_dimension),INTENT(IN):: DIMENSION
TYPE(t_oneD),INTENT(IN) :: oneD
TYPE(t_input),INTENT(IN) :: input
TYPE(t_noco),INTENT(IN) :: noco
TYPE(t_sym),INTENT(IN) :: sym
TYPE(t_cell),INTENT(IN) :: cell
TYPE(t_atoms),INTENT(IN) :: atoms
TYPE(t_lapw),INTENT(INOUT) :: lapw !lapw%nv_tot is updated
! ..
! .. Scalar Arguments ..
INTEGER, INTENT (IN) :: nintsp,isp
INTEGER, INTENT (IN) :: SUB_COMM,n_size,n_rank
INTEGER, INTENT (IN) :: hlpmsize
LOGICAL, INTENT (IN) :: l_socfirst
! ..
! .. Array Arguments ..
TYPE(t_tlmplm),INTENT(IN)::tlmplm
REAL, INTENT(IN) :: fj(:,0:,:,:),gj(:,0:,:,:)
REAL,INTENT(IN) :: gk(:,:,:),vk(:,:,:)
!-odim
!+odim
LOGICAL, INTENT(IN) :: l_real
REAL, ALLOCATABLE, INTENT (INOUT) :: aa_r(:)!(matsize)
COMPLEX,ALLOCATABLE, INTENT (INOUT) :: aa_c(:)
COMPLEX,PARAMETER :: one=CMPLX(1.0,0.0),zero=CMPLX(0.0,0.0)
! ..
! .. Local Scalars ..
INTEGER :: i,iii,ii,ij,im,in,k,ki,kj,l,ll1,lm,lmp,lp,jd,m
INTEGER :: mp,n,na,nn,np,kjmax,iintsp,jintsp
INTEGER :: kiStart, kiiSTart, kc
INTEGER :: nc ,kii,spin2,ab_dim,lnonsphd,bsize,bsize2,kb
REAL :: th,invsfct
COMPLEX :: term,chi11,chi21,chi22,chihlp
! ..
! .. Local Arrays ..
COMPLEX,ALLOCATABLE :: aa_block(:,:)
COMPLEX,ALLOCATABLE :: dtd(:,:),dtu(:,:),utd(:,:),utu(:,:)
REAL :: bmrot(3,3),gkrot(DIMENSION%nvd,3),vmult(3),v(3)
COMPLEX:: ylm( (atoms%lmaxd+1)**2 ),chi(2,2)
! ..
#if defined(_OPENACC)
COMPLEX,PINNED,ALLOCATABLE :: a(:,:,:),b(:,:,:)
#else
COMPLEX, ALLOCATABLE :: a(:,:,:),b(:,:,:)
#endif
COMPLEX, ALLOCATABLE :: ax(:,:),bx(:,:)
COMPLEX, ALLOCATABLE :: c_ph(:,:)
COMPLEX,ALLOCATABLE :: aahlp(:),aa_tmphlp(:)
INTEGER :: cublas_stream,istat
#if defined(_OPENACC)
! call nvtxStartRange("hsmt_nonsph",1)
#endif
lnonsphd=MAXVAL(atoms%lnonsph)*(MAXVAL(atoms%lnonsph)+2)
ALLOCATE(dtd(0:lnonsphd,0:lnonsphd),utd(0:lnonsphd,0:lnonsphd),dtu(0:lnonsphd,0:lnonsphd),utu(0:lnonsphd,0:lnonsphd))
!Decide how to distribute the work
IF ( noco%l_noco .AND. (.NOT. noco%l_ss) ) THEN
ALLOCATE ( aahlp(hlpmsize),aa_tmphlp(hlpmsize) )
ELSE
ALLOCATE ( aahlp(1),aa_tmphlp(1) )
END IF
ALLOCATE(aa_block(CPP_BLOCKSIZE,MAXVAL(lapw%nv)))
ab_dim=1
IF (noco%l_ss) ab_dim=2
ALLOCATE(a(DIMENSION%nvd,0:DIMENSION%lmd,ab_dim),b(DIMENSION%nvd,0:DIMENSION%lmd,ab_dim))
ALLOCATE(ax(DIMENSION%nvd,0:DIMENSION%lmd),bx(DIMENSION%nvd,0:DIMENSION%lmd))
ALLOCATE(c_ph(DIMENSION%nvd,ab_dim))
!$acc data copy(aa_r,aa_c) copyin(tlmplm, tlmplm%tdd, tlmplm%tdu, tlmplm%tud,tlmplm%tuu, tlmplm%ind, atoms,atoms%lnonsph,lapw,lapw%nv,noco ) create(utu,utd,dtu,dtd,ax,bx,a,b,aa_block,aahlp)
#if defined(_OPENACC)
! call nvtxStartRange("hsmt_nonsph_outer_data",2)
#endif
ntyploop: DO n=1,atoms%ntype
IF (noco%l_noco) THEN
IF (.NOT.noco%l_ss) aahlp=CMPLX(0.0,0.0)
IF (.NOT.noco%l_ss) aa_tmphlp=CMPLX(0.0,0.0)
CALL hsmt_spinor(isp,n, noco,input, chi, chi11, chi21, chi22)
ENDIF
DO nn = 1,atoms%neq(n)
a=0.0
b=0.0
na = SUM(atoms%neq(:n-1))+nn
IF (atoms%lnonsph(n)<0) CYCLE ntyploop
IF ((atoms%invsat(na)==0) .OR. (atoms%invsat(na)==1)) THEN
IF (atoms%invsat(na)==0) invsfct = 1
IF (atoms%invsat(na)==1) invsfct = 2
np = sym%invtab(atoms%ngopr(na))
IF (oneD%odi%d1) np = oneD%ods%ngopr(na)
!Using double buffering, create_ab could be overlapped with the following GPU work
#if defined(_OPENACC)
! call nvtxStartRange("create_ab",6)
#endif
!---> loop over interstitial spins
DO iintsp = 1,nintsp
IF (noco%l_constr.OR.l_socfirst) THEN
spin2=isp
ELSE
spin2=iintsp
ENDIF
!---> set up phase factors
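! i.e. c_ph(k) = exp(-2*pi*i*th), th = dot_product(G(k)+(iintsp-1.5)*noco%qss, atoms%taual(:,na))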
DO k = 1,lapw%nv(iintsp)
th= DOT_PRODUCT((/lapw%k1(k,iintsp),lapw%k2(k,iintsp),lapw%k3(k,iintsp)/)+(iintsp-1.5)*noco%qss,atoms%taual(:,na))
c_ph(k,iintsp) = CMPLX(COS(tpi_const*th),-SIN(tpi_const*th))
END DO
IF (np==1) THEN
gkrot( 1:lapw%nv(iintsp),:) = gk( 1:lapw%nv(iintsp),:,iintsp)
ELSE
IF (oneD%odi%d1) THEN
bmrot=MATMUL(oneD%ods%mrot(:,:,np),cell%bmat)
ELSE
bmrot=MATMUL(1.*sym%mrot(:,:,np),cell%bmat)
END IF
DO k = 1,lapw%nv(iintsp)
!--> apply the rotation that brings this atom into the
!--> representative (this is the definition of ngopr(na))
!--> and transform to cartesian coordinates
v(:) = vk(k,:,iintsp)
gkrot(k,:) = MATMUL(TRANSPOSE(bmrot),v)
END DO
END IF
DO k = 1,lapw%nv(iintsp)
!--> generate spherical harmonics
vmult(:) = gkrot(k,:)
CALL ylm4(atoms%lnonsph(n),vmult,ylm)
!--> synthesize the complex conjugates of a and b
DO l = 0,atoms%lnonsph(n)
ll1 = l* (l+1)
DO m = -l,l
term = c_ph(k,iintsp)*ylm(ll1+m+1)
a(k,ll1+m,iintsp) = fj(k,l,n,spin2)*term
b(k,ll1+m,iintsp) = gj(k,l,n,spin2)*term
END DO
END DO
ENDDO !k-loop
!---> end loop over interstitial spin
ENDDO
!$acc update device(a,b)
#if defined(_OPENACC)
! call nvtxEndRange
#endif
!---> loops over the interstitial spin
DO iintsp = 1,nintsp
#if defined(_OPENACC)
! call nvtxStartRange("hsmt_nonsph_DO_jintsp",3)
#endif
DO jintsp = 1,iintsp
jd = 1 ; IF (noco%l_noco) jd = isp
!---> loop over l',m'
!$acc kernels
utu=0.0;utd=0.0;dtu=0.0;dtd=0.0
!!$OMP PARALLEL DO DEFAULT(NONE) PRIVATE(lp,mp,lmp,l,m,lm,in,utu,dtu,utd,dtd,im,k) &
!!$OMP SHARED(tlmplm,invsfct,lnonsph,nv,jintsp,jd,n)
!$acc loop collapse(2)
DO lmp=0,atoms%lnonsph(n)*(atoms%lnonsph(n)+2)
! lp=FLOOR(SQRT(1.0*lmp))
! mp=lmp-lp*(lp+1)
! IF (lp>atoms%lnonsph(n).OR.ABS(mp)>lp) STOP "BUG"
!---> loop over l,m
DO lm = 0,atoms%lnonsph(n)*(atoms%lnonsph(n)+2)
! DO l = 0,atoms%lnonsph(n)
! DO m = -l,l
! lm = l* (l+1) + m
in = tlmplm%ind(lmp,lm,n,jd)
IF (in/=-9999) THEN
IF (in>=0) THEN
utu(lm,lmp) =CONJG(tlmplm%tuu(in,n,jd))*invsfct
dtu(lm,lmp) =CONJG(tlmplm%tdu(in,n,jd))*invsfct
utd(lm,lmp) =CONJG(tlmplm%tud(in,n,jd))*invsfct
dtd(lm,lmp) =CONJG(tlmplm%tdd(in,n,jd))*invsfct
ELSE
im = -in
utu(lm,lmp) =tlmplm%tuu(im,n,jd)*invsfct
dtu(lm,lmp) =tlmplm%tud(im,n,jd)*invsfct
utd(lm,lmp) =tlmplm%tdu(im,n,jd)*invsfct
dtd(lm,lmp) =tlmplm%tdd(im,n,jd)*invsfct
END IF
!---> update ax, bx
END IF
! END DO
END DO
ENDDO
!$acc end loop
!$acc end kernels
!!$OMP END PARALLEL DO
lmp=atoms%lnonsph(n)*(atoms%lnonsph(n)+2)
!ax(:nv(jintsp),0:lmp)=(matmul(a(:nv(jintsp),0:lmp,jintsp),utu(0:lmp,0:lmp))+matmul(b(:nv(jintsp),0:lmp,jintsp),utd(0:lmp,0:lmp)))
!bx(:nv(jintsp),0:lmp)=(matmul(a(:nv(jintsp),0:lmp,jintsp),dtu(0:lmp,0:lmp))+matmul(b(:nv(jintsp),0:lmp,jintsp),dtd(0:lmp,0:lmp)))
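!The four ZGEMM calls below implement exactly these two commented matmuls:
!  ax = a*utu + b*utd   (beta=zero on the first call, beta=one to accumulate the second)
!  bx = a*dtu + b*dtd   (likewise for the third and fourth call)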
#if defined(_OPENACC)
! call nvtxStartRange("hsmt_nonsph_zgemm_1",4)
#endif
!$acc host_data use_device(a,b,utu,utd,dtu,dtd,ax,bx )
CALL zgemm("N","N",lapw%nv(jintsp),lmp+1,lmp+1,one,a(1,0,jintsp),SIZE(a,1),utu(0,0),SIZE(utu,1),zero,ax,SIZE(ax,1))
CALL zgemm("N","N",lapw%nv(jintsp),lmp+1,lmp+1,one,b(1,0,jintsp),SIZE(a,1),utd(0,0),SIZE(utu,1),one,ax,SIZE(ax,1))
CALL zgemm("N","N",lapw%nv(jintsp),lmp+1,lmp+1,one,a(1,0,jintsp),SIZE(a,1),dtu(0,0),SIZE(utu,1),zero,bx,SIZE(ax,1))
CALL zgemm("N","N",lapw%nv(jintsp),lmp+1,lmp+1,one,b(1,0,jintsp),SIZE(a,1),dtd(0,0),SIZE(utu,1),one,bx,SIZE(ax,1))
!$acc end host_data
#if defined(_OPENACC)
! call nvtxEndRange
#endif
!
!---> update hamiltonian and overlap matrices
nc = 0
IF ( noco%l_noco .AND. (n_size>1) ) THEN
lapw%nv_tot = lapw%nv(1) + lapw%nv(2)
ELSE
lapw%nv_tot = lapw%nv(iintsp)
ENDIF
kii=n_rank
#if defined(_OPENACC)
! call nvtxStartRange("hsmt_nonsph_while_kii",5)
#endif
DO WHILE(kii<lapw%nv_tot)
!DO kii = n_rank, nv_tot-1, n_size
ki = MOD(kii,lapw%nv(iintsp)) + 1
bsize=MIN(SIZE(aa_block,1),(lapw%nv(iintsp)-ki)/n_size+1) !Either use maximal blocksize or number of rows left to calculate
IF (bsize<1) EXIT !nothing more to do here
bsize2=bsize*n_size
bsize2=min(bsize2,lapw%nv(iintsp)-ki+1)
!aa_block(:bsize,:ki+bsize2-1)=matmul(a(ki:ki+bsize2-1:n_size,0:lmp,iintsp),conjg(transpose(ax(:ki+bsize2-1,0:lmp))))+ &
! matmul(b(ki:ki+bsize2-1:n_size,0:lmp,iintsp),conjg(transpose(bx(:ki+bsize2-1,0:lmp))))
IF (n_size==1) THEN !Make this a special case to avoid copy-in of the a array
!$acc host_data use_device( a,b,ax,bx,aa_block )
call zgemm("N","C",bsize,ki+bsize2-1,lmp+1,one,a(ki,0,iintsp),SIZE(a,1),ax(1,0),SIZE(ax,1),zero,aa_block,SIZE(aa_block,1))
call zgemm("N","C",bsize,ki+bsize2-1,lmp+1,one,b(ki,0,iintsp),SIZE(a,1),bx(1,0),SIZE(ax,1),one ,aa_block,SIZE(aa_block,1))
!$acc end host_data
ELSE
stop "Not implemented"
!$acc host_data use_device( a,b,ax,bx,aa_block )
!CALL zgemm("N","C",bsize,ki+bsize2-1,lmp+1,one,a(ki:ki+bsize2-1:n_size,0:lmp,iintsp), &
! SIZE(a(ki:ki+bsize2-1:n_size,0:lmp,iintsp),1),ax(1,0),SIZE(ax,1),zero,aa_block,SIZE(aa_block,1))
!CALL zgemm("N","C",bsize,ki+bsize2-1,lmp+1,one,b(ki:ki+bsize2-1:n_size,0:lmp,iintsp), &
! SIZE(a(ki:ki+bsize2-1:n_size,0:lmp,iintsp),1),bx(1,0),SIZE(ax,1),one,aa_block,SIZE(aa_block,1))
!$acc end host_data
ENDIF
IF ( noco%l_noco .AND. (.NOT. noco%l_ss) ) THEN
!$acc kernels
!$acc loop independent
DO kb=1,bsize
nc = 1+kii/n_size
ii = nc*(nc-1)/2*n_size-(nc-1)*(n_size-n_rank-1)
IF ( (n_size==1).OR.(kii+1<=lapw%nv(1)) ) THEN !
stop "not implemented"
!comments below must be reactivated
!aahlp(ii+1:ii+ki) = aahlp(ii+1:ii+ki)+MATMUL(CONJG(ax(:ki,:lmp)),a(ki,:,iintsp))+MATMUL(CONJG(bx(:ki,:lmp)),b(ki,:lmp,iintsp))
ELSE ! components for <2||2> block unused
!aa_tmphlp(:ki) = MATMUL(CONJG(ax(:ki,:lmp)),a(ki,:lmp,iintsp))+MATMUL(CONJG(bx(:ki,:DIMENSION%lmd)),b(ki,:lmp,iintsp))
!---> spin-down spin-down part
ij = ii + lapw%nv(1)
aa_c(ij+1:ij+ki)=aa_c(ij+1:ij+ki)+chi22*aa_tmphlp(:ki)
!---> spin-down spin-up part, lower triangle
ij = ii
aa_c(ij+1:ij+ki)=aa_c(ij+1:ij+ki)+chi21*aa_tmphlp(:ki)
ENDIF
!-||
ki=ki+n_size
kii=kii+n_size
ENDDO
!$acc end loop
!$acc end kernels
ELSEIF ( noco%l_noco .AND. noco%l_ss ) THEN
!$acc kernels
!$acc loop independent
DO kb=1,bsize
IF ( iintsp==1 .AND. jintsp==1 ) THEN
!---> spin-up spin-up part
kjmax = ki
chihlp = chi11
ii = (ki-1)*(ki)/2
ELSEIF ( iintsp==2 .AND. jintsp==2 ) THEN
!---> spin-down spin-down part
kjmax = ki
chihlp = chi22
ii = (lapw%nv(1)+atoms%nlotot+ki-1)*(lapw%nv(1)+atoms%nlotot+ki)/2+&
lapw%nv(1)+atoms%nlotot
ELSE
!---> spin-down spin-up part
kjmax = lapw%nv(1)
chihlp = chi21
ii = (lapw%nv(1)+atoms%nlotot+ki-1)*(lapw%nv(1)+atoms%nlotot+ki)/2
ENDIF
STOP "NOT implemented"
!next line must be reactivated
!aa_c(ii+1:ii+kjmax) = aa_c(ii+1:ii+kjmax) + chihlp*&
! (MATMUL(CONJG(ax(:kjmax,:lmp)),a(ki,:,iintsp))+MATMUL(CONJG(bx(:kjmax,:lmp)),b(ki,:lmp,iintsp)))
ki=ki+n_size
kii=kii+n_size
ENDDO
!$acc end loop
!$acc end kernels
ELSE
kiStart = ki
kiiSTart = kii
!$acc kernels
!$acc loop independent worker
DO kb=1,bsize
ki = kiStart + (kb-1)*n_size
kii = kiiStart + (kb-1)*n_size
nc = 1+kii/n_size
ii = nc*(nc-1)/2*n_size- (nc-1)*(n_size-n_rank-1)
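! ii = start offset of global row ki (which holds ki elements) within the packed
! lower-triangular matrix as stored on this rank; for n_size=1 it reduces to ki*(ki-1)/2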
if (l_real) THEN
! aa_r(ii+1:ii+ki) = aa_r(ii+1:ii+ki) + aa_block(kb,:ki)
!$acc loop independent vector
DO kc = 1, ki
aa_r(ii+kc) = aa_r(ii+kc) + aa_block(kb,kc)
END DO
ELSE
! aa_c(ii+1:ii+ki) = aa_c(ii+1:ii+ki) + aa_block(kb,:ki)
!$acc loop independent vector
DO kc = 1, ki
aa_c(ii+kc) = aa_c(ii+kc) + aa_block(kb,kc)
END DO
endif
!print*,ii,ki,kb
! IF (.not.apw(l)) THEN
!aa(ii+1:ii+ki) = aa(ii+1:ii+ki) + b(ki,lmp,iintsp)*bx(:ki)
! ENDIF
! ki=ki+n_size
! kii=kii+n_size
ENDDO
!$acc end loop
!$acc end kernels
ki = kiStart+bsize*n_size
kii = kiiSTart+bsize*n_size
ENDIF
!---> end loop over ki
END DO
#if defined(_OPENACC)
! call nvtxEndRange
#endif
!---> end loops over interstitial spin
ENDDO
#if defined(_OPENACC)
! call nvtxEndRange
#endif
ENDDO
ENDIF ! atoms%invsat(na) = 0 or 1
!---> end loop over equivalent atoms
END DO
IF ( noco%l_noco .AND. (.NOT. noco%l_ss) )THEN
!$acc update host(aahlp,aa_c)
CALL hsmt_hlptomat(atoms%nlotot,lapw%nv,sub_comm,chi11,chi21,chi22,aahlp,aa_c)
!$acc update device(aa_c)
ENDIF
!---> end loop over atom types (ntype)
ENDDO ntyploop
#if defined(_OPENACC)
! call nvtxEndRange
#endif
!$acc end data
#if defined(_OPENACC)
! call nvtxEndRange
#endif
RETURN
END SUBROUTINE hsmt_nonsph_GPU
END MODULE m_hsmt_nonsph_GPU
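Throughout hsmt_nonsph_GPU the arrays a, b, ax, bx and aa_block stay resident on the device inside the !$acc data region, and their device addresses are handed to ZGEMM through !$acc host_data use_device, so no host/device copies occur around the matrix multiplications (the Fortran achieves this with USE cublas and plain zgemm calls). A minimal sketch of the same pattern in OpenACC C with cuBLAS, with illustrative names and sizes:

/* OpenACC + cuBLAS: keep arrays device-resident and pass their device
   pointers to cublasZgemm inside a host_data region (sketch). */
#include <cublas_v2.h>

void dev_zgemm(int n, cuDoubleComplex *A, cuDoubleComplex *B, cuDoubleComplex *C) {
    cublasHandle_t h;
    cublasCreate(&h);
    cuDoubleComplex one  = make_cuDoubleComplex(1.0, 0.0);
    cuDoubleComplex zero = make_cuDoubleComplex(0.0, 0.0);
    #pragma acc data copyin(A[0:n*n], B[0:n*n]) copy(C[0:n*n])
    {
        #pragma acc host_data use_device(A, B, C)
        {
            /* C = A*B computed on the device from the device copies of A, B, C */
            cublasZgemm(h, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
                        &one, A, n, B, n, &zero, C, n);
        }
    }
    cublasDestroy(h);
}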
MODULE m_enpara
use m_juDFT
USE m_constants
! *************************************************************
! Module containing three subroutines
! r_enpara: read enpara file
! w_enpara: write enpara file
! mix_enpara: calculate new energy parameters
! *************************************************************
CONTAINS
SUBROUTINE w_enpara( &
& atoms,jspin,film,&
& enpara,&
& id)
!
! write enpara-file
!
USE m_types
IMPLICIT NONE
INTEGER, INTENT (IN) :: jspin,id
LOGICAL,INTENT(IN) :: film
TYPE(t_atoms),INTENT(IN) :: atoms
TYPE(t_enpara),INTENT(IN) :: enpara
INTEGER n,l,lo
LOGICAL l_opened