diff --git a/eigen/hsmt_ab.F90 b/eigen/hsmt_ab.F90 index 0c066831410d4b83869be7115b8ba53dc04dcb1c..5cc10bb4958a578d3dbaaaa8d8d8fc1024ccefd4 100644 --- a/eigen/hsmt_ab.F90 +++ b/eigen/hsmt_ab.F90 @@ -19,10 +19,10 @@ CONTAINS #ifdef _CUDA - ATTRIBUTES(global) SUBROUTINE synth_ab(grid,block,n,lmax,iintsp,ab_size,gkrot_dev,fj,gj,c_ph,ab) + ATTRIBUTES(global) SUBROUTINE synth_ab(grid,block,n,lmax,ab_size,gkrot_dev,fj,gj,c_ph,ab) USE m_ylm - INTEGER, VALUE, INTENT(IN) :: grid, block, n, lmax, iintsp,ab_size - REAL, DEVICE, INTENT(IN) :: gkrot_dev(:,:),fj(:,:,:),gj(:,:,:) + INTEGER, VALUE, INTENT(IN) :: grid, block, n, lmax, ab_size + REAL, DEVICE, INTENT(IN) :: gkrot_dev(:,:),fj(:,:),gj(:,:) COMPLEX,DEVICE, INTENT(IN) :: c_ph(:) COMPLEX,DEVICE, INTENT (OUT) :: ab(:,:) COMPLEX,ALLOCATABLE :: ylm(:) @@ -45,8 +45,8 @@ CONTAINS DO l = 0,lmax ll1 = l* (l+1) DO m = -l,l - ab(i,ll1+m+1) = CONJG(fj(i,l+1,iintsp)*c_ph(i)*ylm(ll1+m+1)) - ab(i,ll1+m+1+ab_size) = CONJG(gj(i,l+1,iintsp)*c_ph(i)*ylm(ll1+m+1)) + ab(i,ll1+m+1) = CONJG(fj(i,l+1)*c_ph(i)*ylm(ll1+m+1)) + ab(i,ll1+m+1+ab_size) = CONJG(gj(i,l+1)*c_ph(i)*ylm(ll1+m+1)) END DO END DO ENDDO @@ -91,7 +91,8 @@ CONTAINS COMPLEX,ALLOCATABLE,DEVICE :: c_ph_dev(:,:) REAL, ALLOCATABLE,DEVICE :: gkrot_dev(:,:) - INTEGER :: istat, grid, block + INTEGER :: grid, block + !INTEGER :: istat lmax=MERGE(atoms%lnonsph(n),atoms%lmax(n),l_nonsph) @@ -129,14 +130,15 @@ CONTAINS !--> synthesize the complex conjugates of a and b !call nvtxStartRange("hsmt_synthAB",5) - istat = cudaDeviceSynchronize() + !istat = cudaDeviceSynchronize() ! pretty ugly solution block = 256 grid = lapw%nv(1)/(block*4) + 1 - CALL synth_ab<<>>(grid,block,lapw%nv(1),lmax,iintsp,ab_size,gkrot_dev,fj,gj,c_ph_dev(:,iintsp),ab) + CALL synth_ab<<>>(grid,block,lapw%nv(1),lmax,ab_size,gkrot_dev,& + fj(:,:,iintsp),gj(:,:,iintsp),c_ph_dev(:,iintsp),ab) - istat = cudaDeviceSynchronize() + !istat = cudaDeviceSynchronize() !call nvtxEndRange IF (PRESENT(abclo)) THEN