Commit a732811a authored by Matthias Redies's avatar Matthias Redies

Merge branch 'develop' into MetaGGA

parents 30e80b24 ede0053b
......@@ -2,7 +2,7 @@ include_directories(include)
set(c_filesInpgen io/xml/inputSchema.h io/xml/dropInputSchema.c)
set(c_filesFleur io/xml/inputSchema.h io/xml/dropInputSchema.c io/xml/xmlInterfaceWrapper.c)
if(FLEUR_USE_GPU)
if(FLEUR_USE_CUSOLVER)
set(c_filesFleur ${c_filesFleur} diagonalization/cusolver.c)
endif()
......
......@@ -36,6 +36,9 @@ else()
message("${Green} ChASE Library found : ${CReset} ---")
endif()
message("${Green} Compile GPU version : ${CReset} ${FLEUR_USE_GPU}")
if (FLEUR_USE_GPU)
message("${Green} CuSolver Library found : ${CReset} ${FLEUR_USE_CUSOLVER}")
endif()
message("\n")
message("${Green}Compile serial version : ${CReset} ${FLEUR_USE_SERIAL}")
message("${Green}Compile parallel version : ${CReset} ${FLEUR_USE_MPI}")
......
......@@ -7,12 +7,24 @@ if (CLI_FLEUR_USE_GPU)
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -ta=tesla:cuda8.0,cc60 -Mcuda:kepler+ -Minfo=accel -Mcudalib=cublas -acc ")
message("Using cuda8")
elseif(${CLI_FLEUR_USE_GPU} MATCHES "cuda9")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=cuda9.0,cc60 -Mcuda=rdc -Mcudalib=cublas")
elseif(${CLI_FLEUR_USE_GPU} MATCHES "nvtx")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=cuda9.0,cc60 -Mcuda=rdc -Mcudalib=cublas -lnvToolsExt ")
elseif(${CLI_FLEUR_USE_GPU} MATCHES "emu")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=emu -Mcudalib=cublas -Minfo=accel ")
endif()
set(FLEUR_MPI_DEFINITIONS ${FLEUR_MPI_DEFINITIONS} "CPP_GPU")
set(FLEUR_DEFINITIONS ${FLEUR_DEFINITIONS} "CPP_GPU")
#Now check for cusolverDN library
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Mcuda")
try_compile(FLEUR_USE_CUSOLVER ${CMAKE_BINARY_DIR} ${CMAKE_SOURCE_DIR}/cmake/tests/test_cusolver.c
LINK_LIBRARIES "-lcusolver"
)
if (FLEUR_USE_CUSOLVER)
set(${FLEUR_LIBRARIES} "${FLEUR_LIBRARIES};-lcusolver")
set(FLEUR_MPI_DEFINITIONS ${FLEUR_MPI_DEFINITIONS} "CPP_CUSOLVER")
set(FLEUR_DEFINITIONS ${FLEUR_DEFINITIONS} "CPP_CUSOLVER")
endif()
else()
set(FLEUR_USE_GPU FALSE)
endif()
/*
!--------------------------------------------------------------------------------
! Copyright (c) 2016 Peter Grünberg Institut, Forschungszentrum Jülich, Germany
! This file is part of FLEUR and available as free software under the conditions
! of the MIT license as expressed in the LICENSE file in more detail.
!--------------------------------------------------------------------------------
*/
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
/* Interface for the cusolverDN routines for solving a generalized Eigenvalue problem
Code adopted from the example in the documentation
*/
void cusolver_complex(cuDoubleComplex *H,cuDoubleComplex *S,int n,int ne,double tol,int max_sweeps,double* eig,cuDoubleComplex *z){
cusolverDnHandle_t cusolverH = NULL;
cudaStream_t stream = NULL;
syevjInfo_t syevj_params = NULL;
cusolverStatus_t status = CUSOLVER_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
cudaError_t cudaStat3 = cudaSuccess;
double *d_W = NULL; /* eigenvalues on device*/
int *d_info = NULL; /* error info */
int lwork = 0; /* size of workspace */
cuDoubleComplex *d_work = NULL; /* device workspace for syevj */
int info = 0; /* host copy of error info */
/* configuration of syevj */
const cusolverEigType_t itype = CUSOLVER_EIG_TYPE_1; //Solve H psi=S lambda psi
const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; // compute eigenvectors.
const cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
/* numerical results of syevj */
double residual = 0;
int executed_sweeps = 0;
/* step 1: create cusolver handle, bind a stream */
status = cusolverDnCreate(&cusolverH);
assert(CUSOLVER_STATUS_SUCCESS == status);
cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
assert(cudaSuccess == cudaStat1);
status = cusolverDnSetStream(cusolverH, stream);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* step 2: configuration of syevj */
status = cusolverDnCreateSyevjInfo(&syevj_params);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* default value of tolerance is machine zero */
status = cusolverDnXsyevjSetTolerance(&syevj_params,tol);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* default value of max. sweeps is 100 */
status = cusolverDnXsyevjSetMaxSweeps(&syevj_params,max_sweeps);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* step 3: copy A to device */
cudaStat2 = cudaMalloc ((void**)&d_W, sizeof(cuDoubleComplex) * n);
cudaStat3 = cudaMalloc ((void**)&d_info, sizeof(int));
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
/* step 4: query working space of sygvj */
status = cusolverDnZhegvj_bufferSize(cusolverH,itype,jobz,uplo,n,H,n,S,n,d_W,&lwork,syevj_params);
assert(CUSOLVER_STATUS_SUCCESS == status);
cudaStat1 = cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex)*lwork);
assert(cudaSuccess == cudaStat1);
/* step 5: compute eigen-pair */
status = cusolverDnZhegvj(cusolverH,itype,jobz,uplo,n,H,n,S,n,d_W,d_work,lwork,d_info,syevj_params);
cudaStat1 = cudaDeviceSynchronize();
assert(CUSOLVER_STATUS_SUCCESS == status);
assert(cudaSuccess == cudaStat1);
cudaStat1 = cudaMemcpy(eig, d_W, sizeof(double)*ne, cudaMemcpyDeviceToHost);
cudaStat2 = cudaMemcpy(z, H, sizeof(cuDoubleComplex)*n*ne, cudaMemcpyDeviceToHost);
cudaStat3 = cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
if ( 0 == info ){
printf("sygvj converges \n");
}else if ( 0 > info ){
printf("%d-th parameter is wrong \n", -info);
exit(1);
}else{
printf("WARNING: info = %d : sygvj does not converge \n", info );
}
status = cusolverDnXsyevjGetSweeps(cusolverH,syevj_params,&executed_sweeps);
assert(CUSOLVER_STATUS_SUCCESS == status);
status = cusolverDnXsyevjGetResidual(cusolverH,syevj_params,&residual);
assert(CUSOLVER_STATUS_SUCCESS == status);
printf("residual |A - V*W*V**H|_F = %E \n", residual );
printf("number of executed sweeps = %d \n", executed_sweeps );
/* free resources */
if (d_W ) cudaFree(d_W);
if (d_info ) cudaFree(d_info);
if (d_work ) cudaFree(d_work);
if (cusolverH ) cusolverDnDestroy(cusolverH);
if (stream ) cudaStreamDestroy(stream);
if (syevj_params) cusolverDnDestroySyevjInfo(syevj_params);
// cudaDeviceReset();
return ;
}
void cusolver_real(double *H,double *S,int n,int ne,double tol,int max_sweeps,double* eig,double *z){
cusolverDnHandle_t cusolverH = NULL;
cudaStream_t stream = NULL;
syevjInfo_t syevj_params = NULL;
cusolverStatus_t status = CUSOLVER_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
cudaError_t cudaStat3 = cudaSuccess;
double *d_W = NULL; /* eigenvalues on device*/
int *d_info = NULL; /* error info */
int lwork = 0; /* size of workspace */
double *d_work = NULL; /* device workspace for syevj */
int info = 0; /* host copy of error info */
/* configuration of syevj */
const cusolverEigType_t itype = CUSOLVER_EIG_TYPE_1; //Solve H psi=S lambda psi
const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; // compute eigenvectors.
const cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
/* numerical results of syevj */
double residual = 0;
int executed_sweeps = 0;
/* step 1: create cusolver handle, bind a stream */
status = cusolverDnCreate(&cusolverH);
assert(CUSOLVER_STATUS_SUCCESS == status);
cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
assert(cudaSuccess == cudaStat1);
status = cusolverDnSetStream(cusolverH, stream);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* step 2: configuration of syevj */
status = cusolverDnCreateSyevjInfo(&syevj_params);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* default value of tolerance is machine zero */
status = cusolverDnXsyevjSetTolerance(&syevj_params,tol);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* default value of max. sweeps is 100 */
status = cusolverDnXsyevjSetMaxSweeps(&syevj_params,max_sweeps);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* step 3: copy A to device */
cudaStat2 = cudaMalloc ((void**)&d_W, sizeof(double) * n);
cudaStat3 = cudaMalloc ((void**)&d_info, sizeof(int));
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
/* step 4: query working space of sygvj */
status = cusolverDnDsygvj_bufferSize(cusolverH,itype,jobz,uplo,n,H,n,S,n,d_W,&lwork,syevj_params);
assert(CUSOLVER_STATUS_SUCCESS == status);
cudaStat1 = cudaMalloc((void**)&d_work, sizeof(double)*lwork);
assert(cudaSuccess == cudaStat1);
/* step 5: compute eigen-pair */
status = cusolverDnDsygvj(cusolverH,itype,jobz,uplo,n,H,n,S,n,d_W,d_work,lwork,d_info,syevj_params);
cudaStat1 = cudaDeviceSynchronize();
assert(CUSOLVER_STATUS_SUCCESS == status);
assert(cudaSuccess == cudaStat1);
cudaStat1 = cudaMemcpy(eig, d_W, sizeof(double)*ne, cudaMemcpyDeviceToHost);
cudaStat2 = cudaMemcpy(z, H, sizeof(double)*n*ne, cudaMemcpyDeviceToHost);
cudaStat3 = cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
if ( 0 == info ){
printf("sygvj converges \n");
}else if ( 0 > info ){
printf("%d-th parameter is wrong \n", -info);
exit(1);
}else{
printf("WARNING: info = %d : sygvj does not converge \n", info );
}
status = cusolverDnXsyevjGetSweeps(cusolverH,syevj_params,&executed_sweeps);
assert(CUSOLVER_STATUS_SUCCESS == status);
status = cusolverDnXsyevjGetResidual(cusolverH,syevj_params,&residual);
assert(CUSOLVER_STATUS_SUCCESS == status);
printf("residual |A - V*W*V**H|_F = %E \n", residual );
printf("number of executed sweeps = %d \n", executed_sweeps );
/* free resources */
if (d_W ) cudaFree(d_W);
if (d_info ) cudaFree(d_info);
if (d_work ) cudaFree(d_work);
if (cusolverH ) cusolverDnDestroy(cusolverH);
if (stream ) cudaStreamDestroy(stream);
if (syevj_params) cusolverDnDestroySyevjInfo(syevj_params);
// cudaDeviceReset();
return ;
}
void main(){
};
......@@ -56,7 +56,7 @@ void cusolver_complex(cuDoubleComplex *H,cuDoubleComplex *S,int n,int ne,double
assert(CUSOLVER_STATUS_SUCCESS == status);
/* default value of tolerance is machine zero */
status = cusolverDnXsyevjSetTolerance(&svj_params,tol);
status = cusolverDnXsyevjSetTolerance(&syevj_params,tol);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* default value of max. sweeps is 100 */
......@@ -163,7 +163,7 @@ void cusolver_real(double *H,double *S,int n,int ne,double tol,int max_sweeps,do
assert(CUSOLVER_STATUS_SUCCESS == status);
/* default value of tolerance is machine zero */
status = cusolverDnXsyevjSetTolerance(&svj_params,tol);
status = cusolverDnXsyevjSetTolerance(&syevj_params,tol);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* default value of max. sweeps is 100 */
......
......@@ -11,7 +11,7 @@ MODULE m_cusolver_diag
#endif
IMPLICIT NONE
PRIVATE
#ifdef CPP_GPU
#ifdef CPP_CUSOLVER
INTERFACE
SUBROUTINE cusolver_real(H,S,n,ne,tol,max_sweeps,eig,z) BIND(C,name="cusolver_real")
USE iso_c_binding
......@@ -46,7 +46,7 @@ CONTAINS
CLASS(t_mat),ALLOCATABLE,INTENT(OUT) :: zmat
REAL,INTENT(OUT) :: eig(:)
#ifdef CPP_GPU
#ifdef CPP_CUSOLVER
INTEGER,PARAMETER:: max_sweeps=15
REAL :: tol=1E-7
......
......@@ -34,7 +34,7 @@ MODULE m_eigen_diag
#else
INTEGER,PARAMETER:: diag_chase=-7
#endif
#ifdef CPP_GPU
#ifdef CPP_CUSOLVER
INTEGER,PARAMETER:: diag_cusolver=8
#else
INTEGER,PARAMETER:: diag_cusolver=-8
......
......@@ -9,7 +9,7 @@ MODULE m_hsmt_ab
INTERFACE hsmt_ab
module procedure hsmt_ab_cpu
#ifdef _CUDA
#ifdef CPP_GPU
module procedure hsmt_ab_gpu
#endif
END INTERFACE
......@@ -17,13 +17,13 @@ MODULE m_hsmt_ab
CONTAINS
#ifdef _CUDA
#ifdef CPP_GPU
ATTRIBUTES(global) SUBROUTINE synth_ab(grid,block,n,lmax,iintsp,ab_size,gkrot_dev,fj,gj,c_ph,ab)
ATTRIBUTES(global) SUBROUTINE synth_ab(grid,block,n,lmax,ab_size,gkrot_dev,fj,gj,c_ph,ab)
USE m_ylm
INTEGER, VALUE, INTENT(IN) :: grid, block, n, lmax, iintsp,ab_size
REAL, DEVICE, INTENT(IN) :: gkrot_dev(:,:),fj(:,:,:),gj(:,:,:)
COMPLEX,DEVICE, INTENT(IN) :: c_ph(:,:)
INTEGER, VALUE, INTENT(IN) :: grid, block, n, lmax, ab_size
REAL, DEVICE, INTENT(IN) :: gkrot_dev(:,:),fj(:,:),gj(:,:)
COMPLEX,DEVICE, INTENT(IN) :: c_ph(:)
COMPLEX,DEVICE, INTENT (OUT) :: ab(:,:)
COMPLEX,ALLOCATABLE :: ylm(:)
INTEGER :: k,l,ll1,m
......@@ -45,8 +45,8 @@ CONTAINS
DO l = 0,lmax
ll1 = l* (l+1)
DO m = -l,l
ab(i,ll1+m+1) = CONJG(fj(i,l+1,iintsp)*c_ph(i,iintsp)*ylm(ll1+m+1))
ab(i,ll1+m+1+ab_size) = CONJG(gj(i,l+1,iintsp)*c_ph(i,iintsp)*ylm(ll1+m+1))
ab(i,ll1+m+1) = CONJG(fj(i,l+1)*c_ph(i)*ylm(ll1+m+1))
ab(i,ll1+m+1+ab_size) = CONJG(gj(i,l+1)*c_ph(i)*ylm(ll1+m+1))
END DO
END DO
ENDDO
......@@ -62,7 +62,7 @@ CONTAINS
USE m_ylm
USE m_apws
USE cudafor
USE nvtx
! USE nvtx
IMPLICIT NONE
TYPE(t_sym),INTENT(IN) :: sym
TYPE(t_cell),INTENT(IN) :: cell
......@@ -91,9 +91,9 @@ CONTAINS
COMPLEX,ALLOCATABLE,DEVICE :: c_ph_dev(:,:)
REAL, ALLOCATABLE,DEVICE :: gkrot_dev(:,:)
INTEGER :: istat, grid, block
INTEGER :: grid, block
!INTEGER :: istat
!call nvtxStartRange("hsmt_ab",2)
lmax=MERGE(atoms%lnonsph(n),atoms%lmax(n),l_nonsph)
ALLOCATE(c_ph_dev(lapw%nv(1),MERGE(2,1,noco%l_ss)))
......@@ -130,14 +130,15 @@ CONTAINS
!--> synthesize the complex conjugates of a and b
!call nvtxStartRange("hsmt_synthAB",5)
istat = cudaDeviceSynchronize()
!istat = cudaDeviceSynchronize()
! pretty ugly solution
block = 256
grid = lapw%nv(1)/(block*4) + 1
CALL synth_ab<<<grid,block>>>(grid,block,lapw%nv(1),lmax,iintsp,ab_size,gkrot_dev,fj,gj,c_ph_dev,ab)
CALL synth_ab<<<grid,block>>>(grid,block,lapw%nv(1),lmax,ab_size,gkrot_dev,&
fj(:,:,iintsp),gj(:,:,iintsp),c_ph_dev(:,iintsp),ab)
istat = cudaDeviceSynchronize()
!istat = cudaDeviceSynchronize()
!call nvtxEndRange
IF (PRESENT(abclo)) THEN
......
......@@ -8,6 +8,12 @@ MODULE m_hsmt_nonsph
IMPLICIT NONE
PRIVATE
PUBLIC hsmt_nonsph
INTERFACE priv_noMPI
module procedure priv_noMPI_cpu
#ifdef CPP_GPU
module procedure priv_noMPI_gpu
#endif
END INTERFACE
CONTAINS
SUBROUTINE hsmt_nonsph(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,td,fj,gj,hmat)
USE m_types
......@@ -25,10 +31,37 @@ CONTAINS
! .. Array Arguments ..
REAL,INTENT(IN) :: fj(:,0:,:),gj(:,0:,:)
CLASS(t_mat),INTENT(INOUT) ::hmat
#if defined CPP_GPU
REAL, ALLOCATABLE,DEVICE :: fj_dev(:,:,:), gj_dev(:,:,:)
COMPLEX,ALLOCATABLE,DEVICE :: h_loc_dev(:,:)
COMPLEX,ALLOCATABLE,DEVICE :: c_dev(:,:)
#endif
CALL timestart("non-spherical setup")
IF (mpi%n_size==1) THEN
#if defined (_CUDA)
CALL priv_noMPI_gpu(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,td,fj,gj,hmat)
#if defined CPP_GPU
ALLOCATE(fj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
ALLOCATE(gj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
fj_dev(1:,1:,1:)= fj(1:,0:,1:)
gj_dev(1:,1:,1:)= gj(1:,0:,1:)
ALLOCATE(h_loc_dev(size(td%h_loc,1),size(td%h_loc,2)))
h_loc_dev(1:,1:) = CONJG(td%h_loc(0:,0:,n,isp))
IF (hmat%l_real) THEN
IF (ANY(SHAPE(hmat%data_c)/=SHAPE(hmat%data_r))) THEN
DEALLOCATE(hmat%data_c)
ALLOCATE(hmat%data_c(SIZE(hmat%data_r,1),SIZE(hmat%data_r,2)))
ENDIF
hmat%data_c=0.0
ENDIF
ALLOCATE(c_dev(SIZE(hmat%data_c,1),SIZE(hmat%data_c,2)))
c_dev = hmat%data_c
CALL priv_noMPI(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,h_loc_dev,fj_dev,gj_dev,c_dev)
hmat%data_c = c_dev
IF (hmat%l_real) THEN
hmat%data_r=hmat%data_r+REAL(hmat%data_c)
ENDIF
#else
CALL priv_noMPI(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,td,fj,gj,hmat)
#endif
......@@ -38,9 +71,10 @@ CONTAINS
CALL timestop("non-spherical setup")
END SUBROUTINE hsmt_nonsph
#if defined (_CUDA)
SUBROUTINE priv_noMPI_gpu(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,td,fj,gj,hmat)
!Calculate overlap matrix
#if defined CPP_GPU
SUBROUTINE priv_noMPI_gpu(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,h_loc_dev,fj_dev,gj_dev,c_dev)
!Calculate overlap matrix, GPU version
!note that basically all matrices in the GPU version are conjugates of their cpu counterparts
USE m_hsmt_ab
USE m_constants, ONLY : fpi_const,tpi_const
USE m_types
......@@ -59,48 +93,27 @@ CONTAINS
TYPE(t_cell),INTENT(IN) :: cell
TYPE(t_atoms),INTENT(IN) :: atoms
TYPE(t_lapw),INTENT(IN) :: lapw
TYPE(t_tlmplm),INTENT(IN) :: td
COMPLEX, INTENT(IN),DEVICE :: h_loc_dev(:,:)
! ..
! .. Scalar Arguments ..
INTEGER, INTENT (IN) :: n,isp,iintsp,jintsp
COMPLEX,INTENT(in) :: chi
! ..
! .. Array Arguments ..
REAL,INTENT(IN) :: fj(:,0:,:),gj(:,0:,:)
CLASS(t_mat),INTENT(INOUT)::hmat
REAL, INTENT(IN), DEVICE :: fj_dev(:,:,:), gj_dev(:,:,:)
COMPLEX,INTENT(INOUT),DEVICE :: c_dev(:,:)
INTEGER:: nn,na,ab_size,l,ll,m
real :: rchi
COMPLEX,ALLOCATABLE,DEVICE :: c_dev(:,:), ab1_dev(:,:), ab_dev(:,:), ab2_dev(:,:)
COMPLEX,ALLOCATABLE,DEVICE :: h_loc_dev(:,:)
REAL, ALLOCATABLE,DEVICE :: fj_dev(:,:,:), gj_dev(:,:,:)
COMPLEX,ALLOCATABLE,DEVICE :: ab1_dev(:,:), ab_dev(:,:), ab2_dev(:,:)
integer :: i, j, istat
call nvtxStartRange("hsmt_nonsph",1)
ALLOCATE(h_loc_dev(size(td%h_loc,1),size(td%h_loc,2)))
ALLOCATE(ab1_dev(lapw%nv(jintsp),2*atoms%lmaxd*(atoms%lmaxd+2)+2))
ALLOCATE(ab_dev(MAXVAL(lapw%nv),2*atoms%lmaxd*(atoms%lmaxd+2)+2))
h_loc_dev(1:,1:) = CONJG(td%h_loc(0:,0:,n,isp)) !WORKAROUND, var_dev=CONJG(var_dev) does not work
ALLOCATE(fj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
ALLOCATE(gj_dev(MAXVAL(lapw%nv),atoms%lmaxd+1,MERGE(2,1,noco%l_noco)))
fj_dev(1:,1:,1:)= fj(1:,0:,1:)
gj_dev(1:,1:,1:)= gj(1:,0:,1:)
!note that basically all matrices in the GPU version are conjugates of their
!cpu counterparts
IF (iintsp.NE.jintsp) ALLOCATE(ab2_dev(lapw%nv(iintsp),2*atoms%lmaxd*(atoms%lmaxd+2)+2))
IF (hmat%l_real) THEN
IF (ANY(SHAPE(hmat%data_c)/=SHAPE(hmat%data_r))) THEN
DEALLOCATE(hmat%data_c)
ALLOCATE(hmat%data_c(SIZE(hmat%data_r,1),SIZE(hmat%data_r,2)))
ENDIF
hmat%data_c=0.0
ENDIF
ALLOCATE(c_dev(SIZE(hmat%data_c,1),SIZE(hmat%data_c,2)))
c_dev = hmat%data_c
DO nn = 1,atoms%neq(n)
na = SUM(atoms%neq(:n-1))+nn
IF ((atoms%invsat(na)==0) .OR. (atoms%invsat(na)==1)) THEN
......@@ -121,31 +134,26 @@ CONTAINS
!Second set of ab is needed
CALL hsmt_ab(sym,atoms,noco,isp,iintsp,n,na,cell,lapw,fj_dev,gj_dev,ab_dev,ab_size,.TRUE.)
CALL zgemm("N","N",lapw%nv(iintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab_dev,SIZE(ab_dev,1),&
h_loc_dev,SIZE(td%h_loc,1),CMPLX(0.,0.),ab2_dev,SIZE(ab2_dev,1))
h_loc_dev,SIZE(h_loc_dev,1),CMPLX(0.,0.),ab2_dev,SIZE(ab2_dev,1))
!Multiply for Hamiltonian
!$cuf kernel do<<<*,256>>>
do i = 1,size(ab1_dev,2)
do j = 1,size(ab1_dev,1)
ab1_dev(j,i) = conjg(ab1_dev(j,i))
enddo
enddo
CALL zgemm("N","T",lapw%nv(iintsp),lapw%nv(jintsp),ab_size,chi,ab2_dev,SIZE(ab2_dev,1),&
ab1_dev,SIZE(ab1_dev,1),CMPLX(1.0,0.0),c_dev,SIZE(c_dev,1))
ENDIF
ENDIF
END DO
hmat%data_c = c_dev
IF (hmat%l_real) THEN
hmat%data_r=hmat%data_r+REAL(hmat%data_c)
ENDIF
call nvtxEndRange
END SUBROUTINE priv_noMPI_gpu
#endif
SUBROUTINE priv_noMPI(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,td,fj,gj,hmat)
SUBROUTINE priv_noMPI_cpu(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,td,fj,gj,hmat)
!Calculate overlap matrix
USE m_hsmt_ab
USE m_constants, ONLY : fpi_const,tpi_const
......@@ -193,16 +201,19 @@ CONTAINS
CALL hsmt_ab(sym,atoms,noco,isp,jintsp,n,na,cell,lapw,fj,gj,ab,ab_size,.TRUE.)
!Calculate Hamiltonian
CALL zgemm("N","N",lapw%nv(jintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab,SIZE(ab,1),td%h_loc(0:,0:,n,isp),SIZE(td%h_loc,1),CMPLX(0.,0.),ab1,SIZE(ab1,1))
CALL zgemm("N","N",lapw%nv(jintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab,SIZE(ab,1),&
td%h_loc(0:,0:,n,isp),SIZE(td%h_loc,1),CMPLX(0.,0.),ab1,SIZE(ab1,1))
!ab1=MATMUL(ab(:lapw%nv(iintsp),:ab_size),td%h_loc(:ab_size,:ab_size,n,isp))
IF (iintsp==jintsp) THEN
CALL ZHERK("U","N",lapw%nv(iintsp),ab_size,Rchi,CONJG(ab1),SIZE(ab1,1),1.0,hmat%data_c,SIZE(hmat%data_c,1))
ELSE !here the l_ss off-diagonal part starts
!Second set of ab is needed
CALL hsmt_ab(sym,atoms,noco,isp,iintsp,n,na,cell,lapw,fj,gj,ab,ab_size,.TRUE.)
CALL zgemm("N","N",lapw%nv(iintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab,SIZE(ab,1),td%h_loc(0:,0:,n,isp),SIZE(td%h_loc,1),CMPLX(0.,0.),ab2,SIZE(ab2,1))
CALL zgemm("N","N",lapw%nv(iintsp),ab_size,ab_size,CMPLX(1.0,0.0),ab,SIZE(ab,1),&
td%h_loc(0:,0:,n,isp),SIZE(td%h_loc,1),CMPLX(0.,0.),ab2,SIZE(ab2,1))
!Multiply for Hamiltonian
CALL zgemm("N","T",lapw%nv(iintsp),lapw%nv(jintsp),ab_size,chi,conjg(ab2),SIZE(ab2,1),ab1,SIZE(ab1,1),CMPLX(1.0,0.0),hmat%data_c,SIZE(hmat%data_c,1))
CALL zgemm("N","T",lapw%nv(iintsp),lapw%nv(jintsp),ab_size,chi,conjg(ab2),SIZE(ab2,1),&
ab1,SIZE(ab1,1),CMPLX(1.0,0.0),hmat%data_c,SIZE(hmat%data_c,1))
ENDIF
ENDIF
END DO
......@@ -211,7 +222,7 @@ CONTAINS
hmat%data_r=hmat%data_r+REAL(hmat%data_c)
ENDIF
END SUBROUTINE priv_noMPI
END SUBROUTINE priv_noMPI_cpu
SUBROUTINE priv_MPI(n,mpi,sym,atoms,isp,iintsp,jintsp,chi,noco,cell,lapw,td,fj,gj,hmat)
......
......@@ -5,12 +5,12 @@
REAL, ALLOCATABLE, SAVE :: ynorm(:)
INTEGER, SAVE :: lmaxd = -1 ! initial value
PUBLIC ylm4,ylmnorm_init
#ifdef _CUDA
#ifdef CPP_GPU
PUBLIC ylm4_dev
#endif
CONTAINS
#ifdef _CUDA
#ifdef CPP_GPU
ATTRIBUTES(device) SUBROUTINE ylm4_dev(lmax,v,ylm)
!************************************************************
! generate the spherical harmonics for the vector v
......@@ -24,6 +24,9 @@
! the normalization is an internal subroutine and hence
! can only be called from here. also, no need to dimension
! arrays for ynorm, done dynamically. mw 1999
!
! GPU version added
! U.Alekseeva Oktober 2018
!************************************************************
! USE m_juDFT
INTEGER, VALUE, INTENT (IN) :: lmax
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment