Commit 42a10afe authored by Daniel Wortmann's avatar Daniel Wortmann

Added some GPU code. In particular the t_gpumat datatype and the cusolver diagonalization

parent 15555b9d
......@@ -2,6 +2,9 @@ include_directories(include)
set(c_filesInpgen io/xml/inputSchema.h io/xml/dropInputSchema.c)
set(c_filesFleur io/xml/inputSchema.h io/xml/dropInputSchema.c io/xml/xmlInterfaceWrapper.c)
if(FLEUR_USE_GPU)
set(c_filesFleur ${c_filesFleur} diagonalization/cusolver.c)
endif()
set(fleur_F90 main/fleur.F90)
set(fleur_F77 "")
......
......@@ -8,6 +8,8 @@ if (CLI_FLEUR_USE_GPU)
message("Using cuda8")
elseif(${CLI_FLEUR_USE_GPU} MATCHES "cuda9")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=cuda9.0,cc60 -Mcuda=rdc -Mcudalib=cublas -lnvToolsExt ")
elseif(${CLI_FLEUR_USE_GPU} MATCHES "emu")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=emu -Mcudalib=cublas -Minfo=accel ")
endif()
set(FLEUR_MPI_DEFINITIONS ${FLEUR_MPI_DEFINITIONS} "CPP_GPU")
set(FLEUR_DEFINITIONS ${FLEUR_DEFINITIONS} "CPP_GPU")
......
......@@ -17,3 +17,8 @@ else()
diagonalization/elpa.F90
)
endif()
if (FLEUR_USE_GPU)
set(fleur_F90 ${fleur_F90}
diagonalization/cusolver_diag.F90
)
endif()
!--------------------------------------------------------------------------------
! Copyright (c) 2016 Peter Grünberg Institut, Forschungszentrum Jülich, Germany
! This file is part of FLEUR and available as free software under the conditions
! of the MIT license as expressed in the LICENSE file in more detail.
!--------------------------------------------------------------------------------
MODULE m_cusolver
use m_juDFT
INTEGER,PARAMETER :: NGPU_CONST=1
INTEGER,PARAMETER :: CUSOLVER_EIG_TYPE_1=1
CHARACTER,PARAMETER :: CUSOLVER_EIG_MODE_VECTOR='V'
!**********************************************************
! Solve the generalized eigenvalue problem
! using the MAGMA library for multiple GPUs
!**********************************************************
CONTAINS
SUBROUTINE cusolver_diag(nsize,eig,ne,a_r,b_r,z_r,a_c,b_c,z_c)
use m_packed_to_full
#ifdef CPP_MAGMA
!use magma
#endif
#include"cpp_double.h"
IMPLICIT NONE
! ... Arguments ...
INTEGER, INTENT (IN) :: nsize
REAL, INTENT(OUT) :: eig(:)
INTEGER, INTENT(INOUT) :: ne
REAL, OPTIONAL,ALLOCATABLE, INTENT (INOUT) :: a_r(:),b_r(:)
REAL, OPTIONAL,ALLOCATABLE, INTENT (INOUT) :: z_r(:,:)
COMPLEX, OPTIONAL,ALLOCATABLE, INTENT (INOUT) :: a_c(:),b_c(:)
COMPLEX, OPTIONAL,ALLOCATABLE, INTENT (INOUT) :: z_c(:,:)
#ifdef CPP_MAGMA
! ... Local Variables ..
INTEGER iind,ind1,ind2,info,lwork,liwork,lrwork,err,i,mout(1)
REAL eigTemp(nsize)
LOGICAL:: initialized=.false.
REAL, ALLOCATABLE :: rwork(:)
INTEGER, ALLOCATABLE :: iwork(:)
REAL, ALLOCATABLE :: largea_r(:,:),largeb_r(:,:)
COMPLEX, ALLOCATABLE :: largea_c(:,:),largeb_c(:,:)
COMPLEX,ALLOCATABLE :: work(:)
INTEGER :: ifail(nsize)
LOGICAL :: l_real
l_real=present(a_r)
!**********************************
!expand from packed to full storage
!**********************************
!hamiltonian
if (l_real) THEN
call packed_to_full(nsize,a_r,largea_r)
call packed_to_full(nsize,b_r,largeb_r)
!deallocate(a_r,b_r)
ELSE
call packed_to_full(nsize,a_c,largea_c)
call packed_to_full(nsize,b_c,largeb_c)
!deallocate(a_c,b_c)
Endif
if (l_real) call juDFT_error("REAL diagonalization not implemented in magma.F90")
err=cusolverDnCreate(handle)
!$acc data copyin(largea_c,largeb_c)
!$acc host_data use_device(largea_c,largeb_c,eigtemp,work)
err=cusolverDnZhegvd( handle, CUSOLVER_EIG_TYPE_1, CUSOLVER_EIG_MODE_VECTOR, cublas_Fill_Mode_upper, nsize, largea_c,nsize,largeb_c,nsize, eigtemp, work, lwork, devInfo);
!$acc end host_data
!$acc end data
err=cusolverDnDestroy(handle)
DO i = 1, ne
eig(i) = eigTemp(i)
z_c(:nsize,i)=largea_c(:nsize,i)
END DO
!call judft_error("Eigenvectors are not calculated in MAGMA")
#endif
END SUBROUTINE magma_diag
END MODULE m_magma
module priv_cusolver_interface
contains
integer(c_int) function cusolverDnCreate(cusolver_Hndl) bind(C,name="cusolverDnCreate")
use iso_c_binding
implicit none
type(c_ptr)::cusolver_Hndl
end function cusolverDnCreate
integer(c_int) function cusolverDnDestroy(cusolver_Hndl) bind(C,name="cusolverDnDestroy")
use iso_c_binding
implicit none
type(c_ptr),value::cusolver_Hndl
end function cusolverDnDestroy
integer(c_int) function cusolverDnZhegvd( handle, CUSOLVER_EIG_TYPE, CUSOLVER_EIG_MODE, cublas_Fill_Mode, nsize, largea_c,nsize,largeb_c,nsize, eigtemp, work, lwork, devInfo) bind(C,name="cusolverDnZhegvd")
use iso_c_binding
implicit none
type(c_ptr),value :: handle
integer(c_int),value :: cusolver_eig_type
character(c_char),value:: cusolver_eig_mode
integer(c_int),value :: cublas_fill_mode
integer(c_int),value ::n
type(c_ptr),value::d_A
integer(c_int),value::lda
type(c_ptr),value::Lwork
end function cusolverDnZhegvd
end module priv_cusolver_interface
/*
!--------------------------------------------------------------------------------
! Copyright (c) 2016 Peter Grünberg Institut, Forschungszentrum Jülich, Germany
! This file is part of FLEUR and available as free software under the conditions
! of the MIT license as expressed in the LICENSE file in more detail.
!--------------------------------------------------------------------------------
*/
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
/* Interface for the cusolverDN routines for solving a generalized Eigenvalue problem
Code adopted from the example in the documentation
*/
void cusolver_complex(cuDoubleComplex *H,cuDoubleComplex *S,int n,int ne,double tol,int max_sweeps,double* eig,cuDoubleComplex *z){
cusolverDnHandle_t cusolverH = NULL;
cudaStream_t stream = NULL;
syevjInfo_t syevj_params = NULL;
cusolverStatus_t status = CUSOLVER_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
cudaError_t cudaStat3 = cudaSuccess;
double *d_W = NULL; /* eigenvalues on device*/
int *d_info = NULL; /* error info */
int lwork = 0; /* size of workspace */
cuDoubleComplex *d_work = NULL; /* device workspace for syevj */
int info = 0; /* host copy of error info */
/* configuration of syevj */
const cusolverEigType_t itype = CUSOLVER_EIG_TYPE_1; //Solve H psi=S lambda psi
const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; // compute eigenvectors.
const cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
/* numerical results of syevj */
double residual = 0;
int executed_sweeps = 0;
/* step 1: create cusolver handle, bind a stream */
status = cusolverDnCreate(&cusolverH);
assert(CUSOLVER_STATUS_SUCCESS == status);
cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
assert(cudaSuccess == cudaStat1);
status = cusolverDnSetStream(cusolverH, stream);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* step 2: configuration of syevj */
status = cusolverDnCreateSyevjInfo(&syevj_params);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* default value of tolerance is machine zero */
status = cusolverDnXsyevjSetTolerance(&svj_params,tol);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* default value of max. sweeps is 100 */
status = cusolverDnXsyevjSetMaxSweeps(&syevj_params,max_sweeps);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* step 3: copy A to device */
cudaStat2 = cudaMalloc ((void**)&d_W, sizeof(cuDoubleComplex) * n);
cudaStat3 = cudaMalloc ((void**)&d_info, sizeof(int));
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
/* step 4: query working space of sygvj */
status = cusolverDnZhegvj_bufferSize(cusolverH,itype,jobz,uplo,n,H,n,S,n,d_W,&lwork,syevj_params);
assert(CUSOLVER_STATUS_SUCCESS == status);
cudaStat1 = cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex)*lwork);
assert(cudaSuccess == cudaStat1);
/* step 5: compute eigen-pair */
status = cusolverDnZhegvj(cusolverH,itype,jobz,uplo,n,H,n,S,n,d_W,d_work,lwork,d_info,syevj_params);
cudaStat1 = cudaDeviceSynchronize();
assert(CUSOLVER_STATUS_SUCCESS == status);
assert(cudaSuccess == cudaStat1);
cudaStat1 = cudaMemcpy(eig, d_W, sizeof(double)*ne, cudaMemcpyDeviceToHost);
cudaStat2 = cudaMemcpy(z, H, sizeof(cuDoubleComplex)*n*ne, cudaMemcpyDeviceToHost);
cudaStat3 = cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
if ( 0 == info ){
printf("sygvj converges \n");
}else if ( 0 > info ){
printf("%d-th parameter is wrong \n", -info);
exit(1);
}else{
printf("WARNING: info = %d : sygvj does not converge \n", info );
}
status = cusolverDnXsyevjGetSweeps(cusolverH,syevj_params,&executed_sweeps);
assert(CUSOLVER_STATUS_SUCCESS == status);
status = cusolverDnXsyevjGetResidual(cusolverH,syevj_params,&residual);
assert(CUSOLVER_STATUS_SUCCESS == status);
printf("residual |A - V*W*V**H|_F = %E \n", residual );
printf("number of executed sweeps = %d \n", executed_sweeps );
/* free resources */
if (d_W ) cudaFree(d_W);
if (d_info ) cudaFree(d_info);
if (d_work ) cudaFree(d_work);
if (cusolverH ) cusolverDnDestroy(cusolverH);
if (stream ) cudaStreamDestroy(stream);
if (syevj_params) cusolverDnDestroySyevjInfo(syevj_params);
// cudaDeviceReset();
return ;
}
void cusolver_real(double *H,double *S,int n,int ne,double tol,int max_sweeps,double* eig,double *z){
cusolverDnHandle_t cusolverH = NULL;
cudaStream_t stream = NULL;
syevjInfo_t syevj_params = NULL;
cusolverStatus_t status = CUSOLVER_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
cudaError_t cudaStat3 = cudaSuccess;
double *d_W = NULL; /* eigenvalues on device*/
int *d_info = NULL; /* error info */
int lwork = 0; /* size of workspace */
double *d_work = NULL; /* device workspace for syevj */
int info = 0; /* host copy of error info */
/* configuration of syevj */
const cusolverEigType_t itype = CUSOLVER_EIG_TYPE_1; //Solve H psi=S lambda psi
const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; // compute eigenvectors.
const cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
/* numerical results of syevj */
double residual = 0;
int executed_sweeps = 0;
/* step 1: create cusolver handle, bind a stream */
status = cusolverDnCreate(&cusolverH);
assert(CUSOLVER_STATUS_SUCCESS == status);
cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
assert(cudaSuccess == cudaStat1);
status = cusolverDnSetStream(cusolverH, stream);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* step 2: configuration of syevj */
status = cusolverDnCreateSyevjInfo(&syevj_params);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* default value of tolerance is machine zero */
status = cusolverDnXsyevjSetTolerance(&svj_params,tol);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* default value of max. sweeps is 100 */
status = cusolverDnXsyevjSetMaxSweeps(&syevj_params,max_sweeps);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* step 3: copy A to device */
cudaStat2 = cudaMalloc ((void**)&d_W, sizeof(double) * n);
cudaStat3 = cudaMalloc ((void**)&d_info, sizeof(int));
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
/* step 4: query working space of sygvj */
status = cusolverDnDsygvj_bufferSize(cusolverH,itype,jobz,uplo,n,H,n,S,n,d_W,&lwork,syevj_params);
assert(CUSOLVER_STATUS_SUCCESS == status);
cudaStat1 = cudaMalloc((void**)&d_work, sizeof(double)*lwork);
assert(cudaSuccess == cudaStat1);
/* step 5: compute eigen-pair */
status = cusolverDnDsygvj(cusolverH,itype,jobz,uplo,n,H,n,S,n,d_W,d_work,lwork,d_info,syevj_params);
cudaStat1 = cudaDeviceSynchronize();
assert(CUSOLVER_STATUS_SUCCESS == status);
assert(cudaSuccess == cudaStat1);
cudaStat1 = cudaMemcpy(eig, d_W, sizeof(double)*ne, cudaMemcpyDeviceToHost);
cudaStat2 = cudaMemcpy(z, H, sizeof(double)*n*ne, cudaMemcpyDeviceToHost);
cudaStat3 = cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
if ( 0 == info ){
printf("sygvj converges \n");
}else if ( 0 > info ){
printf("%d-th parameter is wrong \n", -info);
exit(1);
}else{
printf("WARNING: info = %d : sygvj does not converge \n", info );
}
status = cusolverDnXsyevjGetSweeps(cusolverH,syevj_params,&executed_sweeps);
assert(CUSOLVER_STATUS_SUCCESS == status);
status = cusolverDnXsyevjGetResidual(cusolverH,syevj_params,&residual);
assert(CUSOLVER_STATUS_SUCCESS == status);
printf("residual |A - V*W*V**H|_F = %E \n", residual );
printf("number of executed sweeps = %d \n", executed_sweeps );
/* free resources */
if (d_W ) cudaFree(d_W);
if (d_info ) cudaFree(d_info);
if (d_work ) cudaFree(d_work);
if (cusolverH ) cusolverDnDestroy(cusolverH);
if (stream ) cudaStreamDestroy(stream);
if (syevj_params) cusolverDnDestroySyevjInfo(syevj_params);
// cudaDeviceReset();
return ;
}
!--------------------------------------------------------------------------------
! Copyright (c) 2016 Peter Grünberg Institut, Forschungszentrum Jülich, Germany
! This file is part of FLEUR and available as free software under the conditions
! of the MIT license as expressed in the LICENSE file in more detail.
!--------------------------------------------------------------------------------
MODULE m_cusolver_diag
USE m_types
USE m_judft
#ifdef CPP_GPU
USE m_types_gpumat
#endif
IMPLICIT NONE
PRIVATE
#ifdef CPP_GPU
INTERFACE
SUBROUTINE cusolver_real(H,S,n,ne,tol,max_sweeps,eig,z) BIND(C,name="cusolver_real")
USE iso_c_binding
IMPLICIT NONE
REAL(c_double) :: H(*),S(*)
INTEGER(c_int),VALUE :: n,ne,max_sweeps
REAL(c_double),VALUE :: tol
REAL(c_double) :: eig(*),z(*)
END SUBROUTINE cusolver_real
END INTERFACE
INTERFACE
SUBROUTINE cusolver_complex(H,S,n,ne,tol,max_sweeps,eig,z) BIND(C,name="cusolver_real")
USE iso_c_binding
IMPLICIT NONE
COMPLEX(c_double) :: H(*),S(*)
INTEGER(c_int),VALUE :: n,ne,max_sweeps
REAL(c_double),VALUE :: tol
REAL(c_double) :: eig(*)
COMPLEX(c_double) :: z(*)
END SUBROUTINE cusolver_complex
END INTERFACE
#endif
PUBLIC cusolver_diag
CONTAINS
SUBROUTINE cusolver_diag(hmat,smat,ne,eig,zmat)
!Simple driver to solve Generalized Eigenvalue Problem using CuSolverDN
IMPLICIT NONE
CLASS(t_mat),INTENT(INOUT) :: hmat,smat
INTEGER,INTENT(INOUT) :: ne
CLASS(t_mat),ALLOCATABLE,INTENT(OUT) :: zmat
REAL,INTENT(OUT) :: eig(:)
#ifdef CPP_GPU
INTEGER,PARAMETER:: max_sweeps=15
REAL :: tol=1E-7
TYPE(t_gpumat)::smat_gpu,hmat_gpu
ALLOCATE(t_mat::zmat)
CALL zmat%alloc(hmat%l_real,hmat%matsize1,ne)
SELECT TYPE(hmat)
TYPE IS (t_mat)
CALL hmat_gpu%init_from(hmat)
CALL smat_gpu%init_from(smat)
IF (hmat%l_real) THEN
CALL cusolver_real(hmat_gpu%gpu_r,smat_gpu%gpu_r,smat%matsize1,ne,tol,max_sweeps,eig,z%data_r)
ELSE
CALL cusolver_complex(hmat_gpu%gpu_c,smat_gpu%gpu_c,smat%matsize1,ne,tol,max_sweeps,eig,z%data_c)
END IF
TYPE IS (t_gpumat)
IF (hmat%l_real) THEN
CALL cusolver_real(hmat%gpu_r,smat%gpu_r,smat%matsize1,ne,tol,max_sweeps,eig,z%data_r)
ELSE
CALL cusolver_complex(hmat%gpu_c,smat%gpu_c,smat%matsize1,ne,tol,max_sweeps,eig,z%data_c)
END IF
END SELECT
#endif
END SUBROUTINE cusolver_diag
END MODULE m_cusolver_diag
......@@ -54,7 +54,11 @@ CONTAINS
!In noco case we need 4-matrices for each spin channel
nspins=MERGE(2,1,noco%l_noco)
IF (mpi%n_size==1) THEN
ALLOCATE(t_mat::smat(nspins,nspins),hmat(nspins,nspins))
IF (judft_was_argument("-gpu")) THEN
ALLOCATE(t_gpumat::smat(nspins,nspins),hmat(nspins,nspins))
ELSE
ALLOCATE(t_mat::smat(nspins,nspins),hmat(nspins,nspins))
ENDIF
ELSE
ALLOCATE(t_mpimat::smat(nspins,nspins),hmat(nspins,nspins))
ENDIF
......@@ -85,14 +89,10 @@ CONTAINS
CALL timestop("Vacuum part")
ENDIF
!Now copy the data into final matrix
IF (mpi%n_size==1) THEN
ALLOCATE(t_mat::smat_final,hmat_final)
ELSE
ALLOCATE(t_mpimat::smat_final,hmat_final)
ENDIF
! Collect the four noco parts into a single matrix
! In collinear case only a copy is done
! In the parallel case also a redistribution happens
ALLOCATE(smat_final,hmat_final,source=smat(1,1))
CALL eigen_redist_matrix(mpi,lapw,atoms,smat,smat_final)
CALL eigen_redist_matrix(mpi,lapw,atoms,hmat,hmat_final,smat_final)
......
......@@ -13,7 +13,7 @@ MODULE m_fleur_arguments
CHARACTER(len=200) :: values
END TYPE t_fleur_param
INTEGER,PARAMETER:: no_params=21
INTEGER,PARAMETER:: no_params=22
TYPE(t_fleur_param) :: fleur_param(no_params)=(/&
!Input options
t_fleur_param(0,"-toXML","Convert an old 'inp' file into the new XML format",""),&
......@@ -37,6 +37,9 @@ MODULE m_fleur_arguments
#endif
#ifdef CPP_MAGMA
//",magma"&
#endif
#ifdef CPP_GPU
//",cusolver"&
#endif
),&
t_fleur_param(1,"-eig","Method for storing the eigenvectors","mem,da"&
......@@ -61,6 +64,12 @@ MODULE m_fleur_arguments
,t_fleur_param(0,"-last_extra","Generate an additional file cdn_last.hdf that contains only the last density","")&
,t_fleur_param(2,"-sd","use starting density N, where N is the index of the density according to -info","")&
,t_fleur_param(1,"-delden","delete densities (either an index N, a range N-M or the keyword 'allbutlast' should be given)","")&
#ifdef CPP_GPU
!GPU paramter
,t_fleur_param(0,"-gpu","Use GPU for computing","")&
#else
,t_fleur_param(0,"","","")&
#endif
/)
......
......@@ -50,3 +50,8 @@ types/types_dos.f90
types/types_denCoeffsOffdiag.f90
types/types_force.f90
)
if (FLEUR_USE_GPU)
set(fleur_F90 ${fleur_F90}
types/types_gpumat.F90
)
endif()
This diff is collapsed.
......@@ -12,8 +12,13 @@ MODULE m_types_mat
LOGICAL :: l_real !>Store either real or complex data
INTEGER :: matsize1=-1 !> matsize1=size(data_?,1),i.e. no of rows
INTEGER :: matsize2=-1 !> matsize2=size(data_?,2),i.e. no of columns
#ifdef CPP_GPU
REAL,MANAGED,ALLOCATABLE :: data_r(:,:)
COMPLEX,MANAGED,ALLOCATABLE :: data_c(:,:)
#else
REAL,ALLOCATABLE :: data_r(:,:)
COMPLEX,ALLOCATABLE :: data_c(:,:)
#endif
CONTAINS
PROCEDURE :: alloc => t_mat_alloc !> allocate the data-arrays
PROCEDURE :: multiply=>t_mat_multiply !> do a matrix-matrix multiply
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment