Commit 902547d4 authored by S.Rost

abcof output without rotation is working merge

parents bbcc63ec 7f6f6328
......@@ -60,7 +60,7 @@ kpoints/tetcon.f kpoints/kvecon.f init/boxdim.f global/radsra.f math/intgr.F glo
set(inpgen_F90 ${inpgen_F90} global/constants.f90 io/xsf_io.f90
eigen/orthoglo.F90 juDFT/usage_data.F90 math/ylm4.F90
global/enpara.f90 global/chkmt.f90 inpgen/inpgen.f90 inpgen/set_inp.f90 inpgen/inpgen_help.f90 io/rw_inp.f90 juDFT/juDFT.F90 global/find_enpara.f90
global/chkmt.f90 inpgen/inpgen.f90 inpgen/set_inp.f90 inpgen/inpgen_help.f90 io/rw_inp.f90 juDFT/juDFT.F90 global/find_enpara.f90
inpgen/closure.f90 inpgen/inpgen_arguments.F90
juDFT/info.F90 juDFT/stop.F90 juDFT/args.F90 juDFT/time.F90 juDFT/init.F90 juDFT/sysinfo.F90 io/w_inpXML.f90 kpoints/julia.f90 global/utility.F90
init/compile_descr.F90 kpoints/kpoints.f90 io/xmlOutput.F90 kpoints/brzone2.f90 cdn/slab_dim.f90 cdn/slabgeom.f90 dos/nstm3.f90 cdn/int_21.f90
......
......@@ -37,7 +37,7 @@ else()
endif()
message("${Green} Compile GPU version : ${CReset} ${FLEUR_USE_GPU}")
if (FLEUR_USE_GPU)
message("${Green} CuSolver Library found : ${CReset} ${FLEUR_USE_CUSOLVER}")
message("${Green} CuSolver Library found : ${CReset} ${FLEUR_USE_CUSOLVER}")
endif()
message("\n")
message("${Green}Compile serial version : ${CReset} ${FLEUR_USE_SERIAL}")
......
......@@ -11,14 +11,14 @@ if (CLI_FLEUR_USE_GPU)
elseif(${CLI_FLEUR_USE_GPU} MATCHES "cuda9.1")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=cuda9.1,cc60 -Mcuda=rdc -Mcudalib=cublas")
elseif(${CLI_FLEUR_USE_GPU} MATCHES "nvtx")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=cuda9.0,cc60 -Mcuda=rdc -Mcudalib=cublas -lnvToolsExt ")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=cuda9.1,cc60 -Mcuda=rdc -Mcudalib=cublas -lnvToolsExt ")
elseif(${CLI_FLEUR_USE_GPU} MATCHES "emu")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcuda=emu -Mcudalib=cublas -Minfo=accel ")
endif()
set(FLEUR_MPI_DEFINITIONS ${FLEUR_MPI_DEFINITIONS} "CPP_GPU" "CPP_MANAGED=,MANAGED")
set(FLEUR_DEFINITIONS ${FLEUR_DEFINITIONS} "CPP_GPU" "CPP_MANAGED=,MANAGED")
#Now check for cusolverDN library
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Mcuda")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Mcuda -ta=tesla,cuda9.1 ")
try_compile(FLEUR_USE_CUSOLVER ${CMAKE_BINARY_DIR} ${CMAKE_SOURCE_DIR}/cmake/tests/test_cusolver.c
LINK_LIBRARIES "-lcusolver"
)
......
......@@ -56,19 +56,21 @@ void cusolver_complex(cuDoubleComplex *H,cuDoubleComplex *S,int n,int ne,double
assert(CUSOLVER_STATUS_SUCCESS == status);
/* default value of tolerance is machine zero */
status = cusolverDnXsyevjSetTolerance(&syevj_params,tol);
status = cusolverDnXsyevjSetTolerance(syevj_params,tol);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* default value of max. sweeps is 100 */
status = cusolverDnXsyevjSetMaxSweeps(&syevj_params,max_sweeps);
status = cusolverDnXsyevjSetMaxSweeps(syevj_params,max_sweeps);
assert(CUSOLVER_STATUS_SUCCESS == status);
printf("Allocate data \n");
/* step 3: copy A to device */
cudaStat2 = cudaMalloc ((void**)&d_W, sizeof(cuDoubleComplex) * n);
cudaStat3 = cudaMalloc ((void**)&d_info, sizeof(int));
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
printf("query working space \n");
/* step 4: query working space of sygvj */
status = cusolverDnZhegvj_bufferSize(cusolverH,itype,jobz,uplo,n,H,n,S,n,d_W,&lwork,syevj_params);
assert(CUSOLVER_STATUS_SUCCESS == status);
......@@ -76,6 +78,7 @@ void cusolver_complex(cuDoubleComplex *H,cuDoubleComplex *S,int n,int ne,double
cudaStat1 = cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex)*lwork);
assert(cudaSuccess == cudaStat1);
printf("compute eigen-pair \n");
/* step 5: compute eigen-pair */
status = cusolverDnZhegvj(cusolverH,itype,jobz,uplo,n,H,n,S,n,d_W,d_work,lwork,d_info,syevj_params);
cudaStat1 = cudaDeviceSynchronize();
......@@ -163,26 +166,26 @@ void cusolver_real(double *H,double *S,int n,int ne,double tol,int max_sweeps,do
assert(CUSOLVER_STATUS_SUCCESS == status);
/* default value of tolerance is machine zero */
status = cusolverDnXsyevjSetTolerance(&syevj_params,tol);
status = cusolverDnXsyevjSetTolerance(syevj_params,tol);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* default value of max. sweeps is 100 */
status = cusolverDnXsyevjSetMaxSweeps(&syevj_params,max_sweeps);
status = cusolverDnXsyevjSetMaxSweeps(syevj_params,max_sweeps);
assert(CUSOLVER_STATUS_SUCCESS == status);
printf("Allocate data \n");
/* step 3: copy A to device */
cudaStat2 = cudaMalloc ((void**)&d_W, sizeof(double) * n);
cudaStat3 = cudaMalloc ((void**)&d_info, sizeof(int));
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
printf("query working space \n");
/* step 4: query working space of sygvj */
status = cusolverDnDsygvj_bufferSize(cusolverH,itype,jobz,uplo,n,H,n,S,n,d_W,&lwork,syevj_params);
assert(CUSOLVER_STATUS_SUCCESS == status);
cudaStat1 = cudaMalloc((void**)&d_work, sizeof(double)*lwork);
assert(cudaSuccess == cudaStat1);
printf("compute eigen-pair \n");
/* step 5: compute eigen-pair */
status = cusolverDnDsygvj(cusolverH,itype,jobz,uplo,n,H,n,S,n,d_W,d_work,lwork,d_info,syevj_params);
cudaStat1 = cudaDeviceSynchronize();
......@@ -219,9 +222,9 @@ void cusolver_real(double *H,double *S,int n,int ne,double tol,int max_sweeps,do
if (d_info ) cudaFree(d_info);
if (d_work ) cudaFree(d_work);
if (cusolverH ) cusolverDnDestroy(cusolverH);
if (stream ) cudaStreamDestroy(stream);
if (syevj_params) cusolverDnDestroySyevjInfo(syevj_params);
if (stream ) cudaStreamDestroy(stream);
if (cusolverH ) cusolverDnDestroy(cusolverH);
// cudaDeviceReset();
......
......@@ -19,22 +19,19 @@ CONTAINS
#ifdef CPP_GPU
ATTRIBUTES(global) SUBROUTINE synth_ab(grid,block,n,lmax,ab_size,gkrot_dev,fj,gj,c_ph,ab)
ATTRIBUTES(global) SUBROUTINE synth_ab(loop_size,n,lmax,ab_size,gkrot_dev,fj,gj,c_ph,ab)
USE m_ylm
INTEGER, VALUE, INTENT(IN) :: grid, block, n, lmax, ab_size
INTEGER, VALUE, INTENT(IN) :: loop_size, n, lmax, ab_size
REAL, DEVICE, INTENT(IN) :: gkrot_dev(:,:),fj(:,:),gj(:,:)
COMPLEX,DEVICE, INTENT(IN) :: c_ph(:)
COMPLEX,DEVICE, INTENT (OUT) :: ab(:,:)
COMPLEX,ALLOCATABLE :: ylm(:)
INTEGER :: k,l,ll1,m
INTEGER :: loop_start, loop_end, i, loop_size
INTEGER :: k,l,ll1,m,i
INTEGER :: loop_start, loop_end
ALLOCATE(ylm((lmax+1)**2))
k = (blockidx%x-1)*blockdim%x + threadidx%x
loop_size = max(n/(grid*block),1)
if (loop_size * grid*block < n) loop_size = loop_size + 1
loop_start = (k-1) * loop_size + 1
loop_end = loop_start + loop_size - 1
if (loop_end > n ) loop_end = n
......@@ -90,7 +87,7 @@ CONTAINS
COMPLEX,ALLOCATABLE,DEVICE :: c_ph_dev(:,:)
REAL, ALLOCATABLE,DEVICE :: gkrot_dev(:,:)
INTEGER :: grid, block
INTEGER :: grid, block, loop_size
INTEGER :: istat
call nvtxStartRange("hsmt_ab",3)
......@@ -129,13 +126,13 @@ CONTAINS
!--> synthesize the complex conjugates of a and b
! pretty ugly solution
block = 256
grid = lapw%nv(1)/(block*4) + 1
CALL synth_ab<<<grid,block>>>(grid,block,lapw%nv(1),lmax,ab_size,gkrot_dev,&
grid = 30 ! number of blocks in the grid
block = 32 ! number of threads in a block
loop_size = max(lapw%nv(1)/(grid*block),1) !number of iterations performed by each thread
if (loop_size * grid*block < lapw%nv(1)) loop_size = loop_size + 1
CALL synth_ab<<<grid,block>>>(loop_size,lapw%nv(1),lmax,ab_size,gkrot_dev,&
fj(:,:,iintsp),gj(:,:,iintsp),c_ph_dev(:,iintsp),ab)
IF (PRESENT(abclo)) THEN
print*, "Ooooops, TODO in hsmt_ab"
!DO k = 1,lapw%nv(1)
......
This diff is collapsed.
This diff is collapsed.
......@@ -175,6 +175,15 @@ MODULE m_cdn_io
CALL getIOMode(mode)
#ifndef CPP_HDF
filename = 'cdn.hdf'
IF(PRESENT(inFilename)) filename = TRIM(ADJUSTL(inFilename))//'.hdf'
INQUIRE(FILE=TRIM(ADJUSTL(filename)),EXIST=l_exist)
IF (l_exist) THEN
CALL juDFT_warn('Fleur not compiled for HDF5, but '//TRIM(ADJUSTL(filename))//' present',calledby='readDensity')
END IF
#endif
IF(mode.EQ.CDN_HDF5_MODE) THEN
#ifdef CPP_HDF
......
......@@ -12,9 +12,11 @@ SUBROUTINE writeBasis(input,noco,kpts,atoms,sym,cell,enpara,vTot,mpi,DIMENSION,r
USE m_types
USE m_juDFT
#ifdef CPP_HDF
USE hdf5
USE m_hdf_tools
USE m_genmtbasis
#endif
USE m_genmtbasis
! USE m_cdn_io
USE m_abcof
USE m_eig66_io, ONLY : read_eig
......@@ -47,12 +49,17 @@ SUBROUTINE writeBasis(input,noco,kpts,atoms,sym,cell,enpara,vTot,mpi,DIMENSION,r
INTEGER(HID_T) :: fileID
#ifdef CPP_HDF
LOGICAL :: l_exist
CHARACTER(LEN=30) :: filename
CHARACTER(LEN=30) :: kpt_name
CHARACTER(LEN=30) :: jsp_name
CHARACTER(LEN=30) :: itype_name
! CHARACTER(LEN=30) :: l_name
INTEGER(HID_T) :: fileID
INTEGER(HID_T) :: metaGroupID
INTEGER(HID_T) :: generalGroupID
INTEGER(HID_T) :: cellGroupID
......@@ -80,6 +87,7 @@ SUBROUTINE writeBasis(input,noco,kpts,atoms,sym,cell,enpara,vTot,mpi,DIMENSION,r
INTEGER(HID_T) :: kptWeightSpaceID, kptWeightSetID
! INTEGER(HID_T) :: kptSPLabelsSpaceID, kptSPLabelsSetID
! INTEGER(HID_T) :: kptsSPIndicesSpaceID, kptsSPIndicesSetID
INTEGER(HSIZE_T) :: dims(7)
INTEGER :: j, iAtom
! INTEGER :: noded, nodeu
......@@ -94,7 +102,6 @@ SUBROUTINE writeBasis(input,noco,kpts,atoms,sym,cell,enpara,vTot,mpi,DIMENSION,r
INTEGER :: atomicNumbers(atoms%nat)
INTEGER :: equivAtomsGroup(atoms%nat)
INTEGER(HSIZE_T) :: dims(7)
! REAL :: wronk
......@@ -118,7 +125,6 @@ SUBROUTINE writeBasis(input,noco,kpts,atoms,sym,cell,enpara,vTot,mpi,DIMENSION,r
ALLOCATE (flo(atoms%jmtd,2,atoms%nlod))
#ifdef CPP_HDF
l_real=sym%invs.AND..NOT.noco%l_noco
! check if z-reflection trick can be used
......
......@@ -9,7 +9,7 @@ MODULE m_types_enpara
IMPLICIT NONE
PRIVATE
TYPE t_enpara
REAL, ALLOCATABLE :: el0(:,:,:)
REAL, ALLOCATABLE CPP_MANAGED :: el0(:,:,:)
REAL :: evac0(2,2)
REAL :: evac(2,2)
REAL, ALLOCATABLE :: ello0(:,:,:)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment