Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
fleur
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
54
Issues
54
List
Boards
Labels
Service Desk
Milestones
Operations
Operations
Incidents
Packages & Registries
Packages & Registries
Container Registry
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
fleur
fleur
Commits
e288839d
Commit
e288839d
authored
Oct 18, 2018
by
Matthias Redies
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'develop' into MetaGGA
parents
d7380394
cb805565
Changes
17
Hide whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
719 additions
and
269 deletions
+719
-269
cmake/tests/test_GPU.cmake
cmake/tests/test_GPU.cmake
+3
-1
diagonalization/magma.F90
diagonalization/magma.F90
+36
-64
eigen/eigen_redist_matrix.f90
eigen/eigen_redist_matrix.f90
+5
-5
eigen/hsmt.F90
eigen/hsmt.F90
+7
-3
eigen/hsmt_ab.F90
eigen/hsmt_ab.F90
+5
-7
eigen/hsmt_distspin.F90
eigen/hsmt_distspin.F90
+1
-1
eigen/hsmt_nonsph.F90
eigen/hsmt_nonsph.F90
+21
-29
eigen/hsmt_sph.F90
eigen/hsmt_sph.F90
+440
-6
io/cdnpot_io_hdf.F90
io/cdnpot_io_hdf.F90
+3
-3
juDFT/hdf_tools_rw.F90
juDFT/hdf_tools_rw.F90
+27
-27
main/mix.F90
main/mix.F90
+4
-1
tests/tests/NiO_ldauXML/test.run1
tests/tests/NiO_ldauXML/test.run1
+4
-4
types/types_gpumat.F90
types/types_gpumat.F90
+8
-0
types/types_mat.F90
types/types_mat.F90
+12
-17
vgen/VYukawaFilm.f90
vgen/VYukawaFilm.f90
+140
-100
vgen/psqpw.F90
vgen/psqpw.F90
+1
-0
vgen/vgen_coulomb.F90
vgen/vgen_coulomb.F90
+2
-1
No files found.
cmake/tests/test_GPU.cmake
View file @
e288839d
...
...
@@ -8,6 +8,8 @@ if (CLI_FLEUR_USE_GPU)
message
(
"Using cuda8"
)
elseif
(
${
CLI_FLEUR_USE_GPU
}
MATCHES
"cuda9"
)
set
(
CMAKE_Fortran_FLAGS
"
${
CMAKE_Fortran_FLAGS
}
-Mcuda=cuda9.0,cc60 -Mcuda=rdc -Mcudalib=cublas"
)
elseif
(
${
CLI_FLEUR_USE_GPU
}
MATCHES
"cuda9.1"
)
set
(
CMAKE_Fortran_FLAGS
"
${
CMAKE_Fortran_FLAGS
}
-Mcuda=cuda9.1,cc60 -Mcuda=rdc -Mcudalib=cublas"
)
elseif
(
${
CLI_FLEUR_USE_GPU
}
MATCHES
"nvtx"
)
set
(
CMAKE_Fortran_FLAGS
"
${
CMAKE_Fortran_FLAGS
}
-Mcuda=cuda9.0,cc60 -Mcuda=rdc -Mcudalib=cublas -lnvToolsExt "
)
elseif
(
${
CLI_FLEUR_USE_GPU
}
MATCHES
"emu"
)
...
...
@@ -21,7 +23,7 @@ if (CLI_FLEUR_USE_GPU)
LINK_LIBRARIES
"-lcusolver"
)
if
(
FLEUR_USE_CUSOLVER
)
set
(
${
FLEUR_LIBRARIES
}
"
${
FLEUR_LIBRARIES
}
;-lcusolver"
)
set
(
FLEUR_LIBRARIES
"
${
FLEUR_LIBRARIES
}
;-lcusolver"
)
set
(
FLEUR_MPI_DEFINITIONS
${
FLEUR_MPI_DEFINITIONS
}
"CPP_CUSOLVER"
)
set
(
FLEUR_DEFINITIONS
${
FLEUR_DEFINITIONS
}
"CPP_CUSOLVER"
)
endif
()
...
...
diagonalization/magma.F90
View file @
e288839d
...
...
@@ -12,92 +12,64 @@ MODULE m_magma
! using the MAGMA library for multiple GPUs
!**********************************************************
CONTAINS
SUBROUTINE
magma_diag
(
nsize
,
eig
,
ne
,
a_r
,
b_r
,
z_r
,
a_c
,
b_c
,
z_c
)
SUBROUTINE
magma_diag
(
hmat
,
smat
,
ne
,
eig
,
zmat
)
#ifdef CPP_MAGMA
use
magma
#endif
#include"cpp_double.h"
use
m_types
IMPLICIT
NONE
! ... Arguments ...
INTEGER
,
INTENT
(
IN
)
::
nsize
TYPE
(
t_mat
),
INTENT
(
INOUT
)
::
hmat
,
smat
INTEGER
,
INTENT
(
INOUT
)
::
ne
CLASS
(
t_mat
),
ALLOCATABLE
,
INTENT
(
OUT
)
::
zmat
REAL
,
INTENT
(
OUT
)
::
eig
(:)
REAL
,
INTENT
(
OUT
)
::
eig
(:)
INTEGER
,
INTENT
(
INOUT
)
::
ne
REAL
,
OPTIONAL
,
ALLOCATABLE
,
INTENT
(
INOUT
)
::
a_r
(:),
b_r
(:)
REAL
,
OPTIONAL
,
ALLOCATABLE
,
INTENT
(
INOUT
)
::
z_r
(:,:)
COMPLEX
,
OPTIONAL
,
ALLOCATABLE
,
INTENT
(
INOUT
)
::
a_c
(:),
b_c
(:)
COMPLEX
,
OPTIONAL
,
ALLOCATABLE
,
INTENT
(
INOUT
)
::
z_c
(:,:)
#ifdef CPP_MAGMA
! ... Local Variables ..
INTEGER
iind
,
ind1
,
ind2
,
info
,
lwork
,
liwork
,
lrwork
,
err
,
i
,
mout
(
1
)
REAL
eigTemp
(
nsize
)
LOGICAL
::
initialized
=
.false.
INTEGER
::
lwork
,
liwork
,
lrwork
,
err
,
mout
(
1
)
REAL
::
eigTemp
(
hmat
%
matsize1
)
LOGICAL
::
initialized
=
.false.
REAL
,
ALLOCATABLE
::
rwork
(:)
INTEGER
,
ALLOCATABLE
::
iwork
(:)
REAL
,
ALLOCATABLE
::
largea_r
(:,:),
largeb_r
(:,:)
COMPLEX
,
ALLOCATABLE
::
largea_c
(:,:),
largeb_c
(:,:)
COMPLEX
,
ALLOCATABLE
::
work
(:)
COMPLEX
,
ALLOCATABLE
::
work
(:)
LOGICAL
::
l_real
l_real
=
present
(
a_r
)
print
*
,
"MAGMA start"
IF
(
.NOT.
initialized
)
THEN
initialized
=
.true.
call
magmaf_init
()
print
*
,
"MAGMA init"
initialized
=
.TRUE.
CALL
magmaf_init
()
ENDIF
!**********************************
!expand from packed to full storage
!**********************************
!hamiltonian
if
(
l_real
)
THEN
call
packed_to_full
(
nsize
,
a_r
,
largea_r
)
call
packed_to_full
(
nsize
,
b_r
,
largeb_r
)
!deallocate(a_r,b_r)
IF
(
hmat
%
l_real
)
THEN
CALL
juDFT_error
(
"REAL diagonalization not implemented in magma.F90"
)
ELSE
call
packed_to_full
(
nsize
,
a_c
,
largea_c
)
call
packed_to_full
(
nsize
,
b_c
,
largeb_c
)
!deallocate(a_c,b_c)
Endif
if
(
l_real
)
call
juDFT_error
(
"REAL diagonalization not implemented in magma.F90"
)
!Query the workspace size
allocate
(
work
(
1
),
rwork
(
1
),
iwork
(
1
))
print
*
,
"Magma workspace query"
call
flush
()
call
magmaf_zhegvdx
(
1
,
'v'
,
'i'
,
'l'
,
nsize
,
largea_c
,
nsize
,
largeb_c
,
nsize
,&
0.0
,
0.0
,
1
,
ne
,
mout
,
eigTemp
,
work
,
-1
,
rwork
,
-1
,
iwork
,
-1
,
err
)
lwork
=
work
(
1
)
lrwork
=
rwork
(
1
)
liwork
=
iwork
(
1
)
print
*
,
"MAGMA:"
,
lwork
,
lrwork
,
liwork
deallocate
(
work
,
rwork
,
iwork
)
allocate
(
work
(
lwork
),
rwork
(
lrwork
),
iwork
(
liwork
))
if
(
err
/
=
0
)
call
juDFT_error
(
"Failed to allocate workspaces"
,
calledby
=
"magma.F90"
)
!Now the diagonalization
print
*
,
"Magma diagonalization"
print
*
,
nsize
,
shape
(
largea_c
),
shape
(
eigTemp
),
ne
call
magmaf_zhegvdx
(
1
,
'v'
,
'i'
,
'l'
,
nsize
,
largea_c
,
nsize
,
largeb_c
,
nsize
,&
0.0
,
0.0
,
1
,
ne
,
mout
,
eigTemp
,
work
,
lwork
,
rwork
,
lrwork
,
iwork
,
liwork
,
err
)
print
*
,
"MAGMA info:"
,
err
if
(
err
/
=
0
)
call
juDFT_error
(
"Magma failed to diagonalize Hamiltonian"
)
print
*
,
"MAGMA mout:"
,
mout
!Query the workspace size
ALLOCATE
(
work
(
1
),
rwork
(
1
),
iwork
(
1
))
CALL
magmaf_zhegvdx
(
1
,
'v'
,
'i'
,
'l'
,
hmat
%
matsize1
,
hmat
%
data_c
,
SIZE
(
hmat
%
data_c
,
1
),
smat
%
data_c
,
SIZE
(
smat
%
data_c
,
1
),&
0.0
,
0.0
,
1
,
ne
,
mout
,
eigTemp
,
work
,
-1
,
rwork
,
-1
,
iwork
,
-1
,
err
)
IF
(
err
/
=
0
)
CALL
juDFT_error
(
"Failed to query workspaces"
,
calledby
=
"magma.F90"
)
lwork
=
work
(
1
)
lrwork
=
rwork
(
1
)
liwork
=
iwork
(
1
)
DEALLOCATE
(
work
,
rwork
,
iwork
)
ALLOCATE
(
work
(
lwork
),
rwork
(
lrwork
),
iwork
(
liwork
))
!Now the diagonalization
CALL
magmaf_zhegvdx
(
1
,
'v'
,
'i'
,
'l'
,
hmat
%
matsize1
,
hmat
%
data_c
,
SIZE
(
hmat
%
data_c
,
1
),
smat
%
data_c
,
SIZE
(
smat
%
data_c
,
1
),&
0.0
,
0.0
,
1
,
ne
,
mout
,
eigTemp
,
work
,
lwork
,
rwork
,
lrwork
,
iwork
,
liwork
,
err
)
IF
(
err
/
=
0
)
CALL
juDFT_error
(
"Magma failed to diagonalize Hamiltonian"
)
ENDIF
ALLOCATE
(
t_mat
::
zmat
)
CALL
zmat
%
alloc
(
hmat
%
l_real
,
hmat
%
matsize1
,
ne
)
DO
i
=
1
,
ne
eig
(
i
)
=
eigTemp
(
i
)
z_c
(:
nsize
,
i
)
=
largea_c
(:
nsize
,
i
)
IF
(
hmat
%
l_real
)
THEN
zmat
%
data_r
(:,
i
)
=
hmat
%
data_r
(:
nsize
,
i
)
ELSE
zmat
%
data_c
(:,
i
)
=
hmat
%
data_c
(:
nsize
,
i
)
ENDIF
END
DO
!call judft_error("Eigenvectors are not calculated in MAGMA")
#endif
END
SUBROUTINE
magma_diag
END
MODULE
m_magma
...
...
eigen/eigen_redist_matrix.f90
View file @
e288839d
...
...
@@ -38,22 +38,22 @@ CONTAINS
!up-up component (or only component in collinear case)
IF
(
SIZE
(
mat
)
==
1
)
THEN
CALL
mat_final
%
move
(
mat
(
1
,
1
))
CALL
mat
(
1
,
1
)
%
free
()
!
CALL mat(1,1)%free()
RETURN
ENDIF
CALL
mat_final
%
copy
(
mat
(
1
,
1
),
1
,
1
)
CALL
mat
(
1
,
1
)
%
free
()
!
CALL mat(1,1)%free()
!down-down component
CALL
mat_final
%
copy
(
mat
(
2
,
2
),
lapw
%
nv
(
1
)
+
atoms
%
nlotot
+1
,
lapw
%
nv
(
1
)
+
atoms
%
nlotot
+1
)
CALL
mat
(
2
,
2
)
%
free
()
!
CALL mat(2,2)%free()
!Now collect off-diagonal parts
CALL
mat
(
1
,
2
)
%
add_transpose
(
mat
(
2
,
1
))
CALL
mat_final
%
copy
(
mat
(
1
,
2
),
1
,
lapw
%
nv
(
1
)
+
atoms
%
nlotot
+1
)
CALL
mat
(
1
,
2
)
%
free
()
CALL
mat
(
2
,
1
)
%
free
()
!
CALL mat(1,2)%free()
!
CALL mat(2,1)%free()
END
SUBROUTINE
eigen_redist_matrix
END
MODULE
m_eigen_redist_matrix
...
...
eigen/hsmt.F90
View file @
e288839d
...
...
@@ -43,7 +43,11 @@ CONTAINS
INTEGER
,
INTENT
(
IN
)
::
ispin
!locals
#ifdef CPP_GPU
REAL
,
ALLOCATABLE
,
MANAGED
::
fj
(:,:,:,:),
gj
(:,:,:,:)
#else
REAL
,
ALLOCATABLE
::
fj
(:,:,:,:),
gj
(:,:,:,:)
#endif
INTEGER
::
iintsp
,
jintsp
,
n
COMPLEX
::
chi
(
2
,
2
),
chi_one
...
...
@@ -67,7 +71,7 @@ CONTAINS
IF
(
.NOT.
noco
%
l_noco
)
THEN
!This is for collinear calculations: the (1,1) element of the matrices is all
!that is needed and allocated
CALL
hsmt_sph
(
n
,
atoms
,
mpi
,
ispin
,
input
,
noco
,
cell
,
1
,
1
,
chi_one
,
lapw
,
enpara
%
el0
,&
CALL
hsmt_sph
(
n
,
atoms
,
mpi
,
ispin
,
input
,
noco
,
1
,
1
,
chi_one
,
lapw
,
enpara
%
el0
,&
td
%
e_shift
(
n
,
ispin
),
usdus
,
fj
(:,
0
:,
ispin
,:),
gj
(:,
0
:,
ispin
,:),
smat
(
1
,
1
),
hmat
(
1
,
1
))
CALL
hsmt_nonsph
(
n
,
mpi
,
sym
,
atoms
,
ispin
,
1
,
1
,
chi_one
,
noco
,
cell
,
lapw
,
td
,&
fj
(:,
0
:,
ispin
,:),
gj
(:,
0
:,
ispin
,:),
hmat
(
1
,
1
))
...
...
@@ -79,7 +83,7 @@ CONTAINS
!stored in tmp-variables. Then these are distributed (rotated) into the 2x2
!global spin-matrices.
CALL
hmat_tmp
%
clear
();
CALL
smat_tmp
%
clear
()
CALL
hsmt_sph
(
n
,
atoms
,
mpi
,
ispin
,
input
,
noco
,
cell
,
1
,
1
,
chi_one
,
lapw
,
enpara
%
el0
,
td
%
e_shift
(
n
,
ispin
),&
CALL
hsmt_sph
(
n
,
atoms
,
mpi
,
ispin
,
input
,
noco
,
1
,
1
,
chi_one
,
lapw
,
enpara
%
el0
,
td
%
e_shift
(
n
,
ispin
),&
usdus
,
fj
(:,
0
:,
ispin
,:),
gj
(:,
0
:,
ispin
,:),
smat_tmp
,
hmat_tmp
)
CALL
hsmt_nonsph
(
n
,
mpi
,
sym
,
atoms
,
ispin
,
1
,
1
,
chi_one
,
noco
,
cell
,
lapw
,
td
,&
fj
(:,
0
:,
ispin
,:),
gj
(:,
0
:,
ispin
,:),
hmat_tmp
)
...
...
@@ -99,7 +103,7 @@ CONTAINS
CALL
hsmt_spinor
(
ispin
,
n
,
noco
,
chi
)
DO
iintsp
=
1
,
2
DO
jintsp
=
1
,
2
CALL
hsmt_sph
(
n
,
atoms
,
mpi
,
ispin
,
input
,
noco
,
cell
,
iintsp
,
jintsp
,
chi
(
iintsp
,
jintsp
),&
CALL
hsmt_sph
(
n
,
atoms
,
mpi
,
ispin
,
input
,
noco
,
iintsp
,
jintsp
,
chi
(
iintsp
,
jintsp
),&
lapw
,
enpara
%
el0
,
td
%
e_shift
(
n
,
ispin
),
usdus
,
fj
(:,
0
:,
ispin
,:),
gj
(:,
0
:,
ispin
,:),&
smat
(
iintsp
,
jintsp
),
hmat
(
iintsp
,
jintsp
))
CALL
hsmt_nonsph
(
n
,
mpi
,
sym
,
atoms
,
ispin
,
iintsp
,
jintsp
,
chi
(
iintsp
,
jintsp
),
noco
,
cell
,&
...
...
eigen/hsmt_ab.F90
View file @
e288839d
...
...
@@ -62,7 +62,7 @@ CONTAINS
USE
m_ylm
USE
m_apws
USE
cudafor
!
USE nvtx
USE
nvtx
IMPLICIT
NONE
TYPE
(
t_sym
),
INTENT
(
IN
)
::
sym
TYPE
(
t_cell
),
INTENT
(
IN
)
::
cell
...
...
@@ -92,8 +92,9 @@ CONTAINS
COMPLEX
,
ALLOCATABLE
,
DEVICE
::
c_ph_dev
(:,:)
REAL
,
ALLOCATABLE
,
DEVICE
::
gkrot_dev
(:,:)
INTEGER
::
grid
,
block
!
INTEGER :: istat
INTEGER
::
istat
call
nvtxStartRange
(
"hsmt_ab"
,
3
)
lmax
=
MERGE
(
atoms
%
lnonsph
(
n
),
atoms
%
lmax
(
n
),
l_nonsph
)
ALLOCATE
(
c_ph_dev
(
lapw
%
nv
(
1
),
MERGE
(
2
,
1
,
noco
%
l_ss
)))
...
...
@@ -129,17 +130,12 @@ CONTAINS
!--> synthesize the complex conjugates of a and b
!call nvtxStartRange("hsmt_synthAB",5)
!istat = cudaDeviceSynchronize()
! pretty ugly solution
block
=
256
grid
=
lapw
%
nv
(
1
)/(
block
*
4
)
+
1
CALL
synth_ab
<<<
grid
,
block
>>>
(
grid
,
block
,
lapw
%
nv
(
1
),
lmax
,
ab_size
,
gkrot_dev
,&
fj
(:,:,
iintsp
),
gj
(:,:,
iintsp
),
c_ph_dev
(:,
iintsp
),
ab
)
!istat = cudaDeviceSynchronize()
!call nvtxEndRange
IF
(
PRESENT
(
abclo
))
THEN
print
*
,
"Ooooops, TODO in hsmt_ab"
...
...
@@ -169,6 +165,8 @@ CONTAINS
DEALLOCATE
(
c_ph_dev
)
DEALLOCATE
(
gkrot_dev
)
istat
=
cudaDeviceSynchronize
()
call
nvtxEndRange
END
SUBROUTINE
hsmt_ab_gpu
#endif
...
...
eigen/hsmt_distspin.F90
View file @
e288839d
...
...
@@ -17,7 +17,7 @@ CONTAINS
DO
iintsp
=
1
,
2
DO
jintsp
=
1
,
2
mat
(
jintsp
,
iintsp
)
%
data_c
(:,:)
=
chi
(
jintsp
,
i
intsp
)
*
mat_tmp
%
data_c
(:,:)
+
mat
(
jintsp
,
iintsp
)
%
data_c
(:,:)
mat
(
jintsp
,
iintsp
)
%
data_c
(:,:)
=
chi
(
iintsp
,
j
intsp
)
*
mat_tmp
%
data_c
(:,:)
+
mat
(
jintsp
,
iintsp
)
%
data_c
(:,:)
ENDDO
ENDDO
END
SUBROUTINE
hsmt_distspins
...
...
eigen/hsmt_nonsph.F90
View file @
e288839d
...
...
@@ -29,39 +29,22 @@ CONTAINS
INTEGER
,
INTENT
(
IN
)
::
n
,
isp
,
iintsp
,
jintsp
COMPLEX
,
INTENT
(
IN
)
::
chi
! .. Array Arguments ..
REAL
,
INTENT
(
IN
)
::
fj
(:,
0
:,:),
gj
(:,
0
:,:)
#if defined CPP_GPU
REAL
,
MANAGED
,
INTENT
(
IN
)
::
fj
(:,:,:),
gj
(:,:,:)
#else
REAL
,
INTENT
(
IN
)
::
fj
(:,
0
:,:),
gj
(:,
0
:,:)
#endif
CLASS
(
t_mat
),
INTENT
(
INOUT
)
::
hmat
#if defined CPP_GPU
REAL
,
ALLOCATABLE
,
DEVICE
::
fj_dev
(:,:,:),
gj_dev
(:,:,:)
COMPLEX
,
ALLOCATABLE
,
DEVICE
::
h_loc_dev
(:,:)
COMPLEX
,
ALLOCATABLE
,
DEVICE
::
c_dev
(:,:)
#endif
CALL
timestart
(
"non-spherical setup"
)
IF
(
mpi
%
n_size
==
1
)
THEN
#if defined CPP_GPU
ALLOCATE
(
fj_dev
(
MAXVAL
(
lapw
%
nv
),
atoms
%
lmaxd
+1
,
MERGE
(
2
,
1
,
noco
%
l_noco
)))
ALLOCATE
(
gj_dev
(
MAXVAL
(
lapw
%
nv
),
atoms
%
lmaxd
+1
,
MERGE
(
2
,
1
,
noco
%
l_noco
)))
fj_dev
(
1
:,
1
:,
1
:)
=
fj
(
1
:,
0
:,
1
:)
gj_dev
(
1
:,
1
:,
1
:)
=
gj
(
1
:,
0
:,
1
:)
ALLOCATE
(
h_loc_dev
(
size
(
td
%
h_loc
,
1
),
size
(
td
%
h_loc
,
2
)))
h_loc_dev
(
1
:,
1
:)
=
CONJG
(
td
%
h_loc
(
0
:,
0
:,
n
,
isp
))
IF
(
hmat
%
l_real
)
THEN
IF
(
ANY
(
SHAPE
(
hmat
%
data_c
)/
=
SHAPE
(
hmat
%
data_r
)))
THEN
DEALLOCATE
(
hmat
%
data_c
)
ALLOCATE
(
hmat
%
data_c
(
SIZE
(
hmat
%
data_r
,
1
),
SIZE
(
hmat
%
data_r
,
2
)))
ENDIF
hmat
%
data_c
=
0.0
ENDIF
ALLOCATE
(
c_dev
(
SIZE
(
hmat
%
data_c
,
1
),
SIZE
(
hmat
%
data_c
,
2
)))
c_dev
=
hmat
%
data_c
CALL
priv_noMPI
(
n
,
mpi
,
sym
,
atoms
,
isp
,
iintsp
,
jintsp
,
chi
,
noco
,
cell
,
lapw
,
h_loc_dev
,
fj_dev
,
gj_dev
,
c_dev
)
hmat
%
data_c
=
c_dev
IF
(
hmat
%
l_real
)
THEN
hmat
%
data_r
=
hmat
%
data_r
+
REAL
(
hmat
%
data_c
)
ENDIF
CALL
priv_noMPI
(
n
,
mpi
,
sym
,
atoms
,
isp
,
iintsp
,
jintsp
,
chi
,
noco
,
cell
,
lapw
,
h_loc_dev
,
fj
,
gj
,
hmat
)
#else
CALL
priv_noMPI
(
n
,
mpi
,
sym
,
atoms
,
isp
,
iintsp
,
jintsp
,
chi
,
noco
,
cell
,
lapw
,
td
,
fj
,
gj
,
hmat
)
#endif
...
...
@@ -72,7 +55,7 @@ CONTAINS
END
SUBROUTINE
hsmt_nonsph
#if defined CPP_GPU
SUBROUTINE
priv_noMPI_gpu
(
n
,
mpi
,
sym
,
atoms
,
isp
,
iintsp
,
jintsp
,
chi
,
noco
,
cell
,
lapw
,
h_loc_dev
,
fj_dev
,
gj_dev
,
c_dev
)
SUBROUTINE
priv_noMPI_gpu
(
n
,
mpi
,
sym
,
atoms
,
isp
,
iintsp
,
jintsp
,
chi
,
noco
,
cell
,
lapw
,
h_loc_dev
,
fj_dev
,
gj_dev
,
hmat
)
!Calculate overlap matrix, GPU version
!note that basically all matrices in the GPU version are conjugates of their cpu counterparts
USE
m_hsmt_ab
...
...
@@ -101,8 +84,7 @@ CONTAINS
! ..
! .. Array Arguments ..
REAL
,
INTENT
(
IN
),
DEVICE
::
fj_dev
(:,:,:),
gj_dev
(:,:,:)
COMPLEX
,
INTENT
(
INOUT
),
DEVICE
::
c_dev
(:,:)
CLASS
(
t_mat
),
INTENT
(
INOUT
)
::
hmat
INTEGER
::
nn
,
na
,
ab_size
,
l
,
ll
,
m
real
::
rchi
...
...
@@ -114,6 +96,14 @@ CONTAINS
ALLOCATE
(
ab_dev
(
MAXVAL
(
lapw
%
nv
),
2
*
atoms
%
lmaxd
*
(
atoms
%
lmaxd
+2
)
+2
))
IF
(
iintsp
.NE.
jintsp
)
ALLOCATE
(
ab2_dev
(
lapw
%
nv
(
iintsp
),
2
*
atoms
%
lmaxd
*
(
atoms
%
lmaxd
+2
)
+2
))
IF
(
hmat
%
l_real
)
THEN
IF
(
ANY
(
SHAPE
(
hmat
%
data_c
)/
=
SHAPE
(
hmat
%
data_r
)))
THEN
DEALLOCATE
(
hmat
%
data_c
)
ALLOCATE
(
hmat
%
data_c
(
SIZE
(
hmat
%
data_r
,
1
),
SIZE
(
hmat
%
data_r
,
2
)))
ENDIF
hmat
%
data_c
=
0.0
ENDIF
DO
nn
=
1
,
atoms
%
neq
(
n
)
na
=
SUM
(
atoms
%
neq
(:
n
-1
))
+
nn
IF
((
atoms
%
invsat
(
na
)
==
0
)
.OR.
(
atoms
%
invsat
(
na
)
==
1
))
THEN
...
...
@@ -124,10 +114,9 @@ CONTAINS
!Calculate Hamiltonian
CALL
zgemm
(
"N"
,
"N"
,
lapw
%
nv
(
jintsp
),
ab_size
,
ab_size
,
CMPLX
(
1.0
,
0.0
),
ab_dev
,
SIZE
(
ab_dev
,
1
),&
h_loc_dev
,
SIZE
(
h_loc_dev
,
1
),
CMPLX
(
0.
,
0.
),
ab1_dev
,
SIZE
(
ab1_dev
,
1
))
!ab1=MATMUL(ab(:lapw%nv(iintsp),:ab_size),td%h_loc(:ab_size,:ab_size,n,isp))
IF
(
iintsp
==
jintsp
)
THEN
call
nvtxStartRange
(
"zherk"
,
3
)
CALL
ZHERK
(
"U"
,
"N"
,
lapw
%
nv
(
iintsp
),
ab_size
,
Rchi
,
ab1_dev
,
SIZE
(
ab1_dev
,
1
),
1.0
,
c_dev
,
SIZE
(
c_dev
,
1
))
CALL
ZHERK
(
"U"
,
"N"
,
lapw
%
nv
(
iintsp
),
ab_size
,
Rchi
,
ab1_dev
,
SIZE
(
ab1_dev
,
1
),
1.0
,
hmat
%
data_c
,
SIZE
(
hmat
%
data_c
,
1
))
istat
=
cudaDeviceSynchronize
()
call
nvtxEndRange
()
ELSE
!here the l_ss off-diagonal part starts
...
...
@@ -144,11 +133,14 @@ CONTAINS
enddo
enddo
CALL
zgemm
(
"N"
,
"T"
,
lapw
%
nv
(
iintsp
),
lapw
%
nv
(
jintsp
),
ab_size
,
chi
,
ab2_dev
,
SIZE
(
ab2_dev
,
1
),&
ab1_dev
,
SIZE
(
ab1_dev
,
1
),
CMPLX
(
1.0
,
0.0
),
c_dev
,
SIZE
(
c_dev
,
1
))
ab1_dev
,
SIZE
(
ab1_dev
,
1
),
CMPLX
(
1.0
,
0.0
),
hmat
%
data_c
,
SIZE
(
hmat
%
data_c
,
1
))
ENDIF
ENDIF
END
DO
IF
(
hmat
%
l_real
)
THEN
hmat
%
data_r
=
hmat
%
data_r
+
REAL
(
hmat
%
data_c
)
ENDIF
call
nvtxEndRange
END
SUBROUTINE
priv_noMPI_gpu
#endif
...
...
eigen/hsmt_sph.F90
View file @
e288839d
...
...
@@ -7,15 +7,450 @@
MODULE
m_hsmt_sph
USE
m_juDFT
IMPLICIT
NONE
INTERFACE
hsmt_sph
module
procedure
hsmt_sph_cpu
#ifdef CPP_GPU
module
procedure
hsmt_sph_gpu
END
INTERFACE
INTERFACE
HsmtSphGpuKernel_noApw
module
procedure
HsmtSphGpuKernel_noApw_cmplx
,
HsmtSphGpuKernel_noApw_real
END
INTERFACE
INTERFACE
HsmtSphGpuKernel_Apw
module
procedure
HsmtSphGpuKernel_Apw_cmplx
,
HsmtSphGpuKernel_Apw_real
#endif
END
INTERFACE
CONTAINS
SUBROUTINE
hsmt_sph
(
n
,
atoms
,
mpi
,
isp
,
input
,
noco
,
cell
,
iintsp
,
jintsp
,
chi
,
lapw
,
el
,
e_shift
,
usdus
,
fj
,
gj
,
smat
,
hmat
)
#ifdef CPP_GPU
SUBROUTINE
HsmtSphGpuKernel_Apw_real
(
iintsp
,
jintsp
,
nv
,
lmaxd
,
lmax
,
ki_start
,
ki_end
,
ki_step
,
nn_start
,
nn_end
,&
lnonsph
,
chi
,
qssbti
,
qssbtj
,
gvec
,
gk
,
fleg1
,
fleg2
,
fl2p1
,
fl2p1bt
,
fj
,
gj
,
taual
,
ddn
,
el
,
e_shift
,&
smat_data
,
hmat_data
,&
uds
,
dus
,
us
,
duds
,
rmt
)
INTEGER
,
INTENT
(
IN
)
::
iintsp
,
jintsp
,
nv
(
2
),
lmaxd
,
lmax
,
ki_start
,
ki_end
,
ki_step
,
nn_start
,
nn_end
,
lnonsph
COMPLEX
,
INTENT
(
IN
)
::
chi
REAL
,
INTENT
(
IN
)
::
qssbti
(
3
),
qssbtj
(
3
)
INTEGER
,
INTENT
(
IN
)
::
gvec
(:,:,:)
REAL
,
INTENT
(
IN
)
::
gk
(:,:,:)
REAL
,
INTENT
(
IN
)
::
fleg1
(
0
:
lmaxd
),
fleg2
(
0
:
lmaxd
),
fl2p1
(
0
:
lmaxd
)
REAL
,
INTENT
(
IN
)
::
fl2p1bt
(
0
:
lmaxd
)
REAL
,
MANAGED
,
INTENT
(
IN
)
::
fj
(:,
0
:,:),
gj
(:,
0
:,:)
REAL
,
INTENT
(
IN
)
::
taual
(:,:)
REAL
,
INTENT
(
IN
)
::
ddn
(
0
:
lmaxd
)
REAL
,
INTENT
(
IN
)
::
el
(
0
:
lmaxd
)
REAL
,
INTENT
(
IN
)
::
e_shift
REAL
,
INTENT
(
INOUT
)
::
smat_data
(:,:),
hmat_data
(:,:)
!+APW
REAL
,
INTENT
(
IN
),
OPTIONAL
::
uds
(
0
:
lmaxd
),
dus
(
0
:
lmaxd
),
us
(
0
:
lmaxd
),
duds
(
0
:
lmaxd
)
REAL
,
INTENT
(
IN
),
OPTIONAL
::
rmt
!-APW
REAL
,
PARAMETER
::
tpi_const
=
2.
*
3.1415926535897932
REAL
,
ALLOCATABLE
::
plegend
(:,:)
COMPLEX
,
ALLOCATABLE
::
cph
(:)
REAL
tnn
(
3
),
elall
,
fct
,
fjkiln
,
gjkiln
,
ddnln
,
ski
(
3
)
REAL
apw_lo1
,
apw_lo2
,
apw1
,
w1
INTEGER
kii
,
ki
,
kj
,
l
,
nn
ALLOCATE
(
cph
(
MAXVAL
(
nv
)))
ALLOCATE
(
plegend
(
MAXVAL
(
nv
),
0
:
lmaxd
))
plegend
=
0.0
plegend
(:,
0
)
=
1.0
DO
ki
=
ki_start
,
ki_end
,
ki_step
kii
=
(
ki
-1
)/
ki_step
+1
ski
=
gvec
(:,
ki
,
jintsp
)
+
qssbti
!---> legendre polynomials
DO
kj
=
1
,
ki
plegend
(
kj
,
1
)
=
DOT_PRODUCT
(
gk
(:,
kj
,
iintsp
),
gk
(:,
ki
,
jintsp
))
END
DO
DO
l
=
1
,
lmax
-
1
plegend
(:
ki
,
l
+1
)
=
fleg1
(
l
)
*
plegend
(:
ki
,
1
)
*
plegend
(:
ki
,
l
)
-
fleg2
(
l
)
*
plegend
(:
ki
,
l
-1
)
END
DO
!---> set up phase factors
cph
=
0.0
DO
nn
=
nn_start
,
nn_end
tnn
=
tpi_const
*
taual
(:,
nn
)
DO
kj
=
1
,
ki
cph
(
kj
)
=
cph
(
kj
)
+
&
CMPLX
(
COS
(
DOT_PRODUCT
(
ski
-
gvec
(:,
kj
,
iintsp
)
-
qssbtj
,
tnn
)),&
SIN
(
DOT_PRODUCT
(
gvec
(:,
kj
,
iintsp
)
+
qssbtj
-
ski
,
tnn
)))
! IF (iintsp.NE.jintsp) cph(kj)=CONJG(cph(kj))
END
DO
END
DO
!---> update overlap and l-diagonal hamiltonian matrix
DO
l
=
0
,
lmax
!+APW
IF
(
PRESENT
(
uds
))
THEN
w1
=
0.5
*
(
uds
(
l
)
*
dus
(
l
)
+
us
(
l
)
*
duds
(
l
)
)
apw_lo1
=
fl2p1
(
l
)
*
0.5
*
rmt
**
2
*
(
gjkiln
*
w1
+
&
fjkiln
*
us
(
l
)
*
dus
(
l
)
)
apw_lo2
=
fl2p1
(
l
)
*
0.5
*
rmt
**
2
*
(
fjkiln
*
w1
+
&
gjkiln
*
uds
(
l
)
*
duds
(
l
)
)
ENDIF
!-APW
fjkiln
=
fj
(
ki
,
l
,
jintsp
)
gjkiln
=
gj
(
ki
,
l
,
jintsp
)
ddnln
=
ddn
(
l
)
elall
=
el
(
l
)
IF
(
l
<=
lnonsph
)
elall
=
elall
-
e_shift
!(isp)
DO
kj
=
1
,
ki
fct
=
plegend
(
kj
,
l
)
*
fl2p1
(
l
)
*
&
(
fjkiln
*
fj
(
kj
,
l
,
iintsp
)
+
gjkiln
*
gj
(
kj
,
l
,
iintsp
)
*
ddnln
)
smat_data
(
kj
,
kii
)
=
smat_data
(
kj
,
kii
)
+
REAL
(
cph
(
kj
))
*
fct
hmat_data
(
kj
,
kii
)
=
hmat_data
(
kj
,
kii
)
+
REAL
(
cph
(
kj
))
*
&
(
fct
*
elall
+
plegend
(
kj
,
l
)
*
fl2p1bt
(
l
)
*
&
(
fjkiln
*
gj
(
kj
,
l
,
iintsp
)
+
gjkiln
*
fj
(
kj
,
l
,
iintsp
)
)
)
!+APW
IF
(
PRESENT
(
uds
))
THEN
apw1
=
REAL
(
cph
(
kj
))
*
plegend
(
kj
,
l
)
*
&
(
apw_lo1
*
fj
(
kj
,
l
,
iintsp
)
+
apw_lo2
*
gj
(
kj
,
l
,
iintsp
)
)
hmat_data
(
kj
,
kii
)
=
hmat_data
(
kj
,
kii
)
+
apw1
ENDIF
!-APW
ENDDO
!---> end loop over l
ENDDO
!---> end loop over ki
ENDDO
DEALLOCATE
(
plegend
)
DEALLOCATE
(
cph
)
END
SUBROUTINE
HsmtSphGpuKernel_Apw_real
SUBROUTINE
HsmtSphGpuKernel_Apw_cmplx
(
iintsp
,
jintsp
,
nv
,
lmaxd
,
lmax
,
ki_start
,
ki_end
,
ki_step
,
nn_start
,
nn_end
,&
lnonsph
,
chi
,
qssbti
,
qssbtj
,
gvec
,
gk
,
fleg1
,
fleg2
,
fl2p1
,
fl2p1bt
,
fj
,
gj
,
taual
,
ddn
,
el
,
e_shift
,&
smat_data
,
hmat_data
,&
uds
,
dus
,
us
,
duds
,
rmt
)
INTEGER
,
INTENT
(
IN
)
::
iintsp
,
jintsp
,
nv
(
2
),
lmaxd
,
lmax
,
ki_start
,
ki_end
,
ki_step
,
nn_start
,
nn_end
,
lnonsph
COMPLEX
,
INTENT
(
IN
)
::
chi
REAL
,
INTENT
(
IN
)
::
qssbti
(
3
),
qssbtj
(
3
)
INTEGER
,
INTENT
(
IN
)
::
gvec
(:,:,:)
REAL
,
INTENT
(
IN
)
::
gk
(:,:,:)
REAL
,
INTENT
(
IN
)
::
fleg1
(
0
:
lmaxd
),
fleg2
(
0
:
lmaxd
),
fl2p1
(
0
:
lmaxd
)
REAL
,
INTENT
(
IN
)
::
fl2p1bt
(
0
:
lmaxd
)
REAL
,
MANAGED
,
INTENT
(
IN
)
::
fj
(:,
0
:,:),
gj
(:,
0
:,:)
REAL
,
INTENT
(
IN
)
::
taual
(:,:)
REAL
,
INTENT
(
IN
)
::
ddn
(
0
:
lmaxd
)
REAL
,
INTENT
(
IN
)
::
el
(
0
:
lmaxd
)
REAL
,
INTENT
(
IN
)
::
e_shift
COMPLEX
,
INTENT
(
INOUT
)
::
smat_data
(:,:),
hmat_data
(:,:)
!+APW
REAL
,
INTENT
(
IN
),
OPTIONAL
::
uds
(
0
:
lmaxd
),
dus
(
0
:
lmaxd
),
us
(
0
:
lmaxd
),
duds
(
0
:
lmaxd
)
REAL
,
INTENT
(
IN
),
OPTIONAL
::
rmt
!-APW
REAL
,
PARAMETER
::
tpi_const
=
2.
*
3.1415926535897932
REAL
,
ALLOCATABLE
::
plegend
(:,:)
COMPLEX
,
ALLOCATABLE
::
cph
(:)
REAL
apw_lo1
,
apw_lo2
,
w1
COMPLEX
capw1
REAL
tnn
(
3
),
elall
,
fct
,
fjkiln
,
gjkiln
,
ddnln
,
ski
(
3
)
INTEGER
kii
,
ki
,
kj
,
l
,
nn
ALLOCATE
(
cph
(
MAXVAL
(
nv
)))
ALLOCATE
(
plegend
(
MAXVAL
(
nv
),
0
:
lmaxd
))
plegend
=
0.0
plegend
(:,
0
)
=
1.0
DO
ki
=
ki_start
,
ki_end
,
ki_step
kii
=
(
ki
-1
)/
ki_step
+1
ski
=
gvec
(:,
ki
,
jintsp
)
+
qssbti
!---> legendre polynomials
DO
kj
=
1
,
ki
plegend
(
kj
,
1
)
=
DOT_PRODUCT
(
gk
(:,
kj
,
iintsp
),
gk
(:,
ki
,
jintsp
))
END
DO
DO
l
=
1
,
lmax
-
1
plegend
(:
ki
,
l
+1
)
=
fleg1
(
l
)
*
plegend
(:
ki
,
1
)
*
plegend
(:
ki
,
l
)
-
fleg2
(
l
)
*
plegend
(:
ki
,
l
-1
)
END
DO
!---> set up phase factors
cph
=
0.0
DO
nn
=
nn_start
,
nn_end
tnn
=
tpi_const
*
taual
(:,
nn
)
DO
kj
=
1
,
ki
cph
(
kj
)
=
cph
(
kj
)
+
&
CMPLX
(
COS
(
DOT_PRODUCT
(
ski
-
gvec
(:,
kj
,
iintsp
)
-
qssbtj
,
tnn
)),&
SIN
(
DOT_PRODUCT
(
gvec
(:,
kj
,
iintsp
)
+
qssbtj
-
ski
,
tnn
)))
! IF (iintsp.NE.jintsp) cph(kj)=CONJG(cph(kj))
END
DO
END
DO
!---> update overlap and l-diagonal hamiltonian matrix
DO
l
=
0
,
lmax
!+APW
IF
(
PRESENT
(
uds
))
THEN
w1
=
0.5
*
(
uds
(
l
)
*
dus
(
l
)
+
us
(
l
)
*
duds
(
l
)
)
apw_lo1
=
fl2p1
(
l
)
*
0.5
*
rmt
**
2
*
(
gjkiln
*
w1
+
&
fjkiln
*
us
(
l
)
*
dus
(
l
)
)
apw_lo2
=
fl2p1
(
l
)
*
0.5
*
rmt
**
2
*
(
fjkiln
*
w1
+
&
gjkiln
*
uds
(
l
)
*
duds
(
l
)
)
ENDIF
!-APW
fjkiln
=
fj
(
ki
,
l
,
jintsp
)
gjkiln
=
gj
(
ki
,
l
,
jintsp
)
ddnln
=
ddn
(
l
)
elall
=
el
(
l
)
IF
(
l
<=
lnonsph
)
elall
=
elall
-
e_shift
!(isp)
DO
kj
=
1
,
MIN
(
ki
,
nv
(
iintsp
))
fct
=
chi
*
plegend
(
kj
,
l
)
*
fl2p1
(
l
)
*
&
(
fjkiln
*
fj
(
kj
,
l
,
iintsp
)
+
gjkiln
*
gj
(
kj
,
l
,
iintsp
)
*
ddnln
)
smat_data
(
kj
,
kii
)
=
smat_data
(
kj
,
kii
)
+
cph
(
kj
)
*
fct
hmat_data
(
kj
,
kii
)
=
hmat_data
(
kj
,
kii
)
+
cph
(
kj
)
*
(
fct
*
elall
&
+
chi
*
plegend
(
kj
,
l
)
*
fl2p1bt
(
l
)
*
(
fjkiln
*
gj
(
kj
,
l
,
iintsp
)
+
gjkiln
*
fj
(
kj
,
l
,
iintsp
)
)
)
!+APW
IF
(
PRESENT
(
uds
))
THEN
capw1
=
cph
(
kj
)
*
plegend
(
kj
,
l
)&
*
(
apw_lo1
*
fj
(
kj
,
l
,
iintsp
)
+
apw_lo2
*
gj
(
kj
,
l
,
iintsp
)
)
hmat_data
(
kj
,
kii
)
=
hmat_data
(
kj
,
kii
)
+
capw1
ENDIF
!-APW
END
DO
!---> end loop over l
ENDDO
!---> end loop over ki
ENDDO
DEALLOCATE
(
plegend
)