Commit 4219fe5e authored by Uliana Alekseeva's avatar Uliana Alekseeva

ELPA interface adjusted

parent 1a6d1755
@@ -72,7 +72,7 @@ CONTAINS
!Select the solver
SELECT CASE (priv_select_solver(parallel))
CASE (diag_elpa)
!CALL elpa(hmat,smat,ne,eig,ev)
CALL elpa_diag(hmat,smat,ne,eig,ev)
CASE (diag_elemental)
!CALL ELEMENTAL(hmat,smat,ne,eig,ev)
CASE (diag_scalapack)
......
!--------------------------------------------------------------------------------
! Copyright (c) 2016 Peter Grünberg Institut, Forschungszentrum Jülich, Germany
! This file is part of FLEUR and available as free software under the conditions
! of the MIT license as expressed in the LICENSE file in more detail.
!--------------------------------------------------------------------------------
MODULE m_elpa
PRIVATE
#ifdef CPP_ELPA
interface elpa_diag
module procedure elpa_r,elpa_c
end interface elpa_diag
!Module to call elpa library for parallel diagonalization
!uses ssubredist1/2 for redistribution
PUBLIC elpa_diag
CONTAINS
! First the real version of the code
#define CPP_CHOLESKY cholesky_real
#define CPP_invert_trm invert_trm_real
#define CPP_solve_evp_1stage solve_evp_real_1stage
#define CPP_solve_evp solve_evp_real
#define CPP_solve_evp_2stage solve_evp_real_2stage
#define CPP_REALCOMPLEX real
#define CPP_transpose pdtran
#define CPP_ONE 1.0
#define CPP_ZERO 0.0
#ifdef CPP_ELPA_201605004
#define CPP_mult elpa_mult_at_b_real
#else
#define CPP_mult mult_at_b_real
#endif
#define CPP_REAL
SUBROUTINE elpa_r(m, SUB_COMM, a,b, z,eig,num)
!
!----------------------------------------------------
!- Parallel eigensystem solver - driver routine based on chani; dw'12
!
! m ........ actual (=leading) dimension of full a & b matrices
! must be problem size, as input a, b are one-dimensional
! and shall be redistributed to two-dimensional matrices
! actual (=leading) dimension of eigenvector z(,)
! n ........ number of columns of full (sub)matrix ( about m/np)
! SUB_COMM.. communicator for MPI
! a,b .... packed (sub)matrices, here expanded to non-packed
! z,eig .... eigenvectors and values, output
! num ...... number of ev's searched (and found) on this node
! On input, overall number of ev's searched,
! On output, local number of ev's found
!
!----------------------------------------------------
SUBROUTINE elpa_diag(hmat,smat,ne,eig,ev)
!
#include "elpa_cpp.F90"
END SUBROUTINE elpa_r
! Now the complex version
#define CPP_CHOLESKY cholesky_complex
#define CPP_invert_trm invert_trm_complex
#define CPP_solve_evp_1stage solve_evp_complex_1stage
#define CPP_solve_evp solve_evp_complex
#define CPP_solve_evp_2stage solve_evp_complex_2stage
#define CPP_REALCOMPLEX complex
#define CPP_transpose pztranc
#define CPP_ONE cmplx(1.,0.)
#define CPP_ZERO cmplx(0.,0.)
#ifdef CPP_ELPA_201605004
#define CPP_mult mult_ah_b_complex
#else
#define CPP_mult mult_ah_b_complex
#endif
#undef CPP_REAL
SUBROUTINE elpa_c(m, SUB_COMM, a,b, z,eig,num)
!
!----------------------------------------------------
!- Parallel eigensystem solver - driver routine based on chani; dw'12
! Uses ELPA for the actual diagonalization
!
! m ........ actual (=leading) dimension of full a & b matrices
! must be problem size, as input a, b are one-dimensional
! and shall be redistributed to two-dimensional matrices
! actual (=leading) dimension of eigenvector z(,)
! n ........ number of columns of full (sub)matrix ( about m/np)
! SUB_COMM.. communicator for MPI
! a,b .... packed (sub)matrices, here expanded to non-packed
! z,eig .... eigenvectors and values, output
! num ...... number of ev's searched (and found) on this node
!
! hmat ..... Hamiltonian matrix
! smat ..... overlap matrix
! ne ....... number of ev's searched (and found) on this node
! On input, overall number of ev's searched,
! On output, local number of ev's found
! eig ...... eigenvalues, output
! ev ....... eigenvectors, output
!
!----------------------------------------------------
!
#include "elpa_cpp.F90"
END SUBROUTINE elpa_c
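! A minimal call sketch for the adjusted interface (illustrative only; it
! assumes hmat and smat have already been set up as distributed matrices,
! as in the SELECT CASE hunk above):
!
!    ne = ...                        ! total number of eigenvalues wanted
!    CALL elpa_diag(hmat, smat, ne, eig, ev)
!    ! on return ne is the local number of eigenvalues found,
!    ! eig(:) holds their values and ev the corresponding eigenvectors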
SUBROUTINE priv_create_blacsgrid(mpi_subcom,m, np,nb,myid,npcol,nprow,mycol,&
myrow,myrowssca,mycolssca, ictextblacs,sc_desc,mpi_comm_rows,mpi_comm_cols)
#include"cpp_double.h"
USE m_juDFT
USE m_types_mpimat
USE m_types
USE elpa1
#ifdef CPP_ELPA2
USE elpa2
#endif
#ifdef CPP_ELPA_201705003
USE elpa
#endif
IMPLICIT NONE
CLASS(t_mat),INTENT(INOUT) :: hmat,smat
CLASS(t_mat),ALLOCATABLE,INTENT(OUT)::ev
REAL,INTENT(out) :: eig(:)
INTEGER,INTENT(INOUT) :: ne
#ifdef CPP_ELPA
INCLUDE 'mpif.h'
INTEGER,INTENT(IN) :: mpi_subcom
INTEGER,INTENT(IN) :: m
INTEGER,INTENT(OUT):: np,nb, npcol,nprow,myrowssca,mycolssca
INTEGER,INTENT(OUT):: myrow,mycol,myid
INTEGER,INTENT(OUT):: ictextblacs,sc_desc(:)
INTEGER,INTENT(OUT):: mpi_comm_rows,mpi_comm_cols
!... Local variables
!
INTEGER :: num,num2,myrow,mycol
INTEGER :: nb, myid, np
INTEGER :: n_col, n_row
LOGICAL :: ok
INTEGER :: err
INTEGER :: mpi_comm_rows,mpi_comm_cols
INTEGER :: i,k
! large distributed Matrices
REAL,ALLOCATABLE :: eig2(:)
TYPE(t_mpimat) :: ev_dist
REAL,ALLOCATABLE :: tmp2_r(:,:)
COMPLEX,ALLOCATABLE :: tmp2_c(:,:)
INTEGER, EXTERNAL :: numroc, indxl2g !SCALAPACK functions
#ifdef CPP_ELPA_201705003
INTEGER :: kernel
CLASS(elpa_t),pointer :: elpa_obj
INTEGER :: iamblacs,npblacs
INTEGER :: nprow2,npcol2,myrowblacs,mycolblacs
INTEGER :: k,i,j
INTEGER :: ierr
err = elpa_init(20170403)
elpa_obj => elpa_allocate()
#endif
INTEGER,ALLOCATABLE :: iblacsnums(:),ihelp(:),iusermap(:,:)
SELECT TYPE(hmat)
TYPE IS (t_mpimat)
SELECT TYPE(smat)
TYPE IS (t_mpimat)
EXTERNAL descinit, blacs_get
EXTERNAL blacs_pinfo, blacs_gridinit
CALL MPI_BARRIER(hmat%mpi_com,err)
CALL MPI_COMM_RANK(hmat%mpi_com,myid,err)
CALL MPI_COMM_SIZE(hmat%mpi_com,np,err)
myrow = myid/hmat%npcol
mycol = myid -(myid/hmat%npcol)*hmat%npcol
!Determine rank and no of processors
CALL MPI_COMM_RANK(mpi_subcom,myid,ierr)
CALL MPI_COMM_SIZE(mpi_subcom,np,ierr)
!print *,"priv_create_blacsgrid"
! determine block size
!
nb = 64
!IF (m.GT.2048) nb = 30 !2
!IF (m.GT.8*2048) nb = 60 !4
! compute processor grid, as square as possible
! If not square, with more columns than rows
distloop: DO j=INT(SQRT(REAL(np))),1,-1
IF ( (np/j) * j == np) THEN
npcol = np/j
nprow = j
EXIT distloop
ENDIF
ENDDO distloop
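! Example: np=12 gives nprow=3, npcol=4 and np=6 gives a 2x3 grid;
! for a prime np the loop only terminates at j=1, i.e. a 1 x np grid.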
ALLOCATE(iblacsnums(np),ihelp(np),iusermap(nprow,npcol))
! An nprow*npcol processor grid will be created
! Row and column index myrow, mycol of this processor in the grid
! and distribution of A and B in ScaLAPACK
! The local processor will get myrowssca rows and mycolssca columns
! of A and B
!
!Create communicators for ELPA
#if defined (CPP_ELPA_201705003)
mpi_comm_rows = -1
mpi_comm_cols = -1
#elif defined (CPP_ELPA_201605004) || defined (CPP_ELPA_201605003)||defined(CPP_ELPA_NEW)
err=get_elpa_row_col_comms(hmat%mpi_com, myrow, mycol,mpi_comm_rows, mpi_comm_cols)
#else
CALL get_elpa_row_col_comms(hmat%mpi_com, myrow, mycol,mpi_comm_rows, mpi_comm_cols)
#endif
!print *,"creating ELPA comms -- done"
num2=ne !no of states solved for
ALLOCATE ( eig2(hmat%global_size1), stat=err ) ! The eigenvalue array for ScaLAPACK
IF (err.NE.0) CALL juDFT_error('Failed to allocated "eig2"', calledby ='elpa')
CALL ev_dist%init(hmat%l_real,hmat%global_size1,hmat%global_size2,hmat%mpi_com,.TRUE.)! Eigenvectors for ScaLAPACK
IF (err.NE.0) CALL juDFT_error('Failed to allocated "ev_dist"',calledby ='elpa')
IF (hmat%l_real) THEN
ALLOCATE ( tmp2_r(hmat%matsize1,hmat%matsize2), stat=err ) ! tmp_array
ELSE
ALLOCATE ( tmp2_c(hmat%matsize1,hmat%matsize2), stat=err ) ! tmp_array
ENDIF
IF (err.NE.0) CALL juDFT_error('Failed to allocated "tmp2"', calledby ='elpa')
smat%blacs_desc=hmat%blacs_desc
ev_dist%blacs_desc=hmat%blacs_desc
nb=hmat%blacs_desc(5)! Blocking factor
IF (nb.NE.hmat%blacs_desc(6)) CALL judft_error("Different block sizes for rows/columns not supported")
#ifdef CPP_ELPA_201705003
CALL elpa_obj%set("na", hmat%global_size1, err)
CALL elpa_obj%set("nev", num2, err)
CALL elpa_obj%set("local_nrows", hmat%matsize1, err)
CALL elpa_obj%set("local_ncols", hmat%matsize2, err)
CALL elpa_obj%set("nblk", nb, err)
CALL elpa_obj%set("mpi_comm_parent", hmat%mpi_com, err)
CALL elpa_obj%set("process_row", myrow, err)
CALL elpa_obj%set("process_col", mycol, err)
#ifdef CPP_ELPA2
CALL elpa_obj%set("solver", ELPA_SOLVER_2STAGE)
#else
CALL elpa_obj%set("solver", ELPA_SOLVER_1STAGE)
#endif
err = elpa_obj%setup()
if (myid == 0) then
call elpa_obj%get("complex_kernel", kernel)
print *, "elpa uses " // elpa_int_value_to_string("complex_kernel", kernel) // " kernel"
endif
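! (With the 2017.05 API the whole solve is driven through the handle
!  elpa_obj: elpa_init/elpa_allocate above, the set/setup calls here and
!  the cholesky, invert_triangular, hermitian_multiply and eigenvectors
!  methods below; releasing the handle with elpa_deallocate is not part
!  of this hunk.)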
#endif
!print *,"Before elpa"
myrow = myid/npcol ! my row number in the BLACS nprow*npcol grid
mycol = myid -(myid/npcol)*npcol ! my column number in the BLACS nprow*npcol grid
!ELPA -start here
! Solve generalized problem
!
! Now allocate Asca to hold the elements of Achi or of the receive buffer
! 1. Calculate Cholesky factorization of Matrix S = U**T * U
! and invert triangular matrix U
!
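! In matrix form the steps below solve the generalized problem
!        H z = lambda * S z
! by factorizing S = U**T * U (U**H * U in the complex case), forming
!        A = U**-T * H * U**-1,
! solving the standard problem A y = lambda * y with ELPA (step 3) and
! back-transforming the eigenvectors as z = U**-1 * y (step 4).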
myrowssca=(m-1)/(nb*nprow)*nb+ MIN(MAX(m-(m-1)/(nb*nprow)*nb*nprow-nb*myrow,0),nb)
! Number of rows the local process gets in ScaLAPACK distribution
mycolssca=(m-1)/(nb*npcol)*nb+ MIN(MAX(m-(m-1)/(nb*npcol)*nb*npcol-nb*mycol,0),nb)
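! Example (illustrative values): m=10, nb=4, nprow=2 gives myrowssca=6
! on process row 0 (global rows 1-4 and 9-10) and myrowssca=4 on process
! row 1 (rows 5-8); this is the same count numroc(m,nb,myrow,0,nprow)
! returns for the block-cyclic distribution.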
!Get BLACS ranks for all MPI ranks
CALL BLACS_PINFO(iamblacs,npblacs) ! iamblacs = local process rank (e.g. myid)
! npblacs = number of available processes
iblacsnums=-2
ihelp=-2
ihelp(myid+1)=iamblacs ! Get the Blacs id corresponding to the MPI id
!print *,"ALLREDUCE:",mpi_subcom
CALL MPI_ALLREDUCE(ihelp, iblacsnums, np,MPI_INTEGER,MPI_MAX,mpi_subcom,ierr)
IF (ierr.NE.0)CALL juDFT_error('Error in allreduce for BLACS nums' ,calledby='elpa')
! iblacsnums(i) is the BLACS-process number of MPI-process i-1
k = 1
DO i = 1, nprow
DO j = 1, npcol
iusermap(i,j) = iblacsnums(k)
k = k + 1
ENDDO
ENDDO
! Please note: cholesky_complex/invert_trm_complex are not trimmed for speed.
! The only reason for having them is that the ScaLAPACK counterpart
! PDPOTRF very often fails on higher processor numbers for unknown reasons!
#if defined(CPP_ELPA_201705003)
IF (hmat%l_real) THEN
CALL elpa_obj%cholesky(smat%data_r, err)
CALL elpa_obj%invert_triangular(smat%data_r, err)
ELSE
CALL elpa_obj%cholesky(smat%data_c, err)
CALL elpa_obj%invert_triangular(smat%data_c, err)
ENDIF
#elif defined(CPP_ELPA_201605003) || defined(CPP_ELPA_201605004)
IF (hmat%l_real) THEN
ok=cholesky_real(smat%global_size1,smat%data_r,smat%matsize1,nb,smat%matsize2,mpi_comm_rows,mpi_comm_cols,.false.)
ok=invert_trm_real(smat%global_size1,smat%data_r,smat%matsize1,nb,smat%matsize2,mpi_comm_rows,mpi_comm_cols,.false.)
ELSE
ok=cholesky_complex(smat%global_size1,smat%data_c,smat%matsize1,nb,smat%matsize2,mpi_comm_rows,mpi_comm_cols,.false.)
ok=invert_trm_complex(smat%global_size1,smat%data_c,smat%matsize1,nb,smat%matsize2,mpi_comm_rows,mpi_comm_cols,.false.)
ENDIF
#elif defined CPP_ELPA_NEW
IF (hmat%l_real) THEN
CALL cholesky_real(smat%global_size1,smat%data_r,smat%matsize1,nb,smat%matsize2,mpi_comm_rows,mpi_comm_cols,.false., ok)
CALL invert_trm_real(smat%global_size1,smat%data_r,smat%matsize1,nb,smat%matsize2,mpi_comm_rows,mpi_comm_cols,.false., ok)
ELSE
CALL cholesky_complex(smat%global_size1,smat%data_c,smat%matsize1,nb,smat%matsize2,mpi_comm_rows,mpi_comm_cols,.false.,ok)
CALL invert_trm_complex(smat%global_size1,smat%data_c,smat%matsize1,nb,smat%matsize2,mpi_comm_rows,mpi_comm_cols,.false.,ok)
ENDIF
#else
IF (hmat%l_real) THEN
CALL cholesky_real(smat%global_size1,smat%data_r,smat%matsize1,nb,mpi_comm_rows,mpi_comm_cols,.false.)
CALL invert_trm_real(smat%global_size1,smat%data_r,smat%matsize1,nb,mpi_comm_rows,mpi_comm_cols,.false.)
ELSE
CALL cholesky_complex(smat%global_size1,smat%data_c,smat%matsize1,nb,smat%matsize2,mpi_comm_rows,mpi_comm_cols,.false.)
CALL invert_trm_complex(smat%global_size1,smat%data_c,smat%matsize1,nb,smat%matsize2,mpi_comm_rows,mpi_comm_cols,.false.)
ENDIF
#endif
! 2. Calculate U**-T * H * U**-1
! 2a. ev_dist = U**-T * H
! H is only set in the upper half, solve_evp_real needs a full matrix
! Set lower half from upper half
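! (indxl2g maps local column i to its global index n_col; numroc then
!  counts how many of the first n_col global rows live on this process
!  row, so local rows n_row+1:matsize1 of that column lie strictly below
!  the diagonal and may be overwritten.)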
DO i=1,hmat%matsize2
! Get global column corresponding to i and number of local rows up to
! and including the diagonal, these are unchanged in H
n_col = indxl2g(i, nb, mycol, 0, hmat%npcol)
n_row = numroc (n_col, nb, myrow, 0, hmat%nprow)
IF (hmat%l_real) THEN
hmat%data_r(n_row+1:hmat%matsize1,i) = 0.d0
ELSE
hmat%data_c(n_row+1:hmat%matsize1,i) = 0.d0
ENDIF
ENDDO
!Get the Blacs default context
CALL BLACS_GET(0,0,ictextblacs)
! Create the Grid
CALL BLACS_GRIDMAP(ictextblacs,iusermap,nprow,nprow,npcol)
! Now check whether the BLACS grid is the one we wanted
CALL BLACS_GRIDINFO(ictextblacs, nprow2,npcol2,myrowblacs,mycolblacs)
IF (nprow2 /= nprow) THEN
WRITE(6,*) 'Wrong number of rows in BLACS grid'
WRITE(6,*) 'nprow=',nprow,' nprow2=',nprow2
CALL juDFT_error('Wrong number of rows in BLACS grid',calledby= 'elpa')
ENDIF
IF (hmat%l_real) THEN
CALL pdtran(hmat%global_size1,hmat%global_size1,1.d0,hmat%data_r,1,1,&
hmat%blacs_desc,0.d0,ev_dist%data_r,1,1,ev_dist%blacs_desc)
ELSE
CALL pztranc(hmat%global_size1,hmat%global_size2,cmplx(1.d0,0.d0),hmat%data_c,1,1,&
hmat%blacs_desc,cmplx(0.d0,0.d0),ev_dist%data_c,1,1,ev_dist%blacs_desc)
ENDIF
IF (npcol2 /= npcol) THEN
WRITE(6,*) 'Wrong number of columns in BLACS grid'
WRITE(6,*) 'npcol=',npcol,' npcol2=',npcol2
CALL juDFT_error('Wrong number of columns in BLACS grid', calledby ='elpa')
ENDIF
DO i=1,hmat%matsize2
! Get global column corresponding to i and number of local rows up to
! and including the diagonal, these are unchanged in H
n_col = indxl2g(i, nb, mycol, 0, hmat%npcol)
n_row = numroc (n_col, nb, myrow, 0, hmat%nprow)
IF (hmat%l_real) THEN
hmat%data_r(n_row+1:hmat%matsize1,i) = ev_dist%data_r(n_row+1:ev_dist%matsize1,i)
ELSE
hmat%data_c(n_row+1:hmat%matsize1,i) = ev_dist%data_c(n_row+1:ev_dist%matsize1,i)
ENDIF
ENDDO
#if defined (CPP_ELPA_201705003)
IF (hmat%l_real) THEN
CALL elpa_obj%hermitian_multiply('U','L',hmat%global_size1,smat%data_r,hmat%data_r,&
smat%matsize1,smat%matsize2,ev_dist%data_r,hmat%matsize1,hmat%matsize2,err)
ELSE
CALL elpa_obj%hermitian_multiply('U','L',hmat%global_size1,smat%data_c,hmat%data_c,&
smat%matsize1,smat%matsize2,ev_dist%data_c,hmat%matsize1,hmat%matsize2,err)
ENDIF
#elif defined (CPP_ELPA_201605004)
IF (hmat%l_real) THEN
ok=elpa_mult_at_b_real('U', 'L',hmat%global_size1,hmat%global_size1,smat%data_r,smat%matsize1,smat%matsize2,&
hmat%data_r,hmat%matsize1,hmat%matsize2,nb,mpi_comm_rows, mpi_comm_cols,&
ev_dist%data_r,ev_dist%matsize1,ev_dist%matsize2)
ELSE
ok=mult_ah_b_complex('U', 'L',hmat%global_size1,hmat%global_size1,smat%data_c,smat%matsize1,smat%matsize2,&
hmat%data_c,hmat%matsize1,hmat%matsize2,nb,mpi_comm_rows, mpi_comm_cols,&
ev_dist%data_c,ev_dist%matsize1,ev_dist%matsize2)
ENDIF
#elif defined (CPP_ELPA_201605003)
IF (hmat%l_real) THEN
ok=mult_at_b_real('U', 'L',hmat%global_size1,hmat%global_size1,smat%data_r,smat%matsize1,&
hmat%data_r,hmat%matsize1,nb,mpi_comm_rows, mpi_comm_cols,ev_dist%data_r,ev_dist%matsize1)
ELSE
ok=mult_ah_b_complex('U', 'L',hmat%global_size1,hmat%global_size1,smat%data_c,smat%matsize1,&
hmat%data_c,hmat%matsize1,nb,mpi_comm_rows, mpi_comm_cols,ev_dist%data_c,ev_dist%matsize1)
ENDIF
#else
IF (hmat%l_real) THEN
CALL mult_at_b_real('U', 'L',hmat%global_size1,hmat%global_size1,smat%data_r,smat%matsize1,&
hmat%data_r,hmat%matsize1,nb,mpi_comm_rows, mpi_comm_cols,ev_dist%data_r,ev_dist%matsize1)
ELSE
CALL mult_ah_b_complex('U', 'L',hmat%global_size1,hmat%global_size1,smat%data_c,smat%matsize1,&
hmat%data_c,hmat%matsize1,nb,mpi_comm_rows, mpi_comm_cols,ev_dist%data_c,ev_dist%matsize1)
ENDIF
#endif
!Create communicators for ELPA
!print *,"creating ELPA comms"
! 2b. tmp2 = ev_dist**T ( = (U**-T * H)**T )
IF (hmat%l_real) THEN
CALL pdtran(ev_dist%global_size1,ev_dist%global_size1,1.d0,ev_dist%data_r,1,1,&
ev_dist%blacs_desc,0.d0,tmp2_r,1,1,ev_dist%blacs_desc)
ELSE
CALL pztranc(ev_dist%global_size1,ev_dist%global_size1,cmplx(1.0,0.0),ev_dist%data_c,1,1,&
ev_dist%blacs_desc,cmplx(0.d0,0.d0),tmp2_c,1,1,ev_dist%blacs_desc)
ENDIF
! 2c. A = U**-T * tmp2 ( = U**-T * Aorig * U**-1 )
#if defined (CPP_ELPA_201705003)
mpi_comm_rows = -1
mpi_comm_cols = -1
#elif defined (CPP_ELPA_201605004) || defined (CPP_ELPA_201605003)||defined(CPP_ELPA_NEW)
ierr=get_elpa_row_col_comms(mpi_subcom, myrowblacs, mycolblacs,mpi_comm_rows, mpi_comm_cols)
IF (hmat%l_real) THEN
CALL elpa_obj%hermitian_multiply('U','U',smat%global_size1,smat%data_r,tmp2_r,&
smat%matsize1,smat%matsize2,hmat%data_r,hmat%matsize1,hmat%matsize2,err)
ELSE
CALL elpa_obj%hermitian_multiply('U','U',smat%global_size1,smat%data_c,tmp2_c,&
smat%matsize1,smat%matsize2,hmat%data_c,hmat%matsize1,hmat%matsize2,err)
ENDIF
#elif defined (CPP_ELPA_201605004)
IF (hmat%l_real) THEN
ok=elpa_mult_at_b_real('U', 'U',smat%global_size1,smat%global_size1,smat%data_r,smat%matsize1,smat%matsize2,&
tmp2_r,SIZE(tmp2_r,1),SIZE(tmp2_r,2),nb,mpi_comm_rows, mpi_comm_cols,&
hmat%data_r,hmat%matsize1,hmat%matsize2)
ELSE
ok=mult_ah_b_complex('U', 'U',smat%global_size1,smat%global_size1,smat%data_c,smat%matsize1,smat%matsize2,&
tmp2_c,SIZE(tmp2_c,1),SIZE(tmp2_c,2),nb,mpi_comm_rows, mpi_comm_cols,&
hmat%data_c,hmat%matsize1,hmat%matsize2)
ENDIF
#elif defined (CPP_ELPA_201605003)
IF (hmat%l_real) THEN
ok=mult_at_b_real('U', 'U',smat%global_size1,smat%global_size1,smat%data_r,smat%matsize1,&
tmp2_r,SIZE(tmp2_r,1),nb,mpi_comm_rows, mpi_comm_cols,hmat%data_r,hmat%matsize1)
ELSE
ok=mult_ah_b_complex('U', 'U',smat%global_size1,smat%global_size1,smat%data_c,smat%matsize1,&
tmp2_c,SIZE(tmp2_c,1),nb,mpi_comm_rows, mpi_comm_cols,hmat%data_c,hmat%matsize1)
ENDIF
#else
CALL get_elpa_row_col_comms(mpi_subcom, myrowblacs, mycolblacs,mpi_comm_rows, mpi_comm_cols)
IF (hmat%l_real) THEN
CALL mult_at_b_real('U', 'U',smat%global_size1,smat%global_size1,smat%data_r,smat%matsize1,&
tmp2_r,SIZE(tmp2_r,1),nb,mpi_comm_rows, mpi_comm_cols,hmat%data_r,hmat%matsize1)
ELSE
CALL mult_ah_b_complex('U', 'U',smat%global_size1,smat%global_size1,smat%data_c,smat%matsize1,&
tmp2_c,SIZE(tmp2_c,1),nb,mpi_comm_rows, mpi_comm_cols,hmat%data_c,hmat%matsize1)
ENDIF
#endif
!print *,"creating ELPA comms -- done"
!Create the descriptors
CALL descinit(sc_desc,m,m,nb,nb,0,0,ictextblacs,myrowssca,ierr)
IF (ierr /=0 ) CALL juDFT_error('descinit1 failed',calledby='elpa')
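! (descinit arguments: global size m x m, row/column block size nb,
!  source process coordinates (0,0), the BLACS context and the local
!  leading dimension myrowssca; a non-zero ierr flags an illegal
!  argument.)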
! A is only set in the upper half, solve_evp_real needs a full matrix
! Set lower half from upper half
IF (hmat%l_real) THEN
CALL pdtran(hmat%global_size1,hmat%global_size1,1.d0,hmat%data_r,1,1,&
hmat%blacs_desc,0.d0,ev_dist%data_r,1,1,ev_dist%blacs_desc)
ELSE
CALL pztranc(hmat%global_size1,hmat%global_size1,cmplx(1.0,0.0),hmat%data_c,1,1,&
hmat%blacs_desc,cmplx(0.d0,0.d0),ev_dist%data_c,1,1,ev_dist%blacs_desc)
ENDIF
DO i=1,hmat%matsize2
! Get global column corresponding to i and number of local rows up to
! and including the diagonal, these are unchanged in A
n_col = indxl2g(i, nb, mycol, 0, hmat%npcol)
n_row = numroc (n_col, nb, myrow, 0, hmat%nprow)
IF (hmat%l_real) THEN
hmat%data_r(n_row+1:hmat%matsize1,i) = ev_dist%data_r(n_row+1:ev_dist%matsize1,i)
ELSE
hmat%data_c(n_row+1:hmat%matsize1,i) = ev_dist%data_c(n_row+1:ev_dist%matsize1,i)
ENDIF
ENDDO
END SUBROUTINE priv_create_blacsgrid
! 3. Calculate eigenvalues/eigenvectors of U**-T * A * U**-1
! Eigenvectors go to eigvec
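! (num2 = ne eigenpairs with the lowest eigenvalues of the transformed
!  standard problem are requested; the eigenvalues are returned in eig2
!  and the distributed eigenvectors in ev_dist.)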
#if defined (CPP_ELPA_201705003)
IF (hmat%l_real) THEN
CALL elpa_obj%eigenvectors(hmat%data_r, eig2, ev_dist%data_r, err)
ELSE
CALL elpa_obj%eigenvectors(hmat%data_c, eig2, ev_dist%data_c, err)
ENDIF
#elif defined(CPP_ELPA_201605003) || defined(CPP_ELPA_201605004)
#ifdef CPP_ELPA2
IF (hmat%l_real) THEN
ok=solve_evp_real_2stage(hmat%global_size1,num2,hmat%data_r,hmat%matsize1,&
eig2,ev_dist%data_r,ev_dist%matsize1, nb,ev_dist%matsize2, mpi_comm_rows, mpi_comm_cols,hmat%mpi_com)
ELSE
ok=solve_evp_complex_2stage(hmat%global_size1,num2,hmat%data_c,hmat%matsize1,&
eig2,ev_dist%data_c,ev_dist%matsize1, nb,ev_dist%matsize2, mpi_comm_rows, mpi_comm_cols,hmat%mpi_com)
ENDIF
#else
LOGICAL :: elpa_used=.false.
IF (hmat%l_real) THEN
ok=solve_evp_real_1stage(hmat%global_size1,num2,hmat%data_r,hmat%matsize1,&
eig2,ev_dist%data_r,ev_dist%matsize1, nb,ev_dist%matsize2, mpi_comm_rows, mpi_comm_cols)
ELSE
ok=solve_evp_complex_1stage(hmat%global_size1,num2,hmat%data_c,hmat%matsize1,&
eig2,ev_dist%data_c,ev_dist%matsize1, nb,ev_dist%matsize2, mpi_comm_rows, mpi_comm_cols)
ENDIF
#endif
end MODULE m_elpa
#elif defined CPP_ELPA_NEW
#ifdef CPP_ELPA2
IF (hmat%l_real) THEN
err=solve_evp_real_2stage(hmat%global_size1,num2,hmat%data_r,hmat%matsize1,&
eig2,ev_dist%data_r,ev_dist%matsize1, nb,ev_dist%matsize2, mpi_comm_rows, mpi_comm_cols,hmat%mpi_com)
ELSE
err=solve_evp_complex_2stage(hmat%global_size1,num2,hmat%data_c,hmat%matsize1,&
eig2,ev_dist%data_c,ev_dist%matsize1, nb,ev_dist%matsize2, mpi_comm_rows, mpi_comm_cols,hmat%mpi_com)
ENDIF
#else
IF (hmat%l_real) THEN
err=solve_evp_real_1stage(hmat%global_size1,num2,hmat%data_r,hmat%matsize1,&
eig2,ev_dist%data_r,ev_dist%matsize1, nb,ev_dist%matsize2, mpi_comm_rows, mpi_comm_cols)
ELSE
err=solve_evp_complex_1stage(hmat%global_size1,num2,hmat%data_c,hmat%matsize1,&
eig2,ev_dist%data_c,ev_dist%matsize1, nb,ev_dist%matsize2, mpi_comm_rows, mpi_comm_cols)
ENDIF
#endif
#else
#ifdef CPP_ELPA2
IF (hmat%l_real) THEN
CALL solve_evp_real_2stage(hmat%global_size1,num2,hmat%data_r,hmat%matsize1,&
eig2,ev_dist%data_r,ev_dist%matsize1, nb, mpi_comm_rows, mpi_comm_cols,hmat%mpi_com)
ELSE
CALL solve_evp_complex_2stage(hmat%global_size1,num2,hmat%data_c,hmat%matsize1,&
eig2,ev_dist%data_c,ev_dist%matsize1, nb, mpi_comm_rows, mpi_comm_cols,hmat%mpi_com)
ENDIF
#else
IF (hmat%l_real) THEN
CALL solve_evp_real(hmat%global_size1,num2,hmat%data_r,hmat%matsize1,&
eig2,ev_dist%data_r,ev_dist%matsize1, nb,mpi_comm_rows, mpi_comm_cols)
ELSE
CALL solve_evp_complex(hmat%global_size1,num2,hmat%data_c,hmat%matsize1,&
eig2,ev_dist%data_c,ev_dist%matsize1, nb,mpi_comm_rows, mpi_comm_cols)
ENDIF
#endif
#endif
! 4. Backtransform eigenvectors: Z = U**-1 * eigvec
! mult_ah_b_complex needs the transpose of U**-1, thus tmp2 = (U**-1)**T
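! (The multiply routines below apply the (conjugate) transpose of their
!  first matrix argument, so passing tmp2 = (U**-1)**T gives
!  tmp2**T * eigvec = U**-1 * eigvec, i.e. the back-transformed
!  eigenvectors of the original generalized problem.)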
IF (hmat%l_real) THEN
CALL pdtran(smat%global_size1,smat%global_size1,1.d0,smat%data_r,1,1,&
smat%blacs_desc,0.d0,tmp2_r,1,1,smat%blacs_desc)
ELSE
CALL pztranc(smat%global_size1,smat%global_size1,cmplx(1.d0,0.d0),smat%data_c,1,1,&
smat%blacs_desc,cmplx(0.d0,0.d0),tmp2_c,1,1,smat%blacs_desc)
ENDIF
#if defined (CPP_ELPA_201705003)
IF (hmat%l_real) THEN
CALL elpa_obj%hermitian_multiply('L','N',num2,tmp2_r,ev_dist%data_r,&
ev_dist%matsize1,ev_dist%matsize2,hmat%data_r,hmat%matsize1,hmat%matsize2,err)
ELSE