elpa_cpp.F90 7.44 KB
Newer Older
1 2 3 4 5 6 7
!--------------------------------------------------------------------------------
! Copyright (c) 2016 Peter Grünberg Institut, Forschungszentrum Jülich, Germany
! This file is part of FLEUR and available as free software under the conditions
! of the MIT license as expressed in the LICENSE file in more detail.
!--------------------------------------------------------------------------------
    USE m_juDFT
    USE elpa1
8 9
    USE m_subredist1
    USE m_subredist2
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
#ifdef CPP_ELPA2
    USE elpa2
#endif
    IMPLICIT NONE
    INCLUDE 'mpif.h'

    INTEGER, INTENT (IN)                  :: m,n
    INTEGER, INTENT (IN)                  :: SUB_COMM
    INTEGER, INTENT (INOUT)               :: num
    REAL,    INTENT   (OUT)               :: eig(:)
    CPP_REALCOMPLEX, ALLOCATABLE, INTENT (INOUT)  :: a(:),b(:)
    CPP_REALCOMPLEX, INTENT   (OUT)               :: z(:,:)
    !

    !...  Local variables
    !
    INTEGER           :: sc_desc(9) !SCALAPACK DESCRIPTOR
    INTEGER           :: np,nb,myid,npcol,nprow,mycol,myrow,myrowssca
    INTEGER           :: mycolssca,ictextblacs,mpi_comm_rows
    INTEGER           :: mpi_comm_cols
    INTEGER           :: num2

    LOGICAL           :: ok
    INTEGER           :: err,ierr
    INTEGER           :: i,k,n_col,n_row
    ! large distributed Matrices
    REAL,ALLOCATABLE     :: eig2(:)
    CPP_REALCOMPLEX, ALLOCATABLE :: asca(:,:), bsca(:,:),tmp2(:,:)
    CPP_REALCOMPLEX, ALLOCATABLE :: eigvec(:,:)
    INTEGER, EXTERNAL :: numroc, indxl2g  !SCALAPACK functions


    CALL priv_create_blacsgrid(sub_comm,m, np,nb,myid,npcol,nprow,mycol,myrow,myrowssca,mycolssca,&
         ictextblacs,sc_desc,mpi_comm_rows,mpi_comm_cols)

    !     Number of columns the local process gets in ScaLAPACK distribution
    ALLOCATE ( asca(myrowssca,mycolssca), stat=err )
    IF (err/=0) CALL juDFT_error("allocate asca", calledby="chani_elpa")
    asca=0.0

    ALLOCATE ( bsca(myrowssca,mycolssca), stat=err  )
    IF (err/=0) CALL juDFT_error("allocate bsca", calledby="chani_elpa")
    bsca=0.0
    !
    ! transfer to ScaLAPACK block cyclic 2d distribution with
    ! block-size nb
    !
    !print *,"Before subredist"
58 59 60 61 62 63 64
#ifdef CPP_REAL
    CALL subredist1(m,myrowssca,SUB_COMM, nprow, npcol, myid, ierr, nb, achi_r=a, asca_r=asca )
    CALL subredist1(m,myrowssca,SUB_COMM, nprow, npcol, myid, ierr, nb, achi_r=b, asca_r=bsca)
#else
    CALL subredist1(m,myrowssca,SUB_COMM, nprow, npcol, myid, ierr, nb, achi_c=a, asca_c=asca )
    CALL subredist1(m,myrowssca,SUB_COMM, nprow, npcol, myid, ierr, nb, achi_c=b, asca_c=bsca)
#endif
65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186
    ! for saving memory one might deallocate(a,b)
    !print *,"Before Allocate"
    ALLOCATE ( eig2(m), stat=err ) ! The eigenvalue array for ScaLAPACK
    IF (err.NE.0) CALL juDFT_error('Failed to allocated "eig2"', calledby ='elpa')


    ALLOCATE ( eigvec(myrowssca,mycolssca), stat=err ) ! Eigenvectors for ScaLAPACK
    IF (err.NE.0) CALL juDFT_error('Failed to allocated "eigvec"',calledby ='elpa')

    ALLOCATE ( tmp2(myrowssca,mycolssca), stat=err ) ! tmp_array
    IF (err.NE.0) CALL juDFT_error('Failed to allocated "tmp2"', calledby ='elpa')
    !print *,"Before elpa"

    !ELPA -start here
    ! Solve generalized problem
    !
    ! 1. Calculate Cholesky factorization of Matrix B = U**T * U
    !    and invert triangular matrix U
    !
    ! Please note: cholesky_complex/invert_trm_complex are not trimmed for speed.
    ! The only reason having them is that the Scalapack counterpart
    ! PDPOTRF very often fails on higher processor numbers for unknown reasons!
#ifdef CPP_ELPA_NEW
    CALL CPP_CHOLESKY (m,bsca,SIZE(bsca,1),nb,mycolssca,mpi_comm_rows,mpi_comm_cols,.false.,ok)
    CALL CPP_invert_trm(m,bsca,SIZE(bsca,1),nb,mycolssca,mpi_comm_rows,mpi_comm_cols,.false.,ok)
#else
    CALL CPP_CHOLESKY (m,bsca,SIZE(bsca,1),nb, mpi_comm_rows,mpi_comm_cols)
    CALL CPP_invert_trm(m, bsca, SIZE(bsca,1), nb, mpi_comm_rows, mpi_comm_cols)
#endif

    ! 2. Calculate U**-T * A * U**-1

    ! 2a. eigvec = U**-T * A

    ! A is only set in the upper half, solve_evp_real needs a full matrix
    ! Set lower half from upper half
    CALL CPP_transpose (m,m,CPP_ONE,asca,1,1,sc_desc, CPP_ZERO,eigvec,1,1,sc_desc)
    DO i=1,SIZE(asca,1)
       ! Get global column corresponding to i and number of local rows up to
       ! and including the diagonal, these are unchanged in A
       n_col = indxl2g(i,     nb, mycol, 0, npcol)
       n_row = numroc (n_col, nb, myrow, 0, nprow)
       asca(n_row+1:myrowssca,i) = eigvec(n_row+1:myrowssca,i)
    ENDDO

    CALL CPP_mult ('U', 'L',m, m,bsca,myrowssca,asca,SIZE(asca,1),nb, mpi_comm_rows, mpi_comm_cols,eigvec,myrowssca)

    ! 2b. tmp2 = eigvec**T
    CALL CPP_transpose(m,m,CPP_ONE,eigvec,1,1,sc_desc,CPP_ZERO,tmp2,1,1,sc_desc)

    ! 2c. A =  U**-T * tmp2 ( = U**-T * Aorig * U**-1 )
    CALL CPP_mult ('U', 'U', m, m, bsca, SIZE(bsca,1), tmp2,&
         SIZE(tmp2,1),nb, mpi_comm_rows, mpi_comm_cols, asca, SIZE(asca,1))

    ! A is only set in the upper half, solve_evp_real needs a full matrix
    ! Set lower half from upper half

    CALL CPP_transpose(m,m,CPP_ONE,asca,1,1,sc_desc, CPP_ZERO,eigvec,1,1,sc_desc)


    DO i=1,SIZE(asca,1)
       ! Get global column corresponding to i and number of local rows up to
       ! and including the diagonal, these are unchanged in A
       n_col = indxl2g(i,     nb, mycol, 0, npcol)
       n_row = numroc (n_col, nb, myrow, 0, nprow)
       asca(n_row+1:myrowssca,i) = eigvec(n_row+1:myrowssca,i)
    ENDDO

    ! 3. Calculate eigenvalues/eigenvectors of U**-T * A * U**-1
    !    Eigenvectors go to eigvec
    num2=num
#ifdef CPP_ELPA_NEW
#ifdef CPP_ELPA2
    err=CPP_solve_evp_2stage(m,num2,asca,SIZE(asca,1),&
         eig2,eigvec,SIZE(asca,1), nb,mycolssca, mpi_comm_rows, mpi_comm_cols,sub_comm)
#else
    err=CPP_solve_evp(m, num2,asca,SIZE(asca,1),&
         eig2,eigvec, SIZE(asca,1), nb,mpi_comm_rows, mpi_comm_cols)
#endif
#else
#ifdef CPP_ELPA2
    CALL CPP_solve_evp_2stage(m,&
         num2,asca,SIZE(asca,1),eig2,eigvec,SIZE(asca,1), nb,mpi_comm_rows, mpi_comm_cols,sub_comm)
#else
    CALL CPP_solve_evp(m, num2,&
         asca,SIZE(asca,1),eig2,eigvec, SIZE(asca,1), nb,mpi_comm_rows, mpi_comm_cols)
#endif
#endif

    ! 4. Backtransform eigenvectors: Z = U**-1 * eigvec

    ! mult_ah_b_complex needs the transpose of U**-1, thus tmp2 = (U**-1)**T
    CALL CPP_transpose(m,m,CPP_ONE,bsca,1,1,sc_desc,CPP_ZERO,tmp2,1,1,sc_desc)

    CALL CPP_mult ('L', 'N',m, num2, tmp2, SIZE(asca,1),&
         eigvec, SIZE(asca,1),nb,mpi_comm_rows, mpi_comm_cols, asca, SIZE(asca,1))

    ! END of ELPA stuff
    CALL BLACS_GRIDEXIT(ictextblacs,ierr)
    CALL MPI_COMM_FREE(mpi_comm_rows,ierr)
    CALL MPI_COMM_FREE(mpi_comm_cols,ierr)
    !print *,"elpa done"

    !
    !     Put those eigenvalues expected by chani to eig, i.e. for
    !     process i these are eigenvalues i+1, np+i+1, 2*np+i+1...
    !     Only num=num2/np eigenvalues per process
    !
    num=FLOOR(REAL(num2)/np)
    IF (myid.LT.num2-(num2/np)*np) num=num+1
    k=1
    DO i=myid+1,num2,np
       eig(k)=eig2(i)
       k=k+1
    ENDDO

    !write(*,"(a,i5,99f10.4)") "ELPA:",myid,eig

    !
    !     Redistribute eigvec from ScaLAPACK distribution to each process
    !     having all eigenvectors corresponding to his eigenvalues as above
    !
187 188 189 190 191
#ifdef CPP_REAL
    CALL subredist2(m,num2,myrowssca,SUB_COMM,nprow,npcol, myid,ierr,nb,z,asca)
#else
    CALL subredist2(m,num2,myrowssca,SUB_COMM,nprow,npcol, myid,ierr,nb,achi_c=z,asca_c=asca)
#endif
192 193 194 195 196 197
    !

    DEALLOCATE ( asca )
    DEALLOCATE ( bsca )
    DEALLOCATE (eigvec, tmp2, eig2)