Skip to content
Snippets Groups Projects
Commit e830b206 authored by Philipp Rüssmann's avatar Philipp Rüssmann
Browse files

Cleanup of rhoin and add test scripts

parent 5df906d4
No related branches found
No related tags found
No related merge requests found
......@@ -101,7 +101,13 @@ contains
complex (kind=dp) :: efac1, efac2, ffz, gmatl, ppz, v1, v2
real (kind=dp) :: c0ll
integer :: i, ir, j, j0, j1, l, l1, l2, lm1, lm2, lm3, lm3max, ln2, ln3, m
complex (kind=dp) :: vr(lmmaxd, lmmaxd), wf(irmd, 0:lmaxd, 0:lmaxd), wr(lmmaxd, lmmaxd), ar2(lmmaxd**2), vr2(lmmaxd**2)
complex (kind=dp) :: vr(lmmaxd, lmmaxd), wf(irmd, 0:lmaxd, 0:lmaxd), wr(lmmaxd, lmmaxd)
#ifdef __GFORTRAN__
! for the gfortran compiler zdotu leads to a segfault, then using
! dot_product gives the correct result without segfault, this needs these
! auxiliary arrays
complex (kind=dp) :: ar2(lmmaxd**2), vr2(lmmaxd**2)
#endif
complex (kind=dp), external :: zdotu
! C0LL = 1/sqrt(4*pi)
......@@ -113,7 +119,6 @@ contains
! use first vr
wr = czero
vr = czero
ar2 = czero
do lm2 = 1, lmmaxd
ln2 = lm2
v2 = efac(lm2)*efac(lm2)*gmat(ln2, ln2)
......@@ -135,23 +140,25 @@ contains
end do
end do
#ifdef __GFORTRAN__
ar2 = reshape(ar, [lmmaxd**2])
vr2 = reshape(vr, [lmmaxd**2])
#endif
do lm1 = 1, lmmaxd
efac1 = efac(lm1)
!write(*,*) lm1, lmmaxd, shape(ar2), shape(vr), efac1
!write(*,*) ar2(lm1,:)
!write(*,*) vr(lm1,:)
!wr(lm1, lm1) = zdotu(lmmaxd, ar(lm1,1), lmmaxd, vr(lm1,1), lmmaxd)/(efac1*efac1) ! this seems to work with the intel compiler
wr(lm1, lm1) = dot_product(conjg(ar2(lm1::lmmaxd)), vr2(lm1::lmmaxd))/(efac1*efac1) ! this works with conjugation!!!
!wr(lm1, lm1) = zdotu(lmmaxd, ar2(lm1), lmmaxd, vr2(lm1), lmmaxd)/(efac1*efac1)
!wr(lm1, lm1) = zdotu(lmmaxd, ar(lm1,1), lmmaxd, vr(lm1,1), lmmaxd)/(efac1*efac1)
#ifdef __GFORTRAN__
wr(lm1, lm1) = dot_product(conjg(ar2(lm1::lmmaxd)), vr2(lm1::lmmaxd))/(efac1*efac1) ! this works only with conjugation due to definition of dot_product
#else
wr(lm1, lm1) = zdotu(lmmaxd, ar(lm1,1), lmmaxd, vr(lm1,1), lmmaxd)/(efac1*efac1) ! this works with the intel compiler
#endif
do lm2 = 1, lm1 - 1
! ---> using symmetry of gaunt coeffients
efac2 = efac(lm2)
!wr(lm1, lm2) = ( zdotu(lmmaxd,ar(lm1,1),lmmaxd,vr(lm2,1),lmmaxd) + zdotu(lmmaxd,ar(lm2,1),lmmaxd,vr(lm1,1),lmmaxd) )/(efac1*efac2) ! this seems to work with the intel compiler
wr(lm1, lm2) = ( dot_product(conjg(ar2(lm1::lmmaxd)),vr2(lm2::lmmaxd)) + dot_product(conjg(ar2(lm2::lmmaxd)),vr2(lm1::lmmaxd)) )/(efac1*efac2) ! this works with conjugation!!!
!wr(lm1, lm2) = ( zdotu(lmmaxd,ar2(lm1),lmmaxd,vr2(lm2),lmmaxd) + zdotu(lmmaxd,ar2(lm2),lmmaxd,vr2(lm1),lmmaxd) )/(efac1*efac2)
#ifdef __GFORTRAN__
wr(lm1, lm2) = ( dot_product(conjg(ar2(lm1::lmmaxd)),vr2(lm2::lmmaxd)) + dot_product(conjg(ar2(lm2::lmmaxd)),vr2(lm1::lmmaxd)) )/(efac1*efac2) ! this works with gfortran
#else
wr(lm1, lm2) = ( zdotu(lmmaxd,ar(lm1,1),lmmaxd,vr(lm2,1),lmmaxd) + zdotu(lmmaxd,ar(lm2,1),lmmaxd,vr(lm1,1),lmmaxd) )/(efac1*efac2) ! this works with the intel compiler
#endif
end do
end do
......
#ifort -mkl -O3 -xHost -vec-report test.f90 && ./a.out | tee out
ifort -mkl -O3 -xHost -qopt-report=5 test_dot_prod.f90
export OMP_NUM_THREADS=1; ./a.out | tee out
ifort -mkl=parallel -qopenmp -O3 -xHost test_dot_prod.f90
export OMP_NUM_THREADS=4; ./a.out | tee out2
./plot_times.py
ifort -mkl -O3 -xHost -qopt-report=5 test_matmul.f90
export OMP_NUM_THREADS=1; ./a.out | tee out1
ifort -mkl=parallel -qopenmp -O3 -xHost test_matmul.f90
export OMP_NUM_THREADS=4; ./a.out | tee out12
./plot_times2.py
! this is a test program to compare the time a call of dot_product takes
! compared to a direct implementation of the loop or lapacks zdotu
! this kind of loop structure appears in rhoin
program test
implicit none
integer, parameter :: m=10000
integer :: i, j, n, k
integer :: clock_rate
integer :: start_time
integer :: stop_time
real*8 :: timing, timing2, timing3
real*8, allocatable :: tmp(:,:), tmp2(:,:)
complex*16, allocatable :: a(:,:), b(:,:), c(:), c2(:), a2(:), b2(:)
complex*16, external :: zdotu
call system_clock(count_rate=clock_rate) ! Find the rate
do n=1,100
allocate(a(n,n), b(n,n), c(n), c2(n), a2(n*n), b2(n*n), tmp(n,n), tmp2(n,n))
call random_number(tmp)
call random_number(tmp2)
a = cmplx(tmp, tmp2)
call random_number(tmp)
call random_number(tmp2)
b = cmplx(tmp, tmp2)
call system_clock(count=start_time)
do j=1,m
do i=1,n
c(i) = zdotu(n,a(i,1),n,b(i,1),n)
end do
end do
call system_clock(count=stop_time) ! Stop timing
timing = (stop_time-start_time)/real(clock_rate)
call system_clock(count=start_time)
do j=1,m
a2 = reshape(a, [n**2])
b2 = reshape(b, [n**2])
do i=1,n
c2(i) = dot_product(conjg(a2(i::n)), b2(i::n))
end do
end do
call system_clock(count=stop_time) ! Stop timing
timing2 = (stop_time-start_time)/real(clock_rate)
call system_clock(count=start_time)
do j=1,m
a2 = reshape(a, [n**2])
b2 = reshape(b, [n**2])
c2(:) = (0.0, 0.0)
!$omp parallel do private(i, k) default(shared)
do i=1,n
do k=0,n-1
c2(i) = c2(i) + a2(i+n*k)*b2(i+n*k)
end do
end do
!$omp end parallel do
end do
call system_clock(count=stop_time) ! Stop timing
timing3 = (stop_time-start_time)/real(clock_rate)
!write(*,*) 'c', c
!write(*,*) 'c2', c2
!write(*,*) n, 'c-c2', sum(c-c2)
write(*,'(i5,3es25.16)') n, timing, timing2, timing3
deallocate(a, b, c, c2, a2, b2, tmp, tmp2)
end do
end program test
program test
implicit none
integer, parameter :: m=10000
integer :: i, j, n, k, l
integer :: clock_rate
integer :: start_time
integer :: stop_time
real*8 :: timing, timing2, timing3
real*8, allocatable :: tmp(:,:), tmp2(:,:)
complex*16, allocatable :: a(:,:), b(:,:), c(:,:), b2(:,:)
complex*16 :: temp
complex*16, external :: zdotu
call system_clock(count_rate=clock_rate) ! Find the rate
do n=1,100
allocate(a(n,n), b(n,n), c(n,n), tmp(n,n), tmp2(n,n))
call random_number(tmp)
call random_number(tmp2)
a = cmplx(tmp, tmp2)
call random_number(tmp)
call random_number(tmp2)
b = cmplx(tmp, tmp2)
call system_clock(count=start_time)
do j=1,m
call zgemm('N','N', n,n,n, (1.0,0.0), a,n,b,n,(0.0,0.0),c,n)
end do
call system_clock(count=stop_time) ! Stop timing
timing = (stop_time-start_time)/real(clock_rate)
call system_clock(count=start_time)
do j=1,m
c = matmul(a, b)
end do
call system_clock(count=stop_time) ! Stop timing
timing2 = (stop_time-start_time)/real(clock_rate)
call system_clock(count=start_time)
do j=1,m
c = (0.0, 0.0)
!$omp parallel do private(i, k) default(shared)
do i=1,n
do k=1,n
temp = b(k,i)
do l=1,n
c(l,i) = c(l,i) + temp*a(l,k)
end do
end do
end do
!$omp end parallel do
end do
call system_clock(count=stop_time) ! Stop timing
timing3 = (stop_time-start_time)/real(clock_rate)
!write(*,*) 'c', c
write(*,'(i5,3es25.16)') n, timing, timing2, timing3
deallocate(a, b, c, tmp, tmp2)
end do
end program test
# README
# compile with debug options and run valgrind with the following options
valgrind --trace-children=yes --log-file=valgrind_out.txt --max-stackframe=99999999 --main-stacksize=99999999 --valgrind-stacksize=10485760 --leak-check=full --leak-resolution=high --show-reachable=yes --track-origins=yes ./kkr.x
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment