Cleanup of rhoin and add test scripts

e830b206 · Philipp Rüssmann · 5df906d4 · e830b206 · e830b206 · e830b206
Commit e830b206 authored 6 years ago by Philipp Rüssmann
--- a/source/KKRhost/rhoin.f90
+++ b/source/KKRhost/rhoin.f90
@@ -101,7 +101,13 @@ contains
    complex (kind=dp) :: efac1, efac2, ffz, gmatl, ppz, v1, v2
    real (kind=dp) :: c0ll
    integer :: i, ir, j, j0, j1, l, l1, l2, lm1, lm2, lm3, lm3max, ln2, ln3, m
-    complex (kind=dp) :: vr(lmmaxd, lmmaxd), wf(irmd, 0:lmaxd, 0:lmaxd), wr(lmmaxd, lmmaxd), ar2(lmmaxd**2), vr2(lmmaxd**2)
+    complex (kind=dp) :: vr(lmmaxd, lmmaxd), wf(irmd, 0:lmaxd, 0:lmaxd), wr(lmmaxd, lmmaxd)
+#ifdef __GFORTRAN__
+    ! for the gfortran compiler zdotu leads to a segfault, then using
+    ! dot_product gives the correct result without segfault, this needs these
+    ! auxiliary arrays
+    complex (kind=dp) :: ar2(lmmaxd**2), vr2(lmmaxd**2)
+#endif
    complex (kind=dp), external :: zdotu

    ! C0LL = 1/sqrt(4*pi)
@@ -113,7 +119,6 @@ contains
    !      use first vr
    wr = czero
    vr = czero
-    ar2 = czero
    do lm2 = 1, lmmaxd
      ln2 = lm2
      v2 = efac(lm2)*efac(lm2)*gmat(ln2, ln2)
@@ -135,23 +140,25 @@ contains
      end do
    end do

+#ifdef __GFORTRAN__
    ar2 = reshape(ar, [lmmaxd**2])
    vr2 = reshape(vr, [lmmaxd**2])
+#endif
    do lm1 = 1, lmmaxd
      efac1 = efac(lm1)
-      !write(*,*) lm1, lmmaxd, shape(ar2), shape(vr), efac1
-      !write(*,*) ar2(lm1,:)
-      !write(*,*) vr(lm1,:)
-      !wr(lm1, lm1) = zdotu(lmmaxd, ar(lm1,1), lmmaxd, vr(lm1,1), lmmaxd)/(efac1*efac1) ! this seems to work with the intel compiler
-      wr(lm1, lm1) = dot_product(conjg(ar2(lm1::lmmaxd)), vr2(lm1::lmmaxd))/(efac1*efac1) ! this works with conjugation!!!
-      !wr(lm1, lm1) = zdotu(lmmaxd, ar2(lm1), lmmaxd, vr2(lm1), lmmaxd)/(efac1*efac1)
-      !wr(lm1, lm1) = zdotu(lmmaxd, ar(lm1,1), lmmaxd, vr(lm1,1), lmmaxd)/(efac1*efac1)
+#ifdef __GFORTRAN__
+      wr(lm1, lm1) = dot_product(conjg(ar2(lm1::lmmaxd)), vr2(lm1::lmmaxd))/(efac1*efac1) ! this works only with conjugation due to definition of dot_product
+#else
+      wr(lm1, lm1) = zdotu(lmmaxd, ar(lm1,1), lmmaxd, vr(lm1,1), lmmaxd)/(efac1*efac1) ! this works with the intel compiler
+#endif
      do lm2 = 1, lm1 - 1
        ! ---> using symmetry of gaunt coeffients
        efac2 = efac(lm2)
-        !wr(lm1, lm2) = ( zdotu(lmmaxd,ar(lm1,1),lmmaxd,vr(lm2,1),lmmaxd) + zdotu(lmmaxd,ar(lm2,1),lmmaxd,vr(lm1,1),lmmaxd) )/(efac1*efac2) ! this seems to work with the intel compiler
-        wr(lm1, lm2) = ( dot_product(conjg(ar2(lm1::lmmaxd)),vr2(lm2::lmmaxd)) + dot_product(conjg(ar2(lm2::lmmaxd)),vr2(lm1::lmmaxd)) )/(efac1*efac2) ! this works with conjugation!!!
-        !wr(lm1, lm2) = ( zdotu(lmmaxd,ar2(lm1),lmmaxd,vr2(lm2),lmmaxd) + zdotu(lmmaxd,ar2(lm2),lmmaxd,vr2(lm1),lmmaxd) )/(efac1*efac2)
+#ifdef __GFORTRAN__
+        wr(lm1, lm2) = ( dot_product(conjg(ar2(lm1::lmmaxd)),vr2(lm2::lmmaxd)) + dot_product(conjg(ar2(lm2::lmmaxd)),vr2(lm1::lmmaxd)) )/(efac1*efac2) ! this works with gfortran
+#else
+        wr(lm1, lm2) = ( zdotu(lmmaxd,ar(lm1,1),lmmaxd,vr(lm2,1),lmmaxd) + zdotu(lmmaxd,ar(lm2,1),lmmaxd,vr(lm1,1),lmmaxd) )/(efac1*efac2) ! this works with the intel compiler
+#endif
      end do
    end do


--- a/tests/timing_test_matrix_ops/run_timing_test.sh
+++ b/tests/timing_test_matrix_ops/run_timing_test.sh
+#ifort -mkl -O3 -xHost -vec-report test.f90 && ./a.out | tee out
+
+ifort -mkl -O3 -xHost -qopt-report=5 test_dot_prod.f90
+export OMP_NUM_THREADS=1; ./a.out | tee out
+ifort -mkl=parallel -qopenmp -O3 -xHost test_dot_prod.f90
+export OMP_NUM_THREADS=4; ./a.out | tee out2
+./plot_times.py
+
+ifort -mkl -O3 -xHost -qopt-report=5 test_matmul.f90
+export OMP_NUM_THREADS=1; ./a.out | tee out1
+ifort -mkl=parallel -qopenmp -O3 -xHost test_matmul.f90
+export OMP_NUM_THREADS=4; ./a.out | tee out12
+./plot_times2.py
--- a/tests/timing_test_matrix_ops/test_dot_prod.f90
+++ b/tests/timing_test_matrix_ops/test_dot_prod.f90
+! this is a test program to compare the time a call of dot_product takes
+! compared to a direct implementation of the loop or lapacks zdotu
+! this kind of loop structure appears in rhoin
+program test
+
+implicit none
+integer, parameter :: m=10000
+integer :: i, j, n, k
+integer :: clock_rate
+integer :: start_time
+integer :: stop_time
+real*8 :: timing, timing2, timing3
+real*8, allocatable :: tmp(:,:), tmp2(:,:)
+complex*16, allocatable :: a(:,:), b(:,:), c(:), c2(:), a2(:), b2(:)
+complex*16, external :: zdotu
+
+call system_clock(count_rate=clock_rate) ! Find the rate
+
+do n=1,100
+  allocate(a(n,n), b(n,n), c(n), c2(n), a2(n*n), b2(n*n), tmp(n,n), tmp2(n,n))
+  call random_number(tmp)
+  call random_number(tmp2)
+  a = cmplx(tmp, tmp2)
+  call random_number(tmp)
+  call random_number(tmp2)
+  b = cmplx(tmp, tmp2)
+ 
+  call system_clock(count=start_time)
+  do j=1,m
+    do i=1,n
+      c(i) = zdotu(n,a(i,1),n,b(i,1),n)
+    end do
+  end do
+  call system_clock(count=stop_time) ! Stop timing
+  timing = (stop_time-start_time)/real(clock_rate)
+ 
+  call system_clock(count=start_time)
+  do j=1,m
+    a2 = reshape(a, [n**2])
+    b2 = reshape(b, [n**2])
+    do i=1,n
+      c2(i) = dot_product(conjg(a2(i::n)), b2(i::n))
+    end do
+  end do
+  call system_clock(count=stop_time) ! Stop timing
+  timing2 = (stop_time-start_time)/real(clock_rate)
+ 
+  call system_clock(count=start_time)
+  do j=1,m
+    a2 = reshape(a, [n**2])
+    b2 = reshape(b, [n**2])
+    c2(:) = (0.0, 0.0)
+    !$omp parallel do private(i, k) default(shared)
+    do i=1,n
+      do k=0,n-1
+        c2(i) = c2(i) + a2(i+n*k)*b2(i+n*k)
+      end do
+    end do
+    !$omp end parallel do
+  end do
+  call system_clock(count=stop_time) ! Stop timing
+  timing3 = (stop_time-start_time)/real(clock_rate)
+ 
+  !write(*,*) 'c', c
+  !write(*,*) 'c2', c2
+  !write(*,*) n, 'c-c2', sum(c-c2)
+  write(*,'(i5,3es25.16)') n, timing, timing2, timing3
+ 
+  deallocate(a, b, c, c2, a2, b2, tmp, tmp2)
+end do
+
+end program test
--- a/tests/timing_test_matrix_ops/test_matmul.f90
+++ b/tests/timing_test_matrix_ops/test_matmul.f90
+program test
+
+implicit none
+integer, parameter :: m=10000
+integer :: i, j, n, k, l
+integer :: clock_rate
+integer :: start_time
+integer :: stop_time
+real*8 :: timing, timing2, timing3
+real*8, allocatable :: tmp(:,:), tmp2(:,:)
+complex*16, allocatable :: a(:,:), b(:,:), c(:,:), b2(:,:)
+complex*16 :: temp
+complex*16, external :: zdotu
+
+call system_clock(count_rate=clock_rate) ! Find the rate
+
+do n=1,100
+  allocate(a(n,n), b(n,n), c(n,n), tmp(n,n), tmp2(n,n))
+  call random_number(tmp)
+  call random_number(tmp2)
+  a = cmplx(tmp, tmp2)
+  call random_number(tmp)
+  call random_number(tmp2)
+  b = cmplx(tmp, tmp2)
+ 
+  call system_clock(count=start_time)
+  do j=1,m
+    call zgemm('N','N', n,n,n, (1.0,0.0), a,n,b,n,(0.0,0.0),c,n)
+  end do
+  call system_clock(count=stop_time) ! Stop timing
+  timing = (stop_time-start_time)/real(clock_rate)
+ 
+  call system_clock(count=start_time)
+  do j=1,m
+    c = matmul(a, b)
+  end do
+  call system_clock(count=stop_time) ! Stop timing
+  timing2 = (stop_time-start_time)/real(clock_rate)
+ 
+  call system_clock(count=start_time)
+  do j=1,m
+    c = (0.0, 0.0)
+    !$omp parallel do private(i, k) default(shared)
+    do i=1,n
+      do k=1,n
+        temp = b(k,i)
+        do l=1,n
+          c(l,i)  = c(l,i) + temp*a(l,k)     
+        end do
+      end do
+    end do
+    !$omp end parallel do
+  end do
+  call system_clock(count=stop_time) ! Stop timing
+  timing3 = (stop_time-start_time)/real(clock_rate)
+ 
+  !write(*,*) 'c', c
+  write(*,'(i5,3es25.16)') n, timing, timing2, timing3
+ 
+  deallocate(a, b, c, tmp, tmp2)
+end do
+
+end program test
--- a/utils/run_valgrind.sh
+++ b/utils/run_valgrind.sh
+# README
+# compile with debug options and run valgrind with the following options
+
+valgrind --trace-children=yes --log-file=valgrind_out.txt --max-stackframe=99999999 --main-stacksize=99999999 --valgrind-stacksize=10485760 --leak-check=full --leak-resolution=high --show-reachable=yes --track-origins=yes ./kkr.x
+