From 1feedea7232c76ebeea0ef7d0ff7d7bdceefb407 Mon Sep 17 00:00:00 2001
From: Cem Oran <cemoran.01@gmail.com>
Date: Wed, 8 Sep 2021 17:07:43 +0200
Subject: [PATCH] Implementation of tfQMRgpu

Adapted Makefile, switched off grouping and changed transposition flags for the GPU call.
---
 source/KKRnano/regtests/run_test.sh           |  2 +
 .../source/IterativeSolver/KKRmat_mod.F90     | 34 +++++++++-----
 .../IterativeSolver/fillKKRMatrix_mod.F90     |  4 +-
 source/KKRnano/source/Makefile                | 44 ++++++++-----------
 4 files changed, 45 insertions(+), 39 deletions(-)

diff --git a/source/KKRnano/regtests/run_test.sh b/source/KKRnano/regtests/run_test.sh
index 02cfb37a9..117c9a127 100755
--- a/source/KKRnano/regtests/run_test.sh
+++ b/source/KKRnano/regtests/run_test.sh
@@ -1,6 +1,8 @@
 #!/bin/sh
 ARCHIVE=loggs
 
+export LD_LIBRARY_PATH=$MKLROOT/lib/intel64:$LD_LIBRARY_PATH
+
 ## today's date, hour, minute
 day=`date "+%Y%m%d%H%M"` 
 
diff --git a/source/KKRnano/source/IterativeSolver/KKRmat_mod.F90 b/source/KKRnano/source/IterativeSolver/KKRmat_mod.F90
index fc05a235f..6cabb140d 100644
--- a/source/KKRnano/source/IterativeSolver/KKRmat_mod.F90
+++ b/source/KKRnano/source/IterativeSolver/KKRmat_mod.F90
@@ -221,6 +221,8 @@ module KKRmat_mod
     use DirectSolver_mod, only: DirectSolver, solve
 #ifdef  has_tfQMRgpu
     use tfqmrgpu, only: tfqmrgpu_bsrsv_complete ! all-in-one GPU solver interface for rapid integration
+    use SolverStats_mod, only: reduce
+    use TimerMpi_mod, only: startTimer, stopTimer
 #endif
     use SparseMatrixDescription_mod, only: dump
     use InitialGuess_mod, only: InitialGuess, load, store
@@ -261,10 +263,8 @@ module KKRmat_mod
     double complex :: tracek  ! LLY
          
 #ifdef  has_tfQMRgpu
-    integer :: o = 6
-    integer :: ierr = 1
-    integer :: iterations, lda
-    double precision :: residuum
+    integer (kind=4) :: o, ierr, iterations, lda
+    real (kind=8) :: residual
 #endif
 
     integer :: num_trunc_atoms, lmsd, lm1, idx_lly, i1
@@ -395,15 +395,23 @@ module KKRmat_mod
     case (5) ! GPU solver
       
 #ifdef  has_tfQMRgpu
-      iterations = 1000
-      residuum = 1e-7
-      lda = size(op%mat_A, 1)
 
+       o=0
+       ierr=0
+
+      iterations = 2000
+      residual = iterative_solver%qmrbound
+      lda = size(op%mat_A, 1)
+      call startTimer(kernel_timer)
+   !   write(*,*) "Written by us: ", size(op%bsr_X%ColIndex)
       call tfqmrgpu_bsrsv_complete(op%bsr_A%nRows, lda, &
-        op%bsr_A%RowStart, op%bsr_A%ColIndex, op%mat_A(:,:,:,0), 'n', & !! A (in)
-        op%bsr_X%RowStart, op%bsr_X%ColIndex, op%mat_X, 'n', & !! X (out)
-        op%bsr_B%RowStart, op%bsr_B%ColIndex, op%mat_B, 'n', & !! B (in)
-        iterations, residuum, o, ierr)
+        op%bsr_A%RowStart, op%bsr_A%ColIndex, op%mat_A(:,:,:,0), 't', & !! A (in)
+        op%bsr_X%RowStart, op%bsr_X%ColIndex, op%mat_X, 't', & !! X (out)
+        op%bsr_B%RowStart, op%bsr_B%ColIndex, op%mat_B, 't', & !! B (in)
+        iterations, residual, o, ierr)
+      call stopTimer(kernel_timer)  
+      call reduce(iterative_solver%stats, iterations, residual, 0_8)
+           
 #else
       warn(6, "GPU solver needs -D has_tfQMRgpu (Problem is not solved) solver_type ="+solver_type)
 #endif
@@ -412,6 +420,10 @@ module KKRmat_mod
       warn(6, "No solver selected! Problem is not solved, solver_type ="+solver_type)
     endselect ! solver_type
     
+    !call dump(op%mat_X, "solution_form.dat", formatted=.true.)
+    !call dump(op%mat_B, "rhs_form.dat", formatted=.true.)
+    !stop __FILE__
+    
     TESTARRAYLOG(3, op%mat_B)
     TESTARRAYLOG(3, op%mat_X)
     ! RESULT: mat_X
diff --git a/source/KKRnano/source/IterativeSolver/fillKKRMatrix_mod.F90 b/source/KKRnano/source/IterativeSolver/fillKKRMatrix_mod.F90
index 745f05f02..88ea971c0 100644
--- a/source/KKRnano/source/IterativeSolver/fillKKRMatrix_mod.F90
+++ b/source/KKRnano/source/IterativeSolver/fillKKRMatrix_mod.F90
@@ -32,7 +32,7 @@ module fillKKRMatrix_mod
     type(SparseMatrixDescription), intent(out) :: bsr_X
     integer(kind=1), intent(in) :: lmax_a(:,:) !< lmax of each interaction dim(naez_trc,num_local_atoms), -1: truncated
 
-    integer, parameter :: GROUPING = 0 ! -1:never, 0:auto, 1:always
+    integer, parameter :: GROUPING = -1 ! -1:never, 0:auto, 1:always
     integer :: iRow, jCol, Xind, nnzb, group
  
     nnzb = count(lmax_a >= 0) ! number of non-zero blocks in X
@@ -46,7 +46,7 @@ module fillKKRMatrix_mod
     endif
 
     if (group > 0) then
-
+      !write(*,*) "we are in group > 0"
       call create(bsr_X, nRows=size(lmax_a, 1), nnzb=size(lmax_a, 1), nCols=1)
 
       ! generate BSR descriptor of a flat structure, fuse RHS atoms into rectangular blocks
diff --git a/source/KKRnano/source/Makefile b/source/KKRnano/source/Makefile
index d4893402d..b552c82cc 100644
--- a/source/KKRnano/source/Makefile
+++ b/source/KKRnano/source/Makefile
@@ -7,9 +7,6 @@ TYPE ?= nodebug
 ### the user may specify SMP=openmp as a command line argument to make
 SMP ?= none
 
-### can we make use of the tfQMRgpu library
-tfQMRgpu ?= no
-
 PROGRAM = kkr.exe
 
 # Path to put object files and module files
@@ -21,14 +18,9 @@ EXTRA_FLAGS ?=
 tfQMRgpu ?= no
 ifeq ($(tfQMRgpu),yes)
 	EXTRA_FLAGS += -D has_tfQMRgpu
-	CUDA_PATH=-L/usr/local/zam/CUDA/cuda-9.1/lib64
+    TFQMRGPU_PATH = $(HOME)/tfQMRgpu
 endif
 
-EXTRA_FLAGS ?=
-
-ifeq ($(tfQMRgpu),yes)
-	EXTRA_FLAGS += -D has_tfQMRgpu
-endif
 
 FC90FLAGS = 
 FCFLAGS =
@@ -388,22 +380,15 @@ SRCS90 = $(foreach DIR,$(DIRS),$(wildcard $(DIR)/*.f90))
 SRCSFPP = $(foreach DIR,$(DIRS),$(wildcard $(DIR)/*.F))
 SRCS90FPP = $(foreach DIR,$(DIRS),$(wildcard $(DIR)/*.F90))
 SRCS95TMPL = $(foreach DIR,$(DIRS),$(wildcard $(DIR)/*.F95))
+ifeq ($(tfQMRgpu),yes)
+	SRCS90FPP += $(TFQMRGPU_PATH)/tfqmrgpu_Fortran_example.F90
+	LDFLAGS += -L$(TFQMRGPU_PATH)/lib64 -ltfQMRgpu_Fortran -ltfQMRgpu
+endif
+
 
 OBJS=
 
 LINKER= $(FC90)
-ifeq ($(tfQMRgpu),yes)
-	OBJS += tfqmrgpu_Fortran.o
-	OBJS += tfqmrgpu_Fortran_wrappers.o
-	OBJS += tfqmrgpu.o
-	SRCS90FPP += GPU/tfqmrgpu_Fortran.F90
-# 	LDFLAGS += $(CUDA_PATH) -gfortran -lcudart -lnvToolsExt -lm -lcurand \
-# 	             -L/usr/lib64/gcc/x86_64-suse-linux/4.8 \
-# 	             -L/usr/lib -lstdc++
-	LDFLAGS += $(CUDA_PATH) -lcudart -lnvToolsExt -lm -lcurand
-	LDFLAGS += -L/usr/lib -lstdc++
-# # 	LINKER= gfortran
-endif
 
 
 # notdir extracts only filename
@@ -414,11 +399,6 @@ OBJS += $(notdir ${SRCS90FPP:.F90=.o})
 OBJS += $(notdir ${SRCS95TMPL:.F95=.o})
 
 
-ifeq ($(tfQMRgpu),yes)
-	OBJS += tfqmrgpu_Fortran.o tfqmrgpu_Fortran_wrappers.o tfqmrgpu.o
-	LDFLAGS += -L/usr/local/cuda/lib64 -lcudart -lnvToolsExt -lm -lcurand
-endif
-
 .PHONY: all
 all:	$(PROGRAM)
 
@@ -437,6 +417,12 @@ $(PROGRAM): $(OBJS)
 %.o: %.F90
 	$(FC90) $(FCFLAGS) $(EXTRA_FLAGS) $(FC90FLAGS) $(PPFLAGS) -c $< -o $(BUILDDIR)/$@
 
+ifeq ($(tfQMRgpu),yes)
+#	tfqmrgpu_Fortran_example.o: $(TFQMRGPU_PATH)/example/tfqmrgpu_Fortran_example.F90 
+#	$(FC90) $(FCFLAGS) $(EXTRA_FLAGS) $(FC90FLAGS) $(PPFLAGS) -c $< -o $(BUILDDIR)/$@
+endif
+
+
 ### Fortran-templates:
 %.o: %.F95
 	sed -e 's/_TYPE/D/' $< >  $(BUILDDIR)/$*.F90
@@ -456,12 +442,18 @@ clean:
 test:
 	@echo $(OBJS)
 	@echo $(VPATH)
+	@echo $(SRCS90FPP)
 
 .PHONY: depend 
 depend:
 	$(SRCS90)
 	makedepend
 
+ifeq ($(tfQMRgpu),yes)
+    KKRmat_mod.o: tfqmrgpu_Fortran_example.o
+endif
+
+
 #======================== Module dependencies ========================================
 
 KKRnano.o: Logging_mod.o KKRzero_mod.o PotentialConverter_mod.o KKRnanoParallel_mod.o BasisAtom_mod.o AtomicCore_mod.o RadialMeshData_mod.o main2_aux_mod.o ScatteringCalculation_mod.o Main2Arrays_mod.o KKRnano_Comm_mod.o ProcessKKRresults_mod.o InputParams_mod.o EBalanceHandler_mod.o LDAUData_mod.o TimerMpi_mod.o EnergyMesh_mod.o DimParams_mod.o CalculationData_mod.o
-- 
GitLab