From 13a8026acd60e9d4f9024edfe69cb9a2371b233d Mon Sep 17 00:00:00 2001
From: Paul Baumeister <p.baumeister@fz-juelich.de>
Date: Fri, 17 May 2019 13:52:59 +0200
Subject: [PATCH] prepare for a first GPU-accelerated version, make
 tfQMRgpu=yes

---
 .../source/IterativeSolver/KKRmat_mod.F90     | 24 +++++++++++++++++++
 source/KKRnano/source/Makefile                | 16 +++++++++++--
 2 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/source/KKRnano/source/IterativeSolver/KKRmat_mod.F90 b/source/KKRnano/source/IterativeSolver/KKRmat_mod.F90
index 785b690e2..73a40cee9 100644
--- a/source/KKRnano/source/IterativeSolver/KKRmat_mod.F90
+++ b/source/KKRnano/source/IterativeSolver/KKRmat_mod.F90
@@ -219,6 +219,9 @@ module KKRmat_mod
     use fillKKRMatrix_mod, only: dump
     use IterativeSolver_mod, only: IterativeSolver, solve
     use DirectSolver_mod, only: DirectSolver, solve
+#ifdef  has_tfQMRgpu
+    ! use tfqmrgpu, only: tfqmrgpu_bsrsv_complete ! all-in-one GPU solver interface for rapid integration
+#endif
     use SparseMatrixDescription_mod, only: dump
     use InitialGuess_mod, only: InitialGuess, load, store
     use KKROperator_mod, only: KKROperator
@@ -257,6 +260,13 @@ module KKRmat_mod
     double complex, allocatable :: dPdE_local(:,:,:), gllke_x(:,:), dgde(:,:), MinvdMdE(:,:,:), TinvMinvdMdE(:,:,:) ! LLY
     double complex :: tracek  ! LLY
          
+#ifdef  has_tfQMRgpu
+    external :: tfqmrgpu_bsrsv_complete ! subroutine, implicit interface: argument types are not checked by the compiler
+    integer :: o = 6 ! passed to the solver call; presumably an output unit (6 is conventionally stdout) - TODO confirm
+    integer :: ierr = 1 ! last argument of the solver call; NOTE(review): initialization in a declaration implies SAVE
+    integer :: iterations, lda ! iteration count handed to the solver, and size(op%mat_A, 1)
+    double precision :: residuum ! residual value handed to the solver
+#endif
 
     integer :: num_trunc_atoms, lmsd, lm1, idx_lly, i1
     double complex :: cfctorinv
@@ -383,6 +393,20 @@ module KKRmat_mod
       ! store the initial guess
       call store(iguess_data, op%mat_X, ik=ikpoint, is=ispin, ie=ienergy)
 
+#ifdef  has_tfQMRgpu
+    case (5) ! GPU solver
+      ! solver controls; presumably the iteration limit and the target residual - TODO confirm with tfQMRgpu
+      iterations = 1000
+      residuum = 1d-7 ! double precision literal; a default-real 1e-7 would be rounded to single precision
+      lda = size(op%mat_A, 1)
+
+      call tfqmrgpu_bsrsv_complete(op%bsr_A%nRows, lda, &
+        op%bsr_A%RowStart, op%bsr_A%ColIndex, op%mat_A(:,:,:,0), 'n', & !! A (in)
+        op%bsr_X%RowStart, op%bsr_X%ColIndex, op%mat_X, 'n', & !! X (out)
+        op%bsr_B%RowStart, op%bsr_B%ColIndex, op%mat_B, 'n', & !! B (in)
+        iterations, residuum, o, ierr) ! NOTE(review): ierr is not inspected after the call
+#endif
+      
     case default
       warn(6, "No solver selected! Problem is not solved, solver_type ="+solver_type)
     endselect ! solver_type
diff --git a/source/KKRnano/source/Makefile b/source/KKRnano/source/Makefile
index 05f24378b..25e3c764e 100644
--- a/source/KKRnano/source/Makefile
+++ b/source/KKRnano/source/Makefile
@@ -7,12 +7,19 @@ TYPE ?= nodebug
 ### the user may specify SMP=openmp as a command line argument to make
 SMP ?= none
 
+### the user may specify tfQMRgpu=yes as a command line argument to make to build with the tfQMRgpu library
+tfQMRgpu ?= no
+
 PROGRAM = kkr.exe
 
 # Path to put object files and module files
 BUILDDIR = $(HOME)/build
 
-EXTRA_FLAGS =
+EXTRA_FLAGS ?=
+
+ifeq ($(tfQMRgpu),yes)
+	EXTRA_FLAGS += -D has_tfQMRgpu
+endif
 FC90FLAGS = 
 FCFLAGS =
 PPFLAGS =
@@ -375,6 +382,11 @@ OBJS += $(notdir ${SRCS90:.f90=.o})
 OBJS += $(notdir ${SRCSFPP:.F=.o})
 OBJS += $(notdir ${SRCS90FPP:.F90=.o})
 
+# tfQMRgpu link settings: CUDA_HOME selects the CUDA installation, default /usr/local/cuda when unset
+ifeq ($(tfQMRgpu),yes)
+	OBJS += tfqmrgpu_Fortran.o tfqmrgpu_Fortran_wrappers.o tfqmrgpu.o
+	LDFLAGS += -L$(or $(CUDA_HOME),/usr/local/cuda)/lib64 -lcudart -lnvToolsExt -lm -lcurand
+endif
 .PHONY: all
 all:	$(PROGRAM)
 
@@ -521,7 +533,7 @@ bsrmm_mod.o: CacheOverlap_mod.o
 CacheOverlap_mod.o:
 ChebMeshData_mod.o: InputParams_mod.o RadialMeshData_mod.o
 Truncation_mod.o: Logging_mod.o Exceptions_mod.o TruncationZone_mod.o
-NonCollinearMagnetism_mod.o: RadialMeshData_mod.o ChebMeshData_mod.o
+NonCollinearMagnetism_mod.o: RadialMeshData_mod.o ChebMeshData_mod.o read_formatted_shapefun_mod.o
 NonCollinearMagnetismData_mod.o: Exceptions_mod.o
 
 # DO NOT DELETE
-- 
GitLab