diff --git a/source/KKRnano/source/Makefile b/source/KKRnano/source/Makefile
index b552c82cc38743759993528b0b81f425dd5b6b70..cfb93cf13f35bcaa914c562fb0481c5edb9415e8 100644
--- a/source/KKRnano/source/Makefile
+++ b/source/KKRnano/source/Makefile
@@ -18,7 +18,7 @@ EXTRA_FLAGS ?=
 tfQMRgpu ?= no
 ifeq ($(tfQMRgpu),yes)
 	EXTRA_FLAGS += -D has_tfQMRgpu
-    TFQMRGPU_PATH = $(HOME)/tfQMRgpu
+	TFQMRGPU_PATH = $(HOME)/tfQMRgpu
 endif
 
 
@@ -318,7 +318,7 @@ PPFLAGS = -WF,-DUSE_VOROWEIGHTS -WF,-DCOMPUTE_tref_LOCALLY -WF,-DNOLOGGING
 
 ifeq ($(SMP),openmp)
 	FCFLAGS += -qsmp=omp
-        PPFLAGS += -WF,-DCPP_hybrid
+	PPFLAGS += -WF,-DCPP_hybrid
 endif
 
 #ESSL
@@ -328,16 +328,16 @@ endif
 LDFLAGS = -L/bgsys/local/lib -lesslsmpbg -L/opt/ibmcmp/xlsmp/3.1/bglib64 -lxlsmp -L/bgsys/local/fftw3/3.3.2/fftw_g/lib/ -lfftw3 
 
 ifeq ($(TYPE),debug)
-  FCFLAGS += -q64 -O0 -qstrict -g -qnosave -C -qinitauto=7FF7FFFF -WF,-DDEBUG1
+	FCFLAGS += -q64 -O0 -qstrict -g -qnosave -C -qinitauto=7FF7FFFF -WF,-DDEBUG1
 else ifeq ($(TYPE),scorep)
-  FC = scorep mpixlf77_r	
-  FC90 = scorep mpixlf90_r
-  FCFLAGS += -q64 -O3 -qstrict
+	FC = scorep mpixlf77_r
+	FC90 = scorep mpixlf90_r
+	FCFLAGS += -q64 -O3 -qstrict
 else ifeq ($(TYPE),voronoi_mesh)
-  FCFLAGS += -q64 -O3 -qstrict
-  PPFLAGS += -WF,-DUSE_OLD_MESH 
+	FCFLAGS += -q64 -O3 -qstrict
+	PPFLAGS += -WF,-DUSE_OLD_MESH
 else
-  FCFLAGS += -q64 -O3 -qstrict
+	FCFLAGS += -q64 -O3 -qstrict
 endif
 
 #ifeq ($(USETOOL),scalasca)
@@ -380,14 +380,16 @@ SRCS90 = $(foreach DIR,$(DIRS),$(wildcard $(DIR)/*.f90))
 SRCSFPP = $(foreach DIR,$(DIRS),$(wildcard $(DIR)/*.F))
 SRCS90FPP = $(foreach DIR,$(DIRS),$(wildcard $(DIR)/*.F90))
 SRCS95TMPL = $(foreach DIR,$(DIRS),$(wildcard $(DIR)/*.F95))
+
+# Reset OBJS here, ahead of the tfQMRgpu block: an "OBJS=" placed after it would discard the appended tfQMRgpu_mod.o
+OBJS=
+
 ifeq ($(tfQMRgpu),yes)
-	SRCS90FPP += $(TFQMRGPU_PATH)/tfqmrgpu_Fortran_example.F90
 	LDFLAGS += -L$(TFQMRGPU_PATH)/lib64 -ltfQMRgpu_Fortran -ltfQMRgpu
+	OBJS += tfQMRgpu_mod.o
 endif
 
 
-OBJS=
-
 LINKER= $(FC90)
 
 
@@ -418,8 +420,10 @@ $(PROGRAM): $(OBJS)
 	$(FC90) $(FCFLAGS) $(EXTRA_FLAGS) $(FC90FLAGS) $(PPFLAGS) -c $< -o $(BUILDDIR)/$@
 
 ifeq ($(tfQMRgpu),yes)
-#	tfqmrgpu_Fortran_example.o: $(TFQMRGPU_PATH)/example/tfqmrgpu_Fortran_example.F90 
-#	$(FC90) $(FCFLAGS) $(EXTRA_FLAGS) $(FC90FLAGS) $(PPFLAGS) -c $< -o $(BUILDDIR)/$@
+# OBJS lists tfQMRgpu_mod.o, so the rule is named after that file; "make tfQMRgpu" stays available as an alias
+tfQMRgpu: tfQMRgpu_mod.o
+tfQMRgpu_mod.o: $(TFQMRGPU_PATH)/example/tfqmrgpu_Fortran_example.F90
+	$(FC90) -I $(TFQMRGPU_PATH)/tfQMRgpu/include $(FCFLAGS) $(EXTRA_FLAGS) $(FC90FLAGS) $(PPFLAGS) -c $< -o $(BUILDDIR)/$@
 endif
 
 
@@ -450,7 +454,8 @@ depend:
 	makedepend
 
 ifeq ($(tfQMRgpu),yes)
-    KKRmat_mod.o: tfqmrgpu_Fortran_example.o
+# manually add a dependency (no leading tab here: make would read it as a recipe line of "depend")
+KKRmat_mod.o: tfQMRgpu_mod.o
 endif
 
 
diff --git a/source/KKRnano/source/ScatteringCalculation_mod.F90 b/source/KKRnano/source/ScatteringCalculation_mod.F90
index 2250861b593726077e4a30a60ac93eaa4b37ca82..dbdf267b417f46754e0566a4f24f91ef44ae6375 100644
--- a/source/KKRnano/source/ScatteringCalculation_mod.F90
+++ b/source/KKRnano/source/ScatteringCalculation_mod.F90
@@ -107,7 +107,7 @@ implicit none
     integer :: omp_threads !DEBUGGING
     logical :: xccpl
     double precision :: rMTref
-    double precision, allocatable :: rMTs(:)
+    double precision, allocatable :: rMTs(:,:), rMTrefs(:,:)
     
     double complex, allocatable :: tmatLL(:,:,:,:) !< all t-matrices inside the truncation zone
     double complex, allocatable :: GmatN_buffer(:,:,:) !< GmatN for all local atoms
@@ -187,9 +187,12 @@ implicit none
         enddo
      endif
 !---------------------------------------------------------
-    allocate(rMTs(calc%trunc_zone%naez_trc))
-    call distribute(calc%xTable, 1, calc%atomdata_a(:)%rMTref, rMTs) ! communicate the Muffin-Tin radii within the truncation zone
-    
+    allocate(rMTs(1,calc%trunc_zone%naez_trc), rMTrefs(1,num_local_atoms))
+    rMTrefs(1,:) = calc%atomdata_a(:)%rMTref
+    ! communicate the Muffin-Tin radii within the truncation zone
+    call distribute(calc%xTable, 1, rMTrefs, rMTs)
+    deallocate(rMTrefs, stat=ist) ! ignore status
+
   ! IE ====================================================================
   !     BEGIN do loop over energies (EMPID-parallel)
   ! IE ====================================================================
@@ -212,7 +215,7 @@ implicit none
           do iacls = 1, calc%ref_cluster_a(ila)%nacls
             ! this calls tref several times with the same parameters if the local atoms are close to each other
 !           rMTref = kkr(ila)%rMTref(iacls) ! possible if it has been communicated earlier
-            rMTref = rMTs(calc%trunc_zone%trunc_atom_idx(calc%ref_cluster_a(ila)%atom(iacls)))
+            rMTref = rMTs(1,calc%trunc_zone%trunc_atom_idx(calc%ref_cluster_a(ila)%atom(iacls)))
             call tref(emesh%EZ(IE), params%vref, dims%lmaxd, rMTref, &
                       kkr(ila)%Tref_ell(:,iacls), kkr(ila)%dTref_ell(:,iacls), derive=(dims%Lly > 0))
             !if (dims%korbit == 1) then ! NOCO
diff --git a/source/KKRnano/source/parallel/two_sided_comm_mod.F95 b/source/KKRnano/source/parallel/two_sided_comm_mod.F95
index 33e4c7a94c1ec12faf9562d4a56266a99aae6ab9..2f57417df366a91b605c3e1abe8596ad301111e4 100644
--- a/source/KKRnano/source/parallel/two_sided_comm_mod.F95
+++ b/source/KKRnano/source/parallel/two_sided_comm_mod.F95
@@ -40,6 +40,8 @@
 !> to replace a missing template feature in Fortran.
 !> Do not use more than one name with _TYPE per line!
 
+! #define DEBUG
+
 #define NUMBERZ double complex
 #define NUMBERMPIZ MPI_DOUBLE_COMPLEX
 #define NUMBERC complex
@@ -84,11 +86,12 @@ module two_sided_comm_TYPE_mod
     include 'mpif.h' ! only: MPI_STATUS_SIZE, MPI_INTEGER, MPI_REQUEST_NULL
 
     assert( self%comm /= 0 )
-    
+
     call MPI_Comm_rank(self%comm, myrank, ierr)
 
     allocate(sreq(self%send_n), sstats(MPI_STATUS_SIZE,self%send_n), &
-             rreq(self%recv_n), rstats(MPI_STATUS_SIZE,self%recv_n), stat=ist) ! ToDo: catch status
+             rreq(self%recv_n), rstats(MPI_STATUS_SIZE,self%recv_n), stat=ist)
+    if (ist /= 0) call MPI_Abort(self%comm, ist, ierr)
     sreq(:) = MPI_REQUEST_NULL
     rreq(:) = MPI_REQUEST_NULL
 
@@ -103,19 +106,19 @@ module two_sided_comm_TYPE_mod
         do inz = self%send_start(ipair), self%send_start(ipair + 1) - 1
           tag = inz - self%send_start(ipair)
           iinp = self%send_index(inz)
-          call MPI_Isend(Ginp(:,iinp), ncount, NUMBERMPI_TYPE, rank, tag, self%comm, sreq(inz), ierr)
 #ifdef DEBUG
           write(*, '(9(a,i0))') "send local _TYPE-element ",iinp,"@",myrank," with tag ",tag," to rank ",rank
 #endif
+          call MPI_Isend(Ginp(:,iinp), ncount, NUMBERMPI_TYPE, rank, tag, self%comm, sreq(inz), ierr)
         enddo ! inz
 
         do inz = self%recv_start(ipair), self%recv_start(ipair + 1) - 1
           tag = inz - self%recv_start(ipair)
           iout = self%recv_index(inz)
-          call MPI_Irecv(Gout(:,iout), ncount, NUMBERMPI_TYPE, rank, tag, self%comm, rreq(inz), ierr)
 #ifdef DEBUG
           write(*, '(9(a,i0))') "I (rank ",myrank,") want to receive a _TYPE-element with tag ",tag," from rank ",rank
 #endif
+          call MPI_Irecv(Gout(:,iout), ncount, NUMBERMPI_TYPE, rank, tag, self%comm, rreq(inz), ierr)
         enddo ! inz
 
       else  ! rank /= myrank
@@ -132,7 +135,7 @@ module two_sided_comm_TYPE_mod
           write(*, '(9(a,i0))') "copy local _TYPE-element ",iinp,"@",myrank," locally"
 #endif
         enddo ! inz
-        
+
       endif ! rank /= myrank
 
     enddo ! ipair