/* cusolver.c */
/*
!--------------------------------------------------------------------------------
! Copyright (c) 2016 Peter Grünberg Institut, Forschungszentrum Jülich, Germany
! This file is part of FLEUR and available as free software under the conditions
! of the MIT license as expressed in the LICENSE file in more detail.
!--------------------------------------------------------------------------------
*/
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>

/* Interface to the cusolverDn routines for solving a generalized eigenvalue problem.
   Code adapted from the example in the documentation.
*/

void cusolver_complex(cuDoubleComplex *H,cuDoubleComplex *S,int n,int ne,double tol,int max_sweeps,double* eig,cuDoubleComplex *z){

  cusolverDnHandle_t cusolverH = NULL;
  cudaStream_t stream = NULL;
  syevjInfo_t syevj_params = NULL;

  cusolverStatus_t status = CUSOLVER_STATUS_SUCCESS;
  cudaError_t cudaStat1 = cudaSuccess;
  cudaError_t cudaStat2 = cudaSuccess;
  cudaError_t cudaStat3 = cudaSuccess;

  double *d_W = NULL; /* eigenvalues on device*/
  int *d_info = NULL; /* error info */
  int  lwork = 0;     /* size of workspace */
  cuDoubleComplex *d_work = NULL; /* device workspace for syevj */
  int info = 0;       /* host copy of error info */

  /* configuration of syevj  */
  const cusolverEigType_t itype = CUSOLVER_EIG_TYPE_1; //Solve H psi=S lambda psi
  const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; // compute eigenvectors.
  const cublasFillMode_t  uplo = CUBLAS_FILL_MODE_LOWER;

  /* numerical results of syevj  */
  double residual = 0;
  int executed_sweeps = 0;
  
  /* step 1: create cusolver handle, bind a stream  */
  status = cusolverDnCreate(&cusolverH);
  assert(CUSOLVER_STATUS_SUCCESS == status);

  cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
  assert(cudaSuccess == cudaStat1);

  status = cusolverDnSetStream(cusolverH, stream);
  assert(CUSOLVER_STATUS_SUCCESS == status);

  /* step 2: configuration of syevj */
  status = cusolverDnCreateSyevjInfo(&syevj_params);
  assert(CUSOLVER_STATUS_SUCCESS == status);
  
  /* default value of tolerance is machine zero */
dw113919's avatar
dw113919 committed
59
  status = cusolverDnXsyevjSetTolerance(syevj_params,tol);
60 61 62
  assert(CUSOLVER_STATUS_SUCCESS == status);
  
  /* default value of max. sweeps is 100 */
dw113919's avatar
dw113919 committed
63
  status = cusolverDnXsyevjSetMaxSweeps(syevj_params,max_sweeps);
64 65
  assert(CUSOLVER_STATUS_SUCCESS == status);

dw113919's avatar
dw113919 committed
66
  printf("Allocate data \n");
67 68 69 70 71
  /* step 3: copy A to device */
  cudaStat2 = cudaMalloc ((void**)&d_W, sizeof(cuDoubleComplex) * n);
  cudaStat3 = cudaMalloc ((void**)&d_info, sizeof(int));
  assert(cudaSuccess == cudaStat2);
  assert(cudaSuccess == cudaStat3);
dw113919's avatar
dw113919 committed
72 73

  printf("query working space \n");
74 75 76 77 78 79 80
  /* step 4: query working space of sygvj */
  status = cusolverDnZhegvj_bufferSize(cusolverH,itype,jobz,uplo,n,H,n,S,n,d_W,&lwork,syevj_params);
  assert(CUSOLVER_STATUS_SUCCESS == status);
  
  cudaStat1 = cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex)*lwork);
  assert(cudaSuccess == cudaStat1);
  
dw113919's avatar
dw113919 committed
81
  printf("compute eigen-pair \n");
82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168
  /* step 5: compute eigen-pair   */
  status = cusolverDnZhegvj(cusolverH,itype,jobz,uplo,n,H,n,S,n,d_W,d_work,lwork,d_info,syevj_params);
  cudaStat1 = cudaDeviceSynchronize();
  assert(CUSOLVER_STATUS_SUCCESS == status);
  assert(cudaSuccess == cudaStat1);

  cudaStat1 = cudaMemcpy(eig, d_W, sizeof(double)*ne, cudaMemcpyDeviceToHost);
  cudaStat2 = cudaMemcpy(z, H, sizeof(cuDoubleComplex)*n*ne, cudaMemcpyDeviceToHost);
  cudaStat3 = cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost);
  assert(cudaSuccess == cudaStat1);
  assert(cudaSuccess == cudaStat2);
  assert(cudaSuccess == cudaStat3);

  if ( 0 == info ){
    printf("sygvj converges \n");
  }else if ( 0 > info ){
    printf("%d-th parameter is wrong \n", -info);
    exit(1);
  }else{
    printf("WARNING: info = %d : sygvj does not converge \n", info );
  }

  status = cusolverDnXsyevjGetSweeps(cusolverH,syevj_params,&executed_sweeps);
  assert(CUSOLVER_STATUS_SUCCESS == status);

  status = cusolverDnXsyevjGetResidual(cusolverH,syevj_params,&residual);
  assert(CUSOLVER_STATUS_SUCCESS == status);

  printf("residual |A - V*W*V**H|_F = %E \n", residual );
  printf("number of executed sweeps = %d \n", executed_sweeps );

  /* free resources */
  if (d_W    ) cudaFree(d_W);
  if (d_info ) cudaFree(d_info);
  if (d_work ) cudaFree(d_work);

  if (cusolverH   ) cusolverDnDestroy(cusolverH);   
  if (stream      ) cudaStreamDestroy(stream);
  if (syevj_params) cusolverDnDestroySyevjInfo(syevj_params);

  //  cudaDeviceReset();

  return ;
}


void cusolver_real(double *H,double *S,int n,int ne,double tol,int max_sweeps,double* eig,double *z){

  cusolverDnHandle_t cusolverH = NULL;
  cudaStream_t stream = NULL;
  syevjInfo_t syevj_params = NULL;

  cusolverStatus_t status = CUSOLVER_STATUS_SUCCESS;
  cudaError_t cudaStat1 = cudaSuccess;
  cudaError_t cudaStat2 = cudaSuccess;
  cudaError_t cudaStat3 = cudaSuccess;

  double *d_W = NULL; /* eigenvalues on device*/
  int *d_info = NULL; /* error info */
  int  lwork = 0;     /* size of workspace */
  double *d_work = NULL; /* device workspace for syevj */
  int info = 0;       /* host copy of error info */

/* configuration of syevj  */
  const cusolverEigType_t itype = CUSOLVER_EIG_TYPE_1; //Solve H psi=S lambda psi
  const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; // compute eigenvectors.
  const cublasFillMode_t  uplo = CUBLAS_FILL_MODE_LOWER;

  /* numerical results of syevj  */
  double residual = 0;
  int executed_sweeps = 0;
  
  /* step 1: create cusolver handle, bind a stream  */
  status = cusolverDnCreate(&cusolverH);
  assert(CUSOLVER_STATUS_SUCCESS == status);

  cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
  assert(cudaSuccess == cudaStat1);

  status = cusolverDnSetStream(cusolverH, stream);
  assert(CUSOLVER_STATUS_SUCCESS == status);

  /* step 2: configuration of syevj */
  status = cusolverDnCreateSyevjInfo(&syevj_params);
  assert(CUSOLVER_STATUS_SUCCESS == status);
  
  /* default value of tolerance is machine zero */
dw113919's avatar
dw113919 committed
169
  status = cusolverDnXsyevjSetTolerance(syevj_params,tol);
170 171 172
  assert(CUSOLVER_STATUS_SUCCESS == status);
  
  /* default value of max. sweeps is 100 */
dw113919's avatar
dw113919 committed
173
  status = cusolverDnXsyevjSetMaxSweeps(syevj_params,max_sweeps);
174
  assert(CUSOLVER_STATUS_SUCCESS == status);
dw113919's avatar
dw113919 committed
175
  printf("Allocate data \n");
176 177 178 179 180
  /* step 3: copy A to device */
  cudaStat2 = cudaMalloc ((void**)&d_W, sizeof(double) * n);
  cudaStat3 = cudaMalloc ((void**)&d_info, sizeof(int));
  assert(cudaSuccess == cudaStat2);
  assert(cudaSuccess == cudaStat3);
dw113919's avatar
dw113919 committed
181
  printf("query working space \n");
182 183 184 185 186 187
  /* step 4: query working space of sygvj */
  status = cusolverDnDsygvj_bufferSize(cusolverH,itype,jobz,uplo,n,H,n,S,n,d_W,&lwork,syevj_params);
  assert(CUSOLVER_STATUS_SUCCESS == status);
  
  cudaStat1 = cudaMalloc((void**)&d_work, sizeof(double)*lwork);
  assert(cudaSuccess == cudaStat1);
dw113919's avatar
dw113919 committed
188
  printf("compute eigen-pair \n");
189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
  /* step 5: compute eigen-pair   */
  status = cusolverDnDsygvj(cusolverH,itype,jobz,uplo,n,H,n,S,n,d_W,d_work,lwork,d_info,syevj_params);
  cudaStat1 = cudaDeviceSynchronize();
  assert(CUSOLVER_STATUS_SUCCESS == status);
  assert(cudaSuccess == cudaStat1);

  cudaStat1 = cudaMemcpy(eig, d_W, sizeof(double)*ne, cudaMemcpyDeviceToHost);
  cudaStat2 = cudaMemcpy(z, H, sizeof(double)*n*ne, cudaMemcpyDeviceToHost);
  cudaStat3 = cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost);
  assert(cudaSuccess == cudaStat1);
  assert(cudaSuccess == cudaStat2);
  assert(cudaSuccess == cudaStat3);

  if ( 0 == info ){
    printf("sygvj converges \n");
  }else if ( 0 > info ){
    printf("%d-th parameter is wrong \n", -info);
    exit(1);
  }else{
    printf("WARNING: info = %d : sygvj does not converge \n", info );
  }

  status = cusolverDnXsyevjGetSweeps(cusolverH,syevj_params,&executed_sweeps);
  assert(CUSOLVER_STATUS_SUCCESS == status);

  status = cusolverDnXsyevjGetResidual(cusolverH,syevj_params,&residual);
  assert(CUSOLVER_STATUS_SUCCESS == status);

  printf("residual |A - V*W*V**H|_F = %E \n", residual );
  printf("number of executed sweeps = %d \n", executed_sweeps );

  /* free resources */
  if (d_W    ) cudaFree(d_W);
  if (d_info ) cudaFree(d_info);
  if (d_work ) cudaFree(d_work);

  if (syevj_params) cusolverDnDestroySyevjInfo(syevj_params);
dw113919's avatar
dw113919 committed
226 227
  if (stream      ) cudaStreamDestroy(stream);
  if (cusolverH   ) cusolverDnDestroy(cusolverH);   
228 229 230 231 232

  //  cudaDeviceReset();

  return ;
}