Actual source code: bvcuda.cu
slepc-3.23.1 2025-05-01
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   CUDA-related code common to several BV impls
*/

#include <slepc/private/bvimpl.h>
#include <slepccupmblas.h>

#define BLOCKSIZE 64

/*
   C := alpha*A*B + beta*C
*/
PetscErrorCode BVMult_BLAS_CUDA(BV,PetscInt m_,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscScalar beta,PetscScalar *d_C,PetscInt ldc_)
{
  PetscCuBLASInt m=0,n=0,k=0,lda=0,ldb=0,ldc=0;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscCuBLASIntCast(m_,&m));
  PetscCall(PetscCuBLASIntCast(n_,&n));
  PetscCall(PetscCuBLASIntCast(k_,&k));
  PetscCall(PetscCuBLASIntCast(lda_,&lda));
  PetscCall(PetscCuBLASIntCast(ldb_,&ldb));
  PetscCall(PetscCuBLASIntCast(ldc_,&ldc));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,lda,d_B,ldb,&beta,d_C,ldc));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*m*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}
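
/*
   Usage sketch (hypothetical caller, not part of this file): assuming bv is the
   target BV, d_X holds k device-resident columns of local length n, d_Q is a
   device-resident k x l coefficient matrix, and d_Y holds l output columns,
   the update Y := Y + X*Q is a single call:

     PetscCall(BVMult_BLAS_CUDA(bv,n,l,k,1.0,d_X,ldx,d_Q,k,1.0,d_Y,ldy));
*/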

/*
   y := alpha*A*x + beta*y
*/
PetscErrorCode BVMultVec_BLAS_CUDA(BV,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_x,PetscScalar beta,PetscScalar *d_y)
{
  PetscCuBLASInt n=0,k=0,lda=0,one=1;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscCuBLASIntCast(n_,&n));
  PetscCall(PetscCuBLASIntCast(k_,&k));
  PetscCall(PetscCuBLASIntCast(lda_,&lda));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,lda,d_x,one,&beta,d_y,one));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}
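
/*
   Usage sketch (hypothetical names): accumulating a linear combination of the
   k columns of d_A (each of local length n) with device-resident coefficients
   d_q into an existing device vector d_y amounts to:

     PetscCall(BVMultVec_BLAS_CUDA(bv,n,k,1.0,d_A,lda,d_q,1.0,d_y));
*/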

/*
   A(:,s:e-1) := A*B(:,s:e-1)
*/
PetscErrorCode BVMultInPlace_BLAS_CUDA(BV,PetscInt m_,PetscInt k_,PetscInt s,PetscInt e,PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscBool btrans)
{
  const PetscScalar *d_B1;
  PetscScalar       *d_work,sone=1.0,szero=0.0;
  PetscCuBLASInt    m=0,n=0,k=0,l=0,lda=0,ldb=0,bs=BLOCKSIZE;
  size_t            freemem,totmem;
  cublasHandle_t    cublasv2handle;
  cublasOperation_t bt;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscCuBLASIntCast(m_,&m));
  PetscCall(PetscCuBLASIntCast(e-s,&n));
  PetscCall(PetscCuBLASIntCast(k_,&k));
  PetscCall(PetscCuBLASIntCast(lda_,&lda));
  PetscCall(PetscCuBLASIntCast(ldb_,&ldb));
  PetscCall(PetscLogGpuTimeBegin());
  if (PetscUnlikely(btrans)) {
    d_B1 = d_B+s;
    bt   = CUBLAS_OP_C;
  } else {
    d_B1 = d_B+s*ldb;
    bt   = CUBLAS_OP_N;
  }
  /* try to allocate the whole matrix */
  PetscCallCUDA(cudaMemGetInfo(&freemem,&totmem));
  if (freemem>=lda*n*sizeof(PetscScalar)) {
    PetscCallCUDA(cudaMalloc((void**)&d_work,lda*n*sizeof(PetscScalar)));
    PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,bt,m,n,k,&sone,d_A,lda,d_B1,ldb,&szero,d_work,lda));
    PetscCallCUDA(cudaMemcpy2D(d_A+s*lda,lda*sizeof(PetscScalar),d_work,lda*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice));
  } else {
    /* not enough free memory: blocked computation, first a remainder of m%bs rows, then chunks of bs rows */
    PetscCall(PetscCuBLASIntCast(freemem/(m*sizeof(PetscScalar)),&bs));
    PetscCallCUDA(cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar)));
    PetscCall(PetscCuBLASIntCast(m % bs,&l));
    if (l) {
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,bt,l,n,k,&sone,d_A,lda,d_B1,ldb,&szero,d_work,l));
      PetscCallCUDA(cudaMemcpy2D(d_A+s*lda,lda*sizeof(PetscScalar),d_work,l*sizeof(PetscScalar),l*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice));
    }
    for (;l<m;l+=bs) {
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,bt,bs,n,k,&sone,d_A+l,lda,d_B1,ldb,&szero,d_work,bs));
      PetscCallCUDA(cudaMemcpy2D(d_A+l+s*lda,lda*sizeof(PetscScalar),d_work,bs*sizeof(PetscScalar),bs*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice));
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCallCUDA(cudaFree(d_work));
  PetscCall(PetscLogGpuFlops(2.0*m*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}
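
/*
   Blocking example (illustrative numbers): if m=1000 rows must be updated but
   free memory only allows a workspace of bs=300 rows, the code first processes
   the remainder l = m % bs = 100 rows, then three chunks of 300 rows starting
   at row offsets 100, 400 and 700. Every gemm after the remainder therefore
   operates on exactly bs rows, and d_work never exceeds bs*n scalars.
*/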

/*
   B := alpha*A + beta*B
*/
PetscErrorCode BVAXPY_BLAS_CUDA(BV,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,PetscScalar beta,PetscScalar *d_B,PetscInt ldb_)
{
  PetscCuBLASInt n=0,k=0,lda=0,ldb=0;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscCuBLASIntCast(n_,&n));
  PetscCall(PetscCuBLASIntCast(k_,&k));
  PetscCall(PetscCuBLASIntCast(lda_,&lda));
  PetscCall(PetscCuBLASIntCast(ldb_,&ldb));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXgeam(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,k,&alpha,d_A,lda,&beta,d_B,ldb,d_B,ldb));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops((beta==(PetscScalar)1.0)?2.0*n*k:3.0*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}
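
/*
   Design note: cublasXgeam performs the matrix addition in a single pass, so no
   temporary copy of A or B is needed. With beta=1 each entry costs two flops
   (b_ij += alpha*a_ij); any other beta also scales B, adding a third flop per
   entry, which is exactly the two cases distinguished in the flop count above.
*/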

/*
   C := A'*B

   C is a CPU array
*/
PetscErrorCode BVDot_BLAS_CUDA(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscScalar *C,PetscInt ldc_,PetscBool mpi)
{
  PetscScalar    *d_work,sone=1.0,szero=0.0,*CC;
  PetscInt       j;
  PetscCuBLASInt m=0,n=0,k=0,lda=0,ldb=0,ldc=0;
  PetscMPIInt    len;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscCuBLASIntCast(m_,&m));
  PetscCall(PetscCuBLASIntCast(n_,&n));
  PetscCall(PetscCuBLASIntCast(k_,&k));
  PetscCall(PetscCuBLASIntCast(lda_,&lda));
  PetscCall(PetscCuBLASIntCast(ldb_,&ldb));
  PetscCall(PetscCuBLASIntCast(ldc_,&ldc));
  PetscCallCUDA(cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar)));
  if (mpi) {
    if (ldc==m) { /* the result is contiguous: reduce directly into C */
      PetscCall(BVAllocateWork_Private(bv,m*n));
      if (k) {
        PetscCall(PetscLogGpuTimeBegin());
        PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,ldc));
        PetscCall(PetscLogGpuTimeEnd());
        PetscCallCUDA(cudaMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
        PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar)));
      } else PetscCall(PetscArrayzero(bv->work,m*n));
      PetscCall(PetscMPIIntCast(m*n,&len));
      PetscCallMPI(MPIU_Allreduce(bv->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
    } else { /* reduce into a contiguous workspace CC, then copy into C column by column */
      PetscCall(BVAllocateWork_Private(bv,2*m*n));
      CC = bv->work+m*n;
      if (k) {
        PetscCall(PetscLogGpuTimeBegin());
        PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,m));
        PetscCall(PetscLogGpuTimeEnd());
        PetscCallCUDA(cudaMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
        PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar)));
      } else PetscCall(PetscArrayzero(bv->work,m*n));
      PetscCall(PetscMPIIntCast(m*n,&len));
      PetscCallMPI(MPIU_Allreduce(bv->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
      for (j=0;j<n;j++) PetscCall(PetscArraycpy(C+j*ldc,CC+j*m,m));
    }
  } else { /* sequential case: copy the GPU result straight into C */
    if (k) {
      PetscCall(BVAllocateWork_Private(bv,m*n));
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,m));
      PetscCall(PetscLogGpuTimeEnd());
      PetscCallCUDA(cudaMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
      PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar)));
      for (j=0;j<n;j++) PetscCall(PetscArraycpy(C+j*ldc,bv->work+j*m,m));
    }
  }
  PetscCallCUDA(cudaFree(d_work));
  PetscCall(PetscLogGpuFlops(2.0*m*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}
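
/*
   Parallel pattern (illustrative): each rank computes its local block A'*B on
   the GPU and the global product is the sum over ranks. For example, with two
   ranks owning the top and bottom halves of the rows, A'*B = A_0'*B_0 + A_1'*B_1,
   which is exactly what the MPI reduction of bv->work assembles on every rank.
*/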

/*
   y := A'*x

   y is a CPU array; if NULL, bv->buffer is used as a workspace
*/
PetscErrorCode BVDotVec_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_x,PetscScalar *y,PetscBool mpi)
{
  PetscScalar    *d_work,szero=0.0,sone=1.0,*yy;
  PetscCuBLASInt n=0,k=0,lda=0,one=1;
  PetscMPIInt    len;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscCuBLASIntCast(n_,&n));
  PetscCall(PetscCuBLASIntCast(k_,&k));
  PetscCall(PetscCuBLASIntCast(lda_,&lda));
  if (!y) PetscCall(VecCUDAGetArrayWrite(bv->buffer,&d_work));
  else PetscCallCUDA(cudaMalloc((void**)&d_work,k*sizeof(PetscScalar)));
  if (mpi) {
    PetscCall(BVAllocateWork_Private(bv,k));
    if (n) {
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUBLAS(cublasXgemv(cublasv2handle,CUBLAS_OP_C,n,k,&sone,d_A,lda,d_x,one,&szero,d_work,one));
      PetscCall(PetscLogGpuTimeEnd());
      PetscCallCUDA(cudaMemcpy(bv->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
      PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar)));
    } else PetscCall(PetscArrayzero(bv->work,k));
    /* reduction */
    PetscCall(PetscMPIIntCast(k,&len));
    if (!y) {
      if (use_gpu_aware_mpi) { /* case 1: reduce on GPU using a temporary buffer */
        PetscCallCUDA(cudaMalloc((void**)&yy,k*sizeof(PetscScalar)));
        PetscCallMPI(MPIU_Allreduce(d_work,yy,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
        PetscCallCUDA(cudaMemcpy(d_work,yy,k*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
        PetscCallCUDA(cudaFree(yy));
      } else { /* case 2: reduce on CPU, copy result back to GPU */
        PetscCall(BVAllocateWork_Private(bv,2*k));
        yy = bv->work+k;
        PetscCallCUDA(cudaMemcpy(bv->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
        PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar)));
        PetscCallMPI(MPIU_Allreduce(bv->work,yy,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
        PetscCallCUDA(cudaMemcpy(d_work,yy,k*sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCall(PetscLogCpuToGpu(k*sizeof(PetscScalar)));
      }
      PetscCall(VecCUDARestoreArrayWrite(bv->buffer,&d_work));
    } else { /* case 3: user-provided array y, reduce on CPU */
      PetscCallCUDA(cudaFree(d_work));
      PetscCallMPI(MPIU_Allreduce(bv->work,y,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
    }
  } else {
    if (n) {
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUBLAS(cublasXgemv(cublasv2handle,CUBLAS_OP_C,n,k,&sone,d_A,lda,d_x,one,&szero,d_work,one));
      PetscCall(PetscLogGpuTimeEnd());
    }
    if (!y) PetscCall(VecCUDARestoreArrayWrite(bv->buffer,&d_work));
    else {
      PetscCallCUDA(cudaMemcpy(y,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
      PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar)));
      PetscCallCUDA(cudaFree(d_work));
    }
  }
  PetscCall(PetscLogGpuFlops(2.0*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}
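
/*
   The three parallel cases above, summarized:
     1. y == NULL, GPU-aware MPI:  gemv, then Allreduce directly on device buffers
     2. y == NULL, regular MPI:    gemv, copy to host, Allreduce, copy back to GPU
     3. y given by the caller:     gemv, copy to host, Allreduce into y
   Case 1 trades a temporary device buffer of k scalars for the two host
   transfers of case 2.
*/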

/*
   Scale n scalars
*/
PetscErrorCode BVScale_BLAS_CUDA(BV,PetscInt n_,PetscScalar *d_A,PetscScalar alpha)
{
  PetscCuBLASInt n=0,one=1;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCuBLASIntCast(n_,&n));
  if (PetscUnlikely(alpha == (PetscScalar)0.0)) PetscCallCUDA(cudaMemset(d_A,0,n*sizeof(PetscScalar)));
  else if (alpha != (PetscScalar)1.0) {
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXscal(cublasv2handle,n,&alpha,d_A,one));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogGpuFlops(1.0*n));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   Compute 2-norm of vector consisting of n scalars
*/
PetscErrorCode BVNorm_BLAS_CUDA(BV,PetscInt n_,const PetscScalar *d_A,PetscReal *nrm)
{
  PetscCuBLASInt n=0,one=1;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCuBLASIntCast(n_,&n));
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXnrm2(cublasv2handle,n,d_A,one,nrm));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   Normalize the columns of A
*/
PetscErrorCode BVNormalize_BLAS_CUDA(BV,PetscInt m_,PetscInt n_,PetscScalar *d_A,PetscInt lda_,PetscScalar *eigi)
{
  PetscInt       j,k;
  PetscReal      nrm,nrm1;
  PetscScalar    alpha;
  PetscCuBLASInt m=0,one=1;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCuBLASIntCast(m_,&m));
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscLogGpuTimeBegin());
  for (j=0;j<n_;j++) {
    k = 1;
#if !defined(PETSC_USE_COMPLEX)
    if (eigi && eigi[j] != 0.0) k = 2;  /* in real arithmetic, a complex conjugate pair spans two columns */
#endif
    PetscCallCUBLAS(cublasXnrm2(cublasv2handle,m,d_A+j*lda_,one,&nrm));
    if (k==2) { /* combine the norms of both columns of the pair */
      PetscCallCUBLAS(cublasXnrm2(cublasv2handle,m,d_A+(j+1)*lda_,one,&nrm1));
      nrm = SlepcAbs(nrm,nrm1);
    }
    alpha = 1.0/nrm;
    PetscCallCUBLAS(cublasXscal(cublasv2handle,m,&alpha,d_A+j*lda_,one));
    if (k==2) {
      PetscCallCUBLAS(cublasXscal(cublasv2handle,m,&alpha,d_A+(j+1)*lda_,one));
      j++;
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(3.0*m*n_));
  PetscFunctionReturn(PETSC_SUCCESS);
}
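
/*
   Example (real arithmetic): a complex conjugate eigenvector pair is stored as
   two consecutive real columns v_r, v_i (flagged by a nonzero eigi[j]), and
   both must be scaled by the same factor 1/sqrt(||v_r||^2+||v_i||^2), computed
   above as 1/SlepcAbs(nrm,nrm1); that is why the loop advances past both
   columns at once after scaling them.
*/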

/*
   BV_CleanCoefficients_CUDA - Sets to zero all entries of column j of the bv buffer
*/
PetscErrorCode BV_CleanCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h)
{
  PetscScalar *d_hh,*d_a;
  PetscInt    i;

  PetscFunctionBegin;
  if (!h) {
    PetscCall(VecCUDAGetArray(bv->buffer,&d_a));
    PetscCall(PetscLogGpuTimeBegin());
    d_hh = d_a + j*(bv->nc+bv->m);
    PetscCallCUDA(cudaMemset(d_hh,0,(bv->nc+j)*sizeof(PetscScalar)));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArray(bv->buffer,&d_a));
  } else { /* cpu memory */
    for (i=0;i<bv->nc+j;i++) h[i] = 0.0;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   BV_AddCoefficients_CUDA - Add the contents of the scratch (0-th column) of the bv buffer
   into column j of the bv buffer
*/
PetscErrorCode BV_AddCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *c)
{
  PetscScalar    *d_h,*d_c,sone=1.0;
  PetscInt       i;
  PetscCuBLASInt idx=0,one=1;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  if (!h) {
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(VecCUDAGetArray(bv->buffer,&d_c));
    d_h = d_c + j*(bv->nc+bv->m);
    PetscCall(PetscCuBLASIntCast(bv->nc+j,&idx));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle,idx,&sone,d_c,one,d_h,one));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogGpuFlops(1.0*(bv->nc+j)));
    PetscCall(VecCUDARestoreArray(bv->buffer,&d_c));
  } else { /* cpu memory */
    for (i=0;i<bv->nc+j;i++) h[i] += c[i];
    PetscCall(PetscLogFlops(1.0*(bv->nc+j)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   BV_SetValue_CUDA - Sets value in row j (counted after the constraints) of column k
   of the coefficients array
*/
PetscErrorCode BV_SetValue_CUDA(BV bv,PetscInt j,PetscInt k,PetscScalar *h,PetscScalar value)
{
  PetscScalar *d_h,*a;

  PetscFunctionBegin;
  if (!h) {
    PetscCall(VecCUDAGetArray(bv->buffer,&a));
    PetscCall(PetscLogGpuTimeBegin());
    d_h = a + k*(bv->nc+bv->m) + bv->nc+j;
    PetscCallCUDA(cudaMemcpy(d_h,&value,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCall(PetscLogCpuToGpu(sizeof(PetscScalar)));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArray(bv->buffer,&a));
  } else { /* cpu memory */
    h[bv->nc+j] = value;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   BV_SquareSum_CUDA - Returns the value h'*h, where h represents the contents of the
   coefficients array (up to position j)
*/
PetscErrorCode BV_SquareSum_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *sum)
{
  const PetscScalar *d_h;
  PetscScalar       dot;
  PetscInt          i;
  PetscCuBLASInt    idx=0,one=1;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  if (!h) {
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_h));
    PetscCall(PetscCuBLASIntCast(bv->nc+j,&idx));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXdot(cublasv2handle,idx,d_h,one,d_h,one,&dot));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogGpuFlops(2.0*(bv->nc+j)));
    *sum = PetscRealPart(dot);
    PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_h));
  } else { /* cpu memory */
    *sum = 0.0;
    for (i=0;i<bv->nc+j;i++) *sum += PetscRealPart(h[i]*PetscConj(h[i]));
    PetscCall(PetscLogFlops(2.0*(bv->nc+j)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* pointwise multiplication */
static __global__ void PointwiseMult_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
{
  PetscInt x;

  /* xcount selects which chunk of gridDim.x*blockDim.x entries this launch covers */
  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  if (x<n) a[x] *= PetscRealPart(b[x]);
}

/* pointwise division */
static __global__ void PointwiseDiv_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
{
  PetscInt x;

  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  if (x<n) a[x] /= PetscRealPart(b[x]);
}
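
/*
   Launch scheme sketch: SlepcKernelSetGrid1D limits the grid size, so a single
   launch covers at most gridDim.x*blockDim.x entries and the caller issues
   xcount launches to reach index n-1. A serial analogue of the indexing,
   assuming P = gridDim.x*blockDim.x threads per launch (illustrative only):

     for (i=0;i<xcount;i++)       // one kernel launch per pass
       for (t=0;t<P;t++) {        // all threads of that launch
         x = i*P + t;             // same formula as in the kernels above
         if (x<n) a[x] *= PetscRealPart(b[x]);
       }
*/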

/*
   BV_ApplySignature_CUDA - Computes the pointwise product h*omega, where h represents
   the contents of the coefficients array (up to position j) and omega is the signature;
   if inverse=TRUE then the operation is h/omega
*/
PetscErrorCode BV_ApplySignature_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscBool inverse)
{
  PetscScalar       *d_h;
  const PetscScalar *d_omega,*omega;
  PetscInt          i,xcount;
  dim3              blocks3d, threads3d;

  PetscFunctionBegin;
  if (!(bv->nc+j)) PetscFunctionReturn(PETSC_SUCCESS);
  if (!h) {
    PetscCall(VecCUDAGetArray(bv->buffer,&d_h));
    PetscCall(VecCUDAGetArrayRead(bv->omega,&d_omega));
    PetscCall(SlepcKernelSetGrid1D(bv->nc+j,&blocks3d,&threads3d,&xcount));
    PetscCall(PetscLogGpuTimeBegin());
    if (inverse) {
      for (i=0;i<xcount;i++) PointwiseDiv_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
    } else {
      for (i=0;i<xcount;i++) PointwiseMult_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
    }
    PetscCallCUDA(cudaGetLastError());
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogGpuFlops(1.0*(bv->nc+j)));
    PetscCall(VecCUDARestoreArrayRead(bv->omega,&d_omega));
    PetscCall(VecCUDARestoreArray(bv->buffer,&d_h));
  } else {
    PetscCall(VecGetArrayRead(bv->omega,&omega));
    if (inverse) for (i=0;i<bv->nc+j;i++) h[i] /= PetscRealPart(omega[i]);
    else for (i=0;i<bv->nc+j;i++) h[i] *= PetscRealPart(omega[i]);
    PetscCall(VecRestoreArrayRead(bv->omega,&omega));
    PetscCall(PetscLogFlops(1.0*(bv->nc+j)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
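
/*
   Example: with signature omega = (1,1,-1) and coefficients h = (h_0,h_1,h_2),
   the product gives (h_0,h_1,-h_2); with inverse=PETSC_TRUE the entries are
   divided instead, which coincides for a +/-1 signature since 1/omega_i = omega_i.
*/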

/*
   BV_SquareRoot_CUDA - Returns the square root of position j (counted after the constraints)
   of the coefficients array
*/
PetscErrorCode BV_SquareRoot_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *beta)
{
  const PetscScalar *d_h;
  PetscScalar       hh;

  PetscFunctionBegin;
  if (!h) {
    PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_h));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUDA(cudaMemcpy(&hh,d_h+bv->nc+j,sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    PetscCall(PetscLogGpuToCpu(sizeof(PetscScalar)));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(BV_SafeSqrt(bv,hh,beta));
    PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_h));
  } else PetscCall(BV_SafeSqrt(bv,h[bv->nc+j],beta));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   BV_StoreCoefficients_CUDA - Copy the contents of the coefficients array to an array dest
   provided by the caller (only values from l to j are copied)
*/
PetscErrorCode BV_StoreCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *dest)
{
  const PetscScalar *d_h,*d_a;
  PetscInt          i;

  PetscFunctionBegin;
  if (!h) {
    PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_a));
    PetscCall(PetscLogGpuTimeBegin());
    /* skip the bv->nc constraint entries and the first bv->l values of column j */
    d_h = d_a + j*(bv->nc+bv->m)+bv->nc;
    PetscCallCUDA(cudaMemcpy(dest,d_h+bv->l,(j-bv->l)*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    PetscCall(PetscLogGpuToCpu((j-bv->l)*sizeof(PetscScalar)));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_a));
  } else {
    for (i=bv->l;i<j;i++) dest[i-bv->l] = h[bv->nc+i];
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
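
/*
   Example for the copy above: with bv->l=2, j=5 and no constraints (bv->nc=0),
   the three scalars at positions 2..4 of column 5 of the buffer are copied into
   dest[0..2], matching the loop in the CPU branch.
*/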