Actual source code: mpicuda.cu
2: /*
3: This file contains routines for Parallel vector operations.
4: */
5: #define PETSC_SKIP_SPINLOCK
7: #include <petscconf.h>
8: #include <../src/vec/vec/impls/mpi/pvecimpl.h>
9: #include <petsc/private/cudavecimpl.h>
11: /*MC
12: VECCUDA - VECCUDA = "cuda" - A VECSEQCUDA on a single-process communicator, and VECMPICUDA otherwise.
14: Options Database Keys:
15: . -vec_type cuda - sets the vector type to VECCUDA during a call to VecSetFromOptions()
17: Level: beginner
19: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateMPIWithArray(), VECSEQCUDA, VECMPICUDA, VECSTANDARD, VecType, VecCreateMPI(), VecSetPinnedMemoryMin()
20: M*/
22: PetscErrorCode VecDestroy_MPICUDA(Vec v)
23: {
24: Vec_MPI *vecmpi = (Vec_MPI*)v->data;
25: Vec_CUDA *veccuda;
27: cudaError_t err;
30: if (v->spptr) {
31: veccuda = (Vec_CUDA*)v->spptr;
32: if (veccuda->GPUarray_allocated) {
33: err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray_allocated);CHKERRCUDA(err);
34: veccuda->GPUarray_allocated = NULL;
35: }
36: if (veccuda->stream) {
37: err = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
38: }
39: if (v->pinned_memory) {
40: PetscMallocSetCUDAHost();
41: PetscFree(vecmpi->array_allocated);
42: PetscMallocResetCUDAHost();
43: v->pinned_memory = PETSC_FALSE;
44: }
45: PetscFree(v->spptr);
46: }
47: VecDestroy_MPI(v);
48: return(0);
49: }
51: PetscErrorCode VecNorm_MPICUDA(Vec xin,NormType type,PetscReal *z)
52: {
53: PetscReal sum,work = 0.0;
57: if (type == NORM_2 || type == NORM_FROBENIUS) {
58: VecNorm_SeqCUDA(xin,NORM_2,&work);
59: work *= work;
60: MPIU_Allreduce(&work,&sum,1,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));
61: *z = PetscSqrtReal(sum);
62: } else if (type == NORM_1) {
63: /* Find the local part */
64: VecNorm_SeqCUDA(xin,NORM_1,&work);
65: /* Find the global max */
66: MPIU_Allreduce(&work,z,1,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));
67: } else if (type == NORM_INFINITY) {
68: /* Find the local max */
69: VecNorm_SeqCUDA(xin,NORM_INFINITY,&work);
70: /* Find the global max */
71: MPIU_Allreduce(&work,z,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)xin));
72: } else if (type == NORM_1_AND_2) {
73: PetscReal temp[2];
74: VecNorm_SeqCUDA(xin,NORM_1,temp);
75: VecNorm_SeqCUDA(xin,NORM_2,temp+1);
76: temp[1] = temp[1]*temp[1];
77: MPIU_Allreduce(temp,z,2,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));
78: z[1] = PetscSqrtReal(z[1]);
79: }
80: return(0);
81: }
83: PetscErrorCode VecDot_MPICUDA(Vec xin,Vec yin,PetscScalar *z)
84: {
85: PetscScalar sum,work;
89: VecDot_SeqCUDA(xin,yin,&work);
90: MPIU_Allreduce(&work,&sum,1,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));
91: *z = sum;
92: return(0);
93: }
95: PetscErrorCode VecTDot_MPICUDA(Vec xin,Vec yin,PetscScalar *z)
96: {
97: PetscScalar sum,work;
101: VecTDot_SeqCUDA(xin,yin,&work);
102: MPIU_Allreduce(&work,&sum,1,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));
103: *z = sum;
104: return(0);
105: }
107: PetscErrorCode VecMDot_MPICUDA(Vec xin,PetscInt nv,const Vec y[],PetscScalar *z)
108: {
109: PetscScalar awork[128],*work = awork;
113: if (nv > 128) {
114: PetscMalloc1(nv,&work);
115: }
116: VecMDot_SeqCUDA(xin,nv,y,work);
117: MPIU_Allreduce(work,z,nv,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));
118: if (nv > 128) {
119: PetscFree(work);
120: }
121: return(0);
122: }
124: /*MC
125: VECMPICUDA - VECMPICUDA = "mpicuda" - The basic parallel vector, modified to use CUDA
127: Options Database Keys:
128: . -vec_type mpicuda - sets the vector type to VECMPICUDA during a call to VecSetFromOptions()
130: Level: beginner
132: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateMPIWithArray(), VECMPI, VecType, VecCreateMPI(), VecSetPinnedMemoryMin()
133: M*/
135: PetscErrorCode VecDuplicate_MPICUDA(Vec win,Vec *v)
136: {
138: Vec_MPI *vw,*w = (Vec_MPI*)win->data;
139: PetscScalar *array;
142: VecCreate(PetscObjectComm((PetscObject)win),v);
143: PetscLayoutReference(win->map,&(*v)->map);
145: VecCreate_MPICUDA_Private(*v,PETSC_TRUE,w->nghost,0);
146: vw = (Vec_MPI*)(*v)->data;
147: PetscMemcpy((*v)->ops,win->ops,sizeof(struct _VecOps));
149: /* save local representation of the parallel vector (and scatter) if it exists */
150: if (w->localrep) {
151: VecGetArray(*v,&array);
152: VecCreateSeqWithArray(PETSC_COMM_SELF,1,win->map->n+w->nghost,array,&vw->localrep);
153: PetscMemcpy(vw->localrep->ops,w->localrep->ops,sizeof(struct _VecOps));
154: VecRestoreArray(*v,&array);
155: PetscLogObjectParent((PetscObject)*v,(PetscObject)vw->localrep);
156: vw->localupdate = w->localupdate;
157: if (vw->localupdate) {
158: PetscObjectReference((PetscObject)vw->localupdate);
159: }
160: }
162: /* New vector should inherit stashing property of parent */
163: (*v)->stash.donotstash = win->stash.donotstash;
164: (*v)->stash.ignorenegidx = win->stash.ignorenegidx;
166: /* change type_name appropriately */
167: VecCUDAAllocateCheck(*v);
168: PetscObjectChangeTypeName((PetscObject)(*v),VECMPICUDA);
170: PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*v))->olist);
171: PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*v))->qlist);
172: (*v)->map->bs = PetscAbs(win->map->bs);
173: (*v)->bstash.bs = win->bstash.bs;
174: return(0);
175: }
177: PetscErrorCode VecDotNorm2_MPICUDA(Vec s,Vec t,PetscScalar *dp,PetscScalar *nm)
178: {
180: PetscScalar work[2],sum[2];
183: VecDotNorm2_SeqCUDA(s,t,work,work+1);
184: MPIU_Allreduce(&work,&sum,2,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)s));
185: *dp = sum[0];
186: *nm = sum[1];
187: return(0);
188: }
190: PetscErrorCode VecCreate_MPICUDA(Vec vv)
191: {
195: PetscCUDAInitializeCheck();
196: PetscLayoutSetUp(vv->map);
197: VecCUDAAllocateCheck(vv);
198: VecCreate_MPICUDA_Private(vv,PETSC_FALSE,0,((Vec_CUDA*)vv->spptr)->GPUarray_allocated);
199: VecCUDAAllocateCheckHost(vv);
200: VecSet(vv,0.0);
201: VecSet_Seq(vv,0.0);
202: vv->offloadmask = PETSC_OFFLOAD_BOTH;
203: return(0);
204: }
206: PetscErrorCode VecCreate_CUDA(Vec v)
207: {
209: PetscMPIInt size;
212: MPI_Comm_size(PetscObjectComm((PetscObject)v),&size);
213: if (size == 1) {
214: VecSetType(v,VECSEQCUDA);
215: } else {
216: VecSetType(v,VECMPICUDA);
217: }
218: return(0);
219: }
221: /*@
222: VecCreateMPICUDA - Creates a standard, parallel array-style vector for CUDA devices.
224: Collective
226: Input Parameters:
227: + comm - the MPI communicator to use
228: . n - local vector length (or PETSC_DECIDE to have calculated if N is given)
229: - N - global vector length (or PETSC_DETERMINE to have calculated if n is given)
231: Output Parameter:
232: . v - the vector
234: Notes:
235: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
236: same type as an existing vector.
238: Level: intermediate
240: .seealso: VecCreateMPICUDAWithArray(), VecCreateMPICUDAWithArrays(), VecCreateSeqCUDA(), VecCreateSeq(),
241: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost(),
242: VecCreateMPIWithArray(), VecCreateGhostWithArray(), VecMPISetGhost()
244: @*/
245: PetscErrorCode VecCreateMPICUDA(MPI_Comm comm,PetscInt n,PetscInt N,Vec *v)
246: {
250: VecCreate(comm,v);
251: VecSetSizes(*v,n,N);
252: VecSetType(*v,VECMPICUDA);
253: return(0);
254: }
256: /*@C
257: VecCreateMPICUDAWithArray - Creates a parallel, array-style vector,
258: where the user provides the GPU array space to store the vector values.
260: Collective
262: Input Parameters:
263: + comm - the MPI communicator to use
264: . bs - block size, same meaning as VecSetBlockSize()
265: . n - local vector length, cannot be PETSC_DECIDE
266: . N - global vector length (or PETSC_DECIDE to have calculated)
267: - array - the user provided GPU array to store the vector values
269: Output Parameter:
270: . vv - the vector
272: Notes:
273: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
274: same type as an existing vector.
276: If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
277: at a later stage to SET the array for storing the vector values.
279: PETSc does NOT free the array when the vector is destroyed via VecDestroy().
280: The user should not free the array until the vector is destroyed.
282: Level: intermediate
284: .seealso: VecCreateMPICUDA(), VecCreateSeqCUDAWithArray(), VecCreateMPIWithArray(), VecCreateSeqWithArray(),
285: VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost(),
286: VecCreateMPI(), VecCreateGhostWithArray(), VecPlaceArray()
288: @*/
289: PetscErrorCode VecCreateMPICUDAWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,PetscInt N,const PetscScalar array[],Vec *vv)
290: {
294: if (n == PETSC_DECIDE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Must set local size of vector");
295: PetscCUDAInitializeCheck();
296: VecCreate(comm,vv);
297: VecSetSizes(*vv,n,N);
298: VecSetBlockSize(*vv,bs);
299: VecCreate_MPICUDA_Private(*vv,PETSC_FALSE,0,array);
300: return(0);
301: }
303: /*@C
304: VecCreateMPICUDAWithArrays - Creates a parallel, array-style vector,
305: where the user provides the GPU array space to store the vector values.
307: Collective
309: Input Parameters:
310: + comm - the MPI communicator to use
311: . bs - block size, same meaning as VecSetBlockSize()
312: . n - local vector length, cannot be PETSC_DECIDE
313: . N - global vector length (or PETSC_DECIDE to have calculated)
314: - cpuarray - the user provided CPU array to store the vector values
315: - gpuarray - the user provided GPU array to store the vector values
317: Output Parameter:
318: . vv - the vector
320: Notes:
321: If both cpuarray and gpuarray are provided, the caller must ensure that
322: the provided arrays have identical values.
324: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
325: same type as an existing vector.
327: PETSc does NOT free the provided arrays when the vector is destroyed via
328: VecDestroy(). The user should not free the array until the vector is
329: destroyed.
331: Level: intermediate
333: .seealso: VecCreateSeqCUDAWithArrays(), VecCreateMPIWithArray(), VecCreateSeqWithArray(),
334: VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost(),
335: VecCreateMPI(), VecCreateGhostWithArray(), VecCUDAPlaceArray(), VecPlaceArray(),
336: VecCUDAAllocateCheckHost()
337: @*/
338: PetscErrorCode VecCreateMPICUDAWithArrays(MPI_Comm comm,PetscInt bs,PetscInt n,PetscInt N,const PetscScalar cpuarray[],const PetscScalar gpuarray[],Vec *vv)
339: {
343: VecCreateMPICUDAWithArray(comm,bs,n,N,gpuarray,vv);
345: if (cpuarray && gpuarray) {
346: Vec_MPI *s = (Vec_MPI*)((*vv)->data);
347: s->array = (PetscScalar*)cpuarray;
348: (*vv)->offloadmask = PETSC_OFFLOAD_BOTH;
349: } else if (cpuarray) {
350: Vec_MPI *s = (Vec_MPI*)((*vv)->data);
351: s->array = (PetscScalar*)cpuarray;
352: (*vv)->offloadmask = PETSC_OFFLOAD_CPU;
353: } else if (gpuarray) {
354: (*vv)->offloadmask = PETSC_OFFLOAD_GPU;
355: } else {
356: (*vv)->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
357: }
359: return(0);
360: }
362: PetscErrorCode VecMax_MPICUDA(Vec xin,PetscInt *idx,PetscReal *z)
363: {
365: PetscReal work;
368: VecMax_SeqCUDA(xin,idx,&work);
369: if (!idx) {
370: MPIU_Allreduce(&work,z,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)xin));
371: } else {
372: struct { PetscReal v; PetscInt i; } in,out;
374: in.v = work;
375: in.i = *idx + xin->map->rstart;
376: MPIU_Allreduce(&in,&out,1,MPIU_REAL_INT,MPIU_MAXLOC,PetscObjectComm((PetscObject)xin));
377: *z = out.v;
378: *idx = out.i;
379: }
380: return(0);
381: }
383: PetscErrorCode VecMin_MPICUDA(Vec xin,PetscInt *idx,PetscReal *z)
384: {
386: PetscReal work;
389: VecMin_SeqCUDA(xin,idx,&work);
390: if (!idx) {
391: MPIU_Allreduce(&work,z,1,MPIU_REAL,MPIU_MIN,PetscObjectComm((PetscObject)xin));
392: } else {
393: struct { PetscReal v; PetscInt i; } in,out;
395: in.v = work;
396: in.i = *idx + xin->map->rstart;
397: MPIU_Allreduce(&in,&out,1,MPIU_REAL_INT,MPIU_MINLOC,PetscObjectComm((PetscObject)xin));
398: *z = out.v;
399: *idx = out.i;
400: }
401: return(0);
402: }
404: PetscErrorCode VecBindToCPU_MPICUDA(Vec V,PetscBool pin)
405: {
409: V->boundtocpu = pin;
410: if (pin) {
411: VecCUDACopyFromGPU(V);
412: V->offloadmask = PETSC_OFFLOAD_CPU; /* since the CPU code will likely change values in the vector */
413: V->ops->dotnorm2 = NULL;
414: V->ops->waxpy = VecWAXPY_Seq;
415: V->ops->dot = VecDot_MPI;
416: V->ops->mdot = VecMDot_MPI;
417: V->ops->tdot = VecTDot_MPI;
418: V->ops->norm = VecNorm_MPI;
419: V->ops->scale = VecScale_Seq;
420: V->ops->copy = VecCopy_Seq;
421: V->ops->set = VecSet_Seq;
422: V->ops->swap = VecSwap_Seq;
423: V->ops->axpy = VecAXPY_Seq;
424: V->ops->axpby = VecAXPBY_Seq;
425: V->ops->maxpy = VecMAXPY_Seq;
426: V->ops->aypx = VecAYPX_Seq;
427: V->ops->axpbypcz = VecAXPBYPCZ_Seq;
428: V->ops->pointwisemult = VecPointwiseMult_Seq;
429: V->ops->setrandom = VecSetRandom_Seq;
430: V->ops->placearray = VecPlaceArray_Seq;
431: V->ops->replacearray = VecReplaceArray_SeqCUDA;
432: V->ops->resetarray = VecResetArray_Seq;
433: V->ops->dot_local = VecDot_Seq;
434: V->ops->tdot_local = VecTDot_Seq;
435: V->ops->norm_local = VecNorm_Seq;
436: V->ops->mdot_local = VecMDot_Seq;
437: V->ops->pointwisedivide = VecPointwiseDivide_Seq;
438: V->ops->getlocalvector = NULL;
439: V->ops->restorelocalvector = NULL;
440: V->ops->getlocalvectorread = NULL;
441: V->ops->restorelocalvectorread = NULL;
442: V->ops->getarraywrite = NULL;
443: V->ops->max = VecMax_MPI;
444: V->ops->min = VecMin_MPI;
445: V->ops->reciprocal = VecReciprocal_Default;
446: V->ops->sum = NULL;
447: V->ops->shift = NULL;
448: /* default random number generator */
449: PetscFree(V->defaultrandtype);
450: PetscStrallocpy(PETSCRANDER48,&V->defaultrandtype);
451: } else {
452: V->ops->dotnorm2 = VecDotNorm2_MPICUDA;
453: V->ops->waxpy = VecWAXPY_SeqCUDA;
454: V->ops->duplicate = VecDuplicate_MPICUDA;
455: V->ops->dot = VecDot_MPICUDA;
456: V->ops->mdot = VecMDot_MPICUDA;
457: V->ops->tdot = VecTDot_MPICUDA;
458: V->ops->norm = VecNorm_MPICUDA;
459: V->ops->scale = VecScale_SeqCUDA;
460: V->ops->copy = VecCopy_SeqCUDA;
461: V->ops->set = VecSet_SeqCUDA;
462: V->ops->swap = VecSwap_SeqCUDA;
463: V->ops->axpy = VecAXPY_SeqCUDA;
464: V->ops->axpby = VecAXPBY_SeqCUDA;
465: V->ops->maxpy = VecMAXPY_SeqCUDA;
466: V->ops->aypx = VecAYPX_SeqCUDA;
467: V->ops->axpbypcz = VecAXPBYPCZ_SeqCUDA;
468: V->ops->pointwisemult = VecPointwiseMult_SeqCUDA;
469: V->ops->setrandom = VecSetRandom_SeqCUDA;
470: V->ops->placearray = VecPlaceArray_SeqCUDA;
471: V->ops->replacearray = VecReplaceArray_SeqCUDA;
472: V->ops->resetarray = VecResetArray_SeqCUDA;
473: V->ops->dot_local = VecDot_SeqCUDA;
474: V->ops->tdot_local = VecTDot_SeqCUDA;
475: V->ops->norm_local = VecNorm_SeqCUDA;
476: V->ops->mdot_local = VecMDot_SeqCUDA;
477: V->ops->destroy = VecDestroy_MPICUDA;
478: V->ops->pointwisedivide = VecPointwiseDivide_SeqCUDA;
479: V->ops->getlocalvector = VecGetLocalVector_SeqCUDA;
480: V->ops->restorelocalvector = VecRestoreLocalVector_SeqCUDA;
481: V->ops->getlocalvectorread = VecGetLocalVectorRead_SeqCUDA;
482: V->ops->restorelocalvectorread = VecRestoreLocalVectorRead_SeqCUDA;
483: V->ops->getarraywrite = VecGetArrayWrite_SeqCUDA;
484: V->ops->getarray = VecGetArray_SeqCUDA;
485: V->ops->restorearray = VecRestoreArray_SeqCUDA;
486: V->ops->getarrayandmemtype = VecGetArrayAndMemType_SeqCUDA;
487: V->ops->restorearrayandmemtype = VecRestoreArrayAndMemType_SeqCUDA;
488: V->ops->max = VecMax_MPICUDA;
489: V->ops->min = VecMin_MPICUDA;
490: V->ops->reciprocal = VecReciprocal_SeqCUDA;
491: V->ops->sum = VecSum_SeqCUDA;
492: V->ops->shift = VecShift_SeqCUDA;
493: /* default random number generator */
494: PetscFree(V->defaultrandtype);
495: PetscStrallocpy(PETSCCURAND,&V->defaultrandtype);
496: }
497: return(0);
498: }
500: PetscErrorCode VecCreate_MPICUDA_Private(Vec vv,PetscBool alloc,PetscInt nghost,const PetscScalar array[])
501: {
503: Vec_CUDA *veccuda;
506: VecCreate_MPI_Private(vv,PETSC_FALSE,0,0);
507: PetscObjectChangeTypeName((PetscObject)vv,VECMPICUDA);
509: VecBindToCPU_MPICUDA(vv,PETSC_FALSE);
510: vv->ops->bindtocpu = VecBindToCPU_MPICUDA;
512: /* Later, functions check for the Vec_CUDA structure existence, so do not create it without array */
513: if (alloc && !array) {
514: VecCUDAAllocateCheck(vv);
515: VecCUDAAllocateCheckHost(vv);
516: VecSet(vv,0.0);
517: VecSet_Seq(vv,0.0);
518: vv->offloadmask = PETSC_OFFLOAD_BOTH;
519: }
520: if (array) {
521: if (!vv->spptr) {
522: PetscReal pinned_memory_min;
523: PetscBool flag;
524: /* Cannot use PetscNew() here because spptr is void* */
525: PetscCalloc(sizeof(Vec_CUDA),&vv->spptr);
526: veccuda = (Vec_CUDA*)vv->spptr;
527: vv->minimum_bytes_pinned_memory = 0;
529: /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
530: Note: This same code duplicated in VecCreate_SeqCUDA_Private() and VecCUDAAllocateCheck(). Is there a good way to avoid this? */
531: PetscOptionsBegin(PetscObjectComm((PetscObject)vv),((PetscObject)vv)->prefix,"VECCUDA Options","Vec");
532: pinned_memory_min = vv->minimum_bytes_pinned_memory;
533: PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&flag);
534: if (flag) vv->minimum_bytes_pinned_memory = pinned_memory_min;
535: PetscOptionsEnd();
536: }
537: veccuda = (Vec_CUDA*)vv->spptr;
538: veccuda->GPUarray = (PetscScalar*)array;
539: vv->offloadmask = PETSC_OFFLOAD_GPU;
540: }
541: return(0);
542: }