
Examples of CUDA code

1) The dot product
2) Matrix vector multiplication
3) Sparse matrix multiplication
4) Global reduction

Computing y = ax + y with a Serial Loop

void saxpy_serial(int n, float alpha, float *x, float *y)
{
    for (int i = 0; i < n; ++i)
        y[i] = alpha * x[i] + y[i];
}
// Invoke serial SAXPY kernel
saxpy_serial(n, 2.0f, x, y);

Computing y = ax + y in parallel using CUDA

__global__ void saxpy_parallel(int n, float alpha, float *x, float *y)
{
    // Each thread computes one element of y.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) y[i] = alpha * x[i] + y[i];
}
// Invoke parallel SAXPY kernel (256 threads per block)
int nblocks = (n + 255) / 256;
saxpy_parallel<<<nblocks, 256>>>(n, 2.0f, x, y);
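
A minimal host-side sketch of driving this kernel (the function run_saxpy and the names h_x, h_y, d_x, d_y are illustrative, not from the original):

#include <cuda_runtime.h>

void run_saxpy(int n, float alpha, const float *h_x, float *h_y)
{
    float *d_x, *d_y;
    size_t bytes = n * sizeof(float);

    // Allocate device memory and copy the inputs over.
    cudaMalloc(&d_x, bytes);
    cudaMalloc(&d_y, bytes);
    cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, h_y, bytes, cudaMemcpyHostToDevice);

    // Launch with 256 threads per block, rounding the grid size up.
    int nblocks = (n + 255) / 256;
    saxpy_parallel<<<nblocks, 256>>>(n, alpha, d_x, d_y);

    // Copy the result back and release device memory.
    cudaMemcpy(h_y, d_y, bytes, cudaMemcpyDeviceToHost);
    cudaFree(d_x);
    cudaFree(d_y);
}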

Computing matrix-matrix multiplication in parallel using CUDA

__global__ void mm_simple(float *C, float *A, float *B, int n)
{
    // One thread per element of the n x n result matrix C.
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < n && col < n) {  // guard threads that fall outside the matrix
        float sum = 0.0f;
        for (int k = 0; k < n; k++) {
            sum += A[row * n + k] * B[k * n + col];
        }
        C[row * n + col] = sum;
    }
}
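
Since the kernel assigns one thread per element of C, it is launched with a two-dimensional grid. A minimal launch sketch (the 16x16 block shape and the device pointers d_A, d_B, d_C are assumptions for illustration):

// Cover the n x n result matrix with 16x16 thread blocks.
dim3 block(16, 16);
dim3 grid((n + block.x - 1) / block.x, (n + block.y - 1) / block.y);
mm_simple<<<grid, block>>>(d_C, d_A, d_B, n);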

Sparse matrix representation

A =
[ 3 0 9 0 0 ]
[ 0 5 0 0 2 ]
[ 0 0 7 0 0 ]
[ 0 0 5 8 4 ]
[ 0 0 6 0 0 ]

Av = [3 9 5 2 7 5 8 4 6] = nonzero elements
Aj = [0 2 1 4 2 2 3 4 2] = column indices of the elements
Ap = [0 2 4 5 8 9] = pointers to the first element in each row
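
Written out in C, the three CSR arrays for this example matrix are (a sketch mirroring the names above):

// CSR form of the 5x5 matrix A shown above.
float Av[] = {3, 9, 5, 2, 7, 5, 8, 4, 6};  // nonzero values
int   Aj[] = {0, 2, 1, 4, 2, 2, 3, 4, 2};  // column index of each value
int   Ap[] = {0, 2, 4, 5, 8, 9};           // row i spans Av[Ap[i]] .. Av[Ap[i+1]-1]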

Serial sparse matrix / vector multiplication

float multiply_row(int rowsize,
                   int *Aj,    // column indices for row
                   float *Av,  // nonzero entries for row
                   float *x)   // the RHS vector
{
    float sum = 0;
    for (int column = 0; column < rowsize; ++column)
        sum += Av[column] * x[Aj[column]];
    return sum;
}

void csrmul_serial(int *Ap, int *Aj, float *Av, int num_rows,
                   float *x, float *y)
{
    for (int row = 0; row < num_rows; ++row)
    {
        int row_begin = Ap[row];
        int row_end = Ap[row + 1];
        // row_end - row_begin is the number of nonzeros in this row.
        y[row] = multiply_row(row_end - row_begin, Aj + row_begin,
                              Av + row_begin, x);
    }
}
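
As a quick sanity check (a sketch using the example arrays above), multiplying A by the all-ones vector should give y = [12, 7, 7, 17, 6]:

float x[5] = {1, 1, 1, 1, 1};
float y[5];
csrmul_serial(Ap, Aj, Av, 5, x, y);
// y is now {12, 7, 7, 17, 6}; e.g. row 3 gives 5 + 8 + 4 = 17.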

Parallel sparse matrix / vector multiplication

__global__ void csrmul_kernel(int *Ap, int *Aj, float *Av, int num_rows,
                              float *x, float *y)
{
    // One thread per row of the matrix.
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < num_rows)
    {
        int row_begin = Ap[row];
        int row_end = Ap[row + 1];
        // multiply_row must be declared __device__ to be callable here.
        y[row] = multiply_row(row_end - row_begin, Aj + row_begin,
                              Av + row_begin, x);
    }
}

The code to launch the above parallel kernel is:

unsigned int blocksize = 128;  // or any size up to 512
unsigned int nblocks = (num_rows + blocksize - 1) / blocksize;
csrmul_kernel<<<nblocks, blocksize>>>(Ap, Aj, Av, num_rows, x, y);

Caching in shared memory

[Figure: each thread block caches the window of x[] between block_begin and block_end in shared memory. Most of the nonzero elements are expected near the diagonal, so the row executed by a thread mostly reads from the cached window.]

__global__ void csrmul_cached(int *Ap, int *Aj, float *Av, int num_rows,
                              const float *x, float *y)
{
    // Cache the rows of x[] corresponding to this block.
    // blocksize must be a compile-time constant visible here.
    __shared__ float cache[blocksize];

    int block_begin = blockIdx.x * blockDim.x;
    int block_end = block_begin + blockDim.x;
    int row = block_begin + threadIdx.x;

    // Fetch and cache our window of x[].
    if (row < num_rows) cache[threadIdx.x] = x[row];
    __syncthreads();

    if (row < num_rows)
    {
        int row_begin = Ap[row];
        int row_end = Ap[row + 1];
        float x_j, sum = 0;
        for (int col = row_begin; col < row_end; ++col)
        {
            int j = Aj[col];
            // Fetch x_j from our cache when possible.
            if (j >= block_begin && j < block_end)
                x_j = cache[j - block_begin];
            else
                x_j = x[j];
            sum += Av[col] * x_j;
        }
        y[row] = sum;
    }
}
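
Because the kernel declares cache[blocksize] in shared memory, blocksize must be a compile-time constant defined before the kernel (e.g. at file scope). A launch sketch under that assumption:

const unsigned int blocksize = 128;  // fixed at compile time; sizes the shared array
unsigned int nblocks = (num_rows + blocksize - 1) / blocksize;
csrmul_cached<<<nblocks, blocksize>>>(Ap, Aj, Av, num_rows, x, y);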

Parallel reduction

__global__ void plus_reduce(int *input, int N, int *total)
{
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Each block loads its elements into shared memory;
    // the last block may pad with 0s.
    __shared__ int x[blocksize];
    x[tid] = (i < N) ? input[i] : 0;
    __syncthreads();

    // Build summation tree over elements.
    for (int s = blockDim.x / 2; s > 0; s = s / 2)
    {
        if (tid < s) x[tid] += x[tid + s];
        __syncthreads();
    }

    // Thread 0 adds the partial sum to the total sum.
    if (tid == 0) atomicAdd(total, x[tid]);
}
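
A hypothetical host-side driver (the names d_input and d_total are illustrative): *total must be zeroed before the launch, since every block atomically adds its partial sum into it.

int *d_total;
cudaMalloc(&d_total, sizeof(int));
cudaMemset(d_total, 0, sizeof(int));  // atomicAdd accumulates into this

int nblocks = (N + blocksize - 1) / blocksize;
plus_reduce<<<nblocks, blocksize>>>(d_input, N, d_total);

int total;
cudaMemcpy(&total, d_total, sizeof(int), cudaMemcpyDeviceToHost);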
