Académique Documents
Professionnel Documents
Culture Documents
Register as Cache?
Volatile
Qualifier
!
Volatile
qualifier
// Demo WITHOUT volatile: the two identical reads of array[threadIdx.x] may be
// fused by the compiler, so the second read can miss the store to
// array[threadIdx.x+1] made by a neighboring thread.
// (Slide note: "identical reads - compiler optimized this read away".)
// NOTE(review): assumes `array` is a file-scope __device__/__shared__ int
// array declared elsewhere in this file -- confirm.
__global__ void kernelFunc(int* result)
{
    int temp1;
    int temp2;
    if(threadIdx.x<warpSize)
    {
        temp1=array[threadIdx.x];       // first read
        array[threadIdx.x+1]=2;         // updates a neighbor's element
        temp2=array[threadIdx.x];       // compiler may reuse temp1 here
        result[threadIdx.x]=temp1*temp2;
    }
}
Volatile
Qualifier
!
Volatile
qualifier
// What the compiler effectively generates for the previous example: a single
// load, cached in a register and reused for both temp1 and temp2, so the
// intervening store to array[threadIdx.x+1] is never observed.
// NOTE(review): `array` is assumed declared at file scope elsewhere.
__global__ void kernelFunc(int* result)
{
    int temp1;
    int temp2;
    if(threadIdx.x<warpSize)
    {
        int temp=array[threadIdx.x];    // one load serves both reads
        temp1=temp; array[threadIdx.x+1]=2;
        temp2=temp; result[threadIdx.x]=temp1*temp2;
    }
}
Volatile
Qualifier
!
Volatile
qualifier
// Workaround 1: a barrier between the store and the second read forces the
// compiler to re-load array[threadIdx.x] (and makes the store visible).
// NOTE(review): __syncthreads() here is inside a divergent branch -- this is
// only safe if blockDim.x <= warpSize so every thread takes the same path;
// otherwise the barrier is undefined behavior. Confirm the launch config.
__global__ void kernelFunc(int* result)
{
    int temp1;
    int temp2;
    if(threadIdx.x<warpSize)
    {
        temp1=array[threadIdx.x]*1;
        array[threadIdx.x+1]=2;
        __syncthreads();                // barrier: second read must be redone
        temp2=array[threadIdx.x]*2;
        result[threadIdx.x]=temp1*temp2;
    }
}
Volatile
Qualifier
!
Volatile
qualifier
// Workaround 2: volatile locals. Each assignment to a volatile variable must
// actually be performed, so the compiler cannot fuse the two reads of
// array[threadIdx.x] into one.
// NOTE(review): the more conventional fix is declaring `array` itself
// volatile; slide shows volatile on the temporaries -- confirm intent.
__global__ void kernelFunc(int* result)
{
    volatile int temp1;
    volatile int temp2;
    if(threadIdx.x<warpSize)
    {
        temp1=array[threadIdx.x]*1;
        array[threadIdx.x+1]=2;
        temp2=array[threadIdx.x]*2;
        result[threadIdx.x]=temp1*temp2;
    }
}
Data Prefetch
Data
Prefetch
!
Hide
memory
latency
by
overlapping
loading
and
computing
-
double
buffer
is
traditional
software
pipeline
technique
load
blue
block
to
shared
memory
Nd
Md
Pdsub
Data
Prefetch
!
Hide
memory
latency
by
overlapping
loading
and
computing
-
double
buffer
is
traditional
software
pipeline
technique
for
loop
{
load
data
from
global
to
shared
memory
synchronize
block
compute
data
in
the
shared
memory
synchronize
block
}
Data
Prefetch
!
Hide
memory
latency
by
overlapping
loading
and
computing
-
double
buffer
is
traditional
software
pipeline
technique
load
data
from
global
memory
to
registers
for
loop
{
store
data
from
register
to
shared
memory
synchronize
block
load
data
from
global
memory
to
registers
compute
data
in
the
shared
memory
synchronize
block
}
very
small
overhead
computing
and
loading
overlap
both
memory
are
very
fast
register
and
shared
are
independent
10
Data
Prefetch
!
Matrix-matrix
multiplication
11
Constant Memory
Constant
Memory
!
Where
is
constant
memory?
-
data
is
stored
in
the
device
global
memory
-
read
data
through
multiprocessor
constant
cache
-
64KB
constant
memory
and
8KB
cache
for
each
multiprocessor
!
How
about
the
performance?
-
optimized
when
warp
of
threads
read
same
location
-
4
bytes
per
cycle
through
broadcasting
to
warp
of
threads
-
serialized
when
warp
of
threads
read
in
different
location
-
very
slow
when
cache
miss
(read
data
from
global
memory)
-
access
latency
can
range
from
one
to
hundreds
clock
cycles
13
Constant
Memory
!
How
to
use
constant
memory?
-
declare
constant
memory
on
the
file
scope
(global
variable)
-
copy
data
to
constant
memory
by
host
(because
it
is
constant!!)
//declare constant memory at file scope (global variable); `size` must be a
//compile-time constant -- total __constant__ memory is 64KB per device
__constant__ float cst_ptr[size];
//copy data from host to constant memory; device code cannot write it
//(host-only API: cudaMemcpyToSymbol)
cudaMemcpyToSymbol(cst_ptr,host_ptr,data_size);
14
Constant
Memory
//declare constant memory at file scope (64KB total per device)
__constant__ float cangle[360];

//host driver: fills a 360-entry angle table (degrees -> radians) and uploads
//it to constant memory, then launches test_kernel (defined elsewhere).
int main(int argc,char** argv)
{
    int size=3200;
    float* darray;
    float hangle[360];

    //allocate device memory
    cudaMalloc((void**)&darray,sizeof(float)*size);
    //initialize allocated memory to zero
    cudaMemset(darray,0,sizeof(float)*size);

    //initialize angle array on host: hangle[i] = i degrees in radians
    //(acosf(-1.0f) == pi; float overload avoids a double-precision round trip)
    for(int loop=0;loop<360;loop++)
        hangle[loop]=acosf(-1.0f)*loop/180.0f;

    //copy host angle data to constant memory (host-side only)
    cudaMemcpyToSymbol(cangle,hangle,sizeof(float)*360);

    //execute device kernel; size is a multiple of 64 so the grid is exact
    test_kernel<<<size/64,64>>>(darray);

    //free device memory
    cudaFree(darray);
    return 0;
}
16
Texture Memory
Texture
Memory
!
Texture
mapping
18
Texture
Memory
!
Texture
mapping
19
Texture
Memory
!
Texture
filtering
nearest-neighborhood
interpolation
20
Texture
Memory
!
Texture
filtering
linear/bilinear/trilinear
interpolation
21
Texture
Memory
!
Texture
filtering
two
times
bilinear
interpolation
22
Texture
Memory
Host
Input Assembler
SP
SP
SP
TF
SP
SP
TF
L1
TF
L1
SP
SP
SP
SP
SP
TF
L1
L1
L2
FB
SP
TF
TF
L1
L2
FB
SP
Work Distribution
SP
TF
L1
L2
FB
SP
L1
L2
FB
23
SP
TF
L1
L2
FB
SP
Thread Processor
L2
FB
Texture Memory
24
Texture Memory
25
Texture
Memory
Host
Input Assembler
SP
SP
SP
TF
SP
SP
TF
L1
TF
L1
SP
SP
SP
SP
SP
TF
L1
L1
L2
FB
SP
TF
TF
L1
L2
FB
SP
Work Distribution
SP
TF
L1
L2
FB
SP
SP
TF
L1
L2
FB
SP
L1
L2
FB
Thread Processor
L2
FB
read only texture L2 cache for all TPC read only texture L1 cache for each TPC
26
Texture Memory
27
Texture
Memory
!
Texture
is
an
object
for
reading
data
-
data
is
stored
on
the
device
global
memory
-
global
memory
is
bound
with
texture
cache
SP
SP
TF
SP
SP
TF
L1
TF
L1
L2
FB
SP
SP
SP
SP
TF
TF
L1
L2
FB
SP
SP
TF
L1
L1
SP
L2
SP
TF
L1
28
L2
FB
SP
SP
TF
L1
L2
FB global memory FB
SP
Thread Processor
SP
L1
L2
Texture
Memory
!
Data
caching
-
helpful
when
global
memory
coalescing
is
the
main
bottleneck
SP
SP
TF
SP
SP
TF
L1
TF
L1
SP
SP
SP
SP
SP
TF
L1
L1
L2
FB
SP
TF
TF
L1
L2
FB
SP
SP
TF
L1
L2
FB
30
SP
SP
TF
L1
L2
FB
SP
L1
L2
FB
Thread Processor
SP
L2
FB
Texture
Memory
!
Data
filtering
-
support
linear/bilinear
and
trilinear
hardware
interpolation
texture
specific
unit
intrinsic
interpolation
cudaFilterModePoint
cudaFilterModeLinear
31
Texture
Memory
!
Accesses
modes
-
clamp
and
wrap
memory
accessing
for
out-of-bound
addresses
wrap
boundary
texture
specific
unit
cudaAddressModeWrap
clamp
boundary
cudaAddressModeClamp
32
Texture
Memory
!
Bound
to
linear
memory
-
only
support
1-dimension
problems
-
only
get
the
benefits
from
texture
cache
-
not
support
addressing
modes
and
filtering
!
Bound
to
cuda
array
-
support
float
addressing
-
support
addressing
modes
-
support
hardware
interpolation
-
support
1/2/3-dimension
problems
33
Texture
Memory
!
Host
code
-
allocate
global
linear
memory
or
cuda
array
-
create
and
set
the
texture
reference
on
file
scope
-
bind
the
texture
reference
to
the
allocated
memory
-
unbind
the
texture
reference
to
free
cache
resource
!
Device
code
-
fetch
data
by
indicating
texture
reference
-
fetch
data
by
using
texture
fetch
function
34
Texture
Memory
!
Texture
memory
constrain
8192
32768
1024x128
(65536,32768)
(65536,65536)
(2048,2048,2048)
(4096,4096,4096)
35
Texture
Memory
!
Measuring
texture
cache
miss
or
hit
number
-
latest
visual
profiler
can
count
cache
miss
or
hit
-
need
device
compute
capability
higher
than
1.2
36
Texture
Memory
//declare texture reference at file scope: 1D, float elements, raw reads
texture<float,1,cudaReadModeElementType> texreference;

//host driver for the linear-memory texture example: fills a host array with
//random floats, binds the device copy to `texreference`, launches `kernel`
//(defined elsewhere in this file), and copies the result back.
int main(int argc,char** argv)
{
    int size=3200;
    float* harray;
    float* diarray;   //device input, bound to the texture
    float* doarray;   //device output

    //allocate host and device memory
    harray=(float*)malloc(sizeof(float)*size);
    cudaMalloc((void**)&diarray,sizeof(float)*size);
    cudaMalloc((void**)&doarray,sizeof(float)*size);

    //initialize host array with values in [0,1] before usage
    for(int loop=0;loop<size;loop++)
        harray[loop]=(float)rand()/(float)(RAND_MAX-1);

    //copy array from host to device memory
    cudaMemcpy(diarray,harray,sizeof(float)*size,cudaMemcpyHostToDevice);

    //bind texture reference with linear memory (offset 0)
    cudaBindTexture(0,texreference,diarray,sizeof(float)*size);

    //execute device kernel; ceil-div covers sizes not divisible by 64
    kernel<<<(int)ceil((float)size/64),64>>>(doarray,size);

    //unbind texture reference to free cache resource
    cudaUnbindTexture(texreference);

    //copy result array from device to host memory
    cudaMemcpy(harray,doarray,sizeof(float)*size,cudaMemcpyDeviceToHost);

    //free host and device memory
    free(harray);
    cudaFree(diarray);
    cudaFree(doarray);
    return 0;
}
39
Texture
Memory
//copy the texture bound to linear memory into doarray, one element/thread;
//expects a 1D launch covering at least `size` threads
__global__ void kernel(float* doarray,int size)
{
    int index;
    //calculate each thread global index
    index=blockIdx.x*blockDim.x+threadIdx.x;
    //guard the grid tail: the host launches ceil(size/64) blocks, so the
    //last block may have threads past the end of the array
    if(index<size)
        //fetch global memory through texture reference
        doarray[index]=tex1Dfetch(texreference,index);
}
40
Texture
Memory
__global__ void offsetCopy(float* idata,float* odata,int offset)
{
//compute each thread global index
int index=blockIdx.x*blockDim.x+threadIdx.x;
41
Texture
Memory
__global__ void offsetCopy(float* idata,float* odata,int offset)
{
//compute each thread global index
int index=blockIdx.x*blockDim.x+threadIdx.x;
42
Texture
Memory
#define size 3200

//declare texture reference at file scope: 2D, float elements, raw reads
texture<float,2,cudaReadModeElementType> texreference;

//host driver for the 2D cuda-array texture example: uploads a size x size
//random matrix into a cuda array, configures filter/address modes, binds the
//texture, and launches `kernel` (defined elsewhere in this file).
int main(int argc,char** argv)
{
    dim3 blocknum;
    dim3 blocksize;
    float* hmatrix;
    float* dmatrix;
    cudaArray* carray;
    cudaChannelFormatDesc channel;
    //byte count used for every matrix-sized allocation/copy below
    size_t bytes=sizeof(float)*size*size;

    //allocate host and device memory
    hmatrix=(float*)malloc(bytes);
    cudaMalloc((void**)&dmatrix,bytes);

    //initialize host matrix with values in [0,1] before usage
    for(int loop=0;loop<size*size;loop++)
        hmatrix[loop]=(float)rand()/(float)(RAND_MAX-1);

    //create channel to describe data type
    channel=cudaCreateChannelDesc<float>();

    //allocate device memory for cuda array
    cudaMallocArray(&carray,&channel,size,size);

    //copy matrix from host to device memory
    cudaMemcpyToArray(carray,0,0,hmatrix,bytes,cudaMemcpyHostToDevice);

    //set texture filter mode property
    //use cudaFilterModePoint or cudaFilterModeLinear
    texreference.filterMode=cudaFilterModePoint;

    //set texture address mode property
    //use cudaAddressModeClamp or cudaAddressModeWrap
    texreference.addressMode[0]=cudaAddressModeWrap;
    texreference.addressMode[1]=cudaAddressModeClamp;

    //bind texture reference with cuda array
    cudaBindTextureToArray(texreference,carray);

    //16x16 thread blocks; ceil-div grid covers the whole matrix
    blocksize.x=16;
    blocksize.y=16;
    blocknum.x=(int)ceil((float)size/16);
    blocknum.y=(int)ceil((float)size/16);

    //execute device kernel
    kernel<<<blocknum,blocksize>>>(dmatrix,size);

    //unbind texture reference to free cache resource
    cudaUnbindTexture(texreference);

    //copy result matrix from device to host memory
    cudaMemcpy(hmatrix,dmatrix,bytes,cudaMemcpyDeviceToHost);

    //free host and device memory
    free(hmatrix);
    cudaFree(dmatrix);
    cudaFreeArray(carray);
    return 0;
}
Texture
Memory
//copy a 2D texture (bound to a cuda array) into a row-major device matrix;
//expects a 2D launch where each thread owns one (col,row) element
__global__ void kernel(float* dmatrix,int size)
{
    //calculate this thread's global 2D coordinates
    int col=threadIdx.x+blockIdx.x*blockDim.x;
    int row=threadIdx.y+blockIdx.y*blockDim.y;
    //fetch cuda array through texture reference, store row-major
    dmatrix[row*size+col]=tex2D(texreference,col,row);
}
47
Texture
Memory
#define size 256

//declare texture reference at file scope: 3D, float elements, raw reads
texture<float,3,cudaReadModeElementType> texreference;

//host driver for the 3D cuda-array texture example: uploads a size^3 random
//volume via cudaMemcpy3D, binds the texture, and launches `kernel` (defined
//elsewhere in this file) over a 2D grid (the kernel loops over z itself).
int main(int argc,char** argv)
{
    dim3 blocknum;
    dim3 blocksize;
    float* hmatrix;
    float* dmatrix;
    cudaArray* cudaarray;
    cudaExtent volumesize;
    cudaChannelFormatDesc channel;
    cudaMemcpy3DParms copyparms={0};
    //byte count used for every volume-sized allocation/copy below
    size_t bytes=sizeof(float)*size*size*size;

    //allocate host and device memory
    hmatrix=(float*)malloc(bytes);
    cudaMalloc((void**)&dmatrix,bytes);

    //initialize host matrix with values in [0,1] before usage
    for(int loop=0;loop<size*size*size;loop++)
        hmatrix[loop]=(float)rand()/(float)(RAND_MAX-1);

    //set cuda array volume size
    volumesize=make_cudaExtent(size,size,size);

    //create channel to describe data type
    channel=cudaCreateChannelDesc<float>();

    //allocate device memory for cuda array
    cudaMalloc3DArray(&cudaarray,&channel,volumesize);

    //set cuda array copy parameters (pitched source describes the host rows)
    copyparms.extent=volumesize;
    copyparms.dstArray=cudaarray;
    copyparms.kind=cudaMemcpyHostToDevice;
    copyparms.srcPtr=
        make_cudaPitchedPtr((void*)hmatrix,sizeof(float)*size,size,size);
    cudaMemcpy3D(&copyparms);

    //set texture filter mode property
    //use cudaFilterModePoint or cudaFilterModeLinear
    texreference.filterMode=cudaFilterModePoint;

    //set texture address mode property
    //use cudaAddressModeClamp or cudaAddressModeWrap
    texreference.addressMode[0]=cudaAddressModeWrap;
    texreference.addressMode[1]=cudaAddressModeWrap;
    texreference.addressMode[2]=cudaAddressModeClamp;

    //bind texture reference with cuda array
    cudaBindTextureToArray(texreference,cudaarray,channel);

    //2D launch: each thread walks z inside the kernel, so blocksize.z stays 1
    //(a z-extent here would only repeat identical writes)
    blocksize.x=8;
    blocksize.y=8;
    blocksize.z=1;
    blocknum.x=(int)ceil((float)size/8);
    blocknum.y=(int)ceil((float)size/8);

    //execute device kernel
    kernel<<<blocknum,blocksize>>>(dmatrix,size);

    //unbind texture reference to free cache resource
    cudaUnbindTexture(texreference);

    //copy result matrix from device to host memory
    cudaMemcpy(hmatrix,dmatrix,bytes,cudaMemcpyDeviceToHost);

    //free host and device memory
    free(hmatrix);
    cudaFree(dmatrix);
    cudaFreeArray(cudaarray);
    return 0;
}
52
Texture
Memory
//copy a 3D texture into a linear volume; 2D launch, each thread handles one
//(x,y) column and walks the whole z axis
__global__ void kernel(float* dmatrix,int size)
{
    int loop;
    int xindex;
    int yindex;
    int zindex;
    //calculate each thread global index in the xy plane
    xindex=threadIdx.x+blockIdx.x*blockDim.x;
    yindex=threadIdx.y+blockIdx.y*blockDim.y;
    for(loop=0;loop<size;loop++)
    {
        zindex=loop;
        //fetch cuda array via texture reference, store z-major row-major
        dmatrix[zindex*size*size+yindex*size+xindex]=
            tex3D(texreference,xindex,yindex,zindex);
    }
}
53
Texture Memory
55
Texture Memory
intrinsic
interpolation
units
is
very
powerful
trilinear
interpolation
on
nearby
8
pixels
56
Texture Memory
1.891
0.198
-
9.5
texture/point
texture/linear
0.072
0.037
26.2
51.1
texture/linear/locality
texture/linear/locality/fast
math
0.012
0.011
157.5
171.9
57
Texture
Memory
!
CUDA
Array
is
reordered
to
something
like
space
filling
Z-order
-
software
driver
supports
reordering
data
-
hardware
supports
spatial
memory
layout
59
Texture
Memory
!
Texture
cache
cannot
detect
the
dirty
data
lazy
update
for
write-back
modified
by
other
threads
cache
perform
some
operations
on
cache
61
float array
Texture
Memory
!
Write
data
to
global
memory
directly
without
texture
cache
-
only
suitable
for
global
linear
memory
not
cuda
array
tex1Dfetch(texreference,index)
darray[index]=value;
cache
texture
cache
may
not
be
updated
device memory
62
float array
Texture Memory
64
Texture Memory
65
Texture Memory
66
Texture Memory
concurrent
execution
for
independent
units
streaming
processors
temp1=a/b+sin(c)
67
Texture
Memory
Memory
Location
Cache
Speed
Access
global
off-chip
no
hundreds
all threads
constant
off-chip
yes
one ~ hundreds
all threads
texture
off-chip
yes
one ~ hundreds
all threads
shared
on-chip
one
block threads
local
off-chip
no
very slow
single thread
register
on-chip
one
single thread
instruction
off-chip
yes
invisible
68
Texture Memory
Memory
Read/Write
Property
global
read/write
input or output
constant
read
no structure
texture
read
locality structure
shared
read/write
local
read/write
register
read/write
69
!
Reference
-
Mark
Harris
http://www.markmark.net/
-
Wei-Chao
Chen http://www.cs.unc.edu/~ciao/
-
Wen-Mei
Hwu
http://impact.crhc.illinois.edu/people/current/hwu.php
70