Académique Documents
Professionnel Documents
Culture Documents
GPU architecture
David Black-Schaffer
david.black-schaffer@it.uu.se
Room 1221
Data-parallel
OpenCL
Programming model
Memory model
Hello World
Answer: Triangles
Basic processing:
@ 60fps
Answer: Yes!
Answer: Maybe
Vectors
Texture lookups
Complex math
Function calls
Control flow
No loops
They're fast, but hard to program
They're incredibly fast and awesome
Answer: No
From http://www2.ati.com/developer/gdc/D3DTutorial10_Half-Life2_Shading.pdf
GPU Design
1) Process pixels in parallel
Data-parallel:
130W, 263mm2
32 GB/s BW, 106 GFLOPs (SP)
Big caches (8MB)
Out-of-order
0.8 GFLOPs/W
188W, 334mm2
154 GB/s BW, 2720 GFLOPs (SP)
Small caches (<1MB)
Hardware thread scheduling
14.5 GFLOPs/W
Great speedups
GPU Design
2) Focus on throughput, not latency
L2
BP
Great scalability
BP
L1
BP
Fixed-function Logic
Nvidia G80
I$
LM
I$
LM
I$
LM
I$
LM
I$
LM
I$
LM
I$
L2
L1
BP
L1
8*8 Wimpy GPU Cores: No caches, in-order, single-issue, single-precision
About 1 IPC per core, 64 IPC total @1.5GHz
Fixed-function Logic
LM
L1
Example GPUs
I$
L2
L2
LM
AMD 5870
Instructions
g= f+1
f=ld(
ld(e)
d= d+1
e=ld(
ld(d)
c= b+a
b= a+1
Instructions
Instructions
g= f+1
f=ld(
ld(e)
d= d+1
e=ld(
ld(d)
c= b+a
b= a+1
g= f+1
f=ld(
ld(e)
d= d+1
c= b+a
ld/st
Cycle 0
ld/st
b= a+1
e=ld(
ld(d)
Cycle 0
Instructions
Instructions
g= f+1
f=ld(
ld(e)
d= d+1
c= b+a
g= f+1
f=ld(
ld(e)
d= d+1
c= b+a
ld/st
ld/st
b= a+1
e=ld(
ld(d)
b= a+1
e=ld(
ld(d)
L1
Cache
Hit!
Cycle 0
Cycle 0
Instructions
Instructions
g= f+1
f=ld(
ld(e)
d= d+1
ld/st
c= b+a
e=ld(
ld(d)
g= f+1
f=ld(
ld(e)
d= d+1
L1
Cache
ld/st
c= b+a
e=ld(
ld(d)
b= a+1
b= a+1
Cycle 1
Cycle 1
Instructions
Instructions
g= f+1
f=ld(
ld(e)
d= d+1
g= f+1
ld/st
L1
Cache
ld/st
d= d+1
f=ld(
ld(e)
e=ld(
ld(d)
c= b+a
b= a+1
Cycle 1
L1
Cache
e=ld(
ld(d)
c= b+a
b= a+1
Cycle 2
L1
Cache
Instructions
Instructions
g= f+1
ld/st
g= f+1
L1
Cache
f=ld(
ld(e)
d= d+1
e=ld(
ld(d)
c= b+a
b= a+1
Miss!
d= d+1
e=ld(
ld(d)
c= b+a
b= a+1
Cycle 3
L1
Cache
Miss!
Hit!
Instructions
Instructions
L2
Cache
g= f+1
ld/st
L1
Cache
L2
Cache
g= f+1
f=ld(
ld(e)
ld/st
f=ld(
ld(e)
d= d+1
e=ld(
ld(d)
c= b+a
b= a+1
Cycle 23
ld/st
f=ld(
ld(e)
Cycle 3
L2
Cache
d= d+1
e=ld(
ld(d)
c= b+a
b= a+1
Cycle 24
L1
Cache
Instructions
Instructions
L2
Cache
g= f+1
L1
Cache
ld/st
L2
Cache
+
ld/st
L1
Cache
g= f+1
f=ld(
ld(e)
d= d+1
e=ld(
ld(d)
c= b+a
b= a+1
f=ld(
ld(e)
d= d+1
e=ld(
ld(d)
c= b+a
b= a+1
Cycle 25
Cycle 25
Instructions
Instructions
ld/st
g= f+1
f=ld(
ld(e)
d= d+1
e=ld(
ld(d)
c= b+a
b= a+1
g= f+1
f=ld(
ld(e)
d= d+1
e=ld(
ld(d)
c= b+a
b= a+1
Cycle 25
Cycle 0
ld/st
Instructions
Instructions
g= f+1
f=ld(
ld(e)
d= d+1
e=ld(
ld(d)
c= b+a
g= f+1
f=ld(
ld(e)
d= d+1
e=ld(
ld(d)
ld/st
b= a+1
ld/st
c= b+a
b= a+1
Cycle 0
Cycle 1
Instructions
Instructions
g= f+1
f=ld(
ld(e)
d= d+1
g= f+1
f=ld(
ld(e)
d= d+1
e=ld(
ld(d)
c= b+a
b= a+1
Solution: Give Up
ld/st
e=ld(
ld(d)
ld/st
e=ld(
ld(d)
Memory
c= b+a
b= a+1
Cycle 2
g= f+1
f=ld(
ld(e)
d= d+1
c= b+a
b= a+1
Cycle 2
Memory
Instructions
Instructions
g= f+1
f=ld(
ld(e)
d= d+1
e=ld(
ld(d)
c= b+a
g= f+1
f=ld(
ld(e)
d= d+1
e=ld(
ld(d)
g= f+1
f=ld(
ld(e)
d= d+1
ld/st
b= a+1
+
e=ld(
ld(d)
Memory
Memory
c= b+a
b= a+1
Cycle 4
Instructions
Instructions
g= f+1
f=ld(
ld(e)
d= d+1
g= f+1
f=ld(
ld(e)
d= d+1
e=ld(
ld(d)
c= b+a
b= a+1
g= f+1
f=ld(
ld(e)
d= d+1
ld/st
e=ld(
ld(d)
c= b+a
b= a+1
Cycle 5
e=ld(
ld(d)
b= a+1
Cycle 3
ld/st
c= b+a
c= b+a
b= a+1
g= f+1
f=ld(
ld(e)
d= d+1
+
e=ld(
ld(d)
g=
g= ff+1
+1
ff==ld(
ld
ld(
ld((ee))
d=
d= d+1
d+1
ld/st
ee==ld(
ld
ld(
ld((dd))
Memory
c=
c= bb+a
+a
bb== a+1
a+1
c= b+a
b= a+1
Cycle 5
Memory
Instructions
Instructions
g= f+1
f=ld(
ld(e)
d= d+1
e=ld(
ld(d)
c= b+a
g= f+1
f=ld(
ld(e)
d= d+1
e=ld(
ld(d)
c= b+a
g=
g= ff+1
+1
ff==ld(
ld
ld(
ld((ee))
d=
d= d+1
d+1
ld/st
+
ee==ld(
ld
ld(
ld((dd))
b= a+1
Memory
ld/st
b= a+1
g=
ff+1
g=
g=
f+1
+1
g=
fe
ff==
ld(
g=
((
f+1
)+1
ld
))
ld
ff=ld(
ld(
(e
ld
=
ld(
(e
ld
d=
f
d+1
=
ld(
(ee))
ld
d=
d+1
d=
d+1
d=
d= d+1
d+1
ee==ld(
((dd))
ld
ld
ee=ld(
ld(
(d)
ld
ld
e==ld(
ld(
ld((dd))
First load
ready!
Memory
c=
bb+a
c=
c=
b+a
c=
b+a
bb==
c=
a+1
b+a
+a
a+1
bb== a+1
b= a+1
a+1
c=
c= bb+a
+a
bb== a+1
a+1
Cycle 6
Cycle 102
Instructions
Instructions
g= f+1
f=ld(
ld(e)
d=
d+1
g=
ff+1
g=
+1
g=
fd
g=
f+1
+1
ef==
ld(
((
ld
ld(
g=
f)
+1
ld
ff==ld(
(e
e)
))
ld
ld(
(
ld
f
=
ld(
(e
ld
c=
bd+1
+a
d=
f
=
ld(
(ee))
ld
d=
d+1
d=
d+1
d=
d= d+1
d+1
ld/st
ee==ld(
((dd))
ld
ld
ee=ld(
ld(
(d)
ld
ld
e==ld(
ld(
ld((dd))
g= f+1
f=ld(
ld(e)
d= d+1
First load
ready!
bc=
= a+1
bb+a
c=
c=
b+a
c=
b+a
bb==
c=
a+1
b+a
+a
bb==a+1
a+1
a+1
b= a+1
Cycle 103
ld/st
e=ld(
ld(d)
Memory
c= b+a
b= a+1
Cycle 103
g= f+1
f=ld(
ld(e)
d=
d+1
g=
ff+1
g=
+1
g=
fd
g=
f+1
+1
ef==
ld(
((
ld
ld(
g=
f)
+1
ld
ff==ld(
(e
e)
))
ld
ld(
(
ld
f
=
ld(
(e
ld
c=
bd+1
+a
d=
f
=
ld(
(ee))
ld
d=
d+1
d=
d+1
d=
d= d+1
d+1
ee==ld(
((dd))
ld
ld
ee=ld(
ld(
(d)
ld
ld
e==ld(
ld(
ld((dd))
bc=
= a+1
bb+a
c=
c=
b+a
c=
b+a
bb==
c=
a+1
b+a
+a
bb==a+1
a+1
a+1
b= a+1
First load
ready!
Memory
Instructions
g= f+1
f=ld(
ld(e)
g= f+1
f=ld(
ld(e)
d=
d+1
g=
ff+1
g=
+1
g=
fd
g=
f+1
ef==
ld(
((
ld
ld(
g=
f)+1
)+1
ld
ff==ld(
(e
))
ld
ld(
(e
ld
f
=
ld(
(e
ld
c=
b
+a
d=
f
d+1
=
ld(
(ee))
ld
d=
d+1
d=
d+1
d=
d= d+1
d+1
ld/st
ee==ld(
((dd))
ld
ld
ee=ld(
ld(
(d)
ld
ld
e==ld(
ld(
ld((dd))
d= d+1
Cycle 104
I$
ee==ld(
((dd))
ld
ld
ee=ld(
ld(
(d)
ld
ld
e==ld(
ld(
ld((dd))
bc=
= a+1
bb+a
c=
c=
b+a
c=
b+a
bb==
c=
a+1
b+a
+a
a+1
bb== a+1
b= a+1
a+1
Notes:
GPUs have caches for textures
GPUs will soon have data caches
g= f+1
f=ld(
ld(e)
d=
d+1
g=
ff+1
g=
+1
g=
fd
g=
f+1
ef==
ld(
((
ld
ld(
g=
f)+1
)+1
ld
ff==ld(
(e
))
ld
ld(
(e
ld
f
=
ld(
(e
ld
c=
b
+a
d=
f
d+1
=
ld(
(ee))
ld
d=
d+1
d=
d+1
d=
d= d+1
d+1
Memory
bc=
= a+1
bb+a
c=
c=
b+a
c=
b+a
bb==
c=
a+1
b+a
+a
a+1
bb== a+1
b= a+1
a+1
e=ld(
ld(d)
c= b+a
b= a+1
Divergent Execution
Thread
Instructions
1
2
if
3
el
4
5
6
if (
()
do 3
else
do 4
thread
t0 t1 t2 t3 t4 t5 t6 t7
Cycle 0
Cycle 1
Cycle 2
Cycle 3
Cycle 4
Cycle 5
Cycle 6
Cycle 7
Cycle 8
Fetch:
Fetch:
Fetch:
Fetch:
Fetch:
Fetch:
Fetch:
Fetch:
Fetch:
1
2
if
3
el
5
4
6
5
1
2
if
3
1
2
if
3
1
2
if
3
1
2
if
3
1
2
if
3
1
2
if
3
1 1
2 2
if if
t7 stalls
3
el
5 5 5 5 5 5 5
4
6 6 6 6 6 6 6
5
Instruction Divergence
AMD's GPUs are 4-way SIMD
If you don't process 4-wide vectors you lose.
Intel's Larrabee is (was?) 16-way SIMD
Theoretically the compiler can handle this.
In general:
OpenCL
What is OpenCL?
Low-level language for high-performance
heterogeneous data-parallel computation.
Demo
CPUs
GPUs
Accelerators (e.g., CELL)
Based on C99
Portable across devices
Vector intrinsics and math libraries
Guaranteed precision for operations
Open standard
Computational Intensity
A[i] = B[i]
B[i] + C[i]
A[i] = B[i]
B[i] + C[i] * D[i]
A[i]++
1:3
2:4
1:2
X:1
Data-Parallelism
Single Precision
32 bits should be enough for anything
Single Precision
*Performance may fall off a cliff if not exactly the same.
Double Precision
Local Dimensions
Examples
1k audio:
HD video:
3D MRI:
HD per line:
HD per 8x8 block:
1024
1920x1080
256x256x256
1080
240x135
1024 work-items
2M work-items
16M work-items
1080 work-items
32k work-items
Global domain:
Work-group size:
Synchronization Example:
Reduction
20x20
4x4
No Synchronization.
Different work-groups
Input Data
1st Reduction
+
3
2nd Reduction
11
+
10
3rd Reduction
15
26
12
03
Need a barrier to prevent
thread 0 from continuing
before thread 1 is done.
17
49
15
53
+
11
+
18
36
30
4th Reduction
Synchronization Example:
Reduction
11
18
30
Input Data
12
36
67
26
10
11
Work-group size = 4
1
66
Synchronization Example:
Reduction
Input Data
Thread
Assignment
03
17
53
11
18
30
0
66
6
67
12
36
26
Invalid Synchronization
49
10
15
0
66
11
Work-group size = 4
Thread
Assignment
Global dimensions
Natural division for the problem
Too few: no latency hiding
Too many: (too little work each) too much overhead
In general:
Expensive
Cheap
GPU: >2000
CPU: ~2*#CPU cores
Local dimensions
May be determined by the algorithm
Optimize for best processor utilization
(hardware-specific)
Device
Private
work
item
Private
work
item
Device
Private
Private
work
item
work
item
Compute unit
Compute unit
Local
Local
Global Memory
Host
Host Memory
Registers
Private
work
item
16-32kB
10x Global BW
0.25-4GB
Private
work
item
Private
work
item
Compute unit
Compute
ute unit
u
Local
L
Local
Lo
ocal
al
50-200GB/s
PCIe (slow)
1-16GB
work
item
Private
Host
data
~5GB/s
Host Memory
Moving Data
Device
Private
Private
work
item
work
item
work
item
Private
Compute unit
Local
Private
work
item
m
Compute
unit
Com
Co
m
~1000GB/s
L
Local
Loc
oc
occa
al
50-200GB/s
But
~5GB/s
H
Host
Hos
osst
o
st
data
Host Memory
Memory
Objects
float4[]
float4[]
float4[]
Devices
Queue
Queue
Queue
Host
Queues
Devices
Contexts
Your OpenCL
Computation
Your
Application
Submit (enqueue) work to devices
Notes:
OpenCL Kernels
An OpenCL Program
1.
2.
3.
Regular C:
void calcSin(float *data) {
for (int id=0; id<1023; id++)
data[id] = sin(data[id]);
}
4.
5.
6.
7.
OpenCL Kernel:
/* OpenCL version of calcSin: the C loop is replaced by the NDRange —
 * each work-item processes exactly one element, selected by its global ID.
 * NOTE(review): the conventional OpenCL spelling is "kernel void";
 * the slide writes "void kernel" — confirm the target OpenCL compiler
 * accepts this qualifier ordering. */
void kernel calcSin(global float *data) {
int id = get_global_id(0); /* this work-item's index into data */
data[id] = sin(data[id]); /* same math as the loop body in the C version */
}
8.
9.
context =
clCreateContext(NULL, 1, &device,
NULL, NULL, NULL);
queue =
clCreateCommandQueue(context, device,
(cl_command_queue_properties)0, NULL);
char *source = {
"kernel calcSin(global float *data) {\n"
"  int id = get_global_id(0);\n"
"  data[id] = sin(data[id]);\n"
"}\n"};
clCreateProgramWithSource(context, 1,
(const char**)&source, NULL, NULL);
clBuildProgram(program, 0,
NULL, NULL, NULL, NULL);
kernel =
buffer =
clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
sizeof(cl_float)*10240,
data, NULL);
More OpenCL
Querying Devices
Images
Events
Querying Devices
CL_DEVICE_MAX_COMPUTE_UNITS*
CL_DEVICE_MAX_COMPUTE_UNITS*
Number of compute units that can run work-groups in parallel
CL_DEVICE_MAX_CLOCK_FREQUENCY*
CL_DEVICE_MAX_CLOCK_FREQUENCY*
CL_DEVICE_GLOBAL_MEM_SIZE*
CL_DEVICE_GLOBAL_MEM_SIZE*
Total global memory available on the device
CL_DEVICE_IMAGE_SUPPORT
Some GPUs don't support images today
CL_DEVICE_EXTENSIONS
double precision, atomic operations, OpenGL integration
Images
Why?
But
But
Events
Implication:
Events
clEnqueueNDRangeKernel(queue, kernel,
1, NULL, global_dimensions, NULL,
numberOfEventsInList, &waitList,
eventReturned);
Event Example
clEnqueueNDRangeKernel(CPU_queue, kernelA,
1, NULL, global_dimensions, NULL,
0, NULL, kernelA_event);
clEnqueueNDRangeKernel(GPU_queue, kernelB,
1, NULL, global_dimensions, NULL,
1, &kernelA_event, NULL);
Performance Optimizations
Math (2x on intensive workloads)
printf()
Approaches
Data-parallel
GPU Projects
Thread-parallel
Reduction
Simplest mapping
Just need right compute-to-memory ratio
Scan-based
Scan Algorithms
Simple Scan
http://mgarland.org/files/papers/nvr-2008-003.pdf
http://developer.download.nvidia.com/compute/cuda/sdk/website/projects/scan/doc/scan.pdf
Project Ideas
Local memory
Global memory