Académique Documents
Professionnel Documents
Culture Documents
Usable assembly language for GPUs:
a success story
Daniel J. Bernstein, UIC
Hsieh-Chung Chen, Harvard
Chen-Mou Cheng, NTU
Tanja Lange, Eindhoven
Ruben Niederhagen, E+AS
Peter Schwabe, Academia Sinica
Bo-Yin Yang, Academia Sinica
Instruction selection:
e.g., compiler doesn't see
how to use vector instructions.
Instruction scheduling.
Register allocation.
$x, y \in \mathbf{F}_{2^{131}}$;
$x$ has even Hamming weight $w$;
$j = 3 + ((w/2) \bmod 8)$;
$\lambda = (y + y^{2^j}) / (x + x^{2^j})$;
$x' = \lambda^2 + \lambda + x + x^{2^j}$;
$y' = \lambda (x + x') + x' + y$.
C/C++/CUDA:
z2 = x2 ^ y2;
PTX:
xor.b32 %r24, %r22, %r23;
cudasm:
xor.b32 $r2, $r3, $r2
qhasm-cudasm:
z2 = x2 ^ y2
low32 threadinfo
input threadinfo
enter Z9kerneladdPjPKjS_
low32 tid
low32 x
low32 y
low32 t
low32 tstart
low32 tend
low32 ttid
low32 tselected
low32 now
tselected = 0
low32 j
low32 twenty
cond testloop
tid = 65535 & threadinfo
cond tid12
tid12 tid - c[0]
low32 tid4
tid4 = tid << 2
offset tid4off
tid4off = tid << 2
low32 batchshift
batchshift = blockindex
batchshift int24*= 33536
x = parameters[0]
y = parameters[1]
t = parameters[2]
x += batchshift
y += batchshift
low32 0x0
low32 0x1
low32 0x2
low32 0x3
low32 0x4
low32 0pos
syncthreads
new 0x4
0pos = tid4 + x
0x0 = g[0pos]
0pos += 512
0x1 = g[0pos]
0pos += 512
0x2 = g[0pos]
0pos += 512
0x3 = g[0pos]
0pos += 512
0x4 = g[0pos] if tid12 signed<
s[tid4off + 512] = 0x0
s[tid4off + 1024] = 0x1
1pos = tid4 + y
1x0 = g[1pos]
1pos += 512
1x1 = g[1pos]
1pos += 512
1x2 = g[1pos]
1pos += 512
1x3 = g[1pos]
1pos += 512
1x4 = g[1pos] if tid12 signed<
s[tid4off + 2608] = 1x0
s[tid4off + 3120] = 1x1
s[tid4off + 3632] = 1x2
s[tid4off + 4144] = 1x3
low32 0y2
low32 0y3
low32 0y4
new 2x4
new 0y4
2x0 = s[tid4off + 512]
2x1 = s[tid4off + 1024]
2x2 = s[tid4off + 1536]
2x3 = s[tid4off + 2048]
tend -= tstart
tend <<= 1
low32 3x0
low32 3x1
low32 3x2
low32 3x3
low32 3x4
low32 2pos
syncthreads
new 3x4
3x0 = s[tid4off + 512]
3x1 = s[tid4off + 1024]