I’m using CUDA 0.8.
This program is garbage, it just demonstrates the problem.
#include <stdio.h>
#include <stdlib.h>
#define N 1000003
__global__ static void shuffle(int *dst,int *src,int *ps)
{
extern __shared__ int shared[];
int volatile *sh=shared;
int id=threadIdx.x;
int v=(id&31);
if(v==0)
sh[id]=src[id];
__syncthreads();
int n=sh[id];
__syncthreads();
sh[id]=dst[id]*12345;
__syncthreads();
int a=dst[9],b=dst[8];
l2: if(dst[0]>0)goto l0;
a++;
dst[n+7]=a+b-n;
l1: if(dst[1]>0)goto l1;
b++;
dst[n+11]=a+b-n;
l4: if(dst[2]>0)goto l2;
b++;
dst[n+23]=a+b-n;
l3: if(dst[3]>0)goto l3;
a++;
dst[n+69]=a+b-n;
l0: if(dst[4]>0)goto l4;
b++;
dst[n]=a+b-n;
}
int main()
{
int *a,*b,*c;
int *da,*db,*dc;
cudaMalloc((void**)&da,sizeof(int)*N);a=(int*)malloc(sizeof(int)*N);
cudaMalloc((void**)&db,sizeof(int)*N);b=(int*)malloc(sizeof(int)*N);
cudaMalloc((void**)&dc,sizeof(int)*N);c=(int*)malloc(sizeof(int)*N);
int p=1237;
for(int i=0;i<N;i++)
b[i]=i;
for(int i=0;i<N;i++)
c[i]=(i*p)%N;
cudaMemcpy(db,b,sizeof(int)*N,cudaMemcpyHostToDevice);
cudaMemcpy(dc,c,sizeof(int)*N,cudaMemcpyHostToDevice);
shuffle<<<N/512+1,512,4096>>>(da,db,dc);
cudaMemcpy(a,da,sizeof(int)*N,cudaMemcpyDeviceToHost);
cudaFree(dc);
cudaFree(db);
cudaFree(da);
for(int i=0;i<N;i++)
if(a[c[i]]!=i)printf("FAIL! %d for %d\n",a[c[i]],i);
return 0;
}
Generated ptx:
# c:\cuda\bin/../open64/lib//be.exe::1.10
#-----------------------------------------------------------
# Compiling c:/windows/temp/tmp_000014d8-1.i (E:/DOCUME~1/f-qmhou/LOCALS~1/Temp/ccBI#.a04340)
#-----------------------------------------------------------
#-----------------------------------------------------------
# Options:
#-----------------------------------------------------------
# Target:PROCESSOR_8, ISA:PTX1, Endian:little, Pointer Size:32
# -O2 (Optimization level)
# -g0 (Debug level)
# -m2 (Report advisories)
#-----------------------------------------------------------
# # .file 1 "c:/windows/temp/tmp_000014d8-5.gpu"
# # .file 2 "D:\f-qmhou\tcuda/E:\Program Files\Microsoft Visual Studio 8\VC\INCLUDE\crtdefs.h"
# # .file 3 "c:\cuda\bin/../include/crt/device_runtime.h"
# # .file 4 "c:\cuda\bin/../include/host_defines.h"
# # .file 5 "c:\cuda\bin/../include/builtin_types.h"
# # .file 6 "c:\cuda\bin/../include/driver_types.h"
# # .file 7 "c:\cuda\bin/../include/texture_types.h"
# # .file 8 "c:\cuda\bin/../include/vector_types.h"
# # .file 9 "c:\cuda\bin/../include/device_launch_parameters.h"
# # .file 10 "D:\f-qmhou\tcuda/c.cu"
# # .file 11 "c:\cuda\bin/../include/common_functions.h"
# # .file 12 "c:\cuda\bin/../include/crt/func_macro.h"
# # .file 13 "c:\cuda\bin/../include/math_functions.h"
# # .file 14 "c:\cuda\bin/../include/device_functions.h"
# # .file 15 "c:\cuda\bin/../include/math_constants.h"
# # .file 16 "c:\cuda\bin/../include/texture_fetch_functions.h"
# # .file 17 "c:\cuda\bin/../include/math_functions_dbl_ptx1.h"
.extern .shared .align 4 .b8 shared[];
.const .align 4 .b8 __cudart_A1[128] = {0,0,128,63,135,205,130,63,195,170,133,63,15,152,136,63,194,149,139,63,58,164,142,63,211,195,145,63,240,244,148,63,240,55,152,63,58,141,155,63,50,245,158,63,67,112,162,63,215,254,165,63,91,161,169,63,63,88,173,63,246,35,177,63,243,4,181,63,175,251,184,63,164,8,189,63,77,44,193,63,42,103,197,63,190,185,201,63,140,36,206,63,30,168,210,63,253,68,215,63,184,251,219,63,223,204,224,63,7,185,229,63,199,192,234,63,186,228,239,63,125,37,245,63,179,131,250,63};
.const .align 4 .b8 __cudart_A2[128] = {0,0,0,0,169,167,78,179,145,152,79,51,75,218,126,179,161,171,96,178,101,116,105,179,36,86,103,51,18,2,46,179,113,27,35,51,99,85,12,179,66,35,65,51,90,18,195,48,94,157,44,179,8,43,22,179,246,234,45,178,168,90,124,179,122,231,207,50,247,197,14,51,232,79,65,179,62,102,214,178,55,168,10,50,162,35,115,179,36,252,40,50,170,29,92,179,138,165,212,178,28,74,80,179,89,171,30,178,230,27,68,178,222,22,65,178,74,70,72,179,54,36,41,50,88,55,146,178};
.const .align 4 .b8 __cudart_Ainv[128] = {0,0,128,63,179,131,122,63,125,37,117,63,186,228,111,63,199,192,106,63,7,185,101,63,223,204,96,63,184,251,91,63,253,68,87,63,30,168,82,63,140,36,78,63,190,185,73,63,42,103,69,63,77,44,65,63,164,8,61,63,175,251,56,63,243,4,53,63,246,35,49,63,63,88,45,63,91,161,41,63,215,254,37,63,67,112,34,63,50,245,30,63,58,141,27,63,240,55,24,63,240,244,20,63,211,195,17,63,58,164,14,63,194,149,11,63,15,152,8,63,195,170,5,63,135,205,2,63};
.entry shuffle
{
.reg .u32 $r1,$r2,$r3,$r4,$r5,$r6,$r7,$r8,$r9,
$r10,$r11,$r12,$r13,$r14,$r15,$r16,$r17,$r18,$r19,
$r20,$r21,$r22,$r23,$r24,$r25,$r26,$r27,$r28,$r29,
$r30,$r31,$r32,$r33,$r34,$r35;
.reg .pred $p0,$p1,$p2,$p3,$p4;
.param .u32 %parm_dst;
.param .u32 %parm_src;
.param .u32 %parm_ps;
# # .file 10 "D:\f-qmhou\tcuda/c.cu"
# .loc 10 7 0
# 3 #include <windows.h>
# 4
# 5 #define N 1000003
# 6
# 7 __global__ static void shuffle(int *dst,int *src,int *ps)
$LBB1_shuffle:
cvt.u32.u16 $r1, %tid.x; #
mul.lo.u32 $r2, $r1, 4; #
mov.u32 $r3, (&shared); #
add.u32 $r4, $r2, $r3; #
and.s32 $r5, $r1, 31; #
mov.s32 $r6, 0; #
setp.ne.s32 $p1, $r5, $r6; #
@$p1 bra $Lt_0_18; #
$LBB2_shuffle:
# .loc 10 16 0
# 12
# 13 int id=threadIdx.x;
# 14 int v=(id&31);
# 15 if(v==0)
# 16 sh[id]=src[id];
ld.param.u32 $r7, %parm_src; # id:84 %parm_src+0x0
add.u32 $r8, $r7, $r2; #
ld.global.s32 $r9, [$r8+0]; # id:85
st.shared.s32 [$r4+0], $r9; # id:86 shared+0x0
$Lt_0_18:
# .loc 10 17 0
# 17 __syncthreads();
bar.wait 0; #
# .loc 10 18 0
# 18 int n=sh[id];
ld.shared.s32 $r10, [$r4+0]; # id:87 shared+0x0
# .loc 10 19 0
# 19 __syncthreads();
bar.wait 0; #
# .loc 10 20 0
# 20 sh[id]=dst[id]*12345;
ld.param.u32 $r11, %parm_dst; # id:88 %parm_dst+0x0
add.u32 $r12, $r2, $r11; #
ld.global.s32 $r13, [$r12+0]; # id:89
mul.lo.s32 $r14, $r13, 12345; #
st.shared.s32 [$r4+0], $r14; # id:90 shared+0x0
# .loc 10 21 0
# 21 __syncthreads();
bar.wait 0; #
# .loc 10 22 0
# 22 int a=dst[9],b=dst[8];
ld.global.s32 $r15, [$r11+36]; # id:91
ld.global.s32 $r16, [$r11+32]; # id:92
mul.lo.u32 $r17, $r10, 4; #
add.u32 $r18, $r17, $r11; #
$Lt_0_1:
#<loop> Loop body line 23
# .loc 10 23 0
# 23 l2: if(dst[0]>0)goto l0;
ld.global.s32 $r19, [$r11+0]; # id:93
mov.s32 $r20, 0; #
setp.le.s32 $p2, $r19, $r20; #
@$p2 bra $Lt_0_20; #
$LBB5_shuffle:
#<loop> Part of loop body line 23, head labeled $Lt_0_1
bra.uni $Lt_0_2; #
$Lt_0_20:
#<loop> Part of loop body line 23, head labeled $Lt_0_1
# .loc 10 24 0
# 24 a++;
add.s32 $r15, $r15, 1; #
# .loc 10 25 0
# 25 dst[n+7]=a+b-n;
add.s32 $r21, $r15, $r16; #
# .loc 10 18 0
ld.shared.s32 $r10, [$r4+0]; #
# .loc 10 25 0
sub.s32 $r22, $r21, $r10; #
st.global.s32 [$r18+28], $r22; # id:94
# .loc 10 27 0
# 26 l1: if(dst[1]>0)goto l1;
# 27 b++;
add.s32 $r16, $r16, 1; #
# .loc 10 28 0
# 28 dst[n+11]=a+b-n;
add.s32 $r23, $r15, $r16; #
sub.s32 $r24, $r23, $r10; #
st.global.s32 [$r18+44], $r24; # id:95
$Lt_0_4:
#<loop> Part of loop body line 23, head labeled $Lt_0_1
# .loc 10 29 0
# 29 l4: if(dst[2]>0)goto l2;
ld.global.s32 $r25, [$r11+8]; # id:96
mov.s32 $r26, 0; #
setp.le.s32 $p3, $r25, $r26; #
@$p3 bra $Lt_0_24; #
$LBB8_shuffle:
#<loop> Part of loop body line 23, head labeled $Lt_0_1
bra.uni $Lt_0_1; #
$Lt_0_24:
#<loop> Part of loop body line 23, head labeled $Lt_0_1
# .loc 10 30 0
# 30 b++;
add.s32 $r16, $r16, 1; #
# .loc 10 31 0
# 31 dst[n+23]=a+b-n;
add.s32 $r27, $r15, $r16; #
# .loc 10 18 0
ld.shared.s32 $r10, [$r4+0]; #
# .loc 10 31 0
sub.s32 $r28, $r27, $r10; #
st.global.s32 [$r18+92], $r28; # id:97
# .loc 10 33 0
# 32 l3: if(dst[3]>0)goto l3;
# 33 a++;
add.s32 $r15, $r15, 1; #
# .loc 10 34 0
# 34 dst[n+69]=a+b-n;
add.s32 $r29, $r15, $r16; #
sub.s32 $r30, $r29, $r10; #
st.global.s32 [$r18+276], $r30; # id:98
$Lt_0_2:
#<loop> Part of loop body line 23, head labeled $Lt_0_1
# .loc 10 35 0
# 35 l0: if(dst[4]>0)goto l4;
ld.global.s32 $r31, [$r11+16]; # id:99
mov.s32 $r32, 0; #
setp.le.s32 $p4, $r31, $r32; #
@$p4 bra $Lt_0_28; #
$LBB11_shuffle:
#<loop> Part of loop body line 23, head labeled $Lt_0_1
bra.uni $Lt_0_4; #
$Lt_0_28:
# .loc 10 37 0
# 36 b++;
# 37 dst[n]=a+b-n;
add.s32 $r33, $r15, $r16; #
# .loc 10 18 0
ld.shared.s32 $r10, [$r4+0]; #
# .loc 10 37 0
sub.s32 $r34, $r33, $r10; #
add.s32 $r35, $r34, 1; #
st.global.s32 [$r18+0], $r35; # id:100
exit; #
} # shuffle
.version 1.1
The problem is in variable n.
In some later accesses to n, a ld.shared.s32 $r10, [$r4+0]; is generated before the instruction concerning n. However, it’s totally useless (since n is guaranteed to be loaded by then), and WRONG (since shared memory is cleared after n is loaded the first time).