Erronous ptx generated

I’m using CUDA 0.8.

This program is garbage, it just demonstrates the problem.

#include <stdio.h>

#include <stdlib.h>

#define N 1000003

__global__ static void shuffle(int *dst,int *src,int *ps)

{

	extern __shared__ int shared[];

	int volatile *sh=shared;

	int id=threadIdx.x;

	int v=(id&31);

	if(v==0)

  sh[id]=src[id];

	__syncthreads();

	int n=sh[id];

	__syncthreads();

	sh[id]=dst[id]*12345;

	__syncthreads();

	int a=dst[9],b=dst[8];

l2:	if(dst[0]>0)goto l0;

	a++;

	dst[n+7]=a+b-n;

l1:	if(dst[1]>0)goto l1;

	b++;

	dst[n+11]=a+b-n;

l4:	if(dst[2]>0)goto l2;

	b++;

	dst[n+23]=a+b-n;

l3:	if(dst[3]>0)goto l3;

	a++;

	dst[n+69]=a+b-n;

l0:	if(dst[4]>0)goto l4;

	b++;

	dst[n]=a+b-n;

}

int main()

{

	int *a,*b,*c;

	int *da,*db,*dc;

   cudaMalloc((void**)&da,sizeof(int)*N);a=(int*)malloc(sizeof(int)*N);

    cudaMalloc((void**)&db,sizeof(int)*N);b=(int*)malloc(sizeof(int)*N);

    cudaMalloc((void**)&dc,sizeof(int)*N);c=(int*)malloc(sizeof(int)*N);

	int p=1237;

	for(int i=0;i<N;i++)

  b[i]=i;

	for(int i=0;i<N;i++)

  c[i]=(i*p)%N;

   cudaMemcpy(db,b,sizeof(int)*N,cudaMemcpyHostToDevice);

    cudaMemcpy(dc,c,sizeof(int)*N,cudaMemcpyHostToDevice);

	shuffle<<<N/512+1,512,4096>>>(da,db,dc);

    cudaMemcpy(a,da,sizeof(int)*N,cudaMemcpyDeviceToHost);

   cudaFree(dc);

    cudaFree(db);

    cudaFree(da);

    for(int i=0;i<N;i++)

    	if(a[c[i]]!=i)printf("FAIL! %d for %d\n",a[c[i]],i);

	return 0;

}

Generated ptx:

#  c:\cuda\bin/../open64/lib//be.exe::1.10

	#-----------------------------------------------------------

	# Compiling c:/windows/temp/tmp_000014d8-1.i (E:/DOCUME~1/f-qmhou/LOCALS~1/Temp/ccBI#.a04340)

	#-----------------------------------------------------------

	#-----------------------------------------------------------

	# Options:

	#-----------------------------------------------------------

	#  Target:PROCESSOR_8, ISA:PTX1, Endian:little, Pointer Size:32

	#  -O2	(Optimization level)

	#  -g0	(Debug level)

	#  -m2	(Report advisories)

	#-----------------------------------------------------------

#	# .file	1	"c:/windows/temp/tmp_000014d8-5.gpu"

 #	# .file	2	"D:\f-qmhou\tcuda/E:\Program Files\Microsoft Visual Studio 8\VC\INCLUDE\crtdefs.h"

 #	# .file	3	"c:\cuda\bin/../include/crt/device_runtime.h"

 #	# .file	4	"c:\cuda\bin/../include/host_defines.h"

 #	# .file	5	"c:\cuda\bin/../include/builtin_types.h"

 #	# .file	6	"c:\cuda\bin/../include/driver_types.h"

 #	# .file	7	"c:\cuda\bin/../include/texture_types.h"

 #	# .file	8	"c:\cuda\bin/../include/vector_types.h"

 #	# .file	9	"c:\cuda\bin/../include/device_launch_parameters.h"

 #	# .file	10	"D:\f-qmhou\tcuda/c.cu"

 #	# .file	11	"c:\cuda\bin/../include/common_functions.h"

 #	# .file	12	"c:\cuda\bin/../include/crt/func_macro.h"

 #	# .file	13	"c:\cuda\bin/../include/math_functions.h"

 #	# .file	14	"c:\cuda\bin/../include/device_functions.h"

 #	# .file	15	"c:\cuda\bin/../include/math_constants.h"

 #	# .file	16	"c:\cuda\bin/../include/texture_fetch_functions.h"

 #	# .file	17	"c:\cuda\bin/../include/math_functions_dbl_ptx1.h"

	.extern	.shared .align 4 .b8 shared[];

	.const .align 4 .b8 __cudart_A1[128] = {0,0,128,63,135,205,130,63,195,170,133,63,15,152,136,63,194,149,139,63,58,164,142,63,211,195,145,63,240,244,148,63,240,55,152,63,58,141,155,63,50,245,158,63,67,112,162,63,215,254,165,63,91,161,169,63,63,88,173,63,246,35,177,63,243,4,181,63,175,251,184,63,164,8,189,63,77,44,193,63,42,103,197,63,190,185,201,63,140,36,206,63,30,168,210,63,253,68,215,63,184,251,219,63,223,204,224,63,7,185,229,63,199,192,234,63,186,228,239,63,125,37,245,63,179,131,250,63};

	.const .align 4 .b8 __cudart_A2[128] = {0,0,0,0,169,167,78,179,145,152,79,51,75,218,126,179,161,171,96,178,101,116,105,179,36,86,103,51,18,2,46,179,113,27,35,51,99,85,12,179,66,35,65,51,90,18,195,48,94,157,44,179,8,43,22,179,246,234,45,178,168,90,124,179,122,231,207,50,247,197,14,51,232,79,65,179,62,102,214,178,55,168,10,50,162,35,115,179,36,252,40,50,170,29,92,179,138,165,212,178,28,74,80,179,89,171,30,178,230,27,68,178,222,22,65,178,74,70,72,179,54,36,41,50,88,55,146,178};

	.const .align 4 .b8 __cudart_Ainv[128] = {0,0,128,63,179,131,122,63,125,37,117,63,186,228,111,63,199,192,106,63,7,185,101,63,223,204,96,63,184,251,91,63,253,68,87,63,30,168,82,63,140,36,78,63,190,185,73,63,42,103,69,63,77,44,65,63,164,8,61,63,175,251,56,63,243,4,53,63,246,35,49,63,63,88,45,63,91,161,41,63,215,254,37,63,67,112,34,63,50,245,30,63,58,141,27,63,240,55,24,63,240,244,20,63,211,195,17,63,58,164,14,63,194,149,11,63,15,152,8,63,195,170,5,63,135,205,2,63};

	.entry shuffle

	{

	.reg .u32 $r1,$r2,$r3,$r4,$r5,$r6,$r7,$r8,$r9,

  $r10,$r11,$r12,$r13,$r14,$r15,$r16,$r17,$r18,$r19,

  $r20,$r21,$r22,$r23,$r24,$r25,$r26,$r27,$r28,$r29,

  $r30,$r31,$r32,$r33,$r34,$r35;

	.reg .pred $p0,$p1,$p2,$p3,$p4;

	.param .u32 %parm_dst;

	.param .u32 %parm_src;

	.param .u32 %parm_ps;

 #  # .file	10	"D:\f-qmhou\tcuda/c.cu"

 #  .loc	10	7	0

 #   3  #include <windows.h>

 #   4  

 #   5  #define N 1000003

 #   6  

 #   7  __global__ static void shuffle(int *dst,int *src,int *ps)

$LBB1_shuffle:

	cvt.u32.u16  $r1, %tid.x;      #  

	mul.lo.u32  $r2, $r1, 4;      	#  

	mov.u32  $r3, (&shared);      	#  

	add.u32  $r4, $r2, $r3;        #  

	and.s32  $r5, $r1, 31;        	#  

	mov.s32  $r6, 0;              	#  

	setp.ne.s32  $p1, $r5, $r6;    #  

	@$p1 bra  $Lt_0_18;            #  

$LBB2_shuffle:

 #  .loc	10	16	0

 #  12  

 #  13  	int id=threadIdx.x;

 #  14  	int v=(id&31);

 #  15  	if(v==0)

 #  16    sh[id]=src[id];

	ld.param.u32  $r7, %parm_src;  #  id:84 %parm_src+0x0

	add.u32  $r8, $r7, $r2;        #  

	ld.global.s32  $r9, [$r8+0];  	#  id:85

	st.shared.s32  [$r4+0], $r9;  	#  id:86 shared+0x0

$Lt_0_18:

 #  .loc	10	17	0

 #  17  	__syncthreads();

	bar.wait  0;                  	#  

 #  .loc	10	18	0

 #  18  	int n=sh[id];

	ld.shared.s32  $r10, [$r4+0];  #  id:87 shared+0x0

 #  .loc	10	19	0

 #  19  	__syncthreads();

	bar.wait  0;                  	#  

 #  .loc	10	20	0

 #  20  	sh[id]=dst[id]*12345;

	ld.param.u32  $r11, %parm_dst;	#  id:88 %parm_dst+0x0

	add.u32  $r12, $r2, $r11;      #  

	ld.global.s32  $r13, [$r12+0];	#  id:89

	mul.lo.s32  $r14, $r13, 12345;	#  

	st.shared.s32  [$r4+0], $r14;  #  id:90 shared+0x0

 #  .loc	10	21	0

 #  21  	__syncthreads();

	bar.wait  0;                  	#  

 #  .loc	10	22	0

 #  22  	int a=dst[9],b=dst[8];

	ld.global.s32  $r15, [$r11+36];	#  id:91

	ld.global.s32  $r16, [$r11+32];	#  id:92

	mul.lo.u32  $r17, $r10, 4;    	#  

	add.u32  $r18, $r17, $r11;    	#  

$Lt_0_1:

 #<loop> Loop body line 23

 #  .loc	10	23	0

 #  23  l2:	if(dst[0]>0)goto l0;

	ld.global.s32  $r19, [$r11+0];	#  id:93

	mov.s32  $r20, 0;              #  

	setp.le.s32  $p2, $r19, $r20;  #  

	@$p2 bra  $Lt_0_20;            #  

$LBB5_shuffle:

 #<loop> Part of loop body line 23, head labeled $Lt_0_1

	bra.uni  $Lt_0_2;              #  

$Lt_0_20:

 #<loop> Part of loop body line 23, head labeled $Lt_0_1

 #  .loc	10	24	0

 #  24  	a++;

	add.s32  $r15, $r15, 1;        #  

 #  .loc	10	25	0

 #  25  	dst[n+7]=a+b-n;

	add.s32  $r21, $r15, $r16;    	#  

 #  .loc	10	18	0

	ld.shared.s32  $r10, [$r4+0];  #  

 #  .loc	10	25	0

	sub.s32  $r22, $r21, $r10;    	#  

	st.global.s32  [$r18+28], $r22;	#  id:94

 #  .loc	10	27	0

 #  26  l1:	if(dst[1]>0)goto l1;

 #  27  	b++;

	add.s32  $r16, $r16, 1;        #  

 #  .loc	10	28	0

 #  28  	dst[n+11]=a+b-n;

	add.s32  $r23, $r15, $r16;    	#  

	sub.s32  $r24, $r23, $r10;    	#  

	st.global.s32  [$r18+44], $r24;	#  id:95

$Lt_0_4:

 #<loop> Part of loop body line 23, head labeled $Lt_0_1

 #  .loc	10	29	0

 #  29  l4:	if(dst[2]>0)goto l2;

	ld.global.s32  $r25, [$r11+8];	#  id:96

	mov.s32  $r26, 0;              #  

	setp.le.s32  $p3, $r25, $r26;  #  

	@$p3 bra  $Lt_0_24;            #  

$LBB8_shuffle:

 #<loop> Part of loop body line 23, head labeled $Lt_0_1

	bra.uni  $Lt_0_1;              #  

$Lt_0_24:

 #<loop> Part of loop body line 23, head labeled $Lt_0_1

 #  .loc	10	30	0

 #  30  	b++;

	add.s32  $r16, $r16, 1;        #  

 #  .loc	10	31	0

 #  31  	dst[n+23]=a+b-n;

	add.s32  $r27, $r15, $r16;    	#  

 #  .loc	10	18	0

	ld.shared.s32  $r10, [$r4+0];  #

 #  .loc	10	31	0

	sub.s32  $r28, $r27, $r10;    	#

	st.global.s32  [$r18+92], $r28;	#  id:97

 #  .loc	10	33	0

 #  32  l3:	if(dst[3]>0)goto l3;

 #  33  	a++;

	add.s32  $r15, $r15, 1;        #  

 #  .loc	10	34	0

 #  34  	dst[n+69]=a+b-n;

	add.s32  $r29, $r15, $r16;    	#  

	sub.s32  $r30, $r29, $r10;    	#  

	st.global.s32  [$r18+276], $r30;	#  id:98

$Lt_0_2:

 #<loop> Part of loop body line 23, head labeled $Lt_0_1

 #  .loc	10	35	0

 #  35  l0:	if(dst[4]>0)goto l4;

	ld.global.s32  $r31, [$r11+16];	#  id:99

	mov.s32  $r32, 0;              #  

	setp.le.s32  $p4, $r31, $r32;  #  

	@$p4 bra  $Lt_0_28;            #  

$LBB11_shuffle:

 #<loop> Part of loop body line 23, head labeled $Lt_0_1

	bra.uni  $Lt_0_4;              #  

$Lt_0_28:

 #  .loc	10	37	0

 #  36  	b++;

 #  37  	dst[n]=a+b-n;

	add.s32  $r33, $r15, $r16;    	#  

 #  .loc	10	18	0

	ld.shared.s32  $r10, [$r4+0];  #  

 #  .loc	10	37	0

	sub.s32  $r34, $r33, $r10;    	#  

	add.s32  $r35, $r34, 1;        #  

	st.global.s32  [$r18+0], $r35;	#  id:100

	exit;                          #  

	} # shuffle

	.version 1.1

The problem is in variable n.

In some later accesses to n, a ld.shared.s32 $r10, [$r4+0]; is generated before the instruction concerning n. However, it’s totally useless (since n is guaranteed to be loaded by then), and WRONG (since shared memory is cleared after n is loaded the first time).