Strange behaviour of a kernel function

Hallo,
I have a program using CUDA under Visual Studio 2005. The problem is the following. I have a kernel function computing some complex math. Its length is about 100 strings. Everything’s going fine until I add a definition of some variable. After this the kernel starts to work strange. There are no errors, programs still works and doesn’t crash. However there are no more output or results of computation from this kernel function. It looks like kernel stopped at the very beginning. It must be mentioned that addition of that variable in emulation mode doesn’t lead to some unexpected results.
It’s also strange that added variable is never used it’s just declarated and all! But with its declaration kernel doesn’t work properly anymore. The code file is attached.
Beginning of the kernel is:

global void FluidElem(CUDA_ARRAY* cuda)
{
const int i = blockIdx.x*blockDim.x + threadIdx.x;
if(i >= cuda->elem_size)
return;

Quad* cur_quad = &cuda->quads[i];
if(cur_quad->Type != 1)//fluid
return;

const FTYPE Source_Height = cuda->Source_Height;
const FTYPE sourcea = cuda->sourcea;
const FPOINTEXT* points = cuda->points;
const FTYPE f0 = cuda->f0;

///////////////////
const FMatrix divu = 0;
const ANISO N = cur_quad->m_Nshear, A11 = cur_quad->H - N, A12 = cur_quad->C, A22 = cur_quad->M;
const FMatrix tau_div = divu*(A11 - N);
const FMatrix p = divuA12 + divuA22;

for(int k=0; k<=N_deg; k++)
for(int l=0; l<=N_deg; l++)
if(cur_quad->m_info(k,l).bd_type & (1<<POINTINFO:S))
cur_quad->GetCT(f0);
//////////////////////
…more different math computing
}

So this mystery variable is p. And if I comment
//const FMatrix p = divuA12 + divuA22;
everything will immediately work properly otherwise kernel does nothing. P is not used in the kernel, it’s just declared.
I have no idea what is the problem and how to figure out it.

System is Windows Server 2003 x64.
The videocard is Quadro FX 4600. Driver’s version 169.61.
Thanks in advance.
Biot.txt (3.92 KB)

strange, because the compiler will likely remove the definition of p since it is not used afterwards in the kernel, so it should generate exactly the same code. generate ptx code for both cases ans see if there is any difference.

Actually yes, and to tell you the truth the difference is quite heavy.

So in the first case I have

const FMatrix p = divuA12 + divuA22;

in the second

const FMatrix p = divuA12;// + divuA22;

The size of ptx file for the first case is 805 Kb, and for the second 801 Kb.

Both these files are attached. Here is just an extract showing this unexpected difference.

FIrst case

ld.local.f32 $f27, [$rd7+12]; // id:20812 __cuda___cuda___cuda___cuda___cuda_divu248888+0x0

mul.f32 	$f28, $f27, $f19;    	//  

st.local.f32 	[$rd8+12], $f28;	//  id:20813 __cuda___cuda_tau_div136108+0x0

ld.local.f32 	$f29, [$rd7+16];	//  id:20814 __cuda___cuda___cuda___cuda___cuda_divu248888+0x0

mul.f32 	$f30, $f29, $f19;    	//  

st.local.f32 	[$rd8+16], $f30;	//  id:20815 __cuda___cuda_tau_div136108+0x0

add.u16 	$rh7, $rh7, 20;      	//  

add.u64 	$rd8, $rd8, 20;      	//  

add.u64 	$rd7, $rd7, 20;      	//  

mov.s16 	$rh8, 100;           	//  

setp.ne.s16 	$p4, $rh7, $rh8; 	//  

@$p4 bra 	$Lt_0_4168;         	//  

.loc	4	323	0

mov.u16 	$rh7, 0;             	//  

mov.f32 	$f31, $f16;          	//  

mov.u64 	$rd7, __cuda___cuda___cuda___cuda___cuda_divu248888;	//  

mov.u64 	$rd9, __cuda___cuda___cuda___T235236208208;	//  

$Lt_0_4176:

// Loop body line 323, nesting depth: 1, iterations: 5

.loc	3	249	0

ld.local.f32 	$f32, [$rd7+0]; 	//  id:20817 __cuda___cuda___cuda___cuda___cuda_divu248888+0x0

mul.f32 	$f33, $f32, $f31;    	//  

st.local.f32 	[$rd9+0], $f33; 	//  id:20818 __cuda___cuda___cuda___T235236208208+0x0

ld.local.f32 	$f34, [$rd7+4]; 	//  id:20819 __cuda___cuda___cuda___cuda___cuda_divu248888+0x0

mul.f32 	$f35, $f34, $f31;    	//  

st.local.f32 	[$rd9+4], $f35; 	//  id:20820 __cuda___cuda___cuda___T235236208208+0x0

ld.local.f32 	$f36, [$rd7+8]; 	//  id:20821 __cuda___cuda___cuda___cuda___cuda_divu248888+0x0

mul.f32 	$f37, $f36, $f31;    	//  

st.local.f32 	[$rd9+8], $f37; 	//  id:20822 __cuda___cuda___cuda___T235236208208+0x0

ld.local.f32 	$f38, [$rd7+12];	//  id:20823 __cuda___cuda___cuda___cuda___cuda_divu248888+0x0

mul.f32 	$f39, $f38, $f31;    	//  

st.local.f32 	[$rd9+12], $f39;	//  id:20824 __cuda___cuda___cuda___T235236208208+0x0

ld.local.f32 	$f40, [$rd7+16];	//  id:20825 __cuda___cuda___cuda___cuda___cuda_divu248888+0x0

mul.f32 	$f41, $f40, $f31;    	//  

st.local.f32 	[$rd9+16], $f41;	//  id:20826 __cuda___cuda___cuda___T235236208208+0x0

add.u16 	$rh7, $rh7, 20;      	//  

add.u64 	$rd9, $rd9, 20;      	//  

add.u64 	$rd7, $rd7, 20;      	//  

mov.s16 	$rh9, 100;           	//  

setp.ne.s16 	$p5, $rh7, $rh9; 	//  

@$p5 bra 	$Lt_0_4176;         	//  

.loc	4	323	0

mov.u16 	$rh7, 0;             	//  

and second

ld.local.f32 $f25, [$rd7+12]; // id:20763 __cuda___cuda___cuda___cuda_divu24888+0x0

mul.f32 	$f26, $f25, $f17;    	//  

st.local.f32 	[$rd8+12], $f26;	//  id:20764 __cuda___cuda_tau_div132108+0x0

ld.local.f32 	$f27, [$rd7+16];	//  id:20765 __cuda___cuda___cuda___cuda_divu24888+0x0

mul.f32 	$f28, $f27, $f17;    	//  

st.local.f32 	[$rd8+16], $f28;	//  id:20766 __cuda___cuda_tau_div132108+0x0

add.u16 	$rh7, $rh7, 20;      	//  

add.u64 	$rd8, $rd8, 20;      	//  

add.u64 	$rd7, $rd7, 20;      	//  

mov.s16 	$rh8, 100;           	//  

setp.ne.s16 	$p4, $rh7, $rh8; 	//  

@$p4 bra 	$Lt_0_4152;         	//  

.loc	4	323	0

mov.u16 	$rh7, 0;             	//  

mov.f32 	$f29, $f16;          	//  

mov.u64 	$rd7, __cuda___cuda___cuda___cuda_divu24888;	//  

mov.u64 	$rd9, __cuda___cuda_p232208;	//  

$Lt_0_4160:

// Loop body line 323, nesting depth: 1, iterations: 5

.loc	3	249	0

ld.local.f32 	$f30, [$rd7+0]; 	//  id:20768 __cuda___cuda___cuda___cuda_divu24888+0x0

mul.f32 	$f31, $f30, $f29;    	//  

st.local.f32 	[$rd9+0], $f31; 	//  id:20769 __cuda___cuda_p232208+0x0

ld.local.f32 	$f32, [$rd7+4]; 	//  id:20770 __cuda___cuda___cuda___cuda_divu24888+0x0

mul.f32 	$f33, $f32, $f29;    	//  

st.local.f32 	[$rd9+4], $f33; 	//  id:20771 __cuda___cuda_p232208+0x0

ld.local.f32 	$f34, [$rd7+8]; 	//  id:20772 __cuda___cuda___cuda___cuda_divu24888+0x0

mul.f32 	$f35, $f34, $f29;    	//  

st.local.f32 	[$rd9+8], $f35; 	//  id:20773 __cuda___cuda_p232208+0x0

ld.local.f32 	$f36, [$rd7+12];	//  id:20774 __cuda___cuda___cuda___cuda_divu24888+0x0

mul.f32 	$f37, $f36, $f29;    	//  

st.local.f32 	[$rd9+12], $f37;	//  id:20775 __cuda___cuda_p232208+0x0

ld.local.f32 	$f38, [$rd7+16];	//  id:20776 __cuda___cuda___cuda___cuda_divu24888+0x0

mul.f32 	$f39, $f38, $f29;    	//  

st.local.f32 	[$rd9+16], $f39;	//  id:20777 __cuda___cuda_p232208+0x0

add.u16 	$rh7, $rh7, 20;      	//  

add.u64 	$rd9, $rd9, 20;      	//  

add.u64 	$rd7, $rd7, 20;      	//  

mov.s16 	$rh9, 100;           	//  

setp.ne.s16 	$p5, $rh7, $rh9; 	//  

@$p5 bra 	$Lt_0_4160;         	//  

.loc	4	323	0

mov.s64 	$rd10, $rd4;         	//

Biot2.txt (800 KB)
Biot1.txt (805 KB)