Hi All,
I have implemented a kernel :
/*
 * Kernel_func
 *
 * Each thread whose flat index is below `limit` fills its private Array_1
 * and passes it to device_function(), which writes its results to Array_2.
 *
 * `limit` and device_function() are defined outside this snippet;
 * device_function() must be declared before this kernel.
 *
 * Host-side note: a kernel launch does not itself report execution
 * failures. Call cudaGetLastError() after the launch and check the result
 * of the next synchronizing call to find out whether the kernel really ran.
 */
__global__
void Kernel_func()
{
    // Flat 1-D thread index. __mul24 multiplies only the low 24 bits of
    // each operand.
    long idx = __mul24( blockIdx.x , blockDim.x ) + threadIdx.x;

    // Per-thread scratch arrays (declared without __shared__).
    uchar4 Array_1[64];   // input handed to device_function
    short4 Array_2[8];    // output written by device_function

    if( idx < limit )
    {
        //Here I am calculating value of Array_1.

        // NOTE(review): the arguments do not depend on `l`, so as posted
        // every iteration repeats the same call. Presumably the real code
        // differs per iteration; confirm.
        for( int l = 0; l < 8; ++l )
        {
            // The terminating semicolon was missing in the original.
            device_function(Array_1,Array_2);
        }
        //remaining body of the kernel.
    }
}
This kernel calls a device function:
/*
 * device_function
 *
 * For each of the 8 consecutive groups of 8 elements in Array_1, computes
 *
 *     result_x[row] = cast_short4((a0*x0 + a1*x1 + ... + a7*x7 + 255) >> 13)
 *
 * where x0..x7 are the elements Array_1[8*row .. 8*row+7] and a0..a7 are
 * coefficients fetched from the texture hCoeff_XTexture.
 *
 * Parameters:
 *   Array_1  - input; elements 0..63 are read.
 *   result_x - output; elements 0..7 are written.
 *
 * Relies on definitions outside this snippet: hCoeff_XTexture,
 * cast_short4(), and the user-defined operators (short * uchar4, the
 * additions, and >>) applied to the intermediate values.
 *
 * Fixes relative to the posted version:
 *   - results go to the `result_x` parameter; the original wrote to
 *     `Array_2`, which is not declared in this function;
 *   - `v[13]` (undeclared) is read as `Array_1[13]`;
 *   - a1 is fetched from texel 1; the original fetched texel 0 twice,
 *     which looks like a copy-paste slip. Revert if a1 == a0 was intended.
 */
__device__
void device_function( uchar4 *Array_1, short4 * result_x )
{
    // Coefficients a0..a7 come from texels 0..7. They stay
    // `signed short int` so the same user-defined operator* is selected.
    signed short int a0 = tex1Dfetch(hCoeff_XTexture, 0 );
    signed short int a1 = tex1Dfetch(hCoeff_XTexture, 1 );
    signed short int a2 = tex1Dfetch(hCoeff_XTexture, 2 );
    signed short int a3 = tex1Dfetch(hCoeff_XTexture, 3 );
    signed short int a4 = tex1Dfetch(hCoeff_XTexture, 4 );
    signed short int a5 = tex1Dfetch(hCoeff_XTexture, 5 );
    signed short int a6 = tex1Dfetch(hCoeff_XTexture, 6 );
    signed short int a7 = tex1Dfetch(hCoeff_XTexture, 7 );

    // One output per group of 8 inputs. The expression keeps the shape of
    // the hand-unrolled original, so the operators apply in the same order.
    for( int row = 0; row < 8; ++row )
    {
        uchar4 *in = Array_1 + 8 * row;
        result_x[row] = cast_short4(((a0 * in[0]) + (a1 * in[1]) + (a2 * in[2]) + (a3 * in[3]) +
                                     (a4 * in[4]) + (a5 * in[5]) + (a6 * in[6]) + (a7 * in[7]) + 255) >> 13);
    }
}
For the device function I have defined a multiplication operator * (for short int and char4 types) and a casting function cast_short4 that casts an int4 value to short4.
My problem is this: when the kernel is launched it returns cudaSuccess, but its body is not executed. If I comment out the call to device_function, the remaining body is executed.
I cannot understand why device_function causes this problem. My multiplication operator * (for short int and char4 types) and the casting function cast_short4 (int4 to short4) work fine in another kernel, so the problem is not there.
Can anyone help?
Please help, I am completely stuck here.
Is limit a constant variable, defined in another compilation unit?
Does the kernel go inside the ‘if’ block?
Hope this helps.
Francesco
Hi All,
I have implemented a kernel :
/*
 * Kernel_func
 *
 * Each thread whose flat index is below `limit` fills its private Array_1
 * and passes it to device_function(), which writes its results to Array_2.
 *
 * `limit` and device_function() are defined outside this snippet;
 * device_function() must be declared before this kernel.
 *
 * Host-side note: a kernel launch does not itself report execution
 * failures. Call cudaGetLastError() after the launch and check the result
 * of the next synchronizing call to find out whether the kernel really ran.
 */
__global__
void Kernel_func()
{
    // Flat 1-D thread index. __mul24 multiplies only the low 24 bits of
    // each operand.
    long idx = __mul24( blockIdx.x , blockDim.x ) + threadIdx.x;

    // Per-thread scratch arrays (declared without __shared__).
    uchar4 Array_1[64];   // input handed to device_function
    short4 Array_2[8];    // output written by device_function

    if( idx < limit )
    {
        //Here I am calculating value of Array_1.

        // NOTE(review): the arguments do not depend on `l`, so as posted
        // every iteration repeats the same call. Presumably the real code
        // differs per iteration; confirm.
        for( int l = 0; l < 8; ++l )
        {
            // The post originally read `device_function(Array_1,Array-2)`
            // with no semicolon. `Array-2` is a typo for `Array_2`; as
            // written it would parse as the subtraction `Array - 2`.
            device_function(Array_1,Array_2);
        }
        //remaining body of the kernel.
    }
}
This kernel calls a device function:
/*
 * device_function
 *
 * For each of the 8 consecutive groups of 8 elements in Array_1, computes
 *
 *     result_x[row] = cast_short4((a0*x0 + a1*x1 + ... + a7*x7 + 255) >> 13)
 *
 * where x0..x7 are the elements Array_1[8*row .. 8*row+7] and a0..a7 are
 * coefficients fetched from the texture hCoeff_XTexture.
 *
 * Parameters:
 *   Array_1  - input; elements 0..63 are read.
 *   result_x - output; elements 0..7 are written.
 *
 * Relies on definitions outside this snippet: hCoeff_XTexture,
 * cast_short4(), and the user-defined operators (short * uchar4, the
 * additions, and >>) applied to the intermediate values.
 *
 * Fixes relative to the posted version:
 *   - results go to the `result_x` parameter; the original wrote to
 *     `Array_2`, which is not declared in this function;
 *   - `v[13]` (undeclared) is read as `Array_1[13]`;
 *   - a1 is fetched from texel 1; the original fetched texel 0 twice,
 *     which looks like a copy-paste slip. Revert if a1 == a0 was intended.
 */
__device__
void device_function( uchar4 *Array_1, short4 * result_x )
{
    // Coefficients a0..a7 come from texels 0..7. They stay
    // `signed short int` so the same user-defined operator* is selected.
    signed short int a0 = tex1Dfetch(hCoeff_XTexture, 0 );
    signed short int a1 = tex1Dfetch(hCoeff_XTexture, 1 );
    signed short int a2 = tex1Dfetch(hCoeff_XTexture, 2 );
    signed short int a3 = tex1Dfetch(hCoeff_XTexture, 3 );
    signed short int a4 = tex1Dfetch(hCoeff_XTexture, 4 );
    signed short int a5 = tex1Dfetch(hCoeff_XTexture, 5 );
    signed short int a6 = tex1Dfetch(hCoeff_XTexture, 6 );
    signed short int a7 = tex1Dfetch(hCoeff_XTexture, 7 );

    // One output per group of 8 inputs. The expression keeps the shape of
    // the hand-unrolled original, so the operators apply in the same order.
    for( int row = 0; row < 8; ++row )
    {
        uchar4 *in = Array_1 + 8 * row;
        result_x[row] = cast_short4(((a0 * in[0]) + (a1 * in[1]) + (a2 * in[2]) + (a3 * in[3]) +
                                     (a4 * in[4]) + (a5 * in[5]) + (a6 * in[6]) + (a7 * in[7]) + 255) >> 13);
    }
}
For the device function I have defined a multiplication operator * (for short int and char4 types) and a casting function cast_short4 that casts an int4 value to short4.
My problem is this: when the kernel is launched it returns cudaSuccess, but its body is not executed. If I comment out the call to device_function, the remaining body is executed.
I cannot understand why device_function causes this problem. My multiplication operator * (for short int and char4 types) and the casting function cast_short4 (int4 to short4) work fine in another kernel, so the problem is not there.
Can anyone help?
CUDAkk
July 1, 2009, 11:24am
4
Here limit is a global variable equal to height * width, because I am working on a table of data.
If I comment out the call to device_function(), control goes inside the if block. But if I uncomment it (i.e. call the device function), control cannot go inside the if block.
I cannot understand what the problem is.
Is limit a constant variable, defined in another compilation unit?
Does the kernel go inside the ‘if’ block?
Hope this helps.
Francesco
I replaced the device_function() call with its definition. After that I still got the same error.
If you also have a variable named “Array”, that might explain some of the problems, because `Array-2` would then compile as the subtraction `Array - 2`:
you have:
device_function(Array_1,Array-2)
instead of
device_function(Array_1,Array_2)
If you also have a variable named “Array”, that might explain some of the problems, because `Array-2` would then compile as the subtraction `Array - 2`:
you have:
device_function(Array_1,Array-2)
instead of
device_function(Array_1,Array_2)
In my code, Array_2 is there. Please read Array_2 instead of Array-2.