I’m having a problem when using templates with recursion. The code works perfectly fine in emulation mode, but when I switch to a release build it produces some weird results. The idea is to unroll the innermost loop like this:
// 2D texture reference over unsigned char texels. With cudaReadModeElementType,
// tex2D() returns the stored element value as-is (no conversion to a
// normalized float). Read by the comp<> device functions below.
texture<unsigned char, 2, cudaReadModeElementType> tex;
// Compile-time-unrolled sum of color distances for groups x, x-1, ..., 0.
//
// For template index x this reads three consecutive texels from `tex`,
// starting at column x*3+startx of row pos_y, subtracts each from the
// matching reference value (bl, gr, re), and adds the Euclidean norm of the
// three differences to the result of comp<x-1>. The recursion terminates at
// the explicit comp<0> specialization, so x must be >= 0.
//
// Parameters:
//   startx     column of the first texel of group 0
//   pos_y      texture row to read from
//   bl, gr, re reference values (presumably blue/green/red - confirm with caller)
//   rozn_b, rozn_g, rozn_r
//              scratch storage only: incoming values are overwritten before
//              use, and being by-value the caller's variables are not modified
//
// Returns: sum over i = x..0 of sqrt(db_i^2 + dg_i^2 + dr_i^2).
template <int x>
__device__ float comp
(
unsigned int startx,
unsigned int pos_y,
unsigned char bl,
unsigned char gr,
unsigned char re,
int rozn_b,
int rozn_g,
int rozn_r
)
{
    // unsigned char - unsigned char promotes to int, so the differences are
    // signed values in [-255, 255].
    rozn_b = bl - tex2D(tex, x * 3 + startx,     pos_y);
    rozn_g = gr - tex2D(tex, x * 3 + startx + 1, pos_y);
    rozn_r = re - tex2D(tex, x * 3 + startx + 2, pos_y);

    // __mul24 multiplies the low 24 bits of its operands; the differences fit
    // easily, so the squares are exact.
    const int dist_sq = __mul24(rozn_b, rozn_b) +
                        __mul24(rozn_g, rozn_g) +
                        __mul24(rozn_r, rozn_r);

    // The template argument list must be closed with '>' before the call
    // arguments: comp<x - 1>(...).
    return sqrtf(static_cast<float>(dist_sq)) +
           comp<x - 1>(startx, pos_y, bl, gr, re, rozn_b, rozn_g, rozn_r);
}
// Base case of the comp<x> recursion (x == 0).
//
// Reads the three texels at columns startx, startx+1 and startx+2 of row
// pos_y from `tex`, subtracts each from the matching reference value
// (bl, gr, re) and returns the Euclidean norm of the three differences.
//
// rozn_b / rozn_g / rozn_r are not read; they exist only so the signature
// matches the primary template.
template <>
__device__ float comp<0>(unsigned int startx,
                         unsigned int pos_y,
                         unsigned char bl,
                         unsigned char gr,
                         unsigned char re,
                         int rozn_b,
                         int rozn_g,
                         int rozn_r)
{
    // Integer promotion makes each difference a signed int.
    const int diff_b = bl - tex2D(tex, startx,     pos_y);
    const int diff_g = gr - tex2D(tex, startx + 1, pos_y);
    const int diff_r = re - tex2D(tex, startx + 2, pos_y);

    int sum_sq = __mul24(diff_b, diff_b);
    sum_sq += __mul24(diff_g, diff_g);
    sum_sq += __mul24(diff_r, diff_r);

    // The int sum is converted to float on the call, as in the original.
    return sqrtf(sum_sq);
}
I’m calling it in this way:
// Instantiates comp with x = KERNEL_RADIUS*2; the template recursion then
// evaluates x = KERNEL_RADIUS*2 down to 0 on row `a`, and the total is
// added to sum.
sum+=comp<(KERNEL_RADIUS*2)>(startx,a,bl,gr,re,rozn_b,rozn_g,rozn_r);
Using a simple “for” loop in an analogous way gives correct results. Does anyone have any idea what the problem with this code could be?
I use an Intel Core 2 Quad (4 × 2.4 GHz), 2 GB RAM, an 8600 GTS, Windows XP, and CUDA SDK & Toolkit v1.0.
Jakub