Hi,
I found a small example were nvcc
create wrong results while unrolling loops.
System:
- Ubuntu 12.04.4 LTS
- NVIDIA-SMI 331.62 Driver Version: 331.62
- cuda 6.0
- Card: Tesla k20m
example code:
#include <iostream>
#include <cuda.h>
#include <stdio.h>
template<int N>
struct AnyClass
{
static const int begin = -1;
static const int end = begin + N;
template<typename T >
__device__ void operator()(T* ptr)
{
/* increment loop begin and end by one*/
const int offset_k = static_cast<int>(ptr[0]!=0);
for (int k = begin+offset_k ; k < end +offset_k; ++k) {
if ( k < begin+offset_k)
printf("error k=%i; begin=%i offset_k=%i => but k should be >= %i\n", k,begin,offset_k,begin+offset_k);
else
printf("OK k=%i; begin=%i offset_k=%i \n", k,begin,offset_k);
}
}
};
template <int N>
__global__ void run(float* ptr,int v){
ptr[0]=v;
AnyClass<N>()(ptr);
}
int main(int argc, char *argv[]){
float* d = 0;
unsigned int n = 1;
cudaMalloc((void**)&d, n*sizeof(float));
run<2><<<1, 1>>>(d,1);
cudaThreadSynchronize();
return 0;
}
The code should run through two cycles, but it somehow unrolls it to four cycles.
Output:
error k=-2; begin=-1 offset_k=1 => but k should be >= 0
error k=-1; begin=-1 offset_k=1 => but k should be >= 0
OK k=0; begin=-1 offset_k=1
OK k=1; begin=-1 offset_k=1
It enters two loop cycles.
In this example offset_k
is 1 (it is not equal to zero)
begin
= -1
k = begin + offset_k = -1 + 1
k should never be less than 0 but the for loop goes from [-2;1] ← that is wrong
Note: NVCC generates right results in cuda 5.0
Thank for any help, I applied for a CUDA developer account to open a bug report.