Hi All,
Could anybody compile this code and try it using your CUDA version and hardware? I have CUDA 2.0 on linux64 and 280GTX. The problem with this simple code is that it produces crappy result on grids with higher than 1x32768 size (first dim is always 1)… although should return the grid size. here it is:
#include <stdio.h>
#include <stdlib.h>
__global__ void foo(float *d_out)
{
int blidY = blockIdx.y;
int thidX = threadIdx.x;
int thidY = threadIdx.y;
int g_thidY = blidY * 8 + thidY;
if(!thidX)
d_out[g_thidY] = 1.0f;
}
void test(int grid_size)
{
int size = grid_size*8;
float *out, *d_out;
out = (float*)malloc(size*sizeof(float));
cudaMalloc((void**)&d_out, size*sizeof(float));
foo <<< dim3(1, size/8), dim3(8, 8) >>> (d_out);
double result = 0.0;
cudaMemcpy(out, d_out, size*sizeof(float), cudaMemcpyDeviceToHost);
for(int i = 0; i < size; i ++)
result += out[i];
printf("%f\n", result / 8);
free(out);
cudaFree(d_out);
}
void main()
{
for(int i = 32000; i < 33000; i += 1)
test(i);
}
here is my output
.
.
.
32761.000000
32762.000000
32763.000000
32764.000000
32765.000000
32766.000000
32767.000000
32768.000000
1482937404001209024512.000000
1482937403996897017856.000000
78296474278759370254712832.000000
78296495133133742119321600.000000
78296495133133742119321600.000000
156592962484230985571368960.000000
.
.
.