I tested the following codes on GT8800+CUDA1.1. When the n >= 2^16, the results are error: sum is always 0.
When I remove the line of " for (int p = 0; p < n; p++)" in GPU_kernel, it is right.
Why?
gpu_kernel.cu:
#ifndef GPU_KERNEL_H
#define GPU_KERNEL_H
// includes, system
#include <math.h>
// includes, project
#include <cutil.h>
global void
GPU_kenel(int n, float *d_array)
{
int start = threadIdx.x * (n / THREAD_SIZE);
int end = (threadIdx.x + 1) * (n / THREAD_SIZE);
for (int p = 0; p < n; p++)
for (int i = start; i < end; i++)
d_array[i] = 1.0;
__syncthreads();
}
#endif // GPU_KERNEL_H
main.cu:
// size
#define THREAD_SIZE 32
#define BLOCK_SIZE 1
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cutil.h>
// includes, kernels
#include <GPU_kernel.cu>
void main(int argc, char **argv)
{
int n;
n = 1 << (atoi(argv[1]);
printf("n = %d\n", n);
float *h_part;
h_part = (float *)malloc(n*sizeof(float));
assert(h_part);
float *d_part;
CUDA_SAFE_CALL(cudaMalloc((void**) &d_part, n*sizeof(float)));
dim3 threads(THREAD_SIZE, 1);
dim3 grid(BLOCK_SIZE, 1);
// ************************************************************************************
GPU_kenel<<< grid, threads >>>(n, d_part);
// ************************************************************************************
CUT_SAFE_CALL(cudaThreadSynchronize());
// 2.4 check if kernel execution generated and error
CUT_CHECK_ERROR("Kernel execution failed");
// 3. copy result from device to host
CUDA_SAFE_CALL(cudaMemcpy(h_part, d_part, n*sizeof(float), cudaMemcpyDeviceToHost) );
CUT_SAFE_CALL(cudaThreadSynchronize());
// 4. compute sum
printf("Get the sum ");
float sum = 0.;
for(int i = 0; i < n; i++){
sum += h_part[i];
}
printf(" %f\n", sum);
CUT_SAFE_CALL(cudaFree(d_part));
free(h_part);
printf("End execution on GPU\n");
}