Hi, all.
I am learning CUDA programming from scratch. The following is a (stupid and simple) program, but the compiled binary crashes every time it is executed. I think that is because something reads beyond the limits of memory. However, I can't pin down the exact problem. Can anyone help me? Thanks.
my program environment:
-
Winxp with sp3, Vc 9.0( cl.exe version 15.00.21022.08), Geforce GT 9800, Nvidia Driver 266.58
-
Winxp with sp3, Vc 9.0( cl.exe version 15.00.21022.08), Geforce GT 9800, Nvidia Driver 270.61
-
Win7, Vc 9.0( cl.exe version 15.00.21022.08), C2050 + Quadro FX3800, Nvidia Driver 270.61
My VC says sizeof(int) = 4 bytes, so I only allocated
65535*3*sizeof(int) = 786420 bytes ≈ 786 KB ≈ 0.8 MB
in both host and CUDA memory. I don't think 0.8 MB exceeds the capacity of RAM.
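/*
Search for the numbers x whose square x*x can be split into 2 parts, head and tail
(tail = the last two decimal digits of x*x, head = the remaining leading digits),
such that (head+tail)*(head+tail) = x*x.
For example:
(20 + 25)*(20 + 25) = 45*45 = 2025, which splits into 20 and 25
head=20, tail=25
*/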
#include <time.h>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>
// Number of candidate values: the kernel and main() examine every x in [0, NUM).
#define NUM (65535*2) // per the original report, the program runs fine when NUM is 65535
/*
 * For every candidate x in [0, NUM), split x*x into
 *     head = x*x / 100   (leading decimal digits)
 *     tail = x*x % 100   (last two decimal digits)
 * and record whether (head + tail)^2 == x*x.
 *
 * Output: three ints per candidate, stored at dev_num[3*x .. 3*x+2]:
 *     match    -> { x*x, head, tail }
 *     no match -> { 0, 0, 0 }
 * dev_num must therefore point to at least 3*NUM ints of device memory.
 *
 * Launch: any 1D grid/block configuration works. Each thread starts at its
 * global index and advances by the total number of launched threads until it
 * passes NUM. No shared memory is used.
 */
__global__ void kernel(int *dev_num)
{
    const long long stride = (long long)blockDim.x * gridDim.x;

    for (long long x = threadIdx.x + (long long)blockIdx.x * blockDim.x;
         x < NUM;
         x += stride)
    {
        // 64-bit arithmetic: x*x no longer fits in a 32-bit int once
        // x > 46340, and NUM is far larger than that.
        const long long sq   = x * x;
        const long long head = sq / 100;
        const long long tail = sq % 100;

        int *out = dev_num + 3 * x;

        // x, head and tail are all non-negative, so
        // (head+tail)^2 == x^2  <=>  head+tail == x.
        // Comparing the roots avoids squaring head+tail at all.
        if (head + tail == x)
        {
            // A match implies head <= x, i.e. x*x/100 <= x, so x <= 100
            // and all three values comfortably fit in an int.
            out[0] = (int)sq;
            out[1] = (int)head;
            out[2] = (int)tail;
        }
        else
        {
            // Clear the whole record so the host never copies back
            // uninitialized device memory.
            out[0] = 0;
            out[1] = 0;
            out[2] = 0;
        }
    }
}
// Evaluates a CUDA runtime call and aborts with a readable diagnostic if it
// did not return cudaSuccess. Without this, a failed call is silently ignored
// and the failure only shows up later as garbage output or a crash.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cuda_check_err_ = (call);                              \
        if (cuda_check_err_ != cudaSuccess) {                              \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(cuda_check_err_));                  \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Runs the search kernel over [0, NUM), copies the 3*NUM result ints back to
 * the host and prints every candidate whose record is non-zero, followed by
 * the elapsed time measured with CUDA events.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main(void)
{
    const size_t count = 3 * (size_t)NUM;      // three ints per candidate
    const size_t bytes = count * sizeof(int);

    cudaEvent_t start, stop;
    float elapsedMs = 0.0f;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start, 0));

    // Heap-allocated host buffer. The original `int num[3*NUM]` placed
    // roughly 1.5 MB on the stack, which is larger than the default thread
    // stack on Windows/MSVC (commonly 1 MB) and crashed the program on entry
    // to main(); with NUM == 65535 the array was ~786 KB and still fit.
    std::vector<int> num(count, 0);

    int *dev_num = NULL;
    CUDA_CHECK(cudaMalloc((void**)&dev_num, bytes));

    kernel<<<128, 128>>>(dev_num);
    CUDA_CHECK(cudaGetLastError());            // catches launch-configuration errors

    // Blocking copy: also waits for the kernel to finish and reports any
    // error that occurred while it was running.
    CUDA_CHECK(cudaMemcpy(&num[0], dev_num, bytes, cudaMemcpyDeviceToHost));

    for (size_t i = 0; i < (size_t)NUM; i++)
    {
        // A non-zero first slot marks a match: {x*x, head, tail}.
        if (num[3*i])
        {
            printf("(%2d + %2d)^2 = %4d\n", num[3*i+1], num[3*i+2], num[3*i]);
        }
    }

    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&elapsedMs, start, stop));
    // cudaEventElapsedTime reports milliseconds; convert before printing seconds.
    printf("Time: %f sec(s)\n", elapsedMs / 1000.0f);

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(dev_num));
    return 0;
}