I am writing an array addition program, which adds two arrays on the device in windows xp. The program works fine for any no of elements on Linux os, but on windows xp, it crashes for more than 90,000 elements in the array. The error i get is a pop window which says the exe file has encountered a problem and needs to be closed, i don’t understand why it happens in xp.
Can anybody help me out??
/* This program implements the addition of two arrays
using threads in the GPU. */
// includes, system
#include <stdio.h>
#include <cutil.h>
// define the dimensions
#define THX 256
#define THY 1
#define DIM 82000
// Device Code
__global__ void add_in_gpu(int *A,int *B,int *C)
{
int threadx,blockx;
// block index
blockx = blockIdx.x;
//thread index
threadx=threadIdx.x;
C[blockx*THX+threadx] = A[blockx*THX+threadx] + B[blockx*THX+threadx];
__syncthreads();
}
/************************************** Main Program ********************************************/
int main()
{
//CUT_DEVICE_INIT();
//Define the Grids and Threads
dim3 threads(THX,THY);
dim3 grids(DIM/threads.x+1,1);
//define dimensions
int *device_b;
int *device_a;
int *device_c;
int A[DIM];
int B[DIM],C[DIM];
int i,iter=50;
// create the timer
unsigned int timer=0;
CUT_SAFE_CALL(cutCreateTimer(&timer));
// initialize the arrays A & B
for(i=0;i<DIM;i++)
{
A[i]=1;
B[i]=2;
}
// print the arrays A & B
printf("\n Array A\n\n");
for(i=0;i<DIM;i++)
printf("\t%d",A[i]);
printf("\n");
printf("\n Array B\n\n");
for(i=0;i<DIM;i++)
printf("\t%d",B[i]);
//ALLOCATE MEMORY IN GPU
int size=sizeof(int)*DIM;
CUDA_SAFE_CALL(cudaMalloc((void**)&device_a,size));
CUDA_SAFE_CALL(cudaMalloc((void**)&device_b,size));
CUDA_SAFE_CALL(cudaMalloc((void**)&device_c,size));
CUT_CHECK_ERROR("Kernel execution failed");
//FROM MEMORY FROM HOST TO DEVICE
CUDA_SAFE_CALL(cudaMemcpy(device_a,A,size,cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(device_b,B,size,cudaMemcpyHostToDevice));
// start the timer and specify the no of iterations
CUT_SAFE_CALL(cutStartTimer(timer));
for(int i=0;i<iter;i++)
{
// INVOKING KERNEL
add_in_gpu<<<grids,threads>>>(device_a,device_b,device_c);
// check if kernel execution generated and error
CUT_CHECK_ERROR("Kernel execution failed");
}
// stop the timer and fetch the the timer value
CUT_SAFE_CALL(cutStopTimer(timer));
// Result is copied to Host
CUDA_SAFE_CALL(cudaMemcpy(C,device_c,size,cudaMemcpyDeviceToHost));
// printing the resultant array
printf("\n");
printf("\n The sum of two arrays in GPU\n\n");
for(i=0;i<DIM;i++)
{
printf("%d\t%d\n",i,C[i]);
}
printf("\n\nGPU Processing time: %f (ms)\n",(cutGetTimerValue(timer)));
printf("\n");
//Free Device and Host Memory
CUDA_SAFE_CALL(cudaFree(device_a));
CUDA_SAFE_CALL(cudaFree(device_b));
CUDA_SAFE_CALL(cudaFree(device_c));
}