Dear all I need a help. This code give the same results as CPU for nElem<=16384000 and for larger values it give wrong resultsgiven that my GPU is of CC 3.5 and has a DRAM 4GiBytes. The code is:
#include <time.h>
#include <iostream>
#include <cuda_runtime.h>
#include <stdio.h>
#include <sys/time.h>
using namespace std;
__global__ void sumArraysOnDevice(float *A, float *B, float *C, int N) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N) C[i] = A[i] + B[i];
}
void sumArraysOnHost(float *A,float *B, float *C,int size)
{
for(int i=0;i<size;i++)
C[i]=A[i]+B[i];
}
void initialData(float *ip , int size)
{
// generate different seed for random number
//time_t t;
//srand((unsigned int) time(&t));
for (int i=0; i<size; i++)
{
ip[i] = (float) i ;//( rand() & 0xFF )/10.0f;
}
}
double cpuSecond() {
struct timeval tp;
gettimeofday(&tp,NULL);
return ((double)tp.tv_sec + (double)tp.tv_usec*1.e-6);
}
int main(int argc, char **argv) {
int nElem = 4096000*4;
size_t nBytes = nElem * sizeof(float);
float *h_A, *h_B, *h_C;
float *d_A, *d_B, *d_C;
dim3 block(256);
dim3 grid(nElem/block.x);
cudaMalloc((float**)&d_A, nBytes);
cudaMalloc((float**)&d_B, nBytes);
cudaMalloc((float**)&d_C, nBytes);
h_A =(float *)malloc(nBytes);
h_B =(float *)malloc(nBytes);
h_C =(float *)malloc(nBytes);
initialData(h_A, nElem);
initialData(h_B, nElem);
cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice);
double istart=cpuSecond();
sumArraysOnDevice<<<grid, block>>>(d_A, d_B, d_C,nElem);
cudaDeviceSynchronize();
double iElapse=cpuSecond()-istart;
cout<<"The elapsed time on GPU is "<<iElapse<<"\n";
cudaMemcpy(h_C, d_C, nBytes, cudaMemcpyDeviceToHost);
cout<<h_C[nElem-1]<<"\n";
cout<<"----------------------------------------- \n";
istart=cpuSecond();
sumArraysOnHost(h_A, h_B, h_C,nElem);
iElapse=cpuSecond()-istart;
cout<<"The elapsed time on CPU is "<<iElapse<<"\n";
cout<<h_C[nElem-1]<<"\n";
free(h_A);
free(h_B);
free(h_C);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
return(0);
}