I have written the following code to do streaming on the GPU but i believe that my arithmetic is somehow flawed because it works for small input sizes but fails for larger ones.
Here is the code
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Vector addition: C = A + B.
*
* This sample is a very basic sample that implements element by element
* vector addition. It is the same as the sample illustrating Chapter 3
* of the programming guide with some additions like error checking.
*
*/
// Includes
#include <stdio.h>
#include <stdlib.h>
#include "timer.h"
#define GPU_MEM_SIZE 512*1024*1024
// Variables
float* h_A;
float* h_B;
float* h_C;
float* d_A;
float* d_B;
float* d_C;
bool noprompt = false;
// Functions
void Cleanup(void);
void RandomInit(float*, int);
// Device code
__global__ void VecAdd(const float* A, const float* B, float* C, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N)
C[i] = A[i] + B[i];
}
// Host code
int main(int argc, char** argv)
{
int blockSize=(GPU_MEM_SIZE/3);
int numEl=atoi(argv[1]);
// int num64=(16*1024*1024);
printf("Vector addition\n");
int N = numEl;
printf("adding two vectors of %d Elements \n",N);
timer elapsed_timer;
int size=N*sizeof(float);
// Allocate input vectors h_A and h_B in host memory
h_A = (float*)malloc(size);
if (h_A == 0) Cleanup();
h_B = (float*)malloc(size);
if (h_B == 0) Cleanup();
h_C = (float*)malloc(size);
if (h_C == 0) Cleanup();
printf("allocated host memory \n");
// Initialize input vectors
RandomInit(h_A, N);
RandomInit(h_B, N);
printf("initialized host memory \n");
// Allocate vectors in device memory
cudaMalloc((void**)&d_A, blockSize) ;
cudaMalloc((void**)&d_B, blockSize) ;
cudaMalloc((void**)&d_C, blockSize) ;
printf("allocated device memory \n");
int iter=0;
int numElRem=N-(blockSize/sizeof(float))*iter;
for (iter=0;numElRem>0;iter++){
numElRem=N-(blockSize/sizeof(float))*iter;
// Copy vectors from host memory to device memory
cudaMemcpy(d_A, h_A+blockSize*iter, numElRem*sizeof(float), cudaMemcpyHostToDevice) ;
cudaMemcpy(d_B, h_B+blockSize*iter, numElRem*sizeof(float), cudaMemcpyHostToDevice) ;
// Invoke kernel
int numElKernel=(numElRem>blockSize/sizeof(float))?blockSize/sizeof(float):numElRem;
int threadsPerBlock = 256;
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElKernel);
// Copy result from device memory to host memory
// h_C contains the result in host memory
cudaMemcpy(h_C+blockSize*iter, d_C, numElRem*sizeof(float), cudaMemcpyDeviceToHost) ;
}
float elapsed_time=elapsed_timer.milliseconds_elapsed();
// Verify result
int i;
for (i = 0; i < N; ++i) {
float sum = h_A[i] + h_B[i];
if (fabs(h_C[i] - sum) > 1e-5)
break;
}
printf("%s\n", (i == N) ? "PASSED" : "FAILED");
printf("time elapsed in milliseconds is %f \n",elapsed_time);
Cleanup();
}
void Cleanup(void)
{
// Free device memory
if (d_A)
cudaFree(d_A);
if (d_B)
cudaFree(d_B);
if (d_C)
cudaFree(d_C);
// Free host memory
if (h_A)
free(h_A);
if (h_B)
free(h_B);
if (h_C)
free(h_C);
cudaDeviceReset();
exit(0);
}
// Allocates an array with random float entries.
void RandomInit(float* data, int n)
{
for (int i = 0; i < n; ++i)
data[i] = rand() / (float)RAND_MAX;
}
Any help will be appreciated.
Apostolis