Hi,
I have been running a simple code stated below, which simply adds two vectors to a third one using a kernel running on the device.
I do not get the desired results, it seems that the kernel is not run at all.
The results produced looks like an uninitialized block of memory.
I am running on windows XP with NVIDIA GEForce 9300 GS.
Any ideas?
thanks,
Elad
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>
#include <cuda.h>
// Device code
global void VecAdd(BYTE* A, BYTE* B, BYTE* C, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N)
C[i] = A[i]+B[i];
}
// Host code
int HelloCUDA()
{
cudaSetDevice(1);
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
int n;
int N = 100;
size_t size = N;
// Allocate input vectors h_A and h_B in host memory
BYTE* h_A = (BYTE*)malloc(size);
BYTE* h_B = (BYTE*)malloc(size);
BYTE* h_C = (BYTE*)malloc(size);
// Initialize input vectors ...
memset(h_A,1,size);
memset(h_B,1,size);
memset(h_C,0,size);
// Allocate vectors in device memory
BYTE* stam;
int Err = cudaMalloc((void**)&stam, size);
BYTE* d_A;
Err = cudaMalloc((void**)&d_A, size);
BYTE* d_B;
Err = cudaMalloc((void**)&d_B, size);
BYTE* d_C;
Err = cudaMalloc((void**)&d_C, size);
// Copy vectors from host memory to device memory
Err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
Err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
// Invoke kernel
VecAdd<<<5, 5>>>(d_A, d_B, d_C, N);
// Copy result from device memory to host memory
// h_C contains the result in host memory
Err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
printf("input\n");
for (int i=0;i<size;i++)
printf("%d ",h_A[i]);
printf("result\n");
for (int i=0;i<size;i++)
printf("%d ",h_C[i]);
// Free device memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cudaFree(stam);
// Free host memory ...
free(h_A);
free(h_B);
free(h_C);
return 0;
}