I have a tiny program that moves some data into the global device memory, changes it slightly using one thread in one block, and then reads it back. Here is the code and the output, which shows that the data is only partially changed for some strange reason. What really dumb mistake am I making? If it makes a difference my machine is Red Hat Enterprise 5.4 with a Quadro FX 580, driver 256.35, Cuda 3.1. Thanks~
#include <stdio.h>
#include <cuda.h>
__global__ void kernel(float *x);
int main(void)
{
int i, size;
float *host, *device;
host = (float *) malloc(4 * sizeof(float));
for(i = 0; i < 4; ++i) { host[i] = 1.0; }
printf("host before:\n");
for(i = 0; i < 4; ++i) { printf("%f\n", host[i]); }
size = sizeof(host);
cudaMalloc(&device, size);
cudaMemcpy(device, host, size, cudaMemcpyHostToDevice);
kernel<<<1, 1>>>(device);
cudaMemcpy(host, device, size, cudaMemcpyDeviceToHost);
printf("host after:\n");
for(i = 0; i < 4; ++i) { printf("%f\n", host[i]); }
cudaFree(device);
free(host);
return 0;
}
__global__ void kernel(float *x)
{
x[0] = 2.0;
x[1] = 2.0;
x[2] = 2.0;
x[3] = 2.0;
}
Gives this output:
host before:
1.000000
1.000000
1.000000
1.000000
host after:
2.000000
2.000000
1.000000
1.000000