Hi. In the process of learning CUDA I’ve run into a strange problem, and I’ve been unable to figure out what exactly is going wrong. I’ve written a simple program, which is very close to (and takes a lot from) one of the examples in the programming guide. In the first case, all goes as expected, but in the second case, the output of the third vector (the result of the kernel calculation) is the same as it was before the kernel call. With the second version, I get the warning, “Advisory: Cannot tell what pointer points to, assuming global memory space” when I compile. Anyone have any ideas what is going on here, and why it matters that I pass the vectors individually rather than as an array?
Thanks much,
Paul
Working code:
[codebox]#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
// Device code
global void VecAdd(float* A, float* B, float* C, int N)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
//if (i < N)
C[i] = A[i]+B[i];
}
using namespace std;
int main (int argc, const char * argv)
{
int N = 10;
size_t size = N * sizeof(float);
float* h_A;
float* h_B;
float* h_C;
float* d_A;
float* d_B;
float* d_C;
//Allocate vector A in host and device memory, set default value
cudaMallocHost((void**)&(h_A), size);
cudaMalloc((void**)&(d_A), size);
for(int itr=0; itr<N; itr++)
{
h_A[itr] = 5.5;
}
//End Allocation
//Allocate vector B in host and device memory, set default value
cudaMallocHost((void**)&(h_B), size);
cudaMalloc((void**)&(d_B), size);
for(int itr=0; itr<N; itr++)
{
h_B[itr] = 5.5;
}
//End Allocation
//Allocate vector C in host and device memory, set default value
cudaMallocHost((void**)&(h_C), size);
cudaMalloc((void**)&(d_C), size);
for(int itr=0; itr<N; itr++)
{
h_C[itr] = 5.5;
}
//End Allocation
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_C, h_C, size, cudaMemcpyHostToDevice);
// Invoke kernel
int threadsPerBlock = 256;
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
for(int incr=0;incr<N; incr++)
{
printf("[");
printf("%i",incr);
printf("]\t");
printf("%f",h_A[incr]);
printf("\t");
printf("%f",h_B[incr]);
printf("\t");
printf("%f",h_C[incr]);
printf("\n");
}
// Free device memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
// Free host memory
cudaFreeHost(h_A);
cudaFreeHost(h_B);
cudaFreeHost(h_C);
return 0;
}
[/codebox]
Misbehaving Code:
[codebox]#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
// Device code
global void VecAdd(float** C, int N)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
//if (i < N)
C[2][i] = C[0][i]+C[1][i];
}
using namespace std;
int main (int argc, const char * argv)
{
int N = 10;
size_t size = N * sizeof(float);
float* hostPtr[3];
float* devPtr[3];
for (int i=0; i<3;i++)
{
printf(cudaGetErrorString(cudaMallocHost((void**)&(hostPtr[i]), size)));
printf("\n");
printf(cudaGetErrorString(cudaMalloc((void**)&(devPtr[i]), size)));
printf("\n");
for(int itr=0; itr<N; itr++)
{
hostPtr[i][itr] = 5.5;
}
}
for(int i = 0; i < 3;i++)
{
cudaMemcpy(devPtr[i], hostPtr[i], size, cudaMemcpyHostToDevice);
}
// Invoke kernel
int threadsPerBlock = 256;
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
VecAdd<<<blocksPerGrid, threadsPerBlock>>>(devPtr, N);
cudaMemcpy(hostPtr[2], devPtr[2], size, cudaMemcpyDeviceToHost);
for(int incr=0;incr<N; incr++)
{
printf("[");
printf("%i",incr);
printf("]\t");
printf("%f",hostPtr[0][incr]);
printf("\t");
printf("%f",hostPtr[1][incr]);
printf("\t");
printf("%f",hostPtr[2][incr]);
printf("\n");
}
// Free device memory
cudaFree(devPtr[0]);
cudaFree(devPtr[1]);
cudaFree(devPtr[2]);
// Free host memory
cudaFreeHost(hostPtr[0]);
cudaFreeHost(hostPtr[1]);
cudaFreeHost(hostPtr[2]);
return 0;
}
[/codebox]