I ran the simple test case below, and valgrind seems to blow up when I call cudaMalloc. In fact, it reports a lot more memory than I had expected. Is this normal?
#include <iostream>
#include <cstdlib>
#include <ctime>
#define SIZE 1000
// Doubles, in place, one element of `array` per thread.
//
// The element index is threadIdx.x alone (no blockIdx term), so the kernel
// addresses at most blockDim.x distinct elements; threads that share the same
// threadIdx.x in different blocks would touch the same element.
// No bounds check is performed: `array` must hold at least blockDim.x ints.
__global__ void test (int *array) {
    const unsigned int idx = threadIdx.x;
    array[idx] = array[idx] * 2;
}
// Terminates the program with a diagnostic on stderr when a CUDA runtime call
// did not return cudaSuccess. `what` names the failing step in the message.
static void checkCuda (cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Prints `values` as rows of ten tab-separated entries. Exactly count / 10
// rows are written, so a trailing partial row (count not a multiple of ten)
// is not printed.
static void printTable (const int *values, int count) {
    for (int row = 0; row < count / 10; row++) {
        for (int col = 0; col < 10; col++)
            std::cout << values[row * 10 + col] << '\t';
        std::cout << std::endl;
    }
}

// Fills a host array with 0..SIZE-1, prints it, doubles every element on the
// GPU with the `test` kernel, copies the result back and prints it again.
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
int main (int argc, char *argv[]) {
    // Must be an array of SIZE ints: it is indexed below and used as the
    // host-side buffer for both cudaMemcpy calls.
    int h_array[SIZE];
    for (int i = 0; i < SIZE; i++)
        h_array[i] = i;

    printTable(h_array, SIZE);
    std::cout << std::endl;

    const size_t bytes = SIZE * sizeof(int);
    int *d_array = NULL;
    checkCuda(cudaMalloc((void **) &d_array, bytes), "cudaMalloc");
    checkCuda(cudaMemcpy(d_array, h_array, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy (host to device)");

    // One block of SIZE threads: the kernel indexes by threadIdx.x only, so
    // each thread handles exactly one element.
    test<<<1,SIZE>>>(d_array);
    // A kernel launch returns no status itself; an invalid launch
    // configuration is reported through cudaGetLastError().
    checkCuda(cudaGetLastError(), "test kernel launch");

    // This blocking copy also surfaces errors raised while the kernel ran.
    checkCuda(cudaMemcpy(h_array, d_array, bytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy (device to host)");
    checkCuda(cudaFree(d_array), "cudaFree");

    printTable(h_array, SIZE);
    return 0;
}