I have a couple of kernels, each of which write results to Sout and printf’s within the kernels confirm things are operating correctly, no problems.
If I uncomment the line “cudaMemcpy(Sout_h, Sout_d, 0x4000000, cudaMemcpyDeviceToHost);” to get the results back, it compiles and cuda-memchecks without error, but when executed, there is a seg fault in main().
I am very new to debugging - valgrind --tool=memcheck produces:
==4003== Warning: client switching stacks? SP change: 0x1fff000348 → 0x1ffb000300
==4003== to suppress, use: --max-stackframe=67108936 or greater
==4003== Invalid write of size 8
==4003== at 0x403216: main ()
==4003== Address 0x1ffb000320 is on thread 1’s stack
==4003==
==4003==
==4003== Process terminating with default action of signal 11 (SIGSEGV): dumping core
==4003== Access not within mapped region at address 0x1FFB000320
==4003== at 0x403216: main ()
I’m baffled by the reference to “Invalid write of size 8”, as I am not using any 64 bit values.
As I get stuck into learning gdb, I wondered if anyone can see a glaring error here?
Many thanks.
int main(void){
uint8_t A_h[8] = {
0xBA, 0x3F, 0xD8, 0xD1, 0x2A, 0xEE, 0x14, 0x2C
};
uint8_t *A_d;
uint8_t B_h[12] = {
0xA8, 0x3F, 0x28, 0x41, 0xC9, 0xFE, 0x74, 0xFD, 0xB7, 0x8A, 0x27, 0x00
};
uint8_t *B_d;
uint32_t Sout_h[0x1000000];
uint32_t *Sout_d;
//alloc device memory
cudaMalloc((void **) &A_d, 8);
cudaMalloc((void **) &B_d, 12);
cudaMalloc((void **) &Sout_d, 0x4000000); // 64MB
//Copy array
cudaMemcpy(A_d, A_h, 8, cudaMemcpyHostToDevice);
cudaMemcpy(B_d, B_h, 12, cudaMemcpyHostToDevice);
kern1<<<BLOCKS,THREADS,0,0>>>(A_d, B_d, Sout_d);
kern2<<<BLOCKS,THREADS,0,0>>>(A_d, B_d, Sout_d);
cudaDeviceSynchronize();
// cudaMemcpy(Sout_h, Sout_d, 0x4000000, cudaMemcpyDeviceToHost);
//free device memory
cudaFree(A_d);
cudaFree(B_d);
cudaFree(Sout_d);
return 0;
}