Hi All,
I created the following simple program to illustrate my problem: (populating an integer vector in a parallel fashion)
#include<stdio.h>
#include<cuda.h>
#include<cuda_runtime.h>
#include<driver_types.h>
__global__ void mykernel(int* _debugv) {
_debugv[threadIdx.x] = 1;
}
int main() {
// Allocate host vector (initialized to zero)
int block_sz = 32;
int* debugv = (int*) calloc(block_sz, sizeof(int));
// Allocate and populate device side vector
int* _debugv;
cudaMalloc((void**) &_debugv, block_sz * sizeof(int));
cudaMemcpy((void*) _debugv, (void*) debugv, block_sz * sizeof(int), cudaMemcpyHostToDevice);
// Launch the kernel within a stream and copy back the result on the same stream
cudaStream_t stream;
cudaStreamCreate(&stream);
mykernel<<<1, block_sz, 0, stream>>>(_debugv);
cudaMemcpyAsync((void*) debugv, (void*) _debugv, block_sz * sizeof(int), cudaMemcpyDeviceToHost, stream);
// Sync the host thread with the stream
cudaStreamSynchronize(stream);
// Print the result
int i;
for (i = 0; i < block_sz; i++) {
printf("%s%d%s", (i == 0 ? "Debug: [": ""), debugv[i], (i == block_sz - 1 ? "]\n" : ","));
}
cudaStreamDestroy(stream);
return 0;
}
I was expecting the debug vector to output a series of ‘1’ digits but I’m getting all 0’s. Also, the problem goes away if I remove streams as below:
#include<stdio.h>
#include<cuda.h>
#include<cuda_runtime.h>
#include<driver_types.h>
__global__ void mykernel(int* _debugv) {
_debugv[threadIdx.x] = 1;
}
int main() {
int block_sz = 32;
int* debugv = (int*) calloc(block_sz, sizeof(int));
int* _debugv;
cudaMalloc((void**) &_debugv, block_sz * sizeof(int));
cudaMemcpy((void*) _debugv, (void*) debugv, block_sz * sizeof(int), cudaMemcpyHostToDevice);
mykernel<<<1, block_sz>>>(_debugv);
cudaMemcpy((void*) debugv, (void*) _debugv, block_sz * sizeof(int), cudaMemcpyDeviceToHost);
int i;
for (i = 0; i < block_sz; i++) {
printf("%s%d%s", (i == 0 ? "Debug: [": ""), debugv[i], (i == block_sz - 1 ? "]\n" : ","));
}
return 0;
}
Streams are vital for my actual application because I need to perform several kernel launches on the same stream (sequentially). Can someone spot what I’m doing wrong?
Thanks a bunch! External Image