Unable to synchronize with a specific stream

Hi All,

I created the following simple program, which populates an integer vector in parallel, to illustrate my problem:

#include<stdio.h>

#include<cuda.h>

#include<cuda_runtime.h>

#include<driver_types.h>

// Writes the value 1 into one element of _debugv per thread.
//
// The element index is threadIdx.x only (blockIdx is not used), so each block
// writes elements [0, blockDim.x). _debugv must therefore point to device
// memory holding at least blockDim.x ints.
__global__ void mykernel(int* _debugv) {
        const unsigned int tid = threadIdx.x;
        _debugv[tid] = 1;
}

// Report-only status check: prints a diagnostic to stderr when a CUDA runtime
// call fails and then lets the program continue. It is deliberately non-fatal
// so that this reproduction still reaches the final print and shows the same
// stdout as before, now accompanied by the failing call (if any).
#define CUDA_WARN(call)                                                        \
        do {                                                                   \
                cudaError_t status_ = (call);                                  \
                if (status_ != cudaSuccess) {                                  \
                        fprintf(stderr, "CUDA error at line %d (%s): %s\n",    \
                                __LINE__, #call, cudaGetErrorString(status_)); \
                }                                                              \
        } while (0)

// Reproduction: fill a 32-element vector with 1s on the device using a
// non-default stream, copy it back asynchronously on that stream, wait for the
// stream, and print the host copy.
// Returns 0 on completion, 1 if the host buffer cannot be allocated.
int main() {
        // Allocate host vector (initialized to zero)
        int block_sz = 32;
        int* debugv = (int*) calloc(block_sz, sizeof(int));
        if (debugv == NULL) {
                fprintf(stderr, "calloc failed\n");
                return 1;
        }
        // debugv is ordinary (pageable) host memory from calloc. It is kept
        // that way on purpose: this listing reproduces the reported problem.

        // Allocate and populate device side vector
        int* _debugv = NULL;
        CUDA_WARN(cudaMalloc((void**) &_debugv, block_sz * sizeof(int)));
        CUDA_WARN(cudaMemcpy((void*) _debugv, (void*) debugv, block_sz * sizeof(int), cudaMemcpyHostToDevice));

        // Launch the kernel within a stream and copy back the result on the same stream
        cudaStream_t stream;
        CUDA_WARN(cudaStreamCreate(&stream));

        mykernel<<<1, block_sz, 0, stream>>>(_debugv);
        // A kernel launch returns nothing; launch errors must be fetched explicitly.
        CUDA_WARN(cudaGetLastError());

        // NOTE(review): if the printed vector is not what you expect, the
        // status reported for this call is the first thing to look at.
        CUDA_WARN(cudaMemcpyAsync((void*) debugv, (void*) _debugv, block_sz * sizeof(int),  cudaMemcpyDeviceToHost, stream));

        // Sync the host thread with the stream
        CUDA_WARN(cudaStreamSynchronize(stream));

        // Print the result
        int i;
        for (i = 0; i < block_sz; i++) {
                printf("%s%d%s", (i == 0 ? "Debug: [": ""), debugv[i], (i == block_sz - 1 ? "]\n" : ","));
        }

        // Release everything that was acquired above.
        CUDA_WARN(cudaStreamDestroy(stream));
        CUDA_WARN(cudaFree(_debugv));
        free(debugv);

        return 0;
}

I was expecting the debug vector to output a series of ‘1’ digits but I’m getting all 0’s. Also, the problem goes away if I remove streams as below:

#include<stdio.h>

#include<cuda.h>

#include<cuda_runtime.h>

#include<driver_types.h>

// Sets one element of _debugv to 1 per thread, indexed by threadIdx.x.
//
// blockIdx is not part of the index, so the buffer behind _debugv needs at
// least blockDim.x ints and every block writes the same range.
__global__ void mykernel(int* _debugv) {
        int* const slot = _debugv + threadIdx.x;
        *slot = 1;
}

// Fatal status check for use inside main(): on failure prints the failing
// call and the CUDA error string to stderr, then returns 1 from main.
#define CUDA_CHECK(call)                                                       \
        do {                                                                   \
                cudaError_t status_ = (call);                                  \
                if (status_ != cudaSuccess) {                                  \
                        fprintf(stderr, "CUDA error at line %d (%s): %s\n",    \
                                __LINE__, #call, cudaGetErrorString(status_)); \
                        return 1;                                              \
                }                                                              \
        } while (0)

// Stream-free variant: fill a 32-element vector with 1s on the device using
// the default stream and blocking copies, then print the host copy.
// Returns 0 on success, 1 on any allocation or CUDA failure.
int main() {
        // Host vector, zero-initialised by calloc.
        int block_sz = 32;
        int* debugv = (int*) calloc(block_sz, sizeof(int));
        if (debugv == NULL) {
                fprintf(stderr, "calloc failed\n");
                return 1;
        }

        // Device vector, seeded with the (all-zero) host contents.
        int* _debugv = NULL;
        CUDA_CHECK(cudaMalloc((void**) &_debugv, block_sz * sizeof(int)));
        CUDA_CHECK(cudaMemcpy((void*) _debugv, (void*) debugv, block_sz * sizeof(int), cudaMemcpyHostToDevice));

        mykernel<<<1, block_sz>>>(_debugv);
        // A kernel launch returns nothing; launch errors must be fetched explicitly.
        CUDA_CHECK(cudaGetLastError());

        // Blocking copy back to the host; its return status is checked like
        // every other runtime call here.
        CUDA_CHECK(cudaMemcpy((void*) debugv, (void*) _debugv, block_sz * sizeof(int),  cudaMemcpyDeviceToHost));

        int i;
        for (i = 0; i < block_sz; i++) {
                printf("%s%d%s", (i == 0 ? "Debug: [": ""), debugv[i], (i == block_sz - 1 ? "]\n" : ","));
        }

        // Release device and host buffers.
        CUDA_CHECK(cudaFree(_debugv));
        free(debugv);

        return 0;
}

Streams are vital for my actual application because I need to perform several kernel launches on the same stream (sequentially). Can someone spot what I’m doing wrong?

Thanks a bunch! :thanks:

Ok, I think I asked the question too soon :smile:

I had forgotten that cudaMemcpyAsync() must always be invoked on page-locked (pinned) host memory, e.g. memory allocated with cudaMallocHost(). So the correct listing should be:

#include<stdio.h>

#include<cuda.h>

#include<cuda_runtime.h>

#include<driver_types.h>

// Stores 1 into _debugv at the position given by the calling thread's
// threadIdx.x.
//
// Only threadIdx.x is used for indexing, so _debugv must reference at least
// blockDim.x ints of device memory; additional blocks would rewrite the same
// elements.
__global__ void mykernel(int* _debugv) {
        const unsigned int idx = threadIdx.x;
        _debugv[idx] = 1;
}

// Fatal status check for use inside main(): on failure prints the failing
// call and the CUDA error string to stderr, then returns 1 from main.
#define CUDA_CHECK(call)                                                       \
        do {                                                                   \
                cudaError_t status_ = (call);                                  \
                if (status_ != cudaSuccess) {                                  \
                        fprintf(stderr, "CUDA error at line %d (%s): %s\n",    \
                                __LINE__, #call, cudaGetErrorString(status_)); \
                        return 1;                                              \
                }                                                              \
        } while (0)

// Stream variant with a page-locked host buffer: fill a 32-element vector
// with 1s on the device in a non-default stream, copy it back asynchronously
// on the same stream, wait for the stream, and print the host copy.
// Returns 0 on success, 1 on any CUDA failure.
int main() {
        int block_sz = 32;

        // Host vector allocated with cudaMallocHost, as required for the
        // asynchronous copy below.
        int* debugv = NULL;
        CUDA_CHECK(cudaMallocHost((void**) &debugv, block_sz * sizeof(int)));

        // cudaMallocHost is not relied on to clear the buffer; zero it here
        // so the host-to-device copy below transfers defined data.
        for (int k = 0; k < block_sz; k++) {
                debugv[k] = 0;
        }

        // Device vector, seeded with the (all-zero) host contents.
        int* _debugv = NULL;
        CUDA_CHECK(cudaMalloc((void**) &_debugv, block_sz * sizeof(int)));
        CUDA_CHECK(cudaMemcpy((void*) _debugv, (void*) debugv, block_sz * sizeof(int), cudaMemcpyHostToDevice));

        // Kernel and copy-back are issued to the same stream, in that order.
        cudaStream_t stream;
        CUDA_CHECK(cudaStreamCreate(&stream));

        mykernel<<<1, block_sz, 0, stream>>>(_debugv);
        // A kernel launch returns nothing; launch errors must be fetched explicitly.
        CUDA_CHECK(cudaGetLastError());

        CUDA_CHECK(cudaMemcpyAsync((void*) debugv, (void*) _debugv, block_sz * sizeof(int),  cudaMemcpyDeviceToHost, stream));

        // Block the host until everything queued on the stream has finished;
        // debugv is only read after this point.
        CUDA_CHECK(cudaStreamSynchronize(stream));

        int i;
        for (i = 0; i < block_sz; i++) {
                printf("%s%d%s", (i == 0 ? "Debug: [": ""), debugv[i], (i == block_sz - 1 ? "]\n" : ","));
        }

        // Release the stream and both buffers.
        CUDA_CHECK(cudaStreamDestroy(stream));
        CUDA_CHECK(cudaFree(_debugv));
        CUDA_CHECK(cudaFreeHost(debugv));

        return 0;
}

Sorry about the mess :tongue: