Unable to synchronize with a specific stream

Asiri_Rathnayake · May 21, 2011, 9:55pm

Hi All,

I created the following simple program, which populates an integer vector in parallel, to illustrate my problem:

#include<stdio.h>

#include<cuda.h>

#include<cuda_runtime.h>

#include<driver_types.h>

// Kernel: every thread stores the value 1 into _debugv at index threadIdx.x.
// Only threadIdx.x is used and no bounds check is performed, so _debugv must
// have room for at least one int per thread in the block's x dimension.
__global__ void mykernel(int* _debugv) {

        // One element per thread, selected by the thread's x index within its block.
        _debugv[threadIdx.x] = 1;

}

// Demo driver: zero-fills a 32-int host vector, mirrors it on the device,
// runs mykernel on a user-created stream, copies the device vector back with
// cudaMemcpyAsync on that same stream, synchronizes, and prints the host vector
// as "Debug: [v0,v1,...,v31]".
// None of the CUDA return codes are inspected, so a failing call goes unnoticed.
// Always returns 0.
int main() {

        // Allocate host vector (calloc zero-initializes it)
        // NOTE(review): this is ordinary calloc'd host memory, not page-locked memory.
        // The author's follow-up attributes the all-zero output to passing it to
        // cudaMemcpyAsync below.

        int block_sz = 32;

        int* debugv = (int*) calloc(block_sz, sizeof(int));

// Allocate the device-side vector and fill it with the (all-zero) host contents

        int* _debugv;

        cudaMalloc((void**) &_debugv, block_sz * sizeof(int));

        cudaMemcpy((void*) _debugv, (void*) debugv, block_sz * sizeof(int), cudaMemcpyHostToDevice);

// Launch the kernel (1 block of block_sz threads, no dynamic shared memory)
// within a stream and copy back the result on the same stream

        cudaStream_t stream;

        cudaStreamCreate(&stream);

        mykernel<<<1, block_sz, 0, stream>>>(_debugv);

        cudaMemcpyAsync((void*) debugv, (void*) _debugv, block_sz * sizeof(int),  cudaMemcpyDeviceToHost, stream);

// Sync the host thread with the stream before reading debugv on the host

        cudaStreamSynchronize(stream);

// Print the result as a bracketed, comma-separated list on one line

        int i;

        for (i = 0; i < block_sz; i++) {

                // Prefix "Debug: [" before the first element; follow each element with ","
                // except the last, which is followed by "]\n".
                printf("%s%d%s", (i == 0 ? "Debug: [": ""), debugv[i], (i == block_sz - 1 ? "]\n" : ","));

        }

// Only the stream is released; debugv and _debugv are never freed.
cudaStreamDestroy(stream);

        return 0;

}

I was expecting the debug vector to output a series of ‘1’ digits but I’m getting all 0’s. Also, the problem goes away if I remove streams as below:

#include<stdio.h>

#include<cuda.h>

#include<cuda_runtime.h>

#include<driver_types.h>

// Kernel: each thread writes 1 into the element of _debugv whose index equals
// the thread's threadIdx.x. There is no bounds check, so the buffer must hold
// at least as many ints as there are threads along x in a block.
__global__ void mykernel(int* _debugv) {
        const unsigned int slot = threadIdx.x;
        int* const target = _debugv + slot;
        *target = 1;
}

// Reports a failed CUDA runtime call on stderr.
//   status: return code of the call being checked
//   what:   short label naming the call, used in the message
// Returns 1 if status is an error and 0 if it is cudaSuccess.
static int cuda_failed(cudaError_t status, const char* what) {
        if (status == cudaSuccess) {
                return 0;
        }
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        return 1;
}

// Stream-less variant of the demo: zero-fills a 32-int host vector, mirrors it
// on the device, runs mykernel on the default stream, copies the result back
// with a blocking cudaMemcpy and prints it as "Debug: [v0,v1,...,v31]".
// Returns 0 on success and 1 if the host allocation or any CUDA call fails.
int main() {
        const int block_sz = 32;
        const size_t bytes = block_sz * sizeof(int);

        // All locals are declared before the first goto so no jump skips an
        // initialization.
        int* debugv = NULL;
        int* _debugv = NULL;
        int exit_code = 1;
        int i;

        // Host vector, zero-initialized by calloc.
        debugv = (int*) calloc(block_sz, sizeof(int));
        if (debugv == NULL) {
                fprintf(stderr, "calloc failed\n");
                return 1;
        }

        // Device vector, seeded with the all-zero host contents.
        if (cuda_failed(cudaMalloc((void**) &_debugv, bytes), "cudaMalloc")) {
                goto cleanup;
        }
        if (cuda_failed(cudaMemcpy((void*) _debugv, (void*) debugv, bytes, cudaMemcpyHostToDevice),
                        "cudaMemcpy host->device")) {
                goto cleanup;
        }

        // One block of block_sz threads; a launch does not return a status, so
        // configuration errors are picked up through cudaGetLastError().
        mykernel<<<1, block_sz>>>(_debugv);
        if (cuda_failed(cudaGetLastError(), "mykernel launch")) {
                goto cleanup;
        }

        // Blocking copy back to the host; errors raised while the kernel ran are
        // reported by this call.
        if (cuda_failed(cudaMemcpy((void*) debugv, (void*) _debugv, bytes, cudaMemcpyDeviceToHost),
                        "cudaMemcpy device->host")) {
                goto cleanup;
        }

        for (i = 0; i < block_sz; i++) {
                printf("%s%d%s", (i == 0 ? "Debug: [": ""), debugv[i], (i == block_sz - 1 ? "]\n" : ","));
        }

        exit_code = 0;

cleanup:
        // cudaFree and free both accept a null pointer, so this is safe on every path.
        cudaFree(_debugv);
        free(debugv);
        return exit_code;
}

Streams are vital for my actual application because I need to perform several kernel launches on the same stream (sequentially). Can someone spot what I’m doing wrong?

Thanks a bunch!

Asiri_Rathnayake · May 21, 2011, 10:12pm

OK, I think I asked the question too soon.

I had forgotten that cudaMemcpyAsync() must always be given page-locked (pinned) host memory, i.e. memory allocated with cudaMallocHost() rather than calloc(). So the correct listing should be:

#include<stdio.h>

#include<cuda.h>

#include<cuda_runtime.h>

#include<driver_types.h>

// Kernel: thread t (t = threadIdx.x) sets _debugv[t] to 1.
// The index depends only on threadIdx.x and is not range-checked, so the
// caller must pass a buffer with at least one int per thread along x.
__global__ void mykernel(int* _debugv) {
        const unsigned int tid = threadIdx.x;
        _debugv[tid] = 1;
}

// Reports a failed CUDA runtime call on stderr.
//   status: return code of the call being checked
//   what:   short label naming the call, used in the message
// Returns 1 if status is an error and 0 if it is cudaSuccess.
static int cuda_failed(cudaError_t status, const char* what) {
        if (status == cudaSuccess) {
                return 0;
        }
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        return 1;
}

// Stream-based demo using page-locked host memory: allocates a 32-int pinned
// host vector and a matching device vector, runs mykernel on a dedicated
// stream, copies the result back with cudaMemcpyAsync on that stream,
// synchronizes with the stream and prints "Debug: [v0,v1,...,v31]".
// Returns 0 on success and 1 if any CUDA call fails.
int main() {
        const int block_sz = 32;
        const size_t bytes = block_sz * sizeof(int);

        // All locals are declared before the first goto so no jump skips an
        // initialization.
        int* debugv = NULL;        // page-locked host vector
        int* _debugv = NULL;       // device vector
        cudaStream_t stream = 0;
        int have_stream = 0;       // set once cudaStreamCreate has succeeded
        int exit_code = 1;
        int i;

        // Page-locked host memory is what cudaMemcpyAsync needs for the copy below.
        if (cuda_failed(cudaMallocHost((void**) &debugv, bytes), "cudaMallocHost")) {
                goto cleanup;
        }
        // Zero the buffer explicitly so the upload below does not read
        // uninitialized memory (the calloc-based version started from zeros).
        for (i = 0; i < block_sz; i++) {
                debugv[i] = 0;
        }

        if (cuda_failed(cudaMalloc((void**) &_debugv, bytes), "cudaMalloc")) {
                goto cleanup;
        }
        if (cuda_failed(cudaMemcpy((void*) _debugv, (void*) debugv, bytes, cudaMemcpyHostToDevice),
                        "cudaMemcpy host->device")) {
                goto cleanup;
        }

        if (cuda_failed(cudaStreamCreate(&stream), "cudaStreamCreate")) {
                goto cleanup;
        }
        have_stream = 1;

        // Kernel and copy-back are issued on the same stream, so they run in order.
        // A launch does not return a status; cudaGetLastError() reports bad launches.
        mykernel<<<1, block_sz, 0, stream>>>(_debugv);
        if (cuda_failed(cudaGetLastError(), "mykernel launch")) {
                goto cleanup;
        }
        if (cuda_failed(cudaMemcpyAsync((void*) debugv, (void*) _debugv, bytes,
                                        cudaMemcpyDeviceToHost, stream),
                        "cudaMemcpyAsync device->host")) {
                goto cleanup;
        }

        // Wait for the stream before touching debugv on the host; errors from the
        // kernel or the asynchronous copy are reported here.
        if (cuda_failed(cudaStreamSynchronize(stream), "cudaStreamSynchronize")) {
                goto cleanup;
        }

        for (i = 0; i < block_sz; i++) {
                printf("%s%d%s", (i == 0 ? "Debug: [": ""), debugv[i], (i == block_sz - 1 ? "]\n" : ","));
        }

        exit_code = 0;

cleanup:
        if (have_stream) {
                cudaStreamDestroy(stream);
        }
        cudaFree(_debugv);         // a null pointer is accepted and ignored
        if (debugv != NULL) {
                cudaFreeHost(debugv);
        }
        return exit_code;
}

Sorry about the mess.

Topic		Replies	Views
Why some synchronize function make cudaMemcpyAsync and kernal in different stream work in sequential CUDA Programming and Performance	2	6610	March 1, 2011
Synchronization problem CUDA Programming and Performance	0	884	December 3, 2012
cudaMemcpyAsync CUDA Programming and Performance	10	22066	October 16, 2015
Unable to run kernel on device 1 with memory in device 2 CUDA Programming and Performance	10	1055	January 24, 2017
Stream synchronization problem didn't synchronize but returned no error CUDA Programming and Performance	0	2805	July 14, 2008
cudaDeviceSynchronize needed between kernel launch and cudaMemcpy ? CUDA Programming and Performance	15	16615	September 29, 2017
stream synchronize problem CUDA Programming and Performance	2	791	August 28, 2017
Questions on Streams CUDA Programming and Performance	5	2216	July 16, 2008
cudaMemcpyAsync with cudaMemcpyHostToDevice does not implicitly synchronize with stream CUDA Programming and Performance	2	107	July 2, 2025
is there need a streamsynchronize() between kernels and CULA function when use cuda stream? CUDA Programming and Performance	1	476	October 2, 2017

Unable to synchronize with a specific stream

Related topics