Concurrent kernel execution on Fermi ?

Hi all,

I’m trying to use ‘concurrent kernel execution’ feature on Fermi architecture.

I expect that the following code finishes normally on Fermi architecture and deadlocks on on previous architecture.

However, it deadlocks on our C2050 GPU.

Does my code have any problem or did I misunderstand anything?

CUDA Driver Version: 3.10

CUDA Runtime Version: 3.10

CentOS release 5.4

#include <sys/time.h>

#include <iostream>

#include <iomanip>

#include <cassert>

#define CUDA_SAFE_CALL(stmt) assert( (stmt) == cudaSuccess )

#define CUDA_ERR_CHECK do {											 \

		cudaError_t err = cudaGetLastError();						   \

		if( err != cudaSuccess ) {									  \

			cerr << "CUDA API error:"								   \

				 << cudaGetErrorString(err) << endl;					\

			exit(1);													\

		}															   \

	} while(0)

using namespace std;

void check_capability() {

	struct cudaDeviceProp prop;

	CUDA_SAFE_CALL( cudaGetDeviceProperties(&prop, 0) );

	cout << "Device Capability on concurrentKernels: "

		 << (prop.concurrentKernels ? "true" : "false") << endl;

}

/**

 * returns when mem[0] becomes 1.

 */

__global__ void kernel1(int* mem) {

	volatile int var=0; // to prevent the loop to be removed by compiler's optimization

	while(1) {

		if (mem[0] > 0) {

			return;

		}

		var++;

	}

}

/**

 * set mem[0] to 1.

 */

__global__ void kernel2(int* mem) {

	mem[0] = 1;

	return;

}

int main() {

	check_capability();

	int *dmem = NULL;

	int hmem[1] = {0};

	dim3 grid(1);

	dim3 block(1);

	CUDA_SAFE_CALL( cudaMalloc( (void**)&dmem, sizeof(int)) );

	CUDA_SAFE_CALL( cudaMemcpy( dmem, hmem, sizeof(int), cudaMemcpyHostToDevice) );

	cudaStream_t strm1;

	cudaStream_t strm2;

	cudaStreamCreate(&strm1);

	cudaStreamCreate(&strm2);

	// Execute kernel1 first, and kernel2 second.

	// If the threads execute concurrently, kernel1 must finish.

	// Otherwise, they deadlock.

	kernel1<<<grid, block, 0, strm1>>>(dmem);

	kernel2<<<grid, block, 0, strm2>>>(dmem);

	cudaThreadSynchronize();

	CUDA_ERR_CHECK;

	cout << "All Threads have ended." << endl;

	CUDA_SAFE_CALL( cudaFree(dmem) );

	CUDA_SAFE_CALL( cudaStreamDestroy(strm1) );

	CUDA_SAFE_CALL( cudaStreamDestroy(strm2) );

	return 0;

}

Any advice would help.

Keisuke

Hi all,

I’m trying to use ‘concurrent kernel execution’ feature on Fermi architecture.

I expect that the following code finishes normally on Fermi architecture and deadlocks on on previous architecture.

However, it deadlocks on our C2050 GPU.

Does my code have any problem or did I misunderstand anything?

CUDA Driver Version: 3.10

CUDA Runtime Version: 3.10

CentOS release 5.4

#include <sys/time.h>

#include <iostream>

#include <iomanip>

#include <cassert>

#define CUDA_SAFE_CALL(stmt) assert( (stmt) == cudaSuccess )

#define CUDA_ERR_CHECK do {											 \

		cudaError_t err = cudaGetLastError();						   \

		if( err != cudaSuccess ) {									  \

			cerr << "CUDA API error:"								   \

				 << cudaGetErrorString(err) << endl;					\

			exit(1);													\

		}															   \

	} while(0)

using namespace std;

void check_capability() {

	struct cudaDeviceProp prop;

	CUDA_SAFE_CALL( cudaGetDeviceProperties(&prop, 0) );

	cout << "Device Capability on concurrentKernels: "

		 << (prop.concurrentKernels ? "true" : "false") << endl;

}

/**

 * returns when mem[0] becomes 1.

 */

__global__ void kernel1(int* mem) {

	volatile int var=0; // to prevent the loop to be removed by compiler's optimization

	while(1) {

		if (mem[0] > 0) {

			return;

		}

		var++;

	}

}

/**

 * set mem[0] to 1.

 */

__global__ void kernel2(int* mem) {

	mem[0] = 1;

	return;

}

int main() {

	check_capability();

	int *dmem = NULL;

	int hmem[1] = {0};

	dim3 grid(1);

	dim3 block(1);

	CUDA_SAFE_CALL( cudaMalloc( (void**)&dmem, sizeof(int)) );

	CUDA_SAFE_CALL( cudaMemcpy( dmem, hmem, sizeof(int), cudaMemcpyHostToDevice) );

	cudaStream_t strm1;

	cudaStream_t strm2;

	cudaStreamCreate(&strm1);

	cudaStreamCreate(&strm2);

	// Execute kernel1 first, and kernel2 second.

	// If the threads execute concurrently, kernel1 must finish.

	// Otherwise, they deadlock.

	kernel1<<<grid, block, 0, strm1>>>(dmem);

	kernel2<<<grid, block, 0, strm2>>>(dmem);

	cudaThreadSynchronize();

	CUDA_ERR_CHECK;

	cout << "All Threads have ended." << endl;

	CUDA_SAFE_CALL( cudaFree(dmem) );

	CUDA_SAFE_CALL( cudaStreamDestroy(strm1) );

	CUDA_SAFE_CALL( cudaStreamDestroy(strm2) );

	return 0;

}

Any advice would help.

Keisuke

Declare mem as volatile in kernel1 and insert a __threadfence() in kernel2.

Alternatively, use atomic operations.

Declare mem as volatile in kernel1 and insert a __threadfence() in kernel2.

Alternatively, use atomic operations.

tera,

Thank you very much for your advice.

I modified the code and it exited successfully.

tera,

Thank you very much for your advice.

I modified the code and it exited successfully.