A problem with Hyper-Q

I have read the SDK sample “simpleHyperQ”. After that I wrote a Hyper-Q program for image processing, but when I checked the result in the Nsight debugger I found that the streams ran serially, not in parallel. The code is similar to the simpleHyperQ sample in the SDK, so I can't figure out the reason and hope someone can spot the error in my program.
My code is posted below.
The attachment is a snapshot from the Nsight debugger.
Thanks for your reply.

#include<cuda.h>
#include<stdio.h>
#include<stdlib.h>
#include<cv.h>
#include<highgui.h>
#include<cxcore.h>
#include<math_functions.h>
#include<cuda_runtime.h>
#include<time.h>

#include<iostream>


const char *filename[16] = {"test1.bmp","test2.bmp","test3.bmp","test4.bmp","test5.bmp","test6.bmp","test7.bmp","test8.bmp","test9.bmp","test10.bmp","test11.bmp","test12.bmp","test13.bmp","test14.bmp","test15.bmp","test16.bmp"};
const int nstreams = 16;
const int iteration = 20;   // currently unused in the code below
void checkCUDAError(const char* msg);

__global__ void edge_gpu(unsigned char* buff, unsigned char* buffer_out, int w, int h)
{
	int x = blockIdx.x * blockDim.x + threadIdx.x;
	int y = blockIdx.y * blockDim.y + threadIdx.y;
	int width = w, height = h;

	// Skip the one-pixel border so the 3x3 neighbourhood never reads out of bounds.
	if ((x >= 1 && x < width - 1) && (y >= 1 && y < height - 1))
	{
		// Horizontal Sobel gradient.
		int hx = -buff[width*(y-1) + (x-1)] + buff[width*(y-1) + (x+1)]
			 - 2*buff[width*(y) + (x-1)] + 2*buff[width*(y) + (x+1)]
			 - buff[width*(y+1) + (x-1)] + buff[width*(y+1) + (x+1)];

		// Vertical Sobel gradient (the centre of each row is weighted by 2).
		int vx = buff[width*(y-1) + (x-1)] + 2*buff[width*(y-1) + (x)] + buff[width*(y-1) + (x+1)]
			 - buff[width*(y+1) + (x-1)] - 2*buff[width*(y+1) + (x)] - buff[width*(y+1) + (x+1)];

		int val = (int)sqrtf((float)hx * (float)hx + (float)vx * (float)vx);

		buffer_out[y * width + x] = (unsigned char) val;
	}
}


void checkCUDAError(const char* msg) 
{
	cudaError_t err = cudaGetLastError();
  	if (cudaSuccess != err) 
  	{
    		fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
    		exit(EXIT_FAILURE);
  	}
}



int main(int argc , char** argv)
{
	int nWidth = 2448;
	int nHeight = 2048;
	IplImage *temp[nstreams];
	for(int i = 0;i < nstreams;i++)
		temp[i] = cvLoadImage(filename[i],0);

	int buffer_size = nWidth * nHeight;

	unsigned char *buffer[nstreams];
	unsigned char* buf[nstreams] ;
	unsigned char *buffer_dev[nstreams];
	unsigned char *buffer_out[nstreams];

	for(int i = 0; i < nstreams; i++)
		buffer[i] = (unsigned char*)(temp[i]->imageData);

	for(int i = 0; i < nstreams; i++){
		//buf[i] = (unsigned char*) malloc(buffer_size);
		cudaMallocHost((void**)&buf[i], buffer_size);
		checkCUDAError("Memory Allocation");
	}

	for(int i = 0; i < nstreams; i++){
		cudaMalloc((void**)&buffer_out[i], buffer_size);
		checkCUDAError("Memory Allocation");
	}

	for(int i = 0; i < nstreams; i++){
		cudaMalloc((void**)&buffer_dev[i], buffer_size);
		checkCUDAError("Memory Allocation");
	}




//==========  create a stream  ==========

	cudaStream_t *streams = (cudaStream_t*)malloc(nstreams * sizeof(cudaStream_t));
	for(int i = 0;i < nstreams;i++)
  	cudaStreamCreate(&(streams[i]));
//=========================================	

	// Note: cudaMemcpy blocks the host and runs in the default stream, so all of
	// these copies complete before the first kernel below is even launched.
	for(int i = 0; i < nstreams; i++){
		cudaMemcpy(buffer_dev[i], buffer[i], buffer_size, cudaMemcpyHostToDevice);
		std::cout << i << std::endl;
		checkCUDAError("Memory Copy From Host To Device");
	}

	dim3 threadsPerBlock(8, 8);
	dim3 numBlocks(nWidth/8, nHeight/8);   // 306 x 256 blocks for a 2448 x 2048 image

	// Launch one kernel per stream.
	for(int i = 0; i < nstreams; i++){
		edge_gpu<<< numBlocks, threadsPerBlock, 0, streams[i] >>>(buffer_dev[i], buffer_out[i], nWidth, nHeight);
		checkCUDAError("Kernel");
	}

	//for (int i = 0; i < nstreams; i ++)
	//	cudaStreamSynchronize(streams[i]);

	for(int i = 0; i < nstreams; i++){
		cudaMemcpy(buf[i], buffer_out[i], buffer_size, cudaMemcpyDeviceToHost);
		checkCUDAError("Memory Copy From Device To Host");
	}


//==========Check the result==============
	//memcpy(buffer[1],buf[1],buffer_size);//copying memory to show image
	//cvNamedWindow("0");
	//cvShowImage("0",temp[1]);
	//cvWaitKey(0);
	// cudaFree takes the device pointer itself, not its address.
	for(int i = 0; i < nstreams; i++)
		cudaFree(buffer_dev[i]);

	for(int i = 0; i < nstreams; i++)
		cudaFree(buffer_out[i]);

	for(int i = 0; i < nstreams; i++)
		cudaStreamDestroy(streams[i]);

	//for(int i = 0;i < nstreams;i++)
	//	free(&(buf[i]));
	for(int i = 0; i < nstreams; i++)
		cudaFreeHost(buf[i]);

	for(int i = 0; i < nstreams; i++)
		cvReleaseImage(&(temp[i]));
	free(streams);
	
	printf("\n\nDONE...!!...Copy Successful...!!\n\n");

	return 1;
}

What is your device?

GTX780

THUMB!!^^^^

Is the NVIDIA sample working?

Yes, the simpleHyperQ sample works well, and I can see the kernels running in parallel in the Nsight debugger!

Work from the sample and add in your code step by step until you see what breaks the parallel kernels. I suspect it might be the order in which you issue your kernels relative to your memory copies; something like the sketch below might help.
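
As a rough, untested sketch of what I mean (it keeps names like edge_gpu, nstreams and buffer_size from your code, but uses dummy data and a stand-in kernel body instead of the OpenCV images, and an arbitrary smaller image size): put the inputs in pinned host buffers and issue the H2D copy, the kernel and the D2H copy for each stream with cudaMemcpyAsync, so the host never blocks between streams.

#include <cstdio>
#include <cstring>
#include <cuda_runtime.h>

// Stand-in for your Sobel kernel; the point here is only the stream structure.
__global__ void edge_gpu(unsigned char* in, unsigned char* out, int w, int h)
{
	int x = blockIdx.x * blockDim.x + threadIdx.x;
	int y = blockIdx.y * blockDim.y + threadIdx.y;
	if (x < w && y < h)
		out[y * w + x] = in[y * w + x];
}

int main()
{
	const int nstreams = 16;
	const int nWidth = 612, nHeight = 512;              // example smaller image size (yours may differ)
	const size_t buffer_size = (size_t)nWidth * nHeight;

	unsigned char *h_in[nstreams], *h_out[nstreams];
	unsigned char *d_in[nstreams], *d_out[nstreams];
	cudaStream_t streams[nstreams];

	for (int i = 0; i < nstreams; i++) {
		cudaStreamCreate(&streams[i]);
		cudaMallocHost((void**)&h_in[i], buffer_size);  // pinned memory: needed for truly async copies
		cudaMallocHost((void**)&h_out[i], buffer_size);
		cudaMalloc((void**)&d_in[i], buffer_size);
		cudaMalloc((void**)&d_out[i], buffer_size);
		memset(h_in[i], i, buffer_size);                // dummy image data
	}

	dim3 threadsPerBlock(8, 8);
	dim3 numBlocks((nWidth + 7) / 8, (nHeight + 7) / 8);

	// Issue copy -> kernel -> copy for every stream without blocking the host,
	// so stream i+1 can start its copy while stream i is still computing.
	for (int i = 0; i < nstreams; i++) {
		cudaMemcpyAsync(d_in[i], h_in[i], buffer_size, cudaMemcpyHostToDevice, streams[i]);
		edge_gpu<<<numBlocks, threadsPerBlock, 0, streams[i]>>>(d_in[i], d_out[i], nWidth, nHeight);
		cudaMemcpyAsync(h_out[i], d_out[i], buffer_size, cudaMemcpyDeviceToHost, streams[i]);
	}
	cudaDeviceSynchronize();                            // wait for all streams before touching h_out

	for (int i = 0; i < nstreams; i++) {
		cudaFree(d_in[i]);
		cudaFree(d_out[i]);
		cudaFreeHost(h_in[i]);
		cudaFreeHost(h_out[i]);
		cudaStreamDestroy(streams[i]);
	}
	printf("done\n");
	return 0;
}

The pinned cudaMallocHost buffers matter here: with pageable host memory (for example pointers straight into IplImage::imageData), cudaMemcpyAsync is not guaranteed to overlap with anything.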

Thanks, bro!! I made the pictures smaller and used cudaMemcpyAsync; now the snapshot in the Nsight debugger shows the streams overlapping, although not completely in parallel.
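
One likely reason it still does not overlap completely: kernels from two streams can only run at the same time when the earlier kernel leaves some SMs idle. With the original 2448 x 2048 image and 8 x 8 blocks each launch has 306 x 256 = 78336 blocks, which saturates a GTX 780 on its own, so later streams mostly queue up and the overlap shows only at the start and end of each kernel. A minimal sketch for checking the relevant device properties (device 0 assumed):

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, 0);

	// Concurrent kernels need device support AND idle SMs left over
	// from the kernels that were launched earlier.
	printf("concurrentKernels  : %d\n", prop.concurrentKernels);
	printf("asyncEngineCount   : %d\n", prop.asyncEngineCount);   // copy/compute overlap engines
	printf("multiProcessorCount: %d\n", prop.multiProcessorCount);
	return 0;
}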