Cuda streams

Sorry for this silly question about my code.

It is a very simple program for studying streams, but I still don't

understand what is wrong.

I get the famous error: Cuda error in file ‘mysimpleStreams.cu’ in line 52 : invalid argument.

Any help would be appreciated.

The program works well without the stream processing.

#include<cuda.h>

#include<cutil.h>

#include<stdio.h>

// Multiplies the first `size` elements of `array` in place by the integer N.
//
// Each thread handles at most one element. Its global index is built from
// blockIdx.x, blockDim.x and threadIdx.x; threads whose index is not below
// `size` return without touching memory, so a launch may contain more
// threads than there are elements.
//
//   array - pointer to the first element to scale
//   size  - number of elements to scale
//   N     - integer factor each element is multiplied by
__global__ void multiplyValue(float *array, int size,int N)
{
	const int idx = blockIdx.x * blockDim.x + threadIdx.x;

	// Guard: surplus threads beyond the end of the chunk do nothing.
	if (idx >= size)
		return;

	array[idx] *= N;
}

// Splits an N-element array into `stream` equal chunks and processes each
// chunk in its own CUDA stream: host-to-device copy, multiplyValue kernel,
// device-to-host copy. Chunk i is multiplied by i, and the resulting array
// is printed on one line.
//
// Returns 0 on success, 1 if the stream handle array cannot be allocated.
int main()
{
	int N = 32;             // total number of elements
	int stream = 4;         // number of CUDA streams / chunks
	int size = N / stream;  // elements per chunk; N is assumed divisible by stream

	// host variables
	float *h_a, *h_b;

	// device variable
	float *d_a;

	// stream handles
	cudaStream_t *streams = (cudaStream_t*)malloc(stream * sizeof(cudaStream_t));
	if (streams == NULL) {
		fprintf(stderr, "failed to allocate stream handles\n");
		return 1;
	}

	// Host buffers used with cudaMemcpyAsync must be page-locked. Allocating
	// them with plain malloc is what produced the "invalid argument" error
	// reported for this program, so they are allocated with cudaMallocHost.
	CUDA_SAFE_CALL(cudaMallocHost((void**)&h_a, N * sizeof(float)));
	CUDA_SAFE_CALL(cudaMallocHost((void**)&h_b, N * sizeof(float)));

	CUDA_SAFE_CALL(cudaMalloc((void**)&d_a, N * sizeof(float)));

	// host array initialization
	for (int i = 0; i < N; i++) {
		h_a[i] = 1.0f;
	}

	// streams creation
	for (int i = 0; i < stream; i++) {
		CUDA_SAFE_CALL(cudaStreamCreate(&(streams[i])));
	}

	// Launch configuration: each launch covers one chunk of `size` elements,
	// so the grid is the ceiling of size / blockSize.x. The kernel's bounds
	// check makes the surplus threads in the block harmless.
	dim3 blockSize(N, 1);
	dim3 gridSize((size + blockSize.x - 1) / blockSize.x, 1);
	printf("grid %d block %d\n", gridSize.x, blockSize.x);

	// start streams operations: all uploads, then all kernels, then all
	// downloads; work queued in the same stream executes in issue order.
	for (int i = 0; i < stream; i++) {
		CUDA_SAFE_CALL(cudaMemcpyAsync(d_a + i * size, h_a + i * size,
		                               size * sizeof(float),
		                               cudaMemcpyHostToDevice, streams[i]));
	}

	for (int i = 0; i < stream; i++) {
		multiplyValue<<<gridSize, blockSize, 0, streams[i]>>>(d_a + i * size, size, i);
		// A kernel launch returns nothing; a bad configuration is only
		// visible through cudaGetLastError.
		CUDA_SAFE_CALL(cudaGetLastError());
	}

	for (int i = 0; i < stream; i++) {
		CUDA_SAFE_CALL(cudaMemcpyAsync(h_b + i * size, d_a + i * size,
		                               size * sizeof(float),
		                               cudaMemcpyDeviceToHost, streams[i]));
	}

	// Wait for every stream to drain before reading h_b on the host; this
	// also surfaces any error raised while the queued work was executing.
	for (int i = 0; i < stream; i++) {
		CUDA_SAFE_CALL(cudaStreamSynchronize(streams[i]));
	}

	for (int i = 0; i < N; i++) {
		printf("%2.1lf ", h_b[i]);
	}
	printf("\n");

	// release streams, device memory and pinned host memory
	for (int i = 0; i < stream; i++) {
		CUDA_SAFE_CALL(cudaStreamDestroy(streams[i]));
	}
	free(streams);

	CUDA_SAFE_CALL(cudaFree(d_a));
	CUDA_SAFE_CALL(cudaFreeHost(h_a));
	CUDA_SAFE_CALL(cudaFreeHost(h_b));

	return 0;
}

Thanks to this post (My link), I corrected the error.

Actually, I just changed the way the host memory is allocated

by using cudaMallocHost.