Sorry for this silly question about my code.
A very simple code to study the streams but still dont
understand what is wrong.
I get the famous error: Cuda error in file ‘mysimpleStreams.cu’ in line 52 : invalid argument.
Any help
The programs works well without the stream processing.
#include<cuda.h>
#include<cutil.h>
#include<stdio.h>
__global__ void multiplyValue(float *array, int size,int N)
{
int tx = threadIdx.x + blockIdx.x * blockDim.x;
if(tx < size)
array[tx] = array[tx]*N;
}
int main()
{
int N = 32;
int stream = 4;
int size = N / stream;
//host variable
float* h_a,*h_b;
//device variable
float* d_a;
//memory allocation
h_a = (float*) malloc(N * sizeof(float));
h_b = (float*) malloc(N * sizeof(float));
CUDA_SAFE_CALL(cudaMalloc((void**)&d_a,N*sizeof(float)));
//host array initialization
for(int i = 0; i < N;i++){
h_a[i] = 1.0;
}
//streams creation
cudaStream_t *streams = (cudaStream_t*)malloc(stream * sizeof(cudaStream_t));
for(int i = 0; i < stream; i++){
cudaStreamCreate(&(streams[i]));
}
//set gpu parameter
dim3 blockSize(N,1);
dim3 gridSize(N/blockSize.x,1);
printf("grid %d block %d\n",gridSize.x,blockSize.x);
cudaError_t error = cudaGetLastError();
//cudaMemcpy(d_a,h_a,N*sizeof(float),cudaMemcpyHostToDevice);
//start streams operations
for(int i = 0; i< stream;i++){
CUDA_SAFE_CALL(cudaMemcpyAsync(d_a + i*size,h_a+i*size,size*sizeof(float),
cudaMemcpyHostToDevice,streams[i]));
}
for(int i = 0; i < stream; i++){
multiplyValue<<<gridSize,blockSize,0,streams[i]>>>(d_a+i*size,size,i);
}
for(int i = 0; i < stream;i++){
CUDA_SAFE_CALL(cudaMemcpyAsync(h_b + i*size,d_a+i*size,size*sizeof(float),
cudaMemcpyDeviceToHost,streams[i]));
}
cudaThreadSynchronize();
//cudaMemcpy(h_b,d_a,N*sizeof(float),cudaMemcpyDeviceToHost);
for(int i = 0; i < N;i++){
printf("%2.1lf ",h_b[i]);
}
printf("\n");
return 0;
}