Hi everyone!

I’m trying to use cuda streams in my matrix vector multiplication program, but I have some problems… Version without streams working very well . I suspect the problem is the memory allocation, because the vector of solutions is empty. Any ideas?

I’m working with mvs2010, cuda 3.2, geforce 560gtx

```
#include <iostream>
#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <time.h>
#define BLOCK_SIZE 2
#define DATA_SIZE n * n
using namespace std;
__global__ void matixVectorMultiplicationINT(int n, int *A_G, int *x_G, int *y_G)
{
int i = blockIdx.x * BLOCK_SIZE + threadIdx.x;
int t = 0;
for(int j = 0; j < n; j++)
{
t += A_G[i + j*n] * x_G[j];
}
y_G[i] = t;
}
void dataReset( int n, int *vec)
{
for(int i = 0; i < n; i++)
{
vec[i] = 0;
}
}
void showVector( int n, int *vec)
{
for(int i = 0; i < n; i++)
{
cout << vec[i] << endl;
}
}
float randomNumber()
{
return rand();
}
void generateRandomVector( int n, int *vec)
{
for(int i = 0; i < n; i++)
{
vec[i] = i+1;
}
}
void generateRandomMatrix(int n, int** mat)
{
int temp = 1;
for (int i = 0; i < n; i++)
{
for (int j = 0; j < n; j++)
{
mat[i][j] = temp;
temp++;
}
}
}
void showMatrix(int n, int** mat)
{
for (int i = 0; i < n; i++)
{
for (int j=0; j<n; j++)
{
cout << mat[i][j] << "\t";
}
cout<<endl;
}
}
void linearizeMatrix(int n, int ** mat, int linMatrix[])
{
int tempId = 0;
for(int i = 0; i < n; i++)
{
for(int j = 0; j < n; j++)
{
linMatrix[tempId] = mat[j][i];
tempId++;
}
}
}
int main()
{
int n = 8; //dimension of matrix
dim3 threads(BLOCK_SIZE);
dim3 grid(n/BLOCK_SIZE);
srand((unsigned)time(0));
cudaDeviceProp prop;
int whichDevice;
cudaGetDevice(&whichDevice);
cudaGetDeviceProperties(&prop, whichDevice);
if(!prop.deviceOverlap)
{
cout << "Urzadzenie nie wspiera cudaStreams!";
return 0;
}
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaEvent_t start, stop;
float elapsedTime;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int ** matrix;
int *hostMatrix;
int *hostVector;
int *hostSolutionVector;
int *deviceMatrix;
int *deviceVector;
int *deviceSolutionVector;
//allcate memory for host data
matrix = new int *[n];
for(int i = 0; i < n; i++)
{
matrix[i] = new int[n];
}
hostVector = new int[n];
hostSolutionVector = new int[n];
generateRandomVector(n, hostVector);
dataReset(n, hostSolutionVector);
generateRandomMatrix(n, matrix);
hostMatrix = new int[DATA_SIZE];
linearizeMatrix(n, matrix, hostMatrix);
cout << "Matrix:" << endl;
showMatrix(n, matrix);
cout << endl;
cout<< "Vector:" << endl;
showVector(n, hostVector);
cout << endl;
//alokacja pamieci
cudaMalloc((void**)&deviceMatrix, sizeof(int) * DATA_SIZE);
cudaMalloc((void**)&deviceVector, sizeof(int) * n);
cudaMalloc((void**)&deviceSolutionVector, sizeof(int) * n);
cudaHostAlloc((void**)&hostMatrix, sizeof(int) * DATA_SIZE, cudaHostAllocDefault);
cudaHostAlloc((void**)&hostVector, sizeof(int) * n, cudaHostAllocDefault);
cudaHostAlloc((void**)&hostVector, sizeof(int) * n, cudaHostAllocDefault);
cudaMemcpyAsync(deviceMatrix, hostMatrix, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(deviceVector, hostVector, sizeof(int) * n, cudaMemcpyHostToDevice, stream);
cudaEventRecord(start, 0);
matixVectorMultiplicationINT<<<grid,threads,0, stream>>>(n, deviceMatrix, deviceVector, deviceSolutionVector);
cudaEventRecord(stop, 0);
cudaStreamSynchronize(stream);
cudaMemcpyAsync(hostSolutionVector, deviceSolutionVector, sizeof(int) * n, cudaMemcpyDeviceToHost, stream);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cout << "Computation time: " << elapsedTime << " ms" << endl;
cout << "\nSolution vector:" <<endl;
showVector(n, hostSolutionVector);
//bye bye stream
cudaStreamDestroy(stream);
//free device memory
cudaFree(deviceMatrix);
cudaFree(deviceVector);
cudaFree(deviceSolutionVector);
//free Host memory
cudaFreeHost(hostMatrix);
cudaFreeHost(hostVector);
cudaFreeHost(hostSolutionVector);
delete [] matrix;
cout << "\npress Enter..." << endl;
getchar();
return 0;
}
```

Regards

Hubert