I tried to do matrix multiplication using cuda stream.
But, when I command nvprof, some errors occur and I do not know what that means.
Can anybody help me?
This is my code and screenshot of error.
include “CodeSamples/common/common.h”
include <cuda_runtime.h>
include <stdio.h>
include <cuda_profiler_api.h>
define NSTREAM 4
global void MatMul(int *A, int *B, int *Result)
{
int sum = 0;
for(int i=0; i<1024; i++){
sum += A[1024*threadIdx.x + i] * B[i*1024 + blockIdx.x];
}
__syncthreads();
Result[threadIdx.x*1024+blockIdx.x] = sum;
}
int main(int argc, char **argv)
{
int nx = 1024;
int ny = 1024;
int n_streams = NSTREAM;
int nElem = 1024 * 1024;
int iElem = 1024 * 1024 / NSTREAM;
size_t nBytes = nElem * sizeof(int);
size_t iBytes = iElem * sizeof(int);
/* Set maximum connections. */
char *iname = "CUDA_DEVICE_MAX_CONNECTIONS";
setenv(iname, "4", 1);
int dev = 0;
cudaDeviceProp deviceProp;
CHECK(cudaGetDeviceProperties(&deviceProp, dev));
printf("> Using Device %d: %s with num_streams %d\n", dev, deviceProp.name,
n_streams);
CHECK(cudaSetDevice(dev));
/* Malloc pinned host memory for async memcpy. */
int *h_A, *h_B, *hostRef, *gpuRef;
cudaHostAlloc((int**)&h_A,nBytes, cudaHostAllocDefault);
cudaHostAlloc((int**)&h_B,nBytes, cudaHostAllocDefault);
cudaHostAlloc((int**)&gpuRef,nBytes, cudaHostAllocDefault);
cudaHostAlloc((int**)&hostRef,nBytes, cudaHostAllocDefault);
/* Initialize the input data. */
for( int i = 0; i < nElem; i++){
h_A[i] = i %32;
h_B[i] = i %32;
}
memset(hostRef, 0, nBytes);
memset(gpuRef, 0, nBytes);
/* Malloc device global memory. */
int *d_A, *d_B, *d_C;
cudaMalloc((int **)&d_A, nBytes);
cudaMalloc((int **)&d_B, nBytes);
cudaMalloc((int **)&d_C, nBytes);
/* Create a set of non-null streams. */
cudaStream_t *streams = (cudaStream_t *)malloc(n_streams*sizeof(cudaStream_t));
for (int i=0; i<n_streams; i++){
cudaStreamCreate(&streams[i]);
}
/* Transfer the input data. */
for(int i=0; i<n_streams; i++){
int offset = i*iBytes;
cudaMemcpyAsync(&d_A[offset],&h_A[offset], iBytes,
cudaMemcpyHostToDevice, streams[i]);
cudaMemcpyAsync(d_B,h_B, nBytes,
cudaMemcpyHostToDevice, streams[i]);
}
dim3 grid(1024);
dim3 block(256);
printf("> grid (%d, %d) block (%d, %d)\n", grid.x, grid.y, block.x,
block.y);
for(int i = 0; i<n_streams; i++){
int offset = i*iBytes;
MatMul<<<grid, block, 0, streams[i]>>>(&d_A[offset],d_B,&d_C[offset]);
}
for (int i = 0; i<n_streams; i++){
int offset = i*iBytes;
cudaMemcpyAsync(&gpuRef[offset], &d_C[offset], iBytes,
cudaMemcpyDeviceToHost, streams[i]);
}
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cudaFreeHost(h_A);
cudaFreeHost(h_B);
cudaFreeHost(hostRef);
cudaFreeHost(gpuRef);
cudaProfilerStop();
for (int i = 0; i < n_streams; ++i)
{
cudaStreamDestroy(streams[i]);
}
cudaDeviceReset();
return 0;
}
==20697== NVPROF is profiling process 20697, command: ./new
Using Device 0: TITAN Xp with num_streams 4
grid (1024, 1) block (256, 1)
==20697== Profiling application: ./new
==20697== Warning: Found 4 invalid records in the result.
==20697== Warning: This can happen if device ran out of memory or if a device kernel was stopped due to an assertion.