matrix multiplication with stream

I tried to do matrix multiplication using cuda stream.
But, when I command nvprof, some errors occur and I do not know what that means.
Can anybody help me?
This is my code and screenshot of error.
include “CodeSamples/common/common.h”
include <cuda_runtime.h>
include <stdio.h>
include <cuda_profiler_api.h>

define NSTREAM 4

global void MatMul(int *A, int *B, int *Result)
{
int sum = 0;

for(int i=0; i<1024; i++){
    sum += A[1024*threadIdx.x + i] * B[i*1024 + blockIdx.x];
    
}
__syncthreads();
Result[threadIdx.x*1024+blockIdx.x] = sum; 

}
int main(int argc, char **argv)
{

int nx = 1024;
int ny = 1024;
int n_streams = NSTREAM;
int nElem = 1024 * 1024;
int iElem = 1024 * 1024 / NSTREAM;
size_t nBytes = nElem * sizeof(int);
size_t iBytes = iElem * sizeof(int);

/* Set maximum connections. */
char *iname = "CUDA_DEVICE_MAX_CONNECTIONS";
setenv(iname, "4", 1);

int dev = 0;
cudaDeviceProp deviceProp;
CHECK(cudaGetDeviceProperties(&deviceProp, dev));
printf("> Using Device %d: %s with num_streams %d\n", dev, deviceProp.name,
       n_streams);
CHECK(cudaSetDevice(dev));

/* Malloc pinned host memory for async memcpy. */
int *h_A, *h_B, *hostRef, *gpuRef;
cudaHostAlloc((int**)&h_A,nBytes, cudaHostAllocDefault);
cudaHostAlloc((int**)&h_B,nBytes, cudaHostAllocDefault);
cudaHostAlloc((int**)&gpuRef,nBytes, cudaHostAllocDefault);
cudaHostAlloc((int**)&hostRef,nBytes, cudaHostAllocDefault);

/* Initialize the input data. */
for( int i = 0; i < nElem; i++){
    h_A[i] = i %32;
    h_B[i] = i %32; 
}
memset(hostRef, 0, nBytes);
memset(gpuRef, 0, nBytes);

/* Malloc device global memory. */
int *d_A, *d_B, *d_C;
cudaMalloc((int **)&d_A, nBytes);
cudaMalloc((int **)&d_B, nBytes);
cudaMalloc((int **)&d_C, nBytes);

/* Create a set of non-null streams. */
cudaStream_t *streams = (cudaStream_t *)malloc(n_streams*sizeof(cudaStream_t));
for (int i=0; i<n_streams; i++){
    cudaStreamCreate(&streams[i]);
}

/* Transfer the input data. */
for(int i=0; i<n_streams; i++){
    int offset = i*iBytes;
    cudaMemcpyAsync(&d_A[offset],&h_A[offset], iBytes,
                    cudaMemcpyHostToDevice, streams[i]);
    cudaMemcpyAsync(d_B,h_B, nBytes,
                    cudaMemcpyHostToDevice, streams[i]);            
}
dim3 grid(1024);
dim3 block(256);
printf("> grid (%d, %d) block (%d, %d)\n", grid.x, grid.y, block.x,
block.y);

for(int i = 0; i<n_streams; i++){
    int offset = i*iBytes;
    MatMul<<<grid, block, 0, streams[i]>>>(&d_A[offset],d_B,&d_C[offset]);
}

for (int i = 0; i<n_streams; i++){
    int offset = i*iBytes;
    cudaMemcpyAsync(&gpuRef[offset], &d_C[offset], iBytes,
                    cudaMemcpyDeviceToHost, streams[i]);
}

cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);

cudaFreeHost(h_A);
cudaFreeHost(h_B);
cudaFreeHost(hostRef);
cudaFreeHost(gpuRef);
cudaProfilerStop();

for (int i = 0; i < n_streams; ++i)
{
    cudaStreamDestroy(streams[i]);
}

cudaDeviceReset();
return 0;

}

==20697== NVPROF is profiling process 20697, command: ./new

Using Device 0: TITAN Xp with num_streams 4
grid (1024, 1) block (256, 1)
==20697== Profiling application: ./new
==20697== Warning: Found 4 invalid records in the result.
==20697== Warning: This can happen if device ran out of memory or if a device kernel was stopped due to an assertion.