I have tried to profile my application with Hyper-Q in mind, and I have also looked at the simpleHyperQ example in the CUDA samples, but I am not sure whether my understanding of this technology is correct. When I analysed my application in the profiler, I found no concurrent kernel execution.
The following is my test code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "helper_cuda.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
void init(float *p, int m, int n)
{
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            p[i * n + j] = i * n + j;
        }
    }
}
void printArray(float *p, int m, int n)
{
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            printf("%f\t", p[i * n + j]);
        }
        printf("\n");
    }
}
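// elementwise add ('+') or subtract ('-') of two width x height matrices, result stored in dev_Xl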
__global__ void addSub_kernel(float* dev_Xl, float* dev_Xr, int width, int height, char opera)
{
    unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
    if ((xIndex < width) && (yIndex < height))
    {
        unsigned int index_in = yIndex * width + xIndex;
        if (opera == '+')
        {
            dev_Xl[index_in] += dev_Xr[index_in];
        }
        else if (opera == '-')
        {
            dev_Xl[index_in] = dev_Xl[index_in] - dev_Xr[index_in];
        }
    }
}
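// row mean reduction: each block sums the whole of row blockIdx.y with a strided loop,
// then thread 0 writes the mean to out[blockIdx.y]; all 8 blocks along x compute the same value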
__global__ void sumReduction_kernel(float* out, float* in, int m, int n)
{
    extern __shared__ float temp1[];
    float sum = 0.0;
    if (blockIdx.x < 8) {
        for (int i = threadIdx.x; i < n; i += blockDim.x)
        {
            int index = blockIdx.y * n + i;
            sum += in[index];
        }
        temp1[threadIdx.x] = sum;
        __syncthreads();
    }
    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
    {
        if (threadIdx.x < offset) {
            temp1[threadIdx.x] += temp1[threadIdx.x + offset];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0)
    {
        out[blockIdx.y] = temp1[0] / n;
    }
}
int main()
{
    const int T = 6;
    int m = 2000;
    int n = 4096;
    int mem_size = sizeof(float) * m * n;
    float *h_A = (float*)malloc(mem_size);
    float *h_B = (float*)malloc(mem_size);
    init(h_A, m, n);
    init(h_B, m, n);
    float *d_A, *d_B;
    checkCudaErrors(cudaMalloc((void**)&d_A, mem_size));
    checkCudaErrors(cudaMalloc((void**)&d_B, mem_size));
    checkCudaErrors(cudaMemcpy(d_A, h_A, mem_size, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_B, h_B, mem_size, cudaMemcpyHostToDevice));
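    // 8 x 2000 = 16000 blocks of 512 threads for every kernel launch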
    dim3 grid(8, 2000);
    dim3 block(512, 1);
    int sharedSize = block.x * sizeof(float);
    float *d_C;
    int mem_size_C = sizeof(float) * m;
    checkCudaErrors(cudaMalloc((void**)&d_C, mem_size_C));
    float totalTime = 0.0;
    cudaStream_t *streams = (cudaStream_t *)malloc(sizeof(cudaStream_t) * T);
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    for (int i = 0; i < T; i++)
    {
        cudaStreamCreate(&streams[i]);
    }
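    // launch both kernels into stream i, timing each iteration with events
    // recorded on the default stream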
    for (int i = 0; i < T; i++)
    {
        cudaEventRecord(start, 0);
        //addSub_kernel<<<grid, block>>>(d_A, d_B, n, m, '+');
        addSub_kernel<<<grid, block, 0, streams[i]>>>(d_A, d_B, n, m, '+');
        //sumReduction_kernel<<<grid, block, sharedSize>>>(d_C, d_A, m, n);
        sumReduction_kernel<<<grid, block, sharedSize, streams[i]>>>(d_C, d_A, m, n);
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        float elapsedTime;
        cudaEventElapsedTime(&elapsedTime, start, stop);
        totalTime += elapsedTime;
        printf("iteration %d took %.3f ms\n", i, elapsedTime);
        float *h_C = (float*)malloc(mem_size_C);
        checkCudaErrors(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, streams[i]));
        free(h_C);
    }
    for (int i = 0; i < T; i++)
    {
        cudaStreamDestroy(streams[i]);
    }
    free(streams);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    printf("all computations are done!\n");
    printf("total time = %.3f ms\n", totalTime);
    getchar();
    return 0;
}
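For comparison, this is the kind of launch pattern I understood should show overlap on the profiler timeline, roughly like what simpleHyperQ does (a minimal sketch I put together, not the actual sample code; the spin_kernel body, the iteration count, and the buffer sizes are just placeholders):

#include "cuda_runtime.h"
#include <stdio.h>

// Placeholder kernel: burns time in registers so each launch runs long enough
// for overlap between streams to be visible in the profiler.
__global__ void spin_kernel(float *out)
{
    float x = (float)threadIdx.x;
    for (int i = 0; i < 1000000; i++)
        x = sinf(x) * 0.5f + 1.0f;
    out[threadIdx.x] = x;
}

int main()
{
    const int T = 6;                              // same stream count as above
    cudaStream_t streams[T];
    float *d_buf;
    cudaMalloc((void**)&d_buf, T * 32 * sizeof(float));
    for (int i = 0; i < T; i++)
        cudaStreamCreate(&streams[i]);

    // one small kernel per stream, with no events or default-stream calls
    // between the launches, so nothing should force the streams to serialize
    for (int i = 0; i < T; i++)
        spin_kernel<<<1, 32, 0, streams[i]>>>(d_buf + i * 32);
    cudaDeviceSynchronize();

    for (int i = 0; i < T; i++)
        cudaStreamDestroy(streams[i]);
    cudaFree(d_buf);
    return 0;
}

With a pattern like that I would expect the six launches to overlap. Is my code above serializing the streams somehow, or is my understanding of Hyper-Q wrong?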
Any suggestions are appreciated.