OK - a bit of background first. Essentially what I am trying to do is write a simple program which measures the length of time it takes CUDA to access a value stored in constant memory. In order to do this I have written the code below which creates the constant memory value, writes it to the device and then launches a kernel in which each thread will loop X number of times, reading the value from constant memory into a local variable.
However the problem that I’m having is that no matter how many times I loop inside the kernel, the results seem to remain the same. I have to conclude that either the loop is being unrolled by NVCC or there’s an error in my code somewhere and I haven’t caught it. I’m posting this here in the hope that someone looking at my code with a fresh pair of eyes might either spot the error or be able to tell me what is happening in the kernel.
I’ve tried varying the value of numKernelLoops from 10 to 10000000000000000 but have seen no difference in the timings. I understand that there is some caching involved in using constant memory but I can’t believe that the device would iterate through that many loops without some increase in the time taken
I’d be very grateful for any advice
My code in full:
#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>
//////////////////////////////////////
////////// GLOBAL VARIABLES //////////
//////////////////////////////////////
#define numKrytenBlocks 1
#define numKrytenThreads 512
#define numDrygalskiBlocks 16
#define numDrygalskiThreads 512
#define numKernelLoops 1000000
__constant__ int c;
////////////////////////////////
////////// PROTOTYPES //////////
////////////////////////////////
__host__ void runTest(char hostInd);
__global__ void runkernel();
int main(int argc, char *argv[])
{
//Check command line args are correct - should be 1 indicating if using Kryten or Drygalski
////////// CHECK COMMAND LINE ARGS ARE CORRECT //////////
if(argc != 2)
{
printf("\n**** ERROR - MISSING COMMAND LINE ARGUMENT ****\n\n");
printf("Usage: %s <host indicator (K or D)>\n\n",argv[0]);
exit(EXIT_FAILURE);
}
//Populate host variable with argv[1]
char *hostarg = argv[1];
char hostInd = hostarg[0];
//Call the runtest function, passing in host variable
runTest(hostInd);
return EXIT_SUCCESS;
}
__host__ void runTest(char hostInd)
{
////////// VARIABLES //////////
//The number of blocks
int blocks;
//The number of threads
int threads;
//The cuda events for timing the kernel
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
//The time taken
float elapsedTime;
////////// SET BLOCKS & THREADS VALUES //////////
if (hostInd == 'K')
{
blocks = numKrytenBlocks;
threads = numKrytenThreads;
}
if (hostInd == 'D')
{
blocks = numDrygalskiBlocks;
threads = numDrygalskiThreads;
}
////////// CREATE CONSTANT MEMORY VARIABLE //////////
int temp_const = 42;
CUDA_SAFE_CALL(cudaMemcpyToSymbol("c", &temp_const, sizeof(int), 0, cudaMemcpyHostToDevice));
////////// START TIMING //////////
CUDA_SAFE_CALL(cudaEventRecord(start, 0));
////////// LAUNCH THE KERNEL //////////
runkernel<<<blocks,threads>>>();
////////// STOP TIMING //////////
CUDA_SAFE_CALL(cudaEventRecord(stop, 0));
CUDA_SAFE_CALL(cudaEventSynchronize(stop));
CUDA_SAFE_CALL(cudaEventElapsedTime(&elapsedTime, start, stop));
printf("%.8f ms\n", elapsedTime);
}
__global__ void runkernel(void)
{
////////// VARIABLES //////////
//The loop counter
long i;
//The local variable
int local;
//A TEST VARIABLE
int test = 10;
////////// LOOP THE LOOP //////////
//Inside the loop read the value in the constant memory variable into the local variable
for (i = 0; i < numKernelLoops; i++)
{
local = c;
}
}