After making corrections to the code snippet above I’m confident the answer is, the current driver 381.22 does not recognize HMM capability in the kernel.
I’d love to be a guinea pig for a beta driver that does support HMM.
For completeness here’s the test program. It succeeds if you set HMM = 0; it fails if HMM = 1.
#include <iostream>
#include <numeric>
#include <stdlib.h>
static void CheckCudaErrorAux (const char *, unsigned, const char *, cudaError_t);
#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)
/**
* CUDA kernel that computes reciprocal values for a given vector
*/
__global__ void reciprocalKernel(float *data, unsigned vectorSize) {
unsigned idx = blockIdx.x*blockDim.x+threadIdx.x;
if (idx < vectorSize)
data[idx] = 1.0/data[idx];
}
const int HMM =1 ;
/**
* Host function that copies the data and launches the work on GPU
*/
float *gpuReciprocal(float *data, unsigned size)
{
float *rc = new float;
memset (rc, 0, size*sizeof(float));
float *gpuData;
if (HMM)
{
gpuData = static_cast<float*>( calloc( size, sizeof(float)));
memcpy(gpuData, data, sizeof(float)*size);
}
else
{
CUDA_CHECK_RETURN(cudaMalloc((void **)&gpuData, sizeof(float)*size));
CUDA_CHECK_RETURN(cudaMemcpy(gpuData, data, sizeof(float)*size, cudaMemcpyHostToDevice));
}
static const int BLOCK_SIZE = 256;
const int blockCount = (size+BLOCK_SIZE-1)/BLOCK_SIZE;
reciprocalKernel<<<blockCount, BLOCK_SIZE>>> (gpuData, size);
CUDA_CHECK_RETURN( cudaPeekAtLastError() );
CUDA_CHECK_RETURN( cudaDeviceSynchronize() );
if(HMM)
{
memcpy(rc, gpuData, sizeof(float)*size);
free(gpuData);
}
else
{
CUDA_CHECK_RETURN(cudaMemcpy(rc, gpuData, sizeof(float)*size, cudaMemcpyDeviceToHost));
CUDA_CHECK_RETURN(cudaFree(gpuData));
}
return rc;
}
float *cpuReciprocal(float *data, unsigned size)
{
float *rc = new float;
for (unsigned cnt = 0; cnt < size; ++cnt) rc[cnt] = 1.0/data[cnt];
return rc;
}
void initialize(float *data, unsigned size)
{
for (unsigned i = 0; i < size; ++i)
data[i] = .5*(i+1);
}
int main(void)
{
static const int WORK_SIZE = 65530;
float *data = new float[WORK_SIZE];
initialize (data, WORK_SIZE);
float *recCpu = cpuReciprocal(data, WORK_SIZE);
float *recGpu = gpuReciprocal(data, WORK_SIZE);
float cpuSum = std::accumulate (recCpu, recCpu+WORK_SIZE, 0.0);
float gpuSum = std::accumulate (recGpu, recGpu+WORK_SIZE, 0.0);
/* Verify the results */
std::cout<<"gpuSum = "<<gpuSum<< " cpuSum = " <<cpuSum<<std::endl;
/* Free memory */
delete[] data;
delete[] recCpu;
delete[] recGpu;
return 0;
}
/**
* Check the return value of the CUDA runtime API call and exit
* the application if the call has failed.
*/
static void CheckCudaErrorAux (const char *file, unsigned line, const char *statement, cudaError_t err)
{
if (err == cudaSuccess)
return;
std::cerr << statement<<" returned " << cudaGetErrorString(err) << "("<<err<< ") at "<<file<<":"<<line << std::endl;
exit (1);
}