I did — I call cuParamSetTexRef() before every grid launch, but only for the function that actually uses the texture.
I also tried adding it before all of the launches, but it doesn't change anything.
Here are the two versions of the function:
//////// My version in C++ + API Driver
I've got a singleton object called CudaLoader. I initialize my CUDA context in its constructor.
//// initialization in ctor
// One-time driver-API setup: pick a device, create the context, load the
// precompiled module, and resolve the four kernel handles used by
// implDriverFunc(). Error codes are captured in `status` but, as posted,
// only checked in the elided (...) sections.
CUT_DEVICE_INIT_DRV(_cuDevice, sArgc, sArgv);
CUresult status = cuCtxCreate( &_cuContext, 0, _cuDevice);
int major, minor;
// Queried for information only; the result is not used below.
cuDeviceComputeCapability(&major, &minor, _cuDevice);
// NOTE(review): relative path — resolved against the current working
// directory, and the cubin must match the device architecture.
status = cuModuleLoad(&_cuModule,"func.cubin");
(...)
// NOTE(review): these CUfunction handles are new'd and apparently never
// deleted; a plain CUfunction member would suffice, since
// cuModuleGetFunction only fills in a handle value.
_cuComputeDistanceGlobal = new CUfunction;
status = cuModuleGetFunction(_cuComputeDistanceGlobal, _cuModule,
"cuComputeDistanceGlobal");
(...)
_cuComputeDistanceTexture = new CUfunction;
status = cuModuleGetFunction(_cuComputeDistanceTexture, _cuModule,
"cuComputeDistanceTexture");
(...)
_cuInsertionSort = new CUfunction;
status = cuModuleGetFunction(_cuInsertionSort, _cuModule,
"cuInsertionSort");
(...)
_cuParallelSqrt = new CUfunction;
status = cuModuleGetFunction(_cuParallelSqrt, _cuModule,
"cuParallelSqrt");
////////////////////////////////////
float* CudaLoader::implDriverFunc(float * ref_host, int ref_width,
float * query_host, int query_width, int dim, int k,
CUfunction *cuComputeDistanceGlobal,
CUfunction *cuComputeDistanceTexture,
CUfunction *cuInsertionSort, CUfunction *cuParallelSqrt)
{
// Device array
CUdeviceptr query_dev;
CUdeviceptr ref_dev;
CUdeviceptr dist_dev;
CUarray ref_array;
// Variables
CUresult res;
size_t query_pitch;
size_t query_pitch_in_bytes;
size_t ref_pitch;
size_t ref_pitch_in_bytes;
size_t max_nb_query_traited;
size_t actual_nb_query_width;
unsigned int memory_total;
unsigned int memory_free;
unsigned int use_texture = (ref_width*sizeof(float)
<=MAX_TEXTURE_WIDTH_IN_BYTES && dim*sizeof(float)
<=MAX_TEXTURE_HEIGHT_IN_BYTES);
float *output_host = new float[query_width];
// Allocation CUDA memory
int elementSizeBytes = 4; /// --> 4 ??
cuMemGetInfo(&memory_free, &memory_total);
max_nb_query_traited = (size_t)(memory_free
* MAX_PART_OF_FREE_MEMORY_USED - sizeof(float) * ref_width
*dim ) / (sizeof(float) * (dim + ref_width) );
max_nb_query_traited = min((unsigned int)query_width,
(max_nb_query_traited / 16) * 16);
res
= CU_SAFE_CALL(cuMemAllocPitch( &query_dev, &query_pitch_in_bytes, max_nb_query_traited*sizeof(float), (dim+ref_width), elementSizeBytes));
(...)
query_pitch = query_pitch_in_bytes/sizeof(float);
dist_dev = query_dev + dim * query_pitch;
if (use_texture)
{
// Allocation of texture memory for reference points
CUDA_ARRAY_DESCRIPTOR desc;
desc.Format = CU_AD_FORMAT_FLOAT;
desc.Width = ref_width;
desc.Height = dim;
desc.NumChannels = 1; //doit être le meme que TexRefSetFormat
res = cuArrayCreate(&ref_array, &desc);
(...)
// cudaMemcpyToArray( ref_array, 0, 0, ref_host, ref_width*height*sof,cudaMemcpyHostToDevice );
CUDA_MEMCPY2D memcpy2Dparams;
memset(&memcpy2Dparams, 0, sizeof(memcpy2Dparams));
memcpy2Dparams.srcMemoryType = CU_MEMORYTYPE_HOST;
memcpy2Dparams.dstMemoryType = CU_MEMORYTYPE_ARRAY;
memcpy2Dparams.srcHost = ref_host;
memcpy2Dparams.srcPitch = ref_width*sizeof(float);
memcpy2Dparams.dstArray = ref_array;
memcpy2Dparams.WidthInBytes = memcpy2Dparams.srcPitch;
memcpy2Dparams.Height = dim;
AlertAndKillIfCudaError(cuMemcpy2D(&memcpy2Dparams));
CU_SAFE_CALL(cuModuleGetTexRef(&_tex, _cuModule, "texA"));
CU_SAFE_CALL(cuTexRefSetArray(_tex, ref_array, CU_TRSA_OVERRIDE_FORMAT));
CU_SAFE_CALL(cuTexRefSetAddressMode(_tex, 0, CU_TR_ADDRESS_MODE_CLAMP));
CU_SAFE_CALL(cuTexRefSetAddressMode(_tex, 1, CU_TR_ADDRESS_MODE_CLAMP));
CU_SAFE_CALL(cuTexRefSetFilterMode(_tex, CU_TR_FILTER_MODE_POINT));
CU_SAFE_CALL(cuTexRefSetFlags(_tex, 0));
CU_SAFE_CALL(cuTexRefSetFormat(_tex, CU_AD_FORMAT_FLOAT, 1));
}
else
{
//res = cudaMallocPitch( (void **) &ref_dev, &ref_pitch_in_bytes,ref_width * size_of_float, height);
res
= CU_SAFE_CALL(cuMemAllocPitch( &ref_dev, &ref_pitch_in_bytes, ref_width*sizeof(float), dim, elementSizeBytes));
(...)
ref_pitch = ref_pitch_in_bytes/sizeof(float);
//cudaMemcpy2D(ref_dev, ref_pitch_in_bytes, ref_host, ref_width*size_of_float, ref_width*size_of_float, height,cudaMemcpyHostToDevice);
CUDA_MEMCPY2D memcpy2Dparams2;
memset(&memcpy2Dparams2, 0, sizeof(memcpy2Dparams2));
memcpy2Dparams2.srcMemoryType = CU_MEMORYTYPE_HOST;
memcpy2Dparams2.dstMemoryType = CU_MEMORYTYPE_DEVICE;
memcpy2Dparams2.srcHost = ref_host;
memcpy2Dparams2.srcPitch = ref_width*sizeof(float);
memcpy2Dparams2.dstDevice = ref_dev;
memcpy2Dparams2.dstPitch = ref_pitch_in_bytes;
memcpy2Dparams2.WidthInBytes = ref_width*sizeof(float);
memcpy2Dparams2.Height = dim;
res = CU_SAFE_CALL(cuMemcpy2D(&memcpy2Dparams2));
(...)
}
////
// Split queries to fit on GPU memory
for (int i=0; i<query_width; i+=max_nb_query_traited)
{
actual_nb_query_width = min((int)max_nb_query_traited,
query_width-i);
// Copy of part of query actually being treated
//cudaMemcpy2D(query_dev, query_pitch_in_bytes, &query_host[i],query_width*size_of_float, actual_nb_query_width*size_of_float,height, cudaMemcpyHostToDevice);
CUDA_MEMCPY2D memcpy2Dparams2;
memset(&memcpy2Dparams2, 0, sizeof(memcpy2Dparams2));
memcpy2Dparams2.srcMemoryType = CU_MEMORYTYPE_HOST;
memcpy2Dparams2.dstMemoryType = CU_MEMORYTYPE_DEVICE;
memcpy2Dparams2.srcHost = &query_host[i];
memcpy2Dparams2.srcPitch = query_width*sizeof(float);
memcpy2Dparams2.dstDevice = query_dev;
memcpy2Dparams2.dstPitch = query_pitch_in_bytes;
memcpy2Dparams2.WidthInBytes = actual_nb_query_width
*sizeof(float);
memcpy2Dparams2.Height = dim;
res = CU_SAFE_CALL(cuMemcpy2D(&memcpy2Dparams2));
(...)
// GRIDS ANS THREADS
//dim3 g_16x16(actual_nb_query_width/16, ref_width/16, 1);
unsigned int g_16x16_x = actual_nb_query_width/16;
unsigned int g_16x16_y = ref_width/16;
//dim3 t_16x16(16, 16, 1);
unsigned int t_16x16_x = 16;
unsigned int t_16x16_y = 16;
unsigned int t_16x16_z = 1;
if (actual_nb_query_width%16 != 0)
g_16x16_x += 1;
if (ref_width %16 != 0)
g_16x16_y += 1;
//dim3 g_256x1(actual_nb_query_width/256, 1, 1);
unsigned int g_256x1_x = actual_nb_query_width/256;
unsigned int g_256x1_y = 1;
//dim3 t_256x1(256, 1, 1);
unsigned int t_256x1_x = 256;
unsigned int t_256x1_y = 1;
unsigned int t_256x1_z = 1;
if (actual_nb_query_width%256 != 0)
g_256x1_x += 1;
// KERNEL 1 : Compute all the distances
if (use_texture)
{
//cuComputeDistanceTexture<<<g_16x16,t_16x16>>>(ref_width, query_dev, actual_nb_query_width, query_pitch, height, dist_dev);
int offset = 0;
AlertAndKillIfCudaError(cuFuncSetBlockShape(
*cuComputeDistanceTexture, t_16x16_x, t_16x16_y,
t_16x16_z));
AlertAndKillIfCudaError(cuParamSeti(
*cuComputeDistanceTexture, offset, ref_width));
offset += sizeof(ref_width);
AlertAndKillIfCudaError(cuParamSeti(
*cuComputeDistanceTexture, offset, query_dev));
offset += sizeof(query_dev);
AlertAndKillIfCudaError(cuParamSeti(
*cuComputeDistanceTexture, offset,
actual_nb_query_width));
offset += sizeof(actual_nb_query_width);
AlertAndKillIfCudaError(cuParamSeti(
*cuComputeDistanceTexture, offset, query_pitch /*/sizeof(float)*/)); //??????????????????
offset += sizeof(query_pitch);
AlertAndKillIfCudaError(cuParamSeti(
*cuComputeDistanceTexture, offset, dim));
offset += sizeof(dim);
AlertAndKillIfCudaError(cuParamSeti(
*cuComputeDistanceTexture, offset, dist_dev));
offset += sizeof(dist_dev);
AlertAndKillIfCudaError(cuParamSetSize(
*cuComputeDistanceTexture, offset));
CU_SAFE_CALL(cuParamSetTexRef(*cuComputeDistanceTexture, CU_PARAM_TR_DEFAULT, _tex));
CU_SAFE_CALL( cuCtxSynchronize() );
AlertAndKillIfCudaError(cuLaunchGrid(
*cuComputeDistanceTexture, g_16x16_x, g_16x16_y));
CU_SAFE_CALL( cuCtxSynchronize() );
}
else
{
cout<<"pas texture"<<endl;
//cuComputeDistanceGlobal<<<g_16x16,t_16x16>>>(ref_dev, ref_width, ref_pitch, query_dev, actual_nb_query_width, query_pitch, height, dist_dev);
int offset = 0;
AlertAndKillIfCudaError(cuFuncSetBlockShape(
*cuComputeDistanceGlobal, t_16x16_x, t_16x16_y,
t_16x16_z));
AlertAndKillIfCudaError(cuParamSeti(
*cuComputeDistanceGlobal, offset, ref_dev));
offset += sizeof(ref_dev);
AlertAndKillIfCudaError(cuParamSeti(
*cuComputeDistanceGlobal, offset, ref_width));
offset += sizeof(ref_width);
AlertAndKillIfCudaError(cuParamSeti(
*cuComputeDistanceGlobal, offset, ref_pitch));
offset += sizeof(ref_pitch);
AlertAndKillIfCudaError(cuParamSeti(
*cuComputeDistanceGlobal, offset, query_dev));
offset += sizeof(query_dev);
AlertAndKillIfCudaError(cuParamSeti(
*cuComputeDistanceGlobal, offset,
actual_nb_query_width));
offset += sizeof(actual_nb_query_width);
AlertAndKillIfCudaError(cuParamSeti(
*cuComputeDistanceGlobal, offset, query_pitch));
offset += sizeof(query_pitch);
AlertAndKillIfCudaError(cuParamSeti(
*cuComputeDistanceGlobal, offset, dim));
offset += sizeof(dim);
AlertAndKillIfCudaError(cuParamSeti(
*cuComputeDistanceGlobal, offset, dist_dev));
offset += sizeof(dist_dev);
AlertAndKillIfCudaError(cuParamSetSize(
*cuComputeDistanceGlobal, offset));
AlertAndKillIfCudaError(cuLaunchGrid(
*cuComputeDistanceGlobal, g_16x16_x, g_16x16_y));
}
// KERNEL 2 : Sort each column
//cuInsertionSort<<<g_256x1,t_256x1>>>(dist_dev, actual_nb_query_width, query_pitch, ref_width, k);
int offset = 0;
AlertAndKillIfCudaError(cuFuncSetBlockShape(*cuInsertionSort,
t_256x1_x, t_256x1_y, t_256x1_z));
AlertAndKillIfCudaError(cuParamSeti(*cuInsertionSort, offset,
dist_dev));
offset += sizeof(dist_dev);
AlertAndKillIfCudaError(cuParamSeti(*cuInsertionSort, offset,
actual_nb_query_width));
offset += sizeof(actual_nb_query_width);
AlertAndKillIfCudaError(cuParamSeti(*cuInsertionSort, offset,
query_pitch/sizeof(float)));
offset += sizeof(query_pitch);
AlertAndKillIfCudaError(cuParamSeti(*cuInsertionSort, offset,
ref_width));
offset += sizeof(ref_width);
AlertAndKillIfCudaError(cuParamSeti(*cuInsertionSort, offset, k));
offset += sizeof(k);
AlertAndKillIfCudaError(cuParamSetSize(*cuInsertionSort, offset));
AlertAndKillIfCudaError(cuLaunchGrid(*cuInsertionSort,
g_256x1_x, g_256x1_y));
// KERNEL 3 : Compute square root of k-th element
//cuParallelSqrt<<<g_256x1,t_256x1>>>(dist_dev+(k-1)*query_pitch, query_width);
offset = 0;
AlertAndKillIfCudaError(cuFuncSetBlockShape(*cuParallelSqrt,
t_256x1_x, t_256x1_y, t_256x1_z));
AlertAndKillIfCudaError(cuParamSeti(*cuParallelSqrt, offset,
dist_dev +(k-1)*query_pitch));
offset += sizeof(dist_dev);
AlertAndKillIfCudaError(cuParamSeti(*cuParallelSqrt, offset,
query_width));
offset += sizeof(query_width);
AlertAndKillIfCudaError(cuParamSetSize(*cuParallelSqrt, offset));
AlertAndKillIfCudaError(cuLaunchGrid(*cuParallelSqrt,
g_256x1_x, g_256x1_y));
// Memory copy of output from device to host
//cudaMemcpy2D(&output_host[i], query_width*size_of_float, dist_dev+(k-1)*query_pitch, query_pitch_in_bytes, actual_nb_query_width*size_of_float, 1, cudaMemcpyDeviceToHost);
CUDA_MEMCPY2D memcpy2Dparams3;
memset(&memcpy2Dparams3, 0, sizeof(memcpy2Dparams3));
memcpy2Dparams3.srcMemoryType = CU_MEMORYTYPE_DEVICE;
memcpy2Dparams3.dstMemoryType = CU_MEMORYTYPE_HOST;
memcpy2Dparams3.srcDevice = dist_dev+(k-1)*query_pitch;
memcpy2Dparams3.srcPitch = query_pitch_in_bytes; //// ici : pb
memcpy2Dparams3.dstHost = &output_host[i];
memcpy2Dparams3.dstPitch = query_width*sizeof(float);
memcpy2Dparams3.WidthInBytes = actual_nb_query_width
*sizeof(float);
memcpy2Dparams3.Height = 1;
cuMemcpy2D(&memcpy2Dparams3);
res = CU_SAFE_CALL(cuMemcpy2D(&memcpy2Dparams3));
(...)
}
// FREE MEMORY
if (use_texture)
cuArrayDestroy(ref_array);
else
cuMemFree(ref_dev);
cuMemFree(query_dev);
////
return output_host;
}
//////// version in C + Runtime API
// Reference (working) implementation with the runtime API: for each query
// point, writes into output_host the distance to its k-th nearest reference
// point. Queries are processed in chunks that fit in free device memory.
// ref_host / query_host are row-major [height x width] host matrices;
// "height" plays the role of "dim" in the driver-API version above.
void implRuntimeFunc(float* ref_host, int ref_width, float* query_host, int query_width, int height, int k, float* output_host){
unsigned int size_of_float = sizeof(float);
// Variables
float *query_dev;
float *ref_dev;
float *dist_dev;
cudaArray *ref_array;
cudaError_t result;
size_t query_pitch;            // pitch of the query/dist rows, in elements
size_t query_pitch_in_bytes;   // same pitch in bytes (cudaMallocPitch output)
size_t ref_pitch;
size_t ref_pitch_in_bytes;
size_t max_nb_query_traited;
size_t actual_nb_query_width;
unsigned int memory_total;
unsigned int memory_free;
// Texture path only while the reference matrix fits in a 2D texture.
unsigned int use_texture = (ref_width*size_of_float<=MAX_TEXTURE_WIDTH_IN_BYTES && height*size_of_float<=MAX_TEXTURE_HEIGHT_IN_BYTES);
// CUDA initialisation and get free memory (a fraction of it is used below)
CUdevice cuDevice=0;
CUcontext cuContext;
cuInit(0);
cuCtxCreate(&cuContext, 0, cuDevice);
cuMemGetInfo(&memory_free,&memory_total);
// Determine the maximum number of queries that can be treated at once,
// rounded down to a multiple of 16 so the 16x16 blocks stay full.
max_nb_query_traited = ( memory_free * MAX_PART_OF_FREE_MEMORY_USED - size_of_float * ref_width*height ) / ( size_of_float * (height + ref_width) );
max_nb_query_traited = min( query_width, (max_nb_query_traited / 16) * 16 );
// One pitched allocation holds the query chunk (height rows) followed by
// the distance matrix (ref_width rows).
result = cudaMallocPitch( (void **) &query_dev, &query_pitch_in_bytes, max_nb_query_traited * size_of_float, height + ref_width);
(...)
query_pitch = query_pitch_in_bytes/size_of_float;
// float* arithmetic: the offset is in ELEMENTS here (unlike CUdeviceptr,
// which is a byte address, in the driver-API version).
dist_dev = query_dev + height * query_pitch;
if (use_texture){
// Allocation of texture memory for reference points
cudaChannelFormatDesc channelDescA = cudaCreateChannelDesc<float>();
result = cudaMallocArray( &ref_array, &channelDescA, ref_width, height );
(...)
// Host rows are tightly packed, so a flat copy of the whole matrix works.
cudaMemcpyToArray( ref_array, 0, 0, ref_host, ref_width * height * size_of_float, cudaMemcpyHostToDevice );
// Set texture parameters and bind texture to array
texA.addressMode[0] = cudaAddressModeClamp;
texA.addressMode[1] = cudaAddressModeClamp;
texA.filterMode = cudaFilterModePoint;
texA.normalized = 0;   // unnormalized (integer) texture coordinates
cudaBindTextureToArray(texA, ref_array);
}
else
{
// Global-memory fallback: pitched allocation + 2D copy.
result = cudaMallocPitch( (void **) &ref_dev, &ref_pitch_in_bytes, ref_width * size_of_float, height);
(...)
ref_pitch = ref_pitch_in_bytes/size_of_float;
cudaMemcpy2D(ref_dev, ref_pitch_in_bytes, ref_host, ref_width*size_of_float, ref_width*size_of_float, height, cudaMemcpyHostToDevice);
}
// Split queries to fit on GPU memory
for (int i=0;i<query_width;i+=max_nb_query_traited){
actual_nb_query_width = min( max_nb_query_traited, query_width-i );
// Copy of part of query actually being treated
cudaMemcpy2D(query_dev, query_pitch_in_bytes, &query_host[i], query_width*size_of_float, actual_nb_query_width*size_of_float, height, cudaMemcpyHostToDevice);
// GRIDS AND THREADS
dim3 g_16x16(actual_nb_query_width/16, ref_width/16, 1);
dim3 t_16x16(16, 16, 1);
if (actual_nb_query_width%16 != 0) g_16x16.x += 1;
if (ref_width %16 != 0) g_16x16.y += 1;
//
dim3 g_256x1(actual_nb_query_width/256, 1, 1);
dim3 t_256x1(256, 1, 1);
if (actual_nb_query_width%256 != 0) g_256x1.x += 1;
// KERNEL 1 : Compute all the distances
if (use_texture)
cuComputeDistanceTexture<<<g_16x16,t_16x16>>>(ref_width, query_dev, actual_nb_query_width, query_pitch, height, dist_dev);
else
cuComputeDistanceGlobal<<<g_16x16,t_16x16>>>(ref_dev, ref_width, ref_pitch, query_dev, actual_nb_query_width, query_pitch, height, dist_dev);
// KERNEL 2 : Sort each column
cuInsertionSort<<<g_256x1,t_256x1>>>(dist_dev, actual_nb_query_width, query_pitch, ref_width, k);
// KERNEL 3 : Compute square root of k-th element
cuParallelSqrt<<<g_256x1,t_256x1>>>(dist_dev+(k-1)*query_pitch, query_width);
// Memory copy of output from device to host (the k-th distance row)
cudaMemcpy2D(&output_host[i], query_width*size_of_float, dist_dev+(k-1)*query_pitch, query_pitch_in_bytes, actual_nb_query_width*size_of_float, 1, cudaMemcpyDeviceToHost);
}
// FREE MEMORY
if (use_texture)
cudaFreeArray(ref_array);
else
cudaFree(ref_dev);
cudaFree(query_dev);
}
I guess it must be an initialization problem, but I can't figure out why…
Thanks for your help :)