Hi,
According to the documentation, 2D textures can be of integer types. But when I replace the float type with unsigned short or unsigned int in the simpleTexture example and run it in EmuRelease mode, the kernel hangs without doing any processing. Even a printf("here\n"); at the beginning of the kernel is not printed. Have I forgotten something? Can you help me?
Here is the modified code with unsigned short:
#ifndef SIMPLETEXTURE_KERNEL_H
#define SIMPLETEXTURE_KERNEL_H
// Texture reference for a 2D texture of unsigned short texels.
// cudaReadModeElementType means a fetch returns the stored unsigned short
// unchanged (no conversion to float).
// NOTE(review): the CUDA programming guide documents linear filtering as valid
// only for fetches that return floating-point data, so the host code should
// select point filtering for this reference - confirm against the toolkit in use.
texture<unsigned short, 2, cudaReadModeElementType> tex;
////////////////////////////////////////////////////////////////////////////////
//! Transform an image using texture lookups
//!
//! Each thread handles one output pixel (x, y) derived from its block and
//! thread indices: the pixel's normalized coordinate is rotated by theta about
//! the image centre (0.5, 0.5), the texture reference `tex` is sampled at the
//! rotated coordinate and the texel is stored at g_odata[y*width + x].
//!
//! Expects a 2D launch (x and y of both grid and block are used).
//!
//! @param g_odata output data in global memory, indexed row-major as
//!                y*width + x
//! @param width   image width in pixels
//! @param height  image height in pixels
//! @param theta   rotation angle passed by the host (radians)
////////////////////////////////////////////////////////////////////////////////
__global__ void
transformKernel(unsigned short* g_odata, int width, int height, float theta)
{
    // Debug trace from the original post; executed once per thread.
    // NOTE(review): presumably only usable in device-emulation builds on the
    // toolkit this sample targets - remove once the kernel is known to run.
    printf("here\n");

    // pixel handled by this thread
    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

    // Threads that fall outside the image must not write: the grid may be
    // larger than the image when its size is not a multiple of the block size.
    if (x >= (unsigned int) width || y >= (unsigned int) height)
        return;

    // calculate normalized texture coordinates
    float u = x / (float) width;
    float v = y / (float) height;

    // transform coordinates: rotate about the image centre
    u -= 0.5f;
    v -= 0.5f;
    float cos_t = cosf(theta);
    float sin_t = sinf(theta);
    float tu = u*cos_t - v*sin_t + 0.5f;
    float tv = v*cos_t + u*sin_t + 0.5f;

    // read from texture and write to global memory
    // NOTE(review): texfetch is kept as the fetch call used by this sample's
    // toolkit; later toolkits spell it tex2D - confirm before porting.
    g_odata[y*width + x] = texfetch(tex, tu, tv);
}
#endif // #ifndef SIMPLETEXTURE_KERNEL_H
And here is the main (host) code:
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cutil.h>
// includes, kernels
#include <simpleTexture_kernel.cu>
// name of the input image; runTest resolves it to a full path with cutFindFilePath
char *image_filename = "lena_bw.pgm";
float angle = 0.5f; // angle to rotate image by (in radians)
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest( int argc, char** argv);

// Declared with C linkage; not called anywhere in the code shown here.
// NOTE(review): presumably the CPU reference routine left over from the float
// version of the sample - its float* signature no longer matches the
// unsigned short data used below; confirm whether it is still needed.
extern "C"
void computeGold( float* reference, float* idata, const unsigned int len);
////////////////////////////////////////////////////////////////////////////////
// Program entry point
//
// Forwards the command line to runTest and then leaves through the cutil
// CUT_EXIT macro, which receives the same argc/argv.
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
    runTest(argc, argv);
    CUT_EXIT(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
//!
//! Loads the image named by image_filename as unsigned short samples, copies
//! it into a CUDA array bound to the texture reference `tex`, runs
//! transformKernel (once as warm-up, once timed), copies the result back and
//! writes it to "<input path without its last 4 characters>_out.pgm".
//!
//! @param argc  command-line argument count (checked for the "regression" flag)
//! @param argv  command-line arguments; argv[0] is used to locate the image
////////////////////////////////////////////////////////////////////////////////
void
runTest( int argc, char** argv)
{
    // load image from disk
    unsigned short* h_data = NULL;
    unsigned int width, height;
    char* image_path = cutFindFilePath(image_filename, argv[0]);
    if (image_path == 0)
        exit(EXIT_FAILURE);
    CUT_SAFE_CALL( cutLoadPGMs(image_path, &h_data, &width, &height));
    unsigned int size = width * height * sizeof(unsigned short);
    // width/height are unsigned, so print them with %u
    printf("Loaded '%s', %u x %u pixels\n", image_filename, width, height);

    // allocate device memory for result
    unsigned short* d_data = NULL;
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_data, size));

    // allocate array and copy image data
    cudaArray* cu_array;
    CUDA_SAFE_CALL( cudaMallocArray( &cu_array, &tex.channelDesc, width, height ));
    CUDA_SAFE_CALL( cudaMemcpy( cu_array, h_data, size, cudaMemcpyHostToDevice));

    // set texture parameters
    tex.addressMode[0] = cudaAddressModeWrap;
    tex.addressMode[1] = cudaAddressModeWrap;
    // The texture holds unsigned short texels read with cudaReadModeElementType,
    // i.e. fetches return integers. Linear filtering is only defined for
    // fetches that return floating-point values, so point sampling must be
    // used here (the float version of this sample used cudaFilterModeLinear).
    tex.filterMode = cudaFilterModePoint;
    tex.normalized = true;    // access with normalized texture coordinates

    // Bind the array to the texture; a failed bind would otherwise go unnoticed
    CUDA_SAFE_CALL( cudaBindTexture( tex, cu_array));

    dim3 dimBlock(8, 8, 1);
    // Integer division: only whole 8x8 tiles are launched, so any remainder
    // columns/rows of the image are not written by the kernel.
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
    if (width % dimBlock.x != 0 || height % dimBlock.y != 0)
    {
        fprintf(stderr, "Warning: image size %u x %u is not a multiple of the "
                        "%u x %u block; edge pixels are left unprocessed\n",
                width, height, dimBlock.x, dimBlock.y);
    }

    // warmup
    transformKernel<<< dimGrid, dimBlock, 0 >>>( d_data, width, height, angle);

    unsigned int timer = 0;
    CUT_SAFE_CALL( cutCreateTimer( &timer));
    CUT_SAFE_CALL( cutStartTimer( timer));

    // execute the kernel
    transformKernel<<< dimGrid, dimBlock, 0 >>>( d_data, width, height, angle);

    // check if kernel execution generated an error
    CUT_CHECK_ERROR("Kernel execution failed");

    // NOTE(review): no explicit device synchronization precedes the timer stop;
    // if the launch is asynchronous on the target toolkit the figure below
    // measures launch overhead only - confirm.
    CUT_SAFE_CALL( cutStopTimer( timer));
    printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));
    printf("%.2f Mpixels/sec\n", (width*height / (cutGetTimerValue( timer) / 1000.0f)) / 1e6);
    CUT_SAFE_CALL( cutDeleteTimer( timer));

    // allocate mem for the result on host side
    unsigned short* h_odata = (unsigned short*) malloc( size);
    if (h_odata == NULL)
    {
        fprintf(stderr, "Failed to allocate %u bytes for the result image\n", size);
        exit(EXIT_FAILURE);
    }

    // copy result from device to host
    CUDA_SAFE_CALL( cudaMemcpy( h_odata, d_data, size, cudaMemcpyDeviceToHost) );

    // write result to file: replace the last 4 characters of the input path
    // (the ".pgm" extension of image_filename) with "_out.pgm"
    char output_filename[1024];
    const char* out_suffix = "_out.pgm";
    const size_t path_len = strlen(image_path);
    // reject paths shorter than the part being replaced or too long for the
    // fixed-size buffer instead of overrunning it
    if (path_len < 4 ||
        path_len - 4 + strlen(out_suffix) + 1 > sizeof(output_filename))
    {
        fprintf(stderr, "Cannot build output file name from '%s'\n", image_path);
        exit(EXIT_FAILURE);
    }
    strcpy(output_filename, image_path);
    strcpy(output_filename + path_len - 4, out_suffix);
    CUT_SAFE_CALL( cutSavePGMs( output_filename, h_odata, width, height));
    printf("Wrote '%s'\n", output_filename);

    // write regression file if necessary
    if( cutCheckCmdLineFlag( argc, (const char**) argv, "regression"))
    {
        // write file for regression test
        // (disabled in the original post: cutWriteFilef expects float data)
        // CUT_SAFE_CALL( cutWriteFilef( "./data/regression.dat", h_odata, width*height, 0.0));
    }

    // cleanup memory
    CUDA_SAFE_CALL(cudaFree(d_data));
    CUDA_SAFE_CALL(cudaFreeArray(cu_array));
    free(h_data);
    free(h_odata);
    cutFree(image_path);
}
Thanks for your help!