I’m trying to create a simple example of using 2D textures, but encountering problems and not sure how to debug. The example is to copy a small image by:
-
reading the original
-
copying the original to a CUDA array, binding that array to a texture
-
the kernel then copies each pixel from the texture to the new image
-
the new image is retrieved from device memory and written out to disk
The problem originates with the use of textures. When I copy from the texture using the kernel, the output is scrambled. If I copy directly from a linear array, everything is fine. If I just copy the CUDA array memory back from the device, everything is fine.
I hate to post a bunch of code but I’m at a dead end here. Semi experienced with CUDA and never had any problems in the past. I’ve read the relevant documentation several times, so I’m hoping I’ve got some simple conceptual mistake that can be cleared up easily by someone with more experience.
[codebox]texture<unsigned char, 2, cudaReadModeElementType> texRef;
global void copy_tex_to_global(uint8 * copy, uint32 width) {
//thread coordinates
int tx = threadIdx.x;
int bx = blockIdx.x;
//copy one row from texture
uint8 pixel = tex2D(texRef, (float)(tx), (float)(bx));
copy[bx*width + tx] = pixel;
}
int main(int argc, char *argv) {
//image dimensions
uint32 width, height;
//read image from disk
uint8* original = TIFFreadgray(argv[1], width, height);
//declare and allocate CUDA array
cudaArray* original_array;
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned char>();
cudaMallocArray(&original_array, &channelDesc, width, height);
//copy 'class_image' to 'image_array'
cudaMemcpyToArray(original_array, 0, 0, original, width*height, cudaMemcpyHostToDevice);
//declare texture reference, initialize
texRef.normalized = 0;
texRef.filterMode = cudaFilterModeLinear;
texRef.addressMode[0] = cudaAddressModeClamp;
texRef.addressMode[1] = cudaAddressModeClamp;
//bind texture to array
cudaBindTextureToArray(texRef, original_array);
//declare image copy in device memory
uint8 * d_copy;
cudaMalloc((void**)&d_copy, width*height);
//execution configuration
dim3 dimBlock(width); //one thread for each pixel in a row
dim3 dimGrid(height); //one block for each row
//call kernel
copy_tex_to_global<<<dimGrid, dimBlock>>>(d_copy, width);
//allocate array for image copy
unsigned char * copy = (uint8 *)malloc(width*height);
//copy result from device
cudaMemcpy(copy, d_copy, width*height, cudaMemcpyDeviceToHost);
//write copy to disk
TIFFwritegray(copy, "copy.tif", width, height);
//free dynamically allocated memory
cudaFreeArray(original_array);
cudaFree(d_copy);
free(copy);
free(original);
}[/codebox]
TIFF_utilities.cpp (12.3 KB)