Problems binding textures in multiple file programs

I’ve been writing a simple random-walk dispersion code in CUDA. I want to store velocity fields in textures because I only need to read from the arrays to move the particles and I would like to take advantage of the reduced memory latency of texture memory accesses compared to global memory accesses. Unfortunately I am having trouble binding the velocity field arrays to textures. My program uses multiple files and every time that I try to compile it I get an error message similar to the one below:

/usr/local/cuda/bin/…/include/cuda_runtime.h:242: error: inline function ‘_Z15cudaBindTextureIfLi1EL19cudaTextureReadMode0EE9cudaEr
rorPmRK7textureIT_XT0_EXT1_EEPKvRK21cudaChannelFormatDescm’ cannot be declared weak

I have no idea what being “declared weak” means. In order to diagnose the problem I started fiddling around with the simple cudaMallocAndMemcpy example and modified it to use a kernel and textures instead of the device to device cudaMemcpy. This worked perfectly.

However when I tried separating the texture binding and kernel into another file I got the error shown above. I have been doing this on a MacPro so it may be something funky with the Mac version of CUDA. Does anyone know what I am doing wrong? Thanks in advance for any help.

I have included the code below.

/*********************************
*

  • Here is the single file example that works.

*********************************/

// Start of cudaMallocAndMemcpyTexture.cu
#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>

void checkCUDAError(const char *msg);

texture<float, 1, cudaReadModeElementType> tex;
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32,0,0,0,cudaChannelFormatKindFloat);

global void CopyTexture(float* d_b){
int idx=blockIdx.x*blockDim.x+threadIdx.x;
d_b[idx]=tex1Dfetch(tex,idx);
}

int main( void ) {
int n, dimA;
float h_a, h_b;
float d_a, d_b;
dimA = 512;
size_t memSize = dimA
sizeof(float);
cudaMallocHost( (void
)&h_a, memSize );
cudaMallocHost( (void
*)&h_b, memSize );
printf(“Allocated Host Memory\n”);
for (n=0; n<dimA; n++){
//printf(“Set memory location %d\n”,n);
h_a[n] = (float) n;
}
printf(“Initialized Host Array\n”);
cudaMalloc( (void**)&d_a, memSize );
cudaMalloc( (void**)&d_b, memSize );
printf(“Allocated Device Memory\n”);
cudaMemcpy( d_a, h_a, memSize, cudaMemcpyHostToDevice );
printf(“Copied Array from Host to Device\n”);

//This is the original copy statement
//cudaMemcpy( d_b, d_a, memSize, cudaMemcpyDeviceToDevice );

//This section of code is where I modified it to use textures
tex.addressMode[0] = cudaAddressModeClamp;
tex.filterMode = cudaFilterModePoint;
tex.normalized = false;
cudaBindTexture(NULL,tex,d_a,channelDesc);
int numThreadsPerBlock=64;
int numBlocks = dimA/numThreadsPerBlock + (dimA%numThreadsPerBlock == 0?0:1);
CopyTexture<<<numBlocks,numThreadsPerBlock>>>(d_b);
// end of modifications

printf("Copied Array from Device to Device\n");
cudaMemcpy(  h_b, d_b, memSize, cudaMemcpyDeviceToHost );
printf("Copied Array from Device to Host\n");
checkCUDAError("cudaMemcpy calls");
for (n=0; n<dimA; n++){
    assert(h_a[n] ==  h_b[n]);
}
printf("Checked the array on Host\n");
cudaFree( d_a );
cudaFree( d_b );
printf("Freed Device Memory\n");
checkCUDAError("cudaFree");
cudaFreeHost(h_a);
cudaFreeHost(h_b);
printf("Freed Host Memory\n");
printf("Correct!\n");

return 0;

}

void checkCUDAError(const char *msg){
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) {
fprintf(stderr, “Cuda error: %s: %s.\n”, msg, cudaGetErrorString( err) );
exit(-1);
}
}
// End of cudaMallocAndMemcpyTexture.cu

/*********************************
*

  • Here is the multiple file example that does not work.

*********************************/

// Start of textureCopy.cpp
#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>

void checkCUDAError(const char *msg);

void CopyTexture(float* d_b,float* d_a,int dimA);

int main( void ) {
int n, dimA;
float h_a, h_b;
float d_a, d_b;
dimA = 512;
size_t memSize = dimA
sizeof(float);
cudaMallocHost( (void
)&h_a, memSize );
cudaMallocHost( (void
*)&h_b, memSize );
printf(“Allocated Host Memory\n”);
for (n=0; n<dimA; n++){
//printf(“Set memory location %d\n”,n);
h_a[n] = (float) n;
}
printf(“Initialized Host Array\n”);
cudaMalloc( (void**)&d_a, memSize );
cudaMalloc( (void**)&d_b, memSize );
printf(“Allocated Device Memory\n”);
cudaMemcpy( d_a, h_a, memSize, cudaMemcpyHostToDevice );
printf(“Copied Array from Host to Device\n”);
CopyTexture(d_b,d_a,dimA);
printf(“Copied Array from Device to Device\n”);
cudaMemcpy( h_b, d_b, memSize, cudaMemcpyDeviceToHost );
printf(“Copied Array from Device to Host\n”);
checkCUDAError(“cudaMemcpy calls”);
for (n=0; n<dimA; n++){
assert(h_a[n] == h_b[n]);
}
printf(“Checked the array on Host\n”);
cudaFree( d_a );
cudaFree( d_b );
printf(“Freed Device Memory\n”);
checkCUDAError(“cudaFree”);
cudaFreeHost(h_a);
cudaFreeHost(h_b);
printf(“Freed Host Memory\n”);
printf(“Correct!\n”);

return 0;

}

void checkCUDAError(const char *msg){
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) {
fprintf(stderr, “Cuda error: %s: %s.\n”, msg, cudaGetErrorString( err) );
exit(-1);
}
}
// End of textureCopy.cpp

// Start of textureCopy.cu
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include “textureCopy.h”

texture<float, 1, cudaReadModeElementType> tex;
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32,0,0,0,cudaChannelFormatKindFloat);

global void CopyTextureKernel(float* d_b){
int idx=blockIdx.x*blockDim.x+threadIdx.x;
d_b[idx]=tex1Dfetch(tex,idx);
}

void CopyTexture(float* d_b,float* d_a,int dimA){
int numThreadsPerBlock=64;
int numBlocks = dimA/numThreadsPerBlock + (dimA%numThreadsPerBlock == 0?0:1);
tex.addressMode[0] = cudaAddressModeClamp;
tex.filterMode = cudaFilterModePoint;
tex.normalized = false;
cudaBindTexture(NULL,tex,d_a,channelDesc);
CopyTextureKernel<<<numBlocks,numThreadsPerBlock>>>(d_b);
}
// End of textureCopy.cu

I found the problem. With the single file version I was simply using nvcc from the command line which worked fine. For the multiple file version I was using the make files that came with the SDK. I was trying to use a most recent version of gcc on my Mac and apparently the latest version of gcc didn’t like the texture binding calls. Now that I am using the default version of gcc on the Mac it compiles without any problems.

This doesn’t solve the original problem. I get these errors as well. I am using the latest gcc 4.4 with c++0x, I am not sure what to do. Look at the gcc options? File a nvidia bug report? It’s the first time I’ve heard of weak symbols, had to look it up.

(weak functions are ones that can be overridden with strong functions)

I looked at the command line nvcc was using and it had

nvcc -o obj/release/riv.cu_o -c riv.cu -I. -I/usr/local/cuda/include -DUNIX -O3

I removed the -DUNIX (it was from common.mk) and it compiles now.