I’ve been writing a simple random-walk dispersion code in CUDA. I want to store velocity fields in textures because I only need to read from the arrays to move the particles and I would like to take advantage of the reduced memory latency of texture memory accesses compared to global memory accesses. Unfortunately I am having trouble binding the velocity field arrays to textures. My program uses multiple files and every time that I try to compile it I get an error message similar to the one below:
/usr/local/cuda/bin/…/include/cuda_runtime.h:242: error: inline function ‘_Z15cudaBindTextureIfLi1EL19cudaTextureReadMode0EE9cudaEr
rorPmRK7textureIT_XT0_EXT1_EEPKvRK21cudaChannelFormatDescm’ cannot be declared weak
I have no idea what being “declared weak” means. In order to diagnose the problem I started fiddling around with the simple cudaMallocAndMemcpy example and modified it to use a kernel and textures instead of the device to device cudaMemcpy. This worked perfectly.
However when I tried separating the texture binding and kernel into another file I got the error shown above. I have been doing this on a MacPro so it may be something funky with the Mac version of CUDA. Does anyone know what I am doing wrong? Thanks in advance for any help.
I have included the code below.
/*********************************
*
* Here is the single file example that works.
*********************************/
// Start of cudaMallocAndMemcpyTexture.cu
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>   /* exit() */
#include <cuda.h>
#include <cuda_runtime.h>
void checkCUDAError(const char *msg);  /* defined at the bottom of this file */
/* 1D float texture reference. Texture references must be declared at file
   scope, in the same translation unit as the kernel that samples them. */
texture<float, 1, cudaReadModeElementType> tex;
/* Channel descriptor: a single 32-bit float component, matching the
   texture element type above. */
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32,0,0,0,cudaChannelFormatKindFloat);
/*
 * Kernel: each thread copies one element from the bound 1D texture `tex`
 * into d_b. The launch in main() covers the array exactly (512 elements,
 * 64 threads/block), so no bounds guard is required here; add an element
 * count parameter and an `if (idx < n)` guard if the launch config can
 * ever over-cover the array.
 *
 * Fix: the paste stripped the underscores — the qualifier must be
 * `__global__`, not `global`.
 */
__global__ void CopyTexture(float* d_b) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    d_b[idx] = tex1Dfetch(tex, idx);
}
/*
 * Single-file driver: fills a pinned host array, copies it to the device,
 * performs a device-to-device copy through a 1D texture-lookup kernel,
 * copies the result back, and asserts the round trip was lossless.
 *
 * Fixes vs. the pasted version (all were compile errors introduced by the
 * forum software eating characters): pointer declarations lost their '*'
 * (`float h_a` -> `float *h_a`), `dimAsizeof(float)` lost its '*',
 * `(void)&h_a` lost a '*', and every printf/fprintf literal used
 * typographic quotes instead of '"'.
 */
int main(void) {
    int n;
    int dimA = 512;                            /* element count */
    size_t memSize = dimA * sizeof(float);     /* bytes per array */
    float *h_a, *h_b;                          /* pinned host buffers */
    float *d_a, *d_b;                          /* device buffers */

    /* Pinned host memory gives full-bandwidth host<->device transfers. */
    cudaMallocHost((void**)&h_a, memSize);
    cudaMallocHost((void**)&h_b, memSize);
    printf("Allocated Host Memory\n");

    for (n = 0; n < dimA; n++) {
        h_a[n] = (float) n;
    }
    printf("Initialized Host Array\n");

    cudaMalloc((void**)&d_a, memSize);
    cudaMalloc((void**)&d_b, memSize);
    printf("Allocated Device Memory\n");

    cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice);
    printf("Copied Array from Host to Device\n");

    /* Texture-based replacement for the original device-to-device copy:
     *   cudaMemcpy(d_b, d_a, memSize, cudaMemcpyDeviceToDevice); */
    tex.addressMode[0] = cudaAddressModeClamp;
    tex.filterMode = cudaFilterModePoint;  /* no interpolation */
    tex.normalized = false;                /* integer indexing for tex1Dfetch */
    cudaBindTexture(NULL, tex, d_a, channelDesc);

    int numThreadsPerBlock = 64;
    int numBlocks = dimA / numThreadsPerBlock +
                    (dimA % numThreadsPerBlock == 0 ? 0 : 1);  /* ceil-div */
    CopyTexture<<<numBlocks, numThreadsPerBlock>>>(d_b);
    printf("Copied Array from Device to Device\n");

    /* Blocking memcpy also synchronizes with the kernel launch above. */
    cudaMemcpy(h_b, d_b, memSize, cudaMemcpyDeviceToHost);
    printf("Copied Array from Device to Host\n");
    checkCUDAError("cudaMemcpy calls");

    for (n = 0; n < dimA; n++) {
        assert(h_a[n] == h_b[n]);
    }
    printf("Checked the array on Host\n");

    cudaUnbindTexture(tex);  /* release the binding before freeing d_a */
    cudaFree(d_a);
    cudaFree(d_b);
    printf("Freed Device Memory\n");
    checkCUDAError("cudaFree");

    cudaFreeHost(h_a);
    cudaFreeHost(h_b);
    printf("Freed Host Memory\n");
    printf("Correct!\n");
    return 0;
}
/*
 * Report the last CUDA runtime error, tagged with msg, and abort the
 * program. cudaGetLastError() both returns and clears the sticky error
 * state. Fix: the fprintf format string used typographic quotes
 * (a paste artifact), which is not a valid string literal.
 */
void checkCUDAError(const char *msg) {
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
        exit(-1);  /* requires <stdlib.h> */
    }
}
// End of cudaMallocAndMemcpyTexture.cu
/*********************************
*
* Here is the multiple file example that does not work.
*********************************/
// Start of textureCopy.cpp
#include <stdio.h>
#include <assert.h>
#include <stdlib.h>   /* exit(), used by checkCUDAError below */
#include <cuda.h>
#include <cuda_runtime.h>
/* Error reporter defined at the bottom of this file. */
void checkCUDAError(const char *msg);
/* Host wrapper defined in textureCopy.cu; it binds the texture and
   launches the copy kernel. Keeping only this plain C++ prototype here
   lets this .cpp file avoid all texture/kernel syntax.
   NOTE(review): the "cannot be declared weak" error reported above is
   triggered when cuda_runtime.h's texture-binding templates are compiled
   by the host g++ on this platform — confirm this file is not also being
   fed through a compiler/flag combination that instantiates them. */
void CopyTexture(float* d_b, float* d_a, int dimA);
/*
 * Multi-file driver: identical to the single-file version except that the
 * texture bind + kernel launch are hidden behind the CopyTexture() host
 * wrapper, which lives in textureCopy.cu and is compiled by nvcc.
 *
 * Fixes vs. the pasted version (paste artifacts, all compile errors):
 * pointer declarations lost their '*', `dimAsizeof(float)` lost its '*',
 * `(void)&h_a` lost a '*', and every quoted string literal used
 * typographic quotes instead of '"'.
 */
int main(void) {
    int n;
    int dimA = 512;                            /* element count */
    size_t memSize = dimA * sizeof(float);     /* bytes per array */
    float *h_a, *h_b;                          /* pinned host buffers */
    float *d_a, *d_b;                          /* device buffers */

    /* Pinned host memory gives full-bandwidth host<->device transfers. */
    cudaMallocHost((void**)&h_a, memSize);
    cudaMallocHost((void**)&h_b, memSize);
    printf("Allocated Host Memory\n");

    for (n = 0; n < dimA; n++) {
        h_a[n] = (float) n;
    }
    printf("Initialized Host Array\n");

    cudaMalloc((void**)&d_a, memSize);
    cudaMalloc((void**)&d_b, memSize);
    printf("Allocated Device Memory\n");

    cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice);
    printf("Copied Array from Host to Device\n");

    /* Device-to-device copy via the texture kernel in textureCopy.cu. */
    CopyTexture(d_b, d_a, dimA);
    printf("Copied Array from Device to Device\n");

    /* Blocking memcpy also synchronizes with the kernel launch. */
    cudaMemcpy(h_b, d_b, memSize, cudaMemcpyDeviceToHost);
    printf("Copied Array from Device to Host\n");
    checkCUDAError("cudaMemcpy calls");

    for (n = 0; n < dimA; n++) {
        assert(h_a[n] == h_b[n]);
    }
    printf("Checked the array on Host\n");

    cudaFree(d_a);
    cudaFree(d_b);
    printf("Freed Device Memory\n");
    checkCUDAError("cudaFree");

    cudaFreeHost(h_a);
    cudaFreeHost(h_b);
    printf("Freed Host Memory\n");
    printf("Correct!\n");
    return 0;
}
/*
 * Report the last CUDA runtime error, tagged with msg, and abort the
 * program. cudaGetLastError() both returns and clears the sticky error
 * state. Fix: the fprintf format string and its call site used
 * typographic quotes (a paste artifact), which is not a valid literal.
 * NOTE(review): exit() needs <stdlib.h> — confirm it is included in this
 * file's header block.
 */
void checkCUDAError(const char *msg) {
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
        exit(-1);
    }
}
// End of textureCopy.cpp
// Start of textureCopy.cu
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
/* Fix: the include used typographic quotes (paste artifact); the
   preprocessor requires plain double quotes. */
#include "textureCopy.h"
/* 1D float texture reference. Texture references must be file-scope and
   must live in the same nvcc-compiled translation unit as the kernel that
   samples them — which is exactly why the bind + launch are wrapped here
   rather than done in textureCopy.cpp. */
texture<float, 1, cudaReadModeElementType> tex;
/* Channel descriptor: one 32-bit float component, matching the texture. */
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32,0,0,0,cudaChannelFormatKindFloat);
/*
 * Kernel: each thread copies one element from the bound 1D texture `tex`
 * into d_b. The wrapper below launches a grid that over-covers dimA when
 * dimA is not a multiple of the block size; with the caller's dimA = 512
 * and 64 threads/block the coverage is exact. Add an element-count
 * parameter and an `if (idx < n)` guard before using other sizes.
 *
 * Fix: the paste stripped the underscores — the qualifier must be
 * `__global__`, not `global`.
 */
__global__ void CopyTextureKernel(float* d_b) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    d_b[idx] = tex1Dfetch(tex, idx);
}
/*
 * Host wrapper: binds d_a to the file-scope texture reference and launches
 * CopyTextureKernel to copy dimA floats into d_b. This function is the
 * only symbol textureCopy.cpp needs; it must be compiled by nvcc because
 * it touches the texture reference and uses <<<>>> launch syntax.
 *
 * Improvement: the cudaBindTexture result was silently discarded; a failed
 * bind would make the kernel read garbage. Check it and bail out early.
 */
void CopyTexture(float* d_b, float* d_a, int dimA) {
    int numThreadsPerBlock = 64;
    /* ceil(dimA / numThreadsPerBlock) */
    int numBlocks = dimA / numThreadsPerBlock +
                    (dimA % numThreadsPerBlock == 0 ? 0 : 1);

    tex.addressMode[0] = cudaAddressModeClamp;
    tex.filterMode = cudaFilterModePoint;  /* no interpolation */
    tex.normalized = false;                /* integer indexing for tex1Dfetch */

    cudaError_t err = cudaBindTexture(NULL, tex, d_a, channelDesc);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaBindTexture failed: %s\n", cudaGetErrorString(err));
        return;
    }

    CopyTextureKernel<<<numBlocks, numThreadsPerBlock>>>(d_b);
}
// End of textureCopy.cu