printf() not working in static library?

Hi all

After a week of frustration I have the following question.

I am making a static library of my cuda raytracer which I then want to link with my programs. It looks like I can link the static library but I’m not able to use printf() statements anymore.

My library is built from several files:

GPURaytracer.h

#ifndef __GPURAYTRACER_H

#define __GPURAYTRACER_H

#include "raytracer_kernel.h"

#include <vector>

#include "Dose/Photon/cube/cube.h"

#include "Dose/Photon/cube/cubeinfo.h"

#include "Dose/Photon/cube/cubecalc.h"

#include "CUDA_SDK/cutil.h"

#include "cuda.h"

#include "cuda_runtime.h"

#include "constants.h"

#include <iostream>

#include <cmath>

// Host-side front end for the GPU raytracer.
//
// VType is the element type of the CT dataset handed to the constructor.
// The block size and offsets used by DoTrace() are configured through
// blockSize() and offSet().
template <typename VType>
class GPURaytracer {
public:
    GPURaytracer(const float x1,
                 const float y1,
                 const float z1,
                 const std::vector<float>& x_plane,
                 const std::vector<float>& y_plane,
                 const std::vector<float>& z_plane,
                 const short dimx,
                 const short dimy,
                 const short dimz,
                 const float dx,
                 const float dy,
                 const float dz,
                 const VType* sData);

    ~GPURaytracer();

    // Launches the raytracing kernel(s) over the volume.
    void DoTrace();

    // Copies the radiological depth from the device into the given buffer.
    void RadiologicalDepth(float* radiologicalDepth_);

    // Sets the x/y offsets passed to the kernel launcher.
    void offSet(int offX, int offY);

    // Sets the x/y thread-block size used by DoTrace().
    void blockSize(int blockX, int blockY);

private:
    // Byte count used when copying the radiological depth back to the host.
    unsigned int memSize_;

    // Device buffer holding the radiological depth.
    float* d_radiologicalDepth_;

    // The CT dataset, passed to the kernel launcher as-is.
    VType* d_CT_dataset_;

    // NOTE(review): the original comment called this the grid dimension in
    // the y-direction, with a question mark; it is not used by the code shown.
    int gridX_;

    // Volume dimensions.
    short dimx_;
    short dimy_;
    short dimz_;

    // Thread-block size and offsets, see blockSize() / offSet().
    unsigned int blockSize_x_;
    unsigned int blockSize_y_;
    unsigned int offset_x_;
    unsigned int offset_y_;
};

// Constructor for GPURaytracer<VType>.
//
// The body was elided by the author of the post; according to the placeholder
// below it performs a number of cudaMemcpyToSymbol uploads.
// NOTE(review): presumably (x1, y1, z1) is the ray source position, the
// *_plane vectors are voxel boundary coordinates, dim* the volume size and
// d* the voxel spacing -- confirm against the full source.
// NOTE(review): no initialisation of the private members is visible here
// (memSize_, the device pointers, block size, offsets) -- confirm they are
// set in the elided part before DoTrace() is called.
template <class VType>

GPURaytracer<VType>::GPURaytracer(const float x1, const float y1, const float z1,

        const std::vector<float> & x_plane, const std::vector<float> & y_plane, const std::vector<float> & z_plane,

        const short int dimx, const short int dimy, const short int dimz,

        const float dx, const float dy, const float dz,

        const VType *sData)

{

   

   some cudaMemcpyToSymbols...

   

}

// Destructor: only logs that it ran.
// NOTE(review): nothing is released here; confirm who owns
// d_radiologicalDepth_ and d_CT_dataset_.
template <typename VType>
GPURaytracer<VType>::~GPURaytracer()
{
    std::cout << "Destructor called." << '\n' << std::flush;
}

// Copies memSize_ bytes of radiological depth from the device buffer
// d_radiologicalDepth_ into the caller-supplied host buffer.
//
// h_radiologicalDepth: destination of the device-to-host copy.
template <typename VType>
void GPURaytracer<VType>::RadiologicalDepth(float* h_radiologicalDepth)
{
    std::cout << "Returning radiological depth..." << '\n' << std::flush;

    float* const hostDst = h_radiologicalDepth;
    const float* const deviceSrc = d_radiologicalDepth_;

    CUDA_SAFE_CALL(cudaMemcpy(hostDst, deviceSrc, memSize_, cudaMemcpyDeviceToHost));
}

// Stores the x/y offsets that DoTrace() forwards to the kernel launcher.
// The signed arguments are converted to the unsigned member type.
template <typename VType>
void GPURaytracer<VType>::offSet(int offX, int offY)
{
    offset_x_ = static_cast<unsigned int>(offX);
    offset_y_ = static_cast<unsigned int>(offY);
}

// Stores the x/y thread-block size that DoTrace() uses for its launches.
// The signed arguments are converted to the unsigned member type.
template <typename VType>
void GPURaytracer<VType>::blockSize(int blockX, int blockY)
{
    blockSize_x_ = static_cast<unsigned int>(blockX);
    blockSize_y_ = static_cast<unsigned int>(blockY);
}

// Runs the raytracer over the whole volume.
//
// The x/y extent (dimx_ x dimy_) is split into:
//   1. the interior, covered by a x b full blocks of blockSize_x_ x blockSize_y_,
//   2. a strip of leftover columns in x (if dimx_ is not a multiple of the block),
//   3. the corner where the leftover columns and leftover rows meet,
//   4. a strip of leftover rows in y.
// Each part is one call to gpu_raytracer(); every launch covers all dimz_
// slices by multiplying the grid's x dimension by dimz_.
//
// Changes with respect to the original implementation:
//   - The edge-strip offsets are full-block-size * block-count. The original
//     computed them after the block-size member had already been overwritten
//     with the remainder, giving remainder * block-count.
//   - The block size and offset members are no longer modified, so DoTrace()
//     can be called more than once with the same configuration.
//   - The block counts use plain integer division instead of floor() on an
//     already-truncated integer quotient.
//   - A zero block size or a non-positive dimension returns early instead of
//     dividing by zero.
//
// NOTE(review): as in the original, the edge strips are positioned relative
// to 0, not to the offsets given to offSet(); confirm this is intended when a
// non-zero base offset is used.
//
// TODO: handle volumes whose x or y dimension is smaller than one block
// (a == 0 or b == 0); nothing is traced in that case.
template <class VType>
void GPURaytracer<VType>::DoTrace()
{
    std::cout << "Raytracing..." << std::endl;

    if (blockSize_x_ == 0 || blockSize_y_ == 0 ||
        dimx_ <= 0 || dimy_ <= 0 || dimz_ <= 0) {
        std::cerr << "GPURaytracer::DoTrace: block size and dimensions must be positive."
                  << std::endl;
        return;
    }

    const unsigned int fullX  = blockSize_x_;
    const unsigned int fullY  = blockSize_y_;
    const unsigned int sizeX  = static_cast<unsigned int>(dimx_);
    const unsigned int sizeY  = static_cast<unsigned int>(dimy_);
    const unsigned int slices = static_cast<unsigned int>(dimz_);

    // Number of full blocks that fit in x and y.
    const int a = static_cast<int>(sizeX / fullX);
    const int b = static_cast<int>(sizeY / fullY);

    std::cout << "a: " << a << "\t" << "b: " << b << std::endl;

    if (a <= 0 || b <= 0) {
        return;
    }

    printf("line 133\n");

    // Leftover columns/rows and where they start.
    const unsigned int remX        = sizeX - fullX * a;
    const unsigned int remY        = sizeY - fullY * b;
    const unsigned int edgeOffsetX = fullX * a;
    const unsigned int edgeOffsetY = fullY * b;

    // 1. Interior: a x b full blocks per slice.
    gpu_raytracer(d_radiologicalDepth_, d_CT_dataset_,
                  a, fullX, fullY,
                  offset_x_, offset_y_,
                  dim3(fullX, fullY), dim3(a * slices, b));

    // 2. Leftover columns in x: one narrow block per row of blocks.
    if (remX > 0) {
        gpu_raytracer(d_radiologicalDepth_, d_CT_dataset_,
                      1, remX, fullY,
                      edgeOffsetX, offset_y_,
                      dim3(remX, fullY), dim3(slices, b));
    }

    // 3. Corner: leftover columns x leftover rows.
    if (remX > 0 && remY > 0) {
        gpu_raytracer(d_radiologicalDepth_, d_CT_dataset_,
                      1, remX, remY,
                      edgeOffsetX, edgeOffsetY,
                      dim3(remX, remY), dim3(slices, 1));
    }

    // 4. Leftover rows in y: one short block per column of blocks.
    if (remY > 0) {
        gpu_raytracer(d_radiologicalDepth_, d_CT_dataset_,
                      a, fullX, remY,
                      0u, edgeOffsetY,
                      dim3(fullX, remY), dim3(a * slices, 1));
    }
}

#endif

This includes raytracer_kernel.h:

#ifndef __RAYTRACER_KERNEL__

#define __RAYTRACER_KERNEL__

//#ifdef __cplusplus

#include "cuda.h"

#include "cuda_runtime.h"

// Host-callable launcher for the raytracer kernel (defined in
// raytracer_kernel.cu).
//
// radiologicalPath, rho, gridX, blockSize_x/y and offset_x/y are forwarded to
// the kernel unchanged; dimBlock and dimGrid are the launch configuration.
// rho is a short int pointer, so GPURaytracer<VType>::DoTrace(), which passes
// its VType* dataset here, can only be instantiated with VType = short int.
void gpu_raytracer(float *radiologicalPath, short int *rho,

        int gridX, unsigned int blockSize_x, unsigned int blockSize_y,

        unsigned int offset_x, unsigned int offset_y, dim3 dimBlock, dim3 dimGrid);

#endif

raytracer_kernel.cu

//calculate for all voxels the delta x, y and z

#include <stdio.h>

#include <cstdio>

#include <cassert>

#include "constants.h"

#include "raytracer_kernel.h"

#include "compute_RD_shell.cu"

// Raytracer kernel, launched by gpu_raytracer() with the caller's
// dimGrid/dimBlock and the same argument values.
//
// The body was elided by the author of the post; per the placeholder it also
// contains the printf() calls the thread is about.
// NOTE(review): presumably radiologicalPath is the output buffer and rho the
// CT density input -- confirm against the full source.
__global__ void raytracer_kernel(float *radiologicalPath, short int *rho,

        int gridX, unsigned int blockSize_x, unsigned int blockSize_y,

        unsigned int offset_x, unsigned int offset_y) {

  Some fancy stuff happens here...

   and also the printf()s

}

// Host-side launcher for raytracer_kernel.
//
// Launches the kernel with the given grid/block configuration and forwards
// all other arguments unchanged. The launch status is checked afterwards and
// a failure is reported on stderr: a kernel launch does not return an error
// itself, so without this check a failed launch passes silently and the
// kernel (including any printf() in it) simply never runs.
void gpu_raytracer(float *radiologicalPath, short int *rho,
        int gridX, unsigned int blockSize_x, unsigned int blockSize_y,
        unsigned int offset_x, unsigned int offset_y, dim3 dimBlock, dim3 dimGrid)
{
    raytracer_kernel<<<dimGrid, dimBlock>>>(radiologicalPath, rho,
                                            gridX, blockSize_x, blockSize_y,
                                            offset_x, offset_y);

    // Reads (and clears) the error recorded by the launch above.
    const cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess) {
        fprintf(stderr, "gpu_raytracer: kernel launch failed: %s\n",
                cudaGetErrorString(launchStatus));
    }
}

And the raytracer_kernel.cu calls a function in compute_RD_shell.cu which calls some functions in computeRD.cu

If I make these files using the nvcc -c -deviceemu mode

and after that combine all the object files into an .a file with ar

it all looks like it is going fine.

But when I then build my TestRayTracer.C file and link against the right libraries, I’m not getting any errors, but it’s also not showing any printf() output.

I hope someone can comment on this strange behavior.

Thanks,

Jordy

  1. Are you compiling both raytracer_kernel.cu and compute_RD_shell.cu with nvcc? Since you include one into the other, you should only compile raytracer_kernel.cu.

  2. Are you linking with the cudart library in the final linking step?

  3. Is this CUDA 1.1 or CUDA 1.0? You may need to link with the cuda library in 1.0 for device emulation.

  4. Are you running in debug mode and checking for errors? Is it possible that your program is exiting before you even reach a kernel call?

  5. Try querying for devices at the beginning of main and printing out the available ones (like the deviceQuery sample in the SDK).

Here is a test that I did which worked without a problem.

printf_test.h

extern "C" void printf_test();

printf_test.cu

#include "printf_test.h"

#include <stdio.h>

// Test kernel: every thread prints its threadIdx.x on a line of its own.
__global__ void printf_test_kernel()
{
    const unsigned int tid = threadIdx.x;
    printf("%d\n", tid);
}

// Launches printf_test_kernel with one block of 32 threads.
//
// The launch status is checked and the call waits for the kernel to finish,
// so that a failed launch is reported on stderr instead of silently producing
// no output, and the kernel's printf() output is complete when this returns.
void printf_test()
{
    printf_test_kernel<<<1, 32>>>();

    // Launch-configuration errors show up here; execution errors show up at
    // the synchronisation that follows.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        status = cudaThreadSynchronize();
    }
    if (status != cudaSuccess) {
        fprintf(stderr, "printf_test: kernel failed: %s\n",
                cudaGetErrorString(status));
    }
}

printf_test_main.cc

#include "printf_test.h"

int main()

    {

    printf_test();

    }

I compiled with

$ nvcc -c printf_test.cu -deviceemu

$ g++ -o printf_test printf_test_main.cc printf_test.o -lcudart -L/opt/cuda/lib

And I get the expected output when running ./printf_test

SORRY FOR THE BIG MESS!!!

/home/jcvaneijk/work/Algorithms/GPURayTrace% make

nvcc --compile -deviceemu   -I/usr/local/cuda/include -I../../MAKEHOME/.. -I/usr/local/cuda/include/ -I/usr/X11R6/include -I/usr/local/Trolltech/Qt-4.3.1/include -I/usr/local/Trolltech/Qt-4.3.1/include/QtXml -I/usr/local/Trolltech/Qt-4.3.1/include/QtNetwork -I/usr/local/Trolltech/Qt-4.3.1/include/QtCore   compute_RD_shell.cu -o release-LX/compute_RD_shell.o

nvcc --compile -deviceemu   -I/usr/local/cuda/include -I../../MAKEHOME/.. -I/usr/local/cuda/include/ -I/usr/X11R6/include -I/usr/local/Trolltech/Qt-4.3.1/include -I/usr/local/Trolltech/Qt-4.3.1/include/QtXml -I/usr/local/Trolltech/Qt-4.3.1/include/QtNetwork -I/usr/local/Trolltech/Qt-4.3.1/include/QtCore   raytracer_kernel.cu -o release-LX/raytracer_kernel.o

nvcc --compile -deviceemu   -I/usr/local/cuda/include -I../../MAKEHOME/.. -I/usr/local/cuda/include/ -I/usr/X11R6/include -I/usr/local/Trolltech/Qt-4.3.1/include -I/usr/local/Trolltech/Qt-4.3.1/include/QtXml -I/usr/local/Trolltech/Qt-4.3.1/include/QtNetwork -I/usr/local/Trolltech/Qt-4.3.1/include/QtCore   testkernel.cu -o release-LX/testkernel.o

nvcc --compile -deviceemu   -I/usr/local/cuda/include -I../../MAKEHOME/.. -I/usr/local/cuda/include/ -I/usr/X11R6/include -I/usr/local/Trolltech/Qt-4.3.1/include -I/usr/local/Trolltech/Qt-4.3.1/include/QtXml -I/usr/local/Trolltech/Qt-4.3.1/include/QtNetwork -I/usr/local/Trolltech/Qt-4.3.1/include/QtCore   updateRD.cu -o release-LX/updateRD.o

        Creating library identification

        Archiving library release-LX/libGPURayTrace.a

ar cru release-LX/libGPURayTrace.a  release-LX/.Id.o release-LX/compute_RD_shell.o release-LX/raytracer_kernel.o release-LX/testkernel.o release-LX/updateRD.o

        Making link ../../MAKEHOME/lib/release-LX/libGPURayTrace.a to release-LX/libGPURayTrace.a

1. I am compiling all the .cu files with nvcc.

2, 3.

/home/jcvaneijk/work/Applications/TestRayTrace% make

g++   -O3 -march=pentium-m -mfpmath=sse   -Wno-deprecated  -DFUNCPROTO -DLINUX -D_ALL_SOURCE  -DRW_NO_UNBUFFERED  -D__RELEASE_VERSION   -I../../MAKEHOME/.. -I/usr/local/cuda/include -I/usr/X11R6/include -I/usr/local/Trolltech/Qt-4.3.1/include -I/usr/local/Trolltech/Qt-4.3.1/include/QtXml -I/usr/local/Trolltech/Qt-4.3.1/include/QtNetwork -I/usr/local/Trolltech/Qt-4.3.1/include/QtCore -fopenmp -Wp,-MD,release-LX/raytrace_benchmark.pp -c raytrace_benchmark.C -o release-LX/raytrace_benchmark.o

../../MAKEHOME/../Algorithms/GPURayTrace/GPURaytracer.h: In member function ‘void GPURaytracer<VType>::DoTrace() [with VType = short int]’:

raytrace_benchmark.C:295:   instantiated from here

../../MAKEHOME/../Algorithms/GPURayTrace/GPURaytracer.h:166: warning: converting to ‘int’ from ‘double’

../../MAKEHOME/../Algorithms/GPURayTrace/GPURaytracer.h:167: warning: converting to ‘int’ from ‘double’

g++   -O3 -march=pentium-m -mfpmath=sse   -Wno-deprecated  -DFUNCPROTO -DLINUX -D_ALL_SOURCE  -DRW_NO_UNBUFFERED  -D__RELEASE_VERSION   -I../../MAKEHOME/.. -I/usr/local/cuda/include -I/usr/X11R6/include -I/usr/local/Trolltech/Qt-4.3.1/include -I/usr/local/Trolltech/Qt-4.3.1/include/QtXml -I/usr/local/Trolltech/Qt-4.3.1/include/QtNetwork -I/usr/local/Trolltech/Qt-4.3.1/include/QtCore -fopenmp -Wp,-MD,release-LX/main.pp -c main.C -o release-LX/main.o

nvcc --compile   -I/usr/local/cuda/include -I../../MAKEHOME/.. -I/usr/local/cuda/include -I/usr/X11R6/include -I/usr/local/Trolltech/Qt-4.3.1/include -I/usr/local/Trolltech/Qt-4.3.1/include/QtXml -I/usr/local/Trolltech/Qt-4.3.1/include/QtNetwork -I/usr/local/Trolltech/Qt-4.3.1/include/QtCore   try.cu -o release-LX/try.o

        Creating application identification

        Linking executable release-LX/TestRayTrace 

g++  release-LX/.Id.o release-LX/raytrace_benchmark.o release-LX/main.o release-LX/try.o   -L../../MAKEHOME/lib/release-LX -L/usr/local/cuda/lib -lPlans -lBeamSetup -lDoseCalcAdapter -lDoseCalcAdapterAMC -lDosimetricData -lPatientHandler -lPlotDVH -lDVH -lDoseConvolution -ldc -lFluenceUtil -lcube -lutil -lGPURayTrace -lRayTraceDepths -lRayTrace -lFFTW_Convolve -lrfftw -lfftw -lGrid -lSimPlan -lUMS_IO -lPatientFile -lGeometry -lDecisionProtocol -lUtils -lAlgorithms -lSerializer -lrwtool -lnr -lrmplatform -lgomp -lcuda -lcudart -ltlshook -lm  -o release-LX/TestRayTrace

        Stripping executable release-LX/TestRayTrace

strip release-LX/TestRayTrace

        Making link ../../MAKEHOME/install/release-LX/TestRayTrace to release-LX/TestRayTrace

4. I am running in release mode. I will switch to debug mode right away, but it takes some time to change the parameters. :)

5. I don’t get this one. Do you want me to check whether a device is available? I know that there is one. And I only want to print in -deviceemu mode, so I don’t need a device, right?

And about your last test: I already did that one myself, and it all worked fine. But I’m now doing this with a library and not with separate .o files.

  1. Regarding compiling all .cu files separately: I don’t see how this would cause a problem with you seeing the printf’s or not. But you will have problems down the road if you try to share textures or constant memory between multiple .cu files. See some other recent posts on the forums for more information on this.

Also I’m not sure why you don’t get multiply defined symbol errors since you both #include a .cu file and compile it into an object file.

2 and 3 look covered.

  5. When running in device emulation mode, CUDA reports 1 CUDA capable device named “Emulation” or something like that. Or at least I thought it did… Maybe I was confusing this with something else.

Regarding 5: I was mistaken. I just tested the deviceQuery sample compiled in emulation mode and it still reports CUDA capable hardware. I was thinking of what it does when cuda.so is missing, where it does report an “emulation” device.

  1. I will never use textures, nor use my constants anywhere else, so I don’t think this will ever be a problem.

I still have a question how can I run the code in debug mode instead of release mode. Or how can I see what mode I’m in? I did take a look at the Makefile of the SDK they use -D_DEBUG and -g as compile options

I also tried this:

// Debug variant of the kernel launcher.
//
// Announces the launch on stdout, launches raytracer_kernel with the given
// configuration, waits for it to finish, and prints the last CUDA error if
// there is one.
void gpu_raytracer(float *radiologicalPath, short int *rho,
        int gridX, unsigned int blockSize_x, unsigned int blockSize_y,
        unsigned int offset_x, unsigned int offset_y, dim3 dimBlock, dim3 dimGrid)
{
    printf("Calling raytracer kernel...\n");

    raytracer_kernel<<<dimGrid, dimBlock>>>(radiologicalPath, rho,
                                            gridX, blockSize_x, blockSize_y,
                                            offset_x, offset_y);

    // Block until the kernel has completed so that execution errors are
    // visible to the check below.
    cudaThreadSynchronize();

    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        return;
    }

    printf("error :%s\n", cudaGetErrorString(status));
}

and get this error:

Calling raytracer kernel…

error :device emulation mode and device execution mode cannot be mixed

I’m at a loss. You can get this error when some files are compiled with -deviceemu and others are not. There has to be something funny in your build system, but it is not obvious.

  1. Is it possible that the way your makefile is updating the static library file somehow leaves something behind? Have you tried deleting the .a files and building from scratch?

  2. Why mess around with the static library anyway? You are just using it as a “convenience library”, so why not try linking the object files into the executable directly?

Here is a slightly more radical solution, but not a bad one if your project is as small as it appears from the quoted output.

  1. Try CMake (www.cmake.org) as a build system, coupled with FindCUDA.cmake (google it or search the forums). It has never let me down and I constantly change between release, debug, emulation, and device builds. The only catch is that you need to do a make clean after switching between emulation<–>device.
  1. I always do a “make clean” before I “make” and it looks like it is deleting all the .o .a and .pp and what else it makes :P

  2. We are building this as a library because we have the intention to use it in various other clinical systems… But before we can do that we need to test it…

  3. The “make” file we use at the moment is very complex… Over 1500 rows / rules… Every other application built at this department is built automatically, etc.: all the dependencies that are needed are built and checked for updates… So it is very hard to change things in the makefile and still be able to build all the other programs…

I will take a look at that “cmake” but I don’t know if we can use it in this setup…

But I’m kinda frustrated because it all works independently but now that we want to integrate it with each other it isn’t working anymore…

I hope someday we can achieve interactive dose calculation :D

But does “make clean” delete the static library, too? I.e. not just the object files.

Ok, one last idea then. Have you tried a shared library instead of a static one?

Maybe someone else on the forums has an idea because I don’t know of anything else to try.

I will do a “make” and then a “make clean” and do a “find” on the library name to see if there are some instances left behind. If this is indeed the thing what is happening then I will try to “make” a shared lib. need to find out how I’m going to do that in this setup.

And the others are always welcome to post their ideas ;)

because I ran out of ideas a week ago :P

Just an idea, but you probably already did it.

make clean
change your build rule for cuda files to also echo the commandline that is about to be executed.
maybe you can also add a verbose flag to nvcc??
run make while capturing all output to a file.
check if all your nvcc calls have deviceemu on the commandline

that is at least how I would start to debug it, so chances are you did that weeks ago too :/

As you can see I already echo the make rule while printing and it is saying -deviceemu for all 4 files. I also added the -v but I don’t know what I’m looking at exactly. And I can’t print the verbose make to a file (anyone know what output that is stderr / stdout?) while doing “make &>makeLog”

@mr. Anderson:

the make clean deletes all the .o files the .a files and also the link

hmm, to redirect error output as well you have to do some magic like “make > makeLog 2>&1” I believe, wait googling for it

http://www.mathinfo.u-picardie.fr/asch/f/M…edirection.html

I finally know what it was… I had the directory where all my files were for my library. In this directory everything went smooth…
Then I also had another directory with my application which linked the library, in this directory was also a .cu file which was compiled upon “make” and because this nvcc had no -deviceemu (because I didn’t know there was a .cu file in the directory) it was compiled in device execution mode. And this gave the problems…

I didn’t see it because I never looked there.

But thank you all for helping me out…

Hmm, looking at the time of the post, some cosmic ray struck both of us. For you it meant you found your trouble, for me it means my PC doesn’t boot anymore :angry:

At least one of us had benefit from it ;)

Indeed, that one was me I think ;)

Hi all.

I don’t know if it is related to this post, but it could be. When I compile CUDA code to generate a static library, i.e., something like:

$ nvcc file1.cu … -lib -o libsomething.a

the resulting generated library does NOT substitute the old one, but it is added to it instead (so, in fact, the library keeps on growing after each new compilation). Thus, I must manually remove the old file each time, this way:

$ rm libsomething.a; nvcc file1.cu … -lib -o libsomething.a

I think it could be a small error in nvcc, because if you don’t manually remove the old library, then old versions of the recompiled functions stay on it, resulting in replicated versions of the functions!! (as you can check using the nm command):

$ nm libsomething.a
… several versions of the same functions …

Am I right?

Greetings,
Pedro E.