Making an object file to use in another C++ project — I need some guidance, please

Hi guys,

I need to use a cuda kernel in my cpp project, but my project already has a Makefile. Since I don’t have much experience in working with Makefiles, I really need your assistance.

So I have my kernel and a kernel caller function (called “gpucaller”), both inside my cuda_kernel.cu file.

#ifndef _CUDA_KERNEL_H_

#define _CUDA_KERNEL_H_

#define MAX_QUBITS 25

#define BLOCKDIM 512

#define MAX_TERMS_PER_BLOCK (2*BLOCKDIM)

// includes

#include <iostream>

using namespace std;

#include <cutil_inline.h>

//#include "operator.h"

#define THREAD_MASK (~0ul << 1)

__constant__ float devOpBit[2][2];

/**
* Kernel qcl1. The body is elided ("...") in this listing, so only the
* interface and the declarations below are documented.
*
* a            - array of complex terms; the host caller passes a device buffer
*                holding N cuFloatComplex values
* N            - number of terms
* qbCount      - the number of gates to apply (the host caller passes at most 10
*                on the first launch and 1 on every later launch)
* blockGrpSize - the host caller passes 1 on the first launch and doubles it
*                before each later launch; exact meaning not visible here
* k            - the host caller passes 0 on the first launch and increments it
*                before each later launch; exact meaning not visible here
*
* NOTE(review): an earlier version of this comment described "qbStart - index of
* the first qubit to apply the gates on"; the signature has no such parameter.
*
* The host caller launches a 1-D grid with BLOCKDIM threads per block and one
* block per MAX_TERMS_PER_BLOCK terms.
*/

__global__ void qcl1(cuFloatComplex *a, int N, int qbCount, int blockGrpSize, int k)

{

	//int idx = blockIdx.x * BLOCKDIM + threadIdx.x;

	//int tx = threadIdx.x;

	

	// Four complex temporaries and their index variables; their use is in the elided body.
	cuFloatComplex t0_0, t0_1, t1_0, t1_1;

	int x0_idx, x1_idx;

	int i, grpSize, b0_idx, b1_idx;

	

	// Statically sized per-block shared buffer: MAX_TERMS_PER_BLOCK (2*BLOCKDIM) complex values.
	__shared__ cuFloatComplex aS[MAX_TERMS_PER_BLOCK];

		

	...

}

/**
 * Host-side driver for qcl1: uploads the terms of `q` to the device, applies
 * the operator `op` with one or more kernel launches, and copies the result
 * back into q->termsarray.
 *
 * The definition is extern "C" so that its symbol name matches the
 * extern "C" declaration used by the calling translation unit; without it
 * nvcc emits a C++-mangled name and the link fails with an undefined
 * reference to gpucaller.
 *
 * NOTE(review): opBit and quState must be declared before this point. The
 * only project include in this file ("operator.h") is commented out; it has
 * to be enabled and its directory passed to nvcc with -I.
 *
 * @param op  operator; its matrix is obtained from getDeviceReadyOpBit() and
 *            copied to the __constant__ array devOpBit
 * @param q   state; mapbits() is the qubit count and termsarray holds the
 *            2^mapbits() terms, which are overwritten with the result
 */
extern "C"
void gpucaller(opBit* op, quState* q) {
    // devOpBit is float[2][2]. Copying 16 bytes straight from a double** would
    // upload the row pointers rather than matrix values, so convert
    // element-wise into a host buffer with the same layout as devOpBit.
    // NOTE(review): assumes myOpBit points to two rows of two doubles, and that
    // the callee keeps ownership of that storage -- confirm in opBit.
    double** myOpBit = op->getDeviceReadyOpBit();
    float hostOpBit[2][2];
    for (int row = 0; row < 2; ++row) {
        for (int col = 0; col < 2; ++col) {
            hostOpBit[row][col] = static_cast<float>(myOpBit[row][col]);
        }
    }

    const int qubits = q->mapbits();
    const long int N = 1L << qubits;                       // number of terms
    const size_t size = sizeof(cuFloatComplex) * N;        // bytes in the state

    // start timer
    unsigned int timer = 0;
    cutilCheckError( cutCreateTimer( &timer));
    cutilCheckError( cutStartTimer( timer));

    // allocate device memory and copy the host terms to it
    cuFloatComplex *a_d = NULL;
    cutilSafeCall( cudaMalloc((void**)&a_d, size) );
    cutilSafeCall( cudaMemcpy(a_d, q->termsarray, size, cudaMemcpyHostToDevice) );

    // copy quantic operator to constant memory
    cutilSafeCall( cudaMemcpyToSymbol(devOpBit, hostOpBit, sizeof(hostOpBit), 0) );
    printf("Cuda errors: %s\n", cudaGetErrorString( cudaGetLastError() ) );

    // setup execution parameters: one block per MAX_TERMS_PER_BLOCK terms, rounded up
    dim3 dimBlock(BLOCKDIM, 1, 1);
    const int n_blocks = (int)((N + MAX_TERMS_PER_BLOCK - 1) / MAX_TERMS_PER_BLOCK);
    dim3 dimGrid(n_blocks, 1, 1);

    int blockGrpSize = 1;
    int k = 0;
    int qbCount = qubits;

    // The first launch applies up to 10 gates at once ...
    const int gates = (qbCount > 10) ? 10 : qbCount;
    qbCount -= gates;

    qcl1<<< dimGrid, dimBlock >>>(a_d, (int)N, gates, blockGrpSize, k);
    // check if kernel execution generated an error
    cutilCheckMsg("Kernel execution failed");

    // ... every remaining gate gets its own launch with a doubled group size.
    while (qbCount > 0) {
        qbCount--;
        blockGrpSize *= 2;
        k++;

        qcl1<<< dimGrid, dimBlock >>>(a_d, (int)N, 1, blockGrpSize, k);
        // check if kernel execution generated an error
        cutilCheckMsg("Kernel execution failed");
    }

    // copy result from device to host (blocking, so the kernels have finished)
    cutilSafeCall( cudaMemcpy(q->termsarray, a_d, size, cudaMemcpyDeviceToHost) );

    // stop timer
    cutilCheckError( cutStopTimer( timer));
    printf( "Optimized - GPU Processing time: %f (ms)\n", cutGetTimerValue( timer));
    cutilCheckError( cutDeleteTimer( timer));

    // cleanup memory on device
    cutilSafeCall( cudaFree(a_d) );
    cudaThreadExit();
}

#endif // #ifndef _CUDA_KERNEL_H_

In the project’s source file where I need to call “gpucaller” I’ve declared it like this:

// C linkage: the definition in cuda_kernel.cu has to be marked extern "C" as
// well, otherwise the linker looks for an unmangled symbol that is never emitted.
extern "C" {
    void gpucaller(opBit* op, quState* q);
}

There is only one call to “gpucaller” within my project’s source file.

So what I think I need is to generate an object file for the .cu file, then link it to the other .o files generated by my project’s Makefile. I hope that I got this right.

For the first task (i.e., generating the object file for the .cu file), I was thinking that I should use the Makefile from the SDK, but modify it so that it only produces my object file.

The SDK’s Makefile consists of these 2 files:

################################################################################

#

# Copyright 1993-2006 NVIDIA Corporation.  All rights reserved.

#

# NOTICE TO USER:   

#

# This source code is subject to NVIDIA ownership rights under U.S. and 

# international Copyright laws.  

#

# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 

# CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 

# IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH 

# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 

# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.   

# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 

# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 

# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 

# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 

# OR PERFORMANCE OF THIS SOURCE CODE.  

#

# U.S. Government End Users.  This source code is a "commercial item" as 

# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of 

# "commercial computer software" and "commercial computer software 

# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 

# and is provided to the U.S. Government only as a commercial end item.  

# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 

# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 

# source code with only those rights set forth herein.

#

################################################################################

#

# Build script for project

#

################################################################################

# Add source files here
# Name of the binary that common.mk links.
EXECUTABLE	:= template

# CUDA source files (compiled with nvcc)
CUFILES		:= template.cu

# CUDA dependency files: listed as prerequisites of every .cu object in
# common.mk, not compiled on their own.
# Each list is written on one line: a blank line after a trailing backslash
# ends the assignment and leaves the following entry outside the variable.
CU_DEPS		:= template_kernel.cu

# C/C++ source files (compiled with gcc / c++)
CCFILES		:= template_gold.cpp

################################################################################
# Rules and targets
include ../../common/common.mk

and common.mk

################################################################################

#

# Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.

#

# NVIDIA Corporation and its licensors retain all intellectual property and 

# proprietary rights in and to this software and related documentation. 

# Any use, reproduction, disclosure, or distribution of this software 

# and related documentation without an express license agreement from

# NVIDIA Corporation is strictly prohibited.

#

# Please refer to the applicable NVIDIA end user license agreement (EULA) 

# associated with this source code for terms and conditions that govern 

# your use of this NVIDIA software.

#

################################################################################

#

# Common build script for CUDA source projects for Linux and Mac platforms

#

################################################################################

.SUFFIXES : .cu .cu_dbg.o .c_dbg.o .cpp_dbg.o .cu_rel.o .c_rel.o .cpp_rel.o .cubin .ptx

# Add new SM Versions here as devices with new Compute Capability are released
SM_VERSIONS   := 10 11 12 13 20

# Toolkit root: taken from the environment if set, overridden by
# `make cuda-install=<dir>`.
CUDA_INSTALL_PATH ?= /usr/local/cuda
ifdef cuda-install
    CUDA_INSTALL_PATH := $(cuda-install)
endif

# detect OS
# The tr character classes are quoted so the shell cannot glob-expand them
# against one-letter file names in the build directory, and := runs each
# probe once instead of on every expansion.
OSUPPER := $(shell uname -s 2>/dev/null | tr '[:lower:]' '[:upper:]')
OSLOWER := $(shell uname -s 2>/dev/null | tr '[:upper:]' '[:lower:]')

# 'linux' is output for Linux system, 'darwin' for OS X
DARWIN := $(strip $(findstring DARWIN, $(OSUPPER)))
ifneq ($(DARWIN),)
    SNOWLEOPARD := $(strip $(findstring 10.6, $(shell egrep "<string>10\.6" /System/Library/CoreServices/SystemVersion.plist)))
endif

# detect 32-bit or 64-bit platform (HP_64 is empty when `uname -m` contains no "64")
HP_64  := $(shell uname -m | grep 64)
OSARCH := $(shell uname -m)

# Basic directory setup for SDK
# (override directories only if they are not already defined)
SRCDIR     ?=
ROOTDIR    ?= ..
ROOTBINDIR ?= $(ROOTDIR)/../bin
BINDIR     ?= $(ROOTBINDIR)/$(OSLOWER)
ROOTOBJDIR ?= obj
LIBDIR     := $(ROOTDIR)/../lib
COMMONDIR  := $(ROOTDIR)/../common
SHAREDDIR  := $(ROOTDIR)/../../shared/

# Compilers
NVCC       := $(CUDA_INSTALL_PATH)/bin/nvcc
CXX        := g++ -fPIC
CC         := gcc -fPIC
LINK       := g++ -fPIC

# Includes (INCLUDES may already carry -I flags from the including Makefile)
INCLUDES  += -I. -I$(CUDA_INSTALL_PATH)/include -I$(COMMONDIR)/inc -I$(SHAREDDIR)/inc

# Warning flags
# Each list is one logical line: the backslash must be the last character of a
# line and the next entry must follow immediately. A blank line in between ends
# the assignment and leaves the remaining flags outside the variable.
CXXWARN_FLAGS := \
	-W -Wall \
	-Wimplicit \
	-Wswitch \
	-Wformat \
	-Wchar-subscripts \
	-Wparentheses \
	-Wmultichar \
	-Wtrigraphs \
	-Wpointer-arith \
	-Wcast-align \
	-Wreturn-type \
	-Wno-unused-function \
	$(SPACE)

# C-only warnings on top of the shared set. The list ends without a trailing
# backslash so it cannot absorb whatever line follows it.
CWARN_FLAGS := $(CXXWARN_FLAGS) \
	-Wstrict-prototypes \
	-Wmissing-prototypes \
	-Wmissing-declarations \
	-Wnested-externs \
	-Wmain

# architecture flag for nvcc and gcc compilers build
CUBIN_ARCH_FLAG :=
CXX_ARCH_FLAGS  :=
NVCCFLAGS       :=

# Step 1: choose the target word size.
#   x86_64=1 forces 64-bit, otherwise i386=1 forces 32-bit, otherwise the host
#   decides (HP_64 is empty on a host whose `uname -m` contains no "64").
ifeq ($(x86_64),1)
    LIB_ARCH := x86_64
else
    ifeq ($(i386),1)
        LIB_ARCH := i386
    else
        ifeq "$(strip $(HP_64))" ""
            LIB_ARCH := i386
        else
            LIB_ARCH := x86_64
        endif
    endif
endif
CUDPPLIB_SUFFIX := $(LIB_ARCH)

# Step 2: translate the choice into compiler flags (Darwin gcc takes -arch,
# everything else takes -m32 / -m64).
ifeq ($(LIB_ARCH),x86_64)
    NVCCFLAGS += -m64
    ifneq ($(DARWIN),)
        CXX_ARCH_FLAGS += -arch x86_64
    else
        CXX_ARCH_FLAGS += -m64
    endif
else
    NVCCFLAGS += -m32
    ifneq ($(DARWIN),)
        CXX_ARCH_FLAGS += -arch i386
    else
        CXX_ARCH_FLAGS += -m32
    endif
endif

# Device code generated by default: binary + PTX for compute capability 1.0 and
# 2.0 (the SMVERSION template further down uses its own -gencode instead of SM10).
GENCODE_SM10 := -gencode=arch=compute_10,code=\"sm_10,compute_10\"
GENCODE_SM20 := -gencode=arch=compute_20,code=\"sm_20,compute_20\"

# Host compiler and linker flags.
CXXFLAGS  += $(CXXWARN_FLAGS) $(CXX_ARCH_FLAGS)
CFLAGS    += $(CWARN_FLAGS) $(CXX_ARCH_FLAGS)
LINKFLAGS +=
LINK      += $(LINKFLAGS) $(CXX_ARCH_FLAGS)

# On Mac, embed the toolkit library path so DYLD_LIBRARY_PATH is not required.
ifneq ($(DARWIN),)
    LINK += -Xlinker -rpath $(CUDA_INSTALL_PATH)/lib
endif

# Flags shared by nvcc, g++ and gcc.
COMMONFLAGS += $(INCLUDES) -DUNIX

# Release is the default configuration; `make dbg=1` selects debug.
ifneq ($(dbg),1)
    COMMONFLAGS += -O2
    BINSUBDIR   := release
    LIBSUFFIX   :=
    NVCCFLAGS   += --compiler-options -fno-strict-aliasing
    CXXFLAGS    += -fno-strict-aliasing
    CFLAGS      += -fno-strict-aliasing
else
    COMMONFLAGS += -g
    NVCCFLAGS   += -D_DEBUG
    CXXFLAGS    += -D_DEBUG
    CFLAGS      += -D_DEBUG
    BINSUBDIR   := debug
    LIBSUFFIX   := D
endif

# architecture flag for cubin build
CUBIN_ARCH_FLAG :=

# OpenGL is used or not (if it is used, then it is necessary to include GLEW)
ifeq ($(USEGLLIB),1)
    ifneq ($(DARWIN),)
        OPENGLLIB := -L/System/Library/Frameworks/OpenGL.framework/Libraries
        OPENGLLIB += -lGL -lGLU $(COMMONDIR)/lib/$(OSLOWER)/libGLEW.a
    else
        # this case for linux platforms
        OPENGLLIB := -lGL -lGLU -lX11 -lXi -lXmu
        # GLEW follows the target word size already resolved into LIB_ARCH
        # (x86_64=1, i386=1, or the host default). Selecting on LIB_ARCH adds
        # GLEW exactly once; two separate x86_64/i386 checks would each fall
        # through to the host default and append it twice when neither
        # variable is given.
        ifeq ($(LIB_ARCH),x86_64)
            OPENGLLIB += -lGLEW_x86_64 -L/usr/X11R6/lib64
        else
            OPENGLLIB += -lGLEW -L/usr/X11R6/lib
        endif
    endif
endif

ifeq ($(USEGLUT),1)
    ifneq ($(DARWIN),)
        OPENGLLIB += -framework GLUT
    else
        # An explicit x86_64=1 / i386=1 also adds the matching system library
        # directory; with neither set only -lglut is added.
        ifeq ($(x86_64),1)
            OPENGLLIB += -lglut -L/usr/lib64
        endif
        ifeq ($(i386),1)
            OPENGLLIB += -lglut -L/usr/lib
        endif
        ifeq ($(x86_64),)
            ifeq ($(i386),)
                OPENGLLIB += -lglut
            endif
        endif
    endif
endif

# Optional SDK helper libraries, each enabled by a USE* switch from the
# including Makefile.
ifeq ($(USEPARAMGL),1)
    PARAMGLLIB := -lparamgl_$(LIB_ARCH)$(LIBSUFFIX)
endif

ifeq ($(USERENDERCHECKGL),1)
    RENDERCHECKGLLIB := -lrendercheckgl_$(LIB_ARCH)$(LIBSUFFIX)
endif

ifeq ($(USECUDPP), 1)
    CUDPPLIB := -lcudpp_$(CUDPPLIB_SUFFIX)
    ifeq ($(emu), 1)
        CUDPPLIB := $(CUDPPLIB)_emu
    endif
endif

ifeq ($(USENVCUVID), 1)
    ifneq ($(DARWIN),)
        NVCUVIDLIB := -L../../common/lib/darwin -lnvcuvid
    endif
endif

# Libs
ifneq ($(DARWIN),)
    LIB := -L$(CUDA_INSTALL_PATH)/lib -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib $(NVCUVIDLIB)
else
    # Toolkit library directory: a 32-bit host uses lib unless x86_64=1,
    # a 64-bit host uses lib64 unless i386=1.
    CUDA_LIBSUBDIR := lib64
    ifeq "$(strip $(HP_64))" ""
        ifneq ($(x86_64),1)
            CUDA_LIBSUBDIR := lib
        endif
    else
        ifeq ($(i386),1)
            CUDA_LIBSUBDIR := lib
        endif
    endif
    LIB := -L$(CUDA_INSTALL_PATH)/$(CUDA_LIBSUBDIR) -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib
endif

# Each branch below ends by re-appending the current value of LIB, so the
# search paths (and -lcuda / -lcudart where present) appear twice on the link line.
# If dynamically linking to CUDA and CUDART, we exclude the libraries from the LIB
ifeq ($(USECUDADYNLIB),1)
    LIB += ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB} -ldl -rdynamic
else
    # static linking, we will statically link against CUDA and CUDART
    ifeq ($(USEDRVAPI),1)
        LIB += -lcuda   ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB}
    else
        ifeq ($(emu),1)
            LIB += -lcudartemu
        else
            LIB += -lcudart
        endif
        LIB += ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB}
    endif
endif

# Toolkit math libraries; the emulation variants are selected by emu=1.
ifeq ($(USECUFFT),1)
    ifeq ($(emu),1)
        LIB += -lcufftemu
    else
        LIB += -lcufft
    endif
endif

ifeq ($(USECUBLAS),1)
    ifeq ($(emu),1)
        LIB += -lcublasemu
    else
        LIB += -lcublas
    endif
endif

ifeq ($(USECURAND),1)
    LIB += -lcurand
endif

ifeq ($(USECUSPARSE),1)
    LIB += -lcusparse
endif

# Lib/exe configuration
# With STATIC_LIB empty the build links an executable; with STATIC_LIB set it
# archives the objects into $(LIBDIR) instead.
ifeq ($(STATIC_LIB),)
    ifneq ($(OMIT_CUTIL_LIB),1)
        LIB += -lcutil_$(LIB_ARCH)$(LIBSUFFIX) -lshrutil_$(LIB_ARCH)$(LIBSUFFIX)
    endif

    # Device emulation configuration
    ifeq ($(emu), 1)
        NVCCFLAGS   += -deviceemu
        CUDACCFLAGS +=
        BINSUBDIR   := emu$(BINSUBDIR)
        # consistency, makes developing easier
        CXXFLAGS    += -D__DEVICE_EMULATION__
        CFLAGS      += -D__DEVICE_EMULATION__
    endif

    TARGETDIR := $(BINDIR)/$(BINSUBDIR)
    TARGET    := $(TARGETDIR)/$(EXECUTABLE)
    # Recursive assignment: OBJS and LIB are still being extended below.
    LINKLINE   = $(LINK) -o $(TARGET) $(OBJS) $(LIB)
else
    TARGETDIR := $(LIBDIR)
    # foo.a becomes foo_<arch><suffix>.a
    TARGET    := $(subst .a,_$(LIB_ARCH)$(LIBSUFFIX).a,$(LIBDIR)/$(STATIC_LIB))
    LINKLINE   = ar rucv $(TARGET) $(OBJS)
endif

# Recipes are silenced with '@' unless make is invoked with verbose=1.
VERBOSE := @
ifeq ($(verbose), 1)
    VERBOSE :=
endif

################################################################################
# Check for input flags and set compiler flags appropriately
################################################################################
ifeq ($(fastmath), 1)
    NVCCFLAGS += -use_fast_math
endif

# keep=1 preserves nvcc intermediates and registers them for `make clean`.
ifeq ($(keep), 1)
    NVCCFLAGS += -keep
    NVCC_KEEP_CLEAN := *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx
endif

ifdef maxregisters
    NVCCFLAGS += -maxrregcount $(maxregisters)
endif

ifeq ($(ptxas), 1)
    NVCCFLAGS += --ptxas-options=-v
endif

# Add cudacc flags, then the flags common to all three compilers
NVCCFLAGS += $(CUDACCFLAGS)
NVCCFLAGS += $(COMMONFLAGS)
CXXFLAGS  += $(COMMONFLAGS)
CFLAGS    += $(COMMONFLAGS)

# nvcc_warn_verbose=1 forwards every host warning flag through nvcc.
ifeq ($(nvcc_warn_verbose),1)
    NVCCFLAGS += $(addprefix --compiler-options ,$(CXXWARN_FLAGS))
    NVCCFLAGS += --compiler-options -fno-strict-aliasing
endif

################################################################################
# Set up object files
################################################################################
# Objects keep the source extension in their name (foo.cu -> foo.cu.o) and all
# land in one directory, so sources listed with a path are reduced to basenames.
OBJDIR := $(ROOTOBJDIR)/$(LIB_ARCH)/$(BINSUBDIR)
OBJS +=  $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(notdir $(CCFILES)))
OBJS +=  $(patsubst %.c,$(OBJDIR)/%.c.o,$(notdir $(CFILES)))
OBJS +=  $(patsubst %.cu,$(OBJDIR)/%.cu.o,$(notdir $(CUFILES)))

################################################################################
# Set up cubin output files
################################################################################
CUBINDIR := $(SRCDIR)data
CUBINS +=  $(patsubst %.cu,$(CUBINDIR)/%.cubin,$(notdir $(CUBINFILES)))

################################################################################
# Set up PTX output files
################################################################################
PTXDIR := $(SRCDIR)data
PTXBINS +=  $(patsubst %.cu,$(PTXDIR)/%.ptx,$(notdir $(PTXFILES)))

################################################################################

# Rules

################################################################################

# C sources: compiled with $(CC); every object also depends on $(C_DEPS).
$(OBJDIR)/%.c.o : $(SRCDIR)%.c $(C_DEPS)

	$(VERBOSE)$(CC) $(CFLAGS) -o $@ -c $<

# C++ sources: compiled with $(CXX); every object also depends on $(C_DEPS).
$(OBJDIR)/%.cpp.o : $(SRCDIR)%.cpp $(C_DEPS)

	$(VERBOSE)$(CXX) $(CXXFLAGS) -o $@ -c $<

# CUDA sources: compiled to an object with nvcc -c; every object also depends on $(CU_DEPS).
# Default arch includes gencode for sm_10, sm_20, and other archs from GENCODE_ARCH declared in the makefile

$(OBJDIR)/%.cu.o : $(SRCDIR)%.cu $(CU_DEPS)

	$(VERBOSE)$(NVCC) $(GENCODE_SM10) $(GENCODE_ARCH) $(GENCODE_SM20) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -c $<

# cubin output: same gencode set as the object rule; cubindirectory creates $(CUBINDIR) first.
# Default arch includes gencode for sm_10, sm_20, and other archs from GENCODE_ARCH declared in the makefile

$(CUBINDIR)/%.cubin : $(SRCDIR)%.cu cubindirectory

	$(VERBOSE)$(NVCC) $(GENCODE_SM10) $(GENCODE_ARCH) $(GENCODE_SM20) $(CUBIN_ARCH_FLAG) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -cubin $<

# PTX output: no -gencode flags are passed here; ptxdirectory creates $(PTXDIR) first.
$(PTXDIR)/%.ptx : $(SRCDIR)%.cu ptxdirectory

	$(VERBOSE)$(NVCC) $(CUBIN_ARCH_FLAG) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -ptx $<

#
# The following definition is a template that gets instantiated for each SM
# version (10, 13, etc.) stored in SM_VERSIONS.  It does 2 things:
# 1. It adds to OBJS a .cu_XX.o for each .cu file it finds in CUFILES_sm_XX.
# 2. It generates a rule for building .cu_XX.o files from the corresponding
#    .cu file.
#
# The intended use for this is to allow Makefiles that use common.mk to compile
# files to different Compute Capability targets (aka SM arch version).  To do
# so, in the Makefile, list files for each SM arch separately, like so:
# This will be used over the default rule above
#
# CUFILES_sm_10 := mycudakernel_sm10.cu app.cu
# CUFILES_sm_12 := anothercudakernel_sm12.cu
#
# Inside the template, $(1) is the SM version number passed by $(call ...).
# $$@ and $$< are written with a doubled dollar so that they survive the
# expansion done by $(call)/$(eval) and reach the generated recipe as $@ and $<.
# The generated recipe always adds $(GENCODE_SM20) next to the per-version -gencode.
#

define SMVERSION_template

#OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu_$(1).o,$(notdir $(CUFILES_$(1))))

OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu_$(1).o,$(notdir $(CUFILES_sm_$(1))))

$(OBJDIR)/%.cu_$(1).o : $(SRCDIR)%.cu $(CU_DEPS)

#	$(VERBOSE)$(NVCC) -o $$@ -c $$< $(NVCCFLAGS)  $(1)

	$(VERBOSE)$(NVCC) -gencode=arch=compute_$(1),code=\"sm_$(1),compute_$(1)\" $(GENCODE_SM20) -o $$@ -c $$< $(NVCCFLAGS)

endef

# This line invokes the above template for each arch version stored in
# SM_VERSIONS.  The call function invokes the template, and the eval
# function interprets it as make commands.

$(foreach smver,$(SM_VERSIONS),$(eval $(call SMVERSION_template,$(smver))))

# None of these targets creates a file with its own name. Declaring them phony
# keeps them working even if a file or directory called e.g. "clean" exists.
.PHONY: makedirectories cubindirectory ptxdirectory tidy clean clobber

# Final link (or archive, see LINKLINE). makedirectories is listed first so the
# output directories exist before any object is compiled.
$(TARGET): makedirectories $(OBJS) $(CUBINS) $(PTXBINS) Makefile
	$(VERBOSE)$(LINKLINE)

cubindirectory:
	$(VERBOSE)mkdir -p $(CUBINDIR)

ptxdirectory:
	$(VERBOSE)mkdir -p $(PTXDIR)

makedirectories:
	$(VERBOSE)mkdir -p $(LIBDIR)
	$(VERBOSE)mkdir -p $(OBJDIR)
	$(VERBOSE)mkdir -p $(TARGETDIR)

# Remove editor leftovers: any path containing '#' or '~'.
tidy :
	$(VERBOSE)find . | egrep "#" | xargs rm -f
	$(VERBOSE)find . | egrep "\~" | xargs rm -f

# Remove nvcc intermediates, objects, generated cubin/ptx files, the target,
# and image/binary output left in the bin directory.
clean : tidy
	$(VERBOSE)rm -f *.stub.c *.gpu *.cu.cpp *.i *.ii
	$(VERBOSE)rm -f *.cubin *.ptx *.fatbin.c *.hash
	$(VERBOSE)rm -f *.cudafe1.c *.cudafe2.c *.cudafe1.cpp *.cudafe2.cpp
	$(VERBOSE)rm -f $(OBJS)
	$(VERBOSE)rm -f $(CUBINS)
	$(VERBOSE)rm -f $(PTXBINS)
	$(VERBOSE)rm -f $(TARGET)
	$(VERBOSE)rm -f $(NVCC_KEEP_CLEAN)
	$(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.ppm
	$(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.pgm
	$(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.bin
	$(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.bmp

# clean plus the whole object tree.
clobber : clean
	$(VERBOSE)rm -rf $(ROOTOBJDIR)

Note that the first Makefile also lists other source files, and its end result is an executable. How can I modify the SDK’s Makefile so that it only generates an object file from my cuda_kernel.cu file? Also, what needs to be changed if cuda_kernel.cu has to include some headers from my project?

Thanks.

Hi guys,

I need to use a cuda kernel in my cpp project, but my project already has a Makefile. Since I don’t have much experience in working with Makefiles, I really need your assistance.

So I have my kernel and a kernel caller function (called “gpucaller”), both inside my cuda_kernel.cu file.

#ifndef _CUDA_KERNEL_H_

#define _CUDA_KERNEL_H_

#define MAX_QUBITS 25

#define BLOCKDIM 512

#define MAX_TERMS_PER_BLOCK (2*BLOCKDIM)

// includes

#include <iostream>

using namespace std;

#include <cutil_inline.h>

//#include "operator.h"

#define THREAD_MASK (~0ul << 1)

__constant__ float devOpBit[2][2];

/**
* Kernel qcl1. The body is elided ("...") in this listing, so only the
* interface and the declarations below are documented.
*
* a            - array of complex terms; the host caller passes a device buffer
*                holding N cuFloatComplex values
* N            - number of terms
* qbCount      - the number of gates to apply (the host caller passes at most 10
*                on the first launch and 1 on every later launch)
* blockGrpSize - the host caller passes 1 on the first launch and doubles it
*                before each later launch; exact meaning not visible here
* k            - the host caller passes 0 on the first launch and increments it
*                before each later launch; exact meaning not visible here
*
* NOTE(review): an earlier version of this comment described "qbStart - index of
* the first qubit to apply the gates on"; the signature has no such parameter.
*
* The host caller launches a 1-D grid with BLOCKDIM threads per block and one
* block per MAX_TERMS_PER_BLOCK terms.
*/

__global__ void qcl1(cuFloatComplex *a, int N, int qbCount, int blockGrpSize, int k)

{

	//int idx = blockIdx.x * BLOCKDIM + threadIdx.x;

	//int tx = threadIdx.x;

	

	// Four complex temporaries and their index variables; their use is in the elided body.
	cuFloatComplex t0_0, t0_1, t1_0, t1_1;

	int x0_idx, x1_idx;

	int i, grpSize, b0_idx, b1_idx;

	

	// Statically sized per-block shared buffer: MAX_TERMS_PER_BLOCK (2*BLOCKDIM) complex values.
	__shared__ cuFloatComplex aS[MAX_TERMS_PER_BLOCK];

		

	...

}

/**
 * Host-side driver for qcl1: uploads the terms of `q` to the device, applies
 * the operator `op` with one or more kernel launches, and copies the result
 * back into q->termsarray.
 *
 * The definition is extern "C" so that its symbol name matches the
 * extern "C" declaration used by the calling translation unit; without it
 * nvcc emits a C++-mangled name and the link fails with an undefined
 * reference to gpucaller.
 *
 * NOTE(review): opBit and quState must be declared before this point. The
 * only project include in this file ("operator.h") is commented out; it has
 * to be enabled and its directory passed to nvcc with -I.
 *
 * @param op  operator; its matrix is obtained from getDeviceReadyOpBit() and
 *            copied to the __constant__ array devOpBit
 * @param q   state; mapbits() is the qubit count and termsarray holds the
 *            2^mapbits() terms, which are overwritten with the result
 */
extern "C"
void gpucaller(opBit* op, quState* q) {
    // devOpBit is float[2][2]. Copying 16 bytes straight from a double** would
    // upload the row pointers rather than matrix values, so convert
    // element-wise into a host buffer with the same layout as devOpBit.
    // NOTE(review): assumes myOpBit points to two rows of two doubles, and that
    // the callee keeps ownership of that storage -- confirm in opBit.
    double** myOpBit = op->getDeviceReadyOpBit();
    float hostOpBit[2][2];
    for (int row = 0; row < 2; ++row) {
        for (int col = 0; col < 2; ++col) {
            hostOpBit[row][col] = static_cast<float>(myOpBit[row][col]);
        }
    }

    const int qubits = q->mapbits();
    const long int N = 1L << qubits;                       // number of terms
    const size_t size = sizeof(cuFloatComplex) * N;        // bytes in the state

    // start timer
    unsigned int timer = 0;
    cutilCheckError( cutCreateTimer( &timer));
    cutilCheckError( cutStartTimer( timer));

    // allocate device memory and copy the host terms to it
    cuFloatComplex *a_d = NULL;
    cutilSafeCall( cudaMalloc((void**)&a_d, size) );
    cutilSafeCall( cudaMemcpy(a_d, q->termsarray, size, cudaMemcpyHostToDevice) );

    // copy quantic operator to constant memory
    cutilSafeCall( cudaMemcpyToSymbol(devOpBit, hostOpBit, sizeof(hostOpBit), 0) );
    printf("Cuda errors: %s\n", cudaGetErrorString( cudaGetLastError() ) );

    // setup execution parameters: one block per MAX_TERMS_PER_BLOCK terms, rounded up
    dim3 dimBlock(BLOCKDIM, 1, 1);
    const int n_blocks = (int)((N + MAX_TERMS_PER_BLOCK - 1) / MAX_TERMS_PER_BLOCK);
    dim3 dimGrid(n_blocks, 1, 1);

    int blockGrpSize = 1;
    int k = 0;
    int qbCount = qubits;

    // The first launch applies up to 10 gates at once ...
    const int gates = (qbCount > 10) ? 10 : qbCount;
    qbCount -= gates;

    qcl1<<< dimGrid, dimBlock >>>(a_d, (int)N, gates, blockGrpSize, k);
    // check if kernel execution generated an error
    cutilCheckMsg("Kernel execution failed");

    // ... every remaining gate gets its own launch with a doubled group size.
    while (qbCount > 0) {
        qbCount--;
        blockGrpSize *= 2;
        k++;

        qcl1<<< dimGrid, dimBlock >>>(a_d, (int)N, 1, blockGrpSize, k);
        // check if kernel execution generated an error
        cutilCheckMsg("Kernel execution failed");
    }

    // copy result from device to host (blocking, so the kernels have finished)
    cutilSafeCall( cudaMemcpy(q->termsarray, a_d, size, cudaMemcpyDeviceToHost) );

    // stop timer
    cutilCheckError( cutStopTimer( timer));
    printf( "Optimized - GPU Processing time: %f (ms)\n", cutGetTimerValue( timer));
    cutilCheckError( cutDeleteTimer( timer));

    // cleanup memory on device
    cutilSafeCall( cudaFree(a_d) );
    cudaThreadExit();
}

#endif // #ifndef _CUDA_KERNEL_H_

In the project’s source file where I need to call “gpucaller” I’ve declared it like this:

// C linkage: the definition in cuda_kernel.cu has to be marked extern "C" as
// well, otherwise the linker looks for an unmangled symbol that is never emitted.
extern "C" {
    void gpucaller(opBit* op, quState* q);
}

There is only one call to “gpucaller” within my project’s source file.

So what I think I need is to generate an object file for the .cu file, then link it to the other .o files generated by my project’s Makefile. I hope that I got this right.

For the first task (i.e., generating the object file for the .cu file), I was thinking that I should use the Makefile from the SDK, but modify it so that it only produces my object file.

The SDK’s Makefile consists of these 2 files:

################################################################################

#

# Copyright 1993-2006 NVIDIA Corporation.  All rights reserved.

#

# NOTICE TO USER:   

#

# This source code is subject to NVIDIA ownership rights under U.S. and 

# international Copyright laws.  

#

# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 

# CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 

# IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH 

# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 

# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.   

# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 

# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 

# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 

# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 

# OR PERFORMANCE OF THIS SOURCE CODE.  

#

# U.S. Government End Users.  This source code is a "commercial item" as 

# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of 

# "commercial computer software" and "commercial computer software 

# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 

# and is provided to the U.S. Government only as a commercial end item.  

# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 

# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 

# source code with only those rights set forth herein.

#

################################################################################

#

# Build script for project

#

################################################################################

# Add source files here
# Name of the binary that common.mk links.
EXECUTABLE	:= template

# CUDA source files (compiled with nvcc)
CUFILES		:= template.cu

# CUDA dependency files: listed as prerequisites of every .cu object in
# common.mk, not compiled on their own.
# Each list is written on one line: a blank line after a trailing backslash
# ends the assignment and leaves the following entry outside the variable.
CU_DEPS		:= template_kernel.cu

# C/C++ source files (compiled with gcc / c++)
CCFILES		:= template_gold.cpp

################################################################################
# Rules and targets
include ../../common/common.mk

and common.mk

################################################################################

#

# Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.

#

# NVIDIA Corporation and its licensors retain all intellectual property and 

# proprietary rights in and to this software and related documentation. 

# Any use, reproduction, disclosure, or distribution of this software 

# and related documentation without an express license agreement from

# NVIDIA Corporation is strictly prohibited.

#

# Please refer to the applicable NVIDIA end user license agreement (EULA) 

# associated with this source code for terms and conditions that govern 

# your use of this NVIDIA software.

#

################################################################################

#

# Common build script for CUDA source projects for Linux and Mac platforms

#

################################################################################

.SUFFIXES : .cu .cu_dbg.o .c_dbg.o .cpp_dbg.o .cu_rel.o .c_rel.o .cpp_rel.o .cubin .ptx

# Add new SM Versions here as devices with new Compute Capability are released

SM_VERSIONS   := 10 11 12 13 20

CUDA_INSTALL_PATH ?= /usr/local/cuda

ifdef cuda-install

	CUDA_INSTALL_PATH := $(cuda-install)

endif

# detect OS

OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])

OSLOWER = $(shell uname -s 2>/dev/null | tr [:upper:] [:lower:])

# 'linux' is output for Linux system, 'darwin' for OS X

DARWIN = $(strip $(findstring DARWIN, $(OSUPPER)))

ifneq ($(DARWIN),)

   SNOWLEOPARD = $(strip $(findstring 10.6, $(shell egrep "<string>10\.6" /System/Library/CoreServices/SystemVersion.plist)))

endif

# detect 32-bit or 64-bit platform

HP_64 = $(shell uname -m | grep 64)

OSARCH= $(shell uname -m)

# Basic directory setup for SDK

# (override directories only if they are not already defined)

SRCDIR     ?= 

ROOTDIR    ?= ..

ROOTBINDIR ?= $(ROOTDIR)/../bin

BINDIR     ?= $(ROOTBINDIR)/$(OSLOWER)

ROOTOBJDIR ?= obj

LIBDIR     := $(ROOTDIR)/../lib

COMMONDIR  := $(ROOTDIR)/../common

SHAREDDIR  := $(ROOTDIR)/../../shared/

# Compilers

NVCC       := $(CUDA_INSTALL_PATH)/bin/nvcc 

CXX        := g++ -fPIC

CC         := gcc -fPIC

LINK       := g++ -fPIC

# Includes

INCLUDES  += -I. -I$(CUDA_INSTALL_PATH)/include -I$(COMMONDIR)/inc -I$(SHAREDDIR)/inc

# Warning flags

CXXWARN_FLAGS := \

	-W -Wall \

	-Wimplicit \

	-Wswitch \

	-Wformat \

	-Wchar-subscripts \

	-Wparentheses \

	-Wmultichar \

	-Wtrigraphs \

	-Wpointer-arith \

	-Wcast-align \

	-Wreturn-type \

	-Wno-unused-function \

	$(SPACE)

CWARN_FLAGS := $(CXXWARN_FLAGS) \

	-Wstrict-prototypes \

	-Wmissing-prototypes \

	-Wmissing-declarations \

	-Wnested-externs \

	-Wmain \

# architecture flag for nvcc and gcc compilers build

CUBIN_ARCH_FLAG :=

CXX_ARCH_FLAGS  :=

NVCCFLAGS       :=

LIB_ARCH        := $(OSARCH)

# Determining the necessary Cross-Compilation Flags

# 32-bit OS, but we target 64-bit cross compilation

ifeq ($(x86_64),1) 

    NVCCFLAGS       += -m64

    LIB_ARCH         = x86_64

    CUDPPLIB_SUFFIX  = x86_64

    ifneq ($(DARWIN),)

         CXX_ARCH_FLAGS += -arch x86_64

    else

         CXX_ARCH_FLAGS += -m64

    endif

else 

# 64-bit OS, and we target 32-bit cross compilation

    ifeq ($(i386),1)

        NVCCFLAGS       += -m32

        LIB_ARCH         = i386

        CUDPPLIB_SUFFIX  = i386

        ifneq ($(DARWIN),)

             CXX_ARCH_FLAGS += -arch i386

        else

             CXX_ARCH_FLAGS += -m32

        endif

    else 

        ifeq "$(strip $(HP_64))" ""

            LIB_ARCH        = i386

            CUDPPLIB_SUFFIX = i386

            NVCCFLAGS      += -m32

            ifneq ($(DARWIN),)

               CXX_ARCH_FLAGS += -arch i386

            else

               CXX_ARCH_FLAGS += -m32

            endif

        else

            LIB_ARCH        = x86_64

            CUDPPLIB_SUFFIX = x86_64

            NVCCFLAGS      += -m64

            ifneq ($(DARWIN),)

               CXX_ARCH_FLAGS += -arch x86_64

            else

               CXX_ARCH_FLAGS += -m64

            endif

        endif

    endif

endif

# Compiler-specific flags (by default, we always use sm_10 and sm_20), unless we use the SMVERSION template

GENCODE_SM10 := -gencode=arch=compute_10,code=\"sm_10,compute_10\"

GENCODE_SM20 := -gencode=arch=compute_20,code=\"sm_20,compute_20\"

CXXFLAGS  += $(CXXWARN_FLAGS) $(CXX_ARCH_FLAGS)

CFLAGS    += $(CWARN_FLAGS) $(CXX_ARCH_FLAGS)

LINKFLAGS +=

LINK      += $(LINKFLAGS) $(CXX_ARCH_FLAGS)

# This option for Mac allows CUDA applications to work without requiring to set DYLD_LIBRARY_PATH

ifneq ($(DARWIN),)

   LINK += -Xlinker -rpath $(CUDA_INSTALL_PATH)/lib

endif

# Common flags

COMMONFLAGS += $(INCLUDES) -DUNIX

# Debug/release configuration

ifeq ($(dbg),1)

	COMMONFLAGS += -g

	NVCCFLAGS   += -D_DEBUG

	CXXFLAGS    += -D_DEBUG

	CFLAGS      += -D_DEBUG

	BINSUBDIR   := debug

	LIBSUFFIX   := D

else 

	COMMONFLAGS += -O2 

	BINSUBDIR   := release

	LIBSUFFIX   := 

	NVCCFLAGS   += --compiler-options -fno-strict-aliasing

	CXXFLAGS    += -fno-strict-aliasing

	CFLAGS      += -fno-strict-aliasing

endif

# architecture flag for cubin build

CUBIN_ARCH_FLAG :=

# OpenGL is used or not (if it is used, then it is necessary to include GLEW)

ifeq ($(USEGLLIB),1)

    ifneq ($(DARWIN),)

        OPENGLLIB := -L/System/Library/Frameworks/OpenGL.framework/Libraries 

        OPENGLLIB += -lGL -lGLU $(COMMONDIR)/lib/$(OSLOWER)/libGLEW.a

    else

# this case for linux platforms

	OPENGLLIB := -lGL -lGLU -lX11 -lXi -lXmu

# check if x86_64 flag has been set, otherwise, check HP_64 is i386/x86_64

        ifeq ($(x86_64),1) 

	       OPENGLLIB += -lGLEW_x86_64 -L/usr/X11R6/lib64

        else

             ifeq ($(i386),)

                 ifeq "$(strip $(HP_64))" ""

	             OPENGLLIB += -lGLEW -L/usr/X11R6/lib

                 else

	             OPENGLLIB += -lGLEW_x86_64 -L/usr/X11R6/lib64

                 endif

             endif

        endif

# check if i386 flag has been set, otherwise check HP_64 is i386/x86_64

        ifeq ($(i386),1)

	       OPENGLLIB += -lGLEW -L/usr/X11R6/lib

        else

             ifeq ($(x86_64),)

                 ifeq "$(strip $(HP_64))" ""

	             OPENGLLIB += -lGLEW -L/usr/X11R6/lib

                 else

	             OPENGLLIB += -lGLEW_x86_64 -L/usr/X11R6/lib64

                 endif

             endif

        endif

    endif

endif

ifeq ($(USEGLUT),1)

    ifneq ($(DARWIN),)

	OPENGLLIB += -framework GLUT

    else

        ifeq ($(x86_64),1)

	     OPENGLLIB += -lglut -L/usr/lib64 

        endif

        ifeq ($(i386),1)

	     OPENGLLIB += -lglut -L/usr/lib 

        endif

ifeq ($(x86_64),)

            ifeq ($(i386),)  

	        OPENGLLIB += -lglut

            endif

        endif

    endif

endif

ifeq ($(USEPARAMGL),1)

	PARAMGLLIB := -lparamgl_$(LIB_ARCH)$(LIBSUFFIX)

endif

ifeq ($(USERENDERCHECKGL),1)

	RENDERCHECKGLLIB := -lrendercheckgl_$(LIB_ARCH)$(LIBSUFFIX)

endif

ifeq ($(USECUDPP), 1)

    CUDPPLIB := -lcudpp_$(CUDPPLIB_SUFFIX)

ifeq ($(emu), 1)

        CUDPPLIB := $(CUDPPLIB)_emu

    endif

endif

ifeq ($(USENVCUVID), 1)

     ifneq ($(DARWIN),)

         NVCUVIDLIB := -L../../common/lib/darwin -lnvcuvid

     endif

endif

# Libs

ifneq ($(DARWIN),)

    LIB       := -L$(CUDA_INSTALL_PATH)/lib -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib $(NVCUVIDLIB) 

else

  ifeq "$(strip $(HP_64))" ""

    ifeq ($(x86_64),1)

       LIB       := -L$(CUDA_INSTALL_PATH)/lib64 -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib 

    else

       LIB       := -L$(CUDA_INSTALL_PATH)/lib -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib

    endif

  else

    ifeq ($(i386),1)

       LIB       := -L$(CUDA_INSTALL_PATH)/lib -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib

    else

       LIB       := -L$(CUDA_INSTALL_PATH)/lib64 -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER) -L$(SHAREDDIR)/lib

    endif

  endif

endif

# If dynamically linking to CUDA and CUDART, we exclude the libraries from the LIB

ifeq ($(USECUDADYNLIB),1)

     LIB += ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB} -ldl -rdynamic 

else

# static linking, we will statically link against CUDA and CUDART

  ifeq ($(USEDRVAPI),1)

     LIB += -lcuda   ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB} 

  else

     ifeq ($(emu),1) 

         LIB += -lcudartemu

     else 

         LIB += -lcudart

     endif

     LIB += ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB}

  endif

endif

ifeq ($(USECUFFT),1)

  ifeq ($(emu),1)

    LIB += -lcufftemu

  else

    LIB += -lcufft

  endif

endif

ifeq ($(USECUBLAS),1)

  ifeq ($(emu),1)

    LIB += -lcublasemu

  else

    LIB += -lcublas

  endif

endif

ifeq ($(USECURAND),1)

    LIB += -lcurand

endif

ifeq ($(USECUSPARSE),1)

  LIB += -lcusparse

endif

# Lib/exe configuration

ifneq ($(STATIC_LIB),)

	TARGETDIR := $(LIBDIR)

	TARGET   := $(subst .a,_$(LIB_ARCH)$(LIBSUFFIX).a,$(LIBDIR)/$(STATIC_LIB))

	LINKLINE  = ar rucv $(TARGET) $(OBJS)

else

	ifneq ($(OMIT_CUTIL_LIB),1)

		LIB += -lcutil_$(LIB_ARCH)$(LIBSUFFIX) -lshrutil_$(LIB_ARCH)$(LIBSUFFIX)

	endif

	# Device emulation configuration

	ifeq ($(emu), 1)

		NVCCFLAGS   += -deviceemu

		CUDACCFLAGS += 

		BINSUBDIR   := emu$(BINSUBDIR)

		# consistency, makes developing easier

		CXXFLAGS		+= -D__DEVICE_EMULATION__

		CFLAGS			+= -D__DEVICE_EMULATION__

	endif

	TARGETDIR := $(BINDIR)/$(BINSUBDIR)

	TARGET    := $(TARGETDIR)/$(EXECUTABLE)

	LINKLINE  = $(LINK) -o $(TARGET) $(OBJS) $(LIB)

endif

# check if verbose 

ifeq ($(verbose), 1)

	VERBOSE :=

else

	VERBOSE := @

endif

################################################################################

# Check for input flags and set compiler flags appropriately

################################################################################

ifeq ($(fastmath), 1)

	NVCCFLAGS += -use_fast_math

endif

ifeq ($(keep), 1)

	NVCCFLAGS += -keep

	NVCC_KEEP_CLEAN := *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx

endif

ifdef maxregisters

	NVCCFLAGS += -maxrregcount $(maxregisters)

endif

ifeq ($(ptxas), 1)

        NVCCFLAGS += --ptxas-options=-v

endif

# Add cudacc flags

NVCCFLAGS += $(CUDACCFLAGS)

# Add common flags

NVCCFLAGS += $(COMMONFLAGS)

CXXFLAGS  += $(COMMONFLAGS)

CFLAGS    += $(COMMONFLAGS)

ifeq ($(nvcc_warn_verbose),1)

	NVCCFLAGS += $(addprefix --compiler-options ,$(CXXWARN_FLAGS)) 

	NVCCFLAGS += --compiler-options -fno-strict-aliasing

endif

################################################################################

# Set up object files

################################################################################

OBJDIR := $(ROOTOBJDIR)/$(LIB_ARCH)/$(BINSUBDIR)

OBJS +=  $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(notdir $(CCFILES)))

OBJS +=  $(patsubst %.c,$(OBJDIR)/%.c.o,$(notdir $(CFILES)))

OBJS +=  $(patsubst %.cu,$(OBJDIR)/%.cu.o,$(notdir $(CUFILES)))

################################################################################

# Set up cubin output files

################################################################################

CUBINDIR := $(SRCDIR)data

CUBINS +=  $(patsubst %.cu,$(CUBINDIR)/%.cubin,$(notdir $(CUBINFILES)))

################################################################################

# Set up PTX output files

################################################################################

PTXDIR := $(SRCDIR)data

PTXBINS +=  $(patsubst %.cu,$(PTXDIR)/%.ptx,$(notdir $(PTXFILES)))

################################################################################

# Rules

################################################################################

$(OBJDIR)/%.c.o : $(SRCDIR)%.c $(C_DEPS)

	$(VERBOSE)$(CC) $(CFLAGS) -o $@ -c $<

$(OBJDIR)/%.cpp.o : $(SRCDIR)%.cpp $(C_DEPS)

	$(VERBOSE)$(CXX) $(CXXFLAGS) -o $@ -c $<

# Default arch includes gencode for sm_10, sm_20, and other archs from GENCODE_ARCH declared in the makefile

$(OBJDIR)/%.cu.o : $(SRCDIR)%.cu $(CU_DEPS)

	$(VERBOSE)$(NVCC) $(GENCODE_SM10) $(GENCODE_ARCH) $(GENCODE_SM20) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -c $<

# Default arch includes gencode for sm_10, sm_20, and other archs from GENCODE_ARCH declared in the makefile

$(CUBINDIR)/%.cubin : $(SRCDIR)%.cu cubindirectory

	$(VERBOSE)$(NVCC) $(GENCODE_SM10) $(GENCODE_ARCH) $(GENCODE_SM20) $(CUBIN_ARCH_FLAG) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -cubin $<

$(PTXDIR)/%.ptx : $(SRCDIR)%.cu ptxdirectory

	$(VERBOSE)$(NVCC) $(CUBIN_ARCH_FLAG) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -ptx $<

#

# The following definition is a template that gets instantiated for each SM

# version (sm_10, sm_13, etc.) stored in SMVERSIONS.  It does 2 things:

# 1. It adds to OBJS a .cu_sm_XX.o for each .cu file it finds in CUFILES_sm_XX.

# 2. It generates a rule for building .cu_sm_XX.o files from the corresponding 

#    .cu file.

#

# The intended use for this is to allow Makefiles that use common.mk to compile

# files to different Compute Capability targets (aka SM arch version).  To do

# so, in the Makefile, list files for each SM arch separately, like so:

# This will be used over the default rule above

#

# CUFILES_sm_10 := mycudakernel_sm10.cu app.cu

# CUFILES_sm_12 := anothercudakernel_sm12.cu

#

define SMVERSION_template

#OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu_$(1).o,$(notdir $(CUFILES_$(1))))

OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu_$(1).o,$(notdir $(CUFILES_sm_$(1))))

$(OBJDIR)/%.cu_$(1).o : $(SRCDIR)%.cu $(CU_DEPS)

#	$(VERBOSE)$(NVCC) -o $$@ -c $$< $(NVCCFLAGS)  $(1)

	$(VERBOSE)$(NVCC) -gencode=arch=compute_$(1),code=\"sm_$(1),compute_$(1)\" $(GENCODE_SM20) -o $$@ -c $$< $(NVCCFLAGS)

endef

# This line invokes the above template for each arch version stored in

# SM_VERSIONS.  The call function invokes the template, and the eval

# function interprets it as make commands.

$(foreach smver,$(SM_VERSIONS),$(eval $(call SMVERSION_template,$(smver))))

$(TARGET): makedirectories $(OBJS) $(CUBINS) $(PTXBINS) Makefile

	$(VERBOSE)$(LINKLINE)

cubindirectory:

	$(VERBOSE)mkdir -p $(CUBINDIR)

ptxdirectory:

	$(VERBOSE)mkdir -p $(PTXDIR)

makedirectories:

	$(VERBOSE)mkdir -p $(LIBDIR)

	$(VERBOSE)mkdir -p $(OBJDIR)

	$(VERBOSE)mkdir -p $(TARGETDIR)

# Remove editor autosave/backup files (names containing '#' or '~') anywhere
# below the current directory.
#
# find deletes the matches itself via -exec, so file names containing spaces or
# quotes are handled correctly, and -type f keeps directories out of the rm
# command line (rm -f on a directory fails and would abort the recipe).
tidy :
	$(VERBOSE)find . -type f \( -name '*#*' -o -name '*~*' \) -exec rm -f {} +

clean : tidy

	$(VERBOSE)rm -f *.stub.c *.gpu *.cu.cpp *.i *.ii

	$(VERBOSE)rm -f *.cubin *.ptx *.fatbin.c *.hash

	$(VERBOSE)rm -f *.cudafe1.c *.cudafe2.c *.cudafe1.cpp *.cudafe2.cpp

	$(VERBOSE)rm -f $(OBJS) 

	$(VERBOSE)rm -f $(CUBINS)

	$(VERBOSE)rm -f $(PTXBINS)

	$(VERBOSE)rm -f $(TARGET)

	$(VERBOSE)rm -f $(NVCC_KEEP_CLEAN)

	$(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.ppm

	$(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.pgm

	$(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.bin

	$(VERBOSE)rm -f $(ROOTBINDIR)/$(OSLOWER)/$(BINSUBDIR)/*.bmp

clobber : clean

	$(VERBOSE)rm -rf $(ROOTOBJDIR)

Note that the first Makefile also specifies source files, and its end result is an executable. How can I modify the SDK’s Makefile so that it only generates an object file from my cuda_kernel.cu file? Also, what needs to be changed if cuda_kernel.cu has to include some headers from my project?

Thanks.

I think the SDK makefiles are overly complicated. For a start, you just need a rule for .cu files

# Build an object file from a CUDA source with nvcc.
# $@ is the target (.o) and $< the first prerequisite (.cu).
# The recipe line must start with a TAB character, not spaces.
%.o : %.cu
	nvcc -arch sm_20 -c -o $@ $<

(adjust `sm_20` for the compute capability of your device),

add the `.o` file from your CUDA file to the list of linked object files, and add libcudart to the linked libraries:

LDFLAGS += -lcudart

EDIT: And remember that the indentation in a Makefile rule has to be a tab, not spaces.

I think the SDK makefiles are overly complicated. For a start, you just need a rule for .cu files

# Build an object file from a CUDA source with nvcc.
# $@ is the target (.o) and $< the first prerequisite (.cu).
# The recipe line must start with a TAB character, not spaces.
%.o : %.cu
	nvcc -arch sm_20 -c -o $@ $<

(adjust `sm_20` for the compute capability of your device),

add the `.o` file from your CUDA file to the list of linked object files, and add libcudart to the linked libraries:

LDFLAGS += -lcudart

EDIT: And remember that the indentation in a Makefile rule has to be a tab, not spaces.

@tera

Thanks so much for the reply. Indeed, I find your approach a lot easier.

========================================================================

EDIT: Disregard what I wrote below, I’ve used a much simpler command:

nvcc -arch sm_11 -c -I"/home/[my_username]/NVIDIA_GPU_Computing_SDK/C/common/inc" -o cuda_kernel.o cuda_kernel.cu

========================================================================

[i]So I’ve modified the SDK Makefile to a bare minimum, because I needed to include the headers in the SDK’s common dir. Could you check that the Makefile is correct (ie valid)? I’m not sure if it’s “COMMONDIR ?=” or “COMMONDIR :=”. Anyways, this is my Makefile now to generate an object .o file from my .cu file.

# Minimal Makefile: compile cuda_kernel.cu into an object file with nvcc so
# that a separate C++ project can link it.

# `?=` assigns only when the variable is not already defined (environment or
# command line), so both paths can be overridden without editing this file.
CUDA_INSTALL_PATH ?= /usr/local/cuda

# Basic directory setup for SDK
# (override directories only if they are not already defined)
COMMONDIR  ?= /home/glu/NVIDIA_GPU_Computing_SDK/C/common

# Compilers
NVCC       := $(CUDA_INSTALL_PATH)/bin/nvcc

# Includes
INCLUDES  += -I. -I$(CUDA_INSTALL_PATH)/include -I$(COMMONDIR)/inc

################################################################################
# Rules
################################################################################

# A pattern rule is never picked as the default goal.  Without an explicit
# target a bare `make` stops with "No targets", so name the object to build.
.PHONY: all
all: cuda_kernel.cu.o

# $@ is the object being built, $< the matching .cu source.
%.cu.o : %.cu
	$(NVCC) -arch sm_11 $(INCLUDES) -o $@ -c $<

The problem now is that when I type ‘sudo make’ inside the Makefile’s dir (I’ve made sure it’s executable), I get this error:

make: *** No targets. Stop.

Does anyone have any suggestions?[/i]

@tera

Thanks so much for the reply. Indeed, I find your approach a lot easier.

========================================================================

EDIT: Disregard what I wrote below, I’ve used a much simpler command:

nvcc -arch sm_11 -c -I"/home/[my_username]/NVIDIA_GPU_Computing_SDK/C/common/inc" -o cuda_kernel.o cuda_kernel.cu

========================================================================

[i]So I’ve modified the SDK Makefile to a bare minimum, because I needed to include the headers in the SDK’s common dir. Could you check that the Makefile is correct (ie valid)? I’m not sure if it’s “COMMONDIR ?=” or “COMMONDIR :=”. Anyways, this is my Makefile now to generate an object .o file from my .cu file.

# Minimal Makefile: compile cuda_kernel.cu into an object file with nvcc so
# that a separate C++ project can link it.

# `?=` assigns only when the variable is not already defined (environment or
# command line), so both paths can be overridden without editing this file.
CUDA_INSTALL_PATH ?= /usr/local/cuda

# Basic directory setup for SDK
# (override directories only if they are not already defined)
COMMONDIR  ?= /home/glu/NVIDIA_GPU_Computing_SDK/C/common

# Compilers
NVCC       := $(CUDA_INSTALL_PATH)/bin/nvcc

# Includes
INCLUDES  += -I. -I$(CUDA_INSTALL_PATH)/include -I$(COMMONDIR)/inc

################################################################################
# Rules
################################################################################

# A pattern rule is never picked as the default goal.  Without an explicit
# target a bare `make` stops with "No targets", so name the object to build.
.PHONY: all
all: cuda_kernel.cu.o

# $@ is the object being built, $< the matching .cu source.
%.cu.o : %.cu
	$(NVCC) -arch sm_11 $(INCLUDES) -o $@ -c $<

The problem now is that when I type ‘sudo make’ inside the Makefile’s dir (I’ve made sure it’s executable), I get this error:

make: *** No targets. Stop.

Does anyone have any suggestions?[/i]

I have another problem now. I have compiled my .cu file in which resides the kernel and a kernel caller function (“gpucaller”). It compiles fine and I get my .o file. I include this in the Makefile as described by @tera above.

Now from my cpp project, in the source file in which I make the call to “gpucaller”, do I need to also have an include to this .cu file at the top?

Right now I do have an include at the top, and it looks like this:

#include "cuda_kernel.cu"

extern "C"

void gpucaller(opBit *op, quBaseState *q);

But the problem is that when I compile my project (which has its own Makefile), the compiler doesn’t seem to recognize the CUDA `__constant__` and `__global__` keywords. I get an error like this: “expected constructor, destructor, or type conversion before ‘void’”.

My project’s Makefile is this one:

# Makefile for QCL

#

# This file is part of the Quantum Computation Language QCL.

# 

# (c) Copyright by Bernhard Oemer <oemer@tph.tuwien.ac.at> 1998-2006

# 

# This program comes without any warranty; without even the implied 

# warranty of merchantability or fitness for any particular purpose.

# 

#      This program is free software under the terms of the 

#      GNU General Public Licence (GPL) version 2 or higher

VERSION=0.6.3

# Directory for Standard .qcl files

QCLDIR = /usr/local/lib/qcl

# Path for qcl binaries

QCLBIN = /usr/local/bin

ARCH = `g++ -dumpmachine || echo bin`

# Comment out if you want to compile for a different target architecture

# To build libqc.a, you will also have to edit qc/Makefile!

#ARCH = i686-linux

#ARCHOPT = -m32 -march=i686

# Debugging and optimization options

#DEBUG = -g -pg -DQCL_DEBUG -DQC_DEBUG

#DEBUG = -g -DQCL_DEBUG -DQC_DEBUG

DEBUG = -O2 -g -DQCL_DEBUG -DQC_DEBUG

#DEBUG = -O2

# Plotting support 

#

# Comment out if you don't have GNU libplotter and X

PLOPT = -DQCL_PLOT

PLLIB = -L/usr/X11/lib -lplotter

# Readline support

#

# Comment out if you don't have GNU readline on your system

# explicit linking against libtermcap or libncurses may be required

RLOPT = -DQCL_USE_READLINE

#RLLIB = -lreadline

RLLIB = -lreadline -lncurses

# Interrupt support

#

# Comment out if your system doesn't support ANSI C signal handling

IRQOPT = -DQCL_IRQ

# Replace with lex and yacc on non-GNU systems (untested)

LEX = flex

YACC = bison 

INSTALL = install

##### You shouldn't have to edit the stuff below #####

DATE = `date +"%y.%m.%d-%H%M"`

QCDIR = qc

QCLIB = $(QCDIR)/libqc.a

QCLINC = lib

#CXX = g++

#CPP = $(CC) -E

CXXFLAGS = -c $(ARCHOPT) -Wall $(DEBUG) $(PLOPT) $(RLOPT) $(IRQOPT) -I$(QCDIR) -DDEF_INCLUDE_PATH="\"$(QCLDIR)\""

LDFLAGS = $(ARCHOPT) -L$(QCDIR) $(DEBUG) $(PLLIB) -lm -lfl -lqc $(RLLIB) -lcudart

FILESCC = $(wildcard *.cc)

FILESH = $(wildcard *.h)

SOURCE = $(FILESCC) $(FILESH) qcl.lex qcl.y Makefile

OBJECTS = types.o syntax.o typcheck.o symbols.o error.o \

          lex.o yacc.o print.o quheap.o extern.o eval.o exec.o \

          parse.o options.o debug.o cond.o dump.o plot.o format.o cuda_kernel.o

all: do-it-all

ifeq (.depend,$(wildcard .depend))

include .depend

do-it-all: build

else

do-it-all: dep

	$(MAKE)

endif

#### Rules for depend

dep: lex.cc yacc.cc yacc.h $(QCLIB)

	for i in *.cc; do \

	  $(CPP) -I$(QCDIR) -MM $$i; \

	done > .depend

lex.cc: qcl.lex yacc.h

	$(LEX) -olex.cc qcl.lex

yacc.cc: qcl.y

	$(YACC) -t -d -o yacc.cc qcl.y

yacc.h: yacc.cc

	mv yacc.*?h yacc.h

$(QCLIB):

	cd $(QCDIR) && $(MAKE) libqc.a

#### Rules for build

build: qcl $(QCLINC)/default.qcl

# Link the interpreter from all objects (including cuda_kernel.o) and libqc.
qcl: $(OBJECTS) qcl.o $(QCLIB)
	$(CXX) $(OBJECTS) qcl.o $(LDFLAGS) -o qcl

# cuda_kernel.o is listed in OBJECTS but make has no built-in rule for .cu
# files, and `make clean` deletes *.o.  Build it with nvcc here so a clean
# rebuild works.  All settings can be overridden from the command line.
NVCC              ?= nvcc
NVCCFLAGS         ?= -arch sm_11
CUDA_INSTALL_PATH ?= /usr/local/cuda
CUDA_SDK_INC      ?= /home/glu/NVIDIA_GPU_Computing_SDK/C/common/inc

cuda_kernel.o: cuda_kernel.cu
	$(NVCC) $(NVCCFLAGS) -c -I$(CUDA_SDK_INC) -I. -I$(QCDIR) -I$(CUDA_INSTALL_PATH)/include -o $@ $<

# default.qcl is extracted from the "//!" lines of extern.cc.
$(QCLINC)/default.qcl: extern.cc
	grep "^//!" extern.cc | cut -c5- > $(QCLINC)/default.qcl

checkinst:

	[ -f ./qcl -a -f $(QCLINC)/default.qcl ] || $(MAKE) build

install: checkinst

	$(INSTALL) -m 0755 -d $(QCLBIN) $(QCLDIR)

	$(INSTALL) -m 0755 ./qcl $(QCLBIN)

	$(INSTALL) -m 0644 ./$(QCLINC)/*.qcl $(QCLDIR)

uninstall:

	-rm -f $(QCLBIN)/qcl

	-rm -f $(QCLDIR)/*.qcl

	-rmdir $(QCLDIR)

#### Other Functions

edit:

	nedit $(SOURCE) &

clean:

	rm -f *.o lex.* yacc.* 

	cd $(QCDIR) && $(MAKE) clean

clear: clean

	rm -f qcl $(QCLINC)/default.qcl .depend

	cd $(QCDIR) && $(MAKE) clear

dist-src: dep

	mkdir qcl-$(VERSION)

	cp README CHANGES COPYING .depend $(SOURCE) qcl-$(VERSION) 

	mkdir qcl-$(VERSION)/qc

	cp qc/Makefile qc/*.h qc/*.cc qcl-$(VERSION)/qc

	cp -r lib qcl-$(VERSION)

	tar czf qcl-$(VERSION).tgz --owner=0 --group=0 qcl-$(VERSION)

	rm -r qcl-$(VERSION)

dist-bin: build

	mkdir qcl-$(VERSION)-$(ARCH)

	cp Makefile README CHANGES COPYING qcl qcl-$(VERSION)-$(ARCH) 

	cp -r lib qcl-$(VERSION)-$(ARCH)

	tar czf qcl-$(VERSION)-$(ARCH).tgz --owner=0 --group=0 qcl-$(VERSION)-$(ARCH)

	rm -r qcl-$(VERSION)-$(ARCH)

upload: dist-src

	scp qcl-$(VERSION)*.tgz oemer@tph.tuwien.ac.at:html/tgz

scp: dist-src

	scp qcl-$(VERSION).tgz oemer@tph.tuwien.ac.at:bak/qcl-$(DATE).tgz

I have another problem now. I have compiled my .cu file in which resides the kernel and a kernel caller function (“gpucaller”). It compiles fine and I get my .o file. I include this in the Makefile as described by @tera above.

Now from my cpp project, in the source file in which I make the call to “gpucaller”, do I need to also have an include to this .cu file at the top?

Right now I do have an include at the top, and it looks like this:

#include "cuda_kernel.cu"

extern "C"

void gpucaller(opBit *op, quBaseState *q);

But the problem is that when I compile my project (which has its own Makefile), the compiler doesn’t seem to recognize the CUDA `__constant__` and `__global__` keywords. I get an error like this: “expected constructor, destructor, or type conversion before ‘void’”.

My project’s Makefile is this one:

# Makefile for QCL

#

# This file is part of the Quantum Computation Language QCL.

# 

# (c) Copyright by Bernhard Oemer <oemer@tph.tuwien.ac.at> 1998-2006

# 

# This program comes without any warranty; without even the implied 

# warranty of merchantability or fitness for any particular purpose.

# 

#      This program is free software under the terms of the 

#      GNU General Public Licence (GPL) version 2 or higher

VERSION=0.6.3

# Directory for Standard .qcl files

QCLDIR = /usr/local/lib/qcl

# Path for qcl binaries

QCLBIN = /usr/local/bin

ARCH = `g++ -dumpmachine || echo bin`

# Comment out if you want to compile for a different target architecture

# To build libqc.a, you will also have to edit qc/Makefile!

#ARCH = i686-linux

#ARCHOPT = -m32 -march=i686

# Debugging and optimization options

#DEBUG = -g -pg -DQCL_DEBUG -DQC_DEBUG

#DEBUG = -g -DQCL_DEBUG -DQC_DEBUG

DEBUG = -O2 -g -DQCL_DEBUG -DQC_DEBUG

#DEBUG = -O2

# Plotting support 

#

# Comment out if you don't have GNU libplotter and X

PLOPT = -DQCL_PLOT

PLLIB = -L/usr/X11/lib -lplotter

# Readline support

#

# Comment out if you don't have GNU readline on your system

# explicit linking against libtermcap or libncurses may be required

RLOPT = -DQCL_USE_READLINE

#RLLIB = -lreadline

RLLIB = -lreadline -lncurses

# Interrupt support

#

# Comment out if your system doesn't support ANSI C signal handling

IRQOPT = -DQCL_IRQ

# Replace with lex and yacc on non-GNU systems (untested)

LEX = flex

YACC = bison 

INSTALL = install

##### You shouldn't have to edit the stuff below #####

DATE = `date +"%y.%m.%d-%H%M"`

QCDIR = qc

QCLIB = $(QCDIR)/libqc.a

QCLINC = lib

#CXX = g++

#CPP = $(CC) -E

CXXFLAGS = -c $(ARCHOPT) -Wall $(DEBUG) $(PLOPT) $(RLOPT) $(IRQOPT) -I$(QCDIR) -DDEF_INCLUDE_PATH="\"$(QCLDIR)\""

LDFLAGS = $(ARCHOPT) -L$(QCDIR) $(DEBUG) $(PLLIB) -lm -lfl -lqc $(RLLIB) -lcudart

FILESCC = $(wildcard *.cc)

FILESH = $(wildcard *.h)

SOURCE = $(FILESCC) $(FILESH) qcl.lex qcl.y Makefile

OBJECTS = types.o syntax.o typcheck.o symbols.o error.o \

          lex.o yacc.o print.o quheap.o extern.o eval.o exec.o \

          parse.o options.o debug.o cond.o dump.o plot.o format.o cuda_kernel.o

all: do-it-all

ifeq (.depend,$(wildcard .depend))

include .depend

do-it-all: build

else

do-it-all: dep

	$(MAKE)

endif

#### Rules for depend

dep: lex.cc yacc.cc yacc.h $(QCLIB)

	for i in *.cc; do \

	  $(CPP) -I$(QCDIR) -MM $$i; \

	done > .depend

lex.cc: qcl.lex yacc.h

	$(LEX) -olex.cc qcl.lex

yacc.cc: qcl.y

	$(YACC) -t -d -o yacc.cc qcl.y

yacc.h: yacc.cc

	mv yacc.*?h yacc.h

$(QCLIB):

	cd $(QCDIR) && $(MAKE) libqc.a

#### Rules for build

build: qcl $(QCLINC)/default.qcl

# Link the interpreter from all objects (including cuda_kernel.o) and libqc.
qcl: $(OBJECTS) qcl.o $(QCLIB)
	$(CXX) $(OBJECTS) qcl.o $(LDFLAGS) -o qcl

# cuda_kernel.o is listed in OBJECTS but make has no built-in rule for .cu
# files, and `make clean` deletes *.o.  Build it with nvcc here so a clean
# rebuild works.  All settings can be overridden from the command line.
NVCC              ?= nvcc
NVCCFLAGS         ?= -arch sm_11
CUDA_INSTALL_PATH ?= /usr/local/cuda
CUDA_SDK_INC      ?= /home/glu/NVIDIA_GPU_Computing_SDK/C/common/inc

cuda_kernel.o: cuda_kernel.cu
	$(NVCC) $(NVCCFLAGS) -c -I$(CUDA_SDK_INC) -I. -I$(QCDIR) -I$(CUDA_INSTALL_PATH)/include -o $@ $<

# default.qcl is extracted from the "//!" lines of extern.cc.
$(QCLINC)/default.qcl: extern.cc
	grep "^//!" extern.cc | cut -c5- > $(QCLINC)/default.qcl

checkinst:

	[ -f ./qcl -a -f $(QCLINC)/default.qcl ] || $(MAKE) build

install: checkinst

	$(INSTALL) -m 0755 -d $(QCLBIN) $(QCLDIR)

	$(INSTALL) -m 0755 ./qcl $(QCLBIN)

	$(INSTALL) -m 0644 ./$(QCLINC)/*.qcl $(QCLDIR)

uninstall:

	-rm -f $(QCLBIN)/qcl

	-rm -f $(QCLDIR)/*.qcl

	-rmdir $(QCLDIR)

#### Other Functions

edit:

	nedit $(SOURCE) &

clean:

	rm -f *.o lex.* yacc.* 

	cd $(QCDIR) && $(MAKE) clean

clear: clean

	rm -f qcl $(QCLINC)/default.qcl .depend

	cd $(QCDIR) && $(MAKE) clear

dist-src: dep

	mkdir qcl-$(VERSION)

	cp README CHANGES COPYING .depend $(SOURCE) qcl-$(VERSION) 

	mkdir qcl-$(VERSION)/qc

	cp qc/Makefile qc/*.h qc/*.cc qcl-$(VERSION)/qc

	cp -r lib qcl-$(VERSION)

	tar czf qcl-$(VERSION).tgz --owner=0 --group=0 qcl-$(VERSION)

	rm -r qcl-$(VERSION)

dist-bin: build

	mkdir qcl-$(VERSION)-$(ARCH)

	cp Makefile README CHANGES COPYING qcl qcl-$(VERSION)-$(ARCH) 

	cp -r lib qcl-$(VERSION)-$(ARCH)

	tar czf qcl-$(VERSION)-$(ARCH).tgz --owner=0 --group=0 qcl-$(VERSION)-$(ARCH)

	rm -r qcl-$(VERSION)-$(ARCH)

upload: dist-src

	scp qcl-$(VERSION)*.tgz oemer@tph.tuwien.ac.at:html/tgz

scp: dist-src

	scp qcl-$(VERSION).tgz oemer@tph.tuwien.ac.at:bak/qcl-$(DATE).tgz

Don’t include the .cu file anywhere else. Just make a .h header file with the function prototypes and common data structures, and include it in both the .cu and the .cpp file.

Don’t include the .cu file anywhere else. Just make a .h header file with the function prototypes and common data structures, and include it in both the .cu and the .cpp file.

Did just that. Now my c++ file looks like this (extern.cc):

#include "extern.h"  

#include "cuda.h"  

...  

// Builds the single-qubit operator named by the routine's one-letter id
// (H, X, Y, Z, S or T) and hands it, together with the quantum state q, to the
// CUDA wrapper gpucaller().
ROUTINE(ext_bit) {
    // NOTE(review): i and qbit look unused in this version — confirm that no
    // macro refers to them, then remove.
    int i;
    quState *qbit;
    PAR_QUSTATE(q,"q");
    // Start from null so op is never read uninitialized should EXTERR return
    // (presumably it leaves the routine — TODO confirm).
    opBit *op = 0;
    tComplex I(0,1);
    tComplex sg= inv ? -1 : 1;
    char c=(def->id())[0];

    // Only ids of exactly one character name an operator; anything else is
    // routed to the error case below.
    if(def->id().length()!=1) c='?';

    switch(c) {
        case 'H': op=new opBit(1,1,1,-1,sqrt(0.5)); break;
        case 'X': op=new opBit(0,1,1,0);        break;
        case 'Y': op=new opBit(0,-I,I,0);       break;
        case 'Z': op=new opBit(1,0,0,-1);       break;
        case 'S': op=new opBit(1,0,0,sg*I);     break;
        case 'T': op=new opBit(1,0,0,sqrt(0.5)+sg*sqrt(0.5)*I); break;
        case '?':
        default: EXTERR("unknown single qubit operator "+def->id());
    }

    // Pass the operator object `op`.  Passing the type name `opBit` (as the
    // earlier version did) is not an expression and makes the compiler report
    // "expected primary-expression before ',' token".
    gpucaller(op, q);

    qcl_delete(op);
    return 0;
}

The .h file looks like this (cuda.h):

// Host-side interface to the CUDA code in cuda_kernel.cu.  Include this header
// from both the .cu file and the C++ sources that call gpucaller().
//
// The guard avoids the reserved form "underscore + capital letter".
// NOTE(review): this file shares its name with the CUDA toolkit's own cuda.h;
// with -I"." on the nvcc command line, an #include <cuda.h> inside toolkit/SDK
// headers may resolve to this file instead.  Consider renaming — TODO confirm.
#ifndef QCL_CUDA_WRAPPER_H
#define QCL_CUDA_WRAPPER_H

#include "operator.h"
#include "qustates.h"

// Defined in cuda_kernel.cu (compiled by nvcc, C++ linkage).
void gpucaller(opBit* op, quBaseState* q);

#endif // QCL_CUDA_WRAPPER_H

And the .cu file looks like this (cuda_kernel.cu):

/* compiling with:

nvcc -arch sm_11 -c -I"/home/glu/NVIDIA_GPU_Computing_SDK/C/common/inc" -I"." -I"./qc" -I"/usr/local/cuda/include" -o cuda_kernel.o cuda_kernel.cu

*/

#ifndef _CUDA_KERNEL_H_

#define _CUDA_KERNEL_H_

#define MAX_QUBITS 25

#define BLOCKDIM 512

#define MAX_TERMS_PER_BLOCK (2*BLOCKDIM)

#define THREAD_MASK (~0ul << 1)

// includes

#include <cutil_inline.h>

#include "cuda.h"

#include "operator.h"

#include "qustates.h"

__constant__ float devOpBit[2][2];

/**
 * Kernel operating on the device array `a` of complex terms.  The body is
 * elided ("...") in this listing, so only the interface is documented.
 *
 * a       - device array of cuFloatComplex terms
 * N       - number of terms
 * qbCount - the number of gates to apply
 * blockGrpSize, k - NOTE(review): meaning not visible in this listing.
 *
 * gpucaller() launches it with BLOCKDIM threads per block.
 */
__global__ void qcl1(cuFloatComplex *a, int N, int qbCount, int blockGrpSize, int k)

{

    //int idx = blockIdx.x * BLOCKDIM + threadIdx.x;

    //int tx = threadIdx.x;

cuFloatComplex t0_0, t0_1, t1_0, t1_1;

    int x0_idx, x1_idx;

    int i, grpSize, b0_idx, b1_idx;

// Per-block shared buffer with room for MAX_TERMS_PER_BLOCK (2*BLOCKDIM) terms.
__shared__ cuFloatComplex aS[MAX_TERMS_PER_BLOCK];

    ...

}

/**
 * Host wrapper around the qcl1 kernel.
 *
 * Copies q->termsarray and the 2x2 operator taken from `op` to the device,
 * launches qcl1, copies the terms back into q->termsarray and releases the
 * device buffer.  CUDA runtime calls go through cutilSafeCall so a failure is
 * reported where it happens rather than showing up later as a wrong result.
 *
 * The "..." lines are parts that were elided in the original listing (they
 * define gates, blockGrpSize and k, among other things).
 */
void gpucaller(opBit* op, quBaseState* q) {
    // getDeviceReadyOpBit() yields a table of row pointers to doubles.  Such a
    // table can be neither reinterpreted as float** nor copied as one
    // contiguous block, so flatten it into a real float[2][2] matching the
    // layout of devOpBit before uploading.
    double** opRows = op->getDeviceReadyOpBit();
    float hostOpBit[2][2];
    for (int row = 0; row < 2; ++row) {
        for (int col = 0; col < 2; ++col) {
            hostOpBit[row][col] = (float)opRows[row][col];
        }
    }

    unsigned int timer = 0;
    cuFloatComplex *a_d;

    // Shift a long so the term count is not computed (and truncated) in int.
    long int N = 1L << q->mapbits();
    size_t size = sizeof(cuFloatComplex) * (size_t)N;

    // start timer
    cutilCheckError( cutCreateTimer( &timer));
    cutilCheckError( cutStartTimer( timer));

    // allocate device memory
    cutilSafeCall( cudaMalloc((void**)&a_d, size) );

    // copy host memory to device
    cutilSafeCall( cudaMemcpy(a_d, q->termsarray, size, cudaMemcpyHostToDevice) );

    // copy quantic operator to constant memory
    cutilSafeCall( cudaMemcpyToSymbol(devOpBit, hostOpBit, sizeof(hostOpBit), 0) );
    printf("Cuda errors: %s\n", cudaGetErrorString( cudaGetLastError() ) );

    // setup execution parameters: each block handles MAX_TERMS_PER_BLOCK terms,
    // so round the block count up.
    dim3 dimBlock(BLOCKDIM, 1, 1);
    int n_blocks = N/MAX_TERMS_PER_BLOCK + (N%MAX_TERMS_PER_BLOCK == 0 ? 0:1);
    dim3 dimGrid(n_blocks, 1, 1);

    ...

    // execute the kernel
    qcl1<<< dimGrid, dimBlock >>>(a_d, N, gates, blockGrpSize, k);

    // check if kernel execution generated an error
    cutilCheckMsg("Kernel execution failed");

    ...

    // copy result from device to host
    cutilSafeCall( cudaMemcpy(q->termsarray, a_d, size, cudaMemcpyDeviceToHost) );

    // stop timer
    cutilCheckError( cutStopTimer( timer));
    //printf( "GPU Processing time: %f (ms)\n", cutGetTimerValue( timer));
    cutilCheckError( cutDeleteTimer( timer));

    // cleanup memory on device
    cutilSafeCall( cudaFree(a_d) );
    cudaThreadExit();
}

I compile the .cu file with the command commented above in the source code.

Now when I compile my project, I get the following error referring to the line in my c++ file (extern.cc) where I call my gpucaller function:

expected primary-expression before ',' token

I figured this was because the types of gpucaller’s arguments aren’t recognized (?), although I’ve included the appropriate headers (operator.h and qustates.h) in my cuda.h file. So I tried compiling again, this time removing the arguments from the function’s definition and not passing any parameters. That compiles OK, so my hypothesis that the argument types aren’t recognized seems to be correct.

Can anyone help me with this problem?

Did just that. Now my c++ file looks like this (extern.cc):

#include "extern.h"  

#include "cuda.h"  

...  

// Builds the single-qubit operator named by the routine's one-letter id
// (H, X, Y, Z, S or T) and hands it, together with the quantum state q, to the
// CUDA wrapper gpucaller().
ROUTINE(ext_bit) {
    // NOTE(review): i and qbit look unused in this version — confirm that no
    // macro refers to them, then remove.
    int i;
    quState *qbit;
    PAR_QUSTATE(q,"q");
    // Start from null so op is never read uninitialized should EXTERR return
    // (presumably it leaves the routine — TODO confirm).
    opBit *op = 0;
    tComplex I(0,1);
    tComplex sg= inv ? -1 : 1;
    char c=(def->id())[0];

    // Only ids of exactly one character name an operator; anything else is
    // routed to the error case below.
    if(def->id().length()!=1) c='?';

    switch(c) {
        case 'H': op=new opBit(1,1,1,-1,sqrt(0.5)); break;
        case 'X': op=new opBit(0,1,1,0);        break;
        case 'Y': op=new opBit(0,-I,I,0);       break;
        case 'Z': op=new opBit(1,0,0,-1);       break;
        case 'S': op=new opBit(1,0,0,sg*I);     break;
        case 'T': op=new opBit(1,0,0,sqrt(0.5)+sg*sqrt(0.5)*I); break;
        case '?':
        default: EXTERR("unknown single qubit operator "+def->id());
    }

    // Pass the operator object `op`.  Passing the type name `opBit` (as the
    // earlier version did) is not an expression and makes the compiler report
    // "expected primary-expression before ',' token".
    gpucaller(op, q);

    qcl_delete(op);
    return 0;
}

The .h file looks like this (cuda.h):

// Host-side interface to the CUDA code in cuda_kernel.cu.  Include this header
// from both the .cu file and the C++ sources that call gpucaller().
//
// The guard avoids the reserved form "underscore + capital letter".
// NOTE(review): this file shares its name with the CUDA toolkit's own cuda.h;
// with -I"." on the nvcc command line, an #include <cuda.h> inside toolkit/SDK
// headers may resolve to this file instead.  Consider renaming — TODO confirm.
#ifndef QCL_CUDA_WRAPPER_H
#define QCL_CUDA_WRAPPER_H

#include "operator.h"
#include "qustates.h"

// Defined in cuda_kernel.cu (compiled by nvcc, C++ linkage).
void gpucaller(opBit* op, quBaseState* q);

#endif // QCL_CUDA_WRAPPER_H

And the .cu file looks like this (cuda_kernel.cu):

/* compiling with:

nvcc -arch sm_11 -c -I"/home/glu/NVIDIA_GPU_Computing_SDK/C/common/inc" -I"." -I"./qc" -I"/usr/local/cuda/include" -o cuda_kernel.o cuda_kernel.cu

*/

#ifndef _CUDA_KERNEL_H_

#define _CUDA_KERNEL_H_

#define MAX_QUBITS 25

#define BLOCKDIM 512

#define MAX_TERMS_PER_BLOCK (2*BLOCKDIM)

#define THREAD_MASK (~0ul << 1)

// includes

#include <cutil_inline.h>

#include "cuda.h"

#include "operator.h"

#include "qustates.h"

__constant__ float devOpBit[2][2];

/**
 * Kernel operating on the device array `a` of complex terms.  The body is
 * elided ("...") in this listing, so only the interface is documented.
 *
 * a       - device array of cuFloatComplex terms
 * N       - number of terms
 * qbCount - the number of gates to apply
 * blockGrpSize, k - NOTE(review): meaning not visible in this listing.
 *
 * gpucaller() launches it with BLOCKDIM threads per block.
 */
__global__ void qcl1(cuFloatComplex *a, int N, int qbCount, int blockGrpSize, int k)

{

    //int idx = blockIdx.x * BLOCKDIM + threadIdx.x;

    //int tx = threadIdx.x;

cuFloatComplex t0_0, t0_1, t1_0, t1_1;

    int x0_idx, x1_idx;

    int i, grpSize, b0_idx, b1_idx;

// Per-block shared buffer with room for MAX_TERMS_PER_BLOCK (2*BLOCKDIM) terms.
__shared__ cuFloatComplex aS[MAX_TERMS_PER_BLOCK];

    ...

}

/**
 * Host wrapper around the qcl1 kernel.
 *
 * Copies q->termsarray and the 2x2 operator taken from `op` to the device,
 * launches qcl1, copies the terms back into q->termsarray and releases the
 * device buffer.  CUDA runtime calls go through cutilSafeCall so a failure is
 * reported where it happens rather than showing up later as a wrong result.
 *
 * The "..." lines are parts that were elided in the original listing (they
 * define gates, blockGrpSize and k, among other things).
 */
void gpucaller(opBit* op, quBaseState* q) {
    // getDeviceReadyOpBit() yields a table of row pointers to doubles.  Such a
    // table can be neither reinterpreted as float** nor copied as one
    // contiguous block, so flatten it into a real float[2][2] matching the
    // layout of devOpBit before uploading.
    double** opRows = op->getDeviceReadyOpBit();
    float hostOpBit[2][2];
    for (int row = 0; row < 2; ++row) {
        for (int col = 0; col < 2; ++col) {
            hostOpBit[row][col] = (float)opRows[row][col];
        }
    }

    unsigned int timer = 0;
    cuFloatComplex *a_d;

    // Shift a long so the term count is not computed (and truncated) in int.
    long int N = 1L << q->mapbits();
    size_t size = sizeof(cuFloatComplex) * (size_t)N;

    // start timer
    cutilCheckError( cutCreateTimer( &timer));
    cutilCheckError( cutStartTimer( timer));

    // allocate device memory
    cutilSafeCall( cudaMalloc((void**)&a_d, size) );

    // copy host memory to device
    cutilSafeCall( cudaMemcpy(a_d, q->termsarray, size, cudaMemcpyHostToDevice) );

    // copy quantic operator to constant memory
    cutilSafeCall( cudaMemcpyToSymbol(devOpBit, hostOpBit, sizeof(hostOpBit), 0) );
    printf("Cuda errors: %s\n", cudaGetErrorString( cudaGetLastError() ) );

    // setup execution parameters: each block handles MAX_TERMS_PER_BLOCK terms,
    // so round the block count up.
    dim3 dimBlock(BLOCKDIM, 1, 1);
    int n_blocks = N/MAX_TERMS_PER_BLOCK + (N%MAX_TERMS_PER_BLOCK == 0 ? 0:1);
    dim3 dimGrid(n_blocks, 1, 1);

    ...

    // execute the kernel
    qcl1<<< dimGrid, dimBlock >>>(a_d, N, gates, blockGrpSize, k);

    // check if kernel execution generated an error
    cutilCheckMsg("Kernel execution failed");

    ...

    // copy result from device to host
    cutilSafeCall( cudaMemcpy(q->termsarray, a_d, size, cudaMemcpyDeviceToHost) );

    // stop timer
    cutilCheckError( cutStopTimer( timer));
    //printf( "GPU Processing time: %f (ms)\n", cutGetTimerValue( timer));
    cutilCheckError( cutDeleteTimer( timer));

    // cleanup memory on device
    cutilSafeCall( cudaFree(a_d) );
    cudaThreadExit();
}

I compile the .cu file with the command commented above in the source code.

Now when I compile my project, I get the following error referring to the line in my c++ file (extern.cc) where I call my gpucaller function:

expected primary-expression before ',' token

I figured this was because the types of gpucaller’s arguments aren’t recognized (?), although I’ve included the appropriate headers (operator.h and qustates.h) in my cuda.h file. So I tried compiling again, this time removing the arguments from the function’s definition and not passing any parameters. That compiles OK, so my hypothesis that the argument types aren’t recognized seems to be correct.

Can anyone help me with this problem?