Determining Registers Per Thread and Shared Memory Per Thread Block CUDA Occupancy Calculator

I changed my build rule from:

"$(CUDA_BIN_PATH)\nvcc.exe" -ccbin "$(VCInstallDir)bin" -c -D_DEBUG -DWIN32 -D_CONSOLE -D_MBCS -Xcompiler /EHsc,/W3,/nologo,/Wp64,/Od,/Zi,/RTC1,/MTd -I"$(CUDA_INC_PATH)" -I./ -I../../common/inc -I"$(DXSDK_DIR)\Include" -Xcudafe --diag_suppress=unsigned_compare_with_negative -o $(ConfigurationName)\texture_2d.obj texture_2d.cu

to

"$(CUDA_BIN_PATH)\nvcc.exe" --ptxas-options=-v -keep "$(VCInstallDir)bin" -c -D_DEBUG -DWIN32 -D_CONSOLE -D_MBCS -Xcompiler /EHsc,/W3,/nologo,/Wp64,/Od,/Zi,/RTC1,/MTd -I"$(CUDA_INC_PATH)" -I./ -I../../common/inc -I"$(DXSDK_DIR)\Include" -Xcudafe --diag_suppress=unsigned_compare_with_negative -o $(ConfigurationName)\texture_2d.obj texture_2d.cu

but I keep getting this strange error:

1>nvcc fatal : A single input file is required for a non-link phase when an outputfile is specified

Anyone who can help me out?

Thanks in advance!

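Solution (my modified rule had dropped the -ccbin option, so nvcc treated "$(VCInstallDir)bin" as a second input file; putting -ccbin back in front of that path fixes it):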

"$(CUDA_BIN_PATH)\nvcc.exe" -keep --ptxas-options=-v -ccbin "$(VCInstallDir)bin" -c -D_DEBUG -DWIN32 -D_CONSOLE -D_MBCS -Xcompiler /EHsc,/W3,/nologo,/Wp64,/Od,/Zi,/RTC1,/MTd -I"$(CUDA_INC_PATH)" -I./ -I../../common/inc -I"$(DXSDK_DIR)\Include" -Xcudafe --diag_suppress=unsigned_compare_with_negative -o $(ConfigurationName)\texture_2d.obj texture_2d.cu

Got it solved, thanks anyways!

I am not quite sure where to put the command -pxtas-options=-v in the make File. I get the following error:

nvcc fatal : Unknown option ‘pxtas-options’

make: *** [obj//mt19937_ref.cu.o] Error 255

Here is my Makefile. Any help would be greatly appreciated!!

[codebox] ############################################################

####################

############################################################

####################

# Build script for project

############################################################

####################

# Add source files here
EXECUTABLE := evolve_smaller_particles

# Cuda source files (compiled with cudacc)
CUFILES := mt19937_ref.cu simpleGL.cu

# C++ source files (compiled with $(CXX))
CCFILES := genmtrand.cpp

# Feature switches tested by the library setup further down (1 = enabled).
USEGLLIB   := 1
USEPARAMGL := 0
USEGLUT    := 1

################################################################################
# Rules and targets
################################################################################

.SUFFIXES : .cu .cu_dbg.o .c_dbg.o .cpp_dbg.o .cu_rel.o .c_rel.o .cpp_rel.o .cubin

# Add new SM Versions here as devices with new Compute Capability are released
SM_VERSIONS := sm_10 sm_11 sm_12 sm_13

# CUDA toolkit location. Defaults to /usr/local/cuda unless already set;
# "make cuda-install=<path>" takes precedence over both.
CUDA_INSTALL_PATH ?= /usr/local/cuda

ifdef cuda-install
  CUDA_INSTALL_PATH := $(cuda-install)
endif

# detect OS
# Simple assignment (":=") runs `uname` once at parse time instead of on every
# expansion. The tr character classes are quoted so the shell cannot
# glob-expand them against files in the current directory.
OSUPPER := $(shell uname -s 2>/dev/null | tr '[:lower:]' '[:upper:]')
OSLOWER := $(shell uname -s 2>/dev/null | tr '[:upper:]' '[:lower:]')

# 'linux' is output for Linux system, 'darwin' for OS X
# DARWIN is non-empty only when the upper-cased system name contains "DARWIN".
DARWIN := $(strip $(findstring DARWIN, $(OSUPPER)))

# Basic directory setup for SDK
# (override directories only if they are not already defined)
SRCDIR     ?=
ROOTDIR    ?= ../..
ROOTBINDIR ?= .
BINDIR     ?= $(ROOTBINDIR)
ROOTOBJDIR ?= obj
LIBDIR     := $(ROOTDIR)/lib
COMMONDIR  := $(ROOTDIR)/common
MODULEDIR  := ./modules/

# Compilers
NVCC := $(CUDA_INSTALL_PATH)/bin/nvcc
CXX  := g++
CC   := gcc
# Link with the same driver that compiles the C++ sources.
LINK := $(CXX) -fPIC

# Includes
INCLUDES += -I. \
            -I$(CUDA_INSTALL_PATH)/include \
            -I$(COMMONDIR)/inc \
            -I./include \
            -I/usr/include/netcdf-3 \
            -I./kernel \
            -I/usr/include/vtk-5.0

# architecture flag for cubin build
CUBIN_ARCH_FLAG := -m32

# Warning flags
# Each list is one logical line: every physical line except the last ends in
# a backslash and no blank line may appear inside the list, otherwise the
# assignment ends early and the remaining flags are not part of the variable.
CXXWARN_FLAGS := \
    -W -Wall \
    -Wimplicit \
    -Wswitch \
    -Wformat \
    -Wchar-subscripts \
    -Wparentheses \
    -Wmultichar \
    -Wtrigraphs \
    -Wpointer-arith \
    -Wcast-align \
    -Wreturn-type \
    -Wno-unused-function

# C builds get every C++ warning above plus these additional ones.
CWARN_FLAGS := $(CXXWARN_FLAGS) \
    -Wstrict-prototypes \
    -Wmissing-prototypes \
    -Wmissing-declarations \
    -Wnested-externs \
    -Wmain

# Compiler-specific flags
# --ptxas-options=-v asks ptxas to report per-kernel resource usage (registers,
# shared memory) at compile time. The option needs TWO leading dashes and is
# spelled "ptxas"; "-pxtas-options" is rejected by nvcc as an unknown option.
# To also keep the intermediate files, build with "make keep=1", which adds
# -keep and registers the leftovers for "make clean".
NVCCFLAGS := --ptxas-options=-v
CXXFLAGS  := $(CXXWARN_FLAGS) -Wno-deprecated
CFLAGS    := $(CWARN_FLAGS)

# Common flags
COMMONFLAGS += $(INCLUDES) -DUNIX

# Debug/release configuration
# "make dbg=1" selects the debug branch; anything else builds optimized.
ifeq ($(dbg),1)
  COMMONFLAGS += -g
  NVCCFLAGS   += -D_DEBUG
  BINSUBDIR   :=
  LIBSUFFIX   := D
else
  COMMONFLAGS += -O3
  BINSUBDIR   :=
  LIBSUFFIX   :=
  NVCCFLAGS   += --compiler-options -fno-strict-aliasing
  CXXFLAGS    += -fno-strict-aliasing
  CFLAGS      += -fno-strict-aliasing
endif

# append optional arch/SM version flags (such as -arch sm_11)
# Left disabled: the .cu compile rules pass $(SMVERSIONFLAGS) explicitly.
#NVCCFLAGS += $(SMVERSIONFLAGS)

# architecture flag for cubin build
CUBIN_ARCH_FLAG := -m32

# detect if 32 bit or 64 bit system
# Non-empty when the machine name reported by `uname -m` contains "64".
# ":=" so the shell command runs once rather than on every expansion.
HP_64 := $(shell uname -m | grep 64)

# OpenGL is used or not (if it is used, then it is necessary to include GLEW)
ifeq ($(USEGLLIB),1)
  ifneq ($(DARWIN),)
    OPENGLLIB := -L/System/Library/Frameworks/OpenGL.framework/Libraries -lGL -lGLU $(COMMONDIR)/lib/$(OSLOWER)/libGLEW.a
  else
    OPENGLLIB := -lGL -lGLU -lX11 -lXi -lXmu
    # The quoted form of ifeq needs plain ASCII double quotes; typographic
    # quotes are not recognized as delimiters.
    ifeq "$(strip $(HP_64))" ""
      OPENGLLIB += -lGLEW -L/usr/X11R6/lib
    else
      OPENGLLIB += -lGLEW_x86_64 -L/usr/X11R6/lib64
    endif
  endif
  # NOTE(review): -m64 is set whenever USEGLLIB=1, regardless of HP_64 --
  # confirm this is intended on 32-bit hosts.
  CUBIN_ARCH_FLAG := -m64
endif

# GLUT: link the framework on OS X (DARWIN non-empty), libglut elsewhere.
ifeq ($(USEGLUT),1)
  OPENGLLIB += $(if $(DARWIN),-framework GLUT,-lglut)
endif

# Optional helper libraries; LIBSUFFIX is appended to the library name.
ifeq ($(USEPARAMGL),1)
  PARAMGLLIB := -lparamgl$(LIBSUFFIX)
endif

ifeq ($(USERENDERCHECKGL),1)
  RENDERCHECKGLLIB := -lrendercheckgl$(LIBSUFFIX)
endif

# CUDPP: library name is -lcudpp or -lcudpp64 (chosen by HP_64), followed by
# LIBSUFFIX, followed by _emu when building with emu=1.
ifeq ($(USECUDPP), 1)
  ifeq "$(strip $(HP_64))" ""
    CUDPPLIB := -lcudpp
  else
    CUDPPLIB := -lcudpp64
  endif

  CUDPPLIB := $(CUDPPLIB)$(LIBSUFFIX)

  ifeq ($(emu), 1)
    CUDPPLIB := $(CUDPPLIB)_emu
  endif
endif

# Libs
LIB := -L$(CUDA_INSTALL_PATH)/lib -L$(LIBDIR) -L$(COMMONDIR)/lib/$(OSLOWER)
LIB += -lnetcdf_c++ -lnetcdf -L/usr/lib/netcdf-3 -lgsl -lgslcblas -lcufft

# Driver API links libcuda, otherwise the runtime library libcudart.
# LIB is simply expanded, so the trailing ${LIB} repeats everything already
# collected above after the newly added libraries.
ifeq ($(USEDRVAPI),1)
  LIB += -lcuda ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB}
else
  LIB += -lcudart ${OPENGLLIB} $(PARAMGLLIB) $(RENDERCHECKGLLIB) $(CUDPPLIB) ${LIB}
endif

# With emu=1 the "emu" variant of CUFFT / CUBLAS is linked instead.
ifeq ($(USECUFFT),1)
  ifeq ($(emu),1)
    LIB += -lcufftemu
  else
    LIB += -lcufft
  endif
endif

ifeq ($(USECUBLAS),1)
  ifeq ($(emu),1)
    LIB += -lcublasemu
  else
    LIB += -lcublas
  endif
endif

# Lib/exe configuration
# LINKLINE stays recursively expanded ("=") because OBJS is only filled in
# further down; it must be expanded when the link recipe runs.
ifneq ($(STATIC_LIB),)
  TARGETDIR := $(LIBDIR)
  TARGET    := $(subst .a,$(LIBSUFFIX).a,$(LIBDIR)/$(STATIC_LIB))
  LINKLINE   = ar qv $(TARGET) $(OBJS)
else
  LIB += -lcutil$(LIBSUFFIX)

  # Device emulation configuration
  ifeq ($(emu), 1)
    NVCCFLAGS   += -deviceemu
    CUDACCFLAGS +=
    BINSUBDIR   := emu$(BINSUBDIR)
    # consistency, makes developing easier
    CXXFLAGS    += -D__DEVICE_EMULATION__
    CFLAGS      += -D__DEVICE_EMULATION__
  endif

  TARGETDIR := $(BINDIR)/$(BINSUBDIR)
  TARGET    := $(TARGETDIR)/$(EXECUTABLE)
  LINKLINE   = $(LINK) -o $(TARGET) $(OBJS) $(LIB)
endif

# check if verbose
# VERBOSE prefixes the recipe lines: empty with "make verbose=1" so commands
# are echoed, "@" otherwise so make does not echo them.
ifeq ($(verbose), 1)
  VERBOSE :=
else
  VERBOSE := @
endif

################################################################################
# Check for input flags and set compiler flags appropriately
################################################################################
ifeq ($(fastmath), 1)
  NVCCFLAGS += -use_fast_math
endif

# keep=1 adds -keep and records the file patterns that "clean" removes.
ifeq ($(keep), 1)
  NVCCFLAGS       += -keep
  NVCC_KEEP_CLEAN := *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx
endif

ifdef maxregisters
  NVCCFLAGS += -maxrregcount $(maxregisters)
endif

# Add cudacc flags
NVCCFLAGS += $(CUDACCFLAGS)

# workaround for mac os x cuda 1.1 compiler issues
ifneq ($(DARWIN),)
  NVCCFLAGS += --host-compilation=C
endif

# Add common flags
NVCCFLAGS += $(COMMONFLAGS)
CXXFLAGS  += $(COMMONFLAGS)
CFLAGS    += $(COMMONFLAGS)

# nvcc_warn_verbose=1 forwards each C++ warning flag to the host compiler.
ifeq ($(nvcc_warn_verbose),1)
  NVCCFLAGS += $(addprefix --compiler-options ,$(CXXWARN_FLAGS))
  NVCCFLAGS += --compiler-options -fno-strict-aliasing
endif

################################################################################
# Set up object files
################################################################################
# Every source maps to $(OBJDIR)/<file name>.<ext>.o; directory parts of the
# source names are dropped by $(notdir).
OBJDIR := $(ROOTOBJDIR)/$(BINSUBDIR)
OBJS   += $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(notdir $(CCFILES)))
OBJS   += $(patsubst %.c,$(OBJDIR)/%.c.o,$(notdir $(CFILES)))
OBJS   += $(patsubst %.cu,$(OBJDIR)/%.cu.o,$(notdir $(CUFILES)))

################################################################################
# Set up cubin files
################################################################################
# One .cubin under $(CUBINDIR) per file listed in CUBINFILES.
CUBINDIR := $(SRCDIR)data
CUBINS   += $(patsubst %.cu,$(CUBINDIR)/%.cubin,$(notdir $(CUBINFILES)))

################################################################################
# Rules
################################################################################
# Recipe lines must begin with a hard TAB.

# C sources
$(OBJDIR)/%.c.o : $(SRCDIR)%.c $(C_DEPS)
	$(VERBOSE)$(CC) $(CFLAGS) -o $@ -c $<

# C++ sources next to the Makefile
$(OBJDIR)/%.cpp.o : $(SRCDIR)%.cpp $(C_DEPS)
	$(VERBOSE)$(CXX) $(CXXFLAGS) -o $@ -c $<

# C++ sources located in $(MODULEDIR)
$(OBJDIR)/%.cpp.o : $(MODULEDIR)%.cpp $(C_DEPS)
	$(VERBOSE)$(CXX) $(CXXFLAGS) -o $@ -c $<

# CUDA sources -> host object files
$(OBJDIR)/%.cu.o : $(SRCDIR)%.cu $(CU_DEPS)
	$(VERBOSE)$(NVCC) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -c $<

# CUDA sources -> cubin files
$(CUBINDIR)/%.cubin : $(SRCDIR)%.cu cubindirectory
	$(VERBOSE)$(NVCC) $(CUBIN_ARCH_FLAG) $(NVCCFLAGS) $(SMVERSIONFLAGS) -o $@ -cubin $<

# The following definition is a template that gets instantiated for each SM
# version (sm_10, sm_13, etc.) stored in SM_VERSIONS. It does 2 things:
# 1. It adds to OBJS a .cu_sm_XX.o for each .cu file it finds in CUFILES_sm_XX.
# 2. It generates a rule for building .cu_sm_XX.o files from the corresponding
#    .cu file.
#
# The intended use for this is to allow Makefiles that use common.mk to compile
# files to different Compute Capability targets (aka SM arch version). To do
# so, in the Makefile, list files for each SM arch separately, like so:
#
# CUFILES_sm_10 := mycudakernel_sm10.cu app.cu
# CUFILES_sm_12 := anothercudakernel_sm12.cu
#
# $(1) is the SM version. $$@ and $$< are doubled so that they survive the
# $(call) expansion and are only expanded when the generated recipe runs.
define SMVERSION_template
OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu_$(1).o,$(notdir $(CUFILES_$(1))))
$(OBJDIR)/%.cu_$(1).o : $(SRCDIR)%.cu $(CU_DEPS)
	$(VERBOSE)$(NVCC) -o $$@ -c $$< $(NVCCFLAGS) -arch $(1)
endef

# This line invokes the above template for each arch version stored in
# SM_VERSIONS. The call function invokes the template, and the eval
# function interprets it as make commands.
$(foreach smver,$(SM_VERSIONS),$(eval $(call SMVERSION_template,$(smver))))

# makedirectories and cubindirectory are commands, not files.
.PHONY: makedirectories cubindirectory

# Final link (or archive) step; the command itself comes from LINKLINE.
$(TARGET): makedirectories $(OBJS) $(CUBINS) $(LIBDIR)/libcutil.so $(LIBDIR)/libparamgl.so $(LIBDIR)/librendercheckgl.so Makefile
	$(VERBOSE)$(LINKLINE)

# Helper libraries are built by sub-makes in $(COMMONDIR). $(MAKE) is used
# instead of a literal "make" so that -j, -n and command-line variables are
# passed on to the sub-make.
$(LIBDIR)/libcutil.so:
	@$(MAKE) -C $(COMMONDIR)

$(LIBDIR)/libparamgl.so:
	@$(MAKE) -C $(COMMONDIR) -f Makefile_paramgl

$(LIBDIR)/librendercheckgl.so:
	@$(MAKE) -C $(COMMONDIR) -f Makefile_rendercheckgl

cubindirectory:
	$(VERBOSE)mkdir -p $(CUBINDIR)

makedirectories:
	$(VERBOSE)mkdir -p $(LIBDIR)
	$(VERBOSE)mkdir -p $(OBJDIR)
	$(VERBOSE)mkdir -p $(TARGETDIR)
	@echo $(COMMONFLAGS)

# Housekeeping targets are commands, not files; without .PHONY a file named
# "clean" (or "tidy"/"clobber") would make the target look up to date.
.PHONY: tidy clean clobber

# Remove files whose path contains "#" or "~".
tidy :
	$(VERBOSE)find . | egrep "#" | xargs rm -f
	$(VERBOSE)find . | egrep "\~" | xargs rm -f

# Remove objects, cubins, the target and (with keep=1) nvcc intermediates.
clean : tidy
	$(VERBOSE)rm -f $(OBJS)
	$(VERBOSE)rm -f $(CUBINS)
	$(VERBOSE)rm -f $(TARGET)
	$(VERBOSE)rm -f $(NVCC_KEEP_CLEAN)

# Additionally remove the whole object directory tree.
clobber : clean
	$(VERBOSE)rm -rf $(ROOTOBJDIR)

[/codebox]

I just realized this is Cuda on XP forum. I am running my code on Linux. Sorry for posting in the wrong forum, any response would be appreciated anyways !!

It’s --ptxas-options=-v I believe.

You need a double minus in front of ptxas (and note the spelling: ptxas, not pxtas) ;)