nvcc / ptxas 4.0 compiler segfault

Hi,

When I compile this stripped-down code segment with nvcc 4.0, I get a segfault during the ptxas step.

// Self-referential node type used by the repro kernel: an integer field plus
// a pointer to another s_trace.
struct s_trace

{

    // Written by kernel() through tptr->index.
    int index;

    // Pointer to another node; kernel() both reads it (to advance) and overwrites it.
    s_trace *next;

};

// Stripped-down CUDA kernel that reproduces the ptxas segfault described in
// this report. The code is intentionally kept exactly as posted: it is a
// compiler-crash repro, not a meaningful computation.
//
// ptrace: pointer to an s_trace node; dereferenced and advanced via ->next on
//         each outer iteration.
// pi:     pointer to an int whose value is the starting count of the inner loop.
__global__ void kernel (s_trace *ptrace, int *pi)

{

    s_trace *tptr;

    // Declared outside both loops, so its value carries over from one outer
    // iteration to the next (it is also the inner loop's counter).
    int inode = 0;

// Outer loop runs exactly twice (itarget = 1, 2); itarget is not used in the body.
for(int itarget = 1; itarget <= 2; itarget++)

    {

        // Keep the current node, then advance ptrace before the inner loop
        // overwrites tptr->next.
        tptr = ptrace;

        ptrace = ptrace->next;

        // Stores 0 on the first pass; on the second pass it stores whatever
        // the previous inner loop left in inode (0 once that loop has exited).
        tptr->index = inode;

// Counts inode down from *pi until it reaches 0.
// NOTE(review): a negative *pi cannot reach 0 by decrementing without signed
// overflow — irrelevant for the compile-time crash, but worth knowing.
for (inode = *pi ; inode != 0; inode--)

        {

            // Makes the node point at itself; the same store is repeated on every pass.
            tptr->next = tptr;

        }

    }

}

Here is my command line, and the compiler output:

$ nvcc -v -keep test3.cu -c -o test3.o

#$ _SPACE_= 

#$ _CUDART_=cudart

#$ _HERE_=/scratch/tianwu3/bailey/cuda/bin

#$ _THERE_=/scratch/tianwu3/bailey/cuda/bin

#$ _TARGET_SIZE_=64

#$ TOP=/scratch/tianwu3/bailey/cuda/bin/..

#$ LD_LIBRARY_PATH=/scratch/tianwu3/bailey/cuda/bin/../lib:/scratch/tianwu3/bailey/cuda/bin/../extools/lib::/scratch/tianwu3/bailey/cuda/lib64

#$ PATH=/scratch/tianwu3/bailey/cuda/bin/../open64/bin:/scratch/tianwu3/bailey/cuda/bin:/usr/bin:/bin:/opt/gnu/bin:/opt/local/bin:/sbin:/usr/sbin:/scratch/tianwu3/bailey/cuda/bin

#$ INCLUDES="-I/scratch/tianwu3/bailey/cuda/bin/../include" "-I/scratch/tianwu3/bailey/cuda/bin/../include/cudart"  

#$ LIBRARIES=  "-L/scratch/tianwu3/bailey/cuda/bin/../lib64" -lcudart

#$ CUDAFE_FLAGS=

#$ OPENCC_FLAGS=

#$ PTXAS_FLAGS=

#$ gcc -D__CUDA_ARCH__=100 -E -x c++ -DCUDA_NO_SM_13_DOUBLE_INTRINSICS  -DCUDA_FLOAT_MATH_FUNCTIONS -DCUDA_NO_SM_11_ATOMIC_INTRINSICS -DCUDA_NO_SM_12_ATOMIC_INTRINSICS  -D__CUDACC__ -C  "-I/scratch/tianwu3/bailey/cuda/bin/../include" "-I/scratch/tianwu3/bailey/cuda/bin/../include/cudart"   -include "cuda_runtime.h" -m64 -o "test3.cpp1.ii" "test3.cu" 

#$ cudafe --m64 --gnu_version=40403 -tused --no_remove_unneeded_entities  --gen_c_file_name "test3.cudafe1.c" --stub_file_name "test3.cudafe1.stub.c" --gen_device_file_name "test3.cudafe1.gpu" --include_file_name "test3.fatbin.c" "test3.cpp1.ii" 

#$ gcc -D__CUDA_ARCH__=100 -E -x c -DCUDA_NO_SM_13_DOUBLE_INTRINSICS  -DCUDA_FLOAT_MATH_FUNCTIONS -DCUDA_NO_SM_11_ATOMIC_INTRINSICS -DCUDA_NO_SM_12_ATOMIC_INTRINSICS  -D__CUDACC__ -C  -D__CUDA_FTZ "-I/scratch/tianwu3/bailey/cuda/bin/../include" "-I/scratch/tianwu3/bailey/cuda/bin/../include/cudart"   -m64 -o "test3.cpp2.i" "test3.cudafe1.gpu" 

#$ cudafe --m64 --gnu_version=40403 --c  --gen_c_file_name "test3.cudafe2.c" --stub_file_name "test3.cudafe2.stub.c" --gen_device_file_name "test3.cudafe2.gpu" --include_file_name "test3.fatbin.c" "test3.cpp2.i" 

#$ gcc -D__CUDA_ARCH__=100 -E -x c -DCUDA_NO_SM_13_DOUBLE_INTRINSICS  -DCUDA_FLOAT_MATH_FUNCTIONS -DCUDA_NO_SM_11_ATOMIC_INTRINSICS -DCUDA_NO_SM_12_ATOMIC_INTRINSICS  -D__CUDABE__  -D__CUDA_FTZ "-I/scratch/tianwu3/bailey/cuda/bin/../include" "-I/scratch/tianwu3/bailey/cuda/bin/../include/cudart"   -m64 -o "test3.cpp3.i" "test3.cudafe2.gpu" 

#$ filehash -s " " "test3.cpp3.i" > "test3.hash"

#$ gcc -E -x c++ -D__CUDACC__ -C  "-I/scratch/tianwu3/bailey/cuda/bin/../include" "-I/scratch/tianwu3/bailey/cuda/bin/../include/cudart"   -include "cuda_runtime.h" -m64 -o "test3.cpp4.ii" "test3.cu" 

#$ cudafe++ --m64 --gnu_version=40403 --parse_templates  --gen_c_file_name "test3.cudafe1.cpp" --stub_file_name "test3.cudafe1.stub.c" "test3.cpp4.ii" 

#$ nvopencc  -TARG:compute_10 -m64 -OPT:ftz=1 -CG:ftz=1 -CG:prec_div=0 -CG:prec_sqrt=0  "test3" "test3.cpp3.i"  -o "test3.ptx"

#$ ptxas  -arch=sm_10 -m64  "test3.ptx"  -o "test3.sm_10.cubin" 

Segmentation fault

# --error 0x8b --

I am running Ubuntu 10.04.3 (kernel: 2.6.32-33-generic #72-Ubuntu SMP Fri Jul 29 21:07:13 UTC 2011 x86_64 GNU/Linux).

nvcc release 4.0, V0.2.1221

Segfaults in the toolchain are obviously not supposed to happen. Since you already have a small repro case in hand, it would be helpful if you could file a bug against the compiler attaching self-contained repro code. Thank you for your help, and sorry for the inconvenience.

For a temporary workaround, you could experiment with lowering the PTXAS optimization level via -Xptxas -O{0|1|2|3}. The default is -Xptxas -O3. Note that it is not recommended to use component-specific flags in production environments as these are generally unsupported.

Thanks. -Xptxas -O1 works as a temporary workaround, although -Xptxas -O0 and -Xptxas -O2 do not.

I filed a bug report at NVOnline.