nvcc with avx support cannot find gcc builtin intrinsics

I am trying to use AVX in a CUDA application (ccminer), but nvcc reports an error (in the CPU code only):

/usr/local/cuda/bin/nvcc -Xcompiler "-Wall -mavx" -O3 -I . -Xptxas "-abi=no -v" -gencode=arch=compute_50,code=\"sm_50,compute_50\" --maxrregcount=80 --ptxas-options=-v -I./compat/jansson -o x11/x11.o -c x11/x11.cu
/usr/lib/gcc/x86_64-linux-gnu/4.8/include/avxintrin.h(118): error: identifier "__builtin_ia32_addpd256" is undefined
[...]

This is just the first error. There are many ‘undefined’ builtin functions :-(

Everything is OK for C/C++ sources (files with .c or .cpp extensions), but compiling a .cu file produces the error :-( What am I doing wrong? I can compile ccminer, but I can only add AVX intrinsics to the .c files, not to the .cu files. I use the Intel intrinsics, not the gcc builtins directly.

Any help greatly appreciated. Thanks in advance.

Linux Mint (ubuntu 13) 64bit, gcc 4.8.1, cuda 6.5.

I do not expect AVX to work on the GPU. The .cu file contains a small portion of CPU-based code that I want to vectorize.

Someone else may be able to tell you explicitly what is wrong here, but the usual advice is to move the code that isn’t happy with nvcc into a .cpp file and compile it that way, then link it together with the remaining code in the .cu file.

Thanks. I am just a beginner in CUDA/vectorisation world.

So the general rule is to put only the GPU part of the code in .cu files and the CPU part in .c/.cpp files. Correct?

I wouldn’t say that’s the general rule. nvcc calls the gnu compiler under the hood to handle host code, so in general there shouldn’t be lots of incompatibilities. But when you do run into the occasional incompatibility, because of the way nvcc “preprocesses” the code before handing it off to gcc/g++, then the usual advice is to separate that code out into a .cpp file, and handle it that way.

Either approach should be OK in general.

Just in case someone wants to reproduce the error (maybe someone from NVIDIA wanted to issue a patch ;-).

I took the simple CUDA program from http://computer-graphics.se/hello-world-for-cuda.html. Added

#include <immintrin.h>

at the beginning, and compiled it with the command

nvcc cudahello.cu -Xcompiler -mavx

I got the error (one of many similar ones):

/usr/lib/gcc/x86_64-linux-gnu/4.8/include/avxintrin.h(118): error: identifier "__builtin_ia32_addpd256" is undefined

Here is the whole program:

#include <stdio.h>
#include <stdlib.h>

#if defined(__AVX__)
#include <immintrin.h>
#endif

// Number of elements in each of the arrays processed by the program.
const int N = 16; 
// Number of threads in the single block used for the kernel launch.
const int blocksize = 16; 
 
/*
 * Adds the integer offsets in b to the characters in a, in place.
 *
 * Each thread updates the single element selected by threadIdx.x, so the
 * kernel expects one 1-D block; blockIdx is not used, and launching several
 * blocks would apply the offsets more than once.
 *
 * a: device buffer of N chars, modified in place.
 * b: device buffer of N ints holding the per-element offsets.
 */
__global__ 
void hello(char *a, int *b) 
{
	const int i = threadIdx.x;

	// Guard against a block larger than N so no thread indexes past the buffers.
	if (i < N)
		a[i] += b[i];
}
 
/*
 * Stops the program with a diagnostic if a CUDA runtime call failed.
 *
 * status: value returned by the runtime call.
 * what:   short description of the call, used in the error message.
 */
static void checkCuda(cudaError_t status, const char *what)
{
	if (status != cudaSuccess) {
		fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

/*
 * Prints "Hello ", lets the GPU add the offsets in b to the characters of a,
 * and prints the transformed string.
 */
int main()
{
	// The explicit NULs leave room for the kernel to rewrite the characters
	// while the array stays NUL-terminated.
	char a[N] = "Hello \0\0\0\0\0\0";
	int b[N] = {15, 10, 6, 0, -11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

	char *ad = NULL;
	int *bd = NULL;
	const size_t csize = N * sizeof(char);
	const size_t isize = N * sizeof(int);

	printf("%s", a);

	checkCuda(cudaMalloc((void**)&ad, csize), "cudaMalloc(ad)");
	checkCuda(cudaMalloc((void**)&bd, isize), "cudaMalloc(bd)");
	checkCuda(cudaMemcpy(ad, a, csize, cudaMemcpyHostToDevice), "cudaMemcpy(a -> ad)");
	checkCuda(cudaMemcpy(bd, b, isize, cudaMemcpyHostToDevice), "cudaMemcpy(b -> bd)");

	dim3 dimBlock(blocksize, 1);
	dim3 dimGrid(1, 1);
	hello<<<dimGrid, dimBlock>>>(ad, bd);
	// A kernel launch returns nothing; launch errors must be queried explicitly.
	checkCuda(cudaGetLastError(), "hello kernel launch");

	// The blocking copy back also reports any error raised while the kernel ran.
	checkCuda(cudaMemcpy(a, ad, csize, cudaMemcpyDeviceToHost), "cudaMemcpy(ad -> a)");
	checkCuda(cudaFree(ad), "cudaFree(ad)");
	checkCuda(cudaFree(bd), "cudaFree(bd)");

	printf("%s\n", a);
	return EXIT_SUCCESS;
}

Compile with

nvcc cudahello.cu -Xcompiler -mavx

to get the error or with

nvcc cudahello.cu

to compile clean.

I think I have an answer. Functions like:

__builtin_ia32_addpd256

are built into gcc, and nvcc does not know about them. Since they are referenced in

immintrin.h

nvcc reports errors when compiling a .cu file with

immintrin.h

included. So we cannot mix CUDA features with gcc builtin functions in one file.