Compilation error of sample from http://devblogs.nvidia.com/parallelforall/cuda-7-release-candidate-feature-overview/
c++11_cuda.cu
#include <initializer_list>
#include <iostream>
#include <cstring>
// Generic parallel find routine. Threads search through the
// array in parallel. A thread returns the index of the
// first value it finds that satisfies predicate `p`, or -1.
template <typename T, typename Predicate>
__device__ int find(T *data, int n, Predicate p)
{
for (int i = blockIdx.x * blockDim.x + threadIdx.x;
i < n;
i += blockDim.x * gridDim.x)
{
if (p(data[i])) return i;
}
return -1;
}
// Use find with a lambda function that searches for x, y, z
// or w. Note the use of range-based for loop and
// initializer_list inside the functor, and auto means we
// don't have to know the type of the lambda or the array
__global__
void xyzw_frequency(unsigned int *count, char *data, int n)
{
auto match_xyzw = [](char c) {
for (const auto x : { 'x','y','z','w' })
if (c == x) return true;
return false;
};
int i = find(data, n, match_xyzw);
if (i >= 0) atomicAdd(count, 1);
}
int main(void)
{
char text[] = "zebra xylophone wax";
char *d_text;
cudaMalloc(&d_text, sizeof(text));
cudaMemcpy(d_text, text, sizeof(text),
cudaMemcpyHostToDevice);
unsigned int *d_count;
cudaMalloc(&d_count, sizeof(unsigned int));
cudaMemset(d_count, 0, sizeof(unsigned int));
xyzw_frequency<<<1, 64>>>(d_count, d_text,
strlen(text));
unsigned int count;
cudaMemcpy(&count, d_count, sizeof(unsigned int),
cudaMemcpyDeviceToHost);
std::cout << count << " instances of 'x', 'y', 'z', "
<< "in " << text << std::endl;
cudaFree(d_count);
cudaFree(d_text);
return 0;
}
$ /usr/local/cuda-7.0/bin/nvcc -std=c++11 c++11_cuda.cu -o c++11_cuda
c++11_cuda.cu(28): internal error: assertion failed: remove_from_variables_list: not found (/dvs/p4/build/sw/rel/gpu_drv/r346/r346_00/drivers/compiler/edg/EDG_4.9/src/il.c, line 13467)
1 catastrophic error detected in the compilation of “/tmp/tmpxft_00001d59_00000000-9_c++11_cuda.cpp1.ii”.
Compilation aborted.
Aborted (core dumped)
Compilation without error
test_auto.cu
#include <cuda.h>
#include <iostream>
__host__ void test() {
float a = 42;
auto b = a;
std::cout << b << std::endl;
}
int main()
{
test();
return 0;
}
$/usr/local/cuda-7.0/bin/nvcc -std=c++11 test_auto.cu -o test_auto
$./test_auto
42
============================================
GPU
CUDA Device Query (Runtime API) version (CUDART static linking)
Detected 1 CUDA Capable device(s)
Device 0: “Quadro K600”
CUDA Driver Version / Runtime Version 7.0 / 7.0
CUDA Capability Major/Minor version number: 3.0
Total amount of global memory: 1023 MBytes (1073020928 bytes)
( 1) Multiprocessors, (192) CUDA Cores/MP: 192 CUDA Cores
GPU Clock rate: 876 MHz (0.88 GHz)
Memory Clock rate: 891 Mhz
Memory Bus Width: 128-bit
L2 Cache Size: 262144 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 1 copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
Device supports Unified Addressing (UVA): Yes
Device PCI Domain ID / Bus ID / location ID: 0 / 1 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 7.0, CUDA Runtime Version = 7.0, NumDevs = 1, Device0 = Quadro K600
Result = PASS
system
Linux sneg 3.13.0-44-generic #73-Ubuntu SMP Tue Dec 16 00:22:43 UTC 2014 x86_64 x86_64 x86_64 GNU/Linux
Distributor ID: Ubuntu
Description: Ubuntu 14.04.1 LTS
Release: 14.04
Codename: trusty
g++ -v
Using built-in specs.
COLLECT_GCC=g++
COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/4.8/lto-wrapper
Target: x86_64-linux-gnu
Configured with: …/src/configure -v --with-pkgversion=‘Ubuntu 4.8.2-19ubuntu1’ --with-bugurl=file:///usr/share/doc/gcc-4.8/README.Bugs --enable-languages=c,c++,java,go,d,fortran,objc,obj-c++ --prefix=/usr --program-suffix=-4.8 --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --with-gxx-include-dir=/usr/include/c++/4.8 --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --enable-gnu-unique-object --disable-libmudflap --enable-plugin --with-system-zlib --disable-browser-plugin --enable-java-awt=gtk --enable-gtk-cairo --with-java-home=/usr/lib/jvm/java-1.5.0-gcj-4.8-amd64/jre --enable-java-home --with-jvm-root-dir=/usr/lib/jvm/java-1.5.0-gcj-4.8-amd64 --with-jvm-jar-dir=/usr/lib/jvm-exports/java-1.5.0-gcj-4.8-amd64 --with-arch-directory=amd64 --with-ecj-jar=/usr/share/java/eclipse-ecj.jar --enable-objc-gc --enable-multiarch --disable-werror --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu
Thread model: posix
gcc version 4.8.2 (Ubuntu 4.8.2-19ubuntu1)
install
sudo dpkg -i cuda-repo-ubuntu1404-7-0-rc_7.0-18_amd64.deb
sudo apt-get update
sudo apt-get install cuda