I’m using Visual Studio and have been successfully using CUDA and Thrust in this project for a while now. But now that I’ve tried to use thrust::unique, the linker is complaining. I’m not sure what I could be doing wrong.
Error 20 error LNK2001: unresolved external symbol __fatbinwrap_66_tmpxft_00000888_00000000_17_cuda_device_runtime_compute_52_cpp1_ii_8b1a5d37
Code:
// Equality predicate for thrust::unique: two coordinates compare equal when
// all three components are equal (exact ==, no tolerance).
struct CoordXYZEquality {
    // const-qualified so the comparator can be invoked through a const object
    // or const reference, which generic algorithm code is free to do.
    __host__ __device__
    bool operator()(const CoordXYZ& o1, const CoordXYZ& o2) const {
        return o1.x == o2.x && o1.y == o2.y && o1.z == o2.z;
    }
};
// Buffer holding the vertices to be de-duplicated.
// NOTE(review): DeviceArray is a project type not shown here; presumably ptr()
// returns a raw device pointer — confirm.
DeviceArray<CoordXYZ> contiguous_triangles;
... add some values to the array ...
//convert raw ptr to thrust ptr
thrust::device_ptr<CoordXYZ> thrust_vertices(contiguous_triangles.ptr());
CoordXYZEquality equal_cmp;
// Run thrust::unique over [thrust_vertices, thrust_vertices + num_vertices) with
// the explicit CUDA execution policy; `end` receives the returned iterator.
// NOTE(review): thrust::unique only merges *consecutive* equal elements, so the
// input presumably has to be sorted first for a full de-duplication — confirm.
thrust::device_ptr<CoordXYZ> end = thrust::unique(thrust::cuda::par, thrust_vertices, thrust_vertices + num_vertices, equal_cmp);
I’ve also tried passing the comparator in a slightly different way:
// Same call with a temporary comparator; device_ptr must name its element type.
thrust::device_ptr<CoordXYZ> end = thrust::unique(thrust::cuda::par, thrust_vertices, thrust_vertices + num_vertices, CoordXYZEquality());
I’ll add that I’m on CUDA SDK 7.5 and VS 2013.
I updated to CUDA 8.0 and still get the same error, with a slightly different symbol name:
Error 88 error LNK2001: unresolved external symbol __fatbinwrap_66_tmpxft_00006db0_00000000_18_cuda_device_runtime_compute_61_cpp1_ii_8b1a5d37
This may be the same problem reported here:
https://github.com/thrust/thrust/issues/792
It seems I need to disable relocatable device code (rdc). But I did, and now my program won’t compile.
Can you build the simpleSeparateCompilation sample code?
What happens if you drop your thrust code into that project?
Same error. I added the following code to the main file “simpleSeparateCompilation.cu” and compiled in the Debug configuration.
Error 4 error LNK2001: unresolved external symbol __fatbinwrap_66_tmpxft_00006db0_00000000_18_cuda_device_runtime_compute_61_cpp1_ii_8b1a5d37 C:\ProgramData\NVIDIA Corporation\CUDA Samples\v8.0\0_Simple\simpleSeparateCompilation\simpleSeparateCompilation.device-link.obj simpleSeparateCompilation
#include <thrust/device_vector.h>
#include <thrust/remove.h>
#include <thrust/unique.h>
#include <thrust/binary_search.h>
#include <thrust/sort.h>
#include <thrust/system/cuda/execution_policy.h>
// Repro for the thrust::unique link error: allocates a scratch device buffer,
// runs thrust::unique over it on the CUDA backend and prints how many elements
// remain.
// NOTE: the buffer contents are never initialised, so the printed count is not
// meaningful; the function exists to force thrust::unique to be compiled and
// linked.
void weld_verts(){
    const int kNumInts = 128;
    int* ints = NULL;
    const cudaError_t alloc_status = cudaMalloc((void**)&ints, kNumInts * sizeof(int));
    if (alloc_status != cudaSuccess) {
        printf("[EXPORT] cudaMalloc failed: %s\n", cudaGetErrorString(alloc_status));
        return;
    }
    //convert raw ptr to thrust ptr
    thrust::device_ptr<int> thrust_vertices(ints);
    thrust::device_ptr<int> end = thrust::unique(thrust::cuda::par, thrust_vertices, thrust_vertices + kNumInts);
    // The surviving range is [thrust_vertices, end), so its length is
    // end - begin; begin - end would be negative and wrap when stored unsigned.
    uint64_t num_unique_vertices = end - thrust_vertices;
    // A 64-bit value needs a 64-bit conversion specifier; %i expects an int.
    printf("[EXPORT] Vertex Count after Welding %llu\n", static_cast<unsigned long long>(num_unique_vertices));
    // Release the scratch buffer so repeated calls do not leak device memory.
    const cudaError_t free_status = cudaFree(ints);
    if (free_status != cudaSuccess) {
        printf("[EXPORT] cudaFree failed: %s\n", cudaGetErrorString(free_status));
    }
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
// Entry point: runs the thrust::unique repro first, then the sample's own
// test, and exits with a status derived from testResult.
int main(int argc, char **argv)
{
    weld_verts();

    cout << sampleName << " starting..." << endl;
    runTest(argc, const_cast<const char **>(argv));

    const char *outcome = testResult ? "OK" : "ERROR";
    cout << sampleName << " completed, returned " << outcome << endl;

    const int exitCode = testResult ? EXIT_SUCCESS : EXIT_FAILURE;
    exit(exitCode);
}
Here’s the whole file:
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <thrust/device_vector.h>
#include <thrust/remove.h>
#include <thrust/unique.h>
#include <thrust/binary_search.h>
#include <thrust/sort.h>
#include <thrust/system/cuda/execution_policy.h>
// System includes.
#include <stdio.h>
#include <iostream>
// STL.
#include <vector>
// CUDA runtime.
#include <cuda_runtime.h>
// Helper functions and utilities to work with CUDA.
#include <helper_functions.h>
#include <helper_cuda.h>
// Device library includes.
#include "simpleDeviceLibrary.cuh"
using std::cout;
using std::endl;
using std::vector;
// Absolute tolerance used by runTest when comparing the input vector with the
// result vector.
#define EPS 1e-5
typedef unsigned int uint;
// Pointer to a function that takes a float and returns a float.
typedef float(*deviceFunc)(float);
// Name printed in the status messages of main and runTest.
const char *sampleName = "simpleSeparateCompilation";
////////////////////////////////////////////////////////////////////////////////
// Auto-Verification Code
// Overall pass/fail flag: runTest sets it to false when the result check
// fails, and main derives the process exit status from it.
bool testResult = true;
////////////////////////////////////////////////////////////////////////////////
// Static device pointers to __device__ functions.
// runTest copies these to the host with cudaMemcpyFromSymbol and passes the
// copied value to the transformVector kernel.
// NOTE(review): multiplyByTwo / divideByTwo are presumably declared in
// simpleDeviceLibrary.cuh — confirm.
__device__ deviceFunc dMultiplyByTwoPtr = multiplyByTwo;
__device__ deviceFunc dDivideByTwoPtr = divideByTwo;
////////////////////////////////////////////////////////////////////////////////
// Kernels
////////////////////////////////////////////////////////////////////////////////
//! Kernel: applies the __device__ function pointed to by "f" to each of the
//! first "size" elements of "v", in place.
//! Each thread handles the single element at its flat 1D global index;
//! threads whose index is at or beyond "size" do nothing.
////////////////////////////////////////////////////////////////////////////////
__global__ void transformVector(float *v, deviceFunc f, uint size)
{
    const uint idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads past the end of the vector have no work to do.
    if (idx >= size)
    {
        return;
    }

    v[idx] = f(v[idx]);
}
////////////////////////////////////////////////////////////////////////////////
// Declaration, forward
void runTest(int argc, const char **argv);
// Repro for the thrust::unique link error: allocates a scratch device buffer,
// runs thrust::unique over it on the CUDA backend and prints how many elements
// remain.
// NOTE: the buffer contents are never initialised, so the printed count is not
// meaningful; the function exists to force thrust::unique to be compiled and
// linked.
void weld_verts(){
    const int kNumInts = 128;
    int* ints = NULL;
    // Same error-checking wrapper that runTest uses for its runtime API calls.
    checkCudaErrors(cudaMalloc((void**)&ints, kNumInts * sizeof(int)));
    //convert raw ptr to thrust ptr
    thrust::device_ptr<int> thrust_vertices(ints);
    thrust::device_ptr<int> end = thrust::unique(thrust::cuda::par, thrust_vertices, thrust_vertices + kNumInts);
    // The surviving range is [thrust_vertices, end), so its length is
    // end - begin; begin - end would be negative and wrap when stored unsigned.
    uint64_t num_unique_vertices = end - thrust_vertices;
    // A 64-bit value needs a 64-bit conversion specifier; %i expects an int.
    printf("[EXPORT] Vertex Count after Welding %llu\n", static_cast<unsigned long long>(num_unique_vertices));
    // Release the scratch buffer so repeated calls do not leak device memory.
    checkCudaErrors(cudaFree(ints));
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
// Entry point: runs the thrust::unique repro first, then the sample's own
// test, and exits with a status derived from testResult.
int main(int argc, char **argv)
{
    weld_verts();

    cout << sampleName << " starting..." << endl;
    runTest(argc, const_cast<const char **>(argv));

    const char *outcome = testResult ? "OK" : "ERROR";
    cout << sampleName << " completed, returned " << outcome << endl;

    const int exitCode = testResult ? EXIT_SUCCESS : EXIT_FAILURE;
    exit(exitCode);
}
// Runs the sample's self-check.
//
// Fills a host vector with random values, copies it to the device, applies the
// device functions referenced by dMultiplyByTwoPtr and then dDivideByTwoPtr to
// every element via transformVector, copies the result back and compares it
// with the input using the EPS tolerance. Sets testResult to false on a
// mismatch. Exits the process if the selected GPU has compute capability
// below 2.0 or if an exception is thrown.
//
// argc/argv are forwarded to findCudaDevice for device selection.
void runTest(int argc, const char **argv)
{
    try
    {
        int devID;
        //cudaError_t error;
        cudaDeviceProp deviceProp;

        // This will pick the best possible CUDA capable device.
        devID = findCudaDevice(argc, (const char **) argv);
        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

        if (deviceProp.major < 2)
        {
            cout << sampleName
                 << " requires a GPU with compute capability "
                 << "2.0 or later, exiting..." << endl;
            exit(EXIT_SUCCESS);
        }

        // Create host vector.
        const uint kVectorSize = 1000;
        vector<float> hVector(kVectorSize);

        for (uint i = 0; i < kVectorSize; ++i)
        {
            hVector[i] = rand() / static_cast<float>(RAND_MAX);
        }

        // Create and populate device vector.
        float *dVector = NULL;
        checkCudaErrors(cudaMalloc(&dVector, kVectorSize * sizeof(float)));
        checkCudaErrors(cudaMemcpy(dVector,
                                   &hVector[0],
                                   kVectorSize * sizeof(float),
                                   cudaMemcpyHostToDevice));

        // Kernel configuration: a one-dimensional grid of one-dimensional
        // blocks. The block count is derived by ceil-division so the launch
        // keeps covering the whole vector if kVectorSize is changed.
        const uint nThreads = 1024;
        const uint nBlocks = (kVectorSize + nThreads - 1) / nThreads;
        dim3 dimGrid(nBlocks);
        dim3 dimBlock(nThreads);

        // Test library functions.
        // The device function pointers must be read back from their __device__
        // symbols before they can be passed as kernel arguments; a failed copy
        // would leave hFunctionPtr unset, so the return code is checked.
        deviceFunc hFunctionPtr;

        checkCudaErrors(cudaMemcpyFromSymbol(&hFunctionPtr,
                                             dMultiplyByTwoPtr,
                                             sizeof(deviceFunc)));
        transformVector<<<dimGrid, dimBlock>>>
        (dVector, hFunctionPtr, kVectorSize);
        checkCudaErrors(cudaGetLastError());

        checkCudaErrors(cudaMemcpyFromSymbol(&hFunctionPtr,
                                             dDivideByTwoPtr,
                                             sizeof(deviceFunc)));
        transformVector<<<dimGrid, dimBlock>>>
        (dVector, hFunctionPtr, kVectorSize);
        checkCudaErrors(cudaGetLastError());

        // Download results.
        vector<float> hResultVector(kVectorSize);
        checkCudaErrors(cudaMemcpy(&hResultVector[0],
                                   dVector,
                                   kVectorSize * sizeof(float),
                                   cudaMemcpyDeviceToHost));

        // Check results: after both transforms each element is expected to
        // match its input value to within EPS.
        // The index is unsigned to match kVectorSize.
        for (uint i = 0; i < kVectorSize; ++i)
        {
            if (fabs(hVector[i] - hResultVector[i]) > EPS)
            {
                cout << "Computations were incorrect..." << endl;
                testResult = false;
                break;
            }
        }

        // Free resources.
        if (dVector) checkCudaErrors(cudaFree(dVector));
    }
    catch (...)
    {
        cout << "Error occured, exiting..." << endl;
        exit(EXIT_FAILURE);
    }
}
I have a very similar problem with another Thrust function, copy_if. The following minimal example compiles and runs properly when the field
Properties → Configuration Properties → CUDA C/C++ → Device → Code Generation is set to “compute_30,sm_30” (without the quotes).
However, if I change that field to “compute_35,sm_35” (without the quotes), compilation fails with the following message:
“Error LNK2001 unresolved external symbol __fatbinwrap_66_tmpxft_00009eb4_00000000_18_cuda_device_runtime_compute_61_cpp1_ii_8b1a5d37”
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/copy.h>
#include <stdio.h>
// Predicate for thrust::copy_if: true for values strictly below 1.0f.
struct isFore
{
    // __host__ __device__ and const-qualified so the predicate can be
    // evaluated on either host or device and called through a const object.
    __host__ __device__
    bool operator()(const float x) const
    {
        return x < 1.0f;
    }
};
// Minimal repro for the copy_if link error: collects the indices of the
// elements of x that satisfy isFore and prints the selected values.
int main()
{
    float x[] = { 1.0f, 0.2f, 3.0f, 0.4f };
    // Element count derived from the array so the ranges below cannot drift
    // out of sync with the initialiser.
    const int kCount = static_cast<int>(sizeof(x) / sizeof(x[0]));

    thrust::device_vector<float> dX(x, x + kCount);
    thrust::device_vector<int> dI(dX.size());

    // Stencil form of copy_if: index i from the counting range is written to
    // dI when isFore holds for dX[i].
    thrust::device_vector<int>::iterator dEnd = thrust::copy_if(
        thrust::make_counting_iterator(0),
        thrust::make_counting_iterator(kCount),
        dX.begin(),
        dI.begin(),
        isFore());

    // Only [dI.begin(), dEnd) holds selected indices.
    thrust::host_vector<int> hI(dI.begin(), dEnd);

    printf("Elements less than 1.0: ");
    // size_t index matches the unsigned type returned by hI.size().
    for (size_t i = 0; i < hI.size(); ++i)
        printf("%f ", x[hI[i]]);
    return 0;
}
I am using CUDA 8.0, Nsight Visual Studio Edition 5.3 with Visual Studio 2015, Thrust v1.8.3, GeForce GTX 1050 Ti with compute capability 6.1.
I have also dropped the above code inside a fresh copy of simpleSeparateCompilation from Cuda v8.0 samples using the Visual Studio 2015 solution included in the sample’s directory. It compiles and properly runs. However, if I remove “compute_30,sm_30” and all lower compute capabilities from that Visual Studio 2015 solution’s properties, compilation fails with the same error as above.