[Thrust] using thrust::unique causes LNK2001: unresolved external symbol __fatbinwrap ... _cuda_devi...

I’m using Visual Studio and have been using CUDA and Thrust successfully in this project for a while now. But as soon as I tried to use thrust::unique, the linker started complaining. I’m not sure what I could be doing wrong.

Error 20 error LNK2001: unresolved external symbol __fatbinwrap_66_tmpxft_00000888_00000000_17_cuda_device_runtime_compute_52_cpp1_ii_8b1a5d37

Code:

// Equality predicate for CoordXYZ values: two coordinates compare equal when
// their x, y and z members are all equal under operator==.
struct CoordXYZEquality{
	// Callable from both host and device code. Declared const because the
	// predicate has no state to modify, which lets it be invoked through a
	// const object or a const reference as well as through a temporary.
	__host__ __device__
		bool operator()(const CoordXYZ& o1, const CoordXYZ& o2) const {
		return o1.x == o2.x && o1.y == o2.y && o1.z == o2.z;
	}
};

// Container holding the CoordXYZ vertices to de-duplicate.
// NOTE(review): DeviceArray is not defined in this excerpt; presumably ptr()
// returns a raw device pointer, since it is wrapped in a device_ptr below.
DeviceArray<CoordXYZ> contiguous_triangles;

... add some values to the array ...

//convert raw ptr to thrust ptr
thrust::device_ptr<CoordXYZ> thrust_vertices(contiguous_triangles.ptr());

// Predicate instance passed to thrust::unique to decide whether two
// neighbouring vertices are duplicates.
CoordXYZEquality equal_cmp;
// Runs unique over [thrust_vertices, thrust_vertices + num_vertices) with the
// CUDA execution policy; `end` is the iterator returned by the call.
// num_vertices is defined outside this excerpt.
// NOTE(review): thrust::unique only collapses runs of *adjacent* equal
// elements; if the vertices are not sorted/grouped beforehand, non-adjacent
// duplicates will remain — confirm this is intended.
thrust::device_ptr<CoordXYZ> end = thrust::unique(thrust::cuda::par, thrust_vertices, thrust_vertices + num_vertices, equal_cmp);

I’ve also tried giving it the comparator a slightly different way.

// Same call as above, but passing a temporary CoordXYZEquality instead of a
// named predicate object. device_ptr is a class template, so the element type
// has to be spelled out in the declaration of `end`.
thrust::device_ptr<CoordXYZ> end = thrust::unique(thrust::cuda::par, thrust_vertices, thrust_vertices + num_vertices, CoordXYZEquality());

I’ll add, I’m on Cuda SDK 7.5 and VS 2013

I updated to CUDA 8.0 and still get the same error, with a slightly different symbol name:

Error 88 error LNK2001: unresolved external symbol __fatbinwrap_66_tmpxft_00006db0_00000000_18_cuda_device_runtime_compute_61_cpp1_ii_8b1a5d37

This may be the same problem reported in this Thrust issue:

https://github.com/thrust/thrust/issues/792

It seems I need to disable relocatable device code (rdc). But when I did, my program no longer compiles.

Can you build the simpleSeparateCompilation sample code?

Yes, I can.

What happens if you drop your thrust code into that project?

Same error. I added the following code to the main file “simpleSeparateCompilation.cu” and compiled in the Debug configuration.

Error 4 error LNK2001: unresolved external symbol __fatbinwrap_66_tmpxft_00006db0_00000000_18_cuda_device_runtime_compute_61_cpp1_ii_8b1a5d37 C:\ProgramData\NVIDIA Corporation\CUDA Samples\v8.0\0_Simple\simpleSeparateCompilation\simpleSeparateCompilation.device-link.obj simpleSeparateCompilation

#include <thrust/device_vector.h>
#include <thrust/remove.h>
#include <thrust/unique.h>
#include <thrust/binary_search.h>
#include <thrust/sort.h>
#include <thrust/system/cuda/execution_policy.h>

// Minimal Thrust repro: allocates a small int buffer on the device, runs
// thrust::unique over it with the CUDA execution policy and prints how many
// elements remain. Takes no arguments and returns nothing; the device buffer
// is released before returning.
// checkCudaErrors comes from the sample's helper_cuda.h, which the file this
// snippet was added to already includes.
void weld_verts(){

	const int kNumVertices = 128;
	// Size the allocation from the element type instead of a hard-coded 4.
	const size_t kNumBytes = kNumVertices * sizeof(int);

	int* ints = 0;
	checkCudaErrors(cudaMalloc((void**)&ints, kNumBytes));
	// cudaMalloc does not initialize the buffer; zero it so that the result of
	// unique (and therefore the printed count) is deterministic.
	checkCudaErrors(cudaMemset(ints, 0, kNumBytes));

	//convert raw ptr to thrust ptr
	thrust::device_ptr<int> thrust_vertices(ints);

	thrust::device_ptr<int> end = thrust::unique(thrust::cuda::par, thrust_vertices, thrust_vertices + kNumVertices);
	// The number of remaining elements is (new end - begin); the reverse
	// subtraction is negative and wraps around when stored in a uint64_t.
	uint64_t num_unique_vertices = end - thrust_vertices;
	// %llu with an explicit cast matches the 64-bit unsigned argument
	// (the previous %i expected a plain int).
	printf("[EXPORT] Vertex Count after Welding %llu\n", (unsigned long long)num_unique_vertices);

	// Release the device allocation so the helper does not leak.
	checkCudaErrors(cudaFree(ints));

}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
// Entry point: runs the Thrust snippet, then the sample's own test, prints the
// outcome and terminates the process with a matching exit status.
int main(int argc, char **argv)
{
    // Exercise the Thrust code before the sample's regular test.
    weld_verts();

    cout << sampleName << " starting..." << endl;

    const char **constArgv = (const char **)argv;
    runTest(argc, constArgv);

    // Read testResult only after runTest() has had the chance to change it.
    const bool passed = testResult;
    const char *verdict = passed ? "OK" : "ERROR";
    cout << sampleName << " completed, returned " << verdict << endl;

    const int exitCode = passed ? EXIT_SUCCESS : EXIT_FAILURE;
    exit(exitCode);
}

Here’s the whole file:

/*
 * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */


#include <thrust/device_vector.h>
#include <thrust/remove.h>
#include <thrust/unique.h>
#include <thrust/binary_search.h>
#include <thrust/sort.h>
#include <thrust/system/cuda/execution_policy.h>

// System includes.
#include <stdio.h>
#include <iostream>

// STL.
#include <vector>

// CUDA runtime.
#include <cuda_runtime.h>

// Helper functions and utilities to work with CUDA.
#include <helper_functions.h>
#include <helper_cuda.h>

// Device library includes.
#include "simpleDeviceLibrary.cuh"

using std::cout;
using std::endl;

using std::vector;

// Absolute tolerance used by runTest() when comparing the original host values
// with the values read back from the device.
#define EPS 1e-5

// Shorthand for unsigned int, used for sizes and thread indices in this file.
typedef unsigned int uint;
// Pointer to a function taking one float and returning a float; the kernel
// below calls its device function through a pointer of this type.
typedef float(*deviceFunc)(float);

// Name printed in the status messages emitted by main() and runTest().
const char *sampleName = "simpleSeparateCompilation";

////////////////////////////////////////////////////////////////////////////////
// Auto-Verification Code
// Overall pass/fail flag: starts as true, is cleared by runTest() when a
// result mismatch is found, and is read by main() to choose the exit status.
bool testResult = true;

////////////////////////////////////////////////////////////////////////////////
// Static device pointers to __device__ functions.
// These __device__ variables are initialized with multiplyByTwo / divideByTwo
// (presumably declared in simpleDeviceLibrary.cuh — not visible here).
// runTest() copies their values to the host with cudaMemcpyFromSymbol and
// passes them to the kernel as arguments.
__device__ deviceFunc dMultiplyByTwoPtr = multiplyByTwo;
__device__ deviceFunc dDivideByTwoPtr = divideByTwo;

////////////////////////////////////////////////////////////////////////////////
// Kernels
////////////////////////////////////////////////////////////////////////////////
//! Element-wise transform kernel.
//! The thread with flat index i replaces v[i] with f(v[i]); each thread
//! handles at most one element.
//! Threads whose index is >= size do nothing, so the launch may cover more
//! threads than there are elements.
//! Only the .x components of the block/thread indices are used.
////////////////////////////////////////////////////////////////////////////////
__global__ void transformVector(float *v, deviceFunc f, uint size)
{
    const uint idx = blockDim.x * blockIdx.x + threadIdx.x;

    // Threads past the end of the vector have nothing to process.
    if (idx >= size)
    {
        return;
    }

    const float input = v[idx];
    v[idx] = f(input);
}

////////////////////////////////////////////////////////////////////////////////
// Declaration, forward
void runTest(int argc, const char **argv);


// Minimal Thrust repro: allocates a small int buffer on the device, runs
// thrust::unique over it with the CUDA execution policy and prints how many
// elements remain. Takes no arguments and returns nothing; the device buffer
// is released before returning.
void weld_verts(){

	const int kNumVertices = 128;
	// Size the allocation from the element type instead of a hard-coded 4.
	const size_t kNumBytes = kNumVertices * sizeof(int);

	int* ints = 0;
	checkCudaErrors(cudaMalloc((void**)&ints, kNumBytes));
	// cudaMalloc does not initialize the buffer; zero it so that the result of
	// unique (and therefore the printed count) is deterministic.
	checkCudaErrors(cudaMemset(ints, 0, kNumBytes));

	//convert raw ptr to thrust ptr
	thrust::device_ptr<int> thrust_vertices(ints);

	thrust::device_ptr<int> end = thrust::unique(thrust::cuda::par, thrust_vertices, thrust_vertices + kNumVertices);
	// The number of remaining elements is (new end - begin); the reverse
	// subtraction is negative and wraps around when stored in a uint64_t.
	uint64_t num_unique_vertices = end - thrust_vertices;
	// %llu with an explicit cast matches the 64-bit unsigned argument
	// (the previous %i expected a plain int).
	printf("[EXPORT] Vertex Count after Welding %llu\n", (unsigned long long)num_unique_vertices);

	// Release the device allocation so the helper does not leak.
	checkCudaErrors(cudaFree(ints));

}




////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
// Entry point: runs the Thrust snippet, then the sample's own test, prints the
// outcome and terminates the process with a matching exit status.
int main(int argc, char **argv)
{
    // Exercise the Thrust code before the sample's regular test.
    weld_verts();

    cout << sampleName << " starting..." << endl;

    const char **constArgv = (const char **)argv;
    runTest(argc, constArgv);

    // Read testResult only after runTest() has had the chance to change it.
    const bool passed = testResult;
    const char *verdict = passed ? "OK" : "ERROR";
    cout << sampleName << " completed, returned " << verdict << endl;

    const int exitCode = passed ? EXIT_SUCCESS : EXIT_FAILURE;
    exit(exitCode);
}


////////////////////////////////////////////////////////////////////////////////
//! Runs the sample's self-check.
//! Fills a host vector with random floats, copies it to the device, applies
//! the multiply-by-two and then the divide-by-two device functions through
//! function pointers read back from device symbols, and compares the result
//! with the original values (tolerance EPS).
//! Sets the global testResult to false on a mismatch. Exits the process if no
//! suitable device is found or if any exception is thrown.
//! @param argc  argument count forwarded to findCudaDevice
//! @param argv  argument vector forwarded to findCudaDevice
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, const char **argv)
{
    try
    {
        int devID;

        cudaDeviceProp deviceProp;

        // This will pick the best possible CUDA capable device.
        devID = findCudaDevice(argc, (const char **) argv);

        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

        if (deviceProp.major < 2)
        {
            cout << sampleName
                 << " requires a GPU with compute capability "
                 << "2.0 or later, exiting..." << endl;
            exit(EXIT_SUCCESS);
        }

        // Create host vector.
        const uint kVectorSize = 1000;

        vector<float> hVector(kVectorSize);

        for (uint i = 0; i < kVectorSize; ++i)
        {
            hVector[i] = rand() / static_cast<float>(RAND_MAX);
        }

        // Create and populate device vector.
        // Initialized so the guard before cudaFree never reads an
        // indeterminate pointer.
        float *dVector = 0;
        checkCudaErrors(cudaMalloc(&dVector, kVectorSize * sizeof(float)));

        checkCudaErrors(cudaMemcpy(dVector,
                                   &hVector[0],
                                   kVectorSize * sizeof(float),
                                   cudaMemcpyHostToDevice));

        // Kernel configuration, where a one-dimensional
        // grid and one-dimensional blocks are configured.
        // The block count is derived by ceiling division so every element is
        // covered even if kVectorSize grows beyond a single block.
        const uint nThreads = 1024;
        const uint nBlocks = (kVectorSize + nThreads - 1) / nThreads;

        dim3 dimGrid(nBlocks);
        dim3 dimBlock(nThreads);

        // Test library functions.
        deviceFunc hFunctionPtr;

        // Read the device-side function pointer back to the host so it can be
        // passed as a kernel argument. The copy is checked like every other
        // runtime call: a failure here would otherwise launch the kernel with
        // an indeterminate pointer.
        checkCudaErrors(cudaMemcpyFromSymbol(&hFunctionPtr,
                                             dMultiplyByTwoPtr,
                                             sizeof(deviceFunc)));
        transformVector<<<dimGrid, dimBlock>>>
        (dVector, hFunctionPtr, kVectorSize);
        checkCudaErrors(cudaGetLastError());

        checkCudaErrors(cudaMemcpyFromSymbol(&hFunctionPtr,
                                             dDivideByTwoPtr,
                                             sizeof(deviceFunc)));
        transformVector<<<dimGrid, dimBlock>>>
        (dVector, hFunctionPtr, kVectorSize);
        checkCudaErrors(cudaGetLastError());

        // Download results.
        vector<float> hResultVector(kVectorSize);

        checkCudaErrors(cudaMemcpy(&hResultVector[0],
                                   dVector,
                                   kVectorSize *sizeof(float),
                                   cudaMemcpyDeviceToHost));

        // Check results: multiplying by two and then dividing by two should
        // give back the original values.
        // The index is unsigned to match kVectorSize.
        for (uint i = 0; i < kVectorSize; ++i)
        {
            if (fabs(hVector[i] - hResultVector[i]) > EPS)
            {
                cout << "Computations were incorrect..." << endl;
                testResult = false;
                break;
            }
        }

        // Free resources.
        if (dVector) checkCudaErrors(cudaFree(dVector));
    }
    catch (...)
    {
        cout << "Error occured, exiting..." << endl;

        exit(EXIT_FAILURE);
    }
}

I have a very similar problem with another Thrust function, copy_if. The following minimal example compiles and runs properly when the field
Properties → Configuration Properties → Cuda C/C++ → Device → Code Generation is set to “compute_30,sm_30” (without the quotes).

However, if I change that field to “compute_35,sm_35” (without the quotes), compilation fails with the following message:
“Error LNK2001 unresolved external symbol __fatbinwrap_66_tmpxft_00009eb4_00000000_18_cuda_device_runtime_compute_61_cpp1_ii_8b1a5d37”

#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/copy.h>
#include <stdio.h>  

// Unary predicate: true for values strictly less than 1.0f.
struct isFore
{
	// __host__ __device__ so the same predicate can be evaluated on the host
	// as well as on the device; const because it has no state to modify,
	// which allows calling it through a const object or reference.
	__host__ __device__
		bool operator()(const float x) const
	{
		return x < 1.0f;
	}
};


// Minimal thrust::copy_if example using the stencil overload: copies the
// indices 0..3 whose corresponding value in x satisfies isFore into dI,
// brings the selected indices back to the host and prints the matching values.
int main()
{
	float x[4] = { 1.0f, 0.2f, 3.0f, 0.4f };

	// Device copy of the values; used as the stencil sequence below.
	thrust::device_vector<float> dX(x, x + 4);
	// Output buffer for the selected indices, one slot per input value.
	thrust::device_vector<int> dI(dX.size());

	// Input range is the index sequence [0, 4); dEnd marks the end of the
	// indices actually written to dI.
	thrust::device_vector<int>::iterator dEnd =
		thrust::copy_if(thrust::make_counting_iterator(0),
		                thrust::make_counting_iterator(4),
		                dX.begin(),
		                dI.begin(),
		                isFore());

	// Copy only the written part of dI back to the host.
	thrust::host_vector<int> hI(dI.begin(), dEnd);

	printf("Elements less than 1.0: ");
	const size_t selected = hI.size();
	for (size_t k = 0; k != selected; ++k)
	{
		const int index = hI[k];
		printf("%f  ", x[index]);
	}

	return 0;
}

I am using CUDA 8.0, Nsight Visual Studio Edition 5.3 with Visual Studio 2015, Thrust v1.8.3, GeForce GTX 1050 Ti with compute capability 6.1.

I have also dropped the above code inside a fresh copy of simpleSeparateCompilation from Cuda v8.0 samples using the Visual Studio 2015 solution included in the sample’s directory. It compiles and properly runs. However, if I remove “compute_30,sm_30” and all lower compute capabilities from that Visual Studio 2015 solution’s properties, compilation fails with the same error as above.