How to fix Error MSB3721

I'm on a Windows 10 64-bit machine with CUDA 8.0, a GeForce 610M (which is still supported), and Visual Studio 2015 (with the CUDA integration installed).

I compiled the Device Query and VectorAdd samples, and they work just fine.
Then I created my own merge sort code, and this error came up:

Error MSB3721 The command ““D:\Apps Skripsi\CUDA v8.0\v8.0\bin\nvcc.exe” -dlink -o x64\Debug\MergesortCUDA.device-link.obj -Xcompiler “/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd " -L"D:\Apps Skripsi\CUDA v8.0\v8.0\lib\x64” cudart.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib -gencode=arch=compute_21,code=sm_21 -G --machine 64 x64\Debug\kernel.cu.obj” exited with code 1. MergesortCUDA C:\Program Files (x86)\MSBuild\Microsoft.Cpp\v4.0\V140\BuildCustomizations\CUDA 8.0.targets 773

So I copied and pasted my own merge sort code into VectorAdd.cu to check whether the linker was the problem, but the same error still came up. How can I fix this? Thanks.

MSB3721 is the VS way of saying “I ran nvcc, and it returned an error code.”

Other than knowing that your compilation failed, it is completely useless for understanding why it failed.

To understand why, it’s necessary to increase the verbosity of VS output so that it shows the actual invocation of nvcc and the actual error reported by nvcc (prior to VS reporting the MSB3721 error).

If you google how to increase verbosity of VS output, you’ll be able to find articles explaining how.

Thanks for your advice.
I set the build output verbosity to Detailed, and yes, a lot of build output came out. One issue was the platform toolset, which was fixed right after I edited the .vcxproj. Then there was something else:

nvcc : unsupported gpu architecture “compute_21”

On this link http://stackoverflow.com/questions/21198105/cuda-unsupported-gpu-architecture-eclipse-nsight
it said there is such a thing as compute_21, so I changed the Code Generation setting in Properties, but then more than 200 errors came out. I looked at https://en.wikipedia.org/wiki/CUDA, and it says that CUDA 8.0 still supports CC 2.0 - 6.x. So what could possibly be wrong, sir? Thanks.

There is no such thing as compute_21

There is sm_21, but no compute_21

So you should specify

compute_20,sm_21

Sorry, I meant that there is no such thing.
I tried that (compute_20,sm_21) in Code Generation in Visual Studio, but I still got MSB3721 and more than 200 errors. The funny thing is that when I changed the build to x86, the number of errors went down. By the way, I am using Visual Studio Enterprise 2015; does that have any effect? Do you have any other advice? Thanks, man.

Here my mergesort code:

#include <stdio.h>
#include <conio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <stdint.h>
#include <string.h>
#include "cuda.h"
#include "device_launch_parameters.h"
#include "cuda_runtime.h"
#include "cuda_device_runtime_api.h"

// Forward declaration of the device-side merge step used by the kernel below.
// NOTE(review): this declares `float hi`, while the definition of merge()
// later in this listing takes `int hi`. In C++ those are two different
// overloads. The call inside mergeBU passes a float and only this declaration
// is visible at that point, so the call refers to the `float hi` overload,
// for which this listing provides no body.
__device__ void merge(size_t *a, size_t *aux, int lo, int mid, float hi);

// Bottom-up merge sort kernel: for run lengths sz = 1, 2, 4, ... (< N) it
// calls merge() on pairs of adjacent runs of `a`.
//   a   - array to sort, N elements
//   aux - scratch buffer handed through to merge()
//   N   - number of elements
// NOTE(review): the starting offset depends only on threadIdx.x (blockIdx is
// not used), each thread then advances by one run pair (sz + sz), and there is
// no barrier between passes - confirm that concurrent threads do not work on
// the same range.
__global__ void mergeBU(size_t *a, size_t *aux, int N)
{

	// sz is the length of the runs being merged; it doubles every pass.
	for (int sz = 1; sz < N; sz = sz + sz)
	{
		// lo is the first index of the left run; the condition lo < N - sz
		// skips positions that have no right-hand run to merge with.
		for (int lo = 2 * sz*threadIdx.x; lo < N - sz; lo += sz + sz)
		{
			// Last index of the left run.
			int mid = lo + sz - 1;
			// Last index of the right run, clamped to the end of the array.
			// The integer arguments are converted to float by fminf.
			float hi = fminf(lo + sz + sz - 1, N - 1);
			merge(a, aux, lo, mid, hi);

		}
	}
}

// Merges the adjacent ranges a[lo..mid] and a[mid+1..hi] (all bounds
// inclusive) into ascending order, using aux[lo..hi] as scratch space.
// The result is only sorted if both input ranges are already sorted.
//   a   - array holding the two ranges; receives the merged result
//   aux - scratch buffer; indices lo..hi are overwritten
//   lo  - first index of the left range
//   mid - last index of the left range
//   hi  - last index of the right range
// NOTE(review): this definition takes `int hi`, but the forward declaration
// earlier in this listing takes `float hi`, so the two do not name the same
// function.
__device__ void merge(size_t *a, size_t *aux, int lo, int mid, int hi)
{
	// i walks the left range, j walks the right range.
	int i = lo;
	int j = mid + 1;

	// Snapshot the input so that a[] can be overwritten while merging.
	for (int k = lo; k <= hi; k++)
	{
		aux[k] = a[k];
	}

	for (int k = lo; k <= hi; k++)
	{
		// Left range exhausted: take from the right.
		if (i > mid) { a[k] = aux[j]; j++; }
		// Right range exhausted: take from the left.
		else if (j > hi) { a[k] = aux[i]; i++; }
		// Right element is strictly smaller: take it.
		else if (aux[j] < aux[i]) { a[k] = aux[j]; j++; }
		// Otherwise (including ties) take the left element.
		else { a[k] = aux[i]; i++; }
	}
}


// Host driver: reads the element count N from stdin, fills an array with
// random values in 0..99, runs the mergeBU kernel on a device copy, prints
// the array copied back and the measured time, then waits for a key press.
// argc and argv are not used.
// NOTE(review): none of the return values of scanf_s, malloc or the CUDA
// runtime calls are checked here.
int main(int argc, char **argv)
{
	// Launch configuration: 8 blocks of 192 threads.
	int gridDim = 8;
	int blockDim = 192;
	int N;
	size_t *a, *aux;        // host buffers
	size_t *d_a, *d_aux;    // device buffers
	clock_t t;
	// Prompt (Indonesian): "Enter the number of data items".
	printf("Masukan jumlah data : ");
	scanf_s("%d", &N);

	a = (size_t *)malloc(N * sizeof(size_t));
	aux = (size_t *)malloc(N * sizeof(size_t));

	cudaMalloc(&d_a, N * sizeof(size_t));
	cudaMalloc(&d_aux, N * sizeof(size_t));

	// Fill the input with pseudo-random values in the range 0..99.
	for (int i = 0; i < N; i++)
	{
		a[i] = rand()%100;
		//printf("%d ", a[i]);
	}

	cudaMemcpy(d_a, a, N * sizeof(size_t), cudaMemcpyHostToDevice);
	// NOTE(review): aux has not been written since malloc, so this uploads
	// uninitialized host memory.
	cudaMemcpy(d_aux, aux, N * sizeof(size_t), cudaMemcpyHostToDevice);

	// NOTE(review): no synchronization call sits between the launch and the
	// second clock(); kernel launches are asynchronous, so this interval
	// presumably covers the launch only, not the kernel's execution - confirm.
	t = clock();
	mergeBU << <gridDim, blockDim >> > (d_a, d_aux, N);
	t = clock() - t;

	// Copy the (expected to be sorted) data back to the host.
	cudaMemcpy(a, d_a, N * sizeof(size_t), cudaMemcpyDeviceToHost);

	// NOTE(review): a[i] is a size_t but is printed with "%d".
	for (int i = 0; i < N; i++) { printf("%d ", a[i]); }
	// Convert clock ticks to seconds.
	double time_taken = ((double)t) / CLOCKS_PER_SEC;

	printf("\n");
	printf("time = %f secs\n", time_taken);
	free(a);
	free(aux);
	cudaFree(d_a);
	cudaFree(d_aux);
	// Keep the console window open until a key is pressed.
	_getch();
	return 0;
}
__device__ void merge(size_t *a, size_t *aux, int lo, int mid, int hi)

I changed `int hi` to `float hi` in this definition so that it matches the forward declaration,
and changed `int gridDim, blockDim` to `dim3 gridDim, blockDim`.
The MSB3721 error is gone.
Shame on me, actually.

But then there was another error, LNK2019. I added cudart.lib under Configuration Properties - Linker - Input, and the build succeeded. Cheers, man! Thanks.

Wait a minute!

#include <stdio.h>
#include <conio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <stdint.h>
#include <string.h>
#include <cuda.h>
#include <device_launch_parameters.h>
#include <cuda_runtime.h>
#include <cuda_device_runtime_api.h>

// Forward declaration of the device-side merge step (defined after the kernel).
__device__ void merge(size_t *a, size_t *aux, int lo, int mid, float hi);

// Bottom-up merge sort of a[0..N-1], using aux[0..N-1] as scratch space.
//
// Launch layout: any 1-D grid and block size is accepted, but only block 0
// does the work. Every pass (run length sz) has to be complete before the
// next one starts, and __syncthreads() only synchronizes the threads of a
// single block, so the sort cannot be spread over several blocks inside one
// ordinary kernel launch.
//
//   a   - device pointer to the N elements to sort (sorted in place)
//   aux - device pointer to N elements of scratch space
//   N   - number of elements; nothing is done for N <= 1
//
// Limitation: the upper bound is handed to merge() as a float, so N must not
// exceed 2^24 (16777216) for every index to be represented exactly.
__global__ void mergeBU(size_t *a, size_t *aux, int N)
{
	// Previously every block repeated the same merges on the same data,
	// racing with the other blocks. Whole blocks exit here, so the barrier
	// below is still reached by all threads of the remaining block.
	if (blockIdx.x != 0) return;

	const int workers = blockDim.x;

	for (int sz = 1; sz < N; sz = sz + sz)
	{
		// Thread t owns run pairs t, t + workers, t + 2 * workers, ... so no
		// two threads touch the same range within a pass. The offset is kept
		// in 64 bits: 2 * sz * threadIdx.x wraps in 32-bit unsigned arithmetic
		// once sz gets large.
		const long long stride = 2LL * sz * workers;
		for (long long lo = 2LL * sz * threadIdx.x; lo < N - sz; lo += stride)
		{
			int left = (int)lo;             // lo < N - sz, so it fits in an int
			int mid = left + sz - 1;        // last index of the left run
			// Last index of the right run, clamped to the end of the array.
			float hi = fminf((float)(left + sz + sz - 1), (float)(N - 1));
			merge(a, aux, left, mid, hi);
		}

		// All merges of this pass must be finished (and visible) before any
		// thread starts merging the longer runs of the next pass.
		__syncthreads();
	}
}

// Merges the adjacent ranges a[lo..mid] and a[mid+1..hi] (all bounds
// inclusive) into ascending order. Both ranges must already be sorted.
// Ties are resolved in favour of the left range.
//
//   a   - array holding the two ranges; receives the merged result
//   aux - scratch buffer; indices lo..hi are overwritten
//   lo  - first index of the left range
//   mid - last index of the left range
//   hi  - last index of the right range; passed as float to match the
//         forward declaration, expected to hold a whole number
__device__ void merge(size_t *a, size_t *aux, int lo, int mid, float hi)
{
	// Work with an integer bound. Comparing an int index against the float
	// converts the index to float on every iteration, and indices above 2^24
	// are rounded in that conversion, which could let a loop run past hi.
	const int last = (int)hi;

	// Snapshot the input so that a[] can be overwritten while merging.
	for (int k = lo; k <= last; k++)
	{
		aux[k] = a[k];
	}

	int i = lo;        // next unread element of the left range
	int j = mid + 1;   // next unread element of the right range

	for (int k = lo; k <= last; k++)
	{
		if (i > mid) { a[k] = aux[j++]; }                  // left range exhausted
		else if (j > last) { a[k] = aux[i++]; }            // right range exhausted
		else if (aux[j] < aux[i]) { a[k] = aux[j++]; }     // right is strictly smaller
		else { a[k] = aux[i++]; }                          // left is smaller or equal
	}
}

// Returns a pseudo-random integer in the inclusive range [min, max], drawn
// from rand(). If the bounds are given in the wrong order they are swapped.
unsigned int randr(unsigned int min, unsigned int max)
{
	if (max < min)
	{
		unsigned int tmp = min;
		min = max;
		max = tmp;
	}

	// Dividing by RAND_MAX + 1.0 keeps scaled in [0, 1). Dividing by RAND_MAX
	// allowed exactly 1.0, which produced max + 1.
	double scaled = (double)rand() / ((double)RAND_MAX + 1.0);

	// Computed in double so that the full unsigned range does not wrap to 0.
	double span = (double)max - (double)min + 1.0;

	return (unsigned int)(span * scaled) + min;
}

// Reports a failed CUDA runtime call on stderr.
// Returns 1 when status is cudaSuccess, 0 otherwise.
static int cudaOk(cudaError_t status, const char *what)
{
	if (status == cudaSuccess) return 1;
	fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
	return 0;
}

// Host driver: reads the element count N from stdin, fills an array with
// random values from randr(1, 10000), sorts it on the GPU with mergeBU,
// prints the sorted values and the elapsed time, then waits for a key press.
// Returns 0 on success and 1 if the input is invalid or any allocation,
// copy or kernel step fails.
int main(void)
{
	dim3 gridDim = 8;
	dim3 blockDim = 192;
	int N = 0;
	int exitCode = 1;
	size_t bytes = 0;
	size_t *a = NULL;                   // host buffer
	size_t *d_a = NULL, *d_aux = NULL;  // device buffers
	clock_t t = 0;

	// Prompt (Indonesian): "Enter the number of data items".
	printf("Masukan jumlah data : ");
	if (scanf_s("%d", &N) != 1 || N <= 0)
	{
		fprintf(stderr, "the number of data items must be a positive integer\n");
		goto cleanup;
	}
	bytes = (size_t)N * sizeof(size_t);

	a = (size_t *)malloc(bytes);
	if (a == NULL)
	{
		fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
		goto cleanup;
	}

	if (!cudaOk(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)")) goto cleanup;
	if (!cudaOk(cudaMalloc(&d_aux, bytes), "cudaMalloc(d_aux)")) goto cleanup;

	for (int i = 0; i < N; i++)
	{
		a[i] = randr(1, 10000);
	}

	// d_aux is pure scratch space: merge() fills the range it uses before
	// reading it, so nothing needs to be uploaded for it.
	if (!cudaOk(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)")) goto cleanup;

	t = clock();
	mergeBU <<<gridDim, blockDim >>> (d_a, d_aux, N);
	// A launch reports configuration errors only through cudaGetLastError().
	if (!cudaOk(cudaGetLastError(), "mergeBU launch")) goto cleanup;
	// The launch returns immediately; wait for the kernel so that the timer
	// covers its execution and any execution error is reported here.
	if (!cudaOk(cudaDeviceSynchronize(), "mergeBU execution")) goto cleanup;
	t = clock() - t;

	if (!cudaOk(cudaMemcpy(a, d_a, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)")) goto cleanup;

	// a[i] is a size_t, so it needs %zu rather than %d.
	for (int i = 0; i < N; i++) { printf("%zu ", a[i]); }

	printf("\n");
	printf("time = %f secs\n", ((double)t) / CLOCKS_PER_SEC);
	exitCode = 0;

cleanup:
	// free(NULL) and cudaFree(NULL) are no-ops, so this is safe on every path.
	free(a);
	cudaFree(d_a);
	cudaFree(d_aux);
	// Keep the console window open until a key is pressed.
	_getch();
	return exitCode;
}

This code builds successfully. Then I tried to compile it using Nsight, but the GPU status is greyed out, as if the GPU is not being used. What I'm asking is: is this code actually running on the GPU? I'm really not sure.

To simplify my question: what is the difference between clicking Local Windows Debugger and Nsight - Start CUDA Debugger?

Hello, I ran into this problem, too. How can I solve it? Thanks.

1>C:\Program Files (x86)\MSBuild\Microsoft.Cpp\v4.0\V120\BuildCustomizations\CUDA 7.0.targets(593,9): error MSB3721: 命令““C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0\bin\nvcc.exe” -gencode=arch=compute_35,code=“sm_35,compute_35” --use-local-env --cl-version 2013 -ccbin “C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin\x86_amd64” -rdc=true -IInclude\cutil\inc -I"D:\Programs\BundleFusion-master\FriedLiver" -ISource\DXUT\Optional -ISource\DXUT\Core -IInclude\Uplink -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0\include" --keep-dir x64\Debug -maxrregcount=62 --machine 64 --compile -cudart static -g -DWIN32 -D_DEBUG -DDEBUG -DPROFILE -D_CONSOLE -DD3DXFX_LARGEADDRESS_HANDLE -D_CRT_SECURE_NO_WARNINGS -DNOMINMAX -D_UNICODE -DUNICODE -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd " -o x64\Debug\CUDACache.cu.obj “D:\Programs\BundleFusion-master\FriedLiver\Source\CUDACache.cu””已退出,返回代码为 2。
1>已完成执行任务“CudaCompile”的操作 - 失败。
1>已完成在项目“FriedLiver.vcxproj”中生成目标“CudaBuild”的操作 - 失败。