Texture references problem

Hello,

I’m trying to write a program for matrix basic operation using texture, but Visual Studio 2013 give me error in the declaration of texture<float, 2> saying “texture is not a template” and it can’t find the function tex2D in function sum.

The complete code is splitted in 2 files:
gpuMatrix.h

#include <ostream>
#include <sstream>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>

#pragma once
class GPUMatrix
{
	friend std::ostream& operator<<(std::ostream &strm, GPUMatrix &a);
public:
	GPUMatrix(int rows, int cols);
	GPUMatrix(int *array, int rows, int cols);
	~GPUMatrix();
	int getNumRows();
	int getNumCols();
	void sum(GPUMatrix &b, GPUMatrix &output);
	void sub(GPUMatrix &b, GPUMatrix &output);
	void mult(GPUMatrix &b, GPUMatrix &output);
	class Row {
		friend class GPUMatrix;
	public:
		int operator[](const int col) {
			return parent.array[row * parent.cols + col];
		};
	private:
		GPUMatrix &parent;
		int row;
		Row(GPUMatrix &parent_, int row_) : parent(parent_), row(row_) {};
	};
	Row operator[](const int row) {
		return Row(*this, row);
	}
private:
	int *array;
	int rows, cols;
	cudaEvent_t start, stop;
};

std::ostream& operator<<(std::ostream &strm, GPUMatrix &a)

gpuMatrix.cu

#include "GPUMatrix.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdexcept>
#include "book.h"

#include "texture_fetch_functions.h"
#include "texture_types.h"
#include "cuda_texture_types.h"

#define imin(a,b) (a < b ? a : b)
#define DEFAULT_BLOCKS_NUM 32

void allocateCudaMemory(void **obj, size_t memory);
void copyCudaMemory(void *target, void *source, size_t count, cudaMemcpyKind kind);

const int thread_per_block = 32;

texture<int, 2> matrixInput1;
texture<int, 2> matrixInput2;

__global__ void sumMatrix(int *output, int numRows, int numCols) {
	int col = threadIdx.x + (blockIdx.x * blockDim.x);
	int row = threadIdx.y + (blockIdx.y * blockDim.y);

	if (row < numRows && col < numCols) {
		int offset = row * numCols + col, l, r;
		l = tex2D(matrixInput1, row, col);
		r = tex2D(matrixInput2, row, col);
		output[offset] = l + r;
	}
}

__global__ void subMatrix(int *a, int *b, int *output, int numRows, int numCols) {
	int col = threadIdx.x + (blockIdx.x * blockDim.x);
	int row = threadIdx.y + (blockIdx.y * blockDim.y);

	if (row < numRows && col < numCols) {
		int offset = row * numCols + col;
		output[offset] -= a[offset] - b[offset];;
	}
}
__global__ void multMatrix(int *a, int *b, int *output, int numRows, int numCols, int n) {
	int col = threadIdx.x + (blockIdx.x * blockDim.x);
	int row = threadIdx.y + (blockIdx.y * blockDim.y);

	if (row < numRows && col < numCols) {
		int offset = row * numCols + col;
		for (int index = 0; index < n; index++)
			output[offset] += a[row * n + index] * b[index * numCols + col];
	}
}

GPUMatrix::GPUMatrix(int rows_, int cols_) : rows(rows_), cols(cols_) {
	//this->array = new int[rows * cols];
	HANDLE_ERROR(cudaEventCreate(&start));
	HANDLE_ERROR(cudaEventCreate(&stop));
	this->array = (int*)malloc(rows * cols * sizeof(int));
}

GPUMatrix::GPUMatrix(int *array_, int rows_, int cols_) : rows(rows_), cols(cols_) {
	HANDLE_ERROR(cudaEventCreate(&start));
	HANDLE_ERROR(cudaEventCreate(&stop));
	this->array = (int*)malloc(rows * cols * sizeof(int));
	for (int i = 0; i < rows * cols; i++) {
		this->array[i] = array_[i];
	}
}

GPUMatrix::~GPUMatrix()
{
	HANDLE_ERROR(cudaEventDestroy(start));
	HANDLE_ERROR(cudaEventDestroy(stop));
	free(array);
}

int GPUMatrix::getNumRows() { return this->rows; }
int GPUMatrix::getNumCols() { return this->cols; }

void GPUMatrix::sum(GPUMatrix &b, GPUMatrix &output) {
	int *op1, *op2, *result;
	HANDLE_ERROR(cudaEventRecord(start, 0));

	if (this->getNumCols() != b.getNumCols() || this->getNumRows() != b.getNumRows())
		throw std::invalid_argument("I due operandi devono avere la stessa dimensione");
	if (this->getNumCols() != output.getNumCols() || this->getNumRows() != output.getNumRows())
		throw std::invalid_argument("La matrice di output deve avere la stessa dimensione degli operandi");

	size_t arraySize = this->getNumCols() * this->getNumRows() * sizeof(int);
	allocateCudaMemory((void **)&op1, arraySize);
	copyCudaMemory(op1, this->array, arraySize, cudaMemcpyHostToDevice);
	allocateCudaMemory((void **)&op2, arraySize);
	copyCudaMemory(op2, b.array, arraySize, cudaMemcpyHostToDevice);
	allocateCudaMemory((void **)&result, arraySize);

	cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
	cudaBindTexture2D(NULL, matrixInput1, op1, desc, this->getNumRows(), this->getNumCols(), sizeof(int) * this->getNumRows());
	cudaBindTexture2D(NULL, matrixInput2, op2, desc, b.getNumRows(), b.getNumCols(), sizeof(int) * b.getNumRows());

	int blockPerGridRow = imin(DEFAULT_BLOCKS_NUM, (this->getNumRows() + thread_per_block - 1) / thread_per_block);
	int blockPerGridCol = imin(DEFAULT_BLOCKS_NUM, (this->getNumCols() + thread_per_block - 1) / thread_per_block);
	dim3 blocks(blockPerGridRow, blockPerGridCol);
	dim3 threads(thread_per_block, thread_per_block);
	sumMatrix << <blocks, threads >> >(result, this->getNumRows(), this->getNumCols());

	copyCudaMemory(output.array, result, arraySize, cudaMemcpyDeviceToHost);
	HANDLE_ERROR(cudaEventRecord(stop, 0));
	HANDLE_ERROR(cudaEventSynchronize(stop));
	float elapsedTime;
	HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
	printf("La somma ha impiegato %3.1f ms\n", elapsedTime);

	cudaUnbindTexture(matrixInput1);
	cudaUnbindTexture(matrixInput2);
	cudaFree(op1);
	cudaFree(op2);
	cudaFree(result);
}

void GPUMatrix::sub(GPUMatrix &b, GPUMatrix &output) {
	int *op1, *op2, *result;
	HANDLE_ERROR(cudaEventRecord(start, 0));

	if (this->getNumCols() != b.getNumCols() || this->getNumRows() != b.getNumRows())
		throw std::invalid_argument("Le due matrici devono avere la stessa dimensione");
	if (this->getNumCols() != output.getNumCols() || this->getNumRows() != output.getNumRows())
		throw std::invalid_argument("La matrice di output deve avere la stessa dimensione degli operandi");

	size_t arraySize = this->getNumCols() * this->getNumRows() * sizeof(int);
	allocateCudaMemory((void **)&op1, arraySize);
	copyCudaMemory(op1, this->array, arraySize, cudaMemcpyHostToDevice);
	allocateCudaMemory((void **)&op2, arraySize);
	copyCudaMemory(op2, b.array, arraySize, cudaMemcpyHostToDevice);
	allocateCudaMemory((void **)&result, arraySize);

	int blockPerGridRow = imin(DEFAULT_BLOCKS_NUM, (this->getNumRows() + thread_per_block - 1) / thread_per_block);
	int blockPerGridCol = imin(DEFAULT_BLOCKS_NUM, (this->getNumCols() + thread_per_block - 1) / thread_per_block);
	dim3 blocks(blockPerGridRow, blockPerGridCol);
	dim3 threads(thread_per_block, thread_per_block);
	subMatrix << <blocks, threads >> >(op1, op2, result, this->getNumRows(), this->getNumCols());


	copyCudaMemory(output.array, result, arraySize, cudaMemcpyDeviceToHost);
	HANDLE_ERROR(cudaEventRecord(stop, 0));
	HANDLE_ERROR(cudaEventSynchronize(stop));
	float elapsedTime;
	HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
	printf("La sottrazione ha impiegato %3.1f ms\n", elapsedTime);

	cudaFree(op1);
	cudaFree(op2);
	cudaFree(result);
}

void GPUMatrix::mult(GPUMatrix &b, GPUMatrix &output) {
	int *op1, *op2, *result;
	HANDLE_ERROR(cudaEventRecord(start, 0));

	if (this->getNumCols() != b.getNumRows())
		throw std::invalid_argument("Dimensioni delle matrici incompatibili");
	if (b.getNumCols() != output.getNumCols() || this->getNumRows() != output.getNumRows())
		throw std::invalid_argument("La matrice di output ha una dimensione incompatibile con quella degli operandi");

	allocateCudaMemory((void **)&op1, this->getNumCols() * this->getNumRows() * sizeof(int));
	copyCudaMemory(op1, this->array, this->getNumCols() * this->getNumRows() * sizeof(int), cudaMemcpyHostToDevice);
	allocateCudaMemory((void **)&op2, b.getNumCols() * b.getNumRows() * sizeof(int));
	copyCudaMemory(op2, b.array, b.getNumCols() * b.getNumRows() * sizeof(int), cudaMemcpyHostToDevice);
	size_t outputSize = this->getNumCols() * b.getNumRows() * sizeof(int);
	allocateCudaMemory((void **)&result, outputSize);

	int blockPerGridRow = imin(DEFAULT_BLOCKS_NUM, (this->getNumRows() + thread_per_block - 1) / thread_per_block);
	int blockPerGridCol = imin(DEFAULT_BLOCKS_NUM, (b.getNumCols() + thread_per_block - 1) / thread_per_block);
	dim3 blocks(blockPerGridRow, blockPerGridCol);
	dim3 threads(thread_per_block, thread_per_block);
	multMatrix << <blocks, threads >> >(op1, op2, result, this->getNumRows(), b.getNumCols(), this->getNumCols());

	copyCudaMemory(output.array, result, this->getNumRows() * b.getNumCols() * sizeof(int), cudaMemcpyDeviceToHost);
	HANDLE_ERROR(cudaEventRecord(stop, 0));
	HANDLE_ERROR(cudaEventSynchronize(stop));
	float elapsedTime;
	HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
	printf("Il prodotto ha impiegato %3.1f ms\n", elapsedTime);

	cudaFree(op1);
	cudaFree(op2);
	cudaFree(result);
}

void allocateCudaMemory(void **obj, size_t memory) {
	cudaError_t cudaStatus;

	cudaStatus = cudaMalloc(obj, memory);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMalloc error: %s!\nIn file %s, line %s", cudaGetErrorString(cudaStatus), __FILE__, __LINE__);
		exit(EXIT_FAILURE);
	}
}

void copyCudaMemory(void *target, void *source, size_t count, cudaMemcpyKind kind) {
	cudaError_t cudaStatus;

	cudaStatus = cudaMemcpy(target, source, count, kind);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMemcpy error: %s!\nIn file %s, line %s", cudaGetErrorString(cudaStatus), __FILE__, __LINE__);
		exit(EXIT_FAILURE);
	}
}

std::ostream& operator<<(std::ostream &strm, GPUMatrix &a) {
	std::ostringstream stringStream;

	for (int row = 0; row < a.rows; row++) {
		for (int col = 0; col < a.cols; col++)
			stringStream << a[row][col] << " ";
		stringStream << std::endl;
	}
	return strm << stringStream.str() << std::endl;
}

I’m using cuda 8 with Visual Studio 2013. I tried to import the sample simpleTexture, but I get the same error.

Thanks in advance

What do you mean by this:

Do you mean that when you tried to compile the code, there was a compile error? If so, paste in the exact compiler output.

However if what you mean is that there is a red squiggly underline in the visual studio GUI, and when you hover your mouse over it you get those messages, that is an incompatibility with intellisense, not an actual CUDA coding problem.

In that case, just compile your codes. The sample code should compile correctly for you. Ignore the intellisense warnings around CUDA constructs if your code compiles correctly.

I think the error messages mentioned could also indicate that CUDA code is being sent to MSVC instead of NVCC, so check your build settings.

This is an Intellisense error: 2 IntelliSense: texture is not a template c:\Users\mattia\Documents\Visual Studio 2013\Projects\gpuMatrixTexture\gpuMatrixTexture\gpuMatrix.cu 20 1 gpuMatrixTexture

I can compile the code without problem. But when I try to run the code, Visual Studio tell me that there are Intellisense errors, so I can’t build and execute the code but it run the last successful build.

You’ll need to work around that somehow. An ordinary load of VS2013 does not prevent me from building and/or running the app even when there are Intellisense errors. You may need to adjust one of your VS settings.

Intellisense by itself is not compatible with CUDA constructs, so Intellisense may report errors even when there are none according to proper CUDA language usage.

I tried to disable Intellisence, I can build the code but I get an invalid argument on:

cudaBindTexture2D(NULL, matrixInput1, op1, desc, this->getNumCols(), this->getNumRows(), this->getNumCols() * sizeof(int))

If I had Intellisence abilitated I get an Intellisense error saying: IntelliSense: no suitable conversion function from “cudaChannelFormatDesc” to “const cudaChannelFormatDesc *” exists c:\Users\mattia\Documents\Visual Studio 2013\Projects\gpuMatrixTexture\gpuMatrixTexture\gpuMatrix.cu 97 2 gpuMatrixTexture

I tried also to write &desc in the cudaBindTexture2D but obviously I get a compile error saying: error : no instance of overloaded function “cudaBindTexture2D” matches the argument list C:\Users\mattia\documents\visual studio 2013\Projects\gpuMatrixTexture\gpuMatrixTexture\gpuMatrix.cu 97 1 gpuMatrixTexture.

Now I can build the program regardless of the fact that I have intellisense error only with disable/active intellisense. This is really insane,

I resolved using cudaBindTextureToArray instead of cudaBindTexture2D:

const cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
	cudaArray* cuArray1;
	cudaMallocArray(&cuArray1, &desc, this->getNumCols(), this->getNumRows());
	cudaMemcpyToArray(cuArray1, 0, 0, this->array, arraySize,
		cudaMemcpyHostToDevice);
	cudaBindTextureToArray(matrixInput1, cuArray1, desc);