It is not obvious how to use std::vector in CUDA, so I have designed my own Vector class:

```
#ifndef VECTORHEADERDEF
#define VECTORHEADERDEF
#include <cmath>
#include <iostream>
#include <cassert>
template <typename T>
class Vector
{
private:
T* mData; // data stored in vector
int mSize; // size of vector
public:
Vector(const Vector& otherVector); // Constructor
Vector(int size); // Constructor
~Vector(); // Desructor
__host__ __device__ int GetSize() const; // get size of the vector
T& operator[](int i); // see element
// change element i
__host__ __device__ void set(size_t i, T value) {
mData[i] = value;
}
template <class S> // output vector
friend std::ostream& operator<<(std::ostream& output, Vector<S>& v);
};
// Overridden copy constructor
// Allocates memory for new vector, and copies entries of other vector into it
template <typename T>
Vector<T>::Vector(const Vector& otherVector)
{
mSize = otherVector.GetSize();
mData = new T [mSize];
for (int i=0; i<mSize; i++)
{
mData[i] = otherVector.mData[i];
}
}
// Constructor for vector of a given size
// Allocates memory, and initialises entries to zero
template <typename T>
Vector<T>::Vector(int size)
{
assert(size > 0);
mSize = size;
mData = new T [mSize];
for (int i=0; i<mSize; i++)
{
mData[i] = 0.0;
}
}
// Overridden destructor to correctly free memory
template <typename T>
Vector<T>::~Vector()
{
delete[] mData;
}
// Method to get the size of a vector
template <typename T>
__host__ __device__ int Vector<T>::GetSize() const
{
return mSize;
}
// Overloading square brackets
// Note that this uses `zero-based' indexing, and a check on the validity of the index
template <typename T>
T& Vector<T>::operator[](int i)
{
assert(i > -1);
assert(i < mSize);
return mData[i];
}
// Overloading the assignment operator
template <typename T>
Vector<T>& Vector<T>::operator=(const Vector& otherVector)
{
assert(mSize == otherVector.mSize);
for (int i=0; i<mSize; i++)
{
mData[i] = otherVector.mData[i];
}
return *this;
}
// Overloading the insertion << operator
template <typename T>
std::ostream& operator<<(std::ostream& output, Vector<T>& v) {
for (int i=0; i<v.mSize; i++) {
output << v[i] << " ";
}
return output;
}
```

My main function - where I just pass a vector to the device, modify it and pass it back - is as follows (with the kernel designed just for testing purposes):

```
#include <iostream>
#include "Vector.hpp"
__global__ void alpha(Vector<int>* d_num)
{
int myId = threadIdx.x + blockDim.x * blockIdx.x;
d_num->set(0,100);
d_num->set(2,11);
}
int main()
{
Vector<int> num(10);
for (int i=0; i < num.GetSize(); ++i) num.set(i,i); // initialize elements to 0:9
std::cout << "Size of vector: " << num.GetSize() << "\n";
std::cout << num << "\n"; // print vector
Vector<int>* d_num;
// allocate global memory on the device
cudaMalloc((void **) &d_num, num.GetSize()*sizeof(Vector<int>));
// copy data from host memory to the device memory
cudaMemcpy(d_num, &num[0], num.GetSize()*sizeof(Vector<int>), cudaMemcpyHostToDevice);
// launch the kernel
alpha<<<1,100>>>(d_num);
// copy the modified array back to the host, overwriting the contents of h_arr
cudaMemcpy(num, &d_num[0], num.GetSize()*sizeof(int), cudaMemcpyDeviceToHost);
std::cout << num << "\n";
// free GPU memory allocation and exit
cudaFree(d_num);
return 0;
}
```

The problem I encounter is with cudaMemcpyDeviceToHost. It does not really copy the device vector to the num vector as can be seen from the output.

How should I deal with that?