Thanks for your reply. I’ll try to describe it more clearly.

First of all, the code.

```
template <typename T>
class Tensor {
public:
std::vector<int> dims_;
size_t size_;
std::shared_ptr<T> ptr_;
struct deleteCudaPtr {
void operator()(T *p) const {
cudaFree(p);
}
};
Tensor() {}
Tensor(std::vector<int> dims) : dims_(dims) {
T* tmp_ptr;
size_ = std::accumulate(dims_.begin(), dims_.end(), 1, std::multiplies<int>());
gpuErrchk( cudaMalloc(&tmp_ptr, sizeof(T) * size_) );
ptr_.reset(tmp_ptr, deleteCudaPtr());
}
T* begin() const { return ptr_.get(); }
T* end() const { return ptr_.get() + size_; }
size_t size() const { return size_; }
int first_dim() const { return dims_[0]; }
int second_dim() const { return dims_[1]; }
size_t byte_size() const { return size_ * sizeof(T); }
std::vector<int> dims() const { return dims_; }
void fill_float(float num) {
thrust::fill(thrust::device_ptr<T>(this->begin()),
thrust::device_ptr<T>(this->end()), num);
}
void copy_from_host(T *h_tensor) {
gpuErrchk(
cudaMemcpy((void *)ptr_.get(), (const void *) h_tensor,
byte_size(), cudaMemcpyHostToDevice)
);
}
};
typedef Tensor<float> float_tensor;
template<bool AT, bool BT>
__inline__ void gemm_rm(cublasHandle_t handle, float_tensor &C, float_tensor &A, float_tensor &B, float alpha=1.0f, float beta=0.0f) {
float gemm_alpha = alpha;
float gemm_beta = beta;
if (!AT && !BT) {
// no transpose
assert(A.second_dim() == B.first_dim() && "Inner dimension doesn't match");
}
else if (AT && !BT) {
// transpose A only
assert(A.first_dim() == B.first_dim() && "Inner dimension doesn't match");
}
else if (!AT && BT) {
// transpose B only
assert(A.second_dim() == B.second_dim() && "Inner dimension doesn't match");
}
else {
assert("Why not do ba instead of aTbT?" && false);
}
blasErrchk( cublasSgemm(handle,
(BT) ? CUBLAS_OP_T : CUBLAS_OP_N,
(AT) ? CUBLAS_OP_T : CUBLAS_OP_N,
C.second_dim(), // m -> c.second = k
C.first_dim(), // n -> c.first = m
A.second_dim(), // k -> a.second = n
&gemm_alpha,
B.begin(), B.second_dim(),
A.begin(), A.second_dim(),
&gemm_beta,
C.begin(), C.second_dim()));
}
```

Basically the float_tensor (typedef Tensor) is a wrapper around cuda device memory. The blas call is a wrapper around cublas.

The problem is that, I have two sets of matrices, they are

- A: (32768, 17408), B: (32768, 128)
- A: (16384, 17408), B: (16384, 128)

With everything the same, cublas won’t work on the second case. The way I call is:

```
blasErrchk(gemm_rm<true, false>(A, B)); // true to transpose A
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk(cudaDeviceSynchronize()); // <---- error here, unspecified launch error
```

And I get an unspecified launch error at the cudaDeviceSynchronize() call. I’m positive that this blas call is the cause of the problem: if I remove this line, everything’s fine.

Another thing is, if I run the full code (posted below) in the cuda debugger, with or withoug memchecker, it runs fine.

Here’s the more complete code:

```
iter = 0
while (iter < 2) {
X.copy_from_host(h_data);
y.copy_from_host(h_label);
float gradient_multiplier = lr / b;
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk(cudaDeviceSynchronize());
gemm_rm<false, false>(handle, y, X, weight, -1.0f, 1.0f);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk(cudaDeviceSynchronize());
gemm_rm<true, false>(handle, weight, X, y, -gradient_multiplier, 1.0f);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize()); // <-------- this is line line that errors out
iter++;
}
```