Hi all,
Trying to migrate to cuda7.5CR, I have a problem with thrust::reduce_by_key. It crashes when input length is greater than 20000.
Attached is a source code of a small program which demonstrates the issue. It is followed by the printed error message.
Same program has no problems with previous cuda versions, including cuda7.0.
GPU used: GTX980
OS: Linux CentOs 6.5
The code consists of 2 files:
main.cpp - arranges memory and calls reduceByKeyWrapper.
reduceByKeyWrapper.cu - wraps thrust::reduce_by_key.
main.cpp:
Description:
-Receives 2 arguments: length of input array, deviceId. (lines 9-18)
-Allocates the input and output arrays. Fill input with 0s (lines 27-48)
-Invokes the reduce_by_key wrapper (line 49)
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
void reduceByKeyWrapper(unsigned int *d_inKey, unsigned int *d_outKey, unsigned int *d_outVal, unsigned int length);
int main(int argc, char **argv)
{
if (argc <3 ){
std::cout << "Format: " << argv[0] << " <length> <devid>\n";
exit(0);
}
unsigned int length = atoi(argv[1]);
std::cout << "length: " << length<< "\n";
unsigned int devId = atoi(argv[2]);
std::cout << "devId: " << devId<< "\n";
if(cudaSuccess != cudaSetDevice((int)devId)){
std::cout << "cudaSetDevice err\n";
}
if(cudaSuccess !=cudaDeviceReset()){
std::cout << "cudaDeviceReset err\n";
}
unsigned int* d_inKey;
unsigned int* d_outVal;
unsigned int* d_outKey;
if(cudaSuccess != cudaMalloc((void **) &d_inKey, length * sizeof(unsigned int))){
std::cout << "cudaMalloc err\n";
}
if(cudaSuccess != cudaMalloc((void **) &d_outKey, length * sizeof(unsigned int))){
std::cout << "cudaMalloc err\n";
}
if(cudaSuccess != cudaMalloc((void **) &d_outVal, length * sizeof(unsigned int))){
std::cout << "cudaMalloc err\n";
}
// prepare values for copy - program hangs regardless the value here
unsigned int* hostBuf = new unsigned int[length];
for(unsigned int idx =0; idx < length;idx++){
hostBuf[idx] = 0;// program hangs regardless the value here
}
cudaMemcpy(d_inKey, hostBuf, length * sizeof(unsigned int), cudaMemcpyHostToDevice);
reduceByKeyWrapper( d_inKey, d_outKey, d_outVal, length);
return 0;
}
reduceByKeyWrapper.cu
wraps thrust::reduce_by_key
#include <cuda.h>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
#include <thrust/fill.h>
#include <thrust/execution_policy.h>
#include <timer.h>
void reduceByKeyWrapper(unsigned int *d_inKey, unsigned int *d_outKey, unsigned int *d_outVal, unsigned int length) {
thrust::device_ptr<unsigned int> d_inKeyPtr(d_inKey);
thrust::device_ptr<unsigned int> d_outKeyPtr(d_outKey);
thrust::device_ptr<unsigned int> d_outValPtr(d_outVal);
thrust::pair<thrust::device_ptr<unsigned int>, thrust::device_ptr<unsigned int> > resultIterPair;
thrust::constant_iterator<unsigned int> const_iter(1);
resultIterPair = thrust::reduce_by_key(d_inKeyPtr, d_inKeyPtr + length, const_iter, d_outKeyPtr, d_outValPtr);
}
Next is the execution command, where length is set to 20001 and devid to 0:
./run 20001 0
result:
terminate called after throwing an instance of 'thrust::system::system_error'
what(): function_attributes(): after cudaFuncGetAttributes: an illegal memory access was encountered
Aborted (core dumped)
Note: any length in the range [0,20000] runs with no crash.
p.s.
Reason for moving to cuda7.5 CR were problems I had with thrust::copy_if on cuda6.5. Here’s a link to that issue:
https://devtalk.nvidia.com/default/topic/860775/cpu-hangs-when-calling-thrust-copy_if/
Does anyone has an idea on how can this issue be solved?
Thanks,
Ronen Halevy.