Cub Library

Without seeing any code or a description of your setup, it’s difficult to make specific suggestions.

The following code runs correctly for me on CUDA 7, Fedora 20, a Quadro 5000 GPU, and the latest CUB master (1.4.1):

http://nvlabs.github.io/cub/

downloaded and properly installed in /usr/local/cuda/include/cub:

$ cat t736.cu
#include <cub/cub.cuh>
#include <stdio.h>

// Reports a failed CUDA runtime / CUB call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what){
  if (status == cudaSuccess) return true;
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Computes an inclusive prefix sum of a small host array on the GPU with
// cub::DeviceScan::InclusiveSum and prints the input and the result.
// Returns 0 on success, 1 if any CUDA runtime or CUB call reported an error.
int main(){

  // Host input; the item count is derived from the array so the two cannot drift apart
  int h_in[]  = {8, 6, 7, 5, 3, 0, 9};
  const int num_items = sizeof(h_in)/sizeof(h_in[0]);
  const size_t bytes = num_items*sizeof(h_in[0]);
  int h_out[num_items];

  // Device pointers start out NULL so the cleanup below is safe on every path
  int *d_in = NULL;
  int *d_out = NULL; // e.g., [ , , , , , , ]
  void *d_temp_storage = NULL;
  size_t temp_storage_bytes = 0;

  // Allocate and initialize device buffers for input and output
  bool ok = cudaOk(cudaMalloc(&d_in,  bytes), "cudaMalloc(d_in)")
         && cudaOk(cudaMalloc(&d_out, bytes), "cudaMalloc(d_out)")
         && cudaOk(cudaMemcpy(d_in, h_in, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");

  printf("\nInput:\n");
  for (int i = 0; i < num_items; i++) printf("%d ", h_in[i]);

  // Determine temporary device storage requirements: with a NULL d_temp_storage
  // the call only writes the required size into temp_storage_bytes
  ok = ok && cudaOk(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items),
                    "cub::DeviceScan::InclusiveSum (size query)");
  // Allocate temporary storage
  ok = ok && cudaOk(cudaMalloc(&d_temp_storage, temp_storage_bytes), "cudaMalloc(d_temp_storage)");
  // Run inclusive prefix sum
  ok = ok && cudaOk(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items),
                    "cub::DeviceScan::InclusiveSum");
  // d_out <-- [8, 14, 21, 26, 29, 29, 38]
  // Copy the result into a separate host buffer so the input is left intact
  ok = ok && cudaOk(cudaMemcpy(h_out, d_out, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");

  if (ok) {
    printf("\nOutput:\n");
    for (int i = 0; i < num_items; i++) printf("%d ", h_out[i]);
  }
  printf("\n");

  // Release device memory on both the success and the failure path
  cudaFree(d_temp_storage);
  cudaFree(d_out);
  cudaFree(d_in);
  return ok ? 0 : 1;
}
$ nvcc -o t736 t736.cu
$ cuda-memcheck ./t736
========= CUDA-MEMCHECK

Input:
8 6 7 5 3 0 9
Output:
8 14 21 26 29 29 38
========= ERROR SUMMARY: 0 errors
$

The above code is basically a straightforward completion of the code shown here:

http://nvlabs.github.io/cub/structcub_1_1_device_scan.html#a9416ac1ea26f9fde669d83ddc883795a