Using device pointer in thrust algorithm

Hi all.

I finished part of my thesis with CUDA, but I want to optimize my code and given that I have “zero” CPU code, I can only work around the CPU-GPU transfers(and vice versa) and rework non thrust functions.

One of the things I thought for optimizing the code was to use device pointers instead of host pointers in thrust inputs, but I tried with a simple program, and I keep getting segmentation fault. I was wondering if anyone could point me in the right direction. Here is the code:

#include <stdlib.h>

#include <stdio.h>

#include <float.h>

#include <math.h>

#include <thrust/remove.h>

#include <thrust/sort.h>

#include <thrust/device_vector.h>

#include <iostream>

#include <cuda.h>

#include <cuda_runtime.h>

template <typename T>

struct deletekey

{

  __host__ __device__ bool operator()(const thrust::tuple<T,T,T>& t)

  {

    return 

	   (thrust::get<0>(t) < 5 && thrust::get<1>(t) <5  && thrust::get<2>(t) < 5);

  }

};

int compact(float*intxc,float*intyc,float*intzc,int dim){

	thrust::device_ptr<float> dev_ptrx(intxc);

	thrust::device_ptr<float> dev_ptry(intyc);

	thrust::device_ptr<float> dev_ptrz(intzc);

/* Tried this way too, Segmentation fault nevertheless

	thrust::device_ptr<float>  dev_ptrx = thrust::device_pointer_cast(intxc);

	thrust::device_ptr<float>  dev_ptry = thrust::device_pointer_cast(intyc);

	thrust::device_ptr<float>  dev_ptrz = thrust::device_pointer_cast(intzc);*/

	thrust::device_vector<float> x(dim);

	thrust::device_vector<float> y(dim);

	thrust::device_vector<float> z(dim);

	thrust::copy(dev_ptrx,dev_ptrx+dim,x.begin());

	thrust::copy(dev_ptry,dev_ptry+dim,y.begin());

	thrust::copy(dev_ptrz,dev_ptrz+dim,z.begin());

	thrust::device_vector<float> x_out(dim);

	thrust::device_vector<float> y_out(dim);

	thrust::device_vector<float> z_out(dim);

	typedef thrust::device_vector<float>::iterator      Iterator;

	typedef thrust::tuple<Iterator, Iterator, Iterator> IteratorTuple;

	typedef thrust::zip_iterator<IteratorTuple>         ZipIterator;

	ZipIterator begin(thrust::make_tuple(x.begin(), y.begin(), z.begin()));

	ZipIterator end  (thrust::make_tuple(x.end(),   y.end(),   z.end()));

	ZipIterator output_begin(thrust::make_tuple(x_out.begin(), y_out.begin(), z_out.begin()));

	ZipIterator output_end = thrust::remove_copy_if(begin, end, output_begin, deletekey<float>());

	// compute size of output

	size_t ArraySizet = output_end - output_begin;

	thrust::copy(x_out.begin(),x_out.begin()+ArraySizet,intxc);

	thrust::copy(y_out.begin(),y_out.begin()+ArraySizet,intyc);

	thrust::copy(z_out.begin(),z_out.begin()+ArraySizet,intzc);

return (int) ArraySizet;

}

int main(){

	cudaError_t status;

	//CPU ALLOCATION AND DEBUG

	float *intersectionsx_h=(float*)malloc(10*sizeof(float));

	float *intersectionsy_h=(float*)malloc(10*sizeof(float));

	float *intersectionsz_h=(float*)malloc(10*sizeof(float));

	if (intersectionsx_h == NULL) 

		printf ( "!!!! host memory allocation error (interx)\n");

	if (intersectionsy_h == NULL) 

		printf ( "!!!! host memory allocation error (intery)\n");

	if (intersectionsz_h == NULL) 

		printf ( "!!!! host memory allocation error (interz)\n");

	for(int i=0;i<10;i++){

		intersectionsx_h[i]=(float)i;

		intersectionsy_h[i]=(float)i;

		intersectionsz_h[i]=(float)i;

	}

	//GPU ALLOCATION AND DEBUG

	float *intersectionsx_d,*intersectionsy_d,*intersectionsz_d;

	

	status=cudaMalloc((void**)&intersectionsx_d,10*sizeof(float));

	if (status != cudaSuccess) 

            printf ("!!!! device memory allocation error (interx)\n");

	status=cudaMalloc((void**)&intersectionsy_d,10*sizeof(float));

	if (status != cudaSuccess) 

            printf ("!!!! device memory allocation error (intery)\n");

	status=cudaMalloc((void**)&intersectionsz_d,10*sizeof(float));

	if (status != cudaSuccess) 

            printf ("!!!! device memory allocation error (interz)\n");

	//COPY INTERSECTION MATRIX FROM CPU TO GPU

	status=cudaMemcpy(intersectionsx_d,intersectionsx_h,10*sizeof(float),cudaMemcpyHostToDevice);

	if (status != cudaSuccess) 

            printf ("!!!! could not copy intersection array to GPU (interx)\n");	

	status=cudaMemcpy(intersectionsy_d,intersectionsy_h,10*sizeof(float),cudaMemcpyHostToDevice);

	if (status != cudaSuccess) 

            printf ("!!!! could not copy intersection array to GPU (intery)\n");	

	status=cudaMemcpy(intersectionsz_d,intersectionsz_h,10*sizeof(float),cudaMemcpyHostToDevice);

	if (status != cudaSuccess) 

            printf ("!!!! could not copy intersection array to GPU (interz)\n");

	int Arraysize=compact(intersectionsx_d,intersectionsy_d,intersectionsz_d,10);

	status=cudaFree(intersectionsx_d);

	if (status != cudaSuccess) 

		printf ("!!!! device memory free error (interx)\n");

	status=cudaFree(intersectionsy_d);

	if (status != cudaSuccess) 

		printf ("!!!! device memory free error (intery)\n");

	status=cudaFree(intersectionsz_d);

	if (status != cudaSuccess) 

		printf ("!!!! device memory free error (interz)\n");

	free(intersectionsx_h);

	free(intersectionsy_h);

	free(intersectionsz_h);

	

	return 0;

}

Thanks in advance

I got it I had to this this at the end of the thrust function

thrust::copy(x_out.begin(),x_out.begin()+ArraySizet,dev_ptrx);

	thrust::copy(y_out.begin(),y_out.begin()+ArraySizet,dev_ptry);

	thrust::copy(z_out.begin(),z_out.begin()+ArraySizet,dev_ptrz);

	intxc = thrust::raw_pointer_cast(dev_ptrx);

	intyc = thrust::raw_pointer_cast(dev_ptry);

	intzc = thrust::raw_pointer_cast(dev_ptrz);

Could anyone just clarify me how much memory allocation will I have while running this program:

(3104)2 or (310*4)*3 bytes ?

You could have copied more efficiently (directly) from the device pointer to thrust device vector as follows:
thrust::device_vector x(intxc, intxc + dim);
thrust::device_vector y(intyc, intyc + dim);
thrust::device_vector z(intzc, intzc + dim);