Hi all.
I finished part of my thesis with CUDA, but I want to optimize my code and given that I have “zero” CPU code, I can only work around the CPU-GPU transfers(and vice versa) and rework non thrust functions.
One of the things I thought for optimizing the code was to use device pointers instead of host pointers in thrust inputs, but I tried with a simple program, and I keep getting segmentation fault. I was wondering if anyone could point me in the right direction. Here is the code:
#include <stdlib.h>
#include <stdio.h>
#include <float.h>
#include <math.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
template <typename T>
struct deletekey
{
__host__ __device__ bool operator()(const thrust::tuple<T,T,T>& t)
{
return
(thrust::get<0>(t) < 5 && thrust::get<1>(t) <5 && thrust::get<2>(t) < 5);
}
};
int compact(float*intxc,float*intyc,float*intzc,int dim){
thrust::device_ptr<float> dev_ptrx(intxc);
thrust::device_ptr<float> dev_ptry(intyc);
thrust::device_ptr<float> dev_ptrz(intzc);
/* Tried this way too, Segmentation fault nevertheless
thrust::device_ptr<float> dev_ptrx = thrust::device_pointer_cast(intxc);
thrust::device_ptr<float> dev_ptry = thrust::device_pointer_cast(intyc);
thrust::device_ptr<float> dev_ptrz = thrust::device_pointer_cast(intzc);*/
thrust::device_vector<float> x(dim);
thrust::device_vector<float> y(dim);
thrust::device_vector<float> z(dim);
thrust::copy(dev_ptrx,dev_ptrx+dim,x.begin());
thrust::copy(dev_ptry,dev_ptry+dim,y.begin());
thrust::copy(dev_ptrz,dev_ptrz+dim,z.begin());
thrust::device_vector<float> x_out(dim);
thrust::device_vector<float> y_out(dim);
thrust::device_vector<float> z_out(dim);
typedef thrust::device_vector<float>::iterator Iterator;
typedef thrust::tuple<Iterator, Iterator, Iterator> IteratorTuple;
typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
ZipIterator begin(thrust::make_tuple(x.begin(), y.begin(), z.begin()));
ZipIterator end (thrust::make_tuple(x.end(), y.end(), z.end()));
ZipIterator output_begin(thrust::make_tuple(x_out.begin(), y_out.begin(), z_out.begin()));
ZipIterator output_end = thrust::remove_copy_if(begin, end, output_begin, deletekey<float>());
// compute size of output
size_t ArraySizet = output_end - output_begin;
thrust::copy(x_out.begin(),x_out.begin()+ArraySizet,intxc);
thrust::copy(y_out.begin(),y_out.begin()+ArraySizet,intyc);
thrust::copy(z_out.begin(),z_out.begin()+ArraySizet,intzc);
return (int) ArraySizet;
}
int main(){
cudaError_t status;
//CPU ALLOCATION AND DEBUG
float *intersectionsx_h=(float*)malloc(10*sizeof(float));
float *intersectionsy_h=(float*)malloc(10*sizeof(float));
float *intersectionsz_h=(float*)malloc(10*sizeof(float));
if (intersectionsx_h == NULL)
printf ( "!!!! host memory allocation error (interx)\n");
if (intersectionsy_h == NULL)
printf ( "!!!! host memory allocation error (intery)\n");
if (intersectionsz_h == NULL)
printf ( "!!!! host memory allocation error (interz)\n");
for(int i=0;i<10;i++){
intersectionsx_h[i]=(float)i;
intersectionsy_h[i]=(float)i;
intersectionsz_h[i]=(float)i;
}
//GPU ALLOCATION AND DEBUG
float *intersectionsx_d,*intersectionsy_d,*intersectionsz_d;
status=cudaMalloc((void**)&intersectionsx_d,10*sizeof(float));
if (status != cudaSuccess)
printf ("!!!! device memory allocation error (interx)\n");
status=cudaMalloc((void**)&intersectionsy_d,10*sizeof(float));
if (status != cudaSuccess)
printf ("!!!! device memory allocation error (intery)\n");
status=cudaMalloc((void**)&intersectionsz_d,10*sizeof(float));
if (status != cudaSuccess)
printf ("!!!! device memory allocation error (interz)\n");
//COPY INTERSECTION MATRIX FROM CPU TO GPU
status=cudaMemcpy(intersectionsx_d,intersectionsx_h,10*sizeof(float),cudaMemcpyHostToDevice);
if (status != cudaSuccess)
printf ("!!!! could not copy intersection array to GPU (interx)\n");
status=cudaMemcpy(intersectionsy_d,intersectionsy_h,10*sizeof(float),cudaMemcpyHostToDevice);
if (status != cudaSuccess)
printf ("!!!! could not copy intersection array to GPU (intery)\n");
status=cudaMemcpy(intersectionsz_d,intersectionsz_h,10*sizeof(float),cudaMemcpyHostToDevice);
if (status != cudaSuccess)
printf ("!!!! could not copy intersection array to GPU (interz)\n");
int Arraysize=compact(intersectionsx_d,intersectionsy_d,intersectionsz_d,10);
status=cudaFree(intersectionsx_d);
if (status != cudaSuccess)
printf ("!!!! device memory free error (interx)\n");
status=cudaFree(intersectionsy_d);
if (status != cudaSuccess)
printf ("!!!! device memory free error (intery)\n");
status=cudaFree(intersectionsz_d);
if (status != cudaSuccess)
printf ("!!!! device memory free error (interz)\n");
free(intersectionsx_h);
free(intersectionsy_h);
free(intersectionsz_h);
return 0;
}
Thanks in advance