Segmentation fault for printing array from float pointer in plugin

I have cuda code for plugin implementation.
Cuda code is called from

enque API from plugin
int ResizeAreaPlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void*, cudaStream_t stream)
{
    int status = -1;

    // Our plugin outputs only one tensor
    void* output = outputs[0];
    std::cout << "mupscale " << mupscale << " min_width " << min_width << " min_height " << min_height <<" min_channel " << min_channel  << std::endl;
    // Launch CUDA kernel wrapper and save its return value
    status = ResizeAreaInference(stream, inputs[0], output, mupscale, min_width, min_height, min_channel);

    return status;
}

I like to see input data and output data processed in cuda code.

void printdatain(const float *ptr, int size, const char* name, int stride)
{
   ofstream myfile;
   myfile.open (name);
   cout << "size " << size << endl;
   for(int i=0; i < size; i++){
      if(i % stride == 0 && i!=0)
         myfile << "\n";      
      myfile << *(ptr+i) << ",";
   }
   myfile.close();
   return;
}

void printdataout(float *ptr, int size, const char* name, int stride)
{
   ofstream myfile;
   myfile.open (name);
   for(int i=0; i < size; i++){
      if(i % stride == 0 && i!=0)
         myfile << "\n";
      myfile << *(ptr+i) << ",";
      
  
   }
   myfile.close();
   return;
}
__global__ void ResizeAreaKernel(const float *input, float *output, int upscale, int w, int h, int c, int total)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;//there are w*h number of threads
    int stride = blockDim.x * gridDim.x;
    for (int i = tid; i < total; i += stride){
       int rowadd= ((int)(i/w)*c*w*upscale*upscale)-((int)(i/w)*w*c*upscale);//(j*3*5*4*4) - (j*5*3*4)
       for(int y = 0; y < upscale; y++){
          int s=i*c*upscale+rowadd;
          int e=s+upscale*c;
          for(int x = s; x < e; x=x+c){
             for(int c_ = 0; c_ < c; c_++){
                output[x+c_+y*c*w*upscale] = input[i*c+c_];
             } 
          }       
       }
     }
    
    return;
}


int ResizeAreaInference(cudaStream_t stream,  const void* inputs, void* outputs, int upscale, int w, int h, int c)
{    
    //static int cnt=0;
    int n = (int)(w*h);
    const int THREADS_PER_BLOCK = 128;
    const int NUMBLOCKS = (int)((float)(n+THREADS_PER_BLOCK-1)/THREADS_PER_BLOCK); 
    ResizeAreaKernel<<<NUMBLOCKS, THREADS_PER_BLOCK, 0, stream>>>(static_cast<const float*>(inputs), static_cast<float*>(outputs), upscale, w, h, c, n);
    cudaDeviceSynchronize();
    if(cnt==0){
      printdatain(static_cast<const float*>(inputs), w*h*c, "inputs.txt", w*c);
      printdataout(static_cast<float*>(outputs), w*upscale*h*upscale*c, "outputs.txt", w*upscale*c);
    }
    //cnt++;
    //const float *input = static_cast<const float*>(inputs);
    //float *output = static_cast<float*>(outputs);
    //std::cout << "size of input " << sizeof(*input) << " size of output " << sizeof(*output) << std::endl;
    return 0;
}

I have Segmentation fault for printing in

printdatain

and

printdataout

What could be wrong?