Hello
I’m doing comparisons between two very long vectors (several million points). The question is to know if these two vectors have points in common and where are these points in each vector.
The vectors are a set of geographic coordinates (longitude, latitude).
Before going to the real vector I made a small test on smaller vectors but it does not work because only the first element are tested and I could not well know why.
Here is my code
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
cudaError_t compVect(unsigned char *c, const int *lona, const int *lata, const int *lonb, const int *latb, unsigned int sizeA, unsigned int sizeB);
__global__ void compKernel(unsigned char *c, const int *lona, const int *lata, const int *lonb, const int *latb, int nbA, int nbB)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
int j = threadIdx.y + blockIdx.y*blockDim.y;
int k = i + j * nbA;
while (k < nbA*nbB)
{
if (lona[i] == lonb[j] && lata[i] == latb[j])
c[k] = 1;
else
c[k] = 0;
i += gridDim.x*blockDim.x;
j += gridDim.y*blockDim.y;
k = i + j * nbA;
}
}
int main()
{
int *lon[2];
int *lat[2];
int nb[2];
#ifdef TEST
lon[0] = new int[15];
lat[0] = new int[15];
lon[1] = new int[7];
lat[1] = new int[7];
nb[0] = 15;
nb[1] = 7;
for (int i = 0; i < 15; i++)
{
lon[0][i] = i + 1;
lat[0][i] = 2 * i;
}
for (int i = 0; i < 7; i++)
{
lon[1][i] = 3 * i + 1;
lat[1][i] = 4 * i;
}
lon[0][14] = lon[1][6];
lat[0][14] = lat[1][6];
for (int i = 0; i < 15; i++)
{
printf("L=%i l=%i ", lon[0][i], lat[0][i]);
}
printf("\n");
for (int i = 0; i < 7; i++)
{
printf("L=%i l=%i ", lon[1][i], lat[1][i]);
}
unsigned char *c = new unsigned char[nb[0] * nb[1]];
// Add vectors in parallel.
cudaError_t cudaStatus = compVect(c, lon[0], lat[0], lon[1], lat[1], nb[0], nb[1]);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCuda failed!");
return 1;
}
for (int i = 0; i < nb[0] * nb[1]; i++)
{
if (c[i] == 1)
{
int y = i / nb[0];
int x = i % nb[0];
printf("A=%i B=%i", x, y);
}
}
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
return 0;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t compVect(unsigned char *c, const int *lona, const int *lata, const int *lonb, const int *latb, unsigned int sizeA, unsigned int sizeB)
{
int *dev_lona = 0;
int *dev_lonb = 0;
int *dev_lata = 0;
int *dev_latb = 0;
unsigned char *dev_c = 0;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_c, sizeA*sizeB * sizeof(unsigned char));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_lona, sizeA * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_lata, sizeA * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_lonb, sizeB * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_latb, sizeB * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_lona, lona, sizeA * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_lata, lata, sizeA * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_lonb, lonb, sizeB * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_latb, latb, sizeB * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
// Launch a kernel on the GPU with one thread for each element.
dim3 grid(2, 2);
compKernel << <grid, grid >> > (dev_c, dev_lona, dev_lata, dev_lonb, dev_latb,sizeA,sizeB);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, sizeA * sizeB * sizeof(unsigned char), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_c);
cudaFree(dev_lona);
cudaFree(dev_lonb);
cudaFree(dev_lata);
cudaFree(dev_latb);
return cudaStatus;
}