C

.

// for example...
  float data[10000]; // big enough to retrieve items from DB
  int   size = 0;    // number of valid items in 'data[]' (must not exceed 10000)
  // retrieve data from MySQL into 'data[]', and the number of items into 'size'
  // (your code here)
  float* dev_data = nullptr;
  cudaMalloc(&dev_data, size*sizeof(float)); // allocate device memory
  // 'dev_data' (device) is the destination and 'data' (host) is the source,
  // so the transfer kind has to be cudaMemcpyHostToDevice.
  cudaMemcpy(dev_data, data, size*sizeof(float), cudaMemcpyHostToDevice); // copy HOST to DEVICE
  ...

see https://dev.mysql.com/doc/refman/5.6/en/c-api-functions.html
to query and get the result.

.

I’m sorry to inform you that you cannot perform these operations with CUDA.

In a few words, with CUDA you can move “big” bunches of data between CPU and GPU and perform the “same” operation over multiple items simultaneously (e.g., you move several pictures and apply the same processing to tons of pixels).

With MySQL, you cannot run the SAME execution flow over hundreds or millions of threads.

But, according to https://journals.agh.edu.pl/csci/article/view/280
It is possible.

Is there any other approach I can use?

according to the paper,

  • all records are already in DEVICE memory.
  • the GPU finds the records in the record set that match the WHERE clause.
// nvcc fake_select.cpp --expt-extended-lambda 
// CUDA 8.0

#include <cuda_runtime.h>
#include <device_launch_parameters.h>

// One row of the record set that kernel_select searches: two float columns.
// main() fills rows positionally as record{height, weight}, so the member
// order (and the struct staying a plain aggregate) matters.
struct record {
  float height;  // first column
  float weight;  // second column; the example WHERE clause filters on this one
};

// Device-side "SELECT ... WHERE": appends the index of every record that
// satisfies 'condition' to 'indices' and counts the matches in '*count'.
//
// Launch layout : 1D grid of 1D blocks, any size; each thread walks the records
//                 with a stride of gridDim.x * blockDim.x. No shared memory.
// Preconditions : '*count' must be 0 before the launch (e.g. via cudaMemset).
//                 The kernel cannot reset it safely itself, because
//                 __syncthreads() only synchronises the threads of ONE block;
//                 a reset done by block 0 could wipe out increments already
//                 made by other blocks.
//                 'indices' must have room for 'size' entries.
// Result        : indices[0 .. *count) hold the matching positions in an
//                 unspecified order (slots are handed out by atomicAdd).
//
// records   - device pointer to 'size' records (read only)
// size      - number of records
// condition - device-callable predicate taking a 'const record&'
// count     - device pointer to the match counter
// indices   - device pointer to the output index buffer
template<typename Where>
__global__ void kernel_select(const record* records, unsigned int size, 
                              Where condition, int* count, int* indices) {
  const unsigned int stride = gridDim.x * blockDim.x;
  for ( unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += stride ) {
    if ( condition(records[i]) ) {
      // atomicAdd returns the previous counter value, i.e. a unique output slot
      indices[atomicAdd(count,1)] = i;
    }
  }
}

#include <random>
#include <iostream>
#include <iomanip>
#include <algorithm>
using namespace std;

// Returns true when 'status' is cudaSuccess; otherwise reports the failed
// step 'what' together with the CUDA error string on stderr and returns false.
static bool cuda_ok(cudaError_t status, const char* what) {
  if ( status == cudaSuccess ) return true;
  cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << endl;
  return false;
}

// Demo driver: fills N random records, runs the WHERE filter on the GPU and
// prints the matching records. Returns 0 on success, 1 if any CUDA step failed.
int main() {
  const int N = 100;
  record records[N];

  // fill records[] with random numbers
  mt19937 gen;
  uniform_real_distribution<float> dist;
  generate_n( records, N, [&]() { return record{dist(gen),dist(gen)}; });

  // result
  int indices[N];
  int count = 0;

  // device buffers; kept null until allocated so the cleanup below is always safe
  record* dev_records = nullptr;
  int* dev_indices = nullptr;
  int* dev_count = nullptr;

  // WHERE weight < 0.1
  auto where = [] __device__(const record& rec) { return rec.weight < 0.1f; };

  // allocate device-mem., copy records from HOST to DEVICE and zero the
  // match counter (kernel_select requires *count == 0 on entry)
  bool ok =
       cuda_ok(cudaMalloc(&dev_records, N*sizeof(record)), "cudaMalloc(dev_records)")
    && cuda_ok(cudaMalloc(&dev_indices, N * sizeof(int)), "cudaMalloc(dev_indices)")
    && cuda_ok(cudaMalloc(&dev_count, sizeof(int)), "cudaMalloc(dev_count)")
    && cuda_ok(cudaMemcpy(dev_records, records, N*sizeof(record), cudaMemcpyHostToDevice),
               "cudaMemcpy(records -> dev_records)")
    && cuda_ok(cudaMemset(dev_count, 0, sizeof(int)), "cudaMemset(dev_count)");

  if ( ok ) {
    // SELECT * FROM dev_records
    // WHERE weight < 0.1
    dim3 grid = (N+255)/256;
    dim3 block = 256;
    kernel_select<<<grid, block>>>(dev_records, N, where, dev_count, dev_indices);

    // a kernel launch returns nothing; configuration errors show up here,
    // execution errors at the blocking cudaMemcpy that follows
    ok = cuda_ok(cudaGetLastError(), "kernel_select launch")
      && cuda_ok(cudaMemcpy(&count, dev_count, sizeof(int), cudaMemcpyDeviceToHost),
                 "cudaMemcpy(dev_count -> count)");
  }

  // never trust the device counter as a copy size into a fixed-size host array
  if ( ok && (count < 0 || count > N) ) {
    cerr << "unexpected match count: " << count << endl;
    ok = false;
  }

  // copy result from DEVICE to HOST.
  if ( ok && count > 0 ) {
    ok = cuda_ok(cudaMemcpy(indices, dev_indices, count*sizeof(int), cudaMemcpyDeviceToHost),
                 "cudaMemcpy(dev_indices -> indices)");
  }

  if ( ok ) {
    for (int i = 0; i < count; ++i) {
      cout << setw(10) << records[indices[i]].height << " : "  
           << setw(10) << records[indices[i]].weight << endl;
    }
    cout << count << " records found." << endl;
  }

  // cudaFree(nullptr) is a no-op, so this is safe after a partial failure
  cudaFree(dev_records);
  cudaFree(dev_count);
  cudaFree(dev_indices);
  cudaDeviceReset();

  return ok ? 0 : 1;
}

outputs:

  0.141886 : 0.00478348
  0.765517 :  0.0512164
    0.7952 :  0.0364413
  0.679703 : 0.00281843
  0.223812 :  0.0177739
  0.243525 :  0.0135391
   0.35166 :  0.0675954
7 records found.

Yes, I read about these databases some years ago. I told you “no” only because I assumed (maybe wrongly) that your knowledge of CUDA was limited: you cannot simply run the first C/C++ library you come across on CUDA. When someone manages to build and run an SQL engine (or another technology) on CUDA with good results, it doesn’t mean that you can run ANY SQL engine on CUDA. GPGPU is a different technology from the CPU, with different kinds of bottlenecks.