Load csv with cudf in c++

Hi,
I have recently started using cuDF to parse CSV files in C++. I’m able to build an example and get some info about the loaded data. However, I would like to specify the data type for each column and then extract the value at each index (i…n, j…m).

    // Sketch of the desired usage: load a CSV and read individual cells by
    // (row, column). `read_csv` is the poster's own helper; its definition is
    // not shown in this snippet.
    // Read given file passed as argument
    std::string filename(argv[1]);
    // TODO: how to specify data type for each columns
    auto reader = read_csv(filename);
    auto schema_info = reader.metadata.schema_info;
    size_t cols = reader.tbl->num_columns();
    
    for(size_t row = 0; row < reader.tbl->num_rows(); ++row)
    {
        // TODO: how to get access to value similar to lines below
        // NOTE(review): `at(row, col)` is the accessor being asked for, not a
        // call that is known to exist on the table — confirm against the
        // libcudf API before relying on it.
        size_t col = 0;
        int64_t timestamp = reader.tbl->at(row,col);
        ++col; // implement check if col > cols
        std::string instrument = reader.tbl->at(row,col);
        ++col; // implement check if col > cols
        double price = reader.tbl->at(row,col);
    }

Thanks,
Dom

Most RAPIDS requests are about Python, but the C++ interface is the foundation of the Python implementation. This sample code is meant to provide building blocks for the code you already presented, and it uses generic `auto` typing to skip past the implementation details of varying column data types. Since your question was mainly about setting column dtypes, please look over this issue, which lists all the possible types: https://github.com/rapidsai/cudf/issues/1119 . For character strings, see the more detailed libcudf developer guide and its string_scalar topic: https://docs.rapids.ai/api/libcudf/stable/developer_guide



> **Use traits to set gdf_data elements and other typedefs · Issue #1119 · rapidsai/cudf** (github.com)
> Once #892 is merged and after we move away from cffi in #599, update the union gdf_data to use traits defined members. The code to be changed: typedef union { int8_t si08; /**< GDF_INT8 */ int16_t…

#include <cudf/io/functions.hpp>
#include <cudf/io/csv.hpp>
#include <cudf/column/column_factories.hpp>
#include <cudf/types.hpp>

// Reads a CSV file through cudf::io::read_csv and returns every column as a
// vector of gdf_scalar keyed by the column's name.
//
// Parameters:
//   filename     - path of the CSV file; wrapped in cudf::io::source_info.
//   column_types - column name -> gdf_dtype; each entry is handed to
//                  options.set_dtypes before the read.
// Returns:
//   A map from column name to the values copied out of that column.
//
// NOTE(review): this sample is written against the legacy gdf_* names
// (gdf_scalar, gdf_dtype). A later reply in this thread reports that they are
// not available in the latest libcudf, so the code presumably does not build
// as-is — confirm against the installed libcudf version.
std::unordered_map<std::string, std::vector<gdf_scalar>> read_csv_with_types(const std::string& filename, const std::unordered_map<std::string, gdf_dtype>& column_types)
{
// Create an empty unordered map to store the extracted data
std::unordered_map<std::string, std::vector<gdf_scalar>> data;

// Read the CSV file using cuDF
cudf::io::csv_reader_options options = cudf::io::csv_reader_options::builder(cudf::io::source_info{filename});
// NOTE(review): set_dtypes is called once per map entry; verify that repeated
// calls accumulate instead of replacing the types set by the previous call.
for (const auto& column_type : column_types)
{
options.set_dtypes({column_type.first}, {column_type.second});
}
auto result = cudf::io::read_csv(options);

// Iterate over each column in the result table
// NOTE(review): other snippets in this thread access the table with `->`
// (e.g. reader.tbl->num_columns()) and take column names from
// metadata.schema_info; the `.`/`->` usage and `column->name()` below may
// need adjusting — TODO confirm.
auto table = result.tbl;
for (cudf::size_type i = 0; i < table.num_columns(); ++i)
{
const auto& column = table.get_column(i);
const auto& column_name = column->name();
const auto& column_data = static_cast<const cudf::column_view&>(*column).begin<gdf_scalar>();

// Store the column data in the unordered map
// NOTE(review): a later post in this thread hits a segmentation fault when
// reading through a column_view begin() pointer on the host and resolves it by
// copying the data to host memory first; this iterator-range copy presumably
// needs the same treatment — TODO confirm.
data[column_name] = std::vector<gdf_scalar>(column_data, column_data + column->size());
}

return data;
}

// Example driver: reads "your_file.csv" with an explicit type per column and
// then walks the per-column data returned by read_csv_with_types.
// Returns 0 on completion.
int main()
{
    // Specify the filename of the CSV file
    std::string filename = "your_file.csv";

    // Specify the data types of each column
    std::unordered_map<std::string, gdf_dtype> column_types;
    column_types["column1"] = GDF_INT32;
    column_types["column2"] = GDF_FLOAT64;
    column_types["column3"] = GDF_STRING;

    // Call the CSV file reader function
    std::unordered_map<std::string, std::vector<gdf_scalar>> data = read_csv_with_types(filename, column_types);

    // Access the extracted data
    for (const auto& column_data : data)
    {
        const std::string& column_name = column_data.first;
        const std::vector<gdf_scalar>& column_values = column_data.second;

        // Plain ASCII quotes are required here; typographic quotes are not
        // valid string-literal delimiters in C++.
        std::cout << "Column '" << column_name << "' data:\n";
        for (const auto& scalar : column_values)
        {
            // Access scalar value using scalar.is_valid and scalar.data
            // …
        }
    }

    return 0;
}

Hi @bfurtaw,

It looks like gdf_data is not available in the latest libcudf. However, I have written a similar implementation using cudf::data_type, but it gives a segmentation fault when reading the column values through a typed pointer. The code snippet is below:

#include <cudf/io/csv.hpp>
#include <cudf/table/table.hpp>
#include <cudf/column/column_factories.hpp>
#include <cudf/types.hpp>

// Maps a CSV column name to the cuDF data type requested for that column.
using ColumnTypes = std::map<std::string, cudf::data_type>;

// Reads the CSV file at `file_path` using the per-column types in
// `column_types` and returns the resulting table together with its metadata.
cudf::io::table_with_metadata read_csv(
    std::string const& file_path, ColumnTypes column_types)
{
    // Build the reader options in one chained expression: source, dtypes, build.
    cudf::io::csv_reader_options options =
        cudf::io::csv_reader_options::builder(cudf::io::source_info(file_path))
            .dtypes(column_types)
            .build();

    return cudf::io::read_csv(options);
}

// Reproduction of the reported crash: reads the CSV named by argv[1] with
// explicit types for "open_time" and "open", then prints every value of the
// first column through a typed pointer obtained from the column view.
int main(int argc, char** argv)
{
    ColumnTypes column_types;
    column_types["open_time"] = cudf::data_type(cudf::type_id::INT64);
    column_types["open"] = cudf::data_type(cudf::type_id::FLOAT32);

    // Read data
    // NOTE(review): argv[1] is read without checking argc first.
    std::string filename(argv[1]);
    auto reader = read_csv(filename, column_types);
    auto schema_info = reader.metadata.schema_info;
    size_t cols = reader.tbl->num_columns();
    size_t rows = reader.tbl->num_rows();

    for(size_t ridx = 0; ridx < rows; ++ridx)
    {
        size_t cidx = 0; // iterate on first column only
        const auto& column = reader.tbl->get_column(cidx);
        auto name = schema_info.at(cidx).name;
        // NOTE(review): this pointer presumably refers to device (GPU) memory —
        // the fix posted later in this thread copies the data to the host with
        // cudaMemcpy before reading it — so indexing it from host code is the
        // likely cause of the crash below. TODO confirm.
        const int64_t* values = column.view().begin<int64_t>();
        std::cout << values[ridx] << "\n"; // THROW SEGMENTATION FAULT
    }
    
    std::cout << "\n";

    return 0;

}

Solved by copying the column data into a thrust::host_vector before reading it:

// Copies the elements of `col_view`, interpreted as type T, into a
// thrust::host_vector<T> and returns it.
//
// The returned vector always has col_view.size() elements; the cudaMemcpy
// (cudaMemcpyDefault) is skipped when the view reports itself as empty.
// Only the element data is copied; no validity information is consulted here.
template<typename T>
thrust::host_vector<T> get_host_values(cudf::column_view col_view)
{
    size_t const element_count = col_view.size();
    T const* const source = col_view.begin<T>();

    thrust::host_vector<T> host_copy(element_count);
    if (col_view.is_empty())
    {
        // Nothing to transfer for an empty column.
        return host_copy;
    }

    CUDF_CUDA_TRY(cudaMemcpy(host_copy.data(), source, element_count * sizeof(T), cudaMemcpyDefault));
    return host_copy;
}

// Abridged driver showing the fix: the first column is copied to the host
// once via get_host_values and then printed row by row. The `....` lines mark
// code elided by the poster (setup such as `reader` and `rows` is not shown).
int main(int argc, char** argv)
{
    ....
    // Host-side cache of the first column; starts empty and is filled lazily.
    thrust::host_vector<int64_t> values;
    for(size_t ridx = 0; ridx < rows; ++ridx)
    {
        size_t cidx = 0; // iterate on first column only
        const auto& column = reader.tbl->get_column(cidx);
        // cache values from device to host once
        // (an empty `values` is used as the "not copied yet" marker)
        if(values.size() == 0)
        {
            // TODO: get host data for each column.view().type().id()
            values = get_host_values<int64_t>(column.view());
        }
        std::cout << values[ridx] << "\n"; 
    }
   ....
}

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.