Goal: hand-write a small CUDA library and call it from Python (on Linux)
Step 1: Install pybind11 on Linux
Option 1: sudo apt-get install python3-pybind11
Option 2: git clone https://github.com/pybind/pybind11.git; pybind11 is header-only, so if the clone lives inside the project directory there is nothing to build (see the minimal check below).
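Before adding any CUDA code, it can help to check that the pybind11 headers are actually found. The block below is a minimal, CPU-only sketch; the file name hello.cpp, the module name hello, and the function add are placeholders for this check only, not part of the project:

// hello.cpp: minimal pybind11 module used only to verify the setup
#include <pybind11/pybind11.h>

int add(int a, int b) { return a + b; }

PYBIND11_MODULE(hello, m) {
    m.def("add", &add, "Add two integers");
}

If the pybind11 Python package is importable, this can be built by hand with something like c++ -O2 -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) hello.cpp -o hello$(python3-config --extension-suffix); afterwards import hello; hello.add(1, 2) should work from that directory.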
Step 2: Write the CUDA code
Example: gpu_library.cu

#include <sstream>
#include <iostream>
#include <cuda_runtime.h>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>

// Element-wise scaling: each thread multiplies one element by the scalar.
template <typename T>
__global__ void kernel(T *vec, T scalar, int num_elements)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < num_elements) {
        vec[idx] = vec[idx] * scalar;
    }
}

// Launch the kernel with 256 threads per block and check for launch errors.
template <typename T>
void run_kernel(T *vec, T scalar, int num_elements)
{
    dim3 dimBlock(256, 1, 1);
    dim3 dimGrid(ceil((T)num_elements / dimBlock.x));

    kernel<T><<<dimGrid, dimBlock>>>(vec, scalar, num_elements);

    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) {
        std::stringstream strstr;
        strstr << "run_kernel launch failed" << std::endl;
        strstr << "dimBlock: " << dimBlock.x << ", " << dimBlock.y << std::endl;
        strstr << "dimGrid: " << dimGrid.x << ", " << dimGrid.y << std::endl;
        strstr << cudaGetErrorString(error);
        throw std::runtime_error(strstr.str());
    }
}

// Copy the NumPy array to the GPU, run the kernel, and copy the result back in place.
template <typename T>
void map_array(pybind11::array_t<T> vec, T scalar)
{
    pybind11::buffer_info ha = vec.request();
    if (ha.ndim != 1) {
        std::stringstream strstr;
        strstr << "ha.ndim != 1" << std::endl;
        strstr << "ha.ndim: " << ha.ndim << std::endl;
        throw std::runtime_error(strstr.str());
    }

    int size = ha.shape[0];
    int size_bytes = size * sizeof(T);

    T *gpu_ptr;
    cudaError_t error = cudaMalloc(&gpu_ptr, size_bytes);
    if (error != cudaSuccess) {
        throw std::runtime_error(cudaGetErrorString(error));
    }

    T *ptr = reinterpret_cast<T*>(ha.ptr);
    error = cudaMemcpy(gpu_ptr, ptr, size_bytes, cudaMemcpyHostToDevice);
    if (error != cudaSuccess) {
        throw std::runtime_error(cudaGetErrorString(error));
    }

    run_kernel<T>(gpu_ptr, scalar, size);

    error = cudaMemcpy(ptr, gpu_ptr, size_bytes, cudaMemcpyDeviceToHost);
    if (error != cudaSuccess) {
        throw std::runtime_error(cudaGetErrorString(error));
    }

    error = cudaFree(gpu_ptr);
    if (error != cudaSuccess) {
        throw std::runtime_error(cudaGetErrorString(error));
    }
}

PYBIND11_MODULE(gpu_library, m)
{
    m.def("multiply_with_scalar", map_array<double>);
}
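map_array above repeats the same cudaError_t check after every CUDA call. One way to keep that in a single place is a small helper like the sketch below; cuda_check is a made-up name for illustration, not part of the CUDA API:

// Hypothetical helper that wraps the repeated error checks from map_array.
#include <stdexcept>
#include <string>
#include <cuda_runtime.h>

inline void cuda_check(cudaError_t error, const char *what)
{
    if (error != cudaSuccess) {
        throw std::runtime_error(std::string(what) + ": " + cudaGetErrorString(error));
    }
}

// Example use inside map_array:
//   cuda_check(cudaMalloc(&gpu_ptr, size_bytes), "cudaMalloc");
//   cuda_check(cudaMemcpy(gpu_ptr, ptr, size_bytes, cudaMemcpyHostToDevice), "cudaMemcpy H2D");

Note that the module only binds the double instantiation; a float32 version could be exposed the same way by adding another m.def line for map_array<float> under a name of your choice.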
Step 3: Write the CMakeLists.txt
The layout assumed here: CMakeLists.txt at the project root, the CUDA source at src/gpu_library.cu, and the pybind11 clone at ./pybind11. The PREFIX "" property drops the default "lib" prefix so the resulting gpu_library.so can be imported from Python under its module name. (find_package(CUDA) and cuda_add_library come from CMake's legacy FindCUDA module; they still work here but are deprecated in newer CMake releases.)

cmake_minimum_required(VERSION 3.6)

find_package(CUDA REQUIRED)
find_package(PythonInterp 3.6 REQUIRED)
find_package(PythonLibs 3.6 REQUIRED)

include_directories(
    ${PYTHON_INCLUDE_DIRS}
    ./pybind11/include
)

link_directories(/usr/local/cuda/lib64)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")

cuda_add_library(gpu_library SHARED
    src/gpu_library.cu
)

target_link_libraries(gpu_library
    ${PYTHON_LIBRARIES}
    cudart
)

set_target_properties(gpu_library PROPERTIES PREFIX "")
Step 4: Build
mkdir build
cd build
cmake ..
make -j 8
export PYTHONPATH="$PWD:$PYTHONPATH" (run this inside the build directory; that is where gpu_library.so is produced)
cd ..
Step 5: Test
Example test script:

import gpu_library
import numpy as np
import time

size = 100000000
arr1 = np.linspace(1.0, 100.0, size)
arr2 = np.linspace(1.0, 100.0, size)
runs = 10
factor = 3.0

# GPU: multiply_with_scalar scales arr1 in place (copy to GPU, kernel, copy back).
t0 = time.time()
for _ in range(runs):
    gpu_library.multiply_with_scalar(arr1, factor)
print("gpu time: " + str(time.time() - t0))

# CPU reference: the same scaling done with NumPy.
t0 = time.time()
for _ in range(runs):
    arr2 = arr2 * factor
print("cpu time: " + str(time.time() - t0))

print("results match: " + str(np.allclose(arr1, arr2)))
Output: