Goal: hand-write a few CUDA libraries and call them from Python (on Linux).

Step 1: Install pybind11 on Linux

Method 1: sudo apt-get install python3-pybind11
Method 2: git clone https://github.com/pybind/pybind11.git; if you place the clone inside the project directory, no separate build is needed, since pybind11 is header-only (a quick sanity check is sketched below).
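A minimal sketch of that check, assuming pybind11 is importable in your interpreter (method 1 via apt, or a pip install); with the git-clone approach the headers simply sit under ./pybind11/include in the project:

import pybind11

# Print the installed version and the header directory a compiler or CMake needs.
print(pybind11.__version__)
print(pybind11.get_include())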

Step 2: Write the CUDA code

Example: gpu_library.cu

#include <sstream>
#include <iostream>
#include <stdexcept>
#include <cmath>
#include <cuda_runtime.h>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>

// Element-wise multiplication of a device vector by a scalar
template <typename T>
__global__ void kernel(T *vec, T scalar, int num_elements)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < num_elements) {
        vec[idx] = vec[idx] * scalar;
    }
}

// Configure the launch geometry, run the kernel, and check for launch errors
template <typename T>
void run_kernel(T *vec, T scalar, int num_elements)
{
    dim3 dimBlock(256, 1, 1);
    dim3 dimGrid(ceil((T)num_elements / dimBlock.x));

    kernel<T><<<dimGrid, dimBlock>>>(vec, scalar, num_elements);

    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) {
        std::stringstream strstr;
        strstr << "run_kernel launch failed" << std::endl;
        strstr << "dimBlock: " << dimBlock.x << ", " << dimBlock.y << std::endl;
        strstr << "dimGrid: " << dimGrid.x << ", " << dimGrid.y << std::endl;
        strstr << cudaGetErrorString(error);
        throw std::runtime_error(strstr.str());
    }
}

// Copy a 1-D NumPy array to the GPU, scale it, and copy the result back in place
template <typename T>
void map_array(pybind11::array_t<T> vec, T scalar)
{
    pybind11::buffer_info ha = vec.request();

    if (ha.ndim != 1) {
        std::stringstream strstr;
        strstr << "ha.ndim != 1" << std::endl;
        strstr << "ha.ndim: " << ha.ndim << std::endl;
        throw std::runtime_error(strstr.str());
    }

    int size = ha.shape[0];
    int size_bytes = size * sizeof(T);

    T *gpu_ptr;
    cudaError_t error = cudaMalloc(&gpu_ptr, size_bytes);
    if (error != cudaSuccess) {
        throw std::runtime_error(cudaGetErrorString(error));
    }

    T *ptr = reinterpret_cast<T*>(ha.ptr);
    error = cudaMemcpy(gpu_ptr, ptr, size_bytes, cudaMemcpyHostToDevice);
    if (error != cudaSuccess) {
        throw std::runtime_error(cudaGetErrorString(error));
    }

    run_kernel<T>(gpu_ptr, scalar, size);

    error = cudaMemcpy(ptr, gpu_ptr, size_bytes, cudaMemcpyDeviceToHost);
    if (error != cudaSuccess) {
        throw std::runtime_error(cudaGetErrorString(error));
    }

    error = cudaFree(gpu_ptr);
    if (error != cudaSuccess) {
        throw std::runtime_error(cudaGetErrorString(error));
    }
}

PYBIND11_MODULE(gpu_library, m)
{
    m.def("multiply_with_scalar", map_array<double>);
}
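Only the double instantiation is exposed by the module above, so from Python the function expects a float64 NumPy array and scales it in place. A minimal usage sketch, assuming the module has already been built as described in steps 3 and 4:

import numpy as np
import gpu_library

# The array must be float64 to match map_array<double>; it is modified in place.
a = np.arange(4, dtype=np.float64)
gpu_library.multiply_with_scalar(a, 2.0)
print(a)  # expected: [0. 2. 4. 6.]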
Step 3: Write CMakeLists.txt
cmake_minimum_required(VERSION 3.6)
find_package(CUDA REQUIRED)
find_package(PythonInterp 3.6 REQUIRED)
find_package(PythonLibs 3.6 REQUIRED)

include_directories(
    ${PYTHON_INCLUDE_DIRS}
    ./pybind11/include)

link_directories(/usr/local/cuda/lib64)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")

cuda_add_library(gpu_library SHARED
    src/gpu_library.cu)

target_link_libraries(gpu_library
    ${PYTHON_LIBRARIES}
    cudart)

set_target_properties(gpu_library PROPERTIES PREFIX "")
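FindCUDA and FindPythonLibs are older CMake modules; if the configure step appears to pick up a different Python than the one you actually run, a quick check (not part of the build itself) shows what your interpreter reports, which you can compare against CMake's output:

import sysconfig

# Include directory and version of the interpreter you actually run.
print(sysconfig.get_paths()["include"])
print(sysconfig.get_python_version())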
Step 4: Build
mkdir build
cd build
cmake ..
make -j 8
export PYTHONPATH="$PWD:$PYTHONPATH"          (run this line from inside the build directory)
cd ..
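Before running the benchmark, a quick import check, assuming PYTHONPATH was exported from the build directory as above, confirms the module loads:

import gpu_library

# pybind11 generates a docstring containing the bound signature.
print(gpu_library.multiply_with_scalar.__doc__)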
Step 5: Test

Example test script:

import gpu_library
import numpy as np
import time

size = 100000000
arr1 = np.linspace(1.0, 100.0, size)
arr2 = np.linspace(1.0, 100.0, size)

runs = 10
factor = 3.0

# GPU: the bound function scales arr1 in place on the device each iteration
t0 = time.time()
for _ in range(runs):
    gpu_library.multiply_with_scalar(arr1, factor)
print("gpu time: " + str(time.time() - t0))

# CPU: equivalent NumPy computation for comparison
t0 = time.time()
for _ in range(runs):
    arr2 = arr2 * factor
print("cpu time: " + str(time.time() - t0))

print("results match: " + str(np.allclose(arr1, arr2)))

Output: (the original post shows a screenshot of the timing results here)
