deviceInfo.cu
#include <cuda_runtime_api.h>

#include <cstdio>
#include <iostream>

// Beginning of GPU Architecture definitions
/**
 * Returns the number of cores per SM for the given SM (compute capability)
 * version.
 *
 * The version is looked up as 0xMm, where M = `major` and m = `minor`.
 * If the version is not in the table, a diagnostic is printed to stdout and
 * the value of the last table entry is returned.
 *
 * @param major SM major version.
 * @param minor SM minor version.
 * @return Cores per SM for the matching (or fallback) table entry.
 */
inline int _ConvertSMVer2Cores(int major, int minor) {
  // Defines for GPU Architecture types (using the SM version to determine
  // the # of cores per SM).
  typedef struct {
    int SM;  // 0xMm (hexadecimal notation), M = SM major version,
             // and m = SM minor version
    int Cores;
  } sSMtoCores;

  static const sSMtoCores nGpuArchCoresPerSM[] = {
      {0x30, 192}, {0x32, 192}, {0x35, 192}, {0x37, 192}, {0x50, 128},
      {0x52, 128}, {0x53, 128}, {0x60, 64},  {0x61, 128}, {0x62, 128},
      {0x70, 64},  {0x72, 64},  {0x75, 64},  {0x80, 64},  {0x86, 128},
      {0x87, 128}, {0x89, 128}, {0x90, 128}, {0xa0, 128}, {0xa1, 128},
      {0xc0, 128}, {-1, -1}};  // {-1, -1} is the end-of-table sentinel

  const int smVersion = (major << 4) + minor;

  int index = 0;
  while (nGpuArchCoresPerSM[index].SM != -1) {
    if (nGpuArchCoresPerSM[index].SM == smVersion) {
      return nGpuArchCoresPerSM[index].Cores;
    }
    index++;
  }

  // If we don't find the values, we default to the last real entry
  // (the one just before the sentinel) so the program can still run.
  std::printf(
      "MapSMtoCores for SM %d.%d is undefined."
      " Default to use %d Cores/SM\n",
      major, minor, nGpuArchCoresPerSM[index - 1].Cores);
  return nGpuArchCoresPerSM[index - 1].Cores;
}

/**
 * Returns the GPU architecture name for the given SM (compute capability)
 * version.
 *
 * The version is looked up as 0xMm, where M = `major` and m = `minor`.
 * If the version is not in the table, a diagnostic is printed to stdout and
 * the name of the last table entry is returned.
 *
 * @param major SM major version.
 * @param minor SM minor version.
 * @return Pointer to a string literal naming the architecture; the caller
 *         must not free or modify it.
 */
inline const char *_ConvertSMVer2ArchName(int major, int minor) {
  // Defines for GPU Architecture types (using the SM version to determine
  // the GPU Arch name).
  typedef struct {
    int SM;  // 0xMm (hexadecimal notation), M = SM major version,
             // and m = SM minor version
    const char *name;
  } sSMtoArchName;

  static const sSMtoArchName nGpuArchNameSM[] = {
      {0x30, "Kepler"},    {0x32, "Kepler"},    {0x35, "Kepler"},
      {0x37, "Kepler"},    {0x50, "Maxwell"},   {0x52, "Maxwell"},
      {0x53, "Maxwell"},   {0x60, "Pascal"},    {0x61, "Pascal"},
      {0x62, "Pascal"},    {0x70, "Volta"},     {0x72, "Xavier"},
      {0x75, "Turing"},    {0x80, "Ampere"},    {0x86, "Ampere"},
      {0x87, "Ampere"},    {0x89, "Ada"},       {0x90, "Hopper"},
      {0xa0, "Blackwell"}, {0xa1, "Blackwell"}, {0xc0, "Blackwell"},
      {-1, "Graphics Device"}};  // SM == -1 is the end-of-table sentinel

  const int smVersion = (major << 4) + minor;

  int index = 0;
  while (nGpuArchNameSM[index].SM != -1) {
    if (nGpuArchNameSM[index].SM == smVersion) {
      return nGpuArchNameSM[index].name;
    }
    index++;
  }

  // If we don't find the values, we default to the last real entry
  // (the one just before the sentinel) so the program can still run.
  std::printf(
      "MapSMtoArchName for SM %d.%d is undefined."
      " Default to use %s\n",
      major, minor, nGpuArchNameSM[index - 1].name);
  return nGpuArchNameSM[index - 1].name;
}
// end of GPU Architecture definitions

/**
 * Enumerates the CUDA devices visible to the runtime and prints a summary of
 * each one (name, architecture, compute capability, core count, memory and
 * launch limits) to stdout.
 *
 * @return 0 on success; 1 if the device count or any device's properties
 *         could not be queried (the error is reported on stderr).
 */
int main() {
  int count = 0;
  // Query how many CUDA devices are available.
  const cudaError_t countError = cudaGetDeviceCount(&count);
  if (cudaSuccess != countError) {
    std::cerr << "cudaGetDeviceCount error "
              << cudaGetErrorString(countError) << std::endl;
    return 1;
  }

  // int gpuid = 0;            // choose GPU 0
  // cudaSetDevice(gpuid);     // select the GPU to use by index (default 0)
  // cudaGetDevice(&gpuid);    // get the index of the GPU used by the
  //                           // current thread

  int status = 0;
  for (int i = 0; i < count; ++i) {
    struct cudaDeviceProp device_prop;
    auto error = cudaGetDeviceProperties(&device_prop, i);
    if (cudaSuccess != error) {
      std::cerr << "cudaGetDeviceProperties " << i << " error "
                << cudaGetErrorString(error) << std::endl;
      status = 1;  // report the failure through the exit code
      break;
    }

    std::cout << "GPU \t" << i << std::endl;
    std::cout << "Name: \t" << device_prop.name << std::endl;
    std::cout << "Architecture: "
              << _ConvertSMVer2ArchName(device_prop.major, device_prop.minor)
              << std::endl;
    std::cout << "Capability: \t" << device_prop.major << "."
              << device_prop.minor << std::endl;
    // Total cores = cores per SM * number of SMs on the device.
    std::cout << "Spcores \t"
              << _ConvertSMVer2Cores(device_prop.major, device_prop.minor) *
                     device_prop.multiProcessorCount
              << std::endl;
    // totalGlobalMem is divided by 1024 twice and labelled MB.
    std::cout << "Total Memory: \t"
              << (device_prop.totalGlobalMem / 1024 / 1024) << " MB "
              << std::endl;
    // sharedMemPerBlock is divided by 1024 and labelled KB.
    std::cout << "Shared Memory Per Block: \t"
              << (device_prop.sharedMemPerBlock / 1024) << " KB "
              << std::endl;
    std::cout << "warpSize: \t" << device_prop.warpSize << std::endl;
    std::cout << "Max Threads Per Block: \t"
              << device_prop.maxThreadsPerBlock << std::endl;
    std::cout << "Max Threads Dim: \t[" << device_prop.maxThreadsDim[0]
              << ", " << device_prop.maxThreadsDim[1] << ", "
              << device_prop.maxThreadsDim[2] << "]" << std::endl;
    std::cout << "Max Grid Size: \t[" << device_prop.maxGridSize[0] << ", "
              << device_prop.maxGridSize[1] << ", "
              << device_prop.maxGridSize[2] << "]" << std::endl;
  }
  return status;
}
_ConvertSMVer2Cores 用于获取每个流多处理器 (SM) 的 CUDA 核心数,_ConvertSMVer2ArchName 用于获取架构名称,这两个函数都来自 https://github.com/NVIDIA/cuda-samples/blob/master/Common/helper_cuda.h
CMakeLists.txt
cmake_minimum_required(VERSION 3.26)
project(learningcuda CUDA CXX)

# find_package(CUDAToolkit) provides the CUDA::toolkit imported target and
# defines optional imported targets for the libraries shipped with the CUDA
# Toolkit. For example, CUDA::cudart imports the CUDA Runtime library and
# CUDA::cublas imports the cuBLAS library.
find_package(CUDAToolkit REQUIRED)

set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CUDA_STANDARD 11)

# CMAKE_CUDA_ARCHITECTURES was added in CMake 3.18 and selects the GPU
# architectures that CUDA code is compiled for. To use features of a newer
# architecture, that architecture must be listed here explicitly.
# Run `nvidia-smi -q | grep Architecture` to see the architecture of the
# installed GPU.
set(CMAKE_CUDA_ARCHITECTURES 60)

add_executable(deviceInfo deviceInfo.cu)
运行结果
$ ./deviceInfo
GPU 0
Name: NVIDIA GeForce GTX 1050 Ti
Architecture: Pascal
Capability: 6.1
Spcores 768
Total Memory: 4038 MB
Shared Memory Per Block: 48 KB
warpSize: 32
Max Threads Per Block: 1024
Max Threads Dim: [1024, 1024, 64]
Max Grid Size: [2147483647, 65535, 65535]