NVIDIA硬件提供了64KB的常量内存,并且对常量呢日常采取了不同于标准全局内存的处理方式。在某些情况下,常量内存替换全局内存能有效地减少内存带宽。
常量内存是有常量缓存的全局内存,数量有限,大小仅为64KB,由于有缓存,线程束在读取相同的常量内存数据时,访问速度比全局内存块。
常量内存中的数据对于同一编译单元内所有线程可见。
使用__constant__修饰的变量存放于常量内存中,不能定义在核函数中,且常量内存是静态定义的。
常量内存仅可读,不可写。
给函数传递数值参数时,这个变量就存放于常量内存。
常量内存可以定义的时候初始化,也可以在主机端使用cudaMemcpyToSymbol进行初始化。
线程束中所有的线程从相同内存地址中读取数据时,常量内存表现最好,例如数学公式中的系数,因为线程束中所有的线程都需要读取同一个地址空间的系统数据,因此只需要读取一次,广播给线程束的所有线程。
如果在核函数中定义常量内存会有报错,如下图:
示例一:常量内存变量的定义
#include <stdio.h>
#include "cpu_anim.h"
#include "cuda_runtime.h"
#include <device_launch_parameters.h>
#include <device_functions.h>// 常量的声明、定义、初始化
// Declaration, definition and initialisation of constant-memory variables.
// __constant__ variables are declared at file scope, never inside a kernel.
__constant__ float c_data;          // written from the host with cudaMemcpyToSymbol
__constant__ float c_data2 = 6.6f;  // initialised at its definition

// Prints the current value of c_data from device code.
// Launch layout: main() launches it with a single thread (<<<1, 1>>>).
__global__ void kernel_1()
{
    // Reading a constant inside a kernel.
    printf("Constant data c_data = %.2f.\n", c_data);
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char* what)
{
    if (err == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Demonstrates writing a constant from the host, reading it in a kernel and
// reading another constant back to the host.
// Returns 0 on success and 1 if any CUDA call fails.
int main()
{
    float h_data = 8.8f;

    // Assign the constant from the host.
    if (!cudaOk(cudaMemcpyToSymbol(c_data, &h_data, sizeof(float)), "cudaMemcpyToSymbol"))
        return 1;

    kernel_1<<<1, 1>>>();
    // A kernel launch returns no status itself: launch errors are picked up
    // here, execution errors at the synchronisation below.
    if (!cudaOk(cudaGetLastError(), "kernel_1 launch"))
        return 1;
    if (!cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize"))
        return 1;

    // Read the value of the second constant back into h_data.
    if (!cudaOk(cudaMemcpyFromSymbol(&h_data, c_data2, sizeof(float)), "cudaMemcpyFromSymbol"))
        return 1;
    printf("Constant data h_data = %.2f.\n", h_data);

    return 0;
}
示例二:模拟显示的显示(raytracing)
生成一堆小球,在一个显示框中生成一堆raytracing,最后生成一张图
#include "cuda_runtime.h"#include "cpu_bitmap.h"
#include <device_launch_parameters.h>#include <iostream>
#include <random>
#include <time.h>
#include <stdlib.h>#define INF 2e10f
#define SPHERE_COUNT 20
#define IMAGE_DIM 512 //图像尺寸
// Pseudo-random value in [0, x]. The argument is parenthesised so that an
// expression argument such as rnd(a + b) expands correctly.
#define rnd(x) ((x) * rand() / double(RAND_MAX))

static void CheckCudaErrorAux(const char*, unsigned, const char*, cudaError_t);
// Wraps a CUDA runtime call; on failure prints the statement, the error and
// the source location, then exits.
#define CUDA_CHECK(value) CheckCudaErrorAux(__FILE__, __LINE__, #value, value)

struct Sphere
{
    float r, g, b;  // sphere colour
    float radius;   // sphere radius
    float x, y, z;  // sphere centre

    // Tests whether the ray parallel to the Z axis through pixel (ox, oy)
    // intersects this sphere.
    //   ox, oy : pixel coordinates relative to the image centre
    //   n      : out; on a hit receives dz / radius, used by the caller as a
    //            shading factor (larger near the middle of the sphere's disc)
    // Returns the Z coordinate of the hit point (dz + z), or -INF on a miss,
    // in which case *n is left untouched.
    __device__ float hit(float ox, float oy, float* n) const
    {
        const float dx = ox - x;
        const float dy = oy - y;
        const float distSq = dx * dx + dy * dy;
        const float radiusSq = radius * radius;

        // The pixel lies inside the sphere's projected disc.
        if (distSq < radiusSq)
        {
            // x*x + y*y + z*z = r*r  =>  z*z = r*r - x*x - y*y
            const float dz = sqrtf(radiusSq - distSq);
            // radius is positive for the spheres generated in main(), so
            // sqrtf(radius * radius) is simply radius.
            *n = dz / radius;
            return dz + z;
        }
        return -INF;
    }
};

// Scene description in constant memory. The array is statically allocated:
// it is filled with cudaMemcpyToSymbol and must never be passed to
// cudaMalloc or cudaFree.
__constant__ Sphere dev_s[SPHERE_COUNT];

// Renders the spheres in dev_s into an RGBA image, one thread per pixel.
//   ptr : device buffer of IMAGE_DIM * IMAGE_DIM pixels, 4 bytes per pixel
// Launch layout: 2D grid of 2D blocks covering at least IMAGE_DIM x IMAGE_DIM
// threads; surplus threads return immediately. No shared memory is used.
__global__ void genBitmap(unsigned char* ptr)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    const int idy = threadIdx.y + blockIdx.y * blockDim.y;
    // The grid is sized with a ceiling division, so guard the image edge.
    if (idx >= IMAGE_DIM || idy >= IMAGE_DIM)
        return;
    const int offset = idx + idy * IMAGE_DIM;

    // Keep the Z axis passing through the centre of the image:
    // ox and oy lie in [-IMAGE_DIM / 2, IMAGE_DIM / 2).
    const float ox = idx - IMAGE_DIM / 2;
    const float oy = idy - IMAGE_DIM / 2;

    float r = 0.8f, g = 0.6f, b = 0.8f;  // default background colour
    float maxz = -INF;
    for (int i = 0; i < SPHERE_COUNT; ++i)
    {
        float fScale = 0.0f;
        const float t = dev_s[i].hit(ox, oy, &fScale);
        if (t > maxz)
        {
            r = dev_s[i].r * fScale;
            g = dev_s[i].g * fScale;
            b = dev_s[i].b * fScale;
            // Remember the largest Z hit so far; without this update the last
            // intersecting sphere in the array would win regardless of depth.
            maxz = t;
        }
    }

    ptr[offset * 4 + 0] = (unsigned char)(int)(r * 255);
    ptr[offset * 4 + 1] = (unsigned char)(int)(g * 255);
    ptr[offset * 4 + 2] = (unsigned char)(int)(b * 255);
    ptr[offset * 4 + 3] = 255;
}

// Generates SPHERE_COUNT random spheres, renders them on the GPU from
// constant memory and displays the resulting bitmap.
int main()
{
    srand((unsigned)time(NULL));

    CPUBitmap bitmap(IMAGE_DIM, IMAGE_DIM);
    unsigned char* dev_bitmap = nullptr;
    CUDA_CHECK(cudaMalloc((void**)&dev_bitmap, bitmap.image_size()));

    // Build the scene in temporary host memory: random centre, colour and
    // radius for every sphere.
    Sphere* host_s = (Sphere*)malloc(sizeof(Sphere) * SPHERE_COUNT);
    if (host_s == nullptr)
    {
        std::cerr << "malloc failed for " << SPHERE_COUNT << " spheres" << std::endl;
        exit(1);
    }
    for (size_t i = 0; i < SPHERE_COUNT; i++)
    {
        host_s[i].r = rnd(1.0f);
        host_s[i].g = rnd(1.0f);
        host_s[i].b = rnd(1.0f);
        host_s[i].x = rnd(1000.0f) - 500;
        host_s[i].y = rnd(1000.0f) - 500;
        host_s[i].z = rnd(1000.0f) - 500;
        host_s[i].radius = rnd(100.0f) + 20;
    }

    // Copy the scene into the constant-memory symbol, then drop the host copy.
    CUDA_CHECK(cudaMemcpyToSymbol(dev_s, host_s, sizeof(Sphere) * SPHERE_COUNT));
    free(host_s);

    // Render the spheres into the bitmap.
    dim3 blocks(32, 32);
    dim3 grids((IMAGE_DIM + blocks.x - 1) / blocks.x, (IMAGE_DIM + blocks.y - 1) / blocks.y);
    genBitmap<<<grids, blocks>>>(dev_bitmap);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    // Copy the bitmap back from the GPU for display.
    CUDA_CHECK(cudaMemcpy(bitmap.get_ptr(), dev_bitmap, bitmap.image_size(), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(dev_bitmap));

    // NOTE(review): the name suggests display_and_exit() may not return -
    // confirm in cpu_bitmap.h. All resources are released before this call so
    // cleanup does not depend on it returning.
    bitmap.display_and_exit();
    return 0;
}

// Reports a failed CUDA runtime call and terminates the program.
//   file, line : source location of the call
//   statement  : text of the checked expression
//   err        : status returned by the call; cudaSuccess is a no-op
static void CheckCudaErrorAux(const char* file, unsigned line, const char* statement, cudaError_t err)
{
    if (err == cudaSuccess)
        return;
    std::cerr << statement << " returned " << cudaGetErrorString(err) << "(" << err << ") at " << file << ":" << line << std::endl;
    exit(1);
}
结果图
计时:
延申: 增加小球个数200,增加计时
使用全局内存实现
#include "cuda_runtime.h"#include "cpu_bitmap.h"
#include <device_launch_parameters.h>#include <iostream>
#include <random>
#include <time.h>
#include <stdlib.h>#define INF 2e10f
#define SPHERE_COUNT 200
#define IMAGE_DIM 512 //图像尺寸
// Pseudo-random value in [0, x]. The argument is parenthesised so that an
// expression argument such as rnd(a + b) expands correctly.
#define rnd(x) ((x) * rand() / double(RAND_MAX))

static void CheckCudaErrorAux(const char*, unsigned, const char*, cudaError_t);
// Wraps a CUDA runtime call; on failure prints the statement, the error and
// the source location, then exits.
#define CUDA_CHECK(value) CheckCudaErrorAux(__FILE__, __LINE__, #value, value)

struct Sphere
{
    float r, g, b;  // sphere colour
    float radius;   // sphere radius
    float x, y, z;  // sphere centre

    // Tests whether the ray parallel to the Z axis through pixel (ox, oy)
    // intersects this sphere.
    //   ox, oy : pixel coordinates relative to the image centre
    //   n      : out; on a hit receives dz / radius, used by the caller as a
    //            shading factor (larger near the middle of the sphere's disc)
    // Returns the Z coordinate of the hit point (dz + z), or -INF on a miss,
    // in which case *n is left untouched.
    __device__ float hit(float ox, float oy, float* n) const
    {
        const float dx = ox - x;
        const float dy = oy - y;
        const float distSq = dx * dx + dy * dy;
        const float radiusSq = radius * radius;

        // The pixel lies inside the sphere's projected disc.
        if (distSq < radiusSq)
        {
            // x*x + y*y + z*z = r*r  =>  z*z = r*r - x*x - y*y
            const float dz = sqrtf(radiusSq - distSq);
            // radius is positive for the spheres generated by the host code,
            // so sqrtf(radius * radius) is simply radius.
            *n = dz / radius;
            return dz + z;
        }
        return -INF;
    }
};

// Scene description as a statically allocated __device__ (global-memory)
// array. It is filled with cudaMemcpyToSymbol and must never be passed to
// cudaMalloc or cudaFree.
__device__ Sphere dev_s[SPHERE_COUNT];

// Renders the spheres in dev_s into an RGBA image, one thread per pixel.
//   ptr : device buffer of IMAGE_DIM * IMAGE_DIM pixels, 4 bytes per pixel
// Launch layout: 2D grid of 2D blocks covering at least IMAGE_DIM x IMAGE_DIM
// threads; surplus threads return immediately. No shared memory is used.
__global__ void genRayTracingImage(unsigned char* ptr)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    const int idy = threadIdx.y + blockIdx.y * blockDim.y;
    // The grid is sized with a ceiling division, so guard the image edge.
    if (idx >= IMAGE_DIM || idy >= IMAGE_DIM)
        return;
    const int offset = idx + idy * IMAGE_DIM;

    // Keep the Z axis passing through the centre of the image:
    // ox and oy lie in [-IMAGE_DIM / 2, IMAGE_DIM / 2).
    const float ox = idx - IMAGE_DIM / 2;
    const float oy = idy - IMAGE_DIM / 2;

    float r = 0.8f, g = 0.6f, b = 0.8f;  // default background colour
    float maxz = -INF;
    for (int i = 0; i < SPHERE_COUNT; ++i)
    {
        float fScale = 0.0f;
        const float t = dev_s[i].hit(ox, oy, &fScale);
        if (t > maxz)
        {
            r = dev_s[i].r * fScale;
            g = dev_s[i].g * fScale;
            b = dev_s[i].b * fScale;
            // Remember the largest Z hit so far; without this update the last
            // intersecting sphere in the array would win regardless of depth.
            maxz = t;
        }
    }

    ptr[offset * 4 + 0] = (unsigned char)(int)(r * 255);
    ptr[offset * 4 + 1] = (unsigned char)(int)(g * 255);
    ptr[offset * 4 + 2] = (unsigned char)(int)(b * 255);
    ptr[offset * 4 + 3] = 255;
}

// Entry point of the global-memory variant: generates SPHERE_COUNT random
// spheres, renders them on the GPU, prints the elapsed time and displays the
// bitmap.
// NOTE(review): the unusual name looks like a way of keeping this function
// from clashing with another main() - rename it to main to build this
// variant on its own.
int mainretrytruyrt()
{
    srand((unsigned)time(NULL));

    // Events used for the performance measurement.
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CPUBitmap bitmap(IMAGE_DIM, IMAGE_DIM);
    unsigned char* dev_bitmap = nullptr;
    CUDA_CHECK(cudaMalloc((void**)&dev_bitmap, bitmap.image_size()));

    // Build the scene in temporary host memory: random centre, colour and
    // radius for every sphere.
    Sphere* host_s = (Sphere*)malloc(sizeof(Sphere) * SPHERE_COUNT);
    if (host_s == nullptr)
    {
        std::cerr << "malloc failed for " << SPHERE_COUNT << " spheres" << std::endl;
        exit(1);
    }
    for (size_t i = 0; i < SPHERE_COUNT; i++)
    {
        host_s[i].r = rnd(1.0f);
        host_s[i].g = rnd(1.0f);
        host_s[i].b = rnd(1.0f);
        host_s[i].x = rnd(1000.0f) - 500;
        host_s[i].y = rnd(1000.0f) - 500;
        host_s[i].z = rnd(1000.0f) - 500;
        host_s[i].radius = rnd(100.0f) + 20;
    }

    // Timed region: scene upload, rendering and image download.
    CUDA_CHECK(cudaEventRecord(start));
    CUDA_CHECK(cudaMemcpyToSymbol(dev_s, host_s, sizeof(Sphere) * SPHERE_COUNT));

    // Render the spheres into the bitmap.
    dim3 blocks(32, 32);
    dim3 grids((IMAGE_DIM + blocks.x - 1) / blocks.x, (IMAGE_DIM + blocks.y - 1) / blocks.y);
    genRayTracingImage<<<grids, blocks>>>(dev_bitmap);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    // Copy the bitmap back from the GPU for display.
    CUDA_CHECK(cudaMemcpy(bitmap.get_ptr(), dev_bitmap, bitmap.image_size(), cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    float elapsedTime = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("Time to generate : %3.1f ms \n", elapsedTime);

    // Release everything before showing the image.
    CUDA_CHECK(cudaFree(dev_bitmap));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    free(host_s);

    // NOTE(review): the name suggests display_and_exit() may not return -
    // confirm in cpu_bitmap.h. Cleanup is done above so it does not depend on
    // this call returning.
    bitmap.display_and_exit();
    return 0;
}

// Reports a failed CUDA runtime call and terminates the program.
//   file, line : source location of the call
//   statement  : text of the checked expression
//   err        : status returned by the call; cudaSuccess is a no-op
static void CheckCudaErrorAux(const char* file, unsigned line, const char* statement, cudaError_t err)
{
    if (err == cudaSuccess)
        return;
    std::cerr << statement << " returned " << cudaGetErrorString(err) << "(" << err << ") at " << file << ":" << line << std::endl;
    exit(1);
}