7.1.1 浮点指令
#include <cuda_runtime.h>
#include <stdio.h>
#include "../common/common.h"
#include <iostream>

/*
 * Single-precision equality demo.
 *
 * Initializes two float variables from decimal literals that differ only in
 * their last digit, compares them with ==, and prints which branch was taken.
 * The accompanying notes report that this prints "a equal b".
 *
 * Returns 0.
 */
int main(void)
{
    float a = 3.1415927f;
    float b = 3.1415928f;

    if (a == b)
    {
        printf("a equal b");
    }
    else
    {
        printf("not equal");
    }

    return 0;
}
因为a和b的最后一个数字不同,预期输出不相等,但实际输出: a equal b
这是因为在IEEE754体系下,单精度float只有24位有效二进制位(约7位十进制有效数字),位数不够,两个字面量被舍入到了同一个浮点值。
换成double类型就可以输出不相等:
// Double-precision comparison of two literals that differ only in the last
// decimal digit; prints which branch was taken.
double A = 3.1415927;
double B = 3.1415928;

if (A != B)
{
    printf("not equal\n");
}
else
{
    printf("A equal B\n");
}
7.1.2 内部函数和标准函数
标准函数包含来自于C标准数学库的数学运算,如sqrt、exp和sin。CUDA内部函数只能在设备代码中调用。在CUDA中,许多内部函数与标准函数是有关联的,这意味着存在与内部函数功能相同的标准函数。举个例子,标准函数中的双精度浮点平方根函数是sqrt,与之功能相同的内部函数是__dsqrt_rn。内部函数被编译成比等价的标准函数更少的指令,因此内部函数比等价的标准函数更快,但数值精确度更低。
#include <cuda_runtime.h>
#include <stdio.h>
#include "../common/common.h"
#include <iostream>

/*
 * Raises the float at `ptr` to the power 2.0f using the __powf intrinsic and
 * writes the result back to the same location.
 *
 * No thread index is used: every thread that executes this kernel reads and
 * writes *ptr.
 *
 * ptr: pointer to the single float that is read and overwritten.
 */
__global__ void intrinsic(float *ptr)
{
    *ptr = __powf(*ptr, 2.0f);
}

/*
 * Same computation as `intrinsic`, but through the standard powf function
 * instead of the __powf intrinsic.
 *
 * No thread index is used: every thread that executes this kernel reads and
 * writes *ptr.
 *
 * ptr: pointer to the single float that is read and overwritten.
 */
__global__ void standard(float *ptr)
{
    *ptr = powf(*ptr, 2.0f);
}
两个核函数分别用了内部函数和标准函数,编译成PTX查看(nvcc 7_2_foo.cu --ptx -o foo.ptx):
下面是调用内部函数的核函数的PTX;相比之下,调用标准函数的核函数编译出的PTX约有140行。
// PTX entry point with the mangled name _Z9intrinsicPf; takes one 64-bit
// parameter (the pointer argument).
// Instruction sequence in the body:
//   ld.param.u64 / cvta.to.global.u64 : load the pointer parameter and convert
//                                       it to a global address (%rd2)
//   ld.global.f32                     : load the input float into %f1
//   lg2.approx.f32                    : %f2 = approximate log2(%f1)
//   add.f32                           : %f3 = %f2 + %f2, i.e. 2 * log2(x)
//   ex2.approx.f32                    : %f4 = approximate 2 ^ %f3
//   st.global.f32                     : store %f4 back through the same address
.visible .entry _Z9intrinsicPf(.param .u64 _Z9intrinsicPf_param_0
)
{.reg .f32 %f<5>;.reg .b64 %rd<3>;ld.param.u64 %rd1, [_Z9intrinsicPf_param_0];cvta.to.global.u64 %rd2, %rd1;ld.global.f32 %f1, [%rd2];lg2.approx.f32 %f2, %f1;add.f32 %f3, %f2, %f2;ex2.approx.f32 %f4, %f3;st.global.f32 [%rd2], %f4;ret;}
7.2.1 单精度与双精度的比较
#include <cuda_runtime.h>
#include <stdio.h>
#include "../common/common.h"
#include <iostream>

/*
 * Writes the literal 12.1 to *F (as float) and to *D (as double).
 *
 * Only the thread whose flat 1D global index is 0 performs the stores; all
 * other threads do nothing.
 *
 * F: destination for the single-precision value.
 * D: destination for the double-precision value.
 */
__global__ void kernel(float *F, double *D)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid == 0)
    {
        *F = 12.1;
        *D = 12.1;
    }
}

/*
 * Compares how 12.1 is represented in single and double precision on the
 * host and on the device.
 *
 * Allocates one float and one double on the device, lets `kernel` store 12.1
 * into both, copies the values back, prints the host and device values with
 * 20 decimal places, and reports whether host and device values compare equal.
 *
 * argc, argv: unused.
 * Returns 0.
 */
int main(int argc, char **argv)
{
    float *deviceF;
    float h_deviceF;
    double *deviceD;
    double h_deviceD;

    float hostF = 12.1;
    double hostD = 12.1;

    CHECK(cudaMalloc((void **)&deviceF, sizeof(float)));
    CHECK(cudaMalloc((void **)&deviceD, sizeof(double)));

    kernel<<<1, 32>>>(deviceF, deviceD);
    // A kernel launch returns no status itself; query it explicitly so a
    // failed launch is reported instead of printing uninitialized values.
    CHECK(cudaGetLastError());

    // Blocking copies: they also wait for the kernel to finish.
    CHECK(cudaMemcpy(&h_deviceF, deviceF, sizeof(float),
                     cudaMemcpyDeviceToHost));
    CHECK(cudaMemcpy(&h_deviceD, deviceD, sizeof(double),
                     cudaMemcpyDeviceToHost));

    printf("Host single-precision representation of 12.1 = %.20f\n", hostF);
    printf("Host double-precision representation of 12.1 = %.20f\n", hostD);
    printf("Device single-precision representation of 12.1 = %.20f\n", h_deviceF);
    printf("Device double-precision representation of 12.1 = %.20f\n", h_deviceD);
    printf("Device and host single-precision representation equal? %s\n",
           hostF == h_deviceF ? "yes" : "no");
    printf("Device and host double-precision representation equal? %s\n",
           hostD == h_deviceD ? "yes" : "no");

    // Release the device allocations made above.
    CHECK(cudaFree(deviceF));
    CHECK(cudaFree(deviceD));

    return 0;
}
输出:
Host single-precision representation of 12.1 = 12.10000038146972656250
Host double-precision representation of 12.1 = 12.09999999999999964473
Device single-precision representation of 12.1 = 12.10000038146972656250
Device double-precision representation of 12.1 = 12.09999999999999964473
Device and host single-precision representation equal? yes
Device and host double-precision representation equal? yes