当前位置: 代码迷 >> 综合 >> CUDA By Examples 7 - 测量GPU运行耗时
  详细解决方案

CUDA By Examples 7 - 测量GPU运行耗时

热度:54   发布时间:2023-12-17 18:24:01.0

测量方法:
1. 使用cudaEventCreate创建event;
2. 使用cudaEventRecord记录;
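3. 使用cudaEventSynchronize同步, 等待GPU执行到stop事件为止, 这样才能安全地读取时间戳(time stamp);
4. 使用cudaEventElapsedTime计算start和stop两个event之间的耗时(单位: 毫秒), 用完后调用cudaEventDestroy销毁event.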

注意:
1. 不能用于测量device和host的混合代码的用时;
2. 只能用于测量GPU内部kernel执行指令和存储拷贝用时.

#include "cuda.h"
#include "../common/book.h"
#include "../common/cpu_bitmap.h"

// Large sentinel depth; its negation (-INF) is used to mean "no intersection".
#define INF 2e10f
// Pseudo-random value in [0, x], built on rand(). The argument is parenthesized
// so that a compound expression such as rnd( a - b ) is scaled as a whole.
#define rnd( x ) ((x) * rand() / RAND_MAX)
#define SPHERES 20 // number of spheres in the scene
#define DIM 1024 // width and height of the bitmap, in pixels

/**
 * A sphere of the scene: colour components, radius and centre position.
 */
struct Sphere
{
    float r, b, g;   // colour components
    float radius;
    float x, y, z;   // centre of the sphere

    /**
     * Intersects this sphere with a ray cast from pixel (ox, oy)
     * perpendicular to the bitmap plane.
     *
     * @param ox  x coordinate of the pixel the ray starts from
     * @param oy  y coordinate of the pixel the ray starts from
     * @param n   out: on a hit, receives dz / sqrtf(radius^2), a shading
     *            factor that is largest at the sphere's centre and falls
     *            towards its silhouette; left untouched on a miss
     * @return    the z coordinate (dz + z) of the intersection on a hit,
     *            or -INF when the ray misses the sphere
     */
    __device__ float hit(float ox, float oy, float *n)
    {
        float dx = ox - x;
        float dy = oy - y;
        float r2 = radius * radius;   // hoisted: used three times below
        float d2 = dx * dx + dy * dy; // squared distance from the sphere's axis

        if (d2 < r2)
        {
            float dz = sqrtf(r2 - d2);
            *n = dz / sqrtf(r2);
            return dz + z;
        }
        return -INF;
    }
};
// Note: the constant-memory object declared below is an array, not a pointer.
// Scene description: SPHERES Sphere objects, declared as a fixed-size array
// (not a pointer) with the __constant__ qualifier.
// A constant-memory declaration cannot be placed inside a function body.
__constant__ Sphere s[SPHERES];
/**
 * Renders the sphere scene held in the constant array `s`, one thread per pixel.
 *
 * Launch layout: a 2D grid of 2D blocks covering at least DIM x DIM threads;
 * threads that fall outside the DIM x DIM image do nothing.
 *
 * @param ptr  device output buffer; written as 4 bytes per pixel, rows of DIM
 *             pixels (assumes it holds at least DIM * DIM * 4 bytes)
 */
__global__ void kernel( unsigned char *ptr)
{
    // Map the 2D thread/block indices to a pixel position.
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    // Guard the grid tail so a launch larger than the image cannot write out of bounds.
    if (x >= DIM || y >= DIM)
        return;

    int offset = x + y * DIM;

    // Shift the origin to the centre of the image.
    float ox = (x - DIM/2);
    float oy = (y - DIM/2);

    float r=0, g=0, b=0;
    float maxz = -INF;
    for (int i=0; i<SPHERES; i++)
    {
        float n;
        float t = s[i].hit(ox, oy, &n);
        if (t > maxz)
        {
            float fscale = n;
            r = s[i].r * fscale;
            g = s[i].g * fscale;
            b = s[i].b * fscale;
            // Remember the closest hit so far; without this the last
            // intersecting sphere would win instead of the nearest one.
            maxz = t;
        }
    }

    // Write the four channels of this pixel.
    ptr[offset*4 + 0] = (int)(r*255);
    ptr[offset*4 + 1] = (int)(g*255);
    ptr[offset*4 + 2] = (int)(b*255);
    ptr[offset*4 + 3] = 255;
}

/**
 * Builds a random scene of SPHERES spheres, renders it on the GPU, prints the
 * time elapsed between the start and stop CUDA events, and shows the bitmap.
 */
int main(void)
{
    // Create the events and record the start time stamp.
    // Note: everything issued between the two records, including the host-side
    // setup below, falls inside the measured interval.
    cudaEvent_t start, stop;
    HANDLE_ERROR( cudaEventCreate( &start ) );
    HANDLE_ERROR( cudaEventCreate( &stop ) );
    HANDLE_ERROR( cudaEventRecord( start, 0 ) );

    CPUBitmap bitmap(DIM, DIM);
    unsigned char *dev_bitmap;
    HANDLE_ERROR( cudaMalloc( (void**)&dev_bitmap,
                              bitmap.image_size() ) );

    // The constant array `s` is declared statically, so no cudaMalloc is
    // needed for it.

    // Initialise the sphere parameters on the host. The staging array is
    // small, so it lives on the stack: no allocation failure to handle and
    // nothing to free.
    Sphere temp_s[SPHERES];
    for (int i=0; i<SPHERES; i++)
    {
        temp_s[i].r = rnd( 1.0f );
        temp_s[i].g = rnd( 1.0f );
        temp_s[i].b = rnd( 1.0f );
        temp_s[i].x = rnd( 1000.0f ) - 500;
        temp_s[i].y = rnd( 1000.0f ) - 500;
        temp_s[i].z = rnd( 1000.0f ) - 500;
        temp_s[i].radius = rnd( 100.0f ) + 20;
    }

    // Copy the spheres from the host into the constant array on the GPU.
    HANDLE_ERROR( cudaMemcpyToSymbol( s, temp_s,
                                      sizeof(Sphere) * SPHERES ) );

    // Render on the GPU: 16x16 threads per block, grid rounded up to cover DIM.
    dim3 grids((DIM + 15)/16, (DIM + 15)/16);
    dim3 threads(16, 16);
    kernel<<<grids,threads>>>(dev_bitmap);
    // A kernel launch returns no status; fetch launch errors explicitly.
    HANDLE_ERROR( cudaGetLastError() );

    // Copy the rendered image from the GPU back to the host.
    HANDLE_ERROR( cudaMemcpy( bitmap.get_ptr(),
                              dev_bitmap,
                              bitmap.image_size(),
                              cudaMemcpyDeviceToHost ) );

    // Record the stop time stamp, wait for it, and report the elapsed time.
    HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
    HANDLE_ERROR( cudaEventSynchronize( stop ) );
    float elapsedTime;
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime, start, stop ) );
    printf( "Time to generate: %3.1f ms\n", elapsedTime );
    HANDLE_ERROR( cudaEventDestroy( start ) );
    HANDLE_ERROR( cudaEventDestroy( stop ) );

    // Release the device buffer before displaying: the image is already on
    // the host. NOTE(review): display_and_exit() presumably never returns, so
    // a free placed after it would not run - confirm against cpu_bitmap.h.
    HANDLE_ERROR( cudaFree( dev_bitmap ) );

    // Display the image.
    bitmap.display_and_exit();
}

这里写图片描述