如何建立和设置公司网站,西安视频拍摄制作公司,wordpress网站演示,创建公司网站免费2022-01-24 周一
《OpenCL编程指南》第三章
自写代码来理解get_global_id和get_global_size
使用本书第三章中关于输入信号卷积的代码来进行理解#xff0c;见随书代码“src/Chapter_3/OpenCLConvolution”#xff0c;附代码如下#xff1a;
//
// Book: O…2022-01-24 周一
《OpenCL编程指南》第三章
自写代码来理解get_global_id和get_global_size
使用本书第三章中关于输入信号卷积的代码来进行理解见随书代码“src/Chapter_3/OpenCLConvolution”附代码如下
//
// Book: OpenCL(R) Programming Guide
// Authors: Aaftab Munshi, Benedict Gaster, Timothy Mattson, James Fung, Dan Ginsburg
// ISBN-10: 0-321-74964-2
// ISBN-13: 978-0-321-74964-2
// Publisher: Addison-Wesley Professional
// URLs: http://safari.informit.com/9780132488006/
// http://www.openclprogrammingguide.com
//// Convolution.cpp
//
// This is a simple example that demonstrates OpenCL platform, device, and context
// use.#include iostream
#include fstream
#include sstream
#include string
#include iomanip#ifdef __APPLE__
#include OpenCL/cl.h
#else
#include CL/cl.h
#endif#if !defined(CL_CALLBACK)
#define CL_CALLBACK
#endif// Constants
const unsigned int inputSignalWidth 8;
const unsigned int inputSignalHeight 8;cl_uint inputSignal[inputSignalWidth][inputSignalHeight]
{{3, 1, 1, 4, 8, 2, 1, 3},{4, 2, 1, 1, 2, 1, 2, 3},{4, 4, 4, 4, 3, 2, 2, 2},{9, 8, 3, 8, 9, 0, 0, 0},{9, 3, 3, 9, 0, 0, 0, 0},{0, 9, 0, 8, 0, 0, 0, 0},{3, 0, 8, 8, 9, 4, 4, 4},{5, 9, 8, 1, 8, 1, 1, 1}
};const unsigned int outputSignalWidth 6;
const unsigned int outputSignalHeight 6;cl_uint outputSignal[outputSignalWidth][outputSignalHeight];const unsigned int maskWidth 3;
const unsigned int maskHeight 3;cl_uint mask[maskWidth][maskHeight]
{{1, 1, 1}, {1, 0, 1}, {1, 1, 1},
};///
// Function to check and handle OpenCL errors
inline void
checkErr(cl_int err, const char * name)
{if (err ! CL_SUCCESS) {std::cerr ERROR: name ( err ) std::endl;exit(EXIT_FAILURE);}
}void CL_CALLBACK contextCallback(const char * errInfo,const void * private_info,size_t cb,void * user_data)
{std::cout Error occured during context use: errInfo std::endl;// should really perform any clearup and so on at this point// but for simplicitly just exit.exit(1);
}///
// main() for Convoloution example
//
int main(int argc, char** argv)
{cl_int errNum;cl_uint numPlatforms;cl_uint numDevices;cl_platform_id * platformIDs;cl_device_id * deviceIDs;cl_context context NULL;cl_command_queue queue;cl_program program;cl_kernel kernel;cl_mem inputSignalBuffer;cl_mem outputSignalBuffer;cl_mem maskBuffer;// First, select an OpenCL platform to run on.errNum clGetPlatformIDs(0, NULL, numPlatforms);checkErr((errNum ! CL_SUCCESS) ? errNum : (numPlatforms 0 ? -1 : CL_SUCCESS),clGetPlatformIDs);platformIDs (cl_platform_id *)alloca(sizeof(cl_platform_id) * numPlatforms);errNum clGetPlatformIDs(numPlatforms, platformIDs, NULL);checkErr((errNum ! CL_SUCCESS) ? errNum : (numPlatforms 0 ? -1 : CL_SUCCESS),clGetPlatformIDs);// Iterate through the list of platforms until we find one that supports// a CPU device, otherwise fail with an error.deviceIDs NULL;cl_uint i;for (i 0; i numPlatforms; i){errNum clGetDeviceIDs(platformIDs[i],CL_DEVICE_TYPE_CPU,0,NULL,numDevices);if (errNum ! CL_SUCCESS errNum ! CL_DEVICE_NOT_FOUND){checkErr(errNum, clGetDeviceIDs);}else if (numDevices 0){deviceIDs (cl_device_id *)alloca(sizeof(cl_device_id) * numDevices);errNum clGetDeviceIDs(platformIDs[i],CL_DEVICE_TYPE_CPU,numDevices,deviceIDs[0],NULL);checkErr(errNum, clGetDeviceIDs);break;}}// Check to see if we found at least one CPU device, otherwise returnif (deviceIDs NULL) {std::cout No CPU device found std::endl;exit(-1);}// Next, create an OpenCL context on the selected platform.cl_context_properties contextProperties[] {CL_CONTEXT_PLATFORM,(cl_context_properties)platformIDs[i],0};context clCreateContext(contextProperties,numDevices,deviceIDs,contextCallback,NULL,errNum);checkErr(errNum, clCreateContext);std::ifstream srcFile(Convolution.cl);checkErr(srcFile.is_open() ? CL_SUCCESS : -1, reading Convolution.cl);std::string srcProg(std::istreambuf_iteratorchar(srcFile),(std::istreambuf_iteratorchar()));const char * src srcProg.c_str();size_t length srcProg.length();// Create program from sourceprogram clCreateProgramWithSource(context,1,src,length,errNum);checkErr(errNum, clCreateProgramWithSource);// Build programerrNum clBuildProgram(program,numDevices,deviceIDs,NULL,NULL,NULL);if (errNum ! CL_SUCCESS){// Determine the reason for the errorchar buildLog[16384];clGetProgramBuildInfo(program,deviceIDs[0],CL_PROGRAM_BUILD_LOG,sizeof(buildLog),buildLog,NULL);std::cerr Error in kernel: std::endl;std::cerr buildLog;checkErr(errNum, clBuildProgram);}// Create kernel objectkernel clCreateKernel(program,convolve,errNum);checkErr(errNum, clCreateKernel);// Now allocate buffersinputSignalBuffer clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,sizeof(cl_uint) * inputSignalHeight * inputSignalWidth,static_castvoid *(inputSignal),errNum);checkErr(errNum, clCreateBuffer(inputSignal));maskBuffer clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,sizeof(cl_uint) * maskHeight * maskWidth,static_castvoid *(mask),errNum);checkErr(errNum, clCreateBuffer(mask));outputSignalBuffer clCreateBuffer(context,CL_MEM_WRITE_ONLY,sizeof(cl_uint) * outputSignalHeight * outputSignalWidth,NULL,errNum);checkErr(errNum, clCreateBuffer(outputSignal));// Pick the first device and create command queue.queue clCreateCommandQueue(context,deviceIDs[0],0,errNum);checkErr(errNum, clCreateCommandQueue);errNum clSetKernelArg(kernel, 0, sizeof(cl_mem), inputSignalBuffer);errNum | clSetKernelArg(kernel, 1, sizeof(cl_mem), maskBuffer);errNum | clSetKernelArg(kernel, 2, sizeof(cl_mem), outputSignalBuffer);errNum | clSetKernelArg(kernel, 3, sizeof(cl_uint), inputSignalWidth);errNum | clSetKernelArg(kernel, 4, sizeof(cl_uint), maskWidth);checkErr(errNum, clSetKernelArg);const size_t globalWorkSize[1] { outputSignalWidth * outputSignalHeight };const size_t localWorkSize[1] { 1 };// Queue the kernel up for execution across the arrayerrNum clEnqueueNDRangeKernel(queue,kernel,1,NULL,globalWorkSize,localWorkSize,0,NULL,NULL);checkErr(errNum, clEnqueueNDRangeKernel);errNum clEnqueueReadBuffer(queue,outputSignalBuffer,CL_TRUE,0,sizeof(cl_uint) * outputSignalHeight * outputSignalHeight,outputSignal,0,NULL,NULL);checkErr(errNum, clEnqueueReadBuffer);// Output the result bufferfor (int y 0; y outputSignalHeight; y){for (int x 0; x outputSignalWidth; x){std::cout std::setw(2) outputSignal[x][y] ;}std::cout std::endl;}std::cout std::endl Executed program succesfully. std::endl;return 0;
}//
// Book: OpenCL(R) Programming Guide
// Authors: Aaftab Munshi, Benedict Gaster, Timothy Mattson, James Fung, Dan Ginsburg
// ISBN-10: 0-321-74964-2
// ISBN-13: 978-0-321-74964-2
// Publisher: Addison-Wesley Professional
// URLs: http://safari.informit.com/9780132488006/
// http://www.openclprogrammingguide.com
//// Convolution.cl
//
// This is a simple kernel performing convolution.__kernel void convolve(const __global uint * const input,__constant uint * const mask,__global uint * const output,const int inputWidth,const int maskWidth)
{const int x get_global_id(0);const int y get_global_id(1);uint sum 0;for (int r 0; r maskWidth; r){const int idxIntmp (y r) * inputWidth x;for (int c 0; c maskWidth; c){sum mask[(r * maskWidth) c] * input[idxIntmp c];}}output[y * get_global_size(0) x] sum;
}注代码中clGetDeviceIDs()函数使用CL_DEVICE_TYPE_CPU虽然编译和执行都没有问题但是在vs2017上单步调试时clCreateContext()这个函数会出错返回错误-2这是CL_DEVICE_NOT_AVAILABLE错误 CL_DEVICE_NOT_AVAILABLE if a device in devices is currently not available even though the device was returned by clGetDeviceIDs. 具体原因不深究了将CL_DEVICE_TYPE_CPU都改为CL_DEVICE_TYPE_GPU就一切正常了。代码输出
22 27 19 35 41 12
21 35 10 26 48 24
27 35 29 16 31 26
25 31 33 6 34 48
22 31 39 22 9 37
16 27 43 31 0 38Executed program succesfully.我想用c代码来模拟kernel函数实现同样的效果因此我有如下的c代码
#include stdio.hint input[8][8] {{3, 1, 1, 4, 8, 2, 1, 3},{4, 2, 1, 1, 2, 1, 2, 3},{4, 4, 4, 4, 3, 2, 2, 2},{9, 8, 3, 8, 9, 0, 0, 0},{9, 3, 3, 9, 0, 0, 0, 0},{0, 9, 0, 8, 0, 0, 0, 0},{3, 0, 8, 8, 9, 4, 4, 4},{5, 9, 8, 1, 8, 1, 1, 1}
};int mask[3][3] {{1, 1, 1},{1, 0, 1},{1, 1, 1},
};int output[6][6] { 0 };void convolve()
{for (int y 0; y 6; y) {for (int x 0; x 6; x) {int sum 0;for (int r 0; r 3; r) {const int idxIntmp (y r) * 8 x;for (int c 0; c 3; c)sum *((int*)mask (r * 3) c) * *((int*)input idxIntmp c);}*((int*)output y * 6 x) sum;}}
}int main()
{convolve();for (int y 0; y 6; y) {for (int x 0; x 6; x)printf(%2d , output[x][y]);printf(\n);}
}对比一下发现结果不一致
// 书中的
22 27 19 35 41 12
21 35 10 26 48 24
27 35 29 16 31 26
25 31 33 6 34 48
22 31 39 22 9 37
16 27 43 31 0 38// 第一次
22 35 39 41 26 42
21 31 43 48 48 43
27 31 35 31 37 42
25 27 26 34 38 30
22 19 16 9 17 23
16 10 6 0 12 11百思不得其解确信我的理解是对的get_global_id(0)和get_global_id(1)的值都为6如果将上述c代码修改为
void convolve()
{for (int y 0; y /*6*/8; y) {for (int x 0; x /*6*/8; x) {int sum 0;for (int r 0; r 3; r) {const int idxIntmp (y r) * 8 x;for (int c 0; c 3; c)sum *((int*)mask (r * 3) c) * *((int*)input idxIntmp c);}*((int*)output y * /*6*/8 x) sum;}}
}输出结果就和书中的一致了
// 书中的
22 27 19 35 41 12
21 35 10 26 48 24
27 35 29 16 31 26
25 31 33 6 34 48
22 31 39 22 9 37
16 27 43 31 0 38// 第二次
22 27 19 35 41 12
21 35 10 26 48 24
27 35 29 16 31 26
25 31 33 6 34 48
22 31 39 22 9 37
16 27 43 31 0 38虽然结果一致但是此时output[6][6]变量的大小变成了output[8][8]说明已经溢出了难道get_global_id(0)和get_global_id(1)都是8如果真是这样的话那我之前的理解就全错了这也太恐怖了吧
经过尝试我发现
funcvalueget_global_id(0)0-35get_global_id(1)0get_global_size(0)36
原来书中代码是以一维数组传入的
const size_t globalWorkSize[1] { outputSignalWidth * outputSignalHeight };
const size_t localWorkSize[1] { 1 };// Queue the kernel up for execution across the array
errNum clEnqueueNDRangeKernel(queue,kernel,1,NULL,globalWorkSize,localWorkSize,0,NULL,NULL);
checkErr(errNum, clEnqueueNDRangeKernel);所以我将我的c代码也进行了相应的修改
void convolve()
{for (int y 0; y 1; y) {for (int x 0; x 36; x) {int sum 0;for (int r 0; r 3; r) {const int idxIntmp (y r) * 8 x;for (int c 0; c 3; c)sum *((int*)mask (r * 3) c) * *((int*)input idxIntmp c);}*((int*)output y * 0 x) sum;}}
}这样结果就一致了
// 书中的
22 27 19 35 41 12
21 35 10 26 48 24
27 35 29 16 31 26
25 31 33 6 34 48
22 31 39 22 9 37
16 27 43 31 0 38// 第三次
22 27 19 35 41 12
21 35 10 26 48 24
27 35 29 16 31 26
25 31 33 6 34 48
22 31 39 22 9 37
16 27 43 31 0 38或者我修改一下原书代码以二维数组传入
const size_t globalWorkSize[2] { outputSignalWidth, outputSignalHeight };
const size_t localWorkSize[2] { 1, 1 };// Queue the kernel up for execution across the array
errNum clEnqueueNDRangeKernel(queue,kernel,2,NULL,globalWorkSize,localWorkSize,0,NULL,NULL);
checkErr(errNum, clEnqueueNDRangeKernel);我的测试代码仍保持最初的模样即
void convolve()
{for (int y 0; y 6; y) {for (int x 0; x 6; x) {int sum 0;for (int r 0; r 3; r) {const int idxIntmp (y r) * 8 x;for (int c 0; c 3; c)sum *((int*)mask (r * 3) c) * *((int*)input idxIntmp c);}*((int*)output y * 6 x) sum;}}
}这样的修改后的结果也一致说明我的理解是正确的
// 书中的
22 35 39 41 26 42
21 31 43 48 48 43
27 31 35 31 37 42
25 27 26 34 38 30
22 19 16 9 17 23
16 10 6 0 12 11// 第一次
22 35 39 41 26 42
21 31 43 48 48 43
27 31 35 31 37 42
25 27 26 34 38 30
22 19 16 9 17 23
16 10 6 0 12 11现在就算真正理解了get_global_id和get_global_size最后一个疑问就是为什么以一组数组传入和以二维数组传入得到的结果不一样