diff --git a/build.xml b/build.xml index ee646ad16617dfbc5c803d5828a09fc3aaca0193..3b19d8139945510362fdfcc7df3dd4e2cc8f5abb 100644 --- a/build.xml +++ b/build.xml @@ -35,6 +35,7 @@ <fileset dir="test" includes="*/build.xml"/> </subant> <delete dir="examples\nbody\jogamp"/> <!-- we handle the jogamp delete here, save downloading each build --> + <delete file="test\codegen\junit-4.10.jar"/> <!-- we handle the junit delete here, save downloading each build --> <ant dir="com.amd.aparapi.jni" target="clean"/> <ant dir="com.amd.aparapi" target="clean"/> </target> diff --git a/com.amd.aparapi.jni/src/cpp/aparapi.cpp b/com.amd.aparapi.jni/src/cpp/aparapi.cpp index 29fa4c27e127e339e6f470cbeda81d866417a140..4e9c4b31188224e74b28f1e3cf68f0abe85aaf93 100644 --- a/com.amd.aparapi.jni/src/cpp/aparapi.cpp +++ b/com.amd.aparapi.jni/src/cpp/aparapi.cpp @@ -1,40 +1,40 @@ /* -Copyright (c) 2010-2011, Advanced Micro Devices, Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, are permitted provided that the -following conditions are met: - -Redistributions of source code must retain the above copyright notice, this list of conditions and the following -disclaimer. - -Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following -disclaimer in the documentation and/or other materials provided with the distribution. - -Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, -INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export -laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through -774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, -you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of -Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration -Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1, -E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups -D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject -to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774 -of EAR). For the most current Country Group listings, or for additional information about the EAR or your obligations -under those regulations, please refer to the U.S. Bureau of Industry and Security’s website at http://www.bis.doc.gov/. - -*/ + Copyright (c) 2010-2011, Advanced Micro Devices, Inc. + All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, are permitted provided that the + following conditions are met: + + Redistributions of source code must retain the above copyright notice, this list of conditions and the following + disclaimer. + + Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided with the distribution. + + Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export + laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 + through 774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000. Further, pursuant to Section 740.6 of + the EAR, you hereby certify that, except pursuant to a license granted by the United States Department of Commerce + Bureau of Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export + Administration Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in + Country Groups D:1, E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) + export to Country Groups D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced + direct product is subject to national security controls as identified on the Commerce Control List (currently + found in Supplement 1 to Part 774 of EAR). For the most current Country Group listings, or for additional + information about the EAR or your obligations under those regulations, please refer to the U.S. Bureau of Industry + and Security’s website at http://www.bis.doc.gov/. + */ #include <stdio.h> #include <stdlib.h> @@ -110,6 +110,10 @@ MicrosecondTimer timer; #include "com_amd_aparapi_KernelRunner.h" +#define CHECK_NO_RETURN(condition, msg) if(condition){\ + fprintf(stderr, "!!!!!!! %s failed !!!!!!!\n", msg);\ +} + #define CHECK(condition, msg) if(condition){\ fprintf(stderr, "!!!!!!! %s failed !!!!!!!\n", msg);\ return 0;\ @@ -127,26 +131,9 @@ MicrosecondTimer timer; #define PRINT_CL_ERR(status, msg) fprintf(stderr, "!!!!!!! %s failed %s\n", msg, CLErrString(status)); -#define ASSERT_FIELD(id) CHECK(id##FieldID == 0, "No such field as " #id) - -#define GET_DEV_INFO(deviceId, param, val, format){\ - status = clGetDeviceInfo(deviceId, param, sizeof(val), &(val), NULL);\ - ASSERT_CL_NO_RETURN( "clGetDeviceInfo().");\ - /*fprintf(stderr, #param " " format " \n", val);*/ \ -} - +#define ASSERT_FIELD(id) CHECK_NO_RETURN(id##FieldID == 0, "No such field as " #id) -jfieldID typeFieldID; -jfieldID isStaticFieldID; -jfieldID nameFieldID; -jfieldID javaArrayFieldID; -jfieldID bytesPerLocalSizeFieldID; -jfieldID sizeInBytesFieldID; -jfieldID numElementsFieldID; -// we rely on these being 0 initially to detect whether we have cached the above fieldId's -jclass clazz = (jclass)0; -jclass argClazz = (jclass)0; static const char *CLErrString(cl_int status) { static struct { cl_int code; const char *msg; } error_table[] = { @@ -214,7 +201,90 @@ static const char *CLErrString(cl_int status) { #endif return unknown; } +class Range{ + public: + static jclass rangeClazz; + static jfieldID globalSize_0_FieldID; + static jfieldID globalSize_1_FieldID; + static jfieldID globalSize_2_FieldID; + static jfieldID localSize_0_FieldID; + static jfieldID localSize_1_FieldID; + static jfieldID localSize_2_FieldID; + static jfieldID dimsFieldID; + static jfieldID localIsDerivedFieldID; + jobject range; + cl_int dims; + size_t *offsets; + size_t *globalDims; + size_t *localDims; + jboolean localIsDerived; + Range(JNIEnv *jenv, jobject range): + range(range), + dims(0), + offsets(NULL), + globalDims(NULL), + localDims(NULL){ + if (rangeClazz ==NULL){ + jclass rangeClazz = jenv->GetObjectClass(range); + globalSize_0_FieldID = jenv->GetFieldID(rangeClazz, "globalSize_0", "I"); ASSERT_FIELD(globalSize_0_); + globalSize_1_FieldID = jenv->GetFieldID(rangeClazz, "globalSize_1", "I"); ASSERT_FIELD(globalSize_1_); + globalSize_2_FieldID = jenv->GetFieldID(rangeClazz, "globalSize_2", "I"); ASSERT_FIELD(globalSize_2_); + localSize_0_FieldID = jenv->GetFieldID(rangeClazz, "localSize_0", "I"); ASSERT_FIELD(localSize_0_); + localSize_1_FieldID = jenv->GetFieldID(rangeClazz, "localSize_1", "I"); ASSERT_FIELD(localSize_1_); + localSize_2_FieldID = jenv->GetFieldID(rangeClazz, "localSize_2", "I"); ASSERT_FIELD(localSize_2_); + dimsFieldID = jenv->GetFieldID(rangeClazz, "dims", "I"); ASSERT_FIELD(dims); + localIsDerivedFieldID = jenv->GetFieldID(rangeClazz, "localIsDerived", "Z"); ASSERT_FIELD(localIsDerived); + } + dims = jenv->GetIntField(range, dimsFieldID); + localIsDerived = jenv->GetBooleanField(range, localIsDerivedFieldID); + if (dims >0){ + //fprintf(stderr, "native range dims == %d\n", dims); + offsets = new size_t[dims]; + globalDims = new size_t[dims]; + localDims = new size_t[dims]; + offsets[0]= 0; + localDims[0]= jenv->GetIntField(range, localSize_0_FieldID); + //fprintf(stderr, "native range localSize_0 == %d\n", localDims[0]); + globalDims[0]= jenv->GetIntField(range, globalSize_0_FieldID); + //fprintf(stderr, "native range globalSize_0 == %d\n", globalDims[0]); + if (dims >1){ + offsets[1]= 0; + localDims[1]= jenv->GetIntField(range, localSize_1_FieldID); + //fprintf(stderr, "native range localSize_1 == %d\n", localDims[1]); + globalDims[1]= jenv->GetIntField(range, globalSize_1_FieldID); + //fprintf(stderr, "native range globalSize_1 == %d\n", globalDims[1]); + if (dims >2){ + offsets[2]= 0; + localDims[2]= jenv->GetIntField(range, localSize_2_FieldID); + //fprintf(stderr, "native range localSize_2 == %d\n", localDims[2]); + globalDims[2]= jenv->GetIntField(range, globalSize_2_FieldID); + //fprintf(stderr, "native range globalSize_2 == %d\n", globalDims[2]); + } + } + } + } + ~Range(){ + if (offsets!= NULL){ + delete offsets; + } + if (globalDims!= NULL){ + delete globalDims; + } + if (localDims!= NULL){ + delete localDims; + } + } +}; +jclass Range::rangeClazz = (jclass)0; +jfieldID Range::globalSize_0_FieldID=0; +jfieldID Range::globalSize_1_FieldID=0; +jfieldID Range::globalSize_2_FieldID=0; +jfieldID Range::localSize_0_FieldID=0; +jfieldID Range::localSize_1_FieldID=0; +jfieldID Range::localSize_2_FieldID=0; +jfieldID Range::dimsFieldID=0; +jfieldID Range::localIsDerivedFieldID=0; class ProfileInfo{ public: @@ -243,13 +313,22 @@ class KernelArgRef{ class JNIContext ; // forward reference class KernelArg{ + private: + static jclass argClazz; + static jfieldID nameFieldID; + static jfieldID typeFieldID; + static jfieldID isStaticFieldID; + static jfieldID sizeInBytesFieldID; + static jfieldID numElementsFieldID; public: + static jfieldID javaArrayFieldID; + jobject argObj; char *name; // used for debugging printfs jfieldID fieldID; // The field that this arg represents in the kernel (java), used only for primitive updates jint type; // a bit mask determining the type of this arg jboolean isStatic; // A flag indicating if the value is static jint sizeInBytes; // bytes in the array or directBuf - jobject javaArg; // global reference to the corresponding java KernelArg + jobject javaArg; // global reference to the corresponding java KernelArg object union{ cl_char c; cl_double d; @@ -259,6 +338,29 @@ class KernelArg{ KernelArgRef ref; } value; + KernelArg(JNIEnv *jenv, jobject argObj): + argObj(argObj){ + javaArg = jenv->NewGlobalRef(argObj); // save a global ref to the java Arg Object + if (argClazz == 0){ + jclass c = jenv->GetObjectClass(argObj); + nameFieldID = jenv->GetFieldID(c, "name", "Ljava/lang/String;"); ASSERT_FIELD(name); + typeFieldID = jenv->GetFieldID(c, "type", "I"); ASSERT_FIELD(type); + isStaticFieldID = jenv->GetFieldID(c, "isStatic", "Z"); ASSERT_FIELD(isStatic); + javaArrayFieldID = jenv->GetFieldID(c, "javaArray", "Ljava/lang/Object;"); ASSERT_FIELD(javaArray); + sizeInBytesFieldID = jenv->GetFieldID(c, "sizeInBytes", "I"); ASSERT_FIELD(sizeInBytes); + numElementsFieldID = jenv->GetFieldID(c, "numElements", "I"); ASSERT_FIELD(numElements); + } + type = jenv->GetIntField(argObj, typeFieldID); + isStatic = jenv->GetBooleanField(argObj, isStaticFieldID); + jstring nameString = (jstring)jenv->GetObjectField(argObj, nameFieldID); + const char *nameChars = jenv->GetStringUTFChars(nameString, NULL); + name=strdup(nameChars); + jenv->ReleaseStringUTFChars(nameString, nameChars); + } + + ~KernelArg(){ + } + void unpinAbort(JNIEnv *jenv){ jenv->ReleasePrimitiveArrayCritical((jarray)value.ref.javaArray, value.ref.addr,JNI_ABORT); } @@ -342,9 +444,6 @@ class KernelArg{ int isAparapiBufHasArray(){ return (type&com_amd_aparapi_KernelRunner_ARG_APARAPI_BUF_HAS_ARRAY); } - int isAparapiBufIsDirect(){ - return (type&com_amd_aparapi_KernelRunner_ARG_APARAPI_BUF_IS_DIRECT); - } int isBackedByArray(){ return ( (isArray() && isGlobal()) || ((isGlobal() || isConstant()) && isAparapiBufHasArray())); } @@ -354,8 +453,27 @@ class KernelArg{ int mustWriteBuffer(){ return ((isImplicit()&&isRead()&&!isConstant())||(isExplicit()&&isExplicitWrite())); } - + void syncType(JNIEnv* jenv){ + type = jenv->GetIntField(javaArg, typeFieldID); + } + void syncSizeInBytes(JNIEnv* jenv){ + sizeInBytes = jenv->GetIntField(javaArg, sizeInBytesFieldID); + } + void syncJavaArrayLength(JNIEnv* jenv){ + value.ref.javaArrayLength = jenv->GetIntField(javaArg, numElementsFieldID); + } + void clearExplicitBufferBit(JNIEnv* jenv){ + type &= ~com_amd_aparapi_KernelRunner_ARG_EXPLICIT_WRITE; + jenv->SetIntField(javaArg, typeFieldID,type ); + } }; +jclass KernelArg::argClazz=(jclass)0; +jfieldID KernelArg::nameFieldID=0; +jfieldID KernelArg::typeFieldID=0; +jfieldID KernelArg::isStaticFieldID=0; +jfieldID KernelArg::javaArrayFieldID=0; +jfieldID KernelArg::sizeInBytesFieldID=0; +jfieldID KernelArg::numElementsFieldID=0; class JNIContext{ private: @@ -365,10 +483,7 @@ class JNIContext{ cl_platform_id* platforms; cl_uint platformc; public: - JNIEnv *jenv; jobject kernelObject; - jint numProcessors; - jint maxJTPLocalSize; jclass kernelClass; cl_uint deviceIdc; cl_device_id* deviceIds; @@ -391,6 +506,7 @@ class JNIContext{ // these map to camelCase form of CL_DEVICE_XXX_XXX For example CL_DEVICE_MAX_COMPUTE_UNITS == maxComputeUnits cl_uint maxComputeUnits; cl_uint maxWorkItemDimensions; + size_t *maxWorkItemSizes; size_t maxWorkGroupSize; cl_ulong globalMemSize; cl_ulong localMemSize; @@ -399,13 +515,10 @@ class JNIContext{ return((JNIContext*)jniContextHandle); } - JNIContext(JNIEnv *_jenv, jobject _kernelObject, jint _flags, jint _numProcessors, jint _maxJTPLocalSize): - jenv(_jenv), + JNIContext(JNIEnv *jenv, jobject _kernelObject, jint _flags): kernelObject(jenv->NewGlobalRef(_kernelObject)), kernelClass((jclass)jenv->NewGlobalRef(jenv->GetObjectClass(_kernelObject))), flags(_flags), - numProcessors(_numProcessors), - maxJTPLocalSize(_maxJTPLocalSize), platform(NULL), profileBaseTime(0), deviceType(((flags&com_amd_aparapi_KernelRunner_JNI_FLAG_USE_GPU)==com_amd_aparapi_KernelRunner_JNI_FLAG_USE_GPU)?CL_DEVICE_TYPE_GPU:CL_DEVICE_TYPE_CPU), @@ -438,1179 +551,1209 @@ class JNIContext{ // platformVersionName = "OpenCL 1.1 AMD-APP-SDK-v2.5 (684.213)"|"OpenCL 1.1 CUDA 4.0.1" #ifndef __APPLE__ // Here we check if the platformVersionName starts with "OpenCL 1.1" (10 chars!) - if (!strncmp(platformVersionName, "OpenCL 1.1", 10)) { + if (!strncmp(platformVersionName, "OpenCL 1.1", 10)) { //} #else - // Here we check if the platformVersionName starts with "OpenCL 1.1" or "OpenCL 1.0" (10 chars!) - if (!strncmp(platformVersionName, "OpenCL 1.1", 10) || !strncmp(platformVersionName, "OpenCL 1.0", 10)) { + // Here we check if the platformVersionName starts with "OpenCL 1.1" or "OpenCL 1.0" (10 chars!) + if (!strncmp(platformVersionName, "OpenCL 1.1", 10) || !strncmp(platformVersionName, "OpenCL 1.0", 10)) { // } #endif - // Get the # of devices - status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &deviceIdc); - // now check if this platform supports the requested device type (GPU or CPU) - if (status == CL_SUCCESS && deviceIdc >0 ){ - if (deviceIdc >1){ - if (isVerbose()){ - fprintf(stderr, "Warning attempt to use %d devices\n", deviceIdc); - } - deviceIdc = 1; // Hack to work around issue #18 (multiple device error) - if (isVerbose()){ - fprintf(stderr, "Locking deviceIdc to %d to work around issue #18\n", deviceIdc); - } - } - platform = platforms[i]; - if (isVerbose()){ - fprintf(stderr, "platform %s supports requested device type\n", platformVendorName); - } - - deviceIds = new cl_device_id[deviceIdc]; - status = clGetDeviceIDs(platform, deviceType, deviceIdc, deviceIds, NULL); - if (status == CL_SUCCESS){ - ASSERT_CL_NO_RETURN("clGetDeviceIDs()"); - - GET_DEV_INFO(deviceIds[0], CL_DEVICE_MAX_COMPUTE_UNITS, maxComputeUnits, "%d"); - GET_DEV_INFO(deviceIds[0], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, maxWorkItemDimensions, "%d"); - GET_DEV_INFO(deviceIds[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, maxWorkGroupSize, "%d"); - GET_DEV_INFO(deviceIds[0], CL_DEVICE_GLOBAL_MEM_SIZE, globalMemSize, "%d"); - GET_DEV_INFO(deviceIds[0], CL_DEVICE_LOCAL_MEM_SIZE, localMemSize, "%d"); - if (isVerbose()){ - - fprintf(stderr, "device[%p]: Type: ", deviceIds[0]); - if (deviceType & CL_DEVICE_TYPE_DEFAULT) { - // deviceType &= ~CL_DEVICE_TYPE_DEFAULT; - fprintf(stderr, "Default "); - }else if (deviceType & CL_DEVICE_TYPE_CPU) { - // deviceType &= ~CL_DEVICE_TYPE_CPU; - fprintf(stderr, "CPU "); - }else if (deviceType & CL_DEVICE_TYPE_GPU) { - // deviceType &= ~CL_DEVICE_TYPE_GPU; - fprintf(stderr, "GPU "); - }else if (deviceType & CL_DEVICE_TYPE_ACCELERATOR) { - // deviceType &= ~CL_DEVICE_TYPE_ACCELERATOR; - fprintf(stderr, "Accelerator "); - }else{ - fprintf(stderr, "Unknown (0x%llx) ", deviceType); - } - fprintf(stderr, "\n"); - } - cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; - cl_context_properties* cprops = (NULL == platform) ? NULL : cps; - context = clCreateContextFromType( cprops, deviceType, NULL, NULL, &status); - ASSERT_CL_NO_RETURN("clCreateContextFromType()"); - if (status == CL_SUCCESS){ - - valid = JNI_TRUE; - } - } - }else{ - if (isVerbose()){ - fprintf(stderr, "platform %s does not support requested device type skipping!\n", platformVendorName); - } + // Get the # of devices + status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &deviceIdc); + // now check if this platform supports the requested device type (GPU or CPU) + if (status == CL_SUCCESS && deviceIdc >0 ){ + if (deviceIdc >1){ + if (isVerbose()){ + fprintf(stderr, "Warning attempt to use %d devices\n", deviceIdc); } - - }else{ + deviceIdc = 1; // Hack to work around issue #18 (multiple device error) if (isVerbose()){ -#ifndef __APPLE__ - fprintf(stderr, "platform %s version %s is not OpenCL 1.1 skipping!\n", platformVendorName, platformVersionName); -#else - fprintf(stderr, "platform %s version %s is neither OpenCL 1.1 or OpenCL 1.0 skipping!\n", platformVendorName, platformVersionName); -#endif - + fprintf(stderr, "Locking deviceIdc to %d to work around issue #18\n", deviceIdc); } } - } + platform = platforms[i]; + if (isVerbose()){ + fprintf(stderr, "platform %s supports requested device type\n", platformVendorName); + } - } - }else{ - if (isVerbose()){ - fprintf(stderr, "no opencl platforms available!\n"); - } - } + deviceIds = new cl_device_id[deviceIdc]; + status = clGetDeviceIDs(platform, deviceType, deviceIdc, deviceIds, NULL); + ASSERT_CL_NO_RETURN("clGetDeviceIDs()"); + if (status == CL_SUCCESS){ - } + status = clGetDeviceInfo(deviceIds[0], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(maxComputeUnits), &maxComputeUnits, NULL); + ASSERT_CL_NO_RETURN( "clGetDeviceInfo(CL_DEVICE_MAX_COMPUTE_UNITS)."); - jboolean isValid(){ - return(valid); - } - jboolean isProfilingEnabled(){ - return((flags&com_amd_aparapi_KernelRunner_JNI_FLAG_ENABLE_PROFILING)==com_amd_aparapi_KernelRunner_JNI_FLAG_ENABLE_PROFILING?JNI_TRUE:JNI_FALSE); - } - jboolean isUsingGPU(){ - return((flags&com_amd_aparapi_KernelRunner_JNI_FLAG_USE_GPU)==com_amd_aparapi_KernelRunner_JNI_FLAG_USE_GPU?JNI_TRUE:JNI_FALSE); - } - jboolean isVerbose(){ - return((flags&com_amd_aparapi_KernelRunner_JNI_FLAG_ENABLE_VERBOSE_JNI)==com_amd_aparapi_KernelRunner_JNI_FLAG_ENABLE_VERBOSE_JNI?JNI_TRUE:JNI_FALSE); - } + status = clGetDeviceInfo(deviceIds[0], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(maxWorkItemDimensions), &maxWorkItemDimensions, NULL); + ASSERT_CL_NO_RETURN( "clGetDeviceInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS)."); - ~JNIContext(){ - cl_int status = CL_SUCCESS; - jenv->DeleteGlobalRef(kernelObject); - jenv->DeleteGlobalRef(kernelClass); - if (context != 0){ - status = clReleaseContext(context); - ASSERT_CL_NO_RETURN("clReleaseContext()"); - context = (cl_context)0; - } - if (commandQueues){ - for (int dev=0; dev<deviceIdc; dev++){ - status = clReleaseCommandQueue((cl_command_queue)commandQueues[dev]); - ASSERT_CL_NO_RETURN("clReleaseCommandQueue()"); - commandQueues[dev] = (cl_command_queue)0; - } - delete[] commandQueues; commandQueues = NULL; - } - if (program != 0){ - status = clReleaseProgram((cl_program)program); - ASSERT_CL_NO_RETURN("clReleaseProgram()"); - program = (cl_program)0; - } - if (kernel != 0){ - status = clReleaseKernel((cl_kernel)kernel); - ASSERT_CL_NO_RETURN("clReleaseKernel()"); - kernel = (cl_kernel)0; - } - if (platforms){ - delete []platforms; platforms=NULL; - } - if (deviceIds){ - delete [] deviceIds; deviceIds=NULL; - } - if (argc> 0){ - for (int i=0; i< argc; i++){ - KernelArg *arg = args[i]; - if (!arg->isPrimitive()){ - if (arg->value.ref.mem != 0){ - status = clReleaseMemObject((cl_mem)arg->value.ref.mem); - ASSERT_CL_NO_RETURN("clReleaseMemObject()"); - arg->value.ref.mem = (cl_mem)0; + maxWorkItemSizes = (size_t *)malloc(sizeof(size_t)*maxWorkItemDimensions); + status = clGetDeviceInfo(deviceIds[0], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*maxWorkItemDimensions, maxWorkItemSizes, NULL); + + ASSERT_CL_NO_RETURN( "clGetDeviceInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES)."); + + status = clGetDeviceInfo(deviceIds[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL); + ASSERT_CL_NO_RETURN( "clGetDeviceInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE)."); + + status = clGetDeviceInfo(deviceIds[0], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(globalMemSize), &globalMemSize, NULL); + ASSERT_CL_NO_RETURN( "clGetDeviceInfo(CL_DEVICE_GLOBAL_MEM_SIZE)."); + + status = clGetDeviceInfo(deviceIds[0], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(localMemSize), &localMemSize, NULL); + + ASSERT_CL_NO_RETURN( "clGetDeviceInfo(CL_DEVICE_LOCAL_MEM_SIZE)."); + + + if (isVerbose()){ + fprintf(stderr, "device[%p]: Type: ", deviceIds[0]); + if (deviceType & CL_DEVICE_TYPE_DEFAULT) { + // deviceType &= ~CL_DEVICE_TYPE_DEFAULT; + fprintf(stderr, "Default "); + }else if (deviceType & CL_DEVICE_TYPE_CPU) { + // deviceType &= ~CL_DEVICE_TYPE_CPU; + fprintf(stderr, "CPU "); + }else if (deviceType & CL_DEVICE_TYPE_GPU) { + // deviceType &= ~CL_DEVICE_TYPE_GPU; + fprintf(stderr, "GPU "); + }else if (deviceType & CL_DEVICE_TYPE_ACCELERATOR) { + // deviceType &= ~CL_DEVICE_TYPE_ACCELERATOR; + fprintf(stderr, "Accelerator "); + }else{ + fprintf(stderr, "Unknown (0x%llx) ", deviceType); + } + fprintf(stderr, "\n"); + } + cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; + cl_context_properties* cprops = (NULL == platform) ? NULL : cps; + context = clCreateContextFromType( cprops, deviceType, NULL, NULL, &status); + ASSERT_CL_NO_RETURN("clCreateContextFromType()"); + if (status == CL_SUCCESS){ + + valid = JNI_TRUE; + } } - if (arg->value.ref.javaArray != NULL) { - jenv->DeleteWeakGlobalRef((jweak) arg->value.ref.javaArray); + }else{ + if (isVerbose()){ + fprintf(stderr, "platform %s does not support requested device type skipping!\n", platformVendorName); } } - if (arg->name != NULL){ - free(arg->name); arg->name = NULL; - } - if (arg->javaArg != NULL ) { - jenv->DeleteGlobalRef((jobject) arg->javaArg); - } - delete arg; arg=args[i]=NULL; - } - delete[] args; args=NULL; - delete []readEvents; readEvents =NULL; - delete []writeEvents; writeEvents = NULL; - delete []executeEvents; executeEvents = NULL; + }else{ + if (isVerbose()){ +#ifndef __APPLE__ + fprintf(stderr, "platform %s version %s is not OpenCL 1.1 skipping!\n", platformVendorName, platformVersionName); +#else + fprintf(stderr, "platform %s version %s is neither OpenCL 1.1 or OpenCL 1.0 skipping!\n", platformVendorName, platformVersionName); +#endif - if (isProfilingEnabled()) { - if (profileFile != NULL && profileFile != stderr) { - fclose(profileFile); } - delete[] readEventArgs; readEventArgs=0; - delete[] writeEventArgs; writeEventArgs=0; - } - } - } - - /* - Release JNI critical pinned arrays before returning to java code - */ - void unpinAll() { - for (int i=0; i< argc; i++){ - KernelArg *arg = args[i]; - if (arg->isBackedByArray()) { - arg->unpin(jenv); } } + + } + }else{ + if (isVerbose()){ + fprintf(stderr, "no opencl platforms available!\n"); } + } +} - }; +jboolean isValid(){ + return(valid); +} +jboolean isProfilingEnabled(){ + return((flags&com_amd_aparapi_KernelRunner_JNI_FLAG_ENABLE_PROFILING)==com_amd_aparapi_KernelRunner_JNI_FLAG_ENABLE_PROFILING?JNI_TRUE:JNI_FALSE); +} +jboolean isUsingGPU(){ + return((flags&com_amd_aparapi_KernelRunner_JNI_FLAG_USE_GPU)==com_amd_aparapi_KernelRunner_JNI_FLAG_USE_GPU?JNI_TRUE:JNI_FALSE); +} +jboolean isVerbose(){ + return((flags&com_amd_aparapi_KernelRunner_JNI_FLAG_ENABLE_VERBOSE_JNI)==com_amd_aparapi_KernelRunner_JNI_FLAG_ENABLE_VERBOSE_JNI?JNI_TRUE:JNI_FALSE); +} - jclass cacheKernelArgFields(JNIEnv *jenv, jobject jobj){ - jclass c = jenv->GetObjectClass(jobj); - nameFieldID = jenv->GetFieldID(c, "name", "Ljava/lang/String;"); ASSERT_FIELD(name); - typeFieldID = jenv->GetFieldID(c, "type", "I"); ASSERT_FIELD(type); - isStaticFieldID = jenv->GetFieldID(c, "isStatic", "Z"); ASSERT_FIELD(isStatic); - javaArrayFieldID = jenv->GetFieldID(c, "javaArray", "Ljava/lang/Object;"); ASSERT_FIELD(javaArray); - bytesPerLocalSizeFieldID = jenv->GetFieldID(c, "bytesPerLocalSize", "I"); ASSERT_FIELD(bytesPerLocalSize); - sizeInBytesFieldID = jenv->GetFieldID(c, "sizeInBytes", "I"); ASSERT_FIELD(sizeInBytes); - numElementsFieldID = jenv->GetFieldID(c, "numElements", "I"); ASSERT_FIELD(numElements); - return(c); - } +~JNIContext(){ +} - JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_disposeJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle) { - cl_int status = CL_SUCCESS; - JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); - if (jniContext != NULL){ - delete jniContext;//free(jniContext); - jniContext = NULL; - } - return(status); +void dispose(JNIEnv *jenv){ + cl_int status = CL_SUCCESS; + jenv->DeleteGlobalRef(kernelObject); + jenv->DeleteGlobalRef(kernelClass); + if (context != 0){ + status = clReleaseContext(context); + ASSERT_CL_NO_RETURN("clReleaseContext()"); + context = (cl_context)0; + } + if (commandQueues){ + for (int dev=0; dev<deviceIdc; dev++){ + status = clReleaseCommandQueue((cl_command_queue)commandQueues[dev]); + ASSERT_CL_NO_RETURN("clReleaseCommandQueue()"); + commandQueues[dev] = (cl_command_queue)0; } - - void idump(char *str, void *ptr, int size){ - int * iptr = (int *)ptr; - for (int i=0; i<size/sizeof(int); i++){ - fprintf(stderr, "%s%4d %d\n", str, i, iptr[i]); + delete[] commandQueues; commandQueues = NULL; + } + if (program != 0){ + status = clReleaseProgram((cl_program)program); + ASSERT_CL_NO_RETURN("clReleaseProgram()"); + program = (cl_program)0; + } + if (kernel != 0){ + status = clReleaseKernel((cl_kernel)kernel); + ASSERT_CL_NO_RETURN("clReleaseKernel()"); + kernel = (cl_kernel)0; + } + if (platforms){ + delete []platforms; platforms=NULL; + } + if (deviceIds){ + delete [] deviceIds; deviceIds=NULL; + } + if (argc> 0){ + for (int i=0; i< argc; i++){ + KernelArg *arg = args[i]; + if (!arg->isPrimitive()){ + if (arg->value.ref.mem != 0){ + status = clReleaseMemObject((cl_mem)arg->value.ref.mem); + ASSERT_CL_NO_RETURN("clReleaseMemObject()"); + arg->value.ref.mem = (cl_mem)0; + } + if (arg->value.ref.javaArray != NULL) { + jenv->DeleteWeakGlobalRef((jweak) arg->value.ref.javaArray); + } + } + if (arg->name != NULL){ + free(arg->name); arg->name = NULL; } + if (arg->javaArg != NULL ) { + jenv->DeleteGlobalRef((jobject) arg->javaArg); + } + delete arg; arg=args[i]=NULL; } + delete[] args; args=NULL; + + delete []readEvents; readEvents =NULL; + delete []writeEvents; writeEvents = NULL; + delete []executeEvents; executeEvents = NULL; - void fdump(char *str, void *ptr, int size){ - float * fptr = (float *)ptr; - for (int i=0; i<size/sizeof(float); i++){ - fprintf(stderr, "%s%4d %6.2f\n", str, i, fptr[i]); + if (isProfilingEnabled()) { + if (profileFile != NULL && profileFile != stderr) { + fclose(profileFile); } + delete[] readEventArgs; readEventArgs=0; + delete[] writeEventArgs; writeEventArgs=0; + } + } +} + +/* + Release JNI critical pinned arrays before returning to java code + */ +void unpinAll(JNIEnv* jenv) { + for (int i=0; i< argc; i++){ + KernelArg *arg = args[i]; + if (arg->isBackedByArray()) { + arg->unpin(jenv); } + } +} - jint writeProfileInfo(JNIContext* jniContext){ - cl_ulong currSampleBaseTime = -1; - int pos = 1; +}; - if (jniContext->firstRun) { - fprintf(jniContext->profileFile, "# PROFILE Name, queued, submit, start, end (microseconds)\n"); - } - // A read by a user kernel means the OpenCL layer wrote to the kernel and vice versa - for (int i=0; i< jniContext->argc; i++){ - KernelArg *arg=jniContext->args[i]; - if (arg->isBackedByArray() && arg->isRead()){ - // Initialize the base time for this sample - if (currSampleBaseTime == -1) { - currSampleBaseTime = arg->value.ref.write.queued; - } - if (jniContext->profileBaseTime == 0){ - jniContext->profileBaseTime = arg->value.ref.write.queued; +JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_disposeJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle) { + cl_int status = CL_SUCCESS; + JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); + if (jniContext != NULL){ + jniContext->dispose(jenv); + delete jniContext; + jniContext = NULL; + } + return(status); +} - // Write the base time as the first item in the csv - //fprintf(jniContext->profileFile, "%llu,", jniContext->profileBaseTime); - } +void idump(char *str, void *ptr, int size){ + int * iptr = (int *)ptr; + for (int i=0; i<size/sizeof(int); i++){ + fprintf(stderr, "%s%4d %d\n", str, i, iptr[i]); + } +} - fprintf(jniContext->profileFile, "%d write %s,", pos++, arg->name); +void fdump(char *str, void *ptr, int size){ + float * fptr = (float *)ptr; + for (int i=0; i<size/sizeof(float); i++){ + fprintf(stderr, "%s%4d %6.2f\n", str, i, fptr[i]); + } +} - fprintf(jniContext->profileFile, "%lu,%lu,%lu,%lu,", - (arg->value.ref.write.queued - currSampleBaseTime)/1000, - (arg->value.ref.write.submit - currSampleBaseTime)/1000, - (arg->value.ref.write.start - currSampleBaseTime)/1000, - (arg->value.ref.write.end - currSampleBaseTime)/1000); - } - } - if (jniContext->profileBaseTime == 0){ - jniContext->profileBaseTime = jniContext->exec.queued; +jint writeProfileInfo(JNIContext* jniContext){ + cl_ulong currSampleBaseTime = -1; + int pos = 1; - // Write the base time as the first item in the csv - //fprintf(jniContext->profileFile, "%llu,", jniContext->profileBaseTime); - } + if (jniContext->firstRun) { + fprintf(jniContext->profileFile, "# PROFILE Name, queued, submit, start, end (microseconds)\n"); + } - // Initialize the base time for this sample if necessary + // A read by a user kernel means the OpenCL layer wrote to the kernel and vice versa + for (int i=0; i< jniContext->argc; i++){ + KernelArg *arg=jniContext->args[i]; + if (arg->isBackedByArray() && arg->isRead()){ + + // Initialize the base time for this sample if (currSampleBaseTime == -1) { - currSampleBaseTime = jniContext->exec.queued; + currSampleBaseTime = arg->value.ref.write.queued; } - // exec - fprintf(jniContext->profileFile, "%d exec,", pos++); - - fprintf(jniContext->profileFile, "%lu,%lu,%lu,%lu,", - (jniContext->exec.queued - currSampleBaseTime)/1000, - (jniContext->exec.submit - currSampleBaseTime)/1000, - (jniContext->exec.start - currSampleBaseTime)/1000, - (jniContext->exec.end - currSampleBaseTime)/1000); - - // - if ( jniContext->argc == 0 ) { - fprintf(jniContext->profileFile, "\n"); - } else { - for (int i=0; i< jniContext->argc; i++){ - KernelArg *arg=jniContext->args[i]; - if (arg->isBackedByArray() && arg->isWrite()){ - if (jniContext->profileBaseTime == 0){ - jniContext->profileBaseTime = arg->value.ref.read.queued; - - // Write the base time as the first item in the csv - //fprintf(jniContext->profileFile, "%llu,", jniContext->profileBaseTime); - } - - // Initialize the base time for this sample - if (currSampleBaseTime == -1) { - currSampleBaseTime = arg->value.ref.read.queued; - } - - fprintf(jniContext->profileFile, "%d read %s,", pos++, arg->name); + if (jniContext->profileBaseTime == 0){ + jniContext->profileBaseTime = arg->value.ref.write.queued; - fprintf(jniContext->profileFile, "%lu,%lu,%lu,%lu,", - (arg->value.ref.read.queued - currSampleBaseTime)/1000, - (arg->value.ref.read.submit - currSampleBaseTime)/1000, - (arg->value.ref.read.start - currSampleBaseTime)/1000, - (arg->value.ref.read.end - currSampleBaseTime)/1000); - } - } + // Write the base time as the first item in the csv + //fprintf(jniContext->profileFile, "%llu,", jniContext->profileBaseTime); } - fprintf(jniContext->profileFile, "\n"); - return(0); - } - - // Should failed profiling abort the run and return early? - cl_int profile(ProfileInfo *profileInfo, cl_event *event){ - cl_int status = CL_SUCCESS; - status = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_QUEUED, sizeof(profileInfo->queued), &(profileInfo->queued), NULL); - ASSERT_CL( "clGetEventProfiliningInfo() QUEUED"); - status = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_SUBMIT, sizeof(profileInfo->submit), &(profileInfo->submit), NULL); - ASSERT_CL( "clGetEventProfiliningInfo() SUBMIT"); - status = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_START, sizeof(profileInfo->start), &(profileInfo->start), NULL); - ASSERT_CL( "clGetEventProfiliningInfo() START"); - status = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_END, sizeof(profileInfo->end), &(profileInfo->end), NULL); - ASSERT_CL( "clGetEventProfiliningInfo() END"); - return status; - } - + fprintf(jniContext->profileFile, "%d write %s,", pos++, arg->name); + fprintf(jniContext->profileFile, "%lu,%lu,%lu,%lu,", + (arg->value.ref.write.queued - currSampleBaseTime)/1000, + (arg->value.ref.write.submit - currSampleBaseTime)/1000, + (arg->value.ref.write.start - currSampleBaseTime)/1000, + (arg->value.ref.write.end - currSampleBaseTime)/1000); + } + } - jint updateKernel(JNIEnv *jenv, jobject jobj, JNIContext* jniContext) { - cl_int status = CL_SUCCESS; - if (jniContext != NULL){ - // we need to step through the array of KernelArg's to create the info required to create the cl_mem buffers. - for (jint i=0; i<jniContext->argc; i++){ - KernelArg *arg=jniContext->args[i]; - - arg->type = jenv->GetIntField(arg->javaArg, typeFieldID); - if (jniContext->isVerbose()){ - fprintf(stderr, "got type for %s: %08x\n", arg->name, arg->type); - } - if (!arg->isPrimitive()) { - // Following used for all primitive arrays, object arrays and nio Buffers - jarray newRef = (jarray)jenv->GetObjectField(arg->javaArg, javaArrayFieldID); - if (jniContext->isVerbose()){ - - fprintf(stderr, "testing for Resync javaArray %s: old=%p, new=%p\n", arg->name, arg->value.ref.javaArray, newRef); - } - - jboolean isSame = jenv->IsSameObject( newRef, arg->value.ref.javaArray); - if (isSame == JNI_FALSE) { - if (jniContext->isVerbose()){ - fprintf(stderr, "Resync javaArray for %s: %p %p\n", arg->name, newRef, arg->value.ref.javaArray); - } - // Free previous ref if any - if (arg->value.ref.javaArray != NULL) { - jenv->DeleteWeakGlobalRef((jweak) arg->value.ref.javaArray); - if (jniContext->isVerbose()){ - fprintf(stderr, "DeleteWeakGlobalRef for %s: %p\n", arg->name, arg->value.ref.javaArray); - } - } - - // need to free opencl buffers, run will reallocate later - if (arg->value.ref.mem != 0) { - //fprintf(stderr, "-->releaseMemObject[%d]\n", i); - status = clReleaseMemObject((cl_mem)arg->value.ref.mem); - //fprintf(stderr, "<--releaseMemObject[%d]\n", i); - ASSERT_CL("clReleaseMemObject()"); - arg->value.ref.mem = (cl_mem)0; - } - - arg->value.ref.mem = (cl_mem) 0; - arg->value.ref.addr = NULL; + if (jniContext->profileBaseTime == 0){ + jniContext->profileBaseTime = jniContext->exec.queued; - // Capture new array ref from the kernel arg object + // Write the base time as the first item in the csv + //fprintf(jniContext->profileFile, "%llu,", jniContext->profileBaseTime); + } - if (newRef != NULL) { - arg->value.ref.javaArray = (jarray)jenv->NewWeakGlobalRef((jarray)newRef); - if (jniContext->isVerbose()){ - fprintf(stderr, "NewWeakGlobalRef for %s, set to %p\n", arg->name, - arg->value.ref.javaArray); - } - } else { - arg->value.ref.javaArray = NULL; - } - arg->value.ref.isArray = !arg->isAparapiBufIsDirect(); + // Initialize the base time for this sample if necessary + if (currSampleBaseTime == -1) { + currSampleBaseTime = jniContext->exec.queued; + } + + // exec + fprintf(jniContext->profileFile, "%d exec,", pos++); + + fprintf(jniContext->profileFile, "%lu,%lu,%lu,%lu,", + (jniContext->exec.queued - currSampleBaseTime)/1000, + (jniContext->exec.submit - currSampleBaseTime)/1000, + (jniContext->exec.start - currSampleBaseTime)/1000, + (jniContext->exec.end - currSampleBaseTime)/1000); + + // + if ( jniContext->argc == 0 ) { + fprintf(jniContext->profileFile, "\n"); + } else { + for (int i=0; i< jniContext->argc; i++){ + KernelArg *arg=jniContext->args[i]; + if (arg->isBackedByArray() && arg->isWrite()){ + if (jniContext->profileBaseTime == 0){ + jniContext->profileBaseTime = arg->value.ref.read.queued; + + // Write the base time as the first item in the csv + //fprintf(jniContext->profileFile, "%llu,", jniContext->profileBaseTime); + } - // Save the sizeInBytes which was set on the java side - arg->sizeInBytes = jenv->GetIntField(arg->javaArg, sizeInBytesFieldID); + // Initialize the base time for this sample + if (currSampleBaseTime == -1) { + currSampleBaseTime = arg->value.ref.read.queued; + } - if (jniContext->isVerbose()){ - fprintf(stderr, "updateKernel, args[%d].sizeInBytes=%d\n", i, arg->sizeInBytes); - } - } // !is_same - } - } // for each arg - } // if jniContext != NULL + fprintf(jniContext->profileFile, "%d read %s,", pos++, arg->name); - return(status); + fprintf(jniContext->profileFile, "%lu,%lu,%lu,%lu,", + (arg->value.ref.read.queued - currSampleBaseTime)/1000, + (arg->value.ref.read.submit - currSampleBaseTime)/1000, + (arg->value.ref.read.start - currSampleBaseTime)/1000, + (arg->value.ref.read.end - currSampleBaseTime)/1000); + } } + } + fprintf(jniContext->profileFile, "\n"); + return(0); +} +// Should failed profiling abort the run and return early? +cl_int profile(ProfileInfo *profileInfo, cl_event *event){ + cl_int status = CL_SUCCESS; + status = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_QUEUED, sizeof(profileInfo->queued), &(profileInfo->queued), NULL); + ASSERT_CL( "clGetEventProfiliningInfo() QUEUED"); + status = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_SUBMIT, sizeof(profileInfo->submit), &(profileInfo->submit), NULL); + ASSERT_CL( "clGetEventProfiliningInfo() SUBMIT"); + status = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_START, sizeof(profileInfo->start), &(profileInfo->start), NULL); + ASSERT_CL( "clGetEventProfiliningInfo() START"); + status = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_END, sizeof(profileInfo->end), &(profileInfo->end), NULL); + ASSERT_CL( "clGetEventProfiliningInfo() END"); + return status; +} - JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_runKernelJNI(JNIEnv *jenv, - jobject jobj, jlong jniContextHandle, jint globalSize, jint localSize, jboolean needSync, - jboolean useNullForLocalSize, jint passes) { +//Step through all non-primitive (array of primitive or array object references) and determine if the field has changed +//The field may have been re-assigned by the Java code to NULL or another instance. +//If we detect a change then we discard the previous cl_mem buffer, the caller will detect that the buffers are null and will create new cl_mem buffers. +jint updateNonPrimitiveReferences(JNIEnv *jenv, jobject jobj, JNIContext* jniContext) { + cl_int status = CL_SUCCESS; + if (jniContext != NULL){ + for (jint i=0; i<jniContext->argc; i++){ + KernelArg *arg=jniContext->args[i]; + arg->syncType(jenv); // make sure that the JNI arg reflects the latest type info from the instance - cl_int status = CL_SUCCESS; - JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); if (jniContext->isVerbose()){ - timer.start(); + fprintf(stderr, "got type for %s: %08x\n", arg->name, arg->type); } - - // Need to capture array refs - if (jniContext->firstRun || needSync) { - updateKernel(jenv, jobj, jniContext ); + if (!arg->isPrimitive()) { + // Following used for all primitive arrays, object arrays and nio Buffers + jarray newRef = (jarray)jenv->GetObjectField(arg->javaArg, KernelArg::javaArrayFieldID); if (jniContext->isVerbose()){ - fprintf(stderr, "back from updateKernel\n"); + fprintf(stderr, "testing for Resync javaArray %s: old=%p, new=%p\n", arg->name, arg->value.ref.javaArray, newRef); } - } - - int writeEventCount = 0; - - // kernelArgPos is used to keep track of the kernel arg position, it can - // differ from "i" due to insertion of javaArrayLength args which are not - // fields read from the kernel object. - int kernelArgPos = 0; - - for (int i=0; i< jniContext->argc; i++){ - KernelArg *arg = jniContext->args[i]; - // TODO: see if we can get rid of this read - arg->type = jenv->GetIntField(arg->javaArg, typeFieldID); - if (jniContext->isVerbose()){ - fprintf(stderr, "got type for arg %d, %s, type=%08x\n", i, arg->name, arg->type); - } - if (!arg->isPrimitive() && !arg->isLocal()) { - // pin the arrays so that GC does not move them during the call - - // get the C memory address for the region being transferred - // this uses different JNI calls for arrays vs. directBufs - void * prevAddr = arg->value.ref.addr; - if (arg->value.ref.isArray) { - arg->pin(jenv); - } else if (arg->isAparapiBufIsDirect()) { - // different call used for directbuffers - arg->value.ref.addr = jenv->GetDirectBufferAddress(arg->value.ref.javaArray); - } + if (!jenv->IsSameObject(newRef, arg->value.ref.javaArray)) { if (jniContext->isVerbose()){ - fprintf(stderr, "runKernel: arrayOrBuf ref %p, oldAddr=%p, newAddr=%p, ref.mem=%p, isArray=%d\n", - arg->value.ref.javaArray, - prevAddr, - arg->value.ref.addr, - arg->value.ref.mem, - arg->value.ref.isArray ); - fprintf(stderr, "at memory addr %p, contents: ", arg->value.ref.addr); - unsigned char *pb = (unsigned char *) arg->value.ref.addr; - for (int k=0; k<8; k++) { - fprintf(stderr, "%02x ", pb[k]); - } - fprintf(stderr, "\n" ); + fprintf(stderr, "Resync javaArray for %s: %p %p\n", arg->name, newRef, arg->value.ref.javaArray); } - // record whether object moved - // if we see that isCopy was returned by getPrimitiveArrayCritical, treat that as a move - bool objectMoved = (arg->value.ref.addr != prevAddr) || arg->value.ref.isCopy; - -#ifdef VERBOSE_EXPLICIT - if (arg->isExplicit() && arg->isExplicitWrite()){ - fprintf(stderr, "explicit write of %s\n", arg->name); - } -#endif - - if (jniContext->firstRun || (arg->value.ref.mem == 0) || objectMoved ){ - // if either this is the first run or user changed input array - // or gc moved something, then we create buffers/args - cl_uint mask = CL_MEM_USE_HOST_PTR; - if (arg->isRead() && arg->isWrite()) mask |= CL_MEM_READ_WRITE; - else if (arg->isRead() && !arg->isWrite()) mask |= CL_MEM_READ_ONLY; - else if (arg->isWrite()) mask |= CL_MEM_WRITE_ONLY; - arg->value.ref.memMask = mask; + // Free previous ref if any + if (arg->value.ref.javaArray != NULL) { + jenv->DeleteWeakGlobalRef((jweak) arg->value.ref.javaArray); if (jniContext->isVerbose()){ - strcpy(arg->value.ref.memSpec,"CL_MEM_USE_HOST_PTR"); - if (mask & CL_MEM_READ_WRITE) strcat(arg->value.ref.memSpec,"|CL_MEM_READ_WRITE"); - if (mask & CL_MEM_READ_ONLY) strcat(arg->value.ref.memSpec,"|CL_MEM_READ_ONLY"); - if (mask & CL_MEM_WRITE_ONLY) strcat(arg->value.ref.memSpec,"|CL_MEM_WRITE_ONLY"); - - fprintf(stderr, "%s %d clCreateBuffer(context, %s, size=%08x bytes, address=%08x, &status)\n", arg->name, - i, arg->value.ref.memSpec, arg->sizeInBytes, arg->value.ref.addr); - } - arg->value.ref.mem = clCreateBuffer(jniContext->context, arg->value.ref.memMask, - arg->sizeInBytes, arg->value.ref.addr, &status); - - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clCreateBuffer"); - jniContext->unpinAll(); - return status; - } - - status = clSetKernelArg(jniContext->kernel, kernelArgPos++, sizeof(cl_mem), (void *)&(arg->value.ref.mem)); - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clSetKernelArg (array)"); - jniContext->unpinAll(); - return status; - } - - // Add the array length if needed - if (arg->usesArrayLength()){ - arg->value.ref.javaArrayLength = jenv->GetIntField(arg->javaArg, numElementsFieldID); - status = clSetKernelArg(jniContext->kernel, kernelArgPos++, sizeof(jint), &(arg->value.ref.javaArrayLength)); - - if (jniContext->isVerbose()){ - fprintf(stderr, "runKernel arg %d %s, javaArrayLength = %d\n", i, arg->name, arg->value.ref.javaArrayLength); - } - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clSetKernelArg (array length)"); - jniContext->unpinAll(); - return status; - } - } - } else { - // Keep the arg position in sync if no updates were required - kernelArgPos++; - if (arg->usesArrayLength()){ - kernelArgPos++; + fprintf(stderr, "DeleteWeakGlobalRef for %s: %p\n", arg->name, arg->value.ref.javaArray); } } - // we only enqueue a write if we know the kernel actually reads the buffer or if there is an explicit write pending - // the default behavior for Constant buffers is also that there is no write enqueued unless explicit + // need to free opencl buffers, run will reallocate later + if (arg->value.ref.mem != 0) { + //fprintf(stderr, "-->releaseMemObject[%d]\n", i); + status = clReleaseMemObject((cl_mem)arg->value.ref.mem); + //fprintf(stderr, "<--releaseMemObject[%d]\n", i); + ASSERT_CL("clReleaseMemObject()"); + arg->value.ref.mem = (cl_mem)0; + } - if (arg->mustWriteBuffer()){ -#ifdef VERBOSE_EXPLICIT - if (arg->isExplicit() && arg->isExplicitWrite()){ - fprintf(stderr, "writing explicit buffer %d %s\n", i, arg->name); - } -#endif - if (jniContext->isVerbose()){ - fprintf(stderr, "%s writing buffer %d %s\n", (arg->isExplicit() ? "explicitly" : ""), - i, arg->name); - } - if (jniContext->isProfilingEnabled()) { - jniContext->writeEventArgs[writeEventCount]=i; - } + arg->value.ref.mem = (cl_mem) 0; + arg->value.ref.addr = NULL; - status = clEnqueueWriteBuffer(jniContext->commandQueues[0], arg->value.ref.mem, CL_FALSE, 0, - arg->sizeInBytes, arg->value.ref.addr, 0, NULL, &(jniContext->writeEvents[writeEventCount++])); - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clEnqueueWriteBuffer"); - jniContext->unpinAll(); - return status; - } - if (arg->isExplicit() && arg->isExplicitWrite()){ - arg->type &= ~com_amd_aparapi_KernelRunner_ARG_EXPLICIT_WRITE; -#ifdef VERBOSE_EXPLICIT - fprintf(stderr, "clearing explicit buffer bit %d %s\n", i, arg->name); -#endif - jenv->SetIntField(arg->javaArg, typeFieldID,arg->type ); - } - } - } else if (arg->isLocal()){ - if (jniContext->firstRun){ - // must multiply perlocalByteSize by localSize to get real opencl buffer size - int bytesPerLocalSize = jenv->GetIntField(arg->javaArg, bytesPerLocalSizeFieldID); - int adjustedLocalBufSize = bytesPerLocalSize * localSize; + // Capture new array ref from the kernel arg object + if (newRef != NULL) { + arg->value.ref.javaArray = (jarray)jenv->NewWeakGlobalRef((jarray)newRef); if (jniContext->isVerbose()){ - fprintf(stderr, "ISLOCAL, clSetKernelArg(jniContext->kernel, %d, %d, NULL);\n", i, adjustedLocalBufSize); - } - status = clSetKernelArg(jniContext->kernel, kernelArgPos++, adjustedLocalBufSize, NULL); - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clSetKernelArg() (local)"); - jniContext->unpinAll(); - return status; + fprintf(stderr, "NewWeakGlobalRef for %s, set to %p\n", arg->name, + arg->value.ref.javaArray); } } else { - // Keep the arg position in sync if no updates were required - kernelArgPos++; - if (arg->usesArrayLength()){ - kernelArgPos++; - } + arg->value.ref.javaArray = NULL; } - }else{ // primitive arguments + arg->value.ref.isArray = true; - // we need to reflectively sync the value out of the kernel object - if (arg->isFloat()){ - if (arg->isStatic){ - arg->value.f = jenv->GetStaticFloatField(jniContext->kernelClass, arg->fieldID); - }else{ - arg->value.f = jenv->GetFloatField(jniContext->kernelObject, arg->fieldID); - } - //fprintf(stderr, "float arg %d\n", arg->value.f); - }else if (arg->isInt()){ - if (arg->isStatic){ - arg->value.i = jenv->GetStaticIntField(jniContext->kernelClass, arg->fieldID); - }else{ - arg->value.i = jenv->GetIntField(jniContext->kernelObject, arg->fieldID); - } - //fprintf(stderr, "int arg %d\n", arg->value.i); - }else if (arg->isBoolean()){ - if (arg->isStatic){ - arg->value.c = jenv->GetStaticBooleanField(jniContext->kernelClass, arg->fieldID); - }else{ - arg->value.c = jenv->GetBooleanField(jniContext->kernelObject, arg->fieldID); - } - //fprintf(stderr, "boolean arg %d\n", arg->value.c); - }else if (arg->isByte()){ - if (arg->isStatic){ - arg->value.c = jenv->GetStaticByteField(jniContext->kernelClass, arg->fieldID); - }else{ - arg->value.c = jenv->GetByteField(jniContext->kernelObject, arg->fieldID); - } - //fprintf(stderr, "byte arg %d\n", arg->value.c); - }else if (arg->isLong()){ - if (arg->isStatic){ - arg->value.j = jenv->GetStaticLongField(jniContext->kernelClass, arg->fieldID); - }else{ - arg->value.j = jenv->GetLongField(jniContext->kernelObject, arg->fieldID); - } - //fprintf(stderr, "long arg %d\n", arg->value.c); - }else if (arg->isDouble()){ - if (arg->isStatic){ - arg->value.d = jenv->GetStaticDoubleField(jniContext->kernelClass, arg->fieldID); - }else{ - arg->value.d = jenv->GetDoubleField(jniContext->kernelObject, arg->fieldID); - } - //fprintf(stderr, "double arg %d\n", arg->value.c); - } + // Save the sizeInBytes which was set on the java side + arg->syncSizeInBytes(jenv); if (jniContext->isVerbose()){ - fprintf(stderr, "clSetKernelArg %s: %d %d %d 0x%08x\n", arg->name, i, kernelArgPos, - arg->sizeInBytes, arg->value); - } - status = clSetKernelArg(jniContext->kernel, kernelArgPos++, arg->sizeInBytes, &(arg->value)); - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clSetKernelArg() (value)"); - jniContext->unpinAll(); - return status; + fprintf(stderr, "updateNonPrimitiveReferences, args[%d].sizeInBytes=%d\n", i, arg->sizeInBytes); } - } - } // for each arg + } // object has changed + } + } // for each arg + } // if jniContext != NULL + return(status); +} - size_t globalSizeAsSizeT = (globalSize /jniContext->deviceIdc); - size_t localSizeAsSizeT = localSize; - // To support multiple passes we add a 'secret' final arg called 'passid' and just schedule multiple enqueuendrange kernels. Each of which having a separate value of passid - for (int passid=0; passid<passes; passid++){ - for (int dev =0; dev < jniContext->deviceIdc; dev++){ - size_t offset = (size_t)((globalSize/jniContext->deviceIdc)*dev); - status = clSetKernelArg(jniContext->kernel, kernelArgPos, sizeof(passid), &(passid)); - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clSetKernelArg() (passid)"); - jniContext->unpinAll(); - return status; - } +JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_runKernelJNI(JNIEnv *jenv, + jobject jobj, jlong jniContextHandle, jobject _range, jboolean needSync, jint passes) { - // four options here due to passid - if (passid == 0 && passes==1){ - //fprintf(stderr, "setting passid to %d of %d first and last\n", passid, passes); - // there is one pass and this is it - // enqueue depends on write enqueues - // we don't block but and we populate the executeEvents - status = clEnqueueNDRangeKernel(jniContext->commandQueues[dev], jniContext->kernel, 1, &offset, &globalSizeAsSizeT, - useNullForLocalSize ? NULL : &localSizeAsSizeT, - writeEventCount, writeEventCount?jniContext->writeEvents:NULL, &jniContext->executeEvents[dev]); - }else if (passid == 0){ - //fprintf(stderr, "setting passid to %d of %d first not last\n", passid, passes); - // this is the first of multiple passes - // enqueue depends on write enqueues - // we block but do not populate executeEvents (only the last pass does this) - status = clEnqueueNDRangeKernel(jniContext->commandQueues[dev], jniContext->kernel, 1, &offset, &globalSizeAsSizeT, - useNullForLocalSize ? NULL : &localSizeAsSizeT, - writeEventCount, writeEventCount?jniContext->writeEvents:NULL, &jniContext->executeEvents[dev]); - - }else if (passid < passes-1){ - // we are in some middle pass (neither first or last) - // we don't depend on write enqueues - // we block and do not supply executeEvents (only the last pass does this) - //fprintf(stderr, "setting passid to %d of %d not first not last\n", passid, passes); - status = clEnqueueNDRangeKernel(jniContext->commandQueues[dev], jniContext->kernel, 1, &offset, &globalSizeAsSizeT, - useNullForLocalSize ? NULL : &localSizeAsSizeT, 0, NULL, &jniContext->executeEvents[dev]); - }else{ - // we are the last pass of >1 - // we don't depend on write enqueues - // we block and supply executeEvents - //fprintf(stderr, "setting passid to %d of %d last\n", passid, passes); - status = clEnqueueNDRangeKernel(jniContext->commandQueues[dev], jniContext->kernel, 1, &offset, &globalSizeAsSizeT, - useNullForLocalSize ? NULL : &localSizeAsSizeT, 0, NULL, &jniContext->executeEvents[dev]); - } + Range range(jenv, _range); + cl_int status = CL_SUCCESS; + JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); + if (jniContext->isVerbose()){ + timer.start(); + } - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clEnqueueNDRangeKernel()"); - fprintf(stderr, "after clEnqueueNDRangeKernel, globalSize=%d localSize=%d usingNull=%d\n", (int)globalSizeAsSizeT, (int)localSizeAsSizeT, useNullForLocalSize); - jniContext->unpinAll(); - return status; - } - } - if (passid < passes-1){ - // we need to wait for the executions to complete... - status = clWaitForEvents(jniContext->deviceIdc, jniContext->executeEvents); - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clWaitForEvents() execute events mid pass"); - jniContext->unpinAll(); - return status; - } + // Need to capture array refs + if (jniContext->firstRun || needSync) { + updateNonPrimitiveReferences(jenv, jobj, jniContext ); + if (jniContext->isVerbose()){ + fprintf(stderr, "back from updateNonPrimitiveReferences\n"); + } + } - for (int dev = 0; dev < jniContext->deviceIdc; dev++){ - status = clReleaseEvent(jniContext->executeEvents[dev]); - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clReleaseEvent() read event"); - jniContext->unpinAll(); - return status; - } - } - } + int writeEventCount = 0; + + // kernelArgPos is used to keep track of the kernel arg position, it can + // differ from "i" due to insertion of javaArrayLength args which are not + // fields read from the kernel object. + int kernelArgPos = 0; + + for (int i=0; i< jniContext->argc; i++){ + KernelArg *arg = jniContext->args[i]; + // TODO: see if we can get rid of this read + arg->syncType(jenv); + if (jniContext->isVerbose()){ + fprintf(stderr, "got type for arg %d, %s, type=%08x\n", i, arg->name, arg->type); + } + if (!arg->isPrimitive() && !arg->isLocal()) { + // pin the arrays so that GC does not move them during the call + + // get the C memory address for the region being transferred + // this uses different JNI calls for arrays vs. directBufs + void * prevAddr = arg->value.ref.addr; + if (arg->value.ref.isArray) { + arg->pin(jenv); } - int readEventCount = 0; + if (jniContext->isVerbose()){ + fprintf(stderr, "runKernel: arrayOrBuf ref %p, oldAddr=%p, newAddr=%p, ref.mem=%p, isArray=%d\n", + arg->value.ref.javaArray, + prevAddr, + arg->value.ref.addr, + arg->value.ref.mem, + arg->value.ref.isArray ); + fprintf(stderr, "at memory addr %p, contents: ", arg->value.ref.addr); + unsigned char *pb = (unsigned char *) arg->value.ref.addr; + for (int k=0; k<8; k++) { + fprintf(stderr, "%02x ", pb[k]); + } + fprintf(stderr, "\n" ); + } + // record whether object moved + // if we see that isCopy was returned by getPrimitiveArrayCritical, treat that as a move + bool objectMoved = (arg->value.ref.addr != prevAddr) || arg->value.ref.isCopy; - for (int i=0; i< jniContext->argc; i++){ - KernelArg *arg = jniContext->args[i]; +#ifdef VERBOSE_EXPLICIT + if (arg->isExplicit() && arg->isExplicitWrite()){ + fprintf(stderr, "explicit write of %s\n", arg->name); + } +#endif - if (arg->mustReadBuffer()){ - if (jniContext->isProfilingEnabled()) { - jniContext->readEventArgs[readEventCount]=i; - } - if (jniContext->isVerbose()){ - fprintf(stderr, "reading buffer %d %s\n", i, arg->name); - } + if (jniContext->firstRun || (arg->value.ref.mem == 0) || objectMoved ){ + // if either this is the first run or user changed input array + // or gc moved something, then we create buffers/args + cl_uint mask = CL_MEM_USE_HOST_PTR; + if (arg->isRead() && arg->isWrite()) mask |= CL_MEM_READ_WRITE; + else if (arg->isRead() && !arg->isWrite()) mask |= CL_MEM_READ_ONLY; + else if (arg->isWrite()) mask |= CL_MEM_WRITE_ONLY; + arg->value.ref.memMask = mask; + if (jniContext->isVerbose()){ + strcpy(arg->value.ref.memSpec,"CL_MEM_USE_HOST_PTR"); + if (mask & CL_MEM_READ_WRITE) strcat(arg->value.ref.memSpec,"|CL_MEM_READ_WRITE"); + if (mask & CL_MEM_READ_ONLY) strcat(arg->value.ref.memSpec,"|CL_MEM_READ_ONLY"); + if (mask & CL_MEM_WRITE_ONLY) strcat(arg->value.ref.memSpec,"|CL_MEM_WRITE_ONLY"); - status = clEnqueueReadBuffer(jniContext->commandQueues[0], arg->value.ref.mem, CL_FALSE, 0, - arg->sizeInBytes,arg->value.ref.addr , jniContext->deviceIdc, jniContext->executeEvents, &(jniContext->readEvents[readEventCount++])); - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clEnqueueReadBuffer()"); - jniContext->unpinAll(); - return status; - } + fprintf(stderr, "%s %d clCreateBuffer(context, %s, size=%08x bytes, address=%08x, &status)\n", arg->name, + i, arg->value.ref.memSpec, arg->sizeInBytes, arg->value.ref.addr); } - } + arg->value.ref.mem = clCreateBuffer(jniContext->context, arg->value.ref.memMask, + arg->sizeInBytes, arg->value.ref.addr, &status); - // don't change the order here - // We wait for the reads which each depend on the execution, which depends on the writes ;) - // So after the reads have completed, we can release the execute and writes. + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clCreateBuffer"); + jniContext->unpinAll(jenv); + return status; + } - if (readEventCount >0){ - status = clWaitForEvents(readEventCount, jniContext->readEvents); + status = clSetKernelArg(jniContext->kernel, kernelArgPos++, sizeof(cl_mem), (void *)&(arg->value.ref.mem)); if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clWaitForEvents() read events"); - jniContext->unpinAll(); + PRINT_CL_ERR(status, "clSetKernelArg (array)"); + jniContext->unpinAll(jenv); return status; } - for (int i=0; i< readEventCount; i++){ - if (jniContext->isProfilingEnabled()) { - status = profile(&jniContext->args[jniContext->readEventArgs[i]]->value.ref.read, &jniContext->readEvents[i]); - if (status != CL_SUCCESS) { - jniContext->unpinAll(); - return status; - } + // Add the array length if needed + if (arg->usesArrayLength()){ + arg->syncJavaArrayLength(jenv); + + status = clSetKernelArg(jniContext->kernel, kernelArgPos++, sizeof(jint), &(arg->value.ref.javaArrayLength)); + + if (jniContext->isVerbose()){ + fprintf(stderr, "runKernel arg %d %s, javaArrayLength = %d\n", i, arg->name, arg->value.ref.javaArrayLength); } - status = clReleaseEvent(jniContext->readEvents[i]); if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clReleaseEvent() read event"); - jniContext->unpinAll(); + PRINT_CL_ERR(status, "clSetKernelArg (array length)"); + jniContext->unpinAll(jenv); return status; } } } else { - // if readEventCount == 0 then we don't need any reads so we just wait for the executions to complete - status = clWaitForEvents(jniContext->deviceIdc, jniContext->executeEvents); + // Keep the arg position in sync if no updates were required + kernelArgPos++; + if (arg->usesArrayLength()){ + kernelArgPos++; + } + } + + // we only enqueue a write if we know the kernel actually reads the buffer or if there is an explicit write pending + // the default behavior for Constant buffers is also that there is no write enqueued unless explicit + + if (arg->mustWriteBuffer()){ +#ifdef VERBOSE_EXPLICIT + if (arg->isExplicit() && arg->isExplicitWrite()){ + fprintf(stderr, "writing explicit buffer %d %s\n", i, arg->name); + } +#endif + if (jniContext->isVerbose()){ + fprintf(stderr, "%s writing buffer %d %s\n", (arg->isExplicit() ? "explicitly" : ""), + i, arg->name); + } + if (jniContext->isProfilingEnabled()) { + jniContext->writeEventArgs[writeEventCount]=i; + } + + status = clEnqueueWriteBuffer(jniContext->commandQueues[0], arg->value.ref.mem, CL_FALSE, 0, + arg->sizeInBytes, arg->value.ref.addr, 0, NULL, &(jniContext->writeEvents[writeEventCount++])); if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clWaitForEvents() execute event"); - jniContext->unpinAll(); + PRINT_CL_ERR(status, "clEnqueueWriteBuffer"); + jniContext->unpinAll(jenv); return status; } + if (arg->isExplicit() && arg->isExplicitWrite()){ +#ifdef VERBOSE_EXPLICIT + fprintf(stderr, "clearing explicit buffer bit %d %s\n", i, arg->name); +#endif + arg->clearExplicitBufferBit(jenv); + } } + } else if (arg->isLocal()){ + if (jniContext->firstRun){ + int bytes = arg->sizeInBytes; - if (jniContext->isProfilingEnabled()) { - status = profile(&jniContext->exec, &jniContext->executeEvents[0]); + if (jniContext->isVerbose()){ + fprintf(stderr, "ISLOCAL, clSetKernelArg(jniContext->kernel, %d, %d, NULL);\n", i, bytes); + } + status = clSetKernelArg(jniContext->kernel, kernelArgPos++, bytes, NULL); if (status != CL_SUCCESS) { - jniContext->unpinAll(); + PRINT_CL_ERR(status, "clSetKernelArg() (local)"); + jniContext->unpinAll(jenv); return status; } + } else { + // Keep the arg position in sync if no updates were required + kernelArgPos++; + if (arg->usesArrayLength()){ + kernelArgPos++; + } + } + }else{ // primitive arguments + + // we need to reflectively sync the value out of the kernel object + if (arg->isFloat()){ + if (arg->isStatic){ + arg->value.f = jenv->GetStaticFloatField(jniContext->kernelClass, arg->fieldID); + }else{ + arg->value.f = jenv->GetFloatField(jniContext->kernelObject, arg->fieldID); + } + //fprintf(stderr, "float arg %d\n", arg->value.f); + }else if (arg->isInt()){ + if (arg->isStatic){ + arg->value.i = jenv->GetStaticIntField(jniContext->kernelClass, arg->fieldID); + }else{ + arg->value.i = jenv->GetIntField(jniContext->kernelObject, arg->fieldID); + } + //fprintf(stderr, "int arg %d\n", arg->value.i); + }else if (arg->isBoolean()){ + if (arg->isStatic){ + arg->value.c = jenv->GetStaticBooleanField(jniContext->kernelClass, arg->fieldID); + }else{ + arg->value.c = jenv->GetBooleanField(jniContext->kernelObject, arg->fieldID); + } + //fprintf(stderr, "boolean arg %d\n", arg->value.c); + }else if (arg->isByte()){ + if (arg->isStatic){ + arg->value.c = jenv->GetStaticByteField(jniContext->kernelClass, arg->fieldID); + }else{ + arg->value.c = jenv->GetByteField(jniContext->kernelObject, arg->fieldID); + } + //fprintf(stderr, "byte arg %d\n", arg->value.c); + }else if (arg->isLong()){ + if (arg->isStatic){ + arg->value.j = jenv->GetStaticLongField(jniContext->kernelClass, arg->fieldID); + }else{ + arg->value.j = jenv->GetLongField(jniContext->kernelObject, arg->fieldID); + } + //fprintf(stderr, "long arg %d\n", arg->value.c); + }else if (arg->isDouble()){ + if (arg->isStatic){ + arg->value.d = jenv->GetStaticDoubleField(jniContext->kernelClass, arg->fieldID); + }else{ + arg->value.d = jenv->GetDoubleField(jniContext->kernelObject, arg->fieldID); + } + //fprintf(stderr, "double arg %d\n", arg->value.c); + } + + if (jniContext->isVerbose()){ + fprintf(stderr, "clSetKernelArg %s: %d %d %d 0x%08x\n", arg->name, i, kernelArgPos, + arg->sizeInBytes, arg->value); } + status = clSetKernelArg(jniContext->kernel, kernelArgPos++, arg->sizeInBytes, &(arg->value)); + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clSetKernelArg() (value)"); + jniContext->unpinAll(jenv); + return status; + } + } + } // for each arg - // extract the execution status from the executeEvent - cl_int executeStatus; - status = clGetEventInfo(jniContext->executeEvents[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &executeStatus, NULL); + // We will need to revisit the execution of multiple devices. + // POssibly cloning the range per device and mutating each to handle a unique subrange (of global) and + // maybe even pushing the offset into the range class. + + // size_t globalSize_0AsSizeT = (range.globalDims[0] /jniContext->deviceIdc); + // size_t localSize_0AsSizeT = range.localDims[0]; + + // To support multiple passes we add a 'secret' final arg called 'passid' and just schedule multiple enqueuendrange kernels. Each of which having a separate value of passid + // + + for (int passid=0; passid<passes; passid++){ + for (int dev =0; dev < jniContext->deviceIdc; dev++){ // this will always be 1 until we reserect multi-dim support + //size_t offset = 1; // (size_t)((range.globalDims[0]/jniContext->deviceIdc)*dev); + status = clSetKernelArg(jniContext->kernel, kernelArgPos, sizeof(passid), &(passid)); if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clGetEventInfo() execute event"); - jniContext->unpinAll(); + PRINT_CL_ERR(status, "clSetKernelArg() (passid)"); + jniContext->unpinAll(jenv); return status; } - if (executeStatus != CL_SUCCESS) { - // it should definitely not be negative, but since we did a wait above, it had better be CL_COMPLETE==CL_SUCCESS - PRINT_CL_ERR(executeStatus, "Execution status of execute event"); - jniContext->unpinAll(); - return executeStatus; + + // four options here due to passid + if (passid == 0 && passes==1){ + //fprintf(stderr, "setting passid to %d of %d first and last\n", passid, passes); + // there is one pass and this is it + // enqueue depends on write enqueues + // we don't block but and we populate the executeEvents + status = clEnqueueNDRangeKernel( + jniContext->commandQueues[dev], + jniContext->kernel, + range.dims, + range.offsets, range.globalDims, + range.localDims, + writeEventCount, + writeEventCount?jniContext->writeEvents:NULL, + &jniContext->executeEvents[dev]); + }else if (passid == 0){ + //fprintf(stderr, "setting passid to %d of %d first not last\n", passid, passes); + // this is the first of multiple passes + // enqueue depends on write enqueues + // we block but do not populate executeEvents (only the last pass does this) + status = clEnqueueNDRangeKernel( + jniContext->commandQueues[dev], + jniContext->kernel, + range.dims, + range.offsets, + range.globalDims, + range.localDims, + writeEventCount, + writeEventCount?jniContext->writeEvents:NULL, + &jniContext->executeEvents[dev]); + + }else if (passid < passes-1){ + // we are in some middle pass (neither first or last) + // we don't depend on write enqueues + // we block and do not supply executeEvents (only the last pass does this) + //fprintf(stderr, "setting passid to %d of %d not first not last\n", passid, passes); + status = clEnqueueNDRangeKernel( + jniContext->commandQueues[dev], + jniContext->kernel, + range.dims, + range.offsets, + range.globalDims, + range.localDims, + 0, // wait for this event count + NULL, // list of events to wait for + &jniContext->executeEvents[dev]); + }else{ + // we are the last pass of >1 + // we don't depend on write enqueues + // we block and supply executeEvents + //fprintf(stderr, "setting passid to %d of %d last\n", passid, passes); + status = clEnqueueNDRangeKernel( + jniContext->commandQueues[dev], + jniContext->kernel, + range.dims, + range.offsets, + range.globalDims, + range.localDims, + 0, // wait for this event count + NULL, // list of events to wait for + &jniContext->executeEvents[dev]); } - for (int dev=0; dev<jniContext->deviceIdc; dev++){ - status = clReleaseEvent(jniContext->executeEvents[dev]); - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clReleaseEvent() execute event"); - jniContext->unpinAll(); - return status; - } + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clEnqueueNDRangeKernel()"); + fprintf(stderr, "after clEnqueueNDRangeKernel, globalSize_0=%d localSize_0=%d\n", (int)range.globalDims[0], range.localDims[0] ); + jniContext->unpinAll(jenv); + return status; + } + } + if (passid < passes-1){ + // we need to wait for the executions to complete... + status = clWaitForEvents(jniContext->deviceIdc, jniContext->executeEvents); + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clWaitForEvents() execute events mid pass"); + jniContext->unpinAll(jenv); + return status; } - for (int i=0; i< writeEventCount; i++){ - if (jniContext->isProfilingEnabled()) { - profile(&jniContext->args[jniContext->writeEventArgs[i]]->value.ref.write, &jniContext->writeEvents[i]); - } - status = clReleaseEvent(jniContext->writeEvents[i]); + for (int dev = 0; dev < jniContext->deviceIdc; dev++){ + status = clReleaseEvent(jniContext->executeEvents[dev]); if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clReleaseEvent() write event"); - jniContext->unpinAll(); + PRINT_CL_ERR(status, "clReleaseEvent() read event"); + jniContext->unpinAll(jenv); return status; } } + } + } + + int readEventCount = 0; - jniContext->unpinAll(); + for (int i=0; i< jniContext->argc; i++){ + KernelArg *arg = jniContext->args[i]; + if (arg->mustReadBuffer()){ if (jniContext->isProfilingEnabled()) { - writeProfileInfo(jniContext); + jniContext->readEventArgs[readEventCount]=i; } - - jniContext->firstRun = false; if (jniContext->isVerbose()){ - timer.end("elapsed"); + fprintf(stderr, "reading buffer %d %s\n", i, arg->name); } - //fprintf(stderr, "About to return %d from exec\n", status); - return(status); + status = clEnqueueReadBuffer(jniContext->commandQueues[0], arg->value.ref.mem, CL_FALSE, 0, + arg->sizeInBytes,arg->value.ref.addr , jniContext->deviceIdc, jniContext->executeEvents, &(jniContext->readEvents[readEventCount++])); + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clEnqueueReadBuffer()"); + jniContext->unpinAll(jenv); + return status; + } } + } + // don't change the order here + // We wait for the reads which each depend on the execution, which depends on the writes ;) + // So after the reads have completed, we can release the execute and writes. - // we return the JNIContext from here - JNIEXPORT jlong JNICALL Java_com_amd_aparapi_KernelRunner_initJNI(JNIEnv *jenv, jclass clazz, jobject kernelObject, - jint flags, jint numProcessors, - jint maxJTPLocalSize) { - cl_int status = CL_SUCCESS; - JNIContext* jniContext = new JNIContext(jenv, kernelObject, flags, numProcessors, maxJTPLocalSize); - if (jniContext->isValid()){ - return((jlong)jniContext); - }else{ - return(0L); + if (readEventCount >0){ + status = clWaitForEvents(readEventCount, jniContext->readEvents); + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clWaitForEvents() read events"); + jniContext->unpinAll(jenv); + return status; + } + + for (int i=0; i< readEventCount; i++){ + if (jniContext->isProfilingEnabled()) { + status = profile(&jniContext->args[jniContext->readEventArgs[i]]->value.ref.read, &jniContext->readEvents[i]); + if (status != CL_SUCCESS) { + jniContext->unpinAll(jenv); + return status; + } + } + status = clReleaseEvent(jniContext->readEvents[i]); + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clReleaseEvent() read event"); + jniContext->unpinAll(jenv); + return status; } } + } else { + // if readEventCount == 0 then we don't need any reads so we just wait for the executions to complete + status = clWaitForEvents(jniContext->deviceIdc, jniContext->executeEvents); + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clWaitForEvents() execute event"); + jniContext->unpinAll(jenv); + return status; + } + } + if (jniContext->isProfilingEnabled()) { + status = profile(&jniContext->exec, &jniContext->executeEvents[0]); + if (status != CL_SUCCESS) { + jniContext->unpinAll(jenv); + return status; + } + } - JNIEXPORT jlong JNICALL Java_com_amd_aparapi_KernelRunner_buildProgramJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jstring source) { - JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); - if (jniContext == NULL){ - return 0; - } + // extract the execution status from the executeEvent + cl_int executeStatus; + status = clGetEventInfo(jniContext->executeEvents[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &executeStatus, NULL); + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clGetEventInfo() execute event"); + jniContext->unpinAll(jenv); + return status; + } + if (executeStatus != CL_SUCCESS) { + // it should definitely not be negative, but since we did a wait above, it had better be CL_COMPLETE==CL_SUCCESS + PRINT_CL_ERR(executeStatus, "Execution status of execute event"); + jniContext->unpinAll(jenv); + return executeStatus; + } - cl_int status = CL_SUCCESS; - const char *sourceChars = jenv->GetStringUTFChars(source, NULL); - CHECK(sourceChars == NULL, "jenv->GetStringUTFChars() returned null" ); - - size_t sourceSize[] = { strlen(sourceChars) }; - jniContext->program = clCreateProgramWithSource( jniContext->context, 1, &sourceChars, sourceSize, &status); - jenv->ReleaseStringUTFChars(source, sourceChars); - ASSERT_CL("clCreateProgramWithSource()"); - - status = clBuildProgram(jniContext->program, jniContext->deviceIdc, jniContext->deviceIds, NULL, NULL, NULL); - - if(status == CL_BUILD_PROGRAM_FAILURE) { - cl_int logStatus; - size_t buildLogSize = 0; - status = clGetProgramBuildInfo(jniContext->program, jniContext->deviceIds[0], - CL_PROGRAM_BUILD_LOG, buildLogSize, NULL, &buildLogSize); - ASSERT_CL("clGetProgramBuildInfo()"); - char * buildLog = new char[buildLogSize]; - CHECK(buildLog == NULL, "Failed to allocate host memory. (buildLog)"); - memset(buildLog, 0, buildLogSize); - status = clGetProgramBuildInfo (jniContext->program, jniContext->deviceIds[0], - CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); - ASSERT_CL("clGetProgramBuildInfo()"); - - fprintf(stderr, "clBuildProgram failed"); - fprintf(stderr, "\n************************************************\n"); - fprintf(stderr, "%s", buildLog); - fprintf(stderr, "\n************************************************\n\n\n"); - delete []buildLog; - return(0); - } + for (int dev=0; dev<jniContext->deviceIdc; dev++){ - jniContext->kernel = clCreateKernel(jniContext->program, "run", &status); - ASSERT_CL("clCreateKernel()"); + status = clReleaseEvent(jniContext->executeEvents[dev]); + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clReleaseEvent() execute event"); + jniContext->unpinAll(jenv); + return status; + } + } - cl_command_queue_properties queue_props = 0; - if (jniContext->isProfilingEnabled()) { - queue_props |= CL_QUEUE_PROFILING_ENABLE; - } + for (int i=0; i< writeEventCount; i++){ + if (jniContext->isProfilingEnabled()) { + profile(&jniContext->args[jniContext->writeEventArgs[i]]->value.ref.write, &jniContext->writeEvents[i]); + } + status = clReleaseEvent(jniContext->writeEvents[i]); + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clReleaseEvent() write event"); + jniContext->unpinAll(jenv); + return status; + } + } - jniContext->commandQueues= new cl_command_queue[jniContext->deviceIdc]; - for (int dev=0; dev < jniContext->deviceIdc; dev++){ - jniContext->commandQueues[dev]=clCreateCommandQueue(jniContext->context, (cl_device_id)jniContext->deviceIds[dev], - queue_props, - &status); - ASSERT_CL("clCreateCommandQueue()"); - } + jniContext->unpinAll(jenv); - if (jniContext->isProfilingEnabled()) { - // compute profile filename + if (jniContext->isProfilingEnabled()) { + writeProfileInfo(jniContext); + } + + jniContext->firstRun = false; + if (jniContext->isVerbose()){ + timer.end("elapsed"); + } + + //fprintf(stderr, "About to return %d from exec\n", status); + return(status); +} + + +// we return the JNIContext from here +JNIEXPORT jlong JNICALL Java_com_amd_aparapi_KernelRunner_initJNI(JNIEnv *jenv, jclass clazz, jobject kernelObject, + jint flags) { + cl_int status = CL_SUCCESS; + JNIContext* jniContext = new JNIContext(jenv, kernelObject, flags); + if (jniContext->isValid()){ + return((jlong)jniContext); + }else{ + return(0L); + } +} + + +JNIEXPORT jlong JNICALL Java_com_amd_aparapi_KernelRunner_buildProgramJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jstring source) { + JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); + if (jniContext == NULL){ + return 0; + } + + cl_int status = CL_SUCCESS; + const char *sourceChars = jenv->GetStringUTFChars(source, NULL); + CHECK(sourceChars == NULL, "jenv->GetStringUTFChars() returned null" ); + + size_t sourceSize[] = { strlen(sourceChars) }; + jniContext->program = clCreateProgramWithSource( jniContext->context, 1, &sourceChars, sourceSize, &status); + jenv->ReleaseStringUTFChars(source, sourceChars); + ASSERT_CL("clCreateProgramWithSource()"); + + status = clBuildProgram(jniContext->program, jniContext->deviceIdc, jniContext->deviceIds, NULL, NULL, NULL); + + if(status == CL_BUILD_PROGRAM_FAILURE) { + cl_int logStatus; + size_t buildLogSize = 0; + status = clGetProgramBuildInfo(jniContext->program, jniContext->deviceIds[0], + CL_PROGRAM_BUILD_LOG, buildLogSize, NULL, &buildLogSize); + ASSERT_CL("clGetProgramBuildInfo()"); + char * buildLog = new char[buildLogSize]; + CHECK(buildLog == NULL, "Failed to allocate host memory. (buildLog)"); + memset(buildLog, 0, buildLogSize); + status = clGetProgramBuildInfo (jniContext->program, jniContext->deviceIds[0], + CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); + ASSERT_CL("clGetProgramBuildInfo()"); + + fprintf(stderr, "clBuildProgram failed"); + fprintf(stderr, "\n************************************************\n"); + fprintf(stderr, "%s", buildLog); + fprintf(stderr, "\n************************************************\n\n\n"); + delete []buildLog; + return(0); + } + + jniContext->kernel = clCreateKernel(jniContext->program, "run", &status); + ASSERT_CL("clCreateKernel()"); + + cl_command_queue_properties queue_props = 0; + if (jniContext->isProfilingEnabled()) { + queue_props |= CL_QUEUE_PROFILING_ENABLE; + } + + jniContext->commandQueues= new cl_command_queue[jniContext->deviceIdc]; + for (int dev=0; dev < jniContext->deviceIdc; dev++){ + jniContext->commandQueues[dev]=clCreateCommandQueue(jniContext->context, (cl_device_id)jniContext->deviceIds[dev], + queue_props, + &status); + ASSERT_CL("clCreateCommandQueue()"); + } + + if (jniContext->isProfilingEnabled()) { + // compute profile filename #if defined (_WIN32) - jint pid = GetCurrentProcessId(); + jint pid = GetCurrentProcessId(); #else - pid_t pid = getpid(); + pid_t pid = getpid(); #endif - // indicate cpu or gpu - // timestamp - // kernel name + // indicate cpu or gpu + // timestamp + // kernel name - jclass classMethodAccess = jenv->FindClass("java/lang/Class"); - jmethodID getNameID=jenv->GetMethodID(classMethodAccess,"getName","()Ljava/lang/String;"); - jstring className = (jstring)jenv->CallObjectMethod(jniContext->kernelClass, getNameID); - const char *classNameChars = jenv->GetStringUTFChars(className, NULL); + jclass classMethodAccess = jenv->FindClass("java/lang/Class"); + jmethodID getNameID=jenv->GetMethodID(classMethodAccess,"getName","()Ljava/lang/String;"); + jstring className = (jstring)jenv->CallObjectMethod(jniContext->kernelClass, getNameID); + const char *classNameChars = jenv->GetStringUTFChars(className, NULL); #define TIME_STR_LEN 200 - char timeStr[TIME_STR_LEN]; - struct tm *tmp; - time_t t = time(NULL); - tmp = localtime(&t); - if (tmp == NULL) { - perror("localtime"); - } - //strftime(timeStr, TIME_STR_LEN, "%F.%H%M%S", tmp); %F seemed to cause a core dump - strftime(timeStr, TIME_STR_LEN, "%H%M%S", tmp); - - char* fnameStr = new char[strlen(classNameChars) + strlen(timeStr) + 128]; + char timeStr[TIME_STR_LEN]; + struct tm *tmp; + time_t t = time(NULL); + tmp = localtime(&t); + if (tmp == NULL) { + perror("localtime"); + } + //strftime(timeStr, TIME_STR_LEN, "%F.%H%M%S", tmp); %F seemed to cause a core dump + strftime(timeStr, TIME_STR_LEN, "%H%M%S", tmp); - //sprintf(fnameStr, "%s.%s.%d.%llx\n", classNameChars, timeStr, pid, jniContext); - sprintf(fnameStr, "aparapiprof.%s.%d.%016lx", timeStr, pid, (unsigned long)jniContext); - jenv->ReleaseStringUTFChars(className, classNameChars); + char* fnameStr = new char[strlen(classNameChars) + strlen(timeStr) + 128]; - FILE* profileFile = fopen(fnameStr, "w"); - if (profileFile != NULL) { - jniContext->profileFile = profileFile; - } else { - jniContext->profileFile = stderr; - fprintf(stderr, "Could not open profile data file %s, reverting to stderr\n", fnameStr); - } - delete []fnameStr; - } + //sprintf(fnameStr, "%s.%s.%d.%llx\n", classNameChars, timeStr, pid, jniContext); + sprintf(fnameStr, "aparapiprof.%s.%d.%016lx", timeStr, pid, (unsigned long)jniContext); + jenv->ReleaseStringUTFChars(className, classNameChars); - return((jlong)jniContext); + FILE* profileFile = fopen(fnameStr, "w"); + if (profileFile != NULL) { + jniContext->profileFile = profileFile; + } else { + jniContext->profileFile = stderr; + fprintf(stderr, "Could not open profile data file %s, reverting to stderr\n", fnameStr); } + delete []fnameStr; + } + return((jlong)jniContext); +} - // this is called once when the arg list is first determined for this kernel - JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_setArgsJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jobjectArray argArray, jint argc) { - JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); - cl_int status = CL_SUCCESS; - if (jniContext != NULL){ - jniContext->argc = argc; - jniContext->args = new KernelArg*[jniContext->argc]; - jniContext->firstRun = true; - // Step through the array of KernelArg's to capture the type data for the Kernel's data members. - for (jint i=0; i<jniContext->argc; i++){ - KernelArg* arg = jniContext->args[i] = new KernelArg; +// this is called once when the arg list is first determined for this kernel +JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_setArgsJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jobjectArray argArray, jint argc) { + JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); + cl_int status = CL_SUCCESS; + if (jniContext != NULL){ + jniContext->argc = argc; + jniContext->args = new KernelArg*[jniContext->argc]; + jniContext->firstRun = true; + // Step through the array of KernelArg's to capture the type data for the Kernel's data members. + for (jint i=0; i<jniContext->argc; i++){ + + + jobject argObj = jenv->GetObjectArrayElement(argArray, i); + KernelArg* arg = jniContext->args[i] = new KernelArg(jenv, argObj); - jobject argObj = jenv->GetObjectArrayElement(argArray, i); - if (argClazz == 0){ - argClazz = cacheKernelArgFields(jenv, argObj); - } - arg->javaArg = jenv->NewGlobalRef(argObj); // save a global ref to the java Arg Object - - arg->type = jenv->GetIntField(argObj, typeFieldID); - arg->isStatic = jenv->GetBooleanField(argObj, isStaticFieldID); - jstring name = (jstring)jenv->GetObjectField(argObj, nameFieldID); - const char *nameChars = jenv->GetStringUTFChars(name, NULL); - arg->name=strdup(nameChars); - jenv->ReleaseStringUTFChars(name, nameChars); #ifdef VERBOSE_EXPLICIT - if (arg->isExplicit()){ - fprintf(stderr, "%s is explicit!\n", arg->name); - } + if (arg->isExplicit()){ + fprintf(stderr, "%s is explicit!\n", arg->name); + } #endif - if (arg->isPrimitive()) { - // for primitives, we cache the fieldID for that field in the kernel's arg object - if (arg->isFloat()){ - if (arg->isStatic){ - arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "F"); - }else{ - arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "F"); - } - arg->sizeInBytes = sizeof(jfloat); - }else if (arg->isInt()){ - if (arg->isStatic){ - arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "I"); - }else{ - arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "I"); - } - arg->sizeInBytes = sizeof(jint); - }else if (arg->isByte()){ - if (arg->isStatic){ - arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "B"); - }else{ - arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "B"); - } - arg->sizeInBytes = sizeof(jbyte); - }else if (arg->isBoolean()){ - if (arg->isStatic){ - arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "Z"); - }else{ - arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "Z"); - } - arg->sizeInBytes = sizeof(jboolean); - }else if (arg->isLong()){ - if (arg->isStatic){ - arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "J"); - }else{ - arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "J"); - } - arg->sizeInBytes = sizeof(jlong); - }else if (arg->isDouble()){ - if (arg->isStatic){ - arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "D"); - }else{ - arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "D"); - } - arg->sizeInBytes = sizeof(jdouble); - } - }else{ // we will use an array - arg->value.ref.mem = (cl_mem) 0; - arg->value.ref.javaArray = 0; - arg->sizeInBytes = 0; + if (arg->isPrimitive()) { + // for primitives, we cache the fieldID for that field in the kernel's arg object + if (arg->isFloat()){ + if (arg->isStatic){ + arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "F"); + }else{ + arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "F"); } - if (jniContext->isVerbose()){ - fprintf(stderr, "in setArgs arg %d %s type %08x\n", i, arg->name, arg->type); + arg->sizeInBytes = sizeof(jfloat); + }else if (arg->isInt()){ + if (arg->isStatic){ + arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "I"); + }else{ + arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "I"); } - - //If an error occurred, return early so we report the first problem, not the last - if (jniContext->jenv->ExceptionCheck() == JNI_TRUE) { - jniContext->argc = -1; - delete[] jniContext->args; - jniContext->args = NULL; - jniContext->firstRun = true; - return (status); + arg->sizeInBytes = sizeof(jint); + }else if (arg->isByte()){ + if (arg->isStatic){ + arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "B"); + }else{ + arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "B"); } - - } - // we will need an executeEvent buffer for all devices - jniContext->executeEvents = new cl_event[jniContext->deviceIdc]; - - // We will need *at most* jniContext->argc read/write events - jniContext->readEvents = new cl_event[jniContext->argc]; - if (jniContext->isProfilingEnabled()) { - jniContext->readEventArgs = new jint[jniContext->argc]; - } - jniContext->writeEvents = new cl_event[jniContext->argc]; - if (jniContext->isProfilingEnabled()) { - jniContext->writeEventArgs = new jint[jniContext->argc]; + arg->sizeInBytes = sizeof(jbyte); + }else if (arg->isBoolean()){ + if (arg->isStatic){ + arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "Z"); + }else{ + arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "Z"); + } + arg->sizeInBytes = sizeof(jboolean); + }else if (arg->isLong()){ + if (arg->isStatic){ + arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "J"); + }else{ + arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "J"); + } + arg->sizeInBytes = sizeof(jlong); + }else if (arg->isDouble()){ + if (arg->isStatic){ + arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "D"); + }else{ + arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "D"); + } + arg->sizeInBytes = sizeof(jdouble); } + }else{ // we will use an array + arg->value.ref.mem = (cl_mem) 0; + arg->value.ref.javaArray = 0; + arg->sizeInBytes = 0; } - return(status); - } - - JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_getLocalSizeJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jint globalSize, jint localBytesPerLocalId) { - size_t kernelMaxWorkGroupSize = 0; - size_t kernelWorkGroupSizeMultiple = 0; - cl_int status = CL_SUCCESS; - JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); - if (jniContext != NULL){ - clGetKernelWorkGroupInfo(jniContext->kernel, jniContext->deviceIds[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(kernelMaxWorkGroupSize), &kernelMaxWorkGroupSize, NULL); - ASSERT_CL("clGetKernelWorkGroupInfo()"); - // starting value depends on device type - // not sure why the CPU has a different starting size, but it does - int startLocalSize = (jniContext->deviceType == CL_DEVICE_TYPE_GPU ? kernelMaxWorkGroupSize : globalSize/(jniContext->numProcessors*4)); - - if (startLocalSize == 0) startLocalSize = 1; - if (startLocalSize > kernelMaxWorkGroupSize) startLocalSize = kernelMaxWorkGroupSize; - if (startLocalSize > globalSize) startLocalSize = globalSize; - // if the kernel uses any local memory, determine our max local memory size so we can possibly limit localsize - cl_ulong devLocalMemSize; - if (localBytesPerLocalId > 0) { - status = clGetDeviceInfo(jniContext->deviceIds[0], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &devLocalMemSize, NULL); - cl_uint localSizeLimitFromLocalMem = devLocalMemSize/localBytesPerLocalId; - if (startLocalSize > localSizeLimitFromLocalMem) startLocalSize = localSizeLimitFromLocalMem; - if (jniContext->isVerbose()){ - fprintf(stderr, "localBytesPerLocalId=%d, device localMemMax=%d, localSizeLimitFromLocalMem=%d\n", - localBytesPerLocalId, (cl_uint) devLocalMemSize, localSizeLimitFromLocalMem); - } - + if (jniContext->isVerbose()){ + fprintf(stderr, "in setArgs arg %d %s type %08x\n", i, arg->name, arg->type); + if (arg->isLocal()){ + fprintf(stderr, "in setArgs arg %d %s is local\n", i, arg->name); + }else{ + fprintf(stderr, "in setArgs arg %d %s is *not* local\n", i, arg->name); } + } - // then iterate down until we find a localSize that divides globalSize equally - for (int localSize = startLocalSize; localSize>0; localSize--) { - if (globalSize % localSize == 0) { - if (jniContext->isVerbose()){ - fprintf(stderr, "for globalSize=%d, stepping localSize from %d, returning localSize=%d\n", globalSize, startLocalSize, localSize); - } - return localSize; - } - } + //If an error occurred, return early so we report the first problem, not the last + if (jenv->ExceptionCheck() == JNI_TRUE) { + jniContext->argc = -1; + delete[] jniContext->args; + jniContext->args = NULL; + jniContext->firstRun = true; + return (status); } - // should never get this far - return 0; + } + // we will need an executeEvent buffer for all devices + jniContext->executeEvents = new cl_event[jniContext->deviceIdc]; + + // We will need *at most* jniContext->argc read/write events + jniContext->readEvents = new cl_event[jniContext->argc]; + if (jniContext->isProfilingEnabled()) { + jniContext->readEventArgs = new jint[jniContext->argc]; + } + jniContext->writeEvents = new cl_event[jniContext->argc]; + if (jniContext->isProfilingEnabled()) { + jniContext->writeEventArgs = new jint[jniContext->argc]; + } + } + return(status); +} - JNIEXPORT jstring JNICALL Java_com_amd_aparapi_KernelRunner_getExtensions(JNIEnv *jenv, jobject jobj, jlong jniContextHandle) { - jstring jextensions = NULL; - JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); - if (jniContext != NULL){ - size_t retvalsize = 0; - cl_int status = CL_SUCCESS; - status = clGetDeviceInfo(jniContext->deviceIds[0], CL_DEVICE_EXTENSIONS, 0, NULL, &retvalsize); - ASSERT_CL("clGetDeviceInfo()"); - char* extensions = new char[retvalsize]; - clGetDeviceInfo(jniContext->deviceIds[0], CL_DEVICE_EXTENSIONS, retvalsize, extensions, NULL); - jextensions = jenv->NewStringUTF(extensions); - delete [] extensions; - } - return jextensions; - } - - // Called as a result of Kernel.get(someArray) - JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_getJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jobject buffer) { - cl_int status = CL_SUCCESS; - JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); - if (jniContext != NULL){ - jboolean foundArg = false; - for (jint i=0; i<jniContext->argc; i++){ - KernelArg *arg= jniContext->args[i]; - if (arg->isArray()){ - jboolean isSame = jenv->IsSameObject(buffer, arg->value.ref.javaArray); - // only do this if the array that we are passed is indeed an arg we are tracking - if (isSame){ - foundArg = true; - //fprintf(stderr, "get of %s\n", arg->name); + + +JNIEXPORT jstring JNICALL Java_com_amd_aparapi_KernelRunner_getExtensionsJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle) { + jstring jextensions = NULL; + JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); + if (jniContext != NULL){ + size_t retvalsize = 0; + cl_int status = CL_SUCCESS; + status = clGetDeviceInfo(jniContext->deviceIds[0], CL_DEVICE_EXTENSIONS, 0, NULL, &retvalsize); + ASSERT_CL("clGetDeviceInfo()"); + char* extensions = new char[retvalsize]; + clGetDeviceInfo(jniContext->deviceIds[0], CL_DEVICE_EXTENSIONS, retvalsize, extensions, NULL); + jextensions = jenv->NewStringUTF(extensions); + delete [] extensions; + } + return jextensions; +} + +// Called as a result of Kernel.get(someArray) +JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_getJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jobject buffer) { + cl_int status = CL_SUCCESS; + JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); + if (jniContext != NULL){ + jboolean foundArg = false; + for (jint i=0; i<jniContext->argc; i++){ + KernelArg *arg= jniContext->args[i]; + if (arg->isArray()){ + jboolean isSame = jenv->IsSameObject(buffer, arg->value.ref.javaArray); + // only do this if the array that we are passed is indeed an arg we are tracking + if (isSame){ + foundArg = true; + //fprintf(stderr, "get of %s\n", arg->name); #ifdef VERBOSE_EXPLICIT - fprintf(stderr, "explicitly reading buffer %d %s\n", i, arg->name); + fprintf(stderr, "explicitly reading buffer %d %s\n", i, arg->name); #endif - arg->pin(jenv); + arg->pin(jenv); - status = clEnqueueReadBuffer(jniContext->commandQueues[0], arg->value.ref.mem, CL_FALSE, 0, - arg->sizeInBytes,arg->value.ref.addr , 0, NULL, &jniContext->readEvents[0]); - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clEnqueueReadBuffer()"); - return status; - } - status = clWaitForEvents(1, jniContext->readEvents); - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clWaitForEvents"); - return status; - } - clReleaseEvent(jniContext->readEvents[0]); - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clReleaseEvent() read event"); - return status; - } - // since this is an explicit buffer get, we expect the buffer to have changed so we commit - arg->unpinCommit(jenv); - } + status = clEnqueueReadBuffer(jniContext->commandQueues[0], arg->value.ref.mem, CL_FALSE, 0, + arg->sizeInBytes,arg->value.ref.addr , 0, NULL, &jniContext->readEvents[0]); + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clEnqueueReadBuffer()"); + return status; } - } - if (!foundArg){ - if (jniContext->isVerbose()){ - fprintf(stderr, "attempt to request to get a buffer that does not appear to be referenced from kernel\n"); + status = clWaitForEvents(1, jniContext->readEvents); + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clWaitForEvents"); + return status; } + clReleaseEvent(jniContext->readEvents[0]); + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clReleaseEvent() read event"); + return status; + } + // since this is an explicit buffer get, we expect the buffer to have changed so we commit + arg->unpinCommit(jenv); } } - return 0; } + if (!foundArg){ + if (jniContext->isVerbose()){ + fprintf(stderr, "attempt to request to get a buffer that does not appear to be referenced from kernel\n"); + } + } + } + return 0; +} + +JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_getMaxComputeUnitsJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle) { + cl_int status = CL_SUCCESS; + JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); + if (jniContext != NULL){ + return(jniContext->maxComputeUnits); + }else{ + return(0); + } +} +JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_getMaxWorkItemDimensionsJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle) { + cl_int status = CL_SUCCESS; + JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); + if (jniContext != NULL){ + return(jniContext->maxWorkItemDimensions); + }else{ + return(0); + } +} +JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_getMaxWorkGroupSizeJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle) { + cl_int status = CL_SUCCESS; + JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); + if (jniContext != NULL){ + return(jniContext->maxWorkGroupSize); + }else{ + return(0); + } +} + +JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_getMaxWorkItemSizeJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jint _index) { + cl_int status = CL_SUCCESS; + JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); + if (jniContext != NULL && _index >=0 && _index <= jniContext->maxWorkItemDimensions){ + return(jniContext->maxWorkItemSizes[_index]); + }else{ + return(0); + } +} diff --git a/com.amd.aparapi/build.xml b/com.amd.aparapi/build.xml index c57f0cb31150f3b4d473d1497816519ccea49f44..099eee309ad340b72aebf7b865da2c3e6f398051 100644 --- a/com.amd.aparapi/build.xml +++ b/com.amd.aparapi/build.xml @@ -8,7 +8,7 @@ <delete file="aparapi.jar"/> </target> - <target name="build"> + <target name="build" depends="clean"> <mkdir dir="classes"/> <javac destdir="classes" debug="on" includeAntRuntime="false" > <src path="src/java"/> diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/ClassModel.java b/com.amd.aparapi/src/java/com/amd/aparapi/ClassModel.java index 57f259350c762d55c085b4bcf416c3f566e2d6a3..8d6b45b43174b897ec2b5029b1a93a68bd4c4ddf 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/ClassModel.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/ClassModel.java @@ -2090,9 +2090,9 @@ class ClassModel{ } public boolean isStatic() { - return (Access.STATIC.bitIsSet(methodAccessFlags)); + return (Access.STATIC.bitIsSet(methodAccessFlags)); } - + AttributePool getAttributePool() { return (methodAttributePool); } diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/Config.java b/com.amd.aparapi/src/java/com/amd/aparapi/Config.java index 46d2266a191eadbf983dd9540ba7c711dede66de..192edeac461ea8cf50e3dbc17d7e4399db8a222b 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/Config.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/Config.java @@ -175,6 +175,7 @@ class Config{ static String instructionListenerClassName = System.getProperty(propPkgName + ".instructionListenerClass"); static public InstructionListener instructionListener = null; + { if (instructionListenerClassName != null && !instructionListenerClassName.equals("")) { diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/DeprecatedException.java b/com.amd.aparapi/src/java/com/amd/aparapi/DeprecatedException.java new file mode 100644 index 0000000000000000000000000000000000000000..70ca856e9cd07cde34185f34c976c813419177cd --- /dev/null +++ b/com.amd.aparapi/src/java/com/amd/aparapi/DeprecatedException.java @@ -0,0 +1,46 @@ +/* +Copyright (c) 2010-2011, Advanced Micro Devices, Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the +following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list of conditions and the following +disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following +disclaimer in the documentation and/or other materials provided with the distribution. + +Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export +laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through +774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, +you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of +Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration +Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1, +E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups +D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject +to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774 +of EAR). For the most current Country Group listings, or for additional information about the EAR or your obligations +under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. + +*/ +package com.amd.aparapi; + +@SuppressWarnings("serial") class DeprecatedException extends AparapiException{ + + DeprecatedException(String msg) { + super(msg); + } + +} diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java b/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java index c6dd14a1332086db13894c25c700e11500dba77d..abef24b2bf16ab2d57ecb2341f5b283cf51eda0e 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java @@ -89,16 +89,17 @@ import com.amd.aparapi.ClassModel.ConstantPool.MethodReferenceEntry; * } * </pre></blockquote> * <p> - * To execute this kernel, first create a new instance of it and then call <code>execute(int globalSize)</code>. + * To execute this kernel, first create a new instance of it and then call <code>execute(Range _range)</code>. * <p> * <blockquote><pre> * int[] values = new int[1024]; * // fill values array + * Range range = Range.create(values.length); // create a range 0..1024 * SquareKernel kernel = new SquareKernel(values); - * kernel.execute(values.length); + * kernel.execute(range); * </pre></blockquote> * <p> - * When <code>execute()</code> returns, all the executions of Kernel.run() have completed and the results are available in the <code>squares</code> array. + * When <code>execute(Range)</code> returns, all the executions of <code>Kernel.run()</code> have completed and the results are available in the <code>squares</code> array. * <p> * <blockquote><pre> * int[] squares = kernel.getSquares(); @@ -110,16 +111,19 @@ import com.amd.aparapi.ClassModel.ConstantPool.MethodReferenceEntry; * A different approach to creating kernels that avoids extending Kernel is to write an anonymous inner class: * <p> * <blockquote><pre> + * * final int[] values = new int[1024]; - * // fill values array + * // fill the values array * final int[] squares = new int[values.length]; + * final Range range = Range.create(values.length); + * * Kernel kernel = new Kernel(){ * public void run() { * int gid = getGlobalID(); * squares[gid] = values[gid]*values[gid]; * } * }; - * kernel.execute(values.length); + * kernel.execute(range); * for (int i=0; i< values.length; i++){ * System.out.printf("%4d %4d %8d\n", i, values[i], squares[i]); * } @@ -141,15 +145,52 @@ public abstract class Kernel implements Cloneable{ } @Retention(RetentionPolicy.RUNTIME) @interface OpenCLDelegate { + } + /** + * We can use this Annotation to 'tag' intended local buffers. + * + * So we can either annotate the buffer + * <pre><code> + * @Local int[] buffer = new int[1024]; + * </code></pre> + * Or use a special suffix + * <pre><code> + * int[] buffer_$local$ = new int[1024]; + * </code></pre> + * + * @see LOCAL_SUFFIX + * + * + */ + public @Retention(RetentionPolicy.RUNTIME) @interface Local { + + } + + /** + * We can use this suffix to 'tag' intended local buffers. + * + * + * So either name the buffer + * <pre><code> + * int[] buffer_$local$ = new int[1024]; + * </code></pre> + * Or use the Annotation form + * <pre><code> + * @Local int[] buffer = new int[1024]; + * </code></pre> + */ + + final static String LOCAL_SUFFIX = "_$local$"; + private static Logger logger = Logger.getLogger(Config.getLoggerName()); public abstract class Entry{ public abstract void run(); - public Kernel execute(int _globalSize) { - return (Kernel.this.execute("foo", _globalSize, 1)); + public Kernel execute(Range _range) { + return (Kernel.this.execute("foo", _range, 1)); } } @@ -292,26 +333,30 @@ public abstract class Kernel implements Cloneable{ private EXECUTION_MODE executionMode = EXECUTION_MODE.getDefaultExecutionMode(); - private int globalId; - - private int localId; - - private int localSize; + int[] globalId = new int[] { + 0, + 0, + 0 + }; - private int globalSize; + int[] localId = new int[] { + 0, + 0, + 0 + }; - private int groupId; + int[] groupId = new int[] { + 0, + 0, + 0 + }; - private int passId; + Range range; - private int numGroups; + int passId; volatile CyclicBarrier localBarrier; - void setGlobalId(int _globalId) { - globalId = _globalId; - } - /** * Determine the globalId of an executing kernel. * <p> @@ -349,19 +394,26 @@ public abstract class Kernel implements Cloneable{ */ @OpenCLDelegate protected final int getGlobalId() { - return (globalId); + return (getGlobalId(0)); } - void setGroupId(int _groupId) { - groupId = _groupId; - + @OpenCLDelegate protected final int getGlobalId(int _dim) { + return (globalId[_dim]); } - void setPassId(int _passId) { - passId = _passId; + /* + @OpenCLDelegate protected final int getGlobalX() { + return (getGlobalId(0)); + } - } + @OpenCLDelegate protected final int getGlobalY() { + return (getGlobalId(1)); + } + @OpenCLDelegate protected final int getGlobalZ() { + return (getGlobalId(2)); + } + */ /** * Determine the groupId of an executing kernel. * <p> @@ -394,9 +446,26 @@ public abstract class Kernel implements Cloneable{ * @return The groupId for this Kernel being executed */ @OpenCLDelegate protected final int getGroupId() { - return (groupId); + return (getGroupId(0)); } + @OpenCLDelegate protected final int getGroupId(int _dim) { + return (groupId[_dim]); + } + + /* + @OpenCLDelegate protected final int getGroupX() { + return (getGroupId(0)); + } + + @OpenCLDelegate protected final int getGroupY() { + return (getGroupId(1)); + } + + @OpenCLDelegate protected final int getGroupZ() { + return (getGroupId(2)); + } + */ /** * Determine the passId of an executing kernel. * <p> @@ -416,10 +485,6 @@ public abstract class Kernel implements Cloneable{ return (passId); } - void setLocalId(int _localId) { - localId = _localId; - } - /** * Determine the local id of an executing kernel. * <p> @@ -451,9 +516,26 @@ public abstract class Kernel implements Cloneable{ * @return The local id for this Kernel being executed */ @OpenCLDelegate protected final int getLocalId() { - return (localId); + return (getLocalId(0)); } + @OpenCLDelegate protected final int getLocalId(int _dim) { + return (localId[_dim]); + } + + /* + @OpenCLDelegate protected final int getLocalX() { + return (getLocalId(0)); + } + + @OpenCLDelegate protected final int getLocalY() { + return (getLocalId(1)); + } + + @OpenCLDelegate protected final int getLocalZ() { + return (getLocalId(2)); + } + */ /** * Determine the size of the group that an executing kernel is a member of. * <p> @@ -472,9 +554,26 @@ public abstract class Kernel implements Cloneable{ * @return The size of the currently executing group. */ @OpenCLDelegate protected final int getLocalSize() { - return (localSize); + return (range.getLocalSize(0)); + } + + @OpenCLDelegate protected final int getLocalSize(int _dim) { + return (range.getLocalSize(_dim)); } + /* + @OpenCLDelegate protected final int getLocalWidth() { + return (range.getLocalSize(0)); + } + + @OpenCLDelegate protected final int getLocalHeight() { + return (range.getLocalSize(1)); + } + + @OpenCLDelegate protected final int getLocalDepth() { + return (range.getLocalSize(2)); + } + */ /** * Determine the value that was passed to <code>Kernel.execute(int globalSize)</code> method. * @@ -486,14 +585,26 @@ public abstract class Kernel implements Cloneable{ * @return The value passed to <code>Kernel.execute(int globalSize)</code> causing the current execution. */ @OpenCLDelegate protected final int getGlobalSize() { - return (globalSize); + return (range.getGlobalSize(0)); } - void setNumGroups(int _numGroups) { - numGroups = _numGroups; - + @OpenCLDelegate protected final int getGlobalSize(int _dim) { + return (range.getGlobalSize(_dim)); } + /* + @OpenCLDelegate protected final int getGlobalWidth() { + return (range.getGlobalSize(0)); + } + + @OpenCLDelegate protected final int getGlobalHeight() { + return (range.getGlobalSize(1)); + } + + @OpenCLDelegate protected final int getGlobalDepth() { + return (range.getGlobalSize(2)); + } + */ /** * Determine the number of groups that will be used to execute a kernel * <p> @@ -509,9 +620,26 @@ public abstract class Kernel implements Cloneable{ * @return The number of groups that kernels will be dispatched into. */ @OpenCLDelegate protected final int getNumGroups() { - return (numGroups); + return (range.getNumGroups(0)); + } + + @OpenCLDelegate protected final int getNumGroups(int _dim) { + return (range.getNumGroups(_dim)); } + /* + @OpenCLDelegate protected final int getNumGroupsWidth() { + return (range.getGroups(0)); + } + + @OpenCLDelegate protected final int getNumGroupsHeight() { + return (range.getGroups(1)); + } + + @OpenCLDelegate protected final int getNumGroupsDepth() { + return (range.getGroups(2)); + } + */ /** * The entry point of a kernel. * @@ -529,9 +657,21 @@ public abstract class Kernel implements Cloneable{ @Override protected Object clone() { try { Kernel worker = (Kernel) super.clone(); - // if there are any private buffers, go thru the fields here - // and allocate a new instance for each clone - + worker.groupId = new int[] { + 0, + 0, + 0 + }; + worker.localId = new int[] { + 0, + 0, + 0 + }; + worker.globalId = new int[] { + 0, + 0, + 0 + }; return worker; } catch (CloneNotSupportedException e) { // TODO Auto-generated catch block @@ -1373,23 +1513,12 @@ public abstract class Kernel implements Cloneable{ * Java version is identical to localBarrier() * * @annotion Experimental + * @deprecated */ - @OpenCLDelegate @Annotations.Experimental protected final void globalBarrier() { - try { - localBarrier.await(); - } catch (InterruptedException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } catch (BrokenBarrierException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - } - - final void setSizes(int _globalSize, int _localSize) { - localSize = _localSize; - globalSize = _globalSize; + @OpenCLDelegate @Annotations.Experimental @Deprecated() protected final void globalBarrier() throws DeprecatedException { + throw new DeprecatedException( + "Kernel.globalBarrier() has been deprecated. It was based an incorrect understanding of OpenCL functionality."); } @@ -1441,23 +1570,38 @@ public abstract class Kernel implements Cloneable{ } /** - * Start execution of <code>globalSize</code> kernels. + * Start execution of <code>_range</code> kernels. * <p> * When <code>kernel.execute(globalSize)</code> is invoked, Aparapi will schedule the execution of <code>globalSize</code> kernels. If the execution mode is GPU then * the kernels will execute as OpenCL code on the GPU device. Otherwise, if the mode is JTP, the kernels will execute as a pool of Java threads on the CPU. * <p> - * @param _globalSize The number of Kernels that we would like to initiate. + * @param range The number of Kernels that we would like to initiate. + * @returnThe Kernel instance (this) so we can chain calls to put(arr).execute(range).get(arr) + * + */ + public synchronized Kernel execute(Range _range) { + return (execute(_range, 1)); + } + + /** + * Start execution of <code>_range</code> kernels. + * <p> + * When <code>kernel.execute(_range)</code> is invoked, Aparapi will schedule the execution of <code>_range</code> kernels. If the execution mode is GPU then + * the kernels will execute as OpenCL code on the GPU device. Otherwise, if the mode is JTP, the kernels will execute as a pool of Java threads on the CPU. + * <p> + * Since adding the new <code>Range class</code> this method offers backward compatibility and merely defers to <code> return (execute(Range.create(_range), 1));</code>. + * @param _range The number of Kernels that we would like to initiate. * @returnThe Kernel instance (this) so we can chain calls to put(arr).execute(range).get(arr) * */ - public synchronized Kernel execute(int _globalSize) { - return (execute(_globalSize, 1)); + public synchronized Kernel execute(int _range) { + return (execute(Range.create(_range), 1)); } /** - * Start execution of <code>_passes</code> iterations of <code>globalSize</code> kernels. + * Start execution of <code>_passes</code> iterations of <code>_range</code> kernels. * <p> - * When <code>kernel.execute(globalSize, passes)</code> is invoked, Aparapi will schedule the execution of <code>globalSize</code> kernels. If the execution mode is GPU then + * When <code>kernel.execute(_range, _passes)</code> is invoked, Aparapi will schedule the execution of <code>_reange</code> kernels. If the execution mode is GPU then * the kernels will execute as OpenCL code on the GPU device. Otherwise, if the mode is JTP, the kernels will execute as a pool of Java threads on the CPU. * <p> * @param _globalSize The number of Kernels that we would like to initiate. @@ -1465,8 +1609,23 @@ public abstract class Kernel implements Cloneable{ * @return The Kernel instance (this) so we can chain calls to put(arr).execute(range).get(arr) * */ - public synchronized Kernel execute(int _globalSize, int _passes) { - return (execute("run", _globalSize, _passes)); + public synchronized Kernel execute(Range _range, int _passes) { + return (execute("run", _range, _passes)); + } + + /** + * Start execution of <code>_passes</code> iterations over the <code>_range</code> of kernels. + * <p> + * When <code>kernel.execute(_range)</code> is invoked, Aparapi will schedule the execution of <code>_range</code> kernels. If the execution mode is GPU then + * the kernels will execute as OpenCL code on the GPU device. Otherwise, if the mode is JTP, the kernels will execute as a pool of Java threads on the CPU. + * <p> + * Since adding the new <code>Range class</code> this method offers backward compatibility and merely defers to <code> return (execute(Range.create(_range), 1));</code>. + * @param _range The number of Kernels that we would like to initiate. + * @returnThe Kernel instance (this) so we can chain calls to put(arr).execute(range).get(arr) + * + */ + public synchronized Kernel execute(int _range, int _passes) { + return (execute(Range.create(_range), _passes)); } /** @@ -1480,12 +1639,12 @@ public abstract class Kernel implements Cloneable{ * @return The Kernel instance (this) so we can chain calls to put(arr).execute(range).get(arr) * */ - public synchronized Kernel execute(Entry _entry, int _globalSize) { + public synchronized Kernel execute(Entry _entry, Range _range) { if (kernelRunner == null) { kernelRunner = new KernelRunner(this); } - return (kernelRunner.execute(_entry, _globalSize, 1)); + return (kernelRunner.execute(_entry, _range, 1)); } /** @@ -1499,8 +1658,8 @@ public abstract class Kernel implements Cloneable{ * @return The Kernel instance (this) so we can chain calls to put(arr).execute(range).get(arr) * */ - public synchronized Kernel execute(String _entrypoint, int _globalSize) { - return (execute(_entrypoint, _globalSize, 1)); + public synchronized Kernel execute(String _entrypoint, Range _range) { + return (execute(_entrypoint, _range, 1)); } @@ -1515,12 +1674,12 @@ public abstract class Kernel implements Cloneable{ * @return The Kernel instance (this) so we can chain calls to put(arr).execute(range).get(arr) * */ - public synchronized Kernel execute(String _entrypoint, int _globalSize, int _passes) { + public synchronized Kernel execute(String _entrypoint, Range _range, int _passes) { if (kernelRunner == null) { kernelRunner = new KernelRunner(this); } - return (kernelRunner.execute(_entrypoint, _globalSize, _passes)); + return (kernelRunner.execute(_entrypoint, _range, _passes)); } @@ -1693,11 +1852,6 @@ public abstract class Kernel implements Cloneable{ return (false); } - void setLocalSize(int _localSize) { - localSize = _localSize; - - } - // the flag useNullForLocalSize is useful for testing that what we compute for localSize is what OpenCL // would also compute if we passed in null. In non-testing mode, we just call execute with the // same localSize that we computed in getLocalSizeJNI. We don't want do publicize these of course. diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/KernelRunner.java b/com.amd.aparapi/src/java/com/amd/aparapi/KernelRunner.java index 87e7de463bddcbc308df4189a204dca722f27a52..ab2f07f56a1a3cd01d1afc9ac4a5e5e5f9a20721 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/KernelRunner.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/KernelRunner.java @@ -70,6 +70,9 @@ import com.amd.aparapi.Kernel.EXECUTION_MODE; * */ class KernelRunner{ + /** + * Be careful changing the name/type of this field as it is referenced from JNI code. + */ public @interface UsedByJNICode { } @@ -305,7 +308,7 @@ class KernelRunner{ * * @author gfrost */ - @UsedByJNICode public static final int ARG_APARAPI_BUF_IS_DIRECT = 1 << 20; + // @UsedByJNICode public static final int ARG_APARAPI_BUF_IS_DIRECT = 1 << 20; /** * This 'bit' indicates that a particular <code>KernelArg</code> represents a <code>char</code> type (array or primitive). @@ -447,7 +450,7 @@ class KernelRunner{ * * At present only set for AparapiLocalBuffer objs, JNI multiplies this by localSize */ - @Annotations.Unused @UsedByJNICode public int bytesPerLocalSize; + // @Annotations.Unused @UsedByJNICode public int bytesPerLocalWidth; /** * Only set for array objs, not used on JNI @@ -527,21 +530,25 @@ class KernelRunner{ * @param maxJTPLocalSize * @return */ - @Annotations.DocMe private native static synchronized long initJNI(Kernel _kernel, int _flags, int numProcessors, - int maxJTPLocalSize); + @Annotations.DocMe private native static synchronized long initJNI(Kernel _kernel, int _flags); private native long buildProgramJNI(long _jniContextHandle, String _source); private native int setArgsJNI(long _jniContextHandle, KernelArg[] _args, int argc); - private native int runKernelJNI(long _jniContextHandle, int _globalSize, int _localSize, boolean _needSync, - boolean useNullForLocalSize, int _passes); + private native int runKernelJNI(long _jniContextHandle, Range _range, boolean _needSync, int _passes); private native int disposeJNI(long _jniContextHandle); - private native int getLocalSizeJNI(long _jniContextHandle, int _globalSize, int localBytesPerLocalId); + private native String getExtensionsJNI(long _jniContextHandle); + + private native int getMaxWorkGroupSizeJNI(long _jniContextHandle); + + private native int getMaxWorkItemSizeJNI(long _jniContextHandle, int _index); + + private native int getMaxComputeUnitsJNI(long _jniContextHandle); - private native String getExtensions(long _jniContextHandle); + private native int getMaxWorkItemDimensionsJNI(long _jniContextHandle); private Set<String> capabilitiesSet; @@ -635,34 +642,6 @@ class KernelRunner{ return capabilitiesSet.contains(CL_KHR_GL_SHARING); } - private static int numCores = Runtime.getRuntime().availableProcessors(); - - private static int maxJTPLocalSize = Config.JTPLocalSizeMultiplier * numCores; - - static int getMaxJTPLocalSize() { - return maxJTPLocalSize; - } - - /** - * We need to match OpenCL's algorithm for localsize. - * - * @param _globalSize - * The globalsize requested by the user (via <code>Kernel.execute(globalSize)</code>) - * @return The value we use for JTP execution for localSize - */ - private static int getJTPLocalSizeForGlobalSize(int _globalSize) { - // iterate down until we find a localSize that divides _globalSize equally - for (int localSize = getMaxJTPLocalSize(); localSize > 0; localSize--) { - if (_globalSize % localSize == 0) { - if (logger.isLoggable(Level.FINE)) { - logger.fine("executeJava: picking localSize=" + localSize + " for globalSize=" + _globalSize); - } - return localSize; - } - } - return 0; - } - /** * Execute using a Java thread pool. Either because we were explicitly asked to do so, or because we 'fall back' after discovering an OpenCL issue. * @@ -672,108 +651,272 @@ class KernelRunner{ * The # of passes requested by the user (via <code>Kernel.execute(globalSize, passes)</code>). Note this is usually defaulted to 1 via <code>Kernel.execute(globalSize)</code>. * @return */ - private long executeJava(final int _globalSize, final int _passes) { + private long executeJava(final Range _range, final int _passes) { if (logger.isLoggable(Level.FINE)) { - logger.fine("executeJava: _globalSize=" + _globalSize); + logger.fine("executeJava: range = " + _range); } if (kernel.getExecutionMode().equals(EXECUTION_MODE.SEQ)) { - kernel.localBarrier = new CyclicBarrier(1); - - kernel.setSizes(_globalSize, 1); + /** + * SEQ mode is useful for testing trivial logic, but kernels which use SEQ mode cannot be used if the + * product of localSize(0..3) is >1. So we can use multi-dim ranges but only if the local size is 1 in all dimensions. + * + * As a result of this barrier is only ever 1 work item wide and probably should be turned into a no-op. + * + * So we need to check if the range is valid here. If not we have no choice but to punt. + */ + if (_range.getLocalSize(0) * _range.getLocalSize(1) * _range.getLocalSize(2) > 1) { + throw new IllegalStateException("Can't run range with group size >1 sequentially. Barriers would deadlock!"); + } - kernel.setNumGroups(_globalSize); - Kernel worker = (Kernel) kernel.clone(); - for (int passid = 0; passid < _passes; passid++) { - worker.setPassId(passid); - for (int id = 0; id < _globalSize; id++) { - worker.setGroupId(id); - worker.setGlobalId(id); - worker.setLocalId(0); - worker.run(); + Kernel kernelClone = (Kernel) kernel.clone(); + kernelClone.range = _range; + kernelClone.groupId[0] = 0; + kernelClone.groupId[1] = 0; + kernelClone.groupId[2] = 0; + kernelClone.localId[0] = 0; + kernelClone.localId[1] = 0; + kernelClone.localId[2] = 0; + kernelClone.localBarrier = new CyclicBarrier(1); + for (kernelClone.passId = 0; kernelClone.passId < _passes; kernelClone.passId++) { + + if (_range.getDims() == 1) { + for (int id = 0; id < _range.getGlobalSize(0); id++) { + kernelClone.globalId[0] = id; + kernelClone.run(); + } + } else if (_range.getDims() == 2) { + for (int x = 0; x < _range.getGlobalSize(0); x++) { + kernelClone.globalId[0] = x; + for (int y = 0; y < _range.getGlobalSize(1); y++) { + kernelClone.globalId[1] = y; + kernelClone.run(); + } + } + } else if (_range.getDims() == 3) { + for (int x = 0; x < _range.getGlobalSize(0); x++) { + kernelClone.globalId[0] = x; + for (int y = 0; y < _range.getGlobalSize(1); y++) { + kernelClone.globalId[1] = y; + for (int z = 0; z < _range.getGlobalSize(2); z++) { + kernelClone.globalId[2] = z; + kernelClone.run(); + } + kernelClone.run(); + } + } } } + } else { - // note uses of final so we can use in anonymous inner class - final int localSize = getJTPLocalSizeForGlobalSize(_globalSize); - // if (localSize == 0) return 0; // should never happen - final int numGroups = _globalSize / localSize; - - // compute numThreadSets by multiplying localSize until bigger than numCores - final int numThreadSets = localSize >= numCores ? 1 : (numCores + (localSize - 1)) / localSize; - final int numThreads = numThreadSets * localSize; - // when dividing to get groupsPerThreadSet, round up - final int groupsPerThreadSet = (numGroups + (numThreadSets - 1)) / numThreadSets; - if (logger.isLoggable(Level.FINE)) { - logger.fine("executeJava: localSize=" + localSize + ", numThreads=" + numThreads + ", numThreadSets=" + numThreadSets - + ", numGroups=" + numGroups); - } - Thread[] threads = new Thread[numThreads]; - // joinBarrier that says all threads are finished - final CyclicBarrier joinBarrier = new CyclicBarrier(numThreads + 1); - - // each threadSet shares a CyclicBarrier of size localSize - final CyclicBarrier localBarriers[] = new CyclicBarrier[numThreadSets]; - kernel.setSizes(_globalSize, localSize); - kernel.setNumGroups(numGroups); - for (int passid = 0; passid < _passes; passid++) { - kernel.setPassId(passid); - for (int thrSetId = 0; thrSetId < numThreadSets; thrSetId++) { - final int startGroupId = thrSetId * groupsPerThreadSet; - final int endGroupId = Math.min((thrSetId + 1) * groupsPerThreadSet, numGroups); - // System.out.println("thrSetId=" + thrSetId + " running groups from " + startGroupId + " thru " + (endGroupId-1)); - localBarriers[thrSetId] = new CyclicBarrier(localSize); - - // each threadSet has localSize threads - for (int lid = 0; lid < localSize; lid++) { // for each thread in threadSet - final int localId = lid; - final int threadId = thrSetId * localSize + localId; - final Kernel worker = (Kernel) kernel.clone(); - worker.setLocalId(localId); - worker.localBarrier = localBarriers[thrSetId]; // barrier that the kernel has access to - - threads[threadId] = new Thread(new Runnable(){ - @Override public void run() { - for (int groupId = startGroupId; groupId < endGroupId; groupId++) { - int globalId = (groupId * localSize) + localId; - worker.setGroupId(groupId); - worker.setGlobalId(globalId); - // System.out.println("running worker with gid=" + globalId + ", lid=" + localId - // + ", groupId=" + groupId + ", threadId=" + threadId); - worker.run(); - } - try { - joinBarrier.await(); - } catch (InterruptedException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } catch (BrokenBarrierException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } + final int threads = _range.getLocalSize(0) * _range.getLocalSize(1) * _range.getLocalSize(2); + final int globalGroups = _range.getNumGroups(0) * _range.getNumGroups(1) * _range.getNumGroups(2); + final Thread threadArray[] = new Thread[threads]; + /** + * This joinBarrier is the barrier that we provide for the kernel threads to rendezvous with the current dispatch thread. + * So this barrier is threadCount+1 wide (the +1 is for the dispatch thread) + */ + final CyclicBarrier joinBarrier = new CyclicBarrier(threads + 1); + + /** + * This localBarrier is only ever used by the kernels. If the kernel does not use the barrier the threads + * can get out of sync, we promised nothing in JTP mode. + * + * As with OpenCL all threads within a group must wait at the barrier or none. It is a user error (possible deadlock!) + * if the barrier is in a conditional that is only executed by some of the threads within a group. + * + * Kernel developer must understand this. + * + * This barrier is threadCount wide. We never hit the barrier from the dispatch thread. + */ + final CyclicBarrier localBarrier = new CyclicBarrier(threads); + + /** + * Note that we emulate OpenCL by creating one thread per localId (across the group). + * + * So threadCount == range.getLocalSize(0)*range.getLocalSize(1)*range.getLocalSize(2); + * + * For a 1D range of 12 groups of 4 we create 4 threads. One per localId(0). + * + * We also clone the kernel 4 times. One per thread. + * + * We create local barrier which has a width of 4 + * + * Thread-0 handles localId(0) (global 0,4,8) + * Thread-1 handles localId(1) (global 1,5,7) + * Thread-2 handles localId(2) (global 2,6,10) + * Thread-3 handles localId(3) (global 3,7,11) + * + * This allows all threads to synchronize using the local barrier. + * + * Initially the use of local buffers seems broken as the buffers appears to be per Kernel. + * Thankfully Kernel.clone() performs a shallow clone of all buffers (local and global) + * So each of the cloned kernels actually still reference the same underlying local/global buffers. + * + * If the kernel uses local buffers but does not use barriers then it is possible for different groups + * to see mutations from each other (unlike OpenCL), however if the kernel does not us barriers then it + * cannot assume any coherence in OpenCL mode either (the failure mode will be different but still wrong) + * + * So even JTP mode use of local buffers will need to use barriers. Not for the same reason as OpenCL but to keep groups in lockstep. + * + **/ + + for (int id = 0; id < threads; id++) { + final int threadId = id; + + /** + * We clone one kernel for each thread. + * + * They will all share references to the same range, localBarrier and global/local buffers because the clone is shallow. + * We need clones so that each thread can assign 'state' (localId/globalId/groupId) without worrying + * about other threads. + */ + final Kernel kernelClone = (Kernel) kernel.clone(); + kernelClone.range = _range; + kernelClone.localBarrier = localBarrier; + + threadArray[threadId] = new Thread(new Runnable(){ + @Override public void run() { + for (int globalGroupId = 0; globalGroupId < globalGroups; globalGroupId++) { + + if (_range.getDims() == 1) { + kernelClone.localId[0] = threadId % _range.getLocalSize(0); + kernelClone.globalId[0] = threadId + globalGroupId * threads; + kernelClone.groupId[0] = globalGroupId; + } else if (_range.getDims() == 2) { + + /** + * Consider a 12x4 grid of 4*2 local groups + * <pre> + * threads = 4*2 = 8 + * localWidth=4 + * localHeight=2 + * globalWidth=12 + * globalHeight=4 + * + * 00 01 02 03 | 04 05 06 07 | 08 09 10 11 + * 12 13 14 15 | 16 17 18 19 | 20 21 22 23 + * ------------+-------------+------------ + * 24 25 26 27 | 28 29 30 31 | 32 33 34 35 + * 36 37 38 39 | 40 41 42 43 | 44 45 46 47 + * + * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 threadIds : [0..7]*6 + * 04 05 06 07 | 04 05 06 07 | 04 05 06 07 + * ------------+-------------+------------ + * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 + * 04 05 06 07 | 04 05 06 07 | 04 05 06 07 + * + * 00 00 00 00 | 01 01 01 01 | 02 02 02 02 groupId[0] : 0..6 + * 00 00 00 00 | 01 01 01 01 | 02 02 02 02 + * ------------+-------------+------------ + * 00 00 00 00 | 01 01 01 01 | 02 02 02 02 + * 00 00 00 00 | 01 01 01 01 | 02 02 02 02 + * + * 00 00 00 00 | 00 00 00 00 | 00 00 00 00 groupId[1] : 0..6 + * 00 00 00 00 | 00 00 00 00 | 00 00 00 00 + * ------------+-------------+------------ + * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 + * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 + * + * 00 01 02 03 | 08 09 10 11 | 16 17 18 19 globalThreadIds == threadId + groupId * threads; + * 04 05 06 07 | 12 13 14 15 | 20 21 22 23 + * ------------+-------------+------------ + * 24 25 26 27 | 32[33]34 35 | 40 41 42 43 + * 28 29 30 31 | 36 37 38 39 | 44 45 46 47 + * + * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 localX = threadId % localWidth; (for globalThreadId 33 = threadId = 01 : 01%4 =1) + * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 + * ------------+-------------+------------ + * 00 01 02 03 | 00[01]02 03 | 00 01 02 03 + * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 + * + * 00 00 00 00 | 00 00 00 00 | 00 00 00 00 localY = threadId /localWidth (for globalThreadId 33 = threadId = 01 : 01/4 =0) + * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 + * ------------+-------------+------------ + * 00 00 00 00 | 00[00]00 00 | 00 00 00 00 + * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 + * + * 00 01 02 03 | 04 05 06 07 | 08 09 10 11 globalX= + * 00 01 02 03 | 04 05 06 07 | 08 09 10 11 groupsPerLineWidth=globalWidth/localWidth (=12/4 =3) + * ------------+-------------+------------ groupInset =groupId%groupsPerLineWidth (=4%3 = 1) + * 00 01 02 03 | 04[05]06 07 | 08 09 10 11 + * 00 01 02 03 | 04 05 06 07 | 08 09 10 11 globalX = groupInset*localWidth+localX (= 1*4+1 = 5) + * + * 00 00 00 00 | 00 00 00 00 | 00 00 00 00 globalY + * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 + * ------------+-------------+------------ + * 02 02 02 02 | 02[02]02 02 | 02 02 02 02 + * 03 03 03 03 | 03 03 03 03 | 03 03 03 03 + * + * </pre> + * Assume we are trying to locate the id's for #33 + * + */ + + kernelClone.localId[0] = threadId % _range.getLocalSize(0); // threadId % localWidth = (for 33 = 1 % 4 = 1) + kernelClone.localId[1] = threadId / _range.getLocalSize(0); // threadId / localWidth = (for 33 = 1 / 4 == 0) + + int groupInset = globalGroupId % _range.getNumGroups(0); // 4%3 = 1 + kernelClone.globalId[0] = groupInset * _range.getLocalSize(0) + kernelClone.localId[0]; // 1*4+1=5 + + int completeLines = (globalGroupId / _range.getNumGroups(0)) * _range.getLocalSize(1);// (4/3) * 2 + kernelClone.globalId[1] = completeLines + kernelClone.localId[1]; // 2+0 = 2 + kernelClone.groupId[0] = globalGroupId % _range.getNumGroups(0); + kernelClone.groupId[1] = globalGroupId / _range.getNumGroups(0); + } else if (_range.getDims() == 3) { + + //Same as 2D actually turns out that localId[0] is identical for all three dims so could be hoisted out of conditional code + + kernelClone.localId[0] = threadId % _range.getLocalSize(0); + + kernelClone.localId[1] = (threadId / _range.getLocalSize(0)) % _range.getLocalSize(1); + + // the thread id's span WxHxD so threadId/(WxH) should yield the local depth + kernelClone.localId[2] = threadId / (_range.getLocalSize(0) * _range.getLocalSize(1)); + + kernelClone.globalId[0] = (globalGroupId % _range.getNumGroups(0)) * _range.getLocalSize(0) + + kernelClone.localId[0]; + + kernelClone.globalId[1] = ((globalGroupId / _range.getNumGroups(0)) * _range.getLocalSize(1)) + % _range.getGlobalSize(1) + kernelClone.localId[1]; + + kernelClone.globalId[2] = (globalGroupId / (_range.getNumGroups(0) * _range.getNumGroups(1))) + * _range.getLocalSize(2) + kernelClone.localId[2]; + + kernelClone.groupId[0] = globalGroupId % _range.getNumGroups(0); + kernelClone.groupId[1] = (globalGroupId / _range.getNumGroups(0)) % _range.getNumGroups(1); + kernelClone.groupId[2] = globalGroupId / (_range.getNumGroups(0) * _range.getNumGroups(1)); } - }); - threads[threadId].start(); - } + kernelClone.run(); - // this is where the main thread waits on the join barrier - try { - joinBarrier.await(); - } catch (InterruptedException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } catch (BrokenBarrierException e) { - // TODO Auto-generated catch block - e.printStackTrace(); + } + await(joinBarrier); // This thread will rendezvous with dispatch thread here. This is effectively a join. } - } + }); + threadArray[threadId].setName("aparapi-" + threadId + "/" + threads); + threadArray[threadId].start(); + } + await(joinBarrier); // This dispatch thread waits for all worker threads here. + } // execution mode == JTP return 0; } + private static void await(CyclicBarrier _barrier) { + try { + _barrier.await(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (BrokenBarrierException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + private KernelArg[] args = null; private boolean usesOopConversion = false; @@ -1036,12 +1179,8 @@ class KernelRunner{ } } - // this routine now also finds out how many perLocalItem bytes are specified for this kernel - private int localBytesPerLocalId = 0; - private boolean updateKernelArrayRefs() throws AparapiException { boolean needsSync = false; - localBytesPerLocalId = 0; for (int i = 0; i < argc; i++) { KernelArg arg = args[i]; @@ -1089,37 +1228,59 @@ class KernelRunner{ // private int numAvailableProcessors = Runtime.getRuntime().availableProcessors(); - private Kernel executeOpenCL(final String _entrypointName, final int _globalSize, final int _passes) throws AparapiException { + private Kernel executeOpenCL(final String _entrypointName, final Range _range, final int _passes) throws AparapiException { + if (_range.getDims() > getMaxWorkItemDimensionsJNI(jniContextHandle)) { + throw new RangeException("Range dim size " + _range.getDims() + " > device " + + getMaxWorkItemDimensionsJNI(jniContextHandle)); + } + if (_range.getWorkGroupSize() > getMaxWorkGroupSizeJNI(jniContextHandle)) { + throw new RangeException("Range workgroup size " + _range.getWorkGroupSize() + " > device " + + getMaxWorkGroupSizeJNI(jniContextHandle)); + } + /* + if (_range.getGlobalSize(0) > getMaxWorkItemSizeJNI(jniContextHandle, 0)) { + throw new RangeException("Range globalsize 0 " + _range.getGlobalSize(0) + " > device " + + getMaxWorkItemSizeJNI(jniContextHandle, 0)); + } + if (_range.getDims() > 1) { + if (_range.getGlobalSize(1) > getMaxWorkItemSizeJNI(jniContextHandle, 1)) { + throw new RangeException("Range globalsize 1 " + _range.getGlobalSize(1) + " > device " + + getMaxWorkItemSizeJNI(jniContextHandle, 1)); + } + if (_range.getDims() > 2) { + if (_range.getGlobalSize(2) > getMaxWorkItemSizeJNI(jniContextHandle, 2)) { + throw new RangeException("Range globalsize 2 " + _range.getGlobalSize(2) + " > device " + + getMaxWorkItemSizeJNI(jniContextHandle, 2)); + } + } + } + */ + + if (logger.isLoggable(Level.FINE)) { + logger.fine("maxComputeUnits=" + this.getMaxComputeUnitsJNI(jniContextHandle)); + logger.fine("maxWorkGroupSize=" + this.getMaxWorkGroupSizeJNI(jniContextHandle)); + logger.fine("maxWorkItemDimensions=" + this.getMaxWorkItemDimensionsJNI(jniContextHandle)); + logger.fine("maxWorkItemSize(0)=" + getMaxWorkItemSizeJNI(jniContextHandle, 0)); + if (_range.getDims() > 1) { + logger.fine("maxWorkItemSize(1)=" + getMaxWorkItemSizeJNI(jniContextHandle, 1)); + if (_range.getDims() > 2) { + logger.fine("maxWorkItemSize(2)=" + getMaxWorkItemSizeJNI(jniContextHandle, 2)); + } + } + } // Read the array refs after kernel may have changed them // We need to do this as input to computing the localSize assert args != null : "args should not be null"; boolean needSync = updateKernelArrayRefs(); - - // note: the above will also recompute the value localBytesPerLocalId - - int localSize = getLocalSizeJNI(jniContextHandle, _globalSize, localBytesPerLocalId); - if (localSize == 0) { - // should fall back to java? - logger.warning("getLocalSizeJNI failed, reverting java"); - kernel.setFallbackExecutionMode(); - return execute(_entrypointName, _globalSize, _passes); - } - assert localSize <= _globalSize : "localSize = " + localSize; - - // Call back to kernel for last minute changes - kernel.setSizes(_globalSize, localSize); - if (needSync && logger.isLoggable(Level.FINE)) { logger.fine("Need to resync arrays on " + kernel.getClass().getName()); } - // native side will reallocate array buffers if necessary - if (runKernelJNI(jniContextHandle, _globalSize, localSize, needSync, kernel.useNullForLocalSize, _passes) != 0) { - //System.out.println("CL exec seems to have failed"); + if (runKernelJNI(jniContextHandle, _range, needSync, _passes) != 0) { logger.warning("### CL exec seems to have failed. Trying to revert to Java ###"); kernel.setFallbackExecutionMode(); - return execute(_entrypointName, _globalSize, _passes); + return execute(_entrypointName, _range, _passes); } if (usesOopConversion == true) { @@ -1127,42 +1288,41 @@ class KernelRunner{ } if (logger.isLoggable(Level.FINE)) { - logger.fine("executeOpenCL completed. _globalSize=" + _globalSize); + logger.fine("executeOpenCL completed. " + _range); } return kernel; } - synchronized Kernel execute(Kernel.Entry entry, final int _globalSize, final int _passes) { + synchronized Kernel execute(Kernel.Entry entry, final Range _range, final int _passes) { System.out.println("execute(Kernel.Entry, size) not implemented"); return (kernel); } - synchronized private Kernel fallBackAndExecute(String _entrypointName, final int _globalSize, final int _passes) { + synchronized private Kernel fallBackAndExecute(String _entrypointName, final Range _range, final int _passes) { kernel.setFallbackExecutionMode(); - return execute(_entrypointName, _globalSize, _passes); + return execute(_entrypointName, _range, _passes); } - synchronized private Kernel warnFallBackAndExecute(String _entrypointName, final int _globalSize, final int _passes, + synchronized private Kernel warnFallBackAndExecute(String _entrypointName, final Range _range, final int _passes, Exception _exception) { if (logger.isLoggable(Level.WARNING)) { logger.warning("Reverting to Java Thread Pool (JTP) for " + kernel.getClass() + ": " + _exception.getMessage()); _exception.printStackTrace(); } - return fallBackAndExecute(_entrypointName, _globalSize, _passes); + return fallBackAndExecute(_entrypointName, _range, _passes); } - synchronized private Kernel warnFallBackAndExecute(String _entrypointName, final int _globalSize, final int _passes, - String _excuse) { + synchronized private Kernel warnFallBackAndExecute(String _entrypointName, final Range _range, final int _passes, String _excuse) { logger.warning("Reverting to Java Thread Pool (JTP) for " + kernel.getClass() + ": " + _excuse); - return fallBackAndExecute(_entrypointName, _globalSize, _passes); + return fallBackAndExecute(_entrypointName, _range, _passes); } - synchronized Kernel execute(String _entrypointName, final int _globalSize, final int _passes) { + synchronized Kernel execute(String _entrypointName, final Range _range, final int _passes) { long executeStartTime = System.currentTimeMillis(); - if (_globalSize == 0) { - throw new IllegalStateException("global size can't be 0"); + if (_range == null) { + throw new IllegalStateException("range can't be null"); } if (kernel.getExecutionMode().isOpenCL()) { @@ -1173,7 +1333,7 @@ class KernelRunner{ entryPoint = classModel.getEntrypoint(_entrypointName, kernel); } catch (Exception exception) { - return warnFallBackAndExecute(_entrypointName, _globalSize, _passes, exception); + return warnFallBackAndExecute(_entrypointName, _range, _passes, exception); } if ((entryPoint != null) && !entryPoint.shouldFallback()) { @@ -1184,12 +1344,12 @@ class KernelRunner{ jniFlags |= (kernel.getExecutionMode().equals(EXECUTION_MODE.GPU) ? JNI_FLAG_USE_GPU : 0); // Init the device to check capabilities before emitting the // code that requires the capabilities. - jniContextHandle = initJNI(kernel, jniFlags, Runtime.getRuntime().availableProcessors(), getMaxJTPLocalSize()); + jniContextHandle = initJNI(kernel, jniFlags); if (jniContextHandle == 0) { - return warnFallBackAndExecute(_entrypointName, _globalSize, _passes, "initJNI failed to return a valid handle"); + return warnFallBackAndExecute(_entrypointName, _range, _passes, "initJNI failed to return a valid handle"); } - String extensions = getExtensions(jniContextHandle); + String extensions = getExtensionsJNI(jniContextHandle); capabilitiesSet = new HashSet<String>(); StringTokenizer strTok = new StringTokenizer(extensions); while (strTok.hasMoreTokens()) { @@ -1201,12 +1361,12 @@ class KernelRunner{ if (entryPoint.requiresDoublePragma() && !hasFP64Support()) { - return warnFallBackAndExecute(_entrypointName, _globalSize, _passes, "FP64 required but not supported"); + return warnFallBackAndExecute(_entrypointName, _range, _passes, "FP64 required but not supported"); } if (entryPoint.requiresByteAddressableStorePragma() && !hasByteAddressableStoreSupport()) { - return warnFallBackAndExecute(_entrypointName, _globalSize, _passes, + return warnFallBackAndExecute(_entrypointName, _range, _passes, "Byte addressable stores required but not supported"); } @@ -1215,24 +1375,16 @@ class KernelRunner{ if (entryPoint.requiresAtomic32Pragma() && !all32AtomicsAvailable) { - return warnFallBackAndExecute(_entrypointName, _globalSize, _passes, "32 bit Atomics required but not supported"); + return warnFallBackAndExecute(_entrypointName, _range, _passes, "32 bit Atomics required but not supported"); } - final StringBuilder openCLStringBuilder = new StringBuilder(); - KernelWriter openCLWriter = new KernelWriter(){ - @Override public void write(String _string) { - openCLStringBuilder.append(_string); - } - }; - - // Emit the OpenCL source into a string + String openCL = null; try { - openCLWriter.write(entryPoint); - + openCL = KernelWriter.writeToString(entryPoint); } catch (CodeGenException codeGenException) { - return warnFallBackAndExecute(_entrypointName, _globalSize, _passes, codeGenException); + return warnFallBackAndExecute(_entrypointName, _range, _passes, codeGenException); } - String openCL = openCLStringBuilder.toString(); + if (Config.enableShowGeneratedOpenCL) { System.out.println(openCL); } @@ -1241,9 +1393,8 @@ class KernelRunner{ } // Send the string to OpenCL to compile it - if (buildProgramJNI(jniContextHandle, openCLStringBuilder.toString()) == 0) { - - return warnFallBackAndExecute(_entrypointName, _globalSize, _passes, "OpenCL compile failed"); + if (buildProgramJNI(jniContextHandle, openCL) == 0) { + return warnFallBackAndExecute(_entrypointName, _range, _passes, "OpenCL compile failed"); } args = new KernelArg[entryPoint.getReferencedFields().size()]; @@ -1256,9 +1407,15 @@ class KernelRunner{ args[i].name = field.getName(); args[i].field = field; args[i].isStatic = (field.getModifiers() & Modifier.STATIC) == Modifier.STATIC; - Class<?> type = field.getType(); if (type.isArray()) { + + if (field.getAnnotation(com.amd.aparapi.Kernel.Local.class) != null + || args[i].name.endsWith(Kernel.LOCAL_SUFFIX)) { + args[i].type |= ARG_LOCAL; + } else { + args[i].type |= ARG_GLOBAL; + } args[i].array = null; // will get updated in updateKernelArrayRefs args[i].type |= ARG_ARRAY; if (isExplicit()) { @@ -1269,7 +1426,7 @@ class KernelRunner{ args[i].type |= entryPoint.getArrayFieldAssignments().contains(field.getName()) ? (ARG_WRITE | ARG_READ) : 0; args[i].type |= entryPoint.getArrayFieldAccesses().contains(field.getName()) ? ARG_READ : 0; - args[i].type |= ARG_GLOBAL; + // args[i].type |= ARG_GLOBAL; args[i].type |= type.isAssignableFrom(float[].class) ? ARG_FLOAT : 0; args[i].type |= type.isAssignableFrom(int[].class) ? ARG_INT : 0; @@ -1351,26 +1508,26 @@ class KernelRunner{ conversionTime = System.currentTimeMillis() - executeStartTime; try { - executeOpenCL(_entrypointName, _globalSize, _passes); + executeOpenCL(_entrypointName, _range, _passes); } catch (AparapiException e) { - warnFallBackAndExecute(_entrypointName, _globalSize, _passes, e); + warnFallBackAndExecute(_entrypointName, _range, _passes, e); } } else { - warnFallBackAndExecute(_entrypointName, _globalSize, _passes, "failed to locate entrypoint"); + warnFallBackAndExecute(_entrypointName, _range, _passes, "failed to locate entrypoint"); } } else { try { - executeOpenCL(_entrypointName, _globalSize, _passes); + executeOpenCL(_entrypointName, _range, _passes); } catch (AparapiException e) { - warnFallBackAndExecute(_entrypointName, _globalSize, _passes, e); + warnFallBackAndExecute(_entrypointName, _range, _passes, e); } } } else { - executeJava(_globalSize, _passes); + executeJava(_range, _passes); } if (Config.enableExecutionModeReporting) { System.out.println(kernel.getClass().getCanonicalName() + ":" + kernel.getExecutionMode()); diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/KernelWriter.java b/com.amd.aparapi/src/java/com/amd/aparapi/KernelWriter.java index 7a1886bbce1e22624b2d2d77e1630bd5c1a6b718..a9eb0f28d1a34f31c1b8470b9d13b9e10ce5e977 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/KernelWriter.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/KernelWriter.java @@ -43,10 +43,11 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import com.amd.aparapi.ClassModel.ClassModelField; import com.amd.aparapi.ClassModel.AttributePool.LocalVariableTableEntry; -import com.amd.aparapi.ClassModel.AttributePool.LocalVariableTableEntry.LocalVariableInfo; import com.amd.aparapi.ClassModel.AttributePool.RuntimeAnnotationsEntry; -import com.amd.aparapi.ClassModel.ClassModelField; +import com.amd.aparapi.ClassModel.AttributePool.LocalVariableTableEntry.LocalVariableInfo; +import com.amd.aparapi.ClassModel.AttributePool.RuntimeAnnotationsEntry.AnnotationInfo; import com.amd.aparapi.ClassModel.ConstantPool.FieldEntry; import com.amd.aparapi.ClassModel.ConstantPool.MethodEntry; import com.amd.aparapi.InstructionSet.AccessArrayElement; @@ -92,14 +93,47 @@ abstract class KernelWriter extends BlockWriter{ final static Map<String, String> javaToCLIdentifierMap = new HashMap<String, String>(); { + javaToCLIdentifierMap.put("getGlobalId()I", "get_global_id(0)"); + javaToCLIdentifierMap.put("getGlobalId(I)I", "get_global_id"); // no parenthesis if we are conveying args + javaToCLIdentifierMap.put("getGlobalX()I", "get_global_id(0)"); + javaToCLIdentifierMap.put("getGlobalY()I", "get_global_id(1)"); + javaToCLIdentifierMap.put("getGlobalZ()I", "get_global_id(2)"); + javaToCLIdentifierMap.put("getGlobalSize()I", "get_global_size(0)"); + javaToCLIdentifierMap.put("getGlobalSize(I)I", "get_global_size"); // no parenthesis if we are conveying args + javaToCLIdentifierMap.put("getGlobalWidth()I", "get_global_size(0)"); + javaToCLIdentifierMap.put("getGlobalHeight()I", "get_global_size(1)"); + javaToCLIdentifierMap.put("getGlobalDepth()I", "get_global_size(2)"); + javaToCLIdentifierMap.put("getLocalId()I", "get_local_id(0)"); + javaToCLIdentifierMap.put("getLocalId(I)I", "get_local_id"); // no parenthesis if we are conveying args + javaToCLIdentifierMap.put("getLocalX()I", "get_local_id(0)"); + javaToCLIdentifierMap.put("getLocalY()I", "get_local_id(1)"); + javaToCLIdentifierMap.put("getLocalZ()I", "get_local_id(2)"); + javaToCLIdentifierMap.put("getLocalSize()I", "get_local_size(0)"); + javaToCLIdentifierMap.put("getLocalSize(I)I", "get_local_size"); // no parenthesis if we are conveying args + javaToCLIdentifierMap.put("getLocalWidth()I", "get_local_size(0)"); + javaToCLIdentifierMap.put("getLocalHeight()I", "get_local_size(1)"); + javaToCLIdentifierMap.put("getLocalDepth()I", "get_local_size(2)"); + javaToCLIdentifierMap.put("getNumGroups()I", "get_num_groups(0)"); + javaToCLIdentifierMap.put("getNumGroups(I)I", "get_num_groups"); // no parenthesis if we are conveying args + javaToCLIdentifierMap.put("getNumGroupsX()I", "get_num_groups(0)"); + javaToCLIdentifierMap.put("getNumGroupsY()I", "get_num_groups(1)"); + javaToCLIdentifierMap.put("getNumGroupsZ()I", "get_num_groups(2)"); + javaToCLIdentifierMap.put("getGroupId()I", "get_group_id(0)"); + javaToCLIdentifierMap.put("getGroupId(I)I", "get_group_id"); // no parenthesis if we are conveying args + javaToCLIdentifierMap.put("getGroupX()I", "get_group_id(0)"); + javaToCLIdentifierMap.put("getGroupY()I", "get_group_id(1)"); + javaToCLIdentifierMap.put("getGroupZ()I", "get_group_id(2)"); + javaToCLIdentifierMap.put("getPassId()I", "get_pass_id(this)"); + javaToCLIdentifierMap.put("localBarrier()V", "barrier(CLK_LOCAL_MEM_FENCE)"); + javaToCLIdentifierMap.put("globalBarrier()V", "barrier(CLK_GLOBAL_MEM_FENCE)"); } @@ -160,7 +194,19 @@ abstract class KernelWriter extends BlockWriter{ if (barrierAndGetterMappings != null) { // this is one of the OpenCL barrier or size getter methods // write the mapping and exit - write(barrierAndGetterMappings); + if (argc > 0) { + write(barrierAndGetterMappings); + write("("); + for (int arg = 0; arg < argc; arg++) { + if ((arg != 0)) { + write(", "); + } + writeInstruction(_methodCall.getArg(arg)); + } + write(")"); + } else { + write(barrierAndGetterMappings); + } } else { String intrinsicMapping = Kernel.getMappedMethodName(_methodEntry); @@ -220,14 +266,17 @@ abstract class KernelWriter extends BlockWriter{ newLine(); } + public final static String __local = "__local"; + + public final static String __global = "__global"; + + public final static String LOCAL_ANNOTATION_NAME = "L" + Kernel.Local.class.getName().replace(".", "/") + ";"; + @Override void write(Entrypoint _entryPoint) throws CodeGenException { List<String> thisStruct = new ArrayList<String>(); List<String> argLines = new ArrayList<String>(); List<String> assigns = new ArrayList<String>(); - // hack - // for (java.lang.reflect.Field f:_entryPoint.getTheClass().getDeclaredFields()){ - entryPoint = _entryPoint; for (ClassModelField field : _entryPoint.getReferencedClassModelFields()) { @@ -239,13 +288,18 @@ abstract class KernelWriter extends BlockWriter{ String signature = field.getDescriptor(); boolean isPointer = false; + + // check the suffix + String type = field.getName().endsWith(Kernel.LOCAL_SUFFIX) ? __local : __global; RuntimeAnnotationsEntry visibleAnnotations = field.fieldAttributePool.getRuntimeVisibleAnnotationsEntry(); - String type = "__global"; if (visibleAnnotations != null) { - // for (AnnotationInfo ai : visibleAnnotations) { - // String typeDescriptor = ai.getTypeDescriptor(); - // } + for (AnnotationInfo ai : visibleAnnotations) { + String typeDescriptor = ai.getTypeDescriptor(); + if (typeDescriptor.equals(LOCAL_ANNOTATION_NAME)) { + type = __local; + } + } } if (signature.startsWith("[")) { diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/MethodModel.java b/com.amd.aparapi/src/java/com/amd/aparapi/MethodModel.java index 4d5900370e383665af9c244b789921e4f9f565de..96106eea8247bef026ce48cfbdb4bc9a8c863c46 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/MethodModel.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/MethodModel.java @@ -117,8 +117,6 @@ class MethodModel{ private boolean methodIsGetter; private boolean methodIsSetter; - - // Only setters can use putfield private boolean usesPutfield; diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/Range.java b/com.amd.aparapi/src/java/com/amd/aparapi/Range.java new file mode 100644 index 0000000000000000000000000000000000000000..d0b59cadada5ebf414dcfccbf33528817932ac7e --- /dev/null +++ b/com.amd.aparapi/src/java/com/amd/aparapi/Range.java @@ -0,0 +1,392 @@ +package com.amd.aparapi; + +import java.util.Arrays; + +/** + * + * A representation of 1, 2 or 3 dimensional range of execution. + * + * This class uses factory methods to allow one, two or three dimensional ranges to be created. + * <br/> + * For a Kernel operating over the linear range 0..1024 without a specified groups size we would create a one dimensional <code>Range</code> using + * <blockquote><pre>Range.create(1024);</pre></blockquote> + * To request the same linear range but with a groupSize of 64 (range must be a multiple of group size!) we would use + * <blockquote><pre>Range.create(1024,64);</pre></blockquote> + * To request a two dimensional range over a grid (0..width)x(0..height) where width==512 and height=256 we would use + * <blockquote><pre> + * int width=512; + * int height=256; + * Range.create2D(width,height) + * </pre></blockquote> + * Again the above does not specify the group size. One will be chosen for you. If you want to specify the groupSize (say 16x8; 16 wide by 8 high) use + * <blockquote><pre> + * int width=512; + * int height=256; + * int groupWidth=16; + * int groupHeight=8; + * Range.create2D(width, height, groupWidth, groupHeight); + * </pre></blockquote> + * Finally we can request a three dimensional range using + * <blockquote><pre> + * int width=512; + * int height=256; + * int depth=8; + * Range.create3D(width, height, depth); + * </pre></blockquote> + * And can specify a group size using + * <blockquote><pre> + * int width=512; + * int height=256; + * int depth=8; + * int groupWidth=8; + * int groupHeight=4; + * int groupDepth=2 + * Range.create3D(width, height, depth, groupWidth, groupHeight, groupDepth); + * </pre></blockquote> + */ +public class Range{ + @KernelRunner.UsedByJNICode private int globalSize_0 = 1; + + @KernelRunner.UsedByJNICode private int localSize_0 = 1; + + @KernelRunner.UsedByJNICode private int globalSize_1 = 1; + + @KernelRunner.UsedByJNICode private int localSize_1 = 1; + + @KernelRunner.UsedByJNICode private int globalSize_2 = 1; + + @KernelRunner.UsedByJNICode private int localSize_2 = 1; + + @KernelRunner.UsedByJNICode private int dims; + + @KernelRunner.UsedByJNICode private boolean valid; + + @KernelRunner.UsedByJNICode private boolean localIsDerived = false; + + /** + * Get the localSize (of the group) given the requested dimension + * + * @param _dim 0=width, 1=height, 2=depth + * @return The size of the group give the requested dimension + */ + public int getLocalSize(int _dim) { + return (_dim == 0 ? localSize_0 : (_dim == 1 ? localSize_1 : localSize_2)); + } + + /** + * Get the globalSize (of the range) given the requested dimension + * + * @param _dim 0=width, 1=height, 2=depth + * @return The size of the group give the requested dimension + */ + public int getGlobalSize(int _dim) { + return (_dim == 0 ? globalSize_0 : (_dim == 1 ? globalSize_1 : globalSize_2)); + } + + private static final int THREADS_PER_CORE = 16; + + private static final int MAX_OPENCL_GROUP_SIZE = 256; + + private static final int MAX_GROUP_SIZE = Math.max(Runtime.getRuntime().availableProcessors() * THREADS_PER_CORE, + MAX_OPENCL_GROUP_SIZE); + + /** + * Create a one dimensional range <code>0.._globalWidth</code> which is processed in groups of size _localWidth. + * <br/> + * Note that for this range to be valid : </br> <strong><code>_globalWidth > 0 && _localWidth > 0 && _localWidth < MAX_GROUP_SIZE && _globalWidth % _localWidth==0</code></strong> + * + * @param _globalWidth the overall range we wish to process + * @param _localWidth the size of the group we wish to process. + * @return A new Range with the requested dimensions + */ + public static Range create(int _globalWidth, int _localWidth) { + Range range = new Range(); + range.dims = 1; + range.globalSize_0 = _globalWidth; + range.localSize_0 = _localWidth; + range.valid = range.localSize_0 > 0 && range.localSize_0 < MAX_GROUP_SIZE && range.globalSize_0 % range.localSize_0 == 0; + return (range); + } + + /** + * Determine the set of factors for a given value. + * @param _value The value we wish to factorize. + * @return and array of factors of _value + */ + + private static int[] getFactors(int _value) { + int factors[] = new int[MAX_GROUP_SIZE]; + int factorIdx = 0; + for (int possibleFactor = 1; possibleFactor <= MAX_GROUP_SIZE; possibleFactor++) { + if (_value % possibleFactor == 0) { + factors[factorIdx++] = possibleFactor; + } + } + return (Arrays.copyOf(factors, factorIdx)); + } + + /** + * Create a one dimensional range <code>0.._globalWidth</code> with an undefined group size. + * <br/> + * Note that for this range to be valid :- </br> <strong><code>_globalWidth > 0 </code></strong> + * <br/> + * The groupsize will be chosen such that _localWidth > 0 && _localWidth < MAX_GROUP_SIZE && _globalWidth % _localWidth==0 is true + * + * We extract the factors of _globalWidth and choose the highest value. + * + * @param _globalWidth the overall range we wish to process + * @return A new Range with the requested dimensions + */ + public static Range create(int _globalWidth) { + Range withoutLocal = create(_globalWidth, 1); + withoutLocal.localIsDerived = true; + int[] factors = getFactors(withoutLocal.globalSize_0); + + withoutLocal.localSize_0 = factors[factors.length - 1]; + + withoutLocal.valid = withoutLocal.localSize_0 > 0 && withoutLocal.localSize_0 < MAX_GROUP_SIZE + && withoutLocal.globalSize_0 % withoutLocal.localSize_0 == 0; + return (withoutLocal); + } + + /** + * Create a two dimensional range 0.._globalWidth x 0.._globalHeight using a group which is _localWidth x _localHeight in size. + * <br/> + * Note that for this range to be valid _globalWidth > 0 && _globalHeight >0 && _localWidth>0 && _localHeight>0 && _localWidth*_localHeight < MAX_GROUP_SIZE && _globalWidth%_localWidth==0 && _globalHeight%_localHeight==0. + * + * @param _globalWidth the overall range we wish to process + * @return + */ + public static Range create2D(int _globalWidth, int _globalHeight, int _localWidth, int _localHeight) { + Range range = new Range(); + range.dims = 2; + range.globalSize_0 = _globalWidth; + range.localSize_0 = _localWidth; + range.globalSize_1 = _globalHeight; + range.localSize_1 = _localHeight; + range.valid = range.localSize_0 > 0 && range.localSize_1 > 0 && range.localSize_0 * range.localSize_1 < MAX_GROUP_SIZE + && range.globalSize_0 % range.localSize_0 == 0 && range.globalSize_1 % range.localSize_1 == 0; + + return (range); + } + + /** + * Create a two dimensional range <code>0.._globalWidth * 0.._globalHeight</code> choosing suitable values for <code>localWidth</code> and <code>localHeight</code>. + * <p> + * Note that for this range to be valid <code>_globalWidth > 0 && _globalHeight >0 && _localWidth>0 && _localHeight>0 && _localWidth*_localHeight < MAX_GROUP_SIZE && _globalWidth%_localWidth==0 && _globalHeight%_localHeight==0</code>. + * + * <p> + * To determine suitable values for <code>_localWidth</code> and <code>_localHeight</code> we extract the factors for <code>_globalWidth</code> and <code>_globalHeight</code> and then + * find the largest product ( <code><= MAX_GROUP_SIZE</code>) with the lowest perimeter. + * + * <p> + * For example for <code>MAX_GROUP_SIZE</code> of 16 we favor 4x4 over 1x16. + * + * @param _globalWidth the overall range we wish to process + * @return + */ + public static Range create2D(int _globalWidth, int _globalHeight) { + Range withoutLocal = create2D(_globalWidth, _globalHeight, 1, 1); + withoutLocal.localIsDerived = true; + int[] widthFactors = getFactors(_globalWidth); + int[] heightFactors = getFactors(_globalHeight); + + withoutLocal.localSize_0 = 1; + withoutLocal.localSize_1 = 1; + int max = 1; + int perimeter = 0; + for (int w : widthFactors) { + for (int h : heightFactors) { + int size = w * h; + if (size > MAX_GROUP_SIZE) { + break; + } + + if (size > max) { + max = size; + perimeter = w + h; + withoutLocal.localSize_0 = w; + withoutLocal.localSize_1 = h; + } else if (size == max) { + int localPerimeter = w + h; + if (localPerimeter < perimeter) {// is this the shortest perimeter so far + perimeter = localPerimeter; + withoutLocal.localSize_0 = w; + withoutLocal.localSize_1 = h; + } + } + } + } + + withoutLocal.valid = withoutLocal.localSize_0 > 0 && withoutLocal.localSize_1 > 0 + && withoutLocal.localSize_0 * withoutLocal.localSize_1 < MAX_GROUP_SIZE + && withoutLocal.globalSize_0 % withoutLocal.localSize_0 == 0 + && withoutLocal.globalSize_1 % withoutLocal.localSize_1 == 0; + + return (withoutLocal); + } + + /** + * Create a two dimensional range <code>0.._globalWidth * 0.._globalHeight *0../_globalDepth</code> + * in groups defined by <code>localWidth</code> * <code>localHeight</code> * <code>localDepth</code>. + * <p> + * Note that for this range to be valid <code>_globalWidth > 0 && _globalHeight >0 _globalDepth >0 && _localWidth>0 && _localHeight>0 && _localDepth>0 && _localWidth*_localHeight*_localDepth < MAX_GROUP_SIZE && _globalWidth%_localWidth==0 && _globalHeight%_localHeight==0 && _globalDepth%_localDepth==0</code>. + * + * @param _globalWidth the width of the 3D grid we wish to process + * @param _globalHieght the height of the 3D grid we wish to process + * @param _globalDepth the depth of the 3D grid we wish to process + * @param _localWidth the width of the 3D group we wish to process + * @param _localHieght the height of the 3D group we wish to process + * @param _localDepth the depth of the 3D group we wish to process + * @return + */ + public static Range create3D(int _globalWidth, int _globalHeight, int _globalDepth, int _localWidth, int _localHeight, + int _localDepth) { + Range range = new Range(); + range.dims = 3; + range.globalSize_0 = _globalWidth; + range.localSize_0 = _localWidth; + range.globalSize_1 = _globalHeight; + range.localSize_1 = _localHeight; + range.globalSize_2 = _globalDepth; + range.localSize_2 = _localDepth; + range.valid = range.localSize_0 > 0 && range.localSize_1 > 0 && range.localSize_2 > 0 + && range.localSize_0 * range.localSize_1 * range.localSize_2 < MAX_GROUP_SIZE + && range.globalSize_0 % range.localSize_0 == 0 && range.globalSize_1 % range.localSize_1 == 0 + && range.globalSize_2 % range.localSize_2 == 0; + + return (range); + } + + /** + * Create a two dimensional range <code>0.._globalWidth * 0.._globalHeight *0../_globalDepth</code> + * choosing suitable values for <code>localWidth</code>, <code>localHeight</code> and <code>localDepth</code>. + * <p> + * Note that for this range to be valid <code>_globalWidth > 0 && _globalHeight >0 _globalDepth >0 && _localWidth>0 && _localHeight>0 && _localDepth>0 && _localWidth*_localHeight*_localDepth < MAX_GROUP_SIZE && _globalWidth%_localWidth==0 && _globalHeight%_localHeight==0 && _globalDepth%_localDepth==0</code>. + * + * <p> + * To determine suitable values for <code>_localWidth</code>,<code>_localHeight</code> and <code>_lodalDepth</code> we extract the factors for <code>_globalWidth</code>,<code>_globalHeight</code> and <code>_globalDepth</code> and then + * find the largest product ( <code><= MAX_GROUP_SIZE</code>) with the lowest perimeter. + * + * <p> + * For example for <code>MAX_GROUP_SIZE</code> of 64 we favor 4x4x4 over 1x16x16. + * + * @param _globalWidth the width of the 3D grid we wish to process + * @param _globalHieght the height of the 3D grid we wish to process + * @param _globalDepth the depth of the 3D grid we wish to process + * @return + */ + public static Range create3D(int _globalWidth, int _globalHeight, int _globalDepth) { + Range withoutLocal = create3D(_globalWidth, _globalHeight, _globalDepth, 1, 1, 1); + withoutLocal.localIsDerived = true; + int[] widthFactors = getFactors(_globalWidth); + int[] heightFactors = getFactors(_globalHeight); + int[] depthFactors = getFactors(_globalDepth); + + withoutLocal.localSize_0 = 1; + withoutLocal.localSize_1 = 1; + withoutLocal.localSize_2 = 1; + int max = 1; + int perimeter = 0; + for (int w : widthFactors) { + for (int h : heightFactors) { + for (int d : depthFactors) { + int size = w * h * d; + if (size > MAX_GROUP_SIZE) { + break; + } + if (size > max) { + max = size; + perimeter = w + h + d; + withoutLocal.localSize_0 = w; + withoutLocal.localSize_1 = h; + withoutLocal.localSize_2 = d; + } else if (size == max) { + int localPerimeter = w + h + d; + if (localPerimeter < perimeter) { // is this the shortest perimeter so far + perimeter = localPerimeter; + withoutLocal.localSize_0 = w; + withoutLocal.localSize_1 = h; + withoutLocal.localSize_2 = d; + } + } + } + } + } + + withoutLocal.valid = withoutLocal.localSize_0 > 0 && withoutLocal.localSize_1 > 0 && withoutLocal.localSize_2 > 0 + && withoutLocal.localSize_0 * withoutLocal.localSize_1 * withoutLocal.localSize_2 < MAX_GROUP_SIZE + && withoutLocal.globalSize_0 % withoutLocal.localSize_0 == 0 + && withoutLocal.globalSize_1 % withoutLocal.localSize_1 == 0 + && withoutLocal.globalSize_2 % withoutLocal.localSize_2 == 0; + + return (withoutLocal); + + } + + /** + * Get the number of dims for this Range. + * + * @return 0, 1 or 2 for one dimensional, two dimensional and three dimensional range respectively. + */ + public int getDims() { + return (dims); + } + + /** + * Override {@link #toString()} + */ + public String toString() { + StringBuilder sb = new StringBuilder(); + + switch (dims) { + case 1: + + sb.append("global:" + globalSize_0 + " local:" + (localIsDerived ? "(derived)" : "") + localSize_0); + + break; + case 2: + sb.append("2D(global:" + globalSize_0 + "x" + globalSize_1 + " local:" + (localIsDerived ? "(derived)" : "") + + localSize_0 + "x" + localSize_1 + ")"); + break; + case 3: + sb.append("3D(global:" + globalSize_0 + "x" + globalSize_1 + "x" + globalSize_2 + " local:" + + (localIsDerived ? "(derived)" : "") + localSize_0 + "x" + localSize_1 + "x" + localSize_0 + ")"); + break; + + } + return (sb.toString()); + } + + /** + * Get the number of groups for the given dimension. + * + * <p> + * This will essentially return globalXXXX/localXXXX for the given dimension (width, height, depth) + * @param _dim The dim we are interested in 0, 1 or 2 + * @return the number of groups for the given dimension. + */ + + public int getNumGroups(int _dim) { + return (_dim == 0 ? (globalSize_0 / localSize_0) : (_dim == 1 ? (globalSize_1 / localSize_1) : (globalSize_2 / localSize_2))); + } + + /** + * + * @return The product of all valid localSize dimensions + */ + public int getWorkGroupSize() { + return localSize_0 * localSize_1 * localSize_2; + } + + /** + * Determine whether this Range is usable. + * + * @return true if this Range is usable/valid. + */ + + public boolean isValid() { + return (valid); + } + +} diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/RangeException.java b/com.amd.aparapi/src/java/com/amd/aparapi/RangeException.java new file mode 100644 index 0000000000000000000000000000000000000000..b3998f84d7bb621cb710a6f6b23159103520f747 --- /dev/null +++ b/com.amd.aparapi/src/java/com/amd/aparapi/RangeException.java @@ -0,0 +1,46 @@ +/* +Copyright (c) 2010-2011, Advanced Micro Devices, Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the +following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list of conditions and the following +disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following +disclaimer in the documentation and/or other materials provided with the distribution. + +Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export +laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through +774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, +you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of +Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration +Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1, +E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups +D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject +to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774 +of EAR). For the most current Country Group listings, or for additional information about the EAR or your obligations +under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. + +*/ +package com.amd.aparapi; + +@SuppressWarnings("serial") class RangeException extends AparapiException{ + + RangeException(String msg) { + super(msg); + } + +} diff --git a/examples/effects/src/com/amd/aparapi/examples/effects/Main.java b/examples/effects/src/com/amd/aparapi/examples/effects/Main.java index dab3078f5f7a66f7aa6dc258bddfc3f111a46db9..5543d94c4373bce826f0c731d6e1faec5bb6752e 100644 --- a/examples/effects/src/com/amd/aparapi/examples/effects/Main.java +++ b/examples/effects/src/com/amd/aparapi/examples/effects/Main.java @@ -53,6 +53,7 @@ import javax.swing.JComponent; import javax.swing.JFrame; import com.amd.aparapi.Kernel; +import com.amd.aparapi.Range; /** * An example Aparapi application which tracks the mouse and updates the color pallete of the window based on the distance from the mouse pointer. @@ -161,6 +162,8 @@ public class Main{ /** Height of Mandelbrot view. */ final int height = 1024; + final Range range = Range.create2D(width, height); + /** The size of the pallette of pixel colors we choose from. */ final int palletteSize = 128; @@ -234,7 +237,7 @@ public class Main{ int trailLastUpdatedPosition = 0; - kernel.execute(width * height); + kernel.execute(range); System.arraycopy(rgb, 0, imageRgb, 0, rgb.length); viewer.repaint(); @@ -268,7 +271,7 @@ public class Main{ trailLastUpdatedPosition++; /** execute the kernel which calculates new pixel values **/ - kernel.execute(width * height); + kernel.execute(range); /** copy the rgb values to the imageRgb buffer **/ System.arraycopy(rgb, 0, imageRgb, 0, rgb.length); diff --git a/examples/nbody/local.bat b/examples/nbody/local.bat new file mode 100644 index 0000000000000000000000000000000000000000..9fe4eb04eb0d4e890106ca8ad0a8da5bfc3d2473 --- /dev/null +++ b/examples/nbody/local.bat @@ -0,0 +1,14 @@ +@echo off + +java ^ + -Djava.library.path=..\..\com.amd.aparapi.jni;jogamp ^ + -Dcom.amd.aparapi.executionMode=%1 ^ + -Dcom.amd.aparapi.enableShowGeneratedOpenCL=true ^ + -Dcom.amd.aparapi.enableVerboseJNI=false ^ + -Dbodies=%2 ^ + -Dheight=600 ^ + -Dwidth=600 ^ + -classpath jogamp\gluegen-rt.jar;jogamp\jogl.all.jar;..\..\com.amd.aparapi\aparapi.jar;nbody.jar ^ + com.amd.aparapi.examples.nbody.Local + + diff --git a/examples/nbody/src/com/amd/aparapi/examples/nbody/Local.java b/examples/nbody/src/com/amd/aparapi/examples/nbody/Local.java new file mode 100644 index 0000000000000000000000000000000000000000..7fd272e29efe6a7387c19952f3753c24b83061ca --- /dev/null +++ b/examples/nbody/src/com/amd/aparapi/examples/nbody/Local.java @@ -0,0 +1,356 @@ +/* +Copyright (c) 2010-2011, Advanced Micro Devices, Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the +following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list of conditions and the following +disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following +disclaimer in the documentation and/or other materials provided with the distribution. + +Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export +laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through +774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, +you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of +Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration +Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1, +E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups +D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject +to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774 +of EAR). For the most current Country Group listings, or for additional information about the EAR or your obligations +under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. + +*/ +package com.amd.aparapi.examples.nbody; + +import java.awt.BorderLayout; +import java.awt.Dimension; +import java.awt.FlowLayout; +import java.awt.event.ActionEvent; +import java.awt.event.ActionListener; +import java.io.IOException; +import java.io.InputStream; + +import javax.media.opengl.GL; +import javax.media.opengl.GL2; +import javax.media.opengl.GLAutoDrawable; +import javax.media.opengl.GLCapabilities; +import javax.media.opengl.GLEventListener; +import javax.media.opengl.GLException; +import javax.media.opengl.awt.GLCanvas; +import javax.media.opengl.fixedfunc.GLLightingFunc; +import javax.media.opengl.glu.GLU; +import javax.swing.JButton; +import javax.swing.JFrame; +import javax.swing.JLabel; +import javax.swing.JPanel; +import javax.swing.JTextField; +import javax.swing.WindowConstants; + +import com.amd.aparapi.Kernel; +import com.amd.aparapi.Range; +import com.jogamp.opengl.util.FPSAnimator; +import com.jogamp.opengl.util.texture.Texture; +import com.jogamp.opengl.util.texture.TextureIO; + +/** + * An NBody clone which uses local memory to cache NBody positions for execution. + * + * http://www.browndeertechnology.com/docs/BDT_OpenCL_Tutorial_NBody-rev3.html + * + * @see com.amd.aparapi.examples.nbody.Main + * + * @author gfrost + * + */ +public class Local{ + + public static class NBodyKernel extends Kernel{ + protected final float delT = .005f; + + protected final float espSqr = 1.0f; + + protected final float mass = 5f; + + private final Range range; + + private final float[] xyz; // positions xy and z of bodies + + private final float[] vxyz; // velocity component of x,y and z of bodies + + @Local private final float[] localStuff; // local memory + + /** + * Constructor initializes xyz and vxyz arrays. + * @param _bodies + */ + public NBodyKernel(Range _range) { + range = _range; + localStuff = new float[range.getLocalSize(0) * 3]; + + xyz = new float[range.getGlobalSize(0) * 3]; + vxyz = new float[range.getGlobalSize(0) * 3]; + float maxDist = 20f; + for (int body = 0; body < range.getGlobalSize(0) * 3; body += 3) { + float theta = (float) (Math.random() * Math.PI * 2); + float phi = (float) (Math.random() * Math.PI * 2); + float radius = (float) (Math.random() * maxDist); + + // get the 3D dimensional coordinates + xyz[body + 0] = (float) (radius * Math.cos(theta) * Math.sin(phi)); + xyz[body + 1] = (float) (radius * Math.sin(theta) * Math.sin(phi)); + xyz[body + 2] = (float) (radius * Math.cos(phi)); + + // divide into two 'spheres of bodies' by adjusting x + if (body % 2 == 0) { + xyz[body + 0] += maxDist * 1.5; + } else { + xyz[body + 0] -= maxDist * 1.5; + } + } + setExplicit(true); + } + + /** + * Here is the kernel entrypoint. Here is where we calculate the position of each body + */ + @Override public void run() { + + int globalId = getGlobalId(0) * 3; + + float accx = 0.f; + float accy = 0.f; + float accz = 0.f; + float myPosx = xyz[globalId + 0]; + float myPosy = xyz[globalId + 1]; + float myPosz = xyz[globalId + 2]; + + for (int tile = 0; tile < getGlobalSize(0) / getLocalSize(0); tile++) { + // load one tile into local memory + int gidx = (tile * getLocalSize(0) + getLocalId()) * 3; + int lidx = getLocalId(0) * 3; + localStuff[lidx + 0] = xyz[gidx + 0]; + localStuff[lidx + 1] = xyz[gidx + 1]; + localStuff[lidx + 2] = xyz[gidx + 2]; + // Synchronize to make sure data is available for processing + localBarrier(); + + for (int i = 0; i < getLocalSize() * 3; i += 3) { + float dx = localStuff[i + 0] - myPosx; + float dy = localStuff[i + 1] - myPosy; + float dz = localStuff[i + 2] - myPosz; + float invDist = rsqrt((dx * dx) + (dy * dy) + (dz * dz) + espSqr); + float s = mass * invDist * invDist * invDist; + accx = accx + s * dx; + accy = accy + s * dy; + accz = accz + s * dz; + } + localBarrier(); + } + accx = accx * delT; + accy = accy * delT; + accz = accz * delT; + xyz[globalId + 0] = myPosx + vxyz[globalId + 0] * delT + accx * .5f * delT; + xyz[globalId + 1] = myPosy + vxyz[globalId + 1] * delT + accy * .5f * delT; + xyz[globalId + 2] = myPosz + vxyz[globalId + 2] * delT + accz * .5f * delT; + + vxyz[globalId + 0] = vxyz[globalId + 0] + accx; + vxyz[globalId + 1] = vxyz[globalId + 1] + accy; + vxyz[globalId + 2] = vxyz[globalId + 2] + accz; + } + + /** + * Render all particles to the OpenGL context + * @param gl + */ + + protected void render(GL2 gl) { + gl.glBegin(GL2.GL_QUADS); + + for (int i = 0; i < range.getGlobalSize(0) * 3; i += 3) { + gl.glTexCoord2f(0, 1); + gl.glVertex3f(xyz[i + 0], xyz[i + 1] + 1, xyz[i + 2]); + gl.glTexCoord2f(0, 0); + gl.glVertex3f(xyz[i + 0], xyz[i + 1], xyz[i + 2]); + gl.glTexCoord2f(1, 0); + gl.glVertex3f(xyz[i + 0] + 1, xyz[i + 1], xyz[i + 2]); + gl.glTexCoord2f(1, 1); + gl.glVertex3f(xyz[i + 0] + 1, xyz[i + 1] + 1, xyz[i + 2]); + } + gl.glEnd(); + } + + } + + public static int width; + + public static int height; + + public static boolean running; + + public static void main(String _args[]) { + + final NBodyKernel kernel = new NBodyKernel(Range.create(Integer.getInteger("bodies", 8192), 256)); + + JFrame frame = new JFrame("NBody"); + + JPanel panel = new JPanel(new BorderLayout()); + JPanel controlPanel = new JPanel(new FlowLayout()); + panel.add(controlPanel, BorderLayout.SOUTH); + + final JButton startButton = new JButton("Start"); + + startButton.addActionListener(new ActionListener(){ + @Override public void actionPerformed(ActionEvent e) { + running = true; + startButton.setEnabled(false); + } + }); + controlPanel.add(startButton); + controlPanel.add(new JLabel(kernel.getExecutionMode().toString())); + + controlPanel.add(new JLabel(" Particles")); + controlPanel.add(new JTextField("" + kernel.range.getGlobalSize(0), 5)); + + controlPanel.add(new JLabel("FPS")); + final JTextField framesPerSecondTextField = new JTextField("0", 5); + + controlPanel.add(framesPerSecondTextField); + controlPanel.add(new JLabel("Score(")); + JLabel miniLabel = new JLabel("<html><small>calcs</small><hr/><small>µsec</small></html>"); + + controlPanel.add(miniLabel); + controlPanel.add(new JLabel(")")); + + final JTextField positionUpdatesPerMicroSecondTextField = new JTextField("0", 5); + + controlPanel.add(positionUpdatesPerMicroSecondTextField); + GLCapabilities caps = new GLCapabilities(null); + caps.setDoubleBuffered(true); + caps.setHardwareAccelerated(true); + final GLCanvas canvas = new GLCanvas(caps); + Dimension dimension = new Dimension(Integer.getInteger("width", 742), Integer.getInteger("height", 742)); + canvas.setPreferredSize(dimension); + + canvas.addGLEventListener(new GLEventListener(){ + private double ratio; + + private final float xeye = 0f; + + private final float yeye = 0f; + + private final float zeye = 100f; + + private final float xat = 0f; + + private final float yat = 0f; + + private final float zat = 0f; + + public final float zoomFactor = 1.0f; + + private int frames; + + private long last = System.currentTimeMillis(); + + @Override public void dispose(GLAutoDrawable drawable) { + + } + + @Override public void display(GLAutoDrawable drawable) { + + GL2 gl = drawable.getGL().getGL2(); + + gl.glLoadIdentity(); + gl.glClear(GL.GL_COLOR_BUFFER_BIT | GL.GL_DEPTH_BUFFER_BIT); + gl.glColor3f(1f, 1f, 1f); + + GLU glu = new GLU(); + glu.gluPerspective(45f, ratio, 0f, 1000f); + + glu.gluLookAt(xeye, yeye, zeye * zoomFactor, xat, yat, zat, 0f, 1f, 0f); + if (running) { + kernel.execute(kernel.range); + if (kernel.isExplicit()) { + kernel.get(kernel.xyz); + } + } + kernel.render(gl); + + long now = System.currentTimeMillis(); + long time = now - last; + frames++; + + if (time > 1000) { // We update the frames/sec every second + if (running) { + float framesPerSecond = (frames * 1000.0f) / time; + int updatesPerMicroSecond = (int) ((framesPerSecond * kernel.range.getGlobalSize(0) * kernel.range + .getGlobalSize(0)) / 1000000); + framesPerSecondTextField.setText(String.format("%5.2f", framesPerSecond)); + positionUpdatesPerMicroSecondTextField.setText(String.format("%4d", updatesPerMicroSecond)); + } + frames = 0; + last = now; + } + gl.glFlush(); + + } + + @Override public void init(GLAutoDrawable drawable) { + final GL2 gl = drawable.getGL().getGL2(); + + gl.glShadeModel(GLLightingFunc.GL_SMOOTH); + gl.glEnable(GL.GL_BLEND); + gl.glBlendFunc(GL.GL_SRC_ALPHA, GL.GL_ONE); + try { + InputStream textureStream = Local.class.getResourceAsStream("particle.jpg"); + Texture texture = TextureIO.newTexture(textureStream, false, null); + texture.enable(gl); + } catch (IOException e) { + e.printStackTrace(); + } catch (GLException e) { + e.printStackTrace(); + } + + } + + @Override public void reshape(GLAutoDrawable drawable, int x, int y, int _width, int _height) { + width = _width; + height = _height; + + GL2 gl = drawable.getGL().getGL2(); + gl.glViewport(0, 0, width, height); + + ratio = (double) width / (double) height; + + } + + }); + + panel.add(canvas, BorderLayout.CENTER); + frame.getContentPane().add(panel, BorderLayout.CENTER); + + frame.setDefaultCloseOperation(WindowConstants.EXIT_ON_CLOSE); + frame.pack(); + frame.setVisible(true); + + FPSAnimator animator = new FPSAnimator(canvas, 100); + animator.start(); + + } + +} diff --git a/examples/nbody/src/com/amd/aparapi/examples/nbody/Main.java b/examples/nbody/src/com/amd/aparapi/examples/nbody/Main.java index ae92a90a4c9dba7cfc7a8e12a639698c5a587e7c..016a70849824c3a4a351f8e42f0488a6b86ad785 100644 --- a/examples/nbody/src/com/amd/aparapi/examples/nbody/Main.java +++ b/examples/nbody/src/com/amd/aparapi/examples/nbody/Main.java @@ -62,10 +62,23 @@ import javax.swing.JTextField; import javax.swing.WindowConstants; import com.amd.aparapi.Kernel; +import com.amd.aparapi.Range; import com.jogamp.opengl.util.FPSAnimator; import com.jogamp.opengl.util.texture.Texture; import com.jogamp.opengl.util.texture.TextureIO; +/** + * NBody implementing demonstrating Aparapi kernels. + * + * For a description of the NBody problem. + * @see http://en.wikipedia.org/wiki/N-body_problem + * + * We use JOGL to render the bodies. + * @see http://jogamp.org/jogl/www/ + * + * @author gfrost + * + */ public class Main{ public static class NBodyKernel extends Kernel{ @@ -75,7 +88,7 @@ public class Main{ protected final float mass = 5f; - private final int bodies; + private final Range range; private final float[] xyz; // positions xy and z of bodies @@ -85,27 +98,32 @@ public class Main{ * Constructor initializes xyz and vxyz arrays. * @param _bodies */ - public NBodyKernel(int _bodies) { - bodies = _bodies; - xyz = new float[bodies * 3]; - vxyz = new float[bodies * 3]; + public NBodyKernel(Range _range) { + range = _range; + // range = Range.create(bodies); + xyz = new float[range.getGlobalSize(0) * 3]; + vxyz = new float[range.getGlobalSize(0) * 3]; float maxDist = 20f; - for (int body = 0; body < bodies * 3; body += 3) { - // If I could remmember some basic algebra I guess I could avoid this loop ;) - // just ensures that the x,y,z is within maxdist radius of origin - do { - xyz[body + 0] = (float) (Math.random() * 2 * maxDist) - maxDist; //x - xyz[body + 1] = (float) (Math.random() * 2 * maxDist) - maxDist; //y - xyz[body + 2] = (float) (Math.random() * 2 * maxDist) - maxDist; //z - } while (xyz[body + 0] * xyz[body + 0] + xyz[body + 1] * xyz[body + 1] + xyz[body + 2] * xyz[body + 2] > maxDist - * maxDist); - // divide into two 'sphere of bodies' by adjusting x + for (int body = 0; body < range.getGlobalSize(0) * 3; body += 3) { + + float theta = (float) (Math.random() * Math.PI * 2); + float phi = (float) (Math.random() * Math.PI * 2); + float radius = (float) (Math.random() * maxDist); + + // get the 3D dimensional coordinates + xyz[body + 0] = (float) (radius * Math.cos(theta) * Math.sin(phi)); + xyz[body + 1] = (float) (radius * Math.sin(theta) * Math.sin(phi)); + xyz[body + 2] = (float) (radius * Math.cos(phi)); + + // divide into two 'spheres of bodies' by adjusting x + if (body % 2 == 0) { xyz[body + 0] += maxDist * 1.5; } else { xyz[body + 0] -= maxDist * 1.5; } } + setExplicit(true); } /** @@ -113,7 +131,7 @@ public class Main{ */ @Override public void run() { int body = getGlobalId(); - int count = bodies * 3; + int count = getGlobalSize(0) * 3; int globalId = body * 3; float accx = 0.f; @@ -153,7 +171,7 @@ public class Main{ protected void render(GL2 gl) { gl.glBegin(GL2.GL_QUADS); - for (int i = 0; i < bodies * 3; i += 3) { + for (int i = 0; i < range.getGlobalSize(0) * 3; i += 3) { gl.glTexCoord2f(0, 1); gl.glVertex3f(xyz[i + 0], xyz[i + 1] + 1, xyz[i + 2]); gl.glTexCoord2f(0, 0); @@ -176,7 +194,7 @@ public class Main{ public static void main(String _args[]) { - final NBodyKernel kernel = new NBodyKernel(Integer.getInteger("bodies", 8192)); + final NBodyKernel kernel = new NBodyKernel(Range.create(Integer.getInteger("bodies", 8192))); JFrame frame = new JFrame("NBody"); @@ -196,7 +214,7 @@ public class Main{ controlPanel.add(new JLabel(kernel.getExecutionMode().toString())); controlPanel.add(new JLabel(" Particles")); - controlPanel.add(new JTextField("" + kernel.bodies, 5)); + controlPanel.add(new JTextField("" + kernel.range.getGlobalSize(0), 5)); controlPanel.add(new JLabel("FPS")); final JTextField framesPerSecondTextField = new JTextField("0", 5); @@ -256,7 +274,10 @@ public class Main{ glu.gluLookAt(xeye, yeye, zeye * zoomFactor, xat, yat, zat, 0f, 1f, 0f); if (running) { - kernel.execute(kernel.bodies); + kernel.execute(kernel.range); + if (kernel.isExplicit()) { + kernel.get(kernel.xyz); + } } kernel.render(gl); @@ -267,7 +288,8 @@ public class Main{ if (time > 1000) { // We update the frames/sec every second if (running) { float framesPerSecond = (frames * 1000.0f) / time; - int updatesPerMicroSecond = (int) ((framesPerSecond * kernel.bodies * kernel.bodies) / 1000000); + int updatesPerMicroSecond = (int) ((framesPerSecond * kernel.range.getGlobalSize(0) * kernel.range + .getGlobalSize(0)) / 1000000); framesPerSecondTextField.setText(String.format("%5.2f", framesPerSecond)); positionUpdatesPerMicroSecondTextField.setText(String.format("%4d", updatesPerMicroSecond)); } @@ -316,7 +338,7 @@ public class Main{ frame.pack(); frame.setVisible(true); - FPSAnimator animator = new FPSAnimator(canvas, 200); + FPSAnimator animator = new FPSAnimator(canvas, 100); animator.start(); } diff --git a/samples/blackscholes/.classpath b/samples/blackscholes/.classpath new file mode 100644 index 0000000000000000000000000000000000000000..2b3d42947b0a9f028e1acf3ee7d6963e71b1003c --- /dev/null +++ b/samples/blackscholes/.classpath @@ -0,0 +1,8 @@ +<?xml version="1.0" encoding="UTF-8"?> +<classpath> + <classpathentry kind="src" path="src"/> + <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/> + <classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/> + <classpathentry combineaccessrules="false" kind="src" path="/com.amd.aparapi"/> + <classpathentry kind="output" path="classes"/> +</classpath> diff --git a/samples/blackscholes/.project b/samples/blackscholes/.project new file mode 100644 index 0000000000000000000000000000000000000000..eb5be55b443c4d9d23671cae1c7825d4a8d5a954 --- /dev/null +++ b/samples/blackscholes/.project @@ -0,0 +1,17 @@ +<?xml version="1.0" encoding="UTF-8"?> +<projectDescription> + <name>blackscholes</name> + <comment></comment> + <projects> + </projects> + <buildSpec> + <buildCommand> + <name>org.eclipse.jdt.core.javabuilder</name> + <arguments> + </arguments> + </buildCommand> + </buildSpec> + <natures> + <nature>org.eclipse.jdt.core.javanature</nature> + </natures> +</projectDescription> diff --git a/samples/blackscholes/src/com/amd/aparapi/samples/blackscholes/Main.java b/samples/blackscholes/src/com/amd/aparapi/samples/blackscholes/Main.java index a80ddd65e4d32183fbb2b218510221d2869df405..60c311427db70f5b00f929684b98ef0266ca62ca 100644 --- a/samples/blackscholes/src/com/amd/aparapi/samples/blackscholes/Main.java +++ b/samples/blackscholes/src/com/amd/aparapi/samples/blackscholes/Main.java @@ -38,6 +38,7 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit package com.amd.aparapi.samples.blackscholes; import com.amd.aparapi.Kernel; +import com.amd.aparapi.Range; public class Main{ @@ -181,9 +182,10 @@ public class Main{ public static void main(String[] _args) throws ClassNotFoundException, InstantiationException, IllegalAccessException { int size = Integer.getInteger("size", 512); - int iterations = Integer.getInteger("iterations", 5); - System.out.println("size ="+size); - System.out.println("iterations ="+iterations); + Range range = Range.create(size); + int iterations = Integer.getInteger("iterations", 5); + System.out.println("size =" + size); + System.out.println("iterations =" + iterations); BlackScholesKernel kernel = new BlackScholesKernel(size); long totalExecTime = 0; @@ -193,7 +195,7 @@ public class Main{ iterExecTime = kernel.execute(size).getExecutionTime(); totalExecTime += iterExecTime; }*/ - kernel.execute(size, iterations); + kernel.execute(range, iterations); System.out.println("Average execution time " + kernel.getAccumulatedExecutionTime() / iterations); kernel.showResults(10); diff --git a/samples/convolution/.classpath b/samples/convolution/.classpath new file mode 100644 index 0000000000000000000000000000000000000000..2b3d42947b0a9f028e1acf3ee7d6963e71b1003c --- /dev/null +++ b/samples/convolution/.classpath @@ -0,0 +1,8 @@ +<?xml version="1.0" encoding="UTF-8"?> +<classpath> + <classpathentry kind="src" path="src"/> + <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/> + <classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/> + <classpathentry combineaccessrules="false" kind="src" path="/com.amd.aparapi"/> + <classpathentry kind="output" path="classes"/> +</classpath> diff --git a/samples/convolution/.project b/samples/convolution/.project new file mode 100644 index 0000000000000000000000000000000000000000..a304e12fe2c740acb75674e6ccb285155ddf2c2a --- /dev/null +++ b/samples/convolution/.project @@ -0,0 +1,17 @@ +<?xml version="1.0" encoding="UTF-8"?> +<projectDescription> + <name>convolution</name> + <comment></comment> + <projects> + </projects> + <buildSpec> + <buildCommand> + <name>org.eclipse.jdt.core.javabuilder</name> + <arguments> + </arguments> + </buildCommand> + </buildSpec> + <natures> + <nature>org.eclipse.jdt.core.javanature</nature> + </natures> +</projectDescription> diff --git a/samples/convolution/build.xml b/samples/convolution/build.xml new file mode 100644 index 0000000000000000000000000000000000000000..90979e334ff82caa213a1027e54266a24bb65aca --- /dev/null +++ b/samples/convolution/build.xml @@ -0,0 +1,20 @@ +<?xml version="1.0"?> + +<project name="convolution" default="build" basedir="."> + <target name="build" depends="clean"> + <mkdir dir="classes"/> + <javac srcdir="src" destdir="classes" debug="on" includeantruntime="false" > + <classpath> + <pathelement path="../../com.amd.aparapi/aparapi.jar"/> + </classpath> + </javac> + <jar jarfile="${ant.project.name}.jar" basedir="classes"/> + </target> + + <target name="clean"> + <delete dir="classes"/> + <delete file="${ant.project.name}.jar"/> + </target> + + +</project> diff --git a/samples/convolution/conv.bat b/samples/convolution/conv.bat new file mode 100644 index 0000000000000000000000000000000000000000..ac0c4aab2b405198e1f7516ee3ed5d9adb71e131 --- /dev/null +++ b/samples/convolution/conv.bat @@ -0,0 +1,6 @@ +java ^ + -Djava.library.path=../../com.amd.aparapi.jni ^ + -Dcom.amd.aparapi.executionMode=%1 ^ + -classpath ../../com.amd.aparapi/aparapi.jar;convolution.jar ^ + com.amd.aparapi.sample.convolution.Main + diff --git a/samples/convolution/src/com/amd/aparapi/sample/convolution/Main.java b/samples/convolution/src/com/amd/aparapi/sample/convolution/Main.java new file mode 100644 index 0000000000000000000000000000000000000000..0196418136f905cc258dbf5f596eec8b3fda8b69 --- /dev/null +++ b/samples/convolution/src/com/amd/aparapi/sample/convolution/Main.java @@ -0,0 +1,234 @@ +/* +Copyright (c) 2010-2011, Advanced Micro Devices, Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the +following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list of conditions and the following +disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following +disclaimer in the documentation and/or other materials provided with the distribution. + +Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export +laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through +774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, +you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of +Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration +Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1, +E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups +D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject +to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774 +of EAR). For the most current Country Group listings, or for additional information about the EAR or your obligations +under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. + +*/ + +package com.amd.aparapi.sample.convolution; + +import java.awt.Dimension; +import java.awt.Graphics; +import java.awt.image.BufferedImage; +import java.awt.image.DataBufferInt; +import java.io.File; +import java.io.IOException; + +import javax.imageio.ImageIO; +import javax.swing.JComponent; +import javax.swing.JFrame; +import javax.swing.WindowConstants; + +import com.amd.aparapi.Kernel; +import com.amd.aparapi.Range; + +/** + * An example Aparapi application which demonstrates image manipulation via convolution filter + * + * Converted to use int buffer and some performance tweaks by Gary Frost + * http://processing.org/learning/pixels/ + * + * @author Gary Frost + */ +public class Main{ + // http://docs.gimp.org/en/plug-in-convmatrix.html + + final static class ConvolutionFilter{ + private float[] weights; + + private int offset; + + ConvolutionFilter(float _nw, float _n, float ne, float _w, float _o, float _e, float _sw, float _s, float _se, int _offset) { + weights = new float[] { + _nw, + _w, + ne, + _w, + _o, + _e, + _sw, + _s, + _se + }; + offset = _offset; + } + + } + + private static final ConvolutionFilter NONE = new ConvolutionFilter(0f, 0f, 0f, 0f, 1f, 0f, 0f, 0f, 0f, 0); + + private static final ConvolutionFilter BLUR = new ConvolutionFilter(1f, 1f, 1f, 1f, 1f, 1f, 1f, 1f, 1f, 0); + + private static final ConvolutionFilter EMBOSS = new ConvolutionFilter(-2f, -1f, 0f, -1f, 1f, 1f, 0f, 1f, 2f, 0); + + public static class ConvolutionKernel extends Kernel{ + + private final float[] filter = new float[9]; + + private final int[] inputData; + + private final int[] outputData; + + private final int width; + + private final int height; + + private int offset; + + public ConvolutionKernel(int _width, int _height, BufferedImage _inputImage, BufferedImage _outputImage) { + inputData = ((DataBufferInt) _inputImage.getRaster().getDataBuffer()).getData(); + outputData = ((DataBufferInt) _outputImage.getRaster().getDataBuffer()).getData(); + width = _width; + height = _height; + + // setExplicit(true); // This gives us a performance boost + // put(inputData); // Because we are using explicit buffer management we must put the imageData array + } + + public void run() { + + int x = getGlobalId(0); + int y = getGlobalId(1); + int lx = getLocalId(0); + int ly = getLocalId(1); + int w = getGlobalSize(0); + int h = getGlobalSize(1); + // System.out.println(x+","+y+" "+lx+","+ly+" "+w+","+h); + if (x > 1 && x < (w - 1) && y > 1 && y < (h - 1)) { + + int result = 0; + // We handle each color separately using rgbshift as an 8 bit mask for red, green, blue + for (int rgbShift = 0; rgbShift < 24; rgbShift += 8) { // 0,8,16 + int channelAccum = 0; + float accum = 0; + + for (int count = 0; count < 9; count++) { + int dx = (count % 3) - 1; // 0,1,2 -> -1,0,1 + int dy = (count / 3) - 1; // 0,1,2 -> -1,0,1 + + int rgb = (inputData[((y + dy) * w) + (x + dx)]); + int channelValue = ((rgb >> rgbShift) & 0xff); + accum += filter[count]; + channelAccum += channelValue * filter[count++]; + + } + channelAccum /= accum; + channelAccum += offset; + channelAccum = max(0, min(channelAccum, 0xff)); + result |= (channelAccum << rgbShift); + } + outputData[y * w + x] = result; + } + } + + public void convolve(ConvolutionFilter _filter) { + System.arraycopy(_filter.weights, 0, filter, 0, _filter.weights.length); + offset = _filter.offset; + put(filter); + execute(Range.create2D(width, height, 8, 8)); + get(outputData); + } + } + + public static final int PAD = 1024; + + public static int padValue(int value) { + return (PAD - (value % PAD)); + } + + public static int padTo(int value) { + return (value + padValue(value)); + } + + public static void main(String[] _args) throws IOException, InterruptedException { + + JFrame frame = new JFrame("Convolution"); + + BufferedImage testCard = ImageIO.read(new File("testcard.jpg")); + + int imageHeight = testCard.getHeight(); + + int imageWidth = testCard.getWidth(); + + final int width = padTo(imageWidth);// now multiple of 64 + + final int height = padTo(imageHeight); // now multiple of 64 + + System.out.println("image width,height=" + width + "," + height); + + final BufferedImage inputImage = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB); + + inputImage.getGraphics().drawImage(testCard, padValue(imageWidth) / 2, padValue(imageHeight) / 2, null); + final BufferedImage outputImage = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB); + outputImage.getGraphics().drawImage(testCard, padValue(imageWidth) / 2, padValue(imageHeight) / 2, null); + final ConvolutionKernel lifeKernel = new ConvolutionKernel(width, height, inputImage, outputImage); + + // Create a component for viewing the offsecreen image + @SuppressWarnings("serial") JComponent viewer = new JComponent(){ + @Override public void paintComponent(Graphics g) { + // if (lifeKernel.isExplicit()) { + // lifeKernel.get(lifeKernel.inputData); // We only pull the imageData when we intend to use it. + // } + g.drawImage(outputImage, 0, 0, width, height, 0, 0, width, height, this); + } + }; + + // Set the default size and add to the frames content pane + viewer.setPreferredSize(new Dimension(width, height)); + frame.getContentPane().add(viewer); + + // Swing housekeeping + frame.pack(); + frame.setVisible(true); + frame.setDefaultCloseOperation(WindowConstants.EXIT_ON_CLOSE); + + ConvolutionFilter filters[] = new ConvolutionFilter[] { + NONE, + BLUR, + EMBOSS, + }; + long start = System.nanoTime(); + for (int i = 0; i < 100; i++) { + for (ConvolutionFilter filter : filters) { + + lifeKernel.convolve(filter); // Work is performed here + + viewer.repaint(); // Request a repaint of the viewer (causes paintComponent(Graphics) to be called later not synchronous + //Thread.sleep(1000); + } + } + System.out.println((System.nanoTime() - start) / 1000000); + + } +} diff --git a/samples/convolution/src/com/amd/aparapi/sample/convolution/Test12x4_4x2.java b/samples/convolution/src/com/amd/aparapi/sample/convolution/Test12x4_4x2.java new file mode 100644 index 0000000000000000000000000000000000000000..e68ca82cfd56c878bfe48ec9e78779cde98f71cf --- /dev/null +++ b/samples/convolution/src/com/amd/aparapi/sample/convolution/Test12x4_4x2.java @@ -0,0 +1,496 @@ +package com.amd.aparapi.sample.convolution; + +import com.amd.aparapi.Kernel; +import com.amd.aparapi.Range; + +public class Test12x4_4x2{ + public static void main(String[] _args) { + // globalThreadId, threadId, globalX, globalY, localX, localY + final int[][] test = new int[][] { + { + 0, //globalThreadId + 0,//threadId + 0,//globalX + 0,//globalY + 0,//localX + 0 + //localY + }, + { + 1,//globalThreadId + 1,//threadId + 1,//globalX + 0,//globalY + 1,//localX + 0 + //localY + }, + { + 2,//globalThreadId + 2,//threadId + 2,//globalX + 0,//globalY + 2,//localX + 0 + //localY + }, + { + 3,//globalThreadId + 3,//threadId + 3,//globalX + 0,//globalY + 3,//localX + 0 + //localY + }, + { + 4,//globalThreadId + 4,//threadId + 0,//globalX + 1,//globalY + 0,//localX + 1 + //localY + }, + { + 5,//globalThreadId + 5,//threadId + 1,//globalX + 1,//globalY + 1,//localX + 1 + //localY + }, + { + 6,//globalThreadId + 6,//threadId + 2,//globalX + 1,//globalY + 2,//localX + 1 + //localY + }, + { + 7,//globalThreadId + 7,//threadId + 3,//globalX + 1,//globalY + 3,//localX + 1 + //localY + }, + { + 8,//globalThreadId + 0,//threadId + 4,//globalX + 0,//globalY + 0,//localX + 0 + //localY + }, + { + 9,//globalThreadId + 1,//threadId + 5,//globalX + 0,//globalY + 1,//localX + 0 + //localY + }, + { + 10,//globalThreadId + 2,//threadId + 6,//globalX + 0,//globalY + 2,//localX + 0 + //localY + }, + { + 11,//globalThreadId + 3,//threadId + 7,//globalX + 0,//globalY + 3,//localX + 0 + //localY + }, + { + 12,//globalThreadId + 4,//threadId + 4,//globalX + 1,//globalY + 0,//localX + 1 + //localY + }, + { + 13,//globalThreadId + 5,//threadId + 5,//globalX + 1,//globalY + 1,//localX + 1 + //localY + }, + { + 14,//globalThreadId + 6,//threadId + 6,//globalX + 1,//globalY + 2,//localX + 1 + //localY + }, + { + 15,//globalThreadId + 7,//threadId + 7,//globalX + 1,//globalY + 3,//localX + 1 + //localY + }, + { + 16,//globalThreadId + 0,//threadId + 8,//globalX + 0,//globalY + 0,//localX + 0 + //localY + }, + { + 17,//globalThreadId + 1,//threadId + 9,//globalX + 0,//globalY + 1,//localX + 0 + //localY + }, + { + 18,//globalThreadId + 2,//threadId + 10,//globalX + 0,//globalY + 2,//localX + 0 + //localY + }, + { + 19,//globalThreadId + 3,//threadId + 11,//globalX + 0,//globalY + 3,//localX + 0 + //localY + }, + + { + 20,//globalThreadId + 4,//threadId + 8,//globalX + 1,//globalY + 0,//localX + 1 + //localY + }, + { + 21,//globalThreadId + 5,//threadId + 9,//globalX + 1,//globalY + 1,//localX + 1 + //localY + }, + { + 22,//globalThreadId + 6,//threadId + 10,//globalX + 1, + 2,//localX + 1 + //localY + }, + { + 23,//globalThreadId + 7,//threadId + 11,//globalX + 1,//globalY + 3,//localX + 1 + //localY + }, + { + 24,//globalThreadId + 0,//threadId + 0,//globalX + 2,//globalY + 0,//localX + 0 + //localY + }, + { + 25,//globalThreadId + 1,//threadId + 1,//globalX + 2,//globalY + 1,//localX + 0 + //localY + }, + { + 26,//globalThreadId + 2,//threadId + 2,//globalX + 2,//globalY + 2,//localX + 0 + //localY + }, + { + 27,//globalThreadId + 3,//threadId + 3,//globalX + 2,//globalY + 3,//localX + 0 + //localY + }, + { + 28,//globalThreadId + 4,//threadId + 0,//globalX + 3,//globalY + 0,//localX + 1 + //localY + }, + { + 29,//globalThreadId + 5,//threadId + 1,//globalX + 3,//globalY + 1,//localX + 1 + //localY + }, + { + 30,//globalThreadId + 6,//threadId + 2,//globalX + 3,//globalY + 2,//localX + 1 + //localY + }, + { + 31,//globalThreadId + 7,//threadId + 3,//globalX + 3,//globalY + 3,//localX + 1 + //localY + }, + { + 32,//globalThreadId + 0,//threadId + 4,//globalX + 2,//globalY + 0,//localX + 0 + //localY + }, + { + 33,//globalThreadId + 1,//threadId + 5,//globalX + 2,//globalY + 1,//localX + 0 + //localY + }, + { + 34,//globalThreadId + 2,//threadId + 6,//globalX + 2,//globalY + 2,//localX + 0 + //localY + }, + { + 35,//globalThreadId + 3,//threadId + 7,//globalX + 2,//globalY + 3,//localX + 0 + //localY + }, + { + 36,//globalThreadId + 4,//threadId + 4,//globalX + 3,//globalY + 0,//localX + 1 + //localY + }, + { + 37,//globalThreadId + 5,//threadId + 5,//globalX + 3,//globalY + 1,//localX + 1 + //localY + }, + { + 38,//globalThreadId + 6,//threadId + 6,//globalX + 3,//globalY + 2,//localX + 1 + //localY + }, + { + 39,//globalThreadId + 7,//threadId + 7,//globalX + 3,//globalY + 3,//localX + 1 + //localY + }, + { + 40,//globalThreadId + 0,//threadId + 8,//globalX + 2,//globalY + 0,//localX + 0 + //localY + }, + { + 41,//globalThreadId + 1,//threadId + 9,//globalX + 2,//globalY + 1,//localX + 0 + //localY + }, + { + 42,//globalThreadId + 2,//threadId + 10,//globalX + 2,//globalY + 2,//localX + 0 + //localY + }, + { + 43,//globalThreadId + 3,//threadId + 11,//globalX + 2,//globalY + 3,//localX + 0 + //localY + }, + + { + 44,//globalThreadId + 4,//threadId + 8,//globalX + 3,//globalY + 0,//localX + 1 + //localY + }, + { + 45,//globalThreadId + 5,//threadId + 9,//globalX + 3,//globalY + 1,//localX + 1 + //localY + }, + { + 46,//globalThreadId + 6,//threadId + 10,//globalX + 3,//globalY + 2,//localX + 1 + //localY + }, + { + 47,//globalThreadId + 7,//threadId + 11,//globalX + 3,//globalY + 3,//localX + 1 + //localY + }, + }; + Kernel kernel = new Kernel(){ + + @Override public void run() { + int x = getGlobalId(0); + int y = getGlobalId(1); + int lx = getLocalId(0); + int ly = getLocalId(1); + int w = getGlobalSize(0); + int h = getGlobalSize(1); + int globalThreadId = getGlobalId(1) * getGlobalSize(0) + getGlobalId(0); + int threadId = getLocalId(1) * getLocalSize(0) + getLocalId(0); + synchronized (test) { + boolean show = false; + if (globalThreadId != test[globalThreadId][0]) { + System.out.println("bad globalThreadId"); + show = true; + } + if (threadId != test[globalThreadId][1]) { + System.out.println("bad threadId"); + show = true; + } + if (x != test[globalThreadId][2]) { + System.out.println("bad globalx"); + show = true; + } + if (y != test[globalThreadId][3]) { + System.out.println("bad globaly"); + show = true; + } + if (lx != test[globalThreadId][4]) { + System.out.println("bad localx"); + show = true; + } + if (ly != test[globalThreadId][5]) { + System.out.println("bad localy"); + show = true; + } + if (show) { + System.out.println("derived =>" + globalThreadId + " " + threadId + " " + x + "," + y + " " + lx + "," + ly + " " + + w + "," + h); + System.out.println("data =>" + test[globalThreadId][0] + " " + test[globalThreadId][1] + " " + + test[globalThreadId][2] + "," + test[globalThreadId][3] + " " + test[globalThreadId][4] + "," + + test[globalThreadId][5] + " " + w + "," + h); + } + } + } + + }; + kernel.execute(Range.create2D(12, 4, 4, 2)); + + } +} diff --git a/samples/convolution/testcard.jpg b/samples/convolution/testcard.jpg new file mode 100644 index 0000000000000000000000000000000000000000..16b1a709365da51b2247255c903c7c6bb173207f Binary files /dev/null and b/samples/convolution/testcard.jpg differ diff --git a/samples/life/src/com/amd/aparapi/sample/life/Main.java b/samples/life/src/com/amd/aparapi/sample/life/Main.java index 4ede78b12789d46f31b9d68f91cdd0e557b1b5b0..73a0a867251f8ea2c0e6b0629568a279761b1f7c 100644 --- a/samples/life/src/com/amd/aparapi/sample/life/Main.java +++ b/samples/life/src/com/amd/aparapi/sample/life/Main.java @@ -55,6 +55,7 @@ import javax.swing.JPanel; import javax.swing.WindowConstants; import com.amd.aparapi.Kernel; +import com.amd.aparapi.Range; /** * An example Aparapi application which demonstrates Conways 'Game Of Life'. @@ -101,6 +102,8 @@ public class Main{ private final int height; + private final Range range; + private int fromBase; private int toBase; @@ -109,6 +112,8 @@ public class Main{ imageData = ((DataBufferInt) _image.getRaster().getDataBuffer()).getData(); width = _width; height = _height; + range = Range.create(width * height, 256); + System.out.println("range = " + range); fromBase = height * width; toBase = 0; setExplicit(true); // This gives us a performance boost @@ -160,7 +165,7 @@ public class Main{ fromBase = toBase; toBase = swap; - execute(width * height); + execute(range); } } diff --git a/samples/mandel/mandel2D.bat b/samples/mandel/mandel2D.bat new file mode 100644 index 0000000000000000000000000000000000000000..93e67e077cccab1762bd22405ad426df138b58b1 --- /dev/null +++ b/samples/mandel/mandel2D.bat @@ -0,0 +1,7 @@ +java ^ + -Djava.library.path=../../com.amd.aparapi.jni ^ + -Dcom.amd.aparapi.executionMode=%1 ^ + -classpath ../../com.amd.aparapi/aparapi.jar;mandel.jar ^ + com.amd.aparapi.sample.mandel.Main2D + + diff --git a/samples/mandel/mandel2D.sh b/samples/mandel/mandel2D.sh new file mode 100644 index 0000000000000000000000000000000000000000..99a1c45c882d6c93f4fbf23ecde2c5698d1cadac --- /dev/null +++ b/samples/mandel/mandel2D.sh @@ -0,0 +1,5 @@ +java\ + -Djava.library.path=../../com.amd.aparapi.jni\ + -Dcom.amd.aparapi.executionMode=$1\ + -classpath ../../com.amd.aparapi/aparapi.jar:mandel.jar\ + com.amd.aparapi.sample.mandel.Main2D diff --git a/samples/mandel/src/com/amd/aparapi/sample/mandel/Main.java b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main.java index 5879f7852652ea8c73f6e15c4a9a0aee6fd88c35..99d3373c080bd6cff626d5c9aadc0a94e5aca926 100644 --- a/samples/mandel/src/com/amd/aparapi/sample/mandel/Main.java +++ b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main.java @@ -53,6 +53,7 @@ import javax.swing.JComponent; import javax.swing.JFrame; import com.amd.aparapi.Kernel; +import com.amd.aparapi.Range; /** * An example Aparapi application which displays a view of the Mandelbrot set and lets the user zoom in to a particular point. @@ -164,6 +165,9 @@ public class Main{ /** Height of Mandelbrot view. */ final int height = 768; + /** Mandelbrot image height. */ + final Range range = Range.create(width * height); + /** Maximum iterations for Mandelbrot. */ final int maxIterations = 256; @@ -220,7 +224,7 @@ public class Main{ // Set the default scale and offset, execute the kernel and force a repaint of the viewer. kernel.setScaleAndOffset(defaultScale, -1f, 0f); - kernel.execute(width * height); + kernel.execute(range); System.arraycopy(rgb, 0, imageRgb, 0, rgb.length); viewer.repaint(); @@ -266,7 +270,7 @@ public class Main{ // Set the scale and offset, execute the kernel and force a repaint of the viewer. kernel.setScaleAndOffset(scale, x, y); - kernel.execute(width * height); + kernel.execute(range); System.arraycopy(rgb, 0, imageRgb, 0, rgb.length); viewer.repaint(); } diff --git a/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java new file mode 100644 index 0000000000000000000000000000000000000000..5e6b85a3c3da3096f91c3dce28a96db95607808c --- /dev/null +++ b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java @@ -0,0 +1,277 @@ +/* +Copyright (c) 2010-2011, Advanced Micro Devices, Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the +following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list of conditions and the following +disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following +disclaimer in the documentation and/or other materials provided with the distribution. + +Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export +laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through +774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, +you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of +Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration +Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1, +E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups +D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject +to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774 +of EAR). For the most current Country Group listings, or for additional information about the EAR or your obligations +under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. + +*/ + +package com.amd.aparapi.sample.mandel; + +import java.awt.Color; +import java.awt.Dimension; +import java.awt.Graphics; +import java.awt.Point; +import java.awt.event.MouseAdapter; +import java.awt.event.MouseEvent; +import java.awt.event.WindowAdapter; +import java.awt.event.WindowEvent; +import java.awt.image.BufferedImage; +import java.awt.image.DataBufferInt; + +import javax.swing.JComponent; +import javax.swing.JFrame; + +import com.amd.aparapi.Kernel; +import com.amd.aparapi.Range; + +/** + * An example Aparapi application which displays a view of the Mandelbrot set and lets the user zoom in to a particular point. + * + * When the user clicks on the view, this example application will zoom in to the clicked point and zoom out there after. + * On GPU, additional computing units will offer a better viewing experience. On the other hand on CPU, this example + * application might suffer with sub-optimal frame refresh rate as compared to GPU. + * + * @author gfrost + * + */ + +public class Main2D{ + + /** + * An Aparapi Kernel implementation for creating a scaled view of the mandelbrot set. + * + * @author gfrost + * + */ + + public static class MandelKernel extends Kernel{ + + /** RGB buffer used to store the Mandelbrot image. This buffer holds (width * height) RGB values. */ + final private int rgb[]; + + /** Palette used for each iteration value 0..maxIterations. */ + final private int pallette[]; + + /** Maximum iterations we will check for. */ + final private int maxIterations; + + /** Mutable values of scale, offsetx and offsety so that we can modify the zoom level and position of a view. */ + private float scale = .0f; + + private float offsetx = .0f; + + private float offsety = .0f; + + /** + * Initialize the Kernel. + * + * @param _width Mandelbrot image width + * @param _height Mandelbrot image height + * @param _rgb Mandelbrot image RGB buffer + * @param _pallette Mandelbrot image palette + */ + public MandelKernel(int[] _rgb, int[] _pallette) { + + rgb = _rgb; + pallette = _pallette; + maxIterations = pallette.length - 1; + + } + + @Override public void run() { + + /** Determine which RGB value we are going to process (0..RGB.length). */ + int gid = getGlobalId(1) * getGlobalSize(0) + getGlobalId(0); + + /** Translate the gid into an x an y value. */ + float x = (((getGlobalId(0) * scale) - ((scale / 2) * getGlobalSize(0))) / getGlobalSize(0)) + offsetx; + + float y = (((getGlobalId(1) * scale) - ((scale / 2) * getGlobalSize(1))) / getGlobalSize(1)) + offsety; + + int count = 0; + + float zx = x; + float zy = y; + float new_zx = 0f; + + // Iterate until the algorithm converges or until maxIterations are reached. + while (count < maxIterations && zx * zx + zy * zy < 8) { + new_zx = zx * zx - zy * zy + x; + zy = 2 * zx * zy + y; + zx = new_zx; + count++; + } + + // Pull the value out of the palette for this iteration count. + rgb[gid] = pallette[count]; + } + + public void setScaleAndOffset(float _scale, float _offsetx, float _offsety) { + offsetx = _offsetx; + offsety = _offsety; + scale = _scale; + } + + } + + /** User selected zoom-in point on the Mandelbrot view. */ + public static volatile Point to = null; + + @SuppressWarnings("serial") public static void main(String[] _args) { + + JFrame frame = new JFrame("MandelBrot"); + + /** Mandelbrot image height. */ + final Range range = Range.create2D(768, 768); + System.out.println("range= " + range); + + /** Maximum iterations for Mandelbrot. */ + final int maxIterations = 256; + + /** Palette which maps iteration values to RGB values. */ + final int pallette[] = new int[maxIterations + 1]; + + //Initialize palette values + for (int i = 0; i < maxIterations; i++) { + float h = i / (float) maxIterations; + float b = 1.0f - h * h; + pallette[i] = Color.HSBtoRGB(h, 1f, b); + } + + /** Image for Mandelbrot view. */ + final BufferedImage image = new BufferedImage(range.getGlobalSize(0), range.getGlobalSize(1), BufferedImage.TYPE_INT_RGB); + final BufferedImage offscreen = new BufferedImage(range.getGlobalSize(0), range.getGlobalSize(1), BufferedImage.TYPE_INT_RGB); + // Draw Mandelbrot image + JComponent viewer = new JComponent(){ + @Override public void paintComponent(Graphics g) { + + g.drawImage(image, 0, 0, range.getGlobalSize(0), range.getGlobalSize(1), this); + } + }; + + // Set the size of JComponent which displays Mandelbrot image + viewer.setPreferredSize(new Dimension(range.getGlobalSize(0), range.getGlobalSize(1))); + + final Object doorBell = new Object(); + + // Mouse listener which reads the user clicked zoom-in point on the Mandelbrot view + viewer.addMouseListener(new MouseAdapter(){ + @Override public void mouseClicked(MouseEvent e) { + to = e.getPoint(); + synchronized (doorBell) { + doorBell.notify(); + } + } + }); + + // Swing housework to create the frame + frame.getContentPane().add(viewer); + frame.pack(); + frame.setLocationRelativeTo(null); + frame.setVisible(true); + + // Extract the underlying RGB buffer from the image. + // Pass this to the kernel so it operates directly on the RGB buffer of the image + final int[] rgb = ((DataBufferInt) offscreen.getRaster().getDataBuffer()).getData(); + final int[] imageRgb = ((DataBufferInt) image.getRaster().getDataBuffer()).getData(); + // Create a Kernel passing the size, RGB buffer and the palette. + final MandelKernel kernel = new MandelKernel(rgb, pallette); + + float defaultScale = 3f; + + // Set the default scale and offset, execute the kernel and force a repaint of the viewer. + kernel.setScaleAndOffset(defaultScale, -1f, 0f); + kernel.execute(range); + System.arraycopy(rgb, 0, imageRgb, 0, rgb.length); + viewer.repaint(); + + // Report target execution mode: GPU or JTP (Java Thread Pool). + System.out.println("Execution mode=" + kernel.getExecutionMode()); + + // Window listener to dispose Kernel resources on user exit. + frame.addWindowListener(new WindowAdapter(){ + public void windowClosing(WindowEvent _windowEvent) { + kernel.dispose(); + System.exit(0); + } + }); + + // Wait until the user selects a zoom-in point on the Mandelbrot view. + while (true) { + + // Wait for the user to click somewhere + while (to == null) { + synchronized (doorBell) { + try { + doorBell.wait(); + } catch (InterruptedException ie) { + ie.getStackTrace(); + } + } + } + + float x = -1f; + float y = 0f; + float scale = defaultScale; + float tox = (float) (to.x - range.getGlobalSize(0) / 2) / range.getGlobalSize(0) * scale; + float toy = (float) (to.y - range.getGlobalSize(1) / 2) / range.getGlobalSize(1) * scale; + + // This is how many frames we will display as we zoom in and out. + int frames = 128; + long startMillis = System.currentTimeMillis(); + for (int sign = -1; sign < 2; sign += 2) { + for (int i = 0; i < frames - 4; i++) { + scale = scale + sign * defaultScale / frames; + x = x - sign * (tox / frames); + y = y - sign * (toy / frames); + + // Set the scale and offset, execute the kernel and force a repaint of the viewer. + kernel.setScaleAndOffset(scale, x, y); + kernel.execute(range); + System.arraycopy(rgb, 0, imageRgb, 0, rgb.length); + viewer.repaint(); + } + } + + long elapsedMillis = System.currentTimeMillis() - startMillis; + System.out.println("FPS = " + frames * 1000 / elapsedMillis); + + // Reset zoom-in point. + to = null; + + } + + } + +} diff --git a/samples/squares/src/com/amd/aparapi/sample/squares/Main.java b/samples/squares/src/com/amd/aparapi/sample/squares/Main.java index 929d02b397dc8ed60bdc10922e208ab3d1a05ec7..32a1b70b8bfd16cd76eff8d5666442738a18dc72 100644 --- a/samples/squares/src/com/amd/aparapi/sample/squares/Main.java +++ b/samples/squares/src/com/amd/aparapi/sample/squares/Main.java @@ -39,6 +39,7 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit package com.amd.aparapi.sample.squares; import com.amd.aparapi.Kernel; +import com.amd.aparapi.Range; /** * An example Aparapi application which computes and displays squares of a set of 512 input values. @@ -77,7 +78,8 @@ public class Main{ }; // Execute Kernel. - kernel.execute(512); + + kernel.execute(Range.create(512)); // Report target execution mode: GPU or JTP (Java Thread Pool). System.out.println("Execution mode=" + kernel.getExecutionMode()); diff --git a/test/codegen/build.xml b/test/codegen/build.xml index 1118e3dafc678ec6e93d57d00fa67d0836faf54d..5af3d17a981ba103c708b4345d9dc7aa1f842735 100644 --- a/test/codegen/build.xml +++ b/test/codegen/build.xml @@ -2,8 +2,7 @@ <project name="codegen" default="junit" basedir="."> - <!--<property name="junit.jar" value="/home/gfrost/aparapi/trunk/tools/junit/junit.jar"/>--> - <property name="junit.jar" value="C:\Users\gfrost\javalabs\projects\aparapi\trunk\tools\junit\junit.jar"/> + <property name="junit.jar" value="./junit-4.10.jar"/> <path id="classpath"> <pathelement path="..\..\com.amd.aparapi\aparapi.jar"/> @@ -11,28 +10,22 @@ <pathelement path="classes"/> </path> - <target name="check"> - <fail message="Error:"> - <condition> - <not><isset property="junit.jar"/></not> - </condition> - <![CDATA[ - You will need to edit test/codegen/build.xml - - At present junit.dir is not set. It needs to point to the junit jar file in your junit installation. + <target name="check-junit"> + <condition property="need.to.upload.junit"> + <not><available file="${junit.jar}"/> </not> + </condition> + </target> - You can install/download junit from www.junit.org. - ]]> - </fail> - <available file="${junit.jar}" type="file" property="junit.jar.exists"/> + <target name="check" if="need.to.upload.junit"> + <get dest="."> + <url url="http://repo1.maven.org/maven2/junit/junit/4.10/junit-4.10.jar"/> + </get> <fail message="Error:"> <condition> - <not><isset property="junit.jar.exists"/></not> + <not><available file="${junit.jar}"/> </not> </condition> <![CDATA[ - You will need to edit test/codegen/build.xml - - At present junit.jar is set to ${junit.jar} but that file does not exist + Failed to upload junit from maven repository. ]]> </fail> </target> @@ -43,7 +36,7 @@ <delete dir="src/genjava/com"/> </target> - <target name="junit" depends="clean, check"> + <target name="junit" depends="clean, check-junit, check"> <mkdir dir="classes"/> <javac srcdir="src/java" destdir="classes" debug="on" includeAntRuntime="false" classpathref="classpath" />