diff --git a/CREDITS.txt b/CREDITS.txt index f2756caf9f524efb5edd4b960c43be42f61f733e..7d17627c7062929abbde2000a83d8f36347f7a03 100644 --- a/CREDITS.txt +++ b/CREDITS.txt @@ -6,6 +6,7 @@ Aparapi Contributors. Thanks to all. Witold Bolt provided patch for issue #5. Added and tested Mac OS support. Oct 12th 2011 kenneth@hexad.dk provided patch for issue #11 (correcting conversion of Java Nan and Infinity values to OpenCL equiv). Oct 28th 2011 +kenneth@hexad.dk provides patch for issue #12 (handling of Kernel static fields) Oct 28th 2011. diff --git a/com.amd.aparapi.jni/src/cpp/aparapi.cpp b/com.amd.aparapi.jni/src/cpp/aparapi.cpp index 0d114279b2b51929b2c2251525da6d2c9eaadcf9..d58632921e6b4eca267208c472bb0b77e56e12d2 100644 --- a/com.amd.aparapi.jni/src/cpp/aparapi.cpp +++ b/com.amd.aparapi.jni/src/cpp/aparapi.cpp @@ -137,6 +137,7 @@ MicrosecondTimer timer; jfieldID typeFieldID; +jfieldID isStaticFieldID; jfieldID nameFieldID; jfieldID javaArrayFieldID; jfieldID bytesPerLocalSizeFieldID; @@ -246,6 +247,7 @@ class KernelArg{ char *name; // used for debugging printfs jfieldID fieldID; // The field that this arg represents in the kernel (java), used only for primitive updates jint type; // a bit mask determining the type of this arg + jboolean isStatic; // A flag indicating if the value is static jint sizeInBytes; // bytes in the array or directBuf jobject javaArg; // global reference to the corresponding java KernelArg union{ @@ -413,7 +415,7 @@ class JNIContext{ // create the context using the mechanism described here // http://developer.amd.com/support/KnowledgeBase/Lists/KnowledgeBase/DispForm.aspx?ID=71 - + // We may have one or more available platforms so determine how many we have status = clGetPlatformIDs(0, NULL, &platformc); @@ -438,1110 +440,1168 @@ class JNIContext{ // Here we check if the platformVersionName starts with "OpenCL 1.1" (10 chars!) if (!strncmp(platformVersionName, "OpenCL 1.1", 10)) { #else - // Here we check if the platformVersionName starts with "OpenCL 1.1" or "OpenCL 1.0" (10 chars!) - if (!strncmp(platformVersionName, "OpenCL 1.1", 10) || !strncmp(platformVersionName, "OpenCL 1.0", 10)) { + // Here we check if the platformVersionName starts with "OpenCL 1.1" or "OpenCL 1.0" (10 chars!) + if (!strncmp(platformVersionName, "OpenCL 1.1", 10) || !strncmp(platformVersionName, "OpenCL 1.0", 10)) { #endif - // Get the # of devices - status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &deviceIdc); - // now check if this platform supports the requested device type (GPU or CPU) - if (status == CL_SUCCESS && deviceIdc >0 ){ - platform = platforms[i]; - if (isVerbose()){ - fprintf(stderr, "platform %s supports requested device type\n", platformVendorName); - } - - deviceIds = new cl_device_id[deviceIdc]; - status = clGetDeviceIDs(platform, deviceType, deviceIdc, deviceIds, NULL); - if (status == CL_SUCCESS){ - ASSERT_CL_NO_RETURN("clGetDeviceIDs()"); - - GET_DEV_INFO(deviceIds[0], CL_DEVICE_MAX_COMPUTE_UNITS, maxComputeUnits, "%d"); - GET_DEV_INFO(deviceIds[0], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, maxWorkItemDimensions, "%d"); - GET_DEV_INFO(deviceIds[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, maxWorkGroupSize, "%d"); - GET_DEV_INFO(deviceIds[0], CL_DEVICE_GLOBAL_MEM_SIZE, globalMemSize, "%d"); - GET_DEV_INFO(deviceIds[0], CL_DEVICE_LOCAL_MEM_SIZE, localMemSize, "%d"); + // Get the # of devices + status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &deviceIdc); + // now check if this platform supports the requested device type (GPU or CPU) + if (status == CL_SUCCESS && deviceIdc >0 ){ + platform = platforms[i]; if (isVerbose()){ - - fprintf(stderr, "device[%p]: Type: ", deviceIds[0]); - if (deviceType & CL_DEVICE_TYPE_DEFAULT) { - // deviceType &= ~CL_DEVICE_TYPE_DEFAULT; - fprintf(stderr, "Default "); - }else if (deviceType & CL_DEVICE_TYPE_CPU) { - // deviceType &= ~CL_DEVICE_TYPE_CPU; - fprintf(stderr, "CPU "); - }else if (deviceType & CL_DEVICE_TYPE_GPU) { - // deviceType &= ~CL_DEVICE_TYPE_GPU; - fprintf(stderr, "GPU "); - }else if (deviceType & CL_DEVICE_TYPE_ACCELERATOR) { - // deviceType &= ~CL_DEVICE_TYPE_ACCELERATOR; - fprintf(stderr, "Accelerator "); - }else{ - fprintf(stderr, "Unknown (0x%llx) ", deviceType); - } - fprintf(stderr, "\n"); + fprintf(stderr, "platform %s supports requested device type\n", platformVendorName); } - cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; - cl_context_properties* cprops = (NULL == platform) ? NULL : cps; - context = clCreateContextFromType( cprops, deviceType, NULL, NULL, &status); - ASSERT_CL_NO_RETURN("clCreateContextFromType()"); + + deviceIds = new cl_device_id[deviceIdc]; + status = clGetDeviceIDs(platform, deviceType, deviceIdc, deviceIds, NULL); if (status == CL_SUCCESS){ - - valid = JNI_TRUE; + ASSERT_CL_NO_RETURN("clGetDeviceIDs()"); + + GET_DEV_INFO(deviceIds[0], CL_DEVICE_MAX_COMPUTE_UNITS, maxComputeUnits, "%d"); + GET_DEV_INFO(deviceIds[0], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, maxWorkItemDimensions, "%d"); + GET_DEV_INFO(deviceIds[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, maxWorkGroupSize, "%d"); + GET_DEV_INFO(deviceIds[0], CL_DEVICE_GLOBAL_MEM_SIZE, globalMemSize, "%d"); + GET_DEV_INFO(deviceIds[0], CL_DEVICE_LOCAL_MEM_SIZE, localMemSize, "%d"); + if (isVerbose()){ + + fprintf(stderr, "device[%p]: Type: ", deviceIds[0]); + if (deviceType & CL_DEVICE_TYPE_DEFAULT) { + // deviceType &= ~CL_DEVICE_TYPE_DEFAULT; + fprintf(stderr, "Default "); + }else if (deviceType & CL_DEVICE_TYPE_CPU) { + // deviceType &= ~CL_DEVICE_TYPE_CPU; + fprintf(stderr, "CPU "); + }else if (deviceType & CL_DEVICE_TYPE_GPU) { + // deviceType &= ~CL_DEVICE_TYPE_GPU; + fprintf(stderr, "GPU "); + }else if (deviceType & CL_DEVICE_TYPE_ACCELERATOR) { + // deviceType &= ~CL_DEVICE_TYPE_ACCELERATOR; + fprintf(stderr, "Accelerator "); + }else{ + fprintf(stderr, "Unknown (0x%llx) ", deviceType); + } + fprintf(stderr, "\n"); + } + cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; + cl_context_properties* cprops = (NULL == platform) ? NULL : cps; + context = clCreateContextFromType( cprops, deviceType, NULL, NULL, &status); + ASSERT_CL_NO_RETURN("clCreateContextFromType()"); + if (status == CL_SUCCESS){ + + valid = JNI_TRUE; + } + } + }else{ + if (isVerbose()){ + fprintf(stderr, "platform %s does not support requested device type skipping!\n", platformVendorName); } } + }else{ if (isVerbose()){ - fprintf(stderr, "platform %s does not support requested device type skipping!\n", platformVendorName); - } - } - - }else{ - if (isVerbose()){ #ifndef __APPLE__ - fprintf(stderr, "platform %s version %s is not OpenCL 1.1 skipping!\n", platformVendorName, platformVersionName); + fprintf(stderr, "platform %s version %s is not OpenCL 1.1 skipping!\n", platformVendorName, platformVersionName); #else - fprintf(stderr, "platform %s version %s is neither OpenCL 1.1 or OpenCL 1.0 skipping!\n", platformVendorName, platformVersionName); + fprintf(stderr, "platform %s version %s is neither OpenCL 1.1 or OpenCL 1.0 skipping!\n", platformVendorName, platformVersionName); #endif - } + } + } } - } - } - }else{ - if (isVerbose()){ - fprintf(stderr, "no opencl platforms available!\n"); + } + }else{ + if (isVerbose()){ + fprintf(stderr, "no opencl platforms available!\n"); + } } - } - - } - jboolean isValid(){ - return(valid); - } - jboolean isProfilingEnabled(){ - return((flags&com_amd_aparapi_KernelRunner_JNI_FLAG_ENABLE_PROFILING)==com_amd_aparapi_KernelRunner_JNI_FLAG_ENABLE_PROFILING?JNI_TRUE:JNI_FALSE); - } - jboolean isUsingGPU(){ - return((flags&com_amd_aparapi_KernelRunner_JNI_FLAG_USE_GPU)==com_amd_aparapi_KernelRunner_JNI_FLAG_USE_GPU?JNI_TRUE:JNI_FALSE); - } - jboolean isVerbose(){ - return((flags&com_amd_aparapi_KernelRunner_JNI_FLAG_ENABLE_VERBOSE_JNI)==com_amd_aparapi_KernelRunner_JNI_FLAG_ENABLE_VERBOSE_JNI?JNI_TRUE:JNI_FALSE); - } + } - ~JNIContext(){ - cl_int status = CL_SUCCESS; - jenv->DeleteGlobalRef(kernelObject); - jenv->DeleteGlobalRef(kernelClass); - if (context != 0){ - status = clReleaseContext(context); - ASSERT_CL_NO_RETURN("clReleaseContext()"); - context = (cl_context)0; - } - if (commandQueues){ - for (int dev=0; dev<deviceIdc; dev++){ - status = clReleaseCommandQueue((cl_command_queue)commandQueues[dev]); - ASSERT_CL_NO_RETURN("clReleaseCommandQueue()"); - commandQueues[dev] = (cl_command_queue)0; + jboolean isValid(){ + return(valid); } - delete[] commandQueues; commandQueues = NULL; - } - if (program != 0){ - status = clReleaseProgram((cl_program)program); - ASSERT_CL_NO_RETURN("clReleaseProgram()"); - program = (cl_program)0; - } - if (kernel != 0){ - status = clReleaseKernel((cl_kernel)kernel); - ASSERT_CL_NO_RETURN("clReleaseKernel()"); - kernel = (cl_kernel)0; - } - if (platforms){ - delete []platforms; platforms=NULL; - } - if (deviceIds){ - delete [] deviceIds; deviceIds=NULL; - } - if (argc> 0){ - for (int i=0; i< argc; i++){ - KernelArg *arg = args[i]; - if (!arg->isPrimitive()){ - if (arg->value.ref.mem != 0){ - status = clReleaseMemObject((cl_mem)arg->value.ref.mem); - ASSERT_CL_NO_RETURN("clReleaseMemObject()"); - arg->value.ref.mem = (cl_mem)0; - } - if (arg->value.ref.javaArray != NULL) { - jenv->DeleteWeakGlobalRef((jweak) arg->value.ref.javaArray); + jboolean isProfilingEnabled(){ + return((flags&com_amd_aparapi_KernelRunner_JNI_FLAG_ENABLE_PROFILING)==com_amd_aparapi_KernelRunner_JNI_FLAG_ENABLE_PROFILING?JNI_TRUE:JNI_FALSE); + } + jboolean isUsingGPU(){ + return((flags&com_amd_aparapi_KernelRunner_JNI_FLAG_USE_GPU)==com_amd_aparapi_KernelRunner_JNI_FLAG_USE_GPU?JNI_TRUE:JNI_FALSE); + } + jboolean isVerbose(){ + return((flags&com_amd_aparapi_KernelRunner_JNI_FLAG_ENABLE_VERBOSE_JNI)==com_amd_aparapi_KernelRunner_JNI_FLAG_ENABLE_VERBOSE_JNI?JNI_TRUE:JNI_FALSE); + } + + ~JNIContext(){ + cl_int status = CL_SUCCESS; + jenv->DeleteGlobalRef(kernelObject); + jenv->DeleteGlobalRef(kernelClass); + if (context != 0){ + status = clReleaseContext(context); + ASSERT_CL_NO_RETURN("clReleaseContext()"); + context = (cl_context)0; + } + if (commandQueues){ + for (int dev=0; dev<deviceIdc; dev++){ + status = clReleaseCommandQueue((cl_command_queue)commandQueues[dev]); + ASSERT_CL_NO_RETURN("clReleaseCommandQueue()"); + commandQueues[dev] = (cl_command_queue)0; } + delete[] commandQueues; commandQueues = NULL; } - if (arg->name != NULL){ - free(arg->name); arg->name = NULL; + if (program != 0){ + status = clReleaseProgram((cl_program)program); + ASSERT_CL_NO_RETURN("clReleaseProgram()"); + program = (cl_program)0; } - if (arg->javaArg != NULL ) { - jenv->DeleteGlobalRef((jobject) arg->javaArg); + if (kernel != 0){ + status = clReleaseKernel((cl_kernel)kernel); + ASSERT_CL_NO_RETURN("clReleaseKernel()"); + kernel = (cl_kernel)0; } - delete arg; arg=args[i]=NULL; - } - delete[] args; args=NULL; + if (platforms){ + delete []platforms; platforms=NULL; + } + if (deviceIds){ + delete [] deviceIds; deviceIds=NULL; + } + if (argc> 0){ + for (int i=0; i< argc; i++){ + KernelArg *arg = args[i]; + if (!arg->isPrimitive()){ + if (arg->value.ref.mem != 0){ + status = clReleaseMemObject((cl_mem)arg->value.ref.mem); + ASSERT_CL_NO_RETURN("clReleaseMemObject()"); + arg->value.ref.mem = (cl_mem)0; + } + if (arg->value.ref.javaArray != NULL) { + jenv->DeleteWeakGlobalRef((jweak) arg->value.ref.javaArray); + } + } + if (arg->name != NULL){ + free(arg->name); arg->name = NULL; + } + if (arg->javaArg != NULL ) { + jenv->DeleteGlobalRef((jobject) arg->javaArg); + } + delete arg; arg=args[i]=NULL; + } + delete[] args; args=NULL; - delete []readEvents; readEvents =NULL; - delete []writeEvents; writeEvents = NULL; - delete []executeEvents; executeEvents = NULL; + delete []readEvents; readEvents =NULL; + delete []writeEvents; writeEvents = NULL; + delete []executeEvents; executeEvents = NULL; - if (isProfilingEnabled()) { - if (profileFile != NULL && profileFile != stderr) { - fclose(profileFile); + if (isProfilingEnabled()) { + if (profileFile != NULL && profileFile != stderr) { + fclose(profileFile); + } + delete[] readEventArgs; readEventArgs=0; + delete[] writeEventArgs; writeEventArgs=0; + } } - delete[] readEventArgs; readEventArgs=0; - delete[] writeEventArgs; writeEventArgs=0; - } - } - } + } - /* - Release JNI critical pinned arrays before returning to java code - */ - void unpinAll() { - for (int i=0; i< argc; i++){ - KernelArg *arg = args[i]; - if (arg->isBackedByArray()) { - arg->unpin(jenv); + /* + Release JNI critical pinned arrays before returning to java code + */ + void unpinAll() { + for (int i=0; i< argc; i++){ + KernelArg *arg = args[i]; + if (arg->isBackedByArray()) { + arg->unpin(jenv); + } + } } + + + }; + + jclass cacheKernelArgFields(JNIEnv *jenv, jobject jobj){ + jclass c = jenv->GetObjectClass(jobj); + nameFieldID = jenv->GetFieldID(c, "name", "Ljava/lang/String;"); ASSERT_FIELD(name); + typeFieldID = jenv->GetFieldID(c, "type", "I"); ASSERT_FIELD(type); + isStaticFieldID = jenv->GetFieldID(c, "isStatic", "Z"); ASSERT_FIELD(isStatic); + javaArrayFieldID = jenv->GetFieldID(c, "javaArray", "Ljava/lang/Object;"); ASSERT_FIELD(javaArray); + bytesPerLocalSizeFieldID = jenv->GetFieldID(c, "bytesPerLocalSize", "I"); ASSERT_FIELD(bytesPerLocalSize); + sizeInBytesFieldID = jenv->GetFieldID(c, "sizeInBytes", "I"); ASSERT_FIELD(sizeInBytes); + numElementsFieldID = jenv->GetFieldID(c, "numElements", "I"); ASSERT_FIELD(numElements); + return(c); + } + + JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_disposeJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle) { + cl_int status = CL_SUCCESS; + JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); + if (jniContext != NULL){ + delete jniContext;//free(jniContext); + jniContext = NULL; } + return(status); } + void idump(char *str, void *ptr, int size){ + int * iptr = (int *)ptr; + for (int i=0; i<size/sizeof(int); i++){ + fprintf(stderr, "%s%4d %d\n", str, i, iptr[i]); + } + } -}; + void fdump(char *str, void *ptr, int size){ + float * fptr = (float *)ptr; + for (int i=0; i<size/sizeof(float); i++){ + fprintf(stderr, "%s%4d %6.2f\n", str, i, fptr[i]); + } + } -jclass cacheKernelArgFields(JNIEnv *jenv, jobject jobj){ - jclass c = jenv->GetObjectClass(jobj); - nameFieldID = jenv->GetFieldID(c, "name", "Ljava/lang/String;"); ASSERT_FIELD(name); - typeFieldID = jenv->GetFieldID(c, "type", "I"); ASSERT_FIELD(type); - javaArrayFieldID = jenv->GetFieldID(c, "javaArray", "Ljava/lang/Object;"); ASSERT_FIELD(javaArray); - bytesPerLocalSizeFieldID = jenv->GetFieldID(c, "bytesPerLocalSize", "I"); ASSERT_FIELD(bytesPerLocalSize); - sizeInBytesFieldID = jenv->GetFieldID(c, "sizeInBytes", "I"); ASSERT_FIELD(sizeInBytes); - numElementsFieldID = jenv->GetFieldID(c, "numElements", "I"); ASSERT_FIELD(numElements); - return(c); -} -JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_disposeJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle) { - cl_int status = CL_SUCCESS; - JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); - if (jniContext != NULL){ - delete jniContext;//free(jniContext); - jniContext = NULL; - } - return(status); -} + jint writeProfileInfo(JNIContext* jniContext){ + cl_ulong currSampleBaseTime = -1; + int pos = 1; -void idump(char *str, void *ptr, int size){ - int * iptr = (int *)ptr; - for (int i=0; i<size/sizeof(int); i++){ - fprintf(stderr, "%s%4d %d\n", str, i, iptr[i]); - } -} + if (jniContext->firstRun) { + fprintf(jniContext->profileFile, "# PROFILE Name, queued, submit, start, end (microseconds)\n"); + } -void fdump(char *str, void *ptr, int size){ - float * fptr = (float *)ptr; - for (int i=0; i<size/sizeof(float); i++){ - fprintf(stderr, "%s%4d %6.2f\n", str, i, fptr[i]); - } -} + // A read by a user kernel means the OpenCL layer wrote to the kernel and vice versa + for (int i=0; i< jniContext->argc; i++){ + KernelArg *arg=jniContext->args[i]; + if (arg->isBackedByArray() && arg->isRead()){ + // Initialize the base time for this sample + if (currSampleBaseTime == -1) { + currSampleBaseTime = arg->value.ref.write.queued; + } -jint writeProfileInfo(JNIContext* jniContext){ - cl_ulong currSampleBaseTime = -1; - int pos = 1; + if (jniContext->profileBaseTime == 0){ + jniContext->profileBaseTime = arg->value.ref.write.queued; - if (jniContext->firstRun) { - fprintf(jniContext->profileFile, "# PROFILE Name, queued, submit, start, end (microseconds)\n"); - } + // Write the base time as the first item in the csv + //fprintf(jniContext->profileFile, "%llu,", jniContext->profileBaseTime); + } - // A read by a user kernel means the OpenCL layer wrote to the kernel and vice versa - for (int i=0; i< jniContext->argc; i++){ - KernelArg *arg=jniContext->args[i]; - if (arg->isBackedByArray() && arg->isRead()){ + fprintf(jniContext->profileFile, "%d write %s,", pos++, arg->name); - // Initialize the base time for this sample - if (currSampleBaseTime == -1) { - currSampleBaseTime = arg->value.ref.write.queued; - } + fprintf(jniContext->profileFile, "%lu,%lu,%lu,%lu,", + (arg->value.ref.write.queued - currSampleBaseTime)/1000, + (arg->value.ref.write.submit - currSampleBaseTime)/1000, + (arg->value.ref.write.start - currSampleBaseTime)/1000, + (arg->value.ref.write.end - currSampleBaseTime)/1000); + } + } if (jniContext->profileBaseTime == 0){ - jniContext->profileBaseTime = arg->value.ref.write.queued; + jniContext->profileBaseTime = jniContext->exec.queued; // Write the base time as the first item in the csv //fprintf(jniContext->profileFile, "%llu,", jniContext->profileBaseTime); } - fprintf(jniContext->profileFile, "%d write %s,", pos++, arg->name); + // Initialize the base time for this sample if necessary + if (currSampleBaseTime == -1) { + currSampleBaseTime = jniContext->exec.queued; + } + + // exec + fprintf(jniContext->profileFile, "%d exec,", pos++); fprintf(jniContext->profileFile, "%lu,%lu,%lu,%lu,", - (arg->value.ref.write.queued - currSampleBaseTime)/1000, - (arg->value.ref.write.submit - currSampleBaseTime)/1000, - (arg->value.ref.write.start - currSampleBaseTime)/1000, - (arg->value.ref.write.end - currSampleBaseTime)/1000); - } - } + (jniContext->exec.queued - currSampleBaseTime)/1000, + (jniContext->exec.submit - currSampleBaseTime)/1000, + (jniContext->exec.start - currSampleBaseTime)/1000, + (jniContext->exec.end - currSampleBaseTime)/1000); + + // + if ( jniContext->argc == 0 ) { + fprintf(jniContext->profileFile, "\n"); + } else { + for (int i=0; i< jniContext->argc; i++){ + KernelArg *arg=jniContext->args[i]; + if (arg->isBackedByArray() && arg->isWrite()){ + if (jniContext->profileBaseTime == 0){ + jniContext->profileBaseTime = arg->value.ref.read.queued; + + // Write the base time as the first item in the csv + //fprintf(jniContext->profileFile, "%llu,", jniContext->profileBaseTime); + } - if (jniContext->profileBaseTime == 0){ - jniContext->profileBaseTime = jniContext->exec.queued; + // Initialize the base time for this sample + if (currSampleBaseTime == -1) { + currSampleBaseTime = arg->value.ref.read.queued; + } - // Write the base time as the first item in the csv - //fprintf(jniContext->profileFile, "%llu,", jniContext->profileBaseTime); - } + fprintf(jniContext->profileFile, "%d read %s,", pos++, arg->name); - // Initialize the base time for this sample if necessary - if (currSampleBaseTime == -1) { - currSampleBaseTime = jniContext->exec.queued; - } - - // exec - fprintf(jniContext->profileFile, "%d exec,", pos++); - - fprintf(jniContext->profileFile, "%lu,%lu,%lu,%lu,", - (jniContext->exec.queued - currSampleBaseTime)/1000, - (jniContext->exec.submit - currSampleBaseTime)/1000, - (jniContext->exec.start - currSampleBaseTime)/1000, - (jniContext->exec.end - currSampleBaseTime)/1000); - - // - if ( jniContext->argc == 0 ) { - fprintf(jniContext->profileFile, "\n"); - } else { - for (int i=0; i< jniContext->argc; i++){ - KernelArg *arg=jniContext->args[i]; - if (arg->isBackedByArray() && arg->isWrite()){ - if (jniContext->profileBaseTime == 0){ - jniContext->profileBaseTime = arg->value.ref.read.queued; - - // Write the base time as the first item in the csv - //fprintf(jniContext->profileFile, "%llu,", jniContext->profileBaseTime); + fprintf(jniContext->profileFile, "%lu,%lu,%lu,%lu,", + (arg->value.ref.read.queued - currSampleBaseTime)/1000, + (arg->value.ref.read.submit - currSampleBaseTime)/1000, + (arg->value.ref.read.start - currSampleBaseTime)/1000, + (arg->value.ref.read.end - currSampleBaseTime)/1000); + } } + } + fprintf(jniContext->profileFile, "\n"); + return(0); + } - // Initialize the base time for this sample - if (currSampleBaseTime == -1) { - currSampleBaseTime = arg->value.ref.read.queued; - } + // Should failed profiling abort the run and return early? + cl_int profile(ProfileInfo *profileInfo, cl_event *event){ + cl_int status = CL_SUCCESS; + status = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_QUEUED, sizeof(profileInfo->queued), &(profileInfo->queued), NULL); + ASSERT_CL( "clGetEventProfiliningInfo() QUEUED"); + status = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_SUBMIT, sizeof(profileInfo->submit), &(profileInfo->submit), NULL); + ASSERT_CL( "clGetEventProfiliningInfo() SUBMIT"); + status = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_START, sizeof(profileInfo->start), &(profileInfo->start), NULL); + ASSERT_CL( "clGetEventProfiliningInfo() START"); + status = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_END, sizeof(profileInfo->end), &(profileInfo->end), NULL); + ASSERT_CL( "clGetEventProfiliningInfo() END"); + return status; + } - fprintf(jniContext->profileFile, "%d read %s,", pos++, arg->name); - fprintf(jniContext->profileFile, "%lu,%lu,%lu,%lu,", - (arg->value.ref.read.queued - currSampleBaseTime)/1000, - (arg->value.ref.read.submit - currSampleBaseTime)/1000, - (arg->value.ref.read.start - currSampleBaseTime)/1000, - (arg->value.ref.read.end - currSampleBaseTime)/1000); - } - } - } - fprintf(jniContext->profileFile, "\n"); - return(0); -} -// Should failed profiling abort the run and return early? -cl_int profile(ProfileInfo *profileInfo, cl_event *event){ - cl_int status = CL_SUCCESS; - status = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_QUEUED, sizeof(profileInfo->queued), &(profileInfo->queued), NULL); - ASSERT_CL( "clGetEventProfiliningInfo() QUEUED"); - status = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_SUBMIT, sizeof(profileInfo->submit), &(profileInfo->submit), NULL); - ASSERT_CL( "clGetEventProfiliningInfo() SUBMIT"); - status = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_START, sizeof(profileInfo->start), &(profileInfo->start), NULL); - ASSERT_CL( "clGetEventProfiliningInfo() START"); - status = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_END, sizeof(profileInfo->end), &(profileInfo->end), NULL); - ASSERT_CL( "clGetEventProfiliningInfo() END"); - return status; -} + jint updateKernel(JNIEnv *jenv, jobject jobj, JNIContext* jniContext) { + cl_int status = CL_SUCCESS; + if (jniContext != NULL){ + // we need to step through the array of KernelArg's to create the info required to create the cl_mem buffers. + for (jint i=0; i<jniContext->argc; i++){ + KernelArg *arg=jniContext->args[i]; + + arg->type = jenv->GetIntField(arg->javaArg, typeFieldID); + if (jniContext->isVerbose()){ + fprintf(stderr, "got type for %s: %08x\n", arg->name, arg->type); + } + if (!arg->isPrimitive()) { + // Following used for all primitive arrays, object arrays and nio Buffers + jarray newRef = (jarray)jenv->GetObjectField(arg->javaArg, javaArrayFieldID); + if (jniContext->isVerbose()){ + + fprintf(stderr, "testing for Resync javaArray %s: old=%p, new=%p\n", arg->name, arg->value.ref.javaArray, newRef); + } + + jboolean isSame = jenv->IsSameObject( newRef, arg->value.ref.javaArray); + if (isSame == JNI_FALSE) { + if (jniContext->isVerbose()){ + fprintf(stderr, "Resync javaArray for %s: %p %p\n", arg->name, newRef, arg->value.ref.javaArray); + } + // Free previous ref if any + if (arg->value.ref.javaArray != NULL) { + jenv->DeleteWeakGlobalRef((jweak) arg->value.ref.javaArray); + if (jniContext->isVerbose()){ + fprintf(stderr, "DeleteWeakGlobalRef for %s: %p\n", arg->name, arg->value.ref.javaArray); + } + } + + // need to free opencl buffers, run will reallocate later + if (arg->value.ref.mem != 0) { + //fprintf(stderr, "-->releaseMemObject[%d]\n", i); + status = clReleaseMemObject((cl_mem)arg->value.ref.mem); + //fprintf(stderr, "<--releaseMemObject[%d]\n", i); + ASSERT_CL("clReleaseMemObject()"); + arg->value.ref.mem = (cl_mem)0; + } + + arg->value.ref.mem = (cl_mem) 0; + arg->value.ref.addr = NULL; + // Capture new array ref from the kernel arg object + if (newRef != NULL) { + arg->value.ref.javaArray = (jarray)jenv->NewWeakGlobalRef((jarray)newRef); + if (jniContext->isVerbose()){ + fprintf(stderr, "NewWeakGlobalRef for %s, set to %p\n", arg->name, + arg->value.ref.javaArray); + } + } else { + arg->value.ref.javaArray = NULL; + } + arg->value.ref.isArray = !arg->isAparapiBufIsDirect(); -jint updateKernel(JNIEnv *jenv, jobject jobj, JNIContext* jniContext) { - cl_int status = CL_SUCCESS; - if (jniContext != NULL){ - // we need to step through the array of KernelArg's to create the info required to create the cl_mem buffers. - for (jint i=0; i<jniContext->argc; i++){ - KernelArg *arg=jniContext->args[i]; + // Save the sizeInBytes which was set on the java side + arg->sizeInBytes = jenv->GetIntField(arg->javaArg, sizeInBytesFieldID); + + if (jniContext->isVerbose()){ + fprintf(stderr, "updateKernel, args[%d].sizeInBytes=%d\n", i, arg->sizeInBytes); + } + } // !is_same + } + } // for each arg + } // if jniContext != NULL + + return(status); + } - arg->type = jenv->GetIntField(arg->javaArg, typeFieldID); + + + JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_runKernelJNI(JNIEnv *jenv, + jobject jobj, jlong jniContextHandle, jint globalSize, jint localSize, jboolean needSync, + jboolean useNullForLocalSize, jint passes) { + + cl_int status = CL_SUCCESS; + JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); if (jniContext->isVerbose()){ - fprintf(stderr, "got type for %s: %08x\n", arg->name, arg->type); + timer.start(); } - if (!arg->isPrimitive()) { - // Following used for all primitive arrays, object arrays and nio Buffers - jarray newRef = (jarray)jenv->GetObjectField(arg->javaArg, javaArrayFieldID); + + // Need to capture array refs + if (jniContext->firstRun || needSync) { + updateKernel(jenv, jobj, jniContext ); if (jniContext->isVerbose()){ + fprintf(stderr, "back from updateKernel\n"); + } + } - fprintf(stderr, "testing for Resync javaArray %s: old=%p, new=%p\n", arg->name, arg->value.ref.javaArray, newRef); + int writeEventCount = 0; + + // kernelArgPos is used to keep track of the kernel arg position, it can + // differ from "i" due to insertion of javaArrayLength args which are not + // fields read from the kernel object. + int kernelArgPos = 0; + + for (int i=0; i< jniContext->argc; i++){ + KernelArg *arg = jniContext->args[i]; + // TODO: see if we can get rid of this read + arg->type = jenv->GetIntField(arg->javaArg, typeFieldID); + if (jniContext->isVerbose()){ + fprintf(stderr, "got type for arg %d, %s, type=%08x\n", i, arg->name, arg->type); } + if (!arg->isPrimitive() && !arg->isLocal()) { + // pin the arrays so that GC does not move them during the call + + // get the C memory address for the region being transferred + // this uses different JNI calls for arrays vs. directBufs + void * prevAddr = arg->value.ref.addr; + if (arg->value.ref.isArray) { + arg->pin(jenv); + } else if (arg->isAparapiBufIsDirect()) { + // different call used for directbuffers + arg->value.ref.addr = jenv->GetDirectBufferAddress(arg->value.ref.javaArray); + } - jboolean isSame = jenv->IsSameObject( newRef, arg->value.ref.javaArray); - if (isSame == JNI_FALSE) { if (jniContext->isVerbose()){ - fprintf(stderr, "Resync javaArray for %s: %p %p\n", arg->name, newRef, arg->value.ref.javaArray); - } - // Free previous ref if any - if (arg->value.ref.javaArray != NULL) { - jenv->DeleteWeakGlobalRef((jweak) arg->value.ref.javaArray); - if (jniContext->isVerbose()){ - fprintf(stderr, "DeleteWeakGlobalRef for %s: %p\n", arg->name, arg->value.ref.javaArray); + fprintf(stderr, "runKernel: arrayOrBuf ref %p, oldAddr=%p, newAddr=%p, ref.mem=%p, isArray=%d\n", + arg->value.ref.javaArray, + prevAddr, + arg->value.ref.addr, + arg->value.ref.mem, + arg->value.ref.isArray ); + fprintf(stderr, "at memory addr %p, contents: ", arg->value.ref.addr); + unsigned char *pb = (unsigned char *) arg->value.ref.addr; + for (int k=0; k<8; k++) { + fprintf(stderr, "%02x ", pb[k]); } + fprintf(stderr, "\n" ); } + // record whether object moved + // if we see that isCopy was returned by getPrimitiveArrayCritical, treat that as a move + bool objectMoved = (arg->value.ref.addr != prevAddr) || arg->value.ref.isCopy; - // need to free opencl buffers, run will reallocate later - if (arg->value.ref.mem != 0) { - //fprintf(stderr, "-->releaseMemObject[%d]\n", i); - status = clReleaseMemObject((cl_mem)arg->value.ref.mem); - //fprintf(stderr, "<--releaseMemObject[%d]\n", i); - ASSERT_CL("clReleaseMemObject()"); - arg->value.ref.mem = (cl_mem)0; +#ifdef VERBOSE_EXPLICIT + if (arg->isExplicit() && arg->isExplicitWrite()){ + fprintf(stderr, "explicit write of %s\n", arg->name); } +#endif - arg->value.ref.mem = (cl_mem) 0; - arg->value.ref.addr = NULL; + if (jniContext->firstRun || (arg->value.ref.mem == 0) || objectMoved ){ + // if either this is the first run or user changed input array + // or gc moved something, then we create buffers/args + cl_uint mask = CL_MEM_USE_HOST_PTR; + if (arg->isRead() && arg->isWrite()) mask |= CL_MEM_READ_WRITE; + else if (arg->isRead() && !arg->isWrite()) mask |= CL_MEM_READ_ONLY; + else if (arg->isWrite()) mask |= CL_MEM_WRITE_ONLY; + arg->value.ref.memMask = mask; + if (jniContext->isVerbose()){ + strcpy(arg->value.ref.memSpec,"CL_MEM_USE_HOST_PTR"); + if (mask & CL_MEM_READ_WRITE) strcat(arg->value.ref.memSpec,"|CL_MEM_READ_WRITE"); + if (mask & CL_MEM_READ_ONLY) strcat(arg->value.ref.memSpec,"|CL_MEM_READ_ONLY"); + if (mask & CL_MEM_WRITE_ONLY) strcat(arg->value.ref.memSpec,"|CL_MEM_WRITE_ONLY"); - // Capture new array ref from the kernel arg object + fprintf(stderr, "%s %d clCreateBuffer(context, %s, size=%08x bytes, address=%08x, &status)\n", arg->name, + i, arg->value.ref.memSpec, arg->sizeInBytes, arg->value.ref.addr); + } + arg->value.ref.mem = clCreateBuffer(jniContext->context, arg->value.ref.memMask, + arg->sizeInBytes, arg->value.ref.addr, &status); - if (newRef != NULL) { - arg->value.ref.javaArray = (jarray)jenv->NewWeakGlobalRef((jarray)newRef); - if (jniContext->isVerbose()){ - fprintf(stderr, "NewWeakGlobalRef for %s, set to %p\n", arg->name, - arg->value.ref.javaArray); + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clCreateBuffer"); + jniContext->unpinAll(); + return status; + } + + status = clSetKernelArg(jniContext->kernel, kernelArgPos++, sizeof(cl_mem), (void *)&(arg->value.ref.mem)); + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clSetKernelArg (array)"); + jniContext->unpinAll(); + return status; + } + + // Add the array length if needed + if (arg->usesArrayLength()){ + arg->value.ref.javaArrayLength = jenv->GetIntField(arg->javaArg, numElementsFieldID); + status = clSetKernelArg(jniContext->kernel, kernelArgPos++, sizeof(jint), &(arg->value.ref.javaArrayLength)); + + if (jniContext->isVerbose()){ + fprintf(stderr, "runKernel arg %d %s, javaArrayLength = %d\n", i, arg->name, arg->value.ref.javaArrayLength); + } + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clSetKernelArg (array length)"); + jniContext->unpinAll(); + return status; + } } } else { - arg->value.ref.javaArray = NULL; + // Keep the arg position in sync if no updates were required + kernelArgPos++; + if (arg->usesArrayLength()){ + kernelArgPos++; + } } - arg->value.ref.isArray = !arg->isAparapiBufIsDirect(); - // Save the sizeInBytes which was set on the java side - arg->sizeInBytes = jenv->GetIntField(arg->javaArg, sizeInBytesFieldID); + // we only enqueue a write if we know the kernel actually reads the buffer or if there is an explicit write pending + // the default behavior for Constant buffers is also that there is no write enqueued unless explicit - if (jniContext->isVerbose()){ - fprintf(stderr, "updateKernel, args[%d].sizeInBytes=%d\n", i, arg->sizeInBytes); - } - } // !is_same - } - } // for each arg - } // if jniContext != NULL + if (arg->mustWriteBuffer()){ +#ifdef VERBOSE_EXPLICIT + if (arg->isExplicit() && arg->isExplicitWrite()){ + fprintf(stderr, "writing explicit buffer %d %s\n", i, arg->name); + } +#endif + if (jniContext->isVerbose()){ + fprintf(stderr, "%s writing buffer %d %s\n", (arg->isExplicit() ? "explicitly" : ""), + i, arg->name); + } + if (jniContext->isProfilingEnabled()) { + jniContext->writeEventArgs[writeEventCount]=i; + } - return(status); -} + status = clEnqueueWriteBuffer(jniContext->commandQueues[0], arg->value.ref.mem, CL_FALSE, 0, + arg->sizeInBytes, arg->value.ref.addr, 0, NULL, &(jniContext->writeEvents[writeEventCount++])); + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clEnqueueWriteBuffer"); + jniContext->unpinAll(); + return status; + } + if (arg->isExplicit() && arg->isExplicitWrite()){ + arg->type &= ~com_amd_aparapi_KernelRunner_ARG_EXPLICIT_WRITE; +#ifdef VERBOSE_EXPLICIT + fprintf(stderr, "clearing explicit buffer bit %d %s\n", i, arg->name); +#endif + jenv->SetIntField(arg->javaArg, typeFieldID,arg->type ); + } + } + } else if (arg->isLocal()){ + if (jniContext->firstRun){ + // must multiply perlocalByteSize by localSize to get real opencl buffer size + int bytesPerLocalSize = jenv->GetIntField(arg->javaArg, bytesPerLocalSizeFieldID); + int adjustedLocalBufSize = bytesPerLocalSize * localSize; + if (jniContext->isVerbose()){ + fprintf(stderr, "ISLOCAL, clSetKernelArg(jniContext->kernel, %d, %d, NULL);\n", i, adjustedLocalBufSize); + } + status = clSetKernelArg(jniContext->kernel, kernelArgPos++, adjustedLocalBufSize, NULL); + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clSetKernelArg() (local)"); + jniContext->unpinAll(); + return status; + } + } else { + // Keep the arg position in sync if no updates were required + kernelArgPos++; + if (arg->usesArrayLength()){ + kernelArgPos++; + } + } + }else{ // primitive arguments + + // we need to reflectively sync the value out of the kernel object + if (arg->isFloat()){ + if (arg->isStatic){ + arg->value.f = jenv->GetStaticFloatField(jniContext->kernelClass, arg->fieldID); + }else{ + arg->value.f = jenv->GetFloatField(jniContext->kernelObject, arg->fieldID); + } + //fprintf(stderr, "float arg %d\n", arg->value.f); + }else if (arg->isInt()){ + if (arg->isStatic){ + arg->value.i = jenv->GetStaticIntField(jniContext->kernelClass, arg->fieldID); + }else{ + arg->value.i = jenv->GetIntField(jniContext->kernelObject, arg->fieldID); + } + //fprintf(stderr, "int arg %d\n", arg->value.i); + }else if (arg->isBoolean()){ + if (arg->isStatic){ + arg->value.c = jenv->GetStaticBooleanField(jniContext->kernelClass, arg->fieldID); + }else{ + arg->value.c = jenv->GetBooleanField(jniContext->kernelObject, arg->fieldID); + } + //fprintf(stderr, "boolean arg %d\n", arg->value.c); + }else if (arg->isByte()){ + if (arg->isStatic){ + arg->value.c = jenv->GetStaticByteField(jniContext->kernelClass, arg->fieldID); + }else{ + arg->value.c = jenv->GetByteField(jniContext->kernelObject, arg->fieldID); + } + //fprintf(stderr, "byte arg %d\n", arg->value.c); + }else if (arg->isLong()){ + if (arg->isStatic){ + arg->value.j = jenv->GetStaticLongField(jniContext->kernelClass, arg->fieldID); + }else{ + arg->value.j = jenv->GetLongField(jniContext->kernelObject, arg->fieldID); + } + //fprintf(stderr, "long arg %d\n", arg->value.c); + }else if (arg->isDouble()){ + if (arg->isStatic){ + arg->value.d = jenv->GetStaticDoubleField(jniContext->kernelClass, arg->fieldID); + }else{ + arg->value.d = jenv->GetDoubleField(jniContext->kernelObject, arg->fieldID); + } + //fprintf(stderr, "double arg %d\n", arg->value.c); + } + if (jniContext->isVerbose()){ + fprintf(stderr, "clSetKernelArg %s: %d %d %d 0x%08x\n", arg->name, i, kernelArgPos, + arg->sizeInBytes, arg->value); + } + status = clSetKernelArg(jniContext->kernel, kernelArgPos++, arg->sizeInBytes, &(arg->value)); + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clSetKernelArg() (value)"); + jniContext->unpinAll(); + return status; + } + } + } // for each arg -JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_runKernelJNI(JNIEnv *jenv, - jobject jobj, jlong jniContextHandle, jint globalSize, jint localSize, jboolean needSync, - jboolean useNullForLocalSize, jint passes) { + size_t globalSizeAsSizeT = (globalSize /jniContext->deviceIdc); + size_t localSizeAsSizeT = localSize; - cl_int status = CL_SUCCESS; - JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); - if (jniContext->isVerbose()){ - timer.start(); - } + // To support multiple passes we add a 'secret' final arg called 'passid' and just schedule multiple enqueuendrange kernels. Each of which having a separate value of passid - // Need to capture array refs - if (jniContext->firstRun || needSync) { - updateKernel(jenv, jobj, jniContext ); - if (jniContext->isVerbose()){ - fprintf(stderr, "back from updateKernel\n"); - } - } + for (int passid=0; passid<passes; passid++){ + for (int dev =0; dev < jniContext->deviceIdc; dev++){ + size_t offset = (size_t)((globalSize/jniContext->deviceIdc)*dev); + status = clSetKernelArg(jniContext->kernel, kernelArgPos, sizeof(passid), &(passid)); + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clSetKernelArg() (passid)"); + jniContext->unpinAll(); + return status; + } - int writeEventCount = 0; + // four options here due to passid + if (passid == 0 && passes==1){ + //fprintf(stderr, "setting passid to %d of %d first and last\n", passid, passes); + // there is one pass and this is it + // enqueue depends on write enqueues + // we don't block but and we populate the executeEvents + status = clEnqueueNDRangeKernel(jniContext->commandQueues[dev], jniContext->kernel, 1, &offset, &globalSizeAsSizeT, + useNullForLocalSize ? NULL : &localSizeAsSizeT, + writeEventCount, writeEventCount?jniContext->writeEvents:NULL, &jniContext->executeEvents[dev]); + }else if (passid == 0){ + //fprintf(stderr, "setting passid to %d of %d first not last\n", passid, passes); + // this is the first of multiple passes + // enqueue depends on write enqueues + // we block but do not populate executeEvents (only the last pass does this) + status = clEnqueueNDRangeKernel(jniContext->commandQueues[dev], jniContext->kernel, 1, &offset, &globalSizeAsSizeT, + useNullForLocalSize ? NULL : &localSizeAsSizeT, + writeEventCount, writeEventCount?jniContext->writeEvents:NULL, &jniContext->executeEvents[dev]); + + }else if (passid < passes-1){ + // we are in some middle pass (neither first or last) + // we don't depend on write enqueues + // we block and do not supply executeEvents (only the last pass does this) + //fprintf(stderr, "setting passid to %d of %d not first not last\n", passid, passes); + status = clEnqueueNDRangeKernel(jniContext->commandQueues[dev], jniContext->kernel, 1, &offset, &globalSizeAsSizeT, + useNullForLocalSize ? NULL : &localSizeAsSizeT, 0, NULL, &jniContext->executeEvents[dev]); + }else{ + // we are the last pass of >1 + // we don't depend on write enqueues + // we block and supply executeEvents + //fprintf(stderr, "setting passid to %d of %d last\n", passid, passes); + status = clEnqueueNDRangeKernel(jniContext->commandQueues[dev], jniContext->kernel, 1, &offset, &globalSizeAsSizeT, + useNullForLocalSize ? NULL : &localSizeAsSizeT, 0, NULL, &jniContext->executeEvents[dev]); + } - // kernelArgPos is used to keep track of the kernel arg position, it can - // differ from "i" due to insertion of javaArrayLength args which are not - // fields read from the kernel object. - int kernelArgPos = 0; - for (int i=0; i< jniContext->argc; i++){ - KernelArg *arg = jniContext->args[i]; - // TODO: see if we can get rid of this read - arg->type = jenv->GetIntField(arg->javaArg, typeFieldID); - if (jniContext->isVerbose()){ - fprintf(stderr, "got type for arg %d, %s, type=%08x\n", i, arg->name, arg->type); - } - if (!arg->isPrimitive() && !arg->isLocal()) { - // pin the arrays so that GC does not move them during the call - - // get the C memory address for the region being transferred - // this uses different JNI calls for arrays vs. directBufs - void * prevAddr = arg->value.ref.addr; - if (arg->value.ref.isArray) { - arg->pin(jenv); - } else if (arg->isAparapiBufIsDirect()) { - // different call used for directbuffers - arg->value.ref.addr = jenv->GetDirectBufferAddress(arg->value.ref.javaArray); - } + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clEnqueueNDRangeKernel()"); + fprintf(stderr, "after clEnqueueNDRangeKernel, globalSize=%d localSize=%d usingNull=%d\n", (int)globalSizeAsSizeT, (int)localSizeAsSizeT, useNullForLocalSize); + jniContext->unpinAll(); + return status; + } + } + if (passid < passes-1){ + // we need to wait for the executions to complete... + status = clWaitForEvents(jniContext->deviceIdc, jniContext->executeEvents); + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clWaitForEvents() execute events mid pass"); + jniContext->unpinAll(); + return status; + } - if (jniContext->isVerbose()){ - fprintf(stderr, "runKernel: arrayOrBuf ref %p, oldAddr=%p, newAddr=%p, ref.mem=%p, isArray=%d\n", - arg->value.ref.javaArray, - prevAddr, - arg->value.ref.addr, - arg->value.ref.mem, - arg->value.ref.isArray ); - fprintf(stderr, "at memory addr %p, contents: ", arg->value.ref.addr); - unsigned char *pb = (unsigned char *) arg->value.ref.addr; - for (int k=0; k<8; k++) { - fprintf(stderr, "%02x ", pb[k]); + for (int dev = 0; dev < jniContext->deviceIdc; dev++){ + status = clReleaseEvent(jniContext->executeEvents[dev]); + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clReleaseEvent() read event"); + jniContext->unpinAll(); + return status; + } + } } - fprintf(stderr, "\n" ); } - // record whether object moved - // if we see that isCopy was returned by getPrimitiveArrayCritical, treat that as a move - bool objectMoved = (arg->value.ref.addr != prevAddr) || arg->value.ref.isCopy; -#ifdef VERBOSE_EXPLICIT - if (arg->isExplicit() && arg->isExplicitWrite()){ - fprintf(stderr, "explicit write of %s\n", arg->name); - } -#endif + int readEventCount = 0; - if (jniContext->firstRun || (arg->value.ref.mem == 0) || objectMoved ){ - // if either this is the first run or user changed input array - // or gc moved something, then we create buffers/args - cl_uint mask = CL_MEM_USE_HOST_PTR; - if (arg->isRead() && arg->isWrite()) mask |= CL_MEM_READ_WRITE; - else if (arg->isRead() && !arg->isWrite()) mask |= CL_MEM_READ_ONLY; - else if (arg->isWrite()) mask |= CL_MEM_WRITE_ONLY; - arg->value.ref.memMask = mask; - if (jniContext->isVerbose()){ - strcpy(arg->value.ref.memSpec,"CL_MEM_USE_HOST_PTR"); - if (mask & CL_MEM_READ_WRITE) strcat(arg->value.ref.memSpec,"|CL_MEM_READ_WRITE"); - if (mask & CL_MEM_READ_ONLY) strcat(arg->value.ref.memSpec,"|CL_MEM_READ_ONLY"); - if (mask & CL_MEM_WRITE_ONLY) strcat(arg->value.ref.memSpec,"|CL_MEM_WRITE_ONLY"); + for (int i=0; i< jniContext->argc; i++){ + KernelArg *arg = jniContext->args[i]; - fprintf(stderr, "%s %d clCreateBuffer(context, %s, size=%08x bytes, address=%08x, &status)\n", arg->name, - i, arg->value.ref.memSpec, arg->sizeInBytes, arg->value.ref.addr); - } - arg->value.ref.mem = clCreateBuffer(jniContext->context, arg->value.ref.memMask, - arg->sizeInBytes, arg->value.ref.addr, &status); + if (arg->mustReadBuffer()){ + if (jniContext->isProfilingEnabled()) { + jniContext->readEventArgs[readEventCount]=i; + } + if (jniContext->isVerbose()){ + fprintf(stderr, "reading buffer %d %s\n", i, arg->name); + } - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clCreateBuffer"); - jniContext->unpinAll(); - return status; + status = clEnqueueReadBuffer(jniContext->commandQueues[0], arg->value.ref.mem, CL_FALSE, 0, + arg->sizeInBytes,arg->value.ref.addr , jniContext->deviceIdc, jniContext->executeEvents, &(jniContext->readEvents[readEventCount++])); + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clEnqueueReadBuffer()"); + jniContext->unpinAll(); + return status; + } } + } - status = clSetKernelArg(jniContext->kernel, kernelArgPos++, sizeof(cl_mem), (void *)&(arg->value.ref.mem)); + // don't change the order here + // We wait for the reads which each depend on the execution, which depends on the writes ;) + // So after the reads have completed, we can release the execute and writes. + + if (readEventCount >0){ + status = clWaitForEvents(readEventCount, jniContext->readEvents); if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clSetKernelArg (array)"); + PRINT_CL_ERR(status, "clWaitForEvents() read events"); jniContext->unpinAll(); return status; } - // Add the array length if needed - if (arg->usesArrayLength()){ - arg->value.ref.javaArrayLength = jenv->GetIntField(arg->javaArg, numElementsFieldID); - status = clSetKernelArg(jniContext->kernel, kernelArgPos++, sizeof(jint), &(arg->value.ref.javaArrayLength)); - - if (jniContext->isVerbose()){ - fprintf(stderr, "runKernel arg %d %s, javaArrayLength = %d\n", i, arg->name, arg->value.ref.javaArrayLength); + for (int i=0; i< readEventCount; i++){ + if (jniContext->isProfilingEnabled()) { + status = profile(&jniContext->args[jniContext->readEventArgs[i]]->value.ref.read, &jniContext->readEvents[i]); + if (status != CL_SUCCESS) { + jniContext->unpinAll(); + return status; + } } + status = clReleaseEvent(jniContext->readEvents[i]); if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clSetKernelArg (array length)"); + PRINT_CL_ERR(status, "clReleaseEvent() read event"); jniContext->unpinAll(); return status; } } } else { - // Keep the arg position in sync if no updates were required - kernelArgPos++; - if (arg->usesArrayLength()){ - kernelArgPos++; - } - } - - // we only enqueue a write if we know the kernel actually reads the buffer or if there is an explicit write pending - // the default behavior for Constant buffers is also that there is no write enqueued unless explicit - - if (arg->mustWriteBuffer()){ -#ifdef VERBOSE_EXPLICIT - if (arg->isExplicit() && arg->isExplicitWrite()){ - fprintf(stderr, "writing explicit buffer %d %s\n", i, arg->name); - } -#endif - if (jniContext->isVerbose()){ - fprintf(stderr, "%s writing buffer %d %s\n", (arg->isExplicit() ? "explicitly" : ""), - i, arg->name); - } - if (jniContext->isProfilingEnabled()) { - jniContext->writeEventArgs[writeEventCount]=i; - } - - status = clEnqueueWriteBuffer(jniContext->commandQueues[0], arg->value.ref.mem, CL_FALSE, 0, - arg->sizeInBytes, arg->value.ref.addr, 0, NULL, &(jniContext->writeEvents[writeEventCount++])); + // if readEventCount == 0 then we don't need any reads so we just wait for the executions to complete + status = clWaitForEvents(jniContext->deviceIdc, jniContext->executeEvents); if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clEnqueueWriteBuffer"); + PRINT_CL_ERR(status, "clWaitForEvents() execute event"); jniContext->unpinAll(); return status; } - if (arg->isExplicit() && arg->isExplicitWrite()){ - arg->type &= ~com_amd_aparapi_KernelRunner_ARG_EXPLICIT_WRITE; -#ifdef VERBOSE_EXPLICIT - fprintf(stderr, "clearing explicit buffer bit %d %s\n", i, arg->name); -#endif - jenv->SetIntField(arg->javaArg, typeFieldID,arg->type ); - } } - } else if (arg->isLocal()){ - if (jniContext->firstRun){ - // must multiply perlocalByteSize by localSize to get real opencl buffer size - int bytesPerLocalSize = jenv->GetIntField(arg->javaArg, bytesPerLocalSizeFieldID); - int adjustedLocalBufSize = bytesPerLocalSize * localSize; - if (jniContext->isVerbose()){ - fprintf(stderr, "ISLOCAL, clSetKernelArg(jniContext->kernel, %d, %d, NULL);\n", i, adjustedLocalBufSize); - } - status = clSetKernelArg(jniContext->kernel, kernelArgPos++, adjustedLocalBufSize, NULL); + if (jniContext->isProfilingEnabled()) { + status = profile(&jniContext->exec, &jniContext->executeEvents[0]); if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clSetKernelArg() (local)"); jniContext->unpinAll(); return status; } - } else { - // Keep the arg position in sync if no updates were required - kernelArgPos++; - if (arg->usesArrayLength()){ - kernelArgPos++; - } - } - }else{ // primitive arguments - - // we need to reflectively sync the value out of the kernel object - if (arg->isFloat()){ - arg->value.f = jenv->GetFloatField(jniContext->kernelObject, arg->fieldID); - //fprintf(stderr, "float arg %d\n", arg->value.f); - }else if (arg->isInt()){ - arg->value.i = jenv->GetIntField(jniContext->kernelObject, arg->fieldID); - //fprintf(stderr, "int arg %d\n", arg->value.i); - }else if (arg->isBoolean()){ - arg->value.c = jenv->GetBooleanField(jniContext->kernelObject, arg->fieldID); - //fprintf(stderr, "boolean arg %d\n", arg->value.c); - }else if (arg->isByte()){ - arg->value.c = jenv->GetByteField(jniContext->kernelObject, arg->fieldID); - //fprintf(stderr, "byte arg %d\n", arg->value.c); - }else if (arg->isLong()){ - arg->value.j = jenv->GetLongField(jniContext->kernelObject, arg->fieldID); - //fprintf(stderr, "long arg %d\n", arg->value.c); - }else if (arg->isDouble()){ - arg->value.d = jenv->GetDoubleField(jniContext->kernelObject, arg->fieldID); - //fprintf(stderr, "double arg %d\n", arg->value.c); } - if (jniContext->isVerbose()){ - fprintf(stderr, "clSetKernelArg %s: %d %d %d 0x%08x\n", arg->name, i, kernelArgPos, - arg->sizeInBytes, arg->value); - } - status = clSetKernelArg(jniContext->kernel, kernelArgPos++, arg->sizeInBytes, &(arg->value)); + // extract the execution status from the executeEvent + cl_int executeStatus; + status = clGetEventInfo(jniContext->executeEvents[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &executeStatus, NULL); if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clSetKernelArg() (value)"); + PRINT_CL_ERR(status, "clGetEventInfo() execute event"); jniContext->unpinAll(); return status; } - } - } // for each arg - - size_t globalSizeAsSizeT = (globalSize /jniContext->deviceIdc); - size_t localSizeAsSizeT = localSize; - - // To support multiple passes we add a 'secret' final arg called 'passid' and just schedule multiple enqueuendrange kernels. Each of which having a separate value of passid - - for (int passid=0; passid<passes; passid++){ - for (int dev =0; dev < jniContext->deviceIdc; dev++){ - size_t offset = (size_t)((globalSize/jniContext->deviceIdc)*dev); - status = clSetKernelArg(jniContext->kernel, kernelArgPos, sizeof(passid), &(passid)); - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clSetKernelArg() (passid)"); + if (executeStatus != CL_SUCCESS) { + // it should definitely not be negative, but since we did a wait above, it had better be CL_COMPLETE==CL_SUCCESS + PRINT_CL_ERR(executeStatus, "Execution status of execute event"); jniContext->unpinAll(); - return status; + return executeStatus; } - // four options here due to passid - if (passid == 0 && passes==1){ - //fprintf(stderr, "setting passid to %d of %d first and last\n", passid, passes); - // there is one pass and this is it - // enqueue depends on write enqueues - // we don't block but and we populate the executeEvents - status = clEnqueueNDRangeKernel(jniContext->commandQueues[dev], jniContext->kernel, 1, &offset, &globalSizeAsSizeT, - useNullForLocalSize ? NULL : &localSizeAsSizeT, - writeEventCount, writeEventCount?jniContext->writeEvents:NULL, &jniContext->executeEvents[dev]); - }else if (passid == 0){ - //fprintf(stderr, "setting passid to %d of %d first not last\n", passid, passes); - // this is the first of multiple passes - // enqueue depends on write enqueues - // we block but do not populate executeEvents (only the last pass does this) - status = clEnqueueNDRangeKernel(jniContext->commandQueues[dev], jniContext->kernel, 1, &offset, &globalSizeAsSizeT, - useNullForLocalSize ? NULL : &localSizeAsSizeT, - writeEventCount, writeEventCount?jniContext->writeEvents:NULL, &jniContext->executeEvents[dev]); - - }else if (passid < passes-1){ - // we are in some middle pass (neither first or last) - // we don't depend on write enqueues - // we block and do not supply executeEvents (only the last pass does this) - //fprintf(stderr, "setting passid to %d of %d not first not last\n", passid, passes); - status = clEnqueueNDRangeKernel(jniContext->commandQueues[dev], jniContext->kernel, 1, &offset, &globalSizeAsSizeT, - useNullForLocalSize ? NULL : &localSizeAsSizeT, 0, NULL, &jniContext->executeEvents[dev]); - }else{ - // we are the last pass of >1 - // we don't depend on write enqueues - // we block and supply executeEvents - //fprintf(stderr, "setting passid to %d of %d last\n", passid, passes); - status = clEnqueueNDRangeKernel(jniContext->commandQueues[dev], jniContext->kernel, 1, &offset, &globalSizeAsSizeT, - useNullForLocalSize ? NULL : &localSizeAsSizeT, 0, NULL, &jniContext->executeEvents[dev]); - } - - - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clEnqueueNDRangeKernel()"); - fprintf(stderr, "after clEnqueueNDRangeKernel, globalSize=%d localSize=%d usingNull=%d\n", (int)globalSizeAsSizeT, (int)localSizeAsSizeT, useNullForLocalSize); - jniContext->unpinAll(); - return status; - } - } - if (passid < passes-1){ - // we need to wait for the executions to complete... - status = clWaitForEvents(jniContext->deviceIdc, jniContext->executeEvents); - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clWaitForEvents() execute events mid pass"); - jniContext->unpinAll(); - return status; - } + for (int dev=0; dev<jniContext->deviceIdc; dev++){ - for (int dev = 0; dev < jniContext->deviceIdc; dev++){ status = clReleaseEvent(jniContext->executeEvents[dev]); if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clReleaseEvent() read event"); + PRINT_CL_ERR(status, "clReleaseEvent() execute event"); jniContext->unpinAll(); return status; } } - } - } - - int readEventCount = 0; - - for (int i=0; i< jniContext->argc; i++){ - KernelArg *arg = jniContext->args[i]; - - if (arg->mustReadBuffer()){ - if (jniContext->isProfilingEnabled()) { - jniContext->readEventArgs[readEventCount]=i; - } - if (jniContext->isVerbose()){ - fprintf(stderr, "reading buffer %d %s\n", i, arg->name); - } - status = clEnqueueReadBuffer(jniContext->commandQueues[0], arg->value.ref.mem, CL_FALSE, 0, - arg->sizeInBytes,arg->value.ref.addr , jniContext->deviceIdc, jniContext->executeEvents, &(jniContext->readEvents[readEventCount++])); - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clEnqueueReadBuffer()"); - jniContext->unpinAll(); - return status; - } - } - } - - // don't change the order here - // We wait for the reads which each depend on the execution, which depends on the writes ;) - // So after the reads have completed, we can release the execute and writes. - - if (readEventCount >0){ - status = clWaitForEvents(readEventCount, jniContext->readEvents); - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clWaitForEvents() read events"); - jniContext->unpinAll(); - return status; - } - - for (int i=0; i< readEventCount; i++){ - if (jniContext->isProfilingEnabled()) { - status = profile(&jniContext->args[jniContext->readEventArgs[i]]->value.ref.read, &jniContext->readEvents[i]); + for (int i=0; i< writeEventCount; i++){ + if (jniContext->isProfilingEnabled()) { + profile(&jniContext->args[jniContext->writeEventArgs[i]]->value.ref.write, &jniContext->writeEvents[i]); + } + status = clReleaseEvent(jniContext->writeEvents[i]); if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clReleaseEvent() write event"); jniContext->unpinAll(); return status; } } - status = clReleaseEvent(jniContext->readEvents[i]); - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clReleaseEvent() read event"); - jniContext->unpinAll(); - return status; - } - } - } else { - // if readEventCount == 0 then we don't need any reads so we just wait for the executions to complete - status = clWaitForEvents(jniContext->deviceIdc, jniContext->executeEvents); - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clWaitForEvents() execute event"); - jniContext->unpinAll(); - return status; - } - } - if (jniContext->isProfilingEnabled()) { - status = profile(&jniContext->exec, &jniContext->executeEvents[0]); - if (status != CL_SUCCESS) { jniContext->unpinAll(); - return status; - } - } - // extract the execution status from the executeEvent - cl_int executeStatus; - status = clGetEventInfo(jniContext->executeEvents[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &executeStatus, NULL); - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clGetEventInfo() execute event"); - jniContext->unpinAll(); - return status; - } - if (executeStatus != CL_SUCCESS) { - // it should definitely not be negative, but since we did a wait above, it had better be CL_COMPLETE==CL_SUCCESS - PRINT_CL_ERR(executeStatus, "Execution status of execute event"); - jniContext->unpinAll(); - return executeStatus; - } - - for (int dev=0; dev<jniContext->deviceIdc; dev++){ + if (jniContext->isProfilingEnabled()) { + writeProfileInfo(jniContext); + } - status = clReleaseEvent(jniContext->executeEvents[dev]); - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clReleaseEvent() execute event"); - jniContext->unpinAll(); - return status; - } - } + jniContext->firstRun = false; + if (jniContext->isVerbose()){ + timer.end("elapsed"); + } - for (int i=0; i< writeEventCount; i++){ - if (jniContext->isProfilingEnabled()) { - profile(&jniContext->args[jniContext->writeEventArgs[i]]->value.ref.write, &jniContext->writeEvents[i]); + //fprintf(stderr, "About to return %d from exec\n", status); + return(status); } - status = clReleaseEvent(jniContext->writeEvents[i]); - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clReleaseEvent() write event"); - jniContext->unpinAll(); - return status; - } - } - - jniContext->unpinAll(); - - if (jniContext->isProfilingEnabled()) { - writeProfileInfo(jniContext); - } - - jniContext->firstRun = false; - if (jniContext->isVerbose()){ - timer.end("elapsed"); - } - - //fprintf(stderr, "About to return %d from exec\n", status); - return(status); -} -// we return the JNIContext from here -JNIEXPORT jlong JNICALL Java_com_amd_aparapi_KernelRunner_initJNI(JNIEnv *jenv, jclass clazz, jobject kernelObject, - jint flags, jint numProcessors, - jint maxJTPLocalSize) { - cl_int status = CL_SUCCESS; - JNIContext* jniContext = new JNIContext(jenv, kernelObject, flags, numProcessors, maxJTPLocalSize); - if (jniContext->isValid()){ - return((jlong)jniContext); - }else{ - return(0L); - } -} + // we return the JNIContext from here + JNIEXPORT jlong JNICALL Java_com_amd_aparapi_KernelRunner_initJNI(JNIEnv *jenv, jclass clazz, jobject kernelObject, + jint flags, jint numProcessors, + jint maxJTPLocalSize) { + cl_int status = CL_SUCCESS; + JNIContext* jniContext = new JNIContext(jenv, kernelObject, flags, numProcessors, maxJTPLocalSize); + if (jniContext->isValid()){ + return((jlong)jniContext); + }else{ + return(0L); + } + } -JNIEXPORT jlong JNICALL Java_com_amd_aparapi_KernelRunner_buildProgramJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jstring source) { - JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); - if (jniContext == NULL){ - return 0; - } + JNIEXPORT jlong JNICALL Java_com_amd_aparapi_KernelRunner_buildProgramJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jstring source) { + JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); + if (jniContext == NULL){ + return 0; + } - cl_int status = CL_SUCCESS; - const char *sourceChars = jenv->GetStringUTFChars(source, NULL); - CHECK(sourceChars == NULL, "jenv->GetStringUTFChars() returned null" ); - - size_t sourceSize[] = { strlen(sourceChars) }; - jniContext->program = clCreateProgramWithSource( jniContext->context, 1, &sourceChars, sourceSize, &status); - jenv->ReleaseStringUTFChars(source, sourceChars); - ASSERT_CL("clCreateProgramWithSource()"); - - status = clBuildProgram(jniContext->program, jniContext->deviceIdc, jniContext->deviceIds, NULL, NULL, NULL); - - if(status == CL_BUILD_PROGRAM_FAILURE) { - cl_int logStatus; - size_t buildLogSize = 0; - status = clGetProgramBuildInfo(jniContext->program, jniContext->deviceIds[0], - CL_PROGRAM_BUILD_LOG, buildLogSize, NULL, &buildLogSize); - ASSERT_CL("clGetProgramBuildInfo()"); - char * buildLog = new char[buildLogSize]; - CHECK(buildLog == NULL, "Failed to allocate host memory. (buildLog)"); - memset(buildLog, 0, buildLogSize); - status = clGetProgramBuildInfo (jniContext->program, jniContext->deviceIds[0], - CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); - ASSERT_CL("clGetProgramBuildInfo()"); - - fprintf(stderr, "clBuildProgram failed"); - fprintf(stderr, "\n************************************************\n"); - fprintf(stderr, "%s", buildLog); - fprintf(stderr, "\n************************************************\n\n\n"); - delete []buildLog; - return(0); - } + cl_int status = CL_SUCCESS; + const char *sourceChars = jenv->GetStringUTFChars(source, NULL); + CHECK(sourceChars == NULL, "jenv->GetStringUTFChars() returned null" ); + + size_t sourceSize[] = { strlen(sourceChars) }; + jniContext->program = clCreateProgramWithSource( jniContext->context, 1, &sourceChars, sourceSize, &status); + jenv->ReleaseStringUTFChars(source, sourceChars); + ASSERT_CL("clCreateProgramWithSource()"); + + status = clBuildProgram(jniContext->program, jniContext->deviceIdc, jniContext->deviceIds, NULL, NULL, NULL); + + if(status == CL_BUILD_PROGRAM_FAILURE) { + cl_int logStatus; + size_t buildLogSize = 0; + status = clGetProgramBuildInfo(jniContext->program, jniContext->deviceIds[0], + CL_PROGRAM_BUILD_LOG, buildLogSize, NULL, &buildLogSize); + ASSERT_CL("clGetProgramBuildInfo()"); + char * buildLog = new char[buildLogSize]; + CHECK(buildLog == NULL, "Failed to allocate host memory. (buildLog)"); + memset(buildLog, 0, buildLogSize); + status = clGetProgramBuildInfo (jniContext->program, jniContext->deviceIds[0], + CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); + ASSERT_CL("clGetProgramBuildInfo()"); + + fprintf(stderr, "clBuildProgram failed"); + fprintf(stderr, "\n************************************************\n"); + fprintf(stderr, "%s", buildLog); + fprintf(stderr, "\n************************************************\n\n\n"); + delete []buildLog; + return(0); + } - jniContext->kernel = clCreateKernel(jniContext->program, "run", &status); - ASSERT_CL("clCreateKernel()"); + jniContext->kernel = clCreateKernel(jniContext->program, "run", &status); + ASSERT_CL("clCreateKernel()"); - cl_command_queue_properties queue_props = 0; - if (jniContext->isProfilingEnabled()) { - queue_props |= CL_QUEUE_PROFILING_ENABLE; - } + cl_command_queue_properties queue_props = 0; + if (jniContext->isProfilingEnabled()) { + queue_props |= CL_QUEUE_PROFILING_ENABLE; + } - jniContext->commandQueues= new cl_command_queue[jniContext->deviceIdc]; - for (int dev=0; dev < jniContext->deviceIdc; dev++){ - jniContext->commandQueues[dev]=clCreateCommandQueue(jniContext->context, (cl_device_id)jniContext->deviceIds[dev], - queue_props, - &status); - ASSERT_CL("clCreateCommandQueue()"); - } + jniContext->commandQueues= new cl_command_queue[jniContext->deviceIdc]; + for (int dev=0; dev < jniContext->deviceIdc; dev++){ + jniContext->commandQueues[dev]=clCreateCommandQueue(jniContext->context, (cl_device_id)jniContext->deviceIds[dev], + queue_props, + &status); + ASSERT_CL("clCreateCommandQueue()"); + } - if (jniContext->isProfilingEnabled()) { - // compute profile filename + if (jniContext->isProfilingEnabled()) { + // compute profile filename #if defined (_WIN32) - jint pid = GetCurrentProcessId(); + jint pid = GetCurrentProcessId(); #else - pid_t pid = getpid(); + pid_t pid = getpid(); #endif - // indicate cpu or gpu - // timestamp - // kernel name + // indicate cpu or gpu + // timestamp + // kernel name - jclass classMethodAccess = jenv->FindClass("java/lang/Class"); - jmethodID getNameID=jenv->GetMethodID(classMethodAccess,"getName","()Ljava/lang/String;"); - jstring className = (jstring)jenv->CallObjectMethod(jniContext->kernelClass, getNameID); - const char *classNameChars = jenv->GetStringUTFChars(className, NULL); + jclass classMethodAccess = jenv->FindClass("java/lang/Class"); + jmethodID getNameID=jenv->GetMethodID(classMethodAccess,"getName","()Ljava/lang/String;"); + jstring className = (jstring)jenv->CallObjectMethod(jniContext->kernelClass, getNameID); + const char *classNameChars = jenv->GetStringUTFChars(className, NULL); #define TIME_STR_LEN 200 - char timeStr[TIME_STR_LEN]; - struct tm *tmp; - time_t t = time(NULL); - tmp = localtime(&t); - if (tmp == NULL) { - perror("localtime"); - } - //strftime(timeStr, TIME_STR_LEN, "%F.%H%M%S", tmp); %F seemed to cause a core dump - strftime(timeStr, TIME_STR_LEN, "%H%M%S", tmp); - - char* fnameStr = new char[strlen(classNameChars) + strlen(timeStr) + 128]; + char timeStr[TIME_STR_LEN]; + struct tm *tmp; + time_t t = time(NULL); + tmp = localtime(&t); + if (tmp == NULL) { + perror("localtime"); + } + //strftime(timeStr, TIME_STR_LEN, "%F.%H%M%S", tmp); %F seemed to cause a core dump + strftime(timeStr, TIME_STR_LEN, "%H%M%S", tmp); - //sprintf(fnameStr, "%s.%s.%d.%llx\n", classNameChars, timeStr, pid, jniContext); - sprintf(fnameStr, "aparapiprof.%s.%d.%016lx", timeStr, pid, (unsigned long)jniContext); - jenv->ReleaseStringUTFChars(className, classNameChars); + char* fnameStr = new char[strlen(classNameChars) + strlen(timeStr) + 128]; - FILE* profileFile = fopen(fnameStr, "w"); - if (profileFile != NULL) { - jniContext->profileFile = profileFile; - } else { - jniContext->profileFile = stderr; - fprintf(stderr, "Could not open profile data file %s, reverting to stderr\n", fnameStr); - } - delete []fnameStr; - } + //sprintf(fnameStr, "%s.%s.%d.%llx\n", classNameChars, timeStr, pid, jniContext); + sprintf(fnameStr, "aparapiprof.%s.%d.%016lx", timeStr, pid, (unsigned long)jniContext); + jenv->ReleaseStringUTFChars(className, classNameChars); - return((jlong)jniContext); -} + FILE* profileFile = fopen(fnameStr, "w"); + if (profileFile != NULL) { + jniContext->profileFile = profileFile; + } else { + jniContext->profileFile = stderr; + fprintf(stderr, "Could not open profile data file %s, reverting to stderr\n", fnameStr); + } + delete []fnameStr; + } + return((jlong)jniContext); + } -// this is called once when the arg list is first determined for this kernel -JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_setArgsJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jobjectArray argArray, jint argc) { - JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); - cl_int status = CL_SUCCESS; - if (jniContext != NULL){ - jniContext->argc = argc; - jniContext->args = new KernelArg*[jniContext->argc]; - jniContext->firstRun = true; - // Step through the array of KernelArg's to capture the type data for the Kernel's data members. - for (jint i=0; i<jniContext->argc; i++){ - KernelArg* arg = jniContext->args[i] = new KernelArg; + // this is called once when the arg list is first determined for this kernel + JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_setArgsJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jobjectArray argArray, jint argc) { + JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); + cl_int status = CL_SUCCESS; + if (jniContext != NULL){ + jniContext->argc = argc; + jniContext->args = new KernelArg*[jniContext->argc]; + jniContext->firstRun = true; + // Step through the array of KernelArg's to capture the type data for the Kernel's data members. + for (jint i=0; i<jniContext->argc; i++){ + KernelArg* arg = jniContext->args[i] = new KernelArg; - jobject argObj = jenv->GetObjectArrayElement(argArray, i); - if (argClazz == 0){ - argClazz = cacheKernelArgFields(jenv, argObj); - } - arg->javaArg = jenv->NewGlobalRef(argObj); // save a global ref to the java Arg Object - arg->type = jenv->GetIntField(argObj, typeFieldID); - jstring name = (jstring)jenv->GetObjectField(argObj, nameFieldID); - const char *nameChars = jenv->GetStringUTFChars(name, NULL); - arg->name=strdup(nameChars); - jenv->ReleaseStringUTFChars(name, nameChars); + jobject argObj = jenv->GetObjectArrayElement(argArray, i); + if (argClazz == 0){ + argClazz = cacheKernelArgFields(jenv, argObj); + } + arg->javaArg = jenv->NewGlobalRef(argObj); // save a global ref to the java Arg Object + + arg->type = jenv->GetIntField(argObj, typeFieldID); + arg->isStatic = jenv->GetBooleanField(argObj, isStaticFieldID); + jstring name = (jstring)jenv->GetObjectField(argObj, nameFieldID); + const char *nameChars = jenv->GetStringUTFChars(name, NULL); + arg->name=strdup(nameChars); + jenv->ReleaseStringUTFChars(name, nameChars); #ifdef VERBOSE_EXPLICIT - if (arg->isExplicit()){ - fprintf(stderr, "%s is explicit!\n", arg->name); - } + if (arg->isExplicit()){ + fprintf(stderr, "%s is explicit!\n", arg->name); + } #endif - if (arg->isPrimitive()) { - // for primitives, we cache the fieldID for that field in the kernel's arg object - if (arg->isFloat()){ - arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "F"); - arg->sizeInBytes = sizeof(jfloat); - }else if (arg->isInt()){ - arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "I"); - arg->sizeInBytes = sizeof(jint); - }else if (arg->isByte()){ - arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "B"); - arg->sizeInBytes = sizeof(jbyte); - }else if (arg->isBoolean()){ - arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "Z"); - arg->sizeInBytes = sizeof(jboolean); - }else if (arg->isLong()){ - arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "J"); - arg->sizeInBytes = sizeof(jlong); - }else if (arg->isDouble()){ - arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "D"); - arg->sizeInBytes = sizeof(jdouble); - } - }else{ // we will use an array - arg->value.ref.mem = (cl_mem) 0; - arg->value.ref.javaArray = 0; - arg->sizeInBytes = 0; - } - if (jniContext->isVerbose()){ - fprintf(stderr, "in setArgs arg %d %s type %08x\n", i, arg->name, arg->type); - } + if (arg->isPrimitive()) { + // for primitives, we cache the fieldID for that field in the kernel's arg object + if (arg->isFloat()){ + if (arg->isStatic){ + arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "F"); + }else{ + arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "F"); + } + arg->sizeInBytes = sizeof(jfloat); + }else if (arg->isInt()){ + if (arg->isStatic){ + arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "I"); + }else{ + arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "I"); + } + arg->sizeInBytes = sizeof(jint); + }else if (arg->isByte()){ + if (arg->isStatic){ + arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "B"); + }else{ + arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "B"); + } + arg->sizeInBytes = sizeof(jbyte); + }else if (arg->isBoolean()){ + if (arg->isStatic){ + arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "Z"); + }else{ + arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "Z"); + } + arg->sizeInBytes = sizeof(jboolean); + }else if (arg->isLong()){ + if (arg->isStatic){ + arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "J"); + }else{ + arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "J"); + } + arg->sizeInBytes = sizeof(jlong); + }else if (arg->isDouble()){ + if (arg->isStatic){ + arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "D"); + }else{ + arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "D"); + } + arg->sizeInBytes = sizeof(jdouble); + } + }else{ // we will use an array + arg->value.ref.mem = (cl_mem) 0; + arg->value.ref.javaArray = 0; + arg->sizeInBytes = 0; + } + if (jniContext->isVerbose()){ + fprintf(stderr, "in setArgs arg %d %s type %08x\n", i, arg->name, arg->type); + } - } - // we will need an executeEvent buffer for all devices - jniContext->executeEvents = new cl_event[jniContext->deviceIdc]; + //If an error occurred, return early so we report the first problem, not the last + if (jniContext->jenv->ExceptionCheck() == JNI_TRUE) { + jniContext->argc = -1; + delete[] jniContext->args; + jniContext->args = NULL; + jniContext->firstRun = true; + return (status); + } - // We will need *at most* jniContext->argc read/write events - jniContext->readEvents = new cl_event[jniContext->argc]; - if (jniContext->isProfilingEnabled()) { - jniContext->readEventArgs = new jint[jniContext->argc]; - } - jniContext->writeEvents = new cl_event[jniContext->argc]; - if (jniContext->isProfilingEnabled()) { - jniContext->writeEventArgs = new jint[jniContext->argc]; - } - } - return(status); -} + } + // we will need an executeEvent buffer for all devices + jniContext->executeEvents = new cl_event[jniContext->deviceIdc]; -JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_getLocalSizeJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jint globalSize, jint localBytesPerLocalId) { - size_t kernelMaxWorkGroupSize = 0; - size_t kernelWorkGroupSizeMultiple = 0; - cl_int status = CL_SUCCESS; - JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); - if (jniContext != NULL){ - clGetKernelWorkGroupInfo(jniContext->kernel, jniContext->deviceIds[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(kernelMaxWorkGroupSize), &kernelMaxWorkGroupSize, NULL); - ASSERT_CL("clGetKernelWorkGroupInfo()"); - // starting value depends on device type - // not sure why the CPU has a different starting size, but it does - int startLocalSize = (jniContext->deviceType == CL_DEVICE_TYPE_GPU ? kernelMaxWorkGroupSize : globalSize/(jniContext->numProcessors*4)); - - if (startLocalSize == 0) startLocalSize = 1; - if (startLocalSize > kernelMaxWorkGroupSize) startLocalSize = kernelMaxWorkGroupSize; - if (startLocalSize > globalSize) startLocalSize = globalSize; - // if the kernel uses any local memory, determine our max local memory size so we can possibly limit localsize - cl_ulong devLocalMemSize; - if (localBytesPerLocalId > 0) { - status = clGetDeviceInfo(jniContext->deviceIds[0], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &devLocalMemSize, NULL); - cl_uint localSizeLimitFromLocalMem = devLocalMemSize/localBytesPerLocalId; - if (startLocalSize > localSizeLimitFromLocalMem) startLocalSize = localSizeLimitFromLocalMem; - if (jniContext->isVerbose()){ - fprintf(stderr, "localBytesPerLocalId=%d, device localMemMax=%d, localSizeLimitFromLocalMem=%d\n", - localBytesPerLocalId, (cl_uint) devLocalMemSize, localSizeLimitFromLocalMem); + // We will need *at most* jniContext->argc read/write events + jniContext->readEvents = new cl_event[jniContext->argc]; + if (jniContext->isProfilingEnabled()) { + jniContext->readEventArgs = new jint[jniContext->argc]; + } + jniContext->writeEvents = new cl_event[jniContext->argc]; + if (jniContext->isProfilingEnabled()) { + jniContext->writeEventArgs = new jint[jniContext->argc]; + } } - + return(status); } - // then iterate down until we find a localSize that divides globalSize equally - for (int localSize = startLocalSize; localSize>0; localSize--) { - if (globalSize % localSize == 0) { - if (jniContext->isVerbose()){ - fprintf(stderr, "for globalSize=%d, stepping localSize from %d, returning localSize=%d\n", globalSize, startLocalSize, localSize); + JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_getLocalSizeJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jint globalSize, jint localBytesPerLocalId) { + size_t kernelMaxWorkGroupSize = 0; + size_t kernelWorkGroupSizeMultiple = 0; + cl_int status = CL_SUCCESS; + JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); + if (jniContext != NULL){ + clGetKernelWorkGroupInfo(jniContext->kernel, jniContext->deviceIds[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(kernelMaxWorkGroupSize), &kernelMaxWorkGroupSize, NULL); + ASSERT_CL("clGetKernelWorkGroupInfo()"); + // starting value depends on device type + // not sure why the CPU has a different starting size, but it does + int startLocalSize = (jniContext->deviceType == CL_DEVICE_TYPE_GPU ? kernelMaxWorkGroupSize : globalSize/(jniContext->numProcessors*4)); + + if (startLocalSize == 0) startLocalSize = 1; + if (startLocalSize > kernelMaxWorkGroupSize) startLocalSize = kernelMaxWorkGroupSize; + if (startLocalSize > globalSize) startLocalSize = globalSize; + // if the kernel uses any local memory, determine our max local memory size so we can possibly limit localsize + cl_ulong devLocalMemSize; + if (localBytesPerLocalId > 0) { + status = clGetDeviceInfo(jniContext->deviceIds[0], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &devLocalMemSize, NULL); + cl_uint localSizeLimitFromLocalMem = devLocalMemSize/localBytesPerLocalId; + if (startLocalSize > localSizeLimitFromLocalMem) startLocalSize = localSizeLimitFromLocalMem; + if (jniContext->isVerbose()){ + fprintf(stderr, "localBytesPerLocalId=%d, device localMemMax=%d, localSizeLimitFromLocalMem=%d\n", + localBytesPerLocalId, (cl_uint) devLocalMemSize, localSizeLimitFromLocalMem); + } + + } + + // then iterate down until we find a localSize that divides globalSize equally + for (int localSize = startLocalSize; localSize>0; localSize--) { + if (globalSize % localSize == 0) { + if (jniContext->isVerbose()){ + fprintf(stderr, "for globalSize=%d, stepping localSize from %d, returning localSize=%d\n", globalSize, startLocalSize, localSize); + } + return localSize; + } } - return localSize; } + // should never get this far + return 0; } - } - // should never get this far - return 0; -} -JNIEXPORT jstring JNICALL Java_com_amd_aparapi_KernelRunner_getExtensions(JNIEnv *jenv, jobject jobj, jlong jniContextHandle) { - jstring jextensions = NULL; - JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); - if (jniContext != NULL){ - size_t retvalsize = 0; - cl_int status = CL_SUCCESS; - status = clGetDeviceInfo(jniContext->deviceIds[0], CL_DEVICE_EXTENSIONS, 0, NULL, &retvalsize); - ASSERT_CL("clGetDeviceInfo()"); - char* extensions = new char[retvalsize]; - clGetDeviceInfo(jniContext->deviceIds[0], CL_DEVICE_EXTENSIONS, retvalsize, extensions, NULL); - jextensions = jenv->NewStringUTF(extensions); - delete [] extensions; - } - return jextensions; -} + JNIEXPORT jstring JNICALL Java_com_amd_aparapi_KernelRunner_getExtensions(JNIEnv *jenv, jobject jobj, jlong jniContextHandle) { + jstring jextensions = NULL; + JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); + if (jniContext != NULL){ + size_t retvalsize = 0; + cl_int status = CL_SUCCESS; + status = clGetDeviceInfo(jniContext->deviceIds[0], CL_DEVICE_EXTENSIONS, 0, NULL, &retvalsize); + ASSERT_CL("clGetDeviceInfo()"); + char* extensions = new char[retvalsize]; + clGetDeviceInfo(jniContext->deviceIds[0], CL_DEVICE_EXTENSIONS, retvalsize, extensions, NULL); + jextensions = jenv->NewStringUTF(extensions); + delete [] extensions; + } + return jextensions; + } -// Called as a result of Kernel.get(someArray) -JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_getJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jobject buffer) { - cl_int status = CL_SUCCESS; - JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); - if (jniContext != NULL){ - jboolean foundArg = false; - for (jint i=0; i<jniContext->argc; i++){ - KernelArg *arg= jniContext->args[i]; - if (arg->isArray()){ - jboolean isSame = jenv->IsSameObject(buffer, arg->value.ref.javaArray); - // only do this if the array that we are passed is indeed an arg we are tracking - if (isSame){ - foundArg = true; - //fprintf(stderr, "get of %s\n", arg->name); + // Called as a result of Kernel.get(someArray) + JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_getJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jobject buffer) { + cl_int status = CL_SUCCESS; + JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle); + if (jniContext != NULL){ + jboolean foundArg = false; + for (jint i=0; i<jniContext->argc; i++){ + KernelArg *arg= jniContext->args[i]; + if (arg->isArray()){ + jboolean isSame = jenv->IsSameObject(buffer, arg->value.ref.javaArray); + // only do this if the array that we are passed is indeed an arg we are tracking + if (isSame){ + foundArg = true; + //fprintf(stderr, "get of %s\n", arg->name); #ifdef VERBOSE_EXPLICIT - fprintf(stderr, "explicitly reading buffer %d %s\n", i, arg->name); + fprintf(stderr, "explicitly reading buffer %d %s\n", i, arg->name); #endif - arg->pin(jenv); + arg->pin(jenv); - status = clEnqueueReadBuffer(jniContext->commandQueues[0], arg->value.ref.mem, CL_FALSE, 0, - arg->sizeInBytes,arg->value.ref.addr , 0, NULL, &jniContext->readEvents[0]); - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clEnqueueReadBuffer()"); - return status; - } - status = clWaitForEvents(1, jniContext->readEvents); - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clWaitForEvents"); - return status; + status = clEnqueueReadBuffer(jniContext->commandQueues[0], arg->value.ref.mem, CL_FALSE, 0, + arg->sizeInBytes,arg->value.ref.addr , 0, NULL, &jniContext->readEvents[0]); + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clEnqueueReadBuffer()"); + return status; + } + status = clWaitForEvents(1, jniContext->readEvents); + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clWaitForEvents"); + return status; + } + clReleaseEvent(jniContext->readEvents[0]); + if (status != CL_SUCCESS) { + PRINT_CL_ERR(status, "clReleaseEvent() read event"); + return status; + } + // since this is an explicit buffer get, we expect the buffer to have changed so we commit + arg->unpinCommit(jenv); + } } - clReleaseEvent(jniContext->readEvents[0]); - if (status != CL_SUCCESS) { - PRINT_CL_ERR(status, "clReleaseEvent() read event"); - return status; + } + if (!foundArg){ + if (jniContext->isVerbose()){ + fprintf(stderr, "attempt to request to get a buffer that does not appear to be referenced from kernel\n"); } - // since this is an explicit buffer get, we expect the buffer to have changed so we commit - arg->unpinCommit(jenv); } } + return 0; } - if (!foundArg){ - if (jniContext->isVerbose()){ - fprintf(stderr, "attempt to request to get a buffer that does not appear to be referenced from kernel\n"); - } - } - } - return 0; -} - diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/KernelRunner.java b/com.amd.aparapi/src/java/com/amd/aparapi/KernelRunner.java index 1d9ac0a48989d43fb39e839926aeb00323031902..778171b03390781a7f097c39967fda1a66f52d2f 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/KernelRunner.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/KernelRunner.java @@ -39,6 +39,7 @@ package com.amd.aparapi; import java.lang.reflect.Array; import java.lang.reflect.Field; +import java.lang.reflect.Modifier; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.util.HashSet; @@ -404,6 +405,11 @@ class KernelRunner{ */ @UsedByJNICode public int type; + /** + * True if the field is static + */ + @UsedByJNICode public boolean isStatic; + /** * Name of the field */ @@ -1222,6 +1228,7 @@ class KernelRunner{ args[i] = new KernelArg(); args[i].name = field.getName(); args[i].field = field; + args[i].isStatic = (field.getModifiers() & Modifier.STATIC) == Modifier.STATIC; Class<?> type = field.getType(); if (type.isArray()) {