From 960aa78c4bc2ee62337aee91fbd516cead4d14e1 Mon Sep 17 00:00:00 2001 From: Gary Frost <frost.gary@gmail.com> Date: Tue, 18 Jun 2013 18:17:19 +0000 Subject: [PATCH] Added Paul Miners patch to speedup JTP and to fix address printfs for windows --- CREDITS.txt | 6 ++-- .../src/cpp/runKernel/Aparapi.cpp | 29 ++++++++++--------- .../aparapi/internal/kernel/KernelRunner.java | 16 +++++----- 3 files changed, 27 insertions(+), 24 deletions(-) diff --git a/CREDITS.txt b/CREDITS.txt index c6e603b4..8c36a4c5 100644 --- a/CREDITS.txt +++ b/CREDITS.txt @@ -2,7 +2,7 @@ We want to correctly attribute all contributions and will maintain this CREDITS. We discourage including attribution as comments in the code, instead we intend to let the history feature of SVN be the primary method for tracking attributions. -Aparapi Contributors. Thanks to all. +Aparapi Contributors. Thanks to all. If I have missed your contribution, please let me know. Witold Bolt provided patch for issue #5. Added and tested Mac OS support. Oct 12th 2011 Kenneth Skovhede provided patch for issue #11 (Correcting conversion of Java Nan and Infinity values to OpenCL equiv). Oct 28th 2011 @@ -12,4 +12,6 @@ Ryan LaMothe provided patch for issue #22 (Integrate FindBugs into the Ant build Ryan LaMothe provided patch for issue #37 (Add Kernel and JNI build support for Applets and JNLP JWS) Feb 18th 2012. Oliver Coleman provided patch for issue #64 (Support explicit boolean put and get) Aug 15th 2012 Steven Libby provided patch for #6 (Allow finer control over fallback mode selection) Aug 21th 2012 - +Steven Libby and Ryan LaMothe for #10 (Support for OpenMP, major refactoring cleanup and support for multidim arrays) March 28th 2013 + Many thanks for Steven and Ryan's work here. This was a huge tidy up and refactoring effort. 
+Paul Miner issue #61 and #115 (JTP Speed up and fixes to explicit puts) June 13th 2013 diff --git a/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp b/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp index 277cbe94..de56ac98 100644 --- a/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp +++ b/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp @@ -349,8 +349,8 @@ void updateArray(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argP if (mask & CL_MEM_READ_ONLY) strcat(arg->arrayBuffer->memSpec,"|CL_MEM_READ_ONLY"); if (mask & CL_MEM_WRITE_ONLY) strcat(arg->arrayBuffer->memSpec,"|CL_MEM_WRITE_ONLY"); - fprintf(stderr, "%s %d clCreateBuffer(context, %s, size=%08lx bytes, address=%08lx, &status)\n", arg->name, - argIdx, arg->arrayBuffer->memSpec, (unsigned long)arg->arrayBuffer->lengthInBytes, (unsigned long)arg->arrayBuffer->addr); + fprintf(stderr, "%s %d clCreateBuffer(context, %s, size=%08lx bytes, address=%p, &status)\n", arg->name, + argIdx, arg->arrayBuffer->memSpec, (unsigned long)arg->arrayBuffer->lengthInBytes, arg->arrayBuffer->addr); } arg->arrayBuffer->mem = clCreateBuffer(jniContext->context, arg->arrayBuffer->memMask, @@ -459,11 +459,12 @@ void processArray(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& arg arg->pin(jenv); if (config->isVerbose()) { - fprintf(stderr, "runKernel: arrayOrBuf ref %p, oldAddr=%p, newAddr=%p, ref.mem=%p\n", + fprintf(stderr, "runKernel: arrayOrBuf ref %p, oldAddr=%p, newAddr=%p, ref.mem=%p isCopy=%s\n", arg->arrayBuffer->javaArray, prevAddr, arg->arrayBuffer->addr, - arg->arrayBuffer->mem); + arg->arrayBuffer->mem, + arg->arrayBuffer->isCopy ? 
"true" : "false"); fprintf(stderr, "at memory addr %p, contents: ", arg->arrayBuffer->addr); unsigned char *pb = (unsigned char *) arg->arrayBuffer->addr; for (int k=0; k<8; k++) { @@ -717,7 +718,7 @@ int processArgs(JNIEnv* jenv, JNIContext* jniContext, int& argPos, int& writeEve if (!arg->isPrimitive() && !arg->isLocal()) { processObject(jenv, jniContext, arg, argPos, argIdx); - if (arg->needToEnqueueWrite() && !arg->isConstant()) { + if (arg->needToEnqueueWrite() && (!arg->isConstant() || arg->isExplicitWrite())) { if (config->isVerbose()) { fprintf(stderr, "%swriting %s%sbuffer argIndex=%d argPos=%d %s\n", (arg->isExplicit() ? "explicitly " : ""), @@ -810,9 +811,9 @@ void enqueueKernel(JNIContext* jniContext, Range& range, int passes, int argPos, // we don't block but and we populate the executeEvents if (passid == 0) { - int writeCount = writeEventCount; + writeCount = writeEventCount; if(writeEventCount > 0) { - cl_event* writeEvents = jniContext->writeEvents; + writeEvents = jniContext->writeEvents; } // we are in some passid > 0 pass @@ -821,7 +822,7 @@ void enqueueKernel(JNIContext* jniContext, Range& range, int passes, int argPos, // we block and do supply executeEvents } else { //fprintf(stderr, "setting passid to %d of %d not first not last\n", passid, passes); - + status = clWaitForEvents(1, &jniContext->executeEvents[0]); if (status != CL_SUCCESS) throw CLException(status, "clWaitForEvents() execute event"); @@ -1005,7 +1006,7 @@ void checkEvents(JNIEnv* jenv, JNIContext* jniContext, int writeEventCount) { status = clGetEventInfo(jniContext->executeEvents[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &executeStatus, NULL); if (status != CL_SUCCESS) throw CLException(status, "clGetEventInfo() execute event"); - if (executeStatus != CL_SUCCESS) throw CLException(executeStatus, "Execution status of execute event"); + if (executeStatus != CL_COMPLETE) throw CLException(executeStatus, "Execution status of execute event"); status = 
clReleaseEvent(jniContext->executeEvents[0]); if (status != CL_SUCCESS) throw CLException(status, "clReleaseEvent() read event"); @@ -1148,7 +1149,7 @@ void writeProfile(JNIEnv* jenv, JNIContext* jniContext) { jint pid = getProcess(); //sprintf(fnameStr, "%s.%s.%d.%llx\n", classNameChars, timeStr, pid, jniContext); - sprintf(fnameStr, "aparapiprof.%s.%d.%016lx", timeStr, pid, (unsigned long)jniContext); + sprintf(fnameStr, "aparapiprof.%s.%d.%p", timeStr, pid, jniContext); jenv->ReleaseStringUTFChars(className, classNameChars); FILE* profileFile = fopen(fnameStr, "w"); @@ -1353,8 +1354,8 @@ JNI_JAVA(jint, KernelRunnerJNI, getJNI) arg->arrayBuffer->addr , 0, NULL, &jniContext->readEvents[0]); if (config->isVerbose()){ - fprintf(stderr, "explicitly read %s ptr=%lx len=%d\n", - arg->name, (unsigned long)arg->arrayBuffer->addr, + fprintf(stderr, "explicitly read %s ptr=%p len=%d\n", + arg->name, arg->arrayBuffer->addr, arg->arrayBuffer->lengthInBytes ); } if (status != CL_SUCCESS) throw CLException(status, "clEnqueueReadBuffer()"); @@ -1389,8 +1390,8 @@ JNI_JAVA(jint, KernelRunnerJNI, getJNI) arg->aparapiBuffer->data, 0, NULL, &jniContext->readEvents[0]); if (config->isVerbose()){ - fprintf(stderr, "explicitly read %s ptr=%lx len=%d\n", - arg->name, (unsigned long)arg->aparapiBuffer->data, + fprintf(stderr, "explicitly read %s ptr=%p len=%d\n", + arg->name, arg->aparapiBuffer->data, arg->aparapiBuffer->lengthInBytes ); } if (status != CL_SUCCESS) throw CLException(status, "clEnqueueReadBuffer()"); diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java index b036829c..26e46c4a 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java @@ -48,13 +48,17 @@ import java.util.Set; import java.util.StringTokenizer; import 
java.util.concurrent.BrokenBarrierException; import java.util.concurrent.CyclicBarrier; +import java.util.concurrent.Executors; +import java.util.concurrent.ExecutorService; import java.util.logging.Level; import java.util.logging.Logger; import com.amd.aparapi.Config; import com.amd.aparapi.Kernel; +import com.amd.aparapi.Kernel.Constant; import com.amd.aparapi.Kernel.EXECUTION_MODE; import com.amd.aparapi.Kernel.KernelState; +import com.amd.aparapi.Kernel.Local; import com.amd.aparapi.ProfileInfo; import com.amd.aparapi.Range; import com.amd.aparapi.device.Device; @@ -68,8 +72,6 @@ import com.amd.aparapi.internal.model.Entrypoint; import com.amd.aparapi.internal.util.UnsafeWrapper; import com.amd.aparapi.internal.writer.KernelWriter; import com.amd.aparapi.opencl.OpenCL; -import com.amd.aparapi.opencl.OpenCL.Constant; -import com.amd.aparapi.opencl.OpenCL.Local; /** * The class is responsible for executing <code>Kernel</code> implementations. <br/> @@ -98,7 +100,8 @@ public class KernelRunner extends KernelRunnerJNI{ private Entrypoint entryPoint; private int argc; - + + private final ExecutorService threadPool = Executors.newCachedThreadPool(); /** * Create a KernelRunner for a specific Kernel instance. * @@ -117,6 +120,7 @@ public class KernelRunner extends KernelRunnerJNI{ if (kernel.getExecutionMode().isOpenCL()) { disposeJNI(jniContextHandle); } + threadPool.shutdownNow(); } private Set<String> capabilitiesSet; @@ -287,7 +291,6 @@ public class KernelRunner extends KernelRunnerJNI{ } else { final int threads = _range.getLocalSize(0) * _range.getLocalSize(1) * _range.getLocalSize(2); final int globalGroups = _range.getNumGroups(0) * _range.getNumGroups(1) * _range.getNumGroups(2); - final Thread threadArray[] = new Thread[threads]; /** * This joinBarrier is the barrier that we provide for the kernel threads to rendezvous with the current dispatch thread. 
* So this barrier is threadCount+1 wide (the +1 is for the dispatch thread) @@ -354,7 +357,7 @@ public class KernelRunner extends KernelRunnerJNI{ kernelState.setLocalBarrier(localBarrier); kernelState.setPassId(passId); - threadArray[threadId] = new Thread(new Runnable(){ + threadPool.submit(new Runnable(){ @Override public void run() { for (int globalGroupId = 0; globalGroupId < globalGroups; globalGroupId++) { @@ -478,9 +481,6 @@ public class KernelRunner extends KernelRunnerJNI{ await(joinBarrier); // This thread will rendezvous with dispatch thread here. This is effectively a join. } }); - - threadArray[threadId].setName("aparapi-" + threadId + "/" + threads); - threadArray[threadId].start(); } await(joinBarrier); // This dispatch thread waits for all worker threads here. -- GitLab