diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java new file mode 100644 index 0000000000000000000000000000000000000000..d99809e9c3353db214658321b92e41b731dea54a --- /dev/null +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java @@ -0,0 +1,1775 @@ +/* +Copyright (c) 2010-2011, Advanced Micro Devices, Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the +following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list of conditions and the following +disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following +disclaimer in the documentation and/or other materials provided with the distribution. + +Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export +laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through +774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, +you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of +Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration +Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1, +E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups +D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject +to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774 +of EAR). For the most current Country Group listings, or for additional information about the EAR or your obligations +under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. + +*/ +package com.amd.aparapi.internal.kernel; + +import com.amd.aparapi.*; +import com.amd.aparapi.Kernel.Constant; +import com.amd.aparapi.Kernel.*; +import com.amd.aparapi.device.*; +import com.amd.aparapi.internal.annotation.*; +import com.amd.aparapi.internal.exception.*; +import com.amd.aparapi.internal.instruction.InstructionSet.*; +import com.amd.aparapi.internal.jni.*; +import com.amd.aparapi.internal.model.*; +import com.amd.aparapi.internal.util.*; +import com.amd.aparapi.internal.writer.*; +import com.amd.aparapi.opencl.*; + +import java.lang.reflect.*; +import java.nio.*; +import java.util.*; +import java.util.concurrent.*; +import java.util.concurrent.ForkJoinPool.*; +import java.util.logging.*; + +/** + * The class is responsible for executing <code>Kernel</code> implementations. <br/> + * + * The <code>KernelRunner</code> is the real workhorse for Aparapi. Each <code>Kernel</code> instance creates a single + * <code>KernelRunner</code> to encapsulate state and to help coordinate interactions between the <code>Kernel</code> + * and it's execution logic.<br/> + * + * The <code>KernelRunner</code> is created <i>lazily</i> as a result of calling <code>Kernel.execute()</code>. A this + * time the <code>ExecutionMode</code> is consulted to determine the default requested mode. This will dictate how + * the <code>KernelRunner</code> will attempt to execute the <code>Kernel</code> + * + * @see com.amd.aparapi.Kernel#execute(int _globalSize) + * + * @author gfrost + * + */ +public class KernelRunner extends KernelRunnerJNI{ + + public static boolean BINARY_CACHING_DISABLED = false; + + private static final int MINIMUM_ARRAY_SIZE = 1; + + /** @see #getCurrentPass() */ + @UsedByJNICode public static final int PASS_ID_PREPARING_EXECUTION = -2; + /** @see #getCurrentPass() */ + @UsedByJNICode public static final int PASS_ID_COMPLETED_EXECUTION = -1; + @UsedByJNICode public static final int CANCEL_STATUS_FALSE = 0; + @UsedByJNICode public static final int CANCEL_STATUS_TRUE = 1; + private static final String CODE_GEN_ERROR_MARKER = CodeGenException.class.getName(); + + private static Logger logger = Logger.getLogger(Config.getLoggerName()); + + private long jniContextHandle = 0; + + private final Kernel kernel; + + private Entrypoint entryPoint; + + private int argc; + + // may be read by a thread other than the control thread, hence volatile + private volatile boolean executing; + + // may be read by a thread other than the control thread, hence volatile + private volatile int passId = PASS_ID_PREPARING_EXECUTION; + + /** + * A direct ByteBuffer used for asynchronous intercommunication between java and JNI C code. + * + * <p> + * At present this is a 4 byte buffer to be interpreted as an int[1], used for passing from java to C a single integer interpreted as a cancellation indicator. + */ + private final ByteBuffer inBufferRemote; + private final IntBuffer inBufferRemoteInt; + + /** A direct ByteBuffer used for asynchronous intercommunication between java and JNI C code. + * <p> + * At present this is a 4 byte buffer to be interpreted as an int[1], used for passing from C to java a single integer interpreted as a + * the current pass id. + */ + private final ByteBuffer outBufferRemote; + private final IntBuffer outBufferRemoteInt; + + private boolean isFallBack = false; // If isFallBack, rebuild the kernel (necessary?) + + private static final ForkJoinWorkerThreadFactory lowPriorityThreadFactory = new ForkJoinWorkerThreadFactory(){ + @Override public ForkJoinWorkerThread newThread(ForkJoinPool pool) { + ForkJoinWorkerThread newThread = ForkJoinPool.defaultForkJoinWorkerThreadFactory.newThread(pool); + newThread.setPriority(Thread.MIN_PRIORITY); + return newThread; + } + }; + + private static final ForkJoinPool threadPool = new ForkJoinPool(Runtime.getRuntime().availableProcessors(), + lowPriorityThreadFactory, null, false); + private static HashMap<Class<? extends Kernel>, String> openCLCache = new HashMap<>(); + private static LinkedHashSet<String> seenBinaryKeys = new LinkedHashSet<>(); + + /** + * Create a KernelRunner for a specific Kernel instance. + * + * @param _kernel + */ + public KernelRunner(Kernel _kernel) { + kernel = _kernel; + + inBufferRemote = ByteBuffer.allocateDirect(4); + outBufferRemote = ByteBuffer.allocateDirect(4); + + inBufferRemote.order(ByteOrder.nativeOrder()); + outBufferRemote.order(ByteOrder.nativeOrder()); + + inBufferRemoteInt = inBufferRemote.asIntBuffer(); + outBufferRemoteInt = outBufferRemote.asIntBuffer(); + + KernelManager.instance(); // ensures static initialization of KernelManager + } + + /** + * @see Kernel#cleanUpArrays(). + */ + public void cleanUpArrays() { + if (args != null && kernel.isRunningCL()) { + for (KernelArg arg : args) { + if ((arg.getType() & KernelRunnerJNI.ARG_ARRAY) != 0) { + Field field = arg.getField(); + if (field != null && field.getType().isArray() && !Modifier.isFinal(field.getModifiers())) { + field.setAccessible(true); + Class<?> componentType = field.getType().getComponentType(); + Object newValue = Array.newInstance(componentType, MINIMUM_ARRAY_SIZE); + try { + field.set(kernel, newValue); + } + catch (IllegalAccessException e) { + throw new RuntimeException(e); + } + } + } + } + kernel.execute(0); + } else if (kernel.isRunningCL()) { + logger.log(Level.SEVERE, "KernelRunner#cleanUpArrays() could not execute as no args available (Kernel has not been executed?)"); + } + } + + /** + * <code>Kernel.dispose()</code> delegates to <code>KernelRunner.dispose()</code> which delegates to <code>disposeJNI()</code> to actually close JNI data structures.<br/> + * + * @see KernelRunnerJNI#disposeJNI(long) + */ + public synchronized void dispose() { + if (kernel.isRunningCL()) { + disposeJNI(jniContextHandle); + seenBinaryKeys.clear(); + } + // We are using a shared pool, so there's no need no shutdown it when kernel is disposed + // threadPool.shutdownNow(); + } + + private Set<String> capabilitiesSet; + + boolean hasFP64Support() { + if (capabilitiesSet == null) { + throw new IllegalStateException("Capabilities queried before they were initialized"); + } + return (capabilitiesSet.contains(OpenCL.CL_KHR_FP64)); + } + + boolean hasSelectFPRoundingModeSupport() { + if (capabilitiesSet == null) { + throw new IllegalStateException("Capabilities queried before they were initialized"); + } + return capabilitiesSet.contains(OpenCL.CL_KHR_SELECT_FPROUNDING_MODE); + } + + boolean hasGlobalInt32BaseAtomicsSupport() { + if (capabilitiesSet == null) { + throw new IllegalStateException("Capabilities queried before they were initialized"); + } + return capabilitiesSet.contains(OpenCL.CL_KHR_GLOBAL_INT32_BASE_ATOMICS); + } + + boolean hasGlobalInt32ExtendedAtomicsSupport() { + if (capabilitiesSet == null) { + throw new IllegalStateException("Capabilities queried before they were initialized"); + } + return capabilitiesSet.contains(OpenCL.CL_KHR_GLOBAL_INT32_EXTENDED_ATOMICS); + } + + boolean hasLocalInt32BaseAtomicsSupport() { + if (capabilitiesSet == null) { + throw new IllegalStateException("Capabilities queried before they were initialized"); + } + return capabilitiesSet.contains(OpenCL.CL_KHR_LOCAL_INT32_BASE_ATOMICS); + } + + boolean hasLocalInt32ExtendedAtomicsSupport() { + if (capabilitiesSet == null) { + throw new IllegalStateException("Capabilities queried before they were initialized"); + } + return capabilitiesSet.contains(OpenCL.CL_KHR_LOCAL_INT32_EXTENDED_ATOMICS); + } + + boolean hasInt64BaseAtomicsSupport() { + if (capabilitiesSet == null) { + throw new IllegalStateException("Capabilities queried before they were initialized"); + } + return capabilitiesSet.contains(OpenCL.CL_KHR_INT64_BASE_ATOMICS); + } + + boolean hasInt64ExtendedAtomicsSupport() { + if (capabilitiesSet == null) { + throw new IllegalStateException("Capabilities queried before they were initialized"); + } + return capabilitiesSet.contains(OpenCL.CL_KHR_INT64_EXTENDED_ATOMICS); + } + + boolean has3DImageWritesSupport() { + if (capabilitiesSet == null) { + throw new IllegalStateException("Capabilities queried before they were initialized"); + } + return capabilitiesSet.contains(OpenCL.CL_KHR_3D_IMAGE_WRITES); + } + + boolean hasByteAddressableStoreSupport() { + if (capabilitiesSet == null) { + throw new IllegalStateException("Capabilities queried before they were initialized"); + } + return capabilitiesSet.contains(OpenCL.CL_KHR_BYTE_ADDRESSABLE_SUPPORT); + } + + boolean hasFP16Support() { + if (capabilitiesSet == null) { + throw new IllegalStateException("Capabilities queried before they were initialized"); + } + return capabilitiesSet.contains(OpenCL.CL_KHR_FP16); + } + + boolean hasGLSharingSupport() { + if (capabilitiesSet == null) { + throw new IllegalStateException("Capabilities queried before they were initialized"); + } + return capabilitiesSet.contains(OpenCL.CL_KHR_GL_SHARING); + } + + private static final class FJSafeCyclicBarrier extends CyclicBarrier{ + FJSafeCyclicBarrier(final int threads) { + super(threads); + } + + @Override public int await() throws InterruptedException, BrokenBarrierException { + class Awaiter implements ManagedBlocker{ + private int value; + + private boolean released; + + @Override public boolean block() throws InterruptedException { + try { + value = superAwait(); + released = true; + return true; + } catch (final BrokenBarrierException e) { + throw new RuntimeException(e); + } + } + + @Override public boolean isReleasable() { + return released; + } + + int getValue() { + return value; + } + } + final Awaiter awaiter = new Awaiter(); + ForkJoinPool.managedBlock(awaiter); + return awaiter.getValue(); + } + + int superAwait() throws InterruptedException, BrokenBarrierException { + return super.await(); + } + } + + // @FunctionalInterface + private interface ThreadIdSetter{ + void set(KernelState kernelState, int globalGroupId, int threadId); + } + + /** + * Execute using a Java thread pool, or sequentially, or using an alternative algorithm, usually as a result of failing to compile or execute OpenCL + */ + @SuppressWarnings("deprecation") + protected void executeJava(ExecutionSettings _settings, Device device) { + if (logger.isLoggable(Level.FINE)) { + logger.fine("executeJava: range = " + _settings.range + ", device = " + device); + } + boolean legacySequentialMode = kernel.getExecutionMode().equals(Kernel.EXECUTION_MODE.SEQ); + + passId = PASS_ID_PREPARING_EXECUTION; + _settings.profile.onEvent(ProfilingEvent.PREPARE_EXECUTE); + + try { + if (device == JavaDevice.ALTERNATIVE_ALGORITHM) { + if (kernel.hasFallbackAlgorithm()) { + for (passId = 0; passId < _settings.passes; ++passId) { + kernel.executeFallbackAlgorithm(_settings.range, passId); + } + } else { + boolean silently = true; // not having an alternative algorithm is the normal state, and does not need reporting + fallBackToNextDevice(_settings, (Exception) null, silently); + } + } else { + final int localSize0 = _settings.range.getLocalSize(0); + final int localSize1 = _settings.range.getLocalSize(1); + final int localSize2 = _settings.range.getLocalSize(2); + final int globalSize1 = _settings.range.getGlobalSize(1); + if (legacySequentialMode || device == JavaDevice.SEQUENTIAL) { + /** + * SEQ mode is useful for testing trivial logic, but kernels which use SEQ mode cannot be used if the + * product of localSize(0..3) is >1. So we can use multi-dim ranges but only if the local size is 1 in all dimensions. + * + * As a result of this barrier is only ever 1 work item wide and probably should be turned into a no-op. + * + * So we need to check if the range is valid here. If not we have no choice but to punt. + */ + if ((localSize0 * localSize1 * localSize2) > 1) { + throw new IllegalStateException("Can't run range with group size >1 sequentially. Barriers would deadlock!"); + } + + final Kernel kernelClone = kernel.clone(); + final KernelState kernelState = kernelClone.getKernelState(); + + kernelState.setRange(_settings.range); + kernelState.setGroupId(0, 0); + kernelState.setGroupId(1, 0); + kernelState.setGroupId(2, 0); + kernelState.setLocalId(0, 0); + kernelState.setLocalId(1, 0); + kernelState.setLocalId(2, 0); + kernelState.setLocalBarrier(new FJSafeCyclicBarrier(1)); + + for (passId = 0; passId < _settings.passes; passId++) { + if (getCancelState() == CANCEL_STATUS_TRUE) { + break; + } + kernelState.setPassId(passId); + + if (_settings.range.getDims() == 1) { + for (int id = 0; id < _settings.range.getGlobalSize(0); id++) { + kernelState.setGlobalId(0, id); + kernelClone.run(); + } + } + else if (_settings.range.getDims() == 2) { + for (int x = 0; x < _settings.range.getGlobalSize(0); x++) { + kernelState.setGlobalId(0, x); + + for (int y = 0; y < globalSize1; y++) { + kernelState.setGlobalId(1, y); + kernelClone.run(); + } + } + } + else if (_settings.range.getDims() == 3) { + for (int x = 0; x < _settings.range.getGlobalSize(0); x++) { + kernelState.setGlobalId(0, x); + + for (int y = 0; y < globalSize1; y++) { + kernelState.setGlobalId(1, y); + + for (int z = 0; z < _settings.range.getGlobalSize(2); z++) { + kernelState.setGlobalId(2, z); + kernelClone.run(); + } + + kernelClone.run(); + } + } + } + } + passId = PASS_ID_COMPLETED_EXECUTION; + } + else { + if (device != JavaDevice.THREAD_POOL && kernel.getExecutionMode() != Kernel.EXECUTION_MODE.JTP) { + throw new AssertionError("unexpected JavaDevice or EXECUTION_MODE"); + } + final int threads = localSize0 * localSize1 * localSize2; + final int numGroups0 = _settings.range.getNumGroups(0); + final int numGroups1 = _settings.range.getNumGroups(1); + final int globalGroups = numGroups0 * numGroups1 * _settings.range.getNumGroups(2); + /** + * This joinBarrier is the barrier that we provide for the kernel threads to rendezvous with the current dispatch thread. + * So this barrier is threadCount+1 wide (the +1 is for the dispatch thread) + */ + final CyclicBarrier joinBarrier = new FJSafeCyclicBarrier(threads + 1); + + /** + * This localBarrier is only ever used by the kernels. If the kernel does not use the barrier the threads + * can get out of sync, we promised nothing in JTP mode. + * + * As with OpenCL all threads within a group must wait at the barrier or none. It is a user error (possible deadlock!) + * if the barrier is in a conditional that is only executed by some of the threads within a group. + * + * Kernel developer must understand this. + * + * This barrier is threadCount wide. We never hit the barrier from the dispatch thread. + */ + final CyclicBarrier localBarrier = new FJSafeCyclicBarrier(threads); + + final ThreadIdSetter threadIdSetter; + + if (_settings.range.getDims() == 1) { + threadIdSetter = new ThreadIdSetter() { + @Override + public void set(KernelState kernelState, int globalGroupId, int threadId) { + // (kernelState, globalGroupId, threadId) ->{ + kernelState.setLocalId(0, (threadId % localSize0)); + kernelState.setGlobalId(0, (threadId + (globalGroupId * threads))); + kernelState.setGroupId(0, globalGroupId); + } + }; + } + else if (_settings.range.getDims() == 2) { + + /** + * Consider a 12x4 grid of 4*2 local groups + * <pre> + * threads = 4*2 = 8 + * localWidth=4 + * localHeight=2 + * globalWidth=12 + * globalHeight=4 + * + * 00 01 02 03 | 04 05 06 07 | 08 09 10 11 + * 12 13 14 15 | 16 17 18 19 | 20 21 22 23 + * ------------+-------------+------------ + * 24 25 26 27 | 28 29 30 31 | 32 33 34 35 + * 36 37 38 39 | 40 41 42 43 | 44 45 46 47 + * + * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 threadIds : [0..7]*6 + * 04 05 06 07 | 04 05 06 07 | 04 05 06 07 + * ------------+-------------+------------ + * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 + * 04 05 06 07 | 04 05 06 07 | 04 05 06 07 + * + * 00 00 00 00 | 01 01 01 01 | 02 02 02 02 groupId[0] : 0..6 + * 00 00 00 00 | 01 01 01 01 | 02 02 02 02 + * ------------+-------------+------------ + * 00 00 00 00 | 01 01 01 01 | 02 02 02 02 + * 00 00 00 00 | 01 01 01 01 | 02 02 02 02 + * + * 00 00 00 00 | 00 00 00 00 | 00 00 00 00 groupId[1] : 0..6 + * 00 00 00 00 | 00 00 00 00 | 00 00 00 00 + * ------------+-------------+------------ + * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 + * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 + * + * 00 01 02 03 | 08 09 10 11 | 16 17 18 19 globalThreadIds == threadId + groupId * threads; + * 04 05 06 07 | 12 13 14 15 | 20 21 22 23 + * ------------+-------------+------------ + * 24 25 26 27 | 32[33]34 35 | 40 41 42 43 + * 28 29 30 31 | 36 37 38 39 | 44 45 46 47 + * + * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 localX = threadId % localWidth; (for globalThreadId 33 = threadId = 01 : 01%4 =1) + * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 + * ------------+-------------+------------ + * 00 01 02 03 | 00[01]02 03 | 00 01 02 03 + * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 + * + * 00 00 00 00 | 00 00 00 00 | 00 00 00 00 localY = threadId /localWidth (for globalThreadId 33 = threadId = 01 : 01/4 =0) + * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 + * ------------+-------------+------------ + * 00 00 00 00 | 00[00]00 00 | 00 00 00 00 + * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 + * + * 00 01 02 03 | 04 05 06 07 | 08 09 10 11 globalX= + * 00 01 02 03 | 04 05 06 07 | 08 09 10 11 groupsPerLineWidth=globalWidth/localWidth (=12/4 =3) + * ------------+-------------+------------ groupInset =groupId%groupsPerLineWidth (=4%3 = 1) + * 00 01 02 03 | 04[05]06 07 | 08 09 10 11 + * 00 01 02 03 | 04 05 06 07 | 08 09 10 11 globalX = groupInset*localWidth+localX (= 1*4+1 = 5) + * + * 00 00 00 00 | 00 00 00 00 | 00 00 00 00 globalY + * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 + * ------------+-------------+------------ + * 02 02 02 02 | 02[02]02 02 | 02 02 02 02 + * 03 03 03 03 | 03 03 03 03 | 03 03 03 03 + * + * </pre> + * Assume we are trying to locate the id's for #33 + * + */ + threadIdSetter = new ThreadIdSetter() { + @Override + public void set(KernelState kernelState, int globalGroupId, int threadId) { + // (kernelState, globalGroupId, threadId) ->{ + kernelState.setLocalId(0, (threadId % localSize0)); // threadId % localWidth = (for 33 = 1 % 4 = 1) + kernelState.setLocalId(1, (threadId / localSize0)); // threadId / localWidth = (for 33 = 1 / 4 == 0) + + final int groupInset = globalGroupId % numGroups0; // 4%3 = 1 + kernelState.setGlobalId(0, ((groupInset * localSize0) + kernelState.getLocalIds()[0])); // 1*4+1=5 + + final int completeLines = (globalGroupId / numGroups0) * localSize1;// (4/3) * 2 + kernelState.setGlobalId(1, (completeLines + kernelState.getLocalIds()[1])); // 2+0 = 2 + kernelState.setGroupId(0, (globalGroupId % numGroups0)); + kernelState.setGroupId(1, (globalGroupId / numGroups0)); + } + }; + } + else if (_settings.range.getDims() == 3) { + //Same as 2D actually turns out that localId[0] is identical for all three dims so could be hoisted out of conditional code + threadIdSetter = new ThreadIdSetter() { + @Override + public void set(KernelState kernelState, int globalGroupId, int threadId) { + // (kernelState, globalGroupId, threadId) ->{ + kernelState.setLocalId(0, (threadId % localSize0)); + + kernelState.setLocalId(1, ((threadId / localSize0) % localSize1)); + + // the thread id's span WxHxD so threadId/(WxH) should yield the local depth + kernelState.setLocalId(2, (threadId / (localSize0 * localSize1))); + + kernelState.setGlobalId(0, (((globalGroupId % numGroups0) * localSize0) + kernelState.getLocalIds()[0])); + + kernelState.setGlobalId(1, + ((((globalGroupId / numGroups0) * localSize1) % globalSize1) + kernelState.getLocalIds()[1])); + + kernelState.setGlobalId(2, + (((globalGroupId / (numGroups0 * numGroups1)) * localSize2) + kernelState.getLocalIds()[2])); + + kernelState.setGroupId(0, (globalGroupId % numGroups0)); + kernelState.setGroupId(1, ((globalGroupId / numGroups0) % numGroups1)); + kernelState.setGroupId(2, (globalGroupId / (numGroups0 * numGroups1))); + } + }; + } + else + throw new IllegalArgumentException("Expected 1,2 or 3 dimensions, found " + _settings.range.getDims()); + for (passId = 0; passId < _settings.passes; passId++) { + if (getCancelState() == CANCEL_STATUS_TRUE) { + break; + } + /** + * Note that we emulate OpenCL by creating one thread per localId (across the group). + * + * So threadCount == range.getLocalSize(0)*range.getLocalSize(1)*range.getLocalSize(2); + * + * For a 1D range of 12 groups of 4 we create 4 threads. One per localId(0). + * + * We also clone the kernel 4 times. One per thread. + * + * We create local barrier which has a width of 4 + * + * Thread-0 handles localId(0) (global 0,4,8) + * Thread-1 handles localId(1) (global 1,5,7) + * Thread-2 handles localId(2) (global 2,6,10) + * Thread-3 handles localId(3) (global 3,7,11) + * + * This allows all threads to synchronize using the local barrier. + * + * Initially the use of local buffers seems broken as the buffers appears to be per Kernel. + * Thankfully Kernel.clone() performs a shallow clone of all buffers (local and global) + * So each of the cloned kernels actually still reference the same underlying local/global buffers. + * + * If the kernel uses local buffers but does not use barriers then it is possible for different groups + * to see mutations from each other (unlike OpenCL), however if the kernel does not us barriers then it + * cannot assume any coherence in OpenCL mode either (the failure mode will be different but still wrong) + * + * So even JTP mode use of local buffers will need to use barriers. Not for the same reason as OpenCL but to keep groups in lockstep. + * + **/ + for (int id = 0; id < threads; id++) { + final int threadId = id; + + /** + * We clone one kernel for each thread. + * + * They will all share references to the same range, localBarrier and global/local buffers because the clone is shallow. + * We need clones so that each thread can assign 'state' (localId/globalId/groupId) without worrying + * about other threads. + */ + final Kernel kernelClone = kernel.clone(); + final KernelState kernelState = kernelClone.getKernelState(); + kernelState.setRange(_settings.range); + kernelState.setPassId(passId); + + if (threads == 1) { + kernelState.disableLocalBarrier(); + } + else { + kernelState.setLocalBarrier(localBarrier); + } + + threadPool.submit( + // () -> { + new Runnable() { + public void run() { + try { + for (int globalGroupId = 0; globalGroupId < globalGroups; globalGroupId++) { + threadIdSetter.set(kernelState, globalGroupId, threadId); + kernelClone.run(); + } + } + catch (RuntimeException | Error e) { + logger.log(Level.SEVERE, "Execution failed", e); + } + finally { + await(joinBarrier); // This thread will rendezvous with dispatch thread here. This is effectively a join. + } + } + }); + } + + await(joinBarrier); // This dispatch thread waits for all worker threads here. + } + passId = PASS_ID_COMPLETED_EXECUTION; + } // execution mode == JTP + } + } finally { + passId = PASS_ID_COMPLETED_EXECUTION; + } + } + + private static void await(CyclicBarrier _barrier) { + try { + _barrier.await(); + } catch (final InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (final BrokenBarrierException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + private KernelArg[] args = null; + + private boolean usesOopConversion = false; + + /** + * + * @param arg + * @return + * @throws AparapiException + */ + private boolean prepareOopConversionBuffer(KernelArg arg) throws AparapiException { + usesOopConversion = true; + final Class<?> arrayClass = arg.getField().getType(); + ClassModel c = null; + boolean didReallocate = false; + + if (arg.getObjArrayElementModel() == null) { + final String tmp = arrayClass.getName().substring(2).replace('/', '.'); + final String arrayClassInDotForm = tmp.substring(0, tmp.length() - 1); + + if (logger.isLoggable(Level.FINE)) { + logger.fine("looking for type = " + arrayClassInDotForm); + } + + // get ClassModel of obj array from entrypt.objectArrayFieldsClasses + c = entryPoint.getObjectArrayFieldsClasses().get(arrayClassInDotForm); + arg.setObjArrayElementModel(c); + } else { + c = arg.getObjArrayElementModel(); + } + assert c != null : "should find class for elements " + arrayClass.getName(); + + final int arrayBaseOffset = UnsafeWrapper.arrayBaseOffset(arrayClass); + final int arrayScale = UnsafeWrapper.arrayIndexScale(arrayClass); + + if (logger.isLoggable(Level.FINEST)) { + logger.finest("Syncing obj array type = " + arrayClass + " cvtd= " + c.getClassWeAreModelling().getName() + + "arrayBaseOffset=" + arrayBaseOffset + " arrayScale=" + arrayScale); + } + + int objArraySize = 0; + Object newRef = null; + try { + newRef = arg.getField().get(kernel); + objArraySize = Array.getLength(newRef); + } catch (final IllegalAccessException e) { + throw new AparapiException(e); + } + + assert (newRef != null) && (objArraySize != 0) : "no data"; + + final int totalStructSize = c.getTotalStructSize(); + final int totalBufferSize = objArraySize * totalStructSize; + + // allocate ByteBuffer if first time or array changed + if ((arg.getObjArrayBuffer() == null) || (newRef != arg.getArray())) { + final ByteBuffer structBuffer = ByteBuffer.allocate(totalBufferSize); + arg.setObjArrayByteBuffer(structBuffer.order(ByteOrder.LITTLE_ENDIAN)); + arg.setObjArrayBuffer(arg.getObjArrayByteBuffer().array()); + didReallocate = true; + if (logger.isLoggable(Level.FINEST)) { + logger.finest("objArraySize = " + objArraySize + " totalStructSize= " + totalStructSize + " totalBufferSize=" + + totalBufferSize); + } + } else { + arg.getObjArrayByteBuffer().clear(); + } + + // copy the fields that the JNI uses + arg.setJavaArray(arg.getObjArrayBuffer()); + arg.setNumElements(objArraySize); + arg.setSizeInBytes(totalBufferSize); + + for (int j = 0; j < objArraySize; j++) { + int sizeWritten = 0; + + final Object object = UnsafeWrapper.getObject(newRef, arrayBaseOffset + (arrayScale * j)); + for (int i = 0; i < c.getStructMemberTypes().size(); i++) { + final TypeSpec t = c.getStructMemberTypes().get(i); + final long offset = c.getStructMemberOffsets().get(i); + + if (logger.isLoggable(Level.FINEST)) { + logger.finest("name = " + c.getStructMembers().get(i).getNameAndTypeEntry().getNameUTF8Entry().getUTF8() + " t= " + + t); + } + + switch (t) { + case I: { + final int x = UnsafeWrapper.getInt(object, offset); + arg.getObjArrayByteBuffer().putInt(x); + sizeWritten += t.getSize(); + break; + } + case F: { + final float x = UnsafeWrapper.getFloat(object, offset); + arg.getObjArrayByteBuffer().putFloat(x); + sizeWritten += t.getSize(); + break; + } + case J: { + final long x = UnsafeWrapper.getLong(object, offset); + arg.getObjArrayByteBuffer().putLong(x); + sizeWritten += t.getSize(); + break; + } + case Z: { + final boolean x = UnsafeWrapper.getBoolean(object, offset); + arg.getObjArrayByteBuffer().put(x == true ? (byte) 1 : (byte) 0); + // Booleans converted to 1 byte C chars for opencl + sizeWritten += TypeSpec.B.getSize(); + break; + } + case B: { + final byte x = UnsafeWrapper.getByte(object, offset); + arg.getObjArrayByteBuffer().put(x); + sizeWritten += t.getSize(); + break; + } + case D: { + throw new AparapiException("Double not implemented yet"); + } + default: + assert true == false : "typespec did not match anything"; + throw new AparapiException("Unhandled type in buffer conversion"); + } + } + + // add padding here if needed + if (logger.isLoggable(Level.FINEST)) { + logger.finest("sizeWritten = " + sizeWritten + " totalStructSize= " + totalStructSize); + } + + assert sizeWritten <= totalStructSize : "wrote too much into buffer"; + + while (sizeWritten < totalStructSize) { + if (logger.isLoggable(Level.FINEST)) { + logger.finest(arg.getName() + " struct pad byte = " + sizeWritten + " totalStructSize= " + totalStructSize); + } + arg.getObjArrayByteBuffer().put((byte) -1); + sizeWritten++; + } + } + + assert arg.getObjArrayByteBuffer().arrayOffset() == 0 : "should be zero"; + + return didReallocate; + } + + private void extractOopConversionBuffer(KernelArg arg) throws AparapiException { + final Class<?> arrayClass = arg.getField().getType(); + final ClassModel c = arg.getObjArrayElementModel(); + assert c != null : "should find class for elements: " + arrayClass.getName(); + assert arg.getArray() != null : "array is null"; + + final int arrayBaseOffset = UnsafeWrapper.arrayBaseOffset(arrayClass); + final int arrayScale = UnsafeWrapper.arrayIndexScale(arrayClass); + if (logger.isLoggable(Level.FINEST)) { + logger.finest("Syncing field:" + arg.getName() + ", bb=" + arg.getObjArrayByteBuffer() + ", type = " + arrayClass); + } + + int objArraySize = 0; + try { + objArraySize = Array.getLength(arg.getField().get(kernel)); + } catch (final IllegalAccessException e) { + throw new AparapiException(e); + } + + assert objArraySize > 0 : "should be > 0"; + + final int totalStructSize = c.getTotalStructSize(); + // int totalBufferSize = objArraySize * totalStructSize; + // assert arg.objArrayBuffer.length == totalBufferSize : "size should match"; + + arg.getObjArrayByteBuffer().rewind(); + + for (int j = 0; j < objArraySize; j++) { + int sizeWritten = 0; + final Object object = UnsafeWrapper.getObject(arg.getArray(), arrayBaseOffset + (arrayScale * j)); + for (int i = 0; i < c.getStructMemberTypes().size(); i++) { + final TypeSpec t = c.getStructMemberTypes().get(i); + final long offset = c.getStructMemberOffsets().get(i); + switch (t) { + case I: { + // read int value from buffer and store into obj in the array + final int x = arg.getObjArrayByteBuffer().getInt(); + if (logger.isLoggable(Level.FINEST)) { + logger.finest("fType = " + t.getShortName() + " x= " + x); + } + UnsafeWrapper.putInt(object, offset, x); + sizeWritten += t.getSize(); + break; + } + case F: { + final float x = arg.getObjArrayByteBuffer().getFloat(); + if (logger.isLoggable(Level.FINEST)) { + logger.finest("fType = " + t.getShortName() + " x= " + x); + } + UnsafeWrapper.putFloat(object, offset, x); + sizeWritten += t.getSize(); + break; + } + case J: { + final long x = arg.getObjArrayByteBuffer().getLong(); + if (logger.isLoggable(Level.FINEST)) { + logger.finest("fType = " + t.getShortName() + " x= " + x); + } + UnsafeWrapper.putLong(object, offset, x); + sizeWritten += t.getSize(); + break; + } + case Z: { + final byte x = arg.getObjArrayByteBuffer().get(); + if (logger.isLoggable(Level.FINEST)) { + logger.finest("fType = " + t.getShortName() + " x= " + x); + } + UnsafeWrapper.putBoolean(object, offset, (x == 1 ? true : false)); + // Booleans converted to 1 byte C chars for open cl + sizeWritten += TypeSpec.B.getSize(); + break; + } + case B: { + final byte x = arg.getObjArrayByteBuffer().get(); + if (logger.isLoggable(Level.FINEST)) { + logger.finest("fType = " + t.getShortName() + " x= " + x); + } + UnsafeWrapper.putByte(object, offset, x); + sizeWritten += t.getSize(); + break; + } + case D: { + throw new AparapiException("Double not implemented yet"); + } + default: + assert true == false : "typespec did not match anything"; + throw new AparapiException("Unhandled type in buffer conversion"); + } + } + + // add padding here if needed + if (logger.isLoggable(Level.FINEST)) { + logger.finest("sizeWritten = " + sizeWritten + " totalStructSize= " + totalStructSize); + } + + assert sizeWritten <= totalStructSize : "wrote too much into buffer"; + + while (sizeWritten < totalStructSize) { + // skip over pad bytes + arg.getObjArrayByteBuffer().get(); + sizeWritten++; + } + } + } + + private void restoreObjects() throws AparapiException { + for (int i = 0; i < argc; i++) { + final KernelArg arg = args[i]; + if ((arg.getType() & ARG_OBJ_ARRAY_STRUCT) != 0) { + extractOopConversionBuffer(arg); + } + } + } + + private boolean updateKernelArrayRefs() throws AparapiException { + boolean needsSync = false; + + for (int i = 0; i < argc; i++) { + final KernelArg arg = args[i]; + try { + if ((arg.getType() & ARG_ARRAY) != 0) { + Object newArrayRef; + newArrayRef = arg.getField().get(kernel); + + if (newArrayRef == null) { + throw new IllegalStateException("Cannot send null refs to kernel, reverting to java"); + } + + String fieldName = arg.getField().getName(); + int arrayLength = Array.getLength(newArrayRef); + Integer privateMemorySize = ClassModel.getPrivateMemorySizeFromField(arg.getField()); + if (privateMemorySize == null) { + privateMemorySize = ClassModel.getPrivateMemorySizeFromFieldName(fieldName); + } + if (privateMemorySize != null) { + if (arrayLength > privateMemorySize) { + throw new IllegalStateException("__private array field " + fieldName + " has illegal length " + arrayLength + + " > " + privateMemorySize); + } + } + + if ((arg.getType() & ARG_OBJ_ARRAY_STRUCT) != 0) { + prepareOopConversionBuffer(arg); + } else { + // set up JNI fields for normal arrays + arg.setJavaArray(newArrayRef); + arg.setNumElements(arrayLength); + arg.setSizeInBytes(arg.getNumElements() * arg.getPrimitiveSize()); + + if (((args[i].getType() & ARG_EXPLICIT) != 0) && puts.contains(newArrayRef)) { + args[i].setType(args[i].getType() | ARG_EXPLICIT_WRITE); + // System.out.println("detected an explicit write " + args[i].name); + puts.remove(newArrayRef); + } + } + + if (newArrayRef != arg.getArray()) { + needsSync = true; + + if (logger.isLoggable(Level.FINE)) { + logger.fine("saw newArrayRef for " + arg.getName() + " = " + newArrayRef + ", newArrayLen = " + + Array.getLength(newArrayRef)); + } + } + + arg.setArray(newArrayRef); + assert arg.getArray() != null : "null array ref"; + } else if ((arg.getType() & ARG_APARAPI_BUFFER) != 0) { + // TODO: check if the 2D/3D array is changed. + // can Arrays.equals help? + needsSync = true; // Always need syn + Object buffer = new Object(); + try { + buffer = arg.getField().get(kernel); + } catch (IllegalAccessException e) { + e.printStackTrace(); + } + int numDims = arg.getNumDims(); + Object subBuffer = buffer; + int[] dims = new int[numDims]; + for (int d = 0; d < numDims - 1; d++) { + dims[d] = Array.getLength(subBuffer); + subBuffer = Array.get(subBuffer, 0); + } + dims[numDims - 1] = Array.getLength(subBuffer); + arg.setDims(dims); + + int primitiveSize = getPrimitiveSize(arg.getType()); + int totalElements = 1; + for (int d = 0; d < numDims; d++) { + totalElements *= dims[d]; + } + arg.setJavaBuffer(buffer); + arg.setSizeInBytes(totalElements * primitiveSize); + arg.setArray(buffer); + } + } catch (final IllegalArgumentException e) { + e.printStackTrace(); + } catch (final IllegalAccessException e) { + e.printStackTrace(); + } + } + return needsSync; + } + + @SuppressWarnings("deprecation") + private Kernel executeOpenCL(ExecutionSettings _settings) throws AparapiException { + + // Read the array refs after kernel may have changed them + // We need to do this as input to computing the localSize + assert args != null : "args should not be null"; + final boolean needSync = updateKernelArrayRefs(); + if (needSync && logger.isLoggable(Level.FINE)) { + logger.fine("Need to resync arrays on " + kernel); + } + + // native side will reallocate array buffers if necessary + int returnValue = runKernelJNI(jniContextHandle, _settings.range, needSync, _settings.passes, inBufferRemote, outBufferRemote); + if (returnValue != 0) { + String reason = "OpenCL execution seems to have failed (runKernelJNI returned " + returnValue + ")"; + return fallBackToNextDevice(_settings, new AparapiException(reason)); + } + + if (usesOopConversion == true) { + restoreObjects(); + } + + if (logger.isLoggable(Level.FINE)) { + logger.fine("executeOpenCL completed. " + _settings.range); + } + + return kernel; + } + + @SuppressWarnings("deprecation") + synchronized private Kernel fallBackByExecutionMode(ExecutionSettings _settings) { + isFallBack = true; + if (kernel.hasNextExecutionMode()) { + kernel.tryNextExecutionMode(); + if (logger.isLoggable(Level.WARNING)) { + logger.warning("Trying next execution mode " + kernel.getExecutionMode()); + } + } else { + kernel.setFallbackExecutionMode(); + } + recreateRange(_settings); + return executeInternalInner(_settings); + } + + private void recreateRange(ExecutionSettings _settings) { + if (_settings.range.isLocalIsDerived() && !_settings.legacyExecutionMode) { + Device device = kernel.getTargetDevice(); + Range result; + switch (_settings.range.getDims()) { + case 1: { + result = Range.create(device, _settings.range.getGlobalSize_0()); + break; + } + case 2: { + result = Range.create2D(device, _settings.range.getGlobalSize_0(), _settings.range.getGlobalSize_1()); + break; + } + case 3: { + result = Range.create3D(device, _settings.range.getGlobalSize_0(), _settings.range.getGlobalSize_1(), _settings.range.getGlobalSize_2()); + break; + } + default: { + throw new AssertionError("Range.getDims() = " + _settings.range.getDims()); + } + } + _settings.range = result; + } + } + + private Kernel fallBackToNextDevice(ExecutionSettings _settings, String _reason) { + return fallBackToNextDevice(_settings, new AparapiException(_reason)); + } + + @SuppressWarnings("deprecation") + synchronized private Kernel fallBackToNextDevice(ExecutionSettings _settings, Exception _exception) { + return fallBackToNextDevice(_settings, _exception, false); + } + + @SuppressWarnings("deprecation") + synchronized private Kernel fallBackToNextDevice(ExecutionSettings _settings, Exception _exception, boolean _silently) { + isFallBack = true; + _settings.profile.onEvent(ProfilingEvent.EXECUTED); + if (_settings.legacyExecutionMode) { + if (!_silently && logger.isLoggable(Level.WARNING)) { + logger.warning("Execution mode " + kernel.getExecutionMode() + " failed for " + kernel + ": " + _exception.getMessage()); + _exception.printStackTrace(); + } + return fallBackByExecutionMode(_settings); + } else { + KernelPreferences preferences = KernelManager.instance().getPreferences(kernel); + if (!_silently && logger.isLoggable(Level.WARNING)) { + logger.warning("Device failed for " + kernel + ": " + _exception.getMessage()); + } + + preferences.markPreferredDeviceFailed(); + +// Device nextDevice = preferences.getPreferredDevice(kernel); +// +// if (nextDevice == null) { +// if (!_silently && logger.isLoggable(Level.SEVERE)) { +// logger.severe("No Devices left to try, giving up"); +// } +// throw new RuntimeException(_exception); +// } + if (!_silently && logger.isLoggable(Level.WARNING)) { + _exception.printStackTrace(); + logger.warning("Trying next device: " + describeDevice()); + } + } + + recreateRange(_settings); + return executeInternalInner(_settings); + } + + @SuppressWarnings("deprecation") + public synchronized Kernel execute(String _entrypoint, final Range _range, final int _passes) { + executing = true; + try { + clearCancelMultiPass(); + KernelProfile profile = KernelManager.instance().getProfile(kernel.getClass()); + KernelPreferences preferences = KernelManager.instance().getPreferences(kernel); + boolean legacyExecutionMode = kernel.getExecutionMode() != Kernel.EXECUTION_MODE.AUTO; + + ExecutionSettings settings = new ExecutionSettings(preferences, profile, _entrypoint, _range, _passes, legacyExecutionMode); + // Two Kernels of the same class share the same KernelPreferences object, and since failure (fallback) generally mutates + // the preferences object, we must lock it. Note this prevents two Kernels of the same class executing simultaneously. + synchronized (preferences) { + return executeInternalOuter(settings); + } + } finally { + executing = false; + clearCancelMultiPass(); + } + } + + private synchronized Kernel executeInternalOuter(ExecutionSettings _settings) { + try { + return executeInternalInner(_settings); + } finally { + if (kernel.isAutoCleanUpArrays() &&_settings.range.getGlobalSize_0() != 0) { + cleanUpArrays(); + } + } + } + + @SuppressWarnings("deprecation") + private synchronized Kernel executeInternalInner(ExecutionSettings _settings) { + + if (_settings.range == null) { + throw new IllegalStateException("range can't be null"); + } + + EXECUTION_MODE requestedExecutionMode = kernel.getExecutionMode(); + + if (requestedExecutionMode.isOpenCL() && _settings.range.getDevice() != null && !(_settings.range.getDevice() instanceof OpenCLDevice)) { + fallBackToNextDevice(_settings, "OpenCL EXECUTION_MODE was requested but Device supplied was not an OpenCLDevice"); + } + + Device device = _settings.range.getDevice(); + boolean userSpecifiedDevice = true; + if (device == null) { + userSpecifiedDevice = false; + if (!_settings.legacyExecutionMode) { + device = _settings.preferences.getPreferredDevice(kernel); + if (device == null) { + // the default fallback when KernelPreferences has run out of options is JTP + device = JavaDevice.THREAD_POOL; + } + } else { + if (requestedExecutionMode == EXECUTION_MODE.JTP) { + device = JavaDevice.THREAD_POOL; + } else if (requestedExecutionMode == EXECUTION_MODE.SEQ) { + device = JavaDevice.SEQUENTIAL; + } + } + } else { + boolean compatible = isDeviceCompatible(device); + if (!compatible) { + throw new AssertionError("user supplied Device incompatible with current EXECUTION_MODE or getTargetDevice(); device = " + + device.getShortDescription() + "; kernel = " + kernel); + } + } + + try { + OpenCLDevice openCLDevice = device instanceof OpenCLDevice ? (OpenCLDevice) device : null; + + int jniFlags = 0; + // for legacy reasons use old logic where Kernel.EXECUTION_MODE is not AUTO + if (_settings.legacyExecutionMode && !userSpecifiedDevice && requestedExecutionMode.isOpenCL()) { + if (requestedExecutionMode.equals(EXECUTION_MODE.GPU)) { + // Get the best GPU + openCLDevice = (OpenCLDevice) KernelManager.DeprecatedMethods.bestGPU(); + jniFlags |= JNI_FLAG_USE_GPU; // this flag might be redundant now. + if (openCLDevice == null) { + return fallBackToNextDevice(_settings, "GPU request can't be honored, no GPU device"); + } + } else if (requestedExecutionMode.equals(EXECUTION_MODE.ACC)) { + // Get the best ACC + openCLDevice = (OpenCLDevice) KernelManager.DeprecatedMethods.bestACC(); + jniFlags |= JNI_FLAG_USE_ACC; // this flag might be redundant now. + if (openCLDevice == null) { + return fallBackToNextDevice(_settings, "ACC request can't be honored, no ACC device"); + } + } else { + // We fetch the first CPU device + openCLDevice = (OpenCLDevice) KernelManager.DeprecatedMethods.firstDevice(Device.TYPE.CPU); + if (openCLDevice == null) { + return fallBackToNextDevice(_settings, "CPU request can't be honored, no CPU device"); + } + } + } else { + if (device.getType() == Device.TYPE.GPU) { + jniFlags |= JNI_FLAG_USE_GPU; // this flag might be redundant now. + } else if (device.getType() == Device.TYPE.ACC) { + jniFlags |= JNI_FLAG_USE_ACC; // this flag might be redundant now. + } + } + if (device == null && openCLDevice != null) { + device = openCLDevice; + } + assert device != null : "No device available"; + _settings.profile.onStart(device); + /* for backward compatibility reasons we still honor execution mode */ + boolean isOpenCl = requestedExecutionMode.isOpenCL() || device instanceof OpenCLDevice; + if (isOpenCl) { + if ((entryPoint == null) || (isFallBack)) { + if (entryPoint == null) { + try { + final ClassModel classModel = ClassModel.createClassModel(kernel.getClass()); + entryPoint = classModel.getEntrypoint(_settings.entrypoint, kernel); + _settings.profile.onEvent(ProfilingEvent.CLASS_MODEL_BUILT); + } catch (final Exception exception) { + _settings.profile.onEvent(ProfilingEvent.CLASS_MODEL_BUILT); + return fallBackToNextDevice(_settings, exception); + } + } + + if ((entryPoint != null)) { + synchronized (Kernel.class) { // This seems to be needed because of a race condition uncovered with issue #68 http://code.google.com/p/aparapi/issues/detail?id=68 + + // jniFlags |= (Config.enableProfiling ? JNI_FLAG_ENABLE_PROFILING : 0); + // jniFlags |= (Config.enableProfilingCSV ? JNI_FLAG_ENABLE_PROFILING_CSV | JNI_FLAG_ENABLE_PROFILING : 0); + // jniFlags |= (Config.enableVerboseJNI ? JNI_FLAG_ENABLE_VERBOSE_JNI : 0); + // jniFlags |= (Config.enableVerboseJNIOpenCLResourceTracking ? JNI_FLAG_ENABLE_VERBOSE_JNI_OPENCL_RESOURCE_TRACKING :0); + // jniFlags |= (kernel.getExecutionMode().equals(Kernel.EXECUTION_MODE.GPU) ? JNI_FLAG_USE_GPU : 0); + // Init the device to check capabilities before emitting the + // code that requires the capabilities. + jniContextHandle = initJNI(kernel, openCLDevice, jniFlags); // openCLDevice will not be null here + _settings.profile.onEvent(ProfilingEvent.INIT_JNI); + } // end of synchronized! issue 68 + + if (jniContextHandle == 0) { + return fallBackToNextDevice(_settings, "initJNI failed to return a valid handle"); + } + + final String extensions = getExtensionsJNI(jniContextHandle); + capabilitiesSet = new HashSet<String>(); + + final StringTokenizer strTok = new StringTokenizer(extensions); + while (strTok.hasMoreTokens()) { + capabilitiesSet.add(strTok.nextToken()); + } + + if (logger.isLoggable(Level.FINE)) { + logger.fine("Capabilities initialized to :" + capabilitiesSet.toString()); + } + + if (entryPoint.requiresDoublePragma() && !hasFP64Support()) { + return fallBackToNextDevice(_settings, "FP64 required but not supported"); + } + + if (entryPoint.requiresByteAddressableStorePragma() && !hasByteAddressableStoreSupport()) { + return fallBackToNextDevice(_settings, "Byte addressable stores required but not supported"); + } + + final boolean all32AtomicsAvailable = hasGlobalInt32BaseAtomicsSupport() + && hasGlobalInt32ExtendedAtomicsSupport() && hasLocalInt32BaseAtomicsSupport() + && hasLocalInt32ExtendedAtomicsSupport(); + + if (entryPoint.requiresAtomic32Pragma() && !all32AtomicsAvailable) { + + return fallBackToNextDevice(_settings, "32 bit Atomics required but not supported"); + } + + String openCL; + synchronized (openCLCache) { + openCL = openCLCache.get(kernel.getClass()); + if (openCL == null) { + try { + openCL = KernelWriter.writeToString(entryPoint); + if (logger.isLoggable(Level.INFO)) { + logger.info(openCL); + } + else if (Config.enableShowGeneratedOpenCL) { + System.out.println(openCL); + } + _settings.profile.onEvent(ProfilingEvent.OPENCL_GENERATED); + openCLCache.put(kernel.getClass(), openCL); + } + catch (final CodeGenException codeGenException) { + openCLCache.put(kernel.getClass(), CODE_GEN_ERROR_MARKER); + _settings.profile.onEvent(ProfilingEvent.OPENCL_GENERATED); + return fallBackToNextDevice(_settings, codeGenException); + } + } + else { + if (openCL.equals(CODE_GEN_ERROR_MARKER)) { + _settings.profile.onEvent(ProfilingEvent.OPENCL_GENERATED); + boolean silently = true; // since we must have already reported the CodeGenException + return fallBackToNextDevice(_settings, null, silently); + } + } + } + + // Send the string to OpenCL to compile it, or if the compiled binary is already cached on JNI side just empty string to use cached binary + long handle; + if (BINARY_CACHING_DISABLED) { + handle = buildProgramJNI(jniContextHandle, openCL, ""); + } else { + synchronized (seenBinaryKeys) { + String binaryKey = kernel.getClass().getName() + ":" + device.getDeviceId(); + if (seenBinaryKeys.contains(binaryKey)) { + // use cached binary + logger.log(Level.INFO, "reusing cached binary for " + binaryKey); + handle = buildProgramJNI(jniContextHandle, "", binaryKey); + } + else { + // create and cache binary + logger.log(Level.INFO, "compiling new binary for " + binaryKey); + handle = buildProgramJNI(jniContextHandle, openCL, binaryKey); + seenBinaryKeys.add(binaryKey); + } + } + } + _settings.profile.onEvent(ProfilingEvent.OPENCL_COMPILED); + if (handle == 0) { + return fallBackToNextDevice(_settings, "OpenCL compile failed"); + } + + args = new KernelArg[entryPoint.getReferencedFields().size()]; + int i = 0; + + for (final Field field : entryPoint.getReferencedFields()) { + try { + field.setAccessible(true); + args[i] = new KernelArg(); + args[i].setName(field.getName()); + args[i].setField(field); + if ((field.getModifiers() & Modifier.STATIC) == Modifier.STATIC) { + args[i].setType(args[i].getType() | ARG_STATIC); + } + + final Class<?> type = field.getType(); + if (type.isArray()) { + + if (field.getAnnotation(Local.class) != null || args[i].getName().endsWith(Kernel.LOCAL_SUFFIX)) { + args[i].setType(args[i].getType() | ARG_LOCAL); + } else if ((field.getAnnotation(Constant.class) != null) + || args[i].getName().endsWith(Kernel.CONSTANT_SUFFIX)) { + args[i].setType(args[i].getType() | ARG_CONSTANT); + } else { + args[i].setType(args[i].getType() | ARG_GLOBAL); + } + if (isExplicit()) { + args[i].setType(args[i].getType() | ARG_EXPLICIT); + } + // for now, treat all write arrays as read-write, see bugzilla issue 4859 + // we might come up with a better solution later + args[i].setType(args[i].getType() + | (entryPoint.getArrayFieldAssignments().contains(field.getName()) ? (ARG_WRITE | ARG_READ) : 0)); + args[i].setType(args[i].getType() + | (entryPoint.getArrayFieldAccesses().contains(field.getName()) ? ARG_READ : 0)); + // args[i].type |= ARG_GLOBAL; + + if (type.getName().startsWith("[L")) { + args[i].setArray(null); // will get updated in updateKernelArrayRefs + args[i].setType(args[i].getType() + | (ARG_ARRAY | ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)); + + if (logger.isLoggable(Level.FINE)) { + logger.fine("tagging " + args[i].getName() + " as (ARG_ARRAY | ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)"); + } + } else if (type.getName().startsWith("[[")) { + + try { + setMultiArrayType(args[i], type); + } catch (AparapiException e) { + return fallBackToNextDevice(_settings, "failed to set kernel arguement " + + args[i].getName() + ". Aparapi only supports 2D and 3D arrays."); + } + } else { + + args[i].setArray(null); // will get updated in updateKernelArrayRefs + args[i].setType(args[i].getType() | ARG_ARRAY); + + args[i].setType(args[i].getType() | (type.isAssignableFrom(float[].class) ? ARG_FLOAT : 0)); + args[i].setType(args[i].getType() | (type.isAssignableFrom(int[].class) ? ARG_INT : 0)); + args[i].setType(args[i].getType() | (type.isAssignableFrom(boolean[].class) ? ARG_BOOLEAN : 0)); + args[i].setType(args[i].getType() | (type.isAssignableFrom(byte[].class) ? ARG_BYTE : 0)); + args[i].setType(args[i].getType() | (type.isAssignableFrom(char[].class) ? ARG_CHAR : 0)); + args[i].setType(args[i].getType() | (type.isAssignableFrom(double[].class) ? ARG_DOUBLE : 0)); + args[i].setType(args[i].getType() | (type.isAssignableFrom(long[].class) ? ARG_LONG : 0)); + args[i].setType(args[i].getType() | (type.isAssignableFrom(short[].class) ? ARG_SHORT : 0)); + + // arrays whose length is used will have an int arg holding + // the length as a kernel param + if (entryPoint.getArrayFieldArrayLengthUsed().contains(args[i].getName())) { + args[i].setType(args[i].getType() | ARG_ARRAYLENGTH); + } + + if (type.getName().startsWith("[L")) { + args[i].setType(args[i].getType() | (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)); + if (logger.isLoggable(Level.FINE)) { + logger.fine("tagging " + args[i].getName() + + " as (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)"); + } + } + } + } else if (type.isAssignableFrom(float.class)) { + args[i].setType(args[i].getType() | ARG_PRIMITIVE); + args[i].setType(args[i].getType() | ARG_FLOAT); + } else if (type.isAssignableFrom(int.class)) { + args[i].setType(args[i].getType() | ARG_PRIMITIVE); + args[i].setType(args[i].getType() | ARG_INT); + } else if (type.isAssignableFrom(double.class)) { + args[i].setType(args[i].getType() | ARG_PRIMITIVE); + args[i].setType(args[i].getType() | ARG_DOUBLE); + } else if (type.isAssignableFrom(long.class)) { + args[i].setType(args[i].getType() | ARG_PRIMITIVE); + args[i].setType(args[i].getType() | ARG_LONG); + } else if (type.isAssignableFrom(boolean.class)) { + args[i].setType(args[i].getType() | ARG_PRIMITIVE); + args[i].setType(args[i].getType() | ARG_BOOLEAN); + } else if (type.isAssignableFrom(byte.class)) { + args[i].setType(args[i].getType() | ARG_PRIMITIVE); + args[i].setType(args[i].getType() | ARG_BYTE); + } else if (type.isAssignableFrom(char.class)) { + args[i].setType(args[i].getType() | ARG_PRIMITIVE); + args[i].setType(args[i].getType() | ARG_CHAR); + } else if (type.isAssignableFrom(short.class)) { + args[i].setType(args[i].getType() | ARG_PRIMITIVE); + args[i].setType(args[i].getType() | ARG_SHORT); + } + // System.out.printf("in execute, arg %d %s %08x\n", i,args[i].name,args[i].type ); + } catch (final IllegalArgumentException e) { + e.printStackTrace(); + } + + args[i].setPrimitiveSize(getPrimitiveSize(args[i].getType())); + + if (logger.isLoggable(Level.FINE)) { + logger.fine("arg " + i + ", " + args[i].getName() + ", type=" + Integer.toHexString(args[i].getType()) + + ", primitiveSize=" + args[i].getPrimitiveSize()); + } + + i++; + } + + // at this point, i = the actual used number of arguments + // (private buffers do not get treated as arguments) + + argc = i; + + setArgsJNI(jniContextHandle, args, argc); + _settings.profile.onEvent(ProfilingEvent.PREPARE_EXECUTE); + try { + executeOpenCL(_settings); + isFallBack = false; + } catch (final AparapiException e) { + fallBackToNextDevice(_settings, e); + } + } else { // (entryPoint != null) && !entryPoint.shouldFallback() + fallBackToNextDevice(_settings, "failed to locate entrypoint"); + } + } else { // (entryPoint == null) || (isFallBack) + try { + executeOpenCL(_settings); + isFallBack = false; + } catch (final AparapiException e) { + fallBackToNextDevice(_settings, e); + } + } + } else { // isOpenCL + if (!(device instanceof JavaDevice)) { + fallBackToNextDevice(_settings, "Non-OpenCL Kernel.EXECUTION_MODE requested but device is not a JavaDevice "); + } + executeJava(_settings, (JavaDevice) device); + } + + if (Config.enableExecutionModeReporting) { + System.out.println("execution complete: " + kernel); + } + + return kernel; + } + finally { + _settings.profile.onEvent(ProfilingEvent.EXECUTED); + maybeReportProfile(_settings); + } + } + + @Override + public String toString() { + return "KernelRunner{" + kernel + "}"; + } + + private String describeDevice() { + Device device = KernelManager.instance().getPreferences(kernel).getPreferredDevice(kernel); + return (device == null) ? "<default fallback>" : device.getShortDescription(); + } + + private void maybeReportProfile(ExecutionSettings _settings) { + if (Config.dumpProfileOnExecution) { + StringBuilder report = new StringBuilder(); + report.append(KernelDeviceProfile.getTableHeader()).append('\n'); + report.append(_settings.profile.getLastDeviceProfile().getLastAsTableRow()); + System.out.println(report); + } + } + + @SuppressWarnings("deprecation") + private boolean isDeviceCompatible(Device device) { + Kernel.EXECUTION_MODE mode = kernel.getExecutionMode(); + if (mode != Kernel.EXECUTION_MODE.AUTO) { + switch (device.getType()) { + case GPU: + return mode == Kernel.EXECUTION_MODE.GPU; + case CPU: + return mode == Kernel.EXECUTION_MODE.CPU; + case JTP: + return mode == Kernel.EXECUTION_MODE.JTP; + case SEQ: + return mode == Kernel.EXECUTION_MODE.SEQ; + case ACC: + return mode == Kernel.EXECUTION_MODE.ACC; + default: + return false; + } + } else { + return (device == kernel.getTargetDevice()); + } + } + + public int getCancelState() { + return inBufferRemoteInt.get(0); + } + + public void cancelMultiPass() { + inBufferRemoteInt.put(0, CANCEL_STATUS_TRUE); + } + + private void clearCancelMultiPass() { + inBufferRemoteInt.put(0, CANCEL_STATUS_FALSE); + } + + /** + * Returns the index of the current pass, or one of two special constants with negative values to indicate special progress states. Those constants are + * {@link #PASS_ID_PREPARING_EXECUTION} to indicate that the Kernel has started executing but not reached the initial pass, or + * {@link #PASS_ID_COMPLETED_EXECUTION} to indicate that execution is complete (possibly due to early termination via {@link #cancelMultiPass()}), i.e. the Kernel + * is idle. {@link #PASS_ID_COMPLETED_EXECUTION} is also returned before the first execution has been invoked. + * + * <p>This can be used, for instance, to update a visual progress bar. + * + * @see #execute(String, Range, int) + */ + public int getCurrentPass() { + if (!executing) { + return PASS_ID_COMPLETED_EXECUTION; + } + + if (kernel.isRunningCL()) { + return getCurrentPassRemote(); + } else { + return getCurrentPassLocal(); + } + } + + /** + * True while any of the {@code execute()} methods are in progress. + */ + public boolean isExecuting() { + return executing; + } + + protected int getCurrentPassRemote() { + return outBufferRemoteInt.get(0); + } + + private int getCurrentPassLocal() { + return passId; + } + + private int getPrimitiveSize(int type) { + if ((type & ARG_FLOAT) != 0) { + return 4; + } else if ((type & ARG_INT) != 0) { + return 4; + } else if ((type & ARG_BYTE) != 0) { + return 1; + } else if ((type & ARG_CHAR) != 0) { + return 2; + } else if ((type & ARG_BOOLEAN) != 0) { + return 1; + } else if ((type & ARG_SHORT) != 0) { + return 2; + } else if ((type & ARG_LONG) != 0) { + return 8; + } else if ((type & ARG_DOUBLE) != 0) { + return 8; + } + return 0; + } + + private void setMultiArrayType(KernelArg arg, Class<?> type) throws AparapiException { + arg.setType(arg.getType() | (ARG_WRITE | ARG_READ | ARG_APARAPI_BUFFER)); + int numDims = 0; + while (type.getName().startsWith("[[[[")) { + throw new AparapiException("Aparapi only supports 2D and 3D arrays."); + } + arg.setType(arg.getType() | ARG_ARRAYLENGTH); + while (type.getName().charAt(numDims) == '[') { + numDims++; + } + arg.setNumDims(numDims); + arg.setJavaBuffer(null); // will get updated in updateKernelArrayRefs + arg.setArray(null); // will get updated in updateKernelArrayRefs + + Class<?> elementType = arg.getField().getType(); + while (elementType.isArray()) { + elementType = elementType.getComponentType(); + } + + if (elementType.isAssignableFrom(float.class)) { + arg.setType(arg.getType() | ARG_FLOAT); + } else if (elementType.isAssignableFrom(int.class)) { + arg.setType(arg.getType() | ARG_INT); + } else if (elementType.isAssignableFrom(boolean.class)) { + arg.setType(arg.getType() | ARG_BOOLEAN); + } else if (elementType.isAssignableFrom(byte.class)) { + arg.setType(arg.getType() | ARG_BYTE); + } else if (elementType.isAssignableFrom(char.class)) { + arg.setType(arg.getType() | ARG_CHAR); + } else if (elementType.isAssignableFrom(double.class)) { + arg.setType(arg.getType() | ARG_DOUBLE); + } else if (elementType.isAssignableFrom(long.class)) { + arg.setType(arg.getType() | ARG_LONG); + } else if (elementType.isAssignableFrom(short.class)) { + arg.setType(arg.getType() | ARG_SHORT); + } + } + + private final Set<Object> puts = new HashSet<Object>(); + + /** + * Enqueue a request to return this array from the GPU. This method blocks until the array is available. + * <br/> + * Note that <code>Kernel.put(type [])</code> calls will delegate to this call. + * <br/> + * Package public + * + * @param array + * It is assumed that this parameter is indeed an array (of int, float, short etc). + * + * @see Kernel#get(int[] arr) + * @see Kernel#get(float[] arr) + * @see Kernel#get(double[] arr) + * @see Kernel#get(long[] arr) + * @see Kernel#get(char[] arr) + * @see Kernel#get(boolean[] arr) + */ + public void get(Object array) { + if (explicit && (kernel.isRunningCL())) { + // Only makes sense when we are using OpenCL + getJNI(jniContextHandle, array); + } + } + + public List<ProfileInfo> getProfileInfo() { + if (explicit && (kernel.isRunningCL())) { + // Only makes sense when we are using OpenCL + return (getProfileInfoJNI(jniContextHandle)); + } else { + return (null); + } + } + + /** + * Tag this array so that it is explicitly enqueued before the kernel is executed. <br/> + * Note that <code>Kernel.put(type [])</code> calls will delegate to this call. <br/> + * Package public + * + * @param array + * It is assumed that this parameter is indeed an array (of int, float, short etc). + * @see Kernel#put(int[] arr) + * @see Kernel#put(float[] arr) + * @see Kernel#put(double[] arr) + * @see Kernel#put(long[] arr) + * @see Kernel#put(char[] arr) + * @see Kernel#put(boolean[] arr) + */ + + public void put(Object array) { + if (explicit && (kernel.isRunningCL())) { + // Only makes sense when we are using OpenCL + puts.add(array); + } + } + + private boolean explicit = false; + + public void setExplicit(boolean _explicit) { + explicit = _explicit; + } + + public boolean isExplicit() { + return (explicit); + } + + private static class ExecutionSettings { + final KernelPreferences preferences; + final KernelProfile profile; + final String entrypoint; + Range range; + final int passes; + final boolean legacyExecutionMode; + + private ExecutionSettings(KernelPreferences preferences, KernelProfile profile, String entrypoint, Range range, int passes, boolean legacyExecutionMode) { + this.preferences = preferences; + this.profile = profile; + this.entrypoint = entrypoint; + this.range = range; + this.passes = passes; + this.legacyExecutionMode = legacyExecutionMode; + } + + @Override + public String toString() { + return "ExecutionSettings{" + + "preferences=" + preferences + + ", profile=" + profile + + ", entrypoint='" + entrypoint + '\'' + + ", range=" + range + + ", passes=" + passes + + ", legacyExecutionMode=" + legacyExecutionMode + + '}'; + } + } +}