diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java
new file mode 100644
index 0000000000000000000000000000000000000000..d99809e9c3353db214658321b92e41b731dea54a
--- /dev/null
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java
@@ -0,0 +1,1775 @@
+/*
+Copyright (c) 2010-2011, Advanced Micro Devices, Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
+following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list of conditions and the following
+disclaimer. 
+
+Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
+disclaimer in the documentation and/or other materials provided with the distribution. 
+
+Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export
+laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through
+774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000.  Further, pursuant to Section 740.6 of the EAR,
+you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of 
+Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration 
+Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1,
+E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups
+D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject
+to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774
+of EAR).  For the most current Country Group listings, or for additional information about the EAR or your obligations
+under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. 
+
+*/
+package com.amd.aparapi.internal.kernel;
+
+import com.amd.aparapi.*;
+import com.amd.aparapi.Kernel.Constant;
+import com.amd.aparapi.Kernel.*;
+import com.amd.aparapi.device.*;
+import com.amd.aparapi.internal.annotation.*;
+import com.amd.aparapi.internal.exception.*;
+import com.amd.aparapi.internal.instruction.InstructionSet.*;
+import com.amd.aparapi.internal.jni.*;
+import com.amd.aparapi.internal.model.*;
+import com.amd.aparapi.internal.util.*;
+import com.amd.aparapi.internal.writer.*;
+import com.amd.aparapi.opencl.*;
+
+import java.lang.reflect.*;
+import java.nio.*;
+import java.util.*;
+import java.util.concurrent.*;
+import java.util.concurrent.ForkJoinPool.*;
+import java.util.logging.*;
+
+/**
+ * The class is responsible for executing <code>Kernel</code> implementations. <br/>
+ * 
+ * The <code>KernelRunner</code> is the real workhorse for Aparapi.  Each <code>Kernel</code> instance creates a single
+ * <code>KernelRunner</code> to encapsulate state and to help coordinate interactions between the <code>Kernel</code> 
+ * and its execution logic.<br/>
+ * 
+ * The <code>KernelRunner</code> is created <i>lazily</i> as a result of calling <code>Kernel.execute()</code>. At this 
+ * time the <code>ExecutionMode</code> is consulted to determine the default requested mode.  This will dictate how 
+ * the <code>KernelRunner</code> will attempt to execute the <code>Kernel</code>.
+ *   
+ * @see com.amd.aparapi.Kernel#execute(int _globalSize)
+ * 
+ * @author gfrost
+ *
+ */
+public class KernelRunner extends KernelRunnerJNI{
+
+   public static boolean BINARY_CACHING_DISABLED = false; // NOTE(review): not read in this section; presumably switches off OpenCL binary caching - confirm
+   // Length of the replacement arrays that cleanUpArrays() installs in place of array fields.
+   private static final int MINIMUM_ARRAY_SIZE = 1;
+
+   /** Value held by passId before the first pass starts. @see #getCurrentPass() */
+   @UsedByJNICode public static final int PASS_ID_PREPARING_EXECUTION = -2;
+   /** Value held by passId once execution has finished. @see #getCurrentPass() */
+   @UsedByJNICode public static final int PASS_ID_COMPLETED_EXECUTION = -1;
+   @UsedByJNICode public static final int CANCEL_STATUS_FALSE = 0;
+   @UsedByJNICode public static final int CANCEL_STATUS_TRUE = 1; // executeJava stops before the next pass when getCancelState() returns this
+   private static final String CODE_GEN_ERROR_MARKER = CodeGenException.class.getName(); // fully qualified class name of CodeGenException
+   // Logger used throughout this class; its name comes from Config.getLoggerName().
+   private static Logger logger = Logger.getLogger(Config.getLoggerName());
+   // Handle handed to disposeJNI(long) by dispose(); starts at 0.
+   private long jniContextHandle = 0;
+   // The Kernel this runner was created for; assigned once in the constructor.
+   private final Kernel kernel;
+   // NOTE(review): entryPoint and argc are not used within this section; presumably filled in when the kernel is prepared - confirm.
+   private Entrypoint entryPoint;
+
+   private int argc;
+
+   // may be read by a thread other than the control thread, hence volatile
+   private volatile boolean executing;
+
+   // may be read by a thread other than the control thread, hence volatile
+   private volatile int passId = PASS_ID_PREPARING_EXECUTION;
+
+   /**
+    * A direct ByteBuffer used for asynchronous intercommunication between java and JNI C code.
+    *
+    * <p>
+    * At present this is a 4 byte buffer to be interpreted as an int[1], used for passing from java to C a single integer interpreted as a cancellation indicator.
+    */
+   private final ByteBuffer inBufferRemote;
+   private final IntBuffer inBufferRemoteInt;
+
+   /** A direct ByteBuffer used for asynchronous intercommunication between java and JNI C code.
+    * <p>
+    * At present this is a 4 byte buffer to be interpreted as an int[1], used for passing from C to java a single integer
+    * interpreted as the current pass id.
+    */
+   private final ByteBuffer outBufferRemote;
+   private final IntBuffer outBufferRemoteInt;
+
+   private boolean isFallBack = false; // If isFallBack, rebuild the kernel (necessary?)
+   // Worker factory that delegates to the default ForkJoinPool factory and lowers each new thread to Thread.MIN_PRIORITY.
+   private static final ForkJoinWorkerThreadFactory lowPriorityThreadFactory = new ForkJoinWorkerThreadFactory(){
+      @Override public ForkJoinWorkerThread newThread(ForkJoinPool pool) {
+         ForkJoinWorkerThread newThread = ForkJoinPool.defaultForkJoinWorkerThreadFactory.newThread(pool);
+         newThread.setPriority(Thread.MIN_PRIORITY);
+         return newThread;
+      }
+   };
+   // Static pool used by executeJava: parallelism = available processors, no uncaught-exception handler, asyncMode off.
+   private static final ForkJoinPool threadPool = new ForkJoinPool(Runtime.getRuntime().availableProcessors(),
+         lowPriorityThreadFactory, null, false);
+   private static HashMap<Class<? extends Kernel>, String> openCLCache = new HashMap<>(); // NOTE(review): not used in this section; presumably generated OpenCL per Kernel class - confirm
+   private static LinkedHashSet<String> seenBinaryKeys = new LinkedHashSet<>(); // emptied by dispose() when kernel.isRunningCL()
+
+   /**
+    * Create a KernelRunner for a specific Kernel instance.
+    * 
+    * @param _kernel
+    */
+   public KernelRunner(Kernel _kernel) {
+      kernel = _kernel;
+
+      inBufferRemote = ByteBuffer.allocateDirect(4);
+      outBufferRemote = ByteBuffer.allocateDirect(4);
+
+      inBufferRemote.order(ByteOrder.nativeOrder());
+      outBufferRemote.order(ByteOrder.nativeOrder());
+
+      inBufferRemoteInt = inBufferRemote.asIntBuffer();
+      outBufferRemoteInt = outBufferRemote.asIntBuffer();
+
+      KernelManager.instance(); // ensures static initialization of KernelManager
+   }
+
+   /**
+    * @see Kernel#cleanUpArrays().
+    */
+   public void cleanUpArrays() {
+      if (args != null && kernel.isRunningCL()) {
+         for (KernelArg arg : args) {
+            if ((arg.getType() & KernelRunnerJNI.ARG_ARRAY) != 0) {
+               Field field = arg.getField();
+               if (field != null && field.getType().isArray() && !Modifier.isFinal(field.getModifiers())) {
+                  field.setAccessible(true);
+                  Class<?> componentType = field.getType().getComponentType();
+                  Object newValue = Array.newInstance(componentType, MINIMUM_ARRAY_SIZE);
+                  try {
+                     field.set(kernel, newValue);
+                  }
+                  catch (IllegalAccessException e) {
+                     throw new RuntimeException(e);
+                  }
+               }
+            }
+         }
+         kernel.execute(0);
+      } else if (kernel.isRunningCL()) {
+         logger.log(Level.SEVERE, "KernelRunner#cleanUpArrays() could not execute as no args available (Kernel has not been executed?)");
+      }
+   }
+
+   /**
+    * Releases the JNI-side state of this runner: when the kernel is running OpenCL, <code>disposeJNI()</code> is invoked with the
+    * context handle and the set of seen binary keys is emptied. <code>Kernel.dispose()</code> delegates here.<br/>
+    * @see KernelRunnerJNI#disposeJNI(long)
+    */
+   public synchronized void dispose() {
+      // The thread pool is shared, so it is deliberately not shut down when a single kernel is disposed.
+      if (!kernel.isRunningCL()) {
+         return;
+      }
+      disposeJNI(jniContextHandle);
+      seenBinaryKeys.clear();
+   }
+
+   private Set<String> capabilitiesSet; // consulted by the has*Support() methods, which throw IllegalStateException while this is still null
+
+   boolean hasFP64Support() { // membership test on capabilitiesSet for OpenCL.CL_KHR_FP64; fails fast while the set is null
+      if (capabilitiesSet != null) {
+         return capabilitiesSet.contains(OpenCL.CL_KHR_FP64);
+      }
+      throw new IllegalStateException("Capabilities queried before they were initialized");
+   }
+
+   boolean hasSelectFPRoundingModeSupport() { // membership test on capabilitiesSet for OpenCL.CL_KHR_SELECT_FPROUNDING_MODE; fails fast while the set is null
+      if (capabilitiesSet != null) {
+         return capabilitiesSet.contains(OpenCL.CL_KHR_SELECT_FPROUNDING_MODE);
+      }
+      throw new IllegalStateException("Capabilities queried before they were initialized");
+   }
+
+   boolean hasGlobalInt32BaseAtomicsSupport() { // membership test on capabilitiesSet for OpenCL.CL_KHR_GLOBAL_INT32_BASE_ATOMICS; fails fast while the set is null
+      if (capabilitiesSet != null) {
+         return capabilitiesSet.contains(OpenCL.CL_KHR_GLOBAL_INT32_BASE_ATOMICS);
+      }
+      throw new IllegalStateException("Capabilities queried before they were initialized");
+   }
+
+   boolean hasGlobalInt32ExtendedAtomicsSupport() { // membership test on capabilitiesSet for OpenCL.CL_KHR_GLOBAL_INT32_EXTENDED_ATOMICS; fails fast while the set is null
+      if (capabilitiesSet != null) {
+         return capabilitiesSet.contains(OpenCL.CL_KHR_GLOBAL_INT32_EXTENDED_ATOMICS);
+      }
+      throw new IllegalStateException("Capabilities queried before they were initialized");
+   }
+
+   boolean hasLocalInt32BaseAtomicsSupport() { // membership test on capabilitiesSet for OpenCL.CL_KHR_LOCAL_INT32_BASE_ATOMICS; fails fast while the set is null
+      if (capabilitiesSet != null) {
+         return capabilitiesSet.contains(OpenCL.CL_KHR_LOCAL_INT32_BASE_ATOMICS);
+      }
+      throw new IllegalStateException("Capabilities queried before they were initialized");
+   }
+
+   boolean hasLocalInt32ExtendedAtomicsSupport() { // membership test on capabilitiesSet for OpenCL.CL_KHR_LOCAL_INT32_EXTENDED_ATOMICS; fails fast while the set is null
+      if (capabilitiesSet != null) {
+         return capabilitiesSet.contains(OpenCL.CL_KHR_LOCAL_INT32_EXTENDED_ATOMICS);
+      }
+      throw new IllegalStateException("Capabilities queried before they were initialized");
+   }
+
+   boolean hasInt64BaseAtomicsSupport() { // membership test on capabilitiesSet for OpenCL.CL_KHR_INT64_BASE_ATOMICS; fails fast while the set is null
+      if (capabilitiesSet != null) {
+         return capabilitiesSet.contains(OpenCL.CL_KHR_INT64_BASE_ATOMICS);
+      }
+      throw new IllegalStateException("Capabilities queried before they were initialized");
+   }
+
+   boolean hasInt64ExtendedAtomicsSupport() { // membership test on capabilitiesSet for OpenCL.CL_KHR_INT64_EXTENDED_ATOMICS; fails fast while the set is null
+      if (capabilitiesSet != null) {
+         return capabilitiesSet.contains(OpenCL.CL_KHR_INT64_EXTENDED_ATOMICS);
+      }
+      throw new IllegalStateException("Capabilities queried before they were initialized");
+   }
+
+   boolean has3DImageWritesSupport() { // membership test on capabilitiesSet for OpenCL.CL_KHR_3D_IMAGE_WRITES; fails fast while the set is null
+      if (capabilitiesSet != null) {
+         return capabilitiesSet.contains(OpenCL.CL_KHR_3D_IMAGE_WRITES);
+      }
+      throw new IllegalStateException("Capabilities queried before they were initialized");
+   }
+
+   boolean hasByteAddressableStoreSupport() { // membership test on capabilitiesSet for OpenCL.CL_KHR_BYTE_ADDRESSABLE_SUPPORT; fails fast while the set is null
+      if (capabilitiesSet != null) {
+         return capabilitiesSet.contains(OpenCL.CL_KHR_BYTE_ADDRESSABLE_SUPPORT);
+      }
+      throw new IllegalStateException("Capabilities queried before they were initialized");
+   }
+
+   boolean hasFP16Support() { // membership test on capabilitiesSet for OpenCL.CL_KHR_FP16; fails fast while the set is null
+      if (capabilitiesSet != null) {
+         return capabilitiesSet.contains(OpenCL.CL_KHR_FP16);
+      }
+      throw new IllegalStateException("Capabilities queried before they were initialized");
+   }
+
+   boolean hasGLSharingSupport() { // membership test on capabilitiesSet for OpenCL.CL_KHR_GL_SHARING; fails fast while the set is null
+      if (capabilitiesSet != null) {
+         return capabilitiesSet.contains(OpenCL.CL_KHR_GL_SHARING);
+      }
+      throw new IllegalStateException("Capabilities queried before they were initialized");
+   }
+
+   private static final class FJSafeCyclicBarrier extends CyclicBarrier{
+      // A CyclicBarrier whose await() is performed inside ForkJoinPool.managedBlock(...), via the local ManagedBlocker below.
+      FJSafeCyclicBarrier(final int threads) {
+         super(threads);
+      }
+      /** Same contract as CyclicBarrier.await(): a broken barrier surfaces as BrokenBarrierException, not wrapped in a RuntimeException. */
+      @Override public int await() throws InterruptedException, BrokenBarrierException {
+         class Awaiter implements ManagedBlocker{
+            private int value;
+            private boolean released;
+            // block() may only declare InterruptedException, so a broken barrier is held here and rethrown by await().
+            private BrokenBarrierException broken;
+            @Override public boolean block() throws InterruptedException {
+               try {
+                  value = superAwait();
+               } catch (final BrokenBarrierException e) {
+                  broken = e;
+               }
+               released = true; // the wait is over either way; nothing further to block on
+               return true;
+            }
+
+            @Override public boolean isReleasable() {
+               return released;
+            }
+         }
+         final Awaiter awaiter = new Awaiter();
+         ForkJoinPool.managedBlock(awaiter);
+         if (awaiter.broken != null) {
+            throw awaiter.broken;
+         }
+         return awaiter.value;
+      }
+      // Bridge so the local Awaiter class can reach CyclicBarrier.await() without re-entering the override above.
+      int superAwait() throws InterruptedException, BrokenBarrierException {
+         return super.await();
+      }
+   }
+
+   // Strategy that writes the local/global/group ids for one (globalGroupId, threadId) pair into a KernelState; would be a @FunctionalInterface
+   private interface ThreadIdSetter{
+      void set(KernelState kernelState, int globalGroupId, int threadId);
+   }
+
+   /**
+    * Execute using a Java thread pool, or sequentially, or using an alternative algorithm, usually as a result of failing to compile or execute OpenCL
+    */
+   @SuppressWarnings("deprecation")
+   protected void executeJava(ExecutionSettings _settings, Device device) {
+      if (logger.isLoggable(Level.FINE)) {
+         logger.fine("executeJava: range = " + _settings.range + ", device = " + device);
+      }
+      boolean legacySequentialMode = kernel.getExecutionMode().equals(Kernel.EXECUTION_MODE.SEQ);
+
+      passId = PASS_ID_PREPARING_EXECUTION;
+      _settings.profile.onEvent(ProfilingEvent.PREPARE_EXECUTE);
+
+      try {
+         if (device == JavaDevice.ALTERNATIVE_ALGORITHM) {
+            if (kernel.hasFallbackAlgorithm()) {
+               for (passId = 0; passId < _settings.passes; ++passId) {
+                  kernel.executeFallbackAlgorithm(_settings.range, passId);
+               }
+            } else {
+               boolean silently = true; // not having an alternative algorithm is the normal state, and does not need reporting
+               fallBackToNextDevice(_settings, (Exception) null, silently);
+            }
+         } else {
+            final int localSize0 = _settings.range.getLocalSize(0);
+            final int localSize1 = _settings.range.getLocalSize(1);
+            final int localSize2 = _settings.range.getLocalSize(2);
+            final int globalSize1 = _settings.range.getGlobalSize(1);
+            if (legacySequentialMode || device == JavaDevice.SEQUENTIAL) {
+               /**
+                * SEQ mode is useful for testing trivial logic, but kernels which use SEQ mode cannot be used if the
+                * product of localSize(0..3) is >1.  So we can use multi-dim ranges but only if the local size is 1 in all dimensions.
+                *
+                * As a result of this barrier is only ever 1 work item wide and probably should be turned into a no-op.
+                *
+                * So we need to check if the range is valid here. If not we have no choice but to punt.
+                */
+               if ((localSize0 * localSize1 * localSize2) > 1) {
+                  throw new IllegalStateException("Can't run range with group size >1 sequentially. Barriers would deadlock!");
+               }
+
+               final Kernel kernelClone = kernel.clone();
+               final KernelState kernelState = kernelClone.getKernelState();
+
+               kernelState.setRange(_settings.range);
+               kernelState.setGroupId(0, 0);
+               kernelState.setGroupId(1, 0);
+               kernelState.setGroupId(2, 0);
+               kernelState.setLocalId(0, 0);
+               kernelState.setLocalId(1, 0);
+               kernelState.setLocalId(2, 0);
+               kernelState.setLocalBarrier(new FJSafeCyclicBarrier(1));
+
+               for (passId = 0; passId < _settings.passes; passId++) {
+                  if (getCancelState() == CANCEL_STATUS_TRUE) {
+                     break;
+                  }
+                  kernelState.setPassId(passId);
+
+                  if (_settings.range.getDims() == 1) {
+                     for (int id = 0; id < _settings.range.getGlobalSize(0); id++) {
+                        kernelState.setGlobalId(0, id);
+                        kernelClone.run();
+                     }
+                  }
+                  else if (_settings.range.getDims() == 2) {
+                     for (int x = 0; x < _settings.range.getGlobalSize(0); x++) {
+                        kernelState.setGlobalId(0, x);
+
+                        for (int y = 0; y < globalSize1; y++) {
+                           kernelState.setGlobalId(1, y);
+                           kernelClone.run();
+                        }
+                     }
+                  }
+                  else if (_settings.range.getDims() == 3) {
+                     for (int x = 0; x < _settings.range.getGlobalSize(0); x++) {
+                        kernelState.setGlobalId(0, x);
+
+                        for (int y = 0; y < globalSize1; y++) {
+                           kernelState.setGlobalId(1, y);
+
+                           for (int z = 0; z < _settings.range.getGlobalSize(2); z++) {
+                              kernelState.setGlobalId(2, z);
+                              kernelClone.run();
+                           }
+
+                           kernelClone.run();
+                        }
+                     }
+                  }
+               }
+               passId = PASS_ID_COMPLETED_EXECUTION;
+            }
+            else {
+               if (device != JavaDevice.THREAD_POOL && kernel.getExecutionMode() != Kernel.EXECUTION_MODE.JTP) {
+                  throw new AssertionError("unexpected JavaDevice or EXECUTION_MODE");
+               }
+               final int threads = localSize0 * localSize1 * localSize2;
+               final int numGroups0 = _settings.range.getNumGroups(0);
+               final int numGroups1 = _settings.range.getNumGroups(1);
+               final int globalGroups = numGroups0 * numGroups1 * _settings.range.getNumGroups(2);
+               /**
+                * This joinBarrier is the barrier that we provide for the kernel threads to rendezvous with the current dispatch thread.
+                * So this barrier is threadCount+1 wide (the +1 is for the dispatch thread)
+                */
+               final CyclicBarrier joinBarrier = new FJSafeCyclicBarrier(threads + 1);
+
+               /**
+                * This localBarrier is only ever used by the kernels.  If the kernel does not use the barrier the threads
+                * can get out of sync, we promised nothing in JTP mode.
+                *
+                * As with OpenCL all threads within a group must wait at the barrier or none.  It is a user error (possible deadlock!)
+                * if the barrier is in a conditional that is only executed by some of the threads within a group.
+                *
+                * Kernel developer must understand this.
+                *
+                * This barrier is threadCount wide.  We never hit the barrier from the dispatch thread.
+                */
+               final CyclicBarrier localBarrier = new FJSafeCyclicBarrier(threads);
+
+               final ThreadIdSetter threadIdSetter;
+
+               if (_settings.range.getDims() == 1) {
+                  threadIdSetter = new ThreadIdSetter() {
+                     @Override
+                     public void set(KernelState kernelState, int globalGroupId, int threadId) {
+                        //                   (kernelState, globalGroupId, threadId) ->{
+                        kernelState.setLocalId(0, (threadId % localSize0));
+                        kernelState.setGlobalId(0, (threadId + (globalGroupId * threads)));
+                        kernelState.setGroupId(0, globalGroupId);
+                     }
+                  };
+               }
+               else if (_settings.range.getDims() == 2) {
+
+                  /**
+                   * Consider a 12x4 grid of 4*2 local groups
+                   * <pre>
+                   *                                             threads = 4*2 = 8
+                   *                                             localWidth=4
+                   *                                             localHeight=2
+                   *                                             globalWidth=12
+                   *                                             globalHeight=4
+                   *
+                   *    00 01 02 03 | 04 05 06 07 | 08 09 10 11
+                   *    12 13 14 15 | 16 17 18 19 | 20 21 22 23
+                   *    ------------+-------------+------------
+                   *    24 25 26 27 | 28 29 30 31 | 32 33 34 35
+                   *    36 37 38 39 | 40 41 42 43 | 44 45 46 47
+                   *
+                   *    00 01 02 03 | 00 01 02 03 | 00 01 02 03  threadIds : [0..7]*6
+                   *    04 05 06 07 | 04 05 06 07 | 04 05 06 07
+                   *    ------------+-------------+------------
+                   *    00 01 02 03 | 00 01 02 03 | 00 01 02 03
+                   *    04 05 06 07 | 04 05 06 07 | 04 05 06 07
+                   *
+                   *    00 00 00 00 | 01 01 01 01 | 02 02 02 02  groupId[0] : 0..6
+                   *    00 00 00 00 | 01 01 01 01 | 02 02 02 02
+                   *    ------------+-------------+------------
+                   *    00 00 00 00 | 01 01 01 01 | 02 02 02 02
+                   *    00 00 00 00 | 01 01 01 01 | 02 02 02 02
+                   *
+                   *    00 00 00 00 | 00 00 00 00 | 00 00 00 00  groupId[1] : 0..6
+                   *    00 00 00 00 | 00 00 00 00 | 00 00 00 00
+                   *    ------------+-------------+------------
+                   *    01 01 01 01 | 01 01 01 01 | 01 01 01 01
+                   *    01 01 01 01 | 01 01 01 01 | 01 01 01 01
+                   *
+                   *    00 01 02 03 | 08 09 10 11 | 16 17 18 19  globalThreadIds == threadId + groupId * threads;
+                   *    04 05 06 07 | 12 13 14 15 | 20 21 22 23
+                   *    ------------+-------------+------------
+                   *    24 25 26 27 | 32[33]34 35 | 40 41 42 43
+                   *    28 29 30 31 | 36 37 38 39 | 44 45 46 47
+                   *
+                   *    00 01 02 03 | 00 01 02 03 | 00 01 02 03  localX = threadId % localWidth; (for globalThreadId 33 = threadId = 01 : 01%4 =1)
+                   *    00 01 02 03 | 00 01 02 03 | 00 01 02 03
+                   *    ------------+-------------+------------
+                   *    00 01 02 03 | 00[01]02 03 | 00 01 02 03
+                   *    00 01 02 03 | 00 01 02 03 | 00 01 02 03
+                   *
+                   *    00 00 00 00 | 00 00 00 00 | 00 00 00 00  localY = threadId /localWidth  (for globalThreadId 33 = threadId = 01 : 01/4 =0)
+                   *    01 01 01 01 | 01 01 01 01 | 01 01 01 01
+                   *    ------------+-------------+------------
+                   *    00 00 00 00 | 00[00]00 00 | 00 00 00 00
+                   *    01 01 01 01 | 01 01 01 01 | 01 01 01 01
+                   *
+                   *    00 01 02 03 | 04 05 06 07 | 08 09 10 11  globalX=
+                   *    00 01 02 03 | 04 05 06 07 | 08 09 10 11     groupsPerLineWidth=globalWidth/localWidth (=12/4 =3)
+                   *    ------------+-------------+------------     groupInset =groupId%groupsPerLineWidth (=4%3 = 1)
+                   *    00 01 02 03 | 04[05]06 07 | 08 09 10 11
+                   *    00 01 02 03 | 04 05 06 07 | 08 09 10 11     globalX = groupInset*localWidth+localX (= 1*4+1 = 5)
+                   *
+                   *    00 00 00 00 | 00 00 00 00 | 00 00 00 00  globalY
+                   *    01 01 01 01 | 01 01 01 01 | 01 01 01 01
+                   *    ------------+-------------+------------
+                   *    02 02 02 02 | 02[02]02 02 | 02 02 02 02
+                   *    03 03 03 03 | 03 03 03 03 | 03 03 03 03
+                   *
+                   * </pre>
+                   * Assume we are trying to locate the id's for #33
+                   *
+                   */
+                  threadIdSetter = new ThreadIdSetter() {
+                     @Override
+                     public void set(KernelState kernelState, int globalGroupId, int threadId) {
+                        //                   (kernelState, globalGroupId, threadId) ->{
+                        kernelState.setLocalId(0, (threadId % localSize0)); // threadId % localWidth =  (for 33 = 1 % 4 = 1)
+                        kernelState.setLocalId(1, (threadId / localSize0)); // threadId / localWidth = (for 33 = 1 / 4 == 0)
+
+                        final int groupInset = globalGroupId % numGroups0; // 4%3 = 1
+                        kernelState.setGlobalId(0, ((groupInset * localSize0) + kernelState.getLocalIds()[0])); // 1*4+1=5
+
+                        final int completeLines = (globalGroupId / numGroups0) * localSize1;// (4/3) * 2
+                        kernelState.setGlobalId(1, (completeLines + kernelState.getLocalIds()[1])); // 2+0 = 2
+                        kernelState.setGroupId(0, (globalGroupId % numGroups0));
+                        kernelState.setGroupId(1, (globalGroupId / numGroups0));
+                     }
+                  };
+               }
+               else if (_settings.range.getDims() == 3) {
+                  //Same as 2D actually turns out that localId[0] is identical for all three dims so could be hoisted out of conditional code
+                  threadIdSetter = new ThreadIdSetter() {
+                     @Override
+                     public void set(KernelState kernelState, int globalGroupId, int threadId) {
+                        //                   (kernelState, globalGroupId, threadId) ->{
+                        kernelState.setLocalId(0, (threadId % localSize0));
+
+                        kernelState.setLocalId(1, ((threadId / localSize0) % localSize1));
+
+                        // the thread id's span WxHxD so threadId/(WxH) should yield the local depth
+                        kernelState.setLocalId(2, (threadId / (localSize0 * localSize1)));
+
+                        kernelState.setGlobalId(0, (((globalGroupId % numGroups0) * localSize0) + kernelState.getLocalIds()[0]));
+
+                        kernelState.setGlobalId(1,
+                        ((((globalGroupId / numGroups0) * localSize1) % globalSize1) + kernelState.getLocalIds()[1]));
+
+                        kernelState.setGlobalId(2,
+                        (((globalGroupId / (numGroups0 * numGroups1)) * localSize2) + kernelState.getLocalIds()[2]));
+
+                        kernelState.setGroupId(0, (globalGroupId % numGroups0));
+                        kernelState.setGroupId(1, ((globalGroupId / numGroups0) % numGroups1));
+                        kernelState.setGroupId(2, (globalGroupId / (numGroups0 * numGroups1)));
+                     }
+                  };
+               }
+               else
+                  throw new IllegalArgumentException("Expected 1,2 or 3 dimensions, found " + _settings.range.getDims());
+               for (passId = 0; passId < _settings.passes; passId++) {
+                  if (getCancelState() == CANCEL_STATUS_TRUE) {
+                     break;
+                  }
+                  /**
+                   * Note that we emulate OpenCL by creating one thread per localId (across the group).
+                   *
+                   * So threadCount == range.getLocalSize(0)*range.getLocalSize(1)*range.getLocalSize(2);
+                   *
+                   * For a 1D range of 12 groups of 4 we create 4 threads. One per localId(0).
+                   *
+                   * We also clone the kernel 4 times. One per thread.
+                   *
+                   * We create local barrier which has a width of 4
+                   *
+                   *    Thread-0 handles localId(0) (global 0,4,8)
+                   *    Thread-1 handles localId(1) (global 1,5,7)
+                   *    Thread-2 handles localId(2) (global 2,6,10)
+                   *    Thread-3 handles localId(3) (global 3,7,11)
+                   *
+                   * This allows all threads to synchronize using the local barrier.
+                   *
+                   * Initially the use of local buffers seems broken as the buffers appears to be per Kernel.
+                   * Thankfully Kernel.clone() performs a shallow clone of all buffers (local and global)
+                   * So each of the cloned kernels actually still reference the same underlying local/global buffers.
+                   *
+                   * If the kernel uses local buffers but does not use barriers then it is possible for different groups
+                   * to see mutations from each other (unlike OpenCL), however if the kernel does not us barriers then it
+                   * cannot assume any coherence in OpenCL mode either (the failure mode will be different but still wrong)
+                   *
+                   * So even JTP mode use of local buffers will need to use barriers. Not for the same reason as OpenCL but to keep groups in lockstep.
+                   *
+                   **/
+                  for (int id = 0; id < threads; id++) {
+                     final int threadId = id;
+
+                     /**
+                      *  We clone one kernel for each thread.
+                      *
+                      *  They will all share references to the same range, localBarrier and global/local buffers because the clone is shallow.
+                      *  We need clones so that each thread can assign 'state' (localId/globalId/groupId) without worrying
+                      *  about other threads.
+                      */
+                     final Kernel kernelClone = kernel.clone();
+                     final KernelState kernelState = kernelClone.getKernelState();
+                     kernelState.setRange(_settings.range);
+                     kernelState.setPassId(passId);
+
+                     if (threads == 1) {
+                        kernelState.disableLocalBarrier();
+                     }
+                     else {
+                        kernelState.setLocalBarrier(localBarrier);
+                     }
+
+                     threadPool.submit(
+                     //                     () -> {
+                     new Runnable() {
+                        public void run() {
+                           try {
+                              for (int globalGroupId = 0; globalGroupId < globalGroups; globalGroupId++) {
+                                 threadIdSetter.set(kernelState, globalGroupId, threadId);
+                                 kernelClone.run();
+                              }
+                           }
+                           catch (RuntimeException | Error e) {
+                              logger.log(Level.SEVERE, "Execution failed", e);
+                           }
+                           finally {
+                              await(joinBarrier); // This thread will rendezvous with dispatch thread here. This is effectively a join.
+                           }
+                        }
+                     });
+                  }
+
+                  await(joinBarrier); // This dispatch thread waits for all worker threads here.
+               }
+               passId = PASS_ID_COMPLETED_EXECUTION;
+            } // execution mode == JTP
+         }
+      } finally {
+         passId = PASS_ID_COMPLETED_EXECUTION;
+      }
+   }
+
+   /**
+    * Blocks the calling thread on the supplied barrier until all parties have arrived.
+    * <p>
+    * Failures are reported on stderr rather than thrown, so callers (the worker threads and the dispatch thread that
+    * rendezvous on the join barrier) always continue past this call. If the wait is interrupted, the thread's
+    * interrupt status is restored so that code further up the stack can still observe the interruption.
+    *
+    * @param _barrier the barrier to wait on
+    */
+   private static void await(CyclicBarrier _barrier) {
+      try {
+         _barrier.await();
+      } catch (final InterruptedException e) {
+         // Catching InterruptedException clears the interrupt flag; set it again instead of losing the request.
+         Thread.currentThread().interrupt();
+         e.printStackTrace();
+      } catch (final BrokenBarrierException e) {
+         // The barrier was broken (for example because another party was interrupted); nothing to recover here.
+         e.printStackTrace();
+      }
+   }
+
+   /** Kernel arguments, one per field referenced by the entry point; allocated while the OpenCL kernel is being set up. */
+   private KernelArg[] args = null;
+
+   /**
+    * Set to {@code true} the first time an object array is converted into a struct buffer; when set, a successful
+    * OpenCL execution is followed by copying the buffers back into the Java objects.
+    */
+   private boolean usesOopConversion = false;
+
+   /**
+    * Serialises the object array referenced by the given argument into a little-endian byte buffer of packed structs,
+    * one struct per array element, and publishes that buffer through the argument's JNI-facing fields.
+    * <p>
+    * The element {@link ClassModel} is looked up from the entry point on first use and cached on the argument. The
+    * backing buffer is (re)allocated when none exists yet or when the kernel field now refers to a different array
+    * than the one recorded on the argument; otherwise the existing buffer is cleared and overwritten.
+    *
+    * @param arg the kernel argument describing an array-of-objects field
+    * @return {@code true} if a new backing buffer was allocated during this call
+    * @throws AparapiException if the field cannot be read reflectively, or a struct member has a type that is not
+    *         handled here (double is explicitly rejected)
+    */
+   private boolean prepareOopConversionBuffer(KernelArg arg) throws AparapiException {
+      // Remember that at least one argument needs copying back after execution (see executeOpenCL).
+      usesOopConversion = true;
+      final Class<?> arrayClass = arg.getField().getType();
+      ClassModel c = null;
+      boolean didReallocate = false;
+
+      if (arg.getObjArrayElementModel() == null) {
+         // An object array class name has the form "[Lpkg.Type;": drop the leading "[L" here and the trailing ";" below.
+         final String tmp = arrayClass.getName().substring(2).replace('/', '.');
+         final String arrayClassInDotForm = tmp.substring(0, tmp.length() - 1);
+
+         if (logger.isLoggable(Level.FINE)) {
+            logger.fine("looking for type = " + arrayClassInDotForm);
+         }
+
+         // get ClassModel of obj array from entrypt.objectArrayFieldsClasses
+         c = entryPoint.getObjectArrayFieldsClasses().get(arrayClassInDotForm);
+         // Cache the model on the argument so later calls skip the lookup.
+         arg.setObjArrayElementModel(c);
+      } else {
+         c = arg.getObjArrayElementModel();
+      }
+      // NOTE: only checked when assertions are enabled; otherwise a missing model surfaces as a NullPointerException below.
+      assert c != null : "should find class for elements " + arrayClass.getName();
+
+      // Layout of the Java object array itself, used to locate each element reference via Unsafe.
+      final int arrayBaseOffset = UnsafeWrapper.arrayBaseOffset(arrayClass);
+      final int arrayScale = UnsafeWrapper.arrayIndexScale(arrayClass);
+
+      if (logger.isLoggable(Level.FINEST)) {
+         logger.finest("Syncing obj array type = " + arrayClass + " cvtd= " + c.getClassWeAreModelling().getName()
+               + "arrayBaseOffset=" + arrayBaseOffset + " arrayScale=" + arrayScale);
+      }
+
+      int objArraySize = 0;
+      Object newRef = null;
+      try {
+         // Read the current value of the kernel's field; the kernel may have re-assigned it since the last run.
+         newRef = arg.getField().get(kernel);
+         objArraySize = Array.getLength(newRef);
+      } catch (final IllegalAccessException e) {
+         throw new AparapiException(e);
+      }
+
+      // Array.getLength above has already thrown for a null reference, so this mainly guards against empty arrays.
+      assert (newRef != null) && (objArraySize != 0) : "no data";
+
+      final int totalStructSize = c.getTotalStructSize();
+      final int totalBufferSize = objArraySize * totalStructSize;
+
+      // allocate ByteBuffer if first time or array changed
+      if ((arg.getObjArrayBuffer() == null) || (newRef != arg.getArray())) {
+         // Heap buffer: its backing byte[] is what gets exposed below as the Java array for JNI.
+         final ByteBuffer structBuffer = ByteBuffer.allocate(totalBufferSize);
+         arg.setObjArrayByteBuffer(structBuffer.order(ByteOrder.LITTLE_ENDIAN));
+         arg.setObjArrayBuffer(arg.getObjArrayByteBuffer().array());
+         didReallocate = true;
+         if (logger.isLoggable(Level.FINEST)) {
+            logger.finest("objArraySize = " + objArraySize + " totalStructSize= " + totalStructSize + " totalBufferSize="
+                  + totalBufferSize);
+         }
+      } else {
+         // Same array as before: reuse the buffer, resetting its position so the writes below start at offset 0.
+         arg.getObjArrayByteBuffer().clear();
+      }
+
+      // copy the fields that the JNI uses
+      arg.setJavaArray(arg.getObjArrayBuffer());
+      arg.setNumElements(objArraySize);
+      arg.setSizeInBytes(totalBufferSize);
+
+      // Write one struct per array element, members in the order given by the class model.
+      for (int j = 0; j < objArraySize; j++) {
+         int sizeWritten = 0;
+
+         // Fetch the j-th element reference straight out of the object array.
+         final Object object = UnsafeWrapper.getObject(newRef, arrayBaseOffset + (arrayScale * j));
+         for (int i = 0; i < c.getStructMemberTypes().size(); i++) {
+            final TypeSpec t = c.getStructMemberTypes().get(i);
+            // Offset of this member's field within the Java object, as recorded by the class model.
+            final long offset = c.getStructMemberOffsets().get(i);
+
+            if (logger.isLoggable(Level.FINEST)) {
+               logger.finest("name = " + c.getStructMembers().get(i).getNameAndTypeEntry().getNameUTF8Entry().getUTF8() + " t= "
+                     + t);
+            }
+
+            // Each case reads the field value from the object and appends it at the buffer's current position.
+            switch (t) {
+               case I: {
+                  final int x = UnsafeWrapper.getInt(object, offset);
+                  arg.getObjArrayByteBuffer().putInt(x);
+                  sizeWritten += t.getSize();
+                  break;
+               }
+               case F: {
+                  final float x = UnsafeWrapper.getFloat(object, offset);
+                  arg.getObjArrayByteBuffer().putFloat(x);
+                  sizeWritten += t.getSize();
+                  break;
+               }
+               case J: {
+                  final long x = UnsafeWrapper.getLong(object, offset);
+                  arg.getObjArrayByteBuffer().putLong(x);
+                  sizeWritten += t.getSize();
+                  break;
+               }
+               case Z: {
+                  final boolean x = UnsafeWrapper.getBoolean(object, offset);
+                  arg.getObjArrayByteBuffer().put(x == true ? (byte) 1 : (byte) 0);
+                  // Booleans converted to 1 byte C chars for opencl
+                  sizeWritten += TypeSpec.B.getSize();
+                  break;
+               }
+               case B: {
+                  final byte x = UnsafeWrapper.getByte(object, offset);
+                  arg.getObjArrayByteBuffer().put(x);
+                  sizeWritten += t.getSize();
+                  break;
+               }
+               case D: {
+                  throw new AparapiException("Double not implemented yet");
+               }
+               default:
+                  // With assertions enabled this fails fast; otherwise the exception below reports the problem.
+                  assert true == false : "typespec did not match anything";
+                  throw new AparapiException("Unhandled type in buffer conversion");
+            }
+         }
+
+         // add padding here if needed
+         if (logger.isLoggable(Level.FINEST)) {
+            logger.finest("sizeWritten = " + sizeWritten + " totalStructSize= " + totalStructSize);
+         }
+
+         assert sizeWritten <= totalStructSize : "wrote too much into buffer";
+
+         // Fill the rest of the struct with 0xFF pad bytes so every element occupies exactly totalStructSize bytes.
+         while (sizeWritten < totalStructSize) {
+            if (logger.isLoggable(Level.FINEST)) {
+               logger.finest(arg.getName() + " struct pad byte = " + sizeWritten + " totalStructSize= " + totalStructSize);
+            }
+            arg.getObjArrayByteBuffer().put((byte) -1);
+            sizeWritten++;
+         }
+      }
+
+      // The backing array is handed over as a whole, so the buffer must not be an offset view into it.
+      assert arg.getObjArrayByteBuffer().arrayOffset() == 0 : "should be zero";
+
+      return didReallocate;
+   }
+
+   private void extractOopConversionBuffer(KernelArg arg) throws AparapiException {
+      final Class<?> arrayClass = arg.getField().getType();
+      final ClassModel c = arg.getObjArrayElementModel();
+      assert c != null : "should find class for elements: " + arrayClass.getName();
+      assert arg.getArray() != null : "array is null";
+
+      final int arrayBaseOffset = UnsafeWrapper.arrayBaseOffset(arrayClass);
+      final int arrayScale = UnsafeWrapper.arrayIndexScale(arrayClass);
+      if (logger.isLoggable(Level.FINEST)) {
+         logger.finest("Syncing field:" + arg.getName() + ", bb=" + arg.getObjArrayByteBuffer() + ", type = " + arrayClass);
+      }
+
+      int objArraySize = 0;
+      try {
+         objArraySize = Array.getLength(arg.getField().get(kernel));
+      } catch (final IllegalAccessException e) {
+         throw new AparapiException(e);
+      }
+
+      assert objArraySize > 0 : "should be > 0";
+
+      final int totalStructSize = c.getTotalStructSize();
+      // int totalBufferSize = objArraySize * totalStructSize;
+      // assert arg.objArrayBuffer.length == totalBufferSize : "size should match";
+
+      arg.getObjArrayByteBuffer().rewind();
+
+      for (int j = 0; j < objArraySize; j++) {
+         int sizeWritten = 0;
+         final Object object = UnsafeWrapper.getObject(arg.getArray(), arrayBaseOffset + (arrayScale * j));
+         for (int i = 0; i < c.getStructMemberTypes().size(); i++) {
+            final TypeSpec t = c.getStructMemberTypes().get(i);
+            final long offset = c.getStructMemberOffsets().get(i);
+            switch (t) {
+               case I: {
+                  // read int value from buffer and store into obj in the array
+                  final int x = arg.getObjArrayByteBuffer().getInt();
+                  if (logger.isLoggable(Level.FINEST)) {
+                     logger.finest("fType = " + t.getShortName() + " x= " + x);
+                  }
+                  UnsafeWrapper.putInt(object, offset, x);
+                  sizeWritten += t.getSize();
+                  break;
+               }
+               case F: {
+                  final float x = arg.getObjArrayByteBuffer().getFloat();
+                  if (logger.isLoggable(Level.FINEST)) {
+                     logger.finest("fType = " + t.getShortName() + " x= " + x);
+                  }
+                  UnsafeWrapper.putFloat(object, offset, x);
+                  sizeWritten += t.getSize();
+                  break;
+               }
+               case J: {
+                  final long x = arg.getObjArrayByteBuffer().getLong();
+                  if (logger.isLoggable(Level.FINEST)) {
+                     logger.finest("fType = " + t.getShortName() + " x= " + x);
+                  }
+                  UnsafeWrapper.putLong(object, offset, x);
+                  sizeWritten += t.getSize();
+                  break;
+               }
+               case Z: {
+                  final byte x = arg.getObjArrayByteBuffer().get();
+                  if (logger.isLoggable(Level.FINEST)) {
+                     logger.finest("fType = " + t.getShortName() + " x= " + x);
+                  }
+                  UnsafeWrapper.putBoolean(object, offset, (x == 1 ? true : false));
+                  // Booleans converted to 1 byte C chars for open cl
+                  sizeWritten += TypeSpec.B.getSize();
+                  break;
+               }
+               case B: {
+                  final byte x = arg.getObjArrayByteBuffer().get();
+                  if (logger.isLoggable(Level.FINEST)) {
+                     logger.finest("fType = " + t.getShortName() + " x= " + x);
+                  }
+                  UnsafeWrapper.putByte(object, offset, x);
+                  sizeWritten += t.getSize();
+                  break;
+               }
+               case D: {
+                  throw new AparapiException("Double not implemented yet");
+               }
+               default:
+                  assert true == false : "typespec did not match anything";
+                  throw new AparapiException("Unhandled type in buffer conversion");
+            }
+         }
+
+         // add padding here if needed
+         if (logger.isLoggable(Level.FINEST)) {
+            logger.finest("sizeWritten = " + sizeWritten + " totalStructSize= " + totalStructSize);
+         }
+
+         assert sizeWritten <= totalStructSize : "wrote too much into buffer";
+
+         while (sizeWritten < totalStructSize) {
+            // skip over pad bytes
+            arg.getObjArrayByteBuffer().get();
+            sizeWritten++;
+         }
+      }
+   }
+
+   /**
+    * Walks the first {@code argc} kernel arguments and, for each one flagged {@code ARG_OBJ_ARRAY_STRUCT}, copies its
+    * struct buffer back into the Java objects.
+    *
+    * @throws AparapiException if unpacking any argument's buffer fails
+    */
+   private void restoreObjects() throws AparapiException {
+      for (int index = 0; index < argc; index++) {
+         final KernelArg candidate = args[index];
+         final boolean isStructArray = (candidate.getType() & ARG_OBJ_ARRAY_STRUCT) != 0;
+         if (!isStructArray) {
+            continue;
+         }
+         extractOopConversionBuffer(candidate);
+      }
+   }
+
+   /**
+    * Re-reads every array-valued kernel field and refreshes the matching {@link KernelArg} with the current array
+    * reference, element count and byte size, so the native side sees what the kernel object holds right now.
+    * <p>
+    * For {@code ARG_ARRAY} arguments, object arrays flagged {@code ARG_OBJ_ARRAY_STRUCT} are converted to a struct
+    * buffer, while other arrays are passed through directly; an explicitly managed array that is present in
+    * {@code puts} is flagged {@code ARG_EXPLICIT_WRITE} and removed from {@code puts}. For {@code ARG_APARAPI_BUFFER}
+    * arguments the dimensions are recomputed from the first sub-array at each level.
+    *
+    * @return {@code true} if any array field refers to a different array than the one previously recorded on its
+    *         argument, or if any {@code ARG_APARAPI_BUFFER} argument is present (those are always re-synced)
+    * @throws AparapiException if a kernel field cannot be read reflectively or does not hold an array of the expected
+    *         shape, or if object array conversion fails
+    * @throws IllegalStateException if an array field is {@code null}, or an array with a private memory size is
+    *         longer than that size
+    */
+   private boolean updateKernelArrayRefs() throws AparapiException {
+      boolean needsSync = false;
+
+      for (int i = 0; i < argc; i++) {
+         final KernelArg arg = args[i];
+         try {
+            if ((arg.getType() & ARG_ARRAY) != 0) {
+               final Object newArrayRef = arg.getField().get(kernel);
+
+               if (newArrayRef == null) {
+                  throw new IllegalStateException("Cannot send null refs to kernel, reverting to java");
+               }
+
+               final String fieldName = arg.getField().getName();
+               final int arrayLength = Array.getLength(newArrayRef);
+
+               // The private memory size may be given for the field itself or, failing that, derived from its name.
+               Integer privateMemorySize = ClassModel.getPrivateMemorySizeFromField(arg.getField());
+               if (privateMemorySize == null) {
+                  privateMemorySize = ClassModel.getPrivateMemorySizeFromFieldName(fieldName);
+               }
+               if ((privateMemorySize != null) && (arrayLength > privateMemorySize)) {
+                  throw new IllegalStateException("__private array field " + fieldName + " has illegal length " + arrayLength
+                        + " > " + privateMemorySize);
+               }
+
+               if ((arg.getType() & ARG_OBJ_ARRAY_STRUCT) != 0) {
+                  // Must run before arg.setArray(...) below: the conversion compares against the previously recorded array.
+                  prepareOopConversionBuffer(arg);
+               } else {
+                  // set up JNI fields for normal arrays
+                  arg.setJavaArray(newArrayRef);
+                  arg.setNumElements(arrayLength);
+                  arg.setSizeInBytes(arg.getNumElements() * arg.getPrimitiveSize());
+
+                  // An explicitly managed array queued in puts is flagged for writing and then taken off the queue.
+                  if (((arg.getType() & ARG_EXPLICIT) != 0) && puts.contains(newArrayRef)) {
+                     arg.setType(arg.getType() | ARG_EXPLICIT_WRITE);
+                     puts.remove(newArrayRef);
+                  }
+               }
+
+               if (newArrayRef != arg.getArray()) {
+                  needsSync = true;
+
+                  if (logger.isLoggable(Level.FINE)) {
+                     logger.fine("saw newArrayRef for " + arg.getName() + " = " + newArrayRef + ", newArrayLen = "
+                           + Array.getLength(newArrayRef));
+                  }
+               }
+
+               arg.setArray(newArrayRef);
+               assert arg.getArray() != null : "null array ref";
+            } else if ((arg.getType() & ARG_APARAPI_BUFFER) != 0) {
+               // TODO: check if the 2D/3D array is changed.
+               //   can Arrays.equals help?
+               needsSync = true; // always re-sync these buffers
+
+               final Object buffer = arg.getField().get(kernel);
+               final int numDims = arg.getNumDims();
+               final int[] dims = new int[numDims];
+
+               // Dimensions are taken from the first sub-array at each level, so only element 0 is inspected.
+               Object subBuffer = buffer;
+               for (int d = 0; d < numDims - 1; d++) {
+                  dims[d] = Array.getLength(subBuffer);
+                  subBuffer = Array.get(subBuffer, 0);
+               }
+               dims[numDims - 1] = Array.getLength(subBuffer);
+               arg.setDims(dims);
+
+               final int primitiveSize = getPrimitiveSize(arg.getType());
+               int totalElements = 1;
+               for (int d = 0; d < numDims; d++) {
+                  totalElements *= dims[d];
+               }
+               arg.setJavaBuffer(buffer);
+               arg.setSizeInBytes(totalElements * primitiveSize);
+               arg.setArray(buffer);
+            }
+         } catch (final IllegalArgumentException | IllegalAccessException e) {
+            // Do not carry on with stale argument metadata: report the failure the same way the object array
+            // conversion methods do, so the caller can react to it.
+            throw new AparapiException(e);
+         }
+      }
+      return needsSync;
+   }
+
+   /**
+    * Runs the kernel through the native OpenCL path.
+    * <p>
+    * Array arguments are refreshed first, then the native kernel is invoked. A non-zero native return value triggers
+    * a fall back to the next device; on success any object arrays that were converted to struct buffers are copied
+    * back into their Java objects.
+    *
+    * @param _settings the settings (range, passes, ...) for this execution
+    * @return the kernel, or whatever the fall back execution returns when the native call fails
+    * @throws AparapiException if refreshing the arguments or restoring converted objects fails
+    */
+   @SuppressWarnings("deprecation")
+   private Kernel executeOpenCL(ExecutionSettings _settings) throws AparapiException {
+      assert args != null : "args should not be null";
+
+      // The kernel may have re-pointed its array fields since the last run, so re-read them before dispatching.
+      final boolean arraysChanged = updateKernelArrayRefs();
+      if (arraysChanged && logger.isLoggable(Level.FINE)) {
+         logger.fine("Need to resync arrays on " + kernel);
+      }
+
+      // native side will reallocate array buffers if necessary
+      final int status = runKernelJNI(jniContextHandle, _settings.range, arraysChanged, _settings.passes, inBufferRemote, outBufferRemote);
+      final boolean nativeCallFailed = status != 0;
+      if (nativeCallFailed) {
+         final AparapiException failure = new AparapiException(
+               "OpenCL execution seems to have failed (runKernelJNI returned " + status + ")");
+         return fallBackToNextDevice(_settings, failure);
+      }
+
+      if (usesOopConversion) {
+         restoreObjects();
+      }
+
+      if (logger.isLoggable(Level.FINE)) {
+         logger.fine("executeOpenCL completed. " + _settings.range);
+      }
+      return kernel;
+   }
+
+   /**
+    * Falls back by switching the kernel's execution mode and executing again.
+    * <p>
+    * The kernel moves to its next execution mode when it has one, otherwise to its fallback execution mode. The
+    * range is then recreated where applicable and execution restarts.
+    *
+    * @param _settings the settings of the execution being retried
+    * @return the result of the retried execution
+    */
+   @SuppressWarnings("deprecation")
+   synchronized private Kernel fallBackByExecutionMode(ExecutionSettings _settings) {
+      isFallBack = true;
+
+      if (!kernel.hasNextExecutionMode()) {
+         kernel.setFallbackExecutionMode();
+      } else {
+         kernel.tryNextExecutionMode();
+         if (logger.isLoggable(Level.WARNING)) {
+            logger.warning("Trying next execution mode " + kernel.getExecutionMode());
+         }
+      }
+
+      recreateRange(_settings);
+      return executeInternalInner(_settings);
+   }
+
+   private void recreateRange(ExecutionSettings _settings) {
+      if (_settings.range.isLocalIsDerived() && !_settings.legacyExecutionMode) {
+         Device device = kernel.getTargetDevice();
+         Range result;
+         switch (_settings.range.getDims()) {
+            case 1: {
+               result = Range.create(device, _settings.range.getGlobalSize_0());
+               break;
+            }
+            case 2: {
+               result = Range.create2D(device, _settings.range.getGlobalSize_0(), _settings.range.getGlobalSize_1());
+               break;
+            }
+            case 3: {
+               result = Range.create3D(device, _settings.range.getGlobalSize_0(), _settings.range.getGlobalSize_1(), _settings.range.getGlobalSize_2());
+               break;
+            }
+            default: {
+               throw new AssertionError("Range.getDims() = " + _settings.range.getDims());
+            }
+         }
+         _settings.range = result;
+      }
+   }
+
+   private Kernel fallBackToNextDevice(ExecutionSettings _settings, String _reason) {
+      return fallBackToNextDevice(_settings, new AparapiException(_reason));
+   }
+
+   @SuppressWarnings("deprecation")
+   synchronized private Kernel fallBackToNextDevice(ExecutionSettings _settings, Exception _exception) {
+      return fallBackToNextDevice(_settings, _exception, false);
+   }
+
+   @SuppressWarnings("deprecation")
+   synchronized private Kernel fallBackToNextDevice(ExecutionSettings _settings, Exception _exception, boolean _silently) {
+      isFallBack = true;
+      _settings.profile.onEvent(ProfilingEvent.EXECUTED);
+      if (_settings.legacyExecutionMode) {
+         if (!_silently && logger.isLoggable(Level.WARNING)) {
+            logger.warning("Execution mode " + kernel.getExecutionMode() + " failed for " + kernel + ": " + _exception.getMessage());
+             _exception.printStackTrace();
+          }
+          return fallBackByExecutionMode(_settings);
+      } else {
+         KernelPreferences preferences = KernelManager.instance().getPreferences(kernel);
+         if (!_silently && logger.isLoggable(Level.WARNING)) {
+            logger.warning("Device failed for " + kernel + ": " + _exception.getMessage());
+         }
+
+         preferences.markPreferredDeviceFailed();
+
+//         Device nextDevice = preferences.getPreferredDevice(kernel);
+//
+//         if (nextDevice == null) {
+//            if (!_silently && logger.isLoggable(Level.SEVERE)) {
+//               logger.severe("No Devices left to try, giving up");
+//            }
+//            throw new RuntimeException(_exception);
+//         }
+         if (!_silently && logger.isLoggable(Level.WARNING)) {
+            _exception.printStackTrace();
+            logger.warning("Trying next device: " + describeDevice());
+         }
+      }
+
+      recreateRange(_settings);
+      return executeInternalInner(_settings);
+   }
+
+   /**
+    * Executes the kernel over the given range for the given number of passes.
+    * <p>
+    * The {@code executing} flag is set for the duration of the call and the multi-pass cancel state is cleared both
+    * before and after execution.
+    *
+    * @param _entrypoint the entry point name handed on to the execution settings
+    * @param _range the range to execute over
+    * @param _passes the number of passes
+    * @return the result of the internal execution
+    */
+   @SuppressWarnings("deprecation")
+   public synchronized Kernel execute(String _entrypoint, final Range _range, final int _passes) {
+      executing = true;
+      try {
+         clearCancelMultiPass();
+
+         final KernelProfile kernelProfile = KernelManager.instance().getProfile(kernel.getClass());
+         final KernelPreferences kernelPreferences = KernelManager.instance().getPreferences(kernel);
+         // Any execution mode other than AUTO selects the legacy execution-mode driven behaviour.
+         final boolean usesLegacyMode = Kernel.EXECUTION_MODE.AUTO != kernel.getExecutionMode();
+
+         final ExecutionSettings executionSettings = new ExecutionSettings(kernelPreferences, kernelProfile, _entrypoint, _range,
+               _passes, usesLegacyMode);
+
+         // Two Kernels of the same class share the same KernelPreferences object, and a failure (fallback) generally
+         // mutates it, so it is locked here. Note this prevents two Kernels of the same class executing simultaneously.
+         synchronized (kernelPreferences) {
+            return executeInternalOuter(executionSettings);
+         }
+      } finally {
+         executing = false;
+         clearCancelMultiPass();
+      }
+   }
+
+   /**
+    * Runs the internal execution and afterwards, whether it succeeded or threw, cleans up arrays when the kernel
+    * has automatic array clean-up enabled and the first global size of the range is non-zero.
+    *
+    * @param _settings the settings for this execution
+    * @return the result of the internal execution
+    */
+   private synchronized Kernel executeInternalOuter(ExecutionSettings _settings) {
+      try {
+         return executeInternalInner(_settings);
+      } finally {
+         // The range can be null here: executeInternalInner rejects a null range with an IllegalStateException.
+         // Dereferencing it unguarded would throw a NullPointerException from this finally block and thereby
+         // replace that more informative exception.
+         if (kernel.isAutoCleanUpArrays() && _settings.range != null && _settings.range.getGlobalSize_0() != 0) {
+            cleanUpArrays();
+         }
+      }
+   }
+
+   @SuppressWarnings("deprecation")
+   private synchronized Kernel executeInternalInner(ExecutionSettings _settings) {
+
+      if (_settings.range == null) {
+         throw new IllegalStateException("range can't be null");
+      }
+
+      EXECUTION_MODE requestedExecutionMode = kernel.getExecutionMode();
+
+      if (requestedExecutionMode.isOpenCL() && _settings.range.getDevice() != null && !(_settings.range.getDevice() instanceof OpenCLDevice)) {
+         fallBackToNextDevice(_settings, "OpenCL EXECUTION_MODE was requested but Device supplied was not an OpenCLDevice");
+      }
+
+      Device device = _settings.range.getDevice();
+      boolean userSpecifiedDevice = true;
+      if (device == null) {
+         userSpecifiedDevice = false;
+         if (!_settings.legacyExecutionMode) {
+            device = _settings.preferences.getPreferredDevice(kernel);
+            if (device == null) {
+               // the default fallback when KernelPreferences has run out of options is JTP
+               device = JavaDevice.THREAD_POOL;
+            }
+         } else {
+            if (requestedExecutionMode == EXECUTION_MODE.JTP) {
+               device = JavaDevice.THREAD_POOL;
+            } else if (requestedExecutionMode == EXECUTION_MODE.SEQ) {
+               device = JavaDevice.SEQUENTIAL;
+            }
+         }
+      } else {
+         boolean compatible = isDeviceCompatible(device);
+         if (!compatible) {
+            throw new AssertionError("user supplied Device incompatible with current EXECUTION_MODE or getTargetDevice(); device = "
+                    + device.getShortDescription() + "; kernel = " + kernel);
+         }
+      }
+
+      try {
+         OpenCLDevice openCLDevice = device instanceof OpenCLDevice ? (OpenCLDevice) device : null;
+
+         int jniFlags = 0;
+         // for legacy reasons use old logic where Kernel.EXECUTION_MODE is not AUTO
+         if (_settings.legacyExecutionMode && !userSpecifiedDevice && requestedExecutionMode.isOpenCL()) {
+            if (requestedExecutionMode.equals(EXECUTION_MODE.GPU)) {
+               // Get the best GPU
+               openCLDevice = (OpenCLDevice) KernelManager.DeprecatedMethods.bestGPU();
+               jniFlags |= JNI_FLAG_USE_GPU; // this flag might be redundant now.
+               if (openCLDevice == null) {
+                  return fallBackToNextDevice(_settings, "GPU request can't be honored, no GPU device");
+               }
+            } else if (requestedExecutionMode.equals(EXECUTION_MODE.ACC)) {
+               // Get the best ACC
+               openCLDevice = (OpenCLDevice) KernelManager.DeprecatedMethods.bestACC();
+               jniFlags |= JNI_FLAG_USE_ACC; // this flag might be redundant now.
+               if (openCLDevice == null) {
+                  return fallBackToNextDevice(_settings, "ACC request can't be honored, no ACC device");
+               }
+            } else {
+               // We fetch the first CPU device
+               openCLDevice = (OpenCLDevice) KernelManager.DeprecatedMethods.firstDevice(Device.TYPE.CPU);
+               if (openCLDevice == null) {
+                  return fallBackToNextDevice(_settings, "CPU request can't be honored, no CPU device");
+               }
+            }
+         } else {
+            if (device.getType() == Device.TYPE.GPU) {
+               jniFlags |= JNI_FLAG_USE_GPU; // this flag might be redundant now.
+            } else if (device.getType() == Device.TYPE.ACC) {
+               jniFlags |= JNI_FLAG_USE_ACC; // this flag might be redundant now.
+            }
+         }
+         if (device == null && openCLDevice != null) {
+            device = openCLDevice;
+         }
+         assert device != null : "No device available";
+         _settings.profile.onStart(device);
+         /* for backward compatibility reasons we still honor execution mode */
+         boolean isOpenCl = requestedExecutionMode.isOpenCL() || device instanceof OpenCLDevice;
+         if (isOpenCl) {
+            if ((entryPoint == null) || (isFallBack)) {
+               if (entryPoint == null) {
+                  try {
+                     final ClassModel classModel = ClassModel.createClassModel(kernel.getClass());
+                     entryPoint = classModel.getEntrypoint(_settings.entrypoint, kernel);
+                     _settings.profile.onEvent(ProfilingEvent.CLASS_MODEL_BUILT);
+                  } catch (final Exception exception) {
+                     _settings.profile.onEvent(ProfilingEvent.CLASS_MODEL_BUILT);
+                     return fallBackToNextDevice(_settings, exception);
+                  }
+               }
+
+               if ((entryPoint != null)) {
+                  synchronized (Kernel.class) { // This seems to be needed because of a race condition uncovered with issue #68 http://code.google.com/p/aparapi/issues/detail?id=68
+
+                     //  jniFlags |= (Config.enableProfiling ? JNI_FLAG_ENABLE_PROFILING : 0);
+                     //  jniFlags |= (Config.enableProfilingCSV ? JNI_FLAG_ENABLE_PROFILING_CSV | JNI_FLAG_ENABLE_PROFILING : 0);
+                     //  jniFlags |= (Config.enableVerboseJNI ? JNI_FLAG_ENABLE_VERBOSE_JNI : 0);
+                     // jniFlags |= (Config.enableVerboseJNIOpenCLResourceTracking ? JNI_FLAG_ENABLE_VERBOSE_JNI_OPENCL_RESOURCE_TRACKING :0);
+                     // jniFlags |= (kernel.getExecutionMode().equals(Kernel.EXECUTION_MODE.GPU) ? JNI_FLAG_USE_GPU : 0);
+                     // Init the device to check capabilities before emitting the
+                     // code that requires the capabilities.
+                     jniContextHandle = initJNI(kernel, openCLDevice, jniFlags); // openCLDevice will not be null here
+                     _settings.profile.onEvent(ProfilingEvent.INIT_JNI);
+                  } // end of synchronized! issue 68
+
+                  if (jniContextHandle == 0) {
+                     return fallBackToNextDevice(_settings, "initJNI failed to return a valid handle");
+                  }
+
+                  final String extensions = getExtensionsJNI(jniContextHandle);
+                  capabilitiesSet = new HashSet<String>();
+
+                  final StringTokenizer strTok = new StringTokenizer(extensions);
+                  while (strTok.hasMoreTokens()) {
+                     capabilitiesSet.add(strTok.nextToken());
+                  }
+
+                  if (logger.isLoggable(Level.FINE)) {
+                     logger.fine("Capabilities initialized to :" + capabilitiesSet.toString());
+                  }
+
+                  if (entryPoint.requiresDoublePragma() && !hasFP64Support()) {
+                     return fallBackToNextDevice(_settings, "FP64 required but not supported");
+                  }
+
+                  if (entryPoint.requiresByteAddressableStorePragma() && !hasByteAddressableStoreSupport()) {
+                     return fallBackToNextDevice(_settings, "Byte addressable stores required but not supported");
+                  }
+
+                  final boolean all32AtomicsAvailable = hasGlobalInt32BaseAtomicsSupport()
+                        && hasGlobalInt32ExtendedAtomicsSupport() && hasLocalInt32BaseAtomicsSupport()
+                        && hasLocalInt32ExtendedAtomicsSupport();
+
+                  if (entryPoint.requiresAtomic32Pragma() && !all32AtomicsAvailable) {
+
+                     return fallBackToNextDevice(_settings, "32 bit Atomics required but not supported");
+                  }
+
+                  String openCL;
+                  synchronized (openCLCache) {
+                     openCL = openCLCache.get(kernel.getClass());
+                     if (openCL == null) {
+                        try {
+                           openCL = KernelWriter.writeToString(entryPoint);
+                           if (logger.isLoggable(Level.INFO)) {
+                              logger.info(openCL);
+                           }
+                           else if (Config.enableShowGeneratedOpenCL) {
+                              System.out.println(openCL);
+                           }
+                           _settings.profile.onEvent(ProfilingEvent.OPENCL_GENERATED);
+                           openCLCache.put(kernel.getClass(), openCL);
+                        }
+                        catch (final CodeGenException codeGenException) {
+                           openCLCache.put(kernel.getClass(), CODE_GEN_ERROR_MARKER);
+                           _settings.profile.onEvent(ProfilingEvent.OPENCL_GENERATED);
+                           return fallBackToNextDevice(_settings, codeGenException);
+                        }
+                     }
+                     else {
+                        if (openCL.equals(CODE_GEN_ERROR_MARKER)) {
+                           _settings.profile.onEvent(ProfilingEvent.OPENCL_GENERATED);
+                           boolean silently = true; // since we must have already reported the CodeGenException
+                           return fallBackToNextDevice(_settings, null, silently);
+                        }
+                     }
+                  }
+
+                  // Send the string to OpenCL to compile it, or if the compiled binary is already cached on JNI side just empty string to use cached binary
+                  long handle;
+                  if (BINARY_CACHING_DISABLED) {
+                     handle = buildProgramJNI(jniContextHandle, openCL, "");
+                  } else {
+                     synchronized (seenBinaryKeys) {
+                        String binaryKey = kernel.getClass().getName() + ":" + device.getDeviceId();
+                        if (seenBinaryKeys.contains(binaryKey)) {
+                           // use cached binary
+                           logger.log(Level.INFO, "reusing cached binary for " + binaryKey);
+                           handle = buildProgramJNI(jniContextHandle, "", binaryKey);
+                        }
+                        else {
+                           // create and cache binary
+                           logger.log(Level.INFO, "compiling new binary for " + binaryKey);
+                           handle = buildProgramJNI(jniContextHandle, openCL, binaryKey);
+                           seenBinaryKeys.add(binaryKey);
+                        }
+                     }
+                  }
+                  _settings.profile.onEvent(ProfilingEvent.OPENCL_COMPILED);
+                  if (handle == 0) {
+                     return fallBackToNextDevice(_settings, "OpenCL compile failed");
+                  }
+
+                  args = new KernelArg[entryPoint.getReferencedFields().size()];
+                  int i = 0;
+
+                  for (final Field field : entryPoint.getReferencedFields()) {
+                     try {
+                        field.setAccessible(true);
+                        args[i] = new KernelArg();
+                        args[i].setName(field.getName());
+                        args[i].setField(field);
+                        if ((field.getModifiers() & Modifier.STATIC) == Modifier.STATIC) {
+                           args[i].setType(args[i].getType() | ARG_STATIC);
+                        }
+
+                        final Class<?> type = field.getType();
+                        if (type.isArray()) {
+
+                           if (field.getAnnotation(Local.class) != null || args[i].getName().endsWith(Kernel.LOCAL_SUFFIX)) {
+                              args[i].setType(args[i].getType() | ARG_LOCAL);
+                           } else if ((field.getAnnotation(Constant.class) != null)
+                                 || args[i].getName().endsWith(Kernel.CONSTANT_SUFFIX)) {
+                              args[i].setType(args[i].getType() | ARG_CONSTANT);
+                           } else {
+                              args[i].setType(args[i].getType() | ARG_GLOBAL);
+                           }
+                           if (isExplicit()) {
+                              args[i].setType(args[i].getType() | ARG_EXPLICIT);
+                           }
+                           // for now, treat all write arrays as read-write, see bugzilla issue 4859
+                           // we might come up with a better solution later
+                           args[i].setType(args[i].getType()
+                                 | (entryPoint.getArrayFieldAssignments().contains(field.getName()) ? (ARG_WRITE | ARG_READ) : 0));
+                           args[i].setType(args[i].getType()
+                                 | (entryPoint.getArrayFieldAccesses().contains(field.getName()) ? ARG_READ : 0));
+                           // args[i].type |= ARG_GLOBAL;
+
+                           if (type.getName().startsWith("[L")) {
+                              args[i].setArray(null); // will get updated in updateKernelArrayRefs
+                              args[i].setType(args[i].getType()
+                                    | (ARG_ARRAY | ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ));
+
+                              if (logger.isLoggable(Level.FINE)) {
+                                 logger.fine("tagging " + args[i].getName() + " as (ARG_ARRAY | ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)");
+                              }
+                           } else if (type.getName().startsWith("[[")) {
+
+                              try {
+                                 setMultiArrayType(args[i], type);
+                              } catch (AparapiException e) {
+                                 return fallBackToNextDevice(_settings, "failed to set kernel arguement "
+                                       + args[i].getName() + ".  Aparapi only supports 2D and 3D arrays.");
+                              }
+                           } else {
+
+                              args[i].setArray(null); // will get updated in updateKernelArrayRefs
+                              args[i].setType(args[i].getType() | ARG_ARRAY);
+
+                              args[i].setType(args[i].getType() | (type.isAssignableFrom(float[].class) ? ARG_FLOAT : 0));
+                              args[i].setType(args[i].getType() | (type.isAssignableFrom(int[].class) ? ARG_INT : 0));
+                              args[i].setType(args[i].getType() | (type.isAssignableFrom(boolean[].class) ? ARG_BOOLEAN : 0));
+                              args[i].setType(args[i].getType() | (type.isAssignableFrom(byte[].class) ? ARG_BYTE : 0));
+                              args[i].setType(args[i].getType() | (type.isAssignableFrom(char[].class) ? ARG_CHAR : 0));
+                              args[i].setType(args[i].getType() | (type.isAssignableFrom(double[].class) ? ARG_DOUBLE : 0));
+                              args[i].setType(args[i].getType() | (type.isAssignableFrom(long[].class) ? ARG_LONG : 0));
+                              args[i].setType(args[i].getType() | (type.isAssignableFrom(short[].class) ? ARG_SHORT : 0));
+
+                              // arrays whose length is used will have an int arg holding
+                              // the length as a kernel param
+                              if (entryPoint.getArrayFieldArrayLengthUsed().contains(args[i].getName())) {
+                                 args[i].setType(args[i].getType() | ARG_ARRAYLENGTH);
+                              }
+
+                              if (type.getName().startsWith("[L")) {
+                                 args[i].setType(args[i].getType() | (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ));
+                                 if (logger.isLoggable(Level.FINE)) {
+                                    logger.fine("tagging " + args[i].getName()
+                                          + " as (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)");
+                                 }
+                              }
+                           }
+                        } else if (type.isAssignableFrom(float.class)) {
+                           args[i].setType(args[i].getType() | ARG_PRIMITIVE);
+                           args[i].setType(args[i].getType() | ARG_FLOAT);
+                        } else if (type.isAssignableFrom(int.class)) {
+                           args[i].setType(args[i].getType() | ARG_PRIMITIVE);
+                           args[i].setType(args[i].getType() | ARG_INT);
+                        } else if (type.isAssignableFrom(double.class)) {
+                           args[i].setType(args[i].getType() | ARG_PRIMITIVE);
+                           args[i].setType(args[i].getType() | ARG_DOUBLE);
+                        } else if (type.isAssignableFrom(long.class)) {
+                           args[i].setType(args[i].getType() | ARG_PRIMITIVE);
+                           args[i].setType(args[i].getType() | ARG_LONG);
+                        } else if (type.isAssignableFrom(boolean.class)) {
+                           args[i].setType(args[i].getType() | ARG_PRIMITIVE);
+                           args[i].setType(args[i].getType() | ARG_BOOLEAN);
+                        } else if (type.isAssignableFrom(byte.class)) {
+                           args[i].setType(args[i].getType() | ARG_PRIMITIVE);
+                           args[i].setType(args[i].getType() | ARG_BYTE);
+                        } else if (type.isAssignableFrom(char.class)) {
+                           args[i].setType(args[i].getType() | ARG_PRIMITIVE);
+                           args[i].setType(args[i].getType() | ARG_CHAR);
+                        } else if (type.isAssignableFrom(short.class)) {
+                           args[i].setType(args[i].getType() | ARG_PRIMITIVE);
+                           args[i].setType(args[i].getType() | ARG_SHORT);
+                        }
+                        // System.out.printf("in execute, arg %d %s %08x\n", i,args[i].name,args[i].type );
+                     } catch (final IllegalArgumentException e) {
+                        e.printStackTrace();
+                     }
+
+                     args[i].setPrimitiveSize(getPrimitiveSize(args[i].getType()));
+
+                     if (logger.isLoggable(Level.FINE)) {
+                        logger.fine("arg " + i + ", " + args[i].getName() + ", type=" + Integer.toHexString(args[i].getType())
+                              + ", primitiveSize=" + args[i].getPrimitiveSize());
+                     }
+
+                     i++;
+                  }
+
+                  // at this point, i = the actual used number of arguments
+                  // (private buffers do not get treated as arguments)
+
+                  argc = i;
+
+                  setArgsJNI(jniContextHandle, args, argc);
+                  _settings.profile.onEvent(ProfilingEvent.PREPARE_EXECUTE);
+                  try {
+                     executeOpenCL(_settings);
+                     isFallBack = false;
+                  } catch (final AparapiException e) {
+                     fallBackToNextDevice(_settings, e);
+                  }
+               } else { // (entryPoint != null) && !entryPoint.shouldFallback()
+                  fallBackToNextDevice(_settings, "failed to locate entrypoint");
+               }
+            } else { // (entryPoint == null) || (isFallBack)
+               try {
+                  executeOpenCL(_settings);
+                  isFallBack = false;
+               } catch (final AparapiException e) {
+                  fallBackToNextDevice(_settings, e);
+               }
+            }
+         } else { // isOpenCL
+            if (!(device instanceof JavaDevice)) {
+               fallBackToNextDevice(_settings, "Non-OpenCL Kernel.EXECUTION_MODE requested but device is not a JavaDevice ");
+            }
+            executeJava(_settings, (JavaDevice) device);
+         }
+
+         if (Config.enableExecutionModeReporting) {
+            System.out.println("execution complete: " + kernel);
+         }
+
+         return kernel;
+      }
+      finally {
+         _settings.profile.onEvent(ProfilingEvent.EXECUTED);
+         maybeReportProfile(_settings);
+      }
+   }
+
+   /**
+    * Identifies this runner by the kernel it wraps.
+    *
+    * @return the text {@code KernelRunner{...}} with the kernel's string form between the braces
+    */
+   @Override
+   public String toString() {
+      final StringBuilder text = new StringBuilder("KernelRunner{");
+      text.append(kernel);
+      text.append("}");
+      return text.toString();
+   }
+
+   private String describeDevice() {
+      Device device = KernelManager.instance().getPreferences(kernel).getPreferredDevice(kernel);
+      return (device == null) ? "<default fallback>" : device.getShortDescription();
+   }
+
+   private void maybeReportProfile(ExecutionSettings _settings) {
+      if (Config.dumpProfileOnExecution) {
+         StringBuilder report = new StringBuilder();
+         report.append(KernelDeviceProfile.getTableHeader()).append('\n');
+         report.append(_settings.profile.getLastDeviceProfile().getLastAsTableRow());
+         System.out.println(report);
+      }
+   }
+
+   @SuppressWarnings("deprecation")
+   private boolean isDeviceCompatible(Device device) {
+      Kernel.EXECUTION_MODE mode = kernel.getExecutionMode();
+      if (mode != Kernel.EXECUTION_MODE.AUTO) {
+         switch (device.getType()) {
+            case GPU:
+               return mode == Kernel.EXECUTION_MODE.GPU;
+            case CPU:
+               return mode == Kernel.EXECUTION_MODE.CPU;
+            case JTP:
+               return mode == Kernel.EXECUTION_MODE.JTP;
+            case SEQ:
+               return mode == Kernel.EXECUTION_MODE.SEQ;
+            case ACC:
+               return mode == Kernel.EXECUTION_MODE.ACC;
+            default:
+               return false;
+         }
+      } else {
+         return (device == kernel.getTargetDevice());
+      }
+   }
+
+   /**
+    * Reads the cancellation flag kept in slot 0 of {@code inBufferRemoteInt}.
+    *
+    * @return the stored value; {@link #cancelMultiPass()} writes {@code CANCEL_STATUS_TRUE} to this slot
+    */
+   public int getCancelState() {
+      final int cancelState = inBufferRemoteInt.get(0);
+      return cancelState;
+   }
+
+   /**
+    * Raises the cancellation flag by storing {@code CANCEL_STATUS_TRUE} in slot 0 of {@code inBufferRemoteInt}.
+    * NOTE(review): the code that reacts to this flag is not in this part of the file - confirm where it is polled.
+    */
+   public void cancelMultiPass() {
+      final int cancelSlot = 0;
+      inBufferRemoteInt.put(cancelSlot, CANCEL_STATUS_TRUE);
+   }
+
+   private void clearCancelMultiPass() {
+      inBufferRemoteInt.put(0, CANCEL_STATUS_FALSE);
+   }
+
+   /**
+    * Reports the index of the pass currently being executed, or a negative marker constant for special progress states:
+    * <ul>
+    * <li>{@link #PASS_ID_PREPARING_EXECUTION} - the Kernel has started executing but has not reached the initial pass;</li>
+    * <li>{@link #PASS_ID_COMPLETED_EXECUTION} - the Kernel is idle, either because execution finished (possibly early, via
+    * {@link #cancelMultiPass()}) or because no execution has been invoked yet.</li>
+    * </ul>
+    *
+    * <p>A typical use is driving a visual progress bar.
+    *
+    * @return the current pass index, or one of the marker constants above
+    * @see #execute(String, Range, int)
+    */
+   public int getCurrentPass() {
+      if (!executing) {
+         return PASS_ID_COMPLETED_EXECUTION;
+      }
+
+      // While running OpenCL the pass id is read from the remote buffer, otherwise from the local field.
+      return kernel.isRunningCL() ? getCurrentPassRemote() : getCurrentPassLocal();
+   }
+
+   /**
+    * Tells whether one of the {@code execute()} methods is currently in progress.
+    *
+    * @return the current value of the {@code executing} flag
+    */
+   public boolean isExecuting() {
+      return this.executing;
+   }
+
+   protected int getCurrentPassRemote() {
+      return outBufferRemoteInt.get(0);
+   }
+
+   /**
+    * Returns the locally tracked pass id; used by {@link #getCurrentPass()} when the kernel is not running OpenCL.
+    *
+    * @return the current value of {@code passId}
+    */
+   private int getCurrentPassLocal() {
+      return this.passId;
+   }
+
+   private int getPrimitiveSize(int type) {
+      if ((type & ARG_FLOAT) != 0) {
+         return 4;
+      } else if ((type & ARG_INT) != 0) {
+         return 4;
+      } else if ((type & ARG_BYTE) != 0) {
+         return 1;
+      } else if ((type & ARG_CHAR) != 0) {
+         return 2;
+      } else if ((type & ARG_BOOLEAN) != 0) {
+         return 1;
+      } else if ((type & ARG_SHORT) != 0) {
+         return 2;
+      } else if ((type & ARG_LONG) != 0) {
+         return 8;
+      } else if ((type & ARG_DOUBLE) != 0) {
+         return 8;
+      }
+      return 0;
+   }
+
+   private void setMultiArrayType(KernelArg arg, Class<?> type) throws AparapiException {
+      arg.setType(arg.getType() | (ARG_WRITE | ARG_READ | ARG_APARAPI_BUFFER));
+      int numDims = 0;
+      while (type.getName().startsWith("[[[[")) {
+         throw new AparapiException("Aparapi only supports 2D and 3D arrays.");
+      }
+      arg.setType(arg.getType() | ARG_ARRAYLENGTH);
+      while (type.getName().charAt(numDims) == '[') {
+         numDims++;
+      }
+      arg.setNumDims(numDims);
+      arg.setJavaBuffer(null); // will get updated in updateKernelArrayRefs
+      arg.setArray(null); // will get updated in updateKernelArrayRefs
+
+      Class<?> elementType = arg.getField().getType();
+      while (elementType.isArray()) {
+         elementType = elementType.getComponentType();
+      }
+
+      if (elementType.isAssignableFrom(float.class)) {
+         arg.setType(arg.getType() | ARG_FLOAT);
+      } else if (elementType.isAssignableFrom(int.class)) {
+         arg.setType(arg.getType() | ARG_INT);
+      } else if (elementType.isAssignableFrom(boolean.class)) {
+         arg.setType(arg.getType() | ARG_BOOLEAN);
+      } else if (elementType.isAssignableFrom(byte.class)) {
+         arg.setType(arg.getType() | ARG_BYTE);
+      } else if (elementType.isAssignableFrom(char.class)) {
+         arg.setType(arg.getType() | ARG_CHAR);
+      } else if (elementType.isAssignableFrom(double.class)) {
+         arg.setType(arg.getType() | ARG_DOUBLE);
+      } else if (elementType.isAssignableFrom(long.class)) {
+         arg.setType(arg.getType() | ARG_LONG);
+      } else if (elementType.isAssignableFrom(short.class)) {
+         arg.setType(arg.getType() | ARG_SHORT);
+      }
+   }
+
+   // Arrays tagged through put(Object); entries are only added while explicit mode is on and the kernel is running OpenCL.
+   // NOTE(review): the code that reads or clears this set is outside this part of the file - confirm where it is consumed.
+   private final Set<Object> puts = new HashSet<Object>();
+
+   /**
+    * Enqueue a request to return this array from the GPU. This method blocks until the array is available.
+    * <br/>
+    * Note that <code>Kernel.get(type [])</code> calls will delegate to this call.
+    * <br/>
+    * The request is only passed on to the native layer when explicit mode is on and the kernel is running OpenCL;
+    * in every other case this call does nothing.
+    *
+    * @param array
+    *          It is assumed that this parameter is indeed an array (of int, float, short etc).
+    *
+    * @see Kernel#get(int[] arr)
+    * @see Kernel#get(float[] arr)
+    * @see Kernel#get(double[] arr)
+    * @see Kernel#get(long[] arr)
+    * @see Kernel#get(char[] arr)
+    * @see Kernel#get(boolean[] arr)
+    */
+   public void get(Object array) {
+      if (!explicit || !kernel.isRunningCL()) {
+         // Only makes sense when we are using OpenCL
+         return;
+      }
+      getJNI(jniContextHandle, array);
+   }
+
+   /**
+    * Fetches profiling information from the native layer.
+    *
+    * @return the result of {@code getProfileInfoJNI} when explicit mode is on and the kernel is running OpenCL;
+    *         {@code null} otherwise
+    */
+   public List<ProfileInfo> getProfileInfo() {
+      if (!explicit || !kernel.isRunningCL()) {
+         // Only makes sense when we are using OpenCL
+         return null;
+      }
+      return getProfileInfoJNI(jniContextHandle);
+   }
+
+   /**
+    * Tag this array so that it is explicitly enqueued before the kernel is executed. <br/>
+    * Note that <code>Kernel.put(type [])</code> calls will delegate to this call. <br/>
+    * The array is only recorded when explicit mode is on and the kernel is running OpenCL;
+    * in every other case this call does nothing.
+    *
+    * @param array
+    *          It is assumed that this parameter is indeed an array (of int, float, short etc).
+    * @see Kernel#put(int[] arr)
+    * @see Kernel#put(float[] arr)
+    * @see Kernel#put(double[] arr)
+    * @see Kernel#put(long[] arr)
+    * @see Kernel#put(char[] arr)
+    * @see Kernel#put(boolean[] arr)
+    */
+   public void put(Object array) {
+      if (!explicit || !kernel.isRunningCL()) {
+         // Only makes sense when we are using OpenCL
+         return;
+      }
+      puts.add(array);
+   }
+
+   // Explicit-mode flag, off by default. get(), put() and getProfileInfo() only act when it is set,
+   // and array arguments are tagged with ARG_EXPLICIT during argument setup when isExplicit() returns true.
+   private boolean explicit = false;
+
+   /**
+    * Switches explicit mode on or off.
+    *
+    * @param _explicit the new value of the explicit-mode flag
+    */
+   public void setExplicit(boolean _explicit) {
+      this.explicit = _explicit;
+   }
+
+   /**
+    * Reports whether explicit mode is on.
+    *
+    * @return the current value of the explicit-mode flag
+    */
+   public boolean isExplicit() {
+      return this.explicit;
+   }
+
+   private static class ExecutionSettings {
+      final KernelPreferences preferences;
+      final KernelProfile profile;
+      final String entrypoint;
+      Range range;
+      final int passes;
+      final boolean legacyExecutionMode;
+
+      private ExecutionSettings(KernelPreferences preferences, KernelProfile profile, String entrypoint, Range range, int passes, boolean legacyExecutionMode) {
+         this.preferences = preferences;
+         this.profile = profile;
+         this.entrypoint = entrypoint;
+         this.range = range;
+         this.passes = passes;
+         this.legacyExecutionMode = legacyExecutionMode;
+      }
+
+      @Override
+      public String toString() {
+         return "ExecutionSettings{" +
+                 "preferences=" + preferences +
+                 ", profile=" + profile +
+                 ", entrypoint='" + entrypoint + '\'' +
+                 ", range=" + range +
+                 ", passes=" + passes +
+                 ", legacyExecutionMode=" + legacyExecutionMode +
+                 '}';
+      }
+   }
+}