diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java
deleted file mode 100644
index d99809e9c3353db214658321b92e41b731dea54a..0000000000000000000000000000000000000000
--- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java
+++ /dev/null
@@ -1,1775 +0,0 @@
-/*
-Copyright (c) 2010-2011, Advanced Micro Devices, Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
-following conditions are met:
-
-Redistributions of source code must retain the above copyright notice, this list of conditions and the following
-disclaimer. 
-
-Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
-disclaimer in the documentation and/or other materials provided with the distribution. 
-
-Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission. 
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
-INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
-WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export
-laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through
-774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000.  Further, pursuant to Section 740.6 of the EAR,
-you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of 
-Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration 
-Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1,
-E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups
-D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject
-to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774
-of EAR).  For the most current Country Group listings, or for additional information about the EAR or your obligations
-under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. 
-
-*/
-package com.amd.aparapi.internal.kernel;
-
-import com.amd.aparapi.*;
-import com.amd.aparapi.Kernel.Constant;
-import com.amd.aparapi.Kernel.*;
-import com.amd.aparapi.device.*;
-import com.amd.aparapi.internal.annotation.*;
-import com.amd.aparapi.internal.exception.*;
-import com.amd.aparapi.internal.instruction.InstructionSet.*;
-import com.amd.aparapi.internal.jni.*;
-import com.amd.aparapi.internal.model.*;
-import com.amd.aparapi.internal.util.*;
-import com.amd.aparapi.internal.writer.*;
-import com.amd.aparapi.opencl.*;
-
-import java.lang.reflect.*;
-import java.nio.*;
-import java.util.*;
-import java.util.concurrent.*;
-import java.util.concurrent.ForkJoinPool.*;
-import java.util.logging.*;
-
-/**
- * The class is responsible for executing <code>Kernel</code> implementations. <br/>
- * 
- * The <code>KernelRunner</code> is the real workhorse for Aparapi.  Each <code>Kernel</code> instance creates a single
- * <code>KernelRunner</code> to encapsulate state and to help coordinate interactions between the <code>Kernel</code> 
- * and it's execution logic.<br/>
- * 
- * The <code>KernelRunner</code> is created <i>lazily</i> as a result of calling <code>Kernel.execute()</code>. A this 
- * time the <code>ExecutionMode</code> is consulted to determine the default requested mode.  This will dictate how 
- * the <code>KernelRunner</code> will attempt to execute the <code>Kernel</code>
- *   
- * @see com.amd.aparapi.Kernel#execute(int _globalSize)
- * 
- * @author gfrost
- *
- */
-public class KernelRunner extends KernelRunnerJNI{
-
-   public static boolean BINARY_CACHING_DISABLED = false;
-
-   private static final int MINIMUM_ARRAY_SIZE = 1;
-
-   /** @see #getCurrentPass() */
-   @UsedByJNICode public static final int PASS_ID_PREPARING_EXECUTION = -2;
-   /** @see #getCurrentPass() */
-   @UsedByJNICode public static final int PASS_ID_COMPLETED_EXECUTION = -1;
-   @UsedByJNICode public static final int CANCEL_STATUS_FALSE = 0;
-   @UsedByJNICode public static final int CANCEL_STATUS_TRUE = 1;
-   private static final String CODE_GEN_ERROR_MARKER = CodeGenException.class.getName();
-
-   private static Logger logger = Logger.getLogger(Config.getLoggerName());
-
-   private long jniContextHandle = 0;
-
-   private final Kernel kernel;
-
-   private Entrypoint entryPoint;
-
-   private int argc;
-
-   // may be read by a thread other than the control thread, hence volatile
-   private volatile boolean executing;
-
-   // may be read by a thread other than the control thread, hence volatile
-   private volatile int passId = PASS_ID_PREPARING_EXECUTION;
-
-   /**
-    * A direct ByteBuffer used for asynchronous intercommunication between java and JNI C code.
-    *
-    * <p>
-    * At present this is a 4 byte buffer to be interpreted as an int[1], used for passing from java to C a single integer interpreted as a cancellation indicator.
-    */
-   private final ByteBuffer inBufferRemote;
-   private final IntBuffer inBufferRemoteInt;
-
-   /** A direct ByteBuffer used for asynchronous intercommunication between java and JNI C code.
-    * <p>
-    * At present this is a 4 byte buffer to be interpreted as an int[1], used for passing from C to java a single integer interpreted as a
-    * the current pass id.
-    */
-   private final ByteBuffer outBufferRemote;
-   private final IntBuffer outBufferRemoteInt;
-
-   private boolean isFallBack = false; // If isFallBack, rebuild the kernel (necessary?)
-
-   private static final ForkJoinWorkerThreadFactory lowPriorityThreadFactory = new ForkJoinWorkerThreadFactory(){
-      @Override public ForkJoinWorkerThread newThread(ForkJoinPool pool) {
-         ForkJoinWorkerThread newThread = ForkJoinPool.defaultForkJoinWorkerThreadFactory.newThread(pool);
-         newThread.setPriority(Thread.MIN_PRIORITY);
-         return newThread;
-      }
-   };
-
-   private static final ForkJoinPool threadPool = new ForkJoinPool(Runtime.getRuntime().availableProcessors(),
-         lowPriorityThreadFactory, null, false);
-   private static HashMap<Class<? extends Kernel>, String> openCLCache = new HashMap<>();
-   private static LinkedHashSet<String> seenBinaryKeys = new LinkedHashSet<>();
-
-   /**
-    * Create a KernelRunner for a specific Kernel instance.
-    * 
-    * @param _kernel
-    */
-   public KernelRunner(Kernel _kernel) {
-      kernel = _kernel;
-
-      inBufferRemote = ByteBuffer.allocateDirect(4);
-      outBufferRemote = ByteBuffer.allocateDirect(4);
-
-      inBufferRemote.order(ByteOrder.nativeOrder());
-      outBufferRemote.order(ByteOrder.nativeOrder());
-
-      inBufferRemoteInt = inBufferRemote.asIntBuffer();
-      outBufferRemoteInt = outBufferRemote.asIntBuffer();
-
-      KernelManager.instance(); // ensures static initialization of KernelManager
-   }
-
-   /**
-    * @see Kernel#cleanUpArrays().
-    */
-   public void cleanUpArrays() {
-      if (args != null && kernel.isRunningCL()) {
-         for (KernelArg arg : args) {
-            if ((arg.getType() & KernelRunnerJNI.ARG_ARRAY) != 0) {
-               Field field = arg.getField();
-               if (field != null && field.getType().isArray() && !Modifier.isFinal(field.getModifiers())) {
-                  field.setAccessible(true);
-                  Class<?> componentType = field.getType().getComponentType();
-                  Object newValue = Array.newInstance(componentType, MINIMUM_ARRAY_SIZE);
-                  try {
-                     field.set(kernel, newValue);
-                  }
-                  catch (IllegalAccessException e) {
-                     throw new RuntimeException(e);
-                  }
-               }
-            }
-         }
-         kernel.execute(0);
-      } else if (kernel.isRunningCL()) {
-         logger.log(Level.SEVERE, "KernelRunner#cleanUpArrays() could not execute as no args available (Kernel has not been executed?)");
-      }
-   }
-
-   /**
-    * <code>Kernel.dispose()</code> delegates to <code>KernelRunner.dispose()</code> which delegates to <code>disposeJNI()</code> to actually close JNI data structures.<br/>
-    * 
-    * @see KernelRunnerJNI#disposeJNI(long)
-    */
-   public synchronized void dispose() {
-      if (kernel.isRunningCL()) {
-         disposeJNI(jniContextHandle);
-         seenBinaryKeys.clear();
-      }
-      // We are using a shared pool, so there's no need no shutdown it when kernel is disposed
-      //      threadPool.shutdownNow();
-   }
-
-   private Set<String> capabilitiesSet;
-
-   boolean hasFP64Support() {
-      if (capabilitiesSet == null) {
-         throw new IllegalStateException("Capabilities queried before they were initialized");
-      }
-      return (capabilitiesSet.contains(OpenCL.CL_KHR_FP64));
-   }
-
-   boolean hasSelectFPRoundingModeSupport() {
-      if (capabilitiesSet == null) {
-         throw new IllegalStateException("Capabilities queried before they were initialized");
-      }
-      return capabilitiesSet.contains(OpenCL.CL_KHR_SELECT_FPROUNDING_MODE);
-   }
-
-   boolean hasGlobalInt32BaseAtomicsSupport() {
-      if (capabilitiesSet == null) {
-         throw new IllegalStateException("Capabilities queried before they were initialized");
-      }
-      return capabilitiesSet.contains(OpenCL.CL_KHR_GLOBAL_INT32_BASE_ATOMICS);
-   }
-
-   boolean hasGlobalInt32ExtendedAtomicsSupport() {
-      if (capabilitiesSet == null) {
-         throw new IllegalStateException("Capabilities queried before they were initialized");
-      }
-      return capabilitiesSet.contains(OpenCL.CL_KHR_GLOBAL_INT32_EXTENDED_ATOMICS);
-   }
-
-   boolean hasLocalInt32BaseAtomicsSupport() {
-      if (capabilitiesSet == null) {
-         throw new IllegalStateException("Capabilities queried before they were initialized");
-      }
-      return capabilitiesSet.contains(OpenCL.CL_KHR_LOCAL_INT32_BASE_ATOMICS);
-   }
-
-   boolean hasLocalInt32ExtendedAtomicsSupport() {
-      if (capabilitiesSet == null) {
-         throw new IllegalStateException("Capabilities queried before they were initialized");
-      }
-      return capabilitiesSet.contains(OpenCL.CL_KHR_LOCAL_INT32_EXTENDED_ATOMICS);
-   }
-
-   boolean hasInt64BaseAtomicsSupport() {
-      if (capabilitiesSet == null) {
-         throw new IllegalStateException("Capabilities queried before they were initialized");
-      }
-      return capabilitiesSet.contains(OpenCL.CL_KHR_INT64_BASE_ATOMICS);
-   }
-
-   boolean hasInt64ExtendedAtomicsSupport() {
-      if (capabilitiesSet == null) {
-         throw new IllegalStateException("Capabilities queried before they were initialized");
-      }
-      return capabilitiesSet.contains(OpenCL.CL_KHR_INT64_EXTENDED_ATOMICS);
-   }
-
-   boolean has3DImageWritesSupport() {
-      if (capabilitiesSet == null) {
-         throw new IllegalStateException("Capabilities queried before they were initialized");
-      }
-      return capabilitiesSet.contains(OpenCL.CL_KHR_3D_IMAGE_WRITES);
-   }
-
-   boolean hasByteAddressableStoreSupport() {
-      if (capabilitiesSet == null) {
-         throw new IllegalStateException("Capabilities queried before they were initialized");
-      }
-      return capabilitiesSet.contains(OpenCL.CL_KHR_BYTE_ADDRESSABLE_SUPPORT);
-   }
-
-   boolean hasFP16Support() {
-      if (capabilitiesSet == null) {
-         throw new IllegalStateException("Capabilities queried before they were initialized");
-      }
-      return capabilitiesSet.contains(OpenCL.CL_KHR_FP16);
-   }
-
-   boolean hasGLSharingSupport() {
-      if (capabilitiesSet == null) {
-         throw new IllegalStateException("Capabilities queried before they were initialized");
-      }
-      return capabilitiesSet.contains(OpenCL.CL_KHR_GL_SHARING);
-   }
-
-   private static final class FJSafeCyclicBarrier extends CyclicBarrier{
-      FJSafeCyclicBarrier(final int threads) {
-         super(threads);
-      }
-
-      @Override public int await() throws InterruptedException, BrokenBarrierException {
-         class Awaiter implements ManagedBlocker{
-            private int value;
-
-            private boolean released;
-
-            @Override public boolean block() throws InterruptedException {
-               try {
-                  value = superAwait();
-                  released = true;
-                  return true;
-               } catch (final BrokenBarrierException e) {
-                  throw new RuntimeException(e);
-               }
-            }
-
-            @Override public boolean isReleasable() {
-               return released;
-            }
-
-            int getValue() {
-               return value;
-            }
-         }
-         final Awaiter awaiter = new Awaiter();
-         ForkJoinPool.managedBlock(awaiter);
-         return awaiter.getValue();
-      }
-
-      int superAwait() throws InterruptedException, BrokenBarrierException {
-         return super.await();
-      }
-   }
-
-   //   @FunctionalInterface
-   private interface ThreadIdSetter{
-      void set(KernelState kernelState, int globalGroupId, int threadId);
-   }
-
-   /**
-    * Execute using a Java thread pool, or sequentially, or using an alternative algorithm, usually as a result of failing to compile or execute OpenCL
-    */
-   @SuppressWarnings("deprecation")
-   protected void executeJava(ExecutionSettings _settings, Device device) {
-      if (logger.isLoggable(Level.FINE)) {
-         logger.fine("executeJava: range = " + _settings.range + ", device = " + device);
-      }
-      boolean legacySequentialMode = kernel.getExecutionMode().equals(Kernel.EXECUTION_MODE.SEQ);
-
-      passId = PASS_ID_PREPARING_EXECUTION;
-      _settings.profile.onEvent(ProfilingEvent.PREPARE_EXECUTE);
-
-      try {
-         if (device == JavaDevice.ALTERNATIVE_ALGORITHM) {
-            if (kernel.hasFallbackAlgorithm()) {
-               for (passId = 0; passId < _settings.passes; ++passId) {
-                  kernel.executeFallbackAlgorithm(_settings.range, passId);
-               }
-            } else {
-               boolean silently = true; // not having an alternative algorithm is the normal state, and does not need reporting
-               fallBackToNextDevice(_settings, (Exception) null, silently);
-            }
-         } else {
-            final int localSize0 = _settings.range.getLocalSize(0);
-            final int localSize1 = _settings.range.getLocalSize(1);
-            final int localSize2 = _settings.range.getLocalSize(2);
-            final int globalSize1 = _settings.range.getGlobalSize(1);
-            if (legacySequentialMode || device == JavaDevice.SEQUENTIAL) {
-               /**
-                * SEQ mode is useful for testing trivial logic, but kernels which use SEQ mode cannot be used if the
-                * product of localSize(0..3) is >1.  So we can use multi-dim ranges but only if the local size is 1 in all dimensions.
-                *
-                * As a result of this barrier is only ever 1 work item wide and probably should be turned into a no-op.
-                *
-                * So we need to check if the range is valid here. If not we have no choice but to punt.
-                */
-               if ((localSize0 * localSize1 * localSize2) > 1) {
-                  throw new IllegalStateException("Can't run range with group size >1 sequentially. Barriers would deadlock!");
-               }
-
-               final Kernel kernelClone = kernel.clone();
-               final KernelState kernelState = kernelClone.getKernelState();
-
-               kernelState.setRange(_settings.range);
-               kernelState.setGroupId(0, 0);
-               kernelState.setGroupId(1, 0);
-               kernelState.setGroupId(2, 0);
-               kernelState.setLocalId(0, 0);
-               kernelState.setLocalId(1, 0);
-               kernelState.setLocalId(2, 0);
-               kernelState.setLocalBarrier(new FJSafeCyclicBarrier(1));
-
-               for (passId = 0; passId < _settings.passes; passId++) {
-                  if (getCancelState() == CANCEL_STATUS_TRUE) {
-                     break;
-                  }
-                  kernelState.setPassId(passId);
-
-                  if (_settings.range.getDims() == 1) {
-                     for (int id = 0; id < _settings.range.getGlobalSize(0); id++) {
-                        kernelState.setGlobalId(0, id);
-                        kernelClone.run();
-                     }
-                  }
-                  else if (_settings.range.getDims() == 2) {
-                     for (int x = 0; x < _settings.range.getGlobalSize(0); x++) {
-                        kernelState.setGlobalId(0, x);
-
-                        for (int y = 0; y < globalSize1; y++) {
-                           kernelState.setGlobalId(1, y);
-                           kernelClone.run();
-                        }
-                     }
-                  }
-                  else if (_settings.range.getDims() == 3) {
-                     for (int x = 0; x < _settings.range.getGlobalSize(0); x++) {
-                        kernelState.setGlobalId(0, x);
-
-                        for (int y = 0; y < globalSize1; y++) {
-                           kernelState.setGlobalId(1, y);
-
-                           for (int z = 0; z < _settings.range.getGlobalSize(2); z++) {
-                              kernelState.setGlobalId(2, z);
-                              kernelClone.run();
-                           }
-
-                           kernelClone.run();
-                        }
-                     }
-                  }
-               }
-               passId = PASS_ID_COMPLETED_EXECUTION;
-            }
-            else {
-               if (device != JavaDevice.THREAD_POOL && kernel.getExecutionMode() != Kernel.EXECUTION_MODE.JTP) {
-                  throw new AssertionError("unexpected JavaDevice or EXECUTION_MODE");
-               }
-               final int threads = localSize0 * localSize1 * localSize2;
-               final int numGroups0 = _settings.range.getNumGroups(0);
-               final int numGroups1 = _settings.range.getNumGroups(1);
-               final int globalGroups = numGroups0 * numGroups1 * _settings.range.getNumGroups(2);
-               /**
-                * This joinBarrier is the barrier that we provide for the kernel threads to rendezvous with the current dispatch thread.
-                * So this barrier is threadCount+1 wide (the +1 is for the dispatch thread)
-                */
-               final CyclicBarrier joinBarrier = new FJSafeCyclicBarrier(threads + 1);
-
-               /**
-                * This localBarrier is only ever used by the kernels.  If the kernel does not use the barrier the threads
-                * can get out of sync, we promised nothing in JTP mode.
-                *
-                * As with OpenCL all threads within a group must wait at the barrier or none.  It is a user error (possible deadlock!)
-                * if the barrier is in a conditional that is only executed by some of the threads within a group.
-                *
-                * Kernel developer must understand this.
-                *
-                * This barrier is threadCount wide.  We never hit the barrier from the dispatch thread.
-                */
-               final CyclicBarrier localBarrier = new FJSafeCyclicBarrier(threads);
-
-               final ThreadIdSetter threadIdSetter;
-
-               if (_settings.range.getDims() == 1) {
-                  threadIdSetter = new ThreadIdSetter() {
-                     @Override
-                     public void set(KernelState kernelState, int globalGroupId, int threadId) {
-                        //                   (kernelState, globalGroupId, threadId) ->{
-                        kernelState.setLocalId(0, (threadId % localSize0));
-                        kernelState.setGlobalId(0, (threadId + (globalGroupId * threads)));
-                        kernelState.setGroupId(0, globalGroupId);
-                     }
-                  };
-               }
-               else if (_settings.range.getDims() == 2) {
-
-                  /**
-                   * Consider a 12x4 grid of 4*2 local groups
-                   * <pre>
-                   *                                             threads = 4*2 = 8
-                   *                                             localWidth=4
-                   *                                             localHeight=2
-                   *                                             globalWidth=12
-                   *                                             globalHeight=4
-                   *
-                   *    00 01 02 03 | 04 05 06 07 | 08 09 10 11
-                   *    12 13 14 15 | 16 17 18 19 | 20 21 22 23
-                   *    ------------+-------------+------------
-                   *    24 25 26 27 | 28 29 30 31 | 32 33 34 35
-                   *    36 37 38 39 | 40 41 42 43 | 44 45 46 47
-                   *
-                   *    00 01 02 03 | 00 01 02 03 | 00 01 02 03  threadIds : [0..7]*6
-                   *    04 05 06 07 | 04 05 06 07 | 04 05 06 07
-                   *    ------------+-------------+------------
-                   *    00 01 02 03 | 00 01 02 03 | 00 01 02 03
-                   *    04 05 06 07 | 04 05 06 07 | 04 05 06 07
-                   *
-                   *    00 00 00 00 | 01 01 01 01 | 02 02 02 02  groupId[0] : 0..6
-                   *    00 00 00 00 | 01 01 01 01 | 02 02 02 02
-                   *    ------------+-------------+------------
-                   *    00 00 00 00 | 01 01 01 01 | 02 02 02 02
-                   *    00 00 00 00 | 01 01 01 01 | 02 02 02 02
-                   *
-                   *    00 00 00 00 | 00 00 00 00 | 00 00 00 00  groupId[1] : 0..6
-                   *    00 00 00 00 | 00 00 00 00 | 00 00 00 00
-                   *    ------------+-------------+------------
-                   *    01 01 01 01 | 01 01 01 01 | 01 01 01 01
-                   *    01 01 01 01 | 01 01 01 01 | 01 01 01 01
-                   *
-                   *    00 01 02 03 | 08 09 10 11 | 16 17 18 19  globalThreadIds == threadId + groupId * threads;
-                   *    04 05 06 07 | 12 13 14 15 | 20 21 22 23
-                   *    ------------+-------------+------------
-                   *    24 25 26 27 | 32[33]34 35 | 40 41 42 43
-                   *    28 29 30 31 | 36 37 38 39 | 44 45 46 47
-                   *
-                   *    00 01 02 03 | 00 01 02 03 | 00 01 02 03  localX = threadId % localWidth; (for globalThreadId 33 = threadId = 01 : 01%4 =1)
-                   *    00 01 02 03 | 00 01 02 03 | 00 01 02 03
-                   *    ------------+-------------+------------
-                   *    00 01 02 03 | 00[01]02 03 | 00 01 02 03
-                   *    00 01 02 03 | 00 01 02 03 | 00 01 02 03
-                   *
-                   *    00 00 00 00 | 00 00 00 00 | 00 00 00 00  localY = threadId /localWidth  (for globalThreadId 33 = threadId = 01 : 01/4 =0)
-                   *    01 01 01 01 | 01 01 01 01 | 01 01 01 01
-                   *    ------------+-------------+------------
-                   *    00 00 00 00 | 00[00]00 00 | 00 00 00 00
-                   *    01 01 01 01 | 01 01 01 01 | 01 01 01 01
-                   *
-                   *    00 01 02 03 | 04 05 06 07 | 08 09 10 11  globalX=
-                   *    00 01 02 03 | 04 05 06 07 | 08 09 10 11     groupsPerLineWidth=globalWidth/localWidth (=12/4 =3)
-                   *    ------------+-------------+------------     groupInset =groupId%groupsPerLineWidth (=4%3 = 1)
-                   *    00 01 02 03 | 04[05]06 07 | 08 09 10 11
-                   *    00 01 02 03 | 04 05 06 07 | 08 09 10 11     globalX = groupInset*localWidth+localX (= 1*4+1 = 5)
-                   *
-                   *    00 00 00 00 | 00 00 00 00 | 00 00 00 00  globalY
-                   *    01 01 01 01 | 01 01 01 01 | 01 01 01 01
-                   *    ------------+-------------+------------
-                   *    02 02 02 02 | 02[02]02 02 | 02 02 02 02
-                   *    03 03 03 03 | 03 03 03 03 | 03 03 03 03
-                   *
-                   * </pre>
-                   * Assume we are trying to locate the id's for #33
-                   *
-                   */
-                  threadIdSetter = new ThreadIdSetter() {
-                     @Override
-                     public void set(KernelState kernelState, int globalGroupId, int threadId) {
-                        //                   (kernelState, globalGroupId, threadId) ->{
-                        kernelState.setLocalId(0, (threadId % localSize0)); // threadId % localWidth =  (for 33 = 1 % 4 = 1)
-                        kernelState.setLocalId(1, (threadId / localSize0)); // threadId / localWidth = (for 33 = 1 / 4 == 0)
-
-                        final int groupInset = globalGroupId % numGroups0; // 4%3 = 1
-                        kernelState.setGlobalId(0, ((groupInset * localSize0) + kernelState.getLocalIds()[0])); // 1*4+1=5
-
-                        final int completeLines = (globalGroupId / numGroups0) * localSize1;// (4/3) * 2
-                        kernelState.setGlobalId(1, (completeLines + kernelState.getLocalIds()[1])); // 2+0 = 2
-                        kernelState.setGroupId(0, (globalGroupId % numGroups0));
-                        kernelState.setGroupId(1, (globalGroupId / numGroups0));
-                     }
-                  };
-               }
-               else if (_settings.range.getDims() == 3) {
-                  //Same as 2D actually turns out that localId[0] is identical for all three dims so could be hoisted out of conditional code
-                  threadIdSetter = new ThreadIdSetter() {
-                     @Override
-                     public void set(KernelState kernelState, int globalGroupId, int threadId) {
-                        //                   (kernelState, globalGroupId, threadId) ->{
-                        kernelState.setLocalId(0, (threadId % localSize0));
-
-                        kernelState.setLocalId(1, ((threadId / localSize0) % localSize1));
-
-                        // the thread id's span WxHxD so threadId/(WxH) should yield the local depth
-                        kernelState.setLocalId(2, (threadId / (localSize0 * localSize1)));
-
-                        kernelState.setGlobalId(0, (((globalGroupId % numGroups0) * localSize0) + kernelState.getLocalIds()[0]));
-
-                        kernelState.setGlobalId(1,
-                        ((((globalGroupId / numGroups0) * localSize1) % globalSize1) + kernelState.getLocalIds()[1]));
-
-                        kernelState.setGlobalId(2,
-                        (((globalGroupId / (numGroups0 * numGroups1)) * localSize2) + kernelState.getLocalIds()[2]));
-
-                        kernelState.setGroupId(0, (globalGroupId % numGroups0));
-                        kernelState.setGroupId(1, ((globalGroupId / numGroups0) % numGroups1));
-                        kernelState.setGroupId(2, (globalGroupId / (numGroups0 * numGroups1)));
-                     }
-                  };
-               }
-               else
-                  throw new IllegalArgumentException("Expected 1,2 or 3 dimensions, found " + _settings.range.getDims());
-               for (passId = 0; passId < _settings.passes; passId++) {
-                  if (getCancelState() == CANCEL_STATUS_TRUE) {
-                     break;
-                  }
-                  /**
-                   * Note that we emulate OpenCL by creating one thread per localId (across the group).
-                   *
-                   * So threadCount == range.getLocalSize(0)*range.getLocalSize(1)*range.getLocalSize(2);
-                   *
-                   * For a 1D range of 12 groups of 4 we create 4 threads. One per localId(0).
-                   *
-                   * We also clone the kernel 4 times. One per thread.
-                   *
-                   * We create local barrier which has a width of 4
-                   *
-                   *    Thread-0 handles localId(0) (global 0,4,8)
-                   *    Thread-1 handles localId(1) (global 1,5,7)
-                   *    Thread-2 handles localId(2) (global 2,6,10)
-                   *    Thread-3 handles localId(3) (global 3,7,11)
-                   *
-                   * This allows all threads to synchronize using the local barrier.
-                   *
-                   * Initially the use of local buffers seems broken as the buffers appears to be per Kernel.
-                   * Thankfully Kernel.clone() performs a shallow clone of all buffers (local and global)
-                   * So each of the cloned kernels actually still reference the same underlying local/global buffers.
-                   *
-                   * If the kernel uses local buffers but does not use barriers then it is possible for different groups
-                   * to see mutations from each other (unlike OpenCL), however if the kernel does not us barriers then it
-                   * cannot assume any coherence in OpenCL mode either (the failure mode will be different but still wrong)
-                   *
-                   * So even JTP mode use of local buffers will need to use barriers. Not for the same reason as OpenCL but to keep groups in lockstep.
-                   *
-                   **/
-                  for (int id = 0; id < threads; id++) {
-                     final int threadId = id;
-
-                     /**
-                      *  We clone one kernel for each thread.
-                      *
-                      *  They will all share references to the same range, localBarrier and global/local buffers because the clone is shallow.
-                      *  We need clones so that each thread can assign 'state' (localId/globalId/groupId) without worrying
-                      *  about other threads.
-                      */
-                     final Kernel kernelClone = kernel.clone();
-                     final KernelState kernelState = kernelClone.getKernelState();
-                     kernelState.setRange(_settings.range);
-                     kernelState.setPassId(passId);
-
-                     if (threads == 1) {
-                        kernelState.disableLocalBarrier();
-                     }
-                     else {
-                        kernelState.setLocalBarrier(localBarrier);
-                     }
-
-                     threadPool.submit(
-                     //                     () -> {
-                     new Runnable() {
-                        public void run() {
-                           try {
-                              for (int globalGroupId = 0; globalGroupId < globalGroups; globalGroupId++) {
-                                 threadIdSetter.set(kernelState, globalGroupId, threadId);
-                                 kernelClone.run();
-                              }
-                           }
-                           catch (RuntimeException | Error e) {
-                              logger.log(Level.SEVERE, "Execution failed", e);
-                           }
-                           finally {
-                              await(joinBarrier); // This thread will rendezvous with dispatch thread here. This is effectively a join.
-                           }
-                        }
-                     });
-                  }
-
-                  await(joinBarrier); // This dispatch thread waits for all worker threads here.
-               }
-               passId = PASS_ID_COMPLETED_EXECUTION;
-            } // execution mode == JTP
-         }
-      } finally {
-         passId = PASS_ID_COMPLETED_EXECUTION;
-      }
-   }
-
-   private static void await(CyclicBarrier _barrier) {
-      try {
-         _barrier.await();
-      } catch (final InterruptedException e) {
-         // TODO Auto-generated catch block
-         e.printStackTrace();
-      } catch (final BrokenBarrierException e) {
-         // TODO Auto-generated catch block
-         e.printStackTrace();
-      }
-   }
-
-   private KernelArg[] args = null;
-
-   private boolean usesOopConversion = false;
-
-   /**
-    * 
-    * @param arg
-    * @return
-    * @throws AparapiException
-    */
-   private boolean prepareOopConversionBuffer(KernelArg arg) throws AparapiException {
-      usesOopConversion = true;
-      final Class<?> arrayClass = arg.getField().getType();
-      ClassModel c = null;
-      boolean didReallocate = false;
-
-      if (arg.getObjArrayElementModel() == null) {
-         final String tmp = arrayClass.getName().substring(2).replace('/', '.');
-         final String arrayClassInDotForm = tmp.substring(0, tmp.length() - 1);
-
-         if (logger.isLoggable(Level.FINE)) {
-            logger.fine("looking for type = " + arrayClassInDotForm);
-         }
-
-         // get ClassModel of obj array from entrypt.objectArrayFieldsClasses
-         c = entryPoint.getObjectArrayFieldsClasses().get(arrayClassInDotForm);
-         arg.setObjArrayElementModel(c);
-      } else {
-         c = arg.getObjArrayElementModel();
-      }
-      assert c != null : "should find class for elements " + arrayClass.getName();
-
-      final int arrayBaseOffset = UnsafeWrapper.arrayBaseOffset(arrayClass);
-      final int arrayScale = UnsafeWrapper.arrayIndexScale(arrayClass);
-
-      if (logger.isLoggable(Level.FINEST)) {
-         logger.finest("Syncing obj array type = " + arrayClass + " cvtd= " + c.getClassWeAreModelling().getName()
-               + "arrayBaseOffset=" + arrayBaseOffset + " arrayScale=" + arrayScale);
-      }
-
-      int objArraySize = 0;
-      Object newRef = null;
-      try {
-         newRef = arg.getField().get(kernel);
-         objArraySize = Array.getLength(newRef);
-      } catch (final IllegalAccessException e) {
-         throw new AparapiException(e);
-      }
-
-      assert (newRef != null) && (objArraySize != 0) : "no data";
-
-      final int totalStructSize = c.getTotalStructSize();
-      final int totalBufferSize = objArraySize * totalStructSize;
-
-      // allocate ByteBuffer if first time or array changed
-      if ((arg.getObjArrayBuffer() == null) || (newRef != arg.getArray())) {
-         final ByteBuffer structBuffer = ByteBuffer.allocate(totalBufferSize);
-         arg.setObjArrayByteBuffer(structBuffer.order(ByteOrder.LITTLE_ENDIAN));
-         arg.setObjArrayBuffer(arg.getObjArrayByteBuffer().array());
-         didReallocate = true;
-         if (logger.isLoggable(Level.FINEST)) {
-            logger.finest("objArraySize = " + objArraySize + " totalStructSize= " + totalStructSize + " totalBufferSize="
-                  + totalBufferSize);
-         }
-      } else {
-         arg.getObjArrayByteBuffer().clear();
-      }
-
-      // copy the fields that the JNI uses
-      arg.setJavaArray(arg.getObjArrayBuffer());
-      arg.setNumElements(objArraySize);
-      arg.setSizeInBytes(totalBufferSize);
-
-      for (int j = 0; j < objArraySize; j++) {
-         int sizeWritten = 0;
-
-         final Object object = UnsafeWrapper.getObject(newRef, arrayBaseOffset + (arrayScale * j));
-         for (int i = 0; i < c.getStructMemberTypes().size(); i++) {
-            final TypeSpec t = c.getStructMemberTypes().get(i);
-            final long offset = c.getStructMemberOffsets().get(i);
-
-            if (logger.isLoggable(Level.FINEST)) {
-               logger.finest("name = " + c.getStructMembers().get(i).getNameAndTypeEntry().getNameUTF8Entry().getUTF8() + " t= "
-                     + t);
-            }
-
-            switch (t) {
-               case I: {
-                  final int x = UnsafeWrapper.getInt(object, offset);
-                  arg.getObjArrayByteBuffer().putInt(x);
-                  sizeWritten += t.getSize();
-                  break;
-               }
-               case F: {
-                  final float x = UnsafeWrapper.getFloat(object, offset);
-                  arg.getObjArrayByteBuffer().putFloat(x);
-                  sizeWritten += t.getSize();
-                  break;
-               }
-               case J: {
-                  final long x = UnsafeWrapper.getLong(object, offset);
-                  arg.getObjArrayByteBuffer().putLong(x);
-                  sizeWritten += t.getSize();
-                  break;
-               }
-               case Z: {
-                  final boolean x = UnsafeWrapper.getBoolean(object, offset);
-                  arg.getObjArrayByteBuffer().put(x == true ? (byte) 1 : (byte) 0);
-                  // Booleans converted to 1 byte C chars for opencl
-                  sizeWritten += TypeSpec.B.getSize();
-                  break;
-               }
-               case B: {
-                  final byte x = UnsafeWrapper.getByte(object, offset);
-                  arg.getObjArrayByteBuffer().put(x);
-                  sizeWritten += t.getSize();
-                  break;
-               }
-               case D: {
-                  throw new AparapiException("Double not implemented yet");
-               }
-               default:
-                  assert true == false : "typespec did not match anything";
-                  throw new AparapiException("Unhandled type in buffer conversion");
-            }
-         }
-
-         // add padding here if needed
-         if (logger.isLoggable(Level.FINEST)) {
-            logger.finest("sizeWritten = " + sizeWritten + " totalStructSize= " + totalStructSize);
-         }
-
-         assert sizeWritten <= totalStructSize : "wrote too much into buffer";
-
-         while (sizeWritten < totalStructSize) {
-            if (logger.isLoggable(Level.FINEST)) {
-               logger.finest(arg.getName() + " struct pad byte = " + sizeWritten + " totalStructSize= " + totalStructSize);
-            }
-            arg.getObjArrayByteBuffer().put((byte) -1);
-            sizeWritten++;
-         }
-      }
-
-      assert arg.getObjArrayByteBuffer().arrayOffset() == 0 : "should be zero";
-
-      return didReallocate;
-   }
-
-   private void extractOopConversionBuffer(KernelArg arg) throws AparapiException {
-      final Class<?> arrayClass = arg.getField().getType();
-      final ClassModel c = arg.getObjArrayElementModel();
-      assert c != null : "should find class for elements: " + arrayClass.getName();
-      assert arg.getArray() != null : "array is null";
-
-      final int arrayBaseOffset = UnsafeWrapper.arrayBaseOffset(arrayClass);
-      final int arrayScale = UnsafeWrapper.arrayIndexScale(arrayClass);
-      if (logger.isLoggable(Level.FINEST)) {
-         logger.finest("Syncing field:" + arg.getName() + ", bb=" + arg.getObjArrayByteBuffer() + ", type = " + arrayClass);
-      }
-
-      int objArraySize = 0;
-      try {
-         objArraySize = Array.getLength(arg.getField().get(kernel));
-      } catch (final IllegalAccessException e) {
-         throw new AparapiException(e);
-      }
-
-      assert objArraySize > 0 : "should be > 0";
-
-      final int totalStructSize = c.getTotalStructSize();
-      // int totalBufferSize = objArraySize * totalStructSize;
-      // assert arg.objArrayBuffer.length == totalBufferSize : "size should match";
-
-      arg.getObjArrayByteBuffer().rewind();
-
-      for (int j = 0; j < objArraySize; j++) {
-         int sizeWritten = 0;
-         final Object object = UnsafeWrapper.getObject(arg.getArray(), arrayBaseOffset + (arrayScale * j));
-         for (int i = 0; i < c.getStructMemberTypes().size(); i++) {
-            final TypeSpec t = c.getStructMemberTypes().get(i);
-            final long offset = c.getStructMemberOffsets().get(i);
-            switch (t) {
-               case I: {
-                  // read int value from buffer and store into obj in the array
-                  final int x = arg.getObjArrayByteBuffer().getInt();
-                  if (logger.isLoggable(Level.FINEST)) {
-                     logger.finest("fType = " + t.getShortName() + " x= " + x);
-                  }
-                  UnsafeWrapper.putInt(object, offset, x);
-                  sizeWritten += t.getSize();
-                  break;
-               }
-               case F: {
-                  final float x = arg.getObjArrayByteBuffer().getFloat();
-                  if (logger.isLoggable(Level.FINEST)) {
-                     logger.finest("fType = " + t.getShortName() + " x= " + x);
-                  }
-                  UnsafeWrapper.putFloat(object, offset, x);
-                  sizeWritten += t.getSize();
-                  break;
-               }
-               case J: {
-                  final long x = arg.getObjArrayByteBuffer().getLong();
-                  if (logger.isLoggable(Level.FINEST)) {
-                     logger.finest("fType = " + t.getShortName() + " x= " + x);
-                  }
-                  UnsafeWrapper.putLong(object, offset, x);
-                  sizeWritten += t.getSize();
-                  break;
-               }
-               case Z: {
-                  final byte x = arg.getObjArrayByteBuffer().get();
-                  if (logger.isLoggable(Level.FINEST)) {
-                     logger.finest("fType = " + t.getShortName() + " x= " + x);
-                  }
-                  UnsafeWrapper.putBoolean(object, offset, (x == 1 ? true : false));
-                  // Booleans converted to 1 byte C chars for open cl
-                  sizeWritten += TypeSpec.B.getSize();
-                  break;
-               }
-               case B: {
-                  final byte x = arg.getObjArrayByteBuffer().get();
-                  if (logger.isLoggable(Level.FINEST)) {
-                     logger.finest("fType = " + t.getShortName() + " x= " + x);
-                  }
-                  UnsafeWrapper.putByte(object, offset, x);
-                  sizeWritten += t.getSize();
-                  break;
-               }
-               case D: {
-                  throw new AparapiException("Double not implemented yet");
-               }
-               default:
-                  assert true == false : "typespec did not match anything";
-                  throw new AparapiException("Unhandled type in buffer conversion");
-            }
-         }
-
-         // add padding here if needed
-         if (logger.isLoggable(Level.FINEST)) {
-            logger.finest("sizeWritten = " + sizeWritten + " totalStructSize= " + totalStructSize);
-         }
-
-         assert sizeWritten <= totalStructSize : "wrote too much into buffer";
-
-         while (sizeWritten < totalStructSize) {
-            // skip over pad bytes
-            arg.getObjArrayByteBuffer().get();
-            sizeWritten++;
-         }
-      }
-   }
-
-   private void restoreObjects() throws AparapiException {
-      for (int i = 0; i < argc; i++) {
-         final KernelArg arg = args[i];
-         if ((arg.getType() & ARG_OBJ_ARRAY_STRUCT) != 0) {
-            extractOopConversionBuffer(arg);
-         }
-      }
-   }
-
-   private boolean updateKernelArrayRefs() throws AparapiException {
-      boolean needsSync = false;
-
-      for (int i = 0; i < argc; i++) {
-         final KernelArg arg = args[i];
-         try {
-            if ((arg.getType() & ARG_ARRAY) != 0) {
-               Object newArrayRef;
-               newArrayRef = arg.getField().get(kernel);
-
-               if (newArrayRef == null) {
-                  throw new IllegalStateException("Cannot send null refs to kernel, reverting to java");
-               }
-
-               String fieldName = arg.getField().getName();
-               int arrayLength = Array.getLength(newArrayRef);
-               Integer privateMemorySize = ClassModel.getPrivateMemorySizeFromField(arg.getField());
-               if (privateMemorySize == null) {
-                  privateMemorySize = ClassModel.getPrivateMemorySizeFromFieldName(fieldName);
-               }
-               if (privateMemorySize != null) {
-                  if (arrayLength > privateMemorySize) {
-                     throw new IllegalStateException("__private array field " + fieldName + " has illegal length " + arrayLength
-                           + " > " + privateMemorySize);
-                  }
-               }
-
-               if ((arg.getType() & ARG_OBJ_ARRAY_STRUCT) != 0) {
-                  prepareOopConversionBuffer(arg);
-               } else {
-                  // set up JNI fields for normal arrays
-                  arg.setJavaArray(newArrayRef);
-                  arg.setNumElements(arrayLength);
-                  arg.setSizeInBytes(arg.getNumElements() * arg.getPrimitiveSize());
-
-                  if (((args[i].getType() & ARG_EXPLICIT) != 0) && puts.contains(newArrayRef)) {
-                     args[i].setType(args[i].getType() | ARG_EXPLICIT_WRITE);
-                     // System.out.println("detected an explicit write " + args[i].name);
-                     puts.remove(newArrayRef);
-                  }
-               }
-
-               if (newArrayRef != arg.getArray()) {
-                  needsSync = true;
-
-                  if (logger.isLoggable(Level.FINE)) {
-                     logger.fine("saw newArrayRef for " + arg.getName() + " = " + newArrayRef + ", newArrayLen = "
-                           + Array.getLength(newArrayRef));
-                  }
-               }
-
-               arg.setArray(newArrayRef);
-               assert arg.getArray() != null : "null array ref";
-            } else if ((arg.getType() & ARG_APARAPI_BUFFER) != 0) {
-               // TODO: check if the 2D/3D array is changed. 
-               //   can Arrays.equals help?
-               needsSync = true; // Always need syn
-               Object buffer = new Object();
-               try {
-                  buffer = arg.getField().get(kernel);
-               } catch (IllegalAccessException e) {
-                  e.printStackTrace();
-               }
-               int numDims = arg.getNumDims();
-               Object subBuffer = buffer;
-               int[] dims = new int[numDims];
-               for (int d = 0; d < numDims - 1; d++) {
-                  dims[d] = Array.getLength(subBuffer);
-                  subBuffer = Array.get(subBuffer, 0);
-               }
-               dims[numDims - 1] = Array.getLength(subBuffer);
-               arg.setDims(dims);
-
-               int primitiveSize = getPrimitiveSize(arg.getType());
-               int totalElements = 1;
-               for (int d = 0; d < numDims; d++) {
-                  totalElements *= dims[d];
-               }
-               arg.setJavaBuffer(buffer);
-               arg.setSizeInBytes(totalElements * primitiveSize);
-               arg.setArray(buffer);
-            }
-         } catch (final IllegalArgumentException e) {
-            e.printStackTrace();
-         } catch (final IllegalAccessException e) {
-            e.printStackTrace();
-         }
-      }
-      return needsSync;
-   }
-
-   @SuppressWarnings("deprecation")
-   private Kernel executeOpenCL(ExecutionSettings _settings) throws AparapiException {
-
-      // Read the array refs after kernel may have changed them
-      // We need to do this as input to computing the localSize
-      assert args != null : "args should not be null";
-      final boolean needSync = updateKernelArrayRefs();
-      if (needSync && logger.isLoggable(Level.FINE)) {
-         logger.fine("Need to resync arrays on " + kernel);
-      }
-
-      // native side will reallocate array buffers if necessary
-      int returnValue = runKernelJNI(jniContextHandle, _settings.range, needSync, _settings.passes, inBufferRemote, outBufferRemote);
-      if (returnValue != 0) {
-         String reason = "OpenCL execution seems to have failed (runKernelJNI returned " + returnValue + ")";
-         return fallBackToNextDevice(_settings, new AparapiException(reason));
-      }
-
-      if (usesOopConversion == true) {
-         restoreObjects();
-      }
-
-      if (logger.isLoggable(Level.FINE)) {
-         logger.fine("executeOpenCL completed. " + _settings.range);
-      }
-
-      return kernel;
-   }
-
-   @SuppressWarnings("deprecation")
-   synchronized private Kernel fallBackByExecutionMode(ExecutionSettings _settings) {
-      isFallBack = true;
-      if (kernel.hasNextExecutionMode()) {
-         kernel.tryNextExecutionMode();
-         if (logger.isLoggable(Level.WARNING)) {
-            logger.warning("Trying next execution mode " + kernel.getExecutionMode());
-         }
-      } else {
-         kernel.setFallbackExecutionMode();
-      }
-      recreateRange(_settings);
-      return executeInternalInner(_settings);
-   }
-
-   private void recreateRange(ExecutionSettings _settings) {
-      if (_settings.range.isLocalIsDerived() && !_settings.legacyExecutionMode) {
-         Device device = kernel.getTargetDevice();
-         Range result;
-         switch (_settings.range.getDims()) {
-            case 1: {
-               result = Range.create(device, _settings.range.getGlobalSize_0());
-               break;
-            }
-            case 2: {
-               result = Range.create2D(device, _settings.range.getGlobalSize_0(), _settings.range.getGlobalSize_1());
-               break;
-            }
-            case 3: {
-               result = Range.create3D(device, _settings.range.getGlobalSize_0(), _settings.range.getGlobalSize_1(), _settings.range.getGlobalSize_2());
-               break;
-            }
-            default: {
-               throw new AssertionError("Range.getDims() = " + _settings.range.getDims());
-            }
-         }
-         _settings.range = result;
-      }
-   }
-
-   private Kernel fallBackToNextDevice(ExecutionSettings _settings, String _reason) {
-      return fallBackToNextDevice(_settings, new AparapiException(_reason));
-   }
-
-   @SuppressWarnings("deprecation")
-   synchronized private Kernel fallBackToNextDevice(ExecutionSettings _settings, Exception _exception) {
-      return fallBackToNextDevice(_settings, _exception, false);
-   }
-
-   @SuppressWarnings("deprecation")
-   synchronized private Kernel fallBackToNextDevice(ExecutionSettings _settings, Exception _exception, boolean _silently) {
-      isFallBack = true;
-      _settings.profile.onEvent(ProfilingEvent.EXECUTED);
-      if (_settings.legacyExecutionMode) {
-         if (!_silently && logger.isLoggable(Level.WARNING)) {
-            logger.warning("Execution mode " + kernel.getExecutionMode() + " failed for " + kernel + ": " + _exception.getMessage());
-             _exception.printStackTrace();
-          }
-          return fallBackByExecutionMode(_settings);
-      } else {
-         KernelPreferences preferences = KernelManager.instance().getPreferences(kernel);
-         if (!_silently && logger.isLoggable(Level.WARNING)) {
-            logger.warning("Device failed for " + kernel + ": " + _exception.getMessage());
-         }
-
-         preferences.markPreferredDeviceFailed();
-
-//         Device nextDevice = preferences.getPreferredDevice(kernel);
-//
-//         if (nextDevice == null) {
-//            if (!_silently && logger.isLoggable(Level.SEVERE)) {
-//               logger.severe("No Devices left to try, giving up");
-//            }
-//            throw new RuntimeException(_exception);
-//         }
-         if (!_silently && logger.isLoggable(Level.WARNING)) {
-            _exception.printStackTrace();
-            logger.warning("Trying next device: " + describeDevice());
-         }
-      }
-
-      recreateRange(_settings);
-      return executeInternalInner(_settings);
-   }
-
-   @SuppressWarnings("deprecation")
-   public synchronized Kernel execute(String _entrypoint, final Range _range, final int _passes) {
-      executing = true;
-      try {
-         clearCancelMultiPass();
-         KernelProfile profile = KernelManager.instance().getProfile(kernel.getClass());
-         KernelPreferences preferences = KernelManager.instance().getPreferences(kernel);
-         boolean legacyExecutionMode = kernel.getExecutionMode() != Kernel.EXECUTION_MODE.AUTO;
-
-         ExecutionSettings settings = new ExecutionSettings(preferences, profile, _entrypoint, _range, _passes, legacyExecutionMode);
-         // Two Kernels of the same class share the same KernelPreferences object, and since failure (fallback) generally mutates
-         // the preferences object, we must lock it. Note this prevents two Kernels of the same class executing simultaneously.
-         synchronized (preferences) {
-            return executeInternalOuter(settings);
-         }
-      } finally {
-         executing = false;
-         clearCancelMultiPass();
-      }
-   }
-
-   private synchronized Kernel executeInternalOuter(ExecutionSettings _settings) {
-      try {
-         return executeInternalInner(_settings);
-      } finally {
-         if (kernel.isAutoCleanUpArrays() &&_settings.range.getGlobalSize_0() != 0) {
-            cleanUpArrays();
-         }
-      }
-   }
-
-   @SuppressWarnings("deprecation")
-   private synchronized Kernel executeInternalInner(ExecutionSettings _settings) {
-
-      if (_settings.range == null) {
-         throw new IllegalStateException("range can't be null");
-      }
-
-      EXECUTION_MODE requestedExecutionMode = kernel.getExecutionMode();
-
-      if (requestedExecutionMode.isOpenCL() && _settings.range.getDevice() != null && !(_settings.range.getDevice() instanceof OpenCLDevice)) {
-         fallBackToNextDevice(_settings, "OpenCL EXECUTION_MODE was requested but Device supplied was not an OpenCLDevice");
-      }
-
-      Device device = _settings.range.getDevice();
-      boolean userSpecifiedDevice = true;
-      if (device == null) {
-         userSpecifiedDevice = false;
-         if (!_settings.legacyExecutionMode) {
-            device = _settings.preferences.getPreferredDevice(kernel);
-            if (device == null) {
-               // the default fallback when KernelPreferences has run out of options is JTP
-               device = JavaDevice.THREAD_POOL;
-            }
-         } else {
-            if (requestedExecutionMode == EXECUTION_MODE.JTP) {
-               device = JavaDevice.THREAD_POOL;
-            } else if (requestedExecutionMode == EXECUTION_MODE.SEQ) {
-               device = JavaDevice.SEQUENTIAL;
-            }
-         }
-      } else {
-         boolean compatible = isDeviceCompatible(device);
-         if (!compatible) {
-            throw new AssertionError("user supplied Device incompatible with current EXECUTION_MODE or getTargetDevice(); device = "
-                    + device.getShortDescription() + "; kernel = " + kernel);
-         }
-      }
-
-      try {
-         OpenCLDevice openCLDevice = device instanceof OpenCLDevice ? (OpenCLDevice) device : null;
-
-         int jniFlags = 0;
-         // for legacy reasons use old logic where Kernel.EXECUTION_MODE is not AUTO
-         if (_settings.legacyExecutionMode && !userSpecifiedDevice && requestedExecutionMode.isOpenCL()) {
-            if (requestedExecutionMode.equals(EXECUTION_MODE.GPU)) {
-               // Get the best GPU
-               openCLDevice = (OpenCLDevice) KernelManager.DeprecatedMethods.bestGPU();
-               jniFlags |= JNI_FLAG_USE_GPU; // this flag might be redundant now.
-               if (openCLDevice == null) {
-                  return fallBackToNextDevice(_settings, "GPU request can't be honored, no GPU device");
-               }
-            } else if (requestedExecutionMode.equals(EXECUTION_MODE.ACC)) {
-               // Get the best ACC
-               openCLDevice = (OpenCLDevice) KernelManager.DeprecatedMethods.bestACC();
-               jniFlags |= JNI_FLAG_USE_ACC; // this flag might be redundant now.
-               if (openCLDevice == null) {
-                  return fallBackToNextDevice(_settings, "ACC request can't be honored, no ACC device");
-               }
-            } else {
-               // We fetch the first CPU device
-               openCLDevice = (OpenCLDevice) KernelManager.DeprecatedMethods.firstDevice(Device.TYPE.CPU);
-               if (openCLDevice == null) {
-                  return fallBackToNextDevice(_settings, "CPU request can't be honored, no CPU device");
-               }
-            }
-         } else {
-            if (device.getType() == Device.TYPE.GPU) {
-               jniFlags |= JNI_FLAG_USE_GPU; // this flag might be redundant now.
-            } else if (device.getType() == Device.TYPE.ACC) {
-               jniFlags |= JNI_FLAG_USE_ACC; // this flag might be redundant now.
-            }
-         }
-         if (device == null && openCLDevice != null) {
-            device = openCLDevice;
-         }
-         assert device != null : "No device available";
-         _settings.profile.onStart(device);
-         /* for backward compatibility reasons we still honor execution mode */
-         boolean isOpenCl = requestedExecutionMode.isOpenCL() || device instanceof OpenCLDevice;
-         if (isOpenCl) {
-            if ((entryPoint == null) || (isFallBack)) {
-               if (entryPoint == null) {
-                  try {
-                     final ClassModel classModel = ClassModel.createClassModel(kernel.getClass());
-                     entryPoint = classModel.getEntrypoint(_settings.entrypoint, kernel);
-                     _settings.profile.onEvent(ProfilingEvent.CLASS_MODEL_BUILT);
-                  } catch (final Exception exception) {
-                     _settings.profile.onEvent(ProfilingEvent.CLASS_MODEL_BUILT);
-                     return fallBackToNextDevice(_settings, exception);
-                  }
-               }
-
-               if ((entryPoint != null)) {
-                  synchronized (Kernel.class) { // This seems to be needed because of a race condition uncovered with issue #68 http://code.google.com/p/aparapi/issues/detail?id=68
-
-                     //  jniFlags |= (Config.enableProfiling ? JNI_FLAG_ENABLE_PROFILING : 0);
-                     //  jniFlags |= (Config.enableProfilingCSV ? JNI_FLAG_ENABLE_PROFILING_CSV | JNI_FLAG_ENABLE_PROFILING : 0);
-                     //  jniFlags |= (Config.enableVerboseJNI ? JNI_FLAG_ENABLE_VERBOSE_JNI : 0);
-                     // jniFlags |= (Config.enableVerboseJNIOpenCLResourceTracking ? JNI_FLAG_ENABLE_VERBOSE_JNI_OPENCL_RESOURCE_TRACKING :0);
-                     // jniFlags |= (kernel.getExecutionMode().equals(Kernel.EXECUTION_MODE.GPU) ? JNI_FLAG_USE_GPU : 0);
-                     // Init the device to check capabilities before emitting the
-                     // code that requires the capabilities.
-                     jniContextHandle = initJNI(kernel, openCLDevice, jniFlags); // openCLDevice will not be null here
-                     _settings.profile.onEvent(ProfilingEvent.INIT_JNI);
-                  } // end of synchronized! issue 68
-
-                  if (jniContextHandle == 0) {
-                     return fallBackToNextDevice(_settings, "initJNI failed to return a valid handle");
-                  }
-
-                  final String extensions = getExtensionsJNI(jniContextHandle);
-                  capabilitiesSet = new HashSet<String>();
-
-                  final StringTokenizer strTok = new StringTokenizer(extensions);
-                  while (strTok.hasMoreTokens()) {
-                     capabilitiesSet.add(strTok.nextToken());
-                  }
-
-                  if (logger.isLoggable(Level.FINE)) {
-                     logger.fine("Capabilities initialized to :" + capabilitiesSet.toString());
-                  }
-
-                  if (entryPoint.requiresDoublePragma() && !hasFP64Support()) {
-                     return fallBackToNextDevice(_settings, "FP64 required but not supported");
-                  }
-
-                  if (entryPoint.requiresByteAddressableStorePragma() && !hasByteAddressableStoreSupport()) {
-                     return fallBackToNextDevice(_settings, "Byte addressable stores required but not supported");
-                  }
-
-                  final boolean all32AtomicsAvailable = hasGlobalInt32BaseAtomicsSupport()
-                        && hasGlobalInt32ExtendedAtomicsSupport() && hasLocalInt32BaseAtomicsSupport()
-                        && hasLocalInt32ExtendedAtomicsSupport();
-
-                  if (entryPoint.requiresAtomic32Pragma() && !all32AtomicsAvailable) {
-
-                     return fallBackToNextDevice(_settings, "32 bit Atomics required but not supported");
-                  }
-
-                  String openCL;
-                  synchronized (openCLCache) {
-                     openCL = openCLCache.get(kernel.getClass());
-                     if (openCL == null) {
-                        try {
-                           openCL = KernelWriter.writeToString(entryPoint);
-                           if (logger.isLoggable(Level.INFO)) {
-                              logger.info(openCL);
-                           }
-                           else if (Config.enableShowGeneratedOpenCL) {
-                              System.out.println(openCL);
-                           }
-                           _settings.profile.onEvent(ProfilingEvent.OPENCL_GENERATED);
-                           openCLCache.put(kernel.getClass(), openCL);
-                        }
-                        catch (final CodeGenException codeGenException) {
-                           openCLCache.put(kernel.getClass(), CODE_GEN_ERROR_MARKER);
-                           _settings.profile.onEvent(ProfilingEvent.OPENCL_GENERATED);
-                           return fallBackToNextDevice(_settings, codeGenException);
-                        }
-                     }
-                     else {
-                        if (openCL.equals(CODE_GEN_ERROR_MARKER)) {
-                           _settings.profile.onEvent(ProfilingEvent.OPENCL_GENERATED);
-                           boolean silently = true; // since we must have already reported the CodeGenException
-                           return fallBackToNextDevice(_settings, null, silently);
-                        }
-                     }
-                  }
-
-                  // Send the string to OpenCL to compile it, or if the compiled binary is already cached on JNI side just empty string to use cached binary
-                  long handle;
-                  if (BINARY_CACHING_DISABLED) {
-                     handle = buildProgramJNI(jniContextHandle, openCL, "");
-                  } else {
-                     synchronized (seenBinaryKeys) {
-                        String binaryKey = kernel.getClass().getName() + ":" + device.getDeviceId();
-                        if (seenBinaryKeys.contains(binaryKey)) {
-                           // use cached binary
-                           logger.log(Level.INFO, "reusing cached binary for " + binaryKey);
-                           handle = buildProgramJNI(jniContextHandle, "", binaryKey);
-                        }
-                        else {
-                           // create and cache binary
-                           logger.log(Level.INFO, "compiling new binary for " + binaryKey);
-                           handle = buildProgramJNI(jniContextHandle, openCL, binaryKey);
-                           seenBinaryKeys.add(binaryKey);
-                        }
-                     }
-                  }
-                  _settings.profile.onEvent(ProfilingEvent.OPENCL_COMPILED);
-                  if (handle == 0) {
-                     return fallBackToNextDevice(_settings, "OpenCL compile failed");
-                  }
-
-                  args = new KernelArg[entryPoint.getReferencedFields().size()];
-                  int i = 0;
-
-                  for (final Field field : entryPoint.getReferencedFields()) {
-                     try {
-                        field.setAccessible(true);
-                        args[i] = new KernelArg();
-                        args[i].setName(field.getName());
-                        args[i].setField(field);
-                        if ((field.getModifiers() & Modifier.STATIC) == Modifier.STATIC) {
-                           args[i].setType(args[i].getType() | ARG_STATIC);
-                        }
-
-                        final Class<?> type = field.getType();
-                        if (type.isArray()) {
-
-                           if (field.getAnnotation(Local.class) != null || args[i].getName().endsWith(Kernel.LOCAL_SUFFIX)) {
-                              args[i].setType(args[i].getType() | ARG_LOCAL);
-                           } else if ((field.getAnnotation(Constant.class) != null)
-                                 || args[i].getName().endsWith(Kernel.CONSTANT_SUFFIX)) {
-                              args[i].setType(args[i].getType() | ARG_CONSTANT);
-                           } else {
-                              args[i].setType(args[i].getType() | ARG_GLOBAL);
-                           }
-                           if (isExplicit()) {
-                              args[i].setType(args[i].getType() | ARG_EXPLICIT);
-                           }
-                           // for now, treat all write arrays as read-write, see bugzilla issue 4859
-                           // we might come up with a better solution later
-                           args[i].setType(args[i].getType()
-                                 | (entryPoint.getArrayFieldAssignments().contains(field.getName()) ? (ARG_WRITE | ARG_READ) : 0));
-                           args[i].setType(args[i].getType()
-                                 | (entryPoint.getArrayFieldAccesses().contains(field.getName()) ? ARG_READ : 0));
-                           // args[i].type |= ARG_GLOBAL;
-
-                           if (type.getName().startsWith("[L")) {
-                              args[i].setArray(null); // will get updated in updateKernelArrayRefs
-                              args[i].setType(args[i].getType()
-                                    | (ARG_ARRAY | ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ));
-
-                              if (logger.isLoggable(Level.FINE)) {
-                                 logger.fine("tagging " + args[i].getName() + " as (ARG_ARRAY | ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)");
-                              }
-                           } else if (type.getName().startsWith("[[")) {
-
-                              try {
-                                 setMultiArrayType(args[i], type);
-                              } catch (AparapiException e) {
-                                 return fallBackToNextDevice(_settings, "failed to set kernel arguement "
-                                       + args[i].getName() + ".  Aparapi only supports 2D and 3D arrays.");
-                              }
-                           } else {
-
-                              args[i].setArray(null); // will get updated in updateKernelArrayRefs
-                              args[i].setType(args[i].getType() | ARG_ARRAY);
-
-                              args[i].setType(args[i].getType() | (type.isAssignableFrom(float[].class) ? ARG_FLOAT : 0));
-                              args[i].setType(args[i].getType() | (type.isAssignableFrom(int[].class) ? ARG_INT : 0));
-                              args[i].setType(args[i].getType() | (type.isAssignableFrom(boolean[].class) ? ARG_BOOLEAN : 0));
-                              args[i].setType(args[i].getType() | (type.isAssignableFrom(byte[].class) ? ARG_BYTE : 0));
-                              args[i].setType(args[i].getType() | (type.isAssignableFrom(char[].class) ? ARG_CHAR : 0));
-                              args[i].setType(args[i].getType() | (type.isAssignableFrom(double[].class) ? ARG_DOUBLE : 0));
-                              args[i].setType(args[i].getType() | (type.isAssignableFrom(long[].class) ? ARG_LONG : 0));
-                              args[i].setType(args[i].getType() | (type.isAssignableFrom(short[].class) ? ARG_SHORT : 0));
-
-                              // arrays whose length is used will have an int arg holding
-                              // the length as a kernel param
-                              if (entryPoint.getArrayFieldArrayLengthUsed().contains(args[i].getName())) {
-                                 args[i].setType(args[i].getType() | ARG_ARRAYLENGTH);
-                              }
-
-                              if (type.getName().startsWith("[L")) {
-                                 args[i].setType(args[i].getType() | (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ));
-                                 if (logger.isLoggable(Level.FINE)) {
-                                    logger.fine("tagging " + args[i].getName()
-                                          + " as (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)");
-                                 }
-                              }
-                           }
-                        } else if (type.isAssignableFrom(float.class)) {
-                           args[i].setType(args[i].getType() | ARG_PRIMITIVE);
-                           args[i].setType(args[i].getType() | ARG_FLOAT);
-                        } else if (type.isAssignableFrom(int.class)) {
-                           args[i].setType(args[i].getType() | ARG_PRIMITIVE);
-                           args[i].setType(args[i].getType() | ARG_INT);
-                        } else if (type.isAssignableFrom(double.class)) {
-                           args[i].setType(args[i].getType() | ARG_PRIMITIVE);
-                           args[i].setType(args[i].getType() | ARG_DOUBLE);
-                        } else if (type.isAssignableFrom(long.class)) {
-                           args[i].setType(args[i].getType() | ARG_PRIMITIVE);
-                           args[i].setType(args[i].getType() | ARG_LONG);
-                        } else if (type.isAssignableFrom(boolean.class)) {
-                           args[i].setType(args[i].getType() | ARG_PRIMITIVE);
-                           args[i].setType(args[i].getType() | ARG_BOOLEAN);
-                        } else if (type.isAssignableFrom(byte.class)) {
-                           args[i].setType(args[i].getType() | ARG_PRIMITIVE);
-                           args[i].setType(args[i].getType() | ARG_BYTE);
-                        } else if (type.isAssignableFrom(char.class)) {
-                           args[i].setType(args[i].getType() | ARG_PRIMITIVE);
-                           args[i].setType(args[i].getType() | ARG_CHAR);
-                        } else if (type.isAssignableFrom(short.class)) {
-                           args[i].setType(args[i].getType() | ARG_PRIMITIVE);
-                           args[i].setType(args[i].getType() | ARG_SHORT);
-                        }
-                        // System.out.printf("in execute, arg %d %s %08x\n", i,args[i].name,args[i].type );
-                     } catch (final IllegalArgumentException e) {
-                        e.printStackTrace();
-                     }
-
-                     args[i].setPrimitiveSize(getPrimitiveSize(args[i].getType()));
-
-                     if (logger.isLoggable(Level.FINE)) {
-                        logger.fine("arg " + i + ", " + args[i].getName() + ", type=" + Integer.toHexString(args[i].getType())
-                              + ", primitiveSize=" + args[i].getPrimitiveSize());
-                     }
-
-                     i++;
-                  }
-
-                  // at this point, i = the actual used number of arguments
-                  // (private buffers do not get treated as arguments)
-
-                  argc = i;
-
-                  setArgsJNI(jniContextHandle, args, argc);
-                  _settings.profile.onEvent(ProfilingEvent.PREPARE_EXECUTE);
-                  try {
-                     executeOpenCL(_settings);
-                     isFallBack = false;
-                  } catch (final AparapiException e) {
-                     fallBackToNextDevice(_settings, e);
-                  }
-               } else { // (entryPoint != null) && !entryPoint.shouldFallback()
-                  fallBackToNextDevice(_settings, "failed to locate entrypoint");
-               }
-            } else { // (entryPoint == null) || (isFallBack)
-               try {
-                  executeOpenCL(_settings);
-                  isFallBack = false;
-               } catch (final AparapiException e) {
-                  fallBackToNextDevice(_settings, e);
-               }
-            }
-         } else { // isOpenCL
-            if (!(device instanceof JavaDevice)) {
-               fallBackToNextDevice(_settings, "Non-OpenCL Kernel.EXECUTION_MODE requested but device is not a JavaDevice ");
-            }
-            executeJava(_settings, (JavaDevice) device);
-         }
-
-         if (Config.enableExecutionModeReporting) {
-            System.out.println("execution complete: " + kernel);
-         }
-
-         return kernel;
-      }
-      finally {
-         _settings.profile.onEvent(ProfilingEvent.EXECUTED);
-         maybeReportProfile(_settings);
-      }
-   }
-
-   @Override
-   public String toString() {
-      return "KernelRunner{" + kernel + "}";
-   }
-
-   private String describeDevice() {
-      Device device = KernelManager.instance().getPreferences(kernel).getPreferredDevice(kernel);
-      return (device == null) ? "<default fallback>" : device.getShortDescription();
-   }
-
-   private void maybeReportProfile(ExecutionSettings _settings) {
-      if (Config.dumpProfileOnExecution) {
-         StringBuilder report = new StringBuilder();
-         report.append(KernelDeviceProfile.getTableHeader()).append('\n');
-         report.append(_settings.profile.getLastDeviceProfile().getLastAsTableRow());
-         System.out.println(report);
-      }
-   }
-
-   @SuppressWarnings("deprecation")
-   private boolean isDeviceCompatible(Device device) {
-      Kernel.EXECUTION_MODE mode = kernel.getExecutionMode();
-      if (mode != Kernel.EXECUTION_MODE.AUTO) {
-         switch (device.getType()) {
-            case GPU:
-               return mode == Kernel.EXECUTION_MODE.GPU;
-            case CPU:
-               return mode == Kernel.EXECUTION_MODE.CPU;
-            case JTP:
-               return mode == Kernel.EXECUTION_MODE.JTP;
-            case SEQ:
-               return mode == Kernel.EXECUTION_MODE.SEQ;
-            case ACC:
-               return mode == Kernel.EXECUTION_MODE.ACC;
-            default:
-               return false;
-         }
-      } else {
-         return (device == kernel.getTargetDevice());
-      }
-   }
-
-   public int getCancelState() {
-      return inBufferRemoteInt.get(0);
-   }
-
-   public void cancelMultiPass() {
-      inBufferRemoteInt.put(0, CANCEL_STATUS_TRUE);
-   }
-
-   private void clearCancelMultiPass() {
-      inBufferRemoteInt.put(0, CANCEL_STATUS_FALSE);
-   }
-
-   /**
-    * Returns the index of the current pass, or one of two special constants with negative values to indicate special progress states. Those constants are
-    * {@link #PASS_ID_PREPARING_EXECUTION} to indicate that the Kernel has started executing but not reached the initial pass, or
-    * {@link #PASS_ID_COMPLETED_EXECUTION} to indicate that execution is complete (possibly due to early termination via {@link #cancelMultiPass()}), i.e. the Kernel
-    * is idle. {@link #PASS_ID_COMPLETED_EXECUTION} is also returned before the first execution has been invoked.
-    *
-    * <p>This can be used, for instance, to update a visual progress bar.
-    *
-    * @see #execute(String, Range, int)
-    */
-   public int getCurrentPass() {
-      if (!executing) {
-         return PASS_ID_COMPLETED_EXECUTION;
-      }
-
-      if (kernel.isRunningCL()) {
-         return getCurrentPassRemote();
-      } else {
-         return getCurrentPassLocal();
-      }
-   }
-
-   /**
-    * True while any of the {@code execute()} methods are in progress.
-    */
-   public boolean isExecuting() {
-      return executing;
-   }
-
-   protected int getCurrentPassRemote() {
-      return outBufferRemoteInt.get(0);
-   }
-
-   private int getCurrentPassLocal() {
-      return passId;
-   }
-
-   private int getPrimitiveSize(int type) {
-      if ((type & ARG_FLOAT) != 0) {
-         return 4;
-      } else if ((type & ARG_INT) != 0) {
-         return 4;
-      } else if ((type & ARG_BYTE) != 0) {
-         return 1;
-      } else if ((type & ARG_CHAR) != 0) {
-         return 2;
-      } else if ((type & ARG_BOOLEAN) != 0) {
-         return 1;
-      } else if ((type & ARG_SHORT) != 0) {
-         return 2;
-      } else if ((type & ARG_LONG) != 0) {
-         return 8;
-      } else if ((type & ARG_DOUBLE) != 0) {
-         return 8;
-      }
-      return 0;
-   }
-
-   private void setMultiArrayType(KernelArg arg, Class<?> type) throws AparapiException {
-      arg.setType(arg.getType() | (ARG_WRITE | ARG_READ | ARG_APARAPI_BUFFER));
-      int numDims = 0;
-      while (type.getName().startsWith("[[[[")) {
-         throw new AparapiException("Aparapi only supports 2D and 3D arrays.");
-      }
-      arg.setType(arg.getType() | ARG_ARRAYLENGTH);
-      while (type.getName().charAt(numDims) == '[') {
-         numDims++;
-      }
-      arg.setNumDims(numDims);
-      arg.setJavaBuffer(null); // will get updated in updateKernelArrayRefs
-      arg.setArray(null); // will get updated in updateKernelArrayRefs
-
-      Class<?> elementType = arg.getField().getType();
-      while (elementType.isArray()) {
-         elementType = elementType.getComponentType();
-      }
-
-      if (elementType.isAssignableFrom(float.class)) {
-         arg.setType(arg.getType() | ARG_FLOAT);
-      } else if (elementType.isAssignableFrom(int.class)) {
-         arg.setType(arg.getType() | ARG_INT);
-      } else if (elementType.isAssignableFrom(boolean.class)) {
-         arg.setType(arg.getType() | ARG_BOOLEAN);
-      } else if (elementType.isAssignableFrom(byte.class)) {
-         arg.setType(arg.getType() | ARG_BYTE);
-      } else if (elementType.isAssignableFrom(char.class)) {
-         arg.setType(arg.getType() | ARG_CHAR);
-      } else if (elementType.isAssignableFrom(double.class)) {
-         arg.setType(arg.getType() | ARG_DOUBLE);
-      } else if (elementType.isAssignableFrom(long.class)) {
-         arg.setType(arg.getType() | ARG_LONG);
-      } else if (elementType.isAssignableFrom(short.class)) {
-         arg.setType(arg.getType() | ARG_SHORT);
-      }
-   }
-
-   private final Set<Object> puts = new HashSet<Object>();
-
-   /**
-    * Enqueue a request to return this array from the GPU. This method blocks until the array is available.
-    * <br/>
-    * Note that <code>Kernel.put(type [])</code> calls will delegate to this call.
-    * <br/>
-    * Package public
-    * 
-    * @param array
-    *          It is assumed that this parameter is indeed an array (of int, float, short etc).
-    * 
-    * @see Kernel#get(int[] arr)
-    * @see Kernel#get(float[] arr)
-    * @see Kernel#get(double[] arr)
-    * @see Kernel#get(long[] arr)
-    * @see Kernel#get(char[] arr)
-    * @see Kernel#get(boolean[] arr)
-    */
-   public void get(Object array) {
-      if (explicit && (kernel.isRunningCL())) {
-        // Only makes sense when we are using OpenCL
-         getJNI(jniContextHandle, array);
-      }
-   }
-
-   public List<ProfileInfo> getProfileInfo() {
-      if (explicit && (kernel.isRunningCL())) {
-         // Only makes sense when we are using OpenCL
-         return (getProfileInfoJNI(jniContextHandle));
-      } else {
-         return (null);
-      }
-   }
-
-   /**
-    * Tag this array so that it is explicitly enqueued before the kernel is executed. <br/>
-    * Note that <code>Kernel.put(type [])</code> calls will delegate to this call. <br/>
-    * Package public
-    * 
-    * @param array
-    *          It is assumed that this parameter is indeed an array (of int, float, short etc).
-    * @see Kernel#put(int[] arr)
-    * @see Kernel#put(float[] arr)
-    * @see Kernel#put(double[] arr)
-    * @see Kernel#put(long[] arr)
-    * @see Kernel#put(char[] arr)
-    * @see Kernel#put(boolean[] arr)
-    */
-
-   public void put(Object array) {
-      if (explicit && (kernel.isRunningCL())) {
-         // Only makes sense when we are using OpenCL
-         puts.add(array);
-      }
-   }
-
-   private boolean explicit = false;
-
-   public void setExplicit(boolean _explicit) {
-      explicit = _explicit;
-   }
-
-   public boolean isExplicit() {
-      return (explicit);
-   }
-
-   private static class ExecutionSettings {
-      final KernelPreferences preferences;
-      final KernelProfile profile;
-      final String entrypoint;
-      Range range;
-      final int passes;
-      final boolean legacyExecutionMode;
-
-      private ExecutionSettings(KernelPreferences preferences, KernelProfile profile, String entrypoint, Range range, int passes, boolean legacyExecutionMode) {
-         this.preferences = preferences;
-         this.profile = profile;
-         this.entrypoint = entrypoint;
-         this.range = range;
-         this.passes = passes;
-         this.legacyExecutionMode = legacyExecutionMode;
-      }
-
-      @Override
-      public String toString() {
-         return "ExecutionSettings{" +
-                 "preferences=" + preferences +
-                 ", profile=" + profile +
-                 ", entrypoint='" + entrypoint + '\'' +
-                 ", range=" + range +
-                 ", passes=" + passes +
-                 ", legacyExecutionMode=" + legacyExecutionMode +
-                 '}';
-      }
-   }
-}