diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/Config.java b/com.amd.aparapi/src/java/com/amd/aparapi/Config.java
index 339ee89e9e482130aa26c93cd1904f72d3026460..fbae39bb67e433a2983a77dc7f7326ed87897a0e 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/Config.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/Config.java
@@ -37,13 +37,11 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit
  */
 package com.amd.aparapi;
 
-import java.util.logging.Handler;
-import java.util.logging.Level;
-import java.util.logging.Logger;
+import com.amd.aparapi.internal.instruction.*;
+import com.amd.aparapi.internal.jni.*;
+import com.amd.aparapi.internal.tool.*;
 
-import com.amd.aparapi.internal.instruction.Instruction;
-import com.amd.aparapi.internal.jni.ConfigJNI;
-import com.amd.aparapi.internal.tool.InstructionViewer;
+import java.util.logging.*;
 
 /**
  * A central location for holding all runtime configurable properties as well as logging configuration.
@@ -99,6 +97,14 @@ public class Config extends ConfigJNI{
     *  
     */
    public static final boolean enableShowGeneratedOpenCL = Boolean.getBoolean(propPkgName + ".enableShowGeneratedOpenCL");
+   
+   /**
+    * Upon exiting the JVM, dumps kernel profiling info to standard out.
+    *
+    *  Usage -Dcom.amd.aparapi.dumpProfilesOnExit={true|false}
+    *  
+    */
+   public static final boolean dumpProfilesOnExit = Boolean.getBoolean(propPkgName + ".dumpProfilesOnExit");
 
    // Pragma/OpenCL codegen related flags
    public static final boolean enableAtomic32 = Boolean.getBoolean(propPkgName + ".enableAtomic32");
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java b/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java
index 708005ccde41337bd9e23bf26fb84043a632e2db..8bead23faddde914beb74a491a53ad23d1d03864 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java
@@ -38,8 +38,9 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit
 package com.amd.aparapi;
 
 import com.amd.aparapi.annotation.Experimental;
+import com.amd.aparapi.device.*;
 import com.amd.aparapi.exception.DeprecatedException;
-import com.amd.aparapi.internal.kernel.KernelRunner;
+import com.amd.aparapi.internal.kernel.*;
 import com.amd.aparapi.internal.model.CacheEnabler;
 import com.amd.aparapi.internal.model.ClassModel.ConstantPool.MethodReferenceEntry;
 import com.amd.aparapi.internal.model.ClassModel.ConstantPool.NameAndTypeEntry;
@@ -47,7 +48,7 @@ import com.amd.aparapi.internal.model.ValueCache;
 import com.amd.aparapi.internal.model.ValueCache.ThrowingValueComputer;
 import com.amd.aparapi.internal.model.ValueCache.ValueComputer;
 import com.amd.aparapi.internal.opencl.OpenCLLoader;
-import com.amd.aparapi.internal.util.UnsafeWrapper;
+import com.amd.aparapi.internal.util.*;
 
 import java.lang.annotation.Annotation;
 import java.lang.annotation.ElementType;
@@ -55,14 +56,7 @@ import java.lang.annotation.Retention;
 import java.lang.annotation.RetentionPolicy;
 import java.lang.annotation.Target;
 import java.lang.reflect.Method;
-import java.util.ArrayDeque;
-import java.util.Arrays;
-import java.util.Deque;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.LinkedHashSet;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
 import java.util.concurrent.BrokenBarrierException;
 import java.util.concurrent.CyclicBarrier;
 import java.util.logging.Logger;
@@ -314,7 +308,13 @@ public abstract class Kernel implements Cloneable {
    }
 
    /**
-    * The <i>execution mode</i> ENUM enumerates the possible modes of executing a kernel. 
+    * @deprecated It is no longer recommended that {@code EXECUTION_MODE}s are used, as a more sophisticated {@link com.amd.aparapi.device.Device}
+    * preference mechanism is in place, see {@link com.amd.aparapi.internal.kernel.KernelManager}. Though {@link #setExecutionMode(EXECUTION_MODE)}
+    * is still honored, the default EXECUTION_MODE is now {@link EXECUTION_MODE#AUTO}, which indicates that the KernelManager
+    * will determine execution behaviours.
+    *
+    * <p>
+    * The <i>execution mode</i> ENUM enumerates the possible modes of executing a kernel.
     * One can request a mode of execution using the values below, and query a kernel after it first executes to 
     * determine how it executed.  
     *    
@@ -354,8 +354,12 @@ public abstract class Kernel implements Cloneable {
     * @author  gfrost AMD Javalabs
     * @version Alpha, 21/09/2010
     */
-
+   @Deprecated
    public static enum EXECUTION_MODE {
+      /**
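+       * Indicates that the {@link com.amd.aparapi.internal.kernel.KernelManager} will determine execution behaviours.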
+       */
+      AUTO,
       /**
        * A dummy value to indicate an unknown state.
        */
@@ -389,27 +393,9 @@ public abstract class Kernel implements Cloneable {
        */
       ACC;
 
-      static EXECUTION_MODE getDefaultExecutionMode() {
-         EXECUTION_MODE defaultExecutionMode = OpenCLLoader.isOpenCLAvailable() ? GPU : JTP;
-         final String executionMode = Config.executionMode;
-         if (executionMode != null) {
-            try {
-               EXECUTION_MODE requestedExecutionMode;
-               requestedExecutionMode = getExecutionModeFromString(executionMode).iterator().next();
-               logger.fine("requested execution mode =");
-               if ((OpenCLLoader.isOpenCLAvailable() && requestedExecutionMode.isOpenCL()) || !requestedExecutionMode.isOpenCL()) {
-                  defaultExecutionMode = requestedExecutionMode;
-               }
-            } catch (final Throwable t) {
-               // we will take the default
-            }
-         }
-
-         logger.fine("default execution modes = " + defaultExecutionMode);
-
-         return (defaultExecutionMode);
-      }
-
+      /**
+       * @deprecated See {@link EXECUTION_MODE}.
+       */
       static LinkedHashSet<EXECUTION_MODE> getDefaultExecutionModes() {
          LinkedHashSet<EXECUTION_MODE> defaultExecutionModes = new LinkedHashSet<EXECUTION_MODE>();
 
@@ -956,6 +942,26 @@ public abstract class Kernel implements Cloneable {
     */
    public abstract void run();
 
+   /** False by default. In the event that all preferred devices fail to execute a kernel, it is possible to supply an alternate (possibly non-parallel)
+    * execution algorithm by overriding this method to return true, and overriding {@link #executeFallbackAlgorithm(Range, int)} with the alternate
+    * algorithm.
+    */
+   public boolean hasFallbackAlgorithm() {
+      return false;
+   }
+
+   /** If {@link #hasFallbackAlgorithm()} has been overridden to return true, this method should be overridden so as to
+    * apply a single pass of the kernel's logic to the entire _range.
+    *
+    * <p>
+    * This is not normally required, as fallback to {@link JavaDevice#THREAD_POOL} will implement the algorithm in parallel. However
+    * in the event that thread pool execution may be prohibitively slow, this method might implement a "quick and dirty" approximation
+    * to the desired result (for example, a simple box-blur as opposed to a gaussian blur in an image processing application).
+    */
+   public void executeFallbackAlgorithm(Range _range, int _passId) {
+      // nothing
+   }
+
    /**
     * Invoking this method flags that once the current pass is complete execution should be abandoned. Due to the complexity of intercommunication
     * between java (or C) and executing OpenCL, this is the best we can do for general cancellation of execution at present. OpenCL 2.0 should introduce
@@ -1930,26 +1936,29 @@ public abstract class Kernel implements Cloneable {
       return kernelState;
    }
 
+   private KernelRunner prepareKernelRunner() {
+      if (kernelRunner == null) {
+         kernelRunner = new KernelRunner(this);
+      }
+      return kernelRunner;
+   }
+
    /**
     * Determine the execution time of the previous Kernel.execute(range) call.
-    * 
-    * Note that for the first call this will include the conversion time. 
-    * 
-    * @return The time spent executing the kernel (ms) 
-    * 
+    *
+    * Note that for the first call this will include the conversion time.
+    *
+    * @return The time spent executing the kernel (ms)
+    *
     * @see #getConversionTime();
     * @see #getAccumulatedExecutionTime();
-    * 
+    *
     */
-   public synchronized long getExecutionTime() {
-      return prepareKernelRunner().getExecutionTime();
-   }
-
-   private KernelRunner prepareKernelRunner() {
-      if (kernelRunner == null) {
-         kernelRunner = new KernelRunner(this);
+   public double getExecutionTime() {
+      KernelProfile profile = KernelManager.instance().getProfile(getClass());
+      synchronized (profile) {
+         return profile.getLastExecutionTime();
       }
-      return kernelRunner;
    }
 
    /**
@@ -1963,8 +1972,11 @@ public abstract class Kernel implements Cloneable {
     * @see #getConversionTime();
     * 
     */
-   public synchronized long getAccumulatedExecutionTime() {
-      return prepareKernelRunner().getAccumulatedExecutionTime();
+   public double getAccumulatedExecutionTime() {
+      KernelProfile profile = KernelManager.instance().getProfile(getClass());
+      synchronized (profile) {
+         return profile.getAccumulatedTotalTime();
+      }
    }
 
    /**
@@ -1974,8 +1986,11 @@ public abstract class Kernel implements Cloneable {
     * @see #getExecutionTime();
     * @see #getAccumulatedExecutionTime();
     */
-   public synchronized long getConversionTime() {
-      return prepareKernelRunner().getConversionTime();
+   public double getConversionTime() {
+      KernelProfile profile = KernelManager.instance().getProfile(getClass());
+      synchronized (profile) {
+         return profile.getLastConversionTime();
+      }
    }
 
    /**
@@ -1992,10 +2007,30 @@ public abstract class Kernel implements Cloneable {
       return (execute(_range, 1));
    }
 
+   @Override
+   @SuppressWarnings("deprecation")
+   public String toString() {
+      if (executionMode == EXECUTION_MODE.AUTO) {
+         List<Device> preferredDevices = KernelManager.instance().getPreferences(this).getPreferredDevices(this);
+         StringBuilder preferredDevicesSummary = new StringBuilder("{");
+         for (int i = 0; i < preferredDevices.size(); ++i) {
+            Device device = preferredDevices.get(i);
+            preferredDevicesSummary.append(device.getShortDescription());
+            if (i < preferredDevices.size() - 1) {
+               preferredDevicesSummary.append("|");
+            }
+         }
+         preferredDevicesSummary.append("}");
+         return Reflection.getSimpleName(getClass()) + ", devices=" + preferredDevicesSummary.toString();
+      } else {
+         return Reflection.getSimpleName(getClass()) + ", modes=" + executionModes + ", current = " + executionMode;
+      }
+   }
+
    /**
     * Start execution of <code>_range</code> kernels.
     * <p>
-    * When <code>kernel.execute(_range)</code> is invoked, Aparapi will schedule the execution of <code>_range</code> kernels. If the execution mode is GPU then 
+    * When <code>kernel.execute(_range)</code> is invoked, Aparapi will schedule the execution of <code>_range</code> kernels. If the execution mode is GPU then
     * the kernels will execute as OpenCL code on the GPU device. Otherwise, if the mode is JTP, the kernels will execute as a pool of Java threads on the CPU. 
     * <p>
     * Since adding the new <code>Range class</code> this method offers backward compatibility and merely defers to <code> return (execute(Range.create(_range), 1));</code>.
@@ -2004,7 +2039,18 @@ public abstract class Kernel implements Cloneable {
     * 
     */
    public synchronized Kernel execute(int _range) {
-      return (execute(Range.create(_range), 1));
+      return (execute(createRange(_range), 1));
+   }
+
+   @SuppressWarnings("deprecation")
+   protected Range createRange(int _range) {
+      if (executionMode.equals(EXECUTION_MODE.AUTO)) {
+         Device device = getTargetDevice();
+         Range range = Range.create(device, _range);
+         return range;
+      } else {
+         return Range.create(null, _range);
+      }
    }
 
    /**
@@ -2033,21 +2079,7 @@ public abstract class Kernel implements Cloneable {
     * 
     */
    public synchronized Kernel execute(int _range, int _passes) {
-      return (execute(Range.create(_range), _passes));
-   }
-
-   /**
-    * Start execution of <code>globalSize</code> kernels for the given entrypoint.
-    * <p>
-    * When <code>kernel.execute("entrypoint", globalSize)</code> is invoked, Aparapi will schedule the execution of <code>globalSize</code> kernels. If the execution mode is GPU then 
-    * the kernels will execute as OpenCL code on the GPU device. Otherwise, if the mode is JTP, the kernels will execute as a pool of Java threads on the CPU. 
-    * <p>
-    * @param _entry is the name of the method we wish to use as the entrypoint to the kernel
-    * @return The Kernel instance (this) so we can chain calls to put(arr).execute(range).get(arr)
-    * 
-    */
-   public synchronized Kernel execute(Entry _entry, Range _range) {
-      return prepareKernelRunner().execute(_entry, _range, 1);
+      return (execute(createRange(_range), _passes));
    }
 
    /**
@@ -2093,7 +2125,22 @@ public abstract class Kernel implements Cloneable {
       }
    }
 
+   public boolean isRunningCL() {
+      return getTargetDevice() instanceof OpenCLDevice;
+   }
+
+   public final Device getTargetDevice() {
+      return KernelManager.instance().getPreferences(this).getPreferredDevice(this);
+   }
+
+   /** @return true by default, may be overridden to allow vetoing of a device or devices by a given Kernel instance. */
+   public boolean isAllowDevice(Device _device) {
+      return true;
+   }
+
    /**
+    * @deprecated See {@link EXECUTION_MODE}
+    * <p>
     * Return the current execution mode.  
     * 
     * Before a Kernel executes, this return value will be the execution mode as determined by the setting of 
@@ -2108,11 +2155,14 @@ public abstract class Kernel implements Cloneable {
     * 
     * @see #setExecutionMode(EXECUTION_MODE)
     */
+   @Deprecated
    public EXECUTION_MODE getExecutionMode() {
       return (executionMode);
    }
 
    /**
+    * @deprecated See {@link EXECUTION_MODE}
+    * <p>
     * Set the execution mode. 
     * <p>
     * This should be regarded as a request. The real mode will be determined at runtime based on the availability of OpenCL and the characteristics of the workload.
@@ -2121,10 +2171,15 @@ public abstract class Kernel implements Cloneable {
     * 
     * @see #getExecutionMode()
     */
+   @Deprecated
    public void setExecutionMode(EXECUTION_MODE _executionMode) {
       executionMode = _executionMode;
    }
 
+   /**
+    * @deprecated See {@link EXECUTION_MODE}
+    */
+   @Deprecated
    public void setFallbackExecutionMode() {
       executionMode = EXECUTION_MODE.getFallbackExecutionMode();
    }
@@ -2718,13 +2773,24 @@ public abstract class Kernel implements Cloneable {
       return prepareKernelRunner().getProfileInfo();
    }
 
-   private final LinkedHashSet<EXECUTION_MODE> executionModes = EXECUTION_MODE.getDefaultExecutionModes();
+   /**
+    * @deprecated See {@link EXECUTION_MODE}.
+    */
+   private final LinkedHashSet<EXECUTION_MODE> executionModes = (Config.executionMode != null) ? EXECUTION_MODE.getDefaultExecutionModes() :  new LinkedHashSet<>(Collections.singleton(EXECUTION_MODE.AUTO));
 
+   /**
+    * @deprecated See {@link EXECUTION_MODE}.
+    */
    private Iterator<EXECUTION_MODE> currentMode = executionModes.iterator();
 
+   /**
+    * @deprecated See {@link EXECUTION_MODE}.
+    */
    private EXECUTION_MODE executionMode = currentMode.next();
 
    /**
+    * @deprecated See {@link EXECUTION_MODE}.
+    * <p>
     * set possible fallback path for execution modes.
     * for example setExecutionFallbackPath(GPU,CPU,JTP) will try to use the GPU
     * if it fails it will fall back to OpenCL CPU and finally it will try JTP.
@@ -2736,6 +2802,7 @@ public abstract class Kernel implements Cloneable {
    }
 
    /**
+    * @deprecated See {@link EXECUTION_MODE}.
     * @return is there another execution path we can try
     */
    public boolean hasNextExecutionMode() {
@@ -2743,6 +2810,7 @@ public abstract class Kernel implements Cloneable {
    }
 
    /**
+    * @deprecated See {@link EXECUTION_MODE}.
     * try the next execution path in the list if there aren't any more than give up
     */
    public void tryNextExecutionMode() {
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/Range.java b/com.amd.aparapi/src/java/com/amd/aparapi/Range.java
index 5fb435a46b0535215833f1dea888fa4934db17bd..75db2c245b680a1e4f4b9d134a07f048292755d6 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/Range.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/Range.java
@@ -1,9 +1,9 @@
 package com.amd.aparapi;
 
-import java.util.Arrays;
+import com.amd.aparapi.device.*;
+import com.amd.aparapi.internal.jni.*;
 
-import com.amd.aparapi.device.Device;
-import com.amd.aparapi.internal.jni.RangeJNI;
+import java.util.*;
 
 /**
  * 
@@ -56,7 +56,7 @@ public class Range extends RangeJNI{
    public static final int MAX_GROUP_SIZE = Math.max(Runtime.getRuntime().availableProcessors() * THREADS_PER_CORE,
          MAX_OPENCL_GROUP_SIZE);
 
-   private Device device = null;
+   private OpenCLDevice device = null;
 
    private int maxWorkGroupSize;
 
@@ -73,7 +73,7 @@ public class Range extends RangeJNI{
     * @param _dims
     */
    public Range(Device _device, int _dims) {
-      device = _device;
+      device = !(_device instanceof OpenCLDevice) ? null : (OpenCLDevice) _device;
       dims = _dims;
 
       if (device != null) {
@@ -317,7 +317,7 @@ public class Range extends RangeJNI{
     * For example for <code>MAX_GROUP_SIZE</code> of 64 we favor 4x4x4 over 1x16x16.
     * 
     * @param _globalWidth the width of the 3D grid we wish to process
-    * @param _globalHieght the height of the 3D grid we wish to process
+    * @param _globalHeight the height of the 3D grid we wish to process
     * @param _globalDepth the depth of the 3D grid we wish to process
     * @return
     */
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/device/Device.java b/com.amd.aparapi/src/java/com/amd/aparapi/device/Device.java
index a4bfcdeb9d6411ce52e9593e41d2fd9f3294a9eb..c3790880b8278ac5689b02e0da67fcb1b934e1e1 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/device/Device.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/device/Device.java
@@ -1,79 +1,76 @@
 package com.amd.aparapi.device;
 
-import com.amd.aparapi.Range;
-import com.amd.aparapi.device.OpenCLDevice.DeviceComparitor;
-import com.amd.aparapi.device.OpenCLDevice.DeviceSelector;
+import com.amd.aparapi.*;
+import com.amd.aparapi.internal.kernel.*;
 
 public abstract class Device{
 
    public static enum TYPE {
-      UNKNOWN,
-      GPU,
-      CPU,
-      JTP,
-      SEQ,
-      ACC
+      UNKNOWN(Integer.MAX_VALUE),
+      GPU(2),
+      CPU(3),
+      JTP(5),
+      SEQ(6),
+      ACC(1),
+      ALT(4);
+
+      /** Heuristic ranking of device types, lower is better. */
+      public final int rank;
+
+      TYPE(int rank) {
+         this.rank = rank;
+      }
    };
 
-   /**
-    * @return Now return the device of any types having the maximum compute units
+   /** @deprecated  use {@link KernelManager#bestDevice()}
+    *  @see com.amd.aparapi.device
     */
+   @Deprecated
    public static Device best() {
-      return (OpenCLDevice.select(new DeviceComparitor(){
-         @Override public OpenCLDevice select(OpenCLDevice _deviceLhs, OpenCLDevice _deviceRhs) {
-            if (_deviceLhs.getMaxComputeUnits() > _deviceRhs.getMaxComputeUnits()) {
-               return (_deviceLhs);
-            } else {
-               return (_deviceRhs);
-            }
-         }
-      }));
+      return KernelManager.instance().bestDevice();
    }
 
+   /**
+    *  @see com.amd.aparapi.device
+    */
+   @SuppressWarnings("deprecation")
+   @Deprecated
    public static Device bestGPU() {
-      return (OpenCLDevice.select(new DeviceComparitor(){
-         @Override public OpenCLDevice select(OpenCLDevice _deviceLhs, OpenCLDevice _deviceRhs) {
-            if (_deviceLhs.getMaxComputeUnits() > _deviceRhs.getMaxComputeUnits()) {
-               return (_deviceLhs);
-            } else {
-               return (_deviceRhs);
-            }
-         }
-      }, Device.TYPE.GPU));
-   }
-
-   public static Device bestACC() {
-      return (OpenCLDevice.select(new DeviceComparitor(){
-         @Override public OpenCLDevice select(OpenCLDevice _deviceLhs, OpenCLDevice _deviceRhs) {
-            if (_deviceLhs.getMaxComputeUnits() > _deviceRhs.getMaxComputeUnits()) {
-               return (_deviceLhs);
-            } else {
-               return (_deviceRhs);
-            }
-         }
-      }, Device.TYPE.ACC));
+      return firstGPU();
    }
 
+   /**
+    *  @see com.amd.aparapi.device
+    */
+   @Deprecated
    public static Device first(final Device.TYPE _type) {
-      return (OpenCLDevice.select(new DeviceSelector(){
-         @Override public OpenCLDevice select(OpenCLDevice _device) {
-            return (_device.getType() == _type ? _device : null);
-         }
-      }));
+      return KernelManager.DeprecatedMethods.firstDevice(_type);
    }
 
+   /**
+    *  @see com.amd.aparapi.device
+    */
+   @SuppressWarnings("deprecation")
+   @Deprecated
    public static Device firstGPU() {
-      return (first(Device.TYPE.GPU));
+      return KernelManager.DeprecatedMethods.firstDevice(TYPE.GPU);
    }
 
+   /**
+    *  @see com.amd.aparapi.device
+    */
+   @SuppressWarnings("deprecation")
+   @Deprecated
    public static Device firstCPU() {
-      return (first(Device.TYPE.CPU));
-
+      return KernelManager.DeprecatedMethods.firstDevice(TYPE.CPU);
    }
 
-   public static Device firstACC() {
-      return (first(Device.TYPE.ACC));
-
+   /**
+    *  @see com.amd.aparapi.device
+    */
+   @Deprecated
+   public static Device bestACC() {
+      throw new UnsupportedOperationException();
    }
 
    protected TYPE type = TYPE.UNKNOWN;
@@ -88,6 +85,8 @@ public abstract class Device{
          0
    };
 
+   public abstract String getShortDescription();
+
    public TYPE getType() {
       return type;
    }
@@ -144,4 +143,25 @@ public abstract class Device{
          int _localDepth) {
       return (Range.create3D(this, _globalWidth, _globalHeight, _globalDepth, _localWidth, _localHeight, _localDepth));
    }
+
+   public abstract long getDeviceId();
+
+   @Override
+   public boolean equals(Object o) {
+      if (this == o) {
+         return true;
+      }
+      if (!(o instanceof Device)) {
+         return false;
+      }
+
+      Device device = (Device) o;
+
+      return getDeviceId() == device.getDeviceId();
+   }
+
+   @Override
+   public int hashCode() {
+      return Long.hashCode(getDeviceId());
+   }
 }
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/device/JavaDevice.java b/com.amd.aparapi/src/java/com/amd/aparapi/device/JavaDevice.java
index 78082e77d74d0512dd91df791b130b7beec75bf8..33f5cd4d22e02c6b7f31dc731995e2f906c5fda6 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/device/JavaDevice.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/device/JavaDevice.java
@@ -1,5 +1,32 @@
 package com.amd.aparapi.device;
 
-public class JavaDevice extends Device{
+public class JavaDevice extends Device {
 
+   public static final JavaDevice THREAD_POOL = new JavaDevice(TYPE.JTP, "Java Thread Pool", -3);
+   public static final JavaDevice ALTERNATIVE_ALGORITHM = new JavaDevice(TYPE.ALT, "Java Alternative Algorithm", -2);
+   public static final JavaDevice SEQUENTIAL = new JavaDevice(TYPE.SEQ, "Java Sequential", -1);
+
+   private final String name;
+   private final long deviceId;
+
+   private JavaDevice(TYPE _type, String _name, long deviceId) {
+      this.deviceId = deviceId;
+      this.type = _type;
+      this.name = _name;
+   }
+
+   @Override
+   public String getShortDescription() {
+      return name;
+   }
+
+   @Override
+   public long getDeviceId() {
+      return deviceId;
+   }
+
+   @Override
+   public String toString() {
+      return getShortDescription();
+   }
 }
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/device/OpenCLDevice.java b/com.amd.aparapi/src/java/com/amd/aparapi/device/OpenCLDevice.java
index 61bfe548a2b292191f91de30bc77f74a70a3b615..ce196121488778aaee505202071d933a181d46c8 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/device/OpenCLDevice.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/device/OpenCLDevice.java
@@ -1,34 +1,15 @@
 package com.amd.aparapi.device;
 
-import com.amd.aparapi.ProfileInfo;
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.lang.annotation.Annotation;
-import java.lang.reflect.InvocationHandler;
-import java.lang.reflect.Method;
-import java.lang.reflect.Proxy;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import com.amd.aparapi.Range;
-import com.amd.aparapi.internal.opencl.OpenCLArgDescriptor;
-import com.amd.aparapi.internal.opencl.OpenCLKernel;
-import com.amd.aparapi.internal.opencl.OpenCLPlatform;
-import com.amd.aparapi.internal.opencl.OpenCLProgram;
-import com.amd.aparapi.opencl.OpenCL;
-import com.amd.aparapi.opencl.OpenCL.Arg;
-import com.amd.aparapi.opencl.OpenCL.Constant;
-import com.amd.aparapi.opencl.OpenCL.GlobalReadOnly;
-import com.amd.aparapi.opencl.OpenCL.GlobalReadWrite;
-import com.amd.aparapi.opencl.OpenCL.GlobalWriteOnly;
+import com.amd.aparapi.*;
+import com.amd.aparapi.internal.opencl.*;
+import com.amd.aparapi.opencl.*;
+import com.amd.aparapi.opencl.OpenCL.*;
 import com.amd.aparapi.opencl.OpenCL.Kernel;
-import com.amd.aparapi.opencl.OpenCL.Local;
-import com.amd.aparapi.opencl.OpenCL.Resource;
-import com.amd.aparapi.opencl.OpenCL.Source;
+
+import java.io.*;
+import java.lang.annotation.*;
+import java.lang.reflect.*;
+import java.util.*;
 
 public class OpenCLDevice extends Device{
 
@@ -44,6 +25,8 @@ public class OpenCLDevice extends Device{
 
    private long maxMemAllocSize;
 
+   private String shortDescription = null;
+
    /**
     * Minimal constructor
     * 
@@ -101,6 +84,18 @@ public class OpenCLDevice extends Device{
       return (deviceId);
    }
 
+   @Override
+   public String getShortDescription() {
+      if (shortDescription == null) {
+         String vendor = platform.getName();
+         // Hopefully(!) this equates to the recognisable name of the vendor, e.g. "Intel", "NVIDIA", "AMD"
+         // Note, it is not necessarily the hardware vendor, e.g. if the AMD CPU driver (i.e. platform) is used for an Intel CPU, this will be "AMD"
+         String[] split = vendor.split("[\\s\\(\\)]"); // split on whitespace or on '(' or ')' since Intel use "Intel(R)" here
+         shortDescription = split[0] + "<" + getType() + ">";
+      }
+      return shortDescription;
+   }
+
    public static class OpenCLInvocationHandler<T extends OpenCL<T>> implements InvocationHandler{
       private final Map<String, OpenCLKernel> map;
 
@@ -380,8 +375,6 @@ public class OpenCLDevice extends Device{
          }
       }
 
-      // System.out.println("opencl{\n" + _source + "\n}opencl");
-
       final OpenCLProgram program = new OpenCLProgram(this, _source).createProgram(this);
 
       final Map<String, OpenCLKernel> map = new HashMap<String, OpenCLKernel>();
@@ -412,6 +405,22 @@ public class OpenCLDevice extends Device{
       OpenCLDevice select(OpenCLDevice _deviceLhs, OpenCLDevice _deviceRhs);
    }
 
+   /** List OpenCLDevices of a given TYPE, or all OpenCLDevices if type == null. */
+   public static List<OpenCLDevice> listDevices(TYPE type) {
+      final OpenCLPlatform platform = new OpenCLPlatform(0, null, null, null);
+      final ArrayList<OpenCLDevice> results = new ArrayList<>();
+
+      for (final OpenCLPlatform p : platform.getOpenCLPlatforms()) {
+         for (final OpenCLDevice device : p.getOpenCLDevices()) {
+            if (type == null || device.getType() == type) {
+               results.add(device);
+            }
+         }
+      }
+
+      return results;
+   }
+
    public static OpenCLDevice select(DeviceSelector _deviceSelector) {
       OpenCLDevice device = null;
       final OpenCLPlatform platform = new OpenCLPlatform(0, null, null, null);
@@ -435,8 +444,10 @@ public class OpenCLDevice extends Device{
       OpenCLDevice device = null;
       final OpenCLPlatform platform = new OpenCLPlatform(0, null, null, null);
 
-      for (final OpenCLPlatform p : platform.getOpenCLPlatforms()) {
-         for (final OpenCLDevice d : p.getOpenCLDevices()) {
+      List<OpenCLPlatform> openCLPlatforms = platform.getOpenCLPlatforms();
+      for (final OpenCLPlatform p : openCLPlatforms) {
+         List<OpenCLDevice> openCLDevices = p.getOpenCLDevices();
+         for (final OpenCLDevice d : openCLDevices) {
             if (device == null) {
                device = d;
             } else {
@@ -466,7 +477,6 @@ public class OpenCLDevice extends Device{
       return (device);
    }
 
-
    @Override public String toString() {
       final StringBuilder s = new StringBuilder("{");
       boolean first = true;
@@ -482,7 +492,8 @@ public class OpenCLDevice extends Device{
 
       s.append("}");
 
-      return ("Device " + deviceId + "\n  type:" + type + "\n  maxComputeUnits=" + maxComputeUnits + "\n  maxWorkItemDimensions="
+      return ("Device " + deviceId + "\n  vendor = " + getOpenCLPlatform().getVendor()
+            + "\n  type:" + type + "\n  maxComputeUnits=" + maxComputeUnits + "\n  maxWorkItemDimensions="
             + maxWorkItemDimensions + "\n  maxWorkItemSizes=" + s + "\n  maxWorkWorkGroupSize=" + maxWorkGroupSize
             + "\n  globalMemSize=" + globalMemSize + "\n  localMemSize=" + localMemSize);
    }
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/device/package-info.java b/com.amd.aparapi/src/java/com/amd/aparapi/device/package-info.java
index babe06a6e60af0c368db8ffce5c5bfb85a16fd6b..039f1883909ce1a6b8934baa634a17763d44869f 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/device/package-info.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/device/package-info.java
@@ -1,4 +1,19 @@
 /**
+ * Contains classes representing OpenCL-capable devices, and "virtual" (java) devices which execute kernels using java.
+ *
+ * <p>Various methods of {@link com.amd.aparapi.device.Device} which selected devices of a particular type have been deprecated,
+ * as now the preferred mechanism for device selection is to rely on the {@link com.amd.aparapi.internal.kernel.KernelManager} to
+ * select an appropriate device. Where a particular device is required to be used for a certain kernel, for such purposes as
+ * debugging or unit testing, this can be achieved by using
+ * {@link com.amd.aparapi.internal.kernel.KernelManager#setKernelManager(com.amd.aparapi.internal.kernel.KernelManager)} prior to
+ * invoking any Kernel executions, by overriding {@link com.amd.aparapi.Kernel#isAllowDevice(com.amd.aparapi.device.Device)}
+ * to veto/approve devices from the available devices for a given Kernel class, or (not recommended) by using
+ * {@link com.amd.aparapi.internal.kernel.KernelManager#setPreferredDevices(com.amd.aparapi.Kernel, java.util.LinkedHashSet)} to specify
+ * a particular device list for a given Kernel class.
+ *
+ * <p>In order to determine the Device which will be used to execute a particular Kernel, use {@link com.amd.aparapi.Kernel#getTargetDevice()}.
+ * This can also be used immediately after execution to see on which device the kernel actually got executed (in case the execution failed
+ * and fell back to another device).
  *
  */
 package com.amd.aparapi.device;
\ No newline at end of file
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelArg.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelArg.java
index 9599bf0469be570c30199d78fc409e3cd76ad823..ce34d6d062e3cad65231c82e84cfc6491efa0ef2 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelArg.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelArg.java
@@ -1,11 +1,11 @@
 package com.amd.aparapi.internal.kernel;
 
-import java.lang.reflect.Field;
-import java.nio.ByteBuffer;
+import com.amd.aparapi.internal.jni.*;
+import com.amd.aparapi.internal.model.*;
+import com.amd.aparapi.internal.util.*;
 
-import com.amd.aparapi.Kernel;
-import com.amd.aparapi.internal.jni.KernelArgJNI;
-import com.amd.aparapi.internal.model.ClassModel;
+import java.lang.reflect.*;
+import java.nio.*;
 
 /**
  * Each field (or captured field in the case of an anonymous inner class) referenced by any bytecode reachable from the users Kernel.run(), will
@@ -48,7 +48,7 @@ public class KernelArg extends KernelArgJNI{
     * Default constructor
     */
    protected KernelArg() {
-
+      // empty
    }
 
    /**
@@ -260,4 +260,9 @@ public class KernelArg extends KernelArgJNI{
    protected void setDims(int[] dims) {
       this.dims = dims;
    }
+
+   @Override
+   public String toString() {
+      return Reflection.getSimpleName(field.getType()) + " " + field.getName();
+   }
 }
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelDeviceProfile.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelDeviceProfile.java
new file mode 100644
index 0000000000000000000000000000000000000000..87e221ef9060e348a9126c53d1590ff4b3b2eee4
--- /dev/null
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelDeviceProfile.java
@@ -0,0 +1,193 @@
+package com.amd.aparapi.internal.kernel;
+
+import com.amd.aparapi.*;
+import com.amd.aparapi.device.*;
+
+import java.text.*;
+import java.util.*;
+import java.util.logging.*;
+
+/**
+ * Created by Barney on 02/09/2015.
+ */
+public class KernelDeviceProfile {
+
+   private static Logger logger = Logger.getLogger(Config.getLoggerName());
+   private static final double MILLION = 1000 * 1000;
+   private static final int TABLE_COLUMN_HEADER_WIDTH = 21;
+   private static final int TABLE_COLUMN_COUNT_WIDTH = 8;
+   private static final int TABLE_COLUMN_WIDTH;
+   private final Class<? extends Kernel> kernel;
+   private final Device device;
+   private long[] currentTimes = new long[ProfilingEvent.values().length];
+   private long[] accumulatedTimes = new long[ProfilingEvent.values().length];
+   private ProfilingEvent lastEvent = null;
+   private final DecimalFormat format;
+   private long invocationCount = 0;
+
+   static {
+      assert ProfilingEvent.START.ordinal() == 0 : "ProfilingEvent.START.ordinal() != 0";
+      int max = 0;
+      for (ProfilingEvent event : ProfilingEvent.values()) {
+         max = Math.max(max, event.name().length());
+      }
+      TABLE_COLUMN_WIDTH = max + 1;
+   }
+
+   public KernelDeviceProfile(Class<? extends Kernel> kernel, Device device) {
+      this.kernel = kernel;
+      this.device = device;
+      this.format = (DecimalFormat) DecimalFormat.getNumberInstance();
+      format.setMinimumFractionDigits(3);
+      format.setMaximumFractionDigits(3);
+   }
+
+   /**
+    * Records the timestamp of {@code event}; on {@link ProfilingEvent#EXECUTED} the per-stage elapsed times of the
+    * invocation are added to the accumulated totals. Stages skipped since the previous event get zero duration.
+    */
+   public void onEvent(ProfilingEvent event) {
+      if (event == ProfilingEvent.START) {
+         // Test for the duplicate first: the generic non-null test would otherwise always shadow it.
+         if (lastEvent == ProfilingEvent.START) {
+            logger.log(Level.SEVERE, "Duplicate event ProfilingEvent.START");
+         } else if (lastEvent != null) {
+            logger.log(Level.SEVERE, "ProfilingEvent.START encountered without ProfilingEvent.EXECUTED");
+         }
+         Arrays.fill(currentTimes, 0L);
+         ++invocationCount;
+      } else if (lastEvent == null) {
+         if (event != ProfilingEvent.EXECUTED) {
+            logger.log(Level.SEVERE, "ProfilingEvent.START was not invoked prior to ProfilingEvent." + event);
+         }
+      } else {
+         for (int i = lastEvent.ordinal() + 1; i < event.ordinal(); ++i) {
+            currentTimes[i] = currentTimes[i - 1];
+         }
+      }
+      currentTimes[event.ordinal()] = System.nanoTime();
+      if (event == ProfilingEvent.EXECUTED) {
+         for (int i = 1; i < currentTimes.length; ++i) {
+            long elapsed = currentTimes[i] - currentTimes[i - 1];
+            if (elapsed < 0) {
+               logger.log(Level.SEVERE, "negative elapsed time for event " + ProfilingEvent.values()[i]);
+               break;
+            }
+            accumulatedTimes[i] += elapsed;
+         }
+      }
+      lastEvent = (event == ProfilingEvent.EXECUTED) ? null : event; // EXECUTED closes the invocation
+   }
+
+   /** Elapsed time for a single event only, i.e. since the previous stage rather than from the start. */
+   public double getLastElapsedTime(ProfilingEvent stage) {
+      if (stage == ProfilingEvent.START) {
+         return 0;
+      }
+      return (currentTimes[stage.ordinal()] - currentTimes[stage.ordinal() - 1]) / MILLION;
+   }
+
+   /** Elapsed time for all events {@code from} through {@code to}.*/
+   public double getLastElapsedTime(ProfilingEvent from, ProfilingEvent to) {
+      return (currentTimes[to.ordinal()] - currentTimes[from.ordinal()]) / MILLION;
+   }
+
+   /** Elapsed time for a single event only, i.e. since the previous stage rather than from the start, summed over all executions. */
+   public double getCumulativeElapsedTime(ProfilingEvent stage) {
+      return (accumulatedTimes[stage.ordinal()]) / MILLION;
+   }
+
+   /** Elapsed time of entire execution in milliseconds, summed over all executions. */
+   public double getCumulativeElapsedTimeAll() {
+      double sum = 0;
+      for (int i = 1; i <= ProfilingEvent.EXECUTED.ordinal(); ++i) {
+         sum += accumulatedTimes[i];
+      }
+      return sum / MILLION; // accumulatedTimes holds nanoseconds; the other getters all report milliseconds
+   }
+
+   public static String getTableHeader() {
+      int length = ProfilingEvent.values().length;
+      StringBuilder builder = new StringBuilder(150);
+      appendRowHeaders(builder, "Device", "Count");
+      for (int i = 1; i < length; ++i) {
+         ProfilingEvent stage = ProfilingEvent.values()[i];
+         String heading = stage.name();
+         appendCell(builder, heading);
+      }
+      builder.append("  ").append("Total");
+      return builder.toString();
+   }
+
+   public String getLastAsTableRow() {
+      double total = 0;
+      StringBuilder builder = new StringBuilder(150);
+      appendRowHeaders(builder, device.getShortDescription(), String.valueOf(invocationCount));
+      for (int i = 1; i < currentTimes.length; ++i) {
+         ProfilingEvent stage = ProfilingEvent.values()[i];
+         double time = getLastElapsedTime(stage);
+         total += time;
+         String formatted = format.format(time);
+         appendCell(builder, formatted);
+      }
+      builder.append("  ").append(format.format(total));
+      return builder.toString();
+   }
+
+   public String getCumulativeAsTableRow() {
+      return internalCumulativeAsTableRow(false);
+   }
+
+   public String getAverageAsTableRow() {
+      return internalCumulativeAsTableRow(true);
+   }
+
+   private String internalCumulativeAsTableRow(boolean mean) {
+      double total = 0;
+      double count = mean ? invocationCount : 1;
+      StringBuilder builder = new StringBuilder(150);
+      appendRowHeaders(builder, device.getShortDescription(), String.valueOf(invocationCount));
+      for (int i = 1; i < currentTimes.length; ++i) {
+         ProfilingEvent stage = ProfilingEvent.values()[i];
+         double time = getCumulativeElapsedTime(stage);
+         if (mean) {
+            time /= count;
+         }
+         total += time;
+         String formatted = format.format(time);
+         appendCell(builder, formatted);
+      }
+      builder.append("  ").append(format.format(total));
+      return builder.toString();
+   }
+
+   private static void appendRowHeaders(StringBuilder builder, String device, String count) {
+      if (device.length() > TABLE_COLUMN_HEADER_WIDTH - 1) {
+         device = device.substring(0, TABLE_COLUMN_HEADER_WIDTH - 1);
+      }
+      builder.append(device);
+      int padding = TABLE_COLUMN_HEADER_WIDTH - device.length();
+      for (int i = 0; i < padding; ++i) {
+         builder.append(' ');
+      }
+
+      builder.append(count);
+      padding = TABLE_COLUMN_COUNT_WIDTH - count.length();
+      for (int i = 0; i < padding; ++i) {
+         builder.append(' ');
+      }
+   }
+
+   private static void appendCell(StringBuilder builder, String cell) {
+      int padding = TABLE_COLUMN_WIDTH - cell.length();
+      for (int paddingIndex = 0; paddingIndex < padding; ++paddingIndex) {
+         builder.append(' ');
+      }
+      builder.append(cell);
+   }
+
+   @Override
+   public String toString() {
+      return "KernelDeviceProfile{" + kernel.toString() + ", " + device.getShortDescription() + "}";
+   }
+}
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelManager.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelManager.java
new file mode 100644
index 0000000000000000000000000000000000000000..c1f29cbc51c95ed04011346b978f5a1bfd635293
--- /dev/null
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelManager.java
@@ -0,0 +1,300 @@
+package com.amd.aparapi.internal.kernel;
+
+import com.amd.aparapi.*;
+import com.amd.aparapi.device.*;
+import com.amd.aparapi.internal.util.*;
+
+import java.lang.reflect.*;
+import java.util.*;
+
+/**
+ * Created by Barney on 24/08/2015.
+ */
+public class KernelManager {
+
+   private static KernelManager INSTANCE = new KernelManager();
+   private LinkedHashMap<Class<? extends Kernel>, KernelPreferences> preferences = new LinkedHashMap<>();
+   private LinkedHashMap<Class<? extends Kernel>, KernelProfile> profiles = new LinkedHashMap<>();
+   private LinkedHashMap<Class<? extends Kernel>, Kernel> sharedInstances = new LinkedHashMap<>();
+
+   private KernelPreferences defaultPreferences;
+
+   protected KernelManager() {
+      defaultPreferences = createDefaultPreferences();
+   }
+
+   public static KernelManager instance() {
+      return INSTANCE;
+   }
+
+   public static void setKernelManager(KernelManager manager) {
+      INSTANCE = manager;
+   }
+
+   static {
+      if (Config.dumpProfilesOnExit) {
+         Runtime.getRuntime().addShutdownHook(new Thread() {
+            @Override
+            public void run() {
+               StringBuilder builder = new StringBuilder(2048);
+               instance().reportProfilingSummary(builder);
+               System.out.println(builder);
+            }
+         });
+      }
+   }
+
+   /** This method returns a shared instance of a given Kernel subclass. The kernelClass needs a no-args constructor, which
+    *  need not be public.
+    *
+    *  <p>Given that compilation of OpenCL is relatively expensive and that (currently!) there is no caching of compiled OpenCL
+    *  it is desirable to only ever create one instance of any given Kernel subclass, which this method facilitates.</p>
+    *
+    *  <p>In order to maintain thread safety, it is necessary to synchronize on the returned kernel for the duration of the process of setting up,
+    *  executing and extracting the results from that kernel, when using a shared instance.</p>
+    *
+    *  @throws RuntimeException if the class cannot be instantiated
+    */
+   public static <T extends Kernel> T sharedKernelInstance(Class<T> kernelClass) {
+       return instance().getSharedKernelInstance(kernelClass);
+   }
+
+   /** Append a report to {@code builder} which contains information, per Kernel subclass, on which device is currently being used for the
+    * kernel class, and which (if any) devices failed to execute a given Kernel.
+    */
+   public void reportDeviceUsage(StringBuilder builder, boolean withProfilingInfo) {
+      builder.append("Device Usage by Kernel Subclass");
+      if (withProfilingInfo) {
+         builder.append(" (showing mean elapsed times in milliseconds)");
+      }
+      builder.append("\n\n");
+      for (Class<? extends Kernel> klass : preferences.keySet()) {
+         KernelPreferences preferences = this.preferences.get(klass);
+         KernelProfile profile = withProfilingInfo ? profiles.get(klass) : null;
+         builder.append(klass.getName()).append(":\n\tusing ").append(preferences.getPreferredDevice(null).getShortDescription());
+         List<Device> failedDevices = preferences.getFailedDevices();
+         if (failedDevices.size() > 0) {
+            builder.append(", failed devices = ");
+            for (int i = 0; i < failedDevices.size(); ++i) {
+               builder.append(failedDevices.get(i).getShortDescription());
+               if (i < failedDevices.size() - 1) {
+                  builder.append(" | ");
+               }
+            }
+         }
+         if (profile != null) {
+            builder.append("\n");
+            int row = 0;
+            for (KernelDeviceProfile deviceProfile : profile.getDeviceProfiles()) {
+               if (row == 0) {
+                  builder.append(deviceProfile.getTableHeader()).append("\n");
+               }
+               builder.append(deviceProfile.getAverageAsTableRow()).append("\n");
+               ++row;
+            }
+         }
+         builder.append("\n");
+      }
+   }
+
+   public void reportProfilingSummary(StringBuilder builder) {
+      builder.append("\nProfiles by Kernel Subclass (mean elapsed times in milliseconds)\n\n");
+      builder.append(KernelDeviceProfile.getTableHeader()).append("\n");
+      for (Class<? extends Kernel> kernelClass : profiles.keySet()) {
+         String simpleName = Reflection.getSimpleName(kernelClass);
+         String kernelName = "----------------- [[ " + simpleName + " ]] ";
+         builder.append(kernelName);
+         int dashes = 132 - kernelName.length();
+         for (int i = 0; i < dashes; ++i) {
+            builder.append('-');
+         }
+         builder.append("\n");
+         KernelProfile kernelProfile = profiles.get(kernelClass);
+         for (KernelDeviceProfile deviceProfile : kernelProfile.getDeviceProfiles()) {
+            builder.append(deviceProfile.getAverageAsTableRow()).append("\n");
+         }
+      }
+   }
+
+
+   public KernelPreferences getPreferences(Kernel kernel) {
+      synchronized (preferences) {
+         KernelPreferences kernelPreferences = preferences.get(kernel.getClass());
+         if (kernelPreferences == null) {
+            kernelPreferences = new KernelPreferences(this, kernel.getClass());
+            preferences.put(kernel.getClass(), kernelPreferences);
+         }
+         return kernelPreferences;
+      }
+   }
+
+   public void setPreferredDevices(Kernel _kernel, LinkedHashSet<Device> _devices) {
+      KernelPreferences kernelPreferences = getPreferences(_kernel);
+      kernelPreferences.setPreferredDevices(_devices);
+   }
+
+   public KernelPreferences getDefaultPreferences() {
+      return defaultPreferences;
+   }
+
+   public void setDefaultPreferredDevices(LinkedHashSet<Device> _devices) {
+      defaultPreferences.setPreferredDevices(_devices);
+   }
+
+   protected KernelPreferences createDefaultPreferences() {
+      KernelPreferences preferences = new KernelPreferences(this, null);
+      preferences.setPreferredDevices(createDefaultPreferredDevices());
+      return preferences;
+   }
+
+   /** Returns the shared instance of {@code kernelClass}, created on first use via its (possibly non-public) no-args constructor. */
+   private <T extends Kernel> T getSharedKernelInstance(Class<T> kernelClass) {
+      synchronized (sharedInstances) {
+         T shared = kernelClass.cast(sharedInstances.get(kernelClass));
+         if (shared == null) {
+            try {
+               Constructor<T> constructor = kernelClass.getDeclaredConstructor();
+               constructor.setAccessible(true);
+               shared = constructor.newInstance();
+               sharedInstances.put(kernelClass, shared);
+            } catch (Exception e) {
+               throw new RuntimeException(e);
+            }
+         }
+         return shared;
+      }
+   }
+
+   /** Builds the default device order by walking {@link #getPreferredDeviceTypes()}; accelerators and GPUs are pre-sorted by their default comparators. */
+   protected LinkedHashSet<Device> createDefaultPreferredDevices() {
+      LinkedHashSet<Device> devices = new LinkedHashSet<>();
+
+      List<OpenCLDevice> accelerators = OpenCLDevice.listDevices(Device.TYPE.ACC);
+      List<OpenCLDevice> gpus = OpenCLDevice.listDevices(Device.TYPE.GPU);
+      List<OpenCLDevice> cpus = OpenCLDevice.listDevices(Device.TYPE.CPU);
+
+      Collections.sort(accelerators, getDefaultAcceleratorComparator());
+      Collections.sort(gpus, getDefaultGPUComparator());
+
+      for (Device.TYPE type : getPreferredDeviceTypes()) {
+         switch (type) {
+            case UNKNOWN:
+               throw new AssertionError("UNKNOWN device type not supported");
+            case GPU:
+               devices.addAll(gpus);
+               break;
+            case CPU:
+               if (!cpus.isEmpty()) { // guard: get(0) on an empty list would throw IndexOutOfBoundsException
+                  devices.add(cpus.get(0));
+               }
+               break;
+            case JTP:
+               devices.add(JavaDevice.THREAD_POOL);
+               break;
+            case SEQ:
+               devices.add(JavaDevice.SEQUENTIAL);
+               break;
+            case ACC:
+               devices.addAll(accelerators);
+               break;
+            case ALT:
+               devices.add(JavaDevice.ALTERNATIVE_ALGORITHM);
+               break;
+         }
+      }
+      return devices;
+   }
+
+   protected List<Device.TYPE> getPreferredDeviceTypes() {
+      return Arrays.asList(Device.TYPE.ACC, Device.TYPE.GPU, Device.TYPE.CPU, Device.TYPE.ALT, Device.TYPE.JTP);
+   }
+
+   /** NB, returns -ve for the better device. */
+   protected Comparator<OpenCLDevice> getDefaultAcceleratorComparator() {
+      return new Comparator<OpenCLDevice>() {
+         @Override
+         public int compare(OpenCLDevice left, OpenCLDevice right) {
+            return (right.getMaxComputeUnits() - left.getMaxComputeUnits());
+         }
+      };
+   }
+
+   /** NB, returns -ve for the better device, and 0 when neither device is preferred over the other. */
+   protected Comparator<OpenCLDevice> getDefaultGPUComparator() {
+      return new Comparator<OpenCLDevice>() {
+         @Override
+         public int compare(OpenCLDevice left, OpenCLDevice right) {
+            return selectLhs(left, right) ? -1 : (selectLhs(right, left) ? 1 : 0); // 0 for ties keeps compare(a, a) == 0
+         }
+      };
+   }
+
+   public Device bestDevice() {
+      return getDefaultPreferences().getPreferredDevice(null);
+   }
+
+    protected static boolean selectLhs(OpenCLDevice _deviceLhs, OpenCLDevice _deviceRhs) {
+       boolean nvidiaLhs = _deviceLhs.getOpenCLPlatform().getVendor().toLowerCase().contains("nvidia");
+       boolean nvidiaRhs = _deviceRhs.getOpenCLPlatform().getVendor().toLowerCase().contains("nvidia");
+       if (nvidiaLhs || nvidiaRhs) {
+          return selectLhsIfCUDA(_deviceLhs, _deviceRhs);
+       }
+       return _deviceLhs.getMaxComputeUnits() > _deviceRhs.getMaxComputeUnits();
+    }
+
+    /** NVidia/CUDA architecture reports maxComputeUnits in a completely different context, i.e. maxComputeUnits is not same as
+     * (is much less than) the number of OpenCL cores available.
+     *
+     * <p>Therefore when comparing an NVidia device we use different criteria.</p>
+     */
+    protected static boolean selectLhsIfCUDA(OpenCLDevice _deviceLhs, OpenCLDevice _deviceRhs) {
+       if (_deviceLhs.getType() != _deviceRhs.getType()) {
+          return selectLhsByType(_deviceLhs.getType(), _deviceRhs.getType());
+       }
+       return _deviceLhs.getMaxWorkGroupSize() == _deviceRhs.getMaxWorkGroupSize()
+               ? _deviceLhs.getGlobalMemSize() > _deviceRhs.getGlobalMemSize()
+               : _deviceLhs.getMaxWorkGroupSize() > _deviceRhs.getMaxWorkGroupSize();
+    }
+
+   private static boolean selectLhsByType(Device.TYPE lhs, Device.TYPE rhs) {
+      return lhs.rank < rhs.rank;
+   }
+
+   public KernelProfile getProfile(Class<? extends Kernel> kernelClass) {
+      synchronized (profiles) {
+         KernelProfile profile = profiles.get(kernelClass);
+         if (profile == null) {
+            profile = new KernelProfile(kernelClass);
+            profiles.put(kernelClass, profile);
+         }
+         return profile;
+      }
+   }
+
+   /** New home for deprecated methods of {@link Device}. */
+   public static class DeprecatedMethods {
+
+      @Deprecated
+      public static Device firstDevice(Device.TYPE _type) {
+         List<Device> devices = instance().getDefaultPreferences().getPreferredDevices(null);
+         for (Device device : devices) {
+            if(device.getType() == _type) {
+               return device;
+            }
+         }
+         return null;
+      }
+
+      @SuppressWarnings("deprecation")
+      @Deprecated
+      public static Device bestGPU() {
+         return firstDevice(Device.TYPE.GPU);
+      }
+
+      @SuppressWarnings("deprecation")
+      @Deprecated
+      public static Device bestACC() {
+         return firstDevice(Device.TYPE.ACC);
+      }
+   }
+}
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelManagers.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelManagers.java
new file mode 100644
index 0000000000000000000000000000000000000000..8a31cd70be8d93895f8254b66a2f266f4bc164c6
--- /dev/null
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelManagers.java
@@ -0,0 +1,31 @@
+package com.amd.aparapi.internal.kernel;
+
+import com.amd.aparapi.device.*;
+
+import java.util.*;
+
+/**
+ * KernelManager instances useful for debugging.
+ */
+public class KernelManagers {
+
+   public static final KernelManager JTP_ONLY = new KernelManager() {
+
+      // No field is used here: this override is invoked from the KernelManager constructor, i.e. before any
+      // field initializer of this anonymous subclass has run, so a field would still be null at that point.
+      @Override
+      protected List<Device.TYPE> getPreferredDeviceTypes() {
+         return Collections.singletonList(Device.TYPE.JTP);
+      }
+   };
+
+   public static final KernelManager SEQUENTIAL_ONLY = new KernelManager() {
+
+      // No field is used here: this override is invoked from the KernelManager constructor, i.e. before any
+      // field initializer of this anonymous subclass has run, so a field would still be null at that point.
+      @Override
+      protected List<Device.TYPE> getPreferredDeviceTypes() {
+         return Collections.singletonList(Device.TYPE.SEQ);
+      }
+   };
+}
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelProfile.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelProfile.java
new file mode 100644
index 0000000000000000000000000000000000000000..17e479a85fcfb7a9874943ade1d0c1cc042b674a
--- /dev/null
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelProfile.java
@@ -0,0 +1,103 @@
+package com.amd.aparapi.internal.kernel;
+
+import com.amd.aparapi.*;
+import com.amd.aparapi.device.*;
+
+import java.util.*;
+import java.util.logging.*;
+
+/**
+ * Collects profiling information per kernel class per device. Not thread safe, it is necessary for client code to correctly synchronize on
+ * objects of this class.
+ */
+public class KernelProfile {
+
+   private static Logger logger = Logger.getLogger(Config.getLoggerName());
+   private final Class<? extends Kernel> kernelClass;
+   private LinkedHashMap<Device, KernelDeviceProfile> deviceProfiles = new LinkedHashMap<>();
+   private Device currentDevice;
+   private Device lastDevice;
+   private KernelDeviceProfile currentDeviceProfile;
+
+   public KernelProfile(Class<? extends Kernel> _kernelClass) {
+      kernelClass = _kernelClass;
+   }
+
+   public double getLastExecutionTime() {
+      KernelDeviceProfile lastDeviceProfile = getLastDeviceProfile();
+      return lastDeviceProfile == null ? Double.NaN : lastDeviceProfile.getLastElapsedTime(ProfilingEvent.START, ProfilingEvent.EXECUTED);
+   }
+
+   public double getLastConversionTime() { // NOTE(review): measures the same START..EXECUTED span as getLastExecutionTime() - confirm intended end event
+      KernelDeviceProfile lastDeviceProfile = getLastDeviceProfile();
+      return lastDeviceProfile == null ? Double.NaN : lastDeviceProfile.getLastElapsedTime(ProfilingEvent.START, ProfilingEvent.EXECUTED);   }
+
+   public double getAccumulatedTotalTime() {
+      KernelDeviceProfile lastDeviceProfile = getLastDeviceProfile();
+      if (lastDeviceProfile == null) {
+         return Double.NaN;
+      }
+      else {
+         return lastDeviceProfile.getCumulativeElapsedTimeAll();
+      }
+   }
+
+   private KernelDeviceProfile getLastDeviceProfile() {
+      return deviceProfiles.get(lastDevice); // null until an execution has finished (lastDevice is assigned in reset())
+   }
+
+   void onStart(Device device) {
+      currentDevice = device;
+      synchronized (deviceProfiles) {
+         currentDeviceProfile = deviceProfiles.get(device);
+         if (currentDeviceProfile == null) {
+            currentDeviceProfile = new KernelDeviceProfile(kernelClass, device);
+            deviceProfiles.put(device, currentDeviceProfile);
+         }
+      }
+      currentDeviceProfile.onEvent(ProfilingEvent.START);
+   }
+
+   /** Forwards a non-START {@code event} to the current device profile; logs (rather than throwing) when there is none. */
+   void onEvent(ProfilingEvent event) {
+      switch (event) {
+         case CLASS_MODEL_BUILT: // fallthrough
+         case OPENCL_GENERATED: // fallthrough
+         case OPENCL_COMPILED: // fallthrough
+         case PREPARE_EXECUTE: // fallthrough
+         case EXECUTED:
+            if (currentDeviceProfile == null) {
+               logger.log(Level.SEVERE, "Error in KernelProfile, no currentDevice (synchronization error?");
+            } else {
+               currentDeviceProfile.onEvent(event);
+            }
+            break;
+         case START:
+            throw new IllegalArgumentException("must use onStart(Device) to start profiling");
+         default:
+            throw new IllegalArgumentException("Unhandled event " + event);
+      }
+   }
+
+   void onFinishedExecution() {
+      reset();
+   }
+
+   private void reset() {
+      lastDevice = currentDevice;
+      currentDevice = null;
+      currentDeviceProfile = null;
+   }
+
+   public Collection<Device> getDevices() {
+      return deviceProfiles.keySet();
+   }
+
+   public Collection<KernelDeviceProfile> getDeviceProfiles() {
+      return deviceProfiles.values();
+   }
+
+   public KernelDeviceProfile getDeviceProfile(Device device) {
+      return deviceProfiles.get(device);
+   }
+}
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java
index c2b69e44f1cb3564fa00d82c632d861c5ae93986..f162d695ed5130737b525cdbc707f49b41d56b30 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java
@@ -37,45 +37,25 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit
 */
 package com.amd.aparapi.internal.kernel;
 
-import com.amd.aparapi.Config;
-import com.amd.aparapi.Kernel;
+import com.amd.aparapi.*;
 import com.amd.aparapi.Kernel.Constant;
-import com.amd.aparapi.Kernel.EXECUTION_MODE;
-import com.amd.aparapi.Kernel.KernelState;
-import com.amd.aparapi.Kernel.Local;
-import com.amd.aparapi.ProfileInfo;
-import com.amd.aparapi.Range;
-import com.amd.aparapi.device.Device;
-import com.amd.aparapi.device.OpenCLDevice;
-import com.amd.aparapi.internal.annotation.UsedByJNICode;
-import com.amd.aparapi.internal.exception.AparapiException;
-import com.amd.aparapi.internal.exception.CodeGenException;
-import com.amd.aparapi.internal.instruction.InstructionSet.TypeSpec;
-import com.amd.aparapi.internal.jni.KernelRunnerJNI;
-import com.amd.aparapi.internal.model.ClassModel;
-import com.amd.aparapi.internal.model.Entrypoint;
-import com.amd.aparapi.internal.util.UnsafeWrapper;
-import com.amd.aparapi.internal.writer.KernelWriter;
-import com.amd.aparapi.opencl.OpenCL;
-
-import java.lang.reflect.Array;
-import java.lang.reflect.Field;
-import java.lang.reflect.Modifier;
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-import java.nio.IntBuffer;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-import java.util.StringTokenizer;
-import java.util.concurrent.BrokenBarrierException;
-import java.util.concurrent.CyclicBarrier;
-import java.util.concurrent.ForkJoinPool;
-import java.util.concurrent.ForkJoinPool.ForkJoinWorkerThreadFactory;
-import java.util.concurrent.ForkJoinPool.ManagedBlocker;
-import java.util.concurrent.ForkJoinWorkerThread;
-import java.util.logging.Level;
-import java.util.logging.Logger;
+import com.amd.aparapi.Kernel.*;
+import com.amd.aparapi.device.*;
+import com.amd.aparapi.internal.annotation.*;
+import com.amd.aparapi.internal.exception.*;
+import com.amd.aparapi.internal.instruction.InstructionSet.*;
+import com.amd.aparapi.internal.jni.*;
+import com.amd.aparapi.internal.model.*;
+import com.amd.aparapi.internal.util.*;
+import com.amd.aparapi.internal.writer.*;
+import com.amd.aparapi.opencl.*;
+
+import java.lang.reflect.*;
+import java.nio.*;
+import java.util.*;
+import java.util.concurrent.*;
+import java.util.concurrent.ForkJoinPool.*;
+import java.util.logging.*;
 
 /**
  * The class is responsible for executing <code>Kernel</code> implementations. <br/>
@@ -101,6 +81,7 @@ public class KernelRunner extends KernelRunnerJNI{
    @UsedByJNICode public static final int PASS_ID_COMPLETED_EXECUTION = -1;
    @UsedByJNICode public static final int CANCEL_STATUS_FALSE = 0;
    @UsedByJNICode public static final int CANCEL_STATUS_TRUE = 1;
+   private static final String CODE_GEN_ERROR_MARKER = CodeGenException.class.getName();
 
    private static Logger logger = Logger.getLogger(Config.getLoggerName());
 
@@ -147,6 +128,7 @@ public class KernelRunner extends KernelRunnerJNI{
 
    private static final ForkJoinPool threadPool = new ForkJoinPool(Runtime.getRuntime().availableProcessors(),
          lowPriorityThreadFactory, null, false);
+   private static HashMap<Class<? extends Kernel>, String> openCLCache = new HashMap<>();
 
    /**
     * Create a KernelRunner for a specific Kernel instance.
@@ -164,6 +146,8 @@ public class KernelRunner extends KernelRunnerJNI{
 
       inBufferRemoteInt = inBufferRemote.asIntBuffer();
       outBufferRemoteInt = outBufferRemote.asIntBuffer();
+
+      KernelManager.instance(); // ensures static initialization of KernalManager
    }
 
    /**
@@ -172,7 +156,7 @@ public class KernelRunner extends KernelRunnerJNI{
     * @see KernelRunnerJNI#disposeJNI(long)
     */
    public void dispose() {
-      if (kernel.getExecutionMode().isOpenCL()) {
+      if (kernel.isRunningCL()) {
          disposeJNI(jniContextHandle);
       }
       // We are using a shared pool, so there's no need no shutdown it when kernel is disposed
@@ -181,12 +165,6 @@ public class KernelRunner extends KernelRunnerJNI{
 
    private Set<String> capabilitiesSet;
 
-   private long accumulatedExecutionTime = 0;
-
-   private long conversionTime = 0;
-
-   private long executionTime = 0;
-
    boolean hasFP64Support() {
       if (capabilitiesSet == null) {
          throw new IllegalStateException("Capabilities queried before they were initialized");
@@ -316,312 +294,334 @@ public class KernelRunner extends KernelRunnerJNI{
    }
 
    /**
-    * Execute using a Java thread pool. Either because we were explicitly asked to do so, or because we 'fall back' after discovering an OpenCL issue.
-    * 
-    * @param _range
-    *          The globalSize requested by the user (via <code>Kernel.execute(globalSize)</code>)
-    * @param _passes
-    *          The # of passes requested by the user (via <code>Kernel.execute(globalSize, passes)</code>). Note this is usually defaulted to 1 via <code>Kernel.execute(globalSize)</code>.
-    * @return
+    * Execute using a Java thread pool, or sequentially, or using an alternative algorithm, usually as a result of failing to compile or execute OpenCL
     */
-   protected long executeJava(final Range _range, final int _passes) {
+   @SuppressWarnings("deprecation")
+   protected void executeJava(ExecutionSettings _settings, Device device) {
       if (logger.isLoggable(Level.FINE)) {
-         logger.fine("executeJava: range = " + _range);
+         logger.fine("executeJava: range = " + _settings.range + ", device = " + device);
       }
+      boolean legacySequentialMode = kernel.getExecutionMode().equals(Kernel.EXECUTION_MODE.SEQ);
 
       passId = PASS_ID_PREPARING_EXECUTION;
+      _settings.profile.onEvent(ProfilingEvent.PREPARE_EXECUTE);
+
       try {
-         final int localSize0 = _range.getLocalSize(0);
-         final int localSize1 = _range.getLocalSize(1);
-         final int localSize2 = _range.getLocalSize(2);
-         final int globalSize1 = _range.getGlobalSize(1);
-         if (kernel.getExecutionMode().equals(EXECUTION_MODE.SEQ)) {
-            /**
-             * SEQ mode is useful for testing trivial logic, but kernels which use SEQ mode cannot be used if the
-             * product of localSize(0..3) is >1.  So we can use multi-dim ranges but only if the local size is 1 in all dimensions.
-             *
-             * As a result of this barrier is only ever 1 work item wide and probably should be turned into a no-op.
-             *
-             * So we need to check if the range is valid here. If not we have no choice but to punt.
-             */
-            if ((localSize0 * localSize1 * localSize2) > 1) {
-               throw new IllegalStateException("Can't run range with group size >1 sequentially. Barriers would deadlock!");
+         if (device == JavaDevice.ALTERNATIVE_ALGORITHM) {
+            if (kernel.hasFallbackAlgorithm()) {
+               for (passId = 0; passId < _settings.passes; ++passId) {
+                  kernel.executeFallbackAlgorithm(_settings.range, passId);
+               }
+            } else {
+               boolean silently = true; // not having an alternative algorithm is the normal state, and does not need reporting
+               fallBackToNextDevice(_settings, (Exception) null, silently);
             }
-
-            final Kernel kernelClone = kernel.clone();
-            final KernelState kernelState = kernelClone.getKernelState();
-
-            kernelState.setRange(_range);
-            kernelState.setGroupId(0, 0);
-            kernelState.setGroupId(1, 0);
-            kernelState.setGroupId(2, 0);
-            kernelState.setLocalId(0, 0);
-            kernelState.setLocalId(1, 0);
-            kernelState.setLocalId(2, 0);
-            kernelState.setLocalBarrier(new FJSafeCyclicBarrier(1));
-
-            for (passId = 0; passId < _passes; passId++) {
-               if (getCancelState() == CANCEL_STATUS_TRUE) {
-                  break;
+         } else {
+            final int localSize0 = _settings.range.getLocalSize(0);
+            final int localSize1 = _settings.range.getLocalSize(1);
+            final int localSize2 = _settings.range.getLocalSize(2);
+            final int globalSize1 = _settings.range.getGlobalSize(1);
+            if (legacySequentialMode || device == JavaDevice.SEQUENTIAL) {
+               /**
+                * SEQ mode is useful for testing trivial logic, but kernels which use SEQ mode cannot be used if the
+                * product of localSize(0..2) is >1.  So we can use multi-dim ranges but only if the local size is 1 in all dimensions.
+                *
+                * As a result of this, the barrier is only ever 1 work item wide and probably should be turned into a no-op.
+                *
+                * So we need to check if the range is valid here. If not we have no choice but to punt.
+                */
+               if ((localSize0 * localSize1 * localSize2) > 1) {
+                  throw new IllegalStateException("Can't run range with group size >1 sequentially. Barriers would deadlock!");
                }
-               kernelState.setPassId(passId);
 
-               if (_range.getDims() == 1) {
-                  for (int id = 0; id < _range.getGlobalSize(0); id++) {
-                     kernelState.setGlobalId(0, id);
-                     kernelClone.run();
+               final Kernel kernelClone = kernel.clone();
+               final KernelState kernelState = kernelClone.getKernelState();
+
+               kernelState.setRange(_settings.range);
+               kernelState.setGroupId(0, 0);
+               kernelState.setGroupId(1, 0);
+               kernelState.setGroupId(2, 0);
+               kernelState.setLocalId(0, 0);
+               kernelState.setLocalId(1, 0);
+               kernelState.setLocalId(2, 0);
+               kernelState.setLocalBarrier(new FJSafeCyclicBarrier(1));
+
+               for (passId = 0; passId < _settings.passes; passId++) {
+                  if (getCancelState() == CANCEL_STATUS_TRUE) {
+                     break;
                   }
-               } else if (_range.getDims() == 2) {
-                  for (int x = 0; x < _range.getGlobalSize(0); x++) {
-                     kernelState.setGlobalId(0, x);
+                  kernelState.setPassId(passId);
 
-                     for (int y = 0; y < globalSize1; y++) {
-                        kernelState.setGlobalId(1, y);
+                  if (_settings.range.getDims() == 1) {
+                     for (int id = 0; id < _settings.range.getGlobalSize(0); id++) {
+                        kernelState.setGlobalId(0, id);
                         kernelClone.run();
                      }
                   }
-               } else if (_range.getDims() == 3) {
-                  for (int x = 0; x < _range.getGlobalSize(0); x++) {
-                     kernelState.setGlobalId(0, x);
-
-                     for (int y = 0; y < globalSize1; y++) {
-                        kernelState.setGlobalId(1, y);
+                  else if (_settings.range.getDims() == 2) {
+                     for (int x = 0; x < _settings.range.getGlobalSize(0); x++) {
+                        kernelState.setGlobalId(0, x);
 
-                        for (int z = 0; z < _range.getGlobalSize(2); z++) {
-                           kernelState.setGlobalId(2, z);
+                        for (int y = 0; y < globalSize1; y++) {
+                           kernelState.setGlobalId(1, y);
                            kernelClone.run();
                         }
+                     }
+                  }
+                  else if (_settings.range.getDims() == 3) {
+                     for (int x = 0; x < _settings.range.getGlobalSize(0); x++) {
+                        kernelState.setGlobalId(0, x);
 
-                        kernelClone.run();
+                        for (int y = 0; y < globalSize1; y++) {
+                           kernelState.setGlobalId(1, y);
+
+                           for (int z = 0; z < _settings.range.getGlobalSize(2); z++) {
+                              kernelState.setGlobalId(2, z);
+                              kernelClone.run();
+                           }
+
+                           kernelClone.run();
+                        }
                      }
                   }
                }
+               passId = PASS_ID_COMPLETED_EXECUTION;
             }
-            passId = PASS_ID_COMPLETED_EXECUTION;
-         } else {
-            final int threads = localSize0 * localSize1 * localSize2;
-            final int numGroups0 = _range.getNumGroups(0);
-            final int numGroups1 = _range.getNumGroups(1);
-            final int globalGroups = numGroups0 * numGroups1 * _range.getNumGroups(2);
-            /**
-             * This joinBarrier is the barrier that we provide for the kernel threads to rendezvous with the current dispatch thread.
-             * So this barrier is threadCount+1 wide (the +1 is for the dispatch thread)
-             */
-            final CyclicBarrier joinBarrier = new FJSafeCyclicBarrier(threads + 1);
-
-            /**
-             * This localBarrier is only ever used by the kernels.  If the kernel does not use the barrier the threads
-             * can get out of sync, we promised nothing in JTP mode.
-             *
-             * As with OpenCL all threads within a group must wait at the barrier or none.  It is a user error (possible deadlock!)
-             * if the barrier is in a conditional that is only executed by some of the threads within a group.
-             *
-             * Kernel developer must understand this.
-             *
-             * This barrier is threadCount wide.  We never hit the barrier from the dispatch thread.
-             */
-            final CyclicBarrier localBarrier = new FJSafeCyclicBarrier(threads);
-
-            final ThreadIdSetter threadIdSetter;
-
-            if (_range.getDims() == 1) {
-               threadIdSetter = new ThreadIdSetter(){
-                  @Override public void set(KernelState kernelState, int globalGroupId, int threadId) {
-                     //                   (kernelState, globalGroupId, threadId) ->{
-                     kernelState.setLocalId(0, (threadId % localSize0));
-                     kernelState.setGlobalId(0, (threadId + (globalGroupId * threads)));
-                     kernelState.setGroupId(0, globalGroupId);
-                  }
-               };
-            } else if (_range.getDims() == 2) {
+            else {
+               if (device != JavaDevice.THREAD_POOL && kernel.getExecutionMode() != Kernel.EXECUTION_MODE.JTP) {
+                  throw new AssertionError("unexpected JavaDevice or EXECUTION_MODE");
+               }
+               final int threads = localSize0 * localSize1 * localSize2;
+               final int numGroups0 = _settings.range.getNumGroups(0);
+               final int numGroups1 = _settings.range.getNumGroups(1);
+               final int globalGroups = numGroups0 * numGroups1 * _settings.range.getNumGroups(2);
+               /**
+                * This joinBarrier is the barrier that we provide for the kernel threads to rendezvous with the current dispatch thread.
+                * So this barrier is threadCount+1 wide (the +1 is for the dispatch thread)
+                */
+               final CyclicBarrier joinBarrier = new FJSafeCyclicBarrier(threads + 1);
 
                /**
-                * Consider a 12x4 grid of 4*2 local groups
-                * <pre>
-                *                                             threads = 4*2 = 8
-                *                                             localWidth=4
-                *                                             localHeight=2
-                *                                             globalWidth=12
-                *                                             globalHeight=4
-                *
-                *    00 01 02 03 | 04 05 06 07 | 08 09 10 11
-                *    12 13 14 15 | 16 17 18 19 | 20 21 22 23
-                *    ------------+-------------+------------
-                *    24 25 26 27 | 28 29 30 31 | 32 33 34 35
-                *    36 37 38 39 | 40 41 42 43 | 44 45 46 47
+                * This localBarrier is only ever used by the kernels.  If the kernel does not use the barrier the threads
+                * can get out of sync, we promised nothing in JTP mode.
                 *
-                *    00 01 02 03 | 00 01 02 03 | 00 01 02 03  threadIds : [0..7]*6
-                *    04 05 06 07 | 04 05 06 07 | 04 05 06 07
-                *    ------------+-------------+------------
-                *    00 01 02 03 | 00 01 02 03 | 00 01 02 03
-                *    04 05 06 07 | 04 05 06 07 | 04 05 06 07
+                * As with OpenCL all threads within a group must wait at the barrier or none.  It is a user error (possible deadlock!)
+                * if the barrier is in a conditional that is only executed by some of the threads within a group.
                 *
-                *    00 00 00 00 | 01 01 01 01 | 02 02 02 02  groupId[0] : 0..6
-                *    00 00 00 00 | 01 01 01 01 | 02 02 02 02
-                *    ------------+-------------+------------
-                *    00 00 00 00 | 01 01 01 01 | 02 02 02 02
-                *    00 00 00 00 | 01 01 01 01 | 02 02 02 02
-                *
-                *    00 00 00 00 | 00 00 00 00 | 00 00 00 00  groupId[1] : 0..6
-                *    00 00 00 00 | 00 00 00 00 | 00 00 00 00
-                *    ------------+-------------+------------
-                *    01 01 01 01 | 01 01 01 01 | 01 01 01 01
-                *    01 01 01 01 | 01 01 01 01 | 01 01 01 01
-                *
-                *    00 01 02 03 | 08 09 10 11 | 16 17 18 19  globalThreadIds == threadId + groupId * threads;
-                *    04 05 06 07 | 12 13 14 15 | 20 21 22 23
-                *    ------------+-------------+------------
-                *    24 25 26 27 | 32[33]34 35 | 40 41 42 43
-                *    28 29 30 31 | 36 37 38 39 | 44 45 46 47
-                *
-                *    00 01 02 03 | 00 01 02 03 | 00 01 02 03  localX = threadId % localWidth; (for globalThreadId 33 = threadId = 01 : 01%4 =1)
-                *    00 01 02 03 | 00 01 02 03 | 00 01 02 03
-                *    ------------+-------------+------------
-                *    00 01 02 03 | 00[01]02 03 | 00 01 02 03
-                *    00 01 02 03 | 00 01 02 03 | 00 01 02 03
-                *
-                *    00 00 00 00 | 00 00 00 00 | 00 00 00 00  localY = threadId /localWidth  (for globalThreadId 33 = threadId = 01 : 01/4 =0)
-                *    01 01 01 01 | 01 01 01 01 | 01 01 01 01
-                *    ------------+-------------+------------
-                *    00 00 00 00 | 00[00]00 00 | 00 00 00 00
-                *    01 01 01 01 | 01 01 01 01 | 01 01 01 01
-                *
-                *    00 01 02 03 | 04 05 06 07 | 08 09 10 11  globalX=
-                *    00 01 02 03 | 04 05 06 07 | 08 09 10 11     groupsPerLineWidth=globalWidth/localWidth (=12/4 =3)
-                *    ------------+-------------+------------     groupInset =groupId%groupsPerLineWidth (=4%3 = 1)
-                *    00 01 02 03 | 04[05]06 07 | 08 09 10 11
-                *    00 01 02 03 | 04 05 06 07 | 08 09 10 11     globalX = groupInset*localWidth+localX (= 1*4+1 = 5)
-                *
-                *    00 00 00 00 | 00 00 00 00 | 00 00 00 00  globalY
-                *    01 01 01 01 | 01 01 01 01 | 01 01 01 01
-                *    ------------+-------------+------------
-                *    02 02 02 02 | 02[02]02 02 | 02 02 02 02
-                *    03 03 03 03 | 03 03 03 03 | 03 03 03 03
-                *
-                * </pre>
-                * Assume we are trying to locate the id's for #33
+                * Kernel developer must understand this.
                 *
+                * This barrier is threadCount wide.  We never hit the barrier from the dispatch thread.
                 */
-               threadIdSetter = new ThreadIdSetter(){
-                  @Override public void set(KernelState kernelState, int globalGroupId, int threadId) {
-                     //                   (kernelState, globalGroupId, threadId) ->{
-                     kernelState.setLocalId(0, (threadId % localSize0)); // threadId % localWidth =  (for 33 = 1 % 4 = 1)
-                     kernelState.setLocalId(1, (threadId / localSize0)); // threadId / localWidth = (for 33 = 1 / 4 == 0)
-
-                     final int groupInset = globalGroupId % numGroups0; // 4%3 = 1
-                     kernelState.setGlobalId(0, ((groupInset * localSize0) + kernelState.getLocalIds()[0])); // 1*4+1=5
-
-                     final int completeLines = (globalGroupId / numGroups0) * localSize1;// (4/3) * 2
-                     kernelState.setGlobalId(1, (completeLines + kernelState.getLocalIds()[1])); // 2+0 = 2
-                     kernelState.setGroupId(0, (globalGroupId % numGroups0));
-                     kernelState.setGroupId(1, (globalGroupId / numGroups0));
-                  }
-               };
-            } else if (_range.getDims() == 3) {
-               //Same as 2D actually turns out that localId[0] is identical for all three dims so could be hoisted out of conditional code
-               threadIdSetter = new ThreadIdSetter(){
-                  @Override public void set(KernelState kernelState, int globalGroupId, int threadId) {
-                     //                   (kernelState, globalGroupId, threadId) ->{
-                     kernelState.setLocalId(0, (threadId % localSize0));
+               final CyclicBarrier localBarrier = new FJSafeCyclicBarrier(threads);
+
+               final ThreadIdSetter threadIdSetter;
+
+               if (_settings.range.getDims() == 1) {
+                  threadIdSetter = new ThreadIdSetter() {
+                     @Override
+                     public void set(KernelState kernelState, int globalGroupId, int threadId) {
+                        //                   (kernelState, globalGroupId, threadId) ->{
+                        kernelState.setLocalId(0, (threadId % localSize0));
+                        kernelState.setGlobalId(0, (threadId + (globalGroupId * threads)));
+                        kernelState.setGroupId(0, globalGroupId);
+                     }
+                  };
+               }
+               else if (_settings.range.getDims() == 2) {
+
+                  /**
+                   * Consider a 12x4 grid of 4*2 local groups
+                   * <pre>
+                   *                                             threads = 4*2 = 8
+                   *                                             localWidth=4
+                   *                                             localHeight=2
+                   *                                             globalWidth=12
+                   *                                             globalHeight=4
+                   *
+                   *    00 01 02 03 | 04 05 06 07 | 08 09 10 11
+                   *    12 13 14 15 | 16 17 18 19 | 20 21 22 23
+                   *    ------------+-------------+------------
+                   *    24 25 26 27 | 28 29 30 31 | 32 33 34 35
+                   *    36 37 38 39 | 40 41 42 43 | 44 45 46 47
+                   *
+                   *    00 01 02 03 | 00 01 02 03 | 00 01 02 03  threadIds : [0..7]*6
+                   *    04 05 06 07 | 04 05 06 07 | 04 05 06 07
+                   *    ------------+-------------+------------
+                   *    00 01 02 03 | 00 01 02 03 | 00 01 02 03
+                   *    04 05 06 07 | 04 05 06 07 | 04 05 06 07
+                   *
+                   *    00 00 00 00 | 01 01 01 01 | 02 02 02 02  groupId[0] : 0..6
+                   *    00 00 00 00 | 01 01 01 01 | 02 02 02 02
+                   *    ------------+-------------+------------
+                   *    00 00 00 00 | 01 01 01 01 | 02 02 02 02
+                   *    00 00 00 00 | 01 01 01 01 | 02 02 02 02
+                   *
+                   *    00 00 00 00 | 00 00 00 00 | 00 00 00 00  groupId[1] : 0..6
+                   *    00 00 00 00 | 00 00 00 00 | 00 00 00 00
+                   *    ------------+-------------+------------
+                   *    01 01 01 01 | 01 01 01 01 | 01 01 01 01
+                   *    01 01 01 01 | 01 01 01 01 | 01 01 01 01
+                   *
+                   *    00 01 02 03 | 08 09 10 11 | 16 17 18 19  globalThreadIds == threadId + groupId * threads;
+                   *    04 05 06 07 | 12 13 14 15 | 20 21 22 23
+                   *    ------------+-------------+------------
+                   *    24 25 26 27 | 32[33]34 35 | 40 41 42 43
+                   *    28 29 30 31 | 36 37 38 39 | 44 45 46 47
+                   *
+                   *    00 01 02 03 | 00 01 02 03 | 00 01 02 03  localX = threadId % localWidth; (for globalThreadId 33 = threadId = 01 : 01%4 =1)
+                   *    00 01 02 03 | 00 01 02 03 | 00 01 02 03
+                   *    ------------+-------------+------------
+                   *    00 01 02 03 | 00[01]02 03 | 00 01 02 03
+                   *    00 01 02 03 | 00 01 02 03 | 00 01 02 03
+                   *
+                   *    00 00 00 00 | 00 00 00 00 | 00 00 00 00  localY = threadId /localWidth  (for globalThreadId 33 = threadId = 01 : 01/4 =0)
+                   *    01 01 01 01 | 01 01 01 01 | 01 01 01 01
+                   *    ------------+-------------+------------
+                   *    00 00 00 00 | 00[00]00 00 | 00 00 00 00
+                   *    01 01 01 01 | 01 01 01 01 | 01 01 01 01
+                   *
+                   *    00 01 02 03 | 04 05 06 07 | 08 09 10 11  globalX=
+                   *    00 01 02 03 | 04 05 06 07 | 08 09 10 11     groupsPerLineWidth=globalWidth/localWidth (=12/4 =3)
+                   *    ------------+-------------+------------     groupInset =groupId%groupsPerLineWidth (=4%3 = 1)
+                   *    00 01 02 03 | 04[05]06 07 | 08 09 10 11
+                   *    00 01 02 03 | 04 05 06 07 | 08 09 10 11     globalX = groupInset*localWidth+localX (= 1*4+1 = 5)
+                   *
+                   *    00 00 00 00 | 00 00 00 00 | 00 00 00 00  globalY
+                   *    01 01 01 01 | 01 01 01 01 | 01 01 01 01
+                   *    ------------+-------------+------------
+                   *    02 02 02 02 | 02[02]02 02 | 02 02 02 02
+                   *    03 03 03 03 | 03 03 03 03 | 03 03 03 03
+                   *
+                   * </pre>
+                   * Assume we are trying to locate the id's for #33
+                   *
+                   */
+                  threadIdSetter = new ThreadIdSetter() {
+                     @Override
+                     public void set(KernelState kernelState, int globalGroupId, int threadId) {
+                        //                   (kernelState, globalGroupId, threadId) ->{
+                        kernelState.setLocalId(0, (threadId % localSize0)); // threadId % localWidth =  (for 33 = 1 % 4 = 1)
+                        kernelState.setLocalId(1, (threadId / localSize0)); // threadId / localWidth = (for 33 = 1 / 4 == 0)
+
+                        final int groupInset = globalGroupId % numGroups0; // 4%3 = 1
+                        kernelState.setGlobalId(0, ((groupInset * localSize0) + kernelState.getLocalIds()[0])); // 1*4+1=5
+
+                        final int completeLines = (globalGroupId / numGroups0) * localSize1;// (4/3) * 2
+                        kernelState.setGlobalId(1, (completeLines + kernelState.getLocalIds()[1])); // 2+0 = 2
+                        kernelState.setGroupId(0, (globalGroupId % numGroups0));
+                        kernelState.setGroupId(1, (globalGroupId / numGroups0));
+                     }
+                  };
+               }
+               else if (_settings.range.getDims() == 3) {
+                  //Same as 2D actually turns out that localId[0] is identical for all three dims so could be hoisted out of conditional code
+                  threadIdSetter = new ThreadIdSetter() {
+                     @Override
+                     public void set(KernelState kernelState, int globalGroupId, int threadId) {
+                        //                   (kernelState, globalGroupId, threadId) ->{
+                        kernelState.setLocalId(0, (threadId % localSize0));
 
-                     kernelState.setLocalId(1, ((threadId / localSize0) % localSize1));
+                        kernelState.setLocalId(1, ((threadId / localSize0) % localSize1));
 
-                     // the thread id's span WxHxD so threadId/(WxH) should yield the local depth
-                     kernelState.setLocalId(2, (threadId / (localSize0 * localSize1)));
+                        // the thread id's span WxHxD so threadId/(WxH) should yield the local depth
+                        kernelState.setLocalId(2, (threadId / (localSize0 * localSize1)));
 
-                     kernelState.setGlobalId(0, (((globalGroupId % numGroups0) * localSize0) + kernelState.getLocalIds()[0]));
+                        kernelState.setGlobalId(0, (((globalGroupId % numGroups0) * localSize0) + kernelState.getLocalIds()[0]));
 
-                     kernelState.setGlobalId(1,
-                           ((((globalGroupId / numGroups0) * localSize1) % globalSize1) + kernelState.getLocalIds()[1]));
+                        kernelState.setGlobalId(1,
+                        ((((globalGroupId / numGroups0) * localSize1) % globalSize1) + kernelState.getLocalIds()[1]));
 
-                     kernelState.setGlobalId(2,
-                           (((globalGroupId / (numGroups0 * numGroups1)) * localSize2) + kernelState.getLocalIds()[2]));
+                        kernelState.setGlobalId(2,
+                        (((globalGroupId / (numGroups0 * numGroups1)) * localSize2) + kernelState.getLocalIds()[2]));
 
-                     kernelState.setGroupId(0, (globalGroupId % numGroups0));
-                     kernelState.setGroupId(1, ((globalGroupId / numGroups0) % numGroups1));
-                     kernelState.setGroupId(2, (globalGroupId / (numGroups0 * numGroups1)));
-                  }
-               };
-            } else
-               throw new IllegalArgumentException("Expected 1,2 or 3 dimensions, found " + _range.getDims());
-            for (passId = 0; passId < _passes; passId++) {
-               if (getCancelState() == CANCEL_STATUS_TRUE) {
-                  break;
+                        kernelState.setGroupId(0, (globalGroupId % numGroups0));
+                        kernelState.setGroupId(1, ((globalGroupId / numGroups0) % numGroups1));
+                        kernelState.setGroupId(2, (globalGroupId / (numGroups0 * numGroups1)));
+                     }
+                  };
                }
-               /**
-                 * Note that we emulate OpenCL by creating one thread per localId (across the group).
-                 *
-                 * So threadCount == range.getLocalSize(0)*range.getLocalSize(1)*range.getLocalSize(2);
-                 *
-                 * For a 1D range of 12 groups of 4 we create 4 threads. One per localId(0).
-                 *
-                 * We also clone the kernel 4 times. One per thread.
-                 *
-                 * We create local barrier which has a width of 4
-                 *
-                 *    Thread-0 handles localId(0) (global 0,4,8)
-                 *    Thread-1 handles localId(1) (global 1,5,7)
-                 *    Thread-2 handles localId(2) (global 2,6,10)
-                 *    Thread-3 handles localId(3) (global 3,7,11)
-                 *
-                 * This allows all threads to synchronize using the local barrier.
-                 *
-                 * Initially the use of local buffers seems broken as the buffers appears to be per Kernel.
-                 * Thankfully Kernel.clone() performs a shallow clone of all buffers (local and global)
-                 * So each of the cloned kernels actually still reference the same underlying local/global buffers.
-                 *
-                 * If the kernel uses local buffers but does not use barriers then it is possible for different groups
-                 * to see mutations from each other (unlike OpenCL), however if the kernel does not us barriers then it
-                 * cannot assume any coherence in OpenCL mode either (the failure mode will be different but still wrong)
-                 *
-                 * So even JTP mode use of local buffers will need to use barriers. Not for the same reason as OpenCL but to keep groups in lockstep.
-                 *
-                 **/
-               for (int id = 0; id < threads; id++) {
-                  final int threadId = id;
-
+               else
+                  throw new IllegalArgumentException("Expected 1,2 or 3 dimensions, found " + _settings.range.getDims());
+               for (passId = 0; passId < _settings.passes; passId++) {
+                  if (getCancelState() == CANCEL_STATUS_TRUE) {
+                     break;
+                  }
                   /**
-                   *  We clone one kernel for each thread.
+                   * Note that we emulate OpenCL by creating one thread per localId (across the group).
                    *
-                   *  They will all share references to the same range, localBarrier and global/local buffers because the clone is shallow.
-                   *  We need clones so that each thread can assign 'state' (localId/globalId/groupId) without worrying
-                   *  about other threads.
-                   */
-                  final Kernel kernelClone = kernel.clone();
-                  final KernelState kernelState = kernelClone.getKernelState();
-                  kernelState.setRange(_range);
-                  kernelState.setPassId(passId);
-
-                  if (threads == 1) {
-                     kernelState.disableLocalBarrier();
-                  } else {
-                     kernelState.setLocalBarrier(localBarrier);
-                  }
+                   * So threadCount == range.getLocalSize(0)*range.getLocalSize(1)*range.getLocalSize(2);
+                   *
+                   * For a 1D range of 12 work items in groups of 4 we create 4 threads. One per localId(0).
+                   *
+                   * We also clone the kernel 4 times. One per thread.
+                   *
+                   * We create local barrier which has a width of 4
+                   *
+                   *    Thread-0 handles localId(0) (global 0,4,8)
+                   *    Thread-1 handles localId(1) (global 1,5,9)
+                   *    Thread-2 handles localId(2) (global 2,6,10)
+                   *    Thread-3 handles localId(3) (global 3,7,11)
+                   *
+                   * This allows all threads to synchronize using the local barrier.
+                   *
+                   * Initially the use of local buffers seems broken as the buffers appear to be per Kernel.
+                   * Thankfully Kernel.clone() performs a shallow clone of all buffers (local and global)
+                   * So each of the cloned kernels actually still reference the same underlying local/global buffers.
+                   *
+                   * If the kernel uses local buffers but does not use barriers then it is possible for different groups
+                   * to see mutations from each other (unlike OpenCL), however if the kernel does not use barriers then it
+                   * cannot assume any coherence in OpenCL mode either (the failure mode will be different but still wrong)
+                   *
+                   * So even JTP mode use of local buffers will need to use barriers. Not for the same reason as OpenCL but to keep groups in lockstep.
+                   *
+                   **/
+                  for (int id = 0; id < threads; id++) {
+                     final int threadId = id;
+
+                     /**
+                      *  We clone one kernel for each thread.
+                      *
+                      *  They will all share references to the same range, localBarrier and global/local buffers because the clone is shallow.
+                      *  We need clones so that each thread can assign 'state' (localId/globalId/groupId) without worrying
+                      *  about other threads.
+                      */
+                     final Kernel kernelClone = kernel.clone();
+                     final KernelState kernelState = kernelClone.getKernelState();
+                     kernelState.setRange(_settings.range);
+                     kernelState.setPassId(passId);
+
+                     if (threads == 1) {
+                        kernelState.disableLocalBarrier();
+                     }
+                     else {
+                        kernelState.setLocalBarrier(localBarrier);
+                     }
 
-                  threadPool.submit(
-                  //                     () -> {
-                        new Runnable(){
-                           public void run() {
-                              try {
-                                 for (int globalGroupId = 0; globalGroupId < globalGroups; globalGroupId++) {
-                                    threadIdSetter.set(kernelState, globalGroupId, threadId);
-                                    kernelClone.run();
-                                 }
-                              } catch (RuntimeException | Error e) {
-                                 logger.log(Level.SEVERE, "Execution failed", e);
-                              } finally {
-                                 await(joinBarrier); // This thread will rendezvous with dispatch thread here. This is effectively a join.
+                     threadPool.submit(
+                     //                     () -> {
+                     new Runnable() {
+                        public void run() {
+                           try {
+                              for (int globalGroupId = 0; globalGroupId < globalGroups; globalGroupId++) {
+                                 threadIdSetter.set(kernelState, globalGroupId, threadId);
+                                 kernelClone.run();
                               }
                            }
-                        });
-               }
-
-               await(joinBarrier); // This dispatch thread waits for all worker threads here.
-            }
-            passId = PASS_ID_COMPLETED_EXECUTION;
-         } // execution mode == JTP
+                           catch (RuntimeException | Error e) {
+                              logger.log(Level.SEVERE, "Execution failed", e);
+                           }
+                           finally {
+                              await(joinBarrier); // This thread will rendezvous with dispatch thread here. This is effectively a join.
+                           }
+                        }
+                     });
+                  }
 
-         return 0;
+                  await(joinBarrier); // This dispatch thread waits for all worker threads here.
+               }
+               passId = PASS_ID_COMPLETED_EXECUTION;
+            } // execution mode == JTP
+         }
       } finally {
          passId = PASS_ID_COMPLETED_EXECUTION;
       }
@@ -964,63 +964,22 @@ public class KernelRunner extends KernelRunnerJNI{
       return needsSync;
    }
 
-   // private int numAvailableProcessors = Runtime.getRuntime().availableProcessors();
-
-   private Kernel executeOpenCL(final String _entrypointName, final Range _range, final int _passes) throws AparapiException {
-      /*
-      if (_range.getDims() > getMaxWorkItemDimensionsJNI(jniContextHandle)) {
-         throw new RangeException("Range dim size " + _range.getDims() + " > device "
-               + getMaxWorkItemDimensionsJNI(jniContextHandle));
-      }
-      if (_range.getWorkGroupSize() > getMaxWorkGroupSizeJNI(jniContextHandle)) {
-         throw new RangeException("Range workgroup size " + _range.getWorkGroupSize() + " > device "
-               + getMaxWorkGroupSizeJNI(jniContextHandle));
-      }
-      
-            if (_range.getGlobalSize(0) > getMaxWorkItemSizeJNI(jniContextHandle, 0)) {
-               throw new RangeException("Range globalsize 0 " + _range.getGlobalSize(0) + " > device "
-                     + getMaxWorkItemSizeJNI(jniContextHandle, 0));
-            }
-            if (_range.getDims() > 1) {
-               if (_range.getGlobalSize(1) > getMaxWorkItemSizeJNI(jniContextHandle, 1)) {
-                  throw new RangeException("Range globalsize 1 " + _range.getGlobalSize(1) + " > device "
-                        + getMaxWorkItemSizeJNI(jniContextHandle, 1));
-               }
-               if (_range.getDims() > 2) {
-                  if (_range.getGlobalSize(2) > getMaxWorkItemSizeJNI(jniContextHandle, 2)) {
-                     throw new RangeException("Range globalsize 2 " + _range.getGlobalSize(2) + " > device "
-                           + getMaxWorkItemSizeJNI(jniContextHandle, 2));
-                  }
-               }
-            }
-      
+   @SuppressWarnings("deprecation")
+   private Kernel executeOpenCL(ExecutionSettings _settings) throws AparapiException {
 
-      if (logger.isLoggable(Level.FINE)) {
-         logger.fine("maxComputeUnits=" + this.getMaxComputeUnitsJNI(jniContextHandle));
-         logger.fine("maxWorkGroupSize=" + this.getMaxWorkGroupSizeJNI(jniContextHandle));
-         logger.fine("maxWorkItemDimensions=" + this.getMaxWorkItemDimensionsJNI(jniContextHandle));
-         logger.fine("maxWorkItemSize(0)=" + getMaxWorkItemSizeJNI(jniContextHandle, 0));
-         if (_range.getDims() > 1) {
-            logger.fine("maxWorkItemSize(1)=" + getMaxWorkItemSizeJNI(jniContextHandle, 1));
-            if (_range.getDims() > 2) {
-               logger.fine("maxWorkItemSize(2)=" + getMaxWorkItemSizeJNI(jniContextHandle, 2));
-            }
-         }
-      }
-      */
       // Read the array refs after kernel may have changed them
       // We need to do this as input to computing the localSize
       assert args != null : "args should not be null";
       final boolean needSync = updateKernelArrayRefs();
       if (needSync && logger.isLoggable(Level.FINE)) {
-         logger.fine("Need to resync arrays on " + describeKernelClass());
+         logger.fine("Need to resync arrays on " + kernel);
       }
 
       // native side will reallocate array buffers if necessary
-      if (runKernelJNI(jniContextHandle, _range, needSync, _passes, inBufferRemote, outBufferRemote) != 0) {
-         logger.warning("### " + describeKernelClass() + " - CL exec seems to have failed. Trying to revert to Java ###");
-         kernel.setFallbackExecutionMode();
-         return execute(_entrypointName, _range, _passes);
+      int returnValue = runKernelJNI(jniContextHandle, _settings.range, needSync, _settings.passes, inBufferRemote, outBufferRemote);
+      if (returnValue != 0) {
+         String reason = "OpenCL execution seems to have failed (runKernelJNI returned " + returnValue + ")";
+         return fallBackToNextDevice(_settings, new AparapiException(reason));
       }
 
       if (usesOopConversion == true) {
@@ -1028,343 +987,488 @@ public class KernelRunner extends KernelRunnerJNI{
       }
 
       if (logger.isLoggable(Level.FINE)) {
-         logger.fine("executeOpenCL completed. " + _range);
+         logger.fine("executeOpenCL completed. " + _settings.range);
       }
 
       return kernel;
    }
 
-   public synchronized Kernel execute(Kernel.Entry entry, final Range _range, final int _passes) {
-      System.out.println("execute(Kernel.Entry, size) not implemented");
-      return (kernel);
-   }
-
-   synchronized private Kernel fallBackAndExecute(String _entrypointName, final Range _range, final int _passes) {
+   @SuppressWarnings("deprecation")
+   synchronized private Kernel fallBackByExecutionMode(ExecutionSettings _settings) { // Switches the kernel to another execution mode, then re-runs the execution
       isFallBack = true;
       if (kernel.hasNextExecutionMode()) {
          kernel.tryNextExecutionMode();
+         if (logger.isLoggable(Level.WARNING)) {
+            logger.warning("Trying next execution mode " + kernel.getExecutionMode());
+         }
       } else {
          kernel.setFallbackExecutionMode();
       }
+      recreateRange(_settings); // the range may need rebuilding for the kernel's new target device
+      return executeInternal(_settings); // retry with the execution mode selected above
+   }
+
+   private void recreateRange(ExecutionSettings _settings) { // Rebuilds _settings.range against kernel.getTargetDevice(), keeping the same global sizes
+      if (_settings.range.isLocalIsDerived() && !_settings.legacyExecutionMode) { // NOTE(review): presumably a derived local size is device specific, hence the rebuild - confirm
+         Device device = kernel.getTargetDevice();
+         Range result;
+         switch (_settings.range.getDims()) {
+            case 1: {
+               result = Range.create(device, _settings.range.getGlobalSize_0());
+               break;
+            }
+            case 2: {
+               result = Range.create2D(device, _settings.range.getGlobalSize_0(), _settings.range.getGlobalSize_1());
+               break;
+            }
+            case 3: {
+               result = Range.create3D(device, _settings.range.getGlobalSize_0(), _settings.range.getGlobalSize_1(), _settings.range.getGlobalSize_2());
+               break;
+            }
+            default: { // only reached when getDims() is not 1, 2 or 3
+               throw new AssertionError("Range.getDims() = " + _settings.range.getDims());
+            }
+         }
+         _settings.range = result; // the settings object is mutated in place; no value is returned
+      }
+   }
+
+   private Kernel fallBackToNextDevice(ExecutionSettings _settings, String _reason) { // Convenience overload: wraps _reason in an AparapiException
+      return fallBackToNextDevice(_settings, new AparapiException(_reason));
+   }
 
-      return execute(_entrypointName, _range, _passes);
+   @SuppressWarnings("deprecation")
+   synchronized private Kernel fallBackToNextDevice(ExecutionSettings _settings, Exception _exception) { // Overload that delegates with _silently = false
+      return fallBackToNextDevice(_settings, _exception, false); // false: the three-argument overload logs only when not silent
    }
 
-   synchronized private Kernel warnFallBackAndExecute(String _entrypointName, final Range _range, final int _passes,
-         Exception _exception) {
-      if (logger.isLoggable(Level.WARNING)) {
-         logger.warning("Reverting to the next execution mode for " + describeKernelClass() + ": " + _exception.getMessage());
-         _exception.printStackTrace();
+   @SuppressWarnings("deprecation")
+   synchronized private Kernel fallBackToNextDevice(ExecutionSettings _settings, Exception _exception, boolean _silently) { // Records the failure, then retries via the next execution mode (legacy) or after marking the preferred device failed
+      isFallBack = true;
+      _settings.profile.onEvent(ProfilingEvent.EXECUTED);
+      if (_settings.legacyExecutionMode) {
+         if (!_silently && logger.isLoggable(Level.WARNING)) { // _exception is only dereferenced when !_silently, so a null _exception requires _silently == true
+            logger.warning("Execution mode " + kernel.getExecutionMode() + " failed for " + kernel + ": " + _exception.getMessage());
+             _exception.printStackTrace();
+          }
+          return fallBackByExecutionMode(_settings); // legacy path ends here: switch execution mode and retry
+      } else {
+         KernelPreferences preferences = KernelManager.instance().getPreferences(kernel);
+         if (!_silently && logger.isLoggable(Level.WARNING)) {
+            logger.warning("Device failed for " + kernel + ": " + _exception.getMessage());
+         }
+
+         preferences.markPreferredDeviceFailed(); // NOTE(review): presumably makes getPreferredDevice() yield a different device next time - confirm in KernelPreferences
+
+//         Device nextDevice = preferences.getPreferredDevice(kernel);
+//
+//         if (nextDevice == null) {
+//            if (!_silently && logger.isLoggable(Level.SEVERE)) {
+//               logger.severe("No Devices left to try, giving up");
+//            }
+//            throw new RuntimeException(_exception);
+//         }
+         if (!_silently && logger.isLoggable(Level.WARNING)) {
+            _exception.printStackTrace();
+            logger.warning("Trying next device: " + describeDevice());
+         }
       }
-      return fallBackAndExecute(_entrypointName, _range, _passes);
+
+      recreateRange(_settings); // only the non-legacy branch reaches this point
+      return executeInternal(_settings); // retry the execution with the updated preferences
    }
 
-   private String describeKernelClass() {
-      return kernel.getClass().getName();
+   private String describeDevice() { // Short description of the kernel's currently preferred device
+      Device device = KernelManager.instance().getPreferences(kernel).getPreferredDevice(kernel);
+      return (device == null) ? "<default fallback>" : device.getShortDescription(); // a null preferred device is reported as "<default fallback>"
    }
 
-   synchronized private Kernel warnFallBackAndExecute(String _entrypointName, final Range _range, final int _passes, String _excuse) {
-      logger.warning("Reverting to the next execution mode for " + describeKernelClass() + ": " + _excuse);
-      return fallBackAndExecute(_entrypointName, _range, _passes);
+   @Override
+   public String toString() { // Identifies this runner by the string form of its kernel
+      return "KernelRunner{" + kernel + "}";
    }
 
-   public synchronized Kernel execute(String _entrypointName, final Range _range, final int _passes) {
-      clearCancelMultiPass();
+   @SuppressWarnings("deprecation")
+   public synchronized Kernel execute(String _entrypoint, final Range _range, final int _passes) { // Entry point: bundles the arguments into an ExecutionSettings and delegates to executeInternal
       executing = true;
-      try {
-         long executeStartTime = System.currentTimeMillis();
+      clearCancelMultiPass();
+      KernelProfile profile = KernelManager.instance().getProfile(kernel.getClass());
+      KernelPreferences preferences = KernelManager.instance().getPreferences(kernel);
+      boolean legacyExecutionMode = kernel.getExecutionMode() != Kernel.EXECUTION_MODE.AUTO; // any execution mode other than AUTO counts as legacy
 
-         if (_range == null) {
-            throw new IllegalStateException("range can't be null");
+      ExecutionSettings settings = new ExecutionSettings(preferences, profile, _entrypoint, _range, _passes, legacyExecutionMode);
+      try {
+         // Two Kernels of the same class share the same KernelPreferences object, and since failure (fallback) generally mutates
+         // the preferences object, we must lock it. Note this prevents two Kernels of the same class executing simultaneously.
+         synchronized (preferences) {
+            return executeInternal(settings);
          }
+      } finally { // runs whether executeInternal returns or throws
+         executing = false;
+         clearCancelMultiPass();
+      }
+   }
 
-         /* for backward compatibility reasons we still honor execution mode */
-         if (kernel.getExecutionMode().isOpenCL()) {
-            // System.out.println("OpenCL");
+   @SuppressWarnings("deprecation")
+   private synchronized Kernel executeInternal(ExecutionSettings _settings) {
 
-            // See if user supplied a Device
-            Device device = _range.getDevice();
+      if (_settings.range == null) {
+         throw new IllegalStateException("range can't be null");
+      }
 
-            if ((device == null) || (device instanceof OpenCLDevice)) {
-               if ((entryPoint == null) || (isFallBack)) {
-                  if (entryPoint == null) {
-                     try {
-                        final ClassModel classModel = ClassModel.createClassModel(kernel.getClass());
-                        entryPoint = classModel.getEntrypoint(_entrypointName, kernel);
-                     } catch (final Exception exception) {
-                        return warnFallBackAndExecute(_entrypointName, _range, _passes, exception);
-                     }
-                  }
+      EXECUTION_MODE requestedExecutionMode = kernel.getExecutionMode();
 
-                  if ((entryPoint != null) && !entryPoint.shouldFallback()) {
-                     synchronized (Kernel.class) { // This seems to be needed because of a race condition uncovered with issue #68 http://code.google.com/p/aparapi/issues/detail?id=68
-                        if (device != null && !(device instanceof OpenCLDevice)) {
-                           throw new IllegalStateException("range's device is not suitable for OpenCL ");
-                        }
+      if (requestedExecutionMode.isOpenCL() && _settings.range.getDevice() != null && !(_settings.range.getDevice() instanceof OpenCLDevice)) {
+         fallBackToNextDevice(_settings, "OpenCL was requested but Device supplied was not an OpenCLDevice");
+      }
 
-                        OpenCLDevice openCLDevice = (OpenCLDevice) device; // still might be null!
+      Device device = _settings.range.getDevice();
+      boolean userSpecifiedDevice = true;
+      if (device == null) {
+         userSpecifiedDevice = false;
+         if (!_settings.legacyExecutionMode) {
+            device = _settings.preferences.getPreferredDevice(kernel);
+            if (device == null) {
+               // the default fallback when KernelPreferences has run out of options is JTP
+               device = JavaDevice.THREAD_POOL;
+            }
+         } else {
+            if (requestedExecutionMode == EXECUTION_MODE.JTP) {
+               device = JavaDevice.THREAD_POOL;
+            } else if (requestedExecutionMode == EXECUTION_MODE.SEQ) {
+               device = JavaDevice.SEQUENTIAL;
+            }
+         }
+      } else {
+         boolean compatible = isDeviceCompatible(device);
+         if (!compatible) {
+            throw new AssertionError("user supplied Device incompatible with current EXECUTION_MODE or getTargetDevice(); device = "
+                    + device.getShortDescription() + "; kernel = " + kernel);
+         }
+      }
 
-                        int jniFlags = 0;
-                        if (openCLDevice == null) {
-                           if (kernel.getExecutionMode().equals(EXECUTION_MODE.GPU)) {
-                              // Get the best GPU
-                              openCLDevice = (OpenCLDevice) OpenCLDevice.bestGPU();
-                              jniFlags |= JNI_FLAG_USE_GPU; // this flag might be redundant now.
-                              if (openCLDevice == null) {
-                                 return warnFallBackAndExecute(_entrypointName, _range, _passes, "GPU request can't be honored");
-                              }
-                           } else if (kernel.getExecutionMode().equals(EXECUTION_MODE.ACC)) {
-                              // Get the best ACC
-                              openCLDevice = (OpenCLDevice) OpenCLDevice.bestACC();
-                              jniFlags |= JNI_FLAG_USE_ACC; // this flag might be redundant now.
-                              if (openCLDevice == null) {
-                                 return warnFallBackAndExecute(_entrypointName, _range, _passes, "ACC request can't be honored");
-                              }
-                           } else {
-                              // We fetch the first CPU device
-                              openCLDevice = (OpenCLDevice) OpenCLDevice.firstCPU();
-                              if (openCLDevice == null) {
-                                 return warnFallBackAndExecute(_entrypointName, _range, _passes,
-                                       "CPU request can't be honored not CPU device");
-                              }
-                           }
-                        } else { // openCLDevice == null
-                           if (openCLDevice.getType() == Device.TYPE.GPU) {
-                              jniFlags |= JNI_FLAG_USE_GPU; // this flag might be redundant now.
-                           } else if (openCLDevice.getType() == Device.TYPE.ACC) {
-                              jniFlags |= JNI_FLAG_USE_ACC; // this flag might be redundant now.
-                           }
-                        }
+      try {
+         OpenCLDevice openCLDevice = device instanceof OpenCLDevice ? (OpenCLDevice) device : null;
 
-                        //  jniFlags |= (Config.enableProfiling ? JNI_FLAG_ENABLE_PROFILING : 0);
-                        //  jniFlags |= (Config.enableProfilingCSV ? JNI_FLAG_ENABLE_PROFILING_CSV | JNI_FLAG_ENABLE_PROFILING : 0);
-                        //  jniFlags |= (Config.enableVerboseJNI ? JNI_FLAG_ENABLE_VERBOSE_JNI : 0);
-                        // jniFlags |= (Config.enableVerboseJNIOpenCLResourceTracking ? JNI_FLAG_ENABLE_VERBOSE_JNI_OPENCL_RESOURCE_TRACKING :0);
-                        // jniFlags |= (kernel.getExecutionMode().equals(EXECUTION_MODE.GPU) ? JNI_FLAG_USE_GPU : 0);
-                        // Init the device to check capabilities before emitting the
-                        // code that requires the capabilities.
+         int jniFlags = 0;
+         if (_settings.legacyExecutionMode && device != null && !(device instanceof OpenCLDevice)) {
+            hashCode();
+         }
+         // for legacy reasons use old logic where Kernel.EXECUTION_MODE is not AUTO
+         if (_settings.legacyExecutionMode && !userSpecifiedDevice && requestedExecutionMode.isOpenCL()) {
+            if (requestedExecutionMode.equals(EXECUTION_MODE.GPU)) {
+               // Get the best GPU
+               openCLDevice = (OpenCLDevice) KernelManager.DeprecatedMethods.bestGPU();
+               jniFlags |= JNI_FLAG_USE_GPU; // this flag might be redundant now.
+               if (openCLDevice == null) {
+                  return fallBackToNextDevice(_settings, "GPU request can't be honored, no GPU device");
+               }
+            } else if (requestedExecutionMode.equals(EXECUTION_MODE.ACC)) {
+               // Get the best ACC
+               openCLDevice = (OpenCLDevice) KernelManager.DeprecatedMethods.bestACC();
+               jniFlags |= JNI_FLAG_USE_ACC; // this flag might be redundant now.
+               if (openCLDevice == null) {
+                  return fallBackToNextDevice(_settings, "ACC request can't be honored, no ACC device");
+               }
+            } else {
+               // We fetch the first CPU device
+               openCLDevice = (OpenCLDevice) KernelManager.DeprecatedMethods.firstDevice(Device.TYPE.CPU);
+               if (openCLDevice == null) {
+                  return fallBackToNextDevice(_settings, "CPU request can't be honored, no CPU device");
+               }
+            }
+         } else {
+            if (device.getType() == Device.TYPE.GPU) {
+               jniFlags |= JNI_FLAG_USE_GPU; // this flag might be redundant now.
+            } else if (device.getType() == Device.TYPE.ACC) {
+               jniFlags |= JNI_FLAG_USE_ACC; // this flag might be redundant now.
+            }
+         }
+         if (device == null && openCLDevice != null) {
+            device = openCLDevice;
+         }
+         assert device != null : "No device available";
+         _settings.profile.onStart(device);
+         /* for backward compatibility reasons we still honor execution mode */
+         boolean isOpenCl = requestedExecutionMode.isOpenCL() || device instanceof OpenCLDevice;
+         if (isOpenCl) {
+            if ((entryPoint == null) || (isFallBack)) {
+               if (entryPoint == null) {
+                  try {
+                     final ClassModel classModel = ClassModel.createClassModel(kernel.getClass());
+                     entryPoint = classModel.getEntrypoint(_settings.entrypoint, kernel);
+                     _settings.profile.onEvent(ProfilingEvent.CLASS_MODEL_BUILT);
+                  } catch (final Exception exception) {
+                     _settings.profile.onEvent(ProfilingEvent.CLASS_MODEL_BUILT);
+                     return fallBackToNextDevice(_settings, exception);
+                  }
+               }
 
-                        // synchronized(Kernel.class){
-                        jniContextHandle = initJNI(kernel, openCLDevice, jniFlags); // openCLDevice will not be null here
-                     } // end of synchronized! issue 68
+               if ((entryPoint != null)) {
+                  synchronized (Kernel.class) { // This seems to be needed because of a race condition uncovered with issue #68 http://code.google.com/p/aparapi/issues/detail?id=68
 
-                     if (jniContextHandle == 0) {
-                        return warnFallBackAndExecute(_entrypointName, _range, _passes, "initJNI failed to return a valid handle");
-                     }
+                     //  jniFlags |= (Config.enableProfiling ? JNI_FLAG_ENABLE_PROFILING : 0);
+                     //  jniFlags |= (Config.enableProfilingCSV ? JNI_FLAG_ENABLE_PROFILING_CSV | JNI_FLAG_ENABLE_PROFILING : 0);
+                     //  jniFlags |= (Config.enableVerboseJNI ? JNI_FLAG_ENABLE_VERBOSE_JNI : 0);
+                     // jniFlags |= (Config.enableVerboseJNIOpenCLResourceTracking ? JNI_FLAG_ENABLE_VERBOSE_JNI_OPENCL_RESOURCE_TRACKING :0);
+                     // jniFlags |= (kernel.getExecutionMode().equals(Kernel.EXECUTION_MODE.GPU) ? JNI_FLAG_USE_GPU : 0);
+                     // Init the device to check capabilities before emitting the
+                     // code that requires the capabilities.
 
-                     final String extensions = getExtensionsJNI(jniContextHandle);
-                     capabilitiesSet = new HashSet<String>();
+                     // synchronized(Kernel.class){
+                     jniContextHandle = initJNI(kernel, openCLDevice, jniFlags); // openCLDevice will not be null here
+                  } // end of synchronized! issue 68
 
-                     final StringTokenizer strTok = new StringTokenizer(extensions);
-                     while (strTok.hasMoreTokens()) {
-                        capabilitiesSet.add(strTok.nextToken());
-                     }
+                  if (jniContextHandle == 0) {
+                     return fallBackToNextDevice(_settings, "initJNI failed to return a valid handle");
+                  }
 
-                     if (logger.isLoggable(Level.FINE)) {
-                        logger.fine("Capabilities initialized to :" + capabilitiesSet.toString());
-                     }
+                  final String extensions = getExtensionsJNI(jniContextHandle);
+                  capabilitiesSet = new HashSet<String>();
 
-                     if (entryPoint.requiresDoublePragma() && !hasFP64Support()) {
-                        return warnFallBackAndExecute(_entrypointName, _range, _passes, "FP64 required but not supported");
-                     }
+                  final StringTokenizer strTok = new StringTokenizer(extensions);
+                  while (strTok.hasMoreTokens()) {
+                     capabilitiesSet.add(strTok.nextToken());
+                  }
 
-                     if (entryPoint.requiresByteAddressableStorePragma() && !hasByteAddressableStoreSupport()) {
-                        return warnFallBackAndExecute(_entrypointName, _range, _passes,
-                              "Byte addressable stores required but not supported");
-                     }
+                  if (logger.isLoggable(Level.FINE)) {
+                     logger.fine("Capabilities initialized to :" + capabilitiesSet.toString());
+                  }
 
-                     final boolean all32AtomicsAvailable = hasGlobalInt32BaseAtomicsSupport()
-                           && hasGlobalInt32ExtendedAtomicsSupport() && hasLocalInt32BaseAtomicsSupport()
-                           && hasLocalInt32ExtendedAtomicsSupport();
+                  if (entryPoint.requiresDoublePragma() && !hasFP64Support()) {
+                     return fallBackToNextDevice(_settings, "FP64 required but not supported");
+                  }
 
-                     if (entryPoint.requiresAtomic32Pragma() && !all32AtomicsAvailable) {
+                  if (entryPoint.requiresByteAddressableStorePragma() && !hasByteAddressableStoreSupport()) {
+                     return fallBackToNextDevice(_settings, "Byte addressable stores required but not supported");
+                  }
 
-                        return warnFallBackAndExecute(_entrypointName, _range, _passes, "32 bit Atomics required but not supported");
-                     }
+                  final boolean all32AtomicsAvailable = hasGlobalInt32BaseAtomicsSupport()
+                        && hasGlobalInt32ExtendedAtomicsSupport() && hasLocalInt32BaseAtomicsSupport()
+                        && hasLocalInt32ExtendedAtomicsSupport();
 
-                     String openCL = null;
-                     try {
-                        openCL = KernelWriter.writeToString(entryPoint);
-                     } catch (final CodeGenException codeGenException) {
-                        return warnFallBackAndExecute(_entrypointName, _range, _passes, codeGenException);
-                     }
+                  if (entryPoint.requiresAtomic32Pragma() && !all32AtomicsAvailable) {
 
-                     if (Config.enableShowGeneratedOpenCL) {
-                        System.out.println(openCL);
-                     }
+                     return fallBackToNextDevice(_settings, "32 bit Atomics required but not supported");
+                  }
 
-                     if (logger.isLoggable(Level.INFO)) {
-                        logger.info(openCL);
+                  String openCL;
+                  synchronized (openCLCache) {
+                     openCL = openCLCache.get(kernel.getClass());
+                     if (openCL == null) {
+                        try {
+                           openCL = KernelWriter.writeToString(entryPoint);
+                           if (logger.isLoggable(Level.INFO)) {
+                              logger.info(openCL);
+                           }
+                           else if (Config.enableShowGeneratedOpenCL) {
+                              System.out.println(openCL);
+                           }
+                           _settings.profile.onEvent(ProfilingEvent.OPENCL_GENERATED);
+                           openCLCache.put(kernel.getClass(), openCL);
+                        }
+                        catch (final CodeGenException codeGenException) {
+                           openCLCache.put(kernel.getClass(), CODE_GEN_ERROR_MARKER);
+                           _settings.profile.onEvent(ProfilingEvent.OPENCL_GENERATED);
+                           return fallBackToNextDevice(_settings, codeGenException);
+                        }
                      }
-
-                     // Send the string to OpenCL to compile it
-                     if (buildProgramJNI(jniContextHandle, openCL) == 0) {
-                        return warnFallBackAndExecute(_entrypointName, _range, _passes, "OpenCL compile failed");
+                     else {
+                        if (openCL.equals(CODE_GEN_ERROR_MARKER)) {
+                           _settings.profile.onEvent(ProfilingEvent.OPENCL_GENERATED);
+                           boolean silently = true; // since we must have already reported the CodeGenException
+                           return fallBackToNextDevice(_settings, null, silently);
+                        }
                      }
+                  }
 
-                     args = new KernelArg[entryPoint.getReferencedFields().size()];
-                     int i = 0;
+                  // Send the string to OpenCL to compile it
+                  long handle = buildProgramJNI(jniContextHandle, openCL);
+                  _settings.profile.onEvent(ProfilingEvent.OPENCL_COMPILED);
+                  if (handle == 0) {
+                     return fallBackToNextDevice(_settings, "OpenCL compile failed");
+                  }
 
-                     for (final Field field : entryPoint.getReferencedFields()) {
-                        try {
-                           field.setAccessible(true);
-                           args[i] = new KernelArg();
-                           args[i].setName(field.getName());
-                           args[i].setField(field);
-                           if ((field.getModifiers() & Modifier.STATIC) == Modifier.STATIC) {
-                              args[i].setType(args[i].getType() | ARG_STATIC);
+                  args = new KernelArg[entryPoint.getReferencedFields().size()];
+                  int i = 0;
+
+                  for (final Field field : entryPoint.getReferencedFields()) {
+                     try {
+                        field.setAccessible(true);
+                        args[i] = new KernelArg();
+                        args[i].setName(field.getName());
+                        args[i].setField(field);
+                        if ((field.getModifiers() & Modifier.STATIC) == Modifier.STATIC) {
+                           args[i].setType(args[i].getType() | ARG_STATIC);
+                        }
+
+                        final Class<?> type = field.getType();
+                        if (type.isArray()) {
+
+                           if (field.getAnnotation(Local.class) != null || args[i].getName().endsWith(Kernel.LOCAL_SUFFIX)) {
+                              args[i].setType(args[i].getType() | ARG_LOCAL);
+                           } else if ((field.getAnnotation(Constant.class) != null)
+                                 || args[i].getName().endsWith(Kernel.CONSTANT_SUFFIX)) {
+                              args[i].setType(args[i].getType() | ARG_CONSTANT);
+                           } else {
+                              args[i].setType(args[i].getType() | ARG_GLOBAL);
                            }
+                           if (isExplicit()) {
+                              args[i].setType(args[i].getType() | ARG_EXPLICIT);
+                           }
+                           // for now, treat all write arrays as read-write, see bugzilla issue 4859
+                           // we might come up with a better solution later
+                           args[i].setType(args[i].getType()
+                                 | (entryPoint.getArrayFieldAssignments().contains(field.getName()) ? (ARG_WRITE | ARG_READ) : 0));
+                           args[i].setType(args[i].getType()
+                                 | (entryPoint.getArrayFieldAccesses().contains(field.getName()) ? ARG_READ : 0));
+                           // args[i].type |= ARG_GLOBAL;
+
+                           if (type.getName().startsWith("[L")) {
+                              args[i].setType(args[i].getType()
+                                    | (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ | ARG_APARAPI_BUFFER));
 
-                           final Class<?> type = field.getType();
-                           if (type.isArray()) {
+                              if (logger.isLoggable(Level.FINE)) {
+                                 logger.fine("tagging " + args[i].getName() + " as (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)");
+                              }
+                           } else if (type.getName().startsWith("[[")) {
 
-                              if (field.getAnnotation(Local.class) != null || args[i].getName().endsWith(Kernel.LOCAL_SUFFIX)) {
-                                 args[i].setType(args[i].getType() | ARG_LOCAL);
-                              } else if ((field.getAnnotation(Constant.class) != null)
-                                    || args[i].getName().endsWith(Kernel.CONSTANT_SUFFIX)) {
-                                 args[i].setType(args[i].getType() | ARG_CONSTANT);
-                              } else {
-                                 args[i].setType(args[i].getType() | ARG_GLOBAL);
+                              try {
+                                 setMultiArrayType(args[i], type);
+                              } catch (AparapiException e) {
+                                 return fallBackToNextDevice(_settings, "failed to set kernel arguement "
+                                       + args[i].getName() + ".  Aparapi only supports 2D and 3D arrays.");
                               }
-                              if (isExplicit()) {
-                                 args[i].setType(args[i].getType() | ARG_EXPLICIT);
+                           } else {
+
+                              args[i].setArray(null); // will get updated in updateKernelArrayRefs
+                              args[i].setType(args[i].getType() | ARG_ARRAY);
+
+                              args[i].setType(args[i].getType() | (type.isAssignableFrom(float[].class) ? ARG_FLOAT : 0));
+                              args[i].setType(args[i].getType() | (type.isAssignableFrom(int[].class) ? ARG_INT : 0));
+                              args[i].setType(args[i].getType() | (type.isAssignableFrom(boolean[].class) ? ARG_BOOLEAN : 0));
+                              args[i].setType(args[i].getType() | (type.isAssignableFrom(byte[].class) ? ARG_BYTE : 0));
+                              args[i].setType(args[i].getType() | (type.isAssignableFrom(char[].class) ? ARG_CHAR : 0));
+                              args[i].setType(args[i].getType() | (type.isAssignableFrom(double[].class) ? ARG_DOUBLE : 0));
+                              args[i].setType(args[i].getType() | (type.isAssignableFrom(long[].class) ? ARG_LONG : 0));
+                              args[i].setType(args[i].getType() | (type.isAssignableFrom(short[].class) ? ARG_SHORT : 0));
+
+                              // arrays whose length is used will have an int arg holding
+                              // the length as a kernel param
+                              if (entryPoint.getArrayFieldArrayLengthUsed().contains(args[i].getName())) {
+                                 args[i].setType(args[i].getType() | ARG_ARRAYLENGTH);
                               }
-                              // for now, treat all write arrays as read-write, see bugzilla issue 4859
-                              // we might come up with a better solution later
-                              args[i].setType(args[i].getType()
-                                    | (entryPoint.getArrayFieldAssignments().contains(field.getName()) ? (ARG_WRITE | ARG_READ) : 0));
-                              args[i].setType(args[i].getType()
-                                    | (entryPoint.getArrayFieldAccesses().contains(field.getName()) ? ARG_READ : 0));
-                              // args[i].type |= ARG_GLOBAL;
 
                               if (type.getName().startsWith("[L")) {
-                                 args[i].setType(args[i].getType()
-                                       | (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ | ARG_APARAPI_BUFFER));
-
+                                 args[i].setType(args[i].getType() | (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ));
                                  if (logger.isLoggable(Level.FINE)) {
-                                    logger.fine("tagging " + args[i].getName() + " as (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)");
-                                 }
-                              } else if (type.getName().startsWith("[[")) {
-
-                                 try {
-                                    setMultiArrayType(args[i], type);
-                                 } catch (AparapiException e) {
-                                    return warnFallBackAndExecute(_entrypointName, _range, _passes, "failed to set kernel arguement "
-                                          + args[i].getName() + ".  Aparapi only supports 2D and 3D arrays.");
-                                 }
-                              } else {
-
-                                 args[i].setArray(null); // will get updated in updateKernelArrayRefs
-                                 args[i].setType(args[i].getType() | ARG_ARRAY);
-
-                                 args[i].setType(args[i].getType() | (type.isAssignableFrom(float[].class) ? ARG_FLOAT : 0));
-                                 args[i].setType(args[i].getType() | (type.isAssignableFrom(int[].class) ? ARG_INT : 0));
-                                 args[i].setType(args[i].getType() | (type.isAssignableFrom(boolean[].class) ? ARG_BOOLEAN : 0));
-                                 args[i].setType(args[i].getType() | (type.isAssignableFrom(byte[].class) ? ARG_BYTE : 0));
-                                 args[i].setType(args[i].getType() | (type.isAssignableFrom(char[].class) ? ARG_CHAR : 0));
-                                 args[i].setType(args[i].getType() | (type.isAssignableFrom(double[].class) ? ARG_DOUBLE : 0));
-                                 args[i].setType(args[i].getType() | (type.isAssignableFrom(long[].class) ? ARG_LONG : 0));
-                                 args[i].setType(args[i].getType() | (type.isAssignableFrom(short[].class) ? ARG_SHORT : 0));
-
-                                 // arrays whose length is used will have an int arg holding
-                                 // the length as a kernel param
-                                 if (entryPoint.getArrayFieldArrayLengthUsed().contains(args[i].getName())) {
-                                    args[i].setType(args[i].getType() | ARG_ARRAYLENGTH);
-                                 }
-
-                                 if (type.getName().startsWith("[L")) {
-                                    args[i].setType(args[i].getType() | (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ));
-                                    if (logger.isLoggable(Level.FINE)) {
-                                       logger.fine("tagging " + args[i].getName()
-                                             + " as (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)");
-                                    }
+                                    logger.fine("tagging " + args[i].getName()
+                                          + " as (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)");
                                  }
                               }
-                           } else if (type.isAssignableFrom(float.class)) {
-                              args[i].setType(args[i].getType() | ARG_PRIMITIVE);
-                              args[i].setType(args[i].getType() | ARG_FLOAT);
-                           } else if (type.isAssignableFrom(int.class)) {
-                              args[i].setType(args[i].getType() | ARG_PRIMITIVE);
-                              args[i].setType(args[i].getType() | ARG_INT);
-                           } else if (type.isAssignableFrom(double.class)) {
-                              args[i].setType(args[i].getType() | ARG_PRIMITIVE);
-                              args[i].setType(args[i].getType() | ARG_DOUBLE);
-                           } else if (type.isAssignableFrom(long.class)) {
-                              args[i].setType(args[i].getType() | ARG_PRIMITIVE);
-                              args[i].setType(args[i].getType() | ARG_LONG);
-                           } else if (type.isAssignableFrom(boolean.class)) {
-                              args[i].setType(args[i].getType() | ARG_PRIMITIVE);
-                              args[i].setType(args[i].getType() | ARG_BOOLEAN);
-                           } else if (type.isAssignableFrom(byte.class)) {
-                              args[i].setType(args[i].getType() | ARG_PRIMITIVE);
-                              args[i].setType(args[i].getType() | ARG_BYTE);
-                           } else if (type.isAssignableFrom(char.class)) {
-                              args[i].setType(args[i].getType() | ARG_PRIMITIVE);
-                              args[i].setType(args[i].getType() | ARG_CHAR);
-                           } else if (type.isAssignableFrom(short.class)) {
-                              args[i].setType(args[i].getType() | ARG_PRIMITIVE);
-                              args[i].setType(args[i].getType() | ARG_SHORT);
                            }
-                           // System.out.printf("in execute, arg %d %s %08x\n", i,args[i].name,args[i].type );
-                        } catch (final IllegalArgumentException e) {
-                           e.printStackTrace();
+                        } else if (type.isAssignableFrom(float.class)) {
+                           args[i].setType(args[i].getType() | ARG_PRIMITIVE);
+                           args[i].setType(args[i].getType() | ARG_FLOAT);
+                        } else if (type.isAssignableFrom(int.class)) {
+                           args[i].setType(args[i].getType() | ARG_PRIMITIVE);
+                           args[i].setType(args[i].getType() | ARG_INT);
+                        } else if (type.isAssignableFrom(double.class)) {
+                           args[i].setType(args[i].getType() | ARG_PRIMITIVE);
+                           args[i].setType(args[i].getType() | ARG_DOUBLE);
+                        } else if (type.isAssignableFrom(long.class)) {
+                           args[i].setType(args[i].getType() | ARG_PRIMITIVE);
+                           args[i].setType(args[i].getType() | ARG_LONG);
+                        } else if (type.isAssignableFrom(boolean.class)) {
+                           args[i].setType(args[i].getType() | ARG_PRIMITIVE);
+                           args[i].setType(args[i].getType() | ARG_BOOLEAN);
+                        } else if (type.isAssignableFrom(byte.class)) {
+                           args[i].setType(args[i].getType() | ARG_PRIMITIVE);
+                           args[i].setType(args[i].getType() | ARG_BYTE);
+                        } else if (type.isAssignableFrom(char.class)) {
+                           args[i].setType(args[i].getType() | ARG_PRIMITIVE);
+                           args[i].setType(args[i].getType() | ARG_CHAR);
+                        } else if (type.isAssignableFrom(short.class)) {
+                           args[i].setType(args[i].getType() | ARG_PRIMITIVE);
+                           args[i].setType(args[i].getType() | ARG_SHORT);
                         }
+                        // System.out.printf("in execute, arg %d %s %08x\n", i,args[i].name,args[i].type );
+                     } catch (final IllegalArgumentException e) {
+                        e.printStackTrace();
+                     }
 
-                        args[i].setPrimitiveSize(getPrimitiveSize(args[i].getType()));
-
-                        if (logger.isLoggable(Level.FINE)) {
-                           logger.fine("arg " + i + ", " + args[i].getName() + ", type=" + Integer.toHexString(args[i].getType())
-                                 + ", primitiveSize=" + args[i].getPrimitiveSize());
-                        }
+                     args[i].setPrimitiveSize(getPrimitiveSize(args[i].getType()));
 
-                        i++;
+                     if (logger.isLoggable(Level.FINE)) {
+                        logger.fine("arg " + i + ", " + args[i].getName() + ", type=" + Integer.toHexString(args[i].getType())
+                              + ", primitiveSize=" + args[i].getPrimitiveSize());
                      }
 
-                     // at this point, i = the actual used number of arguments
-                     // (private buffers do not get treated as arguments)
-
-                     argc = i;
+                     i++;
+                  }
 
-                     setArgsJNI(jniContextHandle, args, argc);
+                  // at this point, i = the actual used number of arguments
+                  // (private buffers do not get treated as arguments)
 
-                     conversionTime = System.currentTimeMillis() - executeStartTime;
+                  argc = i;
 
-                     try {
-                        executeOpenCL(_entrypointName, _range, _passes);
-                        isFallBack = false;
-                     } catch (final AparapiException e) {
-                        warnFallBackAndExecute(_entrypointName, _range, _passes, e);
-                     }
-                  } else { // (entryPoint != null) && !entryPoint.shouldFallback()
-                     warnFallBackAndExecute(_entrypointName, _range, _passes, "failed to locate entrypoint");
-                  }
-               } else { // (entryPoint == null) || (isFallBack)
+                  setArgsJNI(jniContextHandle, args, argc);
+                  _settings.profile.onEvent(ProfilingEvent.PREPARE_EXECUTE);
                   try {
-                     executeOpenCL(_entrypointName, _range, _passes);
+                     executeOpenCL(_settings);
                      isFallBack = false;
                   } catch (final AparapiException e) {
-                     warnFallBackAndExecute(_entrypointName, _range, _passes, e);
+                     fallBackToNextDevice(_settings, e);
                   }
+               } else { // (entryPoint != null) && !entryPoint.shouldFallback()
+                  fallBackToNextDevice(_settings, "failed to locate entrypoint");
+               }
+            } else { // (entryPoint == null) || (isFallBack)
+               try {
+                  executeOpenCL(_settings);
+                  isFallBack = false;
+               } catch (final AparapiException e) {
+                  fallBackToNextDevice(_settings, e);
                }
-            } else { // (device == null) || (device instanceof OpenCLDevice)
-               warnFallBackAndExecute(_entrypointName, _range, _passes,
-                     "OpenCL was requested but Device supplied was not an OpenCLDevice");
             }
-         } else { // kernel.getExecutionMode().isOpenCL()
-            executeJava(_range, _passes);
+         } else { // isOpenCL
+            if (!(device instanceof JavaDevice)) {
+               fallBackToNextDevice(_settings, "Non-OpenCL Kernel.EXECUTION_MODE requested but device is not a JavaDevice ");
+            }
+            executeJava(_settings, (JavaDevice) device);
          }
 
          if (Config.enableExecutionModeReporting) {
-            System.out.println(describeKernelClass() + ":" + kernel.getExecutionMode());
+            System.out.println("execution complete: " + kernel);
          }
 
-         executionTime = System.currentTimeMillis() - executeStartTime;
-         accumulatedExecutionTime += executionTime;
-
          return kernel;
-      } finally {
-         executing = false;
-         clearCancelMultiPass();
+      }
+      finally {
+         _settings.profile.onEvent(ProfilingEvent.EXECUTED);
+      }
+   }
+
+   @SuppressWarnings("deprecation") // NOTE(review): presumably required by the Kernel.EXECUTION_MODE API used below - confirm
+   private boolean isDeviceCompatible(Device device) { // does the device agree with the kernel's requested execution mode?
+      Kernel.EXECUTION_MODE mode = kernel.getExecutionMode();
+      if (mode != Kernel.EXECUTION_MODE.AUTO) {
+         switch (device.getType()) { // explicit mode: the device type must be the same-named mode
+            case GPU:
+               return mode == Kernel.EXECUTION_MODE.GPU;
+            case CPU:
+               return mode == Kernel.EXECUTION_MODE.CPU;
+            case JTP:
+               return mode == Kernel.EXECUTION_MODE.JTP;
+            case SEQ:
+               return mode == Kernel.EXECUTION_MODE.SEQ;
+            case ACC:
+               return mode == Kernel.EXECUTION_MODE.ACC;
+            default:
+               return false; // any other device type never satisfies an explicit mode
+         }
+      } else {
+         return (device == kernel.getTargetDevice()); // AUTO: only the kernel's own target device (identity comparison) qualifies
       }
    }
 
@@ -1394,14 +1498,11 @@ public class KernelRunner extends KernelRunnerJNI{
       if (!executing) {
          return PASS_ID_COMPLETED_EXECUTION;
       }
-      switch (kernel.getExecutionMode()) {
-         case NONE:
-            return PASS_ID_COMPLETED_EXECUTION;
-         case JTP: // fallthrough
-         case SEQ:
-            return getCurrentPassLocal();
-         default:
-            return getCurrentPassRemote();
+
+      if (kernel.isRunningCL()) {
+         return getCurrentPassRemote();
+      } else {
+         return getCurrentPassLocal();
       }
    }
 
@@ -1520,17 +1621,14 @@ public class KernelRunner extends KernelRunnerJNI{
     * @see Kernel#get(boolean[] arr)
     */
    public void get(Object array) {
-      if (explicit
-            && ((kernel.getExecutionMode() == Kernel.EXECUTION_MODE.GPU)
-                  || (kernel.getExecutionMode() == Kernel.EXECUTION_MODE.ACC) || (kernel.getExecutionMode() == Kernel.EXECUTION_MODE.CPU))) {
+      if (explicit && (kernel.isRunningCL())) {
          // Only makes sense when we are using OpenCL
          getJNI(jniContextHandle, array);
       }
    }
 
    public List<ProfileInfo> getProfileInfo() {
-      if (((kernel.getExecutionMode() == Kernel.EXECUTION_MODE.GPU) || (kernel.getExecutionMode() == Kernel.EXECUTION_MODE.ACC) || (kernel
-            .getExecutionMode() == Kernel.EXECUTION_MODE.CPU))) {
+      if (kernel.isRunningCL()) { // as before this change, profile info is not gated on explicit buffer management
          // Only makes sense when we are using OpenCL
          return (getProfileInfoJNI(jniContextHandle));
       } else {
@@ -1554,9 +1652,7 @@ public class KernelRunner extends KernelRunnerJNI{
     */
 
    public void put(Object array) {
-      if (explicit
-            && ((kernel.getExecutionMode() == Kernel.EXECUTION_MODE.GPU)
-                  || (kernel.getExecutionMode() == Kernel.EXECUTION_MODE.ACC) || (kernel.getExecutionMode() == Kernel.EXECUTION_MODE.CPU))) {
+      if (explicit && (kernel.isRunningCL())) {
          // Only makes sense when we are using OpenCL
          puts.add(array);
       }
@@ -1572,33 +1668,33 @@ public class KernelRunner extends KernelRunnerJNI{
       return (explicit);
    }
 
-   /**
-    * Determine the time taken to convert bytecode to OpenCL for first Kernel.execute(range) call.
-    * 
-    * @return The time spent preparing the kernel for execution using GPU
-    * 
-    */
-   public long getConversionTime() {
-      return conversionTime;
-   }
-
-   /**
-    * Determine the execution time of the previous Kernel.execute(range) call.
-    * 
-    * @return The time spent executing the kernel (ms)
-    * 
-    */
-   public long getExecutionTime() {
-      return executionTime;
-   }
+   private static class ExecutionSettings { // holder for the six fields below; NOTE(review): presumably the type of the runner's "_settings" argument - confirm
+      final KernelPreferences preferences;
+      final KernelProfile profile;
+      final String entrypoint;
+      Range range; // the only field that is not final; NOTE(review): presumably reassigned somewhere outside this class - confirm
+      final int passes;
+      final boolean legacyExecutionMode;
+
+      private ExecutionSettings(KernelPreferences preferences, KernelProfile profile, String entrypoint, Range range, int passes, boolean legacyExecutionMode) { // plain field-by-field initialisation, no validation
+         this.preferences = preferences;
+         this.profile = profile;
+         this.entrypoint = entrypoint;
+         this.range = range;
+         this.passes = passes;
+         this.legacyExecutionMode = legacyExecutionMode;
+      }
 
-   /**
-    * Determine the accumulated execution time of all previous Kernel.execute(range) calls.
-    * 
-    * @return The accumulated time spent executing this kernel (ms)
-    * 
-    */
-   public long getAccumulatedExecutionTime() {
-      return accumulatedExecutionTime;
+      @Override
+      public String toString() { // renders every field as name=value; entrypoint is wrapped in single quotes
+         return "ExecutionSettings{" +
+                 "preferences=" + preferences +
+                 ", profile=" + profile +
+                 ", entrypoint='" + entrypoint + '\'' +
+                 ", range=" + range +
+                 ", passes=" + passes +
+                 ", legacyExecutionMode=" + legacyExecutionMode +
+                 '}';
+      }
    }
 }
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/ProfilingEvent.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/ProfilingEvent.java
new file mode 100644
index 0000000000000000000000000000000000000000..77959b65cc75208325ffbafd5e954c6499aa07cd
--- /dev/null
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/ProfilingEvent.java
@@ -0,0 +1,8 @@
+package com.amd.aparapi.internal.kernel;
+
+/**
+ * Events passed to a profile's onEvent(), e.g. PREPARE_EXECUTE before and EXECUTED after an execution attempt. Created by Barney on 02/09/2015.
+ */
+public enum ProfilingEvent {
+   START, CLASS_MODEL_BUILT, OPENCL_GENERATED, OPENCL_COMPILED, PREPARE_EXECUTE, EXECUTED
+}
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ClassModel.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ClassModel.java
index e4728c5e892f769305fcec0f8b29878a1aecbd41..132f4f21ae49d9371e19914bfc03805f5aceb880 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ClassModel.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ClassModel.java
@@ -45,6 +45,7 @@ import com.amd.aparapi.internal.model.ValueCache.ThrowingValueComputer;
 import com.amd.aparapi.internal.model.ClassModel.AttributePool.*;
 import com.amd.aparapi.internal.model.ClassModel.ConstantPool.*;
 import com.amd.aparapi.internal.reader.*;
+import com.amd.aparapi.internal.util.*;
 
 import java.io.*;
 import java.lang.reflect.*;
@@ -2629,7 +2630,7 @@ public class ClassModel{
          methods.add(method);
       }
 
-      attributePool = new AttributePool(byteReader, getClassWeAreModelling().getSimpleName());
+      attributePool = new AttributePool(byteReader, Reflection.getSimpleName(getClassWeAreModelling()));
    }
 
    public int getMagic() {
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Entrypoint.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Entrypoint.java
index 7ae155efa905a1dca6cd88f39931977a6ea9317a..974dac64adfec1c2ba8ca681c3576e6ccad28fda 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Entrypoint.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Entrypoint.java
@@ -62,8 +62,6 @@ public class Entrypoint implements Cloneable {
 
    private Object kernelInstance = null;
 
-   private final boolean fallback = false;
-
    private final Set<String> referencedFieldNames = new LinkedHashSet<String>();
 
    private final Set<String> arrayFieldAssignments = new LinkedHashSet<String>();
@@ -474,7 +472,7 @@ public class Entrypoint implements Cloneable {
 
       // methodMap now contains a list of method called by run itself().
       // Walk the whole graph of called methods and add them to the methodMap
-      while (!fallback && discovered) {
+      while (discovered) {
          discovered = false;
          for (final MethodModel mm : new ArrayList<MethodModel>(methodMap.values())) {
             for (final MethodCall methodCall : mm.getMethodCalls()) {
@@ -506,295 +504,288 @@ public class Entrypoint implements Cloneable {
 
       methodModel.checkForRecursion(new HashSet<MethodModel>());
 
-      if (logger.isLoggable(Level.FINE)) {
-         logger.fine("fallback=" + fallback);
-      }
-
-      if (!fallback) {
-         calledMethods.addAll(methodMap.values());
-         Collections.reverse(calledMethods);
-         final List<MethodModel> methods = new ArrayList<MethodModel>(calledMethods);
+      calledMethods.addAll(methodMap.values());
+      Collections.reverse(calledMethods);
+      final List<MethodModel> methods = new ArrayList<MethodModel>(calledMethods);
 
-         // add method to the calledMethods so we can include in this list
-         methods.add(methodModel);
-         final Set<String> fieldAssignments = new HashSet<String>();
+      // add method to the calledMethods so we can include in this list
+      methods.add(methodModel);
+      final Set<String> fieldAssignments = new HashSet<String>();
 
-         final Set<String> fieldAccesses = new HashSet<String>();
+      final Set<String> fieldAccesses = new HashSet<String>();
 
-         for (final MethodModel methodModel : methods) {
-
-            // Record which pragmas we need to enable
-            if (methodModel.requiresDoublePragma()) {
-               usesDoubles = true;
-               if (logger.isLoggable(Level.FINE)) {
-                  logger.fine("Enabling doubles on " + methodModel.getName());
-               }
+      for (final MethodModel methodModel : methods) {
 
+         // Record which pragmas we need to enable
+         if (methodModel.requiresDoublePragma()) {
+            usesDoubles = true;
+            if (logger.isLoggable(Level.FINE)) {
+               logger.fine("Enabling doubles on " + methodModel.getName());
             }
-            if (methodModel.requiresByteAddressableStorePragma()) {
-               usesByteWrites = true;
-               if (logger.isLoggable(Level.FINE)) {
-                  logger.fine("Enabling byte addressable on " + methodModel.getName());
-               }
+
+         }
+         if (methodModel.requiresByteAddressableStorePragma()) {
+            usesByteWrites = true;
+            if (logger.isLoggable(Level.FINE)) {
+               logger.fine("Enabling byte addressable on " + methodModel.getName());
             }
+         }
 
-            for (Instruction instruction = methodModel.getPCHead(); instruction != null; instruction = instruction.getNextPC()) {
+         for (Instruction instruction = methodModel.getPCHead(); instruction != null; instruction = instruction.getNextPC()) {
 
-               if (instruction instanceof AssignToArrayElement) {
-                  final AssignToArrayElement assignment = (AssignToArrayElement) instruction;
+            if (instruction instanceof AssignToArrayElement) {
+               final AssignToArrayElement assignment = (AssignToArrayElement) instruction;
 
-                  final Instruction arrayRef = assignment.getArrayRef();
-                  // AccessField here allows instance and static array refs
-                  if (arrayRef instanceof I_GETFIELD) {
-                     final I_GETFIELD getField = (I_GETFIELD) arrayRef;
-                     final FieldEntry field = getField.getConstantPoolFieldEntry();
-                     final String assignedArrayFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8();
-                     arrayFieldAssignments.add(assignedArrayFieldName);
-                     referencedFieldNames.add(assignedArrayFieldName);
+               final Instruction arrayRef = assignment.getArrayRef();
+               // AccessField here allows instance and static array refs
+               if (arrayRef instanceof I_GETFIELD) {
+                  final I_GETFIELD getField = (I_GETFIELD) arrayRef;
+                  final FieldEntry field = getField.getConstantPoolFieldEntry();
+                  final String assignedArrayFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8();
+                  arrayFieldAssignments.add(assignedArrayFieldName);
+                  referencedFieldNames.add(assignedArrayFieldName);
 
-                  }
-               } else if (instruction instanceof AccessArrayElement) {
-                  final AccessArrayElement access = (AccessArrayElement) instruction;
-
-                  final Instruction arrayRef = access.getArrayRef();
-                  // AccessField here allows instance and static array refs
-                  if (arrayRef instanceof I_GETFIELD) {
-                     final I_GETFIELD getField = (I_GETFIELD) arrayRef;
-                     final FieldEntry field = getField.getConstantPoolFieldEntry();
-                     final String accessedArrayFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8();
-                     arrayFieldAccesses.add(accessedArrayFieldName);
-                     referencedFieldNames.add(accessedArrayFieldName);
+               }
+            } else if (instruction instanceof AccessArrayElement) {
+               final AccessArrayElement access = (AccessArrayElement) instruction;
+
+               final Instruction arrayRef = access.getArrayRef();
+               // AccessField here allows instance and static array refs
+               if (arrayRef instanceof I_GETFIELD) {
+                  final I_GETFIELD getField = (I_GETFIELD) arrayRef;
+                  final FieldEntry field = getField.getConstantPoolFieldEntry();
+                  final String accessedArrayFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8();
+                  arrayFieldAccesses.add(accessedArrayFieldName);
+                  referencedFieldNames.add(accessedArrayFieldName);
 
-                  }
-               } else if (instruction instanceof I_ARRAYLENGTH) {
-                  Instruction child = instruction.getFirstChild();
-                  while(child instanceof I_AALOAD) {
-                     child = child.getFirstChild();
-                  }
-                  if (!(child instanceof AccessField)) {
-                     throw new ClassParseException(ClassParseException.TYPE.LOCALARRAYLENGTHACCESS);
-                  }
-                  final AccessField childField = (AccessField) child;
-                  final String arrayName = childField.getConstantPoolFieldEntry().getNameAndTypeEntry().getNameUTF8Entry().getUTF8();
-                  arrayFieldArrayLengthUsed.add(arrayName);
-                  if (logger.isLoggable(Level.FINE)) {
-                     logger.fine("Noted arraylength in " + methodModel.getName() + " on " + arrayName);
-                  }
-               } else if (instruction instanceof AccessField) {
-                  final AccessField access = (AccessField) instruction;
-                  final FieldEntry field = access.getConstantPoolFieldEntry();
-                  final String accessedFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8();
-                  fieldAccesses.add(accessedFieldName);
-                  referencedFieldNames.add(accessedFieldName);
-
-                  final String signature = field.getNameAndTypeEntry().getDescriptorUTF8Entry().getUTF8();
-                  if (logger.isLoggable(Level.FINE)) {
-                     logger.fine("AccessField field type= " + signature + " in " + methodModel.getName());
-                  }
+               }
+            } else if (instruction instanceof I_ARRAYLENGTH) {
+               Instruction child = instruction.getFirstChild();
+               while(child instanceof I_AALOAD) {
+                  child = child.getFirstChild();
+               }
+               if (!(child instanceof AccessField)) {
+                  throw new ClassParseException(ClassParseException.TYPE.LOCALARRAYLENGTHACCESS);
+               }
+               final AccessField childField = (AccessField) child;
+               final String arrayName = childField.getConstantPoolFieldEntry().getNameAndTypeEntry().getNameUTF8Entry().getUTF8();
+               arrayFieldArrayLengthUsed.add(arrayName);
+               if (logger.isLoggable(Level.FINE)) {
+                  logger.fine("Noted arraylength in " + methodModel.getName() + " on " + arrayName);
+               }
+            } else if (instruction instanceof AccessField) {
+               final AccessField access = (AccessField) instruction;
+               final FieldEntry field = access.getConstantPoolFieldEntry();
+               final String accessedFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8();
+               fieldAccesses.add(accessedFieldName);
+               referencedFieldNames.add(accessedFieldName);
+
+               final String signature = field.getNameAndTypeEntry().getDescriptorUTF8Entry().getUTF8();
+               if (logger.isLoggable(Level.FINE)) {
+                  logger.fine("AccessField field type= " + signature + " in " + methodModel.getName());
+               }
 
-                  // Add the class model for the referenced obj array
-                  if (signature.startsWith("[L")) {
-                     // Turn [Lcom/amd/javalabs/opencl/demo/DummyOOA; into com.amd.javalabs.opencl.demo.DummyOOA for example
-                     final String className = (signature.substring(2, signature.length() - 1)).replace('/', '.');
-                     final ClassModel arrayFieldModel = getOrUpdateAllClassAccesses(className);
-                     if (arrayFieldModel != null) {
-                        final Class<?> memberClass = arrayFieldModel.getClassWeAreModelling();
-                        final int modifiers = memberClass.getModifiers();
-                        if (!Modifier.isFinal(modifiers)) {
-                           throw new ClassParseException(ClassParseException.TYPE.ACCESSEDOBJECTNONFINAL);
-                        }
+               // Add the class model for the referenced obj array
+               if (signature.startsWith("[L")) {
+                  // Turn [Lcom/amd/javalabs/opencl/demo/DummyOOA; into com.amd.javalabs.opencl.demo.DummyOOA for example
+                  final String className = (signature.substring(2, signature.length() - 1)).replace('/', '.');
+                  final ClassModel arrayFieldModel = getOrUpdateAllClassAccesses(className);
+                  if (arrayFieldModel != null) {
+                     final Class<?> memberClass = arrayFieldModel.getClassWeAreModelling();
+                     final int modifiers = memberClass.getModifiers();
+                     if (!Modifier.isFinal(modifiers)) {
+                        throw new ClassParseException(ClassParseException.TYPE.ACCESSEDOBJECTNONFINAL);
+                     }
 
-                        final ClassModel refModel = objectArrayFieldsClasses.get(className);
-                        if (refModel == null) {
-
-                           // Verify no other member with common parent
-                           for (final ClassModel memberObjClass : objectArrayFieldsClasses.values()) {
-                              ClassModel superModel = memberObjClass;
-                              while (superModel != null) {
-                                 if (superModel.isSuperClass(memberClass)) {
-                                    throw new ClassParseException(ClassParseException.TYPE.ACCESSEDOBJECTFIELDNAMECONFLICT);
-                                 }
-                                 superModel = superModel.getSuperClazz();
+                     final ClassModel refModel = objectArrayFieldsClasses.get(className);
+                     if (refModel == null) {
+
+                        // Verify no other member with common parent
+                        for (final ClassModel memberObjClass : objectArrayFieldsClasses.values()) {
+                           ClassModel superModel = memberObjClass;
+                           while (superModel != null) {
+                              if (superModel.isSuperClass(memberClass)) {
+                                 throw new ClassParseException(ClassParseException.TYPE.ACCESSEDOBJECTFIELDNAMECONFLICT);
                               }
+                              superModel = superModel.getSuperClazz();
                            }
+                        }
 
-                           objectArrayFieldsClasses.put(className, arrayFieldModel);
-                           if (logger.isLoggable(Level.FINE)) {
-                              logger.fine("adding class to objectArrayFields: " + className);
-                           }
+                        objectArrayFieldsClasses.put(className, arrayFieldModel);
+                        if (logger.isLoggable(Level.FINE)) {
+                           logger.fine("adding class to objectArrayFields: " + className);
                         }
                      }
-                  } else {
-                     final String className = (field.getClassEntry().getNameUTF8Entry().getUTF8()).replace('/', '.');
-                     // Look for object data member access
-                     if (!className.equals(getClassModel().getClassWeAreModelling().getName())
-                           && (getFieldFromClassHierarchy(getClassModel().getClassWeAreModelling(), accessedFieldName) == null)) {
-                        updateObjectMemberFieldAccesses(className, field);
-                     }
                   }
-
-               } else if (instruction instanceof AssignToField) {
-                  final AssignToField assignment = (AssignToField) instruction;
-                  final FieldEntry field = assignment.getConstantPoolFieldEntry();
-                  final String assignedFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8();
-                  fieldAssignments.add(assignedFieldName);
-                  referencedFieldNames.add(assignedFieldName);
-
+               } else {
                   final String className = (field.getClassEntry().getNameUTF8Entry().getUTF8()).replace('/', '.');
                   // Look for object data member access
                   if (!className.equals(getClassModel().getClassWeAreModelling().getName())
-                        && (getFieldFromClassHierarchy(getClassModel().getClassWeAreModelling(), assignedFieldName) == null)) {
+                        && (getFieldFromClassHierarchy(getClassModel().getClassWeAreModelling(), accessedFieldName) == null)) {
                      updateObjectMemberFieldAccesses(className, field);
-                  } else {
+                  }
+               }
 
-                     if ((!Config.enablePUTFIELD) && methodModel.methodUsesPutfield() && !methodModel.isSetter()) {
-                        throw new ClassParseException(ClassParseException.TYPE.ACCESSEDOBJECTONLYSUPPORTSSIMPLEPUTFIELD);
-                     }
+            } else if (instruction instanceof AssignToField) {
+               final AssignToField assignment = (AssignToField) instruction;
+               final FieldEntry field = assignment.getConstantPoolFieldEntry();
+               final String assignedFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8();
+               fieldAssignments.add(assignedFieldName);
+               referencedFieldNames.add(assignedFieldName);
+
+               final String className = (field.getClassEntry().getNameUTF8Entry().getUTF8()).replace('/', '.');
+               // Look for object data member access
+               if (!className.equals(getClassModel().getClassWeAreModelling().getName())
+                     && (getFieldFromClassHierarchy(getClassModel().getClassWeAreModelling(), assignedFieldName) == null)) {
+                  updateObjectMemberFieldAccesses(className, field);
+               } else {
 
+                  if ((!Config.enablePUTFIELD) && methodModel.methodUsesPutfield() && !methodModel.isSetter()) {
+                     throw new ClassParseException(ClassParseException.TYPE.ACCESSEDOBJECTONLYSUPPORTSSIMPLEPUTFIELD);
                   }
 
                }
-               else if (instruction instanceof I_INVOKEVIRTUAL) {
-                  final I_INVOKEVIRTUAL invokeInstruction = (I_INVOKEVIRTUAL) instruction;
-                  MethodModel invokedMethod = invokeInstruction.getMethod();
-                  FieldEntry getterField = getSimpleGetterField(invokedMethod);
-                  if (getterField != null) {
-                     referencedFieldNames.add(getterField.getNameAndTypeEntry().getNameUTF8Entry().getUTF8());
-                  }
-                  else {
-                     final MethodEntry methodEntry = invokeInstruction.getConstantPoolMethodEntry();
-                     if (Kernel.isMappedMethod(methodEntry)) { //only do this for intrinsics
 
-                        if (Kernel.usesAtomic32(methodEntry)) {
-                           setRequiresAtomics32Pragma(true);
-                        }
+            }
+            else if (instruction instanceof I_INVOKEVIRTUAL) {
+               final I_INVOKEVIRTUAL invokeInstruction = (I_INVOKEVIRTUAL) instruction;
+               MethodModel invokedMethod = invokeInstruction.getMethod();
+               FieldEntry getterField = getSimpleGetterField(invokedMethod);
+               if (getterField != null) {
+                  referencedFieldNames.add(getterField.getNameAndTypeEntry().getNameUTF8Entry().getUTF8());
+               }
+               else {
+                  final MethodEntry methodEntry = invokeInstruction.getConstantPoolMethodEntry();
+                  if (Kernel.isMappedMethod(methodEntry)) { //only do this for intrinsics
 
-                        final Arg methodArgs[] = methodEntry.getArgs();
-                        if ((methodArgs.length > 0) && methodArgs[0].isArray()) { //currently array arg can only take slot 0
-                           final Instruction arrInstruction = invokeInstruction.getArg(0);
-                           if (arrInstruction instanceof AccessField) {
-                              final AccessField access = (AccessField) arrInstruction;
-                              final FieldEntry field = access.getConstantPoolFieldEntry();
-                              final String accessedFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8();
-                              arrayFieldAssignments.add(accessedFieldName);
-                              referencedFieldNames.add(accessedFieldName);
-                           }
-                           else {
-                              throw new ClassParseException(ClassParseException.TYPE.ACCESSEDOBJECTSETTERARRAY);
-                           }
-                        }
+                     if (Kernel.usesAtomic32(methodEntry)) {
+                        setRequiresAtomics32Pragma(true);
                      }
 
+                     final Arg methodArgs[] = methodEntry.getArgs();
+                     if ((methodArgs.length > 0) && methodArgs[0].isArray()) { //currently array arg can only take slot 0
+                        final Instruction arrInstruction = invokeInstruction.getArg(0);
+                        if (arrInstruction instanceof AccessField) {
+                           final AccessField access = (AccessField) arrInstruction;
+                           final FieldEntry field = access.getConstantPoolFieldEntry();
+                           final String accessedFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8();
+                           arrayFieldAssignments.add(accessedFieldName);
+                           referencedFieldNames.add(accessedFieldName);
+                        }
+                        else {
+                           throw new ClassParseException(ClassParseException.TYPE.ACCESSEDOBJECTSETTERARRAY);
+                        }
+                     }
                   }
+
                }
             }
          }
+      }
 
-         for (final String referencedFieldName : referencedFieldNames) {
+      for (final String referencedFieldName : referencedFieldNames) {
 
-            try {
-               final Class<?> clazz = classModel.getClassWeAreModelling();
-               final Field field = getFieldFromClassHierarchy(clazz, referencedFieldName);
-               if (field != null) {
-                  referencedFields.add(field);
-                  final ClassModelField ff = classModel.getField(referencedFieldName);
-                  assert ff != null : "ff should not be null for " + clazz.getName() + "." + referencedFieldName;
-                  referencedClassModelFields.add(ff);
-               }
-            } catch (final SecurityException e) {
-               e.printStackTrace();
+         try {
+            final Class<?> clazz = classModel.getClassWeAreModelling();
+            final Field field = getFieldFromClassHierarchy(clazz, referencedFieldName);
+            if (field != null) {
+               referencedFields.add(field);
+               final ClassModelField ff = classModel.getField(referencedFieldName);
+               assert ff != null : "ff should not be null for " + clazz.getName() + "." + referencedFieldName;
+               referencedClassModelFields.add(ff);
             }
+         } catch (final SecurityException e) {
+            e.printStackTrace();
          }
+      }
 
-         // Build data needed for oop form transforms if necessary
-         if (!objectArrayFieldsClasses.keySet().isEmpty()) {
+      // Build data needed for oop form transforms if necessary
+      if (!objectArrayFieldsClasses.keySet().isEmpty()) {
 
-            for (final ClassModel memberObjClass : objectArrayFieldsClasses.values()) {
+         for (final ClassModel memberObjClass : objectArrayFieldsClasses.values()) {
 
-               // At this point we have already done the field override safety check, so 
-               // add all the superclass fields into the kernel member class to be
-               // sorted by size and emitted into the struct
-               ClassModel superModel = memberObjClass.getSuperClazz();
-               while (superModel != null) {
-                  if (logger.isLoggable(Level.FINEST)) {
-                     logger.finest("adding = " + superModel.getClassWeAreModelling().getName() + " fields into "
-                           + memberObjClass.getClassWeAreModelling().getName());
-                  }
-                  memberObjClass.getStructMembers().addAll(superModel.getStructMembers());
-                  superModel = superModel.getSuperClazz();
+            // At this point we have already done the field override safety check, so
+            // add all the superclass fields into the kernel member class to be
+            // sorted by size and emitted into the struct
+            ClassModel superModel = memberObjClass.getSuperClazz();
+            while (superModel != null) {
+               if (logger.isLoggable(Level.FINEST)) {
+                  logger.finest("adding = " + superModel.getClassWeAreModelling().getName() + " fields into "
+                        + memberObjClass.getClassWeAreModelling().getName());
                }
+               memberObjClass.getStructMembers().addAll(superModel.getStructMembers());
+               superModel = superModel.getSuperClazz();
             }
+         }
 
-            // Sort fields of each class biggest->smallest
-            final Comparator<FieldEntry> fieldSizeComparator = new Comparator<FieldEntry>(){
-               @Override public int compare(FieldEntry aa, FieldEntry bb) {
-                  final String aType = aa.getNameAndTypeEntry().getDescriptorUTF8Entry().getUTF8();
-                  final String bType = bb.getNameAndTypeEntry().getDescriptorUTF8Entry().getUTF8();
-
-                  // Booleans get converted down to bytes
-                  final int aSize = InstructionSet.TypeSpec.valueOf(aType.equals("Z") ? "B" : aType).getSize();
-                  final int bSize = InstructionSet.TypeSpec.valueOf(bType.equals("Z") ? "B" : bType).getSize();
+         // Sort fields of each class biggest->smallest
+         final Comparator<FieldEntry> fieldSizeComparator = new Comparator<FieldEntry>(){
+            @Override public int compare(FieldEntry aa, FieldEntry bb) {
+               final String aType = aa.getNameAndTypeEntry().getDescriptorUTF8Entry().getUTF8();
+               final String bType = bb.getNameAndTypeEntry().getDescriptorUTF8Entry().getUTF8();
 
-                  if (logger.isLoggable(Level.FINEST)) {
-                     logger.finest("aType= " + aType + " aSize= " + aSize + " . . bType= " + bType + " bSize= " + bSize);
-                  }
+               // Booleans get converted down to bytes
+               final int aSize = InstructionSet.TypeSpec.valueOf(aType.equals("Z") ? "B" : aType).getSize();
+               final int bSize = InstructionSet.TypeSpec.valueOf(bType.equals("Z") ? "B" : bType).getSize();
 
-                  // Note this is sorting in reverse order so the biggest is first
-                  if (aSize > bSize) {
-                     return -1;
-                  } else if (aSize == bSize) {
-                     return 0;
-                  } else {
-                     return 1;
-                  }
+               if (logger.isLoggable(Level.FINEST)) {
+                  logger.finest("aType= " + aType + " aSize= " + aSize + " . . bType= " + bType + " bSize= " + bSize);
                }
-            };
-
-            for (final ClassModel c : objectArrayFieldsClasses.values()) {
-               final ArrayList<FieldEntry> fields = c.getStructMembers();
-               if (fields.size() > 0) {
-                  Collections.sort(fields, fieldSizeComparator);
-
-                  // Now compute the total size for the struct
-                  int totalSize = 0;
-                  int alignTo = 0;
-
-                  for (final FieldEntry f : fields) {
-                     // Record field offset for use while copying
-                     // Get field we will copy out of the kernel member object
-                     final Field rfield = getFieldFromClassHierarchy(c.getClassWeAreModelling(), f.getNameAndTypeEntry()
-                           .getNameUTF8Entry().getUTF8());
-
-                     c.getStructMemberOffsets().add(UnsafeWrapper.objectFieldOffset(rfield));
-
-                     final String fType = f.getNameAndTypeEntry().getDescriptorUTF8Entry().getUTF8();
-                     //c.getStructMemberTypes().add(TypeSpec.valueOf(fType.equals("Z") ? "B" : fType));
-                     c.getStructMemberTypes().add(TypeSpec.valueOf(fType));
-                     final int fSize = TypeSpec.valueOf(fType.equals("Z") ? "B" : fType).getSize();
-                     if (fSize > alignTo) {
-                        alignTo = fSize;
-                     }
 
-                     totalSize += fSize;
-                     if (logger.isLoggable(Level.FINEST)) {
-                        logger.finest("Field = " + f.getNameAndTypeEntry().getNameUTF8Entry().getUTF8() + " size=" + fSize
-                              + " totalSize=" + totalSize);
-                     }
+               // Note this is sorting in reverse order so the biggest is first
+               if (aSize > bSize) {
+                  return -1;
+               } else if (aSize == bSize) {
+                  return 0;
+               } else {
+                  return 1;
+               }
+            }
+         };
+
+         for (final ClassModel c : objectArrayFieldsClasses.values()) {
+            final ArrayList<FieldEntry> fields = c.getStructMembers();
+            if (fields.size() > 0) {
+               Collections.sort(fields, fieldSizeComparator);
+
+               // Now compute the total size for the struct
+               int totalSize = 0;
+               int alignTo = 0;
+
+               for (final FieldEntry f : fields) {
+                  // Record field offset for use while copying
+                  // Get field we will copy out of the kernel member object
+                  final Field rfield = getFieldFromClassHierarchy(c.getClassWeAreModelling(), f.getNameAndTypeEntry()
+                        .getNameUTF8Entry().getUTF8());
+
+                  c.getStructMemberOffsets().add(UnsafeWrapper.objectFieldOffset(rfield));
+
+                  final String fType = f.getNameAndTypeEntry().getDescriptorUTF8Entry().getUTF8();
+                  //c.getStructMemberTypes().add(TypeSpec.valueOf(fType.equals("Z") ? "B" : fType));
+                  c.getStructMemberTypes().add(TypeSpec.valueOf(fType));
+                  final int fSize = TypeSpec.valueOf(fType.equals("Z") ? "B" : fType).getSize();
+                  if (fSize > alignTo) {
+                     alignTo = fSize;
                   }
 
-                  // compute total size for OpenCL buffer
-                  int totalStructSize = 0;
-                  if ((totalSize % alignTo) == 0) {
-                     totalStructSize = totalSize;
-                  } else {
-                     // Pad up if necessary
-                     totalStructSize = ((totalSize / alignTo) + 1) * alignTo;
+                  totalSize += fSize;
+                  if (logger.isLoggable(Level.FINEST)) {
+                     logger.finest("Field = " + f.getNameAndTypeEntry().getNameUTF8Entry().getUTF8() + " size=" + fSize
+                           + " totalSize=" + totalSize);
                   }
-                  c.setTotalStructSize(totalStructSize);
                }
+
+               // compute total size for OpenCL buffer
+               int totalStructSize = 0;
+               if ((totalSize % alignTo) == 0) {
+                  totalStructSize = totalSize;
+               } else {
+                  // Pad up if necessary
+                  totalStructSize = ((totalSize / alignTo) + 1) * alignTo;
+               }
+               c.setTotalStructSize(totalStructSize);
             }
          }
-
       }
    }
 
@@ -807,10 +798,6 @@ public class Entrypoint implements Cloneable {
       return method.getAccessorVariableFieldEntry();
    }
 
-   public boolean shouldFallback() {
-      return (fallback);
-   }
-
    public List<ClassModel.ClassModelField> getReferencedClassModelFields() {
       return (referencedClassModelFields);
    }
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/opencl/OpenCLPlatform.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/opencl/OpenCLPlatform.java
index 12b52360ef683e9bf74d2bf9a5f2a2b73d2092c0..1f8321336f6999aec5fc7540f65d32ab07cef2bd 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/opencl/OpenCLPlatform.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/opencl/OpenCLPlatform.java
@@ -1,10 +1,9 @@
 package com.amd.aparapi.internal.opencl;
 
-import java.util.ArrayList;
-import java.util.List;
+import com.amd.aparapi.device.*;
+import com.amd.aparapi.internal.jni.*;
 
-import com.amd.aparapi.device.OpenCLDevice;
-import com.amd.aparapi.internal.jni.OpenCLJNI;
+import java.util.*;
 
 public class OpenCLPlatform extends OpenCLJNI{
 
@@ -18,6 +17,8 @@ public class OpenCLPlatform extends OpenCLJNI{
 
    private final List<OpenCLDevice> devices = new ArrayList<OpenCLDevice>();
 
+   private static List<OpenCLPlatform> platforms;
+
    /**
     * Default constructor
     */
@@ -51,11 +52,14 @@ public class OpenCLPlatform extends OpenCLJNI{
    }
 
    public List<OpenCLPlatform> getOpenCLPlatforms() {
-      if (OpenCLLoader.isOpenCLAvailable()) {
-         return (getPlatforms());
-      } else {
-         return (new ArrayList<OpenCLPlatform>());
+      if (platforms == null) { // static cache has not been populated yet
+         if (OpenCLLoader.isOpenCLAvailable()) {
+            platforms = getPlatforms();
+         } else {
+            return (Collections.<OpenCLPlatform> emptyList()); // typed (raw EMPTY_LIST is an unchecked conversion); nothing is cached on this path
+         }
       }
+      return platforms;
    }
 
    public String getName() {
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/util/Reflection.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/util/Reflection.java
new file mode 100644
index 0000000000000000000000000000000000000000..3f2ad65d866931cb04f9739020717f47ba15fc4f
--- /dev/null
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/util/Reflection.java
@@ -0,0 +1,18 @@
+package com.amd.aparapi.internal.util;
+
+/**
+ * Reflection-related helper. Created by Barney on 03/09/2015.
+ */
+public class Reflection {
+
+   /** Returns {@code klass.getSimpleName()}, or, when that is empty (anonymous inners), the part of {@code klass.getName()} after the last '.'. */
+   public static String getSimpleName(Class<?> klass) {
+      String simpleName = klass.getSimpleName();
+      if (simpleName.isEmpty()) {
+         String fullName = klass.getName();
+         int index = fullName.lastIndexOf('.');
+         simpleName = (index < 0) ? fullName : fullName.substring(index + 1);
+      }
+      return simpleName;
+   }
+}
diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/ConfigurationDemo.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/ConfigurationDemo.java
new file mode 100644
index 0000000000000000000000000000000000000000..2e52a6e1113f61d475f46a942800d15a1088d052
--- /dev/null
+++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/ConfigurationDemo.java
@@ -0,0 +1,82 @@
+package com.amd.aparapi.sample.configuration;
+
+import com.amd.aparapi.*;
+import com.amd.aparapi.internal.kernel.*;
+
+import java.util.*;
+
+/**
+ * Tests device selection via {@link com.amd.aparapi.internal.kernel.KernelManager}.
+ */
+public class ConfigurationDemo {
+   public static void main(String[] ignored) {
+      System.setProperty("com.amd.aparapi.dumpProfilesOnExit", "true"); // read into the static final Config.dumpProfilesOnExit, so set it before Config is initialized
+
+      StringBuilder report;
+
+      List<Integer> tests = Arrays.asList(0, 1, 2, 3);
+      int reps = 300;
+      for (int rep = 0; rep < reps; ++rep) {
+         runTests(rep == 0, tests); // banners are printed on the first rep only
+
+         if (rep % 100 == 99 || rep == 0) { // report after the first rep and after every 100th rep
+            report = new StringBuilder("rep = " + rep + "\n");
+            KernelManager.instance().reportDeviceUsage(report, true);
+            System.out.println(report);
+         }
+      }
+   }
+
+   private static void runTests(boolean verbose, List<Integer> testIndicesToRun) { // verbose: print a banner before each test; testIndicesToRun: which of tests 0-3 to run
+      final int globalSize = 1;
+      Kernel kernel;
+      if (testIndicesToRun.contains(0)) {
+         if (verbose) {
+            System.out.println();
+            System.out.println("Testing default KernelPreferences with kernel which cannot be run in OpenCL, with fallback algorithm");
+            System.out.println();
+         }
+         kernel = new KernelWithAlternateFallbackAlgorithm();
+         kernel.execute(globalSize);
+         kernel.dispose();
+      }
+
+      if (testIndicesToRun.contains(1)) {
+         if (verbose) {
+            System.out.println();
+            System.out.println("Testing default KernelPreferences with kernel which cannot be run in OpenCL, without fallback algorithm");
+            System.out.println();
+         }
+         kernel = new KernelWithoutAlternateFallbackAlgorithm();
+         kernel.execute(globalSize);
+         kernel.dispose();
+      }
+
+      if (testIndicesToRun.contains(2)) {
+         if (verbose) {
+            System.out.println();
+            System.out.println("Retesting previous case, should jump straight to regular java implementation without warnings");
+            System.out.println();
+         }
+         kernel = new KernelWithoutAlternateFallbackAlgorithm(); // fresh instance of the same kernel class as test 1
+         kernel.execute(globalSize);
+         kernel.dispose();
+      }
+
+      if (testIndicesToRun.contains(3)) {
+         if (verbose) {
+            System.out.println();
+            System.out.println("Testing default KernelPreferences with kernel which should be run in OpenCL");
+            System.out.println();
+         }
+         KernelOkayInOpenCL clKernel = new KernelOkayInOpenCL();
+         kernel = clKernel;
+         kernel.execute(clKernel.inChars.length); // global size = number of input characters
+         String result = new String(clKernel.outChars);
+         if (verbose) {
+            System.out.println("kernel output: " + result);
+         }
+         kernel.dispose();
+      }
+   }
+}
diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/CustomConfigurationDemo.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/CustomConfigurationDemo.java
new file mode 100644
index 0000000000000000000000000000000000000000..476737d42fe97c6d3bd6eee1fc5f78fc105ebaf5
--- /dev/null
+++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/CustomConfigurationDemo.java
@@ -0,0 +1,42 @@
+package com.amd.aparapi.sample.configuration;
+
+import com.amd.aparapi.device.*;
+import com.amd.aparapi.internal.kernel.*;
+
+import java.util.*;
+
+/**
+ * Runs {@code KernelOkayInOpenCL} under a custom {@code KernelManager} that overrides the preferred device types. Created by Barney on 31/08/2015.
+ */
+public class CustomConfigurationDemo {
+
+   public static void main(String[] ignored) {
+      System.setProperty("com.amd.aparapi.dumpProfilesOnExit", "true");
+      KernelManager manager = new KernelManager() {
+         @Override
+         protected List<Device.TYPE> getPreferredDeviceTypes() {
+            return Arrays.asList(Device.TYPE.CPU, Device.TYPE.ALT, Device.TYPE.JTP); // presumably listed in order of preference - confirm against KernelManager
+         }
+      };
+      KernelManager.setKernelManager(manager);
+
+      System.out.println("\nTesting custom KernelPreferences with kernel, preferences choose CPU");
+      KernelOkayInOpenCL kernel = new KernelOkayInOpenCL();
+      kernel.execute(kernel.inChars.length);
+      System.out.println(kernel.outChars); // println(char[]) prints the characters, not the array reference
+
+      System.out.println("\nTesting custom KernelPreferences with kernel, preferences specify CPU but kernel vetos CPU");
+      kernel = new KernelOkayInOpenCL() {
+         @Override
+         public boolean isAllowDevice(Device _device) {
+            return _device.getType() != Device.TYPE.CPU; // this instance accepts every device type except CPU
+         }
+      };
+      kernel.execute(kernel.inChars.length);
+      System.out.println(kernel.outChars); // println(char[]) prints the characters, not the array reference
+
+      StringBuilder report = new StringBuilder("\n");
+      KernelManager.instance().reportDeviceUsage(report, true);
+      System.out.println(report);
+   }
+}
diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelOkayInOpenCL.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelOkayInOpenCL.java
new file mode 100644
index 0000000000000000000000000000000000000000..6ed54e5b7ef47eab954c42a3e9df5a795de42566
--- /dev/null
+++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelOkayInOpenCL.java
@@ -0,0 +1,21 @@
+package com.amd.aparapi.sample.configuration;
+
+/**
+ * Kernel that copies {@code inChars} into {@code outChars}, one element per global id. Created by Barney on 24/08/2015.
+ */
+public class KernelOkayInOpenCL extends com.amd.aparapi.Kernel {
+   char[] inChars = "KernelOkayInOpenCL".toCharArray();
+   char[] outChars = new char[inChars.length]; // same length as inChars
+
+   @Override
+   public void run() {
+      int index = getGlobalId();
+      oops(); // NOTE(review): presumably @NoCL keeps this call out of the generated OpenCL, so the message only shows when run in Java - confirm
+      outChars[index] = inChars[index];
+   }
+
+   @NoCL
+   private void oops() {
+      System.out.println("Oops, running in kernel in Java");
+   }
+}
diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelWithAlternateFallbackAlgorithm.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelWithAlternateFallbackAlgorithm.java
new file mode 100644
index 0000000000000000000000000000000000000000..670e6a669193d05d017648f04515a439d9f0b8d1
--- /dev/null
+++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelWithAlternateFallbackAlgorithm.java
@@ -0,0 +1,24 @@
+package com.amd.aparapi.sample.configuration;
+
+import com.amd.aparapi.*;
+
+/**
+ * Kernel which will always fail to run on an OpenCLDevice but has an alternative fallback algorithm.
+ */
+public class KernelWithAlternateFallbackAlgorithm extends Kernel {
+   @Override
+   public void run() {
+      // deliberate: println is unsupported in OpenCL, so generating OpenCL for this kernel will fail
+      System.out.println("Running in Java (regular algorithm)");
+   }
+
+   @Override
+   public boolean hasFallbackAlgorithm() {
+      return true; // this kernel supplies executeFallbackAlgorithm below
+   }
+
+   @Override
+   public void executeFallbackAlgorithm(Range _range, int _passes) {
+      System.out.println("Running in Java (alternate non-parallel algorithm)"); // demo stand-in: ignores _range and _passes and only prints a marker
+   }
+}
diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelWithoutAlternateFallbackAlgorithm.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelWithoutAlternateFallbackAlgorithm.java
new file mode 100644
index 0000000000000000000000000000000000000000..1096a092e38c2c696c153d969eb31b54c4d8c844
--- /dev/null
+++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelWithoutAlternateFallbackAlgorithm.java
@@ -0,0 +1,14 @@
+package com.amd.aparapi.sample.configuration;
+
+import com.amd.aparapi.*;
+
+/**
+ * Kernel which will always fail to run on an OpenCLDevice and does not provide an alternative fallback algorithm.
+ */
+public class KernelWithoutAlternateFallbackAlgorithm extends Kernel {
+   @Override
+   public void run() {
+      // deliberate: println is unsupported in OpenCL, so generating OpenCL for this kernel will fail
+      System.out.println("Running in Java (regular algorithm)");
+   }
+}
diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/LegacyConfigurationDemo.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/LegacyConfigurationDemo.java
new file mode 100644
index 0000000000000000000000000000000000000000..db4149a139f3b50d49de50b94c48ceafe98ec4e5
--- /dev/null
+++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/LegacyConfigurationDemo.java
@@ -0,0 +1,26 @@
+package com.amd.aparapi.sample.configuration;
+
+import com.amd.aparapi.*;
+import com.amd.aparapi.internal.kernel.*;
+
+/**
+ * Tests device selection when circumventing the {@link com.amd.aparapi.internal.kernel.KernelManager} by using the legacy mechanism
+ * (setExecutionMode, etc.).
+ */
+public class LegacyConfigurationDemo {
+
+   @SuppressWarnings("deprecation")
+   public static void main(String[] ignored) {
+      System.setProperty("com.amd.aparapi.executionMode", "GPU,CPU,SEQ");
+      System.setProperty("com.amd.aparapi.dumpProfilesOnExit", "true");
+
+      KernelWithAlternateFallbackAlgorithm kernel = new KernelWithAlternateFallbackAlgorithm();
+      kernel.setExecutionMode(Kernel.EXECUTION_MODE.GPU);
+      int globalRange = 1;
+      kernel.execute(globalRange);
+
+      StringBuilder report = new StringBuilder("\n");
+      KernelManager.instance().reportDeviceUsage(report, true);
+      System.out.println(report);
+   }
+}
diff --git a/samples/convolution/src/com/amd/aparapi/sample/convolution/ConvolutionOpenCL.java b/samples/convolution/src/com/amd/aparapi/sample/convolution/ConvolutionOpenCL.java
index 57f96b060a8c0993309d714e7e804fffa704bcc8..4b916b252e1ba399bea1c57f5860c2f4d6d9ea68 100644
--- a/samples/convolution/src/com/amd/aparapi/sample/convolution/ConvolutionOpenCL.java
+++ b/samples/convolution/src/com/amd/aparapi/sample/convolution/ConvolutionOpenCL.java
@@ -38,13 +38,13 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit
 
 package com.amd.aparapi.sample.convolution;
 
-import java.io.File;
+import com.amd.aparapi.*;
+import com.amd.aparapi.device.*;
+import com.amd.aparapi.internal.kernel.*;
+import com.amd.aparapi.opencl.*;
+import com.amd.aparapi.opencl.OpenCL.*;
 
-import com.amd.aparapi.Range;
-import com.amd.aparapi.device.Device;
-import com.amd.aparapi.device.OpenCLDevice;
-import com.amd.aparapi.opencl.OpenCL;
-import com.amd.aparapi.opencl.OpenCL.Resource;
+import java.io.*;
 
 public class ConvolutionOpenCL{
 
@@ -61,7 +61,7 @@ public class ConvolutionOpenCL{
    public static void main(final String[] _args) {
       final File file = new File(_args.length == 1 ? _args[0] : "testcard.jpg");
 
-      final OpenCLDevice openclDevice = (OpenCLDevice) Device.best();
+      final OpenCLDevice openclDevice = (OpenCLDevice) KernelManager.instance().bestDevice();
 
       final Convolution convolution = openclDevice.bind(Convolution.class);
       final float convMatrix3x3[] = new float[] {
diff --git a/samples/extension/src/com/amd/aparapi/sample/extension/FFTExample.java b/samples/extension/src/com/amd/aparapi/sample/extension/FFTExample.java
index 9284fa503455f429b2f10d9cfdaa519e7f183650..7c575c7a2c8200b95e8755e49fc8d15992ac1ea4 100644
--- a/samples/extension/src/com/amd/aparapi/sample/extension/FFTExample.java
+++ b/samples/extension/src/com/amd/aparapi/sample/extension/FFTExample.java
@@ -1,12 +1,12 @@
 package com.amd.aparapi.sample.extension;
 
-import java.util.Arrays;
+import com.amd.aparapi.*;
+import com.amd.aparapi.device.*;
+import com.amd.aparapi.internal.kernel.*;
+import com.amd.aparapi.opencl.*;
+import com.amd.aparapi.opencl.OpenCL.*;
 
-import com.amd.aparapi.Range;
-import com.amd.aparapi.device.Device;
-import com.amd.aparapi.device.OpenCLDevice;
-import com.amd.aparapi.opencl.OpenCL;
-import com.amd.aparapi.opencl.OpenCL.Resource;
+import java.util.*;
 
 public class FFTExample{
 
@@ -98,7 +98,7 @@ public class FFTExample{
       final float imaginary[] = new float[LEN];
       final float referenceReal[] = Arrays.copyOf(real, real.length);
       final float referenceImaginary[] = Arrays.copyOf(imaginary, imaginary.length);
-      final OpenCLDevice device = (OpenCLDevice) Device.best();
+      final OpenCLDevice device = (OpenCLDevice) KernelManager.instance().getDefaultPreferences().getPreferredDevice(null);
       final FFT fft = device.bind(FFT.class);
       for (int i = 0; i < LEN; i++) {
          initial[i] = real[i] = referenceReal[i] = (float) (Math.random() * 256);
diff --git a/samples/extension/src/com/amd/aparapi/sample/extension/Histogram.java b/samples/extension/src/com/amd/aparapi/sample/extension/Histogram.java
index 54b06c3b0a98eae4b8685b3762959f44d9c9e232..e260d5e825f5a287f987bcc5ac063ed68a8a0041 100644
--- a/samples/extension/src/com/amd/aparapi/sample/extension/Histogram.java
+++ b/samples/extension/src/com/amd/aparapi/sample/extension/Histogram.java
@@ -41,7 +41,6 @@ public class Histogram{
       System.out.println("binResult size=" + binResult.length);
       final int[] histo = new int[BIN_SIZE];
       final int[] refHisto = new int[BIN_SIZE];
-      final Device device = Device.firstGPU();
       final Kernel k = new Kernel(){
 
          @Override public void run() {
@@ -52,6 +51,7 @@ public class Histogram{
          }
 
       };
+      final Device device = k.getTargetDevice();
       final Range range2 = device.createRange(BIN_SIZE);
       k.execute(range2);
 
diff --git a/samples/extension/src/com/amd/aparapi/sample/extension/HistogramIdeal.java b/samples/extension/src/com/amd/aparapi/sample/extension/HistogramIdeal.java
index 1ff76e9f5593423d2ab04c4dc73617937efc47f5..a0f74813706604358021cdc53d02663332d63a67 100644
--- a/samples/extension/src/com/amd/aparapi/sample/extension/HistogramIdeal.java
+++ b/samples/extension/src/com/amd/aparapi/sample/extension/HistogramIdeal.java
@@ -3,6 +3,7 @@ package com.amd.aparapi.sample.extension;
 import com.amd.aparapi.Range;
 import com.amd.aparapi.device.Device;
 import com.amd.aparapi.device.OpenCLDevice;
+import com.amd.aparapi.internal.kernel.*;
 import com.amd.aparapi.opencl.OpenCL;
 
 public class HistogramIdeal{
@@ -40,7 +41,7 @@ public class HistogramIdeal{
       System.out.println("binResult size=" + binResult.length);
       final int[] histo = new int[BIN_SIZE];
       final int[] refHisto = new int[BIN_SIZE];
-      final Device device = Device.best();
+      final Device device = KernelManager.instance().bestDevice();
 
       if (device != null) {
          System.out.println(((OpenCLDevice) device).getOpenCLPlatform().getName());
diff --git a/samples/extension/src/com/amd/aparapi/sample/extension/MandelExample.java b/samples/extension/src/com/amd/aparapi/sample/extension/MandelExample.java
index ba2d20ac2d6b496bb7b766ac0edecd7a1d781c3d..85ac9cda4614810b3936c568ec47d39213e06ba6 100644
--- a/samples/extension/src/com/amd/aparapi/sample/extension/MandelExample.java
+++ b/samples/extension/src/com/amd/aparapi/sample/extension/MandelExample.java
@@ -38,37 +38,17 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit
 
 package com.amd.aparapi.sample.extension;
 
-import java.awt.BorderLayout;
-import java.awt.Dimension;
-import java.awt.FlowLayout;
-import java.awt.Graphics;
-import java.awt.Point;
-import java.awt.event.ItemEvent;
-import java.awt.event.ItemListener;
-import java.awt.event.MouseAdapter;
-import java.awt.event.MouseEvent;
-import java.awt.event.WindowAdapter;
-import java.awt.event.WindowEvent;
-import java.awt.image.BufferedImage;
-import java.awt.image.DataBufferInt;
-import java.util.concurrent.BrokenBarrierException;
-import java.util.concurrent.CyclicBarrier;
-
-import javax.swing.JComboBox;
-import javax.swing.JComponent;
-import javax.swing.JFrame;
-import javax.swing.JLabel;
-import javax.swing.JPanel;
-import javax.swing.JTextField;
-
-import com.amd.aparapi.Range;
-import com.amd.aparapi.device.Device;
-import com.amd.aparapi.device.OpenCLDevice;
-import com.amd.aparapi.internal.opencl.OpenCLPlatform;
-import com.amd.aparapi.internal.util.OpenCLUtil;
-import com.amd.aparapi.opencl.OpenCL;
-import com.amd.aparapi.opencl.OpenCL.Resource;
-import com.amd.aparapi.opencl.OpenCLAdapter;
+import com.amd.aparapi.*;
+import com.amd.aparapi.device.*;
+import com.amd.aparapi.internal.kernel.*;
+import com.amd.aparapi.opencl.*;
+import com.amd.aparapi.opencl.OpenCL.*;
+
+import javax.swing.*;
+import java.awt.*;
+import java.awt.event.*;
+import java.awt.image.*;
+import java.util.concurrent.*;
 
 /**
  * An example Aparapi application which displays a view of the Mandelbrot set and lets the user zoom in to a particular point. 
@@ -418,7 +398,7 @@ public class MandelExample{
       float offsetx = .0f;
 
       float offsety = .0f;
-      Device device = Device.best();
+      Device device = KernelManager.instance().bestDevice();
       if (device instanceof OpenCLDevice) {
          final OpenCLDevice openclDevice = (OpenCLDevice) device;
 
diff --git a/samples/extension/src/com/amd/aparapi/sample/extension/MandelSimple.java b/samples/extension/src/com/amd/aparapi/sample/extension/MandelSimple.java
index 1b9c993b7ed53ef6ad1db3f224aa5132b2540902..89faa7f2f6ef027a2d5163f9e7f139cb0080a43e 100644
--- a/samples/extension/src/com/amd/aparapi/sample/extension/MandelSimple.java
+++ b/samples/extension/src/com/amd/aparapi/sample/extension/MandelSimple.java
@@ -38,29 +38,16 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit
 
 package com.amd.aparapi.sample.extension;
 
-import java.awt.BorderLayout;
-import java.awt.Dimension;
-import java.awt.FlowLayout;
-import java.awt.Graphics;
-import java.awt.Point;
-import java.awt.event.MouseAdapter;
-import java.awt.event.MouseEvent;
-import java.awt.event.WindowAdapter;
-import java.awt.event.WindowEvent;
-import java.awt.image.BufferedImage;
-import java.awt.image.DataBufferInt;
-
-import javax.swing.JComponent;
-import javax.swing.JFrame;
-import javax.swing.JLabel;
-import javax.swing.JPanel;
-import javax.swing.JTextField;
-
-import com.amd.aparapi.Range;
-import com.amd.aparapi.device.Device;
-import com.amd.aparapi.device.OpenCLDevice;
-import com.amd.aparapi.opencl.OpenCL;
-import com.amd.aparapi.opencl.OpenCL.Resource;
+import com.amd.aparapi.*;
+import com.amd.aparapi.device.*;
+import com.amd.aparapi.internal.kernel.*;
+import com.amd.aparapi.opencl.*;
+import com.amd.aparapi.opencl.OpenCL.*;
+
+import javax.swing.*;
+import java.awt.*;
+import java.awt.event.*;
+import java.awt.image.*;
 
 /**
  * An example Aparapi application which displays a view of the Mandelbrot set and lets the user zoom in to a particular point. 
@@ -155,7 +142,7 @@ public class MandelSimple{
       float offsetx = .0f;
 
       float offsety = .0f;
-      final Device device = Device.best();
+      final Device device = KernelManager.instance().bestDevice();
       if (device instanceof OpenCLDevice) {
          final OpenCLDevice openclDevice = (OpenCLDevice) device;
 
diff --git a/samples/extension/src/com/amd/aparapi/sample/extension/Pow4Example.java b/samples/extension/src/com/amd/aparapi/sample/extension/Pow4Example.java
index 7bc046767b738a2fcbd496cc8155454863e31355..0ea3043e19eaf2fc0203beaffdb6709e7e1a2230 100644
--- a/samples/extension/src/com/amd/aparapi/sample/extension/Pow4Example.java
+++ b/samples/extension/src/com/amd/aparapi/sample/extension/Pow4Example.java
@@ -3,6 +3,7 @@ package com.amd.aparapi.sample.extension;
 import com.amd.aparapi.Range;
 import com.amd.aparapi.device.Device;
 import com.amd.aparapi.device.OpenCLDevice;
+import com.amd.aparapi.internal.kernel.*;
 import com.amd.aparapi.opencl.OpenCL;
 import com.amd.aparapi.opencl.OpenCL.Resource;
 
@@ -26,7 +27,7 @@ public class Pow4Example{
       final float[] squares = new float[size];
       final Range range = Range.create(size);
 
-      final Device device = Device.best();
+      final Device device = KernelManager.instance().bestDevice();
 
       if (device instanceof OpenCLDevice) {
          final OpenCLDevice openclDevice = (OpenCLDevice) device;
diff --git a/samples/extension/src/com/amd/aparapi/sample/extension/SquareExample.java b/samples/extension/src/com/amd/aparapi/sample/extension/SquareExample.java
index e2b2aa4814ec8baf558f6a24fa92e8c54cc3c9cf..58f01c0b8789a51ae886b73e608dc2f3bb98b25d 100644
--- a/samples/extension/src/com/amd/aparapi/sample/extension/SquareExample.java
+++ b/samples/extension/src/com/amd/aparapi/sample/extension/SquareExample.java
@@ -4,6 +4,7 @@ import com.amd.aparapi.ProfileInfo;
 import com.amd.aparapi.Range;
 import com.amd.aparapi.device.Device;
 import com.amd.aparapi.device.OpenCLDevice;
+import com.amd.aparapi.internal.kernel.*;
 import com.amd.aparapi.opencl.OpenCL;
 import com.amd.aparapi.opencl.OpenCL.Resource;
 import com.amd.aparapi.opencl.OpenCL.Source;
@@ -54,7 +55,7 @@ public class SquareExample{
       final float[] quads = new float[size];
       final Range range = Range.create(size);
 
-      final Device device = Device.best();
+      final Device device = KernelManager.instance().bestDevice();
 
       if (device instanceof OpenCLDevice) {
          final OpenCLDevice openclDevice = (OpenCLDevice) device;
diff --git a/samples/extension/src/com/amd/aparapi/sample/extension/SwapExample.java b/samples/extension/src/com/amd/aparapi/sample/extension/SwapExample.java
index 381b52faa4b694b5ae86a1618b8b2382c3a602cd..d5fe0bf9244580e431ae6d72ea6ae372a0998beb 100644
--- a/samples/extension/src/com/amd/aparapi/sample/extension/SwapExample.java
+++ b/samples/extension/src/com/amd/aparapi/sample/extension/SwapExample.java
@@ -3,6 +3,7 @@ package com.amd.aparapi.sample.extension;
 import com.amd.aparapi.Range;
 import com.amd.aparapi.device.Device;
 import com.amd.aparapi.device.OpenCLDevice;
+import com.amd.aparapi.internal.kernel.*;
 import com.amd.aparapi.opencl.OpenCL;
 
 public class SwapExample{
@@ -29,7 +30,7 @@ public class SwapExample{
       final float[] rhs = new float[size];
       final Range range = Range.create(size);
 
-      final Device device = Device.best();
+      final Device device = KernelManager.instance().bestDevice();
 
       if (device instanceof OpenCLDevice) {
          final OpenCLDevice openclDevice = (OpenCLDevice) device;
diff --git a/samples/info/src/com/amd/aparapi/sample/info/Main.java b/samples/info/src/com/amd/aparapi/sample/info/Main.java
index fcff248937d1be7a55fed94e9bf5a047ca6ece9e..8397715d404927671ebb496cbaeb8cd925ab6022 100644
--- a/samples/info/src/com/amd/aparapi/sample/info/Main.java
+++ b/samples/info/src/com/amd/aparapi/sample/info/Main.java
@@ -38,11 +38,11 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit
 
 package com.amd.aparapi.sample.info;
 
-import java.util.List;
+import com.amd.aparapi.device.*;
+import com.amd.aparapi.internal.kernel.*;
+import com.amd.aparapi.internal.opencl.*;
 
-import com.amd.aparapi.device.Device;
-import com.amd.aparapi.device.OpenCLDevice;
-import com.amd.aparapi.internal.opencl.OpenCLPlatform;
+import java.util.*;
 
 public class Main{
    public static void main(String[] _args) {
@@ -73,90 +73,13 @@ public class Main{
          platformc++;
       }
 
-      Device bestDevice = OpenCLDevice.best();
-      if (bestDevice == null) {
-         System.out.println("OpenCLDevice.best() returned null!");
-      } else {
-         System.out.println("OpenCLDevice.best() returned { ");
-         System.out.println("   Type                  : " + bestDevice.getType());
-         System.out.println("   GlobalMemSize         : " + ((OpenCLDevice) bestDevice).getGlobalMemSize());
-         System.out.println("   LocalMemSize          : " + ((OpenCLDevice) bestDevice).getLocalMemSize());
-         System.out.println("   MaxComputeUnits       : " + ((OpenCLDevice) bestDevice).getMaxComputeUnits());
-         System.out.println("   MaxWorkGroupSizes     : " + ((OpenCLDevice) bestDevice).getMaxWorkGroupSize());
-         System.out.println("   MaxWorkItemDimensions : " + ((OpenCLDevice) bestDevice).getMaxWorkItemDimensions());
-         System.out.println("}");
-      }
-
-      Device firstCPU = OpenCLDevice.firstCPU();
-      if (firstCPU == null) {
-         System.out.println("OpenCLDevice.firstCPU() returned null!");
-      } else {
-         System.out.println("OpenCLDevice.firstCPU() returned { ");
-         System.out.println("   Type                  : " + firstCPU.getType());
-         System.out.println("   GlobalMemSize         : " + ((OpenCLDevice) firstCPU).getGlobalMemSize());
-         System.out.println("   LocalMemSize          : " + ((OpenCLDevice) firstCPU).getLocalMemSize());
-         System.out.println("   MaxComputeUnits       : " + ((OpenCLDevice) firstCPU).getMaxComputeUnits());
-         System.out.println("   MaxWorkGroupSizes     : " + ((OpenCLDevice) firstCPU).getMaxWorkGroupSize());
-         System.out.println("   MaxWorkItemDimensions : " + ((OpenCLDevice) firstCPU).getMaxWorkItemDimensions());
-         System.out.println("}");
-      }
-
-      Device firstGPU = OpenCLDevice.firstGPU();
-      if (firstGPU == null) {
-         System.out.println("OpenCLDevice.firstGPU() returned null!");
-      } else {
-         System.out.println("OpenCLDevice.firstGPU() returned { ");
-         System.out.println("   Type                  : " + firstGPU.getType());
-         System.out.println("   GlobalMemSize         : " + ((OpenCLDevice) firstGPU).getGlobalMemSize());
-         System.out.println("   LocalMemSize          : " + ((OpenCLDevice) firstGPU).getLocalMemSize());
-         System.out.println("   MaxComputeUnits       : " + ((OpenCLDevice) firstGPU).getMaxComputeUnits());
-         System.out.println("   MaxWorkGroupSizes     : " + ((OpenCLDevice) firstGPU).getMaxWorkGroupSize());
-         System.out.println("   MaxWorkItemDimensions : " + ((OpenCLDevice) firstGPU).getMaxWorkItemDimensions());
-         System.out.println("}");
-      }
+      KernelPreferences preferences = KernelManager.instance().getDefaultPreferences();
+      System.out.println("\nDevices in preferred order:\n");
 
-      Device bestGPU = OpenCLDevice.bestGPU();
-      if (bestGPU == null) {
-         System.out.println("OpenCLDevice.bestGPU() returned null!");
-      } else {
-         System.out.println("OpenCLDevice.bestGPU() returned { ");
-         System.out.println("   Type                  : " + bestGPU.getType());
-         System.out.println("   GlobalMemSize         : " + ((OpenCLDevice) bestGPU).getGlobalMemSize());
-         System.out.println("   LocalMemSize          : " + ((OpenCLDevice) bestGPU).getLocalMemSize());
-         System.out.println("   MaxComputeUnits       : " + ((OpenCLDevice) bestGPU).getMaxComputeUnits());
-         System.out.println("   MaxWorkGroupSizes     : " + ((OpenCLDevice) bestGPU).getMaxWorkGroupSize());
-         System.out.println("   MaxWorkItemDimensions : " + ((OpenCLDevice) bestGPU).getMaxWorkItemDimensions());
-         System.out.println("}");
-      }
-
-      Device firstACC = OpenCLDevice.firstACC();
-      if (firstACC == null) {
-         System.out.println("OpenCLDevice.firstACC() returned null!");
-      } else {
-         System.out.println("OpenCLDevice.firstACC() returned { ");
-         System.out.println("   Type                  : " + firstACC.getType());
-         System.out.println("   GlobalMemSize         : " + ((OpenCLDevice) firstACC).getGlobalMemSize());
-         System.out.println("   LocalMemSize          : " + ((OpenCLDevice) firstACC).getLocalMemSize());
-         System.out.println("   MaxComputeUnits       : " + ((OpenCLDevice) firstACC).getMaxComputeUnits());
-         System.out.println("   MaxWorkGroupSizes     : " + ((OpenCLDevice) firstACC).getMaxWorkGroupSize());
-         System.out.println("   MaxWorkItemDimensions : " + ((OpenCLDevice) firstACC).getMaxWorkItemDimensions());
-         System.out.println("}");
+      for (Device device : preferences.getPreferredDevices(null)) {
+         System.out.println(device);
+         System.out.println();
       }
-
-      Device bestACC = OpenCLDevice.bestACC();
-      if (bestACC == null) {
-         System.out.println("OpenCLDevice.bestACC() returned null!");
-      } else {
-         System.out.println("OpenCLDevice.bestACC() returned { ");
-         System.out.println("   Type                  : " + bestACC.getType());
-         System.out.println("   GlobalMemSize         : " + ((OpenCLDevice) bestACC).getGlobalMemSize());
-         System.out.println("   LocalMemSize          : " + ((OpenCLDevice) bestACC).getLocalMemSize());
-         System.out.println("   MaxComputeUnits       : " + ((OpenCLDevice) bestACC).getMaxComputeUnits());
-         System.out.println("   MaxWorkGroupSizes     : " + ((OpenCLDevice) bestACC).getMaxWorkGroupSize());
-         System.out.println("   MaxWorkItemDimensions : " + ((OpenCLDevice) bestACC).getMaxWorkItemDimensions());
-         System.out.println("}");
-      }
-
    }
 
 }
diff --git a/samples/life/src/com/amd/aparapi/sample/life/Main.java b/samples/life/src/com/amd/aparapi/sample/life/Main.java
index 963cceb9ed0750585f0891c483d1bead7c3b4dd8..e51ca5fafa4431d417c07206fff95782c4d168c0 100644
--- a/samples/life/src/com/amd/aparapi/sample/life/Main.java
+++ b/samples/life/src/com/amd/aparapi/sample/life/Main.java
@@ -38,26 +38,14 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit
 
 package com.amd.aparapi.sample.life;
 
-import java.awt.BorderLayout;
-import java.awt.Dimension;
-import java.awt.FlowLayout;
-import java.awt.Graphics;
-import java.awt.event.ActionEvent;
-import java.awt.event.ActionListener;
-import java.awt.image.BufferedImage;
-import java.awt.image.DataBufferInt;
-import java.util.List;
-
-import javax.swing.JButton;
-import javax.swing.JComponent;
-import javax.swing.JFrame;
-import javax.swing.JLabel;
-import javax.swing.JPanel;
-import javax.swing.WindowConstants;
-
 import com.amd.aparapi.Kernel;
-import com.amd.aparapi.ProfileInfo;
-import com.amd.aparapi.Range;
+import com.amd.aparapi.*;
+
+import javax.swing.*;
+import java.awt.*;
+import java.awt.event.*;
+import java.awt.image.*;
+import java.util.List;
 
 /**
  * An example Aparapi application which demonstrates Conways 'Game Of Life'.
@@ -239,7 +227,7 @@ public class Main{
          }
       });
       controlPanel.add(startButton);
-      controlPanel.add(new JLabel(lifeKernel.getExecutionMode().toString()));
+      controlPanel.add(new JLabel(lifeKernel.getTargetDevice().getShortDescription()));
 
       controlPanel.add(new JLabel("  Generations/Second="));
       final JLabel generationsPerSecond = new JLabel("0.00");
diff --git a/samples/mandel/src/com/amd/aparapi/sample/mandel/Main.java b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main.java
index 8ad4a7a766537877946a11e6ce71e2038431c2be..13de958505466f8a17ce3af2cbe84f3481d130f8 100644
--- a/samples/mandel/src/com/amd/aparapi/sample/mandel/Main.java
+++ b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main.java
@@ -38,24 +38,14 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit
 
 package com.amd.aparapi.sample.mandel;
 
-import java.awt.Color;
-import java.awt.Dimension;
-import java.awt.Graphics;
-import java.awt.Point;
-import java.awt.event.MouseAdapter;
-import java.awt.event.MouseEvent;
-import java.awt.event.WindowAdapter;
-import java.awt.event.WindowEvent;
-import java.awt.image.BufferedImage;
-import java.awt.image.DataBufferInt;
-import java.util.List;
-
-import javax.swing.JComponent;
-import javax.swing.JFrame;
-
 import com.amd.aparapi.Kernel;
-import com.amd.aparapi.ProfileInfo;
-import com.amd.aparapi.Range;
+import com.amd.aparapi.*;
+
+import javax.swing.*;
+import java.awt.*;
+import java.awt.event.*;
+import java.awt.image.*;
+import java.util.List;
 
 /**
  * An example Aparapi application which displays a view of the Mandelbrot set and lets the user zoom in to a particular point. 
@@ -107,7 +97,6 @@ public class Main{
        * @param _width Mandelbrot image width
        * @param _height Mandelbrot image height
        * @param _rgb Mandelbrot image RGB buffer
-       * @param _pallette Mandelbrot image palette
        */
       public MandelKernel(int _width, int _height, int[] _rgb) {
          //Initialize palette values
@@ -229,8 +218,7 @@ public class Main{
       System.arraycopy(rgb, 0, imageRgb, 0, rgb.length);
       viewer.repaint();
 
-      // Report target execution mode: GPU or JTP (Java Thread Pool).
-      System.out.println("Execution mode=" + kernel.getExecutionMode());
+      System.out.println("device=" + kernel.getTargetDevice());
 
       // Window listener to dispose Kernel resources on user exit.
       frame.addWindowListener(new WindowAdapter(){
diff --git a/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java
index 65c62320965962f53fcb5aa98bae0254a6aca5ca..8a1b7faa68eceb14aeae40c133bf2d6f57303bd0 100644
--- a/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java
+++ b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java
@@ -38,24 +38,14 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit
 
 package com.amd.aparapi.sample.mandel;
 
-import java.awt.Color;
-import java.awt.Dimension;
-import java.awt.Graphics;
-import java.awt.Point;
-import java.awt.event.MouseAdapter;
-import java.awt.event.MouseEvent;
-import java.awt.event.WindowAdapter;
-import java.awt.event.WindowEvent;
-import java.awt.image.BufferedImage;
-import java.awt.image.DataBufferInt;
-import java.util.List;
-
-import javax.swing.JComponent;
-import javax.swing.JFrame;
-
 import com.amd.aparapi.Kernel;
-import com.amd.aparapi.ProfileInfo;
-import com.amd.aparapi.Range;
+import com.amd.aparapi.*;
+
+import javax.swing.*;
+import java.awt.*;
+import java.awt.event.*;
+import java.awt.image.*;
+import java.util.List;
 
 /**
  * An example Aparapi application which displays a view of the Mandelbrot set and lets the user zoom in to a particular point. 
@@ -97,11 +87,8 @@ public class Main2D{
 
       /**
        * Initialize the Kernel.
-       *  
-       * @param _width Mandelbrot image width
-       * @param _height Mandelbrot image height
+       *
        * @param _rgb Mandelbrot image RGB buffer
-       * @param _pallette Mandelbrot image palette
        */
       public MandelKernel(int[] _rgb) {
          rgb = _rgb;
@@ -209,8 +196,7 @@ public class Main2D{
       System.arraycopy(rgb, 0, imageRgb, 0, rgb.length);
       viewer.repaint();
 
-      // Report target execution mode: GPU or JTP (Java Thread Pool).
-      System.out.println("Execution mode=" + kernel.getExecutionMode());
+      System.out.println("device=" + kernel.getTargetDevice());
 
       // Window listener to dispose Kernel resources on user exit.
       frame.addWindowListener(new WindowAdapter(){
diff --git a/samples/mdarray/build.xml b/samples/mdarray/build.xml
index 7c5bf8ec6b3363b236090b2fd542ad50cd23766f..787fd0950e69d37599e2ba0b855c16714fa19710 100644
--- a/samples/mdarray/build.xml
+++ b/samples/mdarray/build.xml
@@ -19,7 +19,7 @@
 
 	<target name="build" depends="clean">
 		<mkdir dir="classes" />
-		<javac srcdir="src" destdir="classes" debug="on" includeantruntime="false" fork="true" memorymaximumsize="3G">
+		<javac srcdir="src" destdir="classes" debug="on" includeantruntime="false" fork="true" memorymaximumsize="1024m">
 			<classpath>
 				<pathelement path="../../com.amd.aparapi/dist/aparapi.jar" />
 			</classpath>
diff --git a/samples/median/src/com/amd/aparapi/sample/median/MedianDemo.java b/samples/median/src/com/amd/aparapi/sample/median/MedianDemo.java
index 99fa6259ff541663b292bc0e0c29aaf6709d61c3..f4e3e28c5d7e748613d067ba6e76dbe018429b6e 100644
--- a/samples/median/src/com/amd/aparapi/sample/median/MedianDemo.java
+++ b/samples/median/src/com/amd/aparapi/sample/median/MedianDemo.java
@@ -1,33 +1,52 @@
 package com.amd.aparapi.sample.median;
 
-import com.amd.aparapi.Kernel;
+import com.amd.aparapi.device.*;
+import com.amd.aparapi.internal.kernel.*;
 
 import javax.imageio.*;
 import javax.swing.*;
 import java.awt.*;
 import java.awt.image.*;
 import java.io.*;
+import java.util.*;
 
 /**
  * Demonstrate use of __private namespaces and @NoCL annotations.
  */
 public class MedianDemo {
-   public final static BufferedImage testImage;
+   public static BufferedImage testImage;
 
    static {
       try {
-         File imageFile = new File("./../../../samples/convolution/testcard.jpg").getCanonicalFile();
-         testImage = ImageIO.read(imageFile);
+         File imageFile = new File("./samples/convolution/testcard.jpg").getCanonicalFile();
+         if (imageFile.exists()) {
+            testImage = ImageIO.read(imageFile);
+         }
       } catch (IOException e) {
          throw new RuntimeException(e);
       }
    }
 
-   private static final boolean TEST_JTP = false;
+   private static final boolean TEST_JTP = true;
 
    public static void main(String[] ignored) {
       final int size = 5;
       System.setProperty("com.amd.aparapi.enableShowGeneratedOpenCL", "true");
+      boolean verbose = true;
+      if (verbose)
+      {
+          System.setProperty("com.amd.aparapi.enableVerboseJNI", "true");
+          System.setProperty("com.amd.aparapi.dumpFlags", "true");
+          System.setProperty("com.amd.aparapi.enableShowGeneratedOpenCL", "true");
+          System.setProperty("com.amd.aparapi.enableVerboseJNIOpenCLResourceTracking", "true");
+          System.setProperty("com.amd.aparapi.enableExecutionModeReporting", "true");
+      }
+
+      if (TEST_JTP) {
+         LinkedHashSet<Device> devices = new LinkedHashSet<>(Collections.singleton(JavaDevice.THREAD_POOL));
+         KernelManager.instance().setDefaultPreferredDevices(devices);
+      }
+
       int[] argbs = testImage.getRGB(0, 0, testImage.getWidth(), testImage.getHeight(), null, 0, testImage.getWidth());
       MedianKernel7x7 kernel = new MedianKernel7x7();
       kernel._imageTypeOrdinal = MedianKernel7x7.RGB;
@@ -35,9 +54,7 @@ public class MedianDemo {
       kernel._sourceHeight = testImage.getHeight();
       kernel._sourcePixels = argbs;
       kernel._destPixels = new int[argbs.length];
-      if (TEST_JTP) {
-         kernel.setExecutionMode(Kernel.EXECUTION_MODE.JTP);
-      }
+
       kernel.processImages(new MedianSettings(size));
       BufferedImage out = new BufferedImage(testImage.getWidth(), testImage.getHeight(), BufferedImage.TYPE_INT_RGB);
       out.setRGB(0, 0, testImage.getWidth(), testImage.getHeight(), kernel._destPixels, 0, testImage.getWidth());
diff --git a/samples/median/src/com/amd/aparapi/sample/median/MedianKernel7x7.java b/samples/median/src/com/amd/aparapi/sample/median/MedianKernel7x7.java
index 6cbece4157e3518a897c4e1bfead8fdf2ba7dbbd..c393720be7b4b200645d039ec0b28425f8d86e5b 100644
--- a/samples/median/src/com/amd/aparapi/sample/median/MedianKernel7x7.java
+++ b/samples/median/src/com/amd/aparapi/sample/median/MedianKernel7x7.java
@@ -28,7 +28,8 @@ public class MedianKernel7x7 extends Kernel {
    protected int[] _destPixels;
 
    // NB could also use suffix naming instead of annotation ... field would be named _window_$private$49
-   @PrivateMemorySpace(MAX_WINDOW_SIZE) private short[] _window = new short[MAX_WINDOW_SIZE];
+   @PrivateMemorySpace(MAX_WINDOW_SIZE)
+   private short[] _window = new short[MAX_WINDOW_SIZE];
    @NoCL private static ThreadLocal<short[]> _threadLocalWindow = new ThreadLocal<short[]>() {
       @Override
       protected short[] initialValue() {
diff --git a/samples/progress/src/com/amd/aparapi/sample/progress/MultiPassKernelSwingWorkerDemo.java b/samples/progress/src/com/amd/aparapi/sample/progress/MultiPassKernelSwingWorkerDemo.java
index 7cc2584b1cb10d054f16632dd12ff27f2102c53b..7bfc91e4eea39ce1148611cfa428cdc6879a90bf 100644
--- a/samples/progress/src/com/amd/aparapi/sample/progress/MultiPassKernelSwingWorkerDemo.java
+++ b/samples/progress/src/com/amd/aparapi/sample/progress/MultiPassKernelSwingWorkerDemo.java
@@ -1,7 +1,7 @@
 package com.amd.aparapi.sample.progress;
 
-import com.amd.aparapi.Kernel;
-import com.amd.aparapi.internal.kernel.KernelRunner;
+import com.amd.aparapi.*;
+import com.amd.aparapi.internal.kernel.*;
 import com.amd.aparapi.util.swing.MultiPassKernelSwingWorker;
 
 import javax.swing.*;
@@ -23,13 +23,13 @@ public class MultiPassKernelSwingWorkerDemo {
    private static LongRunningKernel kernel;
    private static MultiPassKernelSwingWorker worker;
 
-   private static final boolean TEST_JTP = true;
+   private static final boolean TEST_JTP = false;
 
    public static void main(String[] ignored) throws Exception {
-      kernel = new LongRunningKernel();
       if (TEST_JTP) {
-         kernel.setExecutionMode(Kernel.EXECUTION_MODE.JTP);
+         KernelManager.setKernelManager(KernelManagers.JTP_ONLY);
       }
+      kernel = new LongRunningKernel();
 
       UIManager.setLookAndFeel(NimbusLookAndFeel.class.getName());
       JPanel rootPanel = new JPanel();
diff --git a/samples/progress/src/com/amd/aparapi/sample/progress/ProgressAndCancelDemo.java b/samples/progress/src/com/amd/aparapi/sample/progress/ProgressAndCancelDemo.java
index b114dcac4f19b5d93e6ec82b1d84da19193fa719..721f2c611ee06bf1fd3a144aedc16262785d84b5 100644
--- a/samples/progress/src/com/amd/aparapi/sample/progress/ProgressAndCancelDemo.java
+++ b/samples/progress/src/com/amd/aparapi/sample/progress/ProgressAndCancelDemo.java
@@ -1,13 +1,11 @@
 package com.amd.aparapi.sample.progress;
 
-import com.amd.aparapi.Kernel;
-import com.amd.aparapi.internal.kernel.KernelRunner;
+import com.amd.aparapi.internal.kernel.*;
 
 import javax.swing.*;
-import javax.swing.plaf.nimbus.NimbusLookAndFeel;
+import javax.swing.plaf.nimbus.*;
 import java.awt.*;
-import java.awt.event.ActionEvent;
-import java.awt.event.ActionListener;
+import java.awt.event.*;
 
 /**
  * Demonstrates progress tracking and cancellation for multi-pass kernels.
@@ -36,7 +34,7 @@ public class ProgressAndCancelDemo {
 
       kernel = new LongRunningKernel();
       if (TEST_JTP) {
-         kernel.setExecutionMode(Kernel.EXECUTION_MODE.JTP);
+         KernelManager.setKernelManager(KernelManagers.JTP_ONLY);
       }
       Thread asynchReader = new Thread() {
          @Override
diff --git a/samples/squares/src/com/amd/aparapi/sample/squares/Main.java b/samples/squares/src/com/amd/aparapi/sample/squares/Main.java
index 32a1b70b8bfd16cd76eff8d5666442738a18dc72..247cda6f8ea46601b339efe22b8144dc94b88b8b 100644
--- a/samples/squares/src/com/amd/aparapi/sample/squares/Main.java
+++ b/samples/squares/src/com/amd/aparapi/sample/squares/Main.java
@@ -82,7 +82,7 @@ public class Main{
       kernel.execute(Range.create(512));
 
       // Report target execution mode: GPU or JTP (Java Thread Pool).
-      System.out.println("Execution mode=" + kernel.getExecutionMode());
+      System.out.println("Device = " + kernel.getTargetDevice().getShortDescription());
 
       // Display computed square values.
       for (int i = 0; i < size; i++) {
diff --git a/test/codegen/src/java/com/amd/aparapi/Source.java b/test/codegen/src/java/com/amd/aparapi/Source.java
index a08c2872186874a4d7aef3c387116130718b3770..d9774096ed5499de4435e7da3e095b0c00d53bde 100644
--- a/test/codegen/src/java/com/amd/aparapi/Source.java
+++ b/test/codegen/src/java/com/amd/aparapi/Source.java
@@ -84,7 +84,8 @@ public class Source{
 
    public Source(Class<?> _clazz, File _rootDir) {
       clazz = _clazz;
+      // Must use getSimpleName(): concatenating the Class itself yields "class <fqcn>", which is not a valid file name.
       String srcName = clazz.getPackage().getName().replace(".", "/") + "/" + clazz.getSimpleName() + ".java";
       file = new File(_rootDir, srcName);
       try {
          BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
diff --git a/test/runtime/src/java/com/amd/aparapi/test/runtime/BufferTransfer.java b/test/runtime/src/java/com/amd/aparapi/test/runtime/BufferTransfer.java
index 847a69575859dce0ab6d894f34cd9812270a7f36..1f9a36fa893ca1d383606c82cf52538f11eec61c 100644
--- a/test/runtime/src/java/com/amd/aparapi/test/runtime/BufferTransfer.java
+++ b/test/runtime/src/java/com/amd/aparapi/test/runtime/BufferTransfer.java
@@ -1,17 +1,13 @@
 package com.amd.aparapi.test.runtime;
 
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
+import com.amd.aparapi.*;
+import com.amd.aparapi.device.*;
+import com.amd.aparapi.internal.kernel.*;
+import org.junit.*;
 
-import java.util.Arrays;
+import java.util.*;
 
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import com.amd.aparapi.Kernel;
-import com.amd.aparapi.Range;
-import com.amd.aparapi.device.Device;
-import com.amd.aparapi.device.OpenCLDevice;
+import static org.junit.Assert.*;
 
 public class BufferTransfer{
 
@@ -19,7 +15,7 @@ public class BufferTransfer{
 
    @BeforeClass public static void setUpBeforeClass() throws Exception {
 
-      Device device = Device.best();
+      Device device = KernelManager.instance().bestDevice();
       if (device == null || !(device instanceof OpenCLDevice)) {
          fail("no opencl device!");
       }
@@ -209,7 +205,7 @@ public class BufferTransfer{
             for (int n = 0; n < neuronOutputs.length; n++)
                log[n][simStep[0]] = neuronOutputs[n];
          }
-         System.out.println(getExecutionMode() + (isExplicit() ? ", explicit" : ", auto"));
+         System.out.println(getTargetDevice().getShortDescription() + (isExplicit() ? ", explicit" : ", auto"));
 
          for (int n = 0; n < neuronOutputs.length; n++)
             System.out.println(Arrays.toString(log[n]));
diff --git a/test/runtime/src/java/com/amd/aparapi/test/runtime/CallStaticFromAnonymousKernel.java b/test/runtime/src/java/com/amd/aparapi/test/runtime/CallStaticFromAnonymousKernel.java
index ca70f1a2ec6c39b6da1114a1cd1f39262c5af4f0..8cfb0d251027af33dff7c4b884055a94c7a03adb 100644
--- a/test/runtime/src/java/com/amd/aparapi/test/runtime/CallStaticFromAnonymousKernel.java
+++ b/test/runtime/src/java/com/amd/aparapi/test/runtime/CallStaticFromAnonymousKernel.java
@@ -1,8 +1,10 @@
 package com.amd.aparapi.test.runtime;
 
-import static org.junit.Assert.assertTrue;
-import org.junit.Test;
-import com.amd.aparapi.Kernel;
+import com.amd.aparapi.*;
+import com.amd.aparapi.device.*;
+import org.junit.*;
+
+import static org.junit.Assert.*;
 
 class AnotherClass{
    static public int foo(int i) {
@@ -42,7 +44,7 @@ public class CallStaticFromAnonymousKernel{
          }
       };
       kernel.execute(size);
-      assertTrue("ran on GPU", kernel.getExecutionMode() == Kernel.EXECUTION_MODE.GPU);
+      assertTrue("ran on GPU", kernel.getTargetDevice().getType() == Device.TYPE.GPU);
 
       for (int i = 0; i < size; i++) {
          assertTrue("results == fooBar", results[i] == (fooBar(values[i]) + AnotherClass.foo(i)));
diff --git a/test/runtime/src/java/com/amd/aparapi/test/runtime/ExplicitBoolean.java b/test/runtime/src/java/com/amd/aparapi/test/runtime/ExplicitBoolean.java
index c80b587ba5fff03255756764f1256c7eaab0a44a..c59efbd9f90b1fce79b1de38202f4084b8a0ed5f 100644
--- a/test/runtime/src/java/com/amd/aparapi/test/runtime/ExplicitBoolean.java
+++ b/test/runtime/src/java/com/amd/aparapi/test/runtime/ExplicitBoolean.java
@@ -1,10 +1,9 @@
 package com.amd.aparapi.test.runtime;
 
-import static org.junit.Assert.assertTrue;
+import com.amd.aparapi.*;
+import org.junit.*;
 
-import org.junit.Test;
-
-import com.amd.aparapi.Kernel;
+import static org.junit.Assert.*;
 
 public class ExplicitBoolean{
 
@@ -61,7 +60,7 @@ public class ExplicitBoolean{
          printArray(k2.output);
 
       assertTrue("k1.input == k2.input", Util.same(k1.output, k1.output));
-      System.out.println(k1.getExecutionMode());
+      System.out.println(k1.getTargetDevice().getShortDescription());
    }
 
    private static void printArray(boolean[] a) {
diff --git a/test/runtime/src/java/com/amd/aparapi/test/runtime/LoadCL.java b/test/runtime/src/java/com/amd/aparapi/test/runtime/LoadCL.java
index 28b2a73b50b76eba3ecee1751eaecdefaaaacf22..99d1764c9857952a461135f27ed89ab46cd12ba2 100644
--- a/test/runtime/src/java/com/amd/aparapi/test/runtime/LoadCL.java
+++ b/test/runtime/src/java/com/amd/aparapi/test/runtime/LoadCL.java
@@ -1,14 +1,13 @@
 package com.amd.aparapi.test.runtime;
 
-import com.amd.aparapi.Range;
-import com.amd.aparapi.device.Device;
-import com.amd.aparapi.device.OpenCLDevice;
-import com.amd.aparapi.opencl.OpenCL;
-import com.amd.aparapi.opencl.OpenCL.Resource;
-import com.amd.aparapi.opencl.OpenCL.Source;
+import com.amd.aparapi.*;
+import com.amd.aparapi.device.*;
+import com.amd.aparapi.internal.kernel.*;
+import com.amd.aparapi.opencl.*;
+import com.amd.aparapi.opencl.OpenCL.*;
+import org.junit.*;
 
-import static org.junit.Assert.assertTrue;
-import org.junit.Test;
+import static org.junit.Assert.*;
 
 public class LoadCL{
 
@@ -31,7 +30,7 @@ public class LoadCL{
       final float[] quads = new float[size];
       final Range range = Range.create(size);
 
-      final Device device = Device.best();
+      final Device device = KernelManager.instance().bestDevice();
 
       if (device instanceof OpenCLDevice) {
          final OpenCLDevice openclDevice = (OpenCLDevice) device;
diff --git a/test/runtime/src/java/com/amd/aparapi/test/runtime/Test12x4_4x2.java b/test/runtime/src/java/com/amd/aparapi/test/runtime/Test12x4_4x2.java
index 51cee4366bdc15dc648cfb647976f9eb7cf423b0..b415b7764a36e6cbb210a3df724b8752802998a2 100644
--- a/test/runtime/src/java/com/amd/aparapi/test/runtime/Test12x4_4x2.java
+++ b/test/runtime/src/java/com/amd/aparapi/test/runtime/Test12x4_4x2.java
@@ -1,12 +1,14 @@
 package com.amd.aparapi.test.runtime;
 
+import com.amd.aparapi.device.*;
 import org.junit.Test;
 
 import com.amd.aparapi.Kernel;
 import com.amd.aparapi.Range;
 
 public class Test12x4_4x2{
-   @Test public void test() {
+   @SuppressWarnings("deprecation")
+   @Test public void test() {
       // globalThreadId, threadId, globalX, globalY, localX, localY
       final int[][] test = new int[][] {
             {
@@ -446,7 +448,12 @@ public class Test12x4_4x2{
       };
       Kernel kernel = new Kernel(){
 
-         @Override public void run() {
+         @Override
+         public boolean isAllowDevice(Device _device) {
+            return _device.getType() == Device.TYPE.JTP;
+         }
+
+         @Override public void run() {
             int x = getGlobalId(0);
             int y = getGlobalId(1);
             int lx = getLocalId(0);
@@ -492,7 +499,6 @@ public class Test12x4_4x2{
          }
 
       };
-      kernel.setExecutionMode(Kernel.EXECUTION_MODE.JTP);
       kernel.execute(Range.create2D(12, 4, 4, 2));
 
    }
diff --git a/test/runtime/src/java/com/amd/aparapi/test/runtime/UseStaticArray.java b/test/runtime/src/java/com/amd/aparapi/test/runtime/UseStaticArray.java
index 41f4b0d21e02207f9ad02621c06f2776a67bf5fd..5ce32645e4c77beb6776101754dc2918f5c0d743 100644
--- a/test/runtime/src/java/com/amd/aparapi/test/runtime/UseStaticArray.java
+++ b/test/runtime/src/java/com/amd/aparapi/test/runtime/UseStaticArray.java
@@ -1,8 +1,10 @@
 package com.amd.aparapi.test.runtime;
 
+import com.amd.aparapi.*;
+import com.amd.aparapi.device.*;
+import org.junit.*;
+
 import static org.junit.Assert.*;
-import org.junit.Test;
-import com.amd.aparapi.Kernel;
 
 public class UseStaticArray extends Kernel{
 
@@ -26,7 +28,7 @@ public class UseStaticArray extends Kernel{
 
       execute(size);
 
-      assertTrue("ran on GPU", getExecutionMode() == Kernel.EXECUTION_MODE.GPU);
+      assertTrue("ran on GPU", getTargetDevice().getType() == Device.TYPE.GPU);
 
       assertArrayEquals("results == fooBar", results, values);
 //      for (int i = 0; i < size; i++) {