diff --git a/com.amd.aparapi.jni/src/cpp/CLHelper.cpp b/com.amd.aparapi.jni/src/cpp/CLHelper.cpp
index 1d0752e7d23bef40876cdc5dd393884b3737566b..acf91311e665af7a9d7888b8ed37a0795cf4c3d5 100644
--- a/com.amd.aparapi.jni/src/cpp/CLHelper.cpp
+++ b/com.amd.aparapi.jni/src/cpp/CLHelper.cpp
@@ -40,6 +40,8 @@
 #include "CLHelper.h"
 #include "List.h"
 #include <map>
+#include <vector>
+#include <stdio.h>
 
 void setMap(std::map<cl_int, const char*>& errorMap) {
    errorMap[CL_SUCCESS]                         = "success";
@@ -129,14 +131,88 @@ void CLHelper::getBuildErr(JNIEnv *jenv, cl_device_id deviceId,  cl_program prog
    delete []buildLog;
 }
 
-cl_program CLHelper::compile(JNIEnv *jenv, cl_context context, size_t deviceCount, cl_device_id* deviceIds, jstring source, jstring* log, cl_int* status){
-   const char *sourceChars = jenv->GetStringUTFChars(source, NULL);
-   size_t sourceSize[] = { strlen(sourceChars) };
-   cl_program program = clCreateProgramWithSource(context, 1, &sourceChars, sourceSize, status); 
-   jenv->ReleaseStringUTFChars(source, sourceChars);
-   *status = clBuildProgram(program, deviceCount, deviceIds, NULL, NULL, NULL);
+/**
+ * Creates and builds an OpenCL program for a single device, optionally reusing a cached device binary.
+ *
+ * When *binaryKey is the empty string, caching is disabled and the program is always built from *source.
+ * Otherwise the first successful build for a key stores the device binary in a function-local static cache,
+ * and later calls with the same key create the program from that binary and ignore *source.
+ *
+ * @param jenv      JNI environment used to read the Java strings
+ * @param context   OpenCL context the program is created in
+ * @param deviceId  pointer to the single device to build for
+ * @param source    OpenCL source text (ignored when a cached binary exists for *binaryKey)
+ * @param binaryKey cache key, or an empty string to disable caching
+ * @param log       passed to getBuildErr() when the build fails with CL_BUILD_PROGRAM_FAILURE
+ * @param status    receives the status of program creation, or of the build if creation succeeded
+ * @return the program handle from the OpenCL create call; only usable when *status is CL_SUCCESS
+ *
+ * NOTE(review): the static cache is not guarded by a lock here; callers are presumably expected to
+ * serialize calls that use caching -- confirm.
+ */
+cl_program CLHelper::compile(JNIEnv *jenv, cl_context context, cl_device_id* deviceId, jstring* source, jstring* binaryKey, jstring* log, cl_int* status){
+   using std::map;
+   using std::vector;
+   using std::string;
+
+   // binaryKey -> device binary of a previously successful build
+   static map<string, vector<unsigned char> > binaryCache;
+
+   const char* sourceChars = jenv->GetStringUTFChars(*source, NULL);
+   const char* keyChars = jenv->GetStringUTFChars(*binaryKey, NULL);
+   if (sourceChars == NULL || keyChars == NULL) {
+      // GetStringUTFChars returns NULL when it cannot allocate the copy; bail out instead of dereferencing NULL
+      if (sourceChars != NULL) {
+         jenv->ReleaseStringUTFChars(*source, sourceChars);
+      }
+      if (keyChars != NULL) {
+         jenv->ReleaseStringUTFChars(*binaryKey, keyChars);
+      }
+      *status = CL_OUT_OF_HOST_MEMORY;
+      return NULL;
+   }
+
+   const string keyStr(keyChars);
+   jenv->ReleaseStringUTFChars(*binaryKey, keyChars);
+   const size_t sourceLength = strlen(sourceChars);
+
+   const bool cacheEnabled = !keyStr.empty();
+   map<string, vector<unsigned char> >::iterator cached = cacheEnabled ? binaryCache.find(keyStr) : binaryCache.end();
+   const bool buildFromSource = (cached == binaryCache.end());
+
+   cl_program program;
+   if (buildFromSource) {
+      program = clCreateProgramWithSource(context, 1, &sourceChars, &sourceLength, status);
+   }
+   else {
+      // Cached entries are never empty (only non-empty binaries are stored below), so element 0 exists.
+      const size_t binaryLength = cached->second.size();
+      const unsigned char* binaryChars = &cached->second[0];
+      cl_int binaryStatus = CL_SUCCESS;
+      program = clCreateProgramWithBinary(context, 1, deviceId, &binaryLength, &binaryChars, &binaryStatus, status);
+   }
+   jenv->ReleaseStringUTFChars(*source, sourceChars);
+
+   // Only build a program that was actually created, so a creation error in *status is not overwritten.
+   if (*status == CL_SUCCESS) {
+      *status = clBuildProgram(program, 1, deviceId, NULL, NULL, NULL);
+   }
    if(*status == CL_BUILD_PROGRAM_FAILURE) {
-      getBuildErr(jenv, *deviceIds, program, log);
+      getBuildErr(jenv, *deviceId, program, log);
+   }
+
+   // Cache the binary of a fresh, successful build only; a failed build must not poison the cache.
+   if (buildFromSource && cacheEnabled && *status == CL_SUCCESS) {
+      size_t binaryLength = 0;
+      cl_int infoStatus = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binaryLength, NULL);
+      if (infoStatus == CL_SUCCESS && binaryLength > 0) {
+         vector<unsigned char> binary(binaryLength);
+         unsigned char* binaryChars = &binary[0];
+         infoStatus = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &binaryChars, NULL);
+         if (infoStatus == CL_SUCCESS) {
+            binaryCache[keyStr].swap(binary);
+         }
+      }
    }
    return(program);
 }
diff --git a/com.amd.aparapi.jni/src/cpp/CLHelper.h b/com.amd.aparapi.jni/src/cpp/CLHelper.h
index d1581e9e297a586e3a2f6ad390c10eeb1040c08f..71f761efe26dfc38163c71c1e07ef69afb923680 100644
--- a/com.amd.aparapi.jni/src/cpp/CLHelper.h
+++ b/com.amd.aparapi.jni/src/cpp/CLHelper.h
@@ -45,7 +45,7 @@ class CLHelper{
    public:
    static const char *errString(cl_int status);
    static void getBuildErr(JNIEnv *jenv, cl_device_id deviceId, cl_program program, jstring *log);
-   static cl_program compile(JNIEnv *jenv, cl_context context, size_t deviceCount, cl_device_id* deviceId, jstring source, jstring* log, cl_int *status);
+   static cl_program compile(JNIEnv *jenv, cl_context context, cl_device_id* deviceId, jstring* source, jstring* binaryKey, jstring* log, cl_int *status);
    static jstring getExtensions(JNIEnv *jenv, cl_device_id deviceId, cl_int *status);
 };
 
diff --git a/com.amd.aparapi.jni/src/cpp/invoke/OpenCLJNI.cpp b/com.amd.aparapi.jni/src/cpp/invoke/OpenCLJNI.cpp
index ccfa62bfbae9254f8821dac1fa436380efeb6695..b637a390736f83d7d5f5e5d3f0ab8eaf32c51079 100644
--- a/com.amd.aparapi.jni/src/cpp/invoke/OpenCLJNI.cpp
+++ b/com.amd.aparapi.jni/src/cpp/invoke/OpenCLJNI.cpp
@@ -88,7 +88,7 @@ void OpenCLRange::fill(JNIEnv *jenv, jobject rangeInstance, jint dims, size_t* o
 }
 
 JNI_JAVA(jobject, OpenCLJNI, createProgram)
-   (JNIEnv *jenv, jobject jobj, jobject deviceInstance, jstring source) {
+   (JNIEnv *jenv, jobject jobj, jobject deviceInstance, jstring source, jstring binaryKey) {
 
       jobject platformInstance = OpenCLDevice::getPlatformInstance(jenv, deviceInstance);
       cl_platform_id platformId = OpenCLPlatform::getPlatformId(jenv, platformInstance);
@@ -105,7 +105,7 @@ JNI_JAVA(jobject, OpenCLJNI, createProgram)
 
 
       jstring log=NULL;
-      cl_program program = CLHelper::compile(jenv, context, 1, &deviceId, source, &log, &status);
+      cl_program program = CLHelper::compile(jenv, context, &deviceId, &source, &binaryKey, &log, &status);
       cl_command_queue queue = NULL;
       if(status == CL_SUCCESS) {
          cl_command_queue_properties queue_props = CL_QUEUE_PROFILING_ENABLE;
diff --git a/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp b/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp
index cbad7e81539f48989f847e32edc5ebf643f2b413..eb404c523e909a00a609b35af1d7edf0d1de23a4 100644
--- a/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp
+++ b/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp
@@ -52,7 +52,7 @@
 
 static const int PASS_ID_PREPARING_EXECUTION = -2;
 static const int PASS_ID_COMPLETED_EXECUTION = -1;
-static const int CANCEL_STATUS_FALSE = 0;
+static const int CANCEL_STATUS_FALSE = 0;
 static const int CANCEL_STATUS_TRUE = 1;
 
 //compiler dependant code
@@ -1198,7 +1198,7 @@ void writeProfile(JNIEnv* jenv, JNIContext* jniContext) {
 }
 
 JNI_JAVA(jlong, KernelRunnerJNI, buildProgramJNI)
-   (JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jstring source) {
+   (JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jstring source, jstring binaryKey) {
       JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle);
       if (jniContext == NULL){
          return 0;
@@ -1207,7 +1207,7 @@ JNI_JAVA(jlong, KernelRunnerJNI, buildProgramJNI)
       try {
          cl_int status = CL_SUCCESS;
 
-         jniContext->program = CLHelper::compile(jenv, jniContext->context,  1, &jniContext->deviceId, source, NULL, &status);
+         jniContext->program = CLHelper::compile(jenv, jniContext->context, &jniContext->deviceId, &source, &binaryKey, NULL, &status);
 
          if(status == CL_BUILD_PROGRAM_FAILURE) throw CLException(status, "");
 
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/Config.java b/com.amd.aparapi/src/java/com/amd/aparapi/Config.java
index fbae39bb67e433a2983a77dc7f7326ed87897a0e..6f08fbd80f47b505e4f1f480fbb61f9ddb2b5400 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/Config.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/Config.java
@@ -106,6 +106,14 @@ public class Config extends ConfigJNI{
     */
    public static final boolean dumpProfilesOnExit = Boolean.getBoolean(propPkgName + ".dumpProfilesOnExit");
 
+   /**
+    * Dumps profiling info (for a single execution) after every Kernel execution.
+    *
+    *  Usage -Dcom.amd.aparapi.dumpProfileOnExecution={true|false}
+    *
+    */
+   public static final boolean dumpProfileOnExecution = Boolean.getBoolean(propPkgName + ".dumpProfileOnExecution");
+
    // Pragma/OpenCL codegen related flags
    public static final boolean enableAtomic32 = Boolean.getBoolean(propPkgName + ".enableAtomic32");
 
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java b/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java
index 8bead23faddde914beb74a491a53ad23d1d03864..df9f7c463b4247a398a31ca4367d61da64d79ead 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java
@@ -464,6 +464,8 @@ public abstract class Kernel implements Cloneable {
 
    private KernelRunner kernelRunner = null;
 
+   private boolean autoCleanUpArrays = false;
+
    private KernelState kernelState = new KernelState();
 
    /**
@@ -2110,6 +2112,33 @@ public abstract class Kernel implements Cloneable {
       return prepareKernelRunner().execute(_entrypoint, _range, _passes);
    }
 
+   public boolean isAutoCleanUpArrays() {
+      return autoCleanUpArrays;
+   }
+
+   /**
+    * Property which if true enables automatic calling of {@link #cleanUpArrays()} following each execution.
+    */
+   public void setAutoCleanUpArrays(boolean autoCleanUpArrays) {
+      this.autoCleanUpArrays = autoCleanUpArrays;
+   }
+
+   /**
+    * Frees the bulk of the resources used by this kernel, by setting array sizes in non-primitive {@link KernelArg}s to 1 (0 size is prohibited) and invoking kernel
+    * execution on a zero size range. Unlike {@link #dispose()}, this does not prohibit further invocations of this kernel, as sundry resources such as OpenCL queues are
+    * <b>not</b> freed by this method.
+    *
+    * <p>This allows a "dormant" Kernel to remain in existence without undue strain on GPU resources, which may be strongly preferable to disposing a Kernel and
+    * recreating another one later, as creation/use of a new Kernel (specifically creation of its associated OpenCL context) is expensive.</p>
+    *
+    * <p>Note that where the underlying array field is declared final, it cannot be reassigned and is therefore left at its current size.</p>
+    */
+   public synchronized void cleanUpArrays() {
+      if (kernelRunner != null) {
+         kernelRunner.cleanUpArrays();
+      }
+   }
+
    /**
     * Release any resources associated with this Kernel.
     * <p>
@@ -2125,6 +2154,12 @@ public abstract class Kernel implements Cloneable {
       }
    }
 
+   /** Automatically releases any resources associated with this Kernel when the Kernel is garbage collected. */
+   @Override
+   protected void finalize() {
+      dispose();
+   }
+
    public boolean isRunningCL() {
       return getTargetDevice() instanceof OpenCLDevice;
    }
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/Range.java b/com.amd.aparapi/src/java/com/amd/aparapi/Range.java
index 75db2c245b680a1e4f4b9d134a07f048292755d6..3d6aef9a7973f0bc089243ef887b2d3da6a0d552 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/Range.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/Range.java
@@ -140,6 +140,11 @@ public class Range extends RangeJNI{
    public static Range create(Device _device, int _globalWidth) {
       final Range withoutLocal = create(_device, _globalWidth, 1);
 
+      if (_globalWidth == 0) {
+         withoutLocal.setLocalIsDerived(true);
+         return withoutLocal;
+      }
+
       if (withoutLocal.isValid()) {
          withoutLocal.setLocalIsDerived(true);
          final int[] factors = getFactors(withoutLocal.getGlobalSize_0(), withoutLocal.getMaxWorkItemSize()[0]);
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelArgJNI.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelArgJNI.java
index 270321ff7c58508e4016eb82dc3afadbabeb1b81..895d1ff9e5f17cdc59deaabb2bf27fb180090cdf 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelArgJNI.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelArgJNI.java
@@ -1,8 +1,8 @@
 package com.amd.aparapi.internal.jni;
 
-import java.lang.reflect.Field;
+import com.amd.aparapi.internal.annotation.*;
 
-import com.amd.aparapi.internal.annotation.UsedByJNICode;
+import java.lang.reflect.*;
 
 /**
  * This class is intended to be used as a 'proxy' or 'facade' object for Java code to interact with JNI
@@ -12,28 +12,25 @@ public abstract class KernelArgJNI{
    /**
     * The type of this KernelArg. Created by or-ing appropriate flags
     * 
-    * @see ARG_BOOLEAN
-    * @see ARG_BYTE
-    * @see ARG_CHAR
-    * @see ARG_FLOAT
-    * @see ARG_INT
-    * @see ARG_DOUBLE
-    * @see ARG_LONG
-    * @see ARG_SHORT
-    * @see ARG_ARRAY
-    * @see ARG_PRIMITIVE
-    * @see ARG_READ
-    * @see ARG_WRITE
-    * @see ARG_LOCAL
-    * @see ARG_GLOBAL
-    * @see ARG_CONSTANT
-    * @see ARG_ARRAYLENGTH
-    * @see ARG_APARAPI_BUF
-    * @see ARG_EXPLICIT
-    * @see ARG_EXPLICIT_WRITE
-    * @see ARG_OBJ_ARRAY_STRUCT
-    * @see ARG_APARAPI_BUF_HAS_ARRAY
-    * @see ARG_APARAPI_BUF_IS_DIRECT
+    * @see KernelRunnerJNI#ARG_BOOLEAN
+    * @see KernelRunnerJNI#ARG_BYTE
+    * @see KernelRunnerJNI#ARG_CHAR
+    * @see KernelRunnerJNI#ARG_FLOAT
+    * @see KernelRunnerJNI#ARG_INT
+    * @see KernelRunnerJNI#ARG_DOUBLE
+    * @see KernelRunnerJNI#ARG_LONG
+    * @see KernelRunnerJNI#ARG_SHORT
+    * @see KernelRunnerJNI#ARG_ARRAY
+    * @see KernelRunnerJNI#ARG_PRIMITIVE
+    * @see KernelRunnerJNI#ARG_READ
+    * @see KernelRunnerJNI#ARG_WRITE
+    * @see KernelRunnerJNI#ARG_LOCAL
+    * @see KernelRunnerJNI#ARG_GLOBAL
+    * @see KernelRunnerJNI#ARG_CONSTANT
+    * @see KernelRunnerJNI#ARG_ARRAYLENGTH
+    * @see KernelRunnerJNI#ARG_EXPLICIT
+    * @see KernelRunnerJNI#ARG_EXPLICIT_WRITE
+    * @see KernelRunnerJNI#ARG_OBJ_ARRAY_STRUCT
     */
    @UsedByJNICode protected int type;
 
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelRunnerJNI.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelRunnerJNI.java
index 923875ee51bdf4bab77d550bf36f40b06b7883bd..7b83bb9b4ff345fd9caf40452ce706a3ee9ef34e 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelRunnerJNI.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelRunnerJNI.java
@@ -307,7 +307,15 @@ public abstract class KernelRunnerJNI{
 
    protected native int getJNI(long _jniContextHandle, Object _array);
 
-   protected native long buildProgramJNI(long _jniContextHandle, String _source);
+   /**
+    * @param _source The OpenCL source code to compile, which may be sent empty if the binary for that source code is known to be cached on the JNI side
+    *                under the key {@code _binaryKey}.
+    * @param _binaryKey A key which embodies a Kernel class and a Device, under which the JNI side will cache the compiled binary corresponding to that Kernel/Device
+    *                   pair. Once a certain _binaryKey has been passed to this method once, further calls to this method with that key will ignore the _source (which
+    *                   can be passed empty) and use the cached binary.
+    *                   <p>By passing an empty String as the _binaryKey, the entire JNI-side binary caching apparatus can be disabled.
+    */
+   protected native long buildProgramJNI(long _jniContextHandle, String _source, String _binaryKey);
 
    protected native int setArgsJNI(long _jniContextHandle, KernelArgJNI[] _args, int argc);
 
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelDeviceProfile.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelDeviceProfile.java
index 577c13d93e6e5f97740591c1005c4a27ce5921d4..55c4ee5043e9028b9147d2a496a803a79b4af1b4 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelDeviceProfile.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelDeviceProfile.java
@@ -17,6 +17,7 @@ public class KernelDeviceProfile {
    private static final int TABLE_COLUMN_HEADER_WIDTH = 21;
    private static final int TABLE_COLUMN_COUNT_WIDTH = 8;
    private static final int TABLE_COLUMN_WIDTH;
+   private static String tableHeader = null;
    private final Class<? extends Kernel> kernel;
    private final Device device;
    private long[] currentTimes = new long[ProfilingEvent.values().length];
@@ -106,17 +107,20 @@ public class KernelDeviceProfile {
       return sum;
    }
 
-   public static String getTableHeader() {
-      int length = ProfilingEvent.values().length;
-      StringBuilder builder = new StringBuilder(150);
-      appendRowHeaders(builder, "Device", "Count");
-      for (int i = 1; i < length; ++i) {
-         ProfilingEvent stage = ProfilingEvent.values()[i];
-         String heading = stage.name();
-         appendCell(builder, heading);
+   public static synchronized String getTableHeader() {
+      if (tableHeader == null) {
+         int length = ProfilingEvent.values().length;
+         StringBuilder builder = new StringBuilder(150);
+         appendRowHeaders(builder, "Device", "Count");
+         for (int i = 1; i < length; ++i) {
+            ProfilingEvent stage = ProfilingEvent.values()[i];
+            String heading = stage.name();
+            appendCell(builder, heading);
+         }
+         builder.append("  ").append("Total");
+         tableHeader = builder.toString();
       }
-      builder.append("  ").append("Total");
-      return builder.toString();
+      return tableHeader;
    }
 
    public String getLastAsTableRow() {
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelManager.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelManager.java
index 3ced0ae0361ff83686972c6c695b8ba6a8d71345..2b5dc2e2af3474649d22b192fdc662e7a0088d86 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelManager.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelManager.java
@@ -47,11 +47,16 @@ public class KernelManager {
    /** This method returns a shared instance of a given Kernel subclass. The kernelClass needs a no-args constructor, which
     *  need not be public.
     *
-    *  <p>Given that compilation of OpenCL is relatively expensive and that (currently!) there is no caching of compiled OpenCL
-    *  it is desirable to only ever create one instance of any given Kernel subclass, which this method facilitates.</p>
+    *  <p>Each new Kernel instance requires a new JNIContext, the creation of which is expensive. There is apparently no simple solution by which a cached JNIContext can be reused
+    *  for all instances of a given Kernel class, since it is intimately connected with resource acquisition and release. In the absence of a context caching solution, it is often
+    *  highly desirable to only ever create one instance of any given Kernel subclass, which this method facilitates.</p>
     *
-    *  <p>In order to maintain thread saftey, it is necessary to synchronize on the returned kernel for the duration of the process of setting up,
-    *  executing and extracting the results from that kernel, when using a shared instance.</p>
+    *  <p>In order to maintain thread safety when using a shared instance, it is necessary to synchronize on the returned kernel for the duration of the process of setting up,
+    *  executing and extracting the results from that kernel.</p>
+    *
+    *  <p>This method instantiates a Kernel (per Kernel class) via Reflection, and thus can only be used where the Kernel class has a no-args constructor, which need not be public.
+    *  In fact, if a Kernel subclass is designed to be used in conjunction with this method, it is recommended that its <b>only</b> constructor is a <b>private</b> no-args constructor.
+    *  </p>
     *
     *  @throws RuntimeException if the class cannot be instantiated
     */
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelProfile.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelProfile.java
index 647ab5133fc3d50fa24247b1508d080eeaafdf7b..3d1caaa11906ae2fcccacbce59050b0d4b8c86c7 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelProfile.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelProfile.java
@@ -12,6 +12,7 @@ import java.util.logging.*;
  */
 public class KernelProfile {
 
+   private static final double MILLION = 1000000d;
    private static Logger logger = Logger.getLogger(Config.getLoggerName());
    private final Class<? extends Kernel> kernelClass;
    private LinkedHashMap<Device, KernelDeviceProfile> deviceProfiles = new LinkedHashMap<>();
@@ -25,12 +26,13 @@ public class KernelProfile {
 
    public double getLastExecutionTime() {
       KernelDeviceProfile lastDeviceProfile = getLastDeviceProfile();
-      return lastDeviceProfile == null ? Double.NaN : lastDeviceProfile.getLastElapsedTime(ProfilingEvent.START, ProfilingEvent.EXECUTED);
+      return lastDeviceProfile == null ? Double.NaN : lastDeviceProfile.getLastElapsedTime(ProfilingEvent.START, ProfilingEvent.EXECUTED) / MILLION;
    }
 
    public double getLastConversionTime() {
       KernelDeviceProfile lastDeviceProfile = getLastDeviceProfile();
-      return lastDeviceProfile == null ? Double.NaN : lastDeviceProfile.getLastElapsedTime(ProfilingEvent.START, ProfilingEvent.EXECUTED);   }
+      return lastDeviceProfile == null ? Double.NaN : lastDeviceProfile.getLastElapsedTime(ProfilingEvent.START, ProfilingEvent.EXECUTED) / MILLION;
+   }
 
    public double getAccumulatedTotalTime() {
       KernelDeviceProfile lastDeviceProfile = getLastDeviceProfile();
@@ -38,12 +40,12 @@ public class KernelProfile {
          return Double.NaN;
       }
       else {
-         return lastDeviceProfile.getCumulativeElapsedTimeAll();
+         return lastDeviceProfile.getCumulativeElapsedTimeAll() / MILLION;
       }
    }
 
-   private KernelDeviceProfile getLastDeviceProfile() {
-      return null;
+   public KernelDeviceProfile getLastDeviceProfile() {
+      return deviceProfiles.get(currentDevice);
    }
 
    void onStart(Device device) {
@@ -61,10 +63,11 @@ public class KernelProfile {
    void onEvent(ProfilingEvent event) {
       switch (event) {
          case CLASS_MODEL_BUILT: // fallthrough
-         case OPENCL_GENERATED: // fallthrough
-         case OPENCL_COMPILED: // fallthrough
-         case PREPARE_EXECUTE: // fallthrough
-         case EXECUTED: // fallthrough
+         case OPENCL_GENERATED:  // fallthrough
+         case INIT_JNI:          // fallthrough
+         case OPENCL_COMPILED:   // fallthrough
+         case PREPARE_EXECUTE:   // fallthrough
+         case EXECUTED:          // fallthrough
          {
             if (currentDeviceProfile == null) {
                logger.log(Level.SEVERE, "Error in KernelProfile, no currentDevice (synchronization error?");
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java
index f162d695ed5130737b525cdbc707f49b41d56b30..7f250d0fe0ab949e51408df82bb2766ff4f12ec5 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java
@@ -75,6 +75,10 @@ import java.util.logging.*;
  */
 public class KernelRunner extends KernelRunnerJNI{
 
+   public static boolean BINARY_CACHING_DISABLED = false;
+
+   private static final int MINIMUM_ARRAY_SIZE = 1;
+
    /** @see #getCurrentPass() */
    @UsedByJNICode public static final int PASS_ID_PREPARING_EXECUTION = -2;
    /** @see #getCurrentPass() */
@@ -129,6 +133,7 @@ public class KernelRunner extends KernelRunnerJNI{
    private static final ForkJoinPool threadPool = new ForkJoinPool(Runtime.getRuntime().availableProcessors(),
          lowPriorityThreadFactory, null, false);
    private static HashMap<Class<? extends Kernel>, String> openCLCache = new HashMap<>();
+   private static LinkedHashSet<String> seenBinaryKeys = new LinkedHashSet<>();
 
    /**
     * Create a KernelRunner for a specific Kernel instance.
@@ -147,7 +152,34 @@ public class KernelRunner extends KernelRunnerJNI{
       inBufferRemoteInt = inBufferRemote.asIntBuffer();
       outBufferRemoteInt = outBufferRemote.asIntBuffer();
 
-      KernelManager.instance(); // ensures static initialization of KernalManager
+      KernelManager.instance(); // ensures static initialization of KernelManager
+   }
+
+   /**
+    * @see Kernel#cleanUpArrays()
+    */
+   public void cleanUpArrays() {
+      if (args != null && kernel.isRunningCL()) {
+         for (KernelArg arg : args) {
+            if ((arg.getType() & KernelRunnerJNI.ARG_ARRAY) != 0) {
+               Field field = arg.getField();
+               if (field != null && field.getType().isArray() && !Modifier.isFinal(field.getModifiers())) {
+                  field.setAccessible(true);
+                  Class<?> componentType = field.getType().getComponentType();
+                  Object newValue = Array.newInstance(componentType, MINIMUM_ARRAY_SIZE);
+                  try {
+                     field.set(kernel, newValue);
+                  }
+                  catch (IllegalAccessException e) {
+                     throw new RuntimeException(e);
+                  }
+               }
+            }
+         }
+         kernel.execute(0);
+      } else if (kernel.isRunningCL()) {
+         logger.log(Level.SEVERE, "KernelRunner#cleanUpArrays() could not execute as no args available (Kernel has not been executed?)");
+      }
    }
 
    /**
@@ -156,7 +188,7 @@ public class KernelRunner extends KernelRunnerJNI{
     * @see KernelRunnerJNI#disposeJNI(long)
     */
    public void dispose() {
-      if (kernel.isRunningCL()) {
+      if (args != null || kernel.isRunningCL()) {
          disposeJNI(jniContextHandle);
       }
       // We are using a shared pool, so there's no need no shutdown it when kernel is disposed
@@ -1005,7 +1037,7 @@ public class KernelRunner extends KernelRunnerJNI{
          kernel.setFallbackExecutionMode();
       }
       recreateRange(_settings);
-      return executeInternal(_settings);
+      return executeInternalInner(_settings);
    }
 
    private void recreateRange(ExecutionSettings _settings) {
@@ -1075,33 +1107,23 @@ public class KernelRunner extends KernelRunnerJNI{
       }
 
       recreateRange(_settings);
-      return executeInternal(_settings);
-   }
-
-   private String describeDevice() {
-      Device device = KernelManager.instance().getPreferences(kernel).getPreferredDevice(kernel);
-      return (device == null) ? "<default fallback>" : device.getShortDescription();
-   }
-
-   @Override
-   public String toString() {
-      return "KernelRunner{" + kernel + "}";
+      return executeInternalInner(_settings);
    }
 
    @SuppressWarnings("deprecation")
    public synchronized Kernel execute(String _entrypoint, final Range _range, final int _passes) {
       executing = true;
-      clearCancelMultiPass();
-      KernelProfile profile = KernelManager.instance().getProfile(kernel.getClass());
-      KernelPreferences preferences = KernelManager.instance().getPreferences(kernel);
-      boolean legacyExecutionMode = kernel.getExecutionMode() != Kernel.EXECUTION_MODE.AUTO;
-
-      ExecutionSettings settings = new ExecutionSettings(preferences, profile, _entrypoint, _range, _passes, legacyExecutionMode);
       try {
+         clearCancelMultiPass();
+         KernelProfile profile = KernelManager.instance().getProfile(kernel.getClass());
+         KernelPreferences preferences = KernelManager.instance().getPreferences(kernel);
+         boolean legacyExecutionMode = kernel.getExecutionMode() != Kernel.EXECUTION_MODE.AUTO;
+
+         ExecutionSettings settings = new ExecutionSettings(preferences, profile, _entrypoint, _range, _passes, legacyExecutionMode);
          // Two Kernels of the same class share the same KernelPreferences object, and since failure (fallback) generally mutates
          // the preferences object, we must lock it. Note this prevents two Kernels of the same class executing simultaneously.
          synchronized (preferences) {
-            return executeInternal(settings);
+            return executeInternalOuter(settings);
          }
       } finally {
          executing = false;
@@ -1109,8 +1131,18 @@ public class KernelRunner extends KernelRunnerJNI{
       }
    }
 
+   private synchronized Kernel executeInternalOuter(ExecutionSettings _settings) {
+      try {
+         return executeInternalInner(_settings);
+      } finally { // post-execution clean-up; skipped for the zero-size range that cleanUpArrays() itself executes
+         if (kernel.isAutoCleanUpArrays() && _settings.range != null && _settings.range.getGlobalSize_0() != 0) {
+            cleanUpArrays(); // range null-check above keeps an NPE here from masking executeInternalInner's exception
+         }
+      }
+   }
+
    @SuppressWarnings("deprecation")
-   private synchronized Kernel executeInternal(ExecutionSettings _settings) {
+   private synchronized Kernel executeInternalInner(ExecutionSettings _settings) {
 
       if (_settings.range == null) {
          throw new IllegalStateException("range can't be null");
@@ -1119,7 +1151,7 @@ public class KernelRunner extends KernelRunnerJNI{
       EXECUTION_MODE requestedExecutionMode = kernel.getExecutionMode();
 
       if (requestedExecutionMode.isOpenCL() && _settings.range.getDevice() != null && !(_settings.range.getDevice() instanceof OpenCLDevice)) {
-         fallBackToNextDevice(_settings, "OpenCL was requested but Device supplied was not an OpenCLDevice");
+         fallBackToNextDevice(_settings, "OpenCL EXECUTION_MODE was requested but Device supplied was not an OpenCLDevice");
       }
 
       Device device = _settings.range.getDevice();
@@ -1151,9 +1183,6 @@ public class KernelRunner extends KernelRunnerJNI{
          OpenCLDevice openCLDevice = device instanceof OpenCLDevice ? (OpenCLDevice) device : null;
 
          int jniFlags = 0;
-         if (_settings.legacyExecutionMode && device != null && !(device instanceof OpenCLDevice)) {
-            hashCode();
-         }
          // for legacy reasons use old logic where Kernel.EXECUTION_MODE is not AUTO
          if (_settings.legacyExecutionMode && !userSpecifiedDevice && requestedExecutionMode.isOpenCL()) {
             if (requestedExecutionMode.equals(EXECUTION_MODE.GPU)) {
@@ -1214,9 +1243,8 @@ public class KernelRunner extends KernelRunnerJNI{
                      // jniFlags |= (kernel.getExecutionMode().equals(Kernel.EXECUTION_MODE.GPU) ? JNI_FLAG_USE_GPU : 0);
                      // Init the device to check capabilities before emitting the
                      // code that requires the capabilities.
-
-                     // synchronized(Kernel.class){
                      jniContextHandle = initJNI(kernel, openCLDevice, jniFlags); // openCLDevice will not be null here
+                     _settings.profile.onEvent(ProfilingEvent.INIT_JNI);
                   } // end of synchronized! issue 68
 
                   if (jniContextHandle == 0) {
@@ -1282,8 +1310,26 @@ public class KernelRunner extends KernelRunnerJNI{
                      }
                   }
 
-                  // Send the string to OpenCL to compile it
-                  long handle = buildProgramJNI(jniContextHandle, openCL);
+                  // Send the string to OpenCL to compile it, or if the compiled binary is already cached on JNI side just empty string to use cached binary
+                  long handle;
+                  if (BINARY_CACHING_DISABLED) {
+                     handle = buildProgramJNI(jniContextHandle, openCL, "");
+                  } else {
+                     synchronized (seenBinaryKeys) {
+                        String binaryKey = kernel.getClass().getName() + ":" + device.getDeviceId();
+                        if (seenBinaryKeys.contains(binaryKey)) {
+                           logger.log(Level.INFO, "reusing cached binary for " + binaryKey);
+                           handle = buildProgramJNI(jniContextHandle, "", binaryKey);
+                        } else {
+                           logger.log(Level.INFO, "compiling new binary for " + binaryKey);
+                           handle = buildProgramJNI(jniContextHandle, openCL, binaryKey);
+                           // Remember the key only when the build succeeded (non-zero handle); otherwise later runs would send an empty source for it.
+                           if (handle != 0) {
+                              seenBinaryKeys.add(binaryKey);
+                           }
+                        }
+                     }
+                  }
                   _settings.profile.onEvent(ProfilingEvent.OPENCL_COMPILED);
                   if (handle == 0) {
                      return fallBackToNextDevice(_settings, "OpenCL compile failed");
@@ -1446,6 +1492,31 @@ public class KernelRunner extends KernelRunnerJNI{
       }
       finally {
          _settings.profile.onEvent(ProfilingEvent.EXECUTED);
+         maybeReportProfile(_settings);
+      }
+   }
+
+   /** Identifies this runner by the kernel it wraps. */
+   @Override
+   public String toString() {
+      return "KernelRunner{" + kernel + "}";
+   }
+
+   /** Short description of the device currently preferred for the kernel, or a placeholder when there is none. */
+   private String describeDevice() {
+      Device preferred = KernelManager.instance().getPreferences(kernel).getPreferredDevice(kernel);
+      if (preferred == null) {
+         return "<default fallback>";
+      }
+      return preferred.getShortDescription();
+   }
+
+   /** Prints the profile table header and the latest profile row to stdout when Config.dumpProfileOnExecution is set. */
+   private void maybeReportProfile(ExecutionSettings _settings) {
+      if (Config.dumpProfileOnExecution) {
+         String header = KernelDeviceProfile.getTableHeader();
+         String lastRow = _settings.profile.getLastDeviceProfile().getLastAsTableRow();
+         System.out.println(header + '\n' + lastRow);
       }
    }
 
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/ProfilingEvent.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/ProfilingEvent.java
index fcb06bfd38a478b80d899c52566a7d5660c105a0..4e1d01d0a524f3a7b2075891b6bbd877ad6cf3b1 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/ProfilingEvent.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/ProfilingEvent.java
@@ -4,5 +4,5 @@ package com.amd.aparapi.internal.kernel;
  * Created by Barney on 02/09/2015.
  */
 public enum ProfilingEvent {
-   START, CLASS_MODEL_BUILT, OPENCL_GENERATED, OPENCL_COMPILED, PREPARE_EXECUTE, EXECUTED
+   START, CLASS_MODEL_BUILT, INIT_JNI, OPENCL_GENERATED, OPENCL_COMPILED, PREPARE_EXECUTE, EXECUTED // INIT_JNI is raised by KernelRunner right after initJNI(...) returns; NOTE(review): constants look ordered by when they fire - confirm before reordering
 }
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ClassModel.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ClassModel.java
index 132f4f21ae49d9371e19914bfc03805f5aceb880..d3db6a62c37b620384f16609afbeed6b2692ce7f 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ClassModel.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ClassModel.java
@@ -65,9 +65,9 @@ import java.util.logging.*;
  * @author gfrost
  *
  */
-public class ClassModel{
+public class ClassModel {
 
-   public interface LocalVariableInfo{
+   public interface LocalVariableInfo {
 
       int getStart();
 
@@ -142,6 +142,7 @@ public class ClassModel{
          });
 
    //   private ValueCache<String, Integer, ClassParseException> privateMemorySizes = ValueCache.on(this::computePrivateMemorySize);
+
    private ValueCache<String, Integer, ClassParseException> privateMemorySizes = ValueCache
          .on(new ThrowingValueComputer<String, Integer, ClassParseException>(){
             @Override
@@ -635,19 +636,25 @@ public class ClassModel{
       return (methodDescription);
    }
 
-   //   private static final ValueCache<Class<?>, ClassModel, ClassParseException> classModelCache = ValueCache.onIdentity(ClassModel::new);
    private static final ValueCache<Class<?>, ClassModel, ClassParseException> classModelCache = ValueCache
          .on(new ThrowingValueComputer<Class<?>, ClassModel, ClassParseException>(){
             @Override
             public ClassModel compute(Class<?> key) throws ClassParseException {
-               return new ClassModel(key);
+               return createClassModelInternal(key);
             }
          });
 
+   /** Single construction point used by both the cache's value computer and the uncached path of createClassModel. */
+   private static ClassModel createClassModelInternal(Class<?> key) throws ClassParseException {
+      return new ClassModel(key);
+   }
+
    public static ClassModel createClassModel(Class<?> _class) throws ClassParseException {
-      if (CacheEnabler.areCachesEnabled())
+      if (CacheEnabler.areCachesEnabled()) {
          return classModelCache.computeIfAbsent(_class);
-      return new ClassModel(_class);
+      }
+
+      return createClassModelInternal(_class);
    }
 
    private int magic;
@@ -746,7 +753,7 @@ public class ClassModel{
 
       private final List<Entry> entries = new ArrayList<Entry>();
 
-      public abstract class Entry{
+      public abstract class Entry {
          private final ConstantPoolType constantPoolType;
 
          private final int slot;
@@ -1559,7 +1566,7 @@ public class ClassModel{
       }
    }
 
-   public class AttributePool{
+   public class AttributePool {
       private final List<AttributePoolEntry> attributePoolEntries = new ArrayList<AttributePoolEntry>();
 
       public class CodeEntry extends AttributePoolEntry{
@@ -1672,7 +1679,7 @@ public class ClassModel{
          }
       }
 
-      public abstract class AttributePoolEntry{
+      public abstract class AttributePoolEntry {
          protected int length;
 
          protected int nameIndex;
@@ -1727,7 +1734,7 @@ public class ClassModel{
       }
 
       public class InnerClassesEntry extends PoolEntry<InnerClassesEntry.InnerClassInfo>{
-         public class InnerClassInfo{
+         public class InnerClassInfo {
             private final int innerAccess;
 
             private final int innerIndex;
@@ -1771,7 +1778,7 @@ public class ClassModel{
 
       public class LineNumberTableEntry extends PoolEntry<LineNumberTableEntry.StartLineNumberPair>{
 
-         public class StartLineNumberPair{
+         public class StartLineNumberPair {
             private final int lineNumber;
 
             private final int start;
@@ -2090,13 +2097,13 @@ public class ClassModel{
 
       public class RuntimeAnnotationsEntry extends PoolEntry<RuntimeAnnotationsEntry.AnnotationInfo>{
 
-         public class AnnotationInfo{
+         public class AnnotationInfo {
             private final int typeIndex;
 
             private final int elementValuePairCount;
 
             public class ElementValuePair{
-               class Value{
+               class Value {
                   Value(int _tag) {
                      tag = _tag;
                   }
@@ -2383,7 +2390,7 @@ public class ClassModel{
 
    private static ClassLoader classModelLoader = ClassModel.class.getClassLoader();
 
-   public class ClassModelField{
+   public class ClassModelField {
       private final int fieldAccessFlags;
 
       AttributePool fieldAttributePool;
@@ -2450,7 +2457,7 @@ public class ClassModel{
       }
    }
 
-   public class ClassModelMethod{
+   public class ClassModelMethod {
 
       private final int methodAccessFlags;
 
@@ -2554,7 +2561,7 @@ public class ClassModel{
       }
    }
 
-   public class ClassModelInterface{
+   public class ClassModelInterface {
       private final int interfaceIndex;
 
       ClassModelInterface(ByteReader _byteReader) {
@@ -2805,7 +2812,8 @@ public class ClassModel{
    Entrypoint getEntrypoint(String _entrypointName, String _descriptor, Object _k) throws AparapiException {
       if (CacheEnabler.areCachesEnabled()) {
          EntrypointKey key = EntrypointKey.of(_entrypointName, _descriptor);
+         // the cached entry is kernel-independent (see the variable name); cloneForKernel(_k) derives the per-kernel instance
          Entrypoint entrypointWithoutKernel = entrypointCache.computeIfAbsent(key);
          return entrypointWithoutKernel.cloneForKernel(_k);
       } else {
          final MethodModel method = getMethodModel(_entrypointName, _descriptor);
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Memoizer.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Memoizer.java
index ece7e391574fb962f7f28d06e876e97693b2d970..7eec09b7e7a08a606d44712a714c86b6ab064fe8 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Memoizer.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Memoizer.java
@@ -1,7 +1,7 @@
 package com.amd.aparapi.internal.model;
 
-import java.util.NoSuchElementException;
-import java.util.concurrent.atomic.AtomicReference;
+import java.util.*;
+import java.util.concurrent.atomic.*;
 
 interface Optional<E> {
    final class Some<E> implements Optional<E>{
@@ -49,7 +49,7 @@ interface Optional<E> {
    boolean isPresent();
 }
 
-public interface Memoizer<T> extends Supplier<T>{
+public interface Memoizer<T> extends Supplier<T> {
    public final class Impl<T> implements Memoizer<T>{
       private final Supplier<T> supplier;
 
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ValueCache.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ValueCache.java
index ef66a53fdeca66f8da816f12f1d88e360d749303..63906ed0465b9d95150dc3923f05552d9aacaa90 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ValueCache.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ValueCache.java
@@ -1,9 +1,7 @@
 package com.amd.aparapi.internal.model;
 
-import java.lang.ref.Reference;
-import java.lang.ref.SoftReference;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.ConcurrentMap;
+import java.lang.ref.*;
+import java.util.concurrent.*;
 
 //import java.util.function.Supplier;
 
@@ -14,7 +12,7 @@ public final class ValueCache<K, V, T extends Throwable> {
    }
 
    //    @FunctionalInterface
-   public interface ValueComputer<K, V> extends ThrowingValueComputer<K, V, RuntimeException>{
+   public interface ValueComputer<K, V> extends ThrowingValueComputer<K, V, RuntimeException> {
       // Marker interface
    }
 
diff --git a/samples/blackscholes/src/com/amd/aparapi/sample/blackscholes/Main.java b/samples/blackscholes/src/com/amd/aparapi/sample/blackscholes/Main.java
index bdc1486254513bbc07b61b1cb84e3313debca650..074ed2b013182be33f73aa4325d463c256aecbd0 100644
--- a/samples/blackscholes/src/com/amd/aparapi/sample/blackscholes/Main.java
+++ b/samples/blackscholes/src/com/amd/aparapi/sample/blackscholes/Main.java
@@ -76,7 +76,6 @@ public class Main{
       /**
       * @brief   Abromowitz Stegun approxmimation for PHI (Cumulative Normal Distribution Function)
       * @param   X input value
-      * @param   phi pointer to store calculated CND of X
       */
       float phi(float X) {
          final float c1 = 0.319381530f;
@@ -183,18 +182,17 @@ public class Main{
 
       int size = Integer.getInteger("size", 512);
       Range range = Range.create(size);
-      int iterations = Integer.getInteger("iterations", 5);
+      int iterations = Integer.getInteger("iterations", 20);
       System.out.println("size =" + size);
       System.out.println("iterations =" + iterations);
       BlackScholesKernel kernel = new BlackScholesKernel(size);
 
-      long totalExecTime = 0;
-      long iterExecTime = 0;
-      /*
+      // Warm-up passes: run the kernel before measuring and note the time they consumed so it is excluded from the average.
       for (int i = 0; i < iterations; i++) {
-         iterExecTime = kernel.execute(size).getExecutionTime();
-         totalExecTime += iterExecTime;
-      }*/
+         kernel.execute(size);
+      }
+      double warmupTime = kernel.getAccumulatedExecutionTime();
+
       kernel.execute(range, iterations);
-      System.out.println("Average execution time " + kernel.getAccumulatedExecutionTime() / iterations);
+      System.out.println("Average execution time " + (kernel.getAccumulatedExecutionTime() - warmupTime) / iterations);
       kernel.showResults(10);
diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/AutoCleanUpArraysDemo.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/AutoCleanUpArraysDemo.java
new file mode 100644
index 0000000000000000000000000000000000000000..c09d0ab218bec2c0a303a77517890397ede4b2d5
--- /dev/null
+++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/AutoCleanUpArraysDemo.java
@@ -0,0 +1,20 @@
+package com.amd.aparapi.sample.configuration;
+
+import com.amd.aparapi.sample.mandel.*;
+
+/** Runs the Mandelbrot kernel twice with setAutoCleanUpArrays(true), printing the length of its rgb array after each run. */
+public class AutoCleanUpArraysDemo {
+   public static void main(String[] ignored) {
+      System.setProperty("com.amd.aparapi.dumpProfileOnExecution", "true");
+      final int side = 1024;
+      final int pixelCount = side * side;
+      int[] pixels = new int[pixelCount];
+      Main.MandelKernel mandel = new Main.MandelKernel(side, side, pixels);
+      mandel.setAutoCleanUpArrays(true);
+      mandel.execute(pixelCount);
+      System.out.println("length = " + mandel.getRgbs().length);
+      mandel.resetImage(side, side, pixels);
+      mandel.execute(pixelCount);
+      System.out.println("length = " + mandel.getRgbs().length);
+   }
+}
diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/CleanUpArraysDemo.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/CleanUpArraysDemo.java
new file mode 100644
index 0000000000000000000000000000000000000000..26d832f4b2d1db2be339c6933fff405d642b6a7c
--- /dev/null
+++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/CleanUpArraysDemo.java
@@ -0,0 +1,25 @@
+package com.amd.aparapi.sample.configuration;
+
+import com.amd.aparapi.sample.mandel.*;
+
+/** Runs the Mandelbrot kernel, calls cleanUpArrays(), then re-binds the image and runs again, printing the rgb length at each step. */
+public class CleanUpArraysDemo {
+   public static void main(String[] ignored) {
+      System.setProperty("com.amd.aparapi.enableVerboseJNI", "true");
+      System.setProperty("com.amd.aparapi.enableVerboseJNIOpenCLResourceTracking", "true");
+      System.setProperty("com.amd.aparapi.enableExecutionModeReporting", "true");
+      System.setProperty("com.amd.aparapi.dumpProfileOnExecution", "true");
+      final int side = 1024;
+      final int pixelCount = side * side;
+      int[] pixels = new int[pixelCount];
+      Main.MandelKernel mandel = new Main.MandelKernel(side, side, pixels);
+      mandel.execute(pixelCount);
+      System.out.println("length = " + mandel.getRgbs().length);
+      System.out.println("Cleaning up arrays");
+      mandel.cleanUpArrays();
+      System.out.println("length = " + mandel.getRgbs().length);
+      mandel.resetImage(side, side, pixels);
+      mandel.execute(pixelCount);
+      System.out.println("length = " + mandel.getRgbs().length);
+   }
+}
diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/ConfigurationDemo.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/ConfigurationDemo.java
index bdfb3cf21d21ad26b50772f56c91d231cc7e352e..67d7cc0296b1432303e80ab8fd39fec477f64891 100644
--- a/samples/configuration/src/com/amd/aparapi/sample/configuration/ConfigurationDemo.java
+++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/ConfigurationDemo.java
@@ -10,16 +10,14 @@ import java.util.*;
  */
 public class ConfigurationDemo {
    public static void main(String[] ignored) {
-      System.setProperty("com.amd.aparapi.dumpProfilesOnExit", "true");
-
       StringBuilder report;
 
       List<Integer> tests = Arrays.asList(0, 1, 2, 3);
-      int reps = 300;
+      int reps = 1;
       for (int rep = 0; rep < reps; ++rep) {
          runTests(rep == 0, tests);
 
-         if (rep % 100 == 99 || rep == 0) {
+         if (rep % 100 == 99 || rep == 0 || rep == reps - 1) {
             report = new StringBuilder("rep = " + rep + "\n");
             KernelManager.instance().reportDeviceUsage(report, true);
             System.out.println(report);
diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/ProfilingDemo.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/ProfilingDemo.java
new file mode 100644
index 0000000000000000000000000000000000000000..aeea4ea5888c4bcf13b0dddf5fcad7cb05038edc
--- /dev/null
+++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/ProfilingDemo.java
@@ -0,0 +1,97 @@
+package com.amd.aparapi.sample.configuration;
+
+import com.amd.aparapi.*;
+import com.amd.aparapi.internal.kernel.*;
+import com.amd.aparapi.sample.blackscholes.Main.*;
+import com.amd.aparapi.sample.mandel.*;
+
+/**
+ * Demonstrate new enhanced profiling capability, profiling the kernel from the blackscholes sample.
+ */
+public class ProfilingDemo {
+
+   /** Kernel being profiled; {@link #newBlackScholesKernel(int)} disposes it before installing a replacement. */
+   private static BlackScholesKernel kernel;
+
+   public static void main(String[] ignored) {
+
+      final int size = 1024;
+      newBlackScholesKernel(size);
+
+      // first execute an arbitrary Kernel (not the one we are profiling!) a few times to ensure class loading and initial JIT optimisations have
+      // been performed before we start the profiling
+      int warmups = 5;
+      for (int i = 0; i < warmups; ++i) {
+         runWarmup();
+      }
+
+      String tableHeader = KernelDeviceProfile.getTableHeader();
+
+      boolean newKernel = false;
+
+      runOnce(size, newKernel);
+      System.out.println("First run:");
+      printLastProfile(tableHeader);
+
+
+      int reps = 20;
+
+      System.out.println("\nSubsequent runs using same kernel:");
+      for (int rep = 0; rep < reps; ++rep) {
+         runOnce(size, newKernel);
+         printLastProfile(tableHeader);
+      }
+
+      newKernel = true;
+      System.out.println("\nSubsequent runs using new kernels:");
+      for (int rep = 0; rep < reps; ++rep) {
+         runOnce(size, newKernel);
+         printLastProfile(tableHeader);
+      }
+
+      // Note. You will see from the output that there is a substantial cost to Kernel creation (vs Kernel reuse), almost entirely due to KernelRunner#initJNI
+
+      // The last kernel has no successor to trigger its disposal, so dispose it here; clearing the field keeps a
+      // repeated call of main (e.g. from another demo) from disposing the same kernel twice.
+      kernel.dispose();
+      kernel = null;
+   }
+
+   /** Prints the given header followed by the most recent profile row recorded for BlackScholesKernel. */
+   private static void printLastProfile(String tableHeader) {
+      KernelProfile profile = KernelManager.instance().getProfile(BlackScholesKernel.class);
+      KernelDeviceProfile deviceProfile = profile.getLastDeviceProfile();
+      String row = deviceProfile.getLastAsTableRow();
+      System.out.println(tableHeader);
+      System.out.println(row);
+   }
+
+   /** Executes the kernel under test once, first swapping in a fresh instance when newKernel is true. */
+   private static void runOnce(int size, boolean newKernel) {
+      if (newKernel) {
+         newBlackScholesKernel(size);
+      }
+      kernel.execute(size);
+   }
+
+   /** Runs an unrelated Mandelbrot kernel once and disposes it afterwards. */
+   private static void runWarmup() {
+      int[] rgb = new int[512 * 512];
+      Kernel warmupKernel = new Main.MandelKernel(512, 512, rgb);
+      try {
+         warmupKernel.execute(512 * 512);
+      } finally {
+         // throwaway kernel: dispose it like every other kernel in this demo so warm-up passes do not pile up undisposed kernels
+         warmupKernel.dispose();
+      }
+   }
+
+   /** Disposes the current kernel (if any), requests a GC and creates a new BlackScholesKernel of the given size. */
+   private static void newBlackScholesKernel(int size) {
+      if (kernel != null) {
+         kernel.dispose();
+      }
+      System.gc();
+      kernel = new BlackScholesKernel(size);
+   }
+}
diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/ProfilingDemoNoBinaryCaching.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/ProfilingDemoNoBinaryCaching.java
new file mode 100644
index 0000000000000000000000000000000000000000..2f3252c68cf63a67e21b2c2603ea1b5d333a25de
--- /dev/null
+++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/ProfilingDemoNoBinaryCaching.java
@@ -0,0 +1,14 @@
+package com.amd.aparapi.sample.configuration;
+
+import com.amd.aparapi.internal.kernel.*;
+
+/**
+ * Runs {@link ProfilingDemo} with {@code KernelRunner.BINARY_CACHING_DISABLED} set. Created by Barney on 13/09/2015.
+ */
+public class ProfilingDemoNoBinaryCaching {
+
+   public static void main(String[] ignored) {
+      KernelRunner.BINARY_CACHING_DISABLED = true; // set before ProfilingDemo creates or runs any kernel
+      ProfilingDemo.main(null); // ProfilingDemo.main never reads its argument, so null is acceptable
+   }
+}
diff --git a/samples/convolution/src/com/amd/aparapi/sample/convolution/Convolution.java b/samples/convolution/src/com/amd/aparapi/sample/convolution/Convolution.java
index fc70267efa87412914e40580fce6ffb3f9fb66c9..597317a6af365eefe16ab223f5e2b4d0c1164261 100644
--- a/samples/convolution/src/com/amd/aparapi/sample/convolution/Convolution.java
+++ b/samples/convolution/src/com/amd/aparapi/sample/convolution/Convolution.java
@@ -38,15 +38,15 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit
 
 package com.amd.aparapi.sample.convolution;
 
-import java.io.File;
+import com.amd.aparapi.*;
 
-import com.amd.aparapi.Kernel;
+import java.io.*;
 
 public class Convolution {
 
-    public static void main(final String[] _args) {
+    public static void main(final String[] _args) throws IOException {
 
-        final File file = new File(_args.length == 1 ? _args[0] : "testcard.jpg");
+        final File file = new File(_args.length == 1 ? _args[0] : "./samples/convolution/testcard.jpg").getCanonicalFile();
 
         final ImageConvolution convolution = new ImageConvolution();
 
diff --git a/samples/mandel/src/com/amd/aparapi/sample/mandel/Main.java b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main.java
index 13de958505466f8a17ce3af2cbe84f3481d130f8..d527917a74d531e9ff11423126c708fb317dc956 100644
--- a/samples/mandel/src/com/amd/aparapi/sample/mandel/Main.java
+++ b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main.java
@@ -60,6 +60,16 @@ import java.util.List;
 
 public class Main{
 
+   static { // NOTE(review): the commented-out KernelManager override below is disabled example code, kept as-is
+      System.setProperty("com.amd.aparapi.dumpProfilesOnExit", "true"); // presumably makes Aparapi print its profiles at JVM exit - confirm
+//      KernelManager.setKernelManager(new KernelManager() {
+//         @Override
+//         protected List<Device.TYPE> getPreferredDeviceTypes() {
+//            return Collections.singletonList(Device.TYPE.CPU);
+//         }
+//      });
+   }
+
    /**
     * An Aparapi Kernel implementation for creating a scaled view of the mandelbrot set.
     *  
@@ -70,13 +80,13 @@ public class Main{
    public static class MandelKernel extends Kernel{
 
       /** RGB buffer used to store the Mandelbrot image. This buffer holds (width * height) RGB values. */
-      final private int rgb[];
+      private int[] rgb;
 
       /** Mandelbrot image width. */
-      final private int width;
+      private int width;
 
       /** Mandelbrot image height. */
-      final private int height;
+      private int height;
 
       /** Maximum iterations for Mandelbrot. */
       final private int maxIterations = 64;
@@ -112,6 +122,12 @@ public class Main{
 
       }
 
+      public void resetImage(int _width, int _height, int[] _rgb) { // rebinds the kernel to a new size and pixel buffer
+         width = _width;
+         height = _height;
+         rgb = _rgb; // stored by reference, not copied; no check that its length matches _width * _height
+      }
+
       public int getCount(float x, float y) {
          int count = 0;
 
@@ -152,6 +168,9 @@ public class Main{
          scale = _scale;
       }
 
+      public int[] getRgbs() { // returns whatever array the rgb field currently refers to (no copy is made)
+         return rgb;
+      }
    }
 
    /** User selected zoom-in point on the Mandelbrot view. */
diff --git a/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java
index 8a1b7faa68eceb14aeae40c133bf2d6f57303bd0..5bdd9805077801c6620d6c7719a9ef8a7957da50 100644
--- a/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java
+++ b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java
@@ -143,6 +143,7 @@ public class Main2D{
 
    @SuppressWarnings("serial") public static void main(String[] _args) {
 
+
       final JFrame frame = new JFrame("MandelBrot");
 
       /** Mandelbrot image height. */
diff --git a/samples/median/src/com/amd/aparapi/sample/median/MedianDemo.java b/samples/median/src/com/amd/aparapi/sample/median/MedianDemo.java
index f4e3e28c5d7e748613d067ba6e76dbe018429b6e..b94c359d1f03037bb6828bcb7d10751fcb489b4e 100644
--- a/samples/median/src/com/amd/aparapi/sample/median/MedianDemo.java
+++ b/samples/median/src/com/amd/aparapi/sample/median/MedianDemo.java
@@ -1,6 +1,5 @@
 package com.amd.aparapi.sample.median;
 
-import com.amd.aparapi.device.*;
 import com.amd.aparapi.internal.kernel.*;
 
 import javax.imageio.*;
@@ -8,7 +7,6 @@ import javax.swing.*;
 import java.awt.*;
 import java.awt.image.*;
 import java.io.*;
-import java.util.*;
 
 /**
  * Demonstrate use of __private namespaces and @NoCL annotations.
@@ -27,12 +25,10 @@ public class MedianDemo {
       }
    }
 
-   private static final boolean TEST_JTP = true;
-
    public static void main(String[] ignored) {
       final int size = 5;
-      System.setProperty("com.amd.aparapi.enableShowGeneratedOpenCL", "true");
-      boolean verbose = true;
+      System.setProperty("com.amd.aparapi.dumpProfilesOnExit", "true");
+      boolean verbose = false;
       if (verbose)
       {
           System.setProperty("com.amd.aparapi.enableVerboseJNI", "true");
@@ -42,18 +38,22 @@ public class MedianDemo {
           System.setProperty("com.amd.aparapi.enableExecutionModeReporting", "true");
       }
 
-      if (TEST_JTP) {
-         LinkedHashSet<Device> devices = new LinkedHashSet<>(Collections.singleton(JavaDevice.THREAD_POOL));
-         KernelManager.instance().setDefaultPreferredDevices(devices);
-      }
+//      KernelManager.setKernelManager(new KernelManager(){
+//         @Override
+//         protected Comparator<OpenCLDevice> getDefaultGPUComparator() {
+//            return new Comparator<OpenCLDevice>() {
+//               @Override
+//               public int compare(OpenCLDevice o1, OpenCLDevice o2) {
+//                  return o2.getMaxComputeUnits() - o1.getMaxComputeUnits();
+//               }
+//            };
+//         }
+//      });
+
+      System.out.println(KernelManager.instance().bestDevice());
 
       int[] argbs = testImage.getRGB(0, 0, testImage.getWidth(), testImage.getHeight(), null, 0, testImage.getWidth());
-      MedianKernel7x7 kernel = new MedianKernel7x7();
-      kernel._imageTypeOrdinal = MedianKernel7x7.RGB;
-      kernel._sourceWidth = testImage.getWidth();
-      kernel._sourceHeight = testImage.getHeight();
-      kernel._sourcePixels = argbs;
-      kernel._destPixels = new int[argbs.length];
+      MedianKernel7x7 kernel = createMedianKernel(argbs);
 
       kernel.processImages(new MedianSettings(size));
       BufferedImage out = new BufferedImage(testImage.getWidth(), testImage.getHeight(), BufferedImage.TYPE_INT_RGB);
@@ -71,12 +71,35 @@ public class MedianDemo {
       frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
       frame.setVisible(true);
 
-      int reps = 20;
+      StringBuilder builder = new StringBuilder();
+      KernelManager.instance().reportDeviceUsage(builder, true);
+      System.out.println(builder);
+
+      int reps = 50;
+      final boolean newKernel = false;
       for (int rep = 0; rep < reps; ++rep) {
+         if (newKernel) {
+            kernel.dispose();
+            kernel = createMedianKernel(argbs);
+         }
          long start = System.nanoTime();
          kernel.processImages(new MedianSettings(size));
          long elapsed = System.nanoTime() - start;
          System.out.println("elapsed = " + elapsed / 1000000f + "ms");
       }
+
+      builder = new StringBuilder();
+      KernelManager.instance().reportDeviceUsage(builder, true);
+      System.out.println(builder);
+   }
+
+   private static MedianKernel7x7 createMedianKernel(int[] argbs) { // new RGB kernel sized from testImage, reading argbs
+      MedianKernel7x7 median = new MedianKernel7x7();
+      median._imageTypeOrdinal = MedianKernel7x7.RGB;
+      median._sourceWidth = testImage.getWidth();
+      median._sourceHeight = testImage.getHeight();
+      median._sourcePixels = argbs;
+      median._destPixels = new int[argbs.length];
+      return median;
    }
 }