diff --git a/.gitignore b/.gitignore
index 202ac33242205c08f7d20fafac35d38ee6024f83..8a8717d5df41d15bf703b40693a5711633bad079 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,4 @@ hs_err_pid*
 **/classes/
 **/dist/
 **/include/
+**/nbproject/
diff --git a/README.md b/README.md
index f8a09db984eb52b698cef0753ac8738f253f8f25..f28ea63fba21a14b814954ad9ab769525c0aa51a 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@ aparapi
 
 This is the new home of Aparapi.
 
-Until all code, issue tickets, wiki pages, etc. are moved please refer to [Aparapi Google Code](https://code.google.com/p/aparapi/) for documentation.
+Please refer to the [current documentation](doc/README.md) or to the older documentation at [Aparapi Google Code](https://code.google.com/p/aparapi/).
 
 We plan to implement Binary downloads at [GitHub Releases](https://help.github.com/articles/about-releases).
 
diff --git a/build.xml b/build.xml
index abfbc1a3ff75539c4506945857fc47276c600664..443d381e2eba07a6cc474ca9ac3d2dbe6d98507f 100644
--- a/build.xml
+++ b/build.xml
@@ -5,8 +5,9 @@
    <condition property="x86_or_x86_64" value="x86" else="x86_64">
       <or><os arch="x86" /><os arch="i386"/></or>
    </condition>
    <condition property="dist" value="dist_windows_${x86_or_x86_64}"><os family="windows" /></condition>
-   <condition property="dist" value="dist_linux_${x86_or_x86_64}"><and><not><os family="mac"/></not><os family="unix" /></and></condition>
+   <condition property="dist" value="dist_linux_${x86_or_x86_64}"><and><not><os family="mac"/></not><not><os name="FreeBSD"/></not><os family="unix" /></and></condition>
    <condition property="dist" value="dist_mac_${x86_or_x86_64}"><os family="mac" /></condition>
+   <condition property="dist" value="dist_freebsd_${x86_or_x86_64}"><os name="FreeBSD" /></condition>
 
    <target name="help">
      <echo message="Available targets are:-"/>
diff --git a/com.amd.aparapi.jni/build.xml b/com.amd.aparapi.jni/build.xml
index 035bc928bbeb0a01880f7613fa7cc47b23c6534c..9dc834cf71fc9d1504b079f700167129bd4d567b 100644
--- a/com.amd.aparapi.jni/build.xml
+++ b/com.amd.aparapi.jni/build.xml
@@ -91,6 +91,28 @@ First consider editing the properties in build.properties
      </condition>
 
      <echo message=" intel.app.sdk.dir ${intel.app.sdk.dir}"/>
+
+     <available property="freebsd.opencl.exists" file="/usr/local/lib/libOpenCL.so" type="file"/>
+     <condition property="freebsd.app.sdk.dir" value="/usr/local">
+        <and>
+           <os name="FreeBSD" />
+           <isset property="freebsd.opencl.exists" />
+           <not>
+              <isset property="win32.amd.app.sdk.exists" />
+           </not>
+           <not>
+              <isset property="win64.amd.app.sdk.exists" />
+           </not>
+           <not>
+              <isset property="linux.amd.app.sdk.exists" />
+           </not>
+           <not>
+              <isset property="linux.intel.app.sdk.exists" />
+           </not>
+        </and>
+     </condition>
+
+     <echo message=" freebsd.app.sdk.dir ${freebsd.app.sdk.dir}"/>
 
      <condition property="vendor.name" value="amd">
         <isset property="amd.app.sdk.dir" />
@@ -105,6 +127,18 @@ First consider editing the properties in build.properties
         </and>
      </condition>
+
+     <condition property="vendor.name" value="freebsd">
+        <and>
+           <isset property="freebsd.app.sdk.dir" />
+           <not>
+              <isset property="amd.app.sdk.dir" />
+           </not>
+           <not>
+              <isset property="intel.app.sdk.dir" />
+           </not>
+        </and>
+     </condition>
 
      <echo message=" vendor.name ${vendor.name}"/>
 
      <condition property="app.sdk.dir" value="${amd.app.sdk.dir}">
@@ -119,6 +153,15 @@ First consider editing the properties in build.properties
           </not>
        </and>
     </condition>
+
+    <condition property="freebsd.sdk.dir" value="${freebsd.app.sdk.dir}">
+       <and>
+          <isset property="freebsd.app.sdk.dir" />
+          <not>
+             <isset property="app.sdk.dir" />
+          </not>
+       </and>
+    </condition>
 
     <echo message="app.sdk.dir ${app.sdk.dir}"/>
 
@@ -348,6 +391,9 @@ First consider editing the properties in build.properties
           <os family="unix" />
           <not>
              <os family="mac" />
+          </not>
+          <not>
+             <os name="FreeBSD" />
          </not>
        </and>
     </condition>
@@ -355,6 +401,10 @@ First consider editing the properties in build.properties
     <condition property="use.gcc_mac">
        <os family="mac" />
     </condition>
+
+    <condition property="use.clang_freebsd">
+       <os name="FreeBSD" />
+    </condition>
 
     <condition property="x86_or_x86_64" value="x86" else="x86_64">
        <or>
@@ -406,7 +456,10 @@ First consider editing the properties in build.properties
           <not>
              <os family="mac" />
           </not>
-          <not>
+          <not>
+             <os name="FreeBSD" />
+          </not>
+          <not>
             <isset property="app.sdk.dir" />
          </not>
       </and>
@@ -426,6 +479,9 @@ First consider editing the properties in build.properties
          <not>
            <os family="mac" />
         </not>
+        <not>
+           <os name="FreeBSD" />
+        </not>
        <not>
           <isset property="app.sdk.dir.exists" />
        </not>
@@ -582,6 +638,45 @@ First consider editing the properties in build.properties
          <arg value="-Wno-write-strings" />
       </exec>
    </target>
+
+   <target name="clang_freebsd" if="use.clang_freebsd">
+      <mkdir dir="${basedir}/dist"/>
+      <echo message="freebsdcc ${os.arch}" />
+      <exec executable="clang++" failonerror="true">
+         <arg value="-m${gcc.m.value}" />
+         <arg value="-O3" />
+         <arg value="-g" />
+         <arg value="-fPIC" />
+         <arg value="-DCL_USE_DEPRECATED_OPENCL_1_1_APIS"/>
+         <arg value="-I${java.home}/../include" />
+         <arg value="-I${java.home}/../include/freebsd" />
+         <arg value="-Iinclude" />
+         <arg value="-I/usr/local/include" />
+         <arg value="-Isrc/cpp" />
+         <arg value="-Isrc/cpp/runKernel" />
+         <arg value="-Isrc/cpp/invoke" />
+         <arg value="-shared" />
+         <arg value="-o" />
+         <arg value="${basedir}/dist/libaparapi_${x86_or_x86_64}.so" />
+         <arg value="src/cpp/runKernel/Aparapi.cpp" />
+         <arg value="src/cpp/runKernel/ArrayBuffer.cpp" />
+         <arg value="src/cpp/runKernel/AparapiBuffer.cpp" />
+         <arg value="src/cpp/runKernel/Config.cpp" />
+         <arg value="src/cpp/runKernel/JNIContext.cpp" />
+         <arg value="src/cpp/runKernel/KernelArg.cpp" />
+         <arg value="src/cpp/runKernel/ProfileInfo.cpp" />
+         <arg value="src/cpp/runKernel/Range.cpp" />
+         <arg value="src/cpp/invoke/OpenCLJNI.cpp" />
+         <arg value="src/cpp/invoke/OpenCLArgDescriptor.cpp" />
+         <arg value="src/cpp/invoke/OpenCLMem.cpp" />
+         <arg value="src/cpp/CLHelper.cpp" />
+         <arg value="src/cpp/classtools.cpp" />
+         <arg value="src/cpp/JNIHelper.cpp" />
+         <arg value="src/cpp/agent.cpp" />
+         <arg value="-L/usr/local/lib" />
+         <arg value="-lOpenCL" />
+      </exec>
+   </target>
 
    <target name="msvc" if="use.msvc">
       <mkdir dir="${basedir}\dist"/>
@@ -632,7 +727,7 @@ First consider editing the properties in build.properties
       </exec>
    </target>
 
-   <target name="build" depends="clean, javah, msvc, gcc, gcc_mac" />
+   <target name="build" depends="clean, javah, msvc, gcc, gcc_mac, clang_freebsd" />
 
    <target name="msvc_cltest" if="use.msvc">
       <mkdir dir="${basedir}\dist"/>
@@ -684,6 +779,24 @@ First consider editing the properties in build.properties
         <arg value="OpenCL" />
      </exec>
   </target>
+
+  <target name="freebsd_cltest" if="use.clang_freebsd">
+     <mkdir dir="${basedir}/dist"/>
+     <echo message="clang cltest ${os.arch}" />
+     <exec executable="clang++" failonerror="true">
+        <arg value="-O3" />
+        <arg value="-g" />
+        <arg value="-fPIC" />
+        <arg value="-DCL_USE_DEPRECATED_OPENCL_1_1_APIS"/>
+        <arg value="-I${java.home}/../include" />
+        <arg value="-I${java.home}/../include/freebsd" />
+        <arg value="-I/usr/local/include" />
+        <arg value="src/cpp/cltest.cpp" />
+        <arg value="-L/usr/local/lib" />
+        <arg value="-lOpenCL" />
+        <arg value="-o" />
+        <arg value="${basedir}/dist/cltest" />
+     </exec>
+  </target>
 
   <target name="gcc_cltest" if="use.gcc">
      <mkdir dir="${basedir}/dist"/>
@@ -733,6 +846,6 @@ First consider editing the properties in build.properties
      </exec>
   </target>
 
-  <target name="cltest" depends="check,msvc_cltest,mac_cltest,gcc_cltest" />
-  <target name="clt" depends="check,gcc_clt,mac_clt" />
+  <target name="cltest" depends="check,msvc_cltest,mac_cltest,freebsd_cltest,gcc_cltest" />
+  <target name="clt" depends="check,gcc_clt,mac_clt,freebsd_cltest" />
 </project>
diff --git a/com.amd.aparapi.jni/src/cpp/CLHelper.cpp b/com.amd.aparapi.jni/src/cpp/CLHelper.cpp
index 1d0752e7d23bef40876cdc5dd393884b3737566b..d7afc8feb64d2cde3c8a4ae6471beb785103667d 100644
--- a/com.amd.aparapi.jni/src/cpp/CLHelper.cpp
+++ b/com.amd.aparapi.jni/src/cpp/CLHelper.cpp
@@ -40,6 +40,9 @@
 #include "CLHelper.h"
 #include "List.h"
 #include <map>
+#include <vector>
+#include <stdio.h>
+#include <string>
 
 void setMap(std::map<cl_int, const char*>& errorMap) {
    errorMap[CL_SUCCESS] = "success";
@@ -129,14 +132,62 @@ void CLHelper::getBuildErr(JNIEnv *jenv, cl_device_id deviceId, cl_program prog
    delete []buildLog;
 }
 
-cl_program CLHelper::compile(JNIEnv *jenv, cl_context context, size_t deviceCount, cl_device_id* deviceIds, jstring source, jstring* log, cl_int* status){
-   const char *sourceChars = jenv->GetStringUTFChars(source, NULL);
-   size_t sourceSize[] = { strlen(sourceChars) };
-   cl_program program = clCreateProgramWithSource(context, 1, &sourceChars, sourceSize, status);
-   jenv->ReleaseStringUTFChars(source, sourceChars);
-   *status = clBuildProgram(program, deviceCount, deviceIds, NULL, NULL, NULL);
+cl_program CLHelper::compile(JNIEnv *jenv, cl_context context, cl_device_id* deviceId, jstring* source, jstring* binaryKey, jstring* log, cl_int* status){
+   using std::map;
+   using std::vector;
+   using std::string;
+
+   static map<string, vector<unsigned char *> > src2bin;
+   static map<string, vector<size_t> > src2len;
+
+   const char* sourceChars = jenv->GetStringUTFChars(*source, NULL);
+   const char* keyChars = jenv->GetStringUTFChars(*binaryKey, NULL);
+   string sourceStr(sourceChars);
+   string keyStr(keyChars);
+
+   size_t sourceLength[] = {sourceStr.length()};
+
+   bool cacheDisabled = jenv->GetStringLength(*binaryKey) == 0;
+
+   cl_program program;
+   bool is_built_from_source = false;
+   bool keyNotFound = src2bin.find(keyStr) == src2bin.end();
+
+   if (cacheDisabled || keyNotFound) {
+      is_built_from_source = true;
+      program = clCreateProgramWithSource(context, 1, &sourceChars, sourceLength, status);
+   }
+   else{
+      cl_int *binary_status = new cl_int[1];
+      program = clCreateProgramWithBinary(context, 1, deviceId, &src2len[keyStr][0], (const unsigned char**)&src2bin[keyStr][0], binary_status, NULL);
+      cl_int theStatus = binary_status[0];
+      if (theStatus != CL_SUCCESS) {
+         getBuildErr(jenv, *deviceId, program, log);
+      }
+      delete[] binary_status;
+   }
+
+   jenv->ReleaseStringUTFChars(*source, sourceChars);
+   jenv->ReleaseStringUTFChars(*binaryKey, keyChars);
+
+   *status = clBuildProgram(program, 1, deviceId, NULL, NULL, NULL);
    if(*status == CL_BUILD_PROGRAM_FAILURE) {
-      getBuildErr(jenv, *deviceIds, program, log);
+      getBuildErr(jenv, *deviceId, program, log);
+   }
+
+   if(is_built_from_source && !cacheDisabled) {
+      vector<unsigned char *> &bins = src2bin[keyStr];
+      vector<size_t> &lens = src2len[keyStr];
+
+      bins.resize(1);
+      lens.resize(1);
+
+      clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &lens[0], NULL);
+      for(size_t i = 0; i < 1; ++i){
+         bins[i] = new unsigned char[lens[i]];
+      }
+
+      clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &bins[0], NULL);
    }
    return(program);
 }
diff --git a/com.amd.aparapi.jni/src/cpp/CLHelper.h b/com.amd.aparapi.jni/src/cpp/CLHelper.h
index d1581e9e297a586e3a2f6ad390c10eeb1040c08f..71f761efe26dfc38163c71c1e07ef69afb923680 100644
--- a/com.amd.aparapi.jni/src/cpp/CLHelper.h
+++ b/com.amd.aparapi.jni/src/cpp/CLHelper.h
@@ -45,7 +45,7 @@ class CLHelper{
    public:
       static const char *errString(cl_int status);
      static void getBuildErr(JNIEnv *jenv, cl_device_id deviceId, cl_program program, jstring *log);
-      static cl_program compile(JNIEnv *jenv, cl_context context, size_t deviceCount, cl_device_id* deviceId, jstring source, jstring* log, cl_int *status);
+      static cl_program compile(JNIEnv *jenv, cl_context context, cl_device_id* deviceId, jstring* source, jstring* binaryKey, jstring* log, cl_int *status);
      static jstring getExtensions(JNIEnv *jenv, cl_device_id deviceId, cl_int *status);
 };
 
diff --git a/com.amd.aparapi.jni/src/cpp/Common.h b/com.amd.aparapi.jni/src/cpp/Common.h
index 1abcb7e907b5c5128614b3067c2c806dd8d6d316..7e34ed21a3f43cafb27ffc418c7836e3b11cc969 100644
--- a/com.amd.aparapi.jni/src/cpp/Common.h
+++ b/com.amd.aparapi.jni/src/cpp/Common.h
@@ -44,7 +44,7 @@
 #include <string.h>
 #include <time.h>
 
-#ifndef __APPLE__
+#if not defined __APPLE__ && not defined __FreeBSD__
 #include <malloc.h>
 #endif
 
diff --git a/com.amd.aparapi.jni/src/cpp/invoke/OpenCLJNI.cpp b/com.amd.aparapi.jni/src/cpp/invoke/OpenCLJNI.cpp
index ccfa62bfbae9254f8821dac1fa436380efeb6695..b637a390736f83d7d5f5e5d3f0ab8eaf32c51079 100644
--- a/com.amd.aparapi.jni/src/cpp/invoke/OpenCLJNI.cpp
+++ b/com.amd.aparapi.jni/src/cpp/invoke/OpenCLJNI.cpp
@@ -88,7 +88,7 @@ void OpenCLRange::fill(JNIEnv *jenv, jobject rangeInstance, jint dims, size_t* o
 }
 
 JNI_JAVA(jobject, OpenCLJNI, createProgram)
-   (JNIEnv *jenv, jobject jobj, jobject deviceInstance, jstring source) {
+   (JNIEnv *jenv, jobject jobj, jobject deviceInstance, jstring source, jstring binaryKey) {
      jobject platformInstance = OpenCLDevice::getPlatformInstance(jenv, deviceInstance);
      cl_platform_id platformId = OpenCLPlatform::getPlatformId(jenv, platformInstance);
@@ -105,7 +105,7 @@ JNI_JAVA(jobject, OpenCLJNI, createProgram)
 
      jstring log=NULL;
-     cl_program program = CLHelper::compile(jenv, context, 1, &deviceId, source, &log, &status);
+     cl_program program = CLHelper::compile(jenv, context, &deviceId, &source, &binaryKey, &log, &status);
      cl_command_queue queue = NULL;
      if(status == CL_SUCCESS) {
        cl_command_queue_properties queue_props = CL_QUEUE_PROFILING_ENABLE;
diff --git a/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp b/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp
index cbad7e81539f48989f847e32edc5ebf643f2b413..eb404c523e909a00a609b35af1d7edf0d1de23a4 100644
--- a/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp
+++ b/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp
@@ -52,7 +52,7 @@
 static const int PASS_ID_PREPARING_EXECUTION = -2;
 static const int PASS_ID_COMPLETED_EXECUTION = -1;
-static const int CANCEL_STATUS_FALSE = 0; 
+static const int CANCEL_STATUS_FALSE = 0;
 static const int CANCEL_STATUS_TRUE = 1;
 
 //compiler dependant code
@@ -1198,7 +1198,7 @@ void writeProfile(JNIEnv* jenv, JNIContext* jniContext) {
 }
 
 JNI_JAVA(jlong, KernelRunnerJNI, buildProgramJNI)
-   (JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jstring source) {
+   (JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jstring source, jstring binaryKey) {
      JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle);
      if (jniContext == NULL){
         return 0;
@@ -1207,7 +1207,7 @@ JNI_JAVA(jlong, KernelRunnerJNI, buildProgramJNI)
 
      try {
        cl_int status = CL_SUCCESS;
-       jniContext->program = CLHelper::compile(jenv, jniContext->context, 1, &jniContext->deviceId, source, NULL, &status);
+       jniContext->program = CLHelper::compile(jenv, jniContext->context, &jniContext->deviceId, &source, &binaryKey, NULL, &status);
 
        if(status == CL_BUILD_PROGRAM_FAILURE) throw CLException(status, "");
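[Reviewer note] CLHelper::compile now keeps a process-wide map from binary key to compiled program binary, so repeat builds for the same Kernel/Device pair can use clCreateProgramWithBinary instead of recompiling from source. A minimal Java-side sketch of a keying scheme follows; the BinaryKeys class and its key format are illustrative assumptions, not the scheme KernelRunner actually uses.

import com.amd.aparapi.Kernel;

// Hypothetical helper: one cache entry per Kernel class + device pair.
public final class BinaryKeys {
   private BinaryKeys() {
   }

   /** Returns "" when caching is disabled: per buildProgramJNI's javadoc below,
    *  an empty key switches off the JNI-side binary cache entirely. */
   public static String cacheKeyOf(Class<? extends Kernel> kernelClass, long deviceId, boolean cachingEnabled) {
      if (!cachingEnabled) {
         return "";
      }
      // Any string that is stable per Kernel/Device pair would do here.
      return kernelClass.getName() + "|" + deviceId;
   }
}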
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/Config.java b/com.amd.aparapi/src/java/com/amd/aparapi/Config.java
index 339ee89e9e482130aa26c93cd1904f72d3026460..6f08fbd80f47b505e4f1f480fbb61f9ddb2b5400 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/Config.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/Config.java
@@ -37,13 +37,11 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit
  */
 package com.amd.aparapi;
 
-import java.util.logging.Handler;
-import java.util.logging.Level;
-import java.util.logging.Logger;
+import com.amd.aparapi.internal.instruction.*;
+import com.amd.aparapi.internal.jni.*;
+import com.amd.aparapi.internal.tool.*;
 
-import com.amd.aparapi.internal.instruction.Instruction;
-import com.amd.aparapi.internal.jni.ConfigJNI;
-import com.amd.aparapi.internal.tool.InstructionViewer;
+import java.util.logging.*;
 
 /**
  * A central location for holding all runtime configurable properties as well as logging configuration.
@@ -99,6 +97,22 @@ public class Config extends ConfigJNI{
    *
    */
   public static final boolean enableShowGeneratedOpenCL = Boolean.getBoolean(propPkgName + ".enableShowGeneratedOpenCL");
+
+  /**
+   * Upon exiting the JVM, dumps kernel profiling info to standard out.
+   *
+   * Usage -Dcom.amd.aparapi.dumpProfilesOnExit={true|false}
+   *
+   */
+  public static final boolean dumpProfilesOnExit = Boolean.getBoolean(propPkgName + ".dumpProfilesOnExit");
+
+  /**
+   * Dumps profiling info (for a single execution) after every Kernel execution.
+   *
+   * Usage -Dcom.amd.aparapi.dumpProfileOnExecution={true|false}
+   *
+   */
+  public static final boolean dumpProfileOnExecution = Boolean.getBoolean(propPkgName + ".dumpProfileOnExecution");
 
   // Pragma/OpenCL codegen related flags
   public static final boolean enableAtomic32 = Boolean.getBoolean(propPkgName + ".enableAtomic32");
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java b/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java
index 708005ccde41337bd9e23bf26fb84043a632e2db..5e11cab9a1b3d9ab9c1e339170adbc99c1ab1374 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java
@@ -38,8 +38,9 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit
 package com.amd.aparapi;
 
 import com.amd.aparapi.annotation.Experimental;
+import com.amd.aparapi.device.*;
 import com.amd.aparapi.exception.DeprecatedException;
-import com.amd.aparapi.internal.kernel.KernelRunner;
+import com.amd.aparapi.internal.kernel.*;
 import com.amd.aparapi.internal.model.CacheEnabler;
 import com.amd.aparapi.internal.model.ClassModel.ConstantPool.MethodReferenceEntry;
 import com.amd.aparapi.internal.model.ClassModel.ConstantPool.NameAndTypeEntry;
@@ -47,7 +48,7 @@ import com.amd.aparapi.internal.model.ValueCache;
 import com.amd.aparapi.internal.model.ValueCache.ThrowingValueComputer;
 import com.amd.aparapi.internal.model.ValueCache.ValueComputer;
 import com.amd.aparapi.internal.opencl.OpenCLLoader;
-import com.amd.aparapi.internal.util.UnsafeWrapper;
+import com.amd.aparapi.internal.util.*;
 
 import java.lang.annotation.Annotation;
 import java.lang.annotation.ElementType;
@@ -55,14 +56,7 @@ import java.lang.annotation.Retention;
 import java.lang.annotation.RetentionPolicy;
 import java.lang.annotation.Target;
 import java.lang.reflect.Method;
-import java.util.ArrayDeque;
-import java.util.Arrays;
-import java.util.Deque;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.LinkedHashSet;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
 import java.util.concurrent.BrokenBarrierException;
 import java.util.concurrent.CyclicBarrier;
 import java.util.logging.Logger;
@@ -314,7 +308,13 @@ public abstract class Kernel implements Cloneable {
    }
 
    /**
-    * The <i>execution mode</i> ENUM enumerates the possible modes of executing a kernel.
+    * @deprecated It is no longer recommended that {@code EXECUTION_MODE}s are used, as a more sophisticated {@link com.amd.aparapi.device.Device}
+    * preference mechanism is in place, see {@link com.amd.aparapi.internal.kernel.KernelManager}. Though {@link #setExecutionMode(EXECUTION_MODE)}
+    * is still honored, the default EXECUTION_MODE is now {@link EXECUTION_MODE#AUTO}, which indicates that the KernelManager
+    * will determine execution behaviours.
+    *
+    * <p>
+    * The <i>execution mode</i> ENUM enumerates the possible modes of executing a kernel.
     * One can request a mode of execution using the values below, and query a kernel after it first executes to
    * determine how it executed.
    *
@@ -354,8 +354,12 @@ public abstract class Kernel implements Cloneable {
    * @author  gfrost AMD Javalabs
    * @version Alpha, 21/09/2010
    */
-
+   @Deprecated
   public static enum EXECUTION_MODE {
+      /**
+       * A value indicating that the preferred device should be determined by the {@link KernelManager}.
+       */
+      AUTO,
      /**
       * A dummy value to indicate an unknown state.
       */
@@ -389,27 +393,9 @@ public abstract class Kernel implements Cloneable {
       */
      ACC;
 
-      static EXECUTION_MODE getDefaultExecutionMode() {
-         EXECUTION_MODE defaultExecutionMode = OpenCLLoader.isOpenCLAvailable() ? GPU : JTP;
-         final String executionMode = Config.executionMode;
-         if (executionMode != null) {
-            try {
-               EXECUTION_MODE requestedExecutionMode;
-               requestedExecutionMode = getExecutionModeFromString(executionMode).iterator().next();
-               logger.fine("requested execution mode =");
-               if ((OpenCLLoader.isOpenCLAvailable() && requestedExecutionMode.isOpenCL()) || !requestedExecutionMode.isOpenCL()) {
-                  defaultExecutionMode = requestedExecutionMode;
-               }
-            } catch (final Throwable t) {
-               // we will take the default
-            }
-         }
-
-         logger.fine("default execution modes = " + defaultExecutionMode);
-
-         return (defaultExecutionMode);
-      }
-
+      /**
+       * @deprecated See {@link EXECUTION_MODE}.
+       */
      static LinkedHashSet<EXECUTION_MODE> getDefaultExecutionModes() {
        LinkedHashSet<EXECUTION_MODE> defaultExecutionModes = new LinkedHashSet<EXECUTION_MODE>();
 
@@ -478,6 +464,8 @@ public abstract class Kernel implements Cloneable {
 
   private KernelRunner kernelRunner = null;
 
+  private boolean autoCleanUpArrays = false;
+
   private KernelState kernelState = new KernelState();
 
   /**
@@ -956,6 +944,26 @@ public abstract class Kernel implements Cloneable {
   */
  public abstract void run();
 
+  /** False by default. In the event that all preferred devices fail to execute a kernel, it is possible to supply an alternate (possibly non-parallel)
+   * execution algorithm by overriding this method to return true, and overriding {@link #executeFallbackAlgorithm(Range, int)} with the alternate
+   * algorithm.
+   */
+  public boolean hasFallbackAlgorithm() {
+     return false;
+  }
+
+  /** If {@link #hasFallbackAlgorithm()} has been overridden to return true, this method should be overridden so as to
+   * apply a single pass of the kernel's logic to the entire _range.
+   *
+   * <p>
+   * This is not normally required, as fallback to {@link JavaDevice#THREAD_POOL} will implement the algorithm in parallel. However
+   * in the event that thread pool execution may be prohibitively slow, this method might implement a "quick and dirty" approximation
+   * to the desired result (for example, a simple box-blur as opposed to a gaussian blur in an image processing application).
+   */
+  public void executeFallbackAlgorithm(Range _range, int _passId) {
+     // nothing
+  }
+
   /**
    * Invoking this method flags that once the current pass is complete execution should be abandoned. Due to the complexity of intercommunication
    * between java (or C) and executing OpenCL, this is the best we can do for general cancellation of execution at present. OpenCL 2.0 should introduce
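[Reviewer note] To make the new hooks concrete, here is a hedged sketch of a kernel supplying the alternate algorithm described above. The SquareKernel class is invented for illustration; only hasFallbackAlgorithm() and executeFallbackAlgorithm(...) come from the patch.

import com.amd.aparapi.Kernel;
import com.amd.aparapi.Range;

// Illustrative: a square kernel that supplies a simple sequential fallback
// in case every preferred device (including the thread pool) fails.
public class SquareKernel extends Kernel {
   final float[] in;
   final float[] out;

   public SquareKernel(float[] in, float[] out) {
      this.in = in;
      this.out = out;
   }

   @Override public void run() {
      int i = getGlobalId();
      out[i] = in[i] * in[i];
   }

   @Override public boolean hasFallbackAlgorithm() {
      return true;
   }

   @Override public void executeFallbackAlgorithm(Range _range, int _passId) {
      // One pass of the kernel's logic over the whole range, on the calling thread.
      for (int i = 0; i < _range.getGlobalSize_0(); ++i) {
         out[i] = in[i] * in[i];
      }
   }
}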
@@ -1930,26 +1938,29 @@
      return kernelState;
   }
 
+  private KernelRunner prepareKernelRunner() {
+     if (kernelRunner == null) {
+        kernelRunner = new KernelRunner(this);
+     }
+     return kernelRunner;
+  }
+
   /**
    * Determine the execution time of the previous Kernel.execute(range) call.
-   * 
-   * Note that for the first call this will include the conversion time.
-   * 
-   * @return The time spent executing the kernel (ms)
-   * 
+   *
+   * Note that for the first call this will include the conversion time.
+   *
+   * @return The time spent executing the kernel (ms)
+   *
    * @see #getConversionTime();
    * @see #getAccumulatedExecutionTime();
-   * 
+   *
    */
-  public synchronized long getExecutionTime() {
-     return prepareKernelRunner().getExecutionTime();
-  }
-
-  private KernelRunner prepareKernelRunner() {
-     if (kernelRunner == null) {
-        kernelRunner = new KernelRunner(this);
+  public double getExecutionTime() {
+     KernelProfile profile = KernelManager.instance().getProfile(getClass());
+     synchronized (profile) {
+        return profile.getLastExecutionTime();
      }
-     return kernelRunner;
   }
 
   /**
@@ -1963,8 +1974,11 @@
    * @see #getConversionTime();
    *
    */
-  public synchronized long getAccumulatedExecutionTime() {
-     return prepareKernelRunner().getAccumulatedExecutionTime();
+  public double getAccumulatedExecutionTime() {
+     KernelProfile profile = KernelManager.instance().getProfile(getClass());
+     synchronized (profile) {
+        return profile.getAccumulatedTotalTime();
+     }
   }
 
   /**
@@ -1974,8 +1988,11 @@
    * @see #getExecutionTime();
    * @see #getAccumulatedExecutionTime();
    */
-  public synchronized long getConversionTime() {
-     return prepareKernelRunner().getConversionTime();
+  public double getConversionTime() {
+     KernelProfile profile = KernelManager.instance().getProfile(getClass());
+     synchronized (profile) {
+        return profile.getLastConversionTime();
+     }
   }
 
   /**
@@ -1992,10 +2009,30 @@
      return (execute(_range, 1));
   }
 
+  @Override
+  @SuppressWarnings("deprecation")
+  public String toString() {
+     if (executionMode == EXECUTION_MODE.AUTO) {
+        List<Device> preferredDevices = KernelManager.instance().getPreferences(this).getPreferredDevices(this);
+        StringBuilder preferredDevicesSummary = new StringBuilder("{");
+        for (int i = 0; i < preferredDevices.size(); ++i) {
+           Device device = preferredDevices.get(i);
+           preferredDevicesSummary.append(device.getShortDescription());
+           if (i < preferredDevices.size() - 1) {
+              preferredDevicesSummary.append("|");
+           }
+        }
+        preferredDevicesSummary.append("}");
+        return Reflection.getSimpleName(getClass()) + ", devices=" + preferredDevicesSummary.toString();
+     } else {
+        return Reflection.getSimpleName(getClass()) + ", modes=" + executionModes + ", current = " + executionMode;
+     }
+  }
+
   /**
    * Start execution of <code>_range</code> kernels.
    * <p>
    * When <code>kernel.execute(_range)</code> is invoked, Aparapi will schedule the execution of <code>_range</code> kernels. If the execution mode is GPU then
    * the kernels will execute as OpenCL code on the GPU device. Otherwise, if the mode is JTP, the kernels will execute as a pool of Java threads on the CPU.
    * <p>
    * Since adding the new <code>Range class</code> this method offers backward compatibility and merely defers to <code> return (execute(Range.create(_range), 1));</code>.
@@ -2004,7 +2041,18 @@
    *
    */
   public synchronized Kernel execute(int _range) {
-     return (execute(Range.create(_range), 1));
+     return (execute(createRange(_range), 1));
+  }
+
+  @SuppressWarnings("deprecation")
+  protected Range createRange(int _range) {
+     if (executionMode.equals(EXECUTION_MODE.AUTO)) {
+        Device device = getTargetDevice();
+        Range range = Range.create(device, _range);
+        return range;
+     } else {
+        return Range.create(null, _range);
+     }
   }
 
   /**
@@ -2033,21 +2081,7 @@
    *
    */
   public synchronized Kernel execute(int _range, int _passes) {
-     return (execute(Range.create(_range), _passes));
-  }
-
-  /**
-   * Start execution of <code>globalSize</code> kernels for the given entrypoint.
-   * <p>
-   * When <code>kernel.execute("entrypoint", globalSize)</code> is invoked, Aparapi will schedule the execution of <code>globalSize</code> kernels. If the execution mode is GPU then
-   * the kernels will execute as OpenCL code on the GPU device. Otherwise, if the mode is JTP, the kernels will execute as a pool of Java threads on the CPU.
-   * <p>
-   * @param _entry is the name of the method we wish to use as the entrypoint to the kernel
-   * @return The Kernel instance (this) so we can chain calls to put(arr).execute(range).get(arr)
-   *
-   */
-  public synchronized Kernel execute(Entry _entry, Range _range) {
-     return prepareKernelRunner().execute(_entry, _range, 1);
+     return (execute(createRange(_range), _passes));
   }
 
   /**
@@ -2078,6 +2112,33 @@
      return prepareKernelRunner().execute(_entrypoint, _range, _passes);
   }
 
+  public boolean isAutoCleanUpArrays() {
+     return autoCleanUpArrays;
+  }
+
+  /**
+   * Property which if true enables automatic calling of {@link #cleanUpArrays()} following each execution.
+   */
+  public void setAutoCleanUpArrays(boolean autoCleanUpArrays) {
+     this.autoCleanUpArrays = autoCleanUpArrays;
+  }
+
+  /**
+   * Frees the bulk of the resources used by this kernel, by setting array sizes in non-primitive {@link KernelArg}s to 1 (0 size is prohibited) and invoking kernel
+   * execution on a zero size range. Unlike {@link #dispose()}, this does not prohibit further invocations of this kernel, as sundry resources such as OpenCL queues are
+   * <b>not</b> freed by this method.
+   *
+   * <p>This allows a "dormant" Kernel to remain in existence without undue strain on GPU resources, which may be strongly preferable to disposing a Kernel and
+   * recreating another one later, as creation/use of a new Kernel (specifically creation of its associated OpenCL context) is expensive.</p>
+   *
+   * <p>Note that where the underlying array field is declared final, for obvious reasons it is not resized to zero.</p>
+   */
+  public synchronized void cleanUpArrays() {
+     if (kernelRunner != null) {
+        kernelRunner.cleanUpArrays();
+     }
+  }
+
   /**
    * Release any resources associated with this Kernel.
    * <p>
@@ -2093,7 +2154,22 @@
      }
   }
 
+  public boolean isRunningCL() {
+     return getTargetDevice() instanceof OpenCLDevice;
+  }
+
+  public final Device getTargetDevice() {
+     return KernelManager.instance().getPreferences(this).getPreferredDevice(this);
+  }
+
+  /** @return true by default, may be overridden to allow vetoing of a device or devices by a given Kernel instance. */
+  public boolean isAllowDevice(Device _device) {
+     return true;
+  }
+
   /**
+   * @deprecated See {@link EXECUTION_MODE}
+   * <p>
    * Return the current execution mode.
    *
    * Before a Kernel executes, this return value will be the execution mode as determined by the setting of
@@ -2108,11 +2184,14 @@
    *
    * @see #setExecutionMode(EXECUTION_MODE)
    */
+  @Deprecated
   public EXECUTION_MODE getExecutionMode() {
      return (executionMode);
   }
 
   /**
+   * @deprecated See {@link EXECUTION_MODE}
+   * <p>
    * Set the execution mode.
    * <p>
    * This should be regarded as a request. The real mode will be determined at runtime based on the availability of OpenCL and the characteristics of the workload.
@@ -2121,10 +2200,15 @@
    *
    * @see #getExecutionMode()
    */
+  @Deprecated
   public void setExecutionMode(EXECUTION_MODE _executionMode) {
      executionMode = _executionMode;
   }
 
+  /**
+   * @deprecated See {@link EXECUTION_MODE}
+   */
+  @Deprecated
   public void setFallbackExecutionMode() {
      executionMode = EXECUTION_MODE.getFallbackExecutionMode();
   }
@@ -2718,13 +2802,24 @@
      return prepareKernelRunner().getProfileInfo();
   }
 
-  private final LinkedHashSet<EXECUTION_MODE> executionModes = EXECUTION_MODE.getDefaultExecutionModes();
+  /**
+   * @deprecated See {@link EXECUTION_MODE}.
+   */
+  private final LinkedHashSet<EXECUTION_MODE> executionModes = (Config.executionMode != null) ? EXECUTION_MODE.getDefaultExecutionModes() : new LinkedHashSet<>(Collections.singleton(EXECUTION_MODE.AUTO));
 
+  /**
+   * @deprecated See {@link EXECUTION_MODE}.
+   */
   private Iterator<EXECUTION_MODE> currentMode = executionModes.iterator();
 
+  /**
+   * @deprecated See {@link EXECUTION_MODE}.
+   */
   private EXECUTION_MODE executionMode = currentMode.next();
 
   /**
+   * @deprecated See {@link EXECUTION_MODE}.
+   * <p>
    * set possible fallback path for execution modes.
    * for example setExecutionFallbackPath(GPU,CPU,JTP) will try to use the GPU
    * if it fails it will fall back to OpenCL CPU and finally it will try JTP.
@@ -2736,6 +2831,7 @@
   }
 
   /**
+   * @deprecated See {@link EXECUTION_MODE}.
    * @return is there another execution path we can try
    */
   public boolean hasNextExecutionMode() {
@@ -2743,6 +2839,7 @@
   }
 
   /**
+   * @deprecated See {@link EXECUTION_MODE}.
    * try the next execution path in the list if there aren't any more than give up
    */
   public void tryNextExecutionMode() {
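[Reviewer note] A brief usage sketch of the reworked public API, reusing the illustrative SquareKernel from above: timings are now doubles in milliseconds sourced from the KernelManager's profiles, and getTargetDevice() exposes the device the manager will prefer.

import com.amd.aparapi.device.Device;

// Illustrative usage of the reworked Kernel API.
public class KernelUsageDemo {
   public static void main(String[] args) {
      SquareKernel kernel = new SquareKernel(new float[1024], new float[1024]);

      Device target = kernel.getTargetDevice();  // device the KernelManager will prefer
      System.out.println("Will run on: " + target.getShortDescription());

      kernel.setAutoCleanUpArrays(true);         // release array buffers after each execution
      kernel.execute(1024);

      System.out.println("last execution: " + kernel.getExecutionTime() + " ms");
      System.out.println("accumulated:    " + kernel.getAccumulatedExecutionTime() + " ms");

      kernel.dispose();
   }
}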
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/Range.java b/com.amd.aparapi/src/java/com/amd/aparapi/Range.java
index 5fb435a46b0535215833f1dea888fa4934db17bd..64f060bd028195ab2c70521cbeb4b0e82e484168 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/Range.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/Range.java
@@ -1,9 +1,9 @@
 package com.amd.aparapi;
 
-import java.util.Arrays;
+import com.amd.aparapi.device.*;
+import com.amd.aparapi.internal.jni.*;
 
-import com.amd.aparapi.device.Device;
-import com.amd.aparapi.internal.jni.RangeJNI;
+import java.util.*;
 
 /**
  *
@@ -56,7 +56,7 @@ public class Range extends RangeJNI{
 
    public static final int MAX_GROUP_SIZE = Math.max(Runtime.getRuntime().availableProcessors() * THREADS_PER_CORE, MAX_OPENCL_GROUP_SIZE);
 
-   private Device device = null;
+   private OpenCLDevice device = null;
 
    private int maxWorkGroupSize;
 
@@ -73,7 +73,7 @@ public class Range extends RangeJNI{
    * @param _dims
    */
   public Range(Device _device, int _dims) {
-     device = _device;
+     device = !(_device instanceof OpenCLDevice) ? null : (OpenCLDevice) _device;
     dims = _dims;
 
     if (device != null) {
@@ -140,6 +140,20 @@ public class Range extends RangeJNI{
   public static Range create(Device _device, int _globalWidth) {
      final Range withoutLocal = create(_device, _globalWidth, 1);
 
+     if (_device == JavaDevice.THREAD_POOL) {
+        withoutLocal.setLocalSize_0(Runtime.getRuntime().availableProcessors());
+        withoutLocal.setLocalIsDerived(true);
+        return withoutLocal;
+     } else if (_device instanceof JavaDevice) {
+        withoutLocal.setLocalIsDerived(true);
+        return withoutLocal;
+     }
+
+     if (_globalWidth == 0) {
+        withoutLocal.setLocalIsDerived(true);
+        return withoutLocal;
+     }
+
     if (withoutLocal.isValid()) {
       withoutLocal.setLocalIsDerived(true);
       final int[] factors = getFactors(withoutLocal.getGlobalSize_0(), withoutLocal.getMaxWorkItemSize()[0]);
@@ -317,7 +331,7 @@ public class Range extends RangeJNI{
   * For example for <code>MAX_GROUP_SIZE</code> of 64 we favor 4x4x4 over 1x16x16.
   *
   * @param _globalWidth the width of the 3D grid we wish to process
-  * @param _globalHieght the height of the 3D grid we wish to process
+  * @param _globalHeight the height of the 3D grid we wish to process
   * @param _globalDepth the depth of the 3D grid we wish to process
   * @return
   */
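[Reviewer note] A short sketch of the special cases above, assuming the usual RangeJNI-generated getters such as getGlobalSize_0() and getLocalSize_0() are accessible to the caller:

import com.amd.aparapi.Range;
import com.amd.aparapi.device.JavaDevice;

// Illustrative: Range.create now special-cases Java devices, so a thread-pool
// range gets a local size equal to the core count rather than an OpenCL-style
// factorisation of the global size.
public class RangeDemo {
   public static void main(String[] args) {
      Range jtp = Range.create(JavaDevice.THREAD_POOL, 4096);
      System.out.println("JTP local size: " + jtp.getLocalSize_0());

      Range deviceless = Range.create(null, 4096); // what createRange(int) builds in non-AUTO mode
      System.out.println("global size: " + deviceless.getGlobalSize_0());
   }
}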
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/device/Device.java b/com.amd.aparapi/src/java/com/amd/aparapi/device/Device.java
index a4bfcdeb9d6411ce52e9593e41d2fd9f3294a9eb..c3790880b8278ac5689b02e0da67fcb1b934e1e1 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/device/Device.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/device/Device.java
@@ -1,79 +1,76 @@
 package com.amd.aparapi.device;
 
-import com.amd.aparapi.Range;
-import com.amd.aparapi.device.OpenCLDevice.DeviceComparitor;
-import com.amd.aparapi.device.OpenCLDevice.DeviceSelector;
+import com.amd.aparapi.*;
+import com.amd.aparapi.internal.kernel.*;
 
 public abstract class Device{
 
    public static enum TYPE {
-      UNKNOWN,
-      GPU,
-      CPU,
-      JTP,
-      SEQ,
-      ACC
+      UNKNOWN(Integer.MAX_VALUE),
+      GPU(2),
+      CPU(3),
+      JTP(5),
+      SEQ(6),
+      ACC(1),
+      ALT(4);
+
+      /** Heuristic ranking of device types, lower is better. */
+      public final int rank;
+
+      TYPE(int rank) {
+         this.rank = rank;
+      }
    };
 
-   /**
-    * @return Now return the device of any types having the maximum compute units
+   /** @deprecated use {@link KernelManager#bestDevice()}
+    * @see com.amd.aparapi.device
    */
+   @Deprecated
   public static Device best() {
-     return (OpenCLDevice.select(new DeviceComparitor(){
-        @Override public OpenCLDevice select(OpenCLDevice _deviceLhs, OpenCLDevice _deviceRhs) {
-           if (_deviceLhs.getMaxComputeUnits() > _deviceRhs.getMaxComputeUnits()) {
-              return (_deviceLhs);
-           } else {
-              return (_deviceRhs);
-           }
-        }
-     }));
+     return KernelManager.instance().bestDevice();
   }
 
+  /**
+   * @see com.amd.aparapi.device
+   */
+  @SuppressWarnings("deprecation")
+  @Deprecated
  public static Device bestGPU() {
-     return (OpenCLDevice.select(new DeviceComparitor(){
-        @Override public OpenCLDevice select(OpenCLDevice _deviceLhs, OpenCLDevice _deviceRhs) {
-           if (_deviceLhs.getMaxComputeUnits() > _deviceRhs.getMaxComputeUnits()) {
-              return (_deviceLhs);
-           } else {
-              return (_deviceRhs);
-           }
-        }
-     }, Device.TYPE.GPU));
-  }
-
-  public static Device bestACC() {
-     return (OpenCLDevice.select(new DeviceComparitor(){
-        @Override public OpenCLDevice select(OpenCLDevice _deviceLhs, OpenCLDevice _deviceRhs) {
-           if (_deviceLhs.getMaxComputeUnits() > _deviceRhs.getMaxComputeUnits()) {
-              return (_deviceLhs);
-           } else {
-              return (_deviceRhs);
-           }
-        }
-     }, Device.TYPE.ACC));
+     return firstGPU();
  }
 
+  /**
+   * @see com.amd.aparapi.device
+   */
+  @Deprecated
  public static Device first(final Device.TYPE _type) {
-     return (OpenCLDevice.select(new DeviceSelector(){
-        @Override public OpenCLDevice select(OpenCLDevice _device) {
-           return (_device.getType() == _type ? _device : null);
-        }
-     }));
+     return KernelManager.DeprecatedMethods.firstDevice(_type);
  }
 
+  /**
+   * @see com.amd.aparapi.device
+   */
+  @SuppressWarnings("deprecation")
+  @Deprecated
  public static Device firstGPU() {
-     return (first(Device.TYPE.GPU));
+     return KernelManager.DeprecatedMethods.firstDevice(TYPE.GPU);
  }
 
+  /**
+   * @see com.amd.aparapi.device
+   */
+  @SuppressWarnings("deprecation")
+  @Deprecated
  public static Device firstCPU() {
-     return (first(Device.TYPE.CPU));
-
+     return KernelManager.DeprecatedMethods.firstDevice(TYPE.CPU);
  }
 
-  public static Device firstACC() {
-     return (first(Device.TYPE.ACC));
-
+  /**
+   * @see com.amd.aparapi.device
+   */
+  @Deprecated
+  public static Device bestACC() {
+     throw new UnsupportedOperationException();
  }
 
  protected TYPE type = TYPE.UNKNOWN;
@@ -88,6 +85,8 @@ public abstract class Device{
      0
  };
 
+  public abstract String getShortDescription();
+
  public TYPE getType() {
     return type;
  }
@@ -144,4 +143,25 @@ public abstract class Device{
        int _localDepth) {
     return (Range.create3D(this, _globalWidth, _globalHeight, _globalDepth, _localWidth, _localHeight, _localDepth));
  }
+
+  public abstract long getDeviceId();
+
+  @Override
+  public boolean equals(Object o) {
+     if (this == o) {
+        return true;
+     }
+     if (!(o instanceof Device)) {
+        return false;
+     }
+
+     Device device = (Device) o;
+
+     return getDeviceId() == device.getDeviceId();
+  }
+
+  @Override
+  public int hashCode() {
+     return Long.hashCode(getDeviceId());
+  }
 }
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/device/JavaDevice.java b/com.amd.aparapi/src/java/com/amd/aparapi/device/JavaDevice.java
index 78082e77d74d0512dd91df791b130b7beec75bf8..33f5cd4d22e02c6b7f31dc731995e2f906c5fda6 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/device/JavaDevice.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/device/JavaDevice.java
@@ -1,5 +1,32 @@
 package com.amd.aparapi.device;
 
-public class JavaDevice extends Device{
+public class JavaDevice extends Device {
 
+   public static final JavaDevice THREAD_POOL = new JavaDevice(TYPE.JTP, "Java Thread Pool", -3);
+   public static final JavaDevice ALTERNATIVE_ALGORITHM = new JavaDevice(TYPE.ALT, "Java Alternative Algorithm", -2);
+   public static final JavaDevice SEQUENTIAL = new JavaDevice(TYPE.SEQ, "Java Sequential", -1);
+
+   private final String name;
+   private final long deviceId;
+
+   private JavaDevice(TYPE _type, String _name, long deviceId) {
+      this.deviceId = deviceId;
+      this.type = _type;
+      this.name = _name;
+   }
+
+   @Override
+   public String getShortDescription() {
+      return name;
+   }
+
+   @Override
+   public long getDeviceId() {
+      return deviceId;
+   }
+
+   @Override
+   public String toString() {
+      return getShortDescription();
+   }
 }
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/device/OpenCLDevice.java b/com.amd.aparapi/src/java/com/amd/aparapi/device/OpenCLDevice.java
index 61bfe548a2b292191f91de30bc77f74a70a3b615..ce196121488778aaee505202071d933a181d46c8 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/device/OpenCLDevice.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/device/OpenCLDevice.java
@@ -1,34 +1,15 @@
 package com.amd.aparapi.device;
 
-import com.amd.aparapi.ProfileInfo;
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.lang.annotation.Annotation;
-import java.lang.reflect.InvocationHandler;
-import java.lang.reflect.Method;
-import java.lang.reflect.Proxy;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import com.amd.aparapi.Range;
-import com.amd.aparapi.internal.opencl.OpenCLArgDescriptor;
-import com.amd.aparapi.internal.opencl.OpenCLKernel;
-import com.amd.aparapi.internal.opencl.OpenCLPlatform;
-import com.amd.aparapi.internal.opencl.OpenCLProgram;
-import com.amd.aparapi.opencl.OpenCL;
-import com.amd.aparapi.opencl.OpenCL.Arg;
-import com.amd.aparapi.opencl.OpenCL.Constant;
-import com.amd.aparapi.opencl.OpenCL.GlobalReadOnly;
-import com.amd.aparapi.opencl.OpenCL.GlobalReadWrite;
-import com.amd.aparapi.opencl.OpenCL.GlobalWriteOnly;
+import com.amd.aparapi.*;
+import com.amd.aparapi.internal.opencl.*;
+import com.amd.aparapi.opencl.*;
+import com.amd.aparapi.opencl.OpenCL.*;
 import com.amd.aparapi.opencl.OpenCL.Kernel;
-import com.amd.aparapi.opencl.OpenCL.Local;
-import com.amd.aparapi.opencl.OpenCL.Resource;
-import com.amd.aparapi.opencl.OpenCL.Source;
+
+import java.io.*;
+import java.lang.annotation.*;
+import java.lang.reflect.*;
+import java.util.*;
 
 public class OpenCLDevice extends Device{
 
@@ -44,6 +25,8 @@ public class OpenCLDevice extends Device{
 
    private long maxMemAllocSize;
 
+   private String shortDescription = null;
+
    /**
     * Minimal constructor
    *
@@ -101,6 +84,18 @@ public class OpenCLDevice extends Device{
      return (deviceId);
   }
 
+  @Override
+  public String getShortDescription() {
+     if (shortDescription == null) {
+        String vendor = platform.getName();
+        // Hopefully(!) this equates to the recognisable name of the vendor, e.g. "Intel", "NVIDIA", "AMD"
+        // Note, it is not necessarily the hardware vendor, e.g. if the AMD CPU driver (i.e. platform) is used for an Intel CPU, this will be "AMD"
+        String[] split = vendor.split("[\\s\\(\\)]"); // split on whitespace or on '(' or ')' since Intel use "Intel(R)" here
+        shortDescription = split[0] + "<" + getType() + ">";
+     }
+     return shortDescription;
+  }
+
  public static class OpenCLInvocationHandler<T extends OpenCL<T>> implements InvocationHandler{
     private final Map<String, OpenCLKernel> map;
@@ -380,8 +375,6 @@ public class OpenCLDevice extends Device{
        }
     }
 
-      // System.out.println("opencl{\n" + _source + "\n}opencl");
-
     final OpenCLProgram program = new OpenCLProgram(this, _source).createProgram(this);
 
     final Map<String, OpenCLKernel> map = new HashMap<String, OpenCLKernel>();
@@ -412,6 +405,22 @@ public class OpenCLDevice extends Device{
     OpenCLDevice select(OpenCLDevice _deviceLhs, OpenCLDevice _deviceRhs);
  }
 
+  /** List OpenCLDevices of a given TYPE, or all OpenCLDevices if type == null. */
+  public static List<OpenCLDevice> listDevices(TYPE type) {
+     final OpenCLPlatform platform = new OpenCLPlatform(0, null, null, null);
+     final ArrayList<OpenCLDevice> results = new ArrayList<>();
+
+     for (final OpenCLPlatform p : platform.getOpenCLPlatforms()) {
+        for (final OpenCLDevice device : p.getOpenCLDevices()) {
+           if (type == null || device.getType() == type) {
+              results.add(device);
+           }
+        }
+     }
+
+     return results;
+  }
+
  public static OpenCLDevice select(DeviceSelector _deviceSelector) {
     OpenCLDevice device = null;
     final OpenCLPlatform platform = new OpenCLPlatform(0, null, null, null);
@@ -435,8 +444,10 @@ public class OpenCLDevice extends Device{
     OpenCLDevice device = null;
     final OpenCLPlatform platform = new OpenCLPlatform(0, null, null, null);
 
-      for (final OpenCLPlatform p : platform.getOpenCLPlatforms()) {
-         for (final OpenCLDevice d : p.getOpenCLDevices()) {
+     List<OpenCLPlatform> openCLPlatforms = platform.getOpenCLPlatforms();
+     for (final OpenCLPlatform p : openCLPlatforms) {
+        List<OpenCLDevice> openCLDevices = p.getOpenCLDevices();
+        for (final OpenCLDevice d : openCLDevices) {
          if (device == null) {
            device = d;
         } else {
@@ -466,7 +477,6 @@ public class OpenCLDevice extends Device{
     return (device);
  }
 
-   @Override
  public String toString() {
     final StringBuilder s = new StringBuilder("{");
     boolean first = true;
@@ -482,7 +492,8 @@ public class OpenCLDevice extends Device{
 
     s.append("}");
 
-      return ("Device " + deviceId + "\n type:" + type + "\n maxComputeUnits=" + maxComputeUnits + "\n maxWorkItemDimensions="
+     return ("Device " + deviceId + "\n vendor = " + getOpenCLPlatform().getVendor()
+           + "\n type:" + type + "\n maxComputeUnits=" + maxComputeUnits + "\n maxWorkItemDimensions="
           + maxWorkItemDimensions + "\n maxWorkItemSizes=" + s + "\n maxWorkWorkGroupSize=" + maxWorkGroupSize + "\n globalMemSize="
           + globalMemSize + "\n localMemSize=" + localMemSize);
  }
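[Reviewer note] The new listDevices(TYPE) helper replaces the ad-hoc DeviceSelector loops for enumeration. A hedged usage sketch:

import com.amd.aparapi.device.Device;
import com.amd.aparapi.device.OpenCLDevice;

// Illustrative: enumerate every OpenCL GPU the new helper finds.
public class ListDevicesDemo {
   public static void main(String[] args) {
      for (OpenCLDevice device : OpenCLDevice.listDevices(Device.TYPE.GPU)) {
         System.out.println(device.getShortDescription() + " id=" + device.getDeviceId());
      }
      // Passing null lists devices of every type.
      System.out.println(OpenCLDevice.listDevices(null).size() + " OpenCL devices in total");
   }
}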
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/device/package-info.java b/com.amd.aparapi/src/java/com/amd/aparapi/device/package-info.java
index babe06a6e60af0c368db8ffce5c5bfb85a16fd6b..039f1883909ce1a6b8934baa634a17763d44869f 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/device/package-info.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/device/package-info.java
@@ -1,4 +1,19 @@
 /**
+ * Contains classes representing OpenCL-capable devices, and "virtual" (java) devices which execute kernels using java.
+ *
+ * <p>Various methods of {@link com.amd.aparapi.device.Device} which selected devices of a particular type have been deprecated,
+ * as now the preferred mechanism for device selection is to rely on the {@link com.amd.aparapi.internal.kernel.KernelManager} to
+ * select an appropriate device. Where a particular device is required to be used for a certain kernel, for such purposes as
+ * debugging or unit testing, this can be achieved by using
+ * {@link com.amd.aparapi.internal.kernel.KernelManager#setKernelManager(com.amd.aparapi.internal.kernel.KernelManager)} prior to
+ * invoking any Kernel executions, by overriding {@link com.amd.aparapi.Kernel#isAllowDevice(com.amd.aparapi.device.Device)}
+ * to veto/approve devices from the available devices for a given Kernel class, or (not recommended) by using
+ * {@link com.amd.aparapi.internal.kernel.KernelManager#setPreferredDevices(com.amd.aparapi.Kernel, java.util.LinkedHashSet)} to specify
+ * a particular device list for a given Kernel class.
+ *
+ * <p>In order to determine the Device which will be used to execute a particular Kernel, use {@link com.amd.aparapi.Kernel#getTargetDevice()}.
+ * This can also be used immediately after execution to see on which device the kernel actually got executed (in case the execution failed
+ * and fell back to another device).
  *
  */
 package com.amd.aparapi.device;
\ No newline at end of file
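[Reviewer note] Of the selection mechanisms described in this package javadoc, overriding isAllowDevice is the lightest-weight. An illustrative sketch (the CpuOnlyKernel class is invented for this example):

import com.amd.aparapi.Kernel;
import com.amd.aparapi.device.Device;

// Illustrative: pin a kernel to CPU-class devices (e.g. for a unit test)
// by vetoing everything else.
public class CpuOnlyKernel extends Kernel {
   @Override public void run() {
      // trivial body for illustration
   }

   @Override public boolean isAllowDevice(Device _device) {
      return _device.getType() == Device.TYPE.CPU || _device.getType() == Device.TYPE.JTP;
   }
}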
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelArgJNI.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelArgJNI.java
index 270321ff7c58508e4016eb82dc3afadbabeb1b81..895d1ff9e5f17cdc59deaabb2bf27fb180090cdf 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelArgJNI.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelArgJNI.java
@@ -1,8 +1,8 @@
 package com.amd.aparapi.internal.jni;
 
-import java.lang.reflect.Field;
+import com.amd.aparapi.internal.annotation.*;
 
-import com.amd.aparapi.internal.annotation.UsedByJNICode;
+import java.lang.reflect.*;
 
 /**
  * This class is intended to be used as a 'proxy' or 'facade' object for Java code to interact with JNI
@@ -12,28 +12,25 @@ public abstract class KernelArgJNI{
    /**
     * The type of this KernelArg. Created by or-ing appropriate flags
    *
-   * @see ARG_BOOLEAN
-   * @see ARG_BYTE
-   * @see ARG_CHAR
-   * @see ARG_FLOAT
-   * @see ARG_INT
-   * @see ARG_DOUBLE
-   * @see ARG_LONG
-   * @see ARG_SHORT
-   * @see ARG_ARRAY
-   * @see ARG_PRIMITIVE
-   * @see ARG_READ
-   * @see ARG_WRITE
-   * @see ARG_LOCAL
-   * @see ARG_GLOBAL
-   * @see ARG_CONSTANT
-   * @see ARG_ARRAYLENGTH
-   * @see ARG_APARAPI_BUF
-   * @see ARG_EXPLICIT
-   * @see ARG_EXPLICIT_WRITE
-   * @see ARG_OBJ_ARRAY_STRUCT
-   * @see ARG_APARAPI_BUF_HAS_ARRAY
-   * @see ARG_APARAPI_BUF_IS_DIRECT
+   * @see KernelRunnerJNI#ARG_BOOLEAN
+   * @see KernelRunnerJNI#ARG_BYTE
+   * @see KernelRunnerJNI#ARG_CHAR
+   * @see KernelRunnerJNI#ARG_FLOAT
+   * @see KernelRunnerJNI#ARG_INT
+   * @see KernelRunnerJNI#ARG_DOUBLE
+   * @see KernelRunnerJNI#ARG_LONG
+   * @see KernelRunnerJNI#ARG_SHORT
+   * @see KernelRunnerJNI#ARG_ARRAY
+   * @see KernelRunnerJNI#ARG_PRIMITIVE
+   * @see KernelRunnerJNI#ARG_READ
+   * @see KernelRunnerJNI#ARG_WRITE
+   * @see KernelRunnerJNI#ARG_LOCAL
+   * @see KernelRunnerJNI#ARG_GLOBAL
+   * @see KernelRunnerJNI#ARG_CONSTANT
+   * @see KernelRunnerJNI#ARG_ARRAYLENGTH
+   * @see KernelRunnerJNI#ARG_EXPLICIT
+   * @see KernelRunnerJNI#ARG_EXPLICIT_WRITE
+   * @see KernelRunnerJNI#ARG_OBJ_ARRAY_STRUCT
    */
   @UsedByJNICode
   protected int type;
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelRunnerJNI.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelRunnerJNI.java
index 923875ee51bdf4bab77d550bf36f40b06b7883bd..7b83bb9b4ff345fd9caf40452ce706a3ee9ef34e 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelRunnerJNI.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelRunnerJNI.java
@@ -307,7 +307,15 @@ public abstract class KernelRunnerJNI{
 
    protected native int getJNI(long _jniContextHandle, Object _array);
 
-   protected native long buildProgramJNI(long _jniContextHandle, String _source);
+   /**
+    * @param _source The OpenCL source code to compile, which may be sent empty if the binary for that source code is known to be cached on the JNI side
+    *           under the key {@code _binaryKey}.
+    * @param _binaryKey A key which embodies a Kernel class and a Device, under which the JNI side will cache the compiled binary corresponding to that Kernel/Device
+    *           pair. Once a certain _binaryKey has been passed to this method once, further calls to this method with that key will ignore the _source (which
+    *           can be passed empty) and use the cached binary.
+    *           <p>By passing an empty String as the _binaryKey, the entire JNI-side binary caching apparatus can be disabled.
+    */
+   protected native long buildProgramJNI(long _jniContextHandle, String _source, String _binaryKey);
 
    protected native int setArgsJNI(long _jniContextHandle, KernelArgJNI[] _args, int argc);
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelArg.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelArg.java
index 9599bf0469be570c30199d78fc409e3cd76ad823..ce34d6d062e3cad65231c82e84cfc6491efa0ef2 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelArg.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelArg.java
@@ -1,11 +1,11 @@
 package com.amd.aparapi.internal.kernel;
 
-import java.lang.reflect.Field;
-import java.nio.ByteBuffer;
+import com.amd.aparapi.internal.jni.*;
+import com.amd.aparapi.internal.model.*;
+import com.amd.aparapi.internal.util.*;
 
-import com.amd.aparapi.Kernel;
-import com.amd.aparapi.internal.jni.KernelArgJNI;
-import com.amd.aparapi.internal.model.ClassModel;
+import java.lang.reflect.*;
+import java.nio.*;
 
 /**
  * Each field (or captured field in the case of an anonymous inner class) referenced by any bytecode reachable from the users Kernel.run(), will
@@ -48,7 +48,7 @@ public class KernelArg extends KernelArgJNI{
    * Default constructor
    */
   protected KernelArg() {
-
+      // empty
   }
 
   /**
@@ -260,4 +260,9 @@ public class KernelArg extends KernelArgJNI{
   protected void setDims(int[] dims) {
      this.dims = dims;
   }
+
+  @Override
+  public String toString() {
+     return Reflection.getSimpleName(field.getType()) + " " + field.getName();
+  }
 }
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelDeviceProfile.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelDeviceProfile.java
new file mode 100644
index 0000000000000000000000000000000000000000..55c4ee5043e9028b9147d2a496a803a79b4af1b4
--- /dev/null
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelDeviceProfile.java
@@ -0,0 +1,197 @@
+package com.amd.aparapi.internal.kernel;
+
+import com.amd.aparapi.*;
+import com.amd.aparapi.device.*;
+
+import java.text.*;
+import java.util.*;
+import java.util.logging.*;
+
+/**
+ * Created by Barney on 02/09/2015.
+ */
+public class KernelDeviceProfile {
+
+   private static Logger logger = Logger.getLogger(Config.getLoggerName());
+   private static final double MILLION = 1000 * 1000;
+   private static final int TABLE_COLUMN_HEADER_WIDTH = 21;
+   private static final int TABLE_COLUMN_COUNT_WIDTH = 8;
+   private static final int TABLE_COLUMN_WIDTH;
+   private static String tableHeader = null;
+   private final Class<? extends Kernel> kernel;
+   private final Device device;
+   private long[] currentTimes = new long[ProfilingEvent.values().length];
+   private long[] accumulatedTimes = new long[ProfilingEvent.values().length];
+   private ProfilingEvent lastEvent = null;
+   private final DecimalFormat format;
+   private long invocationCount = 0;
+
+   static {
+      assert ProfilingEvent.START.ordinal() == 0 : "ProfilingEvent.START.ordinal() != 0";
+      int max = 0;
+      for (ProfilingEvent event : ProfilingEvent.values()) {
+         max = Math.max(max, event.name().length());
+      }
+      TABLE_COLUMN_WIDTH = max + 1;
+   }
+
+   public KernelDeviceProfile(Class<? extends Kernel> kernel, Device device) {
+      this.kernel = kernel;
+      this.device = device;
+      this.format = (DecimalFormat) DecimalFormat.getNumberInstance();
+      format.setMinimumFractionDigits(3);
+      format.setMaximumFractionDigits(3);
+   }
+
+   public void onEvent(ProfilingEvent event) {
+      if (event == ProfilingEvent.START) {
+         if (lastEvent == ProfilingEvent.START) {
+            logger.log(Level.SEVERE, "Duplicate event ProfilingEvent.START");
+         } else if (lastEvent != null) {
+            logger.log(Level.SEVERE, "ProfilingEvent.START encountered without ProfilingEvent.EXECUTED");
+         }
+         Arrays.fill(currentTimes, 0L);
+         ++invocationCount;
+      } else {
+         if (lastEvent == null) {
+            if (event != ProfilingEvent.EXECUTED) {
+               logger.log(Level.SEVERE, "ProfilingEvent.START was not invoked prior to ProfilingEvent." + event);
+            }
+         } else {
+            for (int i = lastEvent.ordinal() + 1; i < event.ordinal(); ++i) {
+               currentTimes[i] = currentTimes[i - 1];
+            }
+         }
+      }
+      currentTimes[event.ordinal()] = System.nanoTime();
+      if (event == ProfilingEvent.EXECUTED) {
+         for (int i = 1; i < currentTimes.length; ++i) {
+            long elapsed = currentTimes[i] - currentTimes[i - 1];
+            if (elapsed < 0) {
+               logger.log(Level.SEVERE, "negative elapsed time for event " + event);
+               break;
+            }
+            accumulatedTimes[i] += elapsed;
+         }
+      }
+      lastEvent = event;
+      if (event == ProfilingEvent.EXECUTED) {
+         lastEvent = null;
+      }
+   }
+
+   /** Elapsed time for a single event only, i.e. since the previous stage rather than from the start. */
+   public double getLastElapsedTime(ProfilingEvent stage) {
+      if (stage == ProfilingEvent.START) {
+         return 0;
+      }
+      return (currentTimes[stage.ordinal()] - currentTimes[stage.ordinal() - 1]) / MILLION;
+   }
+
+   /** Elapsed time for all events {@code from} through {@code to}. */
+   public double getLastElapsedTime(ProfilingEvent from, ProfilingEvent to) {
+      return (currentTimes[to.ordinal()] - currentTimes[from.ordinal()]) / MILLION;
+   }
+
+   /** Elapsed time for a single event only, i.e. since the previous stage rather than from the start, summed over all executions. */
+   public double getCumulativeElapsedTime(ProfilingEvent stage) {
+      return (accumulatedTimes[stage.ordinal()]) / MILLION;
+   }
+
+   /** Elapsed time of entire execution, summed over all executions. */
+   public double getCumulativeElapsedTimeAll() {
+      double sum = 0;
+      for (int i = 1; i <= ProfilingEvent.EXECUTED.ordinal(); ++i) {
+         sum += accumulatedTimes[i];
+      }
+      return sum;
+   }
+
+   public static synchronized String getTableHeader() {
+      if (tableHeader == null) {
+         int length = ProfilingEvent.values().length;
+         StringBuilder builder = new StringBuilder(150);
+         appendRowHeaders(builder, "Device", "Count");
+         for (int i = 1; i < length; ++i) {
+            ProfilingEvent stage = ProfilingEvent.values()[i];
+            String heading = stage.name();
+            appendCell(builder, heading);
+         }
+         builder.append("  ").append("Total");
+         tableHeader = builder.toString();
+      }
+      return tableHeader;
+   }
+
+   public String getLastAsTableRow() {
+      double total = 0;
+      StringBuilder builder = new StringBuilder(150);
+      appendRowHeaders(builder, device.getShortDescription(), String.valueOf(invocationCount));
+      for (int i = 1; i < currentTimes.length; ++i) {
+         ProfilingEvent stage = ProfilingEvent.values()[i];
+         double time = getLastElapsedTime(stage);
+         total += time;
+         String formatted = format.format(time);
+         appendCell(builder, formatted);
+      }
+      builder.append("  ").append(format.format(total));
+      return builder.toString();
+   }
+
+   public String getCumulativeAsTableRow() {
+      return internalCumulativeAsTableRow(false);
+   }
+
+   public String getAverageAsTableRow() {
+      return internalCumulativeAsTableRow(true);
+   }
+
+   private String internalCumulativeAsTableRow(boolean mean) {
+      double total = 0;
+      double count = mean ? invocationCount : 1;
+      StringBuilder builder = new StringBuilder(150);
+      appendRowHeaders(builder, device.getShortDescription(), String.valueOf(invocationCount));
+      for (int i = 1; i < currentTimes.length; ++i) {
+         ProfilingEvent stage = ProfilingEvent.values()[i];
+         double time = getCumulativeElapsedTime(stage);
+         if (mean) {
+            time /= count;
+         }
+         total += time;
+         String formatted = format.format(time);
+         appendCell(builder, formatted);
+      }
+      builder.append("  ").append(format.format(total));
+      return builder.toString();
+   }
+
+   private static void appendRowHeaders(StringBuilder builder, String device, String count) {
+      if (device.length() > TABLE_COLUMN_HEADER_WIDTH - 1) {
+         device = device.substring(0, TABLE_COLUMN_HEADER_WIDTH - 1);
+      }
+      builder.append(device);
+      int padding = TABLE_COLUMN_HEADER_WIDTH - device.length();
+      for (int i = 0; i < padding; ++i) {
+         builder.append(' ');
+      }
+
+      builder.append(count);
+      padding = TABLE_COLUMN_COUNT_WIDTH - count.length();
+      for (int i = 0; i < padding; ++i) {
+         builder.append(' ');
+      }
+   }
+
+   private static void appendCell(StringBuilder builder, String cell) {
+      int padding = TABLE_COLUMN_WIDTH - cell.length();
+      for (int paddingIndex = 0; paddingIndex < padding; ++paddingIndex) {
+         builder.append(' ');
+      }
+      builder.append(cell);
+   }
+
+   @Override
+   public String toString() {
+      return "KernelDeviceProfile{" + kernel.toString() + ", " + device.getShortDescription() + "}";
+   }
+}
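[Reviewer note] A hedged sketch of pulling one of these profiling tables at runtime; SquareKernel is the invented kernel from earlier, and the same summary is printed automatically on JVM shutdown when the process is started with -Dcom.amd.aparapi.dumpProfilesOnExit=true (via the KernelManager hook below).

import com.amd.aparapi.internal.kernel.KernelManager;

// Illustrative: execute once so a profile exists, then print the summary table.
public class ProfileDump {
   public static void main(String[] args) {
      new SquareKernel(new float[512], new float[512]).execute(512);

      StringBuilder report = new StringBuilder(2048);
      KernelManager.instance().reportProfilingSummary(report);
      System.out.println(report);
   }
}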
+ * Created by Barney on 24/08/2015. + */ +public class KernelManager { + + private static KernelManager INSTANCE = new KernelManager(); + private LinkedHashMap<Class<? extends Kernel>, KernelPreferences> preferences = new LinkedHashMap<>(); + private LinkedHashMap<Class<? extends Kernel>, KernelProfile> profiles = new LinkedHashMap<>(); + private LinkedHashMap<Class<? extends Kernel>, Kernel> sharedInstances = new LinkedHashMap<>(); + + private KernelPreferences defaultPreferences; + + protected KernelManager() { + defaultPreferences = createDefaultPreferences(); + } + + public static KernelManager instance() { + return INSTANCE; + } + + public static void setKernelManager(KernelManager manager) { + INSTANCE = manager; + } + + static { + if (Config.dumpProfilesOnExit) { + Runtime.getRuntime().addShutdownHook(new Thread() { + @Override + public void run() { + StringBuilder builder = new StringBuilder(2048); + instance().reportProfilingSummary(builder); + System.out.println(builder); + } + }); + } + } + + /** This method returns a shared instance of a given Kernel subclass. The kernelClass needs a no-args constructor, which + * need not be public. + * + * <p>Each new Kernel instance requires a new JNIContext, the creation of which is expensive. There is apparently no simple solution by which a cached JNIContext can be reused + * for all instances of a given Kernel class, since it is intimately connected with resource acquisition and release. In the absence of a context caching solution, it is often + * highly desirable to only ever create one instance of any given Kernel subclass, which this method facilitates.</p> + * + * <p>In order to maintain thread safety when using a shared instance, it is necessary to synchronize on the returned kernel for the duration of the process of setting up, + * executing and extracting the results from that kernel.</p> + * + * <p>This method instantiates a Kernel (per Kernel class) via Reflection, and thus can only be used where the Kernel class has a no-args constructor, which need not be public. + * In fact, if a Kernel subclass is designed to be used in conjunction with this method, it is recommended that its <b>only</b> constructor be a <b>private</b> no-args constructor. + * </p> + * + * @throws RuntimeException if the class cannot be instantiated + */ + public static <T extends Kernel> T sharedKernelInstance(Class<T> kernelClass) { + return instance().getSharedKernelInstance(kernelClass); + } + + /** Append a report to {@code builder} which contains information, per Kernel subclass, on which device is currently being used for the + * kernel class, and which (if any) devices failed to execute a given Kernel. + */ + public void reportDeviceUsage(StringBuilder builder, boolean withProfilingInfo) { + builder.append("Device Usage by Kernel Subclass"); + if (withProfilingInfo) { + builder.append(" (showing mean elapsed times in milliseconds)"); + } + builder.append("\n\n"); + for (Class<? extends Kernel> klass : preferences.keySet()) { + KernelPreferences preferences = this.preferences.get(klass); + KernelProfile profile = withProfilingInfo ?
profiles.get(klass) : null; + builder.append(klass.getName()).append(":\n\tusing ").append(preferences.getPreferredDevice(null).getShortDescription()); + List<Device> failedDevices = preferences.getFailedDevices(); + if (failedDevices.size() > 0) { + builder.append(", failed devices = "); + for (int i = 0; i < failedDevices.size(); ++i) { + builder.append(failedDevices.get(i).getShortDescription()); + if (i < failedDevices.size() - 1) { + builder.append(" | "); + } + } + } + if (profile != null) { + builder.append("\n"); + int row = 0; + for (KernelDeviceProfile deviceProfile : profile.getDeviceProfiles()) { + if (row == 0) { + builder.append(deviceProfile.getTableHeader()).append("\n"); + } + builder.append(deviceProfile.getAverageAsTableRow()).append("\n"); + ++row; + } + } + builder.append("\n"); + } + } + + public void reportProfilingSummary(StringBuilder builder) { + builder.append("\nProfiles by Kernel Subclass (mean elapsed times in milliseconds)\n\n"); + builder.append(KernelDeviceProfile.getTableHeader()).append("\n"); + for (Class<? extends Kernel> kernelClass : profiles.keySet()) { + String simpleName = Reflection.getSimpleName(kernelClass); + String kernelName = "----------------- [[ " + simpleName + " ]] "; + builder.append(kernelName); + int dashes = 132 - kernelName.length(); + for (int i = 0; i < dashes; ++i) { + builder.append('-'); + } + builder.append("\n"); + KernelProfile kernelProfile = profiles.get(kernelClass); + for (KernelDeviceProfile deviceProfile : kernelProfile.getDeviceProfiles()) { + builder.append(deviceProfile.getAverageAsTableRow()).append("\n"); + } + } + } + + + public KernelPreferences getPreferences(Kernel kernel) { + synchronized (preferences) { + KernelPreferences kernelPreferences = preferences.get(kernel.getClass()); + if (kernelPreferences == null) { + kernelPreferences = new KernelPreferences(this, kernel.getClass()); + preferences.put(kernel.getClass(), kernelPreferences); + } + return kernelPreferences; + } + } + + public void setPreferredDevices(Kernel _kernel, LinkedHashSet<Device> _devices) { + KernelPreferences kernelPreferences = getPreferences(_kernel); + kernelPreferences.setPreferredDevices(_devices); + } + + public KernelPreferences getDefaultPreferences() { + return defaultPreferences; + } + + public void setDefaultPreferredDevices(LinkedHashSet<Device> _devices) { + defaultPreferences.setPreferredDevices(_devices); + } + + protected KernelPreferences createDefaultPreferences() { + KernelPreferences preferences = new KernelPreferences(this, null); + preferences.setPreferredDevices(createDefaultPreferredDevices()); + return preferences; + } + + private <T extends Kernel> T getSharedKernelInstance(Class<T> kernelClass) { + synchronized (sharedInstances) { + T shared = (T) sharedInstances.get(kernelClass); + if (shared == null) { + try { + Constructor<T> constructor = kernelClass.getDeclaredConstructor(); // getDeclaredConstructor, not getConstructor: per the javadoc the no-args constructor need not be public + constructor.setAccessible(true); + shared = constructor.newInstance(); + sharedInstances.put(kernelClass, shared); + } + catch (Exception e) { + throw new RuntimeException(e); + } + } + return shared; + } + } + + protected LinkedHashSet<Device> createDefaultPreferredDevices() { + LinkedHashSet<Device> devices = new LinkedHashSet<>(); + + List<OpenCLDevice> accelerators = OpenCLDevice.listDevices(Device.TYPE.ACC); + List<OpenCLDevice> gpus = OpenCLDevice.listDevices(Device.TYPE.GPU); + List<OpenCLDevice> cpus = OpenCLDevice.listDevices(Device.TYPE.CPU); + + Collections.sort(accelerators, getDefaultAcceleratorComparator()); +
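To make the shared-instance contract described in the sharedKernelInstance javadoc concrete, here is a minimal usage sketch; SquareKernel and its data are invented for illustration, and the private no-args constructor plus the synchronized block follow the rules that javadoc lays out:

    import com.amd.aparapi.Kernel;
    import com.amd.aparapi.internal.kernel.KernelManager;

    public class SharedKernelDemo {
       static class SquareKernel extends Kernel {
          float[] in = new float[0];
          float[] out = new float[0];

          private SquareKernel() { // private no-args constructor; instantiated reflectively
          }

          @Override public void run() {
             int i = getGlobalId();
             out[i] = in[i] * in[i];
          }
       }

       public static void main(String[] args) {
          float[] data = {1f, 2f, 3f, 4f};
          SquareKernel k = KernelManager.sharedKernelInstance(SquareKernel.class);
          synchronized (k) { // hold the lock across setup, execution and result extraction
             k.in = data;
             k.out = new float[data.length];
             k.execute(data.length);
             System.out.println(java.util.Arrays.toString(k.out));
          }
       }
    }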
Collections.sort(gpus, getDefaultGPUComparator()); + + List<Device.TYPE> preferredDeviceTypes = getPreferredDeviceTypes(); + + for (Device.TYPE type : preferredDeviceTypes) { + switch (type) { + case UNKNOWN: + throw new AssertionError("UNKNOWN device type not supported"); + case GPU: + devices.addAll(gpus); + break; + case CPU: + devices.addAll(cpus); + break; + case JTP: + devices.add(JavaDevice.THREAD_POOL); + break; + case SEQ: + devices.add(JavaDevice.SEQUENTIAL); + break; + case ACC: + devices.addAll(accelerators); + break; + case ALT: + devices.add(JavaDevice.ALTERNATIVE_ALGORITHM); + break; + } + } + + return devices; + } + + protected List<Device.TYPE> getPreferredDeviceTypes() { + return Arrays.asList(Device.TYPE.ACC, Device.TYPE.GPU, Device.TYPE.CPU, Device.TYPE.ALT, Device.TYPE.JTP); + } + + /** NB: returns a negative value when the left-hand device is better. */ + protected Comparator<OpenCLDevice> getDefaultAcceleratorComparator() { + return new Comparator<OpenCLDevice>() { + @Override + public int compare(OpenCLDevice left, OpenCLDevice right) { + return (right.getMaxComputeUnits() - left.getMaxComputeUnits()); + } + }; + } + + /** NB: returns a negative value when the left-hand device is better. */ + protected Comparator<OpenCLDevice> getDefaultGPUComparator() { + return new Comparator<OpenCLDevice>() { + @Override + public int compare(OpenCLDevice left, OpenCLDevice right) { + return selectLhs(left, right) ? -1 : 1; + } + }; + } + + public Device bestDevice() { + return getDefaultPreferences().getPreferredDevice(null); + } + + protected static boolean selectLhs(OpenCLDevice _deviceLhs, OpenCLDevice _deviceRhs) { + boolean nvidiaLhs = _deviceLhs.getOpenCLPlatform().getVendor().toLowerCase().contains("nvidia"); + boolean nvidiaRhs = _deviceRhs.getOpenCLPlatform().getVendor().toLowerCase().contains("nvidia"); + if (nvidiaLhs || nvidiaRhs) { + return selectLhsIfCUDA(_deviceLhs, _deviceRhs); + } + return _deviceLhs.getMaxComputeUnits() > _deviceRhs.getMaxComputeUnits(); + } + + /** NVidia/CUDA architecture reports maxComputeUnits in a completely different context, i.e. maxComputeUnits is not the same as + * (is much less than) the number of OpenCL cores available. + * + * <p>Therefore, when comparing NVidia devices we use different criteria.</p> + */ + protected static boolean selectLhsIfCUDA(OpenCLDevice _deviceLhs, OpenCLDevice _deviceRhs) { + if (_deviceLhs.getType() != _deviceRhs.getType()) { + return selectLhsByType(_deviceLhs.getType(), _deviceRhs.getType()); + } + return _deviceLhs.getMaxWorkGroupSize() == _deviceRhs.getMaxWorkGroupSize() + ? _deviceLhs.getGlobalMemSize() > _deviceRhs.getGlobalMemSize() + : _deviceLhs.getMaxWorkGroupSize() > _deviceRhs.getMaxWorkGroupSize(); + } + + private static boolean selectLhsByType(Device.TYPE lhs, Device.TYPE rhs) { + return lhs.rank < rhs.rank; + } + + public KernelProfile getProfile(Class<? extends Kernel> kernelClass) { + synchronized (profiles) { + KernelProfile profile = profiles.get(kernelClass); + if (profile == null) { + profile = new KernelProfile(kernelClass); + profiles.put(kernelClass, profile); + } + return profile; + } + } + + /** New home for deprecated methods of {@link Device}.
*/ + public static class DeprecatedMethods { + + @Deprecated + public static Device firstDevice(Device.TYPE _type) { + List<Device> devices = instance().getDefaultPreferences().getPreferredDevices(null); + for (Device device : devices) { + if(device.getType() == _type) { + return device; + } + } + return null; + } + + @SuppressWarnings("deprecation") + @Deprecated + public static Device bestGPU() { + return firstDevice(Device.TYPE.GPU); + } + + @SuppressWarnings("deprecation") + @Deprecated + public static Device bestACC() { + return firstDevice(Device.TYPE.ACC); + } + } +} diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelManagers.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelManagers.java new file mode 100644 index 0000000000000000000000000000000000000000..11b28b7085cc4798c4e25a9fd94bbf699b0408f3 --- /dev/null +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelManagers.java @@ -0,0 +1,31 @@ +package com.amd.aparapi.internal.kernel; + +import com.amd.aparapi.device.*; + +import java.util.*; + +/** + * KernelManager instances useful for debugging. + */ +public class KernelManagers { + + public static final KernelManager JTP_ONLY = new KernelManager() { + + private List<Device.TYPE> types = Collections.singletonList(Device.TYPE.JTP); + + @Override + protected List<Device.TYPE> getPreferredDeviceTypes() { + return types; + } + }; + + public static final KernelManager SEQUENTIAL_ONLY = new KernelManager() { + + private final List<Device.TYPE> types = Collections.singletonList(Device.TYPE.SEQ); + + @Override + protected List<Device.TYPE> getPreferredDeviceTypes() { + return types; + } + }; +} diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelPreferences.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelPreferences.java new file mode 100644 index 0000000000000000000000000000000000000000..fd238a6d9e44959f96415e897317508d2446114a --- /dev/null +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelPreferences.java @@ -0,0 +1,78 @@ +package com.amd.aparapi.internal.kernel; + +import com.amd.aparapi.*; +import com.amd.aparapi.device.*; + +import java.util.*; + +public class KernelPreferences { + private final Class<? extends Kernel> kernelClass; + private final KernelManager manager; + private volatile LinkedList<Device> preferredDevices = null; + private final LinkedHashSet<Device> failedDevices = new LinkedHashSet<>(); + + public KernelPreferences(KernelManager manager, Class<? extends Kernel> kernelClass) { + this.kernelClass = kernelClass; + this.manager = manager; + } + + /** What Kernel subclass is this the preferences for? */ + public Class<? 
extends Kernel> getKernelClass() { + return kernelClass; + } + + public List<Device> getPreferredDevices(Kernel kernel) { + maybeSetUpDefaultPreferredDevices(); + + if (kernel == null) { + return Collections.unmodifiableList(preferredDevices); + } + List<Device> localPreferredDevices = new ArrayList<>(); + ArrayList<Device> copy; + synchronized (preferredDevices) { + copy = new ArrayList<>(preferredDevices); + } + for (Device device : copy) { + if (kernel.isAllowDevice(device)) { + localPreferredDevices.add(device); + } + } + return Collections.unmodifiableList(localPreferredDevices); + } + + synchronized void setPreferredDevices(LinkedHashSet<Device> _preferredDevices) { + if (preferredDevices != null) { + preferredDevices.clear(); + preferredDevices.addAll(_preferredDevices); + } + else { + preferredDevices = new LinkedList<>(_preferredDevices); + } + failedDevices.clear(); + } + + public Device getPreferredDevice(Kernel kernel) { + List<Device> localPreferredDevices = getPreferredDevices(kernel); + return localPreferredDevices.isEmpty() ? null : localPreferredDevices.get(0); + } + + synchronized void markPreferredDeviceFailed() { + if (preferredDevices.size() > 0) { + failedDevices.add(preferredDevices.remove(0)); + } + } + + private void maybeSetUpDefaultPreferredDevices() { + if (preferredDevices == null) { + synchronized (this) { + if (preferredDevices == null) { + preferredDevices = new LinkedList<>(manager.getDefaultPreferences().getPreferredDevices(null)); + } + } + } + } + + public List<Device> getFailedDevices() { + return new ArrayList<>(failedDevices); + } +} diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelProfile.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelProfile.java new file mode 100644 index 0000000000000000000000000000000000000000..3d1caaa11906ae2fcccacbce59050b0d4b8c86c7 --- /dev/null +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelProfile.java @@ -0,0 +1,106 @@ +package com.amd.aparapi.internal.kernel; + +import com.amd.aparapi.*; +import com.amd.aparapi.device.*; + +import java.util.*; +import java.util.logging.*; + +/** + * Collects profiling information per kernel class per device. Not thread safe; client code must correctly synchronize on + * objects of this class. + */ +public class KernelProfile { + + private static final double MILLION = 1000000d; + private static Logger logger = Logger.getLogger(Config.getLoggerName()); + private final Class<? extends Kernel> kernelClass; + private LinkedHashMap<Device, KernelDeviceProfile> deviceProfiles = new LinkedHashMap<>(); + private Device currentDevice; + private Device lastDevice; + private KernelDeviceProfile currentDeviceProfile; + + public KernelProfile(Class<? extends Kernel> _kernelClass) { + kernelClass = _kernelClass; + } + + public double getLastExecutionTime() { + KernelDeviceProfile lastDeviceProfile = getLastDeviceProfile(); + return lastDeviceProfile == null ? Double.NaN : lastDeviceProfile.getLastElapsedTime(ProfilingEvent.START, ProfilingEvent.EXECUTED) / MILLION; + } + + public double getLastConversionTime() { + KernelDeviceProfile lastDeviceProfile = getLastDeviceProfile(); + return lastDeviceProfile == null ?
Double.NaN : lastDeviceProfile.getLastElapsedTime(ProfilingEvent.START, ProfilingEvent.EXECUTED) / MILLION; // NB: currently measures the same span as getLastExecutionTime + } + + public double getAccumulatedTotalTime() { + KernelDeviceProfile lastDeviceProfile = getLastDeviceProfile(); + if (lastDeviceProfile == null) { + return Double.NaN; + } + else { + return lastDeviceProfile.getCumulativeElapsedTimeAll() / MILLION; + } + } + + public KernelDeviceProfile getLastDeviceProfile() { + return deviceProfiles.get(currentDevice); + } + + void onStart(Device device) { + currentDevice = device; + synchronized (deviceProfiles) { + currentDeviceProfile = deviceProfiles.get(device); + if (currentDeviceProfile == null) { + currentDeviceProfile = new KernelDeviceProfile(kernelClass, device); + deviceProfiles.put(device, currentDeviceProfile); + } + } + currentDeviceProfile.onEvent(ProfilingEvent.START); + } + + void onEvent(ProfilingEvent event) { + switch (event) { + case CLASS_MODEL_BUILT: // fallthrough + case OPENCL_GENERATED: // fallthrough + case INIT_JNI: // fallthrough + case OPENCL_COMPILED: // fallthrough + case PREPARE_EXECUTE: // fallthrough + case EXECUTED: // fallthrough + { + if (currentDeviceProfile == null) { + logger.log(Level.SEVERE, "Error in KernelProfile, no currentDevice (synchronization error?)"); + break; // cannot record the event without a current device profile + } + currentDeviceProfile.onEvent(event); + break; + } + case START: + throw new IllegalArgumentException("must use onStart(Device) to start profiling"); + default: + throw new IllegalArgumentException("Unhandled event " + event); + } + } + + void onFinishedExecution() { + reset(); + } + + private void reset() { + lastDevice = currentDevice; + currentDevice = null; + currentDeviceProfile = null; + } + + public Collection<Device> getDevices() { + return deviceProfiles.keySet(); + } + + public Collection<KernelDeviceProfile> getDeviceProfiles() { + return deviceProfiles.values(); + } + + public KernelDeviceProfile getDeviceProfile(Device device) { + return deviceProfiles.get(device); + } +} diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java index c2b69e44f1cb3564fa00d82c632d861c5ae93986..f8af5bc2f85a596bae8693e30b6590e6c79651b0 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java @@ -37,45 +37,25 @@ under those regulations, please refer to the U.S.
Bureau of Industry and Securit */ package com.amd.aparapi.internal.kernel; -import com.amd.aparapi.Config; -import com.amd.aparapi.Kernel; +import com.amd.aparapi.*; import com.amd.aparapi.Kernel.Constant; -import com.amd.aparapi.Kernel.EXECUTION_MODE; -import com.amd.aparapi.Kernel.KernelState; -import com.amd.aparapi.Kernel.Local; -import com.amd.aparapi.ProfileInfo; -import com.amd.aparapi.Range; -import com.amd.aparapi.device.Device; -import com.amd.aparapi.device.OpenCLDevice; -import com.amd.aparapi.internal.annotation.UsedByJNICode; -import com.amd.aparapi.internal.exception.AparapiException; -import com.amd.aparapi.internal.exception.CodeGenException; -import com.amd.aparapi.internal.instruction.InstructionSet.TypeSpec; -import com.amd.aparapi.internal.jni.KernelRunnerJNI; -import com.amd.aparapi.internal.model.ClassModel; -import com.amd.aparapi.internal.model.Entrypoint; -import com.amd.aparapi.internal.util.UnsafeWrapper; -import com.amd.aparapi.internal.writer.KernelWriter; -import com.amd.aparapi.opencl.OpenCL; - -import java.lang.reflect.Array; -import java.lang.reflect.Field; -import java.lang.reflect.Modifier; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.IntBuffer; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import java.util.StringTokenizer; -import java.util.concurrent.BrokenBarrierException; -import java.util.concurrent.CyclicBarrier; -import java.util.concurrent.ForkJoinPool; -import java.util.concurrent.ForkJoinPool.ForkJoinWorkerThreadFactory; -import java.util.concurrent.ForkJoinPool.ManagedBlocker; -import java.util.concurrent.ForkJoinWorkerThread; -import java.util.logging.Level; -import java.util.logging.Logger; +import com.amd.aparapi.Kernel.*; +import com.amd.aparapi.device.*; +import com.amd.aparapi.internal.annotation.*; +import com.amd.aparapi.internal.exception.*; +import com.amd.aparapi.internal.instruction.InstructionSet.*; +import com.amd.aparapi.internal.jni.*; +import com.amd.aparapi.internal.model.*; +import com.amd.aparapi.internal.util.*; +import com.amd.aparapi.internal.writer.*; +import com.amd.aparapi.opencl.*; + +import java.lang.reflect.*; +import java.nio.*; +import java.util.*; +import java.util.concurrent.*; +import java.util.concurrent.ForkJoinPool.*; +import java.util.logging.*; /** * The class is responsible for executing <code>Kernel</code> implementations. <br/> @@ -95,12 +75,17 @@ import java.util.logging.Logger; */ public class KernelRunner extends KernelRunnerJNI{ + public static boolean BINARY_CACHING_DISABLED = false; + + private static final int MINIMUM_ARRAY_SIZE = 1; + /** @see #getCurrentPass() */ @UsedByJNICode public static final int PASS_ID_PREPARING_EXECUTION = -2; /** @see #getCurrentPass() */ @UsedByJNICode public static final int PASS_ID_COMPLETED_EXECUTION = -1; @UsedByJNICode public static final int CANCEL_STATUS_FALSE = 0; @UsedByJNICode public static final int CANCEL_STATUS_TRUE = 1; + private static final String CODE_GEN_ERROR_MARKER = CodeGenException.class.getName(); private static Logger logger = Logger.getLogger(Config.getLoggerName()); @@ -147,6 +132,8 @@ public class KernelRunner extends KernelRunnerJNI{ private static final ForkJoinPool threadPool = new ForkJoinPool(Runtime.getRuntime().availableProcessors(), lowPriorityThreadFactory, null, false); + private static HashMap<Class<? 
extends Kernel>, String> openCLCache = new HashMap<>(); + private static LinkedHashSet<String> seenBinaryKeys = new LinkedHashSet<>(); /** * Create a KernelRunner for a specific Kernel instance. @@ -164,6 +151,35 @@ public class KernelRunner extends KernelRunnerJNI{ inBufferRemoteInt = inBufferRemote.asIntBuffer(); outBufferRemoteInt = outBufferRemote.asIntBuffer(); + + KernelManager.instance(); // ensures static initialization of KernelManager + } + + /** + * @see Kernel#cleanUpArrays(). + */ + public void cleanUpArrays() { + if (args != null && kernel.isRunningCL()) { + for (KernelArg arg : args) { + if ((arg.getType() & KernelRunnerJNI.ARG_ARRAY) != 0) { + Field field = arg.getField(); + if (field != null && field.getType().isArray() && !Modifier.isFinal(field.getModifiers())) { + field.setAccessible(true); + Class<?> componentType = field.getType().getComponentType(); + Object newValue = Array.newInstance(componentType, MINIMUM_ARRAY_SIZE); + try { + field.set(kernel, newValue); + } + catch (IllegalAccessException e) { + throw new RuntimeException(e); + } + } + } + } + kernel.execute(0); + } else if (kernel.isRunningCL()) { + logger.log(Level.SEVERE, "KernelRunner#cleanUpArrays() could not execute as no args available (Kernel has not been executed?)"); + } } /** @@ -171,8 +187,8 @@ * * @see KernelRunnerJNI#disposeJNI(long) */ - public void dispose() { - if (kernel.getExecutionMode().isOpenCL()) { + public synchronized void dispose() { + if (kernel.isRunningCL()) { disposeJNI(jniContextHandle); } // We are using a shared pool, so there's no need to shut it down when the kernel is disposed @@ -181,12 +197,6 @@ private Set<String> capabilitiesSet; - private long accumulatedExecutionTime = 0; - - private long conversionTime = 0; - - private long executionTime = 0; - boolean hasFP64Support() { if (capabilitiesSet == null) { throw new IllegalStateException("Capabilities queried before they were initialized"); @@ -316,312 +326,334 @@ } /** - * Execute using a Java thread pool. Either because we were explicitly asked to do so, or because we 'fall back' after discovering an OpenCL issue. - * - * @param _range - * The globalSize requested by the user (via <code>Kernel.execute(globalSize)</code>) - * @param _passes - * The # of passes requested by the user (via <code>Kernel.execute(globalSize, passes)</code>). Note this is usually defaulted to 1 via <code>Kernel.execute(globalSize)</code>.
- * @return + * Execute using a Java thread pool, or sequentially, or using an alternative algorithm, usually as a result of failing to compile or execute OpenCL */ - protected long executeJava(final Range _range, final int _passes) { + @SuppressWarnings("deprecation") + protected void executeJava(ExecutionSettings _settings, Device device) { if (logger.isLoggable(Level.FINE)) { - logger.fine("executeJava: range = " + _range); + logger.fine("executeJava: range = " + _settings.range + ", device = " + device); } + boolean legacySequentialMode = kernel.getExecutionMode().equals(Kernel.EXECUTION_MODE.SEQ); passId = PASS_ID_PREPARING_EXECUTION; + _settings.profile.onEvent(ProfilingEvent.PREPARE_EXECUTE); + try { - final int localSize0 = _range.getLocalSize(0); - final int localSize1 = _range.getLocalSize(1); - final int localSize2 = _range.getLocalSize(2); - final int globalSize1 = _range.getGlobalSize(1); - if (kernel.getExecutionMode().equals(EXECUTION_MODE.SEQ)) { - /** - * SEQ mode is useful for testing trivial logic, but kernels which use SEQ mode cannot be used if the - * product of localSize(0..3) is >1. So we can use multi-dim ranges but only if the local size is 1 in all dimensions. - * - * As a result of this barrier is only ever 1 work item wide and probably should be turned into a no-op. - * - * So we need to check if the range is valid here. If not we have no choice but to punt. - */ - if ((localSize0 * localSize1 * localSize2) > 1) { - throw new IllegalStateException("Can't run range with group size >1 sequentially. Barriers would deadlock!"); + if (device == JavaDevice.ALTERNATIVE_ALGORITHM) { + if (kernel.hasFallbackAlgorithm()) { + for (passId = 0; passId < _settings.passes; ++passId) { + kernel.executeFallbackAlgorithm(_settings.range, passId); + } + } else { + boolean silently = true; // not having an alternative algorithm is the normal state, and does not need reporting + fallBackToNextDevice(_settings, (Exception) null, silently); } - - final Kernel kernelClone = kernel.clone(); - final KernelState kernelState = kernelClone.getKernelState(); - - kernelState.setRange(_range); - kernelState.setGroupId(0, 0); - kernelState.setGroupId(1, 0); - kernelState.setGroupId(2, 0); - kernelState.setLocalId(0, 0); - kernelState.setLocalId(1, 0); - kernelState.setLocalId(2, 0); - kernelState.setLocalBarrier(new FJSafeCyclicBarrier(1)); - - for (passId = 0; passId < _passes; passId++) { - if (getCancelState() == CANCEL_STATUS_TRUE) { - break; + } else { + final int localSize0 = _settings.range.getLocalSize(0); + final int localSize1 = _settings.range.getLocalSize(1); + final int localSize2 = _settings.range.getLocalSize(2); + final int globalSize1 = _settings.range.getGlobalSize(1); + if (legacySequentialMode || device == JavaDevice.SEQUENTIAL) { + /** + * SEQ mode is useful for testing trivial logic, but kernels which use SEQ mode cannot be used if the + * product of localSize(0..3) is >1. So we can use multi-dim ranges but only if the local size is 1 in all dimensions. + * + * As a result of this barrier is only ever 1 work item wide and probably should be turned into a no-op. + * + * So we need to check if the range is valid here. If not we have no choice but to punt. + */ + if ((localSize0 * localSize1 * localSize2) > 1) { + throw new IllegalStateException("Can't run range with group size >1 sequentially. 
Barriers would deadlock!"); } - kernelState.setPassId(passId); - if (_range.getDims() == 1) { - for (int id = 0; id < _range.getGlobalSize(0); id++) { - kernelState.setGlobalId(0, id); - kernelClone.run(); + final Kernel kernelClone = kernel.clone(); + final KernelState kernelState = kernelClone.getKernelState(); + + kernelState.setRange(_settings.range); + kernelState.setGroupId(0, 0); + kernelState.setGroupId(1, 0); + kernelState.setGroupId(2, 0); + kernelState.setLocalId(0, 0); + kernelState.setLocalId(1, 0); + kernelState.setLocalId(2, 0); + kernelState.setLocalBarrier(new FJSafeCyclicBarrier(1)); + + for (passId = 0; passId < _settings.passes; passId++) { + if (getCancelState() == CANCEL_STATUS_TRUE) { + break; } - } else if (_range.getDims() == 2) { - for (int x = 0; x < _range.getGlobalSize(0); x++) { - kernelState.setGlobalId(0, x); + kernelState.setPassId(passId); - for (int y = 0; y < globalSize1; y++) { - kernelState.setGlobalId(1, y); + if (_settings.range.getDims() == 1) { + for (int id = 0; id < _settings.range.getGlobalSize(0); id++) { + kernelState.setGlobalId(0, id); kernelClone.run(); } } - } else if (_range.getDims() == 3) { - for (int x = 0; x < _range.getGlobalSize(0); x++) { - kernelState.setGlobalId(0, x); + else if (_settings.range.getDims() == 2) { + for (int x = 0; x < _settings.range.getGlobalSize(0); x++) { + kernelState.setGlobalId(0, x); - for (int y = 0; y < globalSize1; y++) { - kernelState.setGlobalId(1, y); - - for (int z = 0; z < _range.getGlobalSize(2); z++) { - kernelState.setGlobalId(2, z); + for (int y = 0; y < globalSize1; y++) { + kernelState.setGlobalId(1, y); kernelClone.run(); } + } + } + else if (_settings.range.getDims() == 3) { + for (int x = 0; x < _settings.range.getGlobalSize(0); x++) { + kernelState.setGlobalId(0, x); - kernelClone.run(); + for (int y = 0; y < globalSize1; y++) { + kernelState.setGlobalId(1, y); + + for (int z = 0; z < _settings.range.getGlobalSize(2); z++) { + kernelState.setGlobalId(2, z); + kernelClone.run(); + } + + kernelClone.run(); + } } } } + passId = PASS_ID_COMPLETED_EXECUTION; } - passId = PASS_ID_COMPLETED_EXECUTION; - } else { - final int threads = localSize0 * localSize1 * localSize2; - final int numGroups0 = _range.getNumGroups(0); - final int numGroups1 = _range.getNumGroups(1); - final int globalGroups = numGroups0 * numGroups1 * _range.getNumGroups(2); - /** - * This joinBarrier is the barrier that we provide for the kernel threads to rendezvous with the current dispatch thread. - * So this barrier is threadCount+1 wide (the +1 is for the dispatch thread) - */ - final CyclicBarrier joinBarrier = new FJSafeCyclicBarrier(threads + 1); - - /** - * This localBarrier is only ever used by the kernels. If the kernel does not use the barrier the threads - * can get out of sync, we promised nothing in JTP mode. - * - * As with OpenCL all threads within a group must wait at the barrier or none. It is a user error (possible deadlock!) - * if the barrier is in a conditional that is only executed by some of the threads within a group. - * - * Kernel developer must understand this. - * - * This barrier is threadCount wide. We never hit the barrier from the dispatch thread. 
- */ - final CyclicBarrier localBarrier = new FJSafeCyclicBarrier(threads); - - final ThreadIdSetter threadIdSetter; - - if (_range.getDims() == 1) { - threadIdSetter = new ThreadIdSetter(){ - @Override public void set(KernelState kernelState, int globalGroupId, int threadId) { - // (kernelState, globalGroupId, threadId) ->{ - kernelState.setLocalId(0, (threadId % localSize0)); - kernelState.setGlobalId(0, (threadId + (globalGroupId * threads))); - kernelState.setGroupId(0, globalGroupId); - } - }; - } else if (_range.getDims() == 2) { + else { + if (device != JavaDevice.THREAD_POOL && kernel.getExecutionMode() != Kernel.EXECUTION_MODE.JTP) { + throw new AssertionError("unexpected JavaDevice or EXECUTION_MODE"); + } + final int threads = localSize0 * localSize1 * localSize2; + final int numGroups0 = _settings.range.getNumGroups(0); + final int numGroups1 = _settings.range.getNumGroups(1); + final int globalGroups = numGroups0 * numGroups1 * _settings.range.getNumGroups(2); + /** + * This joinBarrier is the barrier that we provide for the kernel threads to rendezvous with the current dispatch thread. + * So this barrier is threadCount+1 wide (the +1 is for the dispatch thread) + */ + final CyclicBarrier joinBarrier = new FJSafeCyclicBarrier(threads + 1); /** - * Consider a 12x4 grid of 4*2 local groups - * <pre> - * threads = 4*2 = 8 - * localWidth=4 - * localHeight=2 - * globalWidth=12 - * globalHeight=4 - * - * 00 01 02 03 | 04 05 06 07 | 08 09 10 11 - * 12 13 14 15 | 16 17 18 19 | 20 21 22 23 - * ------------+-------------+------------ - * 24 25 26 27 | 28 29 30 31 | 32 33 34 35 - * 36 37 38 39 | 40 41 42 43 | 44 45 46 47 - * - * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 threadIds : [0..7]*6 - * 04 05 06 07 | 04 05 06 07 | 04 05 06 07 - * ------------+-------------+------------ - * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 - * 04 05 06 07 | 04 05 06 07 | 04 05 06 07 - * - * 00 00 00 00 | 01 01 01 01 | 02 02 02 02 groupId[0] : 0..6 - * 00 00 00 00 | 01 01 01 01 | 02 02 02 02 - * ------------+-------------+------------ - * 00 00 00 00 | 01 01 01 01 | 02 02 02 02 - * 00 00 00 00 | 01 01 01 01 | 02 02 02 02 - * - * 00 00 00 00 | 00 00 00 00 | 00 00 00 00 groupId[1] : 0..6 - * 00 00 00 00 | 00 00 00 00 | 00 00 00 00 - * ------------+-------------+------------ - * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 - * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 - * - * 00 01 02 03 | 08 09 10 11 | 16 17 18 19 globalThreadIds == threadId + groupId * threads; - * 04 05 06 07 | 12 13 14 15 | 20 21 22 23 - * ------------+-------------+------------ - * 24 25 26 27 | 32[33]34 35 | 40 41 42 43 - * 28 29 30 31 | 36 37 38 39 | 44 45 46 47 - * - * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 localX = threadId % localWidth; (for globalThreadId 33 = threadId = 01 : 01%4 =1) - * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 - * ------------+-------------+------------ - * 00 01 02 03 | 00[01]02 03 | 00 01 02 03 - * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 - * - * 00 00 00 00 | 00 00 00 00 | 00 00 00 00 localY = threadId /localWidth (for globalThreadId 33 = threadId = 01 : 01/4 =0) - * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 - * ------------+-------------+------------ - * 00 00 00 00 | 00[00]00 00 | 00 00 00 00 - * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 - * - * 00 01 02 03 | 04 05 06 07 | 08 09 10 11 globalX= - * 00 01 02 03 | 04 05 06 07 | 08 09 10 11 groupsPerLineWidth=globalWidth/localWidth (=12/4 =3) - * ------------+-------------+------------ groupInset =groupId%groupsPerLineWidth (=4%3 = 1) - * 00 01 02 03 | 04[05]06 07 | 08 09 10 
11 - * 00 01 02 03 | 04 05 06 07 | 08 09 10 11 globalX = groupInset*localWidth+localX (= 1*4+1 = 5) + * This localBarrier is only ever used by the kernels. If the kernel does not use the barrier the threads + * can get out of sync, we promised nothing in JTP mode. * - * 00 00 00 00 | 00 00 00 00 | 00 00 00 00 globalY - * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 - * ------------+-------------+------------ - * 02 02 02 02 | 02[02]02 02 | 02 02 02 02 - * 03 03 03 03 | 03 03 03 03 | 03 03 03 03 + * As with OpenCL all threads within a group must wait at the barrier or none. It is a user error (possible deadlock!) + * if the barrier is in a conditional that is only executed by some of the threads within a group. * - * </pre> - * Assume we are trying to locate the id's for #33 + * Kernel developer must understand this. * + * This barrier is threadCount wide. We never hit the barrier from the dispatch thread. */ - threadIdSetter = new ThreadIdSetter(){ - @Override public void set(KernelState kernelState, int globalGroupId, int threadId) { - // (kernelState, globalGroupId, threadId) ->{ - kernelState.setLocalId(0, (threadId % localSize0)); // threadId % localWidth = (for 33 = 1 % 4 = 1) - kernelState.setLocalId(1, (threadId / localSize0)); // threadId / localWidth = (for 33 = 1 / 4 == 0) - - final int groupInset = globalGroupId % numGroups0; // 4%3 = 1 - kernelState.setGlobalId(0, ((groupInset * localSize0) + kernelState.getLocalIds()[0])); // 1*4+1=5 - - final int completeLines = (globalGroupId / numGroups0) * localSize1;// (4/3) * 2 - kernelState.setGlobalId(1, (completeLines + kernelState.getLocalIds()[1])); // 2+0 = 2 - kernelState.setGroupId(0, (globalGroupId % numGroups0)); - kernelState.setGroupId(1, (globalGroupId / numGroups0)); - } - }; - } else if (_range.getDims() == 3) { - //Same as 2D actually turns out that localId[0] is identical for all three dims so could be hoisted out of conditional code - threadIdSetter = new ThreadIdSetter(){ - @Override public void set(KernelState kernelState, int globalGroupId, int threadId) { - // (kernelState, globalGroupId, threadId) ->{ - kernelState.setLocalId(0, (threadId % localSize0)); + final CyclicBarrier localBarrier = new FJSafeCyclicBarrier(threads); + + final ThreadIdSetter threadIdSetter; + + if (_settings.range.getDims() == 1) { + threadIdSetter = new ThreadIdSetter() { + @Override + public void set(KernelState kernelState, int globalGroupId, int threadId) { + // (kernelState, globalGroupId, threadId) ->{ + kernelState.setLocalId(0, (threadId % localSize0)); + kernelState.setGlobalId(0, (threadId + (globalGroupId * threads))); + kernelState.setGroupId(0, globalGroupId); + } + }; + } + else if (_settings.range.getDims() == 2) { - kernelState.setLocalId(1, ((threadId / localSize0) % localSize1)); + /** + * Consider a 12x4 grid of 4*2 local groups + * <pre> + * threads = 4*2 = 8 + * localWidth=4 + * localHeight=2 + * globalWidth=12 + * globalHeight=4 + * + * 00 01 02 03 | 04 05 06 07 | 08 09 10 11 + * 12 13 14 15 | 16 17 18 19 | 20 21 22 23 + * ------------+-------------+------------ + * 24 25 26 27 | 28 29 30 31 | 32 33 34 35 + * 36 37 38 39 | 40 41 42 43 | 44 45 46 47 + * + * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 threadIds : [0..7]*6 + * 04 05 06 07 | 04 05 06 07 | 04 05 06 07 + * ------------+-------------+------------ + * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 + * 04 05 06 07 | 04 05 06 07 | 04 05 06 07 + * + * 00 00 00 00 | 01 01 01 01 | 02 02 02 02 groupId[0] : 0..6 + * 00 00 00 00 | 01 01 01 01 | 02 02 02 02 + * 
------------+-------------+------------ + * 00 00 00 00 | 01 01 01 01 | 02 02 02 02 + * 00 00 00 00 | 01 01 01 01 | 02 02 02 02 + * + * 00 00 00 00 | 00 00 00 00 | 00 00 00 00 groupId[1] : 0..6 + * 00 00 00 00 | 00 00 00 00 | 00 00 00 00 + * ------------+-------------+------------ + * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 + * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 + * + * 00 01 02 03 | 08 09 10 11 | 16 17 18 19 globalThreadIds == threadId + groupId * threads; + * 04 05 06 07 | 12 13 14 15 | 20 21 22 23 + * ------------+-------------+------------ + * 24 25 26 27 | 32[33]34 35 | 40 41 42 43 + * 28 29 30 31 | 36 37 38 39 | 44 45 46 47 + * + * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 localX = threadId % localWidth; (for globalThreadId 33 = threadId = 01 : 01%4 =1) + * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 + * ------------+-------------+------------ + * 00 01 02 03 | 00[01]02 03 | 00 01 02 03 + * 00 01 02 03 | 00 01 02 03 | 00 01 02 03 + * + * 00 00 00 00 | 00 00 00 00 | 00 00 00 00 localY = threadId /localWidth (for globalThreadId 33 = threadId = 01 : 01/4 =0) + * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 + * ------------+-------------+------------ + * 00 00 00 00 | 00[00]00 00 | 00 00 00 00 + * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 + * + * 00 01 02 03 | 04 05 06 07 | 08 09 10 11 globalX= + * 00 01 02 03 | 04 05 06 07 | 08 09 10 11 groupsPerLineWidth=globalWidth/localWidth (=12/4 =3) + * ------------+-------------+------------ groupInset =groupId%groupsPerLineWidth (=4%3 = 1) + * 00 01 02 03 | 04[05]06 07 | 08 09 10 11 + * 00 01 02 03 | 04 05 06 07 | 08 09 10 11 globalX = groupInset*localWidth+localX (= 1*4+1 = 5) + * + * 00 00 00 00 | 00 00 00 00 | 00 00 00 00 globalY + * 01 01 01 01 | 01 01 01 01 | 01 01 01 01 + * ------------+-------------+------------ + * 02 02 02 02 | 02[02]02 02 | 02 02 02 02 + * 03 03 03 03 | 03 03 03 03 | 03 03 03 03 + * + * </pre> + * Assume we are trying to locate the id's for #33 + * + */ + threadIdSetter = new ThreadIdSetter() { + @Override + public void set(KernelState kernelState, int globalGroupId, int threadId) { + // (kernelState, globalGroupId, threadId) ->{ + kernelState.setLocalId(0, (threadId % localSize0)); // threadId % localWidth = (for 33 = 1 % 4 = 1) + kernelState.setLocalId(1, (threadId / localSize0)); // threadId / localWidth = (for 33 = 1 / 4 == 0) + + final int groupInset = globalGroupId % numGroups0; // 4%3 = 1 + kernelState.setGlobalId(0, ((groupInset * localSize0) + kernelState.getLocalIds()[0])); // 1*4+1=5 + + final int completeLines = (globalGroupId / numGroups0) * localSize1;// (4/3) * 2 + kernelState.setGlobalId(1, (completeLines + kernelState.getLocalIds()[1])); // 2+0 = 2 + kernelState.setGroupId(0, (globalGroupId % numGroups0)); + kernelState.setGroupId(1, (globalGroupId / numGroups0)); + } + }; + } + else if (_settings.range.getDims() == 3) { + //Same as 2D actually turns out that localId[0] is identical for all three dims so could be hoisted out of conditional code + threadIdSetter = new ThreadIdSetter() { + @Override + public void set(KernelState kernelState, int globalGroupId, int threadId) { + // (kernelState, globalGroupId, threadId) ->{ + kernelState.setLocalId(0, (threadId % localSize0)); - // the thread id's span WxHxD so threadId/(WxH) should yield the local depth - kernelState.setLocalId(2, (threadId / (localSize0 * localSize1))); + kernelState.setLocalId(1, ((threadId / localSize0) % localSize1)); - kernelState.setGlobalId(0, (((globalGroupId % numGroups0) * localSize0) + kernelState.getLocalIds()[0])); + // the thread 
id's span WxHxD so threadId/(WxH) should yield the local depth + kernelState.setLocalId(2, (threadId / (localSize0 * localSize1))); - kernelState.setGlobalId(1, - ((((globalGroupId / numGroups0) * localSize1) % globalSize1) + kernelState.getLocalIds()[1])); + kernelState.setGlobalId(0, (((globalGroupId % numGroups0) * localSize0) + kernelState.getLocalIds()[0])); - kernelState.setGlobalId(2, - (((globalGroupId / (numGroups0 * numGroups1)) * localSize2) + kernelState.getLocalIds()[2])); + kernelState.setGlobalId(1, + ((((globalGroupId / numGroups0) * localSize1) % globalSize1) + kernelState.getLocalIds()[1])); - kernelState.setGroupId(0, (globalGroupId % numGroups0)); - kernelState.setGroupId(1, ((globalGroupId / numGroups0) % numGroups1)); - kernelState.setGroupId(2, (globalGroupId / (numGroups0 * numGroups1))); - } - }; - } else - throw new IllegalArgumentException("Expected 1,2 or 3 dimensions, found " + _range.getDims()); - for (passId = 0; passId < _passes; passId++) { - if (getCancelState() == CANCEL_STATUS_TRUE) { - break; - } - /** - * Note that we emulate OpenCL by creating one thread per localId (across the group). - * - * So threadCount == range.getLocalSize(0)*range.getLocalSize(1)*range.getLocalSize(2); - * - * For a 1D range of 12 groups of 4 we create 4 threads. One per localId(0). - * - * We also clone the kernel 4 times. One per thread. - * - * We create local barrier which has a width of 4 - * - * Thread-0 handles localId(0) (global 0,4,8) - * Thread-1 handles localId(1) (global 1,5,7) - * Thread-2 handles localId(2) (global 2,6,10) - * Thread-3 handles localId(3) (global 3,7,11) - * - * This allows all threads to synchronize using the local barrier. - * - * Initially the use of local buffers seems broken as the buffers appears to be per Kernel. - * Thankfully Kernel.clone() performs a shallow clone of all buffers (local and global) - * So each of the cloned kernels actually still reference the same underlying local/global buffers. - * - * If the kernel uses local buffers but does not use barriers then it is possible for different groups - * to see mutations from each other (unlike OpenCL), however if the kernel does not us barriers then it - * cannot assume any coherence in OpenCL mode either (the failure mode will be different but still wrong) - * - * So even JTP mode use of local buffers will need to use barriers. Not for the same reason as OpenCL but to keep groups in lockstep. - * - **/ - for (int id = 0; id < threads; id++) { - final int threadId = id; + kernelState.setGlobalId(2, + (((globalGroupId / (numGroups0 * numGroups1)) * localSize2) + kernelState.getLocalIds()[2])); + kernelState.setGroupId(0, (globalGroupId % numGroups0)); + kernelState.setGroupId(1, ((globalGroupId / numGroups0) % numGroups1)); + kernelState.setGroupId(2, (globalGroupId / (numGroups0 * numGroups1))); + } + }; + } + else + throw new IllegalArgumentException("Expected 1,2 or 3 dimensions, found " + _settings.range.getDims()); + for (passId = 0; passId < _settings.passes; passId++) { + if (getCancelState() == CANCEL_STATUS_TRUE) { + break; + } /** - * We clone one kernel for each thread. + * Note that we emulate OpenCL by creating one thread per localId (across the group). * - * They will all share references to the same range, localBarrier and global/local buffers because the clone is shallow. - * We need clones so that each thread can assign 'state' (localId/globalId/groupId) without worrying - * about other threads. 
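As a quick sanity check of the group/thread id mapping illustrated in the 12x4 diagram above, the arithmetic can be replayed by hand for global thread #33 with the diagram's own sizes (throwaway code, not part of the patch):

    int threads = 8, localSize0 = 4, localSize1 = 2, numGroups0 = 3; // 12x4 grid of 4x2 groups
    int globalThreadId = 33;                                         // the bracketed cell in the diagram
    int globalGroupId = globalThreadId / threads;                    // 4
    int threadId = globalThreadId % threads;                         // 1
    int localX = threadId % localSize0;                              // 1 % 4 = 1
    int localY = threadId / localSize0;                              // 1 / 4 = 0
    int globalX = (globalGroupId % numGroups0) * localSize0 + localX; // 1*4 + 1 = 5
    int globalY = (globalGroupId / numGroups0) * localSize1 + localY; // 1*2 + 0 = 2
    // (5, 2) matches the globalX/globalY panels of the diagram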
- */ - final Kernel kernelClone = kernel.clone(); - final KernelState kernelState = kernelClone.getKernelState(); - kernelState.setRange(_range); - kernelState.setPassId(passId); - - if (threads == 1) { - kernelState.disableLocalBarrier(); - } else { - kernelState.setLocalBarrier(localBarrier); - } + * So threadCount == range.getLocalSize(0)*range.getLocalSize(1)*range.getLocalSize(2); + * + * For a 1D range of 12 groups of 4 we create 4 threads. One per localId(0). + * + * We also clone the kernel 4 times. One per thread. + * + * We create local barrier which has a width of 4 + * + * Thread-0 handles localId(0) (global 0,4,8) + * Thread-1 handles localId(1) (global 1,5,7) + * Thread-2 handles localId(2) (global 2,6,10) + * Thread-3 handles localId(3) (global 3,7,11) + * + * This allows all threads to synchronize using the local barrier. + * + * Initially the use of local buffers seems broken as the buffers appears to be per Kernel. + * Thankfully Kernel.clone() performs a shallow clone of all buffers (local and global) + * So each of the cloned kernels actually still reference the same underlying local/global buffers. + * + * If the kernel uses local buffers but does not use barriers then it is possible for different groups + * to see mutations from each other (unlike OpenCL), however if the kernel does not us barriers then it + * cannot assume any coherence in OpenCL mode either (the failure mode will be different but still wrong) + * + * So even JTP mode use of local buffers will need to use barriers. Not for the same reason as OpenCL but to keep groups in lockstep. + * + **/ + for (int id = 0; id < threads; id++) { + final int threadId = id; + + /** + * We clone one kernel for each thread. + * + * They will all share references to the same range, localBarrier and global/local buffers because the clone is shallow. + * We need clones so that each thread can assign 'state' (localId/globalId/groupId) without worrying + * about other threads. + */ + final Kernel kernelClone = kernel.clone(); + final KernelState kernelState = kernelClone.getKernelState(); + kernelState.setRange(_settings.range); + kernelState.setPassId(passId); + + if (threads == 1) { + kernelState.disableLocalBarrier(); + } + else { + kernelState.setLocalBarrier(localBarrier); + } - threadPool.submit( - // () -> { - new Runnable(){ - public void run() { - try { - for (int globalGroupId = 0; globalGroupId < globalGroups; globalGroupId++) { - threadIdSetter.set(kernelState, globalGroupId, threadId); - kernelClone.run(); - } - } catch (RuntimeException | Error e) { - logger.log(Level.SEVERE, "Execution failed", e); - } finally { - await(joinBarrier); // This thread will rendezvous with dispatch thread here. This is effectively a join. + threadPool.submit( + // () -> { + new Runnable() { + public void run() { + try { + for (int globalGroupId = 0; globalGroupId < globalGroups; globalGroupId++) { + threadIdSetter.set(kernelState, globalGroupId, threadId); + kernelClone.run(); } } - }); - } - - await(joinBarrier); // This dispatch thread waits for all worker threads here. - } - passId = PASS_ID_COMPLETED_EXECUTION; - } // execution mode == JTP + catch (RuntimeException | Error e) { + logger.log(Level.SEVERE, "Execution failed", e); + } + finally { + await(joinBarrier); // This thread will rendezvous with dispatch thread here. This is effectively a join. + } + } + }); + } - return 0; + await(joinBarrier); // This dispatch thread waits for all worker threads here. 
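Stripped of the kernel machinery, the threads+1 rendezvous used above reduces to this standalone sketch (all names invented); the dispatch thread's await() is effectively a join on the workers:

    import java.util.concurrent.CyclicBarrier;

    public class JoinBarrierSketch {
       public static void main(String[] args) throws Exception {
          final int threads = 4;
          final CyclicBarrier joinBarrier = new CyclicBarrier(threads + 1); // +1 for the dispatch thread
          for (int t = 0; t < threads; t++) {
             final int id = t;
             new Thread(new Runnable() {
                @Override public void run() {
                   System.out.println("worker " + id + " done");
                   try {
                      joinBarrier.await(); // rendezvous with the dispatch thread
                   } catch (Exception ignored) {
                   }
                }
             }).start();
          }
          joinBarrier.await(); // the dispatch thread blocks here until every worker arrives
          System.out.println("all workers joined");
       }
    }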
+ } + passId = PASS_ID_COMPLETED_EXECUTION; + } // execution mode == JTP + } } finally { passId = PASS_ID_COMPLETED_EXECUTION; } @@ -964,63 +996,22 @@ public class KernelRunner extends KernelRunnerJNI{ return needsSync; } - // private int numAvailableProcessors = Runtime.getRuntime().availableProcessors(); + @SuppressWarnings("deprecation") + private Kernel executeOpenCL(ExecutionSettings _settings) throws AparapiException { - private Kernel executeOpenCL(final String _entrypointName, final Range _range, final int _passes) throws AparapiException { - /* - if (_range.getDims() > getMaxWorkItemDimensionsJNI(jniContextHandle)) { - throw new RangeException("Range dim size " + _range.getDims() + " > device " - + getMaxWorkItemDimensionsJNI(jniContextHandle)); - } - if (_range.getWorkGroupSize() > getMaxWorkGroupSizeJNI(jniContextHandle)) { - throw new RangeException("Range workgroup size " + _range.getWorkGroupSize() + " > device " - + getMaxWorkGroupSizeJNI(jniContextHandle)); - } - - if (_range.getGlobalSize(0) > getMaxWorkItemSizeJNI(jniContextHandle, 0)) { - throw new RangeException("Range globalsize 0 " + _range.getGlobalSize(0) + " > device " - + getMaxWorkItemSizeJNI(jniContextHandle, 0)); - } - if (_range.getDims() > 1) { - if (_range.getGlobalSize(1) > getMaxWorkItemSizeJNI(jniContextHandle, 1)) { - throw new RangeException("Range globalsize 1 " + _range.getGlobalSize(1) + " > device " - + getMaxWorkItemSizeJNI(jniContextHandle, 1)); - } - if (_range.getDims() > 2) { - if (_range.getGlobalSize(2) > getMaxWorkItemSizeJNI(jniContextHandle, 2)) { - throw new RangeException("Range globalsize 2 " + _range.getGlobalSize(2) + " > device " - + getMaxWorkItemSizeJNI(jniContextHandle, 2)); - } - } - } - - - if (logger.isLoggable(Level.FINE)) { - logger.fine("maxComputeUnits=" + this.getMaxComputeUnitsJNI(jniContextHandle)); - logger.fine("maxWorkGroupSize=" + this.getMaxWorkGroupSizeJNI(jniContextHandle)); - logger.fine("maxWorkItemDimensions=" + this.getMaxWorkItemDimensionsJNI(jniContextHandle)); - logger.fine("maxWorkItemSize(0)=" + getMaxWorkItemSizeJNI(jniContextHandle, 0)); - if (_range.getDims() > 1) { - logger.fine("maxWorkItemSize(1)=" + getMaxWorkItemSizeJNI(jniContextHandle, 1)); - if (_range.getDims() > 2) { - logger.fine("maxWorkItemSize(2)=" + getMaxWorkItemSizeJNI(jniContextHandle, 2)); - } - } - } - */ // Read the array refs after kernel may have changed them // We need to do this as input to computing the localSize assert args != null : "args should not be null"; final boolean needSync = updateKernelArrayRefs(); if (needSync && logger.isLoggable(Level.FINE)) { - logger.fine("Need to resync arrays on " + describeKernelClass()); + logger.fine("Need to resync arrays on " + kernel); } // native side will reallocate array buffers if necessary - if (runKernelJNI(jniContextHandle, _range, needSync, _passes, inBufferRemote, outBufferRemote) != 0) { - logger.warning("### " + describeKernelClass() + " - CL exec seems to have failed. 
Trying to revert to Java ###"); - kernel.setFallbackExecutionMode(); - return execute(_entrypointName, _range, _passes); + int returnValue = runKernelJNI(jniContextHandle, _settings.range, needSync, _settings.passes, inBufferRemote, outBufferRemote); + if (returnValue != 0) { + String reason = "OpenCL execution seems to have failed (runKernelJNI returned " + returnValue + ")"; + return fallBackToNextDevice(_settings, new AparapiException(reason)); } if (usesOopConversion == true) { @@ -1028,343 +1019,523 @@ public class KernelRunner extends KernelRunnerJNI{ } if (logger.isLoggable(Level.FINE)) { - logger.fine("executeOpenCL completed. " + _range); + logger.fine("executeOpenCL completed. " + _settings.range); } return kernel; } - public synchronized Kernel execute(Kernel.Entry entry, final Range _range, final int _passes) { - System.out.println("execute(Kernel.Entry, size) not implemented"); - return (kernel); - } - - synchronized private Kernel fallBackAndExecute(String _entrypointName, final Range _range, final int _passes) { + @SuppressWarnings("deprecation") + synchronized private Kernel fallBackByExecutionMode(ExecutionSettings _settings) { isFallBack = true; if (kernel.hasNextExecutionMode()) { kernel.tryNextExecutionMode(); + if (logger.isLoggable(Level.WARNING)) { + logger.warning("Trying next execution mode " + kernel.getExecutionMode()); + } } else { kernel.setFallbackExecutionMode(); } - - return execute(_entrypointName, _range, _passes); + recreateRange(_settings); + return executeInternalInner(_settings); } - synchronized private Kernel warnFallBackAndExecute(String _entrypointName, final Range _range, final int _passes, - Exception _exception) { - if (logger.isLoggable(Level.WARNING)) { - logger.warning("Reverting to the next execution mode for " + describeKernelClass() + ": " + _exception.getMessage()); - _exception.printStackTrace(); + private void recreateRange(ExecutionSettings _settings) { + if (_settings.range.isLocalIsDerived() && !_settings.legacyExecutionMode) { + Device device = kernel.getTargetDevice(); + Range result; + switch (_settings.range.getDims()) { + case 1: { + result = Range.create(device, _settings.range.getGlobalSize_0()); + break; + } + case 2: { + result = Range.create2D(device, _settings.range.getGlobalSize_0(), _settings.range.getGlobalSize_1()); + break; + } + case 3: { + result = Range.create3D(device, _settings.range.getGlobalSize_0(), _settings.range.getGlobalSize_1(), _settings.range.getGlobalSize_2()); + break; + } + default: { + throw new AssertionError("Range.getDims() = " + _settings.range.getDims()); + } + } + _settings.range = result; } - return fallBackAndExecute(_entrypointName, _range, _passes); } - private String describeKernelClass() { - return kernel.getClass().getName(); + private Kernel fallBackToNextDevice(ExecutionSettings _settings, String _reason) { + return fallBackToNextDevice(_settings, new AparapiException(_reason)); } - synchronized private Kernel warnFallBackAndExecute(String _entrypointName, final Range _range, final int _passes, String _excuse) { - logger.warning("Reverting to the next execution mode for " + describeKernelClass() + ": " + _excuse); - return fallBackAndExecute(_entrypointName, _range, _passes); + @SuppressWarnings("deprecation") + synchronized private Kernel fallBackToNextDevice(ExecutionSettings _settings, Exception _exception) { + return fallBackToNextDevice(_settings, _exception, false); } - public synchronized Kernel execute(String _entrypointName, final Range _range, final int _passes) { - 
clearCancelMultiPass(); + @SuppressWarnings("deprecation") + synchronized private Kernel fallBackToNextDevice(ExecutionSettings _settings, Exception _exception, boolean _silently) { + isFallBack = true; + _settings.profile.onEvent(ProfilingEvent.EXECUTED); + if (_settings.legacyExecutionMode) { + if (!_silently && logger.isLoggable(Level.WARNING)) { + logger.warning("Execution mode " + kernel.getExecutionMode() + " failed for " + kernel + ": " + _exception.getMessage()); + _exception.printStackTrace(); + } + return fallBackByExecutionMode(_settings); + } else { + KernelPreferences preferences = KernelManager.instance().getPreferences(kernel); + if (!_silently && logger.isLoggable(Level.WARNING)) { + logger.warning("Device failed for " + kernel + ": " + _exception.getMessage()); + } + + preferences.markPreferredDeviceFailed(); + +// Device nextDevice = preferences.getPreferredDevice(kernel); +// +// if (nextDevice == null) { +// if (!_silently && logger.isLoggable(Level.SEVERE)) { +// logger.severe("No Devices left to try, giving up"); +// } +// throw new RuntimeException(_exception); +// } + if (!_silently && logger.isLoggable(Level.WARNING)) { + _exception.printStackTrace(); + logger.warning("Trying next device: " + describeDevice()); + } + } + + recreateRange(_settings); + return executeInternalInner(_settings); + } + + @SuppressWarnings("deprecation") + public synchronized Kernel execute(String _entrypoint, final Range _range, final int _passes) { executing = true; try { - long executeStartTime = System.currentTimeMillis(); - - if (_range == null) { - throw new IllegalStateException("range can't be null"); + clearCancelMultiPass(); + KernelProfile profile = KernelManager.instance().getProfile(kernel.getClass()); + KernelPreferences preferences = KernelManager.instance().getPreferences(kernel); + boolean legacyExecutionMode = kernel.getExecutionMode() != Kernel.EXECUTION_MODE.AUTO; + + ExecutionSettings settings = new ExecutionSettings(preferences, profile, _entrypoint, _range, _passes, legacyExecutionMode); + // Two Kernels of the same class share the same KernelPreferences object, and since failure (fallback) generally mutates + // the preferences object, we must lock it. Note this prevents two Kernels of the same class executing simultaneously. 
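One observable consequence of the locking comment above, sketched with a hypothetical MyKernel subclass: two instances of the same class resolve to the same KernelPreferences object, so their execute() calls contend for a single monitor:

    import com.amd.aparapi.Kernel;
    import com.amd.aparapi.internal.kernel.KernelManager;
    import com.amd.aparapi.internal.kernel.KernelPreferences;

    public class PreferencesSharingDemo {
       static class MyKernel extends Kernel { // hypothetical
          int[] data = new int[16];
          @Override public void run() { data[getGlobalId()]++; }
       }

       public static void main(String[] args) {
          KernelPreferences pa = KernelManager.instance().getPreferences(new MyKernel());
          KernelPreferences pb = KernelManager.instance().getPreferences(new MyKernel());
          System.out.println(pa == pb); // true: same class -> same preferences -> same lock
       }
    }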
+ synchronized (preferences) { + return executeInternalOuter(settings); } + } finally { + executing = false; + clearCancelMultiPass(); + } + } - /* for backward compatibility reasons we still honor execution mode */ - if (kernel.getExecutionMode().isOpenCL()) { - // System.out.println("OpenCL"); + private synchronized Kernel executeInternalOuter(ExecutionSettings _settings) { + try { + return executeInternalInner(_settings); + } finally { + if (kernel.isAutoCleanUpArrays() &&_settings.range.getGlobalSize_0() != 0) { + cleanUpArrays(); + } + } + } - // See if user supplied a Device - Device device = _range.getDevice(); + @SuppressWarnings("deprecation") + private synchronized Kernel executeInternalInner(ExecutionSettings _settings) { - if ((device == null) || (device instanceof OpenCLDevice)) { - if ((entryPoint == null) || (isFallBack)) { - if (entryPoint == null) { - try { - final ClassModel classModel = ClassModel.createClassModel(kernel.getClass()); - entryPoint = classModel.getEntrypoint(_entrypointName, kernel); - } catch (final Exception exception) { - return warnFallBackAndExecute(_entrypointName, _range, _passes, exception); - } - } + if (_settings.range == null) { + throw new IllegalStateException("range can't be null"); + } - if ((entryPoint != null) && !entryPoint.shouldFallback()) { - synchronized (Kernel.class) { // This seems to be needed because of a race condition uncovered with issue #68 http://code.google.com/p/aparapi/issues/detail?id=68 - if (device != null && !(device instanceof OpenCLDevice)) { - throw new IllegalStateException("range's device is not suitable for OpenCL "); - } + EXECUTION_MODE requestedExecutionMode = kernel.getExecutionMode(); - OpenCLDevice openCLDevice = (OpenCLDevice) device; // still might be null! + if (requestedExecutionMode.isOpenCL() && _settings.range.getDevice() != null && !(_settings.range.getDevice() instanceof OpenCLDevice)) { + fallBackToNextDevice(_settings, "OpenCL EXECUTION_MODE was requested but Device supplied was not an OpenCLDevice"); + } - int jniFlags = 0; - if (openCLDevice == null) { - if (kernel.getExecutionMode().equals(EXECUTION_MODE.GPU)) { - // Get the best GPU - openCLDevice = (OpenCLDevice) OpenCLDevice.bestGPU(); - jniFlags |= JNI_FLAG_USE_GPU; // this flag might be redundant now. - if (openCLDevice == null) { - return warnFallBackAndExecute(_entrypointName, _range, _passes, "GPU request can't be honored"); - } - } else if (kernel.getExecutionMode().equals(EXECUTION_MODE.ACC)) { - // Get the best ACC - openCLDevice = (OpenCLDevice) OpenCLDevice.bestACC(); - jniFlags |= JNI_FLAG_USE_ACC; // this flag might be redundant now. - if (openCLDevice == null) { - return warnFallBackAndExecute(_entrypointName, _range, _passes, "ACC request can't be honored"); - } - } else { - // We fetch the first CPU device - openCLDevice = (OpenCLDevice) OpenCLDevice.firstCPU(); - if (openCLDevice == null) { - return warnFallBackAndExecute(_entrypointName, _range, _passes, - "CPU request can't be honored not CPU device"); - } - } - } else { // openCLDevice == null - if (openCLDevice.getType() == Device.TYPE.GPU) { - jniFlags |= JNI_FLAG_USE_GPU; // this flag might be redundant now. - } else if (openCLDevice.getType() == Device.TYPE.ACC) { - jniFlags |= JNI_FLAG_USE_ACC; // this flag might be redundant now. 
- } - } + Device device = _settings.range.getDevice(); + boolean userSpecifiedDevice = true; + if (device == null) { + userSpecifiedDevice = false; + if (!_settings.legacyExecutionMode) { + device = _settings.preferences.getPreferredDevice(kernel); + if (device == null) { + // the default fallback when KernelPreferences has run out of options is JTP + device = JavaDevice.THREAD_POOL; + } + } else { + if (requestedExecutionMode == EXECUTION_MODE.JTP) { + device = JavaDevice.THREAD_POOL; + } else if (requestedExecutionMode == EXECUTION_MODE.SEQ) { + device = JavaDevice.SEQUENTIAL; + } + } + } else { + boolean compatible = isDeviceCompatible(device); + if (!compatible) { + throw new AssertionError("user supplied Device incompatible with current EXECUTION_MODE or getTargetDevice(); device = " + + device.getShortDescription() + "; kernel = " + kernel); + } + } - // jniFlags |= (Config.enableProfiling ? JNI_FLAG_ENABLE_PROFILING : 0); - // jniFlags |= (Config.enableProfilingCSV ? JNI_FLAG_ENABLE_PROFILING_CSV | JNI_FLAG_ENABLE_PROFILING : 0); - // jniFlags |= (Config.enableVerboseJNI ? JNI_FLAG_ENABLE_VERBOSE_JNI : 0); - // jniFlags |= (Config.enableVerboseJNIOpenCLResourceTracking ? JNI_FLAG_ENABLE_VERBOSE_JNI_OPENCL_RESOURCE_TRACKING :0); - // jniFlags |= (kernel.getExecutionMode().equals(EXECUTION_MODE.GPU) ? JNI_FLAG_USE_GPU : 0); - // Init the device to check capabilities before emitting the - // code that requires the capabilities. + try { + OpenCLDevice openCLDevice = device instanceof OpenCLDevice ? (OpenCLDevice) device : null; + + int jniFlags = 0; + // for legacy reasons use old logic where Kernel.EXECUTION_MODE is not AUTO + if (_settings.legacyExecutionMode && !userSpecifiedDevice && requestedExecutionMode.isOpenCL()) { + if (requestedExecutionMode.equals(EXECUTION_MODE.GPU)) { + // Get the best GPU + openCLDevice = (OpenCLDevice) KernelManager.DeprecatedMethods.bestGPU(); + jniFlags |= JNI_FLAG_USE_GPU; // this flag might be redundant now. + if (openCLDevice == null) { + return fallBackToNextDevice(_settings, "GPU request can't be honored, no GPU device"); + } + } else if (requestedExecutionMode.equals(EXECUTION_MODE.ACC)) { + // Get the best ACC + openCLDevice = (OpenCLDevice) KernelManager.DeprecatedMethods.bestACC(); + jniFlags |= JNI_FLAG_USE_ACC; // this flag might be redundant now. + if (openCLDevice == null) { + return fallBackToNextDevice(_settings, "ACC request can't be honored, no ACC device"); + } + } else { + // We fetch the first CPU device + openCLDevice = (OpenCLDevice) KernelManager.DeprecatedMethods.firstDevice(Device.TYPE.CPU); + if (openCLDevice == null) { + return fallBackToNextDevice(_settings, "CPU request can't be honored, no CPU device"); + } + } + } else { + if (device.getType() == Device.TYPE.GPU) { + jniFlags |= JNI_FLAG_USE_GPU; // this flag might be redundant now. + } else if (device.getType() == Device.TYPE.ACC) { + jniFlags |= JNI_FLAG_USE_ACC; // this flag might be redundant now. 
+ } + } + if (device == null && openCLDevice != null) { + device = openCLDevice; + } + assert device != null : "No device available"; + _settings.profile.onStart(device); + /* for backward compatibility reasons we still honor execution mode */ + boolean isOpenCl = requestedExecutionMode.isOpenCL() || device instanceof OpenCLDevice; + if (isOpenCl) { + if ((entryPoint == null) || (isFallBack)) { + if (entryPoint == null) { + try { + final ClassModel classModel = ClassModel.createClassModel(kernel.getClass()); + entryPoint = classModel.getEntrypoint(_settings.entrypoint, kernel); + _settings.profile.onEvent(ProfilingEvent.CLASS_MODEL_BUILT); + } catch (final Exception exception) { + _settings.profile.onEvent(ProfilingEvent.CLASS_MODEL_BUILT); + return fallBackToNextDevice(_settings, exception); + } + } - // synchronized(Kernel.class){ - jniContextHandle = initJNI(kernel, openCLDevice, jniFlags); // openCLDevice will not be null here - } // end of synchronized! issue 68 + if ((entryPoint != null)) { + synchronized (Kernel.class) { // This seems to be needed because of a race condition uncovered with issue #68 http://code.google.com/p/aparapi/issues/detail?id=68 + + // jniFlags |= (Config.enableProfiling ? JNI_FLAG_ENABLE_PROFILING : 0); + // jniFlags |= (Config.enableProfilingCSV ? JNI_FLAG_ENABLE_PROFILING_CSV | JNI_FLAG_ENABLE_PROFILING : 0); + // jniFlags |= (Config.enableVerboseJNI ? JNI_FLAG_ENABLE_VERBOSE_JNI : 0); + // jniFlags |= (Config.enableVerboseJNIOpenCLResourceTracking ? JNI_FLAG_ENABLE_VERBOSE_JNI_OPENCL_RESOURCE_TRACKING :0); + // jniFlags |= (kernel.getExecutionMode().equals(Kernel.EXECUTION_MODE.GPU) ? JNI_FLAG_USE_GPU : 0); + // Init the device to check capabilities before emitting the + // code that requires the capabilities. + jniContextHandle = initJNI(kernel, openCLDevice, jniFlags); // openCLDevice will not be null here + _settings.profile.onEvent(ProfilingEvent.INIT_JNI); + } // end of synchronized! 
issue 68 + + if (jniContextHandle == 0) { + return fallBackToNextDevice(_settings, "initJNI failed to return a valid handle"); + } - if (jniContextHandle == 0) { - return warnFallBackAndExecute(_entrypointName, _range, _passes, "initJNI failed to return a valid handle"); - } + final String extensions = getExtensionsJNI(jniContextHandle); + capabilitiesSet = new HashSet<String>(); - final String extensions = getExtensionsJNI(jniContextHandle); - capabilitiesSet = new HashSet<String>(); + final StringTokenizer strTok = new StringTokenizer(extensions); + while (strTok.hasMoreTokens()) { + capabilitiesSet.add(strTok.nextToken()); + } - final StringTokenizer strTok = new StringTokenizer(extensions); - while (strTok.hasMoreTokens()) { - capabilitiesSet.add(strTok.nextToken()); - } + if (logger.isLoggable(Level.FINE)) { + logger.fine("Capabilities initialized to :" + capabilitiesSet.toString()); + } - if (logger.isLoggable(Level.FINE)) { - logger.fine("Capabilities initialized to :" + capabilitiesSet.toString()); - } + if (entryPoint.requiresDoublePragma() && !hasFP64Support()) { + return fallBackToNextDevice(_settings, "FP64 required but not supported"); + } - if (entryPoint.requiresDoublePragma() && !hasFP64Support()) { - return warnFallBackAndExecute(_entrypointName, _range, _passes, "FP64 required but not supported"); - } + if (entryPoint.requiresByteAddressableStorePragma() && !hasByteAddressableStoreSupport()) { + return fallBackToNextDevice(_settings, "Byte addressable stores required but not supported"); + } - if (entryPoint.requiresByteAddressableStorePragma() && !hasByteAddressableStoreSupport()) { - return warnFallBackAndExecute(_entrypointName, _range, _passes, - "Byte addressable stores required but not supported"); - } + final boolean all32AtomicsAvailable = hasGlobalInt32BaseAtomicsSupport() + && hasGlobalInt32ExtendedAtomicsSupport() && hasLocalInt32BaseAtomicsSupport() + && hasLocalInt32ExtendedAtomicsSupport(); - final boolean all32AtomicsAvailable = hasGlobalInt32BaseAtomicsSupport() - && hasGlobalInt32ExtendedAtomicsSupport() && hasLocalInt32BaseAtomicsSupport() - && hasLocalInt32ExtendedAtomicsSupport(); + if (entryPoint.requiresAtomic32Pragma() && !all32AtomicsAvailable) { - if (entryPoint.requiresAtomic32Pragma() && !all32AtomicsAvailable) { + return fallBackToNextDevice(_settings, "32 bit Atomics required but not supported"); + } - return warnFallBackAndExecute(_entrypointName, _range, _passes, "32 bit Atomics required but not supported"); + String openCL; + synchronized (openCLCache) { + openCL = openCLCache.get(kernel.getClass()); + if (openCL == null) { + try { + openCL = KernelWriter.writeToString(entryPoint); + if (logger.isLoggable(Level.INFO)) { + logger.info(openCL); + } + else if (Config.enableShowGeneratedOpenCL) { + System.out.println(openCL); + } + _settings.profile.onEvent(ProfilingEvent.OPENCL_GENERATED); + openCLCache.put(kernel.getClass(), openCL); + } + catch (final CodeGenException codeGenException) { + openCLCache.put(kernel.getClass(), CODE_GEN_ERROR_MARKER); + _settings.profile.onEvent(ProfilingEvent.OPENCL_GENERATED); + return fallBackToNextDevice(_settings, codeGenException); + } } - - String openCL = null; - try { - openCL = KernelWriter.writeToString(entryPoint); - } catch (final CodeGenException codeGenException) { - return warnFallBackAndExecute(_entrypointName, _range, _passes, codeGenException); + else { + if (openCL.equals(CODE_GEN_ERROR_MARKER)) { + _settings.profile.onEvent(ProfilingEvent.OPENCL_GENERATED); + boolean silently = true; // 
since we must have already reported the CodeGenException
+                           return fallBackToNextDevice(_settings, null, silently);
+                        }
+                     }
+                  }

-                     if (Config.enableShowGeneratedOpenCL) {
-                        System.out.println(openCL);
+                  // Send the source string for OpenCL to compile; if the compiled binary is already cached on the JNI side, send the empty string instead so the cached binary is reused
+                  long handle;
+                  if (BINARY_CACHING_DISABLED) {
+                     handle = buildProgramJNI(jniContextHandle, openCL, "");
+                  } else {
+                     synchronized (seenBinaryKeys) {
+                        String binaryKey = kernel.getClass().getName() + ":" + device.getDeviceId();
+                        if (seenBinaryKeys.contains(binaryKey)) {
+                           // use cached binary
+                           logger.log(Level.INFO, "reusing cached binary for " + binaryKey);
+                           handle = buildProgramJNI(jniContextHandle, "", binaryKey);
+                        }
+                        else {
+                           // create and cache binary
+                           logger.log(Level.INFO, "compiling new binary for " + binaryKey);
+                           handle = buildProgramJNI(jniContextHandle, openCL, binaryKey);
+                           seenBinaryKeys.add(binaryKey);
+                        }
                     }
+                  }
+                  _settings.profile.onEvent(ProfilingEvent.OPENCL_COMPILED);
+                  if (handle == 0) {
+                     return fallBackToNextDevice(_settings, "OpenCL compile failed");
+                  }

-                     if (logger.isLoggable(Level.INFO)) {
-                        logger.info(openCL);
-                     }
+                  args = new KernelArg[entryPoint.getReferencedFields().size()];
+                  int i = 0;

-                     // Send the string to OpenCL to compile it
-                     if (buildProgramJNI(jniContextHandle, openCL) == 0) {
-                        return warnFallBackAndExecute(_entrypointName, _range, _passes, "OpenCL compile failed");
-                     }
+                  for (final Field field : entryPoint.getReferencedFields()) {
+                     try {
+                        field.setAccessible(true);
+                        args[i] = new KernelArg();
+                        args[i].setName(field.getName());
+                        args[i].setField(field);
+                        if ((field.getModifiers() & Modifier.STATIC) == Modifier.STATIC) {
+                           args[i].setType(args[i].getType() | ARG_STATIC);
+                        }

-                     args = new KernelArg[entryPoint.getReferencedFields().size()];
-                     int i = 0;
+                        final Class<?> type = field.getType();
+                        if (type.isArray()) {

-                     for (final Field field : entryPoint.getReferencedFields()) {
-                        try {
-                           field.setAccessible(true);
-                           args[i] = new KernelArg();
-                           args[i].setName(field.getName());
-                           args[i].setField(field);
-                           if ((field.getModifiers() & Modifier.STATIC) == Modifier.STATIC) {
-                              args[i].setType(args[i].getType() | ARG_STATIC);
+                           if (field.getAnnotation(Local.class) != null || args[i].getName().endsWith(Kernel.LOCAL_SUFFIX)) {
+                              args[i].setType(args[i].getType() | ARG_LOCAL);
+                           } else if ((field.getAnnotation(Constant.class) != null)
+                                 || args[i].getName().endsWith(Kernel.CONSTANT_SUFFIX)) {
+                              args[i].setType(args[i].getType() | ARG_CONSTANT);
+                           } else {
+                              args[i].setType(args[i].getType() | ARG_GLOBAL);
+                           }
+                           if (isExplicit()) {
+                              args[i].setType(args[i].getType() | ARG_EXPLICIT);
                           }
+                           // for now, treat all write arrays as read-write, see bugzilla issue 4859
+                           // we might come up with a better solution later
+                           args[i].setType(args[i].getType()
+                                 | (entryPoint.getArrayFieldAssignments().contains(field.getName()) ? (ARG_WRITE | ARG_READ) : 0));
+                           args[i].setType(args[i].getType()
+                                 | (entryPoint.getArrayFieldAccesses().contains(field.getName()) ? ARG_READ : 0));
+                           // args[i].type |= ARG_GLOBAL;
+
+                           if (type.getName().startsWith("[L")) {
+                              args[i].setArray(null); // will get updated in updateKernelArrayRefs
+                              args[i].setType(args[i].getType()
+                                    | (ARG_ARRAY | ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ));

-                        final Class<?> type = field.getType();
-                        if (type.isArray()) {

+                              if (logger.isLoggable(Level.FINE)) {
+                                 logger.fine("tagging " + args[i].getName() + " as (ARG_ARRAY | ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)");
+                              }
+                           } else if (type.getName().startsWith("[[")) {

-                           if (field.getAnnotation(Local.class) != null || args[i].getName().endsWith(Kernel.LOCAL_SUFFIX)) {
-                              args[i].setType(args[i].getType() | ARG_LOCAL);
-                           } else if ((field.getAnnotation(Constant.class) != null)
-                                 || args[i].getName().endsWith(Kernel.CONSTANT_SUFFIX)) {
-                              args[i].setType(args[i].getType() | ARG_CONSTANT);
-                           } else {
-                              args[i].setType(args[i].getType() | ARG_GLOBAL);

+                              try {
+                                 setMultiArrayType(args[i], type);
+                              } catch (AparapiException e) {
+                                 return fallBackToNextDevice(_settings, "failed to set kernel argument "
+                                       + args[i].getName() + ". Aparapi only supports 2D and 3D arrays.");
                              }
-                           if (isExplicit()) {
-                              args[i].setType(args[i].getType() | ARG_EXPLICIT);
+                           } else {
+
+                              args[i].setArray(null); // will get updated in updateKernelArrayRefs
+                              args[i].setType(args[i].getType() | ARG_ARRAY);
+
+                              args[i].setType(args[i].getType() | (type.isAssignableFrom(float[].class) ? ARG_FLOAT : 0));
+                              args[i].setType(args[i].getType() | (type.isAssignableFrom(int[].class) ? ARG_INT : 0));
+                              args[i].setType(args[i].getType() | (type.isAssignableFrom(boolean[].class) ? ARG_BOOLEAN : 0));
+                              args[i].setType(args[i].getType() | (type.isAssignableFrom(byte[].class) ? ARG_BYTE : 0));
+                              args[i].setType(args[i].getType() | (type.isAssignableFrom(char[].class) ? ARG_CHAR : 0));
+                              args[i].setType(args[i].getType() | (type.isAssignableFrom(double[].class) ? ARG_DOUBLE : 0));
+                              args[i].setType(args[i].getType() | (type.isAssignableFrom(long[].class) ? ARG_LONG : 0));
+                              args[i].setType(args[i].getType() | (type.isAssignableFrom(short[].class) ? ARG_SHORT : 0));
+
+                              // arrays whose length is used will have an int arg holding
+                              // the length as a kernel param
+                              if (entryPoint.getArrayFieldArrayLengthUsed().contains(args[i].getName())) {
+                                 args[i].setType(args[i].getType() | ARG_ARRAYLENGTH);
                              }
-                           // for now, treat all write arrays as read-write, see bugzilla issue 4859
-                           // we might come up with a better solution later
-                           args[i].setType(args[i].getType()
-                                 | (entryPoint.getArrayFieldAssignments().contains(field.getName()) ? (ARG_WRITE | ARG_READ) : 0));
-                           args[i].setType(args[i].getType()
-                                 | (entryPoint.getArrayFieldAccesses().contains(field.getName()) ? ARG_READ : 0));
-                           // args[i].type |= ARG_GLOBAL;

                              if (type.getName().startsWith("[L")) {
-                                 args[i].setType(args[i].getType()
-                                       | (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ | ARG_APARAPI_BUFFER));
-
+                                 args[i].setType(args[i].getType() | (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ));
                                 if (logger.isLoggable(Level.FINE)) {
-                                    logger.fine("tagging " + args[i].getName() + " as (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)");
-                                 }
-                              } else if (type.getName().startsWith("[[")) {
-
-                                 try {
-                                    setMultiArrayType(args[i], type);
-                                 } catch (AparapiException e) {
-                                    return warnFallBackAndExecute(_entrypointName, _range, _passes, "failed to set kernel arguement "
-                                          + args[i].getName() + ". 
Aparapi only supports 2D and 3D arrays."); - } - } else { - - args[i].setArray(null); // will get updated in updateKernelArrayRefs - args[i].setType(args[i].getType() | ARG_ARRAY); - - args[i].setType(args[i].getType() | (type.isAssignableFrom(float[].class) ? ARG_FLOAT : 0)); - args[i].setType(args[i].getType() | (type.isAssignableFrom(int[].class) ? ARG_INT : 0)); - args[i].setType(args[i].getType() | (type.isAssignableFrom(boolean[].class) ? ARG_BOOLEAN : 0)); - args[i].setType(args[i].getType() | (type.isAssignableFrom(byte[].class) ? ARG_BYTE : 0)); - args[i].setType(args[i].getType() | (type.isAssignableFrom(char[].class) ? ARG_CHAR : 0)); - args[i].setType(args[i].getType() | (type.isAssignableFrom(double[].class) ? ARG_DOUBLE : 0)); - args[i].setType(args[i].getType() | (type.isAssignableFrom(long[].class) ? ARG_LONG : 0)); - args[i].setType(args[i].getType() | (type.isAssignableFrom(short[].class) ? ARG_SHORT : 0)); - - // arrays whose length is used will have an int arg holding - // the length as a kernel param - if (entryPoint.getArrayFieldArrayLengthUsed().contains(args[i].getName())) { - args[i].setType(args[i].getType() | ARG_ARRAYLENGTH); - } - - if (type.getName().startsWith("[L")) { - args[i].setType(args[i].getType() | (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)); - if (logger.isLoggable(Level.FINE)) { - logger.fine("tagging " + args[i].getName() - + " as (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)"); - } + logger.fine("tagging " + args[i].getName() + + " as (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)"); } } - } else if (type.isAssignableFrom(float.class)) { - args[i].setType(args[i].getType() | ARG_PRIMITIVE); - args[i].setType(args[i].getType() | ARG_FLOAT); - } else if (type.isAssignableFrom(int.class)) { - args[i].setType(args[i].getType() | ARG_PRIMITIVE); - args[i].setType(args[i].getType() | ARG_INT); - } else if (type.isAssignableFrom(double.class)) { - args[i].setType(args[i].getType() | ARG_PRIMITIVE); - args[i].setType(args[i].getType() | ARG_DOUBLE); - } else if (type.isAssignableFrom(long.class)) { - args[i].setType(args[i].getType() | ARG_PRIMITIVE); - args[i].setType(args[i].getType() | ARG_LONG); - } else if (type.isAssignableFrom(boolean.class)) { - args[i].setType(args[i].getType() | ARG_PRIMITIVE); - args[i].setType(args[i].getType() | ARG_BOOLEAN); - } else if (type.isAssignableFrom(byte.class)) { - args[i].setType(args[i].getType() | ARG_PRIMITIVE); - args[i].setType(args[i].getType() | ARG_BYTE); - } else if (type.isAssignableFrom(char.class)) { - args[i].setType(args[i].getType() | ARG_PRIMITIVE); - args[i].setType(args[i].getType() | ARG_CHAR); - } else if (type.isAssignableFrom(short.class)) { - args[i].setType(args[i].getType() | ARG_PRIMITIVE); - args[i].setType(args[i].getType() | ARG_SHORT); } - // System.out.printf("in execute, arg %d %s %08x\n", i,args[i].name,args[i].type ); - } catch (final IllegalArgumentException e) { - e.printStackTrace(); + } else if (type.isAssignableFrom(float.class)) { + args[i].setType(args[i].getType() | ARG_PRIMITIVE); + args[i].setType(args[i].getType() | ARG_FLOAT); + } else if (type.isAssignableFrom(int.class)) { + args[i].setType(args[i].getType() | ARG_PRIMITIVE); + args[i].setType(args[i].getType() | ARG_INT); + } else if (type.isAssignableFrom(double.class)) { + args[i].setType(args[i].getType() | ARG_PRIMITIVE); + args[i].setType(args[i].getType() | ARG_DOUBLE); + } else if (type.isAssignableFrom(long.class)) { + args[i].setType(args[i].getType() | ARG_PRIMITIVE); + args[i].setType(args[i].getType() 
| ARG_LONG); + } else if (type.isAssignableFrom(boolean.class)) { + args[i].setType(args[i].getType() | ARG_PRIMITIVE); + args[i].setType(args[i].getType() | ARG_BOOLEAN); + } else if (type.isAssignableFrom(byte.class)) { + args[i].setType(args[i].getType() | ARG_PRIMITIVE); + args[i].setType(args[i].getType() | ARG_BYTE); + } else if (type.isAssignableFrom(char.class)) { + args[i].setType(args[i].getType() | ARG_PRIMITIVE); + args[i].setType(args[i].getType() | ARG_CHAR); + } else if (type.isAssignableFrom(short.class)) { + args[i].setType(args[i].getType() | ARG_PRIMITIVE); + args[i].setType(args[i].getType() | ARG_SHORT); } + // System.out.printf("in execute, arg %d %s %08x\n", i,args[i].name,args[i].type ); + } catch (final IllegalArgumentException e) { + e.printStackTrace(); + } - args[i].setPrimitiveSize(getPrimitiveSize(args[i].getType())); - - if (logger.isLoggable(Level.FINE)) { - logger.fine("arg " + i + ", " + args[i].getName() + ", type=" + Integer.toHexString(args[i].getType()) - + ", primitiveSize=" + args[i].getPrimitiveSize()); - } + args[i].setPrimitiveSize(getPrimitiveSize(args[i].getType())); - i++; + if (logger.isLoggable(Level.FINE)) { + logger.fine("arg " + i + ", " + args[i].getName() + ", type=" + Integer.toHexString(args[i].getType()) + + ", primitiveSize=" + args[i].getPrimitiveSize()); } - // at this point, i = the actual used number of arguments - // (private buffers do not get treated as arguments) - - argc = i; + i++; + } - setArgsJNI(jniContextHandle, args, argc); + // at this point, i = the actual used number of arguments + // (private buffers do not get treated as arguments) - conversionTime = System.currentTimeMillis() - executeStartTime; + argc = i; - try { - executeOpenCL(_entrypointName, _range, _passes); - isFallBack = false; - } catch (final AparapiException e) { - warnFallBackAndExecute(_entrypointName, _range, _passes, e); - } - } else { // (entryPoint != null) && !entryPoint.shouldFallback() - warnFallBackAndExecute(_entrypointName, _range, _passes, "failed to locate entrypoint"); - } - } else { // (entryPoint == null) || (isFallBack) + setArgsJNI(jniContextHandle, args, argc); + _settings.profile.onEvent(ProfilingEvent.PREPARE_EXECUTE); try { - executeOpenCL(_entrypointName, _range, _passes); + executeOpenCL(_settings); isFallBack = false; } catch (final AparapiException e) { - warnFallBackAndExecute(_entrypointName, _range, _passes, e); + fallBackToNextDevice(_settings, e); } + } else { // (entryPoint != null) && !entryPoint.shouldFallback() + fallBackToNextDevice(_settings, "failed to locate entrypoint"); + } + } else { // (entryPoint == null) || (isFallBack) + try { + executeOpenCL(_settings); + isFallBack = false; + } catch (final AparapiException e) { + fallBackToNextDevice(_settings, e); } - } else { // (device == null) || (device instanceof OpenCLDevice) - warnFallBackAndExecute(_entrypointName, _range, _passes, - "OpenCL was requested but Device supplied was not an OpenCLDevice"); } - } else { // kernel.getExecutionMode().isOpenCL() - executeJava(_range, _passes); + } else { // isOpenCL + if (!(device instanceof JavaDevice)) { + fallBackToNextDevice(_settings, "Non-OpenCL Kernel.EXECUTION_MODE requested but device is not a JavaDevice "); + } + executeJava(_settings, (JavaDevice) device); } if (Config.enableExecutionModeReporting) { - System.out.println(describeKernelClass() + ":" + kernel.getExecutionMode()); + System.out.println("execution complete: " + kernel); } - executionTime = System.currentTimeMillis() - executeStartTime; - 
accumulatedExecutionTime += executionTime; - return kernel; - } finally { - executing = false; - clearCancelMultiPass(); + } + finally { + _settings.profile.onEvent(ProfilingEvent.EXECUTED); + maybeReportProfile(_settings); + } + } + + @Override + public String toString() { + return "KernelRunner{" + kernel + "}"; + } + + private String describeDevice() { + Device device = KernelManager.instance().getPreferences(kernel).getPreferredDevice(kernel); + return (device == null) ? "<default fallback>" : device.getShortDescription(); + } + + private void maybeReportProfile(ExecutionSettings _settings) { + if (Config.dumpProfileOnExecution) { + StringBuilder report = new StringBuilder(); + report.append(KernelDeviceProfile.getTableHeader()).append('\n'); + report.append(_settings.profile.getLastDeviceProfile().getLastAsTableRow()); + System.out.println(report); + } + } + + @SuppressWarnings("deprecation") + private boolean isDeviceCompatible(Device device) { + Kernel.EXECUTION_MODE mode = kernel.getExecutionMode(); + if (mode != Kernel.EXECUTION_MODE.AUTO) { + switch (device.getType()) { + case GPU: + return mode == Kernel.EXECUTION_MODE.GPU; + case CPU: + return mode == Kernel.EXECUTION_MODE.CPU; + case JTP: + return mode == Kernel.EXECUTION_MODE.JTP; + case SEQ: + return mode == Kernel.EXECUTION_MODE.SEQ; + case ACC: + return mode == Kernel.EXECUTION_MODE.ACC; + default: + return false; + } + } else { + return (device == kernel.getTargetDevice()); } } @@ -1394,14 +1565,11 @@ public class KernelRunner extends KernelRunnerJNI{ if (!executing) { return PASS_ID_COMPLETED_EXECUTION; } - switch (kernel.getExecutionMode()) { - case NONE: - return PASS_ID_COMPLETED_EXECUTION; - case JTP: // fallthrough - case SEQ: - return getCurrentPassLocal(); - default: - return getCurrentPassRemote(); + + if (kernel.isRunningCL()) { + return getCurrentPassRemote(); + } else { + return getCurrentPassLocal(); } } @@ -1520,17 +1688,14 @@ public class KernelRunner extends KernelRunnerJNI{ * @see Kernel#get(boolean[] arr) */ public void get(Object array) { - if (explicit - && ((kernel.getExecutionMode() == Kernel.EXECUTION_MODE.GPU) - || (kernel.getExecutionMode() == Kernel.EXECUTION_MODE.ACC) || (kernel.getExecutionMode() == Kernel.EXECUTION_MODE.CPU))) { - // Only makes sense when we are using OpenCL + if (explicit && (kernel.isRunningCL())) { + // Only makes sense when we are using OpenCL getJNI(jniContextHandle, array); } } public List<ProfileInfo> getProfileInfo() { - if (((kernel.getExecutionMode() == Kernel.EXECUTION_MODE.GPU) || (kernel.getExecutionMode() == Kernel.EXECUTION_MODE.ACC) || (kernel - .getExecutionMode() == Kernel.EXECUTION_MODE.CPU))) { + if (explicit && (kernel.isRunningCL())) { // Only makes sense when we are using OpenCL return (getProfileInfoJNI(jniContextHandle)); } else { @@ -1554,9 +1719,7 @@ public class KernelRunner extends KernelRunnerJNI{ */ public void put(Object array) { - if (explicit - && ((kernel.getExecutionMode() == Kernel.EXECUTION_MODE.GPU) - || (kernel.getExecutionMode() == Kernel.EXECUTION_MODE.ACC) || (kernel.getExecutionMode() == Kernel.EXECUTION_MODE.CPU))) { + if (explicit && (kernel.isRunningCL())) { // Only makes sense when we are using OpenCL puts.add(array); } @@ -1572,33 +1735,33 @@ public class KernelRunner extends KernelRunnerJNI{ return (explicit); } - /** - * Determine the time taken to convert bytecode to OpenCL for first Kernel.execute(range) call. 
- * - * @return The time spent preparing the kernel for execution using GPU - * - */ - public long getConversionTime() { - return conversionTime; - } - - /** - * Determine the execution time of the previous Kernel.execute(range) call. - * - * @return The time spent executing the kernel (ms) - * - */ - public long getExecutionTime() { - return executionTime; - } + private static class ExecutionSettings { + final KernelPreferences preferences; + final KernelProfile profile; + final String entrypoint; + Range range; + final int passes; + final boolean legacyExecutionMode; + + private ExecutionSettings(KernelPreferences preferences, KernelProfile profile, String entrypoint, Range range, int passes, boolean legacyExecutionMode) { + this.preferences = preferences; + this.profile = profile; + this.entrypoint = entrypoint; + this.range = range; + this.passes = passes; + this.legacyExecutionMode = legacyExecutionMode; + } - /** - * Determine the accumulated execution time of all previous Kernel.execute(range) calls. - * - * @return The accumulated time spent executing this kernel (ms) - * - */ - public long getAccumulatedExecutionTime() { - return accumulatedExecutionTime; + @Override + public String toString() { + return "ExecutionSettings{" + + "preferences=" + preferences + + ", profile=" + profile + + ", entrypoint='" + entrypoint + '\'' + + ", range=" + range + + ", passes=" + passes + + ", legacyExecutionMode=" + legacyExecutionMode + + '}'; + } } } diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/ProfilingEvent.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/ProfilingEvent.java new file mode 100644 index 0000000000000000000000000000000000000000..4e1d01d0a524f3a7b2075891b6bbd877ad6cf3b1 --- /dev/null +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/ProfilingEvent.java @@ -0,0 +1,8 @@ +package com.amd.aparapi.internal.kernel; + +/** + * Created by Barney on 02/09/2015. 
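+ * Stages of a single kernel execution, in the order in which KernelRunner reports them to the kernel profiling layer.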
+ */ +public enum ProfilingEvent { + START, CLASS_MODEL_BUILT, INIT_JNI, OPENCL_GENERATED, OPENCL_COMPILED, PREPARE_EXECUTE, EXECUTED +} diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ClassModel.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ClassModel.java index e4728c5e892f769305fcec0f8b29878a1aecbd41..5b3823e8f70e55eed88371817a966733bfb7f7ad 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ClassModel.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ClassModel.java @@ -45,6 +45,7 @@ import com.amd.aparapi.internal.model.ValueCache.ThrowingValueComputer; import com.amd.aparapi.internal.model.ClassModel.AttributePool.*; import com.amd.aparapi.internal.model.ClassModel.ConstantPool.*; import com.amd.aparapi.internal.reader.*; +import com.amd.aparapi.internal.util.*; import java.io.*; import java.lang.reflect.*; @@ -64,9 +65,9 @@ import java.util.logging.*; * @author gfrost * */ -public class ClassModel{ +public class ClassModel { - public interface LocalVariableInfo{ + public interface LocalVariableInfo { int getStart(); @@ -141,6 +142,7 @@ public class ClassModel{ }); // private ValueCache<String, Integer, ClassParseException> privateMemorySizes = ValueCache.on(this::computePrivateMemorySize); + private ValueCache<String, Integer, ClassParseException> privateMemorySizes = ValueCache .on(new ThrowingValueComputer<String, Integer, ClassParseException>(){ @Override @@ -634,19 +636,25 @@ public class ClassModel{ return (methodDescription); } - // private static final ValueCache<Class<?>, ClassModel, ClassParseException> classModelCache = ValueCache.onIdentity(ClassModel::new); private static final ValueCache<Class<?>, ClassModel, ClassParseException> classModelCache = ValueCache .on(new ThrowingValueComputer<Class<?>, ClassModel, ClassParseException>(){ @Override public ClassModel compute(Class<?> key) throws ClassParseException { - return new ClassModel(key); + return createClassModelInternal(key); } }); + private static ClassModel createClassModelInternal(Class<?> key) throws ClassParseException { + ClassModel classModel = new ClassModel(key); + return classModel; + } + public static ClassModel createClassModel(Class<?> _class) throws ClassParseException { - if (CacheEnabler.areCachesEnabled()) + if (CacheEnabler.areCachesEnabled()) { return classModelCache.computeIfAbsent(_class); - return new ClassModel(_class); + } + + return createClassModelInternal(_class); } private int magic; @@ -745,7 +753,7 @@ public class ClassModel{ private final List<Entry> entries = new ArrayList<Entry>(); - public abstract class Entry{ + public abstract class Entry { private final ConstantPoolType constantPoolType; private final int slot; @@ -1558,7 +1566,7 @@ public class ClassModel{ } } - public class AttributePool{ + public class AttributePool { private final List<AttributePoolEntry> attributePoolEntries = new ArrayList<AttributePoolEntry>(); public class CodeEntry extends AttributePoolEntry{ @@ -1671,7 +1679,7 @@ public class ClassModel{ } } - public abstract class AttributePoolEntry{ + public abstract class AttributePoolEntry { protected int length; protected int nameIndex; @@ -1726,7 +1734,7 @@ public class ClassModel{ } public class InnerClassesEntry extends PoolEntry<InnerClassesEntry.InnerClassInfo>{ - public class InnerClassInfo{ + public class InnerClassInfo { private final int innerAccess; private final int innerIndex; @@ -1770,7 +1778,7 @@ public class ClassModel{ public class LineNumberTableEntry extends 
PoolEntry<LineNumberTableEntry.StartLineNumberPair>{ - public class StartLineNumberPair{ + public class StartLineNumberPair { private final int lineNumber; private final int start; @@ -2089,13 +2097,13 @@ public class ClassModel{ public class RuntimeAnnotationsEntry extends PoolEntry<RuntimeAnnotationsEntry.AnnotationInfo>{ - public class AnnotationInfo{ + public class AnnotationInfo { private final int typeIndex; private final int elementValuePairCount; public class ElementValuePair{ - class Value{ + class Value { Value(int _tag) { tag = _tag; } @@ -2382,7 +2390,7 @@ public class ClassModel{ private static ClassLoader classModelLoader = ClassModel.class.getClassLoader(); - public class ClassModelField{ + public class ClassModelField { private final int fieldAccessFlags; AttributePool fieldAttributePool; @@ -2449,7 +2457,7 @@ public class ClassModel{ } } - public class ClassModelMethod{ + public class ClassModelMethod { private final int methodAccessFlags; @@ -2553,7 +2561,7 @@ public class ClassModel{ } } - public class ClassModelInterface{ + public class ClassModelInterface { private final int interfaceIndex; ClassModelInterface(ByteReader _byteReader) { @@ -2629,7 +2637,7 @@ public class ClassModel{ methods.add(method); } - attributePool = new AttributePool(byteReader, getClassWeAreModelling().getSimpleName()); + attributePool = new AttributePool(byteReader, Reflection.getSimpleName(getClassWeAreModelling())); } public int getMagic() { @@ -2804,7 +2812,9 @@ public class ClassModel{ Entrypoint getEntrypoint(String _entrypointName, String _descriptor, Object _k) throws AparapiException { if (CacheEnabler.areCachesEnabled()) { EntrypointKey key = EntrypointKey.of(_entrypointName, _descriptor); + long s = System.nanoTime(); Entrypoint entrypointWithoutKernel = entrypointCache.computeIfAbsent(key); + long e = System.nanoTime() - s; return entrypointWithoutKernel.cloneForKernel(_k); } else { final MethodModel method = getMethodModel(_entrypointName, _descriptor); diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Entrypoint.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Entrypoint.java index 7ae155efa905a1dca6cd88f39931977a6ea9317a..974dac64adfec1c2ba8ca681c3576e6ccad28fda 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Entrypoint.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Entrypoint.java @@ -62,8 +62,6 @@ public class Entrypoint implements Cloneable { private Object kernelInstance = null; - private final boolean fallback = false; - private final Set<String> referencedFieldNames = new LinkedHashSet<String>(); private final Set<String> arrayFieldAssignments = new LinkedHashSet<String>(); @@ -474,7 +472,7 @@ public class Entrypoint implements Cloneable { // methodMap now contains a list of method called by run itself(). 
// Walk the whole graph of called methods and add them to the methodMap - while (!fallback && discovered) { + while (discovered) { discovered = false; for (final MethodModel mm : new ArrayList<MethodModel>(methodMap.values())) { for (final MethodCall methodCall : mm.getMethodCalls()) { @@ -506,295 +504,288 @@ public class Entrypoint implements Cloneable { methodModel.checkForRecursion(new HashSet<MethodModel>()); - if (logger.isLoggable(Level.FINE)) { - logger.fine("fallback=" + fallback); - } - - if (!fallback) { - calledMethods.addAll(methodMap.values()); - Collections.reverse(calledMethods); - final List<MethodModel> methods = new ArrayList<MethodModel>(calledMethods); + calledMethods.addAll(methodMap.values()); + Collections.reverse(calledMethods); + final List<MethodModel> methods = new ArrayList<MethodModel>(calledMethods); - // add method to the calledMethods so we can include in this list - methods.add(methodModel); - final Set<String> fieldAssignments = new HashSet<String>(); + // add method to the calledMethods so we can include in this list + methods.add(methodModel); + final Set<String> fieldAssignments = new HashSet<String>(); - final Set<String> fieldAccesses = new HashSet<String>(); + final Set<String> fieldAccesses = new HashSet<String>(); - for (final MethodModel methodModel : methods) { - - // Record which pragmas we need to enable - if (methodModel.requiresDoublePragma()) { - usesDoubles = true; - if (logger.isLoggable(Level.FINE)) { - logger.fine("Enabling doubles on " + methodModel.getName()); - } + for (final MethodModel methodModel : methods) { + // Record which pragmas we need to enable + if (methodModel.requiresDoublePragma()) { + usesDoubles = true; + if (logger.isLoggable(Level.FINE)) { + logger.fine("Enabling doubles on " + methodModel.getName()); } - if (methodModel.requiresByteAddressableStorePragma()) { - usesByteWrites = true; - if (logger.isLoggable(Level.FINE)) { - logger.fine("Enabling byte addressable on " + methodModel.getName()); - } + + } + if (methodModel.requiresByteAddressableStorePragma()) { + usesByteWrites = true; + if (logger.isLoggable(Level.FINE)) { + logger.fine("Enabling byte addressable on " + methodModel.getName()); } + } - for (Instruction instruction = methodModel.getPCHead(); instruction != null; instruction = instruction.getNextPC()) { + for (Instruction instruction = methodModel.getPCHead(); instruction != null; instruction = instruction.getNextPC()) { - if (instruction instanceof AssignToArrayElement) { - final AssignToArrayElement assignment = (AssignToArrayElement) instruction; + if (instruction instanceof AssignToArrayElement) { + final AssignToArrayElement assignment = (AssignToArrayElement) instruction; - final Instruction arrayRef = assignment.getArrayRef(); - // AccessField here allows instance and static array refs - if (arrayRef instanceof I_GETFIELD) { - final I_GETFIELD getField = (I_GETFIELD) arrayRef; - final FieldEntry field = getField.getConstantPoolFieldEntry(); - final String assignedArrayFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); - arrayFieldAssignments.add(assignedArrayFieldName); - referencedFieldNames.add(assignedArrayFieldName); + final Instruction arrayRef = assignment.getArrayRef(); + // AccessField here allows instance and static array refs + if (arrayRef instanceof I_GETFIELD) { + final I_GETFIELD getField = (I_GETFIELD) arrayRef; + final FieldEntry field = getField.getConstantPoolFieldEntry(); + final String assignedArrayFieldName = 
field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); + arrayFieldAssignments.add(assignedArrayFieldName); + referencedFieldNames.add(assignedArrayFieldName); - } - } else if (instruction instanceof AccessArrayElement) { - final AccessArrayElement access = (AccessArrayElement) instruction; - - final Instruction arrayRef = access.getArrayRef(); - // AccessField here allows instance and static array refs - if (arrayRef instanceof I_GETFIELD) { - final I_GETFIELD getField = (I_GETFIELD) arrayRef; - final FieldEntry field = getField.getConstantPoolFieldEntry(); - final String accessedArrayFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); - arrayFieldAccesses.add(accessedArrayFieldName); - referencedFieldNames.add(accessedArrayFieldName); + } + } else if (instruction instanceof AccessArrayElement) { + final AccessArrayElement access = (AccessArrayElement) instruction; + + final Instruction arrayRef = access.getArrayRef(); + // AccessField here allows instance and static array refs + if (arrayRef instanceof I_GETFIELD) { + final I_GETFIELD getField = (I_GETFIELD) arrayRef; + final FieldEntry field = getField.getConstantPoolFieldEntry(); + final String accessedArrayFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); + arrayFieldAccesses.add(accessedArrayFieldName); + referencedFieldNames.add(accessedArrayFieldName); - } - } else if (instruction instanceof I_ARRAYLENGTH) { - Instruction child = instruction.getFirstChild(); - while(child instanceof I_AALOAD) { - child = child.getFirstChild(); - } - if (!(child instanceof AccessField)) { - throw new ClassParseException(ClassParseException.TYPE.LOCALARRAYLENGTHACCESS); - } - final AccessField childField = (AccessField) child; - final String arrayName = childField.getConstantPoolFieldEntry().getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); - arrayFieldArrayLengthUsed.add(arrayName); - if (logger.isLoggable(Level.FINE)) { - logger.fine("Noted arraylength in " + methodModel.getName() + " on " + arrayName); - } - } else if (instruction instanceof AccessField) { - final AccessField access = (AccessField) instruction; - final FieldEntry field = access.getConstantPoolFieldEntry(); - final String accessedFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); - fieldAccesses.add(accessedFieldName); - referencedFieldNames.add(accessedFieldName); - - final String signature = field.getNameAndTypeEntry().getDescriptorUTF8Entry().getUTF8(); - if (logger.isLoggable(Level.FINE)) { - logger.fine("AccessField field type= " + signature + " in " + methodModel.getName()); - } + } + } else if (instruction instanceof I_ARRAYLENGTH) { + Instruction child = instruction.getFirstChild(); + while(child instanceof I_AALOAD) { + child = child.getFirstChild(); + } + if (!(child instanceof AccessField)) { + throw new ClassParseException(ClassParseException.TYPE.LOCALARRAYLENGTHACCESS); + } + final AccessField childField = (AccessField) child; + final String arrayName = childField.getConstantPoolFieldEntry().getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); + arrayFieldArrayLengthUsed.add(arrayName); + if (logger.isLoggable(Level.FINE)) { + logger.fine("Noted arraylength in " + methodModel.getName() + " on " + arrayName); + } + } else if (instruction instanceof AccessField) { + final AccessField access = (AccessField) instruction; + final FieldEntry field = access.getConstantPoolFieldEntry(); + final String accessedFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); + 
fieldAccesses.add(accessedFieldName); + referencedFieldNames.add(accessedFieldName); + + final String signature = field.getNameAndTypeEntry().getDescriptorUTF8Entry().getUTF8(); + if (logger.isLoggable(Level.FINE)) { + logger.fine("AccessField field type= " + signature + " in " + methodModel.getName()); + } - // Add the class model for the referenced obj array - if (signature.startsWith("[L")) { - // Turn [Lcom/amd/javalabs/opencl/demo/DummyOOA; into com.amd.javalabs.opencl.demo.DummyOOA for example - final String className = (signature.substring(2, signature.length() - 1)).replace('/', '.'); - final ClassModel arrayFieldModel = getOrUpdateAllClassAccesses(className); - if (arrayFieldModel != null) { - final Class<?> memberClass = arrayFieldModel.getClassWeAreModelling(); - final int modifiers = memberClass.getModifiers(); - if (!Modifier.isFinal(modifiers)) { - throw new ClassParseException(ClassParseException.TYPE.ACCESSEDOBJECTNONFINAL); - } + // Add the class model for the referenced obj array + if (signature.startsWith("[L")) { + // Turn [Lcom/amd/javalabs/opencl/demo/DummyOOA; into com.amd.javalabs.opencl.demo.DummyOOA for example + final String className = (signature.substring(2, signature.length() - 1)).replace('/', '.'); + final ClassModel arrayFieldModel = getOrUpdateAllClassAccesses(className); + if (arrayFieldModel != null) { + final Class<?> memberClass = arrayFieldModel.getClassWeAreModelling(); + final int modifiers = memberClass.getModifiers(); + if (!Modifier.isFinal(modifiers)) { + throw new ClassParseException(ClassParseException.TYPE.ACCESSEDOBJECTNONFINAL); + } - final ClassModel refModel = objectArrayFieldsClasses.get(className); - if (refModel == null) { - - // Verify no other member with common parent - for (final ClassModel memberObjClass : objectArrayFieldsClasses.values()) { - ClassModel superModel = memberObjClass; - while (superModel != null) { - if (superModel.isSuperClass(memberClass)) { - throw new ClassParseException(ClassParseException.TYPE.ACCESSEDOBJECTFIELDNAMECONFLICT); - } - superModel = superModel.getSuperClazz(); + final ClassModel refModel = objectArrayFieldsClasses.get(className); + if (refModel == null) { + + // Verify no other member with common parent + for (final ClassModel memberObjClass : objectArrayFieldsClasses.values()) { + ClassModel superModel = memberObjClass; + while (superModel != null) { + if (superModel.isSuperClass(memberClass)) { + throw new ClassParseException(ClassParseException.TYPE.ACCESSEDOBJECTFIELDNAMECONFLICT); } + superModel = superModel.getSuperClazz(); } + } - objectArrayFieldsClasses.put(className, arrayFieldModel); - if (logger.isLoggable(Level.FINE)) { - logger.fine("adding class to objectArrayFields: " + className); - } + objectArrayFieldsClasses.put(className, arrayFieldModel); + if (logger.isLoggable(Level.FINE)) { + logger.fine("adding class to objectArrayFields: " + className); } } - } else { - final String className = (field.getClassEntry().getNameUTF8Entry().getUTF8()).replace('/', '.'); - // Look for object data member access - if (!className.equals(getClassModel().getClassWeAreModelling().getName()) - && (getFieldFromClassHierarchy(getClassModel().getClassWeAreModelling(), accessedFieldName) == null)) { - updateObjectMemberFieldAccesses(className, field); - } } - - } else if (instruction instanceof AssignToField) { - final AssignToField assignment = (AssignToField) instruction; - final FieldEntry field = assignment.getConstantPoolFieldEntry(); - final String assignedFieldName = 
field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); - fieldAssignments.add(assignedFieldName); - referencedFieldNames.add(assignedFieldName); - + } else { final String className = (field.getClassEntry().getNameUTF8Entry().getUTF8()).replace('/', '.'); // Look for object data member access if (!className.equals(getClassModel().getClassWeAreModelling().getName()) - && (getFieldFromClassHierarchy(getClassModel().getClassWeAreModelling(), assignedFieldName) == null)) { + && (getFieldFromClassHierarchy(getClassModel().getClassWeAreModelling(), accessedFieldName) == null)) { updateObjectMemberFieldAccesses(className, field); - } else { + } + } - if ((!Config.enablePUTFIELD) && methodModel.methodUsesPutfield() && !methodModel.isSetter()) { - throw new ClassParseException(ClassParseException.TYPE.ACCESSEDOBJECTONLYSUPPORTSSIMPLEPUTFIELD); - } + } else if (instruction instanceof AssignToField) { + final AssignToField assignment = (AssignToField) instruction; + final FieldEntry field = assignment.getConstantPoolFieldEntry(); + final String assignedFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); + fieldAssignments.add(assignedFieldName); + referencedFieldNames.add(assignedFieldName); + + final String className = (field.getClassEntry().getNameUTF8Entry().getUTF8()).replace('/', '.'); + // Look for object data member access + if (!className.equals(getClassModel().getClassWeAreModelling().getName()) + && (getFieldFromClassHierarchy(getClassModel().getClassWeAreModelling(), assignedFieldName) == null)) { + updateObjectMemberFieldAccesses(className, field); + } else { + if ((!Config.enablePUTFIELD) && methodModel.methodUsesPutfield() && !methodModel.isSetter()) { + throw new ClassParseException(ClassParseException.TYPE.ACCESSEDOBJECTONLYSUPPORTSSIMPLEPUTFIELD); } } - else if (instruction instanceof I_INVOKEVIRTUAL) { - final I_INVOKEVIRTUAL invokeInstruction = (I_INVOKEVIRTUAL) instruction; - MethodModel invokedMethod = invokeInstruction.getMethod(); - FieldEntry getterField = getSimpleGetterField(invokedMethod); - if (getterField != null) { - referencedFieldNames.add(getterField.getNameAndTypeEntry().getNameUTF8Entry().getUTF8()); - } - else { - final MethodEntry methodEntry = invokeInstruction.getConstantPoolMethodEntry(); - if (Kernel.isMappedMethod(methodEntry)) { //only do this for intrinsics - if (Kernel.usesAtomic32(methodEntry)) { - setRequiresAtomics32Pragma(true); - } + } + else if (instruction instanceof I_INVOKEVIRTUAL) { + final I_INVOKEVIRTUAL invokeInstruction = (I_INVOKEVIRTUAL) instruction; + MethodModel invokedMethod = invokeInstruction.getMethod(); + FieldEntry getterField = getSimpleGetterField(invokedMethod); + if (getterField != null) { + referencedFieldNames.add(getterField.getNameAndTypeEntry().getNameUTF8Entry().getUTF8()); + } + else { + final MethodEntry methodEntry = invokeInstruction.getConstantPoolMethodEntry(); + if (Kernel.isMappedMethod(methodEntry)) { //only do this for intrinsics - final Arg methodArgs[] = methodEntry.getArgs(); - if ((methodArgs.length > 0) && methodArgs[0].isArray()) { //currently array arg can only take slot 0 - final Instruction arrInstruction = invokeInstruction.getArg(0); - if (arrInstruction instanceof AccessField) { - final AccessField access = (AccessField) arrInstruction; - final FieldEntry field = access.getConstantPoolFieldEntry(); - final String accessedFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); - arrayFieldAssignments.add(accessedFieldName); - 
referencedFieldNames.add(accessedFieldName); - } - else { - throw new ClassParseException(ClassParseException.TYPE.ACCESSEDOBJECTSETTERARRAY); - } - } + if (Kernel.usesAtomic32(methodEntry)) { + setRequiresAtomics32Pragma(true); } + final Arg methodArgs[] = methodEntry.getArgs(); + if ((methodArgs.length > 0) && methodArgs[0].isArray()) { //currently array arg can only take slot 0 + final Instruction arrInstruction = invokeInstruction.getArg(0); + if (arrInstruction instanceof AccessField) { + final AccessField access = (AccessField) arrInstruction; + final FieldEntry field = access.getConstantPoolFieldEntry(); + final String accessedFieldName = field.getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); + arrayFieldAssignments.add(accessedFieldName); + referencedFieldNames.add(accessedFieldName); + } + else { + throw new ClassParseException(ClassParseException.TYPE.ACCESSEDOBJECTSETTERARRAY); + } + } } + } } } + } - for (final String referencedFieldName : referencedFieldNames) { + for (final String referencedFieldName : referencedFieldNames) { - try { - final Class<?> clazz = classModel.getClassWeAreModelling(); - final Field field = getFieldFromClassHierarchy(clazz, referencedFieldName); - if (field != null) { - referencedFields.add(field); - final ClassModelField ff = classModel.getField(referencedFieldName); - assert ff != null : "ff should not be null for " + clazz.getName() + "." + referencedFieldName; - referencedClassModelFields.add(ff); - } - } catch (final SecurityException e) { - e.printStackTrace(); + try { + final Class<?> clazz = classModel.getClassWeAreModelling(); + final Field field = getFieldFromClassHierarchy(clazz, referencedFieldName); + if (field != null) { + referencedFields.add(field); + final ClassModelField ff = classModel.getField(referencedFieldName); + assert ff != null : "ff should not be null for " + clazz.getName() + "." 
+ referencedFieldName; + referencedClassModelFields.add(ff); } + } catch (final SecurityException e) { + e.printStackTrace(); } + } - // Build data needed for oop form transforms if necessary - if (!objectArrayFieldsClasses.keySet().isEmpty()) { + // Build data needed for oop form transforms if necessary + if (!objectArrayFieldsClasses.keySet().isEmpty()) { - for (final ClassModel memberObjClass : objectArrayFieldsClasses.values()) { + for (final ClassModel memberObjClass : objectArrayFieldsClasses.values()) { - // At this point we have already done the field override safety check, so - // add all the superclass fields into the kernel member class to be - // sorted by size and emitted into the struct - ClassModel superModel = memberObjClass.getSuperClazz(); - while (superModel != null) { - if (logger.isLoggable(Level.FINEST)) { - logger.finest("adding = " + superModel.getClassWeAreModelling().getName() + " fields into " - + memberObjClass.getClassWeAreModelling().getName()); - } - memberObjClass.getStructMembers().addAll(superModel.getStructMembers()); - superModel = superModel.getSuperClazz(); + // At this point we have already done the field override safety check, so + // add all the superclass fields into the kernel member class to be + // sorted by size and emitted into the struct + ClassModel superModel = memberObjClass.getSuperClazz(); + while (superModel != null) { + if (logger.isLoggable(Level.FINEST)) { + logger.finest("adding = " + superModel.getClassWeAreModelling().getName() + " fields into " + + memberObjClass.getClassWeAreModelling().getName()); } + memberObjClass.getStructMembers().addAll(superModel.getStructMembers()); + superModel = superModel.getSuperClazz(); } + } - // Sort fields of each class biggest->smallest - final Comparator<FieldEntry> fieldSizeComparator = new Comparator<FieldEntry>(){ - @Override public int compare(FieldEntry aa, FieldEntry bb) { - final String aType = aa.getNameAndTypeEntry().getDescriptorUTF8Entry().getUTF8(); - final String bType = bb.getNameAndTypeEntry().getDescriptorUTF8Entry().getUTF8(); - - // Booleans get converted down to bytes - final int aSize = InstructionSet.TypeSpec.valueOf(aType.equals("Z") ? "B" : aType).getSize(); - final int bSize = InstructionSet.TypeSpec.valueOf(bType.equals("Z") ? "B" : bType).getSize(); + // Sort fields of each class biggest->smallest + final Comparator<FieldEntry> fieldSizeComparator = new Comparator<FieldEntry>(){ + @Override public int compare(FieldEntry aa, FieldEntry bb) { + final String aType = aa.getNameAndTypeEntry().getDescriptorUTF8Entry().getUTF8(); + final String bType = bb.getNameAndTypeEntry().getDescriptorUTF8Entry().getUTF8(); - if (logger.isLoggable(Level.FINEST)) { - logger.finest("aType= " + aType + " aSize= " + aSize + " . . bType= " + bType + " bSize= " + bSize); - } + // Booleans get converted down to bytes + final int aSize = InstructionSet.TypeSpec.valueOf(aType.equals("Z") ? "B" : aType).getSize(); + final int bSize = InstructionSet.TypeSpec.valueOf(bType.equals("Z") ? "B" : bType).getSize(); - // Note this is sorting in reverse order so the biggest is first - if (aSize > bSize) { - return -1; - } else if (aSize == bSize) { - return 0; - } else { - return 1; - } + if (logger.isLoggable(Level.FINEST)) { + logger.finest("aType= " + aType + " aSize= " + aSize + " . . 
bType= " + bType + " bSize= " + bSize); } - }; - - for (final ClassModel c : objectArrayFieldsClasses.values()) { - final ArrayList<FieldEntry> fields = c.getStructMembers(); - if (fields.size() > 0) { - Collections.sort(fields, fieldSizeComparator); - - // Now compute the total size for the struct - int totalSize = 0; - int alignTo = 0; - - for (final FieldEntry f : fields) { - // Record field offset for use while copying - // Get field we will copy out of the kernel member object - final Field rfield = getFieldFromClassHierarchy(c.getClassWeAreModelling(), f.getNameAndTypeEntry() - .getNameUTF8Entry().getUTF8()); - - c.getStructMemberOffsets().add(UnsafeWrapper.objectFieldOffset(rfield)); - - final String fType = f.getNameAndTypeEntry().getDescriptorUTF8Entry().getUTF8(); - //c.getStructMemberTypes().add(TypeSpec.valueOf(fType.equals("Z") ? "B" : fType)); - c.getStructMemberTypes().add(TypeSpec.valueOf(fType)); - final int fSize = TypeSpec.valueOf(fType.equals("Z") ? "B" : fType).getSize(); - if (fSize > alignTo) { - alignTo = fSize; - } - totalSize += fSize; - if (logger.isLoggable(Level.FINEST)) { - logger.finest("Field = " + f.getNameAndTypeEntry().getNameUTF8Entry().getUTF8() + " size=" + fSize - + " totalSize=" + totalSize); - } + // Note this is sorting in reverse order so the biggest is first + if (aSize > bSize) { + return -1; + } else if (aSize == bSize) { + return 0; + } else { + return 1; + } + } + }; + + for (final ClassModel c : objectArrayFieldsClasses.values()) { + final ArrayList<FieldEntry> fields = c.getStructMembers(); + if (fields.size() > 0) { + Collections.sort(fields, fieldSizeComparator); + + // Now compute the total size for the struct + int totalSize = 0; + int alignTo = 0; + + for (final FieldEntry f : fields) { + // Record field offset for use while copying + // Get field we will copy out of the kernel member object + final Field rfield = getFieldFromClassHierarchy(c.getClassWeAreModelling(), f.getNameAndTypeEntry() + .getNameUTF8Entry().getUTF8()); + + c.getStructMemberOffsets().add(UnsafeWrapper.objectFieldOffset(rfield)); + + final String fType = f.getNameAndTypeEntry().getDescriptorUTF8Entry().getUTF8(); + //c.getStructMemberTypes().add(TypeSpec.valueOf(fType.equals("Z") ? "B" : fType)); + c.getStructMemberTypes().add(TypeSpec.valueOf(fType)); + final int fSize = TypeSpec.valueOf(fType.equals("Z") ? 
"B" : fType).getSize(); + if (fSize > alignTo) { + alignTo = fSize; } - // compute total size for OpenCL buffer - int totalStructSize = 0; - if ((totalSize % alignTo) == 0) { - totalStructSize = totalSize; - } else { - // Pad up if necessary - totalStructSize = ((totalSize / alignTo) + 1) * alignTo; + totalSize += fSize; + if (logger.isLoggable(Level.FINEST)) { + logger.finest("Field = " + f.getNameAndTypeEntry().getNameUTF8Entry().getUTF8() + " size=" + fSize + + " totalSize=" + totalSize); } - c.setTotalStructSize(totalStructSize); } + + // compute total size for OpenCL buffer + int totalStructSize = 0; + if ((totalSize % alignTo) == 0) { + totalStructSize = totalSize; + } else { + // Pad up if necessary + totalStructSize = ((totalSize / alignTo) + 1) * alignTo; + } + c.setTotalStructSize(totalStructSize); } } - } } @@ -807,10 +798,6 @@ public class Entrypoint implements Cloneable { return method.getAccessorVariableFieldEntry(); } - public boolean shouldFallback() { - return (fallback); - } - public List<ClassModel.ClassModelField> getReferencedClassModelFields() { return (referencedClassModelFields); } diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Memoizer.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Memoizer.java index ece7e391574fb962f7f28d06e876e97693b2d970..7eec09b7e7a08a606d44712a714c86b6ab064fe8 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Memoizer.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Memoizer.java @@ -1,7 +1,7 @@ package com.amd.aparapi.internal.model; -import java.util.NoSuchElementException; -import java.util.concurrent.atomic.AtomicReference; +import java.util.*; +import java.util.concurrent.atomic.*; interface Optional<E> { final class Some<E> implements Optional<E>{ @@ -49,7 +49,7 @@ interface Optional<E> { boolean isPresent(); } -public interface Memoizer<T> extends Supplier<T>{ +public interface Memoizer<T> extends Supplier<T> { public final class Impl<T> implements Memoizer<T>{ private final Supplier<T> supplier; diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ValueCache.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ValueCache.java index ef66a53fdeca66f8da816f12f1d88e360d749303..63906ed0465b9d95150dc3923f05552d9aacaa90 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ValueCache.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/ValueCache.java @@ -1,9 +1,7 @@ package com.amd.aparapi.internal.model; -import java.lang.ref.Reference; -import java.lang.ref.SoftReference; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ConcurrentMap; +import java.lang.ref.*; +import java.util.concurrent.*; //import java.util.function.Supplier; @@ -14,7 +12,7 @@ public final class ValueCache<K, V, T extends Throwable> { } // @FunctionalInterface - public interface ValueComputer<K, V> extends ThrowingValueComputer<K, V, RuntimeException>{ + public interface ValueComputer<K, V> extends ThrowingValueComputer<K, V, RuntimeException> { // Marker interface } diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/opencl/OpenCLPlatform.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/opencl/OpenCLPlatform.java index 12b52360ef683e9bf74d2bf9a5f2a2b73d2092c0..1f8321336f6999aec5fc7540f65d32ab07cef2bd 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/opencl/OpenCLPlatform.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/opencl/OpenCLPlatform.java @@ -1,10 +1,9 @@ package 
com.amd.aparapi.internal.opencl; -import java.util.ArrayList; -import java.util.List; +import com.amd.aparapi.device.*; +import com.amd.aparapi.internal.jni.*; -import com.amd.aparapi.device.OpenCLDevice; -import com.amd.aparapi.internal.jni.OpenCLJNI; +import java.util.*; public class OpenCLPlatform extends OpenCLJNI{ @@ -18,6 +17,8 @@ public class OpenCLPlatform extends OpenCLJNI{ private final List<OpenCLDevice> devices = new ArrayList<OpenCLDevice>(); + private static List<OpenCLPlatform> platforms; + /** * Default constructor */ @@ -51,11 +52,14 @@ public class OpenCLPlatform extends OpenCLJNI{ } public List<OpenCLPlatform> getOpenCLPlatforms() { - if (OpenCLLoader.isOpenCLAvailable()) { - return (getPlatforms()); - } else { - return (new ArrayList<OpenCLPlatform>()); + if (platforms == null) { + if (OpenCLLoader.isOpenCLAvailable()) { + platforms = getPlatforms(); + } else { + return (Collections.EMPTY_LIST); + } } + return platforms; } public String getName() { diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/util/Reflection.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/util/Reflection.java new file mode 100644 index 0000000000000000000000000000000000000000..ba7a553a0b45d1828d25815cb823c8c34a378604 --- /dev/null +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/util/Reflection.java @@ -0,0 +1,18 @@ +package com.amd.aparapi.internal.util; + +/** + * Created by Barney on 03/09/2015. + */ +public class Reflection { + + /** Avoids getting dumb empty names for anonymous inners. */ + public static String getSimpleName(Class<?> klass) { + String simpleName = klass.getSimpleName(); + if (simpleName.isEmpty()) { + String fullName = klass.getName(); + int index = fullName.lastIndexOf('.'); + simpleName = (index < 0) ? fullName : fullName.substring(index + 1); + } + return simpleName; + } +} diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/writer/BlockWriter.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/writer/BlockWriter.java index 090c14542a9faad28a257043c40edee46834ef37..613e9401ba96f9e5ee2977bb2a971e4551099c8e 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/writer/BlockWriter.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/writer/BlockWriter.java @@ -1,803 +1,816 @@ -/* -Copyright (c) 2010-2011, Advanced Micro Devices, Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, are permitted provided that the -following conditions are met: - -Redistributions of source code must retain the above copyright notice, this list of conditions and the following -disclaimer. - -Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following -disclaimer in the documentation and/or other materials provided with the distribution. - -Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, -INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export -laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through -774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, -you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of -Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration -Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1, -E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups -D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject -to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774 -of EAR). For the most current Country Group listings, or for additional information about the EAR or your obligations -under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. 
- -*/ -package com.amd.aparapi.internal.writer; - -import com.amd.aparapi.*; -import com.amd.aparapi.internal.exception.*; -import com.amd.aparapi.internal.instruction.*; -import com.amd.aparapi.internal.instruction.BranchSet.LogicalExpressionNode; -import com.amd.aparapi.internal.instruction.InstructionSet.AccessInstanceField; -import com.amd.aparapi.internal.instruction.BranchSet.*; -import com.amd.aparapi.internal.instruction.InstructionSet.*; -import com.amd.aparapi.internal.model.ClassModel.ConstantPool.*; -import com.amd.aparapi.internal.model.ClassModel.*; -import com.amd.aparapi.internal.model.*; -import com.amd.aparapi.internal.model.ClassModel.ConstantPool.NameAndTypeEntry; - -import java.util.*; - -/** - * Base abstract class for converting <code>Aparapi</code> IR to text.<br/> - * - * - * @author gfrost - * - */ - -public abstract class BlockWriter{ - - public final static String arrayLengthMangleSuffix = "__javaArrayLength"; - - public final static String arrayDimMangleSuffix = "__javaArrayDimension"; - - public abstract void write(String _string); - - public void writeln(String _string) { - write(_string); - newLine(); - } - - public int indent = 0; - - public void in() { - indent++; - } - - public void out() { - indent--; - } - - public void newLine() { - write("\n"); - for (int i = 0; i < indent; i++) { - write(" "); - } - } - - public void writeConditionalBranch16(ConditionalBranch16 _branch16, boolean _invert) throws CodeGenException { - - if (_branch16 instanceof If) { - final If iff = (If) _branch16; - - writeInstruction(iff.getLhs()); - write(_branch16.getOperator().getText(_invert)); - writeInstruction(iff.getRhs()); - } else if (_branch16 instanceof I_IFNULL) { - final I_IFNULL iff = (I_IFNULL) _branch16; - writeInstruction(iff.getFirstChild()); - - if (_invert) { - write(" != NULL"); - } else { - write(" == NULL"); - } - - } else if (_branch16 instanceof I_IFNONNULL) { - final I_IFNONNULL iff = (I_IFNONNULL) _branch16; - writeInstruction(iff.getFirstChild()); - - if (_invert) { - write(" == NULL"); - } else { - write(" != NULL"); - } - } else if (_branch16 instanceof IfUnary) { - final IfUnary branch16 = (IfUnary) _branch16; - final Instruction comparison = branch16.getUnary(); - final ByteCode comparisonByteCode = comparison.getByteCode(); - final String comparisonOperator = _branch16.getOperator().getText(_invert); - - switch (comparisonByteCode) { - case FCMPG: - case DCMPG: - case FCMPL: - case DCMPL: - if (Config.verboseComparitor) { - write("/* bytecode=" + comparisonByteCode.getName() + " invert=" + _invert + "*/"); - } - writeInstruction(comparison.getFirstChild()); - write(comparisonOperator); - writeInstruction(comparison.getLastChild()); - break; - default: - if (Config.verboseComparitor) { - write("/* default bytecode=" + comparisonByteCode.getName() + " invert=" + _invert + "*/"); - } - writeInstruction(comparison); - write(comparisonOperator); - write("0"); - } - } - } - - public void writeComposite(CompositeInstruction instruction) throws CodeGenException { - if (instruction instanceof CompositeArbitraryScopeInstruction) { - newLine(); - - writeBlock(instruction.getFirstChild(), null); - } else if (instruction instanceof CompositeIfInstruction) { - newLine(); - write("if ("); - final Instruction blockStart = writeConditional(instruction.getBranchSet()); - - write(")"); - writeBlock(blockStart, null); - } else if (instruction instanceof CompositeIfElseInstruction) { - newLine(); - write("if ("); - final Instruction blockStart = 
writeConditional(instruction.getBranchSet()); - write(")"); - Instruction elseGoto = blockStart; - while (!(elseGoto.isBranch() && elseGoto.asBranch().isUnconditional())) { - elseGoto = elseGoto.getNextExpr(); - } - writeBlock(blockStart, elseGoto); - write(" else "); - writeBlock(elseGoto.getNextExpr(), null); - } else if (instruction instanceof CompositeForSunInstruction) { - newLine(); - write("for ("); - Instruction topBranch = instruction.getFirstChild(); - if (topBranch instanceof AssignToLocalVariable) { - writeInstruction(topBranch); - topBranch = topBranch.getNextExpr(); - } - write("; "); - final BranchSet branchSet = instruction.getBranchSet(); - final Instruction blockStart = writeConditional(branchSet); - - final Instruction lastGoto = instruction.getLastChild(); - - if (branchSet.getFallThrough() == lastGoto) { - // empty body no delta! - write(";){}"); - } else { - final Instruction delta = lastGoto.getPrevExpr(); - write("; "); - if (!(delta instanceof CompositeInstruction)) { - writeInstruction(delta); - write(")"); - writeBlock(blockStart, delta); - } else { - write("){"); - in(); - writeSequence(blockStart, delta); - - newLine(); - writeSequence(delta, delta.getNextExpr()); - out(); - newLine(); - write("}"); - - } - } - - } else if (instruction instanceof CompositeWhileInstruction) { - newLine(); - write("while ("); - final BranchSet branchSet = instruction.getBranchSet(); - final Instruction blockStart = writeConditional(branchSet); - write(")"); - final Instruction lastGoto = instruction.getLastChild(); - writeBlock(blockStart, lastGoto); - - } else if (instruction instanceof CompositeEmptyLoopInstruction) { - newLine(); - write("for ("); - Instruction topBranch = instruction.getFirstChild(); - if (topBranch instanceof AssignToLocalVariable) { - writeInstruction(topBranch); - topBranch = topBranch.getNextExpr(); - } - write("; "); - writeConditional(instruction.getBranchSet()); - write(";){}"); - - } else if (instruction instanceof CompositeForEclipseInstruction) { - newLine(); - write("for ("); - Instruction topGoto = instruction.getFirstChild(); - if (topGoto instanceof AssignToLocalVariable) { - writeInstruction(topGoto); - topGoto = topGoto.getNextExpr(); - } - write("; "); - Instruction last = instruction.getLastChild(); - while (last.getPrevExpr().isBranch()) { - last = last.getPrevExpr(); - } - writeConditional(instruction.getBranchSet(), true); - write("; "); - final Instruction delta = last.getPrevExpr(); - if (!(delta instanceof CompositeInstruction)) { - writeInstruction(delta); - write(")"); - writeBlock(topGoto.getNextExpr(), delta); - } else { - write("){"); - in(); - writeSequence(topGoto.getNextExpr(), delta); - - newLine(); - writeSequence(delta, delta.getNextExpr()); - out(); - newLine(); - write("}"); - - } - - } else if (instruction instanceof CompositeDoWhileInstruction) { - newLine(); - write("do"); - Instruction blockStart = instruction.getFirstChild(); - Instruction blockEnd = instruction.getLastChild(); - writeBlock(blockStart, blockEnd); - write("while("); - writeConditional(((CompositeInstruction) instruction).getBranchSet(), true); - write(");"); - newLine(); - } - } - - public void writeSequence(Instruction _first, Instruction _last) throws CodeGenException { - - for (Instruction instruction = _first; instruction != _last; instruction = instruction.getNextExpr()) { - if (instruction instanceof CompositeInstruction) { - writeComposite((CompositeInstruction) instruction); - } else if (!instruction.getByteCode().equals(ByteCode.NONE)) { - 
newLine(); - writeInstruction(instruction); - write(";"); - - } - } - - } - - protected void writeGetterBlock(FieldEntry accessorVariableFieldEntry) { - write("{"); - in(); - newLine(); - write("return this->"); - write(accessorVariableFieldEntry.getNameAndTypeEntry().getNameUTF8Entry().getUTF8()); - write(";"); - out(); - newLine(); - - write("}"); - } - - public void writeBlock(Instruction _first, Instruction _last) throws CodeGenException { - write("{"); - in(); - writeSequence(_first, _last); - out(); - newLine(); - - write("}"); - } - - public Instruction writeConditional(BranchSet _branchSet) throws CodeGenException { - return (writeConditional(_branchSet, false)); - } - - public Instruction writeConditional(BranchSet _branchSet, boolean _invert) throws CodeGenException { - - final LogicalExpressionNode logicalExpression = _branchSet.getLogicalExpression(); - write(_invert ? logicalExpression : logicalExpression.cloneInverted()); - return (_branchSet.getLast().getNextExpr()); - } - - public void write(LogicalExpressionNode _node) throws CodeGenException { - if (_node instanceof SimpleLogicalExpressionNode) { - final SimpleLogicalExpressionNode sn = (SimpleLogicalExpressionNode) _node; - - writeConditionalBranch16((ConditionalBranch16) sn.getBranch(), sn.isInvert()); - } else { - final CompoundLogicalExpressionNode ln = (CompoundLogicalExpressionNode) _node; - boolean needParenthesis = false; - final CompoundLogicalExpressionNode parent = (CompoundLogicalExpressionNode) ln.getParent(); - if (parent != null) { - if (!ln.isAnd() && parent.isAnd()) { - needParenthesis = true; - } - } - if (needParenthesis) { - - write("("); - } - write(ln.getLhs()); - write(ln.isAnd() ? " && " : " || "); - write(ln.getRhs()); - if (needParenthesis) { - - write(")"); - } - } - } - - public String convertType(String _typeDesc, boolean useClassModel) { - return (_typeDesc); - } - - public String convertCast(String _cast) { - // Strip parens off cast - //System.out.println("cast = " + _cast); - final String raw = convertType(_cast.substring(1, _cast.length() - 1), false); - return ("(" + raw + ")"); - } - - public void writeInstruction(Instruction _instruction) throws CodeGenException { - if (_instruction instanceof CompositeIfElseInstruction) { - write("("); - final Instruction lhs = writeConditional(((CompositeInstruction) _instruction).getBranchSet()); - write(")?"); - writeInstruction(lhs); - write(":"); - writeInstruction(lhs.getNextExpr().getNextExpr()); - } else if (_instruction instanceof CompositeInstruction) { - writeComposite((CompositeInstruction) _instruction); - - } else if (_instruction instanceof AssignToLocalVariable) { - final AssignToLocalVariable assignToLocalVariable = (AssignToLocalVariable) _instruction; - - final LocalVariableInfo localVariableInfo = assignToLocalVariable.getLocalVariableInfo(); - if (assignToLocalVariable.isDeclaration()) { - final String descriptor = localVariableInfo.getVariableDescriptor(); - // Arrays always map to __global arrays - if (descriptor.startsWith("[")) { - write(" __global "); - } - write(convertType(descriptor, true)); - } - if (localVariableInfo == null) { - throw new CodeGenException("outOfScope" + _instruction.getThisPC() + " = "); - } else { - write(localVariableInfo.getVariableName() + " = "); - } - - for (Instruction operand = _instruction.getFirstChild(); operand != null; operand = operand.getNextExpr()) { - writeInstruction(operand); - } - - } else if (_instruction instanceof AssignToArrayElement) { - final AssignToArrayElement 
arrayAssignmentInstruction = (AssignToArrayElement) _instruction; - writeInstruction(arrayAssignmentInstruction.getArrayRef()); - write("["); - writeInstruction(arrayAssignmentInstruction.getArrayIndex()); - write("]"); - write(" "); - write(" = "); - writeInstruction(arrayAssignmentInstruction.getValue()); - } else if (_instruction instanceof AccessArrayElement) { - - //we're getting an element from an array - //if the array is a primitive then we just return the value - //so the generated code looks like - //arrayName[arrayIndex]; - //but if the array is an object, or multidimensional array, then we want to return - //a pointer to our index our position in the array. The code will look like - //&(arrayName[arrayIndex * this->arrayNameLen_dimension] - // - final AccessArrayElement arrayLoadInstruction = (AccessArrayElement) _instruction; - - //object array, get address - boolean isMultiDimensional = arrayLoadInstruction instanceof I_AALOAD && isMultiDimensionalArray(arrayLoadInstruction); - if (isMultiDimensional) { - write("(&"); - } - writeInstruction(arrayLoadInstruction.getArrayRef()); - write("["); - writeInstruction(arrayLoadInstruction.getArrayIndex()); - - //object array, find the size of each object in the array - //for 2D arrays, this size is the size of a row. - if (isMultiDimensional) { - int dim = 0; - Instruction load = arrayLoadInstruction.getArrayRef(); - while (load instanceof I_AALOAD) { - load = load.getFirstChild(); - dim++; - } - - NameAndTypeEntry nameAndTypeEntry = ((AccessInstanceField) load).getConstantPoolFieldEntry().getNameAndTypeEntry(); - if (isMultiDimensionalArray(nameAndTypeEntry)) { - String arrayName = nameAndTypeEntry.getNameUTF8Entry().getUTF8(); - write(" * this->" + arrayName + arrayDimMangleSuffix + dim); - } - } - - write("]"); - - //object array, close parentheses - if (isMultiDimensional) { - write(")"); - } - } else if (_instruction instanceof AccessField) { - final AccessField accessField = (AccessField) _instruction; - if (accessField instanceof AccessInstanceField) { - Instruction accessInstanceField = ((AccessInstanceField) accessField).getInstance(); - if (accessInstanceField instanceof CloneInstruction) { - accessInstanceField = ((CloneInstruction) accessInstanceField).getReal(); - } - if (!(accessInstanceField instanceof I_ALOAD_0)) { - writeInstruction(accessInstanceField); - write("."); - } else { - writeThisRef(); - } - } - write(accessField.getConstantPoolFieldEntry().getNameAndTypeEntry().getNameUTF8Entry().getUTF8()); - - } else if (_instruction instanceof I_ARRAYLENGTH) { - - //getting the length of an array. - //if this is a primitive array, then this is trivial - //if we're getting an object array, then we need to find what dimension - //we're looking at - int dim = 0; - Instruction load = _instruction.getFirstChild(); - while (load instanceof I_AALOAD) { - load = load.getFirstChild(); - dim++; - } - NameAndTypeEntry nameAndTypeEntry = ((AccessInstanceField) load).getConstantPoolFieldEntry().getNameAndTypeEntry(); - final String arrayName = nameAndTypeEntry.getNameUTF8Entry().getUTF8(); - String dimSuffix = isMultiDimensionalArray(nameAndTypeEntry) ? 
Integer.toString(dim) : ""; - write("this->" + arrayName + arrayLengthMangleSuffix + dimSuffix); - } else if (_instruction instanceof AssignToField) { - final AssignToField assignedField = (AssignToField) _instruction; - - if (assignedField instanceof AssignToInstanceField) { - final Instruction accessInstanceField = ((AssignToInstanceField) assignedField).getInstance().getReal(); - - if (!(accessInstanceField instanceof I_ALOAD_0)) { - writeInstruction(accessInstanceField); - write("."); - } else { - writeThisRef(); - } - } - write(assignedField.getConstantPoolFieldEntry().getNameAndTypeEntry().getNameUTF8Entry().getUTF8()); - write("="); - writeInstruction(assignedField.getValueToAssign()); - } else if (_instruction instanceof Constant<?>) { - final Constant<?> constantInstruction = (Constant<?>) _instruction; - final Object value = constantInstruction.getValue(); - - if (value instanceof Float) { - - final Float f = (Float) value; - if (f.isNaN()) { - write("NAN"); - } else if (f.isInfinite()) { - if (f < 0) { - write("-"); - } - write("INFINITY"); - } else { - write(value.toString()); - write("f"); - } - } else if (value instanceof Double) { - - final Double d = (Double) value; - if (d.isNaN()) { - write("NAN"); - } else if (d.isInfinite()) { - if (d < 0) { - write("-"); - } - write("INFINITY"); - } else { - write(value.toString()); - } - } else { - write(value.toString()); - if (value instanceof Long) { - write("L"); - } - } - - } else if (_instruction instanceof AccessLocalVariable) { - final AccessLocalVariable localVariableLoadInstruction = (AccessLocalVariable) _instruction; - final LocalVariableInfo localVariable = localVariableLoadInstruction.getLocalVariableInfo(); - write(localVariable.getVariableName()); - } else if (_instruction instanceof I_IINC) { - final I_IINC location = (I_IINC) _instruction; - final LocalVariableInfo localVariable = location.getLocalVariableInfo(); - final int adjust = location.getAdjust(); - - write(localVariable.getVariableName()); - if (adjust == 1) { - write("++"); - } else if (adjust == -1) { - write("--"); - } else if (adjust > 1) { - write("+=" + adjust); - } else if (adjust < -1) { - write("-=" + (-adjust)); - } - } else if (_instruction instanceof BinaryOperator) { - final BinaryOperator binaryInstruction = (BinaryOperator) _instruction; - final Instruction parent = binaryInstruction.getParentExpr(); - boolean needsParenthesis = true; - - if (parent instanceof AssignToLocalVariable) { - needsParenthesis = false; - } else if (parent instanceof AssignToField) { - needsParenthesis = false; - } else if (parent instanceof AssignToArrayElement) { - needsParenthesis = false; - } else { - /** - if (parent instanceof BinaryOperator) { - BinaryOperator parentBinaryOperator = (BinaryOperator) parent; - if (parentBinaryOperator.getOperator().ordinal() > binaryInstruction.getOperator().ordinal()) { - needsParenthesis = false; - } - } - **/ - } - - if (needsParenthesis) { - write("("); - } - - writeInstruction(binaryInstruction.getLhs()); - - write(" " + binaryInstruction.getOperator().getText() + " "); - writeInstruction(binaryInstruction.getRhs()); - - if (needsParenthesis) { - write(")"); - } - - } else if (_instruction instanceof CastOperator) { - final CastOperator castInstruction = (CastOperator) _instruction; - // write("("); - write(convertCast(castInstruction.getOperator().getText())); - - writeInstruction(castInstruction.getUnary()); - // write(")"); - } else if (_instruction instanceof UnaryOperator) { - final UnaryOperator unaryInstruction = 
(UnaryOperator) _instruction; - // write("("); - write(unaryInstruction.getOperator().getText()); - - writeInstruction(unaryInstruction.getUnary()); - // write(")"); - } else if (_instruction instanceof Return) { - - final Return ret = (Return) _instruction; - write("return"); - if (ret.getStackConsumeCount() > 0) { - write("("); - writeInstruction(ret.getFirstChild()); - write(")"); - } - - } else if (_instruction instanceof MethodCall) { - final MethodCall methodCall = (MethodCall) _instruction; - - final MethodEntry methodEntry = methodCall.getConstantPoolMethodEntry(); - - writeMethod(methodCall, methodEntry); - } else if (_instruction.getByteCode().equals(ByteCode.CLONE)) { - final CloneInstruction cloneInstruction = (CloneInstruction) _instruction; - writeInstruction(cloneInstruction.getReal()); - } else if (_instruction.getByteCode().equals(ByteCode.INCREMENT)) { - final IncrementInstruction incrementInstruction = (IncrementInstruction) _instruction; - - if (incrementInstruction.isPre()) { - if (incrementInstruction.isInc()) { - write("++"); - } else { - write("--"); - } - } - - writeInstruction(incrementInstruction.getFieldOrVariableReference()); - if (!incrementInstruction.isPre()) { - if (incrementInstruction.isInc()) { - write("++"); - } else { - write("--"); - } - } - } else if (_instruction.getByteCode().equals(ByteCode.MULTI_ASSIGN)) { - final MultiAssignInstruction multiAssignInstruction = (MultiAssignInstruction) _instruction; - AssignToLocalVariable from = (AssignToLocalVariable) multiAssignInstruction.getFrom(); - final AssignToLocalVariable last = (AssignToLocalVariable) multiAssignInstruction.getTo(); - final Instruction common = multiAssignInstruction.getCommon(); - final Stack<AssignToLocalVariable> stack = new Stack<AssignToLocalVariable>(); - - while (from != last) { - stack.push(from); - from = (AssignToLocalVariable) ((Instruction) from).getNextExpr(); - } - - for (AssignToLocalVariable alv = stack.pop(); alv != null; alv = stack.size() > 0 ? stack.pop() : null) { - - final LocalVariableInfo localVariableInfo = alv.getLocalVariableInfo(); - if (alv.isDeclaration()) { - write(convertType(localVariableInfo.getVariableDescriptor(), true)); - } - if (localVariableInfo == null) { - throw new CodeGenException("outOfScope" + _instruction.getThisPC() + " = "); - } else { - write(localVariableInfo.getVariableName() + " = "); - } - - } - writeInstruction(common); - } else if (_instruction.getByteCode().equals(ByteCode.INLINE_ASSIGN)) { - final InlineAssignInstruction inlineAssignInstruction = (InlineAssignInstruction) _instruction; - final AssignToLocalVariable assignToLocalVariable = inlineAssignInstruction.getAssignToLocalVariable(); - - final LocalVariableInfo localVariableInfo = assignToLocalVariable.getLocalVariableInfo(); - if (assignToLocalVariable.isDeclaration()) { - // this is bad! 
we need a general way to hoist up a required declaration - throw new CodeGenException("/* we can't declare this " + convertType(localVariableInfo.getVariableDescriptor(), true) - + " here */"); - } - write(localVariableInfo.getVariableName()); - write("="); - writeInstruction(inlineAssignInstruction.getRhs()); - } else if (_instruction.getByteCode().equals(ByteCode.FIELD_ARRAY_ELEMENT_ASSIGN)) { - final FieldArrayElementAssign inlineAssignInstruction = (FieldArrayElementAssign) _instruction; - final AssignToArrayElement arrayAssignmentInstruction = inlineAssignInstruction.getAssignToArrayElement(); - - writeInstruction(arrayAssignmentInstruction.getArrayRef()); - write("["); - writeInstruction(arrayAssignmentInstruction.getArrayIndex()); - write("]"); - write(" "); - write(" = "); - - writeInstruction(inlineAssignInstruction.getRhs()); - } else if (_instruction.getByteCode().equals(ByteCode.FIELD_ARRAY_ELEMENT_INCREMENT)) { - - final FieldArrayElementIncrement fieldArrayElementIncrement = (FieldArrayElementIncrement) _instruction; - final AssignToArrayElement arrayAssignmentInstruction = fieldArrayElementIncrement.getAssignToArrayElement(); - if (fieldArrayElementIncrement.isPre()) { - if (fieldArrayElementIncrement.isInc()) { - write("++"); - } else { - write("--"); - } - } - writeInstruction(arrayAssignmentInstruction.getArrayRef()); - - write("["); - writeInstruction(arrayAssignmentInstruction.getArrayIndex()); - write("]"); - if (!fieldArrayElementIncrement.isPre()) { - if (fieldArrayElementIncrement.isInc()) { - write("++"); - } else { - write("--"); - } - } - - } else if (_instruction.getByteCode().equals(ByteCode.NONE)) { - // we are done - } else if (_instruction instanceof Branch) { - throw new CodeGenException(String.format("%s -> %04d", _instruction.getByteCode().toString().toLowerCase(), - ((Branch) _instruction).getTarget().getThisPC())); - } else if (_instruction instanceof I_POP) { - //POP discarded void call return? 
- writeInstruction(_instruction.getFirstChild()); - } else { - throw new CodeGenException(String.format("%s", _instruction.getByteCode().toString().toLowerCase())); - } - - } - - private boolean isMultiDimensionalArray(NameAndTypeEntry nameAndTypeEntry) { - return nameAndTypeEntry.getDescriptorUTF8Entry().getUTF8().startsWith("[["); - } - - private boolean isObjectArray(NameAndTypeEntry nameAndTypeEntry) { - return nameAndTypeEntry.getDescriptorUTF8Entry().getUTF8().startsWith("[L"); - } - - private boolean isMultiDimensionalArray(final AccessArrayElement arrayLoadInstruction) { - AccessInstanceField accessInstanceField = getUltimateInstanceFieldAccess(arrayLoadInstruction); - return isMultiDimensionalArray(accessInstanceField.getConstantPoolFieldEntry().getNameAndTypeEntry()); - } - - private boolean isObjectArray(final AccessArrayElement arrayLoadInstruction) { - AccessInstanceField accessInstanceField = getUltimateInstanceFieldAccess(arrayLoadInstruction); - return isObjectArray(accessInstanceField.getConstantPoolFieldEntry().getNameAndTypeEntry()); - } - - private AccessInstanceField getUltimateInstanceFieldAccess(final AccessArrayElement arrayLoadInstruction) { - Instruction load = arrayLoadInstruction.getArrayRef(); - while (load instanceof I_AALOAD) { - load = load.getFirstChild(); - } - - return (AccessInstanceField) load; - } - - public void writeMethod(MethodCall _methodCall, MethodEntry _methodEntry) throws CodeGenException { - boolean noCL = _methodEntry.getOwnerClassModel().getNoCLMethods() - .contains(_methodEntry.getNameAndTypeEntry().getNameUTF8Entry().getUTF8()); - if (noCL) { - return; - } - - if (_methodCall instanceof VirtualMethodCall) { - final Instruction instanceInstruction = ((VirtualMethodCall) _methodCall).getInstanceReference(); - if (!(instanceInstruction instanceof I_ALOAD_0)) { - writeInstruction(instanceInstruction); - write("."); - } else { - writeThisRef(); - } - } - final int argc = _methodEntry.getStackConsumeCount(); - write(_methodEntry.getNameAndTypeEntry().getNameUTF8Entry().getUTF8()); - write("("); - - for (int arg = 0; arg < argc; arg++) { - if (arg != 0) { - write(", "); - } - writeInstruction(_methodCall.getArg(arg)); - } - write(")"); - - } - - public void writeThisRef() { - write("this."); - } - - public void writeMethodBody(MethodModel _methodModel) throws CodeGenException { - if (_methodModel.isGetter() && !_methodModel.isNoCL()) { - FieldEntry accessorVariableFieldEntry = _methodModel.getAccessorVariableFieldEntry(); - writeGetterBlock(accessorVariableFieldEntry); - } else { - writeBlock(_methodModel.getExprHead(), null); - } - } - - public abstract void write(Entrypoint entryPoint) throws CodeGenException; -} +/* +Copyright (c) 2010-2011, Advanced Micro Devices, Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the +following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list of conditions and the following +disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following +disclaimer in the documentation and/or other materials provided with the distribution. + +Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export +laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through +774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, +you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of +Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration +Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1, +E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups +D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject +to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774 +of EAR). For the most current Country Group listings, or for additional information about the EAR or your obligations +under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. 
+ +*/ +package com.amd.aparapi.internal.writer; + +import com.amd.aparapi.*; +import com.amd.aparapi.internal.exception.*; +import com.amd.aparapi.internal.instruction.*; +import com.amd.aparapi.internal.instruction.BranchSet.LogicalExpressionNode; +import com.amd.aparapi.internal.instruction.InstructionSet.AccessInstanceField; +import com.amd.aparapi.internal.instruction.BranchSet.*; +import com.amd.aparapi.internal.instruction.InstructionSet.*; +import com.amd.aparapi.internal.model.ClassModel.ConstantPool.*; +import com.amd.aparapi.internal.model.ClassModel.*; +import com.amd.aparapi.internal.model.*; +import com.amd.aparapi.internal.model.ClassModel.ConstantPool.NameAndTypeEntry; + +import java.util.*; + +/** + * Base abstract class for converting <code>Aparapi</code> IR to text.<br/> + * + * + * @author gfrost + * + */ + +public abstract class BlockWriter{ + + public final static String arrayLengthMangleSuffix = "__javaArrayLength"; + + public final static String arrayDimMangleSuffix = "__javaArrayDimension"; + + public abstract void write(String _string); + + public void writeln(String _string) { + write(_string); + newLine(); + } + + public int indent = 0; + + public void in() { + indent++; + } + + public void out() { + indent--; + } + + public void newLine() { + write("\n"); + for (int i = 0; i < indent; i++) { + write(" "); + } + } + + public void writeConditionalBranch16(ConditionalBranch16 _branch16, boolean _invert) throws CodeGenException { + + if (_branch16 instanceof If) { + final If iff = (If) _branch16; + + writeInstruction(iff.getLhs()); + write(_branch16.getOperator().getText(_invert)); + writeInstruction(iff.getRhs()); + } else if (_branch16 instanceof I_IFNULL) { + final I_IFNULL iff = (I_IFNULL) _branch16; + writeInstruction(iff.getFirstChild()); + + if (_invert) { + write(" != NULL"); + } else { + write(" == NULL"); + } + + } else if (_branch16 instanceof I_IFNONNULL) { + final I_IFNONNULL iff = (I_IFNONNULL) _branch16; + writeInstruction(iff.getFirstChild()); + + if (_invert) { + write(" == NULL"); + } else { + write(" != NULL"); + } + } else if (_branch16 instanceof IfUnary) { + final IfUnary branch16 = (IfUnary) _branch16; + final Instruction comparison = branch16.getUnary(); + final ByteCode comparisonByteCode = comparison.getByteCode(); + final String comparisonOperator = _branch16.getOperator().getText(_invert); + + switch (comparisonByteCode) { + case FCMPG: + case DCMPG: + case FCMPL: + case DCMPL: + if (Config.verboseComparitor) { + write("/* bytecode=" + comparisonByteCode.getName() + " invert=" + _invert + "*/"); + } + writeInstruction(comparison.getFirstChild()); + write(comparisonOperator); + writeInstruction(comparison.getLastChild()); + break; + default: + if (Config.verboseComparitor) { + write("/* default bytecode=" + comparisonByteCode.getName() + " invert=" + _invert + "*/"); + } + writeInstruction(comparison); + write(comparisonOperator); + write("0"); + } + } + } + + public void writeComposite(CompositeInstruction instruction) throws CodeGenException { + if (instruction instanceof CompositeArbitraryScopeInstruction) { + newLine(); + + writeBlock(instruction.getFirstChild(), null); + } else if (instruction instanceof CompositeIfInstruction) { + newLine(); + write("if ("); + final Instruction blockStart = writeConditional(instruction.getBranchSet()); + + write(")"); + writeBlock(blockStart, null); + } else if (instruction instanceof CompositeIfElseInstruction) { + newLine(); + write("if ("); + final Instruction blockStart = 
writeConditional(instruction.getBranchSet()); + write(")"); + Instruction elseGoto = blockStart; + while (!(elseGoto.isBranch() && elseGoto.asBranch().isUnconditional())) { + elseGoto = elseGoto.getNextExpr(); + } + writeBlock(blockStart, elseGoto); + write(" else "); + writeBlock(elseGoto.getNextExpr(), null); + } else if (instruction instanceof CompositeForSunInstruction) { + newLine(); + write("for ("); + Instruction topBranch = instruction.getFirstChild(); + if (topBranch instanceof AssignToLocalVariable) { + writeInstruction(topBranch); + topBranch = topBranch.getNextExpr(); + } + write("; "); + final BranchSet branchSet = instruction.getBranchSet(); + final Instruction blockStart = writeConditional(branchSet); + + final Instruction lastGoto = instruction.getLastChild(); + + if (branchSet.getFallThrough() == lastGoto) { + // empty body no delta! + write(";){}"); + } else { + final Instruction delta = lastGoto.getPrevExpr(); + write("; "); + if (!(delta instanceof CompositeInstruction)) { + writeInstruction(delta); + write(")"); + writeBlock(blockStart, delta); + } else { + write("){"); + in(); + writeSequence(blockStart, delta); + + newLine(); + writeSequence(delta, delta.getNextExpr()); + out(); + newLine(); + write("}"); + + } + } + + } else if (instruction instanceof CompositeWhileInstruction) { + newLine(); + write("while ("); + final BranchSet branchSet = instruction.getBranchSet(); + final Instruction blockStart = writeConditional(branchSet); + write(")"); + final Instruction lastGoto = instruction.getLastChild(); + writeBlock(blockStart, lastGoto); + + } else if (instruction instanceof CompositeEmptyLoopInstruction) { + newLine(); + write("for ("); + Instruction topBranch = instruction.getFirstChild(); + if (topBranch instanceof AssignToLocalVariable) { + writeInstruction(topBranch); + topBranch = topBranch.getNextExpr(); + } + write("; "); + writeConditional(instruction.getBranchSet()); + write(";){}"); + + } else if (instruction instanceof CompositeForEclipseInstruction) { + newLine(); + write("for ("); + Instruction topGoto = instruction.getFirstChild(); + if (topGoto instanceof AssignToLocalVariable) { + writeInstruction(topGoto); + topGoto = topGoto.getNextExpr(); + } + write("; "); + Instruction last = instruction.getLastChild(); + while (last.getPrevExpr().isBranch()) { + last = last.getPrevExpr(); + } + writeConditional(instruction.getBranchSet(), true); + write("; "); + final Instruction delta = last.getPrevExpr(); + if (!(delta instanceof CompositeInstruction)) { + writeInstruction(delta); + write(")"); + writeBlock(topGoto.getNextExpr(), delta); + } else { + write("){"); + in(); + writeSequence(topGoto.getNextExpr(), delta); + + newLine(); + writeSequence(delta, delta.getNextExpr()); + out(); + newLine(); + write("}"); + + } + + } else if (instruction instanceof CompositeDoWhileInstruction) { + newLine(); + write("do"); + Instruction blockStart = instruction.getFirstChild(); + Instruction blockEnd = instruction.getLastChild(); + writeBlock(blockStart, blockEnd); + write("while("); + writeConditional(((CompositeInstruction) instruction).getBranchSet(), true); + write(");"); + newLine(); + } + } + + public void writeSequence(Instruction _first, Instruction _last) throws CodeGenException { + + for (Instruction instruction = _first; instruction != _last; instruction = instruction.getNextExpr()) { + if (instruction instanceof CompositeInstruction) { + writeComposite((CompositeInstruction) instruction); + } else if (!instruction.getByteCode().equals(ByteCode.NONE)) { + 
newLine(); + writeInstruction(instruction); + write(";"); + + } + } + + } + + protected void writeGetterBlock(FieldEntry accessorVariableFieldEntry) { + write("{"); + in(); + newLine(); + write("return this->"); + write(accessorVariableFieldEntry.getNameAndTypeEntry().getNameUTF8Entry().getUTF8()); + write(";"); + out(); + newLine(); + + write("}"); + } + + public void writeBlock(Instruction _first, Instruction _last) throws CodeGenException { + write("{"); + in(); + writeSequence(_first, _last); + out(); + newLine(); + + write("}"); + } + + public Instruction writeConditional(BranchSet _branchSet) throws CodeGenException { + return (writeConditional(_branchSet, false)); + } + + public Instruction writeConditional(BranchSet _branchSet, boolean _invert) throws CodeGenException { + + final LogicalExpressionNode logicalExpression = _branchSet.getLogicalExpression(); + write(_invert ? logicalExpression : logicalExpression.cloneInverted()); + return (_branchSet.getLast().getNextExpr()); + } + + public void write(LogicalExpressionNode _node) throws CodeGenException { + if (_node instanceof SimpleLogicalExpressionNode) { + final SimpleLogicalExpressionNode sn = (SimpleLogicalExpressionNode) _node; + + writeConditionalBranch16((ConditionalBranch16) sn.getBranch(), sn.isInvert()); + } else { + final CompoundLogicalExpressionNode ln = (CompoundLogicalExpressionNode) _node; + boolean needParenthesis = false; + final CompoundLogicalExpressionNode parent = (CompoundLogicalExpressionNode) ln.getParent(); + if (parent != null) { + if (!ln.isAnd() && parent.isAnd()) { + needParenthesis = true; + } + } + if (needParenthesis) { + + write("("); + } + write(ln.getLhs()); + write(ln.isAnd() ? " && " : " || "); + write(ln.getRhs()); + if (needParenthesis) { + + write(")"); + } + } + } + + public String convertType(String _typeDesc, boolean useClassModel) { + return (_typeDesc); + } + + public String convertCast(String _cast) { + // Strip parens off cast + //System.out.println("cast = " + _cast); + final String raw = convertType(_cast.substring(1, _cast.length() - 1), false); + return ("(" + raw + ")"); + } + + public void writeInstruction(Instruction _instruction) throws CodeGenException { + if (_instruction instanceof CompositeIfElseInstruction) { + boolean needParenthesis = isNeedParenthesis(_instruction); + if(needParenthesis){ + write("("); + } + write("("); + final Instruction lhs = writeConditional(((CompositeInstruction) _instruction).getBranchSet()); + write(")?"); + writeInstruction(lhs); + write(":"); + writeInstruction(lhs.getNextExpr().getNextExpr()); + if(needParenthesis){ + write(")"); + } + } else if (_instruction instanceof CompositeInstruction) { + writeComposite((CompositeInstruction) _instruction); + + } else if (_instruction instanceof AssignToLocalVariable) { + final AssignToLocalVariable assignToLocalVariable = (AssignToLocalVariable) _instruction; + + final LocalVariableInfo localVariableInfo = assignToLocalVariable.getLocalVariableInfo(); + if (assignToLocalVariable.isDeclaration()) { + final String descriptor = localVariableInfo.getVariableDescriptor(); + // Arrays always map to __global arrays + if (descriptor.startsWith("[")) { + write(" __global "); + } + write(convertType(descriptor, true)); + } + if (localVariableInfo == null) { + throw new CodeGenException("outOfScope" + _instruction.getThisPC() + " = "); + } else { + write(localVariableInfo.getVariableName() + " = "); + } + + for (Instruction operand = _instruction.getFirstChild(); operand != null; operand = 
operand.getNextExpr()) { + writeInstruction(operand); + } + + } else if (_instruction instanceof AssignToArrayElement) { + final AssignToArrayElement arrayAssignmentInstruction = (AssignToArrayElement) _instruction; + writeInstruction(arrayAssignmentInstruction.getArrayRef()); + write("["); + writeInstruction(arrayAssignmentInstruction.getArrayIndex()); + write("]"); + write(" "); + write(" = "); + writeInstruction(arrayAssignmentInstruction.getValue()); + } else if (_instruction instanceof AccessArrayElement) { + + //we're getting an element from an array + //if the array is a primitive then we just return the value + //so the generated code looks like + //arrayName[arrayIndex]; + //but if the array is an object, or multidimensional array, then we want to return + //a pointer to our index our position in the array. The code will look like + //&(arrayName[arrayIndex * this->arrayNameLen_dimension] + // + final AccessArrayElement arrayLoadInstruction = (AccessArrayElement) _instruction; + + //object array, get address + boolean isMultiDimensional = arrayLoadInstruction instanceof I_AALOAD && isMultiDimensionalArray(arrayLoadInstruction); + if (isMultiDimensional) { + write("(&"); + } + writeInstruction(arrayLoadInstruction.getArrayRef()); + write("["); + writeInstruction(arrayLoadInstruction.getArrayIndex()); + + //object array, find the size of each object in the array + //for 2D arrays, this size is the size of a row. + if (isMultiDimensional) { + int dim = 0; + Instruction load = arrayLoadInstruction.getArrayRef(); + while (load instanceof I_AALOAD) { + load = load.getFirstChild(); + dim++; + } + + NameAndTypeEntry nameAndTypeEntry = ((AccessInstanceField) load).getConstantPoolFieldEntry().getNameAndTypeEntry(); + if (isMultiDimensionalArray(nameAndTypeEntry)) { + String arrayName = nameAndTypeEntry.getNameUTF8Entry().getUTF8(); + write(" * this->" + arrayName + arrayDimMangleSuffix + dim); + } + } + + write("]"); + + //object array, close parentheses + if (isMultiDimensional) { + write(")"); + } + } else if (_instruction instanceof AccessField) { + final AccessField accessField = (AccessField) _instruction; + if (accessField instanceof AccessInstanceField) { + Instruction accessInstanceField = ((AccessInstanceField) accessField).getInstance(); + if (accessInstanceField instanceof CloneInstruction) { + accessInstanceField = ((CloneInstruction) accessInstanceField).getReal(); + } + if (!(accessInstanceField instanceof I_ALOAD_0)) { + writeInstruction(accessInstanceField); + write("."); + } else { + writeThisRef(); + } + } + write(accessField.getConstantPoolFieldEntry().getNameAndTypeEntry().getNameUTF8Entry().getUTF8()); + + } else if (_instruction instanceof I_ARRAYLENGTH) { + + //getting the length of an array. + //if this is a primitive array, then this is trivial + //if we're getting an object array, then we need to find what dimension + //we're looking at + int dim = 0; + Instruction load = _instruction.getFirstChild(); + while (load instanceof I_AALOAD) { + load = load.getFirstChild(); + dim++; + } + NameAndTypeEntry nameAndTypeEntry = ((AccessInstanceField) load).getConstantPoolFieldEntry().getNameAndTypeEntry(); + final String arrayName = nameAndTypeEntry.getNameUTF8Entry().getUTF8(); + String dimSuffix = isMultiDimensionalArray(nameAndTypeEntry) ? 
Integer.toString(dim) : ""; + write("this->" + arrayName + arrayLengthMangleSuffix + dimSuffix); + } else if (_instruction instanceof AssignToField) { + final AssignToField assignedField = (AssignToField) _instruction; + + if (assignedField instanceof AssignToInstanceField) { + final Instruction accessInstanceField = ((AssignToInstanceField) assignedField).getInstance().getReal(); + + if (!(accessInstanceField instanceof I_ALOAD_0)) { + writeInstruction(accessInstanceField); + write("."); + } else { + writeThisRef(); + } + } + write(assignedField.getConstantPoolFieldEntry().getNameAndTypeEntry().getNameUTF8Entry().getUTF8()); + write("="); + writeInstruction(assignedField.getValueToAssign()); + } else if (_instruction instanceof Constant<?>) { + final Constant<?> constantInstruction = (Constant<?>) _instruction; + final Object value = constantInstruction.getValue(); + + if (value instanceof Float) { + + final Float f = (Float) value; + if (f.isNaN()) { + write("NAN"); + } else if (f.isInfinite()) { + if (f < 0) { + write("-"); + } + write("INFINITY"); + } else { + write(value.toString()); + write("f"); + } + } else if (value instanceof Double) { + + final Double d = (Double) value; + if (d.isNaN()) { + write("NAN"); + } else if (d.isInfinite()) { + if (d < 0) { + write("-"); + } + write("INFINITY"); + } else { + write(value.toString()); + } + } else { + write(value.toString()); + if (value instanceof Long) { + write("L"); + } + } + + } else if (_instruction instanceof AccessLocalVariable) { + final AccessLocalVariable localVariableLoadInstruction = (AccessLocalVariable) _instruction; + final LocalVariableInfo localVariable = localVariableLoadInstruction.getLocalVariableInfo(); + write(localVariable.getVariableName()); + } else if (_instruction instanceof I_IINC) { + final I_IINC location = (I_IINC) _instruction; + final LocalVariableInfo localVariable = location.getLocalVariableInfo(); + final int adjust = location.getAdjust(); + + write(localVariable.getVariableName()); + if (adjust == 1) { + write("++"); + } else if (adjust == -1) { + write("--"); + } else if (adjust > 1) { + write("+=" + adjust); + } else if (adjust < -1) { + write("-=" + (-adjust)); + } + } else if (_instruction instanceof BinaryOperator) { + final BinaryOperator binaryInstruction = (BinaryOperator) _instruction; + final Instruction parent = binaryInstruction.getParentExpr(); + boolean needsParenthesis = isNeedParenthesis(binaryInstruction); + + if (needsParenthesis) { + write("("); + } + + writeInstruction(binaryInstruction.getLhs()); + + write(" " + binaryInstruction.getOperator().getText() + " "); + writeInstruction(binaryInstruction.getRhs()); + + if (needsParenthesis) { + write(")"); + } + + } else if (_instruction instanceof CastOperator) { + final CastOperator castInstruction = (CastOperator) _instruction; + // write("("); + write(convertCast(castInstruction.getOperator().getText())); + + writeInstruction(castInstruction.getUnary()); + // write(")"); + } else if (_instruction instanceof UnaryOperator) { + final UnaryOperator unaryInstruction = (UnaryOperator) _instruction; + // write("("); + write(unaryInstruction.getOperator().getText()); + + writeInstruction(unaryInstruction.getUnary()); + // write(")"); + } else if (_instruction instanceof Return) { + + final Return ret = (Return) _instruction; + write("return"); + if (ret.getStackConsumeCount() > 0) { + write("("); + writeInstruction(ret.getFirstChild()); + write(")"); + } + + } else if (_instruction instanceof MethodCall) { + final MethodCall methodCall = 
(MethodCall) _instruction; + + final MethodEntry methodEntry = methodCall.getConstantPoolMethodEntry(); + + writeMethod(methodCall, methodEntry); + } else if (_instruction.getByteCode().equals(ByteCode.CLONE)) { + final CloneInstruction cloneInstruction = (CloneInstruction) _instruction; + writeInstruction(cloneInstruction.getReal()); + } else if (_instruction.getByteCode().equals(ByteCode.INCREMENT)) { + final IncrementInstruction incrementInstruction = (IncrementInstruction) _instruction; + + if (incrementInstruction.isPre()) { + if (incrementInstruction.isInc()) { + write("++"); + } else { + write("--"); + } + } + + writeInstruction(incrementInstruction.getFieldOrVariableReference()); + if (!incrementInstruction.isPre()) { + if (incrementInstruction.isInc()) { + write("++"); + } else { + write("--"); + } + } + } else if (_instruction.getByteCode().equals(ByteCode.MULTI_ASSIGN)) { + final MultiAssignInstruction multiAssignInstruction = (MultiAssignInstruction) _instruction; + AssignToLocalVariable from = (AssignToLocalVariable) multiAssignInstruction.getFrom(); + final AssignToLocalVariable last = (AssignToLocalVariable) multiAssignInstruction.getTo(); + final Instruction common = multiAssignInstruction.getCommon(); + final Stack<AssignToLocalVariable> stack = new Stack<AssignToLocalVariable>(); + + while (from != last) { + stack.push(from); + from = (AssignToLocalVariable) ((Instruction) from).getNextExpr(); + } + + for (AssignToLocalVariable alv = stack.pop(); alv != null; alv = stack.size() > 0 ? stack.pop() : null) { + + final LocalVariableInfo localVariableInfo = alv.getLocalVariableInfo(); + if (alv.isDeclaration()) { + write(convertType(localVariableInfo.getVariableDescriptor(), true)); + } + if (localVariableInfo == null) { + throw new CodeGenException("outOfScope" + _instruction.getThisPC() + " = "); + } else { + write(localVariableInfo.getVariableName() + " = "); + } + + } + writeInstruction(common); + } else if (_instruction.getByteCode().equals(ByteCode.INLINE_ASSIGN)) { + final InlineAssignInstruction inlineAssignInstruction = (InlineAssignInstruction) _instruction; + final AssignToLocalVariable assignToLocalVariable = inlineAssignInstruction.getAssignToLocalVariable(); + + final LocalVariableInfo localVariableInfo = assignToLocalVariable.getLocalVariableInfo(); + if (assignToLocalVariable.isDeclaration()) { + // this is bad! 
we need a general way to hoist up a required declaration + throw new CodeGenException("/* we can't declare this " + convertType(localVariableInfo.getVariableDescriptor(), true) + + " here */"); + } + write(localVariableInfo.getVariableName()); + write("="); + writeInstruction(inlineAssignInstruction.getRhs()); + } else if (_instruction.getByteCode().equals(ByteCode.FIELD_ARRAY_ELEMENT_ASSIGN)) { + final FieldArrayElementAssign inlineAssignInstruction = (FieldArrayElementAssign) _instruction; + final AssignToArrayElement arrayAssignmentInstruction = inlineAssignInstruction.getAssignToArrayElement(); + + writeInstruction(arrayAssignmentInstruction.getArrayRef()); + write("["); + writeInstruction(arrayAssignmentInstruction.getArrayIndex()); + write("]"); + write(" "); + write(" = "); + + writeInstruction(inlineAssignInstruction.getRhs()); + } else if (_instruction.getByteCode().equals(ByteCode.FIELD_ARRAY_ELEMENT_INCREMENT)) { + + final FieldArrayElementIncrement fieldArrayElementIncrement = (FieldArrayElementIncrement) _instruction; + final AssignToArrayElement arrayAssignmentInstruction = fieldArrayElementIncrement.getAssignToArrayElement(); + if (fieldArrayElementIncrement.isPre()) { + if (fieldArrayElementIncrement.isInc()) { + write("++"); + } else { + write("--"); + } + } + writeInstruction(arrayAssignmentInstruction.getArrayRef()); + + write("["); + writeInstruction(arrayAssignmentInstruction.getArrayIndex()); + write("]"); + if (!fieldArrayElementIncrement.isPre()) { + if (fieldArrayElementIncrement.isInc()) { + write("++"); + } else { + write("--"); + } + } + + } else if (_instruction.getByteCode().equals(ByteCode.NONE)) { + // we are done + } else if (_instruction instanceof Branch) { + throw new CodeGenException(String.format("%s -> %04d", _instruction.getByteCode().toString().toLowerCase(), + ((Branch) _instruction).getTarget().getThisPC())); + } else if (_instruction instanceof I_POP) { + //POP discarded void call return? 
+ writeInstruction(_instruction.getFirstChild()); + } else { + throw new CodeGenException(String.format("%s", _instruction.getByteCode().toString().toLowerCase())); + } + + } + + private boolean isNeedParenthesis(Instruction instruction){ + final Instruction parent = instruction.getParentExpr(); + boolean needsParenthesis = true; + + if (parent instanceof AssignToLocalVariable) { + needsParenthesis = false; + } else if (parent instanceof AssignToField) { + needsParenthesis = false; + } else if (parent instanceof AssignToArrayElement) { + needsParenthesis = false; + } else { + /** + if (parent instanceof BinaryOperator) { + BinaryOperator parentBinaryOperator = (BinaryOperator) parent; + if (parentBinaryOperator.getOperator().ordinal() > binaryInstruction.getOperator().ordinal()) { + needsParenthesis = false; + } + } + **/ + } + return needsParenthesis; + } + + private boolean isMultiDimensionalArray(NameAndTypeEntry nameAndTypeEntry) { + return nameAndTypeEntry.getDescriptorUTF8Entry().getUTF8().startsWith("[["); + } + + private boolean isObjectArray(NameAndTypeEntry nameAndTypeEntry) { + return nameAndTypeEntry.getDescriptorUTF8Entry().getUTF8().startsWith("[L"); + } + + private boolean isMultiDimensionalArray(final AccessArrayElement arrayLoadInstruction) { + AccessInstanceField accessInstanceField = getUltimateInstanceFieldAccess(arrayLoadInstruction); + return isMultiDimensionalArray(accessInstanceField.getConstantPoolFieldEntry().getNameAndTypeEntry()); + } + + private boolean isObjectArray(final AccessArrayElement arrayLoadInstruction) { + AccessInstanceField accessInstanceField = getUltimateInstanceFieldAccess(arrayLoadInstruction); + return isObjectArray(accessInstanceField.getConstantPoolFieldEntry().getNameAndTypeEntry()); + } + + private AccessInstanceField getUltimateInstanceFieldAccess(final AccessArrayElement arrayLoadInstruction) { + Instruction load = arrayLoadInstruction.getArrayRef(); + while (load instanceof I_AALOAD) { + load = load.getFirstChild(); + } + + return (AccessInstanceField) load; + } + + public void writeMethod(MethodCall _methodCall, MethodEntry _methodEntry) throws CodeGenException { + boolean noCL = _methodEntry.getOwnerClassModel().getNoCLMethods() + .contains(_methodEntry.getNameAndTypeEntry().getNameUTF8Entry().getUTF8()); + if (noCL) { + return; + } + + if (_methodCall instanceof VirtualMethodCall) { + final Instruction instanceInstruction = ((VirtualMethodCall) _methodCall).getInstanceReference(); + if (!(instanceInstruction instanceof I_ALOAD_0)) { + writeInstruction(instanceInstruction); + write("."); + } else { + writeThisRef(); + } + } + final int argc = _methodEntry.getStackConsumeCount(); + write(_methodEntry.getNameAndTypeEntry().getNameUTF8Entry().getUTF8()); + write("("); + + for (int arg = 0; arg < argc; arg++) { + if (arg != 0) { + write(", "); + } + writeInstruction(_methodCall.getArg(arg)); + } + write(")"); + + } + + public void writeThisRef() { + write("this."); + } + + public void writeMethodBody(MethodModel _methodModel) throws CodeGenException { + if (_methodModel.isGetter() && !_methodModel.isNoCL()) { + FieldEntry accessorVariableFieldEntry = _methodModel.getAccessorVariableFieldEntry(); + writeGetterBlock(accessorVariableFieldEntry); + } else { + writeBlock(_methodModel.getExprHead(), null); + } + } + + public abstract void write(Entrypoint entryPoint) throws CodeGenException; +} diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/writer/KernelWriter.java 
b/com.amd.aparapi/src/java/com/amd/aparapi/internal/writer/KernelWriter.java index 92ce2fe930cda4a348b1c8b26b42eb0fb5cb19a7..16763ccbe5d31fcce2d2cf3a6aa215c79428eda4 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/writer/KernelWriter.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/writer/KernelWriter.java @@ -73,6 +73,11 @@ public abstract class KernelWriter extends BlockWriter{ private final String cvtShortArrayToShortStar = "short* "; + /** When declaring a __private struct pointer field, we always omit the "__private" qualifier. This is because the NVidia OpenCL compiler, at the time of writing, + * erroneously complains about explicitly qualifying pointers with __private ("error: field may not be qualified with an address space"). + */ + private static final boolean IMPLICIT_PRIVATE_FIELDS = true; + // private static Logger logger = Logger.getLogger(Config.getLoggerName()); private Entrypoint entryPoint = null; @@ -333,7 +338,9 @@ public abstract class KernelWriter extends BlockWriter{ while (signature.startsWith("[")) { if (isPointer == false) { argLine.append(argType + " "); - thisStructLine.append(type + " "); + if (!(type.equals(__private) && IMPLICIT_PRIVATE_FIELDS)) { + thisStructLine.append(type + " "); + } } isPointer = true; numDimensions++; diff --git a/doc/AccessingMultiDimNDRangeProposal.md b/doc/AccessingMultiDimNDRangeProposal.md new file mode 100644 index 0000000000000000000000000000000000000000..188cf9bab385a6267eb0723857ff5807c45609c4 --- /dev/null +++ b/doc/AccessingMultiDimNDRangeProposal.md @@ -0,0 +1,197 @@ +#AccessingMultiDimNDRangeProposal +*A proposal for accessing multi-dim ND range execution Updated Dec 14, 2011 by frost.g...@gmail.com* + +We can discuss this proposal either here (in comments) or via the discussion list. + +Note this has nothing to do with accessing Java 2D arrays in Aparapi. This discussion is focused on the ability to expose the execution of kernels over 1, 2 or 3 dimensions. The memory in each case is a single contiguous region (like a single dimension primitive array). + +At present an Aparapi kernel can only be executed using a single dimension. If we wish to represent execution over a WIDTH x HEIGHT element grid we would execute over the range (WIDTH*HEIGHT) and manually divide/mod getGlobalId() by WIDTH to determine the x and y for each element. + +Similarly we would multiply y by WIDTH and add x (y*WIDTH+x) to convert an X,Y location to a linear global id. + + final static int WIDTH=128; + final static int HEIGHT=64; + final int in[] = new int[WIDTH*HEIGHT]; + final int out[] = new int[WIDTH*HEIGHT]; + Kernel kernel = new Kernel(){ + public void run(){ + int x = getGlobalId()%WIDTH; + int y = getGlobalId()/WIDTH; + if (!(x==1 || x==(WIDTH-1) || y==1 || y==(HEIGHT-1))){ + int sum = 0; + for (int dx =-1; dx<2; dx++){ + for (int dy =-1; dy<2; dy++){ + sum+=in[(y+dy)*WIDTH+(x+dx)]; + } + } + out[y*WIDTH+x] = sum/9; + // or out[getGlobalId()] = sum/9; + } + } + + }; + kernel.execute(WIDTH*HEIGHT); + +OpenCL natively allows the user to execute over 1, 2 or 3 dimension grids via the clEnqueueNDRangeKernel() method. + +We chose not to expose this in Aparapi but there have been requests for us to allow it. + +There are a number of things to consider here: + +1. Extending the syntax of kernel.execute() to allow multi dimensional grids. +1. Mapping Kernel methods to OpenCL's get_local_id(int dim), get_local_size(int dim), get_group_id(int dim), etc. At present we map kernel.getGlobalId() to get_global_id(0). +1.
Handling all of these when an application drops back to JTP mode. + +##Extending Kernel.execute(int range) +Sadly we can't overload Kernel.execute(int range), Kernel.execute(int xrange, int yrange) and Kernel.execute(int xrange, int yrange, int zrange) because we already have kernel.execute(int, int) mapped for executing multiple passes over the linear range. + +Remember + + for (int pass=0; pass<20; pass++){ + kernel.execute(1024); + } +is equivalent to + + kernel.execute(1024, 20); +I think I would prefer + + Kernel.execute(int range) + Kernel.execute(int range, int passes) + Kernel.executeXY(int xrange, int yrange) + Kernel.executeXY(int xrange, int yrange, int passes) + Kernel.executeXYZ(int xrange, int yrange, int zrange) + Kernel.executeXYZ(int xrange, int yrange, int zrange, int passes) +Obviously in the above calls we are only supplying the global bounds for the grid. We could also provide mappings allowing local ranges. I think I would prefer + + Kernel.executeLocal(int range, int local) + Kernel.executeLocal(int range, int local, int passes) + Kernel.executeXYLocal(int xrange, int yrange, int xlocalrange, int ylocalrange) + Kernel.executeXYLocal(int xrange, int yrange, int xlocalrange, int ylocalrange, int passes) + Kernel.executeXYZLocal(int xrange, int yrange, int zrange, int xlocalrange, int ylocalrange, int zlocalrange) + Kernel.executeXYZLocal(int xrange, int yrange, int zrange, int xlocalrange, int ylocalrange, int zlocalrange, int passes) +Another alternative may be to create Range classes + + class Range{ + int passes; + int width; + static Range create(int width); + static Range create(int width, int passes); + } + + class Range2D extends Range{ + int height; + static Range create(int width, int height); + static Range create(int width, int height, int passes); + + } + + class Range3D extends Range2D{ + int depth; + static Range create(int width, int height, int depth); + static Range create(int width, int height, int depth, int passes); + } +With appropriate constructors (or factory methods) to allow + + Kernel.execute(Range range) + +Then execution would simply be + + Kernel.execute(Range.create(1,1)) + +We can also arrange for the group size to be placed in the base Range class. + + class Range{ + int groupSize; + int passes; + int width; + static Range create(int width); + static Range create(int width, int passes); + } + +##Mapping to OpenCL multi dim methods, i.e. get_global_id(1), get_local_size(2) etc +We could just add getGlobalId(int dim), getLocalSize(int dim) etc to replicate the OpenCL methods.
+ +I would prefer to offer the following global mappings + +|Kernel | OpenCL| +|-----|------| +|getGlobalId()| get_global_id(0)| +|getGlobalX()| get_global_id(0)| +|getGlobalY()| get_global_id(1)| +|getGlobalZ()| get_global_id(2)| +|getGlobalSize()| get_global_size(0)| +|getGlobalWidth()| get_global_size(0)| +|getGlobalHeight()| get_global_size(1)| +|getGlobalDepth()| get_global_size(2)| + +And the following local mappings + +|Kernel| OpenCL| +|-----|-------| +|getLocalId()| get_local_id(0)| +|getLocalX()| get_local_id(0)| +|getLocalY()| get_local_id(1)| +|getLocalZ()| get_local_id(2)| +|getLocalSize()| get_local_size(0)| +|getLocalWidth()| get_local_size(0)| +|getLocalHeight()| get_local_size(1)| +|getLocalDepth()| get_local_size(2)| + +##An example + + final static int WIDTH=128; + final static int HEIGHT=64; + final int in[] = new int[WIDTH*HEIGHT]; + final int out[] = new int[WIDTH*HEIGHT]; + Kernel kernel = new Kernel(){ + public void run(){ + int x = getGlobalX(); + int y = getGlobalY(); + if (!(x==1 || x==(getGlobalWidth()-1) || y==1 || y==(getGlobalHeight()-1))){ + int sum = 0; + for (int dx =-1; dx<2; dx++){ + for (int dy =-1; dy<2; dy++){ + sum+=in[(y+dy)*getGlobalWidth()+(x+dx)]; + } + } + out[y*getGlobalWidth()+x] = sum/9; + // or out[getGlobalId()] = sum/9; + } + } + + }; + kernel.executeXY(WIDTH, HEIGHT); + +Or if we choose the Range class approach. + + final static int WIDTH=128; + final static int HEIGHT=64; + final int in[] = new int[WIDTH*HEIGHT]; + final int out[] = new int[WIDTH*HEIGHT]; + Kernel kernel = new Kernel(){ + public void run(){ + int x = getGlobalX(); + int y = getGlobalY(); + if (!(x==1 || x==(getGlobalWidth()-1) || y==1 || y==(getGlobalHeight()-1))){ + int sum = 0; + for (int dx =-1; dx<2; dx++){ + for (int dy =-1; dy<2; dy++){ + sum+=in[(y+dy)*getGlobalWidth()+(x+dx)]; + } + } + out[y*getGlobalWidth()+x] = sum/9; + // or out[getGlobalId()] = sum/9; + } + } + + }; + kernel.execute(Range2D.create(WIDTH, HEIGHT)); + +##Handling this from JTP mode +Mapping to OpenCL for this is all fairly straightforward. + +In Java JTP mode we will have to emulate this. For get_global_id(0..2) (getGlobalX(), getGlobalY() and getGlobalZ() using our proposed Aparapi Java mappings) we can of course easily offer reasonable implementations; this just requires the Java code to essentially nest 3 loops (or emulate them) and set globalX, globalY, globalZ inside each nesting. + +For get_local_size(0..2) (getLocalWidth(), getLocalHeight() and getLocalDepth() using our proposed Aparapi Java mappings) we will need to break the globalWidth/globalHeight and globalDepth into some arbitrary equal 'chunks' (note I am avoiding using the word groups here to avoid confusion with get_group_size(0..2)). + +At present we always create a synthetic group in JTP mode which is sized to the number of cores. This will need to be changed. If the user requests a grid (64,64,8,8) (global width 64, global height 64, local width 8, local height 8) then we will have to create a JTP group of 64 (8x8) and, just in case the kernel code contains a barrier, we will need to ensure we launch 64 threads for this group. From our experience it is best to launch one thread per core, so we may lose some JTP performance executing in this mode.
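+ +As a concrete illustration, here is a minimal sketch (not Aparapi code; the method name dispatchXY and the globalWidth/globalHeight/localWidth/localHeight parameters are just the proposal's terms) of how a JTP implementation might emulate a 2D range with nested loops; a real implementation would distribute the group loops across a thread pool rather than run them serially: + + void dispatchXY(int globalWidth, int globalHeight, int localWidth, int localHeight){ + for (int groupY = 0; groupY < globalHeight / localHeight; groupY++){ + for (int groupX = 0; groupX < globalWidth / localWidth; groupX++){ + // one emulated group: localWidth*localHeight work items which may share a barrier + for (int localY = 0; localY < localHeight; localY++){ + for (int localX = 0; localX < localWidth; localX++){ + int globalX = groupX * localWidth + localX; + int globalY = groupY * localHeight + localY; + // run the kernel here with globalX/globalY visible via getGlobalX()/getGlobalY() + } + } + } + } + }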
\ No newline at end of file diff --git a/doc/AddingLambdasToAparapi.md b/doc/AddingLambdasToAparapi.md new file mode 100644 index 0000000000000000000000000000000000000000..07e9ab9fed7d3bc3de78308393a13c2d8fd6c55e --- /dev/null +++ b/doc/AddingLambdasToAparapi.md @@ -0,0 +1,106 @@ +#AddingLambdasToAparapi +*Adding Java 8 Lambda Support to Aparapi Updated Jun 24, 2013 by frost.g...@gmail.com* + +In the recently added 'lambda' branch we have been experimenting with adding lambda support to Aparapi. We believe that this upcoming Java 8 feature will be a natural way to express parallel algorithms which can be executed on the GPU. + +A link to the branch preview can be found here. + +You will need to get the latest binary build of 'Project Lambda' to experiment with these new features. The 'Project Lambda' preview can be found here. + +Once you have a lambda-enabled Java 8 JDK, set JAVA_HOME to point at it and build Aparapi. + +So from the root of SumatraExperiments just use + + $ ant +We are slowly walking through some of the Aparapi demos and converting them. At present NBody and Mandel have been converted. + +With lambda-enabled Aparapi we remove the need to derive from a base Kernel class; instead we will allow the user to express their code as a lambda using the following basic pattern + + Device.bestGPU().forEach(int range, IntConsumer lambda); +The Java 8 stream API defines a type called java.util.function.IntConsumer. This is essentially an interface with a Single Abstract Method (these types are referred to as SAM types in the stream API code). + +IntConsumer looks something like.... + + interface IntConsumer{ + public void accept(int id); + } +So you can run the familiar 'squares' kernel using + + int in[] = ...; + int out[] = ...; + Device.bestGPU().forEach(in.length, (i)->{ + out[i] = in[i] * in[i]; + }); + +Instead of + + int in[] = ...; + int out[] = ...; + Device.bestGPU().forEach(in.length, new IntConsumer(){ + public void accept(int i){ + out[i] = in[i] * in[i]; + } + }); + +To accommodate lambdas we created Device.forEach(int range, IntConsumer ic) which converts the bytecode of the ic parameter to OpenCL at runtime. The captured args (in, out and i - in this case) are passed to the GPU and the kernel executed. + +During our early experiments we encountered an interesting issue. The new 'lambdafied' javac uses Java 7 method handles and invokedynamic instructions to dispatch the lambda code. It does this by injecting a call to a MethodHandle factory into the call site. At runtime, this factory creates a synthetic class (to capture call-site args) and passes this to our Device.forEach(). + +We needed to analyse this synthetically generated class in order to work out which args need to be sent to the GPU. Of course we have a bunch of tools already in Aparapi for analyzing bytecode, but this code expects to find bytecode in class files (either in a Jar or on the disk), so we had to find a way to make these classfile bytes accessible to Aparapi. + +We have a couple of proposed solutions for solving this. The most promising is to turn the aparapi.dll/aparapi.so native library (used by Aparapi at runtime) into a JVMTI agent (like hprof). JVMTI agents are native libraries which have access to some aspects of a running JVM (via the JVM Tool Interface). We have a prototype JVMTI agent which 'listens' for classfiles which represent these 'synthetic lambda helpers' and allows us to get hold of the bytecode for these classes.
+ +This will mean that in future we will change how Aparapi is launched. + +Instead of + + $ java -Djava.library.path=path/to/aparapi -classpath path/to/aparapi/aparapi.jar:your.jar YourClass + +We will use + + $ java -agentlib=path/to/aparapi/aparapi.dll -classpath path/to/aparapi/aparapi.jar:your.jar YourClass +We are also looking into the possibility of having this agent provide the bytecode for all Aparapi classes. We believe that this will enable us to ultimately remove MethodModel/ClassModel and even the InstructionSet classes and handle all of this in JNI. + +We would welcome comments on these proposals, either here or on the discussion list. Let us know what you think. + +##Consequences of lambdification of Aparapi + +* No support for local memory, group size or barriers in lambda form +* Calls to Kernel base class methods (such as getGlobalId()) will not be allowed. The 'global id' will be passed as an arg to the lambda. +* We will need to add support for calling static methods (of course the bytecode for the called methods cannot violate Aparapi restrictions). +* We might need to drop support for multi dimension dispatch. This is more a convergence story with Sumatra (which is unlikely to support this). +* It is unlikely that explicit buffer management will be simple. +* We can use lambdas for control as well as for the kernel itself. See examples below. + +##Alternate forms for kernel dispatch + +This version would allow us to carry over Aparapi's device selection + + Device.bestGPU().forEach(1024, i->{lambda}); +This version would allow us to carry over Aparapi's Range selection + + Device.bestGPU().range2D(width, height).forEach(rid->{lambda}); +This version would allow us to mimic Kernel.execute(1024, 5) + + Device.bestGPU().forEach(1024, 5, (id, passid)->{lambda}); +We could even have the range iterated over until some other lambda determines we are done + + Device.bestGPU().forEachUntil(1024, id->{lambda}, ()->{predicate lambda}); +Explicit buffer handling could be removed in many cases by allowing the bytecode of the 'until' predicate to be snooped for buffer references. + + int lotsOfData[] = ...; + boolean found[] = new boolean[1]; + Device.bestGPU().forEachUntil(1024, 5, + (id, passid)->{ /* mutate lotsOfData, found[0]=true when done */ }, + ()->{found[0]}); +In the above cases Aparapi can determine that between each pass it needs to copy *only* found[] back from the device. + +There is no reason that the range itself needs to be constant, we can use a collection/iterable. This helps with some reductions. + + int range[] = new int[]{1024,512,128,64,32,16,8,4,2,1,0}; + Device.bestGPU().forEach(range, id->{lambda}); +Or the range can be a lambda itself; here we specify a start and end value for the range, and a lambda to provide each step. + + Device.bestGPU().forEach(1024, 1, r->{return(r/2);},(pass, r, id)->{lambda}); + // or + Device.bestGPU().forEach(1, 1024, r->{return(r*2);},(pass, r, id)->{lambda}); diff --git a/doc/AddressSpacesUsingBuffers.md b/doc/AddressSpacesUsingBuffers.md new file mode 100644 index 0000000000000000000000000000000000000000..a311db2f4e98dce9b71fec915c0b19c5859358ae --- /dev/null +++ b/doc/AddressSpacesUsingBuffers.md @@ -0,0 +1,44 @@ +#AddressSpacesUsingBuffers +*Proposal For OpenCL address space support using java Buffers instead of arrays. Updated Dec 8, 2011 by frost.g...@gmail.com* +The general idea is to have an AS_PRIMTYPE_Buffer for each AS=address space and PRIM=primitive type.
Here is an example for LocalFloatBuffer, which would be a buffer for floats that is mapped to the OpenCL local address space. + +As with normal FloatBuffers, the float elements are accessed using get and put methods. + +Although a LocalFloatBuffer conceptually exists only for the lifetime of a workgroup, it is still constructed in the enclosing Kernel, not in the Kernel.Entry.run method. (Aparapi does not support constructing new objects inside the Kernel.Entry.run method). + +A typical declaration would be: + + LocalFloatBuffer locbuf = new LocalFloatBuffer(12); +The argument 12 here means that 12 floats would be used by each workitem in the workgroup. So the total buffer would be localSize*12 floats. Aparapi would at runtime allocate a total local OpenCL buffer of this size. Note how this removes the need for the programmer to specify localSize anywhere. + +Note: For each Kernel.Entry.execute(globalSize) call, the runtime will determine an appropriate workgroup size, also called localSize, depending on the capabilities of the device, and on the globalSize. The localSize will always evenly divide the globalSize; in other words all workgroups for an execute context will be the same size. A workitem can determine localSize by calling getLocalSize(). + +Because workitems operate simultaneously and in an undetermined order, each workitem will generally use put only on its own portion of the LocalFloatBuffer between the LocalBarriers, and will generally use get only outside the LocalBarriers. + +Some example code (from NBody) follows. Here each workitem copies a "BODY" consisting of 4 floats. The global array contains 4*globalSize floats, and we want to iterate through this global array, copying it into local memory and operating on it there. This will take globalSize/localSize "tiles". For each tile, each workitem fills in one "BODY"'s worth, i.e. 4 elements. + + // outside run method... + final int BODYSIZE = 4; + LocalFloatBuffer pos_xyzm_local = new LocalFloatBuffer(BODYSIZE); + // + // inside run method... + int numTiles = globalSize / localSize; + for (int i = 0; i < numTiles; ++i) { + // load one tile into local memory + int idx = i * localSize + localId; // index into a global memory array + localBarrier(); + pos_xyzm_local.put(localId * BODYSIZE + 0, pos_xyzm[idx * BODYSIZE + 0]); + pos_xyzm_local.put(localId * BODYSIZE + 1, pos_xyzm[idx * BODYSIZE + 1]); + pos_xyzm_local.put(localId * BODYSIZE + 2, pos_xyzm[idx * BODYSIZE + 2]); + pos_xyzm_local.put(localId * BODYSIZE + 3, pos_xyzm[idx * BODYSIZE + 3]); + // Synchronize to make sure data is available for processing + localBarrier(); + + // now the entire LocalFloatBuffer has been filled. + // each workitem might use the entire Buffer + // which consists of localSize BODYs + for (int j = 0; j < localSize; ++j) { + float r_x = pos_xyzm_local.get(j * BODYSIZE + 0) - myPos_x; + float r_y = pos_xyzm_local.get(j * BODYSIZE + 1) - myPos_y; + float r_z = pos_xyzm_local.get(j * BODYSIZE + 2) - myPos_z; + // ...etc \ No newline at end of file diff --git a/doc/AparapiExtensionProposal.md b/doc/AparapiExtensionProposal.md new file mode 100644 index 0000000000000000000000000000000000000000..087695cb29f920e57d084439b1a4cd616b8251a8 --- /dev/null +++ b/doc/AparapiExtensionProposal.md @@ -0,0 +1,258 @@ +#AparapiExtensionProposal +*A proposed aparapi extension mechanism. Updated Feb 29, 2012 by frost.g...@gmail.com* + +##Here is a proposed Aparapi extension mechanism +This would allow a developer to create a library that could be used by Aparapi Kernel code.
The library would include OpenCL and Java implementations. + +We will treat this as a live document. Please join the discussions at http://groups.google.com/group/aparapi-discuss/browse_thread/thread/7ec81ecb2169aa4 and I will update this page to reflect what I think the latest decisions are: + +Currently Aparapi allows Java bytecode to be converted to OpenCL at runtime. Only the OpenCL generated by this conversion process is made available. Sometimes for performance reasons we might want to allow hand coded OpenCL to be called from Aparapi kernel code. + +Here we will present a strawman API which would allow extension points to be added by an end user or by a library provider. + +We will use an FFT use case to walk through the steps. + +The FFT (Fast Fourier Transform) algorithm can be coded in Aparapi, but for performance reasons handcrafted OpenCL is likely to be more performant. The goal is to allow Aparapi to do what it does best, i.e. manage the host buffer allocations and provide a mechanism for binding arbitrary OpenCL code at runtime. + +So let's assume we wanted an Aparapi Kernel to be able to call an Aparapi extension for computing FFT (forward and reverse). The Kernel implementation might look like this. + + public static class BandStopFilter extends Kernel{ + FFT fft = new FFT(); // Create an instance of the Extension point. + float[] real; + float[] imaginary; + + BandStopFilter (float[] _real){ + real = _real; + imaginary = new float[_real.length]; + + } + + @Override public void run() { + fft.forward(real, imaginary); + } + } + +The main method then would just execute the Kernel using the familiar kernel.execute() method: + + public static void main(String[] args) { + float[] data = new float[1024]; + BandStopFilter kernel = new BandStopFilter (data); + kernel.execute(data.length); + } + +Essentially we want the FFT.forward(float[] _real, float[] _imaginary) and FFT.reverse(float[] _real, float[] _imaginary) methods to be callable from Aparapi Kernel code. We want Aparapi to handle the call-forwarding and the argument/buffer mapping transfers. We want Aparapi to call the Java methods normally if OpenCL is not available, but would like Aparapi to use the implementor-provided OpenCL if it is. So the implementor will be required to provide both a Java and an OpenCL version of the callable methods, because Aparapi will decide which version needs to be called at runtime. + +Any extension point is required to implement the AparapiExtensionPoint interface. + + public interface AparapiExtensionPoint { + public String getOpenCL(); + } +Here is a possible (although incomplete) FFT implementation. + + public class FFT implements AparapiExtensionPoint{ + @AparapiCallable public void forward( + @Global @ReadWrite float[] _data, + @Global @ReadWrite float[] _imaginary) { + // java implementation + } + + @AparapiCallable public void reverse( + @Global @ReadWrite float[] _data, + @Global @ReadWrite float[] _imaginary) { + // java implementation + } + + @Override public String getOpenCL() { + return "" + +"void my_package_FFT_forward(" + +" __global float* _real," + +" __global float* _imaginary )" + +" {" + +" // OpenCL implementation" + +" }" + +"void my_package_FFT_reverse(" + +" __global float* _real," + +" __global float* _imaginary )" + +" {" + +" // OpenCL implementation" + +" }"; + } + } + +The implementer’s class will be required to define the callable aparapi methods as well as implement the `getOpenCL()` method so that the OpenCL implementation of those methods can be extracted at run-time.
+ +Aparapi will provide annotations to decorate the methods and args/parameters of the exposed callable methods. These annotations provide information so that Aparapi can locate the callable methods, as well as parameter hints to help coordinate buffer types (global, local, constant) and transfer directions (read, write, readWrite) when executing the methods from a Kernel. This information is consulted during the normal bytecode analysis that Aparapi performs when it hits the call site. + +Note that the Java code inside the `@AparapiCallable` functions (or code executed from them) is not constrained to the normal Aparapi subset. It can be any legitimate Java code, but should be thread safe (because it will be called from JTP mode!). + +Note also that the OpenCL code yielded from the `getOpenCL()` method is assumed to be complete; Aparapi does not attempt to parse this code. If the code fails to compile Aparapi will fall back and execute the whole Kernel in JTP mode. + +BTW we show getOpenCL() returning a String literal. This is most likely to be how code is returned. However, it could be extracted from a File, a resource in the Jar file, or dynamically generated based on some state. For example an FFT implementation might choose to use different code for radix2 or radix4 implementations (based on a parameter passed to the `FFT()` constructor, say `FFT(FFT.RADIX2)`), in which case the getOpenCL() method might yield different code. + +The above proposal covers the case where a third party might want to provide an Aparapi extension point as a library. + +We might also consider allowing single methods within the Kernel to be optimized, where the OpenCL is made available via the AparapiCallable annotation. The method would still use the same annotations for the args (to allow buffer transfers to be optimized).
+ + Kernel k = new Kernel(){ + @AparapiCallable("/* opencl code for sum() goes here */") + int sum(@Global @ReadWrite int[] data, int length){ + int sum = 0; + for (int v:data){ + sum+=v; + } + return sum; + } + @Override public void run(){ + sum(data, data.length); + } + } + +Here are the proposed new interfaces/annotations + + public interface AparapiExtensionPoint { + public String getOpenCL(); + } + @Retention(RetentionPolicy.RUNTIME) @Target(ElementType.METHOD) + public @interface AparapiCallable { + String value() default ""; + } + + @Retention(RetentionPolicy.RUNTIME) @Target(ElementType.PARAMETER) + public @interface Global {} + + @Retention(RetentionPolicy.RUNTIME) @Target(ElementType.PARAMETER) + public @interface Local {} + + @Retention(RetentionPolicy.RUNTIME) @Target(ElementType.PARAMETER) + public @interface Constant {} + + @Retention(RetentionPolicy.RUNTIME) @Target(ElementType.PARAMETER) + public @interface ReadWrite {} + + @Retention(RetentionPolicy.RUNTIME) @Target(ElementType.PARAMETER) + public @interface ReadOnly {} + + @Retention(RetentionPolicy.RUNTIME) @Target(ElementType.PARAMETER) + public @interface WriteOnly {} + +And here is the example code in one chunk + + public class FFT implements AparapiExtensionPoint{ + @AparapiCallable public void forward( + @Global @ReadWrite float[] _data, + @Global @ReadWrite float[] _imaginary) { + // java implementation + } + + @AparapiCallable public void reverse( + @Global @ReadWrite float[] _data, + @Global @ReadWrite float[] _imaginary) { + // java implementation + } + + @Override public String getOpenCL() { + return "" + +"void my_package_FFT_forward(" + +" __global float* _real," + +" __global float* _imaginary )" + +" {" + +" // OpenCL implementation" + +" }" + +"void my_package_FFT_reverse(" + +" __global float* _real," + +" __global float* _imaginary )" + +" {" + +" // OpenCL implementation" + +" }"; + } + } + + public class BandStopFilter extends Kernel{ + FFT fft = new FFT(); + float[] real; + float[] imaginary; + + BandStopFilter (float[] _real){ + real = _real; + imaginary = new float[_real.length]; + + } + + @Override public void run() { + fft.forward(real, imaginary); + } + } + + public static void main(String[] args) { + float[] data = new float[1024]; + BandStopFilter kernel = new BandStopFilter (data); + kernel.execute(data.length); + } + +After discussion I think we are converging on a less complex solution. This is based on Witold's feedback suggestion (see below) where we use OpenCL annotations rather than forcing the implementation of the interface and the `getOpenCL()` method as originally suggested. + +So we will create an `@OpenCL` annotation for classes/methods. + +The `@OpenCL` annotation on the methods will contain the OpenCL source replacement for a specific method. The arg list will be created by Aparapi. + +The @OpenCL annotation on a class allows us to optionally introduce common code (helper methods, #pragmas, constants) which will precede the method declarations in the OpenCL code. + +So an FFT example whereby the forward() and reverse() methods both call a common foo() method might look like this.
+ + @OpenCL(common="/* common void foo(){} + maybe #pragmas + accessible + global fields declared here */") + public class FFT { + @OpenCL(signature="//function signature - OPTIONAL", body="{ /* uses foo(); */ }") + public void forward( + @Global @ReadWrite float[] _data, + @Global @ReadWrite float[] _imaginary) { + // java implementation + } + @OpenCL(body="{ /* uses foo(); */ }") + public void reverse( + @Global @ReadWrite float[] _data, + @Global @ReadWrite float[] _imaginary) { + // java implementation + } + } + +To invoke this from an Aparapi kernel we should be able to do something like + + public class BandStopFilter extends Kernel{ + FFT fft = new FFT(); + float[] real; + float[] imaginary; + + BandStopFilter (float[] _real){ + real = _real; + imaginary = new float[_real.length]; + + } + + @Override public void run() { + fft.forward(this, real, imaginary); + } + } + + public static void main(String[] args) { + float[] data = new float[1024]; + BandStopFilter kernel = new BandStopFilter (data); + kernel.execute(data.length); + } + +Ideally we would also like to invoke FFT directly (instead of via a Kernel). This is tricky because the forward() and reverse() methods will need to be invoked across a range, and of course the dispatch across the range needs to be initiated from Aparapi. + +The only way I can see how to do this is to force the creation of an interface so we can use Java's existing Proxy mechanism to create a wrapper. + + @OpenCL(wraps=FFT.class) + interface FFTInterface{ + public void forward( Range _range, float[] _data, float[] _imaginary); + public void reverse( Range _range, float[] _data, float[] _imaginary); + } +Then provide a mechanism for extracting a proxy and invoking it. + + float[] real = //?? + float[] imag = //?? + Aparapi.wrap(FFTInterface.class).forward(range, real, imag); + +I can't see a cleaner solution. diff --git a/doc/AparapiPatterns.md b/doc/AparapiPatterns.md new file mode 100644 index 0000000000000000000000000000000000000000..7baf1cbb8589c533aae1724508974ad1c8ec08fe --- /dev/null +++ b/doc/AparapiPatterns.md @@ -0,0 +1,129 @@ +#AparapiPatterns +*Examples and code fragments to demonstrate Aparapi features. Updated Jul 24, 2012 by frost.g...@gmail.com* + +##Aparapi Patterns + +The following suggestions help solve some common problems found in using Aparapi. + +Additional suggestions and solutions to extend this list would be welcome. + +##How do I return data from a kernel if I can’t write to kernel fields? + +Use a small array buffer (possibly containing a single element) and assign it from the kernel. + +For example, the following kernel code detects whether buffer[] contains the value 1234. The flag (true or false) is returned in found[0]. + + final int buffer[] = new int[HUGE]; + final boolean found[] = new boolean[]{false}; + // fill buffer somehow + Kernel kernel = new Kernel(){ + @Override public void run(){ + if (buffer[getGlobalId()]==1234){ + found[0]=true; + } + } + }; + kernel.execute(buffer.length); + +This code does include a race condition, whereby buffer[] might contain 1234 at more than one index, causing multiple work items to try to set `found[0]`. This is not a problem here, because we don't care if multiple kernel executions match, provided one flips the value of `found[0]`. + +##How can I use Aparapi and still maintain an object-oriented view of my data? + +See the NewFeatures page.
Aparapi can now handle simple arrays of objects, which minimizes the amount of refactoring required to experiment with Aparapi. However, performance is still likely to be better if your algorithm operates on data held in parallel primitive arrays. To get higher performance from Aparapi with minimal exposure to data in this parallel primitive array form, we can (with a little work) allow both forms of data to co-exist. Let’s reconsider the NBody problem (http://en.wikipedia.org/wiki/N-body_problem). + +A Java developer writing an NBody solution would most likely create a Body class: + + class Body{ + float x,y,z; + float getX(){return x;} + void setX(float _x){ x = _x;} + float getY(){return y;} + void setY(float _y){ y = _y;} + float getZ(){return z;} + void setZ(float _z){ z = _z;} + + + // other data related to Body unused by positioning calculations + } + +The developer would also likely create a container class (such as NBodyUniverse), that manages the positions of multiple Body instances. + + class NBodyUniverse{ + final Body[] bodies; + NBodyUniverse(final Body[] _bodies){ + bodies = _bodies; + for (int i=0; i<bodies.length; i++){ + bodies[i].setX((float)(Math.random()*100)); + bodies[i].setY((float)(Math.random()*100)); + bodies[i].setZ((float)(Math.random()*100)); + } + } + void adjustPositions(){ + // can use new array of object Aparapi features, but is not performant + } + } + Body[] bodies = new Body[BODIES]; + for (int i=0; i<bodies.length; i++){ + bodies[i] = new Body(); + } + NBodyUniverse universe = new NBodyUniverse(bodies); + while (true){ + universe.adjustPositions(); + // display NBodyUniverse + } + +The NBodyUniverse.adjustPositions() method contains the nested loops (adjusting each body position based on forces impinging on it from all of the other bodies), making it an ideal Aparapi candidate. + +Even though this code can now be written by accessing the x, y and z ordinates of Body[] via getters/setters, the most performant Aparapi implementation is the one that operates on parallel arrays of floats containing x, y and z ordinates, with Body[10]’s state conceptually stored across x[10], y[10] and z[10]. + +So for performance reasons, you can do something like this: + + class Body{ + int idx; + NBodyUniverse universe; + void setUniverseAndIndex(NBodyUniverse _universe, int _idx){ + universe = _universe; + idx = _idx; + } + + // other fields not used by layout + + void setX(float _x){ universe.x[idx]=_x;} + void setY(float _y){ universe.y[idx]=_y;} + void setZ(float _z){ universe.z[idx]=_z;} + float getX(){ return universe.x[idx];} + float getY(){ return universe.y[idx];} + float getZ(){ return universe.z[idx];} + } + class NBodyUniverse { + final Body[] bodies; + final float[] x, y, z; + NBodyUniverse(Body[] _bodies){ + bodies = _bodies; + x = new float[bodies.length]; + y = new float[bodies.length]; + z = new float[bodies.length]; + for (int i=0; i<bodies.length; i++){ + bodies[i].setUniverseAndIndex(this, i); + bodies[i].setX((float)(Math.random()*100)); + bodies[i].setY((float)(Math.random()*100)); + bodies[i].setZ((float)(Math.random()*100)); + } + } + void adjustPositions(){ + // can now more efficiently use Aparapi + } + } + + Body[] bodies = new Body[BODIES]; + for (int i=0; i<bodies.length; i++){ + bodies[i] = new Body(); + } + NBodyUniverse universe = new NBodyUniverse(bodies); + while (true){ + universe.adjustPositions(); + // display NBodyUniverse + } + +This example allows Java™ code to treat each Body in a traditional object-oriented fashion and also allows Aparapi kernels to act on the parallel primitive array form, in order to access/mutate the position of the bodies.
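+ +For completeness, here is a hedged sketch (using only the fields shown above and the standard Kernel API; the px/py/pz captures and the force math itself are illustrative, not Aparapi's prescribed pattern) of how adjustPositions() might hand the parallel primitive arrays to an Aparapi kernel: + + void adjustPositions(){ + final float[] px = x, py = y, pz = z; // captured by the kernel + final int count = bodies.length; + Kernel kernel = new Kernel(){ + @Override public void run(){ + int i = getGlobalId(); + for (int j = 0; j < count; j++){ + // accumulate the force on body i from body j using px[j], py[j], pz[j] + } + // write body i's updated position back to px[i], py[i], pz[i] + } + }; + kernel.execute(count); + } + +In practice the Kernel instance would be created once and reused across calls, since the bytecode to OpenCL conversion happens on first execution.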
+ +[Attribution](Attribution.md) \ No newline at end of file diff --git a/doc/Attribution.md b/doc/Attribution.md new file mode 100644 index 0000000000000000000000000000000000000000..52ab3813e7eec37fd5751196eae30b8a18c0ae59 --- /dev/null +++ b/doc/Attribution.md @@ -0,0 +1,26 @@ +#Attribution +*Attribution Updated Sep 13, 2011 by frost.g...@gmail.com* + +##Attribution + +AMD, AMD Radeon, the AMD arrow logo, and combinations thereof are trademarks of Advanced Micro Devices, Inc. + +OpenCL is a trademark of Apple Inc. used under license to the Khronos Group, Inc. + +NVIDIA, the NVIDIA logo, and CUDA are trademarks or registered trademarks of NVIDIA Corporation. + +Java, JVM, JDK and "Write Once, Run Anywhere" are trademarks of Oracle and/or its affiliates. + +Eclipse and the related logos are a trademark of The Eclipse Foundation in the United States, other countries, or both. + +Microsoft, Windows, Visual Studio, Visual Studio Express Edition are trademarks of Microsoft Corporation in the United States, other countries, or both. + +Linux is a registered trademark of Linus Torvalds. + +Ubuntu is a trademark of Canonical Ltd. + +Red Hat is a registered trademark of Red Hat, Inc. in the United States and other countries. + +OpenGL® and the oval logo are trademarks or registered trademarks of Silicon Graphics, Inc. in the United States and/or other countries worldwide. + +All other names used in this documentation are for identification purposes only and may be trademarks of their respective owners. diff --git a/doc/BuildingNBody.md b/doc/BuildingNBody.md new file mode 100644 index 0000000000000000000000000000000000000000..092306ac1c0f1975d86b9c2beb6ddbcee596c02a --- /dev/null +++ b/doc/BuildingNBody.md @@ -0,0 +1,40 @@ +#BuildingNBody +*How to build the NBody example. Updated Nov 11, 2011 by frost.g...@gmail.com* +##Building NBody +The NBody example is located in the /examples subdirectory under the Aparapi trunk: + + trunk/ + ... + examples/ + ... + nbody/ + src/java/com.amd.aparapi.nbody/ + build.xml + nbody.sh + nbody.bat +The NBody example requires a little more work to build because it depends on a third party project named ‘JOGL’. + +JOGL is a set of OpenGL™ bindings for Java® and the NBody example uses this library to render the particles/bodies (potentially many thousands of them) at runtime. More information about JOGL can be found at http://jogamp.org/jogl/www. + +The build.xml file's 'build' target will download the jars required to build and run the nbody example if the files do not exist. + +To build nbody, perform the following command. + + C:> ant clean build +The NBody build.xml file includes a 'run' target so you can launch the application using: + + C:> ant run +Or, if you prefer, launch from either the nbody.sh or nbody.bat script. + +For Linux® we also need to chmod nbody.sh in order to execute it. + + chmod +x nbody.sh +The nbody scripts take the execution mode as the first argument, the number of bodies as the second argument, and then the height and width (in pixels).
+ +Windows example: + + C:> nbody GPU 32768 800 800 +Linux example: + + $ ./nbody.sh GPU 32768 800 800 +Attribution \ No newline at end of file diff --git a/doc/ByteCode2OpenCL.pdf b/doc/ByteCode2OpenCL.pdf new file mode 100644 index 0000000000000000000000000000000000000000..6d51d746f087b8e4fa74a42a1580e5fcb00f5336 Binary files /dev/null and b/doc/ByteCode2OpenCL.pdf differ diff --git a/doc/ChoosingSpecificDevicesForExecution.md b/doc/ChoosingSpecificDevicesForExecution.md new file mode 100644 index 0000000000000000000000000000000000000000..62b4ce089e982347ece86397f538793849c5918e --- /dev/null +++ b/doc/ChoosingSpecificDevicesForExecution.md @@ -0,0 +1,58 @@ +#ChoosingSpecificDevicesForExecution +*Using the new Device APIs to choose Kernel execution on a specific device. Updated Sep 18, 2012 by frost.g...@gmail.com* + +Previously Aparapi chose the first GPU device when Kernel.execute() was called. This made it easy to execute simple Kernels, but was problematic when users wanted finer control over which device should be chosen, especially when the first device was unsuitable. We recently added new classes and APIs to allow the developer to specify exactly which device we intend to target. + +A new Device class has been added. This allows the user to select a specific device, either by calling a helper method such as Device.firstGPU() or Device.best(), or by iterating through all devices and choosing one based on some other criteria (capabilities? vendor name?). + +So selecting the 'best' (most performant) device could be achieved using: + + Device device = Device.best(); + +Alternatively, if I wanted the first AMD GPU device I might use: + + Device chosen=null; + for (Device device: devices.getAll()){ + if (device.getVendor().contains("AMD") && device.isGPU()){ + chosen = device; + break; + } + } + +A Device can be queried `(isGPU(), isOpenCL(), isGroup(), isJava(), getOpenCLPlatform(), getMaxMemory(), getLocalSizes())` to yield its characteristics. + +To execute on a specific device we must use the device to create our range. + + Range range = device.createRange2D(width, height); + +This allows the Range to be created with knowledge of the underlying device. So for example device.createRange3D(1024, 1024, 1024, 16, 16, 16) will fail if the device does not allow a local size of (16x16x16). + +A range created using a device method captures the device which created it: the range instance has a device field which is set by the device that creates it. + +It's as if we had this code + + Range range = Range.create(width, height); + range.setDevice(device); + +So the Range locks the device that it can be used with. + +Now when we have a Kernel + + Kernel kernel = new Kernel(){ + @Override public void run(){ + ... + } + } + +and we then use a device-created range + + Device device = Device.firstGPU(); + Kernel kernel = new Kernel(){ + @Override public void run(){ + // uses input[]; + } + }; + Range range = device.createRange2D(1024, 1024); + kernel.execute(range); + +we have forced execution on the first GPU.
These guidelines are intended to describe and streamline the contribution process. + +A patch can be a bug fix, a new feature, a new JUnit test case or a documentation change. + +Only members of the commit team are able to commit changes to the SVN repository. + +Only patches submitted through the process described below will be committed to SVN. + +The commit team will only apply patches which are submitted via the Aparapi project’s issue list. + +http://code.google.com/p/aparapi/issues/list +The current commit team members are: +* Eric Caspole (AMD) +* Tom Deneau (AMD) +* Gary Frost (AMD) + +If you would like to be considered for inclusion in the commit team, please send an email to anyone on the team and let them know. + +##Submitting a patch +If the bug or enhancement does not yet appear in the issues list, please take the time to add a new issue. + +Be sure to include sufficient detail to explain and recreate the bug or to justify the proposed enhancement. + +Ensure that your patch/fix does not regress any of the existing JUnit tests. The UnitTestGuide wiki page describes how to run the various Aparapi unit tests. + +Ensure that your patch does not break any sample or example. Create a patch file (using SVN’s diff command) against a recently updated trunk; do not submit patches against branches. Name your patch file using the following filename convention: + + aparapi-<issue id>-<trunk revision id>.patch +The following shows the sequence for creating a patch for issue number 1234. + + $ cd aparapi-trunk + $ svn update + At revision 10339 + $ svn diff > aparapi-1234-10339.patch + +Attach your patch file to the issue via the Issue List. + +##Attribution of contributions +We want to correctly attribute all contributions and will maintain a CREDITS.txt file at the head of the trunk. We discourage including attribution as comments in the code; instead we intend to let the history feature of SVN be the primary method for tracking attributions. When a patch is committed the commit team member will update the CREDITS.txt file and apply your patch, then will include your name (and email if you desire) as part of the SVN commit history. + +##Contributions made under a different license than the existing BSD derived license +We cannot accept contributions or patches which are subject to other licenses. + +Attribution + diff --git a/doc/ConvertingBytecodeToOpenCL.md b/doc/ConvertingBytecodeToOpenCL.md new file mode 100644 index 0000000000000000000000000000000000000000..74210bb10bebce69f644f896fb8bdd80ea39a355 --- /dev/null +++ b/doc/ConvertingBytecodeToOpenCL.md @@ -0,0 +1,282 @@ +#ConvertingBytecodeToOpenCL + +*How Aparapi converts bytecode to OpenCL Updated Aug 23, 2012 by frost.g...@gmail.com* + +##Introduction + +[try this](ByteCode2OpenCL.pdf) + +One of the unique Aparapi features is its ability to convert Java bytecode to OpenCL automatically. + +In this page we will try to describe the process used to perform this conversion. If you are unfamiliar with bytecode consider visiting this page WhatIsBytecode. + +The command + + javac Source.java + +will compile the Java source file Source.java to Source.class. + +The classfile format is well documented and we will not go into too much detail here, however it should be known that Aparapi must parse the classfile of each Kernel to extract the bytecode for Kernel.run() and any method reachable from Kernel.run(). + +Let's start with a simple Kernel.
+ + import com.amd.aparapi.Kernel; + + public class Squarer extends Kernel{ + int[] in; + int[] out; + @Override public void run(){ + int gid = getGlobalId(0); + out[gid] = in[gid] * in[gid]; + } + } + +We will compile this + + javac -g -cp path/to/aparapi/aparapi.jar Squarer.java + +and then we can look at the bytecode using javap + + javap -c -classpath path/to/aparapi/aparapi.jar;. Squarer + +Compiled from "Squarer.java" + + public class Squarer extends com.amd.aparapi.Kernel + SourceFile: "Squarer.java" + minor version: 0 + major version: 50 + Constant pool: + const #1 = Method #6.#17; // com/amd/aparapi/Kernel."<init>":()V + const #2 = Method #5.#18; // Squarer.getGlobalId:(I)I + const #3 = Field #5.#19; // Squarer.out:[I + const #4 = Field #5.#20; // Squarer.in:[I + const #5 = class #21; // Squarer + const #6 = class #22; // com/amd/aparapi/Kernel + const #7 = Asciz in; + const #8 = Asciz [I; + const #9 = Asciz out; + const #10 = Asciz <init>; + const #11 = Asciz ()V; + const #12 = Asciz Code; + const #13 = Asciz LineNumberTable; + const #14 = Asciz run; + const #15 = Asciz SourceFile; + const #16 = Asciz Squarer.java; + const #17 = NameAndType #10:#11;// "<init>":()V + const #18 = NameAndType #23:#24;// getGlobalId:(I)I + const #19 = NameAndType #9:#8;// out:[I + const #20 = NameAndType #7:#8;// in:[I + const #21 = Asciz Squarer; + const #22 = Asciz com/amd/aparapi/Kernel; + const #23 = Asciz getGlobalId; + const #24 = Asciz (I)I; + + { + int[] in; + + int[] out; + + public Squarer(); + Code: + Stack=1, Locals=1, Args_size=1 + 0: aload_0 + 1: invokespecial #1; //Method com/amd/aparapi/Kernel."<init>":()V + 4: return + + + public void run(); + Code: + Stack=5, Locals=2, Args_size=1 + 0: aload_0 + 1: iconst_0 + 2: invokevirtual #2; //Method getGlobalId:(I)I + 5: istore_1 + 6: aload_0 + 7: getfield #3; //Field out:[I + 10: iload_1 + 11: aload_0 + 12: getfield #4; //Field in:[I + 15: iload_1 + 16: iaload + 17: aload_0 + 18: getfield #4; //Field in:[I + 21: iload_1 + 22: iaload + 23: imul + 24: iastore + 25: return + } + +Here we see the constant pool of the class and the disassembled bytecode of the default constructor Squarer() and the Squarer.run() method. + +The constant pool is a table of constant values that can be accessed from the bytecode of any method within this class. Some of the constants are String literals defined within the source (or literals used to name classes, fields, methods, variables or signatures), other slots represent Classes, Methods, Fields or Type signatures. These latter constant pool entries cross-reference other constant pool entries to describe higher level artifacts. + +For example constant pool entry #1 is + + const #1 = Method #6.#17; // com/amd/aparapi/Kernel."<init>":()V + +So entry #1 defines a method. The class containing the method is defined in constant pool entry #6. So let's look at constant pool entry #6. + + const #1 = Method #6.#17; // com/amd/aparapi/Kernel."<init>":()V + + const #6 = class #22; // com/amd/aparapi/Kernel + +At constant pool entry #6 we find a class definition which refers to entry #22 + + const #1 = Method #6.#17; // com/amd/aparapi/Kernel."<init>":()V + + const #6 = class #22; // com/amd/aparapi/Kernel + + const #22 = Asciz com/amd/aparapi/Kernel; + +which just contains the String (Ascii) name of the class. + +Looking back at entry #1 again, we note that the Method also references entry #17 which contains a NameAndType entry for determining the method name and the signature.
+ + const #1 = Method #6.#17; // com/amd/aparapi/Kernel."<init>":()V + + const #6 = class #22; // com/amd/aparapi/Kernel + + + const #17 = NameAndType #10:#11;// "<init>":()V + + const #22 = Asciz com/amd/aparapi/Kernel; + +Entry #17's "NameAndType" references #10 for the method name. + + const #1 = Method #6.#17; // com/amd/aparapi/Kernel."<init>":()V + + const #6 = class #22; // com/amd/aparapi/Kernel + + const #10 = Asciz <init>; + + const #17 = NameAndType #10:#11;// "<init>":()V + + const #22 = Asciz com/amd/aparapi/Kernel; + +And then references #11 to get the signature. + + const #1 = Method #6.#17; // com/amd/aparapi/Kernel."<init>":()V + + const #6 = class #22; // com/amd/aparapi/Kernel + + const #10 = Asciz <init>; + + const #11 = Asciz ()V; + + const #17 = NameAndType #10:#11;// "<init>":()V + + const #22 = Asciz com/amd/aparapi/Kernel; + +So from constant pool entry #1 we ended up using entries 1, 6, 10, 11, 17 and 22 to fully resolve the method. + +This looks like a lot of work, however breaking method and field references up like this allows the various slots to be reused by other field/method descriptions. + +So when we see disassembled bytecode which references a constant pool slot, the actual slot # (2 in the example below) will appear after the bytecode for invokevirtual. + + 2: invokevirtual #2; Method getGlobalId:(I)I + +Bytecode is basically able to access three things: + +1. Constant pool entries +2. Variable slots +3. Stack operands + +Instructions are able to pop operands from the stack, push operands to the stack, load values from variable slots (to the stack), store values (from the stack) to variable slots, store values from accessed fields (to the stack) and call methods (popping args from the stack). + +Some instructions can only handle specific types (int, float, double, and object instances - arrays are special forms of objects) and usually the first character of the instruction helps determine which type the instruction acts upon. So imul would be a multiply instruction that operates on integers, fmul would multiply two floats, dmul two doubles. Instructions that begin with 'a' operate on object instances. + +So let's look at the first instruction. + + 0: aload_0 + +This instruction loads an object (a is the first character) from variable slot 0 (we'll come back to the variable slots in a moment) and pushes it on the stack. + +Variables are held in 'slots' that are reserved at compile time. + +Consider this static method. + + static int squareMe(int value){ + value += value; + return(value); + } + +This method requires one variable slot. At any one time there is only one variable that is live, it just happens to be an argument to the method. + +The following method also requires only one slot. + + static int squareMe(){ + int value=4; + value += value; + return(value); + } + +Here we need two slots + + static int squareMe(int arg){ + int value=arg*arg; + return(value); + } + +Surprisingly the following also only requires two slots. + + static int squareMe(int arg){ + { + int temp = arg*arg; + } + int value=arg*arg; + return(value); + } + +Note that in the above example the temp variable loses scope before the local variable value is used. So only two slots are required; temp and value can share a slot. + +If we have an instance method we always require one extra slot (always slot 0) for the this reference. + +So + + int squareMe(int arg){ + int value=arg*arg; + return(value); + } + +requires three slots.
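+ +Laid out explicitly, the slots for that last method would be: + + slot 0 : this (the implicit instance reference) + slot 1 : arg + slot 2 : value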
+ +Anyway back to our bytecode + + 0: aload_0 + +This loads the object instance in slot 0 (this) and pushes it on the stack. + +Next we have + + 1: iconst_0 + +which pushes the int constant 0 on the stack. So the stack contains {this,0} + +Next we have + + 2: invokevirtual #2; //Method getGlobalId:(I)I + +This is the bytecode for calling a method. Basically the instruction itself references the constant pool (we'll come back to this ;) ) and pulls in the method description from constant pool entry #2, which happens to be the description for a method called `getGlobalId()` which takes an integer and returns an `int`. + +So the VM will pop the top value `(int - const 0)` as the method arg, and then will pop an object reference (this!) and will call the method `this.getGlobalId(0)` and will push the result (an int) back on the stack. + +So our stack which contained `{this,0}` now contains the result of this.getGlobalId(0); let's assume it is {0}. We describe this invoke instruction as consuming two operands from the stack and producing one. + +Before we start executing, our stack is empty {} and the slots are initialized with 'this' (if an instance method) and any arguments passed to the method. + + 0 1 + slots=[this, ? ] stack={} + + 0 1 + 0: aload_0 slots=[this, ? ] stack={this} + 0 1 + 1: iconst_0 slots=[this, ? ] stack={this, 0} + 0 1 + 2: invokevirtual #2; Method getGlobalId:(I)I slots=[this, ? ] stack={result of this.getGlobalId(0), let's say 0} + + 5: istore_1 slots=[this, 0 ] stack={} + + 6: aload_0 slots=[this, 0 ] stack={this} + + 7: getfield #3; //Field out:[I diff --git a/doc/DevelopersGuide.md b/doc/DevelopersGuide.md new file mode 100644 index 0000000000000000000000000000000000000000..350226f8b262c3aacb631debc0b2042769d92d5c --- /dev/null +++ b/doc/DevelopersGuide.md @@ -0,0 +1,29 @@ +#DevelopersGuide +*Aparapi developers guide. Updated Sep 13, 2011 by frost.g...@gmail.com* +##Developer Guide +Although the vast majority of the Aparapi code is Java® we do include some C++ code (accessed from Java™ via JNI) to interface with existing OpenCL™ C/C++ headers and libraries. Therefore to build Aparapi for a given platform (Microsoft® Windows® 32- or 64-bit and/or Linux® 32- or 64-bit) we do require developers to set up a build environment containing both Java® and C++ development tools. In this documentation we will describe the tools required to build Aparapi for the various supported platforms. + +##Supported Platforms +In general Aparapi can be used on any platform currently supported by AMD APP SDK v2.5 or later. Please check the AMD APP SDK site for details on supported platforms and installation help. + +[http://developer.amd.com/sdks/amdappsdk/downloads/pages/default.aspx](http://developer.amd.com/sdks/amdappsdk/downloads/pages/default.aspx) + +[http://developer.amd.com/sdks/AMDAPPSDK/assets/AMD_APP_SDK_Installation_Notes.pdf](http://developer.amd.com/sdks/amdappsdk/downloads/pages/default.aspx) + +* 32-bit Microsoft® Windows® 7 +* 32-bit Microsoft® Windows Vista® +* 64-bit Microsoft® Windows® 7 +* 64-bit Microsoft® Windows Vista® +* 32-bit Linux® +* 64-bit Linux® + +Clearly we will also depend on platform specific Oracle® Java® JDK 6 components and C++ compilers, along with some platform neutral tools (such as SVN, Ant and JUnit). + +##Platform Specific Developer Guides +We have broken the Developer Guide into two separate docs: one for Linux® (32- and 64-bit) and another for Microsoft® Windows® (32- and 64-bit). Please follow the appropriate link below.
+
+[DevelopersGuideLinux](DevelopersGuideLinux.md)
+
+[DevelopersGuideWindows](DevelopersGuideWindows.md)
+
+[Attribution](Attribution.md)
\ No newline at end of file
diff --git a/doc/DevelopersGuideLinux.md b/doc/DevelopersGuideLinux.md
new file mode 100644
index 0000000000000000000000000000000000000000..c14da3a23fa09e6a72c55196b9937da13321e274
--- /dev/null
+++ b/doc/DevelopersGuideLinux.md
@@ -0,0 +1,181 @@
+#DevelopersGuideLinux
+
+*Developer guide for Linux. Updated Aug 23, 2012 by frost.g...@gmail.com*
+
+#Aparapi Developer Guide: Linux® 32- and 64-bit platforms
+
+##SVN Client
+
+To contribute to Aparapi you will need an SVN client to access the latest source code. This page lists a number of SVN client providers: [http://subversion.apache.org/packages.html](http://subversion.apache.org/packages.html). You might also want to consider one of the SVN-based plugins for Eclipse®: http://wiki.eclipse.org/SVN_Howto
+
+##OpenJDK or Oracle® Java JDK install (JDK1.6 or later)
+
+http://OpenJDK.java.net http://www.oracle.com/technetwork/java/javase/downloads/index.html
+
+Many Linux® distributions come with a Java JDK pre-installed or available as an optional install component. Sometimes the version that comes pre-installed is GCJ (http://gcc.gnu.org/java/). For Aparapi you will need to ensure that you have a copy of the JDK from either the OpenJDK project or from Oracle®.
+
+The Oracle® J2SE JDK site contains downloads and documentation showing how to install for various Linux distributions.
+
+http://www.oracle.com/technetwork/java/javase/index-137561.html
+
+Here is an example for my Ubuntu system:
+
+    $ sudo apt-get install sun-java6-jdk sun-java6-jre
+
+When the installation is complete, ensure that your JAVA_HOME environment variable is pointing to the install location (such as /usr/lib/jvm/java-6-sun-1.6.0.26).
+
+    $ export JAVA_HOME=/usr/lib/jvm/java-6-sun-1.6.0.26
+
+You should also add ${JAVA_HOME}/bin to your path.
+
+    $ export PATH=${PATH}:${JAVA_HOME}/bin
+
+Double-check your path and ensure that there is not another JDK/JRE in your path.
+
+    $ java -version
+    java version "1.6.0_26"
+    Java(TM) SE Runtime Environment (build 1.6.0_26-b03)
+    Java HotSpot(TM) Client VM (build 20.1-b02, mixed mode, sharing)
+
+##Apache Ant
+
+Apache Ant™ can be downloaded from the apache project page http://ant.apache.org
+
+Aparapi has been tested using the 1.7.1 version of Ant. It may work with earlier versions, but if you encounter issues we recommend updating to at least 1.7.1 before reporting them. Here is an example for installing Ant on Ubuntu:
+
+    $ apt-get install ant
+
+Ensure that ANT_HOME is set to the install dir.
+
+    $ export ANT_HOME=/usr/local/ant
+
+Add `${ANT_HOME}/bin` to your path.
+
+    $ export PATH=${PATH}:${ANT_HOME}/bin
+
+Double-check the installation and environment vars.
+
+    $ ant -version
+    Apache Ant version 1.7.1 compiled ...
+
+##AMD APP SDK
+
+To compile the Aparapi JNI code you need access to OpenCL headers and libraries. The instructions below assume that there is an available AMD APP SDK v2.5® (or later) installed and that your platform supports the required device drivers for your GPU card. Install the Catalyst driver first, and then install AMD APP SDK v2.5® or later.
+
+See http://developer.amd.com/sdks/AMDAPPSDK/pages/DriverCompatibility.aspx for help locating the appropriate driver for your AMD card. Make sure you install the Catalyst driver that includes the OpenCL™ runtime components.
+
+    The OpenCL™ runtime is required for executing Aparapi or OpenCL™ on your CPU or GPU, but it is not necessary for building/compiling Aparapi.
+    The AMD APP SDK v2.5 is necessary for compiling the Aparapi JNI code against OpenCL™ APIs.
+
+Once you have a suitable driver, download a copy of AMD APP SDK v2.5 or later from http://developer.amd.com/sdks/AMDAPPSDK/downloads/Pages/default.aspx.
+
+Download the installation guide for Microsoft® Windows® (and Linux®) from http://developer.amd.com/sdks/AMDAPPSDK/assets/AMD_APP_SDK_Installation_Notes.pdf. Note that if you are updating from a previous version of AMD APP SDK (or its predecessor ATI STREAM SDK), first uninstall the previous version.
+
+Download the release notes from: http://developer.amd.com/sdks/AMDAPPSDK/assets/AMD_APP_SDK_Release_Notes_Developer.pdf
+
+##GCC compiler (g++) for your Linux 32-bit or 64-bit platform
+
+Aparapi has been tested with 32-bit and 64-bit Linux GCC 4.1.2 or later compilers.
+
+Ensure you have the g++ toolchain installed:
+
+    $ g++
+    no input files
+
+##JUnit
+
+The initial Open Source drop includes a suite of JUnit tests for validating bytecode to OpenCL™ code generation. These tests require JUnit 4.
+
+Download JUnit from http://www.junit.org/ and note the location of your JUnit installation; the location is needed to configure the test/codegen/build.xml file. Please see the UnitTestGuide page.
+
+##Eclipse
+
+Eclipse is not required to build Aparapi; however, the developers of Aparapi do use Eclipse and have made the Eclipse artifacts (.classpath and .project files) available so that projects can be imported into Eclipse. The com.amd.aparapi.jni subproject (containing C++ JNI source) should be imported as a resource project. We do not recommend importing com.amd.aparapi.jni as a CDT project, and we do not recommend trying to configure a CDT build; the existing build.xml files have been customized for multiplatform C++ compilation.
+
+##Building
+
+Check out the Aparapi SVN trunk:
+
+    $ svn checkout http://aparapi.googlecode.com/svn/trunk aparapi
+
+Checkout provides the following:
+
+    aparapi/
+       com.amd.aparapi/
+          src/java/com.amd.aparapi/*.java
+          build.xml
+       com.amd.aparapi.jni/
+          src/cpp/*.cpp
+          src/cpp/*.h
+          build.xml
+       test/
+          codegen/
+             src/java/
+                com.amd.aparapi/
+                com.amd.aparapi.test/
+             build.xml
+          runtime/
+             src/java/
+                com.amd.aparapi/
+                com.amd.aparapi.test/
+             build.xml
+       samples/
+          mandel/
+             src/java/com.amd.aparapi.samples.mandel/*.java
+             build.xml
+             mandel.sh
+             mandel.bat
+          squares/
+             src/java/com.amd.aparapi.samples.squares/*.java
+             build.xml
+             squares.sh
+             squares.bat
+          convolution/
+             src/java/com.amd.aparapi.samples.convolution/*.java
+             build.xml
+             conv.sh
+             conv.bat
+       examples/
+          nbody/
+             src/java/com.amd.aparapi.nbody/
+             build.xml
+             nbody.sh
+             nbody.bat
+       build.xml
+       README.txt
+       LICENSE.txt
+       CREDITS.txt
+
+##Sub Directories
+
+The com.amd.aparapi and com.amd.aparapi.jni subdirectories contain the source for building and using Aparapi.
+
+The ant build.xml files in each folder accept common 'clean' and 'build' targets. You can use the build.xml file at the root of the tree for two purposes:
+
+* To initiate a build of com.amd.aparapi and com.amd.aparapi.jni.
+* To create a binary 'distribution' directory and zip file. This zip file is the same as those available from the download section of the code.google.com/p/aparapi site.
+
+##Preparing for your first build
+
+Edit com.amd.aparapi.jni/build.properties and ensure that the properties are valid for your platform.
+
+View the comments in the properties file for assistance. The build.xml ant file contains some simple checks to help diagnose simple configuration errors in case something gets messed up.
+
+For Linux you should not need to edit build.xml unless your APP SDK install location differs from the default. The default for Linux® is /opt/AMDAPP:
+
+    amd.app.sdk.dir=/opt/AMDAPP
+
+Perform a build from the root directory using the following command:
+
+    $ ant clean build dist
+
+Once your build has completed you should see an additional subdirectory named dist_linux_x86 or dist_linux_x86_64 (depending on the bitness of your platform).
+
+The distribution directory contains:
+
+* aparapi.jar containing Aparapi classes for all platforms.
+* the shared library for your platform (libaparapi_x86.so or libaparapi_x86_64.so).
+* an /api subdirectory containing the 'public' javadoc for Aparapi.
+* a samples directory containing the source and binaries for the mandel and squares sample projects.
+
+The root directory also contains either dist_linux_x86_64.zip or dist_linux_x86.zip containing a compressed archive of the distribution tree.
+
+[Attribution](Attribution.md)
diff --git a/doc/DevelopersGuideWindows.md b/doc/DevelopersGuideWindows.md
new file mode 100644
index 0000000000000000000000000000000000000000..dd0c386f7329e7c4255ca028c89fea23aca145b9
--- /dev/null
+++ b/doc/DevelopersGuideWindows.md
@@ -0,0 +1,187 @@
+#DevelopersGuideWindows
+*Developers guide for Windows. Updated Aug 23, 2012 by frost.g...@gmail.com*
+
+##Aparapi Developer Guide: Microsoft® Windows® 32- and 64-bit platforms
+
+##SVN Client
+
+To contribute to Aparapi you will need an SVN client to access the latest source code.
+
+This page lists a number of SVN client providers: http://subversion.apache.org/packages.html
+
+For Microsoft Windows® users TortoiseSVN incorporates SVN functionality directly into the Windows Explorer view and is often preferred: http://tortoisesvn.tigris.org/
+
+You might also want to consider one of the SVN-based plugins for Eclipse: http://wiki.eclipse.org/SVN_Howto
+
+##Oracle® Java JDK install (JDK1.6 or later)
+
+http://www.oracle.com/technetwork/java/javase/downloads/index.html
+
+The Oracle® J2SE JDK site contains downloads and documentation showing how to install for various platforms: http://www.oracle.com/technetwork/java/javase/index-137561.html
+
+When the installation is complete, ensure that your JAVA_HOME environment variable is pointing to the install location (such as c:\progra~1\java\jdk1.6.0_26) and that %JAVA_HOME%\bin is in your path.
+
+    C:> set JAVA_HOME=c:\progra~1\java\jdk1.6.0_26
+    C:> set PATH=%PATH%;%JAVA_HOME%\bin
+
+Note that we tend to use the 8.3 form of Microsoft® Windows® path variables; this avoids us having to quote paths in scripts.
+
+Double-check your path and ensure that there is not another JDK/JRE in your path.
+
+    C:> java -version
+    java version "1.6.0_26"
+    Java(TM) SE Runtime Environment (build 1.6.0_26-b03)
+    Java HotSpot(TM) Client VM (build 20.1-b02, mixed mode, sharing)
+
+##Apache Ant
+
+Apache Ant™ can be downloaded from the apache project page http://ant.apache.org
+
+Aparapi has been tested using the 1.7.1 version of Ant. It may well work with earlier versions, but if you encounter issues we recommend updating to at least 1.7.1 before reporting them. Installation is straightforward: just unzip the ant.zip file, ensure that your `ANT_HOME` environment variable is pointing to your Ant installation and that `%ANT_HOME%\bin` is in your path.
+
+    C:> set ANT_HOME=C:\progra~1\apache\apache-ant-1.8.1
+    C:> set PATH=%PATH%;%ANT_HOME%\bin
+
+Double-check the installation and environment vars.
+
+    C:> ant -version
+    Apache Ant version 1.7.1 compiled ..
+
+##AMD APP SDK
+
+To compile the Aparapi JNI code you need access to OpenCL headers and libraries. The instructions below assume that there is an available AMD APP SDK v2.5 (or later) installed and that your platform supports the required device drivers for your GPU card. Install the Catalyst driver first, and then install AMD APP SDK v2.5.
+
+See http://developer.amd.com/sdks/AMDAPPSDK/pages/DriverCompatibility.aspx for help locating the appropriate driver for your AMD card. Be sure you obtain the Catalyst driver that includes the OpenCL™ runtime components.
+
+    The OpenCL™ runtime is required for executing Aparapi or OpenCL™ on your CPU or GPU, but it is not necessary for building/compiling Aparapi.
+    The AMD APP SDK v2.5 is necessary for compiling the Aparapi JNI code against OpenCL™ APIs.
+
+Once you have a suitable driver, download a copy of AMD APP SDK v2.5 from http://developer.amd.com/sdks/AMDAPPSDK/downloads/Pages/default.aspx.
+
+Download the installation guide for Microsoft® Windows® (and Linux®) from http://developer.amd.com/sdks/AMDAPPSDK/assets/AMD_APP_SDK_Installation_Notes.pdf. Note that if you are updating from a previous version of AMD APP SDK (or its predecessor ATI STREAM SDK), first uninstall the previous version. The release notes are available here: http://developer.amd.com/sdks/AMDAPPSDK/assets/AMD_APP_SDK_Release_Notes_Developer.pdf
+
+##A C++ compiler
+
+For Microsoft® Windows® platforms the JNI build can use either the Microsoft® Visual Studio® 2008 or 2010 compiler or MinGW (Minimalist GNU for Windows). Now that Visual Studio® Express is available for free, we recommend using Visual Studio®. If you wish to use another compiler you will have to tweak the com.amd.aparapi.jni/build.xml file to get your compiler to work.
+
+##Microsoft® Visual Studio® 2008/2010 for 32-bit or 64-bit platforms
+
+Aparapi has been tested with various versions of Microsoft® Visual Studio® 2008 and 2010, including Enterprise, Professional and Express editions. If you encounter any version-specific issues please let us know so we can address them and/or update this documentation.
+
+If you already have Microsoft® Visual Studio® installed you will need to know the location of the compiler and the SDK. These can vary depending upon the platform and version you are using. Typically an install results in a Visual Studio install such as:
+
+    c:\Program Files\Microsoft Visual Studio 9.0
+
+And an SDK such as:
+
+    c:\Program Files\Microsoft SDKs\Windows\v6.0A
+
+Note the location of both of these as this information will be needed to configure the com.amd.aparapi.jni\build.properties file (later).
+
+##For Visual Studio Express 64-bit users
+
+Visual Studio® Express does not include the 64-bit compiler or libraries. You will also need to install the SDK from Microsoft; this link should help.
+
+##MinGW (Minimalist GNU for Windows)
+
+As an alternative to installing Microsoft® Visual Studio® we have included support for the MinGW tool chain, and Aparapi has been (minimally) tested with this compiler.
+
+MinGW can be downloaded from http://www.mingw.org/ by following the instructions on their Getting Started page. We recommend installing the mingw-get-inst msi installer and just taking the defaults.
+
+Note the install location, as this information will be needed to edit the build.xml file and uncomment the line referencing the MinGW install dir. Typically the install location is
+
+    C:\MinGW
+
+After a successful build, you will need to ensure that the bin subdirectory is in your path before attempting to run an Aparapi enabled application built using MinGW. MinGW apps require access to the MinGW/GNU C++/C runtime at execution time.
+
+    set PATH=%PATH%;C:\MinGW\bin
+
+This is one reason the binary distribution is *not* built using MinGW.
+
+##JUnit
+
+The initial Open Source drop includes a suite of JUnit tests for validating bytecode to OpenCL code generation. These tests require JUnit 4.
+
+Download JUnit from http://www.junit.org/
+
+Note the location of your JUnit installation; the location is needed to configure the test\codegen\build.xml file. See the UnitTestGuide page for how to configure the JUnit build.
+
+##Eclipse
+
+Eclipse is not required to build Aparapi; however, the developers of Aparapi do use Eclipse and have made the Eclipse artifacts (.classpath and .project files) available so that projects can be imported into Eclipse.
+
+The com.amd.aparapi.jni subproject (containing C++ JNI source) should be imported as a resource project. We do not recommend importing com.amd.aparapi.jni as a CDT project, and we do not recommend trying to configure a CDT build; the existing build.xml files have been customized for multiplatform C++ compilation.
+
+##Building
+
+Check out the Aparapi SVN trunk:
+
+    svn checkout http://aparapi.googlecode.com/svn/trunk
+
+You will end up with the following files/directories:
+
+    aparapi/
+       com.amd.aparapi/
+          src/java/com.amd.aparapi/*.java
+          build.xml
+       com.amd.aparapi.jni/
+          src/cpp/*.cpp
+          src/cpp/*.h
+          build.xml
+       test/
+          codegen/
+             src/java/
+                com.amd.aparapi/
+                com.amd.aparapi.test/
+             build.xml
+          runtime/
+             src/java/
+                com.amd.aparapi/
+                com.amd.aparapi.test/
+             build.xml
+       samples/
+          mandel/
+             src/java/com.amd.aparapi.samples.mandel/*.java
+             build.xml
+             mandel.sh
+             mandel.bat
+          squares/
+             src/java/com.amd.aparapi.samples.squares/*.java
+             build.xml
+             squares.sh
+             squares.bat
+          convolution/
+             src/java/com.amd.aparapi.samples.convolution/*.java
+             build.xml
+             conv.sh
+             conv.bat
+       examples/
+          nbody/
+             src/java/com.amd.aparapi.nbody/
+             build.xml
+             nbody.sh
+             nbody.bat
+       build.xml
+       README.txt
+       LICENSE.txt
+       CREDITS.txt
+
+##Sub Directories
+
+The com.amd.aparapi and com.amd.aparapi.jni subdirectories contain the source for building and using Aparapi.
+
+The ant build.xml files in each folder accept 'clean' and 'build' targets.
+
+Use the build.xml file at the root of the tree for two purposes:
+
+* To initiate a build of com.amd.aparapi and com.amd.aparapi.jni.
+* To create a binary distribution directory and zip file. This zip file is the same as those available from the download section of the code.google.com/p/aparapi site.
+
+##Preparing for your first build
+
+You should only need to edit the com.amd.aparapi.jni\build.xml file if you wish to use MinGW or if your Visual Studio or gcc compiler is in an unusual place.
+
+Perform a build from the root directory using the following command:
+
+    C:> ant clean dist
+
+The JNI build will perform some simple tests to check the configuration properties and hopefully also guide you to a possible solution.
+
+Once your build has completed you should see an additional subdirectory named dist_windows_x86 or dist_windows_x86_64 (depending upon your platform type). The distribution directory contains:
+
+* aparapi.jar containing Aparapi classes for all platforms.
+* the shared library for your platform (aparapi_x86.dll or aparapi_x86_64.dll).
+* an /api subdirectory containing the 'public' javadoc for Aparapi.
+* a samples directory containing the source and binaries for the mandel and squares sample projects.
+
+The root directory also contains either dist_windows_x86_64.zip or dist_windows_x86.zip containing a compressed archive of the distribution tree.
+
+[Attribution](Attribution.md)
diff --git a/doc/DeviceProposal.md b/doc/DeviceProposal.md
new file mode 100644
index 0000000000000000000000000000000000000000..cb91759b5e4a08f8e380941dcba5c0c1829b4b76
--- /dev/null
+++ b/doc/DeviceProposal.md
@@ -0,0 +1,65 @@
+#DeviceProposal
+*How we might use the extension mechanism devices for general Kernel execution. Updated May 9, 2012 by frost.g...@gmail.com*
+
+At present the first GPU or CPU device (depending on the Kernel.ExecutionMode value) is chosen at execution time. This makes it easy to execute simple Kernels, but is problematic when using some advanced features (barriers, local memory) or for sizing buffers appropriately for the target device. I propose that we add APIs to allow the developer to specify exactly which device we intend to target.
+
+In the extension proposal branch we needed to expose a Device class for binding arbitrary OpenCL to a Java interface. I suggest we also use this class to query device information useful for allocating suitably sized global/local buffers, and for dispatching Kernels to specific devices.
+
+The general pattern would be that we ask Aparapi to give us a Device, probably via a Device factory method.
+
+Something like:-
+
+    Device device = Device.best();
+
+We would also offer other useful factory methods `getBestGPU(), getFirstCPU(), getJavaMultiThread(), getJavaSequential()` as well as a method to get all devices so that the developer can filter themselves.
+
+Note that as well as real OpenCL devices we also expose 'pseudo' devices such as JavaMultiThread and Sequential. We might also allow pseudo devices to group multiple devices, so getAllGPUDevices() might return a pseudo device for executing across devices.
+
+    Device chosen=null;
+    for (Device device: devices.getAll()){
+       if (device.getVendor().contains("AMD") && device.isGPU()){
+          chosen = device;
+          break;
+       }
+    }
+
+A Device can be queried `(isGPU(), isOpenCL(), isGroup(), isJava(), getOpenCLPlatform(), getMaxMemory(), getLocalSizes())` and may need to be cast to a specific type.
+
+This would allow us to configure buffers.
+
+    Device device = Device.best();
+    if (device instanceof OpenCLDevice){
+       OpenCLDevice openCLDevice = (OpenCLDevice)device;
+       char input[] = new char[(int)(openCLDevice.getMaxMemory()/4)];
+    }
+
+We can also use the Device as a factory for creating Ranges.
+
+    Range range = device.createRange2D(width, height);
+
+This allows the Range to be created with knowledge of the underlying device. So for example `device.createRange3D(1024, 1024, 1024, 16, 16, 16)` will fail if the device does not allow a local size of (16x16x16).
+
+A range created using `device.createRangeXX()` would also capture the device that created it, as if we had:
+
+    Range range = device.createRange2D(width, height);
+    // implied range.setDevice(device);
+
+This basically means that the Range locks the device that it can be used with.
+
+So when we have a Kernel:
+
+    Kernel kernel = new Kernel(){
+       @Override public void run(){
+          ...
+       }
+    };
+
+And we then use:
+
+    Device device = Device.firstGPU();
+    final char input[] = new char[(int)(((OpenCLDevice)device).getMaxMemory()/4)];
+    Kernel kernel = new Kernel(){
+       @Override public void run(){
+          // uses input[];
+       }
+    };
+    Range range = device.createRange2D(1024, 1024);
+    kernel.execute(range);
+
+We have forced execution on the first GPU. Java fallback would still be possible (should we forbid this?).
+
+    kernel.execute(Device.firstGPU().createRange2D(width, height));
diff --git a/doc/EmulatingMultipleEntrypointsUsingCurrentAPI.md b/doc/EmulatingMultipleEntrypointsUsingCurrentAPI.md
new file mode 100644
index 0000000000000000000000000000000000000000..b34051f5aadc8ba235098cbc088ce97eaa266d5d
--- /dev/null
+++ b/doc/EmulatingMultipleEntrypointsUsingCurrentAPI.md
@@ -0,0 +1,226 @@
+#EmulatingMultipleEntrypointsUsingCurrentAPI
+*How to emulate multiple entrypoints using existing Aparapi APIs. Updated Jul 30, 2012 by frost.g...@gmail.com*
+
+##Emulating Multiple Entrypoints Using Existing Aparapi APIs
+
+Until we have support for multiple entrypoints in Aparapi, there are some tricks for emulating this feature.
+
+Follow the proposal for adding multiple entrypoints on this page: [MultipleEntryPointSupportProposal](MultipleEntryPointSupportProposal.md).
+
+Suppose we wanted to create a general VectorMath kernel which might expose unary square and square root methods and binary addition and subtraction functionality. With our current API limitations we can't easily do this, but we can approximate having separate methods by passing a separate arg to dictate the 'function' that we wish to perform.
+
+    class VectorKernel extends Kernel{
+       float[] lhsOperand;
+       float[] rhsOperand;
+       float[] unaryOperand;
+       float[] result;
+       final static int FUNC_ADD =0;
+       final static int FUNC_SUB =1;
+       final static int FUNC_SQR =2;
+       final static int FUNC_SQRT =3;
+       // other functions
+       int function;
+       @Override public void run(){
+          int gid = getGlobalId(0);
+          if (function==FUNC_ADD){
+             result[gid]=lhsOperand[gid]+rhsOperand[gid];
+          }else if (function==FUNC_SUB){
+             result[gid]=lhsOperand[gid]-rhsOperand[gid];
+          }else if (function==FUNC_SQR){
+             result[gid]=unaryOperand[gid]*unaryOperand[gid];
+          }else if (function==FUNC_SQRT){
+             result[gid]=sqrt(unaryOperand[gid]);
+          }else if ....
+       }
+    }
+
+To use this for adding two vectors and then taking the sqrt of the result we would use something like:
+
+    int SIZE=1024;
+    Range range = Range.create(SIZE);
+    VectorKernel vk = new VectorKernel();
+    vk.lhsOperand = new float[SIZE];
+    vk.rhsOperand = new float[SIZE];
+    vk.unaryOperand = new float[SIZE];
+    vk.result = new float[SIZE];
+
+    // fill lhsOperand omitted
+    // fill rhsOperand omitted
+    vk.function = VectorKernel.FUNC_ADD;
+    vk.execute(range);
+    System.arraycopy(vk.result, 0, vk.unaryOperand, 0, SIZE);
+    vk.function = VectorKernel.FUNC_SQRT;
+    vk.execute(range);
+
+This approach is fairly common and I have used it successfully to perform various pipeline stages for calculating FFTs, for example. Whilst this is functional, it is not a great solution. First the API is clumsy: we have to mutate the state of the kernel instance and then re-arrange the arrays manually to chain math operations. We could of course hide all of this behind helper methods. One could imagine, for example, an implementation which exposes helper `add(lhs, rhs)` or `sqrt()` methods which hide all the nasty stuff.
+
+    class VectorKernel extends Kernel{
+       float[] lhsOperand;
+       float[] rhsOperand;
+       float[] unaryOperand;
+       float[] result;
+       final static int FUNC_ADD =0;
+       final static int FUNC_SUB =1;
+       final static int FUNC_SQR =2;
+       final static int FUNC_SQRT =3;
+       // other functions
+       int function;
+       @Override public void run(){
+          int gid = getGlobalId(0);
+          if (function==FUNC_ADD){
+             result[gid]=lhsOperand[gid]+rhsOperand[gid];
+          }else if (function==FUNC_SUB){
+             result[gid]=lhsOperand[gid]-rhsOperand[gid];
+          }else if (function==FUNC_SQR){
+             result[gid]=unaryOperand[gid]*unaryOperand[gid];
+          }else if (function==FUNC_SQRT){
+             result[gid]=sqrt(unaryOperand[gid]);
+          }else if ....
+       }
+       private void binary(int operator, float[] lhs, float[] rhs){
+          lhsOperand = lhs;
+          rhsOperand = rhs;
+          function=operator;
+          execute(lhs.length);
+       }
+       public void add(float[] lhs, float[] rhs){
+          binary(FUNC_ADD, lhs, rhs);
+       }
+
+       public void sub(float[] lhs, float[] rhs){
+          binary(FUNC_SUB, lhs, rhs);
+       }
+
+       private void binary(int operator, float[] rhs){
+          System.arraycopy(result, 0, lhsOperand, 0, result.length);
+          rhsOperand = rhs;
+          function=operator;
+          execute(lhsOperand.length);
+       }
+
+       public void add(float[] rhs){
+          binary(FUNC_ADD, rhs);
+       }
+
+       public void sub(float[] rhs){
+          binary(FUNC_SUB, rhs);
+       }
+
+       private void unary(int operator, float[] unary){
+          unaryOperand = unary;
+          function=operator;
+          execute(unaryOperand.length);
+       }
+
+       public void sqrt(float[] unary){
+          unary(FUNC_SQRT, unary);
+       }
+
+       private void unary(int operator){
+          System.arraycopy(result, 0, unaryOperand, 0, result.length);
+          function=operator;
+          execute(unaryOperand.length);
+       }
+
+       public void sqrt(){
+          unary(FUNC_SQRT);
+       }
+
+    }
+
+    VectorKernel vk = new VectorKernel(SIZE);
+    vk.add(copyLhs, copyRhs); // copies args to lhs and rhs operands,
+                              // sets function type
+                              // and executes kernel
+    vk.sqrt();                // because we have no arg,
+                              // copies result to unary operand,
+                              // sets function type
+                              // and executes kernel
+
+However there is one more objection to this approach, namely that by default it will force unnecessary buffer copies.
+
+When the bytecode for the above Kernel.run() method is analyzed, Aparapi finds bytecode reading from the lhsOperand, rhsOperand and unaryOperand arrays/buffers. Obviously at this bytecode analysis stage we can't predict which 'function type' will be used, so on every execution (Kernel.run()) Aparapi must copy all three buffers to the GPU. For binary operations this is one buffer copy wasted (the unaryOperand); for the unary operations we copy two buffers unnecessarily (lhsOperand and rhsOperand). We can of course use explicit buffer management to help us reduce these costs. Ideally we add this to our helper methods.
+
+    class VectorKernel extends Kernel{
+       float[] lhsOperand;
+       float[] rhsOperand;
+       float[] unaryOperand;
+       float[] result;
+       final static int FUNC_ADD =0;
+       final static int FUNC_SUB =1;
+       final static int FUNC_SQR =2;
+       final static int FUNC_SQRT =3;
+       // other functions
+       int function;
+       @Override public void run(){
+          int gid = getGlobalId(0);
+          if (function==FUNC_ADD){
+             result[gid]=lhsOperand[gid]+rhsOperand[gid];
+          }else if (function==FUNC_SUB){
+             result[gid]=lhsOperand[gid]-rhsOperand[gid];
+          }else if (function==FUNC_SQR){
+             result[gid]=unaryOperand[gid]*unaryOperand[gid];
+          }else if (function==FUNC_SQRT){
+             result[gid]=sqrt(unaryOperand[gid]);
+          }else if ....
+       }
+       private void binary(int operator, float[] lhs, float[] rhs){
+          lhsOperand = lhs;
+          rhsOperand = rhs;
+          function=operator;
+          put(lhsOperand).put(rhsOperand);
+          execute(lhs.length);
+          get(result);
+       }
+       public void add(float[] lhs, float[] rhs){
+          binary(FUNC_ADD, lhs, rhs);
+       }
+
+       public void sub(float[] lhs, float[] rhs){
+          binary(FUNC_SUB, lhs, rhs);
+       }
+
+       private void binary(int operator, float[] rhs){
+          System.arraycopy(result, 0, lhsOperand, 0, result.length);
+          rhsOperand = rhs;
+          function=operator;
+          put(lhsOperand).put(rhsOperand);
+          execute(lhsOperand.length);
+          get(result);
+       }
+
+       public void add(float[] rhs){
+          binary(FUNC_ADD, rhs);
+       }
+
+       public void sub(float[] rhs){
+          binary(FUNC_SUB, rhs);
+       }
+
+       private void unary(int operator, float[] unary){
+          unaryOperand = unary;
+          function=operator;
+          put(unaryOperand);
+          execute(unaryOperand.length);
+          get(result);
+       }
+
+       public void sqrt(float[] unary){
+          unary(FUNC_SQRT, unary);
+       }
+
+       private void unary(int operator){
+          System.arraycopy(result, 0, unaryOperand, 0, result.length);
+          function=operator;
+          put(unaryOperand);
+          execute(unaryOperand.length);
+          get(result);
+       }
+
+       public void sqrt(){
+          unary(FUNC_SQRT);
+       }
+
+    }
diff --git a/doc/ExplicitBufferHandling.md b/doc/ExplicitBufferHandling.md
new file mode 100644
index 0000000000000000000000000000000000000000..5f0e70112dfda3c99d87b0e9103adcb20fec8045
--- /dev/null
+++ b/doc/ExplicitBufferHandling.md
@@ -0,0 +1,220 @@
+#ExplicitBufferHandling
+*How to minimize buffer transfers. Updated Jul 24, 2012 by frost.g...@gmail.com*
+Aparapi is designed to shield the Java developer from dealing with the underlying movement of data between the OpenCL host and device. Aparapi can analyze a kernel's `run()` method and run-reachable methods to determine which primitive arrays to transfer to the GPU prior to execution, and which arrays to transfer back when the GPU execution is complete.
+
+Generally this strategy is both clean and performant. Aparapi will attempt to just do the right thing.
+
+However, occasionally the following code pattern is seen:
+
+    final int[] hugeArray = new int[HUGE];
+    Kernel kernel= new Kernel(){
+       ... // reads/writes hugeArray
+    };
+    for (int loop=0; loop <MAXLOOP; loop++){
+       kernel.execute(HUGE);
+    }
+
+This is a common pattern which unfortunately exposes an issue with Aparapi's normal buffer handling.
+
+Although Aparapi does analyze the bytecode of the `Kernel.run()` method (and any method reachable from `Kernel.run()`), Aparapi has no visibility to the call site. In the above code there is no way for Aparapi to detect that hugeArray is not modified within the for loop body. Unfortunately, Aparapi must default to being 'safe' and copy the contents of hugeArray backwards and forwards to the GPU device.
+
+Here we add comments to indicate where the unnecessary buffer transfers take place.
+
+    final int[] hugeArray = new int[HUGE];
+    Kernel kernel= new Kernel(){
+       ... // reads/writes hugeArray
+    };
+    for (int loop=0; loop <MAXLOOP; loop++){
+       // copy hugeArray to GPU
+       kernel.execute(HUGE);
+       // copy hugeArray back from the GPU
+    }
+
+In reality hugeArray only needs to be copied to the GPU once (prior to the loop) and then copied back once when the loop has terminated.
+
+Here we use comments to indicate the 'optimal' transfers.
+
+    final int[] hugeArray = new int[HUGE];
+    Kernel kernel= new Kernel(){
+       ... // reads/writes hugeArray
+    };
+    // Ideally transfer hugeArray to GPU here
+    for (int loop=0; loop <MAXLOOP; loop++){
+       kernel.execute(HUGE);
+    }
+    // Ideally transfer hugeArray back from GPU here
+
+Consider another common pattern:
+
+    final int[] hugeArray = new int[HUGE];
+    final int[] done = new int[]{0};
+    Kernel kernel= new Kernel(){
+       ... // reads/writes hugeArray and writes to done[0] when complete
+    };
+    done[0]=0;
+    while (done[0] == 0){
+       kernel.execute(HUGE);
+    }
+
+This is a common pattern in reduce stages of map-reduce type problems. Essentially the developer wants to keep executing a kernel until some condition is met. For example, this may be seen in bitonic sort implementations and various financial applications.
+
+From the code it can be seen that the kernel reads and writes the `hugeArray[]` array and uses the single item `done[]` array to indicate some form of convergence or completion.
+
+As we demonstrated above, by default Aparapi will transfer `done[]` and `hugeArray[]` to and from the GPU device each time `Kernel.execute(HUGE)` is executed.
+
+To demonstrate which buffers are being transferred, these copies are shown as comments in the following version of the code.
+
+    final int[] hugeArray = new int[HUGE];
+    final int[] done = new int[]{0};
+    Kernel kernel= new Kernel(){
+       ... // reads/writes hugeArray and writes to done[0] when complete
+    };
+    done[0]=0;
+    while (done[0] == 0){
+       // Send done[] to GPU
+       // Send hugeArray[] to GPU
+       kernel.execute(HUGE);
+       // Fetch done[] from GPU
+       // Fetch hugeArray[] from GPU
+    }
+
+Further analysis of the code reveals that `hugeArray[]` is not accessed by the loop containing the kernel execution, so (assuming the loop iterates 1000 times) Aparapi is performing 999 unnecessary transfers to the device and 999 unnecessary transfers back. Only two transfers of `hugeArray[]` are needed; one to move the initial data to the GPU and one to move it back after the loop terminates.
+
+The `done[]` array is accessed during each iteration (although never written to within the loop), so it does need to be transferred back for each return from Kernel.execute(); however, it only needs to be sent once.
+
+Clearly it is better to avoid unnecessary transfers, especially of large buffers like `hugeArray[]`.
+
+Aparapi exposes a feature which allows the developer to control these situations and explicitly manage transfers.
+
+To use this feature the developer first needs to 'turn on' explicit mode, using the `kernel.setExplicit(true)` method. Then the developer can request buffer/array transfers using either `kernel.put()` or `kernel.get()`. `Kernel.put()` forces a transfer to the GPU device and `Kernel.get()` transfers data back.
+
+The following code illustrates the use of these new explicit buffer management APIs.
+
+    final int[] hugeArray = new int[HUGE];
+    final int[] done = new int[]{0};
+    Kernel kernel= new Kernel(){
+       ... // reads/writes hugeArray and writes to done[0] when complete
+    };
+    kernel.setExplicit(true);
+    done[0]=0;
+    kernel.put(done);
+    kernel.put(hugeArray);
+    while (done[0] == 0){
+       kernel.execute(HUGE);
+       kernel.get(done);
+    }
+    kernel.get(hugeArray);
+
+Note that marking a kernel as explicit and failing to request the appropriate transfer is a programmer error.
+
+We deliberately made `Kernel.put(...)`, `Kernel.get(...)` and `Kernel.execute(range)` return an instance of the executing kernel to allow these calls to be chained. Some may find this fluent-style API more expressive.
+
+    final int[] hugeArray = new int[HUGE];
+    final int[] done = new int[]{0};
+    Kernel kernel= new Kernel(){
+       ... // reads/writes hugeArray and writes to done[0] when complete
+    };
+    kernel.setExplicit(true);
+    done[0]=0;
+    kernel.put(done).put(hugeArray);   // chained puts
+    while (done[0] == 0){
+       kernel.execute(HUGE).get(done); // chained execute and get
+    }
+    kernel.get(hugeArray);
+
+##An alternate approach for loops containing a single `kernel.execute(range)` call
+
+One variant of code which would normally suggest the use of explicit buffer management can be handled differently. For cases where `Kernel.execute(range)` is the sole statement inside a loop and where the iteration count is known prior to the first iteration, we offer an alternate (hopefully more elegant) way of minimizing buffer transfers.
+
+So for cases like:-
+
+    final int[] hugeArray = new int[HUGE];
+    Kernel kernel= new Kernel(){
+       ... // reads/writes hugeArray
+    };
+
+    for (int pass=0; pass<1000; pass++){
+       kernel.execute(HUGE);
+    }
+
+The developer can request that Aparapi perform the outer loop rather than coding the loop. This is achieved by passing the iteration count as the second argument to `Kernel.execute(range, iterations)`.
+
+Now any form of code that looks like:-
+
+    int range = 1024;
+    int loopCount = 64;
+    for (int passId = 0; passId < loopCount; passId++){
+       kernel.execute(range);
+    }
+
+can be replaced with
+
+    int range = 1024;
+    int loopCount = 64;
+
+    kernel.execute(range, loopCount);
+
+Not only does this make the code more compact and avoid the use of explicit buffer management APIs, it also gives Aparapi visibility of the complete loop, so that Aparapi can minimize the number of transfers. Aparapi will only transfer buffers to the GPU once and transfer them back once, resulting in improved performance.
+
+Sometimes kernel code using this loop pattern needs to track the current iteration number as the code passes through the outer loop. Previously we would be forced to use explicit buffer management to allow the kernel to do this.
+
+The code for this would have looked something like:
+
+    int range = 1024;
+    int loopCount = 64;
+    final int[] hugeArray = new int[HUGE];
+    final int[] passId = new int[]{0};
+    Kernel kernel = new Kernel(){
+       @Override public void run(){
+          int id=getGlobalId();
+          if (passId[0] == 0){
+             // perform some initialization!
+          }
+          ... // reads/writes hugeArray
+       }
+    };
+    kernel.setExplicit(true);
+    kernel.put(hugeArray);
+    for (passId[0]=0; passId[0]<loopCount; passId[0]++){
+       kernel.put(passId).execute(range);
+    }
+
+In the current version of Aparapi we added `Kernel.getPassId()` to allow a Kernel to determine the current 'pass' through the outer loop without having to use explicit buffer management.
+
+So the previous code can now be written without any explicit buffer management APIs:-
+
+    final int[] hugeArray = new int[HUGE];
+    Kernel kernel = new Kernel(){
+       @Override public void run(){
+          int id = getGlobalId();
+          int pass = getPassId();
+          if (pass == 0){
+             // perform some initialization!
+          }
+          ... // reads/writes hugeArray
+       }
+    };
+
+    kernel.execute(HUGE, 1000);
+
+One common use for Kernel.getPassId() is to avoid flipping buffers in the outer loop.
+
+It is common for kernels to process data from one buffer to another, and in the next invocation process the data back the other way. Now these kernels can use the passId (odd or even) to determine the direction of data transfer.
+
+    final int[] arr1 = new int[HUGE];
+    final int[] arr2 = new int[HUGE];
+    Kernel kernel = new Kernel(){
+       int f(int v){ … }
+
+       @Override public void run(){
+          int id = getGlobalId();
+          int pass = getPassId();
+          if (pass % 2 == 0){
+             arr1[id] = f(arr2[id]);
+          }else{
+             arr2[id] = f(arr1[id]);
+          }
+       }
+    };
+
+    kernel.execute(HUGE, 1000);
\ No newline at end of file
diff --git a/doc/FrequentlyAskedQuestions.md b/doc/FrequentlyAskedQuestions.md
new file mode 100644
index 0000000000000000000000000000000000000000..4092dff8f4fc173fdf75bd7fd08c8ab9708ae904
--- /dev/null
+++ b/doc/FrequentlyAskedQuestions.md
@@ -0,0 +1,134 @@
+#FrequentlyAskedQuestions
+*Frequently Asked Questions. Updated Oct 17, 2012 by frost.g...@gmail.com*
+
+##Frequently Asked Questions
+
+##Why is this project called Aparapi and how is it pronounced?
+
+Aparapi is just a contraction of A PAR{allel} API and is pronounced (ap-per-rap-ee).
+
+##Does Aparapi only work with AMD graphics cards?
+
+No. Aparapi has been tested with AMD's OpenCL enabled drivers and devices as well as a limited set of NVIDIA devices and drivers on Windows, Linux and Mac OSX platforms. The minimal requirement at runtime is OpenCL 1.1. If you have a compatible OpenCL 1.1 runtime and supported devices, Aparapi should work.
+
+Although the build is currently configured for the AMD APP SDK, OpenCL is an open standard and we look forward to contributions which will allow Aparapi to be built against other OpenCL SDKs.
+
+Note that DLLs built using the AMD APP SDK will work on other platforms at runtime, so the binary builds are expected to work on all OpenCL 1.1 platforms.
+
+Witold Bolt has kindly supplied the patches to allow Mac OS support. The Mac OS build will run against OpenCL 1.1 and 1.0 runtimes, but we won't fix any issues reported against OpenCL 1.0; your code may or may not run.
+
+Aparapi may be used in JTP (Java Thread Pool) mode on any platform supported by Oracle®'s JDK.
+
+##Does Aparapi only support AMD CPUs?
+
+No, there is nothing restricting Aparapi to AMD CPUs. The JNI code that we use may run on any x86/x64 machine provided there is a compatible Java Virtual Machine (JVM) implementation for your platform.
+
+##Will there be an Aparapi-like translator for .NET?
+
+This is still an early technology and Aparapi is currently focused on Java® enablement. There are similar projects targeting .NET (see www.tidepowerd.com).
+
+##How can I profile the OpenCL kernels that Aparapi generates? Can I get details on the latency of my kernel request? How do I optimize my kernel?
+
+AMD offers the 'AMD APP Profiler' which can be used to profile the kernels. With Aparapi, we recommend using the command line mode of the profiler, which is described in the release notes. Using the 'AMD APP Profiler' you can see how much time is taken by each kernel execution and buffer transfer. Also, in each kernel, you can get more detailed information on things like memory reads and writes, and other useful data.
+
+##Can I have multiple threads all using the GPU compute capabilities?
+
+Yes. There might be a performance impact if the device becomes a bottleneck. However, OpenCL and your GPU driver are designed to coordinate the various threads of execution.
+
+##Can I make method calls from the run method?
+
+You can generally only make calls to other methods declared in the same class as the initial run() method. Aparapi will follow this call chain to try to determine whether it can create OpenCL.
+If, for example, Aparapi encounters System.out.println("Hello World") (a call to a method not in the user's Kernel class) it will detect this and refuse to consider the call chain as an OpenCL candidate.
+
+One exception to this rule allows a kernel to access or mutate the state of objects held in simple arrays via their setters/getters. For example a kernel can include:-
+
+    out[i].setValue(in[i].getValue()*5);
+
+##Does Aparapi support vectorized types?
+
+Due to Java's lack of vector types (float4 for example) Aparapi can't directly use them. Also, due to Java's lack of operator overloading, simulating these with Java abstractions could lead to very complex and unwieldy code.
+
+##Is there a way I can see the generated OpenCL?
+
+Yes, by adding -Dcom.amd.aparapi.enableShowGeneratedOpenCL=true to your command line when you start your JVM.
+
+##Does Aparapi support sharing buffers with JOGL? Can I exploit the features of JOGAMP/glugen?
+
+Rather than only supporting display-oriented compute, we are pursuing general data parallel compute. Therefore, we have chosen not to bind Aparapi too closely with JOGL.
+
+##What is the performance delta from handcrafted OpenCL?
+
+This depends heavily on the application. Although we can currently show a 20x performance improvement on some compute intensive Java applications compared with the same algorithm using a Java Thread Pool, a developer who is prepared to handcraft and hand-tune OpenCL and write custom host code in C/C++ is likely to see better performance than Aparapi may achieve.
+
+We understand that some users may use Aparapi as a gateway technology to test their Java code before porting to hand-crafted/tuned OpenCL.
+
+##Are you working with Project Lambda for offloading/parallelizing suitable work?
+
+We are following the progress of Project Lambda (currently scheduled for inclusion in Java 8) and would like to be able to leverage the Lambda expression format in Aparapi, but no such support exists yet.
+
+##Can I select a specific GPU if I have more than one card?
+
+Under review. At present, Aparapi just looks for the first AMD GPU (or APU) device. If the community has feedback on its preference, let us know.
+
+##Can I get the demos/samples presented at JavaOne or ADFS?
+
+The Squares and Mandelbrot sample code is included in the binary download of Aparapi. The NBody source is not included in the binary (because of a dependency on JOGL). We have, however, included the NBody code as an example project in the Open Source tree (code.google.com/p/aparapi) and provide details on how to install the appropriate JOGL components.
+
+##Can Mersenne twister be ported as a random number function inside the kernel class?
+
+You can elect to implement your own Mersenne twister and use it in your own derived Kernel.
+
+##Does Aparapi use JNI?
+
+Yes, we do ship a small JNI shim to handle the host OpenCL calls.
+
+##How can I confirm that my code is actually executing on the GPU?
+
+From within the Java code itself you can query the execution mode after Kernel.execute(n) has returned.
+
+    Kernel kernel = new Kernel(){
+       @Override public void run(){
+       }
+    };
+    kernel.execute(1024);
+    System.out.println("Execution mode = "+kernel.getExecutionMode());
+
+The above code fragment will print either 'GPU' if the kernel executed on the GPU or 'JTP' if Aparapi executed the kernel in a Java Thread Pool.
+
+Alternatively, setting the property -Dcom.amd.aparapi.enableShowExecutionModes=true when you start your JVM will cause Aparapi to automatically report the execution mode of all kernels to stdout.
+
+##Why does Aparapi need me to compile my code with -g?
+
+Aparapi extracts most of the information required to create OpenCL from the bytecode of your Kernel.run() (and run-reachable) methods. We use the debug information to re-create the original variable names and to determine local variable scope information.
+
+Of course only the derived Kernel class (or objects accessed via the new Arrays of Objects feature) needs to be compiled using -g.
+
+##Why does the Aparapi documentation suggest I use Oracle's JDK/JRE? Why can't I use any JVM/JDK?
+
+The documentation suggests using Oracle's JDK/JRE for coverage reasons and not as a requirement. AMD focused its testing on Oracle's JVM/JDK.
+
+There are two parts to this.
+
+1. Our bytecode to OpenCL engine is somewhat tuned to the bytecode structures created by the javac supplied by Oracle®. Specifically, there are some optimizations that other javac implementations might perform that Aparapi won't recognize. Eclipse (for example) does not presently use Oracle's javac, and so we do have some experience handling Eclipse-specific bytecode patterns.
+2. At runtime, we piggyback on the (aptly named) sun.misc.Unsafe class, which is included in rt.jar from Oracle®. This class is useful because it helps us avoid some JNI calls by providing low level routines for accessing object field addresses (in real memory) and useful routines for atomic operations. All accesses to sun.misc.Unsafe are handled by an Aparapi class called UnsafeWrapper with the intent that this could be refactored to avoid this dependency.
+
+##I am using a dynamic language (Clojure, Scala, Groovy, Beanshell, etc); will I be able to use Aparapi?
+
+No.
+
+To access the bytecode for a method Aparapi needs to parse the original class file. For Java code, Aparapi can use something like `YourClass.getClassLoader().getResourceAsStream(YourClass.getName()+".class")` to reload the class file bytes and parse the constant pool, attributes, fields, methods and method bytecode.
+
+It is unlikely that this process would work with a dynamically created class, based on the presumption that dynamic languages employ some form of custom classloader to make dynamically generated bytecode available to the JVM. Therefore, it is unlikely that these classloaders would yield the classfile bytes. However, we encourage contributors to investigate opportunities here. Even if the class bytes were loadable, Aparapi would also expect debug information to be available (see the previous FAQ entry). Again, this is not impossible for a dynamic language to do; indeed it would probably even be desirable, as it would allow the code to be debugged using a JDB-compatible debugger.
+
+Finally, Aparapi recognizes bytecode patterns created by the javac supplied by Oracle® and it is possible that the code generated by a particular dynamic language may not be compatible with Aparapi's current code analyzer.
+
+Therefore, at present this is unlikely to work. However, these would be excellent contributions to Aparapi. It would be great to see Aparapi being adopted by other JVM-based dynamic languages.
+
+##Why does Aparapi seem to be copying data unnecessarily back and forth between host and GPU? Can I stop Aparapi from doing this?
+
+Aparapi ensures that required data is moved to the GPU prior to kernel execution and returned to the appropriate array before Java execution resumes. Generally, this is what the Java user will expect. However, for some code patterns where multiple Kernel.execute() calls are made in succession (or more likely in a tight loop) Aparapi's approach may not be optimal.
+
+In the NewFeatures page we discuss a couple of Aparapi enhancements which allow developers to intervene to reduce unnecessary copies.
+
+##Do I have to refactor my code to use arrays of primitives? Why can't Aparapi just work with Java Objects?
+
+Aparapi creates OpenCL from the bytecode. Generally, OpenCL constrains us to using parallel primitive arrays (OpenCL does indeed allow structs, but Java and OpenCL do not have comparable memory layouts for these structures). Therefore, you will probably need to refactor your code to use primitive arrays. In this initial contribution, we have included some limited support for arrays of simple Objects and hope contributors will extend it. Check the NewFeatures page which shows how you can use this feature.
\ No newline at end of file
diff --git a/doc/HSAEnablementOfLambdaBranch.md b/doc/HSAEnablementOfLambdaBranch.md
new file mode 100644
index 0000000000000000000000000000000000000000..15e7fe9c1b71cee17b796a38ab8a95688cafa2c0
--- /dev/null
+++ b/doc/HSAEnablementOfLambdaBranch.md
@@ -0,0 +1,32 @@
+#HSAEnablementOfLambdaBranch
+*Adding HSA Support to the Aparapi lambda branch. Updated Feb 28, 2014 by frost.g...@gmail.com*
+
+* [How to setup a HSA enabled Linux Platform](SettingUpLinuxHSAMachineForAparapi.md)
+* [How to setup a HSA simulator on a Linux Platform](UsingAparapiLambdaBranchWithHSASimulator.md)
+
+Recently the HSA Foundation released their 'Programmers Reference Manual'. This manual is for developers wishing to write code for upcoming HSA compatible devices; it describes the HSA Intermediate Language (HSAIL) along with its binary form (BRIG) and explains how code is expected to execute on HSA enabled devices.
+
+In many ways we can think of HSAIL as we do Java bytecode. It is a common intermediate form that can be optimized at runtime to execute across a variety of future heterogeneous platforms. HSAIL will greatly simplify the development of software taking advantage of both sequential and parallel compute solutions.
+
+Now that the spec is out, we have started adding HSA support to the Aparapi lambda branch. We believe that HSA combined with the upcoming Java 8 lambda feature will be a natural way to express parallel algorithms which can be executed on the GPU via HSA.
+
+A HSA+Lambda enabled Aparapi will remove many of Aparapi's constraints. HSA allows all of the CPU's memory to be accessed directly from code running on the GPU. This means:
+
+* We no longer need to move data from the host CPU to the GPU.
+* We are no longer limited to the memory addressable from the GPU.
+* We can access multi-dim arrays efficiently.
+* We can access Java objects directly from the GPU.
+
+These are all substantial benefits.
+
+In the existing code (early prototype) we provide access to HSA as a specific device type.
+
+So our ubiquitous 'squares' example will initially be written as:
+
+    int in[] = ..//
+    int out[] = .../
+    Device.hsa().forEach(in.length, (i)->{
+       out[i] = in[i]*in[i];
+    });
+
+You will obviously need a Java 8 compatible JDK ([https://jdk8.java.net/download.html](https://jdk8.java.net/download.html)) in your path.
+
+We also recommend using IntelliJ, which has preliminary support for Java 8 lambda features. You can download the community edition of IntelliJ from [http://www.jetbrains.com/idea/](http://www.jetbrains.com/idea/)
\ No newline at end of file
diff --git a/doc/HSAEnablementOfLambdaBranchSidebar.md b/doc/HSAEnablementOfLambdaBranchSidebar.md
new file mode 100644
index 0000000000000000000000000000000000000000..3275452280bd0f065af3f2f1e9e0291927ba490f
--- /dev/null
+++ b/doc/HSAEnablementOfLambdaBranchSidebar.md
@@ -0,0 +1,6 @@
+#HSAEnablementOfLambdaBranchSidebar
+*Sidebar for HSAEnablementOfLambdaBranchAparapi*
+
+[How to setup a HSA enabled Linux Platform](SettingUpLinuxHSAMachineForAparapi.md)
+
+[How to setup a HSA simulator on a Linux Platform](UsingAparapiLambdaBranchWithHSASimulator.md)
diff --git a/doc/HowToAddUML.md b/doc/HowToAddUML.md
new file mode 100644
index 0000000000000000000000000000000000000000..8c1c7f4997689f6f40a664e7bfd67b22e2726c8b
--- /dev/null
+++ b/doc/HowToAddUML.md
@@ -0,0 +1,39 @@
+#HowToAddUML
+*How to add plantuml docs to wiki pages. Updated Apr 20, 2013 by frost.g...@gmail.com*
+
+Go to http://www.plantuml.com/plantuml and type in the text for your diagram.
+
+Hit submit and check out the diagram.
+
+Once you are happy with something like
+
+    start
+    :kernel.execute(range);
+    if (?) then (first call for this instance)
+       : Convert Kernel.run() to OpenCL;
+       note
+          We also convert all
+          methods reachable from
+          kernel.run()
+       end note
+       if (?) then (Conversion was successful)
+          : Compile OpenCL;
+          : Map compiled OpenCL to this Kernel;
+       else (Conversion unsuccessful)
+       endif
+    else (not first call)
+    endif
+    if (?) then (OpenCL mapped for this instance)
+       : Bind args (send to GPU);
+       : Execute kernel;
+    else (false)
+       : Execute using a Java Thread Pool;
+    endif
+    stop
+
+paste the resulting URL into the wiki page but append %20as.png at the end of the URL:
+
+http://www.plantuml.com:80/plantuml/img/BLAHBLAH%20as.png
+
+To get this!
+
+ 
\ No newline at end of file
diff --git a/doc/JavaKernelGuidelines.md b/doc/JavaKernelGuidelines.md
new file mode 100644
index 0000000000000000000000000000000000000000..89ab38dc789f31b1acf1f474c87b152a4ffbe4a6
--- /dev/null
+++ b/doc/JavaKernelGuidelines.md
@@ -0,0 +1,72 @@
+#JavaKernelGuidelines
+*What code can and can't be converted to OpenCL by Aparapi. Updated Sep 13, 2011 by frost.g...@gmail.com*
+##Aparapi Java Kernel Guidelines
+Certain practices can improve the chances of your Java kernel being converted to OpenCL and executing on a GPU.
+
+The following guidelines/restrictions only apply to the Kernel.run() method and any method reachable from run() (called "run-reachable methods" in this documentation); clearly any methods executed via a normal Java execution path will not be subject to these restrictions.
+
+Some restrictions/guidelines may be removed or augmented in future Aparapi releases.
+
+##Data Types
+* Only the Java primitive data types boolean, byte, short, int, long, and float and one-dimensional arrays of these primitive data types are supported by Aparapi.
+* Aparapi support for the primitive data type double will depend on your graphics card, driver, and OpenCL version. Aparapi will query the device/platform to determine if double is supported (at runtime). If your platform does not support double, Aparapi will drop back to Java Thread Pool (JTP) mode.
+* The primitive data type char is not supported.
+
+##Fields
+* Elements of primitive array fields can be read from kernel code.
+* Elements of primitive array fields can be written to by kernel code.
+* Note that Java creates 'hidden' fields for captured final primitive arrays (from anonymous inner classes) and they can be accessed as if they were fields of the kernel.
+* Primitive scalar fields can only be read by kernel code. Because kernel run-reachable methods execute in parallel in an indeterminate order, any reliance on the result of modifications to primitive scalar fields is discouraged, even when executing in Java Thread Pool mode.
+* Static final fields can be read from kernel code.
+* Static non-final fields are not supported for either read or write. Try to make them final.
+
+##Arrays
+* Only one-dimensional arrays are supported.
+* Arrays cannot be aliased, either by direct local assignment or by passed arguments to other methods.
+* Java 5's extended 'for' syntax for (int i: arrayOfInt){} is not supported, because it causes a shallow copy of the original array under the covers.
+
+##Methods
+* References to or through a Java Object other than your kernel instance will cause Aparapi to abandon attempting to create OpenCL (note the following exceptions).
+* There are a few very specific exceptions to the above rule to allow accesses through getters/setters of objects held in arrays of objects referenced from the kernel code.
+* Static methods are not supported by Aparapi.
+* Recursion is not supported, whether direct or indirect. Aparapi tries to detect this recursion statically, but the developer should not rely on Aparapi to do so.
+* Methods with varargs argument lists are not supported by Aparapi.
+* Overloaded methods (i.e. methods with the same name but different signatures) are not supported by Aparapi. OpenCL is C99 based, so we are constrained by OpenCL's lack of support for overloading.
+* The Kernel base class contains wrappers around most of the functions offered by java.lang.Math. When run in a thread pool these wrappers delegate back to java.lang.Math; when executing in OpenCL they translate to OpenCL equivalents.
+
+##Other Restrictions
+
+* Exceptions are not supported (no throw, catch, or finally).
+* new is not supported, either for arrays or for objects.
+* Synchronized blocks and synchronized methods are not supported.
+* Only simple loops and conditionals are supported; switch, break, and continue are not supported.
+* A variable cannot have its first assignment be the side effect of an expression evaluation or a method call. For example, the following will not be translated to run on the GPU:
+
+        int foo(int a) {
+           // . . .
+        }
+        public void run() {
+           int z;
+           foo(z = 3);
+        }
+
+* This should be regarded as an error which needs to be addressed; as a workaround, explicitly initialize variables (even to 0) when declared.
+
+##Beware Of Side Effects
+OpenCL is C99-based and as such the result of expressions depending on side effects of other expressions can differ from what one might expect from Java; please avoid using code that assumes Java's tighter rules. Generally code should be as simple as possible.
+
+For example, although Java explicitly defines
+
+    arra[i++] = arrb[i++];
+
+to be equivalent to
+
+    arra[i] = arrb[i+1];
+    i += 2;
+
+the C99/OpenCL standard does not define this, and so the result would be undefined.
+
+##Runtime Exceptions
+* When run on the GPU, array accesses will not generate an ArrayIndexOutOfBoundsException. Instead the behavior will be unspecified.
+* When run on the GPU, ArithmeticExceptions will not be generated, for example with integer division by zero. Instead the behavior will be unspecified.
+Attribution
diff --git a/doc/LIbraryAgentDuality.md b/doc/LIbraryAgentDuality.md
new file mode 100644
index 0000000000000000000000000000000000000000..88e164e679d635bdd00aebc83d57ff88cb82637c
--- /dev/null
+++ b/doc/LIbraryAgentDuality.md
@@ -0,0 +1,28 @@
+#LIbraryAgentDuality
+*Aparapi libraries can now be loaded as JVMTI agents. Updated Jan 15, 2013 by frost.g...@gmail.com*
+
+##What are all these check-ins referring to JVMTI agents?
+
+If you have been tracking Aparapi SVN check-ins you will have noticed a bunch of changes to JNI code. I just finished arranging for the Aparapi libraries (.dll or .so) to be loadable as a JVMTI agent. Now (assuming the library is in ${APARAPI_DIR}) we can either launch using the traditional...
+
+    java -Djava.library.path=${APARAPI_DIR} -classpath ${APARAPI_DIR}/aparapi.jar;my.jar mypackage.MyClass
+
+or ...
+
+    java -agentpath:${APARAPI_DIR}/aparapi_x86_64.dll -classpath ${APARAPI_DIR}/aparapi.jar;my.jar mypackage.MyClass
+
+So the dll/so is now both 'just a library' and a JVMTI agent.
+
+##When would I need an agent?
+
+Previously Aparapi loaded classes that it needed to convert to OpenCL using java.lang.Class.getResourceAsStream(). This only works if we have a jar, or if the classes are on the filesystem somewhere. This approach will not work for 'synthetically generated classes'.
+
+There are applications/frameworks which create synthetic classes (at runtime) which would not normally be usable by Aparapi.
+
+Specifically (and significantly) Java 8 uses synthetic classes to capture args (closure captures) so they can be passed to the final lambda implementation. We needed a way to allow Aparapi to access the bytecode of any class, not just those in jars or on the disk.
+
+A JVMTI agent can register an interest in classes as they are loaded by the classloader. So when we use the Aparapi library in 'agent mode' it caches the bytes for all loaded classes (yes, we could filter by name) and puts this information in a common data structure (which should be a map but is a linked list at present).
+
+By adding a new OpenCLJNI.getBytes(String) JNI method, Aparapi can now retrieve the bytes for any loaded class out of this cache.
+
+So this, combined with our ability to parse classes which don't have line number information, should really enable Aparapi to be used with Scala/JRuby/Groovy or other dynamic scripting languages which create classes on the fly.
diff --git a/doc/MultipleEntryPointSupportProposal.md b/doc/MultipleEntryPointSupportProposal.md
new file mode 100644
index 0000000000000000000000000000000000000000..bf2d70563fcc52aea2db5e6d8008db376168e22e
--- /dev/null
+++ b/doc/MultipleEntryPointSupportProposal.md
@@ -0,0 +1,377 @@
+#MultipleEntryPointSupportProposal
+*How to extend Aparapi to allow multiple entrypoints for kernels Updated Jul 30, 2012 by frost.g...@gmail.com*
+
+##The Current Single Entrypoint World
+
+At present Aparapi only allows us to dispatch execution to a single entrypoint in a Kernel. Essentially, for each Kernel only the overridden Kernel.run() method can be used to initiate execution on the GPU.
+
+Our canonical example is the 'Squarer' Kernel, which writes the square of each element of an input array to an output array.
+
+    Kernel squarer = new Kernel(){
+       @Override public void run(){
+          int id = getGlobalId(0);
+          out[id] = in[id] * in[id];
+       }
+    };
+
+If we wanted a vector addition Kernel we would have to create a whole new Kernel.
+
+    Kernel adder = new Kernel(){
+       @Override public void run(){
+          int id = getGlobalId(0);
+          out[id] = inA[id] + inB[id];
+       }
+    };
+
+For us to square and then add a constant we would have to invoke two kernels, or of course create a single SquarerAdder kernel.
+
+See this page EmulatingMultipleEntrypointsUsingCurrentAPI for ideas on how to emulate having multiple methods, by passing data to a single run() method.
+
+##Why can't Aparapi just allow 'arbitrary' methods?
+
+Ideally we would just expose a more natural API, one which allows us to provide specific methods for each arithmetic operation.
+
+Essentially
+
+    class VectorKernel extends Kernel{
+       public void add();
+       public void sub();
+       public void sqr();
+       public void sqrt();
+    }
+
+Unfortunately this is hard to implement using Aparapi. There are two distinct problems, both at runtime.
+
+1. How will Aparapi know which of the available methods we want to execute when we call Kernel.execute(range)?
+2. On first execution, how does Aparapi determine which methods might be entrypoints and therefore need to be converted to OpenCL?
+
+The first problem can be solved by extending Kernel.execute() to accept a method name
+
+    kernel.execute(SIZE, "add");
+
+This is the obvious solution, but it causes maintenance issues in that it trades compile-time reporting for runtime errors. If a developer mistypes the name of the method :-
+
+    kernel.execute(SIZE, "sadd"); // there is no such method
+
+The code will compile perfectly; only at runtime will we detect that there is no such method.
+##An aside
+
+Maybe the new Java 8 method reference feature might help here. In the paper below Brian Goetz talks about a double-colon syntax (Class::Method) for directly referencing a method, which is presumably checked at compile time.
+
+So presumably
+
+    kernel.execute(SIZE, VectorKernel::add);
+
+would compile just fine, whereas
+
+    kernel.execute(SIZE, VectorKernel::sadd);
+
+would yield a compile-time error.
+
+See Brian Goetz's excellent Lambda documentation
+##Back from the aside
+
+The second problem (knowing which methods need to be converted to OpenCL) can probably be solved using an Annotation.
+
+    class VectorKernel extends Kernel{
+       @EntryPoint public void add();
+       @EntryPoint public void sub();
+       @EntryPoint public void sqr();
+       @EntryPoint public void sqrt();
+       public void nonOpenCLMethod();
+    }
+
+Here the @EntryPoint annotation allows the Aparapi runtime to determine which methods need to be exposed.
+#My Extension Proposal
+
+Here is my proposal. Not only does it allow us to reference multiple entrypoints, but I think it actually improves the single entrypoint API, albeit at the cost of being more verbose.
+##The developer must provide an API interface
+
+First I propose that we should ask the developer to provide an interface for all methods that we wish to execute on the GPU (or convert to OpenCL).
+
+    interface VectorAPI extends AparapiAPI {
+       public void add(Range range);
+       public void sub(Range range);
+       public void sqrt(Range range);
+       public void sqr(Range range);
+    }
+
+Note that each API method takes a Range; this will make more sense in a moment.
+##The developer provides a bound implementation
+
+Aparapi should provide a mechanism for mapping the proposed API to its implementation.
+
+Note the weasel words here, this is not a conventional implementation of an interface. We will use an annotation (@Implements(Class class)) to provide the binding.
+
+    @Implements(VectorAPI.class) class Vector extends Kernel {
+       public void add(RangeId rangeId){/*implementation here */}
+       public void sub(RangeId rangeId){/*implementation here */}
+       public void sqrt(RangeId rangeId){/*implementation here */}
+       public void sqr(RangeId rangeId){/*implementation here */}
+       public void nonOpenCLMethod(){/*implementation here */}
+    }
+
+##Why can't the implementation just implement the interface?
+
+This would be ideal. Sadly we need to intercept a call to, say, VectorAPI.add(Range) and dispatch it to the corresponding Vector.add(RangeId) implementation. If you look at the signatures, the interface accepts a Range as its arg (the range over which we intend to execute) whereas the implementation (whether called by JTP threads or GPU OpenCL dispatch) receives a RangeId (containing the unique globalId, localId, etc fields). At the very end of this page I show a strawman sequential loop implementation.
+##So how do we get an implementation of VectorAPI
+
+We instantiate our Kernel by creating an instance using new. We then ask this instance to create an API instance. Presumably some java.lang.reflect.Proxy trickery will create an implementation of the interface, backed by the Java implementation.
+
+So execution would look something like.
+
+    Vector kernel = new Vector();
+    VectorAPI kernelApi = kernel.api();
+    Range range = Range.create(SIZE);
+    kernelApi.add(range);
+
+So the Vector instance is a pure Java implementation. The extracted API is the bridge to the GPU.
+
+Of course then we can also execute using an inline call through api()
+
+    Vector kernel = new Vector();
+    Range range = Range.create(SIZE);
+    kernel.api().add(range);
+    kernel.api().sqrt(range);
+
+or even expose api as public final fields
+
+    Vector kernel = new Vector();
+    Range range = Range.create(SIZE);
+    kernel.api.add(range);
+    kernel.api.sqrt(range);
+
+##How would our canonical Squarer example look
+
+    interface SquarerAPI extends AparapiAPI{
+       public void square(Range range);
+    }
+
+    @Implements(SquarerAPI.class) class Squarer extends Kernel{
+       int in[];
+       int square[];
+       public void square(RangeId rangeId){
+          square[rangeId.gid] = in[rangeId.gid]*in[rangeId.gid];
+       }
+    }
+
+Then we execute using
+
+    Squarer squarer = new Squarer();
+    // fill squarer.in[SIZE]
+    // create squarer.square[SIZE]
+
+    squarer.api().square(Range.create(SIZE));
+
+#Extending this proposal to allow argument passing
+
+Note that we have effectively replaced the use of the 'abstract' squarer.execute(range) with the more concrete squarer.api().square(range).
+
+Now I would like to propose that we take one more step by allowing us to pass arguments to our methods.
+
+Normally Aparapi captures buffer and field accesses to create the args that it passes to the generated OpenCL code. In our canonical squarer example the in[] and square[] buffers are captured from the bytecode and passed (behind the scenes) to the OpenCL.
+
+* **TODO: Add generated OpenCL code to show what this looks like.** *
+
+However, by exposing the actual method we want to execute, we could also allow the API to accept parameters.
+ +So our squarer example would go from + + interface SquarerAPI extends AparapiAPI{ + square(Range range); + } + + @Implement(SquarerAPI) class Squarer extends Kernel{ + int in[]; + int square[]; + public void square(RangeId rangeId){ + square[rangeId.gid] = in[rangeId.gid]*in[rangeId.gid]; + } + } + + + Squarer squarer = new Squarer(); + // fill squarer.in[SIZE] + // create squarer.values[SIZE]; + + squarer.api().square(Range.create(SIZE)); + +to + + interface SquarerAPI extends AparapiAPI{ + square(Range range, int[] in, int[] square); + } + + @Implement(SquarerAPI) class Squarer extends Kernel{ + public void square(RangeId rangeId, int[] in, int[] square){ + square[rangeId.gid] = in[rangeId.gid]*in[rangeId.gid]; + } + } + + + Squarer squarer = new Squarer(); + int[] in = // create and fill squarer.in[SIZE] + int[] square = // create squarer.values[SIZE]; + + squarer.api().square(Range.create(SIZE), in, result); + +I think that this makes Aparapi look more conventional. It also allows us to allow overloading for the first time. + + interface SquarerAPI extends AparapiAPI{ + square(Range range, int[] in, int[] square); + square(Range range, float[] in, float[] square); + } + + @Implement(SquarerAPI) class Squarer extends Kernel{ + public void square(RangeId rangeId, int[] in, int[] square){ + square[rangeId.gid] = in[rangeId.gid]*in[rangeId.gid]; + } + public void square(RangeId rangeId, float[] in, float[] square){ + square[rangeId.gid] = in[rangeId.gid]*in[rangeId.gid]; + } + } + + + Squarer squarer = new Squarer(); + int[] in = // create and fill squarer.in[SIZE] + int[] square = // create squarer.values[SIZE]; + + squarer.api().square(Range.create(SIZE), in, result); + float[] inf = // create and fill squarer.in[SIZE] + float[] squaref = // create squarer.values[SIZE]; + + squarer.api().square(Range.create(SIZE), inf, resultf); + +--- + +test harness + + import java.lang.reflect.InvocationHandler; + import java.lang.reflect.Method; + import java.lang.reflect.Proxy; + + + public class Ideal{ + + public static class OpenCLInvocationHandler<T> implements InvocationHandler { + Object instance; + OpenCLInvocationHandler(Object _instance){ + instance = _instance; + } + @Override public Object invoke(Object interfaceThis, Method interfaceMethod, Object[] interfaceArgs) throws Throwable { + Class clazz = instance.getClass(); + + Class[] argTypes = interfaceMethod.getParameterTypes(); + argTypes[0]=RangeId.class; + Method method = clazz.getDeclaredMethod(interfaceMethod.getName(), argTypes); + + + if (method == null){ + System.out.println("can't find method"); + }else{ + RangeId rangeId = new RangeId((Range)interfaceArgs[0]); + interfaceArgs[0]=rangeId; + for (rangeId.wgid = 0; rangeId.wgid <rangeId.r.width; rangeId.wgid++){ + method.invoke(instance, interfaceArgs); + } + } + + return null; + } + } + + static class Range{ + int width; + Range(int _width) { + width = _width; + } + } + + static class Range2D extends Range{ + int height; + + Range2D(int _width, int _height) { + super(_width); + height = _height; + } + } + + static class Range1DId<T extends Range>{ + Range1DId(T _r){ + r = _r; + } + T r; + + int wgid, wlid, wgsize, wlsize, wgroup; + } + + static class RangeId extends Range1DId<Range>{ + RangeId(Range r){ + super(r); + } + } + + static class Range2DId extends Range1DId<Range2D>{ + Range2DId(Range2D r){ + super(r); + } + + int hgid, hlid, hgsize, hlsize, hgroup; + } + + + + + + static <T> T create(Object _instance, Class<T> _interface) { + OpenCLInvocationHandler<T> 
+          invocationHandler = new OpenCLInvocationHandler<T>(_instance);
+          T instance = (T) Proxy.newProxyInstance(Ideal.class.getClassLoader(), new Class[] {
+                _interface,
+          }, invocationHandler);
+          return (instance);
+       }
+
+       public static class Squarer{
+          interface API {
+             public API foo(Range range, int[] in, int[] out);
+             public Squarer dispatch();
+          }
+
+          public API foo(RangeId rangeId, int[] in, int[] out) {
+             out[rangeId.wgid] = in[rangeId.wgid]*in[rangeId.wgid];
+             return(null);
+          }
+       }
+
+       /**
+        * @param args
+        */
+       public static void main(String[] args) {
+
+          Squarer.API squarer = create(new Squarer(), Squarer.API.class);
+          int[] in = new int[] { 1, 2, 3, 4, 5, 6 };
+          int[] out = new int[in.length];
+          Range range = new Range(in.length);
+
+          squarer.foo(range, in, out);
+
+          for (int s:out){
+             System.out.println(s);
+          }
+       }
+    }
diff --git a/doc/NewFeatures.md b/doc/NewFeatures.md
new file mode 100644
index 0000000000000000000000000000000000000000..4bcb8f5983bc4063edee4e84160f39c9f8b6be75
--- /dev/null
+++ b/doc/NewFeatures.md
@@ -0,0 +1,227 @@
+#NewFeatures
+*New Features added to this open source release of Aparapi. Updated Sep 14, 2011 by frost.g...@gmail.com*
+##New Features
+Aparapi has two new, especially useful features:
+
+* Explicit buffer management, for minimizing buffer transfers
+* Kernel access to objects held in arrays
+
+###Minimizing Buffer Transfers
+####Explicit Buffer Management
+Aparapi is designed to shield the Java developer from dealing with the underlying movement of data between the OpenCL host and device. Aparapi can analyze a kernel's run() method and run-reachable methods to determine which primitive arrays to transfer to the GPU prior to execution, and which arrays to transfer back when the GPU execution is complete.
+
+Generally this strategy is both clean and performant. Aparapi will attempt to just do the right thing.
+
+However, occasionally the following code pattern is seen.
+
+    final int[] hugeArray = new int[HUGE];
+    final int[] done = new int[]{0};
+    Kernel kernel= new Kernel(){
+       ... // reads/writes hugeArray and writes to done[0] when complete
+    };
+    done[0]=0;
+    while (done[0] == 0){
+       kernel.execute(HUGE);
+    }
+
+This is a common pattern in the reduce stages of map-reduce type problems. Essentially the developer wants to keep executing a kernel until some condition is met. For example, this may be seen in bitonic sort implementations and various financial applications.
+
+From the code it can be seen that the kernel reads and writes the hugeArray[] array and uses the single-item done[] array to indicate some form of convergence or completion.
+
+Unfortunately, by default Aparapi will transfer done[] and hugeArray[] to and from the GPU device each time Kernel.execute(HUGE) is executed.
+
+To demonstrate which buffers are being transferred, these copies are shown as comments in the following version of the code.
+
+    final int[] hugeArray = new int[HUGE];
+    final int[] done = new int[]{0};
+    Kernel kernel= new Kernel(){
+       ... // reads/writes hugeArray and writes to done[0] when complete
+    };
+    done[0]=0;
+    while (done[0] == 0){
+       // Send done[] to GPU
+       // Send hugeArray[] to GPU
+       kernel.execute(HUGE);
+       // Fetch done[] from GPU
+       // Fetch hugeArray[] from GPU
+    }
+
+Further analysis of the code reveals that hugeArray[] is not accessed by the loop containing the kernel execution, so Aparapi is performing 999 unnecessary transfers to the device and 999 unnecessary transfers back.
Only two transfers of hugeArray[] are needed; one to move the initial data to the GPU and one to move it back after the loop terminates.
+
+The done[] array is accessed during each iteration (although never written to within the loop), so it does need to be transferred back for each return from Kernel.execute(); however, it only needs to be sent once.
+
+Clearly it is better to avoid unnecessary transfers, especially of large buffers like hugeArray[].
+
+A new Aparapi feature allows the developer to control these situations and explicitly manage transfers.
+
+To use this feature, first set the mode to explicit using the kernel.setExplicit(true) method, and then request transfers using either kernel.put() or kernel.get(). Kernel.put() forces a transfer to the GPU device and Kernel.get() transfers data back.
+
+The following code illustrates the use of these new explicit buffer management APIs.
+
+    final int[] hugeArray = new int[HUGE];
+    final int[] done = new int[]{0};
+    Kernel kernel= new Kernel(){
+       ... // reads/writes hugeArray and writes to done[0] when complete
+    };
+    kernel.setExplicit(true);
+    done[0]=0;
+    kernel.put(done);
+    kernel.put(hugeArray);
+    while (done[0] == 0){
+       kernel.execute(HUGE);
+       kernel.get(done);
+    }
+    kernel.get(hugeArray);
+
+Note that marking a kernel as explicit and failing to request the appropriate transfer is a programmer error.
+
+We deliberately made Kernel.put(…), Kernel.get(…) and Kernel.execute(range) return an instance of the executing kernel to allow these calls to be chained. Some may find this fluent-style API more expressive.
+
+    final int[] hugeArray = new int[HUGE];
+    final int[] done = new int[]{0};
+    Kernel kernel= new Kernel(){
+       ... // reads/writes hugeArray and writes to done[0] when complete
+    };
+    kernel.setExplicit(true);
+    done[0]=0;
+    kernel.put(done).put(hugeArray);     // chained puts
+    while (done[0] == 0){
+       kernel.execute(HUGE).get(done);   // chained execute and get
+    }
+    kernel.get(hugeArray);
+
+####An alternate approach for loops containing a single kernel.execute(range) call.
+One variant of code which would normally suggest the use of explicit buffer management can be handled differently. For cases where Kernel.execute(range) is the sole statement inside a loop and where the iteration count is known prior to the first iteration, we offer an alternate (hopefully more elegant) way of minimizing buffer transfers.
+
+So for cases like:-
+
+    final int[] hugeArray = new int[HUGE];
+    Kernel kernel= new Kernel(){
+       ... // reads/writes hugeArray
+    };
+
+    for (int pass=0; pass<1000; pass++){
+       kernel.execute(HUGE);
+    }
+
+The developer can request that Aparapi perform the outer loop rather than coding the loop. This is achieved by explicitly passing the iteration count as the second argument to Kernel.execute(range, iterations).
+
+Now any form of code that looks like :-
+
+    int range=1024;
+    int loopCount=64;
+    for (int passId=0; passId<loopCount; passId++){
+       kernel.execute(range);
+    }
+
+Can be replaced with
+
+    int range=1024;
+    int loopCount=64;
+
+    kernel.execute(range, loopCount);
+
+Not only does this make the code more compact and avoid the use of explicit buffer management APIs, it also gives Aparapi visibility of the complete loop, so that Aparapi can minimize the number of transfers. Aparapi will only transfer buffers to the GPU once and transfer them back once, resulting in improved performance.
+
+Sometimes kernel code using this loop pattern needs to track the current iteration number as it passes through the outer loop.
Previously we would be forced to use explicit buffer management to allow the kernel to do this.
+
+The code for this would have looked something like
+
+    int range=1024;
+    int loopCount=64;
+    final int[] hugeArray = new int[HUGE];
+    final int[] passId = new int[]{0};
+    Kernel kernel= new Kernel(){
+       @Override public void run(){
+          int id=getGlobalId();
+          if (passId[0] == 0){
+             // perform some initialization!
+          }
+          ... // reads/writes hugeArray
+       }
+    };
+    kernel.setExplicit(true);
+    kernel.put(hugeArray);
+    for (passId[0]=0; passId[0]<loopCount; passId[0]++){
+       kernel.put(passId).execute(range);
+    }
+
+In the current version of Aparapi we added Kernel.getPassId() to allow a Kernel to determine the current 'pass' through the outer loop without having to use explicit buffer management.
+
+So the previous code can now be written without any explicit buffer management APIs:-
+
+    final int[] hugeArray = new int[HUGE];
+    Kernel kernel= new Kernel(){
+       @Override public void run(){
+          int id=getGlobalId();
+          int pass = getPassId();
+          if (pass == 0){
+             // perform some initialization!
+          }
+          ... // reads/writes hugeArray
+       }
+    };
+
+    kernel.execute(HUGE, 1000);
+
+One common use for Kernel.getPassId() is to avoid flipping buffers in the outer loop.
+
+It is common for kernels to process data from one buffer to another, and in the next invocation process the data back the other way. Now these kernels can use the passId (odd or even) to determine the direction of data transfer.
+
+    final int[] arr1 = new int[HUGE];
+    final int[] arr2 = new int[HUGE];
+    Kernel kernel= new Kernel(){
+       int f(int v){ … }
+
+       @Override public void run(){
+          int id=getGlobalId();
+          int pass = getPassId();
+          if (pass%2==0){
+             arr1[id] = f(arr2[id]);
+          }else{
+             arr2[id] = f(arr1[id]);
+          }
+       }
+    };
+
+    kernel.execute(HUGE, 1000);
+
+####Allow kernels to access simple arrays of objects
+Aparapi needs to create OpenCL from the bytecode that it sees. Generally OpenCL constrains us to using parallel primitive arrays (OpenCL allows structs, but Java and OpenCL do not have comparable memory layouts for these structures). Therefore, you will generally need to refactor your code from a classic object-oriented form to use primitive arrays.
+
+This incompatibility between data-parallel and object-oriented code patterns might discourage use of Aparapi, so Aparapi includes limited support for arrays of simple Objects. Future versions may well extend this functionality and address performance loss.
+
+Consider the NBody example.
+
+Typically, a Java developer writing NBody would probably not separate the x, y and z ordinates into parallel arrays of floats as was required in the previous (alpha) version of Aparapi. Instead, a Java developer would probably create a Body class to hold the state of each body and possibly a Universe class (a container of Body instances) with responsibility for positioning and possibly displaying the bodies.
+
+    class Body{
+       float x,y,z;
+       float getX(){return x;}
+       void setX(float _x){ x = _x;}
+       float getY(){return y;}
+       void setY(float _y){ y = _y;}
+       float getZ(){return z;}
+       void setZ(float _z){ z = _z;}
+
+       // other data related to Body unused by positioning calculations
+    }
+
+    class Universe{
+       final Body[] bodies;
+       public Universe(final Body[] _bodies){
+          bodies = _bodies;
+       }
+       void adjustPositions(){
+          for (Body outer:bodies){
+             for (Body inner:bodies){
+                // adjust outer position to reflect the effect of inner
+                // using inner and outer getters and setters for x, y and z
+             }
+          }
+       }
+       void display(){
+          for (Body body:bodies){
+             // draw body based on x, y and z using Body getters
+          }
+       }
+    }
+
+From the above code we see that the Universe.adjustPositions() method is compute intensive and an ideal candidate for refactoring to use Aparapi. The current version of Aparapi is able to deal with simple arrays of objects like this.
+
+Now when Aparapi encounters an array of objects, and the accesses to these objects are constrained to simple getters and setters, Aparapi will automatically extract the values of the accessed fields into a data-parallel form, execute the kernel, and then replace the results back in the original objects in the array. This happens on each call to Kernel.execute() and is fairly costly (from a performance point of view); however, for embarrassingly parallel code (such as NBody) we can still show considerable performance gains over a standard Java thread pool.
+
+Attribution
\ No newline at end of file
diff --git a/doc/NewOpenCLBinding.md b/doc/NewOpenCLBinding.md
new file mode 100644
index 0000000000000000000000000000000000000000..32e5f4347b94d3e6b300543873850158350bdfde
--- /dev/null
+++ b/doc/NewOpenCLBinding.md
@@ -0,0 +1,51 @@
+#NewOpenCLBinding
+*How to use new OpenCL binding mechanism. Updated Mar 6, 2012 by frost.g...@gmail.com*
+As a step towards the extension mechanism I needed a way to easily bind OpenCL to an interface.
+
+Here is what I have come up with. We will use the 'Square' example.
+
+You first define an interface with OpenCL annotations...
+
+    interface Squarer extends OpenCL<Squarer>{
+       @Kernel("{\n"//
+          + "  const size_t id = get_global_id(0);\n"//
+          + "  out[id] = in[id]*in[id];\n"//
+          + "}\n")//
+       public Squarer square(//
+          Range _range,//
+          @GlobalReadOnly("in") float[] in,//
+          @GlobalWriteOnly("out") float[] out);
+    }
+
+This describes the API we wish to bind to a set of kernel entrypoints (here we only have one, but we could have many). Then you 'realize' the interface by asking a device to create an implementation of the interface. Device is a new Aparapi class which represents a GPU or CPU OpenCL device. So here we are asking for the first (default) GPU device to realize the interface.
+
+    Squarer squarer = Device.firstGPU(Squarer.class);
+
+Now you can call the implementation directly with a Range.
+
+    squarer.square(Range.create(in.length), in, out);
+
+I think that we will have the easiest OpenCL binding out there...
+
+Following some conversations/suggestions online http://a-hackers-craic.blogspot.com/2012/03/aparapi.html we could also offer the ability to provide the OpenCL source from a file/URL source using interface-level annotations.
+
+So we could allow.
+
+    @OpenCL.Resource("squarer.cl");
+    interface Squarer extends OpenCL<Squarer>{
+       public Squarer square(//
+          Range _range,//
+          @GlobalReadOnly("in") float[] in,//
+          @GlobalWriteOnly("out") float[] out);
+    }
+
+Or if the text is on-hand at compile time in a single constant string
+
+    @OpenCL.Source("... opencl text here");
+    interface Squarer extends OpenCL<Squarer>{
+       public Squarer square(//
+          Range _range,//
+          @GlobalReadOnly("in") float[] in,//
+          @GlobalWriteOnly("out") float[] out);
+    }
+
+Finally, to allow for the creation of dynamically generated OpenCL (good for FFTs of various radii).
+
+    String openclSource = ...;
+    Squarer squarer = Device.firstGPU(Squarer.class, openclSource);
diff --git a/doc/PossibleAparapiLambdaSyntaxOptions.md b/doc/PossibleAparapiLambdaSyntaxOptions.md
new file mode 100644
index 0000000000000000000000000000000000000000..8bfcf5f9cba7849ec5f4946bfc2a634c6e5089b5
--- /dev/null
+++ b/doc/PossibleAparapiLambdaSyntaxOptions.md
@@ -0,0 +1,96 @@
+#PossibleAparapiLambdaSyntaxOptions
+*Syntax suggestions for HSA enabled Aparapi*
+
+#Introduction
+Now that Java 8 is nearly upon us and the HSA enabled Aparapi 'lambda' branch is usable (though in no way complete) I figured we could use this page to discuss the 'programming model' we might prefer for Aparapi, and contrast it with the new Java 8 lambda-based stream APIs.
+
+##Converting between classic Aparapi and HSA + Java 8 enabled Aparapi
+Our **hello world** app has always been the *vector add*. In classic Aparapi we could transform
+
+    final float inA[] = .... // get a float array from somewhere
+    final float inB[] = .... // get a float array from somewhere
+    // assume (inA.length==inB.length)
+    final float[] result = new float[inA.length];
+
+    for (int i=0; i<result.length; i++){
+       result[i]=inA[i]+inB[i];
+    }
+
+to
+
+    Kernel kernel = new Kernel(){
+       @Override public void run(){
+          int i= getGlobalId();
+          result[i]=inA[i]+inB[i];
+       }
+    };
+    Range range = Range.create(result.length);
+    kernel.execute(range);
+
+For the lambda Aparapi branch we can currently use
+
+    Device.hsa().forEach(result.length, i-> result[i]=inA[i]+inB[i]);
+
+Note that the closest Java 8 construct is
+
+    IntStream.range(0, result.length).parallel().forEach(i-> result[i]=inA[i]+inB[i]);
+
+Aparapi and the Java 8 stream APIs both use IntConsumer as the lambda type, so you can reuse the lambda.
+
+    IntConsumer lambda = i-> result[i]=inA[i]+inB[i];
+
+    IntStream.range(0, result.length).parallel().forEach(lambda);
+    Device.hsa().forEach(result.length, lambda);
+
+Exposing the Device-ness of this was a conscious decision. We may also hide it completely.
+
+    IntConsumer lambda = i-> result[i]=inA[i]+inB[i];
+
+    IntStream.range(0, result.length).parallel().forEach(lambda);
+    Aparapi.forEach(result.length, lambda);
+
+I am toying with providing an API which maps more closely to the Stream API from Java 8.
+
+Maybe
+
+    IntStream.range(0, result.length).parallel().forEach(lambda);
+    Aparapi.range(0, result.length).parallel().forEach(lambda);
+
+This way users can more readily swap between the two.
+
+For collections/arrays in Aparapi we can also offer
+
+    T[] arr = // get an array of T from somewhere
+    ArrayList<T> list = // get an array backed list of T from somewhere
+
+    Aparapi.range(arr).forEach(t -> /* do something with each T */);
+
+We can create special cases.
Say, for mutating images
+
+    BufferedImage in, out;
+    Aparapi.forEachPixel(in, out, rgb -> rgb[0] = 0 );
+
+We may also need select operations for associative operations
+
+    class Person{
+       int age;
+       String first;
+       String last;
+    };
+
+    Aparapi.selectOne(Person[] people, (p1,p2)-> p1.age>p2.age?p1:p2 );
+
+##A case for map reduce
+A mapper maps from one type to another, possibly by extracting state. Here is a mapper which maps each String in an array of Strings to its length.
+
+As if the mapper was
+
+    interface mapToInt<T>{ int map(T v); }
+
+Here it is in action.
+
+    Aparapi.range(strings).map(s->s.length())...
+
+Now the result is a stream of ints which can be 'reduced' by a reduction lambda.
+
+In this case the reduction reduces two ints to one, by choosing the max of k and v. All reductions must be commutative-style operations (max, min, add) where the order of execution is not important.
+
+    int lengthOfLongestString = Aparapi.range(strings).map(s->s.length()).reduce((k,v)-> k>v?k:v);
+
+Here is a sum reduction.
+
+    int sumOfLengths = Aparapi.range(strings).map(s->s.length()).reduce((k,v)-> k+v);
+
+Some of these may be common enough that we offer direct functionality.
+
+    int sumOfLengths = Aparapi.range(strings).map(s->s.length()).sum();
+    int maxOfLengths = Aparapi.range(strings).map(s->s.length()).max();
+    int minOfLengths = Aparapi.range(strings).map(s->s.length()).min();
+    String string = Aparapi.range(strings).map(s->s.length()).select((k,v)-> k>v);
+
+This last one needs some explaining. We map each String to an int, then select the String whose length is the greatest.
\ No newline at end of file
diff --git a/doc/PrivateMemorySpace.md b/doc/PrivateMemorySpace.md
new file mode 100644
index 0000000000000000000000000000000000000000..51fee39e43f50ec6c2c93cbde20315c3345a041a
--- /dev/null
+++ b/doc/PrivateMemorySpace.md
@@ -0,0 +1,34 @@
+PrivateMemorySpace
+==================
+
+*Using `__private` memory space in Aparapi kernels. Updated Sep 14, 2014 by barneydp...@gmail.com*
+
+## Introduction
+The private memory space identifier (just "private" is also recognised) can be applied to struct fields in order to indicate that the data is not shared with/accessible to other kernel instances. Whilst this is the default for non-array data, it must be explicitly applied to array fields in order to make them private. Aparapi now supports arrays in the private memory space.
+
+The private memory space is generally only suitable for smallish arrays, but is required for certain algorithms, e.g. for those which must mutate (for example, sort or partially sort) an exclusive copy of an array/subarray.
+
+##Details
+In Aparapi there are two mechanisms available to mark a Kernel class member as belonging to the private memory space when mapped to OpenCL code (matching the equivalent functionality for marking items as belonging to the local memory space). Either the field can be named with a suffix plus buffer size, for example
+
+    protected short[] myBuffer_$private$32 = new short[32];
+
+or using the annotation Kernel.PrivateMemorySpace, for example
+
+    protected @PrivateMemorySpace(32) short[] myBuffer = new short[32];
+
+The latter should be used in preference to the former.
+
+Note that OpenCL requires that the size of a private array be fixed at compile time for any kernel. Thus it is not possible for a single Kernel subclass to support private buffers of varying size.
Unfortunately this may entail creating multiple subclasses with varying buffer sizes in order to most efficiently support varying private buffer sizes.
+
+Of course, a single Kernel class can be created which has a private buffer large enough for all use cases, though this may be suboptimal if only a small fraction of the maximum buffer size is commonly required.
+
+Because private buffers are unshared, they require much more of a GPU's memory than a local or global buffer of the same size, and should therefore be used sparingly and kept as small as possible; overuse of large private arrays might cause GPU execution to fail on lower-end graphics cards.
+
+However, the private memory space is the fastest of all OpenCL's memory spaces, so in some limited cases it might be used to increase execution speed even when the kernel does not need to modify the array and a shared (local or global) array would suffice - for example to provide a smallish lookup-table to replace an expensive function call.
+
+Without modification, an Aparapi kernel which uses private buffers may fail to work when invoked in Java Thread Pool (JTP) mode, because the buffer will be shared across multiple threads. However a simple mechanism exists which allows such buffers to be used safely in JTP execution mode.
+
+The Kernel.NoCL annotation exists to allow specialised code to be executed when running in Java (or JTP) mode which is not invoked when running on the GPU. A NoCL method can be inserted at the beginning of a Kernel's run() method which sets the private array to a value obtained from a static ThreadLocal<foo[]> where foo is the primitive type of the array in question. This will have no effect upon OpenCL execution, but will allow threadsafe execution when running in Java.
+
+In the project samples, there is a package com.amd.aparapi.sample.median which gives an example of a median image filter which uses a private array of pixel data to apply a destructive median algorithm to a "window" of local pixels. This sample also demonstrates how to use the ThreadLocal trick to allow correct behaviour when running in JTP execution mode.
+
+[http://code.google.com/p/aparapi/source/browse/trunk/samples/median/src/com/amd/aparapi/sample/median/MedianDemo.java](http://code.google.com/p/aparapi/source/browse/trunk/samples/median/src/com/amd/aparapi/sample/median/MedianDemo.java)
\ No newline at end of file
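+
+The following is a minimal sketch of the ThreadLocal trick described above (the class name, buffer name and buffer size of 9 are purely illustrative):
+
+    public class WindowKernel extends Kernel{
+       // Exclusive scratch buffer, destructively mutated by each kernel instance.
+       protected @PrivateMemorySpace(9) short[] window = new short[9];
+
+       // One buffer per thread, so that JTP execution remains threadsafe.
+       private static final ThreadLocal<short[]> threadLocalWindow = new ThreadLocal<short[]>(){
+          @Override protected short[] initialValue() {
+             return new short[9];
+          }
+       };
+
+       @NoCL private void useThreadLocalBuffer() {
+          window = threadLocalWindow.get(); // never invoked when running on the GPU
+       }
+
+       @Override public void run() {
+          useThreadLocalBuffer();
+          // ... fill and destructively process 'window' here ...
+       }
+    }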
" + + (p.getEnd() / 1000) + " " + ((p.getEnd() - p.getStart()) / 1000) + "us"); + System.out.println(); + } + +Here is an example implementation + + final float result[] = new float[2048*2048]; + Kernel k = new Kernel(){ + public void run(){ + final int gid=getGlobalId(); + result[gid] =0f; + } + }; + k.execute(result.length); + List<ProfileInfo> profileInfo = k.getProfileInfo(); + + for (final ProfileInfo p : profileInfo) { + System.out.print(" " + p.getType() + " " + p.getLabel() + " " + (p.getStart() / 1000) + " .. " + + (p.getEnd() / 1000) + " " + ((p.getEnd() - p.getStart()) / 1000) + "us"); + System.out.println(); + } + k.dispose(); + } + } +And here is the tabular output from + + java + -Djava.library.path=${APARAPI_HOME} + -Dcom.amd.aparapi.enableProfiling=true + -cp ${APARAPI_HOME}:. + MyClass + + W val$result 69500 .. 72694 3194us + X exec() 72694 .. 72835 141us + R val$result 75327 .. 78225 2898us + +The table shows that the transfer of the 'result' buffer to the device ('W') took 3194 us (micro seconds), the execute ('X') of the kernel 141 us and the read ('R') of resulting buffer 2898 us. \ No newline at end of file diff --git a/doc/ProfilingKernelsFromEclipse.md b/doc/ProfilingKernelsFromEclipse.md new file mode 100644 index 0000000000000000000000000000000000000000..c1edfc9ebf2cd2a85f43b088dc237bc71ce1dba7 --- /dev/null +++ b/doc/ProfilingKernelsFromEclipse.md @@ -0,0 +1,97 @@ +#ProfilingKernelsFromEclipse +*Profiling Kernels with AMD profiler in Eclipse (Indigo) Updated May 14, 2012 by frost.g...@gmail.com* + +##Profiling Kernels with AMD profiler in Eclipse (Indigo) + +Wayne Johnson + +12 May 2012 +Disclaimer: This has been tested with Eclipse (Indigo SR1) only on W7SR1. + +Assume your Eclipse project follows a typical Maven layout: + + Project + src/main/java/... + AlgorithmImplementation.java + src/test/java/... + BenchmarkRunner.java + BenchmarkTest.java + lib/aparapi-2012-02-15/ + aparapi jar file + native libraries for W7, Linux, and OSX + … + profiles/ + [this is where the profiles and logs will be generated] + +1. Download and install the current AMD APP SDK +2. Download and install Aparapi (see Wiki), making sure that the native libraries are on your build path. +3. Create your algorithm implementation(s). + + example: AlgorithmImplementations.java + +4. Create your performance benchmark test as a JUnit test case to exercise your implementations. + + example: BenchmarkTest.java + +5. Test your JUnit test case inside Eclipse using BenchmarkRunner to make sure it works. The runner will be the main application for the runnable jar file you create in the next step. + + This step will also automatically create the launch configuration that the export command will ask you for. Select BenchmarkRunner.java + + Right-click > Run as > Java application + +6. Export your project as a runnable jar file. + + Right-click > Export... + [wizard] Java > Runnable Jar File. Next. + Launch configuration: BenchmarkRunner [1] - Project + Export destination: Project\runner.jar + Library handling: [use default] Finish. + Ok on “...repacks referenced libraries†+ Yes on “Confirm replace†[You won’t see this dialog on the first export but will on subsequent exports] + Ok [ignore warning dialog] + + After refreshing Project, you should see a runner.jar file at the top level. + +7. Create an external tool configuration to generate the performance counter profile + + Run > External Tools > External Tool Configurations... 
+ Name: AMD counters - Project + Location: C:\Program Files (x86)\AMD APP\tools\AMD APP Profiler 2.4\x64\sprofile.exe + Arguments: + -o "${project_loc}\profiles\counters.csv" + -w "${project_loc}" + "C:\Program Files\Java\jdk1.6.0_30\bin\java.exe" + -Djava.library.path="lib\aparapi-2012-02-15" + -jar "${project_loc}\runner.jar" + + + Note: The ''java.library.path'' indicates the relative location of the folder containing the native libraries used by Aparapi. If this is not set correctly, steps 9 and 10 below will run in JTP execution mode and the only error message you will see on the Eclipse console is that the profile was not generated. This is because nothing executed on the GPU. + +8. Create an external tool configuration to generate the cltrace and summary profiles. + + 1. Run > External Tools > External Tool Configurations... + 2. Name: AMD cltrace - Project + 3. Location: C:\Program Files (x86)\AMD APP\tools\AMD APP Profiler 2.4\x64\sprofile.exe + 4. Arguments: + + `-o "${project_loc}\profiles\cltrace.txt" -k all -r -O -t -T` + + `-w "${project_loc}"` + + `"C:\Program Files\Java\jdk1.6.0_30\bin\java.exe"` + + `-Djava.library.path="lib\aparapi-2012-02-15"` + + `-jar "${project_loc}\runner.jar"` + + +9. Run the AMD profiler counter configuration to generate the counter profile. + + Run > External Tools > AMD counters - Project + + +10. Run the AMD profiler cltrace configuration to generate the cltrace and summary profiles. + + Run > External Tools > AMD cltrace - Project + A project file for testing the above instructions can be found http://code.google.com/p/aparapi/source/browse/trunk/wiki-collateral/ProfilingKernelsFormEclipseProject.zip + diff --git a/doc/README.md b/doc/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5bbba270ad043b42e9b3f6f532e5be7b200d0e0d --- /dev/null +++ b/doc/README.md @@ -0,0 +1,46 @@ +APARAPI Documentation +====================== + +| | | +|----------------|------| +| [PrivateMemorySpace](PrivateMemorySpace.md)| Using `__private` memory space in Aparapi kernels. | +| [SettingUpLinuxHSAMachineForAparapi](SettingUpLinuxHSAMachineForAparapi.md) | How to setup a Linux HSA machine for testing HSA enabled Aparapi | +| [PossibleAparapiLambdaSyntaxOptions](PossibleAparapiLambdaSyntaxOptions.md) | Syntax suggestions for HSA enabled Aparapi | +| [HSAEnablementOfLambdaBranchSidebar](HSAEnablementOfLambdaBranchSidebar.md)| Sidebar for HSAEnablementOfLambdaBranchAparapi| +| [HSAEnablementOfLambdaBranch](HSAEnablementOfLambdaBranch.md) | Adding HSA Support to Aparapi lambda branch | +| [UsingAparapiLambdaBranchWithHSASimulator](UsingAparapiLambdaBranchWithHSASimulator.md) | One-sentence summary of this page. | +| [SettingUpLinuxHSAMachineForAparapiSidebar](SettingUpLinuxHSAMachineForAparapiSidebar.md) | Sidebar for SettingUpLinuxHSAMachineForAparapi | +| HSASidebar | | +| [AddingLambdasToAparapi](AddingLambdasToAparapi.md) | Adding Java 8 Lambda Support to Aparapi | +| [ProfilingKernelExecution](ProfilingKernelExecution.md) | Using Aparapi's built in profiling APIs | +| [HowToAddUML](HowToAddUML.md) | How to add plantuml docs to wiki pages | +| [LIbraryAgentDuality](LIbraryAgentDuality.md) | Aparapi libraries can now be loaded as JVMTI agents. | +| [FrequentlyAskedQuestions](FrequentlyAskedQuestions.md) | Frequently Asked Questions| +| HomePageSuggestions || +| [ChoosingSpecificDevicesForExecution](ChoosingSpecificDevicesForExecution.md) | Using the new Device API's to choose Kernel execution on a specific device. 
| Gadgets | Gadgetorium|
+| [ConvertingBytecodeToOpenCL](ConvertingBytecodeToOpenCL.md) | How Aparapi converts bytecode to OpenCL |
+| [DevelopersGuideLinux](DevelopersGuideLinux.md) | Developer guide for Linux. |
+| [DevelopersGuideWindows](DevelopersGuideWindows.md) | Developers guide for Windows. |
+| [EmulatingMultipleEntrypointsUsingCurrentAPI](EmulatingMultipleEntrypointsUsingCurrentAPI.md) | How to emulate multiple entrypoints using existing Aparapi APIs |
+| [MultipleEntryPointSupportProposal](MultipleEntryPointSupportProposal.md) | How to extend Aparapi to allow multiple entrypoints for kernels |
+| [ExplicitBufferHandling](ExplicitBufferHandling.md) | How to minimize buffer transfers |
+| [AparapiPatterns](AparapiPatterns.md) | Examples and code fragments to demonstrate Aparapi features. |
+| [ProfilingKernelsFromEclipse](ProfilingKernelsFromEclipse.md) | Profiling Kernels with AMD profiler in Eclipse (Indigo) |
+| [DeviceProposal](DeviceProposal.md) | How we might use the extension mechanism devices for general Kernel execution.|
+| [NewOpenCLBinding](NewOpenCLBinding.md) | How to use new OpenCL binding mechanism. |
+| [AparapiExtensionProposal](AparapiExtensionProposal.md) | A proposed aparapi extension mechanism. |
+| [UsingConstantMemory](UsingConstantMemory.md) | How to make use of constant memory in a Kernel |
+| [UsingLocalMemory](UsingLocalMemory.md) | How to make use of local memory in a Kernel |
+| [UsingMultiDimExecutionRanges](UsingMultiDimExecutionRanges.md) | How to use the new Range class (for multi-dim range access) |
+| [AccessingMultiDimNDRangeProposal](AccessingMultiDimNDRangeProposal.md) | A proposal for accessing multi-dim ND range execution |
+| LocalMemoryAndBarrierProposal | A proposal for handling local memory and barriers |
+| [AddressSpacesUsingBuffers](AddressSpacesUsingBuffers.md) | Proposal For OpenCL address space support using java Buffers instead of arrays. |
+| [BuildingNBody](BuildingNBody.md) | How to build the NBody example.|
+| [UnitTestGuide](UnitTestGuide.md) | Unit test Guide Find out how to run Junit tests and how to add new tests. |
+| [NewFeatures](NewFeatures.md) | New Features added to this open source release of Aparapi. |
+| [UsersGuide](UsersGuide.md) | Aparapi User's Guide. |
+| [DevelopersGuide](DevelopersGuide.md) | Aparapi developers guide. |
+| [ContributionGuide](ContributionGuide.md) | How to contribute (bug fix or features). |
+| [JavaKernelGuidelines](JavaKernelGuidelines.md) | What code can and can't be converted to OpenCL by Aparapi.
| +| [Attribution](Attribution.md) | Attribution | diff --git a/doc/SettingUpLinuxHSAMachineForAparapi.md b/doc/SettingUpLinuxHSAMachineForAparapi.md new file mode 100644 index 0000000000000000000000000000000000000000..edf564be4e2766edb8db14ff4a4c36538af987b1 --- /dev/null +++ b/doc/SettingUpLinuxHSAMachineForAparapi.md @@ -0,0 +1,209 @@ +#SettingUpLinuxHSAMachineForAparapi +*How to setup a Linux HSA machine for testing HSA enabled Aparapi Updated May 22, 2014 by frost.g...@gmail.com* + +* HSA Videos + * [http://www.youtube.com/watch?v=5ntILiXTuhE](http://www.youtube.com/watch?v=5ntILiXTuhE) + * [http://www.youtube.com/watch?v=caEPq4KvTTA](http://www.youtube.com/watch?v=caEPq4KvTTA) +* HSA Articles + * [http://developer.amd.com/resources/heterogeneous-computing/what-is-heterogeneous-computing/](http://developer.amd.com/resources/heterogeneous-computing/what-is-heterogeneous-computing/) +* HSA Foundation + * [https://github.com/HSAFoundation](https://github.com/HSAFoundation) + +##Introduction +Now that HSA hardware is generally available I figured it was time to describe how to setup a HSA enabled Linux platform so that it can run Aparapi. + +Here is a nice introduction to HSA [http://developer.amd.com/resources/heterogeneous-computing/what-is-heterogeneous-system-architecture-hsa/](http://developer.amd.com/resources/heterogeneous-computing/what-is-heterogeneous-system-architecture-hsa/) + +But for Aparapi users the main advantage is that we are no longer limited to the GPU memory for running GPU tasks. Also because the CPU and the GPU can both see the same memory (the Java heap) Aparapi code can now access Java objects directly. This removes a number of Aparapi constraints. So more of your code can now run on the GPU. + +##Hardware Required +These instructions were based on my experience setting up a platform using the following hardware. + +|Component | Suggested | +|---------------|---------------------------| +|APU | AMD A10-7850K APU [http://www.amd.com/us/products/desktop/processors/a-series/Pages/a-series-apu.aspx](http://www.amd.com/us/products/desktop/processors/a-series/Pages/a-series-apu.aspx) | +|Motherboard | ASUS A88X-PRO or A88XM-A [http://www.asus.com/Motherboards/A88XPRO](http://www.asus.com/Motherboards/A88XPRO) [http://www.asus.com/Motherboards/A88XMA](http://www.asus.com/Motherboards/A88XMA)| +| Memory | G.SKILL Ripjaws X Series 16GB (2 x 8GB) 240-Pin DDR3 SDRAM DDR3 2133| + +##Software Required +We also have some software dependencies. + +|Component | Suggested | +|---------------|-----------| +| Java 8 JDK | [http://www.oracle.com/technetwork/java/javase/downloads/ea-jsp-142245.html](http://www.oracle.com/technetwork/java/javase/downloads/ea-jsp-142245.html) | +| Ubuntu 13.10 64-bit edition | [http://www.ubuntu.com/download](http://www.ubuntu.com/download) | +| Ubuntu 13.10 64-bit edition HSA enabled kernel image | [https://github.com/HSAFoundation/Linux-HSA-Drivers-And-Images-AMD](https://github.com/HSAFoundation/Linux-HSA-Drivers-And-Images-AMD) | +| OKRA HSA enabled runtime | [https://github.com/HSAFoundation/Okra-Interface-to-HSA-Device](https://github.com/HSAFoundation/Okra-Interface-to-HSA-Device) | + +The hope is that the list of HW/SW support widens, but for early adopters this is the set of HW/SW we have been testing with. + +#Setting up your System +##Configure your BIOS to support IOMMU +Once you have built your AMD A10-7850K APU based system you should make sure that your system is configured to use IOMMU. 
+
+Remember HSA allows the GPU and CPU cores to share the same memory. IOMMU needs to be enabled for this.
+
+##For the A88X-PRO board
+For the recommended ASUS board above you will need to make sure that your BIOS is updated to version 0802. Here is a direct link to the 0802 version of the BIOS from ASUS's site as of 2/28/2014.
+
+[http://dlcdnet.asus.com/pub/ASUS/mb/SocketFM2/A88X-PRO/A88X-PRO-ASUS-0802.zip](http://dlcdnet.asus.com/pub/ASUS/mb/SocketFM2/A88X-PRO/A88X-PRO-ASUS-0802.zip)
+
+Once you have the latest BIOS you will need to enable IOMMU in the system BIOS. This is done using the "CPU Configuration" screen under "Advanced Mode" and then enabling IOMMU.
+
+##For the A88XM-A
+You will need the 1102 (or later) version of the BIOS
+
+[http://dlcdnet.asus.com/pub/ASUS/mb/SocketFM2/A88XM-A/A88XM-A-ASUS-1102.zip](http://dlcdnet.asus.com/pub/ASUS/mb/SocketFM2/A88XM-A/A88XM-A-ASUS-1102.zip)
+
+Once you have the latest BIOS you will need to enable IOMMU in the system BIOS. This is done using the "CPU Configuration" screen under "Advanced Mode" and then enabling IOMMU.
+
+##Installing Ubuntu 13.10
+Once you have your BIOS setup you need to install Ubuntu [http://www.ubuntu.com/download](http://www.ubuntu.com/download)
+
+##Installing the HSA enabled kernel and driver
+Until all of the HSA drivers and features are available in the stock Linux kernel and have been pulled down into the Ubuntu distro we will need a special HSA enabled kernel image.
+
+##A Ubuntu compatible kernel can be pulled from github
+
+    $ cd ~ # I put all of this in my home dir
+    $ sudo apt-get install git
+    $ git clone https://github.com/HSAFoundation/Linux-HSA-Drivers-And-Images-AMD.git
+
+Or you can pull the zip and unzip using curl if you don't have git
+
+    $ cd ~ # I put all of this in my home dir
+    $ curl -L https://github.com/HSAFoundation/Linux-HSA-Drivers-And-Images-AMD/archive/master.zip > drivers.zip
+    $ unzip drivers.zip
+
+This will create the following subdir on your machine
+
+    Linux-HSA-Drivers-And-Images-AMD/
+      LICENSE
+      README.md
+      ubuntu13.10-based-alpha1/
+        xorg.conf
+        linux-image-3.13.0-kfd+_3.13.0-kfd+-2_amd64.deb
+
+From here we can install our new image, set up the HSA KFD (the driver for HSA) and reboot to the new kernel.
+
+    $ cd ~/Linux-HSA-Drivers-And-Images-AMD
+    $ echo "KERNEL==\"kfd\", MODE=\"0666\"" | sudo tee /etc/udev/rules.d/kfd.rules
+    $ sudo dpkg -i ubuntu13.10-based-alpha1/linux-image-3.13.0-kfd+_3.13.0-kfd+-2_amd64.deb
+    $ sudo cp ~/Linux-HSA-Drivers-And-Images-AMD/ubuntu13.10-based-alpha1/xorg.conf /etc/X11
+    $ sudo reboot
+
+##Installing OKRA RT
+Now we need a runtime for executing HSAIL code. We share common infrastructure with our sister OpenJDK project called Sumatra. Both Aparapi and Sumatra use OKRA to execute HSAIL code on a HSA enabled platform.
+
+We can get the latest version of OKRA (Offloadable Kernel Runtime API) from another HSA Foundation repository.
+
+    $ cd ~ # I put all of this in my home dir
+    $ git clone https://github.com/HSAFoundation/Okra-Interface-to-HSA-Device.git
+
+or if you prefer curl/unzip
+
+    $ cd ~ # I put all of this in my home dir
+    $ curl -L https://github.com/HSAFoundation/Okra-Interface-to-HSA-Device/archive/master.zip > okra.zip
+    $ unzip okra.zip
+
+This will create the following dir structure.
+
+    Okra-Interface-to-HSA-Device/
+      README.md
+      okra/
+        README
+        dist/
+          okra.jar
+        bin/
+          libamdhsacl64.so
+          libnewhsacore64.so
+          libokra_x86_64.so
+        include/
+          common.h
+          okraContext.h
+
+      samples/
+        dist/
+          Squares
+          Squares.hsail
+        runSquares.sh
+
+OKRA offers a C API (for those that are so inclined ;) ) as well as a Java jar file which contains JNI wrappers.
+
+##Sanity check your HSA and OKRA install
+So to sanity check your install you can run a small sample app (binary)
+
+    $ cd ~/Okra-Interface-to-HSA-Device/okra/samples/
+    $ sh runSquares.sh
+
+If everything is OK this should run the C Squares test app.
+
+Congratulations, you have executed your first HSA enabled app.
+
+##Getting OpenCL headers and libraries
+We need OpenCL headers and libraries to build Aparapi (remember we still support OpenCL).
+
+My recommendation is to download AMD-APP-SDK-v2.9-lnx64.tgz from [http://developer.amd.com/tools-and-sdks/heterogeneous-computing/amd-accelerated-parallel-processing-app-sdk/downloads](http://developer.amd.com/tools-and-sdks/heterogeneous-computing/amd-accelerated-parallel-processing-app-sdk/downloads) and extract the libraries and headers.
+
+Note that we have nested zipped archives in this archive.
+
+    $ cd ~
+    $ gunzip ~/Downloads/AMD-APP-SDK-v2.9-lnx64.tgz
+    $ tar xvf ~/Downloads/AMD-APP-SDK-v2.9-lnx64.tar
+    $ rm ~/default-install_lnx_64.pl ~/icd-registration.tgz ~/Install-AMD-APP.sh ~/ReadMe.txt
+    $ gunzip ~/AMD-APP-SDK-v2.9-RC-lnx64.tgz
+    $ tar xvf ~/AMD-APP-SDK-v2.9-RC-lnx64.tar
+    $ rm ~/AMD-APP-SDK-v2.9-RC-lnx64.tar
+    $ rm -rf AMD-APP-SDK-v2.9-RC-lnx64/samples
+
+Note where AMD-APP-SDK-v2.9-RC-lnx64 is located, you need this in the following step.
+
+##You will need Java 8
+Download the Java 8 JDK from [https://jdk8.java.net/download.html](https://jdk8.java.net/download.html). I chose to download the zipped tar and not install with RPM so I can control the location of the install.
+
+    $ cd ~
+    $ gunzip /home/gfrost/Downloads/jdk-8-fcs-bin-b132-linux-x64-04_mar_2014.tar.gz
+    $ tar xvf ~/Downloads/jdk-8-fcs-bin-b132-linux-x64-04_mar_2014.tar
+
+I now have ~/jdk1.8.0 as my Java 8 install dir.
+
+Alternatively the following will pull from Oracle's site using curl
+
+    $ cd ~
+    $ curl http://download.java.net/jdk8/archive/b132/binaries/jdk-8-fcs-bin-b132-linux-x64-04_mar_2014.tar.gz?q=download/jdk8/archive/b132/binaries/jdk-8-fcs-bin-b132-linux-x64-04_mar_2014.tar.gz > jdk-8-fcs-bin-b132-linux-x64-04_mar_2014.tar.gz
+    $ gunzip jdk-8-fcs-bin-b132-linux-x64-04_mar_2014.tar.gz
+    $ tar xvf jdk-8-fcs-bin-b132-linux-x64-04_mar_2014.tar
+
+I now have ~/jdk1.8.0 as my Java 8 install dir.
+
+##You will need ant
+    $ sudo apt-get install ant
+This takes a long time because it also installs a Java 7 JDK.
+
+##You will need g++
+We use g++ to build the JNI side of Aparapi
+
+    $ sudo apt-get install g++
+##Pulling the HSA enabled Aparapi branch and building
+Now we can pull the Aparapi lambda/HSA branch from SVN
+
+    $ sudo apt-get install subversion
+    $ svn checkout https://aparapi.googlecode.com/svn/branches/lambda aparapi-lambda
+
+If you are familiar with the Aparapi structure then this tree should not be that much of a surprise, but there are a few subtle changes.
+
+Specifically, the build system has been changed to support OKRA, the Aparapi JNI code is provided as a Java agent, and the execution scripts all refer to ${APARAPI_HOME}/env.sh to set up a reasonable execution environment.
+
+You will need to edit env.sh and make sure that APARAPI_HOME, OKRA_HOME, OCL_HOME and JAVA_HOME are set correctly.
+
+Here is how I set my vars.

|environment variable |value|
|-----------------------|-----|
|JAVA_HOME |/home/${LOGNAME}/jdk1.8.0|
|OCL_HOME |/home/${LOGNAME}/AMD-APP-SDK-v2.9-RC-lnx64|
|APARAPI_HOME |/home/${LOGNAME}/aparapi-lambda|
|OKRA_HOME |/home/${LOGNAME}/Okra-Interface-to-HSA-Device/okra/|

It is recommended (thanks notzed ;) ) that you test your env.sh using sh env.sh until it stops reporting errors. Once you have finished I recommend sourcing it into your current shell before building with ant.

    $ cd ~/aparapi-lambda
    $ . env.sh
    $ ant

If you get any problems check the env.sh vars first.

If all is well you should be able to run some samples.

    $ cd ~/aparapi-lambda/samples/mandel
    $ sh hsailmandel.sh
\ No newline at end of file
diff --git a/doc/SettingUpLinuxHSAMachineForAparapiSidebar.md b/doc/SettingUpLinuxHSAMachineForAparapiSidebar.md
new file mode 100644
index 0000000000000000000000000000000000000000..9d165df7e1fa5bd66761b5b95482d60225adb6ec
--- /dev/null
+++ b/doc/SettingUpLinuxHSAMachineForAparapiSidebar.md
@@ -0,0 +1,10 @@
#SettingUpLinuxHSAMachineForAparapiSidebar
*Sidebar for SettingUpLinuxHSAMachineForAparapi*

* HSA Videos
  * [http://www.youtube.com/watch?v=5ntILiXTuhE](http://www.youtube.com/watch?v=5ntILiXTuhE)
  * [http://www.youtube.com/watch?v=caEPq4KvTTA](http://www.youtube.com/watch?v=caEPq4KvTTA)
* HSA Articles
  * [http://developer.amd.com/resources/heterogeneous-computing/what-is-heterogeneous-computing/](http://developer.amd.com/resources/heterogeneous-computing/what-is-heterogeneous-computing/)
* HSA Foundation
  * [https://github.com/HSAFoundation](https://github.com/HSAFoundation)
\ No newline at end of file
diff --git a/doc/UnitTestGuide.md b/doc/UnitTestGuide.md
new file mode 100644
index 0000000000000000000000000000000000000000..d812e14e11ffc7b2edfc08d83f831a33ff14091f
--- /dev/null
+++ b/doc/UnitTestGuide.md
@@ -0,0 +1,174 @@
#UnitTestGuide
*Unit Test Guide. Find out how to run JUnit tests and how to add new tests. Updated Sep 14, 2011 by frost.g...@gmail.com*

#Unit Test Guide

The Unit Test Guide explains the test infrastructure associated with Aparapi, including instructions for executing existing tests and adding new test cases.

##OpenCL™ code generation tests

The initial open source tree includes the codegen subdirectory (test/codegen), which is used to validate the Aparapi bytecode to OpenCL™ conversion.

    aparapi/trunk/
       com.amd.aparapi/
          src/java/com.amd.aparapi/
          build.xml
       test/
          codegen/
             src/java/
                com.amd.aparapi/
                com.amd.aparapi.test/
             build.xml
          build.xml

The code generation tests do not require OpenCL™, the AMD APP SDK or a GPU device to be configured; these tests only validate the creation of valid OpenCL™ code by comparing against predefined expected output.

##Running the OpenCL™ code generation JUnit tests

Before executing the code generation tests, build the com.amd.aparapi sub-project and ensure that you have JUnit 4 installed.

Edit the junit.jar property in test/codegen/build.xml to point to your install directory.

    <property name="junit.jar" value="C:\JUnit4.9\junit-4.9.jar"/>

Initiate the code generation tests using ant.

    C:\> cd tests/codegen
    C:\> ant
    <failures will be reported here>
    C:\>

View the HTML version of the JUnit report at junit/html/index.html. On Microsoft Windows® platforms use

    C:\> start junit\html\index.html

On Linux® platforms just invoke your browser (Firefox in this case).

    C:\> firefox junit\html\index.html

##Adding a new OpenCL™ code generation test

The test cases for OpenCL™ code generation are not strictly JUnit tests. Instead the codegen Java tree contains a tool (CreateJUnitTests) to create JUnit test cases from specially formatted test source files.

The package `com.amd.aparapi.test (codegen/src/java/com/amd/aparapi/test)` contains all of the existing code generation tests.

Here is an example that tests the code generation resulting from a call to Kernel.getPassId(); this is taken from com.amd.aparapi.test.CallGetPassId.

    package com.amd.aparapi.test;

    import com.amd.aparapi.Kernel;

    public class CallGetPassId extends Kernel{
       public void run() {
          int thePassId = getPassId();
       }
    }
    /**{OpenCL{
    typedef struct This_s{
       int passid;
    }This;
    int get_pass_id(This *this){
       return this->passid;
    }
    __kernel void run(
       int passid
    ){
       This thisStruct;
       This* this=&thisStruct;
       this->passid = passid;
       {
          int thePassId = get_pass_id(this);
          return;
       }
    }
    }OpenCL}**/

The test source takes the form of a simple class that extends Kernel, plus a block of OpenCL code between the /**{OpenCL{ and }OpenCL}**/ markers. The code between these markers is the OpenCL code that we expect Aparapi to produce as a result of converting the run() method to OpenCL.

The code-generating ant build.xml file performs the following steps to generate its report:

* compiles the src/java tree. This compiles all the test cases as well as a few 'utility' classes.
* executes the com.amd.aparapi.test.CreateJUnitTests program. This iterates through all of the test source files and converts them to JUnit form. The generated source is written to the src/genjava tree.
* compiles the src/genjava tree to create the required JUnit classes
* initiates the JUnit test phase (result data in junit/data)
* creates the JUnit report (in junit/html/junit from junit/data)

To create a new test case, just add your test case to the `codegen/src/java/com/amd/aparapi/test` package (including the expected OpenCL).

Sometimes different javac implementations (such as Oracle and Eclipse) will generate different bytecode for the same source. When Aparapi converts this bytecode it may yield different (but equally acceptable) OpenCL forms. One example of this is the BooleanToggle test:

    public class BooleanToggle{
       public void run() {
          boolean pass = false;

          pass = !pass;
       }
    }

The BooleanToggle test code creates two (slightly different) versions of OpenCL™ (sadly one line different) depending on the javac compiler.

This example shows the 'toggle' OpenCL™ created from the bytecode generated by Oracle's javac:

    pass = pass==1?0:1;

This example shows the OpenCL™ created from Eclipse javac bytecode:

    pass = pass==0?1:0;

Logically either of the above is correct. However, to accommodate the alternate acceptable forms we need to add two complete `/**{OpenCL{ and }OpenCL}**/` sections to the file. If either matches, the test will pass.

Here is the complete BooleanToggle code.

    package com.amd.aparapi.test;

    public class BooleanToggle{
       public void run() {
          boolean pass = false;

          pass = !pass;
       }
    }
    /**{OpenCL{
    typedef struct This_s{
       int passid;
    }This;
    int get_pass_id(This *this){
       return this->passid;
    }
    __kernel void run(
       int passid
    ){
       This thisStruct;
       This* this=&thisStruct;
       this->passid = passid;
       {
          char pass = 0;
          pass = (pass==0)?1:0;
          return;
       }
    }
    }OpenCL}**/
    /**{OpenCL{
    typedef struct This_s{
       int passid;
    }This;
    int get_pass_id(This *this){
       return this->passid;
    }
    __kernel void run(
       int passid
    ){
       This thisStruct;
       This* this=&thisStruct;
       this->passid = passid;
       {
          char pass = 0;
          pass = (pass!=0)?0:1;
          return;
       }
    }
    }OpenCL}**/
\ No newline at end of file
diff --git a/doc/UsersGuide.md b/doc/UsersGuide.md
new file mode 100644
index 0000000000000000000000000000000000000000..9a2ae7966805ef20cd2c1a415255b62c9a0cbed1
--- /dev/null
+++ b/doc/UsersGuide.md
@@ -0,0 +1,126 @@
#UsersGuide
*Aparapi User's Guide. Updated Sep 14, 2011 by frost.g...@gmail.com*
##User's Guide
Aparapi is: an API used to express data parallel workloads in Java, and a runtime system capable of running compatible workloads on a compatible GPU.

Where your workload runs depends on:

* whether you have a compatible GPU and an OpenCL capable device driver
* whether your Java parallel code can be converted to OpenCL by Aparapi

For information about restrictions on the code that Aparapi can convert to OpenCL, see JavaKernelGuidelines.

Aparapi depends on AMD's OpenCL™ driver to execute on the GPU and therefore shares the same device, driver, and platform compatibility requirements as the AMD APP SDK v2.5.

* 32-bit Microsoft® Windows® 7
* 32-bit Microsoft® Windows Vista® SP2
* 64-bit Microsoft® Windows® 7
* 64-bit Microsoft® Windows Vista® SP2
* 32-bit Linux® OpenSUSE™ 11.2, Ubuntu® 10.04/9.10, or Red Hat® Enterprise Linux® 5.5/5.4
* 64-bit Linux® OpenSUSE™ 11.2, Ubuntu® 10.04/9.10, or Red Hat® Enterprise Linux® 5.5/5.4
* An OpenCL GPU and suitable OpenCL enabled device driver
* An installed AMD APP SDK v2.5 or later

If you prefer to test Aparapi in JTP mode (Java Thread Pool) then you will only need aparapi.jar and an Oracle Java 6 (or later) JRE or JDK.

The following fragment of Java code takes an input float array and populates an output array with the square of each element.

    final float in[] = new float[8192]; // initialization of in[0..8191] omitted
    final float out[] = new float[in.length];

    for(int i=0; i<in.length; i++){
       out[i]=in[i]*in[i];
    }

This code segment illustrates an ideal data parallel candidate: each pass through the loop is independent of the others. Traversing the loop in any order would provide the same result.

To convert the above code to Aparapi we use an anonymous inner class (a common Java idiom) to express the data parallel nature of the above sequential loop.

    Kernel kernel = new Kernel(){
       @Override public void run(){
          int i = getGlobalId();
          out[i]=in[i]*in[i];
       }
    };
    kernel.execute(in.length);

Java developers should recognize the general pattern as similar to that used to launch a new Thread.

    Thread thread = new Thread(new Runnable(){
       @Override public void run(){
          System.out.println("In another thread!");
       }
    });
    thread.start();
    thread.join();

The Aparapi developer extends com.amd.aparapi.Kernel and overrides the public void Kernel.run() method. It is this Kernel.run() method that is executed in parallel.
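
Putting these pieces together, here is a minimal, self-contained sketch in the spirit of the bundled Squares sample (the class name and the initialization are illustrative, not part of the distribution; Kernel.execute is covered next):

    import com.amd.aparapi.Kernel;

    public class SquaresExample {
       public static void main(String[] args) {
          final float in[] = new float[8192];
          final float out[] = new float[in.length];
          for (int i = 0; i < in.length; i++) {
             in[i] = i; // arbitrary test data
          }

          Kernel kernel = new Kernel(){
             @Override public void run() {
                int i = getGlobalId();
                out[i] = in[i] * in[i];
             }
          };
          kernel.execute(in.length); // blocks until all work items have completed

          System.out.println(in[3] + " squared is " + out[3]);
          kernel.dispose(); // release any resources held by the kernel
       }
    }

Compiled with javac -g and run with aparapi.jar on the class path, this will attempt GPU execution and fall back to a Java thread pool if no OpenCL device is available.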

The base class also exposes the Kernel.execute(range) method which is used to initiate the execution of Kernel.run() over the range 0...n.

Kernel.execute(range) will block until execution has completed. Any code within the overridden 'void run()' method of Kernel (and indeed any method or methods reachable from that method) is assumed to be data-parallel and it is the developer's responsibility to ensure that it is. Aparapi can neither detect nor enforce this.

Within the executing kernel (on the GPU device or from the thread pool) the Kernel.getGlobalId() method is used to identify which point (of the range 0..n) a particular execution represents.

## Compiling an Aparapi application
Aparapi has only two compilation requirements:

* aparapi.jar must be in the class path at compile time
* the generated class files must contain debug information (javac -g)

A typical compilation might be:

    $ javac -g -cp ${APARAPI_DIR}/aparapi.jar Squares.java

Aparapi requires this classfile debug information so that it can extract the name and scope of local variables for the generated OpenCL.

## Running an Aparapi application
At runtime an Aparapi-enabled application requires aparapi.jar to be in the class path to be able to execute in a Java Thread Pool (no GPU offload).

    $ java -cp ${APARAPI_DIR}/aparapi.jar;. Squares

To take advantage of the GPU, the directory containing the platform-dependent Aparapi shared library is passed via the java.library.path property.

    $ java -Djava.library.path=${APARAPI_DIR} -cp ${APARAPI_DIR}/aparapi.jar;. Squares

Aparapi detects whether the JNI shared library is available. If the library cannot be located your code will be executed using a Java Thread Pool.

An application can detect whether a kernel was executed on the GPU or by a Java Thread Pool (JTP) by querying the execution mode 'after' Kernel.execute(range) has returned. This is achieved using the Kernel.getExecutionMode() method.

    Kernel kernel = new Kernel(){
       @Override public void run(){
          int i = getGlobalId();
          out[i]=in[i]*in[i];
       }
    };
    kernel.execute(in.length);
    if (!kernel.getExecutionMode().equals(Kernel.EXECUTION_MODE.GPU)){
       System.out.println("Kernel did not execute on the GPU!");
    }

To obtain a runtime report of the execution mode of all kernel executions, set the com.amd.aparapi.enableExecutionModeReporting property to true when the JVM is launched.

    $ java -Djava.library.path=${APARAPI_DIR} -Dcom.amd.aparapi.enableExecutionModeReporting=true -cp ${APARAPI_DIR}/aparapi.jar;. Squares

##Running the sample applications
Aparapi includes two sample applications in the /samples subdirectory of the binary distribution zip file:

* samples/squares: a simple example that computes an array of squares of integers
* samples/mandel: computes and displays the Mandelbrot set

The jar file for each sample is included (so you can run a sample without having to build it) as well as both Linux® and Microsoft Windows® script files for launching the samples.

You will need an appropriate GPU card, an OpenCL® enabled Catalyst® driver and a compatible Oracle Java 6 JRE for your platform. To execute a sample:

* Set the environment variable JAVA_HOME to point to the root of your JRE or JDK.
* Change to the appropriate samples directory (samples/squares or samples/mandel).
* Run either the .bat or .sh script. On Linux® you might have to initially chmod +x script.sh to add execute permissions.

The sample scripts pass the first arg (%1 or $1) to -Dcom.amd.aparapi.executionMode when the JVM is launched. This allows the sample to be tested in either GPU or JTP execution modes by passing the requested mode.

    $ cd samples/mandel
    $ bash ./mandel.sh GPU
    <executes in GPU mode here>
    $ bash ./mandel.sh JTP
    <executes in JTP mode here>

## Building the sample applications
To build a sample, install Oracle® JDK 6 and Apache Ant (at least 1.7.1).

* Set the environment variable ANT_HOME to point to the root of your ant install.
* Ensure that %ANT_HOME%\bin or ${ANT_HOME}/bin is in your path.
* Set the environment variable JAVA_HOME to point to the root of your JDK.
* Change to the appropriate samples directory (samples/squares or samples/mandel).
* Initiate a build using ant.

    $ cd samples/mandel
    $ ant
    $ bash ./mandel.sh GPU

Attribution
\ No newline at end of file
diff --git a/doc/UsingAparapiLambdaBranchWithHSASimulator.md b/doc/UsingAparapiLambdaBranchWithHSASimulator.md
new file mode 100644
index 0000000000000000000000000000000000000000..4e35c0ebce318734ccd4247d81bce557398b6ab0
--- /dev/null
+++ b/doc/UsingAparapiLambdaBranchWithHSASimulator.md
@@ -0,0 +1,46 @@
#UsingAparapiLambdaBranchWithHSASimulator
*Using the Aparapi lambda branch with the HSA simulator. Updated Feb 28, 2014 by frost.g...@gmail.com*

##Introduction
Although HSA compatible devices are available, we understand that Aparapi developers may not have access to these devices.

The HSA Foundation has open sourced an LLVM based HSAIL simulator which we can use to test HSAIL generated code.

The project is hosted here ([https://github.com/HSAFoundation/Okra-Interface-to-HSAIL-Simulator](https://github.com/HSAFoundation/Okra-Interface-to-HSAIL-Simulator)) but we have extracted detailed download and build instructions for Ubuntu below.

Aparapi users/developers can use this simulator to test correctness.

##Building the HSA Simulator on Ubuntu
We assume you have ant, svn and g++ available because you can already build the other Aparapi artifacts.

You will also need git, libelf-dev, libdwarf-dev, flex and cmake.

    $ sudo apt-get install git libelf-dev libdwarf-dev flex cmake

Then clone and build okra:

    $ git clone https://github.com/HSAFoundation/Okra-Interface-to-HSAIL-Simulator.git okra
    $ cd okra
    $ ant -f build-okra-sim.xml

The build should take approximately 15 mins.

##How to setup and test an initial lambda/HSA enabled Aparapi build

Assuming you have built okra in /home/gfrost/okra

Assuming your Java 8 JDK is in /home/gfrost/jdk1.8.0

Assuming your aparapi svn trunk is /home/gfrost/aparapi

    $ export JAVA_HOME=/home/gfrost/jdk1.8.0
    $ export OKRA=/home/gfrost/okra
    $ export PATH=${PATH}:${JAVA_HOME}/bin:${OKRA}/dist/bin
    $ java -version
    java version "1.8.0-ea"
    Java(TM) SE Runtime Environment (build 1.8.0-ea-b94)
    Java HotSpot(TM) 64-Bit Server VM (build 25.0-b36, mixed mode)
    $ cd /home/gfrost/aparapi/branches/lambda
    $ ant
    $ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${OKRA}/dist/bin
    $ java -agentpath:com.amd.aparapi.jni/dist/libaparapi_x86_64.so -cp com.amd.aparapi/dist/aparapi.jar:${OKRA}/dist/okra.jar hsailtest.Squares
    $
\ No newline at end of file
diff --git a/doc/UsingConstantMemory.md b/doc/UsingConstantMemory.md
new file mode 100644
index 0000000000000000000000000000000000000000..9bd53e6ac6602de0ae34f91eb5fd8f9e70c547a6
--- /dev/null
+++ b/doc/UsingConstantMemory.md
@@ -0,0 +1,50 @@
#UsingConstantMemory
*How to make use of constant memory in a Kernel. Updated Feb 28, 2012 by frost.g...@gmail.com*
##How to make use of the new constant memory feature
By default all primitive arrays accessed by an Aparapi Kernel are considered global. If we look at the generated code using `-Dcom.amd.aparapi.enableShowGeneratedOpenCL=true` we will see that primitive arrays (such as `int buf[]`) are mapped to `__global` pointers (such as `__global int *buf`) in OpenCL.

Although this makes Aparapi easy to use (especially to Java developers who are unfamiliar with tiered memory hierarchies), it does limit the ability of the 'power developer' wanting to extract more performance from Aparapi on the GPU.

This [page](http://www.amd.com/us/products/technologies/stream-technology/opencl/pages/opencl-intro.aspx?cmpid=cp_article_2_2010) from AMD's website shows the different types of memory that OpenCL programmers can exploit.

Global memory buffers in Aparapi (primitive Java arrays) are stored in host memory and are copied to global memory (the RAM of the GPU card).

Local memory is 'closer' to the compute devices and is not copied from the host memory; it is just allocated for use on the device. The use of local memory in OpenCL can lead to much more performant code because the cost of fetching from local memory is much lower.

Local memory is shared by all work items (kernel instances) executing in the same group. This is why the use of local memory was deferred until we had a satisfactory mechanism for specifying a required group size.

We recently also added support for constant memory for data that needs to be written once to the GPU but will not change.

Aparapi only supports constant arrays, not scalars.

##How to define a primitive array as "constant"
We have two ways to define a constant buffer. Either we can decorate the variable name with a _$constant$ suffix (yes, it is a valid identifier in Java):

    final int[] buffer = new int[1024]; // this is global, accessible to all work items
    final int[] buffer_$constant$ = new int[]{1,2,3,4,5,6,7,8,9}; // this is a constant buffer

    Kernel k = new Kernel(){
       public void run(){
          // access buffer
          // access buffer_$constant$
          // ....
       }
    };

Alternatively (if defining inside the derived Kernel class; this cannot be done via the captured-variable style of the anonymous inner class pattern above) we can use the @Constant annotation.

    final int[] buffer = new int[1024]; // this is global, accessible to all work items
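
    // A @Constant buffer (declared below) is written to the device once and is
    // expected not to change afterwards; the kernel only ever reads from it.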

    Kernel k = new Kernel(){
       @Constant int[] constantBuffer = new int[]{1,2,3,4,5,6,7,8,9}; // this is a constant buffer
       public void run(){
          // access buffer
          // access constantBuffer
          // ....
       }
    };

##Can I see some code?
I updated the Mandelbrot example so that the palette of RGB values is represented using constant memory; the source can be found here. Look at line #95. BTW for me this resulted in a 5-7% performance improvement.

[http://code.google.com/p/aparapi/source/browse/trunk/samples/mandel/src/com/amd/aparapi/sample/mandel/Main.java](http://code.google.com/p/aparapi/source/browse/trunk/samples/mandel/src/com/amd/aparapi/sample/mandel/Main.java)
\ No newline at end of file
diff --git a/doc/UsingLocalMemory.md b/doc/UsingLocalMemory.md
new file mode 100644
index 0000000000000000000000000000000000000000..e74376b343649d71f80eadb1ed4527f9b0bc2e03
--- /dev/null
+++ b/doc/UsingLocalMemory.md
@@ -0,0 +1,180 @@
#UsingLocalMemory
*How to make use of local memory in a Kernel. Updated Feb 28, 2012 by frost.g...@gmail.com*
##How to make use of the new local memory feature
By default all primitive arrays accessed by an Aparapi Kernel are considered global. If we look at the generated code using -Dcom.amd.aparapi.enableShowGeneratedOpenCL=true we will see that primitive arrays (such as int buf[]) are mapped to __global pointers (such as __global int *buf) in OpenCL.

Although this makes Aparapi easy to use (especially to Java developers who are unfamiliar with tiered memory hierarchies), it does limit the ability of the 'power developer' wanting to extract more performance from Aparapi on the GPU.

This [page](http://www.amd.com/us/products/technologies/stream-technology/opencl/pages/opencl-intro.aspx?cmpid=cp_article_2_2010) from AMD's website shows the different types of memory that OpenCL programmers can exploit.

Global memory buffers in Aparapi (primitive Java arrays) are stored in host memory and are copied to global memory (the RAM of the GPU card).

Local memory is 'closer' to the compute devices and is not copied from the host memory; it is just allocated for use on the device. The use of local memory in OpenCL can lead to much more performant code because the cost of fetching from local memory is much lower.

Local memory is shared by all work items (kernel instances) executing in the same group. This is why the use of local memory was deferred until we had a satisfactory mechanism for specifying a required group size.

Aparapi only supports local arrays, not scalars.

##How to define a primitive array as "local"
We have two ways to define a local buffer. Either we can decorate the variable name with a _$local$ suffix (yes, it is a valid identifier in Java):

    final int[] buffer = new int[1024]; // this is global, accessible to all work items
    final int[] buffer_$local$ = new int[1024]; // this is a local buffer of 1024 ints shared across all work items in a group

    Kernel k = new Kernel(){
       public void run(){
          // access buffer
          // access buffer_$local$
          localBarrier(); // allows all writes to buffer_$local$ to be synchronized across all work items in this group
          // ....
       }
    };

Alternatively (if defining inside the derived Kernel class; this cannot be done via the captured-variable style of the anonymous inner class pattern above) we can use the @Local annotation.

    final int[] buffer = new int[1024]; // this is global, accessible to all work items
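
    // A @Local buffer (declared below) is allocated on the device per work-group
    // rather than copied from the host; writes to it only become visible to the
    // other work items in the group after a localBarrier() call.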

    Kernel k = new Kernel(){
       @Local int[] localBuffer = new int[1024]; // this is a local buffer of 1024 ints shared across all work items in a group
       public void run(){
          // access buffer
          // access localBuffer
          localBarrier(); // allows all writes to localBuffer to be synchronized across all work items in this group
          // ....
       }
    };

##How do I know how big to make my local buffer?
This is where the new Range class helps.

If we create a Range using:

    Range rangeWithUndefinedGroupSize = Range.create(1024);

then Aparapi will pick a suitable group size. Generally this will be the highest factor of the global size <= 256. So for a global size which is a power of two (and greater than or equal to 256 ;) ) the group size will be 256.

Normally the size of a local buffer will be some ratio of the group size.

So if we needed 4 ints per work item in the group we might use a sequence such as:

    final int[] buffer = new int[8192]; // this is global, accessible to all work items
    final Range range = Range.create(buffer.length); // let the runtime pick the group size

    Kernel k = new Kernel(){
       @Local int[] localBuffer = new int[range.getLocalSize(0)*4]; // this is a local buffer containing 4 ints per work item in the group
       public void run(){
          // access buffer
          // access localBuffer
          localBarrier(); // allows all writes to localBuffer to be synchronized across all work items in this group
          // ....
       }
    };

Alternatively you can of course specify your own group size when you create the Range.

    final int[] buffer = new int[8192]; // this is global, accessible to all work items
    final Range range = Range.create(buffer.length,16); // we requested a group size of 16

    Kernel k = new Kernel(){
       @Local int[] localBuffer = new int[range.getLocalSize(0)*4]; // this is a local buffer containing 4 ints per work item in the group = 64 ints
       public void run(){
          // access buffer
          // access localBuffer
          localBarrier(); // allows all writes to localBuffer to be synchronized across all work items in this group
          // ....
       }
    };

##Using barriers
As we mentioned above, local memory buffers are shared by all work items/kernels executing in the same group. However, to read a value written by another work item we need to insert a local barrier.

A common pattern involves having each work item copy a value from global memory into local memory.

    Kernel k = new Kernel(){
       @Local int[] localBuffer = new int[range.getLocalSize(0)];
       public void run(){
          localBuffer[getLocalId(0)] = globalBuffer[getGlobalId(0)];
          localBarrier(); // after this all kernels can see the data copied by other work items in this group
          // use localBuffer[0..getLocalSize(0)]
       }
    };

Without the barrier above, there is no guarantee that other work items will see mutations to localBuffer from other work items.

##Caution regarding barriers
Barriers can be dangerous. It is up to the developer to ensure that all kernels execute the same number of calls to localBarrier(). Be very careful with conditional code (or code containing loops!) to ensure that each kernel executes the same number of calls to localBarrier().

The following kernel will deadlock!

    Kernel kernel = new Kernel(){
       public void run(){
          if (getGlobalId(0)>10){
             // ...
             localBarrier();
             // ...
          }
       }
    };

We need to make sure that all kernels in a group execute the localBarrier(). So the following will work.

    Kernel kernel = new Kernel(){
       public void run(){
          if (getGlobalId(0)>10){
             // ...
             localBarrier();
             // ...

          }else{
             localBarrier();
          }
       }
    };

Of course if we have multiple calls to localBarrier() in the 'if' side of the if..then, then we must match them in the 'else'.

    Kernel kernel = new Kernel(){
       public void run(){
          if (getGlobalId(0)>10){
             // ...
             localBarrier();
             // ...
             localBarrier();
             // ...
          }else{
             localBarrier();
             localBarrier();
          }
       }
    };

With loops we must make sure that each kernel processes any loop the same number of times.

So the following is fine.

    Kernel kernel = new Kernel(){
       public void run(){
          for (int i=0; i< 10; i++){
             // ...
             localBarrier();
             // ...
          }
       }
    };

However the following will deadlock.

    Kernel kernel = new Kernel(){
       public void run(){
          for (int i=0; i< getLocalId(0); i++){
             // ...
             localBarrier();
             // ...
          }
       }
    };

As a testament to how well we emulate OpenCL in JTP mode, this will also deadlock your kernel in JTP mode ;) so be careful.

##Performance impact in JTP mode
Of course Java itself does not support local memory in any form. So any time code using local memory falls back to JTP mode we must expect a considerable performance degradation (try the NBody local example in JTP mode).

We do honor localBarrier() using Java's barrier from the new concurrency utils. However, Java's memory model does not require the use of a barrier to observe array changes across threads. So these barriers are basically just an expense.

I would recommend using local memory and barriers only if I am 90% sure the code will run on the GPU.

##Can I see some code?
I added a version of the NBody example which uses local memory; the source can be found here.

[http://code.google.com/p/aparapi/source/browse/trunk/examples/nbody/src/com/amd/aparapi/examples/nbody/Local.java](http://code.google.com/p/aparapi/source/browse/trunk/examples/nbody/src/com/amd/aparapi/examples/nbody/Local.java)
\ No newline at end of file
diff --git a/doc/UsingMultiDimExecutionRanges.md b/doc/UsingMultiDimExecutionRanges.md
new file mode 100644
index 0000000000000000000000000000000000000000..adaf8190f42f3f54fcf93796c34b2749a7ebf616
--- /dev/null
+++ b/doc/UsingMultiDimExecutionRanges.md
@@ -0,0 +1,60 @@
#UsingMultiDimExecutionRanges
*How to use the new Range class (for multi-dim range access). Updated Feb 13, 2012 by frost.g...@gmail.com*

Aparapi now allows developers to execute over one, two or three dimensional ranges. OpenCL natively allows the user to execute over 1, 2 or 3 dimension grids via the clEnqueueNDRangeKernel() method.

Initially we chose not to expose 2D or 3D ranges (Aparapi's Kernel.execute(range) allowed only 1D ranges), but following a specific request we added the notion of a Range via the new com.amd.aparapi.Range class.

A range is created using various static factory methods. For example, to create a simple range {0..1023} we would use:

    Range range = Range.create(1024);

In this case the range will span 0..1023 and a 'default' group size will be decided behind the scenes (probably 256 in this case).

If the user wishes to select a specific group size (say 32) for a one dimensional Range then they can use:

    Range range = Range.create(1024, 32);

The group size must always be a 'factor' of the global range. So globalRange % groupSize == 0.

For a 2D range we use the Range.create2D(...) factory methods.

    Range range = Range.create2D(32, 32);

The above represents a 2D grid of execution 32 rows by 32 columns. In this case a default group size will be determined by the runtime.
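
If you are curious which group size the runtime chose, you can query the Range before executing. A small sketch (the class name is illustrative; it uses the Range.getLocalSize(dim) accessor seen in the local memory guide, and the printed sizes depend on your device and runtime):

    import com.amd.aparapi.Range;

    public class RangeProbe {
       public static void main(String[] args) {
          Range range = Range.create2D(32, 32); // let the runtime pick the group size
          System.out.println("group size = " + range.getLocalSize(0) + " x " + range.getLocalSize(1));
          // Whatever the runtime picks, each group dimension divides the
          // corresponding global dimension exactly (globalRange % groupSize == 0).
       }
    }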

If we wish to specify the group size (say 4x4) then we can use:

    Range range = Range.create2D(32, 32, 4, 4);

This example uses a 2D range to apply a blurring convolution effect to a pixel buffer:

    final static int WIDTH=128;
    final static int HEIGHT=64;
    final int in[] = new int[WIDTH*HEIGHT];
    final int out[] = new int[WIDTH*HEIGHT];
    Kernel kernel = new Kernel(){
       public void run(){
          int x = getGlobalId(0);
          int y = getGlobalId(1);
          if (x>0 && x<(getGlobalSize(0)-1) && y>0 && y<(getGlobalSize(1)-1)){
             int sum = 0;
             for (int dx =-1; dx<2; dx++){
                for (int dy =-1; dy<2; dy++){
                   sum+=in[(y+dy)*getGlobalSize(0)+(x+dx)];
                }
             }
             out[y*getGlobalSize(0)+x] = sum/9;
          }
       }
    };
    Range range = Range.create2D(WIDTH, HEIGHT);
    kernel.execute(range);

##Handling this from JTP mode
Mapping to OpenCL for this is all fairly straightforward.

In Java JTP mode we have to emulate the execution over the 1D, 2D and 3D ranges using threads. Note that the number of threads we launch is essentially the size of the group, so be careful creating large groups.

If we ask for a 3D range using:

    Range range = Range.create3D(1024, 1024, 1024, 8, 8, 8);

we are asking for a group size of 8x8x8 == 512. So we are asking for 512 threads!
\ No newline at end of file
diff --git a/doc/uml.png b/doc/uml.png
new file mode 100644
index 0000000000000000000000000000000000000000..5dfc8690ae8500506d8337472ce5efae832de47c
Binary files /dev/null and b/doc/uml.png differ
diff --git a/samples/blackscholes/src/com/amd/aparapi/sample/blackscholes/Main.java b/samples/blackscholes/src/com/amd/aparapi/sample/blackscholes/Main.java
index bdc1486254513bbc07b61b1cb84e3313debca650..074ed2b013182be33f73aa4325d463c256aecbd0 100644
--- a/samples/blackscholes/src/com/amd/aparapi/sample/blackscholes/Main.java
+++ b/samples/blackscholes/src/com/amd/aparapi/sample/blackscholes/Main.java
@@ -76,7 +76,6 @@ public class Main{
    /**
     * @brief Abromowitz Stegun approxmimation for PHI (Cumulative Normal Distribution Function)
     * @param X input value
-    * @param phi pointer to store calculated CND of X
     */
    float phi(float X) {
       final float c1 = 0.319381530f;
@@ -183,18 +182,15 @@ public class Main{
       int size = Integer.getInteger("size", 512);
       Range range = Range.create(size);
-      int iterations = Integer.getInteger("iterations", 5);
+      int iterations = Integer.getInteger("iterations", 20);
       System.out.println("size =" + size);
       System.out.println("iterations =" + iterations);
       BlackScholesKernel kernel = new BlackScholesKernel(size);
-      long totalExecTime = 0;
-      long iterExecTime = 0;
-      /* for (int i = 0; i < iterations; i++) {
-         iterExecTime = kernel.execute(size).getExecutionTime();
-         totalExecTime += iterExecTime;
-      }*/
+         kernel.execute(size).getExecutionTime();
+      }
+      kernel.execute(range, iterations);
       System.out.println("Average execution time " + kernel.getAccumulatedExecutionTime() / iterations);
       kernel.showResults(10);
diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/AutoCleanUpArraysDemo.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/AutoCleanUpArraysDemo.java
new file mode 100644
index 0000000000000000000000000000000000000000..c09d0ab218bec2c0a303a77517890397ede4b2d5
--- /dev/null
+++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/AutoCleanUpArraysDemo.java
@@ -0,0 +1,20 @@
package com.amd.aparapi.sample.configuration;

import com.amd.aparapi.sample.mandel.*;

public class AutoCleanUpArraysDemo {
   public static void main(String[] ignored) {

System.setProperty("com.amd.aparapi.dumpProfileOnExecution", "true"); + + int size = 1024; + int[] rgbs = new int[size * size]; + Main.MandelKernel kernel = new Main.MandelKernel(size, size, rgbs); + kernel.setAutoCleanUpArrays(true); + kernel.execute(size * size); + System.out.println("length = " + kernel.getRgbs().length); + kernel.resetImage(size, size, rgbs); + kernel.execute(size * size); + System.out.println("length = " + kernel.getRgbs().length); + } +} diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/CleanUpArraysDemo.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/CleanUpArraysDemo.java new file mode 100644 index 0000000000000000000000000000000000000000..26d832f4b2d1db2be339c6933fff405d642b6a7c --- /dev/null +++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/CleanUpArraysDemo.java @@ -0,0 +1,25 @@ +package com.amd.aparapi.sample.configuration; + +import com.amd.aparapi.sample.mandel.*; + +public class CleanUpArraysDemo { + public static void main(String[] ignored) { + + System.setProperty("com.amd.aparapi.enableVerboseJNI", "true"); + System.setProperty("com.amd.aparapi.enableVerboseJNIOpenCLResourceTracking", "true"); + System.setProperty("com.amd.aparapi.enableExecutionModeReporting", "true"); + System.setProperty("com.amd.aparapi.dumpProfileOnExecution", "true"); + + int size = 1024; + int[] rgbs = new int[size * size]; + Main.MandelKernel kernel = new Main.MandelKernel(size, size, rgbs); + kernel.execute(size * size); + System.out.println("length = " + kernel.getRgbs().length); + System.out.println("Cleaning up arrays"); + kernel.cleanUpArrays(); + System.out.println("length = " + kernel.getRgbs().length); + kernel.resetImage(size, size, rgbs); + kernel.execute(size * size); + System.out.println("length = " + kernel.getRgbs().length); + } +} diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/ConfigurationDemo.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/ConfigurationDemo.java new file mode 100644 index 0000000000000000000000000000000000000000..67d7cc0296b1432303e80ab8fd39fec477f64891 --- /dev/null +++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/ConfigurationDemo.java @@ -0,0 +1,80 @@ +package com.amd.aparapi.sample.configuration; + +import com.amd.aparapi.*; +import com.amd.aparapi.internal.kernel.*; + +import java.util.*; + +/** + * Tests device selection via {@link com.amd.aparapi.internal.kernel.KernelManager}. 
+ */ +public class ConfigurationDemo { + public static void main(String[] ignored) { + StringBuilder report; + + List<Integer> tests = Arrays.asList(0, 1, 2, 3); + int reps = 1; + for (int rep = 0; rep < reps; ++rep) { + runTests(rep == 0, tests); + + if (rep % 100 == 99 || rep == 0 || rep == reps - 1) { + report = new StringBuilder("rep = " + rep + "\n"); + KernelManager.instance().reportDeviceUsage(report, true); + System.out.println(report); + } + } + } + + private static void runTests(boolean verbose, List<Integer> testIndicesToRun) { + final int globalSize = 1; + Kernel kernel; + if (testIndicesToRun.contains(0)) { + if (verbose) { + System.out.println(); + System.out.println("Testing default KernelPreferences with kernel which cannot be run in OpenCL, with fallback algorithm"); + System.out.println(); + } + kernel = new KernelWithAlternateFallbackAlgorithm(); + kernel.execute(globalSize); + kernel.dispose(); + } + + if (testIndicesToRun.contains(1)) { + if (verbose) { + System.out.println(); + System.out.println("Testing default KernelPreferences with kernel which cannot be run in OpenCL, without fallback algorithm"); + System.out.println(); + } + kernel = new KernelWithoutAlternateFallbackAlgorithm(); + kernel.execute(globalSize); + kernel.dispose(); + } + + if (testIndicesToRun.contains(2)) { + if (verbose) { + System.out.println(); + System.out.println("Retesting previous case, should jump straight to regular java implementation without warnings"); + System.out.println(); + } + kernel = new KernelWithoutAlternateFallbackAlgorithm(); + kernel.execute(globalSize); + kernel.dispose(); + } + + if (testIndicesToRun.contains(3)) { + if (verbose) { + System.out.println(); + System.out.println("Testing default KernelPreferences with kernel which should be run in OpenCL"); + System.out.println(); + } + KernelOkayInOpenCL clKernel = new KernelOkayInOpenCL(); + kernel = clKernel; + kernel.execute(clKernel.inChars.length); + String result = new String(clKernel.outChars); + if (verbose) { + System.out.println("kernel output: " + result); + } + kernel.dispose(); + } + } +} diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/CustomConfigurationDemo.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/CustomConfigurationDemo.java new file mode 100644 index 0000000000000000000000000000000000000000..5ee4b8eb53155ecbe3d69737f46b347ecf45f693 --- /dev/null +++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/CustomConfigurationDemo.java @@ -0,0 +1,42 @@ +package com.amd.aparapi.sample.configuration; + +import com.amd.aparapi.device.*; +import com.amd.aparapi.internal.kernel.*; + +import java.util.*; + +/** + * Created by Barney on 31/08/2015. 
+ */ +public class CustomConfigurationDemo { + + public static void main(String[] ignored) { + System.setProperty("com.amd.aparapi.dumpProfilesOnExit", "true"); + KernelManager manager = new KernelManager() { + @Override + protected List<Device.TYPE> getPreferredDeviceTypes() { + return Arrays.asList(Device.TYPE.CPU, Device.TYPE.ALT, Device.TYPE.JTP); + } + }; + KernelManager.setKernelManager(manager); + + System.out.println("\nTesting custom KernelPreferences with kernel, preferences choose CPU"); + KernelOkayInOpenCL kernel = new KernelOkayInOpenCL(); + kernel.execute(kernel.inChars.length); + System.out.println(kernel.outChars); + + System.out.println("\nTesting custom KernelPreferences with kernel, preferences specify CPU but kernel vetos CPU"); + kernel = new KernelOkayInOpenCL() { + @Override + public boolean isAllowDevice(Device _device) { + return _device.getType() != Device.TYPE.CPU; + } + }; + kernel.execute(kernel.inChars.length); + System.out.println(kernel.outChars); + + StringBuilder report = new StringBuilder("\n"); + KernelManager.instance().reportDeviceUsage(report, true); + System.out.println(report); + } +} diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelOkayInOpenCL.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelOkayInOpenCL.java new file mode 100644 index 0000000000000000000000000000000000000000..9423d7159affedccce83b34a68409d0bc307940a --- /dev/null +++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelOkayInOpenCL.java @@ -0,0 +1,21 @@ +package com.amd.aparapi.sample.configuration; + +/** + * Created by Barney on 24/08/2015. + */ +public class KernelOkayInOpenCL extends com.amd.aparapi.Kernel { + char[] inChars = "KernelOkayInOpenCL".toCharArray(); + char[] outChars = new char[inChars.length]; + + @Override + public void run() { + int index = getGlobalId(); + oops(); + outChars[index] = inChars[index]; + } + + @NoCL + private void oops() { + System.out.println("Oops, running in kernel in Java"); + } +} diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelWithAlternateFallbackAlgorithm.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelWithAlternateFallbackAlgorithm.java new file mode 100644 index 0000000000000000000000000000000000000000..6aee117e412dc48041f6b1c234f09494360ea369 --- /dev/null +++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelWithAlternateFallbackAlgorithm.java @@ -0,0 +1,24 @@ +package com.amd.aparapi.sample.configuration; + +import com.amd.aparapi.*; + +/** + * Kernel which will always fail to run on an OpenCLDevice but has an alternative fallback algorithm. 
 */
public class KernelWithAlternateFallbackAlgorithm extends Kernel {
   @Override
   public void run() {
      // deliberately, will fail to generate OpenCL as println is unsupported
      System.out.println("Running in Java (regular algorithm)");
   }

   @Override
   public boolean hasFallbackAlgorithm() {
      return true;
   }

   @Override
   public void executeFallbackAlgorithm(Range _range, int _passes) {
      System.out.println("Running in Java (alternate non-parallel algorithm)");
   }
}
diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelWithoutAlternateFallbackAlgorithm.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelWithoutAlternateFallbackAlgorithm.java
new file mode 100644
index 0000000000000000000000000000000000000000..bdc1a12099458950c5a3b860dbd50aab0442d96e
--- /dev/null
+++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/KernelWithoutAlternateFallbackAlgorithm.java
@@ -0,0 +1,14 @@
package com.amd.aparapi.sample.configuration;

import com.amd.aparapi.*;

/**
 * Kernel which will always fail to run on an OpenCLDevice and has no alternative fallback algorithm.
 */
public class KernelWithoutAlternateFallbackAlgorithm extends Kernel {
   @Override
   public void run() {
      // deliberately, will fail to generate OpenCL as println is unsupported
      System.out.println("Running in Java (regular algorithm)");
   }
}
diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/LegacyConfigurationDemo.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/LegacyConfigurationDemo.java
new file mode 100644
index 0000000000000000000000000000000000000000..73ea9d70a79aa0ebf11e63cb11188c805303b0e8
--- /dev/null
+++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/LegacyConfigurationDemo.java
@@ -0,0 +1,26 @@
package com.amd.aparapi.sample.configuration;

import com.amd.aparapi.*;
import com.amd.aparapi.internal.kernel.*;

/**
 * Tests device selection when circumventing the {@link com.amd.aparapi.internal.kernel.KernelManager} by using the legacy mechanism
 * (setExecutionMode, etc.).
 */
public class LegacyConfigurationDemo {

   @SuppressWarnings("deprecation")
   public static void main(String[] ignored) {
      System.setProperty("com.amd.aparapi.executionMode", "GPU,CPU,SEQ");
      System.setProperty("com.amd.aparapi.dumpProfilesOnExit", "true");

      KernelWithAlternateFallbackAlgorithm kernel = new KernelWithAlternateFallbackAlgorithm();
      kernel.setExecutionMode(Kernel.EXECUTION_MODE.GPU);
      int globalRange = 1;
      kernel.execute(globalRange);

      StringBuilder report = new StringBuilder("\n");
      KernelManager.instance().reportDeviceUsage(report, true);
      System.out.println(report);
   }
}
diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/ProfilingDemo.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/ProfilingDemo.java
new file mode 100644
index 0000000000000000000000000000000000000000..aeea4ea5888c4bcf13b0dddf5fcad7cb05038edc
--- /dev/null
+++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/ProfilingDemo.java
@@ -0,0 +1,83 @@
package com.amd.aparapi.sample.configuration;

import com.amd.aparapi.*;
import com.amd.aparapi.internal.kernel.*;
import com.amd.aparapi.sample.blackscholes.Main.*;
import com.amd.aparapi.sample.mandel.*;

/**
 * Demonstrate new enhanced profiling capability, profiling the kernel from the blackscholes sample.
+ */ +public class ProfilingDemo { + + private static BlackScholesKernel kernel; + + public static void main(String[] ignored) { + + final int size = 1024; + newBlackScholesKernel(size); + + // first execute an arbitrary Kernel (not the one we are profiling!) a few times to ensure class loading and initial JIT optimisations have + // been performed before we start the profiling + int warmups = 5; + for (int i = 0; i < warmups; ++i) { + runWarmup(); + } + + String tableHeader = KernelDeviceProfile.getTableHeader(); + + boolean newKernel = false; + + runOnce(size, newKernel); + System.out.println("First run:"); + printLastProfile(tableHeader); + + + int reps = 20; + + System.out.println("\nSubsequent runs using same kernel:"); + for (int rep = 0; rep < reps; ++rep) { + runOnce(size, newKernel); + printLastProfile(tableHeader); + } + + newKernel = true; + System.out.println("\nSubsequent runs using new kernels:"); + for (int rep = 0; rep < reps; ++rep) { + runOnce(size, newKernel); + printLastProfile(tableHeader); + } + + // Note. You will see from the output that there is a substantial cost to Kernel creation (vs Kernel reuse), almost entirely due to KernelRunner#initJNI + + } + + private static void printLastProfile(String tableHeader) { + KernelProfile profile = KernelManager.instance().getProfile(BlackScholesKernel.class); + KernelDeviceProfile deviceProfile = profile.getLastDeviceProfile(); + String row = deviceProfile.getLastAsTableRow(); + System.out.println(tableHeader); + System.out.println(row); + } + + private static void runOnce(int size, boolean newKernel) { + if (newKernel) { + newBlackScholesKernel(size); + } + kernel.execute(size); + } + + private static void runWarmup() { + int[] rgb = new int[512 * 512]; + Kernel warmupKernel = new Main.MandelKernel(512, 512, rgb); + warmupKernel.execute(512 * 512); + } + + private static void newBlackScholesKernel(int size) { + if (kernel != null) { + kernel.dispose(); + } + System.gc(); + kernel = new BlackScholesKernel(size); + } +} diff --git a/samples/configuration/src/com/amd/aparapi/sample/configuration/ProfilingDemoNoBinaryCaching.java b/samples/configuration/src/com/amd/aparapi/sample/configuration/ProfilingDemoNoBinaryCaching.java new file mode 100644 index 0000000000000000000000000000000000000000..2f3252c68cf63a67e21b2c2603ea1b5d333a25de --- /dev/null +++ b/samples/configuration/src/com/amd/aparapi/sample/configuration/ProfilingDemoNoBinaryCaching.java @@ -0,0 +1,14 @@ +package com.amd.aparapi.sample.configuration; + +import com.amd.aparapi.internal.kernel.*; + +/** + * Created by Barney on 13/09/2015. + */ +public class ProfilingDemoNoBinaryCaching { + + public static void main(String[] ignored) { + KernelRunner.BINARY_CACHING_DISABLED = true; + ProfilingDemo.main(null); + } +} diff --git a/samples/convolution/src/com/amd/aparapi/sample/convolution/Convolution.java b/samples/convolution/src/com/amd/aparapi/sample/convolution/Convolution.java index fc70267efa87412914e40580fce6ffb3f9fb66c9..597317a6af365eefe16ab223f5e2b4d0c1164261 100644 --- a/samples/convolution/src/com/amd/aparapi/sample/convolution/Convolution.java +++ b/samples/convolution/src/com/amd/aparapi/sample/convolution/Convolution.java @@ -38,15 +38,15 @@ under those regulations, please refer to the U.S. 
Bureau of Industry and Securit package com.amd.aparapi.sample.convolution; -import java.io.File; +import com.amd.aparapi.*; -import com.amd.aparapi.Kernel; +import java.io.*; public class Convolution { - public static void main(final String[] _args) { + public static void main(final String[] _args) throws IOException { - final File file = new File(_args.length == 1 ? _args[0] : "testcard.jpg"); + final File file = new File(_args.length == 1 ? _args[0] : "./samples/convolution/testcard.jpg").getCanonicalFile(); final ImageConvolution convolution = new ImageConvolution(); diff --git a/samples/convolution/src/com/amd/aparapi/sample/convolution/ConvolutionOpenCL.java b/samples/convolution/src/com/amd/aparapi/sample/convolution/ConvolutionOpenCL.java index 57f96b060a8c0993309d714e7e804fffa704bcc8..4b916b252e1ba399bea1c57f5860c2f4d6d9ea68 100644 --- a/samples/convolution/src/com/amd/aparapi/sample/convolution/ConvolutionOpenCL.java +++ b/samples/convolution/src/com/amd/aparapi/sample/convolution/ConvolutionOpenCL.java @@ -38,13 +38,13 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit package com.amd.aparapi.sample.convolution; -import java.io.File; +import com.amd.aparapi.*; +import com.amd.aparapi.device.*; +import com.amd.aparapi.internal.kernel.*; +import com.amd.aparapi.opencl.*; +import com.amd.aparapi.opencl.OpenCL.*; -import com.amd.aparapi.Range; -import com.amd.aparapi.device.Device; -import com.amd.aparapi.device.OpenCLDevice; -import com.amd.aparapi.opencl.OpenCL; -import com.amd.aparapi.opencl.OpenCL.Resource; +import java.io.*; public class ConvolutionOpenCL{ @@ -61,7 +61,7 @@ public class ConvolutionOpenCL{ public static void main(final String[] _args) { final File file = new File(_args.length == 1 ? _args[0] : "testcard.jpg"); - final OpenCLDevice openclDevice = (OpenCLDevice) Device.best(); + final OpenCLDevice openclDevice = (OpenCLDevice) KernelManager.instance().bestDevice(); final Convolution convolution = openclDevice.bind(Convolution.class); final float convMatrix3x3[] = new float[] { diff --git a/samples/extension/src/com/amd/aparapi/sample/extension/FFTExample.java b/samples/extension/src/com/amd/aparapi/sample/extension/FFTExample.java index 9284fa503455f429b2f10d9cfdaa519e7f183650..7c575c7a2c8200b95e8755e49fc8d15992ac1ea4 100644 --- a/samples/extension/src/com/amd/aparapi/sample/extension/FFTExample.java +++ b/samples/extension/src/com/amd/aparapi/sample/extension/FFTExample.java @@ -1,12 +1,12 @@ package com.amd.aparapi.sample.extension; -import java.util.Arrays; +import com.amd.aparapi.*; +import com.amd.aparapi.device.*; +import com.amd.aparapi.internal.kernel.*; +import com.amd.aparapi.opencl.*; +import com.amd.aparapi.opencl.OpenCL.*; -import com.amd.aparapi.Range; -import com.amd.aparapi.device.Device; -import com.amd.aparapi.device.OpenCLDevice; -import com.amd.aparapi.opencl.OpenCL; -import com.amd.aparapi.opencl.OpenCL.Resource; +import java.util.*; public class FFTExample{ @@ -98,7 +98,7 @@ public class FFTExample{ final float imaginary[] = new float[LEN]; final float referenceReal[] = Arrays.copyOf(real, real.length); final float referenceImaginary[] = Arrays.copyOf(imaginary, imaginary.length); - final OpenCLDevice device = (OpenCLDevice) Device.best(); + final OpenCLDevice device = (OpenCLDevice) KernelManager.instance().getDefaultPreferences().getPreferredDevice(null); final FFT fft = device.bind(FFT.class); for (int i = 0; i < LEN; i++) { initial[i] = real[i] = referenceReal[i] = (float) (Math.random() * 256); diff --git 
a/samples/extension/src/com/amd/aparapi/sample/extension/Histogram.java b/samples/extension/src/com/amd/aparapi/sample/extension/Histogram.java index 54b06c3b0a98eae4b8685b3762959f44d9c9e232..e260d5e825f5a287f987bcc5ac063ed68a8a0041 100644 --- a/samples/extension/src/com/amd/aparapi/sample/extension/Histogram.java +++ b/samples/extension/src/com/amd/aparapi/sample/extension/Histogram.java @@ -41,7 +41,6 @@ public class Histogram{ System.out.println("binResult size=" + binResult.length); final int[] histo = new int[BIN_SIZE]; final int[] refHisto = new int[BIN_SIZE]; - final Device device = Device.firstGPU(); final Kernel k = new Kernel(){ @Override public void run() { @@ -52,6 +51,7 @@ public class Histogram{ } }; + final Device device = k.getTargetDevice(); final Range range2 = device.createRange(BIN_SIZE); k.execute(range2); diff --git a/samples/extension/src/com/amd/aparapi/sample/extension/HistogramIdeal.java b/samples/extension/src/com/amd/aparapi/sample/extension/HistogramIdeal.java index 1ff76e9f5593423d2ab04c4dc73617937efc47f5..a0f74813706604358021cdc53d02663332d63a67 100644 --- a/samples/extension/src/com/amd/aparapi/sample/extension/HistogramIdeal.java +++ b/samples/extension/src/com/amd/aparapi/sample/extension/HistogramIdeal.java @@ -3,6 +3,7 @@ package com.amd.aparapi.sample.extension; import com.amd.aparapi.Range; import com.amd.aparapi.device.Device; import com.amd.aparapi.device.OpenCLDevice; +import com.amd.aparapi.internal.kernel.*; import com.amd.aparapi.opencl.OpenCL; public class HistogramIdeal{ @@ -40,7 +41,7 @@ public class HistogramIdeal{ System.out.println("binResult size=" + binResult.length); final int[] histo = new int[BIN_SIZE]; final int[] refHisto = new int[BIN_SIZE]; - final Device device = Device.best(); + final Device device = KernelManager.instance().bestDevice(); if (device != null) { System.out.println(((OpenCLDevice) device).getOpenCLPlatform().getName()); diff --git a/samples/extension/src/com/amd/aparapi/sample/extension/MandelExample.java b/samples/extension/src/com/amd/aparapi/sample/extension/MandelExample.java index ba2d20ac2d6b496bb7b766ac0edecd7a1d781c3d..85ac9cda4614810b3936c568ec47d39213e06ba6 100644 --- a/samples/extension/src/com/amd/aparapi/sample/extension/MandelExample.java +++ b/samples/extension/src/com/amd/aparapi/sample/extension/MandelExample.java @@ -38,37 +38,17 @@ under those regulations, please refer to the U.S. 
Bureau of Industry and Securit package com.amd.aparapi.sample.extension; -import java.awt.BorderLayout; -import java.awt.Dimension; -import java.awt.FlowLayout; -import java.awt.Graphics; -import java.awt.Point; -import java.awt.event.ItemEvent; -import java.awt.event.ItemListener; -import java.awt.event.MouseAdapter; -import java.awt.event.MouseEvent; -import java.awt.event.WindowAdapter; -import java.awt.event.WindowEvent; -import java.awt.image.BufferedImage; -import java.awt.image.DataBufferInt; -import java.util.concurrent.BrokenBarrierException; -import java.util.concurrent.CyclicBarrier; - -import javax.swing.JComboBox; -import javax.swing.JComponent; -import javax.swing.JFrame; -import javax.swing.JLabel; -import javax.swing.JPanel; -import javax.swing.JTextField; - -import com.amd.aparapi.Range; -import com.amd.aparapi.device.Device; -import com.amd.aparapi.device.OpenCLDevice; -import com.amd.aparapi.internal.opencl.OpenCLPlatform; -import com.amd.aparapi.internal.util.OpenCLUtil; -import com.amd.aparapi.opencl.OpenCL; -import com.amd.aparapi.opencl.OpenCL.Resource; -import com.amd.aparapi.opencl.OpenCLAdapter; +import com.amd.aparapi.*; +import com.amd.aparapi.device.*; +import com.amd.aparapi.internal.kernel.*; +import com.amd.aparapi.opencl.*; +import com.amd.aparapi.opencl.OpenCL.*; + +import javax.swing.*; +import java.awt.*; +import java.awt.event.*; +import java.awt.image.*; +import java.util.concurrent.*; /** * An example Aparapi application which displays a view of the Mandelbrot set and lets the user zoom in to a particular point. @@ -418,7 +398,7 @@ public class MandelExample{ float offsetx = .0f; float offsety = .0f; - Device device = Device.best(); + Device device = KernelManager.instance().bestDevice(); if (device instanceof OpenCLDevice) { final OpenCLDevice openclDevice = (OpenCLDevice) device; diff --git a/samples/extension/src/com/amd/aparapi/sample/extension/MandelSimple.java b/samples/extension/src/com/amd/aparapi/sample/extension/MandelSimple.java index 1b9c993b7ed53ef6ad1db3f224aa5132b2540902..89faa7f2f6ef027a2d5163f9e7f139cb0080a43e 100644 --- a/samples/extension/src/com/amd/aparapi/sample/extension/MandelSimple.java +++ b/samples/extension/src/com/amd/aparapi/sample/extension/MandelSimple.java @@ -38,29 +38,16 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit package com.amd.aparapi.sample.extension; -import java.awt.BorderLayout; -import java.awt.Dimension; -import java.awt.FlowLayout; -import java.awt.Graphics; -import java.awt.Point; -import java.awt.event.MouseAdapter; -import java.awt.event.MouseEvent; -import java.awt.event.WindowAdapter; -import java.awt.event.WindowEvent; -import java.awt.image.BufferedImage; -import java.awt.image.DataBufferInt; - -import javax.swing.JComponent; -import javax.swing.JFrame; -import javax.swing.JLabel; -import javax.swing.JPanel; -import javax.swing.JTextField; - -import com.amd.aparapi.Range; -import com.amd.aparapi.device.Device; -import com.amd.aparapi.device.OpenCLDevice; -import com.amd.aparapi.opencl.OpenCL; -import com.amd.aparapi.opencl.OpenCL.Resource; +import com.amd.aparapi.*; +import com.amd.aparapi.device.*; +import com.amd.aparapi.internal.kernel.*; +import com.amd.aparapi.opencl.*; +import com.amd.aparapi.opencl.OpenCL.*; + +import javax.swing.*; +import java.awt.*; +import java.awt.event.*; +import java.awt.image.*; /** * An example Aparapi application which displays a view of the Mandelbrot set and lets the user zoom in to a particular point. 
diff --git a/samples/extension/src/com/amd/aparapi/sample/extension/MandelSimple.java b/samples/extension/src/com/amd/aparapi/sample/extension/MandelSimple.java
index 1b9c993b7ed53ef6ad1db3f224aa5132b2540902..89faa7f2f6ef027a2d5163f9e7f139cb0080a43e 100644
--- a/samples/extension/src/com/amd/aparapi/sample/extension/MandelSimple.java
+++ b/samples/extension/src/com/amd/aparapi/sample/extension/MandelSimple.java
@@ -38,29 +38,16 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit
 
 package com.amd.aparapi.sample.extension;
 
-import java.awt.BorderLayout;
-import java.awt.Dimension;
-import java.awt.FlowLayout;
-import java.awt.Graphics;
-import java.awt.Point;
-import java.awt.event.MouseAdapter;
-import java.awt.event.MouseEvent;
-import java.awt.event.WindowAdapter;
-import java.awt.event.WindowEvent;
-import java.awt.image.BufferedImage;
-import java.awt.image.DataBufferInt;
-
-import javax.swing.JComponent;
-import javax.swing.JFrame;
-import javax.swing.JLabel;
-import javax.swing.JPanel;
-import javax.swing.JTextField;
-
-import com.amd.aparapi.Range;
-import com.amd.aparapi.device.Device;
-import com.amd.aparapi.device.OpenCLDevice;
-import com.amd.aparapi.opencl.OpenCL;
-import com.amd.aparapi.opencl.OpenCL.Resource;
+import com.amd.aparapi.*;
+import com.amd.aparapi.device.*;
+import com.amd.aparapi.internal.kernel.*;
+import com.amd.aparapi.opencl.*;
+import com.amd.aparapi.opencl.OpenCL.*;
+
+import javax.swing.*;
+import java.awt.*;
+import java.awt.event.*;
+import java.awt.image.*;
 
 /**
  * An example Aparapi application which displays a view of the Mandelbrot set and lets the user zoom in to a particular point.
@@ -155,7 +142,7 @@ public class MandelSimple{
       float offsetx = .0f;
       float offsety = .0f;
 
-      final Device device = Device.best();
+      final Device device = KernelManager.instance().bestDevice();
 
       if (device instanceof OpenCLDevice) {
          final OpenCLDevice openclDevice = (OpenCLDevice) device;
diff --git a/samples/extension/src/com/amd/aparapi/sample/extension/Pow4Example.java b/samples/extension/src/com/amd/aparapi/sample/extension/Pow4Example.java
index 7bc046767b738a2fcbd496cc8155454863e31355..0ea3043e19eaf2fc0203beaffdb6709e7e1a2230 100644
--- a/samples/extension/src/com/amd/aparapi/sample/extension/Pow4Example.java
+++ b/samples/extension/src/com/amd/aparapi/sample/extension/Pow4Example.java
@@ -3,6 +3,7 @@ package com.amd.aparapi.sample.extension;
 import com.amd.aparapi.Range;
 import com.amd.aparapi.device.Device;
 import com.amd.aparapi.device.OpenCLDevice;
+import com.amd.aparapi.internal.kernel.*;
 import com.amd.aparapi.opencl.OpenCL;
 import com.amd.aparapi.opencl.OpenCL.Resource;
 
@@ -26,7 +27,7 @@ public class Pow4Example{
       final float[] squares = new float[size];
       final Range range = Range.create(size);
 
-      final Device device = Device.best();
+      final Device device = KernelManager.instance().bestDevice();
 
       if (device instanceof OpenCLDevice) {
          final OpenCLDevice openclDevice = (OpenCLDevice) device;
diff --git a/samples/extension/src/com/amd/aparapi/sample/extension/SquareExample.java b/samples/extension/src/com/amd/aparapi/sample/extension/SquareExample.java
index e2b2aa4814ec8baf558f6a24fa92e8c54cc3c9cf..58f01c0b8789a51ae886b73e608dc2f3bb98b25d 100644
--- a/samples/extension/src/com/amd/aparapi/sample/extension/SquareExample.java
+++ b/samples/extension/src/com/amd/aparapi/sample/extension/SquareExample.java
@@ -4,6 +4,7 @@ import com.amd.aparapi.ProfileInfo;
 import com.amd.aparapi.Range;
 import com.amd.aparapi.device.Device;
 import com.amd.aparapi.device.OpenCLDevice;
+import com.amd.aparapi.internal.kernel.*;
 import com.amd.aparapi.opencl.OpenCL;
 import com.amd.aparapi.opencl.OpenCL.Resource;
 import com.amd.aparapi.opencl.OpenCL.Source;
@@ -54,7 +55,7 @@ public class SquareExample{
       final float[] quads = new float[size];
       final Range range = Range.create(size);
 
-      final Device device = Device.best();
+      final Device device = KernelManager.instance().bestDevice();
 
       if (device instanceof OpenCLDevice) {
          final OpenCLDevice openclDevice = (OpenCLDevice) device;
diff --git a/samples/extension/src/com/amd/aparapi/sample/extension/SwapExample.java b/samples/extension/src/com/amd/aparapi/sample/extension/SwapExample.java
index 381b52faa4b694b5ae86a1618b8b2382c3a602cd..d5fe0bf9244580e431ae6d72ea6ae372a0998beb 100644
--- a/samples/extension/src/com/amd/aparapi/sample/extension/SwapExample.java
+++ b/samples/extension/src/com/amd/aparapi/sample/extension/SwapExample.java
@@ -3,6 +3,7 @@ package com.amd.aparapi.sample.extension;
 import com.amd.aparapi.Range;
 import com.amd.aparapi.device.Device;
 import com.amd.aparapi.device.OpenCLDevice;
+import com.amd.aparapi.internal.kernel.*;
 import com.amd.aparapi.opencl.OpenCL;
 
 public class SwapExample{
@@ -29,7 +30,7 @@ public class SwapExample{
       final float[] rhs = new float[size];
       final Range range = Range.create(size);
 
-      final Device device = Device.best();
+      final Device device = KernelManager.instance().bestDevice();
 
       if (device instanceof OpenCLDevice) {
          final OpenCLDevice openclDevice = (OpenCLDevice) device;
diff --git a/samples/info/src/com/amd/aparapi/sample/info/Main.java b/samples/info/src/com/amd/aparapi/sample/info/Main.java
index fcff248937d1be7a55fed94e9bf5a047ca6ece9e..8397715d404927671ebb496cbaeb8cd925ab6022 100644
--- a/samples/info/src/com/amd/aparapi/sample/info/Main.java
+++ b/samples/info/src/com/amd/aparapi/sample/info/Main.java
@@ -38,11 +38,11 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit
 
 package com.amd.aparapi.sample.info;
 
-import java.util.List;
+import com.amd.aparapi.device.*;
+import com.amd.aparapi.internal.kernel.*;
+import com.amd.aparapi.internal.opencl.*;
 
-import com.amd.aparapi.device.Device;
-import com.amd.aparapi.device.OpenCLDevice;
-import com.amd.aparapi.internal.opencl.OpenCLPlatform;
+import java.util.*;
 
 public class Main{
    public static void main(String[] _args) {
@@ -73,90 +73,13 @@ public class Main{
          platformc++;
       }
 
-      Device bestDevice = OpenCLDevice.best();
-      if (bestDevice == null) {
-         System.out.println("OpenCLDevice.best() returned null!");
-      } else {
-         System.out.println("OpenCLDevice.best() returned { ");
-         System.out.println("   Type : " + bestDevice.getType());
-         System.out.println("   GlobalMemSize : " + ((OpenCLDevice) bestDevice).getGlobalMemSize());
-         System.out.println("   LocalMemSize : " + ((OpenCLDevice) bestDevice).getLocalMemSize());
-         System.out.println("   MaxComputeUnits : " + ((OpenCLDevice) bestDevice).getMaxComputeUnits());
-         System.out.println("   MaxWorkGroupSizes : " + ((OpenCLDevice) bestDevice).getMaxWorkGroupSize());
-         System.out.println("   MaxWorkItemDimensions : " + ((OpenCLDevice) bestDevice).getMaxWorkItemDimensions());
-         System.out.println("}");
-      }
-
-      Device firstCPU = OpenCLDevice.firstCPU();
-      if (firstCPU == null) {
-         System.out.println("OpenCLDevice.firstCPU() returned null!");
-      } else {
-         System.out.println("OpenCLDevice.firstCPU() returned { ");
-         System.out.println("   Type : " + firstCPU.getType());
-         System.out.println("   GlobalMemSize : " + ((OpenCLDevice) firstCPU).getGlobalMemSize());
-         System.out.println("   LocalMemSize : " + ((OpenCLDevice) firstCPU).getLocalMemSize());
-         System.out.println("   MaxComputeUnits : " + ((OpenCLDevice) firstCPU).getMaxComputeUnits());
-         System.out.println("   MaxWorkGroupSizes : " + ((OpenCLDevice) firstCPU).getMaxWorkGroupSize());
-         System.out.println("   MaxWorkItemDimensions : " + ((OpenCLDevice) firstCPU).getMaxWorkItemDimensions());
-         System.out.println("}");
-      }
-
-      Device firstGPU = OpenCLDevice.firstGPU();
-      if (firstGPU == null) {
-         System.out.println("OpenCLDevice.firstGPU() returned null!");
-      } else {
-         System.out.println("OpenCLDevice.firstGPU() returned { ");
-         System.out.println("   Type : " + firstGPU.getType());
-         System.out.println("   GlobalMemSize : " + ((OpenCLDevice) firstGPU).getGlobalMemSize());
-         System.out.println("   LocalMemSize : " + ((OpenCLDevice) firstGPU).getLocalMemSize());
-         System.out.println("   MaxComputeUnits : " + ((OpenCLDevice) firstGPU).getMaxComputeUnits());
-         System.out.println("   MaxWorkGroupSizes : " + ((OpenCLDevice) firstGPU).getMaxWorkGroupSize());
-         System.out.println("   MaxWorkItemDimensions : " + ((OpenCLDevice) firstGPU).getMaxWorkItemDimensions());
-         System.out.println("}");
-      }
+      KernelPreferences preferences = KernelManager.instance().getDefaultPreferences();
+      System.out.println("\nDevices in preferred order:\n");
 
-      Device bestGPU = OpenCLDevice.bestGPU();
-      if (bestGPU == null) {
-         System.out.println("OpenCLDevice.bestGPU() returned null!");
-      } else {
-         System.out.println("OpenCLDevice.bestGPU() returned { ");
-         System.out.println("   Type : " + bestGPU.getType());
-         System.out.println("   GlobalMemSize : " + ((OpenCLDevice) bestGPU).getGlobalMemSize());
-         System.out.println("   LocalMemSize : " + ((OpenCLDevice) bestGPU).getLocalMemSize());
-         System.out.println("   MaxComputeUnits : " + ((OpenCLDevice) bestGPU).getMaxComputeUnits());
-         System.out.println("   MaxWorkGroupSizes : " + ((OpenCLDevice) bestGPU).getMaxWorkGroupSize());
-         System.out.println("   MaxWorkItemDimensions : " + ((OpenCLDevice) bestGPU).getMaxWorkItemDimensions());
-         System.out.println("}");
-      }
-
-      Device firstACC = OpenCLDevice.firstACC();
-      if (firstACC == null) {
-         System.out.println("OpenCLDevice.firstACC() returned null!");
-      } else {
-         System.out.println("OpenCLDevice.firstACC() returned { ");
-         System.out.println("   Type : " + firstACC.getType());
-         System.out.println("   GlobalMemSize : " + ((OpenCLDevice) firstACC).getGlobalMemSize());
-         System.out.println("   LocalMemSize : " + ((OpenCLDevice) firstACC).getLocalMemSize());
-         System.out.println("   MaxComputeUnits : " + ((OpenCLDevice) firstACC).getMaxComputeUnits());
-         System.out.println("   MaxWorkGroupSizes : " + ((OpenCLDevice) firstACC).getMaxWorkGroupSize());
-         System.out.println("   MaxWorkItemDimensions : " + ((OpenCLDevice) firstACC).getMaxWorkItemDimensions());
-         System.out.println("}");
+      for (Device device : preferences.getPreferredDevices(null)) {
+         System.out.println(device);
+         System.out.println();
       }
-
-      Device bestACC = OpenCLDevice.bestACC();
-      if (bestACC == null) {
-         System.out.println("OpenCLDevice.bestACC() returned null!");
-      } else {
-         System.out.println("OpenCLDevice.bestACC() returned { ");
-         System.out.println("   Type : " + bestACC.getType());
-         System.out.println("   GlobalMemSize : " + ((OpenCLDevice) bestACC).getGlobalMemSize());
-         System.out.println("   LocalMemSize : " + ((OpenCLDevice) bestACC).getLocalMemSize());
-         System.out.println("   MaxComputeUnits : " + ((OpenCLDevice) bestACC).getMaxComputeUnits());
-         System.out.println("   MaxWorkGroupSizes : " + ((OpenCLDevice) bestACC).getMaxWorkGroupSize());
-         System.out.println("   MaxWorkItemDimensions : " + ((OpenCLDevice) bestACC).getMaxWorkItemDimensions());
-         System.out.println("}");
-      }
-
    }
 }
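The rewritten info sample boils down to one loop over the KernelManager's preference order. Extracted as a stand-alone sketch; passing null requests the default, kernel-independent ordering, and KernelPreferences is assumed to live in com.amd.aparapi.internal.kernel, matching the wildcard import above:

    import com.amd.aparapi.device.Device;
    import com.amd.aparapi.internal.kernel.KernelManager;
    import com.amd.aparapi.internal.kernel.KernelPreferences;

    public class ListDevicesSketch{
       public static void main(String[] args) {
          final KernelPreferences preferences = KernelManager.instance().getDefaultPreferences();
          // Devices print in the order the manager would try them.
          for (Device device : preferences.getPreferredDevices(null)) {
             System.out.println(device);
          }
       }
    }
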
diff --git a/samples/life/src/com/amd/aparapi/sample/life/Main.java b/samples/life/src/com/amd/aparapi/sample/life/Main.java
index 963cceb9ed0750585f0891c483d1bead7c3b4dd8..e51ca5fafa4431d417c07206fff95782c4d168c0 100644
--- a/samples/life/src/com/amd/aparapi/sample/life/Main.java
+++ b/samples/life/src/com/amd/aparapi/sample/life/Main.java
@@ -38,26 +38,14 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit
 
 package com.amd.aparapi.sample.life;
 
-import java.awt.BorderLayout;
-import java.awt.Dimension;
-import java.awt.FlowLayout;
-import java.awt.Graphics;
-import java.awt.event.ActionEvent;
-import java.awt.event.ActionListener;
-import java.awt.image.BufferedImage;
-import java.awt.image.DataBufferInt;
-import java.util.List;
-
-import javax.swing.JButton;
-import javax.swing.JComponent;
-import javax.swing.JFrame;
-import javax.swing.JLabel;
-import javax.swing.JPanel;
-import javax.swing.WindowConstants;
-
 import com.amd.aparapi.Kernel;
-import com.amd.aparapi.ProfileInfo;
-import com.amd.aparapi.Range;
+import com.amd.aparapi.*;
+
+import javax.swing.*;
+import java.awt.*;
+import java.awt.event.*;
+import java.awt.image.*;
+import java.util.List;
 
 /**
  * An example Aparapi application which demonstrates Conways 'Game Of Life'.
@@ -239,7 +227,7 @@ public class Main{
          }
       });
       controlPanel.add(startButton);
-      controlPanel.add(new JLabel(lifeKernel.getExecutionMode().toString()));
+      controlPanel.add(new JLabel(lifeKernel.getTargetDevice().getShortDescription()));
       controlPanel.add(new JLabel("   Generations/Second="));
 
       final JLabel generationsPerSecond = new JLabel("0.00");
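The mandel sample below keeps a commented-out illustration of overriding device preference globally. Spelled out as a runnable sketch, under the assumption that the protected getPreferredDeviceTypes() hook named in that comment is the intended extension point:

    import java.util.Collections;
    import java.util.List;

    import com.amd.aparapi.device.Device;
    import com.amd.aparapi.internal.kernel.KernelManager;

    public class CpuOnlySketch{
       public static void main(String[] args) {
          // Prefer CPU devices for every kernel created after this call.
          KernelManager.setKernelManager(new KernelManager(){
             @Override protected List<Device.TYPE> getPreferredDeviceTypes() {
                return Collections.singletonList(Device.TYPE.CPU);
             }
          });
          System.out.println(KernelManager.instance().bestDevice());
       }
    }
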
- System.out.println("Execution mode=" + kernel.getExecutionMode()); + System.out.println("device=" + kernel.getTargetDevice()); // Window listener to dispose Kernel resources on user exit. frame.addWindowListener(new WindowAdapter(){ diff --git a/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java index 65c62320965962f53fcb5aa98bae0254a6aca5ca..5bdd9805077801c6620d6c7719a9ef8a7957da50 100644 --- a/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java +++ b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java @@ -38,24 +38,14 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit package com.amd.aparapi.sample.mandel; -import java.awt.Color; -import java.awt.Dimension; -import java.awt.Graphics; -import java.awt.Point; -import java.awt.event.MouseAdapter; -import java.awt.event.MouseEvent; -import java.awt.event.WindowAdapter; -import java.awt.event.WindowEvent; -import java.awt.image.BufferedImage; -import java.awt.image.DataBufferInt; -import java.util.List; - -import javax.swing.JComponent; -import javax.swing.JFrame; - import com.amd.aparapi.Kernel; -import com.amd.aparapi.ProfileInfo; -import com.amd.aparapi.Range; +import com.amd.aparapi.*; + +import javax.swing.*; +import java.awt.*; +import java.awt.event.*; +import java.awt.image.*; +import java.util.List; /** * An example Aparapi application which displays a view of the Mandelbrot set and lets the user zoom in to a particular point. @@ -97,11 +87,8 @@ public class Main2D{ /** * Initialize the Kernel. - * - * @param _width Mandelbrot image width - * @param _height Mandelbrot image height + * * @param _rgb Mandelbrot image RGB buffer - * @param _pallette Mandelbrot image palette */ public MandelKernel(int[] _rgb) { rgb = _rgb; @@ -156,6 +143,7 @@ public class Main2D{ @SuppressWarnings("serial") public static void main(String[] _args) { + final JFrame frame = new JFrame("MandelBrot"); /** Mandelbrot image height. */ @@ -209,8 +197,7 @@ public class Main2D{ System.arraycopy(rgb, 0, imageRgb, 0, rgb.length); viewer.repaint(); - // Report target execution mode: GPU or JTP (Java Thread Pool). - System.out.println("Execution mode=" + kernel.getExecutionMode()); + System.out.println("device=" + kernel.getTargetDevice()); // Window listener to dispose Kernel resources on user exit. 
diff --git a/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java
index 65c62320965962f53fcb5aa98bae0254a6aca5ca..5bdd9805077801c6620d6c7719a9ef8a7957da50 100644
--- a/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java
+++ b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java
@@ -38,24 +38,14 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit
 
 package com.amd.aparapi.sample.mandel;
 
-import java.awt.Color;
-import java.awt.Dimension;
-import java.awt.Graphics;
-import java.awt.Point;
-import java.awt.event.MouseAdapter;
-import java.awt.event.MouseEvent;
-import java.awt.event.WindowAdapter;
-import java.awt.event.WindowEvent;
-import java.awt.image.BufferedImage;
-import java.awt.image.DataBufferInt;
-import java.util.List;
-
-import javax.swing.JComponent;
-import javax.swing.JFrame;
-
 import com.amd.aparapi.Kernel;
-import com.amd.aparapi.ProfileInfo;
-import com.amd.aparapi.Range;
+import com.amd.aparapi.*;
+
+import javax.swing.*;
+import java.awt.*;
+import java.awt.event.*;
+import java.awt.image.*;
+import java.util.List;
 
 /**
  * An example Aparapi application which displays a view of the Mandelbrot set and lets the user zoom in to a particular point.
@@ -97,11 +87,8 @@ public class Main2D{
 
      /**
       * Initialize the Kernel.
-       *
-       * @param _width Mandelbrot image width
-       * @param _height Mandelbrot image height
+       *
       * @param _rgb Mandelbrot image RGB buffer
-       * @param _pallette Mandelbrot image palette
       */
      public MandelKernel(int[] _rgb) {
         rgb = _rgb;
@@ -156,6 +143,7 @@ public class Main2D{
 
   @SuppressWarnings("serial") public static void main(String[] _args) {
 
+      final JFrame frame = new JFrame("MandelBrot");
 
      /** Mandelbrot image height. */
 
@@ -209,8 +197,7 @@ public class Main2D{
      System.arraycopy(rgb, 0, imageRgb, 0, rgb.length);
      viewer.repaint();
 
-      // Report target execution mode: GPU or JTP (Java Thread Pool).
-      System.out.println("Execution mode=" + kernel.getExecutionMode());
+      System.out.println("device=" + kernel.getTargetDevice());
 
      // Window listener to dispose Kernel resources on user exit.
      frame.addWindowListener(new WindowAdapter(){
diff --git a/samples/mdarray/build.xml b/samples/mdarray/build.xml
index 7c5bf8ec6b3363b236090b2fd542ad50cd23766f..787fd0950e69d37599e2ba0b855c16714fa19710 100644
--- a/samples/mdarray/build.xml
+++ b/samples/mdarray/build.xml
@@ -19,7 +19,7 @@
 
    <target name="build" depends="clean">
       <mkdir dir="classes" />
-      <javac srcdir="src" destdir="classes" debug="on" includeantruntime="false" fork="true" memorymaximumsize="3G">
+      <javac srcdir="src" destdir="classes" debug="on" includeantruntime="false" fork="true" memorymaximumsize="1024m">
          <classpath>
             <pathelement path="../../com.amd.aparapi/dist/aparapi.jar" />
          </classpath>
diff --git a/samples/median/src/com/amd/aparapi/sample/median/MedianDemo.java b/samples/median/src/com/amd/aparapi/sample/median/MedianDemo.java
index 99fa6259ff541663b292bc0e0c29aaf6709d61c3..b94c359d1f03037bb6828bcb7d10751fcb489b4e 100644
--- a/samples/median/src/com/amd/aparapi/sample/median/MedianDemo.java
+++ b/samples/median/src/com/amd/aparapi/sample/median/MedianDemo.java
@@ -1,6 +1,6 @@
 package com.amd.aparapi.sample.median;
 
-import com.amd.aparapi.Kernel;
+import com.amd.aparapi.internal.kernel.*;
 
 import javax.imageio.*;
 import javax.swing.*;
@@ -12,32 +12,49 @@ import java.io.*;
  * Demonstrate use of __private namespaces and @NoCL annotations.
  */
 public class MedianDemo {
-   public final static BufferedImage testImage;
+   public static BufferedImage testImage;
 
    static {
       try {
-         File imageFile = new File("./../../../samples/convolution/testcard.jpg").getCanonicalFile();
-         testImage = ImageIO.read(imageFile);
+         File imageFile = new File("./samples/convolution/testcard.jpg").getCanonicalFile();
+         if (imageFile.exists()) {
+            testImage = ImageIO.read(imageFile);
+         }
       } catch (IOException e) {
          throw new RuntimeException(e);
       }
    }
 
-   private static final boolean TEST_JTP = false;
-
    public static void main(String[] ignored) {
       final int size = 5;
 
-      System.setProperty("com.amd.aparapi.enableShowGeneratedOpenCL", "true");
-
-      int[] argbs = testImage.getRGB(0, 0, testImage.getWidth(), testImage.getHeight(), null, 0, testImage.getWidth());
-      MedianKernel7x7 kernel = new MedianKernel7x7();
-      kernel._imageTypeOrdinal = MedianKernel7x7.RGB;
-      kernel._sourceWidth = testImage.getWidth();
-      kernel._sourceHeight = testImage.getHeight();
-      kernel._sourcePixels = argbs;
-      kernel._destPixels = new int[argbs.length];
-      if (TEST_JTP) {
-         kernel.setExecutionMode(Kernel.EXECUTION_MODE.JTP);
+      System.setProperty("com.amd.aparapi.dumpProfilesOnExit", "true");
+      boolean verbose = false;
+      if (verbose)
+      {
+         System.setProperty("com.amd.aparapi.enableVerboseJNI", "true");
+         System.setProperty("com.amd.aparapi.dumpFlags", "true");
+         System.setProperty("com.amd.aparapi.enableShowGeneratedOpenCL", "true");
+         System.setProperty("com.amd.aparapi.enableVerboseJNIOpenCLResourceTracking", "true");
+         System.setProperty("com.amd.aparapi.enableExecutionModeReporting", "true");
       }
+
+//      KernelManager.setKernelManager(new KernelManager(){
+//         @Override
+//         protected Comparator<OpenCLDevice> getDefaultGPUComparator() {
+//            return new Comparator<OpenCLDevice>() {
+//               @Override
+//               public int compare(OpenCLDevice o1, OpenCLDevice o2) {
+//                  return o2.getMaxComputeUnits() - o1.getMaxComputeUnits();
+//               }
+//            };
+//         }
+//      });
+
+      System.out.println(KernelManager.instance().bestDevice());
+
+      int[] argbs = testImage.getRGB(0, 0, testImage.getWidth(), testImage.getHeight(), null, 0, testImage.getWidth());
+      MedianKernel7x7 kernel = createMedianKernel(argbs);
+      kernel.processImages(new MedianSettings(size));
 
       BufferedImage out = new BufferedImage(testImage.getWidth(), testImage.getHeight(), BufferedImage.TYPE_INT_RGB);
       out.setRGB(0, 0, testImage.getWidth(), testImage.getHeight(), kernel._destPixels, 0, testImage.getWidth());
@@ -54,12 +71,35 @@ public class MedianDemo {
       frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
       frame.setVisible(true);
 
-      int reps = 20;
+      StringBuilder builder = new StringBuilder();
+      KernelManager.instance().reportDeviceUsage(builder, true);
+      System.out.println(builder);
+
+      int reps = 50;
+      final boolean newKernel = false;
       for (int rep = 0; rep < reps; ++rep) {
+         if (newKernel) {
+            kernel.dispose();
+            kernel = createMedianKernel(argbs);
+         }
          long start = System.nanoTime();
          kernel.processImages(new MedianSettings(size));
          long elapsed = System.nanoTime() - start;
          System.out.println("elapsed = " + elapsed / 1000000f + "ms");
       }
+
+      builder = new StringBuilder();
+      KernelManager.instance().reportDeviceUsage(builder, true);
+      System.out.println(builder);
+   }
+
+   private static MedianKernel7x7 createMedianKernel(int[] argbs) {
+      MedianKernel7x7 kernel = new MedianKernel7x7();
+      kernel._imageTypeOrdinal = MedianKernel7x7.RGB;
+      kernel._sourceWidth = testImage.getWidth();
+      kernel._sourceHeight = testImage.getHeight();
+      kernel._sourcePixels = argbs;
+      kernel._destPixels = new int[argbs.length];
+      return kernel;
+   }
 }
diff --git a/samples/median/src/com/amd/aparapi/sample/median/MedianKernel7x7.java b/samples/median/src/com/amd/aparapi/sample/median/MedianKernel7x7.java
index 6cbece4157e3518a897c4e1bfead8fdf2ba7dbbd..c393720be7b4b200645d039ec0b28425f8d86e5b 100644
--- a/samples/median/src/com/amd/aparapi/sample/median/MedianKernel7x7.java
+++ b/samples/median/src/com/amd/aparapi/sample/median/MedianKernel7x7.java
@@ -28,7 +28,8 @@ public class MedianKernel7x7 extends Kernel {
    protected int[] _destPixels;
 
    // NB could also use suffix naming instead of annotation ... field would be named _window_$private$49
-   @PrivateMemorySpace(MAX_WINDOW_SIZE) private short[] _window = new short[MAX_WINDOW_SIZE];
+   @PrivateMemorySpace(MAX_WINDOW_SIZE)
+   private short[] _window = new short[MAX_WINDOW_SIZE];
 
    @NoCL private static ThreadLocal<short[]> _threadLocalWindow = new ThreadLocal<short[]>() {
       @Override protected short[] initialValue() {
diff --git a/samples/progress/src/com/amd/aparapi/sample/progress/MultiPassKernelSwingWorkerDemo.java b/samples/progress/src/com/amd/aparapi/sample/progress/MultiPassKernelSwingWorkerDemo.java
index 7cc2584b1cb10d054f16632dd12ff27f2102c53b..7bfc91e4eea39ce1148611cfa428cdc6879a90bf 100644
--- a/samples/progress/src/com/amd/aparapi/sample/progress/MultiPassKernelSwingWorkerDemo.java
+++ b/samples/progress/src/com/amd/aparapi/sample/progress/MultiPassKernelSwingWorkerDemo.java
@@ -1,7 +1,7 @@
 package com.amd.aparapi.sample.progress;
 
-import com.amd.aparapi.Kernel;
-import com.amd.aparapi.internal.kernel.KernelRunner;
+import com.amd.aparapi.*;
+import com.amd.aparapi.internal.kernel.*;
 import com.amd.aparapi.util.swing.MultiPassKernelSwingWorker;
 
 import javax.swing.*;
@@ -23,13 +23,13 @@ public class MultiPassKernelSwingWorkerDemo {
    private static LongRunningKernel kernel;
 
    private static MultiPassKernelSwingWorker worker;
 
-   private static final boolean TEST_JTP = true;
+   private static final boolean TEST_JTP = false;
 
    public static void main(String[] ignored) throws Exception {
-      kernel = new LongRunningKernel();
       if (TEST_JTP) {
-         kernel.setExecutionMode(Kernel.EXECUTION_MODE.JTP);
+         KernelManager.setKernelManager(KernelManagers.JTP_ONLY);
       }
+      kernel = new LongRunningKernel();
       UIManager.setLookAndFeel(NimbusLookAndFeel.class.getName());
       JPanel rootPanel = new JPanel();
diff --git a/samples/progress/src/com/amd/aparapi/sample/progress/ProgressAndCancelDemo.java b/samples/progress/src/com/amd/aparapi/sample/progress/ProgressAndCancelDemo.java
index b114dcac4f19b5d93e6ec82b1d84da19193fa719..721f2c611ee06bf1fd3a144aedc16262785d84b5 100644
--- a/samples/progress/src/com/amd/aparapi/sample/progress/ProgressAndCancelDemo.java
+++ b/samples/progress/src/com/amd/aparapi/sample/progress/ProgressAndCancelDemo.java
@@ -1,13 +1,11 @@
 package com.amd.aparapi.sample.progress;
 
-import com.amd.aparapi.Kernel;
-import com.amd.aparapi.internal.kernel.KernelRunner;
+import com.amd.aparapi.internal.kernel.*;
 
 import javax.swing.*;
-import javax.swing.plaf.nimbus.NimbusLookAndFeel;
+import javax.swing.plaf.nimbus.*;
 import java.awt.*;
-import java.awt.event.ActionEvent;
-import java.awt.event.ActionListener;
+import java.awt.event.*;
 
 /**
  * Demonstrates progress tracking and cancellation for multi-pass kernels.
@@ -36,7 +34,7 @@ public class ProgressAndCancelDemo {
       kernel = new LongRunningKernel();
       if (TEST_JTP) {
-         kernel.setExecutionMode(Kernel.EXECUTION_MODE.JTP);
+         KernelManager.setKernelManager(KernelManagers.JTP_ONLY);
       }
 
       Thread asynchReader = new Thread() {
         @Override
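The two progress demos switch their JTP fallback from a per-kernel setExecutionMode(...) call to a process-wide manager installed before any kernel is constructed. As a sketch — KernelManagers is assumed to sit alongside KernelManager in com.amd.aparapi.internal.kernel, matching the wildcard imports above:

    import com.amd.aparapi.internal.kernel.KernelManager;
    import com.amd.aparapi.internal.kernel.KernelManagers;

    public class JtpOnlySketch{
       public static void main(String[] args) {
          // Force the Java Thread Pool for every kernel in this JVM;
          // this must run before the first kernel is created.
          KernelManager.setKernelManager(KernelManagers.JTP_ONLY);
          System.out.println(KernelManager.instance().bestDevice());
       }
    }
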
- System.out.println("Execution mode=" + kernel.getExecutionMode()); + System.out.println("Device = " + kernel.getTargetDevice().getShortDescription()); // Display computed square values. for (int i = 0; i < size; i++) { diff --git a/test/codegen/src/java/com/amd/aparapi/Source.java b/test/codegen/src/java/com/amd/aparapi/Source.java index a08c2872186874a4d7aef3c387116130718b3770..d9774096ed5499de4435e7da3e095b0c00d53bde 100644 --- a/test/codegen/src/java/com/amd/aparapi/Source.java +++ b/test/codegen/src/java/com/amd/aparapi/Source.java @@ -84,7 +84,7 @@ public class Source{ public Source(Class<?> _clazz, File _rootDir) { clazz = _clazz; - String srcName = clazz.getPackage().getName().replace(".", "/") + "/" + clazz.getSimpleName() + ".java"; + String srcName = clazz.getPackage().getName().replace(".", "/") + "/" + clazz + ".java"; file = new File(_rootDir, srcName); try { BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file))); diff --git a/test/runtime/src/java/com/amd/aparapi/test/runtime/BufferTransfer.java b/test/runtime/src/java/com/amd/aparapi/test/runtime/BufferTransfer.java index 847a69575859dce0ab6d894f34cd9812270a7f36..1f9a36fa893ca1d383606c82cf52538f11eec61c 100644 --- a/test/runtime/src/java/com/amd/aparapi/test/runtime/BufferTransfer.java +++ b/test/runtime/src/java/com/amd/aparapi/test/runtime/BufferTransfer.java @@ -1,17 +1,13 @@ package com.amd.aparapi.test.runtime; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; +import com.amd.aparapi.*; +import com.amd.aparapi.device.*; +import com.amd.aparapi.internal.kernel.*; +import org.junit.*; -import java.util.Arrays; +import java.util.*; -import org.junit.BeforeClass; -import org.junit.Test; - -import com.amd.aparapi.Kernel; -import com.amd.aparapi.Range; -import com.amd.aparapi.device.Device; -import com.amd.aparapi.device.OpenCLDevice; +import static org.junit.Assert.*; public class BufferTransfer{ @@ -19,7 +15,7 @@ public class BufferTransfer{ @BeforeClass public static void setUpBeforeClass() throws Exception { - Device device = Device.best(); + Device device = KernelManager.instance().bestDevice(); if (device == null || !(device instanceof OpenCLDevice)) { fail("no opencl device!"); } @@ -209,7 +205,7 @@ public class BufferTransfer{ for (int n = 0; n < neuronOutputs.length; n++) log[n][simStep[0]] = neuronOutputs[n]; } - System.out.println(getExecutionMode() + (isExplicit() ? ", explicit" : ", auto")); + System.out.println(getTargetDevice().getShortDescription() + (isExplicit() ? 
", explicit" : ", auto")); for (int n = 0; n < neuronOutputs.length; n++) System.out.println(Arrays.toString(log[n])); diff --git a/test/runtime/src/java/com/amd/aparapi/test/runtime/CallStaticFromAnonymousKernel.java b/test/runtime/src/java/com/amd/aparapi/test/runtime/CallStaticFromAnonymousKernel.java index ca70f1a2ec6c39b6da1114a1cd1f39262c5af4f0..8cfb0d251027af33dff7c4b884055a94c7a03adb 100644 --- a/test/runtime/src/java/com/amd/aparapi/test/runtime/CallStaticFromAnonymousKernel.java +++ b/test/runtime/src/java/com/amd/aparapi/test/runtime/CallStaticFromAnonymousKernel.java @@ -1,8 +1,10 @@ package com.amd.aparapi.test.runtime; -import static org.junit.Assert.assertTrue; -import org.junit.Test; -import com.amd.aparapi.Kernel; +import com.amd.aparapi.*; +import com.amd.aparapi.device.*; +import org.junit.*; + +import static org.junit.Assert.*; class AnotherClass{ static public int foo(int i) { @@ -42,7 +44,7 @@ public class CallStaticFromAnonymousKernel{ } }; kernel.execute(size); - assertTrue("ran on GPU", kernel.getExecutionMode() == Kernel.EXECUTION_MODE.GPU); + assertTrue("ran on GPU", kernel.getTargetDevice().getType() == Device.TYPE.GPU); for (int i = 0; i < size; i++) { assertTrue("results == fooBar", results[i] == (fooBar(values[i]) + AnotherClass.foo(i))); diff --git a/test/runtime/src/java/com/amd/aparapi/test/runtime/ExplicitBoolean.java b/test/runtime/src/java/com/amd/aparapi/test/runtime/ExplicitBoolean.java index c80b587ba5fff03255756764f1256c7eaab0a44a..c59efbd9f90b1fce79b1de38202f4084b8a0ed5f 100644 --- a/test/runtime/src/java/com/amd/aparapi/test/runtime/ExplicitBoolean.java +++ b/test/runtime/src/java/com/amd/aparapi/test/runtime/ExplicitBoolean.java @@ -1,10 +1,9 @@ package com.amd.aparapi.test.runtime; -import static org.junit.Assert.assertTrue; +import com.amd.aparapi.*; +import org.junit.*; -import org.junit.Test; - -import com.amd.aparapi.Kernel; +import static org.junit.Assert.*; public class ExplicitBoolean{ @@ -61,7 +60,7 @@ public class ExplicitBoolean{ printArray(k2.output); assertTrue("k1.input == k2.input", Util.same(k1.output, k1.output)); - System.out.println(k1.getExecutionMode()); + System.out.println(k1.getTargetDevice().getShortDescription()); } private static void printArray(boolean[] a) { diff --git a/test/runtime/src/java/com/amd/aparapi/test/runtime/LoadCL.java b/test/runtime/src/java/com/amd/aparapi/test/runtime/LoadCL.java index 28b2a73b50b76eba3ecee1751eaecdefaaaacf22..99d1764c9857952a461135f27ed89ab46cd12ba2 100644 --- a/test/runtime/src/java/com/amd/aparapi/test/runtime/LoadCL.java +++ b/test/runtime/src/java/com/amd/aparapi/test/runtime/LoadCL.java @@ -1,14 +1,13 @@ package com.amd.aparapi.test.runtime; -import com.amd.aparapi.Range; -import com.amd.aparapi.device.Device; -import com.amd.aparapi.device.OpenCLDevice; -import com.amd.aparapi.opencl.OpenCL; -import com.amd.aparapi.opencl.OpenCL.Resource; -import com.amd.aparapi.opencl.OpenCL.Source; +import com.amd.aparapi.*; +import com.amd.aparapi.device.*; +import com.amd.aparapi.internal.kernel.*; +import com.amd.aparapi.opencl.*; +import com.amd.aparapi.opencl.OpenCL.*; +import org.junit.*; -import static org.junit.Assert.assertTrue; -import org.junit.Test; +import static org.junit.Assert.*; public class LoadCL{ @@ -31,7 +30,7 @@ public class LoadCL{ final float[] quads = new float[size]; final Range range = Range.create(size); - final Device device = Device.best(); + final Device device = KernelManager.instance().bestDevice(); if (device instanceof OpenCLDevice) { final OpenCLDevice 
openclDevice = (OpenCLDevice) device; diff --git a/test/runtime/src/java/com/amd/aparapi/test/runtime/Test12x4_4x2.java b/test/runtime/src/java/com/amd/aparapi/test/runtime/Test12x4_4x2.java index 51cee4366bdc15dc648cfb647976f9eb7cf423b0..b415b7764a36e6cbb210a3df724b8752802998a2 100644 --- a/test/runtime/src/java/com/amd/aparapi/test/runtime/Test12x4_4x2.java +++ b/test/runtime/src/java/com/amd/aparapi/test/runtime/Test12x4_4x2.java @@ -1,12 +1,14 @@ package com.amd.aparapi.test.runtime; +import com.amd.aparapi.device.*; import org.junit.Test; import com.amd.aparapi.Kernel; import com.amd.aparapi.Range; public class Test12x4_4x2{ - @Test public void test() { + @SuppressWarnings("deprecation") + @Test public void test() { // globalThreadId, threadId, globalX, globalY, localX, localY final int[][] test = new int[][] { { @@ -446,7 +448,12 @@ public class Test12x4_4x2{ }; Kernel kernel = new Kernel(){ - @Override public void run() { + @Override + public boolean isAllowDevice(Device _device) { + return _device.getType() == Device.TYPE.JTP; + } + + @Override public void run() { int x = getGlobalId(0); int y = getGlobalId(1); int lx = getLocalId(0); @@ -492,7 +499,6 @@ public class Test12x4_4x2{ } }; - kernel.setExecutionMode(Kernel.EXECUTION_MODE.JTP); kernel.execute(Range.create2D(12, 4, 4, 2)); } diff --git a/test/runtime/src/java/com/amd/aparapi/test/runtime/UseStaticArray.java b/test/runtime/src/java/com/amd/aparapi/test/runtime/UseStaticArray.java index 41f4b0d21e02207f9ad02621c06f2776a67bf5fd..5ce32645e4c77beb6776101754dc2918f5c0d743 100644 --- a/test/runtime/src/java/com/amd/aparapi/test/runtime/UseStaticArray.java +++ b/test/runtime/src/java/com/amd/aparapi/test/runtime/UseStaticArray.java @@ -1,8 +1,10 @@ package com.amd.aparapi.test.runtime; +import com.amd.aparapi.*; +import com.amd.aparapi.device.*; +import org.junit.*; + import static org.junit.Assert.*; -import org.junit.Test; -import com.amd.aparapi.Kernel; public class UseStaticArray extends Kernel{ @@ -26,7 +28,7 @@ public class UseStaticArray extends Kernel{ execute(size); - assertTrue("ran on GPU", getExecutionMode() == Kernel.EXECUTION_MODE.GPU); + assertTrue("ran on GPU", getTargetDevice().getType() == Device.TYPE.GPU); assertArrayEquals("results == fooBar", results, values); // for (int i = 0; i < size; i++) {