diff --git a/build.xml b/build.xml
index ee646ad16617dfbc5c803d5828a09fc3aaca0193..3b19d8139945510362fdfcc7df3dd4e2cc8f5abb 100644
--- a/build.xml
+++ b/build.xml
@@ -35,6 +35,7 @@
          <fileset dir="test" includes="*/build.xml"/>
       </subant>
       <delete dir="examples\nbody\jogamp"/> <!-- we handle the jogamp delete here, save downloading each build -->
+      <delete file="test\codegen\junit-4.10.jar"/> <!-- we handle the junit delete here, save downloading each build -->
       <ant dir="com.amd.aparapi.jni" target="clean"/> 
       <ant dir="com.amd.aparapi" target="clean"/> 
    </target>
diff --git a/com.amd.aparapi.jni/src/cpp/aparapi.cpp b/com.amd.aparapi.jni/src/cpp/aparapi.cpp
index 29fa4c27e127e339e6f470cbeda81d866417a140..4e9c4b31188224e74b28f1e3cf68f0abe85aaf93 100644
--- a/com.amd.aparapi.jni/src/cpp/aparapi.cpp
+++ b/com.amd.aparapi.jni/src/cpp/aparapi.cpp
@@ -1,40 +1,40 @@
 /*
-Copyright (c) 2010-2011, Advanced Micro Devices, Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
-following conditions are met:
-
-Redistributions of source code must retain the above copyright notice, this list of conditions and the following
-disclaimer. 
-
-Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
-disclaimer in the documentation and/or other materials provided with the distribution. 
-
-Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission. 
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
-INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
-WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export
-laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through
-774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000.  Further, pursuant to Section 740.6 of the EAR,
-you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of 
-Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration 
-Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1,
-E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups
-D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject
-to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774
-of EAR).  For the most current Country Group listings, or for additional information about the EAR or your obligations
-under those regulations, please refer to the U.S. Bureau of Industry and Security’s website at http://www.bis.doc.gov/. 
-
-*/
+   Copyright (c) 2010-2011, Advanced Micro Devices, Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
+   following conditions are met:
+
+   Redistributions of source code must retain the above copyright notice, this list of conditions and the following
+   disclaimer. 
+
+   Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
+   disclaimer in the documentation and/or other materials provided with the distribution. 
+
+   Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products
+   derived from this software without specific prior written permission. 
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+   INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+   DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+   WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export
+   laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 
+   through 774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000.  Further, pursuant to Section 740.6 of
+   the EAR, you hereby certify that, except pursuant to a license granted by the United States Department of Commerce
+   Bureau of Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export 
+   Administration Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in 
+   Country Groups D:1, E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) 
+   export to Country Groups D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced
+   direct product is subject to national security controls as identified on the Commerce Control List (currently 
+   found in Supplement 1 to Part 774 of EAR).  For the most current Country Group listings, or for additional 
+   information about the EAR or your obligations under those regulations, please refer to the U.S. Bureau of Industry
+   and Security’s website at http://www.bis.doc.gov/. 
+   */
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -110,6 +110,10 @@ MicrosecondTimer timer;
 
 #include "com_amd_aparapi_KernelRunner.h"
 
+#define CHECK_NO_RETURN(condition, msg) if(condition){\
+   fprintf(stderr, "!!!!!!! %s failed !!!!!!!\n", msg);\
+}
+
 #define CHECK(condition, msg) if(condition){\
    fprintf(stderr, "!!!!!!! %s failed !!!!!!!\n", msg);\
    return 0;\
@@ -127,26 +131,9 @@ MicrosecondTimer timer;
 
 #define PRINT_CL_ERR(status, msg) fprintf(stderr, "!!!!!!! %s failed %s\n", msg, CLErrString(status));
 
-#define ASSERT_FIELD(id) CHECK(id##FieldID == 0, "No such field as " #id)
-
-#define GET_DEV_INFO(deviceId, param, val, format){\
-   status = clGetDeviceInfo(deviceId, param, sizeof(val), &(val), NULL);\
-   ASSERT_CL_NO_RETURN( "clGetDeviceInfo().");\
-   /*fprintf(stderr, #param " " format " \n", val);*/ \
-}
-
+#define ASSERT_FIELD(id) CHECK_NO_RETURN(id##FieldID == 0, "No such field as " #id)
 
-jfieldID typeFieldID;
-jfieldID isStaticFieldID;
-jfieldID nameFieldID;
-jfieldID javaArrayFieldID;
-jfieldID bytesPerLocalSizeFieldID;
-jfieldID sizeInBytesFieldID;
-jfieldID numElementsFieldID;
 
-// we rely on these being 0 initially to detect whether we have cached the above fieldId's 
-jclass clazz = (jclass)0;
-jclass argClazz = (jclass)0;
 
 static const char *CLErrString(cl_int status) {
    static struct { cl_int code; const char *msg; } error_table[] = {
@@ -214,7 +201,90 @@ static const char *CLErrString(cl_int status) {
 #endif
    return unknown;
 }
+class Range{
+   public:
+      static jclass rangeClazz;
+      static jfieldID globalSize_0_FieldID;
+      static jfieldID globalSize_1_FieldID;
+      static jfieldID globalSize_2_FieldID;
+      static jfieldID localSize_0_FieldID;
+      static jfieldID localSize_1_FieldID;
+      static jfieldID localSize_2_FieldID;
+      static jfieldID dimsFieldID;
+      static jfieldID localIsDerivedFieldID; 
+      jobject range;
+      cl_int dims;
+      size_t *offsets;
+      size_t *globalDims;
+      size_t *localDims;
+      jboolean localIsDerived;
+      Range(JNIEnv *jenv, jobject range):
+         range(range),
+         dims(0),
+         offsets(NULL),
+         globalDims(NULL),
+         localDims(NULL){
+            if (rangeClazz ==NULL){
+               jclass rangeClazz = jenv->GetObjectClass(range); 
+               globalSize_0_FieldID = jenv->GetFieldID(rangeClazz, "globalSize_0", "I"); ASSERT_FIELD(globalSize_0_);
+               globalSize_1_FieldID = jenv->GetFieldID(rangeClazz, "globalSize_1", "I"); ASSERT_FIELD(globalSize_1_);
+               globalSize_2_FieldID = jenv->GetFieldID(rangeClazz, "globalSize_2", "I"); ASSERT_FIELD(globalSize_2_);
+               localSize_0_FieldID = jenv->GetFieldID(rangeClazz, "localSize_0", "I"); ASSERT_FIELD(localSize_0_);
+               localSize_1_FieldID = jenv->GetFieldID(rangeClazz, "localSize_1", "I"); ASSERT_FIELD(localSize_1_);
+               localSize_2_FieldID = jenv->GetFieldID(rangeClazz, "localSize_2", "I"); ASSERT_FIELD(localSize_2_);
+               dimsFieldID = jenv->GetFieldID(rangeClazz, "dims", "I"); ASSERT_FIELD(dims);
+               localIsDerivedFieldID = jenv->GetFieldID(rangeClazz, "localIsDerived", "Z"); ASSERT_FIELD(localIsDerived);
+            }
+            dims = jenv->GetIntField(range, dimsFieldID);
+            localIsDerived = jenv->GetBooleanField(range, localIsDerivedFieldID);
+            if (dims >0){
+               //fprintf(stderr, "native range dims == %d\n", dims);
+               offsets = new size_t[dims];
+               globalDims = new size_t[dims];
+               localDims = new size_t[dims];
+               offsets[0]= 0;
+               localDims[0]= jenv->GetIntField(range, localSize_0_FieldID);
+               //fprintf(stderr, "native range localSize_0 == %d\n", localDims[0]);
+               globalDims[0]= jenv->GetIntField(range, globalSize_0_FieldID);
+               //fprintf(stderr, "native range globalSize_0 == %d\n", globalDims[0]);
+               if (dims >1){
+                  offsets[1]= 0;
+                  localDims[1]= jenv->GetIntField(range, localSize_1_FieldID);
+                  //fprintf(stderr, "native range localSize_1 == %d\n", localDims[1]);
+                  globalDims[1]= jenv->GetIntField(range, globalSize_1_FieldID);
+                  //fprintf(stderr, "native range globalSize_1 == %d\n", globalDims[1]);
+                  if (dims >2){
+                     offsets[2]= 0;
+                     localDims[2]= jenv->GetIntField(range, localSize_2_FieldID);
+                     //fprintf(stderr, "native range localSize_2 == %d\n", localDims[2]);
+                     globalDims[2]= jenv->GetIntField(range, globalSize_2_FieldID);
+                     //fprintf(stderr, "native range globalSize_2 == %d\n", globalDims[2]);
+                  }
+               }
 
+            }
+         }
+      ~Range(){
+         if (offsets!= NULL){
+            delete offsets;
+         }
+         if (globalDims!= NULL){
+            delete globalDims;
+         }
+         if (localDims!= NULL){
+            delete localDims;
+         }
+      }
+};
+jclass Range::rangeClazz = (jclass)0;
+jfieldID  Range::globalSize_0_FieldID=0;
+jfieldID  Range::globalSize_1_FieldID=0;
+jfieldID  Range::globalSize_2_FieldID=0;
+jfieldID  Range::localSize_0_FieldID=0;
+jfieldID  Range::localSize_1_FieldID=0;
+jfieldID  Range::localSize_2_FieldID=0;
+jfieldID  Range::dimsFieldID=0;
+jfieldID  Range::localIsDerivedFieldID=0; 
 
 class ProfileInfo{
    public:
@@ -243,13 +313,22 @@ class KernelArgRef{
 class JNIContext ; // forward reference
 
 class KernelArg{
+   private:
+      static jclass argClazz;
+      static jfieldID nameFieldID;
+      static jfieldID typeFieldID; 
+      static jfieldID isStaticFieldID; 
+      static jfieldID sizeInBytesFieldID;
+      static jfieldID numElementsFieldID; 
    public:
+      static jfieldID javaArrayFieldID; 
+      jobject argObj;
       char *name;        // used for debugging printfs
       jfieldID fieldID;  // The field that this arg represents in the kernel (java), used only for primitive updates
       jint type;         // a bit mask determining the type of this arg
       jboolean isStatic; // A flag indicating if the value is static
       jint sizeInBytes;  // bytes in the array or directBuf
-      jobject javaArg;   // global reference to the corresponding java KernelArg 
+      jobject javaArg;   // global reference to the corresponding java KernelArg object
       union{
          cl_char c;
          cl_double d;
@@ -259,6 +338,29 @@ class KernelArg{
          KernelArgRef ref;
       } value;
 
+      KernelArg(JNIEnv *jenv, jobject argObj):
+         argObj(argObj){
+            javaArg = jenv->NewGlobalRef(argObj);   // save a global ref to the java Arg Object
+            if (argClazz == 0){
+               jclass c = jenv->GetObjectClass(argObj); 
+               nameFieldID = jenv->GetFieldID(c, "name", "Ljava/lang/String;"); ASSERT_FIELD(name);
+               typeFieldID = jenv->GetFieldID(c, "type", "I"); ASSERT_FIELD(type);
+               isStaticFieldID = jenv->GetFieldID(c, "isStatic", "Z"); ASSERT_FIELD(isStatic);
+               javaArrayFieldID = jenv->GetFieldID(c, "javaArray", "Ljava/lang/Object;"); ASSERT_FIELD(javaArray);
+               sizeInBytesFieldID = jenv->GetFieldID(c, "sizeInBytes", "I"); ASSERT_FIELD(sizeInBytes);
+               numElementsFieldID = jenv->GetFieldID(c, "numElements", "I"); ASSERT_FIELD(numElements);
+            }
+            type = jenv->GetIntField(argObj, typeFieldID);
+            isStatic = jenv->GetBooleanField(argObj, isStaticFieldID);
+            jstring nameString  = (jstring)jenv->GetObjectField(argObj, nameFieldID);
+            const char *nameChars = jenv->GetStringUTFChars(nameString, NULL);
+            name=strdup(nameChars);
+            jenv->ReleaseStringUTFChars(nameString, nameChars);
+         }
+
+      ~KernelArg(){
+      }
+
       void unpinAbort(JNIEnv *jenv){
          jenv->ReleasePrimitiveArrayCritical((jarray)value.ref.javaArray, value.ref.addr,JNI_ABORT);
       }
@@ -342,9 +444,6 @@ class KernelArg{
       int isAparapiBufHasArray(){
          return (type&com_amd_aparapi_KernelRunner_ARG_APARAPI_BUF_HAS_ARRAY);
       }
-      int isAparapiBufIsDirect(){
-         return (type&com_amd_aparapi_KernelRunner_ARG_APARAPI_BUF_IS_DIRECT);
-      }
       int isBackedByArray(){
          return ( (isArray() && isGlobal()) || ((isGlobal() || isConstant()) && isAparapiBufHasArray()));
       }
@@ -354,8 +453,27 @@ class KernelArg{
       int mustWriteBuffer(){
          return ((isImplicit()&&isRead()&&!isConstant())||(isExplicit()&&isExplicitWrite()));
       }
-
+      void syncType(JNIEnv* jenv){
+         type = jenv->GetIntField(javaArg, typeFieldID);
+      }
+      void syncSizeInBytes(JNIEnv* jenv){
+         sizeInBytes = jenv->GetIntField(javaArg, sizeInBytesFieldID);
+      }
+      void syncJavaArrayLength(JNIEnv* jenv){
+         value.ref.javaArrayLength = jenv->GetIntField(javaArg, numElementsFieldID);
+      }
+      void clearExplicitBufferBit(JNIEnv* jenv){
+         type &= ~com_amd_aparapi_KernelRunner_ARG_EXPLICIT_WRITE;
+         jenv->SetIntField(javaArg, typeFieldID,type );
+      }
 };
+jclass KernelArg::argClazz=(jclass)0;
+jfieldID KernelArg::nameFieldID=0;
+jfieldID KernelArg::typeFieldID=0; 
+jfieldID KernelArg::isStaticFieldID=0; 
+jfieldID KernelArg::javaArrayFieldID=0; 
+jfieldID KernelArg::sizeInBytesFieldID=0;
+jfieldID KernelArg::numElementsFieldID=0; 
 
 class JNIContext{
    private: 
@@ -365,10 +483,7 @@ class JNIContext{
       cl_platform_id* platforms;
       cl_uint platformc;
    public:
-      JNIEnv *jenv;
       jobject kernelObject;
-      jint numProcessors;
-      jint maxJTPLocalSize;
       jclass kernelClass;
       cl_uint deviceIdc;
       cl_device_id* deviceIds;
@@ -391,6 +506,7 @@ class JNIContext{
       // these map to camelCase form of CL_DEVICE_XXX_XXX  For example CL_DEVICE_MAX_COMPUTE_UNITS == maxComputeUnits
       cl_uint maxComputeUnits;
       cl_uint maxWorkItemDimensions;
+      size_t *maxWorkItemSizes;
       size_t maxWorkGroupSize;
       cl_ulong globalMemSize;
       cl_ulong localMemSize;
@@ -399,13 +515,10 @@ class JNIContext{
          return((JNIContext*)jniContextHandle);
       }
 
-      JNIContext(JNIEnv *_jenv, jobject _kernelObject, jint _flags, jint _numProcessors, jint _maxJTPLocalSize): 
-         jenv(_jenv),
+      JNIContext(JNIEnv *jenv, jobject _kernelObject, jint _flags): 
          kernelObject(jenv->NewGlobalRef(_kernelObject)),
          kernelClass((jclass)jenv->NewGlobalRef(jenv->GetObjectClass(_kernelObject))), 
          flags(_flags),
-         numProcessors(_numProcessors),
-         maxJTPLocalSize(_maxJTPLocalSize),
          platform(NULL),
          profileBaseTime(0),
          deviceType(((flags&com_amd_aparapi_KernelRunner_JNI_FLAG_USE_GPU)==com_amd_aparapi_KernelRunner_JNI_FLAG_USE_GPU)?CL_DEVICE_TYPE_GPU:CL_DEVICE_TYPE_CPU),
@@ -438,1179 +551,1209 @@ class JNIContext{
                      // platformVersionName = "OpenCL 1.1 AMD-APP-SDK-v2.5 (684.213)"|"OpenCL 1.1 CUDA 4.0.1"
 #ifndef __APPLE__
                      // Here we check if the platformVersionName starts with "OpenCL 1.1" (10 chars!) 
-                     if (!strncmp(platformVersionName, "OpenCL 1.1", 10)) {
+                     if (!strncmp(platformVersionName, "OpenCL 1.1", 10)) { //}
 #else 
-                        // Here we check if the platformVersionName starts with "OpenCL 1.1" or "OpenCL 1.0" (10 chars!) 
-                        if (!strncmp(platformVersionName, "OpenCL 1.1", 10) || !strncmp(platformVersionName, "OpenCL 1.0", 10)) {
+                     // Here we check if the platformVersionName starts with "OpenCL 1.1" or "OpenCL 1.0" (10 chars!) 
+                     if (!strncmp(platformVersionName, "OpenCL 1.1", 10) || !strncmp(platformVersionName, "OpenCL 1.0", 10)) { // }
 #endif
-                           // Get the # of devices
-                           status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &deviceIdc);
-                           // now check if this platform supports the requested device type (GPU or CPU)
-                           if (status == CL_SUCCESS && deviceIdc >0 ){
-                              if (deviceIdc >1){
-                                 if (isVerbose()){
-                                    fprintf(stderr, "Warning attempt to use %d devices\n", deviceIdc);
-                                 }
-                                 deviceIdc = 1; // Hack to work around issue #18 (multiple device error)
-                                 if (isVerbose()){
-                                    fprintf(stderr, "Locking deviceIdc to %d to work around issue #18\n", deviceIdc);
-                                 }
-                              }
-                              platform = platforms[i];
-                              if (isVerbose()){
-                                 fprintf(stderr, "platform %s supports requested device type\n", platformVendorName);
-                              }
-
-                              deviceIds = new cl_device_id[deviceIdc];
-                              status = clGetDeviceIDs(platform, deviceType, deviceIdc, deviceIds, NULL);
-                              if (status == CL_SUCCESS){
-                                 ASSERT_CL_NO_RETURN("clGetDeviceIDs()"); 
-
-                                 GET_DEV_INFO(deviceIds[0], CL_DEVICE_MAX_COMPUTE_UNITS, maxComputeUnits, "%d");
-                                 GET_DEV_INFO(deviceIds[0], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, maxWorkItemDimensions, "%d");
-                                 GET_DEV_INFO(deviceIds[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, maxWorkGroupSize, "%d");
-                                 GET_DEV_INFO(deviceIds[0], CL_DEVICE_GLOBAL_MEM_SIZE, globalMemSize, "%d");
-                                 GET_DEV_INFO(deviceIds[0], CL_DEVICE_LOCAL_MEM_SIZE, localMemSize, "%d");
-                                 if (isVerbose()){
-
-                                    fprintf(stderr, "device[%p]: Type: ", deviceIds[0]);
-                                    if (deviceType & CL_DEVICE_TYPE_DEFAULT) {
-                                       //  deviceType &= ~CL_DEVICE_TYPE_DEFAULT;
-                                       fprintf(stderr, "Default ");
-                                    }else if (deviceType & CL_DEVICE_TYPE_CPU) {
-                                       // deviceType &= ~CL_DEVICE_TYPE_CPU;
-                                       fprintf(stderr, "CPU ");
-                                    }else if (deviceType & CL_DEVICE_TYPE_GPU) {
-                                       // deviceType &= ~CL_DEVICE_TYPE_GPU;
-                                       fprintf(stderr, "GPU ");
-                                    }else if (deviceType & CL_DEVICE_TYPE_ACCELERATOR) {
-                                       // deviceType &= ~CL_DEVICE_TYPE_ACCELERATOR;
-                                       fprintf(stderr, "Accelerator ");
-                                    }else{
-                                       fprintf(stderr, "Unknown (0x%llx) ", deviceType);
-                                    }
-                                    fprintf(stderr, "\n");
-                                 }
-                                 cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };
-                                 cl_context_properties* cprops = (NULL == platform) ? NULL : cps;
-                                 context = clCreateContextFromType( cprops, deviceType, NULL, NULL, &status);
-                                 ASSERT_CL_NO_RETURN("clCreateContextFromType()");
-                                 if (status == CL_SUCCESS){
-
-                                    valid = JNI_TRUE;
-                                 }
-                              }
-                           }else{
-                              if (isVerbose()){
-                                 fprintf(stderr, "platform %s does not support requested device type skipping!\n", platformVendorName);
-                              }
+                     // Get the # of devices
+                     status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &deviceIdc);
+                     // now check if this platform supports the requested device type (GPU or CPU)
+                     if (status == CL_SUCCESS && deviceIdc >0 ){
+                        if (deviceIdc >1){
+                           if (isVerbose()){
+                              fprintf(stderr, "Warning attempt to use %d devices\n", deviceIdc);
                            }
-
-                        }else{
+                           deviceIdc = 1; // Hack to work around issue #18 (multiple device error)
                            if (isVerbose()){
-#ifndef __APPLE__
-                              fprintf(stderr, "platform %s version %s is not OpenCL 1.1 skipping!\n", platformVendorName, platformVersionName);
-#else
-                              fprintf(stderr, "platform %s version %s is neither OpenCL 1.1 or OpenCL 1.0 skipping!\n", platformVendorName, platformVersionName);
-#endif
-
+                              fprintf(stderr, "Locking deviceIdc to %d to work around issue #18\n", deviceIdc);
                            }
                         }
-                     }
+                        platform = platforms[i];
+                        if (isVerbose()){
+                           fprintf(stderr, "platform %s supports requested device type\n", platformVendorName);
+                        }
 
-                  } 
-               }else{
-                  if (isVerbose()){
-                     fprintf(stderr, "no opencl platforms available!\n");
-                  }
-               }
+                        deviceIds = new cl_device_id[deviceIdc];
+                        status = clGetDeviceIDs(platform, deviceType, deviceIdc, deviceIds, NULL);
+                        ASSERT_CL_NO_RETURN("clGetDeviceIDs()"); 
+                        if (status == CL_SUCCESS){
 
-            }
+                           status = clGetDeviceInfo(deviceIds[0], CL_DEVICE_MAX_COMPUTE_UNITS,  sizeof(maxComputeUnits), &maxComputeUnits, NULL);
+                           ASSERT_CL_NO_RETURN( "clGetDeviceInfo(CL_DEVICE_MAX_COMPUTE_UNITS).");
 
-            jboolean isValid(){
-               return(valid);
-            }
-            jboolean isProfilingEnabled(){
-               return((flags&com_amd_aparapi_KernelRunner_JNI_FLAG_ENABLE_PROFILING)==com_amd_aparapi_KernelRunner_JNI_FLAG_ENABLE_PROFILING?JNI_TRUE:JNI_FALSE);
-            }
-            jboolean isUsingGPU(){
-               return((flags&com_amd_aparapi_KernelRunner_JNI_FLAG_USE_GPU)==com_amd_aparapi_KernelRunner_JNI_FLAG_USE_GPU?JNI_TRUE:JNI_FALSE);
-            }
-            jboolean isVerbose(){
-               return((flags&com_amd_aparapi_KernelRunner_JNI_FLAG_ENABLE_VERBOSE_JNI)==com_amd_aparapi_KernelRunner_JNI_FLAG_ENABLE_VERBOSE_JNI?JNI_TRUE:JNI_FALSE);
-            }
+                           status = clGetDeviceInfo(deviceIds[0], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,  sizeof(maxWorkItemDimensions), &maxWorkItemDimensions, NULL);
+                           ASSERT_CL_NO_RETURN( "clGetDeviceInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS).");
 
-            ~JNIContext(){
-               cl_int status = CL_SUCCESS;
-               jenv->DeleteGlobalRef(kernelObject);
-               jenv->DeleteGlobalRef(kernelClass);
-               if (context != 0){
-                  status = clReleaseContext(context);
-                  ASSERT_CL_NO_RETURN("clReleaseContext()");
-                  context = (cl_context)0;
-               }
-               if (commandQueues){
-                  for (int dev=0; dev<deviceIdc; dev++){
-                     status = clReleaseCommandQueue((cl_command_queue)commandQueues[dev]);
-                     ASSERT_CL_NO_RETURN("clReleaseCommandQueue()");
-                     commandQueues[dev] = (cl_command_queue)0;
-                  }
-                  delete[] commandQueues; commandQueues = NULL;
-               }
-               if (program != 0){
-                  status = clReleaseProgram((cl_program)program);
-                  ASSERT_CL_NO_RETURN("clReleaseProgram()");
-                  program = (cl_program)0;
-               }
-               if (kernel != 0){
-                  status = clReleaseKernel((cl_kernel)kernel);
-                  ASSERT_CL_NO_RETURN("clReleaseKernel()");
-                  kernel = (cl_kernel)0;
-               }
-               if (platforms){
-                  delete []platforms; platforms=NULL;
-               }
-               if (deviceIds){
-                  delete [] deviceIds; deviceIds=NULL;
-               }
-               if (argc> 0){
-                  for (int i=0; i< argc; i++){
-                     KernelArg *arg = args[i];
-                     if (!arg->isPrimitive()){
-                        if (arg->value.ref.mem != 0){
-                           status = clReleaseMemObject((cl_mem)arg->value.ref.mem);
-                           ASSERT_CL_NO_RETURN("clReleaseMemObject()");
-                           arg->value.ref.mem = (cl_mem)0;
+                           maxWorkItemSizes = (size_t *)malloc(sizeof(size_t)*maxWorkItemDimensions);
+                           status = clGetDeviceInfo(deviceIds[0], CL_DEVICE_MAX_WORK_ITEM_SIZES,  sizeof(size_t)*maxWorkItemDimensions, maxWorkItemSizes, NULL);
+
+                           ASSERT_CL_NO_RETURN( "clGetDeviceInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES).");
+
+                           status = clGetDeviceInfo(deviceIds[0], CL_DEVICE_MAX_WORK_GROUP_SIZE,  sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL);
+                           ASSERT_CL_NO_RETURN( "clGetDeviceInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE).");
+
+                           status = clGetDeviceInfo(deviceIds[0], CL_DEVICE_GLOBAL_MEM_SIZE,  sizeof(globalMemSize), &globalMemSize, NULL);
+                           ASSERT_CL_NO_RETURN( "clGetDeviceInfo(CL_DEVICE_GLOBAL_MEM_SIZE).");
+
+                           status = clGetDeviceInfo(deviceIds[0], CL_DEVICE_LOCAL_MEM_SIZE,  sizeof(localMemSize), &localMemSize, NULL);
+
+                           ASSERT_CL_NO_RETURN( "clGetDeviceInfo(CL_DEVICE_LOCAL_MEM_SIZE).");
+
+
+                           if (isVerbose()){
+                              fprintf(stderr, "device[%p]: Type: ", deviceIds[0]);
+                              if (deviceType & CL_DEVICE_TYPE_DEFAULT) {
+                                 //  deviceType &= ~CL_DEVICE_TYPE_DEFAULT;
+                                 fprintf(stderr, "Default ");
+                              }else if (deviceType & CL_DEVICE_TYPE_CPU) {
+                                 // deviceType &= ~CL_DEVICE_TYPE_CPU;
+                                 fprintf(stderr, "CPU ");
+                              }else if (deviceType & CL_DEVICE_TYPE_GPU) {
+                                 // deviceType &= ~CL_DEVICE_TYPE_GPU;
+                                 fprintf(stderr, "GPU ");
+                              }else if (deviceType & CL_DEVICE_TYPE_ACCELERATOR) {
+                                 // deviceType &= ~CL_DEVICE_TYPE_ACCELERATOR;
+                                 fprintf(stderr, "Accelerator ");
+                              }else{
+                                 fprintf(stderr, "Unknown (0x%llx) ", deviceType);
+                              }
+                              fprintf(stderr, "\n");
+                           }
+                           cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };
+                           cl_context_properties* cprops = (NULL == platform) ? NULL : cps;
+                           context = clCreateContextFromType( cprops, deviceType, NULL, NULL, &status);
+                           ASSERT_CL_NO_RETURN("clCreateContextFromType()");
+                           if (status == CL_SUCCESS){
+
+                              valid = JNI_TRUE;
+                           }
                         }
-                        if (arg->value.ref.javaArray != NULL)  {
-                           jenv->DeleteWeakGlobalRef((jweak) arg->value.ref.javaArray);
+                     }else{
+                        if (isVerbose()){
+                           fprintf(stderr, "platform %s does not support requested device type skipping!\n", platformVendorName);
                         }
                      }
-                     if (arg->name != NULL){
-                        free(arg->name); arg->name = NULL;
-                     }
-                     if (arg->javaArg != NULL ) {
-                        jenv->DeleteGlobalRef((jobject) arg->javaArg);
-                     }
-                     delete arg; arg=args[i]=NULL;
-                  }
-                  delete[] args; args=NULL;
 
-                  delete []readEvents; readEvents =NULL;
-                  delete []writeEvents; writeEvents = NULL;
-                  delete []executeEvents; executeEvents = NULL;
+                  }else{
+                     if (isVerbose()){
+#ifndef __APPLE__
+                        fprintf(stderr, "platform %s version %s is not OpenCL 1.1 skipping!\n", platformVendorName, platformVersionName);
+#else
+                        fprintf(stderr, "platform %s version %s is neither OpenCL 1.1 or OpenCL 1.0 skipping!\n", platformVendorName, platformVersionName);
+#endif
 
-                  if (isProfilingEnabled()) {
-                     if (profileFile != NULL && profileFile != stderr) {
-                        fclose(profileFile);
                      }
-                     delete[] readEventArgs; readEventArgs=0;
-                     delete[] writeEventArgs; writeEventArgs=0;
-                  } 
-               }
-            }
-
-            /*
-               Release JNI critical pinned arrays before returning to java code
-               */
-            void unpinAll() {
-               for (int i=0; i< argc; i++){
-                  KernelArg *arg = args[i];
-                  if (arg->isBackedByArray()) {
-                     arg->unpin(jenv);
                   }
                }
+
+            } 
+         }else{
+            if (isVerbose()){
+               fprintf(stderr, "no opencl platforms available!\n");
             }
+         }
 
+}
 
-         };
+jboolean isValid(){
+   return(valid);
+}
+jboolean isProfilingEnabled(){
+   return((flags&com_amd_aparapi_KernelRunner_JNI_FLAG_ENABLE_PROFILING)==com_amd_aparapi_KernelRunner_JNI_FLAG_ENABLE_PROFILING?JNI_TRUE:JNI_FALSE);
+}
+jboolean isUsingGPU(){
+   return((flags&com_amd_aparapi_KernelRunner_JNI_FLAG_USE_GPU)==com_amd_aparapi_KernelRunner_JNI_FLAG_USE_GPU?JNI_TRUE:JNI_FALSE);
+}
+jboolean isVerbose(){
+   return((flags&com_amd_aparapi_KernelRunner_JNI_FLAG_ENABLE_VERBOSE_JNI)==com_amd_aparapi_KernelRunner_JNI_FLAG_ENABLE_VERBOSE_JNI?JNI_TRUE:JNI_FALSE);
+}
 
-      jclass cacheKernelArgFields(JNIEnv *jenv, jobject jobj){
-         jclass c = jenv->GetObjectClass(jobj); 
-         nameFieldID = jenv->GetFieldID(c, "name", "Ljava/lang/String;"); ASSERT_FIELD(name);
-         typeFieldID = jenv->GetFieldID(c, "type", "I"); ASSERT_FIELD(type);
-         isStaticFieldID = jenv->GetFieldID(c, "isStatic", "Z"); ASSERT_FIELD(isStatic);
-         javaArrayFieldID = jenv->GetFieldID(c, "javaArray", "Ljava/lang/Object;"); ASSERT_FIELD(javaArray);
-         bytesPerLocalSizeFieldID = jenv->GetFieldID(c, "bytesPerLocalSize", "I"); ASSERT_FIELD(bytesPerLocalSize);
-         sizeInBytesFieldID = jenv->GetFieldID(c, "sizeInBytes", "I"); ASSERT_FIELD(sizeInBytes);
-         numElementsFieldID = jenv->GetFieldID(c, "numElements", "I"); ASSERT_FIELD(numElements);
-         return(c);
-      }
+~JNIContext(){
+}
 
-      JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_disposeJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle) {
-         cl_int status = CL_SUCCESS;
-         JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle);
-         if (jniContext != NULL){
-            delete jniContext;//free(jniContext);
-            jniContext = NULL;
-         }
-         return(status);
+void dispose(JNIEnv *jenv){
+   cl_int status = CL_SUCCESS;
+   jenv->DeleteGlobalRef(kernelObject);
+   jenv->DeleteGlobalRef(kernelClass);
+   if (context != 0){
+      status = clReleaseContext(context);
+      ASSERT_CL_NO_RETURN("clReleaseContext()");
+      context = (cl_context)0;
+   }
+   if (commandQueues){
+      for (int dev=0; dev<deviceIdc; dev++){
+         status = clReleaseCommandQueue((cl_command_queue)commandQueues[dev]);
+         ASSERT_CL_NO_RETURN("clReleaseCommandQueue()");
+         commandQueues[dev] = (cl_command_queue)0;
       }
-
-      void idump(char *str, void *ptr, int size){
-         int * iptr = (int *)ptr;
-         for (int i=0; i<size/sizeof(int); i++){
-            fprintf(stderr, "%s%4d %d\n", str, i, iptr[i]);
+      delete[] commandQueues; commandQueues = NULL;
+   }
+   if (program != 0){
+      status = clReleaseProgram((cl_program)program);
+      ASSERT_CL_NO_RETURN("clReleaseProgram()");
+      program = (cl_program)0;
+   }
+   if (kernel != 0){
+      status = clReleaseKernel((cl_kernel)kernel);
+      ASSERT_CL_NO_RETURN("clReleaseKernel()");
+      kernel = (cl_kernel)0;
+   }
+   if (platforms){
+      delete []platforms; platforms=NULL;
+   }
+   if (deviceIds){
+      delete [] deviceIds; deviceIds=NULL;
+   }
+   if (argc> 0){
+      for (int i=0; i< argc; i++){
+         KernelArg *arg = args[i];
+         if (!arg->isPrimitive()){
+            if (arg->value.ref.mem != 0){
+               status = clReleaseMemObject((cl_mem)arg->value.ref.mem);
+               ASSERT_CL_NO_RETURN("clReleaseMemObject()");
+               arg->value.ref.mem = (cl_mem)0;
+            }
+            if (arg->value.ref.javaArray != NULL)  {
+               jenv->DeleteWeakGlobalRef((jweak) arg->value.ref.javaArray);
+            }
+         }
+         if (arg->name != NULL){
+            free(arg->name); arg->name = NULL;
          }
+         if (arg->javaArg != NULL ) {
+            jenv->DeleteGlobalRef((jobject) arg->javaArg);
+         }
+         delete arg; arg=args[i]=NULL;
       }
+      delete[] args; args=NULL;
+
+      delete []readEvents; readEvents =NULL;
+      delete []writeEvents; writeEvents = NULL;
+      delete []executeEvents; executeEvents = NULL;
 
-      void fdump(char *str, void *ptr, int size){
-         float * fptr = (float *)ptr;
-         for (int i=0; i<size/sizeof(float); i++){
-            fprintf(stderr, "%s%4d %6.2f\n", str, i, fptr[i]);
+      if (isProfilingEnabled()) {
+         if (profileFile != NULL && profileFile != stderr) {
+            fclose(profileFile);
          }
+         delete[] readEventArgs; readEventArgs=0;
+         delete[] writeEventArgs; writeEventArgs=0;
+      } 
+   }
+}
+
+/*
+   Release JNI critical pinned arrays before returning to java code
+   */
+void unpinAll(JNIEnv* jenv) {
+   for (int i=0; i< argc; i++){
+      KernelArg *arg = args[i];
+      if (arg->isBackedByArray()) {
+         arg->unpin(jenv);
       }
+   }
+}
 
 
-      jint writeProfileInfo(JNIContext* jniContext){
-         cl_ulong currSampleBaseTime = -1;
-         int pos = 1;
+};
 
-         if (jniContext->firstRun) {
-            fprintf(jniContext->profileFile, "# PROFILE Name, queued, submit, start, end (microseconds)\n");
-         }       
 
-         // A read by a user kernel means the OpenCL layer wrote to the kernel and vice versa
-         for (int i=0; i< jniContext->argc; i++){
-            KernelArg *arg=jniContext->args[i];
-            if (arg->isBackedByArray() && arg->isRead()){
 
-               // Initialize the base time for this sample
-               if (currSampleBaseTime == -1) {
-                  currSampleBaseTime = arg->value.ref.write.queued;
-               } 
 
-               if (jniContext->profileBaseTime == 0){
-                  jniContext->profileBaseTime = arg->value.ref.write.queued;
+JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_disposeJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle) {
+   cl_int status = CL_SUCCESS;
+   JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle);
+   if (jniContext != NULL){
+      jniContext->dispose(jenv);
+      delete jniContext;
+      jniContext = NULL;
+   }
+   return(status);
+}
 
-                  // Write the base time as the first item in the csv
-                  //fprintf(jniContext->profileFile, "%llu,", jniContext->profileBaseTime);
-               }
+void idump(char *str, void *ptr, int size){
+   int * iptr = (int *)ptr;
+   for (int i=0; i<size/sizeof(int); i++){
+      fprintf(stderr, "%s%4d %d\n", str, i, iptr[i]);
+   }
+}
 
-               fprintf(jniContext->profileFile, "%d write %s,", pos++, arg->name);
+void fdump(char *str, void *ptr, int size){
+   float * fptr = (float *)ptr;
+   for (int i=0; i<size/sizeof(float); i++){
+      fprintf(stderr, "%s%4d %6.2f\n", str, i, fptr[i]);
+   }
+}
 
-               fprintf(jniContext->profileFile, "%lu,%lu,%lu,%lu,",  
-                     (arg->value.ref.write.queued - currSampleBaseTime)/1000, 
-                     (arg->value.ref.write.submit - currSampleBaseTime)/1000, 
-                     (arg->value.ref.write.start - currSampleBaseTime)/1000, 
-                     (arg->value.ref.write.end - currSampleBaseTime)/1000);
-            }
-         }
 
-         if (jniContext->profileBaseTime == 0){
-            jniContext->profileBaseTime = jniContext->exec.queued;
+jint writeProfileInfo(JNIContext* jniContext){
+   cl_ulong currSampleBaseTime = -1;
+   int pos = 1;
 
-            // Write the base time as the first item in the csv
-            //fprintf(jniContext->profileFile, "%llu,", jniContext->profileBaseTime);
-         }
+   if (jniContext->firstRun) {
+      fprintf(jniContext->profileFile, "# PROFILE Name, queued, submit, start, end (microseconds)\n");
+   }       
 
-         // Initialize the base time for this sample if necessary
+   // A read by a user kernel means the OpenCL layer wrote to the kernel and vice versa
+   for (int i=0; i< jniContext->argc; i++){
+      KernelArg *arg=jniContext->args[i];
+      if (arg->isBackedByArray() && arg->isRead()){
+
+         // Initialize the base time for this sample
          if (currSampleBaseTime == -1) {
-            currSampleBaseTime = jniContext->exec.queued;
+            currSampleBaseTime = arg->value.ref.write.queued;
          } 
 
-         // exec 
-         fprintf(jniContext->profileFile, "%d exec,", pos++);
-
-         fprintf(jniContext->profileFile, "%lu,%lu,%lu,%lu,",  
-               (jniContext->exec.queued - currSampleBaseTime)/1000, 
-               (jniContext->exec.submit - currSampleBaseTime)/1000, 
-               (jniContext->exec.start - currSampleBaseTime)/1000, 
-               (jniContext->exec.end - currSampleBaseTime)/1000);
-
-         // 
-         if ( jniContext->argc == 0 ) {
-            fprintf(jniContext->profileFile, "\n");
-         } else { 
-            for (int i=0; i< jniContext->argc; i++){
-               KernelArg *arg=jniContext->args[i];
-               if (arg->isBackedByArray() && arg->isWrite()){
-                  if (jniContext->profileBaseTime == 0){
-                     jniContext->profileBaseTime = arg->value.ref.read.queued;
-
-                     // Write the base time as the first item in the csv
-                     //fprintf(jniContext->profileFile, "%llu,", jniContext->profileBaseTime);               
-                  }
-
-                  // Initialize the base time for this sample
-                  if (currSampleBaseTime == -1) {
-                     currSampleBaseTime = arg->value.ref.read.queued;
-                  }
-
-                  fprintf(jniContext->profileFile, "%d read %s,", pos++, arg->name);
+         if (jniContext->profileBaseTime == 0){
+            jniContext->profileBaseTime = arg->value.ref.write.queued;
 
-                  fprintf(jniContext->profileFile, "%lu,%lu,%lu,%lu,",  
-                        (arg->value.ref.read.queued - currSampleBaseTime)/1000, 
-                        (arg->value.ref.read.submit - currSampleBaseTime)/1000, 
-                        (arg->value.ref.read.start - currSampleBaseTime)/1000, 
-                        (arg->value.ref.read.end - currSampleBaseTime)/1000);
-               }
-            }
+            // Write the base time as the first item in the csv
+            //fprintf(jniContext->profileFile, "%llu,", jniContext->profileBaseTime);
          }
-         fprintf(jniContext->profileFile, "\n");
-         return(0);
-      }
-
-      // Should failed profiling abort the run and return early?
-      cl_int profile(ProfileInfo *profileInfo, cl_event *event){
-         cl_int status = CL_SUCCESS;
-         status = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_QUEUED, sizeof(profileInfo->queued), &(profileInfo->queued), NULL);
-         ASSERT_CL( "clGetEventProfiliningInfo() QUEUED");
-         status = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_SUBMIT, sizeof(profileInfo->submit), &(profileInfo->submit), NULL);
-         ASSERT_CL( "clGetEventProfiliningInfo() SUBMIT");
-         status = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_START, sizeof(profileInfo->start), &(profileInfo->start), NULL);
-         ASSERT_CL( "clGetEventProfiliningInfo() START");
-         status = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_END, sizeof(profileInfo->end), &(profileInfo->end), NULL);
-         ASSERT_CL( "clGetEventProfiliningInfo() END");
-         return status;
-      }
-
 
+         fprintf(jniContext->profileFile, "%d write %s,", pos++, arg->name);
 
+         fprintf(jniContext->profileFile, "%lu,%lu,%lu,%lu,",  
+               (arg->value.ref.write.queued - currSampleBaseTime)/1000, 
+               (arg->value.ref.write.submit - currSampleBaseTime)/1000, 
+               (arg->value.ref.write.start - currSampleBaseTime)/1000, 
+               (arg->value.ref.write.end - currSampleBaseTime)/1000);
+      }
+   }
 
-      jint updateKernel(JNIEnv *jenv, jobject jobj, JNIContext* jniContext) {
-         cl_int status = CL_SUCCESS;
-         if (jniContext != NULL){
-            // we need to step through the array of KernelArg's to create the info required to create the cl_mem buffers.
-            for (jint i=0; i<jniContext->argc; i++){ 
-               KernelArg *arg=jniContext->args[i];
-
-               arg->type = jenv->GetIntField(arg->javaArg, typeFieldID);
-               if (jniContext->isVerbose()){
-                  fprintf(stderr, "got type for %s: %08x\n", arg->name, arg->type);
-               }
-               if (!arg->isPrimitive()) {
-                  // Following used for all primitive arrays, object arrays  and nio Buffers
-                  jarray newRef = (jarray)jenv->GetObjectField(arg->javaArg, javaArrayFieldID);
-                  if (jniContext->isVerbose()){
-
-                     fprintf(stderr, "testing for Resync javaArray %s: old=%p, new=%p\n", arg->name, arg->value.ref.javaArray, newRef);         
-                  }
-
-                  jboolean isSame = jenv->IsSameObject( newRef, arg->value.ref.javaArray);
-                  if (isSame == JNI_FALSE) {
-                     if (jniContext->isVerbose()){
-                        fprintf(stderr, "Resync javaArray for %s: %p  %p\n", arg->name, newRef, arg->value.ref.javaArray);         
-                     }
-                     // Free previous ref if any
-                     if (arg->value.ref.javaArray != NULL) {
-                        jenv->DeleteWeakGlobalRef((jweak) arg->value.ref.javaArray);
-                        if (jniContext->isVerbose()){
-                           fprintf(stderr, "DeleteWeakGlobalRef for %s: %p\n", arg->name, arg->value.ref.javaArray);         
-                        }
-                     }
-
-                     // need to free opencl buffers, run will reallocate later
-                     if (arg->value.ref.mem != 0) {
-                        //fprintf(stderr, "-->releaseMemObject[%d]\n", i);
-                        status = clReleaseMemObject((cl_mem)arg->value.ref.mem);
-                        //fprintf(stderr, "<--releaseMemObject[%d]\n", i);
-                        ASSERT_CL("clReleaseMemObject()");
-                        arg->value.ref.mem = (cl_mem)0;
-                     }
-
-                     arg->value.ref.mem = (cl_mem) 0;
-                     arg->value.ref.addr = NULL;
+   if (jniContext->profileBaseTime == 0){
+      jniContext->profileBaseTime = jniContext->exec.queued;
 
-                     // Capture new array ref from the kernel arg object
+      // Write the base time as the first item in the csv
+      //fprintf(jniContext->profileFile, "%llu,", jniContext->profileBaseTime);
+   }
 
-                     if (newRef != NULL) {
-                        arg->value.ref.javaArray = (jarray)jenv->NewWeakGlobalRef((jarray)newRef);
-                        if (jniContext->isVerbose()){
-                           fprintf(stderr, "NewWeakGlobalRef for %s, set to %p\n", arg->name,
-                                 arg->value.ref.javaArray);         
-                        }
-                     } else {
-                        arg->value.ref.javaArray = NULL;
-                     }
-                     arg->value.ref.isArray = !arg->isAparapiBufIsDirect();
+   // Initialize the base time for this sample if necessary
+   if (currSampleBaseTime == -1) {
+      currSampleBaseTime = jniContext->exec.queued;
+   } 
+
+   // exec 
+   fprintf(jniContext->profileFile, "%d exec,", pos++);
+
+   fprintf(jniContext->profileFile, "%lu,%lu,%lu,%lu,",  
+         (jniContext->exec.queued - currSampleBaseTime)/1000, 
+         (jniContext->exec.submit - currSampleBaseTime)/1000, 
+         (jniContext->exec.start - currSampleBaseTime)/1000, 
+         (jniContext->exec.end - currSampleBaseTime)/1000);
+
+   // 
+   if ( jniContext->argc == 0 ) {
+      fprintf(jniContext->profileFile, "\n");
+   } else { 
+      for (int i=0; i< jniContext->argc; i++){
+         KernelArg *arg=jniContext->args[i];
+         if (arg->isBackedByArray() && arg->isWrite()){
+            if (jniContext->profileBaseTime == 0){
+               jniContext->profileBaseTime = arg->value.ref.read.queued;
+
+               // Write the base time as the first item in the csv
+               //fprintf(jniContext->profileFile, "%llu,", jniContext->profileBaseTime);               
+            }
 
-                     // Save the sizeInBytes which was set on the java side
-                     arg->sizeInBytes = jenv->GetIntField(arg->javaArg, sizeInBytesFieldID);
+            // Initialize the base time for this sample
+            if (currSampleBaseTime == -1) {
+               currSampleBaseTime = arg->value.ref.read.queued;
+            }
 
-                     if (jniContext->isVerbose()){
-                        fprintf(stderr, "updateKernel, args[%d].sizeInBytes=%d\n", i, arg->sizeInBytes);
-                     }
-                  } // !is_same
-               }
-            } // for each arg
-         } // if jniContext != NULL
+            fprintf(jniContext->profileFile, "%d read %s,", pos++, arg->name);
 
-         return(status);
+            fprintf(jniContext->profileFile, "%lu,%lu,%lu,%lu,",  
+                  (arg->value.ref.read.queued - currSampleBaseTime)/1000, 
+                  (arg->value.ref.read.submit - currSampleBaseTime)/1000, 
+                  (arg->value.ref.read.start - currSampleBaseTime)/1000, 
+                  (arg->value.ref.read.end - currSampleBaseTime)/1000);
+         }
       }
+   }
+   fprintf(jniContext->profileFile, "\n");
+   return(0);
+}
 
+// Should failed profiling abort the run and return early?
+cl_int profile(ProfileInfo *profileInfo, cl_event *event){
+   cl_int status = CL_SUCCESS;
+   status = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_QUEUED, sizeof(profileInfo->queued), &(profileInfo->queued), NULL);
+   ASSERT_CL( "clGetEventProfiliningInfo() QUEUED");
+   status = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_SUBMIT, sizeof(profileInfo->submit), &(profileInfo->submit), NULL);
+   ASSERT_CL( "clGetEventProfiliningInfo() SUBMIT");
+   status = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_START, sizeof(profileInfo->start), &(profileInfo->start), NULL);
+   ASSERT_CL( "clGetEventProfiliningInfo() START");
+   status = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_END, sizeof(profileInfo->end), &(profileInfo->end), NULL);
+   ASSERT_CL( "clGetEventProfiliningInfo() END");
+   return status;
+}
 
 
-      JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_runKernelJNI(JNIEnv *jenv,
-            jobject jobj, jlong jniContextHandle, jint globalSize, jint localSize, jboolean needSync,
-            jboolean useNullForLocalSize, jint passes) {
+//Step through all non-primitive (array of primitive or array object references) and determine if the field has changed
+//The field may have been re-assigned by the Java code to NULL or another instance. 
+//If we detect a change then we discard the previous cl_mem buffer, the caller will detect that the buffers are null and will create new cl_mem buffers. 
+jint updateNonPrimitiveReferences(JNIEnv *jenv, jobject jobj, JNIContext* jniContext) {
+   cl_int status = CL_SUCCESS;
+   if (jniContext != NULL){
+      for (jint i=0; i<jniContext->argc; i++){ 
+         KernelArg *arg=jniContext->args[i];
+         arg->syncType(jenv); // make sure that the JNI arg reflects the latest type info from the instance 
 
-         cl_int status = CL_SUCCESS;
-         JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle);
          if (jniContext->isVerbose()){
-            timer.start();
+            fprintf(stderr, "got type for %s: %08x\n", arg->name, arg->type);
          }
-
-         // Need to capture array refs
-         if (jniContext->firstRun || needSync) {
-            updateKernel(jenv, jobj, jniContext );
+         if (!arg->isPrimitive()) {
+            // Following used for all primitive arrays, object arrays and nio Buffers
+            jarray newRef = (jarray)jenv->GetObjectField(arg->javaArg, KernelArg::javaArrayFieldID);
             if (jniContext->isVerbose()){
-               fprintf(stderr, "back from updateKernel\n");
+               fprintf(stderr, "testing for Resync javaArray %s: old=%p, new=%p\n", arg->name, arg->value.ref.javaArray, newRef);         
             }
-         }
-
-         int writeEventCount = 0;
-
-         // kernelArgPos is used to keep track of the kernel arg position, it can 
-         // differ from "i" due to insertion of javaArrayLength args which are not
-         // fields read from the kernel object.
-         int kernelArgPos = 0;
-
-         for (int i=0; i< jniContext->argc; i++){
-            KernelArg *arg = jniContext->args[i];
-            // TODO: see if we can get rid of this read
-            arg->type = jenv->GetIntField(arg->javaArg, typeFieldID);
-            if (jniContext->isVerbose()){
-               fprintf(stderr, "got type for arg %d, %s, type=%08x\n", i, arg->name, arg->type);
-            }
-            if (!arg->isPrimitive() && !arg->isLocal()) {
-               // pin the arrays so that GC does not move them during the call
-
-               // get the C memory address for the region being transferred
-               // this uses different JNI calls for arrays vs. directBufs
-               void * prevAddr =  arg->value.ref.addr;
-               if (arg->value.ref.isArray) {
-                  arg->pin(jenv);
-               } else if (arg->isAparapiBufIsDirect()) {
-                  // different call used for directbuffers
-                  arg->value.ref.addr = jenv->GetDirectBufferAddress(arg->value.ref.javaArray);
-               }
 
+            if (!jenv->IsSameObject(newRef, arg->value.ref.javaArray)) {
                if (jniContext->isVerbose()){
-                  fprintf(stderr, "runKernel: arrayOrBuf ref %p, oldAddr=%p, newAddr=%p, ref.mem=%p, isArray=%d\n",
-                        arg->value.ref.javaArray, 
-                        prevAddr,
-                        arg->value.ref.addr,
-                        arg->value.ref.mem,
-                        arg->value.ref.isArray );
-                  fprintf(stderr, "at memory addr %p, contents: ", arg->value.ref.addr);
-                  unsigned char *pb = (unsigned char *) arg->value.ref.addr;
-                  for (int k=0; k<8; k++) {
-                     fprintf(stderr, "%02x ", pb[k]);
-                  }
-                  fprintf(stderr, "\n" );
+                  fprintf(stderr, "Resync javaArray for %s: %p  %p\n", arg->name, newRef, arg->value.ref.javaArray);         
                }
-               // record whether object moved 
-               // if we see that isCopy was returned by getPrimitiveArrayCritical, treat that as a move
-               bool objectMoved = (arg->value.ref.addr != prevAddr) || arg->value.ref.isCopy;
-
-#ifdef VERBOSE_EXPLICIT
-               if (arg->isExplicit() && arg->isExplicitWrite()){
-                  fprintf(stderr, "explicit write of %s\n",  arg->name);
-               }
-#endif
-
-               if (jniContext->firstRun || (arg->value.ref.mem == 0) || objectMoved ){
-                  // if either this is the first run or user changed input array
-                  // or gc moved something, then we create buffers/args
-                  cl_uint mask = CL_MEM_USE_HOST_PTR;
-                  if (arg->isRead() && arg->isWrite()) mask |= CL_MEM_READ_WRITE;
-                  else if (arg->isRead() && !arg->isWrite()) mask |= CL_MEM_READ_ONLY;
-                  else if (arg->isWrite()) mask |= CL_MEM_WRITE_ONLY;
-                  arg->value.ref.memMask = mask;
+               // Free previous ref if any
+               if (arg->value.ref.javaArray != NULL) {
+                  jenv->DeleteWeakGlobalRef((jweak) arg->value.ref.javaArray);
                   if (jniContext->isVerbose()){
-                     strcpy(arg->value.ref.memSpec,"CL_MEM_USE_HOST_PTR");
-                     if (mask & CL_MEM_READ_WRITE) strcat(arg->value.ref.memSpec,"|CL_MEM_READ_WRITE");
-                     if (mask & CL_MEM_READ_ONLY) strcat(arg->value.ref.memSpec,"|CL_MEM_READ_ONLY");
-                     if (mask & CL_MEM_WRITE_ONLY) strcat(arg->value.ref.memSpec,"|CL_MEM_WRITE_ONLY");
-
-                     fprintf(stderr, "%s %d clCreateBuffer(context, %s, size=%08x bytes, address=%08x, &status)\n", arg->name, 
-                           i, arg->value.ref.memSpec, arg->sizeInBytes, arg->value.ref.addr);
-                  }
-                  arg->value.ref.mem = clCreateBuffer(jniContext->context, arg->value.ref.memMask, 
-                        arg->sizeInBytes, arg->value.ref.addr, &status);
-
-                  if (status != CL_SUCCESS) {
-                     PRINT_CL_ERR(status, "clCreateBuffer");
-                     jniContext->unpinAll();
-                     return status;
-                  }
-
-                  status = clSetKernelArg(jniContext->kernel, kernelArgPos++, sizeof(cl_mem), (void *)&(arg->value.ref.mem));                  
-                  if (status != CL_SUCCESS) {
-                     PRINT_CL_ERR(status, "clSetKernelArg (array)");
-                     jniContext->unpinAll();
-                     return status;
-                  }
-
-                  // Add the array length if needed
-                  if (arg->usesArrayLength()){
-                     arg->value.ref.javaArrayLength = jenv->GetIntField(arg->javaArg, numElementsFieldID);
-                     status = clSetKernelArg(jniContext->kernel, kernelArgPos++, sizeof(jint), &(arg->value.ref.javaArrayLength));
-
-                     if (jniContext->isVerbose()){
-                        fprintf(stderr, "runKernel arg %d %s, javaArrayLength = %d\n", i, arg->name, arg->value.ref.javaArrayLength);
-                     }
-                     if (status != CL_SUCCESS) {
-                        PRINT_CL_ERR(status, "clSetKernelArg (array length)");
-                        jniContext->unpinAll();
-                        return status;
-                     }
-                  }
-               } else {
-                  // Keep the arg position in sync if no updates were required
-                  kernelArgPos++;
-                  if (arg->usesArrayLength()){
-                     kernelArgPos++;
+                     fprintf(stderr, "DeleteWeakGlobalRef for %s: %p\n", arg->name, arg->value.ref.javaArray);         
                   }
                }
 
-               // we only enqueue a write if we know the kernel actually reads the buffer or if there is an explicit write pending
-               // the default behavior for Constant buffers is also that there is no write enqueued unless explicit
+               // need to free opencl buffers, run will reallocate later
+               if (arg->value.ref.mem != 0) {
+                  //fprintf(stderr, "-->releaseMemObject[%d]\n", i);
+                  status = clReleaseMemObject((cl_mem)arg->value.ref.mem);
+                  //fprintf(stderr, "<--releaseMemObject[%d]\n", i);
+                  ASSERT_CL("clReleaseMemObject()");
+                  arg->value.ref.mem = (cl_mem)0;
+               }
 
-               if (arg->mustWriteBuffer()){
-#ifdef VERBOSE_EXPLICIT
-                  if (arg->isExplicit() && arg->isExplicitWrite()){
-                     fprintf(stderr, "writing explicit buffer %d %s\n", i, arg->name);
-                  }
-#endif
-                  if (jniContext->isVerbose()){
-                     fprintf(stderr, "%s writing buffer %d %s\n",  (arg->isExplicit() ? "explicitly" : ""), 
-                           i, arg->name);
-                  }
-                  if (jniContext->isProfilingEnabled()) {
-                     jniContext->writeEventArgs[writeEventCount]=i;
-                  }
+               arg->value.ref.mem = (cl_mem) 0;
+               arg->value.ref.addr = NULL;
 
-                  status = clEnqueueWriteBuffer(jniContext->commandQueues[0], arg->value.ref.mem, CL_FALSE, 0, 
-                        arg->sizeInBytes, arg->value.ref.addr, 0, NULL, &(jniContext->writeEvents[writeEventCount++]));
-                  if (status != CL_SUCCESS) {
-                     PRINT_CL_ERR(status, "clEnqueueWriteBuffer");
-                     jniContext->unpinAll();
-                     return status;
-                  }
-                  if (arg->isExplicit() && arg->isExplicitWrite()){
-                     arg->type &= ~com_amd_aparapi_KernelRunner_ARG_EXPLICIT_WRITE;
-#ifdef VERBOSE_EXPLICIT
-                     fprintf(stderr, "clearing explicit buffer bit %d %s\n", i, arg->name);
-#endif
-                     jenv->SetIntField(arg->javaArg, typeFieldID,arg->type );
-                  }
-               }
-            } else if (arg->isLocal()){
-               if (jniContext->firstRun){
-                  // must multiply perlocalByteSize by localSize to get real opencl buffer size
-                  int bytesPerLocalSize = jenv->GetIntField(arg->javaArg, bytesPerLocalSizeFieldID);
-                  int adjustedLocalBufSize = bytesPerLocalSize * localSize;
+               // Capture new array ref from the kernel arg object
 
+               if (newRef != NULL) {
+                  arg->value.ref.javaArray = (jarray)jenv->NewWeakGlobalRef((jarray)newRef);
                   if (jniContext->isVerbose()){
-                     fprintf(stderr, "ISLOCAL, clSetKernelArg(jniContext->kernel, %d, %d, NULL);\n", i, adjustedLocalBufSize);
-                  }
-                  status = clSetKernelArg(jniContext->kernel, kernelArgPos++, adjustedLocalBufSize, NULL);
-                  if (status != CL_SUCCESS) {
-                     PRINT_CL_ERR(status, "clSetKernelArg() (local)");
-                     jniContext->unpinAll();
-                     return status;
+                     fprintf(stderr, "NewWeakGlobalRef for %s, set to %p\n", arg->name,
+                           arg->value.ref.javaArray);         
                   }
                } else {
-                  // Keep the arg position in sync if no updates were required
-                  kernelArgPos++;
-                  if (arg->usesArrayLength()){
-                     kernelArgPos++;
-                  }
+                  arg->value.ref.javaArray = NULL;
                }
-            }else{  // primitive arguments
+               arg->value.ref.isArray = true;
 
-               // we need to reflectively sync the value out of the kernel object
-               if (arg->isFloat()){
-                  if (arg->isStatic){
-                     arg->value.f = jenv->GetStaticFloatField(jniContext->kernelClass, arg->fieldID);
-                  }else{
-                     arg->value.f = jenv->GetFloatField(jniContext->kernelObject, arg->fieldID);
-                  }
-                  //fprintf(stderr, "float arg %d\n", arg->value.f); 
-               }else if (arg->isInt()){
-                  if (arg->isStatic){
-                     arg->value.i = jenv->GetStaticIntField(jniContext->kernelClass, arg->fieldID);
-                  }else{
-                     arg->value.i = jenv->GetIntField(jniContext->kernelObject, arg->fieldID);
-                  }
-                  //fprintf(stderr, "int arg %d\n", arg->value.i); 
-               }else if (arg->isBoolean()){
-                  if (arg->isStatic){
-                     arg->value.c = jenv->GetStaticBooleanField(jniContext->kernelClass, arg->fieldID);
-                  }else{
-                     arg->value.c = jenv->GetBooleanField(jniContext->kernelObject, arg->fieldID);
-                  }
-                  //fprintf(stderr, "boolean arg %d\n", arg->value.c); 
-               }else if (arg->isByte()){
-                  if (arg->isStatic){
-                     arg->value.c = jenv->GetStaticByteField(jniContext->kernelClass, arg->fieldID);
-                  }else{
-                     arg->value.c = jenv->GetByteField(jniContext->kernelObject, arg->fieldID);
-                  }
-                  //fprintf(stderr, "byte arg %d\n", arg->value.c); 
-               }else if (arg->isLong()){
-                  if (arg->isStatic){
-                     arg->value.j = jenv->GetStaticLongField(jniContext->kernelClass, arg->fieldID);
-                  }else{
-                     arg->value.j = jenv->GetLongField(jniContext->kernelObject, arg->fieldID);
-                  }
-                  //fprintf(stderr, "long arg %d\n", arg->value.c); 
-               }else if (arg->isDouble()){
-                  if (arg->isStatic){
-                     arg->value.d = jenv->GetStaticDoubleField(jniContext->kernelClass, arg->fieldID);
-                  }else{
-                     arg->value.d = jenv->GetDoubleField(jniContext->kernelObject, arg->fieldID);
-                  }
-                  //fprintf(stderr, "double arg %d\n", arg->value.c); 
-               }
+               // Save the sizeInBytes which was set on the java side
+               arg->syncSizeInBytes(jenv);
 
                if (jniContext->isVerbose()){
-                  fprintf(stderr, "clSetKernelArg %s: %d %d %d 0x%08x\n", arg->name, i, kernelArgPos, 
-                        arg->sizeInBytes, arg->value);         
-               }
-               status = clSetKernelArg(jniContext->kernel, kernelArgPos++, arg->sizeInBytes, &(arg->value));
-               if (status != CL_SUCCESS) {
-                  PRINT_CL_ERR(status, "clSetKernelArg() (value)");
-                  jniContext->unpinAll();
-                  return status;
+                  fprintf(stderr, "updateNonPrimitiveReferences, args[%d].sizeInBytes=%d\n", i, arg->sizeInBytes);
                }
-            }
-         }  // for each arg
+            } // object has changed
+         }
+      } // for each arg
+   } // if jniContext != NULL
+   return(status);
+}
 
-         size_t globalSizeAsSizeT = (globalSize /jniContext->deviceIdc);
-         size_t localSizeAsSizeT = localSize;
 
-         // To support multiple passes we add a 'secret' final arg called 'passid' and just schedule multiple enqueuendrange kernels.  Each of which having a separate value of passid
 
-         for (int passid=0; passid<passes; passid++){
-            for (int dev =0; dev < jniContext->deviceIdc; dev++){
-               size_t offset = (size_t)((globalSize/jniContext->deviceIdc)*dev);
-               status = clSetKernelArg(jniContext->kernel, kernelArgPos, sizeof(passid), &(passid));
-               if (status != CL_SUCCESS) {
-                  PRINT_CL_ERR(status, "clSetKernelArg() (passid)");
-                  jniContext->unpinAll();
-                  return status;
-               }
+JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_runKernelJNI(JNIEnv *jenv,
+      jobject jobj, jlong jniContextHandle, jobject _range, jboolean needSync, jint passes) {
 
-               // four options here due to passid
-               if (passid == 0 && passes==1){
-                  //fprintf(stderr, "setting passid to %d of %d first and last\n", passid, passes);
-                  // there is one pass and this is it
-                  // enqueue depends on write enqueues 
-                  // we don't block but and we populate the executeEvents
-                  status = clEnqueueNDRangeKernel(jniContext->commandQueues[dev], jniContext->kernel, 1, &offset, &globalSizeAsSizeT,
-                        useNullForLocalSize ? NULL : &localSizeAsSizeT,
-                        writeEventCount, writeEventCount?jniContext->writeEvents:NULL, &jniContext->executeEvents[dev]);
-               }else if (passid == 0){
-                  //fprintf(stderr, "setting passid to %d of %d first not last\n", passid, passes);
-                  // this is the first of multiple passes
-                  // enqueue depends on write enqueues 
-                  // we block but do not populate executeEvents (only the last pass does this)
-                  status = clEnqueueNDRangeKernel(jniContext->commandQueues[dev], jniContext->kernel, 1, &offset, &globalSizeAsSizeT,
-                        useNullForLocalSize ? NULL : &localSizeAsSizeT,
-                        writeEventCount, writeEventCount?jniContext->writeEvents:NULL, &jniContext->executeEvents[dev]);
-
-               }else if (passid < passes-1){
-                  // we are in some middle pass (neither first or last) 
-                  // we don't depend on write enqueues
-                  // we block and do not supply executeEvents (only the last pass does this)
-                  //fprintf(stderr, "setting passid to %d of %d not first not last\n", passid, passes);
-                  status = clEnqueueNDRangeKernel(jniContext->commandQueues[dev], jniContext->kernel, 1, &offset, &globalSizeAsSizeT,
-                        useNullForLocalSize ? NULL : &localSizeAsSizeT, 0, NULL, &jniContext->executeEvents[dev]);
-               }else{
-                  // we are the last pass of >1
-                  // we don't depend on write enqueues
-                  // we block and supply executeEvents
-                  //fprintf(stderr, "setting passid to %d of %d  last\n", passid, passes);
-                  status = clEnqueueNDRangeKernel(jniContext->commandQueues[dev], jniContext->kernel, 1, &offset, &globalSizeAsSizeT,
-                        useNullForLocalSize ? NULL : &localSizeAsSizeT, 0, NULL, &jniContext->executeEvents[dev]);
-               }
+   Range range(jenv, _range);
 
+   cl_int status = CL_SUCCESS;
+   JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle);
+   if (jniContext->isVerbose()){
+      timer.start();
+   }
 
-               if (status != CL_SUCCESS) {
-                  PRINT_CL_ERR(status, "clEnqueueNDRangeKernel()");
-                  fprintf(stderr, "after clEnqueueNDRangeKernel, globalSize=%d localSize=%d usingNull=%d\n", (int)globalSizeAsSizeT, (int)localSizeAsSizeT, useNullForLocalSize);
-                  jniContext->unpinAll();
-                  return status;
-               }
-            }
-            if (passid < passes-1){
-               // we need to wait for the executions to complete...
-               status = clWaitForEvents(jniContext->deviceIdc,  jniContext->executeEvents);
-               if (status != CL_SUCCESS) {
-                  PRINT_CL_ERR(status, "clWaitForEvents() execute events mid pass");
-                  jniContext->unpinAll();
-                  return status;
-               }
+   // Need to capture array refs
+   if (jniContext->firstRun || needSync) {
+      updateNonPrimitiveReferences(jenv, jobj, jniContext );
+      if (jniContext->isVerbose()){
+         fprintf(stderr, "back from updateNonPrimitiveReferences\n");
+      }
+   }
 
-               for (int dev = 0; dev < jniContext->deviceIdc; dev++){
-                  status = clReleaseEvent(jniContext->executeEvents[dev]);
-                  if (status != CL_SUCCESS) {
-                     PRINT_CL_ERR(status, "clReleaseEvent() read event");
-                     jniContext->unpinAll();
-                     return status;
-                  }
-               }
-            }
+   int writeEventCount = 0;
+
+   // kernelArgPos is used to keep track of the kernel arg position, it can 
+   // differ from "i" due to insertion of javaArrayLength args which are not
+   // fields read from the kernel object.
+   int kernelArgPos = 0;
+
+   for (int i=0; i< jniContext->argc; i++){
+      KernelArg *arg = jniContext->args[i];
+      // TODO: see if we can get rid of this read 
+      arg->syncType(jenv);
+      if (jniContext->isVerbose()){
+         fprintf(stderr, "got type for arg %d, %s, type=%08x\n", i, arg->name, arg->type);
+      }
+      if (!arg->isPrimitive() && !arg->isLocal()) {
+         // pin the arrays so that GC does not move them during the call
+
+         // get the C memory address for the region being transferred
+         // this uses different JNI calls for arrays vs. directBufs
+         void * prevAddr =  arg->value.ref.addr;
+         if (arg->value.ref.isArray) {
+            arg->pin(jenv);
          }
 
-         int readEventCount = 0;
+         if (jniContext->isVerbose()){
+            fprintf(stderr, "runKernel: arrayOrBuf ref %p, oldAddr=%p, newAddr=%p, ref.mem=%p, isArray=%d\n",
+                  arg->value.ref.javaArray, 
+                  prevAddr,
+                  arg->value.ref.addr,
+                  arg->value.ref.mem,
+                  arg->value.ref.isArray );
+            fprintf(stderr, "at memory addr %p, contents: ", arg->value.ref.addr);
+            unsigned char *pb = (unsigned char *) arg->value.ref.addr;
+            for (int k=0; k<8; k++) {
+               fprintf(stderr, "%02x ", pb[k]);
+            }
+            fprintf(stderr, "\n" );
+         }
+         // record whether object moved 
+         // if we see that isCopy was returned by getPrimitiveArrayCritical, treat that as a move
+         bool objectMoved = (arg->value.ref.addr != prevAddr) || arg->value.ref.isCopy;
 
-         for (int i=0; i< jniContext->argc; i++){
-            KernelArg *arg = jniContext->args[i];
+#ifdef VERBOSE_EXPLICIT
+         if (arg->isExplicit() && arg->isExplicitWrite()){
+            fprintf(stderr, "explicit write of %s\n",  arg->name);
+         }
+#endif
 
-            if (arg->mustReadBuffer()){
-               if (jniContext->isProfilingEnabled()) {
-                  jniContext->readEventArgs[readEventCount]=i;
-               }
-               if (jniContext->isVerbose()){
-                  fprintf(stderr, "reading buffer %d %s\n", i, arg->name);
-               }
+         if (jniContext->firstRun || (arg->value.ref.mem == 0) || objectMoved ){
+            // if either this is the first run or user changed input array
+            // or gc moved something, then we create buffers/args
+            cl_uint mask = CL_MEM_USE_HOST_PTR;
+            if (arg->isRead() && arg->isWrite()) mask |= CL_MEM_READ_WRITE;
+            else if (arg->isRead() && !arg->isWrite()) mask |= CL_MEM_READ_ONLY;
+            else if (arg->isWrite()) mask |= CL_MEM_WRITE_ONLY;
+            arg->value.ref.memMask = mask;
+            if (jniContext->isVerbose()){
+               strcpy(arg->value.ref.memSpec,"CL_MEM_USE_HOST_PTR");
+               if (mask & CL_MEM_READ_WRITE) strcat(arg->value.ref.memSpec,"|CL_MEM_READ_WRITE");
+               if (mask & CL_MEM_READ_ONLY) strcat(arg->value.ref.memSpec,"|CL_MEM_READ_ONLY");
+               if (mask & CL_MEM_WRITE_ONLY) strcat(arg->value.ref.memSpec,"|CL_MEM_WRITE_ONLY");
 
-               status = clEnqueueReadBuffer(jniContext->commandQueues[0], arg->value.ref.mem, CL_FALSE, 0, 
-                     arg->sizeInBytes,arg->value.ref.addr , jniContext->deviceIdc, jniContext->executeEvents, &(jniContext->readEvents[readEventCount++]));
-               if (status != CL_SUCCESS) {
-                  PRINT_CL_ERR(status, "clEnqueueReadBuffer()");
-                  jniContext->unpinAll();
-                  return status;
-               }
+               fprintf(stderr, "%s %d clCreateBuffer(context, %s, size=%08x bytes, address=%08x, &status)\n", arg->name, 
+                     i, arg->value.ref.memSpec, arg->sizeInBytes, arg->value.ref.addr);
             }
-         }
+            arg->value.ref.mem = clCreateBuffer(jniContext->context, arg->value.ref.memMask, 
+                  arg->sizeInBytes, arg->value.ref.addr, &status);
 
-         // don't change the order here
-         // We wait for the reads which each depend on the execution, which depends on the writes ;)
-         // So after the reads have completed, we can release the execute and writes.
+            if (status != CL_SUCCESS) {
+               PRINT_CL_ERR(status, "clCreateBuffer");
+               jniContext->unpinAll(jenv);
+               return status;
+            }
 
-         if (readEventCount >0){
-            status = clWaitForEvents(readEventCount, jniContext->readEvents);
+            status = clSetKernelArg(jniContext->kernel, kernelArgPos++, sizeof(cl_mem), (void *)&(arg->value.ref.mem));                  
             if (status != CL_SUCCESS) {
-               PRINT_CL_ERR(status, "clWaitForEvents() read events");
-               jniContext->unpinAll();
+               PRINT_CL_ERR(status, "clSetKernelArg (array)");
+               jniContext->unpinAll(jenv);
                return status;
             }
 
-            for (int i=0; i< readEventCount; i++){
-               if (jniContext->isProfilingEnabled()) {
-                  status = profile(&jniContext->args[jniContext->readEventArgs[i]]->value.ref.read, &jniContext->readEvents[i]);
-                  if (status != CL_SUCCESS) {
-                     jniContext->unpinAll();
-                     return status;
-                  }
+            // Add the array length if needed
+            if (arg->usesArrayLength()){
+               arg->syncJavaArrayLength(jenv);
+
+               status = clSetKernelArg(jniContext->kernel, kernelArgPos++, sizeof(jint), &(arg->value.ref.javaArrayLength));
+
+               if (jniContext->isVerbose()){
+                  fprintf(stderr, "runKernel arg %d %s, javaArrayLength = %d\n", i, arg->name, arg->value.ref.javaArrayLength);
                }
-               status = clReleaseEvent(jniContext->readEvents[i]);
                if (status != CL_SUCCESS) {
-                  PRINT_CL_ERR(status, "clReleaseEvent() read event");
-                  jniContext->unpinAll();
+                  PRINT_CL_ERR(status, "clSetKernelArg (array length)");
+                  jniContext->unpinAll(jenv);
                   return status;
                }
             }
          } else {
-            // if readEventCount == 0 then we don't need any reads so we just wait for the executions to complete
-            status = clWaitForEvents(jniContext->deviceIdc, jniContext->executeEvents);
+            // Keep the arg position in sync if no updates were required
+            kernelArgPos++;
+            if (arg->usesArrayLength()){
+               kernelArgPos++;
+            }
+         }
+
+         // we only enqueue a write if we know the kernel actually reads the buffer or if there is an explicit write pending
+         // the default behavior for Constant buffers is also that there is no write enqueued unless explicit
+
+         if (arg->mustWriteBuffer()){
+#ifdef VERBOSE_EXPLICIT
+            if (arg->isExplicit() && arg->isExplicitWrite()){
+               fprintf(stderr, "writing explicit buffer %d %s\n", i, arg->name);
+            }
+#endif
+            if (jniContext->isVerbose()){
+               fprintf(stderr, "%s writing buffer %d %s\n",  (arg->isExplicit() ? "explicitly" : ""), 
+                     i, arg->name);
+            }
+            if (jniContext->isProfilingEnabled()) {
+               jniContext->writeEventArgs[writeEventCount]=i;
+            }
+
+            status = clEnqueueWriteBuffer(jniContext->commandQueues[0], arg->value.ref.mem, CL_FALSE, 0, 
+                  arg->sizeInBytes, arg->value.ref.addr, 0, NULL, &(jniContext->writeEvents[writeEventCount++]));
             if (status != CL_SUCCESS) {
-               PRINT_CL_ERR(status, "clWaitForEvents() execute event");
-               jniContext->unpinAll();
+               PRINT_CL_ERR(status, "clEnqueueWriteBuffer");
+               jniContext->unpinAll(jenv);
                return status;
             }
+            if (arg->isExplicit() && arg->isExplicitWrite()){
+#ifdef VERBOSE_EXPLICIT
+               fprintf(stderr, "clearing explicit buffer bit %d %s\n", i, arg->name);
+#endif
+               arg->clearExplicitBufferBit(jenv);
+            }
          }
+      } else if (arg->isLocal()){
+         if (jniContext->firstRun){
+            int bytes = arg->sizeInBytes;
 
-         if (jniContext->isProfilingEnabled()) {
-            status = profile(&jniContext->exec, &jniContext->executeEvents[0]);
+            if (jniContext->isVerbose()){
+               fprintf(stderr, "ISLOCAL, clSetKernelArg(jniContext->kernel, %d, %d, NULL);\n", i, bytes);
+            }
+            status = clSetKernelArg(jniContext->kernel, kernelArgPos++, bytes, NULL);
             if (status != CL_SUCCESS) {
-               jniContext->unpinAll();
+               PRINT_CL_ERR(status, "clSetKernelArg() (local)");
+               jniContext->unpinAll(jenv);
                return status;
             }
+         } else {
+            // Keep the arg position in sync if no updates were required
+            kernelArgPos++;
+            if (arg->usesArrayLength()){
+               kernelArgPos++;
+            }
+         }
+      }else{  // primitive arguments
+
+         // we need to reflectively sync the value out of the kernel object
+         if (arg->isFloat()){
+            if (arg->isStatic){
+               arg->value.f = jenv->GetStaticFloatField(jniContext->kernelClass, arg->fieldID);
+            }else{
+               arg->value.f = jenv->GetFloatField(jniContext->kernelObject, arg->fieldID);
+            }
+            //fprintf(stderr, "float arg %d\n", arg->value.f); 
+         }else if (arg->isInt()){
+            if (arg->isStatic){
+               arg->value.i = jenv->GetStaticIntField(jniContext->kernelClass, arg->fieldID);
+            }else{
+               arg->value.i = jenv->GetIntField(jniContext->kernelObject, arg->fieldID);
+            }
+            //fprintf(stderr, "int arg %d\n", arg->value.i); 
+         }else if (arg->isBoolean()){
+            if (arg->isStatic){
+               arg->value.c = jenv->GetStaticBooleanField(jniContext->kernelClass, arg->fieldID);
+            }else{
+               arg->value.c = jenv->GetBooleanField(jniContext->kernelObject, arg->fieldID);
+            }
+            //fprintf(stderr, "boolean arg %d\n", arg->value.c); 
+         }else if (arg->isByte()){
+            if (arg->isStatic){
+               arg->value.c = jenv->GetStaticByteField(jniContext->kernelClass, arg->fieldID);
+            }else{
+               arg->value.c = jenv->GetByteField(jniContext->kernelObject, arg->fieldID);
+            }
+            //fprintf(stderr, "byte arg %d\n", arg->value.c); 
+         }else if (arg->isLong()){
+            if (arg->isStatic){
+               arg->value.j = jenv->GetStaticLongField(jniContext->kernelClass, arg->fieldID);
+            }else{
+               arg->value.j = jenv->GetLongField(jniContext->kernelObject, arg->fieldID);
+            }
+            //fprintf(stderr, "long arg %d\n", arg->value.c); 
+         }else if (arg->isDouble()){
+            if (arg->isStatic){
+               arg->value.d = jenv->GetStaticDoubleField(jniContext->kernelClass, arg->fieldID);
+            }else{
+               arg->value.d = jenv->GetDoubleField(jniContext->kernelObject, arg->fieldID);
+            }
+            //fprintf(stderr, "double arg %d\n", arg->value.c); 
+         }
+
+         if (jniContext->isVerbose()){
+            fprintf(stderr, "clSetKernelArg %s: %d %d %d 0x%08x\n", arg->name, i, kernelArgPos, 
+                  arg->sizeInBytes, arg->value);         
          }
+         status = clSetKernelArg(jniContext->kernel, kernelArgPos++, arg->sizeInBytes, &(arg->value));
+         if (status != CL_SUCCESS) {
+            PRINT_CL_ERR(status, "clSetKernelArg() (value)");
+            jniContext->unpinAll(jenv);
+            return status;
+         }
+      }
+   }  // for each arg
 
-         // extract the execution status from the executeEvent
-         cl_int executeStatus;
-         status = clGetEventInfo(jniContext->executeEvents[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &executeStatus, NULL);
+   // We will need to revisit the execution of multiple devices.  
+   // POssibly cloning the range per device and mutating each to handle a unique subrange (of global) and
+   // maybe even pushing the offset into the range class.
+
+   //   size_t globalSize_0AsSizeT = (range.globalDims[0] /jniContext->deviceIdc);
+   //   size_t localSize_0AsSizeT = range.localDims[0];
+
+   // To support multiple passes we add a 'secret' final arg called 'passid' and just schedule multiple enqueuendrange kernels.  Each of which having a separate value of passid
+   //
+
+   for (int passid=0; passid<passes; passid++){
+      for (int dev =0; dev < jniContext->deviceIdc; dev++){ // this will always be 1 until we reserect multi-dim support
+         //size_t offset = 1; // (size_t)((range.globalDims[0]/jniContext->deviceIdc)*dev);
+         status = clSetKernelArg(jniContext->kernel, kernelArgPos, sizeof(passid), &(passid));
          if (status != CL_SUCCESS) {
-            PRINT_CL_ERR(status, "clGetEventInfo() execute event");
-            jniContext->unpinAll();
+            PRINT_CL_ERR(status, "clSetKernelArg() (passid)");
+            jniContext->unpinAll(jenv);
             return status;
          }
-         if (executeStatus != CL_SUCCESS) {
-            // it should definitely not be negative, but since we did a wait above, it had better be CL_COMPLETE==CL_SUCCESS
-            PRINT_CL_ERR(executeStatus, "Execution status of execute event");
-            jniContext->unpinAll();
-            return executeStatus;
+
+         // four options here due to passid
+         if (passid == 0 && passes==1){
+            //fprintf(stderr, "setting passid to %d of %d first and last\n", passid, passes);
+            // there is one pass and this is it
+            // enqueue depends on write enqueues 
+            // we don't block but and we populate the executeEvents
+            status = clEnqueueNDRangeKernel(
+                  jniContext->commandQueues[dev],
+                  jniContext->kernel,
+                  range.dims,
+                  range.offsets, range.globalDims,
+                  range.localDims,
+                  writeEventCount,
+                  writeEventCount?jniContext->writeEvents:NULL,
+                  &jniContext->executeEvents[dev]);
+         }else if (passid == 0){
+            //fprintf(stderr, "setting passid to %d of %d first not last\n", passid, passes);
+            // this is the first of multiple passes
+            // enqueue depends on write enqueues 
+            // we block but do not populate executeEvents (only the last pass does this)
+            status = clEnqueueNDRangeKernel(
+                  jniContext->commandQueues[dev],
+                  jniContext->kernel,
+                  range.dims,
+                  range.offsets,
+                  range.globalDims,
+                  range.localDims,
+                  writeEventCount,
+                  writeEventCount?jniContext->writeEvents:NULL,
+                  &jniContext->executeEvents[dev]);
+
+         }else if (passid < passes-1){
+            // we are in some middle pass (neither first or last) 
+            // we don't depend on write enqueues
+            // we block and do not supply executeEvents (only the last pass does this)
+            //fprintf(stderr, "setting passid to %d of %d not first not last\n", passid, passes);
+            status = clEnqueueNDRangeKernel(
+                  jniContext->commandQueues[dev], 
+                  jniContext->kernel,
+                  range.dims,
+                  range.offsets,
+                  range.globalDims,
+                  range.localDims,
+                  0,    // wait for this event count
+                  NULL, // list of events to wait for
+                  &jniContext->executeEvents[dev]);
+         }else{
+            // we are the last pass of >1
+            // we don't depend on write enqueues
+            // we block and supply executeEvents
+            //fprintf(stderr, "setting passid to %d of %d  last\n", passid, passes);
+            status = clEnqueueNDRangeKernel(
+                  jniContext->commandQueues[dev], 
+                  jniContext->kernel,
+                  range.dims,
+                  range.offsets, 
+                  range.globalDims,
+                  range.localDims,
+                  0,    // wait for this event count
+                  NULL, // list of events to wait for
+                  &jniContext->executeEvents[dev]);
          }
 
-         for (int dev=0; dev<jniContext->deviceIdc; dev++){
 
-            status = clReleaseEvent(jniContext->executeEvents[dev]);
-            if (status != CL_SUCCESS) {
-               PRINT_CL_ERR(status, "clReleaseEvent() execute event");
-               jniContext->unpinAll();
-               return status;
-            }
+         if (status != CL_SUCCESS) {
+            PRINT_CL_ERR(status, "clEnqueueNDRangeKernel()");
+            fprintf(stderr, "after clEnqueueNDRangeKernel, globalSize_0=%d localSize_0=%d\n", (int)range.globalDims[0], range.localDims[0] );
+            jniContext->unpinAll(jenv);
+            return status;
+         }
+      }
+      if (passid < passes-1){
+         // we need to wait for the executions to complete...
+         status = clWaitForEvents(jniContext->deviceIdc,  jniContext->executeEvents);
+         if (status != CL_SUCCESS) {
+            PRINT_CL_ERR(status, "clWaitForEvents() execute events mid pass");
+            jniContext->unpinAll(jenv);
+            return status;
          }
 
-         for (int i=0; i< writeEventCount; i++){
-            if (jniContext->isProfilingEnabled()) {
-               profile(&jniContext->args[jniContext->writeEventArgs[i]]->value.ref.write, &jniContext->writeEvents[i]);
-            }
-            status = clReleaseEvent(jniContext->writeEvents[i]);
+         for (int dev = 0; dev < jniContext->deviceIdc; dev++){
+            status = clReleaseEvent(jniContext->executeEvents[dev]);
             if (status != CL_SUCCESS) {
-               PRINT_CL_ERR(status, "clReleaseEvent() write event");
-               jniContext->unpinAll();
+               PRINT_CL_ERR(status, "clReleaseEvent() read event");
+               jniContext->unpinAll(jenv);
                return status;
             }
          }
+      }
+   }
+
+   int readEventCount = 0;
 
-         jniContext->unpinAll();
+   for (int i=0; i< jniContext->argc; i++){
+      KernelArg *arg = jniContext->args[i];
 
+      if (arg->mustReadBuffer()){
          if (jniContext->isProfilingEnabled()) {
-            writeProfileInfo(jniContext);
+            jniContext->readEventArgs[readEventCount]=i;
          }
-
-         jniContext->firstRun = false;
          if (jniContext->isVerbose()){
-            timer.end("elapsed");
+            fprintf(stderr, "reading buffer %d %s\n", i, arg->name);
          }
 
-         //fprintf(stderr, "About to return %d from exec\n", status);
-         return(status);
+         status = clEnqueueReadBuffer(jniContext->commandQueues[0], arg->value.ref.mem, CL_FALSE, 0, 
+               arg->sizeInBytes,arg->value.ref.addr , jniContext->deviceIdc, jniContext->executeEvents, &(jniContext->readEvents[readEventCount++]));
+         if (status != CL_SUCCESS) {
+            PRINT_CL_ERR(status, "clEnqueueReadBuffer()");
+            jniContext->unpinAll(jenv);
+            return status;
+         }
       }
+   }
 
+   // don't change the order here
+   // We wait for the reads which each depend on the execution, which depends on the writes ;)
+   // So after the reads have completed, we can release the execute and writes.
 
-      // we return the JNIContext from here 
-      JNIEXPORT jlong JNICALL Java_com_amd_aparapi_KernelRunner_initJNI(JNIEnv *jenv, jclass clazz, jobject kernelObject, 
-            jint flags, jint numProcessors,
-            jint maxJTPLocalSize) {
-         cl_int status = CL_SUCCESS;
-         JNIContext* jniContext = new JNIContext(jenv, kernelObject, flags, numProcessors, maxJTPLocalSize);
-         if (jniContext->isValid()){
-            return((jlong)jniContext);
-         }else{
-            return(0L);
+   if (readEventCount >0){
+      status = clWaitForEvents(readEventCount, jniContext->readEvents);
+      if (status != CL_SUCCESS) {
+         PRINT_CL_ERR(status, "clWaitForEvents() read events");
+         jniContext->unpinAll(jenv);
+         return status;
+      }
+
+      for (int i=0; i< readEventCount; i++){
+         if (jniContext->isProfilingEnabled()) {
+            status = profile(&jniContext->args[jniContext->readEventArgs[i]]->value.ref.read, &jniContext->readEvents[i]);
+            if (status != CL_SUCCESS) {
+               jniContext->unpinAll(jenv);
+               return status;
+            }
+         }
+         status = clReleaseEvent(jniContext->readEvents[i]);
+         if (status != CL_SUCCESS) {
+            PRINT_CL_ERR(status, "clReleaseEvent() read event");
+            jniContext->unpinAll(jenv);
+            return status;
          }
       }
+   } else {
+      // if readEventCount == 0 then we don't need any reads so we just wait for the executions to complete
+      status = clWaitForEvents(jniContext->deviceIdc, jniContext->executeEvents);
+      if (status != CL_SUCCESS) {
+         PRINT_CL_ERR(status, "clWaitForEvents() execute event");
+         jniContext->unpinAll(jenv);
+         return status;
+      }
+   }
 
+   if (jniContext->isProfilingEnabled()) {
+      status = profile(&jniContext->exec, &jniContext->executeEvents[0]);
+      if (status != CL_SUCCESS) {
+         jniContext->unpinAll(jenv);
+         return status;
+      }
+   }
 
-      JNIEXPORT jlong JNICALL Java_com_amd_aparapi_KernelRunner_buildProgramJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jstring source) {
-         JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle);
-         if (jniContext == NULL){
-            return 0;
-         }
+   // extract the execution status from the executeEvent
+   cl_int executeStatus;
+   status = clGetEventInfo(jniContext->executeEvents[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &executeStatus, NULL);
+   if (status != CL_SUCCESS) {
+      PRINT_CL_ERR(status, "clGetEventInfo() execute event");
+      jniContext->unpinAll(jenv);
+      return status;
+   }
+   if (executeStatus != CL_SUCCESS) {
+      // it should definitely not be negative, but since we did a wait above, it had better be CL_COMPLETE==CL_SUCCESS
+      PRINT_CL_ERR(executeStatus, "Execution status of execute event");
+      jniContext->unpinAll(jenv);
+      return executeStatus;
+   }
 
-         cl_int status = CL_SUCCESS;
-         const char *sourceChars = jenv->GetStringUTFChars(source, NULL);
-         CHECK(sourceChars == NULL, "jenv->GetStringUTFChars() returned null" );
-
-         size_t sourceSize[] = { strlen(sourceChars) };
-         jniContext->program = clCreateProgramWithSource( jniContext->context, 1, &sourceChars, sourceSize, &status); 
-         jenv->ReleaseStringUTFChars(source, sourceChars);
-         ASSERT_CL("clCreateProgramWithSource()");
-
-         status = clBuildProgram(jniContext->program, jniContext->deviceIdc, jniContext->deviceIds, NULL, NULL, NULL);
-
-         if(status == CL_BUILD_PROGRAM_FAILURE) {
-            cl_int logStatus;
-            size_t buildLogSize = 0;
-            status = clGetProgramBuildInfo(jniContext->program, jniContext->deviceIds[0], 
-                  CL_PROGRAM_BUILD_LOG, buildLogSize, NULL, &buildLogSize);
-            ASSERT_CL("clGetProgramBuildInfo()");
-            char * buildLog = new char[buildLogSize];
-            CHECK(buildLog == NULL, "Failed to allocate host memory. (buildLog)");
-            memset(buildLog, 0, buildLogSize);
-            status = clGetProgramBuildInfo (jniContext->program, jniContext->deviceIds[0], 
-                  CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL);
-            ASSERT_CL("clGetProgramBuildInfo()");
-
-            fprintf(stderr, "clBuildProgram failed");
-            fprintf(stderr, "\n************************************************\n");
-            fprintf(stderr, "%s", buildLog);
-            fprintf(stderr, "\n************************************************\n\n\n");
-            delete []buildLog;
-            return(0);
-         }
+   for (int dev=0; dev<jniContext->deviceIdc; dev++){
 
-         jniContext->kernel = clCreateKernel(jniContext->program, "run", &status);
-         ASSERT_CL("clCreateKernel()");
+      status = clReleaseEvent(jniContext->executeEvents[dev]);
+      if (status != CL_SUCCESS) {
+         PRINT_CL_ERR(status, "clReleaseEvent() execute event");
+         jniContext->unpinAll(jenv);
+         return status;
+      }
+   }
 
-         cl_command_queue_properties queue_props = 0;
-         if (jniContext->isProfilingEnabled()) {
-            queue_props |= CL_QUEUE_PROFILING_ENABLE;
-         }
+   for (int i=0; i< writeEventCount; i++){
+      if (jniContext->isProfilingEnabled()) {
+         profile(&jniContext->args[jniContext->writeEventArgs[i]]->value.ref.write, &jniContext->writeEvents[i]);
+      }
+      status = clReleaseEvent(jniContext->writeEvents[i]);
+      if (status != CL_SUCCESS) {
+         PRINT_CL_ERR(status, "clReleaseEvent() write event");
+         jniContext->unpinAll(jenv);
+         return status;
+      }
+   }
 
-         jniContext->commandQueues= new cl_command_queue[jniContext->deviceIdc];
-         for (int dev=0; dev < jniContext->deviceIdc; dev++){
-            jniContext->commandQueues[dev]=clCreateCommandQueue(jniContext->context, (cl_device_id)jniContext->deviceIds[dev],
-                  queue_props,
-                  &status);
-            ASSERT_CL("clCreateCommandQueue()");
-         }
+   jniContext->unpinAll(jenv);
 
-         if (jniContext->isProfilingEnabled()) {
-            // compute profile filename
+   if (jniContext->isProfilingEnabled()) {
+      writeProfileInfo(jniContext);
+   }
+
+   jniContext->firstRun = false;
+   if (jniContext->isVerbose()){
+      timer.end("elapsed");
+   }
+
+   //fprintf(stderr, "About to return %d from exec\n", status);
+   return(status);
+}
+
+
+// we return the JNIContext from here 
+JNIEXPORT jlong JNICALL Java_com_amd_aparapi_KernelRunner_initJNI(JNIEnv *jenv, jclass clazz, jobject kernelObject, 
+      jint flags) {
+   cl_int status = CL_SUCCESS;
+   JNIContext* jniContext = new JNIContext(jenv, kernelObject, flags);
+   if (jniContext->isValid()){
+      return((jlong)jniContext);
+   }else{
+      return(0L);
+   }
+}
+
+
+JNIEXPORT jlong JNICALL Java_com_amd_aparapi_KernelRunner_buildProgramJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jstring source) {
+   JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle);
+   if (jniContext == NULL){
+      return 0;
+   }
+
+   cl_int status = CL_SUCCESS;
+   const char *sourceChars = jenv->GetStringUTFChars(source, NULL);
+   CHECK(sourceChars == NULL, "jenv->GetStringUTFChars() returned null" );
+
+   size_t sourceSize[] = { strlen(sourceChars) };
+   jniContext->program = clCreateProgramWithSource( jniContext->context, 1, &sourceChars, sourceSize, &status); 
+   jenv->ReleaseStringUTFChars(source, sourceChars);
+   ASSERT_CL("clCreateProgramWithSource()");
+
+   status = clBuildProgram(jniContext->program, jniContext->deviceIdc, jniContext->deviceIds, NULL, NULL, NULL);
+
+   if(status == CL_BUILD_PROGRAM_FAILURE) {
+      cl_int logStatus;
+      size_t buildLogSize = 0;
+      status = clGetProgramBuildInfo(jniContext->program, jniContext->deviceIds[0], 
+            CL_PROGRAM_BUILD_LOG, buildLogSize, NULL, &buildLogSize);
+      ASSERT_CL("clGetProgramBuildInfo()");
+      char * buildLog = new char[buildLogSize];
+      CHECK(buildLog == NULL, "Failed to allocate host memory. (buildLog)");
+      memset(buildLog, 0, buildLogSize);
+      status = clGetProgramBuildInfo (jniContext->program, jniContext->deviceIds[0], 
+            CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL);
+      ASSERT_CL("clGetProgramBuildInfo()");
+
+      fprintf(stderr, "clBuildProgram failed");
+      fprintf(stderr, "\n************************************************\n");
+      fprintf(stderr, "%s", buildLog);
+      fprintf(stderr, "\n************************************************\n\n\n");
+      delete []buildLog;
+      return(0);
+   }
+
+   jniContext->kernel = clCreateKernel(jniContext->program, "run", &status);
+   ASSERT_CL("clCreateKernel()");
+
+   cl_command_queue_properties queue_props = 0;
+   if (jniContext->isProfilingEnabled()) {
+      queue_props |= CL_QUEUE_PROFILING_ENABLE;
+   }
+
+   jniContext->commandQueues= new cl_command_queue[jniContext->deviceIdc];
+   for (int dev=0; dev < jniContext->deviceIdc; dev++){
+      jniContext->commandQueues[dev]=clCreateCommandQueue(jniContext->context, (cl_device_id)jniContext->deviceIds[dev],
+            queue_props,
+            &status);
+      ASSERT_CL("clCreateCommandQueue()");
+   }
+
+   if (jniContext->isProfilingEnabled()) {
+      // compute profile filename
 #if defined (_WIN32)
-            jint pid = GetCurrentProcessId();
+      jint pid = GetCurrentProcessId();
 #else
-            pid_t pid = getpid();
+      pid_t pid = getpid();
 #endif
-            // indicate cpu or gpu
-            // timestamp
-            // kernel name
+      // indicate cpu or gpu
+      // timestamp
+      // kernel name
 
-            jclass classMethodAccess = jenv->FindClass("java/lang/Class"); 
-            jmethodID getNameID=jenv->GetMethodID(classMethodAccess,"getName","()Ljava/lang/String;");
-            jstring className = (jstring)jenv->CallObjectMethod(jniContext->kernelClass, getNameID);
-            const char *classNameChars = jenv->GetStringUTFChars(className, NULL);
+      jclass classMethodAccess = jenv->FindClass("java/lang/Class"); 
+      jmethodID getNameID=jenv->GetMethodID(classMethodAccess,"getName","()Ljava/lang/String;");
+      jstring className = (jstring)jenv->CallObjectMethod(jniContext->kernelClass, getNameID);
+      const char *classNameChars = jenv->GetStringUTFChars(className, NULL);
 
 #define TIME_STR_LEN 200
 
-            char timeStr[TIME_STR_LEN];
-            struct tm *tmp;
-            time_t t = time(NULL);
-            tmp = localtime(&t);
-            if (tmp == NULL) {
-               perror("localtime");
-            }
-            //strftime(timeStr, TIME_STR_LEN, "%F.%H%M%S", tmp);  %F seemed to cause a core dump
-            strftime(timeStr, TIME_STR_LEN, "%H%M%S", tmp);
-
-            char* fnameStr = new char[strlen(classNameChars) + strlen(timeStr) + 128];
+      char timeStr[TIME_STR_LEN];
+      struct tm *tmp;
+      time_t t = time(NULL);
+      tmp = localtime(&t);
+      if (tmp == NULL) {
+         perror("localtime");
+      }
+      //strftime(timeStr, TIME_STR_LEN, "%F.%H%M%S", tmp);  %F seemed to cause a core dump
+      strftime(timeStr, TIME_STR_LEN, "%H%M%S", tmp);
 
-            //sprintf(fnameStr, "%s.%s.%d.%llx\n", classNameChars, timeStr, pid, jniContext);
-            sprintf(fnameStr, "aparapiprof.%s.%d.%016lx", timeStr, pid, (unsigned long)jniContext);
-            jenv->ReleaseStringUTFChars(className, classNameChars);
+      char* fnameStr = new char[strlen(classNameChars) + strlen(timeStr) + 128];
 
-            FILE* profileFile = fopen(fnameStr, "w");
-            if (profileFile != NULL) {
-               jniContext->profileFile = profileFile;
-            } else {
-               jniContext->profileFile = stderr;
-               fprintf(stderr, "Could not open profile data file %s, reverting to stderr\n", fnameStr);
-            }
-            delete []fnameStr;
-         }
+      //sprintf(fnameStr, "%s.%s.%d.%llx\n", classNameChars, timeStr, pid, jniContext);
+      sprintf(fnameStr, "aparapiprof.%s.%d.%016lx", timeStr, pid, (unsigned long)jniContext);
+      jenv->ReleaseStringUTFChars(className, classNameChars);
 
-         return((jlong)jniContext);
+      FILE* profileFile = fopen(fnameStr, "w");
+      if (profileFile != NULL) {
+         jniContext->profileFile = profileFile;
+      } else {
+         jniContext->profileFile = stderr;
+         fprintf(stderr, "Could not open profile data file %s, reverting to stderr\n", fnameStr);
       }
+      delete []fnameStr;
+   }
 
+   return((jlong)jniContext);
+}
 
-      // this is called once when the arg list is first determined for this kernel
-      JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_setArgsJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jobjectArray argArray, jint argc) {
-         JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle);
-         cl_int status = CL_SUCCESS;
-         if (jniContext != NULL){      
-            jniContext->argc = argc;
-            jniContext->args = new KernelArg*[jniContext->argc];
-            jniContext->firstRun = true;
 
-            // Step through the array of KernelArg's to capture the type data for the Kernel's data members.
-            for (jint i=0; i<jniContext->argc; i++){ 
-               KernelArg* arg = jniContext->args[i] = new KernelArg;
+// this is called once when the arg list is first determined for this kernel
+JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_setArgsJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jobjectArray argArray, jint argc) {
+   JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle);
+   cl_int status = CL_SUCCESS;
+   if (jniContext != NULL){      
+      jniContext->argc = argc;
+      jniContext->args = new KernelArg*[jniContext->argc];
+      jniContext->firstRun = true;
 
+      // Step through the array of KernelArg's to capture the type data for the Kernel's data members.
+      for (jint i=0; i<jniContext->argc; i++){ 
+
+
+         jobject argObj = jenv->GetObjectArrayElement(argArray, i);
+         KernelArg* arg = jniContext->args[i] = new KernelArg(jenv, argObj);
 
-               jobject argObj = jenv->GetObjectArrayElement(argArray, i);
-               if (argClazz == 0){
-                  argClazz = cacheKernelArgFields(jenv, argObj);
-               }
-               arg->javaArg = jenv->NewGlobalRef(argObj);   // save a global ref to the java Arg Object
-
-               arg->type = jenv->GetIntField(argObj, typeFieldID);
-               arg->isStatic = jenv->GetBooleanField(argObj, isStaticFieldID);
-               jstring name  = (jstring)jenv->GetObjectField(argObj, nameFieldID);
-               const char *nameChars = jenv->GetStringUTFChars(name, NULL);
-               arg->name=strdup(nameChars);
-               jenv->ReleaseStringUTFChars(name, nameChars);
 #ifdef VERBOSE_EXPLICIT
-               if (arg->isExplicit()){
-                  fprintf(stderr, "%s is explicit!\n", arg->name);
-               }
+         if (arg->isExplicit()){
+            fprintf(stderr, "%s is explicit!\n", arg->name);
+         }
 #endif
 
-               if (arg->isPrimitive()) {
-                  // for primitives, we cache the fieldID for that field in the kernel's arg object
-                  if (arg->isFloat()){
-                     if (arg->isStatic){
-                        arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "F");
-                     }else{
-                        arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "F");
-                     }
-                     arg->sizeInBytes = sizeof(jfloat);
-                  }else if (arg->isInt()){
-                     if (arg->isStatic){
-                        arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "I");
-                     }else{
-                        arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "I");
-                     }
-                     arg->sizeInBytes = sizeof(jint);
-                  }else if (arg->isByte()){
-                     if (arg->isStatic){
-                        arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "B");
-                     }else{
-                        arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "B");
-                     }
-                     arg->sizeInBytes = sizeof(jbyte);
-                  }else if (arg->isBoolean()){
-                     if (arg->isStatic){
-                        arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "Z");
-                     }else{
-                        arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "Z");
-                     }
-                     arg->sizeInBytes = sizeof(jboolean);
-                  }else if (arg->isLong()){
-                     if (arg->isStatic){
-                        arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "J");
-                     }else{
-                        arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "J");
-                     }
-                     arg->sizeInBytes = sizeof(jlong);
-                  }else if (arg->isDouble()){
-                     if (arg->isStatic){
-                        arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "D");
-                     }else{
-                        arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "D");
-                     }
-                     arg->sizeInBytes = sizeof(jdouble);
-                  }
-               }else{ // we will use an array
-                  arg->value.ref.mem = (cl_mem) 0;
-                  arg->value.ref.javaArray = 0;
-                  arg->sizeInBytes = 0;
+         if (arg->isPrimitive()) {
+            // for primitives, we cache the fieldID for that field in the kernel's arg object
+            if (arg->isFloat()){
+               if (arg->isStatic){
+                  arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "F");
+               }else{
+                  arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "F");
                }
-               if (jniContext->isVerbose()){
-                  fprintf(stderr, "in setArgs arg %d %s type %08x\n", i, arg->name, arg->type);
+               arg->sizeInBytes = sizeof(jfloat);
+            }else if (arg->isInt()){
+               if (arg->isStatic){
+                  arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "I");
+               }else{
+                  arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "I");
                }
-
-               //If an error occurred, return early so we report the first problem, not the last
-               if (jniContext->jenv->ExceptionCheck() == JNI_TRUE) {
-                  jniContext->argc = -1;
-                  delete[] jniContext->args;
-                  jniContext->args = NULL;
-                  jniContext->firstRun = true;
-                  return (status);
+               arg->sizeInBytes = sizeof(jint);
+            }else if (arg->isByte()){
+               if (arg->isStatic){
+                  arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "B");
+               }else{
+                  arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "B");
                }
-
-            }
-            // we will need an executeEvent buffer for all devices
-            jniContext->executeEvents = new cl_event[jniContext->deviceIdc];
-
-            // We will need *at most* jniContext->argc read/write events
-            jniContext->readEvents = new cl_event[jniContext->argc];
-            if (jniContext->isProfilingEnabled()) {
-               jniContext->readEventArgs = new jint[jniContext->argc];
-            }
-            jniContext->writeEvents = new cl_event[jniContext->argc];
-            if (jniContext->isProfilingEnabled()) {
-               jniContext->writeEventArgs = new jint[jniContext->argc];
+               arg->sizeInBytes = sizeof(jbyte);
+            }else if (arg->isBoolean()){
+               if (arg->isStatic){
+                  arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "Z");
+               }else{
+                  arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "Z");
+               }
+               arg->sizeInBytes = sizeof(jboolean);
+            }else if (arg->isLong()){
+               if (arg->isStatic){
+                  arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "J");
+               }else{
+                  arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "J");
+               }
+               arg->sizeInBytes = sizeof(jlong);
+            }else if (arg->isDouble()){
+               if (arg->isStatic){
+                  arg->fieldID = jenv->GetStaticFieldID(jniContext->kernelClass, arg->name, "D");
+               }else{
+                  arg->fieldID = jenv->GetFieldID(jniContext->kernelClass, arg->name, "D");
+               }
+               arg->sizeInBytes = sizeof(jdouble);
             }
+         }else{ // we will use an array
+            arg->value.ref.mem = (cl_mem) 0;
+            arg->value.ref.javaArray = 0;
+            arg->sizeInBytes = 0;
          }
-         return(status);
-      }
-
-      JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_getLocalSizeJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jint globalSize, jint localBytesPerLocalId) {
-         size_t kernelMaxWorkGroupSize = 0;
-         size_t kernelWorkGroupSizeMultiple = 0;
-         cl_int status = CL_SUCCESS;
-         JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle);
-         if (jniContext != NULL){
-            clGetKernelWorkGroupInfo(jniContext->kernel, jniContext->deviceIds[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(kernelMaxWorkGroupSize), &kernelMaxWorkGroupSize, NULL);
-            ASSERT_CL("clGetKernelWorkGroupInfo()");
-            // starting value depends on device type
-            // not sure why the CPU has a different starting size, but it does
-            int startLocalSize = (jniContext->deviceType == CL_DEVICE_TYPE_GPU ? kernelMaxWorkGroupSize : globalSize/(jniContext->numProcessors*4));
-
-            if (startLocalSize == 0) startLocalSize = 1;
-            if (startLocalSize > kernelMaxWorkGroupSize) startLocalSize = kernelMaxWorkGroupSize;
-            if (startLocalSize > globalSize) startLocalSize = globalSize;
-            // if the kernel uses any local memory, determine our max local memory size so we can possibly limit localsize
-            cl_ulong devLocalMemSize;
-            if (localBytesPerLocalId > 0) {
-               status = clGetDeviceInfo(jniContext->deviceIds[0], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &devLocalMemSize, NULL);
-               cl_uint localSizeLimitFromLocalMem = devLocalMemSize/localBytesPerLocalId;
-               if (startLocalSize > localSizeLimitFromLocalMem) startLocalSize = localSizeLimitFromLocalMem;
-               if (jniContext->isVerbose()){
-                  fprintf(stderr, "localBytesPerLocalId=%d, device localMemMax=%d, localSizeLimitFromLocalMem=%d\n",
-                        localBytesPerLocalId, (cl_uint) devLocalMemSize, localSizeLimitFromLocalMem);
-               }
-
+         if (jniContext->isVerbose()){
+            fprintf(stderr, "in setArgs arg %d %s type %08x\n", i, arg->name, arg->type);
+            if (arg->isLocal()){
+                 fprintf(stderr, "in setArgs arg %d %s is local\n", i, arg->name);
+            }else{
+                 fprintf(stderr, "in setArgs arg %d %s is *not* local\n", i, arg->name);
             }
+         }
 
-            // then iterate down until we find a localSize that divides globalSize equally
-            for (int localSize = startLocalSize; localSize>0; localSize--) {
-               if (globalSize % localSize == 0) {
-                  if (jniContext->isVerbose()){
-                     fprintf(stderr, "for globalSize=%d, stepping localSize from %d, returning localSize=%d\n", globalSize, startLocalSize, localSize);
-                  }
-                  return localSize;
-               }
-            }
+         //If an error occurred, return early so we report the first problem, not the last
+         if (jenv->ExceptionCheck() == JNI_TRUE) {
+            jniContext->argc = -1;
+            delete[] jniContext->args;
+            jniContext->args = NULL;
+            jniContext->firstRun = true;
+            return (status);
          }
-         // should never get this far
-         return 0;
+
       }
+      // we will need an executeEvent buffer for all devices
+      jniContext->executeEvents = new cl_event[jniContext->deviceIdc];
+
+      // We will need *at most* jniContext->argc read/write events
+      jniContext->readEvents = new cl_event[jniContext->argc];
+      if (jniContext->isProfilingEnabled()) {
+         jniContext->readEventArgs = new jint[jniContext->argc];
+      }
+      jniContext->writeEvents = new cl_event[jniContext->argc];
+      if (jniContext->isProfilingEnabled()) {
+         jniContext->writeEventArgs = new jint[jniContext->argc];
+      }
+   }
+   return(status);
+}
 
-      JNIEXPORT jstring JNICALL Java_com_amd_aparapi_KernelRunner_getExtensions(JNIEnv *jenv, jobject jobj, jlong jniContextHandle) {
-         jstring jextensions = NULL;
-         JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle);
-         if (jniContext != NULL){
-            size_t retvalsize = 0;
-            cl_int status = CL_SUCCESS;
-            status = clGetDeviceInfo(jniContext->deviceIds[0], CL_DEVICE_EXTENSIONS, 0, NULL, &retvalsize);
-            ASSERT_CL("clGetDeviceInfo()");
-            char* extensions = new char[retvalsize];
-            clGetDeviceInfo(jniContext->deviceIds[0], CL_DEVICE_EXTENSIONS, retvalsize, extensions, NULL);
-            jextensions = jenv->NewStringUTF(extensions);
-            delete [] extensions;
-         }
-         return jextensions;
-      }
-
-      // Called as a result of Kernel.get(someArray)
-      JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_getJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jobject buffer) {
-         cl_int status = CL_SUCCESS;
-         JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle);
-         if (jniContext != NULL){
-            jboolean foundArg = false;
-            for (jint i=0; i<jniContext->argc; i++){ 
-               KernelArg *arg= jniContext->args[i];
-               if (arg->isArray()){
-                  jboolean isSame = jenv->IsSameObject(buffer, arg->value.ref.javaArray);
-                  // only do this if the array that we are passed is indeed an arg we are tracking
-                  if (isSame){
-                     foundArg = true;
-                     //fprintf(stderr, "get of %s\n", arg->name);
+
+
+JNIEXPORT jstring JNICALL Java_com_amd_aparapi_KernelRunner_getExtensionsJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle) {
+   jstring jextensions = NULL;
+   JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle);
+   if (jniContext != NULL){
+      size_t retvalsize = 0;
+      cl_int status = CL_SUCCESS;
+      status = clGetDeviceInfo(jniContext->deviceIds[0], CL_DEVICE_EXTENSIONS, 0, NULL, &retvalsize);
+      ASSERT_CL("clGetDeviceInfo()");
+      char* extensions = new char[retvalsize];
+      clGetDeviceInfo(jniContext->deviceIds[0], CL_DEVICE_EXTENSIONS, retvalsize, extensions, NULL);
+      jextensions = jenv->NewStringUTF(extensions);
+      delete [] extensions;
+   }
+   return jextensions;
+}
+
+// Called as a result of Kernel.get(someArray)
+JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_getJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jobject buffer) {
+   cl_int status = CL_SUCCESS;
+   JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle);
+   if (jniContext != NULL){
+      jboolean foundArg = false;
+      for (jint i=0; i<jniContext->argc; i++){ 
+         KernelArg *arg= jniContext->args[i];
+         if (arg->isArray()){
+            jboolean isSame = jenv->IsSameObject(buffer, arg->value.ref.javaArray);
+            // only do this if the array that we are passed is indeed an arg we are tracking
+            if (isSame){
+               foundArg = true;
+               //fprintf(stderr, "get of %s\n", arg->name);
 
 #ifdef VERBOSE_EXPLICIT
-                     fprintf(stderr, "explicitly reading buffer %d %s\n", i, arg->name);
+               fprintf(stderr, "explicitly reading buffer %d %s\n", i, arg->name);
 #endif
-                     arg->pin(jenv);
+               arg->pin(jenv);
 
-                     status = clEnqueueReadBuffer(jniContext->commandQueues[0], arg->value.ref.mem, CL_FALSE, 0, 
-                           arg->sizeInBytes,arg->value.ref.addr , 0, NULL, &jniContext->readEvents[0]);
-                     if (status != CL_SUCCESS) {
-                        PRINT_CL_ERR(status, "clEnqueueReadBuffer()");
-                        return status;
-                     }
-                     status = clWaitForEvents(1, jniContext->readEvents);
-                     if (status != CL_SUCCESS) {
-                        PRINT_CL_ERR(status, "clWaitForEvents");
-                        return status;
-                     }
-                     clReleaseEvent(jniContext->readEvents[0]);
-                     if (status != CL_SUCCESS) {
-                        PRINT_CL_ERR(status, "clReleaseEvent() read event");
-                        return status;
-                     }
-                     // since this is an explicit buffer get, we expect the buffer to have changed so we commit
-                     arg->unpinCommit(jenv);
-                  }
+               status = clEnqueueReadBuffer(jniContext->commandQueues[0], arg->value.ref.mem, CL_FALSE, 0, 
+                     arg->sizeInBytes,arg->value.ref.addr , 0, NULL, &jniContext->readEvents[0]);
+               if (status != CL_SUCCESS) {
+                  PRINT_CL_ERR(status, "clEnqueueReadBuffer()");
+                  return status;
                }
-            }
-            if (!foundArg){
-               if (jniContext->isVerbose()){
-                  fprintf(stderr, "attempt to request to get a buffer that does not appear to be referenced from kernel\n");
+               status = clWaitForEvents(1, jniContext->readEvents);
+               if (status != CL_SUCCESS) {
+                  PRINT_CL_ERR(status, "clWaitForEvents");
+                  return status;
                }
+               clReleaseEvent(jniContext->readEvents[0]);
+               if (status != CL_SUCCESS) {
+                  PRINT_CL_ERR(status, "clReleaseEvent() read event");
+                  return status;
+               }
+               // since this is an explicit buffer get, we expect the buffer to have changed so we commit
+               arg->unpinCommit(jenv);
             }
          }
-         return 0;
       }
+      if (!foundArg){
+         if (jniContext->isVerbose()){
+            fprintf(stderr, "attempt to request to get a buffer that does not appear to be referenced from kernel\n");
+         }
+      }
+   }
+   return 0;
+}
+
+JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_getMaxComputeUnitsJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle) {
+   cl_int status = CL_SUCCESS;
+   JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle);
+   if (jniContext != NULL){
+      return(jniContext->maxComputeUnits);
+   }else{
+      return(0);
+   }
+}
 
+JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_getMaxWorkItemDimensionsJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle) {
+   cl_int status = CL_SUCCESS;
+   JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle);
+   if (jniContext != NULL){
+      return(jniContext->maxWorkItemDimensions);
+   }else{
+      return(0);
+   }
+}
 
+JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_getMaxWorkGroupSizeJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle) {
+   cl_int status = CL_SUCCESS;
+   JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle);
+   if (jniContext != NULL){
+      return(jniContext->maxWorkGroupSize);
+   }else{
+      return(0);
+   }
+}
+
+JNIEXPORT jint JNICALL Java_com_amd_aparapi_KernelRunner_getMaxWorkItemSizeJNI(JNIEnv *jenv, jobject jobj, jlong jniContextHandle, jint _index) {
+   cl_int status = CL_SUCCESS;
+   JNIContext* jniContext = JNIContext::getJNIContext(jniContextHandle);
+   if (jniContext != NULL && _index >=0 && _index <= jniContext->maxWorkItemDimensions){
+      return(jniContext->maxWorkItemSizes[_index]);
+   }else{
+      return(0);
+   }
+}
diff --git a/com.amd.aparapi/build.xml b/com.amd.aparapi/build.xml
index c57f0cb31150f3b4d473d1497816519ccea49f44..099eee309ad340b72aebf7b865da2c3e6f398051 100644
--- a/com.amd.aparapi/build.xml
+++ b/com.amd.aparapi/build.xml
@@ -8,7 +8,7 @@
       <delete file="aparapi.jar"/>
    </target>
 
-   <target name="build">
+   <target name="build" depends="clean">
       <mkdir dir="classes"/>
       <javac destdir="classes" debug="on" includeAntRuntime="false" >
          <src path="src/java"/>
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/ClassModel.java b/com.amd.aparapi/src/java/com/amd/aparapi/ClassModel.java
index 57f259350c762d55c085b4bcf416c3f566e2d6a3..8d6b45b43174b897ec2b5029b1a93a68bd4c4ddf 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/ClassModel.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/ClassModel.java
@@ -2090,9 +2090,9 @@ class ClassModel{
       }
 
       public boolean isStatic() {
-    	  return (Access.STATIC.bitIsSet(methodAccessFlags));
+         return (Access.STATIC.bitIsSet(methodAccessFlags));
       }
-      
+
       AttributePool getAttributePool() {
          return (methodAttributePool);
       }
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/Config.java b/com.amd.aparapi/src/java/com/amd/aparapi/Config.java
index 46d2266a191eadbf983dd9540ba7c711dede66de..192edeac461ea8cf50e3dbc17d7e4399db8a222b 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/Config.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/Config.java
@@ -175,6 +175,7 @@ class Config{
    static String instructionListenerClassName = System.getProperty(propPkgName + ".instructionListenerClass");
 
    static public InstructionListener instructionListener = null;
+
    {
 
       if (instructionListenerClassName != null && !instructionListenerClassName.equals("")) {
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/DeprecatedException.java b/com.amd.aparapi/src/java/com/amd/aparapi/DeprecatedException.java
new file mode 100644
index 0000000000000000000000000000000000000000..70ca856e9cd07cde34185f34c976c813419177cd
--- /dev/null
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/DeprecatedException.java
@@ -0,0 +1,46 @@
+/*
+Copyright (c) 2010-2011, Advanced Micro Devices, Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
+following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list of conditions and the following
+disclaimer. 
+
+Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
+disclaimer in the documentation and/or other materials provided with the distribution. 
+
+Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export
+laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through
+774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000.  Further, pursuant to Section 740.6 of the EAR,
+you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of 
+Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration 
+Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1,
+E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups
+D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject
+to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774
+of EAR).  For the most current Country Group listings, or for additional information about the EAR or your obligations
+under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. 
+
+*/
+package com.amd.aparapi;
+
+@SuppressWarnings("serial") class DeprecatedException extends AparapiException{
+
+   DeprecatedException(String msg) {
+      super(msg);
+   }
+
+}
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java b/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java
index c6dd14a1332086db13894c25c700e11500dba77d..abef24b2bf16ab2d57ecb2341f5b283cf51eda0e 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/Kernel.java
@@ -89,16 +89,17 @@ import com.amd.aparapi.ClassModel.ConstantPool.MethodReferenceEntry;
  *     }
  * </pre></blockquote>
  * <p>
- * To execute this kernel, first create a new instance of it and then call <code>execute(int globalSize)</code>. 
+ * To execute this kernel, first create a new instance of it and then call <code>execute(Range _range)</code>. 
  * <p>
  * <blockquote><pre>
  *     int[] values = new int[1024];
  *     // fill values array
+ *     Range range = Range.create(values.length); // create a range 0..1024
  *     SquareKernel kernel = new SquareKernel(values);
- *     kernel.execute(values.length);
+ *     kernel.execute(range);
  * </pre></blockquote>
  * <p>
- * When <code>execute()</code> returns, all the executions of Kernel.run() have completed and the results are available in the <code>squares</code> array.
+ * When <code>execute(Range)</code> returns, all the executions of <code>Kernel.run()</code> have completed and the results are available in the <code>squares</code> array.
  * <p>
  * <blockquote><pre>
  *     int[] squares = kernel.getSquares();
@@ -110,16 +111,19 @@ import com.amd.aparapi.ClassModel.ConstantPool.MethodReferenceEntry;
  * A different approach to creating kernels that avoids extending Kernel is to write an anonymous inner class:
  * <p>
  * <blockquote><pre>
+ *   
  *     final int[] values = new int[1024];
- *     // fill values array
+ *     // fill the values array 
  *     final int[] squares = new int[values.length];
+ *     final Range range = Range.create(values.length);
+ *   
  *     Kernel kernel = new Kernel(){
  *         public void run() {
  *             int gid = getGlobalID();
  *             squares[gid] = values[gid]*values[gid];
  *         }
  *     };
- *     kernel.execute(values.length);
+ *     kernel.execute(range);
  *     for (int i=0; i< values.length; i++){
  *        System.out.printf("%4d %4d %8d\n", i, values[i], squares[i]);
  *     }
@@ -141,15 +145,52 @@ public abstract class Kernel implements Cloneable{
    }
 
    @Retention(RetentionPolicy.RUNTIME) @interface OpenCLDelegate {
+
    }
 
+   /**
+    *  We can use this Annotation to 'tag' intended local buffers. 
+    *  
+    *  So we can either annotate the buffer
+    *  <pre><code>
+    *  &#64Local int[] buffer = new int[1024];
+    *  </code></pre>
+    *   Or use a special suffix 
+    *  <pre><code>
+    *  int[] buffer_$local$ = new int[1024];
+    *  </code></pre>
+    *  
+    *  @see LOCAL_SUFFIX
+    * 
+    * 
+    */
+   public @Retention(RetentionPolicy.RUNTIME) @interface Local {
+
+   }
+
+   /**
+    *  We can use this suffix to 'tag' intended local buffers. 
+    *  
+    *  
+    *  So either name the buffer 
+    *  <pre><code>
+    *  int[] buffer_$local$ = new int[1024];
+    *  </code></pre>
+    *  Or use the Annotation form 
+    *  <pre><code>
+    *  &#64Local int[] buffer = new int[1024];
+    *  </code></pre>
+    */
+
+   final static String LOCAL_SUFFIX = "_$local$";
+
    private static Logger logger = Logger.getLogger(Config.getLoggerName());
 
    public abstract class Entry{
       public abstract void run();
 
-      public Kernel execute(int _globalSize) {
-         return (Kernel.this.execute("foo", _globalSize, 1));
+      public Kernel execute(Range _range) {
+         return (Kernel.this.execute("foo", _range, 1));
       }
    }
 
@@ -292,26 +333,30 @@ public abstract class Kernel implements Cloneable{
 
    private EXECUTION_MODE executionMode = EXECUTION_MODE.getDefaultExecutionMode();
 
-   private int globalId;
-
-   private int localId;
-
-   private int localSize;
+   int[] globalId = new int[] {
+         0,
+         0,
+         0
+   };
 
-   private int globalSize;
+   int[] localId = new int[] {
+         0,
+         0,
+         0
+   };
 
-   private int groupId;
+   int[] groupId = new int[] {
+         0,
+         0,
+         0
+   };
 
-   private int passId;
+   Range range;
 
-   private int numGroups;
+   int passId;
 
    volatile CyclicBarrier localBarrier;
 
-   void setGlobalId(int _globalId) {
-      globalId = _globalId;
-   }
-
    /**
     * Determine the globalId of an executing kernel.
     * <p>
@@ -349,19 +394,26 @@ public abstract class Kernel implements Cloneable{
     */
 
    @OpenCLDelegate protected final int getGlobalId() {
-      return (globalId);
+      return (getGlobalId(0));
    }
 
-   void setGroupId(int _groupId) {
-      groupId = _groupId;
-
+   @OpenCLDelegate protected final int getGlobalId(int _dim) {
+      return (globalId[_dim]);
    }
 
-   void setPassId(int _passId) {
-      passId = _passId;
+   /*
+      @OpenCLDelegate protected final int getGlobalX() {
+         return (getGlobalId(0));
+      }
 
-   }
+      @OpenCLDelegate protected final int getGlobalY() {
+         return (getGlobalId(1));
+      }
 
+      @OpenCLDelegate protected final int getGlobalZ() {
+         return (getGlobalId(2));
+      }
+   */
    /**
     * Determine the groupId of an executing kernel.
     * <p>
@@ -394,9 +446,26 @@ public abstract class Kernel implements Cloneable{
     * @return The groupId for this Kernel being executed
     */
    @OpenCLDelegate protected final int getGroupId() {
-      return (groupId);
+      return (getGroupId(0));
    }
 
+   @OpenCLDelegate protected final int getGroupId(int _dim) {
+      return (groupId[_dim]);
+   }
+
+   /*
+      @OpenCLDelegate protected final int getGroupX() {
+         return (getGroupId(0));
+      }
+
+      @OpenCLDelegate protected final int getGroupY() {
+         return (getGroupId(1));
+      }
+
+      @OpenCLDelegate protected final int getGroupZ() {
+         return (getGroupId(2));
+      }
+   */
    /**
     * Determine the passId of an executing kernel.
     * <p>
@@ -416,10 +485,6 @@ public abstract class Kernel implements Cloneable{
       return (passId);
    }
 
-   void setLocalId(int _localId) {
-      localId = _localId;
-   }
-
    /**
     * Determine the local id of an executing kernel.
     * <p>
@@ -451,9 +516,26 @@ public abstract class Kernel implements Cloneable{
     * @return The local id for this Kernel being executed
     */
    @OpenCLDelegate protected final int getLocalId() {
-      return (localId);
+      return (getLocalId(0));
    }
 
+   @OpenCLDelegate protected final int getLocalId(int _dim) {
+      return (localId[_dim]);
+   }
+
+   /*
+      @OpenCLDelegate protected final int getLocalX() {
+         return (getLocalId(0));
+      }
+
+      @OpenCLDelegate protected final int getLocalY() {
+         return (getLocalId(1));
+      }
+
+      @OpenCLDelegate protected final int getLocalZ() {
+         return (getLocalId(2));
+      }
+   */
    /**
     * Determine the size of the group that an executing kernel is a member of.
     * <p>
@@ -472,9 +554,26 @@ public abstract class Kernel implements Cloneable{
     * @return The size of the currently executing group.
     */
    @OpenCLDelegate protected final int getLocalSize() {
-      return (localSize);
+      return (range.getLocalSize(0));
+   }
+
+   @OpenCLDelegate protected final int getLocalSize(int _dim) {
+      return (range.getLocalSize(_dim));
    }
 
+   /*
+      @OpenCLDelegate protected final int getLocalWidth() {
+         return (range.getLocalSize(0));
+      }
+
+      @OpenCLDelegate protected final int getLocalHeight() {
+         return (range.getLocalSize(1));
+      }
+
+      @OpenCLDelegate protected final int getLocalDepth() {
+         return (range.getLocalSize(2));
+      }
+   */
    /**
     * Determine the value that was passed to <code>Kernel.execute(int globalSize)</code> method.
     * 
@@ -486,14 +585,26 @@ public abstract class Kernel implements Cloneable{
     * @return The value passed to <code>Kernel.execute(int globalSize)</code> causing the current execution.
     */
    @OpenCLDelegate protected final int getGlobalSize() {
-      return (globalSize);
+      return (range.getGlobalSize(0));
    }
 
-   void setNumGroups(int _numGroups) {
-      numGroups = _numGroups;
-
+   @OpenCLDelegate protected final int getGlobalSize(int _dim) {
+      return (range.getGlobalSize(_dim));
    }
 
+   /*
+      @OpenCLDelegate protected final int getGlobalWidth() {
+         return (range.getGlobalSize(0));
+      }
+
+      @OpenCLDelegate protected final int getGlobalHeight() {
+         return (range.getGlobalSize(1));
+      }
+
+      @OpenCLDelegate protected final int getGlobalDepth() {
+         return (range.getGlobalSize(2));
+      }
+   */
    /**
     * Determine the number of groups that will be used to execute a kernel
     * <p>
@@ -509,9 +620,26 @@ public abstract class Kernel implements Cloneable{
     * @return The number of groups that kernels will be dispatched into.
     */
    @OpenCLDelegate protected final int getNumGroups() {
-      return (numGroups);
+      return (range.getNumGroups(0));
+   }
+
+   @OpenCLDelegate protected final int getNumGroups(int _dim) {
+      return (range.getNumGroups(_dim));
    }
 
+   /*
+      @OpenCLDelegate protected final int getNumGroupsWidth() {
+         return (range.getGroups(0));
+      }
+
+      @OpenCLDelegate protected final int getNumGroupsHeight() {
+         return (range.getGroups(1));
+      }
+
+      @OpenCLDelegate protected final int getNumGroupsDepth() {
+         return (range.getGroups(2));
+      }
+   */
    /**
     * The entry point of a kernel. 
     *  
@@ -529,9 +657,21 @@ public abstract class Kernel implements Cloneable{
    @Override protected Object clone() {
       try {
          Kernel worker = (Kernel) super.clone();
-         // if there are any private buffers, go thru the fields here
-         // and allocate a new instance for each clone
-
+         worker.groupId = new int[] {
+               0,
+               0,
+               0
+         };
+         worker.localId = new int[] {
+               0,
+               0,
+               0
+         };
+         worker.globalId = new int[] {
+               0,
+               0,
+               0
+         };
          return worker;
       } catch (CloneNotSupportedException e) {
          // TODO Auto-generated catch block
@@ -1373,23 +1513,12 @@ public abstract class Kernel implements Cloneable{
     * Java version is identical to localBarrier()
     * 
     * @annotion Experimental
+    * @deprecated
     */
 
-   @OpenCLDelegate @Annotations.Experimental protected final void globalBarrier() {
-      try {
-         localBarrier.await();
-      } catch (InterruptedException e) {
-         // TODO Auto-generated catch block
-         e.printStackTrace();
-      } catch (BrokenBarrierException e) {
-         // TODO Auto-generated catch block
-         e.printStackTrace();
-      }
-   }
-
-   final void setSizes(int _globalSize, int _localSize) {
-      localSize = _localSize;
-      globalSize = _globalSize;
+   @OpenCLDelegate @Annotations.Experimental @Deprecated() protected final void globalBarrier() throws DeprecatedException {
+      throw new DeprecatedException(
+            "Kernel.globalBarrier() has been deprecated. It was based an incorrect understanding of OpenCL functionality.");
 
    }
 
@@ -1441,23 +1570,38 @@ public abstract class Kernel implements Cloneable{
    }
 
    /**
-    * Start execution of <code>globalSize</code> kernels.
+    * Start execution of <code>_range</code> kernels.
     * <p>
     * When <code>kernel.execute(globalSize)</code> is invoked, Aparapi will schedule the execution of <code>globalSize</code> kernels. If the execution mode is GPU then 
     * the kernels will execute as OpenCL code on the GPU device. Otherwise, if the mode is JTP, the kernels will execute as a pool of Java threads on the CPU. 
     * <p>
-    * @param _globalSize The number of Kernels that we would like to initiate.
+    * @param range The number of Kernels that we would like to initiate.
+    * @returnThe Kernel instance (this) so we can chain calls to put(arr).execute(range).get(arr)
+    * 
+    */
+   public synchronized Kernel execute(Range _range) {
+      return (execute(_range, 1));
+   }
+
+   /**
+    * Start execution of <code>_range</code> kernels.
+    * <p>
+    * When <code>kernel.execute(_range)</code> is invoked, Aparapi will schedule the execution of <code>_range</code> kernels. If the execution mode is GPU then 
+    * the kernels will execute as OpenCL code on the GPU device. Otherwise, if the mode is JTP, the kernels will execute as a pool of Java threads on the CPU. 
+    * <p>
+    * Since adding the new <code>Range class</code> this method offers backward compatibility and merely defers to <code> return (execute(Range.create(_range), 1));</code>.
+    * @param _range The number of Kernels that we would like to initiate.
     * @returnThe Kernel instance (this) so we can chain calls to put(arr).execute(range).get(arr)
     * 
     */
-   public synchronized Kernel execute(int _globalSize) {
-      return (execute(_globalSize, 1));
+   public synchronized Kernel execute(int _range) {
+      return (execute(Range.create(_range), 1));
    }
 
    /**
-    * Start execution of <code>_passes</code> iterations of <code>globalSize</code> kernels.
+    * Start execution of <code>_passes</code> iterations of <code>_range</code> kernels.
     * <p>
-    * When <code>kernel.execute(globalSize, passes)</code> is invoked, Aparapi will schedule the execution of <code>globalSize</code> kernels. If the execution mode is GPU then 
+    * When <code>kernel.execute(_range, _passes)</code> is invoked, Aparapi will schedule the execution of <code>_reange</code> kernels. If the execution mode is GPU then 
     * the kernels will execute as OpenCL code on the GPU device. Otherwise, if the mode is JTP, the kernels will execute as a pool of Java threads on the CPU. 
     * <p>
     * @param _globalSize The number of Kernels that we would like to initiate.
@@ -1465,8 +1609,23 @@ public abstract class Kernel implements Cloneable{
     * @return The Kernel instance (this) so we can chain calls to put(arr).execute(range).get(arr)
     * 
     */
-   public synchronized Kernel execute(int _globalSize, int _passes) {
-      return (execute("run", _globalSize, _passes));
+   public synchronized Kernel execute(Range _range, int _passes) {
+      return (execute("run", _range, _passes));
+   }
+
+   /**
+    * Start execution of <code>_passes</code> iterations over the <code>_range</code> of kernels.
+    * <p>
+    * When <code>kernel.execute(_range)</code> is invoked, Aparapi will schedule the execution of <code>_range</code> kernels. If the execution mode is GPU then 
+    * the kernels will execute as OpenCL code on the GPU device. Otherwise, if the mode is JTP, the kernels will execute as a pool of Java threads on the CPU. 
+    * <p>
+    * Since adding the new <code>Range class</code> this method offers backward compatibility and merely defers to <code> return (execute(Range.create(_range), 1));</code>.
+    * @param _range The number of Kernels that we would like to initiate.
+    * @returnThe Kernel instance (this) so we can chain calls to put(arr).execute(range).get(arr)
+    * 
+    */
+   public synchronized Kernel execute(int _range, int _passes) {
+      return (execute(Range.create(_range), _passes));
    }
 
    /**
@@ -1480,12 +1639,12 @@ public abstract class Kernel implements Cloneable{
     * @return The Kernel instance (this) so we can chain calls to put(arr).execute(range).get(arr)
     * 
     */
-   public synchronized Kernel execute(Entry _entry, int _globalSize) {
+   public synchronized Kernel execute(Entry _entry, Range _range) {
       if (kernelRunner == null) {
          kernelRunner = new KernelRunner(this);
 
       }
-      return (kernelRunner.execute(_entry, _globalSize, 1));
+      return (kernelRunner.execute(_entry, _range, 1));
    }
 
    /**
@@ -1499,8 +1658,8 @@ public abstract class Kernel implements Cloneable{
     * @return The Kernel instance (this) so we can chain calls to put(arr).execute(range).get(arr)
     * 
     */
-   public synchronized Kernel execute(String _entrypoint, int _globalSize) {
-      return (execute(_entrypoint, _globalSize, 1));
+   public synchronized Kernel execute(String _entrypoint, Range _range) {
+      return (execute(_entrypoint, _range, 1));
 
    }
 
@@ -1515,12 +1674,12 @@ public abstract class Kernel implements Cloneable{
     * @return The Kernel instance (this) so we can chain calls to put(arr).execute(range).get(arr)
     * 
     */
-   public synchronized Kernel execute(String _entrypoint, int _globalSize, int _passes) {
+   public synchronized Kernel execute(String _entrypoint, Range _range, int _passes) {
       if (kernelRunner == null) {
          kernelRunner = new KernelRunner(this);
 
       }
-      return (kernelRunner.execute(_entrypoint, _globalSize, _passes));
+      return (kernelRunner.execute(_entrypoint, _range, _passes));
 
    }
 
@@ -1693,11 +1852,6 @@ public abstract class Kernel implements Cloneable{
       return (false);
    }
 
-   void setLocalSize(int _localSize) {
-      localSize = _localSize;
-
-   }
-
    // the flag useNullForLocalSize is useful for testing that what we compute for localSize is what OpenCL
    // would also compute if we passed in null.  In non-testing mode, we just call execute with the
    // same localSize that we computed in getLocalSizeJNI.  We don't want do publicize these of course.
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/KernelRunner.java b/com.amd.aparapi/src/java/com/amd/aparapi/KernelRunner.java
index 87e7de463bddcbc308df4189a204dca722f27a52..ab2f07f56a1a3cd01d1afc9ac4a5e5e5f9a20721 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/KernelRunner.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/KernelRunner.java
@@ -70,6 +70,9 @@ import com.amd.aparapi.Kernel.EXECUTION_MODE;
  *
  */
 class KernelRunner{
+   /**
+    * Be careful changing the name/type of this field as it is referenced from JNI code.
+    */
    public @interface UsedByJNICode {
 
    }
@@ -305,7 +308,7 @@ class KernelRunner{
     * 
     * @author gfrost
     */
-   @UsedByJNICode public static final int ARG_APARAPI_BUF_IS_DIRECT = 1 << 20;
+   // @UsedByJNICode public static final int ARG_APARAPI_BUF_IS_DIRECT = 1 << 20;
 
    /**
     * This 'bit' indicates that a particular <code>KernelArg</code> represents a <code>char</code> type (array or primitive).
@@ -447,7 +450,7 @@ class KernelRunner{
        * 
        * At present only set for AparapiLocalBuffer objs, JNI multiplies this by localSize
        */
-      @Annotations.Unused @UsedByJNICode public int bytesPerLocalSize;
+      //  @Annotations.Unused @UsedByJNICode public int bytesPerLocalWidth;
 
       /**
        * Only set for array objs, not used on JNI
@@ -527,21 +530,25 @@ class KernelRunner{
     * @param maxJTPLocalSize
     * @return
     */
-   @Annotations.DocMe private native static synchronized long initJNI(Kernel _kernel, int _flags, int numProcessors,
-         int maxJTPLocalSize);
+   @Annotations.DocMe private native static synchronized long initJNI(Kernel _kernel, int _flags);
 
    private native long buildProgramJNI(long _jniContextHandle, String _source);
 
    private native int setArgsJNI(long _jniContextHandle, KernelArg[] _args, int argc);
 
-   private native int runKernelJNI(long _jniContextHandle, int _globalSize, int _localSize, boolean _needSync,
-         boolean useNullForLocalSize, int _passes);
+   private native int runKernelJNI(long _jniContextHandle, Range _range, boolean _needSync, int _passes);
 
    private native int disposeJNI(long _jniContextHandle);
 
-   private native int getLocalSizeJNI(long _jniContextHandle, int _globalSize, int localBytesPerLocalId);
+   private native String getExtensionsJNI(long _jniContextHandle);
+
+   private native int getMaxWorkGroupSizeJNI(long _jniContextHandle);
+
+   private native int getMaxWorkItemSizeJNI(long _jniContextHandle, int _index);
+
+   private native int getMaxComputeUnitsJNI(long _jniContextHandle);
 
-   private native String getExtensions(long _jniContextHandle);
+   private native int getMaxWorkItemDimensionsJNI(long _jniContextHandle);
 
    private Set<String> capabilitiesSet;
 
@@ -635,34 +642,6 @@ class KernelRunner{
       return capabilitiesSet.contains(CL_KHR_GL_SHARING);
    }
 
-   private static int numCores = Runtime.getRuntime().availableProcessors();
-
-   private static int maxJTPLocalSize = Config.JTPLocalSizeMultiplier * numCores;
-
-   static int getMaxJTPLocalSize() {
-      return maxJTPLocalSize;
-   }
-
-   /**
-    * We need to match OpenCL's algorithm for localsize.
-    * 
-    * @param _globalSize
-    *          The globalsize requested by the user (via <code>Kernel.execute(globalSize)</code>)
-    * @return The value we use for JTP execution for localSize
-    */
-   private static int getJTPLocalSizeForGlobalSize(int _globalSize) {
-      // iterate down until we find a localSize that divides _globalSize equally
-      for (int localSize = getMaxJTPLocalSize(); localSize > 0; localSize--) {
-         if (_globalSize % localSize == 0) {
-            if (logger.isLoggable(Level.FINE)) {
-               logger.fine("executeJava: picking localSize=" + localSize + " for globalSize=" + _globalSize);
-            }
-            return localSize;
-         }
-      }
-      return 0;
-   }
-
    /**
     * Execute using a Java thread pool. Either because we were explicitly asked to do so, or because we 'fall back' after discovering an OpenCL issue.
     * 
@@ -672,108 +651,272 @@ class KernelRunner{
     *          The # of passes requested by the user (via <code>Kernel.execute(globalSize, passes)</code>). Note this is usually defaulted to 1 via <code>Kernel.execute(globalSize)</code>.
     * @return
     */
-   private long executeJava(final int _globalSize, final int _passes) {
+   private long executeJava(final Range _range, final int _passes) {
       if (logger.isLoggable(Level.FINE)) {
-         logger.fine("executeJava: _globalSize=" + _globalSize);
+         logger.fine("executeJava: range = " + _range);
       }
 
       if (kernel.getExecutionMode().equals(EXECUTION_MODE.SEQ)) {
 
-         kernel.localBarrier = new CyclicBarrier(1);
-
-         kernel.setSizes(_globalSize, 1);
+         /**
+          * SEQ mode is useful for testing trivial logic, but kernels which use SEQ mode cannot be used if the
+          * product of localSize(0..3) is >1.  So we can use multi-dim ranges but only if the local size is 1 in all dimensions. 
+          * 
+          * As a result of this barrier is only ever 1 work item wide and probably should be turned into a no-op. 
+          * 
+          * So we need to check if the range is valid here. If not we have no choice but to punt.
+          */
+         if (_range.getLocalSize(0) * _range.getLocalSize(1) * _range.getLocalSize(2) > 1) {
+            throw new IllegalStateException("Can't run range with group size >1 sequentially. Barriers would deadlock!");
+         }
 
-         kernel.setNumGroups(_globalSize);
-         Kernel worker = (Kernel) kernel.clone();
-         for (int passid = 0; passid < _passes; passid++) {
-            worker.setPassId(passid);
-            for (int id = 0; id < _globalSize; id++) {
-               worker.setGroupId(id);
-               worker.setGlobalId(id);
-               worker.setLocalId(0);
-               worker.run();
+         Kernel kernelClone = (Kernel) kernel.clone();
+         kernelClone.range = _range;
+         kernelClone.groupId[0] = 0;
+         kernelClone.groupId[1] = 0;
+         kernelClone.groupId[2] = 0;
+         kernelClone.localId[0] = 0;
+         kernelClone.localId[1] = 0;
+         kernelClone.localId[2] = 0;
+         kernelClone.localBarrier = new CyclicBarrier(1);
+         for (kernelClone.passId = 0; kernelClone.passId < _passes; kernelClone.passId++) {
+
+            if (_range.getDims() == 1) {
+               for (int id = 0; id < _range.getGlobalSize(0); id++) {
+                  kernelClone.globalId[0] = id;
+                  kernelClone.run();
+               }
+            } else if (_range.getDims() == 2) {
+               for (int x = 0; x < _range.getGlobalSize(0); x++) {
+                  kernelClone.globalId[0] = x;
+                  for (int y = 0; y < _range.getGlobalSize(1); y++) {
+                     kernelClone.globalId[1] = y;
+                     kernelClone.run();
+                  }
+               }
+            } else if (_range.getDims() == 3) {
+               for (int x = 0; x < _range.getGlobalSize(0); x++) {
+                  kernelClone.globalId[0] = x;
+                  for (int y = 0; y < _range.getGlobalSize(1); y++) {
+                     kernelClone.globalId[1] = y;
+                     for (int z = 0; z < _range.getGlobalSize(2); z++) {
+                        kernelClone.globalId[2] = z;
+                        kernelClone.run();
+                     }
+                     kernelClone.run();
+                  }
+               }
             }
          }
+
       } else {
-         // note uses of final so we can use in anonymous inner class
-         final int localSize = getJTPLocalSizeForGlobalSize(_globalSize);
-         // if (localSize == 0) return 0; // should never happen
-         final int numGroups = _globalSize / localSize;
-
-         // compute numThreadSets by multiplying localSize until bigger than numCores
-         final int numThreadSets = localSize >= numCores ? 1 : (numCores + (localSize - 1)) / localSize;
-         final int numThreads = numThreadSets * localSize;
-         // when dividing to get groupsPerThreadSet, round up
-         final int groupsPerThreadSet = (numGroups + (numThreadSets - 1)) / numThreadSets;
-         if (logger.isLoggable(Level.FINE)) {
-            logger.fine("executeJava: localSize=" + localSize + ", numThreads=" + numThreads + ", numThreadSets=" + numThreadSets
-                  + ", numGroups=" + numGroups);
-         }
 
-         Thread[] threads = new Thread[numThreads];
-         // joinBarrier that says all threads are finished
-         final CyclicBarrier joinBarrier = new CyclicBarrier(numThreads + 1);
-
-         // each threadSet shares a CyclicBarrier of size localSize
-         final CyclicBarrier localBarriers[] = new CyclicBarrier[numThreadSets];
-         kernel.setSizes(_globalSize, localSize);
-         kernel.setNumGroups(numGroups);
-         for (int passid = 0; passid < _passes; passid++) {
-            kernel.setPassId(passid);
-            for (int thrSetId = 0; thrSetId < numThreadSets; thrSetId++) {
-               final int startGroupId = thrSetId * groupsPerThreadSet;
-               final int endGroupId = Math.min((thrSetId + 1) * groupsPerThreadSet, numGroups);
-               // System.out.println("thrSetId=" + thrSetId + " running groups from " + startGroupId + " thru " + (endGroupId-1));
-               localBarriers[thrSetId] = new CyclicBarrier(localSize);
-
-               // each threadSet has localSize threads
-               for (int lid = 0; lid < localSize; lid++) { // for each thread in threadSet
-                  final int localId = lid;
-                  final int threadId = thrSetId * localSize + localId;
-                  final Kernel worker = (Kernel) kernel.clone();
-                  worker.setLocalId(localId);
-                  worker.localBarrier = localBarriers[thrSetId]; // barrier that the kernel has access to
-
-                  threads[threadId] = new Thread(new Runnable(){
-                     @Override public void run() {
-                        for (int groupId = startGroupId; groupId < endGroupId; groupId++) {
-                           int globalId = (groupId * localSize) + localId;
-                           worker.setGroupId(groupId);
-                           worker.setGlobalId(globalId);
-                           // System.out.println("running worker with gid=" + globalId + ", lid=" + localId
-                           // + ", groupId=" + groupId + ", threadId=" + threadId);
-                           worker.run();
-                        }
-                        try {
-                           joinBarrier.await();
-                        } catch (InterruptedException e) {
-                           // TODO Auto-generated catch block
-                           e.printStackTrace();
-                        } catch (BrokenBarrierException e) {
-                           // TODO Auto-generated catch block
-                           e.printStackTrace();
-                        }
+         final int threads = _range.getLocalSize(0) * _range.getLocalSize(1) * _range.getLocalSize(2);
+         final int globalGroups = _range.getNumGroups(0) * _range.getNumGroups(1) * _range.getNumGroups(2);
+         final Thread threadArray[] = new Thread[threads];
+         /**
+          * This joinBarrier is the barrier that we provide for the kernel threads to rendezvous with the current dispatch thread.
+          * So this barrier is threadCount+1 wide (the +1 is for the dispatch thread)
+          */
+         final CyclicBarrier joinBarrier = new CyclicBarrier(threads + 1);
+
+         /**
+          * This localBarrier is only ever used by the kernels.  If the kernel does not use the barrier the threads 
+          * can get out of sync, we promised nothing in JTP mode.
+          *
+          * As with OpenCL all threads within a group must wait at the barrier or none.  It is a user error (possible deadlock!)
+          * if the barrier is in a conditional that is only executed by some of the threads within a group.
+          * 
+          * Kernel developer must understand this.
+          * 
+          * This barrier is threadCount wide.  We never hit the barrier from the dispatch thread.
+          */
+         final CyclicBarrier localBarrier = new CyclicBarrier(threads);
+
+         /**
+           * Note that we emulate OpenCL by creating one thread per localId (across the group).
+           * 
+           * So threadCount == range.getLocalSize(0)*range.getLocalSize(1)*range.getLocalSize(2);
+           * 
+           * For a 1D range of 12 groups of 4 we create 4 threads. One per localId(0).
+           * 
+           * We also clone the kernel 4 times. One per thread.
+           * 
+           * We create local barrier which has a width of 4
+           *         
+           *    Thread-0 handles localId(0) (global 0,4,8)
+           *    Thread-1 handles localId(1) (global 1,5,7)
+           *    Thread-2 handles localId(2) (global 2,6,10)
+           *    Thread-3 handles localId(3) (global 3,7,11)
+           *    
+           * This allows all threads to synchronize using the local barrier.
+           * 
+           * Initially the use of local buffers seems broken as the buffers appears to be per Kernel.
+           * Thankfully Kernel.clone() performs a shallow clone of all buffers (local and global)
+           * So each of the cloned kernels actually still reference the same underlying local/global buffers. 
+           * 
+           * If the kernel uses local buffers but does not use barriers then it is possible for different groups
+           * to see mutations from each other (unlike OpenCL), however if the kernel does not us barriers then it 
+           * cannot assume any coherence in OpenCL mode either (the failure mode will be different but still wrong) 
+           * 
+           * So even JTP mode use of local buffers will need to use barriers. Not for the same reason as OpenCL but to keep groups in lockstep.
+           * 
+           **/
+
+         for (int id = 0; id < threads; id++) {
+            final int threadId = id;
+
+            /**
+             *  We clone one kernel for each thread.
+             *  
+             *  They will all share references to the same range, localBarrier and global/local buffers because the clone is shallow.
+             *  We need clones so that each thread can assign 'state' (localId/globalId/groupId) without worrying 
+             *  about other threads.   
+             */
+            final Kernel kernelClone = (Kernel) kernel.clone();
+            kernelClone.range = _range;
+            kernelClone.localBarrier = localBarrier;
+
+            threadArray[threadId] = new Thread(new Runnable(){
+               @Override public void run() {
+                  for (int globalGroupId = 0; globalGroupId < globalGroups; globalGroupId++) {
+
+                     if (_range.getDims() == 1) {
+                        kernelClone.localId[0] = threadId % _range.getLocalSize(0);
+                        kernelClone.globalId[0] = threadId + globalGroupId * threads;
+                        kernelClone.groupId[0] = globalGroupId;
+                     } else if (_range.getDims() == 2) {
+
+                        /**
+                         * Consider a 12x4 grid of 4*2 local groups
+                         * <pre>
+                         *                                             threads = 4*2 = 8
+                         *                                             localWidth=4
+                         *                                             localHeight=2
+                         *                                             globalWidth=12
+                         *                                             globalHeight=4
+                         * 
+                         *    00 01 02 03 | 04 05 06 07 | 08 09 10 11  
+                         *    12 13 14 15 | 16 17 18 19 | 20 21 22 23
+                         *    ------------+-------------+------------
+                         *    24 25 26 27 | 28 29 30 31 | 32 33 34 35
+                         *    36 37 38 39 | 40 41 42 43 | 44 45 46 47  
+                         *    
+                         *    00 01 02 03 | 00 01 02 03 | 00 01 02 03  threadIds : [0..7]*6
+                         *    04 05 06 07 | 04 05 06 07 | 04 05 06 07
+                         *    ------------+-------------+------------
+                         *    00 01 02 03 | 00 01 02 03 | 00 01 02 03
+                         *    04 05 06 07 | 04 05 06 07 | 04 05 06 07  
+                         *    
+                         *    00 00 00 00 | 01 01 01 01 | 02 02 02 02  groupId[0] : 0..6 
+                         *    00 00 00 00 | 01 01 01 01 | 02 02 02 02   
+                         *    ------------+-------------+------------
+                         *    00 00 00 00 | 01 01 01 01 | 02 02 02 02  
+                         *    00 00 00 00 | 01 01 01 01 | 02 02 02 02
+                         *    
+                         *    00 00 00 00 | 00 00 00 00 | 00 00 00 00  groupId[1] : 0..6 
+                         *    00 00 00 00 | 00 00 00 00 | 00 00 00 00   
+                         *    ------------+-------------+------------
+                         *    01 01 01 01 | 01 01 01 01 | 01 01 01 01 
+                         *    01 01 01 01 | 01 01 01 01 | 01 01 01 01
+                         *         
+                         *    00 01 02 03 | 08 09 10 11 | 16 17 18 19  globalThreadIds == threadId + groupId * threads;
+                         *    04 05 06 07 | 12 13 14 15 | 20 21 22 23
+                         *    ------------+-------------+------------
+                         *    24 25 26 27 | 32[33]34 35 | 40 41 42 43
+                         *    28 29 30 31 | 36 37 38 39 | 44 45 46 47   
+                         *          
+                         *    00 01 02 03 | 00 01 02 03 | 00 01 02 03  localX = threadId % localWidth; (for globalThreadId 33 = threadId = 01 : 01%4 =1)
+                         *    00 01 02 03 | 00 01 02 03 | 00 01 02 03   
+                         *    ------------+-------------+------------
+                         *    00 01 02 03 | 00[01]02 03 | 00 01 02 03 
+                         *    00 01 02 03 | 00 01 02 03 | 00 01 02 03
+                         *     
+                         *    00 00 00 00 | 00 00 00 00 | 00 00 00 00  localY = threadId /localWidth  (for globalThreadId 33 = threadId = 01 : 01/4 =0)
+                         *    01 01 01 01 | 01 01 01 01 | 01 01 01 01   
+                         *    ------------+-------------+------------
+                         *    00 00 00 00 | 00[00]00 00 | 00 00 00 00 
+                         *    01 01 01 01 | 01 01 01 01 | 01 01 01 01
+                         *     
+                         *    00 01 02 03 | 04 05 06 07 | 08 09 10 11  globalX=
+                         *    00 01 02 03 | 04 05 06 07 | 08 09 10 11     groupsPerLineWidth=globalWidth/localWidth (=12/4 =3)
+                         *    ------------+-------------+------------     groupInset =groupId%groupsPerLineWidth (=4%3 = 1)
+                         *    00 01 02 03 | 04[05]06 07 | 08 09 10 11 
+                         *    00 01 02 03 | 04 05 06 07 | 08 09 10 11     globalX = groupInset*localWidth+localX (= 1*4+1 = 5)
+                         *     
+                         *    00 00 00 00 | 00 00 00 00 | 00 00 00 00  globalY
+                         *    01 01 01 01 | 01 01 01 01 | 01 01 01 01      
+                         *    ------------+-------------+------------
+                         *    02 02 02 02 | 02[02]02 02 | 02 02 02 02 
+                         *    03 03 03 03 | 03 03 03 03 | 03 03 03 03
+                         *    
+                         * </pre>
+                         * Assume we are trying to locate the id's for #33 
+                         *
+                         */
+
+                        kernelClone.localId[0] = threadId % _range.getLocalSize(0); // threadId % localWidth =  (for 33 = 1 % 4 = 1)
+                        kernelClone.localId[1] = threadId / _range.getLocalSize(0); // threadId / localWidth = (for 33 = 1 / 4 == 0)
+
+                        int groupInset = globalGroupId % _range.getNumGroups(0); // 4%3 = 1
+                        kernelClone.globalId[0] = groupInset * _range.getLocalSize(0) + kernelClone.localId[0]; // 1*4+1=5
+
+                        int completeLines = (globalGroupId / _range.getNumGroups(0)) * _range.getLocalSize(1);// (4/3) * 2
+                        kernelClone.globalId[1] = completeLines + kernelClone.localId[1]; // 2+0 = 2
+                        kernelClone.groupId[0] = globalGroupId % _range.getNumGroups(0);
+                        kernelClone.groupId[1] = globalGroupId / _range.getNumGroups(0);
+                     } else if (_range.getDims() == 3) {
+
+                        //Same as 2D actually turns out that localId[0] is identical for all three dims so could be hoisted out of conditional code
+
+                        kernelClone.localId[0] = threadId % _range.getLocalSize(0);
+
+                        kernelClone.localId[1] = (threadId / _range.getLocalSize(0)) % _range.getLocalSize(1);
+
+                        // the thread id's span WxHxD so threadId/(WxH) should yield the local depth  
+                        kernelClone.localId[2] = threadId / (_range.getLocalSize(0) * _range.getLocalSize(1));
+
+                        kernelClone.globalId[0] = (globalGroupId % _range.getNumGroups(0)) * _range.getLocalSize(0)
+                              + kernelClone.localId[0];
+
+                        kernelClone.globalId[1] = ((globalGroupId / _range.getNumGroups(0)) * _range.getLocalSize(1))
+                              % _range.getGlobalSize(1) + kernelClone.localId[1];
+
+                        kernelClone.globalId[2] = (globalGroupId / (_range.getNumGroups(0) * _range.getNumGroups(1)))
+                              * _range.getLocalSize(2) + kernelClone.localId[2];
+
+                        kernelClone.groupId[0] = globalGroupId % _range.getNumGroups(0);
+                        kernelClone.groupId[1] = (globalGroupId / _range.getNumGroups(0)) % _range.getNumGroups(1);
+                        kernelClone.groupId[2] = globalGroupId / (_range.getNumGroups(0) * _range.getNumGroups(1));
                      }
-                  });
-                  threads[threadId].start();
-               }
+                     kernelClone.run();
 
-               // this is where the main thread waits on the join barrier
-               try {
-                  joinBarrier.await();
-               } catch (InterruptedException e) {
-                  // TODO Auto-generated catch block
-                  e.printStackTrace();
-               } catch (BrokenBarrierException e) {
-                  // TODO Auto-generated catch block
-                  e.printStackTrace();
+                  }
+                  await(joinBarrier); // This thread will rendezvous with dispatch thread here. This is effectively a join.                  
                }
-            }
+            });
+            threadArray[threadId].setName("aparapi-" + threadId + "/" + threads);
+            threadArray[threadId].start();
+
          }
+         await(joinBarrier); // This dispatch thread waits for all worker threads here. 
+
       } // execution mode == JTP
       return 0;
    }
 
+   private static void await(CyclicBarrier _barrier) {
+      try {
+         _barrier.await();
+      } catch (InterruptedException e) {
+         // TODO Auto-generated catch block
+         e.printStackTrace();
+      } catch (BrokenBarrierException e) {
+         // TODO Auto-generated catch block
+         e.printStackTrace();
+      }
+   }
+
    private KernelArg[] args = null;
 
    private boolean usesOopConversion = false;
@@ -1036,12 +1179,8 @@ class KernelRunner{
       }
    }
 
-   // this routine now also finds out how many perLocalItem bytes are specified for this kernel
-   private int localBytesPerLocalId = 0;
-
    private boolean updateKernelArrayRefs() throws AparapiException {
       boolean needsSync = false;
-      localBytesPerLocalId = 0;
 
       for (int i = 0; i < argc; i++) {
          KernelArg arg = args[i];
@@ -1089,37 +1228,59 @@ class KernelRunner{
 
    // private int numAvailableProcessors = Runtime.getRuntime().availableProcessors();
 
-   private Kernel executeOpenCL(final String _entrypointName, final int _globalSize, final int _passes) throws AparapiException {
+   private Kernel executeOpenCL(final String _entrypointName, final Range _range, final int _passes) throws AparapiException {
+      if (_range.getDims() > getMaxWorkItemDimensionsJNI(jniContextHandle)) {
+         throw new RangeException("Range dim size " + _range.getDims() + " > device "
+               + getMaxWorkItemDimensionsJNI(jniContextHandle));
+      }
+      if (_range.getWorkGroupSize() > getMaxWorkGroupSizeJNI(jniContextHandle)) {
+         throw new RangeException("Range workgroup size " + _range.getWorkGroupSize() + " > device "
+               + getMaxWorkGroupSizeJNI(jniContextHandle));
+      }
+      /*
+            if (_range.getGlobalSize(0) > getMaxWorkItemSizeJNI(jniContextHandle, 0)) {
+               throw new RangeException("Range globalsize 0 " + _range.getGlobalSize(0) + " > device "
+                     + getMaxWorkItemSizeJNI(jniContextHandle, 0));
+            }
+            if (_range.getDims() > 1) {
+               if (_range.getGlobalSize(1) > getMaxWorkItemSizeJNI(jniContextHandle, 1)) {
+                  throw new RangeException("Range globalsize 1 " + _range.getGlobalSize(1) + " > device "
+                        + getMaxWorkItemSizeJNI(jniContextHandle, 1));
+               }
+               if (_range.getDims() > 2) {
+                  if (_range.getGlobalSize(2) > getMaxWorkItemSizeJNI(jniContextHandle, 2)) {
+                     throw new RangeException("Range globalsize 2 " + _range.getGlobalSize(2) + " > device "
+                           + getMaxWorkItemSizeJNI(jniContextHandle, 2));
+                  }
+               }
+            }
+      */
+
+      if (logger.isLoggable(Level.FINE)) {
+         logger.fine("maxComputeUnits=" + this.getMaxComputeUnitsJNI(jniContextHandle));
+         logger.fine("maxWorkGroupSize=" + this.getMaxWorkGroupSizeJNI(jniContextHandle));
+         logger.fine("maxWorkItemDimensions=" + this.getMaxWorkItemDimensionsJNI(jniContextHandle));
+         logger.fine("maxWorkItemSize(0)=" + getMaxWorkItemSizeJNI(jniContextHandle, 0));
+         if (_range.getDims() > 1) {
+            logger.fine("maxWorkItemSize(1)=" + getMaxWorkItemSizeJNI(jniContextHandle, 1));
+            if (_range.getDims() > 2) {
+               logger.fine("maxWorkItemSize(2)=" + getMaxWorkItemSizeJNI(jniContextHandle, 2));
+            }
+         }
+      }
 
       // Read the array refs after kernel may have changed them
       // We need to do this as input to computing the localSize
       assert args != null : "args should not be null";
       boolean needSync = updateKernelArrayRefs();
-
-      // note: the above will also recompute the value localBytesPerLocalId
-
-      int localSize = getLocalSizeJNI(jniContextHandle, _globalSize, localBytesPerLocalId);
-      if (localSize == 0) {
-         // should fall back to java?
-         logger.warning("getLocalSizeJNI failed, reverting java");
-         kernel.setFallbackExecutionMode();
-         return execute(_entrypointName, _globalSize, _passes);
-      }
-      assert localSize <= _globalSize : "localSize = " + localSize;
-
-      // Call back to kernel for last minute changes
-      kernel.setSizes(_globalSize, localSize);
-
       if (needSync && logger.isLoggable(Level.FINE)) {
          logger.fine("Need to resync arrays on " + kernel.getClass().getName());
       }
-
       // native side will reallocate array buffers if necessary
-      if (runKernelJNI(jniContextHandle, _globalSize, localSize, needSync, kernel.useNullForLocalSize, _passes) != 0) {
-         //System.out.println("CL exec seems to have failed");
+      if (runKernelJNI(jniContextHandle, _range, needSync, _passes) != 0) {
          logger.warning("### CL exec seems to have failed. Trying to revert to Java ###");
          kernel.setFallbackExecutionMode();
-         return execute(_entrypointName, _globalSize, _passes);
+         return execute(_entrypointName, _range, _passes);
       }
 
       if (usesOopConversion == true) {
@@ -1127,42 +1288,41 @@ class KernelRunner{
       }
 
       if (logger.isLoggable(Level.FINE)) {
-         logger.fine("executeOpenCL completed. _globalSize=" + _globalSize);
+         logger.fine("executeOpenCL completed. " + _range);
       }
       return kernel;
    }
 
-   synchronized Kernel execute(Kernel.Entry entry, final int _globalSize, final int _passes) {
+   synchronized Kernel execute(Kernel.Entry entry, final Range _range, final int _passes) {
       System.out.println("execute(Kernel.Entry, size) not implemented");
       return (kernel);
    }
 
-   synchronized private Kernel fallBackAndExecute(String _entrypointName, final int _globalSize, final int _passes) {
+   synchronized private Kernel fallBackAndExecute(String _entrypointName, final Range _range, final int _passes) {
 
       kernel.setFallbackExecutionMode();
-      return execute(_entrypointName, _globalSize, _passes);
+      return execute(_entrypointName, _range, _passes);
    }
 
-   synchronized private Kernel warnFallBackAndExecute(String _entrypointName, final int _globalSize, final int _passes,
+   synchronized private Kernel warnFallBackAndExecute(String _entrypointName, final Range _range, final int _passes,
          Exception _exception) {
       if (logger.isLoggable(Level.WARNING)) {
          logger.warning("Reverting to Java Thread Pool (JTP) for " + kernel.getClass() + ": " + _exception.getMessage());
          _exception.printStackTrace();
       }
-      return fallBackAndExecute(_entrypointName, _globalSize, _passes);
+      return fallBackAndExecute(_entrypointName, _range, _passes);
    }
 
-   synchronized private Kernel warnFallBackAndExecute(String _entrypointName, final int _globalSize, final int _passes,
-         String _excuse) {
+   synchronized private Kernel warnFallBackAndExecute(String _entrypointName, final Range _range, final int _passes, String _excuse) {
       logger.warning("Reverting to Java Thread Pool (JTP) for " + kernel.getClass() + ": " + _excuse);
-      return fallBackAndExecute(_entrypointName, _globalSize, _passes);
+      return fallBackAndExecute(_entrypointName, _range, _passes);
    }
 
-   synchronized Kernel execute(String _entrypointName, final int _globalSize, final int _passes) {
+   synchronized Kernel execute(String _entrypointName, final Range _range, final int _passes) {
 
       long executeStartTime = System.currentTimeMillis();
-      if (_globalSize == 0) {
-         throw new IllegalStateException("global size can't be 0");
+      if (_range == null) {
+         throw new IllegalStateException("range can't be null");
       }
 
       if (kernel.getExecutionMode().isOpenCL()) {
@@ -1173,7 +1333,7 @@ class KernelRunner{
                entryPoint = classModel.getEntrypoint(_entrypointName, kernel);
             } catch (Exception exception) {
 
-               return warnFallBackAndExecute(_entrypointName, _globalSize, _passes, exception);
+               return warnFallBackAndExecute(_entrypointName, _range, _passes, exception);
             }
             if ((entryPoint != null) && !entryPoint.shouldFallback()) {
 
@@ -1184,12 +1344,12 @@ class KernelRunner{
                jniFlags |= (kernel.getExecutionMode().equals(EXECUTION_MODE.GPU) ? JNI_FLAG_USE_GPU : 0);
                // Init the device to check capabilities before emitting the
                // code that requires the capabilities.
-               jniContextHandle = initJNI(kernel, jniFlags, Runtime.getRuntime().availableProcessors(), getMaxJTPLocalSize());
+               jniContextHandle = initJNI(kernel, jniFlags);
                if (jniContextHandle == 0) {
-                  return warnFallBackAndExecute(_entrypointName, _globalSize, _passes, "initJNI failed to return a valid handle");
+                  return warnFallBackAndExecute(_entrypointName, _range, _passes, "initJNI failed to return a valid handle");
                }
 
-               String extensions = getExtensions(jniContextHandle);
+               String extensions = getExtensionsJNI(jniContextHandle);
                capabilitiesSet = new HashSet<String>();
                StringTokenizer strTok = new StringTokenizer(extensions);
                while (strTok.hasMoreTokens()) {
@@ -1201,12 +1361,12 @@ class KernelRunner{
 
                if (entryPoint.requiresDoublePragma() && !hasFP64Support()) {
 
-                  return warnFallBackAndExecute(_entrypointName, _globalSize, _passes, "FP64 required but not supported");
+                  return warnFallBackAndExecute(_entrypointName, _range, _passes, "FP64 required but not supported");
                }
 
                if (entryPoint.requiresByteAddressableStorePragma() && !hasByteAddressableStoreSupport()) {
 
-                  return warnFallBackAndExecute(_entrypointName, _globalSize, _passes,
+                  return warnFallBackAndExecute(_entrypointName, _range, _passes,
                         "Byte addressable stores required but not supported");
                }
 
@@ -1215,24 +1375,16 @@ class KernelRunner{
 
                if (entryPoint.requiresAtomic32Pragma() && !all32AtomicsAvailable) {
 
-                  return warnFallBackAndExecute(_entrypointName, _globalSize, _passes, "32 bit Atomics required but not supported");
+                  return warnFallBackAndExecute(_entrypointName, _range, _passes, "32 bit Atomics required but not supported");
                }
 
-               final StringBuilder openCLStringBuilder = new StringBuilder();
-               KernelWriter openCLWriter = new KernelWriter(){
-                  @Override public void write(String _string) {
-                     openCLStringBuilder.append(_string);
-                  }
-               };
-
-               // Emit the OpenCL source into a string
+               String openCL = null;
                try {
-                  openCLWriter.write(entryPoint);
-
+                  openCL = KernelWriter.writeToString(entryPoint);
                } catch (CodeGenException codeGenException) {
-                  return warnFallBackAndExecute(_entrypointName, _globalSize, _passes, codeGenException);
+                  return warnFallBackAndExecute(_entrypointName, _range, _passes, codeGenException);
                }
-               String openCL = openCLStringBuilder.toString();
+
                if (Config.enableShowGeneratedOpenCL) {
                   System.out.println(openCL);
                }
@@ -1241,9 +1393,8 @@ class KernelRunner{
                }
 
                // Send the string to OpenCL to compile it
-               if (buildProgramJNI(jniContextHandle, openCLStringBuilder.toString()) == 0) {
-
-                  return warnFallBackAndExecute(_entrypointName, _globalSize, _passes, "OpenCL compile failed");
+               if (buildProgramJNI(jniContextHandle, openCL) == 0) {
+                  return warnFallBackAndExecute(_entrypointName, _range, _passes, "OpenCL compile failed");
                }
 
                args = new KernelArg[entryPoint.getReferencedFields().size()];
@@ -1256,9 +1407,15 @@ class KernelRunner{
                      args[i].name = field.getName();
                      args[i].field = field;
                      args[i].isStatic = (field.getModifiers() & Modifier.STATIC) == Modifier.STATIC;
-
                      Class<?> type = field.getType();
                      if (type.isArray()) {
+
+                        if (field.getAnnotation(com.amd.aparapi.Kernel.Local.class) != null
+                              || args[i].name.endsWith(Kernel.LOCAL_SUFFIX)) {
+                           args[i].type |= ARG_LOCAL;
+                        } else {
+                           args[i].type |= ARG_GLOBAL;
+                        }
                         args[i].array = null; // will get updated in updateKernelArrayRefs
                         args[i].type |= ARG_ARRAY;
                         if (isExplicit()) {
@@ -1269,7 +1426,7 @@ class KernelRunner{
                         args[i].type |= entryPoint.getArrayFieldAssignments().contains(field.getName()) ? (ARG_WRITE | ARG_READ)
                               : 0;
                         args[i].type |= entryPoint.getArrayFieldAccesses().contains(field.getName()) ? ARG_READ : 0;
-                        args[i].type |= ARG_GLOBAL;
+                        // args[i].type |= ARG_GLOBAL;
                         args[i].type |= type.isAssignableFrom(float[].class) ? ARG_FLOAT : 0;
 
                         args[i].type |= type.isAssignableFrom(int[].class) ? ARG_INT : 0;
@@ -1351,26 +1508,26 @@ class KernelRunner{
                conversionTime = System.currentTimeMillis() - executeStartTime;
 
                try {
-                  executeOpenCL(_entrypointName, _globalSize, _passes);
+                  executeOpenCL(_entrypointName, _range, _passes);
                } catch (AparapiException e) {
-                  warnFallBackAndExecute(_entrypointName, _globalSize, _passes, e);
+                  warnFallBackAndExecute(_entrypointName, _range, _passes, e);
                }
             } else {
-               warnFallBackAndExecute(_entrypointName, _globalSize, _passes, "failed to locate entrypoint");
+               warnFallBackAndExecute(_entrypointName, _range, _passes, "failed to locate entrypoint");
             }
 
          } else {
 
             try {
-               executeOpenCL(_entrypointName, _globalSize, _passes);
+               executeOpenCL(_entrypointName, _range, _passes);
             } catch (AparapiException e) {
 
-               warnFallBackAndExecute(_entrypointName, _globalSize, _passes, e);
+               warnFallBackAndExecute(_entrypointName, _range, _passes, e);
             }
          }
 
       } else {
-         executeJava(_globalSize, _passes);
+         executeJava(_range, _passes);
       }
       if (Config.enableExecutionModeReporting) {
          System.out.println(kernel.getClass().getCanonicalName() + ":" + kernel.getExecutionMode());
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/KernelWriter.java b/com.amd.aparapi/src/java/com/amd/aparapi/KernelWriter.java
index 7a1886bbce1e22624b2d2d77e1630bd5c1a6b718..a9eb0f28d1a34f31c1b8470b9d13b9e10ce5e977 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/KernelWriter.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/KernelWriter.java
@@ -43,10 +43,11 @@ import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 
+import com.amd.aparapi.ClassModel.ClassModelField;
 import com.amd.aparapi.ClassModel.AttributePool.LocalVariableTableEntry;
-import com.amd.aparapi.ClassModel.AttributePool.LocalVariableTableEntry.LocalVariableInfo;
 import com.amd.aparapi.ClassModel.AttributePool.RuntimeAnnotationsEntry;
-import com.amd.aparapi.ClassModel.ClassModelField;
+import com.amd.aparapi.ClassModel.AttributePool.LocalVariableTableEntry.LocalVariableInfo;
+import com.amd.aparapi.ClassModel.AttributePool.RuntimeAnnotationsEntry.AnnotationInfo;
 import com.amd.aparapi.ClassModel.ConstantPool.FieldEntry;
 import com.amd.aparapi.ClassModel.ConstantPool.MethodEntry;
 import com.amd.aparapi.InstructionSet.AccessArrayElement;
@@ -92,14 +93,47 @@ abstract class KernelWriter extends BlockWriter{
 
    final static Map<String, String> javaToCLIdentifierMap = new HashMap<String, String>();
    {
+
       javaToCLIdentifierMap.put("getGlobalId()I", "get_global_id(0)");
+      javaToCLIdentifierMap.put("getGlobalId(I)I", "get_global_id"); // no parenthesis if we are conveying args
+      javaToCLIdentifierMap.put("getGlobalX()I", "get_global_id(0)");
+      javaToCLIdentifierMap.put("getGlobalY()I", "get_global_id(1)");
+      javaToCLIdentifierMap.put("getGlobalZ()I", "get_global_id(2)");
+
       javaToCLIdentifierMap.put("getGlobalSize()I", "get_global_size(0)");
+      javaToCLIdentifierMap.put("getGlobalSize(I)I", "get_global_size"); // no parenthesis if we are conveying args
+      javaToCLIdentifierMap.put("getGlobalWidth()I", "get_global_size(0)");
+      javaToCLIdentifierMap.put("getGlobalHeight()I", "get_global_size(1)");
+      javaToCLIdentifierMap.put("getGlobalDepth()I", "get_global_size(2)");
+
       javaToCLIdentifierMap.put("getLocalId()I", "get_local_id(0)");
+      javaToCLIdentifierMap.put("getLocalId(I)I", "get_local_id"); // no parenthesis if we are conveying args
+      javaToCLIdentifierMap.put("getLocalX()I", "get_local_id(0)");
+      javaToCLIdentifierMap.put("getLocalY()I", "get_local_id(1)");
+      javaToCLIdentifierMap.put("getLocalZ()I", "get_local_id(2)");
+
       javaToCLIdentifierMap.put("getLocalSize()I", "get_local_size(0)");
+      javaToCLIdentifierMap.put("getLocalSize(I)I", "get_local_size"); // no parenthesis if we are conveying args
+      javaToCLIdentifierMap.put("getLocalWidth()I", "get_local_size(0)");
+      javaToCLIdentifierMap.put("getLocalHeight()I", "get_local_size(1)");
+      javaToCLIdentifierMap.put("getLocalDepth()I", "get_local_size(2)");
+
       javaToCLIdentifierMap.put("getNumGroups()I", "get_num_groups(0)");
+      javaToCLIdentifierMap.put("getNumGroups(I)I", "get_num_groups"); // no parenthesis if we are conveying args
+      javaToCLIdentifierMap.put("getNumGroupsX()I", "get_num_groups(0)");
+      javaToCLIdentifierMap.put("getNumGroupsY()I", "get_num_groups(1)");
+      javaToCLIdentifierMap.put("getNumGroupsZ()I", "get_num_groups(2)");
+
       javaToCLIdentifierMap.put("getGroupId()I", "get_group_id(0)");
+      javaToCLIdentifierMap.put("getGroupId(I)I", "get_group_id"); // no parenthesis if we are conveying args
+      javaToCLIdentifierMap.put("getGroupX()I", "get_group_id(0)");
+      javaToCLIdentifierMap.put("getGroupY()I", "get_group_id(1)");
+      javaToCLIdentifierMap.put("getGroupZ()I", "get_group_id(2)");
+
       javaToCLIdentifierMap.put("getPassId()I", "get_pass_id(this)");
+
       javaToCLIdentifierMap.put("localBarrier()V", "barrier(CLK_LOCAL_MEM_FENCE)");
+
       javaToCLIdentifierMap.put("globalBarrier()V", "barrier(CLK_GLOBAL_MEM_FENCE)");
 
    }
@@ -160,7 +194,19 @@ abstract class KernelWriter extends BlockWriter{
       if (barrierAndGetterMappings != null) {
          // this is one of the OpenCL barrier or size getter methods
          // write the mapping and exit
-         write(barrierAndGetterMappings);
+         if (argc > 0) {
+            write(barrierAndGetterMappings);
+            write("(");
+            for (int arg = 0; arg < argc; arg++) {
+               if ((arg != 0)) {
+                  write(", ");
+               }
+               writeInstruction(_methodCall.getArg(arg));
+            }
+            write(")");
+         } else {
+            write(barrierAndGetterMappings);
+         }
       } else {
 
          String intrinsicMapping = Kernel.getMappedMethodName(_methodEntry);
@@ -220,14 +266,17 @@ abstract class KernelWriter extends BlockWriter{
       newLine();
    }
 
+   public final static String __local = "__local";
+
+   public final static String __global = "__global";
+
+   public final static String LOCAL_ANNOTATION_NAME = "L" + Kernel.Local.class.getName().replace(".", "/") + ";";
+
    @Override void write(Entrypoint _entryPoint) throws CodeGenException {
       List<String> thisStruct = new ArrayList<String>();
       List<String> argLines = new ArrayList<String>();
       List<String> assigns = new ArrayList<String>();
 
-      // hack
-      // for (java.lang.reflect.Field f:_entryPoint.getTheClass().getDeclaredFields()){
-
       entryPoint = _entryPoint;
 
       for (ClassModelField field : _entryPoint.getReferencedClassModelFields()) {
@@ -239,13 +288,18 @@ abstract class KernelWriter extends BlockWriter{
          String signature = field.getDescriptor();
 
          boolean isPointer = false;
+
+         // check the suffix 
+         String type = field.getName().endsWith(Kernel.LOCAL_SUFFIX) ? __local : __global;
          RuntimeAnnotationsEntry visibleAnnotations = field.fieldAttributePool.getRuntimeVisibleAnnotationsEntry();
 
-         String type = "__global";
          if (visibleAnnotations != null) {
-            // for (AnnotationInfo ai : visibleAnnotations) {
-            // String typeDescriptor = ai.getTypeDescriptor();
-            // }
+            for (AnnotationInfo ai : visibleAnnotations) {
+               String typeDescriptor = ai.getTypeDescriptor();
+               if (typeDescriptor.equals(LOCAL_ANNOTATION_NAME)) {
+                  type = __local;
+               }
+            }
          }
 
          if (signature.startsWith("[")) {
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/MethodModel.java b/com.amd.aparapi/src/java/com/amd/aparapi/MethodModel.java
index 4d5900370e383665af9c244b789921e4f9f565de..96106eea8247bef026ce48cfbdb4bc9a8c863c46 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/MethodModel.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/MethodModel.java
@@ -117,8 +117,6 @@ class MethodModel{
    private boolean methodIsGetter;
 
    private boolean methodIsSetter;
-   
-   
 
    // Only setters can use putfield
    private boolean usesPutfield;
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/Range.java b/com.amd.aparapi/src/java/com/amd/aparapi/Range.java
new file mode 100644
index 0000000000000000000000000000000000000000..d0b59cadada5ebf414dcfccbf33528817932ac7e
--- /dev/null
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/Range.java
@@ -0,0 +1,392 @@
+package com.amd.aparapi;
+
+import java.util.Arrays;
+
+/**
+ * 
+ * A representation of 1, 2 or 3 dimensional range of execution. 
+ * 
+ * This class uses factory methods to allow one, two or three dimensional ranges to be created. 
+ * <br/>
+ * For a Kernel operating over the linear range 0..1024 without a specified groups size we would create a one dimensional <code>Range</code> using 
+ * <blockquote><pre>Range.create(1024);</pre></blockquote>
+ * To request the same linear range but with a groupSize of 64 (range must be a multiple of group size!) we would use
+ * <blockquote><pre>Range.create(1024,64);</pre></blockquote>
+ * To request a two dimensional range over a grid (0..width)x(0..height) where width==512 and height=256 we would use
+ * <blockquote><pre>
+ * int width=512;
+ * int height=256;
+ * Range.create2D(width,height)
+ * </pre></blockquote>
+ * Again the above does not specify the group size.  One will be chosen for you. If you want to specify the groupSize (say 16x8; 16 wide by 8 high) use
+ * <blockquote><pre>
+ * int width=512;
+ * int height=256;
+ * int groupWidth=16;
+ * int groupHeight=8;
+ * Range.create2D(width, height, groupWidth, groupHeight);
+ * </pre></blockquote>
+ * Finally we can request a three dimensional range using 
+ * <blockquote><pre>
+ * int width=512;
+ * int height=256;
+ * int depth=8;
+ * Range.create3D(width, height, depth);
+ * </pre></blockquote>
+ * And can specify a group size using 
+ * <blockquote><pre>
+ *  int width=512;
+ *  int height=256;
+ *  int depth=8;
+ *  int groupWidth=8;
+ *  int groupHeight=4;
+ *  int groupDepth=2
+ *  Range.create3D(width, height, depth, groupWidth, groupHeight, groupDepth);
+ * </pre></blockquote>
+ */
+public class Range{
+   @KernelRunner.UsedByJNICode private int globalSize_0 = 1;
+
+   @KernelRunner.UsedByJNICode private int localSize_0 = 1;
+
+   @KernelRunner.UsedByJNICode private int globalSize_1 = 1;
+
+   @KernelRunner.UsedByJNICode private int localSize_1 = 1;
+
+   @KernelRunner.UsedByJNICode private int globalSize_2 = 1;
+
+   @KernelRunner.UsedByJNICode private int localSize_2 = 1;
+
+   @KernelRunner.UsedByJNICode private int dims;
+
+   @KernelRunner.UsedByJNICode private boolean valid;
+
+   @KernelRunner.UsedByJNICode private boolean localIsDerived = false;
+
+   /**
+    * Get the localSize (of the group) given the requested dimension
+    * 
+    * @param _dim 0=width, 1=height, 2=depth
+    * @return The size of the group give the requested dimension
+    */
+   public int getLocalSize(int _dim) {
+      return (_dim == 0 ? localSize_0 : (_dim == 1 ? localSize_1 : localSize_2));
+   }
+
+   /**
+    * Get the globalSize (of the range) given the requested dimension
+    * 
+    * @param _dim 0=width, 1=height, 2=depth
+    * @return The size of the group give the requested dimension
+    */
+   public int getGlobalSize(int _dim) {
+      return (_dim == 0 ? globalSize_0 : (_dim == 1 ? globalSize_1 : globalSize_2));
+   }
+
+   private static final int THREADS_PER_CORE = 16;
+
+   private static final int MAX_OPENCL_GROUP_SIZE = 256;
+
+   private static final int MAX_GROUP_SIZE = Math.max(Runtime.getRuntime().availableProcessors() * THREADS_PER_CORE,
+         MAX_OPENCL_GROUP_SIZE);
+
+   /** 
+    * Create a one dimensional range <code>0.._globalWidth</code> which is processed in groups of size _localWidth.
+    * <br/>
+    * Note that for this range to be valid : </br> <strong><code>_globalWidth > 0 && _localWidth > 0 && _localWidth < MAX_GROUP_SIZE && _globalWidth % _localWidth==0</code></strong>
+    * 
+    * @param _globalWidth the overall range we wish to process
+    * @param _localWidth the size of the group we wish to process.
+    * @return A new Range with the requested dimensions
+    */
+   public static Range create(int _globalWidth, int _localWidth) {
+      Range range = new Range();
+      range.dims = 1;
+      range.globalSize_0 = _globalWidth;
+      range.localSize_0 = _localWidth;
+      range.valid = range.localSize_0 > 0 && range.localSize_0 < MAX_GROUP_SIZE && range.globalSize_0 % range.localSize_0 == 0;
+      return (range);
+   }
+
+   /**
+    * Determine the set of factors for a given value.
+    * @param _value The value we wish to factorize. 
+    * @return and array of factors of _value
+    */
+
+   private static int[] getFactors(int _value) {
+      int factors[] = new int[MAX_GROUP_SIZE];
+      int factorIdx = 0;
+      for (int possibleFactor = 1; possibleFactor <= MAX_GROUP_SIZE; possibleFactor++) {
+         if (_value % possibleFactor == 0) {
+            factors[factorIdx++] = possibleFactor;
+         }
+      }
+      return (Arrays.copyOf(factors, factorIdx));
+   }
+
+   /** 
+    * Create a one dimensional range <code>0.._globalWidth</code> with an undefined group size.
+    * <br/>
+    * Note that for this range to be valid :- </br> <strong><code>_globalWidth > 0 </code></strong>
+    * <br/>
+    * The groupsize will be chosen such that _localWidth > 0 && _localWidth < MAX_GROUP_SIZE && _globalWidth % _localWidth==0 is true
+    * 
+    * We extract the factors of _globalWidth and choose the highest value.
+    * 
+    * @param _globalWidth the overall range we wish to process
+    * @return A new Range with the requested dimensions
+    */
+   public static Range create(int _globalWidth) {
+      Range withoutLocal = create(_globalWidth, 1);
+      withoutLocal.localIsDerived = true;
+      int[] factors = getFactors(withoutLocal.globalSize_0);
+
+      withoutLocal.localSize_0 = factors[factors.length - 1];
+
+      withoutLocal.valid = withoutLocal.localSize_0 > 0 && withoutLocal.localSize_0 < MAX_GROUP_SIZE
+            && withoutLocal.globalSize_0 % withoutLocal.localSize_0 == 0;
+      return (withoutLocal);
+   }
+
+   /** 
+    * Create a two dimensional range 0.._globalWidth x 0.._globalHeight using a group which is _localWidth x _localHeight in size.
+    * <br/>
+    * Note that for this range to be valid  _globalWidth > 0 &&  _globalHeight >0 && _localWidth>0 && _localHeight>0 && _localWidth*_localHeight < MAX_GROUP_SIZE && _globalWidth%_localWidth==0 && _globalHeight%_localHeight==0.
+    * 
+    *  @param _globalWidth the overall range we wish to process
+    * @return
+    */
+   public static Range create2D(int _globalWidth, int _globalHeight, int _localWidth, int _localHeight) {
+      Range range = new Range();
+      range.dims = 2;
+      range.globalSize_0 = _globalWidth;
+      range.localSize_0 = _localWidth;
+      range.globalSize_1 = _globalHeight;
+      range.localSize_1 = _localHeight;
+      range.valid = range.localSize_0 > 0 && range.localSize_1 > 0 && range.localSize_0 * range.localSize_1 < MAX_GROUP_SIZE
+            && range.globalSize_0 % range.localSize_0 == 0 && range.globalSize_1 % range.localSize_1 == 0;
+
+      return (range);
+   }
+
+   /** 
+    * Create a two dimensional range <code>0.._globalWidth * 0.._globalHeight</code> choosing suitable values for <code>localWidth</code> and <code>localHeight</code>.
+    * <p>
+    * Note that for this range to be valid  <code>_globalWidth > 0 &&  _globalHeight >0 && _localWidth>0 && _localHeight>0 && _localWidth*_localHeight < MAX_GROUP_SIZE && _globalWidth%_localWidth==0 && _globalHeight%_localHeight==0</code>.
+    * 
+    * <p>
+    * To determine suitable values for <code>_localWidth</code> and <code>_localHeight</code> we extract the factors for <code>_globalWidth</code> and <code>_globalHeight</code> and then 
+    * find the largest product ( <code><= MAX_GROUP_SIZE</code>) with the lowest perimeter.
+    * 
+    * <p>
+    * For example for <code>MAX_GROUP_SIZE</code> of 16 we favor 4x4 over 1x16.
+    * 
+    * @param _globalWidth the overall range we wish to process
+    * @return
+    */
+   public static Range create2D(int _globalWidth, int _globalHeight) {
+      Range withoutLocal = create2D(_globalWidth, _globalHeight, 1, 1);
+      withoutLocal.localIsDerived = true;
+      int[] widthFactors = getFactors(_globalWidth);
+      int[] heightFactors = getFactors(_globalHeight);
+
+      withoutLocal.localSize_0 = 1;
+      withoutLocal.localSize_1 = 1;
+      int max = 1;
+      int perimeter = 0;
+      for (int w : widthFactors) {
+         for (int h : heightFactors) {
+            int size = w * h;
+            if (size > MAX_GROUP_SIZE) {
+               break;
+            }
+
+            if (size > max) {
+               max = size;
+               perimeter = w + h;
+               withoutLocal.localSize_0 = w;
+               withoutLocal.localSize_1 = h;
+            } else if (size == max) {
+               int localPerimeter = w + h;
+               if (localPerimeter < perimeter) {// is this the shortest perimeter so far
+                  perimeter = localPerimeter;
+                  withoutLocal.localSize_0 = w;
+                  withoutLocal.localSize_1 = h;
+               }
+            }
+         }
+      }
+
+      withoutLocal.valid = withoutLocal.localSize_0 > 0 && withoutLocal.localSize_1 > 0
+            && withoutLocal.localSize_0 * withoutLocal.localSize_1 < MAX_GROUP_SIZE
+            && withoutLocal.globalSize_0 % withoutLocal.localSize_0 == 0
+            && withoutLocal.globalSize_1 % withoutLocal.localSize_1 == 0;
+
+      return (withoutLocal);
+   }
+
+   /** 
+    * Create a two dimensional range <code>0.._globalWidth * 0.._globalHeight *0../_globalDepth</code> 
+    * in groups defined by  <code>localWidth</code> * <code>localHeight</code> * <code>localDepth</code>.
+    * <p>
+    * Note that for this range to be valid  <code>_globalWidth > 0 &&  _globalHeight >0 _globalDepth >0 && _localWidth>0 && _localHeight>0 && _localDepth>0 && _localWidth*_localHeight*_localDepth < MAX_GROUP_SIZE && _globalWidth%_localWidth==0 && _globalHeight%_localHeight==0 && _globalDepth%_localDepth==0</code>.
+    * 
+    * @param _globalWidth the width of the 3D grid we wish to process
+    * @param _globalHieght the height of the 3D grid we wish to process
+    * @param _globalDepth the depth of the 3D grid we wish to process
+    * @param _localWidth the width of the 3D group we wish to process
+    * @param _localHieght the height of the 3D group we wish to process
+    * @param _localDepth the depth of the 3D group we wish to process
+    * @return
+    */
+   public static Range create3D(int _globalWidth, int _globalHeight, int _globalDepth, int _localWidth, int _localHeight,
+         int _localDepth) {
+      Range range = new Range();
+      range.dims = 3;
+      range.globalSize_0 = _globalWidth;
+      range.localSize_0 = _localWidth;
+      range.globalSize_1 = _globalHeight;
+      range.localSize_1 = _localHeight;
+      range.globalSize_2 = _globalDepth;
+      range.localSize_2 = _localDepth;
+      range.valid = range.localSize_0 > 0 && range.localSize_1 > 0 && range.localSize_2 > 0
+            && range.localSize_0 * range.localSize_1 * range.localSize_2 < MAX_GROUP_SIZE
+            && range.globalSize_0 % range.localSize_0 == 0 && range.globalSize_1 % range.localSize_1 == 0
+            && range.globalSize_2 % range.localSize_2 == 0;
+
+      return (range);
+   }
+
+   /** 
+    * Create a two dimensional range <code>0.._globalWidth * 0.._globalHeight *0../_globalDepth</code> 
+    * choosing suitable values for <code>localWidth</code>, <code>localHeight</code> and <code>localDepth</code>.
+    * <p>
+     * Note that for this range to be valid  <code>_globalWidth > 0 &&  _globalHeight >0 _globalDepth >0 && _localWidth>0 && _localHeight>0 && _localDepth>0 && _localWidth*_localHeight*_localDepth < MAX_GROUP_SIZE && _globalWidth%_localWidth==0 && _globalHeight%_localHeight==0 && _globalDepth%_localDepth==0</code>.
+    * 
+    * <p>
+    * To determine suitable values for <code>_localWidth</code>,<code>_localHeight</code> and <code>_lodalDepth</code> we extract the factors for <code>_globalWidth</code>,<code>_globalHeight</code> and <code>_globalDepth</code> and then 
+    * find the largest product ( <code><= MAX_GROUP_SIZE</code>) with the lowest perimeter.
+    * 
+    * <p>
+    * For example for <code>MAX_GROUP_SIZE</code> of 64 we favor 4x4x4 over 1x16x16.
+    * 
+    * @param _globalWidth the width of the 3D grid we wish to process
+    * @param _globalHieght the height of the 3D grid we wish to process
+    * @param _globalDepth the depth of the 3D grid we wish to process
+    * @return
+    */
+   public static Range create3D(int _globalWidth, int _globalHeight, int _globalDepth) {
+      Range withoutLocal = create3D(_globalWidth, _globalHeight, _globalDepth, 1, 1, 1);
+      withoutLocal.localIsDerived = true;
+      int[] widthFactors = getFactors(_globalWidth);
+      int[] heightFactors = getFactors(_globalHeight);
+      int[] depthFactors = getFactors(_globalDepth);
+
+      withoutLocal.localSize_0 = 1;
+      withoutLocal.localSize_1 = 1;
+      withoutLocal.localSize_2 = 1;
+      int max = 1;
+      int perimeter = 0;
+      for (int w : widthFactors) {
+         for (int h : heightFactors) {
+            for (int d : depthFactors) {
+               int size = w * h * d;
+               if (size > MAX_GROUP_SIZE) {
+                  break;
+               }
+               if (size > max) {
+                  max = size;
+                  perimeter = w + h + d;
+                  withoutLocal.localSize_0 = w;
+                  withoutLocal.localSize_1 = h;
+                  withoutLocal.localSize_2 = d;
+               } else if (size == max) {
+                  int localPerimeter = w + h + d;
+                  if (localPerimeter < perimeter) { // is this the shortest perimeter so far
+                     perimeter = localPerimeter;
+                     withoutLocal.localSize_0 = w;
+                     withoutLocal.localSize_1 = h;
+                     withoutLocal.localSize_2 = d;
+                  }
+               }
+            }
+         }
+      }
+
+      withoutLocal.valid = withoutLocal.localSize_0 > 0 && withoutLocal.localSize_1 > 0 && withoutLocal.localSize_2 > 0
+            && withoutLocal.localSize_0 * withoutLocal.localSize_1 * withoutLocal.localSize_2 < MAX_GROUP_SIZE
+            && withoutLocal.globalSize_0 % withoutLocal.localSize_0 == 0
+            && withoutLocal.globalSize_1 % withoutLocal.localSize_1 == 0
+            && withoutLocal.globalSize_2 % withoutLocal.localSize_2 == 0;
+
+      return (withoutLocal);
+
+   }
+
+   /**
+    * Get the number of dims for this Range.  
+    * 
+    * @return 0, 1 or 2 for one dimensional, two dimensional and three dimensional range respectively.
+    */
+   public int getDims() {
+      return (dims);
+   }
+
+   /**
+    * Override {@link #toString()}
+    */
+   public String toString() {
+      StringBuilder sb = new StringBuilder();
+
+      switch (dims) {
+         case 1:
+
+            sb.append("global:" + globalSize_0 + " local:" + (localIsDerived ? "(derived)" : "") + localSize_0);
+
+            break;
+         case 2:
+            sb.append("2D(global:" + globalSize_0 + "x" + globalSize_1 + " local:" + (localIsDerived ? "(derived)" : "")
+                  + localSize_0 + "x" + localSize_1 + ")");
+            break;
+         case 3:
+            sb.append("3D(global:" + globalSize_0 + "x" + globalSize_1 + "x" + globalSize_2 + " local:"
+                  + (localIsDerived ? "(derived)" : "") + localSize_0 + "x" + localSize_1 + "x" + localSize_0 + ")");
+            break;
+
+      }
+      return (sb.toString());
+   }
+
+   /**
+    * Get the number of groups for the given dimension. 
+    * 
+    * <p>
+    * This will essentially return globalXXXX/localXXXX for the given dimension (width, height, depth)
+    * @param _dim The dim we are interested in 0, 1 or 2
+    * @return the number of groups for the given dimension. 
+    */
+
+   public int getNumGroups(int _dim) {
+      return (_dim == 0 ? (globalSize_0 / localSize_0) : (_dim == 1 ? (globalSize_1 / localSize_1) : (globalSize_2 / localSize_2)));
+   }
+
+   /**
+    * 
+    * @return The product of all valid localSize dimensions
+    */
+   public int getWorkGroupSize() {
+      return localSize_0 * localSize_1 * localSize_2;
+   }
+
+   /**
+    * Determine whether this Range is usable. 
+    * 
+    * @return true if this Range is usable/valid. 
+    */
+
+   public boolean isValid() {
+      return (valid);
+   }
+
+}
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/RangeException.java b/com.amd.aparapi/src/java/com/amd/aparapi/RangeException.java
new file mode 100644
index 0000000000000000000000000000000000000000..b3998f84d7bb621cb710a6f6b23159103520f747
--- /dev/null
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/RangeException.java
@@ -0,0 +1,46 @@
+/*
+Copyright (c) 2010-2011, Advanced Micro Devices, Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
+following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list of conditions and the following
+disclaimer. 
+
+Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
+disclaimer in the documentation and/or other materials provided with the distribution. 
+
+Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export
+laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through
+774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000.  Further, pursuant to Section 740.6 of the EAR,
+you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of 
+Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration 
+Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1,
+E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups
+D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject
+to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774
+of EAR).  For the most current Country Group listings, or for additional information about the EAR or your obligations
+under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. 
+
+*/
+package com.amd.aparapi;
+
+@SuppressWarnings("serial") class RangeException extends AparapiException{
+
+   RangeException(String msg) {
+      super(msg);
+   }
+
+}
diff --git a/examples/effects/src/com/amd/aparapi/examples/effects/Main.java b/examples/effects/src/com/amd/aparapi/examples/effects/Main.java
index dab3078f5f7a66f7aa6dc258bddfc3f111a46db9..5543d94c4373bce826f0c731d6e1faec5bb6752e 100644
--- a/examples/effects/src/com/amd/aparapi/examples/effects/Main.java
+++ b/examples/effects/src/com/amd/aparapi/examples/effects/Main.java
@@ -53,6 +53,7 @@ import javax.swing.JComponent;
 import javax.swing.JFrame;
 
 import com.amd.aparapi.Kernel;
+import com.amd.aparapi.Range;
 
 /**
  * An example Aparapi application which tracks the mouse and updates the color pallete of the window based on the distance from the mouse pointer. 
@@ -161,6 +162,8 @@ public class Main{
       /** Height of Mandelbrot view. */
       final int height = 1024;
 
+      final Range range = Range.create2D(width, height);
+
       /** The size of the pallette of pixel colors we choose from. */
       final int palletteSize = 128;
 
@@ -234,7 +237,7 @@ public class Main{
 
       int trailLastUpdatedPosition = 0;
 
-      kernel.execute(width * height);
+      kernel.execute(range);
       System.arraycopy(rgb, 0, imageRgb, 0, rgb.length);
       viewer.repaint();
 
@@ -268,7 +271,7 @@ public class Main{
          trailLastUpdatedPosition++;
 
          /** execute the kernel which calculates new pixel values **/
-         kernel.execute(width * height);
+         kernel.execute(range);
 
          /** copy the rgb values to the imageRgb buffer **/
          System.arraycopy(rgb, 0, imageRgb, 0, rgb.length);
diff --git a/examples/nbody/local.bat b/examples/nbody/local.bat
new file mode 100644
index 0000000000000000000000000000000000000000..9fe4eb04eb0d4e890106ca8ad0a8da5bfc3d2473
--- /dev/null
+++ b/examples/nbody/local.bat
@@ -0,0 +1,14 @@
+@echo off
+
+java ^
+  -Djava.library.path=..\..\com.amd.aparapi.jni;jogamp ^
+  -Dcom.amd.aparapi.executionMode=%1 ^
+  -Dcom.amd.aparapi.enableShowGeneratedOpenCL=true ^
+  -Dcom.amd.aparapi.enableVerboseJNI=false ^
+  -Dbodies=%2 ^
+  -Dheight=600 ^
+  -Dwidth=600 ^
+  -classpath jogamp\gluegen-rt.jar;jogamp\jogl.all.jar;..\..\com.amd.aparapi\aparapi.jar;nbody.jar ^
+  com.amd.aparapi.examples.nbody.Local
+
+
diff --git a/examples/nbody/src/com/amd/aparapi/examples/nbody/Local.java b/examples/nbody/src/com/amd/aparapi/examples/nbody/Local.java
new file mode 100644
index 0000000000000000000000000000000000000000..7fd272e29efe6a7387c19952f3753c24b83061ca
--- /dev/null
+++ b/examples/nbody/src/com/amd/aparapi/examples/nbody/Local.java
@@ -0,0 +1,356 @@
+/*
+Copyright (c) 2010-2011, Advanced Micro Devices, Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
+following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list of conditions and the following
+disclaimer. 
+
+Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
+disclaimer in the documentation and/or other materials provided with the distribution. 
+
+Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export
+laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through
+774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000.  Further, pursuant to Section 740.6 of the EAR,
+you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of 
+Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration 
+Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1,
+E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups
+D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject
+to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774
+of EAR).  For the most current Country Group listings, or for additional information about the EAR or your obligations
+under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. 
+
+*/
+package com.amd.aparapi.examples.nbody;
+
+import java.awt.BorderLayout;
+import java.awt.Dimension;
+import java.awt.FlowLayout;
+import java.awt.event.ActionEvent;
+import java.awt.event.ActionListener;
+import java.io.IOException;
+import java.io.InputStream;
+
+import javax.media.opengl.GL;
+import javax.media.opengl.GL2;
+import javax.media.opengl.GLAutoDrawable;
+import javax.media.opengl.GLCapabilities;
+import javax.media.opengl.GLEventListener;
+import javax.media.opengl.GLException;
+import javax.media.opengl.awt.GLCanvas;
+import javax.media.opengl.fixedfunc.GLLightingFunc;
+import javax.media.opengl.glu.GLU;
+import javax.swing.JButton;
+import javax.swing.JFrame;
+import javax.swing.JLabel;
+import javax.swing.JPanel;
+import javax.swing.JTextField;
+import javax.swing.WindowConstants;
+
+import com.amd.aparapi.Kernel;
+import com.amd.aparapi.Range;
+import com.jogamp.opengl.util.FPSAnimator;
+import com.jogamp.opengl.util.texture.Texture;
+import com.jogamp.opengl.util.texture.TextureIO;
+
+/**
+ * An NBody clone which uses local memory to cache NBody positions for execution.
+ * 
+ * http://www.browndeertechnology.com/docs/BDT_OpenCL_Tutorial_NBody-rev3.html
+ * 
+ * @see com.amd.aparapi.examples.nbody.Main
+ * 
+ * @author gfrost
+ *
+ */
+public class Local{
+
+   public static class NBodyKernel extends Kernel{
+      protected final float delT = .005f;
+
+      protected final float espSqr = 1.0f;
+
+      protected final float mass = 5f;
+
+      private final Range range;
+
+      private final float[] xyz; // positions xy and z of bodies
+
+      private final float[] vxyz; // velocity component of x,y and z of bodies 
+
+      @Local private final float[] localStuff; // local memory
+
+      /**
+       * Constructor initializes xyz and vxyz arrays.
+       * @param _bodies
+       */
+      public NBodyKernel(Range _range) {
+         range = _range;
+         localStuff = new float[range.getLocalSize(0) * 3];
+
+         xyz = new float[range.getGlobalSize(0) * 3];
+         vxyz = new float[range.getGlobalSize(0) * 3];
+         float maxDist = 20f;
+         for (int body = 0; body < range.getGlobalSize(0) * 3; body += 3) {
+            float theta = (float) (Math.random() * Math.PI * 2);
+            float phi = (float) (Math.random() * Math.PI * 2);
+            float radius = (float) (Math.random() * maxDist);
+
+            // get the 3D dimensional coordinates
+            xyz[body + 0] = (float) (radius * Math.cos(theta) * Math.sin(phi));
+            xyz[body + 1] = (float) (radius * Math.sin(theta) * Math.sin(phi));
+            xyz[body + 2] = (float) (radius * Math.cos(phi));
+
+            // divide into two 'spheres of bodies' by adjusting x 
+            if (body % 2 == 0) {
+               xyz[body + 0] += maxDist * 1.5;
+            } else {
+               xyz[body + 0] -= maxDist * 1.5;
+            }
+         }
+         setExplicit(true);
+      }
+
+      /** 
+       * Here is the kernel entrypoint. Here is where we calculate the position of each body
+       */
+      @Override public void run() {
+
+         int globalId = getGlobalId(0) * 3;
+
+         float accx = 0.f;
+         float accy = 0.f;
+         float accz = 0.f;
+         float myPosx = xyz[globalId + 0];
+         float myPosy = xyz[globalId + 1];
+         float myPosz = xyz[globalId + 2];
+
+         for (int tile = 0; tile < getGlobalSize(0) / getLocalSize(0); tile++) {
+            // load one tile into local memory
+            int gidx = (tile * getLocalSize(0) + getLocalId()) * 3;
+            int lidx = getLocalId(0) * 3;
+            localStuff[lidx + 0] = xyz[gidx + 0];
+            localStuff[lidx + 1] = xyz[gidx + 1];
+            localStuff[lidx + 2] = xyz[gidx + 2];
+            // Synchronize to make sure data is available for processing
+            localBarrier();
+
+            for (int i = 0; i < getLocalSize() * 3; i += 3) {
+               float dx = localStuff[i + 0] - myPosx;
+               float dy = localStuff[i + 1] - myPosy;
+               float dz = localStuff[i + 2] - myPosz;
+               float invDist = rsqrt((dx * dx) + (dy * dy) + (dz * dz) + espSqr);
+               float s = mass * invDist * invDist * invDist;
+               accx = accx + s * dx;
+               accy = accy + s * dy;
+               accz = accz + s * dz;
+            }
+            localBarrier();
+         }
+         accx = accx * delT;
+         accy = accy * delT;
+         accz = accz * delT;
+         xyz[globalId + 0] = myPosx + vxyz[globalId + 0] * delT + accx * .5f * delT;
+         xyz[globalId + 1] = myPosy + vxyz[globalId + 1] * delT + accy * .5f * delT;
+         xyz[globalId + 2] = myPosz + vxyz[globalId + 2] * delT + accz * .5f * delT;
+
+         vxyz[globalId + 0] = vxyz[globalId + 0] + accx;
+         vxyz[globalId + 1] = vxyz[globalId + 1] + accy;
+         vxyz[globalId + 2] = vxyz[globalId + 2] + accz;
+      }
+
+      /**
+       * Render all particles to the OpenGL context
+       * @param gl
+       */
+
+      protected void render(GL2 gl) {
+         gl.glBegin(GL2.GL_QUADS);
+
+         for (int i = 0; i < range.getGlobalSize(0) * 3; i += 3) {
+            gl.glTexCoord2f(0, 1);
+            gl.glVertex3f(xyz[i + 0], xyz[i + 1] + 1, xyz[i + 2]);
+            gl.glTexCoord2f(0, 0);
+            gl.glVertex3f(xyz[i + 0], xyz[i + 1], xyz[i + 2]);
+            gl.glTexCoord2f(1, 0);
+            gl.glVertex3f(xyz[i + 0] + 1, xyz[i + 1], xyz[i + 2]);
+            gl.glTexCoord2f(1, 1);
+            gl.glVertex3f(xyz[i + 0] + 1, xyz[i + 1] + 1, xyz[i + 2]);
+         }
+         gl.glEnd();
+      }
+
+   }
+
+   public static int width;
+
+   public static int height;
+
+   public static boolean running;
+
+   public static void main(String _args[]) {
+
+      final NBodyKernel kernel = new NBodyKernel(Range.create(Integer.getInteger("bodies", 8192), 256));
+
+      JFrame frame = new JFrame("NBody");
+
+      JPanel panel = new JPanel(new BorderLayout());
+      JPanel controlPanel = new JPanel(new FlowLayout());
+      panel.add(controlPanel, BorderLayout.SOUTH);
+
+      final JButton startButton = new JButton("Start");
+
+      startButton.addActionListener(new ActionListener(){
+         @Override public void actionPerformed(ActionEvent e) {
+            running = true;
+            startButton.setEnabled(false);
+         }
+      });
+      controlPanel.add(startButton);
+      controlPanel.add(new JLabel(kernel.getExecutionMode().toString()));
+
+      controlPanel.add(new JLabel("   Particles"));
+      controlPanel.add(new JTextField("" + kernel.range.getGlobalSize(0), 5));
+
+      controlPanel.add(new JLabel("FPS"));
+      final JTextField framesPerSecondTextField = new JTextField("0", 5);
+
+      controlPanel.add(framesPerSecondTextField);
+      controlPanel.add(new JLabel("Score("));
+      JLabel miniLabel = new JLabel("<html><small>calcs</small><hr/><small>&micro;sec</small></html>");
+
+      controlPanel.add(miniLabel);
+      controlPanel.add(new JLabel(")"));
+
+      final JTextField positionUpdatesPerMicroSecondTextField = new JTextField("0", 5);
+
+      controlPanel.add(positionUpdatesPerMicroSecondTextField);
+      GLCapabilities caps = new GLCapabilities(null);
+      caps.setDoubleBuffered(true);
+      caps.setHardwareAccelerated(true);
+      final GLCanvas canvas = new GLCanvas(caps);
+      Dimension dimension = new Dimension(Integer.getInteger("width", 742), Integer.getInteger("height", 742));
+      canvas.setPreferredSize(dimension);
+
+      canvas.addGLEventListener(new GLEventListener(){
+         private double ratio;
+
+         private final float xeye = 0f;
+
+         private final float yeye = 0f;
+
+         private final float zeye = 100f;
+
+         private final float xat = 0f;
+
+         private final float yat = 0f;
+
+         private final float zat = 0f;
+
+         public final float zoomFactor = 1.0f;
+
+         private int frames;
+
+         private long last = System.currentTimeMillis();
+
+         @Override public void dispose(GLAutoDrawable drawable) {
+
+         }
+
+         @Override public void display(GLAutoDrawable drawable) {
+
+            GL2 gl = drawable.getGL().getGL2();
+
+            gl.glLoadIdentity();
+            gl.glClear(GL.GL_COLOR_BUFFER_BIT | GL.GL_DEPTH_BUFFER_BIT);
+            gl.glColor3f(1f, 1f, 1f);
+
+            GLU glu = new GLU();
+            glu.gluPerspective(45f, ratio, 0f, 1000f);
+
+            glu.gluLookAt(xeye, yeye, zeye * zoomFactor, xat, yat, zat, 0f, 1f, 0f);
+            if (running) {
+               kernel.execute(kernel.range);
+               if (kernel.isExplicit()) {
+                  kernel.get(kernel.xyz);
+               }
+            }
+            kernel.render(gl);
+
+            long now = System.currentTimeMillis();
+            long time = now - last;
+            frames++;
+
+            if (time > 1000) { // We update the frames/sec every second
+               if (running) {
+                  float framesPerSecond = (frames * 1000.0f) / time;
+                  int updatesPerMicroSecond = (int) ((framesPerSecond * kernel.range.getGlobalSize(0) * kernel.range
+                        .getGlobalSize(0)) / 1000000);
+                  framesPerSecondTextField.setText(String.format("%5.2f", framesPerSecond));
+                  positionUpdatesPerMicroSecondTextField.setText(String.format("%4d", updatesPerMicroSecond));
+               }
+               frames = 0;
+               last = now;
+            }
+            gl.glFlush();
+
+         }
+
+         @Override public void init(GLAutoDrawable drawable) {
+            final GL2 gl = drawable.getGL().getGL2();
+
+            gl.glShadeModel(GLLightingFunc.GL_SMOOTH);
+            gl.glEnable(GL.GL_BLEND);
+            gl.glBlendFunc(GL.GL_SRC_ALPHA, GL.GL_ONE);
+            try {
+               InputStream textureStream = Local.class.getResourceAsStream("particle.jpg");
+               Texture texture = TextureIO.newTexture(textureStream, false, null);
+               texture.enable(gl);
+            } catch (IOException e) {
+               e.printStackTrace();
+            } catch (GLException e) {
+               e.printStackTrace();
+            }
+
+         }
+
+         @Override public void reshape(GLAutoDrawable drawable, int x, int y, int _width, int _height) {
+            width = _width;
+            height = _height;
+
+            GL2 gl = drawable.getGL().getGL2();
+            gl.glViewport(0, 0, width, height);
+
+            ratio = (double) width / (double) height;
+
+         }
+
+      });
+
+      panel.add(canvas, BorderLayout.CENTER);
+      frame.getContentPane().add(panel, BorderLayout.CENTER);
+
+      frame.setDefaultCloseOperation(WindowConstants.EXIT_ON_CLOSE);
+      frame.pack();
+      frame.setVisible(true);
+
+      FPSAnimator animator = new FPSAnimator(canvas, 100);
+      animator.start();
+
+   }
+
+}
diff --git a/examples/nbody/src/com/amd/aparapi/examples/nbody/Main.java b/examples/nbody/src/com/amd/aparapi/examples/nbody/Main.java
index ae92a90a4c9dba7cfc7a8e12a639698c5a587e7c..016a70849824c3a4a351f8e42f0488a6b86ad785 100644
--- a/examples/nbody/src/com/amd/aparapi/examples/nbody/Main.java
+++ b/examples/nbody/src/com/amd/aparapi/examples/nbody/Main.java
@@ -62,10 +62,23 @@ import javax.swing.JTextField;
 import javax.swing.WindowConstants;
 
 import com.amd.aparapi.Kernel;
+import com.amd.aparapi.Range;
 import com.jogamp.opengl.util.FPSAnimator;
 import com.jogamp.opengl.util.texture.Texture;
 import com.jogamp.opengl.util.texture.TextureIO;
 
+/**
+ * NBody implementing demonstrating Aparapi kernels. 
+ * 
+ * For a description of the NBody problem. 
+ * @see http://en.wikipedia.org/wiki/N-body_problem
+ * 
+ * We use JOGL to render the bodies. 
+ * @see http://jogamp.org/jogl/www/
+ * 
+ * @author gfrost
+ *
+ */
 public class Main{
 
    public static class NBodyKernel extends Kernel{
@@ -75,7 +88,7 @@ public class Main{
 
       protected final float mass = 5f;
 
-      private final int bodies;
+      private final Range range;
 
       private final float[] xyz; // positions xy and z of bodies
 
@@ -85,27 +98,32 @@ public class Main{
        * Constructor initializes xyz and vxyz arrays.
        * @param _bodies
        */
-      public NBodyKernel(int _bodies) {
-         bodies = _bodies;
-         xyz = new float[bodies * 3];
-         vxyz = new float[bodies * 3];
+      public NBodyKernel(Range _range) {
+         range = _range;
+         // range = Range.create(bodies);
+         xyz = new float[range.getGlobalSize(0) * 3];
+         vxyz = new float[range.getGlobalSize(0) * 3];
          float maxDist = 20f;
-         for (int body = 0; body < bodies * 3; body += 3) {
-            // If I could remmember some basic algebra I guess I could avoid this loop ;) 
-            // just ensures that the x,y,z is within maxdist radius of origin
-            do {
-               xyz[body + 0] = (float) (Math.random() * 2 * maxDist) - maxDist; //x
-               xyz[body + 1] = (float) (Math.random() * 2 * maxDist) - maxDist; //y
-               xyz[body + 2] = (float) (Math.random() * 2 * maxDist) - maxDist; //z
-            } while (xyz[body + 0] * xyz[body + 0] + xyz[body + 1] * xyz[body + 1] + xyz[body + 2] * xyz[body + 2] > maxDist
-                  * maxDist);
-            // divide into two 'sphere of bodies' by adjusting x 
+         for (int body = 0; body < range.getGlobalSize(0) * 3; body += 3) {
+
+            float theta = (float) (Math.random() * Math.PI * 2);
+            float phi = (float) (Math.random() * Math.PI * 2);
+            float radius = (float) (Math.random() * maxDist);
+
+            // get the 3D dimensional coordinates
+            xyz[body + 0] = (float) (radius * Math.cos(theta) * Math.sin(phi));
+            xyz[body + 1] = (float) (radius * Math.sin(theta) * Math.sin(phi));
+            xyz[body + 2] = (float) (radius * Math.cos(phi));
+
+            // divide into two 'spheres of bodies' by adjusting x 
+
             if (body % 2 == 0) {
                xyz[body + 0] += maxDist * 1.5;
             } else {
                xyz[body + 0] -= maxDist * 1.5;
             }
          }
+         setExplicit(true);
       }
 
       /** 
@@ -113,7 +131,7 @@ public class Main{
        */
       @Override public void run() {
          int body = getGlobalId();
-         int count = bodies * 3;
+         int count = getGlobalSize(0) * 3;
          int globalId = body * 3;
 
          float accx = 0.f;
@@ -153,7 +171,7 @@ public class Main{
       protected void render(GL2 gl) {
          gl.glBegin(GL2.GL_QUADS);
 
-         for (int i = 0; i < bodies * 3; i += 3) {
+         for (int i = 0; i < range.getGlobalSize(0) * 3; i += 3) {
             gl.glTexCoord2f(0, 1);
             gl.glVertex3f(xyz[i + 0], xyz[i + 1] + 1, xyz[i + 2]);
             gl.glTexCoord2f(0, 0);
@@ -176,7 +194,7 @@ public class Main{
 
    public static void main(String _args[]) {
 
-      final NBodyKernel kernel = new NBodyKernel(Integer.getInteger("bodies", 8192));
+      final NBodyKernel kernel = new NBodyKernel(Range.create(Integer.getInteger("bodies", 8192)));
 
       JFrame frame = new JFrame("NBody");
 
@@ -196,7 +214,7 @@ public class Main{
       controlPanel.add(new JLabel(kernel.getExecutionMode().toString()));
 
       controlPanel.add(new JLabel("   Particles"));
-      controlPanel.add(new JTextField("" + kernel.bodies, 5));
+      controlPanel.add(new JTextField("" + kernel.range.getGlobalSize(0), 5));
 
       controlPanel.add(new JLabel("FPS"));
       final JTextField framesPerSecondTextField = new JTextField("0", 5);
@@ -256,7 +274,10 @@ public class Main{
 
             glu.gluLookAt(xeye, yeye, zeye * zoomFactor, xat, yat, zat, 0f, 1f, 0f);
             if (running) {
-               kernel.execute(kernel.bodies);
+               kernel.execute(kernel.range);
+               if (kernel.isExplicit()) {
+                  kernel.get(kernel.xyz);
+               }
             }
             kernel.render(gl);
 
@@ -267,7 +288,8 @@ public class Main{
             if (time > 1000) { // We update the frames/sec every second
                if (running) {
                   float framesPerSecond = (frames * 1000.0f) / time;
-                  int updatesPerMicroSecond = (int) ((framesPerSecond * kernel.bodies * kernel.bodies) / 1000000);
+                  int updatesPerMicroSecond = (int) ((framesPerSecond * kernel.range.getGlobalSize(0) * kernel.range
+                        .getGlobalSize(0)) / 1000000);
                   framesPerSecondTextField.setText(String.format("%5.2f", framesPerSecond));
                   positionUpdatesPerMicroSecondTextField.setText(String.format("%4d", updatesPerMicroSecond));
                }
@@ -316,7 +338,7 @@ public class Main{
       frame.pack();
       frame.setVisible(true);
 
-      FPSAnimator animator = new FPSAnimator(canvas, 200);
+      FPSAnimator animator = new FPSAnimator(canvas, 100);
       animator.start();
 
    }
diff --git a/samples/blackscholes/.classpath b/samples/blackscholes/.classpath
new file mode 100644
index 0000000000000000000000000000000000000000..2b3d42947b0a9f028e1acf3ee7d6963e71b1003c
--- /dev/null
+++ b/samples/blackscholes/.classpath
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+	<classpathentry kind="src" path="src"/>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
+	<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
+	<classpathentry combineaccessrules="false" kind="src" path="/com.amd.aparapi"/>
+	<classpathentry kind="output" path="classes"/>
+</classpath>
diff --git a/samples/blackscholes/.project b/samples/blackscholes/.project
new file mode 100644
index 0000000000000000000000000000000000000000..eb5be55b443c4d9d23671cae1c7825d4a8d5a954
--- /dev/null
+++ b/samples/blackscholes/.project
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>blackscholes</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.jdt.core.javabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+	</natures>
+</projectDescription>
diff --git a/samples/blackscholes/src/com/amd/aparapi/samples/blackscholes/Main.java b/samples/blackscholes/src/com/amd/aparapi/samples/blackscholes/Main.java
index a80ddd65e4d32183fbb2b218510221d2869df405..60c311427db70f5b00f929684b98ef0266ca62ca 100644
--- a/samples/blackscholes/src/com/amd/aparapi/samples/blackscholes/Main.java
+++ b/samples/blackscholes/src/com/amd/aparapi/samples/blackscholes/Main.java
@@ -38,6 +38,7 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit
 package com.amd.aparapi.samples.blackscholes;
 
 import com.amd.aparapi.Kernel;
+import com.amd.aparapi.Range;
 
 public class Main{
 
@@ -181,9 +182,10 @@ public class Main{
    public static void main(String[] _args) throws ClassNotFoundException, InstantiationException, IllegalAccessException {
 
       int size = Integer.getInteger("size", 512);
-      int iterations = Integer.getInteger("iterations", 5); 
-      System.out.println("size ="+size);
-      System.out.println("iterations ="+iterations);
+      Range range = Range.create(size);
+      int iterations = Integer.getInteger("iterations", 5);
+      System.out.println("size =" + size);
+      System.out.println("iterations =" + iterations);
       BlackScholesKernel kernel = new BlackScholesKernel(size);
 
       long totalExecTime = 0;
@@ -193,7 +195,7 @@ public class Main{
          iterExecTime = kernel.execute(size).getExecutionTime();
          totalExecTime += iterExecTime;
       }*/
-      kernel.execute(size, iterations);
+      kernel.execute(range, iterations);
       System.out.println("Average execution time " + kernel.getAccumulatedExecutionTime() / iterations);
       kernel.showResults(10);
 
diff --git a/samples/convolution/.classpath b/samples/convolution/.classpath
new file mode 100644
index 0000000000000000000000000000000000000000..2b3d42947b0a9f028e1acf3ee7d6963e71b1003c
--- /dev/null
+++ b/samples/convolution/.classpath
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+	<classpathentry kind="src" path="src"/>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
+	<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
+	<classpathentry combineaccessrules="false" kind="src" path="/com.amd.aparapi"/>
+	<classpathentry kind="output" path="classes"/>
+</classpath>
diff --git a/samples/convolution/.project b/samples/convolution/.project
new file mode 100644
index 0000000000000000000000000000000000000000..a304e12fe2c740acb75674e6ccb285155ddf2c2a
--- /dev/null
+++ b/samples/convolution/.project
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>convolution</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.jdt.core.javabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+	</natures>
+</projectDescription>
diff --git a/samples/convolution/build.xml b/samples/convolution/build.xml
new file mode 100644
index 0000000000000000000000000000000000000000..90979e334ff82caa213a1027e54266a24bb65aca
--- /dev/null
+++ b/samples/convolution/build.xml
@@ -0,0 +1,20 @@
+<?xml version="1.0"?>
+
+<project name="convolution" default="build" basedir=".">
+   <target name="build" depends="clean">
+      <mkdir dir="classes"/>
+      <javac srcdir="src" destdir="classes" debug="on" includeantruntime="false" >
+         <classpath>
+            <pathelement path="../../com.amd.aparapi/aparapi.jar"/>
+         </classpath>
+      </javac>
+      <jar jarfile="${ant.project.name}.jar" basedir="classes"/>
+   </target>
+
+   <target name="clean">
+      <delete dir="classes"/>
+      <delete file="${ant.project.name}.jar"/>
+   </target>
+
+
+</project>
diff --git a/samples/convolution/conv.bat b/samples/convolution/conv.bat
new file mode 100644
index 0000000000000000000000000000000000000000..ac0c4aab2b405198e1f7516ee3ed5d9adb71e131
--- /dev/null
+++ b/samples/convolution/conv.bat
@@ -0,0 +1,6 @@
+java ^
+ -Djava.library.path=../../com.amd.aparapi.jni ^
+ -Dcom.amd.aparapi.executionMode=%1 ^
+ -classpath ../../com.amd.aparapi/aparapi.jar;convolution.jar ^
+ com.amd.aparapi.sample.convolution.Main
+
diff --git a/samples/convolution/src/com/amd/aparapi/sample/convolution/Main.java b/samples/convolution/src/com/amd/aparapi/sample/convolution/Main.java
new file mode 100644
index 0000000000000000000000000000000000000000..0196418136f905cc258dbf5f596eec8b3fda8b69
--- /dev/null
+++ b/samples/convolution/src/com/amd/aparapi/sample/convolution/Main.java
@@ -0,0 +1,234 @@
+/*
+Copyright (c) 2010-2011, Advanced Micro Devices, Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
+following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list of conditions and the following
+disclaimer. 
+
+Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
+disclaimer in the documentation and/or other materials provided with the distribution. 
+
+Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export
+laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through
+774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000.  Further, pursuant to Section 740.6 of the EAR,
+you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of 
+Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration 
+Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1,
+E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups
+D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject
+to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774
+of EAR).  For the most current Country Group listings, or for additional information about the EAR or your obligations
+under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. 
+
+*/
+
+package com.amd.aparapi.sample.convolution;
+
+import java.awt.Dimension;
+import java.awt.Graphics;
+import java.awt.image.BufferedImage;
+import java.awt.image.DataBufferInt;
+import java.io.File;
+import java.io.IOException;
+
+import javax.imageio.ImageIO;
+import javax.swing.JComponent;
+import javax.swing.JFrame;
+import javax.swing.WindowConstants;
+
+import com.amd.aparapi.Kernel;
+import com.amd.aparapi.Range;
+
+/**
+ * An example Aparapi application which demonstrates image manipulation via convolution filter
+ * 
+ * Converted to use int buffer and some performance tweaks by Gary Frost
+ * http://processing.org/learning/pixels/
+ * 
+ * @author Gary Frost
+ */
+public class Main{
+   // http://docs.gimp.org/en/plug-in-convmatrix.html
+
+   final static class ConvolutionFilter{
+      private float[] weights;
+
+      private int offset;
+
+      ConvolutionFilter(float _nw, float _n, float ne, float _w, float _o, float _e, float _sw, float _s, float _se, int _offset) {
+         weights = new float[] {
+               _nw,
+               _w,
+               ne,
+               _w,
+               _o,
+               _e,
+               _sw,
+               _s,
+               _se
+         };
+         offset = _offset;
+      }
+
+   }
+
+   private static final ConvolutionFilter NONE = new ConvolutionFilter(0f, 0f, 0f, 0f, 1f, 0f, 0f, 0f, 0f, 0);
+
+   private static final ConvolutionFilter BLUR = new ConvolutionFilter(1f, 1f, 1f, 1f, 1f, 1f, 1f, 1f, 1f, 0);
+
+   private static final ConvolutionFilter EMBOSS = new ConvolutionFilter(-2f, -1f, 0f, -1f, 1f, 1f, 0f, 1f, 2f, 0);
+
+   public static class ConvolutionKernel extends Kernel{
+
+      private final float[] filter = new float[9];
+
+      private final int[] inputData;
+
+      private final int[] outputData;
+
+      private final int width;
+
+      private final int height;
+
+      private int offset;
+
+      public ConvolutionKernel(int _width, int _height, BufferedImage _inputImage, BufferedImage _outputImage) {
+         inputData = ((DataBufferInt) _inputImage.getRaster().getDataBuffer()).getData();
+         outputData = ((DataBufferInt) _outputImage.getRaster().getDataBuffer()).getData();
+         width = _width;
+         height = _height;
+
+         // setExplicit(true); // This gives us a performance boost
+         //  put(inputData); // Because we are using explicit buffer management we must put the imageData array
+      }
+
+      public void run() {
+
+         int x = getGlobalId(0);
+         int y = getGlobalId(1);
+         int lx = getLocalId(0);
+         int ly = getLocalId(1);
+         int w = getGlobalSize(0);
+         int h = getGlobalSize(1);
+         // System.out.println(x+","+y+" "+lx+","+ly+" "+w+","+h);
+         if (x > 1 && x < (w - 1) && y > 1 && y < (h - 1)) {
+
+            int result = 0;
+            // We handle each color separately using rgbshift as an 8 bit mask for red, green, blue
+            for (int rgbShift = 0; rgbShift < 24; rgbShift += 8) { // 0,8,16
+               int channelAccum = 0;
+               float accum = 0;
+
+               for (int count = 0; count < 9; count++) {
+                  int dx = (count % 3) - 1; // 0,1,2 -> -1,0,1
+                  int dy = (count / 3) - 1; // 0,1,2 -> -1,0,1
+
+                  int rgb = (inputData[((y + dy) * w) + (x + dx)]);
+                  int channelValue = ((rgb >> rgbShift) & 0xff);
+                  accum += filter[count];
+                  channelAccum += channelValue * filter[count++];
+
+               }
+               channelAccum /= accum;
+               channelAccum += offset;
+               channelAccum = max(0, min(channelAccum, 0xff));
+               result |= (channelAccum << rgbShift);
+            }
+            outputData[y * w + x] = result;
+         }
+      }
+
+      public void convolve(ConvolutionFilter _filter) {
+         System.arraycopy(_filter.weights, 0, filter, 0, _filter.weights.length);
+         offset = _filter.offset;
+         put(filter);
+         execute(Range.create2D(width, height, 8, 8));
+         get(outputData);
+      }
+   }
+
+   public static final int PAD = 1024;
+
+   public static int padValue(int value) {
+      return (PAD - (value % PAD));
+   }
+
+   public static int padTo(int value) {
+      return (value + padValue(value));
+   }
+
+   public static void main(String[] _args) throws IOException, InterruptedException {
+
+      JFrame frame = new JFrame("Convolution");
+
+      BufferedImage testCard = ImageIO.read(new File("testcard.jpg"));
+
+      int imageHeight = testCard.getHeight();
+
+      int imageWidth = testCard.getWidth();
+
+      final int width = padTo(imageWidth);// now multiple of 64
+
+      final int height = padTo(imageHeight); // now multiple of 64
+
+      System.out.println("image width,height=" + width + "," + height);
+
+      final BufferedImage inputImage = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
+
+      inputImage.getGraphics().drawImage(testCard, padValue(imageWidth) / 2, padValue(imageHeight) / 2, null);
+      final BufferedImage outputImage = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
+      outputImage.getGraphics().drawImage(testCard, padValue(imageWidth) / 2, padValue(imageHeight) / 2, null);
+      final ConvolutionKernel lifeKernel = new ConvolutionKernel(width, height, inputImage, outputImage);
+
+      // Create a component for viewing the offsecreen image
+      @SuppressWarnings("serial") JComponent viewer = new JComponent(){
+         @Override public void paintComponent(Graphics g) {
+            //  if (lifeKernel.isExplicit()) {
+            //    lifeKernel.get(lifeKernel.inputData); // We only pull the imageData when we intend to use it.
+            //  }
+            g.drawImage(outputImage, 0, 0, width, height, 0, 0, width, height, this);
+         }
+      };
+
+      // Set the default size and add to the frames content pane
+      viewer.setPreferredSize(new Dimension(width, height));
+      frame.getContentPane().add(viewer);
+
+      // Swing housekeeping
+      frame.pack();
+      frame.setVisible(true);
+      frame.setDefaultCloseOperation(WindowConstants.EXIT_ON_CLOSE);
+
+      ConvolutionFilter filters[] = new ConvolutionFilter[] {
+            NONE,
+            BLUR,
+            EMBOSS,
+      };
+      long start = System.nanoTime();
+      for (int i = 0; i < 100; i++) {
+         for (ConvolutionFilter filter : filters) {
+
+            lifeKernel.convolve(filter); // Work is performed here
+
+            viewer.repaint(); // Request a repaint of the viewer (causes paintComponent(Graphics) to be called later not synchronous
+            //Thread.sleep(1000);
+         }
+      }
+      System.out.println((System.nanoTime() - start) / 1000000);
+
+   }
+}
diff --git a/samples/convolution/src/com/amd/aparapi/sample/convolution/Test12x4_4x2.java b/samples/convolution/src/com/amd/aparapi/sample/convolution/Test12x4_4x2.java
new file mode 100644
index 0000000000000000000000000000000000000000..e68ca82cfd56c878bfe48ec9e78779cde98f71cf
--- /dev/null
+++ b/samples/convolution/src/com/amd/aparapi/sample/convolution/Test12x4_4x2.java
@@ -0,0 +1,496 @@
+package com.amd.aparapi.sample.convolution;
+
+import com.amd.aparapi.Kernel;
+import com.amd.aparapi.Range;
+
+public class Test12x4_4x2{
+   public static void main(String[] _args) {
+      // globalThreadId, threadId, globalX, globalY, localX, localY
+      final int[][] test = new int[][] {
+            {
+                  0, //globalThreadId
+                  0,//threadId
+                  0,//globalX
+                  0,//globalY
+                  0,//localX
+                  0
+            //localY
+            },
+            {
+                  1,//globalThreadId
+                  1,//threadId
+                  1,//globalX
+                  0,//globalY
+                  1,//localX
+                  0
+            //localY
+            },
+            {
+                  2,//globalThreadId
+                  2,//threadId
+                  2,//globalX
+                  0,//globalY
+                  2,//localX
+                  0
+            //localY
+            },
+            {
+                  3,//globalThreadId
+                  3,//threadId
+                  3,//globalX
+                  0,//globalY
+                  3,//localX
+                  0
+            //localY
+            },
+            {
+                  4,//globalThreadId
+                  4,//threadId
+                  0,//globalX
+                  1,//globalY
+                  0,//localX
+                  1
+            //localY
+            },
+            {
+                  5,//globalThreadId
+                  5,//threadId
+                  1,//globalX
+                  1,//globalY
+                  1,//localX
+                  1
+            //localY
+            },
+            {
+                  6,//globalThreadId
+                  6,//threadId
+                  2,//globalX
+                  1,//globalY
+                  2,//localX
+                  1
+            //localY
+            },
+            {
+                  7,//globalThreadId
+                  7,//threadId
+                  3,//globalX
+                  1,//globalY
+                  3,//localX
+                  1
+            //localY
+            },
+            {
+                  8,//globalThreadId
+                  0,//threadId
+                  4,//globalX
+                  0,//globalY
+                  0,//localX
+                  0
+            //localY
+            },
+            {
+                  9,//globalThreadId
+                  1,//threadId
+                  5,//globalX
+                  0,//globalY
+                  1,//localX
+                  0
+            //localY
+            },
+            {
+                  10,//globalThreadId
+                  2,//threadId
+                  6,//globalX
+                  0,//globalY
+                  2,//localX
+                  0
+            //localY
+            },
+            {
+                  11,//globalThreadId
+                  3,//threadId
+                  7,//globalX
+                  0,//globalY
+                  3,//localX
+                  0
+            //localY
+            },
+            {
+                  12,//globalThreadId
+                  4,//threadId
+                  4,//globalX
+                  1,//globalY
+                  0,//localX
+                  1
+            //localY
+            },
+            {
+                  13,//globalThreadId
+                  5,//threadId
+                  5,//globalX
+                  1,//globalY
+                  1,//localX
+                  1
+            //localY
+            },
+            {
+                  14,//globalThreadId
+                  6,//threadId
+                  6,//globalX
+                  1,//globalY
+                  2,//localX
+                  1
+            //localY
+            },
+            {
+                  15,//globalThreadId
+                  7,//threadId
+                  7,//globalX
+                  1,//globalY
+                  3,//localX
+                  1
+            //localY
+            },
+            {
+                  16,//globalThreadId
+                  0,//threadId
+                  8,//globalX
+                  0,//globalY
+                  0,//localX
+                  0
+            //localY
+            },
+            {
+                  17,//globalThreadId
+                  1,//threadId
+                  9,//globalX
+                  0,//globalY
+                  1,//localX
+                  0
+            //localY
+            },
+            {
+                  18,//globalThreadId
+                  2,//threadId
+                  10,//globalX
+                  0,//globalY
+                  2,//localX
+                  0
+            //localY
+            },
+            {
+                  19,//globalThreadId
+                  3,//threadId
+                  11,//globalX
+                  0,//globalY
+                  3,//localX
+                  0
+            //localY
+            },
+
+            {
+                  20,//globalThreadId
+                  4,//threadId
+                  8,//globalX
+                  1,//globalY
+                  0,//localX
+                  1
+            //localY
+            },
+            {
+                  21,//globalThreadId
+                  5,//threadId
+                  9,//globalX
+                  1,//globalY
+                  1,//localX
+                  1
+            //localY
+            },
+            {
+                  22,//globalThreadId
+                  6,//threadId
+                  10,//globalX
+                  1,
+                  2,//localX
+                  1
+            //localY
+            },
+            {
+                  23,//globalThreadId
+                  7,//threadId
+                  11,//globalX
+                  1,//globalY
+                  3,//localX
+                  1
+            //localY
+            },
+            {
+                  24,//globalThreadId
+                  0,//threadId
+                  0,//globalX
+                  2,//globalY
+                  0,//localX
+                  0
+            //localY
+            },
+            {
+                  25,//globalThreadId
+                  1,//threadId
+                  1,//globalX
+                  2,//globalY
+                  1,//localX
+                  0
+            //localY
+            },
+            {
+                  26,//globalThreadId
+                  2,//threadId
+                  2,//globalX
+                  2,//globalY
+                  2,//localX
+                  0
+            //localY
+            },
+            {
+                  27,//globalThreadId
+                  3,//threadId
+                  3,//globalX
+                  2,//globalY
+                  3,//localX
+                  0
+            //localY
+            },
+            {
+                  28,//globalThreadId
+                  4,//threadId
+                  0,//globalX
+                  3,//globalY
+                  0,//localX
+                  1
+            //localY
+            },
+            {
+                  29,//globalThreadId
+                  5,//threadId
+                  1,//globalX
+                  3,//globalY
+                  1,//localX
+                  1
+            //localY
+            },
+            {
+                  30,//globalThreadId
+                  6,//threadId
+                  2,//globalX
+                  3,//globalY
+                  2,//localX
+                  1
+            //localY
+            },
+            {
+                  31,//globalThreadId
+                  7,//threadId
+                  3,//globalX
+                  3,//globalY
+                  3,//localX
+                  1
+            //localY
+            },
+            {
+                  32,//globalThreadId
+                  0,//threadId
+                  4,//globalX
+                  2,//globalY
+                  0,//localX
+                  0
+            //localY
+            },
+            {
+                  33,//globalThreadId
+                  1,//threadId
+                  5,//globalX
+                  2,//globalY
+                  1,//localX
+                  0
+            //localY
+            },
+            {
+                  34,//globalThreadId
+                  2,//threadId
+                  6,//globalX
+                  2,//globalY
+                  2,//localX
+                  0
+            //localY
+            },
+            {
+                  35,//globalThreadId
+                  3,//threadId
+                  7,//globalX
+                  2,//globalY
+                  3,//localX
+                  0
+            //localY
+            },
+            {
+                  36,//globalThreadId
+                  4,//threadId
+                  4,//globalX
+                  3,//globalY
+                  0,//localX
+                  1
+            //localY
+            },
+            {
+                  37,//globalThreadId
+                  5,//threadId
+                  5,//globalX
+                  3,//globalY
+                  1,//localX
+                  1
+            //localY
+            },
+            {
+                  38,//globalThreadId
+                  6,//threadId
+                  6,//globalX
+                  3,//globalY
+                  2,//localX
+                  1
+            //localY
+            },
+            {
+                  39,//globalThreadId
+                  7,//threadId
+                  7,//globalX
+                  3,//globalY
+                  3,//localX
+                  1
+            //localY
+            },
+            {
+                  40,//globalThreadId
+                  0,//threadId
+                  8,//globalX
+                  2,//globalY
+                  0,//localX
+                  0
+            //localY
+            },
+            {
+                  41,//globalThreadId
+                  1,//threadId
+                  9,//globalX
+                  2,//globalY
+                  1,//localX
+                  0
+            //localY
+            },
+            {
+                  42,//globalThreadId
+                  2,//threadId
+                  10,//globalX
+                  2,//globalY
+                  2,//localX
+                  0
+            //localY
+            },
+            {
+                  43,//globalThreadId
+                  3,//threadId
+                  11,//globalX
+                  2,//globalY
+                  3,//localX
+                  0
+            //localY
+            },
+
+            {
+                  44,//globalThreadId
+                  4,//threadId
+                  8,//globalX
+                  3,//globalY
+                  0,//localX
+                  1
+            //localY
+            },
+            {
+                  45,//globalThreadId
+                  5,//threadId
+                  9,//globalX
+                  3,//globalY
+                  1,//localX
+                  1
+            //localY
+            },
+            {
+                  46,//globalThreadId
+                  6,//threadId
+                  10,//globalX
+                  3,//globalY
+                  2,//localX
+                  1
+            //localY
+            },
+            {
+                  47,//globalThreadId
+                  7,//threadId
+                  11,//globalX
+                  3,//globalY
+                  3,//localX
+                  1
+            //localY
+            },
+      };
+      Kernel kernel = new Kernel(){
+
+         @Override public void run() {
+            int x = getGlobalId(0);
+            int y = getGlobalId(1);
+            int lx = getLocalId(0);
+            int ly = getLocalId(1);
+            int w = getGlobalSize(0);
+            int h = getGlobalSize(1);
+            int globalThreadId = getGlobalId(1) * getGlobalSize(0) + getGlobalId(0);
+            int threadId = getLocalId(1) * getLocalSize(0) + getLocalId(0);
+            synchronized (test) {
+               boolean show = false;
+               if (globalThreadId != test[globalThreadId][0]) {
+                  System.out.println("bad globalThreadId");
+                  show = true;
+               }
+               if (threadId != test[globalThreadId][1]) {
+                  System.out.println("bad threadId");
+                  show = true;
+               }
+               if (x != test[globalThreadId][2]) {
+                  System.out.println("bad globalx");
+                  show = true;
+               }
+               if (y != test[globalThreadId][3]) {
+                  System.out.println("bad globaly");
+                  show = true;
+               }
+               if (lx != test[globalThreadId][4]) {
+                  System.out.println("bad localx");
+                  show = true;
+               }
+               if (ly != test[globalThreadId][5]) {
+                  System.out.println("bad localy");
+                  show = true;
+               }
+               if (show) {
+                  System.out.println("derived =>" + globalThreadId + " " + threadId + " " + x + "," + y + " " + lx + "," + ly + " "
+                        + w + "," + h);
+                  System.out.println("data    =>" + test[globalThreadId][0] + " " + test[globalThreadId][1] + " "
+                        + test[globalThreadId][2] + "," + test[globalThreadId][3] + " " + test[globalThreadId][4] + ","
+                        + test[globalThreadId][5] + " " + w + "," + h);
+               }
+            }
+         }
+
+      };
+      kernel.execute(Range.create2D(12, 4, 4, 2));
+
+   }
+}
diff --git a/samples/convolution/testcard.jpg b/samples/convolution/testcard.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..16b1a709365da51b2247255c903c7c6bb173207f
Binary files /dev/null and b/samples/convolution/testcard.jpg differ
diff --git a/samples/life/src/com/amd/aparapi/sample/life/Main.java b/samples/life/src/com/amd/aparapi/sample/life/Main.java
index 4ede78b12789d46f31b9d68f91cdd0e557b1b5b0..73a0a867251f8ea2c0e6b0629568a279761b1f7c 100644
--- a/samples/life/src/com/amd/aparapi/sample/life/Main.java
+++ b/samples/life/src/com/amd/aparapi/sample/life/Main.java
@@ -55,6 +55,7 @@ import javax.swing.JPanel;
 import javax.swing.WindowConstants;
 
 import com.amd.aparapi.Kernel;
+import com.amd.aparapi.Range;
 
 /**
  * An example Aparapi application which demonstrates Conways 'Game Of Life'.
@@ -101,6 +102,8 @@ public class Main{
 
       private final int height;
 
+      private final Range range;
+
       private int fromBase;
 
       private int toBase;
@@ -109,6 +112,8 @@ public class Main{
          imageData = ((DataBufferInt) _image.getRaster().getDataBuffer()).getData();
          width = _width;
          height = _height;
+         range = Range.create(width * height, 256);
+         System.out.println("range = " + range);
          fromBase = height * width;
          toBase = 0;
          setExplicit(true); // This gives us a performance boost
@@ -160,7 +165,7 @@ public class Main{
          fromBase = toBase;
          toBase = swap;
 
-         execute(width * height);
+         execute(range);
       }
 
    }
diff --git a/samples/mandel/mandel2D.bat b/samples/mandel/mandel2D.bat
new file mode 100644
index 0000000000000000000000000000000000000000..93e67e077cccab1762bd22405ad426df138b58b1
--- /dev/null
+++ b/samples/mandel/mandel2D.bat
@@ -0,0 +1,7 @@
+java ^
+ -Djava.library.path=../../com.amd.aparapi.jni ^
+ -Dcom.amd.aparapi.executionMode=%1 ^
+ -classpath ../../com.amd.aparapi/aparapi.jar;mandel.jar ^
+ com.amd.aparapi.sample.mandel.Main2D
+
+
diff --git a/samples/mandel/mandel2D.sh b/samples/mandel/mandel2D.sh
new file mode 100644
index 0000000000000000000000000000000000000000..99a1c45c882d6c93f4fbf23ecde2c5698d1cadac
--- /dev/null
+++ b/samples/mandel/mandel2D.sh
@@ -0,0 +1,5 @@
+java\
+ -Djava.library.path=../../com.amd.aparapi.jni\
+ -Dcom.amd.aparapi.executionMode=$1\
+ -classpath ../../com.amd.aparapi/aparapi.jar:mandel.jar\
+ com.amd.aparapi.sample.mandel.Main2D
diff --git a/samples/mandel/src/com/amd/aparapi/sample/mandel/Main.java b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main.java
index 5879f7852652ea8c73f6e15c4a9a0aee6fd88c35..99d3373c080bd6cff626d5c9aadc0a94e5aca926 100644
--- a/samples/mandel/src/com/amd/aparapi/sample/mandel/Main.java
+++ b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main.java
@@ -53,6 +53,7 @@ import javax.swing.JComponent;
 import javax.swing.JFrame;
 
 import com.amd.aparapi.Kernel;
+import com.amd.aparapi.Range;
 
 /**
  * An example Aparapi application which displays a view of the Mandelbrot set and lets the user zoom in to a particular point. 
@@ -164,6 +165,9 @@ public class Main{
       /** Height of Mandelbrot view. */
       final int height = 768;
 
+      /** Mandelbrot image height. */
+      final Range range = Range.create(width * height);
+
       /** Maximum iterations for Mandelbrot. */
       final int maxIterations = 256;
 
@@ -220,7 +224,7 @@ public class Main{
 
       // Set the default scale and offset, execute the kernel and force a repaint of the viewer.
       kernel.setScaleAndOffset(defaultScale, -1f, 0f);
-      kernel.execute(width * height);
+      kernel.execute(range);
       System.arraycopy(rgb, 0, imageRgb, 0, rgb.length);
       viewer.repaint();
 
@@ -266,7 +270,7 @@ public class Main{
 
                // Set the scale and offset, execute the kernel and force a repaint of the viewer.
                kernel.setScaleAndOffset(scale, x, y);
-               kernel.execute(width * height);
+               kernel.execute(range);
                System.arraycopy(rgb, 0, imageRgb, 0, rgb.length);
                viewer.repaint();
             }
diff --git a/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java
new file mode 100644
index 0000000000000000000000000000000000000000..5e6b85a3c3da3096f91c3dce28a96db95607808c
--- /dev/null
+++ b/samples/mandel/src/com/amd/aparapi/sample/mandel/Main2D.java
@@ -0,0 +1,277 @@
+/*
+Copyright (c) 2010-2011, Advanced Micro Devices, Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
+following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list of conditions and the following
+disclaimer. 
+
+Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
+disclaimer in the documentation and/or other materials provided with the distribution. 
+
+Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export
+laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through
+774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000.  Further, pursuant to Section 740.6 of the EAR,
+you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of 
+Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration 
+Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1,
+E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups
+D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject
+to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774
+of EAR).  For the most current Country Group listings, or for additional information about the EAR or your obligations
+under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. 
+
+*/
+
+package com.amd.aparapi.sample.mandel;
+
+import java.awt.Color;
+import java.awt.Dimension;
+import java.awt.Graphics;
+import java.awt.Point;
+import java.awt.event.MouseAdapter;
+import java.awt.event.MouseEvent;
+import java.awt.event.WindowAdapter;
+import java.awt.event.WindowEvent;
+import java.awt.image.BufferedImage;
+import java.awt.image.DataBufferInt;
+
+import javax.swing.JComponent;
+import javax.swing.JFrame;
+
+import com.amd.aparapi.Kernel;
+import com.amd.aparapi.Range;
+
+/**
+ * An example Aparapi application which displays a view of the Mandelbrot set and lets the user zoom in to a particular point. 
+ * 
+ * When the user clicks on the view, this example application will zoom in to the clicked point and zoom out there after.
+ * On GPU, additional computing units will offer a better viewing experience. On the other hand on CPU, this example 
+ * application might suffer with sub-optimal frame refresh rate as compared to GPU. 
+ *  
+ * @author gfrost
+ *
+ */
+
+public class Main2D{
+
+   /**
+    * An Aparapi Kernel implementation for creating a scaled view of the mandelbrot set.
+    *  
+    * @author gfrost
+    *
+    */
+
+   public static class MandelKernel extends Kernel{
+
+      /** RGB buffer used to store the Mandelbrot image. This buffer holds (width * height) RGB values. */
+      final private int rgb[];
+
+      /** Palette used for each iteration value 0..maxIterations. */
+      final private int pallette[];
+
+      /** Maximum iterations we will check for. */
+      final private int maxIterations;
+
+      /** Mutable values of scale, offsetx and offsety so that we can modify the zoom level and position of a view. */
+      private float scale = .0f;
+
+      private float offsetx = .0f;
+
+      private float offsety = .0f;
+
+      /**
+       * Initialize the Kernel.
+       *  
+       * @param _width Mandelbrot image width
+       * @param _height Mandelbrot image height
+       * @param _rgb Mandelbrot image RGB buffer
+       * @param _pallette Mandelbrot image palette
+       */
+      public MandelKernel(int[] _rgb, int[] _pallette) {
+
+         rgb = _rgb;
+         pallette = _pallette;
+         maxIterations = pallette.length - 1;
+
+      }
+
+      @Override public void run() {
+
+         /** Determine which RGB value we are going to process (0..RGB.length). */
+         int gid = getGlobalId(1) * getGlobalSize(0) + getGlobalId(0);
+
+         /** Translate the gid into an x an y value. */
+         float x = (((getGlobalId(0) * scale) - ((scale / 2) * getGlobalSize(0))) / getGlobalSize(0)) + offsetx;
+
+         float y = (((getGlobalId(1) * scale) - ((scale / 2) * getGlobalSize(1))) / getGlobalSize(1)) + offsety;
+
+         int count = 0;
+
+         float zx = x;
+         float zy = y;
+         float new_zx = 0f;
+
+         // Iterate until the algorithm converges or until maxIterations are reached.
+         while (count < maxIterations && zx * zx + zy * zy < 8) {
+            new_zx = zx * zx - zy * zy + x;
+            zy = 2 * zx * zy + y;
+            zx = new_zx;
+            count++;
+         }
+
+         // Pull the value out of the palette for this iteration count.
+         rgb[gid] = pallette[count];
+      }
+
+      public void setScaleAndOffset(float _scale, float _offsetx, float _offsety) {
+         offsetx = _offsetx;
+         offsety = _offsety;
+         scale = _scale;
+      }
+
+   }
+
+   /** User selected zoom-in point on the Mandelbrot view. */
+   public static volatile Point to = null;
+
+   @SuppressWarnings("serial") public static void main(String[] _args) {
+
+      JFrame frame = new JFrame("MandelBrot");
+
+      /** Mandelbrot image height. */
+      final Range range = Range.create2D(768, 768);
+      System.out.println("range= " + range);
+
+      /** Maximum iterations for Mandelbrot. */
+      final int maxIterations = 256;
+
+      /** Palette which maps iteration values to RGB values. */
+      final int pallette[] = new int[maxIterations + 1];
+
+      //Initialize palette values
+      for (int i = 0; i < maxIterations; i++) {
+         float h = i / (float) maxIterations;
+         float b = 1.0f - h * h;
+         pallette[i] = Color.HSBtoRGB(h, 1f, b);
+      }
+
+      /** Image for Mandelbrot view. */
+      final BufferedImage image = new BufferedImage(range.getGlobalSize(0), range.getGlobalSize(1), BufferedImage.TYPE_INT_RGB);
+      final BufferedImage offscreen = new BufferedImage(range.getGlobalSize(0), range.getGlobalSize(1), BufferedImage.TYPE_INT_RGB);
+      // Draw Mandelbrot image
+      JComponent viewer = new JComponent(){
+         @Override public void paintComponent(Graphics g) {
+
+            g.drawImage(image, 0, 0, range.getGlobalSize(0), range.getGlobalSize(1), this);
+         }
+      };
+
+      // Set the size of JComponent which displays Mandelbrot image
+      viewer.setPreferredSize(new Dimension(range.getGlobalSize(0), range.getGlobalSize(1)));
+
+      final Object doorBell = new Object();
+
+      // Mouse listener which reads the user clicked zoom-in point on the Mandelbrot view 
+      viewer.addMouseListener(new MouseAdapter(){
+         @Override public void mouseClicked(MouseEvent e) {
+            to = e.getPoint();
+            synchronized (doorBell) {
+               doorBell.notify();
+            }
+         }
+      });
+
+      // Swing housework to create the frame
+      frame.getContentPane().add(viewer);
+      frame.pack();
+      frame.setLocationRelativeTo(null);
+      frame.setVisible(true);
+
+      // Extract the underlying RGB buffer from the image.
+      // Pass this to the kernel so it operates directly on the RGB buffer of the image
+      final int[] rgb = ((DataBufferInt) offscreen.getRaster().getDataBuffer()).getData();
+      final int[] imageRgb = ((DataBufferInt) image.getRaster().getDataBuffer()).getData();
+      // Create a Kernel passing the size, RGB buffer and the palette.
+      final MandelKernel kernel = new MandelKernel(rgb, pallette);
+
+      float defaultScale = 3f;
+
+      // Set the default scale and offset, execute the kernel and force a repaint of the viewer.
+      kernel.setScaleAndOffset(defaultScale, -1f, 0f);
+      kernel.execute(range);
+      System.arraycopy(rgb, 0, imageRgb, 0, rgb.length);
+      viewer.repaint();
+
+      // Report target execution mode: GPU or JTP (Java Thread Pool).
+      System.out.println("Execution mode=" + kernel.getExecutionMode());
+
+      // Window listener to dispose Kernel resources on user exit.
+      frame.addWindowListener(new WindowAdapter(){
+         public void windowClosing(WindowEvent _windowEvent) {
+            kernel.dispose();
+            System.exit(0);
+         }
+      });
+
+      // Wait until the user selects a zoom-in point on the Mandelbrot view.
+      while (true) {
+
+         // Wait for the user to click somewhere
+         while (to == null) {
+            synchronized (doorBell) {
+               try {
+                  doorBell.wait();
+               } catch (InterruptedException ie) {
+                  ie.getStackTrace();
+               }
+            }
+         }
+
+         float x = -1f;
+         float y = 0f;
+         float scale = defaultScale;
+         float tox = (float) (to.x - range.getGlobalSize(0) / 2) / range.getGlobalSize(0) * scale;
+         float toy = (float) (to.y - range.getGlobalSize(1) / 2) / range.getGlobalSize(1) * scale;
+
+         // This is how many frames we will display as we zoom in and out.
+         int frames = 128;
+         long startMillis = System.currentTimeMillis();
+         for (int sign = -1; sign < 2; sign += 2) {
+            for (int i = 0; i < frames - 4; i++) {
+               scale = scale + sign * defaultScale / frames;
+               x = x - sign * (tox / frames);
+               y = y - sign * (toy / frames);
+
+               // Set the scale and offset, execute the kernel and force a repaint of the viewer.
+               kernel.setScaleAndOffset(scale, x, y);
+               kernel.execute(range);
+               System.arraycopy(rgb, 0, imageRgb, 0, rgb.length);
+               viewer.repaint();
+            }
+         }
+
+         long elapsedMillis = System.currentTimeMillis() - startMillis;
+         System.out.println("FPS = " + frames * 1000 / elapsedMillis);
+
+         // Reset zoom-in point.
+         to = null;
+
+      }
+
+   }
+
+}
diff --git a/samples/squares/src/com/amd/aparapi/sample/squares/Main.java b/samples/squares/src/com/amd/aparapi/sample/squares/Main.java
index 929d02b397dc8ed60bdc10922e208ab3d1a05ec7..32a1b70b8bfd16cd76eff8d5666442738a18dc72 100644
--- a/samples/squares/src/com/amd/aparapi/sample/squares/Main.java
+++ b/samples/squares/src/com/amd/aparapi/sample/squares/Main.java
@@ -39,6 +39,7 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit
 package com.amd.aparapi.sample.squares;
 
 import com.amd.aparapi.Kernel;
+import com.amd.aparapi.Range;
 
 /**
  * An example Aparapi application which computes and displays squares of a set of 512 input values.
@@ -77,7 +78,8 @@ public class Main{
       };
 
       // Execute Kernel.
-      kernel.execute(512);
+
+      kernel.execute(Range.create(512));
 
       // Report target execution mode: GPU or JTP (Java Thread Pool).
       System.out.println("Execution mode=" + kernel.getExecutionMode());
diff --git a/test/codegen/build.xml b/test/codegen/build.xml
index 1118e3dafc678ec6e93d57d00fa67d0836faf54d..5af3d17a981ba103c708b4345d9dc7aa1f842735 100644
--- a/test/codegen/build.xml
+++ b/test/codegen/build.xml
@@ -2,8 +2,7 @@
 
 <project name="codegen" default="junit" basedir=".">
 
-   <!--<property name="junit.jar" value="/home/gfrost/aparapi/trunk/tools/junit/junit.jar"/>-->
-   <property name="junit.jar" value="C:\Users\gfrost\javalabs\projects\aparapi\trunk\tools\junit\junit.jar"/>
+   <property name="junit.jar" value="./junit-4.10.jar"/>
 
    <path id="classpath">
       <pathelement path="..\..\com.amd.aparapi\aparapi.jar"/>
@@ -11,28 +10,22 @@
       <pathelement path="classes"/>
    </path>
 
-   <target name="check">
-      <fail message="Error:">
-         <condition>
-            <not><isset property="junit.jar"/></not>
-         </condition>
-         <![CDATA[
-         You will need to edit test/codegen/build.xml
-
-         At present junit.dir is not set.  It needs to point to the junit jar file in your junit installation.
+   <target name="check-junit">
+      <condition property="need.to.upload.junit">
+         <not><available file="${junit.jar}"/> </not>
+      </condition>
+   </target>
 
-         You can install/download junit from www.junit.org.
-         ]]>
-      </fail>
-      <available file="${junit.jar}" type="file" property="junit.jar.exists"/>
+   <target name="check" if="need.to.upload.junit">
+      <get dest=".">
+         <url url="http://repo1.maven.org/maven2/junit/junit/4.10/junit-4.10.jar"/>
+      </get>
       <fail message="Error:">
          <condition>
-            <not><isset property="junit.jar.exists"/></not>
+            <not><available file="${junit.jar}"/> </not>
          </condition>
          <![CDATA[
-         You will need to edit test/codegen/build.xml
-
-         At present junit.jar is set to ${junit.jar} but that file does not exist
+         Failed to upload junit from maven repository. 
          ]]>
       </fail>
    </target>
@@ -43,7 +36,7 @@
       <delete dir="src/genjava/com"/>
    </target>
 
-   <target name="junit" depends="clean, check">
+   <target name="junit" depends="clean, check-junit, check">
       <mkdir dir="classes"/>
       <javac srcdir="src/java" destdir="classes" debug="on"  includeAntRuntime="false" classpathref="classpath" />