diff --git a/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp b/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp
index eb404c523e909a00a609b35af1d7edf0d1de23a4..956a67ba442d4984a66d2638d34127eb41ffa511 100644
--- a/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp
+++ b/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp
@@ -422,10 +422,10 @@ void updateBuffer(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& arg
             fprintf(stderr, "runKernel arg %d %s, length = %d\n", argIdx, arg->name, buffer->lens[i]);
          }
          argPos++;
-         status = clSetKernelArg(jniContext->kernel, argPos, sizeof(cl_uint), &(buffer->dims[i]));
-         if(status != CL_SUCCESS) throw CLException(status,"clSetKernelArg (buffer dimension)");
+         status = clSetKernelArg(jniContext->kernel, argPos, sizeof(cl_uint), &(buffer->offsets[i]));
+         if(status != CL_SUCCESS) throw CLException(status,"clSetKernelArg (buffer offset)");
          if (config->isVerbose()){
-            fprintf(stderr, "runKernel arg %d %s, dim = %d\n", argIdx, arg->name, buffer->dims[i]);
+            fprintf(stderr, "runKernel arg %d %s, offsets = %d\n", argIdx, arg->name, buffer->offsets[i]);
          }
       }
    }
@@ -469,7 +469,7 @@ void processArray(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& arg
    arg->pin(jenv);
 
    if (config->isVerbose()) {
-      fprintf(stderr, "runKernel: arrayOrBuf ref %p, oldAddr=%p, newAddr=%p, ref.mem=%p isCopy=%s\n",
+      fprintf(stderr, "runKernel: array ref %p, oldAddr=%p, newAddr=%p, ref.mem=%p isCopy=%s\n",
             arg->arrayBuffer->javaArray, 
             prevAddr,
             arg->arrayBuffer->addr,
@@ -528,8 +528,11 @@ void processBuffer(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& ar
       arg->aparapiBuffer->write.valid = false;
    }
 
+   // TODO: check whether the object was moved and requires re-flattening
+   arg->aparapiBuffer->flatten(jenv,arg);
+
    if (config->isVerbose()) {
-      fprintf(stderr, "runKernel: arrayOrBuf addr=%p, ref.mem=%p\n",
+      fprintf(stderr, "runKernel: Buf addr=%p, ref.mem=%p\n",
             arg->aparapiBuffer->data,
             arg->aparapiBuffer->mem);
       fprintf(stderr, "at memory addr %p, contents: ", arg->aparapiBuffer->data);
diff --git a/com.amd.aparapi.jni/src/cpp/runKernel/AparapiBuffer.cpp b/com.amd.aparapi.jni/src/cpp/runKernel/AparapiBuffer.cpp
index f5fcb19c542f35fcf6c02434bb9eeb901d8d50b0..a6c29c784b601742c03f7ac2c81fd6c9adcd41d4 100644
--- a/com.amd.aparapi.jni/src/cpp/runKernel/AparapiBuffer.cpp
+++ b/com.amd.aparapi.jni/src/cpp/runKernel/AparapiBuffer.cpp
@@ -42,7 +42,7 @@
 AparapiBuffer::AparapiBuffer():
    javaObject((jobject) 0),
    numDims(0),
-   dims(NULL),
+   offsets(NULL),
    lengthInBytes(0),
    mem((cl_mem) 0),
    data(NULL),
@@ -58,11 +58,11 @@ AparapiBuffer::AparapiBuffer(void* _data, cl_uint* _lens, cl_uint _numDims, long
    mem((cl_mem) 0),
    memMask((cl_uint)0)
 {
-   dims = new cl_uint[_numDims];
+   offsets = new cl_uint[_numDims];
    for(int i = 0; i < _numDims; i++) {
-      dims[i] = 1;
+      offsets[i] = 1;
       for(int j = i+1; j < _numDims; j++) {
-         dims[i] *= lens[j];
+         offsets[i] *= lens[j];
       }
    }
 }
@@ -72,44 +72,61 @@ jobject AparapiBuffer::getJavaObject(JNIEnv* env, KernelArg* arg) {
 }
 
 
-AparapiBuffer* AparapiBuffer::flatten(JNIEnv* env, jobject arg, int type) {
-   int numDims = JNIHelper::getInstanceField<jint>(env, arg, "numDims", IntArg);
-   if(numDims == 2 && isBoolean(type)) {
-      return AparapiBuffer::flattenBoolean2D(env,arg);
-   } else if(numDims == 2 && isByte(type)) {
-      return AparapiBuffer::flattenByte2D(env,arg);
-   } else if(numDims == 2 && isShort(type)) {
-      return AparapiBuffer::flattenShort2D(env,arg);
-   } else if(numDims == 2 && isInt(type)) {
-      return AparapiBuffer::flattenInt2D(env,arg);
-   } else if(numDims == 2 && isLong(type)) {
-      return AparapiBuffer::flattenLong2D(env,arg);
-   } else if(numDims == 2 && isFloat(type)) {
-      return AparapiBuffer::flattenFloat2D(env,arg);
-   } else if(numDims == 2 && isDouble(type)) {
-      return AparapiBuffer::flattenDouble2D(env,arg);
-   } else if(numDims == 3 && isBoolean(type)) {
-      return AparapiBuffer::flattenBoolean3D(env,arg);
-   } else if(numDims == 3 && isByte(type)) {
-      return AparapiBuffer::flattenByte3D(env,arg);
-   } else if(numDims == 3 && isShort(type)) {
-      return AparapiBuffer::flattenShort3D(env,arg);
-   } else if(numDims == 3 && isInt(type)) {
-      return AparapiBuffer::flattenInt3D(env,arg);
-   } else if(numDims == 3 && isLong(type)) {
-      return AparapiBuffer::flattenLong3D(env,arg);
-   } else if(numDims == 3 && isFloat(type)) {
-      return AparapiBuffer::flattenFloat3D(env,arg);
-   } else if(numDims == 3 && isDouble(type)) {
-      return AparapiBuffer::flattenDouble3D(env,arg);
-   }
-   return new AparapiBuffer();
+// Flattens the Java ND array held by arg into this buffer; dispatches on numDims (2 or 3) and element type.
+void AparapiBuffer::flatten(JNIEnv* env, KernelArg* arg) {
+   int numDims = JNIHelper::getInstanceField<jint>(env, arg->javaArg, "numDims", IntArg); // Java-side "numDims" field
+   if(numDims == 2 && arg->isBoolean()) {
+      flattenBoolean2D(env,arg);
+   } else if(numDims == 2 && arg->isByte()) {
+      flattenByte2D(env,arg);
+   } else if(numDims == 2 && arg->isShort()) {
+      flattenShort2D(env,arg);
+   } else if(numDims == 2 && arg->isInt()) {
+      flattenInt2D(env,arg);
+   } else if(numDims == 2 && arg->isLong()) {
+      flattenLong2D(env,arg);
+   } else if(numDims == 2 && arg->isFloat()) {
+      flattenFloat2D(env,arg);
+   } else if(numDims == 2 && arg->isDouble()) {
+      flattenDouble2D(env,arg);
+   } else if(numDims == 3 && arg->isBoolean()) {
+      flattenBoolean3D(env,arg);
+   } else if(numDims == 3 && arg->isByte()) {
+      flattenByte3D(env,arg);
+   } else if(numDims == 3 && arg->isShort()) {
+      flattenShort3D(env,arg);
+   } else if(numDims == 3 && arg->isInt()) {
+      flattenInt3D(env,arg);
+   } else if(numDims == 3 && arg->isLong()) {
+      flattenLong3D(env,arg);
+   } else if(numDims == 3 && arg->isFloat()) {
+      flattenFloat3D(env,arg);
+   } else if(numDims == 3 && arg->isDouble()) {
+      flattenDouble3D(env,arg);
+   } else { // no flattener matched: the buffer fields are left untouched, so say what was rejected
+      fprintf(stderr, "flatten(): unsupported element type or numDims=%d for arg %s\n", numDims, arg->name);
+   }
+}
+
+// Stores the flattened storage (_data), per-dimension lengths (_dims) and metadata, then derives element strides.
+void AparapiBuffer::buildBuffer(void* _data, cl_uint* _dims, cl_uint _numDims, long _lengthInBytes, jobject _javaObject) {
+   data = _data;
+   lens = _dims;
+   numDims = _numDims;
+   lengthInBytes = _lengthInBytes;
+   javaObject = _javaObject;
+   offsets = new cl_uint[_numDims];        // NOTE(review): previous offsets/lens/data are overwritten, not freed - confirm ownership
+   cl_uint stride = 1;                     // running product of the lengths of all inner dimensions
+   for(cl_uint i = _numDims; i-- > 0; ) {  // innermost dimension first: offsets[i] = product of lens[j] for j > i
+      offsets[i] = stride;
+      stride *= lens[i];
+   }
 }
 
 
-AparapiBuffer* AparapiBuffer::flattenBoolean2D(JNIEnv* env, jobject arg) {
+void AparapiBuffer::flattenBoolean2D(JNIEnv* env, KernelArg* arg) {
 
-   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg->javaArg, "javaBuffer", ObjectClassArg);
    cl_uint* dims = new cl_uint[2];
    dims[0] = env->GetArrayLength((jobjectArray)javaBuffer);
    dims[1] = env->GetArrayLength((jbooleanArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, 0));
@@ -148,14 +165,16 @@ AparapiBuffer* AparapiBuffer::flattenBoolean2D(JNIEnv* env, jobject arg) {
          array[i*dims[1] + j] = elems[j];
       }
       env->ReleaseBooleanArrayElements(jArray, elems, 0);
+      // Is DeleteLocalRef required here?
+      // env->DeleteLocalRef(jArray);
    }
   
-   return new AparapiBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer);
+   buildBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer);
 }
 
-AparapiBuffer* AparapiBuffer::flattenByte2D(JNIEnv* env, jobject arg) {
+void AparapiBuffer::flattenByte2D(JNIEnv* env, KernelArg* arg) {
 
-   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg->javaArg, "javaBuffer", ObjectClassArg);
    cl_uint* dims = new cl_uint[2];
    dims[0] = env->GetArrayLength((jobjectArray)javaBuffer);
    dims[1] = env->GetArrayLength((jbyteArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, 0));
@@ -196,12 +215,12 @@ AparapiBuffer* AparapiBuffer::flattenByte2D(JNIEnv* env, jobject arg) {
       env->ReleaseByteArrayElements(jArray, elems, 0);
    }
   
-   return new AparapiBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer);
+   buildBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer);
 }
 
-AparapiBuffer* AparapiBuffer::flattenShort2D(JNIEnv* env, jobject arg) {
+void AparapiBuffer::flattenShort2D(JNIEnv* env, KernelArg* arg) {
 
-   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg->javaArg, "javaBuffer", ObjectClassArg);
    cl_uint* dims = new cl_uint[2];
    dims[0] = env->GetArrayLength((jobjectArray)javaBuffer);
    dims[1] = env->GetArrayLength((jshortArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, 0));
@@ -242,12 +261,12 @@ AparapiBuffer* AparapiBuffer::flattenShort2D(JNIEnv* env, jobject arg) {
       env->ReleaseShortArrayElements(jArray, elems, 0);
    }
   
-   return new AparapiBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer);
+   buildBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer);
 }
 
-AparapiBuffer* AparapiBuffer::flattenInt2D(JNIEnv* env, jobject arg) {
+void AparapiBuffer::flattenInt2D(JNIEnv* env, KernelArg* arg) {
 
-   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg->javaArg, "javaBuffer", ObjectClassArg);
    cl_uint* dims = new cl_uint[2];
    dims[0] = env->GetArrayLength((jobjectArray)javaBuffer);
    dims[1] = env->GetArrayLength((jintArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, 0));
@@ -288,12 +307,12 @@ AparapiBuffer* AparapiBuffer::flattenInt2D(JNIEnv* env, jobject arg) {
       env->ReleaseIntArrayElements(jArray, elems, 0);
    }
   
-   return new AparapiBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer);
+   buildBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer);
 }
 
-AparapiBuffer* AparapiBuffer::flattenLong2D(JNIEnv* env, jobject arg) {
+void AparapiBuffer::flattenLong2D(JNIEnv* env, KernelArg* arg) {
 
-   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg->javaArg, "javaBuffer", ObjectClassArg);
    cl_uint* dims = new cl_uint[2];
    dims[0] = env->GetArrayLength((jobjectArray)javaBuffer);
    dims[1] = env->GetArrayLength((jlongArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, 0));
@@ -334,12 +353,12 @@ AparapiBuffer* AparapiBuffer::flattenLong2D(JNIEnv* env, jobject arg) {
       env->ReleaseLongArrayElements(jArray, elems, 0);
    }
   
-   return new AparapiBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer);
+   buildBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer);
 }
 
-AparapiBuffer* AparapiBuffer::flattenFloat2D(JNIEnv* env, jobject arg) {
+void AparapiBuffer::flattenFloat2D(JNIEnv* env, KernelArg* arg) {
 
-   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg->javaArg, "javaBuffer", ObjectClassArg);
    cl_uint* dims = new cl_uint[2];
    dims[0] = env->GetArrayLength((jobjectArray)javaBuffer);
    dims[1] = env->GetArrayLength((jfloatArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, 0));
@@ -380,12 +399,12 @@ AparapiBuffer* AparapiBuffer::flattenFloat2D(JNIEnv* env, jobject arg) {
       env->ReleaseFloatArrayElements(jArray, elems, 0);
    }
   
-   return new AparapiBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer);
+   buildBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer);
 }
 
-AparapiBuffer* AparapiBuffer::flattenDouble2D(JNIEnv* env, jobject arg) {
+void AparapiBuffer::flattenDouble2D(JNIEnv* env, KernelArg* arg) {
 
-   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg->javaArg, "javaBuffer", ObjectClassArg);
    cl_uint* dims = new cl_uint[2];
    dims[0] = env->GetArrayLength((jobjectArray)javaBuffer);
    dims[1] = env->GetArrayLength((jdoubleArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, 0));
@@ -426,13 +445,13 @@ AparapiBuffer* AparapiBuffer::flattenDouble2D(JNIEnv* env, jobject arg) {
       env->ReleaseDoubleArrayElements(jArray, elems, 0);
    }
   
-   return new AparapiBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer);
+   buildBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer);
 }
 
 
-AparapiBuffer* AparapiBuffer::flattenBoolean3D(JNIEnv* env, jobject arg) {
+void AparapiBuffer::flattenBoolean3D(JNIEnv* env, KernelArg* arg) {
 
-   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg->javaArg, "javaBuffer", ObjectClassArg);
    cl_uint* dims = new cl_uint[3];
    jobjectArray j0 = (jobjectArray)javaBuffer;
    jobjectArray j1 = (jobjectArray)env->GetObjectArrayElement(j0, 0);
@@ -491,12 +510,12 @@ AparapiBuffer* AparapiBuffer::flattenBoolean3D(JNIEnv* env, jobject arg) {
       }
    }
   
-   return new AparapiBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer);
+   buildBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer);
 }
 
-AparapiBuffer* AparapiBuffer::flattenByte3D(JNIEnv* env, jobject arg) {
+void AparapiBuffer::flattenByte3D(JNIEnv* env, KernelArg* arg) {
 
-   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg->javaArg, "javaBuffer", ObjectClassArg);
    cl_uint* dims = new cl_uint[3];
    jobjectArray j0 = (jobjectArray)javaBuffer;
    jobjectArray j1 = (jobjectArray)env->GetObjectArrayElement(j0, 0);
@@ -555,12 +574,12 @@ AparapiBuffer* AparapiBuffer::flattenByte3D(JNIEnv* env, jobject arg) {
       }
    }
   
-   return new AparapiBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer);
+   buildBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer);
 }
 
-AparapiBuffer* AparapiBuffer::flattenShort3D(JNIEnv* env, jobject arg) {
+void AparapiBuffer::flattenShort3D(JNIEnv* env, KernelArg* arg) {
 
-   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg->javaArg, "javaBuffer", ObjectClassArg);
    cl_uint* dims = new cl_uint[3];
    jobjectArray j0 = (jobjectArray)javaBuffer;
    jobjectArray j1 = (jobjectArray)env->GetObjectArrayElement(j0, 0);
@@ -619,12 +638,12 @@ AparapiBuffer* AparapiBuffer::flattenShort3D(JNIEnv* env, jobject arg) {
       }
    }
   
-   return new AparapiBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer);
+   buildBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer);
 }
 
-AparapiBuffer* AparapiBuffer::flattenInt3D(JNIEnv* env, jobject arg) {
+void AparapiBuffer::flattenInt3D(JNIEnv* env, KernelArg* arg) {
 
-   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg->javaArg, "javaBuffer", ObjectClassArg);
    cl_uint* dims = new cl_uint[3];
    jobjectArray j0 = (jobjectArray)javaBuffer;
    jobjectArray j1 = (jobjectArray)env->GetObjectArrayElement(j0, 0);
@@ -683,12 +702,12 @@ AparapiBuffer* AparapiBuffer::flattenInt3D(JNIEnv* env, jobject arg) {
       }
    }
   
-   return new AparapiBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer);
+   buildBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer);
 }
 
-AparapiBuffer* AparapiBuffer::flattenLong3D(JNIEnv* env, jobject arg) {
+void AparapiBuffer::flattenLong3D(JNIEnv* env, KernelArg* arg) {
 
-   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg->javaArg, "javaBuffer", ObjectClassArg);
    cl_uint* dims = new cl_uint[3];
    jobjectArray j0 = (jobjectArray)javaBuffer;
    jobjectArray j1 = (jobjectArray)env->GetObjectArrayElement(j0, 0);
@@ -747,12 +766,12 @@ AparapiBuffer* AparapiBuffer::flattenLong3D(JNIEnv* env, jobject arg) {
       }
    }
   
-   return new AparapiBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer);
+   buildBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer);
 }
 
-AparapiBuffer* AparapiBuffer::flattenFloat3D(JNIEnv* env, jobject arg) {
+void AparapiBuffer::flattenFloat3D(JNIEnv* env, KernelArg* arg) {
 
-   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg->javaArg, "javaBuffer", ObjectClassArg);
    cl_uint* dims = new cl_uint[3];
    jobjectArray j0 = (jobjectArray)javaBuffer;
    jobjectArray j1 = (jobjectArray)env->GetObjectArrayElement(j0, 0);
@@ -811,12 +830,12 @@ AparapiBuffer* AparapiBuffer::flattenFloat3D(JNIEnv* env, jobject arg) {
       }
    }
   
-   return new AparapiBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer);
+   buildBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer);
 }
 
-AparapiBuffer* AparapiBuffer::flattenDouble3D(JNIEnv* env, jobject arg) {
+void AparapiBuffer::flattenDouble3D(JNIEnv* env, KernelArg* arg) {
 
-   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg->javaArg, "javaBuffer", ObjectClassArg);
    cl_uint* dims = new cl_uint[3];
    jobjectArray j0 = (jobjectArray)javaBuffer;
    jobjectArray j1 = (jobjectArray)env->GetObjectArrayElement(j0, 0);
@@ -875,7 +894,7 @@ AparapiBuffer* AparapiBuffer::flattenDouble3D(JNIEnv* env, jobject arg) {
       }
    }
   
-   return new AparapiBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer);
+   buildBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer);
 }
 
 
@@ -934,7 +953,7 @@ void AparapiBuffer::inflateBoolean2D(JNIEnv *env, KernelArg* arg) {
    #pragma omp parallel for
    for(int i = 0; i < lens[0]; i++) {
       for(int j = 0; j < lens[1]; j++) {
-         body[i][j] = array[i*dims[0] + j];
+         body[i][j] = array[i*offsets[0] + j];
       }
    }
 
@@ -949,7 +968,7 @@ void AparapiBuffer::inflateBoolean2D(JNIEnv *env, KernelArg* arg) {
       jbooleanArray jArray = (jbooleanArray)env->GetObjectArrayElement(buffer, i);
       jboolean* body = env->GetBooleanArrayElements(jArray,0);
       for(int j = 0; j < lens[1]; j++) {
-         body[j] = array[i*dims[0] + j];
+         body[j] = array[i*offsets[0] + j];
       }
       env->ReleaseBooleanArrayElements(jArray, body, 0);
    }
@@ -971,7 +990,7 @@ void AparapiBuffer::inflateByte2D(JNIEnv *env, KernelArg* arg) {
    #pragma omp parallel for
    for(int i = 0; i < lens[0]; i++) {
       for(int j = 0; j < lens[1]; j++) {
-         body[i][j] = array[i*dims[0] + j];
+         body[i][j] = array[i*offsets[0] + j];
       }
    }
 
@@ -986,7 +1005,7 @@ void AparapiBuffer::inflateByte2D(JNIEnv *env, KernelArg* arg) {
       jbyteArray jArray = (jbyteArray)env->GetObjectArrayElement(buffer, i);
       jbyte* body = env->GetByteArrayElements(jArray,0);
       for(int j = 0; j < lens[1]; j++) {
-         body[j] = array[i*dims[0] + j];
+         body[j] = array[i*offsets[0] + j];
       }
       env->ReleaseByteArrayElements(jArray, body, 0);
    }
@@ -1008,7 +1027,7 @@ void AparapiBuffer::inflateShort2D(JNIEnv *env, KernelArg* arg) {
    #pragma omp parallel for
    for(int i = 0; i < lens[0]; i++) {
       for(int j = 0; j < lens[1]; j++) {
-         body[i][j] = array[i*dims[0] + j];
+         body[i][j] = array[i*offsets[0] + j];
       }
    }
 
@@ -1023,7 +1042,7 @@ void AparapiBuffer::inflateShort2D(JNIEnv *env, KernelArg* arg) {
       jshortArray jArray = (jshortArray)env->GetObjectArrayElement(buffer, i);
       jshort* body = env->GetShortArrayElements(jArray,0);
       for(int j = 0; j < lens[1]; j++) {
-         body[j] = array[i*dims[0] + j];
+         body[j] = array[i*offsets[0] + j];
       }
       env->ReleaseShortArrayElements(jArray, body, 0);
    }
@@ -1045,7 +1064,7 @@ void AparapiBuffer::inflateInt2D(JNIEnv *env, KernelArg* arg) {
    #pragma omp parallel for
    for(int i = 0; i < lens[0]; i++) {
       for(int j = 0; j < lens[1]; j++) {
-         body[i][j] = array[i*dims[0] + j];
+         body[i][j] = array[i*offsets[0] + j];
       }
    }
 
@@ -1060,7 +1079,7 @@ void AparapiBuffer::inflateInt2D(JNIEnv *env, KernelArg* arg) {
       jintArray jArray = (jintArray)env->GetObjectArrayElement(buffer, i);
       jint* body = env->GetIntArrayElements(jArray,0);
       for(int j = 0; j < lens[1]; j++) {
-         body[j] = array[i*dims[0] + j];
+         body[j] = array[i*offsets[0] + j];
       }
       env->ReleaseIntArrayElements(jArray, body, 0);
    }
@@ -1082,7 +1101,7 @@ void AparapiBuffer::inflateLong2D(JNIEnv *env, KernelArg* arg) {
    #pragma omp parallel for
    for(int i = 0; i < lens[0]; i++) {
       for(int j = 0; j < lens[1]; j++) {
-         body[i][j] = array[i*dims[0] + j];
+         body[i][j] = array[i*offsets[0] + j];
       }
    }
 
@@ -1097,7 +1116,7 @@ void AparapiBuffer::inflateLong2D(JNIEnv *env, KernelArg* arg) {
       jlongArray jArray = (jlongArray)env->GetObjectArrayElement(buffer, i);
       jlong* body = env->GetLongArrayElements(jArray,0);
       for(int j = 0; j < lens[1]; j++) {
-         body[j] = array[i*dims[0] + j];
+         body[j] = array[i*offsets[0] + j];
       }
       env->ReleaseLongArrayElements(jArray, body, 0);
    }
@@ -1119,7 +1138,7 @@ void AparapiBuffer::inflateFloat2D(JNIEnv *env, KernelArg* arg) {
    #pragma omp parallel for
    for(int i = 0; i < lens[0]; i++) {
       for(int j = 0; j < lens[1]; j++) {
-         body[i][j] = array[i*dims[0] + j];
+         body[i][j] = array[i*offsets[0] + j];
       }
    }
 
@@ -1134,7 +1153,7 @@ void AparapiBuffer::inflateFloat2D(JNIEnv *env, KernelArg* arg) {
       jfloatArray jArray = (jfloatArray)env->GetObjectArrayElement(buffer, i);
       jfloat* body = env->GetFloatArrayElements(jArray,0);
       for(int j = 0; j < lens[1]; j++) {
-         body[j] = array[i*dims[0] + j];
+         body[j] = array[i*offsets[0] + j];
       }
       env->ReleaseFloatArrayElements(jArray, body, 0);
    }
@@ -1156,7 +1175,7 @@ void AparapiBuffer::inflateDouble2D(JNIEnv *env, KernelArg* arg) {
    #pragma omp parallel for
    for(int i = 0; i < lens[0]; i++) {
       for(int j = 0; j < lens[1]; j++) {
-         body[i][j] = array[i*dims[0] + j];
+         body[i][j] = array[i*offsets[0] + j];
       }
    }
 
@@ -1171,7 +1190,7 @@ void AparapiBuffer::inflateDouble2D(JNIEnv *env, KernelArg* arg) {
       jdoubleArray jArray = (jdoubleArray)env->GetObjectArrayElement(buffer, i);
       jdouble* body = env->GetDoubleArrayElements(jArray,0);
       for(int j = 0; j < lens[1]; j++) {
-         body[j] = array[i*dims[0] + j];
+         body[j] = array[i*offsets[0] + j];
       }
       env->ReleaseDoubleArrayElements(jArray, body, 0);
    }
@@ -1200,7 +1219,7 @@ void AparapiBuffer::inflateBoolean3D(JNIEnv *env, KernelArg* arg) {
    for(int i = 0; i < lens[0]; i++) {
       for(int j = 0; j < lens[1]; j++) {
          for(int k = 0; k < lens[2]; k++) {
-            body[i][j][k] = array[i*dims[0] + j*dims[1] + k];
+            body[i][j][k] = array[i*offsets[0] + j*offsets[1] + k];
          }
       }
    }
@@ -1222,7 +1241,7 @@ void AparapiBuffer::inflateBoolean3D(JNIEnv *env, KernelArg* arg) {
          jbooleanArray jArray = (jbooleanArray)env->GetObjectArrayElement(jrow, j);
          jboolean* body = env->GetBooleanArrayElements(jArray,0);
          for(int k = 0; k < lens[2]; k++) {
-            body[k] = array[i*dims[0] + j*dims[1] + k];
+            body[k] = array[i*offsets[0] + j*offsets[1] + k];
          }
          env->ReleaseBooleanArrayElements(jArray, body, 0);
       }
@@ -1251,7 +1270,7 @@ void AparapiBuffer::inflateByte3D(JNIEnv *env, KernelArg* arg) {
    for(int i = 0; i < lens[0]; i++) {
       for(int j = 0; j < lens[1]; j++) {
          for(int k = 0; k < lens[2]; k++) {
-            body[i][j][k] = array[i*dims[0] + j*dims[1] + k];
+            body[i][j][k] = array[i*offsets[0] + j*offsets[1] + k];
          }
       }
    }
@@ -1273,7 +1292,7 @@ void AparapiBuffer::inflateByte3D(JNIEnv *env, KernelArg* arg) {
          jbyteArray jArray = (jbyteArray)env->GetObjectArrayElement(jrow, j);
          jbyte* body = env->GetByteArrayElements(jArray,0);
          for(int k = 0; k < lens[2]; k++) {
-            body[k] = array[i*dims[0] + j*dims[1] + k];
+            body[k] = array[i*offsets[0] + j*offsets[1] + k];
          }
          env->ReleaseByteArrayElements(jArray, body, 0);
       }
@@ -1302,7 +1321,7 @@ void AparapiBuffer::inflateShort3D(JNIEnv *env, KernelArg* arg) {
    for(int i = 0; i < lens[0]; i++) {
       for(int j = 0; j < lens[1]; j++) {
          for(int k = 0; k < lens[2]; k++) {
-            body[i][j][k] = array[i*dims[0] + j*dims[1] + k];
+            body[i][j][k] = array[i*offsets[0] + j*offsets[1] + k];
          }
       }
    }
@@ -1324,7 +1343,7 @@ void AparapiBuffer::inflateShort3D(JNIEnv *env, KernelArg* arg) {
          jshortArray jArray = (jshortArray)env->GetObjectArrayElement(jrow, j);
          jshort* body = env->GetShortArrayElements(jArray,0);
          for(int k = 0; k < lens[2]; k++) {
-            body[k] = array[i*dims[0] + j*dims[1] + k];
+            body[k] = array[i*offsets[0] + j*offsets[1] + k];
          }
          env->ReleaseShortArrayElements(jArray, body, 0);
       }
@@ -1353,7 +1372,7 @@ void AparapiBuffer::inflateInt3D(JNIEnv *env, KernelArg* arg) {
    for(int i = 0; i < lens[0]; i++) {
       for(int j = 0; j < lens[1]; j++) {
          for(int k = 0; k < lens[2]; k++) {
-            body[i][j][k] = array[i*dims[0] + j*dims[1] + k];
+            body[i][j][k] = array[i*offsets[0] + j*offsets[1] + k];
          }
       }
    }
@@ -1375,7 +1394,7 @@ void AparapiBuffer::inflateInt3D(JNIEnv *env, KernelArg* arg) {
          jintArray jArray = (jintArray)env->GetObjectArrayElement(jrow, j);
          jint* body = env->GetIntArrayElements(jArray,0);
          for(int k = 0; k < lens[2]; k++) {
-            body[k] = array[i*dims[0] + j*dims[1] + k];
+            body[k] = array[i*offsets[0] + j*offsets[1] + k];
          }
          env->ReleaseIntArrayElements(jArray, body, 0);
       }
@@ -1404,7 +1423,7 @@ void AparapiBuffer::inflateLong3D(JNIEnv *env, KernelArg* arg) {
    for(int i = 0; i < lens[0]; i++) {
       for(int j = 0; j < lens[1]; j++) {
          for(int k = 0; k < lens[2]; k++) {
-            body[i][j][k] = array[i*dims[0] + j*dims[1] + k];
+            body[i][j][k] = array[i*offsets[0] + j*offsets[1] + k];
          }
       }
    }
@@ -1426,7 +1445,7 @@ void AparapiBuffer::inflateLong3D(JNIEnv *env, KernelArg* arg) {
          jlongArray jArray = (jlongArray)env->GetObjectArrayElement(jrow, j);
          jlong* body = env->GetLongArrayElements(jArray,0);
          for(int k = 0; k < lens[2]; k++) {
-            body[k] = array[i*dims[0] + j*dims[1] + k];
+            body[k] = array[i*offsets[0] + j*offsets[1] + k];
          }
          env->ReleaseLongArrayElements(jArray, body, 0);
       }
@@ -1455,7 +1474,7 @@ void AparapiBuffer::inflateFloat3D(JNIEnv *env, KernelArg* arg) {
    for(int i = 0; i < lens[0]; i++) {
       for(int j = 0; j < lens[1]; j++) {
          for(int k = 0; k < lens[2]; k++) {
-            body[i][j][k] = array[i*dims[0] + j*dims[1] + k];
+            body[i][j][k] = array[i*offsets[0] + j*offsets[1] + k];
          }
       }
    }
@@ -1477,7 +1496,7 @@ void AparapiBuffer::inflateFloat3D(JNIEnv *env, KernelArg* arg) {
          jfloatArray jArray = (jfloatArray)env->GetObjectArrayElement(jrow, j);
          jfloat* body = env->GetFloatArrayElements(jArray,0);
          for(int k = 0; k < lens[2]; k++) {
-            body[k] = array[i*dims[0] + j*dims[1] + k];
+            body[k] = array[i*offsets[0] + j*offsets[1] + k];
          }
          env->ReleaseFloatArrayElements(jArray, body, 0);
       }
@@ -1506,7 +1525,7 @@ void AparapiBuffer::inflateDouble3D(JNIEnv *env, KernelArg* arg) {
    for(int i = 0; i < lens[0]; i++) {
       for(int j = 0; j < lens[1]; j++) {
          for(int k = 0; k < lens[2]; k++) {
-            body[i][j][k] = array[i*dims[0] + j*dims[1] + k];
+            body[i][j][k] = array[i*offsets[0] + j*offsets[1] + k];
          }
       }
    }
@@ -1528,7 +1547,7 @@ void AparapiBuffer::inflateDouble3D(JNIEnv *env, KernelArg* arg) {
          jdoubleArray jArray = (jdoubleArray)env->GetObjectArrayElement(jrow, j);
          jdouble* body = env->GetDoubleArrayElements(jArray,0);
          for(int k = 0; k < lens[2]; k++) {
-            body[k] = array[i*dims[0] + j*dims[1] + k];
+            body[k] = array[i*offsets[0] + j*offsets[1] + k];
          }
          env->ReleaseDoubleArrayElements(jArray, body, 0);
       }
@@ -1537,7 +1556,7 @@ void AparapiBuffer::inflateDouble3D(JNIEnv *env, KernelArg* arg) {
 
 void AparapiBuffer::deleteBuffer(KernelArg* arg)
 {
-      delete[] dims;
+      delete[] offsets;
       delete[] lens;
    if(arg->isBoolean()) {
       delete[] (jboolean*)data;
diff --git a/com.amd.aparapi.jni/src/cpp/runKernel/AparapiBuffer.h b/com.amd.aparapi.jni/src/cpp/runKernel/AparapiBuffer.h
index b5a05d82b6d45e68034c8abeaa1b890d1cfb1437..e008909f2cb8e6425e421ae85eec9d3d30bbf562 100644
--- a/com.amd.aparapi.jni/src/cpp/runKernel/AparapiBuffer.h
+++ b/com.amd.aparapi.jni/src/cpp/runKernel/AparapiBuffer.h
@@ -70,10 +70,12 @@ private:
       return (type&com_amd_aparapi_internal_jni_KernelRunnerJNI_ARG_SHORT);
    }
 
+   void buildBuffer(void* _data, cl_uint* _dims, cl_uint _numDims, long _lengthInBytes, jobject _javaObject);
+
 public:
       jobject javaObject;       // The java array that this arg is mapped to 
-      cl_uint numDims;          // sizes of dimensions of the object (array lengths for ND arrays)
-      cl_uint* dims;            // sizes of offsets of the object (first element offset in ND arrays)
+      cl_uint numDims;          // number of dimensions of the ND array
+      cl_uint* offsets;         // per-dimension element strides into the flattened ND array
       cl_uint* lens;            // sizes of dimensions of the object (array lengths for ND arrays)
       jint lengthInBytes;       // bytes in the array or directBuf
       cl_mem mem;               // the opencl buffer 
@@ -87,25 +89,25 @@ public:
 
       void deleteBuffer(KernelArg* arg);
 
-      static AparapiBuffer* flatten(JNIEnv *env, jobject arg, int type);
-
-      static AparapiBuffer* flattenBoolean2D(JNIEnv *env, jobject arg);
-      static AparapiBuffer* flattenChar2D(JNIEnv *env, jobject arg);
-      static AparapiBuffer* flattenByte2D(JNIEnv *env, jobject arg);
-      static AparapiBuffer* flattenShort2D(JNIEnv *env, jobject arg);
-      static AparapiBuffer* flattenInt2D(JNIEnv *env, jobject arg);
-      static AparapiBuffer* flattenLong2D(JNIEnv *env, jobject arg);
-      static AparapiBuffer* flattenFloat2D(JNIEnv *env, jobject arg);
-      static AparapiBuffer* flattenDouble2D(JNIEnv *env, jobject arg);
-
-      static AparapiBuffer* flattenBoolean3D(JNIEnv *env, jobject arg);
-      static AparapiBuffer* flattenChar3D(JNIEnv *env, jobject arg);
-      static AparapiBuffer* flattenByte3D(JNIEnv *env, jobject arg);
-      static AparapiBuffer* flattenShort3D(JNIEnv *env, jobject arg);
-      static AparapiBuffer* flattenInt3D(JNIEnv *env, jobject arg);
-      static AparapiBuffer* flattenLong3D(JNIEnv *env, jobject arg);
-      static AparapiBuffer* flattenFloat3D(JNIEnv *env, jobject arg);
-      static AparapiBuffer* flattenDouble3D(JNIEnv *env, jobject arg);
+      void flatten(JNIEnv *env, KernelArg* arg);
+
+      void flattenBoolean2D(JNIEnv *env, KernelArg* arg);
+      void flattenChar2D(JNIEnv *env, KernelArg* arg);
+      void flattenByte2D(JNIEnv *env, KernelArg* arg);
+      void flattenShort2D(JNIEnv *env, KernelArg* arg);
+      void flattenInt2D(JNIEnv *env, KernelArg* arg);
+      void flattenLong2D(JNIEnv *env, KernelArg* arg);
+      void flattenFloat2D(JNIEnv *env, KernelArg* arg);
+      void flattenDouble2D(JNIEnv *env, KernelArg* arg);
+
+      void flattenBoolean3D(JNIEnv *env, KernelArg* arg);
+      void flattenChar3D(JNIEnv *env, KernelArg* arg);
+      void flattenByte3D(JNIEnv *env, KernelArg* arg);
+      void flattenShort3D(JNIEnv *env, KernelArg* arg);
+      void flattenInt3D(JNIEnv *env, KernelArg* arg);
+      void flattenLong3D(JNIEnv *env, KernelArg* arg);
+      void flattenFloat3D(JNIEnv *env, KernelArg* arg);
+      void flattenDouble3D(JNIEnv *env, KernelArg* arg);
 
       void inflate(JNIEnv *env, KernelArg* arg);
 
diff --git a/com.amd.aparapi.jni/src/cpp/runKernel/JNIContext.cpp b/com.amd.aparapi.jni/src/cpp/runKernel/JNIContext.cpp
index 1b13151963aa66872aaefb2e630b808a43837129..8a9d1f229074500246d62611068bf1841d0808c1 100644
--- a/com.amd.aparapi.jni/src/cpp/runKernel/JNIContext.cpp
+++ b/com.amd.aparapi.jni/src/cpp/runKernel/JNIContext.cpp
@@ -21,8 +21,7 @@ JNIContext::JNIContext(JNIEnv *jenv, jobject _kernelObject, jobject _openCLDevic
    deviceId = OpenCLDevice::getDeviceId(jenv, openCLDeviceObject);
    cl_device_type returnedDeviceType;
    clGetDeviceInfo(deviceId, CL_DEVICE_TYPE,  sizeof(returnedDeviceType), &returnedDeviceType, NULL);
-   //fprintf(stderr, "device[%d] CL_DEVICE_TYPE = %x\n", deviceId, returnedDeviceType);
-
+   //fprintf(stderr, "device[%p] CL_DEVICE_TYPE = %x\n", deviceId, returnedDeviceType);
 
    cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platformId, 0 };
    cl_context_properties* cprops = (NULL == platformId) ? NULL : cps;
@@ -34,7 +33,7 @@ JNIContext::JNIContext(JNIEnv *jenv, jobject _kernelObject, jobject _openCLDevic
 }
 
 void JNIContext::dispose(JNIEnv *jenv, Config* config) {
-   //fprintf(stdout, "dispose()\n");
+   //fprintf(stderr, "dispose()\n");
    cl_int status = CL_SUCCESS;
    jenv->DeleteGlobalRef(kernelObject);
    jenv->DeleteGlobalRef(kernelClass);
@@ -69,21 +68,41 @@ void JNIContext::dispose(JNIEnv *jenv, Config* config) {
       for (int i=0; i< argc; i++){
          KernelArg *arg = args[i];
          if (!arg->isPrimitive()){
-            if (arg->arrayBuffer != NULL){
-               if (arg->arrayBuffer->mem != 0){
-                  if (config->isTrackingOpenCLResources()){
-                     memList.remove((cl_mem)arg->arrayBuffer->mem, __LINE__, __FILE__);
+            if (arg->isArray()) {
+               if (arg->arrayBuffer != NULL){
+                  if (arg->arrayBuffer->mem != 0){
+                     if (config->isTrackingOpenCLResources()){
+                        memList.remove((cl_mem)arg->arrayBuffer->mem, __LINE__, __FILE__);
+                     }
+                     status = clReleaseMemObject((cl_mem)arg->arrayBuffer->mem);
+                     //fprintf(stdout, "dispose arg %d %0lx\n", i, arg->arrayBuffer->mem);
+                     CLException::checkCLError(status, "clReleaseMemObject()");
+                     arg->arrayBuffer->mem = (cl_mem)0;
+                  }
+                  if (arg->arrayBuffer->javaArray != NULL)  {
+                     jenv->DeleteWeakGlobalRef((jweak) arg->arrayBuffer->javaArray);
                   }
-                  status = clReleaseMemObject((cl_mem)arg->arrayBuffer->mem);
-                  //fprintf(stdout, "dispose arg %d %0lx\n", i, arg->arrayBuffer->mem);
-                  CLException::checkCLError(status, "clReleaseMemObject()");
-                  arg->arrayBuffer->mem = (cl_mem)0;
+                  delete arg->arrayBuffer;
+                  arg->arrayBuffer = NULL;
                }
-               if (arg->arrayBuffer->javaArray != NULL)  {
-                  jenv->DeleteWeakGlobalRef((jweak) arg->arrayBuffer->javaArray);
+            } else if (arg->isAparapiBuffer()) {
+               if (arg->aparapiBuffer != NULL){
+                  if (arg->aparapiBuffer->mem != 0){
+                     if (config->isTrackingOpenCLResources()){
+                        memList.remove((cl_mem)arg->aparapiBuffer->mem, __LINE__, __FILE__);
+                     }
+                     status = clReleaseMemObject((cl_mem)arg->aparapiBuffer->mem);
+                     //fprintf(stdout, "dispose arg %d %0lx\n", i, arg->aparapiBuffer->mem);
+                     CLException::checkCLError(status, "clReleaseMemObject()");
+                     arg->aparapiBuffer->mem = (cl_mem)0;
+                  }
+                  if (arg->aparapiBuffer->javaObject != NULL)  {
+                     jenv->DeleteWeakGlobalRef((jweak) arg->aparapiBuffer->javaObject); 
+                  }
+                  delete arg->aparapiBuffer;
+                  arg->aparapiBuffer = NULL;
                }
-               delete arg->arrayBuffer;
-               arg->arrayBuffer = NULL;
+
             }
          }
          if (arg->name != NULL){
@@ -93,7 +112,7 @@ void JNIContext::dispose(JNIEnv *jenv, Config* config) {
             jenv->DeleteGlobalRef((jobject) arg->javaArg);
          }
          delete arg; arg=args[i]=NULL;
-      }
+      } // for
       delete[] args; args=NULL;
 
       // do we need to call clReleaseEvent on any of these that are still retained....
diff --git a/com.amd.aparapi.jni/src/cpp/runKernel/KernelArg.cpp b/com.amd.aparapi.jni/src/cpp/runKernel/KernelArg.cpp
index 0f401ceae60180d26b444f5bbf6abdf39006c0a8..da77e9e151cd306fb5060e2fb6eefedb8b9e6b13 100644
--- a/com.amd.aparapi.jni/src/cpp/runKernel/KernelArg.cpp
+++ b/com.amd.aparapi.jni/src/cpp/runKernel/KernelArg.cpp
@@ -35,7 +35,7 @@ KernelArg::KernelArg(JNIEnv *jenv, JNIContext *jniContext, jobject argObj):
       if (isArray()){
          arrayBuffer = new ArrayBuffer();
       } else if(isAparapiBuffer()) {
-         aparapiBuffer = AparapiBuffer::flatten(jenv, argObj, type);
+         aparapiBuffer = new AparapiBuffer();
       }
    }
 
diff --git a/com.amd.aparapi.jni/src/cpp/runKernel/KernelArg.h b/com.amd.aparapi.jni/src/cpp/runKernel/KernelArg.h
index a54e411e90dc04c4f0ea50940ea4a12eef94aa7e..c61cc6fa8d75d4adea41ca000ab45aba8d6d7e6e 100644
--- a/com.amd.aparapi.jni/src/cpp/runKernel/KernelArg.h
+++ b/com.amd.aparapi.jni/src/cpp/runKernel/KernelArg.h
@@ -64,7 +64,7 @@ class KernelArg{
       static jfieldID javaArrayFieldID; 
    public:
       JNIContext *jniContext;  
-      jobject argObj;    // the Java KernelRunner.KernelArg object that we are mirroring.
+      jobject argObj;    // the Java KernelRunner.KernelArg object that we are mirroring. Do not use it outside constructor due to GC. Use javaArg instead.
       jobject javaArg;   // global reference to the corresponding java KernelArg object we grabbed our own global reference so that the object won't be collected until we dispose!
       char *name;        // used for debugging printfs
       jint type;         // a bit mask determining the type of this arg
diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java
index f8af5bc2f85a596bae8693e30b6590e6c79651b0..9bb09c29f9703dc22cda1de09f451496d7546cb6 100644
--- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java
+++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java
@@ -986,6 +986,34 @@ public class KernelRunner extends KernelRunnerJNI{
 
                arg.setArray(newArrayRef);
                assert arg.getArray() != null : "null array ref";
+            } else if ((arg.getType() & ARG_APARAPI_BUFFER) != 0) {
+               // TODO: check if the 2D/3D array is changed. 
+               //   can Arrays.equals help?
+               needsSync = true; // Always needs sync
+               Object buffer = new Object();
+               try {
+                  buffer = arg.getField().get(kernel);
+               } catch (IllegalAccessException e) {
+                  e.printStackTrace();
+               }
+               int numDims = arg.getNumDims();
+               Object subBuffer = buffer;
+               int[] dims = new int[numDims];
+               for (int d = 0; d < numDims - 1; d++) {
+                  dims[d] = Array.getLength(subBuffer);
+                  subBuffer = Array.get(subBuffer, 0);
+               }
+               dims[numDims - 1] = Array.getLength(subBuffer);
+               arg.setDims(dims);
+
+               int primitiveSize = getPrimitiveSize(arg.getType());
+               int totalElements = 1;
+               for (int d = 0; d < numDims; d++) {
+                  totalElements *= dims[d];
+               }
+               arg.setJavaBuffer(buffer);
+               arg.setSizeInBytes(totalElements * primitiveSize);
+               arg.setArray(buffer);
             }
          } catch (final IllegalArgumentException e) {
             e.printStackTrace();
@@ -1619,53 +1647,32 @@ public class KernelRunner extends KernelRunnerJNI{
       while (type.getName().charAt(numDims) == '[') {
          numDims++;
       }
-      Object buffer = new Object();
-      try {
-         buffer = arg.getField().get(kernel);
-      } catch (IllegalAccessException e) {
-         e.printStackTrace();
-      }
-      arg.setJavaBuffer(buffer);
       arg.setNumDims(numDims);
-      Object subBuffer = buffer;
-      int[] dims = new int[numDims];
-      for (int i = 0; i < numDims - 1; i++) {
-         dims[i] = Array.getLength(subBuffer);
-         subBuffer = Array.get(subBuffer, 0);
+      arg.setJavaBuffer(null); // will get updated in updateKernelArrayRefs
+      arg.setArray(null); // will get updated in updateKernelArrayRefs
+
+      Class<?> elementType = arg.getField().getType();
+      while (elementType.isArray()) {
+         elementType = elementType.getComponentType();
       }
-      dims[numDims - 1] = Array.getLength(subBuffer);
-      arg.setDims(dims);
 
-      if (subBuffer.getClass().isAssignableFrom(float[].class)) {
+      if (elementType.isAssignableFrom(float.class)) {
          arg.setType(arg.getType() | ARG_FLOAT);
-      }
-      if (subBuffer.getClass().isAssignableFrom(int[].class)) {
+      } else if (elementType.isAssignableFrom(int.class)) {
          arg.setType(arg.getType() | ARG_INT);
-      }
-      if (subBuffer.getClass().isAssignableFrom(boolean[].class)) {
+      } else if (elementType.isAssignableFrom(boolean.class)) {
          arg.setType(arg.getType() | ARG_BOOLEAN);
-      }
-      if (subBuffer.getClass().isAssignableFrom(byte[].class)) {
+      } else if (elementType.isAssignableFrom(byte.class)) {
          arg.setType(arg.getType() | ARG_BYTE);
-      }
-      if (subBuffer.getClass().isAssignableFrom(char[].class)) {
+      } else if (elementType.isAssignableFrom(char.class)) {
          arg.setType(arg.getType() | ARG_CHAR);
-      }
-      if (subBuffer.getClass().isAssignableFrom(double[].class)) {
+      } else if (elementType.isAssignableFrom(double.class)) {
          arg.setType(arg.getType() | ARG_DOUBLE);
-      }
-      if (subBuffer.getClass().isAssignableFrom(long[].class)) {
+      } else if (elementType.isAssignableFrom(long.class)) {
          arg.setType(arg.getType() | ARG_LONG);
-      }
-      if (subBuffer.getClass().isAssignableFrom(short[].class)) {
+      } else if (elementType.isAssignableFrom(short.class)) {
          arg.setType(arg.getType() | ARG_SHORT);
       }
-      int primitiveSize = getPrimitiveSize(arg.getType());
-      int totalElements = 1;
-      for (int i = 0; i < numDims; i++) {
-         totalElements *= dims[i];
-      }
-      arg.setSizeInBytes(totalElements * primitiveSize);
    }
 
    private final Set<Object> puts = new HashSet<Object>();