diff --git a/com.amd.aparapi.jni/.cproject b/com.amd.aparapi.jni/.cproject.example
similarity index 100%
rename from com.amd.aparapi.jni/.cproject
rename to com.amd.aparapi.jni/.cproject.example
diff --git a/com.amd.aparapi.jni/build.xml b/com.amd.aparapi.jni/build.xml
index 1e4912b85195f993784a3ba96e6b5ff18c0a5e96..44516cc957ae0ca7e14b0c1b359f92cd273c7c74 100644
--- a/com.amd.aparapi.jni/build.xml
+++ b/com.amd.aparapi.jni/build.xml
@@ -479,6 +479,8 @@ First consider editing the properties in build.properties
       <delete file="ProfileInfo.o" />
       <delete file="ArrayBuffer.obj" />
       <delete file="ArrayBuffer.o" />
+      <delete file="AparapiBuffer.obj" />
+      <delete file="AparapiBuffer.o" />
       <delete file="JNIHelper.obj" />
       <delete file="JNIHelper.o" />
       <delete file="CLHelper.obj" />
@@ -540,6 +542,7 @@ First consider editing the properties in build.properties
          <arg value="${basedir}/dist/libaparapi_${x86_or_x86_64}.so" />
          <arg value="src/cpp/runKernel/Aparapi.cpp" />
          <arg value="src/cpp/runKernel/ArrayBuffer.cpp" />
+         <arg value="src/cpp/runKernel/AparapiBuffer.cpp" />
          <arg value="src/cpp/runKernel/Config.cpp" />
          <arg value="src/cpp/runKernel/JNIContext.cpp" />
          <arg value="src/cpp/runKernel/KernelArg.cpp" />
@@ -564,6 +567,7 @@ First consider editing the properties in build.properties
          <arg value="-O3" />
          <arg value="-g" />
          <arg value="-fPIC" />
+         <arg value="-fopenmp"/>
          <arg value="-DCL_USE_DEPRECATED_OPENCL_1_1_APIS"/>
          <arg value="-I/System/Library/Frameworks/JavaVM.framework/Headers" />
          <arg value="-Iinclude" />
@@ -575,6 +579,7 @@ First consider editing the properties in build.properties
          <arg value="${basedir}/dist/libaparapi_${x86_or_x86_64}.dylib" />
          <arg value="src/cpp/runKernel/Aparapi.cpp" />
          <arg value="src/cpp/runKernel/ArrayBuffer.cpp" />
+         <arg value="src/cpp/runKernel/AparapiBuffer.cpp" />
          <arg value="src/cpp/runKernel/Config.cpp" />
          <arg value="src/cpp/runKernel/JNIContext.cpp" />
          <arg value="src/cpp/runKernel/KernelArg.cpp" />
@@ -601,6 +606,7 @@ First consider editing the properties in build.properties
          <arg value="/nologo" />
          <arg value="/TP" />
          <arg value="/Ox" />
+         <arg value="/openmp" />
          <arg value="-DCL_USE_DEPRECATED_OPENCL_1_1_APIS"/>
          <arg value="/EHsc" />
          <arg value="/I${msvc.dir}\vc\include" />
@@ -614,6 +620,7 @@ First consider editing the properties in build.properties
          <arg value="/Isrc/cpp/invoke" />
          <arg value="src/cpp/runKernel/Aparapi.cpp" />
          <arg value="src/cpp/runKernel/ArrayBuffer.cpp" />
+         <arg value="src/cpp/runKernel/AparapiBuffer.cpp" />
          <arg value="src/cpp/runKernel/Config.cpp" />
          <arg value="src/cpp/runKernel/JNIContext.cpp" />
          <arg value="src/cpp/runKernel/KernelArg.cpp" />
@@ -657,6 +664,7 @@ First consider editing the properties in build.properties
          <arg value="${basedir}\dist\aparapi_${x86_or_x86_64}.dll" />
          <arg value="src/cpp/runKernel/Aparapi.cpp" />
          <arg value="src/cpp/runKernel/ArrayBuffer.cpp" />
+         <arg value="src/cpp/runKernel/AparapiBuffer.cpp" />
          <arg value="src/cpp/runKernel/Config.cpp" />
          <arg value="src/cpp/runKernel/JNIContext.cpp" />
          <arg value="src/cpp/runKernel/KernelArg.cpp" />
diff --git a/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp b/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp
index 142fb19c8d2a690517dbb79bbb4da5af07fb3d48..277cbe94b4a738a72233e86391bfc19e28e11d2c 100644
--- a/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp
+++ b/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp
@@ -45,6 +45,7 @@
 #include "Config.h"
 #include "ProfileInfo.h"
 #include "ArrayBuffer.h"
+#include "AparapiBuffer.h"
 #include "CLHelper.h"
 #include "List.h"
 #include <algorithm>
@@ -52,7 +53,8 @@
 
 //compiler dependant code
 /**
- * calls either clEnqueueMarker or clEnqueueMarkerWithWaitList depending on the version of OpenCL installed.
+ * calls either clEnqueueMarker or clEnqueueMarkerWithWaitList 
+ * depending on the version of OpenCL installed.
  * conveiniece function so we don't have to have #ifdefs all over the code
  */
 int enqueueMarker(cl_command_queue commandQueue, cl_event* firstEvent) {
@@ -240,7 +242,10 @@ jint updateNonPrimitiveReferences(JNIEnv *jenv, jobject jobj, JNIContext* jniCon
          if (config->isVerbose()){
             fprintf(stderr, "got type for %s: %08x\n", arg->name, arg->type);
          }
-         if (!arg->isPrimitive()) {
+
+         //this won't be a problem with the aparapi buffers because
+         //we need to copy them every time anyway
+         if (!arg->isPrimitive() && !arg->isAparapiBuffer()) {
             // Following used for all primitive arrays, object arrays and nio Buffers
             jarray newRef = (jarray)jenv->GetObjectField(arg->javaArg, KernelArg::javaArrayFieldID);
             if (config->isVerbose()){
@@ -326,19 +331,8 @@ void profileFirstRun(JNIContext* jniContext) {
    }
 }
 
-/**
- * create a new variable in OpenCL for the object, and sets the kernel arguement.
- * currently the only objects supported are arrays.
- *
- * @param jenv the java environment
- * @param jniContext the context we got from java
- * @param arg the argument we're passing to opencl
- * @param argPos out: the position of arg in the opencl argument list
- * @param argIdx the position of arg in the argument array
- *
- * @throws CLException
- */
-void updateObject(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx) {
+
+void updateArray(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx) {
 
    cl_int status = CL_SUCCESS;
    // if either this is the first run or user changed input array
@@ -373,6 +367,7 @@ void updateObject(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& arg
 
    // Add the array length if needed
    if (arg->usesArrayLength()) {
+      argPos++;
       arg->syncJavaArrayLength(jenv);
 
       status = clSetKernelArg(jniContext->kernel, argPos, sizeof(jint), &(arg->arrayBuffer->length));
@@ -381,10 +376,52 @@ void updateObject(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& arg
       if (config->isVerbose()){
          fprintf(stderr, "runKernel arg %d %s, length = %d\n", argIdx, arg->name, arg->arrayBuffer->length);
       }
-      argPos++;
    }
 }
 
+void updateBuffer(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx) {
+
+   AparapiBuffer* buffer = arg->aparapiBuffer;
+   cl_int status = CL_SUCCESS;
+   cl_uint mask = CL_MEM_USE_HOST_PTR;
+   if (arg->isReadByKernel() && arg->isMutableByKernel()) mask |= CL_MEM_READ_WRITE;
+   else if (arg->isReadByKernel() && !arg->isMutableByKernel()) mask |= CL_MEM_READ_ONLY;
+   else if (arg->isMutableByKernel()) mask |= CL_MEM_WRITE_ONLY;
+   buffer->memMask = mask;
+
+   buffer->mem = clCreateBuffer(jniContext->context, buffer->memMask, 
+         buffer->lengthInBytes, buffer->data, &status);
+
+   if(status != CL_SUCCESS) throw CLException(status,"clCreateBuffer");
+
+   if (config->isTrackingOpenCLResources()){
+      memList.add(buffer->mem, __LINE__, __FILE__);
+   }
+
+   status = clSetKernelArg(jniContext->kernel, argPos, sizeof(cl_mem), (void *)&(buffer->mem));
+   if(status != CL_SUCCESS) throw CLException(status,"clSetKernelArg (buffer)");
+
+   // Add the array length if needed
+   if (arg->usesArrayLength()) {
+
+      for(int i = 0; i < buffer->numDims; i++) {
+         argPos++;
+         status = clSetKernelArg(jniContext->kernel, argPos, sizeof(cl_uint), &(buffer->lens[i]));
+         if(status != CL_SUCCESS) throw CLException(status,"clSetKernelArg (buffer length)");
+         if (config->isVerbose()){
+            fprintf(stderr, "runKernel arg %d %s, length = %d\n", argIdx, arg->name, buffer->lens[i]);
+         }
+         argPos++;
+         status = clSetKernelArg(jniContext->kernel, argPos, sizeof(cl_uint), &(buffer->dims[i]));
+         if(status != CL_SUCCESS) throw CLException(status,"clSetKernelArg (buffer dimension)");
+         if (config->isVerbose()){
+            fprintf(stderr, "runKernel arg %d %s, dim = %d\n", argIdx, arg->name, buffer->dims[i]);
+         }
+      }
+   }
+}
+
+
 /**
  * manages the memory of KernelArgs that are object.  i.e. handels pinning, and moved objects.
  * currently the only objects supported are arrays.
@@ -398,6 +435,14 @@ void updateObject(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& arg
  * @throws CLException
  */
 void processObject(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx) {
+    if(arg->isArray()) {
+       processArray(jenv, jniContext, arg, argPos, argIdx);
+    } else if(arg->isAparapiBuffer()) {
+       processBuffer(jenv, jniContext, arg, argPos, argIdx);
+    }
+}
+
+void processArray(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx) {
 
    cl_int status = CL_SUCCESS;
 
@@ -452,7 +497,7 @@ void processObject(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& ar
          arg->arrayBuffer->mem = (cl_mem)0;
       }
 
-      updateObject(jenv, jniContext, arg, argPos, argIdx);
+      updateArray(jenv, jniContext, arg, argPos, argIdx);
 
    } else {
       // Keep the arg position in sync if no updates were required
@@ -463,6 +508,50 @@ void processObject(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& ar
 
 }
 
+void processBuffer(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx) {
+
+   cl_int status = CL_SUCCESS;
+
+   if (config->isProfilingEnabled()){
+      arg->aparapiBuffer->read.valid = false;
+      arg->aparapiBuffer->write.valid = false;
+   }
+
+   if (config->isVerbose()) {
+      fprintf(stderr, "runKernel: arrayOrBuf addr=%p, ref.mem=%p\n",
+            arg->aparapiBuffer->data,
+            arg->aparapiBuffer->mem);
+      fprintf(stderr, "at memory addr %p, contents: ", arg->aparapiBuffer->data);
+      unsigned char *pb = (unsigned char *) arg->aparapiBuffer->data;
+      for (int k=0; k<8; k++) {
+         fprintf(stderr, "%02x ", pb[k]);
+      }
+      fprintf(stderr, "\n" );
+   }
+
+   if (config->isVerbose()){
+      if (arg->isExplicit() && arg->isExplicitWrite()){
+         fprintf(stderr, "explicit write of %s\n",  arg->name);
+      }
+   }
+
+   if (arg->aparapiBuffer->mem != 0) {
+      if (config->isTrackingOpenCLResources()) {
+         memList.remove((cl_mem)arg->aparapiBuffer->mem, __LINE__, __FILE__);
+      }
+      status = clReleaseMemObject((cl_mem)arg->aparapiBuffer->mem);
+      //fprintf(stdout, "dispose arg %d %0lx\n", i, arg->aparapiBuffer->mem);
+
+      //this needs to be reported, but we can still keep going
+      CLException::checkCLError(status, "clReleaseMemObject()");
+
+      arg->aparapiBuffer->mem = (cl_mem)0;
+   }
+
+   updateBuffer(jenv, jniContext, arg, argPos, argIdx);
+
+}
+
 
 /**
  * keeps track of write events for KernelArgs.
@@ -487,8 +576,13 @@ void updateWriteEvents(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int
       jniContext->writeEventArgs[writeEventCount] = argIdx;
    }
 
-   status = clEnqueueWriteBuffer(jniContext->commandQueue, arg->arrayBuffer->mem, CL_FALSE, 0, 
+   if(arg->isArray()) {
+      status = clEnqueueWriteBuffer(jniContext->commandQueue, arg->arrayBuffer->mem, CL_FALSE, 0, 
          arg->arrayBuffer->lengthInBytes, arg->arrayBuffer->addr, 0, NULL, &(jniContext->writeEvents[writeEventCount]));
+   } else if(arg->isAparapiBuffer()) {
+      status = clEnqueueWriteBuffer(jniContext->commandQueue, arg->aparapiBuffer->mem, CL_FALSE, 0, 
+         arg->aparapiBuffer->lengthInBytes, arg->aparapiBuffer->data, 0, NULL, &(jniContext->writeEvents[writeEventCount]));
+   }
    if(status != CL_SUCCESS) throw CLException(status,"clEnqueueWriteBuffer");
 
    if (config->isTrackingOpenCLResources()){
@@ -503,6 +597,7 @@ void updateWriteEvents(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int
    }
 }
 
+
 /**
  * sets the opencl kernel arguement for local args.
  *
@@ -514,7 +609,7 @@ void updateWriteEvents(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int
  *
  * @throws CLException
  */
-void processLocal(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx) {
+void processLocalArray(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx) {
 
    cl_int status = CL_SUCCESS;
    // what if local buffer size has changed?  We need a check for resize here.
@@ -534,7 +629,6 @@ void processLocal(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& arg
 
          if(status != CL_SUCCESS) throw CLException(status,"clSetKernelArg (array length)");
 
-         argPos++;
       }
    } else {
       // Keep the arg position in sync if no updates were required
@@ -544,6 +638,53 @@ void processLocal(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& arg
    }
 }
 
+/**
+ * sets the opencl kernel arguement for local args.
+ *
+ * @param jenv the java envrionment
+ * @param jniContext the context we got from java
+ * @param arg the KernelArg to create a write event for
+ * @param argPos out: the position of arg in the opencl argument list
+ * @param argIdx the position of arg in the argument array
+ *
+ * @throws CLException
+ */
+void processLocalBuffer(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx) {
+
+   cl_int status = CL_SUCCESS;
+   // what if local buffer size has changed?  We need a check for resize here.
+   if (jniContext->firstRun) {
+      status = arg->setLocalBufferArg(jenv, argIdx, argPos, config->isVerbose());
+      if(status != CL_SUCCESS) throw CLException(status,"clSetKernelArg() (local)");
+
+      // Add the array length if needed
+      if (arg->usesArrayLength()) {
+         arg->syncJavaArrayLength(jenv);
+
+         for(int i = 0; i < arg->aparapiBuffer->numDims; i++)
+         {
+             int length = arg->aparapiBuffer->lens[i];
+             status = clSetKernelArg(jniContext->kernel, argPos, sizeof(jint), &length);
+             if (config->isVerbose()){
+                fprintf(stderr, "runKernel arg %d %s, javaArrayLength = %d\n", argIdx, arg->name, length);
+             }
+             if(status != CL_SUCCESS) throw CLException(status,"clSetKernelArg (array length)");
+         }
+      }
+   } else {
+      // Keep the arg position in sync if no updates were required
+      if (arg->usesArrayLength()) {
+         argPos += arg->aparapiBuffer->numDims;
+      }
+   }
+}
+
+void processLocal(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx) {
+   if(arg->isArray()) processLocalArray(jenv,jniContext,arg,argPos,argIdx);
+   if(arg->isAparapiBuffer()) processLocalBuffer(jenv,jniContext,arg,argPos,argIdx);
+}
+
+
 /**
  * processes all of the arguments for the OpenCL Kernel that we got from the JNIContext
  *
@@ -755,7 +896,7 @@ void enqueueKernel(JNIContext* jniContext, Range& range, int passes, int argPos,
  *
  * @throws CLException
  */
-int getReadEvents(JNIContext* jniContext) {
+int getReadEvents(JNIEnv* jenv, JNIContext* jniContext) {
 
    int readEventCount = 0; 
 
@@ -774,8 +915,17 @@ int getReadEvents(JNIContext* jniContext) {
             fprintf(stderr, "reading buffer %d %s\n", i, arg->name);
          }
 
-         status = clEnqueueReadBuffer(jniContext->commandQueue, arg->arrayBuffer->mem, CL_FALSE, 0, 
-               arg->arrayBuffer->lengthInBytes,arg->arrayBuffer->addr , 1, jniContext->executeEvents, &(jniContext->readEvents[readEventCount]));
+         if(arg->isArray()) {
+            status = clEnqueueReadBuffer(jniContext->commandQueue, arg->arrayBuffer->mem, 
+                CL_FALSE, 0, arg->arrayBuffer->lengthInBytes, arg->arrayBuffer->addr, 1, 
+                jniContext->executeEvents, &(jniContext->readEvents[readEventCount]));
+         } else if(arg->isAparapiBuffer()) {
+            status = clEnqueueReadBuffer(jniContext->commandQueue, arg->aparapiBuffer->mem, 
+                CL_TRUE, 0, arg->aparapiBuffer->lengthInBytes, arg->aparapiBuffer->data, 1, 
+                jniContext->executeEvents, &(jniContext->readEvents[readEventCount]));
+            arg->aparapiBuffer->inflate(jenv, arg);
+         }
+
          if (status != CL_SUCCESS) throw CLException(status, "clEnqueueReadBuffer()");
 
          if (config->isTrackingOpenCLResources()){
@@ -932,7 +1082,7 @@ JNI_JAVA(jint, KernelRunnerJNI, runKernelJNI)
          int writeEventCount = 0;
          processArgs(jenv, jniContext, argPos, writeEventCount);
          enqueueKernel(jniContext, range, passes, argPos, writeEventCount);
-         int readEventCount = getReadEvents(jniContext);
+         int readEventCount = getReadEvents(jenv, jniContext);
          waitForReadEvents(jniContext, readEventCount, passes);
          checkEvents(jenv, jniContext, writeEventCount);
       }
@@ -1144,7 +1294,7 @@ KernelArg* getArgForBuffer(JNIEnv* jenv, JNIContext* jniContext, jobject buffer)
    if (jniContext != NULL){
       for (jint i = 0; returnArg == NULL && i < jniContext->argc; i++){ 
          KernelArg *arg = jniContext->args[i];
-         if (arg->isArray()){
+         if (arg->isArray()) {
             jboolean isSame = jenv->IsSameObject(buffer, arg->arrayBuffer->javaArray);
             if (isSame){
                if (config->isVerbose()){
@@ -1156,6 +1306,18 @@ KernelArg* getArgForBuffer(JNIEnv* jenv, JNIContext* jniContext, jobject buffer)
                   fprintf(stderr, "unmatched arg '%s'\n", arg->name);
                }
             }
+         } else if(arg->isAparapiBuffer()) {
+            jboolean isSame = jenv->IsSameObject(buffer, arg->aparapiBuffer->getJavaObject(jenv,arg));
+            if (isSame) {
+               if (config->isVerbose()) {
+                  fprintf(stderr, "matched arg '%s'\n", arg->name);
+               }
+               returnArg = arg;
+            } else {
+               if (config->isVerbose()) {
+                  fprintf(stderr, "unmatched arg '%s'\n", arg->name);
+               }
+            }
          }
       }
       if (returnArg == NULL){
@@ -1181,44 +1343,79 @@ JNI_JAVA(jint, KernelRunnerJNI, getJNI)
             if (config->isVerbose()){
                fprintf(stderr, "explicitly reading buffer %s\n", arg->name);
             }
-            arg->pin(jenv);
-
-            try {
-               status = clEnqueueReadBuffer(jniContext->commandQueue, arg->arrayBuffer->mem, 
-                                            CL_FALSE, 0, 
-                                            arg->arrayBuffer->lengthInBytes,
-                                            arg->arrayBuffer->addr , 0, NULL, 
-                                            &jniContext->readEvents[0]);
-               if (config->isVerbose()){
-                  fprintf(stderr, "explicitly read %s ptr=%lx len=%d\n", 
-                          arg->name, (unsigned long)arg->arrayBuffer->addr, 
-                          arg->arrayBuffer->lengthInBytes );
-               }
-               if (status != CL_SUCCESS) throw CLException(status, "clEnqueueReadBuffer()");
+            if(arg->isArray()) {
+               arg->pin(jenv);
+
+               try {
+                  status = clEnqueueReadBuffer(jniContext->commandQueue, arg->arrayBuffer->mem, 
+                                               CL_FALSE, 0, 
+                                               arg->arrayBuffer->lengthInBytes,
+                                               arg->arrayBuffer->addr , 0, NULL, 
+                                               &jniContext->readEvents[0]);
+                  if (config->isVerbose()){
+                     fprintf(stderr, "explicitly read %s ptr=%lx len=%d\n", 
+                             arg->name, (unsigned long)arg->arrayBuffer->addr, 
+                             arg->arrayBuffer->lengthInBytes );
+                  }
+                  if (status != CL_SUCCESS) throw CLException(status, "clEnqueueReadBuffer()");
+
+                  status = clWaitForEvents(1, jniContext->readEvents);
+                  if (status != CL_SUCCESS) throw CLException(status, "clWaitForEvents");
+
+                  if (config->isProfilingEnabled()) {
+                     status = profile(&arg->arrayBuffer->read, &jniContext->readEvents[0], 0,
+                                      arg->name, jniContext->profileBaseTime);
+                     if (status != CL_SUCCESS) throw CLException(status, "profile ");
+                  }
 
-               status = clWaitForEvents(1, jniContext->readEvents);
-               if (status != CL_SUCCESS) throw CLException(status, "clWaitForEvents");
+                  status = clReleaseEvent(jniContext->readEvents[0]);
+                  if (status != CL_SUCCESS) throw CLException(status, "clReleaseEvent() read event");
 
-               if (config->isProfilingEnabled()) {
-                  status = profile(&arg->arrayBuffer->read, &jniContext->readEvents[0], 0,
-                                   arg->name, jniContext->profileBaseTime);
-                  if (status != CL_SUCCESS) throw CLException(status, "profile "); 
+                  // since this is an explicit buffer get, 
+                  // we expect the buffer to have changed so we commit
+                  arg->unpin(jenv); // was unpinCommit
+
+               //something went wrong print the error and exit
+               } catch(CLException& cle) {
+                  cle.printError();
+                  return status;
                }
+            } else if(arg->isAparapiBuffer()) {
+
+               try {
+                  status = clEnqueueReadBuffer(jniContext->commandQueue, arg->aparapiBuffer->mem, 
+                                               CL_FALSE, 0, 
+                                               arg->aparapiBuffer->lengthInBytes,
+                                               arg->aparapiBuffer->data, 0, NULL, 
+                                               &jniContext->readEvents[0]);
+                  if (config->isVerbose()){
+                     fprintf(stderr, "explicitly read %s ptr=%lx len=%d\n", 
+                             arg->name, (unsigned long)arg->aparapiBuffer->data, 
+                             arg->aparapiBuffer->lengthInBytes );
+                  }
+                  if (status != CL_SUCCESS) throw CLException(status, "clEnqueueReadBuffer()");
+
+                  status = clWaitForEvents(1, jniContext->readEvents);
+                  if (status != CL_SUCCESS) throw CLException(status, "clWaitForEvents");
 
-               status = clReleaseEvent(jniContext->readEvents[0]);
-               if (status != CL_SUCCESS) throw CLException(status, "clReleaseEvent() read event");
+                  if (config->isProfilingEnabled()) {
+                     status = profile(&arg->aparapiBuffer->read, &jniContext->readEvents[0], 0,
+                                      arg->name, jniContext->profileBaseTime);
+                     if (status != CL_SUCCESS) throw CLException(status, "profile "); 
+                  }
 
-               // since this is an explicit buffer get, 
-               // we expect the buffer to have changed so we commit
-               arg->unpin(jenv); // was unpinCommit
+                  status = clReleaseEvent(jniContext->readEvents[0]);
+                  if (status != CL_SUCCESS) throw CLException(status, "clReleaseEvent() read event");
 
-            //something went wrong print the error and exit
-            } catch(CLException& cle) {
-               cle.printError();
-               return status;
-            }
+                  arg->aparapiBuffer->inflate(jenv,arg);
 
-         }else{
+               //something went wrong print the error and exit
+               } catch(CLException& cle) {
+                  cle.printError();
+                  return status;
+               }
+            }
+         } else {
             if (config->isVerbose()){
                fprintf(stderr, "attempt to request to get a buffer that does not appear to be referenced from kernel\n");
             }
diff --git a/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.h b/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.h
index 3b4d40e09c69a83a52656d7b022c107fd1967707..b2b7258ce46e226b03667c5e91fa887f7d334b49 100644
--- a/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.h
+++ b/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.h
@@ -33,7 +33,7 @@
    direct product is subject to national security controls as identified on the Commerce Control List (currently 
    found in Supplement 1 to Part 774 of EAR).  For the most current Country Group listings, or for additional 
    information about the EAR or your obligations under those regulations, please refer to the U.S. Bureau of Industry
-   and Security’s website at http://www.bis.doc.gov/. 
+   and Security�s website at http://www.bis.doc.gov/. 
    */
 
 #ifndef APARAPI_H
@@ -65,13 +65,18 @@ jint updateNonPrimitiveReferences(JNIEnv *jenv, jobject jobj, JNIContext* jniCon
 
 void profileFirstRun(JNIContext* jniContext);
 
-void updateObject(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx);
+void updateArray(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx);
+void updateBuffer(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx);
 
 void processObject(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx);
+void processArray(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx);
+void processBuffer(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx);
 
 void updateWriteEvents(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int argIdx, int& writeEventCount);
 
 void processLocal(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx);
+void processLocalArray(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx);
+void processLocalBuffer(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx);
 
 int processArgs(JNIEnv* jenv, JNIContext* jniContext, int& argPos, int& writeEventCount);
 
diff --git a/com.amd.aparapi.jni/src/cpp/runKernel/AparapiBuffer.cpp b/com.amd.aparapi.jni/src/cpp/runKernel/AparapiBuffer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f5fcb19c542f35fcf6c02434bb9eeb901d8d50b0
--- /dev/null
+++ b/com.amd.aparapi.jni/src/cpp/runKernel/AparapiBuffer.cpp
@@ -0,0 +1,1557 @@
+/*
+   Copyright (c) 2010-2011, Advanced Micro Devices, Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
+   following conditions are met:
+
+   Redistributions of source code must retain the above copyright notice, this list of conditions and the following
+   disclaimer. 
+
+   Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
+   disclaimer in the documentation and/or other materials provided with the distribution. 
+
+   Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products
+   derived from this software without specific prior written permission. 
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+   INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+   DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+   WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export
+   laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 
+   through 774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000.  Further, pursuant to Section 740.6 of
+   the EAR, you hereby certify that, except pursuant to a license granted by the United States Department of Commerce
+   Bureau of Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export 
+   Administration Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in 
+   Country Groups D:1, E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) 
+   export to Country Groups D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced
+   direct product is subject to national security controls as identified on the Commerce Control List (currently 
+   found in Supplement 1 to Part 774 of EAR).  For the most current Country Group listings, or for additional 
+   information about the EAR or your obligations under those regulations, please refer to the U.S. Bureau of Industry
+   and Security?s website at http://www.bis.doc.gov/. 
+   */
+#define APARAPIBUFFER_SOURCE
+#include "AparapiBuffer.h"
+#include "KernelArg.h"
+
+AparapiBuffer::AparapiBuffer():
+   javaObject((jobject) 0),
+   numDims(0),
+   dims(NULL),
+   lengthInBytes(0),
+   mem((cl_mem) 0),
+   data(NULL),
+   memMask((cl_uint)0) {
+   }
+
+AparapiBuffer::AparapiBuffer(void* _data, cl_uint* _lens, cl_uint _numDims, long _lengthInBytes, jobject _javaObject) :
+   data(_data),
+   lens(_lens),
+   numDims(_numDims),
+   lengthInBytes(_lengthInBytes),
+   javaObject(_javaObject),
+   mem((cl_mem) 0),
+   memMask((cl_uint)0)
+{
+   dims = new cl_uint[_numDims];
+   for(int i = 0; i < _numDims; i++) {
+      dims[i] = 1;
+      for(int j = i+1; j < _numDims; j++) {
+         dims[i] *= lens[j];
+      }
+   }
+}
+
+jobject AparapiBuffer::getJavaObject(JNIEnv* env, KernelArg* arg) {
+   return JNIHelper::getInstanceField<jobject>(env, arg->javaArg, "javaBuffer", ObjectClassArg);
+}
+
+
+AparapiBuffer* AparapiBuffer::flatten(JNIEnv* env, jobject arg, int type) {
+   int numDims = JNIHelper::getInstanceField<jint>(env, arg, "numDims", IntArg);
+   if(numDims == 2 && isBoolean(type)) {
+      return AparapiBuffer::flattenBoolean2D(env,arg);
+   } else if(numDims == 2 && isByte(type)) {
+      return AparapiBuffer::flattenByte2D(env,arg);
+   } else if(numDims == 2 && isShort(type)) {
+      return AparapiBuffer::flattenShort2D(env,arg);
+   } else if(numDims == 2 && isInt(type)) {
+      return AparapiBuffer::flattenInt2D(env,arg);
+   } else if(numDims == 2 && isLong(type)) {
+      return AparapiBuffer::flattenLong2D(env,arg);
+   } else if(numDims == 2 && isFloat(type)) {
+      return AparapiBuffer::flattenFloat2D(env,arg);
+   } else if(numDims == 2 && isDouble(type)) {
+      return AparapiBuffer::flattenDouble2D(env,arg);
+   } else if(numDims == 3 && isBoolean(type)) {
+      return AparapiBuffer::flattenBoolean3D(env,arg);
+   } else if(numDims == 3 && isByte(type)) {
+      return AparapiBuffer::flattenByte3D(env,arg);
+   } else if(numDims == 3 && isShort(type)) {
+      return AparapiBuffer::flattenShort3D(env,arg);
+   } else if(numDims == 3 && isInt(type)) {
+      return AparapiBuffer::flattenInt3D(env,arg);
+   } else if(numDims == 3 && isLong(type)) {
+      return AparapiBuffer::flattenLong3D(env,arg);
+   } else if(numDims == 3 && isFloat(type)) {
+      return AparapiBuffer::flattenFloat3D(env,arg);
+   } else if(numDims == 3 && isDouble(type)) {
+      return AparapiBuffer::flattenDouble3D(env,arg);
+   }
+   return new AparapiBuffer();
+}
+
+
+AparapiBuffer* AparapiBuffer::flattenBoolean2D(JNIEnv* env, jobject arg) {
+
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   cl_uint* dims = new cl_uint[2];
+   dims[0] = env->GetArrayLength((jobjectArray)javaBuffer);
+   dims[1] = env->GetArrayLength((jbooleanArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, 0));
+   int totalSize = dims[0] * dims[1];
+   long bitSize = totalSize * sizeof(jboolean);
+
+   jboolean* array = new jboolean[totalSize];
+   /*
+   jbooleanArray* jArray = new jbooleanArray[dims[0]];
+   jboolean** elems = new jboolean*[dims[0]];
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jArray[i] = (jbooleanArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      elems[i] = env->GetBooleanArrayElements(jArray[i],0);
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < (int)dims[0]; i++) {
+      for(int j = 0; j < (int)dims[1]; j++) {
+         array[i*dims[1] + j] = elems[i][j];
+      }
+   }
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      env->ReleaseBooleanArrayElements(jArray[i], elems[i], 0);
+   }
+   delete[] jArray;
+   delete[] elems;
+   */
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jbooleanArray jArray = (jbooleanArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      jboolean* elems = env->GetBooleanArrayElements(jArray,0);
+
+      for(int j = 0; j < (int)dims[1]; j++) {
+         array[i*dims[1] + j] = elems[j];
+      }
+      env->ReleaseBooleanArrayElements(jArray, elems, 0);
+   }
+  
+   return new AparapiBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer);
+}
+
+AparapiBuffer* AparapiBuffer::flattenByte2D(JNIEnv* env, jobject arg) {
+
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   cl_uint* dims = new cl_uint[2];
+   dims[0] = env->GetArrayLength((jobjectArray)javaBuffer);
+   dims[1] = env->GetArrayLength((jbyteArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, 0));
+   int totalSize = dims[0] * dims[1];
+   long bitSize = totalSize * sizeof(jbyte);
+
+   jbyte* array = new jbyte[totalSize];
+   /*
+   jbyteArray* jArray = new jbyteArray[dims[0]];
+   jbyte** elems = new jbyte*[dims[0]];
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jArray[i] = (jbyteArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      elems[i] = env->GetByteArrayElements(jArray[i],0);
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < (int)dims[0]; i++) {
+      for(int j = 0; j < (int)dims[1]; j++) {
+         array[i*dims[1] + j] = elems[i][j];
+      }
+   }
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      env->ReleaseByteArrayElements(jArray[i], elems[i], 0);
+   }
+   delete[] jArray;
+   delete[] elems;
+   */
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jbyteArray jArray = (jbyteArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      jbyte* elems = env->GetByteArrayElements(jArray,0);
+
+      for(int j = 0; j < (int)dims[1]; j++) {
+         array[i*dims[1] + j] = elems[j];
+      }
+      env->ReleaseByteArrayElements(jArray, elems, 0);
+   }
+  
+   return new AparapiBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer);
+}
+
+AparapiBuffer* AparapiBuffer::flattenShort2D(JNIEnv* env, jobject arg) {
+
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   cl_uint* dims = new cl_uint[2];
+   dims[0] = env->GetArrayLength((jobjectArray)javaBuffer);
+   dims[1] = env->GetArrayLength((jshortArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, 0));
+   int totalSize = dims[0] * dims[1];
+   long bitSize = totalSize * sizeof(jshort);
+
+   jshort* array = new jshort[totalSize];
+   /*
+   jshortArray* jArray = new jshortArray[dims[0]];
+   jshort** elems = new jshort*[dims[0]];
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jArray[i] = (jshortArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      elems[i] = env->GetShortArrayElements(jArray[i],0);
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < (int)dims[0]; i++) {
+      for(int j = 0; j < (int)dims[1]; j++) {
+         array[i*dims[1] + j] = elems[i][j];
+      }
+   }
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      env->ReleaseShortArrayElements(jArray[i], elems[i], 0);
+   }
+   delete[] jArray;
+   delete[] elems;
+   */
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jshortArray jArray = (jshortArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      jshort* elems = env->GetShortArrayElements(jArray,0);
+
+      for(int j = 0; j < (int)dims[1]; j++) {
+         array[i*dims[1] + j] = elems[j];
+      }
+      env->ReleaseShortArrayElements(jArray, elems, 0);
+   }
+  
+   return new AparapiBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer);
+}
+
+AparapiBuffer* AparapiBuffer::flattenInt2D(JNIEnv* env, jobject arg) {
+
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   cl_uint* dims = new cl_uint[2];
+   dims[0] = env->GetArrayLength((jobjectArray)javaBuffer);
+   dims[1] = env->GetArrayLength((jintArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, 0));
+   int totalSize = dims[0] * dims[1];
+   long bitSize = totalSize * sizeof(jint);
+
+   jint* array = new jint[totalSize];
+   /*
+   jintArray* jArray = new jintArray[dims[0]];
+   jint** elems = new jint*[dims[0]];
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jArray[i] = (jintArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      elems[i] = env->GetIntArrayElements(jArray[i],0);
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < (int)dims[0]; i++) {
+      for(int j = 0; j < (int)dims[1]; j++) {
+         array[i*dims[1] + j] = elems[i][j];
+      }
+   }
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      env->ReleaseIntArrayElements(jArray[i], elems[i], 0);
+   }
+   delete[] jArray;
+   delete[] elems;
+   */
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jintArray jArray = (jintArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      jint* elems = env->GetIntArrayElements(jArray,0);
+
+      for(int j = 0; j < (int)dims[1]; j++) {
+         array[i*dims[1] + j] = elems[j];
+      }
+      env->ReleaseIntArrayElements(jArray, elems, 0);
+   }
+  
+   return new AparapiBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer);
+}
+
+AparapiBuffer* AparapiBuffer::flattenLong2D(JNIEnv* env, jobject arg) {
+
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   cl_uint* dims = new cl_uint[2];
+   dims[0] = env->GetArrayLength((jobjectArray)javaBuffer);
+   dims[1] = env->GetArrayLength((jlongArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, 0));
+   int totalSize = dims[0] * dims[1];
+   long bitSize = totalSize * sizeof(jlong);
+
+   jlong* array = new jlong[totalSize];
+   /*
+   jlongArray* jArray = new jlongArray[dims[0]];
+   jlong** elems = new jlong*[dims[0]];
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jArray[i] = (jlongArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      elems[i] = env->GetLongArrayElements(jArray[i],0);
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < (int)dims[0]; i++) {
+      for(int j = 0; j < (int)dims[1]; j++) {
+         array[i*dims[1] + j] = elems[i][j];
+      }
+   }
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      env->ReleaseLongArrayElements(jArray[i], elems[i], 0);
+   }
+   delete[] jArray;
+   delete[] elems;
+   */
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jlongArray jArray = (jlongArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      jlong* elems = env->GetLongArrayElements(jArray,0);
+
+      for(int j = 0; j < (int)dims[1]; j++) {
+         array[i*dims[1] + j] = elems[j];
+      }
+      env->ReleaseLongArrayElements(jArray, elems, 0);
+   }
+  
+   return new AparapiBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer);
+}
+
+AparapiBuffer* AparapiBuffer::flattenFloat2D(JNIEnv* env, jobject arg) {
+
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   cl_uint* dims = new cl_uint[2];
+   dims[0] = env->GetArrayLength((jobjectArray)javaBuffer);
+   dims[1] = env->GetArrayLength((jfloatArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, 0));
+   int totalSize = dims[0] * dims[1];
+   long bitSize = totalSize * sizeof(jfloat);
+
+   jfloat* array = new jfloat[totalSize];
+   /*
+   jfloatArray* jArray = new jfloatArray[dims[0]];
+   jfloat** elems = new jfloat*[dims[0]];
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jArray[i] = (jfloatArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      elems[i] = env->GetFloatArrayElements(jArray[i],0);
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < (int)dims[0]; i++) {
+      for(int j = 0; j < (int)dims[1]; j++) {
+         array[i*dims[1] + j] = elems[i][j];
+      }
+   }
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      env->ReleaseFloatArrayElements(jArray[i], elems[i], 0);
+   }
+   delete[] jArray;
+   delete[] elems;
+   */
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jfloatArray jArray = (jfloatArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      jfloat* elems = env->GetFloatArrayElements(jArray,0);
+
+      for(int j = 0; j < (int)dims[1]; j++) {
+         array[i*dims[1] + j] = elems[j];
+      }
+      env->ReleaseFloatArrayElements(jArray, elems, 0);
+   }
+  
+   return new AparapiBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer);
+}
+
+AparapiBuffer* AparapiBuffer::flattenDouble2D(JNIEnv* env, jobject arg) {
+
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   cl_uint* dims = new cl_uint[2];
+   dims[0] = env->GetArrayLength((jobjectArray)javaBuffer);
+   dims[1] = env->GetArrayLength((jdoubleArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, 0));
+   int totalSize = dims[0] * dims[1];
+   long bitSize = totalSize * sizeof(jdouble);
+
+   jdouble* array = new jdouble[totalSize];
+   /*
+   jdoubleArray* jArray = new jdoubleArray[dims[0]];
+   jdouble** elems = new jdouble*[dims[0]];
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jArray[i] = (jdoubleArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      elems[i] = env->GetDoubleArrayElements(jArray[i],0);
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < (int)dims[0]; i++) {
+      for(int j = 0; j < (int)dims[1]; j++) {
+         array[i*dims[1] + j] = elems[i][j];
+      }
+   }
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      env->ReleaseDoubleArrayElements(jArray[i], elems[i], 0);
+   }
+   delete[] jArray;
+   delete[] elems;
+   */
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jdoubleArray jArray = (jdoubleArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      jdouble* elems = env->GetDoubleArrayElements(jArray,0);
+
+      for(int j = 0; j < (int)dims[1]; j++) {
+         array[i*dims[1] + j] = elems[j];
+      }
+      env->ReleaseDoubleArrayElements(jArray, elems, 0);
+   }
+  
+   return new AparapiBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer);
+}
+
+
+AparapiBuffer* AparapiBuffer::flattenBoolean3D(JNIEnv* env, jobject arg) {
+
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   cl_uint* dims = new cl_uint[3];
+   jobjectArray j0 = (jobjectArray)javaBuffer;
+   jobjectArray j1 = (jobjectArray)env->GetObjectArrayElement(j0, 0);
+   jbooleanArray j2 = (jbooleanArray)env->GetObjectArrayElement(j1, 0);
+   dims[0] = env->GetArrayLength(j0);
+   dims[1] = env->GetArrayLength(j1);
+   dims[2] = env->GetArrayLength(j2);
+
+   int totalSize = dims[0] * dims[1] * dims[2];
+   long bitSize = totalSize * sizeof(jboolean);
+
+   jboolean* array = new jboolean[totalSize];
+   /*
+   jbooleanArray** jArray = new jbooleanArray*[dims[0]];
+   jboolean*** elems = new jboolean**[dims[0]];
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jArray[i] = new jbooleanArray[dims[1]];
+      elems[i] = new jboolean*[dims[1]];
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      for(int j = 0; j < (int)dims[1]; j++) {
+         jArray[i][j] = (jbooleanArray)env->GetObjectArrayElement(jrow, j);
+         elems[i][j] = env->GetBooleanArrayElements(jArray[i][j],0);
+      }
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < (int)dims[0]; i++) {
+      for(int j = 0; j < (int)dims[1]; j++) {
+         for(int k = 0; k < (int)dims[2]; k++) {
+            array[i*dims[1]*dims[2] + j*dims[2] + k] = elems[i][j][k];
+         }
+      }
+   }
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      for(int j = 0; j < (int)dims[1]; j++) {
+         env->ReleaseBooleanArrayElements(jArray[i][j], elems[i][j], 0);
+      }
+      delete[] jArray[i];
+      delete[] elems[i];
+   }
+   delete[] jArray;
+   delete[] elems;
+   */
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      for(int j = 0; j < (int)dims[1]; j++) {
+         jbooleanArray jArray = (jbooleanArray)env->GetObjectArrayElement(jrow, j);
+         jboolean* elems = env->GetBooleanArrayElements(jArray,0);
+         for(int k = 0; k < (int)dims[2]; k++) {
+            array[i*dims[1]*dims[2] + j*dims[2] + k] = elems[k];
+         }
+         env->ReleaseBooleanArrayElements(jArray, elems, 0);
+      }
+   }
+  
+   return new AparapiBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer);
+}
+
+AparapiBuffer* AparapiBuffer::flattenByte3D(JNIEnv* env, jobject arg) {
+
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   cl_uint* dims = new cl_uint[3];
+   jobjectArray j0 = (jobjectArray)javaBuffer;
+   jobjectArray j1 = (jobjectArray)env->GetObjectArrayElement(j0, 0);
+   jbyteArray j2 = (jbyteArray)env->GetObjectArrayElement(j1, 0);
+   dims[0] = env->GetArrayLength(j0);
+   dims[1] = env->GetArrayLength(j1);
+   dims[2] = env->GetArrayLength(j2);
+
+   int totalSize = dims[0] * dims[1] * dims[2];
+   long bitSize = totalSize * sizeof(jbyte);
+
+   jbyte* array = new jbyte[totalSize];
+   /*
+   jbyteArray** jArray = new jbyteArray*[dims[0]];
+   jbyte*** elems = new jbyte**[dims[0]];
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jArray[i] = new jbyteArray[dims[1]];
+      elems[i] = new jbyte*[dims[1]];
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      for(int j = 0; j < (int)dims[1]; j++) {
+         jArray[i][j] = (jbyteArray)env->GetObjectArrayElement(jrow, j);
+         elems[i][j] = env->GetByteArrayElements(jArray[i][j],0);
+      }
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < (int)dims[0]; i++) {
+      for(int j = 0; j < (int)dims[1]; j++) {
+         for(int k = 0; k < (int)dims[2]; k++) {
+            array[i*dims[1]*dims[2] + j*dims[2] + k] = elems[i][j][k];
+         }
+      }
+   }
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      for(int j = 0; j < (int)dims[1]; j++) {
+         env->ReleaseByteArrayElements(jArray[i][j], elems[i][j], 0);
+      }
+      delete[] jArray[i];
+      delete[] elems[i];
+   }
+   delete[] jArray;
+   delete[] elems;
+   */
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      for(int j = 0; j < (int)dims[1]; j++) {
+         jbyteArray jArray = (jbyteArray)env->GetObjectArrayElement(jrow, j);
+         jbyte* elems = env->GetByteArrayElements(jArray,0);
+         for(int k = 0; k < (int)dims[2]; k++) {
+            array[i*dims[1]*dims[2] + j*dims[2] + k] = elems[k];
+         }
+         env->ReleaseByteArrayElements(jArray, elems, 0);
+      }
+   }
+  
+   return new AparapiBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer);
+}
+
+AparapiBuffer* AparapiBuffer::flattenShort3D(JNIEnv* env, jobject arg) {
+
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   cl_uint* dims = new cl_uint[3];
+   jobjectArray j0 = (jobjectArray)javaBuffer;
+   jobjectArray j1 = (jobjectArray)env->GetObjectArrayElement(j0, 0);
+   jshortArray j2 = (jshortArray)env->GetObjectArrayElement(j1, 0);
+   dims[0] = env->GetArrayLength(j0);
+   dims[1] = env->GetArrayLength(j1);
+   dims[2] = env->GetArrayLength(j2);
+
+   int totalSize = dims[0] * dims[1] * dims[2];
+   long bitSize = totalSize * sizeof(jshort);
+
+   jshort* array = new jshort[totalSize];
+   /*
+   jshortArray** jArray = new jshortArray*[dims[0]];
+   jshort*** elems = new jshort**[dims[0]];
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jArray[i] = new jshortArray[dims[1]];
+      elems[i] = new jshort*[dims[1]];
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      for(int j = 0; j < (int)dims[1]; j++) {
+         jArray[i][j] = (jshortArray)env->GetObjectArrayElement(jrow, j);
+         elems[i][j] = env->GetShortArrayElements(jArray[i][j],0);
+      }
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < (int)dims[0]; i++) {
+      for(int j = 0; j < (int)dims[1]; j++) {
+         for(int k = 0; k < (int)dims[2]; k++) {
+            array[i*dims[1]*dims[2] + j*dims[2] + k] = elems[i][j][k];
+         }
+      }
+   }
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      for(int j = 0; j < (int)dims[1]; j++) {
+         env->ReleaseShortArrayElements(jArray[i][j], elems[i][j], 0);
+      }
+      delete[] jArray[i];
+      delete[] elems[i];
+   }
+   delete[] jArray;
+   delete[] elems;
+   */
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      for(int j = 0; j < (int)dims[1]; j++) {
+         jshortArray jArray = (jshortArray)env->GetObjectArrayElement(jrow, j);
+         jshort* elems = env->GetShortArrayElements(jArray,0);
+         for(int k = 0; k < (int)dims[2]; k++) {
+            array[i*dims[1]*dims[2] + j*dims[2] + k] = elems[k];
+         }
+         env->ReleaseShortArrayElements(jArray, elems, 0);
+      }
+   }
+  
+   return new AparapiBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer);
+}
+
+AparapiBuffer* AparapiBuffer::flattenInt3D(JNIEnv* env, jobject arg) {
+
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   cl_uint* dims = new cl_uint[3];
+   jobjectArray j0 = (jobjectArray)javaBuffer;
+   jobjectArray j1 = (jobjectArray)env->GetObjectArrayElement(j0, 0);
+   jintArray j2 = (jintArray)env->GetObjectArrayElement(j1, 0);
+   dims[0] = env->GetArrayLength(j0);
+   dims[1] = env->GetArrayLength(j1);
+   dims[2] = env->GetArrayLength(j2);
+
+   int totalSize = dims[0] * dims[1] * dims[2];
+   long bitSize = totalSize * sizeof(jint);
+
+   jint* array = new jint[totalSize];
+   /*
+   jintArray** jArray = new jintArray*[dims[0]];
+   jint*** elems = new jint**[dims[0]];
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jArray[i] = new jintArray[dims[1]];
+      elems[i] = new jint*[dims[1]];
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      for(int j = 0; j < (int)dims[1]; j++) {
+         jArray[i][j] = (jintArray)env->GetObjectArrayElement(jrow, j);
+         elems[i][j] = env->GetIntArrayElements(jArray[i][j],0);
+      }
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < (int)dims[0]; i++) {
+      for(int j = 0; j < (int)dims[1]; j++) {
+         for(int k = 0; k < (int)dims[2]; k++) {
+            array[i*dims[1]*dims[2] + j*dims[2] + k] = elems[i][j][k];
+         }
+      }
+   }
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      for(int j = 0; j < (int)dims[1]; j++) {
+         env->ReleaseIntArrayElements(jArray[i][j], elems[i][j], 0);
+      }
+      delete[] jArray[i];
+      delete[] elems[i];
+   }
+   delete[] jArray;
+   delete[] elems;
+   */
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      for(int j = 0; j < (int)dims[1]; j++) {
+         jintArray jArray = (jintArray)env->GetObjectArrayElement(jrow, j);
+         jint* elems = env->GetIntArrayElements(jArray,0);
+         for(int k = 0; k < (int)dims[2]; k++) {
+            array[i*dims[1]*dims[2] + j*dims[2] + k] = elems[k];
+         }
+         env->ReleaseIntArrayElements(jArray, elems, 0);
+      }
+   }
+  
+   return new AparapiBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer);
+}
+
+AparapiBuffer* AparapiBuffer::flattenLong3D(JNIEnv* env, jobject arg) {
+
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   cl_uint* dims = new cl_uint[3];
+   jobjectArray j0 = (jobjectArray)javaBuffer;
+   jobjectArray j1 = (jobjectArray)env->GetObjectArrayElement(j0, 0);
+   jlongArray j2 = (jlongArray)env->GetObjectArrayElement(j1, 0);
+   dims[0] = env->GetArrayLength(j0);
+   dims[1] = env->GetArrayLength(j1);
+   dims[2] = env->GetArrayLength(j2);
+
+   int totalSize = dims[0] * dims[1] * dims[2];
+   jlong bitSize = totalSize * sizeof(jlong);
+
+   jlong* array = new jlong[totalSize];
+   /*
+   jlongArray** jArray = new jlongArray*[dims[0]];
+   jlong*** elems = new jlong**[dims[0]];
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jArray[i] = new jlongArray[dims[1]];
+      elems[i] = new jlong*[dims[1]];
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      for(int j = 0; j < (int)dims[1]; j++) {
+         jArray[i][j] = (jlongArray)env->GetObjectArrayElement(jrow, j);
+         elems[i][j] = env->GetLongArrayElements(jArray[i][j],0);
+      }
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < (int)dims[0]; i++) {
+      for(int j = 0; j < (int)dims[1]; j++) {
+         for(int k = 0; k < (int)dims[2]; k++) {
+            array[i*dims[1]*dims[2] + j*dims[2] + k] = elems[i][j][k];
+         }
+      }
+   }
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      for(int j = 0; j < (int)dims[1]; j++) {
+         env->ReleaseLongArrayElements(jArray[i][j], elems[i][j], 0);
+      }
+      delete[] jArray[i];
+      delete[] elems[i];
+   }
+   delete[] jArray;
+   delete[] elems;
+   */
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      for(int j = 0; j < (int)dims[1]; j++) {
+         jlongArray jArray = (jlongArray)env->GetObjectArrayElement(jrow, j);
+         jlong* elems = env->GetLongArrayElements(jArray,0);
+         for(int k = 0; k < (int)dims[2]; k++) {
+            array[i*dims[1]*dims[2] + j*dims[2] + k] = elems[k];
+         }
+         env->ReleaseLongArrayElements(jArray, elems, 0);
+      }
+   }
+  
+   return new AparapiBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer);
+}
+
+AparapiBuffer* AparapiBuffer::flattenFloat3D(JNIEnv* env, jobject arg) {
+
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   cl_uint* dims = new cl_uint[3];
+   jobjectArray j0 = (jobjectArray)javaBuffer;
+   jobjectArray j1 = (jobjectArray)env->GetObjectArrayElement(j0, 0);
+   jfloatArray j2 = (jfloatArray)env->GetObjectArrayElement(j1, 0);
+   dims[0] = env->GetArrayLength(j0);
+   dims[1] = env->GetArrayLength(j1);
+   dims[2] = env->GetArrayLength(j2);
+
+   int totalSize = dims[0] * dims[1] * dims[2];
+   long bitSize = totalSize * sizeof(jfloat);
+
+   jfloat* array = new jfloat[totalSize];
+   /*
+   jfloatArray** jArray = new jfloatArray*[dims[0]];
+   jfloat*** elems = new jfloat**[dims[0]];
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jArray[i] = new jfloatArray[dims[1]];
+      elems[i] = new jfloat*[dims[1]];
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      for(int j = 0; j < (int)dims[1]; j++) {
+         jArray[i][j] = (jfloatArray)env->GetObjectArrayElement(jrow, j);
+         elems[i][j] = env->GetFloatArrayElements(jArray[i][j],0);
+      }
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < (int)dims[0]; i++) {
+      for(int j = 0; j < (int)dims[1]; j++) {
+         for(int k = 0; k < (int)dims[2]; k++) {
+            array[i*dims[1]*dims[2] + j*dims[2] + k] = elems[i][j][k];
+         }
+      }
+   }
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      for(int j = 0; j < (int)dims[1]; j++) {
+         env->ReleaseFloatArrayElements(jArray[i][j], elems[i][j], 0);
+      }
+      delete[] jArray[i];
+      delete[] elems[i];
+   }
+   delete[] jArray;
+   delete[] elems;
+   */
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      for(int j = 0; j < (int)dims[1]; j++) {
+         jfloatArray jArray = (jfloatArray)env->GetObjectArrayElement(jrow, j);
+         jfloat* elems = env->GetFloatArrayElements(jArray,0);
+         for(int k = 0; k < (int)dims[2]; k++) {
+            array[i*dims[1]*dims[2] + j*dims[2] + k] = elems[k];
+         }
+         env->ReleaseFloatArrayElements(jArray, elems, 0);
+      }
+   }
+  
+   return new AparapiBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer);
+}
+
+AparapiBuffer* AparapiBuffer::flattenDouble3D(JNIEnv* env, jobject arg) {
+
+   jobject javaBuffer = JNIHelper::getInstanceField<jobject>(env, arg, "javaBuffer", ObjectClassArg);
+   cl_uint* dims = new cl_uint[3];
+   jobjectArray j0 = (jobjectArray)javaBuffer;
+   jobjectArray j1 = (jobjectArray)env->GetObjectArrayElement(j0, 0);
+   jdoubleArray j2 = (jdoubleArray)env->GetObjectArrayElement(j1, 0);
+   dims[0] = env->GetArrayLength(j0);
+   dims[1] = env->GetArrayLength(j1);
+   dims[2] = env->GetArrayLength(j2);
+
+   int totalSize = dims[0] * dims[1] * dims[2];
+   long bitSize = totalSize * sizeof(jdouble);
+
+   jdouble* array = new jdouble[totalSize];
+   /*
+   jdoubleArray** jArray = new jdoubleArray*[dims[0]];
+   jdouble*** elems = new jdouble**[dims[0]];
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jArray[i] = new jdoubleArray[dims[1]];
+      elems[i] = new jdouble*[dims[1]];
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      for(int j = 0; j < (int)dims[1]; j++) {
+         jArray[i][j] = (jdoubleArray)env->GetObjectArrayElement(jrow, j);
+         elems[i][j] = env->GetDoubleArrayElements(jArray[i][j],0);
+      }
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < (int)dims[0]; i++) {
+      for(int j = 0; j < (int)dims[1]; j++) {
+         for(int k = 0; k < (int)dims[2]; k++) {
+            array[i*dims[1]*dims[2] + j*dims[2] + k] = elems[i][j][k];
+         }
+      }
+   }
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      for(int j = 0; j < (int)dims[1]; j++) {
+         env->ReleaseDoubleArrayElements(jArray[i][j], elems[i][j], 0);
+      }
+      delete[] jArray[i];
+      delete[] elems[i];
+   }
+   delete[] jArray;
+   delete[] elems;
+   */
+
+   for(int i = 0; i < (int)dims[0]; i++) {
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i);
+      for(int j = 0; j < (int)dims[1]; j++) {
+         jdoubleArray jArray = (jdoubleArray)env->GetObjectArrayElement(jrow, j);
+         jdouble* elems = env->GetDoubleArrayElements(jArray,0);
+         for(int k = 0; k < (int)dims[2]; k++) {
+            array[i*dims[1]*dims[2] + j*dims[1] + k] = elems[k];
+         }
+         env->ReleaseDoubleArrayElements(jArray, elems, 0);
+      }
+   }
+  
+   return new AparapiBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer);
+}
+
+
+
+void AparapiBuffer::inflate(JNIEnv* env, KernelArg* arg) {
+   javaObject = JNIHelper::getInstanceField<jobject>(env, arg->javaArg, "javaBuffer", ObjectClassArg);
+   if(numDims == 2 && arg->isBoolean()) {
+      AparapiBuffer::inflateBoolean2D(env, arg);
+   } else if(numDims == 2 && arg->isByte()) {
+      AparapiBuffer::inflateByte2D(env, arg);
+   } else if(numDims == 2 && arg->isShort()) {
+      AparapiBuffer::inflateShort2D(env, arg);
+   } else if(numDims == 2 && arg->isInt()) {
+      AparapiBuffer::inflateInt2D(env, arg);
+   } else if(numDims == 2 && arg->isLong()) {
+      AparapiBuffer::inflateLong2D(env, arg);
+   } else if(numDims == 2 && arg->isFloat()) {
+      AparapiBuffer::inflateFloat2D(env, arg);
+   } else if(numDims == 2 && arg->isDouble()) {
+      AparapiBuffer::inflateDouble2D(env, arg);
+   } else if(numDims == 3 && arg->isBoolean()) {
+      AparapiBuffer::inflateBoolean3D(env, arg);
+   } else if(numDims == 3 && arg->isByte()) {
+      AparapiBuffer::inflateByte3D(env, arg);
+   } else if(numDims == 3 && arg->isShort()) {
+      AparapiBuffer::inflateShort3D(env, arg);
+   } else if(numDims == 3 && arg->isInt()) {
+      AparapiBuffer::inflateInt3D(env, arg);
+   } else if(numDims == 3 && arg->isLong()) {
+      AparapiBuffer::inflateLong3D(env, arg);
+   } else if(numDims == 3 && arg->isFloat()) {
+      AparapiBuffer::inflateFloat3D(env, arg);
+   } else if(numDims == 3 && arg->isDouble()) {
+      AparapiBuffer::inflateDouble3D(env, arg);
+   } else {
+       return;
+   }
+
+   deleteBuffer(arg);
+}
+
+
+void AparapiBuffer::inflateBoolean2D(JNIEnv *env, KernelArg* arg) {
+   
+   jobjectArray buffer = (jobjectArray)javaObject;
+   jboolean* array = (jboolean*)data;
+   /*
+   jbooleanArray* jArray = new jbooleanArray[lens[0]];
+   jboolean** body =  new jboolean*[lens[0]];
+
+   for(int i = 0; i < lens[0]; i++) {
+      jArray[i] = (jbooleanArray)env->GetObjectArrayElement(buffer, i);
+      body[i] = env->GetBooleanArrayElements(jArray[i],0);
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < lens[0]; i++) {
+      for(int j = 0; j < lens[1]; j++) {
+         body[i][j] = array[i*dims[0] + j];
+      }
+   }
+
+   for(int i = 0; i < lens[0]; i++) {
+      env->ReleaseBooleanArrayElements(jArray[i], body[i], 0);
+   }
+   delete[] jArray;
+   delete[] body;
+   */
+
+   for(int i = 0; i < lens[0]; i++) {
+      jbooleanArray jArray = (jbooleanArray)env->GetObjectArrayElement(buffer, i);
+      jboolean* body = env->GetBooleanArrayElements(jArray,0);
+      for(int j = 0; j < lens[1]; j++) {
+         body[j] = array[i*dims[0] + j];
+      }
+      env->ReleaseBooleanArrayElements(jArray, body, 0);
+   }
+}
+
+void AparapiBuffer::inflateByte2D(JNIEnv *env, KernelArg* arg) {
+   
+   jobjectArray buffer = (jobjectArray)javaObject;
+   jbyte* array = (jbyte*)data;
+   /*
+   jbyteArray* jArray = new jbyteArray[lens[0]];
+   jbyte** body =  new jbyte*[lens[0]];
+
+   for(int i = 0; i < lens[0]; i++) {
+      jArray[i] = (jbyteArray)env->GetObjectArrayElement(buffer, i);
+      body[i] = env->GetByteArrayElements(jArray[i],0);
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < lens[0]; i++) {
+      for(int j = 0; j < lens[1]; j++) {
+         body[i][j] = array[i*dims[0] + j];
+      }
+   }
+
+   for(int i = 0; i < lens[0]; i++) {
+      env->ReleaseByteArrayElements(jArray[i], body[i], 0);
+   }
+   delete[] jArray;
+   delete[] body;
+   */
+
+   for(int i = 0; i < lens[0]; i++) {
+      jbyteArray jArray = (jbyteArray)env->GetObjectArrayElement(buffer, i);
+      jbyte* body = env->GetByteArrayElements(jArray,0);
+      for(int j = 0; j < lens[1]; j++) {
+         body[j] = array[i*dims[0] + j];
+      }
+      env->ReleaseByteArrayElements(jArray, body, 0);
+   }
+}
+
+void AparapiBuffer::inflateShort2D(JNIEnv *env, KernelArg* arg) {
+   
+   jobjectArray buffer = (jobjectArray)javaObject;
+   jshort* array = (jshort*)data;
+   /*
+   jshortArray* jArray = new jshortArray[lens[0]];
+   jshort** body =  new jshort*[lens[0]];
+
+   for(int i = 0; i < lens[0]; i++) {
+      jArray[i] = (jshortArray)env->GetObjectArrayElement(buffer, i);
+      body[i] = env->GetShortArrayElements(jArray[i],0);
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < lens[0]; i++) {
+      for(int j = 0; j < lens[1]; j++) {
+         body[i][j] = array[i*dims[0] + j];
+      }
+   }
+
+   for(int i = 0; i < lens[0]; i++) {
+      env->ReleaseShortArrayElements(jArray[i], body[i], 0);
+   }
+   delete[] jArray;
+   delete[] body;
+   */
+
+   for(int i = 0; i < lens[0]; i++) {
+      jshortArray jArray = (jshortArray)env->GetObjectArrayElement(buffer, i);
+      jshort* body = env->GetShortArrayElements(jArray,0);
+      for(int j = 0; j < lens[1]; j++) {
+         body[j] = array[i*dims[0] + j];
+      }
+      env->ReleaseShortArrayElements(jArray, body, 0);
+   }
+}
+
+void AparapiBuffer::inflateInt2D(JNIEnv *env, KernelArg* arg) {
+   
+   jobjectArray buffer = (jobjectArray)javaObject;
+   jint* array = (jint*)data;
+   /*
+   jintArray* jArray = new jintArray[lens[0]];
+   jint** body =  new jint*[lens[0]];
+
+   for(int i = 0; i < lens[0]; i++) {
+      jArray[i] = (jintArray)env->GetObjectArrayElement(buffer, i);
+      body[i] = env->GetIntArrayElements(jArray[i],0);
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < lens[0]; i++) {
+      for(int j = 0; j < lens[1]; j++) {
+         body[i][j] = array[i*dims[0] + j];
+      }
+   }
+
+   for(int i = 0; i < lens[0]; i++) {
+      env->ReleaseIntArrayElements(jArray[i], body[i], 0);
+   }
+   delete[] jArray;
+   delete[] body;
+   */
+
+   for(int i = 0; i < lens[0]; i++) {
+      jintArray jArray = (jintArray)env->GetObjectArrayElement(buffer, i);
+      jint* body = env->GetIntArrayElements(jArray,0);
+      for(int j = 0; j < lens[1]; j++) {
+         body[j] = array[i*dims[0] + j];
+      }
+      env->ReleaseIntArrayElements(jArray, body, 0);
+   }
+}
+
+void AparapiBuffer::inflateLong2D(JNIEnv *env, KernelArg* arg) {
+   
+   jobjectArray buffer = (jobjectArray)javaObject;
+   jlong* array = (jlong*)data;
+   /*
+   jlongArray* jArray = new jlongArray[lens[0]];
+   jlong** body =  new jlong*[lens[0]];
+
+   for(int i = 0; i < lens[0]; i++) {
+      jArray[i] = (jlongArray)env->GetObjectArrayElement(buffer, i);
+      body[i] = env->GetLongArrayElements(jArray[i],0);
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < lens[0]; i++) {
+      for(int j = 0; j < lens[1]; j++) {
+         body[i][j] = array[i*dims[0] + j];
+      }
+   }
+
+   for(int i = 0; i < lens[0]; i++) {
+      env->ReleaseLongArrayElements(jArray[i], body[i], 0);
+   }
+   delete[] jArray;
+   delete[] body;
+   */
+
+   for(int i = 0; i < lens[0]; i++) {
+      jlongArray jArray = (jlongArray)env->GetObjectArrayElement(buffer, i);
+      jlong* body = env->GetLongArrayElements(jArray,0);
+      for(int j = 0; j < lens[1]; j++) {
+         body[j] = array[i*dims[0] + j];
+      }
+      env->ReleaseLongArrayElements(jArray, body, 0);
+   }
+}
+
+void AparapiBuffer::inflateFloat2D(JNIEnv *env, KernelArg* arg) {
+   
+   jobjectArray buffer = (jobjectArray)javaObject;
+   jfloat* array = (jfloat*)data;
+   /*
+   jfloatArray* jArray = new jfloatArray[lens[0]];
+   jfloat** body =  new jfloat*[lens[0]];
+
+   for(int i = 0; i < lens[0]; i++) {
+      jArray[i] = (jfloatArray)env->GetObjectArrayElement(buffer, i);
+      body[i] = env->GetFloatArrayElements(jArray[i],0);
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < lens[0]; i++) {
+      for(int j = 0; j < lens[1]; j++) {
+         body[i][j] = array[i*dims[0] + j];
+      }
+   }
+
+   for(int i = 0; i < lens[0]; i++) {
+      env->ReleaseFloatArrayElements(jArray[i], body[i], 0);
+   }
+   delete[] jArray;
+   delete[] body;
+   */
+
+   for(int i = 0; i < lens[0]; i++) {
+      jfloatArray jArray = (jfloatArray)env->GetObjectArrayElement(buffer, i);
+      jfloat* body = env->GetFloatArrayElements(jArray,0);
+      for(int j = 0; j < lens[1]; j++) {
+         body[j] = array[i*dims[0] + j];
+      }
+      env->ReleaseFloatArrayElements(jArray, body, 0);
+   }
+}
+
+void AparapiBuffer::inflateDouble2D(JNIEnv *env, KernelArg* arg) {
+   
+   jobjectArray buffer = (jobjectArray)javaObject;
+   jdouble* array = (jdouble*)data;
+   /*
+   jdoubleArray* jArray = new jdoubleArray[lens[0]];
+   jdouble** body =  new jdouble*[lens[0]];
+
+   for(int i = 0; i < lens[0]; i++) {
+      jArray[i] = (jdoubleArray)env->GetObjectArrayElement(buffer, i);
+      body[i] = env->GetDoubleArrayElements(jArray[i],0);
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < lens[0]; i++) {
+      for(int j = 0; j < lens[1]; j++) {
+         body[i][j] = array[i*dims[0] + j];
+      }
+   }
+
+   for(int i = 0; i < lens[0]; i++) {
+      env->ReleaseDoubleArrayElements(jArray[i], body[i], 0);
+   }
+   delete[] jArray;
+   delete[] body;
+   */
+
+   for(int i = 0; i < lens[0]; i++) {
+      jdoubleArray jArray = (jdoubleArray)env->GetObjectArrayElement(buffer, i);
+      jdouble* body = env->GetDoubleArrayElements(jArray,0);
+      for(int j = 0; j < lens[1]; j++) {
+         body[j] = array[i*dims[0] + j];
+      }
+      env->ReleaseDoubleArrayElements(jArray, body, 0);
+   }
+}
+
+
+void AparapiBuffer::inflateBoolean3D(JNIEnv *env, KernelArg* arg) {
+   
+   jobjectArray buffer = (jobjectArray)javaObject;
+   jboolean* array = (jboolean*)data;
+   /*
+   jbooleanArray** jArray = new jbooleanArray*[lens[0]];
+   jboolean*** body =  new jboolean**[lens[0]];
+
+   for(int i = 0; i < lens[0]; i++) {
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i);
+      jArray[i] = new jbooleanArray[lens[0]];
+      body[i] =  new jboolean*[lens[0]];
+      for(int j = 0; j < lens[1]; j++) {
+         jArray[i][j] = (jbooleanArray)env->GetObjectArrayElement(jrow, j);
+         body[i][j] = env->GetBooleanArrayElements(jArray[i][j],0);
+      }
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < lens[0]; i++) {
+      for(int j = 0; j < lens[1]; j++) {
+         for(int k = 0; k < lens[2]; k++) {
+            body[i][j][k] = array[i*dims[0] + j*dims[1] + k];
+         }
+      }
+   }
+
+   for(int i = 0; i < lens[0]; i++) {
+      for(int j = 0; j < lens[1]; j++) {
+         env->ReleaseBooleanArrayElements(jArray[i][j], body[i][j], 0);
+      }
+      delete[] jArray[i];
+      delete[] body[i];
+   }
+   delete[] jArray;
+   delete[] body;
+   */
+
+   for(int i = 0; i < lens[0]; i++) {
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i);
+      for(int j = 0; j < lens[1]; j++) {
+         jbooleanArray jArray = (jbooleanArray)env->GetObjectArrayElement(jrow, j);
+         jboolean* body = env->GetBooleanArrayElements(jArray,0);
+         for(int k = 0; k < lens[2]; k++) {
+            body[k] = array[i*dims[0] + j*dims[1] + k];
+         }
+         env->ReleaseBooleanArrayElements(jArray, body, 0);
+      }
+   }
+}
+
+void AparapiBuffer::inflateByte3D(JNIEnv *env, KernelArg* arg) {
+   
+   jobjectArray buffer = (jobjectArray)javaObject;
+   jbyte* array = (jbyte*)data;
+   /*
+   jbyteArray** jArray = new jbyteArray*[lens[0]];
+   jbyte*** body =  new jbyte**[lens[0]];
+
+   for(int i = 0; i < lens[0]; i++) {
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i);
+      jArray[i] = new jbyteArray[lens[0]];
+      body[i] =  new jbyte*[lens[0]];
+      for(int j = 0; j < lens[1]; j++) {
+         jArray[i][j] = (jbyteArray)env->GetObjectArrayElement(jrow, j);
+         body[i][j] = env->GetByteArrayElements(jArray[i][j],0);
+      }
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < lens[0]; i++) {
+      for(int j = 0; j < lens[1]; j++) {
+         for(int k = 0; k < lens[2]; k++) {
+            body[i][j][k] = array[i*dims[0] + j*dims[1] + k];
+         }
+      }
+   }
+
+   for(int i = 0; i < lens[0]; i++) {
+      for(int j = 0; j < lens[1]; j++) {
+         env->ReleaseByteArrayElements(jArray[i][j], body[i][j], 0);
+      }
+      delete[] jArray[i];
+      delete[] body[i];
+   }
+   delete[] jArray;
+   delete[] body;
+   */
+
+   for(int i = 0; i < lens[0]; i++) {
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i);
+      for(int j = 0; j < lens[1]; j++) {
+         jbyteArray jArray = (jbyteArray)env->GetObjectArrayElement(jrow, j);
+         jbyte* body = env->GetByteArrayElements(jArray,0);
+         for(int k = 0; k < lens[2]; k++) {
+            body[k] = array[i*dims[0] + j*dims[1] + k];
+         }
+         env->ReleaseByteArrayElements(jArray, body, 0);
+      }
+   }
+}
+
+void AparapiBuffer::inflateShort3D(JNIEnv *env, KernelArg* arg) {
+   
+   jobjectArray buffer = (jobjectArray)javaObject;
+   jshort* array = (jshort*)data;
+   /*
+   jshortArray** jArray = new jshortArray*[lens[0]];
+   jshort*** body =  new jshort**[lens[0]];
+
+   for(int i = 0; i < lens[0]; i++) {
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i);
+      jArray[i] = new jshortArray[lens[0]];
+      body[i] =  new jshort*[lens[0]];
+      for(int j = 0; j < lens[1]; j++) {
+         jArray[i][j] = (jshortArray)env->GetObjectArrayElement(jrow, j);
+         body[i][j] = env->GetShortArrayElements(jArray[i][j],0);
+      }
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < lens[0]; i++) {
+      for(int j = 0; j < lens[1]; j++) {
+         for(int k = 0; k < lens[2]; k++) {
+            body[i][j][k] = array[i*dims[0] + j*dims[1] + k];
+         }
+      }
+   }
+
+   for(int i = 0; i < lens[0]; i++) {
+      for(int j = 0; j < lens[1]; j++) {
+         env->ReleaseShortArrayElements(jArray[i][j], body[i][j], 0);
+      }
+      delete[] jArray[i];
+      delete[] body[i];
+   }
+   delete[] jArray;
+   delete[] body;
+   */
+
+   for(int i = 0; i < lens[0]; i++) {
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i);
+      for(int j = 0; j < lens[1]; j++) {
+         jshortArray jArray = (jshortArray)env->GetObjectArrayElement(jrow, j);
+         jshort* body = env->GetShortArrayElements(jArray,0);
+         for(int k = 0; k < lens[2]; k++) {
+            body[k] = array[i*dims[0] + j*dims[1] + k];
+         }
+         env->ReleaseShortArrayElements(jArray, body, 0);
+      }
+   }
+}
+
+void AparapiBuffer::inflateInt3D(JNIEnv *env, KernelArg* arg) {
+   
+   jobjectArray buffer = (jobjectArray)javaObject;
+   jint* array = (jint*)data;
+   /*
+   jintArray** jArray = new jintArray*[lens[0]];
+   jint*** body =  new jint**[lens[0]];
+
+   for(int i = 0; i < lens[0]; i++) {
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i);
+      jArray[i] = new jintArray[lens[0]];
+      body[i] =  new jint*[lens[0]];
+      for(int j = 0; j < lens[1]; j++) {
+         jArray[i][j] = (jintArray)env->GetObjectArrayElement(jrow, j);
+         body[i][j] = env->GetIntArrayElements(jArray[i][j],0);
+      }
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < lens[0]; i++) {
+      for(int j = 0; j < lens[1]; j++) {
+         for(int k = 0; k < lens[2]; k++) {
+            body[i][j][k] = array[i*dims[0] + j*dims[1] + k];
+         }
+      }
+   }
+
+   for(int i = 0; i < lens[0]; i++) {
+      for(int j = 0; j < lens[1]; j++) {
+         env->ReleaseIntArrayElements(jArray[i][j], body[i][j], 0);
+      }
+      delete[] jArray[i];
+      delete[] body[i];
+   }
+   delete[] jArray;
+   delete[] body;
+   */
+
+   for(int i = 0; i < lens[0]; i++) {
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i);
+      for(int j = 0; j < lens[1]; j++) {
+         jintArray jArray = (jintArray)env->GetObjectArrayElement(jrow, j);
+         jint* body = env->GetIntArrayElements(jArray,0);
+         for(int k = 0; k < lens[2]; k++) {
+            body[k] = array[i*dims[0] + j*dims[1] + k];
+         }
+         env->ReleaseIntArrayElements(jArray, body, 0);
+      }
+   }
+}
+
+void AparapiBuffer::inflateLong3D(JNIEnv *env, KernelArg* arg) {
+   
+   jobjectArray buffer = (jobjectArray)javaObject;
+   jlong* array = (jlong*)data;
+   /*
+   jlongArray** jArray = new jlongArray*[lens[0]];
+   jlong*** body =  new jlong**[lens[0]];
+
+   for(int i = 0; i < lens[0]; i++) {
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i);
+      jArray[i] = new jlongArray[lens[0]];
+      body[i] =  new jlong*[lens[0]];
+      for(int j = 0; j < lens[1]; j++) {
+         jArray[i][j] = (jlongArray)env->GetObjectArrayElement(jrow, j);
+         body[i][j] = env->GetLongArrayElements(jArray[i][j],0);
+      }
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < lens[0]; i++) {
+      for(int j = 0; j < lens[1]; j++) {
+         for(int k = 0; k < lens[2]; k++) {
+            body[i][j][k] = array[i*dims[0] + j*dims[1] + k];
+         }
+      }
+   }
+
+   for(int i = 0; i < lens[0]; i++) {
+      for(int j = 0; j < lens[1]; j++) {
+         env->ReleaseLongArrayElements(jArray[i][j], body[i][j], 0);
+      }
+      delete[] jArray[i];
+      delete[] body[i];
+   }
+   delete[] jArray;
+   delete[] body;
+   */
+
+   for(int i = 0; i < lens[0]; i++) {
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i);
+      for(int j = 0; j < lens[1]; j++) {
+         jlongArray jArray = (jlongArray)env->GetObjectArrayElement(jrow, j);
+         jlong* body = env->GetLongArrayElements(jArray,0);
+         for(int k = 0; k < lens[2]; k++) {
+            body[k] = array[i*dims[0] + j*dims[1] + k];
+         }
+         env->ReleaseLongArrayElements(jArray, body, 0);
+      }
+   }
+}
+
+void AparapiBuffer::inflateFloat3D(JNIEnv *env, KernelArg* arg) {
+   
+   jobjectArray buffer = (jobjectArray)javaObject;
+   jfloat* array = (jfloat*)data;
+   /*
+   jfloatArray** jArray = new jfloatArray*[lens[0]];
+   jfloat*** body =  new jfloat**[lens[0]];
+
+   for(int i = 0; i < lens[0]; i++) {
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i);
+      jArray[i] = new jfloatArray[lens[0]];
+      body[i] =  new jfloat*[lens[0]];
+      for(int j = 0; j < lens[1]; j++) {
+         jArray[i][j] = (jfloatArray)env->GetObjectArrayElement(jrow, j);
+         body[i][j] = env->GetFloatArrayElements(jArray[i][j],0);
+      }
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < lens[0]; i++) {
+      for(int j = 0; j < lens[1]; j++) {
+         for(int k = 0; k < lens[2]; k++) {
+            body[i][j][k] = array[i*dims[0] + j*dims[1] + k];
+         }
+      }
+   }
+
+   for(int i = 0; i < lens[0]; i++) {
+      for(int j = 0; j < lens[1]; j++) {
+         env->ReleaseFloatArrayElements(jArray[i][j], body[i][j], 0);
+      }
+      delete[] jArray[i];
+      delete[] body[i];
+   }
+   delete[] jArray;
+   delete[] body;
+   */
+
+   for(int i = 0; i < lens[0]; i++) {
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i);
+      for(int j = 0; j < lens[1]; j++) {
+         jfloatArray jArray = (jfloatArray)env->GetObjectArrayElement(jrow, j);
+         jfloat* body = env->GetFloatArrayElements(jArray,0);
+         for(int k = 0; k < lens[2]; k++) {
+            body[k] = array[i*dims[0] + j*dims[1] + k];
+         }
+         env->ReleaseFloatArrayElements(jArray, body, 0);
+      }
+   }
+}
+
+void AparapiBuffer::inflateDouble3D(JNIEnv *env, KernelArg* arg) {
+   
+   jobjectArray buffer = (jobjectArray)javaObject;
+   jdouble* array = (jdouble*)data;
+   /*
+   jdoubleArray** jArray = new jdoubleArray*[lens[0]];
+   jdouble*** body =  new jdouble**[lens[0]];
+
+   for(int i = 0; i < lens[0]; i++) {
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i);
+      jArray[i] = new jdoubleArray[lens[0]];
+      body[i] =  new jdouble*[lens[0]];
+      for(int j = 0; j < lens[1]; j++) {
+         jArray[i][j] = (jdoubleArray)env->GetObjectArrayElement(jrow, j);
+         body[i][j] = env->GetDoubleArrayElements(jArray[i][j],0);
+      }
+   }
+
+   #pragma omp parallel for
+   for(int i = 0; i < lens[0]; i++) {
+      for(int j = 0; j < lens[1]; j++) {
+         for(int k = 0; k < lens[2]; k++) {
+            body[i][j][k] = array[i*dims[0] + j*dims[1] + k];
+         }
+      }
+   }
+
+   for(int i = 0; i < lens[0]; i++) {
+      for(int j = 0; j < lens[1]; j++) {
+         env->ReleaseDoubleArrayElements(jArray[i][j], body[i][j], 0);
+      }
+      delete[] jArray[i];
+      delete[] body[i];
+   }
+   delete[] jArray;
+   delete[] body;
+   */
+
+   for(int i = 0; i < lens[0]; i++) {
+      jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i);
+      for(int j = 0; j < lens[1]; j++) {
+         jdoubleArray jArray = (jdoubleArray)env->GetObjectArrayElement(jrow, j);
+         jdouble* body = env->GetDoubleArrayElements(jArray,0);
+         for(int k = 0; k < lens[2]; k++) {
+            body[k] = array[i*dims[0] + j*dims[1] + k];
+         }
+         env->ReleaseDoubleArrayElements(jArray, body, 0);
+      }
+   }
+}
+
+void AparapiBuffer::deleteBuffer(KernelArg* arg)
+{
+      delete[] dims;
+      delete[] lens;
+   if(arg->isBoolean()) {
+      delete[] (jboolean*)data;
+   } else if(arg->isByte()) {
+      delete[] (jbyte*)data;
+   } else if(arg->isShort()) {
+      delete[] (jshort*)data;
+   } else if(arg->isInt()) {
+      delete[] (jint*)data;
+   } else if(arg->isLong()) {
+      delete[] (jlong*)data;
+   } else if(arg->isFloat()) {
+      delete[] (jfloat*)data;
+   } else if(arg->isDouble()) {
+      delete[] (jdouble*)data;
+   }
+}
diff --git a/com.amd.aparapi.jni/src/cpp/runKernel/AparapiBuffer.h b/com.amd.aparapi.jni/src/cpp/runKernel/AparapiBuffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..b5a05d82b6d45e68034c8abeaa1b890d1cfb1437
--- /dev/null
+++ b/com.amd.aparapi.jni/src/cpp/runKernel/AparapiBuffer.h
@@ -0,0 +1,133 @@
+/*
+   Copyright (c) 2010-2011, Advanced Micro Devices, Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
+   following conditions are met:
+
+   Redistributions of source code must retain the above copyright notice, this list of conditions and the following
+   disclaimer. 
+
+   Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
+   disclaimer in the documentation and/or other materials provided with the distribution. 
+
+   Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products
+   derived from this software without specific prior written permission. 
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+   INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+   DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+   WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export
+   laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 
+   through 774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000.  Further, pursuant to Section 740.6 of
+   the EAR, you hereby certify that, except pursuant to a license granted by the United States Department of Commerce
+   Bureau of Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export 
+   Administration Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in 
+   Country Groups D:1, E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) 
+   export to Country Groups D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced
+   direct product is subject to national security controls as identified on the Commerce Control List (currently 
+   found in Supplement 1 to Part 774 of EAR).  For the most current Country Group listings, or for additional 
+   information about the EAR or your obligations under those regulations, please refer to the U.S. Bureau of Industry
+   and Security?s website at http://www.bis.doc.gov/. 
+   */
+
+#ifndef APARAPIBUFFER_H
+#define APARAPIBUFFER_H
+#include "Common.h"
+#include "ProfileInfo.h"
+#include "com_amd_aparapi_internal_jni_KernelRunnerJNI.h"
+
+class KernelArg;
+
+class AparapiBuffer{
+
+private:
+
+   static int isFloat(int type){
+      return(type&com_amd_aparapi_internal_jni_KernelRunnerJNI_ARG_FLOAT);
+   }
+   static int isLong(int type){
+      return (type&com_amd_aparapi_internal_jni_KernelRunnerJNI_ARG_LONG);
+   }
+   static int isInt(int type){
+      return (type&com_amd_aparapi_internal_jni_KernelRunnerJNI_ARG_INT);
+   }
+   static int isDouble(int type){
+      return (type&com_amd_aparapi_internal_jni_KernelRunnerJNI_ARG_DOUBLE);
+   }
+   static int isBoolean(int type){
+      return (type&com_amd_aparapi_internal_jni_KernelRunnerJNI_ARG_BOOLEAN);
+   }
+   static int isByte(int type){
+      return (type&com_amd_aparapi_internal_jni_KernelRunnerJNI_ARG_BYTE);
+   }
+   static int isShort(int type){
+      return (type&com_amd_aparapi_internal_jni_KernelRunnerJNI_ARG_SHORT);
+   }
+
+public:
+      jobject javaObject;       // The java array that this arg is mapped to 
+      cl_uint numDims;          // sizes of dimensions of the object (array lengths for ND arrays)
+      cl_uint* dims;            // sizes of offsets of the object (first element offset in ND arrays)
+      cl_uint* lens;            // sizes of dimensions of the object (array lengths for ND arrays)
+      jint lengthInBytes;       // bytes in the array or directBuf
+      cl_mem mem;               // the opencl buffer 
+      void *data;               // a copy of the object itself (this is what we pass to OpenCL)
+      cl_uint memMask;          // the mask used for createBuffer
+      ProfileInfo read;
+      ProfileInfo write;
+
+      AparapiBuffer();
+      AparapiBuffer(void* _data, cl_uint* _dims, cl_uint _numDims, long _lengthInBytes, jobject _javaObject);
+
+      void deleteBuffer(KernelArg* arg);
+
+      static AparapiBuffer* flatten(JNIEnv *env, jobject arg, int type);
+
+      static AparapiBuffer* flattenBoolean2D(JNIEnv *env, jobject arg);
+      static AparapiBuffer* flattenChar2D(JNIEnv *env, jobject arg);
+      static AparapiBuffer* flattenByte2D(JNIEnv *env, jobject arg);
+      static AparapiBuffer* flattenShort2D(JNIEnv *env, jobject arg);
+      static AparapiBuffer* flattenInt2D(JNIEnv *env, jobject arg);
+      static AparapiBuffer* flattenLong2D(JNIEnv *env, jobject arg);
+      static AparapiBuffer* flattenFloat2D(JNIEnv *env, jobject arg);
+      static AparapiBuffer* flattenDouble2D(JNIEnv *env, jobject arg);
+
+      static AparapiBuffer* flattenBoolean3D(JNIEnv *env, jobject arg);
+      static AparapiBuffer* flattenChar3D(JNIEnv *env, jobject arg);
+      static AparapiBuffer* flattenByte3D(JNIEnv *env, jobject arg);
+      static AparapiBuffer* flattenShort3D(JNIEnv *env, jobject arg);
+      static AparapiBuffer* flattenInt3D(JNIEnv *env, jobject arg);
+      static AparapiBuffer* flattenLong3D(JNIEnv *env, jobject arg);
+      static AparapiBuffer* flattenFloat3D(JNIEnv *env, jobject arg);
+      static AparapiBuffer* flattenDouble3D(JNIEnv *env, jobject arg);
+
+      void inflate(JNIEnv *env, KernelArg* arg);
+
+      void inflateBoolean2D(JNIEnv *env, KernelArg* arg);
+      void inflateChar2D(JNIEnv *env, KernelArg* arg);
+      void inflateByte2D(JNIEnv *env, KernelArg* arg);
+      void inflateShort2D(JNIEnv *env, KernelArg* arg);
+      void inflateInt2D(JNIEnv *env, KernelArg* arg);
+      void inflateLong2D(JNIEnv *env, KernelArg* arg);
+      void inflateFloat2D(JNIEnv *env, KernelArg* arg);
+      void inflateDouble2D(JNIEnv *env, KernelArg* arg);
+
+      void inflateBoolean3D(JNIEnv *env, KernelArg* arg);
+      void inflateChar3D(JNIEnv *env, KernelArg* arg);
+      void inflateByte3D(JNIEnv *env, KernelArg* arg);
+      void inflateShort3D(JNIEnv *env, KernelArg* arg);
+      void inflateInt3D(JNIEnv *env, KernelArg* arg);
+      void inflateLong3D(JNIEnv *env, KernelArg* arg);
+      void inflateFloat3D(JNIEnv *env, KernelArg* arg);
+      void inflateDouble3D(JNIEnv *env, KernelArg* arg);
+
+      jobject getJavaObject(JNIEnv* env, KernelArg* arg);
+};
+
+#endif // ARRAYBUFFER_H
diff --git a/com.amd.aparapi.jni/src/cpp/runKernel/KernelArg.cpp b/com.amd.aparapi.jni/src/cpp/runKernel/KernelArg.cpp
index c7e918b91114a6e3a87a93f91f02edfd2c8397e0..0f401ceae60180d26b444f5bbf6abdf39006c0a8 100644
--- a/com.amd.aparapi.jni/src/cpp/runKernel/KernelArg.cpp
+++ b/com.amd.aparapi.jni/src/cpp/runKernel/KernelArg.cpp
@@ -34,6 +34,8 @@ KernelArg::KernelArg(JNIEnv *jenv, JNIContext *jniContext, jobject argObj):
       jenv->ReleaseStringUTFChars(nameString, nameChars);
       if (isArray()){
          arrayBuffer = new ArrayBuffer();
+      } else if(isAparapiBuffer()) {
+         aparapiBuffer = AparapiBuffer::flatten(jenv, argObj, type);
       }
    }
 
@@ -44,6 +46,13 @@ cl_int KernelArg::setLocalBufferArg(JNIEnv *jenv, int argIdx, int argPos, bool v
    return(clSetKernelArg(jniContext->kernel, argPos, (int)arrayBuffer->lengthInBytes, NULL));
 }
 
+cl_int KernelArg::setLocalAparapiBufferArg(JNIEnv *jenv, int argIdx, int argPos, bool verbose) {
+   if (verbose){
+       fprintf(stderr, "ISLOCAL, clSetKernelArg(jniContext->kernel, %d, %d, NULL);\n", argIdx, (int) aparapiBuffer->lengthInBytes);
+   }
+   return(clSetKernelArg(jniContext->kernel, argPos, (int)aparapiBuffer->lengthInBytes, NULL));
+}
+
 const char* KernelArg::getTypeName() {
    string s = "";
    if(isStatic()) {
diff --git a/com.amd.aparapi.jni/src/cpp/runKernel/KernelArg.h b/com.amd.aparapi.jni/src/cpp/runKernel/KernelArg.h
index 75c7a5ca6dbcc948bc0e36e343edaac7e7054cd8..a54e411e90dc04c4f0ea50940ea4a12eef94aa7e 100644
--- a/com.amd.aparapi.jni/src/cpp/runKernel/KernelArg.h
+++ b/com.amd.aparapi.jni/src/cpp/runKernel/KernelArg.h
@@ -5,6 +5,7 @@
 #include "Common.h"
 #include "JNIHelper.h"
 #include "ArrayBuffer.h"
+#include "AparapiBuffer.h"
 #include "com_amd_aparapi_internal_jni_KernelRunnerJNI.h"
 #include "Config.h"
 #include <iostream>
@@ -69,6 +70,7 @@ class KernelArg{
       jint type;         // a bit mask determining the type of this arg
 
       ArrayBuffer *arrayBuffer;
+      AparapiBuffer *aparapiBuffer;
 
       // Uses JNIContext so cant inline here see below
       KernelArg(JNIEnv *jenv, JNIContext *jniContext, jobject argObj);
@@ -157,14 +159,14 @@ class KernelArg{
       int isConstant(){
          return (type&com_amd_aparapi_internal_jni_KernelRunnerJNI_ARG_CONSTANT);
       }
-      int isAparapiBuf(){
-         return (type&com_amd_aparapi_internal_jni_KernelRunnerJNI_ARG_APARAPI_BUF);
+      int isAparapiBuffer(){
+         return (type&com_amd_aparapi_internal_jni_KernelRunnerJNI_ARG_APARAPI_BUFFER);
       }
       int isBackedByArray(){
          return ( (isArray() && (isGlobal() || isConstant())));
       }
       int needToEnqueueRead(){
-         return(((isArray() && isGlobal()) || ((isAparapiBuf()&&isGlobal()))) && (isImplicit()&&isMutableByKernel()));
+         return(((isArray() && isGlobal()) || ((isAparapiBuffer()&&isGlobal()))) && (isImplicit()&&isMutableByKernel()));
       }
       int needToEnqueueWrite(){
          return ((isImplicit()&&isReadByKernel())||(isExplicit()&&isExplicitWrite()));
@@ -188,6 +190,7 @@ class KernelArg{
 
       // Uses JNIContext so can't inline here we below.  
       cl_int setLocalBufferArg(JNIEnv *jenv, int argIdx, int argPos, bool verbose);
+      cl_int setLocalAparapiBufferArg(JNIEnv *jenv, int argIdx, int argPos, bool verbose);
       // Uses JNIContext so can't inline here we below.  
       cl_int setPrimitiveArg(JNIEnv *jenv, int argIdx, int argPos, bool verbose);
 };