diff --git a/samples/mdarray/.classpath b/samples/mdarray/.classpath
new file mode 100644
index 0000000000000000000000000000000000000000..43bd144cf860aedddebc6b0492a623845407f94f
--- /dev/null
+++ b/samples/mdarray/.classpath
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath> <!-- Eclipse JDT build path for the mdarray sample -->
+	<classpathentry kind="src" path="src"/>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
+	<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
+	<classpathentry kind="lib" path="/com.amd.aparapi/dist/aparapi.jar" sourcepath="/com.amd.aparapi"> <!-- Aparapi jar plus its sources -->
+		<attributes>
+			<attribute name="org.eclipse.jdt.launching.CLASSPATH_ATTR_LIBRARY_PATH_ENTRY" value="com.amd.aparapi.jni/dist"/> <!-- directory holding the native JNI library -->
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="output" path="classes"/> <!-- compiled classes go to classes/ -->
+</classpath>
diff --git a/samples/mdarray/.project b/samples/mdarray/.project
new file mode 100644
index 0000000000000000000000000000000000000000..2273fb14a347b04d3172b14c20b8f16215e5f601
--- /dev/null
+++ b/samples/mdarray/.project
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription> <!-- Eclipse project descriptor: plain Java project named mdarray -->
+	<name>mdarray</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.jdt.core.javabuilder</name> <!-- the only builder: the JDT Java compiler -->
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+	</natures>
+</projectDescription>
diff --git a/samples/mdarray/build.xml b/samples/mdarray/build.xml
new file mode 100644
index 0000000000000000000000000000000000000000..fca33590aaf7dc8369bbb4fd72bc0017bfdc6f7d
--- /dev/null
+++ b/samples/mdarray/build.xml
@@ -0,0 +1,118 @@
+<?xml version="1.0"?>
+
+<project name="mdarray" default="build" basedir=".">
+
+   <path id="compiler.class.path"> <!-- compile-time classpath: only aparapi.jar -->
+       <pathelement path="../../com.amd.aparapi/dist/aparapi.jar"/>
+   </path>
+
+   <path id="runtime.class.path" cache="true"> <!-- run-time classpath: the compile classpath plus this project's jar -->
+      <path refid="compiler.class.path"/>
+      <pathelement path="${ant.project.name}.jar"/> <!-- ant.project.name is "mdarray", so this is mdarray.jar -->
+   </path>
+
+   <!-- Cleans, compiles src/ into classes/ against aparapi.jar, then packages ${ant.project.name}.jar. -->
+   <target name="build" depends="clean">
+      <mkdir dir="classes"/>
+      <!-- Reuse the shared compile classpath so the aparapi.jar location is defined in one place only. -->
+      <javac srcdir="src" destdir="classes" debug="on" includeantruntime="false" fork="true"
+          memorymaximumsize="3G" >
+         <classpath refid="compiler.class.path"/>
+      </javac>
+      <jar jarfile="${ant.project.name}.jar" basedir="classes"/>
+   </target>
+
+   <target name="clean"> <!-- removes everything the build target produces -->
+      <delete dir="classes"/>
+      <delete file="${ant.project.name}.jar"/>
+   </target>
+
+   <target name="run-jtp"> <!-- runs MDArray in a forked JVM with executionMode=JTP and all diagnostics off -->
+      <java classname="gov.pnnl.aparapi.sample.mdarray.MDArray" fork="true">
+         <classpath refid="runtime.class.path"/>
+         <sysproperty key="java.library.path" path="../../com.amd.aparapi.jni/dist"/> <!-- where the JVM looks for the native library -->
+         <sysproperty key="com.amd.aparapi.executionMode" value="JTP"/>
+         <sysproperty key="com.amd.aparapi.logLevel" value="WARNING"/>
+         <sysproperty key="com.amd.aparapi.enableVerboseJNI" value="false"/>
+         <sysproperty key="com.amd.aparapi.enableExecutionModeReporting" value="false"/>
+         <sysproperty key="com.amd.aparapi.enableShowGeneratedOpenCL" value="false"/>
+      </java>
+   </target>
+
+   <target name="run-gpu"> <!-- runs MDArray with executionMode=GPU, diagnostics off, and a fixed 5G heap -->
+      <java classname="gov.pnnl.aparapi.sample.mdarray.MDArray" fork="true">
+         <classpath refid="runtime.class.path"/>
+         <sysproperty key="java.library.path" path="../../com.amd.aparapi.jni/dist"/>
+         <jvmarg value="-Xmx5G"/> <!-- maximum heap -->
+         <jvmarg value="-Xms5G"/> <!-- initial heap, set equal to the maximum -->
+         <sysproperty key="com.amd.aparapi.executionMode" value="GPU"/>
+         <sysproperty key="com.amd.aparapi.logLevel" value="WARNING"/>
+         <sysproperty key="com.amd.aparapi.enableVerboseJNI" value="false"/>
+         <sysproperty key="com.amd.aparapi.enableExecutionModeReporting" value="false"/>
+         <sysproperty key="com.amd.aparapi.enableShowGeneratedOpenCL" value="false"/>
+      </java>
+   </target>
+
+
+   <target name="run-gpuv"> <!-- verbose GPU run: same mode as run-gpu but with the three diagnostic flags on and no heap settings -->
+      <java classname="gov.pnnl.aparapi.sample.mdarray.MDArray" fork="true">
+         <classpath refid="runtime.class.path"/>
+         <sysproperty key="java.library.path" path="../../com.amd.aparapi.jni/dist"/>
+         <sysproperty key="com.amd.aparapi.executionMode" value="GPU"/>
+         <sysproperty key="com.amd.aparapi.logLevel" value="WARNING"/>
+         <sysproperty key="com.amd.aparapi.enableVerboseJNI" value="true"/>
+         <sysproperty key="com.amd.aparapi.enableExecutionModeReporting" value="true"/>
+         <sysproperty key="com.amd.aparapi.enableShowGeneratedOpenCL" value="true"/>
+      </java>
+   </target>
+
+   <target name="run-cpu"> <!-- runs MDArray with executionMode=CPU and all diagnostics off -->
+      <java classname="gov.pnnl.aparapi.sample.mdarray.MDArray" fork="true">
+         <classpath refid="runtime.class.path"/>
+         <sysproperty key="java.library.path" path="../../com.amd.aparapi.jni/dist"/>
+         <sysproperty key="com.amd.aparapi.executionMode" value="CPU"/>
+         <sysproperty key="com.amd.aparapi.logLevel" value="WARNING"/>
+         <sysproperty key="com.amd.aparapi.enableVerboseJNI" value="false"/>
+         <sysproperty key="com.amd.aparapi.enableExecutionModeReporting" value="false"/>
+         <sysproperty key="com.amd.aparapi.enableShowGeneratedOpenCL" value="false"/>
+      </java>
+   </target>
+
+   <target name="run-multi"> <!-- runs MDArray with a comma-separated executionMode list (CPU,JTP) and all diagnostics off -->
+      <java classname="gov.pnnl.aparapi.sample.mdarray.MDArray" fork="true">
+         <classpath refid="runtime.class.path"/>
+         <sysproperty key="java.library.path" path="../../com.amd.aparapi.jni/dist"/>
+         <sysproperty key="com.amd.aparapi.executionMode" value="CPU,JTP"/> <!-- presumably an ordered preference list; confirm against the Aparapi documentation -->
+         <sysproperty key="com.amd.aparapi.logLevel" value="WARNING"/>
+         <sysproperty key="com.amd.aparapi.enableVerboseJNI" value="false"/>
+         <sysproperty key="com.amd.aparapi.enableExecutionModeReporting" value="false"/>
+         <sysproperty key="com.amd.aparapi.enableShowGeneratedOpenCL" value="false"/>
+      </java>
+   </target>
+
+
+   <target name="run"> <!-- runs MDArray without setting any com.amd.aparapi.* property -->
+      <java classname="gov.pnnl.aparapi.sample.mdarray.MDArray" fork="true">
+         <classpath refid="runtime.class.path"/>
+         <sysproperty key="java.library.path" path="../../com.amd.aparapi.jni/dist"/>
+      </java>
+   </target>
+
+   <target name="show"> <!-- like run, but also enables generated-OpenCL output and the instruction decode viewer -->
+      <java classname="gov.pnnl.aparapi.sample.mdarray.MDArray" fork="true">
+         <classpath refid="runtime.class.path"/>
+         <sysproperty key="java.library.path" path="../../com.amd.aparapi.jni/dist"/>
+         <sysproperty key="com.amd.aparapi.enableShowGeneratedOpenCL" value="true"/>
+         <sysproperty key="com.amd.aparapi.enableInstructionDecodeViewer" value="true"/>
+      </java>
+  </target>
+
+   <target name="print"> <!-- like run, but also enables generated-OpenCL output -->
+      <java classname="gov.pnnl.aparapi.sample.mdarray.MDArray" fork="true">
+         <classpath refid="runtime.class.path"/>
+         <sysproperty key="java.library.path" path="../../com.amd.aparapi.jni/dist"/>
+         <sysproperty key="com.amd.aparapi.enableShowGeneratedOpenCL" value="true"/>
+      </java>
+   </target>
+
+</project>
diff --git a/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/BMatMul1D.java b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/BMatMul1D.java
new file mode 100644
index 0000000000000000000000000000000000000000..4ffe62409460624c4829b107c577b8cc0b5ad382
--- /dev/null
+++ b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/BMatMul1D.java
@@ -0,0 +1,28 @@
+package gov.pnnl.aparapi.sample.mdarray;
+import com.amd.aparapi.Kernel;
+
+class BMatMul1D extends Kernel{ // Aparapi kernel: each work-item accumulates one element of C += A x B (byte, flat arrays)
+   byte[] A; // left operand, read as A[i * N + k]
+
+   byte[] B; // right operand, read as B[k * N + j]
+
+   byte[] C; // accumulator, updated in place at C[i * N + j]
+
+   int N; // divisor/stride that maps the flat global id to (i, j)
+
+   public BMatMul1D(byte[] A, byte[] B, byte[] C, int N) { // stores the references; nothing is copied
+      this.A = A;
+      this.B = B;
+      this.C = C;
+      this.N = N;
+   }
+
+   @Override public void run() {
+      int id = getGlobalId(); // flat work-item index; assumes a launch range of N * N - TODO confirm with caller
+      int i = id / N; // row
+      int j = id % N; // column
+      for (int k = 0; k < N; k++) {
+         C[i * N + j] += (byte) (A[i * N + k] * B[k * N + j]); // product is narrowed to byte before it is added
+      }
+   }
+}
diff --git a/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/BMatMul2D.java b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/BMatMul2D.java
new file mode 100644
index 0000000000000000000000000000000000000000..1839d82b4f1f262a8dcba9f222812271e43e55cc
--- /dev/null
+++ b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/BMatMul2D.java
@@ -0,0 +1,28 @@
+package gov.pnnl.aparapi.sample.mdarray;
+import com.amd.aparapi.Kernel;
+
+class BMatMul2D extends Kernel{ // Aparapi kernel: each work-item accumulates one element of C += A x B (byte, 2D arrays)
+   byte[][] A; // left operand, read as A[i][k]
+
+   byte[][] B; // right operand, read as B[k][j]
+
+   byte[][] C; // accumulator, updated in place at C[i][j]
+
+   int N; // divisor that maps the flat global id to (i, j); also the loop bound for k
+
+   public BMatMul2D(byte[][] A, byte[][] B, byte[][] C, int N) { // stores the references; nothing is copied
+      this.A = A;
+      this.B = B;
+      this.C = C;
+      this.N = N;
+   }
+
+   @Override public void run() {
+      int id = getGlobalId(); // flat work-item index; assumes a launch range of N * N - TODO confirm with caller
+      int i = id / N; // row
+      int j = id % N; // column
+      for (int k = 0; k < N; k++) {
+         C[i][j] += (byte) (A[i][k] * B[k][j]); // product is narrowed to byte before it is added
+      }
+   }
+}
diff --git a/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/BMatMul3D.java b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/BMatMul3D.java
new file mode 100644
index 0000000000000000000000000000000000000000..eb27bf7630cbf85a313c7a210ed899b5312dbcfc
--- /dev/null
+++ b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/BMatMul3D.java
@@ -0,0 +1,32 @@
+package gov.pnnl.aparapi.sample.mdarray;
+import com.amd.aparapi.Kernel;
+
+class BMatMul3D extends Kernel{ // Aparapi kernel: each work-item accumulates C[i][j][k] += sum over l of A[i][j][l] * B[l][j][k] (byte)
+   byte[][][] A; // read as A[i][j][l]
+
+   byte[][][] B; // read as B[l][j][k]
+
+   byte[][][] C; // accumulator, updated in place at C[i][j][k]
+
+   int N; // divisor that maps the flat global id to (i, j, k); also the loop bound for l
+
+   public BMatMul3D(byte[][][] A, byte[][][] B, byte[][][] C, int N) { // stores the references; nothing is copied
+      this.A = A;
+      this.B = B;
+      this.C = C;
+      this.N = N;
+   }
+
+   @Override public void run() {
+      int id = getGlobalId(); // flat work-item index; assumes a launch range of N * N * N - TODO confirm with caller
+      int i = id / (N * N); // slowest-varying index
+      int j = (id / N) % N; // middle index
+      int k = id % N; // fastest-varying index
+      int a0 = A.length; // NOTE(review): a0, a1 and a2 are never read; presumably kept to exercise array-length access in a kernel - confirm before removing
+      int a1 = A[0].length;
+      int a2 = A[0][0].length;
+      for (int l = 0; l < N; l++) {
+         C[i][j][k] += (byte) (A[i][j][l] * B[l][j][k]); // product is narrowed to byte before it is added
+      }
+   }
+}
diff --git a/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/DMatMul1D.java b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/DMatMul1D.java
new file mode 100644
index 0000000000000000000000000000000000000000..f852a900431f26a5662344adbb974f1fa9d339f7
--- /dev/null
+++ b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/DMatMul1D.java
@@ -0,0 +1,28 @@
+package gov.pnnl.aparapi.sample.mdarray;
+import com.amd.aparapi.Kernel;
+
+class DMatMul1D extends Kernel{ // Aparapi kernel: each work-item accumulates one element of C += A x B (double, flat arrays)
+   double[] A; // left operand, read as A[i * N + k]
+
+   double[] B; // right operand, read as B[k * N + j]
+
+   double[] C; // accumulator, updated in place at C[i * N + j]
+
+   int N; // divisor/stride that maps the flat global id to (i, j)
+
+   public DMatMul1D(double[] A, double[] B, double[] C, int N) { // stores the references; nothing is copied
+      this.A = A;
+      this.B = B;
+      this.C = C;
+      this.N = N;
+   }
+
+   @Override public void run() {
+      int id = getGlobalId(); // flat work-item index; assumes a launch range of N * N - TODO confirm with caller
+      int i = id / N; // row
+      int j = id % N; // column
+      for (int k = 0; k < N; k++) {
+         C[i * N + j] += A[i * N + k] * B[k * N + j]; // adds to whatever C already holds
+      }
+   }
+}
diff --git a/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/DMatMul2D.java b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/DMatMul2D.java
new file mode 100644
index 0000000000000000000000000000000000000000..e46e8f6e68ca6e872eb55d0713098b252a90ec6e
--- /dev/null
+++ b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/DMatMul2D.java
@@ -0,0 +1,28 @@
+package gov.pnnl.aparapi.sample.mdarray;
+import com.amd.aparapi.Kernel;
+
+class DMatMul2D extends Kernel{ // Aparapi kernel: each work-item accumulates one element of C += A x B (double, 2D arrays)
+   double[][] A; // left operand, read as A[i][k]
+
+   double[][] B; // right operand, read as B[k][j]
+
+   double[][] C; // accumulator, updated in place at C[i][j]
+
+   int N; // divisor that maps the flat global id to (i, j); also the loop bound for k
+
+   public DMatMul2D(double[][] A, double[][] B, double[][] C, int N) { // stores the references; nothing is copied
+      this.A = A;
+      this.B = B;
+      this.C = C;
+      this.N = N;
+   }
+
+   @Override public void run() {
+      int id = getGlobalId(); // flat work-item index; assumes a launch range of N * N - TODO confirm with caller
+      int i = id / N; // row
+      int j = id % N; // column
+      for (int k = 0; k < N; k++) {
+         C[i][j] += A[i][k] * B[k][j]; // adds to whatever C[i][j] already holds
+      }
+   }
+}
diff --git a/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/DMatMul3D.java b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/DMatMul3D.java
new file mode 100644
index 0000000000000000000000000000000000000000..8641f2ff60fc538bed160029cc8a085e715892cf
--- /dev/null
+++ b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/DMatMul3D.java
@@ -0,0 +1,29 @@
+package gov.pnnl.aparapi.sample.mdarray;
+import com.amd.aparapi.Kernel;
+
+class DMatMul3D extends Kernel{ // Aparapi kernel: each work-item accumulates C[i][j][k] += sum over l of A[i][j][l] * B[l][j][k] (double)
+   double[][][] A; // read as A[i][j][l]
+
+   double[][][] B; // read as B[l][j][k]
+
+   double[][][] C; // accumulator, updated in place at C[i][j][k]
+
+   int N; // divisor that maps the flat global id to (i, j, k); also the loop bound for l
+
+   public DMatMul3D(double[][][] A, double[][][] B, double[][][] C, int N) { // stores the references; nothing is copied
+      this.A = A;
+      this.B = B;
+      this.C = C;
+      this.N = N;
+   }
+
+   @Override public void run() {
+      int id = getGlobalId(); // flat work-item index; assumes a launch range of N * N * N - TODO confirm with caller
+      int i = id / (N * N); // slowest-varying index
+      int j = (id / N) % N; // middle index
+      int k = id % N; // fastest-varying index
+      for (int l = 0; l < N; l++) {
+         C[i][j][k] += A[i][j][l] * B[l][j][k]; // adds to whatever C[i][j][k] already holds
+      }
+   }
+}
diff --git a/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/FMatMul1D.java b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/FMatMul1D.java
new file mode 100644
index 0000000000000000000000000000000000000000..15a2fa21c62f98dd600e5e244f25ea64529d9146
--- /dev/null
+++ b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/FMatMul1D.java
@@ -0,0 +1,28 @@
+package gov.pnnl.aparapi.sample.mdarray;
+import com.amd.aparapi.Kernel;
+
+class FMatMul1D extends Kernel{ // Aparapi kernel: each work-item accumulates one element of C += A x B (float, flat arrays)
+   float[] A; // left operand, read as A[i * N + k]
+
+   float[] B; // right operand, read as B[k * N + j]
+
+   float[] C; // accumulator, updated in place at C[i * N + j]
+
+   int N; // divisor/stride that maps the flat global id to (i, j)
+
+   public FMatMul1D(float[] A, float[] B, float[] C, int N) { // stores the references; nothing is copied
+      this.A = A;
+      this.B = B;
+      this.C = C;
+      this.N = N;
+   }
+
+   @Override public void run() {
+      int id = getGlobalId(); // flat work-item index; assumes a launch range of N * N - TODO confirm with caller
+      int i = id / N; // row
+      int j = id % N; // column
+      for (int k = 0; k < N; k++) {
+         C[i * N + j] += A[i * N + k] * B[k * N + j]; // adds to whatever C already holds
+      }
+   }
+}
diff --git a/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/FMatMul2D.java b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/FMatMul2D.java
new file mode 100644
index 0000000000000000000000000000000000000000..21e66103d0992e336cf8cf68196dbb78b6fde875
--- /dev/null
+++ b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/FMatMul2D.java
@@ -0,0 +1,28 @@
+package gov.pnnl.aparapi.sample.mdarray;
+import com.amd.aparapi.Kernel;
+
+class FMatMul2D extends Kernel{ // Aparapi kernel: each work-item accumulates one element of C += A x B (float, 2D arrays)
+   float[][] A; // left operand, read as A[i][k]
+
+   float[][] B; // right operand, read as B[k][j]
+
+   float[][] C; // accumulator, updated in place at C[i][j]
+
+   int N; // divisor that maps the flat global id to (i, j); also the loop bound for k
+
+   public FMatMul2D(float[][] A, float[][] B, float[][] C, int N) { // stores the references; nothing is copied
+      this.A = A;
+      this.B = B;
+      this.C = C;
+      this.N = N;
+   }
+
+   @Override public void run() {
+      int id = getGlobalId(); // flat work-item index; assumes a launch range of N * N - TODO confirm with caller
+      int i = id / N; // row
+      int j = id % N; // column
+      for (int k = 0; k < N; k++) {
+         C[i][j] += A[i][k] * B[k][j]; // adds to whatever C[i][j] already holds
+      }
+   }
+}
diff --git a/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/FMatMul3D.java b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/FMatMul3D.java
new file mode 100644
index 0000000000000000000000000000000000000000..b5af1848f54fd885ce5f7338938b58bb0f591f79
--- /dev/null
+++ b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/FMatMul3D.java
@@ -0,0 +1,29 @@
+package gov.pnnl.aparapi.sample.mdarray;
+import com.amd.aparapi.Kernel;
+
+class FMatMul3D extends Kernel{ // Aparapi kernel: each work-item accumulates C[i][j][k] += sum over l of A[i][j][l] * B[l][j][k] (float)
+   float[][][] A; // read as A[i][j][l]
+
+   float[][][] B; // read as B[l][j][k]
+
+   float[][][] C; // accumulator, updated in place at C[i][j][k]
+
+   int N; // divisor that maps the flat global id to (i, j, k); also the loop bound for l
+
+   public FMatMul3D(float[][][] A, float[][][] B, float[][][] C, int N) { // stores the references; nothing is copied
+      this.A = A;
+      this.B = B;
+      this.C = C;
+      this.N = N;
+   }
+
+   @Override public void run() {
+      int id = getGlobalId(); // flat work-item index; assumes a launch range of N * N * N - TODO confirm with caller
+      int i = id / (N * N); // slowest-varying index
+      int j = (id / N) % N; // middle index
+      int k = id % N; // fastest-varying index
+      for (int l = 0; l < N; l++) {
+         C[i][j][k] += A[i][j][l] * B[l][j][k]; // adds to whatever C[i][j][k] already holds
+      }
+   }
+}
diff --git a/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/IMatMul1D.java b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/IMatMul1D.java
new file mode 100644
index 0000000000000000000000000000000000000000..4760c0867afc5d9a23cf504393370e4a53b937ff
--- /dev/null
+++ b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/IMatMul1D.java
@@ -0,0 +1,28 @@
+package gov.pnnl.aparapi.sample.mdarray;
+import com.amd.aparapi.Kernel;
+
+class IMatMul1D extends Kernel{ // Aparapi kernel: each work-item accumulates one element of C += A x B (int, flat arrays)
+   int[] A; // left operand, read as A[i * N + k]
+
+   int[] B; // right operand, read as B[k * N + j]
+
+   int[] C; // accumulator, updated in place at C[i * N + j]
+
+   int N; // divisor/stride that maps the flat global id to (i, j)
+
+   public IMatMul1D(int[] A, int[] B, int[] C, int N) { // stores the references; nothing is copied
+      this.A = A;
+      this.B = B;
+      this.C = C;
+      this.N = N;
+   }
+
+   @Override public void run() {
+      int id = getGlobalId(); // flat work-item index; assumes a launch range of N * N - TODO confirm with caller
+      int i = id / N; // row
+      int j = id % N; // column
+      for (int k = 0; k < N; k++) {
+         C[i * N + j] += A[i * N + k] * B[k * N + j]; // adds to whatever C already holds
+      }
+   }
+}
diff --git a/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/IMatMul2D.java b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/IMatMul2D.java
new file mode 100644
index 0000000000000000000000000000000000000000..4493633b2a21b67b9967efd9da96789ae35b1104
--- /dev/null
+++ b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/IMatMul2D.java
@@ -0,0 +1,28 @@
+package gov.pnnl.aparapi.sample.mdarray;
+import com.amd.aparapi.Kernel;
+
+class IMatMul2D extends Kernel{ // Aparapi kernel: each work-item accumulates one element of C += A x B (int, 2D arrays)
+   int[][] A; // left operand, read as A[i][k]
+
+   int[][] B; // right operand, read as B[k][j]
+
+   int[][] C; // accumulator, updated in place at C[i][j]
+
+   int N; // divisor that maps the flat global id to (i, j); also the loop bound for k
+
+   public IMatMul2D(int[][] A, int[][] B, int[][] C, int N) { // stores the references; nothing is copied
+      this.A = A;
+      this.B = B;
+      this.C = C;
+      this.N = N;
+   }
+
+   @Override public void run() {
+      int id = getGlobalId(); // flat work-item index; assumes a launch range of N * N - TODO confirm with caller
+      int i = id / N; // row
+      int j = id % N; // column
+      for (int k = 0; k < N; k++) {
+         C[i][j] += A[i][k] * B[k][j]; // adds to whatever C[i][j] already holds
+      }
+   }
+}
diff --git a/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/IMatMul3D.java b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/IMatMul3D.java
new file mode 100644
index 0000000000000000000000000000000000000000..a4658b6593897f88519608378d8353435e1990cc
--- /dev/null
+++ b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/IMatMul3D.java
@@ -0,0 +1,29 @@
+package gov.pnnl.aparapi.sample.mdarray;
+import com.amd.aparapi.Kernel;
+
+class IMatMul3D extends Kernel{ // Aparapi kernel: each work-item accumulates C[i][j][k] += sum over l of A[i][j][l] * B[l][j][k] (int)
+   int[][][] A; // read as A[i][j][l]
+
+   int[][][] B; // read as B[l][j][k]
+
+   int[][][] C; // accumulator, updated in place at C[i][j][k]
+
+   int N; // divisor that maps the flat global id to (i, j, k); also the loop bound for l
+
+   public IMatMul3D(int[][][] A, int[][][] B, int[][][] C, int N) { // stores the references; nothing is copied
+      this.A = A;
+      this.B = B;
+      this.C = C;
+      this.N = N;
+   }
+
+   @Override public void run() {
+      int id = getGlobalId(); // flat work-item index; assumes a launch range of N * N * N - TODO confirm with caller
+      int i = id / (N * N); // slowest-varying index
+      int j = (id / N) % N; // middle index
+      int k = id % N; // fastest-varying index
+      for (int l = 0; l < N; l++) {
+         C[i][j][k] += A[i][j][l] * B[l][j][k]; // adds to whatever C[i][j][k] already holds
+      }
+   }
+}
diff --git a/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/LMatMul1D.java b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/LMatMul1D.java
new file mode 100644
index 0000000000000000000000000000000000000000..9d81d1e8629f824443b905a589783efc967420e9
--- /dev/null
+++ b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/LMatMul1D.java
@@ -0,0 +1,28 @@
+package gov.pnnl.aparapi.sample.mdarray;
+import com.amd.aparapi.Kernel;
+
+class LMatMul1D extends Kernel{ // Aparapi kernel: each work-item accumulates one element of C += A x B (long, flat arrays)
+   long[] A; // left operand, read as A[i * N + k]
+
+   long[] B; // right operand, read as B[k * N + j]
+
+   long[] C; // accumulator, updated in place at C[i * N + j]
+
+   int N; // divisor/stride that maps the flat global id to (i, j)
+
+   public LMatMul1D(long[] A, long[] B, long[] C, int N) { // stores the references; nothing is copied
+      this.A = A;
+      this.B = B;
+      this.C = C;
+      this.N = N;
+   }
+
+   @Override public void run() {
+      int id = getGlobalId(); // flat work-item index; assumes a launch range of N * N - TODO confirm with caller
+      int i = id / N; // row
+      int j = id % N; // column
+      for (int k = 0; k < N; k++) {
+         C[i * N + j] += A[i * N + k] * B[k * N + j]; // adds to whatever C already holds
+      }
+   }
+}
diff --git a/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/LMatMul2D.java b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/LMatMul2D.java
new file mode 100644
index 0000000000000000000000000000000000000000..d8f8b8e57347a0226aae083411095dffcb65983b
--- /dev/null
+++ b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/LMatMul2D.java
@@ -0,0 +1,28 @@
+package gov.pnnl.aparapi.sample.mdarray;
+import com.amd.aparapi.Kernel;
+
+class LMatMul2D extends Kernel{ // Aparapi kernel: each work-item accumulates one element of C += A x B (long, 2D arrays)
+   long[][] A; // left operand, read as A[i][k]
+
+   long[][] B; // right operand, read as B[k][j]
+
+   long[][] C; // accumulator, updated in place at C[i][j]
+
+   int N; // divisor that maps the flat global id to (i, j); also the loop bound for k
+
+   public LMatMul2D(long[][] A, long[][] B, long[][] C, int N) { // stores the references; nothing is copied
+      this.A = A;
+      this.B = B;
+      this.C = C;
+      this.N = N;
+   }
+
+   @Override public void run() {
+      int id = getGlobalId(); // flat work-item index; assumes a launch range of N * N - TODO confirm with caller
+      int i = id / N; // row
+      int j = id % N; // column
+      for (int k = 0; k < N; k++) {
+         C[i][j] += A[i][k] * B[k][j]; // adds to whatever C[i][j] already holds
+      }
+   }
+}
diff --git a/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/LMatMul3D.java b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/LMatMul3D.java
new file mode 100644
index 0000000000000000000000000000000000000000..46ec3c8fbb245cdb6b16c4e5c12b15264a9ebe3c
--- /dev/null
+++ b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/LMatMul3D.java
@@ -0,0 +1,29 @@
+package gov.pnnl.aparapi.sample.mdarray;
+import com.amd.aparapi.Kernel;
+
+class LMatMul3D extends Kernel{ // Aparapi kernel: each work-item accumulates C[i][j][k] += sum over l of A[i][j][l] * B[l][j][k] (long)
+   long[][][] A; // read as A[i][j][l]
+
+   long[][][] B; // read as B[l][j][k]
+
+   long[][][] C; // accumulator, updated in place at C[i][j][k]
+
+   int N; // divisor that maps the flat global id to (i, j, k); also the loop bound for l
+
+   public LMatMul3D(long[][][] A, long[][][] B, long[][][] C, int N) { // stores the references; nothing is copied
+      this.A = A;
+      this.B = B;
+      this.C = C;
+      this.N = N;
+   }
+
+   @Override public void run() {
+      int id = getGlobalId(); // flat work-item index; assumes a launch range of N * N * N - TODO confirm with caller
+      int i = id / (N * N); // slowest-varying index
+      int j = (id / N) % N; // middle index
+      int k = id % N; // fastest-varying index
+      for (int l = 0; l < N; l++) {
+         C[i][j][k] += A[i][j][l] * B[l][j][k]; // adds to whatever C[i][j][k] already holds
+      }
+   }
+}
diff --git a/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/MDArray.java b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/MDArray.java
new file mode 100644
index 0000000000000000000000000000000000000000..8e80c59ee5eabcd667d4c8cd863d3a161b55e3c4
--- /dev/null
+++ b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/MDArray.java
@@ -0,0 +1,1281 @@
+package gov.pnnl.aparapi.sample.mdarray;
+
+import com.amd.aparapi.Kernel;
+
+class MDArray {
+
+   static int N = 1 << 10; // 1024; presumably the 1D/2D problem size used by the run methods further down - TODO confirm
+
+   static int M = 1 << 5; // 32; presumably the smaller size used for the 3D cases - TODO confirm
+
+   public static void main(String[] args) { // prints a label, then runs each element-type case for 1D, 2D and 3D in turn; args is not read
+      System.out.println("boolean 1D"); // ---- 1D cases ----
+      Zrun1D();
+      System.out.println("byte 1D");
+      Brun1D();
+      System.out.println("short 1D");
+      Srun1D();
+      System.out.println("int 1D");
+      Irun1D();
+      System.out.println("long 1D");
+      Lrun1D();
+      System.out.println("float 1D");
+      Frun1D();
+      System.out.println("double 1D");
+      Drun1D();
+      System.out.println("boolean 2D"); // ---- 2D cases ----
+      Zrun2D();
+      System.out.println("byte 2D");
+      Brun2D();
+      System.out.println("short 2D");
+      Srun2D();
+      System.out.println("int 2D");
+      Irun2D();
+      System.out.println("long 2D");
+      Lrun2D();
+      System.out.println("float 2D");
+      Frun2D();
+      System.out.println("double 2D");
+      Drun2D();
+      System.out.println("boolean 3D"); // ---- 3D cases ----
+      Zrun3D();
+      System.out.println("byte 3D");
+      Brun3D();
+      System.out.println("short 3D");
+      Srun3D();
+      System.out.println("int 3D");
+      Irun3D();
+      System.out.println("long 3D");
+      Lrun3D();
+      System.out.println("float 3D");
+      Frun3D();
+      System.out.println("double 3D");
+      Drun3D();
+   }
+
+   private static boolean[] matMull(boolean[] A, boolean[] B, int N) {
+      final boolean[] product = new boolean[N * N]; // row-major N x N result of A x B over GF(2): AND multiplies, XOR adds
+      for (int row = 0; row < N; row++) {
+         for (int col = 0, rowBase = row * N; col < N; col++) {
+            for (int inner = 0; inner < N; inner++) {
+               product[rowBase + col] ^= A[rowBase + inner] & B[(inner * N) + col];
+            }
+         }
+      }
+      return product;
+   }
+
+   private static byte[] matMull(byte[] A, byte[] B, int N) {
+      final byte[] product = new byte[N * N]; // row-major N x N result of A x B with byte wrap-around
+      for (int row = 0; row < N; row++) {
+         for (int col = 0, rowBase = row * N; col < N; col++) {
+            for (int inner = 0; inner < N; inner++) {
+               product[rowBase + col] += (byte) (A[rowBase + inner] * B[(inner * N) + col]);
+            }
+         }
+      }
+      return product;
+   }
+
+   private static short[] matMull(short[] A, short[] B, int N) {
+      final short[] product = new short[N * N]; // row-major N x N result of A x B with short wrap-around
+      for (int row = 0; row < N; row++) {
+         for (int col = 0, rowBase = row * N; col < N; col++) {
+            for (int inner = 0; inner < N; inner++) {
+               product[rowBase + col] += (short) (A[rowBase + inner] * B[(inner * N) + col]);
+            }
+         }
+      }
+      return product;
+   }
+
+   private static int[] matMull(int[] A, int[] B, int N) {
+      final int[] product = new int[N * N]; // row-major N x N result of A x B, starts zeroed
+      for (int row = 0; row < N; row++) {
+         for (int col = 0, rowBase = row * N; col < N; col++) {
+            for (int inner = 0; inner < N; inner++) {
+               product[rowBase + col] += A[rowBase + inner] * B[(inner * N) + col];
+            }
+         }
+      }
+      return product;
+   }
+
+   private static long[] matMull(long[] A, long[] B, int N) {
+      final long[] product = new long[N * N]; // row-major N x N result of A x B, starts zeroed
+      for (int row = 0; row < N; row++) {
+         for (int col = 0, rowBase = row * N; col < N; col++) {
+            for (int inner = 0; inner < N; inner++) {
+               product[rowBase + col] += A[rowBase + inner] * B[(inner * N) + col];
+            }
+         }
+      }
+      return product;
+   }
+
+   private static float[] matMull(float[] A, float[] B, int N) {
+      final float[] product = new float[N * N]; // row-major N x N result of A x B, summed in increasing inner index
+      for (int row = 0; row < N; row++) {
+         for (int col = 0, rowBase = row * N; col < N; col++) {
+            for (int inner = 0; inner < N; inner++) {
+               product[rowBase + col] += A[rowBase + inner] * B[(inner * N) + col];
+            }
+         }
+      }
+      return product;
+   }
+
+   private static double[] matMull(double[] A, double[] B, int N) {
+      final double[] product = new double[N * N]; // row-major N x N result of A x B, summed in increasing inner index
+      for (int row = 0; row < N; row++) {
+         for (int col = 0, rowBase = row * N; col < N; col++) {
+            for (int inner = 0; inner < N; inner++) {
+               product[rowBase + col] += A[rowBase + inner] * B[(inner * N) + col];
+            }
+         }
+      }
+      return product;
+   }
+
+   private static boolean[][] matMull(boolean[][] A, boolean[][] B, int N) {
+      final boolean[][] product = new boolean[N][N]; // A x B over GF(2): AND multiplies, XOR adds
+      for (int row = 0; row < N; row++) {
+         for (int col = 0; col < N; col++) {
+            for (int inner = 0; inner < N; inner++) {
+               product[row][col] ^= A[row][inner] & B[inner][col];
+            }
+         }
+      }
+      return product;
+   }
+
+   private static byte[][] matMull(byte[][] A, byte[][] B, int N) {
+      final byte[][] product = new byte[N][N]; // A x B with byte wrap-around
+      for (int row = 0; row < N; row++) {
+         for (int col = 0; col < N; col++) {
+            for (int inner = 0; inner < N; inner++) {
+               product[row][col] += (byte) (A[row][inner] * B[inner][col]);
+            }
+         }
+      }
+      return product;
+   }
+
+   private static short[][] matMull(short[][] A, short[][] B, int N) {
+      final short[][] product = new short[N][N]; // A x B with short wrap-around
+      for (int row = 0; row < N; row++) {
+         for (int col = 0; col < N; col++) {
+            for (int inner = 0; inner < N; inner++) {
+               product[row][col] += (short) (A[row][inner] * B[inner][col]);
+            }
+         }
+      }
+      return product;
+   }
+
+   private static int[][] matMull(int[][] A, int[][] B, int N) {
+      final int[][] product = new int[N][N]; // product[row][col] = sum over inner of A[row][inner] * B[inner][col]
+      for (int row = 0; row < N; row++) {
+         for (int col = 0; col < N; col++) {
+            for (int inner = 0; inner < N; inner++) {
+               product[row][col] += A[row][inner] * B[inner][col];
+            }
+         }
+      }
+      return product;
+   }
+
+   private static long[][] matMull(long[][] A, long[][] B, int N) {
+      final long[][] product = new long[N][N]; // product[row][col] = sum over inner of A[row][inner] * B[inner][col]
+      for (int row = 0; row < N; row++) {
+         for (int col = 0; col < N; col++) {
+            for (int inner = 0; inner < N; inner++) {
+               product[row][col] += A[row][inner] * B[inner][col];
+            }
+         }
+      }
+      return product;
+   }
+
+   private static float[][] matMull(float[][] A, float[][] B, int N) {
+      final float[][] product = new float[N][N]; // product[row][col] = sum over inner of A[row][inner] * B[inner][col]
+      for (int row = 0; row < N; row++) {
+         for (int col = 0; col < N; col++) {
+            for (int inner = 0; inner < N; inner++) {
+               product[row][col] += A[row][inner] * B[inner][col];
+            }
+         }
+      }
+      return product;
+   }
+
+   private static double[][] matMull(double[][] A, double[][] B, int N) {
+      final double[][] product = new double[N][N]; // product[row][col] = sum over inner of A[row][inner] * B[inner][col]
+      for (int row = 0; row < N; row++) {
+         for (int col = 0; col < N; col++) {
+            for (int inner = 0; inner < N; inner++) {
+               product[row][col] += A[row][inner] * B[inner][col];
+            }
+         }
+      }
+      return product;
+   }
+
+   private static boolean[][][] matMull(boolean[][][] A, boolean[][][] B, int N) {
+      final boolean[][][] product = new boolean[N][N][N]; // product[x][y][z] = XOR over t of A[x][y][t] AND B[t][y][z]
+      for (int x = 0; x < N; x++) {
+         for (int y = 0; y < N; y++) {
+            for (int z = 0; z < N; z++) {
+               for (int t = 0; t < N; t++) {
+                  product[x][y][z] ^= A[x][y][t] & B[t][y][z];
+               }
+            }
+         }
+      }
+      return product;
+   }
+
+   private static byte[][][] matMull(byte[][][] A, byte[][][] B, int N) {
+      final byte[][][] product = new byte[N][N][N]; // product[x][y][z] = sum over t of A[x][y][t] * B[t][y][z], byte wrap-around
+      for (int x = 0; x < N; x++) {
+         for (int y = 0; y < N; y++) {
+            for (int z = 0; z < N; z++) {
+               for (int t = 0; t < N; t++) {
+                  product[x][y][z] += (byte) (A[x][y][t] * B[t][y][z]);
+               }
+            }
+         }
+      }
+      return product;
+   }
+
+   private static short[][][] matMull(short[][][] A, short[][][] B, int N) {
+      final short[][][] product = new short[N][N][N]; // product[x][y][z] = sum over t of A[x][y][t] * B[t][y][z], short wrap-around
+      for (int x = 0; x < N; x++) {
+         for (int y = 0; y < N; y++) {
+            for (int z = 0; z < N; z++) {
+               for (int t = 0; t < N; t++) {
+                  product[x][y][z] += (short) (A[x][y][t] * B[t][y][z]);
+               }
+            }
+         }
+      }
+      return product;
+   }
+
+   private static int[][][] matMull(int[][][] A, int[][][] B, int N) {
+      final int[][][] product = new int[N][N][N]; // product[x][y][z] = sum over t of A[x][y][t] * B[t][y][z]
+      for (int x = 0; x < N; x++) {
+         for (int y = 0; y < N; y++) {
+            for (int z = 0; z < N; z++) {
+               for (int t = 0; t < N; t++) {
+                  product[x][y][z] += A[x][y][t] * B[t][y][z];
+               }
+            }
+         }
+      }
+      return product;
+   }
+
+   private static long[][][] matMull(long[][][] A, long[][][] B, int N) {
+      final long[][][] product = new long[N][N][N]; // product[x][y][z] = sum over t of A[x][y][t] * B[t][y][z]
+      for (int x = 0; x < N; x++) {
+         for (int y = 0; y < N; y++) {
+            for (int z = 0; z < N; z++) {
+               for (int t = 0; t < N; t++) {
+                  product[x][y][z] += A[x][y][t] * B[t][y][z];
+               }
+            }
+         }
+      }
+      return product;
+   }
+
+   private static float[][][] matMull(float[][][] A, float[][][] B, int N) {
+      final float[][][] product = new float[N][N][N]; // product[x][y][z] = sum over t of A[x][y][t] * B[t][y][z]
+      for (int x = 0; x < N; x++) {
+         for (int y = 0; y < N; y++) {
+            for (int z = 0; z < N; z++) {
+               for (int t = 0; t < N; t++) {
+                  product[x][y][z] += A[x][y][t] * B[t][y][z];
+               }
+            }
+         }
+      }
+      return product;
+   }
+
+   private static double[][][] matMull(double[][][] A, double[][][] B, int N) {
+      final double[][][] product = new double[N][N][N]; // product[x][y][z] = sum over t of A[x][y][t] * B[t][y][z]
+      for (int x = 0; x < N; x++) {
+         for (int y = 0; y < N; y++) {
+            for (int z = 0; z < N; z++) {
+               for (int t = 0; t < N; t++) {
+                  product[x][y][z] += A[x][y][t] * B[t][y][z];
+               }
+            }
+         }
+      }
+      return product;
+   }
+
+   /** Returns true only when both arrays have the same length and are equal element by element. */
+   private static boolean checkResults(boolean[] cpu, boolean[] gpu) {
+      boolean same = cpu.length == gpu.length; // a length mismatch is a failed check, not an exception
+      for (int i = 0; same && i < cpu.length; i++) {
+         same = cpu[i] == gpu[i];
+      }
+      return same;
+   }
+
+   /** Returns true only when both arrays have the same length and are equal element by element. */
+   private static boolean checkResults(byte[] cpu, byte[] gpu) {
+      boolean same = cpu.length == gpu.length; // a length mismatch is a failed check, not an exception
+      for (int i = 0; same && i < cpu.length; i++) {
+         same = cpu[i] == gpu[i];
+      }
+      return same;
+   }
+
+   /** Returns true only when both arrays have the same length and are equal element by element. */
+   private static boolean checkResults(short[] cpu, short[] gpu) {
+      boolean same = cpu.length == gpu.length; // a length mismatch is a failed check, not an exception
+      for (int i = 0; same && i < cpu.length; i++) {
+         same = cpu[i] == gpu[i];
+      }
+      return same;
+   }
+
+   /** Returns true only when both arrays have the same length and are equal element by element. */
+   private static boolean checkResults(int[] cpu, int[] gpu) {
+      boolean same = cpu.length == gpu.length; // a length mismatch is a failed check, not an exception
+      for (int i = 0; same && i < cpu.length; i++) {
+         same = cpu[i] == gpu[i];
+      }
+      return same;
+   }
+
+   /** Returns true only when both arrays have the same length and are equal element by element. */
+   private static boolean checkResults(long[] cpu, long[] gpu) {
+      boolean same = cpu.length == gpu.length; // a length mismatch is a failed check, not an exception
+      for (int i = 0; same && i < cpu.length; i++) {
+         same = cpu[i] == gpu[i];
+      }
+      return same;
+   }
+
+   /** Returns true only when both arrays have the same length and every pair compares == (no tolerance; NaN never matches). */
+   private static boolean checkResults(float[] cpu, float[] gpu) {
+      boolean same = cpu.length == gpu.length; // a length mismatch is a failed check, not an exception
+      for (int i = 0; same && i < cpu.length; i++) {
+         same = cpu[i] == gpu[i];
+      }
+      return same;
+   }
+
+   /**
+    * Checks that the kernel output matches the CPU reference.
+    *
+    * @param cpu values produced by the CPU reference
+    * @param gpu values written by the kernel
+    * @return {@code true} only if both arrays have the same length and every pair
+    *         of elements compares equal with {@code ==}; a length mismatch counts
+    *         as a failed check rather than being ignored or raising an index error
+    */
+   private static boolean checkResults(double[] cpu, double[] gpu) {
+      if (cpu.length != gpu.length) {
+         return false;
+      }
+      for (int i = 0; i < cpu.length; i++) {
+         // Kept as a primitive comparison (not Arrays.equals) so that NaN never
+         // matches and 0.0 still equals -0.0, exactly as before.
+         if (cpu[i] != gpu[i]) {
+            return false;
+         }
+      }
+      return true;
+   }
+
+   /**
+    * Checks that the two-dimensional kernel output matches the CPU reference.
+    *
+    * @param cpu values produced by the CPU reference
+    * @param gpu values written by the kernel
+    * @return {@code true} only if both arrays have the same number of rows, every
+    *         pair of rows has the same length, and all elements are equal; shape
+    *         mismatches count as a failed check
+    */
+   private static boolean checkResults(boolean[][] cpu, boolean[][] gpu) {
+      return java.util.Arrays.deepEquals(cpu, gpu);
+   }
+
+   /**
+    * Checks that the two-dimensional kernel output matches the CPU reference.
+    *
+    * @param cpu values produced by the CPU reference
+    * @param gpu values written by the kernel
+    * @return {@code true} only if both arrays have the same number of rows, every
+    *         pair of rows has the same length, and all elements are equal; shape
+    *         mismatches count as a failed check
+    */
+   private static boolean checkResults(byte[][] cpu, byte[][] gpu) {
+      return java.util.Arrays.deepEquals(cpu, gpu);
+   }
+
+   /**
+    * Checks that the two-dimensional kernel output matches the CPU reference.
+    *
+    * @param cpu values produced by the CPU reference
+    * @param gpu values written by the kernel
+    * @return {@code true} only if both arrays have the same number of rows, every
+    *         pair of rows has the same length, and all elements are equal; shape
+    *         mismatches count as a failed check
+    */
+   private static boolean checkResults(short[][] cpu, short[][] gpu) {
+      return java.util.Arrays.deepEquals(cpu, gpu);
+   }
+
+   /**
+    * Checks that the two-dimensional kernel output matches the CPU reference.
+    *
+    * @param cpu values produced by the CPU reference
+    * @param gpu values written by the kernel
+    * @return {@code true} only if both arrays have the same number of rows, every
+    *         pair of rows has the same length, and all elements are equal; shape
+    *         mismatches count as a failed check
+    */
+   private static boolean checkResults(int[][] cpu, int[][] gpu) {
+      return java.util.Arrays.deepEquals(cpu, gpu);
+   }
+
+   /**
+    * Checks that the two-dimensional kernel output matches the CPU reference.
+    *
+    * @param cpu values produced by the CPU reference
+    * @param gpu values written by the kernel
+    * @return {@code true} only if both arrays have the same number of rows, every
+    *         pair of rows has the same length, and all elements are equal; shape
+    *         mismatches count as a failed check
+    */
+   private static boolean checkResults(long[][] cpu, long[][] gpu) {
+      return java.util.Arrays.deepEquals(cpu, gpu);
+   }
+
+   /**
+    * Checks that the two-dimensional kernel output matches the CPU reference.
+    *
+    * @param cpu values produced by the CPU reference
+    * @param gpu values written by the kernel
+    * @return {@code true} only if both arrays have the same number of rows, every
+    *         pair of rows has the same length, and every pair of elements
+    *         compares equal with {@code ==}; shape mismatches count as a failed
+    *         check
+    */
+   private static boolean checkResults(float[][] cpu, float[][] gpu) {
+      if (cpu.length != gpu.length) {
+         return false;
+      }
+      for (int i = 0; i < cpu.length; i++) {
+         if (cpu[i].length != gpu[i].length) {
+            return false;
+         }
+         for (int j = 0; j < cpu[i].length; j++) {
+            // Primitive comparison on purpose: NaN never matches, 0.0f equals -0.0f.
+            if (cpu[i][j] != gpu[i][j]) {
+               return false;
+            }
+         }
+      }
+      return true;
+   }
+
+   /**
+    * Checks that the two-dimensional kernel output matches the CPU reference.
+    *
+    * @param cpu values produced by the CPU reference
+    * @param gpu values written by the kernel
+    * @return {@code true} only if both arrays have the same number of rows, every
+    *         pair of rows has the same length, and every pair of elements
+    *         compares equal with {@code ==}; shape mismatches count as a failed
+    *         check
+    */
+   private static boolean checkResults(double[][] cpu, double[][] gpu) {
+      if (cpu.length != gpu.length) {
+         return false;
+      }
+      for (int i = 0; i < cpu.length; i++) {
+         if (cpu[i].length != gpu[i].length) {
+            return false;
+         }
+         for (int j = 0; j < cpu[i].length; j++) {
+            // Primitive comparison on purpose: NaN never matches, 0.0 equals -0.0.
+            if (cpu[i][j] != gpu[i][j]) {
+               return false;
+            }
+         }
+      }
+      return true;
+   }
+
+   /**
+    * Checks that the three-dimensional kernel output matches the CPU reference.
+    *
+    * @param cpu values produced by the CPU reference
+    * @param gpu values written by the kernel
+    * @return {@code true} only if both arrays have the same shape at every nesting
+    *         level and all elements are equal; shape mismatches count as a failed
+    *         check
+    */
+   private static boolean checkResults(boolean[][][] cpu, boolean[][][] gpu) {
+      return java.util.Arrays.deepEquals(cpu, gpu);
+   }
+
+   /**
+    * Checks that the three-dimensional kernel output matches the CPU reference.
+    *
+    * @param cpu values produced by the CPU reference
+    * @param gpu values written by the kernel
+    * @return {@code true} only if both arrays have the same shape at every nesting
+    *         level and all elements are equal; shape mismatches count as a failed
+    *         check
+    */
+   private static boolean checkResults(byte[][][] cpu, byte[][][] gpu) {
+      return java.util.Arrays.deepEquals(cpu, gpu);
+   }
+
+   /**
+    * Checks that the three-dimensional kernel output matches the CPU reference.
+    *
+    * @param cpu values produced by the CPU reference
+    * @param gpu values written by the kernel
+    * @return {@code true} only if both arrays have the same shape at every nesting
+    *         level and all elements are equal; shape mismatches count as a failed
+    *         check
+    */
+   private static boolean checkResults(short[][][] cpu, short[][][] gpu) {
+      return java.util.Arrays.deepEquals(cpu, gpu);
+   }
+
+   /**
+    * Checks that the three-dimensional kernel output matches the CPU reference.
+    *
+    * @param cpu values produced by the CPU reference
+    * @param gpu values written by the kernel
+    * @return {@code true} only if both arrays have the same shape at every nesting
+    *         level and all elements are equal; shape mismatches count as a failed
+    *         check
+    */
+   private static boolean checkResults(int[][][] cpu, int[][][] gpu) {
+      return java.util.Arrays.deepEquals(cpu, gpu);
+   }
+
+   /**
+    * Checks that the three-dimensional kernel output matches the CPU reference.
+    *
+    * @param cpu values produced by the CPU reference
+    * @param gpu values written by the kernel
+    * @return {@code true} only if both arrays have the same shape at every nesting
+    *         level and all elements are equal; shape mismatches count as a failed
+    *         check
+    */
+   private static boolean checkResults(long[][][] cpu, long[][][] gpu) {
+      return java.util.Arrays.deepEquals(cpu, gpu);
+   }
+
+   /**
+    * Checks that the three-dimensional kernel output matches the CPU reference.
+    *
+    * @param cpu values produced by the CPU reference
+    * @param gpu values written by the kernel
+    * @return {@code true} only if both arrays have the same shape at every nesting
+    *         level and every pair of elements compares equal with {@code ==};
+    *         shape mismatches count as a failed check
+    */
+   private static boolean checkResults(float[][][] cpu, float[][][] gpu) {
+      if (cpu.length != gpu.length) {
+         return false;
+      }
+      for (int i = 0; i < cpu.length; i++) {
+         if (cpu[i].length != gpu[i].length) {
+            return false;
+         }
+         for (int j = 0; j < cpu[i].length; j++) {
+            if (cpu[i][j].length != gpu[i][j].length) {
+               return false;
+            }
+            for (int k = 0; k < cpu[i][j].length; k++) {
+               // Primitive comparison on purpose: NaN never matches, 0.0f equals -0.0f.
+               if (cpu[i][j][k] != gpu[i][j][k]) {
+                  return false;
+               }
+            }
+         }
+      }
+      return true;
+   }
+
+   /**
+    * Checks that the three-dimensional kernel output matches the CPU reference.
+    *
+    * @param cpu values produced by the CPU reference
+    * @param gpu values written by the kernel
+    * @return {@code true} only if both arrays have the same shape at every nesting
+    *         level and every pair of elements compares equal with {@code ==};
+    *         shape mismatches count as a failed check
+    */
+   private static boolean checkResults(double[][][] cpu, double[][][] gpu) {
+      if (cpu.length != gpu.length) {
+         return false;
+      }
+      for (int i = 0; i < cpu.length; i++) {
+         if (cpu[i].length != gpu[i].length) {
+            return false;
+         }
+         for (int j = 0; j < cpu[i].length; j++) {
+            if (cpu[i][j].length != gpu[i][j].length) {
+               return false;
+            }
+            for (int k = 0; k < cpu[i][j].length; k++) {
+               // Primitive comparison on purpose: NaN never matches, 0.0 equals -0.0.
+               if (cpu[i][j][k] != gpu[i][j][k]) {
+                  return false;
+               }
+            }
+         }
+      }
+      return true;
+   }
+
+   /**
+    * Times the {@code boolean} matrix product on flat, row-major arrays of
+    * {@code N * N} elements, first through the {@code ZMatMul1D} kernel and then
+    * through the CPU reference {@code matMull}, and prints both durations in
+    * milliseconds followed by whether the two results agree.
+    */
+   public static void Zrun1D() {
+      final boolean[] A = new boolean[N * N];
+      final boolean[] B = new boolean[N * N];
+      // Java default-initialises new arrays, so the output buffer needs no clearing.
+      final boolean[] gpu = new boolean[N * N];
+
+      for (int i = 0; i < N; i++) {
+         for (int j = 0; j < N; j++) {
+            A[(i * N) + j] = ((i % 2) == 0) ^ ((j % 2) == 0);
+            B[(i * N) + j] = ((i % 2) == 0) & ((j % 2) == 0);
+         }
+      }
+
+      // The timed interval spans kernel construction and execution.
+      long gs = System.currentTimeMillis();
+      final Kernel kernel = new ZMatMul1D(A, B, gpu, N);
+      try {
+         kernel.execute(N * N);
+         gs = System.currentTimeMillis() - gs;
+      } finally {
+         // Release the kernel's resources even if execute() throws.
+         kernel.dispose();
+      }
+
+      long cs = System.currentTimeMillis();
+      final boolean[] cpu = matMull(A, B, N);
+      cs = System.currentTimeMillis() - cs;
+
+      System.out.println("gpu time: " + gs + "\ncpu time: " + cs);
+      System.out.print("valid? ");
+      System.out.println(checkResults(cpu, gpu) ? "yes" : "no");
+   }
+
+   /**
+    * Times the {@code byte} matrix product on flat, row-major arrays of
+    * {@code N * N} elements, first through the {@code BMatMul1D} kernel and then
+    * through the CPU reference {@code matMull}, and prints both durations in
+    * milliseconds followed by whether the two results agree.
+    */
+   public static void Brun1D() {
+      final byte[] A = new byte[N * N];
+      final byte[] B = new byte[N * N];
+      // Java default-initialises new arrays, so the output buffer needs no clearing.
+      final byte[] gpu = new byte[N * N];
+
+      for (int i = 0; i < N; i++) {
+         for (int j = 0; j < N; j++) {
+            A[(i * N) + j] = (byte) (i + j);
+            B[(i * N) + j] = (byte) (i - j);
+         }
+      }
+
+      // The timed interval spans kernel construction and execution.
+      long gs = System.currentTimeMillis();
+      final Kernel kernel = new BMatMul1D(A, B, gpu, N);
+      try {
+         kernel.execute(N * N);
+         gs = System.currentTimeMillis() - gs;
+      } finally {
+         // Release the kernel's resources even if execute() throws.
+         kernel.dispose();
+      }
+
+      long cs = System.currentTimeMillis();
+      final byte[] cpu = matMull(A, B, N);
+      cs = System.currentTimeMillis() - cs;
+
+      System.out.println("gpu time: " + gs + "\ncpu time: " + cs);
+      System.out.print("valid? ");
+      System.out.println(checkResults(cpu, gpu) ? "yes" : "no");
+   }
+
+   /**
+    * Times the {@code short} matrix product on flat, row-major arrays of
+    * {@code N * N} elements, first through the {@code SMatMul1D} kernel and then
+    * through the CPU reference {@code matMull}, and prints both durations in
+    * milliseconds followed by whether the two results agree.
+    */
+   public static void Srun1D() {
+      final short[] A = new short[N * N];
+      final short[] B = new short[N * N];
+      // Java default-initialises new arrays, so the output buffer needs no clearing.
+      final short[] gpu = new short[N * N];
+
+      for (int i = 0; i < N; i++) {
+         for (int j = 0; j < N; j++) {
+            A[(i * N) + j] = (short) (i + j);
+            B[(i * N) + j] = (short) (i - j);
+         }
+      }
+
+      // The timed interval spans kernel construction and execution.
+      long gs = System.currentTimeMillis();
+      final Kernel kernel = new SMatMul1D(A, B, gpu, N);
+      try {
+         kernel.execute(N * N);
+         gs = System.currentTimeMillis() - gs;
+      } finally {
+         // Release the kernel's resources even if execute() throws.
+         kernel.dispose();
+      }
+
+      long cs = System.currentTimeMillis();
+      final short[] cpu = matMull(A, B, N);
+      cs = System.currentTimeMillis() - cs;
+
+      System.out.println("gpu time: " + gs + "\ncpu time: " + cs);
+      System.out.print("valid? ");
+      System.out.println(checkResults(cpu, gpu) ? "yes" : "no");
+   }
+
+   /**
+    * Times the {@code int} matrix product on flat, row-major arrays of
+    * {@code N * N} elements, first through the {@code IMatMul1D} kernel and then
+    * through the CPU reference {@code matMull}, and prints both durations in
+    * milliseconds followed by whether the two results agree.
+    */
+   public static void Irun1D() {
+      final int[] A = new int[N * N];
+      final int[] B = new int[N * N];
+      // Java default-initialises new arrays, so the output buffer needs no clearing.
+      final int[] gpu = new int[N * N];
+
+      for (int i = 0; i < N; i++) {
+         for (int j = 0; j < N; j++) {
+            A[(i * N) + j] = i + j;
+            B[(i * N) + j] = i - j;
+         }
+      }
+
+      // The timed interval spans kernel construction and execution.
+      long gs = System.currentTimeMillis();
+      final Kernel kernel = new IMatMul1D(A, B, gpu, N);
+      try {
+         kernel.execute(N * N);
+         gs = System.currentTimeMillis() - gs;
+      } finally {
+         // Release the kernel's resources even if execute() throws.
+         kernel.dispose();
+      }
+
+      long cs = System.currentTimeMillis();
+      final int[] cpu = matMull(A, B, N);
+      cs = System.currentTimeMillis() - cs;
+
+      System.out.println("gpu time: " + gs + "\ncpu time: " + cs);
+      System.out.print("valid? ");
+      System.out.println(checkResults(cpu, gpu) ? "yes" : "no");
+   }
+
+   /**
+    * Times the {@code long} matrix product on flat, row-major arrays of
+    * {@code N * N} elements, first through the {@code LMatMul1D} kernel and then
+    * through the CPU reference {@code matMull}, and prints both durations in
+    * milliseconds followed by whether the two results agree.
+    */
+   public static void Lrun1D() {
+      final long[] A = new long[N * N];
+      final long[] B = new long[N * N];
+      // Java default-initialises new arrays, so the output buffer needs no clearing.
+      final long[] gpu = new long[N * N];
+
+      for (int i = 0; i < N; i++) {
+         for (int j = 0; j < N; j++) {
+            A[(i * N) + j] = i + j;
+            B[(i * N) + j] = i - j;
+         }
+      }
+
+      // The timed interval spans kernel construction and execution.
+      long gs = System.currentTimeMillis();
+      final Kernel kernel = new LMatMul1D(A, B, gpu, N);
+      try {
+         kernel.execute(N * N);
+         gs = System.currentTimeMillis() - gs;
+      } finally {
+         // Release the kernel's resources even if execute() throws.
+         kernel.dispose();
+      }
+
+      long cs = System.currentTimeMillis();
+      final long[] cpu = matMull(A, B, N);
+      cs = System.currentTimeMillis() - cs;
+
+      System.out.println("gpu time: " + gs + "\ncpu time: " + cs);
+      System.out.print("valid? ");
+      System.out.println(checkResults(cpu, gpu) ? "yes" : "no");
+   }
+
+   /**
+    * Times the {@code float} matrix product on flat, row-major arrays of
+    * {@code N * N} elements, first through the {@code FMatMul1D} kernel and then
+    * through the CPU reference {@code matMull}, and prints both durations in
+    * milliseconds followed by whether the two results agree.
+    */
+   public static void Frun1D() {
+      final float[] A = new float[N * N];
+      final float[] B = new float[N * N];
+      // Java default-initialises new arrays, so the output buffer needs no clearing.
+      final float[] gpu = new float[N * N];
+
+      for (int i = 0; i < N; i++) {
+         for (int j = 0; j < N; j++) {
+            A[(i * N) + j] = i + j;
+            B[(i * N) + j] = i - j;
+         }
+      }
+
+      // The timed interval spans kernel construction and execution.
+      long gs = System.currentTimeMillis();
+      final Kernel kernel = new FMatMul1D(A, B, gpu, N);
+      try {
+         kernel.execute(N * N);
+         gs = System.currentTimeMillis() - gs;
+      } finally {
+         // Release the kernel's resources even if execute() throws.
+         kernel.dispose();
+      }
+
+      long cs = System.currentTimeMillis();
+      final float[] cpu = matMull(A, B, N);
+      cs = System.currentTimeMillis() - cs;
+
+      System.out.println("gpu time: " + gs + "\ncpu time: " + cs);
+      System.out.print("valid? ");
+      System.out.println(checkResults(cpu, gpu) ? "yes" : "no");
+   }
+
+   /**
+    * Times the {@code double} matrix product on flat, row-major arrays of
+    * {@code N * N} elements, first through the {@code DMatMul1D} kernel and then
+    * through the CPU reference {@code matMull}, and prints both durations in
+    * milliseconds followed by whether the two results agree.
+    */
+   public static void Drun1D() {
+      final double[] A = new double[N * N];
+      final double[] B = new double[N * N];
+      // Java default-initialises new arrays, so the output buffer needs no clearing.
+      final double[] gpu = new double[N * N];
+
+      for (int i = 0; i < N; i++) {
+         for (int j = 0; j < N; j++) {
+            A[(i * N) + j] = i + j;
+            B[(i * N) + j] = i - j;
+         }
+      }
+
+      // The timed interval spans kernel construction and execution.
+      long gs = System.currentTimeMillis();
+      final Kernel kernel = new DMatMul1D(A, B, gpu, N);
+      try {
+         kernel.execute(N * N);
+         gs = System.currentTimeMillis() - gs;
+      } finally {
+         // Release the kernel's resources even if execute() throws.
+         kernel.dispose();
+      }
+
+      long cs = System.currentTimeMillis();
+      final double[] cpu = matMull(A, B, N);
+      cs = System.currentTimeMillis() - cs;
+
+      System.out.println("gpu time: " + gs + "\ncpu time: " + cs);
+      System.out.print("valid? ");
+      System.out.println(checkResults(cpu, gpu) ? "yes" : "no");
+   }
+
+   /**
+    * Times the {@code boolean} matrix product on {@code N x N} two-dimensional
+    * arrays, first through the {@code ZMatMul2D} kernel and then through the CPU
+    * reference {@code matMull}, and prints both durations in milliseconds
+    * followed by whether the two results agree.
+    */
+   public static void Zrun2D() {
+      final boolean[][] A = new boolean[N][N];
+      final boolean[][] B = new boolean[N][N];
+      // Java default-initialises new arrays, so the output buffer needs no clearing.
+      final boolean[][] gpu = new boolean[N][N];
+
+      for (int i = 0; i < N; i++) {
+         for (int j = 0; j < N; j++) {
+            A[i][j] = ((i % 2) == 0) ^ ((j % 2) == 0);
+            B[i][j] = ((i % 2) == 0) & ((j % 2) == 0);
+         }
+      }
+
+      // The timed interval spans kernel construction and execution.
+      long gs = System.currentTimeMillis();
+      final Kernel kernel = new ZMatMul2D(A, B, gpu, N);
+      try {
+         kernel.execute(N * N);
+         gs = System.currentTimeMillis() - gs;
+      } finally {
+         // Release the kernel's resources even if execute() throws.
+         kernel.dispose();
+      }
+
+      long cs = System.currentTimeMillis();
+      final boolean[][] cpu = matMull(A, B, N);
+      cs = System.currentTimeMillis() - cs;
+
+      System.out.println("gpu time: " + gs + "\ncpu time: " + cs);
+      System.out.print("valid? ");
+      System.out.println(checkResults(cpu, gpu) ? "yes" : "no");
+   }
+
+   /**
+    * Times the {@code byte} matrix product on {@code N x N} two-dimensional
+    * arrays, first through the {@code BMatMul2D} kernel and then through the CPU
+    * reference {@code matMull}, and prints both durations in milliseconds
+    * followed by whether the two results agree.
+    */
+   public static void Brun2D() {
+      final byte[][] A = new byte[N][N];
+      final byte[][] B = new byte[N][N];
+      // Java default-initialises new arrays, so the output buffer needs no clearing.
+      final byte[][] gpu = new byte[N][N];
+
+      for (int i = 0; i < N; i++) {
+         for (int j = 0; j < N; j++) {
+            A[i][j] = (byte) (i + j);
+            B[i][j] = (byte) (i - j);
+         }
+      }
+
+      // The timed interval spans kernel construction and execution.
+      long gs = System.currentTimeMillis();
+      final Kernel kernel = new BMatMul2D(A, B, gpu, N);
+      try {
+         kernel.execute(N * N);
+         gs = System.currentTimeMillis() - gs;
+      } finally {
+         // Release the kernel's resources even if execute() throws.
+         kernel.dispose();
+      }
+
+      long cs = System.currentTimeMillis();
+      final byte[][] cpu = matMull(A, B, N);
+      cs = System.currentTimeMillis() - cs;
+
+      System.out.println("gpu time: " + gs + "\ncpu time: " + cs);
+      System.out.print("valid? ");
+      System.out.println(checkResults(cpu, gpu) ? "yes" : "no");
+   }
+
+   /**
+    * Times the {@code short} matrix product on {@code N x N} two-dimensional
+    * arrays, first through the {@code SMatMul2D} kernel and then through the CPU
+    * reference {@code matMull}, and prints both durations in milliseconds
+    * followed by whether the two results agree.
+    */
+   public static void Srun2D() {
+      final short[][] A = new short[N][N];
+      final short[][] B = new short[N][N];
+      // Java default-initialises new arrays, so the output buffer needs no clearing.
+      final short[][] gpu = new short[N][N];
+
+      for (int i = 0; i < N; i++) {
+         for (int j = 0; j < N; j++) {
+            A[i][j] = (short) (i + j);
+            B[i][j] = (short) (i - j);
+         }
+      }
+
+      // The timed interval spans kernel construction and execution.
+      long gs = System.currentTimeMillis();
+      final Kernel kernel = new SMatMul2D(A, B, gpu, N);
+      try {
+         kernel.execute(N * N);
+         gs = System.currentTimeMillis() - gs;
+      } finally {
+         // Release the kernel's resources even if execute() throws.
+         kernel.dispose();
+      }
+
+      long cs = System.currentTimeMillis();
+      final short[][] cpu = matMull(A, B, N);
+      cs = System.currentTimeMillis() - cs;
+
+      System.out.println("gpu time: " + gs + "\ncpu time: " + cs);
+      System.out.print("valid? ");
+      System.out.println(checkResults(cpu, gpu) ? "yes" : "no");
+   }
+
+   /**
+    * Times the {@code int} matrix product on {@code N x N} two-dimensional
+    * arrays, first through the {@code IMatMul2D} kernel and then through the CPU
+    * reference {@code matMull}, and prints both durations in milliseconds
+    * followed by whether the two results agree.
+    */
+   public static void Irun2D() {
+      final int[][] A = new int[N][N];
+      final int[][] B = new int[N][N];
+      // Java default-initialises new arrays, so the output buffer needs no clearing.
+      final int[][] gpu = new int[N][N];
+
+      for (int i = 0; i < N; i++) {
+         for (int j = 0; j < N; j++) {
+            A[i][j] = i + j;
+            B[i][j] = i - j;
+         }
+      }
+
+      // The timed interval spans kernel construction and execution.
+      long gs = System.currentTimeMillis();
+      final Kernel kernel = new IMatMul2D(A, B, gpu, N);
+      try {
+         kernel.execute(N * N);
+         gs = System.currentTimeMillis() - gs;
+      } finally {
+         // Release the kernel's resources even if execute() throws.
+         kernel.dispose();
+      }
+
+      long cs = System.currentTimeMillis();
+      final int[][] cpu = matMull(A, B, N);
+      cs = System.currentTimeMillis() - cs;
+
+      System.out.println("gpu time: " + gs + "\ncpu time: " + cs);
+      System.out.print("valid? ");
+      System.out.println(checkResults(cpu, gpu) ? "yes" : "no");
+   }
+
+   /**
+    * Times the {@code long} matrix product on {@code N x N} two-dimensional
+    * arrays, first through the {@code LMatMul2D} kernel and then through the CPU
+    * reference {@code matMull}, and prints both durations in milliseconds
+    * followed by whether the two results agree.
+    */
+   public static void Lrun2D() {
+      final long[][] A = new long[N][N];
+      final long[][] B = new long[N][N];
+      // Java default-initialises new arrays, so the output buffer needs no clearing.
+      final long[][] gpu = new long[N][N];
+
+      for (int i = 0; i < N; i++) {
+         for (int j = 0; j < N; j++) {
+            A[i][j] = i + j;
+            B[i][j] = i - j;
+         }
+      }
+
+      // The timed interval spans kernel construction and execution.
+      long gs = System.currentTimeMillis();
+      final Kernel kernel = new LMatMul2D(A, B, gpu, N);
+      try {
+         kernel.execute(N * N);
+         gs = System.currentTimeMillis() - gs;
+      } finally {
+         // Release the kernel's resources even if execute() throws.
+         kernel.dispose();
+      }
+
+      long cs = System.currentTimeMillis();
+      final long[][] cpu = matMull(A, B, N);
+      cs = System.currentTimeMillis() - cs;
+
+      System.out.println("gpu time: " + gs + "\ncpu time: " + cs);
+      System.out.print("valid? ");
+      System.out.println(checkResults(cpu, gpu) ? "yes" : "no");
+   }
+
+   /**
+    * Times the {@code float} matrix product on {@code N x N} two-dimensional
+    * arrays, first through the {@code FMatMul2D} kernel and then through the CPU
+    * reference {@code matMull}, and prints both durations in milliseconds
+    * followed by whether the two results agree.
+    */
+   public static void Frun2D() {
+      final float[][] A = new float[N][N];
+      final float[][] B = new float[N][N];
+      // Java default-initialises new arrays, so the output buffer needs no clearing.
+      final float[][] gpu = new float[N][N];
+
+      for (int i = 0; i < N; i++) {
+         for (int j = 0; j < N; j++) {
+            A[i][j] = i + j;
+            B[i][j] = i - j;
+         }
+      }
+
+      // The timed interval spans kernel construction and execution.
+      long gs = System.currentTimeMillis();
+      final Kernel kernel = new FMatMul2D(A, B, gpu, N);
+      try {
+         kernel.execute(N * N);
+         gs = System.currentTimeMillis() - gs;
+      } finally {
+         // Release the kernel's resources even if execute() throws.
+         kernel.dispose();
+      }
+
+      long cs = System.currentTimeMillis();
+      final float[][] cpu = matMull(A, B, N);
+      cs = System.currentTimeMillis() - cs;
+
+      System.out.println("gpu time: " + gs + "\ncpu time: " + cs);
+      System.out.print("valid? ");
+      System.out.println(checkResults(cpu, gpu) ? "yes" : "no");
+   }
+
+   /**
+    * Times the {@code double} matrix product on {@code N x N} two-dimensional
+    * arrays, first through the {@code DMatMul2D} kernel and then through the CPU
+    * reference {@code matMull}, and prints both durations in milliseconds
+    * followed by whether the two results agree.
+    */
+   public static void Drun2D() {
+      final double[][] A = new double[N][N];
+      final double[][] B = new double[N][N];
+      // Java default-initialises new arrays, so the output buffer needs no clearing.
+      final double[][] gpu = new double[N][N];
+
+      for (int i = 0; i < N; i++) {
+         for (int j = 0; j < N; j++) {
+            A[i][j] = i + j;
+            B[i][j] = i - j;
+         }
+      }
+
+      // The timed interval spans kernel construction and execution.
+      long gs = System.currentTimeMillis();
+      final Kernel kernel = new DMatMul2D(A, B, gpu, N);
+      try {
+         kernel.execute(N * N);
+         gs = System.currentTimeMillis() - gs;
+      } finally {
+         // Release the kernel's resources even if execute() throws.
+         kernel.dispose();
+      }
+
+      long cs = System.currentTimeMillis();
+      final double[][] cpu = matMull(A, B, N);
+      cs = System.currentTimeMillis() - cs;
+
+      System.out.println("gpu time: " + gs + "\ncpu time: " + cs);
+      System.out.print("valid? ");
+      System.out.println(checkResults(cpu, gpu) ? "yes" : "no");
+   }
+
+   /**
+    * Times the sample's {@code boolean} 3D product on {@code M x M x M} arrays,
+    * first through the {@code ZMatMul3D} kernel and then through the CPU
+    * reference {@code matMull}, and prints both durations in milliseconds
+    * followed by whether the two results agree.
+    */
+   public static void Zrun3D() {
+      final boolean[][][] A = new boolean[M][M][M];
+      final boolean[][][] B = new boolean[M][M][M];
+      // Java default-initialises new arrays, so the output buffer needs no clearing.
+      final boolean[][][] gpu = new boolean[M][M][M];
+
+      for (int i = 0; i < M; i++) {
+         for (int j = 0; j < M; j++) {
+            for (int k = 0; k < M; k++) {
+               A[i][j][k] = ((i % 2) == 0) ^ (((j % 2) == 0) & ((k % 2) == 0));
+               B[i][j][k] = (((i % 2) == 0) & ((j % 2) == 0)) ^ ((k % 2) == 0);
+            }
+         }
+      }
+
+      // The timed interval spans kernel construction and execution.
+      long gs = System.currentTimeMillis();
+      final Kernel kernel = new ZMatMul3D(A, B, gpu, M);
+      try {
+         kernel.execute(M * M * M);
+         gs = System.currentTimeMillis() - gs;
+      } finally {
+         // Release the kernel's resources even if execute() throws.
+         kernel.dispose();
+      }
+
+      long cs = System.currentTimeMillis();
+      final boolean[][][] cpu = matMull(A, B, M);
+      cs = System.currentTimeMillis() - cs;
+
+      System.out.println("gpu time: " + gs + "\ncpu time: " + cs);
+      System.out.print("valid? ");
+      System.out.println(checkResults(cpu, gpu) ? "yes" : "no");
+   }
+
+   /**
+    * Times the sample's {@code byte} 3D product on {@code M x M x M} arrays,
+    * first through the {@code BMatMul3D} kernel and then through the CPU
+    * reference {@code matMull}, and prints both durations in milliseconds
+    * followed by whether the two results agree.
+    */
+   public static void Brun3D() {
+      final byte[][][] A = new byte[M][M][M];
+      final byte[][][] B = new byte[M][M][M];
+      // Java default-initialises new arrays, so the output buffer needs no clearing.
+      final byte[][][] gpu = new byte[M][M][M];
+
+      for (int i = 0; i < M; i++) {
+         for (int j = 0; j < M; j++) {
+            for (int k = 0; k < M; k++) {
+               A[i][j][k] = (byte) (i + j + k);
+               B[i][j][k] = (byte) ((i - j) + k);
+            }
+         }
+      }
+
+      // The timed interval spans kernel construction and execution.
+      long gs = System.currentTimeMillis();
+      final Kernel kernel = new BMatMul3D(A, B, gpu, M);
+      try {
+         kernel.execute(M * M * M);
+         gs = System.currentTimeMillis() - gs;
+      } finally {
+         // Release the kernel's resources even if execute() throws.
+         kernel.dispose();
+      }
+
+      long cs = System.currentTimeMillis();
+      final byte[][][] cpu = matMull(A, B, M);
+      cs = System.currentTimeMillis() - cs;
+
+      System.out.println("gpu time: " + gs + "\ncpu time: " + cs);
+      System.out.print("valid? ");
+      System.out.println(checkResults(cpu, gpu) ? "yes" : "no");
+   }
+
+   /**
+    * Times the sample's {@code short} 3D product on {@code M x M x M} arrays,
+    * first through the {@code SMatMul3D} kernel and then through the CPU
+    * reference {@code matMull}, and prints both durations in milliseconds
+    * followed by whether the two results agree.
+    */
+   public static void Srun3D() {
+      final short[][][] A = new short[M][M][M];
+      final short[][][] B = new short[M][M][M];
+      // Java default-initialises new arrays, so the output buffer needs no clearing.
+      final short[][][] gpu = new short[M][M][M];
+
+      for (int i = 0; i < M; i++) {
+         for (int j = 0; j < M; j++) {
+            for (int k = 0; k < M; k++) {
+               A[i][j][k] = (short) (i + j + k);
+               B[i][j][k] = (short) ((i - j) + k);
+            }
+         }
+      }
+
+      // The timed interval spans kernel construction and execution.
+      long gs = System.currentTimeMillis();
+      final Kernel kernel = new SMatMul3D(A, B, gpu, M);
+      try {
+         kernel.execute(M * M * M);
+         gs = System.currentTimeMillis() - gs;
+      } finally {
+         // Release the kernel's resources even if execute() throws.
+         kernel.dispose();
+      }
+
+      long cs = System.currentTimeMillis();
+      final short[][][] cpu = matMull(A, B, M);
+      cs = System.currentTimeMillis() - cs;
+
+      System.out.println("gpu time: " + gs + "\ncpu time: " + cs);
+      System.out.print("valid? ");
+      System.out.println(checkResults(cpu, gpu) ? "yes" : "no");
+   }
+
+   /**
+    * Times the sample's {@code int} 3D product on {@code M x M x M} arrays,
+    * first through the {@code IMatMul3D} kernel and then through the CPU
+    * reference {@code matMull}, and prints both durations in milliseconds
+    * followed by whether the two results agree.
+    */
+   public static void Irun3D() {
+      final int[][][] A = new int[M][M][M];
+      final int[][][] B = new int[M][M][M];
+      // Java default-initialises new arrays, so the output buffer needs no clearing.
+      final int[][][] gpu = new int[M][M][M];
+
+      for (int i = 0; i < M; i++) {
+         for (int j = 0; j < M; j++) {
+            for (int k = 0; k < M; k++) {
+               A[i][j][k] = i + j + k;
+               B[i][j][k] = (i - j) + k;
+            }
+         }
+      }
+
+      // The timed interval spans kernel construction and execution.
+      long gs = System.currentTimeMillis();
+      final Kernel kernel = new IMatMul3D(A, B, gpu, M);
+      try {
+         kernel.execute(M * M * M);
+         gs = System.currentTimeMillis() - gs;
+      } finally {
+         // Release the kernel's resources even if execute() throws.
+         kernel.dispose();
+      }
+
+      long cs = System.currentTimeMillis();
+      final int[][][] cpu = matMull(A, B, M);
+      cs = System.currentTimeMillis() - cs;
+
+      System.out.println("gpu time: " + gs + "\ncpu time: " + cs);
+      System.out.print("valid? ");
+      System.out.println(checkResults(cpu, gpu) ? "yes" : "no");
+   }
+
+   /**
+    * Times the sample's {@code long} 3D product on {@code M x M x M} arrays,
+    * first through the {@code LMatMul3D} kernel and then through the CPU
+    * reference {@code matMull}, and prints both durations in milliseconds
+    * followed by whether the two results agree.
+    */
+   public static void Lrun3D() {
+      final long[][][] A = new long[M][M][M];
+      final long[][][] B = new long[M][M][M];
+      // Java default-initialises new arrays, so the output buffer needs no clearing.
+      final long[][][] gpu = new long[M][M][M];
+
+      for (int i = 0; i < M; i++) {
+         for (int j = 0; j < M; j++) {
+            for (int k = 0; k < M; k++) {
+               A[i][j][k] = i + j + k;
+               B[i][j][k] = (i - j) + k;
+            }
+         }
+      }
+
+      // The timed interval spans kernel construction and execution.
+      long gs = System.currentTimeMillis();
+      final Kernel kernel = new LMatMul3D(A, B, gpu, M);
+      try {
+         kernel.execute(M * M * M);
+         gs = System.currentTimeMillis() - gs;
+      } finally {
+         // Release the kernel's resources even if execute() throws.
+         kernel.dispose();
+      }
+
+      long cs = System.currentTimeMillis();
+      final long[][][] cpu = matMull(A, B, M);
+      cs = System.currentTimeMillis() - cs;
+
+      System.out.println("gpu time: " + gs + "\ncpu time: " + cs);
+      System.out.print("valid? ");
+      System.out.println(checkResults(cpu, gpu) ? "yes" : "no");
+   }
+
+   /**
+    * Times the sample's {@code float} 3D product on {@code M x M x M} arrays,
+    * first through the {@code FMatMul3D} kernel and then through the CPU
+    * reference {@code matMull}, and prints both durations in milliseconds
+    * followed by whether the two results agree.
+    */
+   public static void Frun3D() {
+      final float[][][] A = new float[M][M][M];
+      final float[][][] B = new float[M][M][M];
+      // Java default-initialises new arrays, so the output buffer needs no clearing.
+      final float[][][] gpu = new float[M][M][M];
+
+      for (int i = 0; i < M; i++) {
+         for (int j = 0; j < M; j++) {
+            for (int k = 0; k < M; k++) {
+               A[i][j][k] = i + j + k;
+               B[i][j][k] = (i - j) + k;
+            }
+         }
+      }
+
+      // The timed interval spans kernel construction and execution.
+      long gs = System.currentTimeMillis();
+      final Kernel kernel = new FMatMul3D(A, B, gpu, M);
+      try {
+         kernel.execute(M * M * M);
+         gs = System.currentTimeMillis() - gs;
+      } finally {
+         // Release the kernel's resources even if execute() throws.
+         kernel.dispose();
+      }
+
+      long cs = System.currentTimeMillis();
+      final float[][][] cpu = matMull(A, B, M);
+      cs = System.currentTimeMillis() - cs;
+
+      System.out.println("gpu time: " + gs + "\ncpu time: " + cs);
+      System.out.print("valid? ");
+      System.out.println(checkResults(cpu, gpu) ? "yes" : "no");
+   }
+
+   /**
+    * Times the sample's {@code double} 3D product on {@code M x M x M} arrays,
+    * first through the {@code DMatMul3D} kernel and then through the CPU
+    * reference {@code matMull}, and prints both durations in milliseconds
+    * followed by whether the two results agree.
+    */
+   public static void Drun3D() {
+      final double[][][] A = new double[M][M][M];
+      final double[][][] B = new double[M][M][M];
+      // Java default-initialises new arrays, so the output buffer needs no clearing.
+      final double[][][] gpu = new double[M][M][M];
+
+      for (int i = 0; i < M; i++) {
+         for (int j = 0; j < M; j++) {
+            for (int k = 0; k < M; k++) {
+               A[i][j][k] = i + j + k;
+               B[i][j][k] = (i - j) + k;
+            }
+         }
+      }
+
+      // The timed interval spans kernel construction and execution.
+      long gs = System.currentTimeMillis();
+      final Kernel kernel = new DMatMul3D(A, B, gpu, M);
+      try {
+         kernel.execute(M * M * M);
+         gs = System.currentTimeMillis() - gs;
+      } finally {
+         // Release the kernel's resources even if execute() throws.
+         kernel.dispose();
+      }
+
+      long cs = System.currentTimeMillis();
+      final double[][][] cpu = matMull(A, B, M);
+      cs = System.currentTimeMillis() - cs;
+
+      System.out.println("gpu time: " + gs + "\ncpu time: " + cs);
+      System.out.print("valid? ");
+      System.out.println(checkResults(cpu, gpu) ? "yes" : "no");
+   }
+}
diff --git a/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/SMatMul1D.java b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/SMatMul1D.java
new file mode 100644
index 0000000000000000000000000000000000000000..858be09e8be693420c75857261f93aeafd32cc08
--- /dev/null
+++ b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/SMatMul1D.java
@@ -0,0 +1,28 @@
+package gov.pnnl.aparapi.sample.mdarray;
+import com.amd.aparapi.Kernel;
+
+/**
+ * Aparapi kernel for the {@code short} matrix product on flat, row-major
+ * arrays. Work item {@code id} owns cell {@code (id / N, id % N)} of {@code C}
+ * and adds to it the products of that row of {@code A} with that column of
+ * {@code B}.
+ */
+class SMatMul1D extends Kernel {
+   /** Left operand, addressed as {@code A[row * N + k]}. */
+   short[] A;
+
+   /** Right operand, addressed as {@code B[k * N + col]}. */
+   short[] B;
+
+   /** Result buffer; accumulated into, so its previous contents are kept. */
+   short[] C;
+
+   /** Row and column count used for indexing. */
+   int N;
+
+   public SMatMul1D(short[] A, short[] B, short[] C, int N) {
+      this.N = N;
+      this.C = C;
+      this.B = B;
+      this.A = A;
+   }
+
+   /** Accumulates one cell of {@code C}; every product is narrowed to {@code short} before it is added. */
+   @Override public void run() {
+      final int gid = getGlobalId();
+      final int row = gid / N;
+      final int col = gid % N;
+      final int rowBase = row * N;
+      final int cell = rowBase + col;
+      for (int term = 0; term < N; term++) {
+         C[cell] += (short) (A[rowBase + term] * B[(term * N) + col]);
+      }
+   }
+}
diff --git a/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/SMatMul2D.java b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/SMatMul2D.java
new file mode 100644
index 0000000000000000000000000000000000000000..2ed216e523459052bd785b4e3e56f1a326e1afc8
--- /dev/null
+++ b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/SMatMul2D.java
@@ -0,0 +1,28 @@
+package gov.pnnl.aparapi.sample.mdarray;
+import com.amd.aparapi.Kernel;
+
+/**
+ * Aparapi kernel for the {@code short} matrix product on two-dimensional
+ * arrays. Work item {@code id} owns {@code C[id / N][id % N]} and adds to it
+ * the products of that row of {@code A} with that column of {@code B}.
+ */
+class SMatMul2D extends Kernel {
+   /** Left operand, read as {@code A[row][k]}. */
+   short[][] A;
+
+   /** Right operand, read as {@code B[k][col]}. */
+   short[][] B;
+
+   /** Result buffer; accumulated into, so its previous contents are kept. */
+   short[][] C;
+
+   /** Row and column count used for indexing. */
+   int N;
+
+   public SMatMul2D(short[][] A, short[][] B, short[][] C, int N) {
+      this.N = N;
+      this.C = C;
+      this.B = B;
+      this.A = A;
+   }
+
+   /** Accumulates one cell of {@code C}; every product is narrowed to {@code short} before it is added. */
+   @Override public void run() {
+      final int gid = getGlobalId();
+      final int row = gid / N;
+      final int col = gid % N;
+      for (int term = 0; term < N; term++) {
+         C[row][col] += (short) (A[row][term] * B[term][col]);
+      }
+   }
+}
diff --git a/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/SMatMul3D.java b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/SMatMul3D.java
new file mode 100644
index 0000000000000000000000000000000000000000..95de3a37d33f7db8a6c0dfd307da6907c0f9309d
--- /dev/null
+++ b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/SMatMul3D.java
@@ -0,0 +1,29 @@
+package gov.pnnl.aparapi.sample.mdarray;
+import com.amd.aparapi.Kernel;
+
+/**
+ * Aparapi kernel for the sample's {@code short} 3D product. Work item
+ * {@code id} owns {@code C[id / (N * N)][(id / N) % N][id % N]} and adds
+ * {@code A[i][j][l] * B[l][j][k]} to it for every {@code l} in {@code [0, N)}.
+ */
+class SMatMul3D extends Kernel {
+   /** Left operand, read as {@code A[i][j][l]}. */
+   short[][][] A;
+
+   /** Right operand, read as {@code B[l][j][k]}. */
+   short[][][] B;
+
+   /** Result buffer; accumulated into, so its previous contents are kept. */
+   short[][][] C;
+
+   /** Extent used for every dimension when indexing. */
+   int N;
+
+   public SMatMul3D(short[][][] A, short[][][] B, short[][][] C, int N) {
+      this.N = N;
+      this.C = C;
+      this.B = B;
+      this.A = A;
+   }
+
+   /** Accumulates one cell of {@code C}; every product is narrowed to {@code short} before it is added. */
+   @Override public void run() {
+      final int gid = getGlobalId();
+      final int plane = gid / (N * N);
+      final int row = (gid / N) % N;
+      final int col = gid % N;
+      for (int term = 0; term < N; term++) {
+         C[plane][row][col] += (short) (A[plane][row][term] * B[term][row][col]);
+      }
+   }
+}
diff --git a/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/ZMatMul1D.java b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/ZMatMul1D.java
new file mode 100644
index 0000000000000000000000000000000000000000..f28db2f6b2389e83d6cf1cbf4bfe0b20ce4790ea
--- /dev/null
+++ b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/ZMatMul1D.java
@@ -0,0 +1,28 @@
+package gov.pnnl.aparapi.sample.mdarray;
+import com.amd.aparapi.Kernel;
+
+/**
+ * Aparapi kernel for the {@code boolean} matrix product on flat, row-major
+ * arrays, with AND in place of multiplication and XOR in place of addition.
+ * Work item {@code id} owns cell {@code (id / N, id % N)} of {@code C}.
+ */
+class ZMatMul1D extends Kernel {
+   /** Left operand, addressed as {@code A[row * N + k]}. */
+   boolean[] A;
+
+   /** Right operand, addressed as {@code B[k * N + col]}. */
+   boolean[] B;
+
+   /** Result buffer; XOR-accumulated into, so its previous contents are kept. */
+   boolean[] C;
+
+   /** Row and column count used for indexing. */
+   int N;
+
+   public ZMatMul1D(boolean[] A, boolean[] B, boolean[] C, int N) {
+      this.N = N;
+      this.C = C;
+      this.B = B;
+      this.A = A;
+   }
+
+   /** XORs into one cell of {@code C} the AND of each row/column element pair. */
+   @Override public void run() {
+      final int gid = getGlobalId();
+      final int row = gid / N;
+      final int col = gid % N;
+      final int rowBase = row * N;
+      final int cell = rowBase + col;
+      for (int term = 0; term < N; term++) {
+         C[cell] ^= A[rowBase + term] & B[(term * N) + col];
+      }
+   }
+}
diff --git a/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/ZMatMul2D.java b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/ZMatMul2D.java
new file mode 100644
index 0000000000000000000000000000000000000000..75304d54cbc24cf0e090abfc03916e76e7eecb23
--- /dev/null
+++ b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/ZMatMul2D.java
@@ -0,0 +1,28 @@
+package gov.pnnl.aparapi.sample.mdarray;
+import com.amd.aparapi.Kernel;
+
+/**
+ * Aparapi kernel for the {@code boolean} matrix product on two-dimensional
+ * arrays, with AND in place of multiplication and XOR in place of addition.
+ * Work item {@code id} owns {@code C[id / N][id % N]}.
+ */
+class ZMatMul2D extends Kernel {
+   /** Left operand, read as {@code A[row][k]}. */
+   boolean[][] A;
+
+   /** Right operand, read as {@code B[k][col]}. */
+   boolean[][] B;
+
+   /** Result buffer; XOR-accumulated into, so its previous contents are kept. */
+   boolean[][] C;
+
+   /** Row and column count used for indexing. */
+   int N;
+
+   public ZMatMul2D(boolean[][] A, boolean[][] B, boolean[][] C, int N) {
+      this.N = N;
+      this.C = C;
+      this.B = B;
+      this.A = A;
+   }
+
+   /** XORs into one cell of {@code C} the AND of each row/column element pair. */
+   @Override public void run() {
+      final int gid = getGlobalId();
+      final int row = gid / N;
+      final int col = gid % N;
+      for (int term = 0; term < N; term++) {
+         C[row][col] ^= A[row][term] & B[term][col];
+      }
+   }
+}
diff --git a/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/ZMatMul3D.java b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/ZMatMul3D.java
new file mode 100644
index 0000000000000000000000000000000000000000..6d63dfb6ffb1dbeabc7fb63513ecdbd8479cd8d8
--- /dev/null
+++ b/samples/mdarray/src/gov/pnnl/aparapi/sample/mdarray/ZMatMul3D.java
@@ -0,0 +1,29 @@
+package gov.pnnl.aparapi.sample.mdarray;
+import com.amd.aparapi.Kernel;
+
+/**
+ * Aparapi kernel for the sample's {@code boolean} 3D product, with AND in place
+ * of multiplication and XOR in place of addition. Work item {@code id} owns
+ * {@code C[id / (N * N)][(id / N) % N][id % N]} and folds in
+ * {@code A[i][j][l] & B[l][j][k]} for every {@code l} in {@code [0, N)}.
+ */
+class ZMatMul3D extends Kernel {
+   /** Left operand, read as {@code A[i][j][l]}. */
+   boolean[][][] A;
+
+   /** Right operand, read as {@code B[l][j][k]}. */
+   boolean[][][] B;
+
+   /** Result buffer; XOR-accumulated into, so its previous contents are kept. */
+   boolean[][][] C;
+
+   /** Extent used for every dimension when indexing. */
+   int N;
+
+   public ZMatMul3D(boolean[][][] A, boolean[][][] B, boolean[][][] C, int N) {
+      this.N = N;
+      this.C = C;
+      this.B = B;
+      this.A = A;
+   }
+
+   /** XORs into one cell of {@code C} the AND of each paired element. */
+   @Override public void run() {
+      final int gid = getGlobalId();
+      final int plane = gid / (N * N);
+      final int row = (gid / N) % N;
+      final int col = gid % N;
+      for (int term = 0; term < N; term++) {
+         C[plane][row][col] ^= A[plane][row][term] & B[term][row][col];
+      }
+   }
+}