Correlation/co-occurrence matrix computation code -- as presented at the 2012...

Correlation/co-occurrence matrix computation code -- as presented at the 2012 AMD Fusion Developer Summit -- modified to meet the standards of the official Aparapi code base. Please see included CC-4257_FINAL_060612.pdf for in-depth information explaining how this code is intended to function.

Correlation/co-occurrence matrix computation code -- as presented at the 2012...
Correlation/co-occurrence matrix computation code -- as presented at the 2012 AMD Fusion Developer Summit -- modified to meet the standards of the official Aparapi code base. Please see included CC-4257_FINAL_060612.pdf for in-depth information explaining how this code is intended to function.
4fa3ab7b · Ryan LaMothe · 8b721b27 · 4fa3ab7b · 4fa3ab7b · 4fa3ab7b
Commit 4fa3ab7b authored 11 years ago by Ryan LaMothe
--- a/examples/correlation-matrix/.classpath
+++ b/examples/correlation-matrix/.classpath
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+	<classpathentry kind="src" path="src/java"/>
+	<classpathentry kind="src" path="src/test"/>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
+	<classpathentry kind="lib" path="/com.amd.aparapi/dist/aparapi.jar" sourcepath="/com.amd.aparapi">
+		<attributes>
+			<attribute name="org.eclipse.jdt.launching.CLASSPATH_ATTR_LIBRARY_PATH_ENTRY" value="com.amd.aparapi.jni/dist"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="lib" path="/third-party/apache/commons/commons-lang3-3.1.jar"/>
+	<classpathentry kind="lib" path="/third-party/apache/logging/log4j-1.2.16.jar"/>
+	<classpathentry kind="lib" path="/third-party/apache/lucene/lucene-core-3.5.0.jar"/>
+	<classpathentry kind="lib" path="/third-party/junit/junit-4.10.jar"/>
+	<classpathentry kind="output" path="classes"/>
+</classpath>
--- a/examples/correlation-matrix/.project
+++ b/examples/correlation-matrix/.project
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>correlation-matrix</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.jdt.core.javabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+	</natures>
+</projectDescription>
--- a/examples/correlation-matrix/CC-4257_FINAL_060612.pdf
+++ b/examples/correlation-matrix/CC-4257_FINAL_060612.pdf
--- a/examples/correlation-matrix/build.xml
+++ b/examples/correlation-matrix/build.xml
+<?xml version="1.0"?>
+
+<project name="correlation-matrix" default="junit" basedir=".">
+   
+   <!-- 
+         DO NOT EDIT BELOW THIS LINE 
+   -->
+   <echo>OS Name:    	${os.name}</echo>
+   <echo>OS Version: 	${os.version}</echo>
+   <echo>OS Arch:    	${os.arch}</echo>
+   <echo>Java Version:   	${java.version}</echo>
+
+   <!-- Removes all compiled classes and JUnit output so that every build starts from scratch -->
+   <target name="clean">
+      <delete dir="classes"/>
+      <delete dir="junit"/>
+      <!-- Legacy cleanup: the "file" attribute names one literal file, so wildcards need a fileset -->
+      <delete>
+         <fileset dir="${basedir}" includes="junit*.jar"/>
+      </delete>
+   </target>
+
+   <path id="classpath">
+      <pathelement path="${basedir}/../../com.amd.aparapi/dist/aparapi.jar"/>
+      <pathelement path="${basedir}/../third-party/apache/commons/commons-lang3-3.1.jar"/>
+      <pathelement path="${basedir}/../third-party/apache/logging/log4j-1.2.16.jar"/>
+      <pathelement path="${basedir}/../third-party/apache/lucene/lucene-core-3.5.0.jar"/>
+	  <pathelement path="${basedir}/../third-party/junit/junit-4.10.jar"/>
+      <pathelement path="${junit.home}/${junit.jar.name}"/>
+      <pathelement path="classes"/>
+   </path>
+
+   <!--
+         Default target: cleans, compiles the runtime code and the tests into "classes",
+         runs every test found under src/test in a forked JVM and writes a report to junit/data.
+   -->
+   <target name="junit" depends="clean">
+      <mkdir dir="classes"/>
+      <mkdir dir="junit/data"/>
+      
+      <!-- Runtime Code -->
+      <javac debug="true"
+         debuglevel="lines,vars,source"
+         srcdir="src/java" 
+         destdir="classes" 
+         includeAntRuntime="false"
+         classpathref="classpath">
+         <compilerarg value="-Xlint"/>
+         <compilerarg value="-Xlint:-path"/>
+      </javac>
+      
+      <!-- JUnit Tests (compiled into the same output directory as the runtime code) -->
+      <javac debug="true"
+         debuglevel="lines,vars,source"
+         srcdir="src/test" 
+         destdir="classes" 
+         includeAntRuntime="false"
+         classpathref="classpath">
+         <compilerarg value="-Xlint"/>
+         <compilerarg value="-Xlint:-path"/>
+      </javac>
+      
+      <!-- The log4j configuration must be on the classpath at test time -->
+      <copy todir="classes" file="src/java/log4j.xml"/>
+
+      <!-- even though fork is slower we need to set the library path and this requires fork -->
+      <!-- NOTE(review): failures only set the "tests.failed" property; nothing visible in this file checks it, so the build presumably still succeeds when tests fail - confirm this is intended -->
+      <junit printsummary="false" fork="true" haltonfailure="false" failureproperty="tests.failed" showoutput="true">
+         <!-- Location of the Aparapi native (JNI) library -->
+         <sysproperty key="java.library.path" value="${basedir}/../../com.amd.aparapi.jni/dist"/>
+         
+         <!-- USER DEFINED PROPERTIES -->
+         <!-- Passed to the forked test JVM as system properties -->
+         <sysproperty key="numRows" value="1024"/>
+         <sysproperty key="numColumns" value="16384"/>
+         <sysproperty key="useGPU" value="true"/>
+         
+         <formatter type="xml" />
+         <classpath refid="classpath"/>
+         <!-- Every file under src/test is treated as a test class; XML results go to junit/data -->
+         <batchtest todir="junit/data">
+            <fileset dir="src/test"/>
+         </batchtest>
+      </junit>
+
+      <!-- Aggregate the XML results written above into a report in the same directory -->
+      <junitreport todir="junit/data">
+         <fileset dir="junit/data"/>
+      </junitreport>
+   </target>
+
+</project>
\ No newline at end of file
--- a/examples/correlation-matrix/src/java/gov/pnnl/aparapi/matrix/CorrMatrixHost.java
+++ b/examples/correlation-matrix/src/java/gov/pnnl/aparapi/matrix/CorrMatrixHost.java
+/**
+ * This material was prepared as an account of work sponsored by an agency of the United States Government.  
+ * Neither the United States Government nor the United States Department of Energy, nor Battelle, nor any of 
+ * their employees, nor any jurisdiction or organization that has cooperated in the development of these materials, 
+ * makes any warranty, express or implied, or assumes any legal liability or responsibility for the accuracy, 
+ * completeness, or usefulness of any information, apparatus, product, software, or process disclosed, or represents
+ * that its use would not infringe privately owned rights.
+ */
+package gov.pnnl.aparapi.matrix;
+
+import org.apache.log4j.Logger;
+
+import com.amd.aparapi.Kernel;
+import com.amd.aparapi.Kernel.EXECUTION_MODE;
+import com.amd.aparapi.Range;
+import com.amd.aparapi.device.Device;
+import com.amd.aparapi.device.OpenCLDevice;
+
+/**
+ * GPU calculations using OpenBitSet Intersection for OpenBitSets
+ * 
+ * Based on code from: <br/>
+ * {@link http://grepcode.com/file/repo1.maven.org/maven2/org.apache.lucene/lucene-core/3.1.0/org/apache/lucene/util/BitUtil.java}
+ * 
+ * @author ryan.lamothe at gmail.com
+ * @author sedillard at gmail.com
+ */
+public class CorrMatrixHost {
+
+   private static final Logger LOG = Logger.getLogger(CorrMatrixHost.class);
+
+   /**
+    * Computes the intersection (co-occurrence) count between every row of matrixA and every row of matrixB,
+    * where each row is a Lucene OpenBitSet-style bit set packed into longs.
+    * <p>
+    * When an OpenCL device is selected, the rows are split into stripes sized from the memory limits the
+    * device reports. Each pair of stripes is handed to a {@link CorrMatrixKernel} and the partial results are
+    * copied into the returned matrix.
+    * 
+    * @param matrixA
+    *    The first term-document matrix (one packed bit set per row)
+    * @param matrixB
+    *    The second term-document matrix; its rows must contain as many longs as the rows of matrixA
+    * @param executionMode
+    *    The requested Aparapi EXECUTION_MODE. CPU and GPU are attempted through OpenCL with a fall-back
+    *    towards JTP; every other mode is executed as JTP
+    * @return result matrix with matrixA.length rows and matrixB.length columns, where element [i][j] is the
+    *    number of bits set in both row i of matrixA and row j of matrixB
+    * @throws NullPointerException
+    *    If matrixA, matrixB or executionMode is null
+    * @throws IllegalArgumentException
+    *    If the rows do not all have the same length, or a stripe does not fit into a Java array
+    * @throws Exception
+    *    Declared for backward compatibility with existing callers
+    */
+   public static int[][] intersectionMatrix(final long[][] matrixA, final long[][] matrixB, final EXECUTION_MODE executionMode) throws Exception {
+
+      // Basic validation
+      if (matrixA == null) {
+         throw new NullPointerException("MatrixA cannot be NULL");
+      }
+
+      if (matrixB == null) {
+         throw new NullPointerException("MatrixB cannot be NULL");
+      }
+
+      // Fail up front rather than part-way through the device selection below
+      if (executionMode == null) {
+         throw new NullPointerException("ExecutionMode cannot be NULL");
+      }
+
+      // Nothing to intersect: return an empty result instead of failing on matrixA[0] or dividing by zero sub-rows
+      if ((matrixA.length == 0) || (matrixB.length == 0)) {
+         return new int[matrixA.length][matrixB.length];
+      }
+
+      // Size of an array is 8 bytes for the object + 4 bytes for the header and length information
+      final int arrayMemOverhead = 12;
+
+      // numDocs/64 since they are packed into longs
+      final int matrixA_numTerms = matrixA.length;
+      final int matrixA_numLongs = matrixA[0].length;
+
+      if (LOG.isDebugEnabled()) {
+         LOG.debug("----------");
+         LOG.debug("MatrixA NumTerms (Rows): " + matrixA_numTerms);
+         LOG.debug("MatrixA NumLongs (Columns): " + matrixA_numLongs);
+         LOG.debug("MatrixA NumDocs: " + (matrixA_numLongs * 64L));
+      }
+
+      final long matrixA_BytesPerRow = matrixA_numLongs * 8L;
+      final long matrixA_TotalBytes = (matrixA_numTerms * matrixA_BytesPerRow) + arrayMemOverhead;
+
+      if (LOG.isDebugEnabled()) {
+         LOG.debug("MatrixA Total Memory Size: " + humanReadableByteCount(matrixA_TotalBytes, true));
+      }
+
+      final int matrixB_numTerms = matrixB.length;
+      final int matrixB_numLongs = matrixB[0].length;
+
+      // The kernel is given a single row length, so both matrices have to agree on it
+      if (matrixA_numLongs != matrixB_numLongs) {
+         throw new IllegalArgumentException("All rows in both matrices need to be the same length (matrixA: " + matrixA_numLongs + ", matrixB: " + matrixB_numLongs + ")");
+      }
+
+      if (LOG.isDebugEnabled()) {
+         LOG.debug("----------");
+         LOG.debug("MatrixB NumTerms (Rows): " + matrixB_numTerms);
+         LOG.debug("MatrixB NumLongs (Columns): " + matrixB_numLongs);
+         LOG.debug("MatrixB NumDocs: " + (matrixB_numLongs * 64L));
+      }
+
+      final long matrixB_BytesPerRow = matrixB_numLongs * 8L;
+      final long matrixB_TotalBytes = (matrixB_numTerms * matrixB_BytesPerRow) + arrayMemOverhead;
+
+      if (LOG.isDebugEnabled()) {
+         LOG.debug("MatrixB Total Memory Size: " + humanReadableByteCount(matrixB_TotalBytes, true));
+         LOG.debug("----------");
+      }
+
+      final int[][] resultMatrix = new int[matrixA_numTerms][matrixB_numTerms];
+
+      if (LOG.isDebugEnabled()) {
+         // The long literal comes first so that the whole product is evaluated in 64 bits
+         final long resultMatrix_TotalBytes = (4L * matrixA_numTerms * matrixB_numTerms) + arrayMemOverhead;
+         LOG.debug("ResultMatrix Memory Size: " + humanReadableByteCount(resultMatrix_TotalBytes, true));
+         LOG.debug("Total Requested Memory Size: " + humanReadableByteCount(matrixA_TotalBytes + matrixB_TotalBytes + resultMatrix_TotalBytes, true));
+         LOG.debug("----------");
+      }
+
+      int NUM_SUB_ROWS = matrixA_numTerms; // Default number of sub-rows
+
+      OpenCLDevice device = null;
+
+      // We do not test for EXECUTION_MODE.JTP because JTP is non-OpenCL
+      if (executionMode.equals(EXECUTION_MODE.CPU)) {
+         device = (OpenCLDevice) Device.firstCPU();
+
+         if (device == null) {
+            LOG.warn("OpenCLDevice.CPU is NULL...OpenCL is unavailable. Setting to JTP mode.");
+            LOG.debug("----------");
+         }
+      } else if (executionMode.equals(EXECUTION_MODE.GPU)) {
+         device = (OpenCLDevice) Device.best();
+
+         if (device == null) {
+            LOG.warn("OpenCLDevice.GPU is NULL...OpenCL is unavailable. Setting to JTP mode.");
+            LOG.debug("----------");
+         }
+      }
+
+      // This is to create stripes of rows that will fit into OpenCL's available memory
+      // Calculate the number of sub-rows by calling OpenCL to find out available memory
+      // Length of row * 8 (size of long in bytes) * number of rows to available memory
+      final int maxNumTerms = Math.max(matrixA_numTerms, matrixB_numTerms);
+
+      if (device != null) {
+         final long globalMemSize = device.getGlobalMemSize();
+         // final long maxMemAllocSize = Math.max((globalMemSize/4), 128*1024*1024);
+         final long maxMemAllocSize = device.getMaxMemAllocSize();
+
+         // 1048576 bytes in a megabyte (1024*1024)
+         // Java long is 8 bytes
+         // 131072 longs in 1 megabyte
+         // SAFE OpenCL spec allocation is max(1/4 GlobalMemSize)
+         // ***During our testing this appears to be incorrectly/inconsistently reported depending on os/drivers/hardware***
+         if (LOG.isDebugEnabled()) {
+            LOG.debug("Available OpenCL globalMemSize: " + humanReadableByteCount(globalMemSize, true));
+            LOG.debug("Available OpenCL maxMemAllocSize: " + humanReadableByteCount(maxMemAllocSize, true));
+         }
+
+         // Maybe there is a more clever way to do this :)
+         // The idea here is to decide how many sub-rows of the matrix we can fit on a single card
+         // The long-term goal to divide up the work for both small RAM GPUs and multiple GPUs
+         int subRowsCounterA = 0;
+         int subRowsCounterB = 0;
+         long subRowsMemSizeA = 0L;
+         long subRowsMemSizeB = 0L;
+         long subResultMatrixMemSize = 0L;
+         long subTotalMemSize = 0L;
+
+         // All byte counts below start with a long literal so that large matrices cannot overflow int arithmetic
+         do {
+            if (subRowsCounterA < matrixA_numTerms) {
+               subRowsMemSizeA = subRowsCounterA != 0 ? (8L * subRowsCounterA * matrixA_numLongs) + arrayMemOverhead : 0;
+               subRowsCounterA += 1;
+            } else if (subRowsCounterA == matrixA_numTerms) {
+               subRowsMemSizeA = subRowsCounterA != 0 ? (8L * subRowsCounterA * matrixA_numLongs) + arrayMemOverhead : 0;
+            }
+
+            if (subRowsCounterB < matrixB_numTerms) {
+               subRowsMemSizeB = subRowsCounterB != 0 ? (8L * subRowsCounterB * matrixB_numLongs) + arrayMemOverhead : 0;
+               subRowsCounterB += 1;
+            } else if (subRowsCounterB == matrixB_numTerms) {
+               subRowsMemSizeB = subRowsCounterB != 0 ? (8L * subRowsCounterB * matrixB_numLongs) + arrayMemOverhead : 0;
+            }
+
+            // This is 4 bytes since the sub-result matrix is an int array
+            subResultMatrixMemSize = (4L * subRowsCounterA * subRowsCounterB) + arrayMemOverhead;
+
+            subTotalMemSize = subRowsMemSizeA + subRowsMemSizeB + subResultMatrixMemSize;
+         } while ((Math.max(subRowsCounterA, subRowsCounterB) < maxNumTerms) && (subTotalMemSize <= maxMemAllocSize));
+
+         // If using OpenCL override the default number of subrows
+         NUM_SUB_ROWS = Math.max(subRowsCounterA, subRowsCounterB);
+
+         if (NUM_SUB_ROWS < maxNumTerms) {
+            final long subMatrixA_memSize = (8L * NUM_SUB_ROWS * matrixA_numLongs) + arrayMemOverhead;
+            final long subMatrixB_memSize = (8L * NUM_SUB_ROWS * matrixB_numLongs) + arrayMemOverhead;
+            final long subResultMatrix_memSize = (4L * NUM_SUB_ROWS * NUM_SUB_ROWS) + arrayMemOverhead;
+
+            LOG.warn("****************************************************************");
+            LOG.warn("Requested matrix computation is larger than available OpenCL memory");
+            LOG.warn("Matrix striping is occurring to fit all data into OpenCL memory...");
+            LOG.warn("");
+            LOG.warn("Number rows requested: " + maxNumTerms);
+            LOG.warn("Number rows that fit: " + NUM_SUB_ROWS);
+            LOG.warn("");
+            LOG.warn("SubMatrixA Memory Size: " + humanReadableByteCount(subMatrixA_memSize, true));
+            LOG.warn("SubMatrixB Memory Size: " + humanReadableByteCount(subMatrixB_memSize, true));
+            LOG.warn("SubResultMatrix Memory Size: " + humanReadableByteCount(subResultMatrix_memSize, true));
+            LOG.warn("SubMatrix Total Memory Size: " + humanReadableByteCount(subMatrixA_memSize + subMatrixB_memSize + subResultMatrix_memSize, true));
+            LOG.warn("****************************************************************");
+         }
+      }
+
+      // Ceiling division: number of stripes needed to cover all rows of each matrix
+      final int numSubBlocksA = ((matrixA_numTerms + NUM_SUB_ROWS) - 1) / NUM_SUB_ROWS;
+      final int numSubBlocksB = ((matrixB_numTerms + NUM_SUB_ROWS) - 1) / NUM_SUB_ROWS;
+
+      // Compute the flattened buffer lengths in 64 bits so that an overflow is reported instead of wrapping around
+      final long subMatrixA_length = (long) NUM_SUB_ROWS * matrixA_numLongs;
+      final long subMatrixB_length = (long) NUM_SUB_ROWS * matrixB_numLongs;
+      final long subResultMatrix_length = (long) NUM_SUB_ROWS * NUM_SUB_ROWS;
+
+      if (Math.max(subResultMatrix_length, Math.max(subMatrixA_length, subMatrixB_length)) > Integer.MAX_VALUE) {
+         throw new IllegalArgumentException("A stripe of " + NUM_SUB_ROWS + " rows does not fit into a Java array");
+      }
+
+      final long[] subMatrixA = new long[(int) subMatrixA_length];
+      final long[] subMatrixB = new long[(int) subMatrixB_length];
+      final int[] subResultMatrix = new int[(int) subResultMatrix_length];
+
+      final CorrMatrixKernel kernel = new CorrMatrixKernel(subMatrixA, NUM_SUB_ROWS, subMatrixB, NUM_SUB_ROWS, matrixA_numLongs, subResultMatrix);
+      kernel.setExplicit(true);
+
+      // Here we define a fall-back strategy, since the user may have wanted to execute only a single execution mode
+      if (executionMode.equals(EXECUTION_MODE.GPU) && (device != null)) {
+         kernel.addExecutionModes(EXECUTION_MODE.GPU, EXECUTION_MODE.CPU, EXECUTION_MODE.JTP);
+         LOG.debug("Execution Fallback Strategy: GPU --> CPU --> JTP");
+      } else if (executionMode.equals(EXECUTION_MODE.CPU) && (device != null)) {
+         kernel.addExecutionModes(EXECUTION_MODE.CPU, EXECUTION_MODE.JTP);
+         LOG.debug("Execution Fallback Strategy: CPU --> JTP");
+      } else {
+         kernel.addExecutionModes(EXECUTION_MODE.JTP);
+         LOG.debug("Execution Strategy: JTP");
+      }
+
+      try {
+         for (int a = 0; a < numSubBlocksA; a++) {
+            for (int b = 0; b < numSubBlocksB; b++) {
+               final int aSubRowStart = a * NUM_SUB_ROWS;
+               final int aSubRowEnd = Math.min(matrixA_numTerms, aSubRowStart + NUM_SUB_ROWS);
+
+               // Flatten the current stripe of matrixA into the kernel's one dimensional input buffer
+               for (int i = aSubRowStart; i < aSubRowEnd; i++) {
+                  if (matrixA_numLongs != matrixA[i].length) {
+                     throw new IllegalArgumentException("All rows in the matrix need be the same length");
+                  }
+
+                  System.arraycopy(matrixA[i], 0, subMatrixA, (i - aSubRowStart) * matrixA_numLongs, matrixA_numLongs);
+               }
+
+               final int bSubRowStart = b * NUM_SUB_ROWS;
+               final int bSubRowEnd = Math.min(matrixB_numTerms, bSubRowStart + NUM_SUB_ROWS);
+
+               // Flatten the current stripe of matrixB; its rows are validated against the shared row length
+               for (int i = bSubRowStart; i < bSubRowEnd; i++) {
+                  if (matrixA_numLongs != matrixB[i].length) {
+                     throw new IllegalArgumentException("All rows in the matrix need be the same length");
+                  }
+
+                  System.arraycopy(matrixB[i], 0, subMatrixB, (i - bSubRowStart) * matrixB_numLongs, matrixB_numLongs);
+               }
+
+               // Since matrixA_NumLongs == matrixB_NumLongs we're only going to pass matrixA_NumLongs
+               executeKernel(device, subMatrixA, aSubRowEnd - aSubRowStart, subMatrixB, bSubRowEnd - bSubRowStart, matrixA_numLongs, subResultMatrix, kernel);
+
+               // Convert one dimensional array to two dimensional array in the expected output ordering
+               for (int i = 0; i < NUM_SUB_ROWS; i++) {
+                  if ((i + aSubRowStart) < aSubRowEnd) {
+                     System.arraycopy(subResultMatrix, i * NUM_SUB_ROWS, resultMatrix[i + aSubRowStart], bSubRowStart, bSubRowEnd - bSubRowStart);
+                  }
+               }
+            }
+         }
+      } finally {
+         if (LOG.isDebugEnabled()) {
+            LOG.debug("----------");
+            LOG.debug("Aparapi Gross Execution Time: " + kernel.getAccumulatedExecutionTime() + " ms <------ Aparapi");
+            LOG.debug("OpenCL Generation Time: " + kernel.getConversionTime() + " ms");
+            LOG.debug("Kernel Net Execution Time: " + (kernel.getAccumulatedExecutionTime() - kernel.getConversionTime()) + " ms");
+            LOG.debug("----------");
+         }
+
+         // Always release the kernel; a missing native library must not mask the real outcome
+         try {
+            kernel.dispose();
+         } catch (final UnsatisfiedLinkError e) {
+            LOG.error("Aparapi failed to dispose of the kernel", e);
+         }
+      }
+
+      return resultMatrix;
+   }
+
+   /**
+    * Executes the kernel once over the given sub-matrices and reads the result buffer back.
+    * <p>
+    * The buffers are transferred explicitly with put/get. The launch range is rounded up to powers of two in
+    * both dimensions.
+    * 
+    * @param device
+    *    The OpenCL device used to create the range, or null to create a range without a device
+    * @param subMatrixA
+    *    Flattened rows of the first sub-matrix
+    * @param matrixA_NumTerms
+    *    Number of rows of subMatrixA to process
+    * @param subMatrixB
+    *    Flattened rows of the second sub-matrix
+    * @param matrixB_NumTerms
+    *    Number of rows of subMatrixB to process
+    * @param numLongs
+    *    Number of longs per row (not used by this method)
+    * @param subResultMatrix
+    *    Buffer that receives the results after execution
+    * @param kernel
+    *    The kernel to execute
+    */
+   private static void executeKernel(final Device device, final long[] subMatrixA, final int matrixA_NumTerms, final long[] subMatrixB, final int matrixB_NumTerms, final int numLongs, final int[] subResultMatrix, final Kernel kernel) {
+
+      // Power of Two for best performance
+      final int matrixA_NumTermsRnd = nextPowerOfTwo(matrixA_NumTerms);
+      final int matrixB_NumTermsRnd = nextPowerOfTwo(matrixB_NumTerms);
+
+      final Range range;
+      if (device != null) {
+         range = Range.create2D(device, matrixA_NumTermsRnd, matrixB_NumTermsRnd);
+      } else {
+         range = Range.create2D(matrixA_NumTermsRnd, matrixB_NumTermsRnd);
+      }
+
+      if (LOG.isDebugEnabled()) {
+         LOG.debug("Range: " + range);
+      }
+
+      // Send the input and output buffers before the run
+      kernel.put(subMatrixA);
+      kernel.put(subMatrixB);
+      kernel.put(subResultMatrix);
+
+      kernel.execute(range);
+
+      // Fetch the computed results back into the caller's buffer
+      kernel.get(subResultMatrix);
+   }
+
+   /**
+    * Returns the smallest power of two that is greater than or equal to n, computed without looping.
+    * <p>
+    * Values below one, and values for which the next power of two does not fit into an int, yield 1. This
+    * matches the result of incrementing n until {@link #isPowerOfTwo(int)} holds.
+    * 
+    * @param n
+    *    The value to round up
+    * @return the rounded value
+    */
+   private static int nextPowerOfTwo(final int n) {
+      if (isPowerOfTwo(n)) {
+         return n;
+      }
+
+      // Doubling the highest set bit gives the next power of two; it becomes <= 0 for n <= 0 and on overflow
+      return Math.max(1, Integer.highestOneBit(n) << 1);
+   }
+
+   /**
+    * Highly efficient means to compute whether a number is a power of 2<br>
+    * Based on code from http://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2
+    * <p>
+    * Another very cool way to do this is ((x&(-x))==x)
+    * 
+    * @param n
+    * @return boolean
+    */
+   private static boolean isPowerOfTwo(int n) {
+      return (n > 0) && ((n & (n - 1)) == 0);
+   }
+
+   /**
+    * Rounds a number to the multiple indicated
+    * 
+    * @param num
+    * @param multiple
+    * @return
+    */
+   private static int roundToMultiple(double num, int multiple) {
+      return (int) (Math.ceil(num / multiple) * multiple);
+   }
+
+   /**
+    * Converts a byte count into a human readable string such as "1.5 MB" or "1.5 MiB".<br>
+    * Based on code from http://stackoverflow.com/questions/3758606/how-to-convert-byte-size-into-human-readable-format-in-java
+    * <p>
+    * The unit is chosen after taking the one-decimal rounding into account, so a value just below a unit
+    * boundary is printed in the next unit ("1.0 MB") rather than as "1000.0 kB". The number is always formatted
+    * with a '.' decimal separator, independent of the default locale.
+    * 
+    * @param bytes
+    *    The number of bytes; values below one unit (including negative values) are printed as plain bytes
+    * @param si
+    *    true for decimal units (1000, kB/MB/...), false for binary units (1024, KiB/MiB/...)
+    * @return humanReadableByteCount
+    */
+   private static String humanReadableByteCount(long bytes, boolean si) {
+      final int unit = si ? 1000 : 1024;
+      if (bytes < unit) {
+         return bytes + " B";
+      }
+
+      final String prefixes = si ? "kMGTPE" : "KMGTPE";
+      double value = (double) bytes / unit;
+      int prefixIndex = 0;
+
+      // Move to the next unit while "%.1f" would round the value up to a full unit (unit - 0.05 and above)
+      while ((prefixIndex < (prefixes.length() - 1)) && (value >= (unit - 0.05))) {
+         value /= unit;
+         prefixIndex++;
+      }
+
+      return String.format(java.util.Locale.ROOT, "%.1f %s%sB", value, prefixes.charAt(prefixIndex), si ? "" : "i");
+   }
+}
--- a/examples/correlation-matrix/src/java/gov/pnnl/aparapi/matrix/CorrMatrixKernel.java
+++ b/examples/correlation-matrix/src/java/gov/pnnl/aparapi/matrix/CorrMatrixKernel.java
+/**
+ * This material was prepared as an account of work sponsored by an agency of the United States Government.  
+ * Neither the United States Government nor the United States Department of Energy, nor Battelle, nor any of 
+ * their employees, nor any jurisdiction or organization that has cooperated in the development of these materials, 
+ * makes any warranty, express or implied, or assumes any legal liability or responsibility for the accuracy, 
+ * completeness, or usefulness of any information, apparatus, product, software, or process disclosed, or represents
+ * that its use would not infringe privately owned rights.
+ */
+package gov.pnnl.aparapi.matrix;
+
+import com.amd.aparapi.Kernel;
+
+/**
+ * This kernel attempts to re-implement the Lucene OpenBitSet functionality on a GPU
+ * 
+ * Based on code from: <br/>
+ * {@link http://grepcode.com/file/repo1.maven.org/maven2/org.apache.lucene/lucene-core/3.1.0/org/apache/lucene/util/BitUtil.java}
+ * 
+ * @author ryan.lamothe at gmail.com
+ * @author sedillard at gmail.com
+ */
+public class CorrMatrixKernel extends Kernel {
+
+   final long[] matrixA;
+
+   final int matrixA_NumTerms;
+
+   final long[] matrixB;
+
+   final int matrixB_NumTerms;
+
+   int numLongs;
+
+   int[] resultMatrix;
+
+   /**
+    * Creates a kernel that works on the given flattened input matrices and writes into the given result buffer.
+    * 
+    * @param matrixA
+    *    Rows of the first matrix, stored one after another with numLongs longs per row
+    * @param matrixA_NumTerms
+    *    Number of rows of matrixA the kernel may address
+    * @param matrixB
+    *    Rows of the second matrix, stored one after another with numLongs longs per row
+    * @param matrixB_NumTerms
+    *    Number of rows of matrixB the kernel may address; also the row stride of the result buffer
+    * @param numLongs
+    *    Number of longs in every row of both matrices
+    * @param resultMatrix
+    *    Buffer receiving one int per (matrixA row, matrixB row) pair
+    */
+   public CorrMatrixKernel(final long[] matrixA, final int matrixA_NumTerms, final long[] matrixB, final int matrixB_NumTerms,
+         final int numLongs, final int[] resultMatrix) {
+      // Input buffers
+      this.matrixA = matrixA;
+      this.matrixB = matrixB;
+
+      // Dimensions
+      this.matrixA_NumTerms = matrixA_NumTerms;
+      this.matrixB_NumTerms = matrixB_NumTerms;
+      this.numLongs = numLongs;
+
+      // Output buffer
+      this.resultMatrix = resultMatrix;
+   }
+
+   /**
+    * Kernel entry point: each work item computes one cell of the result matrix.
+    * <p>
+    * Global id 0 selects the row of matrixA and global id 1 selects the row of matrixB. Work items whose ids
+    * lie outside the configured term counts do nothing, so the launch range may be larger than the matrices.
+    */
+   @Override
+   public void run() {
+      // Row index into matrixA
+      final int i = this.getGlobalId(0);
+
+      if (i < matrixA_NumTerms) {
+         // Row index into matrixB
+         final int j = this.getGlobalId(1);
+
+         if (j < matrixB_NumTerms) {
+            // The result buffer is row-major with matrixB_NumTerms entries per row;
+            // both input rows start at (row index * numLongs) in their flattened buffers
+            // For testing purposes, you can use the naive implementation to compare performance
+            resultMatrix[(i * matrixB_NumTerms) + j] = pop_intersect(matrixA, i * numLongs, matrixB, j * numLongs, numLongs);
+            // this.resultMatrix[i * matrixB_NumTerms + j] = this.naive_pop_intersect(matrixA, i * numLongs, matrixB, j * numLongs, numLongs);
+         }
+      }
+   }
+
+   /**
+    * A naive implementation of the pop_intersect code below: ANDs the two rows word by word and adds up the
+    * bit count of every resulting word.
+    * 
+    * @param matrixA
+    *    Buffer holding the first row
+    * @param aStart
+    *    Index of the first word of the row inside matrixA
+    * @param matrixB
+    *    Buffer holding the second row
+    * @param bStart
+    *    Index of the first word of the row inside matrixB
+    * @param numWords
+    *    Number of words to process in each row
+    * @return the number of bits set in both rows
+    */
+   private int naive_pop_intersect(final long matrixA[], final int aStart, final long matrixB[], final int bStart, final int numWords) {
+      int sum = 0;
+
+      for (int i = 0; i < numWords; i++) {
+         // Bits set in both words at the same position
+         sum += pop(matrixA[aStart + i] & matrixB[bStart + i]);
+      }
+
+      return sum;
+   }
+
+   /**
+    * Returns the popcount or cardinality of the two sets after an intersection.
+    * Neither array is modified.
+    * <p>
+    * The words are ANDed pairwise and fed through carry-save adders (CSA) so that pop() is called once per
+    * group of words instead of once per word. The accumulators ones, twos and fours hold pending bits of
+    * weight 1, 2 and 4; every carry out of the fours level ("eights") is counted immediately into tot8, which
+    * is weighted by 8 at the end.
+    * 
+    * Modified for the purposes of this kernel from its original version
+    * 
+    * @param matrixA
+    *    Buffer holding the first row
+    * @param aStart
+    *    Index of the first word of the row inside matrixA
+    * @param matrixB
+    *    Buffer holding the second row
+    * @param bStart
+    *    Index of the first word of the row inside matrixB
+    * @param numWords
+    *    Number of words to process in each row
+    * @return the number of bits set in both rows
+    */
+   private int pop_intersect(final long matrixA[], final int aStart, final long matrixB[], final int bStart, final int numWords) {
+
+      /*
+       * http://grepcode.com/file/repo1.maven.org/maven2/org.apache.lucene/lucene-core/3.1.0/org/apache/lucene/util/BitUtil.java
+       */
+
+      // generated from pop_array via sed 's/A\[\([^]]*\)\]/\(A[\1] \& B[\1]\)/g'
+      final int n = numWords;
+      int tot = 0, tot8 = 0;
+      long ones = 0, twos = 0, fours = 0;
+
+      // Main loop: consume eight words per iteration
+      int i;
+      for (i = 0; i <= (n - 8); i += 8) {
+         long twosA = 0;
+         long twosB = 0;
+         long foursA = 0;
+         long foursB = 0;
+         long eights = 0;
+
+         final int ai = aStart + i;
+         final int bi = bStart + i;
+
+         // CSA(twosA, ones, ones, (A[i] & B[i]), (A[i+1] & B[i+1]))
+         {
+            final long b = matrixA[ai] & matrixB[bi], c = matrixA[ai + 1] & matrixB[bi + 1];
+            final long u = ones ^ b;
+            twosA = (ones & b) | (u & c);
+            ones = u ^ c;
+         }
+
+         // CSA(twosB, ones, ones, (A[i+2] & B[i+2]), (A[i+3] & B[i+3]))
+         {
+            final long b = matrixA[ai + 2] & matrixB[bi + 2], c = matrixA[ai + 3] & matrixB[bi + 3];
+            final long u = ones ^ b;
+            twosB = (ones & b) | (u & c);
+            ones = u ^ c;
+         }
+
+         // CSA(foursA, twos, twos, twosA, twosB)
+         {
+            final long u = twos ^ twosA;
+            foursA = (twos & twosA) | (u & twosB);
+            twos = u ^ twosB;
+         }
+
+         // CSA(twosA, ones, ones, (A[i+4] & B[i+4]), (A[i+5] & B[i+5]))
+         {
+            final long b = matrixA[ai + 4] & matrixB[bi + 4], c = matrixA[ai + 5] & matrixB[bi + 5];
+            final long u = ones ^ b;
+            twosA = (ones & b) | (u & c);
+            ones = u ^ c;
+         }
+
+         // CSA(twosB, ones, ones, (A[i+6] & B[i+6]), (A[i+7] & B[i+7]))
+         {
+            final long b = matrixA[ai + 6] & matrixB[bi + 6], c = matrixA[ai + 7] & matrixB[bi + 7];
+            final long u = ones ^ b;
+            twosB = (ones & b) | (u & c);
+            ones = u ^ c;
+         }
+
+         // CSA(foursB, twos, twos, twosA, twosB)
+         {
+            final long u = twos ^ twosA;
+            foursB = (twos & twosA) | (u & twosB);
+            twos = u ^ twosB;
+         }
+
+         // CSA(eights, fours, fours, foursA, foursB)
+         {
+            final long u = fours ^ foursA;
+            eights = (fours & foursA) | (u & foursB);
+            fours = u ^ foursB;
+         }
+
+         // Each bit in eights stands for eight set bits; the weighting is applied once at the end
+         tot8 += pop(eights);
+      }
+
+      // Remainder: a group of four words, if at least four are left
+      if (i <= (n - 4)) {
+         final int ai = aStart + i;
+         final int bi = bStart + i;
+
+         long twosA = 0;
+         long twosB = 0;
+         long foursA = 0;
+         long eights = 0;
+
+         {
+            final long b = matrixA[ai] & matrixB[bi], c = matrixA[ai + 1] & matrixB[bi + 1];
+            final long u = ones ^ b;
+            twosA = (ones & b) | (u & c);
+            ones = u ^ c;
+         }
+
+         {
+            final long b = matrixA[ai + 2] & matrixB[bi + 2], c = matrixA[ai + 3] & matrixB[bi + 3];
+            final long u = ones ^ b;
+            twosB = (ones & b) | (u & c);
+            ones = u ^ c;
+         }
+
+         {
+            final long u = twos ^ twosA;
+            foursA = (twos & twosA) | (u & twosB);
+            twos = u ^ twosB;
+         }
+
+         // Only one fours input here, so the carry into eights is a plain AND and the sum a plain XOR
+         eights = fours & foursA;
+         fours = fours ^ foursA;
+
+         tot8 += pop(eights);
+         i += 4;
+      }
+
+      // Remainder: a pair of words, if at least two are left
+      if (i <= (n - 2)) {
+         final int ai = aStart + i;
+         final int bi = bStart + i;
+
+         final long b = matrixA[ai] & matrixB[bi], c = matrixA[ai + 1] & matrixB[bi + 1];
+         final long u = ones ^ b;
+         final long twosA = (ones & b) | (u & c);
+         ones = u ^ c;
+
+         final long foursA = twos & twosA;
+         twos = twos ^ twosA;
+
+         final long eights = fours & foursA;
+         fours = fours ^ foursA;
+
+         tot8 += pop(eights);
+         i += 2;
+      }
+
+      // Remainder: a single last word is counted directly
+      if (i < n) {
+         final int ai = aStart + i;
+         final int bi = bStart + i;
+
+         tot += pop(matrixA[ai] & matrixB[bi]);
+      }
+
+      // Combine the accumulators according to their weights: fours * 4 + twos * 2 + ones + eights * 8
+      tot += (pop(fours) << 2) + (pop(twos) << 1) + pop(ones) + (tot8 << 3);
+
+      return tot;
+   }
+
+   /**
+    * Returns the number of bits set in the long
+    * 
+    * @param x
+    *    The 64 bit word to count
+    * @return the number of set bits, between 0 and 64
+    */
+   private int pop(long x) {
+
+      /*
+       * http://grepcode.com/file/repo1.maven.org/maven2/org.apache.lucene/lucene-core/3.1.0/org/apache/lucene/util/BitUtil.java
+       */
+
+      /*
+       * Hacker's Delight 32 bit pop function:
+       * http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.c.txt
+       * 
+       * int pop(unsigned x) {
+       * x = x - ((x >> 1) & 0x55555555);
+       * x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
+       * x = (x + (x >> 4)) & 0x0F0F0F0F;
+       * x = x + (x >> 8);
+       * x = x + (x >> 16);
+       * return x & 0x0000003F;
+       * }
+       * *
+       */
+
+      // 64 bit java version of the C function from above
+      // Count the bits within each 2 bit group
+      x = x - ((x >>> 1) & 0x5555555555555555L);
+      // Add neighbouring 2 bit counts into 4 bit groups
+      x = (x & 0x3333333333333333L) + ((x >>> 2) & 0x3333333333333333L);
+      // Add neighbouring 4 bit counts into one count per byte
+      x = (x + (x >>> 4)) & 0x0F0F0F0F0F0F0F0FL;
+      // Fold the per-byte counts together so that the lowest byte ends up holding the total
+      x = x + (x >>> 8);
+      x = x + (x >>> 16);
+      x = x + (x >>> 32);
+      // The total is at most 64, so seven bits are enough; the cast is applied before the mask
+      return (int) x & 0x7F;
+   }
+}
--- a/examples/correlation-matrix/src/java/log4j.xml
+++ b/examples/correlation-matrix/src/java/log4j.xml
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
+
+<!--
+   | For more configuration information and examples see the Jakarta Log4j
+   | website: http://jakarta.apache.org/log4j
+ -->
+
+<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/" debug="false">
+
+   <!-- ============================== -->
+   <!-- Append messages to the console -->
+   <!-- ============================== -->
+   
+   <appender name="CONSOLE" class="org.apache.log4j.ConsoleAppender">
+		<param name="Threshold" value="DEBUG"/>
+		<param name="Target" value="System.out"/>
+		<param name="Encoding" value="UTF-8"/>
+		
+		<layout class="org.apache.log4j.PatternLayout">
+		 	<!-- The default pattern: Date Priority [Category] (Thread) Message\n -->
+        	<param name="ConversionPattern" value="%d %-5p [%c{1}] %m%n"/>
+
+        	<!-- The full pattern: Date MS Priority [Category] (Thread:NDC) Message\n
+        	<param name="ConversionPattern" value="%d %-5r %-5p [%c] (%t:%x) %m%n"/>
+         	-->
+		</layout>
+	</appender>
+	
+	<appender name="FILE" class="org.apache.log4j.FileAppender">
+      	<param name="File" value="log/corrmatrix.log"/>
+      	<param name="Append" value="true"/>
+        <param name="Encoding" value="UTF-8"/>
+
+      	<layout class="org.apache.log4j.PatternLayout">
+        	<!-- The default pattern: Date Priority [Category] (Thread) Message\n -->
+        	<param name="ConversionPattern" value="%d %-5p [%c] %m%n"/>
+
+        	<!-- The full pattern: Date MS Priority [Category] (Thread:NDC) Message\n
+        	<param name="ConversionPattern" value="%d %-5r %-5p [%c] (%t:%x) %m%n"/>
+         	-->
+      	</layout>
+    </appender>
+
+	<!-- Limit categories -->	
+	<logger name="gov.pnnl">
+		<level value="DEBUG"/>
+	</logger>
+
+   <!-- ======================= -->
+   <!-- Setup the Root category -->
+   <!-- ======================= -->
+
+   <root>
+      <appender-ref ref="CONSOLE"/>
+      <!-- <appender-ref ref="FILE"/> -->
+   </root>
+
+</log4j:configuration>
\ No newline at end of file
--- a/examples/correlation-matrix/src/test/gov/pnnl/aparapi/test/CorrMatrixTest.java
+++ b/examples/correlation-matrix/src/test/gov/pnnl/aparapi/test/CorrMatrixTest.java
+/**
+ * This material was prepared as an account of work sponsored by an agency of the United States Government.  
+ * Neither the United States Government nor the United States Department of Energy, nor Battelle, nor any of 
+ * their employees, nor any jurisdiction or organization that has cooperated in the development of these materials, 
+ * makes any warranty, express or implied, or assumes any legal liability or responsibility for the accuracy, 
+ * completeness, or usefulness or any information, apparatus, product, software, or process disclosed, or represents
+ * that its use would not infringe privately owned rights.
+ */
+package gov.pnnl.aparapi.test;
+
+import gov.pnnl.aparapi.matrix.CorrMatrixHost;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.commons.lang3.tuple.ImmutablePair;
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.log4j.Logger;
+import org.apache.lucene.util.OpenBitSet;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import com.amd.aparapi.Kernel.EXECUTION_MODE;
+
+/**
+ * This test class performs the following functions:
+ * 
+ * 1) Create a randomly populated set of matrices for correlation/co-occurrence computation
+ * 2) Execute the CPU-based reference computation using Lucene OpenBitSets
+ * 3) Execute the Aparapi computation using the CorrMatrix host and kernel
+ *    (GPU when the "useGPU" system property is true, CPU otherwise)
+ * 4) Verify the results of OpenBitSet and CorrMatrix by comparing matrices to each other
+ *  
+ * @author ryan.lamothe at gmail.com
+ *
+ */
+public class CorrMatrixTest {
+
+   private static final Logger LOG = Logger.getLogger(CorrMatrixTest.class);
+
+   /** One (left, right) bit set pair per term (row); filled in by {@link #setup()}. */
+   private final List<Pair<OpenBitSet, OpenBitSet>> obsPairs = new ArrayList<Pair<OpenBitSet, OpenBitSet>>();
+
+   private final Random rand = new Random();
+
+   /** Reference result: entry [i][j] is the intersection count of row i's left set and row j's right set. */
+   private int[][] obsResultMatrix;
+
+   /**
+    * Populates the random test data and computes the reference matrix using OpenBitSet.
+    * 
+    * The dimensions are read from the "numRows" and "numColumns" system properties (defaults 300 and 10000).
+    * NumTerms and NumLongs (documents) need to be adjusted manually right now to force 'striping' to occur (see Host code for details)
+    */
+   @Before
+   public void setup() throws Exception {
+      /*
+       * Populate test data
+       */
+      LOG.debug("----------");
+      LOG.debug("Populating test matrix data using settings from build.xml...");
+      LOG.debug("----------");
+
+      final int numTerms = Integer.getInteger("numRows", 300); // # Rows
+      // numLongs*64 for number of actual documents since these are 'packed' longs
+      final int numLongs = Integer.getInteger("numColumns", 10000); // # Columns
+
+      for (int i = 0; i < numTerms; ++i) {
+         final long[] bits = new long[numLongs];
+         for (int j = 0; j < numLongs; ++j) {
+            bits[j] = rand.nextLong();
+         }
+
+         // Both sets of a pair are built from the same words, so the left and right matrices hold identical data
+         obsPairs.add(new ImmutablePair<OpenBitSet, OpenBitSet>(new OpenBitSet(bits, numLongs), new OpenBitSet(bits, numLongs)));
+      }
+
+      /*
+       * OpenBitSet calculations
+       */
+      LOG.debug("Executing OpenBitSet intersectionCount");
+
+      final long startTime = System.currentTimeMillis();
+
+      final int size = obsPairs.size();
+      obsResultMatrix = new int[size][size];
+
+      // This is an N^2 comparison loop
+      // FIXME This entire loop needs to be parallelized to show an apples-to-apples comparison to Aparapi
+      for (int i = 0; i < size; i++) {
+         final OpenBitSet left = obsPairs.get(i).getLeft();
+
+         for (int j = 0; j < size; j++) {
+            // # of matches in both sets of documents
+            obsResultMatrix[i][j] = (int) OpenBitSet.intersectionCount(left, obsPairs.get(j).getRight());
+         }
+      }
+
+      final long elapsedMillis = System.currentTimeMillis() - startTime;
+
+      LOG.debug("OpenBitSet Gross Execution Time: " + elapsedMillis + " ms <------OpenBitSet");
+      LOG.debug("----------");
+   }
+
+   /**
+    * Runs the Aparapi intersection matrix computation on the data prepared by {@link #setup()} and
+    * asserts that it matches the OpenBitSet reference matrix row by row.
+    * 
+    * When TRACE logging is enabled, both matrices are also written to trace/cpuOut.txt and
+    * trace/gpuOut.txt under the working directory for comparison with external tools.
+    */
+   @Test
+   public void testCorrelationMatrix() throws Exception {
+      /*
+       * GPU calculations
+       */
+      LOG.debug("Executing Aparapi intersectionCount");
+
+      final int numTerms = obsPairs.size();
+      final long[][] matrixA = new long[numTerms][];
+      final long[][] matrixB = new long[numTerms][];
+
+      // Convert OpenBitSet pairs to long primitive arrays for use with Aparapi
+      // TODO It would be nice if we could find a way to put the obsPairs onto the GPU directly :)
+      for (int i = 0; i < numTerms; i++) {
+         final Pair<OpenBitSet, OpenBitSet> pair = obsPairs.get(i);
+
+         matrixA[i] = pair.getLeft().getBits();
+         matrixB[i] = pair.getRight().getBits();
+      }
+
+      // The reason for setting this property is because the CorrMatrix host/kernel code
+      // came from a GUI where a user could select "Use Hardware Acceleration" instead
+      // of the application forcing the setting globally on the command-line
+      final EXECUTION_MODE mode = Boolean.getBoolean("useGPU") ? EXECUTION_MODE.GPU : EXECUTION_MODE.CPU;
+      final int[][] gpuResultMatrix = CorrMatrixHost.intersectionMatrix(matrixA, matrixB, mode);
+
+      // Compare the two result arrays to make sure we are generating the same output.
+      // The row counts are checked first so a truncated or oversized result cannot slip through.
+      Assert.assertNotNull("Aparapi result matrix is null", gpuResultMatrix);
+      Assert.assertEquals("Result matrices have a different number of rows", obsResultMatrix.length, gpuResultMatrix.length);
+
+      for (int i = 0; i < obsResultMatrix.length; i++) {
+         // assertArrayEquals reports the first differing element instead of a bare true/false
+         Assert.assertArrayEquals("Arrays are not equal", obsResultMatrix[i], gpuResultMatrix[i]);
+      }
+
+      // Visually compare/third-party tool compare if desired
+      if (LOG.isTraceEnabled()) {
+         LOG.trace("obsResultMatrix length: " + obsResultMatrix.length);
+         LOG.trace("gpuResultMatrix length: " + gpuResultMatrix.length);
+
+         // Create the output directory on demand so a fresh checkout does not fail with FileNotFoundException
+         final File traceDir = new File(System.getProperty("user.dir"), "trace");
+         if (!traceDir.isDirectory() && !traceDir.mkdirs()) {
+            throw new IOException("Unable to create trace directory: " + traceDir);
+         }
+
+         writeMatrix(new File(traceDir, "cpuOut.txt"), obsResultMatrix);
+         writeMatrix(new File(traceDir, "gpuOut.txt"), gpuResultMatrix);
+      }
+   }
+
+   /**
+    * Writes a matrix to a text file, one row per line, formatted with {@link Arrays#toString(int[])}.
+    * 
+    * @param file   destination file; overwritten if it already exists
+    * @param matrix rows to write
+    * @throws IOException if the file cannot be opened or a write error occurred
+    */
+   private static void writeMatrix(final File file, final int[][] matrix) throws IOException {
+      // We're not using "try with resources" because Aparapi currently targets JDK 6
+      final PrintWriter out = new PrintWriter(file);
+
+      try {
+         for (final int[] row : matrix) {
+            out.println(Arrays.toString(row));
+         }
+
+         // PrintWriter swallows I/O errors; checkError() flushes and reports whether one happened
+         if (out.checkError()) {
+            throw new IOException("Failed to write trace file: " + file);
+         }
+      } finally {
+         out.close();
+      }
+   }
+}