diff --git a/examples/correlation-matrix/.classpath b/examples/correlation-matrix/.classpath new file mode 100644 index 0000000000000000000000000000000000000000..25515efc14d10796c4ba331b3b4fcf2ae79e4aea --- /dev/null +++ b/examples/correlation-matrix/.classpath @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<classpath> + <classpathentry kind="src" path="src/java"/> + <classpathentry kind="src" path="src/test"/> + <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/> + <classpathentry kind="lib" path="/com.amd.aparapi/dist/aparapi.jar" sourcepath="/com.amd.aparapi"> + <attributes> + <attribute name="org.eclipse.jdt.launching.CLASSPATH_ATTR_LIBRARY_PATH_ENTRY" value="com.amd.aparapi.jni/dist"/> + </attributes> + </classpathentry> + <classpathentry kind="lib" path="/third-party/apache/commons/commons-lang3-3.1.jar"/> + <classpathentry kind="lib" path="/third-party/apache/logging/log4j-1.2.16.jar"/> + <classpathentry kind="lib" path="/third-party/apache/lucene/lucene-core-3.5.0.jar"/> + <classpathentry kind="lib" path="/third-party/junit/junit-4.10.jar"/> + <classpathentry kind="output" path="classes"/> +</classpath> diff --git a/examples/correlation-matrix/.project b/examples/correlation-matrix/.project new file mode 100644 index 0000000000000000000000000000000000000000..89d60ffcfa30f47bda1bf65bbd588e765ffe5786 --- /dev/null +++ b/examples/correlation-matrix/.project @@ -0,0 +1,17 @@ +<?xml version="1.0" encoding="UTF-8"?> +<projectDescription> + <name>correlation-matrix</name> + <comment></comment> + <projects> + </projects> + <buildSpec> + <buildCommand> + <name>org.eclipse.jdt.core.javabuilder</name> + <arguments> + </arguments> + </buildCommand> + </buildSpec> + <natures> + <nature>org.eclipse.jdt.core.javanature</nature> + </natures> +</projectDescription> diff --git a/examples/correlation-matrix/CC-4257_FINAL_060612.pdf b/examples/correlation-matrix/CC-4257_FINAL_060612.pdf new file mode 100644 index 0000000000000000000000000000000000000000..4a3aa75c7c35a321e1334dda56d5db498105bd40 Binary files /dev/null and b/examples/correlation-matrix/CC-4257_FINAL_060612.pdf differ diff --git a/examples/correlation-matrix/build.xml b/examples/correlation-matrix/build.xml new file mode 100644 index 0000000000000000000000000000000000000000..2fd6770348138c9d408e4acd8ebbae19c5ce5723 --- /dev/null +++ b/examples/correlation-matrix/build.xml @@ -0,0 +1,79 @@ +<?xml version="1.0"?> + +<project name="correlation-matrix" default="junit" basedir="."> + + <!-- + DO NOT EDIT BELOW THIS LINE + --> + <echo>OS Name: ${os.name}</echo> + <echo>OS Version: ${os.version}</echo> + <echo>OS Arch: ${os.arch}</echo> + <echo>Java Version: ${java.version}</echo> + + <target name="clean"> + <delete dir="classes"/> + <delete dir="junit"/> + <!-- Legacy cleanup --> + <delete file="junit*.jar"/> + </target> + + <path id="classpath"> + <pathelement path="${basedir}/../../com.amd.aparapi/dist/aparapi.jar"/> + <pathelement path="${basedir}/../third-party/apache/commons/commons-lang3-3.1.jar"/> + <pathelement path="${basedir}/../third-party/apache/logging/log4j-1.2.16.jar"/> + <pathelement path="${basedir}/../third-party/apache/lucene/lucene-core-3.5.0.jar"/> + <pathelement path="${basedir}/../third-party/junit/junit-4.10.jar"/> + <pathelement path="${junit.home}/${junit.jar.name}"/> + <pathelement path="classes"/> + </path> + + <target name="junit" depends="clean"> + <mkdir dir="classes"/> + <mkdir dir="junit/data"/> + + <!-- Runtime Code --> + <javac debug="true" + debuglevel="lines,vars,source" + srcdir="src/java" + destdir="classes" + includeAntRuntime="false" + classpathref="classpath"> + <compilerarg value="-Xlint"/> + <compilerarg value="-Xlint:-path"/> + </javac> + + <!-- JUnit Tests --> + <javac debug="true" + debuglevel="lines,vars,source" + srcdir="src/test" + destdir="classes" + includeAntRuntime="false" + classpathref="classpath"> + <compilerarg value="-Xlint"/> + <compilerarg value="-Xlint:-path"/> + </javac> + + <copy todir="classes" file="src/java/log4j.xml"/> + + <!-- even though fork is slower we need to set the library path and this requires fork --> + <junit printsummary="false" fork="true" haltonfailure="false" failureproperty="tests.failed" showoutput="true"> + <sysproperty key="java.library.path" value="${basedir}/../../com.amd.aparapi.jni/dist"/> + + <!-- USER DEFINED PROPERTIES --> + <sysproperty key="numRows" value="1024"/> + <sysproperty key="numColumns" value="16384"/> + <sysproperty key="useGPU" value="true"/> + + <formatter type="xml" /> + <classpath refid="classpath"/> + <batchtest todir="junit/data"> + <fileset dir="src/test"/> + </batchtest> + </junit> + + <junitreport todir="junit/data"> + <fileset dir="junit/data"/> + </junitreport> + </target> + +</project> \ No newline at end of file diff --git a/examples/correlation-matrix/src/java/gov/pnnl/aparapi/matrix/CorrMatrixHost.java b/examples/correlation-matrix/src/java/gov/pnnl/aparapi/matrix/CorrMatrixHost.java new file mode 100644 index 0000000000000000000000000000000000000000..89e3cc301c533293cdbb1554975c603a2721a322 --- /dev/null +++ b/examples/correlation-matrix/src/java/gov/pnnl/aparapi/matrix/CorrMatrixHost.java @@ -0,0 +1,364 @@ +/** + * This material was prepared as an account of work sponsored by an agency of the United States Government. + * Neither the United States Government nor the United States Department of Energy, nor Battelle, nor any of + * their employees, nor any jurisdiction or organization that has cooperated in the development of these materials, + * makes any warranty, express or implied, or assumes any legal liability or responsibility for the accuracy, + * completeness, or usefulness or any information, apparatus, product, software, or process disclosed, or represents + * that its use would not infringe privately owned rights. + */ +package gov.pnnl.aparapi.matrix; + +import org.apache.log4j.Logger; + +import com.amd.aparapi.Kernel; +import com.amd.aparapi.Kernel.EXECUTION_MODE; +import com.amd.aparapi.Range; +import com.amd.aparapi.device.Device; +import com.amd.aparapi.device.OpenCLDevice; + +/** + * GPU calculations using OpenBitSet Intersection for OpenBitSets + * + * Based on code from: <br/> + * {@link http://grepcode.com/file/repo1.maven.org/maven2/org.apache.lucene/lucene-core/3.1.0/org/apache/lucene/util/BitUtil.java} + * + * @author ryan.lamothe at gmail.com + * @author sedillard at gmail.com + */ +public class CorrMatrixHost { + + private static final Logger LOG = Logger.getLogger(CorrMatrixHost.class); + + /** + * Perform matrix intersection for two lists of Lucene OpenBitSet-based packed longs + * + * @param matrixA + * The first term-document matrix + * @param matrixB + * The second term-document matrix + * @param Aparapi EXECUTION_MODE + * @return result Matrix + * @throws Exception + */ + public static int[][] intersectionMatrix(final long[][] matrixA, final long[][] matrixB, final EXECUTION_MODE executionMode) throws Exception { + + // Basic validation + if (matrixA == null) { + throw new NullPointerException("MatrixA cannot be NULL"); + } + + if (matrixB == null) { + throw new NullPointerException("MatrixB cannot be NULL"); + } + + // Size of an array is 8 bytes for the object + 4 bytes for the header and length information + final int arrayMemOverhead = 12; + + // numDocs/64 since they are packed into longs + // We need to make our matrix sizes multiples of BLOCK_SIZE + final int matrixA_numTerms = matrixA.length; + final int matrixA_numLongs = matrixA[0].length; + + if (LOG.isDebugEnabled()) { + LOG.debug("----------"); + LOG.debug("MatrixA NumTerms (Rows): " + matrixA_numTerms); + LOG.debug("MatrixA NumLongs (Columns): " + matrixA_numLongs); + LOG.debug("MatrixA NumDocs: " + (matrixA_numLongs * 64L)); + } + + final long matrixA_BytesPerRow = matrixA_numLongs * 8L; + final long matrixA_TotalBytes = (matrixA_numTerms * matrixA_BytesPerRow) + arrayMemOverhead; + + if (LOG.isDebugEnabled()) { + LOG.debug("MatrixA Total Memory Size: " + humanReadableByteCount(matrixA_TotalBytes, true)); + } + + final int matrixB_numTerms = matrixB.length; + final int matrixB_numLongs = matrixB[0].length; + + if (LOG.isDebugEnabled()) { + LOG.debug("----------"); + LOG.debug("MatrixB NumTerms (Rows): " + matrixB_numTerms); + LOG.debug("MatrixB NumLongs (Columns): " + matrixB_numLongs); + LOG.debug("MatrixB NumDocs: " + (matrixB_numLongs * 64L)); + } + + final long matrixB_BytesPerRow = matrixB_numLongs * 8L; + final long matrixB_TotalBytes = (matrixB_numTerms * matrixB_BytesPerRow) + arrayMemOverhead; + + if (LOG.isDebugEnabled()) { + LOG.debug("MatrixB Total Memory Size: " + humanReadableByteCount(matrixB_TotalBytes, true)); + LOG.debug("----------"); + } + + final int[][] resultMatrix = new int[matrixA_numTerms][matrixB_numTerms]; + + if (LOG.isDebugEnabled()) { + final long resultMatrix_TotalBytes = (matrixA_numTerms * matrixB_numTerms * 4L) + arrayMemOverhead; + LOG.debug("ResultMatrix Memory Size: " + humanReadableByteCount(resultMatrix_TotalBytes, true)); + LOG.debug("Total Requested Memory Size: " + humanReadableByteCount(matrixA_TotalBytes + matrixB_TotalBytes + resultMatrix_TotalBytes, true)); + LOG.debug("----------"); + } + + int NUM_SUB_ROWS = matrixA_numTerms; // Default number of sub-rows + + OpenCLDevice device = null; + + // We do not test for EXECUTION_MODE.JTP because JTP is non-OpenCL + if (executionMode.equals(EXECUTION_MODE.CPU)) { + device = (OpenCLDevice) Device.firstCPU(); + + if (device == null) { + LOG.warn("OpenCLDevice.CPU is NULL...OpenCL is unavailable. Setting to JTP mode."); + LOG.debug("----------"); + } + } else if (executionMode.equals(EXECUTION_MODE.GPU)) { + device = (OpenCLDevice) Device.best(); + + if (device == null) { + LOG.warn("OpenCLDevice.GPU is NULL...OpenCL is unavailable. Setting to JTP mode."); + LOG.debug("----------"); + } + } + + // This is to create stripes of rows that will fit into OpenCL's available memory + // Calculate the number of sub-rows by calling OpenCL to find out available memory + // Length of row * 8 (size of long in bytes) * number of rows to available memory + final int maxNumTerms = Math.max(matrixA_numTerms, matrixB_numTerms); + + if (device != null) { + final long globalMemSize = device.getGlobalMemSize(); + // final long maxMemAllocSize = Math.max((globalMemSize/4), 128*1024*1024); + final long maxMemAllocSize = device.getMaxMemAllocSize(); + + // 1048576 bytes in a megabyte (1024*1024) + // Java long is 8 bytes + // 131072 longs in 1 megabyte + // SAFE OpenCL spec allocation is max(1/4 GlobalMemSize) + // ***During our testing this appears to be incorrectly/inconsistently reported depending on os/drivers/hardware*** + if (LOG.isDebugEnabled()) { + LOG.debug("Available OpenCL globalMemSize: " + humanReadableByteCount(globalMemSize, true)); + LOG.debug("Available OpenCL maxMemAllocSize: " + humanReadableByteCount(maxMemAllocSize, true)); + } + + // Maybe there is a more clever way to do this :) + // The idea here is to decide how many sub-rows of the matrix we can fit on a single card + // The long-term goal to divide up the work for both small RAM GPUs and multiple GPUs + int subRowsCounterA = 0; + int subRowsCounterB = 0; + long subRowsMemSizeA = 0L; + long subRowsMemSizeB = 0L; + long subResultMatrixMemSize = 0L; + long subTotalMemSize = 0L; + + do { + if (subRowsCounterA < matrixA_numTerms) { + subRowsMemSizeA = subRowsCounterA != 0 ? (subRowsCounterA * matrixA_numLongs * 8L) + arrayMemOverhead : 0; + subRowsCounterA += 1; + } else if (subRowsCounterA == matrixA_numTerms) { + subRowsMemSizeA = subRowsCounterA != 0 ? (subRowsCounterA * matrixA_numLongs * 8L) + arrayMemOverhead : 0; + } + + if (subRowsCounterB < matrixB_numTerms) { + subRowsMemSizeB = subRowsCounterB != 0 ? (subRowsCounterB * matrixB_numLongs * 8L) + arrayMemOverhead : 0; + subRowsCounterB += 1; + } else if (subRowsCounterB == matrixB_numTerms) { + subRowsMemSizeB = subRowsCounterB != 0 ? (subRowsCounterB * matrixB_numLongs * 8L) + arrayMemOverhead : 0; + } + + // This is 4 bytes since the sub-result matrix is an int array + subResultMatrixMemSize = ((subRowsCounterA * subRowsCounterB) * 4L) + arrayMemOverhead; + + subTotalMemSize = subRowsMemSizeA + subRowsMemSizeB + subResultMatrixMemSize; + } while ((Math.max(subRowsCounterA, subRowsCounterB) < maxNumTerms) && (subTotalMemSize <= maxMemAllocSize)); + + // If using OpenCL override the default number of subrows + NUM_SUB_ROWS = Math.max(subRowsCounterA, subRowsCounterB); + + if (NUM_SUB_ROWS < maxNumTerms) { + final long subMatrixA_memSize = (NUM_SUB_ROWS * matrixA_numLongs * 8L) + arrayMemOverhead; + final long subMatrixB_memSize = (NUM_SUB_ROWS * matrixB_numLongs * 8L) + arrayMemOverhead; + final long subResultMatrix_memSize = (NUM_SUB_ROWS * NUM_SUB_ROWS * 4L) + arrayMemOverhead; + + LOG.warn("****************************************************************"); + LOG.warn("Requested matrix computation is larger than available OpenCL memory"); + LOG.warn("Matrix striping is occurring to fit all data into OpenCL memory..."); + LOG.warn(""); + LOG.warn("Number rows requested: " + maxNumTerms); + LOG.warn("Number rows that fit: " + NUM_SUB_ROWS); + LOG.warn(""); + LOG.warn("SubMatrixA Memory Size: " + humanReadableByteCount(subMatrixA_memSize, true)); + LOG.warn("SubMatrixB Memory Size: " + humanReadableByteCount(subMatrixB_memSize, true)); + LOG.warn("SubResultMatrix Memory Size: " + humanReadableByteCount(subResultMatrix_memSize, true)); + LOG.warn("SubMatrix Total Memory Size: " + humanReadableByteCount(subMatrixA_memSize + subMatrixB_memSize + subResultMatrix_memSize, true)); + LOG.warn("****************************************************************"); + } + } + + final int numSubBlocksA = ((matrixA_numTerms + NUM_SUB_ROWS) - 1) / NUM_SUB_ROWS; + final int numSubBlocksB = ((matrixB_numTerms + NUM_SUB_ROWS) - 1) / NUM_SUB_ROWS; + + final long[] subMatrixA = new long[NUM_SUB_ROWS * matrixA_numLongs]; + final long[] subMatrixB = new long[NUM_SUB_ROWS * matrixB_numLongs]; + final int[] subResultMatrix = new int[NUM_SUB_ROWS * NUM_SUB_ROWS]; + + final CorrMatrixKernel kernel = new CorrMatrixKernel(subMatrixA, NUM_SUB_ROWS, subMatrixB, NUM_SUB_ROWS, matrixA_numLongs, subResultMatrix); + kernel.setExplicit(true); + + // Here we define a fall-back strategy, since the user may have wanted to execute only a single execution mode + if (executionMode.equals(EXECUTION_MODE.GPU) && (device != null)) { + kernel.addExecutionModes(EXECUTION_MODE.GPU, EXECUTION_MODE.CPU, EXECUTION_MODE.JTP); + LOG.debug("Execution Fallback Strategy: GPU --> CPU --> JTP"); + } else if (executionMode.equals(EXECUTION_MODE.CPU) && (device != null)) { + kernel.addExecutionModes(EXECUTION_MODE.CPU, EXECUTION_MODE.JTP); + LOG.debug("Execution Fallback Strategy: CPU --> JTP"); + } else { + kernel.addExecutionModes(EXECUTION_MODE.JTP); + LOG.debug("Execution Strategy: JTP"); + } + + try { + for (int a = 0; a < numSubBlocksA; a++) { + for (int b = 0; b < numSubBlocksB; b++) { + final int aSubRowStart = a * NUM_SUB_ROWS; + final int aSubRowEnd = Math.min(matrixA_numTerms, aSubRowStart + NUM_SUB_ROWS); + + for (int i = aSubRowStart; i < aSubRowEnd; i++) { + if (matrixA_numLongs != matrixA[i].length) { + throw new Exception("All rows in the matrix need be the same length"); + } + + System.arraycopy(matrixA[i], 0, subMatrixA, (i - aSubRowStart) * matrixA_numLongs, matrixA_numLongs); + } + + final int bSubRowStart = b * NUM_SUB_ROWS; + final int bSubRowEnd = Math.min(matrixB_numTerms, bSubRowStart + NUM_SUB_ROWS); + + for (int i = bSubRowStart; i < bSubRowEnd; i++) { + if (matrixA_numLongs != matrixB[i].length) { + throw new Exception("All rows in the matrix need be the same length"); + } + + System.arraycopy(matrixB[i], 0, subMatrixB, (i - bSubRowStart) * matrixB_numLongs, matrixB_numLongs); + } + + // Since matrixA_NumLongs == matrixB_NumLongs we're only going to pass matrixA_NumLongs + executeKernel(device, subMatrixA, aSubRowEnd - aSubRowStart, subMatrixB, bSubRowEnd - bSubRowStart, matrixA_numLongs, subResultMatrix, kernel); + + // Convert one dimensional array to two dimensional array in the expected output ordering + for (int i = 0; i < NUM_SUB_ROWS; i++) { + if ((i + aSubRowStart) < aSubRowEnd) { + System.arraycopy(subResultMatrix, i * NUM_SUB_ROWS, resultMatrix[i + aSubRowStart], bSubRowStart, bSubRowEnd - bSubRowStart); + } + } + } + } + } finally { + if (LOG.isDebugEnabled()) { + LOG.debug("----------"); + LOG.debug("Aparapi Gross Execution Time: " + kernel.getAccumulatedExecutionTime() + " ms <------ Aparapi"); + LOG.debug("OpenCL Generation Time: " + kernel.getConversionTime() + " ms"); + LOG.debug("Kernel Net Execution Time: " + (kernel.getAccumulatedExecutionTime() - kernel.getConversionTime()) + " ms"); + LOG.debug("----------"); + } + + try { + kernel.dispose(); + } catch (final UnsatisfiedLinkError e) { + LOG.error("Aparapi failed to dispose of the kernel", e); + } + } + + return resultMatrix; + } + + /** + * Execute the GPU kernel + * + * @param subMatrixA + * @param matrixA_NumTerms + * @param subMatrixB + * @param matrixB_NumTerms + * @param numLongs + * @param subResultMatrix + * @param kernel + * + * @return resultMatrix + */ + private static void executeKernel(final Device device, final long[] subMatrixA, final int matrixA_NumTerms, final long[] subMatrixB, final int matrixB_NumTerms, final int numLongs, final int[] subResultMatrix, final Kernel kernel) { + + // Power of Two for best performance + int matrixA_NumTermsRnd = matrixA_NumTerms; + while (!isPowerOfTwo(matrixA_NumTermsRnd)) { + matrixA_NumTermsRnd += 1; + } + + int matrixB_NumTermsRnd = matrixB_NumTerms; + while (!isPowerOfTwo(matrixB_NumTermsRnd)) { + matrixB_NumTermsRnd += 1; + } + + final Range range; + if (device != null) { + range = Range.create2D(device, matrixA_NumTermsRnd, matrixB_NumTermsRnd); + } else { + range = Range.create2D(matrixA_NumTermsRnd, matrixB_NumTermsRnd); + } + + if (LOG.isDebugEnabled()) { + LOG.debug("Range: " + range); + } + + kernel.put(subMatrixA); + kernel.put(subMatrixB); + kernel.put(subResultMatrix); + + kernel.execute(range); + + kernel.get(subResultMatrix); + } + + /** + * Highly efficient means to compute whether a number is a power of 2<br> + * Based on code from http://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2 + * <p> + * Another very cool way to do this is ((x&(-x))==x) + * + * @param n + * @return boolean + */ + private static boolean isPowerOfTwo(int n) { + return (n > 0) && ((n & (n - 1)) == 0); + } + + /** + * Rounds a number to the multiple indicated + * + * @param num + * @param multiple + * @return + */ + private static int roundToMultiple(double num, int multiple) { + return (int) (Math.ceil(num / multiple) * multiple); + } + + /** + * Very nice means to convert byte sizes into human readable format<br> + * Based on code from http://stackoverflow.com/questions/3758606/how-to-convert-byte-size-into-human-readable-format-in-java + * <p> + * + * @param bytes + * @param si + * @return humanReadableByteCount + */ + private static String humanReadableByteCount(long bytes, boolean si) { + final int unit = si ? 1000 : 1024; + if (bytes < unit) { + return bytes + " B"; + } + final int exp = (int) (Math.log(bytes) / Math.log(unit)); + final String pre = (si ? "kMGTPE" : "KMGTPE").charAt(exp - 1) + (si ? "" : "i"); + + return String.format("%.1f %sB", bytes / Math.pow(unit, exp), pre); + } +} diff --git a/examples/correlation-matrix/src/java/gov/pnnl/aparapi/matrix/CorrMatrixKernel.java b/examples/correlation-matrix/src/java/gov/pnnl/aparapi/matrix/CorrMatrixKernel.java new file mode 100644 index 0000000000000000000000000000000000000000..2b95ed01fbf386c1bbdf915e76b3fff9dbb38416 --- /dev/null +++ b/examples/correlation-matrix/src/java/gov/pnnl/aparapi/matrix/CorrMatrixKernel.java @@ -0,0 +1,261 @@ +/** + * This material was prepared as an account of work sponsored by an agency of the United States Government. + * Neither the United States Government nor the United States Department of Energy, nor Battelle, nor any of + * their employees, nor any jurisdiction or organization that has cooperated in the development of these materials, + * makes any warranty, express or implied, or assumes any legal liability or responsibility for the accuracy, + * completeness, or usefulness or any information, apparatus, product, software, or process disclosed, or represents + * that its use would not infringe privately owned rights. + */ +package gov.pnnl.aparapi.matrix; + +import com.amd.aparapi.Kernel; + +/** + * This kernel attempts to re-implement the Lucene OpenBitSet functionality on a GPU + * + * Based on code from: <br/> + * {@link http://grepcode.com/file/repo1.maven.org/maven2/org.apache.lucene/lucene-core/3.1.0/org/apache/lucene/util/BitUtil.java} + * + * @author ryan.lamothe at gmail.com + * @author sedillard at gmail.com + */ +public class CorrMatrixKernel extends Kernel { + + final long[] matrixA; + + final int matrixA_NumTerms; + + final long[] matrixB; + + final int matrixB_NumTerms; + + int numLongs; + + int[] resultMatrix; + + /** + * Default constructor + */ + public CorrMatrixKernel(final long[] matrixA, final int matrixA_NumTerms, final long[] matrixB, final int matrixB_NumTerms, + final int numLongs, final int[] resultMatrix) { + this.matrixA = matrixA; + this.matrixA_NumTerms = matrixA_NumTerms; + this.matrixB = matrixB; + this.matrixB_NumTerms = matrixB_NumTerms; + this.numLongs = numLongs; + this.resultMatrix = resultMatrix; + } + + @Override + public void run() { + final int i = this.getGlobalId(0); + + if (i < matrixA_NumTerms) { + final int j = this.getGlobalId(1); + + if (j < matrixB_NumTerms) { + // For testing purposes, you can use the naive implementation to compare performance + resultMatrix[(i * matrixB_NumTerms) + j] = pop_intersect(matrixA, i * numLongs, matrixB, j * numLongs, numLongs); + // this.resultMatrix[i * matrixB_NumTerms + j] = this.naive_pop_intersect(matrixA, i * numLongs, matrixB, j * numLongs, numLongs); + } + } + } + + /** + * A naive implementation of the pop_array code below + */ + private int naive_pop_intersect(final long matrixA[], final int aStart, final long matrixB[], final int bStart, final int numWords) { + int sum = 0; + + for (int i = 0; i < numWords; i++) { + sum += pop(matrixA[aStart + i] & matrixB[bStart + i]); + } + + return sum; + } + + /** + * Returns the popcount or cardinality of the two sets after an intersection. + * Neither array is modified. + * + * Modified for the purposes of this kernel from its original version + */ + private int pop_intersect(final long matrixA[], final int aStart, final long matrixB[], final int bStart, final int numWords) { + + /* + * http://grepcode.com/file/repo1.maven.org/maven2/org.apache.lucene/lucene-core/3.1.0/org/apache/lucene/util/BitUtil.java + */ + + // generated from pop_array via sed 's/A\[\([^]]*\)\]/\(A[\1] \& B[\1]\)/g' + final int n = numWords; + int tot = 0, tot8 = 0; + long ones = 0, twos = 0, fours = 0; + + int i; + for (i = 0; i <= (n - 8); i += 8) { + long twosA = 0; + long twosB = 0; + long foursA = 0; + long foursB = 0; + long eights = 0; + + final int ai = aStart + i; + final int bi = bStart + i; + + // CSA(twosA, ones, ones, (A[i] & B[i]), (A[i+1] & B[i+1])) + { + final long b = matrixA[ai] & matrixB[bi], c = matrixA[ai + 1] & matrixB[bi + 1]; + final long u = ones ^ b; + twosA = (ones & b) | (u & c); + ones = u ^ c; + } + + // CSA(twosB, ones, ones, (A[i+2] & B[i+2]), (A[i+3] & B[i+3])) + { + final long b = matrixA[ai + 2] & matrixB[bi + 2], c = matrixA[ai + 3] & matrixB[bi + 3]; + final long u = ones ^ b; + twosB = (ones & b) | (u & c); + ones = u ^ c; + } + + // CSA(foursA, twos, twos, twosA, twosB) + { + final long u = twos ^ twosA; + foursA = (twos & twosA) | (u & twosB); + twos = u ^ twosB; + } + + // CSA(twosA, ones, ones, (A[i+4] & B[i+4]), (A[i+5] & B[i+5])) + { + final long b = matrixA[ai + 4] & matrixB[bi + 4], c = matrixA[ai + 5] & matrixB[bi + 5]; + final long u = ones ^ b; + twosA = (ones & b) | (u & c); + ones = u ^ c; + } + + // CSA(twosB, ones, ones, (A[i+6] & B[i+6]), (A[i+7] & B[i+7])) + { + final long b = matrixA[ai + 6] & matrixB[bi + 6], c = matrixA[ai + 7] & matrixB[bi + 7]; + final long u = ones ^ b; + twosB = (ones & b) | (u & c); + ones = u ^ c; + } + + // CSA(foursB, twos, twos, twosA, twosB) + { + final long u = twos ^ twosA; + foursB = (twos & twosA) | (u & twosB); + twos = u ^ twosB; + } + + // CSA(eights, fours, fours, foursA, foursB) + { + final long u = fours ^ foursA; + eights = (fours & foursA) | (u & foursB); + fours = u ^ foursB; + } + + tot8 += pop(eights); + } + + if (i <= (n - 4)) { + final int ai = aStart + i; + final int bi = bStart + i; + + long twosA = 0; + long twosB = 0; + long foursA = 0; + long eights = 0; + + { + final long b = matrixA[ai] & matrixB[bi], c = matrixA[ai + 1] & matrixB[bi + 1]; + final long u = ones ^ b; + twosA = (ones & b) | (u & c); + ones = u ^ c; + } + + { + final long b = matrixA[ai + 2] & matrixB[bi + 2], c = matrixA[ai + 3] & matrixB[bi + 3]; + final long u = ones ^ b; + twosB = (ones & b) | (u & c); + ones = u ^ c; + } + + { + final long u = twos ^ twosA; + foursA = (twos & twosA) | (u & twosB); + twos = u ^ twosB; + } + + eights = fours & foursA; + fours = fours ^ foursA; + + tot8 += pop(eights); + i += 4; + } + + if (i <= (n - 2)) { + final int ai = aStart + i; + final int bi = bStart + i; + + final long b = matrixA[ai] & matrixB[bi], c = matrixA[ai + 1] & matrixB[bi + 1]; + final long u = ones ^ b; + final long twosA = (ones & b) | (u & c); + ones = u ^ c; + + final long foursA = twos & twosA; + twos = twos ^ twosA; + + final long eights = fours & foursA; + fours = fours ^ foursA; + + tot8 += pop(eights); + i += 2; + } + + if (i < n) { + final int ai = aStart + i; + final int bi = bStart + i; + + tot += pop(matrixA[ai] & matrixB[bi]); + } + + tot += (pop(fours) << 2) + (pop(twos) << 1) + pop(ones) + (tot8 << 3); + + return tot; + } + + /** + * Returns the number of bits set in the long + */ + private int pop(long x) { + + /* + * http://grepcode.com/file/repo1.maven.org/maven2/org.apache.lucene/lucene-core/3.1.0/org/apache/lucene/util/BitUtil.java + */ + + /* + * Hacker's Delight 32 bit pop function: + * http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.c.txt + * + * int pop(unsigned x) { + * x = x - ((x >> 1) & 0x55555555); + * x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + * x = (x + (x >> 4)) & 0x0F0F0F0F; + * x = x + (x >> 8); + * x = x + (x >> 16); + * return x & 0x0000003F; + * } + * * + */ + + // 64 bit java version of the C function from above + x = x - ((x >>> 1) & 0x5555555555555555L); + x = (x & 0x3333333333333333L) + ((x >>> 2) & 0x3333333333333333L); + x = (x + (x >>> 4)) & 0x0F0F0F0F0F0F0F0FL; + x = x + (x >>> 8); + x = x + (x >>> 16); + x = x + (x >>> 32); + return (int) x & 0x7F; + } +} diff --git a/examples/correlation-matrix/src/java/log4j.xml b/examples/correlation-matrix/src/java/log4j.xml new file mode 100644 index 0000000000000000000000000000000000000000..03aa30df4cf8828ee3fd07f56e5b8e1db577228c --- /dev/null +++ b/examples/correlation-matrix/src/java/log4j.xml @@ -0,0 +1,59 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd"> + +<!-- + | For more configuration information and examples see the Jakarta Log4j + | website: http://jakarta.apache.org/log4j + --> + +<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/" debug="false"> + + <!-- ============================== --> + <!-- Append messages to the console --> + <!-- ============================== --> + + <appender name="CONSOLE" class="org.apache.log4j.ConsoleAppender"> + <param name="Threshold" value="DEBUG"/> + <param name="Target" value="System.out"/> + <param name="Encoding" value="UTF-8"/> + + <layout class="org.apache.log4j.PatternLayout"> + <!-- The default pattern: Date Priority [Category] (Thread) Message\n --> + <param name="ConversionPattern" value="%d %-5p [%c{1}] %m%n"/> + + <!-- The full pattern: Date MS Priority [Category] (Thread:NDC) Message\n + <param name="ConversionPattern" value="%d %-5r %-5p [%c] (%t:%x) %m%n"/> + --> + </layout> + </appender> + + <appender name="FILE" class="org.apache.log4j.FileAppender"> + <param name="File" value="log/corrmatrix.log"/> + <param name="Append" value="true"/> + <param name="Encoding" value="UTF-8"/> + + <layout class="org.apache.log4j.PatternLayout"> + <!-- The default pattern: Date Priority [Category] (Thread) Message\n --> + <param name="ConversionPattern" value="%d %-5p [%c] %m%n"/> + + <!-- The full pattern: Date MS Priority [Category] (Thread:NDC) Message\n + <param name="ConversionPattern" value="%d %-5r %-5p [%c] (%t:%x) %m%n"/> + --> + </layout> + </appender> + + <!-- Limit categories --> + <logger name="gov.pnnl"> + <level value="DEBUG"/> + </logger> + + <!-- ======================= --> + <!-- Setup the Root category --> + <!-- ======================= --> + + <root> + <appender-ref ref="CONSOLE"/> + <!-- <appender-ref ref="FILE"/> --> + </root> + +</log4j:configuration> \ No newline at end of file diff --git a/examples/correlation-matrix/src/test/gov/pnnl/aparapi/test/CorrMatrixTest.java b/examples/correlation-matrix/src/test/gov/pnnl/aparapi/test/CorrMatrixTest.java new file mode 100644 index 0000000000000000000000000000000000000000..dfde8a6636feccd58af9bb4584dd8b69f4a21042 --- /dev/null +++ b/examples/correlation-matrix/src/test/gov/pnnl/aparapi/test/CorrMatrixTest.java @@ -0,0 +1,169 @@ +/** + * This material was prepared as an account of work sponsored by an agency of the United States Government. + * Neither the United States Government nor the United States Department of Energy, nor Battelle, nor any of + * their employees, nor any jurisdiction or organization that has cooperated in the development of these materials, + * makes any warranty, express or implied, or assumes any legal liability or responsibility for the accuracy, + * completeness, or usefulness or any information, apparatus, product, software, or process disclosed, or represents + * that its use would not infringe privately owned rights. + */ +package gov.pnnl.aparapi.test; + +import gov.pnnl.aparapi.matrix.CorrMatrixHost; + +import java.io.File; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Random; + +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.log4j.Logger; +import org.apache.lucene.util.OpenBitSet; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import com.amd.aparapi.Kernel.EXECUTION_MODE; + +/** + * This test class performs the following functions: + * + * 1) Create a randomly populated set of matrices for correlation/co-occurrence computation + * 2) Execute the CPU-based computation using Lucene OpenBitSets + * 3) Execute the GPU-based computation using Aparapi CorrMatrix host and kernel + * 4) Verify the results of OpenBitSet and CorrMatrix by comparing matrices to each other + * + * @author ryan.lamothe at gmail.com + * + */ +public class CorrMatrixTest { + + private static final Logger LOG = Logger.getLogger(CorrMatrixTest.class); + + private final List<Pair<OpenBitSet, OpenBitSet>> obsPairs = new ArrayList<Pair<OpenBitSet, OpenBitSet>>();; + + private final Random rand = new Random(); + + private int[][] obsResultMatrix; + + /** + * NumTerms and NumLongs (documents) need to be adjusted manually right now to force 'striping' to occur (see Host code for details) + */ + @Before + public void setup() throws Exception { + /* + * Populate test data + */ + LOG.debug("----------"); + LOG.debug("Populating test matrix data using settings from build.xml..."); + LOG.debug("----------"); + + final int numTerms = Integer.getInteger("numRows", 300); // # Rows + // numLongs*64 for number of actual documents since these are 'packed' longs + final int numLongs = Integer.getInteger("numColumns", 10000); // # Columns + + for (int i = 0; i < numTerms; ++i) { + final long[] bits = new long[numLongs]; + for (int j = 0; j < numLongs; ++j) { + bits[j] = rand.nextLong(); + } + + obsPairs.add(i, new ImmutablePair<OpenBitSet, OpenBitSet>(new OpenBitSet(bits, numLongs), new OpenBitSet(bits, numLongs))); + } + + /* + * OpenBitSet calculations + */ + LOG.debug("Executing OpenBitSet intersectionCount"); + + final long startTime = System.currentTimeMillis(); + + obsResultMatrix = new int[obsPairs.size()][obsPairs.size()]; + + // This is an N^2 comparison loop + // FIXME This entire loop needs to be parallelized to show an apples-to-apples comparison to Aparapi + for (int i = 0; i < obsPairs.size(); i++) { + final Pair<OpenBitSet, OpenBitSet> docFreqVector1 = obsPairs.get(i); + + for (int j = 0; j < obsPairs.size(); j++) { + final Pair<OpenBitSet, OpenBitSet> docFreqVector2 = obsPairs.get(j); + + // # of matches in both sets of documents + final int result = (int) OpenBitSet.intersectionCount(docFreqVector1.getLeft(), docFreqVector2.getRight()); + obsResultMatrix[i][j] = result; + } + } + + final long endTime = System.currentTimeMillis() - startTime; + + LOG.debug("OpenBitSet Gross Execution Time: " + endTime + " ms <------OpenBitSet"); + LOG.debug("----------"); + } + + @Test + public void testCorrelationMatrix() throws Exception { + /* + * GPU calculations + */ + LOG.debug("Executing Aparapi intersectionCount"); + + final long[][] matrixA = new long[obsPairs.size()][]; + final long[][] matrixB = new long[obsPairs.size()][]; + + // Convert OpenBitSet pairs to long primitive arrays for use with Aparapi + // TODO It would be nice if we could find a way to put the obsPairs onto the GPU directly :) + for (int i = 0; i < obsPairs.size(); i++) { + final OpenBitSet obsA = obsPairs.get(i).getLeft(); + final OpenBitSet obsB = obsPairs.get(i).getRight(); + + matrixA[i] = obsA.getBits(); + matrixB[i] = obsB.getBits(); + } + + // The reason for setting this property is because the CorrMatrix host/kernel code + // came from a GUI where a user could select "Use Hardware Acceleration" instead + // of the application forcing the setting globally on the command-line + final int[][] gpuResultMatrix; + if (Boolean.getBoolean("useGPU")) { + gpuResultMatrix = CorrMatrixHost.intersectionMatrix(matrixA, matrixB, EXECUTION_MODE.GPU); + } else { + gpuResultMatrix = CorrMatrixHost.intersectionMatrix(matrixA, matrixB, EXECUTION_MODE.CPU); + } + + // Compare the two result arrays to make sure we are generating the same output + for (int i = 0; i < obsResultMatrix.length; i++) { + Assert.assertTrue("Arrays are not equal", Arrays.equals(obsResultMatrix[i], gpuResultMatrix[i])); + } + + // Visually compare/third-party tool compare if desired + if (LOG.isTraceEnabled()) { + // We're not using "try with resources" because Aparapi currently targets JDK 6 + final PrintWriter cpuOut = new PrintWriter(new File(System.getProperty("user.dir"), "trace/cpuOut.txt")); + final PrintWriter gpuOut = new PrintWriter(new File(System.getProperty("user.dir"), "trace/gpuOut.txt")); + + try { + for (int i = 0; i < obsResultMatrix.length; i++) { + if (LOG.isTraceEnabled()) { + LOG.trace("obsResultMatrix length: " + obsResultMatrix.length); + LOG.trace("gpuResultMatrix length: " + gpuResultMatrix.length); + + cpuOut.println(Arrays.toString(obsResultMatrix[i])); + gpuOut.println(Arrays.toString(gpuResultMatrix[i])); + } + } + } finally { + if (cpuOut != null) { + cpuOut.flush(); + cpuOut.close(); + } + + if (gpuOut != null) { + gpuOut.flush(); + gpuOut.close(); + } + } + } + } +}