Skip to content
Snippets Groups Projects
Commit 4fa3ab7b authored by Ryan LaMothe's avatar Ryan LaMothe
Browse files

Correlation/co-occurrence matrix computation code -- as presented at the 2012...

Correlation/co-occurrence matrix computation code -- as presented at the 2012 AMD Fusion Developer Summit -- modified to meet the standards of the official Aparapi code base. Please see included CC-4257_FINAL_060612.pdf for in-depth information explaining how this code is intended to function.
parent 8b721b27
No related branches found
No related tags found
No related merge requests found
<?xml version="1.0" encoding="UTF-8"?>
<!--
Eclipse JDT classpath for the correlation-matrix sample.
Entries, in order: the two source folders, the JRE container, the Aparapi jar
(with its native library directory attached), the third-party jars, and the
compiler output folder.
-->
<classpath>
<!-- Runtime sources and JUnit test sources -->
<classpathentry kind="src" path="src/java"/>
<classpathentry kind="src" path="src/test"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<!-- Aparapi jar from the sibling com.amd.aparapi project; the attribute below points Eclipse at the JNI library directory -->
<classpathentry kind="lib" path="/com.amd.aparapi/dist/aparapi.jar" sourcepath="/com.amd.aparapi">
<attributes>
<attribute name="org.eclipse.jdt.launching.CLASSPATH_ATTR_LIBRARY_PATH_ENTRY" value="com.amd.aparapi.jni/dist"/>
</attributes>
</classpathentry>
<!-- Third-party libraries used by the host code and the test -->
<classpathentry kind="lib" path="/third-party/apache/commons/commons-lang3-3.1.jar"/>
<classpathentry kind="lib" path="/third-party/apache/logging/log4j-1.2.16.jar"/>
<classpathentry kind="lib" path="/third-party/apache/lucene/lucene-core-3.5.0.jar"/>
<classpathentry kind="lib" path="/third-party/junit/junit-4.10.jar"/>
<!-- Compiled classes are written here -->
<classpathentry kind="output" path="classes"/>
</classpath>
<?xml version="1.0" encoding="UTF-8"?>
<!--
Eclipse project descriptor: declares the project name "correlation-matrix",
with the JDT Java builder as its only build command and the Java nature as its only nature.
-->
<projectDescription>
<name>correlation-matrix</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.jdt.core.javanature</nature>
</natures>
</projectDescription>
File suppressed by a .gitattributes entry or the file's encoding is unsupported.
<?xml version="1.0"?>
<!--
   Ant build for the correlation-matrix sample.
   The default "junit" target cleans, compiles src/java and src/test into "classes",
   runs every test under src/test in a forked JVM and aggregates the XML results in junit/data.
-->
<project name="correlation-matrix" default="junit" basedir=".">

   <!--
      DO NOT EDIT BELOW THIS LINE
   -->
   <echo>OS Name: ${os.name}</echo>
   <echo>OS Version: ${os.version}</echo>
   <echo>OS Arch: ${os.arch}</echo>
   <echo>Java Version: ${java.version}</echo>

   <target name="clean">
      <delete dir="classes"/>
      <delete dir="junit"/>
      <!-- Legacy cleanup: the "file" attribute of delete takes a literal file name and does not
           expand wildcards, so a fileset is needed to actually match junit*.jar -->
      <delete>
         <fileset dir="${basedir}" includes="junit*.jar"/>
      </delete>
   </target>

   <path id="classpath">
      <pathelement path="${basedir}/../../com.amd.aparapi/dist/aparapi.jar"/>
      <pathelement path="${basedir}/../third-party/apache/commons/commons-lang3-3.1.jar"/>
      <pathelement path="${basedir}/../third-party/apache/logging/log4j-1.2.16.jar"/>
      <pathelement path="${basedir}/../third-party/apache/lucene/lucene-core-3.5.0.jar"/>
      <pathelement path="${basedir}/../third-party/junit/junit-4.10.jar"/>
      <pathelement path="${junit.home}/${junit.jar.name}"/>
      <pathelement path="classes"/>
   </path>

   <target name="junit" depends="clean">
      <mkdir dir="classes"/>
      <mkdir dir="junit/data"/>

      <!-- Runtime Code -->
      <javac debug="true"
             debuglevel="lines,vars,source"
             srcdir="src/java"
             destdir="classes"
             includeAntRuntime="false"
             classpathref="classpath">
         <compilerarg value="-Xlint"/>
         <compilerarg value="-Xlint:-path"/>
      </javac>

      <!-- JUnit Tests -->
      <javac debug="true"
             debuglevel="lines,vars,source"
             srcdir="src/test"
             destdir="classes"
             includeAntRuntime="false"
             classpathref="classpath">
         <compilerarg value="-Xlint"/>
         <compilerarg value="-Xlint:-path"/>
      </javac>

      <copy todir="classes" file="src/java/log4j.xml"/>

      <!-- even though fork is slower we need to set the library path and this requires fork -->
      <!-- Test failures do not stop the build; they only set the "tests.failed" property -->
      <junit printsummary="false" fork="true" haltonfailure="false" failureproperty="tests.failed" showoutput="true">
         <sysproperty key="java.library.path" value="${basedir}/../../com.amd.aparapi.jni/dist"/>

         <!-- USER DEFINED PROPERTIES -->
         <!-- Read by CorrMatrixTest: matrix rows, packed-long columns, and GPU vs CPU execution mode -->
         <sysproperty key="numRows" value="1024"/>
         <sysproperty key="numColumns" value="16384"/>
         <sysproperty key="useGPU" value="true"/>

         <formatter type="xml" />
         <classpath refid="classpath"/>
         <batchtest todir="junit/data">
            <fileset dir="src/test"/>
         </batchtest>
      </junit>

      <!-- Aggregate only the per-test result files written by the xml formatter above -->
      <junitreport todir="junit/data">
         <fileset dir="junit/data">
            <include name="TEST-*.xml"/>
         </fileset>
      </junitreport>
   </target>
</project>
\ No newline at end of file
/**
* This material was prepared as an account of work sponsored by an agency of the United States Government.
* Neither the United States Government nor the United States Department of Energy, nor Battelle, nor any of
* their employees, nor any jurisdiction or organization that has cooperated in the development of these materials,
* makes any warranty, express or implied, or assumes any legal liability or responsibility for the accuracy,
* completeness, or usefulness or any information, apparatus, product, software, or process disclosed, or represents
* that its use would not infringe privately owned rights.
*/
package gov.pnnl.aparapi.matrix;
import org.apache.log4j.Logger;
import com.amd.aparapi.Kernel;
import com.amd.aparapi.Kernel.EXECUTION_MODE;
import com.amd.aparapi.Range;
import com.amd.aparapi.device.Device;
import com.amd.aparapi.device.OpenCLDevice;
/**
 * Host-side driver that computes a co-occurrence (intersection count) matrix for two term-document
 * matrices stored as packed longs, delegating the per-cell bit counting to {@link CorrMatrixKernel}.
 * <p>
 * When an OpenCL device is used and the data does not fit its reported allocation limit, the
 * computation is split into stripes of rows that are processed one pair at a time.
 * <p>
 * Based on code from: <br/>
 * <a href="http://grepcode.com/file/repo1.maven.org/maven2/org.apache.lucene/lucene-core/3.1.0/org/apache/lucene/util/BitUtil.java">Lucene BitUtil.java</a>
 *
 * @author ryan.lamothe at gmail.com
 * @author sedillard at gmail.com
 */
public class CorrMatrixHost {

   private static final Logger LOG = Logger.getLogger(CorrMatrixHost.class);

   // Size of an array is 8 bytes for the object + 4 bytes for the header and length information
   private static final int ARRAY_MEM_OVERHEAD = 12;

   // floor(sqrt(Integer.MAX_VALUE)): the largest stripe edge N for which an N x N int array is still int-indexable
   private static final int MAX_SQUARE_STRIPE_ROWS = 46340;

   /**
    * Perform matrix intersection for two lists of Lucene OpenBitSet-based packed longs.
    * <p>
    * Cell {@code [i][j]} of the result holds the number of bits set in both row {@code i} of
    * {@code matrixA} and row {@code j} of {@code matrixB}.
    *
    * @param matrixA
    *           The first term-document matrix; every row must have the same length
    * @param matrixB
    *           The second term-document matrix; every row must have the same length as the rows of {@code matrixA}
    * @param executionMode
    *           The requested Aparapi execution mode. GPU and CPU are attempted through OpenCL and fall back
    *           to JTP when no OpenCL device is found; any other mode runs as JTP
    * @return the result matrix, sized {@code matrixA.length} x {@code matrixB.length}; if either input has no
    *         rows the (empty) result is returned without running a kernel
    * @throws NullPointerException
    *            if any argument, or any matrix row, is {@code null}
    * @throws IllegalArgumentException
    *            if the rows are not all the same length
    * @throws Exception
    *            declared for backward compatibility with existing callers
    */
   public static int[][] intersectionMatrix(final long[][] matrixA, final long[][] matrixB, final EXECUTION_MODE executionMode) throws Exception {
      // Basic validation
      if (matrixA == null) {
         throw new NullPointerException("MatrixA cannot be NULL");
      }

      if (matrixB == null) {
         throw new NullPointerException("MatrixB cannot be NULL");
      }

      if (executionMode == null) {
         throw new NullPointerException("ExecutionMode cannot be NULL");
      }

      final int matrixA_numTerms = matrixA.length;
      final int matrixB_numTerms = matrixB.length;

      // Nothing to intersect: avoids indexing row 0 of an empty matrix and a zero stripe size below
      if ((matrixA_numTerms == 0) || (matrixB_numTerms == 0)) {
         return new int[matrixA_numTerms][matrixB_numTerms];
      }

      // numDocs/64 since they are packed into longs
      // Both matrices must share one row length, so it is validated once up front and reused everywhere
      if (matrixA[0] == null) {
         throw new NullPointerException("Matrix rows cannot be NULL");
      }
      final int numLongs = matrixA[0].length;
      checkRowLengths(matrixA, numLongs);
      checkRowLengths(matrixB, numLongs);

      final long matrixA_TotalBytes = matrixBytes(matrixA_numTerms, numLongs);
      final long matrixB_TotalBytes = matrixBytes(matrixB_numTerms, numLongs);

      if (LOG.isDebugEnabled()) {
         LOG.debug("----------");
         LOG.debug("MatrixA NumTerms (Rows): " + matrixA_numTerms);
         LOG.debug("MatrixA NumLongs (Columns): " + numLongs);
         LOG.debug("MatrixA NumDocs: " + (numLongs * 64L));
         LOG.debug("MatrixA Total Memory Size: " + humanReadableByteCount(matrixA_TotalBytes, true));
         LOG.debug("----------");
         LOG.debug("MatrixB NumTerms (Rows): " + matrixB_numTerms);
         LOG.debug("MatrixB NumLongs (Columns): " + numLongs);
         LOG.debug("MatrixB NumDocs: " + (numLongs * 64L));
         LOG.debug("MatrixB Total Memory Size: " + humanReadableByteCount(matrixB_TotalBytes, true));
         LOG.debug("----------");
      }

      final int[][] resultMatrix = new int[matrixA_numTerms][matrixB_numTerms];

      if (LOG.isDebugEnabled()) {
         final long resultMatrix_TotalBytes = resultBytes(matrixA_numTerms, matrixB_numTerms);
         LOG.debug("ResultMatrix Memory Size: " + humanReadableByteCount(resultMatrix_TotalBytes, true));
         LOG.debug("Total Requested Memory Size: " + humanReadableByteCount(matrixA_TotalBytes + matrixB_TotalBytes + resultMatrix_TotalBytes, true));
         LOG.debug("----------");
      }

      OpenCLDevice device = null;

      // We do not test for EXECUTION_MODE.JTP because JTP is non-OpenCL
      if (executionMode == EXECUTION_MODE.CPU) {
         device = (OpenCLDevice) Device.firstCPU();

         if (device == null) {
            LOG.warn("OpenCLDevice.CPU is NULL...OpenCL is unavailable. Setting to JTP mode.");
            LOG.debug("----------");
         }
      } else if (executionMode == EXECUTION_MODE.GPU) {
         device = (OpenCLDevice) Device.best();

         if (device == null) {
            LOG.warn("OpenCLDevice.GPU is NULL...OpenCL is unavailable. Setting to JTP mode.");
            LOG.debug("----------");
         }
      }

      final int maxNumTerms = Math.max(matrixA_numTerms, matrixB_numTerms);

      // The flattened stripe buffers are indexed with ints, so a stripe may never hold more rows than that allows
      final int addressableRows = maxAddressableRows(numLongs);

      // Default number of sub-rows (used as-is when no OpenCL device is available)
      int numSubRows = Math.min(matrixA_numTerms, addressableRows);

      // This is to create stripes of rows that will fit into OpenCL's available memory
      if (device != null) {
         final long globalMemSize = device.getGlobalMemSize();
         final long maxMemAllocSize = device.getMaxMemAllocSize();

         // ***During our testing this appears to be incorrectly/inconsistently reported depending on os/drivers/hardware***
         if (LOG.isDebugEnabled()) {
            LOG.debug("Available OpenCL globalMemSize: " + humanReadableByteCount(globalMemSize, true));
            LOG.debug("Available OpenCL maxMemAllocSize: " + humanReadableByteCount(maxMemAllocSize, true));
         }

         // If using OpenCL override the default number of subrows
         // The long-term goal is to divide up the work for both small RAM GPUs and multiple GPUs
         numSubRows = rowsThatFit(Math.min(maxNumTerms, addressableRows), numLongs, maxMemAllocSize);

         if (numSubRows < maxNumTerms) {
            final long subMatrixA_memSize = matrixBytes(numSubRows, numLongs);
            final long subMatrixB_memSize = matrixBytes(numSubRows, numLongs);
            final long subResultMatrix_memSize = resultBytes(numSubRows, numSubRows);

            LOG.warn("****************************************************************");
            LOG.warn("Requested matrix computation is larger than available OpenCL memory");
            LOG.warn("Matrix striping is occurring to fit all data into OpenCL memory...");
            LOG.warn("");
            LOG.warn("Number rows requested: " + maxNumTerms);
            LOG.warn("Number rows that fit: " + numSubRows);
            LOG.warn("");
            LOG.warn("SubMatrixA Memory Size: " + humanReadableByteCount(subMatrixA_memSize, true));
            LOG.warn("SubMatrixB Memory Size: " + humanReadableByteCount(subMatrixB_memSize, true));
            LOG.warn("SubResultMatrix Memory Size: " + humanReadableByteCount(subResultMatrix_memSize, true));
            LOG.warn("SubMatrix Total Memory Size: " + humanReadableByteCount(subMatrixA_memSize + subMatrixB_memSize + subResultMatrix_memSize, true));
            LOG.warn("****************************************************************");
         }
      }

      final int numSubBlocksA = ceilDiv(matrixA_numTerms, numSubRows);
      final int numSubBlocksB = ceilDiv(matrixB_numTerms, numSubRows);

      // Flattened, row-major stripe buffers shared with the kernel; they are refilled for every stripe
      final long[] subMatrixA = new long[numSubRows * numLongs];
      final long[] subMatrixB = new long[numSubRows * numLongs];
      final int[] subResultMatrix = new int[numSubRows * numSubRows];

      final CorrMatrixKernel kernel = new CorrMatrixKernel(subMatrixA, numSubRows, subMatrixB, numSubRows, numLongs, subResultMatrix);
      kernel.setExplicit(true);

      // Here we define a fall-back strategy, since the user may have wanted to execute only a single execution mode
      if ((executionMode == EXECUTION_MODE.GPU) && (device != null)) {
         kernel.addExecutionModes(EXECUTION_MODE.GPU, EXECUTION_MODE.CPU, EXECUTION_MODE.JTP);
         LOG.debug("Execution Fallback Strategy: GPU --> CPU --> JTP");
      } else if ((executionMode == EXECUTION_MODE.CPU) && (device != null)) {
         kernel.addExecutionModes(EXECUTION_MODE.CPU, EXECUTION_MODE.JTP);
         LOG.debug("Execution Fallback Strategy: CPU --> JTP");
      } else {
         kernel.addExecutionModes(EXECUTION_MODE.JTP);
         LOG.debug("Execution Strategy: JTP");
      }

      try {
         for (int a = 0; a < numSubBlocksA; a++) {
            final int aSubRowStart = a * numSubRows;
            final int aSubRowEnd = aSubRowStart + Math.min(numSubRows, matrixA_numTerms - aSubRowStart);

            // The A stripe does not depend on b, so it is copied once per outer iteration
            for (int i = aSubRowStart; i < aSubRowEnd; i++) {
               System.arraycopy(matrixA[i], 0, subMatrixA, (i - aSubRowStart) * numLongs, numLongs);
            }

            for (int b = 0; b < numSubBlocksB; b++) {
               final int bSubRowStart = b * numSubRows;
               final int bSubRowEnd = bSubRowStart + Math.min(numSubRows, matrixB_numTerms - bSubRowStart);

               for (int i = bSubRowStart; i < bSubRowEnd; i++) {
                  System.arraycopy(matrixB[i], 0, subMatrixB, (i - bSubRowStart) * numLongs, numLongs);
               }

               executeKernel(device, subMatrixA, aSubRowEnd - aSubRowStart, subMatrixB, bSubRowEnd - bSubRowStart, numLongs, subResultMatrix, kernel);

               // Convert one dimensional array to two dimensional array in the expected output ordering
               // The kernel was constructed with numSubRows columns, so that is the row stride of subResultMatrix
               for (int i = aSubRowStart; i < aSubRowEnd; i++) {
                  System.arraycopy(subResultMatrix, (i - aSubRowStart) * numSubRows, resultMatrix[i], bSubRowStart, bSubRowEnd - bSubRowStart);
               }
            }
         }
      } finally {
         if (LOG.isDebugEnabled()) {
            LOG.debug("----------");
            LOG.debug("Aparapi Gross Execution Time: " + kernel.getAccumulatedExecutionTime() + " ms <------ Aparapi");
            LOG.debug("OpenCL Generation Time: " + kernel.getConversionTime() + " ms");
            LOG.debug("Kernel Net Execution Time: " + (kernel.getAccumulatedExecutionTime() - kernel.getConversionTime()) + " ms");
            LOG.debug("----------");
         }

         try {
            kernel.dispose();
         } catch (final UnsatisfiedLinkError e) {
            LOG.error("Aparapi failed to dispose of the kernel", e);
         }
      }

      return resultMatrix;
   }

   /**
    * Verifies that every row of the matrix is non-null and has the expected number of packed longs.
    *
    * @param matrix
    *           The matrix to check
    * @param expectedLength
    *           The row length every row must have
    */
   private static void checkRowLengths(final long[][] matrix, final int expectedLength) {
      for (int i = 0; i < matrix.length; i++) {
         if (matrix[i] == null) {
            throw new NullPointerException("Matrix rows cannot be NULL");
         }

         if (matrix[i].length != expectedLength) {
            throw new IllegalArgumentException("All rows in the matrix need be the same length");
         }
      }
   }

   /**
    * Estimated size in bytes of a flattened matrix of longs (8 bytes per long), computed in long arithmetic.
    */
   private static long matrixBytes(final long rows, final long longsPerRow) {
      return (rows * longsPerRow * 8L) + ARRAY_MEM_OVERHEAD;
   }

   /**
    * Estimated size in bytes of a flattened result matrix of ints (4 bytes per int), computed in long arithmetic.
    */
   private static long resultBytes(final long rows, final long columns) {
      return (rows * columns * 4L) + ARRAY_MEM_OVERHEAD;
   }

   /**
    * Largest stripe height for which both the stripe buffers (rows * numLongs) and the square
    * sub-result buffer (rows * rows) can still be indexed with an int. Never less than 1.
    */
   private static int maxAddressableRows(final int numLongs) {
      int rows = MAX_SQUARE_STRIPE_ROWS;

      if (numLongs > 0) {
         rows = Math.min(rows, Integer.MAX_VALUE / numLongs);
      }

      return Math.max(1, rows);
   }

   /**
    * Finds the largest stripe height, between 1 and {@code limit}, whose two stripe buffers plus the
    * square sub-result buffer together stay within {@code budgetBytes}. One row is always returned,
    * even if it alone exceeds the budget, so that the computation can still be attempted.
    */
   private static int rowsThatFit(final int limit, final int numLongs, final long budgetBytes) {
      int rows = 1;

      // The estimate grows monotonically with the row count, so stop at the first count that no longer fits
      while ((rows < limit) && (stripeBytes(rows + 1, numLongs) <= budgetBytes)) {
         rows++;
      }

      return rows;
   }

   /**
    * Estimated bytes needed for one stripe pair: two stripe buffers of {@code rows} rows and a
    * {@code rows} x {@code rows} int result, matching the buffers actually allocated by the host.
    */
   private static long stripeBytes(final int rows, final int numLongs) {
      return (2L * matrixBytes(rows, numLongs)) + resultBytes(rows, rows);
   }

   /**
    * Integer division rounding up, for positive operands, without intermediate overflow.
    */
   private static int ceilDiv(final int dividend, final int divisor) {
      return (dividend / divisor) + (((dividend % divisor) == 0) ? 0 : 1);
   }

   /**
    * Execute the kernel once over the current contents of the stripe buffers.
    * <p>
    * Because the kernel runs in explicit mode, all three buffers are pushed before execution and
    * only the result buffer is fetched back afterwards.
    *
    * @param device
    *           The OpenCL device to build the range for, or {@code null} to let Aparapi choose
    * @param subMatrixA
    *           Flattened stripe of the first matrix
    * @param matrixA_NumTerms
    *           Number of valid rows in {@code subMatrixA}
    * @param subMatrixB
    *           Flattened stripe of the second matrix
    * @param matrixB_NumTerms
    *           Number of valid rows in {@code subMatrixB}
    * @param numLongs
    *           Number of packed longs per row (not used here; the kernel already holds it)
    * @param subResultMatrix
    *           Flattened buffer that receives the kernel output
    * @param kernel
    *           The kernel to execute
    */
   private static void executeKernel(final Device device, final long[] subMatrixA, final int matrixA_NumTerms, final long[] subMatrixB, final int matrixB_NumTerms, final int numLongs, final int[] subResultMatrix, final Kernel kernel) {
      // Power of Two for best performance
      final int matrixA_NumTermsRnd = nextPowerOfTwo(matrixA_NumTerms);
      final int matrixB_NumTermsRnd = nextPowerOfTwo(matrixB_NumTerms);

      final Range range;
      if (device != null) {
         range = Range.create2D(device, matrixA_NumTermsRnd, matrixB_NumTermsRnd);
      } else {
         range = Range.create2D(matrixA_NumTermsRnd, matrixB_NumTermsRnd);
      }

      if (LOG.isDebugEnabled()) {
         LOG.debug("Range: " + range);
      }

      kernel.put(subMatrixA);
      kernel.put(subMatrixB);
      kernel.put(subResultMatrix);

      kernel.execute(range);

      kernel.get(subResultMatrix);
   }

   /**
    * Returns the smallest power of two that is greater than or equal to {@code n} (1 for {@code n <= 1}).
    *
    * @param n
    *           The value to round up
    * @return the rounded value
    * @throws IllegalArgumentException
    *            if the result would not fit in an int
    */
   private static int nextPowerOfTwo(final int n) {
      if (n <= 1) {
         return 1;
      }

      if (n > (1 << 30)) {
         throw new IllegalArgumentException("No int power of two is >= " + n);
      }

      // highestOneBit(n - 1) is the largest power of two strictly below n, or n / 2 when n is itself a power of two
      return Integer.highestOneBit(n - 1) << 1;
   }

   /**
    * Very nice means to convert byte sizes into human readable format<br>
    * Based on code from http://stackoverflow.com/questions/3758606/how-to-convert-byte-size-into-human-readable-format-in-java
    *
    * @param bytes
    *           The number of bytes to format
    * @param si
    *           {@code true} for decimal units (1000, "kB"), {@code false} for binary units (1024, "KiB")
    * @return the formatted size, e.g. "1.5 kB"; values below one unit are printed as plain bytes
    */
   private static String humanReadableByteCount(long bytes, boolean si) {
      final int unit = si ? 1000 : 1024;
      if (bytes < unit) {
         return bytes + " B";
      }

      // exp selects the prefix: 1 = kilo, 2 = mega, ...
      final int exp = (int) (Math.log(bytes) / Math.log(unit));
      final String pre = (si ? "kMGTPE" : "KMGTPE").charAt(exp - 1) + (si ? "" : "i");

      return String.format("%.1f %sB", bytes / Math.pow(unit, exp), pre);
   }
}
/**
* This material was prepared as an account of work sponsored by an agency of the United States Government.
* Neither the United States Government nor the United States Department of Energy, nor Battelle, nor any of
* their employees, nor any jurisdiction or organization that has cooperated in the development of these materials,
* makes any warranty, express or implied, or assumes any legal liability or responsibility for the accuracy,
* completeness, or usefulness or any information, apparatus, product, software, or process disclosed, or represents
* that its use would not infringe privately owned rights.
*/
package gov.pnnl.aparapi.matrix;
import com.amd.aparapi.Kernel;
/**
* This kernel attempts to re-implement the Lucene OpenBitSet functionality on a GPU.
* <p>
* Each work item computes one cell of the result: the number of bits set in both
* row i of matrixA and row j of matrixB, where both matrices are flattened row-major
* arrays of packed longs.
*
* Based on code from: <br/>
* <a href="http://grepcode.com/file/repo1.maven.org/maven2/org.apache.lucene/lucene-core/3.1.0/org/apache/lucene/util/BitUtil.java">Lucene BitUtil.java</a>
*
* @author ryan.lamothe at gmail.com
* @author sedillard at gmail.com
*/
public class CorrMatrixKernel extends Kernel {
// Flattened first matrix: row i starts at index i * numLongs
final long[] matrixA;
// Upper bound on the row index i processed by run()
final int matrixA_NumTerms;
// Flattened second matrix: row j starts at index j * numLongs
final long[] matrixB;
// Upper bound on the row index j processed by run(); also the row stride of resultMatrix
final int matrixB_NumTerms;
// Number of packed longs per matrix row
int numLongs;
// Flattened output: cell (i, j) is stored at index i * matrixB_NumTerms + j
int[] resultMatrix;
/**
* Creates a kernel bound to the given input and output buffers.
*
* @param matrixA
*           Flattened first matrix
* @param matrixA_NumTerms
*           Number of rows of matrixA to process
* @param matrixB
*           Flattened second matrix
* @param matrixB_NumTerms
*           Number of rows of matrixB to process
* @param numLongs
*           Number of packed longs per row
* @param resultMatrix
*           Flattened buffer that receives the intersection counts
*/
public CorrMatrixKernel(final long[] matrixA, final int matrixA_NumTerms, final long[] matrixB, final int matrixB_NumTerms,
final int numLongs, final int[] resultMatrix) {
this.matrixA = matrixA;
this.matrixA_NumTerms = matrixA_NumTerms;
this.matrixB = matrixB;
this.matrixB_NumTerms = matrixB_NumTerms;
this.numLongs = numLongs;
this.resultMatrix = resultMatrix;
}
/**
* Computes one result cell per work item; global id 0 selects the matrixA row and
* global id 1 the matrixB row. Ids outside the configured row counts do nothing.
*/
@Override
public void run() {
final int i = this.getGlobalId(0);
if (i < matrixA_NumTerms) {
final int j = this.getGlobalId(1);
if (j < matrixB_NumTerms) {
// For testing purposes, you can use the naive implementation to compare performance
resultMatrix[(i * matrixB_NumTerms) + j] = pop_intersect(matrixA, i * numLongs, matrixB, j * numLongs, numLongs);
// this.resultMatrix[i * matrixB_NumTerms + j] = this.naive_pop_intersect(matrixA, i * numLongs, matrixB, j * numLongs, numLongs);
}
}
}
/**
* A naive implementation of the pop_intersect code below: sums the popcount of the
* AND of each pair of words. Only referenced from the commented-out line in run().
*/
private int naive_pop_intersect(final long matrixA[], final int aStart, final long matrixB[], final int bStart, final int numWords) {
int sum = 0;
for (int i = 0; i < numWords; i++) {
sum += pop(matrixA[aStart + i] & matrixB[bStart + i]);
}
return sum;
}
/**
* Returns the popcount or cardinality of the two sets after an intersection.
* Neither array is modified.
*
* Modified for the purposes of this kernel from its original version: each array
* is read from its own start offset for numWords words.
*/
private int pop_intersect(final long matrixA[], final int aStart, final long matrixB[], final int bStart, final int numWords) {
/*
* http://grepcode.com/file/repo1.maven.org/maven2/org.apache.lucene/lucene-core/3.1.0/org/apache/lucene/util/BitUtil.java
*/
// generated from pop_array via sed 's/A\[\([^]]*\)\]/\(A[\1] \& B[\1]\)/g'
final int n = numWords;
// ones/twos/fours carry pending bits of weight 1/2/4 between steps;
// tot8 counts bits of weight 8 and is scaled by 8 in the final sum
int tot = 0, tot8 = 0;
long ones = 0, twos = 0, fours = 0;
int i;
// Main loop: consume eight words per iteration
for (i = 0; i <= (n - 8); i += 8) {
long twosA = 0;
long twosB = 0;
long foursA = 0;
long foursB = 0;
long eights = 0;
final int ai = aStart + i;
final int bi = bStart + i;
// CSA(twosA, ones, ones, (A[i] & B[i]), (A[i+1] & B[i+1]))
{
final long b = matrixA[ai] & matrixB[bi], c = matrixA[ai + 1] & matrixB[bi + 1];
final long u = ones ^ b;
twosA = (ones & b) | (u & c);
ones = u ^ c;
}
// CSA(twosB, ones, ones, (A[i+2] & B[i+2]), (A[i+3] & B[i+3]))
{
final long b = matrixA[ai + 2] & matrixB[bi + 2], c = matrixA[ai + 3] & matrixB[bi + 3];
final long u = ones ^ b;
twosB = (ones & b) | (u & c);
ones = u ^ c;
}
// CSA(foursA, twos, twos, twosA, twosB)
{
final long u = twos ^ twosA;
foursA = (twos & twosA) | (u & twosB);
twos = u ^ twosB;
}
// CSA(twosA, ones, ones, (A[i+4] & B[i+4]), (A[i+5] & B[i+5]))
{
final long b = matrixA[ai + 4] & matrixB[bi + 4], c = matrixA[ai + 5] & matrixB[bi + 5];
final long u = ones ^ b;
twosA = (ones & b) | (u & c);
ones = u ^ c;
}
// CSA(twosB, ones, ones, (A[i+6] & B[i+6]), (A[i+7] & B[i+7]))
{
final long b = matrixA[ai + 6] & matrixB[bi + 6], c = matrixA[ai + 7] & matrixB[bi + 7];
final long u = ones ^ b;
twosB = (ones & b) | (u & c);
ones = u ^ c;
}
// CSA(foursB, twos, twos, twosA, twosB)
{
final long u = twos ^ twosA;
foursB = (twos & twosA) | (u & twosB);
twos = u ^ twosB;
}
// CSA(eights, fours, fours, foursA, foursB)
{
final long u = fours ^ foursA;
eights = (fours & foursA) | (u & foursB);
fours = u ^ foursB;
}
tot8 += pop(eights);
}
// Tail: at least four words remain
if (i <= (n - 4)) {
final int ai = aStart + i;
final int bi = bStart + i;
long twosA = 0;
long twosB = 0;
long foursA = 0;
long eights = 0;
{
final long b = matrixA[ai] & matrixB[bi], c = matrixA[ai + 1] & matrixB[bi + 1];
final long u = ones ^ b;
twosA = (ones & b) | (u & c);
ones = u ^ c;
}
{
final long b = matrixA[ai + 2] & matrixB[bi + 2], c = matrixA[ai + 3] & matrixB[bi + 3];
final long u = ones ^ b;
twosB = (ones & b) | (u & c);
ones = u ^ c;
}
{
final long u = twos ^ twosA;
foursA = (twos & twosA) | (u & twosB);
twos = u ^ twosB;
}
eights = fours & foursA;
fours = fours ^ foursA;
tot8 += pop(eights);
i += 4;
}
// Tail: at least two words remain
if (i <= (n - 2)) {
final int ai = aStart + i;
final int bi = bStart + i;
final long b = matrixA[ai] & matrixB[bi], c = matrixA[ai + 1] & matrixB[bi + 1];
final long u = ones ^ b;
final long twosA = (ones & b) | (u & c);
ones = u ^ c;
final long foursA = twos & twosA;
twos = twos ^ twosA;
final long eights = fours & foursA;
fours = fours ^ foursA;
tot8 += pop(eights);
i += 2;
}
// Tail: one last word, counted directly
if (i < n) {
final int ai = aStart + i;
final int bi = bStart + i;
tot += pop(matrixA[ai] & matrixB[bi]);
}
// Weight each pending accumulator by the bit weight it represents
tot += (pop(fours) << 2) + (pop(twos) << 1) + pop(ones) + (tot8 << 3);
return tot;
}
/**
* Returns the number of bits set in the long
*/
private int pop(long x) {
/*
* http://grepcode.com/file/repo1.maven.org/maven2/org.apache.lucene/lucene-core/3.1.0/org/apache/lucene/util/BitUtil.java
*/
/*
* Hacker's Delight 32 bit pop function:
* http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.c.txt
*
* int pop(unsigned x) {
* x = x - ((x >> 1) & 0x55555555);
* x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
* x = (x + (x >> 4)) & 0x0F0F0F0F;
* x = x + (x >> 8);
* x = x + (x >> 16);
* return x & 0x0000003F;
* }
* *
*/
// 64 bit java version of the C function from above
x = x - ((x >>> 1) & 0x5555555555555555L);
x = (x & 0x3333333333333333L) + ((x >>> 2) & 0x3333333333333333L);
x = (x + (x >>> 4)) & 0x0F0F0F0F0F0F0F0FL;
x = x + (x >>> 8);
x = x + (x >>> 16);
x = x + (x >>> 32);
// The count is at most 64, so only the low seven bits are kept
return (int) x & 0x7F;
}
}
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
<!--
| Log4j 1.x configuration for the correlation-matrix sample.
| For more configuration information and examples see the Apache Log4j 1.2
| website: http://logging.apache.org/log4j/1.2/
-->
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/" debug="false">
<!-- ============================== -->
<!-- Append messages to the console -->
<!-- ============================== -->
<appender name="CONSOLE" class="org.apache.log4j.ConsoleAppender">
<param name="Threshold" value="DEBUG"/>
<param name="Target" value="System.out"/>
<param name="Encoding" value="UTF-8"/>
<layout class="org.apache.log4j.PatternLayout">
<!-- The active pattern: Date Priority [Short Category] Message\n -->
<param name="ConversionPattern" value="%d %-5p [%c{1}] %m%n"/>
<!-- The full pattern: Date MS Priority [Category] (Thread:NDC) Message\n
<param name="ConversionPattern" value="%d %-5r %-5p [%c] (%t:%x) %m%n"/>
-->
</layout>
</appender>
<!-- ================================================================ -->
<!-- Append messages to a log file (defined, but not attached to root) -->
<!-- ================================================================ -->
<appender name="FILE" class="org.apache.log4j.FileAppender">
<param name="File" value="log/corrmatrix.log"/>
<param name="Append" value="true"/>
<param name="Encoding" value="UTF-8"/>
<layout class="org.apache.log4j.PatternLayout">
<!-- The active pattern: Date Priority [Category] Message\n -->
<param name="ConversionPattern" value="%d %-5p [%c] %m%n"/>
<!-- The full pattern: Date MS Priority [Category] (Thread:NDC) Message\n
<param name="ConversionPattern" value="%d %-5r %-5p [%c] (%t:%x) %m%n"/>
-->
</layout>
</appender>
<!-- Limit categories: everything under gov.pnnl logs at DEBUG and above -->
<logger name="gov.pnnl">
<level value="DEBUG"/>
</logger>
<!-- ======================= -->
<!-- Setup the Root category -->
<!-- ======================= -->
<!-- No level is set on root here; uncomment the FILE reference to also write log/corrmatrix.log -->
<root>
<appender-ref ref="CONSOLE"/>
<!-- <appender-ref ref="FILE"/> -->
</root>
</log4j:configuration>
\ No newline at end of file
/**
* This material was prepared as an account of work sponsored by an agency of the United States Government.
* Neither the United States Government nor the United States Department of Energy, nor Battelle, nor any of
* their employees, nor any jurisdiction or organization that has cooperated in the development of these materials,
* makes any warranty, express or implied, or assumes any legal liability or responsibility for the accuracy,
* completeness, or usefulness or any information, apparatus, product, software, or process disclosed, or represents
* that its use would not infringe privately owned rights.
*/
package gov.pnnl.aparapi.test;
import gov.pnnl.aparapi.matrix.CorrMatrixHost;
import java.io.File;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.log4j.Logger;
import org.apache.lucene.util.OpenBitSet;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import com.amd.aparapi.Kernel.EXECUTION_MODE;
/**
 * This test class performs the following functions:
 *
 * 1) Create a randomly populated set of matrices for correlation/co-occurrence computation
 * 2) Execute the CPU-based computation using Lucene OpenBitSets
 * 3) Execute the GPU-based computation using Aparapi CorrMatrix host and kernel
 * 4) Verify the results of OpenBitSet and CorrMatrix by comparing matrices to each other
 *
 * The matrix size and execution mode are read from the "numRows", "numColumns" and "useGPU"
 * system properties.
 *
 * @author ryan.lamothe at gmail.com
 *
 */
public class CorrMatrixTest {

   private static final Logger LOG = Logger.getLogger(CorrMatrixTest.class);

   // Left/right bit sets per term; both sides of a pair wrap the same backing long[] (see setup)
   private final List<Pair<OpenBitSet, OpenBitSet>> obsPairs = new ArrayList<Pair<OpenBitSet, OpenBitSet>>();

   private final Random rand = new Random();

   // Reference result computed with OpenBitSet.intersectionCount in setup()
   private int[][] obsResultMatrix;

   /**
    * Populates the random test matrices and computes the OpenBitSet reference result.
    * <p>
    * NumTerms and NumLongs (documents) need to be adjusted manually right now to force 'striping' to occur (see Host code for details)
    */
   @Before
   public void setup() throws Exception {
      /*
       * Populate test data
       */
      LOG.debug("----------");
      LOG.debug("Populating test matrix data using settings from build.xml...");
      LOG.debug("----------");

      final int numTerms = Integer.getInteger("numRows", 300); // # Rows

      // numLongs*64 for number of actual documents since these are 'packed' longs
      final int numLongs = Integer.getInteger("numColumns", 10000); // # Columns

      for (int i = 0; i < numTerms; ++i) {
         final long[] bits = new long[numLongs];
         for (int j = 0; j < numLongs; ++j) {
            bits[j] = rand.nextLong();
         }

         obsPairs.add(new ImmutablePair<OpenBitSet, OpenBitSet>(new OpenBitSet(bits, numLongs), new OpenBitSet(bits, numLongs)));
      }

      /*
       * OpenBitSet calculations
       */
      LOG.debug("Executing OpenBitSet intersectionCount");

      final long startTime = System.currentTimeMillis();

      final int numPairs = obsPairs.size();
      obsResultMatrix = new int[numPairs][numPairs];

      // This is an N^2 comparison loop
      // FIXME This entire loop needs to be parallelized to show an apples-to-apples comparison to Aparapi
      for (int i = 0; i < numPairs; i++) {
         final OpenBitSet left = obsPairs.get(i).getLeft();

         for (int j = 0; j < numPairs; j++) {
            final OpenBitSet right = obsPairs.get(j).getRight();

            // # of matches in both sets of documents
            obsResultMatrix[i][j] = (int) OpenBitSet.intersectionCount(left, right);
         }
      }

      final long elapsedTime = System.currentTimeMillis() - startTime;

      LOG.debug("OpenBitSet Gross Execution Time: " + elapsedTime + " ms <------OpenBitSet");
      LOG.debug("----------");
   }

   /**
    * Runs the Aparapi implementation on the same data and checks it against the OpenBitSet reference.
    */
   @Test
   public void testCorrelationMatrix() throws Exception {
      /*
       * GPU calculations
       */
      LOG.debug("Executing Aparapi intersectionCount");

      final long[][] matrixA = new long[obsPairs.size()][];
      final long[][] matrixB = new long[obsPairs.size()][];

      // Convert OpenBitSet pairs to long primitive arrays for use with Aparapi
      // TODO It would be nice if we could find a way to put the obsPairs onto the GPU directly :)
      for (int i = 0; i < obsPairs.size(); i++) {
         final Pair<OpenBitSet, OpenBitSet> pair = obsPairs.get(i);

         matrixA[i] = pair.getLeft().getBits();
         matrixB[i] = pair.getRight().getBits();
      }

      // The reason for setting this property is because the CorrMatrix host/kernel code
      // came from a GUI where a user could select "Use Hardware Acceleration" instead
      // of the application forcing the setting globally on the command-line
      final EXECUTION_MODE executionMode = Boolean.getBoolean("useGPU") ? EXECUTION_MODE.GPU : EXECUTION_MODE.CPU;
      final int[][] gpuResultMatrix = CorrMatrixHost.intersectionMatrix(matrixA, matrixB, executionMode);

      // Compare the two result arrays to make sure we are generating the same output
      Assert.assertEquals("Result matrices have a different number of rows", obsResultMatrix.length, gpuResultMatrix.length);
      for (int i = 0; i < obsResultMatrix.length; i++) {
         Assert.assertArrayEquals("Arrays are not equal", obsResultMatrix[i], gpuResultMatrix[i]);
      }

      // Visually compare/third-party tool compare if desired
      if (LOG.isTraceEnabled()) {
         writeTraceFiles(gpuResultMatrix);
      }
   }

   /**
    * Dumps the reference and Aparapi result matrices, one row per line, to
    * trace/cpuOut.txt and trace/gpuOut.txt under the working directory.
    *
    * @param gpuResultMatrix
    *           The matrix produced by the Aparapi implementation
    */
   private void writeTraceFiles(final int[][] gpuResultMatrix) throws Exception {
      LOG.trace("obsResultMatrix length: " + obsResultMatrix.length);
      LOG.trace("gpuResultMatrix length: " + gpuResultMatrix.length);

      // The trace directory is not created by the build, so make sure it exists before opening files in it
      final File traceDir = new File(System.getProperty("user.dir"), "trace");
      if (!traceDir.isDirectory() && !traceDir.mkdirs()) {
         LOG.warn("Unable to create trace directory: " + traceDir);
      }

      // We're not using "try with resources" because Aparapi currently targets JDK 6
      PrintWriter cpuOut = null;
      PrintWriter gpuOut = null;

      try {
         // Opened inside the try so the first writer is still closed if the second cannot be opened
         cpuOut = new PrintWriter(new File(traceDir, "cpuOut.txt"));
         gpuOut = new PrintWriter(new File(traceDir, "gpuOut.txt"));

         for (int i = 0; i < obsResultMatrix.length; i++) {
            cpuOut.println(Arrays.toString(obsResultMatrix[i]));
            gpuOut.println(Arrays.toString(gpuResultMatrix[i]));
         }
      } finally {
         // close() also flushes
         if (cpuOut != null) {
            cpuOut.close();
         }

         if (gpuOut != null) {
            gpuOut.close();
         }
      }
   }
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment