From 1df93467f45d7334a33c08edf0b9146673efc79a Mon Sep 17 00:00:00 2001 From: Jeffrey Phillips Freeman <jeffrey.freeman@syncleus.com> Date: Tue, 17 Oct 2017 22:00:59 -0400 Subject: [PATCH] fix: no longer requires manual installation of gluegen. --- .gitattributes | 3 + Notes.txt | 6 +- .../pnnl/aparapi/matrix/CorrMatrixHost.java | 728 ++++---- .../gov/pnnl/aparapi/test/CorrMatrixTest.java | 338 ++-- examples/movie/.project | 17 - examples/movie/build.xml | 254 +-- examples/movie/movie.bat | 30 +- .../examples/movie/AparapiSolution.java | 260 +-- .../examples/movie/ConvMatrix3x3Editor.java | 302 ++-- .../aparapi/examples/movie/JJMPEGPlayer.java | 294 ++-- .../examples/movie/PureJavaSolution.java | 70 +- .../examples/movie/ReferenceSolution.java | 272 +-- pom.xml | 7 +- .../examples/convolution/Convolution.java | 9 +- .../convolution/ConvolutionOpenCL.java | 8 +- .../examples/convolution/PureJava.java | 8 +- .../examples/convolution/convolution.cl | 56 +- .../examples/extension/HistogramKernel.cl | 332 ++-- .../com/aparapi/examples/extension/fft.cl | 1474 ++++++++--------- .../com/aparapi/examples/extension/mandel.cl | 182 +- .../com/aparapi/examples/extension/mandel2.cl | 190 +-- .../com/aparapi/examples/extension/squarer.cl | 10 +- .../aparapi/examples/javaonedemo/NBody.java | 24 +- .../aparapi/examples/median/MedianDemo.java | 19 +- .../com/aparapi/examples/nbody/Local.java | 8 +- .../java/com/aparapi/examples/nbody/Main.java | 4 +- .../java/com/aparapi/examples/nbody/Seq.java | 4 +- .../com/aparapi/examples/oopnbody/Main.java | 4 +- 28 files changed, 2447 insertions(+), 2466 deletions(-) delete mode 100644 examples/movie/.project diff --git a/.gitattributes b/.gitattributes index 456e49e9..867f823d 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,4 +1,7 @@ * eol=lf *.bat eol=crlf *.sh eol=lf +*.jpg binary +*.png binary +*.pdf binary .git* export-ignore diff --git a/Notes.txt b/Notes.txt index fd9aca63..8ad6ae40 100644 --- a/Notes.txt +++ b/Notes.txt @@ -1,3 +1,3 @@ -Maybe we can use Java 7's method handles for this? -http://java.sun.com/developer/technicalArticles/DynTypeLang/ - +Maybe we can use Java 7's method handles for this? +http://java.sun.com/developer/technicalArticles/DynTypeLang/ + diff --git a/examples/correlation-matrix/src/java/gov/pnnl/aparapi/matrix/CorrMatrixHost.java b/examples/correlation-matrix/src/java/gov/pnnl/aparapi/matrix/CorrMatrixHost.java index e2a5a2ce..f6d3cb42 100644 --- a/examples/correlation-matrix/src/java/gov/pnnl/aparapi/matrix/CorrMatrixHost.java +++ b/examples/correlation-matrix/src/java/gov/pnnl/aparapi/matrix/CorrMatrixHost.java @@ -1,364 +1,364 @@ -/** - * This material was prepared as an account of work sponsored by an agency of the United States Government. - * Neither the United States Government nor the United States Department of Energy, nor Battelle, nor any of - * their employees, nor any jurisdiction or organization that has cooperated in the development of these materials, - * makes any warranty, express or implied, or assumes any legal liability or responsibility for the accuracy, - * completeness, or usefulness or any information, apparatus, product, software, or process disclosed, or represents - * that its use would not infringe privately owned rights. - */ -package gov.pnnl.aparapi.matrix; - -import org.apache.log4j.Logger; - -import com.aparapi.Kernel; -import com.aparapi.Kernel.EXECUTION_MODE; -import com.aparapi.Range; -import com.aparapi.device.Device; -import com.aparapi.device.OpenCLDevice; - -/** - * GPU calculations using OpenBitSet Intersection for OpenBitSets - * - * Based on code from: <br/> - * {@link http://grepcode.com/file/repo1.maven.org/maven2/org.apache.lucene/lucene-core/3.1.0/org/apache/lucene/util/BitUtil.java} - * - * @author ryan.lamothe at gmail.com - * @author sedillard at gmail.com - */ -public class CorrMatrixHost { - - private static final Logger LOG = Logger.getLogger(CorrMatrixHost.class); - - /** - * Perform matrix intersection for two lists of Lucene OpenBitSet-based packed longs - * - * @param matrixA - * The first term-document matrix - * @param matrixB - * The second term-document matrix - * @param Aparapi EXECUTION_MODE - * @return result Matrix - * @throws Exception - */ - public static int[][] intersectionMatrix(final long[][] matrixA, final long[][] matrixB, final EXECUTION_MODE executionMode) throws Exception { - - // Basic validation - if (matrixA == null) { - throw new NullPointerException("MatrixA cannot be NULL"); - } - - if (matrixB == null) { - throw new NullPointerException("MatrixB cannot be NULL"); - } - - // Size of an array is 8 bytes for the object + 4 bytes for the header and length information - final int arrayMemOverhead = 12; - - // numDocs/64 since they are packed into longs - // We need to make our matrix sizes multiples of BLOCK_SIZE - final int matrixA_numTerms = matrixA.length; - final int matrixA_numLongs = matrixA[0].length; - - if (LOG.isDebugEnabled()) { - LOG.debug("----------"); - LOG.debug("MatrixA NumTerms (Rows): " + matrixA_numTerms); - LOG.debug("MatrixA NumLongs (Columns): " + matrixA_numLongs); - LOG.debug("MatrixA NumDocs: " + (matrixA_numLongs * 64L)); - } - - final long matrixA_BytesPerRow = matrixA_numLongs * 8L; - final long matrixA_TotalBytes = (matrixA_numTerms * matrixA_BytesPerRow) + arrayMemOverhead; - - if (LOG.isDebugEnabled()) { - LOG.debug("MatrixA Total Memory Size: " + humanReadableByteCount(matrixA_TotalBytes, true)); - } - - final int matrixB_numTerms = matrixB.length; - final int matrixB_numLongs = matrixB[0].length; - - if (LOG.isDebugEnabled()) { - LOG.debug("----------"); - LOG.debug("MatrixB NumTerms (Rows): " + matrixB_numTerms); - LOG.debug("MatrixB NumLongs (Columns): " + matrixB_numLongs); - LOG.debug("MatrixB NumDocs: " + (matrixB_numLongs * 64L)); - } - - final long matrixB_BytesPerRow = matrixB_numLongs * 8L; - final long matrixB_TotalBytes = (matrixB_numTerms * matrixB_BytesPerRow) + arrayMemOverhead; - - if (LOG.isDebugEnabled()) { - LOG.debug("MatrixB Total Memory Size: " + humanReadableByteCount(matrixB_TotalBytes, true)); - LOG.debug("----------"); - } - - final int[][] resultMatrix = new int[matrixA_numTerms][matrixB_numTerms]; - - if (LOG.isDebugEnabled()) { - final long resultMatrix_TotalBytes = (matrixA_numTerms * matrixB_numTerms * 4L) + arrayMemOverhead; - LOG.debug("ResultMatrix Memory Size: " + humanReadableByteCount(resultMatrix_TotalBytes, true)); - LOG.debug("Total Requested Memory Size: " + humanReadableByteCount(matrixA_TotalBytes + matrixB_TotalBytes + resultMatrix_TotalBytes, true)); - LOG.debug("----------"); - } - - int NUM_SUB_ROWS = matrixA_numTerms; // Default number of sub-rows - - OpenCLDevice device = null; - - // We do not test for EXECUTION_MODE.JTP because JTP is non-OpenCL - if (executionMode.equals(EXECUTION_MODE.CPU)) { - device = (OpenCLDevice) Device.firstCPU(); - - if (device == null) { - LOG.warn("OpenCLDevice.CPU is NULL...OpenCL is unavailable. Setting to JTP mode."); - LOG.debug("----------"); - } - } else if (executionMode.equals(EXECUTION_MODE.GPU)) { - device = (OpenCLDevice) Device.best(); - - if (device == null) { - LOG.warn("OpenCLDevice.GPU is NULL...OpenCL is unavailable. Setting to JTP mode."); - LOG.debug("----------"); - } - } - - // This is to create stripes of rows that will fit into OpenCL's available memory - // Calculate the number of sub-rows by calling OpenCL to find out available memory - // Length of row * 8 (size of long in bytes) * number of rows to available memory - final int maxNumTerms = Math.max(matrixA_numTerms, matrixB_numTerms); - - if (device != null) { - final long globalMemSize = device.getGlobalMemSize(); - // final long maxMemAllocSize = Math.max((globalMemSize/4), 128*1024*1024); - final long maxMemAllocSize = device.getMaxMemAllocSize(); - - // 1048576 bytes in a megabyte (1024*1024) - // Java long is 8 bytes - // 131072 longs in 1 megabyte - // SAFE OpenCL spec allocation is max(1/4 GlobalMemSize) - // ***During our testing this appears to be incorrectly/inconsistently reported depending on os/drivers/hardware*** - if (LOG.isDebugEnabled()) { - LOG.debug("Available OpenCL globalMemSize: " + humanReadableByteCount(globalMemSize, true)); - LOG.debug("Available OpenCL maxMemAllocSize: " + humanReadableByteCount(maxMemAllocSize, true)); - } - - // Maybe there is a more clever way to do this :) - // The idea here is to decide how many sub-rows of the matrix we can fit on a single card - // The long-term goal to divide up the work for both small RAM GPUs and multiple GPUs - int subRowsCounterA = 0; - int subRowsCounterB = 0; - long subRowsMemSizeA = 0L; - long subRowsMemSizeB = 0L; - long subResultMatrixMemSize = 0L; - long subTotalMemSize = 0L; - - do { - if (subRowsCounterA < matrixA_numTerms) { - subRowsMemSizeA = subRowsCounterA != 0 ? (subRowsCounterA * matrixA_numLongs * 8L) + arrayMemOverhead : 0; - subRowsCounterA += 1; - } else if (subRowsCounterA == matrixA_numTerms) { - subRowsMemSizeA = subRowsCounterA != 0 ? (subRowsCounterA * matrixA_numLongs * 8L) + arrayMemOverhead : 0; - } - - if (subRowsCounterB < matrixB_numTerms) { - subRowsMemSizeB = subRowsCounterB != 0 ? (subRowsCounterB * matrixB_numLongs * 8L) + arrayMemOverhead : 0; - subRowsCounterB += 1; - } else if (subRowsCounterB == matrixB_numTerms) { - subRowsMemSizeB = subRowsCounterB != 0 ? (subRowsCounterB * matrixB_numLongs * 8L) + arrayMemOverhead : 0; - } - - // This is 4 bytes since the sub-result matrix is an int array - subResultMatrixMemSize = ((subRowsCounterA * subRowsCounterB) * 4L) + arrayMemOverhead; - - subTotalMemSize = subRowsMemSizeA + subRowsMemSizeB + subResultMatrixMemSize; - } while ((Math.max(subRowsCounterA, subRowsCounterB) < maxNumTerms) && (subTotalMemSize <= maxMemAllocSize)); - - // If using OpenCL override the default number of subrows - NUM_SUB_ROWS = Math.max(subRowsCounterA, subRowsCounterB); - - if (NUM_SUB_ROWS < maxNumTerms) { - final long subMatrixA_memSize = (NUM_SUB_ROWS * matrixA_numLongs * 8L) + arrayMemOverhead; - final long subMatrixB_memSize = (NUM_SUB_ROWS * matrixB_numLongs * 8L) + arrayMemOverhead; - final long subResultMatrix_memSize = (NUM_SUB_ROWS * NUM_SUB_ROWS * 4L) + arrayMemOverhead; - - LOG.warn("****************************************************************"); - LOG.warn("Requested matrix computation is larger than available OpenCL memory"); - LOG.warn("Matrix striping is occurring to fit all data into OpenCL memory..."); - LOG.warn(""); - LOG.warn("Number rows requested: " + maxNumTerms); - LOG.warn("Number rows that fit: " + NUM_SUB_ROWS); - LOG.warn(""); - LOG.warn("SubMatrixA Memory Size: " + humanReadableByteCount(subMatrixA_memSize, true)); - LOG.warn("SubMatrixB Memory Size: " + humanReadableByteCount(subMatrixB_memSize, true)); - LOG.warn("SubResultMatrix Memory Size: " + humanReadableByteCount(subResultMatrix_memSize, true)); - LOG.warn("SubMatrix Total Memory Size: " + humanReadableByteCount(subMatrixA_memSize + subMatrixB_memSize + subResultMatrix_memSize, true)); - LOG.warn("****************************************************************"); - } - } - - final int numSubBlocksA = ((matrixA_numTerms + NUM_SUB_ROWS) - 1) / NUM_SUB_ROWS; - final int numSubBlocksB = ((matrixB_numTerms + NUM_SUB_ROWS) - 1) / NUM_SUB_ROWS; - - final long[] subMatrixA = new long[NUM_SUB_ROWS * matrixA_numLongs]; - final long[] subMatrixB = new long[NUM_SUB_ROWS * matrixB_numLongs]; - final int[] subResultMatrix = new int[NUM_SUB_ROWS * NUM_SUB_ROWS]; - - final CorrMatrixKernel kernel = new CorrMatrixKernel(subMatrixA, NUM_SUB_ROWS, subMatrixB, NUM_SUB_ROWS, matrixA_numLongs, subResultMatrix); - kernel.setExplicit(true); - - // Here we define a fall-back strategy, since the user may have wanted to execute only a single execution mode - if (executionMode.equals(EXECUTION_MODE.GPU) && (device != null)) { - kernel.addExecutionModes(EXECUTION_MODE.GPU, EXECUTION_MODE.CPU, EXECUTION_MODE.JTP); - LOG.debug("Execution Fallback Strategy: GPU --> CPU --> JTP"); - } else if (executionMode.equals(EXECUTION_MODE.CPU) && (device != null)) { - kernel.addExecutionModes(EXECUTION_MODE.CPU, EXECUTION_MODE.JTP); - LOG.debug("Execution Fallback Strategy: CPU --> JTP"); - } else { - kernel.addExecutionModes(EXECUTION_MODE.JTP); - LOG.debug("Execution Strategy: JTP"); - } - - try { - for (int a = 0; a < numSubBlocksA; a++) { - for (int b = 0; b < numSubBlocksB; b++) { - final int aSubRowStart = a * NUM_SUB_ROWS; - final int aSubRowEnd = Math.min(matrixA_numTerms, aSubRowStart + NUM_SUB_ROWS); - - for (int i = aSubRowStart; i < aSubRowEnd; i++) { - if (matrixA_numLongs != matrixA[i].length) { - throw new Exception("All rows in the matrix need be the same length"); - } - - System.arraycopy(matrixA[i], 0, subMatrixA, (i - aSubRowStart) * matrixA_numLongs, matrixA_numLongs); - } - - final int bSubRowStart = b * NUM_SUB_ROWS; - final int bSubRowEnd = Math.min(matrixB_numTerms, bSubRowStart + NUM_SUB_ROWS); - - for (int i = bSubRowStart; i < bSubRowEnd; i++) { - if (matrixA_numLongs != matrixB[i].length) { - throw new Exception("All rows in the matrix need be the same length"); - } - - System.arraycopy(matrixB[i], 0, subMatrixB, (i - bSubRowStart) * matrixB_numLongs, matrixB_numLongs); - } - - // Since matrixA_NumLongs == matrixB_NumLongs we're only going to pass matrixA_NumLongs - executeKernel(device, subMatrixA, aSubRowEnd - aSubRowStart, subMatrixB, bSubRowEnd - bSubRowStart, matrixA_numLongs, subResultMatrix, kernel); - - // Convert one dimensional array to two dimensional array in the expected output ordering - for (int i = 0; i < NUM_SUB_ROWS; i++) { - if ((i + aSubRowStart) < aSubRowEnd) { - System.arraycopy(subResultMatrix, i * NUM_SUB_ROWS, resultMatrix[i + aSubRowStart], bSubRowStart, bSubRowEnd - bSubRowStart); - } - } - } - } - } finally { - if (LOG.isDebugEnabled()) { - LOG.debug("----------"); - LOG.debug("Aparapi Gross Execution Time: " + kernel.getAccumulatedExecutionTime() + " ms <------ Aparapi"); - LOG.debug("OpenCL Generation Time: " + kernel.getConversionTime() + " ms"); - LOG.debug("Kernel Net Execution Time: " + (kernel.getAccumulatedExecutionTime() - kernel.getConversionTime()) + " ms"); - LOG.debug("----------"); - } - - try { - kernel.dispose(); - } catch (final UnsatisfiedLinkError e) { - LOG.error("Aparapi failed to dispose of the kernel", e); - } - } - - return resultMatrix; - } - - /** - * Execute the GPU kernel - * - * @param subMatrixA - * @param matrixA_NumTerms - * @param subMatrixB - * @param matrixB_NumTerms - * @param numLongs - * @param subResultMatrix - * @param kernel - * - * @return resultMatrix - */ - private static void executeKernel(final Device device, final long[] subMatrixA, final int matrixA_NumTerms, final long[] subMatrixB, final int matrixB_NumTerms, final int numLongs, final int[] subResultMatrix, final Kernel kernel) { - - // Power of Two for best performance - int matrixA_NumTermsRnd = matrixA_NumTerms; - while (!isPowerOfTwo(matrixA_NumTermsRnd)) { - matrixA_NumTermsRnd += 1; - } - - int matrixB_NumTermsRnd = matrixB_NumTerms; - while (!isPowerOfTwo(matrixB_NumTermsRnd)) { - matrixB_NumTermsRnd += 1; - } - - final Range range; - if (device != null) { - range = Range.create2D(device, matrixA_NumTermsRnd, matrixB_NumTermsRnd); - } else { - range = Range.create2D(matrixA_NumTermsRnd, matrixB_NumTermsRnd); - } - - if (LOG.isDebugEnabled()) { - LOG.debug("Range: " + range); - } - - kernel.put(subMatrixA); - kernel.put(subMatrixB); - kernel.put(subResultMatrix); - - kernel.execute(range); - - kernel.get(subResultMatrix); - } - - /** - * Highly efficient means to compute whether a number is a power of 2<br> - * Based on code from http://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2 - * <p> - * Another very cool way to do this is ((x&(-x))==x) - * - * @param n - * @return boolean - */ - private static boolean isPowerOfTwo(int n) { - return (n > 0) && ((n & (n - 1)) == 0); - } - - /** - * Rounds a number to the multiple indicated - * - * @param num - * @param multiple - * @return - */ - private static int roundToMultiple(double num, int multiple) { - return (int) (Math.ceil(num / multiple) * multiple); - } - - /** - * Very nice means to convert byte sizes into human readable format<br> - * Based on code from http://stackoverflow.com/questions/3758606/how-to-convert-byte-size-into-human-readable-format-in-java - * <p> - * - * @param bytes - * @param si - * @return humanReadableByteCount - */ - private static String humanReadableByteCount(long bytes, boolean si) { - final int unit = si ? 1000 : 1024; - if (bytes < unit) { - return bytes + " B"; - } - final int exp = (int) (Math.log(bytes) / Math.log(unit)); - final String pre = (si ? "kMGTPE" : "KMGTPE").charAt(exp - 1) + (si ? "" : "i"); - - return String.format("%.1f %sB", bytes / Math.pow(unit, exp), pre); - } -} +/** + * This material was prepared as an account of work sponsored by an agency of the United States Government. + * Neither the United States Government nor the United States Department of Energy, nor Battelle, nor any of + * their employees, nor any jurisdiction or organization that has cooperated in the development of these materials, + * makes any warranty, express or implied, or assumes any legal liability or responsibility for the accuracy, + * completeness, or usefulness or any information, apparatus, product, software, or process disclosed, or represents + * that its use would not infringe privately owned rights. + */ +package gov.pnnl.aparapi.matrix; + +import org.apache.log4j.Logger; + +import com.aparapi.Kernel; +import com.aparapi.Kernel.EXECUTION_MODE; +import com.aparapi.Range; +import com.aparapi.device.Device; +import com.aparapi.device.OpenCLDevice; + +/** + * GPU calculations using OpenBitSet Intersection for OpenBitSets + * + * Based on code from: <br/> + * {@link http://grepcode.com/file/repo1.maven.org/maven2/org.apache.lucene/lucene-core/3.1.0/org/apache/lucene/util/BitUtil.java} + * + * @author ryan.lamothe at gmail.com + * @author sedillard at gmail.com + */ +public class CorrMatrixHost { + + private static final Logger LOG = Logger.getLogger(CorrMatrixHost.class); + + /** + * Perform matrix intersection for two lists of Lucene OpenBitSet-based packed longs + * + * @param matrixA + * The first term-document matrix + * @param matrixB + * The second term-document matrix + * @param Aparapi EXECUTION_MODE + * @return result Matrix + * @throws Exception + */ + public static int[][] intersectionMatrix(final long[][] matrixA, final long[][] matrixB, final EXECUTION_MODE executionMode) throws Exception { + + // Basic validation + if (matrixA == null) { + throw new NullPointerException("MatrixA cannot be NULL"); + } + + if (matrixB == null) { + throw new NullPointerException("MatrixB cannot be NULL"); + } + + // Size of an array is 8 bytes for the object + 4 bytes for the header and length information + final int arrayMemOverhead = 12; + + // numDocs/64 since they are packed into longs + // We need to make our matrix sizes multiples of BLOCK_SIZE + final int matrixA_numTerms = matrixA.length; + final int matrixA_numLongs = matrixA[0].length; + + if (LOG.isDebugEnabled()) { + LOG.debug("----------"); + LOG.debug("MatrixA NumTerms (Rows): " + matrixA_numTerms); + LOG.debug("MatrixA NumLongs (Columns): " + matrixA_numLongs); + LOG.debug("MatrixA NumDocs: " + (matrixA_numLongs * 64L)); + } + + final long matrixA_BytesPerRow = matrixA_numLongs * 8L; + final long matrixA_TotalBytes = (matrixA_numTerms * matrixA_BytesPerRow) + arrayMemOverhead; + + if (LOG.isDebugEnabled()) { + LOG.debug("MatrixA Total Memory Size: " + humanReadableByteCount(matrixA_TotalBytes, true)); + } + + final int matrixB_numTerms = matrixB.length; + final int matrixB_numLongs = matrixB[0].length; + + if (LOG.isDebugEnabled()) { + LOG.debug("----------"); + LOG.debug("MatrixB NumTerms (Rows): " + matrixB_numTerms); + LOG.debug("MatrixB NumLongs (Columns): " + matrixB_numLongs); + LOG.debug("MatrixB NumDocs: " + (matrixB_numLongs * 64L)); + } + + final long matrixB_BytesPerRow = matrixB_numLongs * 8L; + final long matrixB_TotalBytes = (matrixB_numTerms * matrixB_BytesPerRow) + arrayMemOverhead; + + if (LOG.isDebugEnabled()) { + LOG.debug("MatrixB Total Memory Size: " + humanReadableByteCount(matrixB_TotalBytes, true)); + LOG.debug("----------"); + } + + final int[][] resultMatrix = new int[matrixA_numTerms][matrixB_numTerms]; + + if (LOG.isDebugEnabled()) { + final long resultMatrix_TotalBytes = (matrixA_numTerms * matrixB_numTerms * 4L) + arrayMemOverhead; + LOG.debug("ResultMatrix Memory Size: " + humanReadableByteCount(resultMatrix_TotalBytes, true)); + LOG.debug("Total Requested Memory Size: " + humanReadableByteCount(matrixA_TotalBytes + matrixB_TotalBytes + resultMatrix_TotalBytes, true)); + LOG.debug("----------"); + } + + int NUM_SUB_ROWS = matrixA_numTerms; // Default number of sub-rows + + OpenCLDevice device = null; + + // We do not test for EXECUTION_MODE.JTP because JTP is non-OpenCL + if (executionMode.equals(EXECUTION_MODE.CPU)) { + device = (OpenCLDevice) Device.firstCPU(); + + if (device == null) { + LOG.warn("OpenCLDevice.CPU is NULL...OpenCL is unavailable. Setting to JTP mode."); + LOG.debug("----------"); + } + } else if (executionMode.equals(EXECUTION_MODE.GPU)) { + device = (OpenCLDevice) Device.best(); + + if (device == null) { + LOG.warn("OpenCLDevice.GPU is NULL...OpenCL is unavailable. Setting to JTP mode."); + LOG.debug("----------"); + } + } + + // This is to create stripes of rows that will fit into OpenCL's available memory + // Calculate the number of sub-rows by calling OpenCL to find out available memory + // Length of row * 8 (size of long in bytes) * number of rows to available memory + final int maxNumTerms = Math.max(matrixA_numTerms, matrixB_numTerms); + + if (device != null) { + final long globalMemSize = device.getGlobalMemSize(); + // final long maxMemAllocSize = Math.max((globalMemSize/4), 128*1024*1024); + final long maxMemAllocSize = device.getMaxMemAllocSize(); + + // 1048576 bytes in a megabyte (1024*1024) + // Java long is 8 bytes + // 131072 longs in 1 megabyte + // SAFE OpenCL spec allocation is max(1/4 GlobalMemSize) + // ***During our testing this appears to be incorrectly/inconsistently reported depending on os/drivers/hardware*** + if (LOG.isDebugEnabled()) { + LOG.debug("Available OpenCL globalMemSize: " + humanReadableByteCount(globalMemSize, true)); + LOG.debug("Available OpenCL maxMemAllocSize: " + humanReadableByteCount(maxMemAllocSize, true)); + } + + // Maybe there is a more clever way to do this :) + // The idea here is to decide how many sub-rows of the matrix we can fit on a single card + // The long-term goal to divide up the work for both small RAM GPUs and multiple GPUs + int subRowsCounterA = 0; + int subRowsCounterB = 0; + long subRowsMemSizeA = 0L; + long subRowsMemSizeB = 0L; + long subResultMatrixMemSize = 0L; + long subTotalMemSize = 0L; + + do { + if (subRowsCounterA < matrixA_numTerms) { + subRowsMemSizeA = subRowsCounterA != 0 ? (subRowsCounterA * matrixA_numLongs * 8L) + arrayMemOverhead : 0; + subRowsCounterA += 1; + } else if (subRowsCounterA == matrixA_numTerms) { + subRowsMemSizeA = subRowsCounterA != 0 ? (subRowsCounterA * matrixA_numLongs * 8L) + arrayMemOverhead : 0; + } + + if (subRowsCounterB < matrixB_numTerms) { + subRowsMemSizeB = subRowsCounterB != 0 ? (subRowsCounterB * matrixB_numLongs * 8L) + arrayMemOverhead : 0; + subRowsCounterB += 1; + } else if (subRowsCounterB == matrixB_numTerms) { + subRowsMemSizeB = subRowsCounterB != 0 ? (subRowsCounterB * matrixB_numLongs * 8L) + arrayMemOverhead : 0; + } + + // This is 4 bytes since the sub-result matrix is an int array + subResultMatrixMemSize = ((subRowsCounterA * subRowsCounterB) * 4L) + arrayMemOverhead; + + subTotalMemSize = subRowsMemSizeA + subRowsMemSizeB + subResultMatrixMemSize; + } while ((Math.max(subRowsCounterA, subRowsCounterB) < maxNumTerms) && (subTotalMemSize <= maxMemAllocSize)); + + // If using OpenCL override the default number of subrows + NUM_SUB_ROWS = Math.max(subRowsCounterA, subRowsCounterB); + + if (NUM_SUB_ROWS < maxNumTerms) { + final long subMatrixA_memSize = (NUM_SUB_ROWS * matrixA_numLongs * 8L) + arrayMemOverhead; + final long subMatrixB_memSize = (NUM_SUB_ROWS * matrixB_numLongs * 8L) + arrayMemOverhead; + final long subResultMatrix_memSize = (NUM_SUB_ROWS * NUM_SUB_ROWS * 4L) + arrayMemOverhead; + + LOG.warn("****************************************************************"); + LOG.warn("Requested matrix computation is larger than available OpenCL memory"); + LOG.warn("Matrix striping is occurring to fit all data into OpenCL memory..."); + LOG.warn(""); + LOG.warn("Number rows requested: " + maxNumTerms); + LOG.warn("Number rows that fit: " + NUM_SUB_ROWS); + LOG.warn(""); + LOG.warn("SubMatrixA Memory Size: " + humanReadableByteCount(subMatrixA_memSize, true)); + LOG.warn("SubMatrixB Memory Size: " + humanReadableByteCount(subMatrixB_memSize, true)); + LOG.warn("SubResultMatrix Memory Size: " + humanReadableByteCount(subResultMatrix_memSize, true)); + LOG.warn("SubMatrix Total Memory Size: " + humanReadableByteCount(subMatrixA_memSize + subMatrixB_memSize + subResultMatrix_memSize, true)); + LOG.warn("****************************************************************"); + } + } + + final int numSubBlocksA = ((matrixA_numTerms + NUM_SUB_ROWS) - 1) / NUM_SUB_ROWS; + final int numSubBlocksB = ((matrixB_numTerms + NUM_SUB_ROWS) - 1) / NUM_SUB_ROWS; + + final long[] subMatrixA = new long[NUM_SUB_ROWS * matrixA_numLongs]; + final long[] subMatrixB = new long[NUM_SUB_ROWS * matrixB_numLongs]; + final int[] subResultMatrix = new int[NUM_SUB_ROWS * NUM_SUB_ROWS]; + + final CorrMatrixKernel kernel = new CorrMatrixKernel(subMatrixA, NUM_SUB_ROWS, subMatrixB, NUM_SUB_ROWS, matrixA_numLongs, subResultMatrix); + kernel.setExplicit(true); + + // Here we define a fall-back strategy, since the user may have wanted to execute only a single execution mode + if (executionMode.equals(EXECUTION_MODE.GPU) && (device != null)) { + kernel.addExecutionModes(EXECUTION_MODE.GPU, EXECUTION_MODE.CPU, EXECUTION_MODE.JTP); + LOG.debug("Execution Fallback Strategy: GPU --> CPU --> JTP"); + } else if (executionMode.equals(EXECUTION_MODE.CPU) && (device != null)) { + kernel.addExecutionModes(EXECUTION_MODE.CPU, EXECUTION_MODE.JTP); + LOG.debug("Execution Fallback Strategy: CPU --> JTP"); + } else { + kernel.addExecutionModes(EXECUTION_MODE.JTP); + LOG.debug("Execution Strategy: JTP"); + } + + try { + for (int a = 0; a < numSubBlocksA; a++) { + for (int b = 0; b < numSubBlocksB; b++) { + final int aSubRowStart = a * NUM_SUB_ROWS; + final int aSubRowEnd = Math.min(matrixA_numTerms, aSubRowStart + NUM_SUB_ROWS); + + for (int i = aSubRowStart; i < aSubRowEnd; i++) { + if (matrixA_numLongs != matrixA[i].length) { + throw new Exception("All rows in the matrix need be the same length"); + } + + System.arraycopy(matrixA[i], 0, subMatrixA, (i - aSubRowStart) * matrixA_numLongs, matrixA_numLongs); + } + + final int bSubRowStart = b * NUM_SUB_ROWS; + final int bSubRowEnd = Math.min(matrixB_numTerms, bSubRowStart + NUM_SUB_ROWS); + + for (int i = bSubRowStart; i < bSubRowEnd; i++) { + if (matrixA_numLongs != matrixB[i].length) { + throw new Exception("All rows in the matrix need be the same length"); + } + + System.arraycopy(matrixB[i], 0, subMatrixB, (i - bSubRowStart) * matrixB_numLongs, matrixB_numLongs); + } + + // Since matrixA_NumLongs == matrixB_NumLongs we're only going to pass matrixA_NumLongs + executeKernel(device, subMatrixA, aSubRowEnd - aSubRowStart, subMatrixB, bSubRowEnd - bSubRowStart, matrixA_numLongs, subResultMatrix, kernel); + + // Convert one dimensional array to two dimensional array in the expected output ordering + for (int i = 0; i < NUM_SUB_ROWS; i++) { + if ((i + aSubRowStart) < aSubRowEnd) { + System.arraycopy(subResultMatrix, i * NUM_SUB_ROWS, resultMatrix[i + aSubRowStart], bSubRowStart, bSubRowEnd - bSubRowStart); + } + } + } + } + } finally { + if (LOG.isDebugEnabled()) { + LOG.debug("----------"); + LOG.debug("Aparapi Gross Execution Time: " + kernel.getAccumulatedExecutionTime() + " ms <------ Aparapi"); + LOG.debug("OpenCL Generation Time: " + kernel.getConversionTime() + " ms"); + LOG.debug("Kernel Net Execution Time: " + (kernel.getAccumulatedExecutionTime() - kernel.getConversionTime()) + " ms"); + LOG.debug("----------"); + } + + try { + kernel.dispose(); + } catch (final UnsatisfiedLinkError e) { + LOG.error("Aparapi failed to dispose of the kernel", e); + } + } + + return resultMatrix; + } + + /** + * Execute the GPU kernel + * + * @param subMatrixA + * @param matrixA_NumTerms + * @param subMatrixB + * @param matrixB_NumTerms + * @param numLongs + * @param subResultMatrix + * @param kernel + * + * @return resultMatrix + */ + private static void executeKernel(final Device device, final long[] subMatrixA, final int matrixA_NumTerms, final long[] subMatrixB, final int matrixB_NumTerms, final int numLongs, final int[] subResultMatrix, final Kernel kernel) { + + // Power of Two for best performance + int matrixA_NumTermsRnd = matrixA_NumTerms; + while (!isPowerOfTwo(matrixA_NumTermsRnd)) { + matrixA_NumTermsRnd += 1; + } + + int matrixB_NumTermsRnd = matrixB_NumTerms; + while (!isPowerOfTwo(matrixB_NumTermsRnd)) { + matrixB_NumTermsRnd += 1; + } + + final Range range; + if (device != null) { + range = Range.create2D(device, matrixA_NumTermsRnd, matrixB_NumTermsRnd); + } else { + range = Range.create2D(matrixA_NumTermsRnd, matrixB_NumTermsRnd); + } + + if (LOG.isDebugEnabled()) { + LOG.debug("Range: " + range); + } + + kernel.put(subMatrixA); + kernel.put(subMatrixB); + kernel.put(subResultMatrix); + + kernel.execute(range); + + kernel.get(subResultMatrix); + } + + /** + * Highly efficient means to compute whether a number is a power of 2<br> + * Based on code from http://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2 + * <p> + * Another very cool way to do this is ((x&(-x))==x) + * + * @param n + * @return boolean + */ + private static boolean isPowerOfTwo(int n) { + return (n > 0) && ((n & (n - 1)) == 0); + } + + /** + * Rounds a number to the multiple indicated + * + * @param num + * @param multiple + * @return + */ + private static int roundToMultiple(double num, int multiple) { + return (int) (Math.ceil(num / multiple) * multiple); + } + + /** + * Very nice means to convert byte sizes into human readable format<br> + * Based on code from http://stackoverflow.com/questions/3758606/how-to-convert-byte-size-into-human-readable-format-in-java + * <p> + * + * @param bytes + * @param si + * @return humanReadableByteCount + */ + private static String humanReadableByteCount(long bytes, boolean si) { + final int unit = si ? 1000 : 1024; + if (bytes < unit) { + return bytes + " B"; + } + final int exp = (int) (Math.log(bytes) / Math.log(unit)); + final String pre = (si ? "kMGTPE" : "KMGTPE").charAt(exp - 1) + (si ? "" : "i"); + + return String.format("%.1f %sB", bytes / Math.pow(unit, exp), pre); + } +} diff --git a/examples/correlation-matrix/src/test/gov/pnnl/aparapi/test/CorrMatrixTest.java b/examples/correlation-matrix/src/test/gov/pnnl/aparapi/test/CorrMatrixTest.java index df57dce1..d919641d 100644 --- a/examples/correlation-matrix/src/test/gov/pnnl/aparapi/test/CorrMatrixTest.java +++ b/examples/correlation-matrix/src/test/gov/pnnl/aparapi/test/CorrMatrixTest.java @@ -1,169 +1,169 @@ -/** - * This material was prepared as an account of work sponsored by an agency of the United States Government. - * Neither the United States Government nor the United States Department of Energy, nor Battelle, nor any of - * their employees, nor any jurisdiction or organization that has cooperated in the development of these materials, - * makes any warranty, express or implied, or assumes any legal liability or responsibility for the accuracy, - * completeness, or usefulness or any information, apparatus, product, software, or process disclosed, or represents - * that its use would not infringe privately owned rights. - */ -package gov.pnnl.aparapi.test; - -import gov.pnnl.aparapi.matrix.CorrMatrixHost; - -import java.io.File; -import java.io.PrintWriter; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Random; - -import org.apache.commons.lang3.tuple.ImmutablePair; -import org.apache.commons.lang3.tuple.Pair; -import org.apache.log4j.Logger; -import org.apache.lucene.util.OpenBitSet; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -import com.aparapi.Kernel.EXECUTION_MODE; - -/** - * This test class performs the following functions: - * - * 1) Create a randomly populated set of matrices for correlation/co-occurrence computation - * 2) Execute the CPU-based computation using Lucene OpenBitSets - * 3) Execute the GPU-based computation using Aparapi CorrMatrix host and kernel - * 4) Verify the results of OpenBitSet and CorrMatrix by comparing matrices to each other - * - * @author ryan.lamothe at gmail.com - * - */ -public class CorrMatrixTest { - - private static final Logger LOG = Logger.getLogger(CorrMatrixTest.class); - - private final List<Pair<OpenBitSet, OpenBitSet>> obsPairs = new ArrayList<Pair<OpenBitSet, OpenBitSet>>();; - - private final Random rand = new Random(); - - private int[][] obsResultMatrix; - - /** - * NumTerms and NumLongs (documents) need to be adjusted manually right now to force 'striping' to occur (see Host code for details) - */ - @Before - public void setup() throws Exception { - /* - * Populate test data - */ - LOG.debug("----------"); - LOG.debug("Populating test matrix data using settings from build.xml..."); - LOG.debug("----------"); - - final int numTerms = Integer.getInteger("numRows", 300); // # Rows - // numLongs*64 for number of actual documents since these are 'packed' longs - final int numLongs = Integer.getInteger("numColumns", 10000); // # Columns - - for (int i = 0; i < numTerms; ++i) { - final long[] bits = new long[numLongs]; - for (int j = 0; j < numLongs; ++j) { - bits[j] = rand.nextLong(); - } - - obsPairs.add(i, new ImmutablePair<OpenBitSet, OpenBitSet>(new OpenBitSet(bits, numLongs), new OpenBitSet(bits, numLongs))); - } - - /* - * OpenBitSet calculations - */ - LOG.debug("Executing OpenBitSet intersectionCount"); - - final long startTime = System.currentTimeMillis(); - - obsResultMatrix = new int[obsPairs.size()][obsPairs.size()]; - - // This is an N^2 comparison loop - // FIXME This entire loop needs to be parallelized to show an apples-to-apples comparison to Aparapi - for (int i = 0; i < obsPairs.size(); i++) { - final Pair<OpenBitSet, OpenBitSet> docFreqVector1 = obsPairs.get(i); - - for (int j = 0; j < obsPairs.size(); j++) { - final Pair<OpenBitSet, OpenBitSet> docFreqVector2 = obsPairs.get(j); - - // # of matches in both sets of documents - final int result = (int) OpenBitSet.intersectionCount(docFreqVector1.getLeft(), docFreqVector2.getRight()); - obsResultMatrix[i][j] = result; - } - } - - final long endTime = System.currentTimeMillis() - startTime; - - LOG.debug("OpenBitSet Gross Execution Time: " + endTime + " ms <------OpenBitSet"); - LOG.debug("----------"); - } - - @Test - public void testCorrelationMatrix() throws Exception { - /* - * GPU calculations - */ - LOG.debug("Executing Aparapi intersectionCount"); - - final long[][] matrixA = new long[obsPairs.size()][]; - final long[][] matrixB = new long[obsPairs.size()][]; - - // Convert OpenBitSet pairs to long primitive arrays for use with Aparapi - // TODO It would be nice if we could find a way to put the obsPairs onto the GPU directly :) - for (int i = 0; i < obsPairs.size(); i++) { - final OpenBitSet obsA = obsPairs.get(i).getLeft(); - final OpenBitSet obsB = obsPairs.get(i).getRight(); - - matrixA[i] = obsA.getBits(); - matrixB[i] = obsB.getBits(); - } - - // The reason for setting this property is because the CorrMatrix host/kernel code - // came from a GUI where a user could select "Use Hardware Acceleration" instead - // of the application forcing the setting globally on the command-line - final int[][] gpuResultMatrix; - if (Boolean.getBoolean("useGPU")) { - gpuResultMatrix = CorrMatrixHost.intersectionMatrix(matrixA, matrixB, EXECUTION_MODE.GPU); - } else { - gpuResultMatrix = CorrMatrixHost.intersectionMatrix(matrixA, matrixB, EXECUTION_MODE.CPU); - } - - // Compare the two result arrays to make sure we are generating the same output - for (int i = 0; i < obsResultMatrix.length; i++) { - Assert.assertTrue("Arrays are not equal", Arrays.equals(obsResultMatrix[i], gpuResultMatrix[i])); - } - - // Visually compare/third-party tool compare if desired - if (LOG.isTraceEnabled()) { - // We're not using "try with resources" because Aparapi currently targets JDK 6 - final PrintWriter cpuOut = new PrintWriter(new File(System.getProperty("user.dir"), "trace/cpuOut.txt")); - final PrintWriter gpuOut = new PrintWriter(new File(System.getProperty("user.dir"), "trace/gpuOut.txt")); - - try { - for (int i = 0; i < obsResultMatrix.length; i++) { - if (LOG.isTraceEnabled()) { - LOG.trace("obsResultMatrix length: " + obsResultMatrix.length); - LOG.trace("gpuResultMatrix length: " + gpuResultMatrix.length); - - cpuOut.println(Arrays.toString(obsResultMatrix[i])); - gpuOut.println(Arrays.toString(gpuResultMatrix[i])); - } - } - } finally { - if (cpuOut != null) { - cpuOut.flush(); - cpuOut.close(); - } - - if (gpuOut != null) { - gpuOut.flush(); - gpuOut.close(); - } - } - } - } -} +/** + * This material was prepared as an account of work sponsored by an agency of the United States Government. + * Neither the United States Government nor the United States Department of Energy, nor Battelle, nor any of + * their employees, nor any jurisdiction or organization that has cooperated in the development of these materials, + * makes any warranty, express or implied, or assumes any legal liability or responsibility for the accuracy, + * completeness, or usefulness or any information, apparatus, product, software, or process disclosed, or represents + * that its use would not infringe privately owned rights. + */ +package gov.pnnl.aparapi.test; + +import gov.pnnl.aparapi.matrix.CorrMatrixHost; + +import java.io.File; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Random; + +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.log4j.Logger; +import org.apache.lucene.util.OpenBitSet; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import com.aparapi.Kernel.EXECUTION_MODE; + +/** + * This test class performs the following functions: + * + * 1) Create a randomly populated set of matrices for correlation/co-occurrence computation + * 2) Execute the CPU-based computation using Lucene OpenBitSets + * 3) Execute the GPU-based computation using Aparapi CorrMatrix host and kernel + * 4) Verify the results of OpenBitSet and CorrMatrix by comparing matrices to each other + * + * @author ryan.lamothe at gmail.com + * + */ +public class CorrMatrixTest { + + private static final Logger LOG = Logger.getLogger(CorrMatrixTest.class); + + private final List<Pair<OpenBitSet, OpenBitSet>> obsPairs = new ArrayList<Pair<OpenBitSet, OpenBitSet>>();; + + private final Random rand = new Random(); + + private int[][] obsResultMatrix; + + /** + * NumTerms and NumLongs (documents) need to be adjusted manually right now to force 'striping' to occur (see Host code for details) + */ + @Before + public void setup() throws Exception { + /* + * Populate test data + */ + LOG.debug("----------"); + LOG.debug("Populating test matrix data using settings from build.xml..."); + LOG.debug("----------"); + + final int numTerms = Integer.getInteger("numRows", 300); // # Rows + // numLongs*64 for number of actual documents since these are 'packed' longs + final int numLongs = Integer.getInteger("numColumns", 10000); // # Columns + + for (int i = 0; i < numTerms; ++i) { + final long[] bits = new long[numLongs]; + for (int j = 0; j < numLongs; ++j) { + bits[j] = rand.nextLong(); + } + + obsPairs.add(i, new ImmutablePair<OpenBitSet, OpenBitSet>(new OpenBitSet(bits, numLongs), new OpenBitSet(bits, numLongs))); + } + + /* + * OpenBitSet calculations + */ + LOG.debug("Executing OpenBitSet intersectionCount"); + + final long startTime = System.currentTimeMillis(); + + obsResultMatrix = new int[obsPairs.size()][obsPairs.size()]; + + // This is an N^2 comparison loop + // FIXME This entire loop needs to be parallelized to show an apples-to-apples comparison to Aparapi + for (int i = 0; i < obsPairs.size(); i++) { + final Pair<OpenBitSet, OpenBitSet> docFreqVector1 = obsPairs.get(i); + + for (int j = 0; j < obsPairs.size(); j++) { + final Pair<OpenBitSet, OpenBitSet> docFreqVector2 = obsPairs.get(j); + + // # of matches in both sets of documents + final int result = (int) OpenBitSet.intersectionCount(docFreqVector1.getLeft(), docFreqVector2.getRight()); + obsResultMatrix[i][j] = result; + } + } + + final long endTime = System.currentTimeMillis() - startTime; + + LOG.debug("OpenBitSet Gross Execution Time: " + endTime + " ms <------OpenBitSet"); + LOG.debug("----------"); + } + + @Test + public void testCorrelationMatrix() throws Exception { + /* + * GPU calculations + */ + LOG.debug("Executing Aparapi intersectionCount"); + + final long[][] matrixA = new long[obsPairs.size()][]; + final long[][] matrixB = new long[obsPairs.size()][]; + + // Convert OpenBitSet pairs to long primitive arrays for use with Aparapi + // TODO It would be nice if we could find a way to put the obsPairs onto the GPU directly :) + for (int i = 0; i < obsPairs.size(); i++) { + final OpenBitSet obsA = obsPairs.get(i).getLeft(); + final OpenBitSet obsB = obsPairs.get(i).getRight(); + + matrixA[i] = obsA.getBits(); + matrixB[i] = obsB.getBits(); + } + + // The reason for setting this property is because the CorrMatrix host/kernel code + // came from a GUI where a user could select "Use Hardware Acceleration" instead + // of the application forcing the setting globally on the command-line + final int[][] gpuResultMatrix; + if (Boolean.getBoolean("useGPU")) { + gpuResultMatrix = CorrMatrixHost.intersectionMatrix(matrixA, matrixB, EXECUTION_MODE.GPU); + } else { + gpuResultMatrix = CorrMatrixHost.intersectionMatrix(matrixA, matrixB, EXECUTION_MODE.CPU); + } + + // Compare the two result arrays to make sure we are generating the same output + for (int i = 0; i < obsResultMatrix.length; i++) { + Assert.assertTrue("Arrays are not equal", Arrays.equals(obsResultMatrix[i], gpuResultMatrix[i])); + } + + // Visually compare/third-party tool compare if desired + if (LOG.isTraceEnabled()) { + // We're not using "try with resources" because Aparapi currently targets JDK 6 + final PrintWriter cpuOut = new PrintWriter(new File(System.getProperty("user.dir"), "trace/cpuOut.txt")); + final PrintWriter gpuOut = new PrintWriter(new File(System.getProperty("user.dir"), "trace/gpuOut.txt")); + + try { + for (int i = 0; i < obsResultMatrix.length; i++) { + if (LOG.isTraceEnabled()) { + LOG.trace("obsResultMatrix length: " + obsResultMatrix.length); + LOG.trace("gpuResultMatrix length: " + gpuResultMatrix.length); + + cpuOut.println(Arrays.toString(obsResultMatrix[i])); + gpuOut.println(Arrays.toString(gpuResultMatrix[i])); + } + } + } finally { + if (cpuOut != null) { + cpuOut.flush(); + cpuOut.close(); + } + + if (gpuOut != null) { + gpuOut.flush(); + gpuOut.close(); + } + } + } + } +} diff --git a/examples/movie/.project b/examples/movie/.project deleted file mode 100644 index 6be55d8c..00000000 --- a/examples/movie/.project +++ /dev/null @@ -1,17 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<projectDescription> - <name>movie</name> - <comment></comment> - <projects> - </projects> - <buildSpec> - <buildCommand> - <name>org.eclipse.jdt.core.javabuilder</name> - <arguments> - </arguments> - </buildCommand> - </buildSpec> - <natures> - <nature>org.eclipse.jdt.core.javanature</nature> - </natures> -</projectDescription> diff --git a/examples/movie/build.xml b/examples/movie/build.xml index bbfd7ae5..adea1f80 100644 --- a/examples/movie/build.xml +++ b/examples/movie/build.xml @@ -1,127 +1,127 @@ -<?xml version="1.0"?> - -<project name="movie" default="build" basedir="."> - - <target name="getjjmpeg-windows-i586" if="use.win32.jjmpeg"> - <delete dir="jjmpeg"/> - <mkdir dir="jjmpeg"/> - <get dest="jjmpeg"> - <url url="http://jjmpeg.googlecode.com/files/jjmpeg-0.0-bin.tar.bz2"/> - </get> - <untar src="jjmpeg/jjmpeg-0.0-bin.tar.bz2" compression="bzip2" dest="jjmpeg"/> - </target> - - <target name="getffmpeg-windows-i586" if="use.win32.ffmpeg"> - <delete dir="ffmpeg"/> - <mkdir dir="ffmpeg"/> - <get dest="ffmpeg"> - <url url="http://ffmpeg.zeranoe.com/builds/win32/shared/ffmpeg-git-9c2651a-win32-shared.7z"/> - </get> - <exec dir="ffmpeg" executable="C:\Program Files\7-Zip\7z.exe"> - <arg value="x"/> - <arg value="ffmpeg-git-9c2651a-win32-shared.7z"/> - </exec> - </target> - <target name="getjjmpeg-windows-amd64" if="use.win64.jjmpeg"> - <delete dir="jjmpeg"/> - <mkdir dir="jjmpeg"/> - <get dest="jjmpeg"> - <url url="http://jjmpeg.googlecode.com/files/jjmpeg-0.0-bin.tar.bz2"/> - </get> - <untar src="jjmpeg/jjmpeg-0.0-bin.tar.bz2" compression="bzip2" dest="jjmpeg"/> - </target> - <target name="getffmpeg-windows-amd64" if="use.win64.ffmpeg"> - <delete dir="ffmpeg"/> - <mkdir dir="ffmpeg"/> - <get dest="ffmpeg"> - <url url="http://ffmpeg.zeranoe.com/builds/win64/shared/ffmpeg-git-9c2651a-win64-shared.7z"/> - </get> - <exec dir="ffmpeg" executable="C:\Program Files\7-Zip\7z.exe"> - <arg value="x"/> - <arg value="ffmpeg-git-9c2651a-win64-shared.7z"/> - </exec> - </target> - - <target name="checkos"> - <condition property="use.win32.jjmpeg"> - <and> - <os family="windows" /> - <or> - <os arch="x86" /> - <os arch="i386" /> - </or> - <not><available file="jjmpeg/jjmpeg-0.0/native/mswin-i386/jjmpeg.dll"/> </not> - <not><available file="jjmpeg/jjmpeg-0.0/dist/jjmpeg.jar"/> </not> - </and> - </condition> - <condition property="use.win32.ffmpeg"> - <and> - <os family="windows" /> - <or> - <os arch="x86" /> - <os arch="i386" /> - </or> - <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\avcodec-53.dll"/> </not> - <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\avdevice-53.dll"/> </not> - <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\avfilter-53.dll"/> </not> - <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\avformat-53.dll"/> </not> - <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\avutil-51.dll"/> </not> - <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\postproc-51.dll"/> </not> - <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\swscale-2.dll"/> </not> - </and> - </condition> - <condition property="use.win64.jjmpeg"> - <and> - <os family="windows" /> - <not> - <or> - <os arch="x86" /> - <os arch="i386" /> - </or> - </not> - <not><available file="jjmpeg/jjmpeg-0.0/native/mswin-amd64/jjmpeg.dll"/> </not> - <not><available file="jjmpeg/jjmpeg-0.0/dist/jjmpeg.jar"/> </not> - </and> - </condition> - - <condition property="use.win64.ffmpeg"> - <and> - <os family="windows" /> - <not> - <or> - <os arch="x86" /> - <os arch="i386" /> - </or> - </not> - <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\avcodec-53.dll"/> </not> - <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\avdevice-53.dll"/> </not> - <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\avfilter-53.dll"/> </not> - <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\avformat-53.dll"/> </not> - <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\avutil-51.dll"/> </not> - <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\postproc-51.dll"/> </not> - <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\swscale-2.dll"/> </not> - </and> - </condition> - </target> - - <target name="getstuff" depends="checkos, getjjmpeg-windows-i586, getffmpeg-windows-i586, getjjmpeg-windows-amd64, getffmpeg-windows-amd64"/> - - <target name="build" depends="getstuff,clean"> - <mkdir dir="classes"/> - <javac srcdir="src" destdir="classes" debug="on" includeantruntime="false" > - <classpath> - <pathelement path="..\..\com.aparapi\dist\aparapi.jar"/> - <pathelement path="jjmpeg\jjmpeg-0.0\dist\jjmpeg.jar/"/> - <pathelement path="..\jviolajones\jviolajones.jar/"/> - </classpath> - </javac> - <jar jarfile="${ant.project.name}.jar" basedir="classes"/> - </target> - - <target name="clean"> - <delete dir="classes"/> - <delete file="${ant.project.name}.jar"/> - </target> - - -</project> +<?xml version="1.0"?> + +<project name="movie" default="build" basedir="."> + + <target name="getjjmpeg-windows-i586" if="use.win32.jjmpeg"> + <delete dir="jjmpeg"/> + <mkdir dir="jjmpeg"/> + <get dest="jjmpeg"> + <url url="http://jjmpeg.googlecode.com/files/jjmpeg-0.0-bin.tar.bz2"/> + </get> + <untar src="jjmpeg/jjmpeg-0.0-bin.tar.bz2" compression="bzip2" dest="jjmpeg"/> + </target> + + <target name="getffmpeg-windows-i586" if="use.win32.ffmpeg"> + <delete dir="ffmpeg"/> + <mkdir dir="ffmpeg"/> + <get dest="ffmpeg"> + <url url="http://ffmpeg.zeranoe.com/builds/win32/shared/ffmpeg-git-9c2651a-win32-shared.7z"/> + </get> + <exec dir="ffmpeg" executable="C:\Program Files\7-Zip\7z.exe"> + <arg value="x"/> + <arg value="ffmpeg-git-9c2651a-win32-shared.7z"/> + </exec> + </target> + <target name="getjjmpeg-windows-amd64" if="use.win64.jjmpeg"> + <delete dir="jjmpeg"/> + <mkdir dir="jjmpeg"/> + <get dest="jjmpeg"> + <url url="http://jjmpeg.googlecode.com/files/jjmpeg-0.0-bin.tar.bz2"/> + </get> + <untar src="jjmpeg/jjmpeg-0.0-bin.tar.bz2" compression="bzip2" dest="jjmpeg"/> + </target> + <target name="getffmpeg-windows-amd64" if="use.win64.ffmpeg"> + <delete dir="ffmpeg"/> + <mkdir dir="ffmpeg"/> + <get dest="ffmpeg"> + <url url="http://ffmpeg.zeranoe.com/builds/win64/shared/ffmpeg-git-9c2651a-win64-shared.7z"/> + </get> + <exec dir="ffmpeg" executable="C:\Program Files\7-Zip\7z.exe"> + <arg value="x"/> + <arg value="ffmpeg-git-9c2651a-win64-shared.7z"/> + </exec> + </target> + + <target name="checkos"> + <condition property="use.win32.jjmpeg"> + <and> + <os family="windows" /> + <or> + <os arch="x86" /> + <os arch="i386" /> + </or> + <not><available file="jjmpeg/jjmpeg-0.0/native/mswin-i386/jjmpeg.dll"/> </not> + <not><available file="jjmpeg/jjmpeg-0.0/dist/jjmpeg.jar"/> </not> + </and> + </condition> + <condition property="use.win32.ffmpeg"> + <and> + <os family="windows" /> + <or> + <os arch="x86" /> + <os arch="i386" /> + </or> + <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\avcodec-53.dll"/> </not> + <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\avdevice-53.dll"/> </not> + <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\avfilter-53.dll"/> </not> + <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\avformat-53.dll"/> </not> + <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\avutil-51.dll"/> </not> + <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\postproc-51.dll"/> </not> + <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\swscale-2.dll"/> </not> + </and> + </condition> + <condition property="use.win64.jjmpeg"> + <and> + <os family="windows" /> + <not> + <or> + <os arch="x86" /> + <os arch="i386" /> + </or> + </not> + <not><available file="jjmpeg/jjmpeg-0.0/native/mswin-amd64/jjmpeg.dll"/> </not> + <not><available file="jjmpeg/jjmpeg-0.0/dist/jjmpeg.jar"/> </not> + </and> + </condition> + + <condition property="use.win64.ffmpeg"> + <and> + <os family="windows" /> + <not> + <or> + <os arch="x86" /> + <os arch="i386" /> + </or> + </not> + <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\avcodec-53.dll"/> </not> + <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\avdevice-53.dll"/> </not> + <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\avfilter-53.dll"/> </not> + <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\avformat-53.dll"/> </not> + <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\avutil-51.dll"/> </not> + <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\postproc-51.dll"/> </not> + <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\swscale-2.dll"/> </not> + </and> + </condition> + </target> + + <target name="getstuff" depends="checkos, getjjmpeg-windows-i586, getffmpeg-windows-i586, getjjmpeg-windows-amd64, getffmpeg-windows-amd64"/> + + <target name="build" depends="getstuff,clean"> + <mkdir dir="classes"/> + <javac srcdir="src" destdir="classes" debug="on" includeantruntime="false" > + <classpath> + <pathelement path="..\..\com.aparapi\dist\aparapi.jar"/> + <pathelement path="jjmpeg\jjmpeg-0.0\dist\jjmpeg.jar/"/> + <pathelement path="..\jviolajones\jviolajones.jar/"/> + </classpath> + </javac> + <jar jarfile="${ant.project.name}.jar" basedir="classes"/> + </target> + + <target name="clean"> + <delete dir="classes"/> + <delete file="${ant.project.name}.jar"/> + </target> + + +</project> diff --git a/examples/movie/movie.bat b/examples/movie/movie.bat index 9273c3c2..5f07c629 100644 --- a/examples/movie/movie.bat +++ b/examples/movie/movie.bat @@ -1,15 +1,15 @@ -SETLOCAL -if /I %PROCESSOR_ARCHITECTURE%==x86 goto win32 -echo "win64!" -set PATH=%PATH%;ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin -set PATH=%PATH%;jjmpeg\jjmpeg-0.0\native\mswin-amd64 -goto win64 -:win32 -echo "win32!" -set PATH=%PATH%;ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin -set PATH=%PATH%;jjmpeg\jjmpeg-0.0\native\mswin-i386 -:win64 -set PATH=%PATH%;..\..\com.aparapi.jni\dist -java -classpath jjmpeg\jjmpeg-0.0\dist\jjmpeg.jar;..\..\com.aparapi\dist\aparapi.jar;movie.jar; com.aparapi.examples.movie.%1 %2 - -ENDLOCAL +SETLOCAL +if /I %PROCESSOR_ARCHITECTURE%==x86 goto win32 +echo "win64!" +set PATH=%PATH%;ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin +set PATH=%PATH%;jjmpeg\jjmpeg-0.0\native\mswin-amd64 +goto win64 +:win32 +echo "win32!" +set PATH=%PATH%;ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin +set PATH=%PATH%;jjmpeg\jjmpeg-0.0\native\mswin-i386 +:win64 +set PATH=%PATH%;..\..\com.aparapi.jni\dist +java -classpath jjmpeg\jjmpeg-0.0\dist\jjmpeg.jar;..\..\com.aparapi\dist\aparapi.jar;movie.jar; com.aparapi.examples.movie.%1 %2 + +ENDLOCAL diff --git a/examples/movie/src/com/amd/aparapi/examples/movie/AparapiSolution.java b/examples/movie/src/com/amd/aparapi/examples/movie/AparapiSolution.java index 29fdc946..aed1d6f4 100644 --- a/examples/movie/src/com/amd/aparapi/examples/movie/AparapiSolution.java +++ b/examples/movie/src/com/amd/aparapi/examples/movie/AparapiSolution.java @@ -1,130 +1,130 @@ -/* -Copyright (c) 2010-2011, Advanced Micro Devices, Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, are permitted provided that the -following conditions are met: - -Redistributions of source code must retain the above copyright notice, this list of conditions and the following -disclaimer. - -Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following -disclaimer in the documentation and/or other materials provided with the distribution. - -Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, -INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export -laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through -774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, -you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of -Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration -Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1, -E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups -D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject -to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774 -of EAR). For the most current Country Group listings, or for additional information about the EAR or your obligations -under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. - -*/ -package com.aparapi.examples.movie; - -import java.awt.Graphics2D; -import java.awt.image.BufferedImage; -import java.awt.image.DataBufferByte; - -import com.aparapi.Kernel; -import com.aparapi.Range; - -public class AparapiSolution{ - - public static class AparapiConvolution extends Kernel{ - - private byte[] inputData; - - private byte[] outputData; - - private int width; - - private int height; - - private Range range; - - float[] convMatrix3x3; - - public AparapiConvolution(BufferedImage _imageIn, BufferedImage _imageOut) { - inputData = ((DataBufferByte) _imageIn.getRaster().getDataBuffer()).getData(); - outputData = ((DataBufferByte) _imageOut.getRaster().getDataBuffer()).getData(); - width = _imageIn.getWidth(); - height = _imageIn.getHeight(); - range = Range.create2D(width * 3, height); - setExplicit(true); - - } - - public void processPixel(int x, int y, int w, int h) { - float accum = 0; - int count = 0; - for (int dx = -3; dx < 6; dx += 3) { - for (int dy = -1; dy < 2; dy += 1) { - int rgb = 0xff & inputData[((y + dy) * w) + (x + dx)]; - accum += rgb * convMatrix3x3[count++]; - } - } - outputData[y * w + x] = (byte) Math.max(0, Math.min((int) accum, 255)); - } - - public void run() { - int x = getGlobalId(0); - int y = getGlobalId(1); - int w = getGlobalSize(0); - int h = getGlobalSize(1); - if (x > 3 && x < (w - 3) && y > 1 && y < (h - 1)) { - processPixel(x, y, w, h); - } else { - outputData[y * w + x] = inputData[(y * w) + x]; - } - } - - public void apply(float[] _convMatrix3x3) { - convMatrix3x3 = _convMatrix3x3; - put(convMatrix3x3).put(inputData).execute(range).get(outputData); - } - - } - - public static void main(final String[] _args) { - String fileName = _args.length == 1 ? _args[0] : "Leo720p.wmv"; - - float[] convMatrix3x3 = new float[] { - 0f, - -10f, - 0f, - -10f, - 41f, - -10f, - 0f, - -10f, - 0f - }; - new JJMPEGPlayer("Aparapi - Solution", fileName, convMatrix3x3){ - AparapiConvolution kernel = null; - - @Override protected void processFrame(Graphics2D gc, float[] _convMatrix3x3, BufferedImage in, BufferedImage out) { - if (kernel == null) { - kernel = new AparapiConvolution(in, out); - } - kernel.apply(_convMatrix3x3); - } - }; - - } -} +/* +Copyright (c) 2010-2011, Advanced Micro Devices, Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the +following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list of conditions and the following +disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following +disclaimer in the documentation and/or other materials provided with the distribution. + +Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export +laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through +774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, +you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of +Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration +Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1, +E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups +D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject +to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774 +of EAR). For the most current Country Group listings, or for additional information about the EAR or your obligations +under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. + +*/ +package com.aparapi.examples.movie; + +import java.awt.Graphics2D; +import java.awt.image.BufferedImage; +import java.awt.image.DataBufferByte; + +import com.aparapi.Kernel; +import com.aparapi.Range; + +public class AparapiSolution{ + + public static class AparapiConvolution extends Kernel{ + + private byte[] inputData; + + private byte[] outputData; + + private int width; + + private int height; + + private Range range; + + float[] convMatrix3x3; + + public AparapiConvolution(BufferedImage _imageIn, BufferedImage _imageOut) { + inputData = ((DataBufferByte) _imageIn.getRaster().getDataBuffer()).getData(); + outputData = ((DataBufferByte) _imageOut.getRaster().getDataBuffer()).getData(); + width = _imageIn.getWidth(); + height = _imageIn.getHeight(); + range = Range.create2D(width * 3, height); + setExplicit(true); + + } + + public void processPixel(int x, int y, int w, int h) { + float accum = 0; + int count = 0; + for (int dx = -3; dx < 6; dx += 3) { + for (int dy = -1; dy < 2; dy += 1) { + int rgb = 0xff & inputData[((y + dy) * w) + (x + dx)]; + accum += rgb * convMatrix3x3[count++]; + } + } + outputData[y * w + x] = (byte) Math.max(0, Math.min((int) accum, 255)); + } + + public void run() { + int x = getGlobalId(0); + int y = getGlobalId(1); + int w = getGlobalSize(0); + int h = getGlobalSize(1); + if (x > 3 && x < (w - 3) && y > 1 && y < (h - 1)) { + processPixel(x, y, w, h); + } else { + outputData[y * w + x] = inputData[(y * w) + x]; + } + } + + public void apply(float[] _convMatrix3x3) { + convMatrix3x3 = _convMatrix3x3; + put(convMatrix3x3).put(inputData).execute(range).get(outputData); + } + + } + + public static void main(final String[] _args) { + String fileName = _args.length == 1 ? _args[0] : "Leo720p.wmv"; + + float[] convMatrix3x3 = new float[] { + 0f, + -10f, + 0f, + -10f, + 41f, + -10f, + 0f, + -10f, + 0f + }; + new JJMPEGPlayer("Aparapi - Solution", fileName, convMatrix3x3){ + AparapiConvolution kernel = null; + + @Override protected void processFrame(Graphics2D gc, float[] _convMatrix3x3, BufferedImage in, BufferedImage out) { + if (kernel == null) { + kernel = new AparapiConvolution(in, out); + } + kernel.apply(_convMatrix3x3); + } + }; + + } +} diff --git a/examples/movie/src/com/amd/aparapi/examples/movie/ConvMatrix3x3Editor.java b/examples/movie/src/com/amd/aparapi/examples/movie/ConvMatrix3x3Editor.java index d3140806..2d3c2c28 100644 --- a/examples/movie/src/com/amd/aparapi/examples/movie/ConvMatrix3x3Editor.java +++ b/examples/movie/src/com/amd/aparapi/examples/movie/ConvMatrix3x3Editor.java @@ -1,151 +1,151 @@ -/* -Copyright (c) 2010-2011, Advanced Micro Devices, Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, are permitted provided that the -following conditions are met: - -Redistributions of source code must retain the above copyright notice, this list of conditions and the following -disclaimer. - -Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following -disclaimer in the documentation and/or other materials provided with the distribution. - -Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, -INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export -laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through -774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, -you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of -Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration -Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1, -E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups -D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject -to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774 -of EAR). For the most current Country Group listings, or for additional information about the EAR or your obligations -under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. - -*/ - -package com.aparapi.examples.movie; - -import java.awt.BorderLayout; -import java.awt.Component; -import java.awt.GridLayout; -import java.awt.event.ActionEvent; -import java.awt.event.ActionListener; -import java.util.Arrays; - -import javax.swing.BoxLayout; -import javax.swing.JComboBox; -import javax.swing.JPanel; -import javax.swing.JSpinner; -import javax.swing.SpinnerModel; -import javax.swing.SpinnerNumberModel; -import javax.swing.event.ChangeEvent; -import javax.swing.event.ChangeListener; - -public class ConvMatrix3x3Editor{ - Component component; - - float[] default3x3; - - float[] none3x3 = new float[] { - 0, - 0, - 0, - 0, - 1, - 0, - 0, - 0, - 0 - }; - - float[] blur3x3 = new float[] { - .1f, - .1f, - .1f, - .1f, - .1f, - .1f, - .1f, - .1f, - .1f - }; - - JSpinner[] spinners = new JSpinner[9]; - - protected void updated(float[] _convMatrix3x3) { - - }; - - void set(float[] _to, float[] _from) { - for (int i = 0; i < 9; i++) { - _to[i] = _from[i]; - spinners[i].setValue((Double) (double) _to[i]); - - } - updated(_to); - } - - ConvMatrix3x3Editor(final float[] _convMatrix3x3) { - default3x3 = Arrays.copyOf(_convMatrix3x3, _convMatrix3x3.length); - JPanel leftPanel = new JPanel(); - JPanel controlPanel = new JPanel(); - BoxLayout layout = new BoxLayout(controlPanel, BoxLayout.Y_AXIS); - controlPanel.setLayout(layout); - component = leftPanel; - JPanel grid3x3Panel = new JPanel(); - controlPanel.add(grid3x3Panel); - grid3x3Panel.setLayout(new GridLayout(3, 3)); - for (int i = 0; i < 9; i++) { - final int index = i; - SpinnerModel model = new SpinnerNumberModel(_convMatrix3x3[index], -50f, 50f, 1f); - JSpinner spinner = new JSpinner(model); - spinners[i] = spinner; - spinner.addChangeListener(new ChangeListener(){ - public void stateChanged(ChangeEvent ce) { - JSpinner source = (JSpinner) ce.getSource(); - double value = ((Double) source.getValue()); - _convMatrix3x3[index] = (float) value; - updated(_convMatrix3x3); - } - }); - grid3x3Panel.add(spinner); - } - String[] options = new String[] { - "DEFAULT", - "NONE", - "BLUR" - }; - JComboBox combo = new JComboBox(options); - combo.addActionListener(new ActionListener(){ - - @Override public void actionPerformed(ActionEvent e) { - JComboBox cb = (JComboBox) e.getSource(); - String value = (String) cb.getSelectedItem(); - if (value.equals("DEFAULT")) { - set(_convMatrix3x3, default3x3); - } else if (value.equals("NONE")) { - set(_convMatrix3x3, none3x3); - } else if (value.equals("BLUR")) { - set(_convMatrix3x3, blur3x3); - } - } - - }); - controlPanel.add(combo); - - leftPanel.add(controlPanel, BorderLayout.NORTH); - } -} +/* +Copyright (c) 2010-2011, Advanced Micro Devices, Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the +following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list of conditions and the following +disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following +disclaimer in the documentation and/or other materials provided with the distribution. + +Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export +laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through +774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, +you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of +Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration +Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1, +E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups +D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject +to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774 +of EAR). For the most current Country Group listings, or for additional information about the EAR or your obligations +under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. + +*/ + +package com.aparapi.examples.movie; + +import java.awt.BorderLayout; +import java.awt.Component; +import java.awt.GridLayout; +import java.awt.event.ActionEvent; +import java.awt.event.ActionListener; +import java.util.Arrays; + +import javax.swing.BoxLayout; +import javax.swing.JComboBox; +import javax.swing.JPanel; +import javax.swing.JSpinner; +import javax.swing.SpinnerModel; +import javax.swing.SpinnerNumberModel; +import javax.swing.event.ChangeEvent; +import javax.swing.event.ChangeListener; + +public class ConvMatrix3x3Editor{ + Component component; + + float[] default3x3; + + float[] none3x3 = new float[] { + 0, + 0, + 0, + 0, + 1, + 0, + 0, + 0, + 0 + }; + + float[] blur3x3 = new float[] { + .1f, + .1f, + .1f, + .1f, + .1f, + .1f, + .1f, + .1f, + .1f + }; + + JSpinner[] spinners = new JSpinner[9]; + + protected void updated(float[] _convMatrix3x3) { + + }; + + void set(float[] _to, float[] _from) { + for (int i = 0; i < 9; i++) { + _to[i] = _from[i]; + spinners[i].setValue((Double) (double) _to[i]); + + } + updated(_to); + } + + ConvMatrix3x3Editor(final float[] _convMatrix3x3) { + default3x3 = Arrays.copyOf(_convMatrix3x3, _convMatrix3x3.length); + JPanel leftPanel = new JPanel(); + JPanel controlPanel = new JPanel(); + BoxLayout layout = new BoxLayout(controlPanel, BoxLayout.Y_AXIS); + controlPanel.setLayout(layout); + component = leftPanel; + JPanel grid3x3Panel = new JPanel(); + controlPanel.add(grid3x3Panel); + grid3x3Panel.setLayout(new GridLayout(3, 3)); + for (int i = 0; i < 9; i++) { + final int index = i; + SpinnerModel model = new SpinnerNumberModel(_convMatrix3x3[index], -50f, 50f, 1f); + JSpinner spinner = new JSpinner(model); + spinners[i] = spinner; + spinner.addChangeListener(new ChangeListener(){ + public void stateChanged(ChangeEvent ce) { + JSpinner source = (JSpinner) ce.getSource(); + double value = ((Double) source.getValue()); + _convMatrix3x3[index] = (float) value; + updated(_convMatrix3x3); + } + }); + grid3x3Panel.add(spinner); + } + String[] options = new String[] { + "DEFAULT", + "NONE", + "BLUR" + }; + JComboBox combo = new JComboBox(options); + combo.addActionListener(new ActionListener(){ + + @Override public void actionPerformed(ActionEvent e) { + JComboBox cb = (JComboBox) e.getSource(); + String value = (String) cb.getSelectedItem(); + if (value.equals("DEFAULT")) { + set(_convMatrix3x3, default3x3); + } else if (value.equals("NONE")) { + set(_convMatrix3x3, none3x3); + } else if (value.equals("BLUR")) { + set(_convMatrix3x3, blur3x3); + } + } + + }); + controlPanel.add(combo); + + leftPanel.add(controlPanel, BorderLayout.NORTH); + } +} diff --git a/examples/movie/src/com/amd/aparapi/examples/movie/JJMPEGPlayer.java b/examples/movie/src/com/amd/aparapi/examples/movie/JJMPEGPlayer.java index 79f546b6..7404255d 100644 --- a/examples/movie/src/com/amd/aparapi/examples/movie/JJMPEGPlayer.java +++ b/examples/movie/src/com/amd/aparapi/examples/movie/JJMPEGPlayer.java @@ -1,147 +1,147 @@ -/* -Copyright (c) 2010-2011, Advanced Micro Devices, Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, are permitted provided that the -following conditions are met: - -Redistributions of source code must retain the above copyright notice, this list of conditions and the following -disclaimer. - -Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following -disclaimer in the documentation and/or other materials provided with the distribution. - -Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, -INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export -laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through -774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, -you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of -Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration -Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1, -E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups -D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject -to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774 -of EAR). For the most current Country Group listings, or for additional information about the EAR or your obligations -under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. - -*/ -package com.aparapi.examples.movie; - -import java.awt.BorderLayout; -import java.awt.Graphics; -import java.awt.Graphics2D; -import java.awt.image.BufferedImage; -import java.util.logging.Level; -import java.util.logging.Logger; - -import javax.swing.ImageIcon; -import javax.swing.JFrame; -import javax.swing.JLabel; -import javax.swing.SwingUtilities; - -import au.notzed.jjmpeg.io.JJMediaReader; -import au.notzed.jjmpeg.io.JJMediaReader.JJReaderVideo; - -/** - * Code based on Demo of JJVideoScanner class from jjmpeg - * - * See http://code.google.com/p/jjmpeg/ - * - * @author notzed - * @author gfrost - */ -public abstract class JJMPEGPlayer{ - - public JJMPEGPlayer(final String _title, final String _fileName, final float[] _convMatrix3x3) { - SwingUtilities.invokeLater(new Runnable(){ - final Object doorBell = new Object(); - - public void run() { - - JFrame frame = new JFrame(_title); - frame.getContentPane().setLayout(new BorderLayout()); - final JLabel label = new JLabel(){ - @Override public void paint(Graphics GC) { - super.paint(GC); - synchronized (doorBell) { - doorBell.notify(); - } - } - }; - frame.getContentPane().add(label, BorderLayout.CENTER); - - ConvMatrix3x3Editor editor = new ConvMatrix3x3Editor(_convMatrix3x3){ - @Override protected void updated(float[] _convMatrix3x3) { - - } - }; - frame.getContentPane().add(editor.component, BorderLayout.WEST); - - try { - final JJMediaReader reader = new JJMediaReader(_fileName); - final JJReaderVideo vs = reader.openFirstVideoStream(); - final BufferedImage in = vs.createImage(); - final BufferedImage out = vs.createImage(); - - label.setIcon(new ImageIcon(out)); - - new Thread(new Runnable(){ - public void run() { - int frames = 0; - long start = System.currentTimeMillis() - 1; - try { - while (true) { - JJMediaReader.JJReaderStream rs = reader.readFrame(); - if (rs != null) { - vs.getOutputFrame(in); - Graphics2D gc = in.createGraphics(); - frames++; - long fps = (frames * 1000) / (System.currentTimeMillis() - start); - gc.drawString("" + fps, 20, 20); - - processFrame(gc, _convMatrix3x3, in, out); - - label.repaint(); - synchronized (doorBell) { - try { - doorBell.wait(); - } catch (InterruptedException ie) { - ie.getStackTrace(); - } - } - } else { - reader.dispose(); - System.exit(1); - } - Thread.sleep(1); - } - } catch (Exception ex) { - ex.printStackTrace(); - Logger.getLogger(JJMPEGPlayer.class.getName()).log(Level.SEVERE, null, ex); - } - } - }).start(); - frame.pack(); - frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); - frame.setVisible(true); - } catch (Exception ex) { - Logger.getLogger(JJMPEGPlayer.class.getName()).log(Level.SEVERE, null, ex); - } - - } - }); - } - - protected abstract void processFrame(Graphics2D gc, float[] _convMatrix, BufferedImage in, BufferedImage _out); - -} +/* +Copyright (c) 2010-2011, Advanced Micro Devices, Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the +following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list of conditions and the following +disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following +disclaimer in the documentation and/or other materials provided with the distribution. + +Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export +laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through +774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, +you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of +Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration +Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1, +E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups +D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject +to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774 +of EAR). For the most current Country Group listings, or for additional information about the EAR or your obligations +under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. + +*/ +package com.aparapi.examples.movie; + +import java.awt.BorderLayout; +import java.awt.Graphics; +import java.awt.Graphics2D; +import java.awt.image.BufferedImage; +import java.util.logging.Level; +import java.util.logging.Logger; + +import javax.swing.ImageIcon; +import javax.swing.JFrame; +import javax.swing.JLabel; +import javax.swing.SwingUtilities; + +import au.notzed.jjmpeg.io.JJMediaReader; +import au.notzed.jjmpeg.io.JJMediaReader.JJReaderVideo; + +/** + * Code based on Demo of JJVideoScanner class from jjmpeg + * + * See http://code.google.com/p/jjmpeg/ + * + * @author notzed + * @author gfrost + */ +public abstract class JJMPEGPlayer{ + + public JJMPEGPlayer(final String _title, final String _fileName, final float[] _convMatrix3x3) { + SwingUtilities.invokeLater(new Runnable(){ + final Object doorBell = new Object(); + + public void run() { + + JFrame frame = new JFrame(_title); + frame.getContentPane().setLayout(new BorderLayout()); + final JLabel label = new JLabel(){ + @Override public void paint(Graphics GC) { + super.paint(GC); + synchronized (doorBell) { + doorBell.notify(); + } + } + }; + frame.getContentPane().add(label, BorderLayout.CENTER); + + ConvMatrix3x3Editor editor = new ConvMatrix3x3Editor(_convMatrix3x3){ + @Override protected void updated(float[] _convMatrix3x3) { + + } + }; + frame.getContentPane().add(editor.component, BorderLayout.WEST); + + try { + final JJMediaReader reader = new JJMediaReader(_fileName); + final JJReaderVideo vs = reader.openFirstVideoStream(); + final BufferedImage in = vs.createImage(); + final BufferedImage out = vs.createImage(); + + label.setIcon(new ImageIcon(out)); + + new Thread(new Runnable(){ + public void run() { + int frames = 0; + long start = System.currentTimeMillis() - 1; + try { + while (true) { + JJMediaReader.JJReaderStream rs = reader.readFrame(); + if (rs != null) { + vs.getOutputFrame(in); + Graphics2D gc = in.createGraphics(); + frames++; + long fps = (frames * 1000) / (System.currentTimeMillis() - start); + gc.drawString("" + fps, 20, 20); + + processFrame(gc, _convMatrix3x3, in, out); + + label.repaint(); + synchronized (doorBell) { + try { + doorBell.wait(); + } catch (InterruptedException ie) { + ie.getStackTrace(); + } + } + } else { + reader.dispose(); + System.exit(1); + } + Thread.sleep(1); + } + } catch (Exception ex) { + ex.printStackTrace(); + Logger.getLogger(JJMPEGPlayer.class.getName()).log(Level.SEVERE, null, ex); + } + } + }).start(); + frame.pack(); + frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); + frame.setVisible(true); + } catch (Exception ex) { + Logger.getLogger(JJMPEGPlayer.class.getName()).log(Level.SEVERE, null, ex); + } + + } + }); + } + + protected abstract void processFrame(Graphics2D gc, float[] _convMatrix, BufferedImage in, BufferedImage _out); + +} diff --git a/examples/movie/src/com/amd/aparapi/examples/movie/PureJavaSolution.java b/examples/movie/src/com/amd/aparapi/examples/movie/PureJavaSolution.java index a7b32024..db6fb6f2 100644 --- a/examples/movie/src/com/amd/aparapi/examples/movie/PureJavaSolution.java +++ b/examples/movie/src/com/amd/aparapi/examples/movie/PureJavaSolution.java @@ -1,35 +1,35 @@ -package com.aparapi.examples.movie; - -import java.awt.Graphics2D; -import java.awt.image.ConvolveOp; - -import java.awt.image.BufferedImage; - -public class PureJavaSolution{ - - public static void main(final String[] _args) { - String fileName = _args.length == 1 ? _args[0] : "Leo720p.wmv"; - - float[] convMatrix3x3 = new float[] { - 0f, - -10f, - 0f, - -10f, - 41f, - -10f, - 0f, - -10f, - 0f - }; - - new JJMPEGPlayer("lab_6.alternate", fileName, convMatrix3x3){ - - @Override protected void processFrame(Graphics2D _gc, float[] _convMatrix3x3, BufferedImage _in, BufferedImage _out) { - java.awt.image.Kernel conv = new java.awt.image.Kernel(3, 3, _convMatrix3x3); - ConvolveOp convOp = new ConvolveOp(conv, ConvolveOp.EDGE_NO_OP, null); - convOp.filter(_in, _out); - } - }; - - } -} +package com.aparapi.examples.movie; + +import java.awt.Graphics2D; +import java.awt.image.ConvolveOp; + +import java.awt.image.BufferedImage; + +public class PureJavaSolution{ + + public static void main(final String[] _args) { + String fileName = _args.length == 1 ? _args[0] : "Leo720p.wmv"; + + float[] convMatrix3x3 = new float[] { + 0f, + -10f, + 0f, + -10f, + 41f, + -10f, + 0f, + -10f, + 0f + }; + + new JJMPEGPlayer("lab_6.alternate", fileName, convMatrix3x3){ + + @Override protected void processFrame(Graphics2D _gc, float[] _convMatrix3x3, BufferedImage _in, BufferedImage _out) { + java.awt.image.Kernel conv = new java.awt.image.Kernel(3, 3, _convMatrix3x3); + ConvolveOp convOp = new ConvolveOp(conv, ConvolveOp.EDGE_NO_OP, null); + convOp.filter(_in, _out); + } + }; + + } +} diff --git a/examples/movie/src/com/amd/aparapi/examples/movie/ReferenceSolution.java b/examples/movie/src/com/amd/aparapi/examples/movie/ReferenceSolution.java index 1c39d1d0..805ff4da 100644 --- a/examples/movie/src/com/amd/aparapi/examples/movie/ReferenceSolution.java +++ b/examples/movie/src/com/amd/aparapi/examples/movie/ReferenceSolution.java @@ -1,136 +1,136 @@ -/* -Copyright (c) 2010-2011, Advanced Micro Devices, Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, are permitted provided that the -following conditions are met: - -Redistributions of source code must retain the above copyright notice, this list of conditions and the following -disclaimer. - -Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following -disclaimer in the documentation and/or other materials provided with the distribution. - -Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, -INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export -laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through -774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, -you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of -Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration -Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1, -E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups -D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject -to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774 -of EAR). For the most current Country Group listings, or for additional information about the EAR or your obligations -under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. - -*/ -package com.aparapi.examples.movie; - -import java.awt.Graphics2D; -import java.awt.image.BufferedImage; -import java.awt.image.DataBufferByte; - -import com.aparapi.Kernel; -import com.aparapi.Range; - -public class ReferenceSolution{ - - public static class Convolution extends Kernel{ - - private byte[] inputData; - - private byte[] outputData; - - private int width; - - private int height; - - private Range range; - - float[] convMatrix3x3; - - public Convolution(BufferedImage _imageIn, BufferedImage _imageOut) { - inputData = ((DataBufferByte) _imageIn.getRaster().getDataBuffer()).getData(); - outputData = ((DataBufferByte) _imageOut.getRaster().getDataBuffer()).getData(); - width = _imageIn.getWidth(); - height = _imageIn.getHeight(); - range = Range.create2D(width * 3, height); - setExplicit(true); - - } - - public void processPixel(int x, int y, int w, int h) { - float accum = 0; - int count = 0; - for (int dx = -3; dx < 6; dx += 3) { - for (int dy = -1; dy < 2; dy += 1) { - int rgb = 0xff & inputData[((y + dy) * w) + (x + dx)]; - accum += rgb * convMatrix3x3[count++]; - } - } - outputData[y * w + x] = (byte) Math.max(0, Math.min((int) accum, 255)); - } - - public void run() { - int x = getGlobalId(0); - int y = getGlobalId(1); - int w = getGlobalSize(0); - int h = getGlobalSize(1); - if (x > 3 && x < (w - 3) && y > 1 && y < (h - 1)) { - processPixel(x, y, w, h); - } else { - outputData[y * w + x] = inputData[(y * w) + x]; - } - } - - public void apply(float[] _convMatrix3x3) { - convMatrix3x3 = _convMatrix3x3; - for (int x = 0; x < width * 3; x++) { - for (int y = 0; y < height; y++) { - if (x > 3 && x < (width * 3 - 3) && y > 1 && y < (height - 1)) { - processPixel(x, y, width * 3, height); - } - } - } - } - - } - - public static void main(final String[] _args) { - String fileName = _args.length == 1 ? _args[0] : "Leo720p.wmv"; - - float[] convMatrix3x3 = new float[] { - 0f, - -10f, - 0f, - -10f, - 41f, - -10f, - 0f, - -10f, - 0f - }; - new JJMPEGPlayer("Aparapi - Solution", fileName, convMatrix3x3){ - Convolution kernel = null; - - @Override protected void processFrame(Graphics2D gc, float[] _convMatrix3x3, BufferedImage in, BufferedImage out) { - if (kernel == null) { - kernel = new Convolution(in, out); - } - kernel.apply(_convMatrix3x3); - } - }; - - } -} +/* +Copyright (c) 2010-2011, Advanced Micro Devices, Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the +following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list of conditions and the following +disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following +disclaimer in the documentation and/or other materials provided with the distribution. + +Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export +laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through +774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, +you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of +Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration +Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1, +E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups +D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject +to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774 +of EAR). For the most current Country Group listings, or for additional information about the EAR or your obligations +under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. + +*/ +package com.aparapi.examples.movie; + +import java.awt.Graphics2D; +import java.awt.image.BufferedImage; +import java.awt.image.DataBufferByte; + +import com.aparapi.Kernel; +import com.aparapi.Range; + +public class ReferenceSolution{ + + public static class Convolution extends Kernel{ + + private byte[] inputData; + + private byte[] outputData; + + private int width; + + private int height; + + private Range range; + + float[] convMatrix3x3; + + public Convolution(BufferedImage _imageIn, BufferedImage _imageOut) { + inputData = ((DataBufferByte) _imageIn.getRaster().getDataBuffer()).getData(); + outputData = ((DataBufferByte) _imageOut.getRaster().getDataBuffer()).getData(); + width = _imageIn.getWidth(); + height = _imageIn.getHeight(); + range = Range.create2D(width * 3, height); + setExplicit(true); + + } + + public void processPixel(int x, int y, int w, int h) { + float accum = 0; + int count = 0; + for (int dx = -3; dx < 6; dx += 3) { + for (int dy = -1; dy < 2; dy += 1) { + int rgb = 0xff & inputData[((y + dy) * w) + (x + dx)]; + accum += rgb * convMatrix3x3[count++]; + } + } + outputData[y * w + x] = (byte) Math.max(0, Math.min((int) accum, 255)); + } + + public void run() { + int x = getGlobalId(0); + int y = getGlobalId(1); + int w = getGlobalSize(0); + int h = getGlobalSize(1); + if (x > 3 && x < (w - 3) && y > 1 && y < (h - 1)) { + processPixel(x, y, w, h); + } else { + outputData[y * w + x] = inputData[(y * w) + x]; + } + } + + public void apply(float[] _convMatrix3x3) { + convMatrix3x3 = _convMatrix3x3; + for (int x = 0; x < width * 3; x++) { + for (int y = 0; y < height; y++) { + if (x > 3 && x < (width * 3 - 3) && y > 1 && y < (height - 1)) { + processPixel(x, y, width * 3, height); + } + } + } + } + + } + + public static void main(final String[] _args) { + String fileName = _args.length == 1 ? _args[0] : "Leo720p.wmv"; + + float[] convMatrix3x3 = new float[] { + 0f, + -10f, + 0f, + -10f, + 41f, + -10f, + 0f, + -10f, + 0f + }; + new JJMPEGPlayer("Aparapi - Solution", fileName, convMatrix3x3){ + Convolution kernel = null; + + @Override protected void processFrame(Graphics2D gc, float[] _convMatrix3x3, BufferedImage in, BufferedImage out) { + if (kernel == null) { + kernel = new Convolution(in, out); + } + kernel.apply(_convMatrix3x3); + } + }; + + } +} diff --git a/pom.xml b/pom.xml index aa62853f..c9a2738d 100644 --- a/pom.xml +++ b/pom.xml @@ -83,9 +83,14 @@ <artifactId>aparapi</artifactId> <version>1.4.1-SNAPSHOT</version> </dependency> + <dependency> + <groupId>org.jogamp.gluegen</groupId> + <artifactId>gluegen-rt-main</artifactId> + <version>2.3.2</version> + </dependency> <dependency> <groupId>org.jogamp.jogl</groupId> - <artifactId>jogl-all</artifactId> + <artifactId>jogl-all-main</artifactId> <version>2.3.2</version> </dependency> <dependency> diff --git a/src/main/java/com/aparapi/examples/convolution/Convolution.java b/src/main/java/com/aparapi/examples/convolution/Convolution.java index 29ffe62f..243ec0a5 100644 --- a/src/main/java/com/aparapi/examples/convolution/Convolution.java +++ b/src/main/java/com/aparapi/examples/convolution/Convolution.java @@ -66,12 +66,17 @@ package com.aparapi.examples.convolution; import com.aparapi.*; import java.io.*; +import java.net.URISyntaxException; public class Convolution { public static void main(final String[] _args) throws IOException { - - final File file = new File(_args.length == 1 ? _args[0] : "./src/main/resources/testcard.jpg").getCanonicalFile(); + final File file; + try{ + file = new File(Convolution.class.getResource("/testcard.jpg").toURI()); + } catch (URISyntaxException e) { + throw new IllegalStateException("could not get testcard", e); + } final ImageConvolution convolution = new ImageConvolution(); diff --git a/src/main/java/com/aparapi/examples/convolution/ConvolutionOpenCL.java b/src/main/java/com/aparapi/examples/convolution/ConvolutionOpenCL.java index 676ed4f2..95d511b7 100644 --- a/src/main/java/com/aparapi/examples/convolution/ConvolutionOpenCL.java +++ b/src/main/java/com/aparapi/examples/convolution/ConvolutionOpenCL.java @@ -70,6 +70,7 @@ import com.aparapi.opencl.*; import com.aparapi.opencl.OpenCL.*; import java.io.*; +import java.net.URISyntaxException; public class ConvolutionOpenCL{ @@ -84,7 +85,12 @@ public class ConvolutionOpenCL{ } public static void main(final String[] _args) { - final File file = new File(_args.length == 1 ? _args[0] : "./src/main/resources/testcard.jpg"); + final File file; + try{ + file = (_args.length >= 1 ? new File(_args[0]) : new File(ConvolutionOpenCL.class.getResource("/testcard.jpg").toURI())); + } catch (URISyntaxException e) { + throw new IllegalStateException("could not get testcard", e); + } final OpenCLDevice openclDevice = (OpenCLDevice) KernelManager.instance().bestDevice(); diff --git a/src/main/java/com/aparapi/examples/convolution/PureJava.java b/src/main/java/com/aparapi/examples/convolution/PureJava.java index ab37dc03..a4170ca8 100644 --- a/src/main/java/com/aparapi/examples/convolution/PureJava.java +++ b/src/main/java/com/aparapi/examples/convolution/PureJava.java @@ -64,6 +64,7 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit package com.aparapi.examples.convolution; import java.io.File; +import java.net.URISyntaxException; import com.aparapi.Kernel; @@ -114,7 +115,12 @@ public class PureJava{ } public static void main(final String[] _args) { - File file = new File(_args.length == 1 ? _args[0] : "./src/main/resources/testcard.jpg"); + final File file; + try{ + file = new File(PureJava.class.getResource("/testcard.jpg").toURI()); + } catch (URISyntaxException e) { + throw new IllegalStateException("could not get testcard", e); + } final ImageConvolution convolution = new ImageConvolution(); diff --git a/src/main/java/com/aparapi/examples/convolution/convolution.cl b/src/main/java/com/aparapi/examples/convolution/convolution.cl index e1acfa8f..107b4c7f 100644 --- a/src/main/java/com/aparapi/examples/convolution/convolution.cl +++ b/src/main/java/com/aparapi/examples/convolution/convolution.cl @@ -1,28 +1,28 @@ -void processPixel(__global float* _convMatrix3x3, __global char* _imageIn, __global char* _imageOut, int _width, int _height, int _x, int _y){ - float accum = 0.0f; - int count = 0; - for (int dx = -3; dx<6; dx+=3){ - for (int dy = -1; dy<2; dy++){ - int rgb = 0xff & _imageIn[(((_y + dy) * _width) + (_x + dx))]; - accum = accum + ((float)rgb * _convMatrix3x3[count++]); - } - } - char value = (char )max(0, min((int)accum, 255)); - _imageOut[(_y * _width) + _x] = value; - return; -} - -__kernel void applyConvolution( - __global float *_convMatrix3x3, // only read from kernel - __global char *_imageIn, // only read from kernel - __global char *_imageOut, // only written to (never read) from kernel - int _width, - int _height -){ - int x = get_global_id(0) % (_width * 3); - int y = get_global_id(0) / (_width * 3); - if (x>3 && x<((_width * 3) - 3) && y>1 && y<(_height - 1)){ - processPixel(_convMatrix3x3, _imageIn, _imageOut, _width*3, _height, x, y); - } -} - +void processPixel(__global float* _convMatrix3x3, __global char* _imageIn, __global char* _imageOut, int _width, int _height, int _x, int _y){ + float accum = 0.0f; + int count = 0; + for (int dx = -3; dx<6; dx+=3){ + for (int dy = -1; dy<2; dy++){ + int rgb = 0xff & _imageIn[(((_y + dy) * _width) + (_x + dx))]; + accum = accum + ((float)rgb * _convMatrix3x3[count++]); + } + } + char value = (char )max(0, min((int)accum, 255)); + _imageOut[(_y * _width) + _x] = value; + return; +} + +__kernel void applyConvolution( + __global float *_convMatrix3x3, // only read from kernel + __global char *_imageIn, // only read from kernel + __global char *_imageOut, // only written to (never read) from kernel + int _width, + int _height +){ + int x = get_global_id(0) % (_width * 3); + int y = get_global_id(0) / (_width * 3); + if (x>3 && x<((_width * 3) - 3) && y>1 && y<(_height - 1)){ + processPixel(_convMatrix3x3, _imageIn, _imageOut, _width*3, _height, x, y); + } +} + diff --git a/src/main/java/com/aparapi/examples/extension/HistogramKernel.cl b/src/main/java/com/aparapi/examples/extension/HistogramKernel.cl index aaa5c154..11b156cc 100644 --- a/src/main/java/com/aparapi/examples/extension/HistogramKernel.cl +++ b/src/main/java/com/aparapi/examples/extension/HistogramKernel.cl @@ -1,166 +1,166 @@ -/* ============================================================ - -Copyright (c) 2009-2010 Advanced Micro Devices, Inc. All rights reserved. - -Redistribution and use of this material is permitted under the following -conditions: - -Redistributions must retain the above copyright notice and all terms of this -license. - -In no event shall anyone redistributing or accessing or using this material -commence or participate in any arbitration or legal action relating to this -material against Advanced Micro Devices, Inc. or any copyright holders or -contributors. The foregoing shall survive any expiration or termination of -this license or any agreement or access or use related to this material. - -ANY BREACH OF ANY TERM OF THIS LICENSE SHALL RESULT IN THE IMMEDIATE REVOCATION -OF ALL RIGHTS TO REDISTRIBUTE, ACCESS OR USE THIS MATERIAL. - -THIS MATERIAL IS PROVIDED BY ADVANCED MICRO DEVICES, INC. AND ANY COPYRIGHT -HOLDERS AND CONTRIBUTORS "AS IS" IN ITS CURRENT CONDITION AND WITHOUT ANY -REPRESENTATIONS, GUARANTEE, OR WARRANTY OF ANY KIND OR IN ANY WAY RELATED TO -SUPPORT, INDEMNITY, ERROR FREE OR UNINTERRUPTED OPERA TION, OR THAT IT IS FREE -FROM DEFECTS OR VIRUSES. ALL OBLIGATIONS ARE HEREBY DISCLAIMED - WHETHER -EXPRESS, IMPLIED, OR STATUTORY - INCLUDING, BUT NOT LIMITED TO, ANY IMPLIED -WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, -ACCURACY, COMPLETENESS, OPERABILITY, QUALITY OF SERVICE, OR NON-INFRINGEMENT. -IN NO EVENT SHALL ADVANCED MICRO DEVICES, INC. OR ANY COPYRIGHT HOLDERS OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, PUNITIVE, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT -OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, REVENUE, DATA, OR PROFITS; OR -BUSINESS INTERRUPTION) HOWEVER CAUSED OR BASED ON ANY THEORY OF LIABILITY -ARISING IN ANY WAY RELATED TO THIS MATERIAL, EVEN IF ADVISED OF THE POSSIBILITY -OF SUCH DAMAGE. THE ENTIRE AND AGGREGATE LIABILITY OF ADVANCED MICRO DEVICES, -INC. AND ANY COPYRIGHT HOLDERS AND CONTRIBUTORS SHALL NOT EXCEED TEN DOLLARS -(US $10.00). ANYONE REDISTRIBUTING OR ACCESSING OR USING THIS MATERIAL ACCEPTS -THIS ALLOCATION OF RISK AND AGREES TO RELEASE ADVANCED MICRO DEVICES, INC. AND -ANY COPYRIGHT HOLDERS AND CONTRIBUTORS FROM ANY AND ALL LIABILITIES, -OBLIGATIONS, CLAIMS, OR DEMANDS IN EXCESS OF TEN DOLLARS (US $10.00). THE -FOREGOING ARE ESSENTIAL TERMS OF THIS LICENSE AND, IF ANY OF THESE TERMS ARE -CONSTRUED AS UNENFORCEABLE, FAIL IN ESSENTIAL PURPOSE, OR BECOME VOID OR -DETRIMENTAL TO ADVANCED MICRO DEVICES, INC. OR ANY COPYRIGHT HOLDERS OR -CONTRIBUTORS FOR ANY REASON, THEN ALL RIGHTS TO REDISTRIBUTE, ACCESS OR USE -THIS MATERIAL SHALL TERMINATE IMMEDIATELY. MOREOVER, THE FOREGOING SHALL -SURVIVE ANY EXPIRATION OR TERMINATION OF THIS LICENSE OR ANY AGREEMENT OR -ACCESS OR USE RELATED TO THIS MATERIAL. - -NOTICE IS HEREBY PROVIDED, AND BY REDISTRIBUTING OR ACCESSING OR USING THIS -MATERIAL SUCH NOTICE IS ACKNOWLEDGED, THAT THIS MATERIAL MAY BE SUBJECT TO -RESTRICTIONS UNDER THE LAWS AND REGULATIONS OF THE UNITED STATES OR OTHER -COUNTRIES, WHICH INCLUDE BUT ARE NOT LIMITED TO, U.S. EXPORT CONTROL LAWS SUCH -AS THE EXPORT ADMINISTRATION REGULATIONS AND NATIONAL SECURITY CONTROLS AS -DEFINED THEREUNDER, AS WELL AS STATE DEPARTMENT CONTROLS UNDER THE U.S. -MUNITIONS LIST. THIS MATERIAL MAY NOT BE USED, RELEASED, TRANSFERRED, IMPORTED, -EXPORTED AND/OR RE-EXPORTED IN ANY MANNER PROHIBITED UNDER ANY APPLICABLE LAWS, -INCLUDING U.S. EXPORT CONTROL LAWS REGARDING SPECIFICALLY DESIGNATED PERSONS, -COUNTRIES AND NATIONALS OF COUNTRIES SUBJECT TO NATIONAL SECURITY CONTROLS. -MOREOVER, THE FOREGOING SHALL SURVIVE ANY EXPIRATION OR TERMINATION OF ANY -LICENSE OR AGREEMENT OR ACCESS OR USE RELATED TO THIS MATERIAL. - -NOTICE REGARDING THE U.S. GOVERNMENT AND DOD AGENCIES: This material is -provided with "RESTRICTED RIGHTS" and/or "LIMITED RIGHTS" as applicable to -computer software and technical data, respectively. Use, duplication, -distribution or disclosure by the U.S. Government and/or DOD agencies is -subject to the full extent of restrictions in all applicable regulations, -including those found at FAR52.227 and DFARS252.227 et seq. and any successor -regulations thereof. Use of this material by the U.S. Government and/or DOD -agencies is acknowledgment of the proprietary rights of any copyright holders -and contributors, including those of Advanced Micro Devices, Inc., as well as -the provisions of FAR52.227-14 through 23 regarding privately developed and/or -commercial computer software. - -This license forms the entire agreement regarding the subject matter hereof and -supersedes all proposals and prior discussions and writings between the parties -with respect thereto. This license does not affect any ownership, rights, title, -or interest in, or relating to, this material. No terms of this license can be -modified or waived, and no breach of this license can be excused, unless done -so in a writing signed by all affected parties. Each term of this license is -separately enforceable. If any term of this license is determined to be or -becomes unenforceable or illegal, such term shall be reformed to the minimum -extent necessary in order for this license to remain in effect in accordance -with its terms as modified by such reformation. This license shall be governed -by and construed in accordance with the laws of the State of Texas without -regard to rules on conflicts of law of any state or jurisdiction or the United -Nations Convention on the International Sale of Goods. All disputes arising out -of this license shall be subject to the jurisdiction of the federal and state -courts in Austin, Texas, and all defenses are hereby waived concerning personal -jurisdiction and venue of these courts. - -============================================================ */ - -/* - * For a description of the algorithm and the terms used, please see the - * documentation for this sample. - * - * On invocation of kernel blackScholes, each work thread calculates - * thread-histogram bin and finally all thread-histograms merged into - * block-histogram bin. Outside the kernel, All block-histograms merged - * into final histogram - */ - -#define LINEAR_MEM_ACCESS -#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable - - - -/** - * @brief Calculates block-histogram bin whose bin size is 256 - * @param data input data pointer - * @param sharedArray shared array for thread-histogram bins - * @param binResult block-histogram array - */ -__kernel -void histogram256(__global const uchar* data, - __local uchar* sharedArray, - __global uint* binResult, - uint binSize) -{ - size_t localId = get_local_id(0); - size_t globalId = get_global_id(0); - size_t groupId = get_group_id(0); - size_t groupSize = get_local_size(0); - - /* initialize shared array to zero */ - for(int i = 0; i < binSize; ++i) - sharedArray[localId * binSize + i] = 0; - - barrier(CLK_LOCAL_MEM_FENCE); - - /* calculate thread-histograms */ - for(int i = 0; i < binSize; ++i) - { -#ifdef LINEAR_MEM_ACCESS - uchar value = data[groupId * groupSize * binSize + i * groupSize + localId]; -#else - uchar value = data[globalId * binSize + i]; -#endif // LINEAR_MEM_ACCESS - sharedArray[localId * binSize + value]++; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - /* merge all thread-histograms into block-histogram */ - for(int i = 0; i < binSize / groupSize; ++i) - { - uint binCount = 0; - for(int j = 0; j < groupSize; ++j) - binCount += sharedArray[j * binSize + i * groupSize + localId]; - - binResult[groupId * binSize + i * groupSize + localId] = binCount; - } -} - -__kernel -void bin256(__global uint* histo, - __global const uint* binResult, - uint subHistogramSize ) -{ - size_t j = get_local_id(0); - size_t binSize=get_global_size(0); - uint histValue=0; - for(int i = 0; i < subHistogramSize; ++i){ - histValue += binResult[i * binSize + j]; - } - histo[j]=histValue; -} +/* ============================================================ + +Copyright (c) 2009-2010 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use of this material is permitted under the following +conditions: + +Redistributions must retain the above copyright notice and all terms of this +license. + +In no event shall anyone redistributing or accessing or using this material +commence or participate in any arbitration or legal action relating to this +material against Advanced Micro Devices, Inc. or any copyright holders or +contributors. The foregoing shall survive any expiration or termination of +this license or any agreement or access or use related to this material. + +ANY BREACH OF ANY TERM OF THIS LICENSE SHALL RESULT IN THE IMMEDIATE REVOCATION +OF ALL RIGHTS TO REDISTRIBUTE, ACCESS OR USE THIS MATERIAL. + +THIS MATERIAL IS PROVIDED BY ADVANCED MICRO DEVICES, INC. AND ANY COPYRIGHT +HOLDERS AND CONTRIBUTORS "AS IS" IN ITS CURRENT CONDITION AND WITHOUT ANY +REPRESENTATIONS, GUARANTEE, OR WARRANTY OF ANY KIND OR IN ANY WAY RELATED TO +SUPPORT, INDEMNITY, ERROR FREE OR UNINTERRUPTED OPERA TION, OR THAT IT IS FREE +FROM DEFECTS OR VIRUSES. ALL OBLIGATIONS ARE HEREBY DISCLAIMED - WHETHER +EXPRESS, IMPLIED, OR STATUTORY - INCLUDING, BUT NOT LIMITED TO, ANY IMPLIED +WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, +ACCURACY, COMPLETENESS, OPERABILITY, QUALITY OF SERVICE, OR NON-INFRINGEMENT. +IN NO EVENT SHALL ADVANCED MICRO DEVICES, INC. OR ANY COPYRIGHT HOLDERS OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, PUNITIVE, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, REVENUE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED OR BASED ON ANY THEORY OF LIABILITY +ARISING IN ANY WAY RELATED TO THIS MATERIAL, EVEN IF ADVISED OF THE POSSIBILITY +OF SUCH DAMAGE. THE ENTIRE AND AGGREGATE LIABILITY OF ADVANCED MICRO DEVICES, +INC. AND ANY COPYRIGHT HOLDERS AND CONTRIBUTORS SHALL NOT EXCEED TEN DOLLARS +(US $10.00). ANYONE REDISTRIBUTING OR ACCESSING OR USING THIS MATERIAL ACCEPTS +THIS ALLOCATION OF RISK AND AGREES TO RELEASE ADVANCED MICRO DEVICES, INC. AND +ANY COPYRIGHT HOLDERS AND CONTRIBUTORS FROM ANY AND ALL LIABILITIES, +OBLIGATIONS, CLAIMS, OR DEMANDS IN EXCESS OF TEN DOLLARS (US $10.00). THE +FOREGOING ARE ESSENTIAL TERMS OF THIS LICENSE AND, IF ANY OF THESE TERMS ARE +CONSTRUED AS UNENFORCEABLE, FAIL IN ESSENTIAL PURPOSE, OR BECOME VOID OR +DETRIMENTAL TO ADVANCED MICRO DEVICES, INC. OR ANY COPYRIGHT HOLDERS OR +CONTRIBUTORS FOR ANY REASON, THEN ALL RIGHTS TO REDISTRIBUTE, ACCESS OR USE +THIS MATERIAL SHALL TERMINATE IMMEDIATELY. MOREOVER, THE FOREGOING SHALL +SURVIVE ANY EXPIRATION OR TERMINATION OF THIS LICENSE OR ANY AGREEMENT OR +ACCESS OR USE RELATED TO THIS MATERIAL. + +NOTICE IS HEREBY PROVIDED, AND BY REDISTRIBUTING OR ACCESSING OR USING THIS +MATERIAL SUCH NOTICE IS ACKNOWLEDGED, THAT THIS MATERIAL MAY BE SUBJECT TO +RESTRICTIONS UNDER THE LAWS AND REGULATIONS OF THE UNITED STATES OR OTHER +COUNTRIES, WHICH INCLUDE BUT ARE NOT LIMITED TO, U.S. EXPORT CONTROL LAWS SUCH +AS THE EXPORT ADMINISTRATION REGULATIONS AND NATIONAL SECURITY CONTROLS AS +DEFINED THEREUNDER, AS WELL AS STATE DEPARTMENT CONTROLS UNDER THE U.S. +MUNITIONS LIST. THIS MATERIAL MAY NOT BE USED, RELEASED, TRANSFERRED, IMPORTED, +EXPORTED AND/OR RE-EXPORTED IN ANY MANNER PROHIBITED UNDER ANY APPLICABLE LAWS, +INCLUDING U.S. EXPORT CONTROL LAWS REGARDING SPECIFICALLY DESIGNATED PERSONS, +COUNTRIES AND NATIONALS OF COUNTRIES SUBJECT TO NATIONAL SECURITY CONTROLS. +MOREOVER, THE FOREGOING SHALL SURVIVE ANY EXPIRATION OR TERMINATION OF ANY +LICENSE OR AGREEMENT OR ACCESS OR USE RELATED TO THIS MATERIAL. + +NOTICE REGARDING THE U.S. GOVERNMENT AND DOD AGENCIES: This material is +provided with "RESTRICTED RIGHTS" and/or "LIMITED RIGHTS" as applicable to +computer software and technical data, respectively. Use, duplication, +distribution or disclosure by the U.S. Government and/or DOD agencies is +subject to the full extent of restrictions in all applicable regulations, +including those found at FAR52.227 and DFARS252.227 et seq. and any successor +regulations thereof. Use of this material by the U.S. Government and/or DOD +agencies is acknowledgment of the proprietary rights of any copyright holders +and contributors, including those of Advanced Micro Devices, Inc., as well as +the provisions of FAR52.227-14 through 23 regarding privately developed and/or +commercial computer software. + +This license forms the entire agreement regarding the subject matter hereof and +supersedes all proposals and prior discussions and writings between the parties +with respect thereto. This license does not affect any ownership, rights, title, +or interest in, or relating to, this material. No terms of this license can be +modified or waived, and no breach of this license can be excused, unless done +so in a writing signed by all affected parties. Each term of this license is +separately enforceable. If any term of this license is determined to be or +becomes unenforceable or illegal, such term shall be reformed to the minimum +extent necessary in order for this license to remain in effect in accordance +with its terms as modified by such reformation. This license shall be governed +by and construed in accordance with the laws of the State of Texas without +regard to rules on conflicts of law of any state or jurisdiction or the United +Nations Convention on the International Sale of Goods. All disputes arising out +of this license shall be subject to the jurisdiction of the federal and state +courts in Austin, Texas, and all defenses are hereby waived concerning personal +jurisdiction and venue of these courts. + +============================================================ */ + +/* + * For a description of the algorithm and the terms used, please see the + * documentation for this sample. + * + * On invocation of kernel blackScholes, each work thread calculates + * thread-histogram bin and finally all thread-histograms merged into + * block-histogram bin. Outside the kernel, All block-histograms merged + * into final histogram + */ + +#define LINEAR_MEM_ACCESS +#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable + + + +/** + * @brief Calculates block-histogram bin whose bin size is 256 + * @param data input data pointer + * @param sharedArray shared array for thread-histogram bins + * @param binResult block-histogram array + */ +__kernel +void histogram256(__global const uchar* data, + __local uchar* sharedArray, + __global uint* binResult, + uint binSize) +{ + size_t localId = get_local_id(0); + size_t globalId = get_global_id(0); + size_t groupId = get_group_id(0); + size_t groupSize = get_local_size(0); + + /* initialize shared array to zero */ + for(int i = 0; i < binSize; ++i) + sharedArray[localId * binSize + i] = 0; + + barrier(CLK_LOCAL_MEM_FENCE); + + /* calculate thread-histograms */ + for(int i = 0; i < binSize; ++i) + { +#ifdef LINEAR_MEM_ACCESS + uchar value = data[groupId * groupSize * binSize + i * groupSize + localId]; +#else + uchar value = data[globalId * binSize + i]; +#endif // LINEAR_MEM_ACCESS + sharedArray[localId * binSize + value]++; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + /* merge all thread-histograms into block-histogram */ + for(int i = 0; i < binSize / groupSize; ++i) + { + uint binCount = 0; + for(int j = 0; j < groupSize; ++j) + binCount += sharedArray[j * binSize + i * groupSize + localId]; + + binResult[groupId * binSize + i * groupSize + localId] = binCount; + } +} + +__kernel +void bin256(__global uint* histo, + __global const uint* binResult, + uint subHistogramSize ) +{ + size_t j = get_local_id(0); + size_t binSize=get_global_size(0); + uint histValue=0; + for(int i = 0; i < subHistogramSize; ++i){ + histValue += binResult[i * binSize + j]; + } + histo[j]=histValue; +} diff --git a/src/main/java/com/aparapi/examples/extension/fft.cl b/src/main/java/com/aparapi/examples/extension/fft.cl index f4085983..ae8e31b7 100644 --- a/src/main/java/com/aparapi/examples/extension/fft.cl +++ b/src/main/java/com/aparapi/examples/extension/fft.cl @@ -1,737 +1,737 @@ -/* ============================================================ - -Copyright (c) 2009-2010 Advanced Micro Devices, Inc. All rights reserved. - -Redistribution and use of this material is permitted under the following -conditions: - -Redistributions must retain the above copyright notice and all terms of this -license. - -In no event shall anyone redistributing or accessing or using this material -commence or participate in any arbitration or legal action relating to this -material against Advanced Micro Devices, Inc. or any copyright holders or -contributors. The foregoing shall survive any expiration or termination of -this license or any agreement or access or use related to this material. - -ANY BREACH OF ANY TERM OF THIS LICENSE SHALL RESULT IN THE IMMEDIATE REVOCATION -OF ALL RIGHTS TO REDISTRIBUTE, ACCESS OR USE THIS MATERIAL. - -THIS MATERIAL IS PROVIDED BY ADVANCED MICRO DEVICES, INC. AND ANY COPYRIGHT -HOLDERS AND CONTRIBUTORS "AS IS" IN ITS CURRENT CONDITION AND WITHOUT ANY -REPRESENTATIONS, GUARANTEE, OR WARRANTY OF ANY KIND OR IN ANY WAY RELATED TO -SUPPORT, INDEMNITY, ERROR FREE OR UNINTERRUPTED OPERA TION, OR THAT IT IS FREE -FROM DEFECTS OR VIRUSES. ALL OBLIGATIONS ARE HEREBY DISCLAIMED - WHETHER -EXPRESS, IMPLIED, OR STATUTORY - INCLUDING, BUT NOT LIMITED TO, ANY IMPLIED -WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, -ACCURACY, COMPLETENESS, OPERABILITY, QUALITY OF SERVICE, OR NON-INFRINGEMENT. -IN NO EVENT SHALL ADVANCED MICRO DEVICES, INC. OR ANY COPYRIGHT HOLDERS OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, PUNITIVE, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT -OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, REVENUE, DATA, OR PROFITS; OR -BUSINESS INTERRUPTION) HOWEVER CAUSED OR BASED ON ANY THEORY OF LIABILITY -ARISING IN ANY WAY RELATED TO THIS MATERIAL, EVEN IF ADVISED OF THE POSSIBILITY -OF SUCH DAMAGE. THE ENTIRE AND AGGREGATE LIABILITY OF ADVANCED MICRO DEVICES, -INC. AND ANY COPYRIGHT HOLDERS AND CONTRIBUTORS SHALL NOT EXCEED TEN DOLLARS -(US $10.00). ANYONE REDISTRIBUTING OR ACCESSING OR USING THIS MATERIAL ACCEPTS -THIS ALLOCATION OF RISK AND AGREES TO RELEASE ADVANCED MICRO DEVICES, INC. AND -ANY COPYRIGHT HOLDERS AND CONTRIBUTORS FROM ANY AND ALL LIABILITIES, -OBLIGATIONS, CLAIMS, OR DEMANDS IN EXCESS OF TEN DOLLARS (US $10.00). THE -FOREGOING ARE ESSENTIAL TERMS OF THIS LICENSE AND, IF ANY OF THESE TERMS ARE -CONSTRUED AS UNENFORCEABLE, FAIL IN ESSENTIAL PURPOSE, OR BECOME VOID OR -DETRIMENTAL TO ADVANCED MICRO DEVICES, INC. OR ANY COPYRIGHT HOLDERS OR -CONTRIBUTORS FOR ANY REASON, THEN ALL RIGHTS TO REDISTRIBUTE, ACCESS OR USE -THIS MATERIAL SHALL TERMINATE IMMEDIATELY. MOREOVER, THE FOREGOING SHALL -SURVIVE ANY EXPIRATION OR TERMINATION OF THIS LICENSE OR ANY AGREEMENT OR -ACCESS OR USE RELATED TO THIS MATERIAL. - -NOTICE IS HEREBY PROVIDED, AND BY REDISTRIBUTING OR ACCESSING OR USING THIS -MATERIAL SUCH NOTICE IS ACKNOWLEDGED, THAT THIS MATERIAL MAY BE SUBJECT TO -RESTRICTIONS UNDER THE LAWS AND REGULATIONS OF THE UNITED STATES OR OTHER -COUNTRIES, WHICH INCLUDE BUT ARE NOT LIMITED TO, U.S. EXPORT CONTROL LAWS SUCH -AS THE EXPORT ADMINISTRATION REGULATIONS AND NATIONAL SECURITY CONTROLS AS -DEFINED THEREUNDER, AS WELL AS STATE DEPARTMENT CONTROLS UNDER THE U.S. -MUNITIONS LIST. THIS MATERIAL MAY NOT BE USED, RELEASED, TRANSFERRED, IMPORTED, -EXPORTED AND/OR RE-EXPORTED IN ANY MANNER PROHIBITED UNDER ANY APPLICABLE LAWS, -INCLUDING U.S. EXPORT CONTROL LAWS REGARDING SPECIFICALLY DESIGNATED PERSONS, -COUNTRIES AND NATIONALS OF COUNTRIES SUBJECT TO NATIONAL SECURITY CONTROLS. -MOREOVER, THE FOREGOING SHALL SURVIVE ANY EXPIRATION OR TERMINATION OF ANY -LICENSE OR AGREEMENT OR ACCESS OR USE RELATED TO THIS MATERIAL. - -NOTICE REGARDING THE U.S. GOVERNMENT AND DOD AGENCIES: This material is -provided with "RESTRICTED RIGHTS" and/or "LIMITED RIGHTS" as applicable to -computer software and technical data, respectively. Use, duplication, -distribution or disclosure by the U.S. Government and/or DOD agencies is -subject to the full extent of restrictions in all applicable regulations, -including those found at FAR52.227 and DFARS252.227 et seq. and any successor -regulations thereof. Use of this material by the U.S. Government and/or DOD -agencies is acknowledgment of the proprietary rights of any copyright holders -and contributors, including those of Advanced Micro Devices, Inc., as well as -the provisions of FAR52.227-14 through 23 regarding privately developed and/or -commercial computer software. - -This license forms the entire agreement regarding the subject matter hereof and -supersedes all proposals and prior discussions and writings between the parties -with respect thereto. This license does not affect any ownership, rights, title, -or interest in, or relating to, this material. No terms of this license can be -modified or waived, and no breach of this license can be excused, unless done -so in a writing signed by all affected parties. Each term of this license is -separately enforceable. If any term of this license is determined to be or -becomes unenforceable or illegal, such term shall be reformed to the minimum -extent necessary in order for this license to remain in effect in accordance -with its terms as modified by such reformation. This license shall be governed -by and construed in accordance with the laws of the State of Texas without -regard to rules on conflicts of law of any state or jurisdiction or the United -Nations Convention on the International Sale of Goods. All disputes arising out -of this license shall be subject to the jurisdiction of the federal and state -courts in Austin, Texas, and all defenses are hereby waived concerning personal -jurisdiction and venue of these courts. - -============================================================ */ - - -// This is 2 PI / 1024 -#define ANGLE 0x1.921fb6p-8F - -// Return sin and cos of -2*pi*i/1024 -__attribute__((always_inline)) float -k_sincos(int i, float *cretp) -{ - if (i > 512) - i -= 1024; - - float x = i * -ANGLE; - *cretp = native_cos(x); - return native_sin(x); -} - -__attribute__((always_inline)) float4 -k_sincos4(int4 i, float4 *cretp) -{ - i -= (i > 512) & 1024; - float4 x = convert_float4(i) * -ANGLE; - *cretp = native_cos(x); - return native_sin(x); -} - -// Twiddle factor stuff -#define TWGEN(I,C,S) \ - float C; \ - float S = k_sincos(tbase * I, &C) - -#define TW4GEN(I,C,S) \ - float4 C; \ - float4 S = k_sincos4(tbase * I, &C) - -#define TWAPPLY(ZR, ZI, C, S) \ - do { \ - float4 __r = C * ZR - S * ZI; \ - ZI = C * ZI + S * ZR; \ - ZR = __r; \ - } while (0) - -# define TW4IDDLE4() \ - do { \ - TW4GEN(1, c1, s1); \ - TWAPPLY(zr1, zi1, c1, s1); \ - TW4GEN(2, c2, s2); \ - TWAPPLY(zr2, zi2, c2, s2); \ - TW4GEN(3, c3, s3); \ - TWAPPLY(zr3, zi3, c3, s3); \ - } while (0) - -# define TWIDDLE4() \ - do { \ - TWGEN(1, c1, s1); \ - TWAPPLY(zr1, zi1, c1, s1); \ - TWGEN(2, c2, s2); \ - TWAPPLY(zr2, zi2, c2, s2); \ - TWGEN(3, c3, s3); \ - TWAPPLY(zr3, zi3, c3, s3); \ - } while (0) - -// 4 point FFT -#define FFT4() \ - do { \ - float4 ar0 = zr0 + zr2; \ - float4 ar2 = zr1 + zr3; \ - float4 br0 = ar0 + ar2; \ - float4 br1 = zr0 - zr2; \ - float4 br2 = ar0 - ar2; \ - float4 br3 = zr1 - zr3; \ - float4 ai0 = zi0 + zi2; \ - float4 ai2 = zi1 + zi3; \ - float4 bi0 = ai0 + ai2; \ - float4 bi1 = zi0 - zi2; \ - float4 bi2 = ai0 - ai2; \ - float4 bi3 = zi1 - zi3; \ - zr0 = br0; \ - zi0 = bi0; \ - zr1 = br1 + bi3; \ - zi1 = bi1 - br3; \ - zr3 = br1 - bi3; \ - zi3 = br3 + bi1; \ - zr2 = br2; \ - zi2 = bi2; \ - } while (0) - -// First pass of 1K FFT -__attribute__((always_inline)) void -kfft_pass1(uint me, - const __global float *gr, const __global float *gi, - __local float *lds) -{ - const __global float4 *gp; - __local float *lp; - - // Pull in transform data - gp = (const __global float4 *)(gr + (me << 2)); - float4 zr0 = gp[0*64]; - float4 zr1 = gp[1*64]; - float4 zr2 = gp[2*64]; - float4 zr3 = gp[3*64]; - - gp = (const __global float4 *)(gi + (me << 2)); - float4 zi0 = gp[0*64]; - float4 zi1 = gp[1*64]; - float4 zi2 = gp[2*64]; - float4 zi3 = gp[3*64]; - - FFT4(); - - int4 tbase = (int)(me << 2) + (int4)(0, 1, 2, 3); - TW4IDDLE4(); - - // Save registers - // Note that this pointer is not aligned enough to be cast to a float4* - lp = lds + ((me << 2) + (me >> 3)); - - lp[0] = zr0.x; - lp[1] = zr0.y; - lp[2] = zr0.z; - lp[3] = zr0.w; - lp += 66*4; - - lp[0] = zr1.x; - lp[1] = zr1.y; - lp[2] = zr1.z; - lp[3] = zr1.w; - lp += 66*4; - - lp[0] = zr2.x; - lp[1] = zr2.y; - lp[2] = zr2.z; - lp[3] = zr2.w; - lp += 66*4; - - lp[0] = zr3.x; - lp[1] = zr3.y; - lp[2] = zr3.z; - lp[3] = zr3.w; - lp += 66*4; - - // Imaginary part - lp[0] = zi0.x; - lp[1] = zi0.y; - lp[2] = zi0.z; - lp[3] = zi0.w; - lp += 66*4; - - lp[0] = zi1.x; - lp[1] = zi1.y; - lp[2] = zi1.z; - lp[3] = zi1.w; - lp += 66*4; - - lp[0] = zi2.x; - lp[1] = zi2.y; - lp[2] = zi2.z; - lp[3] = zi2.w; - lp += 66*4; - - lp[0] = zi3.x; - lp[1] = zi3.y; - lp[2] = zi3.z; - lp[3] = zi3.w; - - barrier(CLK_LOCAL_MEM_FENCE); -} - -// Second pass of 1K FFT -__attribute__((always_inline)) void -kfft_pass2(uint me, __local float *lds) -{ - __local float *lp; - - // Load registers - lp = lds + (me + (me >> 5)); - - float4 zr0, zr1, zr2, zr3; - - zr0.x = lp[0*66]; - zr1.x = lp[1*66]; - zr2.x = lp[2*66]; - zr3.x = lp[3*66]; - lp += 66*4; - - zr0.y = lp[0*66]; - zr1.y = lp[1*66]; - zr2.y = lp[2*66]; - zr3.y = lp[3*66]; - lp += 66*4; - - zr0.z = lp[0*66]; - zr1.z = lp[1*66]; - zr2.z = lp[2*66]; - zr3.z = lp[3*66]; - lp += 66*4; - - zr0.w = lp[0*66]; - zr1.w = lp[1*66]; - zr2.w = lp[2*66]; - zr3.w = lp[3*66]; - lp += 66*4; - - float4 zi0, zi1, zi2, zi3; - - zi0.x = lp[0*66]; - zi1.x = lp[1*66]; - zi2.x = lp[2*66]; - zi3.x = lp[3*66]; - lp += 66*4; - - zi0.y = lp[0*66]; - zi1.y = lp[1*66]; - zi2.y = lp[2*66]; - zi3.y = lp[3*66]; - lp += 66*4; - - zi0.z = lp[0*66]; - zi1.z = lp[1*66]; - zi2.z = lp[2*66]; - zi3.z = lp[3*66]; - lp += 66*4; - - zi0.w = lp[0*66]; - zi1.w = lp[1*66]; - zi2.w = lp[2*66]; - zi3.w = lp[3*66]; - - // Transform and twiddle - FFT4(); - - int tbase = (int)(me << 2); - TWIDDLE4(); - - barrier(CLK_LOCAL_MEM_FENCE); - - // Store registers - lp = lds + ((me << 2) + (me >> 3)); - - lp[0] = zr0.x; - lp[1] = zr1.x; - lp[2] = zr2.x; - lp[3] = zr3.x; - lp += 66*4; - - lp[0] = zr0.y; - lp[1] = zr1.y; - lp[2] = zr2.y; - lp[3] = zr3.y; - lp += 66*4; - - lp[0] = zr0.z; - lp[1] = zr1.z; - lp[2] = zr2.z; - lp[3] = zr3.z; - lp += 66*4; - - lp[0] = zr0.w; - lp[1] = zr1.w; - lp[2] = zr2.w; - lp[3] = zr3.w; - lp += 66*4; - - // Imaginary part - lp[0] = zi0.x; - lp[1] = zi1.x; - lp[2] = zi2.x; - lp[3] = zi3.x; - lp += 66*4; - - lp[0] = zi0.y; - lp[1] = zi1.y; - lp[2] = zi2.y; - lp[3] = zi3.y; - lp += 66*4; - - lp[0] = zi0.z; - lp[1] = zi1.z; - lp[2] = zi2.z; - lp[3] = zi3.z; - lp += 66*4; - - lp[0] = zi0.w; - lp[1] = zi1.w; - lp[2] = zi2.w; - lp[3] = zi3.w; - - barrier(CLK_LOCAL_MEM_FENCE); -} - -// Third pass of 1K FFT -__attribute__((always_inline)) void -kfft_pass3(uint me, __local float *lds) -{ - __local float *lp; - - // Load registers - lp = lds + (me + (me >> 5)); - - float4 zr0, zr1, zr2, zr3; - - zr0.x = lp[0*66]; - zr1.x = lp[1*66]; - zr2.x = lp[2*66]; - zr3.x = lp[3*66]; - lp += 66*4; - - zr0.y = lp[0*66]; - zr1.y = lp[1*66]; - zr2.y = lp[2*66]; - zr3.y = lp[3*66]; - lp += 66*4; - - zr0.z = lp[0*66]; - zr1.z = lp[1*66]; - zr2.z = lp[2*66]; - zr3.z = lp[3*66]; - lp += 66*4; - - zr0.w = lp[0*66]; - zr1.w = lp[1*66]; - zr2.w = lp[2*66]; - zr3.w = lp[3*66]; - lp += 66*4; - - float4 zi0, zi1, zi2, zi3; - - zi0.x = lp[0*66]; - zi1.x = lp[1*66]; - zi2.x = lp[2*66]; - zi3.x = lp[3*66]; - lp += 66*4; - - zi0.y = lp[0*66]; - zi1.y = lp[1*66]; - zi2.y = lp[2*66]; - zi3.y = lp[3*66]; - lp += 66*4; - - zi0.z = lp[0*66]; - zi1.z = lp[1*66]; - zi2.z = lp[2*66]; - zi3.z = lp[3*66]; - lp += 66*4; - - zi0.w = lp[0*66]; - zi1.w = lp[1*66]; - zi2.w = lp[2*66]; - zi3.w = lp[3*66]; - - // Transform and twiddle - FFT4(); - - int tbase = (int)((me >> 2) << 4); - TWIDDLE4(); - - barrier(CLK_LOCAL_MEM_FENCE); - - // Save registers - lp = lds + me; - - lp[0*66] = zr0.x; - lp[1*66] = zr0.y; - lp[2*66] = zr0.z; - lp[3*66] = zr0.w; - lp += 66*4; - - lp[0*66] = zr1.x; - lp[1*66] = zr1.y; - lp[2*66] = zr1.z; - lp[3*66] = zr1.w; - lp += 66*4; - - lp[0*66] = zr2.x; - lp[1*66] = zr2.y; - lp[2*66] = zr2.z; - lp[3*66] = zr2.w; - lp += 66*4; - - lp[0*66] = zr3.x; - lp[1*66] = zr3.y; - lp[2*66] = zr3.z; - lp[3*66] = zr3.w; - lp += 66*4; - - // Imaginary part - lp[0*66] = zi0.x; - lp[1*66] = zi0.y; - lp[2*66] = zi0.z; - lp[3*66] = zi0.w; - lp += 66*4; - - lp[0*66] = zi1.x; - lp[1*66] = zi1.y; - lp[2*66] = zi1.z; - lp[3*66] = zi1.w; - lp += 66*4; - - lp[0*66] = zi2.x; - lp[1*66] = zi2.y; - lp[2*66] = zi2.z; - lp[3*66] = zi2.w; - lp += 66*4; - - lp[0*66] = zi3.x; - lp[1*66] = zi3.y; - lp[2*66] = zi3.z; - lp[3*66] = zi3.w; - - barrier(CLK_LOCAL_MEM_FENCE); -} - -// Fourth pass of 1K FFT -__attribute__((always_inline)) void -kfft_pass4(uint me, __local float *lds) -{ - __local float *lp; - - // Load registers - lp = lds + ((me & 0x3) + ((me >> 2) & 0x3)*(66*4) + ((me >> 4) << 2)); - - float4 zr0, zr1, zr2, zr3; - - zr0.x = lp[0*66]; - zr0.y = lp[1*66]; - zr0.z = lp[2*66]; - zr0.w = lp[3*66]; - lp += 16; - - zr1.x = lp[0*66]; - zr1.y = lp[1*66]; - zr1.z = lp[2*66]; - zr1.w = lp[3*66]; - lp += 16; - - zr2.x = lp[0*66]; - zr2.y = lp[1*66]; - zr2.z = lp[2*66]; - zr2.w = lp[3*66]; - lp += 16; - - zr3.x = lp[0*66]; - zr3.y = lp[1*66]; - zr3.z = lp[2*66]; - zr3.w = lp[3*66]; - lp += 66*4*4 - 3*16; - - float4 zi0, zi1, zi2, zi3; - - zi0.x = lp[0*66]; - zi0.y = lp[1*66]; - zi0.z = lp[2*66]; - zi0.w = lp[3*66]; - lp += 16; - - zi1.x = lp[0*66]; - zi1.y = lp[1*66]; - zi1.z = lp[2*66]; - zi1.w = lp[3*66]; - lp += 16; - - zi2.x = lp[0*66]; - zi2.y = lp[1*66]; - zi2.z = lp[2*66]; - zi2.w = lp[3*66]; - lp += 16; - - zi3.x = lp[0*66]; - zi3.y = lp[1*66]; - zi3.z = lp[2*66]; - zi3.w = lp[3*66]; - - // Transform and twiddle - FFT4(); - - int tbase = (int)((me >> 4) << 6); - TWIDDLE4(); - - barrier(CLK_LOCAL_MEM_FENCE); - - // Save registers in conflict free manner - lp = lds + me; - - lp[0*68] = zr0.x; - lp[1*68] = zr0.y; - lp[2*68] = zr0.z; - lp[3*68] = zr0.w; - lp += 68*4; - - lp[0*68] = zr1.x; - lp[1*68] = zr1.y; - lp[2*68] = zr1.z; - lp[3*68] = zr1.w; - lp += 68*4; - - lp[0*68] = zr2.x; - lp[1*68] = zr2.y; - lp[2*68] = zr2.z; - lp[3*68] = zr2.w; - lp += 68*4; - - lp[0*68] = zr3.x; - lp[1*68] = zr3.y; - lp[2*68] = zr3.z; - lp[3*68] = zr3.w; - lp += 68*4; - - // Imaginary part - lp[0*68] = zi0.x; - lp[1*68] = zi0.y; - lp[2*68] = zi0.z; - lp[3*68] = zi0.w; - lp += 68*4; - - lp[0*68] = zi1.x; - lp[1*68] = zi1.y; - lp[2*68] = zi1.z; - lp[3*68] = zi1.w; - lp += 68*4; - - lp[0*68] = zi2.x; - lp[1*68] = zi2.y; - lp[2*68] = zi2.z; - lp[3*68] = zi2.w; - lp += 68*4; - - lp[0*68] = zi3.x; - lp[1*68] = zi3.y; - lp[2*68] = zi3.z; - lp[3*68] = zi3.w; - - barrier(CLK_LOCAL_MEM_FENCE); -} - -// Fifth and last pass of 1K FFT -__attribute__((always_inline)) void -kfft_pass5(uint me, - const __local float *lds, - __global float *gr, __global float *gi) -{ - const __local float *lp; - - // Load registers - lp = lds + ((me & 0xf) + (me >> 4)*(68*4)); - - float4 zr0, zr1, zr2, zr3; - - zr0.x = lp[0*68]; - zr0.y = lp[1*68]; - zr0.z = lp[2*68]; - zr0.w = lp[3*68]; - lp += 16; - - zr1.x = lp[0*68]; - zr1.y = lp[1*68]; - zr1.z = lp[2*68]; - zr1.w = lp[3*68]; - lp += 16; - - zr2.x = lp[0*68]; - zr2.y = lp[1*68]; - zr2.z = lp[2*68]; - zr2.w = lp[3*68]; - lp += 16; - - zr3.x = lp[0*68]; - zr3.y = lp[1*68]; - zr3.z = lp[2*68]; - zr3.w = lp[3*68]; - - lp += 68*4*4 - 3*16; - - float4 zi0, zi1, zi2, zi3; - - zi0.x = lp[0*68]; - zi0.y = lp[1*68]; - zi0.z = lp[2*68]; - zi0.w = lp[3*68]; - lp += 16; - - zi1.x = lp[0*68]; - zi1.y = lp[1*68]; - zi1.z = lp[2*68]; - zi1.w = lp[3*68]; - lp += 16; - - zi2.x = lp[0*68]; - zi2.y = lp[1*68]; - zi2.z = lp[2*68]; - zi2.w = lp[3*68]; - lp += 16; - - zi3.x = lp[0*68]; - zi3.y = lp[1*68]; - zi3.z = lp[2*68]; - zi3.w = lp[3*68]; - - // Transform - FFT4(); - - // Save result - __global float4 *gp = (__global float4 *)(gr + (me << 2)); - gp[0*64] = zr0; - gp[1*64] = zr1; - gp[2*64] = zr2; - gp[3*64] = zr3; - - gp = (__global float4 *)(gi + (me << 2)); - gp[0*64] = zi0; - gp[1*64] = zi1; - gp[2*64] = zi2; - gp[3*64] = zi3; -} - -// Distance between first real element of successive 1K vectors -// It must be >= 1024, and a multiple of 4 -#define VSTRIDE (1024+0) - -// Performs a 1K complex FFT with every 64 global ids. -// Each vector is a multiple of VSTRIDE from the first -// Number of global ids must be a multiple of 64, e.g. 1024*64 -// -// greal - pointer to input and output real part of data -// gimag - pointer to input and output imaginary part of data -__kernel void -forward(__global float *greal, __global float *gimag) -{ - // This is 8704 bytes - __local float lds[68*4*4*2]; - - __global float *gr; - __global float *gi; - uint gid = get_global_id(0); - uint me = gid & 0x3fU; - uint dg = (gid >> 6) * VSTRIDE; - - gr = greal + dg; - gi = gimag + dg; - - kfft_pass1(me, gr, gi, lds); - kfft_pass2(me, lds); - kfft_pass3(me, lds); - kfft_pass4(me, lds); - kfft_pass5(me, lds, gr, gi); -} - +/* ============================================================ + +Copyright (c) 2009-2010 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use of this material is permitted under the following +conditions: + +Redistributions must retain the above copyright notice and all terms of this +license. + +In no event shall anyone redistributing or accessing or using this material +commence or participate in any arbitration or legal action relating to this +material against Advanced Micro Devices, Inc. or any copyright holders or +contributors. The foregoing shall survive any expiration or termination of +this license or any agreement or access or use related to this material. + +ANY BREACH OF ANY TERM OF THIS LICENSE SHALL RESULT IN THE IMMEDIATE REVOCATION +OF ALL RIGHTS TO REDISTRIBUTE, ACCESS OR USE THIS MATERIAL. + +THIS MATERIAL IS PROVIDED BY ADVANCED MICRO DEVICES, INC. AND ANY COPYRIGHT +HOLDERS AND CONTRIBUTORS "AS IS" IN ITS CURRENT CONDITION AND WITHOUT ANY +REPRESENTATIONS, GUARANTEE, OR WARRANTY OF ANY KIND OR IN ANY WAY RELATED TO +SUPPORT, INDEMNITY, ERROR FREE OR UNINTERRUPTED OPERA TION, OR THAT IT IS FREE +FROM DEFECTS OR VIRUSES. ALL OBLIGATIONS ARE HEREBY DISCLAIMED - WHETHER +EXPRESS, IMPLIED, OR STATUTORY - INCLUDING, BUT NOT LIMITED TO, ANY IMPLIED +WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, +ACCURACY, COMPLETENESS, OPERABILITY, QUALITY OF SERVICE, OR NON-INFRINGEMENT. +IN NO EVENT SHALL ADVANCED MICRO DEVICES, INC. OR ANY COPYRIGHT HOLDERS OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, PUNITIVE, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, REVENUE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED OR BASED ON ANY THEORY OF LIABILITY +ARISING IN ANY WAY RELATED TO THIS MATERIAL, EVEN IF ADVISED OF THE POSSIBILITY +OF SUCH DAMAGE. THE ENTIRE AND AGGREGATE LIABILITY OF ADVANCED MICRO DEVICES, +INC. AND ANY COPYRIGHT HOLDERS AND CONTRIBUTORS SHALL NOT EXCEED TEN DOLLARS +(US $10.00). ANYONE REDISTRIBUTING OR ACCESSING OR USING THIS MATERIAL ACCEPTS +THIS ALLOCATION OF RISK AND AGREES TO RELEASE ADVANCED MICRO DEVICES, INC. AND +ANY COPYRIGHT HOLDERS AND CONTRIBUTORS FROM ANY AND ALL LIABILITIES, +OBLIGATIONS, CLAIMS, OR DEMANDS IN EXCESS OF TEN DOLLARS (US $10.00). THE +FOREGOING ARE ESSENTIAL TERMS OF THIS LICENSE AND, IF ANY OF THESE TERMS ARE +CONSTRUED AS UNENFORCEABLE, FAIL IN ESSENTIAL PURPOSE, OR BECOME VOID OR +DETRIMENTAL TO ADVANCED MICRO DEVICES, INC. OR ANY COPYRIGHT HOLDERS OR +CONTRIBUTORS FOR ANY REASON, THEN ALL RIGHTS TO REDISTRIBUTE, ACCESS OR USE +THIS MATERIAL SHALL TERMINATE IMMEDIATELY. MOREOVER, THE FOREGOING SHALL +SURVIVE ANY EXPIRATION OR TERMINATION OF THIS LICENSE OR ANY AGREEMENT OR +ACCESS OR USE RELATED TO THIS MATERIAL. + +NOTICE IS HEREBY PROVIDED, AND BY REDISTRIBUTING OR ACCESSING OR USING THIS +MATERIAL SUCH NOTICE IS ACKNOWLEDGED, THAT THIS MATERIAL MAY BE SUBJECT TO +RESTRICTIONS UNDER THE LAWS AND REGULATIONS OF THE UNITED STATES OR OTHER +COUNTRIES, WHICH INCLUDE BUT ARE NOT LIMITED TO, U.S. EXPORT CONTROL LAWS SUCH +AS THE EXPORT ADMINISTRATION REGULATIONS AND NATIONAL SECURITY CONTROLS AS +DEFINED THEREUNDER, AS WELL AS STATE DEPARTMENT CONTROLS UNDER THE U.S. +MUNITIONS LIST. THIS MATERIAL MAY NOT BE USED, RELEASED, TRANSFERRED, IMPORTED, +EXPORTED AND/OR RE-EXPORTED IN ANY MANNER PROHIBITED UNDER ANY APPLICABLE LAWS, +INCLUDING U.S. EXPORT CONTROL LAWS REGARDING SPECIFICALLY DESIGNATED PERSONS, +COUNTRIES AND NATIONALS OF COUNTRIES SUBJECT TO NATIONAL SECURITY CONTROLS. +MOREOVER, THE FOREGOING SHALL SURVIVE ANY EXPIRATION OR TERMINATION OF ANY +LICENSE OR AGREEMENT OR ACCESS OR USE RELATED TO THIS MATERIAL. + +NOTICE REGARDING THE U.S. GOVERNMENT AND DOD AGENCIES: This material is +provided with "RESTRICTED RIGHTS" and/or "LIMITED RIGHTS" as applicable to +computer software and technical data, respectively. Use, duplication, +distribution or disclosure by the U.S. Government and/or DOD agencies is +subject to the full extent of restrictions in all applicable regulations, +including those found at FAR52.227 and DFARS252.227 et seq. and any successor +regulations thereof. Use of this material by the U.S. Government and/or DOD +agencies is acknowledgment of the proprietary rights of any copyright holders +and contributors, including those of Advanced Micro Devices, Inc., as well as +the provisions of FAR52.227-14 through 23 regarding privately developed and/or +commercial computer software. + +This license forms the entire agreement regarding the subject matter hereof and +supersedes all proposals and prior discussions and writings between the parties +with respect thereto. This license does not affect any ownership, rights, title, +or interest in, or relating to, this material. No terms of this license can be +modified or waived, and no breach of this license can be excused, unless done +so in a writing signed by all affected parties. Each term of this license is +separately enforceable. If any term of this license is determined to be or +becomes unenforceable or illegal, such term shall be reformed to the minimum +extent necessary in order for this license to remain in effect in accordance +with its terms as modified by such reformation. This license shall be governed +by and construed in accordance with the laws of the State of Texas without +regard to rules on conflicts of law of any state or jurisdiction or the United +Nations Convention on the International Sale of Goods. All disputes arising out +of this license shall be subject to the jurisdiction of the federal and state +courts in Austin, Texas, and all defenses are hereby waived concerning personal +jurisdiction and venue of these courts. + +============================================================ */ + + +// This is 2 PI / 1024 +#define ANGLE 0x1.921fb6p-8F + +// Return sin and cos of -2*pi*i/1024 +__attribute__((always_inline)) float +k_sincos(int i, float *cretp) +{ + if (i > 512) + i -= 1024; + + float x = i * -ANGLE; + *cretp = native_cos(x); + return native_sin(x); +} + +__attribute__((always_inline)) float4 +k_sincos4(int4 i, float4 *cretp) +{ + i -= (i > 512) & 1024; + float4 x = convert_float4(i) * -ANGLE; + *cretp = native_cos(x); + return native_sin(x); +} + +// Twiddle factor stuff +#define TWGEN(I,C,S) \ + float C; \ + float S = k_sincos(tbase * I, &C) + +#define TW4GEN(I,C,S) \ + float4 C; \ + float4 S = k_sincos4(tbase * I, &C) + +#define TWAPPLY(ZR, ZI, C, S) \ + do { \ + float4 __r = C * ZR - S * ZI; \ + ZI = C * ZI + S * ZR; \ + ZR = __r; \ + } while (0) + +# define TW4IDDLE4() \ + do { \ + TW4GEN(1, c1, s1); \ + TWAPPLY(zr1, zi1, c1, s1); \ + TW4GEN(2, c2, s2); \ + TWAPPLY(zr2, zi2, c2, s2); \ + TW4GEN(3, c3, s3); \ + TWAPPLY(zr3, zi3, c3, s3); \ + } while (0) + +# define TWIDDLE4() \ + do { \ + TWGEN(1, c1, s1); \ + TWAPPLY(zr1, zi1, c1, s1); \ + TWGEN(2, c2, s2); \ + TWAPPLY(zr2, zi2, c2, s2); \ + TWGEN(3, c3, s3); \ + TWAPPLY(zr3, zi3, c3, s3); \ + } while (0) + +// 4 point FFT +#define FFT4() \ + do { \ + float4 ar0 = zr0 + zr2; \ + float4 ar2 = zr1 + zr3; \ + float4 br0 = ar0 + ar2; \ + float4 br1 = zr0 - zr2; \ + float4 br2 = ar0 - ar2; \ + float4 br3 = zr1 - zr3; \ + float4 ai0 = zi0 + zi2; \ + float4 ai2 = zi1 + zi3; \ + float4 bi0 = ai0 + ai2; \ + float4 bi1 = zi0 - zi2; \ + float4 bi2 = ai0 - ai2; \ + float4 bi3 = zi1 - zi3; \ + zr0 = br0; \ + zi0 = bi0; \ + zr1 = br1 + bi3; \ + zi1 = bi1 - br3; \ + zr3 = br1 - bi3; \ + zi3 = br3 + bi1; \ + zr2 = br2; \ + zi2 = bi2; \ + } while (0) + +// First pass of 1K FFT +__attribute__((always_inline)) void +kfft_pass1(uint me, + const __global float *gr, const __global float *gi, + __local float *lds) +{ + const __global float4 *gp; + __local float *lp; + + // Pull in transform data + gp = (const __global float4 *)(gr + (me << 2)); + float4 zr0 = gp[0*64]; + float4 zr1 = gp[1*64]; + float4 zr2 = gp[2*64]; + float4 zr3 = gp[3*64]; + + gp = (const __global float4 *)(gi + (me << 2)); + float4 zi0 = gp[0*64]; + float4 zi1 = gp[1*64]; + float4 zi2 = gp[2*64]; + float4 zi3 = gp[3*64]; + + FFT4(); + + int4 tbase = (int)(me << 2) + (int4)(0, 1, 2, 3); + TW4IDDLE4(); + + // Save registers + // Note that this pointer is not aligned enough to be cast to a float4* + lp = lds + ((me << 2) + (me >> 3)); + + lp[0] = zr0.x; + lp[1] = zr0.y; + lp[2] = zr0.z; + lp[3] = zr0.w; + lp += 66*4; + + lp[0] = zr1.x; + lp[1] = zr1.y; + lp[2] = zr1.z; + lp[3] = zr1.w; + lp += 66*4; + + lp[0] = zr2.x; + lp[1] = zr2.y; + lp[2] = zr2.z; + lp[3] = zr2.w; + lp += 66*4; + + lp[0] = zr3.x; + lp[1] = zr3.y; + lp[2] = zr3.z; + lp[3] = zr3.w; + lp += 66*4; + + // Imaginary part + lp[0] = zi0.x; + lp[1] = zi0.y; + lp[2] = zi0.z; + lp[3] = zi0.w; + lp += 66*4; + + lp[0] = zi1.x; + lp[1] = zi1.y; + lp[2] = zi1.z; + lp[3] = zi1.w; + lp += 66*4; + + lp[0] = zi2.x; + lp[1] = zi2.y; + lp[2] = zi2.z; + lp[3] = zi2.w; + lp += 66*4; + + lp[0] = zi3.x; + lp[1] = zi3.y; + lp[2] = zi3.z; + lp[3] = zi3.w; + + barrier(CLK_LOCAL_MEM_FENCE); +} + +// Second pass of 1K FFT +__attribute__((always_inline)) void +kfft_pass2(uint me, __local float *lds) +{ + __local float *lp; + + // Load registers + lp = lds + (me + (me >> 5)); + + float4 zr0, zr1, zr2, zr3; + + zr0.x = lp[0*66]; + zr1.x = lp[1*66]; + zr2.x = lp[2*66]; + zr3.x = lp[3*66]; + lp += 66*4; + + zr0.y = lp[0*66]; + zr1.y = lp[1*66]; + zr2.y = lp[2*66]; + zr3.y = lp[3*66]; + lp += 66*4; + + zr0.z = lp[0*66]; + zr1.z = lp[1*66]; + zr2.z = lp[2*66]; + zr3.z = lp[3*66]; + lp += 66*4; + + zr0.w = lp[0*66]; + zr1.w = lp[1*66]; + zr2.w = lp[2*66]; + zr3.w = lp[3*66]; + lp += 66*4; + + float4 zi0, zi1, zi2, zi3; + + zi0.x = lp[0*66]; + zi1.x = lp[1*66]; + zi2.x = lp[2*66]; + zi3.x = lp[3*66]; + lp += 66*4; + + zi0.y = lp[0*66]; + zi1.y = lp[1*66]; + zi2.y = lp[2*66]; + zi3.y = lp[3*66]; + lp += 66*4; + + zi0.z = lp[0*66]; + zi1.z = lp[1*66]; + zi2.z = lp[2*66]; + zi3.z = lp[3*66]; + lp += 66*4; + + zi0.w = lp[0*66]; + zi1.w = lp[1*66]; + zi2.w = lp[2*66]; + zi3.w = lp[3*66]; + + // Transform and twiddle + FFT4(); + + int tbase = (int)(me << 2); + TWIDDLE4(); + + barrier(CLK_LOCAL_MEM_FENCE); + + // Store registers + lp = lds + ((me << 2) + (me >> 3)); + + lp[0] = zr0.x; + lp[1] = zr1.x; + lp[2] = zr2.x; + lp[3] = zr3.x; + lp += 66*4; + + lp[0] = zr0.y; + lp[1] = zr1.y; + lp[2] = zr2.y; + lp[3] = zr3.y; + lp += 66*4; + + lp[0] = zr0.z; + lp[1] = zr1.z; + lp[2] = zr2.z; + lp[3] = zr3.z; + lp += 66*4; + + lp[0] = zr0.w; + lp[1] = zr1.w; + lp[2] = zr2.w; + lp[3] = zr3.w; + lp += 66*4; + + // Imaginary part + lp[0] = zi0.x; + lp[1] = zi1.x; + lp[2] = zi2.x; + lp[3] = zi3.x; + lp += 66*4; + + lp[0] = zi0.y; + lp[1] = zi1.y; + lp[2] = zi2.y; + lp[3] = zi3.y; + lp += 66*4; + + lp[0] = zi0.z; + lp[1] = zi1.z; + lp[2] = zi2.z; + lp[3] = zi3.z; + lp += 66*4; + + lp[0] = zi0.w; + lp[1] = zi1.w; + lp[2] = zi2.w; + lp[3] = zi3.w; + + barrier(CLK_LOCAL_MEM_FENCE); +} + +// Third pass of 1K FFT +__attribute__((always_inline)) void +kfft_pass3(uint me, __local float *lds) +{ + __local float *lp; + + // Load registers + lp = lds + (me + (me >> 5)); + + float4 zr0, zr1, zr2, zr3; + + zr0.x = lp[0*66]; + zr1.x = lp[1*66]; + zr2.x = lp[2*66]; + zr3.x = lp[3*66]; + lp += 66*4; + + zr0.y = lp[0*66]; + zr1.y = lp[1*66]; + zr2.y = lp[2*66]; + zr3.y = lp[3*66]; + lp += 66*4; + + zr0.z = lp[0*66]; + zr1.z = lp[1*66]; + zr2.z = lp[2*66]; + zr3.z = lp[3*66]; + lp += 66*4; + + zr0.w = lp[0*66]; + zr1.w = lp[1*66]; + zr2.w = lp[2*66]; + zr3.w = lp[3*66]; + lp += 66*4; + + float4 zi0, zi1, zi2, zi3; + + zi0.x = lp[0*66]; + zi1.x = lp[1*66]; + zi2.x = lp[2*66]; + zi3.x = lp[3*66]; + lp += 66*4; + + zi0.y = lp[0*66]; + zi1.y = lp[1*66]; + zi2.y = lp[2*66]; + zi3.y = lp[3*66]; + lp += 66*4; + + zi0.z = lp[0*66]; + zi1.z = lp[1*66]; + zi2.z = lp[2*66]; + zi3.z = lp[3*66]; + lp += 66*4; + + zi0.w = lp[0*66]; + zi1.w = lp[1*66]; + zi2.w = lp[2*66]; + zi3.w = lp[3*66]; + + // Transform and twiddle + FFT4(); + + int tbase = (int)((me >> 2) << 4); + TWIDDLE4(); + + barrier(CLK_LOCAL_MEM_FENCE); + + // Save registers + lp = lds + me; + + lp[0*66] = zr0.x; + lp[1*66] = zr0.y; + lp[2*66] = zr0.z; + lp[3*66] = zr0.w; + lp += 66*4; + + lp[0*66] = zr1.x; + lp[1*66] = zr1.y; + lp[2*66] = zr1.z; + lp[3*66] = zr1.w; + lp += 66*4; + + lp[0*66] = zr2.x; + lp[1*66] = zr2.y; + lp[2*66] = zr2.z; + lp[3*66] = zr2.w; + lp += 66*4; + + lp[0*66] = zr3.x; + lp[1*66] = zr3.y; + lp[2*66] = zr3.z; + lp[3*66] = zr3.w; + lp += 66*4; + + // Imaginary part + lp[0*66] = zi0.x; + lp[1*66] = zi0.y; + lp[2*66] = zi0.z; + lp[3*66] = zi0.w; + lp += 66*4; + + lp[0*66] = zi1.x; + lp[1*66] = zi1.y; + lp[2*66] = zi1.z; + lp[3*66] = zi1.w; + lp += 66*4; + + lp[0*66] = zi2.x; + lp[1*66] = zi2.y; + lp[2*66] = zi2.z; + lp[3*66] = zi2.w; + lp += 66*4; + + lp[0*66] = zi3.x; + lp[1*66] = zi3.y; + lp[2*66] = zi3.z; + lp[3*66] = zi3.w; + + barrier(CLK_LOCAL_MEM_FENCE); +} + +// Fourth pass of 1K FFT +__attribute__((always_inline)) void +kfft_pass4(uint me, __local float *lds) +{ + __local float *lp; + + // Load registers + lp = lds + ((me & 0x3) + ((me >> 2) & 0x3)*(66*4) + ((me >> 4) << 2)); + + float4 zr0, zr1, zr2, zr3; + + zr0.x = lp[0*66]; + zr0.y = lp[1*66]; + zr0.z = lp[2*66]; + zr0.w = lp[3*66]; + lp += 16; + + zr1.x = lp[0*66]; + zr1.y = lp[1*66]; + zr1.z = lp[2*66]; + zr1.w = lp[3*66]; + lp += 16; + + zr2.x = lp[0*66]; + zr2.y = lp[1*66]; + zr2.z = lp[2*66]; + zr2.w = lp[3*66]; + lp += 16; + + zr3.x = lp[0*66]; + zr3.y = lp[1*66]; + zr3.z = lp[2*66]; + zr3.w = lp[3*66]; + lp += 66*4*4 - 3*16; + + float4 zi0, zi1, zi2, zi3; + + zi0.x = lp[0*66]; + zi0.y = lp[1*66]; + zi0.z = lp[2*66]; + zi0.w = lp[3*66]; + lp += 16; + + zi1.x = lp[0*66]; + zi1.y = lp[1*66]; + zi1.z = lp[2*66]; + zi1.w = lp[3*66]; + lp += 16; + + zi2.x = lp[0*66]; + zi2.y = lp[1*66]; + zi2.z = lp[2*66]; + zi2.w = lp[3*66]; + lp += 16; + + zi3.x = lp[0*66]; + zi3.y = lp[1*66]; + zi3.z = lp[2*66]; + zi3.w = lp[3*66]; + + // Transform and twiddle + FFT4(); + + int tbase = (int)((me >> 4) << 6); + TWIDDLE4(); + + barrier(CLK_LOCAL_MEM_FENCE); + + // Save registers in conflict free manner + lp = lds + me; + + lp[0*68] = zr0.x; + lp[1*68] = zr0.y; + lp[2*68] = zr0.z; + lp[3*68] = zr0.w; + lp += 68*4; + + lp[0*68] = zr1.x; + lp[1*68] = zr1.y; + lp[2*68] = zr1.z; + lp[3*68] = zr1.w; + lp += 68*4; + + lp[0*68] = zr2.x; + lp[1*68] = zr2.y; + lp[2*68] = zr2.z; + lp[3*68] = zr2.w; + lp += 68*4; + + lp[0*68] = zr3.x; + lp[1*68] = zr3.y; + lp[2*68] = zr3.z; + lp[3*68] = zr3.w; + lp += 68*4; + + // Imaginary part + lp[0*68] = zi0.x; + lp[1*68] = zi0.y; + lp[2*68] = zi0.z; + lp[3*68] = zi0.w; + lp += 68*4; + + lp[0*68] = zi1.x; + lp[1*68] = zi1.y; + lp[2*68] = zi1.z; + lp[3*68] = zi1.w; + lp += 68*4; + + lp[0*68] = zi2.x; + lp[1*68] = zi2.y; + lp[2*68] = zi2.z; + lp[3*68] = zi2.w; + lp += 68*4; + + lp[0*68] = zi3.x; + lp[1*68] = zi3.y; + lp[2*68] = zi3.z; + lp[3*68] = zi3.w; + + barrier(CLK_LOCAL_MEM_FENCE); +} + +// Fifth and last pass of 1K FFT +__attribute__((always_inline)) void +kfft_pass5(uint me, + const __local float *lds, + __global float *gr, __global float *gi) +{ + const __local float *lp; + + // Load registers + lp = lds + ((me & 0xf) + (me >> 4)*(68*4)); + + float4 zr0, zr1, zr2, zr3; + + zr0.x = lp[0*68]; + zr0.y = lp[1*68]; + zr0.z = lp[2*68]; + zr0.w = lp[3*68]; + lp += 16; + + zr1.x = lp[0*68]; + zr1.y = lp[1*68]; + zr1.z = lp[2*68]; + zr1.w = lp[3*68]; + lp += 16; + + zr2.x = lp[0*68]; + zr2.y = lp[1*68]; + zr2.z = lp[2*68]; + zr2.w = lp[3*68]; + lp += 16; + + zr3.x = lp[0*68]; + zr3.y = lp[1*68]; + zr3.z = lp[2*68]; + zr3.w = lp[3*68]; + + lp += 68*4*4 - 3*16; + + float4 zi0, zi1, zi2, zi3; + + zi0.x = lp[0*68]; + zi0.y = lp[1*68]; + zi0.z = lp[2*68]; + zi0.w = lp[3*68]; + lp += 16; + + zi1.x = lp[0*68]; + zi1.y = lp[1*68]; + zi1.z = lp[2*68]; + zi1.w = lp[3*68]; + lp += 16; + + zi2.x = lp[0*68]; + zi2.y = lp[1*68]; + zi2.z = lp[2*68]; + zi2.w = lp[3*68]; + lp += 16; + + zi3.x = lp[0*68]; + zi3.y = lp[1*68]; + zi3.z = lp[2*68]; + zi3.w = lp[3*68]; + + // Transform + FFT4(); + + // Save result + __global float4 *gp = (__global float4 *)(gr + (me << 2)); + gp[0*64] = zr0; + gp[1*64] = zr1; + gp[2*64] = zr2; + gp[3*64] = zr3; + + gp = (__global float4 *)(gi + (me << 2)); + gp[0*64] = zi0; + gp[1*64] = zi1; + gp[2*64] = zi2; + gp[3*64] = zi3; +} + +// Distance between first real element of successive 1K vectors +// It must be >= 1024, and a multiple of 4 +#define VSTRIDE (1024+0) + +// Performs a 1K complex FFT with every 64 global ids. +// Each vector is a multiple of VSTRIDE from the first +// Number of global ids must be a multiple of 64, e.g. 1024*64 +// +// greal - pointer to input and output real part of data +// gimag - pointer to input and output imaginary part of data +__kernel void +forward(__global float *greal, __global float *gimag) +{ + // This is 8704 bytes + __local float lds[68*4*4*2]; + + __global float *gr; + __global float *gi; + uint gid = get_global_id(0); + uint me = gid & 0x3fU; + uint dg = (gid >> 6) * VSTRIDE; + + gr = greal + dg; + gi = gimag + dg; + + kfft_pass1(me, gr, gi, lds); + kfft_pass2(me, lds); + kfft_pass3(me, lds); + kfft_pass4(me, lds); + kfft_pass5(me, lds, gr, gi); +} + diff --git a/src/main/java/com/aparapi/examples/extension/mandel.cl b/src/main/java/com/aparapi/examples/extension/mandel.cl index b6c1da01..5f1b50fa 100644 --- a/src/main/java/com/aparapi/examples/extension/mandel.cl +++ b/src/main/java/com/aparapi/examples/extension/mandel.cl @@ -1,91 +1,91 @@ -#define MAX_ITERATIONS 64 - -__constant const int pallette[]={ - -65536, - -59392, - -53248, - -112640, - -106752, - -166144, - -160256, - -219904, - -279552, - -339200, - -399104, - -985344, - -2624000, - -4197376, - -5770496, - -7343872, - -8851712, - -10425088, - -11932928, - -13375232, - -14817792, - -16260096, - -16719602, - -16720349, - -16721097, - -16721846, - -16722595, - -16723345, - -16724351, - -16725102, - -16726110, - -16727119, - -16728129, - -16733509, - -16738889, - -16744269, - -16749138, - -16754006, - -16758619, - -16762976, - -16767077, - -16771178, - -16774767, - -16514932, - -15662970, - -14942079, - -14221189, - -13631371, - -13107088, - -12648342, - -12320669, - -11992995, - -11796393, - -11665328, - -11993019, - -12386248, - -12845011, - -13303773, - -13762534, - -14286830, - -14745588, - -15269881, - -15728637, - -16252927, - 0 -}; - -__kernel void createMandleBrot( - float scale, - float offsetx, - float offsety, - __global int *rgb -){ - int gid = get_global_id(0) + get_global_id(1)*get_global_size(0); - float x = ((((float)(get_global_id(0)) * scale) - ((scale / 2.0f) * (float)get_global_size(0))) / (float)get_global_size(0)) + offsetx; - float y = ((((float)(get_global_id(1)) * scale) - ((scale / 2.0f) * (float)get_global_size(1))) / (float)get_global_size(1)) + offsety; - int count = 0; - float zx = x; - float zy = y; - float new_zx = 0.0f; - for (; count<MAX_ITERATIONS && ((zx * zx) + (zy * zy))<8.0f; count++){ - new_zx = ((zx * zx) - (zy * zy)) + x; - zy = ((2.0f * zx) * zy) + y; - zx = new_zx; - } - rgb[gid] = pallette[count]; -} - +#define MAX_ITERATIONS 64 + +__constant const int pallette[]={ + -65536, + -59392, + -53248, + -112640, + -106752, + -166144, + -160256, + -219904, + -279552, + -339200, + -399104, + -985344, + -2624000, + -4197376, + -5770496, + -7343872, + -8851712, + -10425088, + -11932928, + -13375232, + -14817792, + -16260096, + -16719602, + -16720349, + -16721097, + -16721846, + -16722595, + -16723345, + -16724351, + -16725102, + -16726110, + -16727119, + -16728129, + -16733509, + -16738889, + -16744269, + -16749138, + -16754006, + -16758619, + -16762976, + -16767077, + -16771178, + -16774767, + -16514932, + -15662970, + -14942079, + -14221189, + -13631371, + -13107088, + -12648342, + -12320669, + -11992995, + -11796393, + -11665328, + -11993019, + -12386248, + -12845011, + -13303773, + -13762534, + -14286830, + -14745588, + -15269881, + -15728637, + -16252927, + 0 +}; + +__kernel void createMandleBrot( + float scale, + float offsetx, + float offsety, + __global int *rgb +){ + int gid = get_global_id(0) + get_global_id(1)*get_global_size(0); + float x = ((((float)(get_global_id(0)) * scale) - ((scale / 2.0f) * (float)get_global_size(0))) / (float)get_global_size(0)) + offsetx; + float y = ((((float)(get_global_id(1)) * scale) - ((scale / 2.0f) * (float)get_global_size(1))) / (float)get_global_size(1)) + offsety; + int count = 0; + float zx = x; + float zy = y; + float new_zx = 0.0f; + for (; count<MAX_ITERATIONS && ((zx * zx) + (zy * zy))<8.0f; count++){ + new_zx = ((zx * zx) - (zy * zy)) + x; + zy = ((2.0f * zx) * zy) + y; + zx = new_zx; + } + rgb[gid] = pallette[count]; +} + diff --git a/src/main/java/com/aparapi/examples/extension/mandel2.cl b/src/main/java/com/aparapi/examples/extension/mandel2.cl index e7043427..79956b78 100644 --- a/src/main/java/com/aparapi/examples/extension/mandel2.cl +++ b/src/main/java/com/aparapi/examples/extension/mandel2.cl @@ -1,95 +1,95 @@ -#define MAX_ITERATIONS 64 - -__constant const int pallette[]={ - -65536, - -59392, - -53248, - -112640, - -106752, - -166144, - -160256, - -219904, - -279552, - -339200, - -399104, - -985344, - -2624000, - -4197376, - -5770496, - -7343872, - -8851712, - -10425088, - -11932928, - -13375232, - -14817792, - -16260096, - -16719602, - -16720349, - -16721097, - -16721846, - -16722595, - -16723345, - -16724351, - -16725102, - -16726110, - -16727119, - -16728129, - -16733509, - -16738889, - -16744269, - -16749138, - -16754006, - -16758619, - -16762976, - -16767077, - -16771178, - -16774767, - -16514932, - -15662970, - -14942079, - -14221189, - -13631371, - -13107088, - -12648342, - -12320669, - -11992995, - -11796393, - -11665328, - -11993019, - -12386248, - -12845011, - -13303773, - -13762534, - -14286830, - -14745588, - -15269881, - -15728637, - -16252927, - 0 -}; - -#define WIDTH get_global_size(0) -#define HEIGHT get_global_size(1) -#define X get_global_id(0) -#define Y get_global_id(1) - -__kernel void createMandleBrot( - float scale, - float offsetx, - float offsety, - __global int *rgb - ){ - float x = ((((float)(X) * scale) - ((scale / 2.0f) * (float)WIDTH)) / (float)WIDTH) + offsetx; - float y = ((((float)(Y) * scale) - ((scale / 2.0f) * (float)HEIGHT)) / (float)HEIGHT) + offsety; - float zx = x; - float zy = y; - float new_zx = 0.0f; - int count = 0; - for (; count<MAX_ITERATIONS && ((zx * zx) + (zy * zy))<8.0f; count++){ - new_zx = ((zx * zx) - (zy * zy)) + x; - zy = ((2.0f * zx) * zy) + y; - zx = new_zx; - } - rgb[X + Y*WIDTH] = pallette[count]; -} - +#define MAX_ITERATIONS 64 + +__constant const int pallette[]={ + -65536, + -59392, + -53248, + -112640, + -106752, + -166144, + -160256, + -219904, + -279552, + -339200, + -399104, + -985344, + -2624000, + -4197376, + -5770496, + -7343872, + -8851712, + -10425088, + -11932928, + -13375232, + -14817792, + -16260096, + -16719602, + -16720349, + -16721097, + -16721846, + -16722595, + -16723345, + -16724351, + -16725102, + -16726110, + -16727119, + -16728129, + -16733509, + -16738889, + -16744269, + -16749138, + -16754006, + -16758619, + -16762976, + -16767077, + -16771178, + -16774767, + -16514932, + -15662970, + -14942079, + -14221189, + -13631371, + -13107088, + -12648342, + -12320669, + -11992995, + -11796393, + -11665328, + -11993019, + -12386248, + -12845011, + -13303773, + -13762534, + -14286830, + -14745588, + -15269881, + -15728637, + -16252927, + 0 +}; + +#define WIDTH get_global_size(0) +#define HEIGHT get_global_size(1) +#define X get_global_id(0) +#define Y get_global_id(1) + +__kernel void createMandleBrot( + float scale, + float offsetx, + float offsety, + __global int *rgb + ){ + float x = ((((float)(X) * scale) - ((scale / 2.0f) * (float)WIDTH)) / (float)WIDTH) + offsetx; + float y = ((((float)(Y) * scale) - ((scale / 2.0f) * (float)HEIGHT)) / (float)HEIGHT) + offsety; + float zx = x; + float zy = y; + float new_zx = 0.0f; + int count = 0; + for (; count<MAX_ITERATIONS && ((zx * zx) + (zy * zy))<8.0f; count++){ + new_zx = ((zx * zx) - (zy * zy)) + x; + zy = ((2.0f * zx) * zy) + y; + zx = new_zx; + } + rgb[X + Y*WIDTH] = pallette[count]; +} + diff --git a/src/main/java/com/aparapi/examples/extension/squarer.cl b/src/main/java/com/aparapi/examples/extension/squarer.cl index 7169cfc1..78c208da 100644 --- a/src/main/java/com/aparapi/examples/extension/squarer.cl +++ b/src/main/java/com/aparapi/examples/extension/squarer.cl @@ -1,5 +1,5 @@ -__kernel void square( __global float *in, __global float *out){ - const size_t id = get_global_id(0); - out[id] = in[id]*in[id]; -} - +__kernel void square( __global float *in, __global float *out){ + const size_t id = get_global_id(0); + out[id] = in[id]*in[id]; +} + diff --git a/src/main/java/com/aparapi/examples/javaonedemo/NBody.java b/src/main/java/com/aparapi/examples/javaonedemo/NBody.java index 74b41f41..93d997c8 100644 --- a/src/main/java/com/aparapi/examples/javaonedemo/NBody.java +++ b/src/main/java/com/aparapi/examples/javaonedemo/NBody.java @@ -125,10 +125,6 @@ public class NBody{ private final float[] vxyz; // velocity component of x,y and z of bodies - /** - * Constructor initializes xyz and vxyz arrays. - * @param _bodies - */ public NBodyKernel(Range _range) { range = _range; // range = Range.create(bodies); @@ -245,8 +241,6 @@ public class NBody{ }); controlPanel.add(startButton); - // controlPanel.add(new JLabel(" Particles")); - final String[] choices = new String[] { // "Java Sequential", "Java Threads", @@ -259,16 +253,10 @@ public class NBody{ @Override public void itemStateChanged(ItemEvent e) { final String item = (String) modeButton.getSelectedItem(); - // if (item.equals(choices[2])) { - // modeButton = gpuMandelBrot; - // } else if (item.equals(choices[0])) { kernel.setExecutionMode(Kernel.EXECUTION_MODE.JTP); - // modeButton = javaMandelBrot; } else if (item.equals(choices[1])) { - // lifeKernel = lifeKernelGPU; - // modeButton = javaMandelBrotMultiThread; kernel.setExecutionMode(Kernel.EXECUTION_MODE.GPU); } } @@ -326,7 +314,7 @@ public class NBody{ gl.glColor3f(1f, 1f, 1f); final GLU glu = new GLU(); - glu.gluPerspective(45f, ratio, 0f, 1000f); + glu.gluPerspective(45f, ratio, 1f, 1000f); glu.gluLookAt(xeye, yeye, zeye * zoomFactor, xat, yat, zat, 0f, 1f, 0f); if (running) { @@ -368,12 +356,12 @@ public class NBody{ gl.glEnable(GL.GL_BLEND); gl.glBlendFunc(GL.GL_SRC_ALPHA, GL.GL_ONE); try { - final InputStream textureStream = NBody.class.getResourceAsStream("particle.jpg"); + final InputStream textureStream = NBody.class.getResourceAsStream("/particle.jpg"); + if( textureStream == null ) + throw new IllegalStateException("Could not access particle.jpg resource"); texture = TextureIO.newTexture(textureStream, false, null); - } catch (final IOException e) { - e.printStackTrace(); - } catch (final GLException e) { - e.printStackTrace(); + } catch (final IOException | GLException e) { + throw new IllegalStateException("Could not create texture", e); } } diff --git a/src/main/java/com/aparapi/examples/median/MedianDemo.java b/src/main/java/com/aparapi/examples/median/MedianDemo.java index b565839d..3f67532c 100644 --- a/src/main/java/com/aparapi/examples/median/MedianDemo.java +++ b/src/main/java/com/aparapi/examples/median/MedianDemo.java @@ -32,6 +32,7 @@ import javax.swing.*; import java.awt.*; import java.awt.image.*; import java.io.*; +import java.net.URISyntaxException; /** * Demonstrate use of __private namespaces and @NoCL annotations. @@ -41,12 +42,12 @@ public class MedianDemo { static { try { - File imageFile = new File("./src/main/resources/testcard.jpg").getCanonicalFile(); + File imageFile = new File(MedianDemo.class.getResource("/testcard.jpg").toURI()).getCanonicalFile(); if (imageFile.exists()) { testImage = ImageIO.read(imageFile); } - } catch (IOException e) { - throw new RuntimeException(e); + } catch (IOException | URISyntaxException e) { + throw new IllegalStateException("Could not open image", e); } } @@ -63,18 +64,6 @@ public class MedianDemo { System.setProperty("com.aparapi.enableExecutionModeReporting", "true"); } -// KernelManager.setKernelManager(new KernelManager(){ -// @Override -// protected Comparator<OpenCLDevice> getDefaultGPUComparator() { -// return new Comparator<OpenCLDevice>() { -// @Override -// public int compare(OpenCLDevice o1, OpenCLDevice o2) { -// return o2.getMaxComputeUnits() - o1.getMaxComputeUnits(); -// } -// }; -// } -// }); - System.out.println(KernelManager.instance().bestDevice()); int[] argbs = testImage.getRGB(0, 0, testImage.getWidth(), testImage.getHeight(), null, 0, testImage.getWidth()); diff --git a/src/main/java/com/aparapi/examples/nbody/Local.java b/src/main/java/com/aparapi/examples/nbody/Local.java index 93926b16..837d349d 100644 --- a/src/main/java/com/aparapi/examples/nbody/Local.java +++ b/src/main/java/com/aparapi/examples/nbody/Local.java @@ -121,10 +121,6 @@ public class Local{ @Local private final float[] localStuff; // local memory - /** - * Constructor initializes xyz and vxyz arrays. - * @param _bodies - */ public NBodyKernel(Range _range) { range = _range; localStuff = new float[range.getLocalSize(0) * 3]; @@ -307,7 +303,7 @@ public class Local{ gl.glColor3f(1f, 1f, 1f); final GLU glu = new GLU(); - glu.gluPerspective(45f, ratio, 0f, 1000f); + glu.gluPerspective(45f, ratio, 1f, 1000f); glu.gluLookAt(xeye, yeye, zeye * zoomFactor, xat, yat, zat, 0f, 1f, 0f); if (running) { @@ -351,7 +347,7 @@ public class Local{ gl.glEnable(GL.GL_BLEND); gl.glBlendFunc(GL.GL_SRC_ALPHA, GL.GL_ONE); try { - final InputStream textureStream = Local.class.getResourceAsStream("particle.jpg"); + final InputStream textureStream = Local.class.getResourceAsStream("/particle.jpg"); final Texture texture = TextureIO.newTexture(textureStream, false, null); texture.enable(gl); } catch (final IOException e) { diff --git a/src/main/java/com/aparapi/examples/nbody/Main.java b/src/main/java/com/aparapi/examples/nbody/Main.java index 5f8fdfaa..e80fc948 100644 --- a/src/main/java/com/aparapi/examples/nbody/Main.java +++ b/src/main/java/com/aparapi/examples/nbody/Main.java @@ -309,7 +309,7 @@ public class Main{ gl.glColor3f(1f, 1f, 1f); final GLU glu = new GLU(); - glu.gluPerspective(45f, ratio, 0f, 1000f); + glu.gluPerspective(45f, ratio, 1f, 1000f); glu.gluLookAt(xeye, yeye, zeye * zoomFactor, xat, yat, zat, 0f, 1f, 0f); if (running) { @@ -356,7 +356,7 @@ public class Main{ gl.glTexParameteri(GL.GL_TEXTURE_2D, GL.GL_TEXTURE_MAG_FILTER, GL.GL_LINEAR); gl.glTexParameteri(GL.GL_TEXTURE_2D, GL.GL_TEXTURE_MIN_FILTER, GL.GL_NEAREST); try { - final InputStream textureStream = Main.class.getResourceAsStream("particle.jpg"); + final InputStream textureStream = Main.class.getResourceAsStream("/particle.jpg"); TextureData data = TextureIO.newTextureData(profile, textureStream, false, "jpg"); texture = TextureIO.newTexture(data); } catch (final IOException e) { diff --git a/src/main/java/com/aparapi/examples/nbody/Seq.java b/src/main/java/com/aparapi/examples/nbody/Seq.java index 54a128b6..a6b18135 100644 --- a/src/main/java/com/aparapi/examples/nbody/Seq.java +++ b/src/main/java/com/aparapi/examples/nbody/Seq.java @@ -311,7 +311,7 @@ public class Seq{ gl.glColor3f(1f, 1f, 1f); final GLU glu = new GLU(); - glu.gluPerspective(45f, ratio, 0f, 1000f); + glu.gluPerspective(45f, ratio, 1f, 1000f); glu.gluLookAt(xeye, yeye, zeye * zoomFactor, xat, yat, zat, 0f, 1f, 0f); if (running) { @@ -347,7 +347,7 @@ public class Seq{ gl.glTexParameteri(GL.GL_TEXTURE_2D, GL.GL_TEXTURE_MAG_FILTER, GL.GL_LINEAR); gl.glTexParameteri(GL.GL_TEXTURE_2D, GL.GL_TEXTURE_MIN_FILTER, GL.GL_NEAREST); try { - final InputStream textureStream = Seq.class.getResourceAsStream("particle.jpg"); + final InputStream textureStream = Seq.class.getResourceAsStream("/particle.jpg"); TextureData data = TextureIO.newTextureData(profile, textureStream, false, "jpg"); texture = TextureIO.newTexture(data); } catch (final IOException e) { diff --git a/src/main/java/com/aparapi/examples/oopnbody/Main.java b/src/main/java/com/aparapi/examples/oopnbody/Main.java index fae1ac0d..19c2380a 100644 --- a/src/main/java/com/aparapi/examples/oopnbody/Main.java +++ b/src/main/java/com/aparapi/examples/oopnbody/Main.java @@ -321,7 +321,7 @@ public class Main{ gl.glColor3f(1f, 1f, 1f); final GLU glu = new GLU(); - glu.gluPerspective(45f, ratio, 0f, 1000f); + glu.gluPerspective(45f, ratio, 1f, 1000f); glu.gluLookAt(xeye, yeye, zeye * zoomFactor, xat, yat, zat, 0f, 1f, 0f); if (running) { @@ -360,7 +360,7 @@ public class Main{ gl.glTexParameteri(GL.GL_TEXTURE_2D, GL.GL_TEXTURE_MAG_FILTER, GL.GL_LINEAR); gl.glTexParameteri(GL.GL_TEXTURE_2D, GL.GL_TEXTURE_MIN_FILTER, GL.GL_NEAREST); try { - final InputStream textureStream = Main.class.getResourceAsStream("particle.jpg"); + final InputStream textureStream = Main.class.getResourceAsStream("/particle.jpg"); texture = TextureIO.newTexture(textureStream, false, null); texture.enable(gl); } catch (final IOException e) { -- GitLab