From 1df93467f45d7334a33c08edf0b9146673efc79a Mon Sep 17 00:00:00 2001
From: Jeffrey Phillips Freeman <jeffrey.freeman@syncleus.com>
Date: Tue, 17 Oct 2017 22:00:59 -0400
Subject: [PATCH] fix: no longer requires manual installation of gluegen.

---
 .gitattributes                                |    3 +
 Notes.txt                                     |    6 +-
 .../pnnl/aparapi/matrix/CorrMatrixHost.java   |  728 ++++----
 .../gov/pnnl/aparapi/test/CorrMatrixTest.java |  338 ++--
 examples/movie/.project                       |   17 -
 examples/movie/build.xml                      |  254 +--
 examples/movie/movie.bat                      |   30 +-
 .../examples/movie/AparapiSolution.java       |  260 +--
 .../examples/movie/ConvMatrix3x3Editor.java   |  302 ++--
 .../aparapi/examples/movie/JJMPEGPlayer.java  |  294 ++--
 .../examples/movie/PureJavaSolution.java      |   70 +-
 .../examples/movie/ReferenceSolution.java     |  272 +--
 pom.xml                                       |    7 +-
 .../examples/convolution/Convolution.java     |    9 +-
 .../convolution/ConvolutionOpenCL.java        |    8 +-
 .../examples/convolution/PureJava.java        |    8 +-
 .../examples/convolution/convolution.cl       |   56 +-
 .../examples/extension/HistogramKernel.cl     |  332 ++--
 .../com/aparapi/examples/extension/fft.cl     | 1474 ++++++++---------
 .../com/aparapi/examples/extension/mandel.cl  |  182 +-
 .../com/aparapi/examples/extension/mandel2.cl |  190 +--
 .../com/aparapi/examples/extension/squarer.cl |   10 +-
 .../aparapi/examples/javaonedemo/NBody.java   |   24 +-
 .../aparapi/examples/median/MedianDemo.java   |   19 +-
 .../com/aparapi/examples/nbody/Local.java     |    8 +-
 .../java/com/aparapi/examples/nbody/Main.java |    4 +-
 .../java/com/aparapi/examples/nbody/Seq.java  |    4 +-
 .../com/aparapi/examples/oopnbody/Main.java   |    4 +-
 28 files changed, 2447 insertions(+), 2466 deletions(-)
 delete mode 100644 examples/movie/.project

diff --git a/.gitattributes b/.gitattributes
index 456e49e9..867f823d 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,4 +1,7 @@
 * eol=lf
 *.bat eol=crlf
 *.sh eol=lf
+*.jpg binary
+*.png binary
+*.pdf binary
 .git* export-ignore
diff --git a/Notes.txt b/Notes.txt
index fd9aca63..8ad6ae40 100644
--- a/Notes.txt
+++ b/Notes.txt
@@ -1,3 +1,3 @@
-Maybe we can use Java 7's method handles for this?
-http://java.sun.com/developer/technicalArticles/DynTypeLang/
-
+Maybe we can use Java 7's method handles for this?
+http://java.sun.com/developer/technicalArticles/DynTypeLang/
+
diff --git a/examples/correlation-matrix/src/java/gov/pnnl/aparapi/matrix/CorrMatrixHost.java b/examples/correlation-matrix/src/java/gov/pnnl/aparapi/matrix/CorrMatrixHost.java
index e2a5a2ce..f6d3cb42 100644
--- a/examples/correlation-matrix/src/java/gov/pnnl/aparapi/matrix/CorrMatrixHost.java
+++ b/examples/correlation-matrix/src/java/gov/pnnl/aparapi/matrix/CorrMatrixHost.java
@@ -1,364 +1,364 @@
-/**
- * This material was prepared as an account of work sponsored by an agency of the United States Government.  
- * Neither the United States Government nor the United States Department of Energy, nor Battelle, nor any of 
- * their employees, nor any jurisdiction or organization that has cooperated in the development of these materials, 
- * makes any warranty, express or implied, or assumes any legal liability or responsibility for the accuracy, 
- * completeness, or usefulness or any information, apparatus, product, software, or process disclosed, or represents
- * that its use would not infringe privately owned rights.
- */
-package gov.pnnl.aparapi.matrix;
-
-import org.apache.log4j.Logger;
-
-import com.aparapi.Kernel;
-import com.aparapi.Kernel.EXECUTION_MODE;
-import com.aparapi.Range;
-import com.aparapi.device.Device;
-import com.aparapi.device.OpenCLDevice;
-
-/**
- * GPU calculations using OpenBitSet Intersection for OpenBitSets
- * 
- * Based on code from: <br/>
- * {@link http://grepcode.com/file/repo1.maven.org/maven2/org.apache.lucene/lucene-core/3.1.0/org/apache/lucene/util/BitUtil.java}
- * 
- * @author ryan.lamothe at gmail.com
- * @author sedillard at gmail.com
- */
-public class CorrMatrixHost {
-
-   private static final Logger LOG = Logger.getLogger(CorrMatrixHost.class);
-
-   /**
-    * Perform matrix intersection for two lists of Lucene OpenBitSet-based packed longs
-    * 
-    * @param matrixA
-    *    The first term-document matrix
-    * @param matrixB
-    *    The second term-document matrix
-    * @param Aparapi EXECUTION_MODE
-    * @return result Matrix
-    * @throws Exception
-    */
-   public static int[][] intersectionMatrix(final long[][] matrixA, final long[][] matrixB, final EXECUTION_MODE executionMode) throws Exception {
-
-      // Basic validation
-      if (matrixA == null) {
-         throw new NullPointerException("MatrixA cannot be NULL");
-      }
-
-      if (matrixB == null) {
-         throw new NullPointerException("MatrixB cannot be NULL");
-      }
-
-      // Size of an array is 8 bytes for the object + 4 bytes for the header and length information
-      final int arrayMemOverhead = 12;
-
-      // numDocs/64 since they are packed into longs
-      // We need to make our matrix sizes multiples of BLOCK_SIZE
-      final int matrixA_numTerms = matrixA.length;
-      final int matrixA_numLongs = matrixA[0].length;
-
-      if (LOG.isDebugEnabled()) {
-         LOG.debug("----------");
-         LOG.debug("MatrixA NumTerms (Rows): " + matrixA_numTerms);
-         LOG.debug("MatrixA NumLongs (Columns): " + matrixA_numLongs);
-         LOG.debug("MatrixA NumDocs: " + (matrixA_numLongs * 64L));
-      }
-
-      final long matrixA_BytesPerRow = matrixA_numLongs * 8L;
-      final long matrixA_TotalBytes = (matrixA_numTerms * matrixA_BytesPerRow) + arrayMemOverhead;
-
-      if (LOG.isDebugEnabled()) {
-         LOG.debug("MatrixA Total Memory Size: " + humanReadableByteCount(matrixA_TotalBytes, true));
-      }
-
-      final int matrixB_numTerms = matrixB.length;
-      final int matrixB_numLongs = matrixB[0].length;
-
-      if (LOG.isDebugEnabled()) {
-         LOG.debug("----------");
-         LOG.debug("MatrixB NumTerms (Rows): " + matrixB_numTerms);
-         LOG.debug("MatrixB NumLongs (Columns): " + matrixB_numLongs);
-         LOG.debug("MatrixB NumDocs: " + (matrixB_numLongs * 64L));
-      }
-
-      final long matrixB_BytesPerRow = matrixB_numLongs * 8L;
-      final long matrixB_TotalBytes = (matrixB_numTerms * matrixB_BytesPerRow) + arrayMemOverhead;
-
-      if (LOG.isDebugEnabled()) {
-         LOG.debug("MatrixB Total Memory Size: " + humanReadableByteCount(matrixB_TotalBytes, true));
-         LOG.debug("----------");
-      }
-
-      final int[][] resultMatrix = new int[matrixA_numTerms][matrixB_numTerms];
-
-      if (LOG.isDebugEnabled()) {
-         final long resultMatrix_TotalBytes = (matrixA_numTerms * matrixB_numTerms * 4L) + arrayMemOverhead;
-         LOG.debug("ResultMatrix Memory Size: " + humanReadableByteCount(resultMatrix_TotalBytes, true));
-         LOG.debug("Total Requested Memory Size: " + humanReadableByteCount(matrixA_TotalBytes + matrixB_TotalBytes + resultMatrix_TotalBytes, true));
-         LOG.debug("----------");
-      }
-
-      int NUM_SUB_ROWS = matrixA_numTerms; // Default number of sub-rows
-
-      OpenCLDevice device = null;
-
-      // We do not test for EXECUTION_MODE.JTP because JTP is non-OpenCL
-      if (executionMode.equals(EXECUTION_MODE.CPU)) {
-         device = (OpenCLDevice) Device.firstCPU();
-
-         if (device == null) {
-            LOG.warn("OpenCLDevice.CPU is NULL...OpenCL is unavailable. Setting to JTP mode.");
-            LOG.debug("----------");
-         }
-      } else if (executionMode.equals(EXECUTION_MODE.GPU)) {
-         device = (OpenCLDevice) Device.best();
-
-         if (device == null) {
-            LOG.warn("OpenCLDevice.GPU is NULL...OpenCL is unavailable. Setting to JTP mode.");
-            LOG.debug("----------");
-         }
-      }
-
-      // This is to create stripes of rows that will fit into OpenCL's available memory
-      // Calculate the number of sub-rows by calling OpenCL to find out available memory
-      // Length of row * 8 (size of long in bytes) * number of rows to available memory
-      final int maxNumTerms = Math.max(matrixA_numTerms, matrixB_numTerms);
-
-      if (device != null) {
-         final long globalMemSize = device.getGlobalMemSize();
-         // final long maxMemAllocSize = Math.max((globalMemSize/4), 128*1024*1024);
-         final long maxMemAllocSize = device.getMaxMemAllocSize();
-
-         // 1048576 bytes in a megabyte (1024*1024)
-         // Java long is 8 bytes
-         // 131072 longs in 1 megabyte
-         // SAFE OpenCL spec allocation is max(1/4 GlobalMemSize)
-         // ***During our testing this appears to be incorrectly/inconsistently reported depending on os/drivers/hardware***
-         if (LOG.isDebugEnabled()) {
-            LOG.debug("Available OpenCL globalMemSize: " + humanReadableByteCount(globalMemSize, true));
-            LOG.debug("Available OpenCL maxMemAllocSize: " + humanReadableByteCount(maxMemAllocSize, true));
-         }
-
-         // Maybe there is a more clever way to do this :)
-         // The idea here is to decide how many sub-rows of the matrix we can fit on a single card
-         // The long-term goal to divide up the work for both small RAM GPUs and multiple GPUs
-         int subRowsCounterA = 0;
-         int subRowsCounterB = 0;
-         long subRowsMemSizeA = 0L;
-         long subRowsMemSizeB = 0L;
-         long subResultMatrixMemSize = 0L;
-         long subTotalMemSize = 0L;
-
-         do {
-            if (subRowsCounterA < matrixA_numTerms) {
-               subRowsMemSizeA = subRowsCounterA != 0 ? (subRowsCounterA * matrixA_numLongs * 8L) + arrayMemOverhead : 0;
-               subRowsCounterA += 1;
-            } else if (subRowsCounterA == matrixA_numTerms) {
-               subRowsMemSizeA = subRowsCounterA != 0 ? (subRowsCounterA * matrixA_numLongs * 8L) + arrayMemOverhead : 0;
-            }
-
-            if (subRowsCounterB < matrixB_numTerms) {
-               subRowsMemSizeB = subRowsCounterB != 0 ? (subRowsCounterB * matrixB_numLongs * 8L) + arrayMemOverhead : 0;
-               subRowsCounterB += 1;
-            } else if (subRowsCounterB == matrixB_numTerms) {
-               subRowsMemSizeB = subRowsCounterB != 0 ? (subRowsCounterB * matrixB_numLongs * 8L) + arrayMemOverhead : 0;
-            }
-
-            // This is 4 bytes since the sub-result matrix is an int array
-            subResultMatrixMemSize = ((subRowsCounterA * subRowsCounterB) * 4L) + arrayMemOverhead;
-
-            subTotalMemSize = subRowsMemSizeA + subRowsMemSizeB + subResultMatrixMemSize;
-         } while ((Math.max(subRowsCounterA, subRowsCounterB) < maxNumTerms) && (subTotalMemSize <= maxMemAllocSize));
-
-         // If using OpenCL override the default number of subrows
-         NUM_SUB_ROWS = Math.max(subRowsCounterA, subRowsCounterB);
-
-         if (NUM_SUB_ROWS < maxNumTerms) {
-            final long subMatrixA_memSize = (NUM_SUB_ROWS * matrixA_numLongs * 8L) + arrayMemOverhead;
-            final long subMatrixB_memSize = (NUM_SUB_ROWS * matrixB_numLongs * 8L) + arrayMemOverhead;
-            final long subResultMatrix_memSize = (NUM_SUB_ROWS * NUM_SUB_ROWS * 4L) + arrayMemOverhead;
-
-            LOG.warn("****************************************************************");
-            LOG.warn("Requested matrix computation is larger than available OpenCL memory");
-            LOG.warn("Matrix striping is occurring to fit all data into OpenCL memory...");
-            LOG.warn("");
-            LOG.warn("Number rows requested: " + maxNumTerms);
-            LOG.warn("Number rows that fit: " + NUM_SUB_ROWS);
-            LOG.warn("");
-            LOG.warn("SubMatrixA Memory Size: " + humanReadableByteCount(subMatrixA_memSize, true));
-            LOG.warn("SubMatrixB Memory Size: " + humanReadableByteCount(subMatrixB_memSize, true));
-            LOG.warn("SubResultMatrix Memory Size: " + humanReadableByteCount(subResultMatrix_memSize, true));
-            LOG.warn("SubMatrix Total Memory Size: " + humanReadableByteCount(subMatrixA_memSize + subMatrixB_memSize + subResultMatrix_memSize, true));
-            LOG.warn("****************************************************************");
-         }
-      }
-
-      final int numSubBlocksA = ((matrixA_numTerms + NUM_SUB_ROWS) - 1) / NUM_SUB_ROWS;
-      final int numSubBlocksB = ((matrixB_numTerms + NUM_SUB_ROWS) - 1) / NUM_SUB_ROWS;
-
-      final long[] subMatrixA = new long[NUM_SUB_ROWS * matrixA_numLongs];
-      final long[] subMatrixB = new long[NUM_SUB_ROWS * matrixB_numLongs];
-      final int[] subResultMatrix = new int[NUM_SUB_ROWS * NUM_SUB_ROWS];
-
-      final CorrMatrixKernel kernel = new CorrMatrixKernel(subMatrixA, NUM_SUB_ROWS, subMatrixB, NUM_SUB_ROWS, matrixA_numLongs, subResultMatrix);
-      kernel.setExplicit(true);
-
-      // Here we define a fall-back strategy, since the user may have wanted to execute only a single execution mode
-      if (executionMode.equals(EXECUTION_MODE.GPU) && (device != null)) {
-         kernel.addExecutionModes(EXECUTION_MODE.GPU, EXECUTION_MODE.CPU, EXECUTION_MODE.JTP);
-         LOG.debug("Execution Fallback Strategy: GPU --> CPU --> JTP");
-      } else if (executionMode.equals(EXECUTION_MODE.CPU) && (device != null)) {
-         kernel.addExecutionModes(EXECUTION_MODE.CPU, EXECUTION_MODE.JTP);
-         LOG.debug("Execution Fallback Strategy: CPU --> JTP");
-      } else {
-         kernel.addExecutionModes(EXECUTION_MODE.JTP);
-         LOG.debug("Execution Strategy: JTP");
-      }
-
-      try {
-         for (int a = 0; a < numSubBlocksA; a++) {
-            for (int b = 0; b < numSubBlocksB; b++) {
-               final int aSubRowStart = a * NUM_SUB_ROWS;
-               final int aSubRowEnd = Math.min(matrixA_numTerms, aSubRowStart + NUM_SUB_ROWS);
-
-               for (int i = aSubRowStart; i < aSubRowEnd; i++) {
-                  if (matrixA_numLongs != matrixA[i].length) {
-                     throw new Exception("All rows in the matrix need be the same length");
-                  }
-
-                  System.arraycopy(matrixA[i], 0, subMatrixA, (i - aSubRowStart) * matrixA_numLongs, matrixA_numLongs);
-               }
-
-               final int bSubRowStart = b * NUM_SUB_ROWS;
-               final int bSubRowEnd = Math.min(matrixB_numTerms, bSubRowStart + NUM_SUB_ROWS);
-
-               for (int i = bSubRowStart; i < bSubRowEnd; i++) {
-                  if (matrixA_numLongs != matrixB[i].length) {
-                     throw new Exception("All rows in the matrix need be the same length");
-                  }
-
-                  System.arraycopy(matrixB[i], 0, subMatrixB, (i - bSubRowStart) * matrixB_numLongs, matrixB_numLongs);
-               }
-
-               // Since matrixA_NumLongs == matrixB_NumLongs we're only going to pass matrixA_NumLongs
-               executeKernel(device, subMatrixA, aSubRowEnd - aSubRowStart, subMatrixB, bSubRowEnd - bSubRowStart, matrixA_numLongs, subResultMatrix, kernel);
-
-               // Convert one dimensional array to two dimensional array in the expected output ordering
-               for (int i = 0; i < NUM_SUB_ROWS; i++) {
-                  if ((i + aSubRowStart) < aSubRowEnd) {
-                     System.arraycopy(subResultMatrix, i * NUM_SUB_ROWS, resultMatrix[i + aSubRowStart], bSubRowStart, bSubRowEnd - bSubRowStart);
-                  }
-               }
-            }
-         }
-      } finally {
-         if (LOG.isDebugEnabled()) {
-            LOG.debug("----------");
-            LOG.debug("Aparapi Gross Execution Time: " + kernel.getAccumulatedExecutionTime() + " ms <------ Aparapi");
-            LOG.debug("OpenCL Generation Time: " + kernel.getConversionTime() + " ms");
-            LOG.debug("Kernel Net Execution Time: " + (kernel.getAccumulatedExecutionTime() - kernel.getConversionTime()) + " ms");
-            LOG.debug("----------");
-         }
-
-         try {
-            kernel.dispose();
-         } catch (final UnsatisfiedLinkError e) {
-            LOG.error("Aparapi failed to dispose of the kernel", e);
-         }
-      }
-
-      return resultMatrix;
-   }
-
-   /**
-    * Execute the GPU kernel
-    * 
-    * @param subMatrixA
-    * @param matrixA_NumTerms
-    * @param subMatrixB
-    * @param matrixB_NumTerms
-    * @param numLongs
-    * @param subResultMatrix
-    * @param kernel
-    * 
-    * @return resultMatrix
-    */
-   private static void executeKernel(final Device device, final long[] subMatrixA, final int matrixA_NumTerms, final long[] subMatrixB, final int matrixB_NumTerms, final int numLongs, final int[] subResultMatrix, final Kernel kernel) {
-
-      // Power of Two for best performance
-      int matrixA_NumTermsRnd = matrixA_NumTerms;
-      while (!isPowerOfTwo(matrixA_NumTermsRnd)) {
-         matrixA_NumTermsRnd += 1;
-      }
-
-      int matrixB_NumTermsRnd = matrixB_NumTerms;
-      while (!isPowerOfTwo(matrixB_NumTermsRnd)) {
-         matrixB_NumTermsRnd += 1;
-      }
-
-      final Range range;
-      if (device != null) {
-         range = Range.create2D(device, matrixA_NumTermsRnd, matrixB_NumTermsRnd);
-      } else {
-         range = Range.create2D(matrixA_NumTermsRnd, matrixB_NumTermsRnd);
-      }
-
-      if (LOG.isDebugEnabled()) {
-         LOG.debug("Range: " + range);
-      }
-
-      kernel.put(subMatrixA);
-      kernel.put(subMatrixB);
-      kernel.put(subResultMatrix);
-
-      kernel.execute(range);
-
-      kernel.get(subResultMatrix);
-   }
-
-   /**
-    * Highly efficient means to compute whether a number is a power of 2<br>
-    * Based on code from http://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2
-    * <p>
-    * Another very cool way to do this is ((x&(-x))==x)
-    * 
-    * @param n
-    * @return boolean
-    */
-   private static boolean isPowerOfTwo(int n) {
-      return (n > 0) && ((n & (n - 1)) == 0);
-   }
-
-   /**
-    * Rounds a number to the multiple indicated
-    * 
-    * @param num
-    * @param multiple
-    * @return
-    */
-   private static int roundToMultiple(double num, int multiple) {
-      return (int) (Math.ceil(num / multiple) * multiple);
-   }
-
-   /**
-    * Very nice means to convert byte sizes into human readable format<br>
-    * Based on code from http://stackoverflow.com/questions/3758606/how-to-convert-byte-size-into-human-readable-format-in-java
-    * <p>
-    * 
-    * @param bytes
-    * @param si
-    * @return humanReadableByteCount
-    */
-   private static String humanReadableByteCount(long bytes, boolean si) {
-      final int unit = si ? 1000 : 1024;
-      if (bytes < unit) {
-         return bytes + " B";
-      }
-      final int exp = (int) (Math.log(bytes) / Math.log(unit));
-      final String pre = (si ? "kMGTPE" : "KMGTPE").charAt(exp - 1) + (si ? "" : "i");
-
-      return String.format("%.1f %sB", bytes / Math.pow(unit, exp), pre);
-   }
-}
+/**
+ * This material was prepared as an account of work sponsored by an agency of the United States Government.  
+ * Neither the United States Government nor the United States Department of Energy, nor Battelle, nor any of 
+ * their employees, nor any jurisdiction or organization that has cooperated in the development of these materials, 
+ * makes any warranty, express or implied, or assumes any legal liability or responsibility for the accuracy, 
+ * completeness, or usefulness or any information, apparatus, product, software, or process disclosed, or represents
+ * that its use would not infringe privately owned rights.
+ */
+package gov.pnnl.aparapi.matrix;
+
+import org.apache.log4j.Logger;
+
+import com.aparapi.Kernel;
+import com.aparapi.Kernel.EXECUTION_MODE;
+import com.aparapi.Range;
+import com.aparapi.device.Device;
+import com.aparapi.device.OpenCLDevice;
+
+/**
+ * GPU calculations using OpenBitSet Intersection for OpenBitSets
+ * 
+ * Based on code from: <br/>
+ * {@link http://grepcode.com/file/repo1.maven.org/maven2/org.apache.lucene/lucene-core/3.1.0/org/apache/lucene/util/BitUtil.java}
+ * 
+ * @author ryan.lamothe at gmail.com
+ * @author sedillard at gmail.com
+ */
+public class CorrMatrixHost {
+
+   private static final Logger LOG = Logger.getLogger(CorrMatrixHost.class);
+
+   /**
+    * Perform matrix intersection for two lists of Lucene OpenBitSet-based packed longs
+    * 
+    * @param matrixA
+    *    The first term-document matrix
+    * @param matrixB
+    *    The second term-document matrix
+    * @param Aparapi EXECUTION_MODE
+    * @return result Matrix
+    * @throws Exception
+    */
+   public static int[][] intersectionMatrix(final long[][] matrixA, final long[][] matrixB, final EXECUTION_MODE executionMode) throws Exception {
+
+      // Basic validation
+      if (matrixA == null) {
+         throw new NullPointerException("MatrixA cannot be NULL");
+      }
+
+      if (matrixB == null) {
+         throw new NullPointerException("MatrixB cannot be NULL");
+      }
+
+      // Size of an array is 8 bytes for the object + 4 bytes for the header and length information
+      final int arrayMemOverhead = 12;
+
+      // numDocs/64 since they are packed into longs
+      // We need to make our matrix sizes multiples of BLOCK_SIZE
+      final int matrixA_numTerms = matrixA.length;
+      final int matrixA_numLongs = matrixA[0].length;
+
+      if (LOG.isDebugEnabled()) {
+         LOG.debug("----------");
+         LOG.debug("MatrixA NumTerms (Rows): " + matrixA_numTerms);
+         LOG.debug("MatrixA NumLongs (Columns): " + matrixA_numLongs);
+         LOG.debug("MatrixA NumDocs: " + (matrixA_numLongs * 64L));
+      }
+
+      final long matrixA_BytesPerRow = matrixA_numLongs * 8L;
+      final long matrixA_TotalBytes = (matrixA_numTerms * matrixA_BytesPerRow) + arrayMemOverhead;
+
+      if (LOG.isDebugEnabled()) {
+         LOG.debug("MatrixA Total Memory Size: " + humanReadableByteCount(matrixA_TotalBytes, true));
+      }
+
+      final int matrixB_numTerms = matrixB.length;
+      final int matrixB_numLongs = matrixB[0].length;
+
+      if (LOG.isDebugEnabled()) {
+         LOG.debug("----------");
+         LOG.debug("MatrixB NumTerms (Rows): " + matrixB_numTerms);
+         LOG.debug("MatrixB NumLongs (Columns): " + matrixB_numLongs);
+         LOG.debug("MatrixB NumDocs: " + (matrixB_numLongs * 64L));
+      }
+
+      final long matrixB_BytesPerRow = matrixB_numLongs * 8L;
+      final long matrixB_TotalBytes = (matrixB_numTerms * matrixB_BytesPerRow) + arrayMemOverhead;
+
+      if (LOG.isDebugEnabled()) {
+         LOG.debug("MatrixB Total Memory Size: " + humanReadableByteCount(matrixB_TotalBytes, true));
+         LOG.debug("----------");
+      }
+
+      final int[][] resultMatrix = new int[matrixA_numTerms][matrixB_numTerms];
+
+      if (LOG.isDebugEnabled()) {
+         final long resultMatrix_TotalBytes = (matrixA_numTerms * matrixB_numTerms * 4L) + arrayMemOverhead;
+         LOG.debug("ResultMatrix Memory Size: " + humanReadableByteCount(resultMatrix_TotalBytes, true));
+         LOG.debug("Total Requested Memory Size: " + humanReadableByteCount(matrixA_TotalBytes + matrixB_TotalBytes + resultMatrix_TotalBytes, true));
+         LOG.debug("----------");
+      }
+
+      int NUM_SUB_ROWS = matrixA_numTerms; // Default number of sub-rows
+
+      OpenCLDevice device = null;
+
+      // We do not test for EXECUTION_MODE.JTP because JTP is non-OpenCL
+      if (executionMode.equals(EXECUTION_MODE.CPU)) {
+         device = (OpenCLDevice) Device.firstCPU();
+
+         if (device == null) {
+            LOG.warn("OpenCLDevice.CPU is NULL...OpenCL is unavailable. Setting to JTP mode.");
+            LOG.debug("----------");
+         }
+      } else if (executionMode.equals(EXECUTION_MODE.GPU)) {
+         device = (OpenCLDevice) Device.best();
+
+         if (device == null) {
+            LOG.warn("OpenCLDevice.GPU is NULL...OpenCL is unavailable. Setting to JTP mode.");
+            LOG.debug("----------");
+         }
+      }
+
+      // This is to create stripes of rows that will fit into OpenCL's available memory
+      // Calculate the number of sub-rows by calling OpenCL to find out available memory
+      // Length of row * 8 (size of long in bytes) * number of rows to available memory
+      final int maxNumTerms = Math.max(matrixA_numTerms, matrixB_numTerms);
+
+      if (device != null) {
+         final long globalMemSize = device.getGlobalMemSize();
+         // final long maxMemAllocSize = Math.max((globalMemSize/4), 128*1024*1024);
+         final long maxMemAllocSize = device.getMaxMemAllocSize();
+
+         // 1048576 bytes in a megabyte (1024*1024)
+         // Java long is 8 bytes
+         // 131072 longs in 1 megabyte
+         // SAFE OpenCL spec allocation is max(1/4 GlobalMemSize)
+         // ***During our testing this appears to be incorrectly/inconsistently reported depending on os/drivers/hardware***
+         if (LOG.isDebugEnabled()) {
+            LOG.debug("Available OpenCL globalMemSize: " + humanReadableByteCount(globalMemSize, true));
+            LOG.debug("Available OpenCL maxMemAllocSize: " + humanReadableByteCount(maxMemAllocSize, true));
+         }
+
+         // Maybe there is a more clever way to do this :)
+         // The idea here is to decide how many sub-rows of the matrix we can fit on a single card
+         // The long-term goal to divide up the work for both small RAM GPUs and multiple GPUs
+         int subRowsCounterA = 0;
+         int subRowsCounterB = 0;
+         long subRowsMemSizeA = 0L;
+         long subRowsMemSizeB = 0L;
+         long subResultMatrixMemSize = 0L;
+         long subTotalMemSize = 0L;
+
+         do {
+            if (subRowsCounterA < matrixA_numTerms) {
+               subRowsMemSizeA = subRowsCounterA != 0 ? (subRowsCounterA * matrixA_numLongs * 8L) + arrayMemOverhead : 0;
+               subRowsCounterA += 1;
+            } else if (subRowsCounterA == matrixA_numTerms) {
+               subRowsMemSizeA = subRowsCounterA != 0 ? (subRowsCounterA * matrixA_numLongs * 8L) + arrayMemOverhead : 0;
+            }
+
+            if (subRowsCounterB < matrixB_numTerms) {
+               subRowsMemSizeB = subRowsCounterB != 0 ? (subRowsCounterB * matrixB_numLongs * 8L) + arrayMemOverhead : 0;
+               subRowsCounterB += 1;
+            } else if (subRowsCounterB == matrixB_numTerms) {
+               subRowsMemSizeB = subRowsCounterB != 0 ? (subRowsCounterB * matrixB_numLongs * 8L) + arrayMemOverhead : 0;
+            }
+
+            // This is 4 bytes since the sub-result matrix is an int array
+            subResultMatrixMemSize = ((subRowsCounterA * subRowsCounterB) * 4L) + arrayMemOverhead;
+
+            subTotalMemSize = subRowsMemSizeA + subRowsMemSizeB + subResultMatrixMemSize;
+         } while ((Math.max(subRowsCounterA, subRowsCounterB) < maxNumTerms) && (subTotalMemSize <= maxMemAllocSize));
+
+         // If using OpenCL override the default number of subrows
+         NUM_SUB_ROWS = Math.max(subRowsCounterA, subRowsCounterB);
+
+         if (NUM_SUB_ROWS < maxNumTerms) {
+            final long subMatrixA_memSize = (NUM_SUB_ROWS * matrixA_numLongs * 8L) + arrayMemOverhead;
+            final long subMatrixB_memSize = (NUM_SUB_ROWS * matrixB_numLongs * 8L) + arrayMemOverhead;
+            final long subResultMatrix_memSize = (NUM_SUB_ROWS * NUM_SUB_ROWS * 4L) + arrayMemOverhead;
+
+            LOG.warn("****************************************************************");
+            LOG.warn("Requested matrix computation is larger than available OpenCL memory");
+            LOG.warn("Matrix striping is occurring to fit all data into OpenCL memory...");
+            LOG.warn("");
+            LOG.warn("Number rows requested: " + maxNumTerms);
+            LOG.warn("Number rows that fit: " + NUM_SUB_ROWS);
+            LOG.warn("");
+            LOG.warn("SubMatrixA Memory Size: " + humanReadableByteCount(subMatrixA_memSize, true));
+            LOG.warn("SubMatrixB Memory Size: " + humanReadableByteCount(subMatrixB_memSize, true));
+            LOG.warn("SubResultMatrix Memory Size: " + humanReadableByteCount(subResultMatrix_memSize, true));
+            LOG.warn("SubMatrix Total Memory Size: " + humanReadableByteCount(subMatrixA_memSize + subMatrixB_memSize + subResultMatrix_memSize, true));
+            LOG.warn("****************************************************************");
+         }
+      }
+
+      final int numSubBlocksA = ((matrixA_numTerms + NUM_SUB_ROWS) - 1) / NUM_SUB_ROWS;
+      final int numSubBlocksB = ((matrixB_numTerms + NUM_SUB_ROWS) - 1) / NUM_SUB_ROWS;
+
+      final long[] subMatrixA = new long[NUM_SUB_ROWS * matrixA_numLongs];
+      final long[] subMatrixB = new long[NUM_SUB_ROWS * matrixB_numLongs];
+      final int[] subResultMatrix = new int[NUM_SUB_ROWS * NUM_SUB_ROWS];
+
+      final CorrMatrixKernel kernel = new CorrMatrixKernel(subMatrixA, NUM_SUB_ROWS, subMatrixB, NUM_SUB_ROWS, matrixA_numLongs, subResultMatrix);
+      kernel.setExplicit(true);
+
+      // Here we define a fall-back strategy, since the user may have wanted to execute only a single execution mode
+      if (executionMode.equals(EXECUTION_MODE.GPU) && (device != null)) {
+         kernel.addExecutionModes(EXECUTION_MODE.GPU, EXECUTION_MODE.CPU, EXECUTION_MODE.JTP);
+         LOG.debug("Execution Fallback Strategy: GPU --> CPU --> JTP");
+      } else if (executionMode.equals(EXECUTION_MODE.CPU) && (device != null)) {
+         kernel.addExecutionModes(EXECUTION_MODE.CPU, EXECUTION_MODE.JTP);
+         LOG.debug("Execution Fallback Strategy: CPU --> JTP");
+      } else {
+         kernel.addExecutionModes(EXECUTION_MODE.JTP);
+         LOG.debug("Execution Strategy: JTP");
+      }
+
+      try {
+         for (int a = 0; a < numSubBlocksA; a++) {
+            for (int b = 0; b < numSubBlocksB; b++) {
+               final int aSubRowStart = a * NUM_SUB_ROWS;
+               final int aSubRowEnd = Math.min(matrixA_numTerms, aSubRowStart + NUM_SUB_ROWS);
+
+               for (int i = aSubRowStart; i < aSubRowEnd; i++) {
+                  if (matrixA_numLongs != matrixA[i].length) {
+                     throw new Exception("All rows in the matrix need be the same length");
+                  }
+
+                  System.arraycopy(matrixA[i], 0, subMatrixA, (i - aSubRowStart) * matrixA_numLongs, matrixA_numLongs);
+               }
+
+               final int bSubRowStart = b * NUM_SUB_ROWS;
+               final int bSubRowEnd = Math.min(matrixB_numTerms, bSubRowStart + NUM_SUB_ROWS);
+
+               for (int i = bSubRowStart; i < bSubRowEnd; i++) {
+                  if (matrixA_numLongs != matrixB[i].length) {
+                     throw new Exception("All rows in the matrix need be the same length");
+                  }
+
+                  System.arraycopy(matrixB[i], 0, subMatrixB, (i - bSubRowStart) * matrixB_numLongs, matrixB_numLongs);
+               }
+
+               // Since matrixA_NumLongs == matrixB_NumLongs we're only going to pass matrixA_NumLongs
+               executeKernel(device, subMatrixA, aSubRowEnd - aSubRowStart, subMatrixB, bSubRowEnd - bSubRowStart, matrixA_numLongs, subResultMatrix, kernel);
+
+               // Convert one dimensional array to two dimensional array in the expected output ordering
+               for (int i = 0; i < NUM_SUB_ROWS; i++) {
+                  if ((i + aSubRowStart) < aSubRowEnd) {
+                     System.arraycopy(subResultMatrix, i * NUM_SUB_ROWS, resultMatrix[i + aSubRowStart], bSubRowStart, bSubRowEnd - bSubRowStart);
+                  }
+               }
+            }
+         }
+      } finally {
+         if (LOG.isDebugEnabled()) {
+            LOG.debug("----------");
+            LOG.debug("Aparapi Gross Execution Time: " + kernel.getAccumulatedExecutionTime() + " ms <------ Aparapi");
+            LOG.debug("OpenCL Generation Time: " + kernel.getConversionTime() + " ms");
+            LOG.debug("Kernel Net Execution Time: " + (kernel.getAccumulatedExecutionTime() - kernel.getConversionTime()) + " ms");
+            LOG.debug("----------");
+         }
+
+         try {
+            kernel.dispose();
+         } catch (final UnsatisfiedLinkError e) {
+            LOG.error("Aparapi failed to dispose of the kernel", e);
+         }
+      }
+
+      return resultMatrix;
+   }
+
+   /**
+    * Execute the GPU kernel
+    * 
+    * @param subMatrixA
+    * @param matrixA_NumTerms
+    * @param subMatrixB
+    * @param matrixB_NumTerms
+    * @param numLongs
+    * @param subResultMatrix
+    * @param kernel
+    * 
+    * @return resultMatrix
+    */
+   private static void executeKernel(final Device device, final long[] subMatrixA, final int matrixA_NumTerms, final long[] subMatrixB, final int matrixB_NumTerms, final int numLongs, final int[] subResultMatrix, final Kernel kernel) {
+
+      // Power of Two for best performance
+      int matrixA_NumTermsRnd = matrixA_NumTerms;
+      while (!isPowerOfTwo(matrixA_NumTermsRnd)) {
+         matrixA_NumTermsRnd += 1;
+      }
+
+      int matrixB_NumTermsRnd = matrixB_NumTerms;
+      while (!isPowerOfTwo(matrixB_NumTermsRnd)) {
+         matrixB_NumTermsRnd += 1;
+      }
+
+      final Range range;
+      if (device != null) {
+         range = Range.create2D(device, matrixA_NumTermsRnd, matrixB_NumTermsRnd);
+      } else {
+         range = Range.create2D(matrixA_NumTermsRnd, matrixB_NumTermsRnd);
+      }
+
+      if (LOG.isDebugEnabled()) {
+         LOG.debug("Range: " + range);
+      }
+
+      kernel.put(subMatrixA);
+      kernel.put(subMatrixB);
+      kernel.put(subResultMatrix);
+
+      kernel.execute(range);
+
+      kernel.get(subResultMatrix);
+   }
+
+   /**
+    * Highly efficient means to compute whether a number is a power of 2<br>
+    * Based on code from http://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2
+    * <p>
+    * Another very cool way to do this is ((x&(-x))==x)
+    * 
+    * @param n
+    * @return boolean
+    */
+   private static boolean isPowerOfTwo(int n) {
+      return (n > 0) && ((n & (n - 1)) == 0);
+   }
+
+   /**
+    * Rounds a number to the multiple indicated
+    * 
+    * @param num
+    * @param multiple
+    * @return
+    */
+   private static int roundToMultiple(double num, int multiple) {
+      return (int) (Math.ceil(num / multiple) * multiple);
+   }
+
+   /**
+    * Very nice means to convert byte sizes into human readable format<br>
+    * Based on code from http://stackoverflow.com/questions/3758606/how-to-convert-byte-size-into-human-readable-format-in-java
+    * <p>
+    * 
+    * @param bytes
+    * @param si
+    * @return humanReadableByteCount
+    */
+   private static String humanReadableByteCount(long bytes, boolean si) {
+      final int unit = si ? 1000 : 1024;
+      if (bytes < unit) {
+         return bytes + " B";
+      }
+      final int exp = (int) (Math.log(bytes) / Math.log(unit));
+      final String pre = (si ? "kMGTPE" : "KMGTPE").charAt(exp - 1) + (si ? "" : "i");
+
+      return String.format("%.1f %sB", bytes / Math.pow(unit, exp), pre);
+   }
+}
diff --git a/examples/correlation-matrix/src/test/gov/pnnl/aparapi/test/CorrMatrixTest.java b/examples/correlation-matrix/src/test/gov/pnnl/aparapi/test/CorrMatrixTest.java
index df57dce1..d919641d 100644
--- a/examples/correlation-matrix/src/test/gov/pnnl/aparapi/test/CorrMatrixTest.java
+++ b/examples/correlation-matrix/src/test/gov/pnnl/aparapi/test/CorrMatrixTest.java
@@ -1,169 +1,169 @@
-/**
- * This material was prepared as an account of work sponsored by an agency of the United States Government.  
- * Neither the United States Government nor the United States Department of Energy, nor Battelle, nor any of 
- * their employees, nor any jurisdiction or organization that has cooperated in the development of these materials, 
- * makes any warranty, express or implied, or assumes any legal liability or responsibility for the accuracy, 
- * completeness, or usefulness or any information, apparatus, product, software, or process disclosed, or represents
- * that its use would not infringe privately owned rights.
- */
-package gov.pnnl.aparapi.test;
-
-import gov.pnnl.aparapi.matrix.CorrMatrixHost;
-
-import java.io.File;
-import java.io.PrintWriter;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Random;
-
-import org.apache.commons.lang3.tuple.ImmutablePair;
-import org.apache.commons.lang3.tuple.Pair;
-import org.apache.log4j.Logger;
-import org.apache.lucene.util.OpenBitSet;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-import com.aparapi.Kernel.EXECUTION_MODE;
-
-/**
- * This test class performs the following functions:
- * 
- * 1) Create a randomly populated set of matrices for correlation/co-occurrence computation
- * 2) Execute the CPU-based computation using Lucene OpenBitSets
- * 3) Execute the GPU-based computation using Aparapi CorrMatrix host and kernel
- * 4) Verify the results of OpenBitSet and CorrMatrix by comparing matrices to each other
- *  
- * @author ryan.lamothe at gmail.com
- *
- */
-public class CorrMatrixTest {
-
-   private static final Logger LOG = Logger.getLogger(CorrMatrixTest.class);
-
-   private final List<Pair<OpenBitSet, OpenBitSet>> obsPairs = new ArrayList<Pair<OpenBitSet, OpenBitSet>>();;
-
-   private final Random rand = new Random();
-
-   private int[][] obsResultMatrix;
-
-   /**
-    * NumTerms and NumLongs (documents) need to be adjusted manually right now to force 'striping' to occur (see Host code for details)
-    */
-   @Before
-   public void setup() throws Exception {
-      /*
-       * Populate test data
-       */
-      LOG.debug("----------");
-      LOG.debug("Populating test matrix data using settings from build.xml...");
-      LOG.debug("----------");
-
-      final int numTerms = Integer.getInteger("numRows", 300); // # Rows
-      // numLongs*64 for number of actual documents since these are 'packed' longs
-      final int numLongs = Integer.getInteger("numColumns", 10000); // # Columns
-
-      for (int i = 0; i < numTerms; ++i) {
-         final long[] bits = new long[numLongs];
-         for (int j = 0; j < numLongs; ++j) {
-            bits[j] = rand.nextLong();
-         }
-
-         obsPairs.add(i, new ImmutablePair<OpenBitSet, OpenBitSet>(new OpenBitSet(bits, numLongs), new OpenBitSet(bits, numLongs)));
-      }
-
-      /*
-       * OpenBitSet calculations
-       */
-      LOG.debug("Executing OpenBitSet intersectionCount");
-
-      final long startTime = System.currentTimeMillis();
-
-      obsResultMatrix = new int[obsPairs.size()][obsPairs.size()];
-
-      // This is an N^2 comparison loop
-      // FIXME This entire loop needs to be parallelized to show an apples-to-apples comparison to Aparapi
-      for (int i = 0; i < obsPairs.size(); i++) {
-         final Pair<OpenBitSet, OpenBitSet> docFreqVector1 = obsPairs.get(i);
-
-         for (int j = 0; j < obsPairs.size(); j++) {
-            final Pair<OpenBitSet, OpenBitSet> docFreqVector2 = obsPairs.get(j);
-
-            // # of matches in both sets of documents
-            final int result = (int) OpenBitSet.intersectionCount(docFreqVector1.getLeft(), docFreqVector2.getRight());
-            obsResultMatrix[i][j] = result;
-         }
-      }
-
-      final long endTime = System.currentTimeMillis() - startTime;
-
-      LOG.debug("OpenBitSet Gross Execution Time: " + endTime + " ms <------OpenBitSet");
-      LOG.debug("----------");
-   }
-
-   @Test
-   public void testCorrelationMatrix() throws Exception {
-      /*
-       * GPU calculations
-       */
-      LOG.debug("Executing Aparapi intersectionCount");
-
-      final long[][] matrixA = new long[obsPairs.size()][];
-      final long[][] matrixB = new long[obsPairs.size()][];
-
-      // Convert OpenBitSet pairs to long primitive arrays for use with Aparapi
-      // TODO It would be nice if we could find a way to put the obsPairs onto the GPU directly :)
-      for (int i = 0; i < obsPairs.size(); i++) {
-         final OpenBitSet obsA = obsPairs.get(i).getLeft();
-         final OpenBitSet obsB = obsPairs.get(i).getRight();
-
-         matrixA[i] = obsA.getBits();
-         matrixB[i] = obsB.getBits();
-      }
-
-      // The reason for setting this property is because the CorrMatrix host/kernel code
-      // came from a GUI where a user could select "Use Hardware Acceleration" instead
-      // of the application forcing the setting globally on the command-line
-      final int[][] gpuResultMatrix;
-      if (Boolean.getBoolean("useGPU")) {
-         gpuResultMatrix = CorrMatrixHost.intersectionMatrix(matrixA, matrixB, EXECUTION_MODE.GPU);
-      } else {
-         gpuResultMatrix = CorrMatrixHost.intersectionMatrix(matrixA, matrixB, EXECUTION_MODE.CPU);
-      }
-
-      // Compare the two result arrays to make sure we are generating the same output
-      for (int i = 0; i < obsResultMatrix.length; i++) {
-         Assert.assertTrue("Arrays are not equal", Arrays.equals(obsResultMatrix[i], gpuResultMatrix[i]));
-      }
-
-      // Visually compare/third-party tool compare if desired
-      if (LOG.isTraceEnabled()) {
-         // We're not using "try with resources" because Aparapi currently targets JDK 6
-         final PrintWriter cpuOut = new PrintWriter(new File(System.getProperty("user.dir"), "trace/cpuOut.txt"));
-         final PrintWriter gpuOut = new PrintWriter(new File(System.getProperty("user.dir"), "trace/gpuOut.txt"));
-
-         try {
-            for (int i = 0; i < obsResultMatrix.length; i++) {
-               if (LOG.isTraceEnabled()) {
-                  LOG.trace("obsResultMatrix length: " + obsResultMatrix.length);
-                  LOG.trace("gpuResultMatrix length: " + gpuResultMatrix.length);
-
-                  cpuOut.println(Arrays.toString(obsResultMatrix[i]));
-                  gpuOut.println(Arrays.toString(gpuResultMatrix[i]));
-               }
-            }
-         } finally {
-            if (cpuOut != null) {
-               cpuOut.flush();
-               cpuOut.close();
-            }
-
-            if (gpuOut != null) {
-               gpuOut.flush();
-               gpuOut.close();
-            }
-         }
-      }
-   }
-}
+/**
+ * This material was prepared as an account of work sponsored by an agency of the United States Government.  
+ * Neither the United States Government nor the United States Department of Energy, nor Battelle, nor any of 
+ * their employees, nor any jurisdiction or organization that has cooperated in the development of these materials, 
+ * makes any warranty, express or implied, or assumes any legal liability or responsibility for the accuracy, 
+ * completeness, or usefulness or any information, apparatus, product, software, or process disclosed, or represents
+ * that its use would not infringe privately owned rights.
+ */
+package gov.pnnl.aparapi.test;
+
+import gov.pnnl.aparapi.matrix.CorrMatrixHost;
+
+import java.io.File;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.commons.lang3.tuple.ImmutablePair;
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.log4j.Logger;
+import org.apache.lucene.util.OpenBitSet;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import com.aparapi.Kernel.EXECUTION_MODE;
+
+/**
+ * This test class performs the following functions:
+ * 
+ * 1) Create a randomly populated set of matrices for correlation/co-occurrence computation
+ * 2) Execute the CPU-based computation using Lucene OpenBitSets
+ * 3) Execute the Aparapi computation using the CorrMatrix host and kernel (GPU mode when the useGPU system property is true, CPU mode otherwise)
+ * 4) Verify the results of OpenBitSet and CorrMatrix by comparing matrices to each other
+ *  
+ * @author ryan.lamothe at gmail.com
+ *
+ */
+public class CorrMatrixTest {
+
+   private static final Logger LOG = Logger.getLogger(CorrMatrixTest.class);
+   // One pair per term (row); setup() builds both sides of a pair from the same random bits array
+   private final List<Pair<OpenBitSet, OpenBitSet>> obsPairs = new ArrayList<Pair<OpenBitSet, OpenBitSet>>();;
+
+   private final Random rand = new Random();
+   // Reference result filled in by setup() using OpenBitSet.intersectionCount
+   private int[][] obsResultMatrix;
+
+   /**
+    * Builds the random test data and the OpenBitSet reference matrix. Row and column counts are read from the numRows and numColumns system properties (defaults 300 and 10000) and need to be adjusted manually right now to force 'striping' to occur (see Host code for details)
+    */
+   @Before
+   public void setup() throws Exception {
+      /*
+       * Populate test data
+       */
+      LOG.debug("----------");
+      LOG.debug("Populating test matrix data using settings from build.xml...");
+      LOG.debug("----------");
+
+      final int numTerms = Integer.getInteger("numRows", 300); // # Rows
+      // numLongs*64 for number of actual documents since these are 'packed' longs
+      final int numLongs = Integer.getInteger("numColumns", 10000); // # Columns
+
+      for (int i = 0; i < numTerms; ++i) {
+         final long[] bits = new long[numLongs];
+         for (int j = 0; j < numLongs; ++j) {
+            bits[j] = rand.nextLong();
+         }
+
+         obsPairs.add(i, new ImmutablePair<OpenBitSet, OpenBitSet>(new OpenBitSet(bits, numLongs), new OpenBitSet(bits, numLongs)));
+      }
+
+      /*
+       * OpenBitSet calculations
+       */
+      LOG.debug("Executing OpenBitSet intersectionCount");
+
+      final long startTime = System.currentTimeMillis();
+
+      obsResultMatrix = new int[obsPairs.size()][obsPairs.size()];
+
+      // This is an N^2 comparison loop
+      // FIXME This entire loop needs to be parallelized to show an apples-to-apples comparison to Aparapi
+      for (int i = 0; i < obsPairs.size(); i++) {
+         final Pair<OpenBitSet, OpenBitSet> docFreqVector1 = obsPairs.get(i);
+
+         for (int j = 0; j < obsPairs.size(); j++) {
+            final Pair<OpenBitSet, OpenBitSet> docFreqVector2 = obsPairs.get(j);
+
+            // # of matches in both sets of documents
+            final int result = (int) OpenBitSet.intersectionCount(docFreqVector1.getLeft(), docFreqVector2.getRight());
+            obsResultMatrix[i][j] = result;
+         }
+      }
+
+      final long endTime = System.currentTimeMillis() - startTime;
+
+      LOG.debug("OpenBitSet Gross Execution Time: " + endTime + " ms <------OpenBitSet");
+      LOG.debug("----------");
+   }
+   // Runs CorrMatrixHost.intersectionMatrix on the same data and checks each row against obsResultMatrix
+   @Test
+   public void testCorrelationMatrix() throws Exception {
+      /*
+       * GPU calculations
+       */
+      LOG.debug("Executing Aparapi intersectionCount");
+
+      final long[][] matrixA = new long[obsPairs.size()][];
+      final long[][] matrixB = new long[obsPairs.size()][];
+
+      // Convert OpenBitSet pairs to long primitive arrays for use with Aparapi
+      // TODO It would be nice if we could find a way to put the obsPairs onto the GPU directly :)
+      for (int i = 0; i < obsPairs.size(); i++) {
+         final OpenBitSet obsA = obsPairs.get(i).getLeft();
+         final OpenBitSet obsB = obsPairs.get(i).getRight();
+
+         matrixA[i] = obsA.getBits();
+         matrixB[i] = obsB.getBits();
+      }
+
+      // The reason for setting this property is because the CorrMatrix host/kernel code
+      // came from a GUI where a user could select "Use Hardware Acceleration" instead
+      // of the application forcing the setting globally on the command-line
+      final int[][] gpuResultMatrix;
+      if (Boolean.getBoolean("useGPU")) {
+         gpuResultMatrix = CorrMatrixHost.intersectionMatrix(matrixA, matrixB, EXECUTION_MODE.GPU);
+      } else {
+         gpuResultMatrix = CorrMatrixHost.intersectionMatrix(matrixA, matrixB, EXECUTION_MODE.CPU);
+      }
+
+      // Compare the two result arrays to make sure we are generating the same output
+      for (int i = 0; i < obsResultMatrix.length; i++) {
+         Assert.assertTrue("Arrays are not equal", Arrays.equals(obsResultMatrix[i], gpuResultMatrix[i]));
+      }
+
+      // Visually compare/third-party tool compare if desired
+      if (LOG.isTraceEnabled()) {
+         // We're not using "try with resources" because Aparapi currently targets JDK 6. NOTE(review): both writers are opened before the try, so cpuOut is not closed if opening gpuOut throws
+         final PrintWriter cpuOut = new PrintWriter(new File(System.getProperty("user.dir"), "trace/cpuOut.txt"));
+         final PrintWriter gpuOut = new PrintWriter(new File(System.getProperty("user.dir"), "trace/gpuOut.txt"));
+
+         try {
+            for (int i = 0; i < obsResultMatrix.length; i++) {
+               if (LOG.isTraceEnabled()) {
+                  LOG.trace("obsResultMatrix length: " + obsResultMatrix.length);
+                  LOG.trace("gpuResultMatrix length: " + gpuResultMatrix.length);
+
+                  cpuOut.println(Arrays.toString(obsResultMatrix[i]));
+                  gpuOut.println(Arrays.toString(gpuResultMatrix[i]));
+               }
+            }
+         } finally {
+            if (cpuOut != null) {
+               cpuOut.flush();
+               cpuOut.close();
+            }
+
+            if (gpuOut != null) {
+               gpuOut.flush();
+               gpuOut.close();
+            }
+         }
+      }
+   }
+}
diff --git a/examples/movie/.project b/examples/movie/.project
deleted file mode 100644
index 6be55d8c..00000000
--- a/examples/movie/.project
+++ /dev/null
@@ -1,17 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<projectDescription>
-	<name>movie</name>
-	<comment></comment>
-	<projects>
-	</projects>
-	<buildSpec>
-		<buildCommand>
-			<name>org.eclipse.jdt.core.javabuilder</name>
-			<arguments>
-			</arguments>
-		</buildCommand>
-	</buildSpec>
-	<natures>
-		<nature>org.eclipse.jdt.core.javanature</nature>
-	</natures>
-</projectDescription>
diff --git a/examples/movie/build.xml b/examples/movie/build.xml
index bbfd7ae5..adea1f80 100644
--- a/examples/movie/build.xml
+++ b/examples/movie/build.xml
@@ -1,127 +1,127 @@
-<?xml version="1.0"?>
-
-<project name="movie" default="build" basedir=".">
-
-   <target name="getjjmpeg-windows-i586" if="use.win32.jjmpeg">
-      <delete dir="jjmpeg"/>
-      <mkdir dir="jjmpeg"/>
-      <get dest="jjmpeg">
-         <url url="http://jjmpeg.googlecode.com/files/jjmpeg-0.0-bin.tar.bz2"/> 
-      </get>
-      <untar src="jjmpeg/jjmpeg-0.0-bin.tar.bz2" compression="bzip2" dest="jjmpeg"/>
-   </target>
-
-   <target name="getffmpeg-windows-i586" if="use.win32.ffmpeg">
-      <delete dir="ffmpeg"/>
-      <mkdir dir="ffmpeg"/>
-      <get dest="ffmpeg">
-         <url url="http://ffmpeg.zeranoe.com/builds/win32/shared/ffmpeg-git-9c2651a-win32-shared.7z"/> 
-      </get>
-      <exec dir="ffmpeg" executable="C:\Program Files\7-Zip\7z.exe">
-         <arg value="x"/>
-         <arg value="ffmpeg-git-9c2651a-win32-shared.7z"/>
-      </exec>
-   </target>
- <target name="getjjmpeg-windows-amd64" if="use.win64.jjmpeg">
-      <delete dir="jjmpeg"/>
-      <mkdir dir="jjmpeg"/>
-      <get dest="jjmpeg">
-         <url url="http://jjmpeg.googlecode.com/files/jjmpeg-0.0-bin.tar.bz2"/> 
-      </get>
-      <untar src="jjmpeg/jjmpeg-0.0-bin.tar.bz2" compression="bzip2" dest="jjmpeg"/>
-   </target>
-    <target name="getffmpeg-windows-amd64" if="use.win64.ffmpeg">
-      <delete dir="ffmpeg"/>
-      <mkdir dir="ffmpeg"/>
-      <get dest="ffmpeg">
-         <url url="http://ffmpeg.zeranoe.com/builds/win64/shared/ffmpeg-git-9c2651a-win64-shared.7z"/> 
-      </get>
-      <exec dir="ffmpeg" executable="C:\Program Files\7-Zip\7z.exe">
-         <arg value="x"/>
-         <arg value="ffmpeg-git-9c2651a-win64-shared.7z"/>
-      </exec>
-   </target>
-
-   <target name="checkos">
-      <condition property="use.win32.jjmpeg">
-         <and>
-            <os family="windows" />
-            <or>
-               <os arch="x86" />
-               <os arch="i386" />
-            </or>
-            <not><available file="jjmpeg/jjmpeg-0.0/native/mswin-i386/jjmpeg.dll"/> </not>
-            <not><available file="jjmpeg/jjmpeg-0.0/dist/jjmpeg.jar"/> </not>
-         </and>
-      </condition>
-      <condition property="use.win32.ffmpeg">
-         <and>
-            <os family="windows" />
-            <or>
-               <os arch="x86" />
-               <os arch="i386" />
-            </or>
-            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\avcodec-53.dll"/> </not>
-            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\avdevice-53.dll"/> </not>
-            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\avfilter-53.dll"/> </not>
-            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\avformat-53.dll"/> </not>
-            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\avutil-51.dll"/> </not>
-            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\postproc-51.dll"/> </not>
-            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\swscale-2.dll"/> </not>
-         </and>
-      </condition>
-      <condition property="use.win64.jjmpeg">
-         <and>
-            <os family="windows" />
-            <not>
-               <or>
-                  <os arch="x86" />
-                  <os arch="i386" />
-               </or>
-            </not>
-            <not><available file="jjmpeg/jjmpeg-0.0/native/mswin-amd64/jjmpeg.dll"/> </not>
-            <not><available file="jjmpeg/jjmpeg-0.0/dist/jjmpeg.jar"/> </not>
-         </and>
-      </condition>
-
-       <condition property="use.win64.ffmpeg">
-         <and>
-            <os family="windows" />
-            <not>
-               <or>
-                  <os arch="x86" />
-                  <os arch="i386" />
-               </or>
-            </not>
-            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\avcodec-53.dll"/> </not>
-            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\avdevice-53.dll"/> </not>
-            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\avfilter-53.dll"/> </not>
-            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\avformat-53.dll"/> </not>
-            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\avutil-51.dll"/> </not>
-            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\postproc-51.dll"/> </not>
-            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\swscale-2.dll"/> </not>
-         </and>
-      </condition>
-   </target>
-
-   <target name="getstuff" depends="checkos, getjjmpeg-windows-i586, getffmpeg-windows-i586, getjjmpeg-windows-amd64, getffmpeg-windows-amd64"/>
-
-   <target name="build" depends="getstuff,clean">
-      <mkdir dir="classes"/>
-      <javac srcdir="src" destdir="classes" debug="on" includeantruntime="false" >
-         <classpath>
-            <pathelement path="..\..\com.aparapi\dist\aparapi.jar"/>
-            <pathelement path="jjmpeg\jjmpeg-0.0\dist\jjmpeg.jar/"/>
-            <pathelement path="..\jviolajones\jviolajones.jar/"/>
-         </classpath>
-      </javac>
-      <jar jarfile="${ant.project.name}.jar" basedir="classes"/>
-   </target>
-
-   <target name="clean">
-      <delete dir="classes"/>
-      <delete file="${ant.project.name}.jar"/>
-   </target>
-
-
-</project>
+<?xml version="1.0"?>
+
+<project name="movie" default="build" basedir=".">
+   <!-- Builds movie.jar; on Windows the build first downloads the jjmpeg and ffmpeg binaries that checkos flags as missing -->
+   <target name="getjjmpeg-windows-i586" if="use.win32.jjmpeg">
+      <delete dir="jjmpeg"/>
+      <mkdir dir="jjmpeg"/>
+      <get dest="jjmpeg">
+         <url url="http://jjmpeg.googlecode.com/files/jjmpeg-0.0-bin.tar.bz2"/> 
+      </get>
+      <untar src="jjmpeg/jjmpeg-0.0-bin.tar.bz2" compression="bzip2" dest="jjmpeg"/>
+   </target>
+   <!-- The ffmpeg targets unpack a .7z archive and rely on 7-Zip being installed at the hard-coded path used in exec -->
+   <target name="getffmpeg-windows-i586" if="use.win32.ffmpeg">
+      <delete dir="ffmpeg"/>
+      <mkdir dir="ffmpeg"/>
+      <get dest="ffmpeg">
+         <url url="http://ffmpeg.zeranoe.com/builds/win32/shared/ffmpeg-git-9c2651a-win32-shared.7z"/> 
+      </get>
+      <exec dir="ffmpeg" executable="C:\Program Files\7-Zip\7z.exe">
+         <arg value="x"/>
+         <arg value="ffmpeg-git-9c2651a-win32-shared.7z"/>
+      </exec>
+   </target>
+ <target name="getjjmpeg-windows-amd64" if="use.win64.jjmpeg">
+      <delete dir="jjmpeg"/>
+      <mkdir dir="jjmpeg"/>
+      <get dest="jjmpeg">
+         <url url="http://jjmpeg.googlecode.com/files/jjmpeg-0.0-bin.tar.bz2"/> 
+      </get>
+      <untar src="jjmpeg/jjmpeg-0.0-bin.tar.bz2" compression="bzip2" dest="jjmpeg"/>
+   </target>
+    <target name="getffmpeg-windows-amd64" if="use.win64.ffmpeg">
+      <delete dir="ffmpeg"/>
+      <mkdir dir="ffmpeg"/>
+      <get dest="ffmpeg">
+         <url url="http://ffmpeg.zeranoe.com/builds/win64/shared/ffmpeg-git-9c2651a-win64-shared.7z"/> 
+      </get>
+      <exec dir="ffmpeg" executable="C:\Program Files\7-Zip\7z.exe">
+         <arg value="x"/>
+         <arg value="ffmpeg-git-9c2651a-win64-shared.7z"/>
+      </exec>
+   </target>
+   <!-- Sets the use.* properties that gate the download targets: Windows only, x86/i386 versus any other arch, and only when none of the listed files is present -->
+   <target name="checkos">
+      <condition property="use.win32.jjmpeg">
+         <and>
+            <os family="windows" />
+            <or>
+               <os arch="x86" />
+               <os arch="i386" />
+            </or>
+            <not><available file="jjmpeg/jjmpeg-0.0/native/mswin-i386/jjmpeg.dll"/> </not>
+            <not><available file="jjmpeg/jjmpeg-0.0/dist/jjmpeg.jar"/> </not>
+         </and>
+      </condition>
+      <condition property="use.win32.ffmpeg">
+         <and>
+            <os family="windows" />
+            <or>
+               <os arch="x86" />
+               <os arch="i386" />
+            </or>
+            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\avcodec-53.dll"/> </not>
+            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\avdevice-53.dll"/> </not>
+            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\avfilter-53.dll"/> </not>
+            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\avformat-53.dll"/> </not>
+            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\avutil-51.dll"/> </not>
+            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\postproc-51.dll"/> </not>
+            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin\swscale-2.dll"/> </not>
+         </and>
+      </condition>
+      <condition property="use.win64.jjmpeg">
+         <and>
+            <os family="windows" />
+            <not>
+               <or>
+                  <os arch="x86" />
+                  <os arch="i386" />
+               </or>
+            </not>
+            <not><available file="jjmpeg/jjmpeg-0.0/native/mswin-amd64/jjmpeg.dll"/> </not>
+            <not><available file="jjmpeg/jjmpeg-0.0/dist/jjmpeg.jar"/> </not>
+         </and>
+      </condition>
+
+       <condition property="use.win64.ffmpeg">
+         <and>
+            <os family="windows" />
+            <not>
+               <or>
+                  <os arch="x86" />
+                  <os arch="i386" />
+               </or>
+            </not>
+            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\avcodec-53.dll"/> </not>
+            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\avdevice-53.dll"/> </not>
+            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\avfilter-53.dll"/> </not>
+            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\avformat-53.dll"/> </not>
+            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\avutil-51.dll"/> </not>
+            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\postproc-51.dll"/> </not>
+            <not><available file="ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin\swscale-2.dll"/> </not>
+         </and>
+      </condition>
+   </target>
+   <!-- checkos is listed first so the use.* properties are set before the conditional download targets run -->
+   <target name="getstuff" depends="checkos, getjjmpeg-windows-i586, getffmpeg-windows-i586, getjjmpeg-windows-amd64, getffmpeg-windows-amd64"/>
+   <!-- clean is a dependency of build, so classes/ and the jar are removed and rebuilt on every build -->
+   <target name="build" depends="getstuff,clean">
+      <mkdir dir="classes"/>
+      <javac srcdir="src" destdir="classes" debug="on" includeantruntime="false" >
+         <classpath>
+            <pathelement path="..\..\com.aparapi\dist\aparapi.jar"/>
+            <pathelement path="jjmpeg\jjmpeg-0.0\dist\jjmpeg.jar/"/>
+            <pathelement path="..\jviolajones\jviolajones.jar/"/>
+         </classpath>
+      </javac>
+      <jar jarfile="${ant.project.name}.jar" basedir="classes"/>
+   </target>
+
+   <target name="clean">
+      <delete dir="classes"/>
+      <delete file="${ant.project.name}.jar"/>
+   </target>
+
+
+</project>
diff --git a/examples/movie/movie.bat b/examples/movie/movie.bat
index 9273c3c2..5f07c629 100644
--- a/examples/movie/movie.bat
+++ b/examples/movie/movie.bat
@@ -1,15 +1,15 @@
-SETLOCAL 
-if /I %PROCESSOR_ARCHITECTURE%==x86 goto win32
-echo "win64!"
-set PATH=%PATH%;ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin
-set PATH=%PATH%;jjmpeg\jjmpeg-0.0\native\mswin-amd64
-goto win64
-:win32
-echo "win32!"
-set PATH=%PATH%;ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin
-set PATH=%PATH%;jjmpeg\jjmpeg-0.0\native\mswin-i386
-:win64
-set PATH=%PATH%;..\..\com.aparapi.jni\dist
-java -classpath jjmpeg\jjmpeg-0.0\dist\jjmpeg.jar;..\..\com.aparapi\dist\aparapi.jar;movie.jar; com.aparapi.examples.movie.%1 %2
-
-ENDLOCAL
+SETLOCAL 
+if /I %PROCESSOR_ARCHITECTURE%==x86 goto win32
+echo "win64!"
+set PATH=%PATH%;ffmpeg\ffmpeg-git-9c2651a-win64-shared\bin
+set PATH=%PATH%;jjmpeg\jjmpeg-0.0\native\mswin-amd64
+goto win64
+:win32
+echo "win32!"
+set PATH=%PATH%;ffmpeg\ffmpeg-git-9c2651a-win32-shared\bin
+set PATH=%PATH%;jjmpeg\jjmpeg-0.0\native\mswin-i386
+:win64
+set PATH=%PATH%;..\..\com.aparapi.jni\dist
+java -classpath jjmpeg\jjmpeg-0.0\dist\jjmpeg.jar;..\..\com.aparapi\dist\aparapi.jar;movie.jar; com.aparapi.examples.movie.%1 %2
+:: Usage note: %1 is the class name inside com.aparapi.examples.movie to launch and %2 is the single argument passed to it; the :win64 label is the shared tail reached by both branches
+ENDLOCAL
diff --git a/examples/movie/src/com/amd/aparapi/examples/movie/AparapiSolution.java b/examples/movie/src/com/amd/aparapi/examples/movie/AparapiSolution.java
index 29fdc946..aed1d6f4 100644
--- a/examples/movie/src/com/amd/aparapi/examples/movie/AparapiSolution.java
+++ b/examples/movie/src/com/amd/aparapi/examples/movie/AparapiSolution.java
@@ -1,130 +1,130 @@
-/*
-Copyright (c) 2010-2011, Advanced Micro Devices, Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
-following conditions are met:
-
-Redistributions of source code must retain the above copyright notice, this list of conditions and the following
-disclaimer. 
-
-Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
-disclaimer in the documentation and/or other materials provided with the distribution. 
-
-Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission. 
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
-INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
-WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export
-laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through
-774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000.  Further, pursuant to Section 740.6 of the EAR,
-you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of 
-Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration 
-Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1,
-E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups
-D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject
-to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774
-of EAR).  For the most current Country Group listings, or for additional information about the EAR or your obligations
-under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. 
-
-*/
-package com.aparapi.examples.movie;
-
-import java.awt.Graphics2D;
-import java.awt.image.BufferedImage;
-import java.awt.image.DataBufferByte;
-
-import com.aparapi.Kernel;
-import com.aparapi.Range;
-
-public class AparapiSolution{
-
-   public static class AparapiConvolution extends Kernel{
-
-      private byte[] inputData;
-
-      private byte[] outputData;
-
-      private int width;
-
-      private int height;
-
-      private Range range;
-
-      float[] convMatrix3x3;
-
-      public AparapiConvolution(BufferedImage _imageIn, BufferedImage _imageOut) {
-         inputData = ((DataBufferByte) _imageIn.getRaster().getDataBuffer()).getData();
-         outputData = ((DataBufferByte) _imageOut.getRaster().getDataBuffer()).getData();
-         width = _imageIn.getWidth();
-         height = _imageIn.getHeight();
-         range = Range.create2D(width * 3, height);
-         setExplicit(true);
-
-      }
-
-      public void processPixel(int x, int y, int w, int h) {
-         float accum = 0;
-         int count = 0;
-         for (int dx = -3; dx < 6; dx += 3) {
-            for (int dy = -1; dy < 2; dy += 1) {
-               int rgb = 0xff & inputData[((y + dy) * w) + (x + dx)];
-               accum += rgb * convMatrix3x3[count++];
-            }
-         }
-         outputData[y * w + x] = (byte) Math.max(0, Math.min((int) accum, 255));
-      }
-
-      public void run() {
-         int x = getGlobalId(0);
-         int y = getGlobalId(1);
-         int w = getGlobalSize(0);
-         int h = getGlobalSize(1);
-         if (x > 3 && x < (w - 3) && y > 1 && y < (h - 1)) {
-            processPixel(x, y, w, h);
-         } else {
-            outputData[y * w + x] = inputData[(y * w) + x];
-         }
-      }
-
-      public void apply(float[] _convMatrix3x3) {
-         convMatrix3x3 = _convMatrix3x3;
-         put(convMatrix3x3).put(inputData).execute(range).get(outputData);
-      }
-
-   }
-
-   public static void main(final String[] _args) {
-      String fileName = _args.length == 1 ? _args[0] : "Leo720p.wmv";
-
-      float[] convMatrix3x3 = new float[] {
-            0f,
-            -10f,
-            0f,
-            -10f,
-            41f,
-            -10f,
-            0f,
-            -10f,
-            0f
-      };
-      new JJMPEGPlayer("Aparapi - Solution", fileName, convMatrix3x3){
-         AparapiConvolution kernel = null;
-
-         @Override protected void processFrame(Graphics2D gc, float[] _convMatrix3x3, BufferedImage in, BufferedImage out) {
-            if (kernel == null) {
-               kernel = new AparapiConvolution(in, out);
-            }
-            kernel.apply(_convMatrix3x3);
-         }
-      };
-
-   }
-}
+/*
+Copyright (c) 2010-2011, Advanced Micro Devices, Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
+following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list of conditions and the following
+disclaimer. 
+
+Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
+disclaimer in the documentation and/or other materials provided with the distribution. 
+
+Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export
+laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through
+774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000.  Further, pursuant to Section 740.6 of the EAR,
+you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of 
+Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration 
+Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1,
+E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups
+D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject
+to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774
+of EAR).  For the most current Country Group listings, or for additional information about the EAR or your obligations
+under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. 
+
+*/
+package com.aparapi.examples.movie;
+
+import java.awt.Graphics2D;
+import java.awt.image.BufferedImage;
+import java.awt.image.DataBufferByte;
+
+import com.aparapi.Kernel;
+import com.aparapi.Range;
+
+/** Movie example that filters every decoded frame with a 3x3 convolution run as an Aparapi kernel. */
+public class AparapiSolution{
+   /**
+    * Kernel with one work item per byte of the frame: the range is (width * 3) x height and
+    * horizontal neighbours are read 3 bytes apart, which assumes 3 interleaved bytes per pixel.
+    */
+   public static class AparapiConvolution extends Kernel{
+      /** Backing bytes of the input image. */
+      private byte[] inputData;
+      /** Backing bytes of the output image. */
+      private byte[] outputData;
+      private int width;
+      private int height;
+      private Range range;
+      /** The nine weights read by processPixel; replaced on every call to apply. */
+      float[] convMatrix3x3;
+
+      /** Captures the byte arrays behind both images, which must use a DataBufferByte. */
+      public AparapiConvolution(BufferedImage _imageIn, BufferedImage _imageOut) {
+         inputData = ((DataBufferByte) _imageIn.getRaster().getDataBuffer()).getData();
+         outputData = ((DataBufferByte) _imageOut.getRaster().getDataBuffer()).getData();
+         width = _imageIn.getWidth();
+         height = _imageIn.getHeight();
+         range = Range.create2D(width * 3, height);
+         setExplicit(true);
+      }
+
+      /**
+       * Stores at byte (x, y) the weighted sum of its 3x3 neighbourhood, clamped to 0..255.
+       * NOTE(review): weights are consumed column by column (dx outer, dy inner), so a matrix
+       * laid out row by row is applied transposed; this is invisible for symmetric matrices.
+       */
+      public void processPixel(int x, int y, int w, int h) {
+         float accum = 0;
+         int count = 0;
+         for (int dx = -3; dx < 6; dx += 3) {
+            for (int dy = -1; dy < 2; dy += 1) {
+               int rgb = 0xff & inputData[((y + dy) * w) + (x + dx)];
+               accum += rgb * convMatrix3x3[count++];
+            }
+         }
+         outputData[y * w + x] = (byte) Math.max(0, Math.min((int) accum, 255));
+      }
+
+      /** Convolves every byte that has a complete neighbourhood and copies the border bytes. */
+      public void run() {
+         int x = getGlobalId(0);
+         int y = getGlobalId(1);
+         int w = getGlobalSize(0);
+         int h = getGlobalSize(1);
+         // Inclusive lower bounds: x - 3 and y - 1 are valid indices from x == 3 and y == 1 on.
+         if (x >= 3 && x < (w - 3) && y >= 1 && y < (h - 1)) {
+            processPixel(x, y, w, h);
+         } else {
+            outputData[y * w + x] = inputData[(y * w) + x];
+         }
+      }
+
+      /** Uploads the weights and the input frame, runs the kernel and reads the output back. */
+      public void apply(float[] _convMatrix3x3) {
+         convMatrix3x3 = _convMatrix3x3;
+         put(convMatrix3x3).put(inputData).execute(range).get(outputData);
+      }
+   }
+
+   /** Opens the player on the file given as the only argument, otherwise on "Leo720p.wmv". */
+   public static void main(final String[] _args) {
+      String fileName = _args.length == 1 ? _args[0] : "Leo720p.wmv";
+      float[] convMatrix3x3 = new float[] {
+            0f, -10f, 0f,
+            -10f, 41f, -10f,
+            0f, -10f, 0f
+      };
+      new JJMPEGPlayer("Aparapi - Solution", fileName, convMatrix3x3){
+         AparapiConvolution kernel = null;
+         @Override protected void processFrame(Graphics2D gc, float[] _convMatrix3x3, BufferedImage in, BufferedImage out) {
+            if (kernel == null) {
+               kernel = new AparapiConvolution(in, out);
+            }
+            kernel.apply(_convMatrix3x3);
+         }
+      };
+   }
+}
diff --git a/examples/movie/src/com/amd/aparapi/examples/movie/ConvMatrix3x3Editor.java b/examples/movie/src/com/amd/aparapi/examples/movie/ConvMatrix3x3Editor.java
index d3140806..2d3c2c28 100644
--- a/examples/movie/src/com/amd/aparapi/examples/movie/ConvMatrix3x3Editor.java
+++ b/examples/movie/src/com/amd/aparapi/examples/movie/ConvMatrix3x3Editor.java
@@ -1,151 +1,151 @@
-/*
-Copyright (c) 2010-2011, Advanced Micro Devices, Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
-following conditions are met:
-
-Redistributions of source code must retain the above copyright notice, this list of conditions and the following
-disclaimer. 
-
-Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
-disclaimer in the documentation and/or other materials provided with the distribution. 
-
-Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission. 
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
-INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
-WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export
-laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through
-774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000.  Further, pursuant to Section 740.6 of the EAR,
-you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of 
-Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration 
-Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1,
-E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups
-D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject
-to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774
-of EAR).  For the most current Country Group listings, or for additional information about the EAR or your obligations
-under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. 
-
-*/
-
-package com.aparapi.examples.movie;
-
-import java.awt.BorderLayout;
-import java.awt.Component;
-import java.awt.GridLayout;
-import java.awt.event.ActionEvent;
-import java.awt.event.ActionListener;
-import java.util.Arrays;
-
-import javax.swing.BoxLayout;
-import javax.swing.JComboBox;
-import javax.swing.JPanel;
-import javax.swing.JSpinner;
-import javax.swing.SpinnerModel;
-import javax.swing.SpinnerNumberModel;
-import javax.swing.event.ChangeEvent;
-import javax.swing.event.ChangeListener;
-
-public class ConvMatrix3x3Editor{
-   Component component;
-
-   float[] default3x3;
-
-   float[] none3x3 = new float[] {
-         0,
-         0,
-         0,
-         0,
-         1,
-         0,
-         0,
-         0,
-         0
-   };
-
-   float[] blur3x3 = new float[] {
-         .1f,
-         .1f,
-         .1f,
-         .1f,
-         .1f,
-         .1f,
-         .1f,
-         .1f,
-         .1f
-   };
-
-   JSpinner[] spinners = new JSpinner[9];
-
-   protected void updated(float[] _convMatrix3x3) {
-
-   };
-
-   void set(float[] _to, float[] _from) {
-      for (int i = 0; i < 9; i++) {
-         _to[i] = _from[i];
-         spinners[i].setValue((Double) (double) _to[i]);
-
-      }
-      updated(_to);
-   }
-
-   ConvMatrix3x3Editor(final float[] _convMatrix3x3) {
-      default3x3 = Arrays.copyOf(_convMatrix3x3, _convMatrix3x3.length);
-      JPanel leftPanel = new JPanel();
-      JPanel controlPanel = new JPanel();
-      BoxLayout layout = new BoxLayout(controlPanel, BoxLayout.Y_AXIS);
-      controlPanel.setLayout(layout);
-      component = leftPanel;
-      JPanel grid3x3Panel = new JPanel();
-      controlPanel.add(grid3x3Panel);
-      grid3x3Panel.setLayout(new GridLayout(3, 3));
-      for (int i = 0; i < 9; i++) {
-         final int index = i;
-         SpinnerModel model = new SpinnerNumberModel(_convMatrix3x3[index], -50f, 50f, 1f);
-         JSpinner spinner = new JSpinner(model);
-         spinners[i] = spinner;
-         spinner.addChangeListener(new ChangeListener(){
-            public void stateChanged(ChangeEvent ce) {
-               JSpinner source = (JSpinner) ce.getSource();
-               double value = ((Double) source.getValue());
-               _convMatrix3x3[index] = (float) value;
-               updated(_convMatrix3x3);
-            }
-         });
-         grid3x3Panel.add(spinner);
-      }
-      String[] options = new String[] {
-            "DEFAULT",
-            "NONE",
-            "BLUR"
-      };
-      JComboBox combo = new JComboBox(options);
-      combo.addActionListener(new ActionListener(){
-
-         @Override public void actionPerformed(ActionEvent e) {
-            JComboBox cb = (JComboBox) e.getSource();
-            String value = (String) cb.getSelectedItem();
-            if (value.equals("DEFAULT")) {
-               set(_convMatrix3x3, default3x3);
-            } else if (value.equals("NONE")) {
-               set(_convMatrix3x3, none3x3);
-            } else if (value.equals("BLUR")) {
-               set(_convMatrix3x3, blur3x3);
-            }
-         }
-
-      });
-      controlPanel.add(combo);
-
-      leftPanel.add(controlPanel, BorderLayout.NORTH);
-   }
-}
+/*
+Copyright (c) 2010-2011, Advanced Micro Devices, Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
+following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list of conditions and the following
+disclaimer. 
+
+Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
+disclaimer in the documentation and/or other materials provided with the distribution. 
+
+Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export
+laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through
+774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000.  Further, pursuant to Section 740.6 of the EAR,
+you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of 
+Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration 
+Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1,
+E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups
+D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject
+to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774
+of EAR).  For the most current Country Group listings, or for additional information about the EAR or your obligations
+under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. 
+
+*/
+
+package com.aparapi.examples.movie;
+
+import java.awt.BorderLayout;
+import java.awt.Component;
+import java.awt.GridLayout;
+import java.awt.event.ActionEvent;
+import java.awt.event.ActionListener;
+import java.util.Arrays;
+
+import javax.swing.BoxLayout;
+import javax.swing.JComboBox;
+import javax.swing.JPanel;
+import javax.swing.JSpinner;
+import javax.swing.SpinnerModel;
+import javax.swing.SpinnerNumberModel;
+import javax.swing.event.ChangeEvent;
+import javax.swing.event.ChangeListener;
+
+/**
+ * Swing control for editing a 3x3 convolution matrix in place: nine spinners plus a preset chooser.
+ * Subclasses override {@link #updated(float[])} to react to changes.
+ */
+public class ConvMatrix3x3Editor{
+   /** Root panel of the editor, to be added to the host container. */
+   Component component;
+
+   /** Copy of the matrix as passed to the constructor; restored by the DEFAULT preset. */
+   float[] default3x3;
+
+   /** NONE preset: only the centre weight is set. */
+   float[] none3x3 = new float[] {
+         0, 0, 0,
+         0, 1, 0,
+         0, 0, 0
+   };
+
+   /** BLUR preset: every weight is 0.1. */
+   float[] blur3x3 = new float[] {
+         .1f, .1f, .1f,
+         .1f, .1f, .1f,
+         .1f, .1f, .1f
+   };
+
+   /** One spinner per matrix cell, in array-index order. */
+   JSpinner[] spinners = new JSpinner[9];
+
+   /** Hook invoked after the matrix changed; this default implementation does nothing. */
+   protected void updated(float[] _convMatrix3x3) {
+   }
+
+   /**
+    * Copies the first nine values of {@code _from} into {@code _to}, shows them in the spinners
+    * (which notifies the change listener of every spinner whose value changes) and then calls
+    * {@link #updated(float[])}.
+    */
+   void set(float[] _to, float[] _from) {
+      for (int i = 0; i < spinners.length; i++) {
+         _to[i] = _from[i];
+         spinners[i].setValue(Double.valueOf(_to[i]));
+      }
+      updated(_to);
+   }
+
+   /**
+    * Builds the editor around {@code _convMatrix3x3}, which the controls modify in place.
+    *
+    * @param _convMatrix3x3 the nine weights to edit; a copy is kept for the DEFAULT preset
+    * @throws IllegalArgumentException if a weight lies outside the spinner range of -50 to 50
+    */
+   ConvMatrix3x3Editor(final float[] _convMatrix3x3) {
+      default3x3 = Arrays.copyOf(_convMatrix3x3, _convMatrix3x3.length);
+      // A BorderLayout is needed for the NORTH constraint below to take effect; the default
+      // FlowLayout of JPanel ignores constraints.
+      JPanel leftPanel = new JPanel(new BorderLayout());
+      JPanel controlPanel = new JPanel();
+      controlPanel.setLayout(new BoxLayout(controlPanel, BoxLayout.Y_AXIS));
+      component = leftPanel;
+      JPanel grid3x3Panel = new JPanel(new GridLayout(3, 3));
+      controlPanel.add(grid3x3Panel);
+      for (int i = 0; i < spinners.length; i++) {
+         final int index = i;
+         SpinnerModel model = new SpinnerNumberModel(_convMatrix3x3[index], -50f, 50f, 1f);
+         JSpinner spinner = new JSpinner(model);
+         spinners[i] = spinner;
+         spinner.addChangeListener(new ChangeListener(){
+            @Override public void stateChanged(ChangeEvent ce) {
+               // Read the value as a Number so the listener does not depend on its boxed type.
+               Number value = (Number) ((JSpinner) ce.getSource()).getValue();
+               _convMatrix3x3[index] = value.floatValue();
+               updated(_convMatrix3x3);
+            }
+         });
+         grid3x3Panel.add(spinner);
+      }
+      String[] options = new String[] {"DEFAULT", "NONE", "BLUR"};
+      JComboBox combo = new JComboBox(options);
+      combo.addActionListener(new ActionListener(){
+         @Override public void actionPerformed(ActionEvent e) {
+            Object value = ((JComboBox) e.getSource()).getSelectedItem();
+            // Constant-first equals stays safe when nothing is selected (null item).
+            if ("DEFAULT".equals(value)) {
+               set(_convMatrix3x3, default3x3);
+            } else if ("NONE".equals(value)) {
+               set(_convMatrix3x3, none3x3);
+            } else if ("BLUR".equals(value)) {
+               set(_convMatrix3x3, blur3x3);
+            }
+         }
+      });
+      controlPanel.add(combo);
+      leftPanel.add(controlPanel, BorderLayout.NORTH);
+   }
+}
diff --git a/examples/movie/src/com/amd/aparapi/examples/movie/JJMPEGPlayer.java b/examples/movie/src/com/amd/aparapi/examples/movie/JJMPEGPlayer.java
index 79f546b6..7404255d 100644
--- a/examples/movie/src/com/amd/aparapi/examples/movie/JJMPEGPlayer.java
+++ b/examples/movie/src/com/amd/aparapi/examples/movie/JJMPEGPlayer.java
@@ -1,147 +1,147 @@
-/*
-Copyright (c) 2010-2011, Advanced Micro Devices, Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
-following conditions are met:
-
-Redistributions of source code must retain the above copyright notice, this list of conditions and the following
-disclaimer. 
-
-Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
-disclaimer in the documentation and/or other materials provided with the distribution. 
-
-Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission. 
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
-INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
-WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export
-laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through
-774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000.  Further, pursuant to Section 740.6 of the EAR,
-you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of 
-Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration 
-Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1,
-E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups
-D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject
-to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774
-of EAR).  For the most current Country Group listings, or for additional information about the EAR or your obligations
-under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. 
-
-*/
-package com.aparapi.examples.movie;
-
-import java.awt.BorderLayout;
-import java.awt.Graphics;
-import java.awt.Graphics2D;
-import java.awt.image.BufferedImage;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-
-import javax.swing.ImageIcon;
-import javax.swing.JFrame;
-import javax.swing.JLabel;
-import javax.swing.SwingUtilities;
-
-import au.notzed.jjmpeg.io.JJMediaReader;
-import au.notzed.jjmpeg.io.JJMediaReader.JJReaderVideo;
-
-/**
- * Code based on Demo of JJVideoScanner class from jjmpeg 
- * 
- * See http://code.google.com/p/jjmpeg/
- * 
- * @author notzed
- * @author gfrost
- */
-public abstract class JJMPEGPlayer{
-
-   public JJMPEGPlayer(final String _title, final String _fileName, final float[] _convMatrix3x3) {
-      SwingUtilities.invokeLater(new Runnable(){
-         final Object doorBell = new Object();
-
-         public void run() {
-
-            JFrame frame = new JFrame(_title);
-            frame.getContentPane().setLayout(new BorderLayout());
-            final JLabel label = new JLabel(){
-               @Override public void paint(Graphics GC) {
-                  super.paint(GC);
-                  synchronized (doorBell) {
-                     doorBell.notify();
-                  }
-               }
-            };
-            frame.getContentPane().add(label, BorderLayout.CENTER);
-
-            ConvMatrix3x3Editor editor = new ConvMatrix3x3Editor(_convMatrix3x3){
-               @Override protected void updated(float[] _convMatrix3x3) {
-
-               }
-            };
-            frame.getContentPane().add(editor.component, BorderLayout.WEST);
-
-            try {
-               final JJMediaReader reader = new JJMediaReader(_fileName);
-               final JJReaderVideo vs = reader.openFirstVideoStream();
-               final BufferedImage in = vs.createImage();
-               final BufferedImage out = vs.createImage();
-
-               label.setIcon(new ImageIcon(out));
-
-               new Thread(new Runnable(){
-                  public void run() {
-                     int frames = 0;
-                     long start = System.currentTimeMillis() - 1;
-                     try {
-                        while (true) {
-                           JJMediaReader.JJReaderStream rs = reader.readFrame();
-                           if (rs != null) {
-                              vs.getOutputFrame(in);
-                              Graphics2D gc = in.createGraphics();
-                              frames++;
-                              long fps = (frames * 1000) / (System.currentTimeMillis() - start);
-                              gc.drawString("" + fps, 20, 20);
-
-                              processFrame(gc, _convMatrix3x3, in, out);
-
-                              label.repaint();
-                              synchronized (doorBell) {
-                                 try {
-                                    doorBell.wait();
-                                 } catch (InterruptedException ie) {
-                                    ie.getStackTrace();
-                                 }
-                              }
-                           } else {
-                              reader.dispose();
-                              System.exit(1);
-                           }
-                           Thread.sleep(1);
-                        }
-                     } catch (Exception ex) {
-                        ex.printStackTrace();
-                        Logger.getLogger(JJMPEGPlayer.class.getName()).log(Level.SEVERE, null, ex);
-                     }
-                  }
-               }).start();
-               frame.pack();
-               frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
-               frame.setVisible(true);
-            } catch (Exception ex) {
-               Logger.getLogger(JJMPEGPlayer.class.getName()).log(Level.SEVERE, null, ex);
-            }
-
-         }
-      });
-   }
-
-   protected abstract void processFrame(Graphics2D gc, float[] _convMatrix, BufferedImage in, BufferedImage _out);
-
-}
+/*
+Copyright (c) 2010-2011, Advanced Micro Devices, Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
+following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list of conditions and the following
+disclaimer. 
+
+Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
+disclaimer in the documentation and/or other materials provided with the distribution. 
+
+Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export
+laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through
+774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000.  Further, pursuant to Section 740.6 of the EAR,
+you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of 
+Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration 
+Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1,
+E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups
+D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject
+to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774
+of EAR).  For the most current Country Group listings, or for additional information about the EAR or your obligations
+under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. 
+
+*/
+package com.aparapi.examples.movie;
+
+import java.awt.BorderLayout;
+import java.awt.Graphics;
+import java.awt.Graphics2D;
+import java.awt.image.BufferedImage;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import javax.swing.ImageIcon;
+import javax.swing.JFrame;
+import javax.swing.JLabel;
+import javax.swing.SwingUtilities;
+
+import au.notzed.jjmpeg.io.JJMediaReader;
+import au.notzed.jjmpeg.io.JJMediaReader.JJReaderVideo;
+
+/**
+ * Code based on Demo of JJVideoScanner class from jjmpeg 
+ * 
+ * See http://code.google.com/p/jjmpeg/
+ * 
+ * @author notzed
+ * @author gfrost
+ */
+public abstract class JJMPEGPlayer{
+   /** Builds the player window on the Swing event thread and starts a background thread that decodes, filters and displays frames of {@code _fileName}. */
+   public JJMPEGPlayer(final String _title, final String _fileName, final float[] _convMatrix3x3) {
+      SwingUtilities.invokeLater(new Runnable(){
+         final Object doorBell = new Object(); // notified from paint() so the decoder thread knows the label has been drawn
+
+         public void run() {
+
+            JFrame frame = new JFrame(_title);
+            frame.getContentPane().setLayout(new BorderLayout());
+            final JLabel label = new JLabel(){
+               @Override public void paint(Graphics GC) {
+                  super.paint(GC);
+                  synchronized (doorBell) {
+                     doorBell.notify();
+                  }
+               }
+            };
+            frame.getContentPane().add(label, BorderLayout.CENTER);
+            // The editor is built from the same matrix array that is later handed to processFrame(); its update hook is a no-op.
+            ConvMatrix3x3Editor editor = new ConvMatrix3x3Editor(_convMatrix3x3){
+               @Override protected void updated(float[] _convMatrix3x3) {
+
+               }
+            };
+            frame.getContentPane().add(editor.component, BorderLayout.WEST);
+
+            try {
+               final JJMediaReader reader = new JJMediaReader(_fileName);
+               final JJReaderVideo vs = reader.openFirstVideoStream();
+               final BufferedImage in = vs.createImage();
+               final BufferedImage out = vs.createImage();
+
+               label.setIcon(new ImageIcon(out));
+
+               new Thread(new Runnable(){
+                  public void run() {
+                     int frames = 0;
+                     long start = System.currentTimeMillis() - 1; // minus 1 so the fps divisor below is never zero
+                     try {
+                        while (true) {
+                           JJMediaReader.JJReaderStream rs = reader.readFrame();
+                           if (rs != null) {
+                              vs.getOutputFrame(in);
+                              Graphics2D gc = in.createGraphics();
+                              frames++;
+                              long fps = (frames * 1000) / (System.currentTimeMillis() - start);
+                              gc.drawString("" + fps, 20, 20);
+
+                              processFrame(gc, _convMatrix3x3, in, out);
+                              // Request the repaint while holding the lock: paint() cannot notify until wait() has released it, so the wake-up is never lost.
+                              synchronized (doorBell) {
+                                 label.repaint();
+                                 try {
+                                    doorBell.wait();
+                                 } catch (InterruptedException ie) {
+                                    Thread.currentThread().interrupt(); // keep the interrupt visible instead of discarding it
+                                 }
+                              }
+                           } else { // readFrame() returned null: release the reader and terminate the JVM
+                              reader.dispose();
+                              System.exit(1);
+                           }
+                           Thread.sleep(1);
+                        }
+                     } catch (Exception ex) {
+                        ex.printStackTrace();
+                        Logger.getLogger(JJMPEGPlayer.class.getName()).log(Level.SEVERE, null, ex);
+                     }
+                  }
+               }).start();
+               frame.pack();
+               frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
+               frame.setVisible(true);
+            } catch (Exception ex) {
+               Logger.getLogger(JJMPEGPlayer.class.getName()).log(Level.SEVERE, null, ex);
+            }
+
+         }
+      });
+   }
+   /** Called on the decoder thread for each decoded frame; {@code gc} draws onto {@code in}, and {@code _out} is the image shown in the window, so implementations are expected to render the result into {@code _out}. */
+   protected abstract void processFrame(Graphics2D gc, float[] _convMatrix, BufferedImage in, BufferedImage _out);
+
+}
diff --git a/examples/movie/src/com/amd/aparapi/examples/movie/PureJavaSolution.java b/examples/movie/src/com/amd/aparapi/examples/movie/PureJavaSolution.java
index a7b32024..db6fb6f2 100644
--- a/examples/movie/src/com/amd/aparapi/examples/movie/PureJavaSolution.java
+++ b/examples/movie/src/com/amd/aparapi/examples/movie/PureJavaSolution.java
@@ -1,35 +1,35 @@
-package com.aparapi.examples.movie;
-
-import java.awt.Graphics2D;
-import java.awt.image.ConvolveOp;
-
-import java.awt.image.BufferedImage;
-
-public class PureJavaSolution{
-
-   public static void main(final String[] _args) {
-      String fileName = _args.length == 1 ? _args[0] : "Leo720p.wmv";
-
-      float[] convMatrix3x3 = new float[] {
-            0f,
-            -10f,
-            0f,
-            -10f,
-            41f,
-            -10f,
-            0f,
-            -10f,
-            0f
-      };
-
-      new JJMPEGPlayer("lab_6.alternate", fileName, convMatrix3x3){
-
-         @Override protected void processFrame(Graphics2D _gc, float[] _convMatrix3x3, BufferedImage _in, BufferedImage _out) {
-            java.awt.image.Kernel conv = new java.awt.image.Kernel(3, 3, _convMatrix3x3);
-            ConvolveOp convOp = new ConvolveOp(conv, ConvolveOp.EDGE_NO_OP, null);
-            convOp.filter(_in, _out);
-         }
-      };
-
-   }
-}
+package com.aparapi.examples.movie;
+
+import java.awt.Graphics2D;
+import java.awt.image.ConvolveOp;
+
+import java.awt.image.BufferedImage;
+
+public class PureJavaSolution{
+
+   /**
+    * Plays a movie file, convolving every frame with a 3x3 matrix through {@link ConvolveOp}.
+    *
+    * @param _args a single optional argument naming the movie file; "Leo720p.wmv" is used otherwise
+    */
+   public static void main(final String[] _args) {
+      final String movieFile;
+      if (_args.length == 1) {
+         movieFile = _args[0];
+      } else {
+         movieFile = "Leo720p.wmv";
+      }
+
+      // Nine weights: 41 at the centre, -10 at the four edge positions, 0 at the corners.
+      final float[] weights = { 0f, -10f, 0f, -10f, 41f, -10f, 0f, -10f, 0f };
+
+      new JJMPEGPlayer("lab_6.alternate", movieFile, weights){
+         // A new Kernel/ConvolveOp pair is built on every frame from the matrix the player passes in.
+         @Override protected void processFrame(Graphics2D _gc, float[] _convMatrix3x3, BufferedImage _in, BufferedImage _out) {
+            final java.awt.image.Kernel frameKernel = new java.awt.image.Kernel(3, 3, _convMatrix3x3);
+            new ConvolveOp(frameKernel, ConvolveOp.EDGE_NO_OP, null).filter(_in, _out);
+         }
+      };
+
+   }
+}
diff --git a/examples/movie/src/com/amd/aparapi/examples/movie/ReferenceSolution.java b/examples/movie/src/com/amd/aparapi/examples/movie/ReferenceSolution.java
index 1c39d1d0..805ff4da 100644
--- a/examples/movie/src/com/amd/aparapi/examples/movie/ReferenceSolution.java
+++ b/examples/movie/src/com/amd/aparapi/examples/movie/ReferenceSolution.java
@@ -1,136 +1,136 @@
-/*
-Copyright (c) 2010-2011, Advanced Micro Devices, Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
-following conditions are met:
-
-Redistributions of source code must retain the above copyright notice, this list of conditions and the following
-disclaimer. 
-
-Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
-disclaimer in the documentation and/or other materials provided with the distribution. 
-
-Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission. 
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
-INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
-WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export
-laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through
-774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000.  Further, pursuant to Section 740.6 of the EAR,
-you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of 
-Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration 
-Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1,
-E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups
-D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject
-to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774
-of EAR).  For the most current Country Group listings, or for additional information about the EAR or your obligations
-under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. 
-
-*/
-package com.aparapi.examples.movie;
-
-import java.awt.Graphics2D;
-import java.awt.image.BufferedImage;
-import java.awt.image.DataBufferByte;
-
-import com.aparapi.Kernel;
-import com.aparapi.Range;
-
-public class ReferenceSolution{
-
-   public static class Convolution extends Kernel{
-
-      private byte[] inputData;
-
-      private byte[] outputData;
-
-      private int width;
-
-      private int height;
-
-      private Range range;
-
-      float[] convMatrix3x3;
-
-      public Convolution(BufferedImage _imageIn, BufferedImage _imageOut) {
-         inputData = ((DataBufferByte) _imageIn.getRaster().getDataBuffer()).getData();
-         outputData = ((DataBufferByte) _imageOut.getRaster().getDataBuffer()).getData();
-         width = _imageIn.getWidth();
-         height = _imageIn.getHeight();
-         range = Range.create2D(width * 3, height);
-         setExplicit(true);
-
-      }
-
-      public void processPixel(int x, int y, int w, int h) {
-         float accum = 0;
-         int count = 0;
-         for (int dx = -3; dx < 6; dx += 3) {
-            for (int dy = -1; dy < 2; dy += 1) {
-               int rgb = 0xff & inputData[((y + dy) * w) + (x + dx)];
-               accum += rgb * convMatrix3x3[count++];
-            }
-         }
-         outputData[y * w + x] = (byte) Math.max(0, Math.min((int) accum, 255));
-      }
-
-      public void run() {
-         int x = getGlobalId(0);
-         int y = getGlobalId(1);
-         int w = getGlobalSize(0);
-         int h = getGlobalSize(1);
-         if (x > 3 && x < (w - 3) && y > 1 && y < (h - 1)) {
-            processPixel(x, y, w, h);
-         } else {
-            outputData[y * w + x] = inputData[(y * w) + x];
-         }
-      }
-
-      public void apply(float[] _convMatrix3x3) {
-         convMatrix3x3 = _convMatrix3x3;
-         for (int x = 0; x < width * 3; x++) {
-            for (int y = 0; y < height; y++) {
-               if (x > 3 && x < (width * 3 - 3) && y > 1 && y < (height - 1)) {
-                  processPixel(x, y, width * 3, height);
-               }
-            }
-         }
-      }
-
-   }
-
-   public static void main(final String[] _args) {
-      String fileName = _args.length == 1 ? _args[0] : "Leo720p.wmv";
-
-      float[] convMatrix3x3 = new float[] {
-            0f,
-            -10f,
-            0f,
-            -10f,
-            41f,
-            -10f,
-            0f,
-            -10f,
-            0f
-      };
-      new JJMPEGPlayer("Aparapi - Solution", fileName, convMatrix3x3){
-         Convolution kernel = null;
-
-         @Override protected void processFrame(Graphics2D gc, float[] _convMatrix3x3, BufferedImage in, BufferedImage out) {
-            if (kernel == null) {
-               kernel = new Convolution(in, out);
-            }
-            kernel.apply(_convMatrix3x3);
-         }
-      };
-
-   }
-}
+/*
+Copyright (c) 2010-2011, Advanced Micro Devices, Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
+following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list of conditions and the following
+disclaimer. 
+
+Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
+disclaimer in the documentation and/or other materials provided with the distribution. 
+
+Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export
+laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through
+774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000.  Further, pursuant to Section 740.6 of the EAR,
+you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of 
+Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration 
+Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1,
+E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups
+D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject
+to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774
+of EAR).  For the most current Country Group listings, or for additional information about the EAR or your obligations
+under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. 
+
+*/
+package com.aparapi.examples.movie;
+
+import java.awt.Graphics2D;
+import java.awt.image.BufferedImage;
+import java.awt.image.DataBufferByte;
+
+import com.aparapi.Kernel;
+import com.aparapi.Range;
+
+public class ReferenceSolution{
+   /** Aparapi Kernel subclass that convolves the raster bytes of one image with a 3x3 matrix and writes the result into another image. */
+   public static class Convolution extends Kernel{
+      // Byte array backing the input image's raster.
+      private byte[] inputData;
+      // Byte array backing the output image's raster.
+      private byte[] outputData;
+      // Image width in pixels; the code below treats each row as width * 3 bytes.
+      private int width;
+      // Image height in pixels.
+      private int height;
+      // 2D range of (width * 3) x height; created in the constructor but not read anywhere in this class.
+      private Range range;
+      // The nine convolution weights; replaced on every apply() call.
+      float[] convMatrix3x3;
+      /** Captures the raster byte arrays and the dimensions of the two images; both rasters are cast to DataBufferByte. */
+      public Convolution(BufferedImage _imageIn, BufferedImage _imageOut) {
+         inputData = ((DataBufferByte) _imageIn.getRaster().getDataBuffer()).getData();
+         outputData = ((DataBufferByte) _imageOut.getRaster().getDataBuffer()).getData();
+         width = _imageIn.getWidth();
+         height = _imageIn.getHeight();
+         range = Range.create2D(width * 3, height);
+         setExplicit(true);
+
+      }
+      /** Convolves the byte at (x, y) of a w-byte-wide row layout, sampling byte offsets -3/0/+3 and row offsets -1/0/+1; h is not used. */
+      public void processPixel(int x, int y, int w, int h) {
+         float accum = 0;
+         int count = 0;
+         for (int dx = -3; dx < 6; dx += 3) {
+            for (int dy = -1; dy < 2; dy += 1) {
+               int rgb = 0xff & inputData[((y + dy) * w) + (x + dx)]; // mask so the signed byte is read as 0..255
+               accum += rgb * convMatrix3x3[count++]; // weights are consumed with dy varying fastest
+            }
+         }
+         outputData[y * w + x] = (byte) Math.max(0, Math.min((int) accum, 255)); // clamp to 0..255 before narrowing to byte
+      }
+      /** Kernel entry point: convolves interior bytes and copies every other byte from input to output unchanged. */
+      public void run() {
+         int x = getGlobalId(0);
+         int y = getGlobalId(1);
+         int w = getGlobalSize(0);
+         int h = getGlobalSize(1);
+         if (x > 3 && x < (w - 3) && y > 1 && y < (h - 1)) { // strict bounds: x == 3 and y == 1 are also treated as border
+            processPixel(x, y, w, h);
+         } else {
+            outputData[y * w + x] = inputData[(y * w) + x];
+         }
+      }
+      /** Stores the matrix and convolves every interior byte in a plain Java loop; unlike run(), border bytes of the output are not written. */
+      public void apply(float[] _convMatrix3x3) {
+         convMatrix3x3 = _convMatrix3x3;
+         for (int x = 0; x < width * 3; x++) {
+            for (int y = 0; y < height; y++) {
+               if (x > 3 && x < (width * 3 - 3) && y > 1 && y < (height - 1)) {
+                  processPixel(x, y, width * 3, height);
+               }
+            }
+         }
+      }
+
+   }
+   /** Plays the movie named by the single argument (default "Leo720p.wmv"), passing each frame through a Convolution. */
+   public static void main(final String[] _args) {
+      String fileName = _args.length == 1 ? _args[0] : "Leo720p.wmv";
+
+      float[] convMatrix3x3 = new float[] {
+            0f,
+            -10f,
+            0f,
+            -10f,
+            41f,
+            -10f,
+            0f,
+            -10f,
+            0f
+      };
+      new JJMPEGPlayer("Aparapi - Solution", fileName, convMatrix3x3){
+         Convolution kernel = null;
+         // The Convolution is created on the first frame, from the in/out images the player passes in.
+         @Override protected void processFrame(Graphics2D gc, float[] _convMatrix3x3, BufferedImage in, BufferedImage out) {
+            if (kernel == null) {
+               kernel = new Convolution(in, out);
+            }
+            kernel.apply(_convMatrix3x3);
+         }
+      };
+
+   }
+}
diff --git a/pom.xml b/pom.xml
index aa62853f..c9a2738d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -83,9 +83,14 @@
             <artifactId>aparapi</artifactId>
             <version>1.4.1-SNAPSHOT</version>
         </dependency>
+        <dependency>
+            <groupId>org.jogamp.gluegen</groupId>
+            <artifactId>gluegen-rt-main</artifactId>
+            <version>2.3.2</version>
+        </dependency>
         <dependency>
             <groupId>org.jogamp.jogl</groupId>
-            <artifactId>jogl-all</artifactId>
+            <artifactId>jogl-all-main</artifactId>
             <version>2.3.2</version>
         </dependency>
         <dependency>
diff --git a/src/main/java/com/aparapi/examples/convolution/Convolution.java b/src/main/java/com/aparapi/examples/convolution/Convolution.java
index 29ffe62f..243ec0a5 100644
--- a/src/main/java/com/aparapi/examples/convolution/Convolution.java
+++ b/src/main/java/com/aparapi/examples/convolution/Convolution.java
@@ -66,12 +66,17 @@ package com.aparapi.examples.convolution;
 import com.aparapi.*;
 
 import java.io.*;
+import java.net.URISyntaxException;
 
 public class Convolution {
 
     public static void main(final String[] _args) throws IOException {
-
-        final File file = new File(_args.length == 1 ? _args[0] : "./src/main/resources/testcard.jpg").getCanonicalFile();
+        final File file;
+        try{ // an image path given on the command line wins; otherwise use the testcard bundled on the classpath
+            file = (_args.length >= 1 ? new File(_args[0]).getCanonicalFile() : new File(Convolution.class.getResource("/testcard.jpg").toURI()));
+        } catch (URISyntaxException e) {
+            throw new IllegalStateException("could not get testcard", e);
+        }
 
         final ImageConvolution convolution = new ImageConvolution();
 
diff --git a/src/main/java/com/aparapi/examples/convolution/ConvolutionOpenCL.java b/src/main/java/com/aparapi/examples/convolution/ConvolutionOpenCL.java
index 676ed4f2..95d511b7 100644
--- a/src/main/java/com/aparapi/examples/convolution/ConvolutionOpenCL.java
+++ b/src/main/java/com/aparapi/examples/convolution/ConvolutionOpenCL.java
@@ -70,6 +70,7 @@ import com.aparapi.opencl.*;
 import com.aparapi.opencl.OpenCL.*;
 
 import java.io.*;
+import java.net.URISyntaxException;
 
 public class ConvolutionOpenCL{
 
@@ -84,7 +85,12 @@ public class ConvolutionOpenCL{
    }
 
    public static void main(final String[] _args) {
-      final File file = new File(_args.length == 1 ? _args[0] : "./src/main/resources/testcard.jpg");
+      final File file;
+      try{
+         file = (_args.length >= 1 ? new File(_args[0]) : new File(ConvolutionOpenCL.class.getResource("/testcard.jpg").toURI()));
+      } catch (URISyntaxException e) {
+         throw new IllegalStateException("could not get testcard", e);
+      }
 
       final OpenCLDevice openclDevice = (OpenCLDevice) KernelManager.instance().bestDevice();
 
diff --git a/src/main/java/com/aparapi/examples/convolution/PureJava.java b/src/main/java/com/aparapi/examples/convolution/PureJava.java
index ab37dc03..a4170ca8 100644
--- a/src/main/java/com/aparapi/examples/convolution/PureJava.java
+++ b/src/main/java/com/aparapi/examples/convolution/PureJava.java
@@ -64,6 +64,7 @@ under those regulations, please refer to the U.S. Bureau of Industry and Securit
 package com.aparapi.examples.convolution;
 
 import java.io.File;
+import java.net.URISyntaxException;
 
 import com.aparapi.Kernel;
 
@@ -114,7 +115,12 @@ public class PureJava{
    }
 
    public static void main(final String[] _args) {
-      File file = new File(_args.length == 1 ? _args[0] : "./src/main/resources/testcard.jpg");
+      final File file;
+      try{ // an image path given on the command line wins; otherwise use the testcard bundled on the classpath
+         file = (_args.length >= 1 ? new File(_args[0]) : new File(PureJava.class.getResource("/testcard.jpg").toURI()));
+      } catch (URISyntaxException e) {
+         throw new IllegalStateException("could not get testcard", e);
+      }
 
       final ImageConvolution convolution = new ImageConvolution();
 
diff --git a/src/main/java/com/aparapi/examples/convolution/convolution.cl b/src/main/java/com/aparapi/examples/convolution/convolution.cl
index e1acfa8f..107b4c7f 100644
--- a/src/main/java/com/aparapi/examples/convolution/convolution.cl
+++ b/src/main/java/com/aparapi/examples/convolution/convolution.cl
@@ -1,28 +1,28 @@
-void processPixel(__global float* _convMatrix3x3, __global char* _imageIn, __global char* _imageOut, int _width, int _height, int _x, int _y){
-   float accum = 0.0f;
-   int count = 0;
-   for (int dx = -3; dx<6; dx+=3){
-      for (int dy = -1; dy<2; dy++){
-         int rgb = 0xff & _imageIn[(((_y + dy) * _width) + (_x + dx))];
-         accum = accum + ((float)rgb * _convMatrix3x3[count++]);
-      }
-   }
-   char value = (char )max(0, min((int)accum, 255));
-   _imageOut[(_y * _width) + _x]  = value;
-   return;
-}
-
-__kernel void applyConvolution(
-   __global float *_convMatrix3x3,  // only read from kernel
-   __global char  *_imageIn, // only read from kernel
-   __global char  *_imageOut, // only written to (never read) from kernel
-   int _width,
-   int _height
-){
- int x = get_global_id(0) % (_width * 3);
- int y = get_global_id(0) / (_width * 3);
- if (x>3 && x<((_width * 3) - 3) && y>1 && y<(_height - 1)){
-    processPixel(_convMatrix3x3, _imageIn, _imageOut, _width*3, _height, x, y);
- }
-}
-
+// Convolves the byte at (_x, _y) of a _width-byte-wide row layout, sampling byte offsets -3/0/+3 and row offsets -1/0/+1; _height is unused.
+void processPixel(__global float* _convMatrix3x3, __global char* _imageIn, __global char* _imageOut, int _width, int _height, int _x, int _y){
+   float sum = 0.0f;
+   int weightIndex = 0;
+   for (int colOffset = -3; colOffset <= 3; colOffset += 3){
+      for (int rowOffset = -1; rowOffset <= 1; rowOffset++){
+         int sample = _imageIn[(((_y + rowOffset) * _width) + (_x + colOffset))] & 0xff;  // read the signed char as 0..255
+         sum = sum + ((float)sample * _convMatrix3x3[weightIndex]);
+         weightIndex++;  // weights are consumed with the row offset varying fastest
+      }
+   }
+   _imageOut[(_y * _width) + _x]  = (char )max(0, min((int)sum, 255));  // clamp to 0..255 before narrowing
+}
+
+__kernel void applyConvolution(  // one work-item per image byte, addressed through a flat 1D global id
+   __global float *_convMatrix3x3,  // only read from kernel
+   __global char  *_imageIn, // only read from kernel
+   __global char  *_imageOut, // only written to (never read) from kernel
+   int _width,
+   int _height
+){
+ const int rowBytes = _width * 3;  // each row is treated as _width * 3 bytes
+ const int x = get_global_id(0) % rowBytes, y = get_global_id(0) / rowBytes;
+ if (x>3 && x<(rowBytes - 3) && y>1 && y<(_height - 1)){  // border bytes are skipped and never written
+    processPixel(_convMatrix3x3, _imageIn, _imageOut, rowBytes, _height, x, y);
+ }
+}
+
diff --git a/src/main/java/com/aparapi/examples/extension/HistogramKernel.cl b/src/main/java/com/aparapi/examples/extension/HistogramKernel.cl
index aaa5c154..11b156cc 100644
--- a/src/main/java/com/aparapi/examples/extension/HistogramKernel.cl
+++ b/src/main/java/com/aparapi/examples/extension/HistogramKernel.cl
@@ -1,166 +1,166 @@
-/* ============================================================
-
-Copyright (c) 2009-2010 Advanced Micro Devices, Inc.  All rights reserved.
- 
-Redistribution and use of this material is permitted under the following 
-conditions:
- 
-Redistributions must retain the above copyright notice and all terms of this 
-license.
- 
-In no event shall anyone redistributing or accessing or using this material 
-commence or participate in any arbitration or legal action relating to this 
-material against Advanced Micro Devices, Inc. or any copyright holders or 
-contributors. The foregoing shall survive any expiration or termination of 
-this license or any agreement or access or use related to this material. 
-
-ANY BREACH OF ANY TERM OF THIS LICENSE SHALL RESULT IN THE IMMEDIATE REVOCATION 
-OF ALL RIGHTS TO REDISTRIBUTE, ACCESS OR USE THIS MATERIAL.
-
-THIS MATERIAL IS PROVIDED BY ADVANCED MICRO DEVICES, INC. AND ANY COPYRIGHT 
-HOLDERS AND CONTRIBUTORS "AS IS" IN ITS CURRENT CONDITION AND WITHOUT ANY 
-REPRESENTATIONS, GUARANTEE, OR WARRANTY OF ANY KIND OR IN ANY WAY RELATED TO 
-SUPPORT, INDEMNITY, ERROR FREE OR UNINTERRUPTED OPERA TION, OR THAT IT IS FREE 
-FROM DEFECTS OR VIRUSES.  ALL OBLIGATIONS ARE HEREBY DISCLAIMED - WHETHER 
-EXPRESS, IMPLIED, OR STATUTORY - INCLUDING, BUT NOT LIMITED TO, ANY IMPLIED 
-WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, 
-ACCURACY, COMPLETENESS, OPERABILITY, QUALITY OF SERVICE, OR NON-INFRINGEMENT. 
-IN NO EVENT SHALL ADVANCED MICRO DEVICES, INC. OR ANY COPYRIGHT HOLDERS OR 
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, PUNITIVE,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
-OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, REVENUE, DATA, OR PROFITS; OR 
-BUSINESS INTERRUPTION) HOWEVER CAUSED OR BASED ON ANY THEORY OF LIABILITY 
-ARISING IN ANY WAY RELATED TO THIS MATERIAL, EVEN IF ADVISED OF THE POSSIBILITY 
-OF SUCH DAMAGE. THE ENTIRE AND AGGREGATE LIABILITY OF ADVANCED MICRO DEVICES, 
-INC. AND ANY COPYRIGHT HOLDERS AND CONTRIBUTORS SHALL NOT EXCEED TEN DOLLARS 
-(US $10.00). ANYONE REDISTRIBUTING OR ACCESSING OR USING THIS MATERIAL ACCEPTS 
-THIS ALLOCATION OF RISK AND AGREES TO RELEASE ADVANCED MICRO DEVICES, INC. AND 
-ANY COPYRIGHT HOLDERS AND CONTRIBUTORS FROM ANY AND ALL LIABILITIES, 
-OBLIGATIONS, CLAIMS, OR DEMANDS IN EXCESS OF TEN DOLLARS (US $10.00). THE 
-FOREGOING ARE ESSENTIAL TERMS OF THIS LICENSE AND, IF ANY OF THESE TERMS ARE 
-CONSTRUED AS UNENFORCEABLE, FAIL IN ESSENTIAL PURPOSE, OR BECOME VOID OR 
-DETRIMENTAL TO ADVANCED MICRO DEVICES, INC. OR ANY COPYRIGHT HOLDERS OR 
-CONTRIBUTORS FOR ANY REASON, THEN ALL RIGHTS TO REDISTRIBUTE, ACCESS OR USE 
-THIS MATERIAL SHALL TERMINATE IMMEDIATELY. MOREOVER, THE FOREGOING SHALL 
-SURVIVE ANY EXPIRATION OR TERMINATION OF THIS LICENSE OR ANY AGREEMENT OR 
-ACCESS OR USE RELATED TO THIS MATERIAL.
-
-NOTICE IS HEREBY PROVIDED, AND BY REDISTRIBUTING OR ACCESSING OR USING THIS 
-MATERIAL SUCH NOTICE IS ACKNOWLEDGED, THAT THIS MATERIAL MAY BE SUBJECT TO 
-RESTRICTIONS UNDER THE LAWS AND REGULATIONS OF THE UNITED STATES OR OTHER 
-COUNTRIES, WHICH INCLUDE BUT ARE NOT LIMITED TO, U.S. EXPORT CONTROL LAWS SUCH 
-AS THE EXPORT ADMINISTRATION REGULATIONS AND NATIONAL SECURITY CONTROLS AS 
-DEFINED THEREUNDER, AS WELL AS STATE DEPARTMENT CONTROLS UNDER THE U.S. 
-MUNITIONS LIST. THIS MATERIAL MAY NOT BE USED, RELEASED, TRANSFERRED, IMPORTED,
-EXPORTED AND/OR RE-EXPORTED IN ANY MANNER PROHIBITED UNDER ANY APPLICABLE LAWS, 
-INCLUDING U.S. EXPORT CONTROL LAWS REGARDING SPECIFICALLY DESIGNATED PERSONS, 
-COUNTRIES AND NATIONALS OF COUNTRIES SUBJECT TO NATIONAL SECURITY CONTROLS. 
-MOREOVER, THE FOREGOING SHALL SURVIVE ANY EXPIRATION OR TERMINATION OF ANY 
-LICENSE OR AGREEMENT OR ACCESS OR USE RELATED TO THIS MATERIAL.
-
-NOTICE REGARDING THE U.S. GOVERNMENT AND DOD AGENCIES: This material is 
-provided with "RESTRICTED RIGHTS" and/or "LIMITED RIGHTS" as applicable to 
-computer software and technical data, respectively. Use, duplication, 
-distribution or disclosure by the U.S. Government and/or DOD agencies is 
-subject to the full extent of restrictions in all applicable regulations, 
-including those found at FAR52.227 and DFARS252.227 et seq. and any successor 
-regulations thereof. Use of this material by the U.S. Government and/or DOD 
-agencies is acknowledgment of the proprietary rights of any copyright holders 
-and contributors, including those of Advanced Micro Devices, Inc., as well as 
-the provisions of FAR52.227-14 through 23 regarding privately developed and/or 
-commercial computer software.
-
-This license forms the entire agreement regarding the subject matter hereof and 
-supersedes all proposals and prior discussions and writings between the parties 
-with respect thereto. This license does not affect any ownership, rights, title,
-or interest in, or relating to, this material. No terms of this license can be 
-modified or waived, and no breach of this license can be excused, unless done 
-so in a writing signed by all affected parties. Each term of this license is 
-separately enforceable. If any term of this license is determined to be or 
-becomes unenforceable or illegal, such term shall be reformed to the minimum 
-extent necessary in order for this license to remain in effect in accordance 
-with its terms as modified by such reformation. This license shall be governed 
-by and construed in accordance with the laws of the State of Texas without 
-regard to rules on conflicts of law of any state or jurisdiction or the United 
-Nations Convention on the International Sale of Goods. All disputes arising out 
-of this license shall be subject to the jurisdiction of the federal and state 
-courts in Austin, Texas, and all defenses are hereby waived concerning personal 
-jurisdiction and venue of these courts.
-
-============================================================ */
-
-/*
- * For a description of the algorithm and the terms used, please see the
- * documentation for this sample.
- *
- * On invocation of kernel blackScholes, each work thread calculates
- * thread-histogram bin and finally all thread-histograms merged into 
- * block-histogram bin. Outside the kernel, All block-histograms merged
- * into final histogram
- */
-
-#define LINEAR_MEM_ACCESS
-#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable 
-
-
-
-/**
- * @brief   Calculates block-histogram bin whose bin size is 256
- * @param   data  input data pointer
- * @param   sharedArray shared array for thread-histogram bins
- * @param   binResult block-histogram array
- */
-__kernel
-void histogram256(__global const uchar* data,
-                  __local uchar* sharedArray,
-                  __global uint* binResult, 
-                  uint binSize)
-{
-    size_t localId = get_local_id(0);
-    size_t globalId = get_global_id(0);
-    size_t groupId = get_group_id(0);
-    size_t groupSize = get_local_size(0);
-
-    /* initialize shared array to zero */
-    for(int i = 0; i < binSize; ++i)
-        sharedArray[localId * binSize + i] = 0;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-    
-    /* calculate thread-histograms */
-    for(int i = 0; i < binSize; ++i)
-    {
-#ifdef LINEAR_MEM_ACCESS
-        uchar value = data[groupId * groupSize * binSize + i * groupSize + localId];
-#else
-        uchar value = data[globalId * binSize + i];
-#endif // LINEAR_MEM_ACCESS
-        sharedArray[localId * binSize + value]++;
-    }
-    
-    barrier(CLK_LOCAL_MEM_FENCE); 
-    
-    /* merge all thread-histograms into block-histogram */
-    for(int i = 0; i < binSize / groupSize; ++i)
-    {
-        uint binCount = 0;
-        for(int j = 0; j < groupSize; ++j)
-            binCount += sharedArray[j * binSize + i * groupSize + localId];
-            
-        binResult[groupId * binSize + i * groupSize + localId] = binCount;
-    }
-}
-
-__kernel
-void bin256(__global uint* histo,
-                  __global const uint* binResult,
-                  uint subHistogramSize )
-{
-  size_t j = get_local_id(0);
-  size_t binSize=get_global_size(0);
-  uint histValue=0;
-  for(int i = 0; i < subHistogramSize; ++i){
-     histValue += binResult[i * binSize + j];
-  }
-  histo[j]=histValue;
-}
+/* ============================================================
+
+Copyright (c) 2009-2010 Advanced Micro Devices, Inc.  All rights reserved.
+ 
+Redistribution and use of this material is permitted under the following 
+conditions:
+ 
+Redistributions must retain the above copyright notice and all terms of this 
+license.
+ 
+In no event shall anyone redistributing or accessing or using this material 
+commence or participate in any arbitration or legal action relating to this 
+material against Advanced Micro Devices, Inc. or any copyright holders or 
+contributors. The foregoing shall survive any expiration or termination of 
+this license or any agreement or access or use related to this material. 
+
+ANY BREACH OF ANY TERM OF THIS LICENSE SHALL RESULT IN THE IMMEDIATE REVOCATION 
+OF ALL RIGHTS TO REDISTRIBUTE, ACCESS OR USE THIS MATERIAL.
+
+THIS MATERIAL IS PROVIDED BY ADVANCED MICRO DEVICES, INC. AND ANY COPYRIGHT 
+HOLDERS AND CONTRIBUTORS "AS IS" IN ITS CURRENT CONDITION AND WITHOUT ANY 
+REPRESENTATIONS, GUARANTEE, OR WARRANTY OF ANY KIND OR IN ANY WAY RELATED TO 
+SUPPORT, INDEMNITY, ERROR FREE OR UNINTERRUPTED OPERA TION, OR THAT IT IS FREE 
+FROM DEFECTS OR VIRUSES.  ALL OBLIGATIONS ARE HEREBY DISCLAIMED - WHETHER 
+EXPRESS, IMPLIED, OR STATUTORY - INCLUDING, BUT NOT LIMITED TO, ANY IMPLIED 
+WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, 
+ACCURACY, COMPLETENESS, OPERABILITY, QUALITY OF SERVICE, OR NON-INFRINGEMENT. 
+IN NO EVENT SHALL ADVANCED MICRO DEVICES, INC. OR ANY COPYRIGHT HOLDERS OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, PUNITIVE,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, REVENUE, DATA, OR PROFITS; OR 
+BUSINESS INTERRUPTION) HOWEVER CAUSED OR BASED ON ANY THEORY OF LIABILITY 
+ARISING IN ANY WAY RELATED TO THIS MATERIAL, EVEN IF ADVISED OF THE POSSIBILITY 
+OF SUCH DAMAGE. THE ENTIRE AND AGGREGATE LIABILITY OF ADVANCED MICRO DEVICES, 
+INC. AND ANY COPYRIGHT HOLDERS AND CONTRIBUTORS SHALL NOT EXCEED TEN DOLLARS 
+(US $10.00). ANYONE REDISTRIBUTING OR ACCESSING OR USING THIS MATERIAL ACCEPTS 
+THIS ALLOCATION OF RISK AND AGREES TO RELEASE ADVANCED MICRO DEVICES, INC. AND 
+ANY COPYRIGHT HOLDERS AND CONTRIBUTORS FROM ANY AND ALL LIABILITIES, 
+OBLIGATIONS, CLAIMS, OR DEMANDS IN EXCESS OF TEN DOLLARS (US $10.00). THE 
+FOREGOING ARE ESSENTIAL TERMS OF THIS LICENSE AND, IF ANY OF THESE TERMS ARE 
+CONSTRUED AS UNENFORCEABLE, FAIL IN ESSENTIAL PURPOSE, OR BECOME VOID OR 
+DETRIMENTAL TO ADVANCED MICRO DEVICES, INC. OR ANY COPYRIGHT HOLDERS OR 
+CONTRIBUTORS FOR ANY REASON, THEN ALL RIGHTS TO REDISTRIBUTE, ACCESS OR USE 
+THIS MATERIAL SHALL TERMINATE IMMEDIATELY. MOREOVER, THE FOREGOING SHALL 
+SURVIVE ANY EXPIRATION OR TERMINATION OF THIS LICENSE OR ANY AGREEMENT OR 
+ACCESS OR USE RELATED TO THIS MATERIAL.
+
+NOTICE IS HEREBY PROVIDED, AND BY REDISTRIBUTING OR ACCESSING OR USING THIS 
+MATERIAL SUCH NOTICE IS ACKNOWLEDGED, THAT THIS MATERIAL MAY BE SUBJECT TO 
+RESTRICTIONS UNDER THE LAWS AND REGULATIONS OF THE UNITED STATES OR OTHER 
+COUNTRIES, WHICH INCLUDE BUT ARE NOT LIMITED TO, U.S. EXPORT CONTROL LAWS SUCH 
+AS THE EXPORT ADMINISTRATION REGULATIONS AND NATIONAL SECURITY CONTROLS AS 
+DEFINED THEREUNDER, AS WELL AS STATE DEPARTMENT CONTROLS UNDER THE U.S. 
+MUNITIONS LIST. THIS MATERIAL MAY NOT BE USED, RELEASED, TRANSFERRED, IMPORTED,
+EXPORTED AND/OR RE-EXPORTED IN ANY MANNER PROHIBITED UNDER ANY APPLICABLE LAWS, 
+INCLUDING U.S. EXPORT CONTROL LAWS REGARDING SPECIFICALLY DESIGNATED PERSONS, 
+COUNTRIES AND NATIONALS OF COUNTRIES SUBJECT TO NATIONAL SECURITY CONTROLS. 
+MOREOVER, THE FOREGOING SHALL SURVIVE ANY EXPIRATION OR TERMINATION OF ANY 
+LICENSE OR AGREEMENT OR ACCESS OR USE RELATED TO THIS MATERIAL.
+
+NOTICE REGARDING THE U.S. GOVERNMENT AND DOD AGENCIES: This material is 
+provided with "RESTRICTED RIGHTS" and/or "LIMITED RIGHTS" as applicable to 
+computer software and technical data, respectively. Use, duplication, 
+distribution or disclosure by the U.S. Government and/or DOD agencies is 
+subject to the full extent of restrictions in all applicable regulations, 
+including those found at FAR52.227 and DFARS252.227 et seq. and any successor 
+regulations thereof. Use of this material by the U.S. Government and/or DOD 
+agencies is acknowledgment of the proprietary rights of any copyright holders 
+and contributors, including those of Advanced Micro Devices, Inc., as well as 
+the provisions of FAR52.227-14 through 23 regarding privately developed and/or 
+commercial computer software.
+
+This license forms the entire agreement regarding the subject matter hereof and 
+supersedes all proposals and prior discussions and writings between the parties 
+with respect thereto. This license does not affect any ownership, rights, title,
+or interest in, or relating to, this material. No terms of this license can be 
+modified or waived, and no breach of this license can be excused, unless done 
+so in a writing signed by all affected parties. Each term of this license is 
+separately enforceable. If any term of this license is determined to be or 
+becomes unenforceable or illegal, such term shall be reformed to the minimum 
+extent necessary in order for this license to remain in effect in accordance 
+with its terms as modified by such reformation. This license shall be governed 
+by and construed in accordance with the laws of the State of Texas without 
+regard to rules on conflicts of law of any state or jurisdiction or the United 
+Nations Convention on the International Sale of Goods. All disputes arising out 
+of this license shall be subject to the jurisdiction of the federal and state 
+courts in Austin, Texas, and all defenses are hereby waived concerning personal 
+jurisdiction and venue of these courts.
+
+============================================================ */
+
+/*
+ * For a description of the algorithm and the terms used, please see the
+ * documentation for this sample.
+ *
+ * On invocation of kernel histogram256, each work thread calculates a
+ * thread-histogram, and finally all thread-histograms are merged into a
+ * block-histogram. Outside the kernel, all block-histograms are merged
+ * into the final histogram.
+ */
+
+#define LINEAR_MEM_ACCESS
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable 
+
+
+
+/**
+ * @brief   Calculates block-histogram bin whose bin size is 256
+ * @param   data  input data pointer
+ * @param   sharedArray shared array for thread-histogram bins
+ * @param   binResult block-histogram array
+ * @param   binSize bins per histogram; also the number of input values each work-item reads */
+__kernel
+void histogram256(__global const uchar* data,
+                  __local uchar* sharedArray, // one slice of binSize counters per work-item (index = localId * binSize + bin)
+                  __global uint* binResult, 
+                  uint binSize)
+{
+    size_t localId = get_local_id(0);
+    size_t globalId = get_global_id(0); // only used when LINEAR_MEM_ACCESS is not defined
+    size_t groupId = get_group_id(0);
+    size_t groupSize = get_local_size(0);
+
+    /* initialize shared array to zero */
+    for(int i = 0; i < binSize; ++i)
+        sharedArray[localId * binSize + i] = 0; // each work-item clears only its own slice
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    /* calculate thread-histograms */
+    for(int i = 0; i < binSize; ++i)
+    {
+#ifdef LINEAR_MEM_ACCESS
+        uchar value = data[groupId * groupSize * binSize + i * groupSize + localId]; // in step i the work-items of a group read consecutive elements
+#else
+        uchar value = data[globalId * binSize + i]; // each work-item reads its own contiguous run of binSize elements
+#endif // LINEAR_MEM_ACCESS
+        sharedArray[localId * binSize + value]++; // NOTE(review): uchar counter; it wraps to 0 if one work-item sees the same value 256 times
+    }
+    
+    barrier(CLK_LOCAL_MEM_FENCE); // every thread-histogram must be complete before the slices are summed
+    
+    /* merge all thread-histograms into block-histogram */
+    for(int i = 0; i < binSize / groupSize; ++i) // integer division: runs zero times if groupSize > binSize, and drops trailing bins if binSize is not a multiple of groupSize
+    {
+        uint binCount = 0;
+        for(int j = 0; j < groupSize; ++j)
+            binCount += sharedArray[j * binSize + i * groupSize + localId]; // same bin, summed over every work-item's slice
+            
+        binResult[groupId * binSize + i * groupSize + localId] = binCount; // each work-group writes its own run of binSize entries
+    }
+}
+
+__kernel // Adds up subHistogramSize histograms stored back to back in binResult and writes the per-bin totals to histo.
+void bin256(__global uint* histo, // output: histo[j] receives the total for bin j
+                  __global const uint* binResult, // input: histogram i starts at offset i * binSize
+                  uint subHistogramSize ) // number of histograms to add up
+{
+  size_t j = get_local_id(0); // bin handled by this work-item; NOTE(review): local id, so this presumably expects a single work-group - confirm against the host code
+  size_t binSize=get_global_size(0); // stride between consecutive histograms = total number of work-items launched
+  uint histValue=0;
+  for(int i = 0; i < subHistogramSize; ++i){
+     histValue += binResult[i * binSize + j]; // bin j of histogram i
+  }
+  histo[j]=histValue;
+}
diff --git a/src/main/java/com/aparapi/examples/extension/fft.cl b/src/main/java/com/aparapi/examples/extension/fft.cl
index f4085983..ae8e31b7 100644
--- a/src/main/java/com/aparapi/examples/extension/fft.cl
+++ b/src/main/java/com/aparapi/examples/extension/fft.cl
@@ -1,737 +1,737 @@
-/* ============================================================
-
-Copyright (c) 2009-2010 Advanced Micro Devices, Inc.  All rights reserved.
- 
-Redistribution and use of this material is permitted under the following 
-conditions:
- 
-Redistributions must retain the above copyright notice and all terms of this 
-license.
- 
-In no event shall anyone redistributing or accessing or using this material 
-commence or participate in any arbitration or legal action relating to this 
-material against Advanced Micro Devices, Inc. or any copyright holders or 
-contributors. The foregoing shall survive any expiration or termination of 
-this license or any agreement or access or use related to this material. 
-
-ANY BREACH OF ANY TERM OF THIS LICENSE SHALL RESULT IN THE IMMEDIATE REVOCATION 
-OF ALL RIGHTS TO REDISTRIBUTE, ACCESS OR USE THIS MATERIAL.
-
-THIS MATERIAL IS PROVIDED BY ADVANCED MICRO DEVICES, INC. AND ANY COPYRIGHT 
-HOLDERS AND CONTRIBUTORS "AS IS" IN ITS CURRENT CONDITION AND WITHOUT ANY 
-REPRESENTATIONS, GUARANTEE, OR WARRANTY OF ANY KIND OR IN ANY WAY RELATED TO 
-SUPPORT, INDEMNITY, ERROR FREE OR UNINTERRUPTED OPERA TION, OR THAT IT IS FREE 
-FROM DEFECTS OR VIRUSES.  ALL OBLIGATIONS ARE HEREBY DISCLAIMED - WHETHER 
-EXPRESS, IMPLIED, OR STATUTORY - INCLUDING, BUT NOT LIMITED TO, ANY IMPLIED 
-WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, 
-ACCURACY, COMPLETENESS, OPERABILITY, QUALITY OF SERVICE, OR NON-INFRINGEMENT. 
-IN NO EVENT SHALL ADVANCED MICRO DEVICES, INC. OR ANY COPYRIGHT HOLDERS OR 
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, PUNITIVE,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
-OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, REVENUE, DATA, OR PROFITS; OR 
-BUSINESS INTERRUPTION) HOWEVER CAUSED OR BASED ON ANY THEORY OF LIABILITY 
-ARISING IN ANY WAY RELATED TO THIS MATERIAL, EVEN IF ADVISED OF THE POSSIBILITY 
-OF SUCH DAMAGE. THE ENTIRE AND AGGREGATE LIABILITY OF ADVANCED MICRO DEVICES, 
-INC. AND ANY COPYRIGHT HOLDERS AND CONTRIBUTORS SHALL NOT EXCEED TEN DOLLARS 
-(US $10.00). ANYONE REDISTRIBUTING OR ACCESSING OR USING THIS MATERIAL ACCEPTS 
-THIS ALLOCATION OF RISK AND AGREES TO RELEASE ADVANCED MICRO DEVICES, INC. AND 
-ANY COPYRIGHT HOLDERS AND CONTRIBUTORS FROM ANY AND ALL LIABILITIES, 
-OBLIGATIONS, CLAIMS, OR DEMANDS IN EXCESS OF TEN DOLLARS (US $10.00). THE 
-FOREGOING ARE ESSENTIAL TERMS OF THIS LICENSE AND, IF ANY OF THESE TERMS ARE 
-CONSTRUED AS UNENFORCEABLE, FAIL IN ESSENTIAL PURPOSE, OR BECOME VOID OR 
-DETRIMENTAL TO ADVANCED MICRO DEVICES, INC. OR ANY COPYRIGHT HOLDERS OR 
-CONTRIBUTORS FOR ANY REASON, THEN ALL RIGHTS TO REDISTRIBUTE, ACCESS OR USE 
-THIS MATERIAL SHALL TERMINATE IMMEDIATELY. MOREOVER, THE FOREGOING SHALL 
-SURVIVE ANY EXPIRATION OR TERMINATION OF THIS LICENSE OR ANY AGREEMENT OR 
-ACCESS OR USE RELATED TO THIS MATERIAL.
-
-NOTICE IS HEREBY PROVIDED, AND BY REDISTRIBUTING OR ACCESSING OR USING THIS 
-MATERIAL SUCH NOTICE IS ACKNOWLEDGED, THAT THIS MATERIAL MAY BE SUBJECT TO 
-RESTRICTIONS UNDER THE LAWS AND REGULATIONS OF THE UNITED STATES OR OTHER 
-COUNTRIES, WHICH INCLUDE BUT ARE NOT LIMITED TO, U.S. EXPORT CONTROL LAWS SUCH 
-AS THE EXPORT ADMINISTRATION REGULATIONS AND NATIONAL SECURITY CONTROLS AS 
-DEFINED THEREUNDER, AS WELL AS STATE DEPARTMENT CONTROLS UNDER THE U.S. 
-MUNITIONS LIST. THIS MATERIAL MAY NOT BE USED, RELEASED, TRANSFERRED, IMPORTED,
-EXPORTED AND/OR RE-EXPORTED IN ANY MANNER PROHIBITED UNDER ANY APPLICABLE LAWS, 
-INCLUDING U.S. EXPORT CONTROL LAWS REGARDING SPECIFICALLY DESIGNATED PERSONS, 
-COUNTRIES AND NATIONALS OF COUNTRIES SUBJECT TO NATIONAL SECURITY CONTROLS. 
-MOREOVER, THE FOREGOING SHALL SURVIVE ANY EXPIRATION OR TERMINATION OF ANY 
-LICENSE OR AGREEMENT OR ACCESS OR USE RELATED TO THIS MATERIAL.
-
-NOTICE REGARDING THE U.S. GOVERNMENT AND DOD AGENCIES: This material is 
-provided with "RESTRICTED RIGHTS" and/or "LIMITED RIGHTS" as applicable to 
-computer software and technical data, respectively. Use, duplication, 
-distribution or disclosure by the U.S. Government and/or DOD agencies is 
-subject to the full extent of restrictions in all applicable regulations, 
-including those found at FAR52.227 and DFARS252.227 et seq. and any successor 
-regulations thereof. Use of this material by the U.S. Government and/or DOD 
-agencies is acknowledgment of the proprietary rights of any copyright holders 
-and contributors, including those of Advanced Micro Devices, Inc., as well as 
-the provisions of FAR52.227-14 through 23 regarding privately developed and/or 
-commercial computer software.
-
-This license forms the entire agreement regarding the subject matter hereof and 
-supersedes all proposals and prior discussions and writings between the parties 
-with respect thereto. This license does not affect any ownership, rights, title,
-or interest in, or relating to, this material. No terms of this license can be 
-modified or waived, and no breach of this license can be excused, unless done 
-so in a writing signed by all affected parties. Each term of this license is 
-separately enforceable. If any term of this license is determined to be or 
-becomes unenforceable or illegal, such term shall be reformed to the minimum 
-extent necessary in order for this license to remain in effect in accordance 
-with its terms as modified by such reformation. This license shall be governed 
-by and construed in accordance with the laws of the State of Texas without 
-regard to rules on conflicts of law of any state or jurisdiction or the United 
-Nations Convention on the International Sale of Goods. All disputes arising out 
-of this license shall be subject to the jurisdiction of the federal and state 
-courts in Austin, Texas, and all defenses are hereby waived concerning personal 
-jurisdiction and venue of these courts.
-
-============================================================ */
-
-
-// This is 2 PI / 1024
-#define ANGLE 0x1.921fb6p-8F
-
-// Return sin and cos of -2*pi*i/1024
-__attribute__((always_inline)) float
-k_sincos(int i, float *cretp)
-{
-    if (i > 512)
-	i -= 1024;
-
-    float x = i * -ANGLE;
-    *cretp = native_cos(x);
-    return native_sin(x);
-}
-
-__attribute__((always_inline)) float4
-k_sincos4(int4 i, float4 *cretp)
-{
-    i -= (i > 512) & 1024;
-    float4 x = convert_float4(i) * -ANGLE;
-    *cretp = native_cos(x);
-    return native_sin(x);
-}
-
-// Twiddle factor stuff
-#define TWGEN(I,C,S) \
-    float C; \
-    float S = k_sincos(tbase * I, &C)
-
-#define TW4GEN(I,C,S) \
-    float4 C; \
-    float4 S = k_sincos4(tbase * I, &C)
-
-#define TWAPPLY(ZR, ZI, C, S) \
-    do { \
-	float4 __r = C * ZR - S * ZI; \
-	ZI = C * ZI + S * ZR; \
-	ZR = __r; \
-    } while (0)
-
-# define TW4IDDLE4() \
-    do { \
-        TW4GEN(1, c1, s1); \
-        TWAPPLY(zr1, zi1, c1, s1); \
-        TW4GEN(2, c2, s2); \
-        TWAPPLY(zr2, zi2, c2, s2); \
-        TW4GEN(3, c3, s3); \
-        TWAPPLY(zr3, zi3, c3, s3); \
-    } while (0)
-    
-# define TWIDDLE4() \
-    do { \
-        TWGEN(1, c1, s1); \
-        TWAPPLY(zr1, zi1, c1, s1); \
-        TWGEN(2, c2, s2); \
-        TWAPPLY(zr2, zi2, c2, s2); \
-        TWGEN(3, c3, s3); \
-        TWAPPLY(zr3, zi3, c3, s3); \
-    } while (0)
-
-// 4 point FFT
-#define FFT4() \
-    do { \
-        float4 ar0 = zr0 + zr2; \
-        float4 ar2 = zr1 + zr3; \
-        float4 br0 = ar0 + ar2; \
-        float4 br1 = zr0 - zr2; \
-        float4 br2 = ar0 - ar2; \
-        float4 br3 = zr1 - zr3; \
-        float4 ai0 = zi0 + zi2; \
-        float4 ai2 = zi1 + zi3; \
-        float4 bi0 = ai0 + ai2; \
-        float4 bi1 = zi0 - zi2; \
-        float4 bi2 = ai0 - ai2; \
-        float4 bi3 = zi1 - zi3; \
-        zr0 = br0; \
-        zi0 = bi0; \
-        zr1 = br1 + bi3; \
-        zi1 = bi1 - br3; \
-        zr3 = br1 - bi3; \
-        zi3 = br3 + bi1; \
-        zr2 = br2; \
-        zi2 = bi2; \
-    } while (0)
-
-// First pass of 1K FFT
-__attribute__((always_inline)) void
-kfft_pass1(uint me,
-	    const __global float *gr, const __global float *gi,
-	    __local float *lds)
-{
-    const __global float4 *gp;
-    __local float *lp;
-
-    // Pull in transform data
-    gp = (const __global float4 *)(gr + (me << 2));
-    float4 zr0 = gp[0*64];
-    float4 zr1 = gp[1*64];
-    float4 zr2 = gp[2*64];
-    float4 zr3 = gp[3*64];
-
-    gp = (const __global float4 *)(gi + (me << 2));
-    float4 zi0 = gp[0*64];
-    float4 zi1 = gp[1*64];
-    float4 zi2 = gp[2*64];
-    float4 zi3 = gp[3*64];
-
-    FFT4();
-
-    int4 tbase = (int)(me << 2) + (int4)(0, 1, 2, 3);
-    TW4IDDLE4();
-
-    // Save registers
-    // Note that this pointer is not aligned enough to be cast to a float4*
-    lp = lds + ((me << 2) + (me >> 3));
-
-    lp[0] = zr0.x;
-    lp[1] = zr0.y;
-    lp[2] = zr0.z;
-    lp[3] = zr0.w;
-    lp += 66*4;
-
-    lp[0] = zr1.x;
-    lp[1] = zr1.y;
-    lp[2] = zr1.z;
-    lp[3] = zr1.w;
-    lp += 66*4;
-
-    lp[0] = zr2.x;
-    lp[1] = zr2.y;
-    lp[2] = zr2.z;
-    lp[3] = zr2.w;
-    lp += 66*4;
-
-    lp[0] = zr3.x;
-    lp[1] = zr3.y;
-    lp[2] = zr3.z;
-    lp[3] = zr3.w;
-    lp += 66*4;
-
-    // Imaginary part
-    lp[0] = zi0.x;
-    lp[1] = zi0.y;
-    lp[2] = zi0.z;
-    lp[3] = zi0.w;
-    lp += 66*4;
-
-    lp[0] = zi1.x;
-    lp[1] = zi1.y;
-    lp[2] = zi1.z;
-    lp[3] = zi1.w;
-    lp += 66*4;
-
-    lp[0] = zi2.x;
-    lp[1] = zi2.y;
-    lp[2] = zi2.z;
-    lp[3] = zi2.w;
-    lp += 66*4;
-
-    lp[0] = zi3.x;
-    lp[1] = zi3.y;
-    lp[2] = zi3.z;
-    lp[3] = zi3.w;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-}
-
-// Second pass of 1K FFT
-__attribute__((always_inline)) void
-kfft_pass2(uint me, __local float *lds)
-{
-    __local float *lp;
-
-    // Load registers
-    lp = lds + (me + (me >> 5));
-
-    float4 zr0, zr1, zr2, zr3;
-
-    zr0.x = lp[0*66];
-    zr1.x = lp[1*66];
-    zr2.x = lp[2*66];
-    zr3.x = lp[3*66];
-    lp += 66*4;
-
-    zr0.y = lp[0*66];
-    zr1.y = lp[1*66];
-    zr2.y = lp[2*66];
-    zr3.y = lp[3*66];
-    lp += 66*4;
-
-    zr0.z = lp[0*66];
-    zr1.z = lp[1*66];
-    zr2.z = lp[2*66];
-    zr3.z = lp[3*66];
-    lp += 66*4;
-
-    zr0.w = lp[0*66];
-    zr1.w = lp[1*66];
-    zr2.w = lp[2*66];
-    zr3.w = lp[3*66];
-    lp += 66*4;
-
-    float4 zi0, zi1, zi2, zi3;
-
-    zi0.x = lp[0*66];
-    zi1.x = lp[1*66];
-    zi2.x = lp[2*66];
-    zi3.x = lp[3*66];
-    lp += 66*4;
-
-    zi0.y = lp[0*66];
-    zi1.y = lp[1*66];
-    zi2.y = lp[2*66];
-    zi3.y = lp[3*66];
-    lp += 66*4;
-
-    zi0.z = lp[0*66];
-    zi1.z = lp[1*66];
-    zi2.z = lp[2*66];
-    zi3.z = lp[3*66];
-    lp += 66*4;
-
-    zi0.w = lp[0*66];
-    zi1.w = lp[1*66];
-    zi2.w = lp[2*66];
-    zi3.w = lp[3*66];
-
-    // Transform and twiddle
-    FFT4();
-
-    int tbase = (int)(me << 2);
-    TWIDDLE4();
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Store registers
-    lp = lds + ((me << 2) + (me >> 3));
-
-    lp[0] = zr0.x;
-    lp[1] = zr1.x;
-    lp[2] = zr2.x;
-    lp[3] = zr3.x;
-    lp += 66*4;
-
-    lp[0] = zr0.y;
-    lp[1] = zr1.y;
-    lp[2] = zr2.y;
-    lp[3] = zr3.y;
-    lp += 66*4;
-
-    lp[0] = zr0.z;
-    lp[1] = zr1.z;
-    lp[2] = zr2.z;
-    lp[3] = zr3.z;
-    lp += 66*4;
-
-    lp[0] = zr0.w;
-    lp[1] = zr1.w;
-    lp[2] = zr2.w;
-    lp[3] = zr3.w;
-    lp += 66*4;
-
-    // Imaginary part
-    lp[0] = zi0.x;
-    lp[1] = zi1.x;
-    lp[2] = zi2.x;
-    lp[3] = zi3.x;
-    lp += 66*4;
-
-    lp[0] = zi0.y;
-    lp[1] = zi1.y;
-    lp[2] = zi2.y;
-    lp[3] = zi3.y;
-    lp += 66*4;
-
-    lp[0] = zi0.z;
-    lp[1] = zi1.z;
-    lp[2] = zi2.z;
-    lp[3] = zi3.z;
-    lp += 66*4;
-
-    lp[0] = zi0.w;
-    lp[1] = zi1.w;
-    lp[2] = zi2.w;
-    lp[3] = zi3.w;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-}
-
-// Third pass of 1K FFT
-__attribute__((always_inline)) void
-kfft_pass3(uint me, __local float *lds)
-{
-    __local float *lp;
-
-    // Load registers
-    lp = lds + (me + (me >> 5));
-
-    float4 zr0, zr1, zr2, zr3;
-
-    zr0.x = lp[0*66];
-    zr1.x = lp[1*66];
-    zr2.x = lp[2*66];
-    zr3.x = lp[3*66];
-    lp += 66*4;
-
-    zr0.y = lp[0*66];
-    zr1.y = lp[1*66];
-    zr2.y = lp[2*66];
-    zr3.y = lp[3*66];
-    lp += 66*4;
-
-    zr0.z = lp[0*66];
-    zr1.z = lp[1*66];
-    zr2.z = lp[2*66];
-    zr3.z = lp[3*66];
-    lp += 66*4;
-
-    zr0.w = lp[0*66];
-    zr1.w = lp[1*66];
-    zr2.w = lp[2*66];
-    zr3.w = lp[3*66];
-    lp += 66*4;
-
-    float4 zi0, zi1, zi2, zi3;
-
-    zi0.x = lp[0*66];
-    zi1.x = lp[1*66];
-    zi2.x = lp[2*66];
-    zi3.x = lp[3*66];
-    lp += 66*4;
-
-    zi0.y = lp[0*66];
-    zi1.y = lp[1*66];
-    zi2.y = lp[2*66];
-    zi3.y = lp[3*66];
-    lp += 66*4;
-
-    zi0.z = lp[0*66];
-    zi1.z = lp[1*66];
-    zi2.z = lp[2*66];
-    zi3.z = lp[3*66];
-    lp += 66*4;
-
-    zi0.w = lp[0*66];
-    zi1.w = lp[1*66];
-    zi2.w = lp[2*66];
-    zi3.w = lp[3*66];
-
-    // Transform and twiddle
-    FFT4();
-
-    int tbase = (int)((me >> 2) << 4);
-    TWIDDLE4();
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Save registers
-    lp = lds + me;
-
-    lp[0*66] = zr0.x;
-    lp[1*66] = zr0.y;
-    lp[2*66] = zr0.z;
-    lp[3*66] = zr0.w;
-    lp += 66*4;
-
-    lp[0*66] = zr1.x;
-    lp[1*66] = zr1.y;
-    lp[2*66] = zr1.z;
-    lp[3*66] = zr1.w;
-    lp += 66*4;
-
-    lp[0*66] = zr2.x;
-    lp[1*66] = zr2.y;
-    lp[2*66] = zr2.z;
-    lp[3*66] = zr2.w;
-    lp += 66*4;
-
-    lp[0*66] = zr3.x;
-    lp[1*66] = zr3.y;
-    lp[2*66] = zr3.z;
-    lp[3*66] = zr3.w;
-    lp += 66*4;
-
-    // Imaginary part
-    lp[0*66] = zi0.x;
-    lp[1*66] = zi0.y;
-    lp[2*66] = zi0.z;
-    lp[3*66] = zi0.w;
-    lp += 66*4;
-
-    lp[0*66] = zi1.x;
-    lp[1*66] = zi1.y;
-    lp[2*66] = zi1.z;
-    lp[3*66] = zi1.w;
-    lp += 66*4;
-
-    lp[0*66] = zi2.x;
-    lp[1*66] = zi2.y;
-    lp[2*66] = zi2.z;
-    lp[3*66] = zi2.w;
-    lp += 66*4;
-
-    lp[0*66] = zi3.x;
-    lp[1*66] = zi3.y;
-    lp[2*66] = zi3.z;
-    lp[3*66] = zi3.w;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-}
-
-// Fourth pass of 1K FFT
-__attribute__((always_inline)) void
-kfft_pass4(uint me, __local float *lds)
-{
-    __local float *lp;
-
-    // Load registers
-    lp = lds + ((me & 0x3) + ((me >> 2) & 0x3)*(66*4) + ((me >> 4) << 2));
-
-    float4 zr0, zr1, zr2, zr3;
-
-    zr0.x = lp[0*66];
-    zr0.y = lp[1*66];
-    zr0.z = lp[2*66];
-    zr0.w = lp[3*66];
-    lp += 16;
-
-    zr1.x = lp[0*66];
-    zr1.y = lp[1*66];
-    zr1.z = lp[2*66];
-    zr1.w = lp[3*66];
-    lp += 16;
-
-    zr2.x = lp[0*66];
-    zr2.y = lp[1*66];
-    zr2.z = lp[2*66];
-    zr2.w = lp[3*66];
-    lp += 16;
-
-    zr3.x = lp[0*66];
-    zr3.y = lp[1*66];
-    zr3.z = lp[2*66];
-    zr3.w = lp[3*66];
-    lp += 66*4*4 - 3*16;
-
-    float4 zi0, zi1, zi2, zi3;
-
-    zi0.x = lp[0*66];
-    zi0.y = lp[1*66];
-    zi0.z = lp[2*66];
-    zi0.w = lp[3*66];
-    lp += 16;
-
-    zi1.x = lp[0*66];
-    zi1.y = lp[1*66];
-    zi1.z = lp[2*66];
-    zi1.w = lp[3*66];
-    lp += 16;
-
-    zi2.x = lp[0*66];
-    zi2.y = lp[1*66];
-    zi2.z = lp[2*66];
-    zi2.w = lp[3*66];
-    lp += 16;
-
-    zi3.x = lp[0*66];
-    zi3.y = lp[1*66];
-    zi3.z = lp[2*66];
-    zi3.w = lp[3*66];
-
-    // Transform and twiddle
-    FFT4();
-
-    int tbase = (int)((me >> 4) << 6);
-    TWIDDLE4();
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Save registers in conflict free manner
-    lp = lds + me;
-
-    lp[0*68] = zr0.x;
-    lp[1*68] = zr0.y;
-    lp[2*68] = zr0.z;
-    lp[3*68] = zr0.w;
-    lp += 68*4;
-
-    lp[0*68] = zr1.x;
-    lp[1*68] = zr1.y;
-    lp[2*68] = zr1.z;
-    lp[3*68] = zr1.w;
-    lp += 68*4;
-
-    lp[0*68] = zr2.x;
-    lp[1*68] = zr2.y;
-    lp[2*68] = zr2.z;
-    lp[3*68] = zr2.w;
-    lp += 68*4;
-
-    lp[0*68] = zr3.x;
-    lp[1*68] = zr3.y;
-    lp[2*68] = zr3.z;
-    lp[3*68] = zr3.w;
-    lp += 68*4;
-
-    // Imaginary part
-    lp[0*68] = zi0.x;
-    lp[1*68] = zi0.y;
-    lp[2*68] = zi0.z;
-    lp[3*68] = zi0.w;
-    lp += 68*4;
-
-    lp[0*68] = zi1.x;
-    lp[1*68] = zi1.y;
-    lp[2*68] = zi1.z;
-    lp[3*68] = zi1.w;
-    lp += 68*4;
-
-    lp[0*68] = zi2.x;
-    lp[1*68] = zi2.y;
-    lp[2*68] = zi2.z;
-    lp[3*68] = zi2.w;
-    lp += 68*4;
-
-    lp[0*68] = zi3.x;
-    lp[1*68] = zi3.y;
-    lp[2*68] = zi3.z;
-    lp[3*68] = zi3.w;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-}
-
-// Fifth and last pass of 1K FFT
-__attribute__((always_inline)) void
-kfft_pass5(uint me,
-	   const __local float *lds,
-	   __global float *gr, __global float *gi)
-{
-    const __local float *lp;
-
-    // Load registers
-    lp = lds + ((me & 0xf) + (me >> 4)*(68*4));
-
-    float4 zr0, zr1, zr2, zr3;
-
-    zr0.x = lp[0*68];
-    zr0.y = lp[1*68];
-    zr0.z = lp[2*68];
-    zr0.w = lp[3*68];
-    lp += 16;
-
-    zr1.x = lp[0*68];
-    zr1.y = lp[1*68];
-    zr1.z = lp[2*68];
-    zr1.w = lp[3*68];
-    lp += 16;
-
-    zr2.x = lp[0*68];
-    zr2.y = lp[1*68];
-    zr2.z = lp[2*68];
-    zr2.w = lp[3*68];
-    lp += 16;
-
-    zr3.x = lp[0*68];
-    zr3.y = lp[1*68];
-    zr3.z = lp[2*68];
-    zr3.w = lp[3*68];
-
-    lp += 68*4*4 - 3*16;
-
-    float4 zi0, zi1, zi2, zi3;
-
-    zi0.x = lp[0*68];
-    zi0.y = lp[1*68];
-    zi0.z = lp[2*68];
-    zi0.w = lp[3*68];
-    lp += 16;
-
-    zi1.x = lp[0*68];
-    zi1.y = lp[1*68];
-    zi1.z = lp[2*68];
-    zi1.w = lp[3*68];
-    lp += 16;
-
-    zi2.x = lp[0*68];
-    zi2.y = lp[1*68];
-    zi2.z = lp[2*68];
-    zi2.w = lp[3*68];
-    lp += 16;
-
-    zi3.x = lp[0*68];
-    zi3.y = lp[1*68];
-    zi3.z = lp[2*68];
-    zi3.w = lp[3*68];
-
-    // Transform
-    FFT4();
-
-    // Save result
-    __global float4 *gp = (__global float4 *)(gr + (me << 2));
-    gp[0*64] = zr0;
-    gp[1*64] = zr1;
-    gp[2*64] = zr2;
-    gp[3*64] = zr3;
-
-    gp = (__global float4 *)(gi + (me << 2));
-    gp[0*64] = zi0;
-    gp[1*64] = zi1;
-    gp[2*64] = zi2;
-    gp[3*64] = zi3;
-}
-
-// Distance between first real element of successive 1K vectors
-// It must be >= 1024, and a multiple of 4
-#define VSTRIDE (1024+0)
-
-// Performs a 1K complex FFT with every 64 global ids.
-// Each vector is a multiple of VSTRIDE from the first
-// Number of global ids must be a multiple of 64, e.g. 1024*64
-//
-//   greal - pointer to input and output real part of data
-//   gimag - pointer to input and output imaginary part of data
-__kernel void
-forward(__global float *greal, __global float *gimag)
-{
-    // This is 8704 bytes
-    __local float lds[68*4*4*2];
-
-    __global float *gr;
-    __global float *gi;
-    uint gid = get_global_id(0);
-    uint me = gid & 0x3fU;
-    uint dg = (gid >> 6) * VSTRIDE;
-
-    gr = greal + dg;
-    gi = gimag + dg;
-
-    kfft_pass1(me, gr, gi, lds);
-    kfft_pass2(me, lds);
-    kfft_pass3(me, lds);
-    kfft_pass4(me, lds);
-    kfft_pass5(me, lds, gr, gi);
-}
-
+/* ============================================================
+
+Copyright (c) 2009-2010 Advanced Micro Devices, Inc.  All rights reserved.
+ 
+Redistribution and use of this material is permitted under the following 
+conditions:
+ 
+Redistributions must retain the above copyright notice and all terms of this 
+license.
+ 
+In no event shall anyone redistributing or accessing or using this material 
+commence or participate in any arbitration or legal action relating to this 
+material against Advanced Micro Devices, Inc. or any copyright holders or 
+contributors. The foregoing shall survive any expiration or termination of 
+this license or any agreement or access or use related to this material. 
+
+ANY BREACH OF ANY TERM OF THIS LICENSE SHALL RESULT IN THE IMMEDIATE REVOCATION 
+OF ALL RIGHTS TO REDISTRIBUTE, ACCESS OR USE THIS MATERIAL.
+
+THIS MATERIAL IS PROVIDED BY ADVANCED MICRO DEVICES, INC. AND ANY COPYRIGHT 
+HOLDERS AND CONTRIBUTORS "AS IS" IN ITS CURRENT CONDITION AND WITHOUT ANY 
+REPRESENTATIONS, GUARANTEE, OR WARRANTY OF ANY KIND OR IN ANY WAY RELATED TO 
+SUPPORT, INDEMNITY, ERROR FREE OR UNINTERRUPTED OPERA TION, OR THAT IT IS FREE 
+FROM DEFECTS OR VIRUSES.  ALL OBLIGATIONS ARE HEREBY DISCLAIMED - WHETHER 
+EXPRESS, IMPLIED, OR STATUTORY - INCLUDING, BUT NOT LIMITED TO, ANY IMPLIED 
+WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, 
+ACCURACY, COMPLETENESS, OPERABILITY, QUALITY OF SERVICE, OR NON-INFRINGEMENT. 
+IN NO EVENT SHALL ADVANCED MICRO DEVICES, INC. OR ANY COPYRIGHT HOLDERS OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, PUNITIVE,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, REVENUE, DATA, OR PROFITS; OR 
+BUSINESS INTERRUPTION) HOWEVER CAUSED OR BASED ON ANY THEORY OF LIABILITY 
+ARISING IN ANY WAY RELATED TO THIS MATERIAL, EVEN IF ADVISED OF THE POSSIBILITY 
+OF SUCH DAMAGE. THE ENTIRE AND AGGREGATE LIABILITY OF ADVANCED MICRO DEVICES, 
+INC. AND ANY COPYRIGHT HOLDERS AND CONTRIBUTORS SHALL NOT EXCEED TEN DOLLARS 
+(US $10.00). ANYONE REDISTRIBUTING OR ACCESSING OR USING THIS MATERIAL ACCEPTS 
+THIS ALLOCATION OF RISK AND AGREES TO RELEASE ADVANCED MICRO DEVICES, INC. AND 
+ANY COPYRIGHT HOLDERS AND CONTRIBUTORS FROM ANY AND ALL LIABILITIES, 
+OBLIGATIONS, CLAIMS, OR DEMANDS IN EXCESS OF TEN DOLLARS (US $10.00). THE 
+FOREGOING ARE ESSENTIAL TERMS OF THIS LICENSE AND, IF ANY OF THESE TERMS ARE 
+CONSTRUED AS UNENFORCEABLE, FAIL IN ESSENTIAL PURPOSE, OR BECOME VOID OR 
+DETRIMENTAL TO ADVANCED MICRO DEVICES, INC. OR ANY COPYRIGHT HOLDERS OR 
+CONTRIBUTORS FOR ANY REASON, THEN ALL RIGHTS TO REDISTRIBUTE, ACCESS OR USE 
+THIS MATERIAL SHALL TERMINATE IMMEDIATELY. MOREOVER, THE FOREGOING SHALL 
+SURVIVE ANY EXPIRATION OR TERMINATION OF THIS LICENSE OR ANY AGREEMENT OR 
+ACCESS OR USE RELATED TO THIS MATERIAL.
+
+NOTICE IS HEREBY PROVIDED, AND BY REDISTRIBUTING OR ACCESSING OR USING THIS 
+MATERIAL SUCH NOTICE IS ACKNOWLEDGED, THAT THIS MATERIAL MAY BE SUBJECT TO 
+RESTRICTIONS UNDER THE LAWS AND REGULATIONS OF THE UNITED STATES OR OTHER 
+COUNTRIES, WHICH INCLUDE BUT ARE NOT LIMITED TO, U.S. EXPORT CONTROL LAWS SUCH 
+AS THE EXPORT ADMINISTRATION REGULATIONS AND NATIONAL SECURITY CONTROLS AS 
+DEFINED THEREUNDER, AS WELL AS STATE DEPARTMENT CONTROLS UNDER THE U.S. 
+MUNITIONS LIST. THIS MATERIAL MAY NOT BE USED, RELEASED, TRANSFERRED, IMPORTED,
+EXPORTED AND/OR RE-EXPORTED IN ANY MANNER PROHIBITED UNDER ANY APPLICABLE LAWS, 
+INCLUDING U.S. EXPORT CONTROL LAWS REGARDING SPECIFICALLY DESIGNATED PERSONS, 
+COUNTRIES AND NATIONALS OF COUNTRIES SUBJECT TO NATIONAL SECURITY CONTROLS. 
+MOREOVER, THE FOREGOING SHALL SURVIVE ANY EXPIRATION OR TERMINATION OF ANY 
+LICENSE OR AGREEMENT OR ACCESS OR USE RELATED TO THIS MATERIAL.
+
+NOTICE REGARDING THE U.S. GOVERNMENT AND DOD AGENCIES: This material is 
+provided with "RESTRICTED RIGHTS" and/or "LIMITED RIGHTS" as applicable to 
+computer software and technical data, respectively. Use, duplication, 
+distribution or disclosure by the U.S. Government and/or DOD agencies is 
+subject to the full extent of restrictions in all applicable regulations, 
+including those found at FAR52.227 and DFARS252.227 et seq. and any successor 
+regulations thereof. Use of this material by the U.S. Government and/or DOD 
+agencies is acknowledgment of the proprietary rights of any copyright holders 
+and contributors, including those of Advanced Micro Devices, Inc., as well as 
+the provisions of FAR52.227-14 through 23 regarding privately developed and/or 
+commercial computer software.
+
+This license forms the entire agreement regarding the subject matter hereof and 
+supersedes all proposals and prior discussions and writings between the parties 
+with respect thereto. This license does not affect any ownership, rights, title,
+or interest in, or relating to, this material. No terms of this license can be 
+modified or waived, and no breach of this license can be excused, unless done 
+so in a writing signed by all affected parties. Each term of this license is 
+separately enforceable. If any term of this license is determined to be or 
+becomes unenforceable or illegal, such term shall be reformed to the minimum 
+extent necessary in order for this license to remain in effect in accordance 
+with its terms as modified by such reformation. This license shall be governed 
+by and construed in accordance with the laws of the State of Texas without 
+regard to rules on conflicts of law of any state or jurisdiction or the United 
+Nations Convention on the International Sale of Goods. All disputes arising out 
+of this license shall be subject to the jurisdiction of the federal and state 
+courts in Austin, Texas, and all defenses are hereby waived concerning personal 
+jurisdiction and venue of these courts.
+
+============================================================ */
+
+
+// This is 2 PI / 1024
+#define ANGLE 0x1.921fb6p-8F
+
+// Returns sin(-2*pi*i/1024) and stores cos(-2*pi*i/1024) through cretp
+__attribute__((always_inline)) float
+k_sincos(int i, float *cretp)
+{
+    if (i > 512)  // one full turn (1024 steps) back: same angle, and x stays within [-pi, pi] for 0 <= i < 1024
+	i -= 1024;
+
+    float x = i * -ANGLE;
+    *cretp = native_cos(x);
+    return native_sin(x);
+}
+// Vector form of k_sincos: returns per-lane sin(-2*pi*i/1024), stores per-lane cos through cretp
+__attribute__((always_inline)) float4
+k_sincos4(int4 i, float4 *cretp)
+{
+    i -= (i > 512) & 1024;  // a vector compare yields -1 (all bits set) in true lanes, so only those lanes subtract 1024
+    float4 x = convert_float4(i) * -ANGLE;
+    *cretp = native_cos(x);
+    return native_sin(x);
+}
+
+// TWGEN: declare float C = cos and S = sin of -2*pi*(tbase*I)/1024; `tbase` must be in scope at the use site
+#define TWGEN(I,C,S) \
+    float C; \
+    float S = k_sincos(tbase * I, &C)
+// TW4GEN: as TWGEN, but goes through k_sincos4, so an int4 tbase gives every float4 lane its own angle
+#define TW4GEN(I,C,S) \
+    float4 C; \
+    float4 S = k_sincos4(tbase * I, &C)
+// TWAPPLY: in-place complex multiply (ZR + i*ZI) *= (C + i*S); __r holds the new real part until ZI is updated
+#define TWAPPLY(ZR, ZI, C, S) \
+    do { \
+	float4 __r = C * ZR - S * ZI; \
+	ZI = C * ZI + S * ZR; \
+	ZR = __r; \
+    } while (0)
+// TW4IDDLE4: rotate zr1/zi1, zr2/zi2, zr3/zi3 by the per-lane angles for tbase*1, tbase*2, tbase*3 (zr0/zi0 untouched)
+# define TW4IDDLE4() \
+    do { \
+        TW4GEN(1, c1, s1); \
+        TWAPPLY(zr1, zi1, c1, s1); \
+        TW4GEN(2, c2, s2); \
+        TWAPPLY(zr2, zi2, c2, s2); \
+        TW4GEN(3, c3, s3); \
+        TWAPPLY(zr3, zi3, c3, s3); \
+    } while (0)
+// TWIDDLE4: as TW4IDDLE4, but via the scalar TWGEN, so one angle is shared by all four lanes of a register
+# define TWIDDLE4() \
+    do { \
+        TWGEN(1, c1, s1); \
+        TWAPPLY(zr1, zi1, c1, s1); \
+        TWGEN(2, c2, s2); \
+        TWAPPLY(zr2, zi2, c2, s2); \
+        TWGEN(3, c3, s3); \
+        TWAPPLY(zr3, zi3, c3, s3); \
+    } while (0)
+
+// 4-point forward DFT, lane-wise and in place on the float4 locals zr0..zr3 / zi0..zi3 of the enclosing scope (natural output order)
+#define FFT4() \
+    do { \
+        float4 ar0 = zr0 + zr2; \
+        float4 ar2 = zr1 + zr3; \
+        float4 br0 = ar0 + ar2; \
+        float4 br1 = zr0 - zr2; \
+        float4 br2 = ar0 - ar2; \
+        float4 br3 = zr1 - zr3; \
+        float4 ai0 = zi0 + zi2; \
+        float4 ai2 = zi1 + zi3; \
+        float4 bi0 = ai0 + ai2; \
+        float4 bi1 = zi0 - zi2; \
+        float4 bi2 = ai0 - ai2; \
+        float4 bi3 = zi1 - zi3; \
+        zr0 = br0; \
+        zi0 = bi0; \
+        zr1 = br1 + bi3; \
+        zi1 = bi1 - br3; \
+        zr3 = br1 - bi3; \
+        zi3 = br3 + bi1; \
+        zr2 = br2; \
+        zi2 = bi2; \
+    } while (0)
+
+// First pass of 1K FFT: each work-item loads 16 complex points from gr/gi, runs FFT4 + per-lane twiddles, stages them in lds
+__attribute__((always_inline)) void
+kfft_pass1(uint me,
+	    const __global float *gr, const __global float *gi,
+	    __local float *lds)
+{
+    const __global float4 *gp;
+    __local float *lp;
+
+    // Pull in transform data
+    gp = (const __global float4 *)(gr + (me << 2));  // float4 number `me`; gp[k*64] lies 256*k floats further on
+    float4 zr0 = gp[0*64];
+    float4 zr1 = gp[1*64];
+    float4 zr2 = gp[2*64];
+    float4 zr3 = gp[3*64];
+
+    gp = (const __global float4 *)(gi + (me << 2));  // same addressing for the imaginary parts
+    float4 zi0 = gp[0*64];
+    float4 zi1 = gp[1*64];
+    float4 zi2 = gp[2*64];
+    float4 zi3 = gp[3*64];
+
+    FFT4();
+
+    int4 tbase = (int)(me << 2) + (int4)(0, 1, 2, 3);  // per-lane twiddle base: the float index 4*me + lane loaded into zr0
+    TW4IDDLE4();
+
+    // Save registers
+    // Note that this pointer is not aligned enough to be cast to a float4*
+    lp = lds + ((me << 2) + (me >> 3));  // 4 floats per work-item plus one pad float per 8 work-items
+
+    lp[0] = zr0.x;
+    lp[1] = zr0.y;
+    lp[2] = zr0.z;
+    lp[3] = zr0.w;
+    lp += 66*4;  // every register gets its own 264-float group
+
+    lp[0] = zr1.x;
+    lp[1] = zr1.y;
+    lp[2] = zr1.z;
+    lp[3] = zr1.w;
+    lp += 66*4;
+
+    lp[0] = zr2.x;
+    lp[1] = zr2.y;
+    lp[2] = zr2.z;
+    lp[3] = zr2.w;
+    lp += 66*4;
+
+    lp[0] = zr3.x;
+    lp[1] = zr3.y;
+    lp[2] = zr3.z;
+    lp[3] = zr3.w;
+    lp += 66*4;
+
+    // Imaginary part
+    lp[0] = zi0.x;
+    lp[1] = zi0.y;
+    lp[2] = zi0.z;
+    lp[3] = zi0.w;
+    lp += 66*4;
+
+    lp[0] = zi1.x;
+    lp[1] = zi1.y;
+    lp[2] = zi1.z;
+    lp[3] = zi1.w;
+    lp += 66*4;
+
+    lp[0] = zi2.x;
+    lp[1] = zi2.y;
+    lp[2] = zi2.z;
+    lp[3] = zi2.w;
+    lp += 66*4;
+
+    lp[0] = zi3.x;
+    lp[1] = zi3.y;
+    lp[2] = zi3.z;
+    lp[3] = zi3.w;
+
+    barrier(CLK_LOCAL_MEM_FENCE);  // staged values must be visible before other work-items read them back
+}
+
+// Second pass of 1K FFT: gathers 16 complex points per work-item from lds, runs FFT4 + scalar twiddles, stores them back to lds
+__attribute__((always_inline)) void
+kfft_pass2(uint me, __local float *lds)
+{
+    __local float *lp;
+
+    // Load registers
+    lp = lds + (me + (me >> 5));  // one float per work-item; steps over the extra slot after every 32 floats
+
+    float4 zr0, zr1, zr2, zr3;
+
+    zr0.x = lp[0*66];  // the register number selects the 66-float row ...
+    zr1.x = lp[1*66];
+    zr2.x = lp[2*66];
+    zr3.x = lp[3*66];
+    lp += 66*4;  // ... and the lane (x, y, z, w) selects the 264-float group
+
+    zr0.y = lp[0*66];
+    zr1.y = lp[1*66];
+    zr2.y = lp[2*66];
+    zr3.y = lp[3*66];
+    lp += 66*4;
+
+    zr0.z = lp[0*66];
+    zr1.z = lp[1*66];
+    zr2.z = lp[2*66];
+    zr3.z = lp[3*66];
+    lp += 66*4;
+
+    zr0.w = lp[0*66];
+    zr1.w = lp[1*66];
+    zr2.w = lp[2*66];
+    zr3.w = lp[3*66];
+    lp += 66*4;
+
+    float4 zi0, zi1, zi2, zi3;
+
+    zi0.x = lp[0*66];
+    zi1.x = lp[1*66];
+    zi2.x = lp[2*66];
+    zi3.x = lp[3*66];
+    lp += 66*4;
+
+    zi0.y = lp[0*66];
+    zi1.y = lp[1*66];
+    zi2.y = lp[2*66];
+    zi3.y = lp[3*66];
+    lp += 66*4;
+
+    zi0.z = lp[0*66];
+    zi1.z = lp[1*66];
+    zi2.z = lp[2*66];
+    zi3.z = lp[3*66];
+    lp += 66*4;
+
+    zi0.w = lp[0*66];
+    zi1.w = lp[1*66];
+    zi2.w = lp[2*66];
+    zi3.w = lp[3*66];
+
+    // Transform and twiddle
+    FFT4();
+
+    int tbase = (int)(me << 2);  // scalar base: TWIDDLE4 applies the same angle to all four lanes
+    TWIDDLE4();
+
+    barrier(CLK_LOCAL_MEM_FENCE);  // all loads above must complete before any work-item overwrites lds below
+
+    // Store registers
+    lp = lds + ((me << 2) + (me >> 3));  // 4 floats per work-item plus one pad float per 8 work-items
+
+    lp[0] = zr0.x;  // the four registers' x lanes become four adjacent floats
+    lp[1] = zr1.x;
+    lp[2] = zr2.x;
+    lp[3] = zr3.x;
+    lp += 66*4;
+
+    lp[0] = zr0.y;
+    lp[1] = zr1.y;
+    lp[2] = zr2.y;
+    lp[3] = zr3.y;
+    lp += 66*4;
+
+    lp[0] = zr0.z;
+    lp[1] = zr1.z;
+    lp[2] = zr2.z;
+    lp[3] = zr3.z;
+    lp += 66*4;
+
+    lp[0] = zr0.w;
+    lp[1] = zr1.w;
+    lp[2] = zr2.w;
+    lp[3] = zr3.w;
+    lp += 66*4;
+
+    // Imaginary part
+    lp[0] = zi0.x;
+    lp[1] = zi1.x;
+    lp[2] = zi2.x;
+    lp[3] = zi3.x;
+    lp += 66*4;
+
+    lp[0] = zi0.y;
+    lp[1] = zi1.y;
+    lp[2] = zi2.y;
+    lp[3] = zi3.y;
+    lp += 66*4;
+
+    lp[0] = zi0.z;
+    lp[1] = zi1.z;
+    lp[2] = zi2.z;
+    lp[3] = zi3.z;
+    lp += 66*4;
+
+    lp[0] = zi0.w;
+    lp[1] = zi1.w;
+    lp[2] = zi2.w;
+    lp[3] = zi3.w;
+
+    barrier(CLK_LOCAL_MEM_FENCE);  // stores must be visible before other work-items read lds again
+}
+
+// Third pass of 1K FFT: same gather as pass 2, FFT4 + scalar twiddles, then stores to lds column-wise (no padding offset)
+__attribute__((always_inline)) void
+kfft_pass3(uint me, __local float *lds)
+{
+    __local float *lp;
+
+    // Load registers
+    lp = lds + (me + (me >> 5));  // one float per work-item; steps over the extra slot after every 32 floats
+
+    float4 zr0, zr1, zr2, zr3;
+
+    zr0.x = lp[0*66];  // the register number selects the 66-float row ...
+    zr1.x = lp[1*66];
+    zr2.x = lp[2*66];
+    zr3.x = lp[3*66];
+    lp += 66*4;  // ... and the lane (x, y, z, w) selects the 264-float group
+
+    zr0.y = lp[0*66];
+    zr1.y = lp[1*66];
+    zr2.y = lp[2*66];
+    zr3.y = lp[3*66];
+    lp += 66*4;
+
+    zr0.z = lp[0*66];
+    zr1.z = lp[1*66];
+    zr2.z = lp[2*66];
+    zr3.z = lp[3*66];
+    lp += 66*4;
+
+    zr0.w = lp[0*66];
+    zr1.w = lp[1*66];
+    zr2.w = lp[2*66];
+    zr3.w = lp[3*66];
+    lp += 66*4;
+
+    float4 zi0, zi1, zi2, zi3;
+
+    zi0.x = lp[0*66];
+    zi1.x = lp[1*66];
+    zi2.x = lp[2*66];
+    zi3.x = lp[3*66];
+    lp += 66*4;
+
+    zi0.y = lp[0*66];
+    zi1.y = lp[1*66];
+    zi2.y = lp[2*66];
+    zi3.y = lp[3*66];
+    lp += 66*4;
+
+    zi0.z = lp[0*66];
+    zi1.z = lp[1*66];
+    zi2.z = lp[2*66];
+    zi3.z = lp[3*66];
+    lp += 66*4;
+
+    zi0.w = lp[0*66];
+    zi1.w = lp[1*66];
+    zi2.w = lp[2*66];
+    zi3.w = lp[3*66];
+
+    // Transform and twiddle
+    FFT4();
+
+    int tbase = (int)((me >> 2) << 4);  // 16 * (me / 4): four consecutive work-items share one twiddle base
+    TWIDDLE4();
+
+    barrier(CLK_LOCAL_MEM_FENCE);  // all loads above must complete before any work-item overwrites lds below
+
+    // Save registers
+    lp = lds + me;  // column `me`, with no padding term this time
+
+    lp[0*66] = zr0.x;  // a register's four lanes go to four consecutive 66-float rows
+    lp[1*66] = zr0.y;
+    lp[2*66] = zr0.z;
+    lp[3*66] = zr0.w;
+    lp += 66*4;  // the next register goes to the next 264-float group
+
+    lp[0*66] = zr1.x;
+    lp[1*66] = zr1.y;
+    lp[2*66] = zr1.z;
+    lp[3*66] = zr1.w;
+    lp += 66*4;
+
+    lp[0*66] = zr2.x;
+    lp[1*66] = zr2.y;
+    lp[2*66] = zr2.z;
+    lp[3*66] = zr2.w;
+    lp += 66*4;
+
+    lp[0*66] = zr3.x;
+    lp[1*66] = zr3.y;
+    lp[2*66] = zr3.z;
+    lp[3*66] = zr3.w;
+    lp += 66*4;
+
+    // Imaginary part
+    lp[0*66] = zi0.x;
+    lp[1*66] = zi0.y;
+    lp[2*66] = zi0.z;
+    lp[3*66] = zi0.w;
+    lp += 66*4;
+
+    lp[0*66] = zi1.x;
+    lp[1*66] = zi1.y;
+    lp[2*66] = zi1.z;
+    lp[3*66] = zi1.w;
+    lp += 66*4;
+
+    lp[0*66] = zi2.x;
+    lp[1*66] = zi2.y;
+    lp[2*66] = zi2.z;
+    lp[3*66] = zi2.w;
+    lp += 66*4;
+
+    lp[0*66] = zi3.x;
+    lp[1*66] = zi3.y;
+    lp[2*66] = zi3.z;
+    lp[3*66] = zi3.w;
+
+    barrier(CLK_LOCAL_MEM_FENCE);  // stores must be visible before other work-items read lds again
+}
+
+// Fourth pass of 1K FFT: gathers from the stride-66 layout, FFT4 + scalar twiddles, stores back with a 68-float row stride
+__attribute__((always_inline)) void
+kfft_pass4(uint me, __local float *lds)
+{
+    __local float *lp;
+
+    // Load registers
+    lp = lds + ((me & 0x3) + ((me >> 2) & 0x3)*(66*4) + ((me >> 4) << 2));  // column (me & 3) + 4*(me >> 4), group (me >> 2) & 3
+
+    float4 zr0, zr1, zr2, zr3;
+
+    zr0.x = lp[0*66];  // a register's four lanes come from four consecutive 66-float rows
+    zr0.y = lp[1*66];
+    zr0.z = lp[2*66];
+    zr0.w = lp[3*66];
+    lp += 16;  // the next register sits 16 columns further along the same rows
+
+    zr1.x = lp[0*66];
+    zr1.y = lp[1*66];
+    zr1.z = lp[2*66];
+    zr1.w = lp[3*66];
+    lp += 16;
+
+    zr2.x = lp[0*66];
+    zr2.y = lp[1*66];
+    zr2.z = lp[2*66];
+    zr2.w = lp[3*66];
+    lp += 16;
+
+    zr3.x = lp[0*66];
+    zr3.y = lp[1*66];
+    zr3.z = lp[2*66];
+    zr3.w = lp[3*66];
+    lp += 66*4*4 - 3*16;  // rewind the three +16 steps and skip 16 rows of 66 floats to reach the imaginary values
+
+    float4 zi0, zi1, zi2, zi3;
+
+    zi0.x = lp[0*66];
+    zi0.y = lp[1*66];
+    zi0.z = lp[2*66];
+    zi0.w = lp[3*66];
+    lp += 16;
+
+    zi1.x = lp[0*66];
+    zi1.y = lp[1*66];
+    zi1.z = lp[2*66];
+    zi1.w = lp[3*66];
+    lp += 16;
+
+    zi2.x = lp[0*66];
+    zi2.y = lp[1*66];
+    zi2.z = lp[2*66];
+    zi2.w = lp[3*66];
+    lp += 16;
+
+    zi3.x = lp[0*66];
+    zi3.y = lp[1*66];
+    zi3.z = lp[2*66];
+    zi3.w = lp[3*66];
+
+    // Transform and twiddle
+    FFT4();
+
+    int tbase = (int)((me >> 4) << 6);  // 64 * (me / 16): sixteen consecutive work-items share one twiddle base
+    TWIDDLE4();
+
+    barrier(CLK_LOCAL_MEM_FENCE);  // all loads above must complete before any work-item overwrites lds below
+
+    // Save registers in conflict free manner
+    lp = lds + me;  // column `me`; note the row stride changes from 66 to 68 floats for this store
+
+    lp[0*68] = zr0.x;
+    lp[1*68] = zr0.y;
+    lp[2*68] = zr0.z;
+    lp[3*68] = zr0.w;
+    lp += 68*4;  // the next register goes to the next 272-float group
+
+    lp[0*68] = zr1.x;
+    lp[1*68] = zr1.y;
+    lp[2*68] = zr1.z;
+    lp[3*68] = zr1.w;
+    lp += 68*4;
+
+    lp[0*68] = zr2.x;
+    lp[1*68] = zr2.y;
+    lp[2*68] = zr2.z;
+    lp[3*68] = zr2.w;
+    lp += 68*4;
+
+    lp[0*68] = zr3.x;
+    lp[1*68] = zr3.y;
+    lp[2*68] = zr3.z;
+    lp[3*68] = zr3.w;
+    lp += 68*4;
+
+    // Imaginary part
+    lp[0*68] = zi0.x;
+    lp[1*68] = zi0.y;
+    lp[2*68] = zi0.z;
+    lp[3*68] = zi0.w;
+    lp += 68*4;
+
+    lp[0*68] = zi1.x;
+    lp[1*68] = zi1.y;
+    lp[2*68] = zi1.z;
+    lp[3*68] = zi1.w;
+    lp += 68*4;
+
+    lp[0*68] = zi2.x;
+    lp[1*68] = zi2.y;
+    lp[2*68] = zi2.z;
+    lp[3*68] = zi2.w;
+    lp += 68*4;
+
+    lp[0*68] = zi3.x;
+    lp[1*68] = zi3.y;
+    lp[2*68] = zi3.z;
+    lp[3*68] = zi3.w;
+
+    barrier(CLK_LOCAL_MEM_FENCE);  // stores must be visible before other work-items read lds again
+}
+
+// Fifth and last pass of 1K FFT: gathers from the stride-68 lds layout, final FFT4 (no twiddles), writes float4 results to gr/gi
+__attribute__((always_inline)) void
+kfft_pass5(uint me,
+	   const __local float *lds,
+	   __global float *gr, __global float *gi)
+{
+    const __local float *lp;
+
+    // Load registers
+    lp = lds + ((me & 0xf) + (me >> 4)*(68*4));  // column me & 15 of the 272-float group me >> 4
+
+    float4 zr0, zr1, zr2, zr3;
+
+    zr0.x = lp[0*68];  // a register's four lanes come from four consecutive 68-float rows
+    zr0.y = lp[1*68];
+    zr0.z = lp[2*68];
+    zr0.w = lp[3*68];
+    lp += 16;  // the next register sits 16 columns further along the same rows
+
+    zr1.x = lp[0*68];
+    zr1.y = lp[1*68];
+    zr1.z = lp[2*68];
+    zr1.w = lp[3*68];
+    lp += 16;
+
+    zr2.x = lp[0*68];
+    zr2.y = lp[1*68];
+    zr2.z = lp[2*68];
+    zr2.w = lp[3*68];
+    lp += 16;
+
+    zr3.x = lp[0*68];
+    zr3.y = lp[1*68];
+    zr3.z = lp[2*68];
+    zr3.w = lp[3*68];
+
+    lp += 68*4*4 - 3*16;  // rewind the three +16 steps and skip 16 rows of 68 floats to reach the imaginary values
+
+    float4 zi0, zi1, zi2, zi3;
+
+    zi0.x = lp[0*68];
+    zi0.y = lp[1*68];
+    zi0.z = lp[2*68];
+    zi0.w = lp[3*68];
+    lp += 16;
+
+    zi1.x = lp[0*68];
+    zi1.y = lp[1*68];
+    zi1.z = lp[2*68];
+    zi1.w = lp[3*68];
+    lp += 16;
+
+    zi2.x = lp[0*68];
+    zi2.y = lp[1*68];
+    zi2.z = lp[2*68];
+    zi2.w = lp[3*68];
+    lp += 16;
+
+    zi3.x = lp[0*68];
+    zi3.y = lp[1*68];
+    zi3.z = lp[2*68];
+    zi3.w = lp[3*68];
+
+    // Transform
+    FFT4();
+
+    // Save result
+    __global float4 *gp = (__global float4 *)(gr + (me << 2));  // float4 number `me`; gp[k*64] lies 256*k floats further on
+    gp[0*64] = zr0;
+    gp[1*64] = zr1;
+    gp[2*64] = zr2;
+    gp[3*64] = zr3;
+
+    gp = (__global float4 *)(gi + (me << 2));  // same addressing for the imaginary parts
+    gp[0*64] = zi0;
+    gp[1*64] = zi1;
+    gp[2*64] = zi2;
+    gp[3*64] = zi3;
+}
+
+// Distance (in floats) between first real element of successive 1K vectors
+// It must be >= 1024, and a multiple of 4 (the passes access the data through float4 pointers)
+#define VSTRIDE (1024+0)
+
+// Performs a 1K complex FFT with every 64 global ids.
+// Each vector is a multiple of VSTRIDE from the first
+// Number of global ids must be a multiple of 64, e.g. 1024*64
+//
+//   greal - pointer to input and output real part of data
+//   gimag - pointer to input and output imaginary part of data
+__kernel void
+forward(__global float *greal, __global float *gimag)
+{
+    // This is 8704 bytes (2176 floats): 16 rows of 68 floats for the real parts, then 16 rows for the imaginary parts
+    __local float lds[68*4*4*2];
+    // NOTE(review): lds is per work-group but indexed via gid & 0x3f, so this appears to need a local size of exactly 64 - confirm against the host code
+    __global float *gr;
+    __global float *gi;
+    uint gid = get_global_id(0);
+    uint me = gid & 0x3fU;  // position within the group of 64 ids that shares one transform
+    uint dg = (gid >> 6) * VSTRIDE;  // float offset of that group's vector
+
+    gr = greal + dg;
+    gi = gimag + dg;
+
+    kfft_pass1(me, gr, gi, lds);
+    kfft_pass2(me, lds);
+    kfft_pass3(me, lds);
+    kfft_pass4(me, lds);
+    kfft_pass5(me, lds, gr, gi);  // writes the result back over the input in gr/gi
+}
+
diff --git a/src/main/java/com/aparapi/examples/extension/mandel.cl b/src/main/java/com/aparapi/examples/extension/mandel.cl
index b6c1da01..5f1b50fa 100644
--- a/src/main/java/com/aparapi/examples/extension/mandel.cl
+++ b/src/main/java/com/aparapi/examples/extension/mandel.cl
@@ -1,91 +1,91 @@
-#define MAX_ITERATIONS 64
-
-__constant const int pallette[]={
-   -65536,
-   -59392,
-   -53248,
-   -112640,
-   -106752,
-   -166144,
-   -160256,
-   -219904,
-   -279552,
-   -339200,
-   -399104,
-   -985344,
-   -2624000,
-   -4197376,
-   -5770496,
-   -7343872,
-   -8851712,
-   -10425088,
-   -11932928,
-   -13375232,
-   -14817792,
-   -16260096,
-   -16719602,
-   -16720349,
-   -16721097,
-   -16721846,
-   -16722595,
-   -16723345,
-   -16724351,
-   -16725102,
-   -16726110,
-   -16727119,
-   -16728129,
-   -16733509,
-   -16738889,
-   -16744269,
-   -16749138,
-   -16754006,
-   -16758619,
-   -16762976,
-   -16767077,
-   -16771178,
-   -16774767,
-   -16514932,
-   -15662970,
-   -14942079,
-   -14221189,
-   -13631371,
-   -13107088,
-   -12648342,
-   -12320669,
-   -11992995,
-   -11796393,
-   -11665328,
-   -11993019,
-   -12386248,
-   -12845011,
-   -13303773,
-   -13762534,
-   -14286830,
-   -14745588,
-   -15269881,
-   -15728637,
-   -16252927, 
-   0
-};
-
-__kernel void createMandleBrot(
-    float scale, 
-    float offsetx, 
-    float offsety, 
-    __global int *rgb 
-){
-    int gid = get_global_id(0) + get_global_id(1)*get_global_size(0);
-    float x = ((((float)(get_global_id(0)) * scale) - ((scale / 2.0f) * (float)get_global_size(0))) / (float)get_global_size(0)) + offsetx;
-    float y = ((((float)(get_global_id(1)) * scale) - ((scale / 2.0f) * (float)get_global_size(1))) / (float)get_global_size(1)) + offsety;
-    int count = 0;
-    float zx = x;
-    float zy = y;
-    float new_zx = 0.0f;
-    for (; count<MAX_ITERATIONS && ((zx * zx) + (zy * zy))<8.0f; count++){
-        new_zx = ((zx * zx) - (zy * zy)) + x;
-        zy = ((2.0f * zx) * zy) + y;
-        zx = new_zx;
-    }
-    rgb[gid]  = pallette[count];
-}
-
+#define MAX_ITERATIONS 64
+
+__constant const int pallette[]={
+   -65536,
+   -59392,
+   -53248,
+   -112640,
+   -106752,
+   -166144,
+   -160256,
+   -219904,
+   -279552,
+   -339200,
+   -399104,
+   -985344,
+   -2624000,
+   -4197376,
+   -5770496,
+   -7343872,
+   -8851712,
+   -10425088,
+   -11932928,
+   -13375232,
+   -14817792,
+   -16260096,
+   -16719602,
+   -16720349,
+   -16721097,
+   -16721846,
+   -16722595,
+   -16723345,
+   -16724351,
+   -16725102,
+   -16726110,
+   -16727119,
+   -16728129,
+   -16733509,
+   -16738889,
+   -16744269,
+   -16749138,
+   -16754006,
+   -16758619,
+   -16762976,
+   -16767077,
+   -16771178,
+   -16774767,
+   -16514932,
+   -15662970,
+   -14942079,
+   -14221189,
+   -13631371,
+   -13107088,
+   -12648342,
+   -12320669,
+   -11992995,
+   -11796393,
+   -11665328,
+   -11993019,
+   -12386248,
+   -12845011,
+   -13303773,
+   -13762534,
+   -14286830,
+   -14745588,
+   -15269881,
+   -15728637,
+   -16252927, 
+   0
+};
+// One work-item per point of a 2D range: iterate z = z*z + c starting from z = c, then store the pallette entry for the iteration count
+__kernel void createMandleBrot(
+    float scale,  // extent of the view: x and y each span [offset - scale/2, offset + scale/2)
+    float offsetx,  // x coordinate at the middle of the view
+    float offsety,  // y coordinate at the middle of the view
+    __global int *rgb  // output, one int per work-item, row-major
+){
+    int gid = get_global_id(0) + get_global_id(1)*get_global_size(0);  // row-major linear index of this work-item
+    float x = ((((float)(get_global_id(0)) * scale) - ((scale / 2.0f) * (float)get_global_size(0))) / (float)get_global_size(0)) + offsetx;
+    float y = ((((float)(get_global_id(1)) * scale) - ((scale / 2.0f) * (float)get_global_size(1))) / (float)get_global_size(1)) + offsety;
+    int count = 0;
+    float zx = x;
+    float zy = y;
+    float new_zx = 0.0f;
+    for (; count<MAX_ITERATIONS && ((zx * zx) + (zy * zy))<8.0f; count++){  // stop after MAX_ITERATIONS steps or once |z|^2 reaches 8
+        new_zx = ((zx * zx) - (zy * zy)) + x;
+        zy = ((2.0f * zx) * zy) + y;  // uses the old zx; new_zx is committed on the next line
+        zx = new_zx;
+    }
+    rgb[gid]  = pallette[count];  // count == MAX_ITERATIONS (never escaped) selects the table's final entry, 0
+}
+
diff --git a/src/main/java/com/aparapi/examples/extension/mandel2.cl b/src/main/java/com/aparapi/examples/extension/mandel2.cl
index e7043427..79956b78 100644
--- a/src/main/java/com/aparapi/examples/extension/mandel2.cl
+++ b/src/main/java/com/aparapi/examples/extension/mandel2.cl
@@ -1,95 +1,95 @@
-#define MAX_ITERATIONS 64
-
-__constant const int pallette[]={
-   -65536,
-   -59392,
-   -53248,
-   -112640,
-   -106752,
-   -166144,
-   -160256,
-   -219904,
-   -279552,
-   -339200,
-   -399104,
-   -985344,
-   -2624000,
-   -4197376,
-   -5770496,
-   -7343872,
-   -8851712,
-   -10425088,
-   -11932928,
-   -13375232,
-   -14817792,
-   -16260096,
-   -16719602,
-   -16720349,
-   -16721097,
-   -16721846,
-   -16722595,
-   -16723345,
-   -16724351,
-   -16725102,
-   -16726110,
-   -16727119,
-   -16728129,
-   -16733509,
-   -16738889,
-   -16744269,
-   -16749138,
-   -16754006,
-   -16758619,
-   -16762976,
-   -16767077,
-   -16771178,
-   -16774767,
-   -16514932,
-   -15662970,
-   -14942079,
-   -14221189,
-   -13631371,
-   -13107088,
-   -12648342,
-   -12320669,
-   -11992995,
-   -11796393,
-   -11665328,
-   -11993019,
-   -12386248,
-   -12845011,
-   -13303773,
-   -13762534,
-   -14286830,
-   -14745588,
-   -15269881,
-   -15728637,
-   -16252927, 
-   0
-};
-
-#define WIDTH get_global_size(0)
-#define HEIGHT get_global_size(1)
-#define X get_global_id(0)
-#define Y get_global_id(1)
-
-__kernel void createMandleBrot(
-      float scale, 
-      float offsetx, 
-      float offsety, 
-      __global int *rgb 
-      ){
-   float x = ((((float)(X) * scale) - ((scale / 2.0f) * (float)WIDTH)) / (float)WIDTH) + offsetx;
-   float y = ((((float)(Y) * scale) - ((scale / 2.0f) * (float)HEIGHT)) / (float)HEIGHT) + offsety;
-   float zx = x;
-   float zy = y;
-   float new_zx = 0.0f;
-   int count = 0;
-   for (; count<MAX_ITERATIONS && ((zx * zx) + (zy * zy))<8.0f; count++){
-      new_zx = ((zx * zx) - (zy * zy)) + x;
-      zy = ((2.0f * zx) * zy) + y;
-      zx = new_zx;
-   }
-   rgb[X + Y*WIDTH]  = pallette[count];
-}
-
+#define MAX_ITERATIONS 64
+
+__constant const int pallette[]={
+   -65536,
+   -59392,
+   -53248,
+   -112640,
+   -106752,
+   -166144,
+   -160256,
+   -219904,
+   -279552,
+   -339200,
+   -399104,
+   -985344,
+   -2624000,
+   -4197376,
+   -5770496,
+   -7343872,
+   -8851712,
+   -10425088,
+   -11932928,
+   -13375232,
+   -14817792,
+   -16260096,
+   -16719602,
+   -16720349,
+   -16721097,
+   -16721846,
+   -16722595,
+   -16723345,
+   -16724351,
+   -16725102,
+   -16726110,
+   -16727119,
+   -16728129,
+   -16733509,
+   -16738889,
+   -16744269,
+   -16749138,
+   -16754006,
+   -16758619,
+   -16762976,
+   -16767077,
+   -16771178,
+   -16774767,
+   -16514932,
+   -15662970,
+   -14942079,
+   -14221189,
+   -13631371,
+   -13107088,
+   -12648342,
+   -12320669,
+   -11992995,
+   -11796393,
+   -11665328,
+   -11993019,
+   -12386248,
+   -12845011,
+   -13303773,
+   -13762534,
+   -14286830,
+   -14745588,
+   -15269881,
+   -15728637,
+   -16252927, 
+   0
+};
+// Shorthands for the 2D global range: WIDTH/HEIGHT are its sizes, X/Y are this work-item's ids
+#define WIDTH get_global_size(0)
+#define HEIGHT get_global_size(1)
+#define X get_global_id(0)
+#define Y get_global_id(1)
+// One work-item per point of the range: iterate z = z*z + c starting from z = c, then store the pallette entry for the iteration count
+__kernel void createMandleBrot(
+      float scale,  // extent of the view: x and y each span [offset - scale/2, offset + scale/2)
+      float offsetx,  // x coordinate at the middle of the view
+      float offsety,  // y coordinate at the middle of the view
+      __global int *rgb  // output, one int per work-item, row-major
+      ){
+   float x = ((((float)(X) * scale) - ((scale / 2.0f) * (float)WIDTH)) / (float)WIDTH) + offsetx;
+   float y = ((((float)(Y) * scale) - ((scale / 2.0f) * (float)HEIGHT)) / (float)HEIGHT) + offsety;
+   float zx = x;
+   float zy = y;
+   float new_zx = 0.0f;
+   int count = 0;
+   for (; count<MAX_ITERATIONS && ((zx * zx) + (zy * zy))<8.0f; count++){  // stop after MAX_ITERATIONS steps or once |z|^2 reaches 8
+      new_zx = ((zx * zx) - (zy * zy)) + x;
+      zy = ((2.0f * zx) * zy) + y;  // uses the old zx; new_zx is committed on the next line
+      zx = new_zx;
+   }
+   rgb[X + Y*WIDTH]  = pallette[count];  // count == MAX_ITERATIONS (never escaped) selects the table's final entry, 0
+}
+
diff --git a/src/main/java/com/aparapi/examples/extension/squarer.cl b/src/main/java/com/aparapi/examples/extension/squarer.cl
index 7169cfc1..78c208da 100644
--- a/src/main/java/com/aparapi/examples/extension/squarer.cl
+++ b/src/main/java/com/aparapi/examples/extension/squarer.cl
@@ -1,5 +1,5 @@
-__kernel void square( __global float *in, __global float *out){
-   const size_t id = get_global_id(0);
-   out[id] = in[id]*in[id];
-}
-
+__kernel void square( __global float *in, __global float *out){  // element-wise: out[i] = in[i] * in[i]
+   const size_t id = get_global_id(0);  // one element per work-item of dimension 0
+   out[id] = in[id]*in[id];
+}
+
diff --git a/src/main/java/com/aparapi/examples/javaonedemo/NBody.java b/src/main/java/com/aparapi/examples/javaonedemo/NBody.java
index 74b41f41..93d997c8 100644
--- a/src/main/java/com/aparapi/examples/javaonedemo/NBody.java
+++ b/src/main/java/com/aparapi/examples/javaonedemo/NBody.java
@@ -125,10 +125,6 @@ public class NBody{
 
       private final float[] vxyz; // velocity component of x,y and z of bodies 
 
-      /**
-       * Constructor initializes xyz and vxyz arrays.
-       * @param _bodies
-       */
       public NBodyKernel(Range _range) {
          range = _range;
          // range = Range.create(bodies);
@@ -245,8 +241,6 @@ public class NBody{
       });
       controlPanel.add(startButton);
 
-      //  controlPanel.add(new JLabel("   Particles"));
-
       final String[] choices = new String[] {
             // "Java Sequential",
             "Java Threads",
@@ -259,16 +253,10 @@ public class NBody{
          @Override public void itemStateChanged(ItemEvent e) {
             final String item = (String) modeButton.getSelectedItem();
 
-            // if (item.equals(choices[2])) {
-            // modeButton = gpuMandelBrot;
-            //   } else 
             if (item.equals(choices[0])) {
                kernel.setExecutionMode(Kernel.EXECUTION_MODE.JTP);
 
-               // modeButton = javaMandelBrot;
             } else if (item.equals(choices[1])) {
-               // lifeKernel = lifeKernelGPU;
-               // modeButton = javaMandelBrotMultiThread;
                kernel.setExecutionMode(Kernel.EXECUTION_MODE.GPU);
             }
          }
@@ -326,7 +314,7 @@ public class NBody{
             gl.glColor3f(1f, 1f, 1f);
 
             final GLU glu = new GLU();
-            glu.gluPerspective(45f, ratio, 0f, 1000f);
+            glu.gluPerspective(45f, ratio, 1f, 1000f);
 
             glu.gluLookAt(xeye, yeye, zeye * zoomFactor, xat, yat, zat, 0f, 1f, 0f);
             if (running) {
@@ -368,12 +356,12 @@ public class NBody{
             gl.glEnable(GL.GL_BLEND);
             gl.glBlendFunc(GL.GL_SRC_ALPHA, GL.GL_ONE);
             try {
-               final InputStream textureStream = NBody.class.getResourceAsStream("particle.jpg");
+               final InputStream textureStream = NBody.class.getResourceAsStream("/particle.jpg");
+               if( textureStream == null )
+                  throw new IllegalStateException("Could not access particle.jpg resource");
                texture = TextureIO.newTexture(textureStream, false, null);
-            } catch (final IOException e) {
-               e.printStackTrace();
-            } catch (final GLException e) {
-               e.printStackTrace();
+            } catch (final IOException | GLException e) {
+               throw new IllegalStateException("Could not create texture", e);
             }
 
          }
diff --git a/src/main/java/com/aparapi/examples/median/MedianDemo.java b/src/main/java/com/aparapi/examples/median/MedianDemo.java
index b565839d..3f67532c 100644
--- a/src/main/java/com/aparapi/examples/median/MedianDemo.java
+++ b/src/main/java/com/aparapi/examples/median/MedianDemo.java
@@ -32,6 +32,7 @@ import javax.swing.*;
 import java.awt.*;
 import java.awt.image.*;
 import java.io.*;
+import java.net.URISyntaxException;
 
 /**
  * Demonstrate use of __private namespaces and @NoCL annotations.
@@ -41,12 +42,12 @@ public class MedianDemo {
 
    static {
-      try {
-         File imageFile = new File("./src/main/resources/testcard.jpg").getCanonicalFile();
-         if (imageFile.exists()) {
-            testImage = ImageIO.read(imageFile);
-         }
-      } catch (IOException e) {
-         throw new RuntimeException(e);
+      try (InputStream imageStream = MedianDemo.class.getResourceAsStream("/testcard.jpg")) { // stream, not File: a classpath entry inside a jar has no file path
+         if (imageStream == null) { // resource missing from the classpath: fail with a clear message instead of an NPE
+            throw new IllegalStateException("Could not find /testcard.jpg on the classpath");
+         }
+         testImage = ImageIO.read(imageStream); // read() leaves the stream open; try-with-resources closes it
+      } catch (IOException e) {
+         throw new IllegalStateException("Could not open image", e);
       }
    }
 
@@ -63,18 +64,6 @@ public class MedianDemo {
           System.setProperty("com.aparapi.enableExecutionModeReporting", "true");
       }
 
-//      KernelManager.setKernelManager(new KernelManager(){
-//         @Override
-//         protected Comparator<OpenCLDevice> getDefaultGPUComparator() {
-//            return new Comparator<OpenCLDevice>() {
-//               @Override
-//               public int compare(OpenCLDevice o1, OpenCLDevice o2) {
-//                  return o2.getMaxComputeUnits() - o1.getMaxComputeUnits();
-//               }
-//            };
-//         }
-//      });
-
       System.out.println(KernelManager.instance().bestDevice());
 
       int[] argbs = testImage.getRGB(0, 0, testImage.getWidth(), testImage.getHeight(), null, 0, testImage.getWidth());
diff --git a/src/main/java/com/aparapi/examples/nbody/Local.java b/src/main/java/com/aparapi/examples/nbody/Local.java
index 93926b16..837d349d 100644
--- a/src/main/java/com/aparapi/examples/nbody/Local.java
+++ b/src/main/java/com/aparapi/examples/nbody/Local.java
@@ -121,10 +121,6 @@ public class Local{
 
       @Local private final float[] localStuff; // local memory
 
-      /**
-       * Constructor initializes xyz and vxyz arrays.
-       * @param _bodies
-       */
       public NBodyKernel(Range _range) {
          range = _range;
          localStuff = new float[range.getLocalSize(0) * 3];
@@ -307,7 +303,7 @@ public class Local{
             gl.glColor3f(1f, 1f, 1f);
 
             final GLU glu = new GLU();
-            glu.gluPerspective(45f, ratio, 0f, 1000f);
+            glu.gluPerspective(45f, ratio, 1f, 1000f);
 
             glu.gluLookAt(xeye, yeye, zeye * zoomFactor, xat, yat, zat, 0f, 1f, 0f);
             if (running) {
@@ -351,7 +347,7 @@ public class Local{
             gl.glEnable(GL.GL_BLEND);
             gl.glBlendFunc(GL.GL_SRC_ALPHA, GL.GL_ONE);
             try {
-               final InputStream textureStream = Local.class.getResourceAsStream("particle.jpg");
+               final InputStream textureStream = Local.class.getResourceAsStream("/particle.jpg");
                final Texture texture = TextureIO.newTexture(textureStream, false, null);
                texture.enable(gl);
             } catch (final IOException e) {
diff --git a/src/main/java/com/aparapi/examples/nbody/Main.java b/src/main/java/com/aparapi/examples/nbody/Main.java
index 5f8fdfaa..e80fc948 100644
--- a/src/main/java/com/aparapi/examples/nbody/Main.java
+++ b/src/main/java/com/aparapi/examples/nbody/Main.java
@@ -309,7 +309,7 @@ public class Main{
             gl.glColor3f(1f, 1f, 1f);
 
             final GLU glu = new GLU();
-            glu.gluPerspective(45f, ratio, 0f, 1000f);
+            glu.gluPerspective(45f, ratio, 1f, 1000f);
 
             glu.gluLookAt(xeye, yeye, zeye * zoomFactor, xat, yat, zat, 0f, 1f, 0f);
             if (running) {
@@ -356,7 +356,7 @@ public class Main{
             gl.glTexParameteri(GL.GL_TEXTURE_2D, GL.GL_TEXTURE_MAG_FILTER, GL.GL_LINEAR);
             gl.glTexParameteri(GL.GL_TEXTURE_2D, GL.GL_TEXTURE_MIN_FILTER, GL.GL_NEAREST);
             try {
-               final InputStream textureStream = Main.class.getResourceAsStream("particle.jpg");
+               final InputStream textureStream = Main.class.getResourceAsStream("/particle.jpg");
                TextureData data = TextureIO.newTextureData(profile, textureStream, false, "jpg");
                texture = TextureIO.newTexture(data);
             } catch (final IOException e) {
diff --git a/src/main/java/com/aparapi/examples/nbody/Seq.java b/src/main/java/com/aparapi/examples/nbody/Seq.java
index 54a128b6..a6b18135 100644
--- a/src/main/java/com/aparapi/examples/nbody/Seq.java
+++ b/src/main/java/com/aparapi/examples/nbody/Seq.java
@@ -311,7 +311,7 @@ public class Seq{
             gl.glColor3f(1f, 1f, 1f);
 
             final GLU glu = new GLU();
-            glu.gluPerspective(45f, ratio, 0f, 1000f);
+            glu.gluPerspective(45f, ratio, 1f, 1000f);
 
             glu.gluLookAt(xeye, yeye, zeye * zoomFactor, xat, yat, zat, 0f, 1f, 0f);
             if (running) {
@@ -347,7 +347,7 @@ public class Seq{
             gl.glTexParameteri(GL.GL_TEXTURE_2D, GL.GL_TEXTURE_MAG_FILTER, GL.GL_LINEAR);
             gl.glTexParameteri(GL.GL_TEXTURE_2D, GL.GL_TEXTURE_MIN_FILTER, GL.GL_NEAREST);
             try {
-               final InputStream textureStream = Seq.class.getResourceAsStream("particle.jpg");
+               final InputStream textureStream = Seq.class.getResourceAsStream("/particle.jpg");
                TextureData data = TextureIO.newTextureData(profile, textureStream, false, "jpg");
                texture = TextureIO.newTexture(data);
             } catch (final IOException e) {
diff --git a/src/main/java/com/aparapi/examples/oopnbody/Main.java b/src/main/java/com/aparapi/examples/oopnbody/Main.java
index fae1ac0d..19c2380a 100644
--- a/src/main/java/com/aparapi/examples/oopnbody/Main.java
+++ b/src/main/java/com/aparapi/examples/oopnbody/Main.java
@@ -321,7 +321,7 @@ public class Main{
             gl.glColor3f(1f, 1f, 1f);
 
             final GLU glu = new GLU();
-            glu.gluPerspective(45f, ratio, 0f, 1000f);
+            glu.gluPerspective(45f, ratio, 1f, 1000f);
 
             glu.gluLookAt(xeye, yeye, zeye * zoomFactor, xat, yat, zat, 0f, 1f, 0f);
             if (running) {
@@ -360,7 +360,7 @@ public class Main{
             gl.glTexParameteri(GL.GL_TEXTURE_2D, GL.GL_TEXTURE_MAG_FILTER, GL.GL_LINEAR);
             gl.glTexParameteri(GL.GL_TEXTURE_2D, GL.GL_TEXTURE_MIN_FILTER, GL.GL_NEAREST);
             try {
-               final InputStream textureStream = Main.class.getResourceAsStream("particle.jpg");
+               final InputStream textureStream = Main.class.getResourceAsStream("/particle.jpg");
                texture = TextureIO.newTexture(textureStream, false, null);
                texture.enable(gl);
             } catch (final IOException e) {
-- 
GitLab