diff --git a/CHANGELOG.md b/CHANGELOG.md index ba6f4814e90d2cac4ad3addbae1a092a4b8e4efa..32898a137d0470ddc79807ceedc24b54927656a5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ * Fixed NPE bug for Kernel.getProfileReportCurrentThread(device) and similar methods * Fixed bug where ClassModel would throw an error when loaded if boot strap methods were 0. * Aparapi can now run on any OpenCL version rather than failing on untested versions it produces a warning. +* Fixed Java Alternative Algorithm not working for arbitrary NDRanges #142, #5 +* New Range method API to deal with the fact Ranges need to be bound to the Device and Kernel instances +* Fixed Range computation of local size for 1D ranges, where the algorithm could exceed the max. kernel and device allowed work group size +* Reworked Profiling support to deal with the possibility of Kernel compilation being decoupled from the kernel execution * Updated the following dependency versions: ** com.aparapi: aparapi-jni 1.4.2 -> 1.4.3 ** org.apache.bcel:bcel 6.4.1 -< 6.5.0 diff --git a/src/main/java/com/aparapi/Kernel.java b/src/main/java/com/aparapi/Kernel.java index 4b9686dbe07db2168cef1675122e85aacb97b20b..b01f5e57efae92741c5ef091a1598f6bf3a2f5ec 100644 --- a/src/main/java/com/aparapi/Kernel.java +++ b/src/main/java/com/aparapi/Kernel.java @@ -90,6 +90,7 @@ import java.util.logging.Logger; import com.aparapi.device.Device; import com.aparapi.device.JavaDevice; import com.aparapi.device.OpenCLDevice; +import com.aparapi.exception.AparapiKernelFailedException; import com.aparapi.exception.CompileFailedException; import com.aparapi.internal.kernel.IKernelBarrier; import com.aparapi.internal.kernel.KernelArg; @@ -342,6 +343,9 @@ public abstract class Kernel implements Cloneable { public abstract void run(); public Kernel execute(Range _range) { + if (!_range.isSameKernel(Kernel.this)) { + throw new AparapiKernelFailedException("Cannot execute kernel with the specified Range.
It is targetting a different Kernel instance"); + } return (Kernel.this.execute("foo", _range, 1)); } } @@ -2801,6 +2805,9 @@ public abstract class Kernel implements Cloneable { * */ public synchronized Kernel execute(Range _range) { + if (!_range.isSameKernel(this)) { + throw new AparapiKernelFailedException("Cannot execute kernel with the specified Range. It is targetting a different Kernel instance"); + } return (execute(_range, 1)); } @@ -2843,10 +2850,10 @@ public abstract class Kernel implements Cloneable { protected Range createRange(int _range) { if (executionMode.equals(EXECUTION_MODE.AUTO)) { Device device = getTargetDevice(); - Range range = Range.create(device, _range); + Range range = Range.create(this, device, _range); return range; } else { - return Range.create(null, _range); + return Range.create(this, null, _range); } } @@ -2861,6 +2868,9 @@ public abstract class Kernel implements Cloneable { * */ public synchronized Kernel execute(Range _range, int _passes) { + if (!_range.isSameKernel(this)) { + throw new AparapiKernelFailedException("Cannot execute kernel with the specified Range. It is targetting a different Kernel instance"); + } return (execute("run", _range, _passes)); } @@ -2904,6 +2914,9 @@ public abstract class Kernel implements Cloneable { * */ public synchronized Kernel execute(String _entrypoint, Range _range, int _passes) { + if (!_range.isSameKernel(this)) { + throw new AparapiKernelFailedException("Cannot execute kernel with the specified Range. 
It is targetting a different Kernel instance"); + } return prepareKernelRunner().execute(_entrypoint, _range, _passes); } diff --git a/src/main/java/com/aparapi/ProfileReport.java b/src/main/java/com/aparapi/ProfileReport.java index de7a8b80ff81801ba3e8262f96dbd63e1ddd484f..01e5e1b133406cf3e3dc0e2cd1328328b8e7e69d 100644 --- a/src/main/java/com/aparapi/ProfileReport.java +++ b/src/main/java/com/aparapi/ProfileReport.java @@ -119,11 +119,26 @@ public final class ProfileReport { if (stage == ProfilingEvent.START.ordinal()) { return 0; } + if (stage == ProfilingEvent.READY_TO_PREPARE_EXECUTE.ordinal()) { + //Ready to prepare execute is a stage that never takes time it is just a partial start time + //reference point. + return 0.0; + } return (currentTimes[stage] - currentTimes[stage - 1]) / MILLION; } /** Elapsed time for all events {@code from} through {@code to}.*/ public double getElapsedTime(int from, int to) { + double accum = 0.0; + if (from < ProfilingEvent.READY_TO_PREPARE_EXECUTE.ordinal()) { + if (to >= ProfilingEvent.READY_TO_PREPARE_EXECUTE.ordinal()) { + accum = (currentTimes[ProfilingEvent.OPENCL_COMPILED.ordinal()] - currentTimes[from]) / MILLION; + accum += (currentTimes[to] - currentTimes[ProfilingEvent.READY_TO_PREPARE_EXECUTE.ordinal()]) / MILLION; + return accum; + } else { + return (currentTimes[to] - currentTimes[from]) / MILLION; + } + } return (currentTimes[to] - currentTimes[from]) / MILLION; } diff --git a/src/main/java/com/aparapi/Range.java b/src/main/java/com/aparapi/Range.java index 065612693995e428b30bdbfa9006a71c9b34d3ed..65781d5a52198cd9529d8bcafcfa32a0c09c27d5 100644 --- a/src/main/java/com/aparapi/Range.java +++ b/src/main/java/com/aparapi/Range.java @@ -16,8 +16,12 @@ package com.aparapi; import com.aparapi.device.*; +import com.aparapi.exception.AparapiRangeFailedException; +import com.aparapi.exception.QueryFailedException; import com.aparapi.internal.jni.*; +import com.aparapi.opencl.OpenCL; +import java.lang.ref.WeakReference; 
import java.util.*; /** @@ -64,53 +68,99 @@ import java.util.*; */ public class Range extends RangeJNI{ + //Maximum allowed number of threads per core for JTP public static final int THREADS_PER_CORE = 16; - public static final int MAX_OPENCL_GROUP_SIZE = 256; + //Default maximum work group size for any possible OpenCL device + public static final int MAX_OPENCL_GROUP_SIZE = 1024; + //This is the largest possible MAX_WORK_GROUP size that Aparapi will handle for any possible OpenCL or JTP device public static final int MAX_GROUP_SIZE = Math.max(Runtime.getRuntime().availableProcessors() * THREADS_PER_CORE, - MAX_OPENCL_GROUP_SIZE); + MAX_OPENCL_GROUP_SIZE); private OpenCLDevice device = null; + + //The kernel to which this Range instance pertains + private final WeakReference<Kernel> kernel; + //The actual allowed maximum work group size for a given kernel and device private int maxWorkGroupSize; + //The actual allowed maximum item size for a given device private int[] maxWorkItemSize = new int[] { MAX_GROUP_SIZE, MAX_GROUP_SIZE, MAX_GROUP_SIZE }; - + /** * Minimal constructor * - * @param _device - * @param _dims + * @param _kernel the kernel for which this Range is meant + * @param _device the device where the kernel is to be executed + * @param _dims the dimensions to use for the Range */ - public Range(Device _device, int _dims) { + public Range(final Kernel _kernel, final Device _device, final int _dims) { device = !(_device instanceof OpenCLDevice) ? null : (OpenCLDevice) _device; dims = _dims; + + kernel = new WeakReference<Kernel>(_kernel); if (device != null) { maxWorkItemSize = device.getMaxWorkItemSize(); - maxWorkGroupSize = device.getMaxWorkGroupSize(); + if (kernel.get() == null) { + //FIXME OpenCL source code should also be able to retrieve the real max. work group size for its compiled source.
+ //Use Device hint to the MaxWorkGroupSize, this is only for OpenCL source code ran directly by Aparapi, + //or for querying a device driver + maxWorkGroupSize = device.getMaxWorkGroupSize(); + } else { + //This is the codepath for Aparapi Kernels that are to run on a real OpenCL device. + try { + maxWorkGroupSize = kernel.get().getKernelMaxWorkGroupSize(_device); + } catch (QueryFailedException e) { + throw new AparapiRangeFailedException("Couldn't retrieve device max. work group size", e); + } + } } else { - maxWorkGroupSize = MAX_GROUP_SIZE; + //There is no point in allowing a workGroupSize as large as the OpenCL device maximum max. work group size, because + //it will just overload the CPU, when going above the number of real cores, besides Java already account hyper + //threading as an extra real core, which will already overload the machine. + maxWorkGroupSize = Runtime.getRuntime().availableProcessors() * THREADS_PER_CORE; } } + + /** + * Create a one dimensional range <code>0.._globalWidth</code> + * <br> + * Note that for this range to be valid : <br> + * <strong><code> _globalWidth > 0 && _globalWidth <= getMaxWorkItemSize() </code> </strong> + * + * @param _cl the Aparapi OpenCL object for native OpenCL source code + * @param _device the intended device where the kernel should run + * @param _globalWidth the overall range we wish to process + * @return A new Range with the requested dimensions for the specified OpenCL instance and Device instance + */ + public static Range create(final OpenCL<?> _cl, final Device _device, int _globalWidth) { + final Range r = Range.create((Kernel)null, _device, _globalWidth); + + return r; + } /** * Create a one dimensional range <code>0.._globalWidth</code> which is processed in groups of size _localWidth. 
- * <br/> - * Note that for this range to be valid : </br> <strong><code>_globalWidth > 0 && _localWidth > 0 && _localWidth < MAX_GROUP_SIZE && _globalWidth % _localWidth==0</code></strong> + * <br> + * Note that for this range to be valid : <br> + * <strong><code>_globalWidth > 0 && _localWidth > 0 && _localWidth < MAX_GROUP_SIZE && _globalWidth % _localWidth==0</code></strong> * + * @param _kernel the Aparapi kernel for which this Range is being created + * @param _device the intended device where the kernel should run * @param _globalWidth the overall range we wish to process * @param _localWidth the size of the group we wish to process. - * @return A new Range with the requested dimensions + * @return A new Range with the requested dimensions for the specified Kernel instance and Device instance */ - public static Range create(Device _device, int _globalWidth, int _localWidth) { - final Range range = new Range(_device, 1); - + public static Range create(final Kernel _kernel, final Device _device, int _globalWidth, int _localWidth) { + final Range range = new Range(_kernel, _device, 1); + range.setGlobalSize_0(_globalWidth); range.setLocalSize_0(_localWidth); @@ -126,8 +176,7 @@ public class Range extends RangeJNI{ * @param _max an upper bound on the value that can be chosen * @return and array of factors of _value */ - - private static int[] getFactors(int _value, int _max) { + private static int[] getFactors(final int _value, final int _max) { final int factors[] = new int[MAX_GROUP_SIZE]; int factorIdx = 0; @@ -142,21 +191,24 @@ public class Range extends RangeJNI{ /** * Create a one dimensional range <code>0.._globalWidth</code> with an undefined group size. 
- * <br/> + * <br> * Note that for this range to be valid :- </br> <strong><code>_globalWidth > 0 </code></strong> - * <br/> + * <br> * The groupsize will be chosen such that _localWidth > 0 && _localWidth < MAX_GROUP_SIZE && _globalWidth % _localWidth==0 is true * * We extract the factors of _globalWidth and choose the highest value. * + * @param _kernel the Aparapi kernel for which this Range is being created + * @param _device the intended device where the kernel should run * @param _globalWidth the overall range we wish to process - * @return A new Range with the requested dimensions + * @return A new Range with the requested dimensions for the specified Kernel instance and Device instance */ - public static Range create(Device _device, int _globalWidth) { - final Range withoutLocal = create(_device, _globalWidth, 1); + public static Range create(final Kernel _kernel, final Device _device, int _globalWidth) { + final Range withoutLocal = create(_kernel, _device, _globalWidth, 1); if (_device == JavaDevice.THREAD_POOL) { - withoutLocal.setLocalSize_0(Runtime.getRuntime().availableProcessors()); + createThreadPoolHelper(_globalWidth, withoutLocal); + withoutLocal.setLocalIsDerived(true); return withoutLocal; } else if (_device instanceof JavaDevice) { @@ -168,13 +220,19 @@ public class Range extends RangeJNI{ withoutLocal.setLocalIsDerived(true); return withoutLocal; } - + if (withoutLocal.isValid()) { withoutLocal.setLocalIsDerived(true); - final int[] factors = getFactors(withoutLocal.getGlobalSize_0(), withoutLocal.getMaxWorkItemSize()[0]); - - withoutLocal.setLocalSize_0(factors[factors.length - 1]); - + final int[] factors = getFactors(withoutLocal.getGlobalSize_0(), withoutLocal.getMaxWorkGroupSize()); + + //Avoid a factor that is greater than the maximum allowed work group size for the kernel. 
+ //int index = 0; + //for (index = factors.length - 1; index > 0 && factors[index] > withoutLocal.getMaxWorkGroupSize(); index--); + + int localSize = factors[factors.length-1]; + + withoutLocal.setLocalSize_0(localSize); + withoutLocal.setValid((withoutLocal.getLocalSize_0() > 0) && (withoutLocal.getLocalSize_0() <= withoutLocal.getMaxWorkItemSize()[0]) && (withoutLocal.getLocalSize_0() <= withoutLocal.getMaxWorkGroupSize()) @@ -184,28 +242,96 @@ public class Range extends RangeJNI{ return (withoutLocal); } - public static Range create(int _globalWidth, int _localWidth) { - final Range range = create(null, _globalWidth, _localWidth); + /** + * Create helper for determining a suitable local size for the JTP execution mode. + * @param _globalWidth the user specified global width + * @param withoutLocal the Range instance + */ + private static void createThreadPoolHelper(int _globalWidth, final Range withoutLocal) { + int availableProcessors = Runtime.getRuntime().availableProcessors(); + if (availableProcessors > _globalWidth) { + withoutLocal.setLocalSize_0(_globalWidth); + } else if (_globalWidth % availableProcessors == 0) { + withoutLocal.setLocalSize_0(availableProcessors); + } else if (_globalWidth % (availableProcessors / 2) == 0) { + withoutLocal.setLocalSize_0(availableProcessors / 2); + } else { + withoutLocal.setLocalSize_0(1); + } + } + + /** + * Create a one dimensional range <code>0.._globalWidth</code> which is processed in groups of size _localWidth. 
+ * <br> + * Note that for this range to be valid : <br> + * <strong><code>_globalWidth > 0 && _localWidth > 0 && _localWidth < MAX_GROUP_SIZE && _globalWidth % _localWidth==0</code></strong> + * + * @param _kernel the Aparapi kernel for which this Range is being created + * @param _globalWidth the overall range we wish to process + * @param _localWidth the size of the local group we wish to process + * @return A new Range with the requested dimensions for the specified Kernel instance and assigned Device instance + */ + public static Range create(final Kernel _kernel, int _globalWidth, int _localWidth) { + final Range range = create(_kernel, null, _globalWidth, _localWidth); return (range); } - - public static Range create(int _globalWidth) { - final Range range = create(null, _globalWidth); + + /** + * Create a one dimensional range <code>0.._globalWidth</code> with an undefined group size. + * <br> + * Note that for this range to be valid : <br> <strong><code>_globalWidth > 0 </code></strong> + * <br> + * The groupsize will be chosen such that _localWidth > 0 && _localWidth < MAX_GROUP_SIZE && _globalWidth % _localWidth==0 is true + * + * We extract the factors of _globalWidth and choose the highest value. + * + * @param _kernel the Aparapi kernel for which this Range is being created + * @param _globalWidth the overall range we wish to process + * @return A new Range with the requested dimensions for the specified Kernel instance and assigned Device instance + */ + public static Range create(final Kernel _kernel, int _globalWidth) { + final Range range = create(_kernel, null, _globalWidth); return (range); } - /** + /** * Create a two dimensional range 0.._globalWidth x 0.._globalHeight using a group which is _localWidth x _localHeight in size.
- * <br/> + * <br> * Note that for this range to be valid _globalWidth > 0 && _globalHeight >0 && _localWidth>0 && _localHeight>0 && _localWidth*_localHeight < MAX_GROUP_SIZE && _globalWidth%_localWidth==0 && _globalHeight%_localHeight==0. * - * @param _globalWidth the overall range we wish to process - * @return - */ - public static Range create2D(Device _device, int _globalWidth, int _globalHeight, int _localWidth, int _localHeight) { - final Range range = new Range(_device, 2); + * @param _cl the Aparapi OpenCL object for native OpenCL source code + * @param _device the intended device where the kernel should run + * @param _globalWidth the with range we wish to process + * @param _globalHeight the height range we wish to process + * @param _localWidth the local group width + * @param _localHeight the local group height + * @return the Range instance for the intended OpenCL instance and Device instance + */ + public static Range create2D(final OpenCL<?> _cl, final Device _device, + int _globalWidth, int _globalHeight, int _localWidth, int _localHeight) { + final Range r = Range.create2D((Kernel)null, _device, _globalWidth, _globalHeight, _localWidth, _localHeight); + + return r; + } + + /** + * Create a two dimensional range 0.._globalWidth x 0.._globalHeight using a group which is _localWidth x _localHeight in size. + * <br> + * Note that for this range to be valid _globalWidth > 0 && _globalHeight >0 && _localWidth>0 && _localHeight>0 && _localWidth*_localHeight < MAX_GROUP_SIZE && _globalWidth%_localWidth==0 && _globalHeight%_localHeight==0. 
+ * + * @param _kernel the Aparapi kernel for which this Range is being created + * @param _device the intended device where the kernel should run + * @param _globalWidth the width range we wish to process + * @param _globalHeight the height range we wish to process + * @param _localWidth the local group width + * @param _localHeight the local group height + * @return the Range instance for the intended Kernel instance and Device instance + */ + public static Range create2D(final Kernel _kernel, final Device _device, + int _globalWidth, int _globalHeight, int _localWidth, int _localHeight) { + final Range range = new Range(_kernel, _device, 2); range.setGlobalSize_0(_globalWidth); range.setLocalSize_0(_localWidth); @@ -222,6 +348,12 @@ public class Range extends RangeJNI{ return (range); } + public static Range create2D(final OpenCL<?> _cl, final Device _device, int _globalWidth, int _globalHeight) { + final Range r = Range.create2D((Kernel)null, _device, _globalWidth, _globalHeight); + + return r; + } + /** * Create a two dimensional range <code>0.._globalWidth * 0.._globalHeight</code> choosing suitable values for <code>localWidth</code> and <code>localHeight</code>. * <p> @@ -234,44 +366,18 @@ public class Range extends RangeJNI{ * <p> * For example for <code>MAX_GROUP_SIZE</code> of 16 we favor 4x4 over 1x16.
* - * @param _globalWidth the overall range we wish to process - * @return + * @param _kernel the Aparapi kernel for which this Range is being created + * @param _device the intended device where the kernel should run + * @param _globalWidth the with range we wish to process + * @param _globalHeight the height range we wish to process + * @return the Range instance for the intended Kernel instance and Device instance */ - public static Range create2D(Device _device, int _globalWidth, int _globalHeight) { - final Range withoutLocal = create2D(_device, _globalWidth, _globalHeight, 1, 1); + public static Range create2D(final Kernel _kernel, final Device _device, int _globalWidth, int _globalHeight) { + final Range withoutLocal = create2D(_kernel, _device, _globalWidth, _globalHeight, 1, 1); if (withoutLocal.isValid()) { withoutLocal.setLocalIsDerived(true); - final int[] widthFactors = getFactors(_globalWidth, withoutLocal.getMaxWorkItemSize()[0]); - final int[] heightFactors = getFactors(_globalHeight, withoutLocal.getMaxWorkItemSize()[1]); - - withoutLocal.setLocalSize_0(1); - withoutLocal.setLocalSize_1(1); - int max = 1; - int perimeter = 0; - - for (final int w : widthFactors) { - for (final int h : heightFactors) { - final int size = w * h; - if (size > withoutLocal.getMaxWorkGroupSize()) { - break; - } - - if (size > max) { - max = size; - perimeter = w + h; - withoutLocal.setLocalSize_0(w); - withoutLocal.setLocalSize_1(h); - } else if (size == max) { - final int localPerimeter = w + h; - if (localPerimeter < perimeter) {// is this the shortest perimeter so far - perimeter = localPerimeter; - withoutLocal.setLocalSize_0(w); - withoutLocal.setLocalSize_1(h); - } - } - } - } + create2DHelper(_globalWidth, _globalHeight, withoutLocal); withoutLocal.setValid((withoutLocal.getLocalSize_0() > 0) && (withoutLocal.getLocalSize_1() > 0) && (withoutLocal.getLocalSize_0() <= withoutLocal.getMaxWorkItemSize()[0]) @@ -284,35 +390,134 @@ public class Range extends RangeJNI{ 
return (withoutLocal); } - public static Range create2D(int _globalWidth, int _globalHeight, int _localWidth, int _localHeight) { - final Range range = create2D(null, _globalWidth, _globalHeight, _localWidth, _localHeight); +/** + * Helper method for create2D to adjust the 2D local size from the MaxWorkGroupSize and GlobalSizes + * @param _globalWidth the user specified globalWidth + * @param _globalHeight the user specified globalHeight + * @param withoutLocal the Range instance + */ +private static Range create2DHelper(int _globalWidth, int _globalHeight, final Range withoutLocal) { + final int[] widthFactors = getFactors(_globalWidth, withoutLocal.getMaxWorkItemSize()[0]); + final int[] heightFactors = getFactors(_globalHeight, withoutLocal.getMaxWorkItemSize()[1]); + + withoutLocal.setLocalSize_0(1); + withoutLocal.setLocalSize_1(1); + int max = 1; + int perimeter = 0; + + for (final int w : widthFactors) { + for (final int h : heightFactors) { + final int size = w * h; + if (size > withoutLocal.getMaxWorkGroupSize()) { + break; + } + + if (size > max) { + max = size; + perimeter = w + h; + withoutLocal.setLocalSize_0(w); + withoutLocal.setLocalSize_1(h); + } else if (size == max) { + final int localPerimeter = w + h; + if (localPerimeter < perimeter) {// is this the shortest perimeter so far + perimeter = localPerimeter; + withoutLocal.setLocalSize_0(w); + withoutLocal.setLocalSize_1(h); + } + } + } + } + + return withoutLocal; +} + + /** + * Create a two dimensional range <code>0.._globalWidth * 0.._globalHeight</code> choosing suitable values for <code>localWidth</code> and <code>localHeight</code>. + * <p> + * Note that for this range to be valid <code>_globalWidth > 0 && _globalHeight >0 && _localWidth>0 && _localHeight>0 && _localWidth*_localHeight < MAX_GROUP_SIZE && _globalWidth%_localWidth==0 && _globalHeight%_localHeight==0</code>. 
+ * + * @param _kernel the Aparapi kernel for which this Range is being created + * @param _globalWidth the with range we wish to process + * @param _globalHeight the height range we wish to process + * @param _localWidth the local group width + * @param _localHeight the local group height + * @return the Range instance for the intended Kernel instance and assigned Device instance + */ + public static Range create2D(final Kernel _kernel, int _globalWidth, int _globalHeight, int _localWidth, int _localHeight) { + final Range range = create2D(_kernel, null, _globalWidth, _globalHeight, _localWidth, _localHeight); return (range); } - public static Range create2D(int _globalWidth, int _globalHeight) { - final Range range = create2D(null, _globalWidth, _globalHeight); + /** + * Create a two dimensional range <code>0.._globalWidth * 0.._globalHeight</code> choosing suitable values for <code>localWidth</code> and <code>localHeight</code>. + * <p> + * Note that for this range to be valid <code>_globalWidth > 0 && _globalHeight >0 && _localWidth>0 && _localHeight>0 && _localWidth*_localHeight < MAX_GROUP_SIZE && _globalWidth%_localWidth==0 && _globalHeight%_localHeight==0</code>. + * + * <p> + * To determine suitable values for <code>_localWidth</code> and <code>_localHeight</code> we extract the factors for <code>_globalWidth</code> and <code>_globalHeight</code> and then + * find the largest product ( <code><= MAX_GROUP_SIZE</code>) with the lowest perimeter. + * + * <p> + * For example for <code>MAX_GROUP_SIZE</code> of 16 we favor 4x4 over 1x16. 
+ * + * @param _kernel the Aparapi kernel for which this Range is being created + * @param _globalWidth the with range we wish to process + * @param _globalHeight the height range we wish to process + * @return the Range instance for the intended Kernel instance and assigned Device instance + */ + public static Range create2D(final Kernel _kernel, int _globalWidth, int _globalHeight) { + final Range range = create2D(_kernel, null, _globalWidth, _globalHeight); return (range); } + + /** + * Create a three dimensional range <code>0.._globalWidth * 0.._globalHeight *0../_globalDepth</code> + * in groups defined by <code>localWidth</code> * <code>localHeight</code> * <code>localDepth</code>. + * <p> + * Note that for this range to be valid <code>_globalWidth > 0 && _globalHeight >0 _globalDepth >0 && _localWidth>0 && _localHeight>0 && _localDepth>0 && _localWidth*_localHeight*_localDepth < MAX_GROUP_SIZE && _globalWidth%_localWidth==0 && _globalHeight%_localHeight==0 && _globalDepth%_localDepth==0</code>. 
+ * + * @param _cl the Aparapi OpenCL object for native OpenCL source code + * @param _device the intended device where the kernel should run + * @param _globalWidth the width of the 3D grid we wish to process + * @param _globalHeight the height of the 3D grid we wish to process + * @param _globalDepth the depth of the 3D grid we wish to process + * @param _localWidth the width of the 3D group we wish to process + * @param _localHeight the height of the 3D group we wish to process + * @param _localDepth the depth of the 3D group we wish to process + * @return the Range instance for the intended OpenCL instance and Device instance + */ + public static Range create3D(final OpenCL<?> _cl, final Device _device, + int _globalWidth, int _globalHeight, int _globalDepth, + int _localWidth, int _localHeight, int _localDepth) { + final Range r = Range.create3D((Kernel)null, _device, _globalWidth, _globalHeight, _globalDepth, _localWidth, + _localHeight, _localDepth); + + return r; + } + /** - * Create a two dimensional range <code>0.._globalWidth * 0.._globalHeight *0../_globalDepth</code> + * Create a three dimensional range <code>0.._globalWidth * 0.._globalHeight *0../_globalDepth</code> * in groups defined by <code>localWidth</code> * <code>localHeight</code> * <code>localDepth</code>. * <p> * Note that for this range to be valid <code>_globalWidth > 0 && _globalHeight >0 _globalDepth >0 && _localWidth>0 && _localHeight>0 && _localDepth>0 && _localWidth*_localHeight*_localDepth < MAX_GROUP_SIZE && _globalWidth%_localWidth==0 && _globalHeight%_localHeight==0 && _globalDepth%_localDepth==0</code>. 
* + * @param _kernel the Aparapi kernel for which this Range is being created + * @param _device the intended device where the kernel should run * @param _globalWidth the width of the 3D grid we wish to process * @param _globalHeight the height of the 3D grid we wish to process * @param _globalDepth the depth of the 3D grid we wish to process * @param _localWidth the width of the 3D group we wish to process * @param _localHeight the height of the 3D group we wish to process * @param _localDepth the depth of the 3D group we wish to process - * @return + * @return the Range instance for the intended Kernel instance and Device instance */ - public static Range create3D(Device _device, int _globalWidth, int _globalHeight, int _globalDepth, int _localWidth, - int _localHeight, int _localDepth) { - final Range range = new Range(_device, 3); + public static Range create3D(final Kernel _kernel, final Device _device, + int _globalWidth, int _globalHeight, int _globalDepth, + int _localWidth, int _localHeight, int _localDepth) { + final Range range = new Range(_kernel, _device, 3); range.setGlobalSize_0(_globalWidth); range.setLocalSize_0(_localWidth); @@ -332,6 +537,25 @@ public class Range extends RangeJNI{ return (range); } + /** + * Create a three dimensional range <code>0.._globalWidth * 0.._globalHeight *0../_globalDepth</code> + * <p> + * Note that for this range to be valid <code>_globalWidth > 0 && _globalHeight >0 _globalDepth >0</code> and must + * not exceed max. work item sizes for each dimension.
+ * + * @param _cl the Aparapi OpenCL object for native OpenCL source code + * @param _device the intended device where the kernel should run + * @param _globalWidth the width of the 3D grid we wish to process + * @param _globalHeight the height of the 3D grid we wish to process + * @param _globalDepth the depth of the 3D grid we wish to process + * @return the Range instance for the intended OpenCL instance and Device instance + */ + public static Range create3D(final OpenCL<?> _cl, final Device _device, int _globalWidth, int _globalHeight, int _globalDepth) { + Range r = Range.create3D((Kernel)null, _device, _globalWidth, _globalHeight, _globalDepth, 1, 1, 1); + + return r; + } + /** * Create a three dimensional range <code>0.._globalWidth * 0.._globalHeight *0../_globalDepth</code> * choosing suitable values for <code>localWidth</code>, <code>localHeight</code> and <code>localDepth</code>. @@ -345,54 +569,20 @@ public class Range extends RangeJNI{ * <p> * For example for <code>MAX_GROUP_SIZE</code> of 64 we favor 4x4x4 over 1x16x16. 
* + * @param _kernel the Aparapi kernel for which this Range is being created + * @param _device the intended device where the kernel should run * @param _globalWidth the width of the 3D grid we wish to process * @param _globalHeight the height of the 3D grid we wish to process * @param _globalDepth the depth of the 3D grid we wish to process - * @return + * @return the Range instance for the intended Kernel instance and Device instance */ - public static Range create3D(Device _device, int _globalWidth, int _globalHeight, int _globalDepth) { - final Range withoutLocal = create3D(_device, _globalWidth, _globalHeight, _globalDepth, 1, 1, 1); + public static Range create3D(final Kernel _kernel, final Device _device, int _globalWidth, int _globalHeight, int _globalDepth) { + final Range withoutLocal = create3D(_kernel, _device, _globalWidth, _globalHeight, _globalDepth, 1, 1, 1); if (withoutLocal.isValid()) { withoutLocal.setLocalIsDerived(true); - final int[] widthFactors = getFactors(_globalWidth, withoutLocal.getMaxWorkItemSize()[0]); - final int[] heightFactors = getFactors(_globalHeight, withoutLocal.getMaxWorkItemSize()[1]); - final int[] depthFactors = getFactors(_globalDepth, withoutLocal.getMaxWorkItemSize()[2]); - - withoutLocal.setLocalSize_0(1); - withoutLocal.setLocalSize_1(1); - withoutLocal.setLocalSize_2(1); - - int max = 1; - int perimeter = 0; - - for (final int w : widthFactors) { - for (final int h : heightFactors) { - for (final int d : depthFactors) { - final int size = w * h * d; - if (size > withoutLocal.getMaxWorkGroupSize()) { - break; - } - - if (size > max) { - max = size; - perimeter = w + h + d; - withoutLocal.setLocalSize_0(w); - withoutLocal.setLocalSize_1(h); - withoutLocal.setLocalSize_2(d); - } else if (size == max) { - final int localPerimeter = w + h + d; - if (localPerimeter < perimeter) { // is this the shortest perimeter so far - perimeter = localPerimeter; - withoutLocal.setLocalSize_0(w); - withoutLocal.setLocalSize_1(w); - 
withoutLocal.setLocalSize_2(d); - } - } - } - } - } + create3DHelper(_globalWidth, _globalHeight, _globalDepth, withoutLocal); withoutLocal.setValid((withoutLocal.getLocalSize_0() > 0) && (withoutLocal.getLocalSize_1() > 0) @@ -409,15 +599,99 @@ public class Range extends RangeJNI{ return (withoutLocal); } - public static Range create3D(int _globalWidth, int _globalHeight, int _globalDepth) { - final Range range = create3D(null, _globalWidth, _globalHeight, _globalDepth); +/** + * Create 3D range helper method that tries find suitable local sizes from the user specified global sizes and MaxWorkGroupSize. + * + * @param _globalWidth the global width specified by the user + * @param _globalHeight the global height specified by the user + * @param _globalDepth the global depth specified by the user + * @param withoutLocal the Range instance + */ + private static void create3DHelper(int _globalWidth, int _globalHeight, int _globalDepth, final Range withoutLocal) { + final int[] widthFactors = getFactors(_globalWidth, withoutLocal.getMaxWorkItemSize()[0]); + final int[] heightFactors = getFactors(_globalHeight, withoutLocal.getMaxWorkItemSize()[1]); + final int[] depthFactors = getFactors(_globalDepth, withoutLocal.getMaxWorkItemSize()[2]); + + withoutLocal.setLocalSize_0(1); + withoutLocal.setLocalSize_1(1); + withoutLocal.setLocalSize_2(1); + + int max = 1; + int perimeter = 0; + + for (final int w : widthFactors) { + for (final int h : heightFactors) { + for (final int d : depthFactors) { + final int size = w * h * d; + if (size > withoutLocal.getMaxWorkGroupSize()) { + break; + } + + if (size > max) { + max = size; + perimeter = w + h + d; + withoutLocal.setLocalSize_0(w); + withoutLocal.setLocalSize_1(h); + withoutLocal.setLocalSize_2(d); + } else if (size == max) { + final int localPerimeter = w + h + d; + if (localPerimeter < perimeter) { // is this the shortest perimeter so far + perimeter = localPerimeter; + withoutLocal.setLocalSize_0(w); + 
withoutLocal.setLocalSize_1(h); + withoutLocal.setLocalSize_2(d); + } + } + } + } + } + } + + /** + * Create a three dimensional range <code>0.._globalWidth * 0.._globalHeight * 0.._globalDepth</code> + * <p> + * Note that for this range to be valid <code>_globalWidth > 0 && _globalHeight > 0 && _globalDepth > 0</code> and must + * not exceed max. work item sizes for each dimension. + * + * <p> + * To determine suitable values for <code>_localWidth</code>,<code>_localHeight</code> and <code>_localDepth</code> we extract the factors for <code>_globalWidth</code>,<code>_globalHeight</code> and <code>_globalDepth</code> and then + * find the largest product ( <code><= MAX_GROUP_SIZE</code>) with the lowest perimeter. + * + * <p> + * For example for <code>MAX_GROUP_SIZE</code> of 64 we favor 4x4x4 over 1x16x16. + * + * @param _kernel the Aparapi kernel for which this Range is being created + * @param _device the intended device where the kernel should run + * @param _globalWidth the width of the 3D grid we wish to process + * @param _globalHeight the height of the 3D grid we wish to process + * @param _globalDepth the depth of the 3D grid we wish to process + * @return the Range instance for the intended Kernel instance and assigned Device instance + */ + public static Range create3D(final Kernel _kernel, int _globalWidth, int _globalHeight, int _globalDepth) { + final Range range = create3D(_kernel, null, _globalWidth, _globalHeight, _globalDepth); return (range); } - public static Range create3D(int _globalWidth, int _globalHeight, int _globalDepth, int _localWidth, int _localHeight, - int _localDepth) { - final Range range = create3D(null, _globalWidth, _globalHeight, _globalDepth, _localWidth, _localHeight, _localDepth); + /** + * Create a three dimensional range <code>0.._globalWidth * 0.._globalHeight * 0.._globalDepth</code> + * in groups defined by <code>localWidth</code> * <code>localHeight</code> * <code>localDepth</code>. 
+ * <p> + * Note that for this range to be valid <code>_globalWidth > 0 && _globalHeight >0 _globalDepth >0 && _localWidth>0 && _localHeight>0 && _localDepth>0 && _localWidth*_localHeight*_localDepth < MAX_GROUP_SIZE && _globalWidth%_localWidth==0 && _globalHeight%_localHeight==0 && _globalDepth%_localDepth==0</code>. + * + * @param _kernel the Aparapi kernel for which this Range is being created + * @param _globalWidth the width of the 3D grid we wish to process + * @param _globalHeight the height of the 3D grid we wish to process + * @param _globalDepth the depth of the 3D grid we wish to process + * @param _localWidth the width of the 3D group we wish to process + * @param _localHeight the height of the 3D group we wish to process + * @param _localDepth the depth of the 3D group we wish to process + * @return the Range instance for the intended Kernel instance and assigned Device instance + */ + public static Range create3D(final Kernel _kernel, + int _globalWidth, int _globalHeight, int _globalDepth, + int _localWidth, int _localHeight, int _localDepth) { + final Range range = create3D(_kernel, null, _globalWidth, _globalHeight, _globalDepth, _localWidth, _localHeight, _localDepth); return (range); } @@ -654,4 +928,8 @@ public class Range extends RangeJNI{ public void setMaxWorkItemSize(int[] maxWorkItemSize) { this.maxWorkItemSize = maxWorkItemSize; } + + public boolean isSameKernel(Kernel _kernel) { + return kernel.get() == _kernel; + } } diff --git a/src/main/java/com/aparapi/device/Device.java b/src/main/java/com/aparapi/device/Device.java index 2e43c43c718958bf7359ad36fde02dbc3f7a358b..b24128fe33f0bc42dc6a6d52eb6b8003a8854ea9 100644 --- a/src/main/java/com/aparapi/device/Device.java +++ b/src/main/java/com/aparapi/device/Device.java @@ -134,29 +134,29 @@ public abstract class Device implements Comparable<Device> { this.maxWorkItemSize = maxWorkItemSize; } - public Range createRange(int _globalWidth) { - return (Range.create(this, _globalWidth)); + public 
Range createRange(Kernel kernel, int _globalWidth) { + return (Range.create(kernel, this, _globalWidth)); } - public Range createRange(int _globalWidth, int _localWidth) { - return (Range.create(this, _globalWidth, _localWidth)); + public Range createRange(Kernel kernel, int _globalWidth, int _localWidth) { + return (Range.create(kernel, this, _globalWidth, _localWidth)); } - public Range createRange2D(int _globalWidth, int _globalHeight) { - return (Range.create2D(this, _globalWidth, _globalHeight)); + public Range createRange2D(Kernel kernel, int _globalWidth, int _globalHeight) { + return (Range.create2D(kernel, this, _globalWidth, _globalHeight)); } - public Range createRange2D(int _globalWidth, int _globalHeight, int _localWidth, int _localHeight) { - return (Range.create2D(this, _globalWidth, _globalHeight, _localWidth, _localHeight)); + public Range createRange2D(Kernel kernel, int _globalWidth, int _globalHeight, int _localWidth, int _localHeight) { + return (Range.create2D(kernel, this, _globalWidth, _globalHeight, _localWidth, _localHeight)); } - public Range createRange3D(int _globalWidth, int _globalHeight, int _globalDepth) { - return (Range.create3D(this, _globalWidth, _globalHeight, _globalDepth)); + public Range createRange3D(Kernel kernel, int _globalWidth, int _globalHeight, int _globalDepth) { + return (Range.create3D(kernel, this, _globalWidth, _globalHeight, _globalDepth)); } - public Range createRange3D(int _globalWidth, int _globalHeight, int _globalDepth, int _localWidth, int _localHeight, + public Range createRange3D(Kernel kernel, int _globalWidth, int _globalHeight, int _globalDepth, int _localWidth, int _localHeight, int _localDepth) { - return (Range.create3D(this, _globalWidth, _globalHeight, _globalDepth, _localWidth, _localHeight, _localDepth)); + return (Range.create3D(kernel, this, _globalWidth, _globalHeight, _globalDepth, _localWidth, _localHeight, _localDepth)); } public abstract long getDeviceId(); diff --git 
a/src/main/java/com/aparapi/exception/AparapiRangeFailedException.java b/src/main/java/com/aparapi/exception/AparapiRangeFailedException.java new file mode 100644 index 0000000000000000000000000000000000000000..5100b8a5a947cef167ec20f8ea839ff306b776dc --- /dev/null +++ b/src/main/java/com/aparapi/exception/AparapiRangeFailedException.java @@ -0,0 +1,42 @@ +/** + * Copyright (c) 2016 - 2018 Syncleus, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.aparapi.exception; + +/** + * This exception is thrown when a Range creation fails. 
+ * + * @author CoreRasurae + */ +public class AparapiRangeFailedException extends RuntimeException { + + /** + * + */ + private static final long serialVersionUID = 5825738909363220032L; + + public AparapiRangeFailedException(String message) { + super(message); + } + + public AparapiRangeFailedException(String message, Throwable cause) { + super(message, cause); + } + + public AparapiRangeFailedException(String message, Throwable cause, boolean enableSuppression, + boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } +} diff --git a/src/main/java/com/aparapi/internal/kernel/KernelDeviceProfile.java b/src/main/java/com/aparapi/internal/kernel/KernelDeviceProfile.java index d5ba22767b19b8717e84d88a19d7a5a49921bcc9..b9d4f8a97dfc3de23f5d88487c375c1986a66dc9 100644 --- a/src/main/java/com/aparapi/internal/kernel/KernelDeviceProfile.java +++ b/src/main/java/com/aparapi/internal/kernel/KernelDeviceProfile.java @@ -72,6 +72,9 @@ public class KernelDeviceProfile { lock.readLock().lock(); try { for (int i = 1; i < currentTimes.length; ++i) { + if (i == ProfilingEvent.READY_TO_PREPARE_EXECUTE.ordinal()) { + continue; + } long elapsed = currentTimes[i] - currentTimes[i - 1]; accumulatedTimes.addAndGet(i, elapsed); @@ -87,7 +90,11 @@ public class KernelDeviceProfile { lock.writeLock().lock(); try { for (int i = 0; i < NUM_EVENTS; i++) { - accumulatedTimesHolder[i] = accumulatedTimes.get(i); + if (i == ProfilingEvent.READY_TO_PREPARE_EXECUTE.ordinal()) { + accumulatedTimesHolder[i] = 0; + } else { + accumulatedTimesHolder[i] = accumulatedTimes.get(i); + } } } finally { lock.writeLock().unlock(); @@ -102,6 +109,7 @@ public class KernelDeviceProfile { private final ProfileReport report; private final WeakReference<ProfileReport> reportRef; private ProfilingEvent lastEvent = null; + private boolean wasLastEventSetFromCompileOnly = false; private int invocationCount = 0; private Accumulator(long _threadId) { @@ -110,16 +118,34 @@ public 
class KernelDeviceProfile { reportRef = new WeakReference<>(report); } - private void parseStartEventHelper(final ProfilingEvent event) { + private void parseStartEventHelper(final ProfilingEvent event, boolean compileOnly) { + ProfilingEvent updateValueForLastEvent = event; if (event == ProfilingEvent.START) { - if (lastEvent != null) { + if (lastEvent != null && !wasLastEventSetFromCompileOnly) { logger.log(Level.SEVERE, "ProfilingEvent.START encountered without ProfilingEvent.EXECUTED"); } else if (lastEvent == ProfilingEvent.START) { logger.log(Level.SEVERE, "Duplicate event ProfilingEvent.START"); } - Arrays.fill(currentTimes, 0L); - ++invocationCount; - invocationCountGlobal.incrementAndGet(); + + if (!wasLastEventSetFromCompileOnly) { + Arrays.fill(currentTimes, 0L); + ++invocationCount; + invocationCountGlobal.incrementAndGet(); + } else { + //Code reaches here during the START event of a real execute, since compilation can only occur once for a Device, + //and we know that such compilation occurred in the last event. Actually there was also a EXECUTED event which is + //always generated, but can be safely ignored. + // + //So, do not trigger a report count increment and do not reset the timestamps, because this is not a real run. + //We just want to keep the logged compilation time, and record the execution time in the same report. 
+ wasLastEventSetFromCompileOnly = compileOnly; + //So, we resume as if the last event was OPENCL_COMPILED + lastEvent = ProfilingEvent.OPENCL_COMPILED; + return; + } + + wasLastEventSetFromCompileOnly = compileOnly; + currentTimes[event.ordinal()] = System.nanoTime(); } else { if (lastEvent == null) { if (event != ProfilingEvent.EXECUTED) { @@ -130,28 +156,42 @@ public class KernelDeviceProfile { currentTimes[i] = currentTimes[i - 1]; } } + currentTimes[event.ordinal()] = System.nanoTime(); } - currentTimes[event.ordinal()] = System.nanoTime(); - if (event == ProfilingEvent.EXECUTED) { - for (int i = 1; i < currentTimes.length; ++i) { - long elapsed = currentTimes[i] - currentTimes[i - 1]; - if (elapsed < 0) { - logger.log(Level.SEVERE, "negative elapsed time for event " + event); - break; - } - accumulatedTimes[i] += elapsed; - } - + if (event == ProfilingEvent.OPENCL_COMPILED) { + //Accumulated times are divided in two blocks, one until OpenCL kernel compile + for (int i = 1; i < ProfilingEvent.READY_TO_PREPARE_EXECUTE.ordinal(); ++i) { + long elapsed = currentTimes[i] - currentTimes[i - 1]; + if (elapsed < 0) { + logger.log(Level.SEVERE, "negative elapsed time for event " + event); + break; + } + accumulatedTimes[i] += elapsed; + } + } + + if (event == ProfilingEvent.EXECUTED && !compileOnly) { + //and the second block after READY_TO_PREPARE_EXECUTE until EXECUTED, because there can be a time lapse between + //the first and the second block if the kernel is compiled but not executed at once. 
+ for (int i = ProfilingEvent.READY_TO_PREPARE_EXECUTE.ordinal() + 1; i < NUM_EVENTS; ++i) { + long elapsed = currentTimes[i] - currentTimes[i - 1]; + if (elapsed < 0) { + logger.log(Level.SEVERE, "negative elapsed time for event " + event); + break; + } + accumulatedTimes[i] += elapsed; + } globalAcc.accumulateTimes(currentTimes); lastAccumulator.set(this); } + + lastEvent = updateValueForLastEvent; } - private void onEvent(final ProfilingEvent event) { - parseStartEventHelper(event); - - lastEvent = event; - if (event == ProfilingEvent.EXECUTED) { + private void onEvent(final ProfilingEvent event, boolean compileOnly) { + parseStartEventHelper(event, compileOnly); + + if (event == ProfilingEvent.EXECUTED && !compileOnly) { updateProfileReport(report, invocationCount, currentTimes); IProfileReportObserver observer = parentKernelProfile.getReportObserver(); lastEvent = null; @@ -186,8 +226,8 @@ public class KernelDeviceProfile { format.setMaximumFractionDigits(3); } - public void onEvent(ProfilingEvent event) { - getAccForThreadPutIfAbsent().onEvent(event); + public void onEvent(ProfilingEvent event, boolean compileOnly) { + getAccForThreadPutIfAbsent().onEvent(event, compileOnly); } private ProfileReport updateProfileReport(final ProfileReport report, long invocationCount, long[] currentTimes) { @@ -206,6 +246,10 @@ public class KernelDeviceProfile { return 0; } + if (stage == ProfilingEvent.READY_TO_PREPARE_EXECUTE.ordinal()) { + return 0.0; + } + Accumulator acc = getAccForThread(); return acc == null ? Double.NaN : (acc.currentTimes[stage] - acc.currentTimes[stage - 1]) / MILLION; @@ -215,7 +259,17 @@ public class KernelDeviceProfile { public double getElapsedTimeCurrentThread(int from, int to) { Accumulator acc = getAccForThread(); - return acc == null ? 
Double.NaN : (acc.currentTimes[to] - acc.currentTimes[from]) / MILLION; + if (acc == null) { + return Double.NaN; + } + + double accum = 0.0; + if (from < ProfilingEvent.READY_TO_PREPARE_EXECUTE.ordinal() && to >= ProfilingEvent.READY_TO_PREPARE_EXECUTE.ordinal()) { + accum = (double)(acc.currentTimes[ProfilingEvent.OPENCL_COMPILED.ordinal()] - acc.currentTimes[from]) / MILLION; + accum += (double)(acc.currentTimes[to] - acc.currentTimes[ProfilingEvent.READY_TO_PREPARE_EXECUTE.ordinal()]) / MILLION; + return accum; + } + return (double)(acc.currentTimes[to] - acc.currentTimes[from]) / MILLION; } /** @@ -257,6 +311,10 @@ public class KernelDeviceProfile { public double getCumulativeElapsedTimeCurrrentThread(ProfilingEvent stage) { Accumulator acc = getAccForThread(); + if (stage == ProfilingEvent.READY_TO_PREPARE_EXECUTE) { + return 0.0; + } + return acc == null ? Double.NaN : acc.accumulatedTimes[stage.ordinal()] / MILLION; } @@ -273,6 +331,11 @@ public class KernelDeviceProfile { } for (int i = 1; i <= ProfilingEvent.EXECUTED.ordinal(); ++i) { + if (i == ProfilingEvent.READY_TO_PREPARE_EXECUTE.ordinal()) { + //Ready to prepare execute is a stage that never takes time it is just a partial start time + //reference point. + continue; + } sum += acc.accumulatedTimes[i]; } @@ -289,8 +352,14 @@ public class KernelDeviceProfile { return 0; } - Accumulator acc = lastAccumulator.get(); - + if (stage == ProfilingEvent.READY_TO_PREPARE_EXECUTE.ordinal()) { + //Ready to prepare execute is a stage that never takes time it is just a partial start time + //reference point. + return 0.0; + } + + Accumulator acc = lastAccumulator.get(); + return acc == null ? Double.NaN : (acc.currentTimes[stage] - acc.currentTimes[stage - 1]) / MILLION; } @@ -304,7 +373,17 @@ public class KernelDeviceProfile { public double getElapsedTimeLastThread(int from, int to) { Accumulator acc = lastAccumulator.get(); - return acc == null ? 
Double.NaN : (acc.currentTimes[to] - acc.currentTimes[from]) / MILLION; + if (acc == null) { + return Double.NaN; + } + + double accum = 0.0; + if (from < ProfilingEvent.READY_TO_PREPARE_EXECUTE.ordinal() && to >= ProfilingEvent.READY_TO_PREPARE_EXECUTE.ordinal()) { + accum = (double)(acc.currentTimes[ProfilingEvent.OPENCL_COMPILED.ordinal()] - acc.currentTimes[from]) / MILLION; + accum += (double)(acc.currentTimes[to] - acc.currentTimes[ProfilingEvent.READY_TO_PREPARE_EXECUTE.ordinal()]) / MILLION; + return accum; + } + return (double)(acc.currentTimes[to] - acc.currentTimes[from]) / MILLION; } /** @@ -315,6 +394,12 @@ public class KernelDeviceProfile { */ public double getCumulativeElapsedTimeGlobal(ProfilingEvent stage) { final long[] accumulatedTimesHolder = new long[NUM_EVENTS]; + if (stage == ProfilingEvent.READY_TO_PREPARE_EXECUTE) { + //Ready to prepare execute is a stage that never takes time it is just a partial start time + //reference point. + return 0.0; + } + globalAcc.consultAccumulatedTimes(accumulatedTimesHolder); return accumulatedTimesHolder[stage.ordinal()] / MILLION; @@ -330,6 +415,11 @@ public class KernelDeviceProfile { double sum = 0; for (int i = 1; i <= ProfilingEvent.EXECUTED.ordinal(); ++i) { + if (i == ProfilingEvent.READY_TO_PREPARE_EXECUTE.ordinal()) { + //Ready to prepare execute is a stage that never takes time it is just a partial start time + //reference point. 
+ continue; + } sum += accumulatedTimesHolder[i]; } return sum; @@ -337,10 +427,9 @@ public class KernelDeviceProfile { public static synchronized String getTableHeader() { if (tableHeader == null) { - int length = ProfilingEvent.values().length; StringBuilder builder = new StringBuilder(150); appendRowHeaders(builder, "Device", "Count"); - for (int i = 1; i < length; ++i) { + for (int i = 1; i < NUM_EVENTS; ++i) { ProfilingEvent stage = ProfilingEvent.values()[i]; String heading = stage.name(); appendCell(builder, heading); @@ -364,10 +453,12 @@ public class KernelDeviceProfile { double total = 0; appendRowHeaders(builder, device.getShortDescription(), String.valueOf(invocationCountGlobal.get())); - for (int i = 1; i < NUM_EVENTS; ++i) { + for (int i = 1; i < NUM_EVENTS; ++i) { ProfilingEvent stage = ProfilingEvent.values()[i]; double time = getElapsedTimeLastThread(stage.ordinal()); - total += time; + if (i != ProfilingEvent.READY_TO_PREPARE_EXECUTE.ordinal()) { + total += time; + } String formatted = format.format(time); appendCell(builder, formatted); } diff --git a/src/main/java/com/aparapi/internal/kernel/KernelProfile.java b/src/main/java/com/aparapi/internal/kernel/KernelProfile.java index 933b3f2bb7d3a79c2c31037674963f1109d76b75..966539e1430cfd3694381109d225f59f30e0a137 100644 --- a/src/main/java/com/aparapi/internal/kernel/KernelProfile.java +++ b/src/main/java/com/aparapi/internal/kernel/KernelProfile.java @@ -71,9 +71,11 @@ public class KernelProfile { /** * Starts a profiling information gathering sequence for the current thread invoking this method * regarding the specified execution device. 
- * @param device + * @param device the device to which the report pertains + * @param compileOnly <ul><li>true, if this event is from a compile only dry run</li> + * <li>false, if this event is from a regular kernel run</li></ul> */ - void onStart(Device device) { + void onStart(Device device, boolean compileOnly) { KernelDeviceProfile currentDeviceProfile = deviceProfiles.get(device); if (currentDeviceProfile == null) { currentDeviceProfile = new KernelDeviceProfile(this, kernelClass, device); @@ -83,7 +85,7 @@ public class KernelProfile { } } - currentDeviceProfile.onEvent(ProfilingEvent.START); + currentDeviceProfile.onEvent(ProfilingEvent.START, compileOnly); currentDevice.set(device); } @@ -93,25 +95,28 @@ public class KernelProfile { * * @param device the device where the kernel is/was executed * @param event the event for which the profiling information is being updated + * @param compileOnly <ul><li>true, if this event is from a compile only dry run</li> + * <li>false, if this event is from a regular kernel run</li></ul> */ - void onEvent(Device device, ProfilingEvent event) { + void onEvent(Device device, ProfilingEvent event, boolean compileOnly) { if (event == null) { logger.log(Level.WARNING, "Discarding profiling event " + event + " for null device, for Kernel class: " + kernelClass.getName()); return; } final KernelDeviceProfile deviceProfile = deviceProfiles.get(device); switch (event) { - case CLASS_MODEL_BUILT: // fallthrough - case OPENCL_GENERATED: // fallthrough - case INIT_JNI: // fallthrough - case OPENCL_COMPILED: // fallthrough - case PREPARE_EXECUTE: // fallthrough - case EXECUTED: // fallthrough + case CLASS_MODEL_BUILT: // fallthrough + case OPENCL_GENERATED: // fallthrough + case INIT_JNI: // fallthrough + case OPENCL_COMPILED: // fallthrough + case READY_TO_PREPARE_EXECUTE: // fallthrough + case PREPARE_EXECUTE: // fallthrough + case EXECUTED: // fallthrough { if (deviceProfile == null) { logger.log(Level.SEVERE, "Error in 
KernelProfile, no currentDevice (synchronization error?"); } - deviceProfile.onEvent(event); + deviceProfile.onEvent(event, compileOnly); break; } case START: diff --git a/src/main/java/com/aparapi/internal/kernel/KernelRunner.java b/src/main/java/com/aparapi/internal/kernel/KernelRunner.java index 25975a2a1794c648ce07783092536adaf2a4d857..5065ccaa1bcad81ef8586bd1231de195559c65a4 100644 --- a/src/main/java/com/aparapi/internal/kernel/KernelRunner.java +++ b/src/main/java/com/aparapi/internal/kernel/KernelRunner.java @@ -478,7 +478,7 @@ public class KernelRunner extends KernelRunnerJNI{ boolean legacySequentialMode = kernel.getExecutionMode().equals(Kernel.EXECUTION_MODE.SEQ); passId = PASS_ID_PREPARING_EXECUTION; - _settings.profile.onEvent(device, ProfilingEvent.PREPARE_EXECUTE); + _settings.profile.onEvent(device, ProfilingEvent.PREPARE_EXECUTE, false); try { if (device == JavaDevice.ALTERNATIVE_ALGORITHM) { @@ -1370,15 +1370,15 @@ public class KernelRunner extends KernelRunnerJNI{ Range result; switch (_settings.range.getDims()) { case 1: { - result = Range.create(device, _settings.range.getGlobalSize_0()); + result = Range.create(kernel, device, _settings.range.getGlobalSize_0()); break; } case 2: { - result = Range.create2D(device, _settings.range.getGlobalSize_0(), _settings.range.getGlobalSize_1()); + result = Range.create2D(kernel, device, _settings.range.getGlobalSize_0(), _settings.range.getGlobalSize_1()); break; } case 3: { - result = Range.create3D(device, _settings.range.getGlobalSize_0(), _settings.range.getGlobalSize_1(), _settings.range.getGlobalSize_2()); + result = Range.create3D(kernel, device, _settings.range.getGlobalSize_0(), _settings.range.getGlobalSize_1(), _settings.range.getGlobalSize_2()); break; } default: { @@ -1401,7 +1401,7 @@ public class KernelRunner extends KernelRunnerJNI{ @SuppressWarnings("deprecation") synchronized private Kernel fallBackToNextDevice(Device device, ExecutionSettings _settings, Exception _exception, boolean 
_silently) { isFallBack = true; - _settings.profile.onEvent(device, ProfilingEvent.EXECUTED); + _settings.profile.onEvent(device, ProfilingEvent.EXECUTED, false); if (_settings.legacyExecutionMode) { if (!_silently && logger.isLoggable(Level.WARNING)) { logger.warning("Execution mode " + kernel.getExecutionMode() + " failed for " + kernel + ": " + _exception.getMessage()); @@ -1461,9 +1461,9 @@ public class KernelRunner extends KernelRunnerJNI{ public synchronized Kernel compile(String _entrypoint, final Device device) throws CompileFailedException { KernelProfile profile = KernelManager.instance().getProfile(kernel.getClass()); KernelPreferences preferences = KernelManager.instance().getPreferences(kernel); - Range range = new Range(device, 1); - ExecutionSettings settings = new ExecutionSettings(preferences, profile, _entrypoint, range, 1, false); - return executeInternalInner(settings, device, true); + Range range = Range.create((Kernel)null, device, 1); + ExecutionSettings settings = new ExecutionSettings(preferences, profile, _entrypoint, range, 1, false); + return executeInternalInner(settings, device, true); } private synchronized Kernel executeInternalOuter(ExecutionSettings _settings) { @@ -1571,7 +1571,7 @@ public class KernelRunner extends KernelRunnerJNI{ device = openCLDevice; } assert device != null : "No device available"; - _settings.profile.onStart(device); + _settings.profile.onStart(device, compileOnly); /* for backward compatibility reasons we still honor execution mode */ boolean isOpenCl = requestedExecutionMode.isOpenCL() || device instanceof OpenCLDevice; if (isOpenCl) { @@ -1580,9 +1580,9 @@ public class KernelRunner extends KernelRunnerJNI{ try { final ClassModel classModel = ClassModel.createClassModel(kernel.getClass()); entryPoint = classModel.getEntrypoint(_settings.entrypoint, kernel); - _settings.profile.onEvent(device, ProfilingEvent.CLASS_MODEL_BUILT); + _settings.profile.onEvent(device, ProfilingEvent.CLASS_MODEL_BUILT, false); } 
catch (final Exception exception) { - _settings.profile.onEvent(device, ProfilingEvent.CLASS_MODEL_BUILT); + _settings.profile.onEvent(device, ProfilingEvent.CLASS_MODEL_BUILT, false); if (compileOnly) { //Cannot fallback in compile only mode throw new CompileFailedException(exception); @@ -1602,7 +1602,7 @@ public class KernelRunner extends KernelRunnerJNI{ // Init the device to check capabilities before emitting the // code that requires the capabilities. jniContextHandle = initJNI(kernel, openCLDevice, jniFlags); // openCLDevice will not be null here - _settings.profile.onEvent(device, ProfilingEvent.INIT_JNI); + _settings.profile.onEvent(device, ProfilingEvent.INIT_JNI, false); } // end of synchronized! issue 68 if (jniContextHandle == 0) { @@ -1661,12 +1661,12 @@ public class KernelRunner extends KernelRunnerJNI{ else if (Config.enableShowGeneratedOpenCL) { System.out.println(openCL); } - _settings.profile.onEvent(device, ProfilingEvent.OPENCL_GENERATED); + _settings.profile.onEvent(device, ProfilingEvent.OPENCL_GENERATED, compileOnly); openCLCache.put(kernel.getClass(), openCL); } catch (final CodeGenException codeGenException) { openCLCache.put(kernel.getClass(), CODE_GEN_ERROR_MARKER); - _settings.profile.onEvent(device, ProfilingEvent.OPENCL_GENERATED); + _settings.profile.onEvent(device, ProfilingEvent.OPENCL_GENERATED, compileOnly); if (compileOnly) { throw new CompileFailedException(codeGenException); } @@ -1675,7 +1675,7 @@ public class KernelRunner extends KernelRunnerJNI{ } else { if (openCL.equals(CODE_GEN_ERROR_MARKER)) { - _settings.profile.onEvent(device, ProfilingEvent.OPENCL_GENERATED); + _settings.profile.onEvent(device, ProfilingEvent.OPENCL_GENERATED, compileOnly); boolean silently = true; // since we must have already reported the CodeGenException if (compileOnly) { throw new CompileFailedException("Code Gen Error Marker present"); @@ -1705,7 +1705,7 @@ public class KernelRunner extends KernelRunnerJNI{ } } } - 
_settings.profile.onEvent(device, ProfilingEvent.OPENCL_COMPILED); + _settings.profile.onEvent(device, ProfilingEvent.OPENCL_COMPILED, compileOnly); if (handle == 0) { if (compileOnly) { //When compiling a kernel for a specific device device fallback is not allowed @@ -1720,8 +1720,9 @@ public class KernelRunner extends KernelRunnerJNI{ return kernel; } } - - if (entryPoint != null) { + + _settings.profile.onEvent(device, ProfilingEvent.READY_TO_PREPARE_EXECUTE, compileOnly); + if (entryPoint != null) { //Pre-compiled kernels that never executed must resume here args = new KernelArg[entryPoint.getReferencedFields().size()]; int i = 0; @@ -1848,7 +1849,7 @@ public class KernelRunner extends KernelRunnerJNI{ argc = i; setArgsJNI(jniContextHandle, args, argc); - _settings.profile.onEvent(device, ProfilingEvent.PREPARE_EXECUTE); + _settings.profile.onEvent(device, ProfilingEvent.PREPARE_EXECUTE, false); kernelNeverExecutedForDeviceHash.putIfAbsent(device, false); try { @@ -1885,7 +1886,7 @@ public class KernelRunner extends KernelRunnerJNI{ return kernel; } finally { - _settings.profile.onEvent(device, ProfilingEvent.EXECUTED); + _settings.profile.onEvent(device, ProfilingEvent.EXECUTED, compileOnly); maybeReportProfile(_settings); } } diff --git a/src/main/java/com/aparapi/internal/kernel/ProfilingEvent.java b/src/main/java/com/aparapi/internal/kernel/ProfilingEvent.java index 24a378dbc17164f2c6b383d8929f7819aa39f917..63e62289d41b54a2bed2378878408f5f54fbacc1 100644 --- a/src/main/java/com/aparapi/internal/kernel/ProfilingEvent.java +++ b/src/main/java/com/aparapi/internal/kernel/ProfilingEvent.java @@ -21,7 +21,7 @@ import java.util.concurrent.atomic.AtomicReference; * Created by Barney on 02/09/2015. 
*/ public enum ProfilingEvent { - START, CLASS_MODEL_BUILT, INIT_JNI, OPENCL_GENERATED, OPENCL_COMPILED, PREPARE_EXECUTE, EXECUTED; + START, CLASS_MODEL_BUILT, INIT_JNI, OPENCL_GENERATED, OPENCL_COMPILED, READY_TO_PREPARE_EXECUTE, PREPARE_EXECUTE, EXECUTED; static final AtomicReference<String[]> stagesNames = new AtomicReference<String[]>(null); diff --git a/src/test/java/com/aparapi/runtime/AtomicsSupportAdvTest.java b/src/test/java/com/aparapi/runtime/AtomicsSupportAdvTest.java index 94a8a62cab03ce6708cf92c025b8e939a158e2a5..82c5cbc6a05c7151bd83675d13b47f2e9a7ebe29 100644 --- a/src/test/java/com/aparapi/runtime/AtomicsSupportAdvTest.java +++ b/src/test/java/com/aparapi/runtime/AtomicsSupportAdvTest.java @@ -94,7 +94,7 @@ public class AtomicsSupportAdvTest { final AtomicKernel kernel = new AtomicKernel(in, out); try { - final Range range = openCLDevice.createRange(SIZE/2, SIZE/2); + final Range range = openCLDevice.createRange(kernel, SIZE/2, SIZE/2); kernel.setExplicit(true); kernel.put(in); kernel.execute(range); @@ -121,7 +121,7 @@ public class AtomicsSupportAdvTest { final AtomicKernel kernel = new AtomicKernel(in, out); try { - final Range range = openCLDevice.createRange(SIZE/2, SIZE/2); + final Range range = openCLDevice.createRange(kernel, SIZE/2, SIZE/2); kernel.execute(range); } finally { kernel.dispose(); @@ -147,7 +147,7 @@ public class AtomicsSupportAdvTest { final AtomicKernel kernel = new AtomicKernel(in, out); try { - final Range range = device.createRange(SIZE/2, SIZE/2); + final Range range = device.createRange(kernel, SIZE/2, SIZE/2); kernel.execute(range); } finally { kernel.dispose(); @@ -172,7 +172,7 @@ public class AtomicsSupportAdvTest { final AtomicBKernel kernel = new AtomicBKernel(in, out); try { - final Range range = openCLDevice.createRange(SIZE/2, SIZE/2); + final Range range = openCLDevice.createRange(kernel, SIZE/2, SIZE/2); kernel.execute(range); } finally { kernel.dispose(); @@ -200,7 +200,7 @@ public class AtomicsSupportAdvTest { 
final AtomicBKernel kernel = new AtomicBKernel(in, out); try { - final Range range = device.createRange(SIZE/2, SIZE/2); + final Range range = device.createRange(kernel, SIZE/2, SIZE/2); kernel.execute(range); } finally { kernel.dispose(); diff --git a/src/test/java/com/aparapi/runtime/AtomicsSupportTest.java b/src/test/java/com/aparapi/runtime/AtomicsSupportTest.java index b7aca3cc7210163644e4020652f61f7e4b1074e3..7bc952a9260416eece81a1d33aa40415c6705454 100644 --- a/src/test/java/com/aparapi/runtime/AtomicsSupportTest.java +++ b/src/test/java/com/aparapi/runtime/AtomicsSupportTest.java @@ -83,7 +83,7 @@ public class AtomicsSupportTest { final AtomicAdd kernel = new AtomicAdd(in, out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.setExplicit(true); kernel.put(in); kernel.execute(range); @@ -104,7 +104,7 @@ public class AtomicsSupportTest { final AtomicAdd kernel = new AtomicAdd(in, out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -124,7 +124,7 @@ public class AtomicsSupportTest { final AtomicAdd kernel = new AtomicAdd(in, out); try { - final Range range = device.createRange(1,1); + final Range range = device.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -171,7 +171,7 @@ public class AtomicsSupportTest { final AtomicSub kernel = new AtomicSub(in, out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.setExplicit(true); kernel.put(in); kernel.execute(range); @@ -192,7 +192,7 @@ public class AtomicsSupportTest { final AtomicSub kernel = new AtomicSub(in, out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -212,7 +212,7 @@ 
public class AtomicsSupportTest { final AtomicSub kernel = new AtomicSub(in, out); try { - final Range range = device.createRange(1,1); + final Range range = device.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -261,7 +261,7 @@ public class AtomicsSupportTest { final AtomicXchg kernel = new AtomicXchg(in, out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.setExplicit(true); kernel.put(in); kernel.execute(range); @@ -282,7 +282,7 @@ public class AtomicsSupportTest { final AtomicXchg kernel = new AtomicXchg(in, out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -302,7 +302,7 @@ public class AtomicsSupportTest { final AtomicXchg kernel = new AtomicXchg(in, out); try { - final Range range = device.createRange(1,1); + final Range range = device.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -349,7 +349,7 @@ public class AtomicsSupportTest { final AtomicInc kernel = new AtomicInc(in, out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.setExplicit(true); kernel.put(in); kernel.execute(range); @@ -369,7 +369,7 @@ public class AtomicsSupportTest { final AtomicInc kernel = new AtomicInc(in, out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -388,7 +388,7 @@ public class AtomicsSupportTest { final AtomicInc kernel = new AtomicInc(in, out); try { - final Range range = device.createRange(1,1); + final Range range = device.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -435,7 +435,7 @@ public class AtomicsSupportTest { final AtomicDec kernel = new AtomicDec(in, 
out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.setExplicit(true); kernel.put(in); kernel.execute(range); @@ -455,7 +455,7 @@ public class AtomicsSupportTest { final AtomicDec kernel = new AtomicDec(in, out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -474,7 +474,7 @@ public class AtomicsSupportTest { final AtomicDec kernel = new AtomicDec(in, out); try { - final Range range = device.createRange(1,1); + final Range range = device.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -523,7 +523,7 @@ public class AtomicsSupportTest { final AtomicCmpXchg kernel = new AtomicCmpXchg(in, out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.setExplicit(true); kernel.put(in); kernel.execute(range); @@ -545,7 +545,7 @@ public class AtomicsSupportTest { final AtomicCmpXchg kernel = new AtomicCmpXchg(in, out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -566,7 +566,7 @@ public class AtomicsSupportTest { final AtomicCmpXchg kernel = new AtomicCmpXchg(in, out); try { - final Range range = device.createRange(1,1); + final Range range = device.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -586,7 +586,7 @@ public class AtomicsSupportTest { final AtomicCmpXchg kernel = new AtomicCmpXchg(in, out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.setExplicit(true); kernel.put(in); kernel.execute(range); @@ -608,7 +608,7 @@ public class AtomicsSupportTest { final AtomicCmpXchg kernel = new AtomicCmpXchg(in, out); try { - final 
Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -629,7 +629,7 @@ public class AtomicsSupportTest { final AtomicCmpXchg kernel = new AtomicCmpXchg(in, out); try { - final Range range = device.createRange(1,1); + final Range range = device.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -677,7 +677,7 @@ public class AtomicsSupportTest { final AtomicMin kernel = new AtomicMin(in, out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.setExplicit(true); kernel.put(in); kernel.execute(range); @@ -698,7 +698,7 @@ public class AtomicsSupportTest { final AtomicMin kernel = new AtomicMin(in, out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -718,7 +718,7 @@ public class AtomicsSupportTest { final AtomicMin kernel = new AtomicMin(in, out); try { - final Range range = device.createRange(1,1); + final Range range = device.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -736,7 +736,7 @@ public class AtomicsSupportTest { final AtomicMin kernel = new AtomicMin(in, out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.setExplicit(true); kernel.put(in); kernel.execute(range); @@ -757,7 +757,7 @@ public class AtomicsSupportTest { final AtomicMin kernel = new AtomicMin(in, out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -777,7 +777,7 @@ public class AtomicsSupportTest { final AtomicMin kernel = new AtomicMin(in, out); try { - final Range range = device.createRange(1,1); + final Range range = 
device.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -825,7 +825,7 @@ public class AtomicsSupportTest { final AtomicMax kernel = new AtomicMax(in, out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.setExplicit(true); kernel.put(in); kernel.execute(range); @@ -846,7 +846,7 @@ public class AtomicsSupportTest { final AtomicMax kernel = new AtomicMax(in, out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -866,7 +866,7 @@ public class AtomicsSupportTest { final AtomicMax kernel = new AtomicMax(in, out); try { - final Range range = device.createRange(1,1); + final Range range = device.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -884,7 +884,7 @@ public class AtomicsSupportTest { final AtomicMax kernel = new AtomicMax(in, out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.setExplicit(true); kernel.put(in); kernel.execute(range); @@ -905,7 +905,7 @@ public class AtomicsSupportTest { final AtomicMax kernel = new AtomicMax(in, out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -925,7 +925,7 @@ public class AtomicsSupportTest { final AtomicMax kernel = new AtomicMax(in, out); try { - final Range range = device.createRange(1,1); + final Range range = device.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -974,7 +974,7 @@ public class AtomicsSupportTest { final AtomicAnd kernel = new AtomicAnd(in, out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.setExplicit(true); kernel.put(in); 
kernel.execute(range); @@ -996,7 +996,7 @@ public class AtomicsSupportTest { final AtomicAnd kernel = new AtomicAnd(in, out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -1016,7 +1016,7 @@ public class AtomicsSupportTest { final AtomicAnd kernel = new AtomicAnd(in, out); try { - final Range range = device.createRange(1,1); + final Range range = device.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -1065,7 +1065,7 @@ public class AtomicsSupportTest { final AtomicOr kernel = new AtomicOr(in, out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.setExplicit(true); kernel.put(in); kernel.execute(range); @@ -1086,7 +1086,7 @@ public class AtomicsSupportTest { final AtomicOr kernel = new AtomicOr(in, out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -1106,7 +1106,7 @@ public class AtomicsSupportTest { final AtomicOr kernel = new AtomicOr(in, out); try { - final Range range = device.createRange(1,1); + final Range range = device.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -1154,7 +1154,7 @@ public class AtomicsSupportTest { final AtomicXor kernel = new AtomicXor(in, out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.setExplicit(true); kernel.put(in); kernel.execute(range); @@ -1175,7 +1175,7 @@ public class AtomicsSupportTest { final AtomicXor kernel = new AtomicXor(in, out); try { - final Range range = openCLDevice.createRange(1,1); + final Range range = openCLDevice.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); @@ -1195,7 +1195,7 @@ public class 
AtomicsSupportTest { final AtomicXor kernel = new AtomicXor(in, out); try { - final Range range = device.createRange(1,1); + final Range range = device.createRange(kernel,1,1); kernel.execute(range); } finally { kernel.dispose(); diff --git a/src/test/java/com/aparapi/runtime/BarrierSupportTest.java b/src/test/java/com/aparapi/runtime/BarrierSupportTest.java index bf5697a2026647e89fa39d0640dbd47098e51d11..c142520211318fd16ab27be10568cbabe77f6a76 100644 --- a/src/test/java/com/aparapi/runtime/BarrierSupportTest.java +++ b/src/test/java/com/aparapi/runtime/BarrierSupportTest.java @@ -94,7 +94,7 @@ public class BarrierSupportTest { } try { - final Range range = openCLDevice.createRange(SIZE, SIZE); + final Range range = openCLDevice.createRange(kernel, SIZE, SIZE); targetArray = initInputArray(); kernel.setExplicit(false); kernel.setArray(targetArray); @@ -124,7 +124,7 @@ public class BarrierSupportTest { } try { - final Range range = openCLDevice.createRange(SIZE, SIZE); + final Range range = openCLDevice.createRange(kernel, SIZE, SIZE); targetArray = initInputArray(); kernel.setExplicit(true); kernel.setArray(targetArray); @@ -146,7 +146,7 @@ public class BarrierSupportTest { final Barrrier1Kernel kernel = new Barrrier1Kernel(SIZE); try { - final Range range = device.createRange(SIZE, SIZE); + final Range range = device.createRange(kernel, SIZE, SIZE); targetArray = initInputArray(); kernel.setExplicit(false); kernel.setArray(targetArray); @@ -176,7 +176,7 @@ public class BarrierSupportTest { } try { - final Range range = openCLDevice.createRange(SIZE, SIZE); + final Range range = openCLDevice.createRange(kernel, SIZE, SIZE); targetArray = initInputArray(); kernel.setExplicit(false); kernel.setArray(targetArray); @@ -206,7 +206,7 @@ public class BarrierSupportTest { } try { - final Range range = openCLDevice.createRange(SIZE, SIZE); + final Range range = openCLDevice.createRange(kernel, SIZE, SIZE); targetArray = initInputArray(); kernel.setExplicit(true); 
kernel.setArray(targetArray); @@ -228,7 +228,7 @@ public class BarrierSupportTest { final Barrrier2Kernel kernel = new Barrrier2Kernel(SIZE); try { - final Range range = device.createRange(SIZE, SIZE); + final Range range = device.createRange(kernel, SIZE, SIZE); targetArray = initInputArray(); kernel.setExplicit(false); kernel.setArray(targetArray); diff --git a/src/test/java/com/aparapi/runtime/BufferTransferTest.java b/src/test/java/com/aparapi/runtime/BufferTransferTest.java index 8edf47a714822c88132f7e5c89e952897822596c..1467fc939fb8b3ac68465ec7300f3888aa2964f1 100644 --- a/src/test/java/com/aparapi/runtime/BufferTransferTest.java +++ b/src/test/java/com/aparapi/runtime/BufferTransferTest.java @@ -51,7 +51,7 @@ public class BufferTransferTest { if (maxSize < SIZE) { SIZE = maxSize; } - final Range range = openCLDevice.createRange(SIZE); + final Range range = openCLDevice.createRange(kernel, SIZE); kernel.in = new int[SIZE]; kernel.out = new int[SIZE]; @@ -76,7 +76,7 @@ public class BufferTransferTest { if (maxSize < SIZE) { SIZE = maxSize; } - final Range range = openCLDevice.createRange(SIZE); + final Range range = openCLDevice.createRange(kernel, SIZE); kernel.values = new int[SIZE]; kernel.result = new int[SIZE]; @@ -129,7 +129,7 @@ public class BufferTransferTest { SIZE = maxSize; } kernel.setExplicit(true); - final Range range = openCLDevice.createRange(SIZE); + final Range range = openCLDevice.createRange(kernel, SIZE); kernel.values = new int[SIZE]; kernel.result = new int[SIZE]; diff --git a/src/test/java/com/aparapi/runtime/IntArray2DTest.java b/src/test/java/com/aparapi/runtime/IntArray2DTest.java index cdae400fc4dd8305dd0ba4119866c731da57e73a..03b4fea34ea62fa6f229e1cd789ddf20abd3a208 100644 --- a/src/test/java/com/aparapi/runtime/IntArray2DTest.java +++ b/src/test/java/com/aparapi/runtime/IntArray2DTest.java @@ -70,7 +70,7 @@ public class IntArray2DTest { } }; - final Range range = openCLDevice.createRange(size); + final Range range = 
openCLDevice.createRange(kernel, size); try { kernel.execute(range); diff --git a/src/test/java/com/aparapi/runtime/JtpRangeIdsTest.java b/src/test/java/com/aparapi/runtime/JtpRangeIdsTest.java index dbad276acca1c6c2e2c6dd61c1153a57d246f78d..791ec6dea98872a886fe37d51f172c943a539eb3 100644 --- a/src/test/java/com/aparapi/runtime/JtpRangeIdsTest.java +++ b/src/test/java/com/aparapi/runtime/JtpRangeIdsTest.java @@ -464,7 +464,7 @@ public class JtpRangeIdsTest { @Test public void test() { MatrixKernel kernel = new MatrixKernel(); - kernel.execute(Range.create2D(12, 4, 4, 2)); + kernel.execute(Range.create2D(kernel, 12, 4, 4, 2)); for(boolean hasPassed : kernel.passed) { Assert.assertTrue("Resulting matrix was invalid", hasPassed); } diff --git a/src/test/java/com/aparapi/runtime/KernelAndDeviceItemSizeLimitsTest.java b/src/test/java/com/aparapi/runtime/KernelAndDeviceItemSizeLimitsTest.java index 98351d06cd6144f31243db91082bf72d4b371383..4f4a28b00aa2e31fe5199b6990195eb495f7abd7 100644 --- a/src/test/java/com/aparapi/runtime/KernelAndDeviceItemSizeLimitsTest.java +++ b/src/test/java/com/aparapi/runtime/KernelAndDeviceItemSizeLimitsTest.java @@ -86,7 +86,7 @@ public class KernelAndDeviceItemSizeLimitsTest { assertTrue("Max Local Mem Size should be greater or equal to 0", maxLocalMemSize >= 0); - Range r = Range.create(openCLDevice, SIZE, SIZE); + Range r = Range.create(myKernel, openCLDevice, SIZE, SIZE); myKernel.execute(r); int[] results = myKernel.getResults(); @@ -111,7 +111,7 @@ public class KernelAndDeviceItemSizeLimitsTest { assertTrue("Max Private Mem Size should be greater than 0", maxPrivateMemSize >= 0); - Range r = Range.create(openCLDevice, SIZE); + Range r = Range.create(myKernel, openCLDevice, SIZE); myKernel.execute(r); int[] results = myKernel.getResults(); @@ -136,7 +136,7 @@ public class KernelAndDeviceItemSizeLimitsTest { assertTrue("Max Kernel Workgroup Size should be greater than 0", maxWorkGroupSize > 0); - Range r = Range.create(openCLDevice, 
SIZE); + Range r = Range.create(myKernel, openCLDevice, SIZE); myKernel.execute(r); int[] results = myKernel.getResults(); @@ -161,7 +161,7 @@ public class KernelAndDeviceItemSizeLimitsTest { assertTrue("Preferred Kernel Workgroup Size Multiple should be greater than 0", preferredWorkGroupSizeMultiple > 0); - Range r = Range.create(openCLDevice, SIZE); + Range r = Range.create(myKernel, openCLDevice, SIZE); myKernel.execute(r); int[] results = myKernel.getResults(); @@ -190,7 +190,7 @@ public class KernelAndDeviceItemSizeLimitsTest { assertTrue("Kernel Compile Work Group Size should be greater or equal than zero at index=" + i, maxWorkItemSize[i] >= 0); } - Range r = Range.create(openCLDevice, SIZE); + Range r = Range.create(myKernel, openCLDevice, SIZE); myKernel.execute(r); int[] results = myKernel.getResults(); @@ -222,7 +222,7 @@ public class KernelAndDeviceItemSizeLimitsTest { assertTrue("Max Local Mem Size should be equal or greater to 0", maxLocalMemSize >= 0); - Range r = Range.create(device, SIZE, SIZE); + Range r = Range.create(myKernel, device, SIZE, SIZE); myKernel.execute(r); int[] results = myKernel.getResults(); @@ -251,7 +251,7 @@ public class KernelAndDeviceItemSizeLimitsTest { assertTrue("Max Private Mem Size should be equal or greater to 0", maxPrivateMemSize >= 0); - Range r = Range.create(device, SIZE, SIZE); + Range r = Range.create(myKernel, device, SIZE, SIZE); myKernel.execute(r); int[] results = myKernel.getResults(); @@ -280,7 +280,7 @@ public class KernelAndDeviceItemSizeLimitsTest { assertTrue("Max Kernel Workgroup Size should be equal or greater than 0", maxWorkGroupSize >= 0); - Range r = Range.create(device, SIZE, SIZE); + Range r = Range.create(myKernel, device, SIZE, SIZE); myKernel.execute(r); int[] results = myKernel.getResults(); @@ -309,7 +309,7 @@ public class KernelAndDeviceItemSizeLimitsTest { assertTrue("Preferred Kernel Workgroup Size Multiple should be equal to 1", preferredWorkGroupSizeMultiple == 1); - Range r = 
Range.create(device, SIZE, SIZE); + Range r = Range.create(myKernel, device, SIZE, SIZE); myKernel.execute(r); int[] results = myKernel.getResults(); @@ -342,7 +342,7 @@ public class KernelAndDeviceItemSizeLimitsTest { assertTrue("Kernel Compile Work Group Size should be greater or equal than zero at index=" + i, maxWorkItemSize[i] >= 0); } - Range r = Range.create(device, SIZE, SIZE); + Range r = Range.create(myKernel, device, SIZE, SIZE); myKernel.execute(r); int[] results = myKernel.getResults(); diff --git a/src/test/java/com/aparapi/runtime/KernelCompileOnlyTest.java b/src/test/java/com/aparapi/runtime/KernelCompileOnlyTest.java index c921a8bf519b950c517574b68f1953310036578d..5f8f19082951caf15ce870b078bcc2c6d9af8cd4 100644 --- a/src/test/java/com/aparapi/runtime/KernelCompileOnlyTest.java +++ b/src/test/java/com/aparapi/runtime/KernelCompileOnlyTest.java @@ -98,7 +98,7 @@ public class KernelCompileOnlyTest { fail("This shouldn't happen"); } - Range r = Range.create(openCLDevice, SIZE); + Range r = Range.create(myKernel, openCLDevice, SIZE); myKernel.execute(r); int[] results = myKernel.getResults(); @@ -126,7 +126,7 @@ public class KernelCompileOnlyTest { fail("This shouldn't happen"); } - Range r = Range.create(openCLDevice, SIZE); + Range r = Range.create(myKernel, openCLDevice, SIZE); myKernel.execute(r); int[] results = myKernel.getResults(); @@ -146,7 +146,7 @@ public class KernelCompileOnlyTest { } - Range r = Range.create(openCLDevice, SIZE); + Range r = Range.create(myKernel, openCLDevice, SIZE); myKernel.execute(r); int[] results = myKernel.getResults(); @@ -209,7 +209,7 @@ public class KernelCompileOnlyTest { fail("This shouldn't happen"); } - Range r = Range.create(device, SIZE, SIZE); + Range r = Range.create(myKernel, device, SIZE, SIZE); myKernel.execute(r); int[] resultsB = myKernel.getResults(); diff --git a/src/test/java/com/aparapi/runtime/LoadClTest.java b/src/test/java/com/aparapi/runtime/LoadClTest.java index 
0f6c126fe5d61567c75911b384dfc3cb3add14cc..fad93caab1a7ff969f8a86914284233760f831ea 100644 --- a/src/test/java/com/aparapi/runtime/LoadClTest.java +++ b/src/test/java/com/aparapi/runtime/LoadClTest.java @@ -39,7 +39,6 @@ public class LoadClTest { final float[] squares = new float[size]; final float[] quads = new float[size]; - final Range range = Range.create(size); final Device device = KernelManager.instance().bestDevice(); @@ -47,6 +46,7 @@ public class LoadClTest { final OpenCLDevice openclDevice = (OpenCLDevice) device; final Squarer squarer = openclDevice.bind(Squarer.class); + final Range range = Range.create(squarer, openclDevice, size); squarer.square(range, in, squares); for (int i = 0; i < size; i++) { diff --git a/src/test/java/com/aparapi/runtime/LocalArrayArgsTest.java b/src/test/java/com/aparapi/runtime/LocalArrayArgsTest.java index 2ea3ba7b4318ea6d816a3e1cf44b1898b665293e..96a757542bb051da3ed9b2c659e5fa23c131b762 100644 --- a/src/test/java/com/aparapi/runtime/LocalArrayArgsTest.java +++ b/src/test/java/com/aparapi/runtime/LocalArrayArgsTest.java @@ -50,7 +50,7 @@ public class LocalArrayArgsTest { public void test() { final LocalArrayArgsKernel kernel = new LocalArrayArgsKernel(); try { - final Range range = openCLDevice.createRange(SIZE, SIZE); + final Range range = openCLDevice.createRange(kernel, SIZE, SIZE); targetArray = new int[SIZE]; kernel.setExplicit(false); kernel.setArray(targetArray); @@ -65,7 +65,7 @@ public class LocalArrayArgsTest { public void testExplicit() { final LocalArrayArgsKernel kernel = new LocalArrayArgsKernel(); try { - final Range range = openCLDevice.createRange(SIZE, SIZE); + final Range range = openCLDevice.createRange(kernel, SIZE, SIZE); targetArray = new int[SIZE]; kernel.setExplicit(true); kernel.setArray(targetArray); diff --git a/src/test/java/com/aparapi/runtime/LocalAtomicVariableArrayTest.java b/src/test/java/com/aparapi/runtime/LocalAtomicVariableArrayTest.java index 
6f70f16cbe7fdd4057d3d272430892743331d46f..46403eb2b38a2fba763cad14d14f387d3b617dc2 100644 --- a/src/test/java/com/aparapi/runtime/LocalAtomicVariableArrayTest.java +++ b/src/test/java/com/aparapi/runtime/LocalAtomicVariableArrayTest.java @@ -74,7 +74,7 @@ public class LocalAtomicVariableArrayTest { @Test public void simpleConstIndexOpenCLTest() { SimpleConstIndexLocalVarKernel myKernel = new SimpleConstIndexLocalVarKernel(); - Range range = openCLDevice.createRange(SIZE, SIZE); + Range range = openCLDevice.createRange(myKernel, SIZE, SIZE); try { myKernel.execute(range); assertEquals("Atomic increment doesn't match, index 1", SIZE, myKernel.atomics[1].get()); @@ -88,7 +88,7 @@ public class LocalAtomicVariableArrayTest { @Test public void simpleVarIndexOpenCLTest() { SimpleVarIndexLocalVarKernel myKernel = new SimpleVarIndexLocalVarKernel(); - Range range = openCLDevice.createRange(SIZE, SIZE); + Range range = openCLDevice.createRange(myKernel, SIZE, SIZE); try { myKernel.execute(range); assertEquals("Atomic increment doesn't match", SIZE, myKernel.atomics[4].get()); diff --git a/src/test/java/com/aparapi/runtime/MultiDimensionalLocalArrayTest.java b/src/test/java/com/aparapi/runtime/MultiDimensionalLocalArrayTest.java index 4a88236ceb0828acdce52d690f795f55755fb464..aed6736b1b54f3e65dacd5206bf902d956ff8548 100644 --- a/src/test/java/com/aparapi/runtime/MultiDimensionalLocalArrayTest.java +++ b/src/test/java/com/aparapi/runtime/MultiDimensionalLocalArrayTest.java @@ -116,7 +116,7 @@ public class MultiDimensionalLocalArrayTest } }; try { - kernel.execute(Range.create2D(device, SIZE, SIZE, SIZE, SIZE)); + kernel.execute(Range.create2D(kernel, device, SIZE, SIZE, SIZE, SIZE)); } finally { kernel.dispose(); } @@ -152,9 +152,9 @@ public class MultiDimensionalLocalArrayTest } }; try { - kernel.execute(Range.create2D(device, SIZE, SIZE, SIZE, SIZE)); + kernel.execute(Range.create2D(kernel, device, SIZE, SIZE, SIZE, SIZE)); assertEquals(3840, RESULT[0], 1E-6F); - 
kernel.execute(Range.create2D(device, SIZE, SIZE, SIZE, SIZE)); + kernel.execute(Range.create2D(kernel, device, SIZE, SIZE, SIZE, SIZE)); assertEquals(3840, RESULT[0], 1E-6F); } finally { kernel.dispose(); @@ -190,7 +190,7 @@ public class MultiDimensionalLocalArrayTest } }; try { - kernel.execute(Range.create2D(device, SIZE, SIZE, SIZE, SIZE)); + kernel.execute(Range.create2D(kernel, device, SIZE, SIZE, SIZE, SIZE)); } finally { kernel.dispose(); } @@ -227,9 +227,9 @@ public class MultiDimensionalLocalArrayTest }; try { - kernel.execute(Range.create2D(device, SIZE, SIZE, SIZE, SIZE)); + kernel.execute(Range.create2D(kernel, device, SIZE, SIZE, SIZE, SIZE)); assertEquals(3840, RESULT[0][0], 1E-6F); - kernel.execute(Range.create2D(device, SIZE, SIZE, SIZE, SIZE)); + kernel.execute(Range.create2D(kernel, device, SIZE, SIZE, SIZE, SIZE)); assertEquals(3840, RESULT[0][0], 1E-6F); } finally { kernel.dispose(); @@ -321,10 +321,10 @@ public class MultiDimensionalLocalArrayTest try { kernel.setResult(RESULT); kernel.setArray(SIZE, new float[SIZE*SIZE]); - kernel.execute(Range.create2D(device, SIZE, SIZE, SIZE, SIZE)); + kernel.execute(Range.create2D(kernel, device, SIZE, SIZE, SIZE, SIZE)); assertEquals(448, RESULT[0], 1E-6F); kernel.setArray(2*SIZE, new float[2*SIZE*2*SIZE]); - kernel.execute(Range.create2D(device, 2*SIZE, 2*SIZE, 2*SIZE, 2*SIZE)); + kernel.execute(Range.create2D(kernel, device, 2*SIZE, 2*SIZE, 2*SIZE, 2*SIZE)); assertTrue("Result is not greater than 448", RESULT[0]>448); } finally { kernel.dispose(); @@ -342,10 +342,10 @@ public class MultiDimensionalLocalArrayTest try { kernel.setResult(RESULT); kernel.setArray(SIZE, new float[SIZE][SIZE]); - kernel.execute(Range.create2D(device, SIZE, SIZE, SIZE, SIZE)); + kernel.execute(Range.create2D(kernel, device, SIZE, SIZE, SIZE, SIZE)); assertEquals(448, RESULT[0], 1E-6F); kernel.setArray(2*SIZE, new float[2*SIZE][2*SIZE]); - kernel.execute(Range.create2D(device, 2*SIZE, 2*SIZE, 2*SIZE, 2*SIZE)); + 
kernel.execute(Range.create2D(kernel, device, 2*SIZE, 2*SIZE, 2*SIZE, 2*SIZE)); assertTrue("Result is not greater than 448", RESULT[0]>448); } finally { kernel.dispose(); diff --git a/src/test/java/com/aparapi/runtime/MultiplePassesMemoryConsumptionTest.java b/src/test/java/com/aparapi/runtime/MultiplePassesMemoryConsumptionTest.java index 0cffdad1f5b2eff9b5ba6bd6770caa41047f077a..cc78655b6364bd5d6608e2be45f01d5839e2ca4f 100644 --- a/src/test/java/com/aparapi/runtime/MultiplePassesMemoryConsumptionTest.java +++ b/src/test/java/com/aparapi/runtime/MultiplePassesMemoryConsumptionTest.java @@ -37,7 +37,7 @@ public class MultiplePassesMemoryConsumptionTest { System.gc(); if( baseFree > Runtime.getRuntime().freeMemory()) baseFree = Runtime.getRuntime().freeMemory(); - kernel.execute(Range.create(512, 64), 1); + kernel.execute(Range.create(kernel, 512, 64), 1); for (int i = 0; i < globalArray.length; ++i) { Assert.assertEquals("Wrong", i, globalArray[i]); } @@ -49,7 +49,7 @@ public class MultiplePassesMemoryConsumptionTest { System.gc(); if( testFree > Runtime.getRuntime().freeMemory()) testFree = Runtime.getRuntime().freeMemory(); - kernel.execute(Range.create(512, 64), 2); + kernel.execute(Range.create(kernel, 512, 64), 2); for (int i = 0; i < globalArray.length; ++i) { Assert.assertEquals("Wrong", i, globalArray[i]); } diff --git a/src/test/java/com/aparapi/runtime/NegativeIntegerTest.java b/src/test/java/com/aparapi/runtime/NegativeIntegerTest.java index 8cd9a5405601e608c24796bf2b0884dbed8d3250..3a43c04404cfc044c24398e30b346df306fb5d45 100644 --- a/src/test/java/com/aparapi/runtime/NegativeIntegerTest.java +++ b/src/test/java/com/aparapi/runtime/NegativeIntegerTest.java @@ -78,7 +78,7 @@ public class NegativeIntegerTest RESULT[0] = -800; } }; - kernel.execute(Range.create(device, SIZE, SIZE)); + kernel.execute(Range.create(kernel, device, SIZE, SIZE)); assertEquals("Result doesn't match", -800, RESULT[0]); } } diff --git 
a/src/test/java/com/aparapi/runtime/ProfileReportBackwardsCompatTest.java b/src/test/java/com/aparapi/runtime/ProfileReportBackwardsCompatTest.java index c50894b3f71d0e3a6fb3c94d4bb5ede80c430e7f..262c33557769231bb737d13a42e8d2256fe573f4 100644 --- a/src/test/java/com/aparapi/runtime/ProfileReportBackwardsCompatTest.java +++ b/src/test/java/com/aparapi/runtime/ProfileReportBackwardsCompatTest.java @@ -104,7 +104,7 @@ public class ProfileReportBackwardsCompatTest { public void sequentialSingleThreadOpenCLTest() throws Exception { setUpBefore(); logger.log(Level.INFO, "Test " + name.getMethodName() + " - Executing on device: " + openCLDevice.getShortDescription() + " - " + openCLDevice.getName()); - assertTrue(sequentialSingleThreadTestHelper(openCLDevice, 128)); + assertTrue(sequentialSingleThreadTestHelper(openCLDevice, 128, false)); } /** @@ -115,11 +115,11 @@ public class ProfileReportBackwardsCompatTest { public void sequentialSingleThreadJTPTest() { KernelManager.setKernelManager(new JTPKernelManager()); Device device = KernelManager.instance().bestDevice(); - assertTrue(sequentialSingleThreadTestHelper(device, 16)); + assertTrue(sequentialSingleThreadTestHelper(device, 16, true)); } - public boolean sequentialSingleThreadTestHelper(Device device, int size) { + public boolean sequentialSingleThreadTestHelper(Device device, int size, boolean isJTP) { final int runs = 100; final int inputArray[] = new int[size]; double accumulatedExecutionTime = 0.0; @@ -128,8 +128,10 @@ public class ProfileReportBackwardsCompatTest { final Basic1Kernel kernel = new Basic1Kernel(); int[] outputArray = null; - Range range = device.createRange(size, size); - long startOfExecution = System.currentTimeMillis(); + + long startOfExecution = System.nanoTime(); + //Range must be created here, to account for the Kernel compilation time, which is triggered on Range creation + Range range = device.createRange(kernel, size, size); try { for (int i = 0; i < runs; i++) { outputArray = 
Arrays.copyOf(inputArray, inputArray.length); @@ -139,17 +141,18 @@ public class ProfileReportBackwardsCompatTest { accumulatedExecutionTime += lastExecutionTime; lastConversionTime = kernel.getConversionTime(); } - long runTime = System.currentTimeMillis() - startOfExecution; + double runTime = (double)(System.nanoTime() - startOfExecution) / 1000000.0; WeakReference<ProfileReport> reportRef = kernel.getProfileReportLastThread(device); ProfileReport report = reportRef.get(); assertEquals("Number of profiling reports doesn't match the expected", runs, report.getReportId()); assertEquals("Aparapi Accumulated execution time doesn't match", accumulatedExecutionTime, kernel.getAccumulatedExecutionTime(), 1e-10); assertEquals("Aparapi last execution time doesn't match last report", lastExecutionTime, report.getExecutionTime(), 1e-10); assertEquals("Aparapi last conversion time doesn't match last report", lastConversionTime, report.getConversionTime(), 1e-10); - //FIXME This is a temporary workaround, however the time profiling should be accurately measured instead of relying on Java timer - //Here we allow a 20% error margin for machines under heavy load during the test, where latency is higher + //Here we allow a 15% error margin for machines under heavy load during the test, where latency is higher //as well as, introduce a 250ms tolerance for fast machines for which the execution time is of the same order of the Java latency - assertEquals("Test estimated accumulated time doesn't match within a 20% time window", runTime, accumulatedExecutionTime, 0.2f * runTime + 250); + if (!isJTP) { + assertEquals("Test estimated accumulated time doesn't match within a 15% time window", runTime, accumulatedExecutionTime, 0.15f * runTime + 250); + } assertTrue(validateBasic1Kernel(inputArray, outputArray)); } finally { kernel.registerProfileReportObserver(null); @@ -165,7 +168,7 @@ public class ProfileReportBackwardsCompatTest { private double lastExecutionTime = 0.0; private double
lastConversionTime = 0.0; private long startOfExecution = 0; - private long runTime = 0; + private double runTime = 0; } /** @@ -176,7 +179,7 @@ public class ProfileReportBackwardsCompatTest { public void threadedSingleThreadPerKernelOpenCLTest() throws Exception { setUpBefore(); logger.log(Level.INFO, "Test " + name.getMethodName() + " - Executing on device: " + openCLDevice.getShortDescription() + " - " + openCLDevice.getName()); - assertTrue(threadedSingleThreadPerKernelTestHelper(openCLDevice, 128)); + assertTrue(threadedSingleThreadPerKernelTestHelper(openCLDevice, 128, false)); } /** @@ -186,10 +189,10 @@ public class ProfileReportBackwardsCompatTest { public void threadedSingleThreadPerKernelJTPTest() { KernelManager.setKernelManager(new JTPKernelManager()); Device device = KernelManager.instance().bestDevice(); - assertTrue(threadedSingleThreadPerKernelTestHelper(device, 16)); + assertTrue(threadedSingleThreadPerKernelTestHelper(device, 16, true)); } - public boolean threadedSingleThreadPerKernelTestHelper(Device device, final int size) { + public boolean threadedSingleThreadPerKernelTestHelper(Device device, final int size, boolean isJTP) { final int runs = 100; final int inputArray[] = new int[size]; @@ -208,16 +211,16 @@ public class ProfileReportBackwardsCompatTest { ExecutorService executorService = Executors.newFixedThreadPool(2); try { kernels.forEach(k -> executorService.submit(() -> { - results[k.getId() - 1].startOfExecution = System.currentTimeMillis(); + results[k.getId() - 1].startOfExecution = System.nanoTime(); for (int i = 0; i < runs; i++) { results[k.getId() - 1].outputArray = Arrays.copyOf(inputArray, inputArray.length); k.setInputOuputArray(results[k.getId() - 1].outputArray); - k.execute(Range.create(device, size, size)); + k.execute(Range.create(k, device, size, size)); results[k.getId() - 1].lastExecutionTime = k.getExecutionTime(); results[k.getId() - 1].accumulatedExecutionTime += results[k.getId() - 1].lastExecutionTime; 
results[k.getId() - 1].lastConversionTime = k.getConversionTime(); } - results[k.getId() - 1].runTime = System.currentTimeMillis() - results[k.getId() - 1].startOfExecution; + results[k.getId() - 1].runTime = (System.nanoTime() - results[k.getId() - 1].startOfExecution) / 1000000.0; })); } finally { executorService.shutdown(); @@ -241,10 +244,11 @@ public class ProfileReportBackwardsCompatTest { assertEquals("Aparapi Accumulated execution time doesn't match", results[0].accumulatedExecutionTime, kernel1.getAccumulatedExecutionTime(), 1e-10); assertEquals("Aparapi last execution time doesn't match last report", results[0].lastExecutionTime, report.getExecutionTime(), 1e-10); assertEquals("Aparapi last conversion time doesn't match last report", results[0].lastConversionTime, report.getConversionTime(), 1e-10); - //FIXME This is a temporary workaround, however the time profiling should be accurately measured instead of relying on Java timer - //Here we allow a 20% error margin for machines under heavy load during the test, where latency is higher - //as well as, introduce a 250ms tolerance for fast machines for which the execution time is of the same order of the Java latency - assertEquals("Test estimated accumulated time doesn't match within a 20% time window", results[0].runTime, results[0].accumulatedExecutionTime, 0.2f * results[0].runTime + 250); + //Here we allow a 10% error margin for machines under heavy load during the test, where latency is higher + //as well as, introduce a 250ms tolerance for fast machines for which the execution time is of the same order of the Java latency + if (!isJTP) { + assertEquals("Test estimated accumulated time doesn't match within a 10% time window", results[0].runTime, results[0].accumulatedExecutionTime, 0.1f * results[0].runTime + 250); + } assertTrue(validateBasic1Kernel(inputArray, results[0].outputArray)); //Validate kernel2 reports @@ -254,10 +258,11 @@ public class ProfileReportBackwardsCompatTest { 
assertEquals("Aparapi Accumulated execution time doesn't match", results[1].accumulatedExecutionTime, kernel2.getAccumulatedExecutionTime(), 1e-10); assertEquals("Aparapi last execution time doesn't match last report", results[1].lastExecutionTime, report.getExecutionTime(), 1e-10); assertEquals("Aparapi last conversion time doesn't match last report", results[1].lastConversionTime, report.getConversionTime(), 1e-10); - //FIXME This is a temporary workaround, however the time profiling should be accurately measured instead of relying on Java timer - //Here we allow a 20% error margin for machines under heavy load during the test, where latency is higher - //as well as, introduce a 250ms tolerance for fast machines for which the execution time is of the same order of the Java latency - assertEquals("Test estimated accumulated time doesn't match within a 20% time window", results[1].runTime, results[1].accumulatedExecutionTime, 0.2f * results[1].runTime + 250); + //Here we allow a 10% error margin for machines under heavy load during the test, where latency is higher + //as well as, introduce a 250ms tolerance for fast machines for which the execution time is of the same order of the Java latency + if (!isJTP) { + assertEquals("Test estimated accumulated time doesn't match within a 10% time window", results[1].runTime, results[1].accumulatedExecutionTime, 0.1f * results[1].runTime + 250); + } assertTrue(validateBasic2Kernel(inputArray, results[1].outputArray)); } finally { kernel1.registerProfileReportObserver(null); diff --git a/src/test/java/com/aparapi/runtime/ProfileReportNewAPITest.java b/src/test/java/com/aparapi/runtime/ProfileReportNewAPITest.java index ceb1e6a416a0ffb02ba65c54983fdc5ddfb00a15..f27e4f5f0ee81b5108499cf959ed5fe41be4f76f 100644 --- a/src/test/java/com/aparapi/runtime/ProfileReportNewAPITest.java +++ b/src/test/java/com/aparapi/runtime/ProfileReportNewAPITest.java @@ -113,7 +113,7 @@ public class ProfileReportNewAPITest { public void 
singleThreadedSingleKernelObserverOpenCLTest() throws Exception { setUpBefore(); logger.log(Level.INFO, "Test " + name.getMethodName() + " - Executing on device: " + openCLDevice.getShortDescription() + " - " + openCLDevice.getName()); - assertTrue(singleThreadedSingleKernelReportObserverTestHelper(openCLDevice, 128)); + assertTrue(singleThreadedSingleKernelReportObserverTestHelper(openCLDevice, 128, false)); } /** @@ -124,7 +124,7 @@ public class ProfileReportNewAPITest { public void singleThreadedSingleKernelObserverJTPTest() { KernelManager.setKernelManager(new JTPKernelManager()); Device device = KernelManager.instance().bestDevice(); - assertTrue(singleThreadedSingleKernelReportObserverTestHelper(device, 16)); + assertTrue(singleThreadedSingleKernelReportObserverTestHelper(device, 16, true)); } private class ThreadTestState { @@ -172,13 +172,12 @@ public class ProfileReportNewAPITest { } } - public boolean singleThreadedSingleKernelReportObserverTestHelper(Device device, int size) { + public boolean singleThreadedSingleKernelReportObserverTestHelper(Device device, int size, boolean isJTP) { final int runs = 100; final int inputArray[] = new int[size]; final Basic1Kernel kernel = new Basic1Kernel(); - int[] outputArray = null; - Range range = device.createRange(size, size); + int[] outputArray = null; ReportObserver observer = new ReportObserver(device, 1, runs); observer.addAcceptedThreadId(Thread.currentThread().getId()); @@ -188,21 +187,27 @@ public class ProfileReportNewAPITest { assertFalse("Report with id " + i + " shouldn't have been received yet", observer.receivedReportIds[i]); } - long startOfExecution = System.currentTimeMillis(); + long startOfExecution = System.nanoTime(); + //Range must be created here, to account for the Kernel compilation time, which is triggered on Range creation + Range range = device.createRange(kernel, size, size); try { for (int i = 0; i < runs; i++) { outputArray = Arrays.copyOf(inputArray, inputArray.length); 
kernel.setInputOuputArray(outputArray); kernel.execute(range); } - long runTime = System.currentTimeMillis() - startOfExecution; + double runTime = (System.nanoTime() - startOfExecution) / 1000000.0; ConcurrentSkipListMap<Long, ThreadTestState> results = observer.getObservedThreadsIds(); ThreadTestState state = results.get(Thread.currentThread().getId()); assertNotNull("Reports should have been received for thread", state); assertEquals("Number of profiling reports doesn't match the expected", runs, state.receivedReportsCount); assertEquals("Aparapi Accumulated execution time doesn't match", kernel.getAccumulatedExecutionTimeAllThreads(device), state.accumulatedElapsedTime, 1e-10); - // FIXME failing: assertEquals("Test estimated accumulated time doesn't match within 200ms window", runTime, kernel.getAccumulatedExecutionTimeAllThreads(device), 200); + //Here we allow a 10% error margin for machines under heavy load during the test, where latency is higher + //as well as, introduce a 250ms tolerance for fast machines for which the execution time is of the same order of the Java latency + if (!isJTP) { + assertEquals("Test estimated accumulated time doesn't match within a 10% time window", runTime, kernel.getAccumulatedExecutionTimeAllThreads(device), 0.1f * runTime + 250); + } for (int i = 0; i < runs; i++) { assertTrue("Report with id " + i + " wasn't received", observer.receivedReportIds[i]); } @@ -223,7 +228,7 @@ public class ProfileReportNewAPITest { public void multiThreadedSingleKernelObserverOpenCLTest() throws Exception { setUpBefore(); logger.log(Level.INFO, "Test " + name.getMethodName() + " - Executing on device: " + openCLDevice.getShortDescription() + " - " + openCLDevice.getName()); - assertTrue(multiThreadedSingleKernelReportObserverTestHelper(openCLDevice, 128)); + assertTrue(multiThreadedSingleKernelReportObserverTestHelper(openCLDevice, 128, false)); } /** @@ -234,11 +239,11 @@ public class ProfileReportNewAPITest { public void 
multiThreadedSingleKernelObserverJTPTest() throws Exception { KernelManager.setKernelManager(new JTPKernelManager()); Device device = KernelManager.instance().bestDevice(); - assertTrue(multiThreadedSingleKernelReportObserverTestHelper(device, 16)); + assertTrue(multiThreadedSingleKernelReportObserverTestHelper(device, 16, true)); } private class ThreadResults { - private long runTime; + private double runTime; private long threadId; private int kernelCalls; private double accumulatedExecutionTime; @@ -260,15 +265,15 @@ public class ProfileReportNewAPITest { int id = atomicResultId.getAndIncrement(); results[id].threadId = Thread.currentThread().getId(); observer.addAcceptedThreadId(results[id].threadId); - long startOfExecution = System.currentTimeMillis(); + long startOfExecution = System.nanoTime(); results[id].kernelCalls = 0; for (int i = 0; i < runs; i++) { results[id].outputArray = Arrays.copyOf(inputArray, inputArray.length); k.setInputOuputArray(results[id].outputArray); - k.execute(Range.create(device, size, size)); + k.execute(Range.create(k, device, size, size)); results[id].kernelCalls++; } - results[id].runTime = System.currentTimeMillis() - startOfExecution; + results[id].runTime = (System.nanoTime() - startOfExecution) / 1000000.0; results[id].accumulatedExecutionTime = k.getAccumulatedExecutionTimeCurrentThread(device); } })); @@ -295,7 +300,7 @@ public class ProfileReportNewAPITest { return terminatedOk; } - public boolean multiThreadedSingleKernelReportObserverTestHelper(Device device, int size) throws InterruptedException, ExecutionException { + public boolean multiThreadedSingleKernelReportObserverTestHelper(Device device, int size, boolean isJTP) throws InterruptedException, ExecutionException { final int runs = 100; final int javaThreads = 10; final int inputArray[] = new int[size]; @@ -329,20 +334,23 @@ public class ProfileReportNewAPITest { ConcurrentSkipListMap<Long, ThreadTestState> states = observer.getObservedThreadsIds(); 
assertEquals("Number of Java threads sending profile reports should match the number of JavaThreads", javaThreads, states.values().size()); for (int i = 0; i < javaThreads; i++) { - ThreadTestState state = states.get(results[i].threadId); - assertNotNull("Report should have been received for thread with index " + i, state); + ThreadTestState stateI = states.get(results[i].threadId); + assertNotNull("Report should have been received for thread with index " + i, stateI); assertEquals("Number of total iteration should match number of runs for thread with index " + i, runs, results[i].kernelCalls); - assertEquals("Number of received reports should match total number of calls for thread with index " + i, runs, state.receivedReportsCount); - assertEquals("Overall elapsed time received in reports doesn't match KernelDeviceProfile.Accumulator for threa with index " + i, - results[i].accumulatedExecutionTime, state.accumulatedElapsedTime, 1e-10); - allThreadsAccumulatedTime += state.accumulatedElapsedTime; + assertEquals("Number of received reports should match total number of calls for thread with index " + i, runs, stateI.receivedReportsCount); + assertEquals("Overall elapsed time received in reports doesn't match KernelDeviceProfile.Accumulator for thread with index " + i, + results[i].accumulatedExecutionTime, stateI.accumulatedElapsedTime, 1e-10); + allThreadsAccumulatedTime += stateI.accumulatedElapsedTime; assertTrue("Thread index " + i + " kernel computation doesn't match the expected", validateBasic1Kernel(inputArray, results[i].outputArray)); - //FIXME Find a better way of determining kernel execution time - //assertEquals("Runtime is not within 600ms of the kernel estimated", results[i].runTime, state.accumulatedElapsedTime, 600); + if (!isJTP) { + //Here we allow a 10% error margin for machines under heavy load during the test, where latency is higher + //as well as, introduce a 250ms tolerance for fast machines for which the execution time is of the same order 
of the Java latency + assertEquals("Test estimated accumulated time doesn't match within a 10% time window", results[i].runTime, stateI.accumulatedElapsedTime, 0.1f * results[i].runTime + 250); + } } assertEquals("Overall kernel execution time doesn't match", - kernels.get(0).getAccumulatedExecutionTimeAllThreads(device), allThreadsAccumulatedTime, 1e10); + kernels.get(0).getAccumulatedExecutionTimeAllThreads(device), allThreadsAccumulatedTime, 1e-10); return true; } diff --git a/src/test/java/com/aparapi/runtime/ProfileReportUnitTest.java b/src/test/java/com/aparapi/runtime/ProfileReportUnitTest.java index f65411c82bb7edec156be0329092969eb73954fd..c6634b7ba1a8da54ab2fce857173f4e0c7ae17cf 100644 --- a/src/test/java/com/aparapi/runtime/ProfileReportUnitTest.java +++ b/src/test/java/com/aparapi/runtime/ProfileReportUnitTest.java @@ -81,7 +81,7 @@ public class ProfileReportUnitTest { }); //Ensure that the first thread as started profiling, before testing the others - kernelDeviceProfile.onEvent(ProfilingEvent.START); + kernelDeviceProfile.onEvent(ProfilingEvent.START, false); List<ProfilingEvent> events = Arrays.asList(ProfilingEvent.values()); @@ -92,8 +92,8 @@ public class ProfileReportUnitTest { final int idx = index.getAndIncrement(); executorService.submit(() -> { threadIds[idx] = Thread.currentThread().getId(); - kernelDeviceProfile.onEvent(ProfilingEvent.START); - kernelDeviceProfile.onEvent(ProfilingEvent.EXECUTED); + kernelDeviceProfile.onEvent(ProfilingEvent.START, false); + kernelDeviceProfile.onEvent(ProfilingEvent.EXECUTED, false); }); }); } finally { @@ -112,7 +112,7 @@ public class ProfileReportUnitTest { assertEquals("Reports from all threads should have been received", javaThreads, receivedReports.get()); //Only after this event should the main thread have received a report - kernelDeviceProfile.onEvent(ProfilingEvent.EXECUTED); + kernelDeviceProfile.onEvent(ProfilingEvent.EXECUTED, false); assertTrue("Report wasn't received for main thread", 
onEventAccepted.contains(threadIds[javaThreads])); assertEquals("Reports from all threads should have been received", javaThreads + 1, receivedReports.get()); @@ -164,6 +164,10 @@ public class ProfileReportUnitTest { report.setProfileReport(reportId + 1, valuesB); for (int i = 1; i < values.length; i++) { + if (i == ProfilingEvent.READY_TO_PREPARE_EXECUTE.ordinal()) { + //This one is expected to always be equal + continue; + } assertNotEquals("Values match after new assingment for index " + i, report.getElapsedTime(i), clonedReport.getElapsedTime(i), 1e-10); } diff --git a/src/test/java/com/aparapi/runtime/RangeSizeTest.java b/src/test/java/com/aparapi/runtime/RangeSizeTest.java index 2b1db1705d01fda5dfd5a016fd081cc345226b36..a2411e9f9ce37b9e232ab6f29c02db5a8b35e312 100644 --- a/src/test/java/com/aparapi/runtime/RangeSizeTest.java +++ b/src/test/java/com/aparapi/runtime/RangeSizeTest.java @@ -15,23 +15,70 @@ */ package com.aparapi.runtime; +import com.aparapi.Kernel; import com.aparapi.Range; +import com.aparapi.device.Device; +import com.aparapi.device.JavaDevice; +import com.aparapi.internal.kernel.KernelManager; + import org.junit.Test; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; +import java.util.Arrays; +import java.util.LinkedHashSet; +import java.util.List; + +import org.junit.After; + public class RangeSizeTest { @Test public void test384x384() { - Range range = Range.create2D(384, 384); + Range range = Range.create2D((Kernel)null, 384, 384); assertTrue("Range > max work size", range.getLocalSize(0) * range.getLocalSize(1) <= range.getWorkGroupSize()); } @Test public void test384x320() { - Range range = Range.create2D(384, 320); + Range range = Range.create2D((Kernel)null, 384, 320); assertTrue("Range > max work size", range.getLocalSize(0) * range.getLocalSize(1) <= range.getWorkGroupSize()); } + private class JTPKernelManager extends KernelManager { + private JTPKernelManager() { + LinkedHashSet<Device> 
preferredDevices = new LinkedHashSet<Device>(1); + preferredDevices.add(JavaDevice.THREAD_POOL); + setDefaultPreferredDevices(preferredDevices); + } + @Override + protected List<Device.TYPE> getPreferredDeviceTypes() { + return Arrays.asList(Device.TYPE.JTP); + } + } + + @After + public void classTeardown() { + Util.resetKernelManager(); + } + + @Test + public void testJTPRange() { + KernelManager.setKernelManager(new JTPKernelManager()); + + int[] a = {1, 2, 3, 4, 5, 6, 7}; + + new Kernel() { + @Override + public void run() { + int i = getGlobalId(); + a[i] = a[i] * 2; + } + }.execute(a.length); + + for (int i = 0; i < a.length; i++) { + assertEquals("Result doesn't match the expected at index: " + i, (i+1)*2, a[i]); + } + } }